In [144]:
import sklearn
import nltk
import numpy as np
import pandas as pd
import os
from scipy.sparse import hstack
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer

train = pd.read_excel(os.getcwd()+'/train.xlsx')
test = pd.read_excel(os.getcwd()+'/test.xlsx')

In [145]:
train.head()

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
1,Hillary is our best choice if we truly want to...,Hillary Clinton,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
2,@TheView I think our country is ready for a fe...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,NONE,3. The tweet is not explicitly expressing opi...,pos


In [146]:
def rem_mentions_hasht(tweet):
    words = tweet.split()
    relevant_tokens = [w for w in words if '@' not in w and '#' not in w]
    return( " ".join(relevant_tokens))

##POS Features
import pickle
train_stfrd_POStagged = pickle.load( open( "train_stfrd_POStagged.p", "rb" ) )
test_stfrd_POStagged = pickle.load( open( "test_stfrd_POStagged.p", "rb" ) )

##NER Features
ner_tagged_stnfrd = pickle.load( open( "train_stfrd_NERtagged.p", "rb" ) )

ner_t_tagged_stnfrd = pickle.load( open( "test_stfrd_NERtagged.p", "rb" ) )

##Argument lexicon features
train_arg_lex = pickle.load( open( "train_arg_lex.p", "rb" ) )

test_arg_lex = pickle.load( open( "test_arg_lex.p", "rb" ) )
##Subjectivity lexicon features
train_mpqa = pickle.load( open( "train_mpqa.p", "rb" ) )
test_mpqa = pickle.load( open( "test_mpqa.p", "rb" ) )

##sentiment and subjectivity features as given by textblob
from textblob import TextBlob

train_senti = train.Tweet.apply(lambda a: TextBlob(rem_mentions_hasht(a)).sentiment[0]).reshape(train.shape[0],1)
test_senti = test.Tweet.apply(lambda a: TextBlob(rem_mentions_hasht(a)).sentiment[0]).reshape(test.shape[0],1)
train_subj = train.Tweet.apply(lambda a: TextBlob(rem_mentions_hasht(a)).sentiment[1]).reshape(train.shape[0],1)
test_subj = test.Tweet.apply(lambda a: TextBlob(rem_mentions_hasht(a)).sentiment[1]).reshape(test.shape[0],1)

##Helper Functions
def ner_features(ner_tagged):
    def ner_count(ner_tagged):
        count =  {}
        count['ORGANIZATION'] = 0
        count['LOCATION'] = 0
        count['PERSON'] = 0
        count['O'] = 0
        count = [0,0,0,0]
        for word in ner_tagged:
            if word[1] == 'O':
                count[0]+=1
            elif word[1] == 'ORGANIZATION':
                count[1]+=1
            elif word[1] == 'PERSON':
                count[2]+=1
            elif word[1] == 'LOCATION':
                count[3]+=1
        return count
        
    ner_counts = ner_tagged.apply(lambda a: ner_count(a))
    ner1 = ner_counts.apply(lambda a: a[1])
    ner2 = ner_counts.apply(lambda a: a[2])
    ner3 = ner_counts.apply(lambda a: a[3])
            
    return np.array([ner1,ner2,ner3]).T

def create_dict(pos_tagged):
    pos_dict = {}
    for i in range(0,len(pos_tagged)):
        for j in pos_tagged.iloc[i]:
            if j[1] in  pos_dict.keys():
                if j[0] not in pos_dict[j[1]]:
                    pos_dict[j[1]].append(j[0])
            else:
                pos_dict[j[1]] = []
    return pos_dict

def create_dummy_cat_cols(df, cat_cols):
    cat_dummy_cols = {}
    for col in cat_cols:
        dummies = pd.get_dummies(df[col], prefix=col, dummy_na=False).ix[:,:-1]
        df = pd.concat([dummies,df], axis=1)
        cat_dummy_cols[col] = dummies.columns.values
    return df, cat_dummy_cols
    
##F score calculator
def f_score(predictions, actual, class_type='AGAINST'):
    tot_class_pred = sum(predictions==class_type)
    correctly_classified = sum(((actual==class_type).values)&(predictions==class_type))
    tot_class_act = sum(((actual==class_type).values))
    if tot_class_pred == 0:
        prec = 0
    else:
        prec = float(correctly_classified/tot_class_pred)
    if tot_class_act == 0:
        recall = 1
    else:
        recall = float(correctly_classified/tot_class_act)
    if prec+recall==0:
        return 0
    else:
        return ((2*prec*recall)/(prec+recall))
##Final score
def custom_scorer(actual, predictions):
    against = f_score(predictions, actual, class_type='AGAINST')
    favor = f_score(predictions, actual, class_type='FAVOR')
    return favor+against/2




In [147]:
ner_t_tagged_stnfrd.ix[1]


[('RT', 'O'),
 ('I', 'O'),
 ('remove', 'O'),
 ('Nehushtan', 'PERSON'),
 ('-previous', 'O'),
 ('moves', 'O'),
 ('of', 'O'),
 ('God', 'O'),
 ('that', 'O'),
 ('have', 'O'),
 ('become', 'O'),
 ('idols', 'O'),
 (',', 'O'),
 ('from', 'O'),
 ('the', 'O'),
 ('high', 'O'),
 ('places', 'O'),
 ('-2', 'O'),
 ('Kings', 'O'),
 ('18:4', 'O')]

In [148]:
## all words as features
def create_features(data, data_t):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform(data.Tweet)
    features_t = vectorizer.transform(data_t.Tweet)
    return features, features_t



In [149]:
##using nouns verbs and adjectives
def create_features(data, data_t):
    from sklearn.feature_extraction.text import CountVectorizer
    pos_dict = create_dict(train_stfrd_POStagged.ix[data.index])

    Imp_Words = list(set(pos_dict['NN']+pos_dict['NNS']+pos_dict['JJ']+pos_dict['JJR']+pos_dict['JJS']+pos_dict['VB']+pos_dict['VBD']+pos_dict['VBG']+pos_dict['VBN']+pos_dict['VBP']+pos_dict['VBZ']))

    vectorizer = CountVectorizer(vocabulary=Imp_Words)
    features = vectorizer.fit_transform(data.Tweet)
    features_t = vectorizer.transform(data_t.Tweet)
    return features, features_t



In [150]:
##optimising the features
def create_features(data, data_t):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(ngram_range = (1,5), min_df=5, stop_words='english', strip_accents='ascii')
    features = vectorizer.fit_transform(data.Tweet)
    features_t = vectorizer.transform(data_t.Tweet)
    return features, features_t

In [123]:
##optimising the features and mpqa subjectivity
def create_features(data, data_t):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(ngram_range = (1,5), min_df=5, stop_words='english', strip_accents='ascii')
    features = vectorizer.fit_transform(data.Tweet)
    features_ner = ner_features(ner_tagged_stnfrd.ix[data.index])
    features_sent = create_dummy_cat_cols(data, ["Sentiment"])[0].ix[:,0:2]
    features_arg = train_arg_lex[data.index]
    features_mpqa = train_mpqa[data.index]
    features_senti = train_senti[data.index]
    features_subj = train_subj[data.index]
    features = hstack([features, features_mpqa])
    features_t = vectorizer.transform(data_t.Tweet)
    features_arg_t = test_arg_lex[data_t.index]
    features_ner_t = ner_features(ner_t_tagged_stnfrd.ix[data_t.index])
    features_mpqa_t = test_mpqa[data_t.index]
    features_senti_t = test_senti[data_t.index]
    features_subj_t = test_subj[data_t.index]
    features_sent_t = create_dummy_cat_cols(data_t, ["Sentiment"])[0].ix[:,0:2]
    features_t = hstack([features_t, features_mpqa_t])
    return features, features_t
    

In [132]:
##optimising the features and arguing subjectivity
def create_features(data, data_t):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(ngram_range = (1,5), min_df=5, stop_words='english', strip_accents='ascii')
    features = vectorizer.fit_transform(data.Tweet)
    features_ner = ner_features(ner_tagged_stnfrd.ix[data.index])
    features_sent = create_dummy_cat_cols(data, ["Sentiment"])[0].ix[:,0:2]
    features_arg = train_arg_lex[data.index]
    features_mpqa = train_mpqa[data.index]
    features_senti = train_senti[data.index]
    features_subj = train_subj[data.index]
    features = hstack([features, features_arg])
    features_t = vectorizer.transform(data_t.Tweet)
    features_arg_t = test_arg_lex[data_t.index]
    features_ner_t = ner_features(ner_t_tagged_stnfrd.ix[data_t.index])
    features_mpqa_t = test_mpqa[data_t.index]
    features_senti_t = test_senti[data_t.index]
    features_subj_t = test_subj[data_t.index]
    features_sent_t = create_dummy_cat_cols(data_t, ["Sentiment"])[0].ix[:,0:2]
    features_t = hstack([features_t, features_arg_t])
    return features, features_t



In [151]:
##All features
def create_features(data, data_t):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(ngram_range = (1,5), min_df=5, stop_words='english', strip_accents='ascii')
    features = vectorizer.fit_transform(data.Tweet)
    features_ner = ner_features(ner_tagged_stnfrd.ix[data.index])
    features_sent = create_dummy_cat_cols(data, ["Sentiment"])[0].ix[:,0:2]
    features_arg = train_arg_lex[data.index]
    features_mpqa = train_mpqa[data.index]
    features_senti = train_senti[data.index]
    features_subj = train_subj[data.index]
    features = hstack([features_ner, features, features_arg, features_mpqa, features_senti, features_subj])
    features_t = vectorizer.transform(data_t.Tweet)
    features_arg_t = test_arg_lex[data_t.index]
    features_ner_t = ner_features(ner_t_tagged_stnfrd.ix[data_t.index])
    features_mpqa_t = test_mpqa[data_t.index]
    features_senti_t = test_senti[data_t.index]
    features_subj_t = test_subj[data_t.index]
    features_sent_t = create_dummy_cat_cols(data_t, ["Sentiment"])[0].ix[:,0:2]
    features_t = hstack([features_ner_t, features_t, features_arg_t, features_mpqa_t, features_senti_t, features_subj_t])
    return features, features_t

In [152]:
# Running features 
train_f,test_f = create_features(train,test)

In [153]:

train_f.toarray()
test_f.toarray()

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.80000000e+01,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00, ...,
          1.90000000e+01,  -3.33333333e-03,   3.53333333e-01],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.60000000e+01,   3.44444444e-01,   7.44444444e-01],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          6.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00, ...,
          1.30000000e+01,  -4.00000000e-01,   5.50000000e-01],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.70000000e+01,   4.00000000e-01,   8.00000000e-01]])

In [154]:
# Converting to array for data frame
train_final = np.hstack([train_f.toarray(),train.Target.reshape(2914,1),train.Stance.reshape(2914,1)])
test_final = np.hstack([test_f.toarray(),test.Target.reshape(1956,1), test.Stance.reshape(1956,1)])

train_final_list = list(train_final)
test_final_list = list(test_final)

df = pd.DataFrame(train_final_list)
df_test = pd.DataFrame(test_final_list)


In [155]:
df_test.head()
#df_test_feminism.shape

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1195,1196,1197,1198,1199,1200,1201,1202,1203,1204
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,Atheism,AGAINST
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,19.0,-0.003333,0.353333,Atheism,AGAINST
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,2.0,16.0,0.344444,0.744444,Atheism,AGAINST
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,5.0,-0.25,0.5,Atheism,AGAINST
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,-0.25,0.625,Atheism,AGAINST


In [156]:
# Creating seperate train datasets
i_train = 1203
df_hillary = df[df[i_train]=="Hillary Clinton"]
df_legalization = df[df[i_train]=="Legalization of Abortion"]
df_atheism = df[df[i_train]=="Atheism"]
df_climate = df[df[i_train]=="Climate Change is a Real Concern"]
df_feminism = df[df[i_train]=="Feminist Movement"]


In [157]:
# Creating seperate test datasets
i_test = 1203
df_test_hillary = df_test[df_test[i_test]=="Hillary Clinton"]
df_test_legalization = df_test[df_test[i_test]=="Legalization of Abortion"]
df_test_atheism = df_test[df_test[i_test]=="Atheism"]
df_test_climate = df_test[df_test[i_test]=="Climate Change is a Real Concern"]
df_test_feminism = df_test[df_test[i_test]=="Feminist Movement"]

In [158]:
df_test_hillary.head()
#df_test_climate.head()
#len(df_test_atheism)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1195,1196,1197,1198,1199,1200,1201,1202,1203,1204
674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.6,Hillary Clinton,AGAINST
675,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,Hillary Clinton,AGAINST
676,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,2.0,20.0,-0.027778,0.122222,Hillary Clinton,AGAINST
677,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,13.0,-0.095833,0.233333,Hillary Clinton,AGAINST
678,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,19.0,0.0,0.0,Hillary Clinton,AGAINST


In [159]:
# Dropping the target column from test and train data 

df_climate.drop(i_train,1, inplace=True)
df_hillary.drop(i_train,1, inplace=True)
df_atheism.drop(i_train,1, inplace=True)
df_feminism.drop(i_train,1, inplace=True)
df_legalization.drop(i_train,1, inplace=True)


df_test_climate.drop(i_test,1, inplace=True)
df_test_hillary.drop(i_test,1, inplace=True)
df_test_atheism.drop(i_test,1, inplace=True)
df_test_feminism.drop(i_test,1, inplace=True)
df_test_legalization.drop(i_test,1, inplace=True)

#df_climate = df_climate.rename(columns={6068: 6067})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set

In [160]:
#Saving data to csv for use with timbl 
type_of_data = "all_opt"
df_hillary.to_csv("train_"+type_of_data+"_hillary.csv", sep="\t", index=False)
df_test_hillary.to_csv("test_"+type_of_data+"_hillary.csv", sep="\t", index=False)

df_atheism.to_csv("train_"+type_of_data+"_atheism.csv", sep="\t", index=False)
df_test_atheism.to_csv("test_"+type_of_data+"_atheism.csv", sep="\t", index=False)

df_feminism.to_csv("train_"+type_of_data+"_feminism.csv", sep="\t", index=False)
df_test_feminism.to_csv("test_"+type_of_data+"_feminism.csv", sep="\t", index=False)

df_legalization.to_csv("train_"+type_of_data+"_legalization.csv", sep="\t", index=False)
df_test_legalization.to_csv("test_"+type_of_data+"_legalization.csv", sep="\t", index=False)

df_climate.to_csv("train_"+type_of_data+"_climate.csv", sep="\t", index=False)
df_test_climate.to_csv("test_"+type_of_data+"_climate.csv", sep="\t", index=False)

In [161]:
#df_hillary.drop(9176,1, inplace=True)

In [162]:
df_test_hillary

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1194,1195,1196,1197,1198,1199,1200,1201,1202,1204
674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.000000,0.600000,AGAINST
675,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.000000,0.000000,AGAINST
676,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,2.0,20.0,-0.027778,0.122222,AGAINST
677,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,13.0,-0.095833,0.233333,AGAINST
678,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,19.0,0.000000,0.000000,AGAINST
679,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,17.0,0.000000,0.000000,AGAINST
680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.000000,0.000000,AGAINST
681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.100000,0.050000,AGAINST
682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,9.0,0.400000,0.800000,AGAINST
683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,-0.800000,1.000000,AGAINST


In [23]:
df_hillary

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1163,1164,1165,1166,1167,1168,1169,1170,1171,1173
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AGAINST
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,FAVOR
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AGAINST
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AGAINST
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,NONE
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AGAINST
6,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,NONE
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,NONE
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AGAINST
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AGAINST


In [227]:
df_new = pd.read_csv("/Users/Abhishek/Downloads/codeforproducingdatasetfortimble/Dataset/all/hillary.csv")
hillary = df_new[[9176,9177]]
hillary.to_csv("/Users/Abhishek/Downloads/codeforproducingdatasetfortimble/Dataset/all/hillary2.csv")

In [None]:
def f_score(predictions, actual, class_type='AGAINST'):
    tot_class_pred = sum(predictions==class_type)
    correctly_classified = sum(((actual==class_type).values)&(predictions==class_type))
    tot_class_act = sum(((actual==class_type).values))
    if tot_class_pred == 0:
        prec = 0
    else:
        prec = float(correctly_classified/tot_class_pred)
    if tot_class_act == 0:
        recall = 1
    else:
        recall = float(correctly_classified/tot_class_act)
    if prec+recall==0:
        return 0
    else:
        return ((2*prec*recall)/(prec+recall))
##Final score
def custom_scorer(actual, predictions):
    against = f_score(predictions, actual, class_type='AGAINST')
    favor = f_score(predictions, actual, class_type='FAVOR')
    return (favor+against)/2

In [323]:
import csv
file_location = "all_opt/legalization"
with open("/Users/Abhishek/Downloads/codeforproducingdatasetfortimble/Dataset/"+file_location+".csv", 'r') as f:
    reader = csv.reader(f)
    prediction_list = list(reader)
    prediction_list = prediction_list[1:]

actual=[]
predicted =[]
for i in range(len(prediction_list)):
    actual.append(prediction_list[i][0])
    predicted.append(prediction_list[i][1])
actual = pd.Series(actual)    
predicted = pd.Series(predicted)

custom_scorer(actual,predicted)

0.42611424984306334

In [None]:
atheism

In [None]:
climate

In [286]:
feminism

In [287]:
hillary

0.42063492063492064

In [None]:
legalization