In [69]:
import re
import pandas as pd
import numpy as np
import warnings
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from wordsegment import segment
from html import unescape
import itertools
import random
warnings.filterwarnings('ignore')


from sklearn.cross_validation import train_test_split,cross_val_score, KFold,cross_val_predict
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import precision_score,recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
xl_file = pd.ExcelFile("testing-Obama-Romney-tweets.xlsx", encoding='utf-8')

In [3]:
dfs = {sheet_name: xl_file.parse('Obama',encoding='utf-8') 
          for sheet_name in xl_file.sheet_names}
obamaData=dfs.get('Obama')

In [4]:
dfs = {sheet_name: xl_file.parse('Romney',encoding='utf-8') 
          for sheet_name in xl_file.sheet_names}
romneyData=dfs.get('Romney')

In [7]:
def clearColumns(dataDF):
    dataDF=dataDF.drop(dataDF.columns[[1,2,3]], axis=1)
    dataDF.columns = ['tweet', 'label']
    dataDF = dataDF.ix[1:]
    print(dataDF)
    print(dataDF['label'].unique())
    dataDF=dataDF[dataDF['label'].isin([0,1,-1,u'-1',u'1',u'0'])]
    return dataDF

In [8]:
obamaDataDF = clearColumns(obamaData)
romneyDataDF = clearColumns(romneyData)


                                                  tweet label
1     <e>Obama</e> wants to tax foreign earnings. Th...    -1
2     <e>Obama</e> has to maintain his professionali...     1
3     I hate <e>Obama</e> with a BURNING PASSION #de...    -1
4     I don't like<e>Obama</e> because his stupid <a...    -1
5     The only thing I don't like about <e>Obama</e>...    -1
6     <e>Obama</e>snuck the 47% in when<e>Romney </e...     2
7     The people who vote for<e>Obama</e> dont reali...    -1
8     It's difficult to give <e>Obama</e> a grade wh...    -1
9     cspanwj <e>Obama</e> LOST IN BOTH STYLE AND SU...    -1
10    <e>Obama</e><a> lied about his rosé garden cla...    -1
11    <e>Obama</e> went into the debate swinging and...     1
12    <e>Obama</e> came across as bitter, condescend...    -1
13                 I literally can't stand<e>Obama</e>.    -1
14    Can anyone who is a <e>Obama</e> supporter giv...     0
15    I loved when <e>Romney </e>kept asking <e>Obam...     2
16    Th

In [9]:
def cleanURLS(tweet):
    tweet=re.sub('http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', '', tweet)
    return tweet
    
def getText_fromHTML(tweet):
    soup = BeautifulSoup(tweet)
    tweet = soup.get_text()
    return tweet

def removeAppostophes(tweet):
    appostophes = {"s": "is", "re": "are", "em": "them", "ll": "will", "t": "it", "m": "am", "ve": "have", "d": "did"}
    words = []
    [words.append(terms) for word in tweet.split() for terms in [appostophes[term] if term in appostophes else term for
                                                                 term in word.split("'")]]
    tweet = " ".join(words)
    return tweet

def splitAttachedwords(tweet):
    tweet = " ".join(segment(tweet))
    return tweet

def lemmatisation(tweet):
    return " ".join(WordNetLemmatizer().lemmatize(word) for word in tweet.split())

def stemming(tweet):
    return " ".join(PorterStemmer().stem(word) for word in tweet.split())

In [10]:
def cleanTweets(dataDF):
    rowsToBeRemoved=[]
    TAG_RE = re.compile(r'<[^>]+>')
    for index, row in dataDF.iterrows():
        tweet=row['tweet']
        if isinstance(row['label'], str) == True:
            row['label']=int(row['label'])
        output=""      
        if isinstance(tweet, str) == False:
            rowsToBeRemoved.append(index)
            continue
        tweet=tweet.lower()
        
        # remove the urls from tweet
        tweet = cleanURLS(tweet)

        # remove HTML tags
        tweet = getText_fromHTML(tweet)
        
        # remove appostophes
        tweet = removeAppostophes(tweet)
        
        # split attached words ex: goodboy = good boy
        tweet = splitAttachedwords(tweet)
        
        for word in tweet.split(" "):
            # removing usernames
            if word.strip().startswith('@'):
                word=""
            # removing hashtags
            if word.strip().startswith('#'):
                word=word[1:]
            # strip punctuation
            word=re.sub(r'[^\w\s]','',word)
            if word != "" and word.isalpha():
                output+=" "+word
        if len(output)<1:
            rowsToBeRemoved.append(index)
        output = lemmatisation(output)
        output = stemming(output)   
        row['tweet']=output.strip()
    return dataDF.dropna()

In [11]:
obamaCleanedData=cleanTweets(obamaDataDF.copy(deep=True))
romneyCleanedData=cleanTweets(romneyDataDF.copy(deep=True))
# romneyCleanedData.dropna(axis=0,inplace=True)

## For Obama Data

In [60]:
obamaData = pd.read_csv("obamaCleanedData.csv")
print(len(obamaData))
obamaData.dropna(axis=0,inplace=True)
print(len(obamaData))

5624
5600


In [99]:
count_vect = CountVectorizer(max_features=4800)
tfidf_transformer = TfidfTransformer()
# Training Data
obamaData_counts = count_vect.fit_transform(obamaData['tweet'])
obamaData_tfidf = tfidf_transformer.fit_transform(obamaData_counts)
obamaData_labels = obamaData['label']

#Testing Data
obamaCleanedData_counts = count_vect.transform(obamaCleanedData['tweet'])
obamaCleanedData_labels = obamaCleanedData['label']


## Naive Bayes

In [101]:
clf_mnb = MultinomialNB()
fit_data = clf_mnb.fit(obamaData_counts,obamaData_labels)
predictions = clf_mnb.predict(obamaCleanedData_counts)
obamaCleanedData_labels = pd.to_numeric(obamaCleanedData_labels)
accuracy = ((predictions==obamaCleanedData_labels).sum()*100/float(len(obamaCleanedData_labels)))
confusion = confusion_matrix(obamaCleanedData_labels, predictions)
print(accuracy)
print(classification_report(obamaCleanedData_labels,predictions))
print(confusion)


56.4102564103
             precision    recall  f1-score   support

         -1       0.56      0.68      0.61       687
          0       0.55      0.47      0.51       681
          1       0.58      0.54      0.56       582

avg / total       0.56      0.56      0.56      1950

[[467 129  91]
 [224 321 136]
 [141 129 312]]


## Random Forest

In [102]:
clf_rfc = RandomForestClassifier(n_estimators=22,class_weight="balanced_subsample")
fit_data = clf_rfc.fit(obamaData_counts,obamaData_labels)
predictions = clf_rfc.predict(obamaCleanedData_counts)
obamaCleanedData_labels = pd.to_numeric(obamaCleanedData_labels)
accuracy = ((predictions==obamaCleanedData_labels).sum()*100/float(len(obamaCleanedData_labels)))
confusion = confusion_matrix(obamaCleanedData_labels, predictions)
print(accuracy)
print(classification_report(obamaCleanedData_labels,predictions))
print(confusion)


52.2051282051
             precision    recall  f1-score   support

         -1       0.52      0.59      0.55       687
          0       0.50      0.48      0.49       681
          1       0.55      0.49      0.52       582

avg / total       0.52      0.52      0.52      1950

[[405 180 102]
 [224 325 132]
 [146 148 288]]


## Support Vector

In [103]:
clf_svc = LinearSVC(C=0.5,loss="hinge",multi_class="ovr",penalty="l2")
fit_data = clf_svc.fit(obamaData_counts,obamaData_labels)
predictions = clf_svc.predict(obamaCleanedData_counts)
obamaCleanedData_labels = pd.to_numeric(obamaCleanedData_labels)
accuracy = ((predictions==obamaCleanedData_labels).sum()*100/float(len(obamaCleanedData_labels)))
confusion = confusion_matrix(obamaCleanedData_labels, predictions)
print(accuracy)
print(classification_report(obamaCleanedData_labels,predictions))
print(confusion)


55.1794871795
             precision    recall  f1-score   support

         -1       0.57      0.58      0.57       687
          0       0.53      0.53      0.53       681
          1       0.56      0.54      0.55       582

avg / total       0.55      0.55      0.55      1950

[[396 173 118]
 [183 364 134]
 [112 154 316]]


## Voting Classifier - Naive Bayes, Random Forest, Neural Network

In [100]:
clf_voting = VotingClassifier(estimators=[('mnb', clf_mnb), ('rfc', clf_rfc)],
                              voting='soft',n_jobs=-1)
fit_data = clf_voting.fit(obamaData_counts,obamaData_labels)
predictions = clf_voting.predict(obamaCleanedData_counts)
obamaCleanedData_labels = pd.to_numeric(obamaCleanedData_labels)
accuracy = ((predictions==obamaCleanedData_labels).sum()*100/float(len(obamaCleanedData_labels)))
confusion = confusion_matrix(obamaCleanedData_labels, predictions)
print(accuracy)
print(classification_report(obamaCleanedData_labels,predictions))
print(confusion)


57.7948717949
             precision    recall  f1-score   support

         -1       0.58      0.68      0.63       687
          0       0.57      0.49      0.52       681
          1       0.59      0.56      0.57       582

avg / total       0.58      0.58      0.58      1950

[[469 134  84]
 [203 333 145]
 [136 121 325]]


## Romney Data

In [74]:
romneyData = pd.read_csv("romneyCleanedData.csv")
print(len(romneyData))
romneyData.dropna(axis=0,inplace=True)
print(len(romneyData))

5648
5640


In [93]:
count_vect = CountVectorizer(max_features=1500)
tfidf_transformer = TfidfTransformer()
# Training Data
train_romney_counts = count_vect.fit_transform(romneyData['tweet'])
train_romney_labels = romneyData['label']

#Testing Data
test_romney_counts = count_vect.transform(romneyCleanedData['tweet'])
test_romney_labels = romneyCleanedData['label']


## Voting Classifier - Naive Bayes, Random Forest, Neural Network

In [94]:
clf_voting = VotingClassifier(estimators=[('mnb', clf_mnb), ('rfc', clf_rfc)],
                              voting='soft',n_jobs=-1)
fit_data = clf_voting.fit(train_romney_counts,train_romney_labels)
predictions = clf_voting.predict(test_romney_counts)
test_romney_labels = pd.to_numeric(test_romney_labels)
accuracy = ((predictions==test_romney_labels).sum()*100/float(len(test_romney_labels)))
confusion = confusion_matrix(test_romney_labels, predictions)
print(accuracy)
print(classification_report(test_romney_labels,predictions))
print(confusion)


58.3464981569
             precision    recall  f1-score   support

         -1       0.63      0.78      0.69       960
          0       0.47      0.42      0.45       555
          1       0.60      0.33      0.43       384

avg / total       0.58      0.58      0.57      1899

[[746 176  38]
 [274 234  47]
 [172  84 128]]


## Romney Sampled Data

In [78]:
romney_sampledData = pd.read_csv("romneyCleanedData_with_sampling_2.csv")
print(len(romney_sampledData))
romney_sampledData.dropna(axis=0,inplace=True)
print(len(romney_sampledData))

6242
6232


In [95]:
count_vect = CountVectorizer(max_features=1500)
tfidf_transformer = TfidfTransformer()
# Training Data
train_romney_counts = count_vect.fit_transform(romney_sampledData['tweet'])
train_romney_labels = romney_sampledData['label']

#Testing Data
test_romney_counts = count_vect.transform(romneyCleanedData['tweet'])
test_romney_labels = romneyCleanedData['label']


## Voting Classifier - Naive Bayes, Random Forest, Neural Network

In [96]:
clf_voting = VotingClassifier(estimators=[('mnb', clf_mnb), ('rfc', clf_rfc)],
                              voting='soft',n_jobs=-1)
fit_data = clf_voting.fit(train_romney_counts,train_romney_labels)
predictions = clf_voting.predict(test_romney_counts)
test_romney_labels = pd.to_numeric(test_romney_labels)
accuracy = ((predictions==test_romney_labels).sum()*100/float(len(test_romney_labels)))
confusion = confusion_matrix(test_romney_labels, predictions)
print(accuracy)
print(classification_report(test_romney_labels,predictions))
print(confusion)


57.9778830964
             precision    recall  f1-score   support

         -1       0.63      0.75      0.69       960
          0       0.47      0.41      0.44       555
          1       0.57      0.40      0.47       384

avg / total       0.57      0.58      0.57      1899

[[720 181  59]
 [271 227  57]
 [151  79 154]]


In [87]:
romney_sampledData['label'].value_counts()

-1    2891
 0    1679
 1    1662
Name: label, dtype: int64

## Romney sampled data 2

In [82]:
romney_sampled2Data = pd.read_csv("romneyCleanedData_with_sampling.csv")
print(len(romney_sampled2Data))
romney_sampled2Data.dropna(axis=0,inplace=True)
print(len(romney_sampled2Data))

6642
6629


In [97]:
count_vect = CountVectorizer(max_features=1500)
tfidf_transformer = TfidfTransformer()
# Training Data
train_romney_counts = count_vect.fit_transform(romney_sampled2Data['tweet'])
train_romney_labels = romney_sampled2Data['label']

#Testing Data
test_romney_counts = count_vect.transform(romneyCleanedData['tweet'])
test_romney_labels = romneyCleanedData['label']


## Voting Classifier - Naive Bayes, Random Forest, Neural Network

In [98]:
clf_voting = VotingClassifier(estimators=[('mnb', clf_mnb), ('rfc', clf_rfc)],
                              voting='soft',n_jobs=-1)
fit_data = clf_voting.fit(train_romney_counts,train_romney_labels)
predictions = clf_voting.predict(test_romney_counts)
test_romney_labels = pd.to_numeric(test_romney_labels)
accuracy = ((predictions==test_romney_labels).sum()*100/float(len(test_romney_labels)))
confusion = confusion_matrix(test_romney_labels, predictions)
print(accuracy)
print(classification_report(test_romney_labels,predictions))
print(confusion)


58.9257503949
             precision    recall  f1-score   support

         -1       0.64      0.75      0.69       960
          0       0.48      0.43      0.45       555
          1       0.57      0.42      0.49       384

avg / total       0.58      0.59      0.58      1899

[[717 176  67]
 [259 239  57]
 [140  81 163]]


In [88]:
romney_sampled2Data['label'].value_counts()

-1    2891
 1    2059
 0    1679
Name: label, dtype: int64