In [67]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.ensemble import VotingClassifier
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime


In [48]:
tweets = pd.read_csv('~/Datasets/tweets.csv', encoding='utf-8')

In [49]:
tweets.head(1)

Unnamed: 0,id,handle,text,is_retweet,original_author,time,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,is_quote_status,...,place_type,place_country_code,place_country,place_contained_within,place_attributes,place_bounding_box,source_url,truncated,entities,extended_entities
0,780925634159796224,HillaryClinton,The question in this election: Who can put the...,False,,2016-09-28T00:22:34,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Xr...,{'media': [{'display_url': 'pic.twitter.com/Xr...


In [50]:
twr = tweets[tweets['is_retweet'] == False]
twr = twr[['handle', 'text', 'time']]
twr['is_trump'] = twr['handle'].apply(lambda x: 1 if x=="realDonaldTrump" else 0)

In [85]:
twr.isnull().sum()

handle            0
text              0
time              0
is_trump          0
processed_text    0
tod_q1            0
tod_h1            0
tod_h2            0
tod_h3            0
tod_h4            0
tod_h5            0
tod_h6            0
dtype: int64

In [91]:
twr.head(20)

Unnamed: 0,handle,text,time,is_trump,processed_text,tod_q1,tod_h1,tod_h2,tod_h3,tod_h4,tod_h5,tod_h6
0,HillaryClinton,The question in this election: Who can put the...,2016-09-28 00:22:34,0,question elect put plan action make life bette...,1,1,0,0,0,0,0
3,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27 23:08:41,0,stand togeth there noth cant \n\nmake sure you...,0,0,0,0,0,0,1
4,HillaryClinton,Both candidates were asked about how they'd co...,2016-09-27 22:30:27,0,candid ask theyd confront racial injustic one ...,0,0,0,0,0,0,1
5,realDonaldTrump,Join me for a 3pm rally - tomorrow at the Mid-...,2016-09-27 22:13:24,1,join 3pm ralli tomorrow midamerica center cou...,0,0,0,0,0,0,1
6,HillaryClinton,This election is too important to sit out. Go ...,2016-09-27 21:35:28,0,elect import sit go httpstcottgeqxnqym make su...,0,0,0,0,0,0,1
7,HillaryClinton,When Donald Trump goes low...register to vote:...,2016-09-27 21:25:31,0,donald trump goe lowregist vote httpstcottgeqx...,0,0,0,0,0,0,1
8,realDonaldTrump,"Once again, we will have a government of, by a...",2016-09-27 21:08:22,1,govern peopl join movement today httpstcolwjyd...,0,0,0,0,0,0,1
12,realDonaldTrump,Hillary Clinton's Campaign Continues To Make F...,2016-09-27 20:14:33,1,hillari clinton campaign continu make fals cla...,0,0,0,0,0,0,1
13,realDonaldTrump,"'CNBC, Time magazine online polls say Donald T...",2016-09-27 20:06:25,1,cnbc time magazin onlin poll say donald trump ...,0,0,0,0,0,0,1
14,HillaryClinton,Donald Trump lied to the American people at le...,2016-09-27 19:59:28,0,donald trump lie american peopl least 58 time ...,0,0,0,0,0,1,0


In [66]:
twr.dtypes

handle            object
text              object
time              object
is_trump           int64
processed_text    object
dtype: object

In [68]:
twr['time'] = twr['time'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S'))

In [74]:
twr['tod_h1'] = twr['time'].apply(lambda x: 1 if (int(datetime.strftime(x, '%H'))>=0) and (int(datetime.strftime(x, '%H'))<4) else 0)
twr['tod_h2'] = twr['time'].apply(lambda x: 1 if (int(datetime.strftime(x, '%H'))>=4) and (int(datetime.strftime(x, '%H'))<8) else 0)
twr['tod_h3'] = twr['time'].apply(lambda x: 1 if (int(datetime.strftime(x, '%H'))>=8) and (int(datetime.strftime(x, '%H'))<12) else 0)
twr['tod_h4'] = twr['time'].apply(lambda x: 1 if (int(datetime.strftime(x, '%H'))>=12) and (int(datetime.strftime(x, '%H'))<16) else 0)
twr['tod_h5'] = twr['time'].apply(lambda x: 1 if (int(datetime.strftime(x, '%H'))>=16) and (int(datetime.strftime(x, '%H'))<20) else 0)
twr['tod_h6'] = twr['time'].apply(lambda x: 1 if (int(datetime.strftime(x, '%H'))>=20) else 0)

In [73]:
twr.head(2)

Unnamed: 0,handle,text,time,is_trump,processed_text,tod_q1
0,HillaryClinton,The question in this election: Who can put the...,2016-09-28 00:22:34,0,question elect put plan action make life bette...,1
3,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27 23:08:41,0,stand togeth there noth cant \n\nmake sure you...,0


In [69]:
twr.dtypes

handle                    object
text                      object
time              datetime64[ns]
is_trump                   int64
processed_text            object
dtype: object

# Preprocessing: punctuation, stopwords, stemming

In [75]:
stemmer = PorterStemmer()

def remove_punctuation(text):
    lower = text.lower()
    exclude = set(string.punctuation)
    return "".join(ch for ch in lower if ch not in exclude)

def remove_stopwords(x):
    lower = x.lower()
    split = lower.split(" ")
    final = ''
    for i in split:
        if i not in stopwords.words('english'):
            final += (i + " ")
    return final

def split_and_stem(string):
    string = string.split(' ')
    temp = ""
    for i in string:
        temp += (stemmer.stem(i) + " ")
    return temp

In [76]:
twr['processed_text'] = twr['text'].apply(remove_punctuation)
twr['processed_text'] = twr['processed_text'].apply(remove_stopwords)
twr['processed_text'] = twr['processed_text'].apply(split_and_stem)

In [54]:
twr.head(2)

Unnamed: 0,handle,text,time,is_trump,processed_text
0,HillaryClinton,The question in this election: Who can put the...,2016-09-28T00:22:34,0,question elect put plan action make life bette...
3,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27T23:08:41,0,stand togeth there noth cant \n\nmake sure you...


## Defining, x + y, TTS

In [78]:
x = twr[['processed_text', 'tod_h1', 'tod_h2', 'tod_h3', 'tod_h4', 'tod_h5', 'tod_h6']]
y = twr['is_trump']
x_train, x_test, y_train, y_test = train_test_split(x,y)

In [56]:
x.head()

0    question elect put plan action make life bette...
3    stand togeth there noth cant \n\nmake sure you...
4    candid ask theyd confront racial injustic one ...
5    join 3pm ralli  tomorrow midamerica center cou...
6    elect import sit go httpstcottgeqxnqym make su...
Name: processed_text, dtype: object

In [57]:
x_train.head()

5585    mjp1370 realdonaldtrump cruz talk hillari time...
272                     none us get life alon look lift  
1602    crook hillari said loudli world see short circ...
6101    thank america\nmakeamericagreatagain trump2016...
960     go\n\ntrump perfectli clear intent million imm...
Name: processed_text, dtype: object

# TF-IDF

In [79]:
tfidf = TfidfVectorizer()

In [80]:
df_train = pd.DataFrame(tfidf.fit_transform(x_train['processed_text']).todense(), columns=tfidf.get_feature_names())
df_test = pd.DataFrame(tfidf.transform(x_test['processed_text']).todense(), columns=tfidf.get_feature_names())

# Rebuilding dataset

In [94]:
df_train['tod_h1'] = x_train['tod_h1'].values
df_train['tod_h2'] = x_train['tod_h2'].values
df_train['tod_h3'] = x_train['tod_h3'].values
df_train['tod_h4'] = x_train['tod_h4'].values
df_train['tod_h5'] = x_train['tod_h5'].values
df_train['tod_h6'] = x_train['tod_h6'].values

In [96]:
df_test['tod_h1'] = x_test['tod_h1'].values
df_test['tod_h2'] = x_test['tod_h2'].values
df_test['tod_h3'] = x_test['tod_h3'].values
df_test['tod_h4'] = x_test['tod_h4'].values
df_test['tod_h5'] = x_test['tod_h5'].values
df_test['tod_h6'] = x_test['tod_h6'].values

In [104]:
df_test.head()

Unnamed: 0,00,01,079,10,100,1000,10000,100000,1000000,100000000,...,zuluout,él,única,único,tod_h1,tod_h2,tod_h3,tod_h4,tod_h5,tod_h6
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,1,0


In [105]:
x_test.head()

Unnamed: 0,processed_text,tod_h1,tod_h2,tod_h3,tod_h4,tod_h5,tod_h6
6011,interview face nation jdickerson morn enjoy,0,0,0,1,0,0
5839,copyover2001 realdonaldtrump american realli...,0,0,0,0,0,1
4941,thank america trump2016 httpstcoptkkrleucb,0,0,0,0,0,1
4157,let make easier rais famili america httpstcoao...,0,0,0,1,0,0
3084,today vote plan parenthood nh execut council c...,0,0,0,0,1,0


In [106]:
df_test.index = x_test.index

In [107]:
df_test.head()

Unnamed: 0,00,01,079,10,100,1000,10000,100000,1000000,100000000,...,zuluout,él,única,único,tod_h1,tod_h2,tod_h3,tod_h4,tod_h5,tod_h6
6011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
5839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
4941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
4157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
3084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,1,0


In [108]:
x_test.head()

Unnamed: 0,processed_text,tod_h1,tod_h2,tod_h3,tod_h4,tod_h5,tod_h6
6011,interview face nation jdickerson morn enjoy,0,0,0,1,0,0
5839,copyover2001 realdonaldtrump american realli...,0,0,0,0,0,1
4941,thank america trump2016 httpstcoptkkrleucb,0,0,0,0,0,1
4157,let make easier rais famili america httpstcoao...,0,0,0,1,0,0
3084,today vote plan parenthood nh execut council c...,0,0,0,0,1,0


In [87]:
x_test.head()

Unnamed: 0,processed_text,tod_h1,tod_h2,tod_h3,tod_h4,tod_h5,tod_h6
6011,interview face nation jdickerson morn enjoy,0,0,0,1,0,0
5839,copyover2001 realdonaldtrump american realli...,0,0,0,0,0,1
4941,thank america trump2016 httpstcoptkkrleucb,0,0,0,0,0,1
4157,let make easier rais famili america httpstcoao...,0,0,0,1,0,0
3084,today vote plan parenthood nh execut council c...,0,0,0,0,1,0


In [88]:
x_test.shape

(1431, 7)

In [97]:
df_test.head(10)

Unnamed: 0,00,01,079,10,100,1000,10000,100000,1000000,100000000,...,zuluout,él,única,único,tod_h1,tod_h2,tod_h3,tod_h4,tod_h5,tod_h6
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,1,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,1,0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0,0,0,0,0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0


In [89]:
df_test.shape

(1431, 8754)

In [95]:
df_train.head()

Unnamed: 0,00,01,079,10,100,1000,10000,100000,1000000,100000000,...,zuluout,él,única,único,tod_h1,tod_h2,tod_h3,tod_h4,tod_h5,tod_h6
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,1,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,1,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1


In [93]:
x_train.head()

Unnamed: 0,processed_text,tod_h1,tod_h2,tod_h3,tod_h4,tod_h5,tod_h6
5934,bigcarsonrock americastop dupedwak cruz amp ru...,0,0,0,0,1,0
4823,hillari clinton luchará para lo inmigrant para...,0,0,0,0,1,0
1283,soon call mr brexit,0,0,0,1,0,0
3230,get readi open magnific turnberri scotland gre...,0,0,1,0,0,0
369,next week hillari walk stage first presidenti ...,0,0,0,0,0,1


# Modeling

In [60]:
def run_model(x_train, y_train, x_test, y_test, model):
    model.fit(x_train, y_train)
    print "Base model score: " + str(np.mean(y_test))[:6]
    print "Training set score: ", str(model.score(x_train, y_train))[:6]
    print "Test set score: ", str(model.score(x_test, y_test))[:6]
    predictions = model.predict(x_test)
    print "\nConfusion Matrix:\n", pd.DataFrame(confusion_matrix(y_test, predictions), 
                                              columns=['predicted_0', 'predicted_1'], index=['is_0', 'is_1'])
    print "\nClassification Report:\n",classification_report(y_test, predictions)

In [61]:
rf = RandomForestClassifier(min_samples_leaf=3)
et = ExtraTreesClassifier(min_samples_leaf=2)
ad = AdaBoostClassifier()
lin = LinearSVC()

In [98]:
run_model(df_train, y_train, df_test, y_test, rf)

Base model score: 0.5387
Training set score:  0.9137
Test set score:  0.8679

Confusion Matrix:
      predicted_0  predicted_1
is_0          581           79
is_1          110          661

Classification Report:
             precision    recall  f1-score   support

          0       0.84      0.88      0.86       660
          1       0.89      0.86      0.87       771

avg / total       0.87      0.87      0.87      1431



In [99]:
run_model(df_train, y_train, df_test, y_test, et)

Base model score: 0.5387
Training set score:  0.9487
Test set score:  0.8916

Confusion Matrix:
      predicted_0  predicted_1
is_0          590           70
is_1           85          686

Classification Report:
             precision    recall  f1-score   support

          0       0.87      0.89      0.88       660
          1       0.91      0.89      0.90       771

avg / total       0.89      0.89      0.89      1431



In [100]:
run_model(df_train, y_train, df_test, y_test, lin)

Base model score: 0.5387
Training set score:  0.9990
Test set score:  0.9245

Confusion Matrix:
      predicted_0  predicted_1
is_0          598           62
is_1           46          725

Classification Report:
             precision    recall  f1-score   support

          0       0.93      0.91      0.92       660
          1       0.92      0.94      0.93       771

avg / total       0.92      0.92      0.92      1431



In [65]:
# run_model(df_train, y_train, df_test, y_test, ad)