In [1]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.ensemble import VotingClassifier
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime



In [2]:
tweets = pd.read_csv('~/Datasets/tweets.csv', encoding='utf-8')

In [3]:
twr = tweets[tweets['is_retweet'] == False]
twr = twr[['handle', 'text', 'time']]
twr['is_trump'] = twr['handle'].apply(lambda x: 1 if x=="realDonaldTrump" else 0)

In [4]:
twr['time'] = twr['time'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S'))

In [6]:
for i in range(0,24):
    twr['hour_' + str(i)] = twr['time'].apply(lambda x: 1 if i==int(datetime.strftime(x, '%H')) else 0)

In [8]:
for i in range(1,13):
    twr['month_'+str(i)] = twr['time'].apply(lambda x: 1 if i==int(datetime.strftime(x, '%m')) else 0)

In [25]:
for i in range(1,8):
    twr['weekday_'+str(1)] = twr['time'].apply(lambda x: 1 if i==int(datetime.strftime(x, '%w')) else 0)

In [9]:
twr.head()

Unnamed: 0,handle,text,time,is_trump,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,HillaryClinton,The question in this election: Who can put the...,2016-09-28 00:22:34,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27 23:08:41,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,HillaryClinton,Both candidates were asked about how they'd co...,2016-09-27 22:30:27,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,realDonaldTrump,Join me for a 3pm rally - tomorrow at the Mid-...,2016-09-27 22:13:24,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,HillaryClinton,This election is too important to sit out. Go ...,2016-09-27 21:35:28,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# Setting X and Y

In [26]:
y = twr['is_trump']
x = twr.drop(['handle', 'text', 'time', 'is_trump'], axis=1)

In [34]:
x_train, x_test, y_train, y_test = train_test_split(x,y)

In [16]:
y.head()

0    0
3    0
4    0
5    1
6    0
Name: is_trump, dtype: int64

In [17]:
x.head()

Unnamed: 0,is_trump,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# Modeling

In [14]:
def run_model(x_train, y_train, x_test, y_test, model):
    model.fit(x_train, y_train)
    print "Base model score: " + str(np.mean(y_test))[:6]
    print "Training set score: ", str(model.score(x_train, y_train))[:6]
    print "Test set score: ", str(model.score(x_test, y_test))[:6]
    predictions = model.predict(x_test)
    print "\nConfusion Matrix:\n", pd.DataFrame(confusion_matrix(y_test, predictions), 
                                              columns=['predicted_0', 'predicted_1'], index=['is_0', 'is_1'])
    print "\nClassification Report:\n",classification_report(y_test, predictions)

In [13]:
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
xgb = XGBClassifier()

In [35]:
run_model(x_train, y_train, x_test, y_test, rf)

Base model score: 0.5380
Training set score:  0.7760
Test set score:  0.7631

Confusion Matrix:
      predicted_0  predicted_1
is_0          544          117
is_1          222          548

Classification Report:
             precision    recall  f1-score   support

          0       0.71      0.82      0.76       661
          1       0.82      0.71      0.76       770

avg / total       0.77      0.76      0.76      1431



In [36]:
run_model(x_train, y_train, x_test, y_test, et)

Base model score: 0.5380
Training set score:  0.7762
Test set score:  0.7651

Confusion Matrix:
      predicted_0  predicted_1
is_0          545          116
is_1          220          550

Classification Report:
             precision    recall  f1-score   support

          0       0.71      0.82      0.76       661
          1       0.83      0.71      0.77       770

avg / total       0.77      0.77      0.77      1431



In [37]:
run_model(x_train, y_train, x_test, y_test, xgb)

Base model score: 0.5380
Training set score:  0.7601
Test set score:  0.7645

Confusion Matrix:
      predicted_0  predicted_1
is_0          593           68
is_1          269          501

Classification Report:
             precision    recall  f1-score   support

          0       0.69      0.90      0.78       661
          1       0.88      0.65      0.75       770

avg / total       0.79      0.76      0.76      1431

