In [85]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC

In [4]:
# Load the raw tweet table from a CSV file under the current user's home directory.
# NOTE(review): the location is hard-coded; consider a configurable data directory.
tweets = pd.read_csv('~/Datasets/tweets.csv')

In [5]:
# Preview the first rows of the raw table.
tweets.head()

Unnamed: 0,id,handle,text,is_retweet,original_author,time,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,is_quote_status,...,place_type,place_country_code,place_country,place_contained_within,place_attributes,place_bounding_box,source_url,truncated,entities,extended_entities
0,780925634159796224,HillaryClinton,The question in this election: Who can put the...,False,,2016-09-28T00:22:34,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Xr...,{'media': [{'display_url': 'pic.twitter.com/Xr...
1,780916180899037184,HillaryClinton,"Last night, Donald Trump said not paying taxes...",True,timkaine,2016-09-27T23:45:00,,,,False,...,,,,,,,http://twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/t0...,{'media': [{'display_url': 'pic.twitter.com/t0...
2,780911564857761793,HillaryClinton,Couldn't be more proud of @HillaryClinton. Her...,True,POTUS,2016-09-27T23:26:40,,,,False,...,,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [{'id_str': '1536791610', 'n...",
3,780907038650068994,HillaryClinton,"If we stand together, there's nothing we can't...",False,,2016-09-27T23:08:41,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Q3...,{'media': [{'display_url': 'pic.twitter.com/Q3...
4,780897419462602752,HillaryClinton,Both candidates were asked about how they'd co...,False,,2016-09-27T22:30:27,,,,False,...,,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [], 'symbols': [], 'urls': [...",


In [6]:
# List every column name in the raw table.
tweets.columns

Index([u'id', u'handle', u'text', u'is_retweet', u'original_author', u'time',
       u'in_reply_to_screen_name', u'in_reply_to_status_id',
       u'in_reply_to_user_id', u'is_quote_status', u'lang', u'retweet_count',
       u'favorite_count', u'longitude', u'latitude', u'place_id',
       u'place_full_name', u'place_name', u'place_type', u'place_country_code',
       u'place_country', u'place_contained_within', u'place_attributes',
       u'place_bounding_box', u'source_url', u'truncated', u'entities',
       u'extended_entities'],
      dtype='object')

In [7]:
# Count missing values per column.
tweets.isnull().sum()

id                            0
handle                        0
text                          0
is_retweet                    0
original_author            5722
time                          0
in_reply_to_screen_name    6236
in_reply_to_status_id      6242
in_reply_to_user_id        6236
is_quote_status               0
lang                          0
retweet_count                 0
favorite_count                0
longitude                  6432
latitude                   6432
place_id                   6240
place_full_name            6240
place_name                 6240
place_type                 6240
place_country_code         6240
place_country              6240
place_contained_within     6240
place_attributes           6240
place_bounding_box         6240
source_url                    0
truncated                     0
entities                      0
extended_entities          5096
dtype: int64

In [8]:
# (rows, columns) of the raw table.
tweets.shape

(6444, 28)

In [18]:
# Keep only the rows whose is_retweet flag is False.
twr = tweets.loc[tweets['is_retweet'] == False]

In [15]:
# (rows, columns) remaining after the retweet filter.
twr.shape

(5722, 28)

In [19]:
# Keep only the columns used downstream. The explicit .copy() makes `twr` an
# independent frame, so the column assignments in later cells neither raise
# pandas' SettingWithCopyWarning nor risk writing into a temporary slice.
twr = twr[['handle', 'text', 'time']].copy()

In [21]:
# Preview the reduced frame (handle, text, time).
twr.head()

Unnamed: 0,handle,text,time
0,HillaryClinton,The question in this election: Who can put the...,2016-09-28T00:22:34
3,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27T23:08:41
4,HillaryClinton,Both candidates were asked about how they'd co...,2016-09-27T22:30:27
5,realDonaldTrump,Join me for a 3pm rally - tomorrow at the Mid-...,2016-09-27T22:13:24
6,HillaryClinton,This election is too important to sit out. Go ...,2016-09-27T21:35:28


In [22]:
# Binary target: 1 when the handle is exactly "realDonaldTrump", 0 for any other handle.
# A vectorised comparison replaces the per-row Python lambda.
twr['is_trump'] = (twr['handle'] == "realDonaldTrump").astype(int)

In [23]:
# Check that the new is_trump column lines up with the handle column.
twr.head()

Unnamed: 0,handle,text,time,is_trump
0,HillaryClinton,The question in this election: Who can put the...,2016-09-28T00:22:34,0
3,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27T23:08:41,0
4,HillaryClinton,Both candidates were asked about how they'd co...,2016-09-27T22:30:27,0
5,realDonaldTrump,Join me for a 3pm rally - tomorrow at the Mid-...,2016-09-27T22:13:24,1
6,HillaryClinton,This election is too important to sit out. Go ...,2016-09-27T21:35:28,0


## preprocessing

### lowercase and remove punctuation

In [34]:
def remove_punctuation(text):
    """Return ``text`` lowercased with every ``string.punctuation`` character removed.

    Whitespace is left untouched, so removing a punctuation mark that sat
    between two spaces leaves both spaces in place.
    """
    punctuation = frozenset(string.punctuation)
    kept_chars = [char for char in text.lower() if char not in punctuation]
    return "".join(kept_chars)

In [42]:
# Store a lowercased, punctuation-free version of each tweet in a new column.
twr['processed_text'] = twr['text'].apply(remove_punctuation)

In [43]:
# Compare the raw text with the punctuation-stripped text.
twr.head()

Unnamed: 0,handle,text,time,is_trump,processed_text
0,HillaryClinton,The question in this election: Who can put the...,2016-09-28T00:22:34,0,the question in this election who can put the ...
3,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27T23:08:41,0,if we stand together theres nothing we cant do...
4,HillaryClinton,Both candidates were asked about how they'd co...,2016-09-27T22:30:27,0,both candidates were asked about how theyd con...
5,realDonaldTrump,Join me for a 3pm rally - tomorrow at the Mid-...,2016-09-27T22:13:24,1,join me for a 3pm rally tomorrow at the midam...
6,HillaryClinton,This election is too important to sit out. Go ...,2016-09-27T21:35:28,0,this election is too important to sit out go t...


### make lowercase and remove stopwords

In [44]:
def remove_stopwords(x, stop_words=None):
    """Lowercase ``x`` and drop every space-separated token that is a stop word.

    Parameters
    ----------
    x : str
        Text to filter. It is split on single spaces only, so empty tokens
        produced by consecutive spaces are preserved.
    stop_words : iterable of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The kept tokens, each followed by one space (so a non-empty result
        always ends with a trailing space).
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call: the original reloaded the NLTK list and
    # scanned it linearly for every single token.
    stop_set = frozenset(stop_words)
    kept = [token for token in x.lower().split(" ") if token not in stop_set]
    return "".join(token + " " for token in kept)

In [45]:
# Overwrite processed_text with its stop-word-free version.
# NOTE(review): this reuses the same column name, so the punctuation-only
# version from the previous step is no longer available after this cell.
twr['processed_text'] = twr['processed_text'].apply(remove_stopwords)



In [46]:
# Inspect the fully processed text.
twr.head()

Unnamed: 0,handle,text,time,is_trump,processed_text
0,HillaryClinton,The question in this election: Who can put the...,2016-09-28T00:22:34,0,question election put plans action make life b...
3,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27T23:08:41,0,stand together theres nothing cant \n\nmake su...
4,HillaryClinton,Both candidates were asked about how they'd co...,2016-09-27T22:30:27,0,candidates asked theyd confront racial injusti...
5,realDonaldTrump,Join me for a 3pm rally - tomorrow at the Mid-...,2016-09-27T22:13:24,1,join 3pm rally tomorrow midamerica center cou...
6,HillaryClinton,This election is too important to sit out. Go ...,2016-09-27T21:35:28,0,election important sit go httpstcottgeqxnqym m...


# Selecting x and y, train/test split

In [49]:
# Feature: the processed tweet text. Target: the 0/1 is_trump flag.
x = twr['processed_text']
y = twr['is_trump']

In [50]:
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# count vectorizer

In [52]:
# Token-count vectorizer with default settings.
cv = CountVectorizer()

In [57]:
# Learn the vocabulary from the training text and expand the sparse counts
# into a dense frame with one column per vocabulary term.
df = pd.DataFrame(data=cv.fit_transform(x_train).toarray(),
                  columns=cv.get_feature_names())

In [58]:
# Preview the training count matrix.
df.head()

Unnamed: 0,00,007cigarjoe,01,079,10,100,1000,10000,100000,1000000,...,zandi,zero,zika,zip,zogby,zucker,zuckerman,él,única,únicos
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# Encode the test text with the vocabulary already fitted on the training
# text (transform only, no refit), giving the same columns as `df`.
df_test = pd.DataFrame(data=cv.transform(x_test).toarray(),
                       columns=cv.get_feature_names())

In [60]:
# Preview the test count matrix.
df_test.head()

Unnamed: 0,00,007cigarjoe,01,079,10,100,1000,10000,100000,1000000,...,zandi,zero,zika,zip,zogby,zucker,zuckerman,él,única,únicos
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Modeling

In [86]:
# Candidate classifiers. random_state is pinned on the estimators that use
# randomness while fitting, so repeated runs produce the same scores.
rf = RandomForestClassifier(min_samples_leaf=2, random_state=42)
xgb = XGBClassifier()
svc = SVC(verbose=True)
nusvc = NuSVC()
linsvc = LinearSVC(random_state=42)

In [63]:
def run_model(x_train, y_train, x_test, y_test, model):
    """Fit ``model`` on the training data and print an evaluation summary.

    Printed, in order: the majority-class baseline accuracy on ``y_test``,
    the model's score on the training and test sets, a labelled 2x2
    confusion matrix, and the classification report for the test set.

    Parameters
    ----------
    x_train, y_train : training features and labels, passed to ``model.fit``.
    x_test, y_test : held-out features and labels used for scoring.
    model : object exposing ``fit``, ``score`` and ``predict``.

    Returns
    -------
    None. The summary is written to standard output; ``model`` is fitted in place.
    """
    model.fit(x_train, y_train)

    # Accuracy of always predicting the most frequent test label. The plain
    # mean of y_test is only the share of 1s, which under-reports the baseline
    # whenever class 0 is the majority.
    baseline = pd.Series(y_test).value_counts(normalize=True).max()

    # Each print takes one pre-built string so the output is the same whether
    # `print` is the Python 2 statement or the Python 3 function.
    print("Base model score: " + str(baseline)[:6])
    print("Training set score:  " + str(model.score(x_train, y_train))[:6])
    print("Test set score:  " + str(model.score(x_test, y_test))[:6])

    predictions = model.predict(x_test)
    # The row/column labels assume exactly two classes, 0 and 1.
    matrix = pd.DataFrame(confusion_matrix(y_test, predictions),
                          columns=['predicted_0', 'predicted_1'],
                          index=['is_0', 'is_1'])
    print("\nConfusion Matrix:\n" + str(matrix))
    print("\nClassification Report:\n" + str(classification_report(y_test, predictions)))

In [None]:
# Manual fit-and-score of the random forest.
# NOTE(review): only the last expression of a cell is displayed, so the
# training score on the middle line is computed and then discarded.
rf.fit(df, y_train)
rf.score(df, y_train)
rf.score(df_test, y_test)

In [84]:
# Evaluate the random forest.
run_model(df, y_train, df_test, y_test, rf)

Base model score: 0.5450
Training set score:  0.9324
Test set score:  0.8860

Confusion Matrix:
      predicted_0  predicted_1
is_0          579           72
is_1           91          689

Classification Report:
             precision    recall  f1-score   support

          0       0.86      0.89      0.88       651
          1       0.91      0.88      0.89       780

avg / total       0.89      0.89      0.89      1431



In [69]:
# Evaluate the XGBoost classifier.
run_model(df, y_train, df_test, y_test, xgb)

Base model score: 0.5450
Training set score:  0.8364
Test set score:  0.8245

Confusion Matrix:
      predicted_0  predicted_1
is_0          599           52
is_1          199          581

Classification Report:
             precision    recall  f1-score   support

          0       0.75      0.92      0.83       651
          1       0.92      0.74      0.82       780

avg / total       0.84      0.82      0.82      1431



In [87]:
# Evaluate the linear support-vector classifier.
run_model(df, y_train, df_test, y_test, linsvc)

Base model score: 0.5450
Training set score:  0.9997
Test set score:  0.9168

Confusion Matrix:
      predicted_0  predicted_1
is_0          596           55
is_1           64          716

Classification Report:
             precision    recall  f1-score   support

          0       0.90      0.92      0.91       651
          1       0.93      0.92      0.92       780

avg / total       0.92      0.92      0.92      1431

