In [47]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.ensemble import VotingClassifier
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer



In [48]:
tweets = pd.read_csv('~/Datasets/tweets.csv', encoding='utf-8')

In [49]:
tweets.head(1)

Unnamed: 0,id,handle,text,is_retweet,original_author,time,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,is_quote_status,...,place_type,place_country_code,place_country,place_contained_within,place_attributes,place_bounding_box,source_url,truncated,entities,extended_entities
0,780925634159796224,HillaryClinton,The question in this election: Who can put the...,False,,2016-09-28T00:22:34,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Xr...,{'media': [{'display_url': 'pic.twitter.com/Xr...


In [50]:
twr = tweets[tweets['is_retweet'] == False]
twr = twr[['handle', 'text', 'time']]
twr['is_trump'] = twr['handle'].apply(lambda x: 1 if x=="realDonaldTrump" else 0)

In [51]:
twr.head(2)

Unnamed: 0,handle,text,time,is_trump
0,HillaryClinton,The question in this election: Who can put the...,2016-09-28T00:22:34,0
3,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27T23:08:41,0


# Preprocessing: punctuation, stopwords, stemming

In [52]:
stemmer = PorterStemmer()

def remove_punctuation(text):
    lower = text.lower()
    exclude = set(string.punctuation)
    return "".join(ch for ch in lower if ch not in exclude)

def remove_stopwords(x):
    lower = x.lower()
    split = lower.split(" ")
    final = ''
    for i in split:
        if i not in stopwords.words('english'):
            final += (i + " ")
    return final

def split_and_stem(string):
    string = string.split(' ')
    temp = ""
    for i in string:
        temp += (stemmer.stem(i) + " ")
    return temp

In [53]:
twr['processed_text'] = twr['text'].apply(remove_punctuation)
twr['processed_text'] = twr['processed_text'].apply(remove_stopwords)
twr['processed_text'] = twr['processed_text'].apply(split_and_stem)

In [54]:
twr.head(2)

Unnamed: 0,handle,text,time,is_trump,processed_text
0,HillaryClinton,The question in this election: Who can put the...,2016-09-28T00:22:34,0,question elect put plan action make life bette...
3,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27T23:08:41,0,stand togeth there noth cant \n\nmake sure you...


## Defining, x + y, TTS

In [55]:
x = twr['processed_text']
y = twr['is_trump']
x_train, x_test, y_train, y_test = train_test_split(x,y)

In [56]:
x.head()

0    question elect put plan action make life bette...
3    stand togeth there noth cant \n\nmake sure you...
4    candid ask theyd confront racial injustic one ...
5    join 3pm ralli  tomorrow midamerica center cou...
6    elect import sit go httpstcottgeqxnqym make su...
Name: processed_text, dtype: object

In [57]:
x_train.head()

5585    mjp1370 realdonaldtrump cruz talk hillari time...
272                     none us get life alon look lift  
1602    crook hillari said loudli world see short circ...
6101    thank america\nmakeamericagreatagain trump2016...
960     go\n\ntrump perfectli clear intent million imm...
Name: processed_text, dtype: object

# TF-IDF

In [58]:
tfidf = TfidfVectorizer()

In [59]:
df_train = pd.DataFrame(tfidf.fit_transform(x_train).todense(), columns=tfidf.get_feature_names())
df_test = pd.DataFrame(tfidf.transform(x_test).todense(), columns=tfidf.get_feature_names())

# Modeling

In [60]:
def run_model(x_train, y_train, x_test, y_test, model):
    model.fit(x_train, y_train)
    print "Base model score: " + str(np.mean(y_test))[:6]
    print "Training set score: ", str(model.score(x_train, y_train))[:6]
    print "Test set score: ", str(model.score(x_test, y_test))[:6]
    predictions = model.predict(x_test)
    print "\nConfusion Matrix:\n", pd.DataFrame(confusion_matrix(y_test, predictions), 
                                              columns=['predicted_0', 'predicted_1'], index=['is_0', 'is_1'])
    print "\nClassification Report:\n",classification_report(y_test, predictions)

In [61]:
rf = RandomForestClassifier(min_samples_leaf=3)
et = ExtraTreesClassifier(min_samples_leaf=2)
ad = AdaBoostClassifier()
lin = LinearSVC()

In [62]:
run_model(df_train, y_train, df_test, y_test, rf)

Base model score: 0.5457
Training set score:  0.9212
Test set score:  0.8832

Confusion Matrix:
      predicted_0  predicted_1
is_0          586           64
is_1          103          678

Classification Report:
             precision    recall  f1-score   support

          0       0.85      0.90      0.88       650
          1       0.91      0.87      0.89       781

avg / total       0.89      0.88      0.88      1431



In [63]:
run_model(df_train, y_train, df_test, y_test, et)

Base model score: 0.5457
Training set score:  0.9510
Test set score:  0.8979

Confusion Matrix:
      predicted_0  predicted_1
is_0          590           60
is_1           86          695

Classification Report:
             precision    recall  f1-score   support

          0       0.87      0.91      0.89       650
          1       0.92      0.89      0.90       781

avg / total       0.90      0.90      0.90      1431



In [64]:
run_model(df_train, y_train, df_test, y_test, lin)

Base model score: 0.5457
Training set score:  0.9990
Test set score:  0.9231

Confusion Matrix:
      predicted_0  predicted_1
is_0          592           58
is_1           52          729

Classification Report:
             precision    recall  f1-score   support

          0       0.92      0.91      0.91       650
          1       0.93      0.93      0.93       781

avg / total       0.92      0.92      0.92      1431



In [65]:
# run_model(df_train, y_train, df_test, y_test, ad)