In [8]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.ensemble import VotingClassifier
from nltk.stem import PorterStemmer

In [19]:
# pd.read_csv?

In [20]:
tweets = pd.read_csv('~/Datasets/tweets.csv', encoding='utf-8')

In [21]:
twr = tweets[tweets['is_retweet'] == False]
twr = twr[['handle', 'text', 'time']]
twr['is_trump'] = twr['handle'].apply(lambda x: 1 if x=="realDonaldTrump" else 0)

# Preprocessing: punctuation, stopwords, stemming

In [22]:
stemmer = PorterStemmer()

def remove_punctuation(text):
    lower = text.lower()
    exclude = set(string.punctuation)
    return "".join(ch for ch in lower if ch not in exclude)

def remove_stopwords(x):
    lower = x.lower()
    split = lower.split(" ")
    final = ''
    for i in split:
        if i not in stopwords.words('english'):
            final += (i + " ")
    return final

def split_and_stem(string):
    string = string.split(' ')
    temp = ""
    for i in string:
        temp += (stemmer.stem(i) + " ")
    return temp

In [23]:
twr['processed_text'] = twr['text'].apply(remove_punctuation)
twr['processed_text'] = twr['processed_text'].apply(remove_stopwords)

In [24]:
twr['processed_text'] = twr['processed_text'].apply(split_and_stem)

In [25]:
twr.head()

Unnamed: 0,handle,text,time,is_trump,processed_text
0,HillaryClinton,The question in this election: Who can put the...,2016-09-28T00:22:34,0,question elect put plan action make life bette...
3,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27T23:08:41,0,stand togeth there noth cant \n\nmake sure you...
4,HillaryClinton,Both candidates were asked about how they'd co...,2016-09-27T22:30:27,0,candid ask theyd confront racial injustic one ...
5,realDonaldTrump,Join me for a 3pm rally - tomorrow at the Mid-...,2016-09-27T22:13:24,1,join 3pm ralli tomorrow midamerica center cou...
6,HillaryClinton,This election is too important to sit out. Go ...,2016-09-27T21:35:28,0,elect import sit go httpstcottgeqxnqym make su...


## Defining, x + y, TTS

In [26]:
x = twr['processed_text']
y = twr['is_trump']
x_train, x_test, y_train, y_test = train_test_split(x,y)

## Count Vectorizer

In [27]:
cv = CountVectorizer()

In [28]:
df_train = pd.DataFrame(cv.fit_transform(x_train).todense(), columns=cv.get_feature_names())
df_test = pd.DataFrame(cv.transform(x_test).todense(), columns=cv.get_feature_names())

# Modeling

In [34]:
rf = RandomForestClassifier(min_samples_leaf=2)

linsvc = LinearSVC()

In [30]:
def run_model(x_train, y_train, x_test, y_test, model):
    model.fit(x_train, y_train)
    print "Base model score: " + str(np.mean(y_test))[:6]
    print "Training set score: ", str(model.score(x_train, y_train))[:6]
    print "Test set score: ", str(model.score(x_test, y_test))[:6]
    predictions = model.predict(x_test)
    print "\nConfusion Matrix:\n", pd.DataFrame(confusion_matrix(y_test, predictions), 
                                              columns=['predicted_0', 'predicted_1'], index=['is_0', 'is_1'])
    print "\nClassification Report:\n",classification_report(y_test, predictions)

In [38]:
run_model(df_train, y_train, df_test, y_test, rf)

Base model score: 0.5471
Training set score:  0.9270
Test set score:  0.8770

Confusion Matrix:
      predicted_0  predicted_1
is_0          570           78
is_1           98          685

Classification Report:
             precision    recall  f1-score   support

          0       0.85      0.88      0.87       648
          1       0.90      0.87      0.89       783

avg / total       0.88      0.88      0.88      1431



In [39]:
run_model(df_train, y_train, df_test, y_test, linsvc)

Base model score: 0.5471
Training set score:  0.9997
Test set score:  0.9147

Confusion Matrix:
      predicted_0  predicted_1
is_0          600           48
is_1           74          709

Classification Report:
             precision    recall  f1-score   support

          0       0.89      0.93      0.91       648
          1       0.94      0.91      0.92       783

avg / total       0.92      0.91      0.91      1431

