In [20]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime
import os

In [3]:
tweets = pd.read_csv('~/Datasets/tweets.csv', encoding='utf-8')

In [4]:
twr = tweets[tweets['is_retweet'] == False]
twr = twr[['handle', 'text', 'time']]
twr['is_trump'] = twr['handle'].apply(lambda x: 1 if x=="realDonaldTrump" else 0)

# Preprocessing: punctuation, stopwords, stemming


In [5]:
stemmer = PorterStemmer()

def remove_punctuation(text):
    lower = text.lower()
    exclude = set(string.punctuation)
    return "".join(ch for ch in lower if ch not in exclude)

def remove_stopwords(x):
    lower = x.lower()
    split = lower.split(" ")
    final = ''
    for i in split:
        if i not in stopwords.words('english'):
            final += (i + " ")
    return final

def split_and_stem(string):
    string = string.split(' ')
    temp = ""
    for i in string:
        temp += (stemmer.stem(i) + " ")
    return temp

In [6]:
## remove punctuation and stopwords, apply stemmer

twr['processed_text'] = twr['text'].apply(remove_punctuation)
twr['processed_text'] = twr['processed_text'].apply(remove_stopwords)
twr['processed_text'] = twr['processed_text'].apply(split_and_stem)

# Defining x and y

In [7]:
x = twr[['processed_text']]
y = twr['is_trump']
x_train, x_test, y_train, y_test = train_test_split(x,y)

# Additional Preprocessing

In [8]:
## TFIDF 

tfidf = TfidfVectorizer()

df_train = pd.DataFrame(tfidf.fit_transform(x_train['processed_text']).todense(), columns=tfidf.get_feature_names())
df_test = pd.DataFrame(tfidf.transform(x_test['processed_text']).todense(), columns=tfidf.get_feature_names())



In [9]:
## standardizing data

scaler = StandardScaler()

df_train = scaler.fit_transform(df_train)
df_test = scaler.transform(df_test)

In [None]:
df_train[:5]

# Modeling

In [10]:
def run_model(x_train, y_train, x_test, y_test, model):
    model.fit(x_train, y_train)
    print "Base model score: " + str(np.mean(y_test))[:6]
    print "Training set score: ", str(model.score(x_train, y_train))[:6]
    print "Test set score: ", str(model.score(x_test, y_test))[:6]
    predictions = model.predict(x_test)
    print "\nConfusion Matrix:\n", pd.DataFrame(confusion_matrix(y_test, predictions), 
                                              columns=['predicted_0', 'predicted_1'], index=['is_0', 'is_1'])
    print "\nClassification Report:\n",classification_report(y_test, predictions)

In [11]:
rf = RandomForestClassifier(min_samples_leaf=5)

In [21]:
mlp_1 = MLPClassifier(max_iter = 50)
mlp_2 = MLPClassifier(max_iter = 100)
mlp_3 = MLPClassifier(max_iter = 200)
mlp_4 = MLPClassifier(max_iter = 400)

In [22]:
models = [rf, mlp_1, mlp_2, mlp_3, mlp_4]
for model in models:
    print "Model: ", str(model).split("(")[0]
    print "Hyperparameters: " + str(model)[len(str(model).split("(")[0])+1:-1]
    run_model(df_train, y_train, df_test, y_test, model)
    print "\n"
os.system("say model is done")

Model:  RandomForestClassifier
Hyperparameters: bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False
Base model score: 0.5590
Training set score:  0.8976
Test set score:  0.8609

Confusion Matrix:
      predicted_0  predicted_1
is_0          560           71
is_1          128          672

Classification Report:
             precision    recall  f1-score   support

          0       0.81      0.89      0.85       631
          1       0.90      0.84      0.87       800

avg / total       0.86      0.86      0.86      1431



Model:  MLPClassifier
Hyperparameters: activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
  

0

In [None]:
params = {'activation': ['tanh', 'logistic', 'relu', 'identity'],
         'max_iter': [50, 100, 200, 400]}

In [None]:
gs = GridSearchCV(mlp_1, cv=5, params = params)