# Information Warfare
## Russia’s use of Twitter during the 2016 US Presidential Election
---

Last updated by Benjamin Forleo 06/01/19

### Import libraries

In [1]:
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

import spacy
import os
import pickle

from collections import Counter

from plotly import tools
import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly.offline as py

from plotly.offline import init_notebook_mode, plot, iplot
import plotly.io as pio

from IPython.display import Image

init_notebook_mode(connected=True)

In [2]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.externals import joblib

### Import data

In [3]:
# English Language Tweets (Clemson Dataset)
vec_df = pd.read_csv('./data/eng_labeled_docvecs.csv')

vec_array = np.array(vec_df.iloc[:,2:])

x = np.array(vec_df.iloc[:,2:])
y = vec_df.account_category.copy()

print(Counter(y))

Counter({'RightTroll': 223, 'LeftTroll': 118, 'HashtagGamer': 63, 'NewsFeed': 45, 'NonEnglish': 7, 'Commercial': 5, 'Unknown': 2})


### Modeling (Work in Progress)

First we need to check our class balance

Due to the Limited number of samples, I am going to classify NonEnglish, Commercial, and Unknown as Other.

In [4]:
other_list = ['NonEnglish','Commercial', 'Unknown']

y = y.apply(lambda x: 'Other' if x in other_list else x)

In [5]:
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y)

sm = SMOTE(random_state = 123, k_neighbors = 6)

x_res, y_res = sm.fit_resample(x_train, y_train)

#### Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

logreg = OneVsRestClassifier(LogisticRegression(solver = 'liblinear'))

logreg.fit(x_res, y_res)

# Save the logreg model
filename = './saved_models/eng_predictive_models/eng_logreg.sav'
joblib.dump(logreg, filename)

y_pred = logreg.predict(x_test)

In [7]:
print("Accuracy Score: ",accuracy_score(y_test, y_pred), "\n", "\n")
print(classification_report(y_test, y_pred))

Accuracy Score:  0.9913793103448276 
 

              precision    recall  f1-score   support

HashtagGamer       1.00      1.00      1.00        16
   LeftTroll       1.00      0.97      0.98        30
    NewsFeed       1.00      1.00      1.00        11
       Other       1.00      1.00      1.00         3
  RightTroll       0.98      1.00      0.99        56

   micro avg       0.99      0.99      0.99       116
   macro avg       1.00      0.99      0.99       116
weighted avg       0.99      0.99      0.99       116



### SVM

In [8]:
from sklearn.svm import SVC

svc = OneVsRestClassifier(SVC(gamma = 'auto'))

param_grid = {'estimator__C': [1, 3, 5, 7]}

grid_svc = GridSearchCV(estimator = svc, param_grid = param_grid, 
                        scoring = 'accuracy', n_jobs = -1, verbose = 0,
                       cv = 4)

grid_svc.fit(x_res, y_res)

# Save the SVM model
filename = './saved_models/eng_predictive_models/eng_svm.sav'
joblib.dump(grid_svc.best_estimator_, filename)

y_pred_svc = grid_svc.predict(x_test)

print('SVM Accuracy Score: ',accuracy_score(y_test, y_pred_svc), "\n", "\n")

print(classification_report(y_test, y_pred_svc))

SVM Accuracy Score:  0.9827586206896551 
 

              precision    recall  f1-score   support

HashtagGamer       1.00      1.00      1.00        16
   LeftTroll       1.00      0.93      0.97        30
    NewsFeed       1.00      1.00      1.00        11
       Other       1.00      1.00      1.00         3
  RightTroll       0.97      1.00      0.98        56

   micro avg       0.98      0.98      0.98       116
   macro avg       0.99      0.99      0.99       116
weighted avg       0.98      0.98      0.98       116



### Random Forrest

In [9]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 123, n_estimators = 10)


params_rf = {'max_depth':[2,3,4,5,6],
             'min_samples_leaf': [0.04, 0.06, 0.08],
             'max_features': [0.2, 0.4, 0.6, 0.8],
             'criterion':['gini', 'entropy']}
             
grid_rf = GridSearchCV(estimator = rf, param_grid = params_rf, 
                       scoring = 'accuracy', cv= 10, n_jobs = -1, iid = True)


grid_rf.fit(x_res, y_res)

filename = './saved_models/eng_predictive_models/eng_rf.sav'
joblib.dump(grid_rf.best_estimator_, filename)

y_pred_rf = grid_rf.predict(x_test)

print("Accuracy Score: ", accuracy_score(y_test, y_pred_rf), "\n", "\n")
print(classification_report(y_test, y_pred_rf))

Accuracy Score:  0.9655172413793104 
 

              precision    recall  f1-score   support

HashtagGamer       1.00      1.00      1.00        16
   LeftTroll       1.00      0.90      0.95        30
    NewsFeed       1.00      1.00      1.00        11
       Other       0.50      1.00      0.67         3
  RightTroll       0.98      0.98      0.98        56

   micro avg       0.97      0.97      0.97       116
   macro avg       0.90      0.98      0.92       116
weighted avg       0.98      0.97      0.97       116



### XGboost

In [10]:
import xgboost as xgb

param_grid = {
  'learning_rate': np.arange(0.05, 1.05, .10),
  'n_estimators' : [50],
  'subsample' : np.arange(0.05, 1.05, .05),
  'max_depth': [2,4,6]
  }

xg_cl = xgb.XGBClassifier(objective = 'multi:softmax')

randomized_xg_cl = RandomizedSearchCV(estimator = xg_cl, 
                                      param_distributions = param_grid,
                                      n_iter = 5, 
                                      cv = 5,
                                      scoring = 'accuracy',
                                      n_jobs = -1,
                                      verbose = 0)

randomized_xg_cl.fit(x_res, y_res)

filename = './saved_models/eng_predictive_models/eng_xgb.sav'
joblib.dump(randomized_xg_cl.best_estimator_, filename)

print("Best Accuracy Score Train CV: ", randomized_xg_cl.best_score_)


The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.



Best Accuracy Score Train CV:  0.9880239520958084


In [11]:
y_pred = randomized_xg_cl.predict(x_test)

print('XGBoost Test Accuracy: ', accuracy_score(y_test, y_pred), "\n", "\n")

print(classification_report(y_test, y_pred))

XGBoost Test Accuracy:  0.9655172413793104 
 

              precision    recall  f1-score   support

HashtagGamer       0.94      1.00      0.97        16
   LeftTroll       1.00      0.93      0.97        30
    NewsFeed       1.00      1.00      1.00        11
       Other       1.00      0.67      0.80         3
  RightTroll       0.95      0.98      0.96        56

   micro avg       0.97      0.97      0.97       116
   macro avg       0.98      0.92      0.94       116
weighted avg       0.97      0.97      0.96       116

