# Information Warfare
## Russia’s use of Twitter during the 2016 US Presidential Election
---

### Import libraries

In [1]:
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

import spacy
import os
import pickle

from collections import Counter

from plotly import tools
import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly.offline as py

from plotly.offline import init_notebook_mode, plot, iplot
import plotly.io as pio

from IPython.display import Image

init_notebook_mode(connected=True)

### Import data

In [2]:
# All Tweets
df = pd.read_pickle('data/processed/tweets.pkl')
df.reset_index(drop = True, inplace = True)

# Only English language Tweets
dfEng = pd.read_pickle('data/processed/tweetsEng.pkl')
dfEng.reset_index(drop = True, inplace = True)

# Only non-English language Tweets
dfOth = pd.read_pickle('data/processed/tweetsOth.pkl')
dfOth.reset_index(drop = True, inplace = True)

# Modeling (Work in Progress)

In [29]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

First we need to check our class balance

In [30]:
y = df_grouped.account_category.copy()
x = np.copy(vec_array)

print(Counter(y))

Counter({'RightTroll': 223, 'LeftTroll': 118, 'HashtagGamer': 63, 'NewsFeed': 45, 'NonEnglish': 7, 'Commercial': 5, 'Unknown': 2})


Due to the Limited number of samples, I am going to classify NonEnglish, Commercial, and Unknown as Other.

In [31]:
other_list = ['NonEnglish','Commercial', 'Unknown']

y = y.apply(lambda x: 'Other' if x in other_list else x)

In [32]:
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y)

sm = SMOTE(random_state = 123, k_neighbors = 6)

x_res, y_res = sm.fit_resample(x_train, y_train)

#### Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

logreg = OneVsRestClassifier(LogisticRegression(solver = 'liblinear'))

logreg.fit(x_res, y_res)

y_pred = logreg.predict(x_test)

In [34]:
print("Accuracy Score: ",accuracy_score(y_test, y_pred), "\n", "\n")
print(classification_report(y_test, y_pred))

Accuracy Score:  0.9396551724137931 
 

              precision    recall  f1-score   support

HashtagGamer       1.00      1.00      1.00        16
   LeftTroll       0.93      0.93      0.93        30
    NewsFeed       0.85      1.00      0.92        11
       Other       0.50      0.33      0.40         3
  RightTroll       0.96      0.95      0.95        56

   micro avg       0.94      0.94      0.94       116
   macro avg       0.85      0.84      0.84       116
weighted avg       0.94      0.94      0.94       116



### SVM

In [35]:
from sklearn.svm import SVC

svc = OneVsRestClassifier(SVC(gamma = 'auto'))

param_grid = {'estimator__C': [1, 3, 5, 7]}

grid_svc = GridSearchCV(estimator = svc, param_grid = param_grid, 
                        scoring = 'accuracy', n_jobs = -1, verbose = 0,
                       cv = 4)

grid_svc.fit(x_res, y_res)

y_pred_svc = grid_svc.predict(x_test)

print('SVM Accuracy Score: ',accuracy_score(y_test, y_pred_svc), "\n", "\n")

print(classification_report(y_test, y_pred_svc))

SVM Accuracy Score:  0.9396551724137931 
 

              precision    recall  f1-score   support

HashtagGamer       1.00      1.00      1.00        16
   LeftTroll       0.93      0.93      0.93        30
    NewsFeed       0.92      1.00      0.96        11
       Other       0.40      0.67      0.50         3
  RightTroll       0.98      0.93      0.95        56

   micro avg       0.94      0.94      0.94       116
   macro avg       0.85      0.91      0.87       116
weighted avg       0.95      0.94      0.94       116



### Random Forrest

In [36]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 123, n_estimators = 10)


params_rf = {'max_depth':[2,3,4,5,6],
             'min_samples_leaf': [0.04, 0.06, 0.08],
             'max_features': [0.2, 0.4, 0.6, 0.8],
             'criterion':['gini', 'entropy']}
             
grid_rf = GridSearchCV(estimator = rf, param_grid = params_rf, 
                       scoring = 'accuracy', cv= 10, n_jobs = -1, iid = True)


grid_rf.fit(x_res, y_res)

y_pred_rf = grid_rf.predict(x_test)

print("Accuracy Score: ", accuracy_score(y_test, y_pred_rf), "\n", "\n")
print(classification_report(y_test, y_pred_rf))

Accuracy Score:  0.9396551724137931 
 

              precision    recall  f1-score   support

HashtagGamer       1.00      0.94      0.97        16
   LeftTroll       0.96      0.90      0.93        30
    NewsFeed       0.92      1.00      0.96        11
       Other       0.50      0.67      0.57         3
  RightTroll       0.95      0.96      0.96        56

   micro avg       0.94      0.94      0.94       116
   macro avg       0.87      0.89      0.88       116
weighted avg       0.94      0.94      0.94       116



### XGboost

In [37]:
import xgboost as xgb

param_grid = {
  'learning_rate': np.arange(0.05, 1.05, .10),
  'n_estimators' : [50],
  'subsample' : np.arange(0.05, 1.05, .05),
  'max_depth': [2,4,6]
  }

xg_cl = xgb.XGBClassifier(objective = 'multi:softmax')

randomized_xg_cl = RandomizedSearchCV(estimator = xg_cl, 
                                      param_distributions = param_grid,
                                      n_iter = 5, 
                                      cv = 5,
                                      scoring = 'accuracy',
                                      n_jobs = -1,
                                      verbose = 0)

randomized_xg_cl.fit(x_res, y_res)

print("Best Accuracy Score Train CV: ", randomized_xg_cl.best_score_)

Best Accuracy Score Train CV:  0.9928143712574851


In [38]:
y_pred = randomized_xg_cl.predict(x_test)

print('XGBoost Test Accuracy: ', accuracy_score(y_test, y_pred), "\n", "\n")

print(classification_report(y_test, y_pred))

XGBoost Test Accuracy:  0.9310344827586207 
 

              precision    recall  f1-score   support

HashtagGamer       1.00      0.94      0.97        16
   LeftTroll       0.93      0.93      0.93        30
    NewsFeed       0.85      1.00      0.92        11
       Other       0.50      0.33      0.40         3
  RightTroll       0.95      0.95      0.95        56

   micro avg       0.93      0.93      0.93       116
   macro avg       0.85      0.83      0.83       116
weighted avg       0.93      0.93      0.93       116

