In [1]:
import numpy as np
import pandas as pd
import re
from convokit import Corpus, download
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
import gender_guesser.detector as gender
import matplotlib.pylab as plt
%matplotlib inline

In [2]:
X_train_all = pd.read_csv('data/X_train_all.csv', index_col = 'id')
X_test_all = pd.read_csv('data/X_test_all.csv', index_col = 'id')

y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')

X_train_tfidf = X_train_all.loc[:,X_train_all.columns.difference(['convo_count',
       'justice_utt_share', 'petitioner_advocate_utt_share', 'cons_just','prop_cons'])]
X_test_tfidf = X_test_all.loc[:,X_test_all.columns.difference(['convo_count',
       'justice_utt_share', 'petitioner_advocate_utt_share', 'cons_just','prop_cons'])]

X_train_feat = X_train_all.loc[:,['convo_count',
       'justice_utt_share', 'petitioner_advocate_utt_share', 'cons_just','prop_cons']]
X_test_feat = X_test_all.loc[:,['convo_count',
       'justice_utt_share', 'petitioner_advocate_utt_share', 'cons_just','prop_cons']]

In [8]:
#Predicting based on features only, default MLP set up
param_space = {'max_iter' : [200],
               'alpha': [0.0001],
               'learning_rate': ['constant'],
              'activation' : ['relu'],
               'solver': ['adam']}

mlp = MLPClassifier(random_state = 0)
search = GridSearchCV(mlp, param_space, n_jobs=-1, cv=3, return_train_score=True)
search.fit(X_train_feat, y_train.loc[:,'win_side'])
results = pd.DataFrame(search.cv_results_)
results.loc[:,'mean_test_score'] #0.645236 cv test mean



0    0.645236
Name: mean_test_score, dtype: float64

In [20]:
#Predicting based on tf-idf only
mlp = MLPClassifier(random_state = 0)
search = GridSearchCV(mlp, param_space, n_jobs=-1, cv=3, return_train_score=True)
search.fit(X_train_tfidf, y_train.loc[:,'win_side'])
results = pd.DataFrame(search.cv_results_)
results.loc[:,'mean_test_score'] #0.610363 cv test mean

0    0.653634
Name: mean_test_score, dtype: float64

In [34]:
#Predicting based on both tf-idf and constructed features
mlp = MLPClassifier(random_state = 0)
search = GridSearchCV(mlp, param_space, n_jobs=-1, cv=3, return_train_score=True)
search.fit(X_train_all, y_train.loc[:,'win_side'])
results = pd.DataFrame(search.cv_results_)
results.loc[:,'mean_test_score'] # 0.606185 cv test mean

0    0.606185
Name: mean_test_score, dtype: float64

An MLP model consisting of only our engineered features appears to do best in our cross validating work. Below we focus on tuning the parameters of the MLP model specifically with this feature set by looking at activation function, solver, maximum number of iterations, hidden layer sizes, learning rate, and alpha value.

In [39]:
mlp = MLPClassifier(random_state = 0)
param_space = {'activation' : ['tanh', 'relu'],
              'solver': ['sgd', 'adam']}
search = GridSearchCV(mlp, param_space, n_jobs=-1, cv=3, return_train_score=True)
search.fit(X_train_feat, y_train.loc[:,'win_side'])
results = pd.DataFrame(search.cv_results_)
results = results.loc[:, ['param_solver', 'param_activation', 'mean_train_score', 'mean_test_score', 'rank_test_score']]
results = results.sort_values('rank_test_score')
results #SGD looks best



Unnamed: 0,param_solver,param_activation,mean_train_score,mean_test_score,rank_test_score
0,sgd,tanh,0.653632,0.653634,1
2,sgd,relu,0.653632,0.653634,1
1,adam,tanh,0.657117,0.645236,3
3,adam,relu,0.655024,0.645236,3


In [22]:
mlp = MLPClassifier(random_state = 0, solver = 'sgd')
param_space = {'max_iter' : [200, 500, 1000, 2000],
              }
search = GridSearchCV(mlp, param_space, n_jobs=-1, cv=3, return_train_score=True)
search.fit(X_train_feat, y_train.loc[:,'win_side'])

results = pd.DataFrame(search.cv_results_)
results = results.loc[:, ['param_max_iter','mean_train_score', 'mean_test_score', 'rank_test_score']]
results = results.sort_values('rank_test_score')
results #500 for max_iter is sufficient

Unnamed: 0,param_max_iter,param_activation,mean_train_score,mean_test_score,rank_test_score
0,200,tanh,0.653632,0.653634,1
1,500,tanh,0.653632,0.653634,1
2,1000,tanh,0.653632,0.653634,1
3,2000,tanh,0.653632,0.653634,1
4,200,relu,0.653632,0.653634,1
5,500,relu,0.653632,0.653634,1
6,1000,relu,0.653632,0.653634,1
7,2000,relu,0.653632,0.653634,1


In [24]:
mlp = MLPClassifier(random_state = 0, solver = 'sgd', max_iter = 500)
param_space = {'hidden_layer_sizes': [(100,50,25),(4,3,2), (50,40,25)],'activation' : ['tanh', 'relu']}
search = GridSearchCV(mlp, param_space, n_jobs=-1, cv=3, return_train_score=True)
search.fit(X_train_feat, y_train.loc[:,'win_side'])

results = pd.DataFrame(search.cv_results_)
results = results.loc[:, ['param_hidden_layer_sizes','param_activation',
                          'mean_train_score', 'mean_test_score', 'rank_test_score']]
results = results.sort_values('rank_test_score')
results #no distinction with different activations/hidden layer sizes

Unnamed: 0,param_hidden_layer_sizes,param_activation,mean_train_score,mean_test_score,rank_test_score
0,"(100, 50, 25)",tanh,0.653632,0.653634,1
1,"(4, 3, 2)",tanh,0.653632,0.653634,1
2,"(50, 40, 25)",tanh,0.653632,0.653634,1
3,"(100, 50, 25)",relu,0.653632,0.653634,1
4,"(4, 3, 2)",relu,0.653632,0.653634,1
5,"(50, 40, 25)",relu,0.653632,0.653634,1


In [42]:
mlp = MLPClassifier(random_state = 0, max_iter = 500, solver = 'sgd')#do not run again, this will take forever.
param_space = {'alpha': [0.0001, 0.01, 0.05, 0.1],
    'learning_rate': ['constant','adaptive']}
search = GridSearchCV(mlp, param_space, n_jobs=-1, cv=3, return_train_score=True)
search.fit(X_train_feat, y_train.loc[:,'win_side'])
results = pd.DataFrame(search.cv_results_)
results = results.loc[:, ['param_alpha', 'param_learning_rate', 'mean_train_score', 'mean_test_score', 'rank_test_score']]
results = results.sort_values('rank_test_score')
results #No distinction when varying learning_rate and alpha (strength of the l2 regularization)

Unnamed: 0,param_alpha,param_learning_rate,mean_train_score,mean_test_score,rank_test_score
0,0.0001,constant,0.653632,0.653634,1
1,0.0001,adaptive,0.653632,0.653634,1
2,0.01,constant,0.653632,0.653634,1
3,0.01,adaptive,0.653632,0.653634,1
4,0.05,constant,0.653632,0.653634,1
5,0.05,adaptive,0.653632,0.653634,1
6,0.1,constant,0.653632,0.653634,1
7,0.1,adaptive,0.653632,0.653634,1


In [17]:
#top non-default model with simplest parameters
param_space = {'max_iter' : [500], 'solver' : ['sgd']}
mlp = MLPClassifier(random_state = 0 )
search = GridSearchCV(mlp, param_space, n_jobs=-1, cv=3, return_train_score=True)
search.fit(X_train_feat, y_train.loc[:,'win_side'])
results = pd.DataFrame(search.cv_results_)
results.loc[:,'mean_test_score'] #0.653634 cross validated and tuned

0    0.653634
Name: mean_test_score, dtype: float64

In [18]:
mlp.fit(X_train_feat,y_train.loc[:,'win_side'])
predict_test = mlp.predict(X_test_feat)
print('accuracy_score:',accuracy_score(y_test.loc[:,'win_side'],predict_test))
print('f1_score:', f1_score(y_test.loc[:,'win_side'],predict_test))

accuracy_score: 0.6677524429967426
f1_score: 0.7943548387096774


In [28]:
mlp = MLPClassifier(random_state = 0) 
#test across all parameter spaces. 
param_space = {'max_iter' : [500, 1000, 2000], 
               'alpha': [0.0001, 0.01, 0.05, 0.1],
               'learning_rate': ['constant','adaptive'],
               'hidden_layer_sizes': [(100,50,25),(4,3,2), (50,40,25)],
              'activation' : ['tanh', 'relu'],
               'solver': ['sgd']
               }
search = GridSearchCV(mlp, param_space, n_jobs=-1, cv=3, return_train_score=True)
search.fit(X_train_feat, y_train.loc[:,'win_side'])

results = pd.DataFrame(search.cv_results_)
results = results.sort_values('rank_test_score')
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_alpha,param_hidden_layer_sizes,param_learning_rate,param_max_iter,param_solver,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.677059,0.201067,0.006845,0.005068,tanh,0.0001,"(100, 50, 25)",constant,500,sgd,...,0.65272,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
92,0.235744,0.018011,0.005280,0.001539,relu,0.01,"(100, 50, 25)",constant,2000,sgd,...,0.65272,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
93,1.320371,0.028680,0.004846,0.000374,relu,0.01,"(100, 50, 25)",adaptive,500,sgd,...,0.65272,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
94,1.333214,0.020458,0.014239,0.013808,relu,0.01,"(100, 50, 25)",adaptive,1000,sgd,...,0.65272,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
95,1.240089,0.067806,0.008860,0.006878,relu,0.01,"(100, 50, 25)",adaptive,2000,sgd,...,0.65272,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,0.402876,0.015185,0.037946,0.005478,tanh,0.05,"(50, 40, 25)",constant,1000,sgd,...,0.65272,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
50,0.543704,0.023146,0.008648,0.004537,tanh,0.05,"(50, 40, 25)",constant,2000,sgd,...,0.65272,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
51,1.190804,0.026976,0.012328,0.005732,tanh,0.05,"(50, 40, 25)",adaptive,500,sgd,...,0.65272,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
142,1.094916,0.053855,0.008111,0.003643,relu,0.1,"(50, 40, 25)",adaptive,1000,sgd,...,0.65272,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645


In [27]:
results.head(15)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_alpha,param_hidden_layer_sizes,param_learning_rate,param_max_iter,param_solver,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
177,3.672455,0.64185,0.024167,0.014858,relu,0.0001,"(50, 40, 25)",adaptive,1000,adam,...,0.656904,0.638655,0.655005,0.012646,1,0.668763,0.662474,0.671548,0.667595,0.003796
169,2.171652,0.455536,0.013914,0.007224,relu,0.0001,"(50, 40, 25)",constant,500,adam,...,0.656904,0.638655,0.655005,0.012646,1,0.668763,0.662474,0.671548,0.667595,0.003796
171,2.144562,0.459258,0.008655,0.007169,relu,0.0001,"(50, 40, 25)",constant,1000,adam,...,0.656904,0.638655,0.655005,0.012646,1,0.668763,0.662474,0.671548,0.667595,0.003796
173,2.333508,0.537722,0.016631,0.002747,relu,0.0001,"(50, 40, 25)",constant,2000,adam,...,0.656904,0.638655,0.655005,0.012646,1,0.668763,0.662474,0.671548,0.667595,0.003796
175,2.972249,0.820979,0.016942,0.018561,relu,0.0001,"(50, 40, 25)",adaptive,500,adam,...,0.656904,0.638655,0.655005,0.012646,1,0.668763,0.662474,0.671548,0.667595,0.003796
179,3.331807,0.622312,0.004111,0.000268,relu,0.0001,"(50, 40, 25)",adaptive,2000,adam,...,0.656904,0.638655,0.655005,0.012646,1,0.668763,0.662474,0.671548,0.667595,0.003796
121,0.398373,0.019579,0.006726,0.004979,tanh,0.1,"(4, 3, 2)",constant,500,adam,...,0.65272,0.655462,0.653634,0.001293,7,0.654088,0.654088,0.65272,0.653632,0.000645
122,0.43767,0.007739,0.00303,0.00011,tanh,0.1,"(4, 3, 2)",constant,1000,sgd,...,0.65272,0.655462,0.653634,0.001293,7,0.654088,0.654088,0.65272,0.653632,0.000645
123,0.380184,0.010009,0.003249,6.6e-05,tanh,0.1,"(4, 3, 2)",constant,1000,adam,...,0.65272,0.655462,0.653634,0.001293,7,0.654088,0.654088,0.65272,0.653632,0.000645
124,0.446291,0.006353,0.007409,0.00574,tanh,0.1,"(4, 3, 2)",constant,2000,sgd,...,0.65272,0.655462,0.653634,0.001293,7,0.654088,0.654088,0.65272,0.653632,0.000645


In [6]:
model = GaussianNB() 
model.fit(X_train_all,y_train.loc[:,'win_side'])
win_pred = model.predict(X_test_all)

print('Naive Bayes All')
print('accuracy_score:', accuracy_score(y_test.loc[:,'win_side'],win_pred))
print('f1_score:', f1_score(y_test.loc[:,'win_side'],win_pred))

Naive Bayes All
accuracy_score: 0.6482084690553745
f1_score: 0.7768595041322315


In [9]:
model_feat = GaussianNB() 
model_feat.fit(X_train_feat,y_train.loc[:,'win_side'])
win_pred = model_feat.predict(X_test_feat)

print('Naive Bayes All')
print('accuracy_score:', accuracy_score(y_test.loc[:,'win_side'],win_pred))
print('f1_score:', f1_score(y_test.loc[:,'win_side'],win_pred))

Naive Bayes All
accuracy_score: 0.6514657980456026
f1_score: 0.7820773930753564


In [21]:
model_tfidf = GaussianNB() 
model_tfidf.fit(X_train_tfidf,y_train.loc[:,'win_side'])
win_pred = model_tfidf.predict(X_test_tfidf)

print('Naive Bayes All')
print('accuracy_score:', accuracy_score(y_test.loc[:,'win_side'],win_pred))
print('f1_score:', f1_score(y_test.loc[:,'win_side'],win_pred))

Naive Bayes All
accuracy_score: 0.6482084690553745
f1_score: 0.7768595041322315
