In [2]:
import numpy as np
import pandas as pd
import re
from convokit import Corpus, download
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
import gender_guesser.detector as gender
import matplotlib.pylab as plt
%matplotlib inline

zsh:1: command not found: jt


In [3]:
# Load the pre-built feature matrices; the `id` column becomes the row index.
X_train_all = pd.read_csv('data/X_train_all.csv', index_col = 'id')
X_test_all = pd.read_csv('data/X_test_all.csv', index_col = 'id')

# The label files are read with a default positional index, so every CSV
# column (including any saved index column) stays a regular column.
# Only `win_side` is used as the target by the modelling cells.
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')

# Fail fast on inconsistent inputs instead of deep inside a model fit.
# NOTE(review): features and labels are matched by row position, not by `id`;
# this assumes both files were written in the same row order -- confirm.
assert 'win_side' in y_train.columns and 'win_side' in y_test.columns, \
    "label files must contain a 'win_side' column"
assert len(X_train_all) == len(y_train), \
    "X_train_all and y_train have different numbers of rows"
assert len(X_test_all) == len(y_test), \
    "X_test_all and y_test have different numbers of rows"

In [3]:
# Column-wise means of the training labels; the `win_side` entry is the
# share of positive labels, i.e. the accuracy of always predicting 1
# (presumably win_side is coded 0/1 -- confirm).
# axis=0 is spelled out because np.mean(frame) forwards axis=None, which
# pandas >= 2.0 reduces over BOTH axes into one meaningless scalar.
y_train.mean(axis=0)

Unnamed: 0    507.361732
win_side        0.653631
dtype: float64

In [9]:
# Column-wise means over train + test labels combined; `win_side` gives the
# overall share of positive labels.
# axis=0 is explicit so the result stays one value per column on every pandas
# version (np.mean(frame) passes axis=None, a full reduction in pandas >= 2.0).
pd.concat([y_train, y_test]).mean(axis=0)

Unnamed: 0    511.000000
win_side        0.652981
dtype: float64

In [4]:
# Tuning step 1: cross-validate every activation/solver pairing while all
# other MLP hyper-parameters stay at their scikit-learn defaults.
param_space = {
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
}
mlp = MLPClassifier(random_state=0)
search = GridSearchCV(
    mlp,
    param_space,
    cv=3,
    n_jobs=-1,
    return_train_score=True,
)
search.fit(X_train_all, y_train['win_side'])

# Reduce cv_results_ to the columns needed for comparison, best rank first.
results = (
    pd.DataFrame(search.cv_results_)
    .loc[:, ['param_solver', 'param_activation', 'mean_train_score',
             'mean_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
)
# In the recorded run both sgd rows rank first.
# NOTE(review): their train and CV scores (~0.6536) equal the mean of
# y_train['win_side'] shown earlier, so sgd may simply be predicting the
# majority class -- verify before treating it as the best solver.
results

GridSearchCV(cv=3, estimator=MLPClassifier(random_state=0), n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'solver': ['sgd', 'adam']},
             return_train_score=True)

Unnamed: 0,param_solver,param_activation,mean_train_score,mean_test_score,rank_test_score
0,sgd,tanh,0.653632,0.653634,1
2,sgd,relu,0.653632,0.653634,1
1,adam,tanh,1.0,0.607579,3
3,adam,relu,1.0,0.606185,4


In [6]:
# Tuning step 2: keep the sgd solver and vary only the iteration budget.
param_space = {'max_iter': [500, 1000, 2000, 3000]}
mlp = MLPClassifier(solver='sgd', random_state=0)
search = GridSearchCV(
    mlp,
    param_space,
    cv=3,
    n_jobs=-1,
    return_train_score=True,
)
search.fit(X_train_all, y_train['win_side'])

# One row per max_iter candidate, ordered by CV rank.
results = (
    pd.DataFrame(search.cv_results_)
    .loc[:, ['param_max_iter', 'mean_train_score', 'mean_test_score',
             'rank_test_score']]
    .sort_values('rank_test_score')
)
# In the recorded run every candidate ties, so the smallest budget
# (max_iter=500) is carried forward.
results

Unnamed: 0,param_max_iter,mean_train_score,mean_test_score,rank_test_score
0,500,0.653632,0.653634,1
1,1000,0.653632,0.653634,1
2,2000,0.653632,0.653634,1
3,3000,0.653632,0.653634,1


In [9]:
# Tuning step 3: with solver='sgd' and max_iter=500 fixed, compare three
# hidden-layer layouts under both activation functions.
param_space = {
    'hidden_layer_sizes': [(100,50,25), (4,3,2), (50,40,25)],
    'activation': ['tanh', 'relu'],
}
mlp = MLPClassifier(solver='sgd', max_iter=500, random_state=0)
search = GridSearchCV(
    mlp,
    param_space,
    cv=3,
    n_jobs=-1,
    return_train_score=True,
)
search.fit(X_train_all, y_train['win_side'])

# One row per (layout, activation) candidate, ordered by CV rank.
results = (
    pd.DataFrame(search.cv_results_)
    .loc[:, ['param_hidden_layer_sizes', 'param_activation',
             'mean_train_score', 'mean_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
)
# In the recorded run all six candidates share the same scores, so neither
# the layout nor the activation separates the models.
results

Unnamed: 0,param_hidden_layer_sizes,param_activation,mean_train_score,mean_test_score,rank_test_score
0,"(100, 50, 25)",tanh,0.653632,0.653634,1
1,"(4, 3, 2)",tanh,0.653632,0.653634,1
2,"(50, 40, 25)",tanh,0.653632,0.653634,1
3,"(100, 50, 25)",relu,0.653632,0.653634,1
4,"(4, 3, 2)",relu,0.653632,0.653634,1
5,"(50, 40, 25)",relu,0.653632,0.653634,1


In [7]:
# Tuning step 4: vary the L2 penalty (alpha) and the learning-rate schedule
# with solver='sgd' and max_iter=500 fixed.
# WARNING from the original author: do not run again, this will take forever.
param_space = {
    'alpha': [0.0001, 0.01, 0.05, 0.1],
    'learning_rate': ['constant', 'adaptive'],
}
mlp = MLPClassifier(solver='sgd', max_iter=500, random_state=0)
search = GridSearchCV(
    mlp,
    param_space,
    cv=3,
    n_jobs=-1,
    return_train_score=True,
)
search.fit(X_train_all, y_train['win_side'])

# One row per (alpha, learning_rate) candidate, ordered by CV rank.
results = (
    pd.DataFrame(search.cv_results_)
    .loc[:, ['param_alpha', 'param_learning_rate', 'mean_train_score',
             'mean_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
)
# In the recorded run changing alpha (L2 regularisation strength) or the
# learning-rate schedule makes no difference to the scores.
results

Unnamed: 0,param_alpha,param_learning_rate,mean_train_score,mean_test_score,rank_test_score
0,0.01,constant,0.653632,0.653634,1
1,0.01,adaptive,0.653632,0.653634,1
2,0.05,constant,0.653632,0.653634,1
3,0.05,adaptive,0.653632,0.653634,1
4,0.1,constant,0.653632,0.653634,1
5,0.1,adaptive,0.653632,0.653634,1


In [12]:
# Exhaustive search over all hyper-parameters explored in the staged searches:
# 3 * 4 * 2 * 3 * 2 * 2 = 288 candidates, each fitted on 3 CV folds.
# This will take a few hours.
param_space = {
    'max_iter': [500, 1000, 2000],
    'alpha': [0.0001, 0.01, 0.05, 0.1],
    'learning_rate': ['constant', 'adaptive'],
    'hidden_layer_sizes': [(100,50,25), (4,3,2), (50,40,25)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
}
mlp = MLPClassifier(random_state=0)
search = GridSearchCV(
    mlp,
    param_space,
    cv=3,
    n_jobs=-1,
    return_train_score=True,
)
search.fit(X_train_all, y_train['win_side'])

# Keep the complete cv_results_ table (timings, per-split scores, params),
# ordered from best to worst CV rank.
results = pd.DataFrame(search.cv_results_).sort_values('rank_test_score')
results





Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_alpha,param_hidden_layer_sizes,param_learning_rate,param_max_iter,param_solver,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,29.157370,11.384977,0.571429,0.082112,tanh,0.0001,"(100, 50, 25)",constant,500,sgd,...,0.652720,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
159,151.775454,3.714696,0.400708,0.014876,relu,0.0001,"(4, 3, 2)",constant,1000,adam,...,0.652720,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
160,60.424816,0.731582,0.446385,0.030189,relu,0.0001,"(4, 3, 2)",constant,2000,sgd,...,0.652720,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
161,149.979677,1.729585,0.385319,0.053872,relu,0.0001,"(4, 3, 2)",constant,2000,adam,...,0.652720,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
162,73.757306,0.709461,0.442606,0.014357,relu,0.0001,"(4, 3, 2)",adaptive,500,sgd,...,0.652720,0.655462,0.653634,0.001293,1,0.654088,0.654088,0.65272,0.653632,0.000645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,67.255533,0.453887,0.670809,0.153749,tanh,0.01,"(50, 40, 25)",adaptive,1000,adam,...,0.543933,0.621849,0.592220,0.034437,283,1.000000,1.000000,1.00000,1.000000,0.000000
67,75.811057,5.666909,0.624910,0.075865,tanh,0.01,"(50, 40, 25)",adaptive,500,adam,...,0.543933,0.621849,0.592220,0.034437,283,1.000000,1.000000,1.00000,1.000000,0.000000
65,90.136587,0.359310,0.613940,0.123514,tanh,0.01,"(50, 40, 25)",constant,2000,adam,...,0.543933,0.621849,0.592220,0.034437,283,1.000000,1.000000,1.00000,1.000000,0.000000
63,75.738773,2.176713,0.774358,0.134335,tanh,0.01,"(50, 40, 25)",constant,1000,adam,...,0.543933,0.621849,0.592220,0.034437,283,1.000000,1.000000,1.00000,1.000000,0.000000


In [13]:
# Write the current `results` table to disk as CSV (the DataFrame index is
# written too, since to_csv keeps it by default).
results.to_csv('data/results.csv')

In [11]:
# Refit one of the configurations tied for rank 1 in the full grid search,
# picking the simplest one (row 162 of the saved results table):
#   activation='relu', alpha=0.0001, hidden_layer_sizes=(4, 3, 2),
#   learning_rate='adaptive', max_iter=500, solver='sgd'
# activation and alpha are the scikit-learn defaults; the rest were tuned.
mlp = MLPClassifier(random_state = 0, max_iter = 500, solver = 'sgd', 
                    hidden_layer_sizes = (4,3,2), learning_rate = 'adaptive',
                   activation='relu', alpha = 0.0001)
mlp.fit(X_train_all,y_train.loc[:,'win_side'])
predict_test = mlp.predict(X_test_all)

# Held-out performance on the test split.
print('accuracy_score:',accuracy_score(y_test.loc[:,'win_side'],predict_test))
print('f1_score:', f1_score(y_test.loc[:,'win_side'],predict_test))

# Accuracy and F1 alone cannot reveal a model that always predicts the
# majority class, so also show the confusion matrix
# (rows = true labels, columns = predicted labels).
# NOTE(review): the recorded accuracy (0.6515) and F1 (0.7890) are exactly
# what an "always predict 1" classifier would score on this test set --
# confirm with the matrix below.
print('confusion_matrix:')
print(confusion_matrix(y_test.loc[:,'win_side'], predict_test))

accuracy_score: 0.6514657980456026
f1_score: 0.7889546351084813


In [8]:
# Build train/test views restricted to the five named (non-vocabulary)
# columns; the same list is applied to both splits.
feat_cols = [
    'convo_count',
    'justice_utt_share',
    'petitioner_advocate_utt_share',
    'cons_just',
    'prop_cons',
]
X_train_feat = X_train_all[feat_cols]
X_test_feat = X_test_all[feat_cols]


In [6]:
# Baseline: Gaussian Naive Bayes trained on every available column.
model = GaussianNB().fit(X_train_all, y_train['win_side'])
win_pred = model.predict(X_test_all)

# Report held-out accuracy and F1 against the true test labels.
print('Naive Bayes All')
print('accuracy_score:', accuracy_score(y_test['win_side'], win_pred))
print('f1_score:', f1_score(y_test['win_side'], win_pred))

Naive Bayes All
accuracy_score: 0.6482084690553745
f1_score: 0.7768595041322315


In [9]:
# Gaussian Naive Bayes trained only on the hand-picked feature subset
# (X_train_feat / X_test_feat).
model_feat = GaussianNB() 
model_feat.fit(X_train_feat,y_train.loc[:,'win_side'])
win_pred = model_feat.predict(X_test_feat)

# Label fixed: this cell previously printed 'Naive Bayes All', which made its
# scores indistinguishable from the all-columns model in the output.
print('Naive Bayes Features')
print('accuracy_score:', accuracy_score(y_test.loc[:,'win_side'],win_pred))
print('f1_score:', f1_score(y_test.loc[:,'win_side'],win_pred))

Naive Bayes All
accuracy_score: 0.6514657980456026
f1_score: 0.7820773930753564


In [17]:
# Everything except the five named feature columns, i.e. the per-term columns.
# Index.difference also returns the remaining labels in sorted order.
non_term_cols = [
    'convo_count',
    'justice_utt_share',
    'petitioner_advocate_utt_share',
    'cons_just',
    'prop_cons',
]
X_train_tfidf = X_train_all.loc[:, X_train_all.columns.difference(non_term_cols)]
X_test_tfidf = X_test_all.loc[:, X_test_all.columns.difference(non_term_cols)]

In [20]:
# Display the term-column training matrix as a visual check of the split
# (the notebook renders a truncated preview of rows and columns).
X_train_tfidf

Unnamed: 0_level_0,aaa,aba,abandon,abandoned,abandoning,abandonment,abandons,abate,abated,abbott,...,younger,youth,youthful,zenith,zero,zillion,zip,zone,zones,zoning
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019_19-67,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.023168,0.0,0.0,0.0,0.0,0.0
2013_12-1315,0.0,0.000000,0.000000,0.000000,0.000000,0.020612,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2005_04-1186,0.0,0.008259,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2006_05-6551,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.014269,0.0,0.0,0.0,0.0,0.0
2005_05-5224,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.005559,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016_16-6219,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2007_06-984,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.003443,0.0,0.0,0.0,0.0,0.0
2013_12-79,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.003724,0.0,0.0,0.0,0.0,0.0
2012_11-982,0.0,0.000000,0.005297,0.010163,0.007013,0.014189,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [21]:
# Gaussian Naive Bayes trained only on the term columns
# (X_train_tfidf / X_test_tfidf).
model_tfidf = GaussianNB() 
model_tfidf.fit(X_train_tfidf,y_train.loc[:,'win_side'])
win_pred = model_tfidf.predict(X_test_tfidf)

# Label fixed: this cell previously printed 'Naive Bayes All', which made its
# scores indistinguishable from the all-columns model in the output.
print('Naive Bayes TF-IDF')
print('accuracy_score:', accuracy_score(y_test.loc[:,'win_side'],win_pred))
print('f1_score:', f1_score(y_test.loc[:,'win_side'],win_pred))

Naive Bayes All
accuracy_score: 0.6482084690553745
f1_score: 0.7768595041322315
