In [1]:
import numpy as np
import pandas as pd
import re
from convokit import Corpus, download
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
import gender_guesser.detector as gender
import matplotlib.pylab as plt
%matplotlib inline

In [14]:
X_train_tfidf = pd.read_csv('data/X_train_tfidf', index_col = 'id')
X_test_tfidf = pd.read_csv('data/X_test_tfidf', index_col = 'id')
X_train_all = pd.read_csv('data/X_train_all', index_col = 'id')
X_test_all = pd.read_csv('data/X_test_all', index_col = 'id')
X_train_feat = pd.read_csv('data/X_train_feat', index_col = 'id')
X_test_feat = pd.read_csv('data/X_test_feat', index_col = 'id')
y_train = pd.read_csv('data/y_train')
y_test = pd.read_csv('data/y_test')

In [18]:
y_train

Unnamed: 0.1,Unnamed: 0,win_side
0,760,0.0
1,204,1.0
2,362,1.0
3,5,1.0
4,358,1.0
...,...,...
813,835,0.0
814,192,0.0
815,629,0.0
816,559,0.0


In [45]:
mlp = MLPClassifier(random_state = 0)

parameter_space = {
    'activation' : ['tanh', 'relu'],
    'max_iter': [500, 1000, 2000,3000],
    'hidden_layer_sizes': [(100,50,25),(4,3,2), (50,40,25)],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05, 0.1],
    'learning_rate': ['constant','adaptive'],
}
search = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
search.fit(X_train_feat, y_train.loc[:,'win_side'])
# Best parameters set
print('Best parameters found:\n', search.best_params_)
means = search.cv_results_['mean_test_score']
means
#mlp.fit(X_train_feat,y_train.loc[:,'win_side'])
#predict_test = mlp.predict(X_test_feat)
#accuracy_score(y_test.loc[:,'win_side'],predict_test)

Best parameters found:
 {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (100, 50, 25), 'learning_rate': 'constant', 'max_iter': 500, 'solver': 'adam'}


array([0.65892318, 0.65401673, 0.65892318, 0.65401673, 0.65892318,
       0.65401673, 0.65892318, 0.65401673, 0.65892318, 0.65401673,
       0.65892318, 0.65401673, 0.65892318, 0.65401673, 0.65892318,
       0.65401673, 0.65892318, 0.65892318, 0.65892318, 0.65892318,
       0.65892318, 0.65892318, 0.65892318, 0.65892318, 0.65892318,
       0.65892318, 0.65892318, 0.65892318, 0.65892318, 0.65892318,
       0.65892318, 0.65892318, 0.65892318, 0.65768872, 0.65892318,
       0.65768872, 0.65892318, 0.65768872, 0.65892318, 0.65768872,
       0.65892318, 0.65768872, 0.65892318, 0.65768872, 0.65892318,
       0.65768872, 0.65892318, 0.65768872, 0.65892318, 0.65401673,
       0.65892318, 0.65401673, 0.65892318, 0.65401673, 0.65892318,
       0.65401673, 0.65892318, 0.65401673, 0.65892318, 0.65401673,
       0.65892318, 0.65401673, 0.65892318, 0.65401673, 0.65892318,
       0.65892318, 0.65892318, 0.65892318, 0.65892318, 0.65892318,
       0.65892318, 0.65892318, 0.65892318, 0.65892318, 0.65892

In [58]:
max_iter = [250,500,1000,2000,5000]
activation = ['identity', 'logistic', 'tanh', 'relu']
hidden_layer_sizes = [(4,3,2), (10,5,2), (50,40,25)]
solver = ['sgd', 'adam']
alpha = [0.0001, 0.001, 0.05, 0.1]
learning_rate = ['constant','adaptive']
accuracy = []
for val in max_iter:
    mlp = MLPClassifier(max_iter=val, random_state = 0)
    mlp.fit(X_train_feat, y_train.loc[:,'win_side'])
    pred = mlp.predict(X_test_feat)
    accuracy.append(accuracy_score(y_test.loc[:,'win_side'],pred))
print(accuracy)
#All levels of iterations seem to be the same

[0.6390243902439025, 0.6390243902439025, 0.6390243902439025, 0.6390243902439025, 0.6390243902439025]


In [54]:
accuracy = []
for val in hidden_layer_sizes:
    mlp = MLPClassifier(hidden_layer_sizes = val, max_iter=500, random_state = 0)
    mlp.fit(X_train_feat, y_train.loc[:,'win_side'])
    pred = mlp.predict(X_test_feat)
    accuracy.append(accuracy_score(y_test.loc[:,'win_side'],pred))
print(accuracy)

[0.6292682926829268, 0.6585365853658537, 0.6682926829268293]


In [55]:
accuracy = []
for val in activation:
    mlp = MLPClassifier(hidden_layer_sizes = (50,40,25), max_iter=500, random_state = 0, activation = val)
    mlp.fit(X_train_feat, y_train.loc[:,'win_side'])
    pred = mlp.predict(X_test_feat)
    accuracy.append(accuracy_score(y_test.loc[:,'win_side'],pred))
print(accuracy)
#Relu seems to be the best

[0.6341463414634146, 0.6292682926829268, 0.6585365853658537, 0.6682926829268293]


In [56]:
accuracy = []
for val in solver:
    mlp = MLPClassifier(hidden_layer_sizes = (50,40,25), max_iter=500, 
                        random_state = 0, activation = 'relu', solver=val)
    mlp.fit(X_train_feat, y_train.loc[:,'win_side'])
    pred = mlp.predict(X_test_feat)
    accuracy.append(accuracy_score(y_test.loc[:,'win_side'],pred))
print(accuracy)
#Adam solver seems best

[0.6292682926829268, 0.6682926829268293]


In [60]:
accuracy = []
for val in alpha:
    mlp = MLPClassifier(hidden_layer_sizes = (50,40,25), max_iter=500,
                        random_state = 0, activation = 'relu', solver='adam', alpha = val)
    mlp.fit(X_train_feat, y_train.loc[:,'win_side'])
    pred = mlp.predict(X_test_feat)
    accuracy.append(accuracy_score(y_test.loc[:,'win_side'],pred))
print(accuracy)
#Alpha val of 0.0001 seems best

[0.6682926829268293, 0.6634146341463415, 0.6634146341463415, 0.6536585365853659]


In [None]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20,10))
epochs_array = max_iter
colors = ['steelblue','grey','red']

for i,val in enumerate(accuracy): 
    ax.plot(epochs_array, val.acc_train_array, ls = 'dashed',color=colors[i], label=f"Train Accuracy: {val.sizes[1]}")
    ax.plot(epochs_array, val.acc_test_array,color=colors[i], label=f"Test Accuracy: {val.sizes[1]}" )
    ax.legend(loc="center right", fontsize=16)


ax.legend(loc="lower right", fontsize=16)
ax.set_xlabel("epochs", fontsize=16)
ax.set_ylabel("accuracy", fontsize=16)
plt.show()