In [None]:
%%capture
%config InlineBackend.figure_format = 'retina'
from sklearn import datasets
from sklearn import impute
from sklearn import neighbors
from sklearn import preprocessing
from sklearn import svm
from sklearn import compose
from sklearn import tree
from sklearn import linear_model
from sklearn import neural_network
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io as sio
import seaborn as sns
import warnings
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import json

warnings.filterwarnings('ignore')
# pip install gspread and df2gspread
import sys
!{sys.executable} -m pip install gspread
!{sys.executable} -m pip install numpy df2gspread
import gspread
from df2gspread import gspread2df as g2d
from oauth2client.service_account import ServiceAccountCredentials

In [None]:
# API scopes the service account is authorized for: the Sheets feed and Google Drive.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]

# Service-account key file (relative path).
keyfile = 'gserviceaccount-client-secret.json'

# Build the credentials from the key file and open an authorized gspread client.
credentials = ServiceAccountCredentials.from_json_keyfile_name(keyfile, scopes=scope)
gc = gspread.authorize(credentials)

In [None]:
spreadsheet_key = '1QUMP6tlBqR3CqYlh7uC18e_XA4FgMT9SqMSc5agoGUA'

# Download the 'Results' worksheet into a DataFrame.
df = g2d.download(
    spreadsheet_key,
    wks_name='Results',
    credentials=credentials,
    col_names=True,
)

# These names are imported only so that eval() below can resolve them; presumably the
# serialized values contain calls such as array(...) and masked_array(...).
from numpy import array, int32
from numpy.ma import masked_array


def _deserialize(serialized):
    """Turn one serialized Python literal from the sheet back into an object.

    SECURITY: eval() executes arbitrary code. This is only tolerable because the
    sheet content is generated by the notebook author; never point this at a
    sheet that somebody else can edit. ast.literal_eval cannot replace it as long
    as the serialized values contain constructor calls.
    """
    return eval(serialized, globals())


# Two of the columns contain Python dicts that have been serialized as text.
# Replace each column as a whole: the previous element-wise `df[col][i] = ...`
# was chained assignment, which pandas does not guarantee to write back to `df`.
df['params'] = [_deserialize(p) for p in df['params']]
df['cv_results'] = [_deserialize(cv) for cv in df['cv_results']]


In [None]:
def xyzc(cv_results, keys, score = 'mean_test_score'):
    """Split grid-search results into four parallel lists.

    cv_results -- mapping with a 'params' sequence of dicts and a sequence under `score`
    keys       -- three parameter names (x, y, z)
    score      -- key of the score sequence to pair with the parameters

    Returns (X, Y, Z, C): the x and y parameter values as strings, the raw z
    parameter values, and the matching scores.
    """
    x_key, y_key, z_key = keys

    # zip() stops at the shorter of the two sequences.
    pairs = list(zip(cv_results['params'], cv_results[score]))

    xs = [str(setting[x_key]) for setting, _ in pairs]
    ys = [str(setting[y_key]) for setting, _ in pairs]
    zs = [setting[z_key] for setting, _ in pairs]
    scores = [value for _, value in pairs]

    return xs, ys, zs, scores

def map_lookup(key, collection, f = lambda x: x):
    """Return an array holding f(item[key]) for every item of `collection`."""
    looked_up = []
    for item in collection:
        looked_up.append(f(item[key]))
    return np.array(looked_up)

def normalize(arr):
    """Min-max scale `arr` so its smallest value maps to 0 and its largest to 1.

    Accepts any array-like (lists as well as numpy arrays) and always returns a
    float numpy array. An empty input yields an empty array, and an input whose
    values are all equal yields zeros instead of dividing by zero.
    """
    values = np.asarray(arr, dtype=float)
    if values.size == 0:
        return values

    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Every value is identical, so there is no range to scale by.
        return np.zeros_like(values)

    return (values - lowest) / span

def uniques(xs):
    """Return the distinct values of `xs` as a list in ascending order."""
    distinct = list(set(xs))
    distinct.sort()
    return distinct

def encoder(xs):
    """Return a dict mapping each distinct value of `xs` to its rank in sorted order."""
    ordered = uniques(xs)
    return dict(zip(ordered, range(len(ordered))))

def decoder(xs):
    """Return a dict mapping each rank (0, 1, ...) to the distinct value of `xs` at that sorted position."""
    return dict(enumerate(uniques(xs)))

def encoded(xs):
    """Return an array with every value of `xs` replaced by its integer code from encoder(xs)."""
    codes = encoder(xs)
    translated = []
    for value in xs:
        translated.append(codes[value])
    return np.array(translated)

def decoded(xs, dec):
    """Return an array with every code in `xs` replaced by dec[code]."""
    restored = []
    for code in xs:
        restored.append(dec[code])
    return np.array(restored)

In [None]:
# Select the grid-search run that the plots below visualise.
# NOTE(review): 107 is a hard-coded row position in the downloaded sheet; confirm it
# still points at the intended run whenever the sheet changes.
selected_run = df.iloc[107]
cvr = selected_run['cv_results']
params = cvr['params']

# Per-combination hyperparameter values; the two categorical ones are stringified.
activations, hidden_layers = [map_lookup(k, params, f = str) for k in ['activation', 'hidden_layer_sizes']]
alphas = map_lookup('alpha', params)
train_scores = cvr['mean_train_score']
test_scores = cvr['mean_test_score']

# The same data as parallel arrays: X = activation, Y = hidden layer sizes, Z = alpha,
# C = mean test score, Ct = mean train score.
grid_keys = ['activation', 'hidden_layer_sizes', 'alpha']
X, Y, Z, C = map(np.array, xyzc(cvr, grid_keys))
_, _, _, Ct = map(np.array, xyzc(cvr, grid_keys, score='mean_train_score'))

# Marker sizes: squared min-max normalized score, scaled to at most 200.
S = 200 * normalize(C)**2
St = 200 * normalize(Ct)**2

# Value -> integer code lookup for each axis.
encX, encY, encZ = [encoder(axis) for axis in (X, Y, Z)]


def encode_with(enc, xs):
    """Return the list of integer codes that `enc` assigns to the values in `xs`."""
    return [enc[x] for x in xs]


selected_run

In [None]:
# 3D scatter over the (activation, hidden_layer_sizes, alpha) grid, where the mean test
# accuracy of each combination is shown through both color and marker size.
activation_ticks = uniques(activations)
layer_ticks = uniques(hidden_layers)
alpha_ticks = uniques(alphas)

ax = plt.figure(dpi=600).add_subplot(111, projection='3d')

ax.scatter(
    encoded(activations), encoded(hidden_layers), encoded(alphas),
    c=test_scores, s=200 * normalize(test_scores)**2, cmap='viridis',
)

# Fix the tick positions first and only then assign labels; labels set before the
# positions are pinned may end up on the wrong ticks.
ax.set_xticks(range(len(activation_ticks)))
ax.set_yticks(range(len(layer_ticks)))
ax.set_zticks(range(len(alpha_ticks)))
ax.set_xticklabels(activation_ticks)
ax.set_yticklabels(layer_ticks)
ax.set_zticklabels(alpha_ticks)
#ax.set_title('Test Accuracy for Various Hyperparameters')
ax.set_xlabel('Activation')
ax.set_ylabel('Hidden Layer Sizes')
ax.set_zlabel('Alpha')

# Extra padding so the axis labels do not collide with the tick labels.
ax.xaxis.labelpad=20
ax.yaxis.labelpad=20
ax.zaxis.labelpad=20

# NOTE(review): `Axes3D.dist` is deprecated in recent Matplotlib releases; confirm this
# still has an effect with the installed version.
ax.dist = 12

plt.savefig('hp.png')

In [None]:
# 3D scatter over the (activation, hidden_layer_sizes, alpha) grid, where the mean training
# accuracy of each combination is shown through both color and marker size.
activation_ticks = uniques(activations)
layer_ticks = uniques(hidden_layers)
alpha_ticks = uniques(alphas)

ax = plt.figure(dpi=300).add_subplot(111, projection='3d')

ax.scatter(
    encoded(activations), encoded(hidden_layers), encoded(alphas),
    c=train_scores, s=200 * normalize(train_scores)**2, cmap='viridis',
)

# Fix the tick positions first and only then assign labels; labels set before the
# positions are pinned may end up on the wrong ticks.
ax.set_xticks(range(len(activation_ticks)))
ax.set_yticks(range(len(layer_ticks)))
ax.set_zticks(range(len(alpha_ticks)))
ax.set_xticklabels(activation_ticks)
ax.set_yticklabels(layer_ticks)
ax.set_zticklabels(alpha_ticks)
ax.set_title('Training Accuracy for Various Hyperparameters')
ax.set_xlabel('Activation')
ax.set_ylabel('Hidden Layer Sizes')
ax.set_zlabel('Alpha')

# Extra padding so the axis labels do not collide with the tick labels.
ax.xaxis.labelpad=20
ax.yaxis.labelpad=20
ax.zaxis.labelpad=20

# NOTE(review): `Axes3D.dist` is deprecated in recent Matplotlib releases; confirm this
# still has an effect with the installed version.
ax.dist = 12

In [None]:
# 3D scatter plot with X = hidden layer sizes, Y = alpha, Z = mean test accuracy and
# color = activation.
activation_names = uniques(activations)
layer_ticks = uniques(hidden_layers)
alpha_ticks = uniques(alphas)

ax = plt.figure(dpi=300).add_subplot(111, projection='3d')

sc = ax.scatter(
    encoded(hidden_layers), encoded(alphas), test_scores,
    c=encoded(activations), s=100, cmap='viridis',
)

# Fix the tick positions first and only then assign labels; labels set before the
# positions are pinned may end up on the wrong ticks.
ax.set_xticks(range(len(layer_ticks)))
ax.set_yticks(range(len(alpha_ticks)))
ax.set_xticklabels(layer_ticks)
ax.set_yticklabels(alpha_ticks)
ax.set_xlabel('Hidden Layer Sizes')
ax.set_ylabel('Alpha')
ax.set_zlabel('Mean Test Accuracy')

# Extra padding so the axis labels do not collide with the tick labels.
ax.xaxis.labelpad=20
ax.yaxis.labelpad=20
ax.zaxis.labelpad=20

# NOTE(review): `Axes3D.dist` is deprecated in recent Matplotlib releases; confirm this
# still has an effect with the installed version.
ax.dist = 13

# One legend handle per color code; the codes follow the sorted activation names,
# so the generated numeric labels are replaced with those names.
handles, _ = sc.legend_elements()
ax.legend(handles, activation_names)

In [None]:
# Two stacked 2D scatter plots of mean test accuracy, colored by activation:
# the top panel is plotted against hidden layer sizes, the bottom panel against alpha.
fig = plt.figure(dpi=300)

activation_names = sorted(set(X))
layer_ticks = sorted(set(Y))
alpha_ticks = sorted(set(Z))

ax = fig.add_subplot(211)
sc = ax.scatter(encode_with(encY, Y), C, c=encode_with(encX, X), cmap='viridis')
# Fix the tick positions before labelling them so each label lands on its own tick.
ax.set_xticks(range(len(layer_ticks)))
ax.set_xticklabels(layer_ticks)
ax.set_xlabel('Hidden Layer Sizes')
ax.set_ylabel('Mean Test Accuracy')
handles, _ = sc.legend_elements()
ax.legend(handles, activation_names)

bx = fig.add_subplot(212)
sc = bx.scatter(encode_with(encZ, Z), C, c=encode_with(encX, X), cmap='viridis')
bx.set_xticks(range(len(alpha_ticks)))
bx.set_xticklabels(alpha_ticks)
# The x axis of this panel is alpha (Z); activation is only shown through color.
bx.set_xlabel('Alpha')
bx.set_ylabel('Mean Test Accuracy')
handles, _ = sc.legend_elements()
bx.legend(handles, activation_names);

In [None]:
import seaborn as sns

# Keep only the result rows that belong to the chosen classifier.
classifier = 'ANN'

is_selected = df['classifier'] == classifier
relevant = df[is_selected]

def extract(cv_results):
    """Yield one flat record per hyperparameter combination in `cv_results`.

    Each record is a copy of the combination's parameter dict with its
    'mean_train_score' and 'mean_test_score' added.
    """
    rows = zip(
        cv_results['params'],
        cv_results['mean_train_score'],
        cv_results['mean_test_score'],
    )
    for settings, train_score, test_score in rows:
        record = dict(settings)
        record['mean_train_score'] = train_score
        record['mean_test_score'] = test_score
        yield record
    
# One row per hyperparameter combination, collected across every selected run.
records = []
for run_results in relevant['cv_results']:
    records.extend(extract(run_results))
flattened = pd.DataFrame(records)

In [None]:
# Test accuracy per hidden-layer configuration, split by activation.
# Logistic (presumably the 'logistic' activation) on (16, 13, 7) has some pretty huge
# outliers. Drawing them severely distorts the chart, so fliers are hidden
# (showfliers=False).
ax = plt.figure(dpi=300).add_subplot(111)
sns.boxplot(
    data=flattened,
    x='hidden_layer_sizes',
    y='mean_test_score',
    hue='activation',
    showfliers=False,
    ax=ax,
)
ax.set_xlabel('Hidden Layers')
ax.set_ylabel('Test Accuracy')
ax.set_ylim(0.60, 1.00)
ax.legend().set_title('Activation');

In [None]:
# Mean test score for every (hidden_layer_sizes, alpha) row and activation column.
pivot_table = flattened.pivot_table(
    values='mean_test_score',
    columns=['activation'],
    index=['hidden_layer_sizes', 'alpha'],
)


def _midpoint_labels(group_keys):
    """Return one tick label per key, naming each run of equal keys only once.

    The label is placed on the middle element of the run (the earlier of the two
    middle elements for even-sized runs); every other position gets ''.
    """
    labels = []
    for key, run in itertools.groupby(group_keys):
        run_length = len(list(run))
        run_labels = [''] * run_length
        run_labels[(run_length - 1) // 2] = str(key)
        labels.extend(run_labels)
    return labels


# Hide the individual alphas within each hidden-layer configuration: only the middle
# row of every configuration is labelled. The labels are derived from the pivot index
# instead of being hard-coded, so they stay correct if the grid changes.
layer_labels = _midpoint_labels(layers for layers, _alpha in pivot_table.index)

plt.figure(dpi=300)
# Passing the labels to heatmap() makes seaborn place one tick per row, so the number
# of labels always matches the number of ticks.
ax = sns.heatmap(pivot_table, cmap='inferno', yticklabels=layer_labels)

ax.set(
    xlabel='Activation',
    ylabel='Hidden Layers',
);