In [None]:
from time import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
from sklearn.base import clone

In [None]:
def open_monks(path):
    '''
    Load a MONKS dataset file into a DataFrame of string-valued columns.

    Every non-blank line is split on single spaces and its first token is
    discarded (presumably the empty string produced by the leading space of
    each MONKS record -- verify against the data files).

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record. The first column is named 'target', the last one
        'id', and the columns in between are labelled 0, 1, 2, ...
    '''
    # The context manager guarantees the handle is closed, even on read errors.
    with open(path, 'r') as file:
        content = file.read().split('\n')  # split to separate different data

    # Skip blank lines instead of blindly dropping the last element: the last
    # record is kept even when the file does not end with a newline.
    rows = [line.split(' ')[1:] for line in content if line.strip()]
    monks_df = pd.DataFrame(rows)

    # Rename the columns: first -> 'target', last -> 'id', the others -> 0..k-1
    last_col = monks_df.shape[1] - 1
    dict_for_rename = {0: 'target', last_col: 'id'}
    dict_for_rename.update({i: i - 1 for i in range(1, last_col)})
    return monks_df.rename(columns=dict_for_rename)

def hot_encoding(df):
    '''
    One-hot encode the categorical attributes of a MONKS DataFrame.

    The first column of ``df`` is read as the target and cast to int; the
    columns at positions 1 to 6 are expanded with ``pd.get_dummies``.

    Returns
    -------
    (X_hot, y) : tuple of np.ndarray
        The one-hot encoded attribute matrix and the integer target vector.
    '''
    all_columns = df.columns

    # Target: first column, converted from strings to an integer numpy array
    labels = np.array(df[all_columns[0]].values, dtype=int)

    # Attributes: the six columns right after the target, each one expanded
    # into one indicator column per distinct value found in the data
    attributes = df[all_columns[1:7]]
    encoded = pd.get_dummies(attributes, columns=attributes.columns)

    return encoded.values, labels

def model_results(X_train, y_train, X_val, y_val, clf, epochs):
    '''
    Train a fresh copy of ``clf`` for ``epochs`` passes and plot its learning curves.

    The estimator is cloned, so ``clf`` itself is never fitted. After every
    ``partial_fit`` call (treated here as one epoch) the accuracy on the
    training and validation data is recorded.

    Parameters
    ----------
    X_train, y_train : array-like
        Data used for the incremental fit and for the training accuracy.
    X_val, y_val : array-like
        Data used only to compute the validation accuracy.
    clf : estimator
        Unfitted model supporting ``partial_fit``, ``score`` and ``loss_curve_``.
    epochs : int
        Number of ``partial_fit`` calls to perform; must be at least 1.

    Returns
    -------
    train_ACCs, val_ACCs : np.ndarray
        Accuracy after each epoch on the training / validation data.
    fig1, fig2 : matplotlib figures
        The accuracy learning curves and the loss curve.
    clf_cloned : estimator
        The trained clone.

    Raises
    ------
    ValueError
        If ``epochs`` is smaller than 1 (there would be nothing to plot or score).
    '''
    # Fail early with a clear message instead of an AttributeError on loss_curve_.
    if epochs < 1:
        raise ValueError(f'epochs must be at least 1, got {epochs!r}')

    clf_cloned = clone(clf)
    clf_cloned.early_stopping = False
    # partial_fit needs the full label set up front; take it from the training targets.
    classes = np.unique(y_train)

    train_ACCs = []
    val_ACCs = []
    for _ in range(epochs):
        clf_cloned.partial_fit(X_train, y_train, classes=classes)
        train_ACCs.append(clf_cloned.score(X_train, y_train))
        val_ACCs.append(clf_cloned.score(X_val, y_val))

    train_ACCs = np.array(train_ACCs)
    val_ACCs = np.array(val_ACCs)

    # Accuracy on training (solid red) and validation (dashed blue) data per epoch
    fig1, ax1 = plt.subplots()
    ax1.plot(train_ACCs, color='r')
    ax1.plot(val_ACCs, color='b', linestyle='--')
    ax1.legend(['Accuracy over TR', 'Accuracy over VAL'])
    ax1.set_title('Learning curves', fontsize=20)
    ax1.set_xlabel('#Epochs', fontsize=15)
    ax1.set_ylabel('Accuracy', fontsize=15)
    ax1.grid()

    # Training loss recorded by the estimator itself
    fig2, ax2 = plt.subplots()
    ax2.plot(clf_cloned.loss_curve_)
    ax2.grid()
    ax2.set_title('Loss vs #Epochs', fontsize=20)
    ax2.set_xlabel('#Epochs', fontsize=15)
    ax2.set_ylabel('Loss', fontsize=15)

    print(f'inner train score = {clf_cloned.score(X_train, y_train)}')
    print(f'validation score = {clf_cloned.score(X_val, y_val)}')
    return train_ACCs, val_ACCs, fig1, fig2, clf_cloned

# Load the train/test files of the three MONKS problems as raw (string) DataFrames.
monks1_train, monks1_test = (
    open_monks('MONKS/monks-1.train'),
    open_monks('MONKS/monks-1.test'),
)
monks2_train, monks2_test = (
    open_monks('MONKS/monks-2.train'),
    open_monks('MONKS/monks-2.test'),
)
monks3_train, monks3_test = (
    open_monks('MONKS/monks-3.train'),
    open_monks('MONKS/monks-3.test'),
)

# MONKS

Below are the grid searches for the MONKS datasets. As suggested in the lectures on MONKS, we decided to solve the problems in the simplest way possible:
- few units
- tanh as activation function
- stochastic gradient descent method
- alpha = 0, regularization shouldn't be necessary

We chose to use:
- the 'invscaling' learning-rate schedule (which also covers 'constant' when power_t=0)
- only 2 initial learning-rate values, both higher than the default learning_rate_init (1e-3), since in most of the explored configurations the learning rate decays through invscaling
- online gradient descent (batch size = 1), and therefore no Nesterov momentum (according to the lectures, it improves performance only in full-batch mode)
- zero momentum, which shouldn't be necessary for convergence (the dataset is small)
- the default tolerance, 1e-4

For each MONKS problem, the first part of the code is the grid search described above.

In the second part of each section, after hyperparameter selection, the training set is split into two subsets, 'inner training' and 'validation'. We retrain the model on the inner training set and use the validation set to choose the epoch at which to stop training. In this way we can show the learning curve and choose the right number of epochs for the final assessment.

# MONKS1

In [None]:
%%time

# One-hot encode the MONKS1 data used by the grid search.
X_train, y_train = hot_encoding(monks1_train)
X_test, y_test = hot_encoding(monks1_test)
n_samples = monks1_train.shape[0]

# Single-valued entries are fixed settings; only the number of hidden units,
# the initial learning rate and the invscaling exponent are actually explored.
param_grid1 = {
    'hidden_layer_sizes' : [3, 4, 5],
    'activation' : ['tanh'],
    'solver' : ['sgd'],
    'alpha' : [0],
    'batch_size' : [1],
    'learning_rate' : ['invscaling'],
    'learning_rate_init' : [1e-1, 1e-2],
    'power_t' : [0, 0.1, 0.2, 0.3],
    'max_iter' : [500],
    'tol' : [1e-4],
    'momentum' : [0],
    'nesterovs_momentum' : [False],
    'n_iter_no_change' : [10],
}

# Both the estimator and the CV splitter are seeded so that a re-run of the
# notebook selects the same model; the splitter seed matches the one used for
# the MONKS2 and MONKS3 searches.
grid1 = GridSearchCV(
    MLPClassifier(random_state=0),
    param_grid=param_grid1,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
    n_jobs=-1,
    refit='accuracy',
)

grid1.fit(X_train, y_train)

# Best configuration, refitted by GridSearchCV on the whole training set.
best_clf1 = grid1.best_estimator_
plt.plot(best_clf1.loss_curve_)
print(f'train score = {best_clf1.score(X_train, y_train)}')

best_clf1

In [None]:
# Persist the full cross-validation table of the MONKS1 grid search.
# NOTE(review): to_csv does not create the target directory -- confirm that
# 'MLP_MONKS_grid_results/' exists before running this cell.
pd.DataFrame(grid1.cv_results_).to_csv('MLP_MONKS_grid_results/MONKS1_results.csv', index=False)

In [None]:
# Loss curve of the best MONKS1 estimator returned by the grid search.
monks1_loss_fig, monks1_loss_ax = plt.subplots()
monks1_loss_ax.plot(best_clf1.loss_curve_)
monks1_loss_ax.grid()
monks1_loss_ax.set_title('Loss vs #Epochs', fontsize=20)
monks1_loss_ax.set_xlabel('#Epochs', fontsize=15)
monks1_loss_ax.set_ylabel('Loss', fontsize=15)

In [None]:
# Re-encode MONKS1 and hold out a stratified 20% of the training data for validation.
X_train, y_train = hot_encoding(monks1_train)
X_test, y_test = hot_encoding(monks1_test)
X_inner_train, X_val, y_inner_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, stratify=y_train, shuffle=True)

# Hard-coded hyperparameters -- presumably the configuration selected by the
# grid search; verify against grid1.best_params_.
# No clone() is needed here: model_results clones the estimator it receives.
clf1 = MLPClassifier(
    hidden_layer_sizes=4,
    activation='tanh',
    solver='sgd',
    alpha=0,
    batch_size=1,
    learning_rate='invscaling',
    learning_rate_init=0.1,
    power_t=0.2,
    max_iter=500,
    momentum=0,
    nesterovs_momentum=False,
    random_state=0,
)

# Train for 60 epochs on the inner training set, tracking validation accuracy.
train_ACCs, val_ACCs, fig1, fig2, clf_cloned = model_results(
    X_train=X_inner_train, y_train=y_inner_train,
    X_val=X_val, y_val=y_val,
    clf=clf1, epochs=60,
)

fig1.savefig('images_MLP_MONKS/learning_curve_MONKS1.pdf')
fig2.savefig('images_MLP_MONKS/loss_curve_MONKS1.pdf')

In [None]:
# Final MONKS1 report for the model trained by model_results: accuracy and MSE
# on the inner-training, validation and test splits.
print(f'train score = {clf_cloned.score(X_inner_train, y_inner_train)}')
print(f'val score = {clf_cloned.score(X_val, y_val)}')
print(f'test score = {clf_cloned.score(X_test, y_test)}')
# The MSE is computed on the hard class predictions returned by predict(), not on
# probabilities; assuming 0/1 labels it equals the misclassification rate -- TODO confirm.
print(f'train MSE = {mean_squared_error(y_inner_train, clf_cloned.predict(X_inner_train))}')
print(f'val MSE = {mean_squared_error(y_val, clf_cloned.predict(X_val))}')
print(f'test MSE = {mean_squared_error(y_test, clf_cloned.predict(X_test))}')

# MONKS2

In [None]:
%%time

# One-hot encode the MONKS2 data used by the grid search.
X_train, y_train = hot_encoding(monks2_train)
X_test, y_test = hot_encoding(monks2_test)
n_samples = monks2_train.shape[0]

# Single-valued entries are fixed settings; only the number of hidden units,
# the initial learning rate and the invscaling exponent are actually explored.
param_grid2 = {
    'hidden_layer_sizes' : [3, 4, 5],
    'activation' : ['tanh'],
    'solver' : ['sgd'],
    'alpha' : [0],
    'batch_size' : [1],
    'learning_rate' : ['invscaling'],
    'learning_rate_init' : [1e-1, 1e-2],
    'power_t' : [0, 0.1, 0.2, 0.3],
    'max_iter' : [500],
    'tol' : [1e-4],
    'momentum' : [0],
    'nesterovs_momentum' : [False],
    'n_iter_no_change' : [10],
}

# The estimator is seeded (like the CV splitter) so that weight initialisation
# and sample shuffling are repeatable and a re-run selects the same model.
grid2 = GridSearchCV(
    MLPClassifier(random_state=0),
    param_grid=param_grid2,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
    n_jobs=-1,
    refit='accuracy',
)

grid2.fit(X_train, y_train)

# Best configuration, refitted by GridSearchCV on the whole training set.
best_clf2 = grid2.best_estimator_
plt.plot(best_clf2.loss_curve_)
# NOTE: the test score is printed for reference only; it must not drive the
# choice of hyperparameters.
print(f'test score = {best_clf2.score(X_test, y_test)}')
print(f'train score = {best_clf2.score(X_train, y_train)}')

best_clf2

In [None]:
# Persist the full cross-validation table of the MONKS2 grid search.
# NOTE(review): to_csv does not create the target directory -- confirm that
# 'MLP_MONKS_grid_results/' exists before running this cell.
pd.DataFrame(grid2.cv_results_).to_csv('MLP_MONKS_grid_results/MONKS2_results.csv', index=False)

In [None]:
# Loss curve of the best MONKS2 estimator returned by the grid search.
monks2_loss_fig, monks2_loss_ax = plt.subplots()
monks2_loss_ax.plot(best_clf2.loss_curve_)
monks2_loss_ax.grid()
monks2_loss_ax.set_title('Loss vs #Epochs', fontsize=20)
monks2_loss_ax.set_xlabel('#Epochs', fontsize=15)
monks2_loss_ax.set_ylabel('Loss', fontsize=15)

In [None]:
# Re-encode MONKS2 and hold out a stratified 20% of the training data for validation.
X_train, y_train = hot_encoding(monks2_train)
X_test, y_test = hot_encoding(monks2_test)
X_inner_train, X_val, y_inner_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, stratify=y_train, shuffle=True)

# Hard-coded hyperparameters -- presumably the configuration selected by the
# grid search; verify against grid2.best_params_.
# No clone() is needed here: model_results clones the estimator it receives.
clf2 = MLPClassifier(
    hidden_layer_sizes=3,
    activation='tanh',
    solver='sgd',
    alpha=0,
    batch_size=1,
    learning_rate='invscaling',
    learning_rate_init=0.1,
    power_t=0.1,
    max_iter=500,
    momentum=0,
    nesterovs_momentum=False,
    random_state=0,
)

# Train for 100 epochs on the inner training set.
# Despite its name, test_ACCs_best2 receives the validation accuracies.
train_ACCs_best2, test_ACCs_best2, fig1, fig2, clf_cloned = model_results(
    X_train=X_inner_train, y_train=y_inner_train,
    X_val=X_val, y_val=y_val,
    clf=clf2, epochs=100,
)

fig1.savefig('images_MLP_MONKS/learning_curve_MONKS2.pdf')
fig2.savefig('images_MLP_MONKS/loss_curve_MONKS2.pdf')

In [None]:
# Final MONKS2 report for the model trained by model_results: accuracy and MSE
# on the inner-training, validation and test splits.
print(f'train score = {clf_cloned.score(X_inner_train, y_inner_train)}')
print(f'val score = {clf_cloned.score(X_val, y_val)}')
print(f'test score = {clf_cloned.score(X_test, y_test)}')
# The MSE is computed on the hard class predictions returned by predict(), not on
# probabilities; assuming 0/1 labels it equals the misclassification rate -- TODO confirm.
print(f'train MSE = {mean_squared_error(y_inner_train, clf_cloned.predict(X_inner_train))}')
print(f'val MSE = {mean_squared_error(y_val, clf_cloned.predict(X_val))}')
print(f'test MSE = {mean_squared_error(y_test, clf_cloned.predict(X_test))}')

# MONKS3

In [None]:
%%time

# One-hot encode the MONKS3 data used by the grid search.
X_train, y_train = hot_encoding(monks3_train)
X_test, y_test = hot_encoding(monks3_test)
n_samples = monks3_train.shape[0]

# Single-valued entries are fixed settings; only the number of hidden units,
# the initial learning rate and the invscaling exponent are actually explored.
param_grid3 = {
    'hidden_layer_sizes' : [3, 4, 5],
    'activation' : ['tanh'],
    'solver' : ['sgd'],
    'alpha' : [0],
    'batch_size' : [1],
    'learning_rate' : ['invscaling'],
    'learning_rate_init' : [1e-1, 1e-2],
    'power_t' : [0, 0.1, 0.2, 0.3],
    'max_iter' : [500],
    'tol' : [1e-4],
    'momentum' : [0],
    'nesterovs_momentum' : [False],
    'n_iter_no_change' : [10],
}

# The estimator is seeded (like the CV splitter) so that weight initialisation
# and sample shuffling are repeatable and a re-run selects the same model.
grid3 = GridSearchCV(
    MLPClassifier(random_state=0),
    param_grid=param_grid3,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
    n_jobs=-1,
    refit='accuracy',
)

grid3.fit(X_train, y_train)

# Best configuration, refitted by GridSearchCV on the whole training set.
best_clf3 = grid3.best_estimator_
plt.plot(best_clf3.loss_curve_)
# NOTE: the test score is printed for reference only; it must not drive the
# choice of hyperparameters.
print(f'test score = {best_clf3.score(X_test, y_test)}')
print(f'train score = {best_clf3.score(X_train, y_train)}')

best_clf3

In [None]:
# Persist the full cross-validation table of the MONKS3 grid search.
# NOTE(review): to_csv does not create the target directory -- confirm that
# 'MLP_MONKS_grid_results/' exists before running this cell.
pd.DataFrame(grid3.cv_results_).to_csv('MLP_MONKS_grid_results/MONKS3_results.csv', index=False)

In [None]:
# Loss curve of the best MONKS3 estimator returned by the grid search.
monks3_loss_fig, monks3_loss_ax = plt.subplots()
monks3_loss_ax.plot(best_clf3.loss_curve_)
monks3_loss_ax.grid()
monks3_loss_ax.set_title('Loss vs #Epochs', fontsize=20)
monks3_loss_ax.set_xlabel('#Epochs', fontsize=15)
monks3_loss_ax.set_ylabel('Loss', fontsize=15)

In [None]:
# Re-encode MONKS3 and hold out a stratified 20% of the training data for validation.
X_train, y_train = hot_encoding(monks3_train)
X_test, y_test = hot_encoding(monks3_test)
X_inner_train, X_val, y_inner_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, stratify=y_train, shuffle=True)

# Hard-coded hyperparameters -- presumably the configuration selected by the
# grid search; verify against grid3.best_params_.
# No clone() is needed here: model_results clones the estimator it receives.
clf3 = MLPClassifier(
    hidden_layer_sizes=3,
    activation='tanh',
    solver='sgd',
    alpha=0,
    batch_size=1,
    learning_rate='invscaling',
    learning_rate_init=0.01,
    power_t=0.3,
    max_iter=500,
    momentum=0,
    nesterovs_momentum=False,
    random_state=0,
)

# Train for 200 epochs on the inner training set.
# Despite its name, test_ACCs_best3 receives the validation accuracies.
train_ACCs_best3, test_ACCs_best3, fig1, fig2, clf_cloned = model_results(
    X_train=X_inner_train, y_train=y_inner_train,
    X_val=X_val, y_val=y_val,
    clf=clf3, epochs=200,
)

fig1.savefig('images_MLP_MONKS/learning_curve_MONKS3.pdf')
fig2.savefig('images_MLP_MONKS/loss_curve_MONKS3.pdf')

In [None]:
# Final MONKS3 report for the model trained by model_results: accuracy and MSE
# on the inner-training, validation and test splits.
print(f'train score = {clf_cloned.score(X_inner_train, y_inner_train)}')
print(f'val score = {clf_cloned.score(X_val, y_val)}')
print(f'test score = {clf_cloned.score(X_test, y_test)}')
# The MSE is computed on the hard class predictions returned by predict(), not on
# probabilities; assuming 0/1 labels it equals the misclassification rate -- TODO confirm.
print(f'train MSE = {mean_squared_error(y_inner_train, clf_cloned.predict(X_inner_train))}')
print(f'val MSE = {mean_squared_error(y_val, clf_cloned.predict(X_val))}')
print(f'test MSE = {mean_squared_error(y_test, clf_cloned.predict(X_test))}')

# MONKS3 + REG

In [None]:
# Re-encode MONKS3 and hold out a stratified 20% of the training data for validation.
X_train, y_train = hot_encoding(monks3_train)
X_test, y_test = hot_encoding(monks3_test)
X_inner_train, X_val, y_inner_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, stratify=y_train, shuffle=True)

# Regularized variant: constant learning rate and alpha = 0.1, passed directly
# to the constructor. No clone() is needed here: model_results clones the
# estimator it receives.
clf3 = MLPClassifier(
    hidden_layer_sizes=3,
    activation='tanh',
    solver='sgd',
    alpha=1e-1,
    batch_size=1,
    learning_rate='constant',
    learning_rate_init=0.01,
    max_iter=500,
    momentum=0,
    nesterovs_momentum=False,
    random_state=0,
)

# Train for 50 epochs on the inner training set.
# Despite its name, test_ACCs_best3 receives the validation accuracies.
train_ACCs_best3, test_ACCs_best3, fig1, fig2, clf_cloned = model_results(
    X_train=X_inner_train, y_train=y_inner_train,
    X_val=X_val, y_val=y_val,
    clf=clf3, epochs=50,
)

fig1.savefig('images_MLP_MONKS/learning_curve_MONKS3+reg.pdf')
fig2.savefig('images_MLP_MONKS/loss_curve_MONKS3+reg.pdf')

In [None]:
# Final report for the regularized MONKS3 model trained by model_results:
# accuracy and MSE on the inner-training, validation and test splits.
print(f'train score = {clf_cloned.score(X_inner_train, y_inner_train)}')
print(f'val score = {clf_cloned.score(X_val, y_val)}')
print(f'test score = {clf_cloned.score(X_test, y_test)}')
# The MSE is computed on the hard class predictions returned by predict(), not on
# probabilities; assuming 0/1 labels it equals the misclassification rate -- TODO confirm.
print(f'train MSE = {mean_squared_error(y_inner_train, clf_cloned.predict(X_inner_train))}')
print(f'val MSE = {mean_squared_error(y_val, clf_cloned.predict(X_val))}')
print(f'test MSE = {mean_squared_error(y_test, clf_cloned.predict(X_test))}')