# MONKS (1,2,3) - KNN

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier


from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

In [None]:
def open_monks(path):
    '''
    Function to open monks datasets

    Every non-empty line is split on single spaces and its first token is
    discarded (presumably the empty string produced by the leading blank of
    each record -- verify against the data files).

    Parameters
    ---
    path : str
        It's the path of the file
    Returns
    ---
    monks_df : pandas DataFrame
        the df that contains the dataset. All cells are strings; the first
        column is named 'target', the last one 'id' and the columns in
        between are labelled 0, 1, 2, ...

    '''
    # Context manager: the handle is closed even if reading fails.
    with open(path, 'r') as file:
        content = file.read().split('\n')  # split to separate different data

    # Skip blank lines instead of blindly dropping the last element, so the
    # final record survives when the file has no trailing newline.
    rows = [line.split(' ')[1:] for line in content if line.strip()]
    monks_df = pd.DataFrame(rows)  # creation of the df using separation by ' '

    # Rename: first column -> 'target', last -> 'id', inner columns shifted to 0..k
    n_cols = monks_df.shape[1]
    dict_for_rename = {0: 'target', n_cols - 1: 'id'}
    dict_for_rename.update({i: i - 1 for i in range(1, n_cols - 1)})
    monks_df = monks_df.rename(columns=dict_for_rename)
    return monks_df


def hot_encoding(df):
    '''
    Split a MONK dataframe into one-hot encoded features and integer targets.

    Inputs:
    df - dataframe whose first column is the target and whose columns 1..6
         are the categorical features

    Outputs:
    X_hot - np. array with one dummy column per (feature, observed value) pair
    y - np. array of the target values converted to int
    '''
    # Target: first column, converted from strings to integers
    labels = np.array(df[df.columns[0]].values, dtype=int)

    # Features: the six columns right after the target, all treated as categorical
    categorical = df[df.columns[1:7]]
    encoded = pd.get_dummies(categorical, columns=categorical.columns)

    return encoded.values, labels


def not_hot_encoding(df):
    '''
    Split a MONK dataframe into integer-coded features and integer targets
    (no one-hot encoding is applied).

    Inputs:
    df - dataframe whose first column is the target and whose columns 1..6
         are the features

    Outputs:
    X - np. array of the six feature columns converted to int
    y - np. array of the target values converted to int
    '''
    all_columns = df.columns
    # Target: first column, string -> int
    y = np.array(df[all_columns[0]].values, dtype=int)
    # Features: the six columns following the target, string -> int
    X = np.array(df[all_columns[1:7]].values, dtype=int)
    return X, y



def selecting_results(df, param_metric_value, param_weights_value):
    '''
    Filter the cv_results_ grid down to the rows whose "metric" and "weights"
    hyperparameters match the given values (typically those of the best model).

    Inputs:
    df - The cv_results grid converted to a pd. DataFrame
    param_metric_value - value to match in the "param_metric" column
    param_weights_value - value to match in the "param_weights" column

    Outputs:
    df - dataframe holding only the matching rows (original index preserved)

    '''
    same_metric = df["param_metric"] == param_metric_value
    same_weights = df["param_weights"] == param_weights_value
    return df[same_metric & same_weights]


### Custom Metrics 

In [None]:
def distance_nominal(x,y):
    '''
    Distance for nominal features: the number of positions at which
    the two arrays x and y hold different values.
    '''
    mismatches = x != y
    return mismatches.sum()

def distance_ordinal(x, y, ranges=(2, 2, 1, 2, 3, 1)):
    '''
    Distance for ordinal features: sum of the absolute per-feature
    differences, each divided by the corresponding entry of `ranges`.

    Inputs:
    x, y - np. arrays with one integer code per feature
    ranges - per-feature divisors. The default presumably holds the
             (max - min) span of the six MONK attributes -- verify against
             the dataset description. Must have one entry per feature.

    Outputs:
    the scaled distance as a float
    '''
    return np.sum(np.abs(x - y) / np.asarray(ranges))


In [None]:
# Load the training (TR) and test (TS) split of each MONK problem
monks1_train, monks1_test = open_monks('monks-1.train'), open_monks('monks-1.test')
monks2_train, monks2_test = open_monks('monks-2.train'), open_monks('monks-2.test')
monks3_train, monks3_test = open_monks('monks-3.train'), open_monks('monks-3.test')

# MONK 1

# Data Preprocessing
- One-hot encoding: ON 

In [None]:
# One-hot encode the features of the MONK 1 training and test sets; targets become ints.
# NOTE(review): the two splits are encoded independently, so their dummy columns line up
# only if both contain the same set of values for every feature -- confirm.
X_train, y_train = hot_encoding(monks1_train)
X_test, y_test = hot_encoding(monks1_test)

## Model Selection (Monk 1: one-hot encoding ON)

In [None]:
# Candidate k values: 1 .. int(4/5 * n_train) - 1 (np.arange excludes the upper bound).
# NOTE(review): the other grid searches in this notebook use a 4/6 factor -- confirm
# whether the wider range here is intentional.
n_neighbors_v = np.arange(1, int(4 * len(X_train) / 5))

param_grid = dict(
    n_neighbors=n_neighbors_v,
    weights=["distance", "uniform"],
    metric=["cityblock", "cosine"],
)

# Exhaustive search, 5-fold stratified CV repeated 10 times with a fixed seed;
# the best configuration is refitted on the whole training set.
grid_1hot = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=RepeatedStratifiedKFold(n_repeats=10, n_splits=5, random_state=0),
    return_train_score=True,
    refit=True,
    n_jobs=-1,
).fit(X_train, y_train)

cv_results_1hot = pd.DataFrame(grid_1hot.cv_results_)
best_estimator_1hot = grid_1hot.best_estimator_

In [None]:
# Best model: the configuration ranked first by mean validation (VL) accuracy.
# The selection keeps every row with rank 1, so it may hold more than one row on ties.
best_model_1hot = cv_results_1hot[cv_results_1hot["rank_test_score"] == 1]
best_model_1hot  # displayed as the cell output

In [None]:
# Hyperparameters of the best model (first rank-1 row)
best_n_neighbors_1hot, best_weigths_1hot, best_distance_1hot = (
    best_model_1hot[col].iloc[0]
    for col in ("param_n_neighbors", "param_weights", "param_metric")
)

# Mean and std of the accuracy over the training folds (TR)
tr_mean_accuracy_1hot, tr_std_accuracy_1hot = (
    best_model_1hot[col].iloc[0] for col in ("mean_train_score", "std_train_score")
)

# Mean and std of the accuracy over the validation folds (VL)
vl_mean_accuracy_1hot, vl_std_accuracy_1hot = (
    best_model_1hot[col].iloc[0] for col in ("mean_test_score", "std_test_score")
)

print("\n".join([
    "##### BEST MODEL _1hot #####",
    f"best n_neighbors: {best_n_neighbors_1hot}",
    f"best weigths: {best_weigths_1hot}",
    f"best distance: {best_distance_1hot}",
    f"Train Accuracy: {tr_mean_accuracy_1hot} +- {tr_std_accuracy_1hot}",
    f"Validation Accuracy: {vl_mean_accuracy_1hot} +- {vl_std_accuracy_1hot}",
]))


In [None]:
# Keep only the CV rows whose metric and weights equal those of the best model,
# so the remaining rows differ in n_neighbors only.
results_1hot = selecting_results(cv_results_1hot, best_distance_1hot, best_weigths_1hot)

In [None]:
# Mean validation accuracy as a function of k, for the best (weights, metric) pair
plt.scatter(
    results_1hot["param_n_neighbors"],
    results_1hot["mean_test_score"],
    marker='.',
    label="Validation Accuracy (mean value)",
)
plt.grid()
plt.xlabel("n neighbors")
plt.ylabel("Validation Accuracy")
title_str = f"MONK1, ENC- Weights = {best_weigths_1hot}, metric = {best_distance_1hot}"
plt.title(title_str)

# Red vertical line at the selected k
best_k_sting = f"best n_neighbors = {best_n_neighbors_1hot}"
plt.axvline(x=best_n_neighbors_1hot, color='red', label=best_k_sting)
plt.legend()

# title_str is reused as the output file name
title_str = "val_acc_vs_k_monk1_hot.pdf"
plt.savefig(title_str)


In [None]:
# Std of the validation accuracy as a function of k, for the best (weights, metric) pair
plt.scatter(results_1hot["param_n_neighbors"], results_1hot["std_test_score"], marker='.', label="Validation Accuracy (std)")
plt.grid()
plt.ylabel("std Validation Accuracy")
plt.xlabel("n neighbors")
title_str = "MONK1, ENC- Weights = " + best_weigths_1hot + ", " + "metric = " + best_distance_1hot
plt.title(title_str)
# Fix: the legend must report the best k, not the best weights value
best_k_sting = "best n_neighbors = " + str(best_n_neighbors_1hot)
plt.axvline(x=best_model_1hot["param_n_neighbors"].iloc[0], color='red', label=best_k_sting)
# title_str is reused as the output file name
title_str = "val_acc_std_vs_k_monk1_hot" + ".pdf"
plt.legend()
plt.savefig(title_str)

## Model Assessment (Monk 1: one-hot encoding ON)

### Metrics and Confusion Matrix

In [None]:
# Evaluate the refitted best estimator on the test set
y_pred_1hot = best_estimator_1hot.predict(X_test)
print(classification_report(y_test, y_pred_1hot))
best_estimator_1hot  # displayed as the cell output

In [None]:
# confusion_matrix puts true labels on the rows and predicted labels on the columns;
# sns.heatmap draws columns along x and rows along y, hence the axis labels below.
cf = confusion_matrix(y_test, y_pred_1hot)
# fmt="d": plain integer counts (the default format switches to scientific notation for counts >= 100)
sns.heatmap(cf, annot=True, fmt="d", cmap="Greens")
plt.xlabel("Predicted label")
plt.ylabel("True label")

# Test accuracy of the refitted best model next to its mean validation accuracy
accuracy_test_1hot = best_estimator_1hot.score(X_test, y_test)
print(f"Test Accuracy_1hot: {accuracy_test_1hot}")
print(f"Validation Accuracy_1hot: {vl_mean_accuracy_1hot}")

# Data Preprocessing
- One-hot encoding: OFF 

In [None]:
# Integer-coded features (no one-hot encoding) and int targets for MONK 1 training and test sets.
X_train, y_train = not_hot_encoding(monks1_train)
X_test, y_test = not_hot_encoding(monks1_test)

## Model Selection (Monk 1: one-hot encoding OFF)

In [None]:
# Candidate k values: 1 .. int(4/6 * n_train) - 1 (np.arange excludes the upper bound)
n_neighbors_v = np.arange(1, int(4 * len(X_train) / 6))

# The two custom callables are used as distance metrics on the integer-coded features
param_grid = dict(
    n_neighbors=n_neighbors_v,
    weights=["distance", "uniform"],
    metric=[distance_nominal, distance_ordinal],
)

# Exhaustive search, 5-fold stratified CV repeated 10 times with a fixed seed;
# the best configuration is refitted on the whole training set.
grid_1cold = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=RepeatedStratifiedKFold(n_repeats=10, n_splits=5, random_state=0),
    return_train_score=True,
    refit=True,
    n_jobs=-1,
).fit(X_train, y_train)

cv_results_1cold = pd.DataFrame(grid_1cold.cv_results_)
best_estimator_1cold = grid_1cold.best_estimator_

In [None]:
# Best model: the configuration ranked first by mean validation (VL) accuracy.
# The selection keeps every row with rank 1, so it may hold more than one row on ties.
best_model_1cold = cv_results_1cold[cv_results_1cold["rank_test_score"] == 1]
best_model_1cold  # displayed as the cell output

In [None]:
# Hyperparameters of the best model (first rank-1 row)
best_n_neighbors_1cold, best_weigths_1cold, best_distance_1cold = (
    best_model_1cold[col].iloc[0]
    for col in ("param_n_neighbors", "param_weights", "param_metric")
)

# Mean and std of the accuracy over the training folds (TR)
tr_mean_accuracy_1cold, tr_std_accuracy_1cold = (
    best_model_1cold[col].iloc[0] for col in ("mean_train_score", "std_train_score")
)

# Mean and std of the accuracy over the validation folds (VL)
vl_mean_accuracy_1cold, vl_std_accuracy_1cold = (
    best_model_1cold[col].iloc[0] for col in ("mean_test_score", "std_test_score")
)

print("\n".join([
    "##### BEST MODEL _1cold #####",
    f"best n_neighbors: {best_n_neighbors_1cold}",
    f"best weigths: {best_weigths_1cold}",
    f"best distance: {best_distance_1cold}",
    f"Train Accuracy: {tr_mean_accuracy_1cold} +- {tr_std_accuracy_1cold}",
    f"Validation Accuracy: {vl_mean_accuracy_1cold} +- {vl_std_accuracy_1cold}",
]))

In [None]:
# Keep only the CV rows whose metric and weights equal those of the best model,
# so the remaining rows differ in n_neighbors only.
results_1cold = selecting_results(cv_results_1cold, best_distance_1cold, best_weigths_1cold)

In [None]:
# Mean validation accuracy as a function of k, for the best (weights, metric) pair
plt.scatter(results_1cold["param_n_neighbors"], results_1cold["mean_test_score"], marker='.', label="Validation Accuracy (mean value)")
plt.grid()
plt.ylabel("Validation Accuracy")
plt.xlabel("n neighbors")
# Fix: take the metric name from the selected best metric instead of hard-coding it,
# so the title cannot disagree with the plotted rows.
metric_name = getattr(best_distance_1cold, "__name__", str(best_distance_1cold))
title_str = f"MONK1, NENC - Weights = {best_weigths_1cold}, metric = '{metric_name}'"
plt.title(title_str)
best_k_sting = "best n_neighbors = " + str(best_n_neighbors_1cold)
plt.axvline(x=best_model_1cold["param_n_neighbors"].iloc[0], color='red', label=best_k_sting)
# title_str is reused as the output file name
title_str = "val_acc_vs_k_monk1_cold" + ".pdf"
plt.legend()
plt.savefig(title_str)

In [None]:
# Std of the validation accuracy as a function of k, for the best (weights, metric) pair.
# Fix: the plotted quantity is std_test_score, so label and y-axis now say so.
plt.scatter(results_1cold["param_n_neighbors"], results_1cold["std_test_score"], marker='.', label="Validation Accuracy (std)")
plt.grid()
plt.ylabel("std Validation Accuracy")
plt.xlabel("n neighbors")
# Fix: metric name taken from the selected best metric instead of being hard-coded
metric_name = getattr(best_distance_1cold, "__name__", str(best_distance_1cold))
title_str = f"MONK1, NENC - Weights = {best_weigths_1cold}, metric = '{metric_name}'"
plt.title(title_str)
best_k_sting = "best n_neighbors = " + str(best_n_neighbors_1cold)
plt.axvline(x=best_model_1cold["param_n_neighbors"].iloc[0], color='red', label=best_k_sting)
# title_str is reused as the output file name
title_str = "val_acc_std_vs_k_monk1_cold" + ".pdf"
plt.legend()
plt.savefig(title_str)

## Model Assessment (Monk 1: one-hot encoding OFF)

### Metrics 


In [None]:
# Evaluate the refitted best estimator on the test set
y_pred_1cold = best_estimator_1cold.predict(X_test)
print(classification_report(y_test, y_pred_1cold))
best_estimator_1cold  # displayed as the cell output

### Confusion Matrix 

In [None]:
# confusion_matrix puts true labels on the rows and predicted labels on the columns;
# sns.heatmap draws columns along x and rows along y, hence the axis labels below.
cf = confusion_matrix(y_test, y_pred_1cold)
# fmt="d": plain integer counts (the default format switches to scientific notation for counts >= 100)
sns.heatmap(cf, annot=True, fmt="d", cmap="Greens")
plt.xlabel("Predicted label")
plt.ylabel("True label")

# Test accuracy of the refitted best model next to its mean validation accuracy
accuracy_test_1cold = best_estimator_1cold.score(X_test, y_test)
print(f"Test Accuracy_1cold: {accuracy_test_1cold}")
print(f"Validation Accuracy_1cold: {vl_mean_accuracy_1cold}")

# MONK 2

# Data Preprocessing
- One-hot encoding: ON 

In [None]:
# One-hot encode the features of the MONK 2 training and test sets; targets become ints.
# NOTE(review): the two splits are encoded independently, so their dummy columns line up
# only if both contain the same set of values for every feature -- confirm.
X_train, y_train = hot_encoding(monks2_train)
X_test, y_test = hot_encoding(monks2_test)

# Model Selection (Monk 2: one-hot encoding ON)

In [None]:
# Candidate k values: 1 .. int(4/6 * n_train) - 1 (np.arange excludes the upper bound)
n_neighbors_v = np.arange(1, int(4 * len(X_train) / 6))

param_grid = dict(
    n_neighbors=n_neighbors_v,
    weights=["distance", "uniform"],
    metric=["cityblock", "cosine"],
)

# Exhaustive search, 5-fold stratified CV repeated 10 times with a fixed seed;
# the best configuration is refitted on the whole training set.
grid_2hot = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=RepeatedStratifiedKFold(n_repeats=10, n_splits=5, random_state=0),
    return_train_score=True,
    refit=True,
    n_jobs=-1,
).fit(X_train, y_train)

cv_results_2hot = pd.DataFrame(grid_2hot.cv_results_)
best_estimator_2hot = grid_2hot.best_estimator_

In [None]:
# Best model: the configuration ranked first by mean validation (VL) accuracy.
# The selection keeps every row with rank 1, so it may hold more than one row on ties.
best_model_2hot = cv_results_2hot[cv_results_2hot["rank_test_score"] == 1]
best_model_2hot  # displayed as the cell output

In [None]:
# Hyperparameters of the best model (first rank-1 row)
best_n_neighbors_2hot, best_weigths_2hot, best_distance_2hot = (
    best_model_2hot[col].iloc[0]
    for col in ("param_n_neighbors", "param_weights", "param_metric")
)

# Mean and std of the accuracy over the training folds (TR)
tr_mean_accuracy_2hot, tr_std_accuracy_2hot = (
    best_model_2hot[col].iloc[0] for col in ("mean_train_score", "std_train_score")
)

# Mean and std of the accuracy over the validation folds (VL)
vl_mean_accuracy_2hot, vl_std_accuracy_2hot = (
    best_model_2hot[col].iloc[0] for col in ("mean_test_score", "std_test_score")
)

print("\n".join([
    "##### BEST MODEL _2hot #####",
    f"best n_neighbors: {best_n_neighbors_2hot}",
    f"best weigths: {best_weigths_2hot}",
    f"best distance: {best_distance_2hot}",
    f"Train Accuracy: {tr_mean_accuracy_2hot} +- {tr_std_accuracy_2hot}",
    f"Validation Accuracy: {vl_mean_accuracy_2hot} +- {vl_std_accuracy_2hot}",
]))

In [None]:
# Keep only the CV rows whose metric and weights equal those of the best model,
# so the remaining rows differ in n_neighbors only.
results_2hot = selecting_results(cv_results_2hot, best_distance_2hot, best_weigths_2hot)

In [None]:
# Mean validation accuracy as a function of k, for the best (weights, metric) pair
plt.scatter(
    results_2hot["param_n_neighbors"],
    results_2hot["mean_test_score"],
    marker='.',
    label="Validation Accuracy (mean value)",
)
plt.grid()
plt.xlabel("n neighbors")
plt.ylabel("Validation Accuracy")
title_str = f"MONK2, ENC - Weights = {best_weigths_2hot}, metric = {best_distance_2hot}"
plt.title(title_str)

# Red vertical line at the selected k
best_k_sting = f"best n_neighbors = {best_n_neighbors_2hot}"
plt.axvline(x=best_n_neighbors_2hot, color='red', label=best_k_sting)
plt.legend()

# title_str is reused as the output file name
title_str = "val_acc_vs_k_monk2_hot.pdf"
plt.savefig(title_str)

In [None]:
# Std of the validation accuracy as a function of k, for the best (weights, metric) pair.
# Fix: the plotted quantity is std_test_score, so label and y-axis now say so.
plt.scatter(results_2hot["param_n_neighbors"], results_2hot["std_test_score"], marker='.', label="Validation Accuracy (std)")
plt.grid()
plt.ylabel("std Validation Accuracy")
plt.xlabel("n neighbors")
title_str = "MONK2, ENC - Weights = " + best_weigths_2hot + ", " + "metric = " + best_distance_2hot
plt.title(title_str)
best_k_sting = "best n_neighbors = " + str(best_n_neighbors_2hot)
plt.axvline(x=best_model_2hot["param_n_neighbors"].iloc[0], color='red', label=best_k_sting)
# title_str is reused as the output file name
title_str = "val_acc_std_vs_k_monk2_hot" + ".pdf"
plt.legend()
plt.savefig(title_str)

# Model Assessment (Monk 2: one-hot encoding ON)

## Metrics and Confusion Matrix

In [None]:
# Evaluate the refitted best estimator on the test set
y_pred_2hot = best_estimator_2hot.predict(X_test)
print(classification_report(y_test, y_pred_2hot))
best_estimator_2hot  # displayed as the cell output

In [None]:
# confusion_matrix puts true labels on the rows and predicted labels on the columns;
# sns.heatmap draws columns along x and rows along y, hence the axis labels below.
cf = confusion_matrix(y_test, y_pred_2hot)
# fmt="d": plain integer counts (the default format switches to scientific notation for counts >= 100)
sns.heatmap(cf, annot=True, fmt="d", cmap="Greens")
plt.xlabel("Predicted label")
plt.ylabel("True label")

# Test accuracy of the refitted best model next to its mean validation accuracy
accuracy_test_2hot = best_estimator_2hot.score(X_test, y_test)
print(f"Test Accuracy_2hot: {accuracy_test_2hot}")
print(f"Validation Accuracy_2hot: {vl_mean_accuracy_2hot}")

# Data Preprocessing
- One-hot encoding: OFF 

In [None]:
# Integer-coded features (no one-hot encoding) and int targets for MONK 2 training and test sets.
X_train, y_train = not_hot_encoding(monks2_train)
X_test, y_test = not_hot_encoding(monks2_test)

# Model Selection (Monk 2: one-hot encoding OFF)

In [None]:
# Candidate k values: 1 .. int(4/6 * n_train) - 1 (np.arange excludes the upper bound)
n_neighbors_v = np.arange(1, int(4 * len(X_train) / 6))

# The two custom callables are used as distance metrics on the integer-coded features
param_grid = dict(
    n_neighbors=n_neighbors_v,
    weights=["distance", "uniform"],
    metric=[distance_nominal, distance_ordinal],
)

# Exhaustive search, 5-fold stratified CV repeated 10 times with a fixed seed;
# the best configuration is refitted on the whole training set.
grid_2cold = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=RepeatedStratifiedKFold(n_repeats=10, n_splits=5, random_state=0),
    return_train_score=True,
    refit=True,
    n_jobs=-1,
).fit(X_train, y_train)

cv_results_2cold = pd.DataFrame(grid_2cold.cv_results_)
best_estimator_2cold = grid_2cold.best_estimator_

In [None]:
# Best model: the configuration ranked first by mean validation (VL) accuracy.
# The selection keeps every row with rank 1, so it may hold more than one row on ties.
best_model_2cold = cv_results_2cold[cv_results_2cold["rank_test_score"] == 1]
best_model_2cold  # displayed as the cell output

In [None]:
# Hyperparameters of the best model (first rank-1 row)
best_n_neighbors_2cold, best_weigths_2cold, best_distance_2cold = (
    best_model_2cold[col].iloc[0]
    for col in ("param_n_neighbors", "param_weights", "param_metric")
)

# Mean and std of the accuracy over the training folds (TR)
tr_mean_accuracy_2cold, tr_std_accuracy_2cold = (
    best_model_2cold[col].iloc[0] for col in ("mean_train_score", "std_train_score")
)

# Mean and std of the accuracy over the validation folds (VL)
vl_mean_accuracy_2cold, vl_std_accuracy_2cold = (
    best_model_2cold[col].iloc[0] for col in ("mean_test_score", "std_test_score")
)

print("\n".join([
    "##### BEST MODEL _2cold #####",
    f"best n_neighbors: {best_n_neighbors_2cold}",
    f"best weigths: {best_weigths_2cold}",
    f"best distance: {best_distance_2cold}",
    f"Train Accuracy: {tr_mean_accuracy_2cold} +- {tr_std_accuracy_2cold}",
    f"Validation Accuracy: {vl_mean_accuracy_2cold} +- {vl_std_accuracy_2cold}",
]))


In [None]:
# Keep only the CV rows whose metric and weights equal those of the best model,
# so the remaining rows differ in n_neighbors only.
results_2cold = selecting_results(cv_results_2cold, best_distance_2cold, best_weigths_2cold)

In [None]:
# Mean validation accuracy as a function of k, for the best (weights, metric) pair
plt.scatter(results_2cold["param_n_neighbors"], results_2cold["mean_test_score"], marker='.', label="Validation Accuracy (mean value)")
plt.grid()
plt.ylabel("Validation Accuracy")
plt.xlabel("n neighbors")
# Fix: take the metric name from the selected best metric instead of hard-coding it,
# so the title cannot disagree with the plotted rows.
metric_name = getattr(best_distance_2cold, "__name__", str(best_distance_2cold))
title_str = f"MONK2, NENC - Weights = {best_weigths_2cold}, metric = '{metric_name}'"
plt.title(title_str)
best_k_sting = "best n_neighbors = " + str(best_n_neighbors_2cold)
plt.axvline(x=best_model_2cold["param_n_neighbors"].iloc[0], color='red', label=best_k_sting)
# title_str is reused as the output file name
title_str = "val_acc_vs_k_monk2_cold" + ".pdf"
plt.legend()
plt.savefig(title_str)

In [None]:
# Std of the validation accuracy as a function of k, for the best (weights, metric) pair.
# Fix: the plotted quantity is std_test_score, so label and y-axis now say so.
plt.scatter(results_2cold["param_n_neighbors"], results_2cold["std_test_score"], marker='.', label="Validation Accuracy (std)")
plt.grid()
plt.ylabel("std Validation Accuracy")
plt.xlabel("n neighbors")
# Fix: metric name taken from the selected best metric instead of being hard-coded
metric_name = getattr(best_distance_2cold, "__name__", str(best_distance_2cold))
title_str = f"MONK2, NENC - Weights = {best_weigths_2cold}, metric = '{metric_name}'"
plt.title(title_str)
best_k_sting = "best n_neighbors = " + str(best_n_neighbors_2cold)
plt.axvline(x=best_model_2cold["param_n_neighbors"].iloc[0], color='red', label=best_k_sting)
# title_str is reused as the output file name
title_str = "val_acc_std_vs_k_monk2_cold" + ".pdf"
plt.legend()
plt.savefig(title_str)

# Model Assessment (Monk 2: one-hot encoding OFF)

## Metrics and Confusion Matrix 

In [None]:
# Evaluate the refitted best estimator on the test set
y_pred_2cold = best_estimator_2cold.predict(X_test)
print(classification_report(y_test, y_pred_2cold))
best_estimator_2cold  # displayed as the cell output

In [None]:
# confusion_matrix puts true labels on the rows and predicted labels on the columns;
# sns.heatmap draws columns along x and rows along y, hence the axis labels below.
cf = confusion_matrix(y_test, y_pred_2cold)
# fmt="d": plain integer counts (the default format switches to scientific notation for counts >= 100)
sns.heatmap(cf, annot=True, fmt="d", cmap="Greens")
plt.xlabel("Predicted label")
plt.ylabel("True label")

# Test accuracy of the refitted best model next to its mean validation accuracy
accuracy_test_2cold = best_estimator_2cold.score(X_test, y_test)
print(f"Test Accuracy_2cold: {accuracy_test_2cold}")
print(f"Validation Accuracy_2cold: {vl_mean_accuracy_2cold}")

# MONK 3


# Data Preprocessing
- One-hot encoding: ON 

In [None]:
# One-hot encode the features of the MONK 3 training and test sets; targets become ints.
# NOTE(review): the two splits are encoded independently, so their dummy columns line up
# only if both contain the same set of values for every feature -- confirm.
X_train, y_train = hot_encoding(monks3_train)
X_test, y_test = hot_encoding(monks3_test)

# Model Selection (Monk 3: one-hot encoding ON)

In [None]:
# Candidate k values: 1 .. int(4/6 * n_train) - 1 (np.arange excludes the upper bound)
n_neighbors_v = np.arange(1, int(4 * len(X_train) / 6))

param_grid = dict(
    n_neighbors=n_neighbors_v,
    weights=["distance", "uniform"],
    metric=["cityblock", "cosine"],
)

# Exhaustive search, 5-fold stratified CV repeated 10 times with a fixed seed;
# the best configuration is refitted on the whole training set.
grid_3hot = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=RepeatedStratifiedKFold(n_repeats=10, n_splits=5, random_state=0),
    return_train_score=True,
    refit=True,
    n_jobs=-1,
).fit(X_train, y_train)

cv_results_3hot = pd.DataFrame(grid_3hot.cv_results_)
best_estimator_3hot = grid_3hot.best_estimator_

In [None]:
# Best model: the configuration ranked first by mean validation (VL) accuracy.
# The selection keeps every row with rank 1, so it may hold more than one row on ties.
best_model_3hot = cv_results_3hot[cv_results_3hot["rank_test_score"] == 1]
best_model_3hot  # displayed as the cell output

In [None]:
# Hyperparameters of the best model (first rank-1 row)
best_n_neighbors_3hot, best_weigths_3hot, best_distance_3hot = (
    best_model_3hot[col].iloc[0]
    for col in ("param_n_neighbors", "param_weights", "param_metric")
)

# Mean and std of the accuracy over the training folds (TR)
tr_mean_accuracy_3hot, tr_std_accuracy_3hot = (
    best_model_3hot[col].iloc[0] for col in ("mean_train_score", "std_train_score")
)

# Mean and std of the accuracy over the validation folds (VL)
vl_mean_accuracy_3hot, vl_std_accuracy_3hot = (
    best_model_3hot[col].iloc[0] for col in ("mean_test_score", "std_test_score")
)

print("\n".join([
    "##### BEST MODEL _3hot #####",
    f"best n_neighbors: {best_n_neighbors_3hot}",
    f"best weigths: {best_weigths_3hot}",
    f"best distance: {best_distance_3hot}",
    f"Train Accuracy: {tr_mean_accuracy_3hot} +- {tr_std_accuracy_3hot}",
    f"Validation Accuracy: {vl_mean_accuracy_3hot} +- {vl_std_accuracy_3hot}",
]))



In [None]:
# Keep only the CV rows whose metric and weights equal those of the best model,
# so the remaining rows differ in n_neighbors only.
results_3hot = selecting_results(cv_results_3hot, best_distance_3hot, best_weigths_3hot)


In [None]:
# Mean validation accuracy as a function of k, for the best (weights, metric) pair
plt.scatter(
    results_3hot["param_n_neighbors"],
    results_3hot["mean_test_score"],
    marker='.',
    label="Validation Accuracy (mean value)",
)
plt.grid()
plt.xlabel("n neighbors")
plt.ylabel("Validation Accuracy")
title_str = f"MONK3, ENC - Weights = {best_weigths_3hot}, metric = {best_distance_3hot}"
plt.title(title_str)

# Red vertical line at the selected k
best_k_sting = f"best n_neighbors = {best_n_neighbors_3hot}"
plt.axvline(x=best_n_neighbors_3hot, color='red', label=best_k_sting)
plt.legend()

# title_str is reused as the output file name
title_str = "val_acc_vs_k_monk3_hot.pdf"
plt.savefig(title_str)

In [None]:
# Std of the validation accuracy as a function of k, for the best (weights, metric) pair.
# Fix: the plotted quantity is std_test_score, so label and y-axis now say so.
plt.scatter(results_3hot["param_n_neighbors"], results_3hot["std_test_score"], marker='.', label="Validation Accuracy (std)")
plt.grid()
plt.ylabel("std Validation Accuracy")
plt.xlabel("n neighbors")
title_str = "MONK3, ENC - Weights = " + best_weigths_3hot + ", " + "metric = " + best_distance_3hot
plt.title(title_str)
best_k_sting = "best n_neighbors = " + str(best_n_neighbors_3hot)
plt.axvline(x=best_model_3hot["param_n_neighbors"].iloc[0], color='red', label=best_k_sting)
# title_str is reused as the output file name
title_str = "val_acc_std_vs_k_monk3_hot" + ".pdf"
plt.legend()
plt.savefig(title_str)

# Model Assessment (Monk 3: one-hot encoding ON)

## Metrics and Confusion Matrix 

In [None]:
# Evaluate the refitted best estimator on the test set
y_pred_3hot = best_estimator_3hot.predict(X_test)
print(classification_report(y_test, y_pred_3hot))
best_estimator_3hot  # displayed as the cell output

In [None]:
# confusion_matrix puts true labels on the rows and predicted labels on the columns;
# sns.heatmap draws columns along x and rows along y, hence the axis labels below.
cf = confusion_matrix(y_test, y_pred_3hot)
# fmt="d": plain integer counts (the default format switches to scientific notation for counts >= 100)
sns.heatmap(cf, annot=True, fmt="d", cmap="Greens")
plt.xlabel("Predicted label")
plt.ylabel("True label")

# Test accuracy of the refitted best model next to its mean validation accuracy
accuracy_test_3hot = best_estimator_3hot.score(X_test, y_test)
print(f"Test Accuracy_3hot: {accuracy_test_3hot}")
print(f"Validation Accuracy_3hot: {vl_mean_accuracy_3hot}")

# Data Preprocessing
- One-hot encoding: OFF 

In [None]:
# Integer-coded features (no one-hot encoding) and int targets for MONK 3 training and test sets.
X_train, y_train = not_hot_encoding(monks3_train)
X_test, y_test = not_hot_encoding(monks3_test)

# Model Selection (Monk 3: one-hot encoding OFF)

In [None]:
# Candidate k values: 1 .. int(4/6 * n_train) - 1 (np.arange excludes the upper bound)
n_neighbors_v = np.arange(1, int(4 * len(X_train) / 6))

# The two custom callables are used as distance metrics on the integer-coded features
param_grid = dict(
    n_neighbors=n_neighbors_v,
    weights=["distance", "uniform"],
    metric=[distance_nominal, distance_ordinal],
)

# Exhaustive search, 5-fold stratified CV repeated 10 times with a fixed seed;
# the best configuration is refitted on the whole training set.
grid_3cold = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=RepeatedStratifiedKFold(n_repeats=10, n_splits=5, random_state=0),
    return_train_score=True,
    refit=True,
    n_jobs=-1,
).fit(X_train, y_train)

cv_results_3cold = pd.DataFrame(grid_3cold.cv_results_)
best_estimator_3cold = grid_3cold.best_estimator_

In [None]:
# Best model: the configuration ranked first by mean validation (VL) accuracy.
# The selection keeps every row with rank 1, so it may hold more than one row on ties.
best_model_3cold = cv_results_3cold[cv_results_3cold["rank_test_score"] == 1]
best_model_3cold  # displayed as the cell output

In [None]:
# Hyperparameters of the best model (first rank-1 row)
best_n_neighbors_3cold, best_weigths_3cold, best_distance_3cold = (
    best_model_3cold[col].iloc[0]
    for col in ("param_n_neighbors", "param_weights", "param_metric")
)

# Mean and std of the accuracy over the training folds (TR)
tr_mean_accuracy_3cold, tr_std_accuracy_3cold = (
    best_model_3cold[col].iloc[0] for col in ("mean_train_score", "std_train_score")
)

# Mean and std of the accuracy over the validation folds (VL)
vl_mean_accuracy_3cold, vl_std_accuracy_3cold = (
    best_model_3cold[col].iloc[0] for col in ("mean_test_score", "std_test_score")
)

print("\n".join([
    "##### BEST MODEL _3cold #####",
    f"best n_neighbors: {best_n_neighbors_3cold}",
    f"best weigths: {best_weigths_3cold}",
    f"best distance: {best_distance_3cold}",
    f"Train Accuracy: {tr_mean_accuracy_3cold} +- {tr_std_accuracy_3cold}",
    f"Validation Accuracy: {vl_mean_accuracy_3cold} +- {vl_std_accuracy_3cold}",
]))



In [None]:
# Keep only the CV rows whose metric and weights equal those of the best model,
# so the remaining rows differ in n_neighbors only.
results_3cold = selecting_results(cv_results_3cold, best_distance_3cold, best_weigths_3cold)

In [None]:
# Mean validation accuracy as a function of k, for the best (weights, metric) pair
plt.scatter(results_3cold["param_n_neighbors"], results_3cold["mean_test_score"], marker='.', label="Validation Accuracy (mean value)")
plt.grid()
plt.ylabel("Validation Accuracy")
plt.xlabel("n neighbors")
# Fix: take the metric name from the selected best metric instead of hard-coding it,
# so the title cannot disagree with the plotted rows.
metric_name = getattr(best_distance_3cold, "__name__", str(best_distance_3cold))
title_str = f"MONK3, NENC - Weights = {best_weigths_3cold}, metric = '{metric_name}'"
plt.title(title_str)
best_k_sting = "best n_neighbors = " + str(best_n_neighbors_3cold)
plt.axvline(x=best_model_3cold["param_n_neighbors"].iloc[0], color='red', label=best_k_sting)
# title_str is reused as the output file name
title_str = "val_acc_vs_k_monk3_cold" + ".pdf"
plt.legend()
plt.savefig(title_str)

In [None]:
# Std of the validation accuracy as a function of k, for the best (weights, metric) pair.
# Fix: the plotted quantity is std_test_score, so label and y-axis now say so.
plt.scatter(results_3cold["param_n_neighbors"], results_3cold["std_test_score"], marker='.', label="Validation Accuracy (std)")
plt.grid()
plt.ylabel("std Validation Accuracy")
plt.xlabel("n neighbors")
# Fix: metric name taken from the selected best metric instead of being hard-coded
metric_name = getattr(best_distance_3cold, "__name__", str(best_distance_3cold))
title_str = f"MONK3, NENC - Weights = {best_weigths_3cold}, metric = '{metric_name}'"
plt.title(title_str)
best_k_sting = "best n_neighbors = " + str(best_n_neighbors_3cold)
plt.axvline(x=best_model_3cold["param_n_neighbors"].iloc[0], color='red', label=best_k_sting)
# title_str is reused as the output file name
title_str = "val_acc_std_vs_k_monk3_cold" + ".pdf"
plt.legend()
plt.savefig(title_str)

# Model Assessment (Monk 3: one-hot encoding OFF)

## Metrics and Confusion Matrix 

In [None]:
# Evaluate the refitted best estimator on the test set
y_pred_3cold = best_estimator_3cold.predict(X_test)
print(classification_report(y_test, y_pred_3cold))
best_estimator_3cold  # displayed as the cell output

In [None]:
# confusion_matrix puts true labels on the rows and predicted labels on the columns;
# sns.heatmap draws columns along x and rows along y, hence the axis labels below.
cf = confusion_matrix(y_test, y_pred_3cold)
# fmt="d": plain integer counts (the default format switches to scientific notation for counts >= 100)
sns.heatmap(cf, annot=True, fmt="d", cmap="Greens")
plt.xlabel("Predicted label")
plt.ylabel("True label")

# Test accuracy of the refitted best model next to its mean validation accuracy.
# The "_3cold" suffix matches the messages printed by the other assessment cells.
accuracy_test_3cold = best_estimator_3cold.score(X_test, y_test)
print(f"Test Accuracy_3cold: {accuracy_test_3cold}")
print(f"Validation Accuracy_3cold: {vl_mean_accuracy_3cold}")