In [7]:
from basic_fixture import *
from util import evaluate_classifier

import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import chi2, SelectKBest, f_classif

data, target = Dataset().get_data()
x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.3, random_state=4)


### Feature selection
Let's test what inputs has the most impact on the outcome for a few values of K

In [8]:
desired_features_cnt = 5

ft = SelectKBest(f_classif, k=desired_features_cnt).fit(x_train, y_train)
ft_np = np.array(ft.scores_)
max_elements_ind = np.argpartition(ft_np, -desired_features_cnt)[-desired_features_cnt:]
top = x_train.columns[max_elements_ind]

print(f'most important inputs : {x_train.columns[max_elements_ind[np.argsort(ft_np[max_elements_ind])[::-1]]]}')

most important inputs : Index(['Unités curriculaires 2e semestre (approuvées)',
       'Unités curriculaires 2e semestre (note)',
       'Unités curriculaires 1er semestre (approuvées)',
       'Unités curriculaires 1er semestre (note)',
       'Frais de scolarité à jour'],
      dtype='object')


### Finding a good range for K
Testing a couple of values for K with a default KNN classifier, let's find the range we want for the tuning

In [9]:
# Preprocessing
X_train_2 = ft.transform(x_train)
X_test_2 = ft.transform(x_test)
X_train = preprocessing.StandardScaler().fit(X_train_2).transform(X_train_2.astype(float))
X_test = preprocessing.StandardScaler().fit(X_test_2).transform(X_test_2.astype(float))

n_range = 30
mean_acc = np.zeros(n_range)
for i in range(1,n_range + 1):
    #Train Model and Predict  
    knn = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    yhat= knn.predict(X_test)
    mean_acc[i-1] = metrics.accuracy_score(y_test, yhat)
    
k_range_cnt = 10
k_range = np.argpartition(mean_acc, -k_range_cnt)[-k_range_cnt:]
#print(k_range)

"""
loc = np.arange(1,n_range + 1,step=1.0)
plt.figure(figsize = (10, 6))
plt.plot(range(1,n_range + 1), mean_acc)
plt.xticks(loc)
plt.xlabel('Number of Neighbors ')
plt.ylabel('Accuracy')
plt.show()
"""


"\nloc = np.arange(1,n_range + 1,step=1.0)\nplt.figure(figsize = (10, 6))\nplt.plot(range(1,n_range + 1), mean_acc)\nplt.xticks(loc)\nplt.xlabel('Number of Neighbors ')\nplt.ylabel('Accuracy')\nplt.show()\n"

# Hyperparameters tuning

### 1. weights and metrics

In [10]:

grid_params = { 'n_neighbors' : k_range,
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)
g_res = gs.fit(X_train, y_train)

tuned_best_param_1 = g_res.best_params_
print(f'best score : {g_res.best_score_}')
print(f'best params : {tuned_best_param_1}')

Fitting 3 folds for each of 60 candidates, totalling 180 fits
best score : 0.889413616686344
best params : {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'uniform'}


### 2. p and leaf size


In [11]:
grid_params = { 'n_neighbors' : k_range,
               'leaf_size' : list(range(1,20))#,
               #'p' : [1, 2]
               }
gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)
g_res = gs.fit(X_train, y_train)

tuned_best_param_2 = g_res.best_params_
print(f'best score : {g_res.best_score_}')
print(f'best params : {tuned_best_param_2}')

Fitting 3 folds for each of 190 candidates, totalling 570 fits
best score : 0.88823297914207
best params : {'leaf_size': 1, 'n_neighbors': 28}


### 3. Everything all at once

In [12]:
grid_params = { 'n_neighbors' : k_range,
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan'],
               'algorithm' : ['ball_tree', 'kd_tree', 'brute'],
               'p' : [1,2],
               'leaf_size' : list(range(1,50)),
               }
gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=5, n_jobs = -1)
g_res = gs.fit(X_train, y_train)

tuned_best_param_3 = g_res.best_params_
print(f'best score : {g_res.best_score_}')
print(f'best params : {tuned_best_param_3}')

Fitting 5 folds for each of 17640 candidates, totalling 88200 fits
best score : 0.8894164874773756
best params : {'algorithm': 'ball_tree', 'leaf_size': 1, 'metric': 'minkowski', 'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}


### 4. only n_neighbors, 5. diff n_neighbors

In [13]:
grid_params = { 'n_neighbors' : k_range,
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan'],
               'p' : [1,2],
               'leaf_size' : list(range(1,20)),}
gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)
g_res = gs.fit(X_train, y_train)

tuned_best_param_4 = g_res.best_params_
print(f'best score : {g_res.best_score_}')
print(f'best params : {tuned_best_param_4}')

grid_params = { 'n_neighbors' : [5,6]}
gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)
g_res = gs.fit(X_train, y_train)

tuned_best_param_5 = g_res.best_params_
print(f'best score : {g_res.best_score_}')
print(f'best params : {tuned_best_param_5}')

Fitting 3 folds for each of 2280 candidates, totalling 6840 fits
best score : 0.889413616686344
best params : {'leaf_size': 1, 'metric': 'minkowski', 'n_neighbors': 15, 'p': 1, 'weights': 'uniform'}
Fitting 3 folds for each of 2 candidates, totalling 6 fits
best score : 0.8823297914207006
best params : {'n_neighbors': 5}


### evaluation

In [18]:
control = evaluate_classifier(KNeighborsClassifier(), data, target)

tuned_params = [tuned_best_param_1, tuned_best_param_2, tuned_best_param_3, tuned_best_param_4, tuned_best_param_5]
tuned = [evaluate_classifier(KNeighborsClassifier(**params), data, target) for params in tuned_params]

### Performance comparisons

In [19]:
print("Control")
print(f'train_time : {control["train_time"]}, performance : {control["performance"]}')

for i, t in enumerate(tuned):
    print(f'\n set{i}')
    print(f'train_time : {t["train_time"]}, performance : {t["performance"]}')
    print(f'hyperparams : {tuned_params[i]}')

Control
train_time : 0.09649777412414551, performance : 0.7947063165664874

 set0
train_time : 0.08600068092346191, performance : 0.805008963408204
hyperparams : {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'uniform'}

 set1
train_time : 0.07950091361999512, performance : 0.7874143203627544
hyperparams : {'leaf_size': 1, 'n_neighbors': 28}

 set2
train_time : 0.19950461387634277, performance : 0.8119213329115259
hyperparams : {'algorithm': 'ball_tree', 'leaf_size': 1, 'metric': 'minkowski', 'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}

 set3
train_time : 0.08499813079833984, performance : 0.805008963408204
hyperparams : {'leaf_size': 1, 'metric': 'minkowski', 'n_neighbors': 15, 'p': 1, 'weights': 'uniform'}

 set4
train_time : 0.07300209999084473, performance : 0.7947063165664874
hyperparams : {'n_neighbors': 5}


### reports

In [16]:
print("Control")
print(f'train_time : {control["train_time"]}, classification report : \n{control["classification_report"]}')

for i, t in enumerate(tuned):
    print(f'\n set{i}')
    print(f'train_time : {t["train_time"]}, classification report : \n{t["classification_report"]}')

Control
train_time : 0.2200002670288086, classification report : 
              precision    recall  f1-score   support

           0       0.88      0.65      0.75       435
           1       0.80      0.94      0.86       654

    accuracy                           0.82      1089
   macro avg       0.84      0.79      0.81      1089
weighted avg       0.83      0.82      0.82      1089


 set0
train_time : 0.08550071716308594, classification report : 
              precision    recall  f1-score   support

           0       0.95      0.63      0.76       435
           1       0.80      0.98      0.88       654

    accuracy                           0.84      1089
   macro avg       0.87      0.81      0.82      1089
weighted avg       0.86      0.84      0.83      1089


 set1
train_time : 0.07249832153320312, classification report : 
              precision    recall  f1-score   support

           0       0.89      0.63      0.74       435
           1       0.79      0.95      

## Conclusions
- Tuning the leaf size does nothing for performance but does affect the training speed
- The hyperparameters that truly matters are : n_neighbors, metric, weights