In [8]:
from basic_fixture import *
from util import *

import numpy as np

from sklearn import metrics
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif

# Load the features and labels, then hold out 30% of the rows for evaluation.
data, target = Dataset().get_data()
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=4,  # fixed seed so the split is the same on every run
)


### Feature selection
Let's test which inputs have the most impact on the outcome for a few values of K

In [9]:
# Number of input columns to keep after univariate feature selection.
desired_features_cnt = 5

# Score every column against the target, fitting on the training split only.
selector = SelectKBest(f_classif, k=desired_features_cnt)
ft = selector.fit(x_train, y_train)
ft_np = np.array(ft.scores_)

# argpartition places the positions of the largest scores at the tail of the
# index array, in no particular order.
max_elements_ind = np.argpartition(ft_np, -desired_features_cnt)[-desired_features_cnt:]
top = x_train.columns[max_elements_ind]

# Rank the selected columns from highest to lowest score before printing.
descending_order = np.argsort(ft_np[max_elements_ind])[::-1]
ranked_columns = x_train.columns[max_elements_ind[descending_order]]
print(f'most important inputs : {ranked_columns}')

most important inputs : Index(['Unités curriculaires 2e semestre (approuvées)',
       'Unités curriculaires 2e semestre (note)',
       'Unités curriculaires 1er semestre (approuvées)',
       'Unités curriculaires 1er semestre (note)',
       'Frais de scolarité à jour'],
      dtype='object')


### Finding a good range for K
By testing several values of K with a default KNN classifier, let's find the range we want to use for the tuning.

In [10]:
# Preprocessing: keep only the columns chosen by the fitted selector `ft`.
X_train_2 = ft.transform(x_train)
X_test_2 = ft.transform(x_test)

# Fit the scaler on the training split only and reuse its mean/variance for
# the test split. Scaling the test data with its own statistics would put the
# two splits on different scales.
scaler = preprocessing.StandardScaler().fit(X_train_2.astype(float))
X_train = scaler.transform(X_train_2.astype(float))
X_test = scaler.transform(X_test_2.astype(float))

# Accuracy of a default KNN classifier for K = 1..n_range.
# mean_acc[K - 1] holds the score obtained with K neighbours.
n_range = 30
mean_acc = np.zeros(n_range)
for i in range(1, n_range + 1):
    # Train model and predict
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    yhat = knn.predict(X_test)
    mean_acc[i - 1] = metrics.accuracy_score(y_test, yhat)

# NOTE(review): the shortlist below is chosen from test-set accuracy, so any
# score later reported on the same test split is optimistic. Consider
# cross-validating on the training split instead.
k_range_cnt = 10
# argpartition returns 0-based positions in mean_acc; add 1 to turn them back
# into K values. Without the shift every candidate is off by one and the list
# can contain 0, which is not a valid n_neighbors.
k_range = np.argpartition(mean_acc, -k_range_cnt)[-k_range_cnt:] + 1
#print(k_range)

# Disabled accuracy-vs-K plot, kept as a bare string literal; being the last
# expression of the cell, it is echoed as the cell output.
"""
loc = np.arange(1,n_range + 1,step=1.0)
plt.figure(figsize = (10, 6))
plt.plot(range(1,n_range + 1), mean_acc)
plt.xticks(loc)
plt.xlabel('Number of Neighbors ')
plt.ylabel('Accuracy')
plt.show()
"""


"\nloc = np.arange(1,n_range + 1,step=1.0)\nplt.figure(figsize = (10, 6))\nplt.plot(range(1,n_range + 1), mean_acc)\nplt.xticks(loc)\nplt.xlabel('Number of Neighbors ')\nplt.ylabel('Accuracy')\nplt.show()\n"

# Hyperparameters tuning

### 1. All hyperparameters

In [11]:

# Search space for the exhaustive grid search over the KNN hyperparameters.
# NOTE(review): 'euclidean' and 'manhattan' appear to duplicate 'minkowski'
# with p=2 and p=1, so part of this grid is likely redundant - confirm against
# the KNeighborsClassifier documentation before trimming it.
grid_params = {
    'n_neighbors': k_range,
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
}

# 3-fold cross-validation on the scaled training data, using every available core.
gs = GridSearchCV(
    KNeighborsClassifier(),
    grid_params,
    verbose=1,
    cv=3,
    n_jobs=-1,
)
g_res = gs.fit(X_train, y_train)

# Keep the winning configuration so later cells can rebuild the classifier.
tuned_best_param_1 = g_res.best_params_
print(f'best score : {g_res.best_score_}')
print(f'best params : {tuned_best_param_1}')

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
best score : 0.889413616686344
best params : {'algorithm': 'ball_tree', 'metric': 'minkowski', 'n_neighbors': 15, 'p': 1, 'weights': 'uniform'}


### Performance comparisons

In [12]:
# Baseline with library defaults versus the configuration found by the grid search.
control_clf = KNeighborsClassifier()
optimized_clf = KNeighborsClassifier(**tuned_best_param_1)
tuned_clf = [control_clf, optimized_clf]
names = ['control', 'with optimized hyperparameters']

# NOTE(review): the helper receives the raw `data`/`target`, whereas the grid
# search ran on the selected and scaled features - confirm that
# compare_classifiers applies equivalent preprocessing.
compare_classifiers(tuned_clf, data, target, clf_names=names)


control
KNeighborsClassifier with params {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
train_time: 0.07249641418457031, classification report: 
              precision    recall  f1-score   support

           0       0.88      0.65      0.75       435
           1       0.80      0.94      0.86       654

    accuracy                           0.82      1089
   macro avg       0.84      0.79      0.81      1089
weighted avg       0.83      0.82      0.82      1089

performance: 0.7947063165664874

with optimized hyperparameters
KNeighborsClassifier with params {'algorithm': 'ball_tree', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 15, 'p': 1, 'weights': 'uniform'}
train_time: 0.15900397300720215, classification report: 
              precision    recall  f1-score   support

           0       0.95      0.63      0.76       435
       

## Observations
- Tuning the leaf size does nothing for performance; the same goes for `p`.
- Increasing K and the number of columns helps every version except the one where we tune all hyperparameters.
- With 10 columns, every tuning has the same score of 84.20%.
- The grid search almost always selects `minkowski` as the metric.