In [1]:
import numpy as np
from load_dataset import load_data
import utils

from sklearn import metrics
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif

## Instancier le jeu de données

In [2]:
X, y, cat_features = load_data().get_data_X_y(data='simplify')

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_cpy = X_train.copy()

### Feature selection
Let's test what inputs has the most impact on the outcome for a few values of K

In [4]:
desired_features_cnt = 5

ft = SelectKBest(f_classif, k=desired_features_cnt).fit(X_train, y_train)
ft_np = np.array(ft.scores_)
max_elements_ind = np.argpartition(ft_np, -desired_features_cnt)[-desired_features_cnt:]
top = X_train.columns[max_elements_ind]

print(f'most important inputs : {X_train.columns[max_elements_ind[np.argsort(ft_np[max_elements_ind])[::-1]]]}')

most important inputs : Index(['Unités curriculaires 2e semestre (approuvées)',
       'Unités curriculaires 2e semestre (note)',
       'Unités curriculaires 1er semestre (approuvées)',
       'Unités curriculaires 1er semestre (note)',
       'Frais de scolarité à jour'],
      dtype='object')


### Finding a good range for K
Testing a couple of values for K with a default KNN classifier, let's find the range we want for the tuning

In [5]:
# Preprocessing
X_train_2 = ft.transform(X_train)
X_test_2 = ft.transform(X_test)
X_train = preprocessing.StandardScaler().fit(X_train_2).transform(X_train_2.astype(float))
X_test = preprocessing.StandardScaler().fit(X_test_2).transform(X_test_2.astype(float))

n_range = 30
mean_acc = np.zeros(n_range)
for i in range(1,n_range + 1):
    #Train Model and Predict  
    knn = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    yhat= knn.predict(X_test)
    mean_acc[i-1] = metrics.accuracy_score(y_test, yhat)
    
k_range_cnt = 10
k_range = np.argpartition(mean_acc, -k_range_cnt)[-k_range_cnt:]
#print(k_range)

"""
loc = np.arange(1,n_range + 1,step=1.0)
plt.figure(figsize = (10, 6))
plt.plot(range(1,n_range + 1), mean_acc)
plt.xticks(loc)
plt.xlabel('Number of Neighbors ')
plt.ylabel('Accuracy')
plt.show()
"""


"\nloc = np.arange(1,n_range + 1,step=1.0)\nplt.figure(figsize = (10, 6))\nplt.plot(range(1,n_range + 1), mean_acc)\nplt.xticks(loc)\nplt.xlabel('Number of Neighbors ')\nplt.ylabel('Accuracy')\nplt.show()\n"

# Hyperparameters tuning

### 1. every hyperparameters

In [6]:

grid_params = { 
               'n_neighbors' : k_range,
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan'],
               'algorithm' : ['ball_tree', 'kd_tree', 'brute'],
               'p' : [1,2],
               }
gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=5, n_jobs = -1)
g_res = gs.fit(X_train, y_train)

tuned_best_param_1 = g_res.best_params_
print(f'best score : {g_res.best_score_}')
print(f'best params : {tuned_best_param_1}')

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
best score : 0.8878339495382331
best params : {'algorithm': 'ball_tree', 'metric': 'minkowski', 'n_neighbors': 14, 'p': 1, 'weights': 'uniform'}


### Performance comparisons

In [7]:
tuned_clf = [KNeighborsClassifier(), KNeighborsClassifier(**tuned_best_param_1)]
names = ['control', 'with optimized hyperparameters']

utils.compare_classifiers(tuned_clf, X, y, clf_names=names)


control
KNeighborsClassifier with params {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
train_time: 0.2749974727630615, classification report: 
              precision    recall  f1-score   support

           0       0.89      0.69      0.78       435
           1       0.82      0.94      0.88       654

    accuracy                           0.84      1089
   macro avg       0.85      0.82      0.83      1089
weighted avg       0.85      0.84      0.84      1089

performance: 0.8173099230201413

with optimized hyperparameters
KNeighborsClassifier with params {'algorithm': 'ball_tree', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 14, 'p': 1, 'weights': 'uniform'}
train_time: 0.1634986400604248, classification report: 
              precision    recall  f1-score   support

           0       0.93      0.71      0.80       435
         

In [11]:
#utils.features_importance(KNeighborsClassifier(**tuned_best_param_1), X_train_cpy.columns, plot=False, log=True)

AttributeError: 'KNeighborsClassifier' object has no attribute 'feature_importances_'

## Observations
- Tuning the leaf size does nothing for performance. Same for P
- Increasing the sizes of K and the number of columns helps every versions but the one where we tune every hyperparameters
- with 10 columns every tunings have the same score of 84.20%
- The chosen metric seems to pretty much always choose minkowsky