In [None]:
from basic_fixture import *
from util import evaluate_classifier

import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import chi2, SelectKBest, f_classif

# Load the features and labels, then hold out 30% of the rows as a test split.
# The fixed random_state keeps the split identical from one run to the next.
data, target = Dataset().get_data()
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=4,
)


### Feature selection
Let's test which inputs have the most impact on the outcome for a few values of K

In [None]:
desired_features_cnt = 5

# Score every input against the target with the ANOVA F-test, on the training split only.
ft = SelectKBest(f_classif, k=desired_features_cnt).fit(x_train, y_train)
ft_np = np.array(ft.scores_)

# Positions of the highest-scoring inputs; argpartition returns them in no particular order.
max_elements_ind = np.argpartition(ft_np, -desired_features_cnt)[-desired_features_cnt:]
top = x_train.columns[max_elements_ind]

# Re-order those positions from highest to lowest score before displaying the column names.
descending_order = np.argsort(ft_np[max_elements_ind])[::-1]
ranked_ind = max_elements_ind[descending_order]
print(f'most important inputs : {x_train.columns[ranked_ind]}')

### Finding a good range for K
By testing a range of values for K with a default KNN classifier, let's find the range we want to use for the tuning

In [None]:
# Preprocessing
# Keep only the columns picked by the feature selector fitted in the previous cell.
X_train_2 = ft.transform(x_train)
X_test_2 = ft.transform(x_test)

# Fit the scaler on the training split only and reuse it for the test split.
# Fitting a second scaler on the test split standardises the two splits with
# different means/variances, so test points would no longer live in the same
# coordinate system as the training neighbours they are compared against.
scaler = preprocessing.StandardScaler().fit(X_train_2.astype(float))
X_train = scaler.transform(X_train_2.astype(float))
X_test = scaler.transform(X_test_2.astype(float))

# Accuracy of a default KNN classifier for every K in 1..n_range.
# NOTE(review): candidates are ranked by accuracy on the held-out split, which is
# then no longer an unbiased estimate for these K values - confirm this is acceptable.
n_range = 30
mean_acc = np.zeros(n_range)
for i in range(1, n_range + 1):
    # Train Model and Predict
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    yhat = knn.predict(X_test)
    mean_acc[i - 1] = metrics.accuracy_score(y_test, yhat)

k_range_cnt = 10
# mean_acc[j] stores the accuracy for K = j + 1, so the positions returned by
# argpartition have to be shifted by one to become neighbour counts. Without the
# shift every candidate is off by one and the list can contain 0, which is not
# a valid n_neighbors value.
k_range = np.argpartition(mean_acc, -k_range_cnt)[-k_range_cnt:] + 1
#print(k_range)

# Optional diagnostic plot of accuracy against K (disabled). It is kept as comments
# rather than a bare string literal, which the notebook would echo as the cell output.
# loc = np.arange(1,n_range + 1,step=1.0)
# plt.figure(figsize = (10, 6))
# plt.plot(range(1,n_range + 1), mean_acc)
# plt.xticks(loc)
# plt.xlabel('Number of Neighbors ')
# plt.ylabel('Accuracy')
# plt.show()


# Hyperparameters tuning

### 1. weights and metrics

In [None]:

# Search 1: neighbour count, vote weighting and distance metric (3-fold CV, all cores).
grid_params = {
    'n_neighbors': k_range,
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
}
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
g_res = gs.fit(X_train, y_train)

# Keep the winning combination for the evaluation cell.
tuned_best_param_1 = g_res.best_params_
print(f'best score : {g_res.best_score_}')
print(f'best params : {tuned_best_param_1}')

### 2. p and leaf size


In [None]:
# Search 2: neighbour count and leaf size (3-fold CV, all cores).
# The Minkowski power 'p' : [1, 2] is currently commented out of this grid.
grid_params = {
    'n_neighbors': k_range,
    'leaf_size': list(range(1, 20)),
}
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
g_res = gs.fit(X_train, y_train)

# Keep the winning combination for the evaluation cell.
tuned_best_param_2 = g_res.best_params_
print(f'best score : {g_res.best_score_}')
print(f'best params : {tuned_best_param_2}')

### 3. Everything all at once

In [None]:
# Search 3: neighbour count, weighting, metric, search algorithm and Minkowski power
# (5-fold CV, all cores). 'leaf_size' : list(range(1,50)) is currently left out.
grid_params = {
    'n_neighbors': k_range,
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
}
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
g_res = gs.fit(X_train, y_train)

# Keep the winning combination for the evaluation cell.
tuned_best_param_3 = g_res.best_params_
print(f'best score : {g_res.best_score_}')
print(f'best params : {tuned_best_param_3}')

### 4. Everything including leaf size, 5. only n_neighbors

In [None]:
# Search 4: neighbour count, weighting, metric, Minkowski power and leaf size
# (3-fold CV, all cores).
grid_params = {
    'n_neighbors': k_range,
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
    'p': [1, 2],
    'leaf_size': list(range(1, 20)),
}
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
g_res = gs.fit(X_train, y_train)

tuned_best_param_4 = g_res.best_params_
print(f'best score : {g_res.best_score_}')
print(f'best params : {tuned_best_param_4}')

# Search 5: only the neighbour count, restricted to the two fixed values 5 and 6.
grid_params = {'n_neighbors': [5, 6]}
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
g_res = gs.fit(X_train, y_train)

tuned_best_param_5 = g_res.best_params_
print(f'best score : {g_res.best_score_}')
print(f'best params : {tuned_best_param_5}')

### evaluation

In [None]:
# Baseline: a KNN classifier with default hyperparameters.
# NOTE(review): evaluate_classifier receives the raw data/target, while the grid
# searches ran on feature-selected, scaled inputs - confirm the helper applies
# comparable preprocessing.
control = evaluate_classifier(KNeighborsClassifier(), data, target)

# One evaluation per tuned parameter set, in the same order as tuned_params.
tuned_params = [
    tuned_best_param_1,
    tuned_best_param_2,
    tuned_best_param_3,
    tuned_best_param_4,
    tuned_best_param_5,
]
tuned = []
for best_params in tuned_params:
    tuned_knn = KNeighborsClassifier(**best_params)
    tuned.append(evaluate_classifier(tuned_knn, data, target))

### Performance comparisons

In [None]:
# Baseline first, then each tuned set alongside the hyperparameters that produced it.
print("Control")
print(f'train_time : {control["train_time"]}, performance : {control["performance"]}')

for i, (t, hyperparams) in enumerate(zip(tuned, tuned_params)):
    print(f'\n set{i}')
    print(f'train_time : {t["train_time"]}, performance : {t["performance"]}')
    print(f'hyperparams : {hyperparams}')

### reports

In [None]:
# Full classification report for the baseline, followed by one per tuned set.
print("Control")
print(f'train_time : {control["train_time"]}, classification report : \n{control["classification_report"]}')

for i, t in enumerate(tuned):
    elapsed = t["train_time"]
    report = t["classification_report"]
    print(f'\n set{i}')
    print(f'train_time : {elapsed}, classification report : \n{report}')

## Conclusions
- Tuning the leaf size does nothing for performance but does affect the training speed
- The hyperparameters that truly matter are: n_neighbors, metric, weights