In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import svm
from sklearn.cluster import KMeans
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)

# Préparation dataset

In [2]:
# Load the merged 2018-2021 dataset used for modelling
data_path = 'data/merged_data_2018_2021_for_model.csv'
df = pd.read_csv(data_path)
df.shape

(423136, 34)

In [3]:
# One-hot encode the categorical (object-dtype) columns
cat_var = df.select_dtypes(include='object').columns
encoded_cat = pd.get_dummies(
    df[cat_var],
    prefix=cat_var,
    drop_first=True,  # the first level of each variable is dropped
).astype(int)
# Swap the raw categorical columns for their dummy-encoded counterparts
df = pd.concat([df.drop(columns=cat_var), encoded_cat], axis=1)
print('Taille de dataset apres onehotconding',df.shape)


Taille de dataset apres onehotconding (423136, 51)


In [4]:
# Separate the target column ('grav') from the explanatory variables
y = df['grav']
X = df.drop(columns=['grav'])

In [5]:
# Hold out 10% of the rows as a test set, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=101,
    stratify=y,
)

In [6]:
# Standardise the features; the scaler is fitted on the training split only
# and then reused as-is on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Création variable cible binaire

In [7]:
# Build the binary target: classes 1 and 2 -> 0, classes 3 and 4 -> 1
def _to_binary(target):
    """Collapse the four 'grav' classes into two (1/2 -> 0, then 3/4 -> 1)."""
    return target.replace([1, 2], 0).replace([3, 4], 1)


y_bin = _to_binary(y)
y_train_bin = _to_binary(y_train)
y_test_bin = _to_binary(y_test)

# Oversampling

In [8]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the scaled training split so the four classes are balanced
ros = RandomOverSampler(random_state=101)
X_train_overresampled, y_train_overresampled = ros.fit_resample(
    X_train_scaled,
    y_train,
)

# Dummy classifier 4 classes

In [10]:
# Baseline for comparison: always predict the most frequent training class
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train_overresampled, y_train_overresampled)
y_pred_dum = dummy_clf.predict(X_test_scaled)
# The baseline predicts a single class, so precision is undefined for the
# others; zero_division=0 reports 0 for them without emitting warnings.
print(classification_report(y_test, y_pred_dum, zero_division=0))

              precision    recall  f1-score   support

         1.0       0.42      1.00      0.60     17964
         2.0       0.00      0.00      0.00      1096
         3.0       0.00      0.00      0.00      6667
         4.0       0.00      0.00      0.00     16587

    accuracy                           0.42     42314
   macro avg       0.11      0.25      0.15     42314
weighted avg       0.18      0.42      0.25     42314



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# KNN oversampling 4 classes

In [9]:
# Reload a saved KNN model from disk
# (file name suggests 4 classes, euclidean metric, oversampled data — confirm)
model = load('knn_4_clusters_euclidean_oversampling.joblib')

# Score the train split first, then the test split; each report is saved and printed
for report_path, features, labels in (
    ('res/knn_4classes_oversamppling_euclidean_train.csv', X_train_overresampled, y_train_overresampled),
    ('res/knn_4classes_oversamppling_euclidean_test.csv', X_test_scaled, y_test),
):
    y_pred = model.predict(features)
    report_dict = classification_report(labels, y_pred, output_dict=True)
    clsf_report = pd.DataFrame(report_dict).transpose()
    clsf_report.to_csv(report_path, index=True)
    print(clsf_report)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score        support
1.0            0.754566  0.845537  0.797466  161670.000000
2.0            0.938768  1.000000  0.968417  161670.000000
3.0            0.756587  0.937836  0.837518  161670.000000
4.0            0.862977  0.495911  0.629868  161670.000000
accuracy       0.819821  0.819821  0.819821       0.819821
macro avg      0.828225  0.819821  0.808317  646680.000000
weighted avg   0.828225  0.819821  0.808317  646680.000000


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score       support
1.0            0.649134  0.719996  0.682731  17964.000000
2.0            0.110308  0.208942  0.144388   1096.000000
3.0            0.301398  0.511024  0.379166   6667.000000
4.0            0.578644  0.314282  0.407329  16587.000000
accuracy       0.514794  0.514794  0.514794      0.514794
macro avg      0.409871  0.438561  0.403404  42314.000000
weighted avg   0.552756  0.514794  0.513001  42314.000000


In [12]:
# 4-class KNN (k=4) with the 'minkowski' metric, trained on the oversampled data
# NOTE(review): the test scores printed for this run match the euclidean run
# exactly — presumably because p is left at its default; confirm before
# treating this as a separate experiment.
model = KNeighborsClassifier(n_neighbors=4, metric='minkowski')
model.fit(X_train_overresampled, y_train_overresampled)

# Score the train split first, then the test split; each report is saved and printed
for report_path, features, labels in (
    ('res/knn_4classes_oversamppling_minkowski_train.csv', X_train_overresampled, y_train_overresampled),
    ('res/knn_4classes_oversamppling_minkowski_test.csv', X_test_scaled, y_test),
):
    y_pred = model.predict(features)
    report_dict = classification_report(labels, y_pred, output_dict=True)
    clsf_report = pd.DataFrame(report_dict).transpose()
    clsf_report.to_csv(report_path, index=True)
    print(clsf_report)

# Persist the fitted model
dump(model, 'model/knn_4classes_oversamppling_minkowski.joblib')


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score        support
1.0            0.754566  0.845537  0.797466  161670.000000
2.0            0.938779  1.000000  0.968423  161670.000000
3.0            0.756593  0.937849  0.837526  161670.000000
4.0            0.862978  0.495918  0.629873  161670.000000
accuracy       0.819826  0.819826  0.819826       0.819826
macro avg      0.828229  0.819826  0.808322  646680.000000
weighted avg   0.828229  0.819826  0.808322  646680.000000


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score       support
1.0            0.649134  0.719996  0.682731  17964.000000
2.0            0.110308  0.208942  0.144388   1096.000000
3.0            0.301398  0.511024  0.379166   6667.000000
4.0            0.578644  0.314282  0.407329  16587.000000
accuracy       0.514794  0.514794  0.514794      0.514794
macro avg      0.409871  0.438561  0.403404  42314.000000
weighted avg   0.552756  0.514794  0.513001  42314.000000


['model/knn_4classes_oversamppling_minkowski.joblib']

In [13]:
# 4-class KNN (k=4) with the 'manhattan' metric, trained on the oversampled data
model = KNeighborsClassifier(n_neighbors=4, metric='manhattan')
model.fit(X_train_overresampled, y_train_overresampled)

# Score the train split first, then the test split; each report is saved and printed
for report_path, features, labels in (
    ('res/knn_4classes_oversamppling_manhattan_train.csv', X_train_overresampled, y_train_overresampled),
    ('res/knn_4classes_oversamppling_manhattan_test.csv', X_test_scaled, y_test),
):
    y_pred = model.predict(features)
    report_dict = classification_report(labels, y_pred, output_dict=True)
    clsf_report = pd.DataFrame(report_dict).transpose()
    clsf_report.to_csv(report_path, index=True)
    print(clsf_report)

# Persist the fitted model
dump(model, 'model/knn_4classes_oversamppling_manhattan.joblib')

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score        support
1.0            0.766196  0.865052  0.812629  161670.000000
2.0            0.942149  1.000000  0.970213  161670.000000
3.0            0.767005  0.940403  0.844899  161670.000000
4.0            0.876387  0.511375  0.645878  161670.000000
accuracy       0.829208  0.829208  0.829208       0.829208
macro avg      0.837934  0.829208  0.818405  646680.000000
weighted avg   0.837934  0.829208  0.818405  646680.000000


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score       support
1.0            0.667970  0.751002  0.707057  17964.000000
2.0            0.112872  0.214416  0.147892   1096.000000
3.0            0.319101  0.525724  0.397145   6667.000000
4.0            0.607889  0.331706  0.429207  16587.000000
accuracy       0.537245  0.537245  0.537245      0.537245
macro avg      0.426958  0.455712  0.420325  42314.000000
weighted avg   0.575073  0.537245  0.534827  42314.000000


['model/knn_4classes_oversamppling_manhattan.joblib']

In [None]:
# 4-class KNN (k=4) with the 'chebyshev' metric, trained on the oversampled data
model =  KNeighborsClassifier(n_neighbors=4, metric='chebyshev')
model.fit(X_train_overresampled, y_train_overresampled)

# Train score: report saved to CSV and printed
y_pred = model.predict(X_train_overresampled)
clsf_report = pd.DataFrame(classification_report(y_train_overresampled, y_pred, output_dict=True)).transpose()
clsf_report.to_csv('res/knn_4classes_oversamppling_chebyshev_train.csv', index= True)
print(clsf_report)

# Test score: report saved to CSV and printed
y_pred= model.predict(X_test_scaled)
clsf_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
clsf_report.to_csv('res/knn_4classes_oversamppling_chebyshev_test.csv', index= True)
print(clsf_report)

# Persist the fitted model under its own name. The previous target was the
# manhattan file, which silently overwrote the manhattan model.
dump(model, 'model/knn_4classes_oversamppling_chebyshev.joblib')

In [14]:
# Re-read the saved chebyshev reports (train, then test) and print them
for report_path in (
    'res/knn_4classes_oversamppling_chebyshev_train.csv',
    'res/knn_4classes_oversamppling_chebyshev_test.csv',
):
    print(pd.read_csv(report_path))

     Unnamed: 0  precision    recall  f1-score        support
0           1.0   0.741662  0.780720  0.760690  161670.000000
1           2.0   0.906258  1.000000  0.950824  161670.000000
2           3.0   0.737913  0.931558  0.823505  161670.000000
3           4.0   0.819886  0.476743  0.602909  161670.000000
4      accuracy   0.797255  0.797255  0.797255       0.797255
5     macro avg   0.801429  0.797255  0.784482  646680.000000
6  weighted avg   0.801429  0.797255  0.784482  646680.000000
     Unnamed: 0  precision    recall  f1-score       support
0           1.0   0.603890  0.617067  0.610407  17964.000000
1           2.0   0.071199  0.202555  0.105363   1096.000000
2           3.0   0.258070  0.446078  0.326975   6667.000000
3           4.0   0.507407  0.284982  0.364977  16587.000000
4      accuracy   0.449213  0.449213  0.449213      0.449213
5     macro avg   0.360141  0.387671  0.351931  42314.000000
6  weighted avg   0.497784  0.449213  0.456460  42314.000000


# KNN oversampling 2 classes

In [13]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the scaled training split against the binary target
ros = RandomOverSampler(random_state=101)
X_train_overresampled_bin, y_train_overresampled_bin = ros.fit_resample(
    X_train_scaled,
    y_train_bin,
)

In [14]:
# 2-class KNN (k=2) with the 'euclidean' metric, trained on the oversampled binary data
model = KNeighborsClassifier(n_neighbors=2, metric='euclidean')
model.fit(X_train_overresampled_bin, y_train_overresampled_bin)

# Score the train split first, then the test split; each report is saved and printed
for report_path, features, labels in (
    ('res/knn_2classes_oversamppling_euclidean_train.csv', X_train_overresampled_bin, y_train_overresampled_bin),
    ('res/knn_2classes_oversamppling_euclidean_test.csv', X_test_scaled, y_test_bin),
):
    y_pred = model.predict(features)
    report_dict = classification_report(labels, y_pred, output_dict=True)
    clsf_report = pd.DataFrame(report_dict).transpose()
    clsf_report.to_csv(report_path, index=True)
    print(clsf_report)

# Persist the fitted model
dump(model, 'model/knn_2classes_oversamppling_euclidean.joblib')

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score        support
0.0            0.758305  0.999971  0.862530  209283.000000
1.0            0.999958  0.681278  0.810415  209283.000000
accuracy       0.840625  0.840625  0.840625       0.840625
macro avg      0.879131  0.840625  0.836473  418566.000000
weighted avg   0.879131  0.840625  0.836473  418566.000000


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score       support
0.0            0.577319  0.820514  0.677761  19060.000000
1.0            0.775304  0.507612  0.613529  23254.000000
accuracy       0.648556  0.648556  0.648556      0.648556
macro avg      0.676311  0.664063  0.645645  42314.000000
weighted avg   0.686123  0.648556  0.642462  42314.000000


['model/knn_2classes_oversamppling_euclidean.joblib']

In [15]:
# 2-class KNN (k=2) with the 'minkowski' metric, trained on the oversampled binary data
# NOTE(review): the scores printed for this run are identical to the euclidean
# run — presumably because p is left at its default; confirm.
model = KNeighborsClassifier(n_neighbors=2, metric='minkowski')
model.fit(X_train_overresampled_bin, y_train_overresampled_bin)

# Score the train split first, then the test split; each report is saved and printed
for report_path, features, labels in (
    ('res/knn_2classes_oversamppling_minkowski_train.csv', X_train_overresampled_bin, y_train_overresampled_bin),
    ('res/knn_2classes_oversamppling_minkowski_test.csv', X_test_scaled, y_test_bin),
):
    y_pred = model.predict(features)
    report_dict = classification_report(labels, y_pred, output_dict=True)
    clsf_report = pd.DataFrame(report_dict).transpose()
    clsf_report.to_csv(report_path, index=True)
    print(clsf_report)

# Persist the fitted model
dump(model, 'model/knn_2classes_oversamppling_minkowski.joblib')

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score        support
0.0            0.758305  0.999971  0.862530  209283.000000
1.0            0.999958  0.681278  0.810415  209283.000000
accuracy       0.840625  0.840625  0.840625       0.840625
macro avg      0.879131  0.840625  0.836473  418566.000000
weighted avg   0.879131  0.840625  0.836473  418566.000000


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score       support
0.0            0.577319  0.820514  0.677761  19060.000000
1.0            0.775304  0.507612  0.613529  23254.000000
accuracy       0.648556  0.648556  0.648556      0.648556
macro avg      0.676311  0.664063  0.645645  42314.000000
weighted avg   0.686123  0.648556  0.642462  42314.000000


['model/knn_2classes_oversamppling_minkowski.joblib']

In [16]:
# 2-class KNN (k=2) with the 'manhattan' metric, trained on the oversampled binary data
model = KNeighborsClassifier(n_neighbors=2, metric='manhattan')
model.fit(X_train_overresampled_bin, y_train_overresampled_bin)

# Score the train split first, then the test split; each report is saved and printed
for report_path, features, labels in (
    ('res/knn_2classes_oversamppling_manhattan_train.csv', X_train_overresampled_bin, y_train_overresampled_bin),
    ('res/knn_2classes_oversamppling_manhattan_test.csv', X_test_scaled, y_test_bin),
):
    y_pred = model.predict(features)
    report_dict = classification_report(labels, y_pred, output_dict=True)
    clsf_report = pd.DataFrame(report_dict).transpose()
    clsf_report.to_csv(report_path, index=True)
    print(clsf_report)

# Persist the fitted model
dump(model, 'model/knn_2classes_oversamppling_manhattan.joblib')

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score        support
0.0            0.765012  0.999947  0.866843  209283.000000
1.0            0.999924  0.692847  0.818533  209283.000000
accuracy       0.846397  0.846397  0.846397       0.846397
macro avg      0.882468  0.846397  0.842688  418566.000000
weighted avg   0.882468  0.846397  0.842688  418566.000000


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score       support
0.0            0.590757  0.834995  0.691957  19060.000000
1.0            0.795434  0.525888  0.633168  23254.000000
accuracy       0.665123  0.665123  0.665123      0.665123
macro avg      0.693096  0.680441  0.662562  42314.000000
weighted avg   0.703239  0.665123  0.659649  42314.000000


['model/knn_2classes_oversamppling_manhattan.joblib']

In [17]:
# 2-class KNN (k=2) with the 'chebyshev' metric, trained on the oversampled binary data
model = KNeighborsClassifier(n_neighbors=2, metric='chebyshev')
model.fit(X_train_overresampled_bin, y_train_overresampled_bin)

# Score the train split first, then the test split; each report is saved and printed
for report_path, features, labels in (
    ('res/knn_2classes_oversamppling_chebyshev_train.csv', X_train_overresampled_bin, y_train_overresampled_bin),
    ('res/knn_2classes_oversamppling_chebyshev_test.csv', X_test_scaled, y_test_bin),
):
    y_pred = model.predict(features)
    report_dict = classification_report(labels, y_pred, output_dict=True)
    clsf_report = pd.DataFrame(report_dict).transpose()
    clsf_report.to_csv(report_path, index=True)
    print(clsf_report)

# Persist the fitted model
dump(model, 'model/knn_2classes_oversamppling_chebyshev.joblib')

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score        support
0.0            0.749525  0.999947  0.856814  209283.000000
1.0            0.999921  0.665840  0.799379  209283.000000
accuracy       0.832894  0.832894  0.832894       0.832894
macro avg      0.874723  0.832894  0.828096  418566.000000
weighted avg   0.874723  0.832894  0.828096  418566.000000


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score       support
0.0            0.545502  0.765163  0.636925  19060.000000
1.0            0.712690  0.477466  0.571833  23254.000000
accuracy       0.607057  0.607057  0.607057      0.607057
macro avg      0.629096  0.621314  0.604379  42314.000000
weighted avg   0.637382  0.607057  0.601153  42314.000000


['model/knn_2classes_oversamppling_chebyshev.joblib']

# KNN Best model avec oversampling (4 classes) => pas de résultats temps d'exécution trop long

In [11]:
# Hyper-parameter grid shared by the KNN grid searches.
# 'minkowski' is intentionally not listed: p is not part of the grid, so it
# would run with the default p=2 and evaluate the same distance as 'euclidean',
# adding a third more candidates to an already very long search.
param_grid = {
    'n_neighbors': np.arange(1, 20),        # k = 1 .. 19
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

In [None]:
# Exhaustive 5-fold grid search over param_grid on the oversampled 4-class data.
# NOTE(review): the folds are cut from data that was oversampled beforehand, so
# duplicated rows can land in both the training and validation part of a fold;
# best_score_ is likely optimistic. Consider resampling inside an imblearn
# Pipeline fitted on the non-resampled training split instead.
knn = KNeighborsClassifier()
# n_jobs=-1 evaluates the candidates on all available cores
grid = GridSearchCV(knn, param_grid, cv=5, n_jobs=-1)
grid.fit(X_train_overresampled, y_train_overresampled)

print(grid.best_score_)
print(grid.best_params_)
# Saved with the 'knn_' prefix used by the other model files (was 'nn_')
dump(grid, 'model/knn_bestmodel_4classes_oversamping.joblib')

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


# KNN Best model avec oversampling (2 classes) => pas de résultats temps d'exécution trop long

In [None]:
# Exhaustive 5-fold grid search over param_grid on the oversampled binary data.
# NOTE(review): the folds are cut from data that was oversampled beforehand, so
# duplicated rows can land in both the training and validation part of a fold;
# best_score_ is likely optimistic.
knn = KNeighborsClassifier()
# n_jobs=-1 evaluates the candidates on all available cores
grid = GridSearchCV(knn, param_grid, cv=5, n_jobs=-1)
grid.fit(X_train_overresampled_bin, y_train_overresampled_bin)
print(grid.best_score_)
print(grid.best_params_)
# NOTE(review): written to the working directory, unlike the models saved under 'model/'
dump(grid, 'knn_bestmodel_2classes_oversamping.joblib')

# KNN Best model avec oversampling (4 classes) => test sur dico réduit à juste k entre 1 et 10 => pas de résultats temps d'exécution trop long

In [None]:
# Validation curve of the 4-class KNN over n_neighbors = 1 .. 9
model_2 = KNeighborsClassifier(n_neighbors = 4)
model_2.fit(X_train_overresampled, y_train_overresampled)
k_2 = np.arange(1, 10)

# validation_curve is imported from sklearn.model_selection in the import cell;
# n_jobs=-1 spreads the (k, fold) fits over all available cores.
train_score_2, val_score_2 = validation_curve(model_2,
                                              X_train_overresampled,
                                              y_train_overresampled,
                                              param_name="n_neighbors",
                                              param_range=k_2, cv = 5,
                                              scoring="accuracy",
                                              n_jobs=-1)

# Mean accuracy across the 5 folds for each k
plt.plot(k_2, val_score_2.mean(axis = 1), label = 'validation')
plt.plot(k_2, train_score_2.mean(axis = 1), label = 'train')

plt.ylabel('score')
plt.xlabel('n_neighbors')
plt.legend()

# KNN Best model avec oversampling (2 classes) => test sur dico réduit à juste k entre 1 et 10 => pas de résultats temps d'exécution trop long

In [None]:
# Validation curve of the 2-class KNN over n_neighbors = 1 .. 9
model_2 = KNeighborsClassifier(n_neighbors = 4)
model_2.fit(X_train_overresampled_bin, y_train_overresampled_bin)
k_2 = np.arange(1, 10)

# The curve must be computed on the binary oversampled data; the 4-class arrays
# were passed here before, so this cell reproduced the 4-class curve.
# validation_curve is imported from sklearn.model_selection in the import cell;
# n_jobs=-1 spreads the (k, fold) fits over all available cores.
train_score_2, val_score_2 = validation_curve(model_2,
                                              X_train_overresampled_bin,
                                              y_train_overresampled_bin,
                                              param_name="n_neighbors",
                                              param_range=k_2, cv = 5,
                                              scoring="accuracy",
                                              n_jobs=-1)

# Mean accuracy across the 5 folds for each k
plt.plot(k_2, val_score_2.mean(axis = 1), label = 'validation')
plt.plot(k_2, train_score_2.mean(axis = 1), label = 'train')

plt.ylabel('score')
plt.xlabel('n_neighbors')
plt.legend()