# Evaluer la performance d'un modèle de ML

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.metrics import auc,roc_curve
import warnings
# Silence all warnings so that they are not displayed in the notebook output.
warnings.filterwarnings('ignore')

In [None]:
# Load the SPSS file into a DataFrame.
# The 'pyreadstat' package must be installed first (pip install pyreadstat).
data=pd.read_spss("scoring.sav")
# Show the first rows as the cell output.
data.head()

In [None]:
# Show the dimensions of the dataset.
# A bare `data.shape` in the middle of a cell is evaluated and discarded
# (only the last expression of a cell is echoed), so it is printed explicitly.
print(data.shape)
# Show the list of variables (column names).
print(data.columns)
# Show the type of each variable.
print(data.dtypes)

In [None]:
# Summary statistics of the dataset.
# Alternative: data.describe(include="all") -- note the quotes; a bare `all`
# would pass the Python builtin function instead of the string.
data.describe()

In [None]:
# Count the rows per value of the target "Statut1"
# (the original author notes that the dataset is imbalanced).
data["Statut1"].value_counts()

In [None]:
# Recode the variables: on a copy of the data, map the "Statut1" labels
# 'Yes' / 'No' to 1 / 0. The original `data` frame is left untouched.
replace = {
    "Statut1": {
        'Yes': 1,
        'No': 0,
    },
}
data1 = data.copy().replace(replace)
data1.head()

In [None]:
# Missing data: build the boolean "is missing" mask once, print the number of
# missing values per column, then draw the mask as a heatmap.
missing_mask = data1.isnull()
print(missing_mask.sum())
sns.heatmap(missing_mask)

In [None]:
# Correlation matrix of the recoded data, drawn as an annotated heatmap.
# NOTE(review): DataFrame.corr() may need numeric_only=True if non-numeric
# columns remain in data1 -- confirm against the actual dataset.
sns.heatmap(data1.corr(),annot=True)

In [None]:
# Separate the explanatory variables (x) from the variable to explain (y):
# x is every column except "Statut1", y is the "Statut1" column itself.
x = data1.drop(columns="Statut1")
y = data1["Statut1"]

In [None]:
# Split the dataset into a training part and a test part.
# test_size is the fraction of rows held out for testing; the fixed seed is
# passed as random_state so the split is the same on every run.
seed = 101
test_size = 0.2
split = train_test_split(x, y, test_size=test_size, random_state=seed)
x_train, x_test, y_train, y_test = split

In [None]:
# Train the model with the default hyperparameters.
model=KNeighborsClassifier()
model.fit(x_train,y_train)

* Évaluer la performance du modèle avec la base d'entraînement et la base de test

In [None]:
# Accuracy on the training set: predict the training rows and compare the
# predictions with the true training labels.
pred_train=model.predict(x_train)
accuracy_score(y_train,pred_train)

In [None]:
# Accuracy on the test set: predict the held-out rows and compare the
# predictions with the true test labels.
pred_test=model.predict(x_test)
accuracy_score(y_test,pred_test)

In [None]:
# Confusion matrix on the test set.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# actual classes and columns the predicted classes, which is exactly what the
# axis labels below state. Passing the predictions first would transpose the
# matrix and make the labels wrong.
mat=confusion_matrix(y_test,pred_test)
plt.figure()
sns.heatmap(mat,annot=True,square=True)
plt.xlabel("predicted label")
plt.ylabel("Actual label")

In [None]:
# Print precision, recall, F1-score and accuracy on the test set.
# classification_report expects (y_true, y_pred); swapping the two arguments
# exchanges precision and recall for every class.
print(classification_report(y_test,pred_test))

In [None]:
# AUC value, computed on the training set.
# roc_auc_score is not among the names imported at the top of the notebook,
# so it is imported here to avoid a NameError.
from sklearn.metrics import roc_auc_score

# Compute the scores inside this cell so it does not depend on a variable
# that is only created by a later cell. Column 1 of predict_proba holds the
# probability of the second class listed in model.classes_.
probabilities = model.predict_proba(x_train)[:, 1]
# NOTE(review): an AUC measured on the training data is optimistic; use
# x_test / y_test to estimate performance on unseen data.
roc_auc_score(y_train, probabilities)

In [None]:
# ROC curve of the model, computed on the training set.
plt.figure(figsize=(4,3))
probabilities = model.predict_proba(x_train)
# Keep column 1 only: the probability of the second class in model.classes_
# (not the first class, which is column 0).
probabilities = probabilities[:, 1]
# Compute the ROC curve (false/true positive rates for each threshold).
fpr, tpr, thresholds = roc_curve(y_train, probabilities)

# Plot the "dumb model" diagonal as a reference.
plt.plot([0, 1], [0, 1], linestyle='--')

# Plot the model's ROC curve.
plt.plot(fpr, tpr, marker='.')

# Area under the curve from the points just computed. `auc` is already
# imported by this notebook, whereas roc_auc_score is not (it raised a
# NameError here); both give the same area for this binary ROC curve.
plt.text(0.75, 0.25, "AUC: " + str(round(auc(fpr, tpr),2)))

plt.show()

#### Evaluer un modèle avec les hyper-paramètres par défaut par CV

In [None]:
# K-Fold cross-validation of a KNN classifier with default hyperparameters.
from sklearn.model_selection import KFold,cross_val_score

# Create 5 shuffled folds; the fixed seed keeps the fold assignment
# reproducible. (A stray `x.head()` that used to sit here was removed: in the
# middle of a cell its result is discarded, so it displayed nothing.)
seed = 123
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

# Create a model with default parameters.
model_KN = KNeighborsClassifier()

# Train and evaluate one model per fold; `results` holds one accuracy per fold.
results = cross_val_score(model_KN, x, y, cv=kfold, scoring='accuracy')
print(results)
mean_accuracy=results.mean()
std_accuracy=results.std()

print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation of Accuracy: {std_accuracy:.2f}")


In [None]:
# Stratified K-Fold cross-validation: each fold keeps the class proportions
# of the target.

# cross_val_score is imported here as well so the cell runs on its own.
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Create 5 folds. The seed was previously defined but never used; it is now
# passed as random_state together with shuffle=True (random_state is only
# accepted when shuffling), matching the plain K-Fold setup of this notebook.
seed = 7
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
# Create a model with default hyperparameters.
model_KN = KNeighborsClassifier()
# Train and evaluate one model per stratified fold.
results = cross_val_score(model_KN,x, y, cv=skf, scoring='accuracy')
print("mean:",results.mean())
print("std:",results.std())

#####   Optimisation des hyperparamètres et évaluation du modèle par CV 

In [None]:
# Hyperparameter optimisation for KNN with a 5-fold cross-validated grid
# search scored on accuracy.
model1 = KNeighborsClassifier()

# Candidate values for each hyperparameter; every combination is tried.
# NOTE(review): with KNeighborsClassifier's default p=2, 'minkowski' is the
# same distance as 'euclidean', so this grid seems to evaluate each
# n_neighbors twice -- confirm whether another metric was intended.
liste = [
    {
        'n_neighbors': range(1, 50),
        'metric': ['minkowski', 'euclidean'],
    },
]

# Build the search over all combinations, then fit it on the training set.
grid_knn = GridSearchCV(
    estimator=model1,
    param_grid=liste,
    scoring='accuracy',
    cv=5,
)
grid_knn.fit(x_train, y_train)

In [None]:
# Retrieve the optimal hyperparameters found by the grid search.
best_params=grid_knn.best_params_
# Retrieve the model built with those optimal hyperparameters.
best_model=grid_knn.best_estimator_

In [None]:
# Print the selected hyperparameters and the best score reported by the
# grid search.
print(best_params)
print(grid_knn.best_score_)

In [None]:
# Test the selected model on the held-out test set.
# (grid_knn.predict(x_test) is an alternative way to get the predictions.)
pred = best_model.predict(x_test)
# Accuracy is the share of matching labels, so the argument order does not
# change the value; the conventional (y_true, y_pred) order is used.
accuracy_score(y_test, pred)

In [None]:
# Precision, recall and F1-score of the selected model on the test set.
# classification_report expects (y_true, y_pred); with the arguments swapped
# precision and recall are exchanged for every class.
print(classification_report(y_test,pred))

In [None]:
# Confusion matrix of the selected model on the test set, as a heatmap:
# rows are the actual labels, columns the predicted labels.
plt.figure()
sns.heatmap(
    confusion_matrix(y_true=y_test, y_pred=pred),
    annot=True,
    square=True,
)
plt.ylabel("Actual label")
plt.xlabel("predicted label")

In [None]:
# Same confusion matrix, drawn with scikit-learn's own display helper.
# The helper is imported here: the notebook never imports a `metrics` name,
# so `metrics.ConfusionMatrixDisplay` raised a NameError.
from sklearn.metrics import ConfusionMatrixDisplay

mat=confusion_matrix(y_test,pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix = mat, display_labels = [False, True])
# plot() creates its own figure when no axes are given, so no plt.figure()
# call is made beforehand (it only produced an extra empty figure).
cm_display.plot()

In [None]:
# Nested cross-validation: an inner grid search picks the hyperparameters,
# and an outer cross-validation scores the whole search procedure.
from sklearn.model_selection import GridSearchCV, KFold

# Base estimator to tune.
model = KNeighborsClassifier()

# Hyperparameter grid explored by the inner search.
liste = [
    {
        'n_neighbors': range(1, 50),
        'metric': ['minkowski', 'euclidean'],
    },
]

# Outer iterator (5 folds) for scoring, inner iterator (3 folds) for tuning;
# both are shuffled with a fixed random_state.
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Grid search used as the estimator of the outer loop.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=liste,
    cv=inner_cv,
    scoring='accuracy',
)

# Run the nested cross-validation: one score per outer fold.
nested_scores = cross_val_score(grid_search, x, y, cv=outer_cv)

# Report the average score over the outer folds.
print("Nested CV Mean Score:", nested_scores.mean())