# Classification binaire par apprentissage non supervisé

L'objectif de ce notebook est de tenter d'établir une classification de la comestibilité d'un champignon à partir d'une image par le biais de méthodes non supervisées.
Les inputs de ce notebook sont :
- un fichier .csv contenant le nom des fichiers images et la cible correspondante
- un dossier d'images.

Ce notebook est inspiré de la page 'https://www.kaggle.com/code/hosen42/pneumonia-detection-using-traditional-ml-image'

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image
import random


from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

from yellowbrick.model_selection import learning_curve


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.utils import shuffle as shf
import pickle
import os
import glob as gb
import shutil

from joblib import dump

import warnings as wr
wr.filterwarnings('ignore')

In [None]:
# Load the dataset index and declare the input locations.
# NOTE(review): DATASET_DIR is an absolute, machine-specific path; adapt it to the
# local environment before running the notebook.
DATASET_DIR = r'C:\Users\renamedadmin\Documents\Formation_Datascience\Projet_Datascientest_Champignons\Dossier_technique\02_Pieces_constitutives\Dataset'
images_folder = os.path.join(DATASET_DIR, 'dataset_cleaned')
df = pd.read_csv(os.path.join(DATASET_DIR, 'df_XS.csv'))
validation_folder = os.path.join(DATASET_DIR, 'val_dataset')
validation_dataset = os.path.join(DATASET_DIR, 'validation_dataset_wildfooduk_cleaned.csv')

# df.info() prints its report and returns None, so it must not be passed to
# display() (that rendered a stray "None" after the table).
df.info()
display(df.head())


In [None]:
# Create, next to the image folder, the folder where models and results are saved.
parent_directory = os.path.dirname(images_folder)
models_results = os.path.join(parent_directory, 'Models_results')

# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(models_results, exist_ok=True)

### préparation des images pour l'étude

#### séparation du jeu de données en 2 parties : train, test


L'objectif est de disposer de 3 jeux de données, dont 2 partitions du dataframe original (Train et Test) :
- Train : partition qui servira à l'entrainement du modèle de classification,
- Test : partition qui servira de set de validation lors de l'entraînement du modèle,
- Validation : Partition qui ne verra jamais l'entrainement permettant de comparer différents modèles entre eux.

Le dataset complet est découpé en deux pour obtenir 'Train' et 'Test'.
Le dataset de validation est le dataset WildFoodUK.

Au final, depuis l'ensemble des données disponibles les datasets représentent :
Test (df_test) : 20%
Validation (df_val) : (3122 images)
Train (df_train) : 80%

Le random_state utilisé pour le découpage train/test de cette étude est 3.

In [None]:
# Split the dataframe into a training partition (80%) and a test partition (20%);
# random_state=3 makes the split reproducible.
df_train, df_test = train_test_split(df, test_size=0.2, random_state = 3)


In [None]:
# Split each partition on the target column 'edible' (1 / 0) and collect the
# matching image file names as plain Python lists.
train_is_edible = df_train["edible"] == 1
train_is_inedible = df_train["edible"] == 0
test_is_edible = df_test["edible"] == 1
test_is_inedible = df_test["edible"] == 0

df_edible_train = df_train.loc[train_is_edible]
df_inedible_train = df_train.loc[train_is_inedible]
df_edible_test = df_test.loc[test_is_edible]
df_inedible_test = df_test.loc[test_is_inedible]

# File names of the training images, per class
images_names_edible_train = list(df_edible_train["filename"].values)
images_names_inedible_train = list(df_inedible_train["filename"].values)

# File names of the test images, per class
images_names_edible_test = list(df_edible_test["filename"].values)
images_names_inedible_test = list(df_inedible_test["filename"].values)


In [None]:
# Build the folder tree XS_dataset/{train,test}/{edible,inedible} next to the
# image folder.
train = 'train'
test = 'test'

edible = 'edible'
inedible = 'inedible'
XS_dataset = 'XS_dataset'
parent_dir = os.path.dirname(images_folder)

path_source = os.path.join(parent_dir, XS_dataset)
path_train = os.path.join(path_source, train)
path_test = os.path.join(path_source, test)

path_train_edible = os.path.join(path_train, edible)
path_train_inedible = os.path.join(path_train, inedible)
path_test_edible = os.path.join(path_test, edible)
path_test_inedible = os.path.join(path_test, inedible)

# os.makedirs creates the intermediate folders too, and exist_ok=True avoids the
# FileExistsError that os.mkdir raised whenever this cell was run a second time.
for class_dir in (path_train_edible, path_train_inedible,
                  path_test_edible, path_test_inedible):
    os.makedirs(class_dir, exist_ok=True)



In [None]:
# Declare the access paths (same variables and values as the folder-creation cell).
train, test = 'train', 'test'
edible, inedible = 'edible', 'inedible'
XS_dataset = 'XS_dataset'
parent_dir = os.path.dirname(images_folder)

path_source = os.path.join(parent_dir, XS_dataset)
path_train, path_test = (
    os.path.join(path_source, split_name) for split_name in (train, test)
)

path_train_edible, path_train_inedible = (
    os.path.join(path_train, class_name) for class_name in (edible, inedible)
)
path_test_edible, path_test_inedible = (
    os.path.join(path_test, class_name) for class_name in (edible, inedible)
)


In [None]:
# Copy every listed image from the source folder into its split/class folder.
# Each entry pairs a list of file names with the folder that must receive them.
copy_plan = (
    (images_names_edible_train, path_train_edible),
    (images_names_inedible_train, path_train_inedible),
    (images_names_edible_test, path_test_edible),
    (images_names_inedible_test, path_test_inedible),
)

missing_images = []
for image_names, destination_folder in copy_plan:
    for image_name in image_names:
        original_image_path = os.path.join(images_folder, image_name)
        new_image_path = os.path.join(destination_folder, image_name)
        if os.path.exists(original_image_path):
            shutil.copy(original_image_path, new_image_path)
        else:
            # Still skipped, as before, but recorded instead of silently ignored.
            missing_images.append(image_name)

if missing_images:
    print(f'{len(missing_images)} image(s) listed in the dataframe were not found in {images_folder}')
       


In [None]:
# Validation dataset: load its index and drop the taxonomy columns.
# A new frame is produced (no inplace mutation) so the cell can be re-run.
validation_df = pd.read_csv(validation_dataset).drop(
    columns=['kingdom', 'family', 'phylum', 'order', 'classes', 'genus', 'species']
)

# Split on the target column 'edible' (1 / 0) and collect the image file names
df_edible_val = validation_df.loc[validation_df["edible"] == 1]
df_inedible_val = validation_df.loc[validation_df["edible"] == 0]
images_names_edible_val = list(df_edible_val["filename"].values)
images_names_inedible_val = list(df_inedible_val["filename"].values)

# Folder tree <path_source>/val/{edible,inedible}; exist_ok=True avoids the
# FileExistsError raised by os.mkdir when the cell is run a second time.
val = 'val'
path_val = os.path.join(path_source, val)
path_val_edible = os.path.join(path_val, edible)
path_val_inedible = os.path.join(path_val, inedible)
for class_dir in (path_val_edible, path_val_inedible):
    os.makedirs(class_dir, exist_ok=True)

# Copy the images into the tree; files absent from validation_folder are still
# skipped but are now counted and reported.
missing_val_images = []
for image_names, destination_folder in (
    (images_names_edible_val, path_val_edible),
    (images_names_inedible_val, path_val_inedible),
):
    for image_name in image_names:
        original_image_path = os.path.join(validation_folder, image_name)
        new_image_path = os.path.join(destination_folder, image_name)
        if os.path.exists(original_image_path):
            shutil.copy(original_image_path, new_image_path)
        else:
            missing_val_images.append(image_name)

if missing_val_images:
    print(f'{len(missing_val_images)} validation image(s) were not found in {validation_folder}')


## Equilibrage des datasets train et test par suppression aléatoire d'images dans les dossiers sources de l'étude 

Les 4 cellules ci-dessous sont à exécuter si besoin pour réaliser un sous-échantillonnage aléatoire des données images dans le but d'équilibrer les classes. Pour le dataset XS, cette partie n'est pas utilisée.

In [None]:
# Helper that deletes n randomly chosen files from a folder
def delete_files(folder, n):
    """Delete ``n`` randomly chosen entries of ``folder`` from disk.

    Parameters
    ----------
    folder : str
        Path of the directory whose entries are candidates for deletion.
    n : int
        Number of entries to delete.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when ``n`` is negative or larger
        than the number of entries in ``folder``.
    """
    # random.sample() needs a sequence: sampling from a set is deprecated since
    # Python 3.9 and raises TypeError from Python 3.11 on. Sorting also fixes the
    # population order, so a seeded `random` selects the same files every time.
    candidates = sorted(os.listdir(folder))
    files_to_delete = random.sample(candidates, n)
    for file in files_to_delete:
        os.remove(os.path.join(folder, file))

In [None]:
# Number of images per class in each split
(count_train_edible,
 count_train_inedible,
 count_test_edible,
 count_test_inedible) = (
    len(os.listdir(class_folder))
    for class_folder in (path_train_edible, path_train_inedible,
                         path_test_edible, path_test_inedible)
)

# Surplus of inedible images over edible images, per split
n_train = count_train_inedible - count_train_edible
n_test = count_test_inedible - count_test_edible


In [None]:
# Remove the surplus of inedible images from the train and test folders.
# WARNING: this deletes files on disk.
delete_files(path_train_inedible, n_train)
delete_files(path_test_inedible, n_test)

In [None]:
# Recount the images per class and print the totals for each split
images_per_folder = {
    class_folder: len(os.listdir(class_folder))
    for class_folder in (path_train_edible, path_train_inedible,
                         path_test_edible, path_test_inedible)
}
count_train_edible = images_per_folder[path_train_edible]
count_train_inedible = images_per_folder[path_train_inedible]
count_test_edible = images_per_folder[path_test_edible]
count_test_inedible = images_per_folder[path_test_inedible]

print('train_dataset :', count_train_edible, count_train_inedible)
print('test_dataset :', count_test_edible, count_test_inedible)

## Import du dataset 

In [None]:
# Build the training arrays from the image folders
X_train = []
y_train = []

W, H = 128, 128

# (folder, label) pairs: edible images are labelled 1, inedible images 0
for class_folder, class_label in ((path_train_edible, 1), (path_train_inedible, 0)):
    # sorted() makes the sample order independent of the file-system listing order
    for image_path in sorted(os.listdir(class_folder)):
        # The context manager closes the file handle; convert('RGB') guarantees a
        # 3-channel array even for grayscale, palette or RGBA files.
        with Image.open(os.path.join(class_folder, image_path)) as image:
            image_array = np.asarray(image.convert('RGB'))
        # Keep the top-left window of at most W rows and H columns (a crop, not a rescale).
        # NOTE(review): an image smaller than W x H would still yield a smaller array.
        image_resized = image_array[:W, :H]
        # Store the image together with the label of its folder
        X_train.append(image_resized)
        y_train.append(class_label)
np.save('X_train',X_train)
np.save('y_train',y_train)

In [None]:
# Same steps for the test dataset
X_test = []
y_test = []

W, H = 128,128

# (folder, label) pairs: edible images are labelled 1, inedible images 0
for class_folder, class_label in ((path_test_edible, 1), (path_test_inedible, 0)):
    # sorted() makes the sample order independent of the file-system listing order
    for image_path in sorted(os.listdir(class_folder)):
        # The context manager closes the file handle; convert('RGB') guarantees a
        # 3-channel array even for grayscale, palette or RGBA files.
        with Image.open(os.path.join(class_folder, image_path)) as image:
            image_array = np.asarray(image.convert('RGB'))
        # Keep the top-left window of at most W rows and H columns (a crop, not a rescale).
        # NOTE(review): an image smaller than W x H would still yield a smaller array.
        image_resized = image_array[:W, :H]
        X_test.append(image_resized)
        y_test.append(class_label)
np.save('X_test',X_test)
np.save('y_test',y_test)

In [None]:
# Same steps for the validation dataset
X_val = []
y_val = []

W, H = 128,128

# (folder, label) pairs: edible images are labelled 1, inedible images 0
for class_folder, class_label in ((path_val_edible, 1), (path_val_inedible, 0)):
    # sorted() makes the sample order independent of the file-system listing order
    for image_path in sorted(os.listdir(class_folder)):
        # The context manager closes the file handle; convert('RGB') guarantees a
        # 3-channel array even for grayscale, palette or RGBA files.
        with Image.open(os.path.join(class_folder, image_path)) as image:
            image_array = np.asarray(image.convert('RGB'))
        # Keep the top-left window of at most W rows and H columns (a crop, not a rescale).
        # NOTE(review): an image smaller than W x H would still yield a smaller array.
        image_resized = image_array[:W, :H]
        X_val.append(image_resized)
        y_val.append(class_label)
np.save('X_val',X_val)
np.save('y_val',y_val)

In [None]:
# The "training" total adds up the train and test partitions
n_train_and_test_images = len(X_train) + len(X_test)
n_validation_images = len(X_val)
print("Total Image for training:", n_train_and_test_images)
print("Total Image for validation:", n_validation_images)

### Chargement des fichiers .npy comme arrays numpy

In [None]:
# Reload the saved .npy files as numpy arrays:
# the X_* files hold the images, the y_* files hold the class of each image.
loaded_X_train, loaded_y_train = np.load('./X_train.npy'), np.load('./y_train.npy')
loaded_X_test, loaded_y_test = np.load('./X_test.npy'), np.load('./y_test.npy')
loaded_X_val, loaded_y_val = np.load('./X_val.npy'), np.load('./y_val.npy')

In [None]:
print(loaded_X_train.shape)
# Author's recorded result: X_train holds 20698 images of 128x128 pixels on three colour channels


In [None]:
print(loaded_X_test.shape)
# Author's recorded result: X_test holds 5120 images of 128x128 pixels on three colour channels


In [None]:
print(loaded_X_val.shape)
# Author's recorded result: X_val holds 3052 images of 128x128 pixels on three colour channels


In [None]:
# The y_* arrays hold the class of each image: 0 for 'inedible', 1 for 'edible'
print(loaded_y_train.shape)
print(loaded_y_test.shape)
print(loaded_y_val.shape)

### Data Analysis

In [None]:
# Mapping between class names and the numeric target values
code = {'inedible':0 ,'edible':1}

def getcode(n) :
    """Return the class name of ``code`` whose target value equals ``n``.

    Returns None when no class has that value.
    """
    matching_names = (class_name for class_name, target in code.items() if n == target)
    return next(matching_names, None)

In [None]:
# Show 16 randomly drawn training images on a 2 x 8 grid, titled with their class
plt.figure(figsize=(20,10))
sampled_indices = np.random.randint(0,len(loaded_X_train),16)
for grid_position, image_index in enumerate(sampled_indices, start=1):
    plt.subplot(2,8,grid_position)
    plt.imshow(loaded_X_train[image_index])
    plt.axis('off')
    plt.title(getcode(loaded_y_train[image_index]))

In [None]:
# Count plot of the number of training images available per class.
# NOTE(review): this rebinds df_train, which earlier held the train partition of the csv.
df_train = pd.DataFrame({"labels": loaded_y_train})
lab = df_train['labels']
dist = lab.value_counts()
sns.countplot(data=df_train, x='labels')
plt.show()

In [None]:
# Show 16 randomly drawn test images on a 2 x 8 grid, titled with their class
plt.figure(figsize=(20,10))
sampled_indices = np.random.randint(0,len(loaded_X_test),16)
for grid_position, image_index in enumerate(sampled_indices, start=1):
    plt.subplot(2,8,grid_position)
    plt.imshow(loaded_X_test[image_index])
    plt.axis('off')
    plt.title(getcode(loaded_y_test[image_index]))

In [None]:
# Count plot of the number of test images available per class.
# NOTE(review): the original notes in this cell disagree with each other (one says
# the inedible class is over-represented, the other that train and test are
# balanced) - check the statement against the plots.
# NOTE(review): this rebinds df_test, which earlier held the test partition of the csv.
df_test = pd.DataFrame({"labels": loaded_y_test})
lab = df_test['labels']
dist = lab.value_counts()
sns.countplot(data=df_test, x='labels')
plt.show()

### Histogram

In [None]:
# Helper showing an image next to the histogram of its pixel intensities per colour channel
def plotHistogram(a):
    """Plot image ``a`` (left) and the histograms of its first three channels (right).

    ``a`` is indexed as ``a[:, :, c]`` for c in 0, 1, 2; the three channels are
    drawn in red, green and blue respectively, with 30 bins each.
    """
    plt.figure(figsize=(10,5))

    # Left panel: the image itself
    plt.subplot(1,2,1)
    plt.imshow(a)

    # Right panel: one overlaid histogram per channel
    histo = plt.subplot(1,2,2)
    histo.set_ylabel('Count')
    histo.set_xlabel('Pixel Intensity')
    n_bins = 30
    for channel_index, channel_color in enumerate('rgb'):
        channel_values = a[:,:,channel_index].flatten()
        plt.hist(channel_values, bins= n_bins, lw = 0, color=channel_color, alpha=0.9)

In [None]:
# Histogram of one randomly chosen training image
plotHistogram(loaded_X_train[np.random.randint(len(loaded_X_train))])

In [None]:
# Histogram of one randomly chosen test image
plotHistogram(loaded_X_test[np.random.randint(len(loaded_X_test))])

### Mise à plat des images et mélange des datasets train et test 

In [None]:
# Flatten each image so the train and test sets become 2-D arrays
# (one row per image, 128*128*3 columns).
# np.prod replaces np.product, which was deprecated in NumPy 1.25 and removed in NumPy 2.0.
X_train = loaded_X_train.reshape([-1, np.prod((128,128,3))])
X_test = loaded_X_test.reshape([-1, np.prod((128,128,3))])

In [None]:
# Shapes after flattening
print(X_train.shape)
print(X_test.shape)

In [None]:
# Targets used for modelling are the label arrays reloaded from disk
y_train = loaded_y_train
y_test = loaded_y_test

In [None]:
# Shuffle the rows of each dataset; features and labels are passed together to the
# same call, and random_state=15 fixes the permutation
X_train, y_train = shf(X_train, y_train, random_state=15)
X_test, y_test = shf(X_test, y_test, random_state=15)

### Data preprocessing 

In [None]:
# Standardise the pixel features.
# The scaler is fitted on the training set only; the test set is transformed with
# the training statistics. Re-fitting on X_test (fit_transform) would scale train
# and test differently and leak test statistics into the preprocessing.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Reduce the number of features with a PCA fitted on the training set;
# n_components=.95 keeps enough components to explain 95% of the variance.
pca = PCA(n_components=.95)
pca.fit(X_train)

# Project both datasets onto the components learnt from the training set
X_train, X_test = pca.transform(X_train), pca.transform(X_test)

In [None]:
# Save the fitted PCA in the results folder (joblib compression level 3)
path_save_name = os.path.join(models_results, 'pca_XS_dataset_overbalanced.joblib')
dump(pca, path_save_name, compress=3)

In [None]:
# Number of components the PCA kept to explain 95% of the variance
print(f'Number of components after PCA: {pca.n_components_}')

### Préparation du dataset de validation 

In [None]:
# Flatten the validation images (one row per image).
# np.prod replaces np.product, which was deprecated in NumPy 1.25 and removed in NumPy 2.0.
X_val = loaded_X_val.reshape([-1, np.prod((128,128,3))])
y_val = loaded_y_val


print('X_val.shape before PCA', X_val.shape)
print('y_val.shpae',y_val.shape)

# Standardise with the scaler that is already fitted: calling fit_transform here
# would overwrite its statistics with those of the validation set and scale the
# validation data differently from the data seen by the PCA and the models.
X_val = sc.transform(X_val)

# Project onto the principal components obtained by the PCA
X_val = pca.transform(X_val)
print('X_val.shape after PCA', X_val.shape)

### équilibrage du dataset avec SMOTE 

In [None]:
# Apply SMOTE to the training set to generate new samples and balance the classes
from collections import Counter
smote = SMOTE(random_state = 11)
resampled_training_set = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled_training_set

In [None]:
# Class counts of the training targets before and after SMOTE
class_counts_before = Counter(y_train)
class_counts_after = Counter(y_train_res)
print('Before SMOTE : ',class_counts_before)
print('After SMOTE : ',class_counts_after)

# Dimensions of the resampled training set
resampled_X_shape = X_train_res.shape
resampled_y_shape = y_train_res.shape
print('X_train shape :',resampled_X_shape)
print('y_train shape :',resampled_y_shape)

In [None]:
# Uncomment to install lazypredict if it is missing from the environment:
#!pip install lazypredict

In [None]:
from lazypredict.Supervised import LazyClassifier

# Create the LazyClassifier that fits and scores all the classifiers it supports.
# (The stray `LazyClassifier()` call that built and discarded a second instance
# has been removed.)
clf = LazyClassifier(verbose=0,
                     ignore_warnings=True,
                     custom_metric=None,
                     predictions=True,
                     random_state=12,
                     classifiers='all')

# Switch between the two calls depending on whether the SMOTE-resampled training set is used
# model, predictions = clf.fit(X_train_res, X_test, y_train_res, y_test)
model, predictions = clf.fit(X_train, X_test,y_train, y_test)
model

In [None]:
# First rows of the `predictions` table returned by clf.fit
predictions.head()

In [None]:
# Show the 10 models with the best 'Balanced Accuracy'.
# head(10) matches the variable name and the stated intent; the original took 12 rows.
top_10= model.sort_values(by='Balanced Accuracy', ascending=False).head(10)
print(top_10)

### Visualisation des résultats 

In [None]:
import plotly.express as px

# Accuracy of every model of the `model` results table, as a red line with markers
accuracy_title = dict(text='Accuracy vs Model', y=0.94, x=0.5,
                      xanchor='center', yanchor='top')

line = px.line(data_frame=model, y=["Accuracy"], markers=True)
line.update_xaxes(title="Model", rangeslider_visible=False)
line.update_yaxes(title="Accuracy")
line.update_traces(line_color="red")
line.update_layout(showlegend=True, title=accuracy_title)

line.show()

In [None]:
# Accuracy, ROC AUC and F1 score of every model on the same figure
scores_title = dict(text='Accuracy, ROC AUC and F1 Score vs Models', y=0.94, x=0.5,
                    xanchor='center', yanchor='top')

line = px.line(data_frame=model, y=["Accuracy", "ROC AUC" , "F1 Score"], markers=True)
line.update_xaxes(title="Models", rangeslider_visible=False)
line.update_yaxes(title="scores")
line.update_layout(showlegend=True, title=scores_title)

line.show()

In [None]:
# 'Time Taken' column of every model, as a purple line with markers
time_title = dict(text='TIME TAKEN vs Model', y=0.94, x=0.5,
                  xanchor='center', yanchor='top')

line = px.line(data_frame=model, y=["Time Taken"], markers=True)
line.update_xaxes(title="Model", rangeslider_visible=False)
line.update_yaxes(title="Time(s)")
line.update_traces(line_color="purple")
line.update_layout(showlegend=True, title=time_title)

line.show()

In [None]:
# Save the LazyClassifier object in the results folder (joblib compression level 3)
path_save_name = os.path.join(models_results, 'LazyPredict_XS_dataset_withoutSMOTE.joblib')
dump(clf, path_save_name, compress=3)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, precision_recall_curve, PrecisionRecallDisplay, RocCurveDisplay

### GaussianNB 

In [None]:
from sklearn.naive_bayes import MultinomialNB , GaussianNB, BernoulliNB

# Fit a Gaussian naive Bayes classifier on the SMOTE-resampled training set
# (fit returns the estimator itself)
gaussian_nb = GaussianNB().fit(X_train_res, y_train_res)

In [None]:
# Accuracy (in %) on the resampled training set and on the test split
gaussian_nb_train_accuracy = gaussian_nb.score(X_train_res, y_train_res)*100
gaussian_nb_test_accuracy = gaussian_nb.score(X_test,y_test)*100
print ("Training Accuracy: ", gaussian_nb_train_accuracy)
print ("Validation Accuracy: ", gaussian_nb_test_accuracy)  # computed on X_test / y_test

In [None]:
# Predict the validation set and display the accuracy (in %).
# scikit-learn's signature is accuracy_score(y_true, y_pred); the arguments were swapped.
gaussian_nb_pred = gaussian_nb.predict(X_val)
accuracy_score(y_val, gaussian_nb_pred)*100

In [None]:
# Main report: classification report of GaussianNB on the validation set
print(classification_report(y_val,gaussian_nb_pred))


In [None]:
# Confusion matrix of GaussianNB on the validation set (true labels first, predictions second)
cm = confusion_matrix(y_val,gaussian_nb_pred)

In [None]:
# Plot the confusion matrix of GaussianNB.
plt.rcParams['font.size'] = 20
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues', values_format='d', xticks_rotation='horizontal')
# The title previously referenced extra_trees_classifier, which names the wrong
# model and is not defined yet when the notebook is run top to bottom.
plt.title(f'Confusion matrix for {gaussian_nb}')
plt.ylabel('True label', fontsize = 20)
plt.yticks(fontsize = 20)
plt.xlabel('Predicted label', fontsize = 20)
plt.xticks(fontsize = 20)
# No plt.legend() call: the figure has no labelled artist to put in a legend.
plt.grid(False)
plt.show()

In [None]:
# Save the fitted GaussianNB model in the results folder (joblib compression level 3)
path_save_name = os.path.join(models_results, 'GaussienNB_XS_dataset_with_SMOTE.joblib')
dump(gaussian_nb, path_save_name, compress=3)

In [None]:
# ROC curve of GaussianNB on the test split, with the diagonal drawn as a dashed green line
gaussian_nb_roc = RocCurveDisplay.from_estimator(gaussian_nb,X_test,y_test)
diagonal_ends = [1,0]
plt.plot(diagonal_ends, diagonal_ends, 'go--')

### BaggingClassifier

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

# Bagging of 10 SVC estimators, fitted on the SMOTE-resampled training set
# (fit returns the estimator itself)
svc_bagging = BaggingClassifier(estimator=SVC(), n_estimators=10, random_state=0)
bagging_classifier = svc_bagging.fit(X_train_res, y_train_res)


In [None]:
# Accuracy (in %) on the resampled training set and on the test split
bagging_train_accuracy = bagging_classifier.score(X_train_res, y_train_res)*100
bagging_test_accuracy = bagging_classifier.score(X_test,y_test)*100
print ("Training Accuracy: ", bagging_train_accuracy)
print ("validation Accuracy: ", bagging_test_accuracy)  # computed on X_test / y_test

In [None]:
# Predict the validation set and display the accuracy (in %).
# scikit-learn's signature is accuracy_score(y_true, y_pred); the arguments were swapped.
bagging_classifier_pred = bagging_classifier.predict(X_val)
accuracy_score(y_val, bagging_classifier_pred)*100

In [None]:
# Main report: classification report of the bagging classifier on the validation set
print(classification_report(y_val,bagging_classifier_pred))

In [None]:
# Confusion matrix of the bagging classifier on the validation set (true labels first)
cm=confusion_matrix(y_val,bagging_classifier_pred)
cm

In [None]:
# Plot the confusion matrix of the bagging classifier.
plt.rcParams['font.size'] = 20
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues', values_format='d', xticks_rotation='horizontal')
# The title previously referenced extra_trees_classifier, which names the wrong
# model and is not defined yet when the notebook is run top to bottom.
plt.title(f'Confusion matrix for {bagging_classifier}')
plt.ylabel('True label', fontsize = 20)
plt.yticks(fontsize = 20)
plt.xlabel('Predicted label', fontsize = 20)
plt.xticks(fontsize = 20)
# No plt.legend() call: the figure has no labelled artist to put in a legend.
plt.grid(False)
plt.show()

In [None]:
# Save the fitted BaggingClassifier in the results folder (joblib compression level 3)
path_save_name = os.path.join(models_results, 'BaggingClassifier_XS_dataset_with_SMOTE.joblib')
dump(bagging_classifier, path_save_name, compress=3)

In [None]:
# ROC curve of the bagging classifier on the test split, with the diagonal drawn as a dashed green line
bagging_roc = RocCurveDisplay.from_estimator(bagging_classifier,X_test,y_test)
diagonal_ends = [1,0]
plt.plot(diagonal_ends, diagonal_ends, 'go--')

### ExtraTreesClassifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees classifier fitted on the SMOTE-resampled training set.
# random_state is fixed (same value as the bagging classifier) so that the
# randomised trees, and therefore the reported scores, are reproducible.
extra_trees_classifier = ExtraTreesClassifier(random_state=0)
extra_trees_classifier=extra_trees_classifier.fit(X_train_res, y_train_res)

In [None]:
# Accuracy (in %) on the resampled training set and on the test split
extra_trees_train_accuracy = extra_trees_classifier.score(X_train_res, y_train_res)*100
extra_trees_test_accuracy = extra_trees_classifier.score(X_test,y_test)*100
print ("Training Accuracy: ", extra_trees_train_accuracy)
print ("validation Accuracy: ", extra_trees_test_accuracy)  # computed on X_test / y_test

In [None]:
# Predict the validation set and display the accuracy (in %).
# scikit-learn's signature is accuracy_score(y_true, y_pred); the arguments were swapped.
extra_trees_classifier_pred = extra_trees_classifier.predict(X_val)
accuracy_score(y_val, extra_trees_classifier_pred)*100

In [None]:
# Main report: classification report of the extra-trees classifier on the validation set
print(classification_report(y_val,extra_trees_classifier_pred))

In [None]:
# Confusion matrix of the extra-trees classifier on the validation set (true labels first)
cm=confusion_matrix(y_val,extra_trees_classifier_pred)
cm

In [None]:
# Plot the confusion matrix of the extra-trees classifier.
plt.rcParams['font.size'] = 20
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues', values_format='d', xticks_rotation='horizontal')
plt.title(f'Confusion matrix for {extra_trees_classifier}')
plt.ylabel('True label', fontsize = 20)
plt.yticks(fontsize = 20)
plt.xlabel('Predicted label', fontsize = 20)
plt.xticks(fontsize = 20)
# No plt.legend() call: the figure has no labelled artist to put in a legend.
plt.grid(False)
plt.show()

In [None]:
# Save the fitted ExtraTreesClassifier in the results folder (joblib compression level 3)
path_save_name = os.path.join(models_results, 'ExtraTreesClassifier_XS_dataset_with_SMOTE.joblib')
dump(extra_trees_classifier, path_save_name, compress=3)