# Final_training ResNet50V2

L'objectif de ce notebook est de réaliser l'entrainement final de ResNet50V2 avec les hyperparamètres identifiés, sur un peu plus d'époques.



In [None]:
import os
import pandas as pd
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import cv2
import time
import random
import seaborn as sns
from joblib import dump

import  keras
import tensorflow as tf # Utilisation de tensorflow v2.9.1
from tensorflow.keras.applications.resnet_v2 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.applications.efficientnet import EfficientNetB0
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.applications.resnet_v2 import ResNet50V2
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras import optimizers
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
from tensorflow.keras import backend as K
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Locations of the source folders and files; they all live under one dataset directory
_dataset_root = r'C:\Users\renamedadmin\Documents\Formation_Datascience\Projet_Datascientest_Champignons\Dossier_technique\02_Pieces_constitutives\Dataset'

images_dataset = _dataset_root + r'\FFD_images_dataset'
train_dataset = _dataset_root + r'\train_FFDataframe_full_undersampling.csv'
test_dataset = _dataset_root + r'\test_FFDataframe_full_undersampling.csv'
validation_dataset = _dataset_root + r'\val_FFDataframe_full.csv'

# Folder where the results obtained with the models are saved
save_models_results = _dataset_root + r'\Models_results'

In [None]:
# A few helper functions used by the notebook

# Plot the training metrics (accuracy, loss) recorded for a model
def plot_scores(model, title):
    '''
    Draw the accuracy and loss curves side by side.

    Arg :
    model : object whose ``history`` mapping holds the 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' series
            (presumably the History object returned by training - confirm)
    title : text set as the left-aligned title of the loss panel
    Return:
    None; the figure is displayed with plt.show()
    '''
    # Both calls change global plotting state (seaborn theme, default figure size)
    sns.set()
    plt.rcParams['figure.figsize'] = [14, 4]

    fig = plt.figure()
    fig.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0.3, hspace=0.3)

    # Create the two panels first, then fill them in order
    panels = [fig.add_subplot(1, 2, slot) for slot in (1, 2)]
    layout = (
        ('accuracy', 'Accuracy', "lower right"),
        ('loss', 'Loss', "upper right"),
    )
    for axis, (metric, y_label, legend_corner) in zip(panels, layout):
        axis.plot(model.history[metric], label="train")
        axis.plot(model.history['val_' + metric], label="test")
        axis.legend(loc=legend_corner)
        axis.set_xlabel('Epochs')
        axis.set_ylabel(y_label)

    # The loss panel is the current axes, so this is where the title lands
    panels[-1].set_title(title, loc="left")
    plt.show()
    
# Display the confusion matrix computed on the validation dataset
def show_confusion_matrix(model):
    '''
    Predict on the validation generator and plot the confusion matrix.

    Reads the notebook-level names val_generator, val_steps and df_val.

    Args :
    model : model used to make the predictions (must expose predict())

    Return :
    None; plot of the confusion matrix

    NOTE(review): the true labels are taken from df_val in row order, so
    val_generator has to yield the images in that same order (i.e. be built
    with shuffle=False) - confirm where the generator is created.
    '''
    # Run the predictions and keep, for every sample, the index of the highest score
    model_pred = model.predict(val_generator, steps=val_steps, verbose=1)
    y_pred = np.argmax(model_pred, axis=1)
    y_val = df_val.edible.to_list()
    confusion_mtx = confusion_matrix(y_val, y_pred)

    # Use the model's name in the title; formatting the model object itself
    # would print its object repr instead of something readable.
    model_label = getattr(model, 'name', model)

    plt.rcParams['font.size'] = 20
    disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mtx)
    disp.plot(cmap='Blues', values_format='d', xticks_rotation='horizontal', colorbar = False)
    plt.title(f'Confusion matrix for {model_label}')
    plt.ylabel('True label', fontsize = 20)
    plt.yticks(fontsize = 20)
    plt.xlabel('Predicted label', fontsize = 20)
    plt.xticks(fontsize = 20)
    plt.grid(False)
    plt.show()
    
# Small helper that compiles a model
def compile_model(model, optimizer, loss, metrics):
    '''
    Compile ``model`` in place by forwarding the settings to model.compile().

    Args :
    model : model to compile
    optimizer : optimizer to use during training
    loss : loss function to use during training, given as : "loss"
    metrics : metrics to evaluate during training, given as : ["metrics"]
    '''
    compile_settings = {
        "optimizer": optimizer,
        "loss": loss,
        "metrics": metrics,
    }
    model.compile(**compile_settings)


In [None]:
# Load the dataframes
df_train = pd.read_csv(train_dataset)
df_test = pd.read_csv(test_dataset)
df_val = pd.read_csv(validation_dataset)

# Show a few facts about the dataframes. DataFrame.info() prints its report
# and returns None, so it is called on its own rather than passed to
# display(), which would only render "None".
display(df_train.head())
df_test.info()
df_val.info()

# Data for the class-distribution chart
dataframes = [df_train, df_test, df_val]
data = ['df_train', 'df_test', 'df_val']

# Count the classes once per dataframe. .get(label, 0) yields 0 when a class
# is missing from a split instead of raising KeyError.
class_counts = [dataframe['edible'].value_counts() for dataframe in dataframes]
inedible = [counts.get(0, 0) for counts in class_counts]
edible = [counts.get(1, 0) for counts in class_counts]
edibility = {'inedible': inedible, 'edible': edible}

# Stacked bar chart: one bar per dataframe, one segment per category
width = 0.6
fig, ax = plt.subplots()
bottom = np.zeros(len(data))
for i, j in edibility.items():
    p = ax.bar(data, j, width, label=i, bottom=bottom)
    bottom += j  # the next category is stacked on top of this one
    ax.bar_label(p, label_type='center')
ax.set_title('Number of images by category')
ax.legend(title = 'categories')

plt.show()


## Création du modèle

In [None]:
# Build, by transfer learning, a ResNet50V2-based model with two outputs
_backbone = ResNet50V2(include_top=False, pooling="avg", weights='imagenet')

# Freeze every layer of the pre-trained backbone
for _backbone_layer in _backbone.layers:
    _backbone_layer.trainable = False

# New head: a 2-unit dense layer followed by a softmax activation
_class_scores = Dense(2)(_backbone.layers[-1].output)
_class_probabilities = Activation('softmax')(_class_scores)

TL_ResNet50V2 = Model(_backbone.input, _class_probabilities, name='TL_ResNet50V2')
TL_ResNet50V2.summary()



## Entrainement

In [None]:
# A few training parameters
SEED = 3
# NOTE(review): W and H are not referenced by the cells visible in this notebook - confirm they are still needed
W = H = 224
batch_size, epochs = 128, 30

loss_function = "hinge"
metrics = ["accuracy"]
optimizer = optimizers.Adam(learning_rate=0.001)

In [None]:
# Options common to the training and test generators
_flow_options = {
    "x_col": "filename",
    "y_col": "edible",
    "class_mode": "categorical",
    "batch_size": batch_size,
}

# Data generator for the training dataset (labels are turned into strings first)
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
df_train["edible"] = df_train["edible"].map(str)
train_generator = train_datagen.flow_from_dataframe(
    df_train,
    images_dataset,
    shuffle=True,
    seed=SEED,
    **_flow_options,
)

# Data generator for the test dataset (labels are turned into strings first)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
df_test["edible"] = df_test["edible"].map(str)
test_generator = test_datagen.flow_from_dataframe(
    df_test,
    images_dataset,
    **_flow_options,
)

In [None]:
# Compile the model
compile_model(TL_ResNet50V2, optimizer, loss = loss_function, metrics = metrics)

# Callbacks: keep the weights of the epoch with the best val_loss, and append
# the per-epoch metrics to a CSV file (written relative to the working directory)
checkpointer_ResNet50V2 = ModelCheckpoint(filepath=os.path.join(save_models_results, "VF_ResNet50V2.hdf5"),
                                            monitor='val_loss',
                                            save_best_only=True,
                                            mode='auto')
CSV_logger_ResNet50V2 = CSVLogger(filename = 'logger_VF_ResNet50V2.csv',
                                    separator=',',
                                    append = True)
callbacks_ResNet50V2 = [checkpointer_ResNet50V2, CSV_logger_ResNet50V2]

# Train the model and time the run.
# Model.fit accepts generators directly; Model.fit_generator is deprecated
# and takes the same arguments, so the call is otherwise unchanged.
start_time = time.time()
history_VF_ResNet50V2 = TL_ResNet50V2.fit(train_generator,
                                          epochs=epochs,
                                          validation_data=test_generator,
                                          validation_steps=len(df_test)//batch_size,
                                          steps_per_epoch=len(df_train)//batch_size,
                                          callbacks=callbacks_ResNet50V2)

end_time = time.time()
print("Durée de l'entrainement :", end_time - start_time)

## Affichage des performances

In [None]:
# Show the training curves
plot_scores(model=history_VF_ResNet50V2,
            title="entrainement final de ResNet50V2")

In [None]:
# Data generator used to predict on the test dataset.
# shuffle=False is required: flow_from_dataframe shuffles by default, and the
# predictions are later compared with the labels taken from the dataframe in
# row order, so the images must be yielded in that same order.
test_datagen = ImageDataGenerator(preprocessing_function = preprocess_input)

test_generator = test_datagen.flow_from_dataframe(df_test, images_dataset,
                                                  x_col="filename",
                                                  class_mode=None,
                                                  batch_size=1,
                                                  shuffle=False)
# With batch_size=1, one prediction step is needed per row
test_steps = len(df_test)
df_test["edible"] = df_test["edible"].apply(int)

# Data generator used to predict on the validation dataset (same ordering constraint)
val_datagen = ImageDataGenerator(preprocessing_function = preprocess_input)

val_generator = val_datagen.flow_from_dataframe(df_val, images_dataset,
                                                  x_col="filename",
                                                  class_mode=None,
                                                  batch_size=1,
                                                  shuffle=False)
val_steps = len(df_val)
df_val["edible"] = df_val["edible"].apply(int)


In [None]:
# Confusion matrix on the test dataset
# NOTE(review): y_test follows df_test row order, so test_generator must yield
# the images in that same order - confirm how the generator was built.
test_steps = len(df_test)
test_pred = TL_ResNet50V2.predict(test_generator, steps=test_steps, verbose=1)

# Predicted class = index of the highest score of each prediction
y_pred_test = [np.argmax(scores) for scores in test_pred]
y_test = df_test["edible"].to_list()

confusion_mtx = confusion_matrix(y_test, y_pred_test)

_label_size = 20
plt.rcParams['font.size'] = _label_size
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mtx)
disp.plot(
    cmap='Blues',
    values_format='d',
    xticks_rotation='horizontal',
    colorbar=False,
)
plt.title('Confusion matrix for ResNet50v2 dataset de test')
plt.ylabel('True label', fontsize=_label_size)
plt.yticks(fontsize=_label_size)
plt.xlabel('Predicted label', fontsize=_label_size)
plt.xticks(fontsize=_label_size)
plt.grid(False)
plt.show()

In [None]:
# Confusion matrix on the validation dataset
# NOTE(review): y_val follows df_val row order, so val_generator must yield
# the images in that same order - confirm how the generator was built.
val_pred = TL_ResNet50V2.predict(val_generator, steps=val_steps, verbose=1)

# Predicted class = index of the highest score of each prediction
y_pred_val = [np.argmax(scores) for scores in val_pred]
y_val = df_val["edible"].to_list()

confusion_mtx = confusion_matrix(y_val, y_pred_val)

_label_size = 20
plt.rcParams['font.size'] = _label_size
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mtx)
disp.plot(
    cmap='Blues',
    values_format='d',
    xticks_rotation='horizontal',
    colorbar=False,
)
plt.title('Confusion matrix for ResNet50v2 dataset de validation')
plt.ylabel('True label', fontsize=_label_size)
plt.yticks(fontsize=_label_size)
plt.xlabel('Predicted label', fontsize=_label_size)
plt.xticks(fontsize=_label_size)
plt.grid(False)
plt.show()

In [None]:
# Score at index 1 (the "edible" class) of every df_val prediction
pred_val_edible = pd.Series([scores[1] for scores in val_pred])

# Density plot of those scores
fig = plt.figure(figsize=(5, 5))
# Alternative left disabled: plt.hist(pred_val_edible, bins = 20, density = True)

ax = pred_val_edible.plot.density()
ax.set(xlim=(0, 1), xlabel='probs', ylabel='density')

In [None]:
# Save the training results of the VF_ResNet50V2 model (joblib, compression level 3)
path_save_name = os.path.join(save_models_results, 'history_VF_ResNet50V2.joblib')
dump(value=history_VF_ResNet50V2, filename=path_save_name, compress=3)

