# En local.

## 1. Importation des librairies nécessaires.

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, adjusted_rand_score
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.utils import to_categorical, set_random_seed
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

from pyspark.sql.functions import element_at, split
from pyspark.sql import SparkSession

2024-09-27 20:36:44.818067: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-27 20:36:44.822823: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-27 20:36:44.835989: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-27 20:36:44.856289: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-27 20:36:44.861479: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-27 20:36:44.876089: I tensorflow/core/platform/cpu_feature_gu

## 2. Transfer learning *sans* PySpark

Commençons par importer le modèle de base :

In [2]:
base_model = MobileNetV2(weights='imagenet',
                         include_top=False,
                         input_shape=(224, 224, 3))

La fonction `create_model` ci-dessous permet de créer un classifieur basé sur un modèle pré-entraîné (transfer learning). Elle prend en entrée un CNN pré-entraîné ainsi que le nombre de classe, gèle les classes du modèle de base, lui ajoute une couche de GlobalAveragePooling2D ainsi qu'une couche dense de classification et renvoi le modèle ainsi obtenu.

In [3]:
def create_model(base_model, nbr_de_classes):
    # Gel des poids du modèle de base
    for layer in base_model.layers:
        layer.trainable = False
    # Ajout des couches de classification
    model = Sequential([base_model,
                        GlobalAveragePooling2D(),
                        Dense(1024, activation='relu'),
                        Dense(nbr_de_classes, activation='softmax')
                       ])
    return model

La fonction `metrics_display` ci-dessous prends en entrée les étiquettes réelles et prédites et affiche cinq métriques de classification : l'**accuracy**, la **precision**, le **recall**, le **f1** et l'**ARI** (qui est plutôt une métrique de segmentation).

In [4]:
def metrics_display(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average='macro')
    recall = recall_score(true_labels, predicted_labels, average='macro')
    f1 = f1_score(true_labels, predicted_labels, average='macro')
    ARI = adjusted_rand_score(true_labels, predicted_labels)
    print(f'accuracy = {accuracy}   precision = {precision}   recall={recall}   f1 = {f1}   ARI = {ARI}')

La fonction `train_and_test_model` ci-dessous permet de prétraiter des images d'entraînement et de test enregistrées localement et déjà rangées par catégories (grâce à la fonction **ImageDataGenerator**) puis d'entraîner et d'évaluer un modèle :

In [5]:
def train_and_test_model(train_dir,
                         test_dir,
                         model,
                         seed_value=42,
                         target_size=(224, 224),
                         optimizer=Nadam,
                         learning_rate=0.001,
                         epochs=10
                        ):
    
    # Fixation des germes des générateurs aléatoires pour assurer la reproductibilité des résultats.
    set_random_seed(seed_value)
    
    # Préparation du générateur de données
    generator = ImageDataGenerator(rescale=1./255)
    
    train_generator = generator.flow_from_directory(train_dir,
                                                    target_size=target_size,
                                                    class_mode='categorical',
                                                    seed=seed_value
                                                    )

    test_generator = generator.flow_from_directory(test_dir,
                                                   target_size=target_size,
                                                   class_mode='categorical',
                                                   shuffle=False,
                                                   seed=seed_value
                                                   )
    
    # Compilation du modèle
    model.compile(optimizer=optimizer(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Construction des callbacks
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

    # Entraînement du modèle
    history = model.fit(train_generator,
                        validation_data=test_generator,
                        callbacks=[early_stopping, reduce_lr],
                        epochs=epochs)

    # Obtention des étiquettes prédites et réelles
    predictions = model.predict(test_generator)
    y_pred = np.argmax(predictions, axis=1)
    y_true = test_generator.classes
    
    # Évaluation des performances du modèle
    metrics_display(y_true, y_pred)

In [5]:
# Chemins des répertoires où se trouvent les images d'entraînement et de test.
train_dir = '../data/train_local'
test_dir = '../data/test_local'

On crée notre classifieur :

In [7]:
model = create_model(base_model, 3)
model.summary()

On l'entraîne et on l'évalue :

In [8]:
train_and_test_model(train_dir, test_dir, model)

Found 1381 images belonging to 3 classes.
Found 461 images belonging to 3 classes.
Epoch 1/10


  self._warn_if_super_not_called()
2024-09-27 19:37:20.467994: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 154140672 exceeds 10% of free system memory.
2024-09-27 19:37:20.611470: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 156905472 exceeds 10% of free system memory.


[1m 1/44[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:47[0m 7s/step - accuracy: 0.4688 - loss: 1.0155

2024-09-27 19:37:21.628248: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 154140672 exceeds 10% of free system memory.
2024-09-27 19:37:21.769587: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 156905472 exceeds 10% of free system memory.


[1m 2/44[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m51s[0m 1s/step - accuracy: 0.5469 - loss: 0.8952 

2024-09-27 19:37:22.863454: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 154140672 exceeds 10% of free system memory.


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 2s/step - accuracy: 0.9215 - loss: 0.1734 - val_accuracy: 1.0000 - val_loss: 1.5460e-04 - learning_rate: 0.0010
Epoch 2/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 1s/step - accuracy: 1.0000 - loss: 3.3205e-05 - val_accuracy: 1.0000 - val_loss: 1.0210e-04 - learning_rate: 0.0010
Epoch 3/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 1s/step - accuracy: 1.0000 - loss: 7.5790e-06 - val_accuracy: 1.0000 - val_loss: 3.7303e-05 - learning_rate: 0.0010
Epoch 4/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 1s/step - accuracy: 1.0000 - loss: 2.6774e-06 - val_accuracy: 1.0000 - val_loss: 3.3587e-05 - learning_rate: 0.0010
Epoch 5/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 1s/step - accuracy: 1.0000 - loss: 1.5407e-06 - val_accuracy: 1.0000 - val_loss: 1.2605e-05 - learning_rate: 0.0010
Epoch 6/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

## 3. Transfer learning *avec* PySpark 

In [6]:
# Démarrage d'une session Spark
spark = (SparkSession.builder
                     .appName('P8')
                     .master('local')
                     .config("spark.driver.memory", "4g")
                     .config("spark.executor.memory", "3g")
                     .config("spark.sql.shuffle.partitions", "32")
                     .getOrCreate())

24/09/27 20:37:11 WARN Utils: Your hostname, chakir-Latitude-7390 resolves to a loopback address: 127.0.1.1; using 192.168.43.175 instead (on interface wlp2s0)
24/09/27 20:37:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/27 20:37:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
sc = spark.sparkContext

In [8]:
spark

Il n'est plus possible d'utiliser la fonction **ImageDataGenerator** pour charger et prétraiter les images (mais aussi les étiquettes !), car elle est incompatible avec Spark. Nous allons donc créer deux fonctions. 
La première, `get_images_path_and_labels`, permet de récupérer les chemins des images ainsi que leurs étiquettes.
La deuxième, `preprocess_image`permet de charger et prétraiter une image à partir de son chemin.

In [9]:
# Fonction pour obtenir les chemins des images et leurs étiquettes
def get_image_paths_and_labels(spark_session, dir_path):
    
    images = (spark_session.read.format('binaryFile')
                           .option('pathGlobFilter', '*.jpg')
                           .option('recursiveFileLookup', 'true')
                           .load(dir_path))

    images = images.withColumn('label', element_at(split(images['path'], '/'),-2))
    
    return images

In [10]:
# Fonction de prétraitement d'images (redimensionnement, normalisation)
def preprocess_image(row, target_size=(224, 224)):
        image_path = row['path'].replace('file:', '')
        img = load_img(image_path, target_size=target_size)
        img_array = img_to_array(img)
        img_array /= 255.0
        return img_array, row['label']

La fonction `train_and_test_model_spark` est une adaptation à spark de la fonction **train_and_test_model** précédente.

In [20]:
def train_and_test_model_spark(train_dir, 
                               test_dir, 
                               model,
                               spark_session,
                               seed_value=42, 
                               target_size=(224, 224), 
                               optimizer=Nadam, 
                               learning_rate=0.001, 
                               epochs=10):
    
    # Fixation des germes pour la reproductibilité des résultats
    set_random_seed(seed_value)
    
    # Obtention des chemins des images et leurs étiquettes
    train_df = get_image_paths_and_labels(spark_session, train_dir)
    test_df = get_image_paths_and_labels(spark_session, test_dir)
    
    # Transformation des données en parallèle sur Spark
    train_data_rdd = train_df.rdd.map(preprocess_image)
    test_data_rdd = test_df.rdd.map(preprocess_image)

    # Collecte des données en mémoire locale pour l'entraînement du modèle
    train_data = train_data_rdd.collect()
    test_data = test_data_rdd.collect()
    
    # Séparation des données et des étiquettes
    X_train, y_train = zip(*train_data)
    X_test, y_test = zip(*test_data)

    # Conversion en array numpy
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    # Encodage des étiquettes en one-hot
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.transform(y_test)
    y_train_cnn = to_categorical(y_train)
    y_test_cnn = to_categorical(y_test)

    # Compiler le modèle
    model.compile(optimizer=optimizer(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    # Callbacks pour le modèle
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)
    
    # Entraîner le modèle
    history = model.fit(X_train, y_train_cnn,
                        validation_data=(X_test, y_test_cnn),
                        callbacks=[early_stopping, reduce_lr],
                        epochs=epochs)
    
    # Prédictions
    predictions = model.predict(X_test)
    y_pred = np.argmax(predictions, axis=1)
    
    # Évaluation du modèle
    metrics_display(y_test, y_pred)

On crée un classifieur :

In [21]:
model = create_model(base_model, 3)

On diffuse ses poids à tous les noeuds du cluster, sans quoi certain workers ne pourrait pas réaliser leurs tâches :

In [22]:
brodcast_weights = sc.broadcast(model.get_weights())

On entraîne et on évalue le modèle :

In [23]:
train_and_test_model_spark(train_dir, test_dir, model, spark)

Exception in thread "serve RDD 26" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:474)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:65)
                                                                                

Epoch 1/10


2024-09-27 20:42:41.687631: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 831516672 exceeds 10% of free system memory.


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 1s/step - accuracy: 0.8898 - loss: 0.2945 - val_accuracy: 1.0000 - val_loss: 3.2307e-04 - learning_rate: 0.0010
Epoch 2/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 1s/step - accuracy: 1.0000 - loss: 5.2905e-05 - val_accuracy: 1.0000 - val_loss: 1.7070e-04 - learning_rate: 0.0010
Epoch 3/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 1s/step - accuracy: 1.0000 - loss: 1.9368e-05 - val_accuracy: 1.0000 - val_loss: 1.0505e-04 - learning_rate: 0.0010
Epoch 4/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 1s/step - accuracy: 1.0000 - loss: 1.0073e-05 - val_accuracy: 1.0000 - val_loss: 7.2057e-05 - learning_rate: 0.0010
Epoch 5/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 1s/step - accuracy: 1.0000 - loss: 6.2570e-06 - val_accuracy: 1.0000 - val_loss: 4.5939e-05 - learning_rate: 0.0010
Epoch 6/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m