In [0]:
import pandas as pd
from PIL import Image
import numpy as np
import io
import sys
import os
import random
import shutil

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, adjusted_rand_score
from sklearn.preprocessing import LabelEncoder

os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D
from tensorflow.keras.utils import to_categorical, set_random_seed
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

from pyspark.sql.functions import col, udf, pandas_udf, PandasUDFType, element_at, split
from pyspark.sql.types import ArrayType, FloatType
from pyspark.sql import SparkSession

## En local, sans PySpark

In [0]:
# Pre-trained MobileNetV2 backbone: ImageNet weights, no classification head,
# built for 224x224 inputs with 3 channels.
base_model = MobileNetV2(
    input_shape=(224, 224, 3),
    include_top=False,
    weights='imagenet',
)

In [0]:
def create_model(base_model, nbr_de_classes):
    """Stack a classification head on top of a frozen backbone.

    Every layer of ``base_model`` is marked non-trainable (the backbone object
    is modified in place). The returned ``Sequential`` model chains the
    backbone, a global average pooling layer, a 1024-unit ReLU layer and a
    softmax layer with ``nbr_de_classes`` outputs. The model is not compiled.
    """
    # Freeze the backbone so that only the new head can be trained
    for backbone_layer in base_model.layers:
        backbone_layer.trainable = False

    # Classification head
    classifier = Sequential()
    classifier.add(base_model)
    classifier.add(GlobalAveragePooling2D())
    classifier.add(Dense(1024, activation='relu'))
    classifier.add(Dense(nbr_de_classes, activation='softmax'))
    return classifier

In [0]:
# Classifier with 3 output classes, followed by its layer-by-layer summary
model = create_model(base_model, nbr_de_classes=3)
model.summary()

In [0]:
def metrics_display(true_labels, predicted_labels):
    """Print accuracy, macro-averaged precision/recall/F1 and the adjusted Rand index.

    Both arguments are passed unchanged to the scikit-learn metric functions.
    Nothing is returned; the scores are printed on a single line.
    """
    macro = {'average': 'macro'}
    acc = accuracy_score(true_labels, predicted_labels)
    prec = precision_score(true_labels, predicted_labels, **macro)
    rec = recall_score(true_labels, predicted_labels, **macro)
    f1_macro = f1_score(true_labels, predicted_labels, **macro)
    ari = adjusted_rand_score(true_labels, predicted_labels)
    print(f'accuracy = {acc}   precision = {prec}   recall={rec}   f1 = {f1_macro}   ARI = {ari}')

In [0]:
def train_and_test_model(train_dir,
                         test_dir,
                         model,
                         seed_value=42,
                         target_size=(224, 224),
                         optimizer=Nadam,
                         learning_rate=0.001,
                         epochs=10
                        ):
    """Train ``model`` on the images of ``train_dir`` and report metrics on ``test_dir``.

    Images are read with ``ImageDataGenerator.flow_from_directory`` (pixels
    rescaled by 1/255, one-hot labels). The model is compiled with
    ``optimizer(learning_rate=learning_rate)`` and categorical cross-entropy,
    trained for at most ``epochs`` epochs with early stopping and learning-rate
    reduction on ``val_loss``, then used to predict the test images. The
    resulting scores are printed by ``metrics_display``; nothing is returned.

    NOTE(review): the test generator is also used as validation data, so the
    callbacks are driven by the same images the final metrics are computed on.
    """
    # Seed the random generators so that runs are reproducible
    set_random_seed(seed_value)

    # Data generators: identical settings, except that the test flow keeps file order
    image_generator = ImageDataGenerator(rescale=1./255)
    shared_flow_args = dict(target_size=target_size,
                            class_mode='categorical',
                            seed=seed_value)
    train_flow = image_generator.flow_from_directory(train_dir, **shared_flow_args)
    test_flow = image_generator.flow_from_directory(test_dir, shuffle=False, **shared_flow_args)

    # Compile the model
    model.compile(optimizer=optimizer(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Callbacks, both monitoring the validation loss
    training_callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001),
    ]

    # Training
    model.fit(train_flow,
              validation_data=test_flow,
              callbacks=training_callbacks,
              epochs=epochs)

    # Predicted class = index of the highest softmax output; true classes come
    # from the unshuffled test flow
    class_probabilities = model.predict(test_flow)
    predicted_classes = np.argmax(class_probabilities, axis=1)

    # Performance report
    metrics_display(test_flow.classes, predicted_classes)

In [0]:
# Local training and test image folders (relative paths)
train_dir = '../data/train_local'
test_dir = '../data/test_local'

In [0]:
# Local run with the default seed, image size, optimizer and number of epochs
train_and_test_model(train_dir=train_dir, test_dir=test_dir, model=model)

Found 1381 images belonging to 3 classes.
Found 461 images belonging to 3 classes.
Epoch 1/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 920ms/step - accuracy: 0.8996 - loss: 0.1968 - val_accuracy: 1.0000 - val_loss: 1.2202e-04 - learning_rate: 0.0010
Epoch 2/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 906ms/step - accuracy: 1.0000 - loss: 2.9620e-05 - val_accuracy: 1.0000 - val_loss: 6.4307e-05 - learning_rate: 0.0010
Epoch 3/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 902ms/step - accuracy: 1.0000 - loss: 6.8215e-06 - val_accuracy: 1.0000 - val_loss: 3.1388e-05 - learning_rate: 0.0010
Epoch 4/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 996ms/step - accuracy: 1.0000 - loss: 3.3063e-06 - val_accuracy: 1.0000 - val_loss: 2.3720e-05 - learning_rate: 0.0010
Epoch 5/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 925ms/step - accuracy: 1.0000 - loss: 2.1207e-06 - val_accuracy: 

## En local, avec PySpark

In [0]:
# Python interpreter handed to PySpark (driver and executors) in the cells below.
# The virtualenv path only exists on the original development machine, so fall
# back to the interpreter running this notebook when it is missing.
_VENV_PYTHON = '/home/chakir/.local/share/virtualenvs/Projet8-N0cGnpCf/bin/python'
env_projet = _VENV_PYTHON if os.path.isfile(_VENV_PYTHON) else sys.executable

In [0]:
# Make PySpark use the project interpreter for both the workers and the driver
os.environ.update(PYSPARK_PYTHON=env_projet,
                  PYSPARK_DRIVER_PYTHON=env_projet)

In [0]:
# Start (or reuse) a local Spark session whose executors run the project interpreter
spark = (
    SparkSession.builder
    .master('local')
    .appName('P8')
    .config('spark.executorEnv.PYSPARK_PYTHON', env_projet)
    .getOrCreate()
)

24/09/17 14:03:21 WARN Utils: Your hostname, chakir-Latitude-7390 resolves to a loopback address: 127.0.1.1; using 192.168.43.175 instead (on interface wlp2s0)
24/09/17 14:03:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/17 14:03:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/17 14:03:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/17 14:03:22 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [0]:
# Handle on the session's SparkContext (passed to create_model_spark further down)
sc = spark.sparkContext

In [0]:
# Display the session object as the cell output
spark

In [0]:
# Read every .jpg found recursively under the train and test folders as binary files
images_train = (
    spark.read.format('binaryFile')
    .option('recursiveFileLookup', 'true')
    .option('pathGlobFilter', '*.jpg')
    .load(train_dir)
)

images_test = (
    spark.read.format('binaryFile')
    .option('recursiveFileLookup', 'true')
    .option('pathGlobFilter', '*.jpg')
    .load(test_dir)
)

In [0]:
# The label is the second-to-last '/'-separated segment of the path,
# i.e. the folder that contains the image file
images_train = images_train.withColumn(
    'label',
    element_at(split(col('path'), '/'), -2),
)
images_test = images_test.withColumn(
    'label',
    element_at(split(col('path'), '/'), -2),
)

In [0]:
# printSchema() and show() write to stdout themselves and return None, so
# wrapping them in print() only added a stray "None" line to the output.
images_train.printSchema()
images_test.select('path', 'label').show(5, truncate=False)

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)

None
+-------------------------------------------------------------------------------------+--------------+
|path                                                                                 |label         |
+-------------------------------------------------------------------------------------+--------------+
|file:/home/chakir/Bureau/PROJETS/Projet8/data/test_local/Apple Braeburn/r_326_100.jpg|Apple Braeburn|
|file:/home/chakir/Bureau/PROJETS/Projet8/data/test_local/Apple Braeburn/r_4_100.jpg  |Apple Braeburn|
|file:/home/chakir/Bureau/PROJETS/Projet8/data/test_local/Apple Braeburn/r_8_100.jpg  |Apple Braeburn|
|file:/home/chakir/Bureau/PROJETS/Projet8/data/test_local/Apple Braeburn/r_324_100.jpg|Apple Braeburn|
|file:/home/chakir/Bureau/PROJETS/Projet8/data/test_local/Apple Braeb

In [0]:
def create_model_spark(spark_context, base_model, nbr_de_classes):
    """Build the transfer-learning classifier used in the PySpark sections.

    Every layer of ``base_model`` is marked non-trainable (the backbone object
    is modified in place), then global average pooling, a 1024-unit ReLU layer
    and a softmax layer with ``nbr_de_classes`` outputs are stacked on top.

    Parameters
    ----------
    spark_context : kept only so existing calls keep working; it is not used.
        The previous version broadcast the model weights through it into a
        local variable that was never read or returned, so the broadcast was
        pure overhead and has been removed.
    base_model : backbone model exposing a ``layers`` attribute.
    nbr_de_classes : number of output classes.

    Returns
    -------
    The uncompiled ``Sequential`` model.
    """
    # Freeze the backbone so that only the new head can be trained
    for layer in base_model.layers:
        layer.trainable = False
    # Classification head
    model = Sequential([base_model,
                        GlobalAveragePooling2D(),
                        Dense(1024, activation='relu'),
                        Dense(nbr_de_classes, activation='softmax')
                       ])
    return model

In [0]:
# Image preprocessing helper (resizing, normalisation)
def preprocess_image(image_path, target_size=(224, 224)):
    """Load the image at ``image_path``, resize it and divide its pixels by 255.

    Returns the pixel array converted to nested Python lists.
    """
    pixels = img_to_array(load_img(image_path, target_size=target_size))
    return (pixels / 255.0).tolist()

# Helper returning the image files of a folder together with their labels
def get_image_paths_and_labels(spark_session, dir_path):
    """Read all .jpg files under ``dir_path`` (recursively) as a binaryFile DataFrame.

    A ``label`` column is added, holding the second-to-last '/'-separated
    segment of each file path (the folder containing the file).
    """
    reader = spark_session.read.format('binaryFile')
    reader = reader.option('pathGlobFilter', '*.jpg')
    reader = reader.option('recursiveFileLookup', 'true')
    files_df = reader.load(dir_path)

    parent_folder = element_at(split(files_df['path'], '/'), -2)
    return files_df.withColumn('label', parent_folder)

In [0]:
def train_and_test_model_spark(train_dir, 
                               test_dir, 
                               model,
                               spark_session,
                               seed_value=42, 
                               target_size=(224, 224), 
                               optimizer=Nadam, 
                               learning_rate=0.001, 
                               epochs=10):
    """Train and evaluate ``model`` on images listed through Spark (local files).

    Spark is used to list the image files and derive their labels; the images
    themselves are then loaded and preprocessed on the driver with
    ``preprocess_image``. The model is compiled with
    ``optimizer(learning_rate=learning_rate)`` and categorical cross-entropy,
    trained for at most ``epochs`` epochs, and the scores on the test images
    are printed by ``metrics_display``. Nothing is returned.

    NOTE(review): the test set is also used as validation data, so the
    callbacks are driven by the same images the final metrics are computed on.
    """
    # Seed the random generators so that runs are reproducible
    set_random_seed(seed_value)

    # Image paths and labels
    train_df = get_image_paths_and_labels(spark_session, train_dir)
    test_df = get_image_paths_and_labels(spark_session, test_dir)

    # Only path and label are needed on the driver: the images are re-read from
    # disk below, so the binary 'content' column is left out of the transfer.
    train_data = train_df.select('path', 'label').toPandas()
    test_data = test_df.select('path', 'label').toPandas()

    def _strip_file_scheme(path):
        # Remove only the leading "file:" scheme, never a later occurrence
        return path[len('file:'):] if path.startswith('file:') else path

    train_paths = train_data['path'].map(_strip_file_scheme)
    test_paths = test_data['path'].map(_strip_file_scheme)

    # Image preprocessing; float32 matches what img_to_array produced and
    # halves the memory of the default float64 conversion.
    X_train = np.array([preprocess_image(path, target_size=target_size) for path in train_paths],
                       dtype=np.float32)
    X_test = np.array([preprocess_image(path, target_size=target_size) for path in test_paths],
                      dtype=np.float32)

    # Integer-encode the labels (the encoder is fitted on the training labels only)
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(train_data['label'].values)
    y_test = encoder.transform(test_data['label'].values)

    # One-hot encoding with an explicit width, so that both matrices have one
    # column per training class even if the test set lacks the last class
    n_classes = len(encoder.classes_)
    y_train_cnn = to_categorical(y_train, num_classes=n_classes)
    y_test_cnn = to_categorical(y_test, num_classes=n_classes)

    # Compile the model
    model.compile(optimizer=optimizer(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Callbacks, both monitoring the validation loss
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

    # Training
    model.fit(X_train, y_train_cnn,
              validation_data=(X_test, y_test_cnn),
              callbacks=[early_stopping, reduce_lr],
              epochs=epochs)

    # Predicted class = index of the highest softmax output
    predictions = model.predict(X_test)
    y_pred = np.argmax(predictions, axis=1)

    # Evaluation
    metrics_display(y_test, y_pred)

In [0]:
# Fresh 3-class head on top of the shared backbone for the PySpark run
model = create_model_spark(spark_context=sc, base_model=base_model, nbr_de_classes=3)

In [0]:
# PySpark run with the default seed, image size, optimizer and number of epochs
train_and_test_model_spark(train_dir=train_dir,
                           test_dir=test_dir,
                           model=model,
                           spark_session=spark)

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:136)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:136)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:133)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:133)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:728)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:446)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:446)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

## Sur le cloud AWS via Databricks

In [None]:
# Start (or reuse) the Spark session named 'P8'
spark = SparkSession.builder.appName('P8').getOrCreate()

In [0]:
# Handle on the session's SparkContext (passed to create_model_spark further down)
sc = spark.sparkContext

In [0]:
# Display the session object as the cell output
spark

In [0]:
# Cloud locations of the training and test image folders.
# NOTE(review): these use the gs:// scheme and the bucket name 'chakir-p8',
# while this section is titled AWS and the last cell of the notebook reads from
# s3://chakir-p8-data — confirm which storage/bucket is the intended one.
train_dir = 'gs://chakir-p8/data/train_local'
test_dir = 'gs://chakir-p8/data/test_local'

In [None]:
def create_model_spark(spark_context, base_model, nbr_de_classes):
    """Build the transfer-learning classifier used in the PySpark sections.

    Every layer of ``base_model`` is marked non-trainable (the backbone object
    is modified in place), then global average pooling, a 1024-unit ReLU layer
    and a softmax layer with ``nbr_de_classes`` outputs are stacked on top.

    Parameters
    ----------
    spark_context : kept only so existing calls keep working; it is not used.
        The previous version broadcast the model weights through it into a
        local variable that was never read or returned, so the broadcast was
        pure overhead and has been removed.
    base_model : backbone model exposing a ``layers`` attribute.
    nbr_de_classes : number of output classes.

    Returns
    -------
    The uncompiled ``Sequential`` model.
    """
    # Freeze the backbone so that only the new head can be trained
    for layer in base_model.layers:
        layer.trainable = False
    # Classification head
    model = Sequential([base_model,
                        GlobalAveragePooling2D(),
                        Dense(1024, activation='relu'),
                        Dense(nbr_de_classes, activation='softmax')
                       ])
    return model

In [0]:
# Helper returning the image files of a folder together with their labels
def get_image_paths_and_labels(spark_session, dir_path):
    """Read all .jpg files under ``dir_path`` (recursively) as a binaryFile DataFrame.

    A ``label`` column is added, holding the second-to-last '/'-separated
    segment of each file path (the folder containing the file).
    """
    jpg_reader = (
        spark_session.read
        .format('binaryFile')
        .option('pathGlobFilter', '*.jpg')
        .option('recursiveFileLookup', 'true')
    )
    files_df = jpg_reader.load(dir_path)

    path_segments = split(files_df['path'], '/')
    return files_df.withColumn('label', element_at(path_segments, -2))

In [0]:
# Preprocess a single image given as raw bytes
def preprocess_image(binary_image):
    """Decode raw image bytes and prepare them for the MobileNetV2-based model.

    The image is converted to RGB, resized to 224x224, passed through
    MobileNetV2's ``preprocess_input`` and returned as a flat list of floats
    (224 * 224 * 3 values).

    Note: this definition replaces the earlier path-based ``preprocess_image``
    of this notebook, which takes a file path instead of bytes.
    """
    image = Image.open(io.BytesIO(binary_image))
    # Force 3 channels: grayscale, palette or RGBA files would otherwise yield
    # arrays that cannot be reshaped to (224, 224, 3) by the training function.
    image = image.convert('RGB')
    image = image.resize((224, 224))
    img_array = img_to_array(image)
    img_array = preprocess_input(img_array)
    return img_array.flatten().tolist()

# Spark UDF applying preprocess_image to a binary column; yields an array of floats
preprocess_image_udf = udf(preprocess_image, ArrayType(FloatType()))

In [0]:
def _rows_to_arrays(images_df, image_shape=(224, 224, 3)):
    """Stream preprocessed rows to the driver and pack them into NumPy arrays.

    Rows are fetched with ``toLocalIterator()`` and each image is converted to
    a float32 array as soon as it arrives, so the driver never holds the whole
    dataset as Python lists of floats.

    Returns ``(images, labels)``: a float32 array of shape
    ``(n_rows,) + image_shape`` and an array of the label values.
    """
    selected = images_df.select('preprocessed_image', 'label')
    images, labels = [], []
    for row in selected.toLocalIterator():
        images.append(np.asarray(row['preprocessed_image'], dtype=np.float32).reshape(image_shape))
        labels.append(row['label'])
    if images:
        stacked = np.stack(images)
    else:
        stacked = np.empty((0,) + tuple(image_shape), dtype=np.float32)
    return stacked, np.array(labels)


def train_and_test_model_spark(train_dir, 
                               test_dir, 
                               model,
                               spark_session,
                               seed_value=42, 
                               target_size=(224, 224), 
                               optimizer=Nadam, 
                               learning_rate=0.001, 
                               epochs=10):
    """Train and evaluate ``model`` on images read and preprocessed through Spark.

    Images are read as binary files, preprocessed by ``preprocess_image_udf``,
    brought back to the driver as NumPy arrays, and used to fit the model with
    ``optimizer(learning_rate=learning_rate)`` and categorical cross-entropy
    for at most ``epochs`` epochs. The scores on the test images are printed by
    ``metrics_display``. Nothing is returned.

    ``target_size`` is accepted for signature compatibility but not used: the
    image size is fixed to 224x224 by the preprocessing UDF.

    The previous version collected every image as a Python list of floats and
    then built float64 arrays from them; the recorded run of that version was
    killed with exit code 137 (possible out-of-memory). Rows are now streamed
    and stored as float32 to reduce driver memory usage.

    NOTE(review): the test set is also used as validation data, so the
    callbacks are driven by the same images the final metrics are computed on.
    """
    # Seed the random generators so that runs are reproducible
    set_random_seed(seed_value)

    # Image files and labels
    train_df = get_image_paths_and_labels(spark_session, train_dir)
    test_df = get_image_paths_and_labels(spark_session, test_dir)

    # Image preprocessing (executed by Spark through the UDF)
    train_df = train_df.withColumn('preprocessed_image', preprocess_image_udf(col('content')))
    test_df = test_df.withColumn('preprocessed_image', preprocess_image_udf(col('content')))

    # Conversion to NumPy arrays for TensorFlow
    X_train, y_train = _rows_to_arrays(train_df)
    X_test, y_test = _rows_to_arrays(test_df)

    # Integer-encode the labels (the encoder is fitted on the training labels only)
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.transform(y_test)

    # One-hot encoding with an explicit width, so that both matrices have one
    # column per training class even if the test set lacks the last class
    n_classes = len(encoder.classes_)
    y_train_cnn = to_categorical(y_train, num_classes=n_classes)
    y_test_cnn = to_categorical(y_test, num_classes=n_classes)

    # Sanity check on the shapes of X_train and X_test
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    # Compile the model
    model.compile(optimizer=optimizer(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Callbacks, both monitoring the validation loss
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

    # Training
    model.fit(X_train, y_train_cnn,
              validation_data=(X_test, y_test_cnn),
              callbacks=[early_stopping, reduce_lr],
              epochs=epochs)

    # Predicted class = index of the highest softmax output
    predictions = model.predict(X_test)
    y_pred = np.argmax(predictions, axis=1)

    # Evaluation
    metrics_display(y_test, y_pred)

In [0]:
# Fresh 3-class head on top of the shared backbone for the cloud run
model = create_model_spark(spark_context=sc, base_model=base_model, nbr_de_classes=3)

In [0]:
# Cloud run with the default seed, image size, optimizer and number of epochs
train_and_test_model_spark(train_dir=train_dir,
                           test_dir=test_dir,
                           model=model,
                           spark_session=spark)

[0;31m---------------------------------------------------------------------------[0m
[0;31mThe Python process exited with exit code 137 (SIGKILL: Killed). This may have been caused by an OOM error. Check your command's memory usage.[0m
[0;31m[0m
[0;31m[0m
[0;31m[0m
[0;31mThe last 10 KB of the process's stderr and stdout can be found below. See driver logs for full logs.[0m
[0;31m---------------------------------------------------------------------------[0m
[0;31mLast messages on stderr:[0m
[0;31mThu Sep 26 10:46:12 2024 Connection to spark from PID  6116[0m
[0;31mThu Sep 26 10:46:12 2024 Initialized gateway on port 35403[0m
[0;31mThu Sep 26 10:46:14 2024 Connected to spark.[0m
[0;31m2024-09-26 10:48:15.321085: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.[0m
[0;31m2024-09-26 10:48:15.405038: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, 

In [0]:
# Check access to S3 by reading a text file from the bucket
df = spark.read.text("s3://chakir-p8-data/requirements.txt")