# <!-- TITLE --> Projet 6 : notebook 4
<!-- AUTHOR : Anthony DAVID -->

In [2]:
import os, time, sys, pathlib, json, glob
import math, random
import datetime
import itertools
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import TensorBoard

import sklearn.metrics

from skimage.morphology import disk
from skimage.util import img_as_ubyte
from skimage.filters import rank
from skimage import io, color, exposure, transform
from IPython.display import display,Image,Markdown,HTML

In [3]:
# Select how verbose TensorFlow's native (C++) logging is.
# NOTE(review): tensorflow is already imported by the imports cell above; this variable is
# presumably read when TensorFlow's native runtime is loaded, so setting it here may come
# too late to have any effect - TODO confirm, and move it before `import tensorflow` if so.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
# 0 = all messages are logged (default behavior)
# 1 = INFO messages are not printed
# 2 = INFO and WARNING messages are not printed
# 3 = INFO, WARNING, and ERROR messages are not printed

In [4]:
import os

# Destination folder holding the copied images (one sub-folder per breed)
destination_dir = './selected_data/images/'

# Every sub-directory name is a class; a missing folder simply yields no class.
# The order is the one returned by os.listdir.
class_names = (
    [entry for entry in os.listdir(destination_dir)
     if os.path.isdir(os.path.join(destination_dir, entry))]
    if os.path.exists(destination_dir)
    else []
)

# Number of classes (used as the size of the models' output layer)
num_classes = len(class_names)

# Display
print("Class names:", class_names)
print("Number of classes:", num_classes)

Class names: ['Rottweiler', 'Doberman', 'Weimaraner', 'Staff', 'Malinois']
Number of classes: 5


# 5 - Full convolutions

Here we train several models on several datasets; the results are summarised in a report in the next section.

## 5.1 - Parameters

In [5]:
# ---- Directories : enhanced .h5 datasets (input) and run artefacts (output)
enhanced_dir = './data'
run_dir = './run_full'

# ---- Datasets / models to combine, and training settings
datasets      = ['set-BW', 'set-L', 'set-L-HE', 'set-L-LHE', 'set-L-CLAHE', 'set-RGB', 'set-RGB-HE']
# models        = {'v1':'get_model_v1', 'v2':'get_model_v2', 'v3':'get_model_v3', 'v4':'get_model_v4', 'v5':'get_model_v5'}
models        = {'v1':'get_model_v1'}
batch_size    = 16
epochs        = 20
with_datagen  = False
fit_verbosity = 0

# ---- Random run identifier, zero-padded to 6 characters; it names the
#      checkpoint, logs, models and report of this run
tag_id = f'{random.randint(0, 99999):06}'

## 5.2 - Dataset loading

In [6]:
def shuffle_np_dataset(*data):
    """
    Shuffle one or several arrays with the same random permutation.
    The permutation is drawn from the length of the first array, so all
    arrays are expected to share that first-axis length.
    args:
        *data : arrays to shuffle (indexable by an index array)
    return:
        the shuffled array if a single one was given, else a list of shuffled arrays
    """
    order = np.random.permutation(len(data[0]))
    shuffled = []
    for array in data:
        # Same index order for every array keeps rows aligned (e.g. x with y)
        shuffled.append(array[order])
    if len(shuffled) == 1:
        return shuffled[0]
    return shuffled


def read_dataset(enhanced_dir, dataset_name):
    """
    Load the train / validation / test arrays of one h5 dataset.
    args:
        enhanced_dir : directory containing the .h5 files
        dataset_name : dataset file name, without the .h5 extension
    return:
        x_train, y_train, x_val, y_val, x_test, y_test
        (only the train pair is shuffled, x and y together)
    """
    keys = ('x_train', 'y_train', 'x_val', 'y_val', 'x_test', 'y_test')
    with h5py.File(f'{enhanced_dir}/{dataset_name}.h5', 'r') as h5_file:
        # [:] copies each dataset into memory before the file is closed
        x_train, y_train, x_val, y_val, x_test, y_test = (h5_file[key][:] for key in keys)
    x_train, y_train = shuffle_np_dataset(x_train, y_train)
    return x_train, y_train, x_val, y_val, x_test, y_test

## 5.3 - Models collection

In [7]:
def get_model_v1(lx,ly,lz):
    """
    Build model v1 : three blocks of two 3x3 convolutions (64, 128 then 256 filters),
    each block followed by 2x2 max-pooling and dropout (0.2), then a 512-unit dense
    layer, dropout (0.4) and a softmax over the module-level num_classes.
    args:
        lx,ly,lz : shape of one input sample (multi_run passes x_train.shape[1:])
    return:
        model : the keras Sequential model, not compiled
    """
    model = keras.models.Sequential()

    # Explicit Input layer : passing `input_shape=` to the first layer of a
    # Sequential model is deprecated and triggers a warning at build time.
    model.add( keras.Input(shape=(lx,ly,lz)))

    model.add( keras.layers.Conv2D(64, (3, 3), padding='same', activation='relu'))
    model.add( keras.layers.Conv2D(64, (3, 3), activation='relu'))
    model.add( keras.layers.MaxPooling2D(pool_size=(2, 2)))
    model.add( keras.layers.Dropout(0.2))

    model.add( keras.layers.Conv2D(128, (3, 3), padding='same', activation='relu'))
    model.add( keras.layers.Conv2D(128, (3, 3), activation='relu'))
    model.add( keras.layers.MaxPooling2D(pool_size=(2, 2)))
    model.add( keras.layers.Dropout(0.2))

    model.add( keras.layers.Conv2D(256, (3, 3), padding='same',activation='relu'))
    model.add( keras.layers.Conv2D(256, (3, 3), activation='relu'))
    model.add( keras.layers.MaxPooling2D(pool_size=(2, 2)))
    model.add( keras.layers.Dropout(0.2))

    model.add( keras.layers.Flatten())
    model.add( keras.layers.Dense(512, activation='relu'))
    model.add( keras.layers.Dropout(0.4))
    model.add( keras.layers.Dense(num_classes, activation='softmax'))
    return model


def get_model_v2(lx,ly,lz):
    """
    Build model v2 : four 3x3 convolutions (32, 64, 128 then 256 filters), each
    followed by 2x2 max-pooling and dropout (0.2), then a 1152-unit dense layer,
    dropout (0.4) and a softmax over the module-level num_classes.
    args:
        lx,ly,lz : shape of one input sample (multi_run passes x_train.shape[1:])
    return:
        model : the keras Sequential model, not compiled
    """
    model = keras.models.Sequential()

    # Explicit Input layer : passing `input_shape=` to the first layer of a
    # Sequential model is deprecated and triggers a warning at build time.
    model.add( keras.Input(shape=(lx,ly,lz)))

    model.add( keras.layers.Conv2D(32, (3,3),   activation='relu'))
    model.add( keras.layers.MaxPooling2D((2, 2)))
    model.add( keras.layers.Dropout(0.2))

    model.add( keras.layers.Conv2D(64, (3, 3), activation='relu'))
    model.add( keras.layers.MaxPooling2D((2, 2)))
    model.add( keras.layers.Dropout(0.2))

    model.add( keras.layers.Conv2D(128, (3, 3), activation='relu'))
    model.add( keras.layers.MaxPooling2D((2, 2)))
    model.add( keras.layers.Dropout(0.2))

    model.add( keras.layers.Conv2D(256, (3, 3), activation='relu'))
    model.add( keras.layers.MaxPooling2D((2, 2)))
    model.add( keras.layers.Dropout(0.2))

    model.add( keras.layers.Flatten())
    model.add( keras.layers.Dense(1152, activation='relu'))
    model.add( keras.layers.Dropout(0.4))

    model.add( keras.layers.Dense(num_classes, activation='softmax'))
    return model


def get_model_v3(lx,ly,lz):
    """
    Build model v3 : three 3x3 convolutions (32, 64, 128 filters) each followed by
    2x2 max-pooling and dropout (0.2), a fourth 3x3 convolution (256 filters) reduced
    by global average pooling, dropout (0.2), then a 1152-unit dense layer,
    dropout (0.4) and a softmax over the module-level num_classes.
    args:
        lx,ly,lz : shape of one input sample (multi_run passes x_train.shape[1:])
    return:
        model : the keras Sequential model, not compiled
    """
    model = keras.models.Sequential()

    # Explicit Input layer : passing `input_shape=` to the first layer of a
    # Sequential model is deprecated and triggers a warning at build time.
    model.add( keras.Input(shape=(lx,ly,lz)))

    model.add( keras.layers.Conv2D(32, (3,3),   activation='relu'))
    model.add( keras.layers.MaxPooling2D((2, 2)))
    model.add( keras.layers.Dropout(0.2))

    model.add( keras.layers.Conv2D(64, (3, 3), activation='relu'))
    model.add( keras.layers.MaxPooling2D((2, 2)))
    model.add( keras.layers.Dropout(0.2))

    model.add( keras.layers.Conv2D(128, (3, 3), activation='relu'))
    model.add( keras.layers.MaxPooling2D((2, 2)))
    model.add( keras.layers.Dropout(0.2))

    model.add( keras.layers.Conv2D(256, (3, 3), activation='relu'))

    # Global average pooling replaces the MaxPooling + Flatten pair used in v2
    model.add( keras.layers.GlobalAveragePooling2D())
    model.add( keras.layers.Dropout(0.2))

    model.add( keras.layers.Dense(1152, activation='relu'))
    model.add( keras.layers.Dropout(0.4))

    model.add( keras.layers.Dense(num_classes, activation='softmax'))
    return model


def get_model_v4(lx,ly,lz):
    """
    Build model v4 : four convolutions with kernel size 4 and a decreasing number of
    filters (128, 64, 32, 16), each followed by max-pooling with default settings,
    then a 64-unit dense layer and a softmax over the module-level num_classes.
    This variant has no dropout layer.
    args:
        lx,ly,lz : shape of one input sample (multi_run passes x_train.shape[1:])
    return:
        model : the keras Sequential model, not compiled
    """
    model = keras.models.Sequential()

    # Explicit Input layer : passing `input_shape=` to the first layer of a
    # Sequential model is deprecated and triggers a warning at build time.
    model.add(keras.Input(shape=(lx,ly,lz)))

    model.add(keras.layers.Conv2D(128, 4, activation='relu'))
    model.add(keras.layers.MaxPooling2D())

    model.add(keras.layers.Conv2D(64, 4, activation='relu'))
    model.add(keras.layers.MaxPooling2D())

    model.add(keras.layers.Conv2D(32, 4, activation='relu'))
    model.add(keras.layers.MaxPooling2D())

    model.add(keras.layers.Conv2D(16, 4, activation='relu'))
    model.add(keras.layers.MaxPooling2D())

    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(64, activation='relu'))

    model.add(keras.layers.Dense(num_classes, activation='softmax'))
    return model


def get_model_v5(lx,ly,lz):
    """
    Build model v5 : 5x5 'same' convolutions (32, then 64 and 128 filters), each
    followed by batch normalization, with 2x2 max-pooling and dropout (0.2) after
    the first and after the third convolution, then a 512-unit dense layer with
    batch normalization, dropout (0.4) and a softmax over the module-level num_classes.
    args:
        lx,ly,lz : shape of one input sample (multi_run passes x_train.shape[1:])
    return:
        model : the keras Sequential model, not compiled
    """
    model = keras.models.Sequential()

    # Explicit Input layer : passing `input_shape=` to the first layer of a
    # Sequential model is deprecated and triggers a warning at build time.
    model.add(keras.Input(shape=(lx,ly,lz)))

    # `keras` is the same module as `tf.keras` (from tensorflow import keras);
    # it is used here for consistency with the other get_model_vX functions.
    model.add(keras.layers.Conv2D(32, (5, 5), padding='same',  activation='relu'))
    model.add(keras.layers.BatchNormalization(axis=-1))
    model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(keras.layers.Dropout(0.2))

    model.add(keras.layers.Conv2D(64, (5, 5), padding='same',  activation='relu'))
    model.add(keras.layers.BatchNormalization(axis=-1))
    model.add(keras.layers.Conv2D(128, (5, 5), padding='same', activation='relu'))
    model.add(keras.layers.BatchNormalization(axis=-1))
    model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(keras.layers.Dropout(0.2))

    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(512, activation='relu'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Dropout(0.4))

    model.add(keras.layers.Dense(num_classes, activation='softmax'))
    return model

## 5.4 - Prepare the multi-run

In [8]:
# Show the identifier of this run; it is part of the checkpoint, logs, models and report names
print(tag_id)

055225


In [9]:
# Path of the JSON progress file of this run.
# NOTE(review): multi_run builds this same path on its own (checkpoint_file) and no
# visible code reads resume_file - it looks unused; confirm before removing.
resume_file = f'{run_dir}/checkpoint_{tag_id}.json'  # To store progress

In [10]:
# ## Load checkpoint or initialize new run
# def load_checkpoint():
#     if os.path.exists(resume_file):
#         with open(resume_file, 'r') as f:
#             return json.load(f)
#     else:
#         return {'Dataset': [], 'Model': {}}

# checkpoint = load_checkpoint()

In [11]:
def _load_checkpoint(checkpoint_file):
    """Return the progress dict stored in checkpoint_file, or an empty one if the file is absent."""
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, 'r') as f:
            return json.load(f)
    return {"Dataset": [], "Model": {}}


def _save_checkpoint(checkpoint_file, checkpoint):
    """Write the progress dict to checkpoint_file as indented JSON."""
    with open(checkpoint_file, 'w') as f:
        json.dump(checkpoint, f, indent=4)


def multi_run(enhanced_dir, datasets, models, datagen=None, 
              batch_size=batch_size, epochs=epochs, 
              fit_verbosity=0, tag_id='last'):
    """
    Trains every dataset-model combination, with checkpointing to resume after a crash.
    Results already present in the checkpoint are not recomputed : they are copied into
    the report, so every column of the report always has one entry per dataset.
    A failed training is reported with Accuracy=0.0 and Duration=999.0 ; its dataset is
    then not marked as completed, so the failed model is retried on the next run.
    args:
        enhanced_dir   : Directory of the enhanced datasets
        datasets       : List of dataset names (without .h5)
        models         : Dict { "model name": builder }, where builder is either the
                         name of a module-level function or the function itself,
                         called as builder(lx,ly,lz)
        datagen        : Data generator or None (None)
        batch_size     : Batch size (notebook-level batch_size when the function is defined)
        epochs         : Number of epochs (notebook-level epochs when the function is defined)
        fit_verbosity  : Verbose level (0)
        tag_id         : postfix for checkpoint, logs and models dir ('last')
    return:
        report        : Report as a dict for Pandas, with the keys 'Dataset',
                        '<model>_Accuracy' (best validation accuracy, in percent)
                        and '<model>_Duration' (training time, in seconds).
    """  
    checkpoint_file = f'{run_dir}/checkpoint_{tag_id}.json'

    # ---- Load existing checkpoint if available
    checkpoint = _load_checkpoint(checkpoint_file)

    # ---- Logs and models dir
    os.makedirs(f'{run_dir}/logs_{tag_id}', mode=0o750, exist_ok=True)
    os.makedirs(f'{run_dir}/models_{tag_id}', mode=0o750, exist_ok=True)

    # ---- Columns of output
    output = {'Dataset': []}
    for m in models:
        output[m+'_Accuracy'] = []
        output[m+'_Duration'] = []

    # ---- Let's go
    for d_name in datasets:
        # Results already stored for this dataset : { model name : {Accuracy, Duration} }
        done = checkpoint["Model"].get(d_name, {})

        # ---- Skip the dataset if it is completed for every requested model,
        #      but still report the stored results
        if d_name in checkpoint["Dataset"] and all(m in done for m in models):
            print(f"Skipping dataset {d_name} (already completed)")
            output['Dataset'].append(d_name)
            for m_name in models:
                output[m_name+'_Accuracy'].append(done[m_name]["Accuracy"])
                output[m_name+'_Duration'].append(done[m_name]["Duration"])
            continue

        print("\nDataset : ",d_name)

        # ---- Read dataset (the test split is not used by this function)
        x_train, y_train, x_val, y_val, x_test, y_test = read_dataset(enhanced_dir, d_name)
        output['Dataset'].append(d_name)

        # ---- Get the shape of one sample (the first axis is the number of samples)
        lx, ly, lz = x_train.shape[1:]

        # ---- For each model
        all_models_ok = True
        for m_name, m_function in models.items():
            if m_name in done:
                print(f"    Skipping model {m_name} for dataset {d_name} (already completed)")
                output[m_name+'_Accuracy'].append(done[m_name]["Accuracy"])
                output[m_name+'_Duration'].append(done[m_name]["Duration"])
                continue

            print("    Run model {}  : ".format(m_name), end='')
            try:
                # ---- get the builder : a callable, or a function looked up by name
                build_model = m_function if callable(m_function) else globals()[m_function]
                model = build_model(lx,ly,lz)
                # ---- Compile it
                model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), 
                              loss='sparse_categorical_crossentropy', metrics=['accuracy'])
                # ---- Callbacks tensorboard
                log_dir = f'{run_dir}/logs_{tag_id}/tb_{d_name}_{m_name}'
                tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
                # ---- Callbacks bestmodel
                save_dir = f'{run_dir}/models_{tag_id}/model_{d_name}_{m_name}.keras'
                bestmodel_callback = tf.keras.callbacks.ModelCheckpoint(filepath=save_dir, verbose=0, monitor='val_accuracy', save_best_only=True)
                # ---- Callbacks early stopping
                early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', 
                                                                          patience=5, verbose=1, restore_best_weights=True)
                callbacks = [tensorboard_callback, bestmodel_callback, early_stopping_callback]
                # ---- Train
                start_time = time.time()
                if datagen is None:
                    # ---- No data augmentation (datagen=None) --------------------------------------
                    history = model.fit(x_train, y_train,
                                        batch_size      = batch_size,
                                        epochs          = epochs,
                                        verbose         = fit_verbosity,
                                        validation_data = (x_val, y_val),
                                        callbacks       = callbacks)
                else:
                    # ---- Data augmentation (datagen given) ----------------------------------------
                    datagen.fit(x_train)
                    history = model.fit(datagen.flow(x_train, y_train, batch_size=batch_size),
                                        epochs          = epochs,
                                        verbose         = fit_verbosity,
                                        validation_data = (x_val, y_val),
                                        callbacks       = callbacks)
                # ---- Result
                duration = time.time()-start_time
                accuracy = max(history.history["val_accuracy"])*100

            except Exception as e:
                # Best effort : one failing model must not stop the whole multi-run.
                # Numeric sentinels keep the report columns homogeneous.
                print(f'Error occurred for model {m_name}: {e}')
                output[m_name+'_Accuracy'].append(0.0)
                output[m_name+'_Duration'].append(999.0)
                all_models_ok = False
                continue

            # ---- Record the result once, outside the try, so that a later error
            #      cannot add a second entry for the same model
            output[m_name+'_Accuracy'].append(accuracy)
            output[m_name+'_Duration'].append(duration)
            print(f"Accuracy={accuracy: 7.2f}    Duration={duration: 7.2f}")

            # ---- Update and save checkpoint
            checkpoint["Model"].setdefault(d_name, {})[m_name] = {"Accuracy": accuracy, "Duration": duration}
            try:
                _save_checkpoint(checkpoint_file, checkpoint)
            except OSError as e:
                # The result is kept in the report; only the ability to resume is lost
                print(f'Warning: could not save checkpoint {checkpoint_file}: {e}')

        # ---- Mark the dataset as completed only if no model failed
        if all_models_ok and d_name not in checkpoint["Dataset"]:
            checkpoint["Dataset"].append(d_name)
            _save_checkpoint(checkpoint_file, checkpoint)

    return output

## 5.5 - Run !

In [12]:
print('\n---- Run','-'*50)

# ---- Data augmentation or not : datagen stays None unless with_datagen is set
#
datagen = None
if with_datagen:
    datagen = keras.preprocessing.image.ImageDataGenerator(
        featurewise_center=False,
        featurewise_std_normalization=False,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.2,
        shear_range=0.1,
        rotation_range=10.,
    )

# ---- Run every dataset / model combination
#
output = multi_run(
    enhanced_dir,
    datasets,
    models,
    datagen=datagen,
    batch_size=batch_size,
    epochs=epochs,
    fit_verbosity=fit_verbosity,
    tag_id=tag_id,
)

# ---- Save report : the results plus a one-line description of the settings
#
report = {
    'output': output,
    'description': f' batch_size={batch_size} epochs={epochs} data_aug={with_datagen}',
}

report_name = f'{run_dir}/report_{tag_id}.json'
with open(report_name, 'w') as file:
    json.dump(report, file, indent=4)

print('\nReport saved as ',report_name)


print('-'*59)


---- Run --------------------------------------------------

Dataset :  set-BW
    Run model v1  : 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 10: early stopping
Restoring model weights from the end of the best epoch: 5.
Accuracy=  31.98    Duration= 826.73

Dataset :  set-L
    Run model v1  : Epoch 16: early stopping
Restoring model weights from the end of the best epoch: 11.
Accuracy=  34.88    Duration= 1377.04

Dataset :  set-L-HE
    Run model v1  : Epoch 7: early stopping
Restoring model weights from the end of the best epoch: 2.
Accuracy=  29.65    Duration= 604.87

Dataset :  set-L-LHE
    Run model v1  : Epoch 6: early stopping
Restoring model weights from the end of the best epoch: 1.
Accuracy=  20.35    Duration= 523.01

Dataset :  set-L-CLAHE
    Run model v1  : Restoring model weights from the end of the best epoch: 16.
Accuracy=  36.63    Duration= 1757.17

Dataset :  set-RGB
    Run model v1  : Epoch 10: early stopping
Restoring model weights from the end of the best epoch: 5.
Accuracy=  31.98    Duration= 902.53

Dataset :  set-RGB-HE
    Run model v1  : Epoch 19: early stopping
Restoring model weights 

# 6 - Show report

## 6.1 - Parameters

In [13]:
# Directory scanned for the report_*.json files to display
# (same path as run_dir, where the run section saves its report)

report_dir = './run_full'

## 6.2 - Few nice functions

In [14]:
def highlight_max(s):
    """
    Build the cell styles highlighting in yellow the maximum of a column.
    args:
        s : column values (a pandas Series)
    return:
        list with one CSS string per value : yellow background for each value
        equal to the maximum, empty string otherwise
    """
    top = s.max()
    return ['background-color: yellow' if value == top else '' for value in s]

def highlight_max_red(s):
    """
    Build the cell styles highlighting in red the maximum of a column.
    args:
        s : column values (a pandas Series)
    return:
        list with one CSS string per value : red background for each value
        equal to the maximum, empty string otherwise
    """
    top = s.max()
    return ['background-color: red' if value == top else '' for value in s]

def show_report(file):
    """
    Display one JSON report as a styled table, the best accuracy of each model in red.
    args:
        file : path of a report json file, holding the keys 'output' (dict of columns)
               and 'description' (text)
    return:
        nothing
    """
    # ---- Read json file
    with open(file) as infile:
        dict_report = json.load( infile )
    output      = dict_report['output']
    description = dict_report['description']
    # ---- about
    display(Markdown(f"<br>**Report : {Path(file).stem}**"))
    print(    "Desc.  : ",description,'\n')
    # ---- Create a pandas
    report       = pd.DataFrame (output)
    col_accuracy = [ c for c in output.keys() if c.endswith('Accuracy')]
    col_duration = [ c for c in output.keys() if c.endswith('Duration')]
    # ---- Make the metric columns numeric
    # Failed runs may be stored as strings ('0', '999'); a column mixing str and float
    # makes s.max() raise in the highlight function. Unparsable values become NaN.
    for c in col_accuracy + col_duration:
        report[c] = pd.to_numeric(report[c], errors='coerce').astype(float)
    # ---- Build formats
    def format_accuracy(x):
        return '{:.2f} %'.format(x) if isinstance(x, float) else '{:}'.format(x)

    def format_duration(x):
        return '{:.1f} s'.format(x) if isinstance(x, float) else '{:}'.format(x)

    # 'Size' has no matching column in the reports written by this notebook
    formats = {'Size':'{:.2f} Mo'}
    for c in col_accuracy:
        formats[c] = format_accuracy
    for c in col_duration:
        formats[c] = format_duration
    # t=report.style.highlight_max(subset=col_accuracy).format(formats)
    t = report.style.apply(highlight_max_red, subset=col_accuracy).format(formats)
    display(t)

## 6.3 - Reports display

In [15]:
# glob.glob returns the matches in arbitrary order : sort them so that the
# reports are always displayed in the same (file name) order.
for file in sorted(glob.glob(f'{report_dir}/report_*.json')):
    show_report(file)

<br>**Report : report_060218**

Desc.  :   batch_size=16 epochs=20 data_aug=False 



Unnamed: 0,Dataset,v2_Accuracy,v2_Duration,v3_Accuracy,v3_Duration,v4_Accuracy,v4_Duration
0,set-BW,33.72 %,529.9 s,27.33 %,47.4 s,29.07 %,174.8 s
1,set-L,35.47 %,256.3 s,26.74 %,124.4 s,27.33 %,127.7 s
2,set-L-HE,31.40 %,307.6 s,25.58 %,80.0 s,33.72 %,274.8 s
3,set-L-LHE,25.58 %,180.5 s,25.00 %,40.8 s,19.19 %,117.9 s
4,set-L-CLAHE,31.98 %,430.8 s,19.77 %,54.9 s,33.14 %,213.8 s
5,set-RGB,35.47 %,274.3 s,19.19 %,51.9 s,32.56 %,248.5 s
6,set-RGB-HE,31.98 %,358.0 s,19.19 %,59.5 s,31.98 %,354.5 s


<br>**Report : report_055225**

Desc.  :   batch_size=16 epochs=20 data_aug=False 



Unnamed: 0,Dataset,v1_Accuracy,v1_Duration
0,set-BW,31.98 %,826.7 s
1,set-L,34.88 %,1377.0 s
2,set-L-HE,29.65 %,604.9 s
3,set-L-LHE,20.35 %,523.0 s
4,set-L-CLAHE,36.63 %,1757.2 s
5,set-RGB,31.98 %,902.5 s
6,set-RGB-HE,33.14 %,1673.7 s
