## Train a model

The objective of this notebook is to train and evaluate the model specified in the parameters file.

In [None]:
# For Development and debugging:
# Reload modules without restarting the kernel
#%load_ext autoreload
#%autoreload 2

In [None]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import tensorflow_datasets as tfds
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import shutil
import json
import math
import time

Load model parameters:

In [None]:
# Do not touch the value of PARAMETERS_FILE!
# When this notebook is executed with jupyter-nbconvert (from script), 
# it will be replaced automatically
PARAMETERS_FILE = 'dont_touch_me-input_parameters_file'

# Fail early with a specific exception type if the placeholder was not replaced
# or points to a missing file (FileNotFoundError is still an Exception, so
# callers catching Exception keep working).
if not os.path.exists(PARAMETERS_FILE):
    raise FileNotFoundError('Parameter file {} does not exist!'.format(PARAMETERS_FILE))

# Load the run parameters from the JSON file into the dict `p`
with open(PARAMETERS_FILE) as params_file:
    p = json.load(params_file)

# Keep track of where the parameters came from (saved with the model later on)
p['parameters_file_path'] = PARAMETERS_FILE
# Show the available parameter names
p.keys()

Set logging:

In [None]:
# Configure the root logger: messages go to the file named in the parameters,
# which is truncated on every run (filemode='w').
# NOTE(review): logging.basicConfig does nothing if the root logger already has
# handlers (e.g. when this cell is re-run in the same kernel) - confirm this is acceptable.
import logging
log_file_path = p['log_file_name']
# p['log_level'] must be the name of a logging level constant (e.g. 'INFO')
log_level = getattr(logging, p['log_level'])
logging.basicConfig(filename=log_file_path, filemode='w', level=log_level)
logging.info('Parameters loaded from file:\n{}'.format(PARAMETERS_FILE))

#### Model parameters:

In [None]:
# Fill in optional parameters that are absent from the parameters file.
# Values already present in `p` are never overwritten.
_default_parameters = {
    'conv_reg': [0, 0],      # [l1, l2] regularization for conv layers
    'dense_reg': [0, 0],     # [l1, l2] regularization for dense layers
    'verbose_level': 2,      # passed to model.fit(verbose=...)
    'pre_training': 0,       # passed to the model selector
}
for _param_name, _param_default in _default_parameters.items():
    p.setdefault(_param_name, _param_default)

In [None]:
# Build a human readable summary of the run configuration, then log and print it
_summary_parts = [
    'Dataset:\n\t{}'.format(p['tf_ds_name']),
    '\n\nData Augmentation:',
    '\n\tRandom Flipping: {}\n\tRandom 90deg Rotations: {}'.format(
        p['random_horizontal_flipping'], p['random_90deg_rotations']),
    '\n\tRandom centerd zoom: {}'.format(p['random_CenterZoom']),
    '\n\nModel:',
    '\n\tArchitecture: {}'.format(p['model_name']),
    '\n\tpre_training: {}'.format(p['pre_training']),
    '\n\tConv layers regularization ([l1, l2]): {}'.format(p['conv_reg']),
    '\n\tDense layers regularization ([l1, l2]): {}'.format(p['dense_reg']),
    '\n\tLoss function: {}'.format(p['loss']),
    '\n\tLearning rate: {}'.format(p['learning_rate']),
    '\n\tEpochs: {}\n\n'.format(p['number_of_epochs']),
]
msg = ''.join(_summary_parts)
logging.info(msg)
print(msg)

In [None]:
# Directory that contains the project's own modules (Models_V2, Utils, Data_augmentation)
EXTERNAL_LIBS_PATH = p['external_libs_path']

# Guard clause: without this directory none of the imports below can succeed
if not os.path.exists(EXTERNAL_LIBS_PATH):
    msg = 'External library path {} does not exist!'.format(EXTERNAL_LIBS_PATH)
    logging.error(msg)
    raise Exception(msg)

msg = 'EXTERNAL_LIBS_PATH: {}'.format(EXTERNAL_LIBS_PATH)
print(msg)
logging.info(msg)

# Make the directory importable (position 1, i.e. right after the first sys.path entry)
sys.path.insert(1, EXTERNAL_LIBS_PATH)

# Project modules
from Models_V2 import Predef_models as predef_models
from Utils import Tee_Logger as Tee_Logger
#from Utils import lr_schedule_Callback
#from Utils import save_best_model_Callback
from Utils import save_best_model_base_on_CMA_Callback
from Utils import evaluate_model
import Utils as utils
import Data_augmentation

Create dirs where model output will be saved:

In [None]:
# To keep the contents of an existing model dir (skip the cleaning/deleting step),
# uncomment the next line:
#p['clean_model_dir'] = 0

# Create the output directories for this run; the helper returns the three paths used below
base_path, model_path, checkpoints_path = utils.create_model_dirs(parameters=p)

msg = 'Base path:\n{}\nModel path:\n{}\nCheckpoints path:\n{}'.format(
    base_path, model_path, checkpoints_path)
logging.info(msg)
print(msg)

In [None]:
# Optionally hide all CUDA devices from TensorFlow.
# This must happen before TensorFlow enumerates the devices (next statement).
if p['disable_gpu']:
    msg = "Cuda devices (GPUs) disabled"
    logging.info(msg)
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
physical_devices = tf.config.experimental.list_physical_devices('GPU')
msg = 'Physical GPU devises:\n{}'.format(physical_devices)
logging.info(msg)
print(msg)

# Optionally let TensorFlow allocate GPU memory on demand instead of grabbing it all.
if p['set_memory_growth']:
    if not physical_devices:
        # Nothing to configure (no GPU found or GPUs disabled above)
        msg = 'It was not possible to limit GPU memory'
    else:
        try:
            # Memory growth has to be configured identically on every visible GPU,
            # so apply it to all of them and not only to the first one.
            for gpu in physical_devices:
                tf.config.experimental.set_memory_growth(gpu, True)
            msg = 'GPU Memory limited!'
        except (RuntimeError, ValueError) as err:
            # RuntimeError: devices already initialized; ValueError: invalid device.
            # Keep the best-effort behavior, but record why it failed.
            logging.warning('set_memory_growth failed: {}'.format(err))
            msg = 'It was not possible to limit GPU memory'

    logging.info(msg)
    print(msg)

## Load Preprocessing parameters and information:

In [None]:
# Directory that holds the output of the data preprocessing step
pp_path = p['pp_path']

# Load the preprocessing parameters
pp_params_path = os.path.join(pp_path, 'params.json')
with open(pp_params_path) as file:
    pp_params = json.load(file)
# Log the path itself (formatting the file handle would log its repr, not the location)
msg = 'Loaded data preprocessing parameters from:\n{}'.format(pp_params_path)
logging.info(msg)
# Seed stored by the preprocessing step
seed = pp_params['seed']

# Load Channels file (read into the `channels` DataFrame)
channels_path = os.path.join(pp_path, 'channels.csv')
with open(channels_path) as file:
    channels = pd.read_csv(file)
msg = 'Loaded channels file from:\n{}'.format(channels_path)
logging.info(msg)

## Specify input channels

In [None]:
# Channel names requested in the parameters file
selected_channels = p['input_channels']
msg = 'Selected input channels:\n{}'.format(selected_channels)
logging.info(msg)
print(msg)

# Translate the channel names into channel ids using the channels table
channels_by_name = channels.set_index(['name'])
selected_rows = channels_by_name.loc[selected_channels]
input_ids = np.array(selected_rows['channel_id'].values)
msg = 'Corresponding input channel ids:\n{}'.format(input_ids)
logging.info(msg)
print(msg)
print(input_ids.shape)

## Load Dataset

In [None]:
# Load the TensorFlow dataset named in the parameters from the local datasets directory
tfds_load_options = dict(
    name=p['tf_ds_name'],
    data_dir=p['local_tf_datasets'],
    # If False, returns a dictionary with all the features
    as_supervised=True,
    shuffle_files=p['shuffle_files'],
    with_info=True,
)
dataset, metadata = tfds.load(**tfds_load_options)
msg = 'Tensorflow dataset {} loaded from:\n{}'.format(p['tf_ds_name'], p['local_tf_datasets'])
logging.info(msg)

# Pick the three splits (all looked up before any name is bound)
train_data, val_data, test_data = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

Show information about the dataset:

In [None]:
# Display the dataset info object returned by tfds.load(with_info=True)
metadata

In [None]:
# Display the available splits ('train', 'validation' and 'test' are used in this notebook)
metadata.splits

In [None]:
# Display the feature specification (the 'image' shape is used later to build the model input)
metadata.features

## Process the data
Before training the network, we discriminate some channels, apply some linear transformations (90deg rotations and horizontal flipping) to augment the **Training** dataset, create the batches and shuffle them. Also, we perform other operations to improve performance.

**Tune performance**<br>
tf.data.Dataset.prefetch overlaps data preprocessing and model execution while training.
It can be used to decouple the time when data is produced from the time when data is consumed. In particular, the transformation uses a background thread and an internal buffer to prefetch elements from the input dataset ahead of the time they are requested. The number of elements to prefetch should be equal to (or possibly greater than) the number of batches consumed by a single training step. You could either manually tune this value, or set it to **tf.data.experimental.AUTOTUNE** which will prompt the tf.data runtime to tune the value dynamically at runtime.

**Shuffling**<br>
dataset.shuffle() Randomly shuffles the elements of this dataset.
This dataset fills a buffer with `buffer_size` elements, then randomly samples elements from this buffer, replacing the selected elements with new elements. For perfect shuffling, a buffer size greater than or equal to the full size of the dataset is required.

For instance, if your dataset contains 10,000 elements but buffer_size is set to 1,000, then `shuffle` will initially select a random element from only the first 1,000 elements in the buffer. Once an element is selected, its space in the buffer is replaced by the next (i.e. 1,001-st) element, maintaining the 1,000 element buffer.

**reshuffle_each_iteration** controls whether the shuffle order should be different for each epoch.

Take a look at one image and some random transformations of it (random rotation + random horizontal flipping):

In [None]:
# Get one image from the training dataset
image, target = next(iter(train_data))
# Visualize the original vs. random flipping and rotations
plt_size = np.array([5, 5])

plt.figure(figsize=(plt_size[0], plt_size[1]))
Data_augmentation.visualize_tensor_cell_image(image, 'Original Cell')

# Show augmented examples only if at least one augmentation is enabled.
# any() tests truthiness, so it also works for flag values that do not support
# the bitwise `|` operator (e.g. None or floats).
augmentation_flags = (
    p['random_horizontal_flipping'],
    p['random_90deg_rotations'],
    p['random_CenterZoom'],
)
if any(augmentation_flags):
    n_examples = 4
    plt.figure(figsize=(n_examples*plt_size[0], plt_size[1]))
    for i in range(n_examples):
        plt.subplot(1, n_examples, i+1)
        # The augmented target is not needed here. It is bound to a regular name
        # because assigning to `_` would clobber IPython's last-output variable.
        img, _aug_target = Data_augmentation.augment(image, target, p, input_ids, metadata)
        Data_augmentation.visualize_tensor_cell_image(img, 'Augmented Cell')

Prepare datasets for training the CNN:

In [None]:
BATCH_SIZE = p['BATCH_SIZE']
buffer_size = 512  # number of elements held in the shuffle buffer
AUTOTUNE = tf.data.experimental.AUTOTUNE


# Small wrappers that bind the extra arguments needed by the mapped functions
def _augment_sample(image, target):
    """Apply the configured data augmentation to one (image, target) pair."""
    return Data_augmentation.augment(image, target, p, input_ids, metadata)


def _select_input_channels(image, target):
    """Apply Data_augmentation.filter_channels with the selected input ids to one pair."""
    return Data_augmentation.filter_channels(image, target, input_ids, metadata)


# Training set: shuffle (new order every epoch) -> augment -> batch -> prefetch
# NOTE: this cell rebinds train_data/val_data/test_data, so it is not idempotent;
# reload the dataset before running it again.
train_data = (
    train_data
    .shuffle(buffer_size=buffer_size, reshuffle_each_iteration=True)
    .map(_augment_sample, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

# Validation set: channel selection only -> batch -> prefetch
val_data = (
    val_data
    .map(_select_input_channels, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

# Test set: channel selection only -> prefetch
# NOTE(review): unlike train/val, the test set is not batched here - confirm this is intended.
test_data = (
    test_data
    .map(_select_input_channels, num_parallel_calls=AUTOTUNE)
    .prefetch(AUTOTUNE)
)

## Model Selection

Models are selected from a group of predefined models in the class `Predef_models` (imported from the `Models_V2` module). The name of the selected model is specified in the parameter `p['model_name']`.

First we need to init the `Predef_models` class:

In [None]:
# Instantiate the collection of predefined models
models = predef_models()

# Model input shape: the dataset image shape with its last dimension replaced by
# the number of selected input channels
n_input_channels = input_ids.shape[0]
img_shape = metadata.features['image'].shape[:-1] + (n_input_channels,)

# Build the architecture requested in the parameters file
model_options = dict(
    model_name=p['model_name'],
    input_shape=img_shape,
    conv_reg=p['conv_reg'],
    dense_reg=p['dense_reg'],
    pre_training=p['pre_training'],
)
model = models.select_model(**model_options)

Select the loss function and build the model:

In [None]:
# Supported loss functions, keyed by the value of p['loss'].
# Each entry is a zero-argument factory so only the requested loss is instantiated.
loss_factories = {
    'mse': tf.keras.losses.MeanSquaredError,
    'huber': lambda: tf.keras.losses.Huber(delta=1.0),
    'mean_absolute_error': tf.keras.losses.MeanAbsoluteError,
}

# Fail loudly on an unsupported name. Otherwise `loss` would be undefined (or,
# on a re-run, silently keep the value from a previous execution).
if p['loss'] not in loss_factories:
    msg = 'Unknown loss function "{}". Supported values: {}'.format(
        p['loss'], sorted(loss_factories))
    logging.error(msg)
    raise ValueError(msg)
loss = loss_factories[p['loss']]()

msg = '{} loss function selected. Building the model...'.format(p['loss'])
logging.info(msg)
print(msg)

# Metrics tracked during training in addition to the loss; the validation MAE
# ('val_mean_absolute_error') is what the best-model callback monitors.
metrics = ['mse', 'mean_absolute_error']
model.compile(optimizer=Adam(learning_rate=p['learning_rate']),
              loss=loss,
              metrics=metrics
             )
msg = 'Model compiled!'
logging.info(msg)

Take a look into the model architecture and number of parameters:

In [None]:
# Duplicates sys.stdout to the log file
# (kept open until TeeLog.close() is called in the next cell)
TeeLog = Tee_Logger(log_file_path)
# Print the layer-by-layer architecture and parameter counts
model.summary()

In [None]:
# Finish stdout duplication (closes the Tee_Logger opened for model.summary())
TeeLog.close()

Set the callback that saves the best model according to the Central Moving Average (CMA) of the validation MAE, using window sizes of 11, 21 and 31 epochs. It also saves the best model without any averaging (`CMA_0`):

In [None]:
# Window sizes of the moving averages used to select the best model
# (they correspond to the 'CMA_11', 'CMA_21' and 'CMA_31' evaluations below)
avg_sizes = [11, 21, 31]
# Quantity tracked by the callback: MAE on the validation set
monitor='val_mean_absolute_error'

# Callback that records the best model for each averaging window
save_best_model = save_best_model_base_on_CMA_Callback(monitor, avg_sizes)
# List of callbacks passed to model.fit (TensorBoard may be appended later)
callbacks = [save_best_model]

Set tensorboard config (if active):

In [None]:
if p['tensorboard']:
    # TensorBoard dir: the log file path without its extension + '_tensorboard'.
    # splitext handles extensions of any length (or none) instead of assuming 4 characters.
    tb_dir_path = os.path.splitext(p['log_file_name'])[0] + '_tensorboard'

    # Remove logs of a previous run. A missing dir (first run) is the normal
    # case and must not be reported as an error.
    if os.path.isdir(tb_dir_path):
        try:
            shutil.rmtree(tb_dir_path)
        except OSError as e:
            msg  = 'Tensorboard log dir {} could not be deleted!\n\nOSError: {}'.format(tb_dir_path, e)
            logging.error(msg)
            print(msg)

    # Write TensorBoard logs (with histograms every epoch) during training
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tb_dir_path, histogram_freq=1)
    callbacks.append(tensorboard_callback)

    msg = 'Tensorboard file: {}'.format(tb_dir_path)
    logging.info('\n\n'+msg+'\n\n')
    print(msg)

Train the model:

In [None]:
# Mark the beginning of the training phase in the log file
msg = 'Starting model training...'
logging.info(msg)

In [None]:
# if LR_SCHEDULE given, then init lr scheduler callback
# commented out since Adam + decreasing the learning rate during training makes the model more prone to overfitting
#if 'LR_SCHEDULE' in p.keys():
#    finish_warmup_and_lr_schedule = lr_schedule_Callback(utils.lr_schedule, p['LR_SCHEDULE'])
#    callbacks.append(finish_warmup_and_lr_schedule)

In [None]:
# Timestamp taken right before training starts
tic = time.time()
# Mirror everything printed during training into the log file
# (the duplication is stopped by TeeLog.close() in the next cell)
TeeLog = Tee_Logger(log_file_path)

# Number of training examples (only needed if steps_per_epoch is re-enabled:
# steps_per_epoch=math.ceil(n_train/BATCH_SIZE))
n_train = metadata.splits['train'].num_examples

# Train the model
fit_options = dict(
    validation_data=val_data,
    epochs=p['number_of_epochs'],
    callbacks=callbacks,
    verbose=p['verbose_level'],
)
history = model.fit(train_data, **fit_options)

# Report the wall-clock training time
toc = time.time()
elapsed_minutes = (toc-tic)/60
print('Training time (in mins): {}'.format(elapsed_minutes))

In [None]:
# Finish stdout duplication
TeeLog.close()

### Plot Loss

In [None]:
# Plot the training curves for the loss and for each metric given to model.compile
utils.plot_train_metrics(history=history.history, metrics=['loss']+metrics, p=p, figsize=(15,23))

In [None]:
msg = 'Saiving trained model'
logging.info(msg)

# Save history
# default=float converts values the json module cannot encode natively
# (e.g. numpy float32 scalars) instead of aborting with a TypeError.
with open(os.path.join(base_path, 'history.json'), 'w') as file:
    json.dump(history.history, file, indent=4, default=float)

# Save CMA history
# First we need to convert from np.int64 and np.float64 to regular python int and float
# (each record is read as item[0] -> int, item[1] -> float)
temp_dict = {
    key: [[int(item[0]), float(item[1])] for item in save_best_model.CMA_history[key]]
    for key in save_best_model.CMA_history.keys()
}
with open(os.path.join(base_path, 'CMA_history.json'), 'w') as file:
    json.dump(temp_dict, file, indent=4)

# Save parameters
with open(os.path.join(base_path, 'parameters.json'), 'w') as file:
    json.dump(p, file, indent=4)

In [None]:
# Load history
#path = ''
#with open(os.path.join(path, 'history.json'), 'r') as file:
#    history = json.load(file)
# Load parameters
#with open(os.path.join(base_path, 'parameters.json'), 'r') as file:
#    p = json.load(file)
#metrics = ['mse', 'mean_absolute_error']

# Model evaluation

In [None]:
# Create data frame to save model metrics
# (the metrics of every evaluated best model are accumulated here and saved
# to metrics.csv at the end of the notebook)
metrics_df = pd.DataFrame()

## 1.- Last model

In [None]:
# Evaluate model (weights as they are at the end of training)
model_eval = evaluate_model(p, model, input_ids)
# Preview the first rows of the evaluation's targets table
model_eval.targets_df.head()

In [None]:
# Diagnostic plots for the last model, grouped/colored by the 'cell_cycle' column
# Error distribution plot
model_eval.plot_error_dist(figsize=(20,7), hue='cell_cycle', sets=['train','val'])
# y and y_hat distribution plot
model_eval.plot_y_dist(figsize=(15,7), x='cell_cycle', sets=['train','val'])
# Residuals plot
model_eval.plot_residuals(figsize=(10,7), hue='cell_cycle')
# Target vs predicted
model_eval.plot_y_vs_y_hat(figsize=(7,7), hue='cell_cycle')

In [None]:
# Compute the metrics of the last model (not added to metrics_df)
model_eval.get_metrics()
# Display every row except those of the test set
mask = model_eval.metrics_df.Set == 'test'
model_eval.metrics_df[~mask]

## 2.- Best model with no Center Moving Average (CMA_0)

In [None]:
eval_name = 'CMA_0'
# Restore the weights stored by the callback for this selection criterion
# (element 3 of its best-model record)
best_weights = save_best_model.best_models[eval_name][3]
model.set_weights(best_weights)
# Persist the restored model under <model_path>/<eval_name>
model_save_path = os.path.join(model_path, eval_name)
model.save(model_save_path)
# Evaluate the restored model
model_eval = evaluate_model(p, model, input_ids)
# Store the evaluation data (y_hat values and metrics) under this evaluation name
model_eval.save_model_evaluation_data(base_path, eval_name=eval_name)
model_eval.targets_df.head()

In [None]:
# Training curve of the MAE metric (no moving average overlay for CMA_0)
utils.plot_train_metrics(history=history.history, 
                         metrics=['mean_absolute_error'], 
                         p=p,
                         figsize=(15,23))
# Error distribution plot
model_eval.plot_error_dist(figsize=(20,7), hue='cell_cycle', sets=['train','val'])
# y and y_hat distribution plot
model_eval.plot_y_dist(figsize=(15,7), x='cell_cycle', sets=['train','val'])
# Residuals plot
model_eval.plot_residuals(figsize=(10,7), hue='cell_cycle')
# Target vs predicted
model_eval.plot_y_vs_y_hat(figsize=(7,7), hue='cell_cycle')

In [None]:
# Compute the metrics of the CMA_0 model
model_eval.get_metrics()
# Accumulate them for the final comparison.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
metrics_df = pd.concat([metrics_df, model_eval.metrics_df], ignore_index=True)
# Display every row except those of the test set
mask = model_eval.metrics_df.Set == 'test'
model_eval.metrics_df[~mask]

## 3.- Best model wrt Central Moving Average of size 11 (CMA_11)

In [None]:
eval_name = 'CMA_11'
# Restore the weights stored by the callback for this selection criterion
# (element 3 of its best-model record)
best_weights = save_best_model.best_models[eval_name][3]
model.set_weights(best_weights)
# Persist the restored model under <model_path>/<eval_name>
model_save_path = os.path.join(model_path, eval_name)
model.save(model_save_path)
# Evaluate the restored model
model_eval = evaluate_model(p, model, input_ids)
# Store the evaluation data (y_hat values and metrics) under this evaluation name
model_eval.save_model_evaluation_data(base_path, eval_name=eval_name)
model_eval.targets_df.head()

In [None]:
# Training curve of the MAE metric together with the CMA history recorded
# by the callback for this window size
utils.plot_train_metrics(history=history.history, 
                         CMA_history=save_best_model.CMA_history[eval_name],
                         CMA_metric='mean_absolute_error',
                         metrics=['mean_absolute_error'], 
                         p=p,
                         title=eval_name,
                         figsize=(15,23))
# Error distribution plot
model_eval.plot_error_dist(figsize=(20,7), hue='cell_cycle', sets=['train','val'])
# y and y_hat distribution plot
model_eval.plot_y_dist(figsize=(15,7), x='cell_cycle', sets=['train','val'])
# Residuals plot
model_eval.plot_residuals(figsize=(10,7), hue='cell_cycle')
# Target vs predicted
model_eval.plot_y_vs_y_hat(figsize=(7,7), hue='cell_cycle')

In [None]:
# Compute the metrics of the CMA_11 model, annotated with the values stored in
# the callback's best-model record: [0] epoch, [1] CMA value, [2] CMA std
model_eval.get_metrics(CMA_size=11, 
                       CMA=save_best_model.best_models[eval_name][1], 
                       CMA_Std=save_best_model.best_models[eval_name][2], 
                       Epoch=save_best_model.best_models[eval_name][0])
# Accumulate them for the final comparison.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
metrics_df = pd.concat([metrics_df, model_eval.metrics_df], ignore_index=True)
# Display every row except those of the test set
mask = model_eval.metrics_df.Set == 'test'
model_eval.metrics_df[~mask]

## 4.- Best model wrt Central Moving Average of size 21 (CMA_21)

In [None]:
eval_name = 'CMA_21'
# Restore the weights stored by the callback for this selection criterion
# (element 3 of its best-model record)
best_weights = save_best_model.best_models[eval_name][3]
model.set_weights(best_weights)
# Persist the restored model under <model_path>/<eval_name>
model_save_path = os.path.join(model_path, eval_name)
model.save(model_save_path)
# Evaluate the restored model
model_eval = evaluate_model(p, model, input_ids)
# Store the evaluation data (y_hat values and metrics) under this evaluation name
model_eval.save_model_evaluation_data(base_path, eval_name=eval_name)
model_eval.targets_df.head()

In [None]:
# Training curve of the MAE metric together with the CMA history recorded
# by the callback for this window size
utils.plot_train_metrics(history=history.history, 
                         CMA_history=save_best_model.CMA_history[eval_name],
                         CMA_metric='mean_absolute_error',
                         metrics=['mean_absolute_error'], 
                         p=p,
                         title=eval_name,
                         figsize=(15,23))
# Error distribution plot
model_eval.plot_error_dist(figsize=(20,7), hue='cell_cycle', sets=['train','val'])
# y and y_hat distribution plot
model_eval.plot_y_dist(figsize=(15,7), x='cell_cycle', sets=['train','val'])
# Residuals plot
model_eval.plot_residuals(figsize=(10,7), hue='cell_cycle')
# Target vs predicted
model_eval.plot_y_vs_y_hat(figsize=(7,7), hue='cell_cycle')

In [None]:
# Compute the metrics of the CMA_21 model, annotated with the values stored in
# the callback's best-model record: [0] epoch, [1] CMA value, [2] CMA std
model_eval.get_metrics(CMA_size=21, 
                       CMA=save_best_model.best_models[eval_name][1], 
                       CMA_Std=save_best_model.best_models[eval_name][2], 
                       Epoch=save_best_model.best_models[eval_name][0])
# Accumulate them for the final comparison.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
metrics_df = pd.concat([metrics_df, model_eval.metrics_df], ignore_index=True)
# Display every row except those of the test set
mask = model_eval.metrics_df.Set == 'test'
model_eval.metrics_df[~mask]

## 5.- Best model wrt Central Moving Average of size 31 (CMA_31)

In [None]:
eval_name = 'CMA_31'
# Restore the weights stored by the callback for this selection criterion
# (element 3 of its best-model record)
best_weights = save_best_model.best_models[eval_name][3]
model.set_weights(best_weights)
# Persist the restored model under <model_path>/<eval_name>
model_save_path = os.path.join(model_path, eval_name)
model.save(model_save_path)
# Evaluate the restored model
model_eval = evaluate_model(p, model, input_ids)
# Store the evaluation data (y_hat values and metrics) under this evaluation name
model_eval.save_model_evaluation_data(base_path, eval_name=eval_name)
model_eval.targets_df.head()

In [None]:
# Training curve of the MAE metric together with the CMA history recorded
# by the callback for this window size
utils.plot_train_metrics(history=history.history, 
                         CMA_history=save_best_model.CMA_history[eval_name],
                         CMA_metric='mean_absolute_error',
                         metrics=['mean_absolute_error'], 
                         p=p,
                         title=eval_name,
                         figsize=(15,23))
# Error distribution plot
model_eval.plot_error_dist(figsize=(20,7), hue='cell_cycle', sets=['train','val'])
# y and y_hat distribution plot
model_eval.plot_y_dist(figsize=(15,7), x='cell_cycle', sets=['train','val'])
# Residuals plot
model_eval.plot_residuals(figsize=(10,7), hue='cell_cycle')
# Target vs predicted
model_eval.plot_y_vs_y_hat(figsize=(7,7), hue='cell_cycle')

In [None]:
# Compute the metrics of the CMA_31 model, annotated with the values stored in
# the callback's best-model record: [0] epoch, [1] CMA value, [2] CMA std
model_eval.get_metrics(CMA_size=31, 
                       CMA=save_best_model.best_models[eval_name][1], 
                       CMA_Std=save_best_model.best_models[eval_name][2], 
                       Epoch=save_best_model.best_models[eval_name][0])
# Accumulate them for the final comparison.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
metrics_df = pd.concat([metrics_df, model_eval.metrics_df], ignore_index=True)
# Display every row except those of the test set
mask = model_eval.metrics_df.Set == 'test'
model_eval.metrics_df[~mask]

# Compare metrics

In [None]:
# Accumulated metrics of all evaluated best models, excluding the test-set rows
mask = metrics_df.Set == 'test'
metrics_df[~mask]

In [None]:
# Validation-set rows only, sorted by MAE, then Bias, then Std (ascending)
mask = metrics_df.Set == 'val'
metrics_df[mask].sort_values(by=['MAE', 'Bias', 'Std'])

In [None]:
# Save metrics
with open(os.path.join(base_path, 'metrics.csv'), 'w') as file:
    metrics_df.to_csv(file, index=False)

In [None]:
# Final log entry: its presence in the log file shows the notebook ran to the end
msg = 'Notebook execution finished!'
logging.info(msg)