## Train a model

The objective of this notebook is to train and evaluate the model specified in the parameters file.

In [None]:
# For development and debugging:
# reload modules without restarting the kernel
#%load_ext autoreload
#%autoreload 2

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.optimizers import Adam
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import shutil
import json
import math
from datetime import datetime

Load model parameters:

In [None]:
# Do not touch the value of PARAMETERS_FILE!
# When this notebook is executed with jupyter-nbconvert (from script),
# it will be replaced automatically
PARAMETERS_FILE = 'dont_touch_me-input_parameters_file'
# Fail early with a clear message when the parameters file is missing
if not os.path.exists(PARAMETERS_FILE):
    raise Exception('Parameter file {} does not exist!'.format(PARAMETERS_FILE))

# Open parameters: `p` is the dictionary every later cell reads its settings from
with open(PARAMETERS_FILE) as file:
    p = json.load(file)
p.keys()

Set logging:

In [None]:
# Set logging configuration.
# NOTE: logging.basicConfig() only has an effect while the root logger has no
# handlers yet, so this first call decides where log records are written.
import logging
log_file_path = p['log_file_name']
logging.basicConfig(
    filename=log_file_path,
    filemode='w', 
    level=getattr(logging, p['log_level'])
)
logging.info('Parameters loaded from file:\n{}'.format(PARAMETERS_FILE))

#### Model parameters:

In [None]:
# Build a human readable summary of the run configuration, piece by piece,
# then write it to the log and show it in the notebook.
summary_parts = (
    'Dataset:\n{}'.format(p['tf_ds_name']),
    '\nAugmentation:\nRandom Flipping: {}\nRandom 90deg Rotations: {}'.format(
        p['random_horizontal_flipping'], p['random_90deg_rotations']),
    '\nModel:\nArchitecture: {}'.format(p['model_name']),
    '\nLoss function: {}'.format(p['loss']),
    '\nFirst trianing stage Epochs: {}'.format(p['first_trianing_stage_epochs']),
    '\nSecond trianing stage Epochs: {}'.format(p['second_trianing_stage_epochs']),
    '\nSecond trianing stage lr: {}\n\n'.format(p['second_trianing_stage_lr']),
)
msg = ''.join(summary_parts)
logging.info(msg)
print(msg)

In [None]:
# Directory that holds the project's own python modules (Models, Utils)
EXTERNAL_LIBS_PATH = p['external_libs_path']
if os.path.exists(EXTERNAL_LIBS_PATH):
    msg = 'EXTERNAL_LIBS_PATH: {}'.format(EXTERNAL_LIBS_PATH)
    print(msg)
    logging.info(msg)
else:
    # Nothing below can work without the libraries, so stop right here
    msg = 'External library path {} does not exist!'.format(EXTERNAL_LIBS_PATH)
    logging.error(msg)
    raise Exception(msg)

# Make the directory importable, then load the project libraries from it
sys.path.insert(1, EXTERNAL_LIBS_PATH)
from Models import Predef_models as predef_models
from Utils import Tee_Logger as Tee_Logger
import Utils as utils

Create dirs where model output will be saved:

In [None]:
# To keep the existing model dir instead of cleaning (deleting) it,
# uncomment the next line:
#p['clean_model_dir'] = 0

base_path, model_path, checkpoints_path = utils.create_model_dirs(parameters=p)

# Report the three output locations returned by create_model_dirs
msg = ''.join([
    'Base path:\n{}'.format(base_path),
    '\nModel path:\n{}'.format(model_path),
    '\nCheckpoints path:\n{}'.format(checkpoints_path),
])
logging.info(msg)
print(msg)

Set logging again, now writing the log file inside the model output directory:

In [None]:
# Re-point logging to a file inside the model output directory (base_path).
#
# logging.basicConfig() silently does nothing when the root logger already
# has handlers, and logging was already configured earlier in this notebook.
# Without removing the existing handlers first, the records would keep going
# to the first log file and `log_file_path` (also used by Tee_Logger later)
# would never receive them.
import logging
log_file_path = os.path.join(base_path, p['log_file_name'])

root_logger = logging.getLogger()
for old_handler in list(root_logger.handlers):
    root_logger.removeHandler(old_handler)
    old_handler.close()

logging.basicConfig(
    filename=log_file_path,
    filemode='w',
    level=getattr(logging, p['log_level'])
)
# The new file starts empty, so repeat the key information logged so far
logging.info('Parameters loaded from file:\n{}'.format(PARAMETERS_FILE))

msg = 'Base path:\n{}'.format(base_path)
msg += '\nModel path:\n{}'.format(model_path)
msg += '\nCheckpoints path:\n{}'.format(checkpoints_path)
logging.info(msg)

In [None]:
# Make tf ignore the GPUs when requested in the parameters
if p['disable_gpu']:
    msg = "Cuda devices (GPUs) disabled"
    logging.info(msg)
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
physical_devices = tf.config.experimental.list_physical_devices('GPU')
msg = 'Physical GPU devises:\n{}'.format(physical_devices)
logging.info(msg)
print(msg)

# Let TensorFlow allocate GPU memory on demand instead of grabbing it all.
# Memory growth is enabled on every visible GPU because TensorFlow requires
# the setting to be the same on all of them.
if p['set_memory_growth']:
    if not physical_devices:
        msg = 'It was not possible to limit GPU memory: no GPU device available'
        logging.warning(msg)
    else:
        try:
            for gpu_device in physical_devices:
                tf.config.experimental.set_memory_growth(gpu_device, True)
            msg = 'GPU Memory limited!'
            logging.info(msg)
        except Exception as err:
            # Best effort: keep running, but record why it failed
            # (e.g. the GPUs were already initialized).
            msg = 'It was not possible to limit GPU memory: {}'.format(err)
            logging.warning(msg)
    print(msg)

## Load Preprocessing parameters and information:

In [None]:
# Preprocessed data path
pp_path = p['pp_path']

# Load the parameters that were used to preprocess the data.
# The log messages report the file *path*; formatting the file object itself
# would log its repr (<_io.TextIOWrapper ...>) instead.
pp_params_path = os.path.join(pp_path, 'params.json')
with open(pp_params_path) as file:
    pp_params = json.load(file)
msg = 'Loaded data preprocessing parameters from:\n{}'.format(pp_params_path)
logging.info(msg)
# NOTE(review): `seed` is read here but not used in the visible part of this
# notebook - confirm whether the random generators should be seeded with it.
seed = pp_params['seed']

# Load Channels file
channels_path = os.path.join(pp_path, 'channels.csv')
with open(channels_path) as file:
    channels = pd.read_csv(file)
msg = 'Loaded channels file from:\n{}'.format(channels_path)
logging.info(msg)

## Specify input channels

In [None]:
# Names of the channels the model receives as input
selected_channels = p['input_channels']
msg = 'Selected input channels:\n{}'.format(selected_channels)
logging.info(msg)
print(msg)

# Translate channel names into channel ids (same order as selected_channels)
channels_by_name = channels.set_index(['name'])
selected_rows = channels_by_name.loc[selected_channels]
input_ids = np.array(selected_rows.channel_id.values)
msg = 'Corresponding input channel ids:\n{}'.format(input_ids)
logging.info(msg)
print(msg)
print(input_ids.shape)

## Load Dataset

In [None]:
msg = 'Tensorflow dataset {} loaded from:\n{}'.format(p['tf_ds_name'], p['local_tf_datasets'])
logging.info(msg)

# Arguments for tfds.load. With as_supervised=False a dictionary with all the
# features would be returned instead of (image, target) pairs.
tfds_load_args = dict(
    name=p['tf_ds_name'],
    data_dir=p['local_tf_datasets'],
    as_supervised=True,
    shuffle_files=p['shuffle_files'],
    with_info=True,
)
dataset, metadata = tfds.load(**tfds_load_args)

# Load the splits
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

Show information about the dataset:

In [None]:
# Display the dataset information object returned by tfds.load(with_info=True)
metadata

## Process the data
Before training the network, we filter out the unwanted channels, apply random transformations (90deg rotations and horizontal flipping) to augment the **Training** dataset, shuffle the data and create the batches. Also, we perform other operations to improve performance.

**Tune performance**<br>
tf.data.Dataset.prefetch overlaps data preprocessing and model execution while training.
It can be used to decouple the time when data is produced from the time when data is consumed. In particular, the transformation uses a background thread and an internal buffer to prefetch elements from the input dataset ahead of the time they are requested. The number of elements to prefetch should be equal to (or possibly greater than) the number of batches consumed by a single training step. You could either manually tune this value, or set it to **tf.data.experimental.AUTOTUNE** which will prompt the tf.data runtime to tune the value dynamically at runtime.

**Shuffling**<br>
dataset.shuffle() Randomly shuffles the elements of this dataset.
This dataset fills a buffer with `buffer_size` elements, then randomly samples elements from this buffer, replacing the selected elements with new elements. For perfect shuffling, a buffer size greater than or equal to the full size of the dataset is required.

For instance, if your dataset contains 10,000 elements but buffer_size is set to 1,000, then `shuffle` will initially select a random element from only the first 1,000 elements in the buffer. Once an element is selected, its space in the buffer is replaced by the next (i.e. 1,001-st) element, maintaining the 1,000 element buffer.

**reshuffle_each_iteration** controls whether the shuffle order should be different for each epoch.

In [None]:
# Source:
# https://www.tensorflow.org/tutorials/images/data_augmentation

def filter_channels(image, target):
    """Keep only the selected input channels of an image.

    The image is cast to float32 and the channels listed in the global
    `input_ids` are picked from its last axis, in the order given by
    `input_ids`.

    Selecting with tf.gather (instead of multiplying by a 0/1 projection
    matrix) avoids rebuilding a dense matrix on every call, does not need the
    total number of channels, and prevents NaN/Inf values in a discarded
    channel from leaking into the kept ones (0 * NaN is NaN in a matmul).

    Args:
        image: image tensor whose last axis is the channel axis. It may carry
            leading axes (e.g. a batch axis), which are left untouched.
        target: target value(s) of the image; returned unchanged.

    Returns:
        Tuple (filtered_image, target), where filtered_image is a float32
        tensor with as many channels as entries in `input_ids`.
    """
    image = tf.cast(image, dtype=tf.float32)

    # Pick the selected channels along the last (channel) axis
    selected = tf.gather(image, indices=input_ids, axis=-1)

    return selected, target

def augment(image, target):
    """Filter the channels of an image and randomly flip/rotate it.

    After channel filtering, the image is randomly flipped horizontally
    (if p['random_horizontal_flipping']) and rotated by a random multiple of
    90 degrees, i.e. 0, 90, 180 or 270 degrees (if p['random_90deg_rotations']).

    The number of rotations is drawn with tf.random.uniform rather than
    np.random: this function is applied through tf.data.Dataset.map, which
    traces the Python code only once, so a NumPy random number would be frozen
    at trace time and every image would get the very same rotation.

    Args:
        image: image tensor with the channels in the last axis.
        target: target value of the image; returned unchanged.

    Returns:
        Tuple (augmented_image, target).
    """
    image, target = filter_channels(image, target)

    # random Left and right flip
    if p['random_horizontal_flipping']:
        image = tf.image.random_flip_left_right(image)

    # random rotations
    if p['random_90deg_rotations']:
        # Number of 90deg rotations, drawn anew for every image: 0, 1, 2 or 3
        k = tf.random.uniform(shape=[], minval=0, maxval=4, dtype=tf.int32)
        image = tf.image.rot90(image, k=k)

    return image, target

In [None]:
# Describe which augmentation techniques are active for the training set
flip_note = ''
if p['random_horizontal_flipping']:
    flip_note = 'Random horizontal flipping for training set selected!'
rotation_note = ''
if p['random_90deg_rotations']:
    rotation_note = '\nRandom 90 degrees rotations (0, 90, 180 or 270 deg) for training set selected!'

msg = flip_note + rotation_note
if not msg:
    msg = 'No data augmentation technique selected for trainingset!'
logging.info(msg)
print(msg)

Take a look at one image and some random transformations of it (random rotation + random horizontal flipping):

In [None]:
def visualize_cell(image):
    """Plot one cell image and, if augmentation is enabled, 4 augmented versions.

    Channels 10 to 12 of the image are shown. When any augmentation flag in
    `p` is set, a second figure with four independent calls to `augment` is
    drawn next to each other.

    NOTE(review): the same 10:13 channel slice is applied to the raw image and
    to the output of `augment`, which has already been reduced to the selected
    input channels - confirm that the slice addresses the intended channels in
    both cases.

    Args:
        image: image tensor (height, width, channels) as delivered by the
            dataset, before channel filtering.
    """
    def _show(img, title):
        # Draw the fixed channel slice of `img` on the current axes
        plt.title(title)
        plt.imshow(img.numpy()[:,:,10:13],
                   cmap=plt.cm.PiYG,
                   vmin=0, vmax=1)

    plt.figure(figsize=(6,4))
    _show(image, 'Original Cell')

    augmentation_enabled = p['random_horizontal_flipping'] | p['random_90deg_rotations']
    if not augmentation_enabled:
        return

    plt.figure(figsize=(15,4))
    for panel in range(1, 5):
        augmented, _ = augment(image, 0)
        plt.subplot(1,4,panel)
        _show(augmented, 'Augmented Cell')

In [None]:
# Get one image from the training dataset (its target is not needed here)
image, _ = next(iter(train_data))

In [None]:
# Visualize the original image vs. randomly flipped and rotated versions of it
visualize_cell(image)

Prepare datasets for training the CNN:

In [None]:
BATCH_SIZE = p['BATCH_SIZE']
buffer_size = 512
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Training split: shuffle (new order every epoch) -> filter channels and
# augment -> batch -> prefetch
train_data = train_data.shuffle(buffer_size=buffer_size, reshuffle_each_iteration=True)
train_data = train_data.map(augment, num_parallel_calls=AUTOTUNE)
train_data = train_data.batch(BATCH_SIZE).prefetch(AUTOTUNE)

# Validation split: only channel filtering, no augmentation
val_data = val_data.map(filter_channels, num_parallel_calls=AUTOTUNE)
val_data = val_data.batch(BATCH_SIZE).prefetch(AUTOTUNE)

# Test split: channel filtering and prefetching only (it is not batched here)
test_data = test_data.map(filter_channels, num_parallel_calls=AUTOTUNE)
test_data = test_data.prefetch(AUTOTUNE)

## Model Selection

Models are selected from a group of predefined models in the class `Predef_models` (in `Models.py`). The name of the selected model is specified in the parameter `p['model_name']`.

First we need to init the `Predef_models` class:

In [None]:
# Init models class
models = predef_models()

# The model input has the spatial shape of the dataset images and one
# channel per selected input channel
n_input_channels = input_ids.shape[0]
img_shape = metadata.features['image'].shape[:-1] + (n_input_channels,)

# Select model
model = models.select_model(model_name=p['model_name'], input_shape=img_shape)

Select the loss function and build the model:

In [None]:
# Select the loss function
if p['loss'] == 'mse':
    loss = tf.keras.losses.MeanSquaredError()
elif p['loss'] == 'huber':
    loss = tf.keras.losses.Huber(delta=1.0)
else:
    # Fail here with a clear message instead of leaving `loss` undefined and
    # crashing later with a NameError when the model is compiled.
    msg = 'Unknown loss function "{}". Supported values: "mse", "huber"'.format(p['loss'])
    logging.error(msg)
    raise ValueError(msg)
msg = '{} loss function selected. Building the model...'.format(p['loss'])
logging.info(msg)
print(msg)

# Metrics reported during training; also reused by the second training
# stage and by the training plots
metrics = ['mse', 'mean_absolute_error']
model.compile(optimizer='adam',
              loss=loss,
              metrics=metrics
             )
msg = 'Model compiled!'
logging.info(msg)

Take a look into the model architecture and number of parameters:

In [None]:
# Duplicates sys.stdout to the log file, so the printed summary is logged too
TeeLog = Tee_Logger(log_file_path)
model.summary()

In [None]:
# Finish stdout duplication started before printing the model summary
TeeLog.close()

## First stage of training

First train only non-pretrained layers i.e. the last (dense) layers:

Set callback to save best weights:

In [None]:
# After every epoch, keep only the weights with the lowest validation loss
first_stage_checkpoint_file = checkpoints_path+'/checkpoint_first_trianing_stage'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=first_stage_checkpoint_file,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    save_freq='epoch',
)

Train the model:

In [None]:
# Mark the beginning of the first training stage in the log file
msg = 'Starting first training stage...'
logging.info(msg)

In [None]:
# Duplicates sys.stdout to the log file
TeeLog = Tee_Logger(log_file_path)

# Size of the training split (only needed if steps_per_epoch is set by hand)
n_train = metadata.splits['train'].num_examples

# Fit model
first_stage_fit_args = dict(
    validation_data=val_data,
    epochs=p['first_trianing_stage_epochs'],
    callbacks=[model_checkpoint_callback],
    verbose=2,  # one line per epoch (1 would show a progress bar)
)
first_trianing_stage_history = model.fit(train_data, **first_stage_fit_args)

In [None]:
# Finish stdout duplication started for the first training stage
TeeLog.close()

In [None]:
# Save the per-epoch loss/metric values of the first stage as JSON
first_stage_history_file = os.path.join(base_path, 'first_trianing_stage_history.json')
with open(first_stage_history_file, 'w') as file:
    json.dump(first_trianing_stage_history.history, file, indent=4)

In [None]:
# Mark the end of the first training stage in the log file
msg = 'First training stage completed!\n\n\n'
logging.info(msg)

## Second Training Stage

Now, train the whole architecture using a small learning rate:

In [None]:
# Make all layers trainable.
# The model has to be compiled again afterwards for the change to take effect.
for layer in model.layers:
    layer.trainable = True

In [None]:
# Compile the model again, now with the (smaller) learning rate of the
# second training stage; loss and metrics stay the same
second_stage_optimizer = Adam(learning_rate=p['second_trianing_stage_lr'])
model.compile(
    optimizer=second_stage_optimizer,
    loss=loss,
    metrics=metrics,
)
msg = 'Model with all layers trainable compiled!'
logging.info(msg)

In [None]:
# Load the best weights of the first training stage (saved by its checkpoint
# callback) as the starting point of the second stage
model.load_weights(checkpoints_path+'/checkpoint_first_trianing_stage')

In [None]:
# Save the best model: after every epoch the whole model (not only its
# weights) is stored whenever the validation loss improves
best_model_dir = checkpoints_path+'/best_model'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=best_model_dir,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
    save_freq='epoch',
)

Train all the layers:

In [None]:
# Mark the beginning of the second training stage in the log file
msg = 'Starting second training stage...'
logging.info(msg)

In [None]:
# Duplicates sys.stdout to the log file
TeeLog = Tee_Logger(log_file_path)

# Fit the whole model
second_stage_fit_args = dict(
    validation_data=val_data,
    epochs=p['second_trianing_stage_epochs'],
    callbacks=[model_checkpoint_callback],
    verbose=2,  # one line per epoch (1 would show a progress bar)
)
second_trianing_stage_history = model.fit(train_data, **second_stage_fit_args)

In [None]:
# Finish stdout duplication started for the second training stage
TeeLog.close()

In [None]:
# Save the per-epoch loss/metric values of the second stage as JSON
second_stage_history_file = os.path.join(base_path, 'second_trianing_stage_history.json')
with open(second_stage_history_file, 'w') as file:
    json.dump(second_trianing_stage_history.history, file, indent=4)

In [None]:
# Mark the end of the second training stage in the log file
msg = 'Second training stage completed!\n\n\n'
logging.info(msg)

In [None]:
# Save the model as it is after the last epoch of the second training stage
msg = 'Saiving trained model'
logging.info(msg)

# Save model
model.save(model_path)

In [None]:
# Concatenate the history of both training stages.
#
# The second stage starts from the weights saved at the end of the best
# first-stage epoch (index `join_idx`, counting from 0). The joined history
# therefore keeps the first-stage epochs up to AND INCLUDING that epoch
# (slice [:join_idx + 1]); slicing with [:join_idx] would drop the very epoch
# the second stage continues from.
first_stage_curves = first_trianing_stage_history.history
second_stage_curves = second_trianing_stage_history.history

join_idx = int(np.argmin(first_stage_curves['val_loss']))
print('Best val_loss during first stage in epoch: {}'.format(join_idx))

history = {
    k: first_stage_curves[k][:join_idx + 1] + second_stage_curves[k]
    for k in first_stage_curves
}

In [None]:
# Write the joined training history and the parameters of this run next to
# the other outputs, both as indented JSON
outputs_to_save = (
    ('history.json', history),
    ('parameters.json', p),
)
for out_name, out_content in outputs_to_save:
    with open(os.path.join(base_path, out_name), 'w') as file:
        json.dump(out_content, file, indent=4)

## Training Plots

In [None]:
# Show the directory where the outputs of this run were saved
print('base_path="{}"\n'.format(base_path))

In [None]:
# Optional: uncomment to reload a previously saved history and its parameters
# (e.g. to re-create the plots without training again)
# Load history
#path = ''
#with open(os.path.join(path, 'history.json'), 'r') as file:
#    history = json.load(file)
# Load parameters
#with open(os.path.join(base_path, 'parameters.json'), 'r') as file:
#    p = json.load(file)
#metrics = ['mse', 'mean_absolute_error']

In [None]:
# Plot History
def plot_loss(history):
    """Plot training and validation curves of the loss and of every metric.

    One subplot per key in ['loss'] + metrics is drawn (stacked vertically),
    showing the training curve, the validation curve and a red marker on the
    epoch with the lowest validation value.

    The upper limit of the y axis ignores the epochs before `warm_stage`
    (the first-stage epochs plus the first 20% of the second-stage epochs).

    Reads the globals `metrics`, `join_idx` and `p`.

    Args:
        history: dict mapping each key and its 'val_' counterpart to a list
            with one value per epoch.
    """
    plt.figure(figsize=(10,18))
    warm_stage = int(join_idx + p['second_trianing_stage_epochs']*0.20)

    for i, key in enumerate(['loss'] + metrics, 1):
        val_key = 'val_'+key
        train_curve = history[key]
        val_curve = history[val_key]

        # y limits: global minimum, maximum only over the epochs after warm up
        min_val = np.asarray(train_curve+val_curve).min()
        max_val = np.asarray(train_curve[warm_stage:]+val_curve[warm_stage:]).max()

        plt.subplot(3,1,i)
        plt.plot(train_curve, label=key)
        plt.plot(val_curve, label=val_key)

        # Highlight the epoch with the best (lowest) validation value
        val_min = np.asarray(val_curve).min()
        val_min_idx = np.argmin(val_curve)
        label='bets val value\nEpoch={}\n{}={}'.format(val_min_idx,key,round(val_min,2))
        plt.scatter(x=val_min_idx, y=val_min, c='red', linewidths=4, label=label)

        plt.grid(True)
        plt.ylim([min_val, max_val])
        plt.xlabel('Epoch')
        plt.ylabel(key)
        plt.legend()
        plt.title(key)

# Plot the joined history of both training stages
plot_loss(history)

## Evaluate the whole Dataset

In [None]:
# First load the weights of the best model (stored under 'best_model' in the
# checkpoints directory) into the model
model.load_weights(os.path.join(checkpoints_path, 'best_model/variables/variables'))

Reload the dataset with flag `as_supervised=False` to load also the `mapobject_id_cell`.

In [None]:
# Drop the references to the training pipelines and to the loaded dataset
# before the dataset is loaded again
del(train_data, val_data, test_data)
del(dataset, metadata)

In [None]:
# Reload the dataset as dictionaries with all the features
# (as_supervised=False) and without shuffling the files
tfds_reload_args = dict(
    name=p['tf_ds_name'],
    data_dir=p['local_tf_datasets'],
    as_supervised=False,
    shuffle_files=False,
    with_info=True,
)
dataset, metadata = tfds.load(**tfds_reload_args)

train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

In [None]:
BATCH_SIZE = p['BATCH_SIZE']
AUTOTUNE = tf.data.experimental.AUTOTUNE

# For evaluation every split gets the same treatment: batch and prefetch
# (no shuffling, no augmentation)
train_data, val_data, test_data = (
    split.batch(BATCH_SIZE).prefetch(AUTOTUNE)
    for split in (train_data, val_data, test_data)
)

In [None]:
# Predict every cell of every split and collect true value, prediction,
# cell id and split name in one table.
#
# The per-batch tables are gathered in a list and concatenated once at the
# end: growing a DataFrame with pd.concat inside the loop copies all previous
# rows on every batch (quadratic cost), and starting from an empty DataFrame
# makes the resulting dtypes depend on the pandas version.
columns = ['y', 'y_hat', 'mapobject_id_cell', 'set']
batch_frames = []

dss = [train_data, val_data, test_data]
ds_names = ['train', 'val', 'test']
for ds, dsn in zip(dss, ds_names):
    for cells in ds:
        # Cell ids are decoded to str
        cell_ids = [cell_id.decode() for cell_id in cells['mapobject_id_cell'].numpy()]
        Y = cells['target'].numpy()
        # Same channel selection as during training. The target is passed
        # explicitly instead of relying on a leftover global `_`.
        cell_imgs, _ = filter_channels(cells['image'], cells['target'])
        Y_hat = model.predict(cell_imgs)
        temp_df = pd.DataFrame(np.concatenate((Y, Y_hat), axis=1), columns=['y', 'y_hat'])
        temp_df['mapobject_id_cell'] = cell_ids
        temp_df['set'] = dsn
        batch_frames.append(temp_df)

if batch_frames:
    targets_df = pd.concat(batch_frames, axis=0, ignore_index=True)[columns]
else:
    # No data at all: keep the expected (empty) table layout
    targets_df = pd.DataFrame(columns=columns)
targets_df

In [None]:
# Save targets info (true values, predictions, cell ids and split) as csv
targets_file = os.path.join(base_path, 'targets.csv')
with open(targets_file, 'w') as file:
    targets_df.to_csv(file, index=False)

## Plot Fitted values
Now lets see how our model performs.

First, we plot the error distribution divided by set:

In [None]:
# Prediction error of every cell
temp_df = targets_df.copy()
temp_df['diff'] = temp_df['y'] - temp_df['y_hat']

plt.figure(figsize=(20,7))
order = ['train', 'val', 'test']
colors = ['orange', 'blue', 'green']
# With `hue`, seaborn takes the colors from `palette`; a `color` argument is
# ignored, so the intended colors are given as an explicit set -> color map.
set_palette = dict(zip(order, colors))

# Left: kernel density estimate of the error, one curve per set
plt.subplot(1,2,1)
sns.kdeplot(x='diff',
            common_norm=True,
            data=temp_df,
            hue='set',
            hue_order=order,
            palette=set_palette,
            fill=True,  # `shade` is the deprecated name of this option
            bw_method=0.2)
plt.xlabel('y - y_hat')
plt.title('Error KDE per set')

# Right: box plot of the error, one box per set
plt.subplot(1,2,2)
sns.boxplot(y='diff',
            x='set',
            order=order,
            data=temp_df)
plt.ylabel('y - y_hat')
plt.title('Error Distribution per set')

Now we compare the distribution between *y* and *y_hat*:

In [None]:
# Reshape to long format: one row per (cell, set, variable), where the
# variable is either 'y' or 'y_hat'
stacked_values = targets_df.copy().set_index(['mapobject_id_cell', 'set']).stack()
temp_df = stacked_values.reset_index()
temp_df.columns = ['mapobject_id_cell', 'set', 'var', 'value']

# Box plot of true vs. predicted values, split by set
plt.figure(figsize=(15,7))
sns.boxplot(data=temp_df,
            x='var',
            y='value',
            hue='set',
            hue_order=order)
plt.title('Transcription Rate (TR) values distribution')

In [None]:
# Mark the end of the notebook execution in the log file
msg = 'Notebook execution finished!'
logging.info(msg)