# Semantic Segmentation of Water using U-Net
# Part 7 - Hyperparameter Tuning

In [None]:
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D
from tensorflow.keras.layers import concatenate, Conv2DTranspose
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, save_img
import numpy as np
import json, os
from random import shuffle
from PIL import Image
import matplotlib
import matplotlib.pyplot as plt
import pickle
import pandas as pd
import warnings
import re
import time

from unetlib.metrics import BinaryMeanIoU
from unetlib.model import UNet
from unetlib.preprocessing import get_lakes_with_masks, make_dataframes_for_flow, make_img_msk_flows
import unetlib.visualisation as vs

In [None]:
# Wrapper for training process with some hyperparameters
def train_unet(model, batch_size=16, epochs=100,
               optimiser='RMSProp', learning_rate=None, save_as=None):
    """Compile and train ``model``, checkpointing weights and saving history.

    The image and mask locations are read from the module-level
    ``nwpu_data_dir`` and ``nwpu_mask_dir`` variables, which must be defined
    before this function is called.

    Parameters
    ----------
    model : Keras model
        Model to train. It is compiled here with binary cross-entropy loss
        and a ``BinaryMeanIoU(threshold=0.5)`` metric, so any previous
        compilation is replaced.
    batch_size : int, default 16
        Batch size passed to the data flows; also used to derive the number
        of steps per epoch.
    epochs : int, default 100
        Number of training epochs.
    optimiser : str or optimizer instance, default 'RMSProp'
        Optimiser name (resolved with ``tf.keras.optimizers.get``) or an
        already-constructed optimiser, which is used as given.
    learning_rate : float, optional
        Learning rate applied to the optimiser. Only used when ``optimiser``
        is given by name; a pre-built optimiser instance keeps its own rate.
    save_as : str, optional
        Prefix (may include a directory) for the output files. Defaults to
        ``model.name``.

    Returns
    -------
    hist_filepath : str
        Path of the pickle file holding ``history.history``.
    runtime : float
        Seconds spent inside ``model.fit``.
    """
    # Split the test/train data. Only the training portion is used here;
    # the fixed random_state presumably keeps the split identical between
    # tuning runs (verify against unetlib.preprocessing).
    (train_img_df, train_msk_df,
     _, _) = make_dataframes_for_flow(nwpu_data_dir,
                                      nwpu_mask_dir,
                                      test_size=0.25,
                                      random_state=42
                                     )

    # Split the training data into train and validation generators
    # with augmentation applied to the training data only
    aug_dict = {'rotation_range': 90,
                'horizontal_flip': True,
                'vertical_flip': True,
                'width_shift_range': 0.15,
                'height_shift_range': 0.15,
                'zoom_range': 0.25
               }

    (train_gen, val_gen,
     train_fps, val_fps) = make_img_msk_flows(train_img_df, train_msk_df,
                                              nwpu_data_dir, nwpu_mask_dir,
                                              val_split=0.3, rescale=1/255.,
                                              aug_dict=aug_dict,
                                              batch_size=batch_size
                                             )

    # Compute steps per epoch; round up so a final partial batch is included
    train_steps = int(np.ceil(len(train_fps) / batch_size))
    val_steps = int(np.ceil(len(val_fps) / batch_size))

    # Output paths: both files share one stem, and therefore one directory
    prefix = model.name if save_as is None else save_as
    stem = f'{prefix}_bs{batch_size}e{epochs}'
    hist_filepath = f'{stem}.history.pickle'
    weights_filepath = f'{stem}.weights.h5'

    out_dir = os.path.dirname(hist_filepath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Configure optimiser
    if isinstance(optimiser, str):
        optimiser = tf.keras.optimizers.get(optimiser)
        if learning_rate is not None:
            # Use the canonical attribute: the `lr` alias is not supported by
            # newer Keras optimisers, where assigning it has no effect.
            optimiser.learning_rate = learning_rate

    # Configure callbacks: keep only the best weights seen so far
    # (ModelCheckpoint monitors `val_loss` unless told otherwise)
    checkpointer = tf.keras.callbacks.ModelCheckpoint(weights_filepath,
                                                      save_best_only=True,
                                                      save_weights_only=True
                                                     )
    callbacks = [checkpointer]

    # Compile model
    model.compile(optimizer=optimiser,
                  loss='binary_crossentropy',
                  metrics=[BinaryMeanIoU(threshold=0.5)]
                 )

    # Train the model and record the time taken. perf_counter is monotonic,
    # so the measurement is unaffected by system clock adjustments.
    t_start = time.perf_counter()
    history = model.fit(train_gen, epochs=epochs, steps_per_epoch=train_steps,
                        validation_data=val_gen, validation_steps=val_steps,
                        callbacks=callbacks
                       )
    runtime = time.perf_counter() - t_start

    # Save history to pickle
    with open(hist_filepath, 'wb') as f:
        pickle.dump(history.history, f)

    return hist_filepath, runtime

To tune the learning rate hyperparameter, and any others for that matter, a selection of values should be tried and whichever yields the best validation loss should be kept. It is not practical or efficient to test every possible value so a common strategy in the literature is to test powers of 10 e.g. 0.001, 0.01, 0.1 etc.

Another common approach is to allow the learning rate to be decreased during the training process. This means that earlier steps can make larger movements but, as the model converges, the learning rate gets smaller so small steps can be made to avoid overshooting the minimum. One way of implementing this is to use a learning rate scheduler to reduce the rate after a certain number of epochs, though it can be difficult to determine at which epochs the rate should be reduced. An alternative is to reduce the learning rate whenever the loss doesn't improve for a certain number of epochs.

The `RMSProp` optimiser I'm using also has a `momentum` parameter. This essentially allows gradient descent to build up speed and can help pass local minima or saddle points. RMSProp also includes a dampening factor, `rho` (the decay rate of its moving average of squared gradients), which helps slow the process to avoid overshooting the minimum.

Speed up convergence - batch norm / learning rate / momentum
Dropout (if overfitting)
Activations - could try sigmoid or tanh (adjust batch norm appropriately)
Ensemble? e.g. train several smaller models and average the predictions