In this notebook we train a convolution neural network (based upon ResNetv2) to classify routes by their grades and achieve 62% accuracy on the test dataset. 

### Building the data pipeline
We load the input data, create the training, validation and testing datasets ensuring the proper distribution of grade 6, 7 and 8's in each, and then build the data pipeline

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from IPython.utils import io

from skopt import gp_minimize, callbacks, load
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt.plots import plot_convergence
from skopt.callbacks import CheckpointSaver
# Need skopt.__version__ > 0.5.2 or pip install git+https://github.com/scikit-optimize/scikit-optimize/

%matplotlib inline
plt.style.use('seaborn-white')
plt.rcParams["image.origin"] = 'lower'
plt.rcParams['figure.figsize'] = (10.0, 8.0)

In [2]:
def load_data(train=0.8, val=0.1, test=0.1, data_format='channels_last'):
    """
    Loads the datasets from data/data.npz and randomly creates the train, test and 
    validation datasets.

    Inputs:
    - train, val, test: the fraction of the dataset in the train dataset, validation dataset
      and test dataset respectively
    - data_format: A string, one of 'channels_last' (default) or 'channels_first'.
      'channels_last' best for CPU and 'channels_first' best for GPU
    Returns:
    - 6 numpy arrays: X_train, y_train, X_val, y_val, X_test, y_test in either
      channel_last or channel_first format
    - grade_dict : a dictionary of coverting the grades to numerical scores.
    """
    assert train + val + test == 1

    # load the data, n.b. arrays are sorted by grade
    loaded = np.load('data/data.npz')
    moves = loaded['moves']
    grades = loaded['grades']
    grade_dict = loaded['grade_dict'][()]

    # Find partition arguments between grade 6, 7 & 8
    part_arg = np.searchsorted(grades, [grade_dict['7A'], grade_dict['8A']])

    # now shuffle within the grade 6's, 7's and 8's
    permute_idx = np.arange(grades.shape[0])
    np.random.shuffle(permute_idx[:part_arg[0]])
    np.random.shuffle(permute_idx[part_arg[0]:part_arg[1]])
    np.random.shuffle(permute_idx[part_arg[1]:])
    moves = moves[permute_idx]
    grades = grades[permute_idx]

    # data processing
    if data_format == 'channels_first':
        moves = np.moveaxis(moves, -1, 1)
    moves = moves.astype(np.float32)

    # create the train, val and test datasets from the grade classes
    part_start = np.append(0, part_arg)
    size = np.array([part_arg[0], part_arg[1] - part_arg[0],
                     len(grades) - part_arg[1]])

    num_val = (val * size).astype(int)
    num_test = (test * size).astype(int)
    num_train = (size - num_val - num_test).astype(int)

    # generate the training, val and test sets
    slice_range = [part_start,
                   part_start + num_train,
                   part_start + num_train + num_val,
                   part_start + num_train + num_val + num_test]
    X, y = [], []
    for j in range(3):
        grade_list, moves_list = [], []
        for i in range(3):
            grade_list.append(grades[slice_range[j][i]: slice_range[j+1][i]])
            moves_list.append(moves[slice_range[j][i]: slice_range[j+1][i]])
        X.append(np.concatenate(moves_list))
        y.append(np.concatenate(grade_list))

    X_train, X_val, X_test = X
    y_train, y_val, y_test = y

    # check: sets are the correct length
    assert (len(y_val) == np.sum(num_val) and len(y_test) == np.sum(num_test)
            and len(y_train) == np.sum(num_train))

    # Normalize the data: subtract the mean pixel and divide by std
    mean_pixel = X_train.mean(axis=(0, 1, 2), keepdims=True)
    std_pixel = X_train.std(axis=(0, 1, 2), keepdims=True)
    X_train = (X_train - mean_pixel) / std_pixel
    X_val = (X_val - mean_pixel) / std_pixel
    X_test = (X_test - mean_pixel) / std_pixel

    return X_train, y_train, X_val, y_val, X_test, y_test, grade_dict


data_format = 'channels_first'
X_train, y_train, X_val, y_val, X_test, y_test, grade_dict = load_data(
    data_format=data_format)
num_classes = len(grade_dict) - 1

print('Train data shape: ', X_train.shape, X_train.dtype)
print('Train labels shape: ', y_train.shape, y_train.dtype)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

Train data shape:  (8789, 3, 18, 11) float32
Train labels shape:  (8789,) int32
Validation data shape:  (1097, 3, 18, 11)
Validation labels shape:  (1097,)
Test data shape:  (1097, 3, 18, 11)
Test labels shape:  (1097,)


In [3]:
batch_size = 256


def construct_datasets(num_epochs=1):
    """
    Constructs the datasets in Tensorflow.

    Inputs: 
    - num_epochs: The number of epochs to run the training data for

    Outputs:
    - next_element_train, next_element_test: get_next() method for train and test dataset iterators.
      The next_element_test is either from the validation or testing dataset depending on which has
      been initialised
    - train_init_op, val_init_op, test_init_op:
      iterator initialisation operations for the respective datasets
    - steps_to_epochs: The number of integers steps to each epoch of the training dataset
    """
    prefetch = 2

    # make sure the dataset is on the CPU to leave the GPU for training the model
    with tf.device('/cpu:0'):
        with tf.variable_scope('train_dataset'):
            dataset_train = tf.data.Dataset.from_tensor_slices(
                (X_train, y_train))
            dataset_train = dataset_train.apply(
                tf.data.experimental.shuffle_and_repeat(len(X_train), count=num_epochs))
            dataset_train = dataset_train.shuffle(len(X_train))
            dataset_train = dataset_train.batch(batch_size).prefetch(prefetch)

        with tf.variable_scope('validation_dataset'):
            dataset_val = tf.data.Dataset.from_tensor_slices((X_val, y_val))
            dataset_val = dataset_val.batch(batch_size).prefetch(prefetch)
        with tf.variable_scope('test_dataset'):
            dataset_test = tf.data.Dataset.from_tensor_slices((X_test, y_test))
            dataset_test = dataset_test.batch(batch_size).prefetch(prefetch)

        iterator_train = tf.data.Iterator.from_structure(dataset_train.output_types,
                                                         dataset_train.output_shapes)
        next_element_train = iterator_train.get_next()
        iterator_test = tf.data.Iterator.from_structure(dataset_train.output_types,
                                                        dataset_train.output_shapes)
        next_element_test = iterator_test.get_next()

        train_init_op = iterator_train.make_initializer(dataset_train)
        val_init_op = iterator_test.make_initializer(dataset_val)
        test_init_op = iterator_test.make_initializer(dataset_test)
        steps_to_epochs = len(X_train) // batch_size

    return next_element_train, next_element_test, train_init_op, val_init_op, test_init_op, steps_to_epochs

## Define the neural network

Our neural network is a deep network based upon Resnetv2 and has the same structure as the CIFAR-10 version of ResNetv2.

In [4]:
initializer = tf.contrib.layers.variance_scaling_initializer()

# Helper layer functions


def batch_norm_relu_conv2d(inputs, filters, is_training, stride=1, reg=1e-4, data_format='channels_last'):
    inputs = batch_norm_relu(inputs, is_training)
    inputs = conv2d(inputs, filters, is_training, stride=stride,
                    reg=reg, data_format=data_format)
    return inputs


def batch_norm_relu(inputs, is_training):
    inputs = tf.layers.batch_normalization(inputs, training=is_training)
    return tf.nn.relu(inputs)


def conv2d(inputs, filters, is_training, kernel_size=3, stride=1, reg=1e-4, data_format='channels_last'):
    inputs = tf.layers.conv2d(inputs, filters, kernel_size, strides=stride, padding="same", kernel_initializer=initializer,
                              kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                  scale=reg),
                              bias_regularizer=tf.contrib.layers.l2_regularizer(
                                  scale=reg),
                              data_format=data_format)
    return inputs

# Resnet unit


def ResNet_unit(inputs, filters, is_training, i, j, subsample=False, reg=1e-4, final_unit=False, data_format='channels_last'):
    with tf.variable_scope(f"conv{i+2}_{j+1}"):
        shortcut = inputs
        stride = 2 if subsample else 1

        # for the first unit batch_norm_relu before splitting into two paths
        if i == 0 and j == 0:
            inputs = batch_norm_relu(inputs, is_training)
            shortcut = inputs
            inputs = conv2d(inputs, filters, is_training,
                            stride=stride, reg=reg, data_format=data_format)
        else:
            inputs = batch_norm_relu_conv2d(
                inputs, filters, is_training, stride=stride, reg=reg, data_format=data_format)
        inputs = batch_norm_relu_conv2d(
            inputs, filters, is_training, reg=reg, data_format=data_format)

        if subsample:
            if data_format == 'channels_last':
                paddings = tf.constant(
                    [[0, 0], [0, 0], [0, 0], [0, filters // 2]])
                # reduce image height and width by striding as in resnet paper
                shortcut = shortcut[:, ::2, ::2, :]
            else:
                paddings = tf.constant(
                    [[0, 0], [0, filters // 2], [0, 0], [0, 0]])
                shortcut = shortcut[:, :, ::2, ::2]
            shortcut = tf.pad(shortcut, paddings)
        inputs = shortcut + inputs

        # Final activation
        if final_unit:
            inputs = batch_norm_relu(inputs, is_training)

        return inputs


def model_ResNetv2(inputs, is_training, total_layers=20, num_classes=10, reg=2e-4, data_format='channels_last'):
    """
    Creates a ResNetv2 model based upon CIFAR-10 ResNet.  
    Total_layers = 6n + 2
    """
    assert (total_layers - 2) % 6 == 0
    num_layers = (total_layers - 2) // 6
    filters = [16, 32, 64]

    # first do a single convolution ResNet_unit with no addition
    with tf.variable_scope("conv1"):
        inputs = conv2d(
            inputs, filters[0], is_training, stride=2, reg=reg, data_format=data_format)

    # now some ResNet units
    for i in range(3):
        for j in range(num_layers):
            # don't subsample on first go round
            subsample = i > 0 and j == 0
            final = i == 2 and j == num_layers-1
            inputs = ResNet_unit(inputs, filters[i], is_training, i, j,
                                 subsample=subsample, reg=reg, final_unit=final, data_format=data_format)

    # Global average pooling, 10 way FC layer and then output to scores.
    # Global average pooling is same as doing reduce_mean
    if data_format == 'channels_last':
        reduce_axis = [1, 2]
    else:
        reduce_axis = [2, 3]
    inputs = tf.reduce_mean(inputs, axis=reduce_axis)
    inputs = tf.layers.flatten(inputs)
    scores = tf.layers.dense(inputs, num_classes, kernel_initializer=initializer,
                             kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                 scale=reg),
                             bias_regularizer=tf.contrib.layers.l2_regularizer(scale=reg))
    return scores

A small test to check that our neutral network works correctly

In [5]:
def test_model_ResNet_fc():
    """ A small unit test for model_ResNetv2 above. """
    tf.reset_default_graph()

    x = tf.zeros((50, 3, 18, 11))
    scores = model_ResNetv2(x, 1, num_classes=num_classes,
                            data_format='channels_first')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        scores_np = sess.run(scores)
        print(scores_np.shape)

# test_model_ResNet_fc()

In [6]:
def sparse_earth_mover(labels, logits):
    """
    Computes the normalised squared Earth Mover’s Distance loss from https://arxiv.org/pdf/1611.05916.pdf.
    Since our classes are ordered this loss behaves much better than the usual softmax cross entropy.

    Inputs:
    - labels: Tensor of shape [batch_size] and dtype int32 or int64.
      Each entry in labels must be an index in [0, num_classes)
    - logits: Unscaled log probabilities of shape [batch_size, num_classes]

    Returns:
    - loss: A Tensor of the same shape as labels and of the same type as logits with the softmax cross entropy loss.
    """

    with tf.name_scope("sparse_earth_mover"):
        num_classes = tf.shape(logits)[-1]
        batch_size = tf.shape(labels)[0]

        logits_norm = tf.nn.softmax(logits)
        one_hot_labels = tf.one_hot(labels, num_classes)

        cdf_labels = tf.cumsum(one_hot_labels, axis=-1)
        cdf_logits = tf.cumsum(logits_norm, axis=-1)
        loss = tf.sqrt(tf.reduce_mean(
            tf.square(cdf_labels - cdf_logits), axis=-1))

    return loss

In [7]:
def check_acc_tb(sess, x, next_element, scores, is_training, FLAG_print=True, grades_away=0):
    """
    Checks the accuracy of a classification model.

    Inputs:
    - sess: A TensorFlow Session that will be used to run the graph
    - x: A TensorFlow placeholder Tensor where input images should be fed
    - next_element: A TensorFlow placeholder Tensor where the next batch of elements will be fed
    - scores: A TensorFlow Tensor representing the scores output from the
      model; this is the Tensor we will ask TensorFlow to evaluate.
    - A TensorFlow placeholder Tensor where a bool should be fed if we are training the dataset
    - grades_away: An int.  Accuracy is calculated within grades_away
    - FLAG_print: a bool to decide if we print how accurate we are

    Returns: Accuracy of the model
    """
    num_correct, num_samples = 0, 0
    with tf.name_scope('accuracy'):
        while True:
            try:
                (x_np, y_np) = sess.run(next_element)
            except tf.errors.OutOfRangeError:
                break
            feed_dict = {x: x_np, is_training: False}
            scores_np = sess.run(scores, feed_dict=feed_dict)
            y_pred = scores_np.argmax(axis=1)
            num_samples += x_np.shape[0]
            num_correct += np.sum(np.abs(y_pred - y_np) <= grades_away)
        assert num_samples > 0
        acc = float(num_correct) / num_samples
        if FLAG_print == True:
            print('Got %d / %d correct (%.2f%%)' %
                  (num_correct, num_samples, 100 * acc))
    return acc

In [8]:
def check_acc_train(x, y, scores, grades_away=0):
    """
    Check accuracy on a classification model from a batch of data.

    Inputs:
    - x: A TensorFlow placeholder Tensor where input images should be fed
    - y: A TensorFlow placeholder Tensor where input classification scores
      should be fed
    - scores: A TensorFlow Tensor representing the scores output from the
      model; this is the Tensor we will ask TensorFlow to evaluate.
    - grades_away: An int.  Accuracy is calculated within grades_away

    Returns: Accuracy of the model on a batch of training data
    """
    num_correct, num_samples = 0, 0
    with tf.name_scope('accuracy'):
        y_pred = tf.argmax(scores, axis=1, output_type=tf.int32)
        bool_acc = tf.abs(y_pred - y) <= grades_away
        acc = tf.cast(tf.count_nonzero(bool_acc), tf.float32) / \
            tf.cast(tf.shape(x)[0], tf.float32)
    return acc

## Define the training loop

In [9]:
def train(model_init_fn, optimizer_init_fn, lr, num_epochs=1, experiment_name="",
          grades_away=1, cross_coeff=0, decay_at=[], decay_to=[],
          save=False, log=True, save_graph=False, val=True):
    """
    Simple training loop for use with models defined using tf.layers. It trains
    a model for num_epochs, peridoically checks the accuracy on the validation
    dataset, logs the training data to Tensorboard, saves the graph, and tests 
    the final accuracy on the test dataset.

    Inputs:
    - model_init_fn: A function that takes no parameters; when called it
      constructs the model we want to train: model = model_init_fn()
    - optimizer_init_fn: A function which takes no parameters; when called it
      constructs the Optimizer object we will use to optimize the model:
      optimizer = optimizer_init_fn()
    - num_epochs: The number of epochs to train for
    - data_format: Channels first or last for the tensors
    - grades_away: An int.  Accuracy is calculated within grades_away
    - experiment_nume: The name to call the experiement when logging and saving
    - cross_coeff: A coefficient infront of the cross entropy loss.
    - deacy_at: A list of epochs to decay the learning rate at
    - decay_to: A list of learning rates to decay to
    - save: A bool to decide if we save the Tensorflow graph after training the model
    - log: A bool to decide to log the training for Tensorboard
    - save_graph: A bool to decide if we save the computational graph for Tensorboard
    - val: A bool to decide if we check the accuracy on the validation data.
      Set to False if there is no validation dataset.

    Returns:
    - acc_val: Accuracy on the validation dataset.  This is np.nan if val=False
    - acc_test: Accuracy on the test dataset
    """

    tf.reset_default_graph()

    # construct the datasets
    (next_element_train, next_element_test, train_init_op,
     val_init_op, test_init_op, steps_to_epochs) \
        = construct_datasets(num_epochs)
    (x, y) = next_element_train

    # declare placeholders
    is_training = tf.placeholder(tf.bool, name='is_training')
    lr_var = tf.Variable(lr, trainable=False, name='learning_rate')

    # Whenever we need to record the loss, feed the mean test accuracy to this placeholder
    with tf.name_scope('acc'):
        tf_acc_ph = tf.placeholder(tf.float32, shape=None, name='acc_summary')
        # Create a scalar summary object for the accuracy so it can be displayed
        tf.summary.scalar(f"accuracy_within_{grades_away}_grades", tf_acc_ph)

    # Use the model function to build the forward pass.
    scores = model_init_fn(x, is_training)

    # Compute the losses
    if cross_coeff != 0:
        loss_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=y, logits=scores)
        loss_cross_entropy = cross_coeff*tf.reduce_mean(loss_cross_entropy)
    else:
        loss_cross_entropy = 0
    loss_em = sparse_earth_mover(labels=y, logits=scores)
    loss_em = tf.reduce_mean(loss_em)
    loss_reg = tf.losses.get_regularization_loss()
    loss = loss_em + loss_reg + loss_cross_entropy

    # Tensorboard logging scalars
    if cross_coeff != 0:
        tf.summary.scalar('loss_cross_entropy', loss_cross_entropy)
    tf.summary.scalar('loss_em', loss_em)
    tf.summary.scalar('loss_reg', loss_reg)
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('learning_rate', lr_var)

    # initialise the optimizer and create the training operation
    optimizer = optimizer_init_fn(lr_var)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        with tf.name_scope('train'):
            train_op = optimizer.minimize(loss)

    # check train accuarcy function
    acc_train_op = check_acc_train(x, y, scores, grades_away=grades_away)

    with tf.Session() as sess:

        # Tensorboard, merge all summaries but the error ones
        merged = tf.summary.merge_all(scope="(?!acc)")
        merged_acc = tf.summary.merge_all(scope="(acc)")

        sess.run(tf.global_variables_initializer())

        # Create the saver and Tensorboard log writers
        if save:
            saver = tf.train.Saver()
        if log:
            log_path = "C:/tmp/logs"
            if save_graph:
                train_writer = tf.summary.FileWriter(
                    log_path + '/train/' + experiment_name, sess.graph)
            else:
                train_writer = tf.summary.FileWriter(
                    log_path + '/train/' + experiment_name)
            test_writer = tf.summary.FileWriter(
                log_path + '/test/' + experiment_name)

        # Initialize an iterator over the training dataset.
        sess.run(train_init_op)
        t = 0
        while True:
            # decay learning rate
            if (t / steps_to_epochs) in decay_at:
                lr_var.load(
                    decay_to[decay_at.index(t / steps_to_epochs)], sess)

            # train on next batch of data
            feed_dict = {is_training: True}
            try:
                # check running accuracy on training batch and add to tensorboard every 20 steps
                if (t + 1) % 20 == 0:
                    summary, _, acc_train = sess.run(
                        [merged, train_op, acc_train_op], feed_dict=feed_dict)
                    if log:
                        train_writer.add_summary(summary, t)
                        train_writer.add_summary(
                            sess.run(merged_acc, feed_dict={tf_acc_ph: acc_train}), t)
                else:
                    # train normally
                    loss_np, _ = sess.run(
                        [loss, train_op], feed_dict=feed_dict)
                    # stop training if loss blows up
                    if np.isnan(loss_np):
                        if val:
                            # roughly expected accuracy from random guess
                            return (3/15, 3/15)
                        else:
                            break
            except tf.errors.OutOfRangeError:
                break
            t += 1

            # Check accuacry on validation dataset every epoch
            if t % steps_to_epochs == 0 and log and val:
                sess.run(val_init_op)
                acc_val = check_acc_tb(sess, x, next_element_test, scores, is_training,
                                       FLAG_print=False, grades_away=grades_away)
                test_writer.add_summary(
                    sess.run(merged_acc, feed_dict={tf_acc_ph: acc_val}), t)

        # End of training.  Calculate accuracy on validation dataset
        if val:
            sess.run(val_init_op)
            acc_val = check_acc_tb(sess, x, next_element_test, scores, is_training,
                                   FLAG_print=False, grades_away=grades_away)
            if log:
                test_writer.add_summary(
                    sess.run(merged_acc, feed_dict={tf_acc_ph: acc_val}), t)
        else:
            acc_val = np.NaN

        print('End of training')
        if val:
            print(f"Validation accuracy is {acc_val}")

        # Calculate accuracy on test dataset
        sess.run(test_init_op)
        acc_test = check_acc_tb(
            sess, x, next_element_test, scores, is_training, grades_away=grades_away)
        print(f"Accuracy on the test dataset is {acc_test}")

        # Save the graph to disk.
        if save:
            save_path = saver.save(sess, f"C:/tmp/save/{experiment_name}.ckpt")
        return acc_val, acc_test

Before we optimiser our hyperparameters we check the model with some sensible parameters.

In [10]:
num_epochs = 20
total_layers = 14
learning_rate = 0.005
reg = 0.002
cross_coeff = 0.1

name = (f"climbing_ResNet{total_layers}_lr{learning_rate}"
        f"_reg{reg}_cross{cross_coeff}_momentum")


def model_init_fn(inputs, is_training, total_layers=total_layers, reg=reg):
    return model_ResNetv2(inputs, is_training, total_layers=total_layers, reg=reg,
                          num_classes=num_classes, data_format=data_format)


def optimizer_init_fn(lr):
    return tf.train.MomentumOptimizer(lr, momentum=0.9)


train(model_init_fn, optimizer_init_fn, learning_rate, num_epochs=num_epochs,
      experiment_name=name, cross_coeff=cross_coeff)

End of training
Validation accuracy is 0.545123062898815
Got 554 / 1097 correct (50.50%)
Accuracy on the test dataset is 0.5050136736554239


## Optimise the hyperparameters
We use `scikit-optimize` to perform a random grid search and Gaussian process optimisation to find the best hyperparameters: `learning rate`, `reg` and `cross_coeff`.  We train a 14 layer network over 150 epochs.  This optimisation process takes 60 minutes on my laptop.  We choose to use a Momentum optimizer as it tends to give better test performance than an Adam one.  For the learning rate we follow a process similar to the ResNet papers: we divide the learning rate by 5 at 60% of the way through the training, and we also 'warm up' the model for 10 epochs at `learning_rate / 5`.

In [11]:
num_epochs = 150
total_layers = 14
num_calls = 30

# learning_rate decay similar to ResNet paper: intial warm up, then divide learning rate by 10 at 50% and 75%

dim_learning_rate = Real(low=1e-4, high=1e-1, prior='log-uniform',
                         name='learning_rate')
dim_reg = Real(low=1e-5, high=1e0, prior='log-uniform',
               name='reg')
dim_cross_coeff = Real(low=1e-4, high=1e0, prior='log-uniform',
                       name='cross_coeff')
dimensions = [dim_learning_rate, dim_reg, dim_cross_coeff]
checkpoint_path = f"C:/tmp/gp_search_checkpoint_epochs_{num_epochs}.pkl"
checkpoint_saver = CheckpointSaver(checkpoint_path, compress=9)


@use_named_args(dimensions=dimensions)
def op_acc(learning_rate, reg, cross_coeff):
    # setup decay scheme
    decay_at = [10, int(num_epochs * 0.6)]
    decay_to = [learning_rate, learning_rate / 5]
    name = (f"climbing_ResNet{total_layers}_lr{learning_rate}_reg{reg}"
            f"_cross{cross_coeff}_momentum_decay_gp_search")
    learning_rate = learning_rate / 5

    def model_init_fn(inputs, is_training, total_layers=total_layers, reg=reg):
        return model_ResNetv2(inputs, is_training, total_layers=total_layers,
                              reg=reg, num_classes=num_classes, data_format=data_format)

    def optimizer_init_fn(lr):
        return tf.train.MomentumOptimizer(lr, momentum=0.9)

    # Stop printing from train function
    # (https://stackoverflow.com/questions/23610585/ipython-notebook-avoid-printing-within-a-function/23611571#23611571)
    with io.capture_output() as captured:
        acc_val, _ = train(model_init_fn, optimizer_init_fn, learning_rate, num_epochs=num_epochs,
                           experiment_name=name, cross_coeff=cross_coeff,
                           log=True, decay_at=decay_at, decay_to=decay_to)
    return -acc_val


x0 = [0.005, 0.002, 0.1]
search_result = gp_minimize(func=op_acc, dimensions=dimensions, n_calls=num_calls,
                            callback=[checkpoint_saver], verbose=True, x0=x0)
plot_convergence(search_result)
print(search_result.x)

A plot of how the validation accuracy increased during our hyperparameter optimisation

![Convergence plot](convergence_plot.png)

Retrain the model with the validation data included in the training dataset.

In [12]:
[learning_rate, reg, cross_coeff] = search_result.x

# add the validation data to the training set.  Keep the test dataset untouched
X_train = np.concatenate((X_train, X_val))
y_train = np.concatenate((y_train, y_val))

# setup decay scheme
decay_at = [10, int(num_epochs * 0.6)]
decay_to = [learning_rate, learning_rate / 5]
name = (f"climbing_ResNet{total_layers}_lr{learning_rate}_reg{reg}"
        f"_cross{cross_coeff}_momentum_decay")
learning_rate = learning_rate / 5


def model_init_fn(inputs, is_training, total_layers=total_layers, reg=reg):
    return model_ResNetv2(inputs, is_training, total_layers=total_layers,
                          reg=reg, num_classes=num_classes, data_format=data_format)


def optimizer_init_fn(lr):
    return tf.train.MomentumOptimizer(lr, momentum=0.9)


_, acc_test = train(model_init_fn, optimizer_init_fn, learning_rate, num_epochs=num_epochs,
                    experiment_name=name, cross_coeff=cross_coeff,
                    decay_at=decay_at, decay_to=decay_to, val=False)

End of training
Got 676 / 1097 correct (61.62%)
Accuracy on the test dataset is 0.6162260711030082
