# Running and evaluating the model

We use Tensorflow to run the simple linear regression model as we can easily use its dataset objects to create the data pipeline and we have ready made optimisers to use too.  Also, by writing the model in Tensorflow we could easily change it to a more complicated neural network model at a later stage.

We have used scikit-optimize to optimize the hyperparameters of the model using the 2014 validation dataset and then train the model on the all of the training data (including the validation dataset).  We obtain a mean error of £640 on the test (2015) dataset.

In [1]:
import tensorflow as tf
import numpy as np

# for hyperparameter search
# pip install scikit-optimize
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

path_train = 'train_shuf.csv'
path_val = 'val.csv'
path_test = 'test.csv'

In [2]:
def map_to_xy(*tup):
    x = tf.stack(tup[1:], axis=0)
    y = tf.squeeze(tup[0])
    return (x, y)

In [3]:
batch_size = 2 ** 12
shuffle_size = 10 ** 6


def construct_datasets(num_epochs, val):
    # Construct the datasets
    # val = True: use the 2014 data for the test dataset
    # val = False: use 2015 data for test data set and val & train data for train dataset
    if val:
        filenames_train = [path_train]
        filenames_test = path_val
    else:
        filenames_train = [path_train, path_val]
        filenames_test = path_test

    prefetch = 10
    defaults = [tf.float32] * 8

    # make sure the dataset is on the CPU to leave the GPU for training the model
    # although this model is so simple it's quicker to leave the training on the CPU too
    with tf.device('/cpu:0'):
        with tf.name_scope('dataset_train'):
            dataset_train = tf.data.experimental.CsvDataset(
                filenames_train, defaults)
            dataset_train = dataset_train.apply(
                tf.data.experimental.shuffle_and_repeat(shuffle_size, count=num_epochs))
            dataset_train = dataset_train.map(
                map_to_xy).batch(batch_size).prefetch(prefetch)
            next_element_train = dataset_train.make_one_shot_iterator().get_next()

        with tf.variable_scope('dataset_test'):
            dataset_test = tf.data.experimental.CsvDataset(
                filenames_test, defaults)
            dataset_test = dataset_test.map(map_to_xy).batch(
                batch_size).prefetch(prefetch)
            next_element_test = dataset_test.make_one_shot_iterator().get_next()

    return next_element_train, next_element_test

In [12]:
def train(num_epochs=1, val=True, learning_rate=20000, reg=0.006, alpha=1.0):
    tf.reset_default_graph()

    # construct the datasets
    next_element_train, next_element_test = construct_datasets(num_epochs, val)

    x = tf.placeholder(tf.float32, [None, 7])
    y = tf.placeholder(tf.float32, [None])

    # create simple linear model with elastic regularization
    scores = tf.layers.dense(
        x, 1, kernel_regularizer=tf.contrib.layers.l1_l2_regularizer(alpha, 1-alpha))
    scores = tf.squeeze(scores)

    # use l2 loss
    with tf.name_scope('loss'):
        loss = tf.square(scores - y)
        loss_mean = tf.reduce_mean(loss)
        loss_reg = reg * tf.losses.get_regularization_loss()
        loss_total = loss_mean + loss_reg

    # tensorboard logging
    tf.summary.scalar('loss_reg', loss_reg)
    tf.summary.scalar('loss', loss_mean)
    tf.summary.scalar('loss_total', loss_total)
    global_step = tf.Variable(1, trainable=False, name='global_step')
    merged = tf.summary.merge_all()

    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_op = optimizer.minimize(loss_total, global_step=global_step)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train_writer = tf.summary.FileWriter('train')

        # train
        while True:
            try:
                (x_np, y_np) = sess.run(next_element_train)
            except tf.errors.OutOfRangeError:
                break
            feed_dict = {x: x_np, y: y_np}
            summary, _ = sess.run([merged, train_op], feed_dict=feed_dict)
            train_writer.add_summary(
                summary, tf.train.global_step(sess, global_step))

        # test
        scores_list = []
        loss_list = []
        while True:
            try:
                (x_np, y_np) = sess.run(next_element_test)
            except tf.errors.OutOfRangeError:
                break
            feed_dict = {x: x_np, y: y_np}
            test_loss_tmp, test_scores_tmp = sess.run(
                [loss, scores], feed_dict=feed_dict)
            scores_list.append(test_scores_tmp)
            loss_list.append(test_loss_tmp)

        test_scores = np.concatenate(scores_list)
        test_loss = np.concatenate(loss_list)
        test_loss_av = np.sqrt(np.sum(test_loss)) / len(test_loss)
    return test_scores, test_loss_av

In [13]:
(val_scores, val_loss_av) = train(num_epochs=1, val=True)
print(f"Mean loss on the validation dataset is {val_loss_av}")

Mean loss on the validation dataset is 809.5874714189712


In [14]:
(test_scores, test_loss_av) = train(num_epochs=1, val=False)
print(f"Mean loss on the test dataset is {test_loss_av}")

Mean loss on the test dataset is 927.167829506533


We found the default hyperparameters by validation on the 2014 dataset using the gaussian process optimisation below

In [None]:
dim_learning_rate = Real(low=1e-3, high=1e7, prior='log-uniform',
                         name='learning_rate')
dim_reg = Real(low=1e-6, high=1e3, prior='log-uniform',
               name='reg')
dim_alpha = Real(0, 1, name='alpha')
dimensions = [dim_learning_rate, dim_reg, dim_alpha]


@use_named_args(dimensions=dimensions)
def op_acc(learning_rate, reg, alpha):
#     print(learning_rate, reg, alpha)
    _, av_loss = train(val=True, learning_rate=learning_rate, reg=np.float32(reg),
                       alpha=np.float32(alpha))
    return av_loss


search_result = gp_minimize(func=op_acc, dimensions=dimensions, n_calls=20, n_random_starts=10,
                            verbose=True)
print(search_result.x)
[learning_rate, reg, alpha] = search_result.x

(test_scores, test_av_loss) = train(num_epochs=3, val=False, learning_rate=learning_rate,
                                    reg=np.float32(reg), alpha=np.float32(alpha))
print(f"\nMean loss on the testing data is {test_av_loss}")

After the hyperparameter search:

Mean loss on the validation data is: 807

Mean loss on the testing data is:    888