# Coursework 1

This notebook is intended to be used as a starting point for your experiments. The instructions can be found in the instructions file located under spec/coursework1.pdf. The methods provided here are just helper functions. If you want more complex graphs such as side by side comparisons of different experiments you should learn more about matplotlib and implement them. **Before each experiment remember to re-initialize neural network weights and reset the data providers so you get a properly initialized experiment.** For each experiment try to keep most hyperparameters the same except the one under investigation so you can understand what the effects of each are.

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

def train_model_and_plot_stats(
        model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, notebook=True):
    
    # As well as monitoring the error over training also monitor classification
    # accuracy i.e. proportion of most-probable predicted classes being equal to targets
    data_monitors={'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}

    # Use the created objects to initialise a new Optimiser instance.
    optimiser = Optimiser(
        model, error, learning_rule, train_data, valid_data, data_monitors, notebook=notebook)

    # Run the optimiser for 5 epochs (full passes through the training set)
    # printing statistics every epoch.
    stats, keys, run_time = optimiser.train(num_epochs=num_epochs, stats_interval=stats_interval)

    # Plot the change in the validation and training set error over training.
    fig_1 = plt.figure(figsize=(8, 4))
    ax_1 = fig_1.add_subplot(111)
    for k in ['error(train)', 'error(valid)']:
        ax_1.plot(np.arange(1, stats.shape[0]) * stats_interval, 
                  stats[1:, keys[k]], label=k)
    ax_1.legend(loc=0)
    ax_1.set_xlabel('Epoch number')

    # Plot the change in the validation and training set accuracy over training.
    fig_2 = plt.figure(figsize=(8, 4))
    ax_2 = fig_2.add_subplot(111)
    for k in ['acc(train)', 'acc(valid)']:
        ax_2.plot(np.arange(1, stats.shape[0]) * stats_interval, 
                  stats[1:, keys[k]], label=k)
    ax_2.legend(loc=0)
    ax_2.set_xlabel('Epoch number')
    
    return stats, keys, run_time, fig_1, ax_1, fig_2, ax_2

In [2]:
# The below code will set up the data providers, random number
# generator and logger objects needed for training runs. As
# loading the data from file take a little while you generally
# will probably not want to reload the data providers on
# every training run. If you wish to reset their state you
# should instead use the .reset() method of the data providers.
import numpy as np
import logging
from mlp.data_providers import MNISTDataProvider, EMNISTDataProvider

# Seed a random number generator
seed = 11102019 
rng = np.random.RandomState(seed)
batch_size = 100
# Set up a logger object to print info about the training run to stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [logging.StreamHandler()]

# Create data provider objects for the MNIST data set
train_data = EMNISTDataProvider('train', batch_size=batch_size, rng=rng)
valid_data = EMNISTDataProvider('valid', batch_size=batch_size, rng=rng)

KeysView(<numpy.lib.npyio.NpzFile object at 0x2b33fbf07df0>)
KeysView(<numpy.lib.npyio.NpzFile object at 0x2b33fbf070a0>)


In [3]:
# The model set up code below is provided as a starting point.
# You will probably want to add further code cells for the
# different experiments you run.

from mlp.layers import AffineLayer, SoftmaxLayer, SigmoidLayer, ReluLayer
from mlp.errors import CrossEntropySoftmaxError
from mlp.models import MultipleLayerModel
from mlp.initialisers import ConstantInit, GlorotUniformInit
from mlp.learning_rules import AdamLearningRule
from mlp.optimisers import Optimiser


# Template

# Task 1
## Varying number of hidden units.
Initially you will train various 1-hidden layer networks by using either 32, 64 and 128 ReLU hidden units per layer on EMNIST. Make sure you use Adam optimizer with the hyperparameters provided in the template and train each network for 100 epochs. Visualise and discuss how increasing number of hidden units affects the validation performance and whether it worsens or mitigates the overfitting problem.

### 32 hidden units

In [4]:
#setup hyperparameters
learning_rate = 0.1
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 32

weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

error = CrossEntropySoftmaxError()
# Use a basic gradient descent learning rule
learning_rule = AdamLearningRule()

#Remember to use notebook=False when you write a script to be run in a terminal
hunits32 = train_model_and_plot_stats(
    model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, notebook=True)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 1: 64.0s to complete
    error(train)=1.20e+00, acc(train)=6.61e-01, error(valid)=1.21e+00, acc(valid)=6.52e-01


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 2: 60.2s to complete
    error(train)=1.02e+00, acc(train)=7.04e-01, error(valid)=1.04e+00, acc(valid)=7.00e-01


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 3: 65.4s to complete
    error(train)=9.19e-01, acc(train)=7.28e-01, error(valid)=9.43e-01, acc(valid)=7.24e-01


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 4: 63.7s to complete
    error(train)=8.51e-01, acc(train)=7.47e-01, error(valid)=8.80e-01, acc(valid)=7.40e-01


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 5: 60.9s to complete
    error(train)=8.19e-01, acc(train)=7.53e-01, error(valid)=8.48e-01, acc(valid)=7.44e-01


  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
%store hunits32
# Print last validation accuracy and calculate generalization gap
print(hunits32[0][-1, hunits32[1]['acc(valid)']])
print('{:.3f}'.format(np.abs(hunits32[0][-1, hunits32[1]['error(valid)']]-hunits32[0][-1, hunits32[1]['error(train)']])))

### 64 hiddent units

In [None]:
#setup hyperparameters
learning_rate = 0.1
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 64

weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

error = CrossEntropySoftmaxError()
# Use a basic gradient descent learning rule
learning_rule = AdamLearningRule()

#Remember to use notebook=False when you write a script to be run in a terminal
hunits64 = train_model_and_plot_stats(
    model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, notebook=True)

In [None]:
%store hunits64
# Print last validation accuracy and calculate generalization gap
print(hunits64[0][-1, hunits64[1]['acc(valid)']])
print('{:.3f}'.format(np.abs(hunits64[0][-1, hunits64[1]['error(valid)']]-hunits64[0][-1, hunits64[1]['error(train)']])))

### 128 hidden units

In [None]:
#setup hyperparameters
learning_rate = 0.1
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 128

weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

error = CrossEntropySoftmaxError()
# Use a basic gradient descent learning rule
learning_rule = AdamLearningRule()

#Remember to use notebook=False when you write a script to be run in a terminal
hunits128 = train_model_and_plot_stats(
    model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, notebook=True)

In [None]:
%hunits128
# Print last validation accuracy and calculate generalization gap
print(hunits128[0][-1, hunits128[1]['acc(valid)']])
print('{:.3f}'.format(np.abs(hunits128[0][-1, hunits128[1]['error(valid)']]-hunits128[0][-1, hunits128[1]['error(train)']])))

It looks like clearly increasing the number of hidden units makes the overfitting problem worse.
*That makes sense, because by increasing the number of hidden units we really increase the number of weights, making the network too flexible.*

## Varying number of layers.
Here you will train various neural networks by using either 1 (previous experiment), 2, 3 hidden layers
with 128 ReLU hidden units per layer on EMNIST. Make sure you use Adam optimizer with the hyperparameters
provided in the template and train each network for 100 epochs. Visualise and discuss how increasing number of
layers affects the validation performance and whether it worsens or mitigates the overfitting problem.

### 2 hidden layers, 128 units

In [None]:
#setup hyperparameters
learning_rate = 0.1
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 128

weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

error = CrossEntropySoftmaxError()
# Use a basic gradient descent learning rule
learning_rule = AdamLearningRule()

#Remember to use notebook=False when you write a script to be run in a terminal
hlayers2 = train_model_and_plot_stats(
    model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, notebook=True)

In [None]:
%store hlayers2

### 3 hidden layers, 128 units

In [None]:
#setup hyperparameters
learning_rate = 0.1
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 128

weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

error = CrossEntropySoftmaxError()
# Use a basic gradient descent learning rule
learning_rule = AdamLearningRule()

#Remember to use notebook=False when you write a script to be run in a terminal
hlayers3 = train_model_and_plot_stats(
    model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, notebook=True)

In [None]:
%store hlayers3

In [None]:
# Let's do this by brute force. fprop_output (my results) are going to be exactly the input, and I'm gonna try two things:
from mlp.test_methods import test_dropout_layer
import numpy as np

fprop_test, fprop_output, fprop_correct, \
bprop_test, bprop_output, bprop_correct = test_dropout_layer()

#First, let's check out which values of the correct results aren't turned into 0
bool_correct = (fprop_correct != 0)
bool_correct[1]

#Now, let's look at the difference between the inputs and the correct outputs
fprop_difference = fprop_output - fprop_correct 
bool_difference = (fprop_difference == 0)
bool_difference[1]

#And finally, check if they are the same
np.all(bool_correct == bool_difference)

#Okay, they are the same. That means that there aren't any values in the input that are initially 0.
#So if we look at fprop_correct, we'll see all the values that have to become 0.
bool_all = (bool_correct == bool_difference)
for x in range(fprop_correct.shape[3]):
    a = fprop_correct[:,:,x][bool_correct[:,:,:,x]].shape[0]/fprop_correct[:,:,x][bool_all[:,:,:,x]].shape[0]
    b = fprop_output[:,:,x][bool_correct[:,:,:,x]].shape[0]/fprop_output[:,:,x][bool_all[:,:,:,x]].shape[0]
    print(a-b)
    
# Good news! Apparently the drop out is happening all in terms of the fourth dimension.
#Now let's see if I can replicate that:
#ok in theory we did but we are still running into problems -- we're not passing the test
#maybe the problem is with how we access the fourth dimension, np.reshape 

In [None]:
#where in I experiment with a hypothesis:
    #(1) The first dimension is the batch number and drop out is performed at the sample level, 
    # so there's "rows" in the first dimension that haven't been dropped out i.e. aren't 0
        #False. Really, only the first "row" is equal to our results
        #Unless... check changing your code so drop out is performed at the sample level (first row)
            #Nope, doesn't work. Hypothesis wrong.
            

from mlp.test_methods import test_dropout_layer
import numpy as np

fprop_test, fprop_output, fprop_correct, \
bprop_test, bprop_output, bprop_correct = test_dropout_layer()

i = 0
for s in range(fprop_correct.shape[0]):
    #assert sample[(sample == 0)].shape != (0,)
    if np.all(fprop_correct[s] == fprop_output[s]):
        print(s)
    i += 1
np.all(fprop_correct[1] == fprop_output[1])