In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.contrib.layers import batch_norm, dropout
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [2]:
#  Create a test graph
a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
c = tf.matmul(a, b)
# Creates a session with log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# Runs the op.
print(sess.run(c))

[[ 22.  28.]
 [ 49.  64.]]


In [3]:
# We will first train the model to identify digits 0 to 4 and then do transfer training to identify images 5 to 9.

def divide_dataset_into_two_sets(images, labels):
    assert len(images) == len(labels)
    zero_to_four_images = []
    zero_to_four_labels = []
    five_to_nine_images = []
    five_to_nine_labels = []
    four_label = 4
    for i in range(len(images)):
        if labels[i] <= four_label:
            zero_to_four_images.append(images[i])
            zero_to_four_labels.append(labels[i])
        else:
            five_to_nine_images.append(images[i])
            five_to_nine_labels.append(labels[i])
    return np.array(zero_to_four_images), np.array(zero_to_four_labels), np.array(five_to_nine_images), np.array(five_to_nine_labels)

train_zero_to_four_images, train_zero_to_four_labels, train_five_to_nine_images, train_five_to_nine_labels = divide_dataset_into_two_sets(mnist.train.images, mnist.train.labels)
validation_zero_to_four_images, validation_zero_to_four_labels, validation_five_to_nine_images, validation_five_to_nine_labels = divide_dataset_into_two_sets(mnist.validation.images, mnist.validation.labels)
test_zero_to_four_images, test_zero_to_four_labels, test_five_to_nine_images, test_five_to_nine_labels = divide_dataset_into_two_sets(mnist.test.images, mnist.test.labels)

In [4]:
print(test_zero_to_four_images.shape)
print(test_zero_to_four_labels.shape)
print(test_five_to_nine_images.shape)
print(test_five_to_nine_labels.shape)

(5139, 784)
(5139,)
(4861, 784)
(4861,)


In [5]:
def create_next_batch_fn(images, labels, batch_size):
    assert len(images) == len(labels)
    current_batch = 0
    def next_batch():
        nonlocal current_batch
        i = current_batch
        #print(current_batch)
        current_batch = (current_batch + batch_size) % len(images)
        return images[i:i+batch_size], labels[i:i+batch_size]
    return next_batch

In [6]:
number_of_inputs = 28 * 28
n_output = 5

In [7]:
is_training = tf.placeholder(tf.bool, shape=(), name="is_training")
bn_params = {
    "training": is_training,
    "momentum": 0.99
}
keep_prob = 0.99

def he_normal_initialisation(n_inputs, n_outputs):
    stddev = np.power(2 / (n_inputs + n_outputs), 1 / np.sqrt(2))
    # truncated normal distributions limit the size of the weights, speeding up the training time.
    return tf.truncated_normal((n_inputs, n_outputs), stddev=stddev)

def he_uniform_initialisation(n_inputs, n_outputs):
    r = np.power(6 / (n_inputs + n_outputs), 1 / np.sqrt(2))
    # truncated normal distributions limit the size of the weights, speeding up the training time.
    return tf.random_uniform((n_inputs, n_outputs), -r, r)

def neuron_layer(X, n_neurons, name):
    with tf.name_scope(name):
        n_inputs = int(X.get_shape()[1])
        W = tf.Variable(he_normal_initialisation(n_inputs, n_neurons), name="weights")
        b = tf.Variable(tf.zeros([n_neurons]), name="biases")
        z = tf.matmul(dropout(X, keep_prob, is_training=is_training), W) + b
        bn = tf.layers.batch_normalization(z, **bn_params)
        return tf.nn.elu(bn)

Exercise 8
===

In [None]:
try:
    # Access a global and if it exists, reset the graph.
    number_of_inputs
    #reset_graph()
except:
    pass
n_hidden_per_layer = [90] * 4

# batch normalisation doesn't work on my GPU for some reason.
#with tf.device("/gpu:0"):
with tf.device("/cpu:0"):    
    x = tf.placeholder(tf.float32, shape=(None, number_of_inputs), name="input")
    y = tf.placeholder(tf.int64, shape=(None), name="y")

    with tf.name_scope("dnn"):
        input_tensor = x
        for i in range(len(n_hidden_per_layer)):
            input_tensor = neuron_layer(input_tensor, n_hidden_per_layer[i], "hidden" + str(i + 1))
        logits = neuron_layer(input_tensor, n_output, "output")

    with tf.name_scope("loss"):
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
        loss = tf.reduce_mean(cross_entropy, name="loss")

    with tf.name_scope("training"):
        optimizer = tf.train.AdamOptimizer(learning_rate=0.005)
        # Make the training op depend upon the update ops from the batch normalisation layer.
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(extra_update_ops):
            training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    k = 1
    correctness = tf.nn.in_top_k(logits, y, k)
    accuracy = tf.reduce_mean(tf.cast(correctness, tf.float32)) * 100

In [None]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

interim_checkpoint_path = "./checkpoints/0-4_mnist_model.ckpt"
early_stopping_checkpoint_path = "./checkpoints/0-4_mnist_model_early_stopping.ckpt"

from datetime import datetime

now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
log_dir = "{}/run-{}/".format(root_logdir, now)

loss_summary = tf.summary.scalar('loss', loss)
train_accuracy_summary = tf.summary.scalar("train_accuracy", accuracy)
summary_op = tf.summary.merge([loss_summary, train_accuracy_summary])
file_writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())

In [None]:
batch_size = 200
next_test_batch = create_next_batch_fn(test_zero_to_four_images, test_zero_to_four_labels, batch_size)

In [None]:
epochs = 5000
n_batches = int(np.ceil(len(train_zero_to_four_images) // batch_size))

early_stopping_check_frequency = n_batches // 10
early_stopping_check_limit = n_batches * 2

with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    sess.run(init)
    #saver.restore(sess, interim_checkpoint_path)
    
    best_validation_acc = 0.0
    best_validation_step = 0
    for epoch in range(epochs):
        print("epoch", epoch)
        for batch_index in range(n_batches):
            step = epoch * n_batches + batch_index
            X_batch, y_batch = next_test_batch()
            if batch_index % 10 == 0:
                # Output summaries
                summary_str = summary_op.eval(feed_dict={x: X_batch, y: y_batch, is_training: False})
                file_writer.add_summary(summary_str, step)
            t, l, a = sess.run([training_op, loss, accuracy], feed_dict={x: X_batch, y: y_batch, is_training: True})
            if batch_index % 10 == 0: print("loss:", l, "training accuracy:", a)
            # Early stopping check
            if batch_index % early_stopping_check_frequency == 0:
                validation_acc = accuracy.eval(feed_dict={x: validation_zero_to_four_images, y: validation_zero_to_four_labels, is_training: False})
                print("validation accuracy", validation_acc)
                if validation_acc > best_validation_acc:
                    print("Saving best model")
                    saver.save(sess, early_stopping_checkpoint_path)
                    best_validation_acc = validation_acc
                    best_validation_step = step
                elif step >= (best_validation_step + early_stopping_check_limit):
                    print("Stopping early during epoch", epoch)
                    print("Best validation performance", best_validation_acc)
                    break
        else:
            continue
        break
        save_path = saver.save(sess, interim_checkpoint_path)
    saver.restore(sess, early_stopping_checkpoint_path)
    test_acc = accuracy.eval(feed_dict={x: test_zero_to_four_images, y: test_zero_to_four_labels, is_training: False})
    print(">>>>>>>>>> test dataset accuracy:", test_acc)

    save_path = saver.save(sess, "./checkpoints/0-4_mnist_model_final.ckpt")

Batch normalisation does produce a better model (when you explicitly run the extra updates op). It does however, greatly increase the model's learning speed.

Adding double connect to the network improved validation performance to 98.1235%.

Exercise 9
===

Perform transfer learning by freezing all of the hidden layers and creating a new output layer.

In [8]:
original_saver = tf.train.import_meta_graph("./checkpoints/0-4_mnist_model_final.ckpt.meta")
for op in tf.get_default_graph().get_operations():
    print(op.name)

a
b
MatMul
is_training
a_1
b_1
MatMul_1
is_training_1
input
y
dnn/hidden1/truncated_normal/shape
dnn/hidden1/truncated_normal/mean
dnn/hidden1/truncated_normal/stddev
dnn/hidden1/truncated_normal/TruncatedNormal
dnn/hidden1/truncated_normal/mul
dnn/hidden1/truncated_normal
dnn/hidden1/weights
dnn/hidden1/weights/Assign
dnn/hidden1/weights/read
dnn/hidden1/zeros
dnn/hidden1/biases
dnn/hidden1/biases/Assign
dnn/hidden1/biases/read
dnn/hidden1/Dropout/cond/Switch
dnn/hidden1/Dropout/cond/switch_t
dnn/hidden1/Dropout/cond/switch_f
dnn/hidden1/Dropout/cond/pred_id
dnn/hidden1/Dropout/cond/dropout/keep_prob
dnn/hidden1/Dropout/cond/dropout/Shape/Switch
dnn/hidden1/Dropout/cond/dropout/Shape
dnn/hidden1/Dropout/cond/dropout/random_uniform/min
dnn/hidden1/Dropout/cond/dropout/random_uniform/max
dnn/hidden1/Dropout/cond/dropout/random_uniform/RandomUniform
dnn/hidden1/Dropout/cond/dropout/random_uniform/sub
dnn/hidden1/Dropout/cond/dropout/random_uniform/mul
dnn/hidden1/Dropout/cond/dropout/ran

In [9]:
x = tf.get_default_graph().get_tensor_by_name("input:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")
is_training = tf.get_default_graph().get_tensor_by_name("is_training:0")
is_training_1 = tf.get_default_graph().get_tensor_by_name("is_training_1:0")
#hidden4_output = tf.get_default_graph().get_tensor_by_name("dnn/hidden4/Elu:0")
logits = tf.get_default_graph().get_tensor_by_name("dnn/output/Elu:0")
loss = tf.get_default_graph().get_tensor_by_name("loss/loss:0")

In [17]:
# batch normalisation doesn't work on my GPU for some reason.
#with tf.device("/gpu:0"):

with tf.device("/cpu:0"):
    #with tf.name_scope("dnn"):
    #    logits = neuron_layer(hidden4_output, n_output, "output")
        
    #with tf.name_scope("loss"):
    #    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    #    loss = tf.reduce_mean(cross_entropy, name="loss")

    #print(tf.trainable_variables())
    train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="dnn/hidden4|dnn/output")
    print("train_vars:", train_vars)
    
    with tf.name_scope("training"):
        optimizer = tf.train.AdamOptimizer(learning_rate=0.005, name="Adam2")
        # Make the training op depend upon the update ops from the batch normalisation layer.
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(extra_update_ops):
            # This is where we freeze the hidden layers by telling the optimiser to only optimise the output layer.
            training_op = optimizer.minimize(loss, var_list=train_vars)

with tf.name_scope("eval"):
    k = 1
    correctness = tf.nn.in_top_k(logits, y, k)
    accuracy = tf.reduce_mean(tf.cast(correctness, tf.float32)) * 100
    
init = tf.global_variables_initializer()

new_saver = tf.train.Saver()
five_to_nine_final_model_path = "./checkpoints/5-9_mnist_model_final.ckpt"
early_stopping_checkpoint_path = "./checkpoints/5-9_mnist_model_early_stop_checkpoint.ckpt"

train_vars: [<tf.Variable 'dnn/hidden4/weights:0' shape=(90, 90) dtype=float32_ref>, <tf.Variable 'dnn/hidden4/biases:0' shape=(90,) dtype=float32_ref>, <tf.Variable 'dnn/output/weights:0' shape=(90, 5) dtype=float32_ref>, <tf.Variable 'dnn/output/biases:0' shape=(5,) dtype=float32_ref>]


In [18]:
X_train2_full = mnist.train.images[mnist.train.labels >= 5]
y_train2_full = mnist.train.labels[mnist.train.labels >= 5] - 5
X_valid2_full = mnist.validation.images[mnist.validation.labels >= 5]
y_valid2_full = mnist.validation.labels[mnist.validation.labels >= 5] - 5
X_test2 = mnist.test.images[mnist.test.labels >= 5]
y_test2 = mnist.test.labels[mnist.test.labels >= 5] - 5

def sample_n_instances_per_class(X, y, n=100):
    Xs, ys = [], []
    for label in np.unique(y):
        idx = (y == label)
        Xc = X[idx][:n]
        yc = y[idx][:n]
        Xs.append(Xc)
        ys.append(yc)
    return np.concatenate(Xs), np.concatenate(ys)

X_train2, y_train2 = sample_n_instances_per_class(X_train2_full, y_train2_full, n=100)
X_valid2, y_valid2 = sample_n_instances_per_class(X_valid2_full, y_valid2_full, n=30)

batch_size = 20 
next_test_batch = create_next_batch_fn(X_train2, y_train2, batch_size)

In [19]:
epochs = 5000
n_batches = int(np.ceil(X_train2.shape[0] // batch_size))

early_stopping_check_frequency = n_batches // 20
early_stopping_check_limit = n_batches * 4

with tf.Session() as sess:
    sess.run(init)
    original_saver.restore(sess, "./checkpoints/0-4_mnist_model_final.ckpt")
    
    best_validation_acc = 0.0
    best_validation_step = 0
    for epoch in range(epochs):
        print("epoch", epoch)
        for batch_index in range(n_batches):
            step = epoch * n_batches + batch_index
            X_batch, y_batch = next_test_batch()
            if batch_index % 10 == 0:
                # Output summaries
                #summary_str = summary_op.eval(feed_dict={x: X_batch, y: y_batch, is_training: False, is_training_1: False})
                #file_writer.add_summary(summary_str, step)
                pass
            t, l, a = sess.run([training_op, loss, accuracy], feed_dict={x: X_batch, y: y_batch, is_training: True, is_training_1: True})
            if batch_index % 10 == 0: print("loss:", l, "training accuracy:", a)
            # Early stopping check
            if batch_index % early_stopping_check_frequency == 0:
                validation_acc = accuracy.eval(feed_dict={x: X_valid2, y: y_valid2, is_training: False, is_training_1: False})
                print("validation accuracy", validation_acc)
                if validation_acc > best_validation_acc:
                    print("Saving best model")
                    new_saver.save(sess, early_stopping_checkpoint_path)
                    best_validation_acc = validation_acc
                    best_validation_step = step
                elif step >= (best_validation_step + early_stopping_check_limit):
                    print("Stopping early during epoch", epoch)
                    print("Best validation performance", best_validation_acc)
                    break
        else:
            continue
        break
    
    new_saver.save(sess, five_to_nine_final_model_path)

INFO:tensorflow:Restoring parameters from ./checkpoints/0-4_mnist_model_final.ckpt
epoch 0
loss: 3.87546 training accuracy: 15.0
validation accuracy 30.0
Saving best model
validation accuracy 30.6667
Saving best model
validation accuracy 31.3333
Saving best model
validation accuracy 31.3333
validation accuracy 32.6667
Saving best model
validation accuracy 32.0
validation accuracy 28.6667
validation accuracy 27.3333
validation accuracy 24.6667
validation accuracy 24.0
loss: 2.58558 training accuracy: 20.0
validation accuracy 23.3333
validation accuracy 22.6667
validation accuracy 20.6667
validation accuracy 20.6667
validation accuracy 18.6667
validation accuracy 17.3333
validation accuracy 15.3333
validation accuracy 16.6667
validation accuracy 12.6667
validation accuracy 12.6667
loss: 2.48815 training accuracy: 15.0
validation accuracy 12.6667
validation accuracy 12.6667
validation accuracy 12.6667
validation accuracy 12.6667
validation accuracy 12.0
epoch 1
loss: 2.46843 training accu

These are pretty appaulling results. Its barely learning anything. In fact it appears that its not training properly at all. I must have set up the transfer learning model incorrectly, or I have the wrong set of hyper parameters.

Exercise 10
===

In [6]:
number_of_inputs = 28 * 28
n_hidden_per_layer = [100, 100, 100, 100, 100]
n_output = 10

def he_normal_initialisation(n_inputs, n_outputs):
    stddev = np.power(2 / (n_inputs + n_outputs), 1 / np.sqrt(2))
    # truncated normal distributions limit the size of the weights, speeding up the training time.
    return tf.truncated_normal((n_inputs, n_outputs), stddev=stddev)

def he_uniform_initialisation(n_inputs, n_outputs):
    r = np.power(6 / (n_inputs + n_outputs), 1 / np.sqrt(2))
    # truncated normal distributions limit the size of the weights, speeding up the training time.
    return tf.random_uniform((n_inputs, n_outputs), -r, r)

def neuron_layer(X, n_neurons, name):
    with tf.name_scope(name):
        n_inputs = int(X.get_shape()[1])
        W = tf.Variable(he_normal_initialisation(n_inputs, n_neurons), name="weights")
        b = tf.Variable(tf.zeros([n_neurons]), name="biases")
        z = tf.matmul(X, W) + b
        return tf.nn.elu(z)

with tf.device("/gpu:0"):
    x = tf.placeholder(tf.float32, shape=(None, number_of_inputs), name="input")
    y = tf.placeholder(tf.int64, shape=(None), name="y")

    with tf.name_scope("dnn"):
        input_tensor = x
        for i in range(len(n_hidden_per_layer)):
            input_tensor = neuron_layer(input_tensor, n_hidden_per_layer[i], "hidden" + str(i + 1))
        logits = neuron_layer(input_tensor, n_output, "output")

    with tf.name_scope("loss"):
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
        loss = tf.reduce_mean(cross_entropy, name="loss")

    with tf.name_scope("training"):
        optimizer = tf.train.AdamOptimizer()
        training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    k = 1
    correctness = tf.nn.in_top_k(logits, y, k)
    accuracy = tf.reduce_mean(tf.cast(correctness, tf.float32)) * 100

In [7]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

interim_checkpoint_path = "./checkpoints/mnist_model.ckpt"
early_stopping_checkpoint_path = "./checkpoints/mnist_model_early_stopping.ckpt"

from datetime import datetime

now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
log_dir = "{}/run-{}/".format(root_logdir, now)

loss_summary = tf.summary.scalar('loss', loss)
accuracy_summary = tf.summary.scalar("accuracy", accuracy)
summary_op = tf.summary.merge([loss_summary, accuracy_summary])
file_writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())

In [8]:
epochs = 20
batch_size = 200
n_batches = int(np.ceil(mnist.train.num_examples // batch_size))

early_stopping_check_frequency = batch_size // 4
early_stopping_check_limit = batch_size * 2

with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    sess.run(init)
    #saver.restore(sess, interim_checkpoint_path)
    
    best_validation_acc = 0.0
    best_validation_step = 0
    for epoch in range(epochs):
        print("epoch", epoch)
        for batch_index in range(n_batches):
            step = epoch * n_batches + batch_index
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            if batch_index % 10 == 0:
                summary_str = summary_op.eval(feed_dict={x: X_batch, y: y_batch})
                file_writer.add_summary(summary_str, step)
            t, l, a = sess.run([training_op, loss, accuracy], feed_dict={x: X_batch, y: y_batch})
            if batch_index % 10 == 0: print("loss:", l, "test accuracy:", a)
            # Early stopping check
            if batch_index % early_stopping_check_frequency == 0:
                validation_acc = accuracy.eval(feed_dict={x: mnist.validation.images, y: mnist.validation.labels})
                print("validation accuracy", validation_acc)
                if validation_acc > best_validation_acc:
                    saver.save(sess, early_stopping_checkpoint_path)
                    best_validation_acc = validation_acc
                    best_validation_step = step
                elif step >= (best_validation_step + early_stopping_check_limit):
                    print("Stopping early during epoch", epoch)
                    break
        else:
            continue
        break
    save_path = saver.save(sess, interim_checkpoint_path)
    test_acc = accuracy.eval(feed_dict={x: mnist.test.images, y: mnist.test.labels})
    print(">>>>>>>>>> test dataset accuracy:", test_acc)

    save_path = saver.save(sess, "./checkpoints/mnist_model_final.ckpt")

epoch 0
loss: 2.3025 test accuracy: 14.0
validation accuracy 15.04
loss: 2.19268 test accuracy: 37.0
loss: 1.62996 test accuracy: 43.5
loss: 1.19511 test accuracy: 59.5
loss: 0.97696 test accuracy: 66.0
loss: 0.679434 test accuracy: 79.5
validation accuracy 73.9
loss: 0.745608 test accuracy: 76.5
loss: 0.642079 test accuracy: 77.0
loss: 0.618012 test accuracy: 78.0
loss: 0.627164 test accuracy: 81.0
loss: 0.56056 test accuracy: 82.0
validation accuracy 84.46
loss: 0.449496 test accuracy: 86.0
loss: 0.353521 test accuracy: 89.5
loss: 0.498155 test accuracy: 86.0
loss: 0.584658 test accuracy: 82.0
loss: 0.495865 test accuracy: 84.5
validation accuracy 87.98
loss: 0.343682 test accuracy: 89.5
loss: 0.377395 test accuracy: 88.5
loss: 0.372256 test accuracy: 89.0
loss: 0.538575 test accuracy: 83.5
loss: 0.310677 test accuracy: 90.5
validation accuracy 89.06
loss: 0.376628 test accuracy: 88.5
loss: 0.468921 test accuracy: 87.0
loss: 0.263155 test accuracy: 89.5
loss: 0.369656 test accuracy: 