Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is make the neural network convolutional.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10
num_channels = 1 # grayscale

import numpy as np

def reformat(dataset, labels):
  dataset = dataset.reshape(
    (-1, image_size, image_size, num_channels)).astype(np.float32)
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28, 1) (200000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [13]:
batch_size = 16
patch_size = 5
depth = 16
num_hidden = 64

graph = tf.Graph()

with graph.as_default():

  # Input data.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  layer3_weights = tf.Variable(tf.truncated_normal(
      [image_size // 4 * image_size // 4 * depth, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
  # Model.
  def model(data):
    conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer1_biases)
    conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer2_biases)
    shape = hidden.get_shape().as_list()
    reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    return tf.matmul(hidden, layer4_weights) + layer4_biases
  
  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
    
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [14]:
num_steps = 1001

print(        
        num_steps,
        batch_size,
    
        optimizer,
        loss,
        train_prediction,
        valid_prediction,
        test_prediction,
    
        train_labels,
        valid_labels,
        test_labels,
    
        tf_train_dataset,
        tf_train_labels,)

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  for step in range(num_steps):
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 50 == 0):
      print('Minibatch loss at step %d: %f' % (step, l))
      print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
      print('Validation accuracy: %.1f%%' % accuracy(
        valid_prediction.eval(), valid_labels))
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

1001 16 name: "GradientDescent"
op: "NoOp"
input: "^GradientDescent/update_Variable/ApplyGradientDescent"
input: "^GradientDescent/update_Variable_1/ApplyGradientDescent"
input: "^GradientDescent/update_Variable_2/ApplyGradientDescent"
input: "^GradientDescent/update_Variable_3/ApplyGradientDescent"
input: "^GradientDescent/update_Variable_4/ApplyGradientDescent"
input: "^GradientDescent/update_Variable_5/ApplyGradientDescent"
input: "^GradientDescent/update_Variable_6/ApplyGradientDescent"
input: "^GradientDescent/update_Variable_7/ApplyGradientDescent"
 Tensor("Mean:0", shape=(), dtype=float32) Tensor("Softmax:0", shape=(16, 10), dtype=float32) Tensor("Softmax_1:0", shape=(10000, 10), dtype=float32) Tensor("Softmax_2:0", shape=(10000, 10), dtype=float32) [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] [[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 .

---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

In [None]:
# Run convolutional model
# Copy-Paste from 3d assignment

TRAIN_BATCH_SIZE = 16
WIDTH = image_size
HEIGHT = image_size

PATCH_SIZE = 5
DEPTH = 16
NUM_HIDDEN = 64

NUM_OF_STEPS = 1001

STARTER_LEARNING_RATE = 0.05
DECAY_STEPS = 1000
DECAY_RATE = 0.96

# Beta coeff for l2 regularization 
BETA = 0

def create_graph(
        tf_train_dataset,
        tf_train_labels,
        tf_valid_dataset,
        tf_test_dataset,
        train_model_func,
        prepare_model_params_func,
        validate_model_func=None,
        use_learning_rate_decay=False,
    ):
    """
    Create graph using specified function to train model (for example logistic regression).
    Return tf specifications for train prediction, validation prediction and test prediction.
    Graph should be initialized before calling this function
    """
    number_of_labels = tf_train_labels.shape[1].value

    model_params = prepare_model_params_func(
        tf_train_dataset.shape[-1].value, # num channels
        number_of_labels)
    
    logits = train_model_func(
        tf_train_dataset, *model_params)

    softmax = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf_train_labels, logits=logits)
    loss = tf.reduce_mean(softmax)

    if not use_learning_rate_decay:
        optimizer = tf.train.GradientDescentOptimizer(STARTER_LEARNING_RATE).\
            minimize(loss)
    else:
        global_step = tf.Variable(0)
        learning_rate = tf.train.exponential_decay(
            STARTER_LEARNING_RATE, 
            global_step,
            DECAY_STEPS,
            DECAY_RATE)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).\
            minimize(loss, global_step=global_step)
     
    train_prediction = tf.nn.softmax(logits)
    
    validate_model_func = validate_model_func or train_model_func
    valid_prediction = tf.nn.softmax(
        validate_model_func(tf_valid_dataset, *model_params))
    test_prediction = tf.nn.softmax(
        validate_model_func(tf_test_dataset, *model_params))
    return (
        optimizer,
        loss,
        train_prediction,
        valid_prediction,
        test_prediction)


def prepare_params_for_conv_model(
        patch_size,
        depth):
    """
    Prepare parameters for convolutional model
    """
    def wrapped(num_channels, num_labels):
        layer_conv_weights_0 = tf.Variable(tf.truncated_normal(
            [
                patch_size, patch_size, num_channels, depth
            ], stddev=0.1))
        layer_conv_biases_0 = tf.Variable(tf.zeros([depth]))
        layer_conv_weights_1 = tf.Variable(tf.truncated_normal(
            [
                patch_size, patch_size, depth, depth
            ], stddev=0.1))
        layer_conv_biases_1 = tf.Variable(tf.constant(1.0, shape=[depth]))
        layer_weights_2 = tf.Variable(tf.truncated_normal(
           [
                image_size // 4 * image_size // 4 * depth, num_hidden
           ], stddev=0.1))
        layer_biases_2 = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
        layer_weights_3 = tf.Variable(tf.truncated_normal(
            [
                num_hidden, num_labels
            ], stddev=0.1))
        layer_biases_3 = tf.Variable(tf.constant(1.0, shape=[num_labels]))
        return (
            layer_conv_weights_0, layer_conv_biases_0,
            layer_conv_weights_1, layer_conv_biases_1,
            layer_weights_2, layer_biases_2,
            layer_weights_3, layer_biases_3,   
        )
    return wrapped

def train_conv_model(
        dataset, 
        layer_conv_weights_0, layer_conv_biases_0,
        layer_conv_weights_1, layer_conv_biases_1,
        layer_weights_2, layer_biases_2,
        layer_weights_3, layer_biases_3,   
    ):
    """
    Create tf representation for convolutional model
    """
    conv_0 = tf.nn.conv2d(
        dataset, 
        layer_conv_weights_0, 
        [1, 2, 2, 1], # strides
        padding='SAME')
    
    hidden_1 = tf.nn.relu(conv_0 + layer_conv_biases_0)
    
    conv_2 = tf.nn.conv2d(
        hidden_1, 
        layer_conv_weights_1, 
        [1, 2, 2, 1], #strides
        padding='SAME')
    
    hidden_3 = tf.nn.relu(conv_2 + layer_conv_biases_1)
    
    shape = hidden_3.get_shape().as_list()
    reshape = tf.reshape(hidden_3, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden_4 = tf.nn.relu(
        tf.matmul(reshape, layer_weights_2) + layer_biases_2)
    return tf.matmul(hidden_4, layer_weights_3) + layer_biases_3

def execute_session(
        session,
    
        num_steps,
        batch_size,
    
        optimizer,
        loss,
        train_prediction,
        valid_prediction,
        test_prediction,
    
        train_labels,
        valid_labels,
        test_labels,
    
        tf_train_dataset,
        tf_train_labels,
    ):
    """
    Execute session and print prediction result
    """
    def _run_step(step):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
    
        feed_dict = {
            tf_train_dataset : batch_data, 
            tf_train_labels : batch_labels
        }
        _, l, predictions = session.run(
           [
               optimizer, 
               loss, 
               train_prediction
           ], 
           feed_dict=feed_dict)
        if (step % 50 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
        

    for step in range(num_steps):
        _run_step(step)
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
        

def run_conv_model(
       num_of_steps,
       train_batch_size,
       width,
       height,

       train_dataset,
       train_labels,

       valid_dataset,
       valid_labels,
    
       test_dataset,
       test_labels,
    
       train_model_func,
       prepare_model_params_func,

       validate_model_func=None,
       **kwargs
    ):
    """
    Define & Run conv tf model
    """
    num_labels = train_labels.shape[1]

    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default():
        tf_train_dataset = tf.placeholder(
            tf.float32,
            shape=(
                train_batch_size, width, height, num_channels))
        tf_train_labels = tf.placeholder(
            tf.float32, shape=(train_batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)
        
        (
            optimizer,
            loss,
            train_prediction,
            valid_prediction,
            test_prediction
        ) = create_graph(
            tf_train_dataset,
            tf_train_labels,
            tf_valid_dataset,
            tf_test_dataset,
            train_model_func,
            prepare_model_params_func,
            validate_model_func,
            **kwargs
        )
    with tf.Session(graph=graph) as session:
        tf.global_variables_initializer().run()
        execute_session(
            session,
            num_of_steps,
            train_batch_size,
    
            optimizer,
            loss,
            train_prediction,
            valid_prediction,
            test_prediction,
    
            train_labels,
            valid_labels,
            test_labels,
    
            tf_train_dataset,
            tf_train_labels)
        
         

# Run conv nn model
print('\n\nRunning conv nn model')
run_conv_model(
    NUM_OF_STEPS,
    TRAIN_BATCH_SIZE,
    WIDTH,
    HEIGHT,

    train_dataset,
    train_labels,

    valid_dataset,
    valid_labels,
    
    test_dataset,
    test_labels,

    train_conv_model,
    prepare_params_for_conv_model(PATCH_SIZE, DEPTH))



Running conv nn model
Minibatch loss at step 0: 3.621693
Minibatch accuracy: 6.2%
Validation accuracy: 10.0%
Minibatch loss at step 50: 2.130113
Minibatch accuracy: 18.8%
Validation accuracy: 36.3%
Minibatch loss at step 100: 1.248052
Minibatch accuracy: 43.8%
Validation accuracy: 70.5%
Minibatch loss at step 150: 1.540194
Minibatch accuracy: 62.5%
Validation accuracy: 70.8%
Minibatch loss at step 200: 0.373242
Minibatch accuracy: 87.5%
Validation accuracy: 77.6%
Minibatch loss at step 250: 0.812025
Minibatch accuracy: 75.0%
Validation accuracy: 78.0%
Minibatch loss at step 300: 1.260340
Minibatch accuracy: 56.2%
Validation accuracy: 78.3%
Minibatch loss at step 350: 0.945352
Minibatch accuracy: 68.8%
Validation accuracy: 78.8%
Minibatch loss at step 400: 0.664533
Minibatch accuracy: 87.5%
Validation accuracy: 78.9%
Minibatch loss at step 450: 0.619652
Minibatch accuracy: 87.5%
Validation accuracy: 79.8%
Minibatch loss at step 500: 0.277992
Minibatch accuracy: 87.5%
Validation accura

In [None]:
# Max pooling
#PATCH_SIZE_CONV = 5
#NUM_CHANNELS_CONV = 1
#DEPTH = 16
#NUM_HIDDEN_CONV = 64


def train_conv_model_with_max_pool(
        dataset, 
        layer_conv_weights_0, layer_conv_biases_0,
        layer_conv_weights_1, layer_conv_biases_1,
        layer_weights_2, layer_biases_2,
        layer_weights_3, layer_biases_3,
    ):
    """
    Create tf representation for convolutional model using max pooling
    """
    conv_0 = tf.nn.conv2d(
        dataset, 
        layer_conv_weights_0, 
        [1, 1, 1, 1], # strides
        padding='SAME') + layer_conv_biases_0

    pool_1 = tf.nn.max_pool(
        conv_0,
        [1, 2, 2, 1], # kernel size
        [1, 2, 2, 1], #strides
        padding='SAME')
    
    hidden_2 = tf.nn.relu(pool_1)

    conv_3 = tf.nn.conv2d(
        hidden_2, 
        layer_conv_weights_1, 
        [1, 1, 1, 1], #strides
        padding='SAME') + layer_conv_biases_1

    pool_4 = tf.nn.max_pool(
        conv_3,
        [1, 2, 2, 1], # kernel size
        [1, 2, 2, 1], #strides
        padding='SAME')

    hidden_5 = tf.nn.relu(pool_4)
    
    shape = hidden_5.get_shape().as_list()
    reshape = tf.reshape(hidden_5, [shape[0], shape[1] * shape[2] * shape[3]])
    
    hidden_6 = tf.nn.relu(
        tf.matmul(reshape, layer_weights_2) + layer_biases_2)
    return tf.matmul(hidden_6, layer_weights_3) + layer_biases_3


print('\n\nRunning conv nn model with pooling')
run_conv_model(
    NUM_OF_STEPS,
    TRAIN_BATCH_SIZE,
    WIDTH,
    HEIGHT,

    train_dataset,
    train_labels,

    valid_dataset,
    valid_labels,
    
    test_dataset,
    test_labels,

    train_conv_model_with_max_pool,
    prepare_params_for_conv_model(PATCH_SIZE, DEPTH),)



Running conv nn model with pooling
Minibatch loss at step 0: 3.758092
Minibatch accuracy: 18.8%
Validation accuracy: 13.9%
Minibatch loss at step 50: 2.341930
Minibatch accuracy: 6.2%


---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay.

---