Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in `1_notmnist.ipynb`.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [80]:
TRAIN_BATCH_SIZE = 128
NUM_OF_STEPS = 3001
BETA = 0.001

DECAY_STEPS = 1000
DECAY_RATE = 0.96


def create_graph(
        tf_train_dataset,
        tf_train_labels,
        tf_valid_dataset,
        tf_test_dataset,
        train_model_func,
        prepare_model_params_func,
        validate_model_func=None,
        use_learning_rate_decay=False,
    ):
    """
    Create graph using specified function to train model (for example logistic regression).
    Return tf specifications for train prediction, validation prediction and test prediction.
    Graph should be initialized before calling this function
    """
    number_of_labels = tf_train_labels.shape[1].value

    model_params = prepare_model_params_func(
        tf_train_dataset.shape[1].value,
        number_of_labels)
    model = train_model_func(
        tf_train_dataset, *model_params)

    weights = model_params[0]
    
    loss = tf.reduce_mean(
       tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=model) + \
       BETA * tf.nn.l2_loss(weights))
  
    if not use_learning_rate_decay:
        optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    else:
        global_step = tf.Variable(0)
        learning_rate = tf.train.exponential_decay(
            0.5, 
            global_step,
            DECAY_STEPS,
            DECAY_RATE)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).\
            minimize(loss, global_step=global_step)
  
    train_prediction = tf.nn.softmax(model)
    
    validate_model_func = validate_model_func or train_model_func
    valid_prediction = tf.nn.softmax(
        validate_model_func(tf_valid_dataset, *model_params))
    test_prediction = tf.nn.softmax(
        validate_model_func(tf_test_dataset, *model_params))
    return (
        optimizer,
        loss,
        train_prediction,
        valid_prediction,
        test_prediction)

def prepare_params_for_logistic_model(
        num_features, num_labels):
    """
    Prepare parameters for logistic model
    """
    weights = tf.Variable(
        tf.truncated_normal([num_features, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))
    return weights, biases
    
def train_logistic_model(dataset, weights, biases):
    """
    Create tf representation for logistic model
    """
    return tf.matmul(dataset, weights) + biases


def prepare_params_for_nn_model(
        num_features, num_labels):
    """
    Prepare paraneters for nn model
    """
    weights_before_relu = tf.Variable(
        tf.truncated_normal([num_features, num_labels]))
    biases_before_relu = tf.Variable(tf.zeros([num_labels]))
    
    weights_after_relu = tf.Variable(
        tf.truncated_normal([num_labels, num_labels]))
    biases_after_relu = tf.Variable(tf.zeros([num_labels]))
    return (
        weights_before_relu, biases_before_relu,
        weights_after_relu, biases_after_relu,
    )

def train_nn_model(
        dataset, 
        weights_before_relu, biases_before_relu,
        weights_after_relu, biases_after_relu
    ):
    """
    Create tf representation for nn model
    """
    return tf.matmul(
            tf.nn.relu(
                tf.matmul(dataset, weights_before_relu) + biases_before_relu),
            weights_after_relu
            ) + biases_after_relu


def execute_session(
        session,
    
        num_steps,
        batch_size,
    
        optimizer,
        loss,
        train_prediction,
        valid_prediction,
        test_prediction,
    
        train_labels,
        valid_labels,
        test_labels,
    
        tf_train_dataset,
        tf_train_labels,
    ):
    """
    Execute session and print prediction result
    """
    def _run_step(step):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {
            tf_train_dataset : batch_data, 
            tf_train_labels : batch_labels
        }
        _, l, predictions = session.run(
           [
               optimizer, 
               loss, 
               train_prediction
           ], 
           feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
        

    for step in range(num_steps):
        _run_step(step)
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
        

def run_model(
       num_of_steps,
       train_batch_size,

       train_dataset,
       train_labels,

       valid_dataset,
       valid_labels,
    
       test_dataset,
       test_labels,
    
       train_model_func,
       prepare_model_params_func,

       validate_model_func=None,
       **kwargs
    ):
    """
    Define & Run tf model
    """
    num_labels = train_labels.shape[1]

    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default():
        tf_train_dataset = tf.placeholder(
            tf.float32,
            shape=(train_batch_size, train_dataset.shape[1]))
        tf_train_labels = tf.placeholder(tf.float32, shape=(train_batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset) 
        
        (
            optimizer,
            loss,
            train_prediction,
            valid_prediction,
            test_prediction
        ) = create_graph(
            tf_train_dataset,
            tf_train_labels,
            tf_valid_dataset,
            tf_test_dataset,
            train_model_func,
            prepare_model_params_func,
            validate_model_func,
            **kwargs
        )
    with tf.Session(graph=graph) as session:
        tf.global_variables_initializer().run()
        execute_session(
            session,
            num_of_steps,
            train_batch_size,
    
            optimizer,
            loss,
            train_prediction,
            valid_prediction,
            test_prediction,
    
            train_labels,
            valid_labels,
            test_labels,
    
            tf_train_dataset,
            tf_train_labels)
        
        
# Run logistic regression model
print('\n\nRunning logistic regression')
run_model(
    NUM_OF_STEPS,
    TRAIN_BATCH_SIZE,
    
    train_dataset,
    train_labels,

    valid_dataset,
    valid_labels,
    
    test_dataset,
    test_labels,

    train_logistic_model,
    prepare_params_for_logistic_model)  

# Run nn model
print('\n\nRunning nn model')
run_model(
    NUM_OF_STEPS,
    TRAIN_BATCH_SIZE,

    train_dataset,
    train_labels,

    valid_dataset,
    valid_labels,
    
    test_dataset,
    test_labels,

    train_nn_model,
    prepare_params_for_nn_model)




Running logistic regression
Minibatch loss at step 0: 21.335131
Minibatch accuracy: 7.8%
Validation accuracy: 11.0%
Minibatch loss at step 500: 3.133156
Minibatch accuracy: 70.3%
Validation accuracy: 75.8%
Minibatch loss at step 1000: 1.582602
Minibatch accuracy: 79.7%
Validation accuracy: 78.3%
Minibatch loss at step 1500: 1.283206
Minibatch accuracy: 77.3%
Validation accuracy: 80.5%
Minibatch loss at step 2000: 0.998777
Minibatch accuracy: 82.8%
Validation accuracy: 79.9%
Minibatch loss at step 2500: 1.029074
Minibatch accuracy: 76.6%
Validation accuracy: 81.4%
Minibatch loss at step 3000: 0.737538
Minibatch accuracy: 80.5%
Validation accuracy: 81.8%
Test accuracy: 88.7%


Running nn model
Minibatch loss at step 0: 37.321106
Minibatch accuracy: 8.6%
Validation accuracy: 14.9%
Minibatch loss at step 500: 3.008811
Minibatch accuracy: 58.6%
Validation accuracy: 68.5%
Minibatch loss at step 1000: 1.685855
Minibatch accuracy: 80.5%
Validation accuracy: 78.4%
Minibatch loss at step 1500:

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [44]:
NUM_OF_STEPS_OVERFITTED = 3001
NUMBER_OF_BATCH_OVERFITTED = 100
TRAIN_BATCH_SIZE_OVERFITTED = train_dataset.shape[0] / NUMBER_OF_BATCH_OVERFITTED

# Run logistic regression model
# to demonstrate an extreme case of overfitting
print('\n\nRunning logistic regression '
      'to demonstrate an extreme case of overfitting')
run_model(
    NUM_OF_STEPS_OVERFITTED,
    TRAIN_BATCH_SIZE_OVERFITTED,
    
    train_dataset,
    train_labels,

    valid_dataset,
    valid_labels,
    
    test_dataset,
    test_labels,

    train_logistic_model,
    prepare_params_for_logistic_model)  

# Run nn model
# to demonstrate an extreme case of overfitting
print('\n\nRunning nn model '
      'to demonstrate an extreme case of overfitting')
run_model(
    NUM_OF_STEPS_OVERFITTED,
    TRAIN_BATCH_SIZE_OVERFITTED,

    train_dataset,
    train_labels,

    valid_dataset,
    valid_labels,
    
    test_dataset,
    test_labels,

    train_nn_model,
    prepare_params_for_nn_model)



Running logistic regression to demonstrate an extreme case of overfitting
Minibatch loss at step 0: 22.044024
Minibatch accuracy: 8.2%
Validation accuracy: 9.6%
Minibatch loss at step 500: 2.728003
Minibatch accuracy: 76.6%
Validation accuracy: 76.9%
Minibatch loss at step 1000: 1.699403
Minibatch accuracy: 78.3%
Validation accuracy: 79.5%
Minibatch loss at step 1500: 1.205603
Minibatch accuracy: 80.2%
Validation accuracy: 81.1%
Minibatch loss at step 2000: 0.935479
Minibatch accuracy: 82.3%
Validation accuracy: 82.2%
Minibatch loss at step 2500: 0.736053
Minibatch accuracy: 84.3%
Validation accuracy: 82.9%
Minibatch loss at step 3000: 0.663834
Minibatch accuracy: 84.5%
Validation accuracy: 82.9%
Test accuracy: 90.0%


Running nn model to demonstrate an extreme case of overfitting
Minibatch loss at step 0: 39.038002
Minibatch accuracy: 6.8%
Validation accuracy: 11.7%
Minibatch loss at step 500: 2.729185
Minibatch accuracy: 73.5%
Validation accuracy: 74.9%
Minibatch loss at step 1000:

---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [50]:
KEEP_PROB = 0.8
# Introduce droputs
def train_nn_model_with_dropouts(
        dataset, 
        weights_before_relu, biases_before_relu,
        weights_after_relu, biases_after_relu
    ):
    """
    Create tf representation for nn model with droput
    """
    return tf.nn.dropout(
        train_nn_model(        
            dataset, 
            weights_before_relu, biases_before_relu,
            weights_after_relu, biases_after_relu),
        KEEP_PROB)

def train_logistic_model_with_dropouts(
        dataset, weights, biases):
    """
    Create tf representation for logistic model with dropout
    """
    return tf.nn.dropout(
        train_logistic_model(
            dataset, weights, biases),
        KEEP_PROB)

# Run logistic regression model with dropouts
print('\n\nRunning logistic regression with dropouts')
run_model(
    NUM_OF_STEPS,
    TRAIN_BATCH_SIZE,
    
    train_dataset,
    train_labels,

    valid_dataset,
    valid_labels,
    
    test_dataset,
    test_labels,

    train_logistic_model_with_dropouts,
    prepare_params_for_logistic_model,

    train_logistic_model,)  

# Run nn model with dropouts
print('\n\nRunning nn model with dropouts')
run_model(
    NUM_OF_STEPS,
    TRAIN_BATCH_SIZE,

    train_dataset,
    train_labels,

    valid_dataset,
    valid_labels,
    
    test_dataset,
    test_labels,

    train_nn_model_with_dropouts,
    prepare_params_for_nn_model,
    train_nn_model,)



Running logistic regression with dropouts
Minibatch loss at step 0: 24.326685
Minibatch accuracy: 10.2%
Validation accuracy: 9.2%
Minibatch loss at step 500: 3.858128
Minibatch accuracy: 59.4%
Validation accuracy: 75.1%
Minibatch loss at step 1000: 1.855206
Minibatch accuracy: 64.1%
Validation accuracy: 77.7%
Minibatch loss at step 1500: 1.609410
Minibatch accuracy: 65.6%
Validation accuracy: 79.1%
Minibatch loss at step 2000: 1.362005
Minibatch accuracy: 64.8%
Validation accuracy: 79.5%
Minibatch loss at step 2500: 1.252787
Minibatch accuracy: 67.2%
Validation accuracy: 80.7%
Minibatch loss at step 3000: 0.988527
Minibatch accuracy: 73.4%
Validation accuracy: 81.0%
Test accuracy: 88.5%


Running nn model with dropouts
Minibatch loss at step 0: 36.381355
Minibatch accuracy: 14.8%
Validation accuracy: 19.1%
Minibatch loss at step 500: 3.095686
Minibatch accuracy: 61.7%
Validation accuracy: 69.2%
Minibatch loss at step 1000: 2.036812
Minibatch accuracy: 69.5%
Validation accuracy: 77.4%

---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [63]:
# Muli layer model
def train_multilayer_nn_model(
        dataset, 
        *weights_n_biases
    ):
    """
    Create tf representation for nn model with multilayer
    """
    weights_0, biases_0 = weights_n_biases[:2]
    dataset = tf.matmul(dataset, weights_0) + biases_0
    
    for index in range(2, len(weights_n_biases), 2):
        weights_i = weights_n_biases[index]
        biases_i = weights_n_biases[index + 1]

        dataset = tf.matmul(
            tf.nn.relu(dataset),
            weights_i) + biases_i

    return dataset

def train_multilayer_nn_model_with_dropouts(
        dataset, 
        *weights_n_biases
    ):
    """
    Create tf representation for nn model with droput
    """
    return tf.nn.dropout(
        train_multilayer_nn_model(
            dataset, *weights_n_biases),
        KEEP_PROB)

def prepare_params_for_nn_model_multilayer(layer_number):
    """
    Prepare paraneters for nn model with multiple layers
    """
    def wrapped(
            num_features, num_labels):
        weights_0 = tf.Variable(
            tf.truncated_normal([num_features, num_labels]))
        biases_0 = tf.Variable(tf.zeros([num_labels]))
        
        weights_n_biases = [
            weights_0,
            biases_0,
        ]
    
        for _ in range(layer_number):
            weights_n_biases.extend([
                tf.Variable(
                    tf.truncated_normal([num_labels, num_labels])),
                tf.Variable(tf.zeros([num_labels]))  
            ])
            
        return weights_n_biases
    return wrapped


N_LAYERS = 1

# Run nn model with dropouts
print('\n\nRunning nn model with multiple layers')
run_model(
    NUM_OF_STEPS,
    TRAIN_BATCH_SIZE,

    train_dataset,
    train_labels,

    valid_dataset,
    valid_labels,
    
    test_dataset,
    test_labels,

    train_multilayer_nn_model_with_dropouts,
    prepare_params_for_nn_model_multilayer(N_LAYERS),
    train_multilayer_nn_model,)

# Best test occuracy with only 1 layer



Running nn model with multiple layers
Minibatch loss at step 0: 40.786430
Minibatch accuracy: 7.0%
Validation accuracy: 15.3%
Minibatch loss at step 500: 3.232042
Minibatch accuracy: 62.5%
Validation accuracy: 69.3%
Minibatch loss at step 1000: 1.938446
Minibatch accuracy: 73.4%
Validation accuracy: 77.6%
Minibatch loss at step 1500: 1.673009
Minibatch accuracy: 69.5%
Validation accuracy: 79.2%
Minibatch loss at step 2000: 1.326191
Minibatch accuracy: 69.5%
Validation accuracy: 80.3%
Minibatch loss at step 2500: 1.307503
Minibatch accuracy: 68.8%
Validation accuracy: 81.4%
Minibatch loss at step 3000: 0.958134
Minibatch accuracy: 71.1%
Validation accuracy: 81.7%
Test accuracy: 89.4%


In [81]:
# Use learning rate decay
# Run nn model with learning rate decay
print('\n\nRunning nn model with learning rate decay')
run_model(
    NUM_OF_STEPS,
    TRAIN_BATCH_SIZE,

    train_dataset,
    train_labels,

    valid_dataset,
    valid_labels,
    
    test_dataset,
    test_labels,

    train_nn_model_with_dropouts,
    prepare_params_for_nn_model,
    train_nn_model,

    use_learning_rate_decay=True)



Running nn model with learning rate decay
Minibatch loss at step 0: 35.765656
Minibatch accuracy: 12.5%
Validation accuracy: 15.3%
Minibatch loss at step 500: 3.108158
Minibatch accuracy: 60.9%
Validation accuracy: 68.0%
Minibatch loss at step 1000: 1.992736
Minibatch accuracy: 67.2%
Validation accuracy: 77.5%
Minibatch loss at step 1500: 1.527724
Minibatch accuracy: 65.6%
Validation accuracy: 79.0%
Minibatch loss at step 2000: 1.294971
Minibatch accuracy: 70.3%
Validation accuracy: 80.5%
Minibatch loss at step 2500: 1.104659
Minibatch accuracy: 75.0%
Validation accuracy: 81.2%
Minibatch loss at step 3000: 0.939039
Minibatch accuracy: 71.9%
Validation accuracy: 81.8%
Test accuracy: 89.0%
