Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [0]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in `1_notmnist.ipynb`.

In [0]:
import os
data_root = './data'
pickle_file = os.path.join(data_root, 'notMNIST.pickle')
print("{}".format(pickle_file))

In [0]:
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [0]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

In [0]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

In [0]:
# Randomly picks a batch of training and validation data
def get_data(self, dataset, labels):
  n = len(dataset)
  perm = np.random.randint(n, size=self.batch_size)
  return dataset[perm], labels[perm]

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

LOGISTIC REGRESSION: Softmax Score = X*W1 + B1
NEURAL MODEL: Softmax Score =(ReLU(X*W1 + B1))*W2 + B2

In [0]:
# LOGISTIC REGRESSION: Softmax Score = X*W1 + B1
def define_state_logistic(self):
  with self.graph.as_default():
    # Input Layer placeholders for data and labels
    self.dataset_ph = tf.placeholder(tf.float32,
                                     shape=(None, self.n_features), name="dataset")
    self.labels_ph = tf.placeholder(tf.float32,
                                    shape=(None, self.label_size), name="labels")
    self.l2_reg_ph = tf.placeholder(tf.float32, name= "l2_reg")
    # Use truncated normal initializer
    tr_norm_init = tf.truncated_normal_initializer()

    # Softmax Score Layer:
    with tf.variable_scope("Softmax", initializer=tr_norm_init) as scope:
      sm_weights = tf.get_variable("Weights", [self.n_features, self.label_size])
      sm_biases    = tf.get_variable("Biases", [self.label_size], initializer=tf.zeros_initializer)

    return
def define_computation_logistic(self):
  with self.graph.as_default():
    # Input/Placeholder to NN
    with tf.variable_scope("Softmax", reuse=True) as scope:
      sm_weights = tf.get_variable("Weights")
      sm_biases = tf.get_variable("Biases")

    logits = tf.matmul(self.dataset_ph, sm_weights) + sm_biases

    # SoftmaxScore to Prediction
    self.prediction_op = tf.nn.softmax(logits, name="Prediction")
    # Cross Entropy Loss
    XEnt = tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=self.labels_ph)
  
    # Scalar Loss
    self.loss_op = tf.reduce_mean(XEnt) + self.l2_reg_ph*tf.nn.l2_loss(sm_weights)
  
    # Accuracy
    P = tf.argmax(self.prediction_op, axis=1)
    L = tf.argmax(self.labels_ph, axis=1)
    self.accuracy_op = 100.0 * tf.reduce_mean(tf.to_float(tf.equal(P,L)))
  
    # Optimizer
    self.train_op = tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss_op)

    return

def run_epochs(self):
  best_val_epoch = 0
  best_val_acc      = -float('inf')

  with tf.device('/cpu:0'), tf.Session(graph=self.graph) as s:
    initer = tf.global_variables_initializer()
    s.run(initer)
    print("Initialized")

    for step in range(self.n_epochs):
      # Generate a minibatch.
      batch_data, batch_labels = self.get_data(train_dataset, train_labels)
      # Dictionary: Feeds Training Minibatch.
      feed_dict = {self.dataset_ph: batch_data, self.labels_ph: batch_labels, self.l2_reg_ph:self.l2_reg}
      _, tr_loss, tr_acc = s.run( [self.train_op, self.loss_op, self.accuracy_op], feed_dict=feed_dict)

      if ((step != 0) and (step % 500 == 0)):
        # Track validationation accuracy
        feed_dict = {self.dataset_ph: valid_dataset, self.labels_ph: valid_labels, self.l2_reg_ph:1.0}
        val_acc = s.run(self.accuracy_op, feed_dict=feed_dict)

        print("Minibatch Step=%d"%step)
        print("------------------------------")
        print("Training Loss=%.2f: Accuracy=%.2f%%" % (tr_loss, tr_acc))
        print("Validation Accuracy: %.2f%%" % val_acc)
      
        # Remember the epoch when best validation accuracy was realized
        # Stop early if average validation accuracy is not improving for a few steps
        if val_acc > best_val_acc:
          best_val_epoch = step
          best_val_acc = val_acc
          
        if step >= (best_val_epoch + self.early_stop):
          print("Step %d: Best_Epoch %d: Early_Stop %d" % (step, best_val_epoch, self.early_stop))
          print("Terminate training early. Validation accuracy has flattened at %.2f%%"% best_val_acc)
          break
      
    print("Training Completed: ")
    print("-------------------------------")
    feed_dict = {self.dataset_ph: test_dataset, self.labels_ph: test_labels, self.l2_reg_ph:1.0}
    test_acc = s.run( self.accuracy_op, feed_dict=feed_dict)
    print("Test Accuracy: %.2f%%" % test_acc)  

In [0]:
# Dimensions
# image_size = 28
# num_labels = 10
# hidden layer
class ImageModel():
  l2_reg           = 0.01
  lr                    = 0.01
  label_size    = num_labels
  n_features   = image_size*image_size
  batch_size   = 128
  n_epochs     = 100000
  early_stop   = 1000

  define_state = define_state_logistic
  define_computation = define_computation_logistic
  get_data = get_data
  run_epochs = run_epochs  

  def __init__(self):
    self.graph = tf.Graph()
    
imMod = ImageModel()    
imMod.define_state()
imMod.define_computation()
imMod.run_epochs()

In [0]:
# Dimensions
# image_size = 28
# num_labels = 10
# hidden layer
class ImageModel():
  l2_reg           = 0.01
  dropout       = 1.0
  lr                    = 0.01
  label_size    = num_labels
  hidden_size = 1024
  n_features   = image_size*image_size
  batch_size   = 128
  n_epochs     = 100000
  early_stop   = 1000

  define_state = define_state_nn
  define_computation = define_computation_nn
  get_data = get_data
  run_epochs = run_epochs  

  def __init__(self):
    self.graph = tf.Graph()
    
imMod = ImageModel()    
imMod.define_state()
imMod.define_computation()
imMod.run_epochs()

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [0]:
# Dimensions
# image_size = 28
# num_labels = 10
# hidden layer
class ImageModel():
  l2_reg           = 0.01
  dropout       = 1.0
  lr                    = 0.01
  label_size    = num_labels
  hidden_size = 1024
  n_features   = image_size*image_size
  batch_size   = 128
  n_epochs     = 1000
  early_stop   = 1000

  define_state = define_state_nn
  define_computation = define_computation_nn
  get_data = get_data
  run_epochs = run_epochs  

  def __init__(self):
    self.graph = tf.Graph()
    
imMod = ImageModel()    
imMod.define_state()
imMod.define_computation()
imMod.run_epochs()

---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [0]:
# Dimensions
# image_size = 28
# num_labels = 10
# hidden layer
class ImageModel():
  l2_reg           = 0.01
  dropout       = 0.9
  lr                    = 0.01
  label_size    = num_labels
  hidden_size = 1024
  n_features   = image_size*image_size
  batch_size   = 128
  n_epochs     = 1000
  early_stop   = 1000

  define_state = define_state_nn
  define_computation = define_computation_nn
  get_data = get_data
  run_epochs = run_epochs  

  def __init__(self):
    self.graph = tf.Graph()
    
imMod = ImageModel()    
imMod.define_state()
imMod.define_computation()
imMod.run_epochs()

---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [0]:
# DEEP NEURAL MODEL: Softmax Score =(((ReLU(X*W1 + B1))*W2 + B2)*W3 + B3)*W4 + B4
def define_state_deep_nn(self):
  with self.graph.as_default():
    # Input Layer placeholders for data and labels
    self.dataset_ph = tf.placeholder(tf.float32,
                                     shape=(None, self.n_features), name="dataset")
    self.labels_ph = tf.placeholder(tf.float32,
                                    shape=(None, self.label_size), name="labels")
    self.l2_reg_ph = tf.placeholder(tf.float32, name= "l2_reg")
    self.dropout_ph = tf.placeholder(tf.float32, name= "dropout")

    # Use truncated normal initializer
    tr_norm_init = tf.truncated_normal_initializer(stddev=0.1)
    # NN Layer 1:
    with tf.variable_scope("NN1", initializer=tr_norm_init) as scope:
      nn1_weights = tf.get_variable("Weights", [self.n_features, self.hidden1_size])
      nn1_biases    = tf.get_variable("Biases", [self.hidden1_size], initializer=tf.zeros_initializer)
    # NN Layer 2:
    with tf.variable_scope("NN2", initializer=tr_norm_init) as scope:
      nn2_weights = tf.get_variable("Weights", [self.hidden1_size, self.hidden2_size])
      nn2_biases    = tf.get_variable("Biases", [self.hidden2_size], initializer=tf.zeros_initializer)
    # NN Layer 3:
    with tf.variable_scope("NN3", initializer=tr_norm_init) as scope:
      nn2_weights = tf.get_variable("Weights", [self.hidden2_size, self.hidden3_size])
      nn2_biases    = tf.get_variable("Biases", [self.hidden3_size], initializer=tf.zeros_initializer)
    # Softmax Score Layer:
    with tf.variable_scope("Softmax", initializer=tr_norm_init) as scope:
      sm_weights = tf.get_variable("Weights", [self.hidden3_size, self.label_size])
      sm_biases    = tf.get_variable("Biases", [self.label_size], initializer=tf.zeros_initializer)

    return
    
def define_computation_deep_nn(self):
  with self.graph.as_default():
    # NN1
    with tf.variable_scope("NN1", reuse=True) as scope:
      nn1_weights = tf.get_variable("Weights")
      nn1_biases = tf.get_variable("Biases")

    Z1 = tf.matmul(self.dataset_ph, nn1_weights) + nn1_biases
    A1 = tf.nn.relu(Z1, "ReLU")
    # Dropout at Activation
    Ad1 = tf.nn.dropout(A1, self.dropout_ph)
    nn1_loss = tf.nn.l2_loss(nn1_weights)

    # NN2
    with tf.variable_scope("NN2", reuse=True) as scope:
       nn2_weights = tf.get_variable("Weights")
       nn2_biases = tf.get_variable("Biases")

    Z2 = tf.matmul(Ad1, nn2_weights) + nn2_biases
    A2 = tf.nn.relu(Z2, "ReLU")
    # Dropout at Activation
    Ad2 = tf.nn.dropout(A2, self.dropout_ph)
    nn2_loss = tf.nn.l2_loss(nn2_weights)
  
    # NN3
    with tf.variable_scope("NN2", reuse=True) as scope:
       nn3_weights = tf.get_variable("Weights")
       nn3_biases = tf.get_variable("Biases")

    Z3 = tf.matmul(Ad2, nn3_weights) + nn3_biases
    A3 = tf.nn.relu(Z3, "ReLU")
    # Dropout at Activation
    Ad3 = tf.nn.dropout(A3, self.dropout_ph)
    nn3_loss = tf.nn.l2_loss(nn3_weights)
  
    # NN3 to SoftmaxScore
    with tf.variable_scope("Softmax", reuse=True) as scope:
      sm_weights = tf.get_variable("Weights")
      sm_biases    = tf.get_variable("Biases")
    Z4 = tf.matmul(Ad3, sm_weights) + sm_biases
    # Dropout at Output
    logits = tf.nn.dropout(Z4, self.dropout_ph)  
    sm_loss = tf.nn.l2_loss(sm_weights)
    
    # SoftmaxScore to Prediction
    self.prediction_op = tf.nn.softmax(logits, name="Prediction")

    # Cross Entropy Loss
    XEnt = tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=self.labels_ph)
  
    # Scalar Loss
    self.loss_op = tf.reduce_mean(XEnt) + self.l2_reg_ph*(nn1_loss + nn2_loss + nn3_loss + sm_loss)
  
    # Accuracy
    P = tf.argmax(self.prediction_op, axis=1)
    L = tf.argmax(self.labels_ph, axis=1)
    self.accuracy_op = 100.0 * tf.reduce_mean(tf.to_float(tf.equal(P,L)))

    # Optimizer with exponential decay of learning rate
    n_steps = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(self.lr, n_steps, self.decay_steps, self.decay_rate)
    self.train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(self.loss_op, global_step=n_steps)

  return

def run_epochs_deep_nn(self):
  best_val_epoch = 0
  best_val_acc      = -float('inf')

  with tf.device('/cpu:0'), tf.Session(graph=self.graph) as s:
    initer = tf.global_variables_initializer()
    s.run(initer)
    print("Initialized")

    for step in range(self.n_epochs):
      # Generate a minibatch.
      batch_data, batch_labels = self.get_data(train_dataset, train_labels)
      # Dictionary: Feeds Training Minibatch.
      feed_dict = {
        self.dataset_ph:    batch_data,
        self.labels_ph:       batch_labels,
        self.l2_reg_ph:       self.l2_reg,
        self.dropout_ph:   self.dropout
      }
      _, tr_loss, tr_acc = s.run( [self.train_op, self.loss_op, self.accuracy_op], feed_dict=feed_dict)

      if (step % 500 == 0):
        # Track validationation accuracy
        feed_dict = {
          self.dataset_ph: valid_dataset,
          self.labels_ph: valid_labels,
          self.l2_reg_ph:0.0,
          self.dropout_ph:1.0
        }
        val_acc = s.run(self.accuracy_op, feed_dict=feed_dict)

        print("Minibatch Step=%d"%step)
        print("------------------------------")
        print("Training Loss=%.2f: Accuracy=%.2f%%" % (tr_loss, tr_acc))
        print("Validation Accuracy: %.2f%%" % val_acc)
      
        # Remember the epoch when best validation accuracy was realized
        # Stop early if average validation accuracy is not improving for a few steps
        if val_acc > best_val_acc:
          best_val_epoch = step
          best_val_acc = val_acc
          
        if step >= (best_val_epoch + self.early_stop):
          print("Step %d: Best_Epoch %d: Early_Stop %d" % (step, best_val_epoch, self.early_stop))
          print("Terminate training early. Validation accuracy has flattened at %.2f%%"% best_val_acc)
          break
      
    print("Training Completed: ")
    print("-------------------------------")
    feed_dict = {
      self.dataset_ph: test_dataset,
      self.labels_ph: test_labels,
      self.l2_reg_ph:0.0,
      self.dropout_ph:1.0
    }
    test_acc = s.run( self.accuracy_op, feed_dict=feed_dict)
    print("Test Accuracy: %.2f%%" % test_acc)

In [0]:
# Dimensions
# image_size = 28
# num_labels = 10
# hidden layer
class ImageModel():
  l2_reg           = 0.01
  dropout       = 0.9
  lr                    = 0.1
  decay_rate   = 0.95
  decay_steps = 100
  label_size    = num_labels
  hidden1_size = 1024
  hidden2_size = 1024
  hidden3_size = 1024
  n_features   = image_size*image_size
  batch_size   = 128
  n_epochs     = 10000
  early_stop   = 2500

  define_state = define_state_deep_nn
  define_computation = define_computation_deep_nn
  get_data = get_data
  run_epochs = run_epochs_deep_nn

  def __init__(self):
    self.graph = tf.Graph()
    
imMod = ImageModel()    
imMod.define_state()
imMod.define_computation()
imMod.run_epochs()

Accuracy of around 91.49% was accomplished with the following hyperparameters:
* 2 layer NN of 512 units each.
* L2 Reg 0.01 Dropout 0.95
* LR 0.1 Decay Rate 0.90 Decay Steps 1000
Accuracy of around 90.51% was accomplished with the following hyperparameters:
* 3 layer NN of 1024 units each.
* L2 Reg 0.01 Dropout 0.9
* LR 0.1 Decay Rate 0.95 Decay Steps 100
