Deep Learning
=============

Assignment 3 | Tianzi Cai | 2016-06-30

------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [2]:
# Make sure TensorFlow can be imported; if it cannot, install the pinned
# 0.6.0 CPU wheel with the pip found under /databricks/python/bin.
try:
  import tensorflow as tf
except ImportError:
  print("Installing TensorFlow")
  import subprocess
  subprocess.check_call([
      "/databricks/python/bin/pip",
      "install",
      "https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.6.0-cp27-none-linux_x86_64.whl",
  ])
  print("TensorFlow has been installed on this cluster")
else:
  print("TensorFlow is already installed")

In [3]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in `1_notmnist.ipynb`.

In [5]:
pickle_file = 'notMNIST.pickle'

# Read the saved dict, then pull out the six arrays once the file is closed.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# open pickles that you produced yourself.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # drop the dict so the gc can reclaim it

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [7]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  """Flatten the images and one-hot encode the labels.

  Args:
    dataset: array that can be reshaped into rows of
      image_size * image_size values (the row count is inferred via -1).
    labels: array of class indices; label k becomes a row with 1.0 in
      column k and 0.0 everywhere else.

  Returns:
    (dataset, labels), both converted to float32.
  """
  pixels_per_image = image_size * image_size
  flat = dataset.reshape(-1, pixels_per_image).astype(np.float32)
  # Comparing a column of labels with the row 0..num_labels-1 broadcasts to
  # a boolean matrix that is True exactly at each label's own column.
  class_ids = np.arange(num_labels)
  one_hot = (class_ids == labels[:, np.newaxis]).astype(np.float32)
  return flat, one_hot
# Replace each split with its flattened / one-hot encoded version, then
# report the resulting shapes.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
for split_title, split_data, split_labels in (
    ('Training set: ', train_dataset, train_labels),
    ('Validation set: ', valid_dataset, valid_labels),
    ('Test set: ', test_dataset, test_labels)):
  print(split_title, split_data.shape, split_labels.shape)

In [8]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose top-scoring class matches the label.

  Both arguments are reduced with np.argmax along axis 1, so each is
  expected to hold one row of per-class values per example.
  """
  predicted_class = np.argmax(predictions, 1)
  true_class = np.argmax(labels, 1)
  num_correct = np.sum(predicted_class == true_class)
  return 100.0 * num_correct / predictions.shape[0]

### Problem 1

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor t using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

https://github.com/arrisray/ud730/blob/38d01d2073e0b01c35df44488415ef6d0fcd6f91/3_regularization.ipynb

In [11]:
def logistic_graph(beta):
  """Build the graph for softmax (multinomial logistic) regression with L2.

  Args:
    beta: multiplier applied to tf.nn.l2_loss(weights) before it is added
      to the mean softmax cross-entropy.

  Returns:
    A dict holding the graph plus the tensors/ops needed to train and
    evaluate it: the two feed placeholders, the validation/test
    constants, the regularized loss, the gradient-descent op (learning
    rate 0.5) and the softmax predictions for train/valid/test.
  """
  num_pixels = image_size * image_size
  graph = tf.Graph()
  with graph.as_default():
    # Fed per mini-batch; the leading None leaves the batch size open.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(None, num_pixels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(None, num_labels))
    # Validation and test data are baked into the graph as constants.
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # One weight column and one bias per class.
    weights = tf.Variable(tf.truncated_normal([num_pixels, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))

    def class_scores(inputs):
      # Linear scores shared by the training and evaluation paths.
      return tf.matmul(inputs, weights) + biases

    # Regularized objective: mean cross-entropy plus the scaled L2 penalty.
    logits = class_scores(tf_train_dataset)
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
    l2_penalty = beta * tf.nn.l2_loss(weights)
    loss = cross_entropy + l2_penalty

    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(class_scores(tf_valid_dataset))
    test_prediction = tf.nn.softmax(class_scores(tf_test_dataset))

  return dict(
      graph=graph,
      tf_train_dataset=tf_train_dataset,
      tf_train_labels=tf_train_labels,
      tf_valid_dataset=tf_valid_dataset,
      tf_test_dataset=tf_test_dataset,
      loss=loss,
      optimizer=optimizer,
      train_prediction=train_prediction,
      valid_prediction=valid_prediction,
      test_prediction=test_prediction)

In [12]:
def nn_graph(beta, use_dropout = False):
  """Build the graph for a one-hidden-layer ReLU network with L2 regularization.

  Args:
    beta: multiplier applied to the L2 loss of each weight matrix before it
      is added to the mean softmax cross-entropy.
    use_dropout: when True, dropout with keep probability 0.5 is applied to
      the hidden activations on the training path only; the validation and
      test predictions never go through dropout.

  Returns:
    A dict holding the graph plus the tensors/ops needed to train and
    evaluate it: the two feed placeholders, the validation/test constants,
    the regularized loss, the gradient-descent op (learning rate 0.5) and
    the softmax predictions for train/valid/test.
  """
  graph = tf.Graph()
  with graph.as_default():
    num_hidden_nodes = 1024

    # Input data.
    # The batch dimension is left as None (it used to be fixed at 128) so
    # that any batch_size passed to run_graph can be fed.
    tf_train_dataset = tf.placeholder(tf.float32, shape = (None, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape = (None, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Hidden layer variables and activations.
    weights_1 = tf.Variable(tf.truncated_normal([image_size * image_size, num_hidden_nodes]))
    biases_1 = tf.Variable(tf.zeros([num_hidden_nodes]))
    hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset, weights_1) + biases_1)

    # Output layer variables.
    weights_2 = tf.Variable(tf.truncated_normal([num_hidden_nodes, num_labels]))
    biases_2 = tf.Variable(tf.zeros([num_labels]))

    # Training path: optionally drop hidden units before the output layer.
    if use_dropout:
      train_hidden = tf.nn.dropout(hidden_layer, 0.5)
    else:
      train_hidden = hidden_layer
    logits = tf.matmul(train_hidden, weights_2) + biases_2

    # Regularized objective: mean cross-entropy plus the scaled L2 penalties.
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
    reg = beta * tf.nn.l2_loss(weights_1) + beta * tf.nn.l2_loss(weights_2)
    loss = loss + reg

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions. Evaluation reuses the trained variables without dropout.
    train_prediction = tf.nn.softmax(logits)

    hidden_valid_prediction = tf.nn.relu(tf.matmul(tf_valid_dataset, weights_1) + biases_1)
    valid_prediction = tf.nn.softmax(tf.matmul(hidden_valid_prediction, weights_2) + biases_2)

    hidden_test_prediction = tf.nn.relu(tf.matmul(tf_test_dataset, weights_1) + biases_1)
    test_prediction = tf.nn.softmax(tf.matmul(hidden_test_prediction, weights_2) + biases_2)

  return {'graph': graph,
         'tf_train_dataset': tf_train_dataset,
         'tf_train_labels': tf_train_labels,
         'tf_valid_dataset': tf_valid_dataset,
         'tf_test_dataset': tf_test_dataset,
         'loss': loss,
         'optimizer': optimizer,
         'train_prediction': train_prediction,
         'valid_prediction': valid_prediction,
         'test_prediction': test_prediction}

In [13]:
def run_graph(graph_info, num_steps = 801, batch_size = 128, train_size = 20000):
  """Train a graph with mini-batch gradient descent and print its progress.

  Args:
    graph_info: dict as returned by logistic_graph / nn_graph.
    num_steps: number of optimizer steps to run.
    batch_size: number of rows fed per step.
    train_size: number of training rows to train on. When it is smaller
      than the number of rows in train_dataset, that many rows are drawn at
      random without replacement; otherwise the whole training set is used.

  Prints the mini-batch loss, training accuracy and validation accuracy
  every 500 steps, and the test accuracy once training has finished.
  Returns None.
  """
  graph = graph_info['graph']
  tf_train_dataset = graph_info['tf_train_dataset']
  tf_train_labels = graph_info['tf_train_labels']
  loss = graph_info['loss']
  optimizer = graph_info['optimizer']
  train_prediction = graph_info['train_prediction']
  valid_prediction = graph_info['valid_prediction']
  test_prediction = graph_info['test_prediction']

  # Size the subsample against the real number of rows rather than a
  # hard-coded 20000, so train_size is honored for any training set.
  num_available = train_dataset.shape[0]
  if train_size < num_available:
    picked = np.random.choice(num_available, train_size, False)
    # Indexing with an index array copies the selected rows; that copy is
    # needed because the rows are not contiguous.
    t_dataset = train_dataset[picked, :]
    t_labels = train_labels[picked, :]
  else:
    # Using the arrays directly makes no copy.
    t_dataset = train_dataset
    t_labels = train_labels

  # Range of batch start offsets. When the data holds no more than one
  # batch this is <= 0, so always start at 0 instead of taking a modulus by
  # zero (or by a negative number).
  span = t_labels.shape[0] - batch_size

  with tf.Session(graph = graph) as session:
    tf.initialize_all_variables().run()

    for step in range(num_steps):

      # Generate a minibatch: a window that slides through the data and
      # wraps around.
      offset = (step * batch_size) % span if span > 0 else 0
      batch_data = t_dataset[offset:(offset + batch_size), :]
      batch_labels = t_labels[offset:(offset + batch_size), :]

      feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
      _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict = feed_dict)

      if (step % 500 == 0):
        print('Mini-batch (%d) Loss at step %d: %f' % (batch_size, step, l))
        print('Mini-batch (%d) Training accuracy: %.1f%%' % (batch_size, accuracy(predictions, batch_labels)))
        print('Mini-batch (%d) Validation accuracy: %.1f%%' % (batch_size, accuracy(valid_prediction.eval(), valid_labels)))

    print('Mini-batch (%d) Test accuracy: %.1f%%\n\n' % (batch_size, accuracy(test_prediction.eval(), test_labels)))

In [14]:
# Sweep the L2 strength for the logistic model: one fresh graph and one
# full training run per value.
for l2_strength in (0.1, 0.01, 0.001, 0.0001):
  logistic_graph_info = logistic_graph(beta=l2_strength)
  run_graph(logistic_graph_info, num_steps=1001, batch_size=128)

In [15]:
# Same L2 sweep for the one-hidden-layer network, without dropout.
for l2_strength in (0.1, 0.01, 0.001, 0.0001):
  nn_graph_info = nn_graph(beta=l2_strength, use_dropout=False)
  run_graph(nn_graph_info, num_steps=1001, batch_size=128, train_size=20000)

### Problem 2

Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

In [17]:
# With only four batches' worth of training rows (4 * 128), the model
# quickly overfits.

logistic_graph_info = logistic_graph(beta=0.001)
run_graph(logistic_graph_info, num_steps=1001, batch_size=128,
          train_size=4 * 128)

In [18]:
# Same restricted training set (4 * 128 rows) for the hidden-layer network.
nn_graph_info = nn_graph(beta=0.001, use_dropout=False)
run_graph(nn_graph_info, num_steps=1001, batch_size=128,
          train_size=4 * 128)

### Problem 3

Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

In [20]:
# Dropout enabled, beta = 0.005.
# NOTE(review): this run passes train_size=20000, not the 4 * 128 subset
# used in the Problem 2 cells.
nn_graph_info = nn_graph(beta=0.005, use_dropout=True)
run_graph(nn_graph_info, num_steps=1001, batch_size=128,
          train_size=20000)

In [21]:
# Dropout enabled again, with a slightly smaller penalty (beta = 0.004).
nn_graph_info = nn_graph(beta=0.004, use_dropout=True)
run_graph(nn_graph_info, num_steps=1001, batch_size=128,
          train_size=20000)

### Problem 4
Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).
One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

```
global_step = tf.Variable(0)  # count the number of steps taken.
learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
```

In [23]:
# Scratch arithmetic using the same (step * 128) % (20000 - 128) form as the
# mini-batch offset in run_graph: the first value adds one batch length
# (step 2018), the second is the bare offset (step 2019).
print(((2018 * 128) % (20000 - 128)) + 128)
print((2019 * 128) % (20000 - 128))

In [24]:
def multi_nn_graph(beta):
  """Build the graph for a two-hidden-layer (1024, 512) ReLU network.

  Args:
    beta: multiplier applied to the L2 loss of each of the three weight
      matrices before it is added to the mean softmax cross-entropy.

  Returns:
    A dict holding the graph plus the tensors/ops needed to train and
    evaluate it: the two feed placeholders, the validation/test constants,
    the regularized loss, the training op and the softmax predictions for
    train/valid/test.

  All weights are drawn from a truncated normal with stddev 0.05. The
  training path passes through tf.nn.dropout with keep_prob = 1, which
  keeps every unit. The learning rate starts at 0.5 and decays
  exponentially (decay_rate 0.5 per 3000 steps).
  """
  num_pixels = image_size * image_size
  graph = tf.Graph()
  with graph.as_default():
    num_hidden_nodes_1 = 1024
    num_hidden_nodes_2 = 512

    # Inputs: placeholders accept any batch size (leading None).
    tf_train_dataset = tf.placeholder(tf.float32, shape=(None, num_pixels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(None, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # First hidden layer.
    weights_1 = tf.Variable(
        tf.truncated_normal([num_pixels, num_hidden_nodes_1], stddev=0.05))
    biases_1 = tf.Variable(tf.zeros([num_hidden_nodes_1]))
    hidden_layer_1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights_1) + biases_1)

    # Second hidden layer (keep_prob = 1, so dropout keeps every unit).
    weights_2 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes_1, num_hidden_nodes_2], stddev=0.05))
    biases_2 = tf.Variable(tf.zeros([num_hidden_nodes_2]))
    kept_1 = tf.nn.dropout(hidden_layer_1, keep_prob=1, seed=2016)
    hidden_layer_2 = tf.nn.relu(tf.matmul(kept_1, weights_2) + biases_2)

    # Output layer.
    weights_3 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes_2, num_labels], stddev=0.05))
    biases_3 = tf.Variable(tf.zeros([num_labels]))
    kept_2 = tf.nn.dropout(hidden_layer_2, keep_prob=1, seed=2016)
    logits = tf.matmul(kept_2, weights_3) + biases_3

    # Regularized objective.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
    reg = (beta * tf.nn.l2_loss(weights_1)
           + beta * tf.nn.l2_loss(weights_2)
           + beta * tf.nn.l2_loss(weights_3))
    loss = cross_entropy + reg

    # minimize() increments global_step on every training step, which in
    # turn drives the learning-rate decay.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        0.5, global_step, decay_steps=3000, decay_rate=0.5)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(
        loss, global_step=global_step)

    def evaluate(inputs):
      # Forward pass used for evaluation: same variables, no dropout ops.
      first = tf.nn.relu(tf.matmul(inputs, weights_1) + biases_1)
      second = tf.nn.relu(tf.matmul(first, weights_2) + biases_2)
      return tf.nn.softmax(tf.matmul(second, weights_3) + biases_3)

    train_prediction = tf.nn.softmax(logits)
    valid_prediction = evaluate(tf_valid_dataset)
    test_prediction = evaluate(tf_test_dataset)

  return dict(
      graph=graph,
      tf_train_dataset=tf_train_dataset,
      tf_train_labels=tf_train_labels,
      tf_valid_dataset=tf_valid_dataset,
      tf_test_dataset=tf_test_dataset,
      loss=loss,
      optimizer=optimizer,
      train_prediction=train_prediction,
      valid_prediction=valid_prediction,
      test_prediction=test_prediction)

In [25]:
def run_graph(graph_info, num_steps = 3001, batch_size = 128):
  """Train a graph on the full training set and print its progress.

  Args:
    graph_info: dict as returned by the graph-building functions
      (e.g. multi_nn_graph).
    num_steps: number of optimizer steps to run.
    batch_size: number of rows fed per step.

  Prints the mini-batch loss, training accuracy and validation accuracy
  every 500 steps, and the test accuracy once training has finished.
  Returns None.
  """
  graph = graph_info['graph']
  tf_train_dataset = graph_info['tf_train_dataset']
  tf_train_labels = graph_info['tf_train_labels']
  loss = graph_info['loss']
  optimizer = graph_info['optimizer']
  train_prediction = graph_info['train_prediction']
  valid_prediction = graph_info['valid_prediction']
  test_prediction = graph_info['test_prediction']

  # Range of batch start offsets. When the data holds no more than one
  # batch this is <= 0, so always start at 0 instead of taking a modulus by
  # zero (or by a negative number).
  span = train_labels.shape[0] - batch_size

  with tf.Session(graph = graph) as session:
    tf.initialize_all_variables().run()

    for step in range(num_steps):

      # Generate a minibatch: a window that slides through the data and
      # wraps around.
      offset = (step * batch_size) % span if span > 0 else 0
      batch_data = train_dataset[offset:(offset + batch_size), :]
      batch_labels = train_labels[offset:(offset + batch_size), :]

      # Feed the placeholders with this step's batch.
      feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
      _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict = feed_dict)

      if (step % 500 == 0):
        print('Mini-batch (%d) Loss at step %d: %f' % (batch_size, step, l))
        print('Mini-batch (%d) Training accuracy: %.1f%%' % (batch_size, accuracy(predictions, batch_labels)))
        print('Mini-batch (%d) Validation accuracy: %.1f%%' % (batch_size, accuracy(valid_prediction.eval(), valid_labels)))

    print('Final Test accuracy: %.1f%%\n\n' % (accuracy(test_prediction.eval(), test_labels)))

In [26]:
# Train the two-hidden-layer network for 10000 steps with a light L2 penalty.
nn_graph_info = multi_nn_graph(beta=0.0001)
run_graph(nn_graph_info, num_steps=10000)

In [27]:
def multi_nn_graph(beta):
  """Build the graph for a two-hidden-layer (1024, 400) ReLU network.

  Args:
    beta: multiplier applied to the L2 loss of each of the three weight
      matrices before it is added to the mean softmax cross-entropy.

  Returns:
    A dict holding the graph plus the tensors/ops needed to train and
    evaluate it: the two feed placeholders, the validation/test constants,
    the regularized loss, the training op and the softmax predictions for
    train/valid/test.

  Each weight matrix is drawn from a zero-mean truncated normal with
  stddev = sqrt(2 / size of the previous layer). The learning rate starts
  at 0.5 and decays exponentially (decay_rate 0.5 per 1000 steps).
  """
  graph = tf.Graph()
  with graph.as_default():
    num_inputs = image_size * image_size
    num_hidden_nodes_1 = 1024
    num_hidden_nodes_2 = 400

    def init_stddev(fan_in):
      # The scale must be passed as `stddev`: the second positional argument
      # of tf.truncated_normal is `mean`. The 2.0 keeps the division in
      # floating point under Python 2 as well.
      return (2.0 / fan_in) ** 0.5

    # Input data.
    tf_train_dataset = tf.placeholder(tf.float32, shape = (None, num_inputs)) # None introduced to take any shape
    tf_train_labels = tf.placeholder(tf.float32, shape = (None, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    weights_1 = tf.Variable(tf.truncated_normal([num_inputs, num_hidden_nodes_1], stddev = init_stddev(num_inputs)))
    biases_1 = tf.Variable(tf.zeros([num_hidden_nodes_1]))

    # Hidden layer.
    hidden_layer_1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights_1)+biases_1)

    # Variables.
    weights_2 = tf.Variable(tf.truncated_normal([num_hidden_nodes_1, num_hidden_nodes_2], stddev = init_stddev(num_hidden_nodes_1)))
    biases_2 = tf.Variable(tf.zeros([num_hidden_nodes_2]))

    # Hidden layer (keep_prob = 1, so dropout keeps every unit).
    hidden_layer_2 = tf.nn.relu(tf.matmul(tf.nn.dropout(hidden_layer_1, keep_prob = 1, seed = 2016), weights_2)+biases_2)

    # Output layer.
    weights_3 = tf.Variable(tf.truncated_normal([num_hidden_nodes_2, num_labels], stddev = init_stddev(num_hidden_nodes_2)))
    biases_3 = tf.Variable(tf.zeros([num_labels]))

    # Training (keep_prob = 1, so dropout keeps every unit).
    logits = tf.matmul(tf.nn.dropout(hidden_layer_2, keep_prob = 1, seed = 2016), weights_3) + biases_3

    # Regularized objective: mean cross-entropy plus the scaled L2 penalties.
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
    reg = beta * tf.nn.l2_loss(weights_1) + beta * tf.nn.l2_loss(weights_2) + beta * tf.nn.l2_loss(weights_3)
    loss = loss + reg

    # Optimizer.
    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, decay_steps = 1000, decay_rate = 0.5)
    # Passing global_step to minimize() will increment it at each step.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step = global_step)

    # Prediction. Evaluation reuses the trained variables without dropout ops.
    train_prediction = tf.nn.softmax(logits)

    hidden1_valid_prediction = tf.nn.relu(tf.matmul(tf_valid_dataset, weights_1) + biases_1)
    hidden2_valid_prediction = tf.nn.relu(tf.matmul(hidden1_valid_prediction, weights_2) + biases_2)
    valid_prediction = tf.nn.softmax(tf.matmul(hidden2_valid_prediction, weights_3) + biases_3)

    hidden1_test_prediction = tf.nn.relu(tf.matmul(tf_test_dataset, weights_1) + biases_1)
    hidden2_test_prediction = tf.nn.relu(tf.matmul(hidden1_test_prediction, weights_2) + biases_2)
    test_prediction = tf.nn.softmax(tf.matmul(hidden2_test_prediction, weights_3) + biases_3)

  return {'graph': graph,
         'tf_train_dataset': tf_train_dataset,
         'tf_train_labels': tf_train_labels,
         'tf_valid_dataset': tf_valid_dataset,
         'tf_test_dataset': tf_test_dataset,
         'loss': loss,
         'optimizer': optimizer,
         'train_prediction': train_prediction,
         'valid_prediction': valid_prediction,
         'test_prediction': test_prediction}

In [28]:
# Train the (1024, 400) network for 10000 steps with a light L2 penalty.
nn_graph_info = multi_nn_graph(beta=0.0001)
run_graph(nn_graph_info, num_steps=10000)