Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in _notmist.ipynb_.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [9]:
batch_size = 128
beta=0.01
graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  # Variables.
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))
  # Training computation.
  logits = tf.matmul(tf_train_dataset, weights) + biases
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))+ beta * tf.nn.l2_loss(weights)
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
  test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

In [10]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 47.354774
Minibatch accuracy: 16.4%
Validation accuracy: 16.5%
Minibatch loss at step 500: 0.753821
Minibatch accuracy: 85.9%
Validation accuracy: 81.5%
Minibatch loss at step 1000: 0.801339
Minibatch accuracy: 80.5%
Validation accuracy: 81.2%
Minibatch loss at step 1500: 0.565806
Minibatch accuracy: 82.8%
Validation accuracy: 81.1%
Minibatch loss at step 2000: 0.648272
Minibatch accuracy: 87.5%
Validation accuracy: 81.2%
Minibatch loss at step 2500: 0.782439
Minibatch accuracy: 78.9%
Validation accuracy: 81.1%
Minibatch loss at step 3000: 0.784383
Minibatch accuracy: 80.5%
Validation accuracy: 81.4%
Test accuracy: 88.6%


## neural network 

In [17]:
batch_size = 128
n_nodes_hl1=1024
beta=0.0008
graph = tf.Graph()
with graph.as_default():
  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size),name='train')
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels),name='train_label')
  tf_valid_dataset = tf.constant(valid_dataset,name='valid')
  tf_test_dataset = tf.constant(test_dataset,name='test')
  hidden_1_layer={'weights':tf.Variable(
  tf.truncated_normal([image_size * image_size, n_nodes_hl1]),name='hidden_weights'),'bias':tf.Variable(tf.zeros([n_nodes_hl1]),name='hidden_bias')}                                                                                                             
  output_layer={'weights':tf.Variable(tf.truncated_normal([n_nodes_hl1,num_labels]),name='output_weights'),'bias':tf.Variable(tf.truncated_normal([num_labels]),name='output_bias')}                                                                                                           
  l1=tf.add(tf.matmul(tf_train_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])
  l1=tf.nn.relu(l1)
  output=tf.add(tf.matmul(l1,output_layer["weights"]),output_layer["bias"])  
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, tf_train_labels))+beta * (tf.nn.l2_loss(hidden_1_layer["weights"])+tf.nn.l2_loss(output_layer["weights"]))
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(output)
  valid_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_valid_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  test_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_test_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  

In [18]:
num_steps = 10000

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
        
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 593.248718
Minibatch accuracy: 10.2%
Validation accuracy: 33.9%
Minibatch loss at step 500: 181.862213
Minibatch accuracy: 78.9%
Validation accuracy: 79.1%
Minibatch loss at step 1000: 112.801064
Minibatch accuracy: 79.7%
Validation accuracy: 81.2%
Minibatch loss at step 1500: 74.073616
Minibatch accuracy: 90.6%
Validation accuracy: 82.6%
Minibatch loss at step 2000: 49.114239
Minibatch accuracy: 90.6%
Validation accuracy: 84.3%
Minibatch loss at step 2500: 32.967834
Minibatch accuracy: 86.7%
Validation accuracy: 85.1%
Minibatch loss at step 3000: 22.263639
Minibatch accuracy: 85.9%
Validation accuracy: 86.1%
Minibatch loss at step 3500: 15.059161
Minibatch accuracy: 86.7%
Validation accuracy: 86.7%
Minibatch loss at step 4000: 10.181773
Minibatch accuracy: 90.6%
Validation accuracy: 87.4%
Minibatch loss at step 4500: 6.968161
Minibatch accuracy: 89.8%
Validation accuracy: 87.3%
Minibatch loss at step 5000: 4.855597
Minibatch accuracy: 89.1%
Valida

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [21]:
batch_size = 10
n_nodes_hl1=1024
beta=0.0008
graph = tf.Graph()
with graph.as_default():
  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size),name='train')
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels),name='train_label')
  tf_valid_dataset = tf.constant(valid_dataset,name='valid')
  tf_test_dataset = tf.constant(test_dataset,name='test')
  hidden_1_layer={'weights':tf.Variable(
  tf.truncated_normal([image_size * image_size, n_nodes_hl1]),name='hidden_weights'),'bias':tf.Variable(tf.zeros([n_nodes_hl1]),name='hidden_bias')}                                                                                                             
  output_layer={'weights':tf.Variable(tf.truncated_normal([n_nodes_hl1,num_labels]),name='output_weights'),'bias':tf.Variable(tf.truncated_normal([num_labels]),name='output_bias')}                                                                                                           
  l1=tf.add(tf.matmul(tf_train_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])
  l1=tf.nn.relu(l1)
  output=tf.add(tf.matmul(l1,output_layer["weights"]),output_layer["bias"])  
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, tf_train_labels))+beta * (tf.nn.l2_loss(hidden_1_layer["weights"])+tf.nn.l2_loss(output_layer["weights"]))
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(output)
  valid_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_valid_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  test_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_test_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  

In [22]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
        
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 585.116211
Minibatch accuracy: 20.0%
Validation accuracy: 18.5%
Minibatch loss at step 500: 100481.554688
Minibatch accuracy: 30.0%
Validation accuracy: 54.9%
Minibatch loss at step 1000: 92544.468750
Minibatch accuracy: 50.0%
Validation accuracy: 56.1%
Minibatch loss at step 1500: 1397098.375000
Minibatch accuracy: 70.0%
Validation accuracy: 49.1%
Minibatch loss at step 2000: 8571844.000000
Minibatch accuracy: 40.0%
Validation accuracy: 41.1%
Minibatch loss at step 2500: 2852372.250000
Minibatch accuracy: 40.0%
Validation accuracy: 43.7%
Minibatch loss at step 3000: 463610.937500
Minibatch accuracy: 30.0%
Validation accuracy: 50.5%
Test accuracy: 56.0%


In [31]:
batch_size = 20
n_nodes_hl1=2048
beta=0.0008
graph = tf.Graph()
with graph.as_default():
  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size),name='train')
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels),name='train_label')
  tf_valid_dataset = tf.constant(valid_dataset,name='valid')
  tf_test_dataset = tf.constant(test_dataset,name='test')
  hidden_1_layer={'weights':tf.Variable(
  tf.truncated_normal([image_size * image_size, n_nodes_hl1]),name='hidden_weights'),'bias':tf.Variable(tf.zeros([n_nodes_hl1]),name='hidden_bias')}                                                                                                             
  output_layer={'weights':tf.Variable(tf.truncated_normal([n_nodes_hl1,num_labels]),name='output_weights'),'bias':tf.Variable(tf.truncated_normal([num_labels]),name='output_bias')}                                                                                                           
  l1=tf.add(tf.matmul(tf_train_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])
  l1=tf.nn.relu(l1)
  output=tf.add(tf.matmul(l1,output_layer["weights"]),output_layer["bias"])  
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, tf_train_labels))+beta * (tf.nn.l2_loss(hidden_1_layer["weights"])+tf.nn.l2_loss(output_layer["weights"]))
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(output)
  valid_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_valid_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  test_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_test_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  

In [32]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
        
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 867.437622
Minibatch accuracy: 20.0%
Validation accuracy: 12.9%
Minibatch loss at step 500: 564.476929
Minibatch accuracy: 80.0%
Validation accuracy: 68.9%
Minibatch loss at step 1000: 500.818726
Minibatch accuracy: 75.0%
Validation accuracy: 59.4%
Minibatch loss at step 1500: 166.388077
Minibatch accuracy: 75.0%
Validation accuracy: 71.7%
Minibatch loss at step 2000: 109.435852
Minibatch accuracy: 65.0%
Validation accuracy: 71.7%
Minibatch loss at step 2500: 72.815384
Minibatch accuracy: 75.0%
Validation accuracy: 78.3%
Minibatch loss at step 3000: 48.715622
Minibatch accuracy: 80.0%
Validation accuracy: 80.4%
Test accuracy: 87.8%


In [25]:
batch_size = 1000
n_nodes_hl1=1024
beta=0.0008
graph = tf.Graph()
with graph.as_default():
  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size),name='train')
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels),name='train_label')
  tf_valid_dataset = tf.constant(valid_dataset,name='valid')
  tf_test_dataset = tf.constant(test_dataset,name='test')
  hidden_1_layer={'weights':tf.Variable(
  tf.truncated_normal([image_size * image_size, n_nodes_hl1]),name='hidden_weights'),'bias':tf.Variable(tf.zeros([n_nodes_hl1]),name='hidden_bias')}                                                                                                             
  output_layer={'weights':tf.Variable(tf.truncated_normal([n_nodes_hl1,num_labels]),name='output_weights'),'bias':tf.Variable(tf.truncated_normal([num_labels]),name='output_bias')}                                                                                                           
  l1=tf.add(tf.matmul(tf_train_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])
  l1=tf.nn.relu(l1)
  output=tf.add(tf.matmul(l1,output_layer["weights"]),output_layer["bias"])  
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, tf_train_labels))+beta * (tf.nn.l2_loss(hidden_1_layer["weights"])+tf.nn.l2_loss(output_layer["weights"]))
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(output)
  valid_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_valid_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  test_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_test_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  

---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [37]:
batch_size = 128
n_nodes_hl1=2048
beta=0.0008
graph = tf.Graph()
with graph.as_default():
  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size),name='train')
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels),name='train_label')
  tf_valid_dataset = tf.constant(valid_dataset,name='valid')
  tf_test_dataset = tf.constant(test_dataset,name='test')
  hidden_1_layer={'weights':tf.Variable(
  tf.truncated_normal([image_size * image_size, n_nodes_hl1]),name='hidden_weights'),'bias':tf.Variable(tf.zeros([n_nodes_hl1]),name='hidden_bias')}                                                                                                             
  output_layer={'weights':tf.Variable(tf.truncated_normal([n_nodes_hl1,num_labels]),name='output_weights'),'bias':tf.Variable(tf.truncated_normal([num_labels]),name='output_bias')}                                                                                                           
  l1=tf.add(tf.matmul(tf_train_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])
  l1=tf.nn.relu(l1)
  hidden_1_layer["weights"]=tf.nn.dropout(hidden_1_layer["weights"],keep_prob=0.4) 
  output=tf.add(tf.matmul(l1,output_layer["weights"]),output_layer["bias"])  
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, tf_train_labels))+beta * (tf.nn.l2_loss(hidden_1_layer["weights"])+tf.nn.l2_loss(output_layer["weights"]))
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(output)
  valid_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_valid_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  test_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_test_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  

In [38]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
        
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 1700.951172
Minibatch accuracy: 18.0%
Validation accuracy: 22.1%
Minibatch loss at step 500: 467.589874
Minibatch accuracy: 85.9%
Validation accuracy: 52.9%
Minibatch loss at step 1000: 170.288803
Minibatch accuracy: 82.0%
Validation accuracy: 53.8%
Minibatch loss at step 1500: 61.997978
Minibatch accuracy: 90.6%
Validation accuracy: 50.3%
Minibatch loss at step 2000: 23.008766
Minibatch accuracy: 92.2%
Validation accuracy: 62.4%
Minibatch loss at step 2500: 8.906447
Minibatch accuracy: 88.3%
Validation accuracy: 71.6%
Minibatch loss at step 3000: 3.749403
Minibatch accuracy: 85.2%
Validation accuracy: 83.6%
Test accuracy: 89.4%


---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [82]:
del output_layer

In [92]:
batch_size = 128 # 128
n_nodes_hl1=2024
n_nodes_hl2=1024
beta=0.0008
graph = tf.Graph()
with graph.as_default():
  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size),name='train')
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels),name='train_label')
  tf_valid_dataset = tf.constant(valid_dataset,name='valid')
  tf_test_dataset = tf.constant(test_dataset,name='test')
  #tf.get_variable("W", shape=[n_nodes_hl1, n_nodes_hl2],initializer=tf.contrib.layers.xavier_initializer())
           
  hidden_1_layer={'weights':tf.get_variable(shape=[image_size * image_size, n_nodes_hl1],initializer=tf.contrib.layers.xavier_initializer(),name='hidden_weights'),'bias':tf.Variable(tf.truncated_normal([n_nodes_hl1]),name='hidden_bias')}
  hidden_2_layer={'weights':tf.get_variable( shape=[n_nodes_hl1, n_nodes_hl2],initializer=tf.contrib.layers.xavier_initializer(),name='hidden_weights_2'),'bias':tf.Variable(tf.truncated_normal([n_nodes_hl2]),name='hidden_bias_2')}
  output_layer={'weights':tf.get_variable( shape=[n_nodes_hl2,num_labels],initializer=tf.contrib.layers.xavier_initializer(),name='output_weights'),'bias':tf.Variable(tf.truncated_normal([num_labels]),name='output_bias')}                                                                                                           
  l1=tf.add(tf.matmul(tf_train_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])
  l1=tf.nn.relu(l1)
  hidden_1_layer["weights"]=tf.nn.dropout(hidden_1_layer["weights"],keep_prob=0.5) 
  l2=tf.add(tf.matmul(l1,hidden_2_layer["weights"]),hidden_2_layer["bias"])
  l2=tf.nn.relu(l2)
  hidden_2_layer["weights"]=tf.nn.dropout(hidden_2_layer["weights"],keep_prob=0.5)   
  output=tf.add(tf.matmul(l2,output_layer["weights"]),output_layer["bias"])  
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, tf_train_labels))+beta * (tf.nn.l2_loss(hidden_1_layer["weights"])+tf.nn.l2_loss(output_layer["weights"])+tf.nn.l2_loss(hidden_2_layer["weights"]))
  # Optimizer.
  global_step = tf.Variable(0)  # count the number of steps taken.
  learning_rate = tf.train.exponential_decay(0.4, global_step,decay_steps=100,decay_rate=.96, staircase= False)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
  #optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(output)
  # output=tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_train_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),hidden_2_layer["weights"]),hidden_2_layer["bias"])),output_layer["weights"]),output_layer["bias"])    
  valid_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_valid_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),hidden_2_layer["weights"]),hidden_2_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  test_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_test_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),hidden_2_layer["weights"]),hidden_2_layer["bias"])),output_layer["weights"]),output_layer["bias"]))

In [93]:
num_steps = 10000

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
        
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 5.377193
Minibatch accuracy: 7.0%
Validation accuracy: 10.0%
Minibatch loss at step 500: 9.487813
Minibatch accuracy: 83.6%
Validation accuracy: 74.2%
Minibatch loss at step 1000: 6.863490
Minibatch accuracy: 78.9%
Validation accuracy: 78.6%
Minibatch loss at step 1500: 5.063195
Minibatch accuracy: 88.3%
Validation accuracy: 82.4%
Minibatch loss at step 2000: 4.080177
Minibatch accuracy: 92.2%
Validation accuracy: 84.5%
Minibatch loss at step 2500: 3.504969
Minibatch accuracy: 89.8%
Validation accuracy: 84.6%
Minibatch loss at step 3000: 3.165132
Minibatch accuracy: 84.4%
Validation accuracy: 83.6%
Minibatch loss at step 3500: 2.894245
Minibatch accuracy: 85.9%
Validation accuracy: 85.1%
Minibatch loss at step 4000: 2.540234
Minibatch accuracy: 90.6%
Validation accuracy: 84.8%
Minibatch loss at step 4500: 2.438391
Minibatch accuracy: 88.3%
Validation accuracy: 83.9%
Minibatch loss at step 5000: 2.347358
Minibatch accuracy: 89.8%
Validation accuracy

In [120]:
batch_size = 128 # 128
n_nodes_hl1=2048
n_nodes_hl2=2048
beta=0.0005
graph = tf.Graph()
with graph.as_default():
  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size),name='train')
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels),name='train_label')
  tf_valid_dataset = tf.constant(valid_dataset,name='valid')
  tf_test_dataset = tf.constant(test_dataset,name='test')
  #tf.get_variable("W", shape=[n_nodes_hl1, n_nodes_hl2],initializer=tf.contrib.layers.xavier_initializer())
           
  hidden_1_layer={'weights':tf.get_variable(shape=[image_size * image_size, n_nodes_hl1],initializer=tf.contrib.layers.xavier_initializer(),name='hidden_weights'),'bias':tf.Variable(tf.truncated_normal([n_nodes_hl1]),name='hidden_bias')}
  hidden_2_layer={'weights':tf.get_variable( shape=[n_nodes_hl1, n_nodes_hl2],initializer=tf.contrib.layers.xavier_initializer(),name='hidden_weights_2'),'bias':tf.Variable(tf.truncated_normal([n_nodes_hl2]),name='hidden_bias_2')}
  output_layer={'weights':tf.get_variable( shape=[n_nodes_hl2,num_labels],initializer=tf.contrib.layers.xavier_initializer(),name='output_weights'),'bias':tf.Variable(tf.truncated_normal([num_labels]),name='output_bias')}                                                                                                           
  l1=tf.add(tf.matmul(tf_train_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])
  l1=tf.nn.relu(l1)
  hidden_1_layer["weights"]=tf.nn.dropout(hidden_1_layer["weights"],keep_prob=0.7) 
  l2=tf.add(tf.matmul(l1,hidden_2_layer["weights"]),hidden_2_layer["bias"])
  l2=tf.nn.relu(l2)
  hidden_2_layer["weights"]=tf.nn.dropout(hidden_2_layer["weights"],keep_prob=0.3)   
  output=tf.add(tf.matmul(l2,output_layer["weights"]),output_layer["bias"])  
  #loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, tf_train_labels))
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, tf_train_labels))+beta * (tf.nn.l2_loss(hidden_1_layer["weights"])+tf.nn.l2_loss(output_layer["weights"])+tf.nn.l2_loss(hidden_2_layer["weights"]))
  # Optimizer.
  global_step = tf.Variable(0)  # count the number of steps taken.
  learning_rate = tf.train.exponential_decay(0.4, global_step,decay_steps=100,decay_rate=.96, staircase= False)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
  #optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(output)
  # output=tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_train_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),hidden_2_layer["weights"]),hidden_2_layer["bias"])),output_layer["weights"]),output_layer["bias"])    
  valid_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_valid_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),hidden_2_layer["weights"]),hidden_2_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  test_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_test_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),hidden_2_layer["weights"]),hidden_2_layer["bias"])),output_layer["weights"]),output_layer["bias"]))

In [121]:
num_steps = 10000

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
        
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 5.064282
Minibatch accuracy: 11.7%
Validation accuracy: 10.0%
Minibatch loss at step 500: 2.205374
Minibatch accuracy: 88.3%
Validation accuracy: 82.1%
Minibatch loss at step 1000: 1.790165
Minibatch accuracy: 84.4%
Validation accuracy: 82.4%
Minibatch loss at step 1500: 1.290869
Minibatch accuracy: 90.6%
Validation accuracy: 85.2%
Minibatch loss at step 2000: 1.103902
Minibatch accuracy: 93.8%
Validation accuracy: 84.4%
Minibatch loss at step 2500: 1.042681
Minibatch accuracy: 89.1%
Validation accuracy: 85.0%
Minibatch loss at step 3000: 1.062919
Minibatch accuracy: 84.4%
Validation accuracy: 85.4%
Minibatch loss at step 3500: 1.006845
Minibatch accuracy: 86.7%
Validation accuracy: 85.6%
Minibatch loss at step 4000: 0.840431
Minibatch accuracy: 90.6%
Validation accuracy: 85.8%
Minibatch loss at step 4500: 0.815019
Minibatch accuracy: 89.8%
Validation accuracy: 84.4%
Minibatch loss at step 5000: 0.863426
Minibatch accuracy: 89.8%
Validation accurac

## different activation function 

In [104]:
batch_size = 128 # 128
n_nodes_hl1=2024
n_nodes_hl2=1024
beta=0.0008
graph = tf.Graph()
with graph.as_default():
  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size),name='train')
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels),name='train_label')
  tf_valid_dataset = tf.constant(valid_dataset,name='valid')
  tf_test_dataset = tf.constant(test_dataset,name='test')
  #tf.get_variable("W", shape=[n_nodes_hl1, n_nodes_hl2],initializer=tf.contrib.layers.xavier_initializer())
           
  hidden_1_layer={'weights':tf.get_variable(shape=[image_size * image_size, n_nodes_hl1],initializer=tf.contrib.layers.xavier_initializer(),name='hidden_weights'),'bias':tf.Variable(tf.truncated_normal([n_nodes_hl1]),name='hidden_bias')}
  hidden_2_layer={'weights':tf.get_variable( shape=[n_nodes_hl1, n_nodes_hl2],initializer=tf.contrib.layers.xavier_initializer(),name='hidden_weights_2'),'bias':tf.Variable(tf.truncated_normal([n_nodes_hl2]),name='hidden_bias_2')}
  output_layer={'weights':tf.get_variable( shape=[n_nodes_hl2,num_labels],initializer=tf.contrib.layers.xavier_initializer(),name='output_weights'),'bias':tf.Variable(tf.truncated_normal([num_labels]),name='output_bias')}                                                                                                           
  l1=tf.add(tf.matmul(tf_train_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])
  l1=tf.nn.tanh(l1)
  hidden_1_layer["weights"]=tf.nn.dropout(hidden_1_layer["weights"],keep_prob=0.6) 
  l2=tf.add(tf.matmul(l1,hidden_2_layer["weights"]),hidden_2_layer["bias"])
  l2=tf.nn.relu(l2)
  hidden_2_layer["weights"]=tf.nn.dropout(hidden_2_layer["weights"],keep_prob=0.3)   
  output=tf.add(tf.matmul(l2,output_layer["weights"]),output_layer["bias"])  
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, tf_train_labels))+beta * (tf.nn.l2_loss(hidden_1_layer["weights"])+tf.nn.l2_loss(output_layer["weights"])+tf.nn.l2_loss(hidden_2_layer["weights"]))
  # Optimizer.
  global_step = tf.Variable(0)  # count the number of steps taken.
  learning_rate = tf.train.exponential_decay(0.4, global_step,decay_steps=100,decay_rate=.96, staircase= False)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
  #optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(output)
  # output=tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_train_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),hidden_2_layer["weights"]),hidden_2_layer["bias"])),output_layer["weights"]),output_layer["bias"])    
  valid_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_valid_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),hidden_2_layer["weights"]),hidden_2_layer["bias"])),output_layer["weights"]),output_layer["bias"]))
  test_prediction = tf.nn.softmax(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf.nn.relu(tf.add(tf.matmul(tf_test_dataset,hidden_1_layer["weights"]),hidden_1_layer["bias"])),hidden_2_layer["weights"]),hidden_2_layer["bias"])),output_layer["weights"]),output_layer["bias"]))

In [105]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
        
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 5.323531
Minibatch accuracy: 7.0%
Validation accuracy: 10.0%
Minibatch loss at step 500: 3.451479
Minibatch accuracy: 75.8%
Validation accuracy: 55.1%
Minibatch loss at step 1000: 2.185287
Minibatch accuracy: 79.7%
Validation accuracy: 67.5%
Minibatch loss at step 1500: 1.510751
Minibatch accuracy: 85.2%
Validation accuracy: 75.2%
Minibatch loss at step 2000: 1.214741
Minibatch accuracy: 88.3%
Validation accuracy: 69.7%
Minibatch loss at step 2500: 1.109519
Minibatch accuracy: 86.7%
Validation accuracy: 74.7%
Minibatch loss at step 3000: 1.129091
Minibatch accuracy: 82.0%
Validation accuracy: 77.1%
Test accuracy: 83.3%


In [None]:
def xavier_init(n_inputs, n_outputs, uniform=True):
  """Set the parameter initialization using the method described.
  This method is designed to keep the scale of the gradients roughly the same
  in all layers.
  Xavier Glorot and Yoshua Bengio (2010):
           Understanding the difficulty of training deep feedforward neural
           networks. International conference on artificial intelligence and
           statistics.
  Args:
    n_inputs: The number of input nodes into each output.
    n_outputs: The number of output nodes for each input.
    uniform: If true use a uniform distribution, otherwise use a normal.
  Returns:
    An initializer.
  """
  if uniform:
    # 6 was used in the paper.
    init_range = math.sqrt(6.0 / (n_inputs + n_outputs))
    return tf.random_uniform_initializer(-init_range, init_range)
  else:
    # 3 gives us approximately the same limits as above since this repicks
    # values greater than 2 standard deviations from the mean.
    stddev = math.sqrt(3.0 / (n_inputs + n_outputs))
    return tf.truncated_normal_initializer(stddev=stddev)