Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in _notmist.ipynb_.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [26]:
batch_size = 64
lambda_ = .3
lr = .125

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  logits = tf.matmul(tf_train_dataset, weights) + biases
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)
        + lambda_ * tf.nn.l2_loss(weights) 
        + lambda_ * tf.nn.l2_loss(biases)
        )
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(
    tf.matmul(tf_valid_dataset, weights) + biases)
  test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)
    
    
num_steps = 10001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 931.474609
Minibatch accuracy: 12.5%
Validation accuracy: 9.6%
Minibatch loss at step 500: 1.179510
Minibatch accuracy: 84.4%
Validation accuracy: 77.1%
Minibatch loss at step 1000: 1.229837
Minibatch accuracy: 81.2%
Validation accuracy: 78.6%
Minibatch loss at step 1500: 1.412376
Minibatch accuracy: 75.0%
Validation accuracy: 76.1%
Minibatch loss at step 2000: 1.080016
Minibatch accuracy: 78.1%
Validation accuracy: 76.0%
Minibatch loss at step 2500: 1.226054
Minibatch accuracy: 79.7%
Validation accuracy: 76.5%
Minibatch loss at step 3000: 1.205091
Minibatch accuracy: 81.2%
Validation accuracy: 76.8%
Minibatch loss at step 3500: 1.289847
Minibatch accuracy: 81.2%
Validation accuracy: 76.2%
Minibatch loss at step 4000: 1.472557
Minibatch accuracy: 73.4%
Validation accuracy: 77.6%
Minibatch loss at step 4500: 1.198334
Minibatch accuracy: 78.1%
Validation accuracy: 78.0%
Minibatch loss at step 5000: 1.475516
Minibatch accuracy: 68.8%
Validation accura

In [77]:
batch_size = 96
h0 = 1024 
h1 = 32
lambda_ = 0.005
lr = .001

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    weights0 = tf.Variable(tf.truncated_normal([image_size * image_size, h0]))
    biases0 = tf.Variable(tf.zeros([h0]))
    
    weights1 = tf.Variable(tf.truncated_normal([h0, h1]))
    biases1 = tf.Variable(tf.zeros([h1]))
    
    weights2 = tf.Variable(tf.truncated_normal([h1, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    s0 = tf.nn.relu(tf.matmul(tf_train_dataset, weights0) + biases0)
    s1 = tf.nn.relu(tf.matmul(s0, weights1) + biases1)
    logits = tf.matmul(s1, weights2) + biases2

    # reg = tf.nn.l2_loss(weights0) + tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2)
    reg = tf.reduce_sum(tf.square(weights0)) + tf.reduce_sum(tf.square(weights1)) + tf.reduce_sum(tf.square(weights1))
    
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + (lambda_ * reg)
    
    # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    
    v0 = tf.nn.relu(tf.matmul(tf_valid_dataset, weights0) + biases0)
    v1 = tf.nn.relu(tf.matmul(v0, weights1) + biases1)
    valid_prediction = tf.nn.softmax(tf.matmul(v1, weights2) + biases2)
    
    t0 = tf.nn.relu(tf.matmul(tf_test_dataset, weights0) + biases0)
    t1 = tf.nn.relu(tf.matmul(t0, weights1) + biases1)
    test_prediction = tf.nn.softmax(tf.matmul(t1, weights2) + biases2)
    
    t_pred = tf.nn.relu(tf.matmul(tf_test_dataset, weights0) + biases0)
    t_pred = tf.nn.relu(tf.matmul(t_pred, weights1) + biases1)
    t_pred = tf.matmul(t_pred, weights2) + biases2
    # t_pred = tf.matmul(t_pred, weights1) + biases1
    test_prediction = tf.nn.softmax(t_pred)
        
num_steps = 15001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    
    _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
    
    if (step % 1000 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 4704.526367
Minibatch accuracy: 10.4%
Validation accuracy: 10.0%
Minibatch loss at step 1000: 2320.164062
Minibatch accuracy: 25.0%
Validation accuracy: 30.1%
Minibatch loss at step 2000: 1107.817993
Minibatch accuracy: 50.0%
Validation accuracy: 40.7%
Minibatch loss at step 3000: 364.240173
Minibatch accuracy: 57.3%
Validation accuracy: 59.0%
Minibatch loss at step 4000: 83.693680
Minibatch accuracy: 70.8%
Validation accuracy: 70.9%
Minibatch loss at step 5000: 16.179203
Minibatch accuracy: 77.1%
Validation accuracy: 79.1%
Minibatch loss at step 6000: 3.132547
Minibatch accuracy: 83.3%
Validation accuracy: 83.3%
Minibatch loss at step 7000: 0.983776
Minibatch accuracy: 85.4%
Validation accuracy: 84.6%
Minibatch loss at step 8000: 0.642909
Minibatch accuracy: 85.4%
Validation accuracy: 84.1%
Minibatch loss at step 9000: 0.744519
Minibatch accuracy: 80.2%
Validation accuracy: 84.6%
Minibatch loss at step 10000: 0.525013
Minibatch accuracy: 89.6%
Val

In [96]:
batch_size = 96
h0 = 1024 
h1 = 32
lambda_ = 0.0025
lr = .001

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    weights0 = tf.Variable(tf.truncated_normal([image_size * image_size, h0]))
    biases0 = tf.Variable(tf.zeros([h0]))
    
    weights1 = tf.Variable(tf.truncated_normal([h0, h1]))
    biases1 = tf.Variable(tf.zeros([h1]))
    
    weights2 = tf.Variable(tf.truncated_normal([h1, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    s0 = tf.nn.relu(tf.matmul(tf_train_dataset, weights0) + biases0)
    s1 = tf.nn.relu(tf.matmul(s0, weights1) + biases1)
    logits = tf.matmul(s1, weights2) + biases2

    reg = tf.nn.l2_loss(weights0) + tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2)
    # reg = tf.reduce_sum(tf.square(weights0)) + tf.reduce_sum(tf.square(weights1)) + tf.reduce_sum(tf.square(weights1))
    
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + (lambda_ * reg)
    
    # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    
    v0 = tf.nn.relu(tf.matmul(tf_valid_dataset, weights0) + biases0)
    v1 = tf.nn.relu(tf.matmul(v0, weights1) + biases1)
    valid_prediction = tf.nn.softmax(tf.matmul(v1, weights2) + biases2)
    
    t0 = tf.nn.relu(tf.matmul(tf_test_dataset, weights0) + biases0)
    t1 = tf.nn.relu(tf.matmul(t0, weights1) + biases1)
    test_prediction = tf.nn.softmax(tf.matmul(t1, weights2) + biases2)
    
    t_pred = tf.nn.relu(tf.matmul(tf_test_dataset, weights0) + biases0)
    t_pred = tf.nn.relu(tf.matmul(t_pred, weights1) + biases1)
    t_pred = tf.matmul(t_pred, weights2) + biases2
    # t_pred = tf.matmul(t_pred, weights1) + biases1
    test_prediction = tf.nn.softmax(t_pred)
        
num_steps = 15001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    
    _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
    
    if (step % 1000 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 2068.638428
Minibatch accuracy: 6.2%
Validation accuracy: 12.2%
Minibatch loss at step 1000: 722.624268
Minibatch accuracy: 18.8%
Validation accuracy: 21.5%
Minibatch loss at step 2000: 562.912170
Minibatch accuracy: 33.3%
Validation accuracy: 26.4%
Minibatch loss at step 3000: 371.479340
Minibatch accuracy: 37.5%
Validation accuracy: 39.6%
Minibatch loss at step 4000: 197.814926
Minibatch accuracy: 47.9%
Validation accuracy: 44.5%
Minibatch loss at step 5000: 85.263542
Minibatch accuracy: 51.0%
Validation accuracy: 54.9%
Minibatch loss at step 6000: 32.535183
Minibatch accuracy: 68.8%
Validation accuracy: 66.1%
Minibatch loss at step 7000: 12.401726
Minibatch accuracy: 76.0%
Validation accuracy: 74.8%
Minibatch loss at step 8000: 4.871196
Minibatch accuracy: 82.3%
Validation accuracy: 78.2%
Minibatch loss at step 9000: 2.135779
Minibatch accuracy: 81.2%
Validation accuracy: 83.1%
Minibatch loss at step 10000: 0.845630
Minibatch accuracy: 89.6%
Val

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [95]:
batch_size = 96
h0 = 1024 
h1 = 32
lambda_ = 0.00
lr = .001

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    weights0 = tf.Variable(tf.truncated_normal([image_size * image_size, h0]))
    biases0 = tf.Variable(tf.zeros([h0]))
    
    weights1 = tf.Variable(tf.truncated_normal([h0, h1]))
    biases1 = tf.Variable(tf.zeros([h1]))
    
    weights2 = tf.Variable(tf.truncated_normal([h1, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    s0 = tf.nn.relu(tf.matmul(tf_train_dataset, weights0) + biases0)
    s1 = tf.nn.relu(tf.matmul(s0, weights1) + biases1)
    logits = tf.matmul(s1, weights2) + biases2

    reg = tf.nn.l2_loss(weights0) + tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2)
    # reg = tf.reduce_sum(tf.square(weights0)) + tf.reduce_sum(tf.square(weights1)) + tf.reduce_sum(tf.square(weights1))
    
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + (lambda_ * reg)
    
    # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    
    v0 = tf.nn.relu(tf.matmul(tf_valid_dataset, weights0) + biases0)
    v1 = tf.nn.relu(tf.matmul(v0, weights1) + biases1)
    valid_prediction = tf.nn.softmax(tf.matmul(v1, weights2) + biases2)
    
    t0 = tf.nn.relu(tf.matmul(tf_test_dataset, weights0) + biases0)
    t1 = tf.nn.relu(tf.matmul(t0, weights1) + biases1)
    test_prediction = tf.nn.softmax(tf.matmul(t1, weights2) + biases2)
    
    t_pred = tf.nn.relu(tf.matmul(tf_test_dataset, weights0) + biases0)
    t_pred = tf.nn.relu(tf.matmul(t_pred, weights1) + biases1)
    t_pred = tf.matmul(t_pred, weights2) + biases2
    # t_pred = tf.matmul(t_pred, weights1) + biases1
    test_prediction = tf.nn.softmax(t_pred)
        
num_steps = 12001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (int(batch_size * 2.5) - batch_size)
    # offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    
    _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
    
    if (step % 1000 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 824.395447
Minibatch accuracy: 10.4%
Validation accuracy: 15.1%
Minibatch loss at step 1000: 0.019850
Minibatch accuracy: 100.0%
Validation accuracy: 44.1%
Minibatch loss at step 2000: 0.013736
Minibatch accuracy: 100.0%
Validation accuracy: 44.1%
Minibatch loss at step 3000: 0.000001
Minibatch accuracy: 100.0%
Validation accuracy: 44.1%
Minibatch loss at step 4000: 0.004536
Minibatch accuracy: 100.0%
Validation accuracy: 44.1%
Minibatch loss at step 5000: 0.002494
Minibatch accuracy: 100.0%
Validation accuracy: 44.1%
Minibatch loss at step 6000: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 44.1%
Minibatch loss at step 7000: 0.000825
Minibatch accuracy: 100.0%
Validation accuracy: 44.1%
Minibatch loss at step 8000: 0.000494
Minibatch accuracy: 100.0%
Validation accuracy: 44.1%
Minibatch loss at step 9000: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 44.1%
Minibatch loss at step 10000: 0.000186
Minibatch accuracy: 100.0%
Vali

---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [99]:
batch_size = 96
h0 = 1024 
h1 = 32
lambda_ = 0.00
lr = .001
keep_prob = .625

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    weights0 = tf.Variable(tf.truncated_normal([image_size * image_size, h0]))
    biases0 = tf.Variable(tf.zeros([h0]))
    
    weights1 = tf.Variable(tf.truncated_normal([h0, h1]))
    biases1 = tf.Variable(tf.zeros([h1]))
    
    weights2 = tf.Variable(tf.truncated_normal([h1, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    s0 = tf.nn.dropout(tf.nn.relu(tf.matmul(tf_train_dataset, weights0) + biases0), keep_prob)
    s1 = tf.nn.dropout(tf.nn.relu(tf.matmul(s0, weights1) + biases1), keep_prob)
    logits = tf.matmul(s1, weights2) + biases2

    reg = tf.nn.l2_loss(weights0) + tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2)
    # reg = tf.reduce_sum(tf.square(weights0)) + tf.reduce_sum(tf.square(weights1)) + tf.reduce_sum(tf.square(weights1))
    
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + (lambda_ * reg)
    
    # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    
    v0 = tf.nn.relu(tf.matmul(tf_valid_dataset, weights0) + biases0)
    v1 = tf.nn.relu(tf.matmul(v0, weights1) + biases1)
    valid_prediction = tf.nn.softmax(tf.matmul(v1, weights2) + biases2)
    
    t0 = tf.nn.relu(tf.matmul(tf_test_dataset, weights0) + biases0)
    t1 = tf.nn.relu(tf.matmul(t0, weights1) + biases1)
    test_prediction = tf.nn.softmax(tf.matmul(t1, weights2) + biases2)
    
    t_pred = tf.nn.relu(tf.matmul(tf_test_dataset, weights0) + biases0)
    t_pred = tf.nn.relu(tf.matmul(t_pred, weights1) + biases1)
    t_pred = tf.matmul(t_pred, weights2) + biases2
    # t_pred = tf.matmul(t_pred, weights1) + biases1
    test_prediction = tf.nn.softmax(t_pred)
        
num_steps = 20001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (int(batch_size * 2.5) - batch_size)
    # offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    
    _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
    
    if (step % 1000 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 1967.771606
Minibatch accuracy: 12.5%
Validation accuracy: 10.1%
Minibatch loss at step 1000: 4.915576
Minibatch accuracy: 19.8%
Validation accuracy: 17.2%
Minibatch loss at step 2000: 1.868286
Minibatch accuracy: 25.0%
Validation accuracy: 20.0%
Minibatch loss at step 3000: 1.647395
Minibatch accuracy: 35.4%
Validation accuracy: 24.2%
Minibatch loss at step 4000: 1.838254
Minibatch accuracy: 36.5%
Validation accuracy: 29.7%
Minibatch loss at step 5000: 1.494194
Minibatch accuracy: 40.6%
Validation accuracy: 33.7%
Minibatch loss at step 6000: 1.324174
Minibatch accuracy: 47.9%
Validation accuracy: 41.7%
Minibatch loss at step 7000: 1.437298
Minibatch accuracy: 46.9%
Validation accuracy: 45.2%
Minibatch loss at step 8000: 1.307086
Minibatch accuracy: 49.0%
Validation accuracy: 47.0%
Minibatch loss at step 9000: 1.470837
Minibatch accuracy: 45.8%
Validation accuracy: 47.0%
Minibatch loss at step 10000: 1.345255
Minibatch accuracy: 46.9%
Validation ac

---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [111]:
batch_size = 128
h0 = 256
h1 = 256
h2 = 256
lambda_ = 1e-5
lr = .001
keep_prob = .85

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    w0 = tf.Variable(tf.truncated_normal([image_size * image_size, h0]))
    b0 = tf.Variable(tf.zeros([h0]))

    w1 = tf.Variable(tf.truncated_normal([h0, h1]))
    b1 = tf.Variable(tf.zeros([h1]))
    
    w2 = tf.Variable(tf.truncated_normal([h1, h2]))
    b2 = tf.Variable(tf.zeros([h2]))

    w3 = tf.Variable(tf.truncated_normal([h2, num_labels]))
    b3 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    s0 = tf.nn.dropout(tf.nn.relu(tf.matmul(tf_train_dataset, w0) + b0), keep_prob)
    s1 = tf.nn.dropout(tf.nn.relu(tf.matmul(s0, w1) + b1), keep_prob)
    s2 = tf.nn.relu(tf.matmul(s1, w2) + b2)
    logits = tf.matmul(s2, w3) + b3

    reg = tf.nn.l2_loss(w0) + tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2) + tf.nn.l2_loss(w3)
    # reg = tf.reduce_sum(tf.square(w0)) + tf.reduce_sum(tf.square(w1)) + tf.reduce_sum(tf.square(w1))

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + (lambda_ * reg)

    # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)

    v0 = tf.nn.relu(tf.matmul(tf_valid_dataset, w0) + b0)
    v1 = tf.nn.relu(tf.matmul(v0, w1) + b1)
    v2 = tf.nn.relu(tf.matmul(v1, w2) + b2)
    valid_prediction = tf.nn.softmax(tf.matmul(v2, w3) + b3)

    t0 = tf.nn.relu(tf.matmul(tf_test_dataset, w0) + b0)
    t1 = tf.nn.relu(tf.matmul(t0, w1) + b1)
    t2 = tf.nn.relu(tf.matmul(t1, w2) + b2)
    test_prediction = tf.nn.softmax(tf.matmul(t2, w3) + b3)

num_steps = 50001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        # offset = (step * batch_size) % (int(batch_size * 2.5) - batch_size)
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}

        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)

        if (step % 2000 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 19930.998047
Minibatch accuracy: 7.8%
Validation accuracy: 10.2%
Minibatch loss at step 2000: 620.792725
Minibatch accuracy: 76.6%
Validation accuracy: 78.2%
Minibatch loss at step 4000: 168.996170
Minibatch accuracy: 75.0%
Validation accuracy: 77.6%
Minibatch loss at step 6000: 104.573509
Minibatch accuracy: 71.1%
Validation accuracy: 77.2%
Minibatch loss at step 8000: 52.844090
Minibatch accuracy: 66.4%
Validation accuracy: 74.5%
Minibatch loss at step 10000: 24.543062
Minibatch accuracy: 59.4%
Validation accuracy: 69.1%
Minibatch loss at step 12000: 14.801310
Minibatch accuracy: 57.0%
Validation accuracy: 63.9%
Minibatch loss at step 14000: 7.372912
Minibatch accuracy: 60.2%
Validation accuracy: 59.6%
Minibatch loss at step 16000: 2.480627
Minibatch accuracy: 67.2%
Validation accuracy: 57.1%
Minibatch loss at step 18000: 3.049649
Minibatch accuracy: 58.6%
Validation accuracy: 56.6%
Minibatch loss at step 20000: 2.842341
Minibatch accuracy: 57.0%

In [141]:
batch_size = 128
h0 = 256
h1 = 256
h2 = 256
lambda_ = 1e-5
lr = .0005
keep_prob = .675

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    w0 = tf.Variable(tf.truncated_normal([image_size * image_size, h0]))
    b0 = tf.Variable(tf.zeros([h0]))

    w1 = tf.Variable(tf.truncated_normal([h0, h1]))
    b1 = tf.Variable(tf.zeros([h1]))
    
    w2 = tf.Variable(tf.truncated_normal([h1, h2]))
    b2 = tf.Variable(tf.zeros([h2]))

    w3 = tf.Variable(tf.truncated_normal([h2, num_labels]))
    b3 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    s0 = tf.nn.dropout(tf.nn.relu(tf.matmul(tf_train_dataset, w0) + b0), keep_prob)
    s1 = tf.nn.dropout(tf.nn.relu(tf.matmul(s0, w1) + b1), keep_prob)
    s2 = tf.nn.relu(tf.matmul(s1, w2) + b2)
    logits = tf.matmul(s2, w3) + b3

    reg = tf.nn.l2_loss(w0) + tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2) + tf.nn.l2_loss(w3)
    # reg = tf.reduce_sum(tf.square(w0)) + tf.reduce_sum(tf.square(w1)) + tf.reduce_sum(tf.square(w1))

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + (lambda_ * reg)

    # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)

    v0 = tf.nn.relu(tf.matmul(tf_valid_dataset, w0) + b0)
    v1 = tf.nn.relu(tf.matmul(v0, w1) + b1)
    v2 = tf.nn.relu(tf.matmul(v1, w2) + b2)
    valid_prediction = tf.nn.softmax(tf.matmul(v2, w3) + b3)

    t0 = tf.nn.relu(tf.matmul(tf_test_dataset, w0) + b0)
    t1 = tf.nn.relu(tf.matmul(t0, w1) + b1)
    t2 = tf.nn.relu(tf.matmul(t1, w2) + b2)
    test_prediction = tf.nn.softmax(tf.matmul(t2, w3) + b3)

batches = np.ceil(float(train_labels.shape[0]) / batch_size)
num_epochs = 175
num_steps = int(np.ceil(float(train_labels.shape[0]) / batch_size))

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for epoch in range(num_epochs):
        l_mean = 0
        for step in range(num_steps):
            # Pick an offset within the training data, which has been randomized.
            # Note: we could use better randomization across epochs.
            # offset = (step * batch_size) % (int(batch_size * 2.5) - batch_size)
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            # Generate a minibatch.
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            # Prepare a dictionary telling the session where to feed the minibatch.
            # The key of the dictionary is the placeholder node of the graph to be fed,
            # and the value is the numpy array to feed to it.
            feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}

            _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
            l_mean += l

        if (epoch % 1 == 0):
            print("Avg Minibatch loss for epoch %d: %f" % (epoch, l_mean / num_steps))
            # print(" Minibatch loss at end of epoch %d: %f" % (epoch, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%\n" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Avg Minibatch loss for epoch 0: 3761.436169
Minibatch accuracy: 60.2%
Validation accuracy: 77.0%

Avg Minibatch loss for epoch 1: 1105.405648
Minibatch accuracy: 65.6%
Validation accuracy: 77.2%

Avg Minibatch loss for epoch 2: 632.923827
Minibatch accuracy: 61.7%
Validation accuracy: 76.8%

Avg Minibatch loss for epoch 3: 393.026527
Minibatch accuracy: 64.1%
Validation accuracy: 75.8%

Avg Minibatch loss for epoch 4: 248.010304
Minibatch accuracy: 63.3%
Validation accuracy: 72.8%

Avg Minibatch loss for epoch 5: 147.903750
Minibatch accuracy: 54.7%
Validation accuracy: 68.8%

Avg Minibatch loss for epoch 6: 90.266291
Minibatch accuracy: 56.2%
Validation accuracy: 61.5%

Avg Minibatch loss for epoch 7: 57.327172
Minibatch accuracy: 48.4%
Validation accuracy: 57.6%

Avg Minibatch loss for epoch 8: 38.531971
Minibatch accuracy: 43.8%
Validation accuracy: 53.9%

Avg Minibatch loss for epoch 9: 27.459044
Minibatch accuracy: 39.1%
Validation accuracy: 49.8%

Avg Minibatch loss f

In [5]:
batch_size = 64
h0 = 256
h1 = 256
h2 = 256
lambda_ = 5e-5
lr = .0005
keep_prob = .675

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    tf_tr_dataset = tf.constant(train_dataset)

    # Variables.
    w0 = tf.Variable(tf.truncated_normal([image_size * image_size, h0]))
    b0 = tf.Variable(tf.zeros([h0]))

    w1 = tf.Variable(tf.truncated_normal([h0, h1]))
    b1 = tf.Variable(tf.zeros([h1]))
    
    w2 = tf.Variable(tf.truncated_normal([h1, h2]))
    b2 = tf.Variable(tf.zeros([h2]))

    w3 = tf.Variable(tf.truncated_normal([h2, num_labels]))
    b3 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    s0 = tf.nn.dropout(tf.nn.relu(tf.matmul(tf_train_dataset, w0) + b0), keep_prob)
    s1 = tf.nn.dropout(tf.nn.relu(tf.matmul(s0, w1) + b1), keep_prob)
    s2 = tf.nn.relu(tf.matmul(s1, w2) + b2)
    logits = tf.matmul(s2, w3) + b3

    reg = tf.nn.l2_loss(w0) + tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2) + tf.nn.l2_loss(w3)
    # reg = tf.reduce_sum(tf.square(w0)) + tf.reduce_sum(tf.square(w1)) + tf.reduce_sum(tf.square(w1))

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + (lambda_ * reg)

    # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)

    v0 = tf.nn.relu(tf.matmul(tf_valid_dataset, w0) + b0)
    v1 = tf.nn.relu(tf.matmul(v0, w1) + b1)
    v2 = tf.nn.relu(tf.matmul(v1, w2) + b2)
    valid_prediction = tf.nn.softmax(tf.matmul(v2, w3) + b3)

    t0 = tf.nn.relu(tf.matmul(tf_test_dataset, w0) + b0)
    t1 = tf.nn.relu(tf.matmul(t0, w1) + b1)
    t2 = tf.nn.relu(tf.matmul(t1, w2) + b2)
    test_prediction = tf.nn.softmax(tf.matmul(t2, w3) + b3)
    
    r0 = tf.nn.relu(tf.matmul(tf_tr_dataset, w0) + b0)
    r1 = tf.nn.relu(tf.matmul(r0, w1) + b1)
    r2 = tf.nn.relu(tf.matmul(r1, w2) + b2)
    tr_prediction = tf.nn.softmax(tf.matmul(r2, w3) + b3)

batches = np.ceil(float(train_labels.shape[0]) / batch_size)
num_epochs = 175
num_steps = int(np.ceil(float(train_labels.shape[0]) / batch_size))

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for epoch in range(num_epochs):
        l_mean = 0
        for step in range(num_steps):
            # Pick an offset within the training data, which has been randomized.
            # Note: we could use better randomization across epochs.
            # offset = (step * batch_size) % (int(batch_size * 2.5) - batch_size)
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            # Generate a minibatch.
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            # Prepare a dictionary telling the session where to feed the minibatch.
            # The key of the dictionary is the placeholder node of the graph to be fed,
            # and the value is the numpy array to feed to it.
            feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}

            _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
            l_mean += l

        print("Avg Minibatch loss for epoch %d: %f" % (epoch, l_mean / num_steps))
        print(" Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
        if (epoch % 5 == 0):
            print("  ***   Training set accuracy: %.2f%% ***" % accuracy(tr_prediction.eval(), train_labels))
            print("  *** Validation set accuracy: %.2f%% ***" % accuracy(valid_prediction.eval(), valid_labels))
    print("\nFinal Training set accuracy: %.2f%%" % accuracy(tr_prediction.eval(), train_labels))
    print("Final Validation set accuracy: %.2f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test set accuracy: %.2f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Avg Minibatch loss for epoch 0: 2886.379087
 Minibatch accuracy: 54.7%
  ***   Training set accuracy: 77.80% ***
  *** Validation set accuracy: 77.43% ***
Avg Minibatch loss for epoch 1: 736.311754
 Minibatch accuracy: 59.4%
Avg Minibatch loss for epoch 2: 358.629818
 Minibatch accuracy: 62.5%
Avg Minibatch loss for epoch 3: 174.180594
 Minibatch accuracy: 53.1%
Avg Minibatch loss for epoch 4: 83.722366
 Minibatch accuracy: 48.4%
Avg Minibatch loss for epoch 5: 45.430048
 Minibatch accuracy: 45.3%
  ***   Training set accuracy: 55.45% ***
  *** Validation set accuracy: 55.21% ***
Avg Minibatch loss for epoch 6: 28.766584
 Minibatch accuracy: 37.5%
Avg Minibatch loss for epoch 7: 20.805520
 Minibatch accuracy: 29.7%
Avg Minibatch loss for epoch 8: 16.216207
 Minibatch accuracy: 31.2%
Avg Minibatch loss for epoch 9: 13.543785
 Minibatch accuracy: 35.9%
Avg Minibatch loss for epoch 10: 11.727413
 Minibatch accuracy: 35.9%
  ***   Training set accuracy: 40.28% ***
  *** Validat

In [161]:
best_w0 = w0
best_w1 = w1
best_w2 = w2
best_w3 = w3

best_b0 = b0
best_b1 = b1
best_b2 = b2
best_b3 = b3

best_lr = lr
best_h0 = h0
best_h1 = h1
best_h2 = h2
best_lambda = lambda_
best_keep = keep_prob

best_epochs = 175

In [170]:
batch_size = 64
h0 = 256
h1 = 256
h2 = 256
lambda_ = 6e-4
lr = .000375
keep_prob = .675

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    tf_tr_dataset = tf.constant(train_dataset)

    # Variables.
    w0 = tf.Variable(tf.truncated_normal([image_size * image_size, h0]))
    b0 = tf.Variable(tf.zeros([h0]))

    w1 = tf.Variable(tf.truncated_normal([h0, h1]))
    b1 = tf.Variable(tf.zeros([h1]))
    
    w2 = tf.Variable(tf.truncated_normal([h1, h2]))
    b2 = tf.Variable(tf.zeros([h2]))

    w3 = tf.Variable(tf.truncated_normal([h2, num_labels]))
    b3 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    s0 = tf.nn.dropout(tf.nn.relu(tf.matmul(tf_train_dataset, w0) + b0), keep_prob)
    s1 = tf.nn.dropout(tf.nn.relu(tf.matmul(s0, w1) + b1), keep_prob)
    s2 = tf.nn.relu(tf.matmul(s1, w2) + b2)
    logits = tf.matmul(s2, w3) + b3

    reg = tf.nn.l2_loss(w0) + tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2) + tf.nn.l2_loss(w3)
    # reg = tf.reduce_sum(tf.square(w0)) + tf.reduce_sum(tf.square(w1)) + tf.reduce_sum(tf.square(w1))

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + (lambda_ * reg)

    # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)

    v0 = tf.nn.relu(tf.matmul(tf_valid_dataset, w0) + b0)
    v1 = tf.nn.relu(tf.matmul(v0, w1) + b1)
    v2 = tf.nn.relu(tf.matmul(v1, w2) + b2)
    valid_prediction = tf.nn.softmax(tf.matmul(v2, w3) + b3)

    t0 = tf.nn.relu(tf.matmul(tf_test_dataset, w0) + b0)
    t1 = tf.nn.relu(tf.matmul(t0, w1) + b1)
    t2 = tf.nn.relu(tf.matmul(t1, w2) + b2)
    test_prediction = tf.nn.softmax(tf.matmul(t2, w3) + b3)
    
    r0 = tf.nn.relu(tf.matmul(tf_tr_dataset, w0) + b0)
    r1 = tf.nn.relu(tf.matmul(r0, w1) + b1)
    r2 = tf.nn.relu(tf.matmul(r1, w2) + b2)
    tr_prediction = tf.nn.softmax(tf.matmul(r2, w3) + b3)

batches = np.ceil(float(train_labels.shape[0]) / batch_size)
num_epochs = 175
num_steps = int(np.ceil(float(train_labels.shape[0]) / batch_size))
curves = []

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for epoch in range(num_epochs):
        l_mean = 0
        for step in range(num_steps):
            # Pick an offset within the training data, which has been randomized.
            # Note: we could use better randomization across epochs.
            # offset = (step * batch_size) % (int(batch_size * 2.5) - batch_size)
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            # Generate a minibatch.
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            # Prepare a dictionary telling the session where to feed the minibatch.
            # The key of the dictionary is the placeholder node of the graph to be fed,
            # and the value is the numpy array to feed to it.
            feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}

            _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
            l_mean += l

        print("Avg Minibatch loss for epoch %d: %f" % (epoch, l_mean / num_steps))
        # print(" Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
        if (epoch % 5 == 0):
            tra = accuracy(tr_prediction.eval(), train_labels)
            va = accuracy(valid_prediction.eval(), valid_labels)
            print("  ***   Training set accuracy: %.2f%% ***" % tra)
            print("  *** Validation set accuracy: %.2f%% ***" % va)
            curves.append({'epoch': epoch, 'train': 1 - tra, 'val': 1- va})
    print("\nFinal Training set accuracy: %.2f%%" % accuracy(tr_prediction.eval(), train_labels))
    print("Final Validation set accuracy: %.2f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test set accuracy: %.2f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Avg Minibatch loss for epoch 0: 3335.316641
  ***   Training set accuracy: 77.73% ***
  *** Validation set accuracy: 77.30% ***
Avg Minibatch loss for epoch 1: 1003.327266
Avg Minibatch loss for epoch 2: 566.657956
Avg Minibatch loss for epoch 3: 353.749486
Avg Minibatch loss for epoch 4: 230.371917
Avg Minibatch loss for epoch 5: 158.369888
  ***   Training set accuracy: 61.99% ***
  *** Validation set accuracy: 61.15% ***
Avg Minibatch loss for epoch 6: 121.026956
Avg Minibatch loss for epoch 7: 101.663283
Avg Minibatch loss for epoch 8: 90.571421
Avg Minibatch loss for epoch 9: 83.337551
Avg Minibatch loss for epoch 10: 78.015060
  ***   Training set accuracy: 40.87% ***
  *** Validation set accuracy: 40.73% ***
Avg Minibatch loss for epoch 11: 73.526497
Avg Minibatch loss for epoch 12: 69.317634
Avg Minibatch loss for epoch 13: 65.063659
Avg Minibatch loss for epoch 14: 60.674348
Avg Minibatch loss for epoch 15: 56.178468
  ***   Training set accuracy: 41.42% ***
  *** 

Learning Curves:

In [171]:

batch_size = 64
num_epochs = 125
h0 = 256
h1 = 256
h2 = 256
lambda_ = 5e-5
lr = .0005
keep_prob = .675

run_sizes = [500, 1000, 2500, 5000, 10000, 25000, 50000, 250000, 100000, train_labels.shape[0]]

lcurves = []

for m in run_sizes:
    x_train_dataset = train_dataset[:m, :]
    x_train_labels = train_labels[:m, :]

    graph = tf.Graph()
    with graph.as_default():
        # Input data. For the training data, we use a placeholder that will be fed
        # at run time with a training minibatch.
        tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
        tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)
        tf_tr_dataset = tf.constant(x_train_dataset)

        # Variables.
        w0 = tf.Variable(tf.truncated_normal([image_size * image_size, h0]))
        b0 = tf.Variable(tf.zeros([h0]))

        w1 = tf.Variable(tf.truncated_normal([h0, h1]))
        b1 = tf.Variable(tf.zeros([h1]))

        w2 = tf.Variable(tf.truncated_normal([h1, h2]))
        b2 = tf.Variable(tf.zeros([h2]))

        w3 = tf.Variable(tf.truncated_normal([h2, num_labels]))
        b3 = tf.Variable(tf.zeros([num_labels]))

        # Training computation.
        s0 = tf.nn.dropout(tf.nn.relu(tf.matmul(tf_train_dataset, w0) + b0), keep_prob)
        s1 = tf.nn.dropout(tf.nn.relu(tf.matmul(s0, w1) + b1), keep_prob)
        s2 = tf.nn.relu(tf.matmul(s1, w2) + b2)
        logits = tf.matmul(s2, w3) + b3

        reg = tf.nn.l2_loss(w0) + tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2) + tf.nn.l2_loss(w3)
        # reg = tf.reduce_sum(tf.square(w0)) + tf.reduce_sum(tf.square(w1)) + tf.reduce_sum(tf.square(w1))

        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + (lambda_ * reg)

        # Optimizer.
        # optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)
        optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

        # Predictions for the training, validation, and test data.
        train_prediction = tf.nn.softmax(logits)

        v0 = tf.nn.relu(tf.matmul(tf_valid_dataset, w0) + b0)
        v1 = tf.nn.relu(tf.matmul(v0, w1) + b1)
        v2 = tf.nn.relu(tf.matmul(v1, w2) + b2)
        valid_prediction = tf.nn.softmax(tf.matmul(v2, w3) + b3)

        t0 = tf.nn.relu(tf.matmul(tf_test_dataset, w0) + b0)
        t1 = tf.nn.relu(tf.matmul(t0, w1) + b1)
        t2 = tf.nn.relu(tf.matmul(t1, w2) + b2)
        test_prediction = tf.nn.softmax(tf.matmul(t2, w3) + b3)

        r0 = tf.nn.relu(tf.matmul(tf_tr_dataset, w0) + b0)
        r1 = tf.nn.relu(tf.matmul(r0, w1) + b1)
        r2 = tf.nn.relu(tf.matmul(r1, w2) + b2)
        tr_prediction = tf.nn.softmax(tf.matmul(r2, w3) + b3)

    batches = np.ceil(float(train_labels.shape[0]) / batch_size)
    num_steps = int(np.ceil(float(train_labels.shape[0]) / batch_size))

    with tf.Session(graph=graph) as session:
        tf.initialize_all_variables().run()
        print("Run size %d initialized" % m)
        for epoch in range(num_epochs):
            l_mean = 0
            for step in range(num_steps):
                # Pick an offset within the training data, which has been randomized.
                # Note: we could use better randomization across epochs.
                # offset = (step * batch_size) % (int(batch_size * 2.5) - batch_size)
                offset = (step * batch_size) % (x_train_labels.shape[0] - batch_size)
                # Generate a minibatch.
                batch_data = x_train_dataset[offset:(offset + batch_size), :]
                batch_labels = x_train_labels[offset:(offset + batch_size), :]
                # Prepare a dictionary telling the session where to feed the minibatch.
                # The key of the dictionary is the placeholder node of the graph to be fed,
                # and the value is the numpy array to feed to it.
                feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}

                _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
                l_mean += l

            print("   Avg Minibatch loss for epoch %d: %f" % (epoch, l_mean / num_steps))
            # print(" Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            if (epoch % 5 == 0):
                tra = accuracy(tr_prediction.eval(), x_train_labels)
                va = accuracy(valid_prediction.eval(), valid_labels)
                print("     ***   Training set accuracy: %.2f%% ***" % tra)
                print("     *** Validation set accuracy: %.2f%% ***" % va)

        ta = accuracy(tr_prediction.eval(), x_train_labels)
        va = accuracy(valid_prediction.eval(), valid_labels)
        lcurves.append({"m": m, "train": 1 - ta / 100, "val": 1 - va / 100})
        print("\n   Final Training set accuracy: %.2f%%" % ta)
        print("   Final Validation set accuracy: %.2f%%" % va)
        print("   Test set accuracy: %.2f%%\n\n" % accuracy(test_prediction.eval(), test_labels))


Run size 500 initialized
   Avg Minibatch loss for epoch 0: 1768.132651
     ***   Training set accuracy: 98.80% ***
     *** Validation set accuracy: 76.55% ***
   Avg Minibatch loss for epoch 1: 120.038027
   Avg Minibatch loss for epoch 2: 42.229033
   Avg Minibatch loss for epoch 3: 22.663578
   Avg Minibatch loss for epoch 4: 15.800269
   Avg Minibatch loss for epoch 5: 13.092072
     ***   Training set accuracy: 100.00% ***
     *** Validation set accuracy: 78.52% ***
   Avg Minibatch loss for epoch 6: 11.596330
   Avg Minibatch loss for epoch 7: 10.199064
   Avg Minibatch loss for epoch 8: 9.460913
   Avg Minibatch loss for epoch 9: 9.408908
   Avg Minibatch loss for epoch 10: 8.506672
     ***   Training set accuracy: 100.00% ***
     *** Validation set accuracy: 78.34% ***
   Avg Minibatch loss for epoch 11: 8.565779
   Avg Minibatch loss for epoch 12: 8.128609
   Avg Minibatch loss for epoch 13: 7.745308
   Avg Minibatch loss for epoch 14: 7.899625
   Avg Minibatch loss for e

In [172]:
import pandas as pd

In [173]:
lc = pd.DataFrame(lcurves)
print(lc)

        m     train     val
0     500  0.000000  0.2131
1    1000  0.001000  0.2061
2    2500  0.002800  0.1971
3    5000  0.001800  0.1706
4   10000  0.002100  0.1531
5   25000  0.003400  0.1343
6   50000  0.007100  0.1204
7  250000  0.060545  0.0951
8  100000  0.027610  0.1043
9  200000  0.057140  0.0918
