Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [0]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
import io

First reload the data we generated in `1_notmnist.ipynb`. We're downloading this from drive again.

In [0]:
#Auth
from google.colab import auth
auth.authenticate_user()

#Drive API client
from googleapiclient.discovery import build
drive_service = build('drive', 'v3')

#Downloads
from googleapiclient.http import MediaIoBaseDownload

In [3]:
#Find file id
file_name = 'notMNIST.pickle' #@param {type:"string"}
file_id = ''
page_token = None
while True:
    response = drive_service.files().list(q="name contains '{}'".format(file_name),
                                          spaces='drive',
                                          fields='nextPageToken, files(id, name)',
                                          pageToken=page_token).execute()
    for file in response.get('files', []):
        # Process change
        print('Found file: %s (%s)' % (file.get('name'), file.get('id')))
        file_id = file.get('id')
    page_token = response.get('nextPageToken', None)
    if page_token is None:
        break
        
#Download file id
request = drive_service.files().get_media(fileId=file_id)
handler = io.BytesIO()
downloader = MediaIoBaseDownload(handler, request)
done = False
while done is False:
  status, done = downloader.next_chunk()
  print("Download %d%%" % int(status.progress() * 100))
  
handler.seek(0)
save = pickle.load(handler)
print("Loaded pickle: " + str(save))

Found file: notMNIST.pickle (1dlQpbatb09W3W0nBbjlpiB8fSbEn46UL)
Download 15%
Download 30%
Download 45%
Download 60%
Download 75%
Download 91%
Download 100%
Loaded pickle: {'train_labels': array([4, 9, 6, ..., 2, 4, 4], dtype=int32), 'valid_labels': array([1, 9, 3, ..., 8, 9, 8], dtype=int32), 'test_labels': array([3, 6, 4, ..., 6, 9, 0], dtype=int32), 'test_dataset': array([[[-0.5       , -0.5       , -0.48431373, ..., -0.49607843,
         -0.5       , -0.5       ],
        [-0.5       , -0.5       , -0.49607843, ..., -0.5       ,
         -0.49607843, -0.5       ],
        [-0.5       , -0.5       , -0.5       , ..., -0.46078432,
         -0.49215686, -0.49607843],
        ...,
        [-0.5       , -0.5       , -0.40980393, ..., -0.5       ,
         -0.5       , -0.5       ],
        [-0.30784315, -0.01764706,  0.3745098 , ..., -0.5       ,
         -0.5       , -0.5       ],
        [ 0.29607844,  0.5       ,  0.5       , ..., -0.5       ,
         -0.5       , -0.5       ]],

   

In [4]:
#pickle_file = 'notMNIST.pickle'

#with open(pickle_file, 'rb') as f:
#  save = pickle.load(f)

train_dataset = save['train_dataset']
train_labels = save['train_labels']
valid_dataset = save['valid_dataset']
valid_labels = save['valid_labels']
test_dataset = save['test_dataset']
test_labels = save['test_labels']
del save  # hint to help gc free up memory
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [5]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [0]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

Logistic model:
To introduce L2 regularlization, we need to add the L2 norm (sum of squares) of the weights to the loss and adding a constant that can be tuned.


In [7]:
batch_size = 128
b = 0.01

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  logits = tf.matmul(tf_train_dataset, weights) + biases
  
  # Add L2 Normalizaiton here:
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits)
    + b * tf.nn.l2_loss(weights))
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(
    tf.matmul(tf_valid_dataset, weights) + biases)
  test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [8]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 47.650177
Minibatch accuracy: 11.7%
Validation accuracy: 14.6%
Minibatch loss at step 500: 0.880854
Minibatch accuracy: 82.0%
Validation accuracy: 81.2%
Minibatch loss at step 1000: 0.976134
Minibatch accuracy: 78.1%
Validation accuracy: 81.9%
Minibatch loss at step 1500: 1.011078
Minibatch accuracy: 76.6%
Validation accuracy: 82.2%
Minibatch loss at step 2000: 0.828068
Minibatch accuracy: 82.0%
Validation accuracy: 80.2%
Minibatch loss at step 2500: 0.947139
Minibatch accuracy: 76.6%
Validation accuracy: 82.1%
Minibatch loss at step 3000: 0.724812
Minibatch accuracy: 84.4%
Validation accuracy: 81.5%
Test accuracy: 87.7%


And now for the 1 layer NN with hidden ReLUs

In [0]:
# Pulled from the SGD example above and from help from these stackoverflow posts:
# https://stackoverflow.com/questions/35387109/validation-and-test-with-tensorflow
# https://stackoverflow.com/questions/38641104/tensorflow-relu-misunderstanding

batch_size = 128
b = 0.01

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
    
  # Weights and biases for before ReLU, need to transform into a 1024 node layer
  weights_pre = tf.Variable(tf.truncated_normal([image_size * image_size, 1024]))
  biases_pre = tf.Variable(tf.zeros(1024))
  
  # Weights and biases for after ReLU, reduce down to label size
  weights_post = tf.Variable(tf.truncated_normal([1024, num_labels]))
  biases_post = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  hidden = tf.nn.relu(tf.matmul(tf_train_dataset, weights_pre) + biases_pre)
  logits = tf.matmul(hidden, weights_post) + biases_post
  regularizer = tf.nn.l2_loss(weights_pre) + tf.nn.l2_loss(weights_post)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits) + 
      b * regularizer)
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions for the training dataset
  train_prediction = tf.nn.softmax(logits)
  
  # Predictions for validation dataset
  valid_logit = tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights_pre) + biases_pre), weights_post) + biases_post
  valid_prediction = tf.nn.softmax(valid_logit)
  
  # Predictions for test dataset
  test_logit = tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights_pre) + biases_pre), weights_post) + biases_post
  test_prediction = tf.nn.softmax(test_logit)

In [10]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3425.981445
Minibatch accuracy: 11.7%
Validation accuracy: 23.5%
Minibatch loss at step 500: 21.226393
Minibatch accuracy: 84.4%
Validation accuracy: 84.5%
Minibatch loss at step 1000: 0.988222
Minibatch accuracy: 78.1%
Validation accuracy: 84.1%
Minibatch loss at step 1500: 0.843288
Minibatch accuracy: 79.7%
Validation accuracy: 83.5%
Minibatch loss at step 2000: 0.726166
Minibatch accuracy: 83.6%
Validation accuracy: 83.7%
Minibatch loss at step 2500: 0.888704
Minibatch accuracy: 78.1%
Validation accuracy: 84.0%
Minibatch loss at step 3000: 0.741414
Minibatch accuracy: 85.9%
Validation accuracy: 84.3%
Test accuracy: 90.1%


---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [0]:
# Pulled from the SGD example above and from help from these stackoverflow posts:
# https://stackoverflow.com/questions/35387109/validation-and-test-with-tensorflow
# https://stackoverflow.com/questions/38641104/tensorflow-relu-misunderstanding

batch_size = 128
b = 0.01

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
    
  # Weights and biases for before ReLU, need to transform into a 1024 node layer
  weights_pre = tf.Variable(tf.truncated_normal([image_size * image_size, 1024]))
  biases_pre = tf.Variable(tf.zeros(1024))
  
  # Weights and biases for after ReLU, reduce down to label size
  weights_post = tf.Variable(tf.truncated_normal([1024, num_labels]))
  biases_post = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  hidden = tf.nn.relu(tf.matmul(tf_train_dataset, weights_pre) + biases_pre)
  logits = tf.matmul(hidden, weights_post) + biases_post
  regularizer = tf.nn.l2_loss(weights_pre) + tf.nn.l2_loss(weights_post)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits) + b * regularizer)
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions for the training dataset
  train_prediction = tf.nn.softmax(logits)
  
  # Predictions for validation dataset
  valid_logit = tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights_pre) + biases_pre), weights_post) + biases_post
  valid_prediction = tf.nn.softmax(valid_logit)
  
  # Predictions for test dataset
  test_logit = tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights_pre) + biases_pre), weights_post) + biases_post
  test_prediction = tf.nn.softmax(test_logit)

In [12]:
# Create new restricted training dataset

restr_train_dataset = train_dataset[:500]
restr_train_labels = train_labels[:500]

print(restr_train_dataset.shape)
print(train_dataset.shape)
print(restr_train_labels.shape)
print(train_labels.shape)

(500, 784)
(200000, 784)
(500, 10)
(200000, 10)


In [13]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (restr_train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = restr_train_dataset[offset:(offset + batch_size), :]
    batch_labels = restr_train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3517.274902
Minibatch accuracy: 11.7%
Validation accuracy: 32.5%
Minibatch loss at step 500: 20.972031
Minibatch accuracy: 100.0%
Validation accuracy: 78.7%
Minibatch loss at step 1000: 0.470602
Minibatch accuracy: 100.0%
Validation accuracy: 78.7%
Minibatch loss at step 1500: 0.298813
Minibatch accuracy: 100.0%
Validation accuracy: 78.5%
Minibatch loss at step 2000: 0.277259
Minibatch accuracy: 100.0%
Validation accuracy: 78.3%
Minibatch loss at step 2500: 0.275610
Minibatch accuracy: 100.0%
Validation accuracy: 78.5%
Minibatch loss at step 3000: 0.272792
Minibatch accuracy: 100.0%
Validation accuracy: 78.6%
Test accuracy: 85.3%


---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

While introducint dropout to the model, it turns out that keep_prob needs to be changed when running the validation and test sessions. Since this current architecture feeds all datasets in the feed dict, which is also where I feed the keep_prob, I'll need to rearchitect this to allow for different feeds.

To do this, lets decouple the datasets in the feed dictionary and make the initial feed dictionary only contain the training dataset. Once we have the trained model, we can evaluate it with the test and validation datasets.

In [9]:
# Create new restricted training dataset

restr_train_dataset = train_dataset[:500]
restr_train_labels = train_labels[:500]

print(restr_train_dataset.shape)
print(train_dataset.shape)
print(restr_train_labels.shape)
print(train_labels.shape)

(500, 784)
(200000, 784)
(500, 10)
(200000, 10)


In [0]:
# Pulled from the SGD example above and from help from these stackoverflow posts:
# https://stackoverflow.com/questions/35387109/validation-and-test-with-tensorflow
# https://stackoverflow.com/questions/38641104/tensorflow-relu-misunderstanding
# This section will introduce Dropout through tf.nn.dropout()
# Since the list of calls that we have to make for test and valid prediction calculations
# are getting a bit large, lets move them into a function

batch_size = 128
b = 0.01

graph = tf.Graph()
with graph.as_default():
  
  # Input data, we're generalizing the graph for training, validation, and test data
  tf_keep_prob = tf.placeholder(tf.float32)
  tf_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
  tf_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    
  # Weights and biases for before ReLU, need to transform into a 1024 node layer
  weights_pre = tf.Variable(tf.truncated_normal([image_size * image_size, 1024]))
  biases_pre = tf.Variable(tf.zeros(1024))
  
  # Weights and biases for after ReLU, reduce down to label size
  weights_post = tf.Variable(tf.truncated_normal([1024, num_labels]))
  biases_post = tf.Variable(tf.zeros([num_labels]))
  
  def model(tf_dataset, weights_pre, weights_post, biases_pre, biases_post):  
    hidden = tf.nn.relu(tf.matmul(tf_dataset, weights_pre) + biases_pre)
    dropped = tf.nn.dropout(hidden, tf_keep_prob)
    out_layer = tf.matmul(dropped, weights_post) + biases_post
    return out_layer
  
  # Actual computation of model
  logits = model(tf_dataset, weights_pre, weights_post, biases_pre, biases_post)
  
  # Loss and Optimizer
  regularizer = tf.nn.l2_loss(weights_pre) + tf.nn.l2_loss(weights_post)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_labels, logits=logits) + 
    b * regularizer)
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions
  prediction = tf.nn.softmax(logits)

In [22]:
# Since this method doesn't allow specifying of different keep probs per session
# we'll need to figure out how to allow different keep probabilities per prediction
# My understanding of the underlying code is a bit wonky, so I'll go back to the old architecture

num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (restr_train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = restr_train_dataset[offset:(offset + batch_size), :]
    batch_labels = restr_train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_dataset : batch_data, tf_labels : batch_labels, 
                 tf_keep_prob : 0.5}
    _, l, predictions = session.run(
      [optimizer, loss, prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        prediction.eval({tf_dataset: valid_dataset, tf_labels: valid_labels, tf_keep_prob: 1.0}), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(prediction.eval({tf_dataset: test_dataset, tf_labels: test_labels, tf_keep_prob: 1.0}), test_labels))

Initialized
Minibatch loss at step 0: 3632.326904
Minibatch accuracy: 10.9%


AttributeError: ignored

In [7]:
# Pulled from the SGD example above and from help from these stackoverflow posts:
# https://stackoverflow.com/questions/35387109/validation-and-test-with-tensorflow
# https://stackoverflow.com/questions/38641104/tensorflow-relu-misunderstanding

batch_size = 128
b = 0.01

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
    
  # Weights and biases for before ReLU, need to transform into a 1024 node layer
  weights_pre = tf.Variable(tf.truncated_normal([image_size * image_size, 1024]))
  biases_pre = tf.Variable(tf.zeros(1024))
  
  # Weights and biases for after ReLU, reduce down to label size
  weights_post = tf.Variable(tf.truncated_normal([1024, num_labels]))
  biases_post = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  hidden = tf.nn.relu(tf.matmul(tf_train_dataset, weights_pre) + biases_pre)
  keep_prob = tf.placeholder('float')
  hidden_dropped = tf.nn.dropout(hidden, keep_prob)
  logits = tf.matmul(hidden_dropped, weights_post) + biases_post
  regularizer = tf.nn.l2_loss(weights_pre) + tf.nn.l2_loss(weights_post)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits) + b * regularizer)
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions for the training dataset
  train_prediction = tf.nn.softmax(logits)
  
  # Predictions for validation dataset
  valid_logit = tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights_pre) + biases_pre), weights_post) + biases_post
  valid_prediction = tf.nn.softmax(valid_logit)
  
  # Predictions for test dataset
  test_logit = tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights_pre) + biases_pre), weights_post) + biases_post
  test_prediction = tf.nn.softmax(test_logit)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [11]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (restr_train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = restr_train_dataset[offset:(offset + batch_size), :]
    batch_labels = restr_train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob: 0.5}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3626.314453
Minibatch accuracy: 5.5%
Validation accuracy: 30.9%
Minibatch loss at step 500: 21.041096
Minibatch accuracy: 100.0%
Validation accuracy: 78.9%
Minibatch loss at step 1000: 0.499112
Minibatch accuracy: 99.2%
Validation accuracy: 78.9%
Minibatch loss at step 1500: 0.325664
Minibatch accuracy: 100.0%
Validation accuracy: 78.6%
Minibatch loss at step 2000: 0.309257
Minibatch accuracy: 100.0%
Validation accuracy: 78.4%
Minibatch loss at step 2500: 0.296944
Minibatch accuracy: 100.0%
Validation accuracy: 78.4%
Minibatch loss at step 3000: 0.293223
Minibatch accuracy: 100.0%
Validation accuracy: 78.9%
Test accuracy: 85.5%


---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---
