# Deep Learning
## Assignment 3
Previously in 2_fullyconnected.ipynb, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in 1_notmnist.ipynb.

In [2]:
pickle_file = 'notMNIST.pickle'

# Deserialize the dictionary of arrays; the file only needs to stay open for
# the duration of the load itself.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

# Bind each split's images and labels to the names used by the rest of the
# notebook.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # drop the dictionary reference so gc can reclaim it

# Report the shape of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  """Flatten images into rows and one-hot encode the class labels.

  Args:
    dataset: array that can be reshaped to (-1, image_size * image_size).
    labels: array of class indices, or a 2-D array with num_labels columns,
      which is taken to be one-hot encoded already.

  Returns:
    A (dataset, labels) tuple of float32 arrays: dataset reshaped to
    (-1, image_size * image_size) and labels one-hot encoded over
    num_labels classes.
  """
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  if labels.ndim == 2 and labels.shape[1] == num_labels:
    # Already one-hot, e.g. because the calling cell was executed a second
    # time on its own output. Encoding again would silently produce a
    # meaningless (n, 1, num_labels) array, so pass the labels through.
    return dataset, labels.astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
# Flatten every split and one-hot encode its labels. Each call rebinds the
# very names it was given, so the pre-reformat arrays are no longer reachable
# under any name once this cell has run.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
# Print the new shapes of each split's data and label arrays.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)


Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max class matches the label.

  Both arguments are indexed along axis 1 with argmax, so each is expected
  to hold one row of per-class scores (or a one-hot vector) per example.
  """
  predicted_classes = np.argmax(predictions, 1)
  true_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

# Problem 1
Introduce and tune L2 regularization for both logistic and neural network models.
Remember that L2 amounts to adding a penalty on the norm of the weights to the loss.
In TensorFlow, you can compute the L2 loss for a tensor t using nn.l2_loss(t).
The right amount of regularization should improve your validation / test accuracy.

In [10]:
## L2 Regularization for Logistic Regression

In [8]:
# With gradient descent training, even this much data is prohibitive.
# Subset the training data for faster turnaround.
train_subset = 10000
beta = 0.01  # weight of the L2 penalty in the total loss

graph = tf.Graph()
with graph.as_default():

  # Input data.
  # Load the training, validation and test data into constants that are
  # attached to the graph.
  tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
  tf_train_labels = tf.constant(train_labels[:train_subset])
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables.
  # These are the parameters that we are going to be training. The weight
  # matrix will be initialized using random values following a (truncated)
  # normal distribution. The biases get initialized to zero.
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))

  # Training computation.
  # We multiply the inputs with the weight matrix, and add biases. We compute
  # the softmax and cross-entropy (it's one operation in TensorFlow, because
  # it's very common, and it can be optimized). We take the average of this
  # cross-entropy across all training examples: that's the data term.
  logits = tf.matmul(tf_train_dataset, weights) + biases
  cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Total loss = data term + beta * L2 penalty on the weights (the biases are
  # not penalized). Both terms are already scalars, so no further reduction
  # is needed.
  regularizer = tf.nn.l2_loss(weights)
  loss = cross_entropy + beta * regularizer

  # Optimizer.
  # We are going to find the minimum of this loss using gradient descent.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

  # Predictions for the training, validation, and test data.
  # These are not part of training, but merely here so that we can report
  # accuracy figures as we train.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(
    tf.matmul(tf_valid_dataset, weights) + biases)
  test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

Run Computation and Iterate

In [9]:
num_steps = 801

# accuracy() is already defined in the cell just above "Problem 1"; it is
# deliberately not redefined here so the notebook keeps a single definition.

with tf.Session(graph=graph) as session:
  # This is a one-time operation which ensures the parameters get initialized as
  # we described in the graph: random weights for the matrix, zeros for the
  # biases.
  tf.global_variables_initializer().run()
  print('Initialized')
  for step in range(num_steps):
    # Run the computations. We tell .run() that we want to run the optimizer,
    # and get the loss value and the training predictions returned as numpy
    # arrays.
    _, l, predictions = session.run([optimizer, loss, train_prediction])
    if (step % 100 == 0):
      print('Loss at step %d: %f' % (step, l))
      print('Training accuracy: %.1f%%' % accuracy(
        predictions, train_labels[:train_subset, :]))
      # Calling .eval() on valid_prediction is basically like calling run(), but
      # just to get that one numpy array. Note that it recomputes all its graph
      # dependencies.
      print('Validation accuracy: %.1f%%' % accuracy(
        valid_prediction.eval(), valid_labels))
  # Final held-out evaluation once training has finished.
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Loss at step 0: 47.601593
Training accuracy: 5.8%
Validation accuracy: 9.8%
Loss at step 100: 11.783808
Training accuracy: 73.4%
Validation accuracy: 72.6%
Loss at step 200: 4.448852
Training accuracy: 78.9%
Validation accuracy: 77.1%
Loss at step 300: 1.967347
Training accuracy: 82.3%
Validation accuracy: 79.8%
Loss at step 400: 1.126474
Training accuracy: 83.8%
Validation accuracy: 81.5%
Loss at step 500: 0.837888
Training accuracy: 84.2%
Validation accuracy: 82.0%
Loss at step 600: 0.737597
Training accuracy: 84.4%
Validation accuracy: 82.2%
Loss at step 700: 0.702420
Training accuracy: 84.3%
Validation accuracy: 82.3%
Loss at step 800: 0.689995
Training accuracy: 84.3%
Validation accuracy: 82.2%
Test accuracy: 88.9%


## L2 Regularization for Neural Network

In [20]:
batch_size = 128
num_hidden_nodes = 1024
beta = 0.01  # weight of the L2 penalty in the total loss

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables: one hidden layer (weights1/biases1) and the output layer
  # (weights2/biases2).
  weights1 = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_hidden_nodes]))
  biases1 = tf.Variable(tf.zeros([num_hidden_nodes]))
  weights2 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes, num_labels]))
  biases2 = tf.Variable(tf.zeros([num_labels]))

  # Training computation: ReLU hidden layer followed by a linear output layer,
  # with the softmax cross-entropy averaged over the minibatch.
  lay1_train = tf.nn.relu(tf.matmul(tf_train_dataset,weights1)+biases1)
  logits = tf.matmul(lay1_train, weights2) + biases2
  cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Total loss = data term + beta * L2 penalty on both weight matrices (the
  # biases are not penalized). Both terms are already scalars, so no further
  # reduction is needed.
  regularizer = tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2)
  loss = cross_entropy + beta * regularizer

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

  # Predictions for the training, validation, and test data. The validation
  # and test paths reuse the same variables as the training path.
  train_prediction = tf.nn.softmax(logits)
  lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset,weights1)+biases1)
  valid_prediction = tf.nn.softmax(
    tf.matmul(lay1_valid, weights2) + biases2)
  lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset,weights1)+biases1)
  test_prediction = tf.nn.softmax(tf.matmul(lay1_test, weights2) + biases2)

In [21]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  session.run(tf.global_variables_initializer())
  print("Initialized")
  for step in range(num_steps):
    # Slide a window of batch_size rows through the (already randomized)
    # training data; the modulo wraps the start index so a slice never runs
    # past the end. Better randomization across epochs would be possible.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:offset + batch_size]
    batch_labels = train_labels[offset:offset + batch_size]
    # Map each placeholder node of the graph to the numpy array fed into it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    # One optimizer step; the loss and minibatch predictions come back as
    # numpy values from the same run.
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if step % 500 != 0:
      continue
    # Periodic progress report.
    print("Minibatch loss at step %d: %f" % (step, l))
    print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(
      session.run(valid_prediction), valid_labels))
  # Final held-out evaluation once training has finished.
  print("Test accuracy: %.1f%%" % accuracy(session.run(test_prediction), test_labels))

Initialized
Minibatch loss at step 0: 3435.736084
Minibatch accuracy: 7.8%
Validation accuracy: 27.3%
Minibatch loss at step 500: 21.390873
Minibatch accuracy: 82.8%
Validation accuracy: 84.4%
Minibatch loss at step 1000: 0.807028
Minibatch accuracy: 86.7%
Validation accuracy: 83.4%
Minibatch loss at step 1500: 0.789872
Minibatch accuracy: 80.5%
Validation accuracy: 83.5%
Minibatch loss at step 2000: 0.785277
Minibatch accuracy: 77.3%
Validation accuracy: 83.4%
Minibatch loss at step 2500: 0.711362
Minibatch accuracy: 85.9%
Validation accuracy: 84.4%
Minibatch loss at step 3000: 0.695665
Minibatch accuracy: 85.2%
Validation accuracy: 83.7%
Test accuracy: 90.0%


# Problem 2
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

In [19]:
# Reuse the graph built above, but train on only the first 500 examples.
num_steps = 3001
train_dataset_2 = train_dataset[:500,:]
train_labels_2 = train_labels[:500]

with tf.Session(graph=graph) as session:
  session.run(tf.global_variables_initializer())
  print("Initialized")
  for step in range(num_steps):
    # Slide a window of batch_size rows through the restricted training
    # data; the modulo wraps the start index so a slice never runs past the
    # end of the 500-example subset.
    offset = (step * batch_size) % (train_labels_2.shape[0] - batch_size)
    batch_data = train_dataset_2[offset:offset + batch_size]
    batch_labels = train_labels_2[offset:offset + batch_size]
    # Map each placeholder node of the graph to the numpy array fed into it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    # One optimizer step; the loss and minibatch predictions come back as
    # numpy values from the same run.
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if step % 500 != 0:
      continue
    # Periodic progress report.
    print("Minibatch loss at step %d: %f" % (step, l))
    print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(
      session.run(valid_prediction), valid_labels))
  # Final held-out evaluation once training has finished.
  print("Test accuracy: %.1f%%" % accuracy(session.run(test_prediction), test_labels))

Initialized
Minibatch loss at step 0: 3497.442383
Minibatch accuracy: 7.8%
Validation accuracy: 29.9%
Minibatch loss at step 500: 21.065657
Minibatch accuracy: 100.0%
Validation accuracy: 77.1%
Minibatch loss at step 1000: 0.454802
Minibatch accuracy: 100.0%
Validation accuracy: 78.9%
Minibatch loss at step 1500: 0.286952
Minibatch accuracy: 100.0%
Validation accuracy: 78.9%
Minibatch loss at step 2000: 0.268475
Minibatch accuracy: 100.0%
Validation accuracy: 78.7%
Minibatch loss at step 2500: 0.264614
Minibatch accuracy: 100.0%
Validation accuracy: 78.7%
Minibatch loss at step 3000: 0.262687
Minibatch accuracy: 100.0%
Validation accuracy: 78.7%
Test accuracy: 85.7%


As you can see, the minibatch (training) accuracy reaches 100% while the validation accuracy stalls at around 79%, which implies that our model is overfitting.

# Problem 3
Introduce Dropout on the hidden layer of the neural network.
Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation 
results would be stochastic as well.
TensorFlow provides nn.dropout() for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

# L2 Regularization and Dropout to Neural Network

In [22]:
batch_size = 128
num_hidden_nodes = 1024
beta = 0.01  # weight of the L2 penalty in the total loss

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  # Dropout keep probability; it has no default, so it must be fed whenever
  # an op on the training path is run.
  keep_prob = tf.placeholder(tf.float32)

  # Variables: one hidden layer (weights1/biases1) and the output layer
  # (weights2/biases2).
  weights1 = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_hidden_nodes]))
  biases1 = tf.Variable(tf.zeros([num_hidden_nodes]))
  weights2 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes, num_labels]))
  biases2 = tf.Variable(tf.zeros([num_labels]))

  # Training computation.
  # lay1_train is the hidden layer with relu activation function.
  lay1_train = tf.nn.relu(tf.matmul(tf_train_dataset,weights1)+biases1)
  # Apply dropout to the hidden layer (training path only).
  dropout = tf.nn.dropout(lay1_train,keep_prob)
  logits = tf.matmul(dropout, weights2) + biases2
  cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Total loss = data term + beta * L2 penalty on both weight matrices (the
  # biases are not penalized). Both terms are already scalars, so no further
  # reduction is needed.
  regularizer = tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2)
  loss = cross_entropy + beta * regularizer

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

  # Predictions for the training minibatch. These are computed from the
  # dropout-perturbed logits, so they depend on the fed keep_prob.
  train_prediction = tf.nn.softmax(logits)
  # Predictions for the validation and test data bypass the dropout op
  # entirely, so evaluation is deterministic.
  lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset,weights1)+biases1)
  valid_prediction = tf.nn.softmax(
    tf.matmul(lay1_valid, weights2) + biases2)
  lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset,weights1)+biases1)
  test_prediction = tf.nn.softmax(tf.matmul(lay1_test, weights2) + biases2)

In [24]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  session.run(tf.global_variables_initializer())
  print("Initialized")
  for step in range(num_steps):
    # Slide a window of batch_size rows through the (already randomized)
    # training data; the modulo wraps the start index so a slice never runs
    # past the end. Better randomization across epochs would be possible.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:offset + batch_size]
    batch_labels = train_labels[offset:offset + batch_size]
    # Map each placeholder node of the graph to the value fed into it; the
    # dropout keep probability is fed as 0.5 for every training step.
    feed_dict = {
      tf_train_dataset : batch_data,
      tf_train_labels : batch_labels,
      keep_prob : 0.5,
    }
    # One optimizer step; the loss and minibatch predictions come back as
    # numpy values from the same run.
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if step % 500 != 0:
      continue
    # Periodic progress report.
    print("Minibatch loss at step %d: %f" % (step, l))
    print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(
      session.run(valid_prediction), valid_labels))
  # Final held-out evaluation once training has finished.
  print("Test accuracy: %.1f%%" % accuracy(session.run(test_prediction), test_labels))

Initialized
Minibatch loss at step 0: 3692.589355
Minibatch accuracy: 10.2%
Validation accuracy: 28.9%
Minibatch loss at step 500: 21.533356
Minibatch accuracy: 78.1%
Validation accuracy: 84.2%
Minibatch loss at step 1000: 0.897806
Minibatch accuracy: 85.2%
Validation accuracy: 83.2%
Minibatch loss at step 1500: 0.880834
Minibatch accuracy: 80.5%
Validation accuracy: 83.7%
Minibatch loss at step 2000: 0.898835
Minibatch accuracy: 76.6%
Validation accuracy: 83.5%
Minibatch loss at step 2500: 0.766671
Minibatch accuracy: 82.8%
Validation accuracy: 84.0%
Minibatch loss at step 3000: 0.810920
Minibatch accuracy: 82.0%
Validation accuracy: 83.1%
Test accuracy: 89.7%


# Extreme Overfitting

In [25]:
# Reuse the graph built above, but train on only the first 500 examples.
num_steps = 3001
train_dataset_2 = train_dataset[:500,:]
train_labels_2 = train_labels[:500]

with tf.Session(graph=graph) as session:
  session.run(tf.global_variables_initializer())
  print("Initialized")
  for step in range(num_steps):
    # Slide a window of batch_size rows through the restricted training
    # data; the modulo wraps the start index so a slice never runs past the
    # end of the 500-example subset.
    offset = (step * batch_size) % (train_labels_2.shape[0] - batch_size)
    batch_data = train_dataset_2[offset:offset + batch_size]
    batch_labels = train_labels_2[offset:offset + batch_size]
    # Map each placeholder node of the graph to the value fed into it; the
    # dropout keep probability is fed as 0.5 for every training step.
    feed_dict = {
      tf_train_dataset : batch_data,
      tf_train_labels : batch_labels,
      keep_prob : 0.5,
    }
    # One optimizer step; the loss and minibatch predictions come back as
    # numpy values from the same run.
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if step % 500 != 0:
      continue
    # Periodic progress report.
    print("Minibatch loss at step %d: %f" % (step, l))
    print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(
      session.run(valid_prediction), valid_labels))
  # Final held-out evaluation once training has finished.
  print("Test accuracy: %.1f%%" % accuracy(session.run(test_prediction), test_labels))

Initialized
Minibatch loss at step 0: 3568.469727
Minibatch accuracy: 16.4%
Validation accuracy: 32.4%
Minibatch loss at step 500: 21.160070
Minibatch accuracy: 100.0%
Validation accuracy: 78.5%
Minibatch loss at step 1000: 0.485239
Minibatch accuracy: 100.0%
Validation accuracy: 79.2%
Minibatch loss at step 1500: 0.308107
Minibatch accuracy: 100.0%
Validation accuracy: 78.9%
Minibatch loss at step 2000: 0.285827
Minibatch accuracy: 100.0%
Validation accuracy: 78.8%
Minibatch loss at step 2500: 0.289044
Minibatch accuracy: 100.0%
Validation accuracy: 78.9%
Minibatch loss at step 3000: 0.279242
Minibatch accuracy: 100.0%
Validation accuracy: 79.1%
Test accuracy: 85.8%


# Problem 4
Try to get the best performance you can using a multi-layer model!
The best reported test accuracy using a deep network is 97.1%.

One avenue you can explore is to add multiple layers.
Another one is to use learning rate decay.

# With 2 Layers 

In [40]:
import math as math

In [43]:
batch_size = 128
num_hidden_nodes1 = 1024
num_hidden_nodes2 = 256
beta = 0.01  # weight of the L2 penalty in the total loss

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  # No keep_prob placeholder here: this model does not use dropout.

  # Variables. Each weight matrix is drawn with stddev sqrt(2 / fan_in),
  # where fan_in is the width of the layer feeding into it.
  weights1 = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_hidden_nodes1],
                        stddev=math.sqrt(2.0/(image_size*image_size))))
  biases1 = tf.Variable(tf.zeros([num_hidden_nodes1]))
  weights2 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes1, num_hidden_nodes2],
                        stddev=math.sqrt(2.0/num_hidden_nodes1)))
  biases2 = tf.Variable(tf.zeros([num_hidden_nodes2]))
  weights3 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes2, num_labels],
                        stddev=math.sqrt(2.0/num_hidden_nodes2)))
  biases3 = tf.Variable(tf.zeros([num_labels]))

  # Training computation: two ReLU hidden layers followed by a linear output
  # layer, with the softmax cross-entropy averaged over the minibatch.
  lay1_train = tf.nn.relu(tf.matmul(tf_train_dataset,weights1)+biases1)
  lay2_train = tf.nn.relu(tf.matmul(lay1_train,weights2)+biases2)
  logits = tf.matmul(lay2_train, weights3) + biases3
  cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Total loss = data term + beta * L2 penalty on all weight matrices (the
  # biases are not penalized). Both terms are already scalars, so no further
  # reduction is needed.
  regularizer = tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2) + tf.nn.l2_loss(weights3)
  loss = cross_entropy + beta * regularizer

  # Optimizer using learning rate decay: start at 0.5 and multiply by 0.65
  # every 1000 steps (staircase). minimize() increments global_step.
  global_step = tf.Variable(0)  # count the number of steps taken.
  learning_rate = tf.train.exponential_decay(0.5, global_step, 1000,0.65,staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

  # Predictions for the training minibatch.
  train_prediction = tf.nn.softmax(logits)
  # Predictions for the validation and test data, through the same variables.
  lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset,weights1)+biases1)
  lay2_valid = tf.nn.relu(tf.matmul(lay1_valid,weights2)+biases2)
  valid_prediction = tf.nn.softmax(
    tf.matmul(lay2_valid, weights3) + biases3)
  lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset,weights1)+biases1)
  lay2_test = tf.nn.relu(tf.matmul(lay1_test,weights2)+biases2)
  test_prediction = tf.nn.softmax(tf.matmul(lay2_test, weights3) + biases3)

In [44]:
num_steps = 9001

with tf.Session(graph=graph) as session:
  session.run(tf.global_variables_initializer())
  print("Initialized")
  for step in range(num_steps):
    # Slide a window of batch_size rows through the (already randomized)
    # training data; the modulo wraps the start index so a slice never runs
    # past the end. Better randomization across epochs would be possible.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:offset + batch_size]
    batch_labels = train_labels[offset:offset + batch_size]
    # Map each placeholder node of the graph to the numpy array fed into it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    # One optimizer step; the loss and minibatch predictions come back as
    # numpy values from the same run.
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if step % 500 != 0:
      continue
    # Periodic progress report.
    print("Minibatch loss at step %d: %f" % (step, l))
    print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(
      session.run(valid_prediction), valid_labels))
  # Final held-out evaluation once training has finished.
  print("Test accuracy: %.1f%%" % accuracy(session.run(test_prediction), test_labels))

Initialized
Minibatch loss at step 0: 12.372956
Minibatch accuracy: 4.7%
Validation accuracy: 45.3%
Minibatch loss at step 500: 0.914920
Minibatch accuracy: 81.2%
Validation accuracy: 83.2%
Minibatch loss at step 1000: 0.694372
Minibatch accuracy: 85.9%
Validation accuracy: 83.8%
Minibatch loss at step 1500: 0.795597
Minibatch accuracy: 81.2%
Validation accuracy: 84.2%
Minibatch loss at step 2000: 0.789433
Minibatch accuracy: 80.5%
Validation accuracy: 84.3%
Minibatch loss at step 2500: 0.736802
Minibatch accuracy: 85.9%
Validation accuracy: 85.1%
Minibatch loss at step 3000: 0.696859
Minibatch accuracy: 85.2%
Validation accuracy: 85.2%
Minibatch loss at step 3500: 0.782833
Minibatch accuracy: 82.0%
Validation accuracy: 85.0%
Minibatch loss at step 4000: 0.736829
Minibatch accuracy: 84.4%
Validation accuracy: 85.4%
Minibatch loss at step 4500: 0.914090
Minibatch accuracy: 77.3%
Validation accuracy: 85.2%
Minibatch loss at step 5000: 0.594714
Minibatch accuracy: 89.8%
Validation accurac

# With 3 layers and Dropout

In [48]:
batch_size = 128
num_hidden_nodes1 = 1024
num_hidden_nodes2 = 256
num_hidden_nodes3 = 128
beta = 0.01  # weight of the L2 penalty in the total loss

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  # Dropout keep probability; it has no default, so it must be fed whenever
  # an op on the training path is run.
  keep_prob = tf.placeholder(tf.float32)

  # Variables. Each weight matrix is drawn with stddev sqrt(2 / fan_in),
  # where fan_in is the width of the layer feeding into it.
  weights1 = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_hidden_nodes1],
                        stddev=math.sqrt(2.0/(image_size*image_size))))
  biases1 = tf.Variable(tf.zeros([num_hidden_nodes1]))
  weights2 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes1, num_hidden_nodes2],
                        stddev=math.sqrt(2.0/num_hidden_nodes1)))
  biases2 = tf.Variable(tf.zeros([num_hidden_nodes2]))
  weights3 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes2, num_hidden_nodes3],
                        stddev=math.sqrt(2.0/num_hidden_nodes2)))
  biases3 = tf.Variable(tf.zeros([num_hidden_nodes3]))
  weights4 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes3, num_labels],
                        stddev=math.sqrt(2.0/num_hidden_nodes3)))
  biases4 = tf.Variable(tf.zeros([num_labels]))

  # Training computation: three ReLU hidden layers and a linear output layer.
  # Dropout is applied to the outputs of the first two hidden layers only;
  # the third hidden layer feeds the output layer directly.
  lay1_train = tf.nn.relu(tf.matmul(tf_train_dataset,weights1)+biases1)
  dropout1 = tf.nn.dropout(lay1_train,keep_prob)
  lay2_train = tf.nn.relu(tf.matmul(dropout1,weights2)+biases2)
  dropout2 = tf.nn.dropout(lay2_train,keep_prob)
  lay3_train = tf.nn.relu(tf.matmul(dropout2,weights3)+biases3)
  logits = tf.matmul(lay3_train, weights4) + biases4
  cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Total loss = data term + beta * L2 penalty on all weight matrices (the
  # biases are not penalized). Both terms are already scalars, so no further
  # reduction is needed.
  regularizer = (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2) +
                 tf.nn.l2_loss(weights3) + tf.nn.l2_loss(weights4))
  loss = cross_entropy + beta * regularizer

  # Optimizer using learning rate decay: start at 0.5 and multiply by 0.65
  # every 4000 steps (staircase). minimize() increments global_step.
  global_step = tf.Variable(0)  # count the number of steps taken.
  learning_rate = tf.train.exponential_decay(0.5, global_step, 4000,0.65,staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

  # Predictions for the training minibatch. These are computed from the
  # dropout-perturbed logits, so they depend on the fed keep_prob.
  train_prediction = tf.nn.softmax(logits)
  # Predictions for the validation and test data bypass the dropout ops
  # entirely, so evaluation is deterministic.
  lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset,weights1)+biases1)
  lay2_valid = tf.nn.relu(tf.matmul(lay1_valid,weights2)+biases2)
  lay3_valid = tf.nn.relu(tf.matmul(lay2_valid,weights3)+biases3)
  valid_prediction = tf.nn.softmax(
    tf.matmul(lay3_valid, weights4) + biases4)
  lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset,weights1)+biases1)
  lay2_test = tf.nn.relu(tf.matmul(lay1_test,weights2)+biases2)
  lay3_test = tf.nn.relu(tf.matmul(lay2_test,weights3)+biases3)
  test_prediction = tf.nn.softmax(tf.matmul(lay3_test, weights4) + biases4)

In [50]:
num_steps = 16001

with tf.Session(graph=graph) as session:
  session.run(tf.global_variables_initializer())
  print("Initialized")
  for step in range(num_steps):
    # Slide a window of batch_size rows through the (already randomized)
    # training data; the modulo wraps the start index so a slice never runs
    # past the end. Better randomization across epochs would be possible.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:offset + batch_size]
    batch_labels = train_labels[offset:offset + batch_size]
    # Map each placeholder node of the graph to the value fed into it; the
    # dropout keep probability is fed as 0.5 for every training step.
    feed_dict = {
      tf_train_dataset : batch_data,
      tf_train_labels : batch_labels,
      keep_prob : 0.5,
    }
    # One optimizer step; the loss and minibatch predictions come back as
    # numpy values from the same run.
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if step % 500 != 0:
      continue
    # Periodic progress report.
    print("Minibatch loss at step %d: %f" % (step, l))
    print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(
      session.run(valid_prediction), valid_labels))
  # Final held-out evaluation once training has finished.
  print("Test accuracy: %.1f%%" % accuracy(session.run(test_prediction), test_labels))

Initialized
Minibatch loss at step 0: 13.512499
Minibatch accuracy: 12.5%
Validation accuracy: 25.2%
Minibatch loss at step 500: 1.058594
Minibatch accuracy: 79.7%
Validation accuracy: 82.6%
Minibatch loss at step 1000: 0.869485
Minibatch accuracy: 85.2%
Validation accuracy: 83.2%
Minibatch loss at step 1500: 0.982234
Minibatch accuracy: 79.7%
Validation accuracy: 82.7%
Minibatch loss at step 2000: 1.027304
Minibatch accuracy: 75.8%
Validation accuracy: 80.7%
Minibatch loss at step 2500: 0.918671
Minibatch accuracy: 83.6%
Validation accuracy: 83.5%
Minibatch loss at step 3000: 0.887570
Minibatch accuracy: 83.6%
Validation accuracy: 83.2%
Minibatch loss at step 3500: 1.008856
Minibatch accuracy: 77.3%
Validation accuracy: 82.2%
Minibatch loss at step 4000: 0.995465
Minibatch accuracy: 78.9%
Validation accuracy: 83.5%
Minibatch loss at step 4500: 1.045768
Minibatch accuracy: 77.3%
Validation accuracy: 82.9%
Minibatch loss at step 5000: 0.728575
Minibatch accuracy: 89.8%
Validation accura