Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in _notmist.ipynb_.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [None]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [None]:
# Logistic regression

# copy from Lab2
train_subset = 10000

logreg_graph = tf.Graph()
with logreg_graph.as_default():
  # Input data.
  tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
  tf_train_labels = tf.constant(train_labels[:train_subset])
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  weights = tf.Variable(tf.truncated_normal([image_size * image_size, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  logits = tf.matmul(tf_train_dataset, weights) + biases
  beta = 0.005
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels) + beta * tf.nn.l2_loss(weights))
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
  test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

num_steps = 801

def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

with tf.Session(graph=logreg_graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  for step in range(num_steps):
    _, l, predictions = session.run([optimizer, loss, train_prediction])
    if (step % 100 == 0):
      print('Loss at step %d: %f' % (step, l))
      print('Training accuracy: %.1f%%' % accuracy(predictions, train_labels[:train_subset, :]))
      print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels))
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

In [None]:
#logistic regression with SGD

batch_size = 128

logreg_sgd_graph = tf.Graph()
with logreg_sgd_graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  logits = tf.matmul(tf_train_dataset, weights) + biases
  beta = 0.005
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels) + beta * tf.nn.l2_loss(weights))
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(
    tf.matmul(tf_valid_dataset, weights) + biases)
  test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

num_steps = 3001

with tf.Session(graph=logreg_sgd_graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

In [None]:
# Logistic Regression summary
# original logistic regression w/o sgd: 83.5%, w/ sgd: 86.0%
# regularized logistic regression w/o sgd: 88.0%, w/ sgd: 86.9%, beta = 0.05
# regularized logistic regression w/o sgd: 88.8%, w/ sgd: 88.6%, beta = 0.01
# regularized logistic regression w/o sgd: 88.3%, w/ sgd: 89.0%, beta = 0.005

In [None]:
# 1-layer hidden layer with regularization

# Build 1-hidden layer
# network:
# input[batch_size, image_size * image_size] => \
# w1[batch_size, 1024] => ReLu [batch_size, 1024] => w2[batch_size, 10] => softmax

batch_size = 128
relu_size = 1024

relu_graph = tf.Graph()

# hidden layer calculation (right before softmax)
def hidden_layer_calc(dataset, weights1, biases1, weights2, biases2):
    relu_input = tf.matmul(dataset, weights1) + biases1
    relu_output = tf.nn.relu(relu_input)
    logits = tf.matmul(relu_output, weights2) + biases2
    return logits

with relu_graph.as_default():
    # pack input data
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables
    # layer 1
    weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, relu_size]))
    biases1 = tf.Variable(tf.zeros([relu_size]))
    # layer 2
    weights2 = tf.Variable(tf.truncated_normal([relu_size, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))
    
    # Training computation.
    logits = hidden_layer_calc(tf_train_dataset, weights1, biases1, weights2, biases2)
    beta = .05
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels) + \
        beta * (tf.nn.l2_loss(weights1) / (image_size * image_size * relu_size)) + \
        beta * (tf.nn.l2_loss(weights2) / (relu_size * num_labels)))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(hidden_layer_calc(tf_valid_dataset, weights1, biases1, weights2, biases2))
    test_prediction  = tf.nn.softmax(hidden_layer_calc(tf_test_dataset, weights1, biases1, weights2, biases2))
    
    
# run it
num_steps = 3

with tf.Session(graph=relu_graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

In [None]:
# hidden layer summary (step == 6000)
# original hidden layer: Test accuracy: 90.3%
# hidden layer with beta = 0.05: Test accuracy: 84.0%
# hidden layer with beta = 0.01: Test accuracy: 89.6%, w/o weights dimension normalization
# hidden layer with beta = 1:    Test accuracy: 90.0%, w/  weights dimension normalization
# hidden layer with beta = 0.5:  Test accuracy: 89.8%, w/  weights dimension normalization
# hidden layer with beta = 0.1:  Test accuracy: 90.4%, w/  weights dimension normalization
# hidden layer with beta = 5:    Test accuracy: 89.9%, w/  weights dimension normalization
# hidden layer with beta = 0.05: Test accuracy: 89.9%, w/  weights dimension normalization


---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [None]:
# Change the num_steps to 10 and Test accuracy: 81.5%
# Change the num_steps to 3  and Test accuracy: 42.6%


---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [None]:
# 1-layer hidden layer with dropout

# Build 1-hidden layer
# network:
# input[batch_size, image_size * image_size] => \
# w1[batch_size, 1024] => ReLu [batch_size, 1024] => w2[batch_size, 10] => softmax

batch_size = 128
relu_size = 1024

relu_graph = tf.Graph()

# hidden layer calculation (right before softmax)
def hidden_layer_calc(dataset, weights1, biases1, weights2, biases2, keep_prob):
    relu_input = tf.matmul(dataset, weights1) + biases1
    relu_output = tf.nn.relu(relu_input)
    relu_do_output = tf.nn.dropout(relu_output, keep_prob)
    logits = tf.matmul(relu_do_output, weights2) + biases2
    return logits

with relu_graph.as_default():
    # pack input data
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    #keep_prob = tf.placeholder(tf.float32)
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables
    # layer 1
    weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, relu_size]))
    biases1 = tf.Variable(tf.zeros([relu_size]))
    # layer 2
    weights2 = tf.Variable(tf.truncated_normal([relu_size, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))
    
    # Training computation.
    logits = hidden_layer_calc(tf_train_dataset, weights1, biases1, weights2, biases2, 0.5) # keep_prop = .5 for training
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(hidden_layer_calc(tf_valid_dataset, weights1, biases1, weights2, biases2, 1.))
    test_prediction  = tf.nn.softmax(hidden_layer_calc(tf_test_dataset, weights1, biases1, weights2, biases2, 1.))

# run it
num_steps = 6001

with tf.Session(graph=relu_graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

In [None]:
# for  3 batches, normal regularization: 38.5%, drop out: 39.5%
# for  6 batches, normal regularization: 72.7%, drop out: 75.6%
# for 30 batches, normal regularization: 81.1%, drop out: 82.1%

---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [None]:
# Multilayer design (Trial 1)
# [weights1,biases1] => relu1 => [weights2,biases2] => relu2 => [weights3,biases3]
batch_size = 128
relu_size = [1024, 10]

multi_layer_graph = tf.Graph()

# hidden layer calculation (right before softmax)
def multi_layer_calc(dataset, weights1, biases1, weights2, biases2, weights3, biases3):
    relu_input1 = tf.matmul(dataset,      weights1) + biases1
    relu_output1= tf.nn.relu(relu_input1)
    relu_input2 = tf.matmul(relu_output1, weights2) + biases2
    relu_output2= tf.nn.relu(relu_input2)
    logits      = tf.matmul(relu_output1, weights3) + biases3
    return logits

with multi_layer_graph.as_default():
    # pack input data
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels  = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    #keep_prob = tf.placeholder(tf.float32)
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset  = tf.constant(test_dataset)
    
    # Variables
    weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, relu_size[0]]))
    biases1  = tf.Variable(tf.zeros([relu_size[0]]))
    weights2 = tf.Variable(tf.truncated_normal([relu_size[0], relu_size[1]]))
    biases2  = tf.Variable(tf.zeros([relu_size[1]]))
    weights3 = tf.Variable(tf.truncated_normal([relu_size[0], num_labels]))
    biases3  = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = multi_layer_calc(tf_train_dataset, weights1, biases1, weights2, biases2, weights3, biases3)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    #global_step = tf.Variable(0)  # count the number of steps taken.
    #learning_rate = tf.train.exponential_decay(0.5, step, 100000, 0.96, staircase=True)
    #optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
  
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(multi_layer_calc(tf_valid_dataset, weights1, biases1, weights2, biases2, weights3, biases3))
    test_prediction  = tf.nn.softmax(multi_layer_calc(tf_test_dataset, weights1, biases1, weights2, biases2, weights3, biases3))

# run it
num_steps = 6001

with tf.Session(graph=multi_layer_graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data   = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

In [7]:
#Multiple layer design2
batch_size = 50

multi_layer_graph2 = tf.Graph()

def conv2d(x, W, in_name):
  return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME', use_cudnn_on_gpu=False, name = in_name)

def max_pool_2x2(x):
  return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

def multi_layer_calc(dataset, weights1, biases1, weights2, biases2, W_fc1, b_fc1, W_fc2, b_fc2, keep_prob):
    dataset_image = tf.reshape(dataset, [-1,28,28,1])
    # layer1
    h_conv1 = tf.nn.relu(conv2d(dataset_image, weights1, "Layer1") + biases1)
    h_pool1 = max_pool_2x2(h_conv1)
    # layer2
    h_conv2 = tf.nn.relu(conv2d(h_pool1, weights2, "Layer2") + biases2)
    h_pool2 = max_pool_2x2(h_conv2)
    # full connected
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
    # read out layer
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    logits = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
    return logits


with multi_layer_graph2.as_default():
    # pack input data
    tf_train_dataset = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
    tf_train_labels  = tf.placeholder(tf.float32, shape=(None, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset  = tf.constant(test_dataset)
    keep_prob = tf.placeholder(tf.float32)

    #layer1
    W1 = tf.Variable(tf.truncated_normal([5, 5, 1, 32]))
    b1 = tf.Variable(tf.constant(0.1, shape=[32]))
    W2 = tf.Variable(tf.truncated_normal([5, 5, 32, 64]))
    b2 = tf.Variable(tf.constant(0.1, shape=[64]))
    W_fc1 = tf.Variable(tf.truncated_normal([7 * 7 * 64, 1024]))
    b_fc1 = tf.Variable(tf.constant(0.1, shape=[1024]))
    W_fc2 = tf.Variable(tf.truncated_normal([1024, 10]))
    b_fc2 = tf.Variable(tf.constant(0.1, shape=[10]))

    y_conv = tf.nn.softmax(multi_layer_calc(tf_train_dataset, W1, b1, W2, b2, W_fc1, b_fc1, W_fc2, b_fc2, keep_prob))
    loss = -tf.reduce_sum(tf_train_labels*tf.log(y_conv))
    optimizer = tf.train.AdamOptimizer(1e-4).minimize(loss)
    correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(tf_train_labels,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    train_prediction = tf.argmax(y_conv, 1)
    valid_prediction = \
        tf.argmax(tf.nn.softmax(multi_layer_calc(tf_valid_dataset, W1, b1, W2, b2, W_fc1, b_fc1, W_fc2, b_fc2, keep_prob)), 1)
    test_prediction  = \
        tf.argmax(tf.nn.softmax(multi_layer_calc(tf_test_dataset, W1, b1, W2, b2, W_fc1, b_fc1, W_fc2, b_fc2, keep_prob)), 1)

num_steps = 6001

with tf.Session(graph=multi_layer_graph2) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data   = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 0.5}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      #print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      #print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: nan


InvalidArgumentError: ReluGrad input is not finite. : Tensor had NaN values
	 [[Node: gradients/Relu_grad/Relu/CheckNumerics = CheckNumerics[T=DT_FLOAT, message="ReluGrad input is not finite.", _device="/job:localhost/replica:0/task:0/cpu:0"](add)]]
Caused by op u'gradients/Relu_grad/Relu/CheckNumerics', defined at:
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/traitlets/config/application.py", line 592, in launch_instance
    app.start()
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 403, in start
    ioloop.IOLoop.instance().start()
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 151, in start
    super(ZMQIOLoop, self).start()
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tornado/ioloop.py", line 883, in start
    handler_func(fd_obj, events)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 433, in _handle_events
    self._handle_recv()
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 465, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 407, in _run_callback
    callback(*args, **kwargs)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 260, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 212, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 370, in execute_request
    user_expressions, allow_stdin)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 175, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2902, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 3006, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 3066, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-6b3da7bb8070>", line 49, in <module>
    optimizer = tf.train.AdamOptimizer(1e-4).minimize(loss)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/training/optimizer.py", line 186, in minimize
    aggregation_method=aggregation_method)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/training/optimizer.py", line 232, in compute_gradients
    aggregation_method=aggregation_method)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/gradients.py", line 445, in gradients
    in_grads = _AsList(grad_fn(op_wrapper, *out_grads))
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/nn_grad.py", line 126, in _ReluGrad
    t = _VerifyTensor(op.inputs[0], op.name, "ReluGrad input is not finite.")
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/nn_grad.py", line 119, in _VerifyTensor
    verify_input = array_ops.check_numerics(t, message=msg)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 48, in check_numerics
    name=name)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/op_def_library.py", line 664, in apply_op
    op_def=op_def)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1834, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1043, in __init__
    self._traceback = _extract_stack()

...which was originally created as op u'Relu', defined at:
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
[elided 17 identical lines from previous traceback]
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 3066, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-6b3da7bb8070>", line 47, in <module>
    y_conv = tf.nn.softmax(multi_layer_calc(tf_train_dataset, W1, b1, W2, b2, W_fc1, b_fc1, W_fc2, b_fc2, keep_prob))
  File "<ipython-input-7-6b3da7bb8070>", line 15, in multi_layer_calc
    h_conv1 = tf.nn.relu(conv2d(dataset_image, weights1, "Layer1") + biases1)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 547, in relu
    return _op_def_lib.apply_op("Relu", features=features, name=name)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/op_def_library.py", line 664, in apply_op
    op_def=op_def)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1834, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/zhaoyiwei/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1043, in __init__
    self._traceback = _extract_stack()


In [9]:
batch_size = 50

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

def conv2d(x, W, in_name):
  return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
  return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

dataset = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
labels  = tf.placeholder(tf.float32, shape=(None, num_labels))
#tf_valid_dataset = tf.constant(valid_dataset)
#tf_test_dataset  = tf.constant(test_dataset)
keep_prob = tf.placeholder(tf.float32)

W1 = tf.Variable(tf.truncated_normal([5, 5, 1, 32], stddev=0.1))
b1 = tf.Variable(tf.constant(0.1, shape=[32]))
W2 = tf.Variable(tf.truncated_normal([5, 5, 32, 64], stddev=0.1))
b2 = tf.Variable(tf.constant(0.1, shape=[64]))
W_fc1 = tf.Variable(tf.truncated_normal([7 * 7 * 64, 1024], stddev=0.1))
b_fc1 = tf.Variable(tf.constant(0.1, shape=[1024]))
W_fc2 = tf.Variable(tf.truncated_normal([1024, 10], stddev=0.1))
b_fc2 = tf.Variable(tf.constant(0.1, shape=[10]))

dataset_image = tf.reshape(dataset, [-1,28,28,1])
# layer1
h_conv1 = tf.nn.relu(conv2d(dataset_image, W1, "Layer1") + b1)
h_pool1 = max_pool_2x2(h_conv1)
# layer2
h_conv2 = tf.nn.relu(conv2d(h_pool1, W2, "Layer2") + b2)
h_pool2 = max_pool_2x2(h_conv2)
# full connected
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
# read out layer
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)

cross_entropy = -tf.reduce_sum(labels*tf.log(y_conv))
train_step_m = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
#train_step_m = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(labels,1))
accuracy_m = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

sess = tf.Session()
sess.run(tf.initialize_all_variables())

num_steps = 20001
with sess.as_default():
    for i in range(num_steps):
      #batch = mnist.train.next_batch(50)
      offset = (i * batch_size) % (train_labels.shape[0] - batch_size)
      # Generate a minibatch.
      batch_data   = train_dataset[offset:(offset + batch_size), :]
      batch_labels = train_labels[offset:(offset + batch_size), :]
      if i%100 == 0:
        train_accuracy = accuracy_m.eval(feed_dict={dataset: batch_data, labels: batch_labels, keep_prob: 1.0})
        print("offset %d step %d, training accuracy %g" % (offset, i, train_accuracy))
      train_step_m.run(feed_dict={dataset: batch_data, labels: batch_labels, keep_prob: 0.5})

    print("test accuracy %g" % \
          accuracy_m.eval(feed_dict={dataset: test_dataset, labels: test_labels, keep_prob: 1.0}))

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)
offset 0 step 0, training accuracy 0.14
offset 5000 step 100, training accuracy 0.72
offset 10000 step 200, training accuracy 0.8
offset 15000 step 300, training accuracy 0.84
offset 20000 step 400, training accuracy 0.86
offset 25000 step 500, training accuracy 0.82
offset 30000 step 600, training accuracy 0.8
offset 35000 step 700, training accuracy 0.84
offset 40000 step 800, training accuracy 0.8
offset 45000 step 900, training accuracy 0.84
offset 50000 step 1000, training accuracy 0.78
offset 55000 step 1100, training accuracy 0.86
offset 60000 step 1200, training accuracy 0.8
offset 65000 step 1300, training accuracy 0.88
offset 70000 step 1400, training accuracy 0.84
offset 75000 step 1500, training accuracy 0.9
offset 80000 step 1600, training accuracy 0.74
offset 85000 step 1700, training accuracy 0.88
offset 90000 step 1800, training accuracy 0.82
offset 95000 st