In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

pickle_file = 'notMNIST.pickle'

# NOTE(review): unpickling can execute arbitrary code, so `pickle_file` should
# only ever point at an archive that is trusted.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

# Pull the three (dataset, labels) splits out of the loaded dictionary.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

# Report the shapes as loaded, before any reshaping.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

image_size = 28   # images are image_size x image_size pixels
num_labels = 10   # number of target classes
num_channels = 1 # grayscale

import numpy as np

def reformat(dataset, labels):
  """Reshape images for convolution and one-hot encode the labels.

  Args:
    dataset: array whose elements can be viewed as
      (-1, image_size, image_size, num_channels).
    labels: 1-D array of class indices.

  Returns:
    A tuple (images, one_hot): `images` is float32 with shape
    (N, image_size, image_size, num_channels); `one_hot` is float32 with
    shape (N, num_labels), holding 1.0 in the column equal to the label.
  """
  images = dataset.reshape((-1, image_size, image_size, num_channels))
  images = images.astype(np.float32)
  # Broadcasting an (N, 1) column of labels against the row of class ids
  # yields an (N, num_labels) boolean mask with one True per row.
  class_ids = np.arange(num_labels)
  one_hot = (labels[:, None] == class_ids).astype(np.float32)
  return images, one_hot
# Run every split through reformat() and rebind the original names to the
# converted arrays. A generator expression keeps the loop variables out of
# the notebook namespace.
((train_dataset, train_labels),
 (valid_dataset, valid_labels),
 (test_dataset, test_labels)) = (
    reformat(images, targets)
    for images, targets in ((train_dataset, train_labels),
                            (valid_dataset, valid_labels),
                            (test_dataset, test_labels)))

# Report the shapes again, now after reshaping and one-hot encoding.
for split_name, split_data, split_labels in (
    ('Training set', train_dataset, train_labels),
    ('Validation set', valid_dataset, valid_labels),
    ('Test set', test_dataset, test_labels)):
  print(split_name, split_data.shape, split_labels.shape)
del split_name, split_data, split_labels  # do not leak loop temporaries


def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max class matches the labels.

  Args:
    predictions: 2-D array of per-class scores, one row per example.
    labels: 2-D array with the same number of rows; the arg-max of each row
      is taken as the true class.

  Returns:
    100 * (number of matching rows) / (number of rows in `predictions`).
  """
  predicted_classes = np.argmax(predictions, 1)
  true_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)
Training set (200000, 28, 28, 1) (200000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1) (10000, 10)


In [19]:
batch_size = 16
patch_size = 5   # side length of the square convolution kernels
depth = 16       # output channels of both convolutions
num_hidden = 64  # units in the fully connected hidden layer

graph = tf.Graph()

with graph.as_default():

  # Input data. Training minibatches are fed through placeholders at run time;
  # the validation and test sets are embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Side length of the feature map that reaches the first fully connected
  # layer. model() applies two stride-2 'SAME' max-pools, each producing
  # ceil(size / 2) rows and columns, i.e. ceil(image_size / 4) overall; the
  # stride-1 'SAME' convolutions leave the size unchanged. Computing the
  # ceiling explicitly keeps layer3_weights consistent with the flattened
  # tensor for image sizes that are not multiples of 4 (the previous
  # `image_size // 4 * image_size // 4 * depth` only matched by coincidence
  # of operator precedence and divisibility).
  pooled_size = (image_size + 3) // 4

  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  layer3_weights = tf.Variable(tf.truncated_normal(
      [pooled_size * pooled_size * depth, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data):
    """Build the forward pass for `data` and return the logits tensor.

    Two (2x2 max-pool -> 5x5 convolution -> ReLU) stages are followed by a
    fully connected ReLU layer and a linear output layer. All calls share
    the layer variables defined above.

    Args:
      data: 4-D float tensor with a statically known shape; the first
        dimension is used as the row count when flattening.

    Returns:
      2-D tensor of unnormalised class scores with num_labels columns.
    """
    pool = tf.nn.max_pool(data, [1,2,2,1], [1,2,2,1], padding='SAME')
    conv = tf.nn.conv2d(pool, layer1_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer1_biases)
    pool = tf.nn.max_pool(hidden, [1,2,2,1], [1,2,2,1], padding='SAME')
    conv = tf.nn.conv2d(pool, layer2_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer2_biases)
    # Flatten each example's feature map into one row for the matmul.
    shape = hidden.get_shape().as_list()
    reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    return tf.matmul(hidden, layer4_weights) + layer4_biases

  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [20]:
num_steps = 1001

with tf.Session(graph=graph) as session:
  session.run(tf.global_variables_initializer())
  print('Initialized')
  for step in range(num_steps):
    # Slide a batch_size-wide window over the training set, wrapping around
    # before the window would run past the end.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_end = offset + batch_size
    batch_data = train_dataset[offset:batch_end]
    batch_labels = train_labels[offset:batch_end]
    feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
    # One optimisation step; the loss and the batch predictions are fetched
    # in the same run call.
    _, l, predictions = session.run(
        [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if step % 50 != 0:
      continue
    # Progress report every 50 steps.
    print('Minibatch loss at step %d: %f' % (step, l))
    print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
    print('Validation accuracy: %.1f%%' % accuracy(
        session.run(valid_prediction), valid_labels))
  print('Test accuracy: %.1f%%' % accuracy(
      session.run(test_prediction), test_labels))

Initialized
Minibatch loss at step 0: 2.400999
Minibatch accuracy: 18.8%
Validation accuracy: 10.0%
Minibatch loss at step 50: 1.770602
Minibatch accuracy: 43.8%
Validation accuracy: 58.7%
Minibatch loss at step 100: 1.071033
Minibatch accuracy: 68.8%
Validation accuracy: 75.2%
Minibatch loss at step 150: 1.126480
Minibatch accuracy: 75.0%
Validation accuracy: 76.5%
Minibatch loss at step 200: 0.790297
Minibatch accuracy: 75.0%
Validation accuracy: 78.5%
Minibatch loss at step 250: 1.386748
Minibatch accuracy: 68.8%
Validation accuracy: 79.8%
Minibatch loss at step 300: 0.908683
Minibatch accuracy: 68.8%
Validation accuracy: 79.3%
Minibatch loss at step 350: 0.893628
Minibatch accuracy: 75.0%
Validation accuracy: 80.5%
Minibatch loss at step 400: 0.816830
Minibatch accuracy: 68.8%
Validation accuracy: 80.7%
Minibatch loss at step 450: 0.349745
Minibatch accuracy: 87.5%
Validation accuracy: 82.0%
Minibatch loss at step 500: 0.455329
Minibatch accuracy: 81.2%
Validation accuracy: 81.6%
M

In [38]:
"""
Our best-performing model so far (97.0%) used 4 hidden layers with smartly optimized weights.
Here we replace its first fully connected layers with two convolutions (SAME padding, stride 2).

After training 95001 steps w/ batch size of 16, test score was 95.2%
"""

# batch_size = 128  (earlier setting; superseded by batch_size = 16 below)

# Layer widths of the earlier fully connected network. They are not used by
# the convolutional graph built in this cell.
layer1_size = 1024
layer2_size = 1024
layer3_size = 305
layer4_size = 75
# note, logits_size == num_labels

batch_size = 16
patch_size = 5   # side length of the square convolution kernels
depth = 16       # output channels of both convolutions
num_hidden = 64  # units in the fully connected hidden layer


# Standard deviations of the truncated-normal weight initialisers.
layer1_weights_stdev = 0.0517
layer2_weights_stdev = 0.0441
layer3_weights_stdev = 0.0441
layer4_weights_stdev = 0.0809
logits_weights_stdev = 0.1632  # not used in this cell

regularization_meta = 0.03  # not used in this cell: no L2 term is added to the loss
keepprob = 0.75  # keep probability passed to tf.nn.dropout on the training path

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # The previous fully connected stack
  # (image_size * image_size -> 1024 -> 1024 -> 305 -> 75 -> num_labels)
  # is replaced by the layers below.

  # Side length of the feature map entering the fully connected layer: each
  # of the two stride-2 'SAME' convolutions yields ceil(size / 2), so together
  # they yield ceil(image_size / 4).
  conv_out_size = (image_size + 3) // 4
  flat_size = conv_out_size * conv_out_size * depth

  # New layer weights. Shapes are derived from the hyperparameters above so
  # they always agree with the bias shapes below (previously they were the
  # literals [5, 5, 1, 16], [5, 5, 16, 16], [784, 64] and [64, 10]).
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=layer1_weights_stdev))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=layer2_weights_stdev))
  layer3_weights = tf.Variable(tf.truncated_normal(
      [flat_size, num_hidden], stddev=layer3_weights_stdev))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=layer4_weights_stdev))

  # New layer biases
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))


  def apply_layers(dataset, use_dropout=False):
    """Build the forward pass for `dataset` and return the logits tensor.

    Two stride-2 convolutions with ReLU are followed by a fully connected
    ReLU layer and a linear output layer. All calls share the layer
    variables defined above.

    Args:
      dataset: 4-D float tensor with a statically known shape; the first
        dimension is used as the row count when flattening.
      use_dropout: when True, tf.nn.dropout with `keepprob` is applied after
        each of the three ReLU layers.

    Returns:
      2-D tensor of unnormalised class scores with num_labels columns.
    """
    conv = tf.nn.conv2d(dataset, layer1_weights, [1, 2, 2, 1], padding='SAME')
    layer = tf.nn.relu(conv + layer1_biases)
    if use_dropout:
      layer = tf.nn.dropout(layer, keepprob)

    conv = tf.nn.conv2d(layer, layer2_weights, [1, 2, 2, 1], padding='SAME')
    layer = tf.nn.relu(conv + layer2_biases)
    if use_dropout:
      layer = tf.nn.dropout(layer, keepprob)

    # Flatten each example's feature map into one row for the matmul.
    shape = layer.get_shape().as_list()
    reshape = tf.reshape(layer, [shape[0], shape[1] * shape[2] * shape[3]])
    layer = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    if use_dropout:
      layer = tf.nn.dropout(layer, keepprob)

    # one fully connected layer at the end
    logits = tf.matmul(layer, layer4_weights) + layer4_biases
    return logits

  # Training computation. Dropout is enabled on the training path only.
  train_logits = apply_layers(tf_train_dataset, True)
  valid_logits = apply_layers(tf_valid_dataset)
  test_logits = apply_layers(tf_test_dataset)

  train_loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=train_logits))

  # Optimizer. The learning rate starts at 0.3 and is multiplied by 0.86
  # every 3500 steps (staircase decay).
  global_step = tf.Variable(0, trainable=False)  # count the number of steps taken.
  learning_rate = tf.train.exponential_decay(0.3, global_step, 3500, 0.86, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(train_loss, global_step=global_step)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(train_logits)
  valid_prediction = tf.nn.softmax(valid_logits)
  test_prediction = tf.nn.softmax(test_logits)

In [39]:
num_steps = 95001


with tf.Session(graph=graph) as session:
  session.run(tf.global_variables_initializer())
  print("Initialized")

  for step in range(num_steps):
    # Walk through the (already shuffled) training data in consecutive
    # windows of batch_size rows, wrapping around near the end.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_end = offset + batch_size
    batch_data = train_dataset[offset:batch_end]
    batch_labels = train_labels[offset:batch_end]
    # Map each placeholder of the graph to the numpy minibatch it receives.
    feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
    # One optimisation step; the loss and the batch predictions are fetched
    # in the same run call.
    _, l, predictions = session.run(
        [optimizer, train_loss, train_prediction], feed_dict=feed_dict)
    if step % 1000 != 0:
      continue
    # Progress report every 1000 steps.
    print("Minibatch loss at step %d: %f" % (step, l))
    print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(
        session.run(valid_prediction), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(
      session.run(test_prediction), test_labels))

Initialized
Minibatch loss at step 0: 2.583692
Minibatch accuracy: 12.5%
Validation accuracy: 10.2%
Minibatch loss at step 1000: 0.808243
Minibatch accuracy: 81.2%
Validation accuracy: 77.7%
Minibatch loss at step 2000: 0.589585
Minibatch accuracy: 75.0%


KeyboardInterrupt: 

In [45]:
"""
Same idea as the model above, but with three convolutional layers (each preceded by 3x3, stride-2 max pooling) followed by two fully connected layers.


"""

batch_size = 16

# Architecture of this cell's network. Weight and bias shapes below are all
# derived from these values so they cannot drift apart.
kernel_size = 3                                   # side length of the convolution kernels
conv1_depth, conv2_depth, conv3_depth = 4, 16, 49  # output channels per convolution
hidden_units = 64                                 # units in the fully connected hidden layer

# Standard deviations of the truncated-normal weight initialisers.
layer1_weights_stdev = 0.0517
layer2_weights_stdev = 0.0441
layer3_weights_stdev = 0.0441
layer4_weights_stdev = 0.0809
layer5_weights_stdev = 0.1632

regularization_meta = 0.03  # not used in this cell: no L2 term is added to the loss
keepprob = 0.75  # keep probability passed to tf.nn.dropout on the training path


graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Side length of the feature map entering the fully connected layer: each
  # of the three stride-2 'SAME' max-pools yields ceil(size / 2), so together
  # they yield ceil(image_size / 8); the stride-1 'SAME' convolutions keep
  # the size. For image_size = 28 this is 4, i.e. 4 * 4 * 49 = 784 inputs.
  pooled_size = (image_size + 7) // 8
  flat_size = pooled_size * pooled_size * conv3_depth

  # New layer weights
  layer1_weights = tf.Variable(tf.truncated_normal(
      [kernel_size, kernel_size, num_channels, conv1_depth], stddev=layer1_weights_stdev))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [kernel_size, kernel_size, conv1_depth, conv2_depth], stddev=layer2_weights_stdev))
  layer3_weights = tf.Variable(tf.truncated_normal(
      [kernel_size, kernel_size, conv2_depth, conv3_depth], stddev=layer3_weights_stdev))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [flat_size, hidden_units], stddev=layer4_weights_stdev))
  layer5_weights = tf.Variable(tf.truncated_normal(
      [hidden_units, num_labels], stddev=layer5_weights_stdev))

  # New layer biases
  layer1_biases = tf.Variable(tf.zeros([conv1_depth]))
  layer2_biases = tf.Variable(tf.zeros([conv2_depth]))
  layer3_biases = tf.Variable(tf.zeros([conv3_depth]))
  layer4_biases = tf.Variable(tf.zeros([hidden_units]))
  layer5_biases = tf.Variable(tf.zeros([num_labels]))

  # (weights, biases) of the convolutional stages, in application order.
  conv_stages = [
      (layer1_weights, layer1_biases),
      (layer2_weights, layer2_biases),
      (layer3_weights, layer3_biases),
  ]

  def apply_layers(dataset, use_dropout=False):
    """Build the forward pass for `dataset` and return the logits tensor.

    Three (3x3 stride-2 max-pool -> convolution -> ReLU) stages are followed
    by a fully connected ReLU layer and a linear output layer. All calls
    share the layer variables defined above.

    Args:
      dataset: 4-D float tensor with a statically known shape; the first
        dimension is used as the row count when flattening.
      use_dropout: when True, tf.nn.dropout with `keepprob` is applied after
        each of the four ReLU layers.

    Returns:
      2-D tensor of unnormalised class scores with num_labels columns.
    """
    hidden = dataset
    for weights, biases in conv_stages:
      pool = tf.nn.max_pool(hidden, [1,3,3,1], [1,2,2,1], padding='SAME')
      conv = tf.nn.conv2d(pool, weights, [1,1,1,1], padding='SAME')
      hidden = tf.nn.relu(conv + biases)
      if use_dropout:
        hidden = tf.nn.dropout(hidden, keepprob)

    # Flatten each example's feature map into one row for the matmul.
    shape = hidden.get_shape().as_list()
    reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer4_weights) + layer4_biases)
    if use_dropout:
      hidden = tf.nn.dropout(hidden, keepprob)

    # one fully connected layer at the end
    logits = tf.matmul(hidden, layer5_weights) + layer5_biases
    return logits

  # Training computation. Dropout is enabled on the training path only.
  train_logits = apply_layers(tf_train_dataset, True)
  valid_logits = apply_layers(tf_valid_dataset)
  test_logits = apply_layers(tf_test_dataset)

  train_loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=train_logits))

  # Optimizer. The learning rate starts at 0.3 and is multiplied by 0.86
  # every 3500 steps (staircase decay).
  # NOTE(review): the run recorded in this notebook for this graph dropped to
  # chance-level validation accuracy (10.0%) from step 3000 onwards. The 0.3
  # starting rate is a plausible cause -- TODO confirm by retrying with a
  # smaller rate.
  global_step = tf.Variable(0, trainable=False)  # count the number of steps taken.
  learning_rate = tf.train.exponential_decay(0.3, global_step, 3500, 0.86, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(train_loss, global_step=global_step)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(train_logits)
  valid_prediction = tf.nn.softmax(valid_logits)
  test_prediction = tf.nn.softmax(test_logits)

In [46]:
num_steps = 95001


with tf.Session(graph=graph) as session:
  session.run(tf.global_variables_initializer())
  print("Initialized")

  for step in range(num_steps):
    # Walk through the (already shuffled) training data in consecutive
    # windows of batch_size rows, wrapping around near the end.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_end = offset + batch_size
    batch_data = train_dataset[offset:batch_end]
    batch_labels = train_labels[offset:batch_end]
    # Map each placeholder of the graph to the numpy minibatch it receives.
    feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
    # One optimisation step; the loss and the batch predictions are fetched
    # in the same run call.
    _, l, predictions = session.run(
        [optimizer, train_loss, train_prediction], feed_dict=feed_dict)
    if step % 1000 != 0:
      continue
    # Progress report every 1000 steps.
    print("Minibatch loss at step %d: %f" % (step, l))
    print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(
        session.run(valid_prediction), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(
      session.run(test_prediction), test_labels))

Initialized
Minibatch loss at step 0: 2.306716
Minibatch accuracy: 0.0%
Validation accuracy: 10.0%
Minibatch loss at step 1000: 0.973636
Minibatch accuracy: 75.0%
Validation accuracy: 72.5%
Minibatch loss at step 2000: 1.153676
Minibatch accuracy: 62.5%
Validation accuracy: 64.9%
Minibatch loss at step 3000: 2.284338
Minibatch accuracy: 18.8%
Validation accuracy: 10.0%
Minibatch loss at step 4000: 2.326447
Minibatch accuracy: 12.5%
Validation accuracy: 10.0%
Minibatch loss at step 5000: 2.309686
Minibatch accuracy: 0.0%
Validation accuracy: 10.0%
Minibatch loss at step 6000: 2.263053
Minibatch accuracy: 25.0%
Validation accuracy: 10.0%
Minibatch loss at step 7000: 2.311858
Minibatch accuracy: 12.5%
Validation accuracy: 10.0%
Minibatch loss at step 8000: 2.279373
Minibatch accuracy: 25.0%
Validation accuracy: 10.0%
Minibatch loss at step 9000: 2.284620
Minibatch accuracy: 0.0%
Validation accuracy: 10.0%


KeyboardInterrupt: 