# Deep Learning
### Assignment 3
Previously in 2_fullyconnected.ipynb, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [12]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in 1_notmnist.ipynb.

In [13]:
pickle_file = 'notMNIST.pickle'

# Read the pickled dictionary, then unpack the six arrays it stores.
# NOTE(review): pickle.load can execute arbitrary code; only load a pickle
# file that you generated yourself.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

# Report the shape of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [14]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
    """Flatten the images and one-hot encode the labels.

    Args:
        dataset: array reshaped to rows of image_size * image_size values.
        labels: 1-D array of integer class ids.

    Returns:
        (flat, one_hot): two float32 arrays; `flat` has
        image_size * image_size columns and `one_hot` has num_labels columns.
    """
    n_pixels = image_size * image_size
    flat = dataset.reshape((-1, n_pixels)).astype(np.float32)
    # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
    class_ids = np.arange(num_labels)
    one_hot = (class_ids == labels[:, None]).astype(np.float32)
    return flat, one_hot
# Reformat every split in place (same variable names are reused downstream).
# NOTE: this cell is not idempotent -- re-running it without reloading the
# pickle would apply `reformat` to labels that are already one-hot encoded.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Print the new shape of each split.
for split_name, split_data, split_labels in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, split_data.shape, split_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [15]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max matches between the two arrays.

    Args:
        predictions: 2-D array of per-class scores, one row per example.
        labels: 2-D array with the same number of rows (e.g. one-hot labels).

    Returns:
        Percentage (0-100) of rows where both arrays peak at the same column.
    """
    hits = np.argmax(predictions, 1) == np.argmax(labels, 1)
    n_examples = predictions.shape[0]
    return 100.0 * np.sum(hits) / n_examples

# Problem 1
Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor t using nn.l2_loss(t). The right amount of regularization should improve your validation / test accuracy.

-----------------
### logistic regression with l2 loss function
regularizing with beta = 0.01

In [16]:
# Multinomial logistic regression with an L2 penalty on the weights,
# trained by full-batch gradient descent on a fixed subset of the data.
train_subset = 10000
beta = 0.01  # L2 regularization strength

graph = tf.Graph()
with graph.as_default():

    # Input data: every split is baked into the graph as a constant.
    tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
    tf_train_labels = tf.constant(train_labels[:train_subset])
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables
    weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(tf_train_dataset, weights) + biases

    # Loss = mean cross-entropy + beta * L2 penalty on the weights.
    # Keyword arguments are used because newer TensorFlow releases reject
    # positional (logits, labels) arguments for this op.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
    loss = cross_entropy + beta * tf.nn.l2_loss(weights)

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

In [17]:
num_steps = 801

# `accuracy` is defined once in an earlier cell; the duplicate definition that
# used to live here silently shadowed it and has been removed.
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    for step in range(num_steps):
        # One full-batch gradient step; also fetch the loss and the
        # training-set predictions for reporting.
        _, l, predictions = session.run([optimizer, loss, train_prediction])

        if (step % 100 == 0):
            print('Loss at step %d: %f' % (step, l))
            print('Training accuracy: %.1f%%' % accuracy(
                predictions, train_labels[:train_subset, :]))
            print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels))

    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Loss at step 0: 49.704948
Training accuracy: 11.1%
Validation accuracy: 12.4%
Loss at step 100: 11.658316
Training accuracy: 73.6%
Validation accuracy: 71.9%
Loss at step 200: 4.405949
Training accuracy: 79.2%
Validation accuracy: 76.8%
Loss at step 300: 1.954034
Training accuracy: 82.7%
Validation accuracy: 79.7%
Loss at step 400: 1.120747
Training accuracy: 84.0%
Validation accuracy: 81.3%
Loss at step 500: 0.833583
Training accuracy: 84.5%
Validation accuracy: 81.8%
Loss at step 600: 0.733426
Training accuracy: 84.8%
Validation accuracy: 82.0%
Loss at step 700: 0.698172
Training accuracy: 84.9%
Validation accuracy: 82.0%
Loss at step 800: 0.685667
Training accuracy: 84.9%
Validation accuracy: 82.0%
Test accuracy: 88.9%


### neural network with l2 loss function

In [32]:
n_hidden = 1024
L2_weight = 0.5e-3


def forward(tf_X):
    """Build a one-hidden-layer ReLU network on top of `tf_X`.

    New variables are created on every call, in the graph that is current at
    call time.

    Args:
        tf_X: input tensor; each row is expected to hold
            image_size * image_size values.

    Returns:
        (z12, l2): the output-layer logits and the sum of the L2 losses of
        the two weight matrices (biases are not penalized).
    """
    n_inputs = image_size * image_size

    # Hidden layer: affine transform followed by ReLU.
    with tf.name_scope('hidden1'):
        w_hidden = tf.Variable(tf.truncated_normal([n_inputs, n_hidden]), name="weights")
        b_hidden = tf.Variable(tf.zeros([n_hidden]), name="biases")
        hidden1 = tf.nn.relu(tf.matmul(tf_X, w_hidden) + b_hidden)
        l2_reg_01 = tf.nn.l2_loss(w_hidden)

    # Output layer: affine transform only; softmax is applied by the loss.
    with tf.name_scope('z12'):
        w_out = tf.Variable(tf.truncated_normal([n_hidden, num_labels]), name="weights")
        b_out = tf.Variable(tf.zeros([num_labels]), name="biases")
        z12 = tf.matmul(hidden1, w_out) + b_out
        l2_reg_12 = tf.nn.l2_loss(w_out)

    return z12, l2_reg_01 + l2_reg_12

# Define loss
def get_loss(z12, l2_loss, tf_Y):
    """Return the training loss: mean softmax cross-entropy plus the L2 penalty.

    Args:
        z12: logits tensor (unnormalized class scores).
        l2_loss: tensor holding the summed L2 loss of the weights.
        tf_Y: label tensor matching `z12` (one-hot labels in this notebook).

    Returns:
        Tensor equal to `mean(cross_entropy) + L2_weight * l2_loss`.
    """
    # Use the `tf_Y` argument. The previous code read the global
    # `tf_training_labels` and only worked because a placeholder with that
    # name happened to exist when the function was called.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=z12, labels=tf_Y))
    total_loss = cross_entropy + L2_weight * l2_loss
    return total_loss

# Define the network graph
graph = tf.Graph()
with graph.as_default():
    # The leading dimension is None so the same placeholders accept training
    # minibatches as well as the full validation / test sets, while the
    # feature dimension is still checked when data is fed.
    tf_training_dataset = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
    tf_training_labels = tf.placeholder(tf.float32, shape=(None, num_labels))

    z12, l2_loss = forward(tf_training_dataset)
    total_loss = get_loss(z12, l2_loss, tf_training_labels)
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(total_loss)

In [33]:
# Train the model on randomly sampled minibatches.
num_steps = 3001
batch_size = 128
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized, using batch size: %s" % batch_size)
    # `range` (not `xrange`) so the cell runs on both Python 2 and Python 3,
    # consistent with the other training cells in this notebook.
    for step in range(num_steps):
        # Draw a minibatch of row indices uniformly at random (with replacement).
        idx = np.random.randint(train_dataset.shape[0], size=batch_size)
        batch_data = train_dataset[idx]
        batch_labels = train_labels[idx]
        feed_dict = {tf_training_dataset : batch_data, tf_training_labels : batch_labels}
        # `predictions` holds raw logits; `accuracy` only needs their arg-max.
        _, l, predictions = session.run([optimizer, total_loss, z12], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Updated batch size: %s" % batch_size)
            print("Minibatch loss at step", step, ":", l)
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            predictions = session.run(z12, feed_dict={tf_training_dataset: valid_dataset})
            print("Validation accuracy: %.1f%%" % accuracy(predictions, valid_labels))
    predictions = session.run(z12, feed_dict={tf_training_dataset: test_dataset})
    print("Test accuracy: %.1f%%" % accuracy(predictions, test_labels))

Initialized, using batch size: 128
Updated batch size: 128
Minibatch loss at step 0 : 502.851
Minibatch accuracy: 4.7%
Validation accuracy: 34.7%
Updated batch size: 128
Minibatch loss at step 500 : 128.682
Minibatch accuracy: 78.9%
Validation accuracy: 80.6%
Updated batch size: 128
Minibatch loss at step 1000 : 98.827
Minibatch accuracy: 79.7%
Validation accuracy: 81.9%
Updated batch size: 128
Minibatch loss at step 1500 : 73.2438
Minibatch accuracy: 84.4%
Validation accuracy: 81.3%
Updated batch size: 128
Minibatch loss at step 2000 : 57.8333
Minibatch accuracy: 83.6%
Validation accuracy: 82.8%
Updated batch size: 128
Minibatch loss at step 2500 : 43.7344
Minibatch accuracy: 86.7%
Validation accuracy: 84.0%
Updated batch size: 128
Minibatch loss at step 3000 : 34.312
Minibatch accuracy: 85.9%
Validation accuracy: 84.2%
Test accuracy: 90.8%


# Problem 2
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

------
### neural network with l2 loss and small training data

In [34]:
# Overfitting using very small subset of data
num_steps = 3001
batch_size = 100
# Only the first 1% of the training rows are ever sampled. Floor division
# keeps the bound an integer on Python 3 as well as Python 2.
small_train_size = train_dataset.shape[0] // 100
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized, using batch size: %s" % batch_size)
    # `range` (not `xrange`) so the cell also runs on Python 3.
    for step in range(num_steps):
        # Draw a minibatch of row indices from the restricted subset.
        idx = np.random.randint(small_train_size, size=batch_size)
        batch_data = train_dataset[idx]
        batch_labels = train_labels[idx]
        feed_dict = {tf_training_dataset : batch_data, tf_training_labels : batch_labels}
        # `predictions` holds raw logits; `accuracy` only needs their arg-max.
        _, l, predictions = session.run([optimizer, total_loss, z12], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Updated batch size: %s" % batch_size)
            print("Minibatch loss at step", step, ":", l)
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            predictions = session.run(z12, feed_dict={tf_training_dataset: valid_dataset})
            print("Validation accuracy: %.1f%%" % accuracy(predictions, valid_labels))
    predictions = session.run(z12, feed_dict={tf_training_dataset: test_dataset})
    print("Test accuracy: %.1f%%" % accuracy(predictions, test_labels))

Initialized, using batch size: 100
Updated batch size: 100
Minibatch loss at step 0 : 506.215
Minibatch accuracy: 6.0%
Validation accuracy: 17.0%
Updated batch size: 100
Minibatch loss at step 500 : 122.231
Minibatch accuracy: 100.0%
Validation accuracy: 78.4%
Updated batch size: 100
Minibatch loss at step 1000 : 95.1697
Minibatch accuracy: 100.0%
Validation accuracy: 78.3%
Updated batch size: 100
Minibatch loss at step 1500 : 74.1074
Minibatch accuracy: 100.0%
Validation accuracy: 78.5%
Updated batch size: 100
Minibatch loss at step 2000 : 57.7088
Minibatch accuracy: 100.0%
Validation accuracy: 78.9%
Updated batch size: 100
Minibatch loss at step 2500 : 44.9396
Minibatch accuracy: 100.0%
Validation accuracy: 78.9%
Updated batch size: 100
Minibatch loss at step 3000 : 34.9954
Minibatch accuracy: 100.0%
Validation accuracy: 78.7%
Test accuracy: 86.4%


------
### log reg with l2 loss and small training data

In [35]:
# Multinomial logistic regression with an L2 penalty, restricted to a tiny
# training subset to provoke overfitting.
train_subset = 100
beta = 0.01  # L2 regularization strength

graph = tf.Graph()
with graph.as_default():

    # Input data: every split is baked into the graph as a constant.
    tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
    tf_train_labels = tf.constant(train_labels[:train_subset])
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables
    weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(tf_train_dataset, weights) + biases

    # Loss = mean cross-entropy + beta * L2 penalty on the weights.
    # Keyword arguments are used because newer TensorFlow releases reject
    # positional (logits, labels) arguments for this op.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
    loss = cross_entropy + beta * tf.nn.l2_loss(weights)

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

In [36]:
num_steps = 801

# `accuracy` is defined once in an earlier cell; the duplicate definition that
# used to live here silently shadowed it and has been removed.
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    for step in range(num_steps):
        # One full-batch gradient step; also fetch the loss and the
        # training-set predictions for reporting.
        _, l, predictions = session.run([optimizer, loss, train_prediction])

        if (step % 100 == 0):
            print('Loss at step %d: %f' % (step, l))
            print('Training accuracy: %.1f%%' % accuracy(
                predictions, train_labels[:train_subset, :]))
            print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels))

    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Loss at step 0: 50.718063
Training accuracy: 3.0%
Validation accuracy: 9.6%
Loss at step 100: 11.271645
Training accuracy: 100.0%
Validation accuracy: 46.4%
Loss at step 200: 4.200890
Training accuracy: 100.0%
Validation accuracy: 52.2%
Loss at step 300: 1.617562
Training accuracy: 100.0%
Validation accuracy: 57.8%
Loss at step 400: 0.675791
Training accuracy: 100.0%
Validation accuracy: 62.1%
Loss at step 500: 0.332560
Training accuracy: 100.0%
Validation accuracy: 63.9%
Loss at step 600: 0.207312
Training accuracy: 100.0%
Validation accuracy: 64.6%
Loss at step 700: 0.161507
Training accuracy: 100.0%
Validation accuracy: 64.8%
Loss at step 800: 0.144699
Training accuracy: 100.0%
Validation accuracy: 65.0%
Test accuracy: 71.6%


Training accuracy reaches 100% because the training subset is so small that the model can memorize it; however, the model generalizes poorly, as the much lower validation accuracy shows.

# Problem 3
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides nn.dropout() for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

In [38]:
batch_size = 128
beta = 0.001  # L2 regularization strength

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Hidden layer: affine transform followed by ReLU.
    hidden_nodes = 1024
    hidden_weights = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_nodes]))
    hidden_biases = tf.Variable(tf.zeros([hidden_nodes]))
    hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset, hidden_weights) + hidden_biases)

    # Dropout on the hidden layer, applied on the training path only;
    # the keep probability is fed at run time.
    keep_prob = tf.placeholder("float")
    hidden_layer_drop = tf.nn.dropout(hidden_layer, keep_prob)

    # Output-layer variables.
    weights = tf.Variable(tf.truncated_normal([hidden_nodes, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(hidden_layer_drop, weights) + biases
    # Keyword arguments are used because newer TensorFlow releases reject
    # positional (logits, labels) arguments for this op.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
    # NOTE: only the output-layer weights are penalized; hidden_weights are not.
    loss = cross_entropy + beta * tf.nn.l2_loss(weights)

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    # The validation and test paths reuse the trained variables but bypass the
    # dropout op, so their outputs are deterministic.
    train_prediction = tf.nn.softmax(logits)
    valid_relu = tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
    valid_prediction = tf.nn.softmax(tf.matmul(valid_relu, weights) + biases)

    test_relu = tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases)
    test_prediction = tf.nn.softmax(tf.matmul(test_relu, weights) + biases)

In [39]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Walk through the (already shuffled) training data in consecutive
        # windows, wrapping around before running past the end.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_end = offset + batch_size
        batch_data = train_dataset[offset:batch_end, :]
        batch_labels = train_labels[offset:batch_end, :]

        # Train with half of the hidden units dropped.
        feed_dict = {
            tf_train_dataset: batch_data,
            tf_train_labels: batch_labels,
            keep_prob: 0.5,
        }
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)

        # Report progress every 500 steps only.
        if step % 500 != 0:
            continue
        print("Minibatch loss at step %d: %f" % (step, l))
        print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
        print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 459.557098
Minibatch accuracy: 17.2%
Validation accuracy: 35.1%
Minibatch loss at step 500: 40.795624
Minibatch accuracy: 72.7%
Validation accuracy: 78.1%
Minibatch loss at step 1000: 18.608608
Minibatch accuracy: 75.8%
Validation accuracy: 80.1%
Minibatch loss at step 1500: 10.248548
Minibatch accuracy: 78.9%
Validation accuracy: 79.2%
Minibatch loss at step 2000: 6.949739
Minibatch accuracy: 72.7%
Validation accuracy: 79.3%
Minibatch loss at step 2500: 4.872125
Minibatch accuracy: 76.6%
Validation accuracy: 79.2%
Minibatch loss at step 3000: 4.625711
Minibatch accuracy: 77.3%
Validation accuracy: 80.3%
Test accuracy: 87.6%


In [40]:
num_steps = 3001

# Extreme overfitting case: keep only the first 500 training examples.
train_dataset_2 = train_dataset[:500, :]
train_labels_2 = train_labels[:500]

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Cycle through consecutive windows of the small subset.
        offset = (step * batch_size) % (train_labels_2.shape[0] - batch_size)
        batch_end = offset + batch_size
        batch_data = train_dataset_2[offset:batch_end, :]
        batch_labels = train_labels_2[offset:batch_end, :]

        # Train with half of the hidden units dropped.
        feed_dict = {
            tf_train_dataset: batch_data,
            tf_train_labels: batch_labels,
            keep_prob: 0.5,
        }
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)

        # Report progress every 500 steps only.
        if step % 500 != 0:
            continue
        print("Minibatch loss at step %d: %f" % (step, l))
        print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
        print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 525.713196
Minibatch accuracy: 10.9%
Validation accuracy: 25.7%
Minibatch loss at step 500: 2.992490
Minibatch accuracy: 100.0%
Validation accuracy: 78.6%
Minibatch loss at step 1000: 1.856812
Minibatch accuracy: 100.0%
Validation accuracy: 78.7%
Minibatch loss at step 1500: 1.164409
Minibatch accuracy: 100.0%
Validation accuracy: 78.1%
Minibatch loss at step 2000: 0.738911
Minibatch accuracy: 100.0%
Validation accuracy: 78.5%
Minibatch loss at step 2500: 0.466634
Minibatch accuracy: 100.0%
Validation accuracy: 78.4%
Minibatch loss at step 3000: 0.322515
Minibatch accuracy: 100.0%
Validation accuracy: 78.4%
Test accuracy: 85.3%


# Problem 4
Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is 97.1%.

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

-----------
What you can do:
- trying smaller beta and bigger num steps
- trying keep prob of 0.1
- trying with learning rate and multiple keep probs
- trying different learning rates
- trying 2 layer neural network
- trying 2 NN with different loss function

### BEST MODEL

In [42]:
"""
[Step: 5000] Minibatch loss 12.6376, accuracy: 89.5%
[Step: 5000] Validation loss 12.6891, accuracy: 86.9%
Test loss 12.4793, accuracy: 93.0%
"""

batch_size = 128
n_hidden = 1024
L2_weight = 0.5e-3

def forward(tf_X, dropout_p):
    """
    assert tf.shape(tf_X)[1] == image_size*image_size,\
        "Training data not of correct shape. Each input should be of shape: %s" % (image_size*image_size)
    """
    l2_weight_loss = [0]
    #tf.Variable(0, name="l2_weight_loss")
    with tf.name_scope('hidden1'):
        weights = tf.Variable(tf.truncated_normal([image_size*image_size, n_hidden]), name="weights")
        biases = tf.Variable(tf.zeros([n_hidden]), name="biases")
        z01 = tf.matmul(tf.nn.dropout(tf_X, 0.9), weights) + biases # Dropout input keeping 0.9 inputs always
        hidden1 = tf.nn.dropout(tf.nn.relu(z01), dropout_p) # Added dropout
        #hidden1 = tf.nn.relu(z01) # No dropout
        l2_weight_loss.append(tf.nn.l2_loss(weights))
    """
    with tf.name_scope('z12'):
        weights = tf.Variable(tf.truncated_normal([n_hidden, n_hidden]), name="weights")
        biases = tf.Variable(tf.zeros([n_hidden]), name="biases")
        z12 = tf.matmul(hidden1, weights) + biases
        hidden2 = tf.nn.dropout(tf.nn.tanh(z12), dropout_p) # Added dropout
        #hidden2 = tf.nn.relu(z12) # No dropout
        #l2_weight_loss.append(tf.nn.l2_loss(weights))
    """
    with tf.name_scope('outputs'):
        weights = tf.Variable(tf.truncated_normal([n_hidden, num_labels]), name="weights")
        biases = tf.Variable(tf.zeros([num_labels]), name="biases")
        outputs = tf.matmul(hidden1, weights) + biases # Add constant to ensure input to log is never zero.
        l2_weight_loss.append(tf.nn.l2_loss(weights))
    return outputs, reduce(lambda x, y: x + y, l2_weight_loss)
    #return outputs, 0

In [41]:
# Define loss
def get_loss(outputs, l2_loss, tf_Y):
    """Return the training loss: mean softmax cross-entropy plus the L2 penalty.

    Args:
        outputs: logits tensor (unnormalized class scores).
        l2_loss: tensor holding the summed L2 loss of the weights.
        tf_Y: label tensor matching `outputs` (one-hot labels in this notebook).

    Returns:
        Tensor equal to `mean(cross_entropy) + L2_weight * l2_loss`.
    """
    # Use the `tf_Y` argument. The previous code read the global
    # `tf_training_labels` and only worked because a placeholder with that
    # name happened to exist when the function was called.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=outputs, labels=tf_Y))
    total_loss = cross_entropy + L2_weight * l2_loss
    return total_loss

In [44]:
# Define the network graph
graph = tf.Graph()
with graph.as_default():
    # The leading dimension is None so the same placeholders accept training
    # minibatches of any size as well as the full validation / test sets,
    # while the feature dimension is still checked when data is fed.
    tf_training_dataset = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
    tf_training_labels = tf.placeholder(tf.float32, shape=(None, num_labels))
    # Keep probability for the hidden-layer dropout, fed at run time.
    dropout_p = tf.placeholder(tf.float32)

    outputs, l2_loss = forward(tf_training_dataset, dropout_p)
    total_loss = get_loss(outputs, l2_loss, tf_training_labels)

    # Counts the optimizer steps taken; excluded from training.
    global_step = tf.Variable(0, trainable=False)
    # Constant learning rate. Alternative that was tried:
    #   learning_rate = tf.train.exponential_decay(0.5, global_step, 10000, 0.96)
    learning_rate = 0.5
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss, global_step=global_step)

In [46]:
# Train the model on randomly sampled minibatches whose size grows over time.
num_steps = 5001
batch_size = 128
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized, using batch size: %s" % batch_size)
    # `range` (not `xrange`) so the cell also runs on Python 3.
    for step in range(num_steps):
        # Draw a minibatch of row indices uniformly at random (with replacement).
        idx = np.random.randint(train_dataset.shape[0], size=batch_size)
        batch_data = train_dataset[idx]
        batch_labels = train_labels[idx]
        # NOTE(review): feeding dropout_p = 1 keeps every hidden unit, so the
        # hidden-layer dropout is effectively disabled during training.
        feed_dict = {tf_training_dataset : batch_data, tf_training_labels : batch_labels, dropout_p: 1}
        _, l = session.run([optimizer, total_loss], feed_dict=feed_dict)
        if (step % 500 == 0):
            # Minibatch predictions are only needed for the progress report,
            # so the extra forward pass runs on logging steps only instead of
            # on every training step.
            predictions = session.run(outputs, feed_dict={tf_training_dataset: batch_data, dropout_p: 1})
            # Grow the minibatch by 100 examples at every report (including step 0).
            batch_size += 100
            print("Updated batch size: %s" % batch_size)
            print("[Step: %s] Minibatch loss %s, accuracy: %.1f%%" % (step, l, accuracy(predictions, batch_labels)))
            predictions, l = session.run([outputs, total_loss],
                                         feed_dict={tf_training_dataset: valid_dataset, tf_training_labels : valid_labels, dropout_p: 1})
            print("[Step: %s] Validation loss %s, accuracy: %.1f%%" % (step, l, accuracy(predictions, valid_labels)))
    predictions, l = session.run([outputs, total_loss],
                                 feed_dict={tf_training_dataset: test_dataset, tf_training_labels : test_labels, dropout_p: 1})
    print("Test loss %s, accuracy: %.1f%%" % (l, accuracy(predictions, test_labels)))

Initialized, using batch size: 128
Updated batch size: 228
[Step: 0] Minibatch loss 542.261, accuracy: 50.8%
[Step: 0] Validation loss 1806.13, accuracy: 39.0%
Updated batch size: 328
[Step: 500] Minibatch loss 128.014, accuracy: 86.8%
[Step: 500] Validation loss 133.583, accuracy: 79.2%
Updated batch size: 428
[Step: 1000] Minibatch loss 97.5414, accuracy: 84.8%
[Step: 1000] Validation loss 97.5553, accuracy: 80.2%
Updated batch size: 528
[Step: 1500] Minibatch loss 73.8631, accuracy: 82.0%
[Step: 1500] Validation loss 74.4446, accuracy: 80.0%
Updated batch size: 628
[Step: 2000] Minibatch loss 57.3519, accuracy: 78.8%
[Step: 2000] Validation loss 57.9258, accuracy: 76.0%
Updated batch size: 728
[Step: 2500] Minibatch loss 43.6407, accuracy: 84.2%
[Step: 2500] Validation loss 43.6096, accuracy: 83.4%
Updated batch size: 828
[Step: 3000] Minibatch loss 33.8846, accuracy: 87.1%
[Step: 3000] Validation loss 33.9103, accuracy: 84.1%
Updated batch size: 928
[Step: 3500] Minibatch loss 26.5