In [2]:
# AdaGrad solution

import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import numpy as np
from mnistReader import mnist
from math import sqrt


# Load the train/test split through the project-local `mnistReader.mnist` helper.
# NOTE(review): presumably each image row has 784 features and each label row is
# one-hot over 10 classes (that is what the placeholders fed from these arrays
# expect) -- confirm against mnistReader.
xTrain, xTest, yTrain, yTest = mnist()

# Clear whatever is in the default graph (useful when the cell is re-run), then
# create a dedicated graph object `g` that the model is built into.
tf.reset_default_graph() 
g = tf.Graph() 
# Build the whole model inside graph `g`: a single-layer softmax classifier
# (784 inputs -> 10 classes) trained with a hand-written AdaGrad update, plus
# the TensorBoard summaries that are fetched during training.
with g.as_default():

    # Network input: one image per row, minibatch_size by 784.
    X = tf.placeholder(tf.float32, shape=[None, 784])
    # True image class as a one-hot coded matrix, minibatch_size by 10.
    Y = tf.placeholder(tf.float32, shape=[None, 10])

    # NOTE: `lr` is not consumed by any op below; the AdaGrad step size is `eta`.
    lr = tf.constant(0.2, dtype=tf.float32, name='lr')
    weight_shape = [784, 10]  # weight matrix between the input and output layer
    [n_inputs, n_outputs] = weight_shape
    # Uniform range from the Glorot paper; only the alternative initializer
    # (commented out below) uses it.
    init_range = sqrt(6.0 / (n_inputs + n_outputs))
    # Pick one of the following initializations for the weights; the idea is
    # to make the network train faster.
    w = tf.Variable(tf.random_normal(weight_shape, stddev=0.01), name='w')  # pre-Glorot: small Gaussian
    #w = tf.Variable(tf.random_uniform(weight_shape, -init_range, init_range), name='w')  # Glorot uniform

    # Define the network.
    logits = tf.matmul(X, w)       # minibatch_size by 10 matrix of class scores
    py_x = tf.nn.softmax(logits)   # class probabilities, minibatch_size by 10
    y_pred = tf.argmax(py_x, dimension=1)  # predicted class index per image (length minibatch_size)

    # Define the cost.
    # Cross-entropy per image (one value per row of the minibatch).
    rows_of_cost = tf.nn.softmax_cross_entropy_with_logits(logits, Y, name='rows_of_cost')
    # Average over all images in the minibatch: the scalar loss value.
    cost = tf.reduce_mean(rows_of_cost, reduction_indices=None, keep_dims=False, name='cost')

    # Gradient of the scalar cost with respect to the weight matrix.
    gradients = tf.gradients(cost, [w], name='gradients')[0]

    # ---- AdaGrad ----
    # Meta parameters.
    nSteps = 20              # NOTE: not referenced by any op in this graph
    eta = tf.constant(0.5)   # global step size
    # Running sum of squared gradients, one entry per weight. It starts at
    # 1e-3 instead of 0 to prevent division by zero in the update below.
    sum_g = tf.Variable(tf.ones(weight_shape, dtype=tf.float32)  * 1.0e-3, name='g')
    sum_g_updater = sum_g.assign(sum_g + tf.square(gradients))
    # Scale the step by the value *returned by the assign op* rather than by
    # reading `sum_g` directly. Both updater ops are fetched in one
    # `sess.run` call, and without a data dependency between them the weight
    # step could be computed from either the stale or the fresh accumulator.
    # Consuming `sum_g_updater` forces the accumulator update to happen first.
    w_updater = w.assign(w - eta * gradients / tf.sqrt(sum_g_updater))

    # Summary ops for TensorBoard.
    summary1 = tf.scalar_summary("Cost over time", cost)
    summary2 = tf.histogram_summary('Weights over time', w)
    merged = tf.merge_summary([summary1, summary2])


    
    
# Train the model: run minibatch AdaGrad updates over the training set for
# `nPasses` passes, log summaries to 'logs/' for TensorBoard, and print the
# test-set accuracy every fifth pass.
with tf.Session(graph=g) as sess:
    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter('logs/', graph=sess.graph)
    miniBatchSize = 40
    # (start, end) row indices of every full minibatch. zip() stops at the
    # shorter range, so a trailing partial batch (if any) is skipped.
    # Materialised as a list because it is iterated again on every pass.
    startEnd = list(zip(range(0, len(xTrain), miniBatchSize),
                        range(miniBatchSize, len(xTrain) + 1, miniBatchSize)))
    costList = []   # minibatch cost after every update step
    nPasses = 30
    iteration = 0   # global step counter used as the summary x-axis
    try:
        for iPass in range(nPasses):
            for (s, e) in startEnd:
                # One training step: fetching both updater ops applies the
                # AdaGrad update; `weight` receives the value returned by
                # the weight-assign op.
                # NOTE(review): the order in which the two updater ops run
                # inside a single run() call follows the graph's data
                # dependencies, not their position in this fetch list.
                costVal, _, weight, tbSummary = sess.run(
                    [cost, sum_g_updater, w_updater, merged],
                    feed_dict={X: xTrain[s:e,], Y: yTrain[s:e]})
                writer.add_summary(tbSummary, iteration)
                iteration += 1
                costList.append(costVal)
            if iPass % 5 == 0:
                # run() returns a one-element list because the fetch is a list.
                testResult = sess.run([y_pred], feed_dict={X: xTest})
                # Accuracy: fraction of test images whose predicted class
                # equals the arg-max of the one-hot label row.
                accuracy = np.mean(np.argmax(yTest, axis=1) == testResult[0])
                # Same text as `print iPass, accuracy`, written in a form
                # that also parses under Python 3.
                print("%s %s" % (iPass, accuracy))
    finally:
        # Flush buffered events and release the event file even if training
        # is interrupted.
        writer.close()

0 0.9049
5 0.9125
10 0.9143
15 0.9165
20 0.9179
25 0.9186
