## In-class coding exercises
Replace plain vanilla gradient descent with a different accelerated gradient method - AdaDelta, NAG or AdaGrad.  
Add dropout for regularization - what meta-parameters does dropout add?  
Add an L2 weight penalty for regularization - what meta-parameters does a weight penalty add?

In [1]:
# 4-layer MNIST with momentum with batch normalization
import tensorflow as tf
import numpy as np
from mnistReader import mnist
from math import sqrt

#build and initialize weights
def init_weights(shape, name, glorot=False):
    """Create the trainable weight matrix of one fully connected layer.

    shape  -- two-element sequence (n_inputs, n_outputs)
    name   -- name handed to the tf.Variable
    glorot -- True: sample uniformly from [-r, r] with
              r = sqrt(6 / (n_inputs + n_outputs));
              False (default): sample from a normal distribution with stddev 0.01

    Returns the new tf.Variable.
    """
    n_inputs, n_outputs = shape
    # The range is evaluated up front, whichever initializer ends up being used.
    init_range = sqrt(6.0 / (n_inputs + n_outputs))
    if not glorot:
        initial_value = tf.random_normal(shape, stddev=0.01)
    else:
        initial_value = tf.random_uniform(shape, -init_range, init_range)
    return tf.Variable(initial_value, name=name)
    
def momentum(list_of_weights, list_of_gradients, beta, delta):
    """Build the update ops for gradient descent with an accumulated step.

    list_of_weights   -- weight variables to train
    list_of_gradients -- gradients, in the same order as the weights
    beta              -- fraction of the accumulated step to keep, e.g. 0.85
    delta             -- learning rate, e.g. 0.01

    Returns a list holding the step-update ops for every weight followed by
    the weight-update ops for every weight; running them performs one step.
    """
    # One zero-initialised accumulator per weight, with the weight's shape.
    accumulators = []
    for w in list_of_weights:
        accumulators.append(
            tf.Variable(tf.zeros(w.get_shape(), dtype=tf.float32), name='step'))

    # step <- beta * step - (1 - beta) * delta * gradient
    step_updates = []
    for acc, g in zip(accumulators, list_of_gradients):
        step_updates.append(acc.assign(beta * acc - (1 - beta) * delta * g))

    # Each weight moves by the result of its step assignment, i.e. the new step.
    weight_updates = []
    for w, new_step in zip(list_of_weights, step_updates):
        weight_updates.append(w.assign(w + new_step))

    return step_updates + weight_updates


# Read in the data and build the training graph: a 784-300-50-10 fully connected
# network with ReLU + batch normalisation on both hidden layers, trained with the
# hand-written momentum() updater.
# mnist() comes from the project-local mnistReader module; its result is unpacked
# as (train inputs, test inputs, train labels, test labels).
xTrain, xTest, yTrain, yTest = mnist()


# Clear the default graph, then build the model in a Graph object of its own.
tf.reset_default_graph() 
graph = tf.Graph() 
with graph.as_default():
    # X: float32 inputs with 784 columns and any number of rows.
    X = tf.placeholder(tf.float32, shape=[None, 784])
    # Y: float32 targets with 10 columns (presumably one-hot labels - the
    # evaluation code takes np.argmax over axis 1 of yTest; confirm in mnistReader).
    Y = tf.placeholder(tf.float32, shape=[None, 10])
    # lr is only referenced by the commented-out plain gradient-descent lines below.
    lr = tf.constant(0.00002, dtype=tf.float32, name='lr')

    # glorot is left at its default (False), so weights start as normal(stddev=0.01).
    w1 = init_weights([784, 300], 'w1')
    w2 = init_weights([300, 50], 'w2')
    w3 = init_weights((50, 10), 'w3')

    #define network (no bias terms; batch norm is applied after each ReLU)
    h1 = tf.nn.relu(tf.matmul(X, w1))  #look under Neural Net -> Activation in API left column
    # NOTE(review): batch_norm is called with all default arguments, so it is never
    # told whether a run is training or evaluation - confirm the defaults do what is
    # intended when y_pred is evaluated on the test set.
    h1_bn = tf.contrib.layers.batch_norm(h1)
    h2 = tf.nn.relu(tf.matmul(h1_bn, w2))
    h2_bn = tf.contrib.layers.batch_norm(h2)
    logits = tf.matmul(h2_bn, w3)
    py_x = tf.nn.softmax(logits)
    # Predicted class = index of the largest softmax output in each row.
    y_pred = tf.argmax(py_x, dimension=1)

    #define cost: per-row softmax cross-entropy, averaged over all rows fed in
    rows_of_cost = tf.nn.softmax_cross_entropy_with_logits(logits, Y, name='rows_of_cost')
    cost = tf.reduce_mean(rows_of_cost, reduction_indices=None, keep_dims=False, name='cost')

    #start building list that you'll reference in sess.run
    udList = [cost]

    #use hand-crafted updater
    # Only w1, w2 and w3 are trained by momentum(); any variables that batch_norm
    # creates internally are not in this list (TODO confirm whether that matters).
    W = [w1, w2, w3]

    #calculate gradients
    grad = tf.gradients(cost, W)

    # momentum (beta=0.85, delta=0.001); appends three step updates, then three
    # weight updates, so udList is now [cost, step x3, weight x3]
    udList = udList + momentum(W, grad, 0.85, 0.001) # hyper-parameter 0.85 and 0.001 can be placeholders

    #form a list of the updates - including this in sess.run will force calculation of new weights each step
    # udList = udList + [w.assign(w - lr * g) for (w, g) in zip(W, grad)]

    #use tf.optimizer by uncommenting the following two lines (and modifying where necessary)
    #optimizer = tf.train.GradientDescentOptimizer(lr)
    #train = optimizer.minimize(cost)

    #output for tensorboard
    summary1 = tf.scalar_summary("Cost over time", cost) 
    summary2 = tf.histogram_summary('Weight w1 over time', w1)
    summary3 = tf.histogram_summary('Weight w2 over time', w2)
    summary4 = tf.histogram_summary('Weight w3 over time', w3)
    merged = tf.merge_summary([summary1, summary2, summary3, summary4]) 

    #add tensorboard output to sess.run list (it becomes the 8th and last fetch)
    udList.append(merged)

# Train the batch-norm network with mini-batches and report test accuracy
# every fifth pass.
with tf.Session(graph=graph) as sess:
    result = sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter('logs/', graph=sess.graph)
    try:
        miniBatchSize = 40
        # (start, end) row bounds of every full mini-batch; a trailing partial batch
        # is dropped. Materialised as a list so that each pass can iterate it again
        # (a bare zip is a one-shot iterator on Python 3).
        # NOTE(review): the bounds are derived from len(xTest) but index
        # xTrain/yTrain, so only the first len(xTest) training rows are used -
        # presumably deliberate to shorten training; confirm.
        startEnd = list(zip(range(0, len(xTest), miniBatchSize),
                            range(miniBatchSize, len(xTest) + 1, miniBatchSize)))
        costList = []
        nPasses = 31
        iteration = 0
        for iPass in range(nPasses):
            for (s, e) in startEnd:
                # udList fetches: cost, three step updates, three weight updates and
                # the merged summary; fetching the updates is what trains the net.
                [costVal, step1, step2, step3, weight1, weight2, weight3, tbSummary] = sess.run(
                    udList, feed_dict={X: xTrain[s:e,], Y: yTrain[s:e]})

                writer.add_summary(tbSummary, iteration)
                iteration += 1
                costList.append(costVal)
            if iPass % 5 == 0:
                # Accuracy on the whole test set: fraction of rows whose predicted
                # class equals the argmax of the label row.
                testResult = sess.run([y_pred], feed_dict={X:xTest})
                # Single pre-formatted string: same output under the Python 2
                # print statement and the Python 3 print function.
                print("%s %s" % (iPass, np.mean(np.argmax(yTest, axis=1) == testResult)))
    finally:
        # Flush pending summary events to disk and release the event file,
        # even if training is interrupted.
        writer.close()

0 0.6841
5 0.9093
10 0.9405
15 0.9515
20 0.9554
25 0.9583
30 0.9594


In [1]:
# 4-layer MNIST with momentum and dropout

# all you do is:
# 1) wrap each hidden layer's output in tf.nn.dropout(layer, percent_keep)
# 2) add a percent_keep placeholder (the fraction of units to keep), so you can
#    adjust it in each sess.run() call
# 3) modify sess.run() to supply a value for the placeholder in feed_dict

import tensorflow as tf
import numpy as np
from mnistReader import mnist
from math import sqrt

#build and initialize weights
def init_weights(shape, name, glorot=False):
    """Create the trainable weight matrix of one fully connected layer.

    shape  -- two-element sequence (n_inputs, n_outputs)
    name   -- name handed to the tf.Variable
    glorot -- True: sample uniformly from [-r, r] with
              r = sqrt(6 / (n_inputs + n_outputs));
              False (default): sample from a normal distribution with stddev 0.01

    Returns the new tf.Variable.
    """
    n_inputs, n_outputs = shape
    # The range is evaluated up front, whichever initializer ends up being used.
    init_range = sqrt(6.0 / (n_inputs + n_outputs))
    if not glorot:
        initial_value = tf.random_normal(shape, stddev=0.01)
    else:
        initial_value = tf.random_uniform(shape, -init_range, init_range)
    return tf.Variable(initial_value, name=name)
    
def momentum(list_of_weights, list_of_gradients, beta, delta):
    """Build the update ops for gradient descent with an accumulated step.

    list_of_weights   -- weight variables to train
    list_of_gradients -- gradients, in the same order as the weights
    beta              -- fraction of the accumulated step to keep, e.g. 0.85
    delta             -- learning rate, e.g. 0.01

    Returns a list holding the step-update ops for every weight followed by
    the weight-update ops for every weight; running them performs one step.
    """
    # One zero-initialised accumulator per weight, with the weight's shape.
    accumulators = []
    for w in list_of_weights:
        accumulators.append(
            tf.Variable(tf.zeros(w.get_shape(), dtype=tf.float32), name='step'))

    # step <- beta * step - (1 - beta) * delta * gradient
    step_updates = []
    for acc, g in zip(accumulators, list_of_gradients):
        step_updates.append(acc.assign(beta * acc - (1 - beta) * delta * g))

    # Each weight moves by the result of its step assignment, i.e. the new step.
    weight_updates = []
    for w, new_step in zip(list_of_weights, step_updates):
        weight_updates.append(w.assign(w + new_step))

    return step_updates + weight_updates


# Read in the data and build the training graph: a 784-300-50-10 fully connected
# network with ReLU + dropout on both hidden layers, trained with the hand-written
# momentum() updater.
# mnist() comes from the project-local mnistReader module; its result is unpacked
# as (train inputs, test inputs, train labels, test labels).
xTrain, xTest, yTrain, yTest = mnist()


# Clear the default graph, then build the model in a Graph object of its own.
tf.reset_default_graph() 
graph = tf.Graph() 
with graph.as_default():
    # X: float32 inputs with 784 columns and any number of rows.
    X = tf.placeholder(tf.float32, shape=[None, 784])
    # Y: float32 targets with 10 columns (presumably one-hot labels - the
    # evaluation code takes np.argmax over axis 1 of yTest; confirm in mnistReader).
    Y = tf.placeholder(tf.float32, shape=[None, 10])
    # lr is only referenced by the commented-out plain gradient-descent lines below.
    lr = tf.constant(0.00002, dtype=tf.float32, name='lr')

    # glorot is left at its default (False), so weights start as normal(stddev=0.01).
    w1 = init_weights([784, 300], 'w1')
    w2 = init_weights([300, 50], 'w2')
    w3 = init_weights((50, 10), 'w3')

    #define network (no bias terms)
    # percent_keep is passed as tf.nn.dropout's second argument (the keep
    # probability); it must be fed on every sess.run that evaluates the network.
    percent_keep = tf.placeholder(tf.float32) # add this line
    h1 = tf.nn.relu(tf.matmul(X, w1))
    h1 = tf.nn.dropout(h1, percent_keep) # add this line
    h2 = tf.nn.relu(tf.matmul(h1, w2))
    h2 = tf.nn.dropout(h2, percent_keep) # add this line
    logits = tf.matmul(h2, w3)
    py_x = tf.nn.softmax(logits)
    # Predicted class = index of the largest softmax output in each row.
    y_pred = tf.argmax(py_x, dimension=1)

    #define cost: per-row softmax cross-entropy, averaged over all rows fed in
    rows_of_cost = tf.nn.softmax_cross_entropy_with_logits(logits, Y, name='rows_of_cost')
    cost = tf.reduce_mean(rows_of_cost, reduction_indices=None, keep_dims=False, name='cost')

    #start building list that you'll reference in sess.run
    udList = [cost]

    #use hand-crafted updater
    W = [w1, w2, w3]

    #calculate gradients
    grad = tf.gradients(cost, W)

    # momentum (beta=0.85, delta=0.001); appends three step updates, then three
    # weight updates, so udList is now [cost, step x3, weight x3]
    udList = udList + momentum(W, grad, 0.85, 0.001)

    #form a list of the updates - including this in sess.run will force calculation of new weights each step
    # udList = udList + [w.assign(w - lr * g) for (w, g) in zip(W, grad)]

    #use tf.optimizer by uncommenting the following two lines (and modifying where necessary)
    #optimizer = tf.train.GradientDescentOptimizer(lr)
    #train = optimizer.minimize(cost)

    #output for tensorboard
    summary1 = tf.scalar_summary("Cost over time", cost) 
    summary2 = tf.histogram_summary('Weight w1 over time', w1)
    summary3 = tf.histogram_summary('Weight w2 over time', w2)
    summary4 = tf.histogram_summary('Weight w3 over time', w3)
    merged = tf.merge_summary([summary1, summary2, summary3, summary4]) 

    #add tensorboard output to sess.run list (it becomes the 8th and last fetch)
    udList.append(merged)

# Train the dropout network with mini-batches (keep probability 0.5) and report
# test accuracy every fifth pass with dropout switched off (keep probability 1.0).
with tf.Session(graph=graph) as sess:
    result = sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter('logs/', graph=sess.graph)
    try:
        miniBatchSize = 40
        # (start, end) row bounds of every full mini-batch; a trailing partial batch
        # is dropped. Materialised as a list so that each pass can iterate it again
        # (a bare zip is a one-shot iterator on Python 3).
        # NOTE(review): the bounds are derived from len(xTest) but index
        # xTrain/yTrain, so only the first len(xTest) training rows are used -
        # presumably deliberate to shorten training; confirm.
        startEnd = list(zip(range(0, len(xTest), miniBatchSize),
                            range(miniBatchSize, len(xTest) + 1, miniBatchSize)))
        costList = []
        nPasses = 31
        iteration = 0
        for iPass in range(nPasses):
            for (s, e) in startEnd:
                # udList fetches: cost, three step updates, three weight updates and
                # the merged summary; fetching the updates is what trains the net.
                [costVal, step1, step2, step3, weight1, weight2, weight3, tbSummary] = sess.run(
                    udList, feed_dict={X: xTrain[s:e,], Y: yTrain[s:e], percent_keep: 0.5})

                writer.add_summary(tbSummary, iteration)
                iteration += 1
                costList.append(costVal)
            if iPass % 5 == 0:
                # Accuracy on the whole test set: fraction of rows whose predicted
                # class equals the argmax of the label row.
                testResult = sess.run([y_pred], feed_dict={X:xTest, percent_keep: 1.0})
                # Single pre-formatted string: same output under the Python 2
                # print statement and the Python 3 print function.
                print("%s %s" % (iPass, np.mean(np.argmax(yTest, axis=1) == testResult)))
    finally:
        # Flush pending summary events to disk and release the event file,
        # even if training is interrupted.
        writer.close()

0 0.0857
5 0.111
10 0.1372
15 0.1602
20 0.1796
25 0.1984
30 0.2221


In [3]:
# 4-layer MNIST with momentum and L2 regularization

# all you do is:
# 1) add a penalty coefficient (can be a placeholder, so you can
#    adjust it in each sess.run() call)
# 2) modify the cost function to add the penalty times the sum of the squared weights
# 3) modify sess.run() to supply a value for the placeholder in feed_dict

import tensorflow as tf
import numpy as np
from mnistReader import mnist
from math import sqrt

#build and initialize weights
def init_weights(shape, name, glorot=False):
    """Create the trainable weight matrix of one fully connected layer.

    shape  -- two-element sequence (n_inputs, n_outputs)
    name   -- name handed to the tf.Variable
    glorot -- True: sample uniformly from [-r, r] with
              r = sqrt(6 / (n_inputs + n_outputs));
              False (default): sample from a normal distribution with stddev 0.01

    Returns the new tf.Variable.
    """
    n_inputs, n_outputs = shape
    # The range is evaluated up front, whichever initializer ends up being used.
    init_range = sqrt(6.0 / (n_inputs + n_outputs))
    if not glorot:
        initial_value = tf.random_normal(shape, stddev=0.01)
    else:
        initial_value = tf.random_uniform(shape, -init_range, init_range)
    return tf.Variable(initial_value, name=name)
    
def momentum(list_of_weights, list_of_gradients, beta, delta):
    """Build the update ops for gradient descent with an accumulated step.

    list_of_weights   -- weight variables to train
    list_of_gradients -- gradients, in the same order as the weights
    beta              -- fraction of the accumulated step to keep, e.g. 0.85
    delta             -- learning rate, e.g. 0.01

    Returns a list holding the step-update ops for every weight followed by
    the weight-update ops for every weight; running them performs one step.
    """
    # One zero-initialised accumulator per weight, with the weight's shape.
    accumulators = []
    for w in list_of_weights:
        accumulators.append(
            tf.Variable(tf.zeros(w.get_shape(), dtype=tf.float32), name='step'))

    # step <- beta * step - (1 - beta) * delta * gradient
    step_updates = []
    for acc, g in zip(accumulators, list_of_gradients):
        step_updates.append(acc.assign(beta * acc - (1 - beta) * delta * g))

    # Each weight moves by the result of its step assignment, i.e. the new step.
    weight_updates = []
    for w, new_step in zip(list_of_weights, step_updates):
        weight_updates.append(w.assign(w + new_step))

    return step_updates + weight_updates


# Read in the data and build the training graph: a 784-300-50-10 fully connected
# ReLU network whose cost carries an L2 weight penalty, trained with the
# hand-written momentum() updater.
# mnist() comes from the project-local mnistReader module; its result is unpacked
# as (train inputs, test inputs, train labels, test labels).
xTrain, xTest, yTrain, yTest = mnist()


# Clear the default graph, then build the model in a Graph object of its own.
tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    # X: float32 inputs with 784 columns and any number of rows.
    X = tf.placeholder(tf.float32, shape=[None, 784])
    # Y: float32 targets with 10 columns (presumably one-hot labels - the
    # evaluation code takes np.argmax over axis 1 of yTest; confirm in mnistReader).
    Y = tf.placeholder(tf.float32, shape=[None, 10])
    # lr is only referenced by the commented-out plain gradient-descent lines below.
    lr = tf.constant(0.00002, dtype=tf.float32, name='lr')

    # glorot is left at its default (False), so weights start as normal(stddev=0.01).
    w1 = init_weights([784, 300], 'w1')
    w2 = init_weights([300, 50], 'w2')
    w3 = init_weights((50, 10), 'w3')

    #define network (no bias terms)
    # percent_keep is not connected to anything in this graph (there is no dropout
    # here); the placeholder is kept only so the name continues to exist.
    percent_keep = tf.placeholder(tf.float32)
    h1 = tf.nn.relu(tf.matmul(X, w1))
    h2 = tf.nn.relu(tf.matmul(h1, w2))
    logits = tf.matmul(h2, w3)
    py_x = tf.nn.softmax(logits)
    # Predicted class = index of the largest softmax output in each row.
    y_pred = tf.argmax(py_x, dimension=1)

    #define cost
    penalty = tf.placeholder(tf.float32) # coefficient of the L2 penalty, fed per sess.run
    rows_of_cost = tf.nn.softmax_cross_entropy_with_logits(logits, Y, name='rows_of_cost')
    # Mean cross-entropy PLUS penalty * (sum of squared weights). The penalty has to
    # be added: subtracting it would lower the cost as weights grow, which rewards
    # large weights instead of discouraging them. Only w1 and w2 are penalised;
    # w3 is left out.
    cost = tf.reduce_mean(rows_of_cost, reduction_indices=None, keep_dims=False,
                          name='cost') + penalty * (tf.reduce_sum(tf.square(w1)) +
                          tf.reduce_sum(tf.square(w2)))


    #start building list that you'll reference in sess.run
    udList = [cost]

    #use hand-crafted updater
    W = [w1, w2, w3]

    #calculate gradients (of the penalised cost, so the penalty shrinks w1 and w2)
    grad = tf.gradients(cost, W)

    # momentum (beta=0.85, delta=0.001); appends three step updates, then three
    # weight updates, so udList is now [cost, step x3, weight x3]
    udList = udList + momentum(W, grad, 0.85, 0.001)

    #form a list of the updates - including this in sess.run will force calculation of new weights each step
    # udList = udList + [w.assign(w - lr * g) for (w, g) in zip(W, grad)]

    #use tf.optimizer by uncommenting the following two lines (and modifying where necessary)
    #optimizer = tf.train.GradientDescentOptimizer(lr)
    #train = optimizer.minimize(cost)

    #output for tensorboard ("Cost over time" is the penalised cost)
    summary1 = tf.scalar_summary("Cost over time", cost)
    summary2 = tf.histogram_summary('Weight w1 over time', w1)
    summary3 = tf.histogram_summary('Weight w2 over time', w2)
    summary4 = tf.histogram_summary('Weight w3 over time', w3)
    merged = tf.merge_summary([summary1, summary2, summary3, summary4])

    #add tensorboard output to sess.run list (it becomes the 8th and last fetch)
    udList.append(merged)

# Train the L2-penalised network with mini-batches (penalty coefficient 0.0001)
# and report test accuracy every fifth pass.
with tf.Session(graph=graph) as sess:
    result = sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter('logs/', graph=sess.graph)
    try:
        miniBatchSize = 40
        # (start, end) row bounds of every full mini-batch; a trailing partial batch
        # is dropped. Materialised as a list so that each pass can iterate it again
        # (a bare zip is a one-shot iterator on Python 3).
        # NOTE(review): the bounds are derived from len(xTest) but index
        # xTrain/yTrain, so only the first len(xTest) training rows are used -
        # presumably deliberate to shorten training; confirm.
        startEnd = list(zip(range(0, len(xTest), miniBatchSize),
                            range(miniBatchSize, len(xTest) + 1, miniBatchSize)))
        costList = []
        nPasses = 31
        iteration = 0
        for iPass in range(nPasses):
            for (s, e) in startEnd:
                # udList fetches: cost, three step updates, three weight updates and
                # the merged summary; fetching the updates is what trains the net.
                # The fetched cost goes into costVal so that the name `cost` keeps
                # referring to the cost tensor of the graph.
                [costVal, step1, step2, step3, weight1, weight2, weight3, tbSummary] = sess.run(
                    udList, feed_dict={X: xTrain[s:e,], Y: yTrain[s:e], penalty: 0.0001})

                writer.add_summary(tbSummary, iteration)
                iteration += 1
                costList.append(costVal)
            if iPass % 5 == 0:
                # Accuracy on the whole test set: fraction of rows whose predicted
                # class equals the argmax of the label row.
                testResult = sess.run([y_pred], feed_dict={X:xTest, penalty: 0.0001})
                # Single pre-formatted string: same output under the Python 2
                # print statement and the Python 3 print function.
                print("%s %s" % (iPass, np.mean(np.argmax(yTest, axis=1) == testResult)))
    finally:
        # Flush pending summary events to disk and release the event file,
        # even if training is interrupted.
        writer.close()

0 0.0835
5 0.1283
10 0.182
15 0.2287
20 0.268
25 0.3011
30 0.3268


## Homework Exercise
Build a 4-layer network for classifying CIFAR images.  Use 10k training examples (as in the last lecture) to shorten the training time.  

In [6]:
# 4-layer cifar using dropout
# quite a lot of things to change to get it to run

# switched test and training set to train faster
# used original training set as test set to print accuracy per 5 epochs

import tensorflow as tf
import numpy as np
from cifarHandler import cifar
from math import sqrt


# Load the CIFAR data via the project-local cifarHandler module.
# NOTE(review): the result is unpacked as (xTrain, yTrain, xTest, yTest), a
# different order from the mnist() cells - confirm it matches what cifar() returns.
xTrain, yTrain, xTest, yTest = cifar()


#build and initialize weights
def init_weights(shape, name, glorot=False):
    """Create the trainable weight matrix of one fully connected layer.

    shape  -- two-element sequence (n_inputs, n_outputs)
    name   -- name handed to the tf.Variable
    glorot -- True: sample uniformly from [-r, r] with
              r = sqrt(6 / (n_inputs + n_outputs));
              False (default): sample from a normal distribution with stddev 0.01

    Returns the new tf.Variable.
    """
    n_inputs, n_outputs = shape
    # The range is evaluated up front, whichever initializer ends up being used.
    init_range = sqrt(6.0 / (n_inputs + n_outputs))
    if not glorot:
        initial_value = tf.random_normal(shape, stddev=0.01)
    else:
        initial_value = tf.random_uniform(shape, -init_range, init_range)
    return tf.Variable(initial_value, name=name)
    
def momentum(list_of_weights, list_of_gradients, beta, delta):
    """Build the update ops for gradient descent with an accumulated step.

    list_of_weights   -- weight variables to train
    list_of_gradients -- gradients, in the same order as the weights
    beta              -- fraction of the accumulated step to keep, e.g. 0.85
    delta             -- learning rate, e.g. 0.01

    Returns a list holding the step-update ops for every weight followed by
    the weight-update ops for every weight; running them performs one step.
    """
    # One zero-initialised accumulator per weight, with the weight's shape.
    accumulators = []
    for w in list_of_weights:
        accumulators.append(
            tf.Variable(tf.zeros(w.get_shape(), dtype=tf.float32), name='step'))

    # step <- beta * step - (1 - beta) * delta * gradient
    step_updates = []
    for acc, g in zip(accumulators, list_of_gradients):
        step_updates.append(acc.assign(beta * acc - (1 - beta) * delta * g))

    # Each weight moves by the result of its step assignment, i.e. the new step.
    weight_updates = []
    for w, new_step in zip(list_of_weights, step_updates):
        weight_updates.append(w.assign(w + new_step))

    return step_updates + weight_updates


# Build the training graph: a 3072-300-50-10 fully connected network with
# ReLU + dropout on both hidden layers, trained with the hand-written momentum()
# updater. Clear the default graph first, then build in a Graph object of its own.
tf.reset_default_graph() 
graph = tf.Graph() 
with graph.as_default():
    # X: float32 inputs with 3072 columns (= 32*32*3, presumably flattened RGB
    # images - confirm in cifarHandler) and any number of rows.
    X = tf.placeholder(tf.float32, shape=[None, 3072])
    # Y: float32 targets with 10 columns (presumably one-hot labels - the
    # evaluation code takes np.argmax over axis 1 of the labels; confirm).
    Y = tf.placeholder(tf.float32, shape=[None, 10])
    # lr is only referenced by the commented-out plain gradient-descent lines below;
    # the active updater takes its learning rate from the momentum() call instead.
    lr = tf.constant(0.000001, dtype=tf.float32, name='lr') # if gradients approach infinity, decrease lr
    w1 = init_weights([3072, 300], 'w1') # weights between 1st and 2nd layer
    w2 = init_weights([300, 50], 'w2') # weights between 2nd and 3rd layer
    w3 = init_weights([50, 10], 'w3') # weights between 3rd and 4th layer

    #define network (no bias terms)
    # percent_keep is passed as tf.nn.dropout's second argument (the keep
    # probability); it must be fed on every sess.run that evaluates the network.
    percent_keep = tf.placeholder(tf.float32)
    h1 = tf.nn.relu(tf.matmul(X, w1))
    h1 = tf.nn.dropout(h1, percent_keep) # dropout on the first hidden layer
    h2 = tf.nn.relu(tf.matmul(h1, w2))
    h2 = tf.nn.dropout(h2, percent_keep) # dropout on the second hidden layer
    logits = tf.matmul(h2, w3)
    py_x = tf.nn.softmax(logits)
    y_pred = tf.argmax(py_x, dimension=1) # actual prediction, gives the index number that has highest value

    #define cost: per-row softmax cross-entropy, averaged over all rows fed in
    rows_of_cost = tf.nn.softmax_cross_entropy_with_logits(logits, Y, name='rows_of_cost')
    cost = tf.reduce_mean(rows_of_cost, reduction_indices=None, keep_dims=False, 
                          name='cost')

    #start building list that you'll reference in sess.run; it will execute cost function, weight updates
    udList = [cost]

    #use hand-crafted updater
    W = [w1, w2, w3]
    #calculate gradients
    grad = tf.gradients(cost, W) # one gradient per weight matrix (all three)

    # momentum (beta=0.85, delta=0.001); appends three step updates, then three
    # weight updates, so udList is now [cost, step x3, weight x3]
    udList = udList + momentum(W, grad, 0.85, 0.001)

    #form a list of the updates - including this in sess.run will force calculation of new weights each step
    #udList = udList + [w.assign(w - lr * g) for (w, g) in zip(W, grad)]

    #use tf.optimizer by uncommenting the following two lines (and modifying where necessary)
    #optimizer = tf.train.GradientDescentOptimizer(lr)
    #train = optimizer.minimize(cost)

    #output for tensorboard
    summary1 = tf.scalar_summary("Cost over time", cost) 
    summary2 = tf.histogram_summary('Weight w1 over time', w1)
    summary3 = tf.histogram_summary('Weight w2 over time', w2)
    summary4 = tf.histogram_summary('Weight w3 over time', w3)
    merged = tf.merge_summary([summary1, summary2, summary3, summary4]) 

    #add tensorboard output to sess.run list (it becomes the 8th and last fetch)
    udList.append(merged)

# Train the CIFAR dropout network and report accuracy every fifth pass.
# The roles of the two splits are swapped on purpose in this cell: mini-batches
# are drawn from xTest/yTest (keep probability 0.5) and accuracy is measured on
# xTrain/yTrain with dropout switched off (keep probability 1.0).
with tf.Session(graph=graph) as sess:
    result = sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter('logs/', graph=sess.graph)
    try:
        miniBatchSize = 40
        # (start, end) row bounds of every full mini-batch of xTest; a trailing
        # partial batch is dropped. Materialised as a list so that each pass can
        # iterate it again (a bare zip is a one-shot iterator on Python 3).
        startEnd = list(zip(range(0, len(xTest), miniBatchSize),
                            range(miniBatchSize, len(xTest) + 1, miniBatchSize)))
        costList = []
        nPasses = 31
        iteration = 0
        for iPass in range(nPasses):
            for (s, e) in startEnd:
                # udList fetches: cost, three step updates, three weight updates and
                # the merged summary; fetching the updates is what trains the net.
                [costVal, step1, step2, step3, weight1, weight2, weight3, tbSummary] = sess.run(
                    udList, feed_dict={X: xTest[s:e,], Y: yTest[s:e], percent_keep: 0.50})
                writer.add_summary(tbSummary, iteration)
                iteration += 1
                costList.append(costVal)
            if iPass % 5 == 0:
                # Accuracy on the held-out split: fraction of rows whose predicted
                # class equals the argmax of the label row.
                testResult = sess.run([y_pred], feed_dict={X:xTrain, percent_keep: 1.0})
                # Single pre-formatted string: same output under the Python 2
                # print statement and the Python 3 print function.
                print("%s %s" % (iPass, np.mean(np.argmax(yTrain, axis=1) == testResult)))
    finally:
        # Flush pending summary events to disk and release the event file,
        # even if training is interrupted.
        writer.close()

0 0.2511
5 0.3465
10 0.3646
15 0.3738
20 0.395
25 0.3917
30 0.4027
