In [1]:
# To support both python 2 and python 3.
# The __future__ import has to come before every other statement: the print()
# calls below are otherwise executed as Python 2 print *statements* and show
# their arguments as a tuple.
from __future__ import division, print_function, unicode_literals

import numpy as np
import random
import os
import sys
from datetime import datetime
from functools import partial
from itertools import compress

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

# Record the environment this notebook was run with.
print("Tensorflow:", tf.__version__)
print("Python Version:", sys.version)

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    """Discard the current default graph and reseed TensorFlow and NumPy.

    Args:
        seed: value handed to both ``np.random.seed`` and
            ``tf.set_random_seed``.
    """
    np.random.seed(seed)
    # The TensorFlow seed is set after the reset so it applies to the new
    # default graph rather than the one being thrown away.
    tf.reset_default_graph()
    tf.set_random_seed(seed)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Global matplotlib font sizes: axis labels first, then x/y tick labels.
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

def new_logdir():
    """Build a log directory path that is unique per call time.

    Returns:
        A string of the form ``tf_logs/run-<YYYYmmddHHMMSS>/`` where the
        timestamp is the current UTC time. The trailing slash is part of the
        returned value. Nothing is created on disk.
    """
    return "{}/run-{}/".format(
        "tf_logs",
        datetime.utcnow().strftime("%Y%m%d%H%M%S"),
    )

(u'Tensorflow:', '1.1.0')
(u'Python Version:', '2.7.14 |Anaconda, Inc.| (default, Dec  7 2017, 17:05:42) \n[GCC 7.2.0]')


# Exercise 1
Is it okay to initialize all the weights to the same value as long as that value is selected randomly using He Initialization?



# Exercise 2
Is it okay to initialize the bias terms to 0?



# Exercise 3
Name 3 advantages of the ELU activation function over the ReLU?
1. It has a nonzero derivative for negative inputs, which avoids the dying-units problem that ReLU suffers from.
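2. It can output negative values, so the average activation stays closer to 0, which helps alleviate the vanishing gradients problem.
3. It is smooth everywhere, including around z = 0 (when α = 1), which helps Gradient Descent converge faster.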

# Exercise 4
In which cases would you want to use each of the following activation functions?
1. ELU:
2. Leaky ReLU (and variants):
3. ReLU: 
4. Tanh:
5. Logistic:
6. Softmax: to use on the output layer to output probabilities.

# Exercise 5
What may happen if you set the momentum hyperparameter too close to 1 when using the `MomentumOptimizer`?

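With momentum very close to 1 there is almost no friction, so the accumulated velocity dominates each new gradient. Similar to using a very high learning rate, the optimizer may overshoot the minimum, oscillate around it several times, and take a long time to converge.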

# Exercise 6
Name three ways you can produce a sparse model?
1. You can set weights to 0 when their magnitude is below a certain threshold.
2. You can use a strong L1 regularizer (allows 0 weights).
3. You can use FTRL (Follow The Regularized Leader, e.g. `tf.train.FtrlOptimizer`) together with L1 regularization, an optimization algorithm that tends to drive many weights to exactly 0 during training.

# Exercise 7 
Does dropout slow down training? Does it slow down inference? 

It does slow down training because it will take a while for it to converge, but it shouldn't slow down inference because we are not performing dropout at that stage.

# Exercise 8

In [2]:
# Load MNIST through the TensorFlow tutorial helper, using /tmp/data/ as the
# data directory. The result exposes the .train and .test splits used below.
mnist = input_data.read_data_sets("/tmp/data/")

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [3]:
# Split MNIST into two label groups: "l4" (labels <= 4) and "g5" (the rest).
X_train = mnist.train.images
X_test = mnist.test.images
y_train = mnist.train.labels.astype("int")
y_test = mnist.test.labels.astype("int")

# Vectorized boolean masks (True where the label is <= 4); this yields the
# same arrays as a per-element Python loop without iterating in Python.
keep_l4_train = y_train <= 4
keep_l4_test = y_test <= 4

# Complementary masks, computed once and reused for both images and labels.
keep_g5_train = ~keep_l4_train
keep_g5_test = ~keep_l4_test

X_train_l4 = X_train[keep_l4_train]
X_test_l4 = X_test[keep_l4_test]
y_train_l4 = y_train[keep_l4_train]
y_test_l4 = y_test[keep_l4_test]

X_train_g5 = X_train[keep_g5_train]
X_test_g5 = X_test[keep_g5_test]
y_train_g5 = y_train[keep_g5_train]
y_test_g5 = y_test[keep_g5_test]

# Sanity check: first few labels of each group.
print(y_train_l4[:5])
print(y_train_g5[:5])

[3 4 1 1 0]
[7 6 8 9 8]


In [4]:
# 1) DNN with 5L[100]N, He, and ELU
# 2) Adam, and Early stopping. 
# 3) Save model and checkpoints.
# 4) Train on 0-4 MNIST
# 5) Tune with cross-validation. Precision?
# 6) Batch-Normalization. Compare?
# 7) Overfitting? Use Drop. How about now?

# Network dimensions: flattened 28x28 input, five hidden layers, 5 output classes.
n_inputs = 28*28
layer_sizes = [100, 100, 100, 100, 100]
n_outputs = 5

# Learning rate handed to the Adam optimizer below.
lr = 0.01

# Every hidden layer shares the same activation and kernel initializer, so
# bind them once and only vary inputs / size / name per layer.
he_init = tf.contrib.layers.variance_scaling_initializer()
elu = tf.nn.elu
my_dense_layer = partial(tf.layers.dense, activation=elu, kernel_initializer=he_init)

with tf.name_scope("inputs"):
    X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
    # Note: (None) is plain None (not a 1-tuple), i.e. the shape is left unspecified.
    y = tf.placeholder(tf.int64, shape=(None), name="y")
    
with tf.name_scope("dnn"):
    # `hidden` keeps a handle on every hidden layer's output tensor, in order.
    hidden = []
    last_hidden = X
    # enumerate() instead of xrange(): xrange does not exist on Python 3,
    # which this notebook states it supports.
    for i, n_units in enumerate(layer_sizes):
        last_hidden = my_dense_layer(last_hidden, n_units, name="hidden"+str(i))
        hidden.append(last_hidden)
    # No activation on the output layer: the loss below expects raw logits.
    logits = tf.layers.dense(last_hidden, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
    
with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train = optimizer.minimize(loss)
    
with tf.name_scope("eval"):
    # A prediction counts as correct when the label is the top-1 logit.
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    acc_sum = tf.summary.scalar("Accuracy", accuracy)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

print("Ready!")

Ready!


In [5]:
def make_batch(X, y, batch_size):
    """Draw a random mini-batch from paired arrays.

    Row indices are drawn uniformly from ``[0, len(X))`` with
    ``np.random.randint``, so the same row may be picked more than once.

    Args:
        X: array indexable by an integer index array.
        y: array indexed with the same positions as ``X``.
        batch_size: number of indices to draw.

    Returns:
        Tuple ``(X[picks], y[picks])`` built from the same index array.
    """
    picks = np.random.randint(len(X), size=batch_size)
    X_batch = X[picks]
    y_batch = y[picks]
    return X_batch, y_batch

In [6]:
# Train on the 0-4 subset with early stopping, checkpointing every new best.
batch_size = 500
# Minimum accuracy gain over the previous best that resets the patience counter.
best_threshold = 0.001
# Stop once this many consecutive steps pass without a significant improvement.
max_steps_since_best = 100

# Make sure the checkpoint directory exists before the first save.
ckpt_dir = "./ckpts"
if not os.path.isdir(ckpt_dir):
    os.makedirs(ckpt_dir)

log_dir = new_logdir()
train_writer = tf.summary.FileWriter(os.path.join(log_dir, "train"), tf.get_default_graph())
test_writer = tf.summary.FileWriter(os.path.join(log_dir, "test"), tf.get_default_graph())

acc_train = 0
acc_test = 0
acc_test_best = 0
steps_since_best = 0
best_ckpt = ""
# NOTE: despite the name, `epoch` counts training steps (one mini-batch each).
epoch = 0
try:
    with tf.Session() as sess:
        init.run()
        while steps_since_best < max_steps_since_best:
            X_batch, y_batch = make_batch(X_train_l4, y_train_l4, batch_size)
            sess.run(train, feed_dict={X:X_batch, y:y_batch})
            # Accuracy on the batch just trained on (measured after the update).
            acc_train, train_sum = sess.run(
                [accuracy, acc_sum],
                feed_dict={X:X_batch, y:y_batch})
            # NOTE(review): early stopping is driven by test-set accuracy, so
            # the reported test accuracy is optimistically biased; a held-out
            # validation split would be cleaner.
            acc_test, test_sum = sess.run(
                [accuracy, acc_sum],
                feed_dict={X:X_test_l4, y:y_test_l4})
            if epoch % 10 == 0:
                train_writer.add_summary(train_sum, epoch)
                test_writer.add_summary(test_sum, epoch)
                print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)
            steps_since_best += 1
            epoch += 1
            if acc_test_best < acc_test:
                # Any improvement is checkpointed, but only an improvement
                # larger than best_threshold resets the patience counter.
                if acc_test_best + best_threshold < acc_test:
                    steps_since_best = 0
                acc_test_best = acc_test
                best_ckpt = saver.save(sess, "./ckpts/epoch_" + str(epoch) + ".ckpt")
        # Roll back to the best checkpoint (if one was written) before the final save.
        if best_ckpt:
            saver.restore(sess, best_ckpt)
        save_path = saver.save(sess, "./my_model_final.ckpt")
finally:
    # Close the event files even when training is interrupted or raises.
    train_writer.close()
    test_writer.close()

0 Train accuracy: 0.694 Test accuracy: 0.668418
10 Train accuracy: 0.922 Test accuracy: 0.929169
20 Train accuracy: 0.95 Test accuracy: 0.958552
30 Train accuracy: 0.956 Test accuracy: 0.962833
40 Train accuracy: 0.974 Test accuracy: 0.972368
50 Train accuracy: 0.968 Test accuracy: 0.974509
60 Train accuracy: 0.978 Test accuracy: 0.978984
70 Train accuracy: 0.97 Test accuracy: 0.976844
80 Train accuracy: 0.974 Test accuracy: 0.98093
90 Train accuracy: 0.978 Test accuracy: 0.981125
100 Train accuracy: 0.972 Test accuracy: 0.982876
110 Train accuracy: 0.988 Test accuracy: 0.983265
120 Train accuracy: 0.988 Test accuracy: 0.982681
130 Train accuracy: 0.986 Test accuracy: 0.981125
140 Train accuracy: 0.984 Test accuracy: 0.980541
150 Train accuracy: 0.982 Test accuracy: 0.972174
160 Train accuracy: 0.988 Test accuracy: 0.981319
170 Train accuracy: 0.982 Test accuracy: 0.981903
180 Train accuracy: 0.986 Test accuracy: 0.984627
190 Train accuracy: 0.984 Test accuracy: 0.985406
200 Train accu