In [None]:
from __future__ import absolute_import
from __future__ import print_function
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd.scipy.misc import logsumexp
from autograd import grad
from autograd.util import quick_grad_check

In [21]:
def load_mnist(source='/var/folders/c8/nyrw36d161b9z7tngmrc3qym0000gn/T/tmpGHWkKL.py'):
    """Load MNIST through a helper module and return it ready for training.

    The Python file at ``source`` is executed and its ``mnist()`` function is
    called; it must return ``(train_images, train_labels, test_images,
    test_labels)``.  Images are flattened to one row per example and divided
    by 255.0; labels are expanded to one-hot rows with 10 columns.

    Args:
        source: Path of the Python file that defines ``mnist()``.
            NOTE(review): the default is the temporary-file path the notebook
            was originally run with and is kept only for backward
            compatibility -- it presumably does not exist on other machines,
            so pass the real location of the data module explicitly.

    Returns:
        Tuple ``(N_data, train_images, train_labels, test_images,
        test_labels)`` where ``N_data`` is the number of training rows.
    """
    print("Loading training data...")

    def load_module(name, path):
        # `imp` is deprecated and was removed in Python 3.12, so prefer
        # importlib and only fall back to `imp` where the modern API is missing.
        try:
            from importlib.machinery import SourceFileLoader
            from importlib.util import module_from_spec, spec_from_loader
        except ImportError:
            import imp
            return imp.load_source(name, path)
        loader = SourceFileLoader(name, path)
        module = module_from_spec(spec_from_loader(name, loader))
        # This executes arbitrary code from `path`: only use trusted files.
        loader.exec_module(module)
        return module

    def partial_flatten(x):
        # Keep the leading (example) axis, merge all remaining axes into one.
        return np.reshape(x, (x.shape[0], np.prod(x.shape[1:])))

    def one_hot(x, K):
        # Row i holds a 1 in column x[i] (when 0 <= x[i] < K) and 0 elsewhere.
        return np.array(x[:, None] == np.arange(K)[None, :], dtype=int)

    data = load_module('data', source).mnist()
    train_images, train_labels, test_images, test_labels = data
    # presumably 8-bit pixel intensities, so this maps them into [0, 1] -- TODO confirm
    train_images = partial_flatten(train_images) / 255.0
    test_images = partial_flatten(test_images) / 255.0
    train_labels = one_hot(train_labels, 10)
    test_labels = one_hot(test_labels, 10)
    N_data = train_images.shape[0]
    return N_data, train_images, train_labels, test_images, test_labels

In [22]:
if __name__ == '__main__':
    # Bind the dataset to module-level names; later cells read them directly.
    (N_data,
     train_images, train_labels,
     test_images, test_labels) = load_mnist()
    # Sanity check: flattened training matrix shape, then the example count.
    print(train_images.shape, N_data, sep="\n")

Loading training data...
(60000, 784)
60000


In [3]:
def make_nn_funs(layer_sizes, L2_reg):
    """Build the functions of a fully connected tanh network with a flat parameter vector.

    Args:
        layer_sizes: Sequence of layer widths, input first and output last.
        L2_reg: Coefficient multiplying the squared norm of the whole
            parameter vector (weights and biases) in the loss.

    Returns:
        Tuple ``(N, predictions, loss, frac_err)`` where ``N`` is the required
        length of the flat parameter vector and the three functions all take
        that vector as their first argument.
    """
    # Materialise the (fan_in, fan_out) pairs.  On Python 3 zip() returns a
    # one-shot iterator: the sum() below would exhaust it and every later
    # unpack_layers() call would silently yield no layers at all.
    shapes = list(zip(layer_sizes[:-1], layer_sizes[1:]))
    # Each layer stores an (m, n) weight matrix followed by n biases.
    N = sum((m + 1) * n for m, n in shapes)

    def unpack_layers(W_vect):
        """Yield ``(weights, biases)`` per layer, consuming W_vect front to back."""
        for m, n in shapes:
            yield W_vect[:m*n].reshape((m, n)), W_vect[m*n:m*n+n]
            W_vect = W_vect[(m+1)*n:]

    def predictions(W_vect, inputs):
        """Return per-row normalised log-probabilities of the final layer."""
        for W, b in unpack_layers(W_vect):
            outputs = np.dot(inputs, W) + b
            # tanh feeds the next layer only; the last layer's raw `outputs`
            # are what get normalised below.
            inputs = np.tanh(outputs)
        return outputs - logsumexp(outputs, axis=1, keepdims=True)

    def loss(W_vect, X, T):
        """Return L2 penalty minus the summed log-probability selected by T."""
        log_prior = -L2_reg * np.dot(W_vect, W_vect)
        log_lik = np.sum(predictions(W_vect, X) * T)
        return - log_prior - log_lik

    def frac_err(W_vect, X, T):
        """Return the fraction of rows whose argmax prediction differs from T's argmax."""
        return np.mean(np.argmax(T, axis=1) != np.argmax(predictions(W_vect, X), axis=1))

    return N, predictions, loss, frac_err

In [7]:
if __name__ == '__main__':
    # Smoke test: build the network functions and show what came back.
    layer_sizes = [784, 200, 100, 10]
    L2_reg = 1.0
    N, predictions, loss, frac_err = make_nn_funs(layer_sizes=layer_sizes,
                                                  L2_reg=L2_reg)
    # One item per line: parameter count, then the two function objects.
    print(N, loss, frac_err, sep="\n")

178110
<function loss at 0x104043230>
<function frac_err at 0x104043398>


In [None]:
def make_batches(N_data, batch_size):
    """Return consecutive slices covering range(N_data), each at most batch_size long.

    The final slice is clipped to N_data, so it may be shorter than the rest.
    """
    batches = []
    for start in range(0, N_data, batch_size):
        stop = min(start + batch_size, N_data)
        batches.append(slice(start, stop))
    return batches

In [30]:
# Network parameters
# Layer widths, input to output: 784 matches the flattened image width and 10
# matches the number of one-hot label columns.
layer_sizes = [784, 200, 100, 10]
# Coefficient on the squared norm of the full parameter vector in the loss.
L2_reg = 1.0

# Training parameters
# Scale applied to the standard-normal draws used for the initial weights.
param_scale = 0.1
# Step size multiplying the momentum direction in each weight update.
learning_rate = 1e-3
# Weight kept on the previous direction; (1 - momentum) goes to the new gradient.
momentum = 0.9
# Maximum number of training rows per minibatch slice.
batch_size = 256
# Number of full passes over the minibatch list.
num_epochs = 50

# Make neural net functions
# NOTE: this rebinds layer_sizes, L2_reg and frac_err, which an earlier cell
# also assigns; the values here are the ones the training cells use.
N_weights, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)
# Gradient function for loss_fun; the training loop calls it as
# loss_grad(W, X, T) and uses the result as the gradient for the weight update.
loss_grad = grad(loss_fun)

In [31]:
# Length of the flat parameter vector: weights plus biases of every layer.
print(N_weights)

178110


In [32]:
# Initialize weights
# Use a fixed seed so that restarting the kernel and re-running the notebook
# reproduces the same initial weights (and therefore the same training run);
# an unseeded RandomState() gives a different starting point on every run.
rs = npr.RandomState(0)
W = rs.randn(N_weights) * param_scale

# Check the gradients numerically, just to be safe
# NOTE: this evaluates the loss on the entire training set, not a minibatch.
quick_grad_check(loss_fun, W, (train_images, train_labels))



def print_perf(epoch, W):
    """Print one table row: epoch, training error fraction, test error fraction.

    Both error fractions are computed with frac_err on the full train and
    test sets held in module-level variables.
    """
    train_err = frac_err(W, train_images, train_labels)
    test_err = frac_err(W, test_images, test_labels)
    row = "{0:15}|{1:15}|{2:15}".format(epoch, train_err, test_err)
    print(row)

# Train with sgd
# Minibatch index slices over the training rows, computed once and reused
# in the same order every epoch.
batch_idxs = make_batches(train_images.shape[0], batch_size)
# Momentum buffer: one entry per parameter, starting at zero.
cur_dir = np.zeros(N_weights)

Checking gradient of <function loss at 0x106668a28> at [ 0.0013496  -0.10268497  0.07023831 ..., -0.10388204  0.06244274
 -0.02446704]
Gradient projection OK (numeric grad: -432.491936954, analytic grad: -432.491936705)


In [1]:
# Header for the table rows emitted by print_perf.
print("    Epoch      |    Train err  |   Test err  ")
for epoch in range(num_epochs):
    # Report before this epoch's updates: row k shows the error after k
    # completed epochs (row 0 is the untrained network), and the error after
    # the final epoch's updates is never printed.
    print_perf(epoch, W)
    for idxs in batch_idxs:
        # Gradient of the loss on the current minibatch only.
        grad_W = loss_grad(W, train_images[idxs], train_labels[idxs])
        # Momentum: exponential moving average of the minibatch gradients.
        cur_dir = momentum * cur_dir + (1.0 - momentum) * grad_W
        # In-place step. W and cur_dir are initialised in a different cell, so
        # re-running only this cell continues training instead of restarting.
        W -= learning_rate * cur_dir

Loading training data...
Checking gradient of <function loss at 0x106636f50> at [ 0.05768681 -0.03568799 -0.13128437 ..., -0.08752415 -0.06350238
  0.07270739]
Gradient projection OK (numeric grad: -199.372684874, analytic grad: -199.372684035)
    Epoch      |    Train err  |   Test err  
              0| 0.893533333333|         0.9018
              1|0.0854833333333|         0.0796
              2|0.0790833333333|         0.0747
              3|        0.07545|         0.0726
              4|0.0729833333333|         0.0708
              5|        0.07075|         0.0687
              6|0.0691833333333|         0.0671
              7|0.0678666666667|         0.0666
              8|0.0666666666667|          0.065
              9|        0.06555|         0.0642
             10|0.0648333333333|         0.0634
             11|0.0641333333333|         0.0629
             12|0.0635333333333|         0.0625
             13|0.0630166666667|         0.0622
             14|         0.0629|     