In [1]:
import torch
from torch import Tensor
from torch.autograd import Variable
import numpy as np
import gzip
import pickle
import timeit

In [2]:
class NeuralNetworkMNIST(object):
    """
    A fully connected feed-forward neural network with sigmoid activations,
    designed to be trained on the MNIST database.

    Attributes (all set by the constructor):
    nlayers -- number of layers, the input layer included
    layersz -- the list of layer sizes passed to the constructor
    biases  -- list of float Variables of shape (n, 1), one per non-input layer
    weights -- list of float Variables of shape (n, m), one per non-input layer,
               where m is the size of the previous layer
    """

    def __init__(self, layersz):
        """
        Initializes biases and weights with random values from a N(0,1) distribution.
        Since we are using this network with the MNIST database, the input layer must
        be of size 784 = 28 x 28 (the number of pixels of each image). Also, the output
        layer must be of size 10 (to represent 0...9)

        Parameters:
        layersz -- list of layer sizes, layersz[0] corresponding to the input layer

        Raises:
        RuntimeError -- if the first layer size is not 784 or the last one is not 10
        """
        if layersz [0] != 784: raise RuntimeError('The size of the input layer must be 784')
        if layersz[-1] !=  10: raise RuntimeError('The size of the output layer must be 10')

        # the following convention is used for weights:
        # w[i,j] denotes the weight associated with the connection from neuron
        # 'j' in the previous layer to the neuron 'i' in the current layer.

        self.nlayers = len(layersz)
        self.layersz = layersz

        # we init via the numpy random number generator so we can compare with other implementation
        # for debugging purposes (seed numpy before constructing for reproducible parameters)
        bs = [np.random.randn(i, 1) for i in layersz[1:]]
        ws = [np.random.randn(j, i) for i, j in zip(layersz[:-1], layersz[1:])]

        # numpy draws float64; the parameters are stored as float32 Variables tracked by autograd
        self.biases  = [Variable(torch.from_numpy(b).float(), requires_grad=True) for b in bs]
        self.weights = [Variable(torch.from_numpy(w).float(), requires_grad=True) for w in ws]


    def feedforward(self, a):
        """
        Propagates a given input vector forward through the network and returns the output.

        Parameters:
        a -- network input, a 2D tensor of shape (n,1), where n is the size of the input layer

        Returns:
        The activations of the last layer, a tensor of shape (m,1), where m is the
        size of the output layer.

        Raises:
        RuntimeError -- if 'a' does not have shape (n,1)
        """
        if a.data.shape != (self.layersz[0], 1):
            raise RuntimeError('Input tensor has wrong shape')

        # each layer computes sigmoid(w * a + b) from the activations of the previous one
        for b, w in zip(self.biases, self.weights):
            a = torch.sigmoid(torch.mm(w, a) + b)
        return a


    def SGD(self, training_data, epochs, batchsz, eta, test_data=None):
        """
        Train the neural network using batch stochastic gradient descent.
        The network weights and biases are updated as the result of running this method.
        Both 'training_data' and 'test_data' are lists of tuples, each tuple being an
        example - the first element is the network input, the second is the target output.

        The examples are reshuffled (through the numpy random number generator) at
        the start of every epoch. The shuffling happens on an internal copy, so the
        sequence passed in as 'training_data' keeps its order.

        One progress line is printed per epoch; it includes the number of correctly
        classified test examples when 'test_data' is given.

        Parameters:
        training_data -- list of tuples representing training inputs and the desired outputs
        epochs        -- for how many epochs to train the network
        batchsz       -- the size of each batch of training example (this is *stochastic* GD)
        eta           -- the learning rate
        test_data     -- used to evaluate the performance of the network at the end of each epoch
        """
        # private copy: shuffling below must not reorder the caller's data
        examples = list(training_data)

        for j in range(epochs):
            start_time = timeit.default_timer()

            # break up the training data into batches (the last one may be shorter)
            np.random.shuffle(examples)
            batches = [examples[k:k+batchsz] for k in range(0, len(examples), batchsz)]

            # SGD means that we update weights/biases based on gradients calculated
            # using only a batch of training examples (as opposed to the entire training data)
            for batch in batches:
                # backward() accumulates into .grad, so start every batch from zero;
                # .grad is still None before the very first backward pass
                for param in self.biases + self.weights:
                    if param.grad is not None:
                        param.grad.data.zero_()

                # calculate the (stochastic) gradient, summed over the batch
                # below 'x' represents an input vector, 'y' the target output
                for x, y in batch:
                    out  = self.feedforward(x)
                    loss = (out - y).pow(2).sum()  # <-- quadratic loss function
                    loss.backward()

                # update weights/biases in the direction of the stochastic gradient;
                # dividing by the batch length turns the summed gradient into an average
                eta_avg = eta/len(batch)
                for param in self.weights + self.biases:
                    param.data = param.data - eta_avg * param.grad.data

            # the elapsed time covers training only, not the evaluation below
            dt = timeit.default_timer() - start_time
            if test_data:
                print("Epoch %2d: %d of %d (elapsed time: %fs)" % (j+1, self.evaluate(test_data), len(test_data), dt))
            else:
                # both values must be inside the tuple given to the % operator
                print("Epoch %2d complete  (elapsed time: %fs)" % (j+1, dt))


    def evaluate(self, test_data):
        """
        Evaluates the performance of the neural network on a data set not used for training.

        Parameters:
        test_data -- iterable of tuples (x, y): 'x' is a network input and 'y' a
                     tensor holding the digit that 'x' represents

        Returns:
        The number of examples for which the index of the largest output
        activation equals the expected digit.
        """
        test_results = [(np.argmax(self.feedforward(x).data.numpy()), y) for (x, y) in test_data]
        return sum(int(predicted == expected.data.numpy()) for (predicted, expected) in test_results)

In [3]:
# Where to find the file storing the MNIST database (a gzip-compressed pickle).
# This is a relative path, so it is resolved against the current working directory.
MNIST_DATA_FILEPATH = "mnist.pkl.gz"

def load_data_raw(filepath=None):
    """
    Return the MNIST data as a tuple containing the training data,
    the validation data, and the test data.

    The description below applies to the standard 'mnist.pkl.gz' file.

    The 'training_data' is returned as a tuple with two entries.
    The first entry contains the actual training images.  This is a
    numpy ndarray with 50,000 entries.  Each entry is, in turn, a
    numpy ndarray with 784 values, representing the 28 * 28 = 784
    pixels in a single MNIST image.

    The second entry in the 'training_data' tuple is a numpy ndarray
    containing 50,000 entries.  Those entries are just the digit
    values (0...9) for the corresponding images contained in the first
    entry of the tuple.

    The 'validation_data' and 'test_data' are similar, except
    each contains only 10,000 images.

    Parameters:
    filepath -- path of the gzip-compressed pickle to read; when omitted
                (or None) the module-level MNIST_DATA_FILEPATH is used
    """
    if filepath is None:
        filepath = MNIST_DATA_FILEPATH

    # SECURITY: unpickling can execute arbitrary code - only load files from a trusted source.
    # The 'with' block closes the file even if unpickling fails.
    with gzip.open(filepath, 'rb') as f:
        training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
    return (training_data, validation_data, test_data)


def load_data():
    """
    Repackages the data returned by 'load_data_raw' in a format
    more convenient for using with the neural network.

    Return a tuple (training_data, validation_data, test_data).

    'training_data'   is a list of 2-tuples (x, y), one per training image
    'validation_data' is a list of 2-tuples (x, z), one per validation image
    'test_data'       is a list of 2-tuples (x, z), one per test image

    (50,000 / 10,000 / 10,000 entries for the standard MNIST file.)

    'x' is a float torch Variable of shape (784, 1) containing the input image.
    'y' is a float torch Variable of shape ( 10, 1) representing the digit encoded
        by 'x' (it has 0 entries with the exception of one 1 in the position
        of the digit represented by 'x')
    'z' is the digit represented by 'x', as one element of a float torch Variable

    None of the returned Variables requires gradients.
    """
    tr_d, va_d, te_d = load_data_raw()

    def as_inputs(images):
        # every flat image becomes a float column vector of shape (784, 1)
        return [Variable(torch.from_numpy(np.reshape(x, (784, 1))).float(), requires_grad=False)
                for x in images]

    def as_digits(labels):
        # the whole label array becomes one float Variable; zipping over it yields one digit per image
        return Variable(torch.from_numpy(labels).float(), requires_grad=False)

    # training targets are one-hot column vectors rather than plain digits
    training_results = [Variable(torch.from_numpy(asvector(y)).float(), requires_grad=False)
                        for y in tr_d[1]]

    # materialize the pairs: a bare zip() is a one-shot iterator without len(),
    # while callers are promised lists
    training_data   = list(zip(as_inputs(tr_d[0]), training_results))
    validation_data = list(zip(as_inputs(va_d[0]), as_digits(va_d[1])))
    test_data       = list(zip(as_inputs(te_d[0]), as_digits(te_d[1])))

    return (training_data, validation_data, test_data)


def asvector(j):
    """
    Build the one-hot column vector for digit 'j'.

    Returns a float numpy array of shape (10, 1) holding 1.0 in row 'j'
    and 0.0 in every other row.
    """
    onehot = np.zeros(shape=(10, 1))
    onehot[j] = 1.0
    return onehot

# load the MNIST database, materializing each of the three data sets as a list
print("Loading the MNIST database...")
training_data, validation_data, test_data = map(list, load_data())
print("Done")

Loading the MNIST database...
Done


In [4]:
# build the (still untrained) neural network

layer_sizes = [784, 30, 10]  # <-- first entry must be 784, last one must be 10

# seed numpy right before construction: the initial parameters are drawn from its generator
np.random.seed(1234)
net = NeuralNetworkMNIST(layersz=layer_sizes)

print(net.weights[0])

Variable containing:
 7.6200e-02 -5.6645e-01  3.6142e-02  ...  -2.8836e-01 -2.1979e-01  2.0025e-01
-8.4550e-01  2.6429e+00 -3.3374e-01  ...   1.3265e+00  8.4115e-01  9.2409e-02
 1.0899e+00  2.0696e+00  9.5819e-01  ...  -1.0890e+00  2.9177e-01  2.5380e-01
                ...                   ⋱                   ...                
-1.1851e+00  6.6195e-01 -5.0461e-01  ...   1.2052e+00 -5.0683e-01  1.5786e-01
 5.3238e-01  5.2275e-02 -2.1909e+00  ...  -8.4788e-01 -2.1325e+00 -3.3003e-01
-1.6596e-01  5.0059e-03 -1.5762e-01  ...  -2.8192e-01 -9.6807e-01 -5.6724e-01
[torch.FloatTensor of size 30x784]



In [5]:
# baseline: how many test images does the network classify correctly before any training?

nright = net.evaluate(test_data)
ntotal = len(test_data)
accuracy_pct = 100*float(nright)/ntotal
print("Untrained network: got right %d out of %d (accuracy %.2f pct)" % (nright, ntotal, accuracy_pct))

Untrained network: got right 988 out of 10000 (accuracy 9.88 pct)


In [6]:
# train the network, reporting the score on the test data after every epoch

EPOCHS  = 30   # number of passes over the training data
BATCHSZ = 10   # number of examples per batch
ETA     =  2   # learning rate

net.SGD(training_data, epochs=EPOCHS, batchsz=BATCHSZ, eta=ETA, test_data=test_data)

Epoch  1: 9052 of 10000 (elapsed time: 31.136235s)
Epoch  2: 9197 of 10000 (elapsed time: 30.378844s)
Epoch  3: 9225 of 10000 (elapsed time: 30.323463s)
Epoch  4: 9313 of 10000 (elapsed time: 29.917347s)
Epoch  5: 9371 of 10000 (elapsed time: 30.168319s)
Epoch  6: 9344 of 10000 (elapsed time: 29.575896s)
Epoch  7: 9386 of 10000 (elapsed time: 30.110790s)
Epoch  8: 9423 of 10000 (elapsed time: 30.147428s)
Epoch  9: 9445 of 10000 (elapsed time: 29.946958s)
Epoch 10: 9412 of 10000 (elapsed time: 30.106696s)
Epoch 11: 9398 of 10000 (elapsed time: 29.655067s)
Epoch 12: 9466 of 10000 (elapsed time: 29.782650s)
Epoch 13: 9437 of 10000 (elapsed time: 30.256443s)
Epoch 14: 9414 of 10000 (elapsed time: 30.343114s)
Epoch 15: 9402 of 10000 (elapsed time: 30.050161s)
Epoch 16: 9396 of 10000 (elapsed time: 29.985712s)
Epoch 17: 9459 of 10000 (elapsed time: 29.357427s)
Epoch 18: 9487 of 10000 (elapsed time: 29.898439s)
Epoch 19: 9470 of 10000 (elapsed time: 29.923440s)
Epoch 20: 9461 of 10000 (elapse