### MNIST Neural Network with Autograd
<br>
This notebook implements a neural network for recognizing handwritten digits, trained using the MNIST database. The backpropagation algorithm is <b>not</b> implemented explicitly. Instead, we build the computational graph associated with the network and calculate the gradients via reverse-mode differentiation. We carry out these calculations with the help of `pytorch`.

In [12]:
import gzip
import numpy as np
import pickle
import timeit
import torch
from torch import Tensor, LongTensor
from torch.autograd import Variable
import line_profiler
%load_ext line_profiler

# Tensor type aliases: every numpy array handed to the network is converted with
# `.type(dtype_float)`; `dtype_int` is the integer counterpart (not referenced in
# the cells shown here).
dtype_float, dtype_int = torch.FloatTensor, torch.IntTensor

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [13]:
class NeuralNetworkMNISTAutoGrad(object):
    """
    A fully connected, sigmoid-activated Neural Network designed to be used with
    the MNIST database.
    Gradients are calculated using reverse mode differentiation on
    the computational graphs associated with the network (via 'loss.backward()'),
    so no hand-written backpropagation code is needed.
    """

    def __init__(self, layersz):
        """
        Pass in a list of layer sizes (layersz[0]/layersz[-1] are the input/output layers).
        The size of this list is the number of layers in the network.
        Since we are using this network with the MNIST database, the input layer must
        be of size 784 = 28 x 28 (the number of pixels of each image). Also, the output
        layer must be of size 10 (to represent 0...9).

        PARAMETERS:
        layersz -- list of layer sizes

        RAISES:
        RuntimeError if the first/last layer size is not 784/10
        """
        if layersz [0] != 784: raise RuntimeError('The size of the input layer must be 784')
        if layersz[-1] !=  10: raise RuntimeError('The size of the output layer must be 10')

        self.nlayers = len(layersz)
        self.layersz = layersz

        # Initializes biases and weights with random values from a N(0,1) distribution.
        # The following convention is used for weights:
        #    w[i,j] denotes the weight associated with the connection from neuron
        #   'j' in the previous layer to the neuron 'i' in the current layer.

        # NOTE: we use the numpy random number generator as opposed to the pytorch one,
        #       so we can compare with other implementations, for debugging
        bs = [np.random.randn(i, 1) for i in layersz[1:]]
        ws = [np.random.randn(i, j) for i, j in zip(layersz[1:], layersz[:-1])]

        # One bias column / weight matrix per non-input layer; requires_grad=True makes
        # autograd accumulate d(loss)/d(parameter) in '.grad' on every backward() call.
        self.biases  = [Variable(torch.from_numpy(b).type(dtype_float), requires_grad=True) for b in bs]
        self.weights = [Variable(torch.from_numpy(w).type(dtype_float), requires_grad=True) for w in ws]


    def feedforward(self, a):
        """
        Propagates a given input vector forward through the network and returns the output.
        PARAMETERS:
        a Variable/Tensor of size [784, 1], representing a digit image from the MNIST database
        RETURN:
        a Variable/Tensor of size [10, 1] holding the sigmoid activations of the output
        layer; 'evaluate' takes the index of the largest entry as the predicted digit (0..9)
        RAISES:
        RuntimeError if the input does not have shape (layersz[0], 1)
        """
        if a.data.shape != (self.layersz[0], 1):
            raise RuntimeError('Input array has wrong shape - must be (784, 1)')

        # each layer computes sigmoid(w * a + b), feeding its activation to the next one
        for b, w in zip(self.biases, self.weights):
            a = torch.sigmoid(torch.mm(w, a) + b)
        return a


    def SGD(self, training_data, epochs, batchsz, eta, test_data=None):
        """
        Train the neural network using batch stochastic gradient descent.
        The network weights and biases are updated as the result of running this method.
        Both 'training_data' and 'test_data' are lists of tuples, each tuple being an
        example - the first element is the network input, the second is the target output.
        For both data sets, the first tuple element is an MNIST digit image, represented
        as a Variable/Tensor of size [784, 1].
        The target output (the digit associated with the image) is represented as a one-hot
        Variable/Tensor of size [10, 1] in 'training_data', and as the actual digit (0..9) in
        'test_data'.

        NOTE: 'training_data' is shuffled IN PLACE at the start of every epoch, so the
        caller's list is reordered as a side effect.

        A progress line is printed at the end of every epoch; when 'test_data' is given
        it also reports the number of correctly classified test examples.

        PARAMETERS:
        training_data -- list of tuples representing training inputs and the desired outputs
        epochs        -- for how many epochs to train the network
        batchsz       -- the size of each batch of training example (this is *stochastic* GD)
        eta           -- the learning rate
        test_data     -- used to evaluate the performance of the network at the end of each epoch
        """
        for j in range(epochs):
            start_time = timeit.default_timer()

            # break up the training data into batches
            np.random.shuffle(training_data)
            batches = [training_data[k:k+batchsz] for k in range(0, len(training_data), batchsz)]

            # SGD means that we update weights/biases based on gradients calculated
            # using only a batch of training examples (as opposed to the entire training data)
            for batch in batches:

                # calculate the (stochastic) gradient
                # below 'x' represents an input image, 'y' the associated digit
                # backward() ADDS to the existing '.grad' buffers, so after this loop they
                # hold the sum of the per-example gradients over the whole batch
                for x, y in batch:
                    out  = self.feedforward(x)
                    loss = (out - y).pow(2).sum()  # <-- quadratic loss function
                    loss.backward()

                # update weights/biases in the direction of the stochastic gradient
                # dividing by len(batch) turns the summed gradient into a batch average
                # (the last batch may be shorter than 'batchsz')
                eta_scaled = float(eta)/len(batch)
                for w in self.weights: w.data -= eta_scaled * w.grad.data
                for b in self.biases:  b.data -= eta_scaled * b.grad.data

                # reset the accumulated gradients before processing the next batch
                for b in self.biases:  b.grad.data.zero_()
                for w in self.weights: w.grad.data.zero_()

            # the elapsed time covers training only - it is taken before the evaluation below
            dt = timeit.default_timer() - start_time
            if test_data:
                print("Epoch %2d: %d of %d (elapsed time: %fs)" % (j+1, self.evaluate(test_data), len(test_data), dt))
            else:
                # both values must be part of the tuple passed to '%': with 'dt' outside
                # the tuple the format string raised "not enough arguments" (TypeError)
                print("Epoch %2d complete  (elapsed time: %fs)" % (j+1, dt))


    def evaluate(self, test_data):
        """
        Evaluates the performance of the neural network on a given data set.
        This dataset consists of a list of tuples, each tuple being an example:
        the first tuple entry is an image encoded as a Variable/Tensor of
        size [784, 1], the second one is a Variable storing the digit the image
        represents (as 0..9).
        PARAMETERS:
        test_data -- dataset used for evaluating network performance
        RETURNS:
        number of correct answers on the given dataset
        """
        # when passing an image through the network, the output is a vector of 10 activations
        # we use the 'argmax' to convert this vector to the 0..9 digit it represents
        test_results = [(np.argmax(self.feedforward(x).data.numpy()), y) for (x, y) in test_data]
        return sum(int(x == y.data[0]) for (x, y) in test_results)

In [14]:
# Where to find the file storing the MNIST database: a gzip-compressed pickle.
# The path is relative, so it is resolved against the current working directory.
MNIST_DATA_FILEPATH = "mnist.pkl.gz"

def load_data_raw(filepath=None):
    """
    Return the MNIST data as a tuple containing the training data,
    the validation data, and the test data.

    The 'training_data' is returned as a tuple with two entries.
    The first entry contains the actual training images.  This is a
    numpy ndarray with 50,000 entries.  Each entry is, in turn, a
    numpy ndarray with 784 values, representing the 28 * 28 = 784
    pixels in a single MNIST image.

    The second entry in the 'training_data' tuple is a numpy ndarray
    containing 50,000 entries.  Those entries are just the digit
    values (0...9) for the corresponding images contained in the first
    entry of the tuple.

    The 'validation_data' and 'test_data' are similar, except
    each contains only 10,000 images.

    PARAMETERS:
    filepath -- path of the gzip-compressed pickle to read; when omitted (None)
                the module-level MNIST_DATA_FILEPATH is used

    RETURNS:
    the tuple (training_data, validation_data, test_data) stored in the file
    """
    if filepath is None:
        filepath = MNIST_DATA_FILEPATH

    # The 'with' block closes the file even if unpickling fails.
    # SECURITY NOTE: unpickling can execute arbitrary code - only load files
    # obtained from a trusted source.
    with gzip.open(filepath, 'rb') as f:
        # encoding='latin1' lets Python 3 read a pickle written by Python 2
        training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
    return (training_data, validation_data, test_data)


def _as_variable(array):
    """Wrap a numpy array in a float Variable that is excluded from gradient tracking."""
    return Variable(torch.from_numpy(array).type(dtype_float), requires_grad=False)


def _image_variables(images):
    """Turn each flat image into a Variable of size [784, 1] (a column vector)."""
    return [_as_variable(np.reshape(x, (784, 1))) for x in images]


def _digit_variables(digits):
    """Turn each digit label into a Variable of size [1] holding that digit."""
    return [_as_variable(np.array([y])) for y in digits]


def load_data():
    """
    Repackages the data returned by 'load_data_raw' in a format
    more convenient for using with the neural network.

    Return a tuple (training_data, validation_data, test_data).

    'training_data'   is a list of 2-tuples (x, y), one per training image
    'validation_data' is a list of 2-tuples (x, z), one per validation image
    'test_data'       is a list of 2-tuples (x, z), one per test image

    'x' is a Variable/Tensor of size [784, 1] containing the input image.
    'y' is a Variable/Tensor of size [ 10, 1] representing the digit encoded
        by 'x' (it has 0 entries with the exception of one 1 in the position
        of the digit represented by 'x')
    'z' is a Variable of size [1] storing just the digit represented by 'x'

    All three data sets are real lists (not one-shot 'zip' iterators), so they
    support len(), slicing, in-place shuffling and repeated iteration.
    """
    tr_d, va_d, te_d = load_data_raw()

    # training targets are one-hot columns, so they can be compared with the network output
    training_out  = [_as_variable(asvector(y)) for y in tr_d[1]]
    training_data = list(zip(_image_variables(tr_d[0]), training_out))

    # validation/test targets stay plain digits; they are only used for scoring
    validation_data = list(zip(_image_variables(va_d[0]), _digit_variables(va_d[1])))
    test_data       = list(zip(_image_variables(te_d[0]), _digit_variables(te_d[1])))

    return (training_data, validation_data, test_data)


def asvector(j):
    """
    One-hot encode a digit: returns a numpy array of shape (10, 1) that is 0.0
    everywhere except for a 1.0 in row 'j'.
    """
    one_hot = np.zeros(shape=(10, 1))
    one_hot[j] = 1.0
    return one_hot

# Load the MNIST database and materialise each data set as a list:
# training calls len() on the data and shuffles it in place, which needs a real list.
print("Loading the MNIST database...")
training_data, validation_data, test_data = map(list, load_data())
print("Done")

Loading the MNIST database...
Done


In [15]:
# Initialize a neural network with 784 inputs, one hidden layer of 30 neurons and 10 outputs.

layer_sizes = [784, 30, 10]  # <-- first entry must be 784, last one must be 10

# Seed numpy's random number generator right before building the network: the
# constructor draws the initial weights/biases from np.random.randn, so a fixed
# seed makes the starting parameters reproducible.
np.random.seed(1234)
net = NeuralNetworkMNISTAutoGrad(layer_sizes)

In [16]:
# Baseline: score the freshly initialised (untrained) network on the test set.
# 'evaluate' returns the number of test images whose digit was predicted correctly.

nright = net.evaluate(test_data)
print(
    "Untrained network: got right %d out of %d (accuracy %.2f pct)"
    % (nright, len(test_data), 100*float(nright)/len(test_data))
)

Untrained network: got right 988 out of 10000 (accuracy 9.88 pct)


In [17]:
# train the network

EPOCHS  = 10  # number of full passes over the training data
BATCHSZ = 10  # number of training examples used for each weight/bias update
ETA     =  2  # learning rate (SGD divides it by the batch length before each update)

# Passing 'test_data' makes SGD report the number of correctly classified test
# images after every epoch. Note that SGD shuffles 'training_data' in place.
net.SGD(training_data, EPOCHS, BATCHSZ, ETA, test_data=test_data)

# to profile the network, run the line below instead of the one above
# set EPOCHS = 1
#%lprun -f net.SGD net.SGD(training_data, EPOCHS, BATCHSZ, ETA, test_data=test_data)

Epoch  1: 9052 of 10000 (elapsed time: 35.017382s)
