# Simple Neural Network Module

Implementation of SGD and backprop in a neural network

In [1]:
import random
import numpy as np

In [2]:
class NN:
    def __init__(self, sizes):
        """
        sizes is a list of the number of neurons in each layer of the network
        eg. [20, 10, 15] - 20 in the input layer 
                           10 in the hidden layer
                           15 in the output layer
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # randomly initialize the weights and biases of the network
        self.weights = [np.random.randn(current, prev) 
                        for prev, current in zip(self.sizes[:-1], self.sizes[1:])]
        self.biases = [np.random.randn(current, 1) 
                       for current in self.sizes[1:]]
        
    def forward_pass(self, x):
        """
        returns the output if x is the input vector
        """
        for weight, bias in zip(self.weights, self.biases):
            z = np.dot(weight, x) + bias
            x = self.sigmoid(z) # vector of activations for that layer
        return x
    
    def SGD(self, training_data, epochs, bs, lr, test_data=None):
        """
        epochs: number of epochs ie. number of times to go through the entire dataset
        bs : batch size
        lr: learining rate
        """
        training_data = list(training_data)
        n = len(training_data)
        
        for i in range(epochs):
            # randomly shuffle the data 
            random.shuffle(training_data)
            # divide the data into mini-batches
            mini_batches = [training_data[index:index+bs]
                            for index in range(0, n, bs)]
            for mini_batch in mini_batches:
                # run one mini-batch
                self.run_mini_batch(mini_batch, lr)
            
            if test_data:
                test_data = list(test_data)
                n_test = len(test_data)
                # call evaluate to evaluate performance
                number_correct = self.evaluate(test_data)
                print(f'Epoch {i}: {number_correct} / {n_test}')
            else:
                print(f'Epoch {i} completed')
    
    def run_mini_batch(self, mini_batch, lr):
        for x, y in mini_batch:
            grad_biases, grad_weights = self.backprop(x, y)
            
            # update rule for stochastic gradient descent
            self.weights = [old_weight - (lr/len(mini_batch))*gradient
                            for old_weight, gradient in zip(self.weights, grad_weights)]
            self.biases = [old_bias - (lr/len(mini_batch))*gradient
                           for old_bias, gradient in zip(self.biases, grad_biases)]
            
    def backprop(self, x, y):
        grad_biases = [np.zeros_like(bias) for bias in self.biases]
        grad_weights = [np.zeros_like(weight) for weight in self.weights]
        
        # forward pass 
        activations = [x] # list of the activations at all layers in the network
        zs = [] # list of the z vectors at all layers in the network
        
        for weight, bias in zip(self.weights, self.biases):
            z = np.dot(weight, x) + bias
            zs.append(z)
            # cal the activation for that layer
            x = self.sigmoid(z)
            activations.append(x)
            
        # use the first equation to calculate the error in the ouput layer
        error = (activations[-1] - y) * self.sigmoid_prime(zs[-1])
        
        # according to the second equation rate of change of the cost wrt 
        # the biases in any layer is equal to the error in that layer
        grad_biases[-1] = error
        
        # use the fourth equation to calculate the rate of change of
        # the cost wrt the weights in the last layer
        grad_weights[-1] = np.dot(error, activations[-2].T)
        
        for i in range(2, self.num_layers):
            # calculate error for subsequent layers using equation two
            error = np.dot(self.weights[-i+1].T, error) * self.sigmoid_prime(zs[-i]) 
            grad_biases[-i] = error
            grad_weights[-i] = np.dot(error, activations[-i-1].T)
            
        return grad_biases, grad_weights
    
    def evaluate(self, test_data):
        predictions = [ (np.argmax(self.forward_pass(x)), y)
                       for x, y in test_data]
        return sum(int(x == y) for x, y in predictions)
    
    def sigmoid(self, z):
        return 1 / (1+np.exp(-z))
    
    def sigmoid_prime(self, z):
        return self.sigmoid(z) * (1 - self.sigmoid(z))
            
        
        

## Data preparation

Neural network is tested on the MNIST dataset

In [3]:
import mnist_loader

In [4]:
train, val, test = mnist_loader.load_data_wrapper()

## Create Network

In [5]:
net = NN([784, 30, 10])

In [6]:
net.SGD(train, epochs=30, bs=10, lr=3.0, test_data=test)

Epoch 0: 8278 / 10000
Epoch 1: 9248 / 10000
Epoch 2: 9263 / 10000
Epoch 3: 9340 / 10000
Epoch 4: 9350 / 10000
Epoch 5: 9401 / 10000
Epoch 6: 9424 / 10000
Epoch 7: 9416 / 10000
Epoch 8: 9426 / 10000
Epoch 9: 9468 / 10000
Epoch 10: 9440 / 10000
Epoch 11: 9478 / 10000
Epoch 12: 9471 / 10000
Epoch 13: 9461 / 10000
Epoch 14: 9462 / 10000
Epoch 15: 9469 / 10000
Epoch 16: 9495 / 10000
Epoch 17: 9459 / 10000
Epoch 18: 9491 / 10000
Epoch 19: 9489 / 10000
Epoch 20: 9473 / 10000
Epoch 21: 9466 / 10000
Epoch 22: 9477 / 10000
Epoch 23: 9485 / 10000
Epoch 24: 9503 / 10000
Epoch 25: 9506 / 10000
Epoch 26: 9494 / 10000
Epoch 27: 9495 / 10000
Epoch 28: 9504 / 10000
Epoch 29: 9471 / 10000
