In [124]:
import numpy as np
import random
import mnist_loader

# Load the three MNIST splits exposed by the project-local mnist_loader module.
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

# Materialise the training and test splits as lists so they can be iterated
# more than once, measured with len() and shuffled.
# NOTE(review): presumably load_data_wrapper() returns one-shot iterables
# (e.g. zip objects) -- confirm. validation_data is left exactly as returned.
training_data = list(training_data)
test_data = list(test_data)

In [None]:
class NeuralNetwork:
    """Fully connected sigmoid network trained with full-batch gradient descent.

    ``sizes`` lists the number of units in each layer, input layer first,
    e.g. ``sizes = [5, 3, 2]``.

    Attributes:
        sizes: the layer sizes passed to the constructor.
        weights: list of arrays; ``weights[l]`` has shape
            ``(sizes[l + 1], sizes[l])``.
        biases: list of column vectors; ``biases[l]`` has shape
            ``(sizes[l + 1], 1)``.

    Inputs and targets are expected to be column vectors so that
    ``W @ a + b`` and the outer products in back-propagation line up.
    """

    def __init__(self, sizes):
        """Initialise weights and biases from a standard normal distribution."""
        self.sizes = sizes
        self.weights = [
            np.random.randn(n_out, n_in)
            for n_in, n_out in zip(sizes[:-1], sizes[1:])
        ]
        self.biases = [np.random.randn(n_out, 1) for n_out in sizes[1:]]

    def feed_forward(self, a):
        """Propagate the input column vector ``a`` through every layer.

        Returns:
            ``(a_s, z_s, a)`` where ``a_s`` holds the activations of every
            layer (input included), ``z_s`` holds the weighted inputs of
            every non-input layer, and ``a`` is the output activation
            (the same object as ``a_s[-1]``).
        """
        a_s, z_s = [a], []
        for W, b in zip(self.weights, self.biases):
            z = (W @ a) + b
            z_s.append(z)
            a = self.sigmoid(z)
            a_s.append(a)
        return (a_s, z_s, a)

    def back_propogation(self, y, a_s, z_s):
        """Compute per-layer gradients for one sample.

        Args:
            y: target column vector, same shape as ``a_s[-1]``.
            a_s: activations as returned by :meth:`feed_forward`.
            z_s: weighted inputs as returned by :meth:`feed_forward`.

        Returns:
            ``(nabla_w, nabla_b)``: lists with one array per layer, shaped
            like ``self.weights`` and ``self.biases`` respectively.
        """
        deltas = [None] * (len(self.sizes) - 1)
        # Output-layer error: (a_L - y) is the derivative of the quadratic
        # cost 0.5 * ||a_L - y||^2 with respect to the output activation.
        deltas[-1] = (a_s[-1] - y) * self.sigmoid_prime(z_s[-1])

        # Walk from the last hidden layer back to the first, pulling the
        # error of layer l + 1 through its weights.
        for l in reversed(range(len(a_s) - 2)):
            deltas[l] = (self.weights[l + 1].T @ deltas[l + 1]) * self.sigmoid_prime(z_s[l])

        nabla_b = deltas
        # dC/dW_l is the outer product of the layer error and the activation
        # feeding into that layer.
        nabla_w = [deltas[i] @ a_s[i].T for i in range(len(a_s) - 1)]

        return (nabla_w, nabla_b)

    def gradient_descent(self, w_gradients, b_gradients, eta):
        """Apply one gradient-descent step in place.

        Args:
            w_gradients: mapping ``layer -> array`` keyed ``1 .. len(sizes) - 1``;
                the array for key ``i + 1`` must have the shape of
                ``self.weights[i]``.
            b_gradients: same keying; the array for key ``i + 1`` must have
                the shape of ``self.biases[i]``.
            eta: learning rate.
        """
        for i in range(len(self.weights)):
            # Use the whole gradient array. Indexing it with [0] would pick
            # only its first row / first entry and broadcast that over the
            # entire layer.
            self.weights[i] = self.weights[i] - (eta * w_gradients[i + 1])
            self.biases[i] = self.biases[i] - (eta * b_gradients[i + 1])

    def train(self, epochs, training_data, eta):
        """Run ``epochs`` full-batch gradient-descent updates.

        Each epoch averages the gradient over every sample and then applies
        a single update. The zero-based epoch index is printed after each
        update.

        Args:
            epochs: number of passes over ``training_data``.
            training_data: re-iterable collection of samples where
                ``sample[0]`` is the input and ``sample[1]`` the target.
                A one-shot iterator is exhausted after the first epoch.
            eta: learning rate.

        Raises:
            ValueError: if an epoch sees no samples.
        """
        for epoch in range(epochs):
            # Keep running sums rather than storing every per-sample gradient;
            # memory stays proportional to the model, not the dataset.
            w_sums = [np.zeros(w.shape) for w in self.weights]
            b_sums = [np.zeros(b.shape) for b in self.biases]
            n_samples = 0

            for sample in training_data:
                a_s, z_s, _ = self.feed_forward(sample[0])
                nabla_w, nabla_b = self.back_propogation(sample[1], a_s, z_s)
                for layer in range(len(nabla_w)):
                    w_sums[layer] = w_sums[layer] + nabla_w[layer]
                    b_sums[layer] = b_sums[layer] + nabla_b[layer]
                n_samples += 1

            if n_samples == 0:
                raise ValueError("training_data must contain at least one sample")

            # Compute the average, keyed from 1 as gradient_descent expects.
            w_gradients = {layer + 1: total / n_samples for layer, total in enumerate(w_sums)}
            b_gradients = {layer + 1: total / n_samples for layer, total in enumerate(b_sums)}

            # Perform gradient descent
            self.gradient_descent(w_gradients, b_gradients, eta)
            print(epoch)

    def sigmoid(self, k):
        """Element-wise logistic function ``1 / (1 + exp(-k))``."""
        return 1.0 / (1.0 + np.exp(-k))

    def sigmoid_prime(self, z):
        """Element-wise derivative of :meth:`sigmoid` evaluated at ``z``."""
        s = self.sigmoid(z)
        return s * (1 - s)

In [119]:
class NeuralNetwork:
    """Fully connected sigmoid network trained with mini-batch SGD.

    This definition rebinds the name ``NeuralNetwork``; any class of the same
    name defined in an earlier cell is replaced once this cell runs.

    ``sizes`` lists the number of units in each layer, input layer first.

    Attributes:
        sizes: the layer sizes passed to the constructor.
        weights: list of arrays; ``weights[l]`` has shape
            ``(sizes[l + 1], sizes[l])``.
        biases: list of column vectors; ``biases[l]`` has shape
            ``(sizes[l + 1], 1)``.

    Inputs and targets are expected to be column vectors so that
    ``W @ a + b`` and the outer products in back-propagation line up.
    """

    def __init__(self, sizes):
        """Initialise weights and biases from a standard normal distribution."""
        self.sizes = sizes
        self.weights = [
            np.random.randn(n_out, n_in)
            for n_in, n_out in zip(sizes[:-1], sizes[1:])
        ]
        self.biases = [np.random.randn(n_out, 1) for n_out in sizes[1:]]

    def feed_forward(self, a):
        """Propagate the input column vector ``a`` through every layer.

        Returns:
            ``(a_s, z_s)`` where ``a_s`` holds the activations of every layer
            (input included) and ``z_s`` the weighted inputs of every
            non-input layer.
        """
        a_s = [a]
        z_s = []
        # weights and biases always have the same length, so zip pairs them
        # layer by layer.
        for W, b in zip(self.weights, self.biases):
            z = (W @ a) + b
            z_s.append(z)
            a = self.sigmoid(z)
            a_s.append(a)

        return (a_s, z_s)

    def stochastic_gradient_descent(self, training_data, epochs, eta, batch_size):
        """Train with mini-batch stochastic gradient descent.

        Every epoch the samples are reshuffled, split into consecutive
        batches of ``batch_size`` (the last one may be shorter), and the
        parameters are updated once per batch with the batch-averaged
        gradient. A progress line is printed after each epoch.

        Args:
            training_data: iterable of ``(x, y)`` pairs. It is copied once up
                front, so the caller's sequence is not reordered and a
                one-shot iterable (e.g. ``zip``) is accepted.
            epochs: number of passes over the data.
            eta: learning rate.
            batch_size: number of samples per mini-batch; must be >= 1.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Shuffle a private copy instead of mutating the caller's list.
        samples = list(training_data)

        for i in range(epochs):
            random.shuffle(samples)
            mini_batches = [
                samples[j:j + batch_size]
                for j in range(0, len(samples), batch_size)
            ]
            for mini_batch in mini_batches:
                # Accumulate the summed gradient over the batch.
                nabla_w = [np.zeros(w.shape) for w in self.weights]
                nabla_b = [np.zeros(b.shape) for b in self.biases]
                for x, y in mini_batch:
                    delta_nabla_w, delta_nabla_b = self.backpropagate(x, y)
                    nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
                    nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]

                # Dividing eta by the batch length turns the sum into a mean.
                step = eta / len(mini_batch)
                self.weights = [w - (step * nw) for w, nw in zip(self.weights, nabla_w)]
                self.biases = [b - (step * nb) for b, nb in zip(self.biases, nabla_b)]

            print(f'Epoch {i+1}/{epochs} completed.')

    def backpropagate(self, x, y):
        """Compute per-layer gradients for a single ``(x, y)`` sample.

        Args:
            x: input column vector.
            y: target column vector, same shape as the network output.

        Returns:
            ``(nabla_w, nabla_b)``: lists with one array per layer, shaped
            like ``self.weights`` and ``self.biases`` respectively.
        """
        a_s, z_s = self.feed_forward(x)
        # Output-layer error: (a_L - y) is the derivative of the quadratic
        # cost 0.5 * ||a_L - y||^2 with respect to the output activation.
        d_L = (a_s[-1] - y) * self.sigmoid_prime(z_s[-1])
        d_s = [None] * len(self.weights)
        d_s[-1] = d_L

        # Propagate the error backwards, from the last hidden layer to the
        # first; the output layer (index len(d_s) - 1) is already filled in.
        for i in reversed(range(len(d_s) - 1)):
            d_s[i] = (self.weights[i + 1].T @ d_s[i + 1]) * self.sigmoid_prime(z_s[i])

        nabla_b = d_s
        # dC/dW_l is the outer product of the layer error and the activation
        # feeding into that layer.
        nabla_w = [d_s[i] @ a_s[i].T for i in range(len(d_s))]

        return (nabla_w, nabla_b)

    def sigmoid(self, k):
        """Element-wise logistic function ``1 / (1 + exp(-k))``."""
        return 1.0 / (1.0 + np.exp(-k))

    def sigmoid_prime(self, z):
        """Element-wise derivative of :meth:`sigmoid` evaluated at ``z``."""
        s = self.sigmoid(z)
        return s * (1 - s)
        