In [None]:
import random
import numpy as np

In [None]:
def σ(z):
    """
    Sigmoid function, ``1 / (1 + exp(-z))``, evaluated without floating-point overflow.

    ``z`` may be a scalar or an array; the result has the same shape (a NumPy scalar for scalar input).
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in (0, 1], so it can never overflow, whereas exp(-z) does for large negative z.
    e = np.exp(-np.abs(z))
    denominator = 1.0 + e
    # z >= 0: 1 / (1 + exp(-z));  z < 0: the algebraically equal exp(z) / (1 + exp(z)).
    # The trailing [()] turns a 0-d result back into a scalar and leaves real arrays untouched.
    return np.where(z >= 0, 1.0 / denominator, e / denominator)[()]

def σp(z):
    """ Sigmoid first derivative, ``σ(z) * (1 - σ(z))``. """
    # Evaluate the sigmoid once and reuse it instead of computing it twice per call.
    s = σ(z)
    return s * (1 - s)

In [None]:
class Weights:
    """ Weight matrix of one layer, drawn from a standard normal distribution. """

    def __init__(self, lines, columns) -> None:
        """ Create a ``lines`` x ``columns`` matrix of N(0,1) samples and expose shortcuts to it. """
        matrix = np.random.standard_normal(size=(lines, columns))
        self.weights_matrix = matrix

        # Shortcuts: ``W`` is the very same array object and ``T`` is a transposed view of it,
        # so both keep following the matrix as long as it is only modified in place.
        self.shape = matrix.shape
        self.T = matrix.T
        self.W = matrix

    def update_weights(self, learning_rate, weight_gradient) -> None:
        """ Take one gradient descent step: subtract ``learning_rate * weight_gradient`` in place. """
        step = learning_rate * weight_gradient
        # Writing into the existing buffer keeps the ``W`` and ``T`` shortcuts valid.
        np.subtract(self.weights_matrix, step, out=self.weights_matrix)


class Biases:
    """ Bias column vector of one layer, drawn from a standard normal distribution. """

    def __init__(self, lines) -> None:
        """ Create a ``lines`` x 1 column of N(0,1) samples and expose shortcuts to it. """
        column = np.random.randn(lines, 1)
        self.bias_vector = column

        # Shortcuts: ``b`` is the very same array object as ``bias_vector``.
        self.shape = column.shape
        self.b = column

    def update_biases(self, learning_rate, bias_gradient) -> None:
        """
        Take one gradient descent step. ``bias_gradient`` holds one column per example; the columns are summed
        into a single column vector before being scaled by ``learning_rate`` and subtracted in place.
        """
        summed_gradient = bias_gradient.sum(axis=1).reshape(-1, 1)
        self.bias_vector -= learning_rate * summed_gradient


In [None]:
class Network:
    """
    Fully connected feed-forward network with sigmoid activations, trained with mini-batch stochastic gradient
    descent and backpropagation.
    """

    def __init__(self, layers: list[int]):
        """
        Sets up the network with the number of neurons in each layer defined by ``layers`` and initializes the weights
        and biases using a Normal Distribution N(0,1).

        The first entry of ``layers`` is the input size, so it gets no biases and no incoming weights.
        """
        self.n_layers = len(layers)
        self.biases = [Biases(lines=bias) for bias in layers[1:]]
        # One weight matrix per pair of consecutive layers, shaped (neurons in next layer, neurons in previous layer).
        self.weights = [Weights(lines=weights, columns=nodes) for weights, nodes in zip(layers[1:], layers[:-1])]

        # Store values to backpropagate (filled in by ``feedforward_training_matrix``)
        self.Zs = None    # Signals
        self.As = None    # Activations

    def evaluate_test_data(self, test_data: list[tuple]):
        """
        Returns the number of test inputs for which the neural network outputs the correct result.

        ``test_data`` is a list of tuples ``(x, y)``; the prediction for ``x`` is the index of the largest output
        activation, which is compared against ``y``.
        """
        test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def feedforward(self, x: np.ndarray) -> np.ndarray:
        """ Returns the output of the network for the input ``x``. """
        for W, b in zip(self.weights, self.biases):
            x = σ(np.dot(W.W, x) + b.b)
        return x

    def stochastic_gradient_descent(self,
                                    training_data: list[tuple],
                                    epochs: int,
                                    mini_batch_size: int,
                                    learning_rate: float,
                                    test_data: list[tuple] = None):
        """
        Trains the neural network using mini-batch stochastic gradient descent where the ``training_data`` is a list of
        tuples ``(x, y)`` representing the training inputs and the labeled outputs.

        The examples are reshuffled at the start of every epoch; the shuffling is done on a private copy, so the
        caller's ``training_data`` keeps its order.

        If ``test_data`` is provided then the network will be evaluated against the test data after each epoch and
        partial progress printed out.

        Raises ``ValueError`` if ``mini_batch_size`` is smaller than 1.
        """
        # A negative size would yield no mini-batches at all and silently train nothing.
        if mini_batch_size < 1:
            raise ValueError("mini_batch_size must be a positive integer")

        # Shuffle a copy rather than reordering the caller's list as a hidden side effect.
        samples = list(training_data)

        for epoch in range(epochs):
            random.shuffle(samples)
            mini_batches = [samples[k:k + mini_batch_size] for k in range(0, len(samples), mini_batch_size)]

            for mini_batch in mini_batches:
                self.update_weights_and_biases(mini_batch, learning_rate)

            if test_data:
                print(f"Epoch {epoch}: {self.evaluate_test_data(test_data)} / {len(test_data)}")
            else:
                print(f"Epoch {epoch} complete")

    def update_weights_and_biases(self, mini_batch: list[tuple], η: float):
        """
        Updates the network's weights and biases by applying gradient descent using backpropagation to a single mini
        batch ``mini_batch`` and a learning rate ``η``.
        """
        m = len(mini_batch)

        # Create mini-batch matrices: one column per example (X) and per label (Y)
        X = np.column_stack([example[0] for example in mini_batch])
        Y = np.column_stack([example[1] for example in mini_batch])
        nabla_b, nabla_w = self.backpropagate(X, Y)

        # Gradient Descent Update, with the gradients divided by the mini-batch size
        for b, nb in zip(self.biases, nabla_b):
            b.update_biases(learning_rate=η, bias_gradient=(nb / m))
        for W, NW in zip(self.weights, nabla_w):
            W.update_weights(learning_rate=η, weight_gradient=(NW / m))

    def backpropagate(self, X: np.ndarray, Y: np.ndarray) -> tuple[list, list]:
        """
        Returns a tuple ``(nabla_b, nabla_w)`` representing the gradient for the cost function of the training examples
        where ``X`` is a matrix whose columns are the examples of the mini-batch and ``Y`` is a matrix whose columns are
        the labels.

        Each ``nabla_b`` entry is the δ matrix of its layer with one column per example (not yet summed over the
        batch). Each ``nabla_w`` entry is ``δ · Aᵀ``, so the matrix product has already summed over the examples.
        """
        nabla_b = [None for _ in self.biases]
        nabla_w = [None for _ in self.weights]
        last_layer = self.n_layers - 1
        L = -1    # Index of the output layer in the per-layer lists

        # Feed forward (fills self.Zs and self.As)
        self.feedforward_training_matrix(X)

        # Compute δ of the last layer L
        Z_L = self.Zs[L]
        A_L = self.As[L]
        δ = self.cost_derivative(A_L, Y) * σp(Z_L)

        # Compute gradient for the cost function C_X
        nabla_b[L] = δ
        nabla_w[L] = np.dot(δ, self.As[L - 1].T)

        # Backpropagate through the hidden layers, from the one before the output down to the first.
        # ``self.As`` also holds the input matrix at index 0, so it is shifted by one relative to ``self.Zs``:
        # for layer ``l`` the signal is ``Zs[l - 1]`` and the activation feeding into it is ``As[l - 1]``.
        for l in reversed(range(1, last_layer)):
            Z = self.Zs[l - 1]
            AT = self.As[l - 1].T
            WT = self.weights[l].T    # Transposed weights leading out of layer ``l``

            δ = np.dot(WT, δ) * σp(Z)
            nabla_b[l - 1] = δ
            nabla_w[l - 1] = np.dot(δ, AT)

        return (nabla_b, nabla_w)

    def feedforward_training_matrix(self, A: np.ndarray):
        """
        Updates the signals and the activations arrays of the network where ``A`` is a matrix whose columns are the
        examples of the mini-batch.

        After the call ``self.Zs`` holds one signal matrix per weight layer and ``self.As`` holds the input matrix
        followed by one activation matrix per weight layer.
        """
        self.Zs = []
        self.As = [A]

        for W, b in zip(self.weights, self.biases):
            # The sum already produces a fresh array; the bias column broadcasts across the example columns.
            Z = np.dot(W.W, A) + b.b
            A = σ(Z)
            self.Zs.append(Z)
            self.As.append(A)

    @staticmethod
    def cost_derivative(network_output: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        Returns the derivative of the cost function given the ``network_output`` and the correct label ``y``.

        The value ``network_output - y`` is the gradient of the quadratic cost ``0.5 * ||network_output - y||^2``
        with respect to the output activations.
        """
        return (network_output - y)

In [None]:
import os
import gzip
import pickle
import requests

def load_data(filename="mnist.pkl.gz"):
    """
    Load a gzip-compressed pickle containing the triple ``(training_data, validation_data, test_data)``.

    ``filename`` is the path of the archive and defaults to ``"mnist.pkl.gz"`` relative to the current working
    directory. Errors from ``gzip.open`` and ``pickle.load`` (for example a missing file) propagate to the caller,
    as does the ``ValueError`` raised when the pickled object does not unpack into exactly three items.
    """
    # NOTE(security): unpickling can execute arbitrary code, so only load archives from a trusted source.
    with gzip.open(filename, 'rb') as f:
        # NOTE(review): encoding="latin1" presumably exists to read a pickle written by Python 2; confirm.
        training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
    return (training_data, validation_data, test_data)

def load_data_wrapper():
    """
    Return ``(training_data, validation_data, test_data)`` from ``load_data()`` as lists of ``(x, y)`` tuples.

    Every input ``x`` is reshaped into a ``(784, 1)`` column. Training labels are passed through
    ``vectorized_result``; validation and test labels are kept exactly as loaded.
    """
    def as_column(x, s):
        return np.reshape(x, (s, 1))

    tr_d, va_d, te_d = load_data()

    train_inputs, train_labels = tr_d[0], tr_d[1]
    training_data = [(as_column(x, 784), vectorized_result(y)) for x, y in zip(train_inputs, train_labels)]

    valid_inputs, valid_labels = va_d[0], va_d[1]
    validation_data = [(as_column(x, 784), y) for x, y in zip(valid_inputs, valid_labels)]

    test_inputs, test_labels = te_d[0], te_d[1]
    test_data = [(as_column(x, 784), y) for x, y in zip(test_inputs, test_labels)]

    return (training_data, validation_data, test_data)

def vectorized_result(j, num_classes=10):
    """
    Return a ``(num_classes, 1)`` column vector of zeros with ``1.0`` in row ``j`` (a one-hot encoding of ``j``).

    ``num_classes`` defaults to 10. An index outside the vector raises ``IndexError``; a negative ``j`` counts
    from the end, following NumPy indexing rules.
    """
    e = np.zeros((num_classes, 1))
    e[j] = 1.0
    return e

In [None]:

def main():
    """
    Build a 784-30-10 network and train it on the data from ``load_data_wrapper()`` for 5 epochs with
    mini-batches of 10 and a learning rate of 3.0, evaluating against the test data after each epoch.
    """
    # The network is created before the data is loaded, as in the original ordering.
    net = Network([784, 30, 10])

    training_data, _validation_data, test_data = load_data_wrapper()

    hyperparameters = {
        "epochs": 5,
        "mini_batch_size": 10,
        "learning_rate": 3.0,
    }
    net.stochastic_gradient_descent(training_data=training_data, test_data=test_data, **hyperparameters)


# Run the training demo only when executed directly (script or notebook cell), not when imported.
if __name__ == "__main__":
    main()