In [380]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [381]:
# Experiment-tracking identifiers. Both values are placeholders; the names
# suggest Weights & Biases -- confirm before wiring them to a logger.
WANDB_PROJECT = "myprojectname"
WANDB_ENTITY = "myname"

# Reproducibility: FFNeuralNetwork draws its initial weights and biases from
# NumPy's global generator (np.random.randn), so seed that generator once here.
# A fresh kernel + Run All then yields the same initial parameters every time.
SEED = 42
np.random.seed(SEED)

# Data and training schedule.
DATASET = "fashion_mnist"
EPOCHS = 10
BATCH_SIZE = 128

# Objective and optimiser settings.
LOSS = "cross_entropy"
OPTIMIZER = "sgd"
LEARNING_RATE = 0.01
MOMENTUM = 0.5
# NOTE(review): BETA, BETA1, BETA2, EPSILON and WEIGHT_DECAY are not read by
# any code in the cells visible here -- presumably reserved for optimisers that
# are not implemented yet.
BETA = 0.5
BETA1 = 0.5
BETA2 = 0.5
EPSILON = 1e-6
WEIGHT_DECAY = 0.0

# Network architecture defaults (used as default arguments by the classes below).
WEIGHT_INIT = "random"
NUM_LAYERS = 4
HIDDEN_SIZE = 128
ACTIVATION = "sigmoid"

In [382]:
from keras.datasets import fashion_mnist

# Fetch the train/test splits; each split unpacks into an (images, labels) pair.
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Rescale both image arrays by 1/255 in one pass
# (assumes raw intensities span 0-255 -- confirm against the Keras dataset docs).
x_train, x_test = (split / 255.0 for split in (x_train, x_test))

In [383]:
class FFNeuralNetwork():
    """Fully connected feed-forward classifier with a softmax output layer.

    ``hid_layers`` hidden layers of ``neurons`` units each sit between an
    ``input_size``-wide input and an ``output_size``-wide output.
    ``weights[k]`` and ``biases[k]`` map the activations of layer ``k`` to the
    pre-activations of layer ``k + 1``, so both lists hold ``hid_layers + 1``
    entries.

    Parameters
    ----------
    neurons : int
        Width of every hidden layer.
    hid_layers : int
        Number of hidden layers; ``0`` gives a single input-to-output layer.
    input_size, output_size : int
        Width of the input rows and of the output (number of classes).
    act_func : str
        Hidden activation: "sigmoid", "tanh" or "ReLU".
    weight_init : str
        "random" (standard normal) or "xavier" (standard normal divided by
        sqrt(fan_in)); matched case-insensitively.
    out_act_func : str
        Output activation; only "softmax" is implemented.

    Raises
    ------
    Exception
        At construction if ``weight_init`` is not recognised; at ``forward``
        time if an activation name is not recognised.
    """

    def __init__(self, neurons=HIDDEN_SIZE, hid_layers=NUM_LAYERS, input_size=784, output_size=10, act_func=ACTIVATION, weight_init=WEIGHT_INIT, out_act_func="softmax"):
        self.neurons, self.hidden_layers = neurons, hid_layers
        self.weights, self.biases = [], []
        self.input_size, self.output_size = input_size, output_size
        self.activation_function, self.weight_init = act_func, weight_init
        self.output_activation_function = out_act_func

        self.initialize_weights()
        self.initiate_biases()

    def _layer_sizes(self):
        """Return the width of every layer, input first and output last."""
        return [self.input_size] + [self.neurons] * self.hidden_layers + [self.output_size]

    def initialize_weights(self):
        """(Re)create ``self.weights`` from NumPy's global random generator."""
        scheme = str(self.weight_init).lower()
        if scheme not in ("random", "xavier"):
            # Previously any unknown name silently behaved like "random".
            raise Exception("Invalid weight initialisation")

        # Building from consecutive layer widths keeps the shapes consistent
        # even when there are no hidden layers (input -> output directly).
        sizes = self._layer_sizes()
        self.weights = [np.random.randn(fan_in, fan_out)
                        for fan_in, fan_out in zip(sizes[:-1], sizes[1:])]

        if scheme == "xavier":
            # Scale each matrix by 1/sqrt(fan_in), fan_in being its row count.
            self.weights = [w / np.sqrt(w.shape[0]) for w in self.weights]

    def initiate_biases(self):
        """(Re)create ``self.biases``: one standard-normal vector per non-input layer."""
        self.biases = [np.random.randn(width) for width in self._layer_sizes()[1:]]

    def activation(self, x):
        """Apply the hidden activation element-wise.

        x is a matrix of size (batch_size, neurons).
        """
        if self.activation_function == "sigmoid":
            # exp(-log(1 + exp(-x))) equals 1 / (1 + exp(-x)) but never
            # overflows: logaddexp evaluates log(exp(0) + exp(-x)) stably.
            return np.exp(-np.logaddexp(0, -x))
        elif self.activation_function == "tanh":
            return np.tanh(x)
        elif self.activation_function == "ReLU":
            return np.maximum(0, x)
        else:
            raise Exception("Invalid activation function")

    def output_activation(self, x):
        """Apply the output activation row-wise.

        x is a matrix of size (batch_size, output_size).
        """
        if self.output_activation_function == "softmax":
            # Subtracting the row maximum leaves softmax unchanged and keeps
            # every exponent <= 0.
            exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
            return exp_x / np.sum(exp_x, axis=1, keepdims=True)
        else:
            raise Exception("Invalid output activation function")

    def forward(self, x):
        """Run a forward pass and return the output-layer activations.

        x is a matrix of size (batch_size, input_size). The pre- and
        post-activation values of every layer are cached on the instance as
        ``self.pre_activation`` / ``self.post_activation`` (index 0 holds
        ``x`` itself in both lists) so that Backpropagation can read them.
        """
        self.pre_activation, self.post_activation = [x], [x]

        # Hidden layers: every (weight, bias) pair except the last one.
        for weight, bias in zip(self.weights[:-1], self.biases[:-1]):
            self.pre_activation.append(np.matmul(self.post_activation[-1], weight) + bias)
            self.post_activation.append(self.activation(self.pre_activation[-1]))

        # Output layer.
        self.pre_activation.append(np.matmul(self.post_activation[-1], self.weights[-1]) + self.biases[-1])
        self.post_activation.append(self.output_activation(self.pre_activation[-1]))

        return self.post_activation[-1]

In [384]:
def loss(loss, y, y_pred):
    """Return the scalar loss of a batch of predictions.

    Parameters
    ----------
    loss : str
        "cross_entropy", or "mean_squared" (also accepted as "mse", the
        spelling used by Backpropagation).
    y : array-like of size (batch_size, output_size)
        Target values (one-hot rows for classification).
    y_pred : array-like of size (batch_size, output_size)
        Predicted values.

    Returns
    -------
    float
        Cross-entropy summed over the whole batch, or the sum of squared
        errors divided by ``2 * batch_size``.

    Raises
    ------
    Exception
        If ``loss`` names an unknown loss function.
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if loss == "cross_entropy":
        # A saturated softmax can emit exact zeros; log(0) = -inf and
        # 0 * -inf = nan would poison the whole sum, so floor the probabilities.
        return -np.sum(y * np.log(np.maximum(y_pred, 1e-12)))
    elif loss in ("mean_squared", "mse"):
        return np.sum((y - y_pred) ** 2) / (2 * y.shape[0])
    else:
        raise Exception("Invalid loss function")

In [385]:
class Backpropagation():
    """Compute parameter gradients for an FFNeuralNetwork.

    Parameters
    ----------
    nn : FFNeuralNetwork
        Network whose cached ``pre_activation`` / ``post_activation`` lists
        (filled by ``nn.forward``) and ``weights`` are read by ``backward``.
    loss : str
        "cross_entropy", or "mse" (also accepted as "mean_squared", the
        spelling used by the module-level ``loss`` function).
    act_func : str
        Hidden activation whose derivative is applied: "sigmoid", "tanh" or
        "ReLU". It is not checked against ``nn.activation_function``; the
        caller must pass the same name to both objects.
    """

    def __init__(self, nn: FFNeuralNetwork, loss=LOSS, act_func=ACTIVATION):
        self.nn, self.loss, self.activation_function = nn, loss, act_func

    def loss_derivative(self, y, y_pred):
        """Derivative of the (batch-summed) loss with respect to ``y_pred``.

        y and y_pred are matrices of size (batch_size, output_size).
        """
        if self.loss == "cross_entropy":
            # Floor the denominator so saturated outputs (exact zeros) give
            # finite values instead of inf / nan.
            return -y / np.maximum(y_pred, 1e-12)
        elif self.loss in ("mse", "mean_squared"):
            return 2 * (y_pred - y)
        else:
            raise Exception("Invalid loss function")

    def activation_derivative(self, x):
        """Element-wise derivative of the hidden activation at pre-activation ``x``.

        x is a matrix of size (batch_size, neurons).
        """
        if self.activation_function == "sigmoid":
            # s * (1 - s) with an overflow-free sigmoid. The previous form
            # exp(-x) / (1 + exp(-x))**2 evaluated inf / inf = nan for very
            # negative x.
            s = np.exp(-np.logaddexp(0, -x))
            return s * (1 - s)
        elif self.activation_function == "tanh":
            return 1 - np.tanh(x)**2
        elif self.activation_function == "ReLU":
            return (x > 0).astype(int)
        else:
            raise Exception("Invalid activation function")

    def output_activation_function(self, y, y_pred):
        """Derivative of the loss with respect to the output-layer pre-activation."""
        if self.nn.output_activation_function == "softmax":
            if self.loss == "cross_entropy":
                # Closed form for softmax + cross-entropy with one-hot targets.
                return y_pred - y
            # Any other loss: push dL/dy_pred through the softmax Jacobian,
            # dL/dz_j = p_j * (g_j - sum_k g_k * p_k).
            grad = self.loss_derivative(y, y_pred)
            return y_pred * (grad - np.sum(grad * y_pred, axis=1, keepdims=True))
        else:
            raise Exception("Invalid output activation function")

    def backward(self, y, y_pred):
        """Back-propagate one batch and return ``(d_weights, d_biases)``.

        y and y_pred are matrices of size (batch_size, output_size);
        ``nn.forward`` must have been called on the same batch beforehand so
        the cached activations match. The returned lists are ordered like
        ``nn.weights`` / ``nn.biases`` and hold gradients summed over the batch.
        The intermediate gradients are kept on the instance as ``d_h``
        (w.r.t. post-activations) and ``d_a`` (w.r.t. pre-activations),
        ordered from the output layer backwards.
        """
        self.d_weights, self.d_biases = [], []
        self.d_h, self.d_a = [], []

        self.d_h.append(self.loss_derivative(y, y_pred))
        self.d_a.append(self.output_activation_function(y, y_pred))

        # Walk from the last weight matrix down to index 1; weights[i] takes
        # post_activation[i] as its input.
        for i in range(self.nn.hidden_layers, 0, -1):
            self.d_weights.append(np.matmul(self.nn.post_activation[i].T, self.d_a[-1]))
            self.d_biases.append(np.sum(self.d_a[-1], axis=0))
            self.d_h.append(np.matmul(self.d_a[-1], self.nn.weights[i].T))
            self.d_a.append(self.d_h[-1] * self.activation_derivative(self.nn.pre_activation[i]))

        # First weight matrix: its input is the raw batch (post_activation[0]).
        self.d_weights.append(np.matmul(self.nn.post_activation[0].T, self.d_a[-1]))
        self.d_biases.append(np.sum(self.d_a[-1], axis=0))

        # Gradients were collected output-first; flip to match nn.weights order.
        self.d_weights.reverse()
        self.d_biases.reverse()

        return self.d_weights, self.d_biases

In [386]:
class Optimiser():
    """Apply gradient steps to the parameters of an FFNeuralNetwork in place.

    ``optimiser`` selects the update rule used by ``run``: "sgd" for plain
    gradient descent or "momentum" for momentum gradient descent. One
    zero-initialised history array is kept per weight matrix and bias vector.
    """

    def __init__(self, nn: FFNeuralNetwork, bp:Backpropagation, lr=LEARNING_RATE, optimiser=OPTIMIZER, momentum=MOMENTUM):
        self.nn = nn
        self.lr = lr
        self.optimiser = optimiser
        self.momentum = momentum
        self.bp = bp
        self.history_weights = list(map(np.zeros_like, self.nn.weights))
        self.history_biases = list(map(np.zeros_like, self.nn.biases))

    def run(self, d_weights, d_biases):
        """Dispatch one update step to the rule named by ``self.optimiser``."""
        if self.optimiser == "sgd":
            step = self.SGD
        elif self.optimiser == "momentum":
            step = self.MomentunGD
        else:
            raise Exception("Invalid optimiser")
        step(d_weights, d_biases)

    def SGD(self, d_weights, d_biases):
        """Subtract ``lr * gradient`` from every weight matrix and bias vector."""
        layer_count = self.nn.hidden_layers + 1
        for layer in range(layer_count):
            weight_step = self.lr * d_weights[layer]
            self.nn.weights[layer] -= weight_step
            bias_step = self.lr * d_biases[layer]
            self.nn.biases[layer] -= bias_step

    def MomentunGD(self, d_weights, d_biases):
        """Momentum update: history = momentum * history + gradient, then step by lr * history."""
        layer_count = self.nn.hidden_layers + 1
        for layer in range(layer_count):
            velocity_w = self.momentum * self.history_weights[layer] + d_weights[layer]
            self.history_weights[layer] = velocity_w
            velocity_b = self.momentum * self.history_biases[layer] + d_biases[layer]
            self.history_biases[layer] = velocity_b

            self.nn.weights[layer] -= velocity_w * self.lr
            self.nn.biases[layer] -= velocity_b * self.lr
            

In [396]:
def train(x_train, y_train, epochs=10, batch_size=128, hid_layers=4, neurons=512,
          act_func="sigmoid", loss_name="cross_entropy", optimiser="momentum",
          lr=0.002, momentum=0.5):
    """Train a fresh FFNeuralNetwork with mini-batch gradient descent.

    The keyword defaults reproduce the values this cell was originally run
    with, so ``train(x, y)`` behaves as before; pass the notebook's config
    constants explicitly to use them instead.

    Parameters
    ----------
    x_train : matrix of size (n_samples, input_size)
    y_train : matrix of size (n_samples, output_size), one-hot encoded
    epochs : int
        Number of passes over the data.
    batch_size : int
        Rows per mini-batch; the last batch of an epoch may be smaller.
    hid_layers, neurons : int
        Number and width of the hidden layers.
    act_func : str
        Hidden activation, passed to both the network and Backpropagation.
    loss_name : str
        Loss identifier; it must be accepted by both ``loss`` and
        ``Backpropagation``.
    optimiser : str
        Update rule name understood by ``Optimiser.run``.
    lr, momentum : float
        Learning rate and momentum coefficient.

    Returns
    -------
    FFNeuralNetwork
        The trained network.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Layer widths follow the data instead of being hard-coded to 784 / 10.
    nn = FFNeuralNetwork(input_size=x_train.shape[1], output_size=y_train.shape[1], hid_layers=hid_layers, neurons=neurons, act_func=act_func, out_act_func="softmax")
    bp = Backpropagation(nn, loss=loss_name, act_func=act_func)
    opt = Optimiser(nn, bp, lr=lr, optimiser=optimiser, momentum=momentum)

    n_samples = x_train.shape[0]
    # The targets never change, so decode the one-hot labels once.
    true_labels = np.argmax(y_train, axis=1)

    for epoch in range(epochs):
        for start in range(0, n_samples, batch_size):
            x_batch = x_train[start:start + batch_size]
            y_batch = y_train[start:start + batch_size]

            y_pred = nn.forward(x_batch)
            d_weights, d_biases = bp.backward(y_batch, y_pred)
            opt.run(d_weights, d_biases)

        # End-of-epoch report on the full training set (loss is summed, not averaged).
        y_pred = nn.forward(x_train)
        print("Epoch: {}, Loss: {}".format(epoch + 1, loss(loss_name, y_train, y_pred)))
        print("Accuracy: {}".format(np.mean(np.argmax(y_pred, axis=1) == true_labels)))

    return nn

# Flatten every training image into a single feature row.
x_train_reshape = x_train.reshape(len(x_train), -1)

# One-hot encoding: row k of the 10x10 identity matrix is the code for label k,
# so indexing the identity with the label vector encodes the whole set at once.
y_train_reshape = np.eye(10)[y_train]

print(x_train_reshape.shape, y_train_reshape.shape, sep="\n")

nn = train(x_train_reshape, y_train_reshape)

(60000, 784)
(60000, 10)
Epoch: 1, Loss: 76933.39328326975
Accuracy: 0.6973
Epoch: 2, Loss: 63321.80562899262
Accuracy: 0.77915
Epoch: 3, Loss: 50759.83300491186
Accuracy: 0.79045
Epoch: 4, Loss: 42656.53882336947
Accuracy: 0.80535
Epoch: 5, Loss: 36418.252189558436
Accuracy: 0.8167333333333333
Epoch: 6, Loss: 34175.929024470264
Accuracy: 0.8233
Epoch: 7, Loss: 35859.699903745546
Accuracy: 0.8222
Epoch: 8, Loss: 31456.093385507913
Accuracy: 0.83385
Epoch: 9, Loss: 28949.980994272177
Accuracy: 0.8428666666666667
Epoch: 10, Loss: 26663.97555244872
Accuracy: 0.8485


In [395]:
# Flatten the held-out images the same way as the training images.
x_test_reshape = x_test.reshape(len(x_test), -1)

# Accuracy is the fraction of rows whose highest-scoring class equals the integer label.
y_pred = nn.forward(x_test_reshape)
print("Test Accuracy: {}".format(np.mean(y_pred.argmax(axis=1) == y_test)))

Test Accuracy: 0.7974
