In [9]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

def Verify(expression, message):
    """Raise ``AssertionError(message)`` when ``expression`` is falsy.

    Args:
        expression: Condition that is expected to hold.
        message: Human-readable explanation attached to the raised error.

    Raises:
        AssertionError: If ``expression`` evaluates to false.
    """
    # Raise explicitly rather than via ``assert`` so the check is not stripped
    # under ``python -O`` and so the caller-supplied message reaches whoever
    # catches or prints the error.
    if not expression:
        raise AssertionError(message)

#Can be changed later
def initialize_matrix(shape):
    """Build a fresh array of the requested ``shape`` filled with ones."""
    matrix = np.ones(shape)
    return matrix

def softmax(z):
    """Normalize each row of ``z`` into a probability distribution.

    The reductions run along ``axis=1``, so ``z`` must have at least two
    dimensions. Each row's maximum is subtracted before exponentiating, which
    leaves the result unchanged while keeping ``np.exp`` from overflowing.
    """
    row_peak = np.max(z, axis=1, keepdims=True)
    exponentials = np.exp(z - row_peak)
    row_total = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_total

In [None]:
####THIS CELL IS NOT TO BE USED, PUT HERE SO I CAN LOOK BACK LATER MAYBE####

class ReLU:
    """Callable ReLU activation that also exposes its elementwise derivative."""

    def __call__(self, X):
        """Return the elementwise maximum of 0 and ``X``."""
        activated = np.maximum(0, X)
        return activated

    def grad(self, X):
        """Return 1 where ``X`` is strictly positive and 0 elsewhere, in ``X``'s dtype."""
        positive_mask = X > 0
        return positive_mask.astype(X.dtype)

## MLP Implementation uses softmax and a cross entropy loss function
class MLP:
    """Forward-only multilayer perceptron that stores one weight matrix per layer.

    Every weight matrix is created with ``initialize_matrix``. When
    ``use_bias`` is true each matrix gets one extra input row, and ``predict``
    appends a matching column of ones to the layer input so that row acts as
    the bias.
    """

    def __init__(self, activation_functor_with_grad, number_of_hidden_layers, number_of_hidden_units : list, input_dimensions=784, output_dimensions = 25, use_bias = True):
        """Validate the configuration and allocate the weight matrices.

        Args:
            activation_functor_with_grad: Callable activation that also has a
                ``grad`` attribute.
            number_of_hidden_layers: Expected length of ``number_of_hidden_units``.
            number_of_hidden_units: Width of each hidden layer, in order.
            input_dimensions: Number of input features.
            output_dimensions: Number of output units.
            use_bias: Whether every layer gets a bias row.

        Raises:
            AssertionError: If the hidden-layer count does not match the list
                length, or the activation is not callable / lacks ``grad``.
        """
        # Validation errors propagate to the caller. They used to be printed
        # and swallowed here, which returned an instance without any weights
        # that only failed later inside ``predict``.
        Verify(number_of_hidden_layers == len(number_of_hidden_units), "Hidden layers does not match size of hidden units list.")
        Verify(callable(activation_functor_with_grad) and hasattr(activation_functor_with_grad, 'grad'), "Activation functor passed must be callable and have a grad method.")

        self.activator = activation_functor_with_grad
        self.number_of_hidden_layers = number_of_hidden_layers
        self.number_of_hidden_units = number_of_hidden_units
        self.use_bias = use_bias

        self.dimensions_list = [input_dimensions] + list(number_of_hidden_units) + [output_dimensions]

        # One matrix per pair of consecutive layer widths; the extra row (when
        # biases are enabled) multiplies the column of ones added in predict().
        bias_rows = int(bool(self.use_bias))
        self.weights = [
            initialize_matrix((fan_in + bias_rows, fan_out))
            for fan_in, fan_out in zip(self.dimensions_list, self.dimensions_list[1:])
        ]

    def _with_bias_column(self, activations):
        """Append a column of ones to ``activations`` when biases are enabled."""
        if not self.use_bias:
            return activations
        ones_column = np.ones((activations.shape[0], 1))
        return np.hstack((activations, ones_column))

    def predict(self, input):
        """Run a forward pass and return row-wise softmax probabilities.

        Args:
            input: 2-D array with one sample per row (``np.hstack`` with a
                ``(rows, 1)`` column and ``shape[0]`` are applied to it).

        Returns:
            The result of ``softmax`` applied to the activated final layer.
        """
        current_layer = self._with_bias_column(input)
        # All but the last matrix: affine map, activation, then the bias column
        # for the following layer.
        for weight_matrix in self.weights[:-1]:
            current_layer = self.activator(current_layer @ weight_matrix)
            current_layer = self._with_bias_column(current_layer)

        # The last layer's output is not extended with a bias column.
        # NOTE(review): the activation is applied to the final pre-activations
        # before softmax; confirm this is intended rather than softmax on the
        # raw outputs.
        final_unactivated = current_layer @ self.weights[-1]
        return softmax(self.activator(final_unactivated))

    def grad(self):
        """Placeholder for backpropagation; not implemented, always returns ``None``."""
        return None

In [11]:
class Optimizer:
    """Base optimizer: walks a network's layers and delegates the update rule.

    Subclasses implement ``update``; ``step`` decides which layers are visited.
    """

    def __init__(self, net):
        """Remember the network whose ``layers`` will be optimized."""
        self.net = net

    def step(self):
        """Call ``update`` for every layer that has parameters, last layer first."""
        for layer in self.net.layers[::-1]:
            # Layers without parameters have nothing to update.
            if layer.parameters is None:
                continue
            self.update(layer.parameters, layer.gradient)

    def update(self, params, gradient):
        """Apply one update to ``params`` given ``gradient``; must be overridden."""
        raise NotImplementedError()

class GradientDescentOptimizer(Optimizer):
    """Plain gradient descent with a fixed learning rate."""

    # The annotation is a string (forward reference) because ``MLP`` is defined
    # further down the file; an unquoted name would be evaluated when this
    # class body runs and raise NameError if ``MLP`` does not exist yet.
    def __init__(self, net: "MLP", lr: float):
        """Store the network to optimize and the step size.

        Args:
            net: Network whose ``layers`` are visited by ``step``.
            lr: Learning rate multiplying each averaged gradient.
        """
        super().__init__(net)
        self.lr = lr

    def update(self, params, gradient):
        """Take one descent step on each parameter array, in place.

        Args:
            params: Iterable of parameter arrays.
            gradient: Iterable of gradients paired positionally with
                ``params``; each is averaged over its first axis before use.
        """
        for (p, g) in zip(params, gradient):
            # ``-=`` mutates the array object itself, so whoever else holds a
            # reference to ``p`` (the layer) sees the updated values.
            p -= self.lr * g.mean(axis=0)

In [13]:
#Base class for neural net layers, from: https://colab.research.google.com/github/yueliyl/comp551-notebooks/blob/master/NumpyDeepMLP.ipynb
#Overall approach is very similar to the one taken in most machine learning APIs like TensorFlow and PyTorch
class NeuralNetLayer:
    """Interface shared by all layers: ``forward``/``backward`` hooks plus two slots.

    ``parameters`` stays ``None`` unless a subclass assigns it, and
    ``gradient`` stays ``None`` until a subclass stores one.
    """

    def __init__(self):
        """Start with no parameters and no stored gradient."""
        self.parameters = self.gradient = None

    def forward(self, X):
        """Compute the layer output for ``X``; subclasses must override."""
        raise NotImplementedError()

    def backward(self, gradient):
        """Propagate ``gradient`` back through the layer; subclasses must override."""
        raise NotImplementedError()
    

#Linear Layer implementation, with gaussian initialization
class LinearLayer(NeuralNetLayer):
    """Affine layer computing ``W x + b`` for every row of a 2-D input batch.

    ``w`` has shape ``(output_size, input_size)`` and ``b`` has shape
    ``(output_size,)``; both are drawn from ``np.random.randn``.
    """

    def __init__(self, input_size, output_size):
        """Create randomly initialised weights and bias.

        Args:
            input_size: Number of features per input row.
            output_size: Number of features per output row.
        """
        super().__init__()
        self.ni = input_size
        self.no = output_size
        self.w = np.random.randn(output_size, input_size)
        self.b = np.random.randn(output_size)
        self.cur_input = None  # Input of the latest forward(); needed by backward()
        self.parameters = [self.w, self.b]

    def forward(self, x):
        """Return ``x @ w.T + b`` and remember ``x`` for the backward pass.

        Args:
            x: Array of shape ``(N, input_size)``.

        Returns:
            Array of shape ``(N, output_size)``.
        """
        self.cur_input = x
        # (1, no, ni) @ (N, ni, 1) -> (N, no, 1). Only the trailing axis is
        # squeezed: a bare squeeze() would also drop the batch axis when N == 1
        # or the feature axis when output_size == 1, breaking later layers.
        return (self.w[None, :, :] @ x[:, :, None]).squeeze(axis=-1) + self.b

    def backward(self, gradient):
        """Store per-sample parameter gradients and return the input gradient.

        Args:
            gradient: Array of shape ``(N, output_size)`` with the gradient
                with respect to this layer's output.

        Returns:
            ``gradient.dot(w)``, the gradient with respect to the layer input.

        Raises:
            AssertionError: If ``forward`` has not been called yet.
        """
        Verify(self.cur_input is not None, "Must call forawrd before backward")
        # Outer product per sample: (N, no, 1) @ (N, 1, ni) -> (N, no, ni).
        dw = gradient[:, :, None] @ self.cur_input[:, None, :]
        # The output depends on b with derivative 1, so the gradient passes through.
        db = gradient
        self.gradient = [dw, db]
        return gradient.dot(self.w)
    

#ReLU Layer implementation
class ReLULayer(NeuralNetLayer):
    """Parameter-free ReLU layer: ``max(0, x)`` elementwise."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``max(0, x)`` and cache the local derivative for ``backward``.

        Args:
            x: Input array.

        Returns:
            Array like ``x`` with negative entries replaced by 0.
        """
        # The derivative depends on the input: 1 where x > 0, otherwise 0.
        self.gradient = np.where(x > 0, 1.0, 0.0)
        # The activation has to be returned so the next layer receives it;
        # without the return the forward chain would be handed None.
        return np.maximum(0, x)

    def backward(self, gradient):
        """Mask the incoming gradient with the derivative cached by ``forward``.

        Raises:
            AssertionError: If ``forward`` has not been called yet.
        """
        Verify(self.gradient is not None, "Must call forward before backward")
        return gradient * self.gradient

    @staticmethod
    def copy():
        """Return a fresh ``ReLULayer``.

        Declared static so it can be called on the class as well as on an
        instance (the instance call previously raised TypeError because the
        method took no ``self``).
        """
        return ReLULayer()
    
#Softmax layer - gradient only valid for use with cross entropy loss
class SoftmaxOutputLayer(NeuralNetLayer):
    """Softmax output layer whose ``backward`` assumes a cross-entropy loss."""

    def __init__(self):
        super().__init__()
        self.cur_probs = None  # Probabilities from the latest forward()

    def forward(self, x):
        """Return softmax probabilities along the last axis and cache them.

        Args:
            x: Array of scores; normalisation runs over the last axis.

        Returns:
            Array of the same shape as ``x`` whose last axis sums to 1.
        """
        # Subtracting the per-row maximum leaves the softmax unchanged but
        # keeps np.exp from overflowing to inf (and the ratio to NaN).
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        probs = exps / np.sum(exps, axis=-1, keepdims=True)
        self.cur_probs = probs
        return probs

    def backward(self, target):
        """Return ``cur_probs - target``.

        This is the gradient with respect to the layer input only when the
        loss is cross entropy and ``target`` holds the one-hot labels.

        Raises:
            AssertionError: If ``forward`` has not been called yet.
        """
        Verify(self.cur_probs is not None, "Must call forward before backward")
        return self.cur_probs - target
    

class MLP:
    """Layer-based multilayer perceptron ending in a softmax output layer.

    The network is a list of layer objects: for each pair of consecutive
    widths a ``LinearLayer`` followed by a copy of the activation layer, and a
    final ``SoftmaxOutputLayer``.
    """

    def __init__(self, activation_functor, number_of_hidden_layers, number_of_hidden_units : list, input_dimensions=784, output_dimensions = 25):
        """Build the layer stack.

        Args:
            activation_functor: Object providing ``copy()``, which must return
                a new activation layer.
            number_of_hidden_layers: Expected length of ``number_of_hidden_units``.
            number_of_hidden_units: Width of each hidden layer, in order.
            input_dimensions: Number of input features.
            output_dimensions: Number of output classes.

        Raises:
            AssertionError: If the hidden-layer count does not match the list length.
        """
        Verify(number_of_hidden_layers == len(number_of_hidden_units), "Hidden layers does not match size of hidden units list.")

        self.activator = activation_functor
        self.number_of_hidden_layers = number_of_hidden_layers
        self.number_of_hidden_units = number_of_hidden_units

        self.dimensions_list = [input_dimensions] + number_of_hidden_units + [output_dimensions]
        self.layers = []
        # NOTE(review): an activation layer is also added after the last linear
        # layer, i.e. right before the softmax; confirm this is intended.
        for fan_in, fan_out in zip(self.dimensions_list, self.dimensions_list[1:]):
            self.layers.append(LinearLayer(fan_in, fan_out))
            self.layers.append(activation_functor.copy())
        self.layers.append(SoftmaxOutputLayer())  # Final layer is a softmax

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the last output."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, target):
        """Backpropagate from the output layer to the first layer.

        Args:
            target: Value handed to the last layer's ``backward``; each
                layer's return value is passed on to the layer before it.
        """
        for layer in self.layers[::-1]:
            # After the first call this name holds a gradient, not the labels.
            target = layer.backward(target)

    # Just a wrapper for convenience with other code
    def predict(self, x):
        """Return ``self.forward(x)``."""
        return self.forward(x)

    def fit(self, x, y, max_iter, optimizer : Optimizer):
        """Train for ``max_iter`` full-batch iterations and plot the loss curve.

        Args:
            x: Training inputs passed to ``forward``.
            y: Integer class labels, used as row indices into an identity
                matrix, so each must be in ``[0, output_dimensions)``.
            max_iter: Number of iterations to run.
            optimizer: Optimizer whose ``step()`` is called after every
                backward pass.
        """
        losses = []
        # One-hot encode against the network's own output width so the label
        # matrix has the same number of columns as the predictions.
        num_classes = self.dimensions_list[-1]
        labels = np.eye(num_classes)[np.array(y)]

        for _ in tqdm(range(max_iter)):
            predictions = self.forward(x)
            # Cross entropy: sum over classes, then average over samples.
            loss = -(labels * np.log(predictions)).sum(axis=-1).mean()
            losses.append(loss)
            # backward() only rebinds its own local name; ``labels`` here is
            # left untouched and can be reused on the next iteration.
            self.backward(labels)
            optimizer.step()

        plt.plot(losses)
        plt.xlabel("Epoch")
        plt.ylabel("Cross entropy loss")
            