# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data processing

In [2]:
# Relative path of the training CSV (resolved against the current working directory).
train_path = "mnist_train.csv"
# Read the whole CSV into a DataFrame; later cells slice it by column position.
data = pd.read_csv(train_path)

In [3]:
# Keep only the first 10,000 rows; every later cell works on this subset.
data = data[:10000]

In [4]:
# First column only; the 0:1 slice keeps it 2-D, i.e. shape (n_rows, 1).
# Presumably this is the digit label -- TODO confirm against the CSV header.
labels = data.iloc[:, 0:1].to_numpy()
# All remaining columns, one row per sample (plot() reshapes a row to 28x28).
pixels = data.iloc[:, 1:].to_numpy()

In [5]:
def oneHotEncode(labels, n_classes=None):
    """Return a one-hot integer matrix for non-negative integer class labels.

    Parameters
    ----------
    labels : array of ints, shape (n,) or (n, 1)
        Class index for each sample. An (n, k) array marks k columns per row.
    n_classes : int, optional
        Number of columns in the result. Defaults to ``max(labels) + 1`` so
        that a subset which happens to miss a class (e.g. labels {0, 2}) is
        still encoded correctly instead of indexing out of bounds.

    Returns
    -------
    numpy.ndarray
        Array of dtype int and shape (n, n_classes) with a 1 at
        ``[row, label]`` and 0 elsewhere.

    Raises
    ------
    ValueError
        If any label is negative (it would silently wrap to the last columns).
    IndexError
        If a label is >= ``n_classes`` or labels are not of an integer dtype.
    """
    labels = np.asarray(labels)
    rows = labels.shape[0]
    flat = labels.reshape(-1)

    # Nothing to encode: keep the (0, 0) result of the loop-based version
    # unless the caller asked for a specific width.
    if flat.size == 0:
        return np.zeros((rows, n_classes or 0), dtype=int)

    if flat.min() < 0:
        raise ValueError("labels must be non-negative integers")

    if n_classes is None:
        n_classes = int(flat.max()) + 1

    base = np.zeros((rows, n_classes), dtype=int)

    # Row index for every flattened label; for (n,) and (n, 1) inputs this is
    # simply 0..n-1, for (n, k) inputs each row index is repeated k times.
    row_idx = np.repeat(np.arange(rows), flat.size // rows)
    base[row_idx, flat] = 1

    return base

In [6]:
def split_vals(a, n):
    """Cut `a` at position `n` and return (first n items, remaining items)."""
    head = a[:n]
    tail = a[n:]
    return head, tail

In [7]:
# Split point: number of rows that make up the first 70% of the data (truncated to int).
trn = int(np.shape(pixels)[0]*0.7)
# Rows [0, trn) form the training set and the rest the validation set.
# The split is purely positional (split_vals just slices), so nothing is shuffled here.
trn_labels, valid_labels = split_vals(labels, trn)
trn_pixels, valid_pixels = split_vals(pixels, trn)

In [8]:
def normalise(a, mean, std):
    """Centre `a` on `mean` and then scale the result by `std`."""
    centred = a - mean
    return centred / std

In [9]:
# Scalar statistics taken over every entry of the training pixels only.
mean = trn_pixels.mean()
std = trn_pixels.std()
# The same training-set mean/std are applied to both splits, so the validation
# data is scaled with statistics it did not contribute to.
norm_trn_pixels = normalise(trn_pixels, mean, std)
norm_valid_pixels = normalise(valid_pixels, mean, std)

# Plot

In [10]:
def show(img, title = None):
    """Draw `img` with a gray colormap; add `title` above it when one is given."""
    plt.imshow(img, cmap = "gray")
    if title is None:
        return
    plt.title(title)

def plot(arr):
    """Reshape `arr` (784 values) into a 28x28 image and display it with show()."""
    show(np.reshape(arr, (28, 28)))

# Layer functions

In [11]:
class Linear():
    """Fully connected layer computing ``x @ weights + bias``.

    Attributes set by the methods below:
      weights      -- (n_input, n_output) array
      bias         -- (n_output,) array
      old_x        -- input of the most recent forward() call
      grad_weights -- batch-averaged gradient w.r.t. weights, (n_input, n_output)
      grad_bias    -- batch-averaged gradient w.r.t. bias, (n_output,)
    """

    def __init__(self, n_input, n_output):
        # NOTE(review): this reseeds NumPy's *global* generator on every
        # construction. Initial weights are therefore reproducible, but two
        # layers of the same shape start out identical and any random draws
        # made elsewhere are reset as well. Kept as-is to preserve results.
        np.random.seed(42)
        # Standard-normal draws scaled by sqrt(2 / n_input).
        self.weights = np.random.randn(n_input, n_output) * np.sqrt(2/n_input)
        self.bias = np.zeros(n_output)

    def setWeights(self, weights = None):
        """Replace the weight matrix; a `None` argument leaves it untouched."""
        if weights is not None:
            self.weights = weights

    def setBias(self, bias = None):
        """Replace the bias vector; a `None` argument leaves it untouched."""
        if bias is not None:
            self.bias = bias

    def forward(self, x):
        """Return ``x @ weights + bias`` and remember `x` for backward().

        x is (n_samples, n_input); the result is (n_samples, n_output).
        """
        self.old_x = x
        return np.matmul(x, self.weights) + self.bias

    def backward(self, grad):
        """Store parameter gradients and return the gradient w.r.t. the input.

        grad is the gradient arriving from the layers ahead, one value per
        output of this layer: shape (n_samples, n_output).
        Returns an array of shape (n_samples, n_input) for the previous layer.
        """
        # Average over the sample axis -> (n_output,)
        self.grad_bias = np.mean(grad, axis=0)

        # The per-sample weight gradient is the outer product
        # x_i (n_input, 1) * grad_i (1, n_output), and we want its mean over
        # all samples. Summing those outer products is exactly x^T @ grad, so
        # one (n_input, n_samples) @ (n_samples, n_output) product divided by
        # n_samples gives the same (n_input, n_output) result without
        # materialising an (n_samples, n_input, n_output) intermediate.
        n_samples = np.shape(self.old_x)[0]
        self.grad_weights = np.matmul(np.transpose(self.old_x), grad) / n_samples

        # (n_samples, n_output) @ (n_input, n_output)^T -> (n_samples, n_input),
        # i.e. one gradient value per output of the previous layer.
        return np.dot(grad, self.weights.transpose())

    def __repr__(self):
        # This text is also used to build the JSON keys when parameters are
        # saved and loaded, so its format must stay stable.
        n_input, n_output = np.shape(self.weights)
        return f"Linear ({n_input},{n_output})"


In [12]:
class ReLU():
    """Rectified linear activation: negative inputs become 0, the rest pass through."""

    def forward(self, x):
        """Return max(x, 0) element-wise and remember `x` for backward()."""
        self.old_x = x
        return np.maximum(x, 0)

    def backward(self, grad):
        """Let `grad` through where the stored input was > 0; zero it elsewhere."""
        was_positive = self.old_x > 0
        return np.where(was_positive, grad, 0)

    def __repr__(self):
        return "ReLU"

In [13]:
class Softmax():
    """Row-wise softmax over a 2-D (n_samples, n_classes) input."""

    def forward(self, x):
        """Return exp(x) normalised per row; the result is kept for backward()."""
        # Subtract each row's maximum first so the largest exponent is exp(0).
        row_max = np.max(x, axis = 1).reshape(-1, 1)
        exps = np.exp(x - row_max)
        self.old_y = exps / exps.sum(axis=1, keepdims=True)
        return self.old_y

    def backward(self, grad):
        """Return y * (grad - sum_j(grad_j * y_j)) for each row of the stored output y."""
        weighted_sum = (grad * self.old_y).sum(axis=1, keepdims=True)
        return self.old_y * (grad - weighted_sum)

    def __repr__(self):
        return "Softmax"

In [14]:
class CrossEntropy():
    """Cross-entropy loss between predicted probabilities and one-hot targets."""

    def forward(self, x, y):
        """Return the per-sample loss: the sum of -log(x) over entries where y == 1.

        x is clipped from below at 1e-8 before the log; the clipped x and y
        are stored for backward().
        """
        self.old_x = x.clip(min=1e-8)
        self.old_y = y
        neg_log = -np.log(self.old_x)
        picked = np.where(y == 1, neg_log, 0)
        return picked.sum(axis=1)

    def backward(self):
        """Return -1/x where the stored target is 1 and 0 elsewhere."""
        neg_inverse = -1 / self.old_x
        return np.where(self.old_y == 1, neg_inverse, 0)

    def __repr__(self):
        return "Cross-Entropy"

In [15]:
class Sigmoid():
    """Element-wise logistic activation, 1 / (1 + exp(-x))."""

    def forward(self, x):
        """Return sigmoid(x) and keep the result for backward().

        exp() is only ever evaluated on -|x|, which cannot overflow. The
        direct form exp(x) / (1 + exp(x)) turns into inf / inf = NaN once x is
        large enough for exp(x) to overflow.
        """
        x = np.asarray(x)
        decay = np.exp(-np.abs(x))
        # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Same function,
        # written so that each branch only needs exp of a non-positive number.
        self.old_y = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
        return self.old_y

    def backward(self, grad):
        """Return grad * y * (1 - y), with y the output of the last forward()."""
        differentiation = self.old_y * (1 - self.old_y)
        return differentiation * grad

    def __repr__(self):
        return "Sigmoid"

# Model

In [16]:
class Model():
    """A sequential stack of layers together with a cost (loss) object."""

    def __init__(self, layers, cost):
        self.layers = layers
        self.cost = cost

    def forward(self, x):
        """Feed `x` through every layer in order and return the final output."""
        out = x
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def predict(self, x):
        """Return the network output for `x` (same as forward())."""
        return self.forward(x)

    def loss(self, x, y):
        """Run a forward pass on `x` and return the cost of its output against `y`."""
        output = self.forward(x)
        return self.cost.forward(output, y)

    def backward(self):
        """Back-propagate: start from the cost's gradient and walk the layers last to first.

        Each layer's backward() receives the gradient produced by the layer
        after it. Nothing is returned.
        """
        grad = self.cost.backward()
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

# Initialising

In [17]:
# Network: 784 inputs -> Linear(20) -> ReLU -> Linear(10) -> Softmax.
layers = [Linear(784, 20), ReLU(), Linear(20, 10), Softmax()]
# Loss applied to the Softmax output during training.
cost = CrossEntropy()
model = Model(layers = layers, cost = cost)

In [18]:
# Number of full-batch update steps train() performs.
epochs = 20
# NOTE(review): this value is never passed to the train() call in this file,
# so training runs with train()'s default learning_rate of 0.1 -- confirm
# which value is intended.
learning_rate = 1

# Train loop

In [19]:
def train(model, inputs, labels, epochs=1, learning_rate=0.1):
    """Train `model` with full-batch gradient descent, printing the loss each epoch.

    Parameters
    ----------
    model : object exposing `loss(inputs, labels)`, `backward()` and `layers`
    inputs : array whose first axis indexes samples
    labels : targets handed straight to `model.loss`
    epochs : int
        Number of update steps; every step uses the whole of `inputs`.
    learning_rate : float
        Step size applied to the stored gradients of each Linear layer.

    Returns nothing. The weights and biases of the Linear layers are updated
    in place.
    """
    n_samples = np.shape(inputs)[0]

    for _ in range(epochs):
        # Forward pass; the summed loss is only used for reporting.
        total_loss = model.loss(inputs, labels).sum()

        # Populates grad_weights / grad_bias on every Linear layer.
        model.backward()

        for layer in model.layers:
            # isinstance (rather than an exact type comparison) so that
            # subclasses of Linear are updated as well.
            if isinstance(layer, Linear):
                layer.weights -= learning_rate * layer.grad_weights
                layer.bias -= learning_rate * layer.grad_bias

        print(f"total loss: {total_loss}, inputs: {n_samples}, average loss: {total_loss/n_samples}")

In [20]:
# Train on the normalised training pixels with one-hot encoded labels.
# NOTE(review): learning_rate is not passed here, so train() falls back to its
# default of 0.1 rather than the `learning_rate` variable defined earlier.
train(model = model, inputs = norm_trn_pixels, labels = oneHotEncode(trn_labels), epochs = epochs)

total loss: 19891.955770908906, inputs: 7000, average loss: 2.841707967272701
total loss: 14771.314874160618, inputs: 7000, average loss: 2.1101878391658024
total loss: 13384.901270106304, inputs: 7000, average loss: 1.9121287528723292
total loss: 12188.518853317266, inputs: 7000, average loss: 1.7412169790453238
total loss: 11092.170626703577, inputs: 7000, average loss: 1.5845958038147967
total loss: 10090.848818871971, inputs: 7000, average loss: 1.4415498312674244
total loss: 9215.956073815483, inputs: 7000, average loss: 1.3165651534022118
total loss: 8440.307906310703, inputs: 7000, average loss: 1.2057582723301006
total loss: 7782.041848054709, inputs: 7000, average loss: 1.1117202640078154
total loss: 7237.650212252598, inputs: 7000, average loss: 1.0339500303217997
total loss: 6841.737538930327, inputs: 7000, average loss: 0.9773910769900467
total loss: 6669.641083163524, inputs: 7000, average loss: 0.9528058690233606
total loss: 6913.719919276392, inputs: 7000, average loss: 

# Exp

In [21]:
def compute_accuracy(predictions, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    predictions -- 2-D array of per-class scores, one row per sample
    labels      -- integer class labels; flattened to 1-D before comparing
    """
    predicted = predictions.argmax(axis = 1)
    truth = labels.reshape(-1)
    n_correct = (predicted == truth).sum()
    return n_correct / predicted.shape[0]

In [22]:
# Accuracy on the held-out validation split.
p = model.predict(norm_valid_pixels)
acc = compute_accuracy(p, valid_labels)

In [23]:
acc

0.795

In [24]:
# Accuracy on the training split, for comparison with the validation figure.
p = model.predict(norm_trn_pixels)
acc = compute_accuracy(p, trn_labels)

In [25]:
acc

0.8162857142857143

In [26]:
import json

In [29]:
def dumpParamsInJson(model, filename="nn_params.json"):
    """Write the weights and biases of every Linear layer in `model` to a JSON file.

    Parameters
    ----------
    model : object with a `layers` sequence
    filename : str
        Destination path; defaults to "nn_params.json".

    Each Linear layer contributes two entries keyed by its string form, e.g.
    "Linear (784,20)_weights" and "Linear (784,20)_bias", holding nested lists.

    Raises
    ------
    ValueError
        If two Linear layers have the same string form (same shape). Their
        keys would collide and the earlier layer's parameters would be lost
        without any warning.
    """
    params = {}

    for layer in model.layers:
        # isinstance so that subclasses of Linear are saved too.
        if isinstance(layer, Linear):
            key_weight = f"{layer}_weights"
            key_bias = f"{layer}_bias"
            if key_weight in params or key_bias in params:
                raise ValueError(f"duplicate parameter key for layer {layer}")
            params[key_weight] = layer.weights.tolist()
            params[key_bias] = layer.bias.tolist()

    with open(filename, "w") as file:
        json.dump(params, file)

In [30]:
dumpParamsInJson(model)

In [64]:
def loadParamsIntoLayers(filename, layers):
    """Load saved weights and biases from a JSON file into the Linear layers of `layers`.

    Parameters
    ----------
    filename : str
        Path of a JSON file whose keys are "<layer>_weights" / "<layer>_bias",
        where <layer> is the layer's string form (e.g. "Linear (784,20)").
    layers : sequence of layer objects
        Linear layers are updated in place; other layers are left alone.

    Returns
    -------
    The same `layers` object that was passed in.

    Raises
    ------
    KeyError
        If the file holds no entry for one of the Linear layers.
    """
    with open(filename, "r") as file:
        params = json.load(file)

    for layer in layers:
        # isinstance so that subclasses of Linear are restored too.
        if not isinstance(layer, Linear):
            continue

        key_weight = f"{layer}_weights"
        key_bias = f"{layer}_bias"
        try:
            weights = params[key_weight]
            bias = params[key_bias]
        except KeyError as err:
            # Same exception type as a plain lookup, but naming the layer and file.
            raise KeyError(f"no saved parameters for layer {layer} in {filename}") from err

        layer.setWeights(np.array(weights))
        layer.setBias(np.array(bias))

    return layers

In [65]:
filename = "nn_params.json"
# Fresh, untrained network with the same architecture as the trained one.
layersR = [Linear(784, 20), ReLU(), Linear(20, 10), Softmax()]
# Load the saved parameters into the *fresh* layers. Passing the already
# trained `layers` here would make layersR an alias of the trained layers and
# discard the list built on the previous line, so the save/load round trip
# would never actually be exercised.
layersR = loadParamsIntoLayers(filename, layersR)

In [67]:
# Second model built from layersR; it shares the same cost object as `model`.
modelR = Model(layers = layersR, cost = cost)

In [68]:
# Quick check: run modelR on the first two normalised training samples.
p = modelR.predict(norm_trn_pixels[:2])