### Problem 1 :

#### Definition of the MLP class:

In [None]:
import numpy as np
import pickle

# MLP class :

# The model is implemented as follows :
# Each layer is represented by a b vector (biases) and a W matrix (weights)
# These are referenced by the weights dictionary. The format is :
# self.weights[f"W{n}"] to access the weights of the n-th layer
# self.weights[f"b{n}"] to access the biases of the n-th layer

class NN(object):
    """Fully connected network (MLP) with ReLU hidden layers and a softmax output.

    Parameters live in the ``self.weights`` dictionary:
    ``self.weights[f"W{n}"]`` is the weight matrix of the n-th layer (shape
    ``(fan_in, fan_out)``) and ``self.weights[f"b{n}"]`` its bias row vector
    (shape ``(1, fan_out)``). Layers are numbered from 1; layer
    ``n_hidden + 1`` is the output layer.
    """

    def __init__(self,
                 hidden_dims=(1024, 2048),
                 n_hidden=2,
                 mode='train',
                 datapath=None,
                 model_path=None,
                 epsilon=1e-6,
                 lr=1e-1,
                 n_epochs=1000,
                 batch_size=1000):
        """Store the hyper-parameters and, when a path is given, load the data.

        Parameters :
        - hidden_dims: (sequence of int) - width of each hidden layer
        - n_hidden: (int) - number of hidden layers, must equal len(hidden_dims)
        - mode: (string) - current mode, "train" or "test"
        - datapath: (string or None) - pickle file holding a
          (train, validation, test) triple; when None nothing is loaded and
          self.tr / self.va / self.te are set to None
        - model_path: (string or None) - path where to save/load the model
          (stored only; not used by the methods of this class)
        - epsilon: (float) - clipping bound applied to probabilities in loss()
        - lr: (float) - learning rate
        - n_epochs: (int) - number of passes over the training set
        - batch_size: (int) - mini-batch size used for training
        """
        assert len(hidden_dims) == n_hidden, "Hidden dims mismatch!"

        self.hidden_dims = hidden_dims
        self.n_hidden = n_hidden
        self.mode = mode
        self.datapath = datapath
        self.model_path = model_path
        self.epsilon = epsilon
        self.lr = lr
        self.n_epochs = n_epochs
        self.batch_size = batch_size

        # train, validation and test sets :
        if datapath is None:
            self.tr = self.va = self.te = None
        else:
            # NOTE: unpickling can execute arbitrary code - only load trusted files.
            # encoding='latin1' lets Python 3 read pickles written by Python 2.
            with open(datapath, 'rb') as data_file:
                self.tr, self.va, self.te = pickle.load(data_file, encoding='latin1')

    def initialize_weights(self, dims, method):
        """
        Create self.weights for every layer (only when self.mode == "train").

        Parameters :
        - dims: (list of two integers) - the size of input/output layers
        - method: (string) - initializes the weight matrices
            -> "zero" for a Zero initialisation of the weights
            -> "normal" for a Normal initialisation of the weights
            -> "glorot" for a Uniform initialisation of the weights
        Biases are always initialised to zero.
        Raises ValueError for an unknown method, Exception for an unknown mode.
        :return: None
        """
        if self.mode == "train":

            self.weights = {}
            all_dims = [dims[0]] + list(self.hidden_dims) + [dims[1]]
            print("Layers dimensions are : ", all_dims)

            for layer_n in range(1, self.n_hidden + 2):
                shape = (all_dims[layer_n - 1], all_dims[layer_n])
                if method == "zero":
                    self.weights[f"W{layer_n}"] = np.zeros(shape=shape)
                elif method == "normal":
                    self.weights[f"W{layer_n}"] = np.random.normal(loc=0.0, scale=1.0, size=shape)
                elif method == "glorot":
                    # Uniform on [-b, b] with b = sqrt(6 / (fan_in + fan_out)).
                    b = np.sqrt(6.0 / (shape[0] + shape[1]))
                    self.weights[f"W{layer_n}"] = np.random.uniform(low=-1 * b, high=b, size=shape)
                else:
                    raise ValueError("The provided method name is invalid.")
                print("Initialized W", layer_n, ":\n", self.weights[f"W{layer_n}"])
                self.weights[f"b{layer_n}"] = np.zeros((1, all_dims[layer_n]))

        elif self.mode == "test":
            pass
        else:
            raise Exception("Unknown Mode!")

    def activation(self, input, prime=False):
        """Return ReLU(input), or its derivative (boolean mask input > 0) when prime is True."""
        if prime:
            return input > 0
        return np.maximum(0, input)

    def loss(self, prediction, labels):
        """Return the summed cross-entropy between prediction and one-hot labels.

        Probabilities are clipped to [epsilon, 1 - epsilon] before the log.
        The clipping is done on a copy so the caller's array (typically the
        cached network output) is left untouched.
        """
        clipped = np.clip(prediction, self.epsilon, 1 - self.epsilon)
        return - np.sum(labels * np.log(clipped))

    def softmax(self, input):
        """Return the row-wise softmax of a 2-D array, computed stably.

        The maximum is subtracted per row (softmax(x + C) = softmax(x)); with a
        single global maximum a row lying far below it would underflow to
        all zeros and produce 0/0.
        """
        shifted = np.exp(input - np.max(input, axis=1, keepdims=True))
        return shifted / np.sum(shifted, axis=1, keepdims=True)

    def forward(self, input):
        """Run the network on a batch and return every intermediate value.

        :return: dict with "H0" (the input), and for each layer n the
                 pre-activation "A{n}" and the activation "H{n}";
                 "H{n_hidden+1}" holds the softmax output.
        """
        cache = {"H0": input}
        for layer in range(1, self.n_hidden + 1):
            cache[f"A{layer}"] = cache[f"H{layer-1}"] @ self.weights[f"W{layer}"] + self.weights[f"b{layer}"]
            cache[f"H{layer}"] = self.activation(cache[f"A{layer}"])

        layer = self.n_hidden + 1
        cache[f"A{layer}"] = cache[f"H{layer-1}"] @ self.weights[f"W{layer}"] + self.weights[f"b{layer}"]
        cache[f"H{layer}"] = self.softmax(cache[f"A{layer}"])  # softmax on last layer
        return cache

    def backward(self, cache, labels):
        """Back-propagate the summed cross-entropy loss through the network.

        Parameters :
        - cache: (dict) - the value returned by forward() for the batch
        - labels: (array) - one-hot labels of the batch
        :return: dict of gradients summed over the batch: "dW{n}" and "db{n}"
                 (same shapes as the matching weights), plus the intermediate
                 "dA{n}" / "dH{n}" terms.
        """
        output = cache[f"H{self.n_hidden+1}"]
        grads = {
            # Gradient of softmax + cross-entropy w.r.t. the output pre-activation.
            f"dA{self.n_hidden+1}": - (labels - output),
        }
        for layer in range(self.n_hidden + 1, 0, -1):
            grads[f"dW{layer}"] = cache[f"H{layer-1}"].T @ grads[f"dA{layer}"]
            # Sum over the batch so db matches the (1, fan_out) bias and is
            # on the same scale as dW, which the matrix product already sums.
            grads[f"db{layer}"] = np.sum(grads[f"dA{layer}"], axis=0, keepdims=True)

            if layer > 1:
                grads[f"dH{layer-1}"] = grads[f"dA{layer}"] @ self.weights[f"W{layer}"].T
                grads[f"dA{layer-1}"] = grads[f"dH{layer-1}"] * self.activation(cache[f"A{layer-1}"], prime=True)
        return grads

    def update(self, grads, batch_size=None):
        """Apply one gradient-descent step to every weight matrix and bias.

        Parameters :
        - grads: (dict) - batch-summed gradients as returned by backward()
        - batch_size: (int or None) - number of samples the gradients were
          summed over; defaults to self.batch_size
        :return: None
        """
        if batch_size is None:
            batch_size = self.batch_size
        # n_hidden + 2 so that the output layer is updated as well.
        for layer in range(1, self.n_hidden + 2):
            self.weights[f"W{layer}"] = self.weights[f"W{layer}"] - self.lr * grads[f"dW{layer}"] / batch_size
            self.weights[f"b{layer}"] = self.weights[f"b{layer}"] - self.lr * grads[f"db{layer}"] / batch_size

    def train(self, initializationMethod):
        """Train with mini-batch gradient descent and print metrics every epoch.

        Parameters :
        - initializationMethod: (string) - forwarded to initialize_weights()
        Raises ValueError when no training/validation data is loaded.
        :return: None
        """
        if self.tr is None or self.va is None:
            raise ValueError("No dataset loaded: provide datapath or set self.tr and self.va.")

        X_train, y_train = self.tr
        X_val, y_val = self.va
        X_train, y_train = np.asarray(X_train), np.asarray(y_train)
        X_val, y_val = np.asarray(X_val), np.asarray(y_val)

        # Labels are shifted by their minimum so that the one-hot lookup is
        # valid even when the smallest class id is not 0.
        label_min = np.min(y_train)
        identity = np.eye(np.max(y_train) - label_min + 1)
        y_onehot = identity[y_train - label_min]
        onVal_y = identity[y_val - label_min]

        dims = [X_train.shape[1], y_onehot.shape[1]]
        self.initialize_weights(dims, initializationMethod)

        n_batches = int(np.ceil(X_train.shape[0] / self.batch_size))
        out_key = f"H{self.n_hidden + 1}"

        for epoch in range(self.n_epochs):
            predictedY = np.zeros_like(y_train)
            trainLoss = 0
            for batch in range(n_batches):
                rows = slice(self.batch_size * batch, self.batch_size * (batch + 1))
                minibatchX = X_train[rows]
                minibatchY = y_onehot[rows]
                cache = self.forward(minibatchX)
                grads = self.backward(cache, minibatchY)
                # The last batch may be smaller than self.batch_size.
                self.update(grads, batch_size=minibatchX.shape[0])

                # Loss/predictions come from the pre-update forward pass.
                trainLoss += self.loss(cache[out_key], minibatchY)
                predictedY[rows] = np.argmax(cache[out_key], axis=1) + label_min

            valCache = self.forward(X_val)

            predicted_valY = np.argmax(valCache[out_key], axis=1) + label_min
            valAccuracy = np.mean(y_val == predicted_valY)
            valLoss = self.loss(valCache[out_key], onVal_y)

            trAccuracy = np.mean(y_train == predictedY)

            print(f"Epoch= {epoch}, Loss={trainLoss:10.2f}, Accuracy={trAccuracy:4.2f}, Val.Loss={valLoss:10.2f}, Val.Accuracy= {valAccuracy:4.2f}")

    def test(self):
        """Placeholder: evaluation on the test split is not implemented."""
        pass

#### Test of the NN class with MNIST:

In [None]:
# Build an MLP with two hidden layers (500 and 400 units) on the MNIST pickle,
# then train it with Glorot-uniform initialised weights.
neural_net = NN(hidden_dims=(500, 400), datapath="mnist.pkl")
neural_net.train(initializationMethod="glorot")

Layers dimensions are :  [784, 500, 400, 10]
Initialized W 1 :
 [[ 0.00129058 -0.06596431  0.00814446 ... -0.05051122 -0.04019191
   0.06466707]
 [ 0.04339839  0.0328401  -0.0558186  ... -0.02579991 -0.04871583
  -0.00735755]
 [ 0.02227065 -0.04462931 -0.04940466 ... -0.03489078 -0.05014492
   0.05607768]
 ...
 [-0.03234866 -0.023499    0.00110721 ...  0.02435768  0.06770049
  -0.0466094 ]
 [ 0.03961624  0.0006465  -0.04126814 ...  0.01528632  0.01788926
   0.03989433]
 [-0.03846133 -0.06437692  0.02346631 ... -0.06265702 -0.04148803
   0.03378623]]
Initialized W 2 :
 [[ 0.05752787  0.07532826  0.01655859 ... -0.03216579 -0.05613719
  -0.03262965]
 [-0.04945394  0.04610535  0.01889513 ... -0.01169685  0.03838026
  -0.05419353]
 [-0.00038338 -0.05000258  0.01893677 ... -0.04346026  0.03564136
   0.03342679]
 ...
 [ 0.05693506 -0.07222667 -0.0349831  ... -0.06158319 -0.04302877
  -0.03379674]
 [ 0.05441861 -0.07849314  0.01068567 ... -0.04923981  0.00451123
   0.05469962]
 [ 0.00690586  

Epoch= 80, Loss=   3706.12, Accuracy=0.98, Val.Loss=    968.97, Val.Accuracy= 0.97
Epoch= 81, Loss=   3665.01, Accuracy=0.98, Val.Loss=    963.83, Val.Accuracy= 0.97
Epoch= 82, Loss=   3624.61, Accuracy=0.98, Val.Loss=    958.85, Val.Accuracy= 0.97
Epoch= 83, Loss=   3584.92, Accuracy=0.98, Val.Loss=    953.90, Val.Accuracy= 0.97
Epoch= 84, Loss=   3545.99, Accuracy=0.98, Val.Loss=    949.15, Val.Accuracy= 0.97
Epoch= 85, Loss=   3507.77, Accuracy=0.98, Val.Loss=    944.44, Val.Accuracy= 0.97
Epoch= 86, Loss=   3470.19, Accuracy=0.98, Val.Loss=    939.79, Val.Accuracy= 0.97
Epoch= 87, Loss=   3433.30, Accuracy=0.98, Val.Loss=    935.34, Val.Accuracy= 0.97
Epoch= 88, Loss=   3397.09, Accuracy=0.98, Val.Loss=    930.95, Val.Accuracy= 0.97
Epoch= 89, Loss=   3361.34, Accuracy=0.98, Val.Loss=    926.65, Val.Accuracy= 0.97
Epoch= 90, Loss=   3326.22, Accuracy=0.98, Val.Loss=    922.43, Val.Accuracy= 0.97
Epoch= 91, Loss=   3291.69, Accuracy=0.98, Val.Loss=    918.31, Val.Accuracy= 0.97
Epoc

Epoch= 178, Loss=   1561.97, Accuracy=0.99, Val.Loss=    738.81, Val.Accuracy= 0.98
Epoch= 179, Loss=   1550.61, Accuracy=0.99, Val.Loss=    737.86, Val.Accuracy= 0.98
Epoch= 180, Loss=   1539.26, Accuracy=0.99, Val.Loss=    736.93, Val.Accuracy= 0.98
Epoch= 181, Loss=   1528.02, Accuracy=0.99, Val.Loss=    736.10, Val.Accuracy= 0.98
Epoch= 182, Loss=   1516.99, Accuracy=0.99, Val.Loss=    735.15, Val.Accuracy= 0.98
Epoch= 183, Loss=   1506.00, Accuracy=0.99, Val.Loss=    734.33, Val.Accuracy= 0.98
Epoch= 184, Loss=   1495.13, Accuracy=0.99, Val.Loss=    733.47, Val.Accuracy= 0.98
Epoch= 185, Loss=   1484.44, Accuracy=0.99, Val.Loss=    732.56, Val.Accuracy= 0.98
Epoch= 186, Loss=   1473.77, Accuracy=0.99, Val.Loss=    731.75, Val.Accuracy= 0.98
Epoch= 187, Loss=   1463.31, Accuracy=0.99, Val.Loss=    730.93, Val.Accuracy= 0.98
Epoch= 188, Loss=   1452.82, Accuracy=0.99, Val.Loss=    730.10, Val.Accuracy= 0.98
Epoch= 189, Loss=   1442.52, Accuracy=0.99, Val.Loss=    729.37, Val.Accurac

Epoch= 276, Loss=    828.39, Accuracy=1.00, Val.Loss=    691.23, Val.Accuracy= 0.98
Epoch= 277, Loss=    823.62, Accuracy=1.00, Val.Loss=    690.96, Val.Accuracy= 0.98
