# Zahra Khatibi - 610398119 - Deep Learning HomeWork

## MLP Approach for Mnist Fashion Problem

In [40]:
import numpy as np
import pandas as pd
import os
import sys
import math
import plotly.express as px

### Initially, following the provided instructions, we read the data from the input and divide all inputs by 255 to scale the input values between 0 and 1. Furthermore, for the cross-entropy part, it is necessary to convert the y values into a one-hot format. This will be elaborated upon in the following sections.
### In this function, we transform the values of y into a 1x10 vector, where the index y[i] of that vector equals 1, and the rest are set to 0.

### In this function, we convert the values of y into a 1x10 vector where the i-th index of that vector equals 1 and the rest are zero.

In [2]:
def one_hot(y):
    result = np.zeros((len(y), 10))
    for i in range(len(y)):
        result[i, y[i]] = 1
    return result

In [3]:
with open ("train.npy" , "rb") as f:
    x_train = np.load(f, allow_pickle = True)
    y_train = np.load(f, allow_pickle = True)
x_train = x_train.astype(int) / 255
y_train = one_hot(y_train)

In [4]:
with open ("val.npy" , "rb") as f:
    x_val = np.load(f, allow_pickle = True)
    y_val = np.load(f, allow_pickle = True)
x_val = x_val.astype(int) / 255
y_val = one_hot(y_val)

In [5]:
with open ("test.npy" , "rb") as f:
    x_test = np.load(f, allow_pickle = True)
    y_test = np.load(f, allow_pickle = True)
x_test = x_test.astype(int) / 255
y_test = one_hot(y_test)

In [6]:
print(f"{x_train.shape = } -- {y_train.shape = }")
print(f"{x_val.shape = } -- {y_val.shape = }")
print(f"{x_test.shape = } -- {y_test.shape = }")

x_train.shape = (50000, 784) -- y_train.shape = (50000, 10)
x_val.shape = (10000, 784) -- y_val.shape = (10000, 10)
x_test.shape = (10000, 784) -- y_test.shape = (10000, 10)


## In this section, the MLP neural network is implemented.

In [15]:
class MLP:

    def __init__(self, input_size, output_size, hiddens, activations, lr ,criterion, lamda):

        self.train_mode = True
        self.nlayers = len(hiddens)+1
        self.input_size = input_size
        self.output_size = output_size
        self.activations = activations
        self.lr = lr
        self.criterion = criterion
        self.lamda = lamda
        
        input_sizes = [self.input_size]
        input_sizes.extend(hiddens)
        output_sizes = hiddens
        output_sizes.append(self.output_size)
        self.linear_layers = []
        
        for k in range(self.nlayers):
            if k == 0:
                prev_layer = self.input_size
            else:
                prev_layer = output_sizes[k-1]
            self.linear_layers.append(Linear(input_sizes[k], output_sizes[k], prev_layer))

    def forward(self, x):

        forward_input = x
        for k in range(self.nlayers):
            output = self.linear_layers[k].forward(forward_input)
            output = self.activations[k](output)
            forward_input = output 
        self.output = output
        return output

    def zero_grads(self):

        for i in range(len(self.linear_layers)):
            layers = self.linear_layers
            layers[i].dW.fill(0.0)
            layers[i].db.fill(0.0)

    def step(self):
        for i in range(len(self.linear_layers)):
            self.linear_layers[i].update_W = -1 * (self.lr * self.linear_layers[i].dW)
            self.linear_layers[i].W = self.linear_layers[i].W + self.linear_layers[i].update_W 
            self.linear_layers[i].update_b = -1 * (self.lr * self.linear_layers[i].db)
            self.linear_layers[i].b = self.linear_layers[i].b + self.linear_layers[i].update_b
        

        
    def compute_cost(self, y, lamda):
        cross = -1 * np.mean(np.sum(y * np.log(self.output), axis=1))
        p = 0
        for i in range(len(self.linear_layers)):
            p += np.sum(self.linear_layers[i].W ** 2)
        return cross + (lamda /(2 * len(y))) * p
        
    
    def backward(self, labels, lamda):
        criterion_derivative = self.output - labels
        for k in reversed(range(len(self.linear_layers))):
            if k == self.nlayers-1:
                delta = self.output - labels
            else:
                delta = criterion_derivative * self.activations[k].derivative()
            linear_backward = self.linear_layers[k].backward(delta, lamda)
            criterion_derivative = linear_backward
        return criterion_derivative
    
    def split_batch(self, x, y, batch_size):
        dt = np.concatenate([x, y] , axis=1)
        np.random.shuffle(dt)
        length = len(dt) // batch_size
        batches = []
        for batch_n in range(length):
            row_start, row_end = batch_n * batch_size, (batch_n + 1) * batch_size
            x_batch, y_batch = dt[row_start:row_end, :-10], dt[row_start:row_end, -10:]
            batches.append((x_batch, y_batch))
        return batches
    
    
    def fit(self, x, y, epoch, batch_size, x_val, y_val, Print=True):
        self.zero_grads()
        lst_costs = []
        val_costs = []
        for epoch_no in range(epoch):
            batches = self.split_batch(x, y, batch_size)
            costs = []
            for x_batch, y_batch in batches:
                self.forward(x_batch)
                cost = self.compute_cost(y_batch, self.lamda)
                self.backward(y_batch, self.lamda)
                self.step()
                costs.append(cost)
            y_val_pred = self.forward(x_val)
            val_cost = self.compute_cost(y_val, self.lamda)
            lst_costs.append(np.mean(costs))
            val_costs.append(val_cost)
            if Print:
                print(f'train cost in {epoch_no} is : {np.mean(costs)}, ------ , valid cost : {val_cost}')
        return lst_costs, val_costs
        

    def error(self, labels):
        return (np.argmax(self.output, axis = 1) != np.argmax(labels, axis = 1)).sum()

    def total_loss(self, labels):
        return self.criterion(self.output, labels).sum()

    def __call__(self, x):
        return self.forward(x)

    def train(self):
        self.train_mode = True

    def eval(self):
        self.train_mode = False


### In this section, the ELU function is implemented.

In [8]:
class ELU:

    def __init__(self):
        self.state = None

    def forward(self, x):
        ELU_forward = np.where(x < 0, 0, x)
                    # np.where(x>0 ,  x , np.exp(x))    -------> this code had error, 
                                                               # I decide to use Relu with alpha = 0
                                                               # becuase it is a kind of ELU
        self.state = ELU_forward
        return ELU_forward

    def derivative(self):
        ELU_derivative = np.where(self.state < 0, 0, 1)
        return ELU_derivative
    
    def __call__(self, x):
        return self.forward(x)
    
    
class Softmax:
    
    def __init__(self):
        self.state = None
    
    
    def forward(self, z):
        return np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)
    
    def __call__(self, z):
        return self.forward(z)
        

In [9]:
class Linear():
    
    def weight_init(self, in_feature, out_feature):
        return np.random.normal(0, np.sqrt(2/self.p), size=(in_feature, out_feature))
    
    
    def __init__(self, in_feature, out_feature, prev_layer):

        self.p = prev_layer
        self.W = self.weight_init(in_feature, out_feature)
        self.b = self.weight_init(1, out_feature)

        self.dW = np.zeros(None)
        self.db = np.zeros(None)

        self.update_W = np.zeros(None)
        self.update_b = np.zeros(None)

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        self.x = x
        linear_forward = np.dot(x, self.W) + self.b

        return linear_forward

    def backward(self, delta, lamda):

        batch_size = self.x.shape[0]

        self.dW = (np.dot(self.x.T, delta) / batch_size) + lamda * self.W
        self.db = np.sum(delta, axis=0, keepdims=True) / batch_size

        dx = np.dot(delta, self.W.T)
        return dx
    
    def __str__(self):
        return f'{self.W.shape}'

### In this section, the Softmax Cross Entropy is implemented.

In [10]:
class SoftmaxCrossEntropy:

    def __init__(self):
        self.logits = None
        self.labels = None
        self.loss = None

    def forward(self, x, y):
        self.logits = x
        self.labels = y

        c = 0.01 
        sm_cross_entropy = np.empty(x.shape[0])
        for idx, item in enumerate(x):
            logsumexp = np.log(np.sum(np.exp(item - c))) + c
            sm_cross_entropy[idx] = -np.sum(y[idx] * (np.log(np.exp(item)) - logsumexp))
    
        return sm_cross_entropy

    def derivative(self):
        return self.activation - self.labels

### Running the model with a learning rate of 0.01 and lambda (regularization parameter) of 0.01.

In [50]:
np.random.seed(42)
input_size = 784
output_size = 10
hiddens = [32, 32, 32]
criterion = SoftmaxCrossEntropy()
activations = [ELU(), ELU(), ELU(),Softmax()]
lr = 0.01
model = MLP(input_size, output_size, hiddens, activations, lr, criterion, lamda=0.01)
model.fit(x_train, y_train, epoch=20, batch_size=32, x_val=x_val, y_val=y_val)


Passing None into shape arguments as an alias for () is deprecated.


Passing None into shape arguments as an alias for () is deprecated.


Passing None into shape arguments as an alias for () is deprecated.


Passing None into shape arguments as an alias for () is deprecated.



train cost in 0 is : 0.8773208730428843, ------ , valid cost : 0.6256397081989992
train cost in 1 is : 0.6291680758881095, ------ , valid cost : 0.5570634829474735
train cost in 2 is : 0.5803318454843007, ------ , valid cost : 0.522127698719979
train cost in 3 is : 0.5527957935748332, ------ , valid cost : 0.5210841168984782
train cost in 4 is : 0.5416702803707263, ------ , valid cost : 0.5017608789185678
train cost in 5 is : 0.5312211880419548, ------ , valid cost : 0.4988089790832776
train cost in 6 is : 0.5216221146789742, ------ , valid cost : 0.5182434166884564
train cost in 7 is : 0.5153973364198711, ------ , valid cost : 0.48644846677683046
train cost in 8 is : 0.5084356188154485, ------ , valid cost : 0.49048810381561797
train cost in 9 is : 0.5040755653098936, ------ , valid cost : 0.4957860784083518
train cost in 10 is : 0.49926204084545994, ------ , valid cost : 0.5037790773684351
train cost in 11 is : 0.4952590217949725, ------ , valid cost : 0.4853467529829441
train cost i

### Running the model on different parameters.

In [12]:
lr_values = [0.01, 0.05]
lamda_values = [0.01,0.03,0.05]
train_collect, valid_collect = {}, {}
for curr_lr in lr_values:
    for curr_lamda in lamda_values:
        input_size = 784
        output_size = 10
        hiddens = [32, 32, 32]
        criterion = SoftmaxCrossEntropy()
        activations = [ELU(), ELU(), ELU(),Softmax()]
        model = MLP(input_size, output_size, hiddens, activations, curr_lr, criterion, curr_lamda)
        costs_train , costs_val = model.fit(x_train, y_train, epoch=20, batch_size=32, x_val=x_val, y_val=y_val,Print=False)
        
        train_collect[f'lr:{curr_lr},lamda:{curr_lamda}'] = costs_train
        valid_collect[f'lr:{curr_lr},lamda:{curr_lamda}'] = costs_val

  self.dW = np.zeros(None)
  self.db = np.zeros(None)
  self.update_W = np.zeros(None)
  self.update_b = np.zeros(None)
  cross = -1 * np.mean(np.sum(y * np.log(self.output), axis=1))
  cross = -1 * np.mean(np.sum(y * np.log(self.output), axis=1))
  return np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)
  return np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)


### Based on the cost values in the previous models, we will plot a boxplot.

In [30]:
px.box(pd.DataFrame(train_collect))

In [47]:
px.box(pd.DataFrame(valid_collect))

### Based on the above plots, our selected model has the parameters lr = 0.01 and lambda = 0.03.

In [78]:
np.random.seed(42)
input_size = 784
output_size = 10
hiddens = [32, 32, 32]
criterion = SoftmaxCrossEntropy()
activations = [ELU(), ELU(), ELU(),Softmax()]
lr = 0.01
best_model = MLP(input_size, output_size, hiddens, activations, lr, criterion, lamda=0.03)
train_collcect , val_collect = model.fit(x_train, y_train, epoch=20, batch_size=32,
                                                x_val=x_val, y_val=y_val)



Passing None into shape arguments as an alias for () is deprecated.


Passing None into shape arguments as an alias for () is deprecated.


Passing None into shape arguments as an alias for () is deprecated.


Passing None into shape arguments as an alias for () is deprecated.



train cost in 0 is : 0.4890360639546668, ------ , valid cost : 0.4858522621754203
train cost in 1 is : 0.4890948036554738, ------ , valid cost : 0.4853755380485835
train cost in 2 is : 0.48875215812685596, ------ , valid cost : 0.49345909087880374
train cost in 3 is : 0.488893238271432, ------ , valid cost : 0.48985570020679853
train cost in 4 is : 0.48914838293924723, ------ , valid cost : 0.49161987517693095
train cost in 5 is : 0.48835066042051145, ------ , valid cost : 0.4899143807138332
train cost in 6 is : 0.4864372140227855, ------ , valid cost : 0.5063415041314985
train cost in 7 is : 0.4857006505493511, ------ , valid cost : 0.4800797213728529
train cost in 8 is : 0.4842706887953637, ------ , valid cost : 0.4865242183435373
train cost in 9 is : 0.48463843299800885, ------ , valid cost : 0.48906816338034215
train cost in 10 is : 0.4851378019907788, ------ , valid cost : 0.5070460197773566
train cost in 11 is : 0.48378436242062034, ------ , valid cost : 0.48725710746547124
train

### Finally, we calculate the model's predictions for the targets.

In [73]:
target_predict = best_model.forward(x_test)
target_predict = np.argmax(target_predict, axis=1)
target_predict

array([9, 2, 1, ..., 8, 1, 5], dtype=int64)



overflow encountered in exp


invalid value encountered in true_divide



## Calculate the accurace of Model

In [84]:
accuracy = sum(target_predict==y_test)/len(y_test)
print("accuracy on test data is: ", accuracy ,+ "%")

accuracy on test data is: 78%
