# **Alireza Zare Z. 9931022 NN-CI-Spring2023**

In [None]:
import numpy as np
from abc import ABC, abstractmethod
import pickle
import tqdm
import matplotlib.pyplot as plt
from random import shuffle
import csv
from google.colab import drive
from PIL import Image
import os
drive.mount('/content/drive')
import pandas as pd
# Google Drive folders holding the MNIST images, one folder per key.
# NOTE(review): keys 2 and 5 are presumably the digit classes used - confirm.
mnist_path = {2: '/content/drive/MyDrive/datasets/MNIST/2/', 5: '/content/drive/MyDrive/datasets/MNIST/5/'}
# Google Drive CSV files of the California housing dataset, keyed by split name.
cali_path = {'train': '/content/drive/MyDrive/datasets/california_houses_price/california_housing_train.csv','test': '/content/drive/MyDrive/datasets/california_houses_price/california_housing_test.csv'}


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).



In this code, a class named `FC` is defined, representing a fully connected layer in a neural network.

2. The `initialize_weights` method initializes the weights. If the initialization method is set to "random", the weights are initialized using a normal distribution with `np.random.randn` function.

3. The `initialize_bias` method initializes the bias with zero values. The bias is a numpy array of shape `(output_size, 1)`.

4. The `forward` method is used for the forward pass in the fully connected layer. It takes the input `A_prev` and computes the output of the fully connected layer, returning the result.

5. The `backward` method is used for the backward pass in the fully connected layer. It takes the derivatives of the cost with respect to the current layer's output (`dZ`) and the activations from the previous layer (`A_prev`), and returns the derivative of the cost with respect to the activation of the previous layer (`dA_prev`) and a list of gradients for the weights and bias.

6. The `update_parameters` method updates the layer's parameters using an optimizer. It takes an optimizer object and a list of gradients for the weights and bias, and updates the parameters using the optimizer.

In [None]:
class FC:
    """Fully connected (dense) layer computing ``Z = W @ A_prev + b``.

    Activations are stored column-wise: a 2-D input has shape
    ``(input_size, batch_size)`` and the output has shape
    ``(output_size, batch_size)``. Inputs with more than two dimensions
    (e.g. the output of a convolutional layer, batch first) are flattened
    to ``(features, batch_size)`` before the matrix product.
    """

    def __init__(self, input_size: int, output_size: int, name: str, initialize_method: str = "random"):
        """
        args:
            input_size: number of input features
            output_size: number of output units
            name: key under which the optimizer looks this layer up
            initialize_method: "random", "xavier" or "he"
        """
        self.input_size = input_size
        self.output_size = output_size
        self.name = name
        self.initialize_method = initialize_method
        self.parameters = [self.initialize_weights(), self.initialize_bias()]
        self.input_shape = None
        self.reshaped_shape = None

    def initialize_weights(self):
        """
        Create the weight matrix.
            returns:
                weights of shape (output_size, input_size)
            raises:
                ValueError: if initialize_method is not recognised
        """
        shape = (self.output_size, self.input_size)
        if self.initialize_method == "random":
            return np.random.randn(*shape) * 0.01
        if self.initialize_method == "xavier":
            # Xavier/Glorot: variance 1 / fan_in.
            return np.random.randn(*shape) * np.sqrt(1.0 / self.input_size)
        if self.initialize_method == "he":
            # He: variance 2 / fan_in.
            return np.random.randn(*shape) * np.sqrt(2.0 / self.input_size)
        raise ValueError("Invalid initialization method")

    def initialize_bias(self):
        """
        Create the bias vector.
            returns:
                zeros of shape (output_size, 1)
        """
        return np.zeros((self.output_size, 1))

    @staticmethod
    def _as_columns(A_prev):
        """Flatten a batch-first N-D input to (features, batch_size); 2-D inputs pass through."""
        if A_prev.ndim > 2:
            return A_prev.reshape(A_prev.shape[0], -1).T
        return A_prev

    def forward(self, A_prev):
        """
        Forward pass for fully connected layer.
            args:
                A_prev: activations from previous layer (or input data),
                    either (input_size, batch_size) or a batch-first
                    array with more than two dimensions
            returns:
                Z: output of the fully connected layer, (output_size, batch_size)
        """
        self.input_shape = A_prev.shape
        A_cols = self._as_columns(A_prev)
        self.reshaped_shape = A_cols.shape

        W, b = self.parameters
        return W @ A_cols + b

    def backward(self, dZ, A_prev):
        """
        Backward pass for fully connected layer.
            args:
                dZ: derivative of the cost with respect to the output of the current layer
                A_prev: activations from previous layer (or input data)
            returns:
                dA_prev: derivative of the cost with respect to the activation of the previous layer,
                    with the same shape as A_prev
                grads: list of gradients for the weights and bias
        """
        A_cols = self._as_columns(A_prev)
        batch_size = A_cols.shape[1]

        W, _ = self.parameters
        dW = dZ @ A_cols.T / batch_size
        db = np.sum(dZ, axis=1, keepdims=True) / batch_size
        dA_prev = W.T @ dZ

        # Undo the flattening so the gradient matches the caller's layout.
        if A_prev.ndim > 2:
            dA_prev = dA_prev.T.reshape(A_prev.shape)
        return dA_prev, [dW, db]

    def update_parameters(self, optimizer, grads):
        """
        Update the parameters of the layer.
            args:
                optimizer: optimizer object
                grads: list of gradients for the weights and bias
        """
        self.parameters = optimizer.update(grads, self.name)


This code defines a class `Conv2D` representing a 2D convolutional layer in a neural network.

2. The `initialize_weights` method initializes the kernel weights. If the initialization method is set to "random", the weights are initialized using a normal distribution with `np.random.randn` function.

3. The `initialize_bias` method initializes the bias with zero values. The bias is a numpy array of shape `(1, 1, 1, out_channels)`.

4. The `target_shape` method calculates the shape of the output of the convolutional layer based on the input shape.

5. The `pad` method pads the input with zeros using the specified padding value and padding dimensions.

6. The `single_step_convolve` method performs a single step of convolution by taking a slice of the input, the kernel weights, and bias, and returns the convolved value.

7. The `forward` method implements the forward pass of the convolutional layer. It takes the input `A_prev` and computes the output of the convolutional layer using the kernel weights and bias.

8. The `backward` method implements the backward pass of the convolutional layer. It takes the gradient of the cost with respect to the output of the convolutional layer (`dZ`) and the activations from the previous layer (`A_prev`), and returns the gradient of the cost with respect to the input of the convolutional layer (`dA_prev`) and a list of gradients for the weights and bias.

9. The `update_parameters` method updates the layer's parameters using an optimizer. It takes an optimizer object and a list of gradients for the weights and bias, and updates the parameters using the optimizer.

In [None]:
class Conv2D:
    """2D convolutional layer operating on batch-first, channels-last arrays.

    Inputs have shape ``(batch_size, H_prev, W_prev, in_channels)`` and the
    kernel has shape ``(kernel_h, kernel_w, in_channels, out_channels)``.
    """

    def __init__(self, in_channels, out_channels, name, kernel_size=(1, 1), stride=(1, 1), padding=(1, 1), initialize_method="random"):
        """
        args:
            in_channels: number of channels of the input
            out_channels: number of filters (channels of the output)
            name: key under which the optimizer looks this layer up
            kernel_size: int or (height, width) of the kernel
            stride: int or (vertical, horizontal) stride
            padding: int or (vertical, horizontal) zero padding
            initialize_method: "random", "xavier" or "he"
        """
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.name = name
        self.initialize_method = initialize_method

        self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
        self.stride = (stride, stride) if isinstance(stride, int) else stride
        self.padding = (padding, padding) if isinstance(padding, int) else padding
        self.parameters = [self.initialize_weights(), self.initialize_bias()]

    def initialize_weights(self):
        """
        Initialize weights.
        returns:
            weights: initialized kernel with shape: (kernel_size[0], kernel_size[1], in_channels, out_channels)
        raises:
            ValueError: if initialize_method is not recognised
        """
        shape = (self.kernel_size[0], self.kernel_size[1], self.in_channels, self.out_channels)
        # Number of inputs feeding one output value.
        fan_in = self.kernel_size[0] * self.kernel_size[1] * self.in_channels

        if self.initialize_method == "random":
            return np.random.randn(*shape) * 0.01
        if self.initialize_method == "xavier":
            return np.random.randn(*shape) * np.sqrt(1.0 / fan_in)
        if self.initialize_method == "he":
            return np.random.randn(*shape) * np.sqrt(2.0 / fan_in)
        raise ValueError("Invalid initialization method")

    def initialize_bias(self):
        """
        Initialize bias.
        returns:
            bias: initialized bias with shape: (1, 1, 1, out_channels)
        """
        return np.zeros((1, 1, 1, self.out_channels))

    def target_shape(self, input_shape):
        """
        Calculate the shape of the output of the convolutional layer.
        args:
            input_shape: shape of the input to the convolutional layer,
                (batch_size, H_prev, W_prev, C_prev)
        returns:
            target_shape: (H, W) spatial shape of the output
        """
        h = input_shape[1]
        w = input_shape[2]
        H = (h + 2 * self.padding[0] - self.kernel_size[0]) // self.stride[0] + 1
        W = (w + 2 * self.padding[1] - self.kernel_size[1]) // self.stride[1] + 1
        return (H, W)

    def pad(self, A, padding, pad_value=0):
        """
        Pad the two spatial axes of a batch of images.
        args:
            A: input to be padded, (batch_size, H, W, C)
            padding: tuple of padding for height and width
            pad_value: value to pad with
        returns:
            A_padded: padded input
        """
        pad_h, pad_w = padding
        widths = ((0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0))
        return np.pad(A, widths, mode="constant", constant_values=(pad_value, pad_value))

    def single_step_convolve(self, a_slic_prev, W, b):
        """
        Convolve a slice of the input with the kernel.
        args:
            a_slic_prev: slice of the input data, same shape as W
            W: kernel of a single filter
            b: bias of that filter (a scalar or a single-element array)
        returns:
            Z: convolved value as a Python float
        """
        total = np.sum(np.multiply(a_slic_prev, W))
        # np.float no longer exists in NumPy >= 1.24; squeeze the
        # single-element bias and use the builtin float instead.
        return float(total + np.squeeze(b))

    def forward(self, A_prev):
        """
        Forward pass for convolutional layer.
            args:
                A_prev: activations from previous layer (or input data)
                A_prev.shape = (batch_size, H_prev, W_prev, C_prev)
            returns:
                Z: output of the convolutional layer, (batch_size, H, W, out_channels)
        """
        kernel, bias = self.parameters
        batch_size = A_prev.shape[0]
        kernel_h, kernel_w, _, n_filters = kernel.shape
        stride_h, stride_w = self.stride
        # Keep the output size in its own names so the kernel is not shadowed.
        out_h, out_w = self.target_shape(A_prev.shape)

        Z = np.zeros((batch_size, out_h, out_w, n_filters))
        A_prev_pad = self.pad(A_prev, self.padding)
        for i in range(batch_size):
            for h in range(out_h):
                h_start = h * stride_h
                h_end = h_start + kernel_h
                for w in range(out_w):
                    w_start = w * stride_w
                    w_end = w_start + kernel_w
                    a_slice = A_prev_pad[i, h_start:h_end, w_start:w_end, :]
                    for c in range(n_filters):
                        Z[i, h, w, c] = self.single_step_convolve(
                            a_slice, kernel[:, :, :, c], bias[:, :, :, c]
                        )
        return Z

    def backward(self, dZ, A_prev):
        """
        Backward pass for convolutional layer.
        args:
            dZ: gradient of the cost with respect to the output of the convolutional layer
            A_prev: activations from previous layer (or input data)
            A_prev.shape = (batch_size, H_prev, W_prev, C_prev)
        returns:
            dA_prev: gradient of the cost with respect to the input of the convolutional layer
            gradients: list of gradients with respect to the weights and bias
        """
        kernel, bias = self.parameters
        batch_size, H_prev, W_prev, _ = A_prev.shape
        kernel_h, kernel_w, _, n_filters = kernel.shape
        stride_h, stride_w = self.stride
        padding_h, padding_w = self.padding
        out_h, out_w = dZ.shape[1:3]

        dW = np.zeros(kernel.shape)
        db = np.zeros(bias.shape)
        A_prev_pad = self.pad(A_prev, (padding_h, padding_w))
        # Gradients are accumulated on the padded grid and cropped at the end.
        dA_prev_pad = np.zeros(A_prev_pad.shape)

        for i in range(batch_size):
            for h in range(out_h):
                h_start = h * stride_h
                h_end = h_start + kernel_h
                for w in range(out_w):
                    w_start = w * stride_w
                    w_end = w_start + kernel_w
                    a_slice = A_prev_pad[i, h_start:h_end, w_start:w_end, :]
                    for c in range(n_filters):
                        grad = dZ[i, h, w, c]
                        dA_prev_pad[i, h_start:h_end, w_start:w_end, :] += kernel[..., c] * grad
                        dW[..., c] += a_slice * grad
                        db[..., c] += grad

        # Explicit end indices: a `pad:-pad` slice is empty when pad == 0.
        dA_prev = dA_prev_pad[:, padding_h:padding_h + H_prev, padding_w:padding_w + W_prev, :]
        return dA_prev, [dW, db]

    def update_parameters(self, optimizer, grads):
        """
        Update parameters of the convolutional layer.
        args:
            optimizer: optimizer to use for updating parameters
            grads: list of gradients with respect to the weights and bias
        """
        self.parameters = optimizer.update(grads, self.name)


The `MaxPool2D` class represents a max pooling layer. Here's a breakdown of the different methods:

1. `__init__(self, kernel_size=(3, 3), stride=(1, 1), mode="max")`: This method is the constructor of the `MaxPool2D` class. It initializes the layer with the provided `kernel_size`, `stride`, and `mode` parameters. The `stride` and `kernel_size` are converted to tuples if they are provided as single integers.

2. `target_shape(self, input_shape)`: This method calculates the shape of the output of the layer based on the input shape. It uses the formula `(input_shape - kernel_size) / stride + 1` to determine the height and width of the output.

3. `forward(self, A_prev)`: This method performs the forward pass for the max pooling layer. It takes the activations from the previous layer (or input data) as input (`A_prev`) and computes the output of the max pooling layer. It iterates over the input data and applies the max pooling operation based on the `kernel_size`, `stride`, and `mode` parameters.

4. `create_mask_from_window(self, x)`: This method creates a mask from an input matrix `x` to identify the maximum entry. It compares each element of `x` with the maximum value and sets the corresponding element in the mask to `True`.

5. `distribute_value(self, dz, shape)`: This method distributes the input value `dz` in a matrix of dimension `shape`. It calculates the average value based on the dimensions of the matrix and distributes it to each element.

6. `backward(self, dZ, A_prev)`: This method performs the backward pass for the max pooling layer. It takes the gradient of the cost with respect to the output of the max pooling layer (`dZ`) and the activations from the previous layer (`A_prev`) as inputs. It computes the gradient of the cost with respect to the input of the max pooling layer (`dA_prev`) using the chain rule and the mask created during the forward pass.

In [None]:
class MaxPool2D:
    """Pooling layer (max or average) for batch-first, channels-last arrays."""

    def __init__(self, kernel_size=(3, 3), stride=(1, 1), mode="max"):
        """
        Max pooling layer.
            args:
                kernel_size: int or (height, width) of the pooling window
                stride: int or (vertical, horizontal) stride
                mode: max or average
            The layer takes no name: it has no trainable parameters, so no
            optimizer ever needs to look it up.
        """
        self.stride = (stride, stride) if isinstance(stride, int) else stride
        self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
        self.mode = mode

    def target_shape(self, input_shape):
        """
        Calculate the shape of the output of the layer.
            args:
                input_shape: shape of the input, (batch_size, H_prev, W_prev, C_prev)
            returns:
                output_shape: (H, W) spatial shape of the output
        """
        H = (input_shape[1] - self.kernel_size[0]) // self.stride[0] + 1
        W = (input_shape[2] - self.kernel_size[1]) // self.stride[1] + 1
        return H, W

    def forward(self, A_prev):
        """
        Forward pass for max pooling layer.
            args:
                A_prev: activations from previous layer (or input data),
                    (batch_size, H_prev, W_prev, C_prev)
            returns:
                A: output of the pooling layer, (batch_size, H, W, C_prev)
            raises:
                ValueError: if mode is neither "max" nor "average"
        """
        if self.mode == "max":
            # Take the maximum directly; multiplying by a mask first would
            # turn every all-negative window into 0.
            reduce_window = np.max
        elif self.mode == "average":
            reduce_window = np.mean
        else:
            raise ValueError("Invalid mode")

        batch_size, _, _, n_channels = A_prev.shape
        f_h, f_w = self.kernel_size
        stride_h, stride_w = self.stride
        out_h, out_w = self.target_shape(A_prev.shape)

        A = np.zeros((batch_size, out_h, out_w, n_channels))
        for i in range(batch_size):
            for h in range(out_h):
                h_start = h * stride_h
                h_end = h_start + f_h
                for w in range(out_w):
                    w_start = w * stride_w
                    w_end = w_start + f_w
                    for c in range(n_channels):
                        A[i, h, w, c] = reduce_window(A_prev[i, h_start:h_end, w_start:w_end, c])
        return A

    def create_mask_from_window(self, x):
        """
        Create a mask from an input matrix x, to identify the max entry of x.
            args:
                x: numpy array
            returns:
                mask: boolean array of the same shape as x, True wherever x
                    equals its maximum (every tied position is marked)
        """
        return x == np.max(x)

    def distribute_value(self, dz, shape):
        """
        Distribute the input value in the matrix of dimension shape.
            args:
                dz: input scalar
                shape: the shape (n_H, n_W) of the output matrix for which we want to distribute the value of dz
            returns:
                a: array of the given shape whose entries are all dz / (n_H * n_W)
        """
        n_H, n_W = shape
        return np.ones(shape) * (dz / (n_H * n_W))

    def backward(self, dZ, A_prev):
        """
        Backward pass for max pooling layer.
            args:
                dZ: gradient of cost with respect to the output of the max pooling layer
                A_prev: activations from previous layer (or input data)
            returns:
                dA_prev: gradient of cost with respect to the input of the max pooling layer
                None: placeholder for parameter gradients (the layer has none)
            raises:
                ValueError: if mode is neither "max" nor "average"
        """
        if self.mode not in ("max", "average"):
            raise ValueError("Invalid mode")

        f_h, f_w = self.kernel_size
        stride_h, stride_w = self.stride
        batch_size, out_h, out_w, n_channels = dZ.shape

        dA_prev = np.zeros(A_prev.shape)
        for i in range(batch_size):
            for h in range(out_h):
                h_start = h * stride_h
                h_end = h_start + f_h
                for w in range(out_w):
                    # The horizontal offset must use the horizontal stride.
                    w_start = w * stride_w
                    w_end = w_start + f_w
                    for c in range(n_channels):
                        dz = dZ[i, h, w, c]
                        if self.mode == "max":
                            window = A_prev[i, h_start:h_end, w_start:w_end, c]
                            contribution = np.multiply(self.create_mask_from_window(window), dz)
                        else:
                            contribution = self.distribute_value(dz, self.kernel_size)
                        dA_prev[i, h_start:h_end, w_start:w_end, c] += contribution
        # Don't change the return
        return dA_prev, None





The `BinaryCrossEntropy` class represents the binary cross entropy loss function. It has two main methods:

1. `compute(self, y_hat: np.ndarray, y: np.ndarray) -> float`: This method computes the binary cross entropy loss. It takes the predicted labels `y_hat` and the true labels `y` as input. The method calculates the loss using the formula `-y * log(y_hat) - (1 - y) * log(1 - y_hat)` and returns the loss value as a float.

2. `backward(self, y_hat: np.ndarray, y: np.ndarray) -> np.ndarray`: This method computes the derivative of the binary cross entropy loss with respect to the predicted labels `y_hat`. It takes `y_hat` and `y` as input and returns the derivative of the loss as a numpy array. The derivative is calculated using the formula `-(y / y_hat) + ((1 - y) / (1 - y_hat))`, which is obtained by differentiating the loss directly with respect to `y_hat`.


In [None]:
class BinaryCrossEntropy:
    """Binary cross entropy loss for predictions in the range [0, 1]."""

    # Predictions are clipped to [_EPS, 1 - _EPS] so that log(0) and
    # division by zero cannot occur for saturated outputs.
    _EPS = 1e-12

    def __init__(self) -> None:
        pass

    def compute(self, y_hat: np.ndarray, y: np.ndarray) -> float:
        """
        Computes the binary cross entropy loss, averaged over the batch.
            args:
                y: true labels (n_classes, batch_size)
                y_hat: predicted labels (n_classes, batch_size)
            returns:
                binary cross entropy loss
        """
        batch_size = y.shape[1]
        p = np.clip(y_hat, self._EPS, 1.0 - self._EPS)
        per_sample = y * np.log(p) + (1 - y) * np.log(1 - p)
        cost = -np.sum(per_sample) / batch_size
        return float(np.squeeze(cost))

    def backward(self, y_hat: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        Computes the derivative of the binary cross entropy loss.
            args:
                y: true labels (n_classes, batch_size)
                y_hat: predicted labels (n_classes, batch_size)
            returns:
                derivative of the binary cross entropy loss with respect to
                y_hat, same shape as y_hat
        """
        p = np.clip(y_hat, self._EPS, 1.0 - self._EPS)
        return -np.divide(y, p) + np.divide(1 - y, 1 - p)




The `MeanSquaredError` class represents the mean squared error loss function. It has two main methods:

1. `compute(self, y_pred, y_true)`: This method computes the mean squared error (MSE) loss. It takes the predicted labels `y_pred` and the true labels `y_true` as input. The method calculates the loss using the formula `sum((y_pred - y_true)^2) / (2 * batch_size)` and returns the loss value. The factor of 1/2 is included so that the derivative is simply `(y_pred - y_true) / batch_size`.

2. `backward(self, y_pred, y_true)`: This method computes the derivative of the mean squared error loss with respect to the predicted labels `y_pred`. It takes `y_pred` and `y_true` as input and returns the derivative of the loss. The derivative is calculated using the formula `(y_pred - y_true) / batch_size`, where `batch_size` is the number of examples in the batch.

In [None]:

class MeanSquaredError:
    """Halved mean squared error loss and its gradient."""

    def __init__(self):
        pass

    def compute(self, y_pred, y_true):
        """
        Evaluate the loss: sum of squared residuals over (2 * batch_size).
            args:
                y_pred: predicted labels (n_classes, batch_size)
                y_true: true labels (n_classes, batch_size)
            returns:
                the loss value
        """
        residual = y_pred - y_true
        n_samples = y_pred.shape[1]
        total = np.sum(np.square(residual))
        return np.squeeze(total / (2 * n_samples))

    def backward(self, y_pred, y_true):
        """
        Gradient of the loss with respect to the predictions.
            args:
                y_pred: predicted labels (n_classes, batch_size)
                y_true: true labels (n_classes, batch_size)
            returns:
                residuals divided by the batch size, same shape as y_pred
        """
        n_samples = y_pred.shape[1]
        residual = y_pred - y_true
        return residual / n_samples


The `GD` class represents the Gradient Descent optimizer. It is used to update the parameters of the layers in a neural network based on the gradients computed during backpropagation. It has three main components:

1. `__init__(self, layers_list: dict, learning_rate: float)`: This method initializes the Gradient Descent optimizer. It takes a dictionary `layers_list` containing the names and layer objects of the network's layers, and a `learning_rate` which determines the step size for parameter updates.

2. `update(self, grads, name)`: This method performs the parameter update for a specific layer. It takes a list of gradients `grads` for the weights and biases of the layer, and the `name` of the layer. It returns a list of updated parameters.

   Inside the method, the corresponding layer object is obtained from `self.layers` using the provided `name`. Then, for each gradient in `grads`, the corresponding parameter is updated using the gradient descent update rule: `new_param = old_param - learning_rate * gradient`. The updated parameters are stored in a list `params`, which is then returned.

In [None]:
# TODO: Implement the gradient descent optimizer
class GD:
    """Plain gradient descent: ``param <- param - learning_rate * grad``."""

    def __init__(self, layers_list: dict, learning_rate: float):
        """
        Gradient Descent optimizer.
            args:
                layers_list: mapping from layer name to layer object
                learning_rate: step size applied to every gradient
        """
        self.layers = layers_list
        self.learning_rate = learning_rate

    def update(self, grads, name):
        """
        Compute the new parameters of one layer.
            args:
                grads: gradients, in the same order as the layer's parameters
                name: name of the layer to look up
            returns:
                params: list of updated parameters (the layer itself is not modified)
        """
        current = self.layers[name].parameters
        step = self.learning_rate
        return [current[idx] - step * grad for idx, grad in enumerate(grads)]


The `Adam` class represents the Adam optimizer, an extension of gradient descent optimization algorithm that adapts the learning rate for each parameter in the network. It has the following components:

1. `__init__(self, layers_list, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8)`: This method initializes the Adam optimizer. It takes a dictionary `layers_list` containing the names and layer objects of the network's layers, and optional parameters `learning_rate`, `beta1`, `beta2`, and `epsilon` that control the behavior of the optimizer.

   Inside the method, the initial values for the parameters of Adam optimizer are set. `self.V` and `self.S` are dictionaries that store the first and second moments of the gradients, respectively. For each layer in `layers_list`, the initial `V` and `S` values are created as lists of zeros with the same shape as the parameters of the layer. These `V` and `S` lists are stored in the dictionaries `self.V` and `self.S` using the layer index as the key.

2. `update(self, grads, name, epoch)`: This method performs the parameter update for a specific layer using the Adam optimizer. It takes a list of gradients `grads`, the `name` of the layer, and the current `epoch` number. It returns a list of updated parameters.

   Inside the method, the corresponding layer object is obtained from `self.layers` using the provided `name`. For each gradient in `grads`, the `V` and `S` values are updated using the Adam update equations. The updated parameters are computed using the Adam update rule: `new_param = old_param - learning_rate * V / (sqrt(S) + epsilon)`. The updated parameters are stored in a list `params`, which is then returned.

In [None]:

# TODO: Implement Adam optimizer
class Adam:
    """Adam optimizer keeping per-layer first and second gradient moments."""

    def __init__(self, layers_list, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        args:
            layers_list: mapping from layer name to layer object (a plain
                sequence is also accepted and is keyed by position)
            learning_rate: step size
            beta1: decay rate of the first moment
            beta2: decay rate of the second moment
            epsilon: small constant added to the denominator
        """
        self.layers = layers_list
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.V = {}
        self.S = {}
        # Per-layer step counters, used only when update() gets no epoch.
        self._steps = {}

        if isinstance(layers_list, dict):
            named_layers = layers_list.items()
        else:
            named_layers = enumerate(layers_list)
        # Moments are stored under the same key update() receives.
        for key, layer in named_layers:
            layer_params = getattr(layer, "parameters", None)
            if layer_params is None:
                # Layers without trainable parameters need no state.
                continue
            self.V[key] = [np.zeros_like(p) for p in layer_params]
            self.S[key] = [np.zeros_like(p) for p in layer_params]

    def update(self, grads, name, epoch=None):
        """
        Compute the new parameters of one layer.
            args:
                grads: gradients, in the same order as the layer's parameters
                name: key of the layer in the layers mapping
                epoch: zero-based step index used for bias correction; when
                    omitted, an internal per-layer counter is used instead
            returns:
                params: list of updated parameters
        """
        layer = self.layers[name]
        if epoch is None:
            self._steps[name] = self._steps.get(name, 0) + 1
            t = self._steps[name]
        else:
            t = epoch + 1

        params = []
        for i, grad in enumerate(grads):
            self.V[name][i] = self.beta1 * self.V[name][i] + (1 - self.beta1) * grad
            self.S[name][i] = self.beta2 * self.S[name][i] + (1 - self.beta2) * np.square(grad)
            # Bias-corrected copies: the stored moments stay uncorrected,
            # otherwise the correction would compound on every call.
            v_hat = self.V[name][i] / (1 - np.power(self.beta1, t))
            s_hat = self.S[name][i] / (1 - np.power(self.beta2, t))
            params.append(layer.parameters[i] - self.learning_rate * v_hat / (np.sqrt(s_hat) + self.epsilon))
        return params


The code defines a hierarchy of activation functions for use in neural networks. It consists of an abstract base class `Activation` and several concrete activation classes (`Sigmoid`, `ReLU`, `Tanh`, `LinearActivation`).

The `Activation` class is an abstract base class that defines the interface for activation functions. It has two abstract methods: `forward(self, Z: np.ndarray) -> np.ndarray` and `backward(self, dA: np.ndarray, Z: np.ndarray) -> np.ndarray`. These methods represent the forward pass and backward pass of the activation function, respectively. Any concrete activation class must implement these methods.

The concrete activation classes (`Sigmoid`, `ReLU`, `Tanh`, `LinearActivation`) inherit from the `Activation` class and provide the implementations for the forward and backward pass of each specific activation function.

The `get_activation` function is a helper function that returns the activation function and its derivative based on the given activation name. It takes an `activation` parameter, which is a string representing the name of the desired activation function. It returns a tuple containing the activation class and its derivative.

Let's go through each activation function briefly:

- `Sigmoid`: This class implements the sigmoid activation function. The `forward` method computes the sigmoid function `1 / (1 + exp(-Z))`, and the `backward` method computes the derivative of the cost with respect to `Z` as `dA * A * (1 - A)`, where `A` is the output of the sigmoid function.

- `ReLU`: This class implements the rectified linear unit (ReLU) activation function. The `forward` method computes the ReLU function `max(0, Z)`, and the `backward` method computes the derivative of the cost with respect to `Z` as `dA` where `Z > 0`, and 0 otherwise.

- `Tanh`: This class implements the hyperbolic tangent (tanh) activation function. The `forward` method computes the tanh function `tanh(Z)`, and the `backward` method computes the derivative of the cost with respect to `Z` as `dA * (1 - tanh(Z)^2)`.

- `LinearActivation`: This class represents the linear activation function. The `forward` method simply returns `Z` as the output, and the `backward` method computes the derivative of the cost with respect to `Z` as `dA * 1`.


In [None]:


class Activation(ABC):
    """
    Abstract interface shared by every activation function.

    Deriving from ABC is what makes the @abstractmethod decorators effective:
    a subclass that forgets to implement forward or backward can no longer be
    instantiated.
    """

    def __init__(self) -> None:
        pass

    @abstractmethod
    def forward(self, Z: np.ndarray) -> np.ndarray:
        """
        Forward pass for activation function.
            args:
                Z: input to the activation function
            returns:
                A: output of the activation function
        """

    @abstractmethod
    def backward(self, dA: np.ndarray, Z: np.ndarray) -> np.ndarray:
        """
        Backward pass for activation function.
            args:
                dA: derivative of the cost with respect to the activation
                Z: input to the activation function
            returns:
                derivative of the cost with respect to Z
        """


class Sigmoid(Activation):
    def forward(self, Z: np.ndarray) -> np.ndarray:
        """
        Sigmoid activation function.
            args:
                Z: input to the activation function
            returns:
                sigmoid(Z) = 1 / (1 + exp(-Z))
        """
        # The parentheses matter: `1. / 1. + np.exp(-Z)` evaluates to
        # 1 + exp(-Z), which is not a sigmoid.
        return 1.0 / (1.0 + np.exp(-Z))

    def backward(self, dA: np.ndarray, Z: np.ndarray) -> np.ndarray:
        """
        Backward pass for sigmoid activation function.
            args:
                dA: derivative of the cost with respect to the activation
                Z: input to the activation function
            returns:
                derivative of the cost with respect to Z
        """
        A = self.forward(Z)
        # d(sigmoid)/dZ = A * (1 - A)
        return dA * A * (1 - A)


class ReLU(Activation):
    def forward(self, Z: np.ndarray) -> np.ndarray:
        """
        ReLU activation function.
            args:
                Z: input to the activation function
            returns:
                max(0, Z), element-wise
        """
        return np.maximum(0, Z)

    def backward(self, dA: np.ndarray, Z: np.ndarray) -> np.ndarray:
        """
        Backward pass for ReLU activation function.
            args:
                dA: derivative of the cost with respect to the activation
                Z: input to the activation function
            returns:
                derivative of the cost with respect to Z
        """
        # Work on a copy so the caller's dA is not modified.
        dZ = np.array(dA, copy=True)
        # The gradient only flows where the unit was active (Z > 0).
        dZ[Z <= 0] = 0
        return dZ


class Tanh(Activation):
    def forward(self, Z: np.ndarray) -> np.ndarray:
        """
        Tanh activation function.
            args:
                Z: input to the activation function
            returns:
                tanh(Z)
        """
        return np.tanh(Z)

    def backward(self, dA: np.ndarray, Z: np.ndarray) -> np.ndarray:
        """
        Backward pass for tanh activation function.
            args:
                dA: derivative of the cost with respect to the activation
                Z: input to the activation function
            returns:
                derivative of the cost with respect to Z
        """
        A = self.forward(Z)
        # d(tanh)/dZ = 1 - tanh(Z)^2; reuse A instead of recomputing tanh.
        return dA * (1 - np.square(A))


class LinearActivation(Activation):
    def forward(self, Z: np.ndarray) -> np.ndarray:
        """
        Linear (identity) activation function.
            args:
                Z: input to the activation function
            returns:
                Z unchanged
        """
        return Z

    @staticmethod
    def linear(Z: np.ndarray) -> np.ndarray:
        """
        Identity function, kept under its previous name so that existing
        `LinearActivation.linear(Z)` calls keep working.
            args:
                Z: input to the activation function
            returns:
                Z unchanged
        """
        return Z

    def backward(self, dA: np.ndarray, Z: np.ndarray) -> np.ndarray:
        """
        Backward pass for linear activation function.
            args:
                dA: derivative of the cost with respect to the activation
                Z: input to the activation function
            returns:
                derivative of the cost with respect to Z
        """
        # The derivative of the identity is 1 everywhere.
        return dA * np.ones_like(Z)

def get_activation(activation: str) -> type:
    """
    Look up an activation class by name.
        args:
            activation: activation function name, one of
                'sigmoid', 'relu', 'tanh' or 'linear'
        returns:
            the activation class itself (not an instance and not a tuple);
            call it to build the activation, e.g. get_activation('relu')()
        raises:
            ValueError: if the name is not one of the supported activations
    """
    # Built on each call so the lookup sees the classes as currently defined.
    registry = {
        'sigmoid': Sigmoid,
        'relu': ReLU,
        'tanh': Tanh,
        'linear': LinearActivation,
    }
    try:
        return registry[activation]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which used to fall through
        # to the same ValueError in the if/elif chain.
        raise ValueError('Activation function not supported') from None


- `__init__(self, arch, criterion, optimizer, name=None)`: Initializes the model with the given architecture, loss criterion, optimizer, and an optional name.
- `is_layer(self, layer)`: Checks if the given layer is a valid layer.
- `is_activation(self, layer)`: Checks if the given layer is an activation function.
- `forward(self, x)`: Performs a forward pass through the model and returns the output.
- `backward(self, dAL, tmp, x)`: Performs a backward pass through the model and computes the gradients.
- `update(self, grads)`: Updates the model using the computed gradients.
- `one_epoch(self, x, y)`: Performs one epoch of training on the given input and labels.
- `save(self, name)`: Saves the model to a file.
- `load_model(self, name)`: Loads a saved model from a file.
- `shuffle(self, m, shuffling)`: Shuffles the order of the data.
- `batch(self, X, y, batch_size, index, order)`: Retrieves a batch of data from the input and labels.
- `compute_loss(self, X, y, batch_size)`: Computes the loss for the given input and labels.
- `train(self, X, y, epochs, val=None, batch_size=3, shuffling=False, verbose=1, save_after=None)`: Trains the model for the specified number of epochs using the given input and labels.
- `predict(self, X)`: Performs prediction on the given input and returns the output.


In [None]:
class Model:
    """
    Sequential network built from alternating (layer, activation) entries.

    The architecture dictionary is walked in insertion order and is expected
    to hold a layer at every even position and its activation at the
    following odd position.
    """

    def __init__(self, arch, criterion, optimizer, name=None):
        """
        Initialize the model.
        args:
            arch: dictionary containing the architecture of the model
            criterion: loss
            optimizer: optimizer
            name: path of a previously saved model; when given, the other
                arguments are ignored and the model is restored from the file
        """
        if name is None:
            self.model = arch
            self.criterion = criterion
            self.optimizer = optimizer
            self.layers_names = list(arch.keys())
        else:
            self.model, self.criterion, self.optimizer, self.layers_names = self.load_model(name)

    def is_layer(self, layer):
        """
        Check if the layer is a layer.
        args:
            layer: layer to be checked
        returns:
            True if the layer is an FC, Conv2D or MaxPool2D instance, False otherwise
        """
        return isinstance(layer, (FC, Conv2D, MaxPool2D))

    def is_activation(self, layer):
        """
        Check if the layer is an activation function.
        args:
            layer: layer to be checked
        returns:
            True if the layer is an activation function, False otherwise
        """
        return isinstance(layer, Activation)

    def forward(self, x):
        """
        Forward pass through the model.
        args:
            x: input to the model
        returns:
            list [Z1, A1, Z2, A2, ...] with the pre-activation and activation
            of every (layer, activation) pair; the last element is the output
        """
        tmp = []
        A = x
        # Step through the names two at a time: layer at l, activation at l + 1.
        for l in range(0, len(self.layers_names), 2):
            Z = self.model[self.layers_names[l]].forward(A)
            tmp.append(np.copy(Z))
            A = self.model[self.layers_names[l + 1]].forward(Z)
            tmp.append(np.copy(A))
        return tmp

    def backward(self, dAL, tmp, x):
        """
        Backward pass through the model.
        args:
            dAL: derivative of the cost with respect to the output of the model
            tmp: list containing the intermediate values of Z and A
            x: input to the model
        returns:
            dictionary mapping each layer name to the gradients its backward
            pass returned
        """
        dA = dAL
        grads = {}
        # Walk the pairs from last to first. For a given l the activation is
        # layers_names[l - 1] (its input is tmp[l - 2]) and the layer is
        # layers_names[l - 2] (its input is the previous activation, or x for
        # the first layer).
        for l in range(len(tmp), 0, -2):
            Z = tmp[l - 2]
            A_prev = tmp[l - 3] if l > 2 else x
            dZ = self.model[self.layers_names[l - 1]].backward(dA, Z)
            dA, grad = self.model[self.layers_names[l - 2]].backward(dZ, A_prev)
            grads[self.layers_names[l - 2]] = grad
        return grads

    def update(self, grads):
        """
        Update the model.
        args:
            grads: gradients of the model, keyed by layer name
        """
        for n in self.layers_names:
            layer = self.model[n]
            # Only trainable layers are updated: activations and max-pooling
            # layers are skipped.
            if self.is_layer(layer) and not isinstance(layer, MaxPool2D):
                # NOTE(review): the notebook text describes this layer method
                # as `update_parameters`; confirm the name against FC/Conv2D.
                layer.update(self.optimizer, grads[n])

    def one_epoch(self, x, y):
        """
        One optimisation step: forward, loss, backward and parameter update.
        args:
            x: input to the model
            y: labels
        returns:
            loss
        """
        tmp = self.forward(x)
        AL = tmp[-1]
        loss = self.criterion.compute(AL, y)
        dAL = self.criterion.backward(AL, y)
        grads = self.backward(dAL, tmp, x)
        self.update(grads)
        return loss

    def save(self, name):
        """
        Save the model.
        args:
            name: path of the file to write
        """
        with open(name, 'wb') as f:
            pickle.dump((self.model, self.criterion, self.optimizer, self.layers_names), f)

    def load_model(self, name):
        """
        Load the model.
        args:
            name: path of the file to read
        returns:
            model, criterion, optimizer, layers_names
        """
        # SECURITY: unpickling executes arbitrary code from the file, so only
        # load model files that come from a trusted source.
        with open(name, 'rb') as f:
            return pickle.load(f)

    def shuffle(self, m, shuffling):
        """
        Build the order in which samples are visited.
        args:
            m: number of samples
            shuffling: if True the order is randomly permuted
        returns:
            list with the indices 0..m-1
        """
        order = list(range(m))
        if shuffling:
            # np.random.shuffle works in place and returns None, so the list
            # itself has to be returned.
            np.random.shuffle(order)
        return order

    def batch(self, X, y, batch_size, index, order):
        """
        Get a batch of data.
        args:
            X: input to the model
            y: labels
            batch_size: batch size
            index: index of the batch
                e.g: if batch_size = 3 and index = 1 then the batch will be from index [3, 4, 5]
            order: order of the data
        returns:
            bx, by: batch of data
        """
        # Clamp the end so that the last batch may be shorter than batch_size.
        last_index = min((index + 1) * batch_size, len(order))
        batch = order[index * batch_size: last_index]
        # 2-dimensional inputs keep the samples on axis 1, other inputs
        # (e.g. 4-dimensional images) keep them on axis 0.
        if len(X.shape) == 2:
            return X[:, batch], y[:, batch]
        return X[batch], y[batch]

    @staticmethod
    def _num_batches(m, batch_size):
        """Number of batches needed to cover m samples, counting a shorter final batch."""
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        return -(-m // batch_size)  # ceiling division

    def compute_loss(self, X, y, batch_size):
        """
        Compute the loss.
        args:
            X: input to the model
            y: labels
            batch_size: batch size
        returns:
            loss averaged over the batches
        """
        m = X.shape[0] if len(X.shape) == 4 else X.shape[1]
        n_batches = self._num_batches(m, batch_size)
        order = self.shuffle(m, False)
        cost = 0
        for b in range(n_batches):
            bx, by = self.batch(X, y, batch_size, b, order)
            AL = self.forward(bx)[-1]
            # Compare against the labels of this batch, not the full label set.
            cost += self.criterion.compute(AL, by) / n_batches
        return cost

    def train(self, X, y, epochs, val=None, batch_size=3, shuffling=False, verbose=1, save_after=None):
        """
        Train the model.
        args:
            X: input to the model
            y: labels
            epochs: number of epochs
            val: validation data as a pair (X_val, y_val), or None
            batch_size: batch size
            shuffling: if True shuffle the data
            verbose: print the train cost every `verbose` epochs; a falsy
                value disables printing
            save_after: path to save the model to after training
        returns:
            train_cost, val_cost: per-epoch costs
        """
        train_cost = []
        val_cost = []
        # 4-dimensional inputs keep the samples on axis 0, 2-dimensional ones on axis 1.
        m = X.shape[0] if len(X.shape) == 4 else X.shape[1]
        n_batches = self._num_batches(m, batch_size)
        # `tqdm` is imported as a module at the top of the file, so the
        # progress bar class is reached as tqdm.tqdm.
        for e in tqdm.tqdm(range(1, epochs + 1)):
            order = self.shuffle(m, shuffling)
            cost = 0
            for b in range(n_batches):
                bx, by = self.batch(X, y, batch_size, b, order)
                # Train on the current batch only.
                cost += self.one_epoch(bx, by) / n_batches
            train_cost.append(cost)
            if val is not None:
                val_cost.append(self.compute_loss(val[0], val[1], batch_size))
            if verbose:
                if e % verbose == 0:
                    print("Epoch {}: train cost = {}".format(e, cost))
                if val is not None:
                    print("Epoch {}: val cost = {}".format(e, val_cost[-1]))
        if save_after is not None:
            self.save(save_after)
        return train_cost, val_cost

    def predict(self, X):
        """
        Predict the output of the model.
        args:
            X: input to the model
        returns:
            predictions (the activation of the last layer)
        """
        return self.forward(X)[-1]



1. The `construct_dataset_mnist` function:
   - It starts by retrieving the list of files in the directory specified by `mnist_path[2]` (presumably containing MNIST digit 2 images) using `os.listdir`.
   - Then, it creates a list `MNIST2FILES` by appending the directory path to each file name.
   - Similarly, it retrieves the list of files in the directory specified by `mnist_path[5]` (presumably containing MNIST digit 5 images).
   - It creates a list `MNIST5FILES` by appending the directory path to each file name.
   - The function initializes two empty lists, `X` and `y`.
   - It then loops through each file path in `MNIST2FILES` and performs the following:
     - It opens the image using `Image.open` and converts it to a NumPy array using `np.array`.
     - The pixel values are divided by 255.0 to scale them between 0 and 1.
     - `np.expand_dims` is used to add an extra dimension to the array, making it compatible with convolutional layers (axis=-1 represents the last dimension).
     - The array is appended to `X`, and the label 0 is appended to `y`.
   - The same process is repeated for each file path in `MNIST5FILES`, but the label is set to 1.
   - Finally, the function returns the lists `X` and `y`.

2. Reading the CSV file using Pandas:
   - The `pd.read_csv` function is used to read the CSV file specified by `cali_path['train']` into a DataFrame `df`.

3. Preparing the input features `X` and labels `y`:
   - `X` is created by selecting specific columns from the DataFrame `df` using `df[['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']]`.
   - `y` is created by selecting the 'median_house_value' column from the DataFrame `df`.
   - Both `X` and `y` are converted to NumPy arrays using `np.array`.

4. Z-score scaling the input features `X`:
   - The `zscore_scaling` function is defined, which takes an input array `data`.
   - It calculates the mean and standard deviation of the data using `np.mean` and `np.std`.
   - The scaled data is obtained by subtracting the mean and dividing by the standard deviation: `(data - mean) / std`.
   - The scaled data is returned.
   - The `zscore_scaling` function is applied to `X` using `Xt = zscore_scaling(X)`.

5. The labels `y` are already in NumPy array format and not scaled.

The resulting scaled input features are stored in `Xt`, and the labels are stored in `yt`.

In [None]:
def construct_dataset_mnist():
  """
  Load the MNIST digit-2 and digit-5 images found under `mnist_path`.

  Every image is converted to a NumPy array, divided by 255 and given a
  trailing axis of length 1. Digit 2 is labelled 0 and digit 5 is labelled 1.

  returns:
    X: list of image arrays (all digit-2 images first, then digit-5 images)
    y: list of integer labels, aligned with X
  """
  X, y = [], []
  # The position in the tuple is the class label: 2 -> 0, 5 -> 1.
  for label, digit in enumerate((2, 5)):
    folder = mnist_path[digit]
    for file_name in os.listdir(folder):
      # Image.open keeps the file open until the image is closed; the
      # context manager releases the handle once the pixels are copied.
      with Image.open(os.path.join(folder, file_name)) as image:
        pixels = np.array(image) / 255.
      X.append(np.expand_dims(pixels, axis=-1))
      y.append(label)

  return X, y

X_MNIST, y_MNIST = construct_dataset_mnist()
# Model.train and Model.batch read `X.shape` and index with lists of positions,
# which plain Python lists do not support, so the data is converted to arrays.
# NOTE(review): assumes every image has the same height and width - confirm.
X_MNIST = np.asarray(X_MNIST)
y_MNIST = np.asarray(y_MNIST)

df = pd.read_csv(cali_path['train'])

# Input features and regression target of the California housing data.
X = np.array(df[['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']])
y = np.array(df['median_house_value'])

def zscore_scaling(data, axis=0):
    """
    Standardise data to zero mean and unit standard deviation.

    args:
        data: array-like of numbers; for a (samples, features) matrix the
            default axis=0 scales every feature column on its own
        axis: axis along which mean and standard deviation are computed;
            pass None to use a single mean/std for the whole array
    returns:
        float array with the same shape as data
    """
    data = np.asarray(data, dtype=float)
    mean = data.mean(axis=axis, keepdims=True)
    std = data.std(axis=axis, keepdims=True)
    # A constant feature has std 0; dividing by 1 instead maps it to zeros
    # rather than producing NaN/inf.
    std = np.where(std == 0, 1.0, std)
    return (data - mean) / std

# Model.batch slices 2-dimensional data as X[:, batch] / y[:, batch] and
# Model.train takes the sample count from X.shape[1], so samples must lie on
# axis 1: Xt becomes (features, samples) and yt becomes (1, samples).
Xt = zscore_scaling(X).T
yt = y.reshape(1, -1)



This code defines an architecture for the model, `arch_model_MNIST`, using a dictionary. The architecture consists of several layers and activation functions. Here's a breakdown of the architecture:

- `"C1"`: A `Conv2D` layer with 1 input channel, 2 output channels, a kernel size of (10, 10), stride of (1, 1), and padding of (1, 1).
- `"relu1"`: A ReLU activation function.
- `"C2"`: Another `Conv2D` layer with 2 input channels, 4 output channels, a kernel size of (5, 5), stride of (1, 1), and no padding.
- `"relu2"`: Another ReLU activation function.
- `"DENSE1"`: A fully connected (`FC`) layer with 256 input units and 16 output units.
- `"s1"`: A sigmoid activation function.
- `"DENSE2"`: Another `FC` layer with 16 input units and 1 output unit.
- `"s2"`: Another sigmoid activation function.

In this part, the code creates an instance of the `BinaryCrossEntropy` loss criterion (`criterion_1`) and an instance of the `Adam` optimizer (`optimizer_1`). The `Adam` optimizer is initialized with the `arch_model_MNIST` architecture and a learning rate of 0.01. Finally, the `Model` class is instantiated with `arch_model_MNIST`, `criterion_1`, and `optimizer_1`, resulting in the `model_MNIST` object.

The architecture (`arch_model_california_pricing`) consists of the following layers:

1. `"DENSE1"`: Fully Connected (FC) layer with 8 input units and 16 output units.
2. `"relu1"`: ReLU activation function.
3. `"DENSE2"`: FC layer with 16 input units and 16 output units.
4. `"relu2"`: ReLU activation function.
5. `"DENSE3"`: FC layer with 16 input units and 1 output unit.
6. `'s1'`: Sigmoid activation function.

The loss criterion (`criterion_2`) is Mean Squared Error (MSE), which is commonly used for regression tasks.

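The optimizer (`optimizer_2`) is Adam, an optimization algorithm that performs adaptive learning rate updates; it is initialized with the `arch_model_california_pricing` architecture and a learning rate of 0.01.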

Passing `arch_model_california_pricing`, `criterion_2`, and `optimizer_2` to the `Model` class creates `CHP_Model`, which can be used for training, evaluation, and prediction on the California pricing task.

In [None]:
# Architecture of the MNIST classifier. Model reads the keys in insertion
# order and treats them as alternating layer / activation entries.
# NOTE(review): DENSE1 takes 256 inputs, which has to equal the flattened size
# of the C2 output - confirm against the actual image size and FC.forward.
arch_model_MNIST = {
    "C1": Conv2D(1, 2, name="CONV1", kernel_size=(10, 10), stride=(1, 1), padding=(1, 1)),
    "relu1": get_activation("relu")(),
    "C2": Conv2D(2, 4, name="CONV2", kernel_size=(5, 5), stride=(1, 1), padding=(0, 0)),
    "relu2": get_activation("relu")(),
    "DENSE1": FC(256, 16, "FC1"),
    "s1": get_activation("sigmoid")(),
    "DENSE2": FC(16, 1, "FC2"),
    "s2": get_activation("sigmoid")(),
}
# Loss and optimizer for the MNIST model; the optimizer receives the
# architecture dictionary itself.
criterion_1 = BinaryCrossEntropy()
optimizer_1 = Adam(arch_model_MNIST, learning_rate=0.01)

model_MNIST = Model(arch_model_MNIST, criterion_1, optimizer_1)

# Architecture of the California house-price regressor: three FC layers,
# the first taking the 8 input features.
# NOTE(review): the final sigmoid limits predictions to (0, 1) while yt holds
# the unscaled median_house_value - confirm this is intended.
arch_model_california_pricing = {
    "DENSE1": FC(8, 16, 'DENSE1'),
    "relu1": get_activation("relu")(),
    "DENSE2": FC(16, 16, 'DENSE2'),
    "relu2": get_activation("relu")(),
    "DENSE3": FC(16, 1, 'DENSE3'),
    's1': get_activation("sigmoid")(),
}
# Loss and optimizer for the regression model.
criterion_2 = MeanSquaredError()
optimizer_2 = Adam(arch_model_california_pricing, learning_rate=0.01)

CHP_Model = Model(arch_model_california_pricing, criterion_2, optimizer_2)



In [None]:
# Train the MNIST classifier for 10 epochs on shuffled data and store the
# trained model on Drive afterwards.
model_MNIST.train(
    X_MNIST,
    y_MNIST,
    epochs=10,
    shuffling=True,
    save_after='/content/drive/MyDrive/MODELS/MNIST',
)

In [None]:
# Train the California house-price model for 10 epochs and store it on Drive.
# The method is `train`; the previous spelling `trian` raised AttributeError.
CHP_Model.train(Xt, yt, 10, save_after='/content/drive/MyDrive/MODELS/CALIFORNIA_HOUSE_PRICING')