<a href="https://colab.research.google.com/github/arunm917/CS6910_Assignment1/blob/main/CS6910_Assignment_1_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wandb

# Importing packages

In [62]:
import numpy as np
import pandas as pd
import gdown
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from tqdm.notebook import trange, tqdm
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
# import tensorflow as ts
from tensorflow import keras
from sklearn.metrics import accuracy_score, log_loss
import wandb
import random

In [None]:
# Authenticate this session with Weights & Biases (interactive: may prompt for
# an API key). NOTE(review): none of the cells visible below log to wandb yet,
# so this cell is only needed once experiment tracking is wired in.
wandb.login()

# Data preprocessing - Fashion-MNIST

In [5]:
# Download (first run) and load Fashion-MNIST as (train images, train labels),
# (test images, test labels). Labels are integer class ids; they are one-hot
# encoded in a later cell.
# NOTE(review): the training labels are bound to `Y_train` (capital Y) while the
# test labels use `y_test`; the lower-case `y_train` only exists after the
# train/validation split below.
(X_train, Y_train), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [6]:
# Hold out 10% of the training images as a validation set. The split is
# stratified on the labels so every class keeps the same proportion in both
# parts, and `random_state` is fixed so the split is reproducible.
# NOTE(review): this rebinds `X_train` to the 90% subset while `Y_train` keeps
# all labels, so re-running this cell without re-running the loading cell first
# fails with mismatched sample counts.
X_train, X_val, y_train, y_val = train_test_split(X_train, Y_train, stratify = Y_train, random_state=7, test_size = 0.1)

In [7]:
# Flatten every 28x28 image into a 784-long row vector and rescale the pixel
# intensities by 1/255. The same transformation is applied to the training,
# validation and test images.
X_train_scaled, X_val_scaled, X_test_scaled = (
    images.reshape(len(images), 28 * 28) / 255.0
    for images in (X_train, X_val, X_test)
)

Encoding labels

In [8]:
# One-hot encode the integer class labels.
# The encoder is fitted on the training labels ONLY and then re-used (via
# `transform`) for the validation and test labels. Re-fitting on each split
# would silently produce a different column layout whenever a split happens to
# miss a class; with a single fit, every split shares the same columns and an
# unseen label raises an error instead of shifting columns.
enc = OneHotEncoder()
y_train_enc = enc.fit_transform(np.expand_dims(y_train,1)).toarray()
y_val_enc = enc.transform(np.expand_dims(y_val,1)).toarray()
y_test_enc = enc.transform(np.expand_dims(y_test,1)).toarray()
print(y_train_enc.shape, y_val_enc.shape, y_test_enc.shape)

(54000, 10) (6000, 10) (10000, 10)


Batching

In [9]:
# Small fixed subset (the first 1000 training samples and their one-hot
# labels), convenient for quick experiments on a fraction of the data.
X_train_batch = X_train_scaled[:1000]
y_train_batch = y_train_enc[:1000]

In [10]:
# Sanity check: rows are samples; features are 784 flattened pixels and labels
# are 10-wide one-hot vectors, for both the full training set and the subset.
print(X_train_scaled.shape, y_train_enc.shape, X_train_batch.shape, y_train_batch.shape )

(54000, 784) (54000, 10) (1000, 784) (1000, 10)


# Initializations

In [12]:
class WeightInitialization:
  """Creates the initial weights and biases of a fully connected network.

  Layers are numbered from 1. For layer ``k`` the weight matrix has shape
  ``(sizes[k], sizes[k-1])`` (outputs x inputs) and the bias is a zero column
  vector of shape ``(sizes[k], 1)``.
  """

  # Names accepted by `initialize`.
  SCHEMES = ('RANDOM', 'XAVIER', 'HE')

  def initialize(self, sizes, initialization):
    """Build weight and bias dictionaries for the given layer sizes.

    Parameters
    ----------
    sizes : sequence of int
        Number of units per layer, input layer first and output layer last
        (at least two entries).
    initialization : str
        'RANDOM' - standard normal weights,
        'XAVIER' - normal weights with std sqrt(2 / (fan_in + fan_out)),
        'HE'     - normal weights with std sqrt(2 / fan_in).

    Returns
    -------
    (W, B) : tuple of dict
        Dictionaries keyed by layer number 1..len(sizes)-1. The same
        dictionaries are also stored on the instance as ``self.W``/``self.B``.

    Raises
    ------
    ValueError
        If the scheme name is unknown or fewer than two layer sizes are given.
        (Previously an unknown name silently returned ``None``, which only
        surfaced later as an unpacking error in the caller.)
    """
    if initialization not in self.SCHEMES:
      raise ValueError(
          "Unknown initialization %r; expected one of %s"
          % (initialization, ', '.join(self.SCHEMES)))
    if len(sizes) < 2:
      raise ValueError("sizes must contain at least an input and an output layer")

    self.W = {}
    self.B = {}
    self.sizes = sizes
    self.nh = len(self.sizes)-2          # number of hidden layers

    for i in range(self.nh + 1):
      fan_in, fan_out = self.sizes[i], self.sizes[i+1]

      # The sampling call of each scheme is kept distinct so that results for
      # a given NumPy seed stay the same as before.
      if initialization == 'RANDOM':
        self.W[i+1] = np.random.randn(fan_out, fan_in)
      elif initialization == 'XAVIER':
        std_dev_xavier = np.sqrt(2.0 / (fan_in + fan_out))
        self.W[i+1] = np.random.normal(loc=0, scale=std_dev_xavier, size=(fan_out, fan_in))
      else:  # 'HE'
        std_dev_he = np.sqrt(2.0 / fan_in)
        self.W[i+1] = np.random.randn(fan_out, fan_in) * std_dev_he

      self.B[i+1] = np.zeros((fan_out,1))

    return(self.W, self.B)

initial = WeightInitialization()  #Instantiation for the class initialization

# Activation functions

In [50]:
class ActivationFunctions:
  """Element-wise activations, their derivatives, and a column-wise softmax.

  All functions preserve the shape of their input. The network in this
  notebook passes matrices laid out as (units, samples), i.e. one column per
  sample.
  """

  def activation(self, z, activation_function):
    """Apply the named activation element-wise to the pre-activation ``z``.

    ``activation_function`` is one of 'SIGMOID', 'TANH' or 'RELU'.
    Raises ValueError for any other name (previously ``None`` was returned).
    """
    if activation_function == 'SIGMOID':
      return 1/(1+np.exp(-z))
    if activation_function == 'TANH':
      return np.tanh(z)
    if activation_function == 'RELU':
      # No transpose here: the output must have the same (units, samples)
      # shape as the input or the next layer's matrix product fails.
      return np.maximum(0,z)
    raise ValueError("Unknown activation function %r" % (activation_function,))

  def softmax(self, z):
    """Softmax over axis 0, so every column (sample) sums to 1.

    The column maximum is subtracted before exponentiating; this leaves the
    result unchanged mathematically but prevents overflow for large logits.
    """
    shifted = z - np.max(z, axis = 0, keepdims = True)
    exp_z = np.exp(shifted)
    return exp_z/np.sum(exp_z, axis = 0, keepdims = True)

  def der_activation(self, z, activation_function):
    """Derivative of the named activation, expressed via its OUTPUT.

    ``z`` is expected to be the already-activated value h = f(a) (this is
    what back-propagation passes in), hence sigmoid' = h(1-h) and
    tanh' = 1-h^2. For ReLU, h > 0 exactly where the pre-activation was > 0.
    Raises ValueError for an unknown name.
    """
    if activation_function == 'SIGMOID':
      return z*(1-z)
    if activation_function == 'TANH':
      return (1 - z**2)
    if activation_function == 'RELU':
      return np.where(z > 0, 1, 0)
    raise ValueError("Unknown activation function %r" % (activation_function,))

act = ActivationFunctions()

# Loss Functions

In [82]:
class LossFunctions:
  """Scalar loss functions comparing targets ``y`` with predictions ``y_pred``."""

  def MSE(self, y, y_pred):
    """Sum of squared differences divided by ``len(y)`` (size of the first axis)."""
    residual = y - y_pred
    squared_total = np.sum(residual**2)
    return squared_total/len(y)

  def cross_entropy(self, y, y_pred):
    """Cross-entropy summed over axis 0 and averaged over the remaining axis.

    Predictions are clipped into [1e-15, 1 - 1e-15] first so that log(0)
    can never occur.
    """
    clipped = np.clip(y_pred, 1e-15, 1 - 1e-15)
    log_probs = np.log(clipped)
    per_column = np.sum(np.multiply(-y, log_probs), axis = 0)
    return np.mean(per_column)

loss_functions = LossFunctions()

# Model

In [99]:
class Model():
  """Fully connected feed-forward classifier trained with back-propagation.

  Data layout: inputs ``x`` are (samples, features) and one-hot targets ``y``
  are (samples, classes). Internally every activation matrix is transposed to
  (units, samples). Hidden layers share one activation function; the output
  layer is a softmax trained with cross-entropy.

  Relies on the module-level helpers ``act``, ``initial``, ``loss_functions``
  and the ``Optimizer`` class defined elsewhere in this notebook.
  """

  def forward_prop(self, x, activation_function, W = None, B = None):
    """Run a forward pass and return class probabilities, shape (classes, samples).

    Pre-activations are cached in ``self.A`` and layer outputs in ``self.H``
    (``self.H[0]`` is the transposed input) for use by ``grad``.
    ``W``/``B`` default to the model's current parameters.
    """
    if W is None:
      W = self.W
    if B is None:
      B = self.B

    self.A = {}
    self.H = {}
    self.x = x
    self.H[0] = self.x.T
    out = self.nh + 1                     # index of the output layer

    # Hidden layers: affine transform followed by the chosen activation.
    for i in range(1, out):
      self.A[i] = np.matmul(W[i], self.H[i-1]) + B[i]
      self.H[i] = act.activation(self.A[i], activation_function)

    # Output layer: affine transform followed by softmax only (the hidden
    # activation is not applied to the logits).
    self.A[out] = np.matmul(W[out], self.H[out-1]) + B[out]
    self.y_pred = act.softmax(self.A[out])
    self.H[out] = self.y_pred
    return(self.y_pred)

  def grad(self, x, y, activation_function, W = None, B = None):
    """Back-propagate the cross-entropy loss for one batch.

    Returns ``(dW, dB)``: dictionaries keyed by layer number holding the
    gradients SUMMED over the batch (the optimizer is responsible for
    averaging). ``W``/``B`` default to the model's current parameters and are
    used for both the forward and the backward pass.
    """
    if W is None:
      W = self.W
    if B is None:
      B = self.B

    y_pred = self.forward_prop(x, activation_function, W, B)
    self.dW = {}
    self.dB = {}
    self.dA = {}
    self.dH = {}
    out = self.nh + 1
    # Gradient of softmax + cross-entropy with respect to the output logits.
    self.dA[out] = y_pred - y.T

    for i in range(out, 0, -1):
      self.dW[i]      = np.matmul(self.dA[i],self.H[i-1].T)
      self.dB[i]      = np.sum(self.dA[i], axis = 1).reshape(-1,1)
      if i > 1:
        # Propagate to the previous hidden layer. Layer 0 is the input, whose
        # gradient is never needed, so that (large) product is skipped.
        self.dH[i-1]  = np.matmul(W[i].T,self.dA[i])
        self.dA[i-1]  = np.multiply(self.dH[i-1], act.der_activation(self.H[i-1], activation_function))
    return (self.dW, self.dB)

  def fit(self, X, Y, X_val, Y_val, epochs, learning_rate, hidden_layers, neurons_per_layer, batch_size, optimizer, initialization, activation_function):
    """Train the network and plot the per-epoch training loss.

    Parameters
    ----------
    X, Y : training inputs (samples, features) and one-hot targets (samples, classes).
    X_val, Y_val : validation inputs and their INTEGER class labels.
    epochs, learning_rate, batch_size : usual training hyper-parameters;
        ``batch_size`` is forced to 1 when ``optimizer == 'SGD'``.
    hidden_layers, neurons_per_layer : network depth and (uniform) width.
    optimizer, initialization, activation_function : names understood by
        ``Optimizer``, ``WeightInitialization`` and ``ActivationFunctions``.

    After training, ``self.loss_epoc_store`` holds one mean training loss per
    epoch. Prints the layer sizes once, and the training loss and validation
    accuracy after every epoch.
    """
    self.loss_epoc_store = []
    self.nx = X.shape[1]
    self.ny = Y.shape[1]
    self.nh = hidden_layers
    self.neurons = neurons_per_layer
    hidden_layer_sizes = [self.neurons]*self.nh
    self.sizes = [self.nx] + hidden_layer_sizes + [self.ny]
    print(self.sizes)
    if optimizer == 'SGD':
      batch_size =1

    self.W, self.B = initial.initialize(self.sizes, initialization)
    opt = Optimizer(self.W, self.B, self.sizes, batch_size, learning_rate, optimizer)

    # Global update counter. It must NOT restart every epoch: Adam/Nadam use
    # it for bias correction, which would otherwise be re-applied each epoch.
    step = 0

    for _ in trange(epochs, total=epochs, unit="epoch"):
      self.loss_batch_store = []
      batch_counts = []

      for start in range(0,len(X),batch_size):
        step += 1
        self.x = X[start:start+batch_size,:]
        self.y = Y[start:start+batch_size,:]
        (self.dW, self.dB) = self.grad(self.x, self.y, activation_function, W = None, B = None)

        # Loss of this batch, taken from the forward pass done inside grad()
        # and recorded before the optimizer touches the parameters.
        loss_batch = loss_functions.cross_entropy(self.y.T ,self.y_pred)
        self.loss_batch_store.append(loss_batch)
        batch_counts.append(len(self.x))

        self.W, self.B = opt.learning_algoithms(self.x, self.y, self.dW, self.dB, step)

      # Per-sample mean over the epoch: batch means are weighted by their
      # size because the last batch may be smaller than batch_size.
      loss_epoc = np.average(self.loss_batch_store, weights = batch_counts)
      print('training loss = ', round(loss_epoc,4))
      self.loss_epoc_store.append(loss_epoc)

      # Validation accuracy: most probable class per column vs. true labels.
      y_pred_val = self.forward_prop(X_val, activation_function)
      Y_pred_val = np.argmax(y_pred_val, axis = 0)
      accuracy_val = accuracy_score(Y_val, Y_pred_val)
      print('validation accuracy = ', round(accuracy_val,2))

    plt.plot(self.loss_epoc_store)
    plt.xlabel('Epochs')
    plt.ylabel('log_loss')
    plt.show()

model = Model()

# Learning Algorithms

In [16]:
class Optimizer:
  """Gradient-based update rules for a layered network.

  ``W`` and ``B`` are dictionaries keyed by layer number (1..nh+1) holding
  NumPy arrays. They are updated IN PLACE, so the caller's dictionaries and
  arrays always reflect the latest parameters.

  Supported ``optimizer`` names:
    'SGD'     - plain gradient step (gradient used as given),
    'MBGD'    - mini-batch gradient descent (batch-averaged gradient),
    'MGD'     - momentum-based gradient descent,
    'NAG'     - Nesterov accelerated gradient descent,
    'RMSPROP' - root mean squared propagation,
    'ADAM'    - adaptive moment estimation,
    'NADAM'   - Nesterov adaptive moment estimation.
  """

  SUPPORTED = ('SGD', 'MBGD', 'MGD', 'NAG', 'RMSPROP', 'ADAM', 'NADAM')

  def __init__(self,W, B, sizes, batch_size, learning_rate, optimizer):
    """Store the parameters to optimise and zero-initialise the moment buffers.

    Raises ValueError for an unknown ``optimizer`` name (previously an
    unknown name made every update a silent no-op).
    """
    if optimizer not in self.SUPPORTED:
      raise ValueError(
          "Unknown optimizer %r; expected one of %s"
          % (optimizer, ', '.join(self.SUPPORTED)))

    self.W = W
    self.B = B
    self.sizes = sizes
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.optimizer = optimizer
    self.nh = len(self.sizes) - 2
    # v_*: velocity (momentum/NAG) or second-moment estimate (RMSProp/Adam).
    # m_*: first-moment estimate (Adam/Nadam).
    self.v_w = {}
    self.v_b = {}
    self.m_w = {}
    self.m_b = {}
    for i in range(self.nh + 1):
      self.v_w[i+1] = np.zeros((self.sizes[i+1], self.sizes[i]))
      self.v_b[i+1] = np.zeros((self.sizes[i+1],1))
      self.m_w[i+1] = np.zeros((self.sizes[i+1], self.sizes[i]))
      self.m_b[i+1] = np.zeros((self.sizes[i+1],1))

  def learning_algoithms(self,x, y, dW, dB, update):
    """Apply one parameter update and return ``(W, B)``.

    Parameters
    ----------
    x, y : the batch the gradients were computed on. Only the number of rows
        of ``x`` is used, to average the gradients over the ACTUAL batch
        (the last batch of an epoch may be smaller than ``batch_size``).
        If ``x`` is not a 2-D array, ``batch_size`` is used instead.
    dW, dB : dictionaries of gradients summed over the batch, keyed by layer.
    update : 1-based count of updates performed so far, used by ADAM/NADAM
        for bias correction. It should keep increasing across epochs.

    Raises
    ------
    ValueError
        If ``update`` < 1 for ADAM/NADAM (the bias correction would divide
        by zero).
    """
    name = self.optimizer
    lr = self.learning_rate
    n = x.shape[0] if getattr(x, 'ndim', 0) == 2 else self.batch_size

    beta = 0.9                # momentum / RMSProp decay
    beta1 = 0.9               # Adam first-moment decay
    beta2 = 0.999             # Adam second-moment decay
    eps = 1e-8

    if name in ('ADAM', 'NADAM'):
      if update < 1:
        raise ValueError("update must be >= 1 for %s" % name)
      corr1 = 1 - np.power(beta1, update)
      corr2 = 1 - np.power(beta2, update)

    # Weights and biases follow exactly the same rule, so each layer is
    # processed once per (parameters, gradients, first moment, second moment).
    groups = ((self.W, dW, self.m_w, self.v_w),
              (self.B, dB, self.m_b, self.v_b))

    for k in range(1, self.nh + 2):
      for P, G, m, v in groups:

        if name == 'SGD':           # Stochastic gradient descent
          P[k] -= lr * G[k]
          continue

        g = G[k]/n                  # batch-averaged gradient

        if name == 'MBGD':          # Mini-batch gradient descent
          P[k] -= lr * g

        elif name == 'MGD':         # Momentum-based gradient descent
          v[k] = beta*v[k] + lr * g
          P[k] -= v[k]

        elif name == 'NAG':         # Nesterov accelerated gradient descent
          # Reformulated Nesterov step as used by common frameworks: the
          # stored parameters play the role of the look-ahead point, so the
          # gradient already supplied is the look-ahead gradient and no
          # second backward pass through the model is required.
          v[k] = beta*v[k] + lr * g
          P[k] -= beta*v[k] + lr * g

        elif name == 'RMSPROP':     # Root mean squared propagation
          v[k] = beta*v[k] + (1-beta) * (g**2)
          P[k] -= (lr/np.sqrt(v[k] + eps)) * g

        else:                       # 'ADAM' / 'NADAM'
          m[k] = beta1*m[k] + (1-beta1) * g
          v[k] = beta2*v[k] + (1-beta2) * (g**2)
          m_hat = m[k]/corr1
          v_hat = v[k]/corr2        # bias-corrected second moment is what scales the step
          if name == 'ADAM':        # Adaptive moment estimation
            P[k] -= (lr/(np.sqrt(v_hat) + eps)) * m_hat
          else:                     # Nesterov adaptive moment estimation
            P[k] -= (lr/(np.sqrt(v_hat) + eps)) * (beta1*m_hat + ((1-beta1)*g)/corr1)

    return(self.W, self.B)

  # Correctly spelt alias; the original (misspelt) name is kept for callers.
  learning_algorithms = learning_algoithms

# Train

In [None]:
# Train on the full (scaled) training set and report validation accuracy after
# every epoch. Targets for training are one-hot encoded, whereas `Y_val` takes
# the raw integer labels because fit() compares them with arg-max predictions.
# Accepted names: optimizer in {SGD, MBGD, MGD, NAG, RMSPROP, ADAM, NADAM},
# initialization in {RANDOM, XAVIER, HE}, activation_function in
# {SIGMOID, TANH, RELU}.
model.fit(X = X_train_scaled, 
          Y = y_train_enc,
          X_val = X_val_scaled,
          Y_val = y_val,
          epochs = 20, 
          learning_rate = 0.0001,
          hidden_layers = 2,
          neurons_per_layer = 64,
          batch_size = 32,
          optimizer = 'NADAM',
          initialization = 'HE',
          activation_function = 'TANH')
# When optimizer == 'SGD', fit() ignores the value above and sets batch_size to 1.