In [1]:
# Agenda
# 1. Implement diff optimizers
# 2. Learn working of wandb
# 3. Adjust code for other hyperparameters
# 4. Create sweeps and wandb

## MA23M002 - ABHINAV T K <br> CS6910 - Assignment 1

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.datasets import fashion_mnist
from sklearn.model_selection import train_test_split
#from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [3]:
# Load the Fashion MNIST dataset
(x, y), (x_test, y_test) = fashion_mnist.load_data()

# Change the shape of the data to (60000, 784)
x = x.reshape(x.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)

# Normalize the data
x = x/255.0
x_test = x_test/255.0

# Splitting the data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=42)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [4]:
# Dataset analysis and data preprocessing

# no. of classes
c = len(np.unique(y_train))
print("Total no. of classes = ", c)

# Input features
inp_features = x_train.shape[1]
print("Number of input features = ", inp_features)

# training samples size
m = x_train.shape[0]
print("Training samples = ", m)

# validation samples size
m_val = x_val.shape[0]
print("Validation samples = ", m_val)

# test samples size
m_test = x_test.shape[0]
print("Test samples = ", m_test)

# Class names - the index of the class names corresponds to the class label
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']


Total no. of classes =  10
Number of input features =  784
Training samples =  54000
Validation samples =  6000
Test samples =  10000


In [5]:
# One hot encoding y
y_train_encoded = np.eye(np.max(y_train) + 1)[y_train].T
y_val_encoded = np.eye(np.max(y_val) + 1)[y_val].T
y_test_encoded = np.eye(np.max(y_test) + 1)[y_test].T


# Neural Network Architecture

In [6]:
# Defining activation functions and their derivatives

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_grad(x):
    return sigmoid(x) * (1-sigmoid(x))

def relu(x):
    return np.maximum(0,x)

def relu_grad(x):
    return 1*(x>0)

def tanh(x):
    return np.tanh(x)

def tanh_grad(x):
    return (1 - (np.tanh(x)**2))

def softmax(x):
    return np.exp(x) / np.sum(np.exp(x), axis=0)

def softmax_grad(x):
    return softmax(x) * (1-softmax(x))


In [7]:
# Initializing parameters W and b

def initialize_parameters(nn_layers):
  '''
  nn_layers: a list containing the number of neurons of each layer - where each layer no. is the index of the list
  '''
  np.random.seed(32)
  parameters = {}                         # dictionary to hold weights and biases of each layer
  prev_v = {}
  for i in range(1, len(nn_layers)):
    parameters["W"+str(i)] = np.random.rand(nn_layers[i], nn_layers[i-1])*0.01
    parameters["b"+str(i)] = np.random.rand(nn_layers[i], 1)*0.01

    prev_v["W"+str(i)] = np.zeros((nn_layers[i], nn_layers[i-1]))
    prev_v["b"+str(i)] = np.zeros((nn_layers[i], 1))
  return parameters, prev_v

In [8]:
# Forward propagation
def forward_propagation(x, nn_layers, parameters, act_fn):

  l = len(nn_layers)  # 5 # No. of neural network layers, including input and output layers

  a = {}              # dictionary to hold hidden layer (pre-activation)
  h = {}              # dictionary to hold hidden layer (activation)

  h[0] = x.T  # input layer
  #print(h[0].shape)
  for i in range(1, l-1):
    W = parameters["W"+str(i)]        # weights of hidden layer i
    b = parameters["b"+str(i)]        # bias of hidden layer i
    a[i] = np.matmul(W,h[i-1]) + b

    # activation for hidden layers
    if act_fn == 'sigmoid':
      h[i] = sigmoid(a[i])
    elif act_fn == 'relu':
      h[i] = relu(a[i])
    elif act_fn == 'tanh':
      h[i] = tanh(a[i])

  # output layer
  W = parameters["W"+str(l-1)]    # weights of hidden layer i
  b = parameters["b"+str(l-1)]    # bias of hidden layer i
  a[l-1] = np.matmul(W,h[l-2]) + b          # activation function for output layer

  y_hat = softmax(a[l-1])
  return y_hat, h, a                # Returns y_hat, h, a

In [9]:
# Backpropagation
def back_propagation(y_hat, y, h, a, nn_layers, parameters, act_fn, batch_size):
  l = len(nn_layers)
  grads = {}            # dictionary to store gradient of loss function wrt parameters and hidden layer neurons

  # Computing gradient wrt output layer
  grads["grada"+str(l-1)] = y_hat - y

  for i in range(l-1,0,-1):
    grads["gradW" + str(i)] = (1/batch_size)*np.dot(grads["grada" + str(i)], h[i-1].T)
    grads["gradb" + str(i)] = (1/batch_size)*np.sum(grads["grada" + str(i)], axis=1, keepdims=True)
    if i>1:
      if act_fn == 'sigmoid':
        grads["grada"+str(i-1)] = np.matmul(parameters["W" + str(i)].T, grads["grada" + str(i)]) * sigmoid_grad(a[i-1])   # Computing gradients wrt hidden layers
      elif act_fn == 'relu':
        grads["grada"+str(i-1)] = np.matmul(parameters["W" + str(i)].T, grads["grada" + str(i)]) * relu_grad(a[i-1])
      elif act_fn == 'tanh':
        grads["grada"+str(i-1)] = np.matmul(parameters["W" + str(i)].T, grads["grada" + str(i)]) * tanh_grad(a[i-1])
  return grads

# Update parameter based on different optimizers

In [10]:
def param_update_sgd(parameters, grads , lr):
    l = len(parameters) // 2    # no. of hidden layers + outer layer
    for i in range(1, l + 1):
        parameters["W" + str(i)] = parameters["W" + str(i)] - lr * grads["gradW" + str(i)]
        parameters["b" + str(i)] = parameters["b" + str(i)] - lr * grads["gradb" + str(i)]
    return parameters

In [11]:
def param_update_momentum(parameters, grads , lr, beta, prev_v):
  l = len(parameters) // 2     # no. of hidden layers + outer layer
  eta = 1.0
  for i in range(1, l + 1):
    prev_v["W"+str(i)] = beta*prev_v["W"+str(i)] + eta*grads["gradW" + str(i)]
    prev_v["b"+str(i)] = beta*prev_v["b"+str(i)] + eta*grads["gradb" + str(i)]

    parameters["W" + str(i)] = parameters["W" + str(i)] - lr*prev_v["W"+str(i)]
    parameters["b" + str(i)] = parameters["b" + str(i)] - lr*prev_v["b"+str(i)]

  return parameters, prev_v

In [12]:
def param_update_nesterov(parameters, grads , lr, beta, prev_v):
  l = len(parameters) // 2     # no. of hidden layers + outer layer
  eta = 1.0
  for i in range(1, l + 1):
    prev_v["W"+str(i)] = beta*prev_v["W"+str(i)] + eta*grads["gradW" + str(i)]
    prev_v["b"+str(i)] = beta*prev_v["b"+str(i)] + eta*grads["gradb" + str(i)]

    parameters["W" + str(i)] = parameters["W" + str(i)] - lr*prev_v["W"+str(i)]
    parameters["b" + str(i)] = parameters["b" + str(i)] - lr*prev_v["b"+str(i)]

  return parameters, prev_v

# Compute loss and accuracy

In [13]:
def compute_loss(y, y_hat, batch_size, parameters):
  loss = (1/batch_size)*(-1.0 * np.sum(np.multiply(y, np.log(y_hat))))
  return loss

# Train the model

In [18]:
def train_model(x_train, y_train, epochs = 10, num_hidden_layers = 3, num_neurons = 128, learning_rate = 0.001, act_fn = 'sigmoid', weight_init = 'xavier',
                optimizer = 'sgd',  batch_size = 512, wt_decay_l2 = 0):
  '''
  number of epochs: 5, 10
  number of hidden layers: 3, 4, 5
  size of every hidden layer: 32, 64, 128
  weight decay (L2 regularisation): 0, 0.0005, 0.5
  learning rate: 1e-3, 1e-4
  optimizer: sgd, momentum, nesterov, rmsprop, adam, nadam
  batch size: 16, 32, 64
  weight initialisation: random, Xavier
  activation functions: sigmoid, tanh, ReLU
  '''

  nn_layers = [inp_features] + [num_neurons]*num_hidden_layers + [c]
  parameters, prev_v = initialize_parameters(nn_layers)
  params_nesterov = parameters.copy()
  l = len(parameters) // 2     # no. of hidden layers + outer layer
  beta = 0.9      # decay rate
  for epoch in range(epochs):
    for i in range(0, x_train.shape[0], batch_size):
      batch_sz = min(batch_size, x_train.shape[0] - i)

      if optimizer == 'nesterov':
        for j in range(1, l+1):
          params_nesterov["W"+str(j)] = parameters["W"+str(j)] - beta*prev_v["W"+str(j)]
          params_nesterov["b"+str(j)] = parameters["b"+str(j)] - beta*prev_v["b"+str(j)]
        # calculating grads for look ahead
        y_hat, h, a = forward_propagation(x_train[i:i+batch_size,:], nn_layers, params_nesterov, act_fn)
        grads = back_propagation(y_hat, y_train[:,i:i+batch_size], h, a, nn_layers, params_nesterov, act_fn, batch_sz)
        # parameter update for nesterove using grad calculated by look ahead
        parameters, prev_v = param_update_momentum(parameters, grads , lr, beta, prev_v)

      else:
        y_hat, h, a = forward_propagation(x_train[i:i+batch_size,:], nn_layers, parameters, act_fn)
        grads = back_propagation(y_hat, y_train[:,i:i+batch_size], h, a, nn_layers, parameters, act_fn, batch_sz)
        if optimizer == 'sgd':
          parameters = param_update_sgd(parameters, grads, lr)
        elif optimizer == 'momentum':
          parameters, prev_v = param_update_momentum(parameters, grads , lr, beta, prev_v)

    # Mean loss for the full training set
    y_hat, _, _ = forward_propagation(x_train, nn_layers, parameters, act_fn)
    cost = compute_loss(y_train, y_hat, 54000, parameters)
    y_hat = np.argmax(y_hat, axis=0)
    accuracy = np.mean(y_hat == y_train)
    print("Epochs = ", epoch, "\tTraining cost:", cost, "\tAccuracy:", accuracy)
  return parameters

In [21]:
if __name__=='__main__':
  iter = 0
  epochs = 10
  lr = 0.01 # learning rate
  batch_size = 64
  params = train_model(x_train, y_train_encoded, epochs = 20, act_fn = 'tanh', optimizer = 'nesterov' )

Epochs =  0 	Training cost: 2.151706342147235 	Accuracy: 0.0
Epochs =  1 	Training cost: 1.7750193807382109 	Accuracy: 0.0
Epochs =  2 	Training cost: 1.6602813456769059 	Accuracy: 0.0
Epochs =  3 	Training cost: 1.6142142941339568 	Accuracy: 0.0
Epochs =  4 	Training cost: 1.5842740885577553 	Accuracy: 0.0
Epochs =  5 	Training cost: 1.5601537727739827 	Accuracy: 0.0
Epochs =  6 	Training cost: 1.5376102053275325 	Accuracy: 0.3523833333333333
Epochs =  7 	Training cost: 2.6548039680183444 	Accuracy: 6.666666666666667e-05
Epochs =  8 	Training cost: 2.851006063161291 	Accuracy: 0.0
Epochs =  9 	Training cost: 2.8157534858214115 	Accuracy: 0.0
Epochs =  10 	Training cost: 3.0188074111589223 	Accuracy: 0.03635740740740741
Epochs =  11 	Training cost: 2.856454948927334 	Accuracy: 0.005518518518518518
Epochs =  12 	Training cost: 2.336648585208616 	Accuracy: 0.030562962962962962
Epochs =  13 	Training cost: 2.0455115785616402 	Accuracy: 0.0
Epochs =  14 	Training cost: 1.9680194625018743 	