In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
from keras.datasets import fashion_mnist
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [7]:
pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import wandb
wandb.login()
wandb.init(project="test-1")

[34m[1mwandb[0m: Currently logged in as: [33mcs22m013[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


**Use the standard train/test split of fashion_mnist (use (X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()). Keep 10% of the training data aside as validation data for this hyperparameter search**

In [9]:
fashion_mnist = keras.datasets.fashion_mnist
(X, y), (test_images, test_labels) = fashion_mnist.load_data()
train_images, val_images, train_labels, val_labels = train_test_split(X, y, test_size=0.1, random_state=42)

# **Q1. Download the fashion-MNIST dataset and plot 1 sample image for each class.**

In [10]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
k=len(class_names)
images_plot,count=[],0
for i in range(len(train_images)):
  if(count>=k): break
  if(count==train_labels[i]):
    images_plot.append(train_images[i])
    count+=1

wandb.init(entity="cs22m013", project="dl_assignment1")
wandb.log({"Sample Image from each class": [wandb.Image(image, caption=caption) for image, caption in zip(images_plot, class_names)]})
wandb.run.name = "Classes of Fashion_MNIST dataset"
wandb.run.save()



True

**Flatten the Training, Testing & Validation**

In [26]:
#flat the input
image_size=28*28
def flatten_input():
  image_size=28*28
  train_images_X=np.zeros(shape=(len(train_images),image_size),dtype='float32')  #60000x728
  test_images_X=np.zeros(shape=(len(test_images),image_size),dtype='float32')    #10000x728
  val_images_X=np.zeros(shape=(len(val_images),image_size),dtype='float32') 

  for i in range(len(train_images)):
    train_images_X[i]=train_images[i].flatten()/255

  for i in range(len(test_images)):
    test_images_X[i]=test_images[i].flatten()/255

  for i in range(len(val_images)):
    val_images_X[i]=val_images[i].flatten()/255
  return train_images_X,test_images_X,val_images_X

train_images_X,test_images_X,val_images_X=flatten_input()

**Loss function for MSE && Cross Entroy with L2 Regularization**

In [12]:
def l2loss(parameters,lamda):
  l2loss=0
  for i in range(1,len(parameters)//2+1):
    l2loss+=(lamda/2)*np.sum(np.linalg.norm(parameters['W'+str(i)])**2)
  return l2loss

def loss_function(loss_type,true_output,predicted_output,parameters,lamda):
  if(loss_type=='cross_entropy'):
    return -1.0*np.sum(true_output*np.log(predicted_output+1e-9))
  if(loss_type=='mse'):
    return (1/2) * np.sum((true_output-predicted_output)**2)


**Activation functions**

In [13]:
def relu(x):
    return np.maximum(0,x)

def tanh(x):
    return np.tanh(x)

def sigmoid(x): 
   return 1./(1.+np.exp(-x))

def derivative_relu(x):
  return 1*(x>0) 

def softmax(x):
  x=x-max(x)
  return np.exp(x)/np.sum(np.exp(x))

def identity(x):
  return x

def derivative_identity(x):
  return np.ones(x.shape)

def derivative_sigmoid(x):
  return sigmoid(x)*(np.ones_like(x)-sigmoid(x))

def derivative_tanh(x):
  return (1 - (np.tanh(x)**2))

def derivative_softmax(x):
  return softmax(x) * (1-softmax(x))

def activation_function(funct,x,derivative=False):
  if derivative==True:
    if funct=='softmax':
      return derivative_softmax(x)
    if funct=="sigmoid":
      return derivative_sigmoid(x)
    if funct=="relu":
      return derivative_relu(x)
    if funct=="tanh":
      return derivative_tanh(x)
    if funct=="identity":
      return derivative_identity(x)
  
  else:
    if funct=='softmax':
      return softmax(x)
    if funct=="sigmoid":
      return sigmoid(x)
    if funct=="relu":
      return relu(x)
    if funct=="tanh":
      return tanh(x)
    if funct=="identity":
      return identity(x)


**Helper functions**

In [None]:

#one hot encoding of labels
def one_hot(i):
  y=np.zeros((10,1))
  y[i]=1
  return y

#returns predictions and accuracy for (images,labels).
def find_pred(X,y,params,number_hidden_layers,hidden_layer_size,k,function):
    y_pred=[]
    cnt=0
    for i in range(len(X)):
        predicted_y,activation,pre_activation=feed_forward(number_hidden_layers,params,hidden_layer_size,k,X[i],function)
        y_pred.append(np.argmax(predicted_y))
        if(np.argmax(predicted_y)==y[i]):
          cnt+=1
    accuracy=(cnt/len(X))*100
    return y_pred,accuracy

#returs Validation loss & Validation accuracy for Validation Set.
def validationloss(X,y,params,lamda,number_hidden_layers,hidden_layer_size,function,loss_type):
  cnt,loss=0,0
  for i in range(len(X)):
    predicted_y,activation,pre_activation=feed_forward(number_hidden_layers,params,hidden_layer_size,k,X[i],function)
    loss+=loss_function(loss_type,one_hot(y[i]),predicted_y,params,lamda)
    if(np.argmax(predicted_y)==y[i]):
      cnt+=1
  loss+=l2loss(params,lamda)
  Validation_loss=loss/len(X)  
  Validation_accuracy=cnt/len(X)*100
  return Validation_loss,Validation_accuracy

#helper function for logging in WandB
def calculate(loss,X,y,parameters,number_hidden_layers,hidden_layer_size,k,function,lamda,epoch,loss_type):
  Training_loss=loss/len(X)
  y_pred,Training_accuracy=find_pred(X,y,parameters,number_hidden_layers,hidden_layer_size,k,function)
  Validation_loss,Validation_accuracy=validationloss(val_images_X,val_labels,parameters,lamda,number_hidden_layers,hidden_layer_size,function,loss_type)
  print('Epoch:',epoch)
  print('Training loss:',Training_loss)
  print('Training_accuracy',Training_accuracy)
  print('Validation loss:', Validation_loss)
  print('Validation accuracy:',Validation_accuracy)
  wandb.log({'Training_accuracy':Training_accuracy,'Epoch':epoch,'Training_loss':Training_loss,'Validation_loss':Validation_loss,'Validation_accuracy':Validation_accuracy})
  



**Helper functions for initialization**

In [15]:
def init(n_in, n_out,initialization):
    if(initialization=='random'):
      return np.random.default_rng().uniform(low=-0.69,high=0.69,size=(n_in,n_out))
    if(initialization=='xavier'):
        return  np.random.randn(n_in,n_out)*np.sqrt(2/(n_in+n_out))
    if(initialization=='zero'):
      return np.zeros((n_in,n_out))

#initializes parameters w.r.t to number of layers & initialization provided.
def initialize_parameters(number_of_neurons,number_hidden_layers,k,layers,initialization):
  parameters={}
  parameters['W'+str(1)]=init(layers[0],image_size,initialization)
  parameters['b'+str(1)]=init(layers[0],1,initialization)
  for i in range(1,number_hidden_layers):
    parameters['W'+str(i+1)]=init(layers[i],layers[i-1],initialization)
    parameters['b'+str(i+1)]=init(layers[i],1,initialization)
  parameters['W'+str(number_hidden_layers+1)]=init(k,layers[-1],initialization)
  parameters['b'+str(number_hidden_layers+1)]=init(k,1,initialization)
  return parameters

# **Q2.Implement a feedforward neural network which takes images from the fashion-mnist data as input and outputs a probability distribution over the 10 classes.**

In [16]:
def feed_forward(number_hidden_layers,parameters,hidden_layer_size,k,data,function):
  activation={}
  pre_activation={}
  activation['h0']=data.reshape(784,1)
  for i in range(1,number_hidden_layers+1):
    a=np.add(parameters['b'+str(i)],np.matmul(parameters['W'+str(i)],activation['h'+str(i-1)]))
    h=activation_function(function,a)
    #h=sigmoid(a)
    pre_activation['a'+str(i)]=a
    activation['h'+str(i)]=h
  
  a=np.add(parameters['b'+str(number_hidden_layers+1)],np.matmul(parameters['W'+str(number_hidden_layers+1)],activation['h'+str(number_hidden_layers)]))
  h=softmax(a)
  pre_activation['a'+str(number_hidden_layers+1)]=a
  activation['h'+str(number_hidden_layers+1)]=h
  
  return h,activation,pre_activation



# **Q3.Implement the backpropagation algorithm with support for the following optimisation functions**




In [17]:
#h:: activation a=:preactivation
def back_propogation(parameters,activation,pre_activation,X,y,number_hidden_layers,predicted_y,k,lamda,function,loss_type):

  gradient_parameters,gradient_activation,gradient_preactivation={},{},{}

  #compute output gradient
  e_y=np.zeros((k,1))
  e_y[y][0]=1
  if(loss_type=='cross_entropy'):
    gradient_preactivation['a'+str(number_hidden_layers+1)]=-(e_y-predicted_y)
  else:
    gradient_preactivation['a'+str(number_hidden_layers+1)]=(predicted_y-e_y)*activation_function('softmax',pre_activation['a'+str(number_hidden_layers+1)],True)

  for t in range(number_hidden_layers+1,0,-1):
    #compute gradients w.r.t parameters
    gradient_parameters['W'+str(t)]=np.matmul(gradient_preactivation['a'+str(t)],activation['h'+str(t-1)].T) 
    gradient_parameters['b'+str(t)]=gradient_preactivation['a'+str(t)]
    #print(t,gradient_parameters['W'+str(t)])
    if(t==1):break
    #compute gradients w.r.t layers below
    gradient_activation['h'+str(t-1)]=np.matmul(parameters['W'+str(t)].T,gradient_preactivation['a'+str(t)])

    #compute gradients w.r.t preactivation layer
    gradient_preactivation['a'+str(t-1)]=np.multiply(gradient_activation['h'+str(t-1)],activation_function(function,pre_activation['a'+str(t-1)],True))
  #print(pre_activation)
  return gradient_parameters

### Helper functions for update Rules in optimization function

In [18]:
#For SGD
def update_parameters(parameters,gradient_change,learning_rate):
  for i in range(1,len(parameters)//2+1):
    parameters['W'+str(i)]=parameters['W'+str(i)]-learning_rate*gradient_change['W'+str(i)]
    parameters['b'+str(i)]=parameters['b'+str(i)]-learning_rate*gradient_change['b'+str(i)]
  return parameters

#For Momentum
def update_parameters_momentum(parameters,gradient_change,prior_updates,learning_rate,beta):
  for i in range(1,len(parameters)//2+1):
    prior_updates['W'+str(i)]=beta*prior_updates['W'+str(i)]+gradient_change['W'+str(i)]
    parameters['W'+str(i)]=parameters['W'+str(i)]-learning_rate*prior_updates['W'+str(i)]

    prior_updates['b'+str(i)]=beta*prior_updates['b'+str(i)]+gradient_change['b'+str(i)]
    parameters['b'+str(i)]=parameters['b'+str(i)]-learning_rate*prior_updates['b'+str(i)]
  return parameters,prior_updates

#For RMSprop
def update_parameters_rmsprop(parameters,gradient_change,prior_updates,learning_rate,beta):
  epsilon=1e-9
  for i in range(1,len(parameters)//2+1):
    prior_updates['W'+str(i)]=beta*prior_updates['W'+str(i)]+(1-beta)*(gradient_change['W'+str(i)])**2
    parameters['W'+str(i)]=parameters['W'+str(i)]-gradient_change['W'+str(i)]*(learning_rate/np.sqrt(prior_updates['W'+str(i)]+epsilon))

    prior_updates['b'+str(i)]=beta*prior_updates['b'+str(i)]+(1-beta)*(gradient_change['b'+str(i)])**2
    parameters['b'+str(i)]=parameters['b'+str(i)]-gradient_change['b'+str(i)]*(learning_rate/np.sqrt(prior_updates['b'+str(i)]+epsilon))
  return parameters,prior_updates


## **Nesterov Accelerated Gradient Descent**



In [19]:
def gradient_descent_nag(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,X,y,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type):
  #initializing parameters,prior_updates,updates 
  parameters=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,initialization)
  prior_updates=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,'zero')
  updates=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,'zero')
  temp=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,'zero')
  for epoch in range(max_epochs):
    loss,cnt=0,0
    for it in range(1,len(parameters)//2+1):
      updates['W'+str(it)]=prior_updates['W'+str(it)]*beta
      updates['b'+str(it)]=prior_updates['b'+str(it)]*beta
    
    for it in range(1,len(parameters)//2+1):
      temp['W'+str(it)]=parameters['W'+str(it)]-updates['W'+str(it)]
      temp['b'+str(it)]=parameters['b'+str(it)]-updates['b'+str(it)]
    
    for i in range(len(X)):
      predicted_y,activation,pre_activation=feed_forward(number_hidden_layers,temp,hidden_layer_size,k,X[i],function)
      loss+=loss_function(loss_type,one_hot(y[i]),predicted_y,parameters,lamda)
      gradient_parameters=back_propogation(temp,activation,pre_activation,X[i],y[i],number_hidden_layers,predicted_y,k,lamda,function,loss_type)
      
      if(cnt==0):
        gradient_change=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,'zero')
      else:
        for iter in range(1,len(parameters)//2+1):
          gradient_change['W'+str(iter)]+=gradient_parameters['W'+str(iter)]
          gradient_change['b'+str(iter)]+=gradient_parameters['b'+str(iter)]
      cnt+=1

      if(cnt%batch_size==0 or i==len(X)-1):
        if(lamda!=0):
          for it in range(1,len(parameters)//2+1):
            gradient_change['W'+str(it)]+=np.dot(lamda,parameters['W'+str(it)])

        t=cnt if (i==len(X)-1) else batch_size
        for it in range(1,len(parameters)//2+1):
          gradient_change['W'+str(it)]=gradient_change['W'+str(it)]/t
          gradient_change['b'+str(it)]=gradient_change['b'+str(it)]/t
        cnt=0

        #update rule
        for it in range(1,len(parameters)//2+1):
          updates['W'+str(it)]=beta*prior_updates['W'+str(it)]+learning_rate*gradient_change['W'+str(it)]
          parameters['W'+str(it)]=parameters['W'+str(it)]-updates['W'+str(it)]
          temp['W'+str(it)]=parameters['W'+str(it)]
          prior_updates['W'+str(it)]=updates['W'+str(it)]

          updates['b'+str(it)]=beta*prior_updates['b'+str(it)]+learning_rate*gradient_change['b'+str(it)]
          parameters['b'+str(it)]=parameters['b'+str(it)]-updates['b'+str(it)]
          temp['b'+str(it)]=parameters['b'+str(it)]
          prior_updates['b'+str(it)]=updates['b'+str(it)]
    l2regularizedloss=l2loss(parameters,lamda)
    
    loss+=l2regularizedloss
    calculate(loss,X,y,parameters,number_hidden_layers,hidden_layer_size,k,function,lamda,epoch,loss_type)

  return parameters


## **Stochastic Gradient Descent**




In [20]:

def gradient_descent_sgd(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,X,y,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type):
  #initializing parameters
  parameters=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,initialization)
  
  for epoch in range(max_epochs):
    loss=0
    cnt=0
    for i in range(len(X)):
      predicted_y,activation,pre_activation=feed_forward(number_hidden_layers,parameters,hidden_layer_size,k,X[i],function)
      loss+=loss_function(loss_type,one_hot(y[i]),predicted_y,parameters,lamda)

      gradient_parameters=back_propogation(parameters,activation,pre_activation,X[i],y[i],number_hidden_layers,predicted_y,k,lamda,function,loss_type)
    
      if(cnt==0):
        gradient_change={}
        gradient_change=gradient_parameters.copy()
      else:
        for iter in range(1,len(parameters)//2+1):
          gradient_change['W'+str(iter)]+=gradient_parameters['W'+str(iter)]
          gradient_change['b'+str(iter)]+=gradient_parameters['b'+str(iter)]
      cnt+=1
      
      if(cnt%batch_size==0 or i==len(X)-1):
        t=cnt if (i==len(X)-1) else batch_size

        if(lamda!=0):
           for it in range(1,len(parameters)//2+1):
             gradient_change['W'+str(it)]+=np.dot(lamda,parameters['W'+str(it)])

        for it in range(1,len(parameters)//2+1):
          gradient_change['W'+str(it)]=gradient_change['W'+str(it)]/t
          gradient_change['b'+str(it)]=gradient_change['b'+str(it)]/t
        cnt=0
        parameters=update_parameters(parameters,gradient_change,learning_rate)
    l2regularizedloss=l2loss(parameters,lamda)
    l2regularizedloss/=len(X)
    calculate(loss,X,y,parameters,number_hidden_layers,hidden_layer_size,k,function,lamda,epoch,loss_type)


  return parameters


## **Momentum**

In [27]:
def gradient_descent_momentum(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,X,y,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type):
  #initializing parameters
  parameters=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,initialization)
  for epoch in range(max_epochs):
    loss=0
    cnt=0
    for i in range(len(X)):
      predicted_y,activation,pre_activation=feed_forward(number_hidden_layers,parameters,hidden_layer_size,k,X[i],function)
      #print(activation['h4'])
      
      loss+=loss_function(loss_type,one_hot(y[i]),predicted_y,parameters,lamda)
      gradient_parameters=back_propogation(parameters,activation,pre_activation,X[i],y[i],number_hidden_layers,predicted_y,k,lamda,function,loss_type)
      if( epoch==0 and i==0):
        #initialize with zero
        prior_updates=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,'zero')

      if(cnt==0):
        gradient_change={}
        gradient_change=gradient_parameters.copy()

      else:
        for iter in range(1,len(parameters)//2+1):
          gradient_change['W'+str(iter)]+=gradient_parameters['W'+str(iter)]
          gradient_change['b'+str(iter)]+=gradient_parameters['b'+str(iter)]
      cnt+=1
      
      if(cnt%batch_size==0 or i==len(X)-1):
        t=cnt if (i==len(X)-1) else batch_size

        
        if(lamda!=0):
          for it in range(1,len(parameters)//2+1):
            gradient_change['W'+str(it)]+=np.dot(lamda,parameters['W'+str(it)])

        for it in range(1,len(parameters)//2+1):
          gradient_change['W'+str(it)]=gradient_change['W'+str(it)]/t
          gradient_change['b'+str(it)]=gradient_change['b'+str(it)]/t
        cnt=0
        parameters,prior_updates=update_parameters_momentum(parameters,gradient_change,prior_updates,learning_rate,beta)
    l2regularizedloss=l2loss(parameters,lamda)
    loss+=l2regularizedloss

    calculate(loss,X,y,parameters,number_hidden_layers,hidden_layer_size,k,function,lamda,epoch,loss_type)

  return parameters


# **RMSprop**

In [21]:

def gradient_descent_rmsprop(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,X,y,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type):
  #initializing parameters
  parameters=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,initialization)
  for epoch in range(max_epochs):
    loss=0
    cnt=0
    for i in range(len(X)):
      predicted_y,activation,pre_activation=feed_forward(number_hidden_layers,parameters,hidden_layer_size,k,X[i],function)
      #print(activation['h4'])
      
      loss+=loss_function(loss_type,one_hot(y[i]),predicted_y,parameters,lamda)
      gradient_parameters=back_propogation(parameters,activation,pre_activation,X[i],y[i],number_hidden_layers,predicted_y,k,lamda,function,loss_type)
      if( epoch==0 and i==0):
        #initialize with zero
        prior_updates=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,'zero')

      if(cnt==0):
        gradient_change={}
        gradient_change=gradient_parameters.copy()

      else:
        for iter in range(1,len(parameters)//2+1):
          gradient_change['W'+str(iter)]+=gradient_parameters['W'+str(iter)]
          gradient_change['b'+str(iter)]+=gradient_parameters['b'+str(iter)]
      cnt+=1
      
      if(cnt%batch_size==0 or i==len(X)-1):
        t=cnt if (i==len(X)-1) else batch_size

        
        if(lamda!=0):
          for it in range(1,len(parameters)//2+1):
            gradient_change['W'+str(it)]+=np.dot(lamda,parameters['W'+str(it)])

        for it in range(1,len(parameters)//2+1):
          gradient_change['W'+str(it)]=gradient_change['W'+str(it)]/t
          gradient_change['b'+str(it)]=gradient_change['b'+str(it)]/t
        cnt=0
        parameters,prior_updates=update_parameters_rmsprop(parameters,gradient_change,prior_updates,learning_rate,beta)
    l2regularizedloss=l2loss(parameters,lamda)
    loss+=l2regularizedloss
    
    calculate(loss,X,y,parameters,number_hidden_layers,hidden_layer_size,k,function,lamda,epoch,loss_type)

  return parameters


## **Adam & Nadam**


In [22]:
def gradient_descent_adam(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,X,y,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type,beta1,beta2):
  #initializing parameters,momentum_hat,update_hat

  parameters=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,initialization)
  momentum=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,'zero')
  momentum_hat=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,'zero')
  update=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,'zero')
  update_hat=initialize_parameters(hidden_layer_size,number_hidden_layers,k,layers,'zero')

  for epoch in range(max_epochs):
    loss=0
    cnt=0
    epsilon=1e-10
    for i in range(len(X)):
      predicted_y,activation,pre_activation=feed_forward(number_hidden_layers,parameters,hidden_layer_size,k,X[i],function)
      loss+=loss_function(loss_type,one_hot(y[i]),predicted_y,parameters,lamda)
      gradient_parameters=back_propogation(parameters,activation,pre_activation,X[i],y[i],number_hidden_layers,predicted_y,k,lamda,function,loss_type)
      if(cnt==0):
        gradient_change={}
        gradient_change=gradient_parameters.copy()

      else:
        for iter in range(1,len(parameters)//2+1):
          gradient_change['W'+str(iter)]+=gradient_parameters['W'+str(iter)]
          gradient_change['b'+str(iter)]+=gradient_parameters['b'+str(iter)]
      cnt+=1
      
      if(cnt%batch_size==0 or i==len(X)-1):

        if(lamda!=0):
          for it in range(1,len(parameters)//2+1):
            gradient_change['W'+str(it)]+=np.dot(lamda,parameters['W'+str(it)])

        t=cnt if (i==len(X)-1) else batch_size
        for it in range(1,len(parameters)//2+1):
          gradient_change['W'+str(it)]=gradient_change['W'+str(it)]/t
          gradient_change['b'+str(it)]=gradient_change['b'+str(it)]/t
        
        cnt=0
        for it in range(1,len(parameters)//2+1):
          momentum['W'+str(it)]=beta1*momentum['W'+str(it)]+(1-beta1)*gradient_change['W'+str(it)]
          momentum['b'+str(it)]=beta1*momentum['b'+str(it)]+(1-beta1)*gradient_change['b'+str(it)]
          momentum_hat['W'+str(it)]=momentum['W'+str(it)]/(1-beta1**(epoch+1))
          momentum_hat['b'+str(it)]=momentum['b'+str(it)]/(1-beta1**(epoch+1))

        for it in range(1,len(parameters)//2+1):
          update['W'+str(it)]=beta2*update['W'+str(it)]+(1-beta2)*gradient_change['W'+str(it)]**2
          update['b'+str(it)]=beta2*update['b'+str(it)]+(1-beta2)*gradient_change['b'+str(it)]**2
          update_hat['W'+str(it)]=update['W'+str(it)]/(1-beta2**(epoch+1))
          update_hat['b'+str(it)]=update['b'+str(it)]/(1-beta2**(epoch+1))

        if(optimizer=='adam'):
          #update rule for adam
          for it in range(1,len(parameters)//2+1):
            parameters['W'+str(it)]=parameters['W'+str(it)]-(learning_rate*momentum_hat['W'+str(it)]/np.sqrt(update_hat['W'+str(it)]+epsilon))
            parameters['b'+str(it)]=parameters['b'+str(it)]-(learning_rate*momentum_hat['b'+str(it)]/np.sqrt(update_hat['b'+str(it)]+epsilon))
        else:
          #update rule for nadam  
          for it in range(1,len(parameters)//2+1):
            parameters['W'+str(it)]=parameters['W'+str(it)]-(learning_rate/np.sqrt(update_hat['W'+str(it)]+epsilon))*(beta1*momentum_hat['W'+str(it)]+(1-beta1)*gradient_change['W'+str(it)]/(1-beta1**(epoch+1)))
            parameters['b'+str(it)]=parameters['b'+str(it)]-(learning_rate/np.sqrt(update_hat['b'+str(it)]+epsilon))*(beta1*momentum_hat['b'+str(it)]+(1-beta1)*gradient_change['b'+str(it)]/(1-beta1**(epoch+1)))
    
    l2regularizedloss=l2loss(parameters,lamda)
    
    loss+=l2regularizedloss

    calculate(loss,X,y,parameters,number_hidden_layers,hidden_layer_size,k,function,lamda,epoch,loss_type)

  return parameters


**Neural Network function**

In [23]:
def NeuralNetwork():
  k=10
  beta=0.9 
  beta1=0.9
  beta2=0.99
  wandb.init()
  config=wandb.config
  loss_type=config.loss_type
  number_hidden_layers=config.number_hidden_layers
  hidden_layer_size=config.hidden_layer_size
  batch_size=config.batch_size
  max_epochs=config.max_epochs
  optimizer=config.optimizer
  function=config.function
  learning_rate=config.learning_rate
  lamda=config.lamda
  initialization=config.initialization
  layers=[hidden_layer_size for i in range(number_hidden_layers)]
  run_name = "lr_{}_ac_{}_in_{}_op_{}_bs_{}_L2_{}_ep_{}_nn_{}_nh_{}_loss_{}".format(learning_rate, function,initialization, optimizer, batch_size, lamda, max_epochs, hidden_layer_size, number_hidden_layers,loss_type)
  print(run_name)
  #can add optimizer if needed
  if(optimizer=='sgd'):
    params=gradient_descent_sgd(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,train_images_X,train_labels,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type)
  if(optimizer=='nag'):
    params=gradient_descent_nag(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,train_images_X,train_labels,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type)
  if(optimizer=='momentum'):
    params=gradient_descent_momentum(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,train_images_X,train_labels,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type)
  if(optimizer=='rmsprop'):
    params=gradient_descent_rmsprop(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,train_images_X,train_labels,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type)
  if(optimizer=='adam'):
    params=gradient_descent_adam(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,train_images_X,train_labels,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type,beta1,beta2)
  if(optimizer=='nadam'):
    params=gradient_descent_adam(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,train_images_X,train_labels,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type,beta1,beta2)
  
  pred_labels,accuracy=find_pred(test_images_X,test_labels,params,number_hidden_layers,hidden_layer_size,k,function)
  print()
  print("Testing Accuracy:",accuracy)
  wandb.run.name = run_name
  wandb.run.save()


  

# **Q4.Use the sweep functionality provided by wandb to find the best values for the hyperparameters listed below. Use the standard train/test split of fashion_mnist.**

In [None]:
sweep_config = {
  "name": "CS6910 Assignment - MSE & Cross Entropy Error Loss",
  "metric": {
      "name":"Validation_accuracy",
      "goal": "maximize"
  },
  "method": "bayes",
  "parameters": {
        "learning_rate": {
            "values": [0.001,0.1]
        },
        "function": {
            "values": ["relu","tanh","sigmoid"]
        },
        "initialization": {
            "values": ["xavier",'random']
        },
        "optimizer": {
            "values": ["adam","sgd","momentum","nadam","rmsprop"]
        },
        "batch_size": {
            "values": [32,16,64]
        },
        "max_epochs": {
            "values": [10,5]
        },
        "lamda": {
            "values": [0.0005,0.5,0]
        },
        "hidden_layer_size": {
            "values": [64,32,128]
        },
        "number_hidden_layers": {
            "values": [3,4,5]
        },
        "loss_type":{
            "values":['cross_entropy','mse']
        }
    }
}
sweep_id = wandb.sweep(sweep_config, entity="cs22m013", project="dl_assignment1")
wandb.agent(sweep_id, NeuralNetwork, count=1)

## **Report the accuracy on the test set of fashion_mnist and plot the confusion matrix**

In [25]:
#Best Configuration
k=10
beta=0.9 
beta1=0.9
beta2=0.99
loss_type="cross_entropy"
number_hidden_layers=5
hidden_layer_size=64
batch_size=32
max_epochs=10
optimizer="adam"
function="relu"
learning_rate=0.001
lamda=0.0005
initialization="xavier"
layers=[hidden_layer_size for i in range(number_hidden_layers)]
run_name = "lr_{}_ac_{}_in_{}_op_{}_bs_{}_L2_{}_ep_{}_nn_{}_nh_{}_loss_{}".format(learning_rate, function,initialization, optimizer, batch_size, lamda, max_epochs, hidden_layer_size, number_hidden_layers,loss_type)
print(run_name)
params=gradient_descent_adam(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,train_images_X,train_labels,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type,beta1,beta2)
  
pred_labels,accuracy=find_pred(test_images_X,test_labels,params,number_hidden_layers,hidden_layer_size,k,function)
print()
print("Testing Accuracy:",accuracy)
wandb.init(entity="cs22m013", project="dl_assignment1")
wandb.log({"confusion_matrix" : wandb.plot.confusion_matrix(probs=None,
                        y_true=test_labels, preds=np.array(pred_labels),
                        class_names=class_names)})

wandb.run.name="Confusion Matrix"
wandb.run.save()

  



lr_0.001_ac_relu_in_xavier_op_adam_bs_32_L2_0.0005_ep_10_nn_64_nh_5_loss_cross_entropy
Epoch: 0
Training loss: 0.5355055954120173
Training_accuracy 84.88888888888889
Validation loss: 0.4212272673147908
Validation accuracy: 84.6
Epoch: 1
Training loss: 0.38300197462476043
Training_accuracy 86.74444444444444
Validation loss: 0.38597467742762265
Validation accuracy: 86.1
Epoch: 2
Training loss: 0.34584950696958056
Training_accuracy 87.53703703703704
Validation loss: 0.37563364259971965
Validation accuracy: 86.63333333333333
Epoch: 3
Training loss: 0.3214613277012385
Training_accuracy 88.71666666666667
Validation loss: 0.359741932703385
Validation accuracy: 87.03333333333333
Epoch: 4
Training loss: 0.3043099202291101
Training_accuracy 89.22962962962963
Validation loss: 0.3522526804262778
Validation accuracy: 87.36666666666667
Epoch: 5
Training loss: 0.2906017986249653
Training_accuracy 89.65925925925926
Validation loss: 0.3479851840455489
Validation accuracy: 87.86666666666667
Epoch: 6
Tra

0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Training_accuracy,▁▃▄▆▆▇▇███
Training_loss,█▄▃▃▂▂▂▁▁▁
Validation_accuracy,▁▄▅▆▆▇▇███
Validation_loss,█▅▄▂▁▁▂▁▂▂

0,1
Epoch,9.0
Training_accuracy,90.58148
Training_loss,0.2512
Validation_accuracy,88.18333
Validation_loss,0.36106


True


# **Q8.In all the models above you would have used cross entropy loss. Now compare the cross entropy loss with the squared error loss.**



In [None]:
#Best confiig 1
sweep_config = {
  "name": "CS6910 Assignment - MSE & Cross Entropy Error Loss",
  "metric": {
      "name":"Validation_accuracy",
      "goal": "maximize"
  },
  "method": "grid",
  "parameters": {
        "learning_rate": {
            "values": [0.001]
        },
        "function": {
            "values": ["relu"]
        },
        "initialization": {
            "values": ["xavier"]
        },
        "optimizer": {
            "values": ["adam"]
        },
        "batch_size": {
            "values": [32]
        },
        "max_epochs": {
            "values": [10]
        },
        "lamda": {
            "values": [0.0005]
        },
        "hidden_layer_size": {
            "values": [64]
        },
        "number_hidden_layers": {
            "values": [5]
        },
        "loss_type":{
            "values":['cross_entropy','mse']
        }
    }
}
sweep_id = wandb.sweep(sweep_config, entity="cs22m013", project="dl_assignment1")
wandb.agent(sweep_id, NeuralNetwork, count=2)

In [None]:
#Best confiig 2
sweep_config = {
  "name": "CS6910 Assignment - MSE & Cross Entropy Error Loss",
  "metric": {
      "name":"Validation_accuracy",
      "goal": "maximize"
  },
  "method": "grid",
  "parameters": {
        "learning_rate": {
            "values": [0.1]
        },
        "function": {
            "values": ["relu"]
        },
        "initialization": {
            "values": ["xavier"]
        },
        "optimizer": {
            "values": ["sgd"]
        },
        "batch_size": {
            "values": [64]
        },
        "max_epochs": {
            "values": [10]
        },
        "lamda": {
            "values": [0]
        },
        "hidden_layer_size": {
            "values": [64]
        },
        "number_hidden_layers": {
            "values": [3]
        },
        "loss_type":{
            "values":['cross_entropy','mse']
        }
    }
}
sweep_id = wandb.sweep(sweep_config, entity="cs22m013", project="dl_assignment1")
wandb.agent(sweep_id, NeuralNetwork, count=2)

In [None]:
#Best config 3
sweep_config = {
  "name": "CS6910 Assignment - MSE & Cross Entropy Error Loss",
  "metric": {
      "name":"Validation_accuracy",
      "goal": "maximize"
  },
  "method": "grid",
  "parameters": {
        "learning_rate": {
            "values": [0.001]
        },
        "function": {
            "values": ["tanh"]
        },
        "initialization": {
            "values": ["xavier"]
        },
        "optimizer": {
            "values": ["nadam"]
        },
        "batch_size": {
            "values": [16]
        },
        "max_epochs": {
            "values": [10]
        },
        "lamda": {
            "values": [0]
        },
        "hidden_layer_size": {
            "values": [32]
        },
        "number_hidden_layers": {
            "values": [4]
        },
        "loss_type":{
            "values":['cross_entropy','mse']
        }
    }
}
sweep_id = wandb.sweep(sweep_config, entity="cs22m013", project="dl_assignment1")
wandb.agent(sweep_id, NeuralNetwork, count=2)

# **Q10. Give me 3 recommendations for what would work for the MNIST dataset**

In [34]:
fashion_mnist = keras.datasets.mnist
(X, y), (test_images, test_labels) = fashion_mnist.load_data()
train_images, val_images, train_labels, val_labels = train_test_split(X, y, test_size=0.1, random_state=42)
train_images_X,test_images_X,val_images_X=flatten_input()


In [None]:
#Best Config1
wandb.init(entity="cs22m013", project="dl_assignment1")
k=10
beta=0.9 
beta1=0.9
beta2=0.99
loss_type="cross_entropy"
number_hidden_layers=5
hidden_layer_size=64
batch_size=32
max_epochs=10
optimizer="adam"
function="relu"
learning_rate=0.001
lamda=0.0005
initialization="xavier"
layers=[hidden_layer_size for i in range(number_hidden_layers)]
run_name = "lr_{}_ac_{}_in_{}_op_{}_bs_{}_L2_{}_ep_{}_nn_{}_nh_{}_loss_{}".format(learning_rate, function,initialization, optimizer, batch_size, lamda, max_epochs, hidden_layer_size, number_hidden_layers,loss_type)
print(run_name)
params=gradient_descent_adam(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,train_images_X,train_labels,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type,beta1,beta2)
pred_labels,accuracy=find_pred(test_images_X,test_labels,params,number_hidden_layers,hidden_layer_size,k,function)
print()
print("Testing Accuracy:",accuracy)

  



In [39]:
#Best Config2
wandb.init(entity="cs22m013", project="dl_assignment1")
k=10
beta=0.9 
beta1=0.9
beta2=0.99
loss_type="cross_entropy"
number_hidden_layers=3
hidden_layer_size=64
batch_size=64
max_epochs=10
optimizer="sgd"
function="relu"
learning_rate=0.1
lamda=0
initialization="xavier"
layers=[hidden_layer_size for i in range(number_hidden_layers)]
run_name = "lr_{}_ac_{}_in_{}_op_{}_bs_{}_L2_{}_ep_{}_nn_{}_nh_{}_loss_{}".format(learning_rate, function,initialization, optimizer, batch_size, lamda, max_epochs, hidden_layer_size, number_hidden_layers,loss_type)
print(run_name)
params=gradient_descent_sgd(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,train_images_X,train_labels,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type)
pred_labels,accuracy=find_pred(test_images_X,test_labels,params,number_hidden_layers,hidden_layer_size,k,function)
print()
print("Testing Accuracy:",accuracy)




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

lr_0.1_ac_relu_in_xavier_op_sgd_bs_64_L2_0_ep_10_nn_64_nh_3_loss_cross_entropy
Epoch: 0
Training loss: 0.4002218551443951
Training_accuracy 93.93518518518519
Validation loss: 0.20474723279489493
Validation accuracy: 93.89999999999999
Epoch: 1
Training loss: 0.17535337884077473
Training_accuracy 95.9462962962963
Validation loss: 0.14142716497278382
Validation accuracy: 95.65
Epoch: 2
Training loss: 0.12685592194475184
Training_accuracy 96.94259259259259
Validation loss: 0.12032333544533355
Validation accuracy: 96.5
Epoch: 3
Training loss: 0.10011242759890493
Training_accuracy 97.42222222222222
Validation loss: 0.11173637462057351
Validation accuracy: 96.68333333333334
Epoch: 4
Training loss: 0.08155278336955364
Training_accuracy 97.8462962962963
Validation loss: 0.10355311807900783
Validation accuracy: 96.98333333333333
Epoch: 5
Training loss: 0.06857019743882108
Training_accuracy 98.0537037037037
Validation loss: 0.099737558929199
Validation accuracy: 97.13333333333334
Epoch: 6
Trainin

In [None]:
#Best Config3
wandb.init(entity="cs22m013", project="dl_assignment1")
k=10
beta=0.9 
beta1=0.9
beta2=0.99
loss_type="cross_entropy"
number_hidden_layers=5
hidden_layer_size=64
batch_size=32
max_epochs=10
optimizer="nadam"
function="tanh"
learning_rate=0.001
lamda=0
initialization="xavier"
layers=[hidden_layer_size for i in range(number_hidden_layers)]
run_name = "lr_{}_ac_{}_in_{}_op_{}_bs_{}_L2_{}_ep_{}_nn_{}_nh_{}_loss_{}".format(learning_rate, function,initialization, optimizer, batch_size, lamda, max_epochs, hidden_layer_size, number_hidden_layers,loss_type)
print(run_name)
params=gradient_descent_adam(number_hidden_layers,hidden_layer_size,batch_size,max_epochs,train_images_X,train_labels,k,optimizer,learning_rate,beta,layers,initialization,lamda,function,loss_type,beta1,beta2)
pred_labels,accuracy=find_pred(test_images_X,test_labels,params,number_hidden_layers,hidden_layer_size,k,function)
print()
print("Testing Accuracy:",accuracy)


