In [1]:
from keras.datasets import mnist
from matplotlib import pyplot as plt 
from sklearn.utils import shuffle
import numpy as np
import statistics
import math

In [2]:
#---------------------------------install abd import wandb -------------------------------------------------
%%capture
!pip install wandb -qqq
import wandb

In [3]:
#---------------------------------login to wandb -------------------------------
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33manshikag_2210[0m (use `wandb login --relogin` to force relogin)


In [4]:
#-------------------------------------import file load data as ld to load data from the file -------------------------
import load_data as ld

#----------------  loading train_data , train_labels , validation_data , validation_labels , test_data , test_labels ---------------
(train_data , train_labels , validation_data , validation_labels , test_data , test_labels) = ld.load()

In [5]:
class FeedForwardNeuralNetwork():

###################################################### Constructor ################################################################

  def __init__(self, input_dim, output_dim, hidden_dim, hidden_layers, activation = "sigmoid", weight_intialisation = "random"):
    np.random.seed(1234)
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim
    self.hidden_layers = hidden_layers
    self.L = hidden_layers
    self.activation = activation
    self.weight_initialisation = weight_initialisation
    self.initialize()


##################################### Function for storing dimensions of all the layers ############################################

  def size(self):
    l_sizes = []
    l_sizes.append(self.input_dim)
    for m in range(self.hidden_layers):
      l_sizes.append(self.hidden_dim)
    l_sizes.append(self.output_dim)
    return l_sizes


######################################## Function for initialization ###########################################3

  def initialize(self):
    size = self.size()    

#-------------------------- random weight-initialization ---------------------------------

    if self.weight_initialisation == "random":
      self.Weights=[np.random.randn(size[i + 1], size[i]) for i in range(len(size) - 1)]   

#--------------------------- xavier weight-initialization --------------------------------

    if self.weight_initialisation == "xavier":
      self.Weights=[np.random.randn(size[i + 1], size[i])*np.sqrt(2/(size[i]+size[i+1])) for i in range(len(size) - 1)]

#--------------Initialize Biases, Activation, Preactivation, update weights and update biases ndarray  ----------------------

    self.Update_Weights=[np.zeros((size[i + 1], size[i])) for i in range(len(size) - 1)]
    self.Biases= [np.zeros((size[i + 1], 1)) for i in range(len(size) - 1)]
    self.Update_Biases= [np.zeros((size[i + 1], 1)) for i in range(len(size) - 1)]
    self.A = [np.zeros((size[i + 1], 1)) for i in range(len(size) - 1)]
    self.H = [np.zeros((size[i + 1], 1)) for i in range(len(size) - 1)]
  
 ######################################### Activation Function ####################################################################
  
  def activation_function(self, x, activation = "sigmoid"):
    if activation == "sigmoid":
      return np.where(x >= 0, 1 / (1 + np.exp(-x)),np.exp(x) / (1 + np.exp(x)))

    if activation == "tanh":
      return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

    if activation == "relu":
      # return np.where(x>=0,x,0)
      return np.where(x>=0,0.12*x,0)


    if activation == "lrelu":
      return np.where(x>0,x,0.2*x)
      # temp = np.where(x>0,x,0)
      # print("temp in relu : ",temp)
      # return np.where(x>0,x,0)


 ######################################### Function for finding derivative of activation  ######################################

  def diff_activation_function(self, x, activation = "sigmoid"):
    if activation == "sigmoid":
      fx = np.where(x >= 0, 1 / (1 + np.exp(-x)),np.exp(x) / (1 + np.exp(x)))
      return fx * (1 - fx)

    if activation == "tanh":
      fx = (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
      return (1 - np.power((fx),2))

    if activation == "relu":
      return np.where(x>=0,0.12,0) 

    if activation == "lrelu":
      return np.where(x>=0,1,0.2)

    return


######################################### Prectivation Function ###################################################


  def preactivation(self, w, h_prev, b):
    return (np.dot(w,h_prev) + b)

##################################### Output Function using stable Softmax #####################################################


  def output_function(self, x): 
    z=x-x.max()
    num=np.exp(z)
    denom=np.sum(num,axis=0)
    return num/denom


######################################### function for Forward propogation  ###################################################


  def forwardPropagation(self, input_vector):
    activation = self.activation
    L = self.L   #total layers - input layer
    self.A[0] = self.Biases[0] + np.dot(self.Weights[0], input_vector)
    self.H[0] = self.activation_function(self.A[0], activation)

    for k in range(1,L):
      self.A[k] = np.array(self.preactivation(self.Weights[k], self.H[k-1], self.Biases[k]))
      self.H[k] = self.activation_function(self.A[k], activation)

    self.A[L] = np.array(self.preactivation(self.Weights[L], self.H[L-1], self.Biases[L]))
    self.H[L] = self.output_function(self.A[L])

#---------------------------------------------H[L] = y^---------------------------------------------------
    return self.H[L] 




######################################### function for back propogation  ###################################################


  def backPropagation(self, train_data,truelabel,prediction):
    L = self.L
    activation = self.activation
    e = np.zeros((10,1))
    e[truelabel] = 1
    grad_Weights = [0]*(L+1)
    grad_Biases = [0]*(L+1)
    # a = -(e - self.H[L])
    a = -(e - prediction)

    for k in range(L, -1, -1):
      if k==0:
        grad_Weights[k] = a.dot((train_data).T)
      else:
        grad_Weights[k] = a.dot((self.H[k-1]).T)

      grad_Biases[k] = a
      if k!=0:
        second = self.diff_activation_function(self.A[k-1], activation)
        first = (self.Weights[k].T).dot(a)
        a =  np.multiply(first,second)

    return (grad_Weights,grad_Biases)



######################################### converting label to corresponding one hot vector  ###################################################

  def oneHot(self, num, size = 10):
    vec = [0]*size
    # print("num :",num)
    vec[num] = 1
    vec = np.array(vec)
    vec = vec.reshape(len(vec),1)
    return vec




######################################### Calculate accuracy  of our model  ###################################################

  def calculate_accuracy(self, test_data, test_labels, limit):
    predictions = []
    count = 0
    for example in test_data[:limit]:
      predictions.append(self.forwardPropagation(example.reshape(784,1)))
    predicted_labels = list(map(np.argmax, predictions))
    for p,l in zip(predicted_labels, test_labels[:limit]):
      if p == l:
        count += 1
    return (count/limit)*100



######################################### Calculate cross entropy loss of our model  ###################################################

  def ce_loss(self, data, labels, limit):
    predictions = []
    count = 0
    for example in data[:limit]:
      predictions.append(self.forwardPropagation(example.reshape(784,1)))
    #-log(predictions[labels[i]])
    total_ce_loss = 0
    for i in range(limit):
      total_ce_loss += -math.log(predictions[i][labels[i]])
    average_ce_loss = total_ce_loss / limit
    return average_ce_loss



  def squared_error_loss(self, data, labels, limit):
    predictions = []
    count = 0
    for example in data[:limit]:
      predictions.append(self.forwardPropagation(example.reshape(784,1)))
    #
    total_sqe_loss = 0
    for i in range(limit):
      total_sqe_loss += sum(np.power(self.oneHot(labels[i])-predictions[i],2))
    average_sqe_loss = total_sqe_loss[0] / limit
    return average_sqe_loss




######################################### Training of our model  ###################################################

  def trainingAlgo(self, opt = 'adam', gamma = 0.9, eta = 1e-4, batch_size = 1, max_epochs = 1,alpha = 0.1, eps = 1e-6, beta = 0.9, limit = 500, vlimit= 500, tlimit = 500):


#------------ loading train data, train labels, validation data, validation labels , test data and test labels  -----------------
    (train_data , train_labels , validation_data , validation_labels , test_data , test_labels) = ld.load()


    N = train_data.shape[0]

    #-------------------------L is number of hidden layers in our model-------------------------------------------
    L = self.L    


    if opt=='adam' or 'nadam':
      m_w,m_b = [0]*(L+1),[0]*(L+1)
      step = 1


#---------------------------------run for loops for maximum number of epochs--------------------------------------------
    for i in range(max_epochs):

      print("\nEpoch : ", i+1)
      g_w, g_b = [0]*(L+1), [0]*(L+1)
      count = 0

      X_train,y_train = shuffle(train_data,train_labels,random_state=0)

      for x, y in zip(X_train[:limit], y_train[:limit]):

        predictions=self.forwardPropagation(x.reshape(784,1))
        (grad_Weights, grad_Biases) = self.backPropagation(x.reshape(784,1),y,predictions)

        for j in range(L+1):
          g_w[j] = g_w[j] + grad_Weights[j]
          g_b[j] = g_b[j] + grad_Biases[j]
        count = count + 1

        if count % batch_size == 0 or count == N :

#-----------------------------update weights and biases for momentum based stochastic gradient descent optimizer-----------------------------------------
          if opt=='momentum':
            for j in range(0,L+1):
              self.Update_Weights[j] = gamma * self.Update_Weights[j] + (eta * grad_Weights[j])
              self.Weights[j] =self.Weights[j]-self.Update_Weights[j]

              self.Update_Biases[j] = gamma * self.Update_Biases[j] + (eta * grad_Biases[j])
              self.Biases[j]  = self.Biases[j] - self.Update_Biases[j]



#-----------------------------update weights and biases for Stochastic gradient descent optimizer-----------------------------------------
          elif opt=='sgd':
            for j in range(L + 1):
              self.Weights[j]   = self.Weights[j]-(eta*grad_Weights[j])
              self.Biases[j]  = self.Biases[j]-(eta * grad_Biases[j])



#-----------------------------update weights and biases for nestrov optimizer-----------------------------------------
          elif opt=='nestrov':
            for j in range(L + 1):
              self.Update_Weights[j] = (gamma * self.Update_Weights[j]) + (eta * g_w[j])
              self.Weights[j]  = self.Weights[j] - ( gamma * prev_w[j] + eta * g_w[j] )

              self.Update_Biases[j] = (gamma *  self.Update_Biases[j]) + (eta * g_b[j])
              self.Biases[j]  = self.Biases[j]- ( gamma *  self.Update_Biases[j] + eta * g_b[j] )



#-----------------------------update weights and biases for RMSProp optimizer-----------------------------------------
          elif opt=='rmsprop':
            for j in range(L + 1):
              self.Update_Weights[j] = beta * self.Update_Weights[j] + (1 - beta) * g_w[j]**2
              self.Weights[j] = ( 1 - eta*alpha ) * self.Update_Weights[j] - eta /( np.sqrt(self.Update_Weights[j])+ eps) * grad_Weights[j]

              self.Update_Biases[j]= beta *  self.Update_Biases[j] + (1 - beta) * g_b[j]**2
              self.Biases[j]  = self.Biases[j] -(eta / (np.sqrt( self.Update_Biases[j]) +eps)) * grad_Biases[j]




#-----------------------------update weights and biases for Adam optimizer-----------------------------------------
          elif opt=='adam':
            # pass
            beta1 = 0.9
            beta2 = 0.999
            for j in range(L + 1):

              self.Update_Weights[j] = beta2 * self.Update_Weights[j] + (1 - beta2) * g_w[j]**2
              m_w[j]=beta1*m_w[j] +(1-beta1) * g_w[j]
              m_w_hat=m_w[j]/(1-math.pow(beta1,step))
              v_w_hat=self.Update_Weights[j]/(1-math.pow(beta2,step))
              self.Weights[j]=(1-eta*alpha/N)*self.Weights[j] -(eta/(np.sqrt(v_w_hat)+eps))*m_w_hat

              self.Update_Biases[j] = beta2 * self.Update_Biases[j] + (1 - beta2) * g_b[j]**2
              m_b[j]=beta1*m_b[j] +(1-beta1) * g_b[j]
              m_b_hat=m_b[j]/(1-math.pow(beta1,step))
              v_b_hat=self.Update_Biases[j]/(1-math.pow(beta2,step))
              self.Biases[j] = self.Biases[j] -(eta / (np.sqrt(v_b_hat)+eps))* m_b_hat



#-----------------------------updating weights and biases for Nadam optimizer-----------------------------------------
          elif opt=='nadam':
            # pass
            beta1 = 0.9
            beta2 = 0.999
            for j in range(L + 1):

              self.Update_Weights[j] = beta2 * self.Update_Weights[j] + (1 - beta2) * g_w[j] ** 2
              m_w[j] = beta1 * m_w[j] + (1 - beta1) * g_w[j]
              m_w_hat = m_w[j] / (1 - math.pow(beta1, step))
              m_w_hat = beta1 * m_w_hat + ((1 - beta1) * g_w[j]) / (1 - math.pow(beta1, step))
              v_w_hat=self.Update_Weights[j]/(1-math.pow(beta2,step))
              self.Weights[j] = (1 - eta * alpha) * self.Weights[j] -(eta /( np.sqrt(v_w_hat) + eps)) * m_w_hat

              self.Update_Biases[j] = beta2 * self.Update_Biases[j] + (1 - beta2) * g_b[j] ** 2
              m_b[j] = beta1 * m_b[j] + (1 - beta1) * g_b[j]
              m_b_hat = beta1 * (m_b[j] / (1 - math.pow(beta1, step))) + ((1 - beta1) * g_b[j]) / (1 - math.pow(beta1, step))
              v_b_hat=self.Update_Biases[j]/(1-math.pow(beta2,step))
              self.Biases[j] = self.Biases[j]-(eta/(np.sqrt(v_b_hat)+eps))*m_b_hat

          else:
            pass

#--------------------------------------------weights and biases has been updated---------------------------------------------
          g_w=[0]*(L+1)
          g_b=[0]*(L+1)
          step = step + 1

     #--------------------------------------------calculating training accuracy-----------------------------------------
      train_acc = self.calculate_accuracy(train_data, train_labels, limit)


      #--------------------------------------------calculating validation accuracy-----------------------------------------
      val_acc = self.calculate_accuracy(validation_data, validation_labels, vlimit)


      #--------------------------------------------calculating testing accuracy-----------------------------------------
      test_acc = self.calculate_accuracy(test_data, test_labels, tlimit)


      #----------------------------------------   calculating cross entropy loss for training  ----------------------------------------
      train_ce_loss = self.ce_loss(train_data, train_labels, limit)


      #----------------------------------------   calculating cross entropy loss for validation  ----------------------------------------
      valid_ce_loss = self.ce_loss(validation_data, validation_labels, vlimit)

      
      #----------------------------------------   calculating cross entropy loss for testing ----------------------------------------
      test_ce_loss = self.ce_loss(test_data, test_labels, tlimit)

      #----------------------------------------   Logging the accuracies and losses to wandb  ----------------------------------------
      wandb.log({"train_acc": train_acc, "val_acc": val_acc, "test_acc":test_acc, "train_ce_loss":train_ce_loss, "valid_ce_loss":valid_ce_loss, "test_ce_loss":test_ce_loss, "epoch": i})


#-----------------------------return from function training algo -----------------------------------------------------------
    return




#================================================================== end of class ========================================================================================================================================================================


In [6]:
#================================================================== default intialisation ========================================================================================================================================================================

number_of_hidden_layers = 1
number_of_neurons_in_hidden_layer = 128
activation_function = "sigmoid"
weight_initialisation = "xavier"
optimiser = "adam"
gamma = 0.9
eta = 1e-4
batch_size = 1
max_epochs = 1
alpha = 0.005
train_limit = len(train_data)
test_limit = len(test_data)
validation_limit = len(validation_data)

#===========================================================================================================================================================================================================================================================


In [7]:
#=============================================================================== Sweep configuration ========================================================================================================================================================================

sweep_config={"method":"random",
              'metric' : {
        'name' : 'train_acc',
        'goal' : 'maximize',
    },
    "parameters":{
    "num_hidden":{"values":[4]},
    "hidden_layer_size":{"values":[32,128]},
    "learning_rate":{"values":[1e-3,1e-4]},
    "num_epochs":{"values":[10]},
    "batch_size":{"values":[16,32,64]},
    "optimisation":{"values":["adam","nadam"]},
    "activation_function":{"values":["sigmoid","tanh"]},
    "weight_initialisation":{"values":["random","xavier"]},
    "weight_decay":{"values":[0.0005,0]}
}
              }


sweep_id = wandb.sweep(sweep_config, project="sweep_1")

#===================================================================================================================================================================================================================================================================


Create sweep with ID: 5rc0y5h7
Sweep URL: https://wandb.ai/anshikag_2210/sweep_1/sweeps/5rc0y5h7


In [8]:
#=============================================================================== Run function for running the model ========================================================================================================================================================================


def run():
  wb = wandb.init()
  config = wb.config
  
  ffnn = FeedForwardNeuralNetwork(784, 10, config.hidden_layer_size, config.num_hidden, config.activation_function, config.weight_initialisation)
  ffnn.trainingAlgo(opt = config.optimisation, gamma = 0.9, eta = config.learning_rate, batch_size = config.batch_size, max_epochs = config.num_epochs, alpha = config.weight_decay,  limit = train_limit, vlimit = validation_limit, tlimit = test_limit)
  return


In [None]:
#=============================================================================== wandb agent function ========================================================================================================================================================================

wandb.agent(sweep_id, run)

[34m[1mwandb[0m: Agent Starting Run: lle1c7oe with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	hidden_layer_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_hidden: 4
[34m[1mwandb[0m: 	optimisation: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialisation: random
[34m[1mwandb[0m: Currently logged in as: [33manshikag_2210[0m (use `wandb login --relogin` to force relogin)



Epoch :  1

Epoch :  2

Epoch :  3

Epoch :  4

Epoch :  5

Epoch :  6

Epoch :  7

Epoch :  8

Epoch :  9

Epoch :  10


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_acc,97.75
val_acc,96.36667
test_acc,95.87
train_ce_loss,0.07161
valid_ce_loss,0.14505
test_ce_loss,0.15689
epoch,9.0
_runtime,609.0
_timestamp,1616085882.0
_step,9.0


0,1
train_acc,▁▄▆▆▇▇██▇█
val_acc,▁▃▆▅▆██▇▅▅
test_acc,▁▄▇▇▇▇▇█▇▇
train_ce_loss,█▅▃▃▂▂▁▁▂▁
valid_ce_loss,█▄▁▂▂▃▃▃▅█
test_ce_loss,█▅▁▂▂▃▃▂▄▅
epoch,▁▂▃▃▄▅▆▆▇█
_runtime,▁▂▃▃▄▅▆▆▇█
_timestamp,▁▂▃▃▄▅▆▆▇█
_step,▁▂▃▃▄▅▆▆▇█


[34m[1mwandb[0m: Agent Starting Run: 8x7ssufy with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hidden_layer_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_hidden: 4
[34m[1mwandb[0m: 	optimisation: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialisation: xavier



Epoch :  1

Epoch :  2

Epoch :  3

Epoch :  4

Epoch :  5

Epoch :  6

Epoch :  7

Epoch :  8

Epoch :  9

Epoch :  10


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_acc,99.28704
val_acc,97.73333
test_acc,97.62
train_ce_loss,0.02163
valid_ce_loss,0.08838
test_ce_loss,0.09446
epoch,9.0
_runtime,1427.0
_timestamp,1616087314.0
_step,9.0


0,1
train_acc,▁▃▄▅▇▆▆███
val_acc,▁▄▄▅▇▅▆█▇█
test_acc,▁▃▄▄▆▅▆█▇█
train_ce_loss,█▆▅▄▂▃▂▁▁▁
valid_ce_loss,█▄▄▆▂▄▆▁▄▁
test_ce_loss,█▅▄▅▁▄▄▁▂▁
epoch,▁▂▃▃▄▅▆▆▇█
_runtime,▁▂▃▃▄▅▆▆▇█
_timestamp,▁▂▃▃▄▅▆▆▇█
_step,▁▂▃▃▄▅▆▆▇█


[34m[1mwandb[0m: Agent Starting Run: 9ceeg8vc with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	hidden_layer_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_hidden: 4
[34m[1mwandb[0m: 	optimisation: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialisation: xavier



Epoch :  1

Epoch :  2

Epoch :  3

Epoch :  4

Epoch :  5

Epoch :  6


In [None]:
#=============================================================================== wandb finish function ========================================================================================================================================================================

wandb.finish()