### IMPORTING LIBRARIES

In [1]:
from keras.datasets import fashion_mnist
import numpy as np
import matplotlib.pyplot as plt
import copy 
import random
import tensorflow as tf
from tqdm import tqdm

In [2]:
!pip install wandb -qqq
import wandb

In [3]:
# A function to load and return our dataset followed by another function to convert label to name
def load_dataset():
    (X_train, Y_train), (X_test, Y_test) = fashion_mnist.load_data()
    return {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_test': X_test,
            'Y_test': Y_test
        }

#A dictionary is created for the class labels and class id 
class_labels={0:'T-shirt',1:'Trouser',2:'Pullover',3:'Dress',4:'Coat',5:'Sandal',6:'Shirt',7:'Sneaker',8:'Bag',9:'Ankle_Boot'}

def label_to_name(label):
  if enumerate(label):
    l_names=[]
    for l in label:
      l_names.append(class_labels[l])
    return l_names
  else:
    return class_labels[label]

### WANDB LOGIN

In [4]:
#Wandb login
!wandb login --relogin
entity_name="bs20b012"
project_name="CS6910"

wandb: ERROR Find detailed error logs at: C:\Users\bones\wandb\debug-cli.bones.log
Error: api_key not configured (no-tty). call wandb login [your_api_key]


In [5]:
''' 
A function used for loading the images and plots from the dataset acordingly
'''
def log_images():
  images=[]
  labels=[]
  dataset=load_dataset()
  X_train=dataset['X_train']
  y_train=dataset['Y_train']
  wandb.init(entity=entity_name,project=project_name, name="log_images")
  for i in range(100):
    if len(labels)==10:
      break
    if class_labels[y_train[i]] not in labels:
      images.append(X_train[i])
      labels.append(class_labels[y_train[i]])
  wandb.log({"Images": [wandb.Image(img, caption=caption) for img, caption in zip(images,labels)]})

In [6]:
''' 
The following class conatins all required functions for the layer construction such as,
1)Sigmoid
2)Sigmoid derivative
3)Relu
4)Relu derivative
5)Tanh
6)Tanh derivative
7)Softmax derivative

Softmax (final layer activation)
Derivative function to calculate gradient 
'''

class activation(): 
  def __init__(self,a):
    self.a=a

  def sigmoid(self,a):
    try:
      return (1.0/(1.0+np.exp(-a)))
    except:
      print("error")

  def relu(self,a):
    return (np.maximum(0,a))

  def tanh(self,a):
    return np.tanh(a)

  def softmax(self,a):
    try:
      return(np.exp(a)/np.sum(np.exp(a)))
    except:
      print("error")

  def sigmoid_derivative(self,x):
    return self.sigmoid(x)*(1-self.sigmoid(x))

  def tanh_derivative(self,x):
    return 1.0 -self.tanh(x)**2

  def relu_derivative(self,x):
    return 1. * (x>0)
     
  def softmax_derivative(self,x):
    return self.softmax(x) * (1-self.softmax(x))

  def derivative(self,x,activation):
    if activation == "sigmoid":
      return self.sigmoid_derivative(x)
    elif activation == "tanh":
      return self.tanh_derivative(x)
    elif activation == "relu":
      return self.relu_derivative(x)

In [7]:
''' 
This class contains several weight init methodssuch as:

1) Xavier(layers) : With n being the number of inputs to the node, the xavier initialization method 
                generates a random number with a homogeneous probability distribution (U) 
                between -(1/sqrt(n)) and 1/sqrt(n).

2) Random(layers) : Random initialization using numpy random library

'''

class weights():
  def __init__(self,layers):
    self.layers=layers

  def Xavier(self,layers):
    params = {}
    for i in range(1,len(layers)):
       norm_xav=np.sqrt(6)/np.sqrt(layers[i]+layers[i-1])
       params["w"+str(i)]=np.random.randn(layers[i],layers[i-1])*norm_xav
       params["b"+str(i)]=np.zeros((layers[i],1))
    return params

  def Random(self,layers):
    params = {}
    for i in range(1,len(layers)):
       params["w"+str(i)]=0.01*np.random.randn(layers[i],layers[i-1])
       params["b"+str(i)]=0.01*np.random.randn(layers[i],1)
    return params

  def weight_init(self,init_type = "random"):
    params={}
    if(init_type=="xavier"):
      params = self.Xavier(self.layers)
    elif(init_type=="random"):
      params = self.Random(self.layers)
    else:
      print("invalid activation function")
    return params

In [8]:
''' 
LOSS FUNCTIONS:

The following loss functions are defined in the given snippet below,
1) Mean Squared loss
2) CrossEntropy loss



L2 REGULARIZATION:

The regularization method of L2 regularization method is included, 
it is added to the existing loss function chosen 
The value of lambda is an hyperparameter that  can be personalized to user's wish
'''

def squared_loss(y, y_hat):
  error = np.sum(((y - y_hat)**2) / (2 * len(y)))
  return error

def CrossEntropy(y, y_hat):
  if y_hat == 0:
    error = - np.sum( np.multiply(y ,np.clip(np.log(y_hat,1e-8,1-1e-8) )))/len(y)
  else:
    error = - np.sum( np.multiply(y , np.log(y_hat)))/len(y)
  return error

def loss_calc(loss_name, y, y_hat, lambd, layers, parameters):
  loss=0
  if(loss_name == "squared_loss"):
    loss=squared_loss(y, y_hat)
  elif(loss_name == "cross_entropy"):
    loss= CrossEntropy(y, y_hat)

  reg_loss = 0.0
  for i in range(len(layers)-1, 0, -1):
    reg_loss = reg_loss + (np.sum(parameters["w"+str(i)]))**2
  reg_loss = loss + ((lambd/(2*len(y)))*(reg_loss))
  return reg_loss

In [9]:
'''
The Class Network is used to perform forward as well as backward pass 
FUNCTIONS:
def __init__(X,y,params,active,layers,loss_type) is the function used for intializing instances 
forward_prop() is the function for forward propagation
backward_prop() is the function for backward propagation

'''

class network():
  def __init__(self,X,y,params,active,layers,loss_type):
    self.X=X
    self.y=y
    self.params=params
    self.active=active
    self.layers=layers
    self.loss_type=loss_type

  def forward_prop(self):
   out=copy.deepcopy(self.X)
   out=out.reshape(-1,1)
   h=[out]
   a=[out] 

   act=activation(a)

   if(self.active=="sigmoid"):
     for i in range(1,len(self.layers)-1):
       weights = self.params["w"+str(i)]
       biases = self.params["b"+str(i)]

       out = np.dot(weights,h[i-1])+biases
       a.append(out)
       post_a = act.sigmoid(out)
       h.append(post_a)
  
   elif(self.active=="tanh"):
     for i in range(1,len(self.layers)-1):
       weights=self.params["w"+str(i)]
       biases=self.params["b"+str(i)]
      
       out=np.dot(weights,h[i-1])+biases
       a.append(out)
       post_a=act.tanh(out)
       h.append(post_a)
  
   elif(self.active=="relu"):
     for i in range(1,len(self.layers)-1):
       weights=self.params["w"+str(i)]
       biases=self.params["b"+str(i)]
      
       out=np.dot(weights,h[i-1])+biases
       a.append(out)
       post_a=act.relu(out)
       h.append(post_a)       

   else:
     print("Invalid activation function") 
   weights=self.params["w"+str(len(self.layers)-1)]
   biases=self.params["b"+str(len(self.layers)-1)]
  
   out=np.dot(weights,h[len(self.layers)-2])+biases
   a.append(out)
   y_hat=act.softmax(out)
   h.append(y_hat)
   return h,a,y_hat

  def backward_prop(self,y,y_hat,h,a,params,layers):
    grad = {}
    act=activation(self.active)
    if self.loss_type == "squared_loss":
      grad["dh"+str(len(layers)-1)] = (y_hat - y)
      grad["da"+str(len(layers)-1)] = (y_hat - y) * act.softmax_derivative(a[len(layers)-1])

    elif self.loss_type == 'cross_entropy':
      grad["da"+str(len(layers)-1)] = -(y-y_hat)
      grad["dh"+str(len(layers)-1)] = -(y/y_hat)

    for i in range(len(layers)-1, 0, -1 ):
      grad["dw"+str(i)] = np.dot(grad["da"+str(i)], np.transpose(h[i-1]))
      grad["db"+str(i)] = grad["da"+str(i)]
      if i > 1:
        grad["dh"+str(i-1)] = np.dot(np.transpose(params["w"+str(i)]), grad["da"+str(i)])
        grad["da"+str(i-1)] = np.multiply(grad["dh" + str(i-1)], act.derivative(a[i-1],self.active))
    return grad

### DEFINING IMPORTANT FUNCTIONS

In [10]:
''' 
Following are the functions for running inference, accuracy calculation, and gradient calculation
1) run_inference : a function to get preds from the model
2) accuracy_calc : a function to calc the accuracy
3) calculate_grad : a function to calculate gradients
'''

def accuracy_calc(res,y_t):
    acc=0.0   
    for x in range(len(res)):
      if(res[x].argmax()==y_t[x].argmax()):
        acc+=1
    acc=acc/len(y_t)
    return(acc*100)


def run_inference(X,y,parameters,activation,layers):
    result = []
    for i in range(len(X)):
      nn=network(X[i], y[i], parameters, activation, layers,"squared_loss")
      h,a,y_hat = nn.forward_prop()
      y_hat = y_hat.flatten()
      result.append(y_hat)
    return result


def calculate_grad(X, Y, parameters, activation, layers, loss_function):
  grads={}
  grads.clear() 
  for j in range(len(X)):
    y = np.reshape(Y[j], (-1,1))

    nn=network(X[j], y, parameters, activation, layers, loss_function)
    h,a,y_hat = nn.forward_prop()
    new_grads = nn.backward_prop(y,y_hat,h,a,parameters,layers)

    if j == 0:
      grads = copy.deepcopy(new_grads)
    else:
      for k in range(len(layers)-1,0,-1):
        grads["dw"+str(k)] += new_grads["dw"+str(k)]
        grads["db"+str(k)] += new_grads["db"+str(k)]
  return grads

In [11]:
''' 
GRADIENTS FUNCTIONS:
The following functions include several functions for performing gradient descent.
The arguments are: X_train, y_train, eta, max_epochs, layers, mini_batch_size, lambd,loss_function, activation, parameters,optimiser,wandb_log
The function finds derivatives per layer and updates the weights and biases accordingly 

The classes for each optimizers are given below,
Optimisers Classes are:
1)SGD
2)NAG
3)RMSprop
4)Momentumgd
5)adam
6)nadam

'''


def gradient_descent(X_train, y_train, eta, max_epochs, layers, mini_batch_size, lambd,loss_function, activation, parameters,optimiser,wandb_log=False):
  grads={}
  train_loss = []
  val_loss = []
  train_acc = []
  val_acc = []

  for t in tqdm(range(max_epochs)):
    for i in range(0, len(X_train), mini_batch_size):

      grads.clear()

      if str(optimiser) == "nesterovacc_gd":
        opt=optimiser(grads, eta, max_epochs,layers,mini_batch_size,lambd,parameters,i,t)
        param_lookahead,update_history=opt.paramlookahead()

      X = X_train[i:i + mini_batch_size]
      Y = y_train[i:i + mini_batch_size]
      
      if str(optimiser) == "nesterovacc_gd":
        grads = calculate_grad(X,Y,param_lookahead,activation,layers,loss_function)
      else: 
        grads = calculate_grad(X,Y,parameters,activation,layers,loss_function)

      opt=optimiser(grads, eta, max_epochs,layers,mini_batch_size,lambd,parameters,i,t)
      parameters=opt.get_params()
    
    #Calculating train loss 
    res = run_inference(X_train,y_train,parameters, activation, layers)
    train_err = loss_calc(loss_function,y_train,res,lambd,layers,parameters) 
    train_ac=accuracy_calc(res, y_train)
    train_loss.append(train_err)
    train_acc.append(train_ac)

    #Calculating validation loss
    res = run_inference(X_val, y_val, parameters, activation, layers)
    val_err = loss_calc(loss_function, y_val, res, lambd, layers, parameters )
    val_ac=accuracy_calc(res, y_val)
    val_loss.append(val_err)
    val_acc.append(val_ac)

    if(wandb_log==True):
      log_dict = {"Train_Accuracy": train_ac, "Validation_Accuracy": val_ac, \
                  "Train_Loss": train_err, "Validation_loss": val_err, "epoch": t}
                  
      wandb.log(log_dict)

  return parameters, train_acc, val_acc

### CLASSES OF OPTIMIZERS



In [12]:
class stochastic_gd():
  def __init__(self,grads, eta, max_epochs,layers,mini_batch_size,lambd,parameters,i,t):
    self.grads=grads
    self.eta=eta
    self.layers=layers
    self.mini_batch_size=mini_batch_size
    self.parameters=parameters
    self.lambd=lambd
    self.i=i
    self.t=t

  def get_params(self):
    for j in range(len(self.layers)-1,0,-1):
        self.parameters["w"+str(j)] = (1-((self.eta*self.lambd)/self.mini_batch_size))*self.parameters["w"+str(j)] - (self.eta * self.grads["dw"+str(j)])
        self.parameters["b"+str(j)] = self.parameters["b"+str(j)] - (self.eta * self.grads["db"+str(j)])
    return self.parameters



class momentum_gd():
  def __init__(self,grads, eta, max_epochs,layers,mini_batch_size,lambd,parameters,i,t):
    self.grads=grads
    self.eta=eta
    self.layers=layers
    self.mini_batch_size=mini_batch_size
    self.parameters=parameters
    self.lambd=lambd
    self.gamma=0.9
    self.i=i
    self.t=t

  def get_update_history(self):
    update_history={}
    for j in range(len(self.layers)-1, 0, -1):
          update_history["w"+str(j)] = self.eta*self.grads["dw"+str(j)]
          update_history["b"+str(j)] = self.eta*self.grads["db"+str(j)]
    for j in range(len(self.layers)-1, 0, -1):
          update_history["w"+str(j)] = (self.gamma*update_history["w"+str(j)]) + (self.eta*self.grads["dw"+str(j)])
          update_history["b"+str(j)] = (self.gamma*update_history["b"+str(j)]) + (self.eta*self.grads["db"+str(j)])
    return update_history

  def get_params(self):
    update_history=self.get_update_history()
    for j in range(len(self.layers)-1,0,-1):
        self.parameters["w"+str(j)] = (1-((self.eta*self.lambd)/self.mini_batch_size))*self.parameters["w"+str(j)] - update_history["w"+str(j)]
        self.parameters["b"+str(j)] = self.parameters["b"+str(j)] - update_history["b"+str(j)]
    return self.parameters


class nesterovacc_gd():
  def __init__(self,grads, eta, max_epochs,layers,mini_batch_size,lambd,parameters,i,t):
    self.grads=grads
    self.eta=eta
    self.layers=layers
    self.mini_batch_size=mini_batch_size
    self.parameters=parameters
    self.lambd=lambd
    self.i=i
    self.t=t

  def paramlookahead(self):
    update_history={}
    if self.i==0:
        param_lookahead = copy.deepcopy(self.parameters)
    else:
        for j in range(len(self.layers)-1, 0, -1):
          param_lookahead['w'+str(j)] = self.parameters['w'+str(j)] + (self.gamma*update_history["w"+str(j)])
    return param_lookahead,update_history

  def get_params(self,update_history):
    param_lookahead,update_history=self.paramlookahead()
    if self.i == 0 :
        for j in range(len(self.layers)-1, 0, -1):
          update_history["w"+str(j)] = self.eta*self.grads["dw"+str(j)]
          update_history["b"+str(j)] = self.eta*self.grads["db"+str(j)]
    else:
        for j in range(len(self.layers)-1, 0, -1):
          update_history["w"+str(j)] = (self.gamma*update_history["w"+str(j)]) + (self.eta*self.grads["dw"+str(j)])
          update_history["b"+str(j)] = (self.gamma*update_history["b"+str(j)]) + (self.eta*self.grads["db"+str(j)])
    for j in range(len(self.layers)-1,0,-1):
        self.parameters["w"+str(j)] = (1-((self.eta*self.lambd)/self.mini_batch_size))*self.parameters["w"+str(j)] - update_history["w"+str(j)]
        self.parameters["b"+str(j)] = self.parameters["b"+str(j)] - update_history["b"+str(j)]
    return self.parameters



class rmsprop():
  def __init__(self,grads, eta, max_epochs,layers,mini_batch_size,lambd,parameters,i,t):
    self.grads=grads
    self.eta=eta
    self.layers=layers
    self.mini_batch_size=mini_batch_size
    self.parameters=parameters
    self.lambd=lambd
    self.i=i
    self.beta = 0.9 
    self.epsilon=1e-8
    self.t=t

  def momenta(self):
    update_history={}
    v={}
    for i in range(len(self.layers)-1,0,-1):
      update_history["w"+str(i)]=np.zeros((self.layers[i],self.layers[i-1]))
      update_history["b"+str(i)]=np.zeros((self.layers[i],1))
    for i in range(len(self.layers)-1,0,-1):
      v["w"+str(i)]=np.zeros((self.layers[i],self.layers[i-1]))
      v["b"+str(i)]=np.zeros((self.layers[i],1))
    return v,update_history
     
  def get_params(self):
    v,update_history=self.momenta()
    for iq in range(len(self.layers)-1,0,-1):
        v["w"+str(iq)]=self.beta*v["w"+str(iq)]+(1-self.beta)*self.grads["dw"+str(iq)]**2
        v["b"+str(iq)]=self.beta*v["b"+str(iq)]+(1-self.beta)*self.grads["db"+str(iq)]**2     
        update_history["w"+str(iq)]=self.eta*np.multiply(np.reciprocal(np.sqrt(v["w"+str(iq)]+self.epsilon)),self.grads["dw"+str(iq)])
        update_history["b"+str(iq)]=self.eta*np.multiply(np.reciprocal(np.sqrt(v["b"+str(iq)]+self.epsilon)),self.grads["db"+str(iq)])
    for j in range(len(self.layers)-1,0,-1):
        self.parameters["w"+str(j)] = (1-((self.eta*self.lambd)/self.mini_batch_size))*self.parameters["w"+str(j)] - update_history["w"+str(j)]
        self.parameters["b"+str(j)] = self.parameters["b"+str(j)] - update_history["b"+str(j)]
    return self.parameters


class adam():
  def __init__(self,grads, eta, max_epochs,layers,mini_batch_size,lambd,parameters,i,t):
    self.grads=grads
    self.eta=eta
    self.layers=layers
    self.mini_batch_size=mini_batch_size
    self.parameters=parameters
    self.lambd=lambd
    self.i=i
    self.beta1=0.9 
    self.epsilon=1e-8
    self.beta2=0.999
    self.t=t
  
  def momenta(self):
    update_history={}
    v={}
    m={}
    for i in range(len(self.layers)-1,0,-1):
      update_history["w"+str(i)]=np.zeros((self.layers[i],self.layers[i-1]))
      update_history["b"+str(i)]=np.zeros((self.layers[i],1))
    for i in range(len(self.layers)-1,0,-1):
      v["w"+str(i)]=np.zeros((self.layers[i],self.layers[i-1]))
      v["b"+str(i)]=np.zeros((self.layers[i],1))
    for i in range(len(self.layers)-1,0,-1):
      m["w"+str(i)]=np.zeros((self.layers[i],self.layers[i-1]))
      m["b"+str(i)]=np.zeros((self.layers[i],1))
    return m,v,update_history
     
  def get_params(self):
    m,v,update_history=self.momenta()
    for iq in range(len(self.layers)-1,0,-1):
          m["w"+str(iq)]=self.beta1*m["w"+str(iq)]+(1-self.beta1)*self.grads["dw"+str(iq)]
          m["b"+str(iq)]=self.beta1*m["b"+str(iq)]+(1-self.beta1)*self.grads["db"+str(iq)]    
          v["w"+str(iq)]=self.beta2*v["w"+str(iq)]+(1-self.beta2)*(self.grads["dw"+str(iq)])**2
          v["b"+str(iq)]=self.beta2*v["b"+str(iq)]+(1-self.beta2)*(self.grads["db"+str(iq)])**2
          mw_hat=m["w"+str(iq)]/(1-np.power(self.beta1,self.t+1))
          mb_hat=m["b"+str(iq)]/(1-np.power(self.beta1,self.t+1))
          vw_hat=v["w"+str(iq)]/(1-np.power(self.beta2,self.t+1))
          vb_hat=v["b"+str(iq)]/(1-np.power(self.beta2,self.t+1))
          update_history["w"+str(iq)]=self.eta*np.multiply(np.reciprocal(np.sqrt(vw_hat+self.epsilon)),mw_hat)
          update_history["b"+str(iq)]=self.eta*np.multiply(np.reciprocal(np.sqrt(vb_hat+self.epsilon)),mb_hat)

    for j in range(len(self.layers)-1,0,-1):
          self.parameters["w"+str(j)] = (1-((self.eta*self.lambd)/self.mini_batch_size))*self.parameters["w"+str(j)] - update_history["w"+str(j)]
          self.parameters["b"+str(j)] = self.parameters["b"+str(j)] - update_history["b"+str(j)]
    return self.parameters


class nadam():
  def __init__(self,grads, eta, max_epochs,layers,mini_batch_size,lambd,parameters,i,t):
    self.grads=grads
    self.eta=eta
    self.layers=layers
    self.mini_batch_size=mini_batch_size
    self.parameters=parameters
    self.lambd=lambd
    self.i=i
    self.beta1=0.9 
    self.epsilon=1e-8
    self.beta2=0.999
    self.t=t
  
  def momenta(self):
    update_history={}
    v={}
    m={}
    for i in range(len(self.layers)-1,0,-1):
      update_history["w"+str(i)]=np.zeros((self.layers[i],self.layers[i-1]))
      update_history["b"+str(i)]=np.zeros((self.layers[i],1))
    for i in range(len(self.layers)-1,0,-1):
      v["w"+str(i)]=np.zeros((self.layers[i],self.layers[i-1]))
      v["b"+str(i)]=np.zeros((self.layers[i],1))
    for i in range(len(self.layers)-1,0,-1):
      m["w"+str(i)]=np.zeros((self.layers[i],self.layers[i-1]))
      m["b"+str(i)]=np.zeros((self.layers[i],1))
    return m,v,update_history
     
  def get_params(self):
    m,v,update_history=self.momenta()
    for iq in range(len(self.layers)-1,0,-1):
          m["w"+str(iq)]=self.beta1*m["w"+str(iq)]+(1-self.beta1)*self.grads["dw"+str(iq)]
          m["b"+str(iq)]=self.beta1*m["b"+str(iq)]+(1-self.beta1)*self.grads["db"+str(iq)]    
          v["w"+str(iq)]=self.beta2*v["w"+str(iq)]+(1-self.beta2)*(self.grads["dw"+str(iq)])**2
          v["b"+str(iq)]=self.beta2*v["b"+str(iq)]+(1-self.beta2)*(self.grads["db"+str(iq)])**2
          mw_hat=m["w"+str(iq)]/(1-np.power(self.beta1,self.t+1))
          mb_hat=m["b"+str(iq)]/(1-np.power(self.beta1,self.t+1))
          vw_hat=v["w"+str(iq)]/(1-np.power(self.beta2,self.t+1))
          vb_hat=v["b"+str(iq)]/(1-np.power(self.beta2,self.t+1))
          update_history["w"+str(iq)]=self.eta*np.multiply(np.reciprocal(np.sqrt(vw_hat+self.epsilon)),(self.beta1*mw_hat+(1-self.beta1)*self.grads["dw"+str(iq)]))*(1/(1-np.power(self.beta1,self.t+1)))
          update_history["b"+str(iq)]=self.eta*np.multiply(np.reciprocal(np.sqrt(vb_hat+self.epsilon)),(self.beta1*mb_hat+(1-self.beta1)*self.grads["db"+str(iq)]))*(1/(1-np.power(self.beta1,self.t+1)))

    for j in range(len(self.layers)-1,0,-1):
          self.parameters["w"+str(j)] = (1-((self.eta*self.lambd)/self.mini_batch_size))*self.parameters["w"+str(j)] - update_history["w"+str(j)]
          self.parameters["b"+str(j)] = self.parameters["b"+str(j)] - update_history["b"+str(j)]

    return self.parameters

### DATASET LOADING AND PRE-PROCESSING


In [13]:
''' 
We use test train split from sklearn, 0.1 times the train data is used for validation
we create an empty train val test list and append normalised image data to it 
'''
# CLASSES AND LABELS DEFINED
from sklearn.model_selection import train_test_split
(train_x,train_y),(test_x,test_y)=fashion_mnist.load_data()
num_classes = 10
labels=['T-shirt/top','Trouser','Pullover','Dress','Coat','Sandal','Shirt','Sneaker','Bag','Ankle boot']


# TRAIN VALIDATION SPLIT IN RATIO 90:10
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.1, random_state=40)
  
X_train=np.zeros((len(train_x),784))
X_val=np.zeros((len(val_x),784))
X_test=np.zeros((len(test_x),784))
  
# IMAGE PIXEL NORMALIZATION
for i in range(len(train_x)):
  X_train[i]=(copy.deepcopy(train_x[i].flatten()))/255.0 
for i in range(len(val_x)):
  X_val[i]=(copy.deepcopy(val_x[i].flatten()))/255.0
for i in range(len(test_x)):
  X_test[i]=(copy.deepcopy(test_x[i].flatten()))/255.0

# ONE HOT ENCODING
y_train = np.zeros((train_y.size, 10))
y_train[np.arange(train_y.size), train_y] = 1

y_val = np.zeros((val_y.size, 10))
y_val[np.arange(val_y.size), val_y] = 1

y_test = np.zeros((test_y.size, 10))
y_test[np.arange(test_y.size), test_y] = 1


In [14]:
'''
The following is the training function used for traininth model appropriately
'''

def train(X_train=X_train, y_train=y_train, layers=[784,16,10],wandb_log=True, learning_rate = 0.0001, initialization_type = "random", activation_function = "sigmoid", loss_function = "cross_entropy", mini_batch_Size = 32, max_epochs = 5, lambd = 0,optimization_function = adam):

  # Configuration dictionary is being intialized as we do usually in the case of wandb visualization
  config_defaults = {
      'number_hidden_layers': 2,
      'number_neurons': 32,
      'learning_rate': 0.001,
      'initialization_type': "xavier",
      'activation_function':'sigmoid',
      'mini_batch_size' : 64,
      'max_epochs': 5,
      'lambd': 0,
      'optimization_function': "nadam",
      'loss_function' : "cross_entropy"
  }

  # Wandb run is being intialized uing init and config in-built functions
  wandb.init(config=config_defaults)
  config = wandb.config

  # Neural layers being defined
  layers = [784]
  for i in range(config.number_hidden_layers):
    layers = layers + [config.number_neurons]
  layers  = layers + [10]

  # Extraction of hyperparameters and calling them
  learning_rate = config.learning_rate
  initialization_type = config.initialization_type
  activation_function = config.activation_function
  loss_function = config.loss_function
  mini_batch_size = config.mini_batch_size
  max_epochs = config.max_epochs
  lambd = config.lambd
  opt_fun = config.optimization_function
  hidden_layers=config.number_hidden_layers

  if opt_fun == "adam":
    optimization_function = adam
  elif opt_fun == "nadam":
    optimization_function = nadam
  elif opt_fun == "stochastic_gd":
    optimization_function = stochastic_gd
  elif opt_fun == "momentum_gd":
    optimization_function = momentum_gd
  elif opt_fun == "nesterov-acc_gd":
    optimization_function = nesterovacc_gd
  elif opt_fun == "rmsprop":
    optimization_function = rmsprop
  else:
    print("Wrong optimization function")
    exit()

  name_run = str(hidden_layers) + "_" + initialization_type[0] + "_" + \
  activation_function[:4] + "_" + str(learning_rate) + "_" + opt_fun[:4]

  # wandb run name and log defined
  wandb.run.name = name_run
  wandb_log=True
  
  w=weights(layers)
  parameters = w.weight_init(init_type = initialization_type)
  parameters, train_acc, val_acc = gradient_descent(X_train, y_train,learning_rate, max_epochs, layers, mini_batch_Size, lambd, loss_function, activation_function, parameters,optimization_function,wandb_log)
  
  # printing accuracies
  print("Training Accuracy:",train_acc[-1])
  print("Validation Accuracy:",val_acc[-1])
  
  # running and saving the wandb run progresses
  wandb.run.save()
  wandb.run.finish()

In [15]:
''' 
Hyperparameters sweeps using wandb functionality
Optimally Bayes sweep function is chosen but the following are a few more sweep functions
1)Grid search
2)Random
3)Bayes - widely used

'''

def do_sweep(entity_name,project_name):

  hyperparameters = {
      "learning_rate":{
        'values': [0.001, 0.0001]
      },

      "number_hidden_layers": {
          'values' : [3, 4, 5]
      },

      "number_neurons": {
        'values': [32, 64, 128]
      },

      "initialization_type": {
          'values' : ["xavier", "random"]
      },

      "activation_function": {
          'values': ["sigmoid", "tanh", "relu"]
      },

      "mini_batch_size": {
          'values': [16,32,64]
      },

      "max_epochs": {
          'values': [5, 10]
      },

      "lambd": {
          'values': [0, 0.0005, 0.5]
      },

      "optimization_function": {
          'values': ["stochastic_gd","momentum_gd","rmsprop","adam","nadam"]
      }

  }


  #Using bayes method for hyperparameter sweeps to curb the unnecessary configurations
  sweep_config = {
      'method' : 'bayes',
      'metric' :{
          'name': 'Validation_Accuracy',
          'goal': 'maximize'
      },
      'parameters': hyperparameters
  }

  sweep_id = wandb.sweep(sweep_config, entity=entity_name, project=project_name)
  wandb.agent(sweep_id, train)

In [16]:
do_sweep(entity_name,project_name)

Create sweep with ID: 4c8n6myl
Sweep URL: https://wandb.ai/bs20b012/CS6910/sweeps/4c8n6myl


[34m[1mwandb[0m: Agent Starting Run: zpxp9ar7 with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 32
[34m[1mwandb[0m: 	optimization_function: rmsprop
[34m[1mwandb[0m: Currently logged in as: [33mbs20b012[0m. Use [1m`wandb login --relogin`[0m to force relogin


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:30<00:00, 18.18s/it]


Training Accuracy: 73.01666666666667
Validation Accuracy: 72.06666666666666


0,1
Train_Accuracy,▁▂▄▇█
Train_Loss,█▇▅▂▁
Validation_Accuracy,▁▂▄▇█
Validation_loss,█▇▄▂▁
epoch,▁▃▅▆█

0,1
Train_Accuracy,73.01667
Train_Loss,0.86293
Validation_Accuracy,72.06667
Validation_loss,0.88996
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: pc8nqtfo with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 16
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: rmsprop


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:30<00:00, 18.20s/it]

Training Accuracy: 84.15925925925926
Validation Accuracy: 83.38333333333333





0,1
Train_Accuracy,▁▆▇██
Train_Loss,█▃▂▂▁
Validation_Accuracy,▁▆▇██
Validation_loss,█▃▂▂▁
epoch,▁▃▅▆█

0,1
Train_Accuracy,84.15926
Train_Loss,0.49657
Validation_Accuracy,83.38333
Validation_loss,0.51202
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: y7n9m6ff with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:19<00:00, 51.82s/it]

Training Accuracy: 84.46666666666667
Validation Accuracy: 83.38333333333333





0,1
Train_Accuracy,▁▄▇██
Train_Loss,█▄▁▁▁
Validation_Accuracy,▁▄▇██
Validation_loss,█▄▂▁▂
epoch,▁▃▅▆█

0,1
Train_Accuracy,84.46667
Train_Loss,0.60497
Validation_Accuracy,83.38333
Validation_loss,0.66189
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: 4luqjvta with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 16
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: adam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 21.96s/it]

Training Accuracy: 73.40185185185184
Validation Accuracy: 72.41666666666666





0,1
Train_Accuracy,▁▂▂▄▄▄▆▇▇█
Train_Loss,██▆▅▄▄▃▂▂▁
Validation_Accuracy,▁▁▂▄▄▄▆▇▇█
Validation_loss,██▆▅▄▄▃▂▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,73.40185
Train_Loss,0.79388
Validation_Accuracy,72.41667
Validation_loss,0.81101
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: cbuw44bi with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:03<00:00, 48.74s/it]

Training Accuracy: 85.32777777777778
Validation Accuracy: 84.43333333333334





0,1
Train_Accuracy,▁▆▇██
Train_Loss,█▅▁▃▂
Validation_Accuracy,▁▆███
Validation_loss,█▅▁▃▂
epoch,▁▃▅▆█

0,1
Train_Accuracy,85.32778
Train_Loss,6.86488
Validation_Accuracy,84.43333
Validation_loss,57.39555
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: f0g4nux8 with config:
[34m[1mwandb[0m: 	activation_function: relu
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


  grad["dh"+str(len(layers)-1)] = -(y/y_hat)
  return(np.exp(a)/np.sum(np.exp(a)))
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [08:21<00:00, 50.16s/it]

Training Accuracy: 10.048148148148147
Validation Accuracy: 9.566666666666666





0,1
Train_Accuracy,▁▁▁▁▁▁▁▁▁▁
Validation_Accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,10.04815
Train_Loss,
Validation_Accuracy,9.56667
Validation_loss,
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: hrivhb58 with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 16
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: momentum_gd


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:08<00:00, 25.69s/it]

Training Accuracy: 9.974074074074075
Validation Accuracy: 10.233333333333333





0,1
Train_Accuracy,▁▁▁▁▁
Train_Loss,███▇▁
Validation_Accuracy,▁▁▁▁▁
Validation_loss,███▇▁
epoch,▁▃▅▆█

0,1
Train_Accuracy,9.97407
Train_Loss,2.30251
Validation_Accuracy,10.23333
Validation_loss,2.30232
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: c1fx7afn with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 32
[34m[1mwandb[0m: 	optimization_function: adam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:42<00:00, 22.22s/it]

Training Accuracy: 85.03888888888889
Validation Accuracy: 84.25





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁▄▅▆▇▇▇███
Train_Loss,█▅▃▃▂▂▂▁▁▁
Validation_Accuracy,▁▄▆▇▇▇████
Validation_loss,█▅▃▃▂▂▂▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,85.03889
Train_Loss,0.43251
Validation_Accuracy,84.25
Validation_loss,0.45662
epoch,9.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 977d254t with config:
[34m[1mwandb[0m: 	activation_function: sigmoid
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 16
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: adam


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:43<00:00, 32.60s/it]

Training Accuracy: 70.94814814814815
Validation Accuracy: 71.55





0,1
Train_Accuracy,▁▄▇▇█
Train_Loss,█▄▃▂▁
Validation_Accuracy,▁▄▇▇█
Validation_loss,█▄▃▂▁
epoch,▁▃▅▆█

0,1
Train_Accuracy,70.94815
Train_Loss,0.78667
Validation_Accuracy,71.55
Validation_loss,0.7865
epoch,4.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: zv0wf3ib with config:
[34m[1mwandb[0m: 	activation_function: relu
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: momentum_gd


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:49<00:00, 22.98s/it]

Training Accuracy: 10.05
Validation Accuracy: 9.55





0,1
Train_Accuracy,▁█████████
Train_Loss,█▆▆▅▅▄▄▃▂▁
Validation_Accuracy,█▁▁▁▁▁▁▁▁▁
Validation_loss,▁▃▄▄▄▄▄▅▆█
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,10.05
Train_Loss,2.30253
Validation_Accuracy,9.55
Validation_loss,2.30284
epoch,9.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: i7frd7k6 with config:
[34m[1mwandb[0m: 	activation_function: relu
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 16
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: nadam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [05:02<00:00, 30.23s/it]

Training Accuracy: 10.048148148148147
Validation Accuracy: 9.566666666666666





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁▁▁▁▁▁▁▁▁▁
Validation_Accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,10.04815
Train_Loss,
Validation_Accuracy,9.56667
Validation_loss,
epoch,9.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: g83h0mtu with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: adam


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:01<00:00, 24.33s/it]

Training Accuracy: 85.31111111111112
Validation Accuracy: 84.31666666666666





0,1
Train_Accuracy,▁▅▆▇█
Train_Loss,█▅▃▂▁
Validation_Accuracy,▁▅▇▇█
Validation_loss,█▇▅▃▁
epoch,▁▃▅▆█

0,1
Train_Accuracy,85.31111
Train_Loss,0.42536
Validation_Accuracy,84.31667
Validation_loss,0.51423
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: jrmm106d with config:
[34m[1mwandb[0m: 	activation_function: relu
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


  error = - np.sum( np.multiply(y , np.log(y_hat)))/len(y)
  error = - np.sum( np.multiply(y , np.log(y_hat)))/len(y)
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [05:18<00:00, 63.73s/it]

Training Accuracy: 10.048148148148147
Validation Accuracy: 9.566666666666666





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,██▁▁▁
Train_Loss,▁
Validation_Accuracy,██▁▁▁
Validation_loss,▁
epoch,▁▃▅▆█

0,1
Train_Accuracy,10.04815
Train_Loss,
Validation_Accuracy,9.56667
Validation_loss,
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: 8gtow297 with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: adam


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:01<00:00, 24.24s/it]

Training Accuracy: 85.84444444444445
Validation Accuracy: 84.75





0,1
Train_Accuracy,▁▅▆▇█
Train_Loss,█▄▃▂▁
Validation_Accuracy,▁▄▆▇█
Validation_loss,█▄▂▂▁
epoch,▁▃▅▆█

0,1
Train_Accuracy,85.84444
Train_Loss,0.40633
Validation_Accuracy,84.75
Validation_loss,0.4319
epoch,4.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 84tmq7bf with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: momentum_gd


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:04<00:00, 18.42s/it]

Training Accuracy: 88.63333333333333
Validation Accuracy: 86.86666666666667





0,1
Train_Accuracy,▁▆▇███████
Train_Loss,█▃▂▂▁▁▁▁▁▁
Validation_Accuracy,▁▆▇███████
Validation_loss,█▃▂▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,88.63333
Train_Loss,0.31319
Validation_Accuracy,86.86667
Validation_loss,0.36988
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: nfnco0yi with config:
[34m[1mwandb[0m: 	activation_function: sigmoid
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 16
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: stochastic_gd


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [08:06<00:00, 48.65s/it]

Training Accuracy: 9.974074074074075
Validation Accuracy: 10.233333333333333





0,1
Train_Accuracy,▁▁▁▁▁▁▁▁▁▁
Train_Loss,█▇▆▆▅▄▃▃▂▁
Validation_Accuracy,▁▁▁▁▁▁▁▁▁▁
Validation_loss,█▇▇▅▅▄▃▃▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,9.97407
Train_Loss,2.30312
Validation_Accuracy,10.23333
Validation_loss,2.30279
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: r9ycm5pw with config:
[34m[1mwandb[0m: 	activation_function: sigmoid
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: adam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:44<00:00, 28.49s/it]

Training Accuracy: 10.048148148148147
Validation Accuracy: 9.566666666666666





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁█████████
Train_Loss,▁▂▃▅▆▇████
Validation_Accuracy,█▁▁▁▁▁▁▁▁▁
Validation_loss,▁▁▂▄▅▆▆▇██
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,10.04815
Train_Loss,2.30432
Validation_Accuracy,9.56667
Validation_loss,2.31282
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: i85tyziv with config:
[34m[1mwandb[0m: 	activation_function: relu
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: momentum_gd


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [08:33<00:00, 51.39s/it]

Training Accuracy: 19.26666666666667
Validation Accuracy: 18.4





0,1
Train_Accuracy,▁▁▁▁▁▁▁▂▆█
Train_Loss,████████▇▁
Validation_Accuracy,▁▁▁▁▁▁▁▁▆█
Validation_loss,████████▇▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,19.26667
Train_Loss,2.29249
Validation_Accuracy,18.4
Validation_loss,2.2926
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: j0qs82cw with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: rmsprop


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [07:16<00:00, 87.33s/it]

Training Accuracy: 83.96481481481482
Validation Accuracy: 83.31666666666668





0,1
Train_Accuracy,▁▆█▇█
Train_Loss,█▂▁▂▄
Validation_Accuracy,▁▆█▇█
Validation_loss,█▂▁▃▄
epoch,▁▃▅▆█

0,1
Train_Accuracy,83.96481
Train_Loss,0.62717
Validation_Accuracy,83.31667
Validation_loss,0.67032
epoch,4.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vfbw5iyf with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: momentum_gd


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:28<00:00, 26.87s/it]

Training Accuracy: 10.05
Validation Accuracy: 9.55





0,1
Train_Accuracy,▁▁▁▁▁▁▁▁▁▁
Train_Loss,█▁▂▃▃▃▃▃▃▃
Validation_Accuracy,▁▁▁▁▁▁▁▁▁▁
Validation_loss,▁▅▇███████
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,10.05
Train_Loss,2.30258
Validation_Accuracy,9.55
Validation_loss,2.30273
epoch,9.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6ifwtpwr with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: stochastic_gd


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [11:13<00:00, 67.30s/it]

Training Accuracy: 91.03888888888889
Validation Accuracy: 87.86666666666667





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁▃▄▅▆▇▇▇██
Train_Loss,█▆▅▄▃▃▂▂▁▁
Validation_Accuracy,▁▄▅▆▇▇▇███
Validation_loss,█▅▄▃▃▂▂▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,91.03889
Train_Loss,0.24205
Validation_Accuracy,87.86667
Validation_loss,0.32843
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: ixeya0qi with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 3
[34m[1mwandb[0m: 	number_neurons: 32
[34m[1mwandb[0m: 	optimization_function: rmsprop


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:28<00:00, 26.88s/it]

Training Accuracy: 86.72962962962963
Validation Accuracy: 85.03333333333333





0,1
Train_Accuracy,▁▃▄▆▆▇▇███
Train_Loss,█▅▃▂▂▂▂▂▁▁
Validation_Accuracy,▁▄▅▆▆▇▇███
Validation_loss,▄▂▁▁▂▄▇█▅▅
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,86.72963
Train_Loss,0.40579
Validation_Accuracy,85.03333
Validation_loss,0.54442
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: 7f4cfyek with config:
[34m[1mwandb[0m: 	activation_function: sigmoid
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 16
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 32
[34m[1mwandb[0m: 	optimization_function: adam


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:03<00:00, 36.73s/it]

Training Accuracy: 68.95740740740742
Validation Accuracy: 69.81666666666668





0,1
Train_Accuracy,▁▅▆▇█
Train_Loss,█▅▃▂▁
Validation_Accuracy,▁▅▆▇█
Validation_loss,█▅▃▂▁
epoch,▁▃▅▆█

0,1
Train_Accuracy,68.95741
Train_Loss,0.98295
Validation_Accuracy,69.81667
Validation_loss,0.98433
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: 84qngtnc with config:
[34m[1mwandb[0m: 	activation_function: relu
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: adam


100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [37:54<00:00, 227.47s/it]

Training Accuracy: 72.27962962962962
Validation Accuracy: 72.58333333333333





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁▃▄▅▅▆▆▇▇█
Train_Loss,█▇▆▅▅▄▄▃▂▁
Validation_Accuracy,▁▃▃▄▅▅▆▇██
Validation_loss,█▇▇▆▅▄▄▃▂▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,72.27963
Train_Loss,0.84758
Validation_Accuracy,72.58333
Validation_loss,0.93945
epoch,9.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 027tkmt9 with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: stochastic_gd


100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [16:51<00:00, 101.13s/it]

Training Accuracy: 84.91666666666666
Validation Accuracy: 83.76666666666667





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁▁▁▂▅▇▇███
Train_Loss,████▃▂▂▁▁▁
Validation_Accuracy,▁▁▁▂▅▇▇███
Validation_loss,████▃▂▂▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,84.91667
Train_Loss,0.42543
Validation_Accuracy,83.76667
Validation_loss,0.45619
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: uxymlric with config:
[34m[1mwandb[0m: 	activation_function: relu
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: momentum_gd


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [16:06<00:00, 96.62s/it]

Training Accuracy: 82.30185185185185
Validation Accuracy: 81.0





0,1
Train_Accuracy,▁▁▁▁▁▁▁▆██
Train_Loss,███████▃▁▁
Validation_Accuracy,▁▁▁▁▁▁▁▆██
Validation_loss,███████▃▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,82.30185
Train_Loss,0.50146
Validation_Accuracy,81.0
Validation_loss,0.53338
epoch,9.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ts3pubz4 with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: adam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:59<00:00, 29.93s/it]

Training Accuracy: 86.9074074074074
Validation Accuracy: 85.33333333333334





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁▄▅▆▆▇▇▇██
Train_Loss,█▅▄▃▃▂▂▁▁▁
Validation_Accuracy,▁▃▅▅▆▇▇▇▇█
Validation_loss,█▅▄▃▃▂▂▂▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,86.90741
Train_Loss,0.37932
Validation_Accuracy,85.33333
Validation_loss,0.42067
epoch,9.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 291z4oqu with config:
[34m[1mwandb[0m: 	activation_function: relu
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 32
[34m[1mwandb[0m: 	optimization_function: rmsprop


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:57<00:00, 23.75s/it]

Training Accuracy: 10.048148148148147
Validation Accuracy: 9.566666666666666





0,1
Train_Accuracy,██████▁▁▁▁
Train_Loss,▁▂▃▆█
Validation_Accuracy,██████▁▁▁▁
Validation_loss,▁▂▃▄▆█
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,10.04815
Train_Loss,
Validation_Accuracy,9.56667
Validation_loss,
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: w5b5g89q with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [06:50<00:00, 82.05s/it]

Training Accuracy: 88.87592592592593
Validation Accuracy: 86.53333333333333





0,1
Train_Accuracy,▁▆▇██
Train_Loss,█▃▂▁▁
Validation_Accuracy,▁▇███
Validation_loss,█▃▁▁▁
epoch,▁▃▅▆█

0,1
Train_Accuracy,88.87593
Train_Loss,0.32508
Validation_Accuracy,86.53333
Validation_loss,0.3876
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: 0x7sn5ho with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: adam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [13:00<00:00, 78.02s/it]

Training Accuracy: 88.1111111111111
Validation Accuracy: 86.58333333333333





0,1
Train_Accuracy,▁▄▅▆▇▇▇███
Train_Loss,█▅▄▃▂▂▂▁▁▁
Validation_Accuracy,▁▄▆▇▇▇████
Validation_loss,█▅▃▂▂▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,88.11111
Train_Loss,0.33844
Validation_Accuracy,86.58333
Validation_loss,0.38208
epoch,9.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3a5zb9gr with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: momentum_gd


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [10:32<00:00, 63.22s/it]

Training Accuracy: 10.05
Validation Accuracy: 9.55





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁█████████
Train_Loss,█▇▆▆▅▅▄▃▂▁
Validation_Accuracy,█▁▁▁▁▁▁▁▁▁
Validation_loss,▁▆██▇▆▅▄▃▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,10.05
Train_Loss,2.30253
Validation_Accuracy,9.55
Validation_loss,2.30268
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: rug5ue3k with config:
[34m[1mwandb[0m: 	activation_function: relu
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: nadam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:24<00:00, 26.47s/it]

Training Accuracy: 82.30555555555556
Validation Accuracy: 82.53333333333333





0,1
Train_Accuracy,▁▅▆▇▇█████
Train_Loss,▁▄▅▆▇▇▇███
Validation_Accuracy,▁▅▆▇▇█████
Validation_loss,▁▄▅▆▇▇▇███
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,82.30556
Train_Loss,14.08508
Validation_Accuracy,82.53333
Validation_loss,122.18673
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: y7as38vc with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: nadam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:31<00:00, 27.19s/it]

Training Accuracy: 88.28148148148148
Validation Accuracy: 86.76666666666667





0,1
Train_Accuracy,▁▆▆▇▇█████
Train_Loss,█▄▃▂▂▂▁▁▁▁
Validation_Accuracy,▁▅▇▇▇█████
Validation_loss,█▃▃▂▂▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,88.28148
Train_Loss,0.33772
Validation_Accuracy,86.76667
Validation_loss,0.396
epoch,9.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: m2f8musc with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: adam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [13:50<00:00, 83.01s/it]

Training Accuracy: 80.23148148148148
Validation Accuracy: 79.58333333333333





0,1
Train_Accuracy,▁▄▆▇▇█████
Train_Loss,█▅▃▂▂▂▁▁▁▁
Validation_Accuracy,▁▄▆▇▇█████
Validation_loss,█▅▃▃▂▂▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,80.23148
Train_Loss,0.59437
Validation_Accuracy,79.58333
Validation_loss,0.61108
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: tvddngxs with config:
[34m[1mwandb[0m: 	activation_function: relu
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


  grad["dh"+str(len(layers)-1)] = -(y/y_hat)
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [15:14<00:00, 91.41s/it]

Training Accuracy: 10.048148148148147
Validation Accuracy: 9.566666666666666





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▄████▁▁▁▁▁
Train_Loss,▁▄█
Validation_Accuracy,▄████▁▁▁▁▁
Validation_loss,▁▆█
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,10.04815
Train_Loss,
Validation_Accuracy,9.56667
Validation_loss,
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: 6uyznhp3 with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [07:30<00:00, 90.13s/it]

Training Accuracy: 85.59629629629629
Validation Accuracy: 84.51666666666667





0,1
Train_Accuracy,▁▆▇██
Train_Loss,█▃▂▁▁
Validation_Accuracy,▁▆███
Validation_loss,█▃▂▁▁
epoch,▁▃▅▆█

0,1
Train_Accuracy,85.5963
Train_Loss,0.4307
Validation_Accuracy,84.51667
Validation_loss,0.47299
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: 2ba6b760 with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [16:19<00:00, 97.96s/it]

Training Accuracy: 89.80555555555556
Validation Accuracy: 87.55





0,1
Train_Accuracy,▁▅▆▇▇▇████
Train_Loss,█▃▁▁▁▁▁▁▁▁
Validation_Accuracy,▁▆▇▇▇▇▇███
Validation_loss,█▂▁▂▂▃▃▄▅▆
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,89.80556
Train_Loss,0.35873
Validation_Accuracy,87.55
Validation_loss,0.92855
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: v7i3tfq2 with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: nadam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [07:24<00:00, 44.47s/it]

Training Accuracy: 88.28148148148148
Validation Accuracy: 86.86666666666667





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁▅▆▇▇█████
Train_Loss,█▄▃▂▂▁▁▁▁▁
Validation_Accuracy,▁▆▇▇██████
Validation_loss,█▄▃▂▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,88.28148
Train_Loss,0.34161
Validation_Accuracy,86.86667
Validation_loss,0.3902
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: p7p0nnjr with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [17:21<00:00, 104.19s/it]

Training Accuracy: 86.56481481481481
Validation Accuracy: 85.61666666666666





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁▅▆▇▇▇████
Train_Loss,█▅▃▂▂▂▁▁▁▁
Validation_Accuracy,▁▅▆▇▇▇████
Validation_loss,█▅▃▂▂▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,86.56481
Train_Loss,0.39811
Validation_Accuracy,85.61667
Validation_loss,0.43363
epoch,9.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: mama5dgh with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [20:58<00:00, 125.83s/it]

Training Accuracy: 86.86481481481482
Validation Accuracy: 85.7





0,1
Train_Accuracy,▁▆▇▇▇▇████
Train_Loss,█▃▃▂▂▂▁▁▁▁
Validation_Accuracy,▁▆▇▇▇▇████
Validation_loss,█▆▇▁▃▂▁▁▁▂
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,86.86481
Train_Loss,0.40308
Validation_Accuracy,85.7
Validation_loss,0.54353
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: 9xyf0sr1 with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: nadam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:16<00:00, 25.63s/it]

Training Accuracy: 86.12962962962963
Validation Accuracy: 85.26666666666667





0,1
Train_Accuracy,▁▄▆▇▇█████
Train_Loss,█▅▃▂▂▁▁▁▁▁
Validation_Accuracy,▁▄▇▇▇█████
Validation_loss,█▅▃▂▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,86.12963
Train_Loss,0.41448
Validation_Accuracy,85.26667
Validation_loss,0.45054
epoch,9.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 5okfe0ve with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: nadam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:50<00:00, 23.09s/it]

Training Accuracy: 84.77777777777777
Validation Accuracy: 83.88333333333333





0,1
Train_Accuracy,▁▅▆▇▇▇████
Train_Loss,█▅▄▃▃▂▂▁▁▁
Validation_Accuracy,▁▆▆▇▇▇████
Validation_loss,█▅▄▃▃▃▂▂▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,84.77778
Train_Loss,0.46861
Validation_Accuracy,83.88333
Validation_loss,0.50126
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: k394y3g7 with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [14:42<00:00, 88.28s/it]

Training Accuracy: 86.32222222222222
Validation Accuracy: 85.16666666666667





0,1
Train_Accuracy,▁▅▆▇▇█████
Train_Loss,█▄▃▂▂▁▁▁▁▁
Validation_Accuracy,▁▆▇▇███▇██
Validation_loss,█▄▃▂▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,86.32222
Train_Loss,0.40763
Validation_Accuracy,85.16667
Validation_loss,0.45207
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: de3ppxme with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: adam


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [06:58<00:00, 83.71s/it]

Training Accuracy: 73.75
Validation Accuracy: 72.46666666666667





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁▄▆▇█
Train_Loss,█▅▃▂▁
Validation_Accuracy,▁▄▆██
Validation_loss,█▅▃▂▁
epoch,▁▃▅▆█

0,1
Train_Accuracy,73.75
Train_Loss,0.78118
Validation_Accuracy,72.46667
Validation_loss,0.80548
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: ihk3okf4 with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: adam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:01<00:00, 24.15s/it]

Training Accuracy: 86.47777777777777
Validation Accuracy: 85.21666666666667





0,1
Train_Accuracy,▁▄▅▆▆▇▇███
Train_Loss,█▅▄▃▂▂▂▁▁▁
Validation_Accuracy,▁▄▅▆▆▇▇███
Validation_loss,█▅▄▃▂▁▁▁▁▂
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,86.47778
Train_Loss,0.39449
Validation_Accuracy,85.21667
Validation_loss,0.46555
epoch,9.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 5nuq1jhc with config:
[34m[1mwandb[0m: 	activation_function: sigmoid
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: nadam


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:32<00:00, 30.56s/it]

Training Accuracy: 83.9925925925926
Validation Accuracy: 83.98333333333333





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁▆▇██
Train_Loss,▂▁▃▅█
Validation_Accuracy,▁▆▇██
Validation_loss,▁▃▅▆█
epoch,▁▃▅▆█

0,1
Train_Accuracy,83.99259
Train_Loss,0.78338
Validation_Accuracy,83.98333
Validation_loss,3.16374
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: dqvm979h with config:
[34m[1mwandb[0m: 	activation_function: relu
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: nadam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:46<00:00, 28.62s/it]

Training Accuracy: 10.048148148148147
Validation Accuracy: 9.566666666666666





0,1
Train_Accuracy,█▁▁▁▁▁▁▁▁▁
Validation_Accuracy,█▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,10.04815
Train_Loss,
Validation_Accuracy,9.56667
Validation_loss,
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: w6opngly with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: nadam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:15<00:00, 25.55s/it]

Training Accuracy: 88.32962962962962
Validation Accuracy: 86.73333333333333





0,1
Train_Accuracy,▁▅▆▇▇▇▇███
Train_Loss,█▄▃▂▂▂▁▁▁▁
Validation_Accuracy,▁▅▆▇▇█████
Validation_loss,█▄▃▂▂▂▂▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,88.32963
Train_Loss,0.34227
Validation_Accuracy,86.73333
Validation_loss,0.38594
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: ry0p85op with config:
[34m[1mwandb[0m: 	activation_function: sigmoid
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: adam


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:34<00:00, 30.88s/it]

Training Accuracy: 10.048148148148147
Validation Accuracy: 9.566666666666666





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,█▁▁▁▁
Train_Loss,▁▄▆▇█
Validation_Accuracy,█▁▁▁▁
Validation_loss,▁▄▅▇█
epoch,▁▃▅▆█

0,1
Train_Accuracy,10.04815
Train_Loss,3.32139
Validation_Accuracy,9.56667
Validation_loss,11.96027
epoch,4.0


[34m[1mwandb[0m: Agent Starting Run: 92oogz2j with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 64
[34m[1mwandb[0m: 	optimization_function: nadam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [05:36<00:00, 33.65s/it]

Training Accuracy: 88.48703703703704
Validation Accuracy: 86.7





0,1
Train_Accuracy,▁▆▇▇▇▇████
Train_Loss,█▃▂▂▁▂▂▂▁▁
Validation_Accuracy,▁▆▇▇▇█████
Validation_loss,▆▁▁▂▄▆██▆▅
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,88.48704
Train_Loss,0.36988
Validation_Accuracy,86.7
Validation_loss,0.70047
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: 23xn687y with config:
[34m[1mwandb[0m: 	activation_function: relu
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [22:00<00:00, 132.02s/it]

Training Accuracy: 10.048148148148147
Validation Accuracy: 9.566666666666666





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,█▁▁▁▁▁▁▁▁▁
Validation_Accuracy,█▁▁▁▁▁▁▁▁▁
Validation_loss,▁
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,10.04815
Train_Loss,
Validation_Accuracy,9.56667
Validation_loss,
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: jv3e7gl4 with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 32
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [23:25<00:00, 140.55s/it]

Training Accuracy: 86.48518518518519
Validation Accuracy: 85.48333333333333





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁▅▆▇▇█████
Train_Loss,██▆▃▅▅▅▂▁▁
Validation_Accuracy,▁▅▆▇▇▇▇███
Validation_loss,▁▇▇▅▇██▅▄▄
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,86.48519
Train_Loss,0.62796
Validation_Accuracy,85.48333
Validation_loss,2.43491
epoch,9.0


[34m[1mwandb[0m: Agent Starting Run: 973chpwg with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: random
[34m[1mwandb[0m: 	lambd: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	mini_batch_size: 64
[34m[1mwandb[0m: 	number_hidden_layers: 4
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: nadam


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [14:29<00:00, 86.93s/it]

Training Accuracy: 86.86111111111111
Validation Accuracy: 85.86666666666667





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_Accuracy,▁▅▆▇▇█████
Train_Loss,█▄▃▃▂▂▁▁▁▁
Validation_Accuracy,▁▅▆▇▇▇████
Validation_loss,█▃▂▅▇▅▃▁▁▄
epoch,▁▂▃▃▄▅▆▆▇█

0,1
Train_Accuracy,86.86111
Train_Loss,0.40748
Validation_Accuracy,85.86667
Validation_loss,0.57179
epoch,9.0


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Plotting Confusion Matrix


In [None]:
# Hyperparameters with validation accuracy: 86.86666666666667
activation_function= "tanh"
initialization_type= "xavier"
lambd=0.0005
learning_rate=0.0001
max_epochs=10
mini_batch_size=64
number_hidden_layers=5
number_neurons=128
optimization_function=nadam
loss_function = "cross_entropy"

layers=[784]
for i in range(number_hidden_layers):
  layers=layers+[number_neurons]
layers=layers+[10]


config_confmat = {
      'number_hidden_layers': 5,
      'number_neurons': 128,
      'learning_rate': 0.0001,
      'initialization_type': "xavier",
      'activation_function':'tanh',
      'mini_batch_size' : 16,
      'max_epochs': 5,
      'lambd': 0,
      'optimization_function': "nadam"
  }

wandb.init(config=config_confmat,entity=entity_name, project=project_name)
wandb.run.name="Confusion_matrix"
wandb_log=True

w=weights(layers)
parameters = w.weight_init(init_type = initialization_type)
parameters, train_acc, val_acc = gradient_descent(X_train, y_train,learning_rate, max_epochs, layers, mini_batch_size, lambd, loss_function, activation_function, parameters,optimization_function,wandb_log)
res = run_inference(X_test,y_test, parameters, activation_function, layers)


accuracy=accuracy_calc(res,y_test)
print("Test Accuracy:",accuracy)
#converting 1 hot labels to class ids

y=[]

for i in range(len(y_test)):
  y.append(y_test[i].argmax())

y_hat=[]

for i in range(len(res)):
  y_hat.append(res[i].argmax())

wandb.log({"conf_mat":wandb.plot.confusion_matrix(preds=y_hat,y_true=y,class_names=labels),"Test Accuracy": accuracy}) 
    
wandb.run.save()
wandb.run.finish()



[34m[1mwandb[0m: Currently logged in as: [33mbs20b012[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

### CROSS ENTROPY vs MEAN SQUARE ERROR

In [18]:
def compare_loss(entity_name,project_name):


  #We fix all hyperparameters except for the loss function 
  hyperparameters = {
      "learning_rate":{
        'values': [0.0001]
      },

      "number_hidden_layers": {
          'values' : [5]
      },

      "number_neurons": {
        'values': [128]
      },

      "initialization_type": {
          'values' : ["xavier"]
      },

      "activation_function": {
          'values': ["tanh"]
      },

      "mini_batch_size": {
          'values': [16]
      },

      "max_epochs": {
          'values': [5]
      },

      "loss_function": {
          'values' : ["cross_entropy","squared_loss"]      
          
      },

      "lambd": {
          'values': [0.0005]
      },

      "optimization_function": {
          'values': ["adam"]
      }

  }


  sweep_config = {
      'method' : 'bayes',
      'metric' :{
          'name': 'Validation_Accuracy',
          'goal': 'maximize'
      },
      'parameters': hyperparameters
  }

  sweep_id = wandb.sweep(sweep_config, entity=entity_name, project=project_name)
  wandb.agent(sweep_id, train)

In [None]:
compare_loss(entity_name,project_name)

Create sweep with ID: zt7fbs5c
Sweep URL: https://wandb.ai/bs20b012/CS6910/sweeps/zt7fbs5c


[34m[1mwandb[0m: Agent Starting Run: pf0a9voi with config:
[34m[1mwandb[0m: 	activation_function: tanh
[34m[1mwandb[0m: 	initialization_type: xavier
[34m[1mwandb[0m: 	lambd: 0.0005
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	max_epochs: 5
[34m[1mwandb[0m: 	mini_batch_size: 16
[34m[1mwandb[0m: 	number_hidden_layers: 5
[34m[1mwandb[0m: 	number_neurons: 128
[34m[1mwandb[0m: 	optimization_function: adam


 40%|█████████████████████████████████▏                                                 | 2/5 [03:29<05:07, 102.66s/it]

# MNSIT TRIALS


In [None]:
from sklearn.model_selection import train_test_split
from keras.datasets import mnist

(train_x,train_y),(test_x,test_y)=mnist.load_data()
num_classes = 10
labels=np.arange(0,10,1)


#performing the train-validation split
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.1, random_state=40)
  

  
#storing the number of points in each set
  
X_train=np.zeros((len(train_x),784))
X_val=np.zeros((len(val_x),784))
X_test=np.zeros((len(test_x),784))
  
# converting the images into grayscale by normalizing
for i in range(len(train_x)):
  X_train[i]=(copy.deepcopy(train_x[i].flatten()))/255.0 
for i in range(len(val_x)):
  X_val[i]=(copy.deepcopy(val_x[i].flatten()))/255.0
for i in range(len(test_x)):
  X_test[i]=(copy.deepcopy(test_x[i].flatten()))/255.0
  
y_train = np.zeros((train_y.size, 10))
y_train[np.arange(train_y.size), train_y] = 1

y_val = np.zeros((val_y.size, 10))
y_val[np.arange(val_y.size), val_y] = 1

y_test = np.zeros((test_y.size, 10))
y_test[np.arange(test_y.size), test_y] = 1


### CONFIG-1:
The following are the key param changes


*   lambd = 0
*   mini_batch = 16
*   optimizer = nadam
*   epochs = 10





In [None]:
activation_function= "tanh"
initialization_type= "xavier"
lambd=0
learning_rate=0.001
max_epochs=10
mini_batch_size=16
number_hidden_layers=5
number_neurons=128
optimization_function=nadam
loss_function = "cross_entropy"

layers=[784]
for i in range(number_hidden_layers):
  layers=layers+[number_neurons]
layers=layers+[10]


config_confmat = {
      'number_hidden_layers': 5,
      'number_neurons': 128,
      'learning_rate': 0.0001,
      'initialization_type': "xavier",
      'activation_function':'tanh',
      'mini_batch_size' : 16,
      'max_epochs': 5,
      'lambd': 0,
      'optimization_function': "nadam"
  }

wandb.init(config=config_confmat,entity=entity_name, project=project_name)
wandb.run.name="MNIST_trials"
wandb_log=True

w=weights(layers)
parameters = w.weight_init(init_type = initialization_type)
parameters, train_acc, val_acc = gradient_descent(X_train, y_train,learning_rate, max_epochs, layers, mini_batch_size, lambd, loss_function, activation_function, parameters,optimization_function,wandb_log)
res = run_inference(X_test,y_test, parameters, activation_function, layers)


accuracy=accuracy_calc(res,y_test)
print("Test Accuracy:",accuracy)

### CONFIG-2:
The following are the key param changes

*   lambd = 0.0005
*   mini_batch = 32
*   optimizer = rmsprop
*   epochs = 7



In [None]:
activation_function= "tanh"
initialization_type= "xavier"
lambd=0.0005
learning_rate=0.001
max_epochs=7
mini_batch_size=32
number_hidden_layers=5
number_neurons=128
optimization_function=rmsprop
loss_function = "cross_entropy"

layers=[784]
for i in range(number_hidden_layers):
  layers=layers+[number_neurons]
layers=layers+[10]


config_confmat = {
      'number_hidden_layers': 5,
      'number_neurons': 128,
      'learning_rate': 0.0001,
      'initialization_type': "xavier",
      'activation_function':'tanh',
      'mini_batch_size' : 32,
      'max_epochs': 7,
      'lambd': 0.0005,
      'optimization_function': "rmsprop"
  }

wandb.init(config=config_confmat,entity=entity_name, project=project_name)
wandb.run.name="MNIST_trials"
wandb_log=True

w=weights(layers)
parameters = w.weight_init(init_type = initialization_type)
parameters, train_acc, val_acc = gradient_descent(X_train, y_train,learning_rate, max_epochs, layers, mini_batch_size, lambd, loss_function, activation_function, parameters,optimization_function,wandb_log)
res = run_inference(X_test,y_test, parameters, activation_function, layers)


accuracy=accuracy_calc(res,y_test)
print("Test Accuracy:",accuracy)

### CONFIG 3:
The following are the key param changes


*   lambd = 0
*   mini_batch = 64
*   optimizer = adam
*   epochs = 7


In [None]:
activation_function= "tanh"
initialization_type= "xavier"
lambd=0
learning_rate=0.001
max_epochs=7
mini_batch_size=64
number_hidden_layers=5
number_neurons=64
optimization_function=adam
loss_function = "cross_entropy"

layers=[784]
for i in range(number_hidden_layers):
  layers=layers+[number_neurons]
layers=layers+[10]


config_confmat = {
      'number_hidden_layers': 5,
      'number_neurons': 64,
      'learning_rate': 0.0001,
      'initialization_type': "xavier",
      'activation_function':'tanh',
      'mini_batch_size' : 64,
      'max_epochs': 7,
      'lambd': 0,
      'optimization_function': "adam"
  }

wandb.init(config=config_confmat,entity=entity_name, project=project_name)
wandb.run.name="MNIST_trials"
wandb_log=True

w=weights(layers)
parameters = w.weight_init(init_type = initialization_type)
parameters, train_acc, val_acc = gradient_descent(X_train, y_train,learning_rate, max_epochs, layers, mini_batch_size, lambd, loss_function, activation_function, parameters,optimization_function,wandb_log)
res = run_inference(X_test,y_test, parameters, activation_function, layers)


accuracy=accuracy_calc(res,y_test)
print("Test Accuracy:",accuracy)

### Question-1

In [None]:
# Importing required libraries
import numpy as np
from numpy import random
import pandas as pd
import keras 
import tensorflow as tf
import matplotlib.pyplot as plt

# Importing fashion MNIST dataset using keras.datasets
from keras.datasets import fashion_mnist

In [None]:
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

In [None]:
# Creating a dictionary called "label" for extracting the id or y value for plotting images with their respective classes

label = {0:"T-shirt/top",1:"Trouser", 2:"Pullover", 3:"Dress", 4: "Coat", 5:"Sandal", 6:"Shirt", 7:"Sneaker", 8: "Bag", 9: "Ankle boot"}
image_list = []
key_list = []
columns = 5
rows= 2

# setting the title, figure size and setting the x and y axis labels to null or emoty list to avoid the numbers or plots

fig = plt.figure(figsize=(8, 8))
plt.title('FASHION MNIST DATASET',fontsize=18) 
plt.yticks([]) 
plt.xticks([])

# Extracting the images with unique classes and storing their names and id's in a sample set of 30

for i in range(1,30):
    if y_train[i] not in image_list:
        image_list.append(y_train[i])
        key_list.append(i)
        
# Storing the names of the classes

images_names = [label[image_list[i]] for i in range(len(image_list))]

# Plotting the images using the ids and names extracted

for k,j in enumerate(key_list):
    img = x_train[j]
    fig.add_subplot(rows, columns,k+1 )
    plt.xlabel(images_names[k])
    plt.yticks([]) 
    plt.xticks([])
    plt.imshow(img,cmap = "gray")

plt.show()

In [None]:
print("CODE ENDS HERE :)  ")