In [None]:
# This notebook uses wandb for experiment monitoring

In [None]:
# import 
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sys
import wandb
#from tqdm import tqdm

#load data

def load_and_preprocess_data(normalize = False, debug = True):
  """Load Fashion-MNIST, shuffle the training set and split off a validation set.

  Args:
    normalize: if equal to ``True``, every image array is divided by 255.0.
    debug: if truthy, print the array shapes before and after the split.

  Returns:
    Tuple ``(train_images, train_labels, val_images, val_labels,
    test_images, test_labels)``. The validation arrays are the first 6000
    samples of the shuffled training data; the rest stays in the training
    arrays. The test arrays are returned in their original order.
  """
  dataset = tf.keras.datasets.fashion_mnist
  (train_images, train_labels), (test_images, test_labels) = dataset.load_data()

  if debug:
    print('Before preprocessing:')
    print(' - X_train.shape = {}, y_train.shape = {}'.format(train_images.shape, train_labels.shape))
    print(' - X_test.shape = {}, y_test.shape = {}'.format(test_images.shape, test_labels.shape))

  # Shuffle images and labels with one shared index order; the order is
  # permuted five times through the global NumPy RNG.
  order = np.arange(train_images.shape[0])
  for _ in range(5):
    order = np.random.permutation(order)
  train_images, train_labels = train_images[order], train_labels[order]

  # Hold out the leading 6000 shuffled samples (10%) as the validation set.
  val_count = 6000
  val_images, train_images = train_images[:val_count], train_images[val_count:]
  val_labels, train_labels = train_labels[:val_count], train_labels[val_count:]

  if debug:
    print('After preprocessing:')
    print(' - X_train.shape = {}, y_train.shape = {}'.format(train_images.shape, train_labels.shape))
    print(' - X_val.shape = {}, y_val.shape = {}'.format(val_images.shape, val_labels.shape))
    print(' - X_test.shape = {}, y_test.shape = {}'.format(test_images.shape, test_labels.shape))

  if normalize == True:
    # Scale pixel values by 1/255 for all three splits.
    train_images, val_images, test_images = (
        images / 255.0 for images in (train_images, val_images, test_images)
    )

  return train_images, train_labels, val_images, val_labels, test_images, test_labels



# layer class

class Linear_Layer:
    """Fully connected layer computing ``H = activation(H_prev @ W.T + b)``.

    Attributes:
        W: weight matrix of shape ``(neurons, inputs)``.
        b: bias column of shape ``(neurons, 1)``.
        activation: object providing ``forward(A)`` (and ``backward(A)``
            unless its class is named ``Softmax``).
        update_W, update_b: optimizer state buffers (same shapes as W / b),
            initialised to zero and read/written by the optimizer classes.
        update_W_V, update_b_V: second set of optimizer state buffers
            (used by Adam for the squared-gradient averages).
    """

    def __init__(self, inputs, neurons, activation):
        # Draw the initial weights from a private generator seeded with 42.
        # This produces the same values as seeding the global NumPy RNG with
        # 42, but does not reset the global random state as a side effect of
        # merely constructing a layer.
        rng = np.random.RandomState(42)
        self.W = rng.randn(neurons, inputs) *.01
        self.b = np.zeros((neurons, 1))
        self.activation = activation

        # Optimizer state (momentum / first-moment style buffers).
        self.update_W = np.zeros((neurons, inputs))
        self.update_b = np.zeros((neurons, 1))

        # Optimizer state (second-moment style buffers).
        self.update_W_V = np.zeros((neurons, inputs))
        self.update_b_V = np.zeros((neurons, 1))

    def forward(self, H_prev):
        """Compute the layer output for a batch.

        Args:
            H_prev: array of shape ``(batch, inputs)``.

        Returns:
            Activated output of shape ``(batch, neurons)``. ``H_prev``, the
            pre-activation ``A`` and the output ``H`` are cached on the
            instance for use by ``backprop``.
        """
        self.H_prev = H_prev
        self.A = np.dot(self.W, self.H_prev.transpose()) + self.b
        self.A = self.A.transpose()
        self.H = self.activation.forward(self.A)
        return self.H

    def backprop(self, dH):
        """Back-propagate a gradient through the layer.

        Args:
            dH: gradient with respect to this layer's output, shape
                ``(batch, neurons)``. For a layer whose activation class is
                named ``Softmax`` it is used directly as the gradient with
                respect to the pre-activation (``loss_and_acc`` in this file
                returns ``y_hat - one_hot(y)``, which is already that
                quantity).

        Returns:
            ``(dH_prev, dW, db)`` where ``dH_prev`` has shape
            ``(batch, inputs)`` and ``dW`` / ``db`` are averaged over the
            batch and shaped like ``W`` / ``b``.
        """
        if type(self.activation).__name__ != 'Softmax':
            dA = np.multiply(self.activation.backward(self.A), dH)
        else:
            dA = dH

        batch = dA.shape[0]
        dW = 1/batch * np.dot(dA.transpose(), self.H_prev)
        db = 1/batch * np.sum(dA.transpose(), axis=1, keepdims=True)
        dH_prev = np.dot(dA, self.W)

        return dH_prev, dW, db

    def updation(self, dW, db):
        """Subtract the (already learning-rate-scaled) updates from W and b."""
        self.W = self.W -  dW
        self.b = self.b -  db
        

# Model class

class Model:
    """Feed-forward network: ``num_layers`` hidden layers plus a softmax output layer.

    Attributes:
        layers: list of ``Linear_Layer`` objects in forward order; the last
            one maps to ``num_classes`` outputs with a ``Softmax`` activation.
    """

    def __init__(self, input_dim, num_classes, num_layers, num_neurons, activation):
        """Build the layer stack.

        Args:
            input_dim: number of input features.
            num_classes: number of outputs of the final softmax layer.
            num_layers: number of hidden layers (may be 0, in which case the
                softmax layer is connected directly to the input).
            num_neurons: width of every hidden layer.
            activation: activation object shared by all hidden layers.
        """
        self.layers = []

        # Track the width feeding the next layer so the output layer gets the
        # right input size even when there are no hidden layers.
        fan_in = input_dim
        for _ in range(num_layers):
            self.layers.append(Linear_Layer(fan_in, num_neurons, activation))
            fan_in = num_neurons

        self.layers.append(Linear_Layer(fan_in, num_classes, Softmax()))

    def forward(self,x):
        """Run ``x`` through every layer in order and return the final output."""
        value = x
        for layer in self.layers:
            value = layer.forward(value)
        return value

    

# loss and accuracy computation

def loss_and_acc(y, y_hat):
    """Cross-entropy loss, accuracy and output gradient for a batch.

    Args:
        y: 1-D sequence of integer class labels, one per sample.
        y_hat: array of shape ``(len(y), num_classes)`` holding the predicted
            class probabilities. Any number of classes is supported.

    Returns:
        ``(loss, acc, doutput)`` where ``loss`` is the mean of
        ``-log(y_hat[i, y[i]])``, ``acc`` is the fraction of samples whose
        arg-max prediction equals the label, and ``doutput`` is a new array
        equal to ``y_hat`` with 1 subtracted at each sample's label column.
        ``y_hat`` itself is not modified.

    Raises:
        ValueError: if the batch is empty.
    """
    labels = np.asarray(y)
    n_samples = len(labels)
    if n_samples == 0:
        raise ValueError("loss_and_acc requires at least one sample")

    probs = np.asarray(y_hat, dtype=float)
    rows = np.arange(n_samples)

    # Probability assigned to the correct class of every sample.
    picked = probs[rows, labels]

    loss = float(np.mean(-np.log(picked)))
    acc = float(np.mean(np.argmax(probs, axis=1) == labels))

    doutput = probs.copy()
    doutput[rows, labels] = picked - 1

    return loss, acc, doutput
     


    
# class for setting up a neural network for training

class NN_exp_setup:
    """Bundle a model with the learning rate and optimizer used to train it."""

    def __init__(self,model,lr,optimizer):
        self.model, self.learning_rate, self.optimizer = model, lr, optimizer

    def _score(self, batch):
        """Forward the batch inputs and score the predictions against the labels."""
        inputs, targets = batch
        predictions = self.model.forward(inputs)
        return loss_and_acc(targets, predictions)

    def train_step(self,batch):
        """Return ``(loss, accuracy, output_gradient)`` for one ``(x, y)`` batch."""
        batch_loss, batch_acc, grad = self._score(batch)
        return batch_loss, batch_acc, grad

    def validation_step(self,batch):
        """Return ``(loss, accuracy)`` for one ``(x, y)`` batch."""
        batch_loss, batch_acc, _ = self._score(batch)
        return batch_loss, batch_acc

    def test_step(self,batch):
        """Placeholder; not implemented, always returns ``None``."""
        return None


# class implementing training process along with back propagation

class Trainer:
    """Mini-batch training loop with manual back-propagation and wandb logging.

    Histories are stored in four dicts keyed by epoch index:
    ``train_loss`` / ``train_acc`` hold one entry per batch, while
    ``val_loss`` / ``val_acc`` hold one entry per epoch (and stay empty lists
    when no validation data is supplied).
    """

    def __init__(self,batch_size, epochs):
        self.batch_size = batch_size
        self.epochs = epochs
        self.train_loss = {}
        self.train_acc = {}
        self.val_loss = {}
        self.val_acc = {}

    def fit(self,X,y,NN_train, validation_data = None):
        """Train ``NN_train.model`` on ``(X, y)``.

        Args:
            X: 2-D array of training inputs, one row per sample.
            y: labels aligned with the rows of ``X``.
            NN_train: object exposing ``model.layers``, ``optimizer``,
                ``learning_rate``, ``train_step`` and ``validation_step``.
            validation_data: optional ``(X_val, y_val)`` pair evaluated once
                per epoch. When ``None``, validation is skipped and only the
                training metrics are printed and logged.

        Returns:
            ``(train_loss, train_acc, val_loss, val_acc)`` history dicts.
        """
        self.NN_train = NN_train

        # An optimizer class named 'SGD' means one sample per update.
        if type(NN_train.optimizer).__name__ == 'SGD':
            self.batch_size = 1

        n_samples = X.shape[0]

        for epoch in range(self.epochs):
            self.train_loss[epoch] = []
            self.train_acc[epoch] = []
            self.val_loss[epoch] = []
            self.val_acc[epoch] = []

            for start in range(0, n_samples, self.batch_size):
                # Slicing clamps automatically, so the last batch may be short.
                stop = start + self.batch_size
                batch = (X[start:stop,:], y[start:stop])

                loss_step, acc_step, dloss = self.NN_train.train_step(batch)
                self.train_loss[epoch].append(loss_step)
                self.train_acc[epoch].append(acc_step)

                self._backward_and_update(dloss, epoch)

            tr_loss_epoch = sum(self.train_loss[epoch]) / len(self.train_loss[epoch])
            tr_acc_epoch = sum(self.train_acc[epoch]) / len(self.train_acc[epoch])

            if validation_data is None:
                print(f"Epoch: {epoch}   Loss: {tr_loss_epoch:.4f}   Accuracy: {tr_acc_epoch:.4f}")
                wandb.log({"training_loss": round(tr_loss_epoch,4), "training_acc": round(tr_acc_epoch,4)})
                continue

            val_loss_step, val_acc_step = self.NN_train.validation_step(validation_data)
            self.val_loss[epoch].append(val_loss_step)
            self.val_acc[epoch].append(val_acc_step)

            print(f"Epoch: {epoch}   Loss: {tr_loss_epoch:.4f}   Accuracy: {tr_acc_epoch:.4f}  Val_Loss: {self.val_loss[epoch][0]:.4f}   Val_Accuracy: {self.val_acc[epoch][0]:.4f}")

            # Both dicts go into the same wandb step: the first call defers the commit.
            wandb.log({"training_loss": round(tr_loss_epoch,4), "training_acc": round(tr_acc_epoch,4)}, commit=False)
            wandb.log({"validation_loss": round(val_loss_step,4), "validation_acc": round(val_acc_step,4)})

        return self.train_loss, self.train_acc, self.val_loss, self.val_acc

    def _backward_and_update(self, dloss, epoch):
        """Back-propagate ``dloss`` from the last layer to the first, updating each layer."""
        dH = dloss
        for layer in reversed(self.NN_train.model.layers):
            dH, dW, db = layer.backprop(dH)
            dW, db = self.NN_train.optimizer.compute(layer, dW, db, self.NN_train.learning_rate, epoch)
            layer.updation(dW, db)

    def test(self,):
        """Placeholder; not implemented, always returns ``None``."""
        return



# Softmax computation

class Softmax:
    """Row-wise softmax activation for the output layer."""

    def forward(self,x):
        """Return the softmax of each row of ``x``.

        Args:
            x: 2-D array of shape ``(rows, classes)``; any number of classes
                is accepted.

        Returns:
            Array of the same shape whose rows are non-negative and sum to 1.
        """
        logits = np.asarray(x, dtype=float)

        # Subtracting the row maximum leaves the result mathematically
        # unchanged but keeps np.exp from overflowing to inf (which would
        # otherwise turn the row into NaN via inf / inf).
        shifted = logits - logits.max(axis=1, keepdims=True)
        exps = np.exp(shifted)

        return exps / exps.sum(axis=1, keepdims=True)

# Sigmoid activation

class Sigmoid:
    """Logistic sigmoid activation.

    ``H`` caches the most recent forward output and ``dH`` the most recent
    derivative; both start at 1.0.
    """

    def __init__(self,):
        self.H = self.dH = 1.0

    @staticmethod
    def _logistic(z):
        """Element-wise ``1 / (1 + exp(-z))``."""
        return 1/(1 + np.exp(-z))

    def forward(self,x):
        """Apply the sigmoid to ``x``, cache it in ``H`` and return it."""
        activated = self._logistic(x)
        self.H = activated
        return activated

    def backward(self,x):
        """Return the sigmoid derivative evaluated at pre-activation ``x``."""
        s = self._logistic(x)
        self.dH = (1 - s) * s
        return self.dH

# Relu activation

class Relu:
    """Rectified linear unit activation.

    ``H`` caches the most recent forward output and ``dH`` the most recent
    derivative; both start at 1.0.
    """

    def __init__(self,):
        self.H, self.dH = 1.0, 1.0

    def forward(self,x):
        """Return ``max(0, x)`` element-wise and cache it in ``H``."""
        rectified = np.maximum(0, x)
        self.H = rectified
        return rectified

    def backward(self,x):
        """Return the derivative at ``x``: 0 where ``x <= 0``, otherwise 1."""
        inactive = x <= 0
        self.dH = np.where(inactive, 0, 1)
        return self.dH
      
# Tanh activation

class Tanh:
    """Hyperbolic tangent activation.

    ``H`` caches the most recent forward output and ``dH`` the most recent
    derivative; both start at 1.0.
    """

    def __init__(self,):
        self.H = 1.0
        self.dH = 1.0

    def forward(self,x):
        """Return ``tanh(x)`` element-wise and cache it in ``H``.

        ``np.tanh`` is used instead of the explicit exponential quotient,
        which evaluates to inf / inf = NaN once ``|x|`` is large enough for
        ``exp`` to overflow.
        """
        self.H = np.tanh(x)
        return self.H

    def backward(self,x):
        """Return the derivative ``1 - tanh(x)**2`` evaluated at ``x``."""
        tanh_x = np.tanh(x)
        self.dH = 1 - tanh_x ** 2
        return self.dH


# Minibatch GD

class MinibatchGD:
  """Plain gradient-descent update rule: the step is the gradient scaled by ``lr``."""

  def compute(self,layer,dW,db,lr,epoch):
    """Return ``(lr * dW, lr * db)``; ``layer`` and ``epoch`` are not used."""
    return lr * dW, lr * db


# Stochastic GD

class SGD:
  """Stochastic gradient-descent update rule: each gradient is scaled by ``lr``.

  The arithmetic is the same as a plain scaled-gradient step; the class name
  itself is what ``Trainer.fit`` checks to switch to a batch size of 1.
  """

  def compute(self,layer,dW,db,lr,epoch):
    """Return ``(lr * dW, lr * db)``; ``layer`` and ``epoch`` are not used."""
    scaled_w, scaled_b = (lr * grad for grad in (dW, db))
    return scaled_w, scaled_b


# Momentum GD

class Momentum:
  """Gradient descent with classical momentum (decay factor ``gamma`` = 0.9).

  The running velocity is stored on the layer in ``update_W`` / ``update_b``.
  """

  def __init__(self,):
    self.gamma = .9

  def compute(self,layer,dW,db,lr,epoch):
    """Return the new velocities ``gamma * previous + lr * gradient``.

    The velocities are also written back to ``layer.update_W`` and
    ``layer.update_b``. ``epoch`` is not used.
    """
    velocity_w = self.gamma * layer.update_W + lr * dW
    velocity_b = self.gamma * layer.update_b + lr * db

    layer.update_W, layer.update_b = velocity_w, velocity_b

    return velocity_w, velocity_b


# NAG GD
# https://stackoverflow.com/questions/50774683/how-is-nesterovs-accelerated-gradient-descent-implemented-in-tensorflow

class NAG:
  """Nesterov accelerated gradient (decay factor ``gamma`` = 0.9).

  Uses the formulation in which the look-ahead is folded into the applied
  step, so the gradient can be evaluated at the current weights:

      v_new = gamma * v_prev + lr * g
      step  = lr * g + gamma * v_new

  which is algebraically the same as ``-gamma * v_prev + (1 + gamma) * v_new``.
  The velocity is stored on the layer in ``update_W`` / ``update_b``.
  """

  def __init__(self,):

    self.gamma = .9

  def compute(self,layer,dW,db,lr,epoch):
    """Return the Nesterov steps for W and b and store the new velocities.

    Args:
        layer: object carrying the previous velocities in ``update_W`` and
            ``update_b``; both are overwritten with the new velocities.
        dW, db: current gradients.
        lr: learning rate.
        epoch: unused; kept for a uniform optimizer interface.

    Returns:
        ``(step_W, step_b)`` to be subtracted from the parameters.
    """
    update_w = self.gamma * layer.update_W + lr * dW
    update_b = self.gamma * layer.update_b + lr * db

    # Current gradient step plus a momentum look-ahead along the new velocity.
    comp_w = lr * dW + self.gamma * update_w
    comp_b = lr * db + self.gamma * update_b

    layer.update_W = update_w
    layer.update_b = update_b

    return comp_w, comp_b

# GD with RMS Propagation

class RMSProp:
  """RMSProp update rule (decay factor ``gamma`` = 0.9).

  The running average of squared gradients is stored on the layer in
  ``update_W`` / ``update_b``.
  """

  def __init__(self,):
    self.gamma = .9

  def _scaled_step(self, running_avg, grad, lr):
    """Return ``(new_average, step)`` for one parameter tensor."""
    avg = self.gamma * running_avg + (1-self.gamma) * grad**2
    step = (lr / (np.sqrt(avg + 1e-08))) * grad
    return avg, step

  def compute(self,layer,dW,db,lr,epoch):
    """Return the RMSProp steps for W and b and store the new averages.

    ``layer.update_W`` / ``layer.update_b`` are overwritten with the updated
    squared-gradient averages. ``epoch`` is not used.
    """
    avg_w, comp_w = self._scaled_step(layer.update_W, dW, lr)
    avg_b, comp_b = self._scaled_step(layer.update_b, db, lr)

    layer.update_W, layer.update_b = avg_w, avg_b

    return comp_w, comp_b

# GD with ADAM

class Adam:
    """Adam update rule (``gamma1`` = 0.9 for the gradient average, ``gamma2`` = 0.999
    for the squared-gradient average).

    State is kept on the layer: ``update_W`` / ``update_b`` hold the gradient
    averages, ``update_W_V`` / ``update_b_V`` the squared-gradient averages,
    and ``adam_step`` the number of updates applied to that layer so far.
    """

    def __init__(self,):

        self.gamma1 = .9
        self.gamma2 = .999

    def compute(self,layer,dW,db,lr,epoch):
        """Return the Adam steps for W and b and advance the layer's state.

        Args:
            layer: object carrying the moment buffers described on the class;
                they are overwritten, and ``layer.adam_step`` is created on
                first use and incremented on every call.
            dW, db: current gradients.
            lr: learning rate.
            epoch: unused; kept for a uniform optimizer interface. The bias
                correction is driven by the per-layer update count instead,
                because the moment averages advance once per update, not once
                per epoch.

        Returns:
            ``(step_W, step_b)`` to be subtracted from the parameters.
        """
        # Number of updates applied to this layer, including the current one.
        step = getattr(layer, 'adam_step', 0) + 1
        layer.adam_step = step

        mdw = self.gamma1 * layer.update_W + (1 - self.gamma1) * dW
        mdb = self.gamma1 * layer.update_b + (1 - self.gamma1) * db

        vdw = self.gamma2 * layer.update_W_V + (1 - self.gamma2) * dW ** 2
        vdb = self.gamma2 * layer.update_b_V + (1 - self.gamma2) * db ** 2

        # Bias correction for the zero-initialised averages.
        m_scale = 1 - np.power(self.gamma1, step)
        v_scale = 1 - np.power(self.gamma2, step)

        mdw_corr = mdw / m_scale
        mdb_corr = mdb / m_scale
        vdw_corr = vdw / v_scale
        vdb_corr = vdb / v_scale

        comp_W = (lr / (np.sqrt(vdw_corr + 1e-08))) * mdw_corr
        comp_b = (lr / (np.sqrt(vdb_corr + 1e-08))) * mdb_corr

        layer.update_W = mdw
        layer.update_b = mdb

        layer.update_W_V = vdw
        layer.update_b_V = vdb

        return comp_W, comp_b





In [None]:

# Load the normalized data splits (shape report enabled).
train_images, train_labels, val_images, val_labels, test_images, test_labels = load_and_preprocess_data(normalize=True, debug=True)

# Flatten every 28x28 image into a 784-long row vector.
X_train = train_images.reshape(train_images.shape[0],784)
y_train = train_labels

X_val = val_images.reshape(val_images.shape[0],784)
y_val = val_labels

# Training hyper-parameters.
epochs = 10
batch_size = 32
learning_rate = .001
activation = Relu()
optimizer = MinibatchGD()

# Network architecture.
input_dim = X_train.shape[1]
num_classes  = 10
num_layers = 2
num_neurons = 10



In [None]:
import pprint

# Objective the sweep reports on: the per-epoch "validation_loss" metric.
metric = {
    'name': 'validation_loss',
    'goal': 'minimize',
}

# Hyper-parameter grid; 'epochs' is fixed to a single value.
parameters_dict = {
    'optimizer': {'values': ['adam', 'MbGD']},
    'learning_rate': {'values': [0.001,0.01]},
    'num_layers': {'values': [1,2]},
    'epochs': {'value': 10},
}

# Grid search over every combination listed in parameters_dict.
sweep_config = {
    'method': 'grid',
    'metric': metric,
    'parameters': parameters_dict,
}

pprint.pprint(sweep_config)

In [None]:
# Register the sweep configuration under the "Neural Network Implementation"
# project; the returned id is what the agent call at the end of the notebook uses.
sweep_id = wandb.sweep(sweep_config, project="Neural Network Implementation")

In [None]:
def train(config=None):
    """Run one sweep trial: build a model and optimizer from the wandb config and train it.

    Reads ``optimizer``, ``num_layers``, ``learning_rate`` and ``epochs`` from
    ``wandb.config``. Everything else (``input_dim``, ``num_classes``,
    ``num_neurons``, ``activation``, ``batch_size`` and the train/validation
    arrays) comes from the notebook-level variables defined in earlier cells.

    Args:
        config: optional default configuration forwarded to ``wandb.init``.

    Raises:
        ValueError: if ``config.optimizer`` is not one of the supported names.
    """
    # Sweep value -> optimizer class. 'adam' and 'MbGD' are the names used by
    # the sweep grid; the remaining entries expose the other optimizers
    # defined in this notebook.
    optimizer_classes = {
        'adam': Adam,
        'MbGD': MinibatchGD,
        'sgd': SGD,
        'momentum': Momentum,
        'nag': NAG,
        'rmsprop': RMSProp,
    }

    with wandb.init(config=config):

        config = wandb.config

        if config.optimizer not in optimizer_classes:
            raise ValueError(
                f"Unknown optimizer {config.optimizer!r}; expected one of {sorted(optimizer_classes)}"
            )
        optimizer = optimizer_classes[config.optimizer]()

        model = Model(input_dim, num_classes, config.num_layers, num_neurons, activation)
        NN_train = NN_exp_setup(model,config.learning_rate,optimizer)

        trainer  = Trainer(batch_size,config.epochs)
        trainer.fit(X_train,y_train,NN_train, validation_data = (X_val,y_val) )



# Start a wandb agent for the registered sweep; it invokes `train` for the
# runs it is assigned, each with its own hyper-parameter combination.
wandb.agent(sweep_id, function=train)