# Training a multi-linear classifier with k-layer networks

*Herein, I have trained and tested k-layer networks with multiple outputs to classify images from the CIFAR-10 dataset.*

In [0]:
#@title Installers
# !pip uninstall -y scipy
# !pip install scipy==1.2.0
# !pip install texttable

In [0]:
#@title Import libraries
#Import CIFAR-10 data from my google drive folder; I downloaded and unzipped the CIFAR-10 files and uploaded them to my drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
import pandas
import unittest
import numpy
from collections import OrderedDict
import statistics
from texttable import Texttable
from sklearn.preprocessing import StandardScaler
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

from PIL import Image
import pickle
import numpy as np
from googleapiclient.discovery import build
drive_service = build('drive', 'v3')

import io
from googleapiclient.http import MediaIoBaseDownload
import matplotlib.pyplot as plt

from scipy import misc #remove, using PIL instead

In [0]:
#@title Functions: Decoding and displaying images
def unpickle(file):
  """Deserialize one pickled object from an open binary file-like object.

  The stream is read with ``encoding='bytes'``.

  NOTE(review): ``pickle.load`` executes arbitrary code embedded in the
  stream; only use it on files from a trusted source.
  """
  return pickle.load(file, encoding='bytes')

def unpickle_getFromDrive(file_id):
  """Download the Drive file ``file_id`` and return its unpickled content.

  The downloaded stream is read with ``encoding='bytes'``.
  """
  stream = GetFromDrive(file_id)
  return pickle.load(stream, encoding='bytes')

def loadLabels(file_id):
  """Return the entries stored under ``b'label_names'`` as ASCII strings.

  The Drive file ``file_id`` is downloaded and unpickled first.
  """
  meta = unpickle_getFromDrive(file_id)
  names = []
  for raw_name in meta[b'label_names']:
    names.append(raw_name.decode('ascii'))
  return names

def load_batch(file_id):
    """Download one pickled batch from Drive and return ``(X, Y, y)``.

    Returns:
        X: the transposed ``b"data"`` array, standardized row by row to zero
           mean and unit standard deviation. The statistics come from this
           batch alone, not from a shared training set.
        Y: one-hot label matrix with 10 rows, one column per label.
        y: the ``b"labels"`` entries as a 1-D array.
    """
    batch = unpickle(GetFromDrive(file_id))

    pixels = batch[b"data"].T
    row_mean = np.mean(pixels, axis=1, keepdims=True)
    row_std = np.std(pixels, axis=1, keepdims=True)
    X = (pixels - row_mean) / row_std

    y = np.array(batch[b"labels"])
    one_hot_rows = np.eye(10)[y]  # one row per label
    Y = one_hot_rows.T
    return X, Y, y

def GetFromDrive(file_id):
    """Download the Drive file ``file_id`` into memory.

    Uses the module-level ``drive_service`` client and fetches the file chunk
    by chunk. Returns an ``io.BytesIO`` positioned at the start of the data.
    """
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(
        buffer, drive_service.files().get_media(fileId=file_id))
    while True:
        _, finished = downloader.next_chunk()
        if finished is not False:
            break
    buffer.seek(0)  # rewind so callers can read from the beginning
    return buffer

- Initialize and store the parameters of my network 
- Apply the network to input vectors and keep a record of the intermediary scores when I apply the network (forward pass)
- Compute the gradient of the cost function for a mini-batch relative to the parameters of the network

In [0]:
#@title Functions: Load data_batches (1, 2 and test) for training, validation and test
def trainOnSmallDataBatch(samples=-1):
    """Load data_batch_1, data_batch_2 and test_batch as train / val / test.

    Args:
        samples: ``-1`` (default) keeps every split untouched. A value
            ``>= 0`` shrinks every split to its first ``samples`` feature
            rows and its first five images (columns), which is meant for
            quick gradient checks.

    Returns:
        ``(data, labels)``. ``data`` maps ``X_*`` / ``Y_*`` / ``y_*`` for the
        ``train``, ``val`` and ``test`` splits to the arrays returned by
        ``load_batch``; ``labels`` is the list returned by ``loadLabels``.

    Raises:
        ValueError: if ``samples`` is smaller than -1 (previously this ended
            in an unbound-variable error after all downloads had finished).
    """
    if samples < -1:
        raise ValueError(
            "samples must be -1 (all data) or >= 0, got %r" % (samples,))

    file_ids = (
        ('train', '1M6-KBxAaqIqy9ekv3vhBlcTd3g1rkFcw'),  # data_batch_1
        ('val', '155Iy6tGX9HkNgZSdTPc9qGg9HCfay-Fy'),    # data_batch_2
        ('test', '1HdB9Dv2I9y1K-qcb__9M7Za0ZNtU1aIy'),   # test_batch
    )
    n_images = 5  # number of images kept in the reduced data set

    data = {}
    for split, file_id in file_ids:
        X, Y, y = load_batch(file_id)
        if samples > -1:
            X = X[:samples, :n_images]
            # Y always keeps all of its class rows; only the images are cut.
            Y = Y[:, :n_images]
            # Keep the integer labels aligned with the columns of X and Y.
            y = y[:n_images]
        data['X_' + split] = X
        data['Y_' + split] = Y
        data['y_' + split] = y

    labels = loadLabels('18LLg8Ch3GkdXI0MRAcoSwTzPKdgJMOQv') # labels

    return data, labels

The following code is necessary to create my network layers. 

In [0]:
#@title Functions: Create layers
def CreateLayers(shapes, activations):
    """Build the ordered layer specification consumed by the classifiers.

    ``shapes`` and ``activations`` are paired position by position (extra
    entries of the longer one are ignored). Layer ``i`` is stored under the
    key ``"layer<i>"`` as ``{"shape": ..., "activation": ...}``.
    """
    return OrderedDict(
        ("layer%s" % idx, {"shape": shape, "activation": act})
        for idx, (shape, act) in enumerate(zip(shapes, activations)))

In [0]:
#@title Functions: Initialize network (change the initialization herein for the tests in the last part of the assignment, v)
class NetClassifier():
    """Parameter container for a k-layer fully connected network.

    Holds the data splits, one weight matrix and bias vector per layer and,
    for batch normalization, the per-layer scale (gamma), shift (beta) and
    running mean / variance buffers.
    """

    def __init__(self, data, lbs, layers, alpha=0.8, BN=False):
        """Create and initialize all layer parameters.

        Args:
            data: mapping whose items are attached to the instance as
                attributes (e.g. ``X_train``, ``y_val``).
            lbs: class label names, stored as ``self.lbs``.
            layers: ordered mapping of layer specs, each a dict with a
                ``"shape"`` ``(n_out, n_in)`` and an ``"activation"`` name
                (``"relu"`` or ``"softmax"``).
            alpha: stored as ``self.alpha``; the batch-norm forward pass of
                the subclasses uses it as the moving-average factor.
            BN: when truthy, gamma and beta are exposed as trainable
                parameters in ``self.params``.

        Raises:
            KeyError: if a layer spec lacks ``"shape"`` / ``"activation"`` or
                names an unknown activation.
        """
        for name, value in data.items():
            setattr(self, name, value)
        self.layers = layers
        self.alpha = alpha
        self.lbs = lbs
        self.BN = BN
        self.k = len(layers) - 1  # index of the last (output) layer
        self.activation_funcs = {'relu': self.Relu, 'softmax': self.SoftMax}
        self.W, self.b, self.gamma, self.beta, self.mu_V, self.var_V, \
                self.activations = [], [], [], [], [], [], []

        for spec in layers.values():
            W, b, gamma, beta, mu_V, var_V = self._init_parameters(spec["shape"])
            self.W.append(W)
            self.b.append(b)
            self.gamma.append(gamma)
            self.beta.append(beta)
            self.mu_V.append(mu_V)
            self.var_V.append(var_V)
            act_name = spec["activation"]
            self.activations.append((act_name, self.activation_funcs[act_name]))

        # ``params`` lists exactly the arrays that gradient descent updates.
        if self.BN:
            self.params = {"W": self.W, "b": self.b, "gamma": self.gamma,
                    "beta": self.beta}
        else:
            self.params = {"W": self.W, "b": self.b}

    @staticmethod
    def _init_parameters(d):
        """Return fresh ``(W, b, gamma, beta, mu_V, var_V)`` for one layer.

        ``d`` is the layer shape ``(n_out, n_in)``. ``W`` is drawn from a
        zero-mean normal distribution; ``gamma`` starts at one and every
        other array at zero, each with shape ``(n_out, 1)``.
        """
        n_out, n_in = d[0], d[1]

        # He initialization: std = sqrt(2 / n_in). The previous expression,
        # 2 / sqrt(n_in), was a factor sqrt(2) too large.
        stdev = np.sqrt(2 / n_in)
        # For the sensitivity experiments replace the line above with a
        # constant, e.g. stdev = 1e-1, stdev = 1e-3 or stdev = 1e-4.

        W      = np.random.normal(0, stdev, size=(n_out, n_in))
        b      = np.zeros((n_out, 1))
        beta   = np.zeros((n_out, 1))
        gamma  = np.ones((n_out, 1))
        mu_V   = np.zeros((n_out, 1))
        var_V  = np.zeros((n_out, 1))

        return W, b, gamma, beta, mu_V, var_V

    @staticmethod
    def SoftMax(a):
        """Column-wise softmax of ``a`` (classes along axis 0).

        The column maximum is subtracted before exponentiating so that large
        scores do not overflow.
        """
        shifted_exp = np.exp(a - np.max(a, axis=0))
        return shifted_exp / shifted_exp.sum(axis=0)

    @staticmethod
    def Relu(a):
        """Return ``max(a, 0)`` element-wise without modifying ``a``."""
        return np.maximum(a, 0)

The following functions (Evaluate Classifier, Cost and Accuracy) are from my previous assignment; however, they have been altered to a small extent to account for the new features of this assignment.

In [0]:
#@title Functions: Evaluate Classifier, Cost and Accuracy
class EvaluationFunctions(NetClassifier):
    """Forward pass, accuracy and cost of the k-layer network."""

    def EvaluateClassifier(self, X, train=False):
        """Run the forward pass on ``X`` (one column per sample).

        Args:
            X: input matrix; it is copied, never modified.
            train: with batch normalization, also update the running mean /
                variance buffers ``self.mu_V`` / ``self.var_V``.

        Returns:
            Without batch normalization ``(H, P)``: the ReLU outputs of the
            hidden layers and the softmax probabilities.
            With batch normalization ``(H, P, S, S_h, means, m_var)``: the
            inputs of every layer, the probabilities, the un-normalized and
            normalized scores and the batch mean / variance per hidden layer.

        NOTE(review): the running averages are maintained but never read;
        normalization always uses the statistics of the batch being
        evaluated, also when ``train`` is False.
        """
        s = np.copy(X)

        if self.BN:
            S, S_h, means, m_var, H = [], [], [], [], []
            eps = np.finfo(np.float64).eps

            for i, (W, b, gamma, beta, mu_V, var_V, activation) in enumerate(
                    zip(self.W, self.b, self.gamma, self.beta, self.mu_V,
                        self.var_V, self.activations)):
                H.append(s)
                s = W@s + b

                if i < self.k:
                    S.append(s)
                    mu = np.mean(s, axis=1, keepdims=True)
                    means.append(mu)
                    # np.var already divides by N (ddof=0), which is the
                    # variance BN_back_pass differentiates. The former extra
                    # (N-1)/N factor made forward and backward inconsistent.
                    var = np.var(s, axis=1, keepdims=True)
                    m_var.append(var)
                    if train:
                        self.var_V[i] = self.alpha * var_V + (1-self.alpha) * var
                        self.mu_V[i]  = self.alpha * mu_V + (1-self.alpha) * mu

                    s = (s - mu) / np.sqrt(var + eps)
                    S_h.append(s)
                    s = activation[1](beta+np.multiply(gamma, s))

                else:
                    # Output layer: no normalization, only the activation.
                    P = activation[1](s)

            return H, P, S, S_h, means, m_var

        else:
            H = []
            for W, b, activation in zip(self.W, self.b, self.activations):
                if activation[0] == "relu":
                    s = activation[1](W@s + b)
                    H.append(s)
                if activation[0] == "softmax":
                    P = activation[1](W@s + b)
            return H, P

    def ComputeAccuracy(self, X, y):
        """Return the fraction of columns of ``X`` classified as ``y``."""
        # P is the second return value with and without batch normalization.
        P = self.EvaluateClassifier(X)[1]
        predictions = np.argmax(P, axis=0)
        n_correct = int(np.count_nonzero(predictions == np.asarray(y)))
        acc = n_correct / X.shape[1]

        return acc

    def ComputeCost(self, X, Y, lamda):
        """Return ``(loss, cost)`` for inputs ``X`` and one-hot targets ``Y``.

        ``loss`` is the mean cross-entropy; ``cost`` adds ``lamda`` times the
        sum of squared weights of all layers.
        """
        P = self.EvaluateClassifier(X)[1]
        # Probabilities that underflow to exactly 0 would give 0 * -inf = NaN;
        # flooring at the smallest normal float leaves other values untouched.
        P_safe = np.maximum(P, np.finfo(np.float64).tiny)
        loss = np.float64(1/X.shape[1]) * - np.sum(Y*np.log(P_safe))
        sW = 0
        for W in self.W:
            sW += (np.sum(np.square(W)))
        cost = loss + lamda * sW
        return loss, cost

In the following, the network parameters are initialized, starting with a 2-layer network. Careful initialization is applied using He initialization.

In [0]:
# Load data_batch_1 / data_batch_2 / test_batch and create an untrained
# 2-layer network (3072 inputs -> 50 ReLU units -> 10 softmax outputs).
# NOTE(review): the later visible cells build their own data and classifiers
# inside functions, so these globals do not appear to be used again.
data, labels = trainOnSmallDataBatch() 
layers = CreateLayers(
        shapes=[(50, 3072), (10, 50)],
        activations=["relu", "softmax"])
clf = NetClassifier(data, labels, layers) 

*Compute the gradients for the network parameters*

Next come the functions to compute the gradients of my k-layer network w.r.t. its weight and bias parameters (copied and altered from my previous assignment).

In [0]:
#@title Functions: Compute Gradients
class ComputeGradients(EvaluationFunctions):
  """Analytical (back-propagation) and numerical gradients of the cost."""

  def ComputeGradientsAnalytically(self, X_batch, Y_batch, lamda):
    """Back-propagate the cost of one mini-batch.

    Args:
        X_batch: inputs, one column per sample.
        Y_batch: one-hot targets, one column per sample.
        lamda: L2 regularization strength (the cost adds lamda * sum(W**2),
            so every weight gradient adds 2 * lamda * W).

    Returns:
        dict with the lists ``"W"`` and ``"b"`` (plus ``"gamma"`` and
        ``"beta"`` with batch normalization), one gradient array per layer.
        With batch normalization the forward pass is run with ``train=True``
        and therefore also updates the running statistics.
    """
    N = X_batch.shape[1]
    ones = np.ones(N)

    if self.BN:
        grads = {"W": [], "b": [], "gamma": [], "beta": []}

        for k in self.params:
            for par in self.params[k]:
                grads[k].append(np.zeros_like(par))

        # Forward
        H_b, P_b, S_b, S_h_batch, means_b, vars_b = \
                self.EvaluateClassifier(X_batch, train=True)
        # Backward
        G_b = - (Y_batch - P_b)

        # Output layer (not batch-normalized).
        last = self.k
        grads["W"][last] = 1/N * G_b@H_b[last].T +  2 * lamda * self.W[last]
        grads["b"][last] = np.reshape(1/N * G_b@ones, (grads["b"][last].shape[0], 1))

        G_b = self.W[last].T@G_b
        # H_b[last] is a ReLU output, so ``> 0`` marks the active units.
        G_b = np.multiply(G_b, H_b[last] > 0)

        for l in range(self.k - 1, -1, -1):
            grads["gamma"][l] = np.reshape(1/N * np.multiply(G_b, S_h_batch[l])@ones, (grads["gamma"][l].shape[0], 1))
            grads["beta"][l]  = np.reshape(1/N * G_b@ones, (grads["beta"][l].shape[0], 1))
            # Through the scale, then through the normalization itself.
            G_b = np.multiply(G_b, self.gamma[l])
            G_b = self.BN_back_pass(G_b, S_b[l], means_b[l], vars_b[l])
            grads["W"][l] = 1/N * G_b@H_b[l].T + 2 * lamda * self.W[l]
            grads["b"][l] = np.reshape(1/N * G_b@ones, (grads["b"][l].shape[0], 1))
            if l > 0:
                G_b = self.W[l].T@G_b
                G_b = np.multiply(G_b, H_b[l] > 0)
    else:
        grads = {"W": [np.zeros_like(W) for W in self.W],
                 "b": [np.zeros_like(b) for b in self.b]}

        # Forward
        H_b, P_b = self.EvaluateClassifier(X_batch)
        # Backward
        G_b = - (Y_batch - P_b)

        # Input of layer l: the batch itself for l == 0, otherwise the ReLU
        # output of layer l-1.
        layer_inputs = [X_batch] + H_b

        for l in range(len(self.layers) - 1, -1, -1):
            # Every layer, including the first, uses 2 * lamda * W; the first
            # layer previously used lamda * W.
            grads["W"][l] = 1/N * G_b@layer_inputs[l].T + 2 * lamda * self.W[l]
            grads["b"][l] = np.reshape(1/N * G_b@ones, (grads["b"][l].shape[0], 1))
            if l > 0:
                G_b = self.W[l].T@G_b
                G_b = np.multiply(G_b, layer_inputs[l] > 0)
    return grads

  def BN_back_pass(self, G_b, S_b, mean_batch, var_batch):
    """Propagate ``G_b`` back through one batch-normalization step.

    ``S_b`` are the un-normalized scores of the layer, ``mean_batch`` and
    ``var_batch`` their per-row batch mean and variance. Returns the gradient
    with respect to ``S_b``.
    """
    eps = np.finfo(np.float64).eps
    n = G_b.shape[1]
    G_1 = np.multiply(G_b, np.power(var_batch + eps, -0.5))
    G_2 = np.multiply(G_b, np.power(var_batch + eps, -1.5))
    D = S_b - mean_batch
    c = np.sum(np.multiply(G_2, D), axis=1, keepdims=True)
    return G_1 - 1/n * np.sum(G_1, axis=1, keepdims=True) - 1/n * np.multiply(D, c)

  def ComputeGradientsNum(self, X_batch, Y_batch, size=2, lamda=np.float64(0), h=np.float64(1e-7)):
    """Estimate all gradients with centered finite differences.

    Each entry of each parameter in ``self.params`` is perturbed by ``+h``
    and ``-h`` in turn, the cost is re-evaluated, and the entry is restored.
    ``size`` is accepted but not used. Returns the same structure as
    ``ComputeGradientsAnalytically``.
    """
    if self.BN:
        grads = {"W": [], "b": [], "gamma": [], "beta": []}
    else:
        grads = {"W": [], "b": []}

    for j in range(len(self.b)):
        for k in self.params:
            param = self.params[k][j]
            grads[k].append(np.zeros(param.shape))
            for i in range(param.size):
                prev_par = param.flat[i]
                param.flat[i] = prev_par + h
                _, f2 = self.ComputeCost(X_batch, Y_batch, lamda)
                param.flat[i] = prev_par - h
                _, f3 = self.ComputeCost(X_batch, Y_batch, lamda)
                param.flat[i] = prev_par  # restore the original value
                grads[k][j].flat[i] = (f2-f3) / (2*h)
    return grads

In [0]:
#@title Functions: Compare Gradients
# Limit the precision of the ``decimal`` context to 6 significant digits.
# NOTE(review): no visible cell uses ``decimal`` after this point, and the
# wildcard import hides which names it adds; confirm whether it is needed.
from decimal import *
getcontext().prec = 6

def CompareGradients():
    """Print the maximum relative gap between analytical and numerical gradients.

    A 4-layer network without batch normalization is checked on the first 30
    input dimensions of the first 5 training images with lamda = 0. One table
    row is printed per layer and parameter, in units of 1e-06.
    """
    err = Texttable()
    err_data = [['Gradient', 'Method',  'Rel Diff Max [e-06]']]

    data, labels = trainOnSmallDataBatch()

    layers = CreateLayers(
            shapes=[(50, 30), (50, 50), (50, 50), (10, 50)],
            activations=["relu", "relu", "relu", "softmax"])

    clf = ComputeGradients(data, labels, layers)

    X_small = clf.X_train[:30, :5]
    Y_small = clf.Y_train[:30, :5]

    grads_num = clf.ComputeGradientsNum(X_small, Y_small, lamda=0)
    grads_ana = clf.ComputeGradientsAnalytically(X_small, Y_small, lamda=0)

    num_layers = len(grads_ana["W"])
    for l in range(num_layers):
        for k in grads_ana:
            ana = grads_ana[k][l].ravel()
            num = grads_num[k][l].ravel()
            # The small constant keeps the ratio finite where both are zero.
            denom = np.maximum(np.abs(ana), np.abs(num)) + 1e-10
            max_rel_err = np.max(np.abs(ana - num) / denom)
            # Scale by 1e6 to match the "[e-06]" header (it used to be 1e5).
            err_data.append(["layer %d %s"%(l+1, k), "ANL vs NUM", max_rel_err * 1e6])

    err.add_rows(err_data)
    print("Method Comparison: Max Err between Analytical vs Numerical")
    print(err.draw())
      

**i) HOW I CHECKED MY ANALYTICAL GRADIENTS**

The following generates the gradient comparison, which shows that the implemented analytical gradient method is close enough to be regarded as accurate. The gradients are compared to a numerical method (centered difference method). A four-layer neural network without batch normalization is applied on a reduced dataset of 5 images with 30 dimensions. What is noteworthy is that the discrepancy between the analytical and numerical gradients increases for the earlier layers as the gradient is back-propagated through the network. Checks are done with no regularization, i.e. lambda = 0.

In [0]:
#@title Code: Compare Analytical with Numerical Gradient
# Run the gradient check and print the comparison table.
CompareGradients()

Moving on, we want to replicate the test accuracy of approx. 53.5% that the assignment specification reports for a 3-layer network, using He initialization and the hyper-parameter settings eta_min = 1e-5, eta_max = 1e-1, lambda = .005, two cycles of training and n_s = 5 * 45,000 / n_batch. After training the model 6 times, we achieve approx. 53%.

Train the network using mini-batch gradient descent and cyclical learning rates and without batch normalization. Firstly, adapt the code from my previous assignment accordingly:

In [0]:
#@title Functions: Compute Mini Batch Gradient Descent
class MBGradientDescent(ComputeGradients):
  """Mini-batch gradient descent with a cyclical (triangular) learning rate."""

  def MiniBatchGD(self, X, Y, lamda=0, batch_s=100, eta_min=1e-5,
        eta_max=1e-1, n_s = 800, n_epochs=40, plot_id="", plot=False):
    """Train the network in place and return the final accuracies.

    Args:
        X, Y: training inputs and one-hot targets, one column per sample.
        lamda: L2 regularization strength.
        batch_s: requested mini-batch size.
        eta_min, eta_max: bounds of the cyclical learning rate.
        n_s: half cycle length in update steps.
        n_epochs: number of passes over ``X``.
        plot_id: accepted but not used.
        plot: record cost / loss / accuracy per epoch and plot them.

    Returns:
        ``(acc_train, acc_val, acc_test)`` measured on the instance's
        ``X_train`` / ``X_val`` / ``X_test`` splits after training.
    """
    if plot:
        costs_tr, loss_tr, acc_train, costs_val, loss_val, acc_val = \
                (np.zeros(n_epochs) for _ in range(6))

    n_batch = int(np.floor(X.shape[1] / batch_s))
    # Width of one batch; loop-invariant, so computed once. With fewer
    # samples than batch_s there are no batches at all.
    N = int(X.shape[1] / n_batch) if n_batch else 0
    t = 0
    for n in range(n_epochs):
        for j in range(n_batch):
            j_start = (j) * N
            j_end = (j+1) * N

            X_batch = X[:, j_start:j_end]
            Y_batch = Y[:, j_start:j_end]

            # Learning rate for the current step t. It is computed before the
            # update so that step t uses eta(t) and not the previous rate.
            if t <= n_s:
                eta = eta_min + t/n_s * (eta_max - eta_min)
            else:
                eta = eta_max - (t - n_s)/n_s * (eta_max - eta_min)

            grads = self.ComputeGradientsAnalytically(X_batch, Y_batch, lamda)

            for k in self.params:
                for par, grad in zip(self.params[k], grads[k]):
                    # In-place update: the arrays in self.params are the same
                    # objects as in self.W, self.b, ...
                    par -= eta * grad

            t = (t+1) % (2*n_s)

        if plot:
            loss_tr[n], costs_tr[n] = self.ComputeCost(X, Y, lamda)
            loss_val[n], costs_val[n] = self.ComputeCost(self.X_val, self.Y_val, lamda)
            acc_train[n] = self.ComputeAccuracy(self.X_train, self.y_train)
            acc_val[n] = self.ComputeAccuracy(self.X_val, self.y_val)

    if plot:
        self.PlotFigure(n_epochs, costs_tr,  costs_val, "cost",y_label="cost", y_max=4.5)
        self.PlotFigure(n_epochs, loss_tr, loss_val, "loss",y_label="loss", y_max=3.0)
        self.PlotFigure(n_epochs, acc_train, acc_val, "accuracy",y_label="accuracy", y_max=1.0)

    acc_train = self.ComputeAccuracy(self.X_train, self.y_train)
    acc_val   = self.ComputeAccuracy(self.X_val, self.y_val)
    acc_test  = self.ComputeAccuracy(self.X_test, self.y_test)

    return acc_train, acc_val, acc_test

  def PlotFigure(self, n_epochs, i_train, i_val, title, y_label, y_max=4):
    """Plot a training and a validation curve over the epochs.

    ``title`` is shown above the axes (it used to be ignored); the y-axis
    runs from 0 to ``y_max``.
    """
    epochs = np.arange(n_epochs)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(epochs, i_val, 'r-', label="validation")
    ax.plot(epochs, i_train, 'g-', label="training")
    ax.legend()
    ax.set(title=title, xlabel='epochs', ylabel=y_label)
    ax.set_ylim([0, y_max])
    ax.grid()

  def PlotTable(self, acc_train, acc_val,acc_test):
    """Print a one-row table with train / validation / test accuracy."""
    t = Texttable()
    dd = [['Train Accuracy', 'Val Accuracy', 'Test Accuracy'],
          [acc_train, acc_val, acc_test]]
    t.add_rows(dd)
    print("**********************************************")
    print(t.draw())
    print(" ")

In [0]:
#@title Functions: Load All Data
def trainOnAllDataBatches(val):
    """Load the five training batches and the test batch from Drive.

    The five batches are concatenated along the sample axis; the last ``val``
    samples become the validation split and the rest the training split.

    Args:
        val: number of validation samples (``0`` keeps everything for
            training).

    Returns:
        ``(data, labels)`` with ``data`` mapping ``X_*`` / ``Y_*`` / ``y_*``
        for ``train``, ``val`` and ``test`` to arrays, and ``labels`` the
        list returned by ``loadLabels``.

    Raises:
        ValueError: if ``val`` is negative.
    """
    if val < 0:
        raise ValueError("val must be >= 0, got %r" % (val,))

    train_file_ids = (
        '1M6-KBxAaqIqy9ekv3vhBlcTd3g1rkFcw',
        '155Iy6tGX9HkNgZSdTPc9qGg9HCfay-Fy',
        '10VIE8MElRqjIz0z-fIUX80OcWarKDdPp',
        '1ht7wULP6aOycu2J5F2zheesizftY2V2b',
        '1laJAlpuTD-YR_k9_rE0ZsSrbVApLaWgz',
    )
    batches = [load_batch(file_id) for file_id in train_file_ids]

    X_all = np.concatenate([X for X, _, _ in batches], axis=1)
    Y_all = np.concatenate([Y for _, Y, _ in batches], axis=1)
    y_all = np.concatenate([y for _, _, y in batches])

    # Split at an explicit index. Slicing with -val breaks for val == 0:
    # [-0:] selects everything and [:-0] selects nothing.
    split = max(X_all.shape[1] - val, 0)
    X_train, X_val = X_all[:, :split], X_all[:, split:]
    Y_train, Y_val = Y_all[:, :split], Y_all[:, split:]
    y_train, y_val = y_all[:split], y_all[split:]

    X_test, Y_test, y_test = \
        load_batch("1HdB9Dv2I9y1K-qcb__9M7Za0ZNtU1aIy")

    labels = loadLabels('18LLg8Ch3GkdXI0MRAcoSwTzPKdgJMOQv')

    data = {
        'X_train': X_train,
        'Y_train': Y_train,
        'y_train': y_train,
        'X_val': X_val,
        'Y_val': Y_val,
        'y_val': y_val,
        'X_test': X_test,
        'Y_test': Y_test,
        'y_test': y_test
    }

    return data, labels


Now, the training function is defined where the hyper-parameters setting is n batch=100, eta min = 1e-5, eta max = 1e-1, lambda=.005, two cycles of training and n s = 5 * 45,000 / n batch (2250)

In [0]:
#@title Functions: Training
def Training(layers, BN, lamda=0.005, trainingTimes=1):
  """Train ``trainingTimes`` fresh networks and print their mean accuracies.

  All five training batches are loaded with 5000 validation samples. Every
  run uses batch size 100, eta in [1e-5, 1e-1], n_s = 2250 and 20 epochs;
  only the last run draws the learning curves.
  """
  data, labels = trainOnAllDataBatches(val=5000)
  runs = []
  last_run = trainingTimes - 1
  for run_idx in range(trainingTimes):
      clf = MBGradientDescent(data, labels, layers, BN=BN)
      runs.append(clf.MiniBatchGD(
              data['X_train'], data['Y_train'], lamda=lamda,
              batch_s=100, eta_min=1e-5, eta_max=1e-1, n_s=2250,
              n_epochs=20,  plot=(run_idx == last_run)))

  # Regroup the (train, val, test) triples into one list per split.
  acc_tr_set = [run[0] for run in runs]
  acc_val_set = [run[1] for run in runs]
  acc_tst_set = [run[2] for run in runs]

  clf.PlotTable(statistics.mean(acc_tr_set), statistics.mean(acc_val_set),statistics.mean(acc_tst_set))


Now run on a 2-layer network with cyclical learning rate. Here it is clear that the results from assignment 2 are replicated. Test accuracy is displayed below:

In [0]:
# 2-layer network (3072 -> 50 ReLU -> 10 softmax) without batch normalization.
# The layer definition and the training call were fused on one line, which is
# a syntax error; they are separate statements.
two_layers = CreateLayers(shapes=[(50, 3072), (10, 50)],activations=["relu", "softmax"])
Training(two_layers, BN=False, trainingTimes=1)

**ii) GRAPHS OF EVOLUTION OF LOSS WITHOUT BATCH NORMALIZATION FOR 3-LAYER NETWORK**

Next, a 3-layer network is produced with the same hyper-parameters and no batch normalization.

In [0]:
# 3-layer network (3072 -> 50 -> 50 -> 10), trained once without batch
# normalization. ``three_layers`` is reused by the later training cells.
three_layers = CreateLayers(shapes=[(50, 3072), (50, 50), (10, 50)],activations=["relu", "relu", "softmax"])
Training(three_layers, BN=False, trainingTimes=1)

**iii) GRAPHS OF EVOLUTION OF LOSS WITHOUT BATCH NORMALIZATION FOR 9-LAYER NETWORK**

Next, a 9-layer network is produced with the same hyper-parameters and no batch normalization.

In [0]:
# 9-layer network: eight ReLU layers that narrow from 50 to 10 units, then a
# 10-way softmax. ``nine_layers`` is reused by the batch-norm cell below.
nine_layers = CreateLayers(shapes=[(50, 3072), (30, 50), (20, 30), (20, 20), (10, 20),(10, 10), (10, 10), 
                                   (10, 10), (10, 10)],
        activations=["relu", "relu", "relu", "relu", "relu", "relu","relu", "relu", "softmax"])
Training(nine_layers, BN=False, trainingTimes=1) # 9-layer model without batch normalization

EXERCISE 3

*Implement batch normalization*

**ii) GRAPHS OF EVOLUTION OF LOSS WITH BATCH NORMALIZATION FOR 3-LAYER NETWORK**

In the following, batch normalization is applied, first to the three-layer network. Compared to the results above, there is not much difference.

In [0]:
# Same 3-layer network as above, now trained once with batch normalization.
Training(three_layers, BN=True, trainingTimes=1)

**iii) GRAPHS OF EVOLUTION OF LOSS WITH BATCH NORMALIZATION FOR 9-LAYER NETWORK**

However, as can be observed in the next results, where batch normalization is applied to the 9-layer network, it has a significant impact. It is clear that for deeper neural networks, batch normalization is very important.

In [0]:
# Same 9-layer network as above, now trained once with batch normalization.
Training(nine_layers, BN=True, trainingTimes=1)

Moving on, we want to replicate the same test accuracy in a 3 layer network as in the assignment specification of approx. 53.5% after training with the following hyper parameters He initialization and hyper-parameter settings of eta min = 1e-5, eta max = 1e-1, lambda=.005, two cycles of training and n s = 5 * 45,000 / n batch. After training the model 6 times, approx. 54% is achieved.

In [0]:
# Train the 3-layer batch-norm network six times; the printed table holds the
# mean accuracies over the six runs.
Training(three_layers, BN=True, trainingTimes=6)

**iv) RANGE OF VALUES IN SEARCH FOR LAMBDA FOR 3-LAYER NETWORK WITH BATCH NORMALIZATION**

Thereafter, the coarse search for lambda is performed. 

In [0]:
#@title Functions: Search for Lambda
def searchForLambda(lamda=0.005, trainingTimes=1):
  """Train the 3-layer batch-norm network with regularization ``lamda``.

  The network is trained ``trainingTimes`` times from scratch (20 epochs,
  batch size 100, n_s = 2250, no plots).

  Returns:
      ``(train, val, test)`` mean accuracies over the runs, each as a string.
  """
  data, labels = trainOnAllDataBatches(val=5000)
  layers = CreateLayers(
      shapes=[(50, 3072), (50, 50), (10, 50)],
      activations=["relu", "relu", "softmax"])

  runs = []
  for _ in range(trainingTimes):
      clf = MBGradientDescent(data, labels, layers, BN=True)
      runs.append(clf.MiniBatchGD(
              data['X_train'], data['Y_train'], lamda=lamda,
              batch_s=100, eta_min=1e-5, eta_max=1e-1, n_s=2250,
              n_epochs=20,  plot=False))

  # Average each position of the (train, val, test) triples over all runs.
  tr_mean_acc, val_mean_acc, tst_mean_acc = (
      str(statistics.mean([run[pos] for run in runs])) for pos in range(3))

  return tr_mean_acc, val_mean_acc, tst_mean_acc

def getResultsWhenTrainingForDifferentLambdas(lambdas, title):
    """Run ``searchForLambda`` for every value and print a result table.

    One progress line is printed after each value; the table, headed by
    ``title``, is printed once all values are done.
    """
    rows = [['Parameters', 'Train Accuracy', 'Val Accuracy', 'Test Accuracy']]

    for lam in lambdas:
      train_mean_acc, val_mean_acc, test_mean_acc = searchForLambda(lamda=lam)
      rows.append(["lambda="+str(lam), train_mean_acc, val_mean_acc, test_mean_acc])
      print("Lambda test done:" + str(lam))

    table = Texttable()
    table.add_rows(rows)
    print("********************* "+title +" *************************")
    print("n epochs=20")
    print("*************************************************************")
    print(table.draw())
    print(" ")

In [0]:
#@title Code: Coarse Search
# Coarse grid: six evenly spaced, small regularization strengths (0.01 to 0.06).
lambdas = [.0100, .0200, .0300, .0400, .0500, .0600]
getResultsWhenTrainingForDifferentLambdas(lambdas, "COARSE SEARCH")

In the above coarse search I selected six very low regularization terms. In the result table it becomes clear that the validation accuracy is highest where lambda is below 0.02; therefore the fine search will be between 0.005 and 0.02.

Continuing, the fine search for lambda is performed. 

In [0]:
#@title Code: Fine Search
# Fine search: six regularization strengths between 0.005 and 0.02.
lambdas = [.0051, .0063, .0069, .0094, .0134, .0154]
getResultsWhenTrainingForDifferentLambdas(lambdas, "FINE SEARCH")

The best fine search that yielded the highest accuracy on the validation set was at lambda=0.0134 as is displayed in the table below. For future work, I would explore running a much wider random search. 

Finally, the optimal lambda is implemented

In [0]:
# Six runs of the 3-layer batch-norm network with lamda=0.0134.
Training(three_layers, BN=True, trainingTimes=6, lamda=0.0134)

**v) SENSITIVITY TO INITIALIZATION**

Next we need to implement a sensitivity analysis to the initialization, where I experiment on training with batch normalization and without it. Instead of He initialization I will apply sig=1e-1, 1e-3 and 1e-4. In order to test this, the standard deviation used in `_init_parameters` is changed and the two cells below are re-run.

He initialization without BN (change on the top in order to test sig=1e-1, 1e-3 and 1e-4)

In [0]:
# Sensitivity run without batch normalization (six runs, lamda=0.0134). The
# initialization in effect is whatever `_init_parameters` currently uses.
Training(three_layers, BN=False, trainingTimes=6, lamda=0.0134)

He initialization with BN

In [0]:
# Sensitivity run with batch normalization (six runs, lamda=0.0134). The
# initialization in effect is whatever `_init_parameters` currently uses.
Training(three_layers, BN=True, trainingTimes=6, lamda=0.0134)

As can be observed in the loss plots above, the networks are less sensitive to weight initialization when I train with batch normalization. It can be concluded that batch normalization significantly helps the network to learn even if we start off with a bad initialization.