# Optimization (Part 1)

In this activity, students will add Glorot weight initializtion (STEP 1), and Momentum (using velocity; STEP 2). Look for the locations to write this code after the (first) class definition for ANN (Students will add code in code cells 5 & 6)

After implementing these code cells, students should run the notebook and review the performance of the models on XOR.   Note that the random seeds for initialization are not constrained, so multiple repetitions should be accomplished to see how the various configuraitons perform.


In [1]:
import os
import numpy as np
import copy
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
#GLOBAL VARIABLES

EPS = np.finfo(np.float32).eps  #minimum floating point number for numerical stability in calculations
SHOW_EXAMPLES = False  #shows graphs and output for individually-trained models.
REPS=20  #number of times to repeat model training experiments

In [3]:
# debug printing tool
DEBUG = True
def debug(*kargs):
    if DEBUG:
        print(*kargs)

### Defining the activation functions and their derivatives

In [4]:
def sigmoid(x):
    return 1. / (1. + np.exp(-x))


def dSigmoid(x):  # derivative of sigmoid
    s = sigmoid(x)
    return np.multiply(s, (1-s))


def relu(z):  # rectified linear unit activation
    return np.maximum(0, z)


def dRelu(z):
    """ 
    Derivative of Rectified Linear Unit

    """
    return 1 * (z > 0)


##  Setting up the basic ANN class



In [5]:
class ANN:
    data = []
    layers = []
    inputWidth = 1
    outputWidth = 1

    class Layer:

        """class defining the elements of an ANN layer"""

        @staticmethod
        def gradl2norm(weight_vals):
            """returns the gradient of the l2 norm with respect to the weights"""
            return weight_vals
    
        @staticmethod
        def gradl1norm(weight_vals):
            """returns the gradient of the l1 norm with respect to the weights"""
            return np.sign(weight_vals)

        @staticmethod
        def l2norm(vals):
            """returns the l2 norm of the vals"""
            return np.linalg.norm(vals,ord=2)
    
        @staticmethod
        def l1norm(vals):
            """returns the l1 norm of the vals"""
            return np.linalg.norm(vals,ord=1)
        
        def __init__(self):
            self.w = []
            self.b = []
            self.lam = 0  #for weight regularization
            self.weightRegFunction = self.l2norm  #placeholder for regularization function
            self.weightRegGradFunction = self.gradl2norm   #placeholder for regularization function
            self.vel_w=[] # for standard momentum of weight gradients
            self.vel_b=[] # for standard momentum of bias gradients
            self.nodecount = []
            self.activation_fcn = []
            self.activation_fcn_derivative = []
            self.orderNumber = []
            self.previous = None  # link to previous layer
            self.next = None  # link to next layer
            

        def set_weights(self, w, b):
            """set the weights and bias for the layer.  Layer weights should have dimesion: (thislayer_nodecount, previouslayer_nodecount)
            the dimension of the bias should be (thislayer_nodecount,1)"""
            self.w = w
            self.b = b
            return self
        
            
        def initialize_velocity(self):
            assert self.w.size > 0
            assert self.b.size > 0
            self.vel_w = np.zeros(self.w.shape)  # same shape as hidden weight matrix [rows = to, columns = from]
            self.vel_b = np.zeros(self.b.shape)  # same shape as hidden biases (column vector)
        

        def set_lambda(self, lam):
            self.lam = lam

        def set_weightRegFunction(self, fcn, d_fcn):
            self.weightRegFunction = fcn
            self.weightRegGradFunction = d_fcn
            
        def set_activation(self, activation_fcn):
            self.activation_fcn = activation_fcn
            return self

        def set_activation_deriv(self, activation_fcn):
            if activation_fcn == sigmoid:
                self.activation_fcn_derivative = copy.deepcopy(dSigmoid)
            elif activation_fcn == relu:
                self.activation_fcn_derivative = copy.deepcopy(dRelu)
            else:
                self.activation_fcn_derivative = None
                
        def display_params(self):
            """displays the weights and biases of the layer (rows = to, colums = from)"""
            for outNum in range(self.w.shape[0]):
                print(self.w[outNum,:], "  ", self.b[outNum])
        

        def compute_pre_activation(self, inputs):
            net = np.dot(self.w, inputs) + self.b
            return net

        def compute_bias_gradient(self, gradient):
            g = np.mean(gradient, axis=1)[:, np.newaxis]  # no regularization
            return g

        def compute_weight_gradient(self, inputs, gradient):
            g = np.dot(gradient, inputs.T)
            g = g/inputs.shape[1]   #divide by m (batchsize)
            return g

        def compute_activation(self, net):
            return self.activation_fcn(net)

        def compute_activation_derivative(self, net):
            return self.activation_fcn_derivative(net)

        def compute_activation_gradient(self, d_activation, gradient):
            g = np.multiply(gradient, d_activation)
            return g

        def compute_forward(self, inputs):
            """Returns layer ouput from input (shape = [nodeCount, input]) of the weighted input plus bias
            input shape must be [lastlayer_nodeCount, samples] or [featurecount, samplecount] """
            net = self.compute_pre_activation(self, inputs)
            layer_out = self.compute_activation(net)
            return layer_out


        def regularization_grad_weights(self,vals):
            """computes the regularization cost for the current layer weights"""
            mylam = self.lam
            myregs = self.weightRegGradFunction(vals)
            return mylam*myregs
        
        
        def compute_layer_gradients(self, net, activation, gradient):
            """ computes the loss gradient with respect to desired output of the layer
            a set of desired targets is assumed to be matrix of shape [nodecount, samples]: SGD will have [nodecount,1]
            hidden_inputs is assumed to be a matrix of shape [hiddenNodeCount, samples]
            
            This follows algorithm 6.4 line by line in the book!
            
            """
            
            # f'(a(k))
            d_activation = self.compute_activation_derivative(net)  # derivative of sigmoid:  shape = [NodeCount, samples]
            
            # g <- g * f'(a(k))
            g_loss = self.compute_activation_gradient(d_activation, gradient)  # shape = [NodeCount, samples]  for outer layer
            
            # Delta_b(k) J = g (Take the mean across all samples (batch))
            g_loss_b = self.compute_bias_gradient(g_loss)  # mean gradient with respect to BIAS, shape = [NodeCount, 1]
            
            # Delta w(k) J = g * h(k-1) +lam*regularizationGrad w.r.t weights
            g_loss_w = self.compute_weight_gradient(activation, g_loss) + self.regularization_grad_weights(self.w) # [thisLayerNodecount,prevLayerOutputcount]  
            #NOTE - regularization grad weights NOT WORKING YET
            
            # g <- W(k).T * g
            g_loss_backprop = np.dot(self.w.T, g_loss)  # gradient to propagate back, shape = [hiddenNodeCount,samples]
            
            return g_loss_w, g_loss_b, g_loss_backprop


    def __init__(self):
        self.data = []
        self.layers = []
        self.inputWidth = 1
        self.outputWidth = 1
    
        
    def set_input_width(self, inputWidth):
        """defines the input layer width for the network"""
        self.inputWidth = inputWidth

    def add_layer(self, nodecount=1, activation_fcn=relu):
        """adds a layer to the neural network and returns the layer"""
        oldLayerCount = len(self.layers)
        thislayer = ANN.Layer()
        thislayer.orderNumber = oldLayerCount + 1
        if oldLayerCount > 0:  # other layers have been added already
            lastLayer = self.layers[-1]
            #lastLayer.display_params()
            thislayer.previous = lastLayer
            lastLayer.next = thislayer
            layerInputSize = lastLayer.w.shape[0]
        else:  # this will be the first layer
            layerInputSize = self.inputWidth
            
        thislayer.w = np.zeros((nodecount, layerInputSize))  #[NODECOUNT,FROM]
        thislayer.b = np.zeros((nodecount, 1 )) #[NODECOUNT,FROM]
        thislayer.vel_w = np.zeros(thislayer.w.shape)  # same shape as hidden weight matrix [rows = to, columns = from]
        thislayer.vel_b = np.zeros(thislayer.b.shape)  # same shape as hidden biases (column vector)

        thislayer.activation_fcn = activation_fcn
        thislayer.set_activation_deriv(activation_fcn)
        self.outputWidth = nodecount
        self.layers = self.layers + [thislayer]
        return thislayer
    
    def initialize(self, glorot = False, seed = None):
        """initialize weights & biases & velocity: overwrites current network parameters"""
        for layer in self.layers:
            layer.initialize_weights(glorot=glorot, seed=seed)
            layer.initialize_velocity()
            
    def setL1weightNormalization(self,lam=0):
        for idx,layer in enumerate(self.layers):
            layer.set_lambda(lam)
            layer.set_weightRegFunction(layer.l1norm,layer.gradl1norm)
            print(" Set Layer ", idx," weightNorm to gradl1norm with lambda = ", lam)
            

    def setL2weightNormalization(self,lam=0):
        for idx,layer in enumerate(self.layers):
            layer.set_lambda(lam)
            layer.set_weightRegFunction(layer.l2norm,layer.gradl2norm )
            print(" Set Layer ", idx," weightNorm to gradl2norm with lambda = ", lam)

            
    def summary(self):
        """displays a summary of the model"""
        tot_train_parameters = 0
        print("\n")
        print("Layer     Inshape     Outshape     Param #     LambdaReg")
        print("==========================================================")
        for lnum, layer in enumerate(self.layers):
            inshape = layer.w.shape[1]
            weightCount = layer.w.shape[0]*layer.w.shape[1]  #assume fully connected
            biasCount = layer.b.shape[0]
            thislayerparams = weightCount+biasCount
            tot_train_parameters += thislayerparams
            lam = layer.lam
            print("% 3d       % 3d         % 3d         %3d         %3f" %(lnum,inshape,biasCount,thislayerparams,lam))
        print("==========================================================")
        print("total trainable params: ",tot_train_parameters )
        
    def display_params(self):
        """displays the weights and biases of the network (rows = to, colums = from)"""
        print("\n")
        print("input width: ", self.inputWidth)
        for lnum, layer in enumerate(self.layers):
            print("Layer ",lnum)
            layer.display_params()
        print("output width: ", layer.w.shape[0])
                
                
    def forwardPropagation(self, inputs):
        """Compute forward pass of two layer network
        inputs are assumed to be (shape=[sampleCount,featureCount])
        returns a matrix of raw outputs with one row of output per node (shape=[sampleCount, outputNodeCount])
        Internal matrices are shaped for efficiency to avoid internal transposes (columns hold observations/samples) """

        # inputs and outputs will be transposed for efficiency during forwardPropagation and untransposed before returning

        nets = []
        activations = []
        layer_input = inputs.T

        for lnum, layer in enumerate(self.layers):
            # inputs = inputs + inputs
            layer_net = layer.compute_pre_activation(layer_input)
            nets.append(layer_net)

            layer_out = layer.compute_activation(layer_net)
            activations.append(layer_out)

            layer_input = layer_out
        raw_output = layer_out.T
        return raw_output, inputs, nets, activations

    def backPropagation(self, inputs, desiredOutputs, learningRate, momentum=0):
        w_grads = []
        b_grads = []
        # store nets and activations for each layer
        raw_output, _, nets, activations = self.forwardPropagation(inputs)
        layer_desired_out = desiredOutputs

        # Note: This is only part of the gradient
        layer_grad = desiredOutputs - raw_output
        layer_grad = layer_grad.T  #in order to match expectation for last layer output
        prev_layer_outputs = [inputs.T] + activations  #insert inputs onto activation stream for easy computations

        #  computation of full gradient handled inside the loop below
        for lnum, layer in reversed(list(enumerate(self.layers))):
            #get the input to this layer
            curr_layer_input=prev_layer_outputs[lnum]
            #get the gradients for the layer    
            w_grad, b_grad, loss_grad = layer.compute_layer_gradients(nets[lnum], curr_layer_input, layer_grad)    

            layer.update_Layer(w_grad * learningRate, b_grad * learningRate, momentum=momentum)
            layer_grad = loss_grad

    def predict(self, X, threshold=0.5):
        """Compute predictions using forward propagation for single binary classification at threshold
        X is a standard dataFrame without biases (shape=[observationCount,featureCount])
        returns a standard column vector of binary predictions in {0,1}: (shape=[observationCount, 1])"""
        raw_predictions, net_inputs, net_lst, activation_lst = self.forwardPropagation(X)
        preds = raw_predictions > threshold
        return preds

    def compute_mse_loss(self, inputs, desired_targets):
        """computes the (scalar) loss using MSE of a set of targets and sigmoid outputs
        inputs is assumed to be a matrix of shape [samples, features]
         desired_targets is assumed to be a matrix of shape [samples, 1]"""
        raw_outputs = self.forwardPropagation(inputs)[0]
        error = desired_targets - raw_outputs
        mse = np.dot(error.T, error) / error.size
        return mse

    def compute_loss(self, raw_outputs, desired_targets):
        """
        computes the (scalar) loss using Binary Cross Entropy of a set raw ouputs and desired targets 
        raw_outputs and desired_targets are assumed to be arrays of shape [samples, 1]
        """
        y = desired_targets
        bce = -np.mean( (y*np.log(raw_outputs+EPS)) + ((1-y)*np.log(1-raw_outputs+EPS)) )
        #using EPS to prevent log(0) numerical problems
        return bce
    
    
    def fit(self, tngInputs, tngTargets, valInputs, valTargets, learningRate, learningRateDecay,
            batchsize = 1, momentum=0, valPatience=0, tolerance=1e-2, maxEpochs = 100, verbose = True):
        """fit model to map tngInputs to tngTargets. If valPatience > 0 then use early stopping on valInputs & valTargets
        returns training loss history and val loss history """
        done = False
        best_model_so_far = self  #store existing model

        tng_loss_history = []
        val_loss_history = []
        if verbose:
            print("Training Model...")
        epoch = 0
        #get current tng performance
        tng_out_raw, _, _, _ = self.forwardPropagation(tngInputs)
        tngPreds = self.predict(tngInputs)
        tngCorrect = tngTargets == tngPreds
        curr_train_loss = self.compute_loss(tng_out_raw, tngTargets).item()
        tng_loss_history.append(curr_train_loss)

        #get current val performance
        val_out_raw, _, _, _ = self.forwardPropagation(valInputs)
        valPreds = self.predict(valInputs)
        prev_val_loss = self.compute_loss(val_out_raw, valTargets).item()
        val_loss_history.append(prev_val_loss)
        val_epochs_nonimproved = 0
        training_count = tngInputs.shape[0]
        if batchsize>training_count: batchsize=training_count #prevent sampling beyond training size
            
        
        while not done:
            epoch+=1
            if epoch>maxEpochs: 
                done = True
            learningRate = learningRate * learningRateDecay
            #eval training performance
            tng_out_raw, _, _, _ = self.forwardPropagation(tngInputs)
            tngPreds = self.predict(tngInputs)
            tngCorrect = tngTargets == tngPreds
            curr_train_loss = self.compute_loss(tng_out_raw, tngTargets).item()
            tng_loss_history.append(curr_train_loss)
            #evaluate validation performance
            val_out_raw, _, _, _ = self.forwardPropagation(valInputs)
            valPreds = self.predict(valInputs)
            cur_val_loss = self.compute_loss(val_out_raw, valTargets).item()

            if cur_val_loss < tolerance:  #regular stopping
                done = True
                if verbose:
                    print(" --- Regular Stopping due to val loss < tolerance; val loss:", cur_val_loss)
                break
            
            # run an epoch of backprop
            #shuffle the indexes of the inputs & targets simultaneously
            order=np.random.permutation(training_count)
            #debug("range check ", np.arange(training_count))
            #debug("tng count:", training_count, ";   tng index order: ", order)
            first_tng_index=0
            last_tng_index = batchsize
            
            tinp = tngInputs[order]
            ttar = tngTargets[order]
            
            while last_tng_index<=training_count:
                #get a batch
                batchIn = tinp[first_tng_index:last_tng_index,:]
                batchTar = ttar[first_tng_index:last_tng_index,:]
                #train on the batch using backprop
                self.backPropagation(batchIn, batchTar, learningRate=learningRate, momentum=momentum)
                first_tng_index+=batchsize
                last_tng_index+=batchsize
                #handle mis-aligned training set sizes
                if first_tng_index < training_count-1 and last_tng_index>training_count-1:
                    batchIn=tinp[first_tng_index:training_count,:]
                    batchTar=ttar[first_tng_index:training_count,:]
                    self.backPropagation(batchIn, batchTar, learningRate=learningRate, momentum=momentum)
            

            # Early Stopping via VAL loss improvement
            # if validation loss has not improved in patience epochs then stop
            if cur_val_loss < prev_val_loss:
                val_epochs_nonimproved = 0
                prev_val_loss = cur_val_loss
                #TODO:  save best model info
                best_model_so_far = self
                tempTngHistory = tng_loss_history
                tempValHistory = val_loss_history
                
            else:
                val_epochs_nonimproved+=1
                if valPatience > 0 and val_epochs_nonimproved > valPatience :
                    if verbose:
                        print(" --- EARLY STOPPING ACTIVATED AT val_epochs_nonimproved =  ",val_epochs_nonimproved)
                    #replace this model with best model so far (based on val loss)
                    self = best_model_so_far
                    val_loss_history = tempValHistory
                    tng_loss_history = tempTngHistory
                    done=True

            val_loss_history.append(cur_val_loss)
            
        if verbose:
            print("Training Complete!")

        return tng_loss_history, val_loss_history

# STEP 1 Student Coding - Glorot initialization:

Add Glorot weight initialization in Layer subclass function ```def initialize_weights(self, glorot=False, seed=None )```

Note that ```m``` (fan-in) and ```n``` (fan-out) are pre-defined in the code above the student code location and available for your use.

The function to be implemented is to select the correct quantity of weights to set ```w``` using values drawn from a uniform distribution between [-sqrt(6) / sqrt(fan-in + fan-out), sqrt(6) / sqrt(fan-in + fan-out)]

(Glorot is not necessarily the best or the state of the art... the point of this exercise is to see *how* to change the initializtion scheme)


In [6]:
# Extending the ANN class to allow for weight initialization and momentum in the Layer class
class ANN(ANN):

    # Extending the Layer class to glorot weight initialization
    class Layer(ANN.Layer):

        def initialize_weights(self, glorot=False, seed=None ):
            assert self.w.size > 0
            assert self.b.size > 0
            
            biasShape = self.b.shape
            weightShape = self.w.shape
            
            #set biases 
            self.b = np.ones(self.b.shape)  #  biases (column vector of 1s)
            if self.activation_fcn == relu:
                self.b = self.b * 0.1  #for relu
            else:
                self.b = self.b * 0.0  # for sigmoid
            
            np.random.seed(seed)
            if glorot:  #use glorot initialization
                n = self.w.shape[0]
                m = self.w.shape[1]
                self.w = np.zeros(self.w.shape)   #placeholder
                
                ############# STUDENT CODE - ADD GLOROT INITIALIZATION OF WEIGHTS (NOT BIASES)##############
                
                #note:  the shape of self.w should be preserved... n-by-m
                
                # compute the upper and lower edges of the glorot-specified uniform distribution
                u_edge = np.sqrt(6/(m+n))
                
                # set self.w to values drawn from a uniform distribution
                # hint: use np.random.uniform(loweredge,upperedge,size) where size is the numpy shape of the weights (self.w)
                self.w = np.random.uniform(low=-u_edge,high=u_edge,size=(self.w.shape))
                
                ###################################### END STUDENT CODE ####################################
                
                
            else:
                #standard initialization
                self.w = np.random.normal(size=(self.w.shape))  # hidden weight matrix [rows = to, columns = from]
                
            #asserting the correct shapes on self.b and self.w - if these fail the code may be incorrect
            assert (self.b.shape == biasShape)
            assert (self.w.shape == weightShape)
                
                

# STEP 2 Student Coding - Momentum:

Add momentum & velocity in Layer subclass function ```def update_Layer(self, weightUpdate, biasUpdate, momentum=0)```

note that the ```momentum``` is passed in during the function call and the values for velocity (from the previous training iteration) are stored in
```self.vel_w``` (velocity of the weight values) and ```self.vel_b```  (velocity of the bias values)


In [7]:
 # Extending the ANN class to allow for momentum in the Layer class
class ANN(ANN):

    # extending the layer class 
    class Layer(ANN.Layer):
       
        def update_Layer(self, weightUpdate, biasUpdate, momentum=0.):
            """Update weights and biases. weightUpdate is shape [thisLayerNodecount,prevLayerOutputcount];
            biasUpdate is shape [thisLayerNodecount,1]
            """
            weightShape = self.w.shape
            biasShape = self.b.shape
            if momentum == 0:  #note this if-else statement not required if written as math eq with momentum & velocity >> more efficient!
                self.w = self.w + weightUpdate
                self.b = self.b + biasUpdate
            else:  #note this if-else statement not required if written as math eq with momentum & velocity
                # need to compute the new values for self.w and self.b using momentum
                # note: momentum is passed in but velocity must be updated & stored in self.vel_w and self.vel_b
                 
                ############# STUDENT CODING - ADD VELOCITY TO WEIGHT & BIAS UPDATE ##################
                
                # compute & store the new velocity for the weights (self.vel_w)
                self.vel_w = momentum*self.vel_w+weightUpdate
                
                # compute & store the new velocity for the biases (self.vel_b)
                self.vel_b = momentum*self.vel_b+biasUpdate
                
                # compute & store the weights (self.w) using the previous weights and the new weight velocity (self.vel_w)
                self.w = self.w + self.vel_w
                
                # compute & store the biases (self.b) using the previous biases and the new bias velocity (self.vel_b)
                self.b = self.b + self.vel_b
                
                ############################ END STUDENT CODING  #####################################

            #confirm the shapes are correct after momentum
            assert (self.b.shape == biasShape)
            assert (self.w.shape == weightShape)

                

## Helper functions - Visualization

For displaying graphical output

In [8]:
def dataplotter(featureData, labelData, title):
    '''plot annotated points to show where the boolean inputs lie on the graph''' 

    fig, ax = plt.subplots()

    xmin, xmax = np.min(featureData[:, 0]) - 0.1, np.max(featureData[:, 0]) + 0.1
    ymin, ymax = np.min(featureData[:, 1]) - 0.1, np.max(featureData[:, 1]) + 0.1

        
#     ax.scatter(featureData[:, 0], featureData[:, 1], color='b', alpha=0.2)
    for i, txt in enumerate(labelData):
        ax.annotate(txt.item(), (featureData[i, 0], featureData[i, 1]))

        # adjust the axis & add labels for the graph
    plt.axis([xmin, xmax, ymin, ymax])
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.title(title)
    plt.show()
    




def makeDecisionBoundaryBool2(model, featureData, labelData, title):
    '''Build decision boundary figrue for 2-input, 1-output boolean logic functions
    Note that this assumes a hard sigmoid was used and establishes a cutoff at 0.5
    for predicting 0 or 1'''
    cutoff = 0.5  # 0.5 for Sigmoid. 0.0 for TANH
    fig, ax = plt.subplots()

    xmin, xmax = np.min(featureData[:, 0]) - 0.1, np.max(featureData[:, 0]) + 0.1
    ymin, ymax = np.min(featureData[:, 1]) - 0.1, np.max(featureData[:, 1]) + 0.1

    # Create filled countour map to color both sides of the boundary
    x = np.linspace(xmin, xmax, 200)
    y = np.linspace(ymin, ymax, 200)
    X, Y = np.meshgrid(x, y)
    grid = np.c_[X.ravel(), Y.ravel()]
    preds = model.predict(grid)  # get predictions
    z = preds.reshape(X.shape) > cutoff  # cutoff on predictions to return boolean output
    plt.contourf(X, Y, z, cmap='YlOrBr')

    # add annotated points to show where the boolean inputs lie on the graph
    ax.scatter(featureData[:, 0], featureData[:, 1], color='b', alpha=0.5)
    for i, txt in enumerate(labelData):
        ax.annotate(txt.item(), (featureData[i, 0], featureData[i, 1]))

        # adjust the axis & add labels for the graph
    plt.axis([xmin, xmax, ymin, ymax])
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.title(title)
    plt.show()


def show2dFunctionOutput(model_function, featureData, labelData, title):
    """display results of arbitrary model function on 2-input (x1,x2) , 1-output (z) graphs"""
    # cutoff = 0.5  # 0.5 for Sigmoid. 0.0 for TANH
    fig, ax = plt.subplots(figsize=(8,5))

    xmin, xmax = np.min(featureData[:, 0]) - 0.1, np.max(featureData[:, 0]) + 0.1
    ymin, ymax = np.min(featureData[:, 1]) - 0.1, np.max(featureData[:, 1]) + 0.1

    # Create filled countour map to color both sides of the boundary
    x = np.linspace(xmin, xmax, 200)
    y = np.linspace(ymin, ymax, 200)
    X, Y = np.meshgrid(x, y)
    grid = np.c_[X.ravel(), Y.ravel()]
    outputs, _, _, _ = model_function(grid)  # get predictions
    z = outputs.reshape(X.shape)  # reshape predictions for 2d representation
    plotlevels = np.linspace(0.,1., 25)  # split colors between 0 and 1
    CS = plt.contourf(X, Y, z, levels = plotlevels, cmap='YlOrBr')

    # add annotated points to show where the boolean inputs lie on the graph
    ax.scatter(featureData[:, 0], featureData[:, 1], color='b', alpha=0.5)
    for i, txt in enumerate(labelData):
        ax.annotate(txt.item(), (featureData[i, 0], featureData[i, 1]))

    
    cbar = plt.colorbar(CS)
    cbar.ax.set_ylabel('raw network output')
    # adjust the axis & add labels for the graph
    plt.axis([xmin, xmax, ymin, ymax])
    plt.ylabel('x2')
    plt.xlabel('x1')

    plt.title(title)
    plt.show()
    
def showLossHistory(tng_loss_history=[],val_loss_history=[],semilog=True,plotname=""):
    plt.figure()
    if semilog:
        plt.semilogy(tng_loss_history,'r', label = "Training Loss")
        plt.semilogy(val_loss_history, 'b', label = "Validation Loss")
        plt.ylabel("Loss (Binary Cross-entropy) - semilog scale")

    else:    
        plt.plot(tng_loss_history,'r', label = "Training Loss")
        plt.plot(val_loss_history, 'b', label = "Validation Loss")
        plt.ylabel("Loss (Binary Cross-entropy)")
    plt.xlabel("iteration")
    plt.legend()
    plt.title(plotname+" Loss over iterations")
    plt.show() 
    

---

### Helper function to produce datasets for logic gates

In [9]:
def get_input_output_data(gate='XOR'):
    """ Two dimensional inputs for logic gates

    Parameters
    ----------
    gate : str
        Must be either AND, OR, XOR

    Returns
    -------
    X : array-like, shape(samples, features)
        Two dim input for logic gates

    truth[gate] : array-like, shapes(samples, )
        The truth value for this logic gate

    """

    X = np.array([[0., 0.],
                  [0., 1.],
                  [1., 0.],
                  [1., 1.]])

    truth = {
        'AND': np.array([0, 0, 0, 1]),
        'OR': np.array([0, 1, 1, 1]),
        'XOR': np.array([0, 1, 1, 0])
    }

    return X, truth[gate][:, np.newaxis]

In [10]:
X, Y = get_input_output_data(gate='XOR')

trainX = X
trainY = Y
valX = X
valY = Y


### Helper function to instantiate an ANN from parameters

In [11]:
def make_ANN_model(input_width = 2,
                   layer_widths = [2,1],
                   layer_activiations = [sigmoid,sigmoid],
                   glorot = False,
                   verbose = True):
    model = ANN()
    if verbose:
        print(list(zip(layer_widths,layer_activiations)))
        
    model.set_input_width(input_width)
    for lnum,(layerWidth,layerActivation) in enumerate(zip(layer_widths,layer_activiations)):
        model.add_layer(nodecount = layerWidth, activation_fcn=layerActivation)
    model.initialize(glorot=glorot)
    return model
        
 
    
    

---

### Helper function to train an ANN from parameters.  Allows for separate training and validation data

In [12]:
def train_ANN_model(model=None,
                    trainX = None, trainY = None,
                    valX=None,valY=None,
                    learning_rate=1.0,
                    lr_decay=0.999,
                    batchsize = 1,
                    momentum = 0,
                    valPatience=0,
                    maxEpochs = 100,
                    verbose = True,
                    showgraphs = True,
                    modelTitle = ""):
    
    

    preds = model.predict(valX)
    correct = valY == preds
    if verbose:
        textToPrint = modelTitle + " BEFORE TRAINING (randomized weights)"
        print(textToPrint)
        model.display_params()
        
    tng_loss_history,val_loss_history = model.fit(tngInputs=trainX, tngTargets=trainY, valInputs=valX, valTargets = valY,
                                                  learningRate=learning_rate, learningRateDecay=lr_decay, batchsize=batchsize,
                                                  momentum = momentum, valPatience=valPatience,
                                                  tolerance=1e-1, maxEpochs=maxEpochs, verbose=verbose)

    preds = model.predict(valX)
    correct = valY == preds

    if verbose:
        textToPrint = modelTitle + " AFTER TRAINING (learned model weights)"
        print(textToPrint)
        model.display_params()

    if showgraphs:
        show2dFunctionOutput(model.forwardPropagation, X, Y, modelTitle + " Raw Response of Network")
        makeDecisionBoundaryBool2(model, X, Y, modelTitle + " XOR predictions from Network")
        showLossHistory(tng_loss_history,val_loss_history, modelTitle + " Loss over Iterations")

    return tng_loss_history,val_loss_history


### Helper function to repeatedly train an ANN with model and training params. Reports results 

In [13]:
def repeat_model_eval(model_params, train_function_params, reps = 1, progress_reporting=True):
    ''' Repeatedly instantiate and fit a model multiple times and report results
    params:
    model_params:  parameters to pass to the model instantiation function (make_ANN_model())
    train_function_params:  parameters to pass to the 
    '''
    totalCorrect = 0
    countPerfectVal = 0
    print("running "+str(reps)+" iterations:")

    for idx in range(reps):
        model = make_ANN_model(**model_params)  #activations at each layer)

        tng_loss_history,val_loss_history= train_ANN_model(model,**train_function_params)

        preds = model.predict(valX)
        correct = valY == preds
        totalCorrect=totalCorrect+correct
        final_val_loss = round(val_loss_history[-1], 4)
        countPerfectVal = countPerfectVal+(sum(correct)==4)
        print(idx, end=": ")
        print("epoch count: "+str(len(val_loss_history)), end = "; ")
        print("val loss: " +str(final_val_loss), end="; ")
        print("val preds correct: "+str(sum(correct).item()))

    correctness = sum(totalCorrect).item()/(reps*4)
    
    print("\n"+str(reps)+
          "x reps Correctness = "+
          str(correctness)+
          ", Perfect Count = "+str(countPerfectVal.item()))
    return correctness,countPerfectVal.item()

---

# Training the networks

Here we initialize a network without, and with Glorot initialization.

Students will need to implement the code in the ANN class before running the sections below.

We will do this by running a set of 8 experiments over 3 levels with 2 factors per level
Modelsize = {Small, Large}
Glorot Weight Initialization = {False, True}
Momentum = {False, True}

The performance score (average correctness and number of perfectly correct classification scores on XOR)
will be captured in the array listed below

In [14]:
#placeholder array for experiment results
# 3 levels with 2 factors per level Modelsize = {Small, Large} Glorot Weight Initialization = {False, True} Momentum = {False, True}
model_experiment_results=np.zeros([2,2,2,2])
#first index: Model size: 0 = small, 1 = large
#second index: Glorot initialization: 0 = False, 1= True
#third index: Momentum: 0 = False, 1= True
#4th index contains performance:  0:  average correctness;  1:  number of perfectly correct classification scores



---

## Simple network *without* Glorot initialization
this is a 2,1 fully connected sigmoid network.  works some of the time...

In [15]:
smallmodel_layer_widths = [2,1]
smallmodel_layer_activations = [relu, sigmoid]

run the model multiple times to determine performance

In [16]:

model_params = {'input_width':2,            
               'layer_widths':smallmodel_layer_widths,  #number of nodes in each layer
               'layer_activiations':smallmodel_layer_activations,
               'glorot':False,
               'verbose':False}
train_function_params = {
    'trainX':trainX,'trainY':trainY,'valX':valX,'valY':valY,
    'learning_rate':0.5,
    'lr_decay':0.999,
    'batchsize':1,
    'momentum' : 0,
    'valPatience' : 0,
    'maxEpochs' : 5000,
    'verbose' : False,
    'showgraphs':False}
model_experiment_results[0,0,0,:] = repeat_model_eval(model_params =model_params ,train_function_params = train_function_params , reps = REPS )

running 20 iterations:
0: epoch count: 5002; val loss: 0.3538; val preds correct: 3
1: epoch count: 115; val loss: 0.1; val preds correct: 4
2: epoch count: 151; val loss: 0.1; val preds correct: 4
3: epoch count: 230; val loss: 0.1009; val preds correct: 4
4: epoch count: 5002; val loss: 0.4808; val preds correct: 3
5: epoch count: 5002; val loss: 0.3543; val preds correct: 3
6: epoch count: 5002; val loss: 0.3544; val preds correct: 3
7: epoch count: 187; val loss: 0.1007; val preds correct: 4
8: epoch count: 153; val loss: 0.1007; val preds correct: 4
9: epoch count: 5002; val loss: 0.3538; val preds correct: 3
10: epoch count: 5002; val loss: 0.4804; val preds correct: 3
11: epoch count: 165; val loss: 0.1005; val preds correct: 4
12: epoch count: 5002; val loss: 0.481; val preds correct: 3
13: epoch count: 161; val loss: 0.1007; val preds correct: 4
14: epoch count: 5002; val loss: 0.6931; val preds correct: 2
15: epoch count: 145; val loss: 0.1001; val preds correct: 4
16: epoch 

The code below can show an example of model fitting if SHOW_EXAMPLES is set to True in the global variable section (top of jupyter notebook)

In [17]:
if SHOW_EXAMPLES:
    model = make_ANN_model(input_width = 2,
                           layer_widths = smallmodel_layer_widths,  #number of nodes in each layer
                           layer_activiations = smallmodel_layer_activations,
                           glorot=False)  #activations at each layer

    print("empty model info")
    model.summary()

    X, Y = get_input_output_data(gate='XOR')

    trainX = X
    trainY = Y
    valX = X
    valY = Y

    tng_loss_history,val_loss_history= train_ANN_model(model,trainX,trainY,valX,valY,
                    learning_rate=1.0,lr_decay=0.999, batchsize=1, 
                    momentum = 0, valPatience = 0, maxEpochs = 5000, modelTitle="Layers 2-1, no glorot, no momentum")

# Simple network with Glorot initialization
this is a 2,1 fully connected sigmoid network.  works some of the time...

but works more often once glorot initializations are used

In [18]:

model_params = {'input_width':2,            
               'layer_widths':smallmodel_layer_widths,  #number of nodes in each layer
               'layer_activiations':smallmodel_layer_activations,
               'glorot':True,
               'verbose':False}
train_function_params = {
    'trainX':trainX,'trainY':trainY,'valX':valX,'valY':valY,
    'learning_rate':0.5,
    'lr_decay':0.999,
    'batchsize':1,
    'momentum' : 0,
    'valPatience' : 0,
    'maxEpochs' : 5000,
    'verbose' : False,
    'showgraphs':False}
model_experiment_results[1,0,0,:]=repeat_model_eval(model_params =model_params ,train_function_params = train_function_params , reps = REPS )

running 20 iterations:
0: epoch count: 5002; val loss: 0.4807; val preds correct: 3
1: epoch count: 5002; val loss: 0.3538; val preds correct: 3
2: epoch count: 203; val loss: 0.1015; val preds correct: 4
3: epoch count: 5002; val loss: 0.3541; val preds correct: 3
4: epoch count: 5002; val loss: 0.4808; val preds correct: 3
5: epoch count: 187; val loss: 0.1009; val preds correct: 4
6: epoch count: 5002; val loss: 0.354; val preds correct: 3
7: epoch count: 5002; val loss: 0.3542; val preds correct: 3
8: epoch count: 180; val loss: 0.1009; val preds correct: 4
9: epoch count: 5002; val loss: 0.4803; val preds correct: 3
10: epoch count: 188; val loss: 0.1007; val preds correct: 4
11: epoch count: 237; val loss: 0.1009; val preds correct: 4
12: epoch count: 176; val loss: 0.1001; val preds correct: 4
13: epoch count: 5002; val loss: 0.4804; val preds correct: 3
14: epoch count: 211; val loss: 0.1002; val preds correct: 4
15: epoch count: 5002; val loss: 0.4817; val preds correct: 3
16:

In [19]:
if SHOW_EXAMPLES:
    model = make_ANN_model(input_width = 2,
                           layer_widths = smallmodel_layer_widths,  #number of nodes in each layer
                           layer_activiations = smallmodel_layer_activations,
                           glorot=True)  #activations at each layer

    print("empty model info")
    model.summary()

    X, Y = get_input_output_data(gate='XOR')

    trainX = X
    trainY = Y
    valX = X
    valY = Y

    tng_loss_history,val_loss_history=train_ANN_model(model,trainX,trainY,valX,valY,
                                                      learning_rate=1.0,lr_decay=0.999, batchsize=1, 
                                                      momentum = 0, valPatience = 0, maxEpochs = 5000)

# Simple Network *without* Glorot initialization and with Momentum
this is a 2,1 fully connected sigmoid network. works more often and probably trains quicker...

you can vary the momentum parameter which should be in the interval \[0,1) 

In [20]:
momentum = 0.1  #value 0 to less than 1.   Too high overshoots!

model_params = {'input_width':2,            
               'layer_widths':smallmodel_layer_widths,  #number of nodes in each layer
               'layer_activiations':smallmodel_layer_activations,
               'glorot':False,
               'verbose':False}
train_function_params = {
    'trainX':trainX,'trainY':trainY,'valX':valX,'valY':valY,
    'learning_rate':0.5,
    'lr_decay':0.999,
    'batchsize':1,
    'momentum' : momentum,
    'valPatience' : 0,
    'maxEpochs' : 5000,
    'verbose' : False,
    'showgraphs':False}
model_experiment_results[0,1,0,:]=repeat_model_eval(model_params =model_params ,train_function_params = train_function_params , reps = REPS )

running 20 iterations:
0: epoch count: 5002; val loss: 0.4804; val preds correct: 3
1: epoch count: 5002; val loss: 0.3529; val preds correct: 3
2: epoch count: 5002; val loss: 0.3533; val preds correct: 3
3: epoch count: 189; val loss: 0.1; val preds correct: 4
4: epoch count: 5002; val loss: 0.4807; val preds correct: 3
5: epoch count: 5002; val loss: 0.4803; val preds correct: 3
6: epoch count: 5002; val loss: 0.3537; val preds correct: 3
7: epoch count: 5002; val loss: 0.3536; val preds correct: 3
8: epoch count: 5002; val loss: 0.4806; val preds correct: 3
9: epoch count: 172; val loss: 0.1008; val preds correct: 4
10: epoch count: 92; val loss: 0.1007; val preds correct: 4
11: epoch count: 201; val loss: 0.1016; val preds correct: 4
12: epoch count: 192; val loss: 0.1009; val preds correct: 4
13: epoch count: 133; val loss: 0.1015; val preds correct: 4
14: epoch count: 5002; val loss: 0.4805; val preds correct: 3
15: epoch count: 114; val loss: 0.101; val preds correct: 4
16: epo

# Simple Network with Glorot initialization and Momentum
this is a 2,1 fully connected sigmoid network. works more often and probably trains quicker...

you can vary the momentum parameter which should be in the interval \[0,1) 

In [21]:
momentum = 0.1  #value 0 to less than 1.   Too high overshoots!

model_params = {'input_width':2,            
               'layer_widths':smallmodel_layer_widths,  #number of nodes in each layer
               'layer_activiations':smallmodel_layer_activations,
               'glorot':True,
               'verbose':False}
train_function_params = {
    'trainX':trainX,'trainY':trainY,'valX':valX,'valY':valY,
    'learning_rate':0.5,
    'lr_decay':0.999,
    'batchsize':1,
    'momentum' : momentum,
    'valPatience' : 0,
    'maxEpochs' : 5000,
    'verbose' : False,
    'showgraphs':False}
model_experiment_results[1,1,0,:]=repeat_model_eval(model_params =model_params ,train_function_params = train_function_params , reps = REPS )

running 20 iterations:
0: epoch count: 5002; val loss: 0.3534; val preds correct: 3
1: epoch count: 5002; val loss: 0.4805; val preds correct: 3
2: epoch count: 5002; val loss: 0.3533; val preds correct: 3
3: epoch count: 5002; val loss: 0.4805; val preds correct: 3
4: epoch count: 5002; val loss: 0.3539; val preds correct: 3
5: epoch count: 143; val loss: 0.101; val preds correct: 4
6: epoch count: 5002; val loss: 0.4801; val preds correct: 3
7: epoch count: 5002; val loss: 0.3538; val preds correct: 3
8: epoch count: 5002; val loss: 0.6931; val preds correct: 2
9: epoch count: 5002; val loss: 0.4806; val preds correct: 3
10: epoch count: 5002; val loss: 0.6931; val preds correct: 2
11: epoch count: 5002; val loss: 0.3534; val preds correct: 3
12: epoch count: 5002; val loss: 0.4807; val preds correct: 3
13: epoch count: 197; val loss: 0.1004; val preds correct: 4
14: epoch count: 5002; val loss: 0.3534; val preds correct: 3
15: epoch count: 114; val loss: 0.1013; val preds correct: 4

In [22]:
if SHOW_EXAMPLES:
    model = make_ANN_model(input_width = 2,
                           layer_widths = smallmodel_layer_widths,  #number of nodes in each layer
                           layer_activiations = smallmodel_layer_activations,
                          glorot=True)  #activations at each layer


    print("empty model info")
    model.summary()

    X, Y = get_input_output_data(gate='XOR')

    trainX = X
    trainY = Y
    valX = X
    valY = Y

    tng_loss_history,val_loss_history=train_ANN_model(model,trainX,trainY,valX,valY,
                                                      learning_rate=1.0,lr_decay=0.999, batchsize=1,
                                                      momentum = 0.7, valPatience = 0, maxEpochs = 5000)

# Higher Capacity network
 a 5-3-1 network should learn easier.

In [23]:
largemodel_layer_widths = [5,3,1]
largemodel_layer_activations = [relu,relu,sigmoid]

 
## First lets try the larger network without Glorot or momentum ...

In [24]:
momentum = 0.1  #value 0 to less than 1.   Too high overshoots!

model_params = {'input_width':2,            
               'layer_widths':largemodel_layer_widths,  #number of nodes in each layer
               'layer_activiations':largemodel_layer_activations,
               'glorot':False,
               'verbose':False}
train_function_params = {
    'trainX':trainX,'trainY':trainY,'valX':valX,'valY':valY,
    'learning_rate':0.5,
    'lr_decay':0.99,
    'batchsize':1,
    'momentum' : 0,
    'valPatience' : 0,
    'maxEpochs' : 5000,
    'verbose' : False,
    'showgraphs':False}
model_experiment_results[0,0,1,:]=repeat_model_eval(model_params =model_params ,train_function_params = train_function_params , reps = REPS )

running 20 iterations:
0: epoch count: 137; val loss: 0.1005; val preds correct: 4
1: epoch count: 5002; val loss: 4.9356; val preds correct: 2
2: epoch count: 110; val loss: 0.1002; val preds correct: 4
3: epoch count: 5002; val loss: 0.6901; val preds correct: 3
4: epoch count: 5002; val loss: 0.3106; val preds correct: 4
5: epoch count: 195; val loss: 0.1004; val preds correct: 4
6: epoch count: 30; val loss: 0.1022; val preds correct: 4
7: epoch count: 62; val loss: 0.1002; val preds correct: 4
8: epoch count: 57; val loss: 0.1037; val preds correct: 4
9: epoch count: 72; val loss: 0.103; val preds correct: 4
10: epoch count: 137; val loss: 0.1; val preds correct: 4
11: epoch count: 147; val loss: 0.1; val preds correct: 4
12: epoch count: 134; val loss: 0.1004; val preds correct: 4
13: epoch count: 5002; val loss: 0.6931; val preds correct: 2
14: epoch count: 5002; val loss: 0.1091; val preds correct: 4
15: epoch count: 64; val loss: 0.1005; val preds correct: 4
16: epoch count: 5

In [25]:
if SHOW_EXAMPLES:
    model = make_ANN_model(input_width = 2,
                       layer_widths = largemodel_layer_widths,  #number of nodes in each layer
                       layer_activiations = largemodel_layer_activations)  #activations at each layer

    print("empty model info")
    model.summary()

    X, Y = get_input_output_data(gate='XOR')

    trainX = X
    trainY = Y
    valX = X
    valY = Y

    tng_loss_history,val_loss_history=train_ANN_model(model,trainX,trainY,valX,valY,
                                                      learning_rate=0.5,lr_decay=1.0, batchsize=1, 
                                                      valPatience = 0, maxEpochs = 5000)

## Large model with Glorot but no momentum

In [26]:
momentum = 0  #value 0 to less than 1.   Too high overshoots!

model_params = {'input_width':2,            
               'layer_widths':largemodel_layer_widths,  #number of nodes in each layer
               'layer_activiations':largemodel_layer_activations,
               'glorot':True,
               'verbose':False}
train_function_params = {
    'trainX':trainX,'trainY':trainY,'valX':valX,'valY':valY,
    'learning_rate':0.5,
    'lr_decay':0.99,
    'batchsize':1,
    'momentum' : momentum,
    'valPatience' : 0,
    'maxEpochs' : 5000,
    'verbose' : False,
    'showgraphs':False}
model_experiment_results[1,0,1,:]=repeat_model_eval(model_params =model_params ,train_function_params = train_function_params , reps = REPS )

running 20 iterations:
0: epoch count: 5002; val loss: 0.1047; val preds correct: 4
1: epoch count: 72; val loss: 0.1017; val preds correct: 4
2: epoch count: 98; val loss: 0.1002; val preds correct: 4
3: epoch count: 194; val loss: 0.1001; val preds correct: 4
4: epoch count: 159; val loss: 0.1002; val preds correct: 4
5: epoch count: 56; val loss: 0.1003; val preds correct: 4
6: epoch count: 304; val loss: 0.1001; val preds correct: 4
7: epoch count: 133; val loss: 0.1004; val preds correct: 4
8: epoch count: 64; val loss: 0.1002; val preds correct: 4
9: epoch count: 106; val loss: 0.1011; val preds correct: 4
10: epoch count: 133; val loss: 0.1003; val preds correct: 4
11: epoch count: 5002; val loss: 0.4882; val preds correct: 3
12: epoch count: 5002; val loss: 0.3703; val preds correct: 3
13: epoch count: 70; val loss: 0.1008; val preds correct: 4
14: epoch count: 5002; val loss: 0.1257; val preds correct: 4
15: epoch count: 253; val loss: 0.1; val preds correct: 4
16: epoch count

In [27]:
if SHOW_EXAMPLES:
    model = make_ANN_model(input_width = 2,
                   layer_widths = largemodel_layer_widths,  #number of nodes in each layer
                   layer_activiations = largemodel_layer_activations, #activations at each layer
                   glorot=True)  

    print("empty model info")
    model.summary()

    X, Y = get_input_output_data(gate='XOR')

    trainX = X
    trainY = Y
    valX = X
    valY = Y

    tng_loss_history,val_loss_history=train_ANN_model(model,trainX,trainY,valX,valY,
                                                      learning_rate=0.5,lr_decay=1.0, batchsize=1, 
                                                      valPatience = 0, maxEpochs = 5000)

# and with the model using standard intialization (not Glorot) and with momentum

In [28]:
momentum = 0.1  #value 0 to less than 1.   Too high overshoots!

model_params = {'input_width':2,            
               'layer_widths':largemodel_layer_widths,  #number of nodes in each layer
               'layer_activiations':largemodel_layer_activations,
               'glorot':False,
               'verbose':False}
train_function_params = {
    'trainX':trainX,'trainY':trainY,'valX':valX,'valY':valY,
    'learning_rate':0.5,
    'lr_decay':0.99,
    'batchsize':1,
    'momentum' : momentum,
    'valPatience' : 0,
    'maxEpochs' : 5000,
    'verbose' : False,
    'showgraphs':False}
model_experiment_results[0,1,1,:]=repeat_model_eval(model_params =model_params ,train_function_params = train_function_params , reps = REPS )

running 20 iterations:
0: epoch count: 73; val loss: 0.1006; val preds correct: 4
1: epoch count: 183; val loss: 0.1004; val preds correct: 4
2: epoch count: 116; val loss: 0.1002; val preds correct: 4
3: epoch count: 5002; val loss: 0.4842; val preds correct: 3
4: epoch count: 30; val loss: 0.1013; val preds correct: 4
5: epoch count: 93; val loss: 0.1005; val preds correct: 4
6: epoch count: 5002; val loss: 0.4832; val preds correct: 3
7: epoch count: 5002; val loss: 0.3542; val preds correct: 3
8: epoch count: 5002; val loss: 2.4348; val preds correct: 3
9: epoch count: 5002; val loss: 0.6931; val preds correct: 2
10: epoch count: 5002; val loss: 0.2089; val preds correct: 4
11: epoch count: 34; val loss: 0.1036; val preds correct: 4
12: epoch count: 5002; val loss: 0.1588; val preds correct: 4
13: epoch count: 266; val loss: 0.1001; val preds correct: 4
14: epoch count: 28; val loss: 0.101; val preds correct: 4
15: epoch count: 5002; val loss: 0.1919; val preds correct: 4
16: epoch

In [29]:
if SHOW_EXAMPLES:
    model = make_ANN_model(input_width = 2,
                       layer_widths = largemodel_layer_widths,  #number of nodes in each layer
                       layer_activiations = largemodel_layer_activations, #activations at each layer
                       glorot=False)  

    print("empty model info")
    model.summary()

    X, Y = get_input_output_data(gate='XOR')

    trainX = X
    trainY = Y
    valX = X
    valY = Y

    tng_loss_history,val_loss_history=train_ANN_model(model,trainX,trainY,valX,valY,
                                                      learning_rate=0.5,lr_decay=1.0, batchsize=1, 
                                                      valPatience = 0, momentum = 0.7,maxEpochs = 5000)

## And finally, the large model initialized with glorot and trained with momentum

In [30]:
momentum = 0.1  #value 0 to less than 1.   Too high overshoots!

model_params = {'input_width':2,            
               'layer_widths':largemodel_layer_widths,  #number of nodes in each layer
               'layer_activiations':largemodel_layer_activations,
               'glorot':True,
               'verbose':False}
train_function_params = {
    'trainX':trainX,'trainY':trainY,'valX':valX,'valY':valY,
    'learning_rate':0.5,
    'lr_decay':0.99,
    'batchsize':1,
    'momentum' : momentum,
    'valPatience' : 0,
    'maxEpochs' : 5000,
    'verbose' : False,
    'showgraphs':False}
model_experiment_results[1,1,1,:]=repeat_model_eval(model_params =model_params ,train_function_params = train_function_params , reps = REPS )

running 20 iterations:
0: epoch count: 133; val loss: 0.1; val preds correct: 4
1: epoch count: 47; val loss: 0.1003; val preds correct: 4
2: epoch count: 122; val loss: 0.1002; val preds correct: 4
3: epoch count: 49; val loss: 0.1026; val preds correct: 4
4: epoch count: 158; val loss: 0.1002; val preds correct: 4
5: epoch count: 5002; val loss: 0.1199; val preds correct: 4
6: epoch count: 5002; val loss: 0.3647; val preds correct: 3
7: epoch count: 308; val loss: 0.1001; val preds correct: 4
8: epoch count: 123; val loss: 0.1006; val preds correct: 4
9: epoch count: 5002; val loss: 0.396; val preds correct: 3
10: epoch count: 79; val loss: 0.1013; val preds correct: 4
11: epoch count: 5002; val loss: 0.3677; val preds correct: 3
12: epoch count: 43; val loss: 0.1016; val preds correct: 4
13: epoch count: 5002; val loss: 0.2585; val preds correct: 4
14: epoch count: 5002; val loss: 0.4876; val preds correct: 3
15: epoch count: 169; val loss: 0.1002; val preds correct: 4
16: epoch cou

In [31]:
if SHOW_EXAMPLES:
    model = make_ANN_model(input_width = 2,
                       layer_widths = largemodel_layer_widths,  #number of nodes in each layer
                       layer_activiations = largemodel_layer_activations, #activations at each layer
                       glorot=True)  

    print("empty model info")
    model.summary()

    X, Y = get_input_output_data(gate='XOR')

    trainX = X
    trainY = Y
    valX = X
    valY = Y

    tng_loss_history,val_loss_history=train_ANN_model(model,trainX,trainY,valX,valY,
                                                      learning_rate=0.5,lr_decay=1.0, batchsize=1, 
                                                      momentum = 0.7, valPatience = 0, maxEpochs = 5000)

## Results Reporting for all models

In [32]:
resultsDF = pd.DataFrame()
for s in [0,1]:
    for i in [0,1]:
        for m in [0,1]:
            avg_correctness, perfect_count = model_experiment_results[s,i,m,:]
            rowdata = {'sizeSmall': [s], 'initGlorot': [i], 'momentum': [m], 'correctness':[avg_correctness],'perfectCount':[perfect_count]}
            rowDF = pd.DataFrame(rowdata)
            resultsDF = pd.concat([resultsDF,rowDF ],ignore_index=True)
            
  
display(resultsDF)


Unnamed: 0,sizeSmall,initGlorot,momentum,correctness,perfectCount
0,0,0,0,0.85,9.0
1,0,0,1,0.9125,15.0
2,0,1,0,0.85,8.0
3,0,1,1,0.9,13.0
4,1,0,0,0.8625,9.0
5,1,0,1,0.975,18.0
6,1,1,0,0.775,4.0
7,1,1,1,0.95,16.0
