In [3]:
from ComputationalGraphPrimer import ComputationalGraphPrimer
import operator
import random
import numpy as np
from tqdm import tqdm

class myADAMMultiNeuron(ComputationalGraphPrimer):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

    def backprop_and_update_params_multineuron(self, y_error, class_labels):
            # backproped prediction error:
            pred_err_backproped_at_layers = {i : [] for i in range(1,self.num_layers-1)}  
            pred_err_backproped_at_layers[self.num_layers-1] = [y_error]
            for back_layer_index in reversed(range(1,self.num_layers)):
                input_vals = self.forw_prop_vals_at_layers[back_layer_index -1]
                input_vals_avg = [sum(x) for x in zip(*input_vals)]
                input_vals_avg = list(map(operator.truediv, input_vals_avg, [float(len(class_labels))] * len(class_labels)))
                deriv_sigmoid =  self.gradient_vals_for_layers[back_layer_index]
                deriv_sigmoid_avg = [sum(x) for x in zip(*deriv_sigmoid)]
                deriv_sigmoid_avg = list(map(operator.truediv, deriv_sigmoid_avg, 
                                                                [float(len(class_labels))] * len(class_labels)))
                vars_in_layer  =  self.layer_vars[back_layer_index]                 ## a list like ['xo']
                vars_in_next_layer_back  =  self.layer_vars[back_layer_index - 1]   ## a list like ['xw', 'xz']

                layer_params = self.layer_params[back_layer_index]         
                ## note that layer_params are stored in a dict like        
                    ##     {1: [['ap', 'aq', 'ar', 'as'], ['bp', 'bq', 'br', 'bs']], 2: [['cp', 'cq']]}
                ## "layer_params[idx]" is a list of lists for the link weights in layer whose output nodes are in layer "idx"
                transposed_layer_params = list(zip(*layer_params))         ## creating a transpose of the link matrix

                backproped_error = [None] * len(vars_in_next_layer_back)
                for k,varr in enumerate(vars_in_next_layer_back):
                    for j,var2 in enumerate(vars_in_layer):
                        backproped_error[k] = sum([self.vals_for_learnable_params[transposed_layer_params[k][i]] * 
                                                pred_err_backproped_at_layers[back_layer_index][i] 
                                                for i in range(len(vars_in_layer))])
    #                                               deriv_sigmoid_avg[i] for i in range(len(vars_in_layer))])
                pred_err_backproped_at_layers[back_layer_index - 1]  =  backproped_error
                input_vars_to_layer = self.layer_vars[back_layer_index-1]
                for j,var in enumerate(vars_in_layer):
                    layer_params = self.layer_params[back_layer_index][j]
                    ##  Regarding the parameter update loop that follows, see the Slides 74 through 77 of my Week 3 
                    ##  lecture slides for how the parameters are updated using the partial derivatives stored away 
                    ##  during forward propagation of data. The theory underlying these calculations is presented 
                    ##  in Slides 68 through 71. 
                    for i,param in enumerate(layer_params):
                        
                        # @akamsali: update the velocity parameter and use 
                        g_t = input_vals_avg[i] * pred_err_backproped_at_layers[back_layer_index][j] * deriv_sigmoid_avg[j] 

                        m_val = self.beta_1 * self.m[param] + (1-self.beta_1) * g_t
                        v_val = self.beta_2 * self.v[param] + (1-self.beta_2) * (g_t**2)
                        m_hat = m_val / (1 - self.beta_1 ** self.time[param] )
                        v_hat = v_val / (1 - self.beta_2 ** self.time[param] )

                        ## Update the learnable parameters
                        step = self.learning_rate * m_hat / np.sqrt(v_hat + self.epsilon)
                        self.vals_for_learnable_params[param] += step
                        # store the current values of first and second moment parameters 
                        # for next iteration of training
                        self.m[param] = m_val
                        self.v[param] = v_val
                        self.time[param] += 1 # update time step 

                ## Update the bias
                m_bias_val = self.beta_1 * self.m_bias[back_layer_index -1] + \
                            (1 - self.beta_1) * (np.sum(pred_err_backproped_at_layers[back_layer_index]) * np.mean(deriv_sigmoid_avg))
                v_bias_val = self.beta_2 * self.v_bias[back_layer_index -1] + \
                            (1 - self.beta_2) * (np.sum(pred_err_backproped_at_layers[back_layer_index]) * np.mean(deriv_sigmoid_avg)**2)

                m_bias_hat = m_bias_val / (1 - (self.beta_1 ** self.time_bias[back_layer_index -1]))
                v_bias_hat = v_bias_val / (1 - (self.beta_2 ** self.time_bias[back_layer_index -1]))
                
                ## Update the bias parameters
                bias_step = self.learning_rate * (m_bias_hat / np.sqrt(v_bias_hat + 1e-7)) 
                self.bias += bias_step

                # store the current values of first and second moment parameters 
                # for next iteration of training

                self.m_bias[back_layer_index -1] = m_bias_val
                self.v_bias[back_layer_index -1] = v_bias_val
                self.time_bias[back_layer_index -1] += 1 # update time step 
                        

    ######################################################################################################
    # @akamsali: modified func call name and take in momentum value \mu
    def train_multineuron(self, training_data, beta_1=0.9, beta_2=0.99, epsilon=1e-7):


        class DataLoader:
            def __init__(self, training_data, batch_size):
                self.training_data = training_data
                self.batch_size = batch_size
                self.class_0_samples = [(item, 0) for item in self.training_data[0]]    ## Associate label 0 with each sample
                self.class_1_samples = [(item, 1) for item in self.training_data[1]]    ## Associate label 1 with each sample

            def __len__(self):
                return len(self.training_data[0]) + len(self.training_data[1])

            def _getitem(self):    
                cointoss = random.choice([0,1])                            ## When a batch is created by getbatch(), we want the
                                                                           ##   samples to be chosen randomly from the two lists
                if cointoss == 0:
                    return random.choice(self.class_0_samples)
                else:
                    return random.choice(self.class_1_samples)            

            def getbatch(self):
                batch_data,batch_labels = [],[]                            ## First list for samples, the second for labels
                maxval = 0.0                                               ## For approximate batch data normalization
                for _ in range(self.batch_size):
                    item = self._getitem()
                    if np.max(item[0]) > maxval: 
                        maxval = np.max(item[0])
                    batch_data.append(item[0])
                    batch_labels.append(item[1])
                batch_data = [item/maxval for item in batch_data]          ## Normalize batch data       
                batch = [batch_data, batch_labels]
                return batch                


        """
        The training loop must first initialize the learnable parameters.  Remember, these are the 
        symbolic names in your input expressions for the neural layer that do not begin with the 
        letter 'x'.  In this case, we are initializing with random numbers from a uniform distribution 
        over the interval (0,1).
        """
        self.vals_for_learnable_params = {param: random.uniform(0,1) for param in self.learnable_params}


        self.bias = [random.uniform(0,1) for _ in range(self.num_layers-1)]      ## Adding the bias to each layer improves 
                                                                                 ##   class discrimination. We initialize it 
                                                                                 ##   to a random number.
        # @akamsali: set hyperparameters
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon 
        # @akamsali: initialise learnable parameter moments
        self.m = {param: 0 for param in self.learnable_params}
        self.v = {param: 0 for param in self.learnable_params}
        self.time = {param: 1 for param in self.learnable_params}

        # @akamsali: initialise bias parameter moments
        self.time_bias = [1]*(self.num_layers-1)
        self.m_bias = [0]*(self.num_layers-1)
        self.v_bias = [0]*(self.num_layers-1)


        data_loader = DataLoader(training_data, batch_size=self.batch_size)
        loss_running_record = []
        i = 0
        avg_loss_over_iterations = 0.0                                          ##  Average the loss over iterations for printing out 
                                                                                 ##    every N iterations during the training loop.   
        for i in tqdm(range(self.training_iterations)):
            data = data_loader.getbatch()
            data_tuples = data[0]
            print(data_tuples)
            class_labels = data[1]
            self.forward_prop_multi_neuron_model(data_tuples)                                  ## FORW PROP works by side-effect 
            predicted_labels_for_batch = self.forw_prop_vals_at_layers[self.num_layers-1]      ## Predictions from FORW PROP
            y_preds =  [item for sublist in  predicted_labels_for_batch  for item in sublist]  ## Get numeric vals for predictions
            loss = sum([(abs(class_labels[i] - y_preds[i]))**2 for i in range(len(class_labels))])  ## Calculate loss for batch
            loss_avg = loss / float(len(class_labels))                                         ## Average the loss over batch
            avg_loss_over_iterations += loss_avg                                              ## Add to Average loss over iterations
            if i%(self.display_loss_how_often) == 0: 
                avg_loss_over_iterations /= self.display_loss_how_often
                loss_running_record.append(avg_loss_over_iterations)
                # print("[iter=%d]  loss = %.4f" %  (i+1, avg_loss_over_iterations))            ## Display avg loss
                avg_loss_over_iterations = 0.0                                                ## Re-initialize avg-over-iterations loss
            y_errors = list(map(operator.sub, class_labels, y_preds))
            y_error_avg = sum(y_errors) / float(len(class_labels))
            # @akamsali: change to modified backprop
            self.backprop_and_update_params_multineuron(y_error_avg, class_labels)      ## BACKPROP loss
            
        return loss_running_record

In [4]:
adam_mn = myADAMMultiNeuron(
               num_layers = 3,
               layers_config = [4,2,1],                         # num of nodes in each layer
               expressions = ['xw=ap*xp+aq*xq+ar*xr+as*xs',
                              'xz=bp*xp+bq*xq+br*xr+bs*xs',
                              'xo=cp*xw+cq*xz'],
               output_vars = ['xo'],
               dataset_size = 5000,
               learning_rate = 1e-3,
#               learning_rate = 5 * 1e-2,
               training_iterations = 40000,
               batch_size = 8,
               display_loss_how_often = 100,
               debug = True,
      )

adam_mn.parse_multi_layer_expressions()

training_data = adam_mn.gen_training_data()
sgd_mn_loss_0 = adam_mn.train_multineuron(training_data)
# sgd_mn_loss_5 = adam_mn.train(training_data, mu=0.5)
# sgd_mn_loss_9 = adam_mn.train(training_data, mu=0.9)



self.layer_expressions:  {1: ['xw=ap*xp+aq*xq+ar*xr+as*xs', 'xz=bp*xp+bq*xq+br*xr+bs*xs'], 2: ['xo=cp*xw+cq*xz']}


[layer index: 1] all variables: {'xz', 'xw', 'xr', 'xq', 'xs', 'xp'}


[layer index: 1] learnable params: {'as', 'br', 'ap', 'bs', 'bq', 'aq', 'bp', 'ar'}


[layer index: 1] dependencies: {'xw': ['xp', 'xq', 'xr', 'xs'], 'xz': ['xp', 'xq', 'xr', 'xs']}


[layer index: 1] expressions dict: {'xw': 'ap*xp+aq*xq+ar*xr+as*xs', 'xz': 'bp*xp+bq*xq+br*xr+bs*xs'}


[layer index: 1] var_to_var_param dict: {'xw': {'xp': 'ap', 'xq': 'aq', 'xr': 'ar', 'xs': 'as'}, 'xz': {'xp': 'bp', 'xq': 'bq', 'xr': 'br', 'xs': 'bs'}}


[layer index: 1] node to int labels: {'xp': 0, 'xq': 1, 'xr': 2, 'xs': 3, 'xw': 4, 'xz': 5}


[layer index: 1] independent vars: {'xs', 'xr', 'xq', 'xp'}


[layer index: 1] leads_to dictionary: {'xz': set(), 'xw': set(), 'xr': {'xz', 'xw'}, 'xq': {'xz', 'xw'}, 'xs': {'xz', 'xw'}, 'xp': {'xz', 'xw'}}


[layer index: 2] all variables: {'xz', 'xw', 'xr', 'xo', 'xq', 'x

  bias_step = self.learning_rate * (m_bias_hat / np.sqrt(v_bias_hat + 1e-7))
  0%|          | 1/40000 [00:00<02:02, 326.99it/s]

[array([0.26613241, 0.33472789, 0.26284519, 0.47481621]), array([0.41121133, 0.07398863, 0.20280787, 0.33127024]), array([0.27224541, 0.3150424 , 0.49610528, 0.26374297]), array([0.34846216, 0.19218335, 0.25382971, 0.38505873]), array([0.54688986, 0.42842752, 0.1464999 , 0.53522788]), array([-0.05895718,  0.27288617,  0.43104638,  0.28889614]), array([1.        , 0.65034497, 0.14605088, 0.22077299]), array([0.60953372, 0.81883191, 0.41011074, 0.18566385])]
[array([ 0.29262678,  0.40141971, -0.04441873,  0.45633414]), array([0.52018342, 0.56687619, 0.14056941, 0.3893314 ]), array([0.13334504, 0.24188899, 0.34890361, 0.30680977]), array([0.45145664, 0.35476537, 0.28351984, 0.76326755]), array([0.39009914, 0.56129811, 0.34963711, 0.2480258 ]), array([0.38225309, 0.12145688, 0.0381678 , 0.54185796]), array([0.40850117, 0.35623534, 0.32909018, 0.40955579]), array([1.        , 0.68887441, 0.3806984 , 0.48415363])]





NameError: name 'nan' is not defined