In [3]:
!pip install numpy



In [4]:
import numpy as np

In [6]:
# Take the input variable values (inputs), weights (randomly initialized if this is the first iteration), 
# and the actual outputs in the provided dataset as the parameters of the feed_forward function

# To make this exercise a little more realistic, we will have bias associated with each node. Thus the 
# weights array will contain not only the weights connecting different nodes but also the bias associated 
# with nodes in hidden/ output layers.


def feed_forward(inputs, outputs, weights):
    """Run one forward pass through a single-hidden-layer network and return its loss.

    ``weights`` is indexed positionally: ``weights[0]`` / ``weights[1]`` are the
    weight matrix and bias feeding the hidden layer, ``weights[2]`` /
    ``weights[3]`` are the weight matrix and bias feeding the output layer.

    Returns the mean squared error between the network prediction and ``outputs``.
    """
    # Input -> hidden: affine transform followed by a sigmoid activation.
    input_to_hidden_w, input_to_hidden_b = weights[0], weights[1]
    hidden_linear = np.dot(inputs, input_to_hidden_w) + input_to_hidden_b
    hidden_activated = 1 / (1 + np.exp(-hidden_linear))

    # Hidden -> output: affine transform only, no activation on the output.
    hidden_to_output_w, hidden_to_output_b = weights[2], weights[3]
    prediction = np.dot(hidden_activated, hidden_to_output_w) + hidden_to_output_b

    # Loss: average of the squared residuals over every element.
    residual = prediction - outputs
    return np.mean(np.square(residual))

In [9]:
# Defining Activation Functions

def tanh(x):
    """Hyperbolic tangent activation, applied element-wise.

    Delegates to ``np.tanh`` instead of evaluating
    ``(e^x - e^-x) / (e^x + e^-x)`` directly: the explicit ratio overflows to
    ``inf / inf = nan`` once ``|x|`` is large enough for ``np.exp`` to overflow,
    whereas ``np.tanh`` saturates cleanly at +/-1.
    """
    return np.tanh(x)

def relu(x):
    """Rectified linear unit: keep strictly positive entries of ``x``, zero the rest."""
    is_positive = x > 0
    return np.where(is_positive, x, 0)

def linear(x):
    """Linear (identity) activation: hand the input back unchanged."""
    return x

# Unlike other activations, softmax is performed on top of an array of values. 
# This is generally done to determine the probability of an input belonging to one 
# of the m number of possible output classes in a given scenario. Let's say we are 
# trying to classify an image of a digit into one of the possible 10 classes (numbers from 0 to 9). 
# In this case, there are 10 output values, where each output value should represent 
# the probability of an input image belonging to one of the 10 classes.

def softmax(x, axis=None):
    """Softmax: exponentiate the scores and normalize them so they sum to 1.

    Args:
        x: array-like of raw scores.
        axis: axis along which to normalize. The default ``None`` normalizes
            over every element of ``x``; pass e.g. ``axis=-1`` to normalize
            each row of a 2-D array independently.

    Returns:
        Array of the same shape as ``x`` whose entries along ``axis`` sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalize; np.max would raise on an empty array.
        return np.exp(x)
    # Softmax is unchanged when a constant is added to every score, so shift by
    # the maximum: the largest exponent becomes exp(0) = 1 and np.exp can no
    # longer overflow to inf (which would produce inf / inf = nan).
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

# Here’s how it works:

# Input Vector: Suppose you have a vector of m raw scores, x = [x_1, x_2, ..., x_m].

# Exponential: Apply the exponential function to each element of the vector, which gives 
# a new vector [e^{x_1}, e^{x_2}, ..., e^{x_m}].

# Normalization: Divide each exponential by the sum of all exponentials in the vector: 
# softmax(x)_i = e^{x_i} / (e^{x_1} + e^{x_2} + ... + e^{x_m}).
# This ensures that the sum of all output values of the softmax function is 1, forming a 
# valid probability distribution.

In [11]:
# Defining Loss Functions

# The mean squared error is typically used when trying to predict a value that is continuous in nature.
def mse(p, y):
    """Mean squared error between predictions ``p`` and actual values ``y``."""
    residual = p - y
    squared_residual = np.square(residual)
    return np.mean(squared_residual)

# Similar to the mean squared error, the mean absolute error is generally employed on continuous variables. 
# Further, in general, it is preferable to use the mean absolute error as the loss function when the outputs to predict 
# have a magnitude less than 1: the differences being squared then also tend to lie between -1 and 1, and squaring such 
# a number makes its magnitude even smaller, so the mean squared error would reduce the magnitude of the loss considerably.
# The mean absolute error between an array of predicted output values (p) and an array of actual output values (y) is implemented as follows: 
def mae(p, y):
    """Mean absolute error between predictions ``p`` and actual values ``y``."""
    absolute_residual = np.abs(p - y)
    return np.mean(absolute_residual)


# Binary Cross Entropy
# Cross-entropy is a measure of the difference between two different distributions: actual and predicted. 
# Binary cross-entropy is applied to binary output data
# Binary cross-entropy loss has a high value when the predicted value is far away from the actual value and a low value when the predicted and actual values are close.

def binary_cross_entropy(p, y):
    """Binary cross-entropy between predicted probabilities ``p`` and 0/1 labels ``y``.

    The loss is averaged over every element, so its scale does not grow with
    the number of samples. (Summing first and then taking the mean of the
    resulting scalar would return the sum, not the mean.)

    Args:
        p: array-like of predicted probabilities.
        y: array-like of actual labels, broadcastable against ``p``.

    Returns:
        The mean binary cross-entropy as a scalar.
    """
    # Keep p strictly inside (0, 1): a prediction of exactly 0 or 1 would give
    # log(0) = -inf, and 0 * -inf = nan would then poison the whole loss.
    eps = 1e-12
    p = np.clip(np.asarray(p, dtype=float), eps, 1 - eps)
    y = np.asarray(y)
    per_element = y * np.log(p) + (1 - y) * np.log(1 - p)
    return -np.mean(per_element)


# Categorical cross-entropy between an array of predicted values (p) and an array of actual values (y) is implemented as follows: 
def categorical_cross_entropy(p, y):
    """Categorical cross-entropy between predicted probabilities ``p`` and targets ``y``.

    The last axis is treated as the class axis: the per-class terms are summed
    along it to get one loss per sample, and those per-sample losses are then
    averaged. For a single 1-D sample this is simply ``-sum(y * log(p))``.

    Args:
        p: array-like of predicted class probabilities.
        y: array-like of actual (e.g. one-hot) targets, broadcastable against ``p``.

    Returns:
        The mean per-sample cross-entropy as a scalar.
    """
    # Bound p away from 0: log(0) = -inf, and 0 * -inf = nan for every class
    # whose target is 0, which would turn the whole loss into nan.
    eps = 1e-12
    p = np.clip(np.asarray(p, dtype=float), eps, 1.0)
    y = np.asarray(y)
    # atleast_1d keeps scalar inputs working with the axis=-1 reduction below.
    class_terms = np.atleast_1d(y * np.log(p))
    per_sample = -np.sum(class_terms, axis=-1)
    return np.mean(per_sample)

In [None]:
# Now we will learn about backpropagation, a technique to adjust weights so that they will result in a loss 
# that is as small as possible.

# In feedforward propagation, we connected the input layer to the hidden layer, which then was connected to the output layer.
# In the first iteration, we initialized weights randomly and then calculated the loss resulting from those weight values. 
# In backpropagation, we take the reverse approach. We start with the loss value obtained in feedforward propagation and 
# update the weights of the network in such a way that the loss value is minimized as much as possible.

# The loss value is reduced as we perform the following steps: 
# 1. Change each weight within the neural network by a small amount – one at a time. 
# 2. Measure the change in loss ( ∂L ) when the weight value is changed ( ∂W ). 
# 3. Update the weight by -k . ∂L/∂W  (where k is a positive value and is a hyperparameter known as the learning rate).

# Note that the update made to a particular weight is proportional to the amount of loss that is reduced by changing it 
# by a small amount. Intuitively, if changing a weight reduces the loss by a large value, then we can update the weight 
# by a large amount. However, if the loss reduction is small by changing the weight, 
# then we update it only by a small amount.

# If the preceding steps are performed n number of times on the entire dataset (where we have done both the 
# feedforward propagation and backpropagation), it essentially results in training for n epochs.

# Q: Is it practical to update every single weight (millions or billions) in a production setting while training models?

# As a typical neural network contains thousands/millions (if not billions) of weights, changing the value of each weight 
# and checking whether the loss increased or decreased is not optimal. The core step in the preceding list is the measurement
# of the "change in loss" when the weight is changed. As you might have studied in calculus, measuring this is the same as 
# computing the gradient of the loss with respect to the weight. There's more on leveraging partial derivatives from calculus to 
# calculate the gradient of the loss with respect to the weight in the next section, on the chain rule for backpropagation.


In [12]:
# Let's understand one additional concept, the learning rate, before we implement backpropagation

# Intuitively, the learning rate helps in building trust in the algorithm. For example, 
# when deciding on the magnitude of the weight update, we would potentially not change the weight value by a big amount 
# in one go but update it more slowly. This makes the training of our model more stable.

# This whole process by which we update weights to reduce errors is called gradient descent.

# Stochastic gradient descent is how errors are minimized in the preceding scenario. As mentioned earlier, 
# gradient stands for the difference (which is the difference in loss values when the weight value is updated by 
# a small amount) and descent means to reduce. Stochastic stands for the selection of random samples based on which 
# a decision is taken.

# Apart from stochastic gradient descent, many other similar optimizers help to minimize loss values; we will discuss them later.


In [16]:
# Gradient Descent in Code
from copy import deepcopy

def feed_forward(inputs, outputs, weights):
    """Forward pass of the single-hidden-layer network; returns the mean squared error.

    This is the same computation as the ``feed_forward`` defined in the earlier
    cell; running this cell rebinds the name to this definition.

    ``weights[0]`` / ``weights[1]`` are the weights and bias into the hidden
    layer, ``weights[2]`` / ``weights[3]`` the weights and bias into the output
    layer.
    """
    def _sigmoid(z):
        # Sigmoid activation applied to the hidden layer's pre-activation values.
        return 1 / (1 + np.exp(-z))

    hidden_w, hidden_b = weights[0], weights[1]
    hidden = _sigmoid(np.dot(inputs, hidden_w) + hidden_b)

    # The output is the dot product of the hidden layer with the output weights, plus bias.
    output_w, output_b = weights[2], weights[3]
    output_prediction = np.dot(hidden, output_w) + output_b

    # Error: mean of the squared differences between prediction and target.
    return np.mean(np.square(output_prediction - outputs))
    

# Increase each weight and bias value by a very small amount (0.0001) and calculate the overall 
# squared error loss value one at a time for each of the weight and bias updates.

def update_weights(inputs, outputs, weights, lr, epsilon=0.0001):
    """Perform one gradient-descent step using finite-difference gradients.

    Every individual weight and bias value is increased by ``epsilon`` in turn,
    the loss is re-evaluated with ``feed_forward``, and the forward difference
    ``(loss_after_nudge - original_loss) / epsilon`` is used as the gradient of
    the loss with respect to that parameter.

    Args:
        inputs: input data, passed straight to ``feed_forward``.
        outputs: actual output values, passed straight to ``feed_forward``.
        weights: sequence of parameter arrays (weights and biases of the
            input->hidden and hidden->output connections). The caller's
            arrays are left untouched; only copies are written to.
        lr: learning rate; each parameter moves by ``-lr * gradient``.
        epsilon: size of the nudge used to estimate each gradient.

    Returns:
        ``(updated_weights, original_loss)`` - the parameters after the update
        and the loss measured before it.
    """
    # Loss with the untouched weights; every gradient is measured against it.
    original_loss = feed_forward(inputs, outputs, weights)

    # Two private copies of the weights:
    #   probe_weights   - scratch copy in which one parameter at a time is nudged
    #   updated_weights - receives the gradient-descent updates
    # Copying once here (instead of once per parameter) avoids duplicating the
    # whole network for every single weight.
    probe_weights = deepcopy(weights)
    updated_weights = deepcopy(weights)

    for i, layer in enumerate(weights):
        # The parameter arrays have different shapes, so np.ndenumerate is used
        # to visit every element together with its (multi-dimensional) index.
        for index, value in np.ndenumerate(layer):
            probe_weights[i][index] = value + epsilon
            loss_plus = feed_forward(inputs, outputs, probe_weights)
            # Restore the parameter so the next probe starts from the original weights.
            probe_weights[i][index] = value

            # Change in loss per unit change in this parameter (numerical derivative).
            grad = (loss_plus - original_loss) / epsilon

            # Step against the gradient, scaled down by the learning rate.
            updated_weights[i][index] -= grad * lr

    return updated_weights, original_loss


# In the preceding scenario, we considered all the data points to calculate the loss (mean squared error) value. 
# However, in practice, when we have thousands (or in some cases, millions) of data points, 
# the incremental contribution of a greater number of data points while calculating the loss value 
# would follow the law of diminishing returns, and hence we would be using a batch size that is much smaller 
# compared to the total number of data points we have. We will apply gradient descent (after feedforward propagation) 
# using one batch at a time until we exhaust all data points within one epoch of training. 

# The typical batch size considered in building a model is anywhere between 32 and 1,024.

# In this section, we learned about updating weight values based on the change in 
# loss values when the weight values are changed by a small amount. 

# In the next section, we will learn about how weights can be updated without computing
# the gradient of each parameter separately, one at a time.

In [None]:
# Implementing backpropagation using the chain rule


