In [None]:
# Backpropagation
# We know how to measure the impact of variables on a function using calculus
# and calculating the partial derivatives of a particular variable within a function
# and then using chain rule for calculating the impact of a variable in nested functions. 

# With this knowledge, we can now calculate the impact of each of our weights on the loss
# function by calculating the partial derivative of the loss with respect to each weight.

# Let's test this theory on a single neuron.
# Minimizing our loss function is the end goal of machine learning optimization.


In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Forward pass setup: a single neuron with three inputs.
x = [1.0, -2.0, 3.0]  # input values
w = [-3.0, -1.0, 2.0]  # one weight per input
b = 1.0  # bias added to the weighted sum


In [3]:
# Peek at the first input and its weight: the operands of the first multiplication.
x[0], w[0]

(1.0, -3.0)

In [7]:
# Weighted input 0: mul(x[0], w[0]).
xw0 = x[0] * w[0]

In [8]:
# Weighted input 1: mul(x[1], w[1]).
xw1 = x[1] * w[1]

In [9]:
# Weighted input 2: mul(x[2], w[2]).
xw2 = x[2] * w[2]

In [10]:
# Show the three weighted inputs side by side.
xw0, xw1, xw2

(-3.0, 2.0, 6.0)

In [12]:
# Neuron output before activation: the sum of the weighted inputs plus the bias.
z = xw0 + xw1 + xw2 + b
z

6.0

In [13]:
# ReLU activation: keep z when it is positive, otherwise output 0.
y = max(z, 0)
y

6.0

In [None]:
# We have just computed the full forward pass of a single artificial neuron with
# 3 inputs. Let's treat all these small functions as one large chained function
# which takes inputs, weights and a bias, and produces the value y as the output.

# The bigger function consists of multiple smaller functions: within that large function
# we have smaller operations such as addition, multiplication and max.
# The first step of backpropagation is to calculate the partial derivatives of the output with respect to
# the inputs, weights and bias used in computing the function. To do this we will make use of the chain rule.
# We need to calculate how much each input, weight and bias affects the output.

# We will start by calculating the partial derivatives for w[0]

# y = ReLU(sum(mul(x[0], w[0]), mul(x[1], w[1]), mul(x[2], w[2]), b))
# The above equation contains 3 nested functions, ReLU, sum and multiplication.
# We would like to know the impact of a given weight or bias on the loss. This
# involves calculating the partial derivative of the loss function with respect to
# each weight and bias.

# During backpropagation, we calculate the derivative of the loss function and multiply it
# by the derivative of the activation function of the output layer, then multiply that
# by the derivative of the output layer, and so on. We keep moving backward and multiplying
# until we reach the leaf parameters, which are the actual inputs to the whole function itself.
# Then we use the gradients of those leaf parameters to make adjustments in order to improve our loss.


In [14]:
# Derivative of the ReLU activation with respect to its input z, evaluated at the
# z computed in the forward pass: 1 where z > 0, otherwise 0.
# Both branches are floats so relu_dz has the same type whichever branch is taken.
relu_dz = 1.0 if z > 0 else 0.0
# z is 6.0 here, so the derivative is 1.0.

In [15]:
# Show the ReLU derivative at the current z.
relu_dz

1.0

In [None]:
# The process of going back and calculating the gradient through each step is
# called backpropagation using the chain rule. 

# We use backpropagation to calculate the impact of each neuron on the
# loss function, by calculating the gradient at a particular point in the
# computational graph and moving it sequentially backwards through the computational
# graph.

In [16]:
# The partial derivative of a neuron's weighted sum with respect to its bias is always 1.
# The partial derivative with respect to a weight is always the value of the corresponding input.
# The partial derivative with respect to an input is always the value of the corresponding weight.
"""
All together all the all the partial derivatives of the parameters makek up a combined vector called our gradients..
In this case our gradients would be
dx = [drelu_dx0, drelu_dx1, drelu_dx2]
dw = [drelu_dw0, drelu_dw1, drelu_dw2]
db = drelu_db
"""
# For this single neuron example we will not be using the dx gradient, but in future examples where this might be a hidden layer, we will need the gradients of the inputs, which might be the outputs of another neuron.
# With the gradient values in hand we can then apply these gradients to the weights to hopefully reduce the neuron's output.
# We will use a simplified version of an optimizer by simply subtracting a fraction of the gradient from the weights.
# We need a negative fraction of the gradients, since we would like to reduce our loss function and move in small incremental steps, since
# we would be making a lot of changes simultaneously.

# With this method we can reduce the neuron's output value.


'\nAll together all the all the partial derivatives of the parameters makek up a combined vector called our gradients..\nIn this case our gradients would be\ndx = [drelu_dx0, drelu_dx1, drelu_dx2]\ndw = [drelu_dw0, drelu_dw1, drelu_dw2]\ndb = drelu_db\n'

In [17]:
# So far we have performed a single backward pass with a single neuron. Now
# we will combine neurons together to form a layer and experiment with
# performing the backward pass on that layer.


# Let's replace this single neuron with a layer of neurons. A layer outputs a vector
# rather than a single scalar value. Each neuron in a layer connects to all the neurons of
# the previous layer.
# At the backpropagation step, this layer will receive a vector of partial derivatives rather than
# a single scalar partial derivative value.

In [55]:
# numpy is already imported as `np` in the import cell at the top of the notebook,
# so it is not re-imported here.

# Gradient arriving from the next layer: one sample, one value per neuron (3 neurons).
dvalues = np.array([[1, 1, 1]])
# Layer weights as a 3 x 4 matrix: one row per neuron, one column per input.
weights = np.array([[0.2, 0.8, -0.5, 1],
                    [0.5, -0.91, 0.26, -0.5],
                    [-0.26, -0.27, 0.17, 0.87]])
weights

array([[ 0.2 ,  0.8 , -0.5 ,  1.  ],
       [ 0.5 , -0.91,  0.26, -0.5 ],
       [-0.26, -0.27,  0.17,  0.87]])

In [19]:
# Preview the transpose. Nothing is assigned here, so `weights` itself is unchanged.
weights.T

array([[ 0.2 ,  0.5 , -0.26],
       [ 0.8 , -0.91, -0.27],
       [-0.5 ,  0.26,  0.17],
       [ 1.  , -0.5 ,  0.87]])

In [56]:
# Switch `weights` to a 4 x 3 layout: one row per input, one column per neuron.
# The shape guard makes the cell idempotent: re-running it no longer flips the
# matrix back to its original 3 x 4 layout.
if weights.shape == (3, 4):
    weights = weights.T

In [57]:
# Sum each row of `weights`; keepdims=True keeps the result as a column
# (one single-element row per row of `weights`).
values = weights.sum(axis=1, keepdims=True)
values

array([[ 0.44],
       [-0.38],
       [-0.07],
       [ 1.37]])

In [59]:
# Ensure `weights` is 4 x 3 (one row per input) for the cells below. They index
# weights[0] .. weights[3] and multiply each row by the 3 gradients in dvalues[0].
# An unconditional `weights = weights.T` here undoes the earlier transpose on a
# top-to-bottom run, leaving a 3 x 4 matrix: the products then fail to broadcast
# ((4,) * (3,)) and weights[3] does not exist.
if weights.shape != (4, 3):
    weights = weights.T

In [60]:
# Gradient with respect to input 0: row 0 of `weights` multiplied element-wise by
# the gradients in dvalues[0], then summed.
sum(weights[0] * dvalues[0])

0.43999999999999995

In [61]:
# One partial derivative per input: multiply that input's row of `weights` by the
# gradients in dvalues[0] and add up the products.
dx0, dx1, dx2, dx3 = (sum(weights[row] * dvalues[0]) for row in range(4))

In [62]:
# Gather the four per-input partial derivatives into a single gradient vector.
dinputs = np.array((dx0, dx1, dx2, dx3))
dinputs

array([ 0.44, -0.38, -0.07,  1.37])

In [54]:
# First row of `dvalues`: the gradients used in the sums above.
# NOTE(review): the saved output below looks stale. Its execution count (54) is
# older than the cell that defines the 2-D `dvalues` (55); with that definition a
# fresh run displays array([1, 1, 1]).
dvalues[0]

1

In [63]:
# dinputs is the gradient of the neuron function with respect to the inputs.
# Here the same values are computed with a single dot product instead of four
# separate sums; the result is only displayed, not assigned.
np.dot(dvalues[0], weights.T)

array([ 0.44, -0.38, -0.07,  1.37])

In [66]:
# Backward pass through the layer for a whole batch of samples.
# Each row of `dvalues` holds the incoming gradients for one sample (3 values).
dvalues = np.array([
    [1, 1, 1],
    [2, 2, 2],
    [3, 3, 3],
    [4, 4, 4],
])

# 3 x 4 weight matrix; its 3 rows line up with the 3 columns of `dvalues`.
weights = np.array([
    [0.2, 0.8, -0.5, 1],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
])

# (4, 3) . (3, 4) -> (4, 4): one row of input gradients per sample.
dinputs = dvalues.dot(weights)
print(dinputs)

[[ 0.44 -0.38 -0.07  1.37]
 [ 0.88 -0.76 -0.14  2.74]
 [ 1.32 -1.14 -0.21  4.11]
 [ 1.76 -1.52 -0.28  5.48]]


In [69]:
# Gradient with respect to the weights.
# Each row of `dvalues` holds the incoming gradients for one sample (3 values).
dvalues = np.array([
    [1, 1, 1],
    [2, 2, 2],
    [3, 3, 3],
    [4, 4, 4],
])

# One row per sample, 4 values per row.
inputs = np.array([
    [1, 2, 3, 2.5],
    [2, 5, -1, 2],
    [-1.5, 2.7, 3.3, -0.8],
    [-2.4, -1., -0.3, -2.3],
])

# inputs.T is (4, 4) and dvalues is (4, 3), so the product is (4, 3): the dot
# product sums each input-times-gradient term over the samples.
dweights = inputs.T.dot(dvalues)
dweights

array([[-9.1, -9.1, -9.1],
       [16.1, 16.1, 16.1],
       [ 9.7,  9.7,  9.7],
       [-5.1, -5.1, -5.1]])

In [70]:
# Gradient with respect to the biases.
# Each row of `dvalues` holds the incoming gradients for one sample (3 values).
dvalues = np.array([
    [1, 1, 1],
    [2, 2, 2],
    [3, 3, 3],
    [4, 4, 4],
])

# A single row of 3 biases; the values themselves are not used in the gradient.
biases = np.array([[2, 3, 0.5]])

# Add up the gradients over the samples (axis 0); keepdims=True keeps the
# result as a single row, the same shape as `biases`.
dbiases = dvalues.sum(axis=0, keepdims=True)

In [71]:
# Show the bias gradients: the column sums of `dvalues`.
dbiases

array([[10, 10, 10]])