In [49]:
import autograd.numpy as np
from autograd import jacobian,elementwise_grad,grad

# 1. Simple Scalar Network

## Fixed, scalar inputs x,y and z

In [50]:
# Fixed scalar inputs, each stored as a one-element integer array.
x, y, z = (np.array([value]) for value in (5, 3, 2))

### The forward function, with 3 weights wx,wy,wz associated with each of x,y,z respectively

In [51]:
def ScalarForward(wx,wy,wz):
    """Forward pass of the scalar network: f = (wx*x + wy*y) * (wz*z).

    The inputs x, y and z are read from the notebook's global scope; only
    the three weights are parameters, so a derivative can be requested for
    each of them by its position in the argument list.
    """
    p = wx*x + wy*y  # weighted sum of the first two inputs
    q = wz*z         # weighted third input
    return p * q

### df/dwx = (df/dp)\*(dp/dwx) = q\*x
### df/dwy = (df/dp)\*(dp/dwy) = q\*y
### df/dwz = (df/dq)\*(dq/dwz) = p\*z

In [52]:
# Draw one random 1x1 weight per input (drawn in the order wx, wy, wz).
wx, wy, wz = (np.random.random((1,1)) for _ in range(3))

# Hand-derived chain-rule gradients of f = p*q,
# where p = wx*x + wy*y and q = wz*z.
df_dwx = x*(wz*z)         # df/dwx = q*x
df_dwy = y*(wz*z)         # df/dwy = q*y
df_dwz = z*(wx*x + wy*y)  # df/dwz = p*z

print("Computed Gradients:",df_dwx,df_dwy,df_dwz)

Computed Gradients: [[9.87425431]] [[5.92455258]] [[4.73717653]]


### Verify manually computed gradients with autograd

In [53]:
# `grad(fn, argnum)` builds a new function that evaluates the gradient of `fn`
# with respect to its positional argument number `argnum`.
# ScalarForward's arguments are (wx, wy, wz), i.e. positions 0, 1 and 2,
# so position 0 gives the gradient w.r.t. wx, 1 w.r.t. wy and 2 w.r.t. wz.
gradient_fn_wx, gradient_fn_wy, gradient_fn_wz = (
    grad(ScalarForward, position) for position in range(3)
)

# Evaluate each gradient function at the weights we picked randomly before.
auto_df_dwx, auto_df_dwy, auto_df_dwz = (
    gradient_fn(wx, wy, wz)
    for gradient_fn in (gradient_fn_wx, gradient_fn_wy, gradient_fn_wz)
)

print("Autograd Gradients:",auto_df_dwx,auto_df_dwy,auto_df_dwz)

Autograd Gradients: [[9.87425431]] [[5.92455258]] [[4.73717653]]


# 2. Network with vector input, no hidden layers
\begin{align}
f = \sigma(W^TX)
\end{align}

In [54]:
X = np.array([[0, 0, 1]])  # input vector, stored as a 1x3 row
Y = np.array([[0]])        # target output, 1x1
# Random weight W0 with one row per input feature and one column per output: (3, 1).
W0 = np.random.random((X.shape[1], Y.shape[1]))

#### Activation function and its derivative

In [55]:
def sigmoid(x):
    """Logistic sigmoid 1 / (1 + e^(-x)), applied elementwise to x."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def sigmoid_derivative(x):
    """Derivative of the sigmoid at x: sigmoid(x) * (1 - sigmoid(x))."""
    activation = sigmoid(x)
    return activation * (1 - activation)

#### Forward Operations, step-by-step

In [56]:
# Forward pass, one named intermediate per step so the backprop cell can reuse them.
# X is the 1x3 input row and W0 the 3x1 weight column, so Linear is 1x1.
Linear = np.dot(X,W0) # Linear operation: weighted sum of the inputs
NonLinear = sigmoid(Linear) # Apply the non-linear (sigmoid) activation
Error = Y - NonLinear # Signed error of the output w.r.t. target Y

#### Backprop: Compute gradient of Error w.r.t W0

In [57]:
# The gradient has to backprop from the output, via the activation, to W0.
# Work through these steps backwards: from the last equation in this cell to the first.

# Linear = np.dot(X, W0) (the W^T X of the formula above, with X stored as a row),
# so the derivative of Linear w.r.t. W0 is the input, transposed to W0's 3x1 shape.
dLinear_W0 = X.T

# For NonLinear w.r.t. W0, chain NonLinear w.r.t. Linear (the sigmoid derivative)
# with Linear w.r.t. W0. Linear is 1x1 here, so the elementwise product simply
# scales every entry of dLinear_W0.
dNonLinear_W0 = dLinear_W0 * sigmoid_derivative(Linear)

# Second step: trivial, because Error = Y - NonLinear.
dError_NonLinear = -1

# First step: for the gradient of Error w.r.t. W0, first find its gradient w.r.t. NonLinear,
#             then multiply by the gradient of NonLinear w.r.t. W0.
dError_W0 = dNonLinear_W0 * dError_NonLinear

print("Computed Gradients:")
print("Gradient w.r.t W0:", dError_W0)

Computed Gradients:
Gradient w.r.t W0: [[-0.        ]
 [-0.        ]
 [-0.22394959]]


#### Backprop: Verification by autograd

In [58]:
def VectorForward(W0):
    """Signed error Y - sigmoid(X . W0) of the network without hidden layers.

    X and Y are read from the notebook's global scope; W0 is the only
    parameter, so it is the quantity the gradient is taken with respect to.
    """
    activation = sigmoid(np.dot(X,W0))
    return Y - activation

# Differentiate VectorForward with respect to its first (and only) argument, W0.
gradient_fn_W0 = grad(VectorForward, 0)
autograd_dError_W0 = gradient_fn_W0(W0)

print("Autograd Gradients:")
print("Gradient w.r.t W0:", autograd_dError_W0)

Autograd Gradients:
Gradient w.r.t W0: [[ 0.        ]
 [ 0.        ]
 [-0.22394959]]


# 3. Vector Input, One Hidden Layer

In [59]:
# One hidden layer means two weight matrices:
# W0 maps the input to the hidden layer, W1 maps the hidden layer to the output.
X = np.array([[0, 0, 1]])  # input, 1x3
Y = np.array([[0]])        # target, 1x1
W0 = np.random.random((X.shape[1], 4))  # 3 inputs -> 4 hidden units
W1 = np.random.random((4, Y.shape[1]))  # 4 hidden units -> 1 output

#### Forward Operations, step-by-step

In [60]:
# Forward pass with every intermediate kept, because the manual backprop reuses them.
# Hidden layer: X is 1x3 and W0 is 3x4, so L1_Input and L1 are 1x4.
L1_Input = np.dot(X,W0) # Unactivated input to the hidden layer
L1 = sigmoid(L1_Input) # Hidden Layer activated output

# Output layer: L1 is 1x4 and W1 is 4x1, so Last_Input and Pred are 1x1.
Last_Input = np.dot(L1,W1)
Pred = sigmoid(Last_Input)

# Signed error of the prediction w.r.t. the target Y.
Error = Y - Pred

In [61]:
# Compute the gradient w.r.t. the last weight before the output.
# Last_Input = np.dot(L1, W1), so its derivative w.r.t. W1 is L1 transposed (4x1).
dLast_Input_W1 = L1.T
dPred_W1 = dLast_Input_W1 * sigmoid_derivative(Last_Input)
dError_Pred = -1  # because Error = Y - Pred
dError_W1 = dPred_W1 * dError_Pred   # This is gradient of Error w.r.t. W1

# Now compute the upstream gradient, i.e. the gradient of Error w.r.t. the output of the previous layer.
# The derivative of Last_Input w.r.t. L1 is W1 transposed (1x4).
dLast_Input_L1 = W1.T
dPred_L1 =  dLast_Input_L1 * sigmoid_derivative(Last_Input)
dError_L1 = dPred_L1 * dError_Pred  # Gradient of Error w.r.t Layer output

# Now use the upstream gradient to compute the gradient of Error w.r.t. the weights in the layer before that.
# X.T is 3x1 and sigmoid_derivative(L1_Input) is 1x4, so the product broadcasts to W0's 3x4 shape;
# this elementwise form works because X has a single row here.
dL1_Input_W0 = X.T
dL1_W0 = dL1_Input_W0 * sigmoid_derivative(L1_Input)
dError_W0 = dL1_W0 * dError_L1 # Gradient of Error w.r.t W0

print("Computed Gradients:")
print("Gradient w.r.t W1:", dError_W1)
print("Gradient w.r.t W0:", dError_W0)

Computed Gradients:
Gradient w.r.t W1: [[-0.11468638]
 [-0.15016915]
 [-0.13416744]
 [-0.12815256]]
Gradient w.r.t W0: [[-0.         -0.         -0.         -0.        ]
 [-0.         -0.         -0.         -0.        ]
 [-0.02617879 -0.00168374 -0.03976565 -0.0017205 ]]


#### Verify computed gradients using autograd

In [62]:
def vector_one_hidden_forward(W0,W1):
    """Signed error Y - Pred of the network with one hidden layer.

    Starting from the global input X, each weight matrix in turn is applied
    as a matrix product followed by a sigmoid; the final activation is the
    prediction, which is subtracted from the global target Y.
    """
    activation = X
    for weights in (W0, W1):
        activation = sigmoid(np.dot(activation, weights))
    return Y - activation

# One gradient function per weight matrix: argument position 0 is W0, position 1 is W1.
autogradient_fn_W0, autogradient_fn_W1 = (
    grad(vector_one_hidden_forward, position) for position in (0, 1)
)

# Evaluate both at the current weights.
autograd_dError_W0, autograd_dError_W1 = (
    gradient_fn(W0, W1)
    for gradient_fn in (autogradient_fn_W0, autogradient_fn_W1)
)

print("Autograd Gradients:")
print("Gradient w.r.t W1:", autograd_dError_W1)
print("Gradient w.r.t W0:", autograd_dError_W0)

Autograd Gradients:
Gradient w.r.t W1: [[-0.11468638]
 [-0.15016915]
 [-0.13416744]
 [-0.12815256]]
Gradient w.r.t W0: [[ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [-0.02617879 -0.00168374 -0.03976565 -0.0017205 ]]


# 4. Two Hidden Layers.

In [63]:
# Two hidden layers mean three weight matrices to compute gradients for.
X = np.array([[1, 0, 1]])  # input, 1x3
Y = np.array([[0]])        # target, 1x1
W0 = np.random.random((X.shape[1], 4))  # 3 inputs -> 4 units of hidden layer 1
W1 = np.random.random((4, 4))           # hidden layer 1 -> 4 units of hidden layer 2
W2 = np.random.random((4, Y.shape[1]))  # hidden layer 2 -> 1 output

#### Forward Operations

In [64]:
# Forward pass with every intermediate kept for the manual backprop.
# First hidden layer: X is 1x3 and W0 is 3x4, so L1_Input and L1 are 1x4.
L1_Input = np.dot(X,W0)
L1 = sigmoid(L1_Input)

# Second hidden layer: W1 is 4x4, so L2_Input and L2 are 1x4.
L2_Input = np.dot(L1,W1)
L2 = sigmoid(L2_Input)

# Output layer: W2 is 4x1, so Last_Input and Pred are 1x1.
Last_Input = np.dot(L2,W2)
Pred = sigmoid(Last_Input)

# Signed error of the prediction w.r.t. the target Y.
Error = Y - Pred

In [65]:
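# Exercise: fill this section.
# Derive dError_W2, dError_W1 and dError_W0 by hand, backpropagating from Pred
# through L2 and L1 as in Section 3, then compare them with the autograd values below.
# Hint: W1 is 4x4, so passing the gradient from L2_Input back to L1 needs a matrix
# product (np.dot with W1.T) rather than an elementwise product.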

In [66]:
def vector_two_hidden_forward(W0,W1,W2):
    """Signed error Y - Pred of the network with two hidden layers.

    Starting from the global input X, each weight matrix in turn is applied
    as a matrix product followed by a sigmoid; the final activation is the
    prediction, which is subtracted from the global target Y.
    """
    activation = X
    for weights in (W0, W1, W2):
        activation = sigmoid(np.dot(activation, weights))
    return Y - activation

# One gradient function per weight matrix: argument positions 0, 1, 2 are W0, W1, W2.
autogradient_fn_W0, autogradient_fn_W1, autogradient_fn_W2 = (
    grad(vector_two_hidden_forward, position) for position in range(3)
)

# Evaluate all three at the current weights.
autograd_dError_W0, autograd_dError_W1, autograd_dError_W2 = (
    gradient_fn(W0, W1, W2)
    for gradient_fn in (autogradient_fn_W0, autogradient_fn_W1, autogradient_fn_W2)
)

print("Autograd Gradients:")
print("Gradient w.r.t W2:", autograd_dError_W2)
print("Gradient w.r.t W1:", autograd_dError_W1)
print("Gradient w.r.t W0:", autograd_dError_W0)

Autograd Gradients:
Gradient w.r.t W2: [[-0.05355395]
 [-0.06044564]
 [-0.06279225]
 [-0.06094853]]
Gradient w.r.t W1: [[-0.00800912 -0.00398036 -0.00437293 -0.00224655]
 [-0.00826884 -0.00410943 -0.00451473 -0.0023194 ]
 [-0.00838213 -0.00416574 -0.00457659 -0.00235118]
 [-0.00836132 -0.0041554  -0.00456523 -0.00234535]]
Gradient w.r.t W0: [[-0.00286654 -0.00353481 -0.00071599 -0.00270369]
 [ 0.          0.          0.          0.        ]
 [-0.00286654 -0.00353481 -0.00071599 -0.00270369]]
