In [1]:
# Imports, default torch dtype/device, and NumPy random seed
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Seed NumPy's global RNG so every later np.random draw is reproducible.
np.random.seed(5980202)

# Default tensor settings: torch.float (32-bit) on the CPU.
device = torch.device("cpu")
dtype = torch.float

In [2]:
# 3.c -- problem setup
N = 2  # dimension of the input and target vectors
H = 3  # width of the two hidden layers

# Random input/target pair, each an (N, 1) column vector.
# The draw order (x, y, W1, W2, W3) determines which values each array
# receives from the seeded global RNG, so it is kept exactly as before.
x, y = (np.random.randn(N, 1) for _ in range(2))

# Randomly initialised weights: W1 is (H, N), W2 is (H, H), W3 is (N, H).
W1, W2, W3 = (
    np.random.randn(rows, cols)
    for rows, cols in ((H, N), (H, H), (N, H))
)

In [3]:
# 3.c.i (Analytic Gradient)
# Forward pass: two ReLU layers followed by a linear output layer.
M1 = np.matmul(W1, x)    # layer-1 pre-activation
J = np.maximum(M1, 0)    # layer-1 ReLU output
M2 = np.matmul(W2, J)    # layer-2 pre-activation
K = np.maximum(M2, 0)    # layer-2 ReLU output
M3 = np.matmul(W3, K)    # network output
L = y - M3               # residual (target minus prediction)

# Gradient calculation: push -residual back through the layers.
W3_1_grad = np.matmul(np.negative(L), K.T)

# Layer 2: back through W3, then zero the entries where the ReLU
# pre-activation was negative (elementwise product with the ReLU derivative).
temp = np.matmul(np.negative(W3.T), L)
temp[M2 < 0] = 0
W2_1_grad = np.matmul(temp, J.T)

# Layer 1: same pattern, back through W2 and the first ReLU.
temp = np.matmul(W2.T, temp)
temp[M1 < 0] = 0
W1_1_grad = np.matmul(temp, x.T)

In [4]:
# 3.c.ii (Backward pass)
class Node:
    """One vertex of the hand-built computation graph.

    Holds a value ``v`` and a derivative slot ``d``; both are plain
    attributes that the surrounding code assigns directly.
    """

    def __init__(self, v=0):
        # v: value stored at the node (defaults to 0 until assigned)
        # d: derivative slot, always initialised to 0
        self.v, self.d = v, 0

# Creating Computation graph
# Leaf nodes: the data pair and independent copies of the three weights.
X, Y = Node(x), Node(y)
W1_2, W2_2, W3_2 = (Node(np.copy(W)) for W in (W1, W2, W3))

# Intermediate nodes; their values are assigned in the forward pass.
V1, V2, V3, V4, V5, D, Z = (Node() for _ in range(7))

# Forward pass
V1.v = np.matmul(W1_2.v, X.v)          # layer-1 pre-activation
V2.v = np.maximum(V1.v, 0)             # ReLU
V3.v = np.matmul(W2_2.v, V2.v)         # layer-2 pre-activation
V4.v = np.maximum(V3.v, 0)             # ReLU
V5.v = np.matmul(W3_2.v, V4.v)         # network output
D.v = Y.v - V5.v                       # residual
Z.v = 0.5 * np.linalg.norm(D.v) ** 2   # half squared norm of the residual

# Backward pass
# Layout used below: D.d, V5.d, V3.d and V1.d are stored transposed (rows),
# V4.d and V2.d are columns, and each weight derivative has the shape of
# its weight matrix.
Z.d = 1
D.d = D.v.T
Y.d = 0                                # nothing is propagated to the target
V5.d = np.negative(D.d)
W3_2.d = np.matmul(V5.d.T, V4.v.T)
V4.d = np.matmul(W3_2.v.T, V5.d.T)

# Through the second ReLU: zero entries whose pre-activation was negative.
temp = V4.d.copy()
temp[V3.v < 0] = 0
V3.d = temp.T
W2_2.d = np.matmul(V3.d.T, V2.v.T)
V2.d = np.matmul(W2_2.v.T, V3.d.T)

# Through the first ReLU, same masking.
temp = V2.d.copy()
temp[V1.v < 0] = 0
V1.d = temp.T
W1_2.d = np.matmul(V1.d.T, X.v.T)

In [5]:
# 3.c.iii (Gradients via Autograd)
# One forward/backward pass only -- no optimisation step is taken here.
# (The torch.nn.functional import that used to sit in this cell now lives
# with the other imports in the first cell.)

# Initialize weights, input and output.
# torch.tensor copies the NumPy data and keeps its float64 dtype.
x_tensor = torch.tensor(x)
y_tensor = torch.tensor(y)

# Leaf tensors that autograd tracks; their .grad is filled by backward().
W1_3 = torch.tensor(W1, requires_grad=True)
W2_3 = torch.tensor(W2, requires_grad=True)
W3_3 = torch.tensor(W3, requires_grad=True)

# Forward pass: clamp(min=0) is the ReLU of each hidden layer.
A = W1_3.mm(x_tensor).clamp(min=0)
B = W2_3.mm(A).clamp(min=0)
C = W3_3.mm(B)
# Half the sum of squared errors between prediction and target.
loss = (C - y_tensor).pow(2).sum()/2

# automatic backward pass
loss.backward()

In [6]:
# From previous assignment
def matrix_comp(A_1, A_2):
    """Return the relative difference ``||A_1 - A_2|| / ||A_1||``.

    Parameters
    ----------
    A_1 : array_like
        Reference matrix; its norm is the denominator.
    A_2 : array_like
        Matrix compared against ``A_1``.

    Returns
    -------
    float
        Relative difference using the default ``np.linalg.norm``
        (Frobenius norm for 2-D input). When ``A_1`` has zero norm the
        ratio is undefined, so the result is ``0.0`` if the two inputs are
        equal and ``inf`` otherwise.
    """
    # Coerce both operands to ndarrays so the subtraction does not depend on
    # how mixed operand types (lists, array-likes) handle ``-``.
    reference = np.asarray(A_1)
    candidate = np.asarray(A_2)

    difference_norm = np.linalg.norm(reference - candidate)
    reference_norm = np.linalg.norm(reference)

    if reference_norm == 0:
        # Avoid 0/0 -> nan (or x/0 -> inf) and the RuntimeWarning that
        # comes with it.
        return 0.0 if difference_norm == 0 else float("inf")
    return difference_norm / reference_norm

In [7]:
# Relative difference between the autograd gradients and the analytic ones.
print('Comparing Autograd VS Analytic Gradient')
for message, autograd_grad, analytic_grad in (
    ('W1 Gradient Comparison: ', W1_3.grad, W1_1_grad),
    ('W2 Gradient Comparison: ', W2_3.grad, W2_1_grad),
    ('W3 Gradient Comparison: ', W3_3.grad, W3_1_grad),
):
    print(message, matrix_comp(autograd_grad, analytic_grad))

Comparing Autograd VS Analytic Gradient
W1 Gradient Comparison:  1.9707605543043863e-16
W2 Gradient Comparison:  3.0492163003753727e-17
W3 Gradient Comparison:  0.0


In [8]:
# Relative difference between the autograd gradients and the ones from the
# hand-written backward pass over the Node graph.
print('Comparing Autograd VS Reverse Mode Auto Differentiation')
for message, autograd_grad, manual_grad in (
    ('W1 Gradient Comparison: ', W1_3.grad, W1_2.d),
    ('W2 Gradient Comparison: ', W2_3.grad, W2_2.d),
    ('W3 Gradient Comparison: ', W3_3.grad, W3_2.d),
):
    print(message, matrix_comp(autograd_grad, manual_grad))

Comparing Autograd VS Reverse Mode Auto Differentiation
W1 Gradient Comparison:  1.9707605543043863e-16
W2 Gradient Comparison:  3.0492163003753727e-17
W3 Gradient Comparison:  0.0


In [9]:
# 3ci gradients -- analytic W1, W2, W3 gradients, one matrix after another
print('\n'.join(str(grad) for grad in (W1_1_grad, W2_1_grad, W3_1_grad)))

[[  9.8147963  -10.00224379]
 [  1.67837113  -1.71042544]
 [ -1.89755417   1.93379453]]
[[0.         0.         0.        ]
 [7.95080507 5.96399103 7.56588437]
 [1.11280508 0.83472799 1.05893108]]
[[0.         4.00047362 8.84974739]
 [0.         3.77378988 8.34828331]]


In [10]:
# 3cii gradients -- W1, W2, W3 derivatives from the manual backward pass
print('\n'.join(str(grad) for grad in (W1_2.d, W2_2.d, W3_2.d)))

[[  9.8147963  -10.00224379]
 [  1.67837113  -1.71042544]
 [ -1.89755417   1.93379453]]
[[0.         0.         0.        ]
 [7.95080507 5.96399103 7.56588437]
 [1.11280508 0.83472799 1.05893108]]
[[0.         4.00047362 8.84974739]
 [0.         3.77378988 8.34828331]]


In [11]:
# 3ciii gradients -- W1, W2, W3 gradients accumulated by torch autograd
print('\n'.join(str(grad) for grad in (W1_3.grad, W2_3.grad, W3_3.grad)))

tensor([[  9.8148, -10.0022],
        [  1.6784,  -1.7104],
        [ -1.8976,   1.9338]], dtype=torch.float64)
tensor([[-0.0000, -0.0000, -0.0000],
        [7.9508, 5.9640, 7.5659],
        [1.1128, 0.8347, 1.0589]], dtype=torch.float64)
tensor([[0.0000, 4.0005, 8.8497],
        [0.0000, 3.7738, 8.3483]], dtype=torch.float64)
