In [1]:
import torch
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable
import matplotlib.pyplot as plt

# Question 1
Given a fully connected Neural Network as follows:
1. Input (x1,x2): 2 nodes
2. First hidden layer: 10 nodes, with weights (w) and bias (b), sigmoid activation 
function
3. Second hidden layer: 10 nodes, with weights (w) and bias (b), sigmoid activation 
function
4. Output (predict): 1 node

In [2]:
net1 = torch.nn.Sequential(
    torch.nn.Linear(2,10),
    torch.nn.Sigmoid(),
    torch.nn.Linear(10,10),
    torch.nn.Sigmoid(),
    torch.nn.Linear(10,1)
)

In [3]:
print(net1)

Sequential(
  (0): Linear(in_features=2, out_features=10, bias=True)
  (1): Sigmoid()
  (2): Linear(in_features=10, out_features=10, bias=True)
  (3): Sigmoid()
  (4): Linear(in_features=10, out_features=1, bias=True)
)


# Question 2
Generate the input date (x1,x2) \in [0,1] drawn from a uniform random distribution

In [4]:
sampler = torch.distributions.Uniform(low=0, high=1)
x = sampler.sample((100,2))
x

tensor([[0.0845, 0.4541],
        [0.1000, 0.0077],
        [0.5280, 0.7234],
        [0.4587, 0.0669],
        [0.2261, 0.5997],
        [0.4660, 0.7235],
        [0.8067, 0.8530],
        [0.2272, 0.8020],
        [0.7508, 0.5379],
        [0.7824, 0.7798],
        [0.6293, 0.3630],
        [0.1827, 0.5907],
        [0.8485, 0.8222],
        [0.3559, 0.1848],
        [0.0068, 0.0971],
        [0.3872, 0.9265],
        [0.1222, 0.7810],
        [0.1364, 0.1409],
        [0.8931, 0.9876],
        [0.2827, 0.9217],
        [0.8821, 0.7959],
        [0.3673, 0.1539],
        [0.1380, 0.5071],
        [0.3385, 0.0753],
        [0.9487, 0.4769],
        [0.9868, 0.1300],
        [0.0333, 0.1226],
        [0.4721, 0.7510],
        [0.2858, 0.4291],
        [0.6794, 0.8045],
        [0.4923, 0.0603],
        [0.8873, 0.4778],
        [0.2597, 0.6872],
        [0.6658, 0.2171],
        [0.8340, 0.9294],
        [0.7972, 0.3764],
        [0.2811, 0.1619],
        [0.9062, 0.2838],
        [0.5

# Question 3
Generate the labels y = (x1*x1+x2*x2)/2

In [5]:
y = (x[:,0]*x[:,0] + x[:,1]*x[:,1])/2
y

tensor([1.0670e-01, 5.0284e-03, 4.0107e-01, 1.0743e-01, 2.0540e-01, 3.7027e-01,
        6.8917e-01, 3.4741e-01, 4.2650e-01, 6.1016e-01, 2.6390e-01, 1.9113e-01,
        6.9800e-01, 8.0425e-02, 4.7352e-03, 5.0413e-01, 3.1245e-01, 1.9231e-02,
        8.8650e-01, 4.6476e-01, 7.0576e-01, 7.9289e-02, 1.3808e-01, 6.0136e-02,
        5.6368e-01, 4.9536e-01, 8.0743e-03, 3.9340e-01, 1.3290e-01, 5.5439e-01,
        1.2298e-01, 5.0783e-01, 2.6985e-01, 2.4521e-01, 7.7962e-01, 3.8861e-01,
        5.2615e-02, 4.5084e-01, 3.6459e-01, 8.2902e-02, 4.8593e-01, 9.1232e-01,
        3.8626e-01, 4.0845e-01, 1.1007e-02, 9.2333e-02, 5.4997e-01, 1.9417e-01,
        1.4074e-01, 2.0488e-01, 1.9824e-01, 8.4020e-02, 1.7813e-01, 8.3347e-01,
        4.7391e-01, 3.2822e-01, 6.4327e-01, 5.6733e-01, 7.0137e-04, 3.7293e-01,
        3.3281e-02, 2.3647e-01, 5.2337e-01, 4.9467e-01, 4.5381e-01, 1.5188e-02,
        4.0496e-01, 4.4260e-01, 4.5484e-01, 3.4376e-02, 2.8905e-01, 7.3342e-03,
        5.0232e-01, 4.6262e-01, 5.0572e-

# Question 4
 Implement a loss function L = (predict-y)^2
 

In [6]:
def L(predict,y):
    return torch.sum((predict-y)**2)

# Question 5
Use batch size of 1, that means feed data one point at a time into network and compute 
the loss. Do one time forward propagation with one data point.

In [7]:
x[0]

tensor([0.0845, 0.4541])

In [8]:
y[0]

tensor(0.1067)

In [9]:
optimizer = torch.optim.SGD(net1.parameters(), lr=0.05)
optimizer.zero_grad()
prediction = net1(x[0])
loss = L(prediction, y[0])
loss.backward()
print("Loss = " + str(loss.data.numpy()))

Loss = 0.6834869


# Question 6
 Compute the gradients using pytorch autograd:
 
a. dL/dw, dL/db

b. Print these values into a text file: torch_autograd.dat

In [10]:
input_layer = net1[0]
input_layer

Linear(in_features=2, out_features=10, bias=True)

In [11]:
hidden_layer1 = net1[2]
hidden_layer1

Linear(in_features=10, out_features=10, bias=True)

In [12]:
hidden_layer2 = net1[4]
hidden_layer2

Linear(in_features=10, out_features=1, bias=True)

In [13]:
print("Loss = " + str(np.round(loss.item(),5)))
print("y_prediction:" + str(np.round(prediction.tolist(),5)))
print("Input_layer w_gradient: " + str(np.round(input_layer.weight.grad.tolist(),5)))
print("Input_layer b_gradient: " + str(np.round(input_layer.bias.grad.tolist(),5)))
print("Hidden_layer_1 weight_gradient: " + str(np.round(hidden_layer1.weight.grad.tolist(),5)))
print("Hidden_layer_1 bias_gradient: " + str(np.round(hidden_layer1.bias.grad.tolist(),5)))
print("Hidden_layer_2 weight_gradient: " + str(np.round(hidden_layer2.weight.grad.tolist()[0],5)))
print("Hidden_layer_2 bias_gradient: " + str(np.round(hidden_layer2.bias.grad.tolist(),5)))

Loss = 0.68349
y_prediction:[-0.72003]
Input_layer w_gradient: [[-1.00e-05 -8.00e-05]
 [ 2.10e-04  1.13e-03]
 [ 9.00e-05  5.00e-04]
 [-1.30e-04 -6.80e-04]
 [-2.80e-04 -1.52e-03]
 [ 6.80e-04  3.67e-03]
 [-1.13e-03 -6.05e-03]
 [ 2.10e-04  1.12e-03]
 [ 1.10e-03  5.93e-03]
 [ 9.00e-04  4.85e-03]]
Input_layer b_gradient: [-0.00017  0.00248  0.0011  -0.0015  -0.00336  0.00809 -0.01333  0.00246
  0.01305  0.01068]
Hidden_layer_1 weight_gradient: [[ 0.02321  0.02342  0.0252   0.03736  0.03658  0.03411  0.0398   0.02897
   0.02532  0.01899]
 [ 0.02352  0.02373  0.02553  0.03786  0.03706  0.03456  0.04032  0.02935
   0.02565  0.01924]
 [ 0.03394  0.03424  0.03684  0.05463  0.05348  0.04988  0.05819  0.04236
   0.03702  0.02776]
 [ 0.0408   0.04117  0.0443   0.06568  0.0643   0.05997  0.06997  0.05093
   0.04451  0.03338]
 [ 0.0046   0.00464  0.00499  0.0074   0.00724  0.00675  0.00788  0.00574
   0.00501  0.00376]
 [-0.01027 -0.01036 -0.01115 -0.01653 -0.01618 -0.01509 -0.01761 -0.01282
  -0.011

# Question 7
Implement the forward propagation and backpropagation algorithm from scratch, 
without using pytorch autograd, compute the gradients using your implementation

a. dL/dw, dL/db

b. Print these values into a text file: my_autograd.dat

In [14]:
w1 = input_layer.weight.t().detach().numpy()
b1 = input_layer.bias.detach().numpy()
w2 = hidden_layer1.weight.t().detach().numpy()
b2 = hidden_layer1.bias.detach().numpy()
w3 = hidden_layer2.weight.t().detach().numpy()
b3 = hidden_layer2.bias.detach().numpy()

In [15]:
w1

array([[-0.2655839 ,  0.65283793,  0.6149356 , -0.38900924,  0.58017904,
        -0.5895183 ,  0.07981533,  0.14305836, -0.00192529,  0.32974452],
       [-0.31671306, -0.3248271 ,  0.43579274,  0.46074122,  0.46709436,
        -0.38845772,  0.2446639 , -0.26276284,  0.4096616 , -0.53121144]],
      dtype=float32)

In [16]:
x0 = x[0].numpy()
x0 = np.round(x0,4)
x0

array([0.0845, 0.4541], dtype=float32)

In [17]:
a=np.ones([1,1])
a[0][0]=y[0]
a

array([[0.10669841]])

In [18]:
y0 = a
y0 = np.round(y0,4)
y0

array([[0.1067]])

## Forward

In [19]:
hidden1 = x0.dot(w1)+b1
hidden1_sigmoid = 1.0 / (1.0 + np.exp(-hidden1))
hidden2 = hidden1_sigmoid.dot(w2)+b2
hidden2_sigmoid = 1.0 / (1.0 + np.exp(-hidden2))
prediction_ = hidden2_sigmoid.dot(w3)+b3
loss_forward = np.square(prediction_ - y0).sum()

## Backward

In [20]:
dy_prediction_ = 2.0 * (prediction_ - y0)

dt = float(dy_prediction_)
dw3 = np.dot(hidden2_sigmoid.T, dt)
db3 = np.ones(1).dot(dt)

dt = np.dot(dt, w3.T)*hidden2_sigmoid*(1-hidden2_sigmoid)
dw2 = np.dot(hidden1_sigmoid.reshape(len(hidden1_sigmoid),1), dt)
db2 = np.ones(1).dot(dt)

dt = np.dot(dt, w2.T)*hidden1_sigmoid*(1-hidden1_sigmoid)
dw1 = np.dot(x0.reshape(len(x0),1), dt)
db1 = np.ones(1).dot(dt)

w1 -= 0.05 * dw1 
w2 -= 0.05 * dw2
w3 -= 0.05 * dw3.reshape(len(dw3),1)

b1 -= 0.05 * db1 
b2 -= 0.05 * db2
b3 -= 0.05 * db3

In [21]:
print("Loss = " + str(round(loss_forward,5)))
print("y prediction: " + str(np.round(prediction_,5)))
print("Input_layer w_gradient: " + str(np.round(dw1.T,5)))
print("Input_layer b_gradient: " +  str(np.round(db1,5)))
print("Hidden_layer_1 weight_gradient: " + str(np.round(dw2,5)))
print("Hidden_layer_1 bias_gradient: " + str(np.round(db2,5)))
print("Hidden_layer_2 weight_gradient: " + str(np.round(dw3,5)))   
print("Hidden_layer_2 bias_gradient: " + str(np.round(db3,5)))    

Loss = 0.68349
y prediction: [-0.72003]
Input_layer w_gradient: [[-1.00e-05 -8.00e-05]
 [ 2.10e-04  1.13e-03]
 [ 9.00e-05  5.00e-04]
 [-1.30e-04 -6.80e-04]
 [-2.80e-04 -1.52e-03]
 [ 6.80e-04  3.67e-03]
 [-1.13e-03 -6.05e-03]
 [ 2.10e-04  1.12e-03]
 [ 1.10e-03  5.93e-03]
 [ 9.00e-04  4.85e-03]]
Input_layer b_gradient: [-0.00017  0.00248  0.0011  -0.0015  -0.00336  0.00809 -0.01333  0.00246
  0.01305  0.01068]
Hidden_layer_1 weight_gradient: [[ 0.02321  0.02352  0.03394  0.0408   0.0046  -0.01027  0.03103 -0.01203
  -0.0257   0.02161]
 [ 0.02342  0.02373  0.03424  0.04117  0.00464 -0.01036  0.03131 -0.01214
  -0.02594  0.0218 ]
 [ 0.0252   0.02553  0.03684  0.0443   0.00499 -0.01115  0.03368 -0.01307
  -0.02791  0.02346]
 [ 0.03736  0.03786  0.05463  0.06568  0.0074  -0.01653  0.04995 -0.01937
  -0.04138  0.03478]
 [ 0.03658  0.03706  0.05348  0.0643   0.00724 -0.01618  0.0489  -0.01897
  -0.04051  0.03405]
 [ 0.03411  0.03456  0.04988  0.05997  0.00675 -0.01509  0.0456  -0.01769
  -0.03

# Question 8
Compare the two files torch_autograd.dat and my_autograd.dat and show that they give the same values up to numerical precision errors

In [22]:
print("loss_deviation = " + str(np.round(loss.item(),5) - np.round(loss_forward.item(),5)))
print("prediction_deviation = " + str(np.round(prediction.item(),5) - np.round(prediction_.item(),5)))
print("input_layer w_deviation = " + str((np.round(input_layer.weight.grad.tolist(),4)-np.round(dw1.T,4)).sum()))
print("input_layer b_deviation = " + str((np.round(input_layer.bias.grad.tolist(),4)-np.round(db1,4)).sum()))
print("hidden_layer_1 w_deviation = " + str((np.round(hidden_layer1.weight.grad.tolist(),4)-np.round(dw2,4)).sum()))
print("hidden_layer_1 b_deviation = " + str((np.round(hidden_layer1.bias.grad.tolist(),4)-np.round(db2,4)).sum()))
print("hidden_layer_2 w_deviation = " + str((np.round(hidden_layer2.weight.grad.tolist(),4)-np.round(dw3,4)).sum()))
print("hidden_layer_2 b_deviation = " + str((np.round(hidden_layer2.bias.grad.tolist(),4)-np.round(db3,4)).sum()))

loss_deviation = 0.0
prediction_deviation = 0.0
input_layer w_deviation = 0.0
input_layer b_deviation = 0.0
hidden_layer_1 w_deviation = 0.0
hidden_layer_1 b_deviation = 0.0
hidden_layer_2 w_deviation = 0.0
hidden_layer_2 b_deviation = 0.0


# And I compare all the gradients of two methods, the result are the same.