### Warm-up: numpy

In [1]:
# Code in file tensor/two_layer_net_numpy.py
import numpy as np

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and regression targets, both drawn from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weight matrices of the two linear layers, also standard-normal.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = h.clip(min=0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss; the residual is kept because the backward pass
    # needs it as well.
    residual = y_pred - y
    loss = (residual ** 2).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, walking from the loss
    # back to each weight matrix.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied in place. Both gradients were
    # computed above, so the order of the two updates does not matter.
    w2 -= learning_rate * grad_w2
    w1 -= learning_rate * grad_w1

0 35085673.5744
1 29152320.9546
2 25719684.9325
3 21270186.6668
4 15828714.7687
5 10572329.3388
6 6660169.42016
7 4150003.44604
8 2689480.4031
9 1852591.94893
10 1362372.1164
11 1057579.53626
12 854921.702839
13 710857.047815
14 602503.819072
15 517260.634328
16 448319.045746
17 391324.169916
18 343486.084949
19 302922.148328
20 268182.948197
21 238225.751713
22 212271.752935
23 189697.833477
24 169976.02322
25 152683.956417
26 137453.510157
27 123994.015395
28 112067.220889
29 101474.947068
30 92042.8291108
31 83628.9427811
32 76110.6034995
33 69369.4931511
34 63318.9071856
35 57870.2845482
36 52956.6638644
37 48522.3325789
38 44508.474197
39 40868.8768494
40 37564.6362527
41 34562.134672
42 31830.8405168
43 29343.3164225
44 27074.5845803
45 25001.5573751
46 23105.9826738
47 21370.7079083
48 19780.0159832
49 18320.6785824
50 16980.6690085
51 15748.8715034
52 14615.9564306
53 13572.6996569
54 12611.3042657
55 11724.9292348
56 10906.865741
57 10151.9246323
58 9453.95090619
59 8808.66967

### PyTorch: Tensors

In [3]:
# Code in file tensor/two_layer_net_tensor.py
import torch

# Choose the tensor type from the hardware that is actually present: use the
# CUDA float type when a GPU is available, otherwise fall back to the CPU
# float type so the cell still runs on CPU-only machines.
if torch.cuda.is_available():
    dtype = torch.cuda.FloatTensor
else:
    dtype = torch.FloatTensor

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y (linear -> ReLU -> linear)
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print the sum-of-squares loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss)

    # Backprop by hand to compute gradients of the loss with respect to
    # w1 and w2
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # Clone before masking so grad_h_relu itself is left untouched; the ReLU
    # passes no gradient where its input was negative.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights in place using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 41882532.0
1 41397944.0
2 40370916.0
3 32229242.0
4 19835026.0
5 9919456.0
6 4810091.5
7 2645262.5
8 1731744.5
9 1286385.375
10 1024079.125
11 843522.5
12 707276.625
13 599408.4375
14 511876.625
15 439779.4375
16 379860.71875
17 329624.125
18 287365.15625
19 251542.46875
20 220984.296875
21 194766.515625
22 172166.96875
23 152602.875
24 135611.3125
25 120805.390625
26 107876.8203125
27 96535.640625
28 86562.8515625
29 77771.3515625
30 70000.265625
31 63109.96875
32 56993.0234375
33 51549.625
34 46697.6484375
35 42366.55859375
36 38490.48828125
37 35016.59765625
38 31900.232421875
39 29098.12890625
40 26581.78125
41 24312.78515625
42 22263.216796875
43 20408.951171875
44 18729.52734375
45 17207.3515625
46 15824.5859375
47 14567.6728515625
48 13423.7607421875
49 12381.3994140625
50 11430.263671875
51 10561.7255859375
52 9768.1044921875
53 9042.0576171875
54 8376.904296875
55 7767.08203125
56 7208.009765625
57 6694.4599609375
58 6222.5556640625
59 5788.0595703125
60 5387.8486328125
61 5

### PyTorch: Variables and autograd

In [7]:
# Code in file autograd/two_layer_net_autograd.py
import torch
from torch.autograd import Variable

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Variables during the backward pass.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Variables during the backward pass.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Variables; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute the sum-of-squares loss using operations on Variables.
    loss = (y_pred - y).pow(2).sum()

    # Read the loss out as a plain number for printing. Releases that provide
    # `.item()` return a 0-dimensional loss that cannot be indexed with [0],
    # so prefer `.item()` there and keep `loss.data[0]` for releases without it.
    loss_value = loss.item() if hasattr(loss, "item") else loss.data[0]
    print(t, loss_value)

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Variables with requires_grad=True.
    # After this call w1.grad and w2.grad hold the gradient of the loss with
    # respect to w1 and w2 respectively.
    loss.backward()

    # Update weights using gradient descent. The update goes through .data so
    # that it operates on the underlying Tensors rather than being recorded
    # as part of the autograd graph.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after the update so that the next call to
    # backward() starts from zero instead of adding to this iteration's values.
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 27118618.0
1 27118618.0
2 27118618.0
3 27118618.0
4 27118618.0
5 27118618.0
6 27118618.0
7 27118618.0
8 27118618.0
9 27118618.0
10 27118618.0
11 27118618.0
12 27118618.0
13 27118618.0
14 27118618.0
15 27118618.0
16 27118618.0
17 27118618.0
18 27118618.0
19 27118618.0
20 27118618.0
21 27118618.0
22 27118618.0
23 27118618.0
24 27118618.0
25 27118618.0
26 27118618.0
27 27118618.0
28 27118618.0
29 27118618.0
30 27118618.0
31 27118618.0
32 27118618.0
33 27118618.0
34 27118618.0
35 27118618.0
36 27118618.0
37 27118618.0
38 27118618.0
39 27118618.0
40 27118618.0
41 27118618.0
42 27118618.0
43 27118618.0
44 27118618.0
45 27118618.0
46 27118618.0
47 27118618.0
48 27118618.0
49 27118618.0
50 27118618.0
51 27118618.0
52 27118618.0
53 27118618.0
54 27118618.0
55 27118618.0
56 27118618.0
57 27118618.0
58 27118618.0
59 27118618.0
60 27118618.0
61 27118618.0
62 27118618.0
63 27118618.0
64 27118618.0
65 27118618.0
66 27118618.0
67 27118618.0
68 27118618.0
69 27118618.0
70 27118618.0
71 27118618.0
72