### Warm-up: numpy

In [1]:
# Code in file tensor/two_layer_net_numpy.py
import numpy as np

# Problem size: N samples per batch, D_in input features,
# H hidden units and D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# One fixed random batch of inputs and regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Both weight matrices start from standard-normal noise.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss; the residual is reused by the backward pass.
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes gradient only where its input was not negative.
    grad_h = np.where(h < 0, 0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 35085673.5744
1 29152320.9546
2 25719684.9325
3 21270186.6668
4 15828714.7687
5 10572329.3388
6 6660169.42016
7 4150003.44604
8 2689480.4031
9 1852591.94893
10 1362372.1164
11 1057579.53626
12 854921.702839
13 710857.047815
14 602503.819072
15 517260.634328
16 448319.045746
17 391324.169916
18 343486.084949
19 302922.148328
20 268182.948197
21 238225.751713
22 212271.752935
23 189697.833477
24 169976.02322
25 152683.956417
26 137453.510157
27 123994.015395
28 112067.220889
29 101474.947068
30 92042.8291108
31 83628.9427811
32 76110.6034995
33 69369.4931511
34 63318.9071856
35 57870.2845482
36 52956.6638644
37 48522.3325789
38 44508.474197
39 40868.8768494
40 37564.6362527
41 34562.134672
42 31830.8405168
43 29343.3164225
44 27074.5845803
45 25001.5573751
46 23105.9826738
47 21370.7079083
48 19780.0159832
49 18320.6785824
50 16980.6690085
51 15748.8715034
52 14615.9564306
53 13572.6996569
54 12611.3042657
55 11724.9292348
56 10906.865741
57 10151.9246323
58 9453.95090619
59 8808.66967

### PyTorch: Tensors

In [3]:
# Code in file tensor/two_layer_net_tensor.py
import torch

# Choose the tensor type from the hardware that is actually present, so the
# cell runs on CPU-only machines and uses the GPU when one is available.
if torch.cuda.is_available():
    dtype = torch.cuda.FloatTensor
else:
    dtype = torch.FloatTensor

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss. float() gives the same plain-number output
    # whether the reduction returns a Python float or a 0-dimensional tensor.
    loss = (y_pred - y).pow(2).sum()
    print(t, float(loss))

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # ReLU passes gradient only where its input was not negative; clone first
    # so grad_h_relu itself is left unmodified.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 41882532.0
1 41397944.0
2 40370916.0
3 32229242.0
4 19835026.0
5 9919456.0
6 4810091.5
7 2645262.5
8 1731744.5
9 1286385.375
10 1024079.125
11 843522.5
12 707276.625
13 599408.4375
14 511876.625
15 439779.4375
16 379860.71875
17 329624.125
18 287365.15625
19 251542.46875
20 220984.296875
21 194766.515625
22 172166.96875
23 152602.875
24 135611.3125
25 120805.390625
26 107876.8203125
27 96535.640625
28 86562.8515625
29 77771.3515625
30 70000.265625
31 63109.96875
32 56993.0234375
33 51549.625
34 46697.6484375
35 42366.55859375
36 38490.48828125
37 35016.59765625
38 31900.232421875
39 29098.12890625
40 26581.78125
41 24312.78515625
42 22263.216796875
43 20408.951171875
44 18729.52734375
45 17207.3515625
46 15824.5859375
47 14567.6728515625
48 13423.7607421875
49 12381.3994140625
50 11430.263671875
51 10561.7255859375
52 9768.1044921875
53 9042.0576171875
54 8376.904296875
55 7767.08203125
56 7208.009765625
57 6694.4599609375
58 6222.5556640625
59 5788.0595703125
60 5387.8486328125
61 5

### PyTorch: Variables and autograd

In [8]:
# Code in file autograd/two_layer_net_autograd.py
import torch
from torch.autograd import Variable

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Variables during the backward pass.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Variables during the backward pass.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Variables; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Variables.
    # loss holds a single value; loss.item() extracts it as a Python number.
    # (Indexing it with loss.data[0] is rejected when the reduction result is
    # 0-dimensional.)
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Variables with requires_grad=True.
    # After this call w1.grad and w2.grad will hold the gradient of the loss
    # with respect to w1 and w2 respectively.
    loss.backward()

    # Update weights using gradient descent. The update goes through .data so
    # that the step itself is not recorded by autograd.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after the update: backward() accumulates
    # into .grad, so stale values would otherwise leak into the next iteration.
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 22377096.0
1 19034090.0
2 19379214.0
3 21014446.0
4 21881262.0
5 20300738.0
6 16203696.0
7 11085920.0
8 6760728.5
9 3891078.0
10 2254802.5
11 1381930.875
12 920535.375
13 667264.8125
14 518036.8125
15 422081.34375
16 354813.375
17 304161.0625
18 263985.71875
19 230973.46875
20 203265.65625
21 179709.78125
22 159457.96875
23 141940.078125
24 126692.140625
25 113382.9140625
26 101712.75
27 91449.0859375
28 82393.640625
29 74379.59375
30 67267.5703125
31 60943.046875
32 55307.73046875
33 50281.6015625
34 45787.12890625
35 41755.6171875
36 38135.63671875
37 34878.83203125
38 31941.423828125
39 29290.90625
40 26892.0859375
41 24719.134765625
42 22745.91015625
43 20953.205078125
44 19321.705078125
45 17836.287109375
46 16481.330078125
47 15243.416015625
48 14111.955078125
49 13076.30078125
50 12127.0126953125
51 11255.9755859375
52 10455.509765625
53 9719.4482421875
54 9042.8232421875
55 8421.591796875
56 7848.515625
57 7319.64990234375
58 6830.8720703125
59 6379.00146484375
60 5960.504394

402 0.0003615499008446932
403 0.00035100162494927645
404 0.00034203394898213446
405 0.0003327681333757937
406 0.00032392231514677405
407 0.00031438632868230343
408 0.0003065534110646695
409 0.00029807418468408287
410 0.0002904129505623132
411 0.0002825179835781455
412 0.0002752671716734767
413 0.0002676322765182704
414 0.0002607099886517972
415 0.0002536948013585061
416 0.00024679023772478104
417 0.0002403460821369663
418 0.00023492882610298693
419 0.00022852406254969537
420 0.000222338130697608
421 0.0002176494017476216
422 0.00021311633463483304
423 0.00020772573770955205
424 0.00020254182163625956
425 0.00019794123363681138
426 0.0001932195882545784
427 0.0001887372782221064
428 0.00018430115596856922
429 0.00017973802459891886
430 0.00017556057719048113
431 0.00017122150165960193
432 0.0001682949368841946
433 0.00016469851834699512
434 0.00016121334920171648
435 0.00015747279394418
436 0.00015372165944427252
437 0.0001503937237430364
438 0.00014740013284608722
439 0.000143952507642

### PyTorch: Defining new autograd functions

In [12]:
# Code in file autograd/two_layer_net_custom_function.py
import torch
from torch.autograd import Variable

class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.

    Both passes are static methods that receive a context object ``ctx``; the
    function is invoked through ``MyReLU.apply`` rather than by instantiating
    the class.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return a
        Tensor containing the output. ``ctx`` is a context object used to stash
        information for the backward pass; Tensors are cached on it with
        ``ctx.save_for_backward``.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        # Clone so the incoming gradient is not modified in place, then block
        # the gradient wherever the forward input was negative.
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input


# Choose the tensor type from the hardware that is actually present, so the
# cell runs on CPU-only machines and uses the GPU when one is available.
if torch.cuda.is_available():
    dtype = torch.cuda.FloatTensor
else:
    dtype = torch.FloatTensor

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Custom Functions are applied through their .apply method; we alias it
    # as relu to use in our network.
    relu = MyReLU.apply

    # Forward pass: compute predicted y using operations on Variables; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss; loss.item() extracts the scalar as a Python number.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after the update: backward() accumulates
    # into .grad, so stale values would otherwise leak into the next iteration.
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 32114520.0
1 27009834.0
2 23622512.0
3 19190498.0
4 14071144.0
5 9331276.0
6 5879006.0
7 3684305.0
8 2401114.5
9 1658749.5
10 1218721.0
11 942771.0
12 758475.0625
13 626876.3125
14 527602.0
15 449697.375
16 386727.375
17 334730.5
18 291242.125
19 254544.3125
20 223325.328125
21 196590.40625
22 173573.671875
23 153678.4375
24 136428.03125
25 121421.109375
26 108320.34375
27 96869.7734375
28 86825.9140625
29 77982.4375
30 70176.9140625
31 63267.24609375
32 57135.34375
33 51684.0390625
34 46829.01171875
35 42495.40234375
36 38621.4609375
37 35153.2421875
38 32041.98828125
39 29248.029296875
40 26734.92578125
41 24470.96484375
42 22426.390625
43 20578.470703125
44 18905.396484375
45 17389.16015625
46 16013.572265625
47 14762.6044921875
48 13624.7880859375
49 12588.0810546875
50 11642.0341796875
51 10778.1875
52 9987.966796875
53 9264.6923828125
54 8601.900390625
55 7993.92578125
56 7435.7705078125
57 6922.689453125
58 6450.3603515625
59 6015.4453125
60 5614.20458984375
61 5243.9296875
62

487 0.00027261607465334237
488 0.0002675472933333367
489 0.00026229163631796837
490 0.00025686275330372155
491 0.00025232823099941015
492 0.0002469093888066709
493 0.00024169577227439731
494 0.00023723565391264856
495 0.0002330009447177872
496 0.00022851926041767
497 0.0002238658635178581
498 0.00021945075422991067
499 0.00021549899247474968


### TensorFlow: Static Graphs

In [1]:
# Code in file autograd/tf_two_layer_net.py
import tensorflow as tf
import numpy as np

# Stage 1: describe the computation as a static graph. Nothing numeric is
# evaluated until the graph is run inside a session further down.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Placeholders stand in for the input and target batches; concrete arrays are
# supplied through feed_dict each time the graph is executed.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Weights live in Variables, which keep their value from one execution of the
# graph to the next. They are initialized from random normal data.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass, expressed symbolically: linear -> ReLU -> linear.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum-of-squares loss between targets and predictions.
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Ask TensorFlow for the gradient of the loss with respect to each weight.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Gradient-descent step. In TensorFlow the act of updating the weights is
# itself part of the computational graph, so the assign ops new_w1 and new_w2
# must be evaluated for the update to happen; in PyTorch the update happens
# outside the computational graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Stage 2: open a session and actually execute the graph.
with tf.Session() as sess:
    # Run the initializer once so w1 and w2 receive their starting values.
    sess.run(tf.global_variables_initializer())

    # Concrete numpy data bound to the placeholders on every step.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    feed = {x: x_value, y: y_value}

    # Fetching the two assign ops alongside the loss is what applies the
    # update; only the loss value is kept.
    fetches = [loss, new_w1, new_w2]
    for step in range(500):
        loss_value = sess.run(fetches, feed_dict=feed)[0]
        print(loss_value)

3.1985e+07
2.68149e+07
2.83946e+07
3.18445e+07
3.25176e+07
2.7697e+07
1.87915e+07
1.05402e+07
5.37383e+06
2.83132e+06
1.67747e+06
1.14142e+06
864604.0
699428.0
586919.0
502412.0
435288.0
380138.0
333890.0
294655.0
261052.0
232169.0
207188.0
185462.0
166483.0
149846.0
135196.0
122268.0
110817.0
100635.0
91559.8
83452.2
76184.0
69657.5
63786.8
58492.6
53706.9
49374.2
45449.6
41887.2
38645.0
35687.5
32986.9
30518.7
28258.2
26187.4
24286.2
22538.5
20931.2
19451.1
18087.7
16830.5
15669.6
14596.3
13603.7
12685.1
11835.1
11047.6
10317.0
9638.88
9008.92
8423.23
7878.6
7372.04
6900.35
6462.15
6054.16
5673.73
5318.94
4987.81
4678.86
4390.33
4120.61
3868.57
3632.84
3412.31
3205.9
3012.8
2832.18
2663.34
2505.14
2356.94
2218.22
2088.17
1966.23
1851.72
1744.19
1643.18
1548.36
1459.28
1375.57
1296.88
1222.88
1153.33
1087.89
1026.3
968.35
913.823
862.51
814.179
768.645
725.781
685.388
647.349
611.484
577.679
545.812
515.757
487.406
460.685
435.468
411.681
389.228
368.05
348.068
329.196
311.38
294.564


### PyTorch: nn
The `nn` package defines a set of **Modules**, which are roughly equivalent to neural network layers.

In [2]:
# Code in file nn/two_layer_net_nn.py
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Variables for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we use MSELoss with reduction='sum', i.e. the squared errors are summed
# rather than averaged.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Variable of input data to the Module and it produces
    # a Variable of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Variables containing the predicted and true
    # values of y, and the loss function returns a Variable containing the loss;
    # loss.item() extracts its single value as a Python number.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Variables with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Variable, so
    # we can access its data and gradients like we did before.
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

0 736.9913330078125
1 680.4479370117188
2 631.951171875
3 589.7120971679688
4 552.2689208984375
5 518.9169921875
6 488.7153625488281
7 461.306640625
8 436.1474609375
9 412.55633544921875
10 390.5765686035156
11 369.9192810058594
12 350.43499755859375
13 331.92425537109375
14 314.3824462890625
15 297.6042785644531
16 281.603271484375
17 266.30328369140625
18 251.73846435546875
19 237.9233856201172
20 224.7880401611328
21 212.2001495361328
22 200.21286010742188
23 188.78427124023438
24 177.89141845703125
25 167.4981231689453
26 157.6195068359375
27 148.26565551757812
28 139.3912811279297
29 130.97816467285156
30 123.00859832763672
31 115.47413635253906
32 108.3648452758789
33 101.63951873779297
34 95.3074722290039
35 89.35140228271484
36 83.7685546875
37 78.52030181884766
38 73.59060668945312
39 68.96087646484375
40 64.61405944824219
41 60.543800354003906
42 56.71101379394531
43 53.124332427978516
44 49.76331329345703
45 46.62273406982422
46 43.68167495727539
47 40.93010711669922
48 38.3

489 2.3175460228230804e-05
490 2.262961061205715e-05
491 2.2095293388701975e-05
492 2.1576008293777704e-05
493 2.1068051864858717e-05
494 2.0570219930959865e-05
495 2.0085230062250048e-05
496 1.9613427866715938e-05
497 1.9150698790326715e-05
498 1.870006599347107e-05
499 1.826129482651595e-05


### PyTorch: optim

In [3]:
# Code in file nn/two_layer_net_optim.py
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model and loss function. reduction='sum'
# makes the loss the sum (not the mean) of the squared errors.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which Variables it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss; loss.item() extracts the scalar as a Python number.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable weights
    # of the model)
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its parameters
    optimizer.step()

0 601.39453125
1 585.4522705078125
2 569.9218139648438
3 554.8595581054688
4 540.3343505859375
5 526.2741088867188
6 512.6185913085938
7 499.4132385253906
8 486.59991455078125
9 474.1710205078125
10 462.01800537109375
11 450.1717834472656
12 438.6464538574219
13 427.439697265625
14 416.5256652832031
15 405.9120178222656
16 395.6161804199219
17 385.5746765136719
18 375.80633544921875
19 366.25830078125
20 356.9421691894531
21 347.8526916503906
22 339.0453186035156
23 330.5022277832031
24 322.15484619140625
25 314.0132141113281
26 306.11004638671875
27 298.4252014160156
28 290.9478759765625
29 283.6487121582031
30 276.5545654296875
31 269.62890625
32 262.83551025390625
33 256.199462890625
34 249.703857421875
35 243.35377502441406
36 237.1551971435547
37 231.08499145507812
38 225.12611389160156
39 219.2904052734375
40 213.58815002441406
41 208.01852416992188
42 202.56797790527344
43 197.23422241210938
44 192.0198516845703
45 186.9265594482422
46 181.9625701904297
47 177.0923309326172
48 1

373 5.6038552429527044e-06
374 5.152961421117652e-06
375 4.739097676065285e-06
376 4.356992121756775e-06
377 4.0050481402431615e-06
378 3.6817166346736485e-06
379 3.3827545848907903e-06
380 3.1071087960299337e-06
381 2.8541476240206975e-06
382 2.6209274892607937e-06
383 2.405982286290964e-06
384 2.2083434032538207e-06
385 2.0271427274565212e-06
386 1.8598149154058774e-06
387 1.7065988231479423e-06
388 1.565117258905957e-06
389 1.4351705885928823e-06
390 1.3160064327166765e-06
391 1.205838771056733e-06
392 1.1054526112275198e-06
393 1.0134222065971699e-06
394 9.281276334149879e-07
395 8.50504932259355e-07
396 7.782205670991971e-07
397 7.127193271116994e-07
398 6.524351192638278e-07
399 5.975932140245277e-07
400 5.46685896551935e-07
401 5.001803060622478e-07
402 4.577069034894521e-07
403 4.1807027173490496e-07
404 3.8254967194006895e-07
405 3.496505200928368e-07
406 3.19476981758271e-07
407 2.920846782217268e-07
408 2.6688780963013414e-07
409 2.4361668238270795e-07
410 2.225488344720361e

### PyTorch: Custom nn Modules

In [4]:
# Code in file nn/two_layer_net_module.py
import torch
from torch.autograd import Variable

class TwoLayerNet(torch.nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, D_in, H, D_out):
        """
        Build the two nn.Linear layers and register them as attributes:
        linear1 maps D_in features to H hidden units, linear2 maps those H
        units to D_out outputs.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map a Variable of input data to a Variable of output data. The layers
        created in the constructor can be freely combined with ordinary
        operators on Variables; here the ReLU is a clamp at zero.
        """
        hidden = self.linear1(x)
        activated = hidden.clamp(min=0)
        return self.linear2(activated)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model. reduction='sum' makes the
# loss the sum (not the mean) of the squared errors.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss; loss.item() extracts the scalar as a Python number.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 644.1124267578125
1 596.0675048828125
2 554.3326416015625
3 517.3652954101562
4 484.4731750488281
5 455.00360107421875
6 428.4359436035156
7 404.01873779296875
8 381.70025634765625
9 360.7786865234375
10 341.1051940917969
11 322.6124572753906
12 305.22796630859375
13 288.8855285644531
14 273.36346435546875
15 258.60955810546875
16 244.59033203125
17 231.2813262939453
18 218.62899780273438
19 206.60968017578125
20 195.1645050048828
21 184.27914428710938
22 173.92820739746094
23 164.1330108642578
24 154.81678771972656
25 145.98458862304688
26 137.5868377685547
27 129.6487274169922
28 122.12763977050781
29 115.02843475341797
30 108.30043029785156
31 101.96150970458984
32 95.98503875732422
33 90.34422302246094
34 84.9623794555664
35 79.88780212402344
36 75.11293029785156
37 70.62095642089844
38 66.39459228515625
39 62.42181396484375
40 58.68661880493164
41 55.18118667602539
42 51.888916015625
43 48.801597595214844
44 45.904048919677734
45 43.18291091918945
46 40.629093170166016
47 38.230

400 4.223705400363542e-05
401 4.10153043048922e-05
402 3.983293572673574e-05
403 3.867741907015443e-05
404 3.7562356737907976e-05
405 3.647656922112219e-05
406 3.542009289958514e-05
407 3.440074942773208e-05
408 3.340716284583323e-05
409 3.244541585445404e-05
410 3.150980774080381e-05
411 3.059961454709992e-05
412 2.972016045532655e-05
413 2.886299080273602e-05
414 2.8035632567480206e-05
415 2.7231879357714206e-05
416 2.6448717107996345e-05
417 2.5688465029816143e-05
418 2.4952652893261984e-05
419 2.4236846002168022e-05
420 2.354402568016667e-05
421 2.286647941218689e-05
422 2.2212521798792295e-05
423 2.1574192942352965e-05
424 2.095919262501411e-05
425 2.035867510130629e-05
426 1.9774963220697828e-05
427 1.9210912796552293e-05
428 1.8663726223167032e-05
429 1.8128752344637178e-05
430 1.7614067473914474e-05
431 1.711052391328849e-05
432 1.6621886970824562e-05
433 1.614585744391661e-05
434 1.568943844176829e-05
435 1.5242490917444229e-05
436 1.4809852473263163e-05
437 1.4385183931153733

### PyTorch: Control Flow + Weight Sharing

In [5]:
# Code in file nn/dynamic_net.py
import random
import torch
from torch.autograd import Variable

class DynamicNet(torch.nn.Module):
    """Network whose depth is re-drawn at random on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Create the three nn.Linear layers used by forward: an input layer
        (D_in -> H), a middle layer (H -> H) that may be applied repeatedly,
        and an output layer (H -> D_out).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Run the input layer, then apply middle_linear (followed by a ReLU)
        0, 1, 2, or 3 times, the count being drawn at random on each call,
        and finally apply the output layer.

        Because every forward pass builds a fresh dynamic computation graph,
        ordinary Python control flow can be used to define the model.

        Applying the same Module several times within one graph is safe; the
        repeated applications share the same weights.
        """
        hidden = self.input_linear(x).clamp(min=0)
        remaining = random.randint(0, 3)
        while remaining > 0:
            hidden = self.middle_linear(hidden).clamp(min=0)
            remaining -= 1
        return self.output_linear(hidden)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# reduction='sum' makes the loss the sum (not the mean) of the squared errors.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss; loss.item() extracts the scalar as a Python number.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 569.1258544921875
1 615.6617431640625
2 570.0558471679688
3 566.2113037109375
4 562.9085083007812
5 400.80230712890625
6 555.7615966796875
7 559.6937866210938
8 289.1585388183594
9 257.3406677246094
10 221.5051727294922
11 183.21884155273438
12 144.83592224121094
13 561.776611328125
14 538.80029296875
15 68.90328216552734
16 57.6834602355957
17 527.6583862304688
18 514.6808471679688
19 554.5123291015625
20 551.87353515625
21 444.4581604003906
22 89.38655853271484
23 536.8999633789062
24 352.7065124511719
25 94.51013946533203
26 513.1881103515625
27 502.1343688964844
28 75.87789916992188
29 238.70372009277344
30 460.0959777832031
31 49.420413970947266
32 424.5330810546875
33 33.65983581542969
34 384.35137939453125
35 22.06290626525879
36 343.168212890625
37 324.551513671875
38 304.2316589355469
39 142.45045471191406
40 17.38932991027832
41 126.0511474609375
42 219.0490264892578
43 19.015888214111328
44 18.299646377563477
45 182.26956176757812
46 16.754304885864258
47 15.90393352508545

389 4.131418228149414
390 0.40626785159111023
391 0.40635475516319275
392 0.29232272505760193
393 1.3624107837677002
394 4.233582496643066
395 0.15681195259094238
396 2.0580785274505615
397 1.346246600151062
398 1.4714089632034302
399 8.18515682220459
400 0.6672055721282959
401 0.15527775883674622
402 2.635371208190918
403 3.7451436519622803
404 1.9652230739593506
405 1.3213837146759033
406 1.6230359077453613
407 7.2317214012146
408 1.5712506771087646
409 0.8179207444190979
410 0.8716679811477661
411 0.8063486814498901
412 5.9855875968933105
413 0.37211430072784424
414 0.791755199432373
415 0.28190985321998596
416 0.2828046679496765
417 3.821457862854004
418 21.31769371032715
419 1.1653937101364136
420 5.964888572692871
421 23.486896514892578
422 2.0967657566070557
423 0.5913346409797668
424 0.19357195496559143
425 22.065494537353516
426 2.9479353427886963
427 0.29225000739097595
428 0.21041886508464813
429 13.068719863891602
430 0.2226700484752655
431 4.828792095184326
432 0.355241954