In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting code is visible in this section of the notebook;
# confirm this magic is still needed.
%matplotlib inline


PyTorch: optim
--------------

A fully-connected ReLU network with one hidden layer, trained to predict y from x
by minimizing squared Euclidean distance.

This implementation uses the nn package from PyTorch to build the network.

Rather than manually updating the weights of the model as we have been doing,
we use the optim package to define an Optimizer that will update the weights
for us. The optim package defines many optimization algorithms that are commonly
used for deep learning, such as SGD with momentum, RMSprop, and Adam.



In [18]:
import torch

# Train a two-layer fully-connected ReLU network on random data, letting a
# torch.optim Optimizer (Adam) perform the weight updates.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
# Sum the squared errors over all elements instead of averaging them.
# `reduction='sum'` replaces the deprecated `size_average=False` argument,
# which triggers a deprecation warning on current PyTorch releases.
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss. `.item()` extracts the Python number held by
    # the scalar loss Tensor.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the Tensors it will update (which are the learnable
    # weights of the model). This is necessary because, by default, gradients
    # are accumulated in buffers (i.e., not overwritten) whenever .backward()
    # is called. Check the docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters.
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters.
    optimizer.step()

0 702.12353515625
1 684.8876953125
2 668.185546875
3 651.9042358398438
4 636.04638671875
5 620.641845703125
6 605.7340087890625
7 591.21923828125
8 577.1903686523438
9 563.5825805664062
10 550.3766479492188
11 537.556396484375
12 525.0966796875
13 513.0325927734375
14 501.3138732910156
15 489.8553161621094
16 478.7751770019531
17 467.98370361328125
18 457.4845886230469
19 447.22906494140625
20 437.181396484375
21 427.3769836425781
22 417.79296875
23 408.4178771972656
24 399.2539978027344
25 390.342041015625
26 381.6273193359375
27 373.093505859375
28 364.76409912109375
29 356.6661071777344
30 348.7985534667969
31 341.052734375
32 333.4393310546875
33 325.9784851074219
34 318.6656494140625
35 311.4751281738281
36 304.42791748046875
37 297.5335693359375
38 290.77911376953125
39 284.1456604003906
40 277.6423645019531
41 271.2696228027344
42 265.0374755859375
43 258.9576110839844
44 253.00759887695312
45 247.1679229736328
46 241.44418334960938
47 235.81399536132812
48 230.2808837890625
49 