In [2]:
import torch
from torch.autograd import Variable

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs. Tensors do not track
# gradients unless asked to, so no gradients are computed with respect to
# x and y during the backward pass. (Since PyTorch 0.4 Tensors carry autograd
# state themselves, so the deprecated Variable wrapper is not needed.)
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Create random Tensors for weights. requires_grad_() marks them as leaves
# that we want gradients for during the backward pass. It is applied after
# the .type() conversion so that the converted Tensor itself is the leaf.
w1 = torch.randn(D_in, H).type(dtype).requires_grad_()
w2 = torch.randn(H, D_out).type(dtype).requires_grad_()

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using Tensor operations. We do not
    # need to keep references to intermediate values since autograd records
    # the graph and we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print the loss. loss is a 0-dimensional Tensor (the sum of
    # squared errors); detach() gives a view without autograd history, so
    # printing shows just the value as tensor(...).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.detach())

    # Use autograd to compute the backward pass. This call computes the
    # gradient of loss with respect to every Tensor with requires_grad=True.
    # After this call w1.grad and w2.grad hold the gradient of the loss with
    # respect to w1 and w2 respectively.
    loss.backward()

    # Update weights using gradient descent. The update must not be recorded
    # by autograd, so it runs under torch.no_grad() instead of reaching
    # through .data, which bypasses autograd's in-place safety checks.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights; backward()
        # accumulates into .grad rather than overwriting it.
        w1.grad.zero_()
        w2.grad.zero_()

0 tensor(29716444.)
1 tensor(25066280.)
2 tensor(26427694.)
3 tensor(29611068.)
4 tensor(30507492.)
5 tensor(26582772.)
6 tensor(18590738.)
7 tensor(10809925.)
8 tensor(5605145.)
9 tensor(2908919.7500)
10 tensor(1641962.8750)
11 tensor(1051719.1250)
12 tensor(755695.8125)
13 tensor(588574.7500)
14 tensor(481199.9375)
15 tensor(404418.7812)
16 tensor(345402.3438)
17 tensor(297955.9062)
18 tensor(258875.4062)
19 tensor(226118.9688)
20 tensor(198374.8125)
21 tensor(174711.0781)
22 tensor(154409.9219)
23 tensor(136896.9219)
24 tensor(121733.5312)
25 tensor(108539.1484)
26 tensor(97021.8359)
27 tensor(86923.6484)
28 tensor(78043.0234)
29 tensor(70216.1094)
30 tensor(63289.6875)
31 tensor(57148.8438)
32 tensor(51693.0352)
33 tensor(46836.0156)
34 tensor(42497.3828)
35 tensor(38612.6719)
36 tensor(35126.6250)
37 tensor(31993.4141)
38 tensor(29174.4375)
39 tensor(26629.9648)
40 tensor(24333.9805)
41 tensor(22256.4961)
42 tensor(20374.5918)
43 tensor(18668.6309)
44 tensor(17120.0469)
45 tensor(