In [1]:
import torch

In [2]:
# Global tensor settings shared by the cells below:
# everything runs on the CPU in 32-bit floating point.
device = torch.device("cpu")
dtype = torch.float32  # same object as torch.float

### Directly using the gradient descent update

* define the network weights manually as raw tensors
* apply the weight-update rule manually (the gradients themselves are computed by autograd)

In [4]:
# Sizes: N samples, D_in input features, H hidden units, D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights of the two-layer network; requires_grad=True lets autograd
# accumulate their gradients in w1.grad / w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> clamp at zero (ReLU) -> linear.
    y_pred = torch.clamp(torch.mm(x, w1), min=0).mm(w2)

    # Compute the loss: sum of squared errors over all outputs.
    loss = torch.sum((y_pred - y) ** 2)
    if (t + 1) % 100 == 0:
        print(t + 1, loss.item())

    # Use autograd for the backward pass: this fills w1.grad and w2.grad
    # with the gradient of the loss with respect to each weight tensor.
    loss.backward()

    # The update itself must not be tracked by autograd.
    with torch.no_grad():
        # Plain gradient descent step, done in place on the weights.
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # backward() accumulates into .grad, so reset it for the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

100 550.1476440429688
200 4.426885604858398
300 0.0485905185341835
400 0.000861109234392643
500 9.715423948364332e-05


### Using `torch.optim` optimizers

* use predefined **layers** (`torch.nn`) to define the network
* use a predefined optimizer (`torch.optim`) to update the weights

In [13]:
# Sizes: N samples, D_in input features, H hidden units, D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Two-layer network built from predefined layers: Linear -> ReLU -> Linear.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
# reduction='sum' sums the squared errors instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# Alternative optimizer: torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # 1-based iteration number, used only for progress logging.
    step = t + 1

    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)

    # Report every 100th iteration, labelled 100, 200, ... so the log lines up
    # with the manual gradient-descent loop (previously printed t-1, i.e. 98, 198, ...).
    if step % 100 == 0:
        print(step, loss.item())

    # Clear gradients left over from the previous iteration; backward() accumulates.
    optimizer.zero_grad()

    # Backward pass: compute gradients of the loss w.r.t. the model parameters.
    loss.backward()

    # Let the optimizer update the parameters from the freshly computed gradients.
    optimizer.step()

98 51.56581115722656
198 0.8330827355384827
298 0.005636509507894516
398 2.222410694230348e-05
498 2.7616980702305227e-08
