# PyTorch gradient descent

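This notebook follows along with the excellent [PyTorch tutorial](https://pytorch.org/tutorials/beginner/pytorch_with_examples.html): it fits a network with one hidden layer to random data by gradient descent, first with hand-written backpropagation and then with autograd.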

## Explicit backpropagation

In [1]:
#import numpy as np
import torch

# Run on the CPU in single precision.
device = torch.device("cpu")
dtype = torch.float

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets (torch.randn stands in for np.random.randn).
x = torch.randn((N, D_in), dtype=dtype, device=device)
y = torch.randn((N, D_out), dtype=dtype, device=device)

# Weights start from a standard-normal draw. The gradients are derived by hand
# below, so requires_grad is not set on them.
w1 = torch.randn((D_in, H), dtype=dtype, device=device)
w2 = torch.randn((H, D_out), dtype=dtype, device=device)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    # `@` on 2-D tensors is matrix multiplication (np.dot in the numpy version).
    h = x @ w1
    h_relu = torch.clamp(h, min=0)
    y_pred = h_relu @ w2

    # Sum-of-squared-errors loss, reported every 30th step.
    loss = torch.sum((y_pred - y) ** 2)
    if t % 30 == 0:
        print(f"Epoch {t:3d} loss {loss:12.2f}")

    # Backward pass: chain rule applied by hand, from the loss back to each weight matrix.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t() @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.t()
    # No gradient flows through the ReLU where its input was negative;
    # masked_fill returns a new tensor, so grad_h_relu is left untouched.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = x.t() @ grad_h

    # Gradient-descent step, applied in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

Epoch   0 loss  28446274.00
Epoch  30 loss     76031.40
Epoch  60 loss      6711.67
Epoch  90 loss      1043.08
Epoch 120 loss       203.26
Epoch 150 loss        43.90
Epoch 180 loss        10.02
Epoch 210 loss         2.37
Epoch 240 loss         0.58
Epoch 270 loss         0.14
Epoch 300 loss         0.04
Epoch 330 loss         0.01
Epoch 360 loss         0.00
Epoch 390 loss         0.00
Epoch 420 loss         0.00
Epoch 450 loss         0.00
Epoch 480 loss         0.00


## Autograd

In [11]:
import torch

# Run on the CPU in single precision.
device = torch.device("cpu")
dtype = torch.float

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets (torch.randn stands in for np.random.randn).
x = torch.randn((N, D_in), dtype=dtype, device=device)
y = torch.randn((N, D_out), dtype=dtype, device=device)

# requires_grad=True makes autograd record the operations applied to the weights,
# so that loss.backward() can populate w1.grad and w2.grad.
w1 = torch.randn((D_in, H), dtype=dtype, device=device, requires_grad=True)
w2 = torch.randn((H, D_out), dtype=dtype, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear. The hidden activations are not
    # kept in named variables because no hand-written backward pass needs them.
    y_pred = torch.clamp(x @ w1, min=0) @ w2

    # Sum-of-squared-errors loss, reported every 30th step.
    loss = torch.sum((y_pred - y) ** 2)
    if t % 30 == 0:
        print(f"Epoch {t:3d} loss {loss:12.2f}")

    # Autograd computes d(loss)/d(w1) and d(loss)/d(w2) into the .grad attributes.
    loss.backward()

    # The update itself must not be recorded by autograd, hence no_grad.
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # backward() accumulates into .grad, so clear it before the next step.
        w1.grad.zero_()
        w2.grad.zero_()

Epoch   0 loss  35567960.00
Epoch  30 loss     66783.85
Epoch  60 loss      6043.99
Epoch  90 loss       884.66
Epoch 120 loss       157.14
Epoch 150 loss        31.17
Epoch 180 loss         6.67
Epoch 210 loss         1.52
Epoch 240 loss         0.36
Epoch 270 loss         0.09
Epoch 300 loss         0.02
Epoch 330 loss         0.01
Epoch 360 loss         0.00
Epoch 390 loss         0.00
Epoch 420 loss         0.00
Epoch 450 loss         0.00
Epoch 480 loss         0.00
