# Intro to torch.autograd
torch.autograd is PyTorch’s automatic differentiation engine that powers neural network training.

Usage in PyTorch

In [1]:
import torch
from torchvision.models import resnet18, ResNet18_Weights
# Load a ResNet-18 initialised with the default pretrained weights.
model = resnet18(weights=ResNet18_Weights.DEFAULT)
# One random "image": a batch of 1 with 3 channels, 64 pixels high and 64 wide.
data = torch.rand((1, 3, 64, 64))
# A random target of shape (1, 1000) to compare against the model output.
labels = torch.rand((1, 1000))

  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/aryamantepal/anaconda3/envs/ML-env/lib/python3.11/site-packages/torchvision/image.so
  warn(


In [2]:
# Forward pass: run the input batch through every layer of the model to
# obtain its prediction.
prediction = model(data)

In [3]:
# Collapse the element-wise difference between prediction and labels into one scalar.
loss = torch.sum(prediction - labels)
# Backward pass: calling .backward() on the error tensor kicks off backpropagation.
# Autograd then computes the gradient for each model parameter and stores it in
# that parameter's .grad attribute.
loss.backward()

In [4]:
# SGD over all model parameters with a learning rate of 0.01 and momentum of 0.9.
# NOTE: a later cell runs `from torch import nn, optim`, which rebinds the name
# `optim` to the torch.optim module; re-run this cell before calling
# optim.step() again once that import has executed.
optim = torch.optim.SGD(params=model.parameters(), lr=0.01, momentum=0.9)

In [5]:
# step() performs one gradient-descent update: the optimizer adjusts each
# parameter using the gradient stored in that parameter's .grad attribute.
optim.step()

Differentiation in Autograd

In [8]:
# requires_grad=True signals to autograd that every operation performed on
# `a` and `b` should be tracked.
a = torch.tensor([2.0, 3.0], requires_grad=True)
b = torch.tensor([6.0, 4.0], requires_grad=True)

In [10]:
# Think of `a` and `b` as the parameters of a neural network and Q as its loss:
# Q = 3a^3 - b^2, evaluated element-wise.
Q = 3 * a**3 - b**2

In [11]:
# Calling .backward() on Q (the loss) stores the gradient in each parameter's
# .grad attribute. Q is a vector here, so an upstream gradient of the same
# shape is passed in explicitly; a vector of ones is used.
external_grad = torch.tensor([1.0, 1.0])
Q.backward(external_grad)

# Expected results: the gradient of a is 9a^2 and the gradient of b is -2b.

In [13]:
# Compare the gradients collected by autograd with the analytic results
# dQ/da = 9a^2 and dQ/db = -2b. torch.isclose is used instead of == so the
# element-wise check tolerates floating-point rounding, and detach() keeps the
# check itself out of the autograd graph.
print(torch.isclose(9 * a.detach() ** 2, a.grad))
print(torch.isclose(-2 * b.detach(), b.grad))

tensor([True, True])
tensor([True, True])


In [16]:
# torch.autograd is an engine for computing vector-Jacobian products:
# given an upstream gradient vector v, backward() computes J.T @ v

Computational Graph

In [17]:
# Two untracked tensors and one that autograd is asked to track.
x = torch.rand((5, 5))
y = torch.rand((5, 5))
z = torch.rand(5, 5, requires_grad=True)

# Neither input of this sum requires a gradient.
a = torch.add(x, y)
print(f"Does `a` require gradients?: {a.requires_grad}")

# One input (z) of this sum requires a gradient.
b = torch.add(x, z)
print(f"Does `b` require gradients?: {b.requires_grad}")

# The output of an operation requires a gradient as soon as at least one of its
# input tensors does.

Does `a` require gradients?: False
Does `b` require gradients?: True


In [18]:
# When fine-tuning, most of the model is frozen so that only the classifier
# layers are modified to make predictions on new labels.
# NOTE: this import rebinds `optim` (earlier assigned an SGD instance) to the
# torch.optim module.
from torch import nn, optim

# Load the pretrained network and freeze all of its parameters.
# Module.requires_grad_(False) sets requires_grad=False on every parameter in
# place and returns the module, so no loop variable is left behind in the
# notebook namespace.
model = resnet18(weights=ResNet18_Weights.DEFAULT).requires_grad_(False)

In [19]:
# Replace the last linear layer with a new one that has 10 outputs. A freshly
# constructed nn.Linear is unfrozen by default (its parameters require
# gradients). The input width is read from the layer being replaced instead of
# hard-coding 512, so the cell stays correct if the backbone changes.
model.fc = nn.Linear(model.fc.in_features, 10)

In [21]:
# Only the classifier is optimized: every parameter is handed to SGD, but all
# of them except those of model.fc are frozen.
optimizer = optim.SGD(params=model.parameters(), lr=0.01, momentum=0.9)