In [1]:
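# Neural networks are typically trained with backpropagation: each parameter
# is adjusted according to the gradient of the loss with respect to it.
# torch.autograd is PyTorch's engine for computing those gradients automatically.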

In [2]:
import torch

# Toy one-layer network: 5 input features mapped to 3 output logits.
x = torch.ones(5)   # input tensor
y = torch.zeros(3)  # expected output

# Learnable parameters. requires_grad=True asks autograd to record the
# operations applied to them so gradients can be computed later.
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)

# Forward pass: affine map, then sigmoid + binary cross-entropy in one call.
z = x @ w + b
loss = torch.nn.functional.binary_cross_entropy_with_logits(input=z, target=y)

In [3]:
# Every tensor produced by a tracked operation keeps a reference to the
# backward function that created it in its .grad_fn attribute.
print(
    f"Gradient function for z = {z.grad_fn}",
    f"Gradient function for loss = {loss.grad_fn}",
    sep="\n",
)

Gradient function for z = <AddBackward0 object at 0x7f42b00674c0>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x7f42b0067460>


In [4]:
# Backpropagate from the scalar loss: autograd walks the recorded graph and
# stores d(loss)/d(param) in the .grad attribute of each parameter.
loss.backward()
for param in (w, b):  # weights first, then biases
    print(param.grad)
# .grad is only populated for leaf tensors of the computational graph that
# were created with requires_grad=True. By default backward() can be run once
# per graph; pass retain_graph=True if several backward calls are needed.

tensor([[0.1581, 0.0495, 0.0256],
        [0.1581, 0.0495, 0.0256],
        [0.1581, 0.0495, 0.0256],
        [0.1581, 0.0495, 0.0256],
        [0.1581, 0.0495, 0.0256]])
tensor([0.1581, 0.0495, 0.0256])


In [5]:
# Gradient tracking can be switched off when only the forward result is needed.
# Computed normally, the result is attached to the graph ...
z = x @ w + b
print(z.requires_grad)

# ... whereas anything computed inside torch.no_grad() is not tracked.
with torch.no_grad():
    z = x @ w + b
print(z.requires_grad)

True
False


In [6]:
# Alternative to no_grad: detach() returns a tensor that is cut off from the
# computational graph, so it no longer requires gradients.
# Typical uses: marking some parameters of a network as frozen, and speeding
# up code that only needs the forward pass.
z = x @ w + b
z_det = z.detach()
print(z_det.requires_grad)

False


In [None]:
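# autograd keeps a record of all executed operations in a directed acyclic
# graph (DAG) of Function objects: the leaves are the input tensors and the
# roots are the output tensors.
# These graphs are dynamic in PyTorch: after each .backward() call the graph
# is rebuilt from scratch, so its shape can change from one iteration to the next.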

In [7]:
# For a non-scalar output PyTorch does not build the full Jacobian J; calling
# backward(v) computes the vector-Jacobian product v^T * J instead.
inp = torch.eye(4, 5, requires_grad=True)
out = ((inp + 1) ** 2).t()
v = torch.ones_like(out)  # same shape as out; reused for every backward call

# retain_graph=True keeps the graph alive so backward can be called again.
out.backward(v, retain_graph=True)
print(f"First call\n{inp.grad}")

# Gradients are accumulated into .grad, so a second call doubles the values.
out.backward(v, retain_graph=True)
print(f"\nSecond call\n{inp.grad}")

# Zero the accumulated gradient in place to get a fresh result.
inp.grad.zero_()
out.backward(v, retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")
# Takeaway: zero the gradients before each new computation, otherwise the
# values from earlier backward calls are added in.

First call
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])

Second call
tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.]])

Call after zeroing gradients
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])
