In [1]:
# Autograd stands for automatic gradients (automatic differentiation): a way to calculate the gradient
# of a single output variable with respect to all the variables leading to its computation, using the computational graph

# When training neural networks, the most frequently used algorithm is backpropagation.
# In backpropagation, the parameters are adjusted according to the gradient of the loss function with respect to each given parameter

# PyTorch has a built-in differentiation engine called torch.autograd; it supports automatic computation of the gradient for any computational graph


In [2]:
import torch

In [7]:
# Toy single-layer network: z = x @ W + b, scored against the target y
x = torch.ones(5)   # input vector with 5 features
y = torch.zeros(3)  # expected output, one value per output unit

# Randomly initialised parameters (a weight matrix and a bias vector).
# requires_grad=True asks autograd to track them so that a later
# loss.backward() can populate their .grad attributes.
W = torch.rand(5, 3, requires_grad=True)
b = torch.rand(3, requires_grad=True)

# Forward pass: logits first, then a scalar loss
z = x @ W + b
loss = torch.nn.functional.binary_cross_entropy_with_logits(input=z, target=y)

In [6]:
# Display the scalar loss.
# NOTE(review): the recorded output shows no grad_fn and this cell's execution
# count is lower than that of the cell defining loss, so the output looks
# stale -- re-run after the defining cell to refresh it.
loss

tensor(3.0387)

In [8]:
# grad_fn references the backward function of the operation that produced
# the tensor; for loss that is the binary-cross-entropy-with-logits op
loss.grad_fn

<BinaryCrossEntropyWithLogitsBackward0 at 0x13722aad0>

In [9]:
# z was produced by the final addition in matmul(x, W) + b, so its grad_fn
# is the backward function of an add
z.grad_fn

<AddBackward0 at 0x1367eebf0>

In [11]:
# W is a leaf tensor created directly with torch.rand rather than by an
# operation, so its grad_fn is None and the cell displays nothing
W.grad_fn

In [12]:
# Backpropagate from the scalar loss; this fills in .grad on the leaf
# tensors that were created with requires_grad=True (W and b)
loss.backward()

In [18]:
# Gradient of the loss with respect to the bias b, populated by loss.backward()
b.grad

tensor([0.3162, 0.3176, 0.3052])

In [19]:
# PyTorch only exposes gradients (via .grad) for leaf tensors which have the requires_grad
# property set to True. For all other nodes in the computational graph the
# gradient will not be available afterwards, even though it still has to
# be computed internally in order for the chain rule to work


In [21]:
# By default you can only run the backward function through a
# computational graph once: the saved intermediate values of the graph are
# freed when backward() is called. Pass retain_graph=True to the first
# backward() call if a second pass is needed.
# The failure is the point of this cell, so it is caught and printed rather
# than left to abort a Restart & Run All.
try:
    loss.backward()
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [25]:
# Disabling gradient tracking
# Any tensor computed from inputs that have requires_grad=True is tracked
# through the computational graph by default. In some cases, e.g. during
# inference, the gradients are not needed and tracking can be switched off.
z = x @ W + b
# z is not a scalar, so a bare z.backward() will not work: gradients can
# only be created implicitly for scalar outputs
print(z.requires_grad)

# The same computation inside torch.no_grad() yields an untracked result
with torch.no_grad():
    z = x @ W + b
print(z.requires_grad)

True
False


In [26]:
# We would want to turn off gradient tracking if:
# 1) We want to freeze some parameters and do not want them to change
# 2) We are only interested in performing a forward pass, in which case
# it would be more efficient to perform the operation without gradient tracking

In [27]:
# Autograd keeps track of all tensors, operations and their results in a directed acyclic graph (DAG).
# In this DAG, the leaves are the input tensors and the roots are the output tensors.
# By tracing the graph from roots to leaves you can automatically compute the gradients using the chain rule


# In a forward pass, autograd:
# 1) runs the requested operation to compute the resulting tensor
# 2) maintains the operation's gradient function in the DAG

# In the backward pass:
# 1) computes the gradient from each .grad_fn
# 2) accumulates the gradients in each respective tensor in the .grad attribute
# 3) using chain rule, propagates all the way to the leaf tensors

In [30]:
# When there are multiple outputs, i.e. the output is a vector (or matrix)
# and not a scalar, a Jacobian product is used in place of a plain gradient:
# backward() must be given a tensor with the same shape as the output.

inp = torch.eye(4, 5, requires_grad=True)
out = ((inp + 1) ** 2).t()

# Gradients accumulate in inp.grad across backward() calls, so the second
# call reports twice the values of the first. retain_graph=True keeps the
# graph alive so that backward() can be called again.
out.backward(gradient=torch.ones_like(out), retain_graph=True)
print(f"First call : {inp.grad}")
out.backward(gradient=torch.ones_like(out), retain_graph=True)
print(f"Second call : {inp.grad}")

# Reset the accumulated gradient in place, then measure once more
inp.grad.zero_()
out.backward(gradient=torch.ones_like(out), retain_graph=True)
print(f"Call after zeroing grad: {inp.grad}")

First call : tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])
Second call : tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.]])
Call after zeroing grad: tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])
