# Autograd
Automatic gradient calculations for optimization of nn models

In [10]:
import torch

# Leaf tensor holding three standard-normal samples.
# requires_grad=True asks autograd to track every operation applied to it.
x = torch.randn((3,), requires_grad=True)
print(x)

tensor([-0.8570, -0.4600, -1.6681], requires_grad=True)


In [11]:
# Adding a constant records an "add" node in the computational graph:
# x and 2 are the inputs of that node and y is its output.
# y therefore carries a grad_fn (visible in the printed output); nothing is
# differentiated yet - the gradient is only computed once backward() is called.
y = x + 2
print(y)

# Reduce to a scalar: the mean of 2 * y^2 over all elements.
squared = y * y
z = (squared * 2).mean()
print(z)

tensor([1.1430, 1.5400, 0.3319], grad_fn=<AddBackward0>)
tensor(2.5254, grad_fn=<MeanBackward0>)


In [None]:
# Backpropagate from z; autograd stores dz/dx in x.grad.
# backward() called without arguments expects a scalar (z is one because of
# .mean()); calling it like this on a non-scalar tensor raises an error.
# None of this works if the leaf x was created with requires_grad=False.
z.backward()

# x itself is unchanged; only its .grad attribute has been filled in.
print(x.grad, x, sep="\n")

tensor([1.5239, 2.0533, 0.4426])
tensor([-0.8570, -0.4600, -1.6681], requires_grad=True)


## Stopping or not adding a tensor to the autograd graph

In [15]:
x = torch.randn(3).requires_grad_()
print(x)

# detach() hands back a tensor with the same values that is not tracked by autograd
y = x.detach()
print(y)

# Ways to keep a tensor out of the autograd graph:
#   x.requires_grad_(False)    - turn tracking off on x itself
#   x.detach()                 - get an untracked tensor (used above)
#   with torch.no_grad(): ...  - run operations without tracking

tensor([-0.4585,  0.8202,  0.8619], requires_grad=True)
tensor([-0.4585,  0.8202,  0.8619])


## Accessing gradients and avoiding gradient accumulation

In [16]:
# Dummy training loop: backward() runs every epoch and the gradients are never reset.

weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    # stand-in for a forward pass: a scalar computed from the weights
    model_output = torch.sum(weights * 3)
    model_output.backward()

    # each backward() adds to what is already stored in weights.grad
    print(weights.grad)

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])


As you can see, the gradients accumulate each time the loop runs,
i.e. every call to `backward()` adds the new gradients to the values already stored in `weights.grad`.

To prevent this, reset the gradients with `weights.grad.zero_()` after each `backward()` call, before the next one runs.

In [17]:
# Same loop as before, but the gradients are cleared with grad.zero_() every epoch.
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    model_output = torch.sum(weights * 3)
    model_output.backward()

    print(weights.grad)
    # reset in place so the next backward() starts from zero
    weights.grad.zero_()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


And this is how you would do it when working with optimizers (like SGD)

In [23]:
# Same dummy loop, now driven by an optimizer (SGD).
weights = torch.ones(4, requires_grad=True)

optimizer = torch.optim.SGD([weights], lr=0.01)

for epoch in range(3):
    model_output = (weights * 3).sum()  # forward pass
    model_output.backward()             # fills weights.grad; step() has nothing to apply without it

    print(weights.grad)

    optimizer.step()       # update weights in place using weights.grad
    optimizer.zero_grad()  # clear the gradients of every tensor passed to the optimizer

# weights.grad.zero_() clears the gradient of one tensor only.
# optimizer.zero_grad() clears the gradients of all tensors the optimizer manages,
# so use it whenever an optimizer is involved.