In [1]:
import torch 
import torch.nn as nn 
from torch import autograd
from torch import optim
import torch.nn.functional as F

In [None]:
"""
This is my understanding of autograd.grad() and backward() It could be wrong somewhere.
"""

In [2]:
# Let's first create a network and it is: y = ( (w1*x)*w2 + b2 ) ^ 2 
# In this example: w1=3,  w2=2,  b2=6

In [3]:
class Net(nn.Module):
    """Toy network computing y = ((w1 * x) * w2 + b2) ** 2.

    Two 1->1 linear layers are chained: ``linear1`` (weight w1, no bias)
    followed by ``linear2`` (weight w2, bias b2); the result is squared.

    Args:
        w1: value written into ``linear1.weight`` (default 3).
        w2: value written into ``linear2.weight`` (default 2).
        b2: value written into ``linear2.bias`` (default 6).
    """

    def __init__(self, w1=3.0, w2=2.0, b2=6.0):
        super().__init__()

        self.linear1 = nn.Linear(1, 1, bias=False)
        self.linear2 = nn.Linear(1, 1, bias=True)

        # Overwrite the random initialisation in place. Doing it under
        # no_grad() instead of assigning to `.data` keeps the parameters as
        # the same leaf tensors without reaching around autograd.
        with torch.no_grad():
            self.linear1.weight.fill_(w1)
            self.linear2.weight.fill_(w2)
            self.linear2.bias.fill_(b2)

    def forward(self, x):
        """Run the network on ``x`` (a tensor whose last dimension is 1).

        Returns:
            A tuple ``(l, y)`` where ``l = linear1(x)`` is the intermediate
            activation and ``y = linear2(l) ** 2`` is the final output.
        """
        hidden = self.linear1(x)
        return hidden, self.linear2(hidden).pow(2)
        

In [4]:
# create a network and define an input (z) as 4 

In [5]:
# A 1x1 input holding x = 4, and a freshly constructed network to feed it to.
z = torch.tensor([[4.0]])
net = Net()

In [6]:
# run forward, here we have intermediate output l and final output y
# l is actually (w1*x) 

In [7]:
# Net.forward returns a pair: the intermediate linear1 output and the final output.
l, y = net(z)
# A bare tuple as the last expression makes the notebook display both tensors.
l, y

(tensor([[12.]], grad_fn=<MmBackward>),
 tensor([[900.]], grad_fn=<PowBackward0>))

In [8]:
# we can verify here that l = (w1*x) = 3*4 = 12
# and y = (l*w2+b2)^2 = (12*2+6)^2 = 900

In [9]:
# Now let's run a backward pass. Here we want to calculate the grad of y with respect to l.
# In other words, if you think of l as an input, then we want to know how much the final
# output y changes when l changes by 1 (basically that's the definition/understanding of a gradient).
# We can manually calculate it here: y'(l) = 2*(w2)^2*l + 2(b2)(w2).
# Plugging in the values we have: y'(l) = 2*(2^2)*12 + 2*6*2 = 120

In [10]:
# autograd.grad returns one gradient per entry in `inputs`; take the only one.
grad = autograd.grad(outputs=y, inputs=l)[0]
grad

tensor([[120.]])

In [11]:
# IMPORTANT
# The autograd.grad() call above looks equivalent to calling backward(). (Of course, since inputs is l,
# the backward pass only needs to happen down to l, which means w1 does not need to be reached. I am not sure
# whether w1's grad is calculated internally, but this is not important.) However, autograd.grad() is
# not the same as calling backward().

In [12]:
# When you call backward() on a tensor (usually the loss), PyTorch will calculate and POPULATE grad into
# all leaf tensors (that require grad) used for calculating this tensor. By leaf tensor, in PyTorch, we mean
# 'input' and 'weight'. In the above example x, w1, w2 and b2 are leaf tensors. l is not; it is just an
# intermediate result.
# After getting the grads, you can use an optimizer to do the update.

In [13]:
# Let's define a new net and forward 

In [14]:
# Start over: new input, new network, and a new forward pass for y.
z = torch.tensor([[4.0]])
net = Net()
(l, y) = net(z)

In [15]:
# call backward for y

In [16]:
# Backpropagate from y; the resulting gradients are stored in the .grad of the parameters.
y.backward()

In [17]:
# we can see grad exists for all leaf nodes 
# Of course, we did not set z.requires_grad=True, thus z does not have grad 

In [18]:
# One print call, one value per line: the input first, then the three parameters.
print(
    z.grad,
    net.linear1.weight.grad,
    net.linear2.weight.grad,
    net.linear2.bias.grad,
    sep="\n",
)

None
tensor([[480.]])
tensor([[720.]])
tensor([60.])


In [19]:
# Now let's go back to autograd.grad again. The difference with autograd.grad is that it will
# only return the grad with respect to the inputs given to autograd.grad(); this grad will not
# be populated into those tensors

In [20]:
# for example, let's do it again  

In [21]:
# Fresh network and forward pass, then ask only for dy/dw1.
z = torch.tensor([[4.0]])
net = Net()
l, y = net(z)
grad = autograd.grad(outputs=y, inputs=net.linear1.weight)[0]
print(grad)

tensor([[480.]])


In [22]:
# See, it is same as when you call backward(), however, this grad is not stored in net.linear1.weight tensor

In [23]:
# Inspect the .grad slot of linear1's weight after the autograd.grad call above.
print(net.linear1.weight.grad)

None


In [24]:
# Thus you cannot expect to call autograd.grad() and then use optimizer.step() to update your weights,
# since the grad is not populated into any tensors along the way when calculating the grad of w1.
# And the worse thing here is: like backward(), autograd.grad() also needs the computational graph.
# By default, once you call autograd.grad(), the graph will be freed and you cannot call autograd.grad() again
# or call backward()

In [25]:
# The graph behind y was already consumed by the earlier autograd.grad call
# (made without retain_graph=True), so this second call is expected to fail.
# Catch the error and show it, so the cell demonstrates the failure without
# stopping a "Restart & Run All".
try:
    (grad,) = autograd.grad(outputs=y, inputs=net.linear2.weight)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: Trying to backward through the graph a second time, but the saved intermediate results have already been freed. Specify retain_graph=True when calling .backward() or autograd.grad() the first time.

In [26]:
# The graph behind y has already been freed by the earlier autograd.grad call,
# so backward() is expected to fail as well. Catch and display the error so the
# remaining cells still run under "Restart & Run All".
try:
    y.backward()
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: Trying to backward through the graph a second time, but the saved intermediate results have already been freed. Specify retain_graph=True when calling .backward() or autograd.grad() the first time.

In [27]:
# Thus, if you want to use the same graph, you need to set retain_graph=True. 
# Let's do it again 

In [31]:
z = torch.tensor([[4.0]])
net = Net()
l, y = net(z)

# Keep the graph alive after each query so that y can be differentiated again.
grad = autograd.grad(outputs=y, inputs=net.linear1.weight, retain_graph=True)[0]
print(grad)
grad = autograd.grad(outputs=y, inputs=net.linear2.weight, retain_graph=True)[0]
print(grad)

# Final pass through the same graph, this time with backward() (no retain_graph).
y.backward()

tensor([[480.]])
tensor([[720.]])


In [32]:
# we can check grad here 

In [33]:
# Show the .grad of each parameter, one per line.
print(
    net.linear1.weight.grad,
    net.linear2.weight.grad,
    net.linear2.bias.grad,
    sep="\n",
)

tensor([[480.]])
tensor([[720.]])
tensor([60.])


In [34]:
# Note that these three grads were stored by the final y.backward() call; they were not stored by
# the two autograd.grad calls before it.

In [35]:
# So, what are the benefits of using autograd.grad() if it cannot populate grad into any leaf tensors?
# One benefit is that it can return the grad with respect to a non-leaf tensor. Just like in the very first
# example, we can calculate the grad with respect to l.
# Another benefit is that it can be used to calculate higher-order derivatives.
# Here comes create_graph=True

In [36]:
z = torch.tensor([[4.0]])
net = Net()
l, y = net(z)
# create_graph=True: the returned gradient carries a grad_fn and can be differentiated itself.
f = autograd.grad(outputs=y, inputs=l, create_graph=True)[0]
f

tensor([[120.]], grad_fn=<TBackward>)

In [37]:
# By setting create_graph=True, we create a graph for the grad. In other words,
# in this graph, the final output is no longer y, but the grad.
# Mathematically, y'(l) = 2*(w2)^2*l + 2(b2)(w2)
# Thus, this new graph is for the above equation. Let's define f = y'(l):
# f = 2*(w2)^2*l + 2(b2)(w2) = 2*(w2)^2*(w1)x + 2(b2)(w2)
# This could be useful in some cases where your loss depends on a grad.
# For example, in some regularization schemes, you want to minimize a grad.
# Let's say you want to minimize the grad with respect to l; in other words, you want to minimize f,
# so let's call f.backward()

In [38]:
# Backpropagate from the gradient f itself (possible because it was built with create_graph=True).
f.backward()

In [39]:
# let's check grad 

In [40]:
# Show the .grad of each parameter after f.backward(), one per line.
print(
    net.linear1.weight.grad,
    net.linear2.weight.grad,
    net.linear2.bias.grad,
    sep="\n",
)

tensor([[32.]])
tensor([[108.]])
tensor([4.])


In [41]:
# and verify them
# f'(w1) = 2*(w2)^2*x = 2*2^2*4 = 32
# f'(w2) = 4*(w2)*x*(w1) + 2*(b2) = 4*2*4*3 + 2*6 = 108
# f'(b2) = 2*(w2) = 2*2 = 4

In [42]:
# Now let's use autograd.grad to calculate the second derivative of y with respect to w1: y''(w1)

In [43]:
z = torch.tensor([[4.0]])
net = Net()
l, y = net(z)

# First derivative dy/dw1, kept differentiable via create_graph=True.
first_derivative = autograd.grad(
    outputs=y, inputs=net.linear1.weight, create_graph=True
)[0]
print(first_derivative)

# Differentiate the first derivative once more with respect to the same weight.
second_derivative = autograd.grad(
    outputs=first_derivative, inputs=net.linear1.weight
)[0]
print(second_derivative)

tensor([[480.]], grad_fn=<TBackward>)
tensor([[128.]])


In [44]:
# We can verify 
# like before the first derivative is y'(w1) = 2*x^2*(w1)*(w2)^2 + 2*x*w2*b2 = 480
# the second derivative is y''(w1) = 2*x^2*(w2)^2 = 128

In [45]:
# finally, grad is accumulated in Pytorch. For example if you call backward to both y and f
# then grad will be summed 

In [46]:
z = torch.tensor([[4.0]])
net = Net()
l, y = net(z)
f = autograd.grad(outputs=y, inputs=l, create_graph=True)[0]

# Two backward passes in a row: first from f (keeping the graph), then from y.
f.backward(retain_graph=True)
y.backward()

# Each parameter's .grad now holds the sum of both passes.
print(
    net.linear1.weight.grad,
    net.linear2.weight.grad,
    net.linear2.bias.grad,
    sep="\n",
)

tensor([[512.]])
tensor([[828.]])
tensor([64.])


In [None]:
# You can see that 512=32+480; 828=108+720; 64=4+60

In [None]:
# One thing that baffles me: my understanding is that once you set create_graph=True, there will be
# two graphs. When I call f.backward() without retain_graph=True, the second graph should be freed
# while the first one is still there, so you should be able to call y.backward(). However, this is not the case.
# It behaves as if there is only one graph: if you do not set retain_graph=True in f.backward(), then you
# cannot call y.backward(). Note that autograd.grad() also has a retain_graph argument, and
# by default it is set to the same value as create_graph. So, if you set retain_graph=False in autograd.grad(),
# then you cannot call f.backward() either.
# A likely explanation (worth confirming against the PyTorch autograd docs): these are not two independent
# graphs. The graph built for f reuses nodes and saved tensors of y's forward graph, so backpropagating
# from f also walks through part of y's graph and frees its buffers unless retain_graph=True is passed.
