# derivative & gradient

### 自动求导
假设要对$y=2x^Tx$关于列向量$x$求导

In [1]:
import torch

# Input vector shared by the autograd examples below: the floats 0, 1, 2, 3.
x = torch.arange(0.0, 4.0)
x

tensor([0., 1., 2., 3.])

In [2]:
x.requires_grad_(True)  # in-place switch; same effect as x = torch.arange(4.0, requires_grad=True)
x.requires_grad  # instance attribute; True once gradient tracking is enabled for x

True

In [3]:
x.grad  # gradient of x; it is None by default, i.e. before any backward pass has run

In [4]:
# y = 2 * (x . x), a scalar function of the vector x.
y = 2 * x.dot(x)
y

tensor(28., grad_fn=<MulBackward0>)

In [5]:
# Run backpropagation to automatically compute the gradient of y with respect
# to every component of x; the result is stored in x.grad.
y.backward()
x.grad

tensor([ 0.,  4.,  8., 12.])

In [6]:
x.grad == 4 * x  # compare with the analytic gradient of y = 2 * (x . x), which is 4x

tensor([True, True, True, True])

In [7]:
# By default PyTorch accumulates gradients into .grad, so the previous values
# have to be cleared before computing a new gradient.
x.grad.zero_()
y = x.sum()  # a new function of x
y.backward()
x.grad
# Without the x.grad.zero_() call above (e.g. if it is commented out), the result
# would be tensor([ 1.,  5.,  9., 13.]): the earlier gradient 4x plus the new all-ones gradient.

tensor([1., 1., 1., 1.])

In [8]:
# (In machine learning, backward is rarely called on a non-scalar.)
# Calling backward on a non-scalar requires a `gradient` argument: the gradient of
# the function being differentiated with respect to the tensor backward is called
# on, which acts as a weight for each of its elements.
x.grad.zero_()
y = x * x  # note: the * operator is overloaded by torch (element-wise product)
# y is now a vector, not a scalar, so calling backward on it directly raises an error:
# y.backward()  # RuntimeError: grad can be implicitly created only for scalar outputs

y.sum().backward(retain_graph=True)  # gives the same x.grad as y.backward(torch.ones(len(x)))
# retain_graph=True means the graph is not freed after this backward pass, so an
# intermediate node such as y can be backpropagated through again below.
print(x.grad)
print(y)
x.grad.zero_()  # clear the accumulated gradient before the second backward pass
y.backward(gradient=torch.ones(len(x)))  # gives the same x.grad as y.sum().backward()
# `gradient` supplies the gradient flowing into y; passing [1, 1, 1, 1] is equivalent
# to taking one more forward step, sum(y), and backpropagating from that scalar.
print(x.grad)
print(y)

tensor([0., 2., 4., 6.])
tensor([0., 1., 4., 9.], grad_fn=<MulBackward0>)
tensor([0., 2., 4., 6.])
tensor([0., 1., 4., 9.], grad_fn=<MulBackward0>)


```
Tensor.backward(.) Args:
            gradient (Tensor, optional): The gradient of the function
                being differentiated w.r.t. ``self``.
                This argument can be omitted if ``self`` is a scalar.
            retain_graph (bool, optional): If ``False``, the graph used to compute
                the grads will be freed. Note that in nearly all cases setting
                this option to True is not needed and often can be worked around
                in a much more efficient way. Defaults to the value of
                ``create_graph``.
            create_graph (bool, optional): If ``True``, graph of the derivative will
                be constructed, allowing to compute higher order derivative
                products. Defaults to ``False``.
            inputs (sequence of Tensor, optional): Inputs w.r.t. which the gradient will be
                accumulated into ``.grad``. All other tensors will be ignored. If not
                provided, the gradient is accumulated into all the leaf Tensors that were
                used to compute the :attr:`tensors`.
```

In [9]:
# Move part of the computation outside of the recorded computational graph.
x.grad.zero_()
y = x * x
u = y.detach()  # new tensor with y's values, cut off from the graph (no gradient flows from u back to x)
# i.e. u = [0, 1, 4, 9] and is treated as unrelated to x
# NOTE: detach() shares storage with y, so in-place edits of u would also show up in y.
u.requires_grad_()  # NOTE(review): makes u a tracked leaf again (backward will also fill u.grad); not needed to treat u as a constant - confirm it is intentional
z = u * x
z.sum().backward()  # backpropagate from r = z.sum(), the root of the computational graph (tree)
x.grad == u
# Analysis: let r = z.sum() = sum(u ⊙ x)
# x is a vector (a column vector by default), u is another vector independent of x
# When differentiating with respect to x, u does not depend on x and acts as a constant vector
# ∴ x.grad = ∂r/∂x = ∂r/∂z * ∂z/∂x = 1^T * ∂(u⊙x)/∂x = u
# Note: x.grad now holds ∂r/∂x, not the earlier ∂(x*x)/∂x

tensor([True, True, True, True])

In [10]:
# y = x * x from the previous cell was recorded before the detach, so it can still
# be backpropagated through to x.
x.grad.zero_()
y.sum().backward()
x.grad == 2 * x  # analytic gradient of sum(x * x) is 2x

tensor([True, True, True, True])

In [11]:
# 即使是走python控制流, backward依然可行
def f(num):
    """Scale ``num`` through data-dependent Python control flow.

    The input is doubled once, then doubled repeatedly while its norm is
    below 1000. The result is returned as is when its sum is positive;
    otherwise 100 times the result is returned.

    Args:
        num: a tensor-like value supporting ``*``, ``.norm()`` and ``.sum()``.

    Returns:
        ``num`` multiplied by a constant factor that depends on the value of
        ``num`` (how many doublings were needed and which branch was taken).
    """
    scaled = num * 2
    # The number of iterations depends on the runtime value of the input.
    while scaled.norm() < 1000:
        scaled = scaled * 2
    return scaled if scaled.sum() > 0 else 100 * scaled


# No seed is set, so a (and therefore a.grad) differs from run to run.
a = torch.randn(size=(), requires_grad=True)  # size=() means that the tensor will be a scalar
d = f(a)
d.backward()
# f only multiplies its input by constants, so a.grad is the constant factor applied to a.
a.grad

tensor(204800.)

---
### 其他

In [12]:
# When is retain_graph=True needed for a second backward() call?

# Case 1: a3 is built from a1.sum() only. That graph keeps no saved intermediate
# tensors, so it can be backpropagated again without retain_graph=True.
a1 = torch.tensor([1.], requires_grad=True)
a2 = a1 * a1  # NOTE: a2 is not used by a3, so it is not part of the graph backpropagated below
a3 = a1.sum()
a3.backward()
print(a1.grad)
a1.grad.zero_()
a3.backward()  # works: nothing this graph needs was freed by the first backward
print(a1.grad)

# Case 2: v3 depends on v2 = v1 * v1. The multiplication saves its inputs for the
# backward pass, and those saved tensors are freed by the first backward() unless
# it is called as v3.backward(retain_graph=True).
v1 = torch.tensor([1, 2.], requires_grad=True)
v2 = v1 * v1
v3 = v2.sum()
v3.backward()
print(v1.grad)
v1.grad.zero_()
try:
    # Expected to fail: the saved tensors of v1 * v1 have already been freed.
    v3.backward()
except RuntimeError as err:
    # The error is the point of this demonstration, so show it instead of
    # aborting the notebook run.
    print(f"RuntimeError: {err}")
else:
    print(v1.grad)

# Both a3 and v3 are scalars, so being a scalar is not what makes the difference.
# The first case only works because a3 was built from a1.sum() instead of a2.sum().
# retain_graph=True keeps the saved intermediate values alive so that a graph
# containing operations like v1 * v1 can be backpropagated more than once.


tensor([1.])
tensor([1.])
tensor([2., 4.])


RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.