In [2]:
from mxnet import nd, autograd

In [3]:
# 1. A Simple Example
#
# Differentiate y = 2 * dot(x, x) with respect to the vector x.
x = nd.arange(4)

# attach_grad() tells MXNet that a gradient will be stored for this NDArray.
x.attach_grad()

# Computations are only recorded for differentiation inside autograd.record().
with autograd.record():
    y = 2 * nd.dot(x, x)
    print("y =", y)

# Back-propagate from y; the result (4 * x here) is written to x.grad.
y.backward()
print(x.grad)

y = 
[28.]
<NDArray 1 @cpu(0)>

[ 0.  4.  8. 12.]
<NDArray 4 @cpu(0)>


In [5]:
# 2. Backward for Non-scalar Variable
#
# When y is not a scalar, its derivative with respect to x is a higher-order
# tensor that is complex to compute. Loss functions are usually scalars, so by
# default MXNet sums the elements of y and back-propagates from that sum.

# Case A: y is a vector; backward() behaves as if y had been summed first.
with autograd.record():
    y = x * x
y.backward()
print("dy/dx = ", x.grad)

# Case B: v is the explicit scalar sum, computed on an independent copy of x.
# The printed gradient matches case A.
u = x.copy()
u.attach_grad()
with autograd.record():
    v = (u * u).sum()
v.backward()
print("dv/du = ", u.grad)

dy/dx =  
[0. 2. 4. 6.]
<NDArray 4 @cpu(0)>
dv/du =  
[0. 2. 4. 6.]
<NDArray 4 @cpu(0)>


In [8]:
# 3. Detach Computation
#
# detach() moves part of a computation out of the recorded graph.
# Below, u has the same values as y but no record of how y was computed,
# so back-propagation treats u as a constant.
with autograd.record():
    y = x * x
    u = y.detach()
    z = u * x
z.backward()
# With u constant, the gradient of z = u * x is just u (i.e. x * x).
print("Detach dz/dx = ", x.grad)

# The same computation without detach(): the gradient now also flows
# through y, giving 3 * x^2.
with autograd.record():
    y = x * x
    z = y * x
z.backward()
print("dz/dx = ", x.grad)

Detach dz/dx =  
[0. 1. 4. 9.]
<NDArray 4 @cpu(0)>
dz/dx =  
[ 0.  3. 12. 27.]
<NDArray 4 @cpu(0)>


In [21]:
# 4. Attach Gradients to Internal Variables
x = nd.arange(4)
x.attach_grad()

y = nd.ones(4) * 2
y.attach_grad()

with autograd.record():
    u = x * y
    # attach_grad() on an intermediate result implicitly runs detach():
    # u forgets how it was computed from x and y and is treated as a constant.
    # Comment this line out to compare the results.
    u.attach_grad()
    z = u + x
z.backward()

# With u detached, only the direct "+ x" term reaches x and nothing reaches y.
print("dz/dx", x.grad)
print("dz/dy", y.grad)
print("dz/du", u.grad)

dz/dx 
[1. 1. 1. 1.]
<NDArray 4 @cpu(0)>
dz/dy 
[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>
dz/du 
[1. 1. 1. 1.]
<NDArray 4 @cpu(0)>


In [26]:
# 5. Head Gradients
#
# Uses x as defined (with a gradient attached) in section 4.
y = nd.ones(4) * 2
y.attach_grad()

with autograd.record():
    u = x * y
    v = u.detach()  # v is cut off from the graph; u itself still keeps its history
    v.attach_grad()
    z = v + x
z.backward()

# z sees v as a constant, so at this point nothing has flowed back to y.
print("dz/dv", v.grad)
print("dz/dx", x.grad)
print("dz/dy", y.grad)

# Continue back-propagation through u, passing v.grad (dz/dv) as the head
# gradient. backward() multiplies the head gradient into du/dx and du/dy,
# which applies the chain rule: x.grad and y.grad then hold the part of dz
# that flows through u instead of plain du/dx and du/dy.
# Note: as the printed output shows, this call overwrites x.grad rather than
# adding to the value left by z.backward().
u.backward(v.grad)
print("dz/dv", v.grad)
print("dz/dx", x.grad)
print("dz/dy", y.grad)


dz/dv 
[1. 1. 1. 1.]
<NDArray 4 @cpu(0)>
dz/dx 
[1. 1. 1. 1.]
<NDArray 4 @cpu(0)>
dz/dy 
[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>
dz/dv 
[1. 1. 1. 1.]
<NDArray 4 @cpu(0)>
dz/dx 
[2. 2. 2. 2.]
<NDArray 4 @cpu(0)>
dz/dy 
[0. 1. 2. 3.]
<NDArray 4 @cpu(0)>
