<a href="https://colab.research.google.com/github/anubhavgupta1/D2L.AI/blob/main/Preliminaries/Automatic%20Differentiation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preliminaries

In [1]:
!pip install -U mxnet-cu101==1.7.0

Collecting mxnet-cu101==1.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/40/26/9655677b901537f367c3c473376e4106abc72e01a8fc25b1cb6ed9c37e8c/mxnet_cu101-1.7.0-py2.py3-none-manylinux2014_x86_64.whl (846.0MB)
[K     |███████████████████████████████▌| 834.1MB 1.7MB/s eta 0:00:08tcmalloc: large alloc 1147494400 bytes == 0x65216000 @  0x7f8d40591615 0x591e47 0x4cc179 0x4cc2db 0x50a1cc 0x50beb4 0x507be4 0x509900 0x50a2fd 0x50beb4 0x507be4 0x509900 0x50a2fd 0x50cc96 0x58e683 0x50c127 0x58e683 0x50c127 0x58e683 0x50c127 0x58e683 0x50c127 0x5095c8 0x50a2fd 0x50beb4 0x507be4 0x509900 0x50a2fd 0x50beb4 0x5095c8 0x50a2fd
[K     |████████████████████████████████| 846.0MB 20kB/s 
Collecting graphviz<0.9.0,>=0.8.1
  Downloading https://files.pythonhosted.org/packages/53/39/4ab213673844e0c004bed8a0781a0721a3f6bb23eb8854ee75c236428892/graphviz-0.8.4-py2.py3-none-any.whl
Installing collected packages: graphviz, mxnet-cu101
  Found existing installation: graphviz 0.10.1
    Uninstalling

## Automatic Differentiation

In [2]:
# NOTE: `np` below is MXNet's `mxnet.np` module, not the standalone NumPy
# package; every `np.*` call in the MXNet cells goes through MXNet.
from mxnet import autograd, np, npx
# NOTE(review): presumably switches MXNet to its NumPy-compatible array
# semantics — confirm against the MXNet documentation.
npx.set_np()
import torch
import tensorflow as tf

### A Simple Example

#### MXNet

In [3]:
# Input vector that the following MXNet cells differentiate with respect to.
x = np.arange(4.0)
# We allocate memory for a tensor's gradient by invoking `attach_grad`
x.attach_grad()
# After we calculate a gradient taken with respect to `x`, we will be able to
# access it via the `grad` attribute, whose values are initialized with 0s
print(x.grad)

[0. 0. 0. 0.]


In [4]:
# Build `y` inside the `autograd.record()` scope so that the later
# `y.backward()` call can differentiate it. Here y = 2 * <x, x>, a scalar.
with autograd.record():
    y = 2 * np.dot(x, x)
print(y)

28.0


In [5]:
# Backpropagate from the scalar `y`. Since y = 2 * <x, x>, the gradient
# written to `x.grad` is 4 * x.
y.backward()
print(x.grad)

[ 0.  4.  8. 12.]


#### PyTorch

In [6]:
# Create the input directly as a gradient-tracking tensor
# (same effect as calling `x.requires_grad_(True)` afterwards).
x = torch.arange(4.0, requires_grad=True)
# No backward pass has run yet, so there is no gradient to show: prints None.
print(x.grad)

None


In [7]:
# Scalar objective: twice the dot product of `x` with itself.
y = 2 * x.dot(x)
print(y)

tensor(28., grad_fn=<MulBackward0>)


In [8]:
# Backpropagate from the scalar `y`. Since y = 2 * <x, x>, `x.grad` is 4 * x.
y.backward()
print(x.grad)

tensor([ 0.,  4.,  8., 12.])


#### TensorFlow

In [9]:
# Wrap the float32 range in a `tf.Variable` in a single step; the
# `tf.GradientTape` cells below take gradients with respect to this variable.
x = tf.Variable(tf.range(4, dtype=tf.float32))

In [10]:
# Record all computations onto a tape
with tf.GradientTape() as t:
    # Scalar y = 2 * <x, x>: `axes=1` contracts the single axis of the vectors.
    y = 2 * tf.tensordot(x, x, axes=1)
print(y)

tf.Tensor(28.0, shape=(), dtype=float32)


In [11]:
# Ask the tape for dy/dx. For y = 2 * <x, x> this is 4 * x.
print(t.gradient(y, x))

tf.Tensor([ 0.  4.  8. 12.], shape=(4,), dtype=float32)


### Backward for Non-Scalar Variables

#### MXNet

In [12]:
# Start again from a fresh `x` with newly attached gradient storage, so the
# result of the earlier example does not carry over.
x = np.arange(4.0)
x.attach_grad()

In [13]:
# When we invoke `backward` on a vector-valued variable `y` (a function of `x`),
# a new scalar variable is created by summing the elements in `y`. Then the
# gradient of that scalar variable with respect to `x` is computed.
with autograd.record():
    y = x * x  # `y` is a vector: the element-wise square of `x`

In [14]:
y.backward()
print(x.grad)  # Same gradient as for the scalar `sum(x * x)`, i.e. 2 * x

[0. 2. 4. 6.]


#### PyTorch

In [15]:
# Fresh gradient-tracking input (same as calling `x.requires_grad_(True)`).
x = torch.arange(4.0, requires_grad=True)
x  # last expression, shown as the cell output

tensor([0., 1., 2., 3.], requires_grad=True)

In [16]:
# Invoking `backward` on a non-scalar requires passing in a `gradient` argument
# which specifies the gradient of the differentiated function w.r.t. `self`.
# In our case, we simply want to sum the partial derivatives, so passing
# in a gradient of ones is appropriate.
#
# PyTorch adds each backward pass into `x.grad` instead of overwriting it, so
# clear any gradient left from an earlier run of this cell. `x.grad` is still
# None before the first backward pass, hence the guard.
if x.grad is not None:
    x.grad.zero_()
y = x * x
# Equivalent to `y.backward(torch.ones(len(x)))`
y.sum().backward()
print(x.grad)

tensor([0., 2., 4., 6.])


#### TensorFlow

In [17]:
# Fresh float32 variable for the non-scalar example, built in one expression.
x = tf.Variable(tf.range(4, dtype=tf.float32))

In [18]:
with tf.GradientTape() as t:
    y = x * x  # element-wise square, so `y` is a vector
# The target is non-scalar; the printed gradient matches that of the scalar
# `tf.reduce_sum(x * x)`, i.e. 2 * x.
print(t.gradient(y, x))

tf.Tensor([0. 2. 4. 6.], shape=(4,), dtype=float32)


### Detaching Computation

#### MXNet

In [19]:
# Fresh input and gradient storage for the detach example.
x = np.arange(4.0)
x.attach_grad()

In [20]:
with autograd.record():
    y = x * x
    # `u` holds the same values as `y` but is treated as a constant during
    # backpropagation: no gradient flows through `u` back to `x`.
    u = y.detach()
    z = u * x

In [21]:
# With `u` treated as a constant, dz/dx for z = u * x is `u` itself
# (not 3 * x * x); the last print checks this element by element.
z.backward()
print(x.grad)
print(u)
print(x.grad == u)

[0. 1. 4. 9.]
[0. 1. 4. 9.]
[ True  True  True  True]


In [22]:
# `y` itself was recorded as a function of `x`, so it can still be
# differentiated on its own: dy/dx = 2 * x.
y.backward()
print(x.grad)

[0. 2. 4. 6.]


#### PyTorch

In [23]:
# Fresh gradient-tracking input for the detach example.
x = torch.arange(4.0, requires_grad=True)
x  # last expression, shown as the cell output

tensor([0., 1., 2., 3.], requires_grad=True)

In [24]:
y = x * x
# `u` holds the same values as `y` but is cut off from the computation that
# produced it, so backpropagating through `z` treats `u` as a constant.
u = y.detach()
z = u * x

In [25]:
# `z` is a vector, so reduce it to a scalar before calling `backward`.
# With `u` treated as a constant, the gradient of sum(u * x) w.r.t. `x` is `u`.
z.sum().backward()
print(x.grad)
print(u)
print(x.grad == u)

tensor([0., 1., 4., 9.])
tensor([0., 1., 4., 9.])
tensor([True, True, True, True])


In [26]:
# Clear the gradient left by the previous backward pass through `z`, so the
# value printed below reflects `y` alone: d(sum(x * x))/dx = 2 * x.
x.grad.zero_()
y.sum().backward()
print(x.grad)

tensor([0., 2., 4., 6.])


#### TensorFlow

In [27]:
# Fresh float32 variable for the stop-gradient example, built in one expression.
x = tf.Variable(tf.range(4, dtype=tf.float32))
print(x)

<tf.Variable 'Variable:0' shape=(4,) dtype=float32, numpy=array([0., 1., 2., 3.], dtype=float32)>


In [28]:
# Set `persistent=True` to run `t.gradient` more than once
with tf.GradientTape(persistent=True) as t:
    y = x * x
    # `u` carries the values of `y` but is treated as a constant when
    # gradients are taken, so nothing flows through it back to `x`.
    u = tf.stop_gradient(y)
    z = u * x

In [29]:
# dz/dx with `u` held constant: the result equals `u`, i.e. x * x.
print(t.gradient(z, x))

tf.Tensor([0. 1. 4. 9.], shape=(4,), dtype=float32)


In [30]:
# Second query on the same persistent tape: dy/dx = 2 * x.
print(t.gradient(y, x))

tf.Tensor([0. 2. 4. 6.], shape=(4,), dtype=float32)
