# **Build an Autograd System From Scratch with NumPy**

# 0. Set Up

Let's load the packages we need to run our tests:

In [3]:
import os
import random as rd
import numpy as np
import matplotlib.pyplot as plt

# IPython magic: render matplotlib figures inline in the notebook output
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)  # default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names. Use whichever spelling this
# installation provides, so the setup cell runs on both old and new versions.
# If neither is available the default style is kept.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
# Use a larger default font for every figure
font = {'size'   : 15}
plt.rc('font', **font)

# IPython magics so that the notebook reloads external python modules
# (for example after editing their source) without restarting the kernel;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [4]:
# Use one shared seed for every random number generator in this notebook
SEED = 42

# Seed Python's `random` module and NumPy's legacy global generator
for seed_generator in (rd.seed, np.random.seed):
    seed_generator(SEED)

# 1. Basic Operations

## 1.1. With PyTorch

In [5]:
import torch

# Two small integer matrices used as operands for every operation below
t1 = torch.tensor([[1, 3],
                   [5, 7]])
t2 = torch.tensor([[2, 4],
                   [6, 8]])

for name, operand in (("t1", t1), ("t2", t2)):
    print(f"{name} =\n{operand}")

print("\nSome basic operations")

# (label, result) pairs, listed in the order they are displayed
basic_operations = [
    ("t1 + t2", torch.add(t1, t2)),
    ("t1 - t2", torch.sub(t1, t2)),
    ("t1 * t2", torch.mul(t1, t2)),
    ("t1 @ t2", t1 @ t2),
    ("t1 ** 2", torch.pow(t1, 2)),
    ("t1 / t2", torch.div(t1, t2)),
    ("t1 / 10", t1 / 10),
]
for label, result in basic_operations:
    print(f"{label}: \n{result}")

t1 =
tensor([[1, 3],
        [5, 7]])
t2 =
tensor([[2, 4],
        [6, 8]])

Some basic operations
t1 + t2: 
tensor([[ 3,  7],
        [11, 15]])
t1 - t2: 
tensor([[-1, -1],
        [-1, -1]])
t1 * t2: 
tensor([[ 2, 12],
        [30, 56]])
t1 @ t2: 
tensor([[20, 28],
        [52, 76]])
t1 ** 2: 
tensor([[ 1,  9],
        [25, 49]])
t1 / t2: 
tensor([[0, 0],
        [0, 0]])
t1 / 10: 
tensor([[0, 0],
        [0, 0]])


In [6]:
import nets

# Same operands as in the PyTorch cell, this time as NETS tensors
t1 = nets.Tensor([[1, 3],
                  [5, 7]])
t2 = nets.Tensor([[2, 4],
                  [6, 8]])

for name, operand in (("t1", t1), ("t2", t2)):
    print(f"{name} =\n{operand}")

print("\nSome basic operations")

# (label, result) pairs, listed in the order they are displayed
basic_operations = [
    ("t1 + t2", nets.add(t1, t2)),
    ("t1 - t2", nets.sub(t1, t2)),
    ("t1 * t2", nets.multiply(t1, t2)),
    ("t1 @ t2", nets.dot(t1, t2)),
    ("t1 ** 2", nets.pow(t1, 2)),
    ("t1 / t2", nets.div(t1, t2)),
    ("t1 / 10", t1 / 10),
]
for label, result in basic_operations:
    print(f"{label}: \n{result}")

t1 =
tensor([[1 3]
        [5 7]])
t2 =
tensor([[2 4]
        [6 8]])

Some basic operations
t1 + t2: 
tensor([[ 3  7]
        [11 15]])
t1 - t2: 
tensor([[-1 -1]
        [-1 -1]])
t1 * t2: 
tensor([[ 2 12]
        [30 56]])
t1 @ t2: 
tensor([[20 28]
        [52 76]])
t1 ** 2: 
tensor([[ 1  9]
        [25 49]])
t1 / t2: 
tensor([[0.5    0.75  ]
        [0.8333 0.875 ]])
t1 / 10: 
tensor([[0.1 0.3]
        [0.5 0.7]])


# 2. Autograd


**NETS** uses a custom autograd system built with NumPy. Some vanilla architectures, such as ``DNN`` networks, do not depend on this functionality, however.
All of this information is detailed in the model's section.

As you may have seen, tensors are created with ``requires_grad`` set to ``False`` by default. This attribute works similarly to **PyTorch**'s attribute of the same name. If set to ``True``, previous gradients will be registered and saved in this tensor, in the ``_hooks`` attribute. This attribute is basically a list containing all previous gradients. That is, when calling the ``backward`` method on this tensor with an upstream gradient, it will propagate through all previous gradients.

## 2.1. With PyTorch

In [7]:
# Leaf tensors that track gradients
t1 = torch.tensor([1., 3], requires_grad=True)
t2 = torch.tensor([2., 4], requires_grad=True)

# Intermediate results produced by operations on the leaves
t3 = t1 + t2 + 4
t4 = t3 * t2

for name, leaf in (("t1", t1), ("t2", t2)):
    print(f"{name} =\n{leaf}")

print("\nOperation:")
for formula in ("t3 = t1 + t2 + 4", "t4 = t3 * t2"):
    print(formula)

print(f"\nt3 =\n{t3}")
print(f"t4 =\n{t4}")

# Every tensor whose gradient is reported, in display order
tracked_tensors = (("t1", t1), ("t2", t2), ("t3", t3), ("t4", t4))

print("\nBefore backpropagation")
for name, node in tracked_tensors:
    print(f"{name} gradient: {node.grad}")

# Upstream gradient fed into the last node of the graph
grad = torch.tensor([-1., 2.])

# Back-propagation
t4.backward(grad)

print("\nAfter backpropagation")
for name, node in tracked_tensors:
    print(f"{name} gradient: {node.grad}")

t1 =
tensor([1., 3.], requires_grad=True)
t2 =
tensor([2., 4.], requires_grad=True)

Operation:
t3 = t1 + t2 + 4
t4 = t3 * t2

t3 =
tensor([ 7., 11.], grad_fn=<AddBackward0>)
t4 =
tensor([14., 44.], grad_fn=<MulBackward0>)

Before backpropagation
t1 gradient: None
t2 gradient: None
t3 gradient: None
t4 gradient: None

After backpropagation
t1 gradient: tensor([-2.,  8.])
t2 gradient: tensor([-9., 30.])
t3 gradient: None
t4 gradient: None


## 2.2. With NETS

In [8]:
# Tensors created with gradient tracking enabled
t1 = nets.Tensor([1, 3], requires_grad=True)
t2 = nets.Tensor([2, 4], requires_grad=True)

# Intermediate results produced by operations on t1 and t2
t3 = t1 + t2 + 4
t4 = t3 * t2

for name, leaf in (("t1", t1), ("t2", t2)):
    print(f"{name} =\n{leaf}")

print("\nOperation:")
for formula in ("t3 = t1 + t2 + 4", "t4 = t3 * t2"):
    print(formula)

print(f"\nt3 =\n{t3}")
print(f"t4 =\n{t4}")

# Every tensor whose gradient is reported, in display order
tracked_tensors = (("t1", t1), ("t2", t2), ("t3", t3), ("t4", t4))

print("\nBefore backpropagation")
for name, node in tracked_tensors:
    print(f"{name} gradient: {node.grad}")

# Upstream gradient fed into the last node of the graph
grad = nets.Tensor([-1, 2])

# Back-propagation
t4.backward(grad)

print("\nAfter backpropagation")
for name, node in tracked_tensors:
    print(f"{name} gradient: {node.grad}")

t1 =
tensor([1 3], requires_grad=True)
t2 =
tensor([2 4], requires_grad=True)

Operation:
t3 = t1 + t2 + 4
t4 = t3 * t2

t3 =
tensor([ 7 11], requires_grad=True)
t4 =
tensor([14 44], requires_grad=True)

Before backpropagation
t1 gradient: tensor([0. 0.])
t2 gradient: tensor([0. 0.])
t3 gradient: tensor([0. 0.])
t4 gradient: tensor([0. 0.])

After backpropagation
t1 gradient: tensor([-2.  8.])
t2 gradient: tensor([-9. 30.])
t3 gradient: tensor([-2.  8.])
t4 gradient: tensor([-1.  2.])


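### Differences so far

The gradients of ``t1`` and ``t2`` are identical in both libraries. The outputs above differ in two ways:

- Before back-propagation, **PyTorch** reports ``None`` for every gradient, whereas **NETS** reports a tensor of zeros.
- After back-propagation, **PyTorch** still reports ``None`` for the intermediate tensors ``t3`` and ``t4``, whereas **NETS** also shows their gradients.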

# 3. Advanced operations and autograd

## 3.1. With PyTorch

In [9]:
t1 = torch.tensor([[1., 3.],
                   [5., 7.]], requires_grad=True)
t2 = torch.tensor([[2., 4.],
                   [6., 8.]], requires_grad=True)

# Some operations
t3 = torch.tanh(t1 * t2)
t4 = t3 ** 3 + t2
t5 = torch.exp(t4)
t6 = t5.T
t7 = torch.exp(torch.log(t6))

print(f"t1 =\n{t1}")
print(f"t2 =\n{t2}")

# One table of (description, tensor) pairs drives both the "before" and the
# "after" report, so each description is written once, next to the tensor it
# describes. The t7 entry reads exp(log(t6)), matching the code above.
gradient_report = [
    ("t1", t1),
    ("t2", t2),
    ("t3 = tanh(t1 * t2)", t3),
    ("t4 = t3 ** 3 + t2", t4),
    ("t5 = exp(t4)", t5),
    ("t6 = t5.T", t6),
    ("t7 = exp(log(t6))", t7),
]

print("\nBefore backpropagation")
for description, tensor in gradient_report:
    print(f"{description} gradient:\n{tensor.grad}")


# Upstream gradient
grad = torch.tensor([[-1., 2.],
                     [-3., 4.]])

# Back-propagation: t7 is summed along axis 1, so only the first row of
# `grad` is passed as the upstream gradient.
t7.sum(axis=1).backward(grad[0])

print("\nAfter backpropagation")
for description, tensor in gradient_report:
    print(f"{description} gradient:\n{tensor.grad}")

t1 =
tensor([[1., 3.],
        [5., 7.]], requires_grad=True)
t2 =
tensor([[2., 4.],
        [6., 8.]], requires_grad=True)

Before backpropagation
t1 gradient:
None
t2 gradient:
None
t3 = tanh(t1 * t2) gradient:
None
t4 = t3 ** 3 + t2 gradient:
None
t5 = exp(t4) gradient:
None
t6 = t5.T gradient:
None
t7 = log(t6) gradient:
None

After backpropagation
t1 gradient:
tensor([[-7.1306,  0.0000],
        [-0.0000,  0.0000]])
t2 gradient:
tensor([[  -21.6654,   296.8263],
        [-1096.6332, 16206.1680]])
t3 = tanh(t1 * t2) gradient:
None
t4 = t3 ** 3 + t2 gradient:
None
t5 = exp(t4) gradient:
None
t6 = t5.T gradient:
None
t7 = log(t6) gradient:
None


## 3.2. With NETS

In [10]:
t1 = nets.Tensor([[1., 3.],
                  [5., 7.]], requires_grad=True)
t2 = nets.Tensor([[2., 4.],
                  [6., 8.]], requires_grad=True)

# Some operations
t3 = nets.tanh(t1 * t2)
t4 = t3 ** 3 + t2
t5 = nets.exp(t4)
t6 = t5.T
t7 = nets.exp(nets.log(t6))

print(f"t1 =\n{t1}")
print(f"t2 =\n{t2}")

# One table of (description, tensor) pairs drives both the "before" and the
# "after" report, so each description is written once, next to the tensor it
# describes. The t7 entry reads exp(log(t6)), matching the code above.
gradient_report = [
    ("t1", t1),
    ("t2", t2),
    ("t3 = tanh(t1 * t2)", t3),
    ("t4 = t3 ** 3 + t2", t4),
    ("t5 = exp(t4)", t5),
    ("t6 = t5.T", t6),
    ("t7 = exp(log(t6))", t7),
]

print("\nBefore backpropagation")
for description, tensor in gradient_report:
    print(f"{description} gradient:\n{tensor.grad}")

# Upstream gradient
grad = nets.Tensor([[-1., 2.],
                    [-3., 4.]])

# Back-propagation: t7 is summed along axis 1, so only the first row of
# `grad` is passed as the upstream gradient.
t7.sum(axis=1).backward(grad[0])

print("\nAfter backpropagation")
for description, tensor in gradient_report:
    print(f"{description} gradient:\n{tensor.grad}")

t1 =
tensor([[1. 3.]
        [5. 7.]], requires_grad=True)
t2 =
tensor([[2. 4.]
        [6. 8.]], requires_grad=True)

Before backpropagation
t1 gradient:
tensor([[0. 0.]
        [0. 0.]])
t2 gradient:
tensor([[0. 0.]
        [0. 0.]])
t3 = tanh(t1 * t2) gradient:
tensor([[0. 0.]
        [0. 0.]])
t4 = t3 ** 3 + t2 gradient:
tensor([[0. 0.]
        [0. 0.]])
t5 = exp(t4) gradient:
tensor([[0. 0.]
        [0. 0.]])
t6 = t5.T gradient:
tensor([[0. 0.]
        [0. 0.]])
t7 = log(t6) gradient:
tensor([[0. 0.]
        [0. 0.]])

After backpropagation
t1 gradient:
tensor([[-7.1306e+00  5.3787e-07]
        [ 0.0000e+00  0.0000e+00]])
t2 gradient:
tensor([[  -21.6654   296.8263]
        [-1096.6332 16206.1679]])
t3 = tanh(t1 * t2) gradient:
tensor([[  -50.464    890.479 ]
        [-3289.8995 48618.5036]])
t4 = t3 ** 3 + t2 gradient:
tensor([[  -18.1001   296.8263]
        [-1096.6332 16206.1679]])
t5 = exp(t4) gradient:
tensor([[-1.  2.]
        [-1.  2.]])
t6 = t5.T gradient:
tensor([[-1. -1.

## So far so good!

# 4. Even more advanced

## 4.1. With PyTorch

In [11]:
# Re-seed so this cell draws the same three random arrays on every run
np.random.seed(42)
# Drawn in order: first input, second input, upstream gradient
data1, data2, grad = (np.random.rand(3, 3, 3) - 0.5 for _ in range(3))

t1, t2 = (torch.tensor(values, requires_grad=True) for values in (data1, data2))

# Some operations (t2 is created but does not take part in them)
t3 = torch.nn.functional.leaky_relu(t1, 0.1)

reported_tensors = (("t1", t1), ("t2", t2), ("t3", t3))

print("\nBefore backpropagation")
for name, tensor in reported_tensors:
    print(f"{name}:\n{tensor}")


# Upstream gradient, converted from the NumPy array drawn above
grad = torch.tensor(grad)

# Back-propagation
t3.backward(grad)

print("\nAfter backpropagation")
for name, tensor in reported_tensors:
    print(f"{name} gradient:\n{tensor.grad}")
print('grad\n', grad)


Before backpropagation
t1:
tensor([[[-0.1255,  0.4507,  0.2320],
         [ 0.0987, -0.3440, -0.3440],
         [-0.4419,  0.3662,  0.1011]],

        [[ 0.2081, -0.4794,  0.4699],
         [ 0.3324, -0.2877, -0.3182],
         [-0.3166, -0.1958,  0.0248]],

        [[-0.0681, -0.2088,  0.1119],
         [-0.3605, -0.2079, -0.1336],
         [-0.0439,  0.2852, -0.3003]]], dtype=torch.float64,
       requires_grad=True)
t2:
tensor([[[ 0.0142,  0.0924, -0.4535],
         [ 0.1075, -0.3295, -0.4349],
         [ 0.4489,  0.4656,  0.3084]],

        [[-0.1954, -0.4023,  0.1842],
         [-0.0598, -0.3780, -0.0048],
         [-0.4656,  0.4093, -0.2412]],

        [[ 0.1625, -0.1883,  0.0201],
         [ 0.0467, -0.3151,  0.4696],
         [ 0.2751,  0.4395,  0.3948]]], dtype=torch.float64,
       requires_grad=True)
t3:
tensor([[[-0.0125,  0.4507,  0.2320],
         [ 0.0987, -0.0344, -0.0344],
         [-0.0442,  0.3662,  0.1011]],

        [[ 0.2081, -0.0479,  0.4699],
         [ 0.3324,

## 4.2. With NETS

In [12]:
# Re-seed so this cell draws the same three random arrays on every run
np.random.seed(42)
# Drawn in order: first input, second input, upstream gradient
data1, data2, grad = (np.random.rand(3, 3, 3) - 0.5 for _ in range(3))

t1, t2 = (nets.Tensor(values, requires_grad=True) for values in (data1, data2))

# Some operations (t2 is created but does not take part in them)
t3 = nets.leaky_relu(t1, 0.1)

reported_tensors = (("t1", t1), ("t2", t2), ("t3", t3))

print("\nBefore backpropagation")
for name, tensor in reported_tensors:
    print(f"{name}:\n{tensor}")


# Upstream gradient, converted from the NumPy array drawn above
grad = nets.Tensor(grad)

# Back-propagation
t3.backward(grad)

print("\nAfter backpropagation")
for name, tensor in reported_tensors:
    print(f"{name} gradient:\n{tensor.grad}")


Before backpropagation
t1:
tensor([[[-0.1255  0.4507  0.232 ]
         [ 0.0987 -0.344  -0.344 ]
         [-0.4419  0.3662  0.1011]]

        [[ 0.2081 -0.4794  0.4699]
         [ 0.3324 -0.2877 -0.3182]
         [-0.3166 -0.1958  0.0248]]

        [[-0.0681 -0.2088  0.1119]
         [-0.3605 -0.2079 -0.1336]
         [-0.0439  0.2852 -0.3003]]], requires_grad=True)
t2:
tensor([[[ 0.0142  0.0924 -0.4535]
         [ 0.1075 -0.3295 -0.4349]
         [ 0.4489  0.4656  0.3084]]

        [[-0.1954 -0.4023  0.1842]
         [-0.0598 -0.378  -0.0048]
         [-0.4656  0.4093 -0.2412]]

        [[ 0.1625 -0.1883  0.0201]
         [ 0.0467 -0.3151  0.4696]
         [ 0.2751  0.4395  0.3948]]], requires_grad=True)
t3:
tensor([[[-0.0125  0.4507  0.232 ]
         [ 0.0987 -0.0344 -0.0344]
         [-0.0442  0.3662  0.1011]]

        [[ 0.2081 -0.0479  0.4699]
         [ 0.3324 -0.0288 -0.0318]
         [-0.0317 -0.0196  0.0248]]

        [[-0.0068 -0.0209  0.1119]
         [-0.0361 -0.0208 -0.01