### By Hans Martin Aannestad

In [8]:
import torch
from torch import nn
from tests_backpropagation import main_test

# Seed PyTorch's global random number generator so that weight initialisation
# (and anything else drawing from it) is repeatable from run to run.
torch.manual_seed(42)
# Make float64 the default floating-point dtype for tensors created from here on.
torch.set_default_dtype(torch.double)

## Class ``MyNet``

Read carefully how ``MyNet`` is implemented in the cell below. In particular:  
- ``n_l`` is a list of integers giving the number of units in each layer, from the input layer to the output layer.   
-  ``MyNet([2, 3, 2]) = MiniNet()`` where ``MiniNet`` is the neural network defined in the fourth tutorial, in which notations are also clarified.     
- ``model.L`` is the number of layers ``L``, not counting the input layer (hidden layers plus the output layer)   
- ``model.f[l]`` is the activation function of layer ``l``, $f^{[l]}$ (here ``torch.tanh``)   
- ``model.df[l]`` is the derivative of the activation function, $f'^{[l]}$   
- ``model.a[l]``  is the tensor $A^{[l]}$, (shape: ``(1, n(l))``)   
- ``model.z[l]``  is the tensor $Z^{[l]}$, (shape: ``(1, n(l))``)  
- Weights $W^{[l]}$ (shape: ``(n(l), n(l-1))``) and biases $\mathbf{b}^{[l]}$ (shape: ``(n(l))``) can be accessed as follows:
```
weights = model.fc[str(l)].weight.data
bias = model.fc[str(l)].bias.data
```

In [9]:
class MyNet(nn.Module):
    """Fully connected tanh network that caches its activations for manual backprop.

    Layers are numbered from 0 (the input) to ``L`` (the output). Layer ``l``
    (``1 <= l <= L``) is ``self.fc[str(l)]``, an ``nn.Linear`` mapping
    ``n_l[l-1]`` features to ``n_l[l]`` features, followed by ``tanh``.

    Attributes:
        n_l: list with the number of units in each layer (a private copy of
            the constructor argument).
        L: number of layers, not counting the input layer (``len(n_l) - 1``).
        z: dict ``l -> Z[l]``, pre-activation values, filled by ``forward``.
        a: dict ``l -> A[l]``, post-activation values (``a[0]`` is the
            flattened input), filled by ``forward``.
        dL_dw, dL_db: dicts ``l -> gradient`` reserved for a hand-written
            backpropagation; ``None`` until something fills them.
        f, df: dicts ``l -> callable`` holding each layer's activation
            function and its derivative.
        fc: ``nn.ModuleDict`` of the linear layers, keyed by ``str(l)``.
    """

    def __init__(self, n_l=(2, 3, 2)):
        """Build the network.

        Args:
            n_l: sequence of layer sizes, input layer first and output layer
                last. Defaults to ``2 -> 3 -> 2``.

        Raises:
            ValueError: if ``n_l`` is empty.
        """
        super().__init__()

        # Keep a private copy: storing the argument itself would alias the
        # caller's list (or a shared default), so a later mutation of
        # ``model.n_l`` would leak into other instances.
        self.n_l = list(n_l)
        if not self.n_l:
            raise ValueError("n_l must contain at least the input layer size")

        # Number of layers, input layer excluded.
        self.L = len(self.n_l) - 1
        layer_ids = range(1, self.L + 1)

        # Cached neuron values:
        # - z: before the activation function
        # - a: after the activation function (a = f(z)); a[0] is the input
        self.z = {i: None for i in layer_ids}
        self.a = {i: None for i in range(self.L + 1)}

        # Storage for the gradients computed by a custom backpropagation.
        self.dL_dw = {i: None for i in layer_ids}
        self.dL_db = {i: None for i in layer_ids}

        # Activation function of every layer.
        self.f = {i: torch.tanh for i in layer_ids}

        # Derivative of the activation: d/dx tanh(x) = 1 / cosh(x)^2.
        self.df = {i: (lambda x: 1 / (torch.cosh(x) ** 2)) for i in layer_ids}

        # Fully connected layers. They must live in an nn.ModuleDict (which
        # requires string keys) so that PyTorch registers their parameters;
        # in a plain dict the model would not learn.
        # Layers are created in increasing order so that seeded weight
        # initialisation is reproducible.
        self.fc = nn.ModuleDict({
            str(i): nn.Linear(in_features=self.n_l[i - 1], out_features=self.n_l[i])
            for i in layer_ids
        })

    def forward(self, x):
        """Run a forward pass and cache every ``z[l]`` and ``a[l]``.

        Args:
            x: input batch; everything after the first dimension is flattened.

        Returns:
            The activation of the last layer, ``a[L]``.
        """
        # Input layer: one row per sample.
        self.a[0] = torch.flatten(x, 1)

        # Hidden layers up to and including the output layer.
        for i in range(1, self.L + 1):
            # affine map, then activation
            self.z[i] = self.fc[str(i)](self.a[i - 1])
            self.a[i] = self.f[i](self.z[i])

        return self.a[self.L]

## Tasks

Write a function ``backpropagation(model, y_true, y_pred)`` that computes:

- $\frac{\partial L}{\partial w^{[l]}_{i,j}}$ and store them in ``model.dL_dw[l][i,j]`` for $l \in [1 .. L]$ 
- $\frac{\partial L}{\partial b^{[l]}_{j}}$ and store them in ``model.dL_db[l][j]`` for $l \in [1 .. L]$ 

assuming ``model`` is an instance of the ``MyNet`` class.

A vectorized implementation would be appreciated.

In [167]:
# Some testing

#print(model)
#model.df[str(L)](model.z[str(L)].data).data
#L = len(model.dL_dw)
#print(L)
#print(model.df[2](model.z[2].data).data)
#layer_name = 'fc' + str(2)
#for param_name in ['weight', 'bias']:
#getattr(model.fc[str(1)], 'weight').data
#model.fc[str(2)].weight.data
#model.z[1].data
#(model.z[1])
#getattr(model.fc[str(1)], 'weight').data
#print(f'bias: {model.fc[str(l)].bias.data}')
#print(f'dbias: {model.dL_db[l].data}')
#print(f'weights: {model.fc[str(l)].weight.data}')
#print(f'dweights: {model.dL_dw[l].data}')


In [181]:
def backpropagation(model, y_true, y_pred):
    """Compute the loss gradients by hand and store them on ``model``.

    The loss derivative used at the output is ``-2 * (y_true - y_pred)``, i.e.
    that of a summed squared error; no division by the batch size or by the
    number of outputs is applied. Contributions of all samples in the batch
    are summed, for the weights and for the biases alike.

    Args:
        model: object exposing ``L``, ``fc[str(l)]``, ``df[l]``, ``z[l]`` and
            ``a[l]`` as ``MyNet`` does, after a forward pass has filled the
            ``z`` and ``a`` caches with 2-D ``(batch, units)`` tensors.
        y_true: target values, broadcastable against ``y_pred``.
        y_pred: network output for the same batch, shape ``(batch, n(L))``.

    Returns:
        None. For every layer ``l`` in ``1..L`` the results are written to
        ``model.dL_dw[l]`` (same shape as ``model.fc[str(l)].weight``) and
        ``model.dL_db[l]`` (same shape as ``model.fc[str(l)].bias``).
    """
    # The cached z/a tensors are attached to autograd's graph; no_grad keeps
    # this manual computation from being tracked as well.
    with torch.no_grad():
        last = model.L

        # Error signal of the output layer:
        # delta[L] = dL/dA[L] * f'(Z[L]), with dL/dA[L] = -2 (y_true - y_pred).
        delta = -2 * (y_true - y_pred) * model.df[last](model.z[last])

        # Walk from the output layer back to the first layer.
        for layer in range(last, 0, -1):
            # dL/dW[l] = delta[l]^T . A[l-1]; the matmul sums over batch rows.
            model.dL_dw[layer] = torch.matmul(delta.t(), model.a[layer - 1])
            # dL/db[l]: sum over the batch rows as well (for a single sample
            # this is just that sample's row).
            model.dL_db[layer] = delta.sum(dim=0)

            if layer > 1:
                # Push the error through W[l] to the previous layer:
                # delta[l-1] = (delta[l] . W[l]) * f'(Z[l-1])
                weight = model.fc[str(layer)].weight
                delta = torch.matmul(delta, weight) * model.df[layer - 1](model.z[layer - 1])

In [182]:
# Show whether autograd tracks gradients for the weight of layer '2'.
# NOTE(review): this reads the global `model`, which no cell above this one
# defines -- on a fresh kernel it only works after a later cell that creates
# `model` has been run. Confirm the intended cell order.
print(getattr(model.fc['2'], 'weight').requires_grad)

True


## Run the cells below, and check the output

- In the 1st cell, we use a toy dataset and the same architecture as the MiniNet class of the fourth tutorial. 
- In the 2nd cell, we use a few samples of the MNIST dataset with a consistent model architecture (``24x24`` black and white cropped images as input and ``10`` output classes). 

You can set ``verbose`` to ``True`` if you want more details about your computations versus what is expected.

In [183]:
# Network with layer sizes 2 -> 3 -> 2 (input, one hidden layer, output).
model = MyNet([2, 3, 2])
# Hand the manual backpropagation function and the model to the test routine
# imported from tests_backpropagation, on the 'toy' data with verbose output.
main_test(backpropagation, model, verbose=True, data='toy')




 ------------ fc['1'].weight.grad ------------ 
  Our computation:
 tensor([[-4.1753e-07, -3.9125e-07],
        [-1.7184e-03, -1.6103e-03],
        [ 5.7746e-05,  5.4111e-05]])
  Autograd's computation:
 tensor([[-4.1753e-07, -3.9125e-07],
        [-1.7184e-03, -1.6103e-03],
        [ 5.7746e-05,  5.4111e-05]])

 ------------- fc['1'].bias.grad ------------- 
  Our computation:
 tensor([-4.3507e-08, -1.7906e-04,  6.0171e-06])
  Autograd's computation:
 tensor([-4.3507e-08, -1.7906e-04,  6.0171e-06])

 ------------- relative error ------------ 
(fc[1].weight.grad, model.dL_dw[1]):   0.0000
(fc[1].bias.grad, model.dL_db[1]):   0.0000
(fc[2].weight.grad, model.dL_dw[2]):   0.0000
(fc[2].bias.grad, model.dL_db[2]):   0.0000
Gradients consistent with finite differences computations. :) 


 ------------ fc['1'].weight.grad ------------ 
  Our computation:
 tensor([[-2.8201e-07, -2.6425e-07],
        [-8.3564e-04, -7.8304e-04],
        [ 3.0540e-05,  2.8618e-05]])
  Autograd's computation:

In [180]:
# Network with 24*24 = 576 inputs, one hidden layer of 16 units and 10 outputs.
model = MyNet([24*24, 16, 10])
# Same check as for the toy data, this time with data='mnist'.
main_test(backpropagation, model, verbose=True, data='mnist')


True


  return F.mse_loss(input, target, reduction=self.reduction)


True
True
True
True

Gradients consistent with autograd's computations. :) 
Gradients consistent with finite differences computations. :) 

Weights have been updated. :)

All parameters seem correctly attached to the computational graph! :) 
