# First Laboratory of Machine Learning per il Calcolo Scientifico
In this notebook, we want to create the network from the paper "Error bounds for approximations with deep ReLU networks" by D. Yarotsky (Neural Networks, 2017, https://doi.org/10.1016/j.neunet.2017.07.002).

## Step zero: libraries

Install  Python (https://www.python.org/)

And the following libraries:

-PyTorch (https://pytorch.org/)

-Numpy (https://numpy.org/)

-Matplotlib (https://matplotlib.org/)


In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt

# Use double precision for every tensor created from here on.
torch.set_default_dtype(torch.float64)

# Pick the GPU when CUDA reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

torch.set_default_device(device)  # newly created tensors default to `device`
print("I'm using: ", device)


## First step: Define $\Phi^\wedge$ using torch Tensor 
Define the weights and the bias of the NN $\Phi^\wedge$ with ReLU activation, seen during the lectures, that emulates $F_1$, and plot it on [0,1].
$$
F_1 : x\mapsto 
\begin{cases}
2x &x\in (0, 1/2]\\
2-2x &x \in (1/2, 1)\\
0 &\text{elsewhere.}
\end{cases}
$$
We want to define the hidden weight $W_1 \in \mathbb{R}^{3\times 1}$ and bias $b\in \mathbb{R}^{1\times 3}$ and the output weight $W_2\in \mathbb{R}^{1\times 3}$. The NN then will be the map
$$
x\mapsto \left[\mathrm{ReLU}(xW_1^\top + b)\right] W_2^\top
$$
where $x \in \mathbb{R}^{N\times 1}$ can be a vector of $N$ points in $[0,1]$. Note that we choose the formulation above to follow the format implemented in `torch.nn.Linear`, see https://pytorch.org/docs/stable/generated/torch.nn.Linear.html. 

In [None]:
# Hat function as a one-hidden-layer ReLU network:
#   F_1(x) = 2*ReLU(x) - 4*ReLU(x - 1/2) + 2*ReLU(x - 1)
# All three hidden neurons read x with weight 1; the kinks at 0, 1/2 and 1
# come from the biases, and the slopes 2, -4, 2 from the output weights.
input_weight_mat = torch.tensor([[1.0], [1.0], [1.0]])  # W_1, shape (3, 1)
input_bias_vec = torch.tensor([[0.0, -0.5, -1.0]])  # b, shape (1, 3)
output_weight_mat = torch.tensor([[2.0, -4.0, 2.0]])  # W_2, shape (1, 3)

with torch.inference_mode():  # to disable gradient computations, which conflict with plotting
    x = torch.linspace(0, 1, 200).unsqueeze(-1)  # column of N = 200 points, shape (200, 1)
    # x W_1^T + b has shape (200, 3); multiplying by W_2^T gives shape (200, 1).
    y = torch.relu(x @ input_weight_mat.T + input_bias_vec) @ output_weight_mat.T

    x_np = x.cpu()

    plt.plot(x_np, y.squeeze().cpu(), label='f')
    plt.grid()
    plt.show()

We now do the same thing using `torch.nn.Linear`

In [None]:
# Same network as above, now expressed with torch.nn.Linear modules.
hidden_layer = torch.nn.Linear(1, 3)  # 1 input feature -> 3 hidden neurons
output_layer = torch.nn.Linear(3, 1, bias=False)  # 3 hidden neurons -> 1 output, no bias

# Replace the random initialisation with the hat-function weights.
# Linear stores its weight as (out_features, in_features) and its bias as (out_features,).
hidden_layer.weight = torch.nn.Parameter(torch.tensor([[1.0], [1.0], [1.0]]))
hidden_layer.bias = torch.nn.Parameter(torch.tensor([0.0, -0.5, -1.0]))
output_layer.weight = torch.nn.Parameter(torch.tensor([[2.0, -4.0, 2.0]]))

with torch.inference_mode():
    x = torch.linspace(0, 1, 200).unsqueeze(-1)
    # affine map -> ReLU -> linear read-out
    y = output_layer(torch.relu(hidden_layer(x)))

    x_np = x.cpu()

    plt.plot(x_np, y.squeeze().cpu(), label='f')
    plt.grid()
    plt.show()

## Second step: define a general Feed-Forward Neural Network

Complete the following Python class of a FNN (This will be used in future laboratories!)

In [None]:
class DeepNet(torch.nn.Module):
    """Fully connected feed-forward network with a user-supplied activation.

    The network applies ``activation(Linear(x))`` once per hidden layer and,
    if an output dimension was given, finishes with a bias-free linear layer.

    Attributes:
        dim_in: input dimension.
        dim_out: output dimension (the last hidden width, or ``dim_in`` when
            there are no hidden layers, if no output layer was requested).
        activation: callable applied after every hidden affine map.
        widths: list of hidden layer widths (empty when there are none).
        L: number of hidden layers.
        hidden: ``torch.nn.ModuleList`` holding the hidden ``Linear`` layers.
        output: bias-free output ``Linear`` layer, or ``None``.
    """

    def __init__(self, activation, n_input, n_hidden=None, n_output=None):
        """
        construct a NN with
        activation: activation function
        n_input: input dimension
        n_hidden: list of hidden layer widths (None or empty: no hidden layer)
        n_output: output dim (None: no output layer)
        example:  NN = DeepNet(torch.tanh, 2, [4, 3, 4], 1)
        """
        super().__init__()  # Constructor of the super class torch.nn.Module
        torch.manual_seed(0)  # reseeds the global RNG so the initial weights are reproducible
        self.dim_in = n_input
        self.activation = activation
        self.hidden = torch.nn.ModuleList()

        # `widths` is always defined (possibly empty) so that code reading it
        # does not fail with AttributeError on a network without hidden layers.
        self.widths = list(n_hidden) if n_hidden is not None else []
        self.L = len(self.widths)

        fan_in = n_input  # input size of the layer being created
        for width in self.widths:
            layer = torch.nn.Linear(fan_in, width)
            torch.nn.init.xavier_normal_(layer.weight)
            torch.nn.init.normal_(layer.bias)
            self.hidden.append(layer)
            fan_in = width

        if n_output is not None:
            self.dim_out = n_output
            # the output layer has no bias
            self.output = torch.nn.Linear(fan_in, n_output, bias=False)
            torch.nn.init.xavier_normal_(self.output.weight)
        else:
            # Without an output layer the network returns the last hidden activations.
            self.dim_out = fan_in
            self.output = None

    def forward(self, x):
        """
        Given input vector x produces the output of the NN.

        A 1-D tensor of N values is treated as N samples of a scalar input,
        i.e. it is reshaped to (N, 1) before entering the first layer.
        """
        if x.dim() == 1:
            x = x.unsqueeze(-1)  # add a dimension at the end
        for layer in self.hidden:
            x = self.activation(layer(x))  # affine transformation, then nonlinearity

        if self.output is not None:
            x = self.output(x)
        return x

    # Functions needed for Exercise 2
    def set_weight(self, layer, weight_mat, requires_grad=True):
        """Replace the weight matrix of hidden layer ``layer``.

        ``weight_mat`` must have the shape of the current weight,
        (out_features, in_features). A detached copy is stored, so later
        changes to ``weight_mat`` do not affect the network.
        """
        assert self.L > layer
        assert weight_mat.shape == self.hidden[layer].weight.shape
        self.hidden[layer].weight = torch.nn.Parameter(
            weight_mat.detach().clone(), requires_grad=requires_grad
        )

    def set_bias(self, layer, bias_mat, requires_grad=True):
        """Replace the bias of hidden layer ``layer``.

        ``bias_mat`` may have any shape whose flattened form matches the
        current bias, e.g. (1, width) or (width,). A detached, flattened copy
        is stored.
        """
        assert self.L > layer
        assert bias_mat.flatten().shape == self.hidden[layer].bias.shape
        self.hidden[layer].bias = torch.nn.Parameter(
            bias_mat.detach().clone().flatten(), requires_grad=requires_grad
        )

    def set_output_weight(self, weight_mat, requires_grad=True):
        """Replace the weight matrix of the output layer.

        Requires that the network was built with an output layer.
        ``weight_mat`` must have the shape of the current output weight; a
        detached copy is stored.
        """
        assert self.output is not None
        assert weight_mat.shape == self.output.weight.shape
        self.output.weight = torch.nn.Parameter(
            weight_mat.detach().clone(), requires_grad=requires_grad
        )


### Create a FNN with 1 layer 3 neurons and ReLU activation function

In [None]:
input_d = 1  # input dim: scalar x
output_d = 1  # output dim: scalar value
hidden_layer = [3]  # the list of widths of (one) hidden layer
activation_function = torch.relu  # relu activation

# One hidden layer with 3 neurons, ReLU activation, bias-free output layer.
FNN = DeepNet(activation_function, input_d, hidden_layer, output_d)
FNN

### Redefine $\Phi^\wedge$ using the FNN defined before  

In [None]:
Phiwedge = FNN  # same object as FNN: the setters below overwrite FNN's weights

# Set the hidden weight/bias and the output weight of Phiwedge so that
# Phiwedge(x) = 2*ReLU(x) - 4*ReLU(x - 1/2) + 2*ReLU(x - 1), i.e. F_1.
Phiwedge.set_weight(0, torch.tensor([[1.0], [1.0], [1.0]]))
Phiwedge.set_bias(0, torch.tensor([0.0, -0.5, -1.0]))
Phiwedge.set_output_weight(torch.tensor([[2.0, -4.0, 2.0]]))

Plot the output of the NN in [0,1].

In [None]:
# Evaluate Phiwedge on a uniform grid of [0, 1] and draw the result.
with torch.inference_mode():
    x = torch.linspace(0, 1, 200)
    x_np = x.cpu()  # plotting needs the data on the CPU

    y = Phiwedge(x)

    plt.plot(x_np, y.cpu(), label='f')
    plt.grid()
    plt.show()

## Third step: Concatenation of two Networks
Complete the following code which implements the non-sparse concatenation

In [None]:
def concatenate(nn1, nn2):
    """Return a DeepNet realising ``nn2(nn1(x))`` (non-sparse concatenation).

    The bias-free output layer of ``nn1`` and the first affine map of ``nn2``
    are merged into one hidden layer, so the result has
    ``nn1.L + nn2.L`` hidden layers with widths ``nn1.widths + nn2.widths``.

    Both networks must have at least one hidden layer and an output layer,
    share the same activation, and satisfy ``nn1.dim_out == nn2.dim_in``.
    """
    assert nn1.dim_out == nn2.dim_in
    assert nn1.activation == nn2.activation
    assert nn1.L >= 1 and nn2.L >= 1, "both networks need at least one hidden layer"
    assert nn1.output is not None and nn2.output is not None, "both networks need an output layer"

    new_widths = list(nn1.widths) + list(nn2.widths)
    out_net = DeepNet(nn1.activation, nn1.dim_in, new_widths, nn2.dim_out)

    with torch.no_grad():  # pure weight bookkeeping, nothing to differentiate
        # Hidden layers of nn1 are taken over unchanged.
        for i in range(nn1.L):
            out_net.set_weight(i, nn1.hidden[i].weight.detach())
            out_net.set_bias(i, nn1.hidden[i].bias.detach())

        # Merged layer: W1(nn2) @ W_out(nn1), with the bias of nn2's first layer
        # (nn1's output layer has no bias, so nothing else contributes).
        new_weight = nn2.hidden[0].weight.detach() @ nn1.output.weight.detach()
        out_net.set_weight(nn1.L, new_weight)
        out_net.set_bias(nn1.L, nn2.hidden[0].bias.detach())

        # Remaining hidden layers of nn2, shifted by nn1.L positions.
        for i in range(1, nn2.L):
            out_net.set_weight(nn1.L + i, nn2.hidden[i].weight.detach())
            out_net.set_bias(nn1.L + i, nn2.hidden[i].bias.detach())

        out_net.set_output_weight(nn2.output.weight.detach())

    return out_net

Concatenate the NN $\Phi^\wedge$ defined in the second step 4 times ($F_4 = F_1\circ F_1 \circ F_1 \circ F_1$).

In [None]:
# F_4 = F_1 o F_1 o F_1 o F_1: start from Phiwedge and append it three more times.
F4 = Phiwedge
for _ in range(3):
    F4 = concatenate(F4, Phiwedge)
print(F4)
with torch.inference_mode():
    x = torch.linspace(0, 1, 400)
    x_np = x.cpu().numpy()

    # plot F4(x)
    y = F4(x)
    plt.plot(x_np, y.cpu().numpy().flatten(), label='$F_4$')
    plt.grid()
    plt.legend()
    plt.show()

## Fourth step: Define the Yarotsky Network 

We now want to construct the network from Yarotsky's paper. Defining by $F_n$ the $n$-fold composition of $\Phi^\wedge$, we use the fact that
$$
x - \sum_{k=1}^N \frac{F_k(x)}{4^k} \to x^2, \qquad \text{as }N\to\infty,
$$ 
for all $x\in[0,1]$. The network therefore uses a non-standard architecture, as shown in this picture:

<center>
    <img src="./Yarotsky-arch.png" alt="Architecture">
</center>

In [None]:
class YarotskyNet(DeepNet):
    """DeepNet with weighted skip connections from every layer to the output.

    The output is
        out_weights[0] * x + sum_i out_weights[i + 1] * output(a_i),
    where a_i is the activation of hidden layer i and ``output`` is the
    (single, shared) output layer copied from ``net``. This reads off the
    intermediate compositions only when every block of ``net`` uses that same
    output weight, as is the case when ``net`` is a repeated concatenation of
    one base network.
    """

    def __init__(self, net, out_weights):
        """
        Initialize by copying a net given as input, and a vector of
        weights that are the weights of the skip connections.

        net: DeepNet with an output layer and dim_in == dim_out
        out_weights: sequence of len(net.widths) + 1 scalars; entry 0 weights
            the input itself, entry i + 1 weights the read-out of hidden layer i
        """
        import copy  # local import: only needed to clone the layers of `net`

        # Validate before building anything.
        assert len(out_weights) == len(net.widths) + 1
        assert net.output is not None, "net needs an output layer for the skip connections"
        assert net.dim_in == net.dim_out, "the input skip connection requires dim_in == dim_out"

        super().__init__(net.activation, net.dim_in, net.widths, net.dim_out)
        self.out_weights = out_weights
        # Deep copies keep this network independent of `net`.
        for i in range(net.L):
            self.hidden[i] = copy.deepcopy(net.hidden[i])

        self.output = copy.deepcopy(net.output)

    def forward(self, x):
        """Evaluate the network with skip connections on input ``x``.

        A 1-D tensor of N values is treated as N scalar samples, shape (N, 1).
        """
        if x.dim() == 1:
            x = x[:, None]

        y = self.out_weights[0] * x  # the first skip connection goes straight from the input
        for i in range(self.L):
            x = self.hidden[i](x)  # affine transformation
            x = self.activation(x)  # activation
            # skip connection: read out this layer through the shared output weight
            y = y + self.out_weights[i + 1] * self.output(x)
        return y

Define a Yarotsky network with depth = 3. Plot the output of the Yarotsky network over [0,1] and compare it with $x^2$. Then evaluate the $L^2$ error.

In [None]:
depth = 3

# Skip-connection weights of x - sum_{k=1}^{depth-1} F_k(x) / 4^k:
# weight 1 on the input, then -4^{-k} on the read-out of the k-th hidden layer.
out_weights = [1.0] + [-(0.25 ** k) for k in range(1, depth)]

# Concatenate the Phiwedge network, to obtain a network with the right weights
# but no skip connections (depth - 1 hidden layers in total).
# The networks are built outside inference mode so their parameters are
# ordinary tensors.
basenet = Phiwedge
for j in range(depth-2):
    basenet = concatenate(basenet, Phiwedge)

# Copy the concatenated network and add the skip connections.
squarenet = YarotskyNet(basenet, out_weights)

with torch.inference_mode():
    x = torch.linspace(0, 1, 100)
    x_np = x.cpu().numpy()

    y = squarenet(x)

    plt.plot(x_np, y.cpu().numpy(), label='Yarotsky')
    plt.plot(x_np, x_np**2, 'r-.',label = '$x^2$')
    plt.grid()
    plt.legend()
    plt.show()

    # Discrete L2 norm: Euclidean norm of the residual divided by sqrt(#points).
    l2_error = np.linalg.norm(y.cpu().numpy().flatten()-x_np**2)/np.sqrt(x_np.size)
    print(f'L2 error: {l2_error:4e}')

Evaluate the $L^2$ error of the Yarotsky network with respect to $x^2$, as a function of the depth of the network.

In [None]:
def compute_err(depth, Npts=10**6):
    """Return the discrete L2 error on [0, 1] between the Yarotsky net and x^2.

    A network with ``depth - 1`` hidden layers is built from ``Phiwedge`` for
    the requested ``depth`` (rather than reusing a previously built global
    network), evaluated on ``Npts`` equispaced points, and compared with x^2.

    depth: number of skip-connection weights; must be at least 2
    Npts: number of sample points in [0, 1]
    Raises ValueError if depth < 2.
    """
    if depth < 2:
        raise ValueError("depth must be at least 2")

    # Weights of x - sum_{k=1}^{depth-1} F_k(x) / 4^k.
    out_weights = [1.0] + [-(0.25 ** k) for k in range(1, depth)]

    basenet = Phiwedge
    for _ in range(depth - 2):
        basenet = concatenate(basenet, Phiwedge)
    squarenet = YarotskyNet(basenet, out_weights)

    # No gradients are needed; skipping autograd avoids building a graph over Npts samples.
    with torch.no_grad():
        x = torch.linspace(0, 1, Npts)
        y = squarenet(x)
    x_np = x.cpu().numpy()

    residual = y.cpu().numpy().flatten() - x_np**2
    return np.linalg.norm(residual) / np.sqrt(Npts)

# Sweep the depth and record the L2 error of each network.
all_depths = range(3, 15)
err = []
for depth in all_depths:  # `depth` is left as a notebook-level name after the loop
    err.append(compute_err(depth))

# Error against depth on a logarithmic vertical axis.
plt.semilogy(all_depths, err, 'o-')
plt.xlabel("Depth")
plt.ylabel("$L^2$ error")
plt.grid()
plt.show()

err