In [None]:
import numpy as np
import unittest

## Implementation of a 2 layer neural network

Hi! In this task we will implement a 2-layer fully connected neural network from scratch, using only numpy. We will go through all the steps, explaining the math behind them and deriving all the equations using plain linear algebra and some basic calculus. In this way, we get a full understanding of a neural network. Let us do a short recap first:


- Let $A$ and $B$ be two matrices:

$$
\begin{align*}
\frac{\partial AB}{\partial A} &= B^T \\
\frac{\partial AB}{\partial B} &= A^T
\end{align*}
$$
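- Sum of squared errors:
$$
SSE = \sum_{i=1}^{n} (y_i - \hat{y}_i)^2
$$
  In the code below the loss is scaled by $\frac{1}{2}$, i.e. $J = \frac{1}{2} SSE$, so that its derivative with respect to $\hat{y}$ is simply $\hat{y} - y$.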
- Gradient descent method, for a cost function $J$:

$$
\theta = \theta - \alpha \cdot \frac{\partial J(\theta)}{\partial \theta}
$$

Where:

- $\theta$: Parameters (weights and biases)
- $\alpha$: Learning rate
- $\frac{\partial J(\theta)}{\partial \theta}$: Gradient of the cost function

In [None]:
def sserror(y_true, y_pred):
    """
    Exercise stub: implement the sum-of-squared-errors loss.

    Replace the placeholder so that ``sse`` holds the loss between the
    ground truth and the prediction. The backpropagation section of this
    notebook uses J = (A - Y)^2 / 2, so include the 1/2 factor.

    Arguments:

    y_true -- vector with n elements (ground truth)
    y_pred -- vector with n elements (prediction)

    Returns:

    sse -- sum of square errors (scalar)

    """
    # YOUR CODE STARTS HERE
    sse = #Your code here
    #YOUR CODE ENDS HERE
    return sse

In [None]:
def sserror(y_true, y_pred):
    """
    Return half of the sum of squared differences between prediction and truth.

    The 1/2 factor makes the derivative with respect to the prediction
    equal to (y_pred - y_true).

    Arguments:
    y_true -- ground truth values
    y_pred -- predicted values; must broadcast against y_true

    Returns:
    sse -- 0.5 * sum((y_pred - y_true)**2), a scalar
    """
    residual = y_pred - y_true
    return 0.5 * np.sum(residual ** 2)

## Forward propagation
Suppose you have a dataset of $m$ observations and $k$ features. For a single layer of a neural network, the idea is to start with an input of $k$ neurons and produce an output of $n$ neurons:
$$
\begin{align*}
Z &= WX+b\\
A &= g(Z)
\end{align*}
$$
where:
- X is the input matrix with dimension $k \times m$
- W is the matrix of weights of dimension $n \times k$
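- b is the bias vector of dimension $n \times 1$. Strictly speaking, it has to be added to every column of $WX$, i.e. it acts as a matrix of dimension $n \times m$ whose columns are all equal to $b$ (numpy broadcasting does this for us)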
- Z is the pre activated output, a matrix of dimension $n \times m$
- $g$ is the activation function (e.g., ReLU, sigmoid, tanh)  
- A is the activated output, a matrix of dimension $n \times m$

So, what are these *activation functions* $g$? They are functions that add non-linearity to our model. Without them the network would be just a linear combination of its inputs, like a traditional linear regression. We will implement only 3 of them:

Sigmoid:
$$
\sigma(x) = \frac{1}{1 + e^{-x}}
$$
Tanh:
$$
\tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}
$$
Relu:
$$
\text{ReLU}(x) = \max(0, x)
$$

In [None]:
def actfunc(x, funct) :
    """
    Exercise stub: apply an activation function selected by name.

    Fill in the three placeholders with the formulas for sigmoid, tanh and
    ReLU. Any other value of ``funct`` returns ``x`` unchanged.

    Arguments:

    x -- input data
    funct -- name of the activation function: 'sigmoid', 'tanh' or 'Relu'

    Returns:

    f -- the selected activation function evaluated at x

    """
    # YOUR CODE STARTS HERE
    if funct == 'sigmoid' :
        f = #Your code here
    elif funct == 'tanh' :
        f = #Your code here
    elif funct == 'Relu' :
        f =  #Your code here
    else :
        f = x
    # YOUR CODE ENDS HERE
    return f

In [None]:
def actfunc(x, funct):
    """
    Evaluate the activation function named by ``funct`` at ``x``.

    Arguments:

    x -- input data (numpy array or scalar)
    funct -- 'sigmoid', 'tanh' or 'Relu'; any other value selects the
             identity, i.e. x is returned as is

    Returns:

    f -- the activation evaluated element-wise at x

    """
    if funct == 'sigmoid':
        return 1 / (1 + np.exp(-x))
    if funct == 'tanh':
        return np.tanh(x)
    if funct == 'Relu':
        return np.maximum(0, x)
    # Unrecognised name: hand the input back untouched (linear activation).
    return x

In [None]:
class TestActivationFunction(unittest.TestCase):
    """Spot checks of actfunc at inputs whose outputs are known in closed form."""

    def test_sigmoid(self):
        # sigmoid(0) = 1 / (1 + e^0) = 0.5
        expected = 1 / (1 + np.exp(0))
        result = actfunc(np.array([0]), 'sigmoid')
        self.assertAlmostEqual(
            result[0],
            expected,
            msg=f"Expected sigmoid(0) to be {expected}, but got {result[0]}",
        )

    def test_tanh(self):
        # tanh(0) = 0
        expected = np.tanh(0)
        result = actfunc(np.array([0]), 'tanh')
        self.assertAlmostEqual(
            result[0],
            expected,
            msg=f"Expected tanh(0) to be {expected}, but got {result[0]}",
        )

    def test_relu_positive(self):
        # Positive inputs pass through ReLU unchanged.
        expected = np.maximum(0, 5)
        result = actfunc(np.array([5]), 'Relu')
        self.assertEqual(
            result[0],
            expected,
            msg=f"Expected Relu(5) to be {expected}, but got {result[0]}",
        )

    def test_relu_negative(self):
        # Negative inputs are clamped to zero.
        expected = 0
        result = actfunc(np.array([-3]), 'Relu')
        self.assertEqual(
            result[0],
            expected,
            msg=f"Expected Relu(-3) to be {expected}, but got {result[0]}",
        )

    def test_default(self):
        # An unrecognised activation name must give back the input.
        x = np.array([7])
        result = actfunc(x, 'unknown')
        self.assertTrue(
            np.array_equal(result, x),
            msg=f"Expected input array to be returned, but got {result} instead",
        )


if __name__ == '__main__':
    # argv=[''] stops unittest from parsing the process's real command line;
    # exit=False stops it from calling sys.exit() once the tests have run.
    unittest.main(argv=[''], exit=False)


In general, for a layer $l$, the forward propagation equations are:

$$
\begin{align*}
Z^{[l]} &= W^{[l]} A^{[l-1]} + b^{[l]} \\
A^{[l]} &= g^{[l]}(Z^{[l]})
\end{align*}
$$

Where:

- $ A^{[l-1]} $: Activated output from the previous layer with dimension $n^{[l-1]} \times m$ 
- $ W^{[l]} $: Weight matrix for layer with dimension $ n^{[l]} \times n^{[l-1]}$
- $ b^{[l]} $: Bias vector for layer with dimension $n^{[l]} \times 1$
- $ Z^{[l]}$: Pre activated output with dimension $n^{[l]} \times m $   
- $ g^{[l]} $: Activation function (e.g., ReLU, sigmoid, tanh) of layer $l$
- $A^{[l]} $: Activated output with dimension $n^{[l]} \times m$  
- $ m $: Batch size

In [None]:
def forwardprop(X,W1,W2,b1,b2, funct1, funct2) :
    """
    Exercise stub: forward propagation through a network with 2 layers.

    For each layer compute the pre-activation Z = W . A_prev + b and the
    activation A = g(Z), with A_prev = X for the 1st layer.

    Arguments:
    X: input data of shape (n_inputlayer, m)
    W1: weights of 1st layer, shape (n_hiddenlayer, n_inputlayer)
    W2: weights of 2nd layer, shape (n_outputlayer, n_hiddenlayer)
    b1: bias of 1st layer, shape (n_hiddenlayer, 1)
    b2: bias of 2nd layer, shape (n_outputlayer, 1)
    funct1: name of the activation function of the 1st layer
    funct2: name of the activation function of the 2nd layer

    Returns:
    A2: activated output of 2nd layer, shape (n_outputlayer,m)
    Z2: preactivated output of 2nd layer, shape (n_outputlayer,m)
    A1: activated output of 1st layer, shape (n_hiddenlayer,m)
    Z1: preactivated output of 1st layer, shape (n_hiddenlayer,m)
    """
    # YOUR CODE STARTS HERE
    Z1 = #your code here
    A1 = #your code here
    Z2 = #your code here
    A2 = #your code here
    # YOUR CODE ENDS HERE
    return A2, Z2, A1, Z1

In [None]:
def forwardprop(X,W1,W2,b1,b2, funct1, funct2) :
    """
    Run the input through both layers and return every intermediate result.

    Arguments:
    X: input data of shape (n_inputlayer, m)
    W1: weights of 1st layer, shape (n_hiddenlayer, n_inputlayer)
    W2: weights of 2nd layer, shape (n_outputlayer, n_hiddenlayer)
    b1: bias of 1st layer, shape (n_hiddenlayer, 1)
    b2: bias of 2nd layer, shape (n_outputlayer, 1)
    funct1: activation name of the 1st layer, forwarded to actfunc
    funct2: activation name of the 2nd layer, forwarded to actfunc

    Returns:
    A2: activated output of 2nd layer, shape (n_outputlayer,m)
    Z2: preactivated output of 2nd layer, shape (n_outputlayer,m)
    A1: activated output of 1st layer, shape (n_hiddenlayer,m)
    Z1: preactivated output of 1st layer, shape (n_hiddenlayer,m)
    """
    def layer(weights, inputs, bias, funct):
        # One fully connected layer: affine map, then the named activation.
        pre_activation = weights.dot(inputs) + bias
        return pre_activation, actfunc(pre_activation, funct)

    Z1, A1 = layer(W1, X, b1, funct1)
    Z2, A2 = layer(W2, A1, b2, funct2)
    return A2, Z2, A1, Z1

In [None]:
class TestForwardProp(unittest.TestCase):
    """Shape check of forwardprop on a 2-2-1 network with a single sample."""

    def test_forwardprop_shapes(self):
        # One sample with two features, two hidden units, one output unit.
        X = np.array([[0.5], [0.1]])              # (2, 1)
        W1 = np.array([[0.2, 0.4], [0.3, 0.1]])   # (2, 2)
        b1 = np.array([[0.1], [0.2]])             # (2, 1)
        W2 = np.array([[0.7, 0.5]])               # (1, 2)
        b2 = np.array([[0.3]])                    # (1, 1)

        A2, Z2, A1, Z1 = forwardprop(X, W1, W2, b1, b2, 'tanh', 'sigmoid')

        # Output layer results are (n_outputlayer, m) = (1, 1).
        self.assertEqual(
            A2.shape, (1, 1),
            msg=f"Expected A2 shape (1,1), got {A2.shape}",
        )
        self.assertEqual(
            Z2.shape, (1, 1),
            msg=f"Expected Z2 shape (1,1), got {Z2.shape}",
        )
        # Hidden layer results are (n_hiddenlayer, m) = (2, 1).
        self.assertEqual(
            A1.shape, (2, 1),
            msg=f"Expected A1 shape (2,1), got {A1.shape}",
        )
        self.assertEqual(
            Z1.shape, (2, 1),
            msg=f"Expected Z1 shape (2,1), got {Z1.shape}",
        )


if __name__ == '__main__':
    # argv=[''] stops unittest from parsing the process's real command line;
    # exit=False stops it from calling sys.exit() once the tests have run.
    unittest.main(argv=[''], exit=False)

The forward propagation outputs, in the last layer, the predicted variable for given parameters $W$ and $b$. So for different values of these parameters, we get different values of the predicted variable. How can we find the right $W$ and $b$ such that the predicted variable gets really close to the ground truth label? This is where the backward propagation takes place.
## Backward propagation
It is a procedure where we start from the loss function and go all the way back to the beginning, computing the gradients needed to update the weight and bias matrices in each layer:

Applying chain rule:
$$
\begin{align*}
\frac{\partial J}{\partial Z^{[l]}} &= \frac{\partial J}{\partial A^{[l]}}\cdot \frac{\partial g^{[l]}(Z^{[l]})}{\partial Z^{[l]}} \\
\frac{\partial J}{\partial W^{[l]}} &= \frac{\partial J}{\partial Z^{[l]}}\cdot \frac{\partial Z^{[l]}}{\partial W^{[l]}}
= \frac{\partial J}{\partial Z^{[l]}}\cdot (A^{[l-1]})^T \\
\frac{\partial J}{\partial b^{[l]}} &= \frac{\partial J}{\partial Z^{[l]}}\cdot \frac{\partial Z^{[l]}}{\partial b^{[l]}}
= \frac{\partial J}{\partial Z^{[l]}}\cdot \mathbf{1} \\
\frac{\partial J}{\partial A^{[l-1]}} &= \frac{\partial J}{\partial Z^{[l]}}\cdot \frac{\partial Z^{[l]}}{\partial A^{[l-1]}}= (W^{[l]})^T \cdot \frac{\partial J}{\partial Z^{[l]}}
\end{align*}
$$


As we see, we need the derivative of the activation function $g$. The derivative of the functions we have considered for this task are:

Derivative of sigmoid:
$$
\frac{d}{dx} \sigma(x) = \sigma(x) \left( 1 - \sigma(x) \right)
$$
Derivative of tanh:
$$
\frac{d}{dx} \tanh(x) = 1 - \tanh^2(x)
$$
Derivative of ReLU:
$$
\frac{d}{dx} \text{ReLU}(x) =
\begin{cases} 
0 & \text{if } x \leq 0 \\
1 & \text{if } x > 0
\end{cases}
$$

In [None]:
def derivactfunc(x, funct) :
    """
    Exercise stub: derivative of the activation function selected by name.

    For each known activation, first evaluate f(x) and then express the
    derivative df in terms of it. Any other value of ``funct`` yields a
    derivative of 1. The unit test in this notebook expects the ReLU
    derivative at exactly x = 0 to be 0.

    Arguments:

    x -- input data
    funct -- name of the activation function: 'sigmoid', 'tanh' or 'Relu'

    Returns:

    df -- derivative of funct activation function, f'(x)

    """
    # YOUR CODE STARTS HERE
    if funct == 'sigmoid' :
        f = #your code here
        df = #your code here
    elif funct == 'tanh' :
        f = #your code here
        df = #your code here
    elif funct == 'Relu' :
        f =  #your code here
        df = #your code here
    else :
        df = 1
    # YOUR CODE ENDS HERE
    return df

In [None]:
def derivactfunc(x, funct) :
    """
    Evaluate the derivative of an activation function element-wise.

    The result is always a floating-point value with the same shape as
    ``x``, whichever activation is selected, so callers can rely on its
    shape and dtype.

    Arguments:

    x -- input data (numpy array or scalar)
    funct -- 'sigmoid', 'tanh' or 'Relu'; any other value selects the
             identity activation, whose derivative is 1 everywhere

    Returns:

    df -- derivative of funct activation function, f'(x)

    """
    x = np.asarray(x)
    # Keep the caller's floating dtype; integer/bool input is promoted to float64.
    float_dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
    if funct == 'sigmoid' :
        f = 1/(1+np.exp(-x))
        df = f*(1-f)
    elif funct == 'tanh' :
        f = np.tanh(x)
        df = 1-f**2
    elif funct == 'Relu' :
        # Convention: the derivative at exactly 0 is taken to be 0.
        df = (x > 0).astype(float_dtype)
    else :
        # Identity activation: return an array of ones, not a bare scalar.
        df = np.ones(x.shape, dtype=float_dtype)
    return df

In [None]:
class TestDerivActFunc(unittest.TestCase):
    """Spot checks of derivactfunc at inputs with known derivatives."""

    def test_sigmoid_derivative(self):
        # sigmoid(0) = 0.5, so the derivative is 0.5 * (1 - 0.5) = 0.25.
        expected = 0.25
        result = derivactfunc(np.array([0.0]), 'sigmoid')
        self.assertAlmostEqual(
            result[0],
            expected,
            msg=f"Expected sigmoid' at 0 to be {expected}, got {result[0]}",
        )

    def test_tanh_derivative(self):
        # tanh(0) = 0, so the derivative is 1 - 0^2 = 1.
        expected = 1.0
        result = derivactfunc(np.array([0.0]), 'tanh')
        self.assertAlmostEqual(
            result[0],
            expected,
            msg=f"Expected tanh' at 0 to be {expected}, got {result[0]}",
        )

    def test_relu_derivative(self):
        # Covers a negative input, the kink at zero, and a positive input.
        x = np.array([-1.0, 0.0, 2.0])
        expected = np.array([0.0, 0.0, 1.0])
        result = derivactfunc(x, 'Relu')
        np.testing.assert_array_equal(
            result,
            expected,
            err_msg=f"Expected ReLU' at {x} to be {expected}, got {result}",
        )

    def test_unknown_function(self):
        # An unrecognised name is treated as identity: derivative equal to one.
        expected = np.array([1.0])
        result = derivactfunc(np.array([42.0]), 'unknown')
        np.testing.assert_array_equal(
            result,
            expected,
            err_msg="Expected derivative of unknown to return ones.",
        )


if __name__ == '__main__':
    # argv=[''] stops unittest from parsing the process's real command line;
    # exit=False stops it from calling sys.exit() once the tests have run.
    unittest.main(argv=[''], exit=False)

The previous equations are often written using differential notations, e.g, $db^{[l]} = \frac{\partial J}{\partial b^{[l]}}$ , so the Backward equations are:

$$
\begin{align*}
 dZ^{[l]} &= dA^{[l]} * g'^{[l]}(Z^{[l]}) \\
 dW^{[l]} &= \frac{1}{m}dZ^{[l]}\cdot (A^{[l-1]})^T \\
 db^{[l]} &= \frac{1}{m}dZ^{[l]}\cdot \mathbf{1} \\
 dA^{[l-1]} &= (W^{[l]})^T \cdot dZ^{[l]}
\end{align*}
$$

where:

- $*$ is the element-wise multiplication
- $\cdot$ is the matrix multiplication 

Notice that we have now included the factor $\frac{1}{m}$, where $m$ is the batch size. This averages the gradients over the batch, so that the size of the update does not depend on the batch size.

In this task we consider the squared error loss, scaled by $\frac{1}{2}$ so that the factor 2 cancels when differentiating:
$$
J( A,Y) = \frac{( A-Y)^2}{2}
$$
thus
$$
dA =  A-Y
$$
where $A$ is the predicted output in the last layer and $Y$ is the ground truth label

In [None]:
def backwardprop(X, y_true, A2, Z2, A1, Z1, W2, funct1, funct2):
    """
    Exercise stub: implement the backward propagation equations.

    Work from the output layer back to the hidden layer, following
    dZ = dA * g'(Z), dW = dZ . A_prev^T / m, db = dZ . 1 / m and
    dA_prev = W^T . dZ.

    Arguments:
    X: Input data, shape (n_inputlayer, m)
    y_true: Ground truth label, shape (n_outputlayer, m)
    A2: activated output of 2nd layer, shape (n_outputlayer,m)
    Z2: preactivated output of 2nd layer, shape (n_outputlayer,m)
    A1: activated output of 1st layer, shape (n_hiddenlayer,m)
    Z1: preactivated output of 1st layer, shape (n_hiddenlayer,m)
    W2: weights of 2nd layer, shape (n_outputlayer, n_hiddenlayer)
    funct1: activation function of 1st layer
    funct2: activation function of 2nd layer

    Returns (in this order):
    db2: Gradient of loss w.r.t. b2, shape (n_outputlayer,1)
    dW2: Gradient of loss w.r.t. W2, shape (n_outputlayer,n_hiddenlayer)
    db1: Gradient of loss w.r.t. b1, shape (n_hiddenlayer,1)
    dW1: Gradient of loss w.r.t. W1, shape (n_hiddenlayer,n_inputlayer)
    """
    # YOUR CODE STARTS HERE
    m = # your code here, batch size (number of columns of X)
    dA2 = # your code here, shape (n_outputlayer,m) -- consider A-Y 
    dZ2 = # your code here, shape (n_outputlayer,m) -- element wise multiplication of dA2 and funct2'(Z2)
    dW2 = # your code here, shape (n_outputlayer,n_hiddenlayer)
    db2 = # your code here, shape (n_outputlayer,1)
    ###################
    dA1 = # your code here, shape (n_hiddenlayer,m)
    dZ1 = # your code here, shape (n_hiddenlayer,m) -- element wise multiplication of dA1 and funct1'(Z1)
    dW1 = # your code here, shape (n_hiddenlayer,n_inputlayer)
    db1 = # your code here, shape (n_hiddenlayer,1)
    # YOUR CODE ENDS HERE
    return db2, dW2, db1, dW1


In [None]:
def backwardprop(X, y_true, A2, Z2, A1, Z1, W2, funct1, funct2):
    """
    Compute the gradients of the loss J = (A2 - Y)^2 / 2 for both layers.

    The gradients are averaged over the batch, i.e. divided by the number
    of samples m.

    Arguments:
    X: Input data, shape (n_inputlayer, m)
    y_true: Ground truth label, shape (n_outputlayer, m)
    A2: activated output of 2nd layer, shape (n_outputlayer,m)
    Z2: preactivated output of 2nd layer, shape (n_outputlayer,m)
    A1: activated output of 1st layer, shape (n_hiddenlayer,m)
    Z1: preactivated output of 1st layer, shape (n_hiddenlayer,m)
    W2: weights of 2nd layer, shape (n_outputlayer, n_hiddenlayer)
    funct1: activation function of 1st layer
    funct2: activation function of 2nd layer

    Returns (in this order):
    db2: Gradient of loss w.r.t. b2, shape (n_outputlayer,1)
    dW2: Gradient of loss w.r.t. W2, shape (n_outputlayer,n_hiddenlayer)
    db1: Gradient of loss w.r.t. b1, shape (n_hiddenlayer,1)
    dW1: Gradient of loss w.r.t. W1, shape (n_hiddenlayer,n_inputlayer)
    """
    # Samples are stored column-wise, so the batch size is the number of
    # columns of X (shape[0] would be the number of input features).
    m = X.shape[1]

    # Output layer.
    dA2 = A2 - y_true
    dZ2 = dA2 * derivactfunc(Z2, funct2)
    dW2 = dZ2.dot(A1.T) / m
    # Summing over the sample axis is the product of dZ2 with a ones vector.
    db2 = np.sum(dZ2, axis=1, keepdims=True) / m

    # Hidden layer: push the error back through W2 first.
    dA1 = W2.T.dot(dZ2)
    dZ1 = dA1 * derivactfunc(Z1, funct1)
    dW1 = dZ1.dot(X.T) / m
    db1 = np.sum(dZ1, axis=1, keepdims=True) / m
    return db2, dW2, db1, dW1

In [None]:
class TestBackwardProp(unittest.TestCase):
    """Shape check of the gradients returned by backwardprop."""

    def test_backwardprop_shapes(self):
        # 2 inputs, 2 hidden units, 1 output unit, batch of 2 samples.
        X = np.array([[0.5, 0.2], [0.1, 0.3]])   # (2, 2)
        y_true = np.array([[0.7, 0.8]])          # (1, 2)
        A2 = np.array([[0.6, 0.7]])              # (1, 2)
        Z2 = np.array([[0.1, 0.2]])              # (1, 2)
        A1 = np.array([[0.5, 0.6], [0.4, 0.5]])  # (2, 2)
        Z1 = np.array([[0.3, 0.4], [0.2, 0.3]])  # (2, 2)
        W2 = np.array([[0.1, 0.2]])              # (1, 2)

        db2, dW2, db1, dW1 = backwardprop(
            X, y_true, A2, Z2, A1, Z1, W2, 'sigmoid', 'sigmoid'
        )

        # Every gradient must have the shape of the parameter it updates.
        self.assertEqual(
            dW1.shape, (2, 2),
            msg=f"Expected dW1 shape (2, 2), got {dW1.shape}",
        )
        self.assertEqual(
            db1.shape, (2, 1),
            msg=f"Expected db1 shape (2, 1), got {db1.shape}",
        )
        self.assertEqual(
            dW2.shape, (1, 2),
            msg=f"Expected dW2 shape (1, 2), got {dW2.shape}",
        )
        self.assertEqual(
            db2.shape, (1, 1),
            msg=f"Expected db2 shape (1, 1), got {db2.shape}",
        )


if __name__ == '__main__':
    # argv=[''] stops unittest from parsing the process's real command line;
    # exit=False stops it from calling sys.exit() once the tests have run.
    unittest.main(argv=[''], exit=False)

## Gradient descent

Once we have computed all the derivatives, we plug them into the gradient descent method to update the weights and biases:

$$
\begin{align*}
W^{[l]} & = W^{[l]} - \alpha \,dW^{[l]}\\
b^{[l]} & = b^{[l]} - \alpha \, db^{[l]}
\end{align*}
$$
where $\alpha$ is the learning rate

In [None]:
def gradient_descent(b1, b2, W1, W2,db2, dW2, db1, dW1, lr) :
    """
    Exercise stub: one gradient descent update of all parameters.

    Apply theta = theta - lr * d_theta to each bias and weight matrix.

    Arguments:
    b1: bias of 1st layer, shape (n_hiddenlayer, 1)
    b2: bias of 2nd layer, shape (n_outputlayer, 1)
    W1: weights of 1st layer, shape (n_hiddenlayer, n_inputlayer)
    W2: weights of 2nd layer, shape (n_outputlayer, n_hiddenlayer)
    db2: Gradient of loss w.r.t. b2, shape (n_outputlayer,1)
    dW2: Gradient of loss w.r.t. W2, shape (n_outputlayer,n_hiddenlayer)
    db1: Gradient of loss w.r.t. b1, shape (n_hiddenlayer,1)
    dW1: Gradient of loss w.r.t. W1, shape (n_hiddenlayer,n_inputlayer)
    lr: learning rate

    Returns:
    b1: Updated bias of 1st layer, shape (n_hiddenlayer, 1)
    b2: Updated bias of 2nd layer, shape (n_outputlayer, 1)
    W1: Updated weights of 1st layer, shape (n_hiddenlayer, n_inputlayer)
    W2: Updated weights of 2nd layer, shape (n_outputlayer, n_hiddenlayer)
    """
    # YOUR CODE STARTS HERE
    b1 = #your code here
    b2 = #your code here
    W1 = #your code here
    W2 = #your code here
    # YOUR CODE ENDS HERE
    return b1, b2, W1, W2

In [None]:
def gradient_descent(b1, b2, W1, W2,db2, dW2, db1, dW1, learning_rate) :
    """
    Take one gradient descent step on every parameter of the network.

    Each parameter is moved against its gradient, scaled by the learning
    rate. The inputs are not modified; new values are returned.

    Arguments:
    b1: bias of 1st layer, shape (n_hiddenlayer, 1)
    b2: bias of 2nd layer, shape (n_outputlayer, 1)
    W1: weights of 1st layer, shape (n_hiddenlayer, n_inputlayer)
    W2: weights of 2nd layer, shape (n_outputlayer, n_hiddenlayer)
    db2: Gradient of loss w.r.t. b2, shape (n_outputlayer,1)
    dW2: Gradient of loss w.r.t. W2, shape (n_outputlayer,n_hiddenlayer)
    db1: Gradient of loss w.r.t. b1, shape (n_hiddenlayer,1)
    dW1: Gradient of loss w.r.t. W1, shape (n_hiddenlayer,n_inputlayer)
    learning_rate: step size multiplying each gradient

    Returns:
    b1: Updated bias of 1st layer, shape (n_hiddenlayer, 1)
    b2: Updated bias of 2nd layer, shape (n_outputlayer, 1)
    W1: Updated weights of 1st layer, shape (n_hiddenlayer, n_inputlayer)
    W2: Updated weights of 2nd layer, shape (n_outputlayer, n_hiddenlayer)
    """
    def descend(param, grad):
        # theta <- theta - alpha * d_theta
        return param - learning_rate*grad

    return descend(b1, db1), descend(b2, db2), descend(W1, dW1), descend(W2, dW2)

In [None]:
class TestGradientDescent(unittest.TestCase):
    """Shape check of the parameters returned by gradient_descent."""

    def test_gradient_descent_shapes(self):
        # Parameters of a 2-2-1 network.
        b1 = np.array([[0.1], [0.2]])             # (2, 1)
        b2 = np.array([[0.3]])                    # (1, 1)
        W1 = np.array([[0.4, 0.5], [0.6, 0.7]])   # (2, 2)
        W2 = np.array([[0.8, 0.9]])               # (1, 2)
        # One gradient per parameter, each with the parameter's shape.
        db2 = np.array([[0.1]])                   # (1, 1)
        dW2 = np.array([[0.2, 0.3]])              # (1, 2)
        db1 = np.array([[0.4], [0.5]])            # (2, 1)
        dW1 = np.array([[0.6, 0.7], [0.8, 0.9]])  # (2, 2)
        learning_rate = 0.01

        b1_updated, b2_updated, W1_updated, W2_updated = gradient_descent(
            b1, b2, W1, W2, db2, dW2, db1, dW1, learning_rate
        )

        # The update must not change the shape of any parameter.
        self.assertEqual(
            b1_updated.shape, b1.shape,
            msg=f"Expected b1 shape {b1.shape}, got {b1_updated.shape}",
        )
        self.assertEqual(
            b2_updated.shape, b2.shape,
            msg=f"Expected b2 shape {b2.shape}, got {b2_updated.shape}",
        )
        self.assertEqual(
            W1_updated.shape, W1.shape,
            msg=f"Expected W1 shape {W1.shape}, got {W1_updated.shape}",
        )
        self.assertEqual(
            W2_updated.shape, W2.shape,
            msg=f"Expected W2 shape {W2.shape}, got {W2_updated.shape}",
        )


if __name__ == '__main__':
    # argv=[''] stops unittest from parsing the process's real command line;
    # exit=False stops it from calling sys.exit() once the tests have run.
    unittest.main(argv=[''], exit=False)

## Training process
This is an iterative process where parameters are repeatedly updated, aiming to make the error approach zero:

 step 1: Forward propagation given parameters \
 step 2: Calculate error \
 step 3: Backward propagation \
 step 4: Update parameters with gradient descent \
 step 5: Repeat steps 1 to 4

In [None]:
def trainproc(x_train, y_train, n_inputlayer, n_hiddenlayer, n_outputlayer, iteration, lr, actfunct1,actfunct2 ) :
    """
    Implement training algorithm
    
    Arguments:
    x_train:train data, shape (m, n_inputlayer)
    y_train: ground truth label, shape (n_outputlayer, m)
    n_inputlayer: number of neurons in the input layer
    n_hiddenlayer: number of neurons in the hidden layer
    n_outputlayer: number of neurons in the output layer
    iteration: number of iterations for training
    lr: learning rate
    actfunct1: activation function in the 1st layer
    actfunct2: activation function in the 2nd layer
        
    Returns:
    b1: Updated bias of 1st layer, shape (n_hiddenlayer, 1)
    b2: Updated bias of 2nd layer, shape (n_outputlayer, 1)
    W1: Updated weights of 1st layer, shape (n_hiddenlayer, n_inputlayer)
    W2: Updated weights of 2nd layer, shape (n_outputlayer, n_hiddenlayer)
    """
    
    # YOUR CODE STARTS HERE
    # Initialize W1,b1, W2 and b2 with random values you can use np.random.rand(n_x,n_y)
    np.random.seed(0) 
    W1 = #your code here - initialize W1
    b1 = #your code here - initialize b1
    W2 = #your code here - initialize W2
    b2 = #your code here - initialize b2
    error = #your code here - initialize error with cero
    for i in range(iteration) :
        #call the forward propagation function in this line
        if n_outputlayer == 1 :  
            error =  # your code here, call the SSE function. This type of error only works when n_outputlayer = 1
        else
            error = # your code here, otherwhise cero.
        #call the backward propagation function in this line
        #call the gradient descendent function in this line
        #print the iteration and the error 
    # YOUR CODE ENDS HERE
    return b1, b2, W1, W2


In [None]:
def trainproc(x_train, y_train, n_inputlayer, n_hiddenlayer, n_outputlayer, iteration, lr, actfunct1,actfunct2 ) :
    """
    Train the 2-layer network with full-batch gradient descent.

    The global numpy random generator is re-seeded with 0 on every call,
    so the initial parameters are the same each time. The iteration index
    and the error are printed once per iteration.

    Arguments:
    x_train: train data, handed unchanged to forwardprop, which expects
             shape (n_inputlayer, m)
    y_train: ground truth label, shape (n_outputlayer, m)
    n_inputlayer: number of neurons in the input layer
    n_hiddenlayer: number of neurons in the hidden layer
    n_outputlayer: number of neurons in the output layer
    iteration: number of iterations for training
    lr: learning rate
    actfunct1: activation function in the 1st layer
    actfunct2: activation function in the 2nd layer

    Returns:
    b1: Updated bias of 1st layer, shape (n_hiddenlayer, 1)
    b2: Updated bias of 2nd layer, shape (n_outputlayer, 1)
    W1: Updated weights of 1st layer, shape (n_hiddenlayer, n_inputlayer)
    W2: Updated weights of 2nd layer, shape (n_outputlayer, n_hiddenlayer)
    """
    # Draw order (W1, b1, W2, b2) determines the initial values; keep it.
    np.random.seed(0)
    W1 = np.random.rand(n_hiddenlayer, n_inputlayer)
    b1 = np.random.rand(n_hiddenlayer, 1)
    W2 = np.random.rand(n_outputlayer, n_hiddenlayer)
    b2 = np.random.rand(n_outputlayer, 1)

    # The error is only reported for single-output networks; otherwise 0.
    single_output = n_outputlayer == 1

    for i in range(iteration):
        A2, Z2, A1, Z1 = forwardprop(x_train, W1, W2, b1, b2, actfunct1, actfunct2)
        error = sserror(y_train, A2) if single_output else 0
        # backwardprop returns (db2, dW2, db1, dW1), the order gradient_descent takes.
        grads = backwardprop(x_train, y_train, A2, Z2, A1, Z1, W2, actfunct1, actfunct2)
        b1, b2, W1, W2 = gradient_descent(b1, b2, W1, W2, *grads, lr)
        print('iteration :', i , 'error:', error)
    return b1, b2, W1, W2

## Example test - XOR table

Now it is time to validate your implementation with experimental data. The XOR table is not linearly separable, so it is
a good example to test your neural network. For this, consider the following:
- x_train is the XOR table 4 rows and 2 columns
- y_train is XOR output, 4 rows and 1 column
- n_inputlayer = 2
- n_hiddenlayer = 4
- n_outputlayer = 1
- iterations = 500
- learning rate = 0.1
- actfunct1 = 'tanh'
- actfunct2 = 'Relu'

In [None]:
# XOR truth table: one sample per row, one input per column -> shape (4, 2).
x_train = np.array([
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1],
])
# XOR of the two inputs of the matching row, as a column -> shape (4, 1).
y_train = np.array([0, 1, 1, 0]).reshape(-1, 1)

In [None]:
# Do you need to reshape/transpose your x_train, y_train?
# Hint: forwardprop documents X as (n_inputlayer, m), while x_train above has one sample per row.
b1, b2, W1, W2 = #your code here, call trainproc with the previous settings

In [None]:
# Now get the prediction from forward propagation: fill in the inputs (in the layout forwardprop expects),
# W1, W2, b1 and b2 from the previous cell, and the two activation names.
# Element [0] of the returned tuple is A2, the network output; compare its rounded value with y_train.
np.round(forwardprop(,,,,,,)[0])

In [None]:
# Display the ground truth labels for comparison with the rounded predictions.
y_train

### Congrats !!