In [None]:
import numpy as np
import matplotlib.pyplot as plt
from icecream import ic

# Multi-Layered Neural Networks and the Backpropagation Algorithm

To make computing the potential of a neuron easy, the weights of the synapses
incoming to a neuron are stored as one column of the weight matrix.
 
Let us take a neural network with the topology [2,2,1], i.e., the network
has 2 input neurons, 2 hidden neurons in a single hidden layer, and one
output neuron. Let the weights of synapses between the input and the
hidden layer be in the following matrix:

In [None]:
# weights input -> hidden: row i = source (input) neuron, column j = target (hidden) neuron
w_i_h = np.array([
    [0.5, -0.5],
    [1.5, 0.5],
])

`w_i_h[i,j]` is the weight of the synapse from the input `i` into the
hidden neuron `j`. I.e., each row of the weight matrix corresponds to
the weights of synapses leading **from** one neuron!

Let the synaptic weights between the hidden and the output layer
be in the matrix:

In [None]:
# weights hidden -> output: one row per hidden neuron, one column for the single output neuron
w_h_o = np.array([2.0, -1.0]).reshape(-1, 1)
ic(w_h_o)

ic| w_h_o: array([[ 2.],
                  [-1.]])


array([[ 2.],
       [-1.]])

`w_h_o[i,0]` is the weight of the connection from the hidden neuron `i` 
to the output neuron. Thresholds of the hidden neurons are in the vector:

In [None]:
 b_h = np.array([0, 0.5])

and the threshold of the output neuron is:

In [None]:
# threshold of the single output neuron, kept as a 1-element vector
b_o = np.array((-0.5,))

Hence the weights from the input layer into the hidden layer, extended by a
virtual neuron with the fixed output 1 (which represents the thresholds), are:

In [None]:
# stack the hidden thresholds below the weights as one extra row,
# i.e. the weights leading from the virtual neuron with output 1
w_i_hb = np.vstack([w_i_h, b_h])
w_i_hb

array([[ 0.5, -0.5],
       [ 1.5,  0.5],
       [ 0. ,  0.5]])

The weights from the hidden layer into the output layer,
extended by a virtual neuron with the fixed output 1, are:

In [None]:
# stack the output threshold below the hidden -> output weights as one extra row
w_h_ob = np.vstack([w_h_o, b_o])
w_h_ob

array([[ 2. ],
       [-1. ],
       [-0.5]])

A sigmoidal transfer function $$\operatorname{logsig}(x) = \frac{1}{1 + e^{-\lambda x}}$$ can be implemented as follows:

In [None]:
def sigmoid(x, lam=1.0):
    """Logistic sigmoid transfer function with slope ``lam``.

    Computes ``1 / (1 + exp(-lam * x))`` element-wise.

    Parameters
    ----------
    x : scalar or array_like
        Potential(s) of the neuron(s). NumPy arrays as well as plain Python
        numbers, lists and tuples are accepted.
    lam : float, optional
        Slope of the sigmoid. Defaults to 1.0.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        The sigmoid of ``x``; for array input the result has the shape of ``x``.
    """
    # Convert first: `-lam * x` on a plain list/tuple would raise a TypeError.
    x = np.asarray(x)
    return 1 / (1 + np.exp(-lam * x))

In [None]:
# the sigmoid formula evaluated by hand for x = 3 and slope 1
1/(1+np.exp(-3))

0.9525741268224334

In [None]:
# the same value computed with sigmoid() and its default slope
sigmoid(3)

0.9525741268224334

This is the sigmoid function with the slope $\lambda$. The default value for the slope is $\lambda = 1$.

## Tasks:

* *Let $\lambda=1$. Compute the output of the network for the input patterns `p1` and `p2`.*

In [None]:
lam = 1.0                  # slope of the sigmoid used in the tasks
p1 = np.array((-1, 1))     # first input pattern
p2 = np.array((1, -1))     # second input pattern

In [None]:
# Forward pass for p1. Each layer's input is extended by the constant 1 of the
# virtual neuron; the slope `lam` is passed explicitly so the result follows
# the slope chosen for the task.
hidden_p1 = sigmoid(np.dot(np.r_[p1, 1], w_i_hb), lam)      # outputs of the hidden layer
y_o = sigmoid(np.dot(np.r_[hidden_p1, 1], w_h_ob), lam)     # output of the network
print("y_o\n",y_o)

y_o
 [0.53607289]


In [None]:
# Forward pass for p2, written with the @ operator; `lam` is passed explicitly
# so the result follows the slope chosen for the task.
hidden_p2 = sigmoid(np.r_[p2, 1.] @ w_i_hb, lam)            # outputs of the hidden layer
print(sigmoid(np.r_[hidden_p2, 1] @ w_h_ob, lam))           # output of the network


[0.4158926]


In [None]:
print(w_i_hb)

# ---- pattern p1 ----
print("p1")

p1_ex = np.r_[p1, 1]        # p1 extended by the constant input 1 of the virtual neuron
print("p1 extended\n", p1_ex)
print(sigmoid(np.dot(p1_ex, w_i_hb[:,0]), lam))     # output of the first hidden neuron
print(sigmoid(np.dot(p1_ex, w_i_hb[:,1]), lam))     # output of the second hidden neuron

y_h = sigmoid(np.dot(p1_ex, w_i_hb), lam)           # outputs of the whole hidden layer
print("y_h\n",y_h)
y_o = sigmoid(np.dot(np.r_[y_h,1], w_h_ob), lam)    # output of the output layer
print("y_o\n",y_o)

# ---- pattern p2 ----
print("p2")
p2_ex = np.r_[p2, 1]        # p2 extended by the constant input 1 of the virtual neuron
print(sigmoid(np.dot(p2_ex, w_i_hb[:,0]), lam))     # output of the first hidden neuron
print(sigmoid(np.dot(p2_ex, w_i_hb[:,1]), lam))     # output of the second hidden neuron

y_h = sigmoid(np.dot(p2_ex, w_i_hb), lam)           # outputs of the whole hidden layer
print("y_h\n",y_h)
y_o = sigmoid(np.dot(np.r_[y_h,1], w_h_ob), lam)    # output of the output layer
print("y_o\n",y_o)

[[ 0.5 -0.5]
 [ 1.5  0.5]
 [ 0.   0.5]]
p1
p1 extended
 [-1  1  1]
0.7310585786300049
0.8175744761936437
y_h
 [0.73105858 0.81757448]
y_o
 [0.53607289]
p2
0.2689414213699951
0.3775406687981454
y_h
 [0.26894142 0.37754067]
y_o
 [0.4158926]


* *Compute the output of the network for the whole training set `X` consisting of the patterns `p1` and `p2`.*

In [None]:
X = np.vstack((p1, p2))                         # training set: one pattern per row
print("X\n", X)

# extra column of ones = output of the virtual neuron for every pattern
X_ex = np.c_[X, np.ones(X.shape[0])]
print(X_ex)

# hidden-layer outputs for all patterns at once (row i belongs to X[i]);
# the slope `lam` is passed explicitly so it is honoured by the forward pass
y_h = sigmoid(X_ex @ w_i_hb, lam)
print("y_h\n", y_h)

y_h_ex = np.c_[y_h, np.ones(y_h.shape[0])]      # hidden outputs extended by the virtual neuron
y_o = sigmoid(y_h_ex @ w_h_ob, lam)             # network outputs, one row per pattern
print("y_o\n", y_o)

X
 [[-1  1]
 [ 1 -1]]
[[-1.  1.  1.]
 [ 1. -1.  1.]]
y_h
 [[0.73105858 0.81757448]
 [0.26894142 0.37754067]]
y_o
 [[0.53607289]
 [0.4158926 ]]


The input pattern `p1` is a training vector with the desired
output 0.9 and the input pattern `p2` is also a training pattern with the desired output 0.8. Hence we can store the desired outputs in an array `d`, where row `d[i]` is the desired output for the pattern `X[i]`.

In [None]:
# desired outputs as a column: row i belongs to pattern X[i]
d = np.array([0.9, 0.8]).reshape(-1, 1)
print("d\n",d)

d
 [[0.9]
 [0.8]]


* *What is the error of the network on each of the patterns `p1` and `p2`?*

In [None]:
# error on p1: half of the squared difference between desired and actual output
0.5 * ((d[0] - y_o[0]) ** 2).sum()

0.06622147161927612

In [None]:
# error on p2: half of the squared difference between desired and actual output
0.5 * ((d[1] - y_o[1]) ** 2).sum()

0.07376924828933176

* *What is the mean squared error (MSE) of the network on the whole training set?*

In [None]:
# per-pattern errors (half of the squared difference), shown for inspection
ic(1/2*np.sum((d-y_o)**2, axis=1))
# mean of the per-pattern errors over the whole training set
np.mean(1/2*np.sum((d-y_o)**2, axis=1))

ic| 1/2*np.sum((d-y_o)**2, axis=1): array([0.06622147, 0.07376925])


0.06999535995430395

* *How will the weights of the network change after one step of the
  backpropagation learning algorithm (without momentum) with the training pattern `p1`
  and the learning rate $\alpha = 0.2$?*

In [None]:
alpha = 0.2   # learning rate of the backpropagation step

The error term at neuron $j$ in the output layer is

$$\hspace{4em} \displaystyle \delta_j = (d_j-y_j)\cdot \lambda  y_j (1 - y_j)$$

In [None]:
# Forward pass for the whole training set (one pattern per row); the slope
# `lam` is passed explicitly so it matches the slope used in the error terms.
y_h = sigmoid(np.c_[X, np.ones(X.shape[0])] @ w_i_hb, lam)
y_o = sigmoid(np.c_[y_h, np.ones(X.shape[0])] @ w_h_ob, lam)

# error terms of the output neuron, one row per pattern
delta_o = (d - y_o) * lam * y_o * (1. - y_o)
print(delta_o[0])       # error term for p1

[0.09050822]


The error term at neuron $j$ in a hidden layer
$$\hspace{4em} \displaystyle \delta_j = \big(\sum_k \delta_k w_{jk}\big) \cdot \lambda y_j (1 - y_j)$$

In [None]:
# output-layer error terms; row i belongs to pattern X[i]
delta_o

array([[0.09050822],
       [0.09330965]])

In [None]:
# p1's output error term multiplied by every row of w_h_ob
# (the last row is the threshold weight of the virtual neuron)
ic((delta_o[0] * w_h_ob))

ic| delta_o[0] * w_h_ob: array([[ 0.18101643],
                                [-0.09050822],
                                [-0.04525411]])


array([[ 0.18101643],
       [-0.09050822],
       [-0.04525411]])

In [None]:
# hidden-layer outputs; row i belongs to pattern X[i]
y_h

array([[0.73105858, 0.81757448],
       [0.26894142, 0.37754067]])

In [None]:
# hidden outputs extended by a column of ones (output of the virtual neuron)
y_h_ex = np.c_[y_h, np.ones((len(y_h), 1))]
y_h_ex

array([[0.73105858, 0.81757448, 1.        ],
       [0.26894142, 0.37754067, 1.        ]])

In [None]:
# Error terms of the two hidden neurons for pattern p1 (row 0 of y_h / delta_o).
# Only the real hidden neurons get an error term, so the threshold row of
# w_h_ob (weights of the virtual neuron) is left out of the back-propagated sum.
y_h_p1 = y_h[0].reshape(-1, 1)
delta_h = (delta_o[0] * w_h_ob[:-1]) * lam * y_h_p1 * (1 - y_h_p1)            # delta terms at the hidden layer
print(delta_h)

# weight change = alpha * (output of the source neuron) * (error term of the target neuron);
# both updates use the error terms computed with the original weights
w_h_ob1 = w_h_ob + alpha * delta_o[0] * y_h_ex[0].reshape(-1, 1)     # new weights from the hidden to the output layer
w_i_hb1 = w_i_hb + alpha * np.r_[p1, 1].reshape(-1, 1) * delta_h.T   # new weights from the input to the hidden layer
print("w_h_ob1\n", w_h_ob1)
print("w_i_hb1\n", w_i_hb1)


[[ 0.03558999]
 [-0.01349898]
 [-0.        ]]


   
* *How will the output of the network for input `p1` change after the first 
  iteration of the backpropagation algorithm?*

In [None]:
# One backpropagation step on p1, starting from the original weights
# w_i_hb / w_h_ob, followed by a forward pass with the updated weights.
x_p1 = np.r_[p1, 1].reshape(1, -1)                  # p1 as a row, extended by the virtual input 1

# forward pass with the original weights
y_h_old = sigmoid(x_p1 @ w_i_hb, lam)
y_h_old_ex = np.c_[y_h_old, [[1]]]
y_o_old = sigmoid(y_h_old_ex @ w_h_ob, lam)

# error terms (both computed with the original weights) and the updated weights
delta_out = (d[0] - y_o_old) * lam * y_o_old * (1 - y_o_old)
delta_hid = (delta_out @ w_h_ob[:-1].T) * lam * y_h_old * (1 - y_h_old)
w_h_ob_new = w_h_ob + alpha * y_h_old_ex.T @ delta_out
w_i_hb_new = w_i_hb + alpha * x_p1.T @ delta_hid

# forward pass with the updated weights
y_h_new = sigmoid(x_p1 @ w_i_hb_new, lam)
y_o_new = sigmoid(np.c_[y_h_new, [[1]]] @ w_h_ob_new, lam)

print("output for p1 before the step\n", y_o_old)
print("output for p1 after the step\n", y_o_new)
print("error before the step:", 0.5 * np.sum((d[0] - y_o_old) ** 2))
print("error after the step: ", 0.5 * np.sum((d[0] - y_o_new) ** 2))

* *Estimate the number of iterations over the pattern `p1` necessary to obtain an error "close" to 0*

In [None]:
alpha = 0.2
lam = 1.0

# Repeat the backpropagation step on p1, starting from the original weights,
# until the error 1/2 * (d - y)^2 drops below `tolerance`.
tolerance = 1e-4            # what is treated as "close" to 0 here
max_iterations = 100000     # safety cap so the loop always terminates

x_row = np.r_[p1, 1].reshape(1, -1)                 # p1 as a row, extended by the virtual input 1
w_ih, w_ho = w_i_hb.copy(), w_h_ob.copy()           # work on copies; the original weights stay untouched
for n_iterations in range(max_iterations + 1):
    hid = sigmoid(x_row @ w_ih, lam)
    hid_ex = np.c_[hid, [[1]]]
    out = sigmoid(hid_ex @ w_ho, lam)
    error = 0.5 * np.sum((d[0] - out) ** 2)
    if error < tolerance:
        break
    # error terms are computed with the current weights before either matrix is updated
    d_out = (d[0] - out) * lam * out * (1 - out)
    d_hid = (d_out @ w_ho[:-1].T) * lam * hid * (1 - hid)
    w_ho = w_ho + alpha * hid_ex.T @ d_out
    w_ih = w_ih + alpha * x_row.T @ d_hid

print("iterations over p1:", n_iterations)
print("error:", error)



**Notation:**

Using `numpy` for working with vectors and matrices when we train a neural network has some problems:
* Input: input patterns are stored as rows in a 2D matrix $X$, but one input pattern is a 1D vector.
* Output, desired output: output patterns are stored as rows in a 2D matrix $Y$, however one output pattern is a 1D vector.
* Output of hidden neurons: can be stored in rows of a 2D matrix if we compute output for more than one pattern, but it is a 1D vector if we compute with one input vector.

A possible solution is to *store vectors as two-dimensional arrays*:
* Then we can distinguish row and column vectors.
* If we work with a single vector, we will convert it into a row vector.

In [None]:
p1_2d = p1.reshape(1,-1)
print("p1_2d\n",p1_2d)

In [None]:
# output of the hidden neurons
y_h = ...
print("y_h\n", y_h)

In [None]:
# output of the network 
y_o = ...
print("y_o\n", y_o)

In [None]:
delta_o = ...
print("delta_o\n", delta_o)

Note that `delta_o` **is a row vector**? Why?

In [None]:
print("np.c_[y_h,[[1]]]\n", np.c_[y_h,[[1]]])

w_h_ob1 = w_h_ob + ...
print("w_h_ob1\n", w_h_ob1)

In [None]:
delta_h = ...
print("delta_h\n", delta_h)

In [None]:

w_i_hb1 = ...
print("w_i_hb1\n", w_i_hb1)

Now for the second pattern `p2`.

In [None]:
p2_2d = p2.reshape(1,-1)
print("p2_2d\n",p2_2d)

In [None]:
# output of the hidden neurons

y_h = ...
print("y_h\n", y_h)

In [None]:
y_o = ...
print("y_o\n", y_o)

In [None]:

delta_o = ...
print("delta_o\n", delta_o)

In [None]:
w_h_ob1 = ...
print("w_h_ob1\n", w_h_ob1)

In [None]:
delta_h = ...
print("delta_h\n", delta_h)

In [None]:
w_i_hb1 = ... 
print("w_i_hb1\n", w_i_hb1)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=61b28771-206e-4d0b-8437-9015c75dcd39' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>