In [64]:
import numpy as np

In [65]:
def relu(x):
    return np.maximum(x, 0)

In [66]:
def softmax(x):
    return np.exp(x)/(np.sum(np.exp(x), axis = 0))

In [67]:
def cross_entropy(output, target):
    return (target * np.log(output) + (1-target) * np.log(1-output)) * (-1.0)

In [99]:
# Function to build neuron network from scatch 
# w_ji, w_jk, w_ko: Weight at layer j, k ,o
# i : input
# t : target
# l_m : maximum of loss, where to stop
def build_NN(w_ij, w_jk, w_ko, i, t, l_m):
    # set learning rate
    l_rate = 0.01
    
    # Set temporary loss
    L_sum = 1.0
    
    #Bias
    b = 0.3
    
    # Count nums time of backpropagation
    count = 0
    
    
    # Keep update weight until loss is good enough
    while True:
        
        ### Forward path
        # Calculate input at j by dot product and add bias b = 0.3
        j = np.dot(i, w_ij) + b
        # output at layer j by apply activation function (Relu)
        j_out = relu(j)
        
        # Calculate input at k by dot product and add bias b = 0.3
        k = np.dot(j_out, w_jk) + b
        # output at layer k by apply activation function (Relu)
        k_out = relu(k)
        
        # Calculate input at o by dot product and add bias b = 0.3
        o =  np.dot(k_out, w_ko) + b
        
        # output at layer o by apply activation function (softmax)
        # Return softmax output (activation function)
        o_out = softmax(o)
        
        # Calculate cross entropy loss function
        L = cross_entropy(o_out, t)
        
        # Loss sum
        L_sum = sum(L)
        
        if(L_sum <= l_m): # why break here ??? --> should cross-entropy always < 1.0
            break
        
        # Otherwise do Backpropagation
        
        # Gradient of layer O
        g_ko = k_out.reshape(3,1) * (((-1.0) * (t / o_out)) + ((1 - t) / (1 - o_out))) * (o_out - o_out * o_out)
        
        # Gradient of layer K
        dL_dk = np.sum((((-1.0) * (t / o_out)) + ((1 - t) / (1 - o_out))) * (o_out - o_out * o_out) * w_ko, axis = 1)
        g_jk = j_out.reshape(3,1) * dL_dk
        
        # Gradient of layer J
        # Get diagonal of w_ij (weigth ij)
        w_jj_kj = np.diag(w_jk)
        g_ij = i.reshape(3,1) * (dL_dk * w_jj_kj)
        
        ### Set new weigth
        w_ij = w_ij - (g_ij * l_rate)
        w_jk = w_jk - (g_jk * l_rate)
        w_ko = w_ko - (g_ko * l_rate)
        
        # Update count
        count = count + 1
    
    return (w_ij, w_jk, w_ko, o_out, L_sum, count)
        

### Testing running

In [100]:
# input at i
i = np.array([0.1, 0.2, 0.7])

In [101]:
# Weight from layer i to j
w_ij = np.array([0.1 ,0.2, 0.3, 0.3, 0.2, 0.7, 0.4, 0.3, 0.9]).reshape(3,3)
w_ij

array([[0.1, 0.2, 0.3],
       [0.3, 0.2, 0.7],
       [0.4, 0.3, 0.9]])

In [102]:
# Weight from layer j to k
w_jk = np.array([0.2, 0.3, 0.5, 0.3, 0.5, 0.7, 0.6, 0.4, 0.8]).reshape(3,3)
w_jk

array([[0.2, 0.3, 0.5],
       [0.3, 0.5, 0.7],
       [0.6, 0.4, 0.8]])

In [103]:
# Weight from layer k to o
w_ko = np.array([0.1, 0.4, 0.8, 0.3, 0.7, 0.2, 0.5, 0.2, 0.9]).reshape(3,3)
w_ko

array([[0.1, 0.4, 0.8],
       [0.3, 0.7, 0.2],
       [0.5, 0.2, 0.9]])

In [104]:
# Array of target
t = np.array([1.0, 0.0, 0.0])

In [105]:
# set learning rate
l_rate = 0.01

In [106]:
# Maximum Loss
l_m = 0.01

In [107]:
result = build_NN(w_ij, w_jk, w_ko, i, t, l_m)
my_out = result[3]
result

(array([[0.09940437, 0.20408816, 0.32922893],
        [0.29880874, 0.20817633, 0.75845786],
        [0.39583058, 0.32861715, 1.10460249]]),
 array([[0.16126579, 0.42938604, 0.68898428],
        [0.26636851, 0.61639099, 0.87037136],
        [0.53769613, 0.63410452, 1.14424305]]),
 array([[ 0.6672003 ,  0.1980114 ,  0.4347883 ],
        [ 0.97509169,  0.45492945, -0.23002114],
        [ 1.53561836, -0.17588125,  0.24026289]]),
 array([0.99501222, 0.00221463, 0.00277314]),
 0.009994345544873688,
 927)

### Step By Step Explaination

In [77]:
w_ij

array([[0.1, 0.2, 0.3],
       [0.3, 0.2, 0.7],
       [0.4, 0.3, 0.9]])

In [78]:
# Calculate input at j by dot product and add bias b = 0.3
b = 0.3
j = np.dot(i, w_ij) + b
j

array([0.65, 0.57, 1.1 ])

In [79]:
# output at layer j by apply activation function (Relu)
j_out = relu(j)
j_out

array([0.65, 0.57, 1.1 ])

In [80]:
w_jk

array([[0.2, 0.3, 0.5],
       [0.3, 0.5, 0.7],
       [0.6, 0.4, 0.8]])

In [81]:
# Calculate input at k by dot product and add bias b = 0.3
k = np.dot(j_out, w_jk) + b
k

array([1.261, 1.22 , 1.904])

In [82]:
# output at layer k by apply activation function (Relu)
k_out = relu(k)
k_out

array([1.261, 1.22 , 1.904])

In [83]:
w_ko

array([[0.1, 0.4, 0.8],
       [0.3, 0.7, 0.2],
       [0.5, 0.2, 0.9]])

In [84]:
# Calculate input at o by dot product and add bias b = 0.3
o =  np.dot(k_out, w_ko) + b
o

array([1.7441, 2.0392, 3.2664])

In [85]:
# output at layer o by apply activation function (softmax)
# Return softmax output (activation function)
#o_out = np.exp(o)/(np.sum(np.exp(o), axis = 0))
o_out = softmax(o)
o_out

array([0.14438319, 0.19394426, 0.66167255])

In [86]:
# Calculate cross entropy loss function
#L = (t * np.log(o_out) + (1-t) * np.log(1-o_out)) * (-1.0)
L = cross_entropy(o_out, t)
L

array([1.93528448, 0.21560238, 1.08374107])

In [87]:
sum(L)

3.2346279327622796

## Backpropagation

#### Find gradient for layer O by using backpropagation algorithm

\begin{align}
\frac{\partial \mathbf{L}}{\partial \mathbf{W}_{k_io_j}} = \frac{\partial \mathbf{L}}{\partial \mathbf{O}_{out_j}} * \frac{\partial \mathbf{O}_{out_j}}{\partial \mathbf{O}_{j}} * \frac{\partial \mathbf{O}_{j}}{\partial \mathbf{W}_{k_io_j}} = (- \frac{t_j}{\mathbf{O}_{out_j}} + \frac{1 - t_j}{1 - \mathbf{O}_{out_j}}) * (\mathbf{O}_{out_j} - \mathbf{O}_{out_j} ^ 2) * \mathbf{K}_{out_i}
\end{align}

In [88]:
t

array([1., 0., 0.])

In [89]:
o_out

array([0.14438319, 0.19394426, 0.66167255])

In [90]:
k_out

array([1.261, 1.22 , 1.904])

In [91]:
#Gradient of layer O
g_ko = k_out.reshape(3,1) * (((-1.0) * (t / o_out)) + ((1 - t) / (1 - o_out))) * (o_out - o_out * o_out)

In [92]:
g_ko

array([[-1.0789328 ,  0.24456371,  0.83436909],
       [-1.04385251,  0.236612  ,  0.80724051],
       [-1.62909441,  0.36926987,  1.25982454]])

In [93]:
(-1 * t[0] / o_out[0] + (1 - t[0]) / (1 - o_out[0])) * (o_out[0] - o_out[0] * o_out[0]) * k_out[0]

-1.078932799024619

#### Find gradient for layer K by using backpropagation algorithm

\begin{align}
\frac{\partial \mathbf{L}}{\partial \mathbf{W}_{j_ik_j}} = \frac{\partial \mathbf{L}}{\partial \mathbf{K}_{out_j}} * \frac{\partial \mathbf{K}_{out_j}}{\partial \mathbf{K}_{j}} * \frac{\partial \mathbf{K}_{j}}{\partial \mathbf{W}_{j_ik_j}} = \frac{\partial \mathbf{L}}{\partial \mathbf{K}_{out_j}} * 1 * \mathbf{J}_{out_i}
\end{align}


\begin{align}
\frac{\partial \mathbf{L}}{\partial \mathbf{K}_{out_j}} = \frac{\partial \mathbf{L_1}}{\partial \mathbf{K}_{out_j}} + \frac{\partial \mathbf{L_2}}{\partial \mathbf{K}_{out_j}} + \frac{\partial \mathbf{L_3}}{\partial \mathbf{K}_{out_j}} 
\end{align}

\begin{align}
\frac{\partial \mathbf{L_1}}{\partial \mathbf{K}_{out_1}} = \frac{\partial \mathbf{L_1}}{\partial \mathbf{O}_{out_1}} * \frac{\partial \mathbf{O}_{out_1}}{\partial \mathbf{O}_{1}} * \frac{\partial \mathbf{O}_{1}}{\partial \mathbf{K}_{out_j}} = (- \frac{t_1}{\mathbf{O}_{out_1}} + \frac{1 - t_1}{1 - \mathbf{O}_{out_1}}) * (\mathbf{O}_{out_1} - \mathbf{O}_{out_1} ^ 2) * \mathbf{W}_{k_jo_1}
\end{align}

#### Find
\begin{align}
\frac{\partial \mathbf{L}}{\partial \mathbf{K}_{out_j}} 
\end{align}

In [94]:
dL_dk = np.sum((((-1.0) * (t / o_out)) + ((1 - t) / (1 - o_out))) * (o_out - o_out * o_out) * w_ko, axis = 1)
dL_dk

array([0.52135406, 0.01141045, 0.20648574])

In [95]:
#Gradient of layer K
g_jk = j_out.reshape(3,1) * dL_dk
g_jk

array([[0.33888014, 0.00741679, 0.13421573],
       [0.29717182, 0.00650396, 0.11769687],
       [0.57348947, 0.01255149, 0.22713432]])

#### Find gradient for layer J by using backpropagation algorithm

\begin{align}
\frac{\partial \mathbf{L}}{\partial \mathbf{W}_{i_ij_j}} = \frac{\partial \mathbf{L}}{\partial \mathbf{J}_{out_j}} * \frac{\partial \mathbf{J}_{out_j}}{\partial \mathbf{J}_{j}} * \frac{\partial \mathbf{J}_{j}}{\partial \mathbf{W}_{i_ij_j}} = \frac{\partial \mathbf{L}}{\partial \mathbf{J}_{out_j}} * 1 * \mathbf{I}_{out_i}
\end{align}


\begin{align}
\frac{\partial \mathbf{L}}{\partial \mathbf{J}_{out_j}} = \frac{\partial \mathbf{L}}{\partial \mathbf{K}_{out_j}} * \frac{\partial \mathbf{K}_{out_j}}{\partial \mathbf{K}_{j}} * \frac{\partial \mathbf{K}_{j}}{\partial \mathbf{J}_{out_j}} = \frac{\partial \mathbf{L}}{\partial \mathbf{K}_{out_j}} * 1 * \mathbf{W}_{j_jk_j}
\end{align}

This derivative is calculated as above
\begin{align}
\frac{\partial \mathbf{L}}{\partial \mathbf{K}_{out_j}} 
\end{align}


In [96]:
dL_dk = np.sum((((-1.0) * (t / o_out)) + ((1 - t) / (1 - o_out))) * (o_out - o_out * o_out) * w_ko, axis = 1)
dL_dk

array([0.52135406, 0.01141045, 0.20648574])

In [97]:
# Get diagonal of w_ij (weigth ij)
w_jj_kj = np.diag(w_ij)
w_jj_kj

array([0.1, 0.2, 0.9])

In [98]:
#Gradient of layer J
g_ij = i.reshape(3,1) * (dL_dk * w_jj_kj)
g_ij

array([[0.00521354, 0.00022821, 0.01858372],
       [0.01042708, 0.00045642, 0.03716743],
       [0.03649478, 0.00159746, 0.13008602]])