In [5]:
import numpy as np
import pandas as pd
# from bokeh.plotting import figure as bokeh_figure, output_notebook, show
# output_notebook()

In [7]:
#def figure(*args, **kwargs):
#     return bokeh_figure(*args, **kwargs, width=550, height=300)

### Define our function

In [301]:
model = pd.DataFrame({'x1': [0, 0, 1, 1], 'x2': [0, 1, 0, 1], 'x3': [1, 1, 1, 1], 'y': [0, 1, 1, 0]})
model

Unnamed: 0,x1,x2,x3,y
0,0,0,1,0
1,0,1,1,1
2,1,0,1,1
3,1,1,1,0


### Define our unbiased feedforward neural network
Each layer consists of a set of weights, a set of biases, and an activation function:
\begin{align}
\vec{y^{(l)}} &= \operatorname A(\vec{z^{(l)}})\\
\vec{z^{(l)}} &= \hat{w}^{(l)}\times \vec{y^{(l-1)}}+\vec{b^{(l)}}
\end{align}

In the unbiased case, $b^{(l)}=0$

### Define the activation function

In [302]:
from scipy.special import expit

def expit_derivative(y):
    """y = σ(x), y' = (1 - y) * y"""
    return (1 - y) * y 

In [595]:
x = model.values.T[:-1]
y = model.values.T[-1].reshape((1,-1))

In [596]:
x

array([[0, 0, 1, 1],
       [0, 1, 0, 1],
       [1, 1, 1, 1]])

In [597]:
y

array([[0, 1, 1, 0]])

In [598]:
n_x = x.shape[0]
hidden_layer_size = 4
w_1 = np.random.rand(hidden_layer_size, n_x, )
b_1 = np.zeros(hidden_layer_size)
w_2 = np.random.rand(1, hidden_layer_size)
b_2 = np.zeros(1)
output = np.zeros_like(y)

In [599]:
# Each row is a set of values for each input neuron xi
# Each column is a set of data for {x1, x2, x3}
x

array([[0, 0, 1, 1],
       [0, 1, 0, 1],
       [1, 1, 1, 1]])

In [600]:
def feed_forward_layer(activate, inputs, layer_weights, layer_biases):
    return activate(layer_weights@inputs + layer_biases)

In [601]:
def feed_forward(inputs, activators, weights, biases):
    layers = [inputs]
    for activator, layer_weights, layer_biases in zip(activators, weights, biases):
        inputs = feed_forward_layer(activator, inputs, layer_weights, layer_biases)
        layers.append(inputs)
    return layers

In [780]:
# @numba.jit(nopython=True)
def dz_dw(a, w, l, m, i, p, q, deriv):
#     assert len(a) == len(w), (len(a), len(w))
    assert 0 <= l < len(a)
    assert l >= m
    
    if l == m:
        return a[m-1][q] if i == p else 0.0
    
    w_l = w[l-1]
    
    terms = []
    
    for k in range(w_l.shape[1]):
        w_l_i_k = w_l[i, k]
        deriv_a_l_sub_1 = deriv(a[l-1][k])
        T = dz_dw(a, w, l-1, m, k, p, q, deriv)
        terms.append(w_l_i_k * deriv_a_l_sub_1 *  T)
                
    return np.sum(terms)


# @numba.jit(nopython=True)
def dc_dw(a, y, w, l, m, p, q, deriv):
#     print(len(a),l)
    a_l = a[l]
    if l == m:
        is_ = [p]
    else:
        is_ = [*range(len(a_l))]
    
#     assert 0 <= p < len(a_l)
    
    terms = []
    for i in is_:
        dy = (a_l[i] - y[i])
        deriv_a_l_i = deriv(a_l[i])
        T = dz_dw(a, w, l, m, i, p, q, deriv)
        terms.append(dy * deriv_a_l_i * T)
    return np.sum(terms)


def dc_dw_man_m_2(a, y, w, l, p, q, deriv, m=None):
    al = a[l]#.reshape(y.shape)
    Y =  deriv(al[p]) * a[l-1][q]
    err = (al[p] - y[p])
    print("\n")
    print("al[p] =", al[p])
    print("al[p] - y[p] =", (al[p] - y[p]))
    print("deriv(al[p]) =",deriv(al[p]))
    print("a[l-1][q] =", a[l-1][q])
    return np.sum(err * Y)

def dc_dw_man_m_1(a, y, w, l, p, q, deriv, m=None):
    n_l = len(a[l])
    print("(a[l][i]-y[i])", np.array([w[l-1][i,p]*(a[l][i]-y[i]) for i in range(n_l)]))
    s = np.sum([w[l-1][i,p]*(a[l][i]-y[i])*
                deriv(a[l][i]) 
                for i in range(n_l)])
    res= a[l-2][q]*deriv(a[l-1][p])*s
    return res.sum()


def d_to_mat(mat, f, layers, y, weights, l, m):
    dmat = np.empty_like(mat)
    for p in range(mat.shape[0]):
        for q in range(mat.shape[1]):
            dw = f(layers, y, weights, l=l, m=m, p=p, q=q, deriv=expit_derivative)
            dmat[p, q] = dw
    return dmat

In [781]:
w_1=np.array([[-0.16595599,  0.44064899, -0.99977125, -0.39533485],
       [-0.70648822, -0.81532281, -0.62747958, -0.30887855],
       [-0.20646505,  0.07763347, -0.16161097,  0.370439  ]]).T
w_2=np.array([[-0.5910955 ],
       [ 0.75623487],
       [-0.94522481],
       [ 0.34093502]]).T

In [782]:
layers = feed_forward(x, [expit]*2, [w_1, w_2], [0, 0])

In [783]:
d_to_mat(w_1, dc_dw_man_m_1, layers, y, weights, l=2, m=1)

(a[l][i]-y[i]) [[-0.81591008  0.88017551  0.78564832 -0.9381577 ]]
(a[l][i]-y[i]) [[-0.81591008  0.88017551  0.78564832 -0.9381577 ]]
(a[l][i]-y[i]) [[-0.81591008  0.88017551  0.78564832 -0.9381577 ]]
(a[l][i]-y[i]) [[-0.25088103  0.27064177  0.24157596 -0.28847048]]
(a[l][i]-y[i]) [[-0.25088103  0.27064177  0.24157596 -0.28847048]]
(a[l][i]-y[i]) [[-0.25088103  0.27064177  0.24157596 -0.28847048]]
(a[l][i]-y[i]) [[-0.65191464  0.70326292  0.62773541 -0.74959087]]
(a[l][i]-y[i]) [[-0.65191464  0.70326292  0.62773541 -0.74959087]]
(a[l][i]-y[i]) [[-0.65191464  0.70326292  0.62773541 -0.74959087]]
(a[l][i]-y[i]) [[-0.8033454   0.86662118  0.77354966 -0.92371047]]
(a[l][i]-y[i]) [[-0.8033454   0.86662118  0.77354966 -0.92371047]]
(a[l][i]-y[i]) [[-0.8033454   0.86662118  0.77354966 -0.92371047]]


array([[-0.0091517 , -0.00836254, -0.01874642],
       [-0.0031251 , -0.00302658, -0.00618472],
       [-0.00516383, -0.00572758, -0.01302418],
       [-0.01031271, -0.01030857, -0.0205889 ]])

In [672]:
m = 2
wm = weights[m-1]
d_w_2_ = np.empty_like(w_2)
for p in range(wm.shape[0]):
    for q in range(wm.shape[1]):
        dw = dc_dw(layers, y, weights, l=2, m=m, p=p, q=q, deriv=expit_derivative)
        d_w_2_[p, q] = dw
d_w_2_

array([[0.00451385, 0.00669325, 0.00676932, 0.00455133]])

In [625]:
d_w_2

array([[-0.00451385, -0.00669325, -0.00676932, -0.00455133]])

In [673]:
def train(n, inputs, outputs, weights, biases, activators, activator_derivatives, z_derivative, rate):
    for i in range(n):
        *hidden_layers, result = layers = feed_forward(inputs, activators, weights, biases)
#         print()
        for m in range(1, 3):
            wm = weights[m-1]
            dwm = np.empty_like(wm)
            for p in range(wm.shape[0]):
                for q in range(wm.shape[1]):
#                     print(wm,p,q,m,len(layers)-1)
                    dw = dc_dw(layers, outputs, weights, l=2, m=m, p=p, q=q, deriv=expit_derivative)
                    dwm[p, q] = -rate * dw
            print(dwm)
            wm += dwm
        
#         print(result, outputs)
    return result
        
# print(y)
train(1, x, y, [w_1, w_2], [0, 0], [expit]*2, [expit_derivative]*2, lambda x: x, 1)
# train(10_000, x_2, y_2, [w_1, w_2], [0, 0], [expit]*2, [expit_derivative]*2, z_derivative, 0.1)
# train(10_000, x_3, y_3, [w_1, w_2], [0, 0], [expit]*2, [expit_derivative]*2, z_derivative, 0.1)
# train(10_000, x_4, y_4, [w_1, w_2], [0, 0], [expit]*2, [expit_derivative]*2, z_derivative, 0.1)

[[ 0.00314085  0.00287001  0.00643375]
 [-0.00446255 -0.00432187 -0.00883159]
 [ 0.00354688  0.00393411  0.00894593]
 [-0.00207335 -0.00207252 -0.00413936]]
[[-0.00451385 -0.00669325 -0.00676932 -0.00455133]]


array([[0.47372957, 0.48895696, 0.54384086, 0.54470837]])

In [648]:
*_, result = feed_forward(x_2, [expit]*2, [w_1, w_2], [0, 0])
result

NameError: name 'x_2' is not defined

In [421]:
dy = y - outputs
dy

array([[array([[-0.65785401],
       [-0.51707436],
       [-0.57572272]]),
        array([[-0.74112955]])]], dtype=object)

In [158]:
y_1 = expit(w_1@x_0)
y_1

array([[0.65785401],
       [0.51707436],
       [0.57572272]])

In [161]:
de_dy = 2 * dy
dy_dz = deriv_expit(outputs)
de_dz = de_dy.T @ dy_dz
de_dw_2 = de_dz @ y_1.T # column [p] X row [q] vector gives matrix [p X q]
de_dw_2

array([[-0.18708125, -0.14704618, -0.16372466]])

In [162]:
w_2

array([[0.87638824, 0.90957082, 0.0086785 ]])

In [151]:
def back_propogate_2_layer(inv_activate, y, outputs, w_1, w_2, rate):
    d_y = y - outputs
    d_w_2 = 2 * dy * inv_activate(outputs)
    
    w_1_ = w_1 - rate * d_w_1
    w_2_ = w_2 - rate * d_w_2

### Then define a loss function
$$
\epsilon (y, \hat{y}) = \sum_{i=1}^n\left(y-\hat{y}\right)^2\,.
$$
The derivative of the loss function with respect to the first layer weights $\hat{w}^{(2)}$  is 
$$
\frac{\partial \epsilon}{\partial \hat{w}^{(2)}} = \frac{\partial \epsilon}{\partial\hat{y}}\frac{\partial\hat{y}}{\partial z}\frac{\partial z}{\partial \hat{w}^{(2)}}
$$

In [None]:
d_weights1 = np.dot(
    self.input.T,
    #
    np.dot(
        2
        * (self.y - self.output)
        * sigmoid_derivative(self.output),
        #
        self.weights2.T,
    )
    * sigmoid_derivative(self.layer1),
)

In [626]:
dc_dw(layers, y, [w_1, w_2], l=2, m=2, p=0, q=0, deriv=expit_derivative)

0.004513854642904304

In [627]:
dc_dw_man_m_2(layers, y, [w_1, w_2], l=2, p=0, q=0, deriv=expit_derivative)



al[p] = [0.47372957 0.48895696 0.54384086 0.54470837]
al[p] - y[p] = [ 0.47372957 -0.51104304 -0.45615914  0.54470837]
deriv(al[p]) = [0.24930986 0.24987805 0.24807798 0.24800116]
a[l-1][q] = [0.44856632 0.28639589 0.40795614 0.25371248]


0.004513854642904304

<hr>

In [628]:
dc_dw(layers, y, [w_1, w_2], l=2, m=1, p=0, q=0, deriv=expit_derivative)

-0.003140854770533802

In [629]:
dc_dw_man_m_1(layers, y, [w_1, w_2], l=2, p=0, q=0, deriv=expit_derivative)

array([-0.        , -0.        , -0.00176063, -0.00138022])