In [1]:
import numpy as np
import matplotlib.pyplot as plt

#### cost/activation

In [2]:
def cost(y,y_hat):
    return 1/2*np.sum( np.power(y - y_hat, 2) )

def logistic_sigmoid(x, derivative=0):
    sigm = 1/(1 + np.exp(-x))
    
    if derivative:
        return sigm*(1. - sigm)
    
    return sigm

# from https://deepnotes.io/softmax-crossentropy
def stable_softmax(a, derivative=0):
    exps = np.exp(a - np.max(a))

    if len(exps.shape) > 1:
        ans = np.zeros((exps.shape[0],2))
        numerator = exps.sum(axis=1).reshape((exps.shape[0], 1))
        numerator = np.append(numerator, numerator, axis=1)
    else:
#         ans = np.zeros((exps.shape[0],))
        ans = np.zeros((2,2))
        numerator = np.sum(exps)
        
    S = exps/numerator
    
    if derivative:
        if len(exps.shape) > 1: # for more than one sample
            for i in range(exps.shape[0]):
                ans[i, 0] = S[i, 1]*(1 - S[i, 0])
                ans[i, 1] = S[i, 0]*(1 - S[i, 1])
        else:
            kro_delta = 0
            for i in range(ans.shape[0]):
                for j in range(ans.shape[1]):                        
                    if i==j:
                        kro_delta = 1
                    else:
                        kro_delta = 0
                        
                    ans[i,j] = S[i]*(kro_delta - S[j])
                        
#                     ans[i,j] = S[0]*(1-S[1])
#                     
# ans[1] = S[1]*(1-S[0])

#                     ans[0] = S[0]*(1-S[1])
#                     ans[1] = S[1]*(1-S[0])
#             ans[1] = -S[0]*S[1]
#             ans[1] = S[0]*(1-S[1])
#             ans[1] = -S[0]*S[1]
        return ans
    return S

#### NN functions

In [83]:
def forward_prop(W1, b1, W2, b2, X):
    A1 = (X @ W1 + b1)[0]
    Z1 = logistic_sigmoid(A1) 
    A2 = (Z1 @ W2 + b2)[0]
    Y = stable_softmax(A2)
    return A1, A2, Y

def backprop(W2, A1, A2, X, Y, t):
    
    ## mid layer
    # 2 x 1
    step1 = (t - Y) @ -stable_softmax(A2, derivative=1)
    
    # 1 X N hidden units
    step2 = logistic_sigmoid(A1, derivative=1)
    step2 = step2.reshape(1,step2.shape[0])
    # N x 1
    step3 = W2
#     step3 = step3.reshape(step3.shape[0],1)
    
    grad_mid_layer = X.T @ step1 @ step2 @ step3
    
    ## output layer (first step has been calculated already)
    step1 = step1.reshape(step1.shape[0],1)
    # 1 X N hidden units
    
    step2 = logistic_sigmoid(A1, derivative=0)
    N_no_hid_units = step2.shape[0]
    
    step2 = step2.reshape(1, N_no_hid_units)
    
    grad_output = step1 @ step2
    
#     grad_mid_layer = ((t-Y) @ -stable_softmax(A2, derivative=1)).T @ logistic_sigmoid(A1, derivative=1) @ W2 @ X.T
#     grad_output = ((t-Y) @ -stable_softmax(A2, derivative=1)).T @ logistic_sigmoid(A1, derivative=0) 
    
#     grad_output = grad_output @ np.eye(N_no_hid_units,M=1)

    return grad_mid_layer, grad_output

#### dataset / targets
X: possible inputs of a logic function.
t: dictionary with possible outputs for each logic gates. 
    4 binary ouputs to match NN's output probabilities of 0 or 1. 
    - if [p(0) p(1)] == [1 0] then probability of 0 == 1 && probability of 1 == 1

In [84]:
X = np.array([[0,0],\
              [0,1],\
              [1,0],\
              [1,1]], dtype=np.float32)

t = { #dictionary for getting both the target logic values and the correlated string 
    # binary labels to represent the probabilities of 1 or 0 (first column is 0, 2nd 1)
    "AND": np.array([[1, 0],\
                     [1, 0],\
                     [1, 0],\
                     [0, 1]], dtype=np.float32),
    
    "NAND": np.array([[0, 1],\
                      [0, 1],\
                      [0, 1],\
                      [1, 0]], dtype=np.float32),
    
    "OR": np.array([[1, 0],\
                    [0, 1],\
                    [0, 1],\
                    [0, 1]], dtype=np.float32),
    
    "NOR": np.array([[0, 1],\
                     [1, 0],\
                     [1, 0],\
                     [1, 0]], dtype=np.float32),
    
    "XOR": np.array([[1, 0],\
                     [0, 1],\
                     [0, 1],\
                     [1, 0]], dtype=np.float32) 
}

In [85]:
# stable_softmax(t["AND"][1,:].reshape((1,2)))
# stable_softmax(t["AND"][0,:],derivative=1)
stable_softmax(t["AND"][:,1],derivative=1)

array([[ 0.14429549, -0.03058221],
       [-0.03058221,  0.14429549]])

#### generating weights and biases
no. of hidden units in the second layer is defined here

In [86]:
NO_UNITS_L1 = 4

W1 = np.random.randn(2, NO_UNITS_L1)
b1 = np.zeros((1, NO_UNITS_L1))
W2 = np.random.randn(NO_UNITS_L1, 2) # 2 outputs, P(0) and P(1)
b2 = np.zeros((1,2))

#### running the NN

In [87]:
rho = .001
T = t["OR"]

for i in range(30_000):
    for j in range(X.shape[0]):
        A1, A2, Y = forward_prop(W1, b1, W2, b2, X[j,:])
        
        grad_mid_layer, grad_output = backprop(W2, A1, A2, X[j,:], Y, T[j,:])
        
        print(cost(T[j,:],Y))
        
        W1 = W1 - rho*grad_mid_layer
        W2 = W2 - rho*grad_output.T
        b1 = b1 - rho*np.mean(grad_mid_layer)
        b2 = b2 - rho*np.mean(grad_output) 

0.6541730248524862


ValueError: operands could not be broadcast together with shapes (2,4) (2,) 

#### test 

In [64]:
test = 1
A1, A2, Y = forward_prop(W1, b1, W2, b2, X[test,:])

print("t: " + str(T[test,:]))
print("Y: " + str(Y))

t: [0. 1.]
Y: [0.01189078 0.98810922]


In [None]:
# A1, A2, Y = forward_prop(W1, b1, W2, b2, X[1,:])


In [None]:
# test = 1
# # A1, A2, Y = forward_prop(W1, b1, W2, b2, X[1,:])

# 2 x 1
step1 = (t["OR"][test,:] - Y) @ -stable_softmax(A2, derivative=1)
step1 = step1.reshape(step1.shape[0],1)
# 1 X N hidden units
step2 = logistic_sigmoid(A1, derivative=0)
step2 = step2.reshape(1,step2.shape[0])

step1 @ step2

In [14]:
W1
# grad_mid_layer

array([[ 2.03760353,  1.7403077 , -0.03290419, -0.18653771],
       [-1.65769565, -0.81123515, -0.11098208,  0.16165617]])

In [25]:
stable_softmax(A2, derivative=0)
A2

array([-0.97711749, -0.01575291])

In [88]:
grad_mid_layer

array([0., 0.])

In [74]:
W2.shape

(4, 2)