In [None]:
import numpy as np
import matplotlib.pyplot as plt

def forward(X, W1, b1, W2, b2):
    """Run one forward pass through the single-hidden-layer network.

    The hidden layer applies tanh to ``X.dot(W1) + b1``; the output layer
    applies a logistic sigmoid to ``Z.dot(W2) + b2``.

    Returns:
        A tuple ``(y, Z)`` holding the sigmoid output and the hidden-layer
        activations (the latter is needed by the gradient functions).
    """
    hidden = np.tanh(np.dot(X, W1) + b1)
    logits = np.dot(hidden, W2) + b2
    # Logistic sigmoid of the output pre-activation.
    output = 1 / (1 + np.exp(-logits))
    return output, hidden

def predict(X, W1, b1, W2, b2):
    """Return hard class predictions for ``X``.

    Runs ``forward`` with the given parameters and rounds the output
    probabilities with ``np.round``; the hidden activations are discarded.
    """
    probabilities = forward(X, W1, b1, W2, b2)[0]
    return np.round(probabilities)
    
def derivative_w2(Z, T, Y):
    """Gradient with respect to the output weights ``W2``.

    Args:
        Z: hidden-layer activations.
        T: targets.
        Y: network outputs.

    Returns:
        ``Z.T`` multiplied by the residual ``T - Y``.
    """
    residual = T - Y
    return np.dot(Z.T, residual)

def derivative_b2(T, Y):
    """Gradient with respect to the output bias ``b2``.

    Returns the sum of the residual ``T - Y`` over all of its elements.
    """
    residual = T - Y
    total = residual.sum()
    return total

def derivative_w1(X, Z, T, Y, W2):
    """Gradient with respect to the input-to-hidden weights ``W1``.

    The residual ``T - Y`` is pushed back through ``W2`` with an outer
    product (for a vector ``W2`` this equals ``(T-Y).dot(W2.T)``) and then
    scaled by the tanh derivative ``1 - Z * Z``.

    Returns:
        ``X.T`` multiplied by the back-propagated hidden-layer signal.
    """
    residual = T - Y
    backprop = np.outer(residual, W2)
    # Derivative of tanh expressed through its own output.
    hidden_delta = backprop * (1 - Z * Z)
    return np.dot(X.T, hidden_delta)

def derivative_b1(Z, T, Y, W2):
    """Gradient with respect to the hidden-layer bias ``b1``.

    Uses the tanh derivative ``1 - Z * Z``. (For a sigmoid hidden layer
    the factor would be ``Z * (1 - Z)`` instead.)

    Returns:
        The back-propagated hidden-layer signal summed over axis 0.
    """
    residual = T - Y
    tanh_slope = 1 - Z * Z
    hidden_delta = np.outer(residual, W2) * tanh_slope
    return hidden_delta.sum(axis=0)

# Careful: this is the log-likelihood (to be maximised), not a loss, and the
# logarithms blow up if a probability is exactly 0 or 1.
def cost(T, pY):
    """Return the binary log-likelihood of targets ``T`` under ``pY``.

    Computes ``sum(T*log(pY) + (1-T)*log(1-pY))``.

    Probabilities are clipped to ``[eps, 1 - eps]`` before taking logs so
    that a saturated output (exactly 0 or 1) yields a finite value instead
    of ``-inf`` or ``nan`` (``0 * log(0)``). Values strictly inside that
    interval are left unchanged.

    Args:
        T: binary targets.
        pY: predicted probabilities, same shape as ``T``.

    Returns:
        The summed log-likelihood as a scalar.
    """
    eps = 1e-15
    targets = np.asarray(T)
    probs = np.clip(np.asarray(pY, dtype=float), eps, 1 - eps)
    return np.sum(targets * np.log(probs) + (1 - targets) * np.log(1 - probs))

def test_xor():
    """Train the network on the XOR problem and plot the log-likelihood.

    Uses a 2-5-1 network with randomly initialised weights and plain
    gradient ascent on the log-likelihood returned by ``cost``. Progress is
    printed whenever the error rate changes and every 1000 iterations.
    Training stops early as soon as the log-likelihood decreases.
    """
    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    Y = np.array([0, 1, 1, 0])

    W1 = np.random.randn(2, 5)
    b1 = np.zeros(5)
    W2 = np.random.randn(5)
    b2 = 0
    LL = []  # log-likelihood recorded at each iteration
    a = 10e-3  # learning rate (0.01)
    regularization = 0.
    last_error_rate = None
    for i in range(30000):
        pY, Z = forward(X, W1, b1, W2, b2)
        ll = cost(Y, pY)

        # Rounding pY is what predict() would return for the same
        # parameters, without running a second forward pass.
        prediction = np.round(pY)
        er = np.mean(prediction != Y)

        if er != last_error_rate:
            last_error_rate = er
            print ("error rate: ",er)
            print ("true: ", Y)
            print ("prediction:", prediction)
        if LL and ll < LL[-1]:
            print ("early exit")
            break
        LL.append(ll)

        # Evaluate every gradient with the parameters used in the forward
        # pass before touching any of them; updating W2 first would make
        # the hidden-layer gradients inconsistent with pY and Z.
        grad_w2 = derivative_w2(Z, Y, pY)
        grad_b2 = derivative_b2(Y, pY)
        grad_w1 = derivative_w1(X, Z, Y, pY, W2)
        grad_b1 = derivative_b1(Z, Y, pY, W2)

        W2 += a * (grad_w2 - regularization * W2)
        b2 += a * (grad_b2 - regularization * b2)
        W1 += a * (grad_w1 - regularization * W1)
        b1 += a * (grad_b1 - regularization * b1)
        if i % 1000 == 0:
            print (i, "%0.*f" % (10, ll))

    print ("final class rate: ", np.mean(prediction == Y))
    plt.plot(LL)
    plt.show()

def test_donut():
    """Train the network on the two-ring "donut" data set and plot progress.

    Generates ``N`` points: half around radius ``R_inner`` (class 0) and
    half around radius ``R_outer`` (class 1). Trains a 2-8-1 network with
    L2-regularised gradient ascent on the log-likelihood, printing the
    log-likelihood and classification rate every 1000 iterations.
    """
    # donut example
    N = 1000
    R_inner = 5
    R_outer = 10
    half = N // 2

    # distance from origin is radius + random normal
    # angle theta is uniformly distributed between (0, 2pi)
    R1 = np.random.randn(half) + R_inner
    theta = 2*np.pi*np.random.random(half)
    X_inner = np.concatenate([[R1 * np.cos(theta)], [R1 * np.sin(theta)]]).T

    R2 = np.random.randn(half) + R_outer
    theta = 2*np.pi*np.random.random(half)
    X_outer = np.concatenate([[R2 * np.cos(theta)], [R2 * np.sin(theta)]]).T

    X = np.concatenate([ X_inner, X_outer ])
    Y = np.array([0]*half + [1]*half)

    n_hidden = 8
    W1 = np.random.randn(2, n_hidden)
    b1 = np.random.randn(n_hidden)
    W2 = np.random.randn(n_hidden)
    b2 = np.random.randn(1)
    LL = [] # keep track of likelihoods
    learning_rate = 0.00005
    regularization = 0.2
    for i in range(160000):
        pY, Z = forward(X, W1, b1, W2, b2)
        ll = cost(Y, pY)
        LL.append(ll)

        # Evaluate every gradient with the parameters used in the forward
        # pass before touching any of them; updating W2 first would make
        # the hidden-layer gradients inconsistent with pY and Z.
        grad_w2 = derivative_w2(Z, Y, pY)
        grad_b2 = derivative_b2(Y, pY)
        grad_w1 = derivative_w1(X, Z, Y, pY, W2)
        grad_b1 = derivative_b1(Z, Y, pY, W2)

        W2 += learning_rate * (grad_w2 - regularization * W2)
        b2 += learning_rate * (grad_b2 - regularization * b2)
        W1 += learning_rate * (grad_w1 - regularization * W1)
        b1 += learning_rate * (grad_b1 - regularization * b1)
        if i % 1000 == 0:
            # Rounding the pre-update pY matches what predict() returned
            # for this iteration, without a second forward pass.
            er = np.abs(np.round(pY) - Y).mean()
            print ("i:", i, "ll:", ll, "classification rate:", 1 - er)
    plt.plot(LL)
    plt.show()
# Script entry point: runs the donut demo; the XOR demo is left disabled.
if __name__ == "__main__":
    #test_xor()
    test_donut()

i: 0 ll: -935.497242817 classification rate: 0.512
i: 1000 ll: -405.457003923 classification rate: 0.917
i: 2000 ll: -234.302419597 classification rate: 0.981
i: 3000 ll: -169.882325424 classification rate: 0.982
i: 4000 ll: -136.092914476 classification rate: 0.985
i: 5000 ll: -115.454925855 classification rate: 0.989
i: 6000 ll: -101.437040921 classification rate: 0.991
i: 7000 ll: -91.1352341845 classification rate: 0.992
i: 8000 ll: -83.0926715761 classification rate: 0.992
i: 9000 ll: -76.6905901974 classification rate: 0.993
i: 10000 ll: -71.4830974928 classification rate: 0.993
i: 11000 ll: -67.1282024286 classification rate: 0.993
i: 12000 ll: -63.4200389503 classification rate: 0.993
i: 13000 ll: -60.278190034 classification rate: 0.993
i: 14000 ll: -57.5975347491 classification rate: 0.996
i: 15000 ll: -55.2647649844 classification rate: 0.996
i: 16000 ll: -53.2063991571 classification rate: 0.995
i: 17000 ll: -51.3722022365 classification rate: 0.995
i: 18000 ll: -49.7263958