In [50]:
import numpy as np
from matplotlib import pyplot as plt
# Make every NumPy floating-point condition (divide, over, under, invalid)
# raise FloatingPointError. As the cell's last expression, the previous
# settings dict returned by seterr is what the cell displays.
np.seterr(all='raise')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [56]:
# Figure size (width, height) used when the caller passes no "figsize".
DEFAULT_FIG_SIZE = (15, 10)

def plot_wrapper(fn):
    """Decorate a single-panel plotting function so it draws a grid of panels.

    The returned wrapper calls ``fn(arg, **kwargs)`` once per positional
    argument, each inside its own subplot. One call fills one column of the
    grid (one row per positional argument); several calls with increasing
    ``id`` fill successive columns of the same figure.

    Keyword arguments read by the wrapper (all are also forwarded to ``fn``):
        ncols:   number of columns in the grid, default 1.
        id:      zero-based column filled by this call, default 0. A new
                 figure is created only when ``id == 0``.
        figsize: figure size for the new figure, default DEFAULT_FIG_SIZE.
        file:    if truthy, the figure is saved to this path and then cleared
                 (this happens on every call that passes it).
        show:    if true (default) and this call fills the last column
                 (``id + 1 == ncols``), ``plt.show()`` is called.
    """
    def wrapper(*args, **kwargs):
        ncols = kwargs.get("ncols", 1)
        # Local name avoids shadowing the builtin ``id``; the kwarg key is unchanged.
        col = kwargs.get("id", 0)
        nrows = len(args)

        if col == 0:
            plt.figure(figsize=kwargs.get("figsize", DEFAULT_FIG_SIZE))

        for row, a in enumerate(args):
            # matplotlib numbers subplots row-major starting at 1, so the cell
            # at (row, col) of an nrows x ncols grid is row * ncols + col + 1.
            plt.subplot(nrows, ncols, row * ncols + col + 1)
            fn(a, **kwargs)

        if kwargs.get("file"):
            plt.savefig(kwargs["file"])
            plt.clf()
        elif kwargs.get("show", True) and col + 1 == ncols:
            plt.show()

    return wrapper



@plot_wrapper
def shm(matrix, **kwargs):
    """Draw one matrix as a grayscale image with a colorbar.

    Singleton dimensions are squeezed out and the result is transposed before
    display; the image origin is placed at the lower-left corner. ``kwargs``
    is accepted only because the decorator forwards its options; it is not
    read here.
    """
    image = np.squeeze(matrix).T
    plt.imshow(image, cmap='gray', origin='lower')
    plt.colorbar()
    
def shl(*vector, **kwargs):
    """Plot each positional vector as a line on one new figure.

    Keyword arguments:
        figsize: figure size, default DEFAULT_FIG_SIZE.
        labels:  sequence of legend labels, indexed by vector position; when
                 non-empty every line gets a label and a legend is drawn.
        title:   figure-level title, applied when not None.
        file:    if truthy, save the figure to this path and clear it.
        show:    if true (default) and no file was given, call plt.show().
    """
    plt.figure(figsize=kwargs.get("figsize", DEFAULT_FIG_SIZE))

    labels = kwargs.get("labels", [])
    has_labels = len(labels) > 0

    for idx, series in enumerate(vector):
        # Each vector is squeezed so (n, 1) or (1, n) arrays plot as one line.
        flat = np.squeeze(series)
        if has_labels:
            plt.plot(flat, label=labels[idx])
        else:
            plt.plot(flat)

    if has_labels:
        plt.legend()

    title = kwargs.get("title")
    if title is not None:
        plt.suptitle(title)

    out_file = kwargs.get("file")
    if out_file:
        plt.savefig(out_file)
        plt.clf()
    elif kwargs.get("show", True):
        plt.show()

def relu(x):
    """Element-wise rectifier: max(x, 0)."""
    return np.maximum(x, 0.0)


def relu_prime(x):
    """Element-wise rectifier slope: 1.0 where x > 0, otherwise 0.0."""
    return np.where(x > 0.0, 1.0, 0.0)

def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)).

    This notebook enables np.seterr('raise'), under which exp(-x) raises
    FloatingPointError as soon as it overflows (very negative x) or
    underflows (very positive x). Both cases have well-defined limits
    (0.0 and 1.0), so those two conditions are silenced locally and the
    saturated value is returned instead.
    """
    with np.errstate(over="ignore", under="ignore"):
        # exp(-x) -> inf gives 1/inf == 0.0; exp(-x) -> 0 gives 1/1 == 1.0.
        return 1.0 / (1.0 + np.exp(-x))


def sigmoid_prime(x):
    """Element-wise derivative of the logistic function: s(x) * (1 - s(x))."""
    v = sigmoid(x)
    # The product can become tiny in the saturated tails; do not let that raise.
    with np.errstate(under="ignore"):
        return v * (1.0 - v)

# Upper bound compared against threshold(x) inside threshold_prime.
t_clip_value = 1.0


def threshold(x):
    """Element-wise step function: 1.0 where x > 0, otherwise 0.0."""
    return np.where(x > 0.0, 1.0, 0.0)


def threshold_prime(x):
    """Surrogate derivative for threshold: 1.0 where threshold(x) <= t_clip_value, else 0.0.

    NOTE(review): threshold(x) only ever yields 0.0 or 1.0, so with
    t_clip_value = 1.0 the comparison is always true and this returns all
    ones. If a clipped straight-through estimate was intended, the test may
    have been meant to look at x itself - confirm before changing, since the
    recorded results were produced with this behaviour.
    """
    return np.where(threshold(x) <= t_clip_value, 1.0, 0.0)

# def threshold_prime(x, threshold_value = 0.0):
#     return 1.0/np.square(1.0 + np.abs(x - threshold_value))

def one_hot_encode(target_v, size=None):
    """Build a one-hot matrix from an array of class values.

    Column k corresponds to the k-th smallest distinct value found in
    ``target_v`` (not to the value itself). The result has
    ``target_v.shape[0]`` rows and ``size`` columns, or one column per
    distinct value when ``size`` is None.
    """
    classes = np.unique(target_v)
    n_cols = len(classes) if size is None else size
    y_v = np.zeros((target_v.shape[0], n_cols))

    for col, cls in enumerate(classes):
        # First axis of the match positions = rows holding this class.
        rows = np.where(target_v == cls)[0]
        y_v[rows, col] = 1.0

    return y_v

def weights_init(fan_in, fan_out, const=1.0):
    """Draw a weight matrix and a bias vector uniformly from [-b, b).

    The bound is ``b = const * sqrt(6 / (fan_in + fan_out))``. Values come
    from the global NumPy random state (weights first, then biases) and are
    returned as float32.

    Returns:
        (W, b): arrays of shape (fan_in, fan_out) and (fan_out,).
    """
    bound = const * np.sqrt(6.0 / (fan_in + fan_out))
    low, high = -bound, bound
    span = high - low

    weights = (low + np.random.random((fan_in, fan_out)) * span).astype(np.float32)
    biases = (low + np.random.random((fan_out,)) * span).astype(np.float32)
    return (weights, biases)

def number_of_equal_act(a, a_t):
    """Score how many non-zero entries of ``a`` and ``a_t`` coincide.

    Near-zero entries (|v| < 1e-10) are replaced by different sentinels in
    copies of the two arrays (-10 for ``a``, -1 for ``a_t``), so positions
    that are zero in either array never count as equal. The fraction of
    equal positions is then divided by ``mean(a_t)``.

    When ``mean(a_t)`` is exactly 0 the result is 1.0 if no position matched
    and 0.0 otherwise. The inputs themselves are not modified.
    """
    marked_t = a_t.copy()
    marked_t[np.abs(marked_t) < 1e-10] = -1

    marked_a = a.copy()
    marked_a[np.abs(marked_a) < 1e-10] = -10

    a_t_mean = np.mean(a_t)
    match_rate = np.mean(np.equal(marked_t, marked_a))

    if a_t_mean == 0.0:
        return 1.0 if match_rate == 0.0 else 0.0
    return match_rate / a_t_mean


In [3]:
# Single-layer check: a bias-free threshold layer is trained with the
# error-times-input rule to reproduce the outputs of a random "teacher" layer
# of the same shape. Seeding is disabled, so every run draws new data.
# np.random.seed(1)

input_dim, output_dim = 20, 3
batch_size = 100
epochs = 500
learning_rate = 0.01

# f, f_prime = sigmoid, sigmoid_prime
f, f_prime = threshold, threshold_prime

# Teacher weights (the bias returned by weights_init is discarded).
W_orig, _ = weights_init(input_dim, output_dim)

# Binary inputs: 1.0 where a uniform draw is below 0.1; targets from the teacher.
X = (np.random.random((batch_size, input_dim)) < 0.1).astype(np.float32)
Yt = f(X.dot(W_orig))

# Student weights, drawn independently of the teacher.
W, _ = weights_init(input_dim, output_dim)

for epoch in range(epochs):
    u = X.dot(W)
    y = f(u)
    e = Yt - y

    # Full-batch update: inputs correlated with the output error.
    dW = X.T.dot(e)
    W += learning_rate * dW

    # Report on every tenth of the run and on the final epoch.
    if epoch % (epochs // 10) == 0 or epoch == epochs - 1:
        print("Epoch {}, {:.3f}".format(epoch, np.linalg.norm(e)))

Epoch 0, 11.358
Epoch 50, 0.000
Epoch 100, 0.000
Epoch 150, 0.000
Epoch 200, 0.000
Epoch 250, 0.000
Epoch 300, 0.000
Epoch 350, 0.000
Epoch 400, 0.000
Epoch 450, 0.000
Epoch 499, 0.000


In [4]:
# Same single-layer, bias-free threshold model as the previous experiment,
# now on the four XOR patterns with one-hot targets.
# np.random.seed(1)

input_dim, output_dim = 2, 2
batch_size = 4
epochs = 500
learning_rate = 0.01

# f, f_prime = sigmoid, sigmoid_prime
f, f_prime = threshold, threshold_prime

X = np.asarray(
    [[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]],
    dtype=np.float32,
)
# XOR labels 0/1/1/0, expanded to two one-hot columns.
Yt = one_hot_encode(
    np.asarray([[0.0], [1.0], [1.0], [0.0]], dtype=np.float32),
    2,
)

W, _ = weights_init(input_dim, output_dim)

for epoch in range(epochs):
    u = X.dot(W)
    y = f(u)
    e = Yt - y

    # Full-batch update: inputs correlated with the output error.
    dW = X.T.dot(e)
    W += learning_rate * dW

    # Report on every tenth of the run and on the final epoch.
    if epoch % (epochs // 10) == 0 or epoch == epochs - 1:
        print("Epoch {}, {:.3f}".format(epoch, np.linalg.norm(e)))
 

Epoch 0, 1.732
Epoch 50, 2.000
Epoch 100, 2.000
Epoch 150, 2.000
Epoch 200, 2.000
Epoch 250, 2.000
Epoch 300, 2.000
Epoch 350, 2.000
Epoch 400, 2.000
Epoch 450, 2.000
Epoch 499, 2.000


In [71]:
# Two-layer teacher/student experiment on sparse binary data. The output
# layer learns from the output error; the hidden layer learns from a fixed
# random projection of the target (W0_fb) instead of a back-propagated error.
np.random.seed(10)

# f, f_prime = sigmoid, sigmoid_prime
f, f_prime = threshold, threshold_prime

# Fixed random "teacher" weights, used only to generate the targets below.
W_orig_0 = np.random.randn(20, 100)
W_orig_1 = np.random.randn(100, 10)

# train data
# Inputs are 1.0 where a uniform draw is below 0.1, else 0.0.
X = (np.random.random((2000, 20)) < 0.1).astype(np.float32)
Y = f(np.dot(f(np.dot(X, W_orig_0)), W_orig_1))

# test data
Xt = (np.random.random((200, 20)) < 0.1).astype(np.float32)
Yt = f(np.dot(f(np.dot(Xt, W_orig_0)), W_orig_1))

# Xt, Yt = X, Y

# X = np.asarray([
#     [0.0, 0.0],
#     [0.0, 1.0],
#     [1.0, 0.0],
#     [1.0, 1.0]
# ], dtype=np.float32)
# Yt = one_hot_encode(np.asarray([
#     [0.0],
#     [1.0],
#     [1.0],
#     [0.0]
# ], dtype=np.float32), 2)


input_dim = X.shape[1]
output_dim = Yt.shape[1]
batch_size = 200
# Integer division: samples beyond the last full batch are never visited.
number_of_train_batches = X.shape[0] // batch_size
number_of_test_batches = Xt.shape[0] // batch_size

hidden_dim = 300
epochs = 2000
init_learning_rate = 0.0005

# Student weights. W0_fb maps the target into hidden space and is never
# updated in this cell; b0_fb is not read in this cell.
W0, b0 = weights_init(input_dim, hidden_dim)
W1, b1 = weights_init(hidden_dim, output_dim)
W0_fb, b0_fb = weights_init(output_dim, hidden_dim)

# Snapshot of the initial first-layer weights (not read again in this cell).
W0_start = W0.copy()

for epoch in range(epochs):
    # Reshuffle the training set; X and Y are rebound to the permuted arrays.
    permute_ids = np.random.permutation(X.shape[0])
    X = X[permute_ids, :]
    Y = Y[permute_ids, :]

    # Per-epoch accumulators; divided by the batch counts when printed.
    e_avg, et_avg = 0.0, 0.0
    du0_avg = 0.0
    num_eq_avg = 0.0
    # Learning rate decays as init_learning_rate / sqrt(epoch + 1).
    learning_rate = (float(init_learning_rate) / (epoch + 1) ** 0.5)
#     learning_rate = init_learning_rate * (1.0 - epoch / epochs)
    for i in range(number_of_train_batches):
        x = X[i*batch_size:(i+1)*batch_size,:]
        y = Y[i*batch_size:(i+1)*batch_size,:]

        # Forward pass. NOTE(review): b0/b1 are updated below but never added
        # here or in the test pass - confirm whether bias terms were intended.
        u0 = np.dot(x, W0)
        y0 = f(u0)

        u1 = np.dot(y0, W1)
        y1 = f(u1)

        e = y - y1

        # Target projected through the fixed feedback weights.
        y0_fb = f(np.dot(y, W0_fb)) 

        du1 = e
#         du0 = np.dot(du1, W1.T) * f_prime(u0) # BP
#         du0 = np.dot(du1, W0_fb) * f_prime(u0) # FA
        # Hidden signal y0 * (y0_fb - 1), scaled by f_prime(u0): with 0/1
        # activations it is non-zero only where y0 is on and y0_fb is off.
        du0 = (y0 * y0_fb - y0) * f_prime(u0)

        dW1 = np.dot(y0.T, du1)
        db1 = np.sum(du1, 0)

        dW0 = np.dot(x.T, du0)
        db0 = np.sum(du0, 0)

        # In-place updates (the signals are added, not subtracted).
        W0 += learning_rate * dW0
        b0 += learning_rate * db0
        W1 += learning_rate * dW1
        b1 += learning_rate * db1
        
        e_avg += np.linalg.norm(e)
        du0_avg += np.linalg.norm(du0)
        
    # Evaluate on the held-out data with the weights reached after this epoch.
    for i in range(number_of_test_batches):
        xt = Xt[i*batch_size:(i+1)*batch_size,:]
        yt = Yt[i*batch_size:(i+1)*batch_size,:]

        yt0 = f(np.dot(xt, W0))
        yt1 = f(np.dot(yt0, W1))

        et = yt - yt1
        et_avg += np.linalg.norm(et)
        # NOTE(review): number_of_equal_act is declared as (a, a_t) and
        # normalises by mean(a_t); here the prediction yt1 is passed as a_t -
        # confirm the argument order is intended.
        num_eq_avg += number_of_equal_act(yt, yt1)
        
    # Report on every tenth of the run and on the final epoch.
    if (epoch % (epochs // 10)) == 0 or epoch == 0 or epoch == epochs - 1:
        print("Epoch {}, |e| {:.3f} |et| {:.3f} |du0| {:.3f} |num.eq| {:.3f}%".format(
            epoch, 
            e_avg /  number_of_train_batches,
            et_avg /  number_of_test_batches,
            du0_avg /  number_of_train_batches,
            100.0 * num_eq_avg / number_of_test_batches,
        ))

Epoch 0, |e| 24.225 |et| 20.273 |du0| 102.678 |num.eq| 72.612%
Epoch 200, |e| 13.186 |et| 15.524 |du0| 4.356 |num.eq| 83.424%
Epoch 400, |e| 11.890 |et| 15.330 |du0| 2.977 |num.eq| 83.740%
Epoch 600, |e| 11.758 |et| 15.067 |du0| 2.119 |num.eq| 83.644%
Epoch 800, |e| 11.239 |et| 14.731 |du0| 1.517 |num.eq| 85.054%
Epoch 1000, |e| 11.227 |et| 14.697 |du0| 1.061 |num.eq| 84.148%
Epoch 1200, |e| 11.283 |et| 15.067 |du0| 0.949 |num.eq| 83.733%
Epoch 1400, |e| 11.038 |et| 14.967 |du0| 0.924 |num.eq| 83.798%
Epoch 1600, |e| 10.951 |et| 14.832 |du0| 0.441 |num.eq| 84.522%
Epoch 1800, |e| 10.828 |et| 14.663 |du0| 0.441 |num.eq| 85.000%
Epoch 1999, |e| 10.862 |et| 15.166 |du0| 0.341 |num.eq| 84.690%


In [None]:
# Three-layer teacher/student experiment on sparse binary data. The output
# layer learns from the output error; both hidden layers learn from the
# target projected backwards through fixed random weights (W1_fb, then W0_fb).
np.random.seed(10)

# f, f_prime = sigmoid, sigmoid_prime
f, f_prime = threshold, threshold_prime

# Fixed random "teacher" weights, used only to generate the targets below.
W_orig_0 = np.random.randn(20, 100)
W_orig_1 = np.random.randn(100, 100)
W_orig_2 = np.random.randn(100, 10)

# train data
# Inputs are 1.0 where a uniform draw is below 0.1, else 0.0.
X = (np.random.random((2000, 20)) < 0.1).astype(np.float32)
Y = f(np.dot(f(np.dot(f(np.dot(X, W_orig_0)), W_orig_1)), W_orig_2))

# test data
Xt = (np.random.random((200, 20)) < 0.1).astype(np.float32)
Yt = f(np.dot(f(np.dot(f(np.dot(Xt, W_orig_0)), W_orig_1)), W_orig_2))

# Xt, Yt = X, Y

# X = np.asarray([
#     [0.0, 0.0],
#     [0.0, 1.0],
#     [1.0, 0.0],
#     [1.0, 1.0]
# ], dtype=np.float32)
# Yt = one_hot_encode(np.asarray([
#     [0.0],
#     [1.0],
#     [1.0],
#     [0.0]
# ], dtype=np.float32), 2)


input_dim = X.shape[1]
output_dim = Yt.shape[1]
batch_size = 200
# Integer division: samples beyond the last full batch are never visited.
number_of_train_batches = X.shape[0] // batch_size
number_of_test_batches = Xt.shape[0] // batch_size

hidden_dim = 300
epochs = 2000
init_learning_rate = 0.0005

# Student weights. W1_fb maps the target into the second hidden space and
# W0_fb maps that into the first; neither is updated in this cell, and
# b0_fb/b1_fb are not read in this cell.
W0, b0 = weights_init(input_dim, hidden_dim)
W1, b1 = weights_init(hidden_dim, hidden_dim)
W2, b2 = weights_init(hidden_dim, output_dim)
W0_fb, b0_fb = weights_init(hidden_dim, hidden_dim)
W1_fb, b1_fb = weights_init(output_dim, hidden_dim)

# Snapshots of the initial hidden weights (not read again in this cell).
W0_start = W0.copy()
W1_start = W1.copy()

for epoch in range(epochs):
    # Reshuffle the training set; X and Y are rebound to the permuted arrays.
    permute_ids = np.random.permutation(X.shape[0])
    X = X[permute_ids, :]
    Y = Y[permute_ids, :]

    # Per-epoch accumulators; divided by the batch counts when printed.
    e_avg, et_avg = 0.0, 0.0
    du0_avg, du1_avg = 0.0, 0.0
    num_eq_avg = 0.0
    # Learning rate decays as init_learning_rate / sqrt(epoch + 1).
    learning_rate = (float(init_learning_rate) / (epoch + 1) ** 0.5)
#     learning_rate = init_learning_rate * (1.0 - epoch / epochs)
    for i in range(number_of_train_batches):
        x = X[i*batch_size:(i+1)*batch_size,:]
        y = Y[i*batch_size:(i+1)*batch_size,:]

        # Forward pass. NOTE(review): b0/b1/b2 are updated below but never
        # added here or in the test pass - confirm whether biases were intended.
        u0 = np.dot(x, W0)
        y0 = f(u0)

        u1 = np.dot(y0, W1)
        y1 = f(u1)

        u2 = np.dot(y1, W2)
        y2 = f(u2)

        e = y - y2

        # Target projected backwards through the fixed feedback weights.
        y1_fb = f(np.dot(y, W1_fb)) 
        y0_fb = f(np.dot(y1_fb, W0_fb)) 

        du2 = e
        # BP
#         du1 = np.dot(du2, W2.T) * f_prime(u1)
#         du0 = np.dot(du1, W1.T) * f_prime(u0)
        
#         du0 = np.dot(du1, W0_fb) * f_prime(u0) # FA

        # FB
        # Hidden signals y * (y_fb - 1), scaled by f_prime(u): with 0/1
        # activations each is non-zero only where the unit is on and its
        # feedback counterpart is off.
        du1 = (y1 * y1_fb - y1) * f_prime(u1)
        du0 = (y0 * y0_fb - y0) * f_prime(u0)

        dW2 = np.dot(y1.T, du2)
        db2 = np.sum(du2, 0)

        dW1 = np.dot(y0.T, du1)
        db1 = np.sum(du1, 0)

        dW0 = np.dot(x.T, du0)
        db0 = np.sum(du0, 0)

        # In-place updates (the signals are added, not subtracted).
        W0 += learning_rate * dW0
        b0 += learning_rate * db0
        
        W1 += learning_rate * dW1
        b1 += learning_rate * db1

        W2 += learning_rate * dW2
        b2 += learning_rate * db2
        
        e_avg += np.linalg.norm(e)
        du0_avg += np.linalg.norm(du0)
        du1_avg += np.linalg.norm(du1)
        
    # Evaluate on the held-out data with the weights reached after this epoch.
    for i in range(number_of_test_batches):
        xt = Xt[i*batch_size:(i+1)*batch_size,:]
        yt = Yt[i*batch_size:(i+1)*batch_size,:]

        yt0 = f(np.dot(xt, W0))
        yt1 = f(np.dot(yt0, W1))
        yt2 = f(np.dot(yt1, W2))

        et = yt - yt2
        et_avg += np.linalg.norm(et)
        # NOTE(review): number_of_equal_act is declared as (a, a_t) and
        # normalises by mean(a_t); here the prediction yt2 is passed as a_t -
        # confirm the argument order is intended.
        num_eq_avg += number_of_equal_act(yt, yt2)
        
    # Report on every tenth of the run and on the final epoch.
    if (epoch % (epochs // 10)) == 0 or epoch == 0 or epoch == epochs - 1:
        print("Epoch {}, |e| {:.3f} |et| {:.3f} |du0| {:.3f} |du1| {:.3f} |num.eq| {:.3f}%".format(
            epoch, 
            e_avg /  number_of_train_batches,
            et_avg /  number_of_test_batches,
            du0_avg /  number_of_train_batches,
            du1_avg /  number_of_train_batches,
            100.0 * num_eq_avg / number_of_test_batches,
        ))

Epoch 0, |e| 22.602 |et| 21.541 |du0| 95.123 |du1| 23.305 |num.eq| 75.660%
Epoch 200, |e| 15.957 |et| 17.916 |du0| 5.210 |du1| 2.540 |num.eq| 83.943%
Epoch 400, |e| 16.766 |et| 17.607 |du0| 3.694 |du1| 1.921 |num.eq| 83.514%
Epoch 600, |e| 16.243 |et| 17.234 |du0| 3.059 |du1| 1.789 |num.eq| 85.670%
Epoch 800, |e| 15.920 |et| 17.117 |du0| 2.538 |du1| 1.089 |num.eq| 84.320%
Epoch 1000, |e| 16.222 |et| 17.146 |du0| 2.151 |du1| 1.111 |num.eq| 83.836%


In [20]:
# Full-batch variant of the three-layer feedback experiment: one batch of 100
# samples, constant learning rate, no held-out set. Seeding is disabled, so
# every run draws new data and weights.
# np.random.seed(10)

# f, f_prime = sigmoid, sigmoid_prime
f, f_prime = threshold, threshold_prime

# Fixed random "teacher" weights, used only to generate the targets Yt.
W_orig_0 = np.random.randn(20, 100)
W_orig_1 = np.random.randn(100, 100)
W_orig_2 = np.random.randn(100, 10)
# Inputs are 1.0 where a uniform draw is below 0.1, else 0.0.
X = (np.random.random((100, 20)) < 0.1).astype(np.float32)
Yt = f(np.dot(f(np.dot(f(np.dot(X, W_orig_0)), W_orig_1)), W_orig_2))

# X = np.asarray([
#     [0.0, 0.0],
#     [0.0, 1.0],
#     [1.0, 0.0],
#     [1.0, 1.0]
# ], dtype=np.float32)
# Yt = one_hot_encode(np.asarray([
#     [0.0],
#     [1.0],
#     [1.0],
#     [0.0]
# ], dtype=np.float32), 2)


input_dim = X.shape[1]
output_dim = Yt.shape[1]
batch_size = X.shape[0]

hidden_dim = 300
epochs = 200
learning_rate = 0.001

# Student weights. W1_fb maps the target into the second hidden space and
# W0_fb maps that into the first; neither is updated in this cell.
W0, b0 = weights_init(input_dim, hidden_dim)
W1, b1 = weights_init(hidden_dim, hidden_dim)
W2, b2 = weights_init(hidden_dim, output_dim)
W0_fb, b0_fb = weights_init(hidden_dim, hidden_dim)
W1_fb, b1_fb = weights_init(output_dim, hidden_dim)

# Snapshots of the initial hidden weights (not read again in this cell).
W0_start = W0.copy()
W1_start = W1.copy()

for epoch in range(epochs):
    # Forward pass. NOTE(review): b0/b1/b2 are updated below but never added
    # here - confirm whether bias terms were intended.
    u0 = np.dot(X, W0)
    y0 = f(u0)

    u1 = np.dot(y0, W1)
    y1 = f(u1)

    u2 = np.dot(y1, W2)
    y2 = f(u2)

    e = Yt - y2

    # Target projected backwards through the fixed feedback weights.
    y1_fb = f(np.dot(Yt, W1_fb))
    y0_fb = f(np.dot(y1_fb, W0_fb))

    du2 = e
    # Alternative hidden-layer signals, disabled. As live statements the du0
    # lines read du1 before this cell assigns it, so the cell only ran when a
    # compatible du1 had leaked from an earlier execution, and their result
    # was overwritten by the FB rule below in any case.
    # BP: du1 = np.dot(du2, W2.T) * f_prime(u1)
    #     du0 = np.dot(du1, W1.T) * f_prime(u0)
    # FA: du0 = np.dot(du1, W0_fb) * f_prime(u0)

    # FB: y * (y_fb - 1); with 0/1 activations each signal is non-zero only
    # where the unit is on and its feedback counterpart is off. The f_prime
    # factor is left disabled here.
    du1 = (y1 * y1_fb - y1)# * f_prime(u1)
    du0 = (y0 * y0_fb - y0)# * f_prime(u0)

    dW2 = np.dot(y1.T, du2)
    db2 = np.sum(du2, 0)

    dW1 = np.dot(y0.T, du1)
    db1 = np.sum(du1, 0)

    dW0 = np.dot(X.T, du0)
    db0 = np.sum(du0, 0)

    # In-place updates (the signals are added, not subtracted).
    W0 += learning_rate * dW0
    b0 += learning_rate * db0
    W1 += learning_rate * dW1
    b1 += learning_rate * db1
    W2 += learning_rate * dW2
    b2 += learning_rate * db2

    # Report on every tenth of the run and on the final epoch.
    if (epoch % (epochs // 10)) == 0 or epoch == 0 or epoch == epochs - 1:
        print("Epoch {}, |e| {:.3f} |du0| {:.3f} |du1| {:.3f}".format(
            epoch,
            np.linalg.norm(e),
            np.linalg.norm(du0),
            np.linalg.norm(du1)
        ))

Epoch 0, |e| 21.932 |du0| 82.480 |du1| 81.240
Epoch 20, |e| 14.036 |du0| 35.071 |du1| 5.916
Epoch 40, |e| 11.874 |du0| 17.607 |du1| 3.000
Epoch 60, |e| 9.381 |du0| 9.747 |du1| 2.000
Epoch 80, |e| 8.832 |du0| 6.325 |du1| 2.236
Epoch 100, |e| 6.481 |du0| 3.162 |du1| 0.000
Epoch 120, |e| 3.464 |du0| 0.000 |du1| 0.000
Epoch 140, |e| 0.000 |du0| 0.000 |du1| 0.000
Epoch 160, |e| 0.000 |du0| 0.000 |du1| 0.000
Epoch 180, |e| 0.000 |du0| 0.000 |du1| 0.000
Epoch 199, |e| 0.000 |du0| 0.000 |du1| 0.000
