In [75]:
import functools

import numpy as np
from matplotlib import pyplot as plt
np.seterr('raise')

{'divide': 'raise', 'over': 'raise', 'under': 'raise', 'invalid': 'raise'}

In [None]:
DEFAULT_FIG_SIZE = (15, 10)

def plot_wrapper(fn):
    """Decorate a single-panel plotting function so it can draw several panels.

    The returned wrapper calls ``fn(a, **kwargs)`` once for every positional
    argument, each inside its own ``plt.subplot`` slot of an
    ``len(args) x ncols`` grid.

    Keyword arguments read by the wrapper (all of them are also forwarded to
    ``fn`` unchanged):
        ncols:   number of subplot columns (default 1).
        id:      zero-based index of this call within a multi-call figure
                 (default 0). A new figure is created only when ``id == 0``.
        figsize: size of the newly created figure (default DEFAULT_FIG_SIZE).
        file:    if truthy, the figure is saved to this path and then closed
                 instead of being shown.
        show:    if true (default) and this is the last call
                 (``id + 1 == ncols``), ``plt.show()`` is called.
    """
    @functools.wraps(fn)  # keep fn's __name__ / __doc__ on the decorated function
    def wrapper(*args, **kwargs):
        ncols = kwargs.get("ncols", 1)
        call_id = kwargs.get("id", 0)  # local name avoids shadowing the builtin id()
        nrows = len(args)

        if call_id == 0:
            plt.figure(figsize=kwargs.get("figsize", DEFAULT_FIG_SIZE))

        for a_id, a in enumerate(args):
            # Slots are numbered consecutively across calls: call number
            # `call_id` fills slots nrows*call_id + 1 .. nrows*(call_id + 1).
            plt.subplot(nrows, ncols, nrows * call_id + a_id + 1)
            fn(a, **kwargs)

        if kwargs.get("file"):
            plt.savefig(kwargs["file"])
            # Close (not just clear) the saved figure so repeated calls do not
            # accumulate open figures.
            plt.close()
        elif kwargs.get("show", True) and call_id + 1 == ncols:
            plt.show()

    return wrapper



@plot_wrapper
def shm(matrix, **kwargs):
    """Draw one matrix as a grayscale image with a colour bar.

    Singleton dimensions are squeezed away and the result is transposed
    before display; the image origin is placed at the lower-left corner.
    ``kwargs`` are accepted only because ``plot_wrapper`` forwards its
    keyword arguments; they are not used here.
    """
    image = np.squeeze(matrix).T
    plt.imshow(image, cmap='gray', origin='lower')
    plt.colorbar()
    
def shl(*vector, **kwargs):
    """Plot one or more series as lines on a single new figure.

    Each positional argument is squeezed with ``np.squeeze`` and passed to
    ``plt.plot``.

    Keyword arguments:
        figsize: figure size (default DEFAULT_FIG_SIZE).
        labels:  sequence of legend labels, matched to the series by
                 position. Series without a matching label are drawn
                 unlabelled. A legend is shown when ``labels`` is non-empty.
        title:   figure-level title (``plt.suptitle``); skipped when None.
        file:    if truthy, save the figure to this path and close it
                 instead of showing it.
        show:    if true (default) and no ``file`` is given, call
                 ``plt.show()``.
    """
    fig = plt.figure(figsize=kwargs.get("figsize", DEFAULT_FIG_SIZE))

    labels = kwargs.get("labels", [])
    for idx, v in enumerate(vector):
        if idx < len(labels):
            plt.plot(np.squeeze(v), label=labels[idx])
        else:
            # No label supplied for this series: plot it without one rather
            # than failing with an IndexError.
            plt.plot(np.squeeze(v))

    if len(labels) > 0:
        plt.legend()

    title = kwargs.get("title")
    if title is not None:
        plt.suptitle(title)

    if kwargs.get("file"):
        plt.savefig(kwargs["file"])
        # Close the figure after saving; clearing alone would leave one open
        # figure behind per call.
        plt.close(fig)
    elif kwargs.get("show", True):
        plt.show()

def relu(x):
    """Rectified linear unit: element-wise ``max(x, 0)``."""
    return np.maximum(x, 0.0)


def relu_prime(x):
    """Derivative of ``relu``: 1.0 where ``x > 0`` and 0.0 elsewhere."""
    return np.where(x > 0.0, 1.0, 0.0)

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    The naive formula overflows in ``exp(-x)`` for large negative ``x`` and
    underflows for large positive ``x``; with ``np.seterr('raise')`` active
    both turn into ``FloatingPointError``. Here only ``exp(-|x|)`` is
    computed, which lies in (0, 1], and the two algebraically equivalent
    branches are selected by the sign of ``x``.

    Args:
        x: scalar or array-like of real numbers.

    Returns:
        Values in [0, 1] with the same shape as ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) may underflow to 0 for huge |x|; that is the correct
    # saturated value, so underflow is silenced locally.
    with np.errstate(under="ignore"):
        z = np.exp(-np.abs(x))
        out = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # n-d arrays unchanged.
    return out[()]


def sigmoid_prime(x):
    """Derivative of ``sigmoid``: ``s * (1 - s)`` with ``s = sigmoid(x)``."""
    v = sigmoid(x)
    return v * (1.0 - v)

# Upper bound used by threshold_prime; read at call time, so it can be
# changed between runs.
t_clip_value = 1.0


def threshold(x):
    """Step activation: 1.0 where ``x > 0`` and 0.0 elsewhere."""
    return np.where(x > 0.0, 1.0, 0.0)


def threshold_prime(x):
    """Surrogate derivative used with ``threshold``.

    Returns 1.0 where ``max(x, 0) <= t_clip_value`` (i.e. for every
    ``x <= t_clip_value`` when the clip value is non-negative) and 0.0
    elsewhere.
    """
    # np.maximum(x, 0.0) is the ReLU of x, written inline so this block does
    # not depend on another definition.
    return np.where(np.maximum(x, 0.0) <= t_clip_value, 1.0, 0.0)

# def threshold_prime(x, threshold_value = 0.0):
#     return 1.0/np.square(1.0 + np.abs(x - threshold_value))

def one_hot_encode(target_v, size=None):
    """Build a 0/1 indicator matrix from an array of class values.

    Column ``k`` corresponds to the ``k``-th smallest distinct value found in
    ``target_v``; a row gets a 1.0 in that column when the value occurs in
    that row of ``target_v``.

    Args:
        target_v: array whose first axis indexes samples.
        size: number of output columns; defaults to the number of distinct
            values in ``target_v``.

    Returns:
        Float array of shape ``(target_v.shape[0], size)``.
    """
    classes = np.unique(target_v)
    width = len(classes) if size is None else size
    encoded = np.zeros((target_v.shape[0], width))

    for column, value in enumerate(classes):
        rows = np.nonzero(target_v == value)[0]
        encoded[rows, column] = 1.0

    return encoded

def weights_init(fan_in, fan_out, const=1.0):
    """Draw a weight matrix and a bias vector from a symmetric uniform range.

    Both are sampled uniformly from ``[-limit, limit)`` with
    ``limit = const * sqrt(6 / (fan_in + fan_out))`` using the global
    ``np.random`` state (weights are drawn first, then biases).

    Returns:
        ``(weights, biases)`` as float32 arrays of shape
        ``(fan_in, fan_out)`` and ``(fan_out,)``.
    """
    limit = const * np.sqrt(6.0 / (fan_in + fan_out))
    low, high = -limit, limit
    span = high - low

    def _draw(shape):
        # Map samples from [0, 1) onto [low, high).
        return (low + np.random.random(shape) * span).astype(np.float32)

    weights = _draw((fan_in, fan_out))
    biases = _draw((fan_out,))
    return (weights, biases)

def number_of_equal_act(a, a_t):
    """Score how well the non-zero entries of ``a`` reproduce those of ``a_t``.

    Near-zero entries (|value| < 1e-10) are replaced by different sentinels
    in the two copies (-1 for the target, -10 for the prediction), so an
    element only counts as a match when both arrays hold the same non-zero
    value there. The inputs themselves are not modified.

    Returns:
        ``mean(matches) / mean(a_t)`` when ``mean(a_t)`` is non-zero.
        When ``mean(a_t)`` is exactly zero: 1.0 if there are no matches,
        otherwise 0.0.
    """
    eps = 1e-10

    target = a_t.copy()
    target[np.abs(target) < eps] = -1

    predicted = a.copy()
    predicted[np.abs(predicted) < eps] = -10

    match_rate = np.mean(np.equal(target, predicted))
    target_rate = np.mean(a_t)

    if target_rate != 0.0:
        return match_rate / target_rate
    # Division is impossible here; report a perfect score only when nothing
    # matched either.
    return 1.0 if match_rate == 0.0 else 0.0

    
def feedback_filter(y, y_fb, ltp=1.0, ltd=-1.0):
    """Scale ``y`` element-wise by ``ltp`` or ``ltd``.

    Elements where ``y`` equals ``y_fb`` are multiplied by ``ltp``; all
    other elements are multiplied by ``ltd``.
    """
    gain = np.where(np.equal(y, y_fb), ltp, ltd)
    return y * gain
#     return y * (y_fb - 1.0)



In [None]:
# np.random.seed(1)

# Single-layer experiment: fit W so that f(X @ W) reproduces targets produced
# by a randomly initialised "teacher" matrix W_orig through the same
# activation f.
input_dim = 20
output_dim = 3
batch_size = 100
epochs = 500
learning_rate = 0.01

# f, f_prime = sigmoid, sigmoid_prime
f, f_prime = threshold, threshold_prime

W_orig, _ = weights_init(input_dim, output_dim)

# Sparse binary inputs: each entry is 1 with probability 0.1.
X = (np.random.random((batch_size, input_dim)) < 0.1).astype(np.float32)
Yt = f(np.dot(X, W_orig))

W, _ = weights_init(input_dim, output_dim)

# Report progress about ten times per run. max(1, ...) keeps the modulo
# valid when epochs < 10 (epochs // 10 would be 0 and raise
# ZeroDivisionError).
log_every = max(1, epochs // 10)

for epoch in range(epochs):
    u = np.dot(X, W)
    y = f(u)

    # Output error drives the update directly; f_prime is not used here.
    e = Yt - y

    dW = np.dot(X.T, e)

    W += learning_rate * dW

    if epoch % log_every == 0 or epoch == epochs - 1:
        print("Epoch {}, {:.3f}".format(epoch, np.linalg.norm(e)))

In [None]:
# np.random.seed(1)

# Single-layer experiment on the four 2-bit inputs with XOR class labels,
# one-hot encoded into two output columns.
input_dim = 2
output_dim = 2
batch_size = 4
epochs = 500
learning_rate = 0.01

# f, f_prime = sigmoid, sigmoid_prime
f, f_prime = threshold, threshold_prime

X = np.asarray([
    [0.0, 0.0],
    [0.0, 1.0],
    [1.0, 0.0],
    [1.0, 1.0]
], dtype=np.float32)
Yt = one_hot_encode(np.asarray([
    [0.0],
    [1.0],
    [1.0],
    [0.0]
], dtype=np.float32), 2)


W, _ = weights_init(input_dim, output_dim)

# Report progress about ten times per run. max(1, ...) keeps the modulo
# valid when epochs < 10 (epochs // 10 would be 0 and raise
# ZeroDivisionError).
log_every = max(1, epochs // 10)

for epoch in range(epochs):
    u = np.dot(X, W)
    y = f(u)

    # Output error drives the update directly; f_prime is not used here.
    e = Yt - y

    dW = np.dot(X.T, e)

    W += learning_rate * dW

    if epoch % log_every == 0 or epoch == epochs - 1:
        print("Epoch {}, {:.3f}".format(epoch, np.linalg.norm(e)))
 

In [180]:
# Setup for the two-layer experiment: synthetic data from a random two-layer
# "teacher" network, followed by hyper-parameters and student weights.
# The order of the np.random calls below determines the generated data.
np.random.seed(10)

# f, f_prime = sigmoid, sigmoid_prime
f, f_prime = threshold, threshold_prime

# Teacher weights: 20 inputs -> 100 hidden -> 10 outputs.
W_orig_0 = np.random.randn(20, 100)
W_orig_1 = np.random.randn(100, 10)

# train data
# Sparse binary inputs (each entry is 1 with probability 0.1), labelled by
# passing them through the teacher network.
X = (np.random.random((2000, 20)) < 0.1).astype(np.float32)
Y = f(np.dot(f(np.dot(X, W_orig_0)), W_orig_1))

# test data
Xt = (np.random.random((200, 20)) < 0.1).astype(np.float32)
Yt = f(np.dot(f(np.dot(Xt, W_orig_0)), W_orig_1))

# Xt, Yt = X, Y

# X = np.asarray([
#     [0.0, 0.0],
#     [0.0, 1.0],
#     [1.0, 0.0],
#     [1.0, 1.0]
# ], dtype=np.float32)
# Yt = one_hot_encode(np.asarray([
#     [0.0],
#     [1.0],
#     [1.0],
#     [0.0]
# ], dtype=np.float32), 2)


input_dim = X.shape[1]
output_dim = Yt.shape[1]
batch_size = 200
# Integer division: samples beyond the last full batch are not used.
number_of_train_batches = X.shape[0] // batch_size
number_of_test_batches = Xt.shape[0] // batch_size

hidden_dim = 300
epochs = 1000
init_learning_rate = 0.001

# Student network weights (W0, W1) and a feedback matrix W0_fb that maps
# target outputs into the hidden layer's space.
W0, b0 = weights_init(input_dim, hidden_dim)
W1, b1 = weights_init(hidden_dim, output_dim)
W0_fb, b0_fb = weights_init(output_dim, hidden_dim)

# Snapshot of the initial first-layer weights.
W0_start = W0.copy()

# Report progress about ten times per run. max(1, ...) keeps the modulo
# valid when epochs < 10 (epochs // 10 would be 0 and raise
# ZeroDivisionError).
log_every = max(1, epochs // 10)

for epoch in range(epochs):
    # Reshuffle the training set every epoch (rebinds the globals X and Y).
    permute_ids = np.random.permutation(X.shape[0])
    X = X[permute_ids, :]
    Y = Y[permute_ids, :]

    # Per-epoch accumulators for the reported statistics.
    e_avg, et_avg = 0.0, 0.0
    du0_avg = 0.0
    num_eq_avg = 0.0

    # Learning rate decays with the inverse square root of the epoch number.
    learning_rate = init_learning_rate / (epoch + 1) ** 0.5

    for i in range(number_of_train_batches):
        x = X[i*batch_size:(i+1)*batch_size, :]
        y = Y[i*batch_size:(i+1)*batch_size, :]

        # Forward pass.
        u0 = np.dot(x, W0)
        y0 = f(u0)

        u1 = np.dot(y0, W1)
        y1 = f(u1)

        e = y - y1

        # Target outputs projected into the hidden layer through W0_fb.
        y0_fb = f(np.dot(y, W0_fb))

        du1 = e
        # Alternative hidden-layer signals that were tried:
        #   du0 = np.dot(du1, W1.T) * f_prime(u0)  # BP
        #   du0 = np.dot(du1, W0_fb) * f_prime(u0)  # FA
        # Active rule: hidden activations scaled by 0.1 where they equal the
        # feedback projection and by -1.0 where they differ.
        du0 = feedback_filter(y0, y0_fb, 0.1, -1.0) * f_prime(u0)

        dW1 = np.dot(y0.T, du1)
        db1 = np.sum(du1, 0)

        dW0 = np.dot(x.T, du0)
        db0 = np.sum(du0, 0)

        # NOTE(review): b0 and b1 are updated here, but neither forward pass
        # in this loop adds them, so they do not affect the outputs.
        # Confirm whether that is intended.
        W0 += learning_rate * dW0
        b0 += learning_rate * db0
        W1 += learning_rate * dW1
        b1 += learning_rate * db1

        e_avg += np.linalg.norm(e)
        du0_avg += np.linalg.norm(du0)

    # Evaluation on the held-out set with the current weights.
    for i in range(number_of_test_batches):
        xt = Xt[i*batch_size:(i+1)*batch_size, :]
        yt = Yt[i*batch_size:(i+1)*batch_size, :]

        yt0 = f(np.dot(xt, W0))
        yt1 = f(np.dot(yt0, W1))

        et = yt - yt1
        et_avg += np.linalg.norm(et)
        num_eq_avg += number_of_equal_act(yt, yt1)

    if epoch % log_every == 0 or epoch == epochs - 1:
        print("Epoch {}, |e| {:.3f} |et| {:.3f} |du0| {:.3f} |num.eq| {:.3f}%".format(
            epoch,
            e_avg / number_of_train_batches,
            et_avg / number_of_test_batches,
            du0_avg / number_of_train_batches,
            100.0 * num_eq_avg / number_of_test_batches,
        ))

Epoch 0, |e| 24.160 |et| 20.445 |du0| 83.561 |num.eq| 82.474%
Epoch 100, |e| 13.040 |et| 15.166 |du0| 18.751 |num.eq| 82.705%
Epoch 200, |e| 12.153 |et| 14.765 |du0| 19.049 |num.eq| 83.660%
Epoch 300, |e| 11.891 |et| 14.248 |du0| 19.040 |num.eq| 86.005%
Epoch 400, |e| 11.519 |et| 14.142 |du0| 18.726 |num.eq| 85.581%
Epoch 500, |e| 11.351 |et| 14.353 |du0| 18.942 |num.eq| 85.087%
Epoch 600, |e| 11.173 |et| 14.526 |du0| 18.642 |num.eq| 84.433%
Epoch 700, |e| 10.899 |et| 14.422 |du0| 18.535 |num.eq| 85.330%
Epoch 800, |e| 10.807 |et| 13.928 |du0| 18.438 |num.eq| 87.172%
Epoch 900, |e| 10.457 |et| 14.283 |du0| 18.476 |num.eq| 86.283%
Epoch 999, |e| 10.497 |et| 13.784 |du0| 18.633 |num.eq| 86.541%


In [185]:
# Setup for the three-layer experiment: synthetic data from a random
# three-layer "teacher" network, followed by hyper-parameters and student
# weights. The order of the np.random calls below determines the data.
np.random.seed(10)

# f, f_prime = sigmoid, sigmoid_prime
f, f_prime = threshold, threshold_prime

# Teacher weights: 20 inputs -> 100 hidden -> 100 hidden -> 10 outputs.
W_orig_0 = np.random.randn(20, 100)
W_orig_1 = np.random.randn(100, 100)
W_orig_2 = np.random.randn(100, 10)

# train data
# Sparse binary inputs (each entry is 1 with probability 0.1), labelled by
# passing them through the teacher network.
X = (np.random.random((2000, 20)) < 0.1).astype(np.float32)
Y = f(np.dot(f(np.dot(f(np.dot(X, W_orig_0)), W_orig_1)), W_orig_2))

# test data
Xt = (np.random.random((200, 20)) < 0.1).astype(np.float32)
Yt = f(np.dot(f(np.dot(f(np.dot(Xt, W_orig_0)), W_orig_1)), W_orig_2))

# Xt, Yt = X, Y

# X = np.asarray([
#     [0.0, 0.0],
#     [0.0, 1.0],
#     [1.0, 0.0],
#     [1.0, 1.0]
# ], dtype=np.float32)
# Yt = one_hot_encode(np.asarray([
#     [0.0],
#     [1.0],
#     [1.0],
#     [0.0]
# ], dtype=np.float32), 2)


input_dim = X.shape[1]
output_dim = Yt.shape[1]
batch_size = 200
# Integer division: samples beyond the last full batch are not used.
number_of_train_batches = X.shape[0] // batch_size
number_of_test_batches = Xt.shape[0] // batch_size

hidden_dim = 300
epochs = 2000
init_learning_rate = 0.0005

# Student network weights (W0, W1, W2) and feedback matrices: W1_fb maps
# target outputs into the second hidden layer's space, W0_fb maps that
# projection on into the first hidden layer's space.
W0, b0 = weights_init(input_dim, hidden_dim)
W1, b1 = weights_init(hidden_dim, hidden_dim)
W2, b2 = weights_init(hidden_dim, output_dim)
W0_fb, b0_fb = weights_init(hidden_dim, hidden_dim)
W1_fb, b1_fb = weights_init(output_dim, hidden_dim)

# Snapshots of the initial hidden-layer weights.
W0_start = W0.copy()
W1_start = W1.copy()

# Report progress about ten times per run. max(1, ...) keeps the modulo
# valid when epochs < 10 (epochs // 10 would be 0 and raise
# ZeroDivisionError).
log_every = max(1, epochs // 10)

for epoch in range(epochs):
    # Reshuffle the training set every epoch (rebinds the globals X and Y).
    permute_ids = np.random.permutation(X.shape[0])
    X = X[permute_ids, :]
    Y = Y[permute_ids, :]

    # Per-epoch accumulators for the reported statistics.
    e_avg, et_avg = 0.0, 0.0
    du0_avg, du1_avg = 0.0, 0.0
    num_eq_avg = 0.0

    # Learning rate decays with the inverse square root of the epoch number.
    learning_rate = init_learning_rate / (epoch + 1) ** 0.5

    for i in range(number_of_train_batches):
        x = X[i*batch_size:(i+1)*batch_size, :]
        y = Y[i*batch_size:(i+1)*batch_size, :]

        # Forward pass.
        u0 = np.dot(x, W0)
        y0 = f(u0)

        u1 = np.dot(y0, W1)
        y1 = f(u1)

        u2 = np.dot(y1, W2)
        y2 = f(u2)

        e = y - y2

        # Target outputs projected back through the feedback matrices:
        # first into the second hidden layer, then into the first.
        y1_fb = f(np.dot(y, W1_fb))
        y0_fb = f(np.dot(y1_fb, W0_fb))

        du2 = e
        # BP alternative that was tried:
        #   du1 = np.dot(du2, W2.T) * f_prime(u1)
        #   du0 = np.dot(du1, W1.T) * f_prime(u0)

        # FB (active rule): hidden activations scaled by 1.0 where they equal
        # the feedback projection and by -1.0 where they differ.
        du1 = feedback_filter(y1, y1_fb, 1.0, -1.0) * f_prime(u1)
        du0 = feedback_filter(y0, y0_fb, 1.0, -1.0) * f_prime(u0)

        dW2 = np.dot(y1.T, du2)
        db2 = np.sum(du2, 0)

        dW1 = np.dot(y0.T, du1)
        db1 = np.sum(du1, 0)

        dW0 = np.dot(x.T, du0)
        db0 = np.sum(du0, 0)

        # NOTE(review): b0, b1 and b2 are updated here, but neither forward
        # pass in this loop adds them, so they do not affect the outputs.
        # Confirm whether that is intended.
        W0 += learning_rate * dW0
        b0 += learning_rate * db0

        W1 += learning_rate * dW1
        b1 += learning_rate * db1

        W2 += learning_rate * dW2
        b2 += learning_rate * db2

        e_avg += np.linalg.norm(e)
        du0_avg += np.linalg.norm(du0)
        du1_avg += np.linalg.norm(du1)

    # Evaluation on the held-out set with the current weights.
    for i in range(number_of_test_batches):
        xt = Xt[i*batch_size:(i+1)*batch_size, :]
        yt = Yt[i*batch_size:(i+1)*batch_size, :]

        yt0 = f(np.dot(xt, W0))
        yt1 = f(np.dot(yt0, W1))
        yt2 = f(np.dot(yt1, W2))

        et = yt - yt2
        et_avg += np.linalg.norm(et)
        num_eq_avg += number_of_equal_act(yt, yt2)

    if epoch % log_every == 0 or epoch == epochs - 1:
        print("Epoch {}, |e| {:.3f} |et| {:.3f} |du0| {:.3f} |du1| {:.3f} |num.eq| {:.3f}%".format(
            epoch,
            e_avg / number_of_train_batches,
            et_avg / number_of_test_batches,
            du0_avg / number_of_train_batches,
            du1_avg / number_of_train_batches,
            100.0 * num_eq_avg / number_of_test_batches,
        ))

Epoch 0, |e| 22.299 |et| 21.024 |du0| 121.501 |du1| 33.904 |num.eq| 73.158%
Epoch 200, |e| 16.360 |et| 18.628 |du0| 47.338 |du1| 16.572 |num.eq| 79.129%
Epoch 400, |e| 15.976 |et| 15.362 |du0| 45.174 |du1| 16.678 |num.eq| 88.069%
Epoch 600, |e| 15.117 |et| 17.263 |du0| 43.693 |du1| 16.666 |num.eq| 86.027%
Epoch 800, |e| 15.838 |et| 15.780 |du0| 42.282 |du1| 16.237 |num.eq| 85.783%
Epoch 1000, |e| 15.331 |et| 15.716 |du0| 41.458 |du1| 16.241 |num.eq| 89.892%
Epoch 1200, |e| 14.884 |et| 18.330 |du0| 40.370 |du1| 16.159 |num.eq| 83.556%
Epoch 1400, |e| 14.725 |et| 14.213 |du0| 40.264 |du1| 16.164 |num.eq| 91.640%
Epoch 1600, |e| 14.918 |et| 14.697 |du0| 39.316 |du1| 16.144 |num.eq| 87.781%
Epoch 1800, |e| 14.990 |et| 16.186 |du0| 38.781 |du1| 15.751 |num.eq| 88.223%
Epoch 1999, |e| 15.200 |et| 13.964 |du0| 38.452 |du1| 15.829 |num.eq| 92.597%


In [None]:
# seed 10
# FB filter 0.0, -1.0
# Epoch 0, |e| 22.074 |et| 20.952 |du0| 80.796 |du1| 23.690 |num.eq| 75.992%
# Epoch 200, |e| 16.260 |et| 17.635 |du0| 6.632 |du1| 2.803 |num.eq| 84.173%
# Epoch 400, |e| 16.076 |et| 17.550 |du0| 4.653 |du1| 2.613 |num.eq| 86.032%
# Epoch 600, |e| 15.545 |et| 17.833 |du0| 3.869 |du1| 1.917 |num.eq| 82.478%
# Epoch 800, |e| 15.772 |et| 16.823 |du0| 3.316 |du1| 1.875 |num.eq| 86.242%
# Epoch 1000, |e| 15.289 |et| 16.371 |du0| 2.616 |du1| 1.387 |num.eq| 85.658%
# Epoch 1200, |e| 15.629 |et| 18.111 |du0| 2.382 |du1| 1.107 |num.eq| 90.652%
# Epoch 1400, |e| 15.344 |et| 16.553 |du0| 2.185 |du1| 0.865 |num.eq| 88.565%
# Epoch 1600, |e| 14.979 |et| 16.941 |du0| 1.691 |du1| 1.258 |num.eq| 86.947%
# Epoch 1800, |e| 15.150 |et| 16.882 |du0| 1.710 |du1| 0.341 |num.eq| 85.341%
# Epoch 1999, |e| 15.714 |et| 17.889 |du0| 0.939 |du1| 0.683 |num.eq| 81.586%

# FB filter 0.1, -1.0 
# Epoch 0, |e| 22.095 |et| 20.149 |du0| 81.351 |du1| 24.139 |num.eq| 75.459%
# Epoch 200, |e| 17.560 |et| 18.947 |du0| 17.757 |du1| 6.499 |num.eq| 81.126%
# Epoch 400, |e| 17.149 |et| 16.340 |du0| 17.361 |du1| 7.026 |num.eq| 86.762%
# Epoch 600, |e| 16.833 |et| 16.793 |du0| 17.312 |du1| 6.853 |num.eq| 85.599%
# Epoch 800, |e| 16.683 |et| 15.684 |du0| 17.233 |du1| 7.015 |num.eq| 90.513%
# Epoch 1000, |e| 16.504 |et| 16.523 |du0| 17.033 |du1| 7.132 |num.eq| 86.831%
# Epoch 1200, |e| 15.598 |et| 15.811 |du0| 16.741 |du1| 7.070 |num.eq| 89.018%
# Epoch 1400, |e| 15.397 |et| 15.524 |du0| 16.829 |du1| 7.104 |num.eq| 85.019%
# Epoch 1600, |e| 15.152 |et| 15.395 |du0| 16.567 |du1| 6.922 |num.eq| 88.525%
# Epoch 1800, |e| 14.847 |et| 16.310 |du0| 16.567 |du1| 7.097 |num.eq| 88.498%
# Epoch 1999, |e| 16.272 |et| 16.793 |du0| 16.454 |du1| 6.935 |num.eq| 85.315%

# FB filter 1.0, -1.0
# Epoch 0, |e| 22.299 |et| 21.024 |du0| 121.501 |du1| 33.904 |num.eq| 73.158%
# Epoch 200, |e| 16.360 |et| 18.628 |du0| 47.338 |du1| 16.572 |num.eq| 79.129%
# Epoch 400, |e| 15.976 |et| 15.362 |du0| 45.174 |du1| 16.678 |num.eq| 88.069%
# Epoch 600, |e| 15.117 |et| 17.263 |du0| 43.693 |du1| 16.666 |num.eq| 86.027%
# Epoch 800, |e| 15.838 |et| 15.780 |du0| 42.282 |du1| 16.237 |num.eq| 85.783%
# Epoch 1000, |e| 15.331 |et| 15.716 |du0| 41.458 |du1| 16.241 |num.eq| 89.892%
# Epoch 1200, |e| 14.884 |et| 18.330 |du0| 40.370 |du1| 16.159 |num.eq| 83.556%
# Epoch 1400, |e| 14.725 |et| 14.213 |du0| 40.264 |du1| 16.164 |num.eq| 91.640%
# Epoch 1600, |e| 14.918 |et| 14.697 |du0| 39.316 |du1| 16.144 |num.eq| 87.781%
# Epoch 1800, |e| 14.990 |et| 16.186 |du0| 38.781 |du1| 15.751 |num.eq| 88.223%
# Epoch 1999, |e| 15.200 |et| 13.964 |du0| 38.452 |du1| 15.829 |num.eq| 92.597%

# BP
# Epoch 0, |e| 22.037 |et| 20.976 |du0| 20.434 |du1| 22.077 |num.eq| 83.192%
# Epoch 200, |e| 8.618 |et| 13.342 |du0| 5.416 |du1| 5.781 |num.eq| 90.918%
# Epoch 400, |e| 7.488 |et| 12.884 |du0| 4.358 |du1| 4.595 |num.eq| 91.692%
# Epoch 600, |e| 6.860 |et| 12.961 |du0| 3.979 |du1| 4.192 |num.eq| 92.189%
# Epoch 800, |e| 7.450 |et| 12.610 |du0| 4.205 |du1| 4.434 |num.eq| 92.089%
# Epoch 1000, |e| 6.760 |et| 12.884 |du0| 3.768 |du1| 3.963 |num.eq| 91.524%
# Epoch 1200, |e| 6.537 |et| 12.530 |du0| 3.575 |du1| 3.771 |num.eq| 92.710%
# Epoch 1400, |e| 6.038 |et| 12.410 |du0| 3.297 |du1| 3.467 |num.eq| 92.300%
# Epoch 1600, |e| 5.915 |et| 12.410 |du0| 3.338 |du1| 3.501 |num.eq| 92.300%
# Epoch 1800, |e| 5.491 |et| 12.083 |du0| 3.081 |du1| 3.232 |num.eq| 92.879%
# Epoch 1999, |e| 5.489 |et| 12.166 |du0| 2.987 |du1| 3.109 |num.eq| 92.952%