In [1]:
import numpy as np

# Fix the global RNG state so the weight initialisation below is reproducible.
np.random.seed(1)

# Layer widths (input -> hidden -> output) and the gradient-descent step size.
input_size, hidden_size, output_size = 1, 4, 1
learning_rate = 0.01

# Weight matrices: small Gaussian draws. The draw order (Wxh, Whh, Why) is
# deliberate -- it decides which random numbers each matrix receives.
Wxh = 0.01 * np.random.randn(hidden_size, input_size)    # input  -> hidden
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)   # hidden -> hidden
Why = 0.01 * np.random.randn(output_size, hidden_size)   # hidden -> output

# Bias column vectors start at zero.
bh = np.zeros(shape=(hidden_size, 1))
by = np.zeros(shape=(output_size, 1))


In [2]:
# Toy sequence task: at every time step the target is twice the input.
# Each step is stored as its own (1, 1) array.
X = [np.array([[value]]) for value in (1, 2, 3)]   # inputs
Y = [np.array([[value]]) for value in (2, 4, 6)]   # targets



In [3]:
# Forward pass over the whole sequence. Every hidden state and output is cached
# by time step so the backward pass can reuse them.
hs, ys = {}, {}
h_prev = np.zeros((hidden_size, 1))   # the recurrence starts from an all-zero state

for t, x_t in enumerate(X):
    pre_activation = Wxh @ x_t + Whh @ h_prev + bh
    h_prev = np.tanh(pre_activation)   # new hidden state, carried into step t + 1
    hs[t] = h_prev
    ys[t] = Why @ h_prev + by          # linear read-out of the hidden state


In [4]:
# Sum of squared errors across all time steps. The running total becomes an
# array after the first addition, hence the [0][0] indexing when printing.
loss = 0
for t, target in enumerate(Y):
    loss = loss + np.square(ys[t] - target)

print("Loss:", loss[0][0])


Loss: 56.01962908037016


# Backpropagation Through Time (BPTT)

In [5]:
# Gradient accumulators: one per parameter, zero-filled and shaped like the
# parameter it belongs to.
dWxh, dWhh, dWhy, dbh, dby = (
    np.zeros_like(param) for param in (Wxh, Whh, Why, bh, by)
)

# Gradient reaching the hidden state from the following time step. The backward
# pass begins at the last step, so this starts at zero.
dh_next = np.zeros_like(h_prev)


In [6]:
# Walk the sequence from the last step back to the first, adding each step's
# contribution to the gradient accumulators.
for t in range(len(X) - 1, -1, -1):
    # Output error. Note this is the derivative of 0.5 * (y - target)**2, i.e.
    # half the derivative of the plain squared error that is printed as the loss.
    dy = ys[t] - Y[t]
    dWhy += dy @ hs[t].T
    dby += dy

    # Hidden-state gradient: read-out contribution plus what flows back from t + 1.
    dh = Why.T @ dy + dh_next
    # Through the tanh: d/dz tanh(z) = 1 - tanh(z)**2, and hs[t] already holds tanh(z).
    dtanh = dh * (1 - hs[t] ** 2)

    dbh += dtanh
    dWxh += dtanh @ X[t].T
    # hs has no entry before step 0, so the first step adds nothing to dWhh.
    if t > 0:
        dWhh += dtanh @ hs[t - 1].T

    # Hand the gradient to the previous time step.
    dh_next = Whh.T @ dtanh


In [7]:
# One gradient-descent step. `-=` on a NumPy array modifies it in place, so
# updating through the loop variable changes the parameter arrays themselves.
for param, grad in (
    (Wxh, dWxh),
    (Whh, dWhh),
    (Why, dWhy),
    (bh, dbh),
    (by, dby),
):
    param -= learning_rate * grad


In [8]:
for epoch in range(1000):
    # ---- forward pass: every epoch restarts from an all-zero hidden state ----
    hs, ys = {}, {}
    h_prev = np.zeros((hidden_size, 1))
    loss = 0

    for t, x_t in enumerate(X):
        h_prev = np.tanh(Wxh @ x_t + Whh @ h_prev + bh)
        hs[t] = h_prev
        ys[t] = Why @ h_prev + by
        loss = loss + np.square(ys[t] - Y[t])   # running sum of squared errors

    # ---- backward pass (BPTT): fresh zeroed accumulators every epoch ----
    dWxh, dWhh, dWhy, dbh, dby = (
        np.zeros_like(param) for param in (Wxh, Whh, Why, bh, by)
    )
    dh_next = np.zeros_like(h_prev)

    for t in range(len(X) - 1, -1, -1):
        # Derivative of 0.5 * squared error (half that of the loss summed above).
        dy = ys[t] - Y[t]
        dWhy += dy @ hs[t].T
        dby += dy
        dh = Why.T @ dy + dh_next
        dtanh = dh * (1 - hs[t] ** 2)   # tanh'(z) = 1 - tanh(z)**2
        dbh += dtanh
        dWxh += dtanh @ X[t].T
        # hs has no entry before step 0, so the first step adds nothing to dWhh.
        if t > 0:
            dWhh += dtanh @ hs[t - 1].T
        dh_next = Whh.T @ dtanh

    # ---- update: `-=` mutates each parameter array in place ----
    for param, grad in (
        (Wxh, dWxh),
        (Whh, dWhh),
        (Why, dWhy),
        (bh, dbh),
        (by, dby),
    ):
        param -= learning_rate * grad

    # The reported loss was measured before this epoch's update was applied.
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss[0][0]}")


Epoch 0, Loss: 53.17024697852227
Epoch 100, Loss: 1.1758970565815385
Epoch 200, Loss: 0.03515377404884714
Epoch 300, Loss: 0.005957144536961547
Epoch 400, Loss: 0.0024095217195119053
Epoch 500, Loss: 0.00099951940562552
Epoch 600, Loss: 0.0004127753103659227
Epoch 700, Loss: 0.00016994859019575496
Epoch 800, Loss: 6.983990646036436e-05
Epoch 900, Loss: 2.8666615553039435e-05
