In [None]:
import numpy as np

In [None]:
def softmax(x, axis=1):
    """Convert raw scores into probabilities with a numerically stable softmax.

    Used as the activation function of the output layer.

    Parameters
    ----------
    x : array_like
        Raw scores. With the default ``axis=1`` the input needs at least two
        dimensions and every row is normalised independently.
    axis : int, optional
        Axis along which the probabilities sum to 1. Defaults to 1 (row-wise);
        pass ``axis=-1`` to normalise a 1-D score vector.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / exp_x.sum(axis=axis, keepdims=True)

In [None]:
def initialize_weights(input_dim, hidden_dim, output_dim, scale=0.01, rng=None):
    """Create randomly initialised weights and zero biases for the RNN.

    Parameters
    ----------
    input_dim, hidden_dim, output_dim : int
        Sizes of the input, hidden and output layers.
    scale : float, optional
        Factor applied to the standard-normal draws. Defaults to 0.01.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, the global ``np.random`` state is
        used, so ``np.random.seed`` controls the result.

    Returns
    -------
    tuple
        ``(Wh, Wx, Wy, bh, by)`` where
        ``Wh`` is ``(hidden_dim, hidden_dim)`` - recurrent hidden-to-hidden weights,
        ``Wx`` is ``(input_dim, hidden_dim)`` - input-to-hidden weights,
        ``Wy`` is ``(hidden_dim, output_dim)`` - hidden-to-output weights,
        ``bh`` is ``(1, hidden_dim)`` - hidden-layer bias,
        ``by`` is ``(1, output_dim)`` - output-layer bias.
    """
    def _normal(shape):
        # The default path uses the global RNG exactly as before so existing
        # seeded runs keep producing the same weights.
        if rng is None:
            return np.random.randn(*shape)
        return rng.standard_normal(shape)

    # Draw order (Wh, Wx, Wy) is part of the reproducible behaviour; keep it.
    Wh = _normal((hidden_dim, hidden_dim)) * scale
    Wx = _normal((input_dim, hidden_dim)) * scale
    Wy = _normal((hidden_dim, output_dim)) * scale
    bh = np.zeros((1, hidden_dim))
    by = np.zeros((1, output_dim))
    return Wh, Wx, Wy, bh, by

Forward pass

In [None]:
def rnn_forward(X, Wh, Wx, Wy, bh, by, h_prev):
    """Run the RNN over one sequence and collect hidden states and outputs.

    For every time step the hidden state is
    ``tanh(x_t . Wx + h_previous . Wh + bh)`` and the output is
    ``softmax(h_t . Wy + by)``.

    Parameters
    ----------
    X : 2-D array whose first axis is the time step.
    Wh, Wx, Wy : recurrent, input-to-hidden and hidden-to-output weights.
    bh, by : hidden and output biases.
    h_prev : hidden state fed into the first time step.

    Returns
    -------
    tuple
        ``(h, y)``: the hidden states with one row per time step and
        ``Wy.shape[0]`` columns, and the softmax outputs with one row per time
        step and ``Wy.shape[1]`` columns.
    """
    n_steps, _ = X.shape
    n_hidden, n_out = Wy.shape

    hidden_states = np.zeros((n_steps, n_hidden))
    outputs = np.zeros((n_steps, n_out))

    state = h_prev
    for step, x_t in enumerate(X):
        # Hidden state: tanh of the input and recurrent contributions plus bias.
        pre_activation = np.dot(x_t, Wx) + np.dot(state, Wh) + bh
        hidden_states[step] = np.tanh(pre_activation)
        # The stored row becomes the previous state for the next time step.
        state = hidden_states[step]

        # Output: softmax over the scores produced from the new hidden state.
        scores = np.dot(state, Wy) + by
        outputs[step] = softmax(scores)

    return hidden_states, outputs

Backward pass

In [None]:
def rnn_backward(X, y_true, y_pred, h, Wh, Wx, Wy, bh, by, h_prev=None):
    """Backpropagate through time for one sequence.

    Computes the cross-entropy loss ``-sum(y_true * log(y_pred + 1e-8))`` and
    its gradients with respect to every weight matrix and bias, assuming the
    forward pass ``h_t = tanh(x_t . Wx + h_{t-1} . Wh + bh)`` and
    ``y_t = softmax(h_t . Wy + by)``.

    Parameters
    ----------
    X : array, one row per time step
        Input sequence.
    y_true : array, one row per time step
        Target weights per class. One-hot rows are the usual case; rows that
        do not sum to 1 are handled with the exact gradient of the loss above.
    y_pred : array, one row per time step
        Softmax outputs of the forward pass.
    h : array, one row per time step
        Hidden states of the forward pass.
    Wh, Wx, Wy : arrays
        Recurrent, input-to-hidden and hidden-to-output weights.
    bh, by : arrays
        Biases; only their shape and dtype are used, to size the gradients.
    h_prev : array_like, optional
        Hidden state that was fed into the first time step. Defaults to all
        zeros, in which case the first step contributes nothing to ``dWh``.

    Returns
    -------
    tuple
        ``(dWh, dWx, dWy, dbh, dby, loss)``; every gradient has the shape of
        the parameter it belongs to.
    """
    T = X.shape[0]
    hidden_dim = Wy.shape[0]

    # Gradient accumulators, summed over all time steps.
    dWh = np.zeros_like(Wh)
    dWx = np.zeros_like(Wx)
    dWy = np.zeros_like(Wy)
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)

    if h_prev is None:
        initial_state = np.zeros(hidden_dim)
    else:
        initial_state = np.asarray(h_prev, dtype=float).reshape(-1)

    # Cross-entropy loss; the small constant guards against log(0).
    loss = -np.sum(y_true * np.log(y_pred + 1e-8))

    # Gradient flowing back into h[t] from time step t + 1.
    dh_next = np.zeros(hidden_dim)

    for t in reversed(range(T)):
        # d(loss)/d(scores) for softmax + cross-entropy. The sum(y_true[t])
        # factor equals 1 for one-hot targets, reducing to y_pred - y_true.
        dy = y_pred[t] * np.sum(y_true[t]) - y_true[t]
        dWy += np.outer(h[t], dy)
        dby += dy

        # The hidden state receives gradient from this step's output and from
        # the following time step.
        dh = np.dot(dy, Wy.T) + dh_next
        dh_raw = (1 - h[t] ** 2) * dh  # derivative of tanh is 1 - tanh^2

        dbh += dh_raw
        dWx += np.outer(X[t], dh_raw)
        state_before = h[t - 1] if t > 0 else initial_state
        dWh += np.outer(state_before, dh_raw)

        # Passed to the previous time step on the next loop iteration.
        dh_next = np.dot(dh_raw, Wh.T)

    return dWh, dWx, dWy, dbh, dby, loss

Training loop

In [None]:
def train_rnn(X_train, y_train, input_dim, hidden_dim, output_dim, epochs=1000, learning_rate=0.01):
    """Train the RNN with per-sequence gradient descent.

    Parameters
    ----------
    X_train, y_train : iterables of arrays
        Input sequences and their targets, paired up with ``zip``.
    input_dim, hidden_dim, output_dim : int
        Layer sizes passed to ``initialize_weights``.
    epochs : int, optional
        Number of full passes over the training data.
    learning_rate : float, optional
        Step size of each gradient-descent update.

    Returns
    -------
    tuple
        The trained parameters ``(Wh, Wx, Wy, bh, by)``.

    Every 100th epoch (starting with epoch 0) the mean per-sequence loss of
    that epoch is printed; each sequence's loss is measured before the update
    it triggers.
    """
    Wh, Wx, Wy, bh, by = initialize_weights(input_dim, hidden_dim, output_dim)
    # Every sequence starts from the same all-zero hidden state.
    h_prev = np.zeros((1, hidden_dim))

    # Materialise the pairs once so one-shot iterables survive every epoch.
    samples = list(zip(X_train, y_train))

    # An epoch is one full pass over all training sequences.
    for epoch in range(epochs):
        epoch_loss = 0.0
        for X, y_true in samples:
            h, y_pred = rnn_forward(X, Wh, Wx, Wy, bh, by, h_prev)
            dWh, dWx, dWy, dbh, dby, loss = rnn_backward(X, y_true, y_pred, h, Wh, Wx, Wy, bh, by)
            epoch_loss += loss

            # Gradient descent: move each parameter against its gradient.
            Wh -= learning_rate * dWh
            Wx -= learning_rate * dWx
            Wy -= learning_rate * dWy
            bh -= learning_rate * dbh
            by -= learning_rate * dby

        if epoch % 100 == 0:
            # Report the epoch average instead of only the last sequence's
            # loss; nan when there is no training data at all.
            mean_loss = epoch_loss / len(samples) if samples else float('nan')
            print(f'Epoch {epoch}, Loss: {mean_loss}')

    return Wh, Wx, Wy, bh, by

Example usage

In [None]:
# Seed the global NumPy RNG so re-running this cell reproduces the same data and initial weights
np.random.seed(42)

# Example data: 100 sequences of length 10 with 5 random features per time step (input_dim=5)
X_train = [np.random.rand(10, 5) for _ in range(100)]
# Targets: one one-hot row per time step over 3 classes (output_dim=3), so every row is a
# valid class distribution for the softmax / cross-entropy loss
y_train = [np.eye(3)[np.random.randint(0, 3, size=10)] for _ in range(100)]

# Train the RNN
Wh, Wx, Wy, bh, by = train_rnn(X_train, y_train, input_dim=5, hidden_dim=10, output_dim=3, epochs=1000, learning_rate=0.01)

Epoch 0, Loss: 16.679187689077008
Epoch 100, Loss: 16.376423250610294
Epoch 200, Loss: 16.376392185389953
Epoch 300, Loss: 16.376400798604063
Epoch 400, Loss: 16.376403662318467
Epoch 500, Loss: 16.376406048018307
Epoch 600, Loss: 16.376408983441152
Epoch 700, Loss: 16.376409174312986
Epoch 800, Loss: 16.376409191811412
Epoch 900, Loss: 16.376408939342323
