In [5]:
import numpy as np

class ElmanRNN:
    """Single-step Elman-style recurrent cell with a softmax output layer.

    The network computes ``h = tanh(Wxh @ x + Whh @ h_prev + bh)`` followed by
    ``softmax(Why @ h + by)``. Inputs, hidden states and targets are handled
    as column vectors; the example usage feeds arrays of shape
    ``(input_size, 1)`` and one-hot targets of shape ``(output_size, 1)``.

    Parameters are trained with plain per-sample gradient descent using the
    gradients of a single time step (no unrolling through time).
    """

    # Lower bound applied to probabilities before taking the logarithm so a
    # saturated softmax (an exact 0.0) cannot turn the loss into NaN/inf.
    _LOG_EPS = 1e-12

    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        """Store the layer sizes and draw small random initial weights.

        Args:
            input_size: Number of rows of an input column vector.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
            learning_rate: Step size used by ``update_parameters``.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        # Weights are drawn in a fixed order (Wxh, Whh, Why) so that a seeded
        # global NumPy RNG always yields the same initial network.
        self.Wxh = np.random.randn(self.hidden_size, self.input_size) * 0.01  # Input to hidden
        self.Whh = np.random.randn(self.hidden_size, self.hidden_size) * 0.01  # Hidden to hidden
        self.Why = np.random.randn(self.output_size, self.hidden_size) * 0.01  # Hidden to output
        self.bh = np.zeros((self.hidden_size, 1))  # Hidden bias
        self.by = np.zeros((self.output_size, 1))  # Output bias

    def tanh(self, x):
        """Return the element-wise hyperbolic tangent of ``x``."""
        return np.tanh(x)

    def tanh_derivative(self, x):
        """Return d/dx tanh(x), evaluated element-wise at the pre-activation ``x``."""
        return 1 - np.tanh(x) ** 2

    def softmax(self, x):
        """Return the softmax of ``x`` taken along axis 0 (one distribution per column).

        The maximum is subtracted per column, matching the axis that is
        normalised. Subtracting one global maximum instead would let a column
        whose entries are all far below it underflow to 0/0 = NaN.
        """
        shifted = x - np.max(x, axis=0, keepdims=True)
        e_x = np.exp(shifted)
        return e_x / e_x.sum(axis=0, keepdims=True)

    def forward(self, x, h_prev=None):
        """Run one time step and cache the intermediates needed by ``backward``.

        Args:
            x: Input column vector, shape ``(input_size, 1)`` in the example
                usage.
            h_prev: Optional previous hidden state of shape
                ``(hidden_size, 1)``. When omitted, a zero state is used,
                which is what every call did before this parameter existed.
                Pass the ``self.h`` of the previous step to process a
                sequence step by step.

        Returns:
            The softmax output probabilities (also stored as ``self.output``).
        """
        if h_prev is None:
            h_prev = np.zeros((self.hidden_size, 1))
        self.h_prev = h_prev
        self.x = x
        self.z = np.dot(self.Wxh, self.x) + np.dot(self.Whh, self.h_prev) + self.bh
        self.h = self.tanh(self.z)
        self.y = np.dot(self.Why, self.h) + self.by
        self.output = self.softmax(self.y)
        return self.output

    def backward(self, y_true):
        """Compute single-step gradients for the most recent ``forward`` call.

        Args:
            y_true: Target distribution with the same shape as the output
                (a one-hot column in the example usage).

        Returns:
            Tuple ``(dL_dWxh, dL_dWhh, dL_dWhy, dL_dbh, dL_dby, dL_dh_prev)``.
            The last entry is the gradient with respect to the previous hidden
            state; ``update_parameters`` ignores it.
        """
        # For softmax followed by cross-entropy, the gradient with respect to
        # the logits is (probabilities - target) when the target sums to 1.
        self.dL_dy = self.output - y_true
        self.dL_dWhy = np.dot(self.dL_dy, self.h.T)
        self.dL_dby = self.dL_dy

        # Back through the output projection and the tanh non-linearity.
        self.dL_dh = np.dot(self.Why.T, self.dL_dy)
        self.dL_dz = self.dL_dh * self.tanh_derivative(self.z)
        self.dL_dWhh = np.dot(self.dL_dz, self.h_prev.T)
        self.dL_dWxh = np.dot(self.dL_dz, self.x.T)
        self.dL_dbh = self.dL_dz

        # Gradients for previous hidden state
        self.dL_dh_prev = np.dot(self.Whh.T, self.dL_dz)

        return self.dL_dWxh, self.dL_dWhh, self.dL_dWhy, self.dL_dbh, self.dL_dby, self.dL_dh_prev

    def update_parameters(self, grads):
        """Apply one gradient-descent step in place.

        Args:
            grads: The 6-tuple returned by ``backward``; its final element
                (gradient w.r.t. the previous hidden state) is not used.
        """
        dL_dWxh, dL_dWhh, dL_dWhy, dL_dbh, dL_dby, _ = grads
        self.Wxh -= self.learning_rate * dL_dWxh
        self.Whh -= self.learning_rate * dL_dWhh
        self.Why -= self.learning_rate * dL_dWhy
        self.bh -= self.learning_rate * dL_dbh
        self.by -= self.learning_rate * dL_dby

    def _cross_entropy(self, probabilities, target):
        """Return the cross-entropy of ``probabilities`` against ``target``.

        Probabilities are clipped to ``[_LOG_EPS, 1]`` first so that an exact
        zero does not produce ``0 * -inf = NaN`` (or ``inf``) in the sum.
        """
        safe = np.clip(probabilities, self._LOG_EPS, 1.0)
        return -np.sum(target * np.log(safe))

    def train(self, X, y, epochs=100):
        """Train with per-sample gradient descent and print the loss per epoch.

        Every sample is processed as an independent single step starting from
        a zero hidden state, and the parameters are updated after each sample.

        Args:
            X: Sequence of input column vectors.
            y: Sequence of target column vectors, one per entry of ``X``.
            epochs: Number of full passes over the data.

        Returns:
            List with the summed cross-entropy loss of each epoch.

        Raises:
            ValueError: If ``X`` and ``y`` have different lengths.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )

        history = []
        for epoch in range(epochs):
            total_loss = 0
            for sample, target in zip(X, y):
                # Forward pass
                output = self.forward(sample)

                # Compute loss (using Cross-Entropy for classification)
                total_loss += self._cross_entropy(output, target)

                # Backward pass and parameter updates
                self.update_parameters(self.backward(target))

            # Print the loss for the current epoch
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss}")
            history.append(float(total_loss))

        return history

# Demo: fit the network on a two-sample toy task, then query an unseen input.
if __name__ == "__main__":
    # Every input and target is an integer column vector of shape (2, 1);
    # each target is its input with the two entries swapped.
    X = [np.array(pair).reshape(2, 1) for pair in ([0, 1], [1, 0])]
    y = [np.array(pair).reshape(2, 1) for pair in ([1, 0], [0, 1])]

    # 2 inputs -> 3 hidden units -> 2 softmax outputs.
    rnn = ElmanRNN(
        input_size=2,
        hidden_size=3,
        output_size=2,
        learning_rate=0.01,
    )

    # 100 passes over the two samples.
    rnn.train(X, y, epochs=100)

    # Query the trained network with an input near the first training sample.
    test_input = np.array([0.1, 0.9]).reshape(2, 1)
    output = rnn.forward(test_input)
    print("Test output:", output)


Epoch 1/100, Loss: 1.3912010149105054
Epoch 2/100, Loss: 1.3911923302976534
Epoch 3/100, Loss: 1.391183603048437
Epoch 4/100, Loss: 1.3911748296171274
Epoch 5/100, Loss: 1.391166006441113
Epoch 6/100, Loss: 1.3911571299394703
Epoch 7/100, Loss: 1.391148196511523
Epoch 8/100, Loss: 1.3911392025354008
Epoch 9/100, Loss: 1.3911301443665833
Epoch 10/100, Loss: 1.3911210183364418
Epoch 11/100, Loss: 1.391111820750769
Epoch 12/100, Loss: 1.391102547888299
Epoch 13/100, Loss: 1.3910931959992188
Epoch 14/100, Loss: 1.3910837613036693
Epoch 15/100, Loss: 1.3910742399902323
Epoch 16/100, Loss: 1.3910646282144086
Epoch 17/100, Loss: 1.3910549220970843
Epoch 18/100, Loss: 1.3910451177229795
Epoch 19/100, Loss: 1.3910352111390893
Epoch 20/100, Loss: 1.3910251983531063
Epoch 21/100, Loss: 1.3910150753318282
Epoch 22/100, Loss: 1.3910048379995559
Epoch 23/100, Loss: 1.3909944822364655
Epoch 24/100, Loss: 1.3909840038769725
Epoch 25/100, Loss: 1.3909733987080752
Epoch 26/100, Loss: 1.390962662467678
E