# Elman RNN
Ashok Kumar Pant | AI Solution Architect | CTO and Co-founder at Treeleaf/Anydone


**1. Introduction**
Elman RNN is a type of recurrent neural network (RNN) proposed by Jeffrey Elman in 1990. It is one of the simplest RNN architectures and is often used to model sequential data.

**2. Architecture of Elman RNN**

Elman RNN consists of three layers:

- **Input Layer $x_t$** - Takes in sequential data at each time step.
- **Hidden Layer $h_t$** - Has **recurrent connections**, meaning it receives inputs from both:
   - The **current input** $x_t$
   - The **previous hidden state** $h_{t-1}$
- **Output Layer $y_t$** - Produces the network's output at each time step.

The **key feature** of Elman RNN is the presence of **context units**, which store past information and help in sequential learning.

**3. Mathematical Formulation**

At each time step $t$:

- **Hidden state update**:
   
   $h_t = f(W_{xh} x_t + W_{hh} h_{t-1} + b_h)$

   where:
   - $W_{xh} $ = Weight matrix for input to hidden layer
   - $W_{hh} $ = Weight matrix for hidden-to-hidden recurrence
   - $b_h $ = Bias term
   - $f $ = Activation function (e.g., tanh or ReLU)

- **Output computation**:
   
   $y_t = g(W_{hy} h_t + b_y)$
   where:
   - $W_{hy} $ = Weight matrix from hidden to output
   - $b_y $ = Bias term
   - $g $ = Activation function (e.g., softmax for classification)

- **Loss function (for training using Backpropagation Through Time - BPTT)**:
  
   $L = \sum_{t=1}^{T} \mathcal{L}(y_t, \hat{y}_t)$
   where $\mathcal{L} $ is the loss function (e.g., cross-entropy for classification, mean squared error for regression).

- **Gradient computation (for weight updates using BPTT)**:

   $\frac{\partial L}{\partial W_{xh}}, \frac{\partial L}{\partial W_{hh}}, \frac{\partial L}{\partial W_{hy}}$
   These gradients are calculated by unrolling the network in time and applying the chain rule.

In [2]:
import torch
import torch.nn as nn

class ElmanRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(ElmanRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h):
        out, h = self.rnn(x, h)
        out = self.fc(out[:, -1, :])  # Use the last time step's output
        return out, h

# Example Usage
input_size = 5
hidden_size = 10
output_size = 3

rnn = ElmanRNN(input_size, hidden_size, output_size)
x = torch.randn(1, 4, input_size)  # Batch=1, SeqLen=4, Features=5
h = torch.zeros(1, 1, hidden_size) # Initial hidden state

output, h = rnn(x, h)
print(output.shape)  # (1, 3) -> Output for the last time step


torch.Size([1, 3])


In [10]:
import torch
import torch.nn as nn

class ElmanRNN(nn.Module):
    """An Elman RNN built using RNNCell."""
    
    def __init__(self, input_size, hidden_size, batch_first=False):
        """
        Args:
            input_size (int): Size of the input vectors.
            hidden_size (int): Size of the hidden state vectors.
            batch_first (bool): Whether the 0th dimension is batch.
        """
        super(ElmanRNN, self).__init__()
        self.rnn_cell = nn.RNNCell(input_size, hidden_size)
        self.batch_first = batch_first
        self.hidden_size = hidden_size

    def initialize_hidden(self, batch_size):
        """Initializes the hidden state with zeros."""
        return torch.zeros((batch_size, self.hidden_size))

    def forward(self, x_in, initial_hidden=None):
        """
        Forward pass of the ElmanRNN.

        Args:
            x_in (torch.Tensor): Input data tensor.
                If self.batch_first: x_in.shape = (batch_size, seq_size, feat_size)
                Else: x_in.shape = (seq_size, batch_size, feat_size)
            initial_hidden (torch.Tensor, optional): Initial hidden state for the RNN.

        Returns:
            torch.Tensor: The outputs of the RNN at each time step.
                If self.batch_first: shape = (batch_size, seq_size, hidden_size)
                Else: shape = (seq_size, batch_size, hidden_size)
        """
        if self.batch_first:
            batch_size, seq_size, feat_size = x_in.size()
            x_in = x_in.permute(1, 0, 2)  # Convert to (seq_size, batch_size, feat_size)
        else:
            seq_size, batch_size, feat_size = x_in.size()

        hiddens = []
        
        if initial_hidden is None:
            initial_hidden = self.initialize_hidden(batch_size)
        
        initial_hidden = initial_hidden.to(x_in.device)
        hidden_t = initial_hidden

        for t in range(seq_size):
            hidden_t = self.rnn_cell(x_in[t], hidden_t)
            hiddens.append(hidden_t)

        hiddens = torch.stack(hiddens)

        if self.batch_first:
            hiddens = hiddens.permute(1, 0, 2)  # Convert back to (batch_size, seq_size, hidden_size)

        return hiddens


In [11]:
# Example Usage
input_size = 5
hidden_size = 10
output_size = 3

rnn = ElmanRNN(input_size, hidden_size, output_size)
x = torch.randn(1, 4, input_size)  # Batch=1, SeqLen=4, Features=5
h = torch.zeros(1, 1, hidden_size) # Initial hidden state

output, h = rnn(x, h)
print(output.shape)  # (1, 3) -> Output for the last time step
print(output)

ValueError: RNNCell: Expected hidden to be 1D or 2D, got 3D instead

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

# Load dataset (assumed structure: "name, nationality")
df = pd.read_csv('../../data/names-by-nationality.csv')

# Preprocess: Convert surnames to characters
class SurnameDataset(Dataset):
    def __init__(self, df, label_encoder, seq_length=15):
        self.surnames = df['name'].values
        self.nationalities = label_encoder.transform(df['nationality'].values)
        self.seq_length = seq_length
    
    def __len__(self):
        return len(self.surnames)
    
    def __getitem__(self, idx):
        surname = self.surnames[idx]
        nationality = self.nationalities[idx]
        
        # Pad surname to sequence length (or truncate if too long)
        surname = surname[:self.seq_length].ljust(self.seq_length)
        
        # Convert surname characters to ASCII values
        surname_chars = torch.tensor([ord(c) for c in surname], dtype=torch.long)
        
        return surname_chars, nationality

# Prepare data
label_encoder = LabelEncoder()
df['nationality'] = label_encoder.fit_transform(df['nationality'])
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = SurnameDataset(train_data, label_encoder)
test_dataset = SurnameDataset(test_data, label_encoder)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Define the Elman RNN model
class ElmanRNN(nn.Module):
    def __init__(self, input_size=256, hidden_size=128, output_size=10, seq_length=15):
        super(ElmanRNN, self).__init__()
        
        self.seq_length = seq_length
        self.hidden_size = hidden_size
        
        # Embedding layer for input characters (if needed, here we're using raw ASCII values)
        self.embedding = nn.Embedding(input_size, hidden_size)
        
        # Elman RNN layer
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Get embeddings for each character
        x = self.embedding(x)  # Shape: [batch_size, seq_length, hidden_size]
        
        # RNN layer
        rnn_out, _ = self.rnn(x)  # rnn_out shape: [batch_size, seq_length, hidden_size]
        
        # We only care about the output of the last character (the last timestep)
        last_hidden_state = rnn_out[:, -1, :]
        
        # Output layer
        out = self.fc(last_hidden_state)
        return out

# Hyperparameters
input_size = 256  # ASCII range (0-255)
hidden_size = 128
output_size = len(label_encoder.classes_)  # Number of nationalities
seq_length = 15  # Max length of surname

# Model, loss, and optimizer
model = ElmanRNN(input_size, hidden_size, output_size, seq_length)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for surname_chars, labels in train_loader:
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(surname_chars)
        
        # Compute loss
        loss = criterion(outputs, labels)
        loss.backward()
        
        optimizer.step()
        
        # Track accuracy
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
        
        running_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {correct/total:.4f}")

# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for surname_chars, labels in test_loader:
        outputs = model(surname_chars)
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

print(f"Test Accuracy: {correct/total:.4f}")


ValueError: y contains previously unseen labels: 6