## Step 1. Imports and configs

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import time
import random
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

Using device: cpu


In [3]:
# --- Model / optimisation settings shared by every run ---
HIDDEN_SIZE = 64   # passed as hidden_size when the models are built
NUM_LAYERS = 1     # NOTE(review): not referenced by the cells visible here — confirm it is wired in
LR = 1e-3          # Adam learning rate
BATCH_SIZE = 64    # DataLoader batch size
EPOCHS = 50        # training epochs per run

# --- Experiment grid ---
SEQUENCE_LENGTHS = [10, 25, 50, 100, 200, 500]   # sequence lengths to sweep
TRIAL_SEEDS = [1, 2, 3]                          # one trial per seed

## Step 2. Reproducibility function


In [4]:
def set_seed(seed):
    """Seed every random number generator this notebook draws from.

    Seeds, in order: Python's ``random``, NumPy's legacy global generator,
    PyTorch's default generator, and PyTorch's CUDA generators.

    Args:
        seed: Integer seed handed unchanged to each library.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

## Step 3. Dataset generation

In [5]:
def generate_adding_problem(num_samples, seq_len):
    """Generate a batch of adding-problem sequences.

    Each sample is a sequence of ``seq_len`` values drawn uniformly from
    [0, 1) together with a 0/1 marker channel in which exactly two distinct
    positions are set to 1. The target is the sum of the two marked values.

    Randomness comes from NumPy's global generator: first the value matrix,
    then one ``np.random.choice`` draw per sample in sample order.

    Args:
        num_samples: Number of sequences to generate.
        seq_len: Length of each sequence (two distinct positions are drawn
            from it, so NumPy raises ``ValueError`` when it is below 2).

    Returns:
        Tuple ``(X, y)`` of float32 arrays: ``X`` has shape
        ``(num_samples, seq_len, 2)`` with channel 0 = values and
        channel 1 = markers; ``y`` has shape ``(num_samples, 1)``.
    """
    values = np.random.uniform(0, 1, (num_samples, seq_len))

    # One draw per sample, in order, so the RNG stream is consumed exactly as
    # a per-row loop would consume it.
    marked_positions = np.array(
        [np.random.choice(seq_len, size=2, replace=False) for _ in range(num_samples)],
        dtype=np.intp,
    ).reshape(num_samples, 2)

    markers = np.zeros((num_samples, seq_len))
    row_ids = np.arange(num_samples)[:, None]
    markers[row_ids, marked_positions] = 1.0

    targets = (values * markers).sum(axis=1, keepdims=True)
    features = np.stack([values, markers], axis=-1)

    return features.astype(np.float32), targets.astype(np.float32)

In [6]:
# Smoke test: a tiny batch of 5 sequences of length 10.
X, y = generate_adding_problem(num_samples=5, seq_len=10)
print("X shape:", X.shape)   # expected: (5, 10, 2)
print("y shape:", y.shape)   # expected: (5, 1)

X shape: (5, 10, 2)
y shape: (5, 1)


## Step 4. Model Definitions

In [7]:
class RNNModel(nn.Module):
    """``nn.RNN`` encoder followed by a linear head producing one scalar.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size=2, hidden_size=64, num_layers=1):
        super().__init__()

        recurrent_config = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.rnn = nn.RNN(**recurrent_config)

        # Regression head: hidden state -> single value.
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map ``x`` of shape (batch, time, features) to (batch, 1)."""
        per_step_hidden, _final_state = self.rnn(x)
        # Only the hidden state at the last time step feeds the head.
        return self.fc(per_step_hidden[:, -1, :])

In [8]:
class LSTMModel(nn.Module):
    """``nn.LSTM`` encoder followed by a linear head producing one scalar.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size=2, hidden_size=64, num_layers=1):
        super().__init__()

        recurrent_config = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.lstm = nn.LSTM(**recurrent_config)

        # Regression head: hidden state -> single value.
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map ``x`` of shape (batch, time, features) to (batch, 1)."""
        per_step_hidden, _final_state = self.lstm(x)
        # Only the hidden state at the last time step feeds the head.
        return self.fc(per_step_hidden[:, -1, :])

In [9]:
class GRUModel(nn.Module):
    """``nn.GRU`` encoder followed by a linear head producing one scalar.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size=2, hidden_size=64, num_layers=1):
        super().__init__()

        recurrent_config = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.gru = nn.GRU(**recurrent_config)

        # Regression head: hidden state -> single value.
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map ``x`` of shape (batch, time, features) to (batch, 1)."""
        per_step_hidden, _final_state = self.gru(x)
        # Only the hidden state at the last time step feeds the head.
        return self.fc(per_step_hidden[:, -1, :])

## Step 5. Parameter count (Table 1)

In [10]:
def count_parameters(model):
    """Return the total number of scalar entries in the trainable parameters.

    Parameters whose ``requires_grad`` flag is False are skipped.
    """
    trainable_total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        trainable_total += param.numel()
    return trainable_total

# Build each architecture with the shared HIDDEN_SIZE setting rather than the
# class defaults, so the reported counts keep describing the models that are
# actually trained if the config cell is edited.
rnn_model = RNNModel(hidden_size=HIDDEN_SIZE).to(device)
lstm_model = LSTMModel(hidden_size=HIDDEN_SIZE).to(device)
gru_model = GRUModel(hidden_size=HIDDEN_SIZE).to(device)

print("RNN params:", count_parameters(rnn_model))
print("LSTM params:", count_parameters(lstm_model))
print("GRU params:", count_parameters(gru_model))

RNN params: 4417
LSTM params: 17473
GRU params: 13121


## Step 6. Training and evaluation functions

In [11]:
def train_model(model, train_loader, test_loader, epochs=None, lr=None):
    """Train ``model`` with Adam on MSE loss and evaluate it once at the end.

    Args:
        model: Network to train; it is moved to the global ``device``.
        train_loader: Iterable of ``(X_batch, y_batch)`` training batches; its
            ``.dataset`` length is used to average the epoch loss.
        test_loader: Loader handed to ``evaluate_model`` after training.
        epochs: Number of training epochs. Defaults to the global ``EPOCHS``.
        lr: Adam learning rate. Defaults to the global ``LR``.

    Returns:
        Tuple ``(train_losses, grad_norms, epoch_times, test_mse)``:
        ``train_losses`` holds one sample-weighted mean loss per epoch,
        ``grad_norms`` one global L2 gradient norm per batch (measured after
        ``backward()`` and before the optimizer step), ``epoch_times`` the
        duration of each epoch in seconds, and ``test_mse`` the value returned
        by ``evaluate_model`` on ``test_loader``.
    """
    num_epochs = EPOCHS if epochs is None else epochs
    learning_rate = LR if lr is None else lr

    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_losses = []
    grad_norms = []
    epoch_times = []

    for _ in range(num_epochs):
        model.train()
        epoch_loss = 0.0

        # perf_counter() is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()

        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()

            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()

            # ---- Gradient norm tracking ----
            # Global L2 norm: square root of the summed squared per-parameter norms.
            squared_norm = 0.0
            for p in model.parameters():
                if p.grad is not None:
                    squared_norm += p.grad.detach().norm(2).item() ** 2
            grad_norms.append(squared_norm ** 0.5)

            optimizer.step()

            # criterion returns the batch mean; weight by batch size so the
            # epoch average is exact even when the last batch is smaller.
            epoch_loss += loss.item() * X_batch.size(0)

        epoch_loss /= len(train_loader.dataset)
        train_losses.append(epoch_loss)

        epoch_times.append(time.perf_counter() - start_time)

    # Final evaluation
    test_mse = evaluate_model(model, test_loader)

    return train_losses, grad_norms, epoch_times, test_mse

In [12]:
def evaluate_model(model, test_loader):
    """Return the mean squared error of ``model`` over ``test_loader``.

    The model is switched to eval mode and gradients are disabled. Each
    batch's mean loss is weighted by its batch size, and the total is divided
    by ``len(test_loader.dataset)``.

    Args:
        model: Network to evaluate; expected to already be on ``device``.
        test_loader: Iterable of ``(X_batch, y_batch)`` batches.

    Returns:
        The sample-weighted average MSE as a Python float.
    """
    model.eval()
    mse = nn.MSELoss()
    weighted_sum = 0.0

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            batch_mean = mse(model(inputs), targets).item()
            # Undo the per-batch averaging before accumulating.
            weighted_sum += batch_mean * inputs.size(0)

    return weighted_sum / len(test_loader.dataset)

## Step 7. Quick sanity check

In [13]:
# End-to-end check of the pipeline on the shortest sequence length.
set_seed(1)

seq_len = 10

# Train split is drawn before the test split, both from the seeded generator.
X_train, y_train = generate_adding_problem(10000, seq_len)
X_test, y_test = generate_adding_problem(2000, seq_len)

train_dataset, test_dataset = (
    TensorDataset(torch.tensor(features), torch.tensor(targets))
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

# Only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = RNNModel(hidden_size=HIDDEN_SIZE).to(device)

train_losses, grad_norms, epoch_times, test_mse = train_model(
    model, train_loader, test_loader
)

print("Final Test MSE:", test_mse)
print("Average time per epoch:", sum(epoch_times) / len(epoch_times))

Final Test MSE: 0.0009208881286904216
Average time per epoch: 1.1421778202056885


## Step 8. Building the experiment pipeline

In [14]:
# Results storage: one initially empty dict per architecture, in the order
# RNN, LSTM, GRU.
results = {architecture: {} for architecture in ("RNN", "LSTM", "GRU")}

In [15]:
# Illustration only: this value is displayed, not stored. It shows the layout
# of one per-sequence-length record. Note that `seq_len` is whatever value an
# earlier cell left in the namespace.
{seq_len: {field: [] for field in ("mse", "time", "loss_curves", "grad_norms")}}

{10: {'mse': [], 'time': [], 'loss_curves': [], 'grad_norms': []}}

In [16]:
def get_model(model_name):
    """Model selector: build a fresh, untrained network for ``model_name``.

    Args:
        model_name: One of ``"RNN"``, ``"LSTM"`` or ``"GRU"``.

    Returns:
        A new instance of the matching model class, constructed with
        ``hidden_size=HIDDEN_SIZE``.

    Raises:
        ValueError: If ``model_name`` is not a supported name. Without this,
            the function would fall through and return ``None``, and the
            failure would only surface later at the call site.
    """
    if model_name == "RNN":
        model_cls = RNNModel
    elif model_name == "LSTM":
        model_cls = LSTMModel
    elif model_name == "GRU":
        model_cls = GRUModel
    else:
        raise ValueError(
            f"Unknown model name {model_name!r}; expected 'RNN', 'LSTM' or 'GRU'"
        )
    return model_cls(hidden_size=HIDDEN_SIZE)

In [None]:
# Full sweep: every sequence length x every architecture x every trial seed.
for seq_len in SEQUENCE_LENGTHS:
    print(f"\n===== Sequence Length: {seq_len} =====")

    for model_name in ["RNN", "LSTM", "GRU"]:
        print(f"\n--- Model: {model_name} ---")

        # Fresh record for this (model, length) pair; replaces any earlier one.
        trial_log = {
            field: [] for field in ("mse", "time", "loss_curves", "grad_norms")
        }
        results[model_name][seq_len] = trial_log

        for seed in TRIAL_SEEDS:
            print(f"Trial seed: {seed}")

            # Re-seed before anything random happens in this trial.
            set_seed(seed)

            # Generate dataset (train split first, then test split)
            X_train, y_train = generate_adding_problem(10000, seq_len)
            X_test, y_test = generate_adding_problem(2000, seq_len)

            train_dataset, test_dataset = (
                TensorDataset(torch.tensor(features), torch.tensor(targets))
                for features, targets in ((X_train, y_train), (X_test, y_test))
            )

            train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

            model = get_model(model_name).to(device)

            train_losses, grad_norms, epoch_times, test_mse = train_model(
                model, train_loader, test_loader
            )

            # Per trial: final test MSE, mean seconds per epoch, and full curves.
            trial_log["mse"].append(test_mse)
            trial_log["time"].append(sum(epoch_times) / len(epoch_times))
            trial_log["loss_curves"].append(train_losses)
            trial_log["grad_norms"].append(grad_norms)


===== Sequence Length: 10 =====

--- Model: RNN ---
Trial seed: 1
Trial seed: 2
Trial seed: 3

--- Model: LSTM ---
Trial seed: 1
Trial seed: 2
Trial seed: 3

--- Model: GRU ---
Trial seed: 1
Trial seed: 2
Trial seed: 3

===== Sequence Length: 25 =====

--- Model: RNN ---
Trial seed: 1
Trial seed: 2
Trial seed: 3

--- Model: LSTM ---
Trial seed: 1
Trial seed: 2
Trial seed: 3

--- Model: GRU ---
Trial seed: 1
Trial seed: 2
Trial seed: 3

===== Sequence Length: 50 =====

--- Model: RNN ---
Trial seed: 1
Trial seed: 2
Trial seed: 3

--- Model: LSTM ---
Trial seed: 1
Trial seed: 2
Trial seed: 3

--- Model: GRU ---
Trial seed: 1
Trial seed: 2
Trial seed: 3

===== Sequence Length: 100 =====

--- Model: RNN ---
Trial seed: 1
Trial seed: 2
Trial seed: 3

--- Model: LSTM ---
Trial seed: 1
Trial seed: 2
