# 🔁 RNN vs LSTM: Long-Term Dependency Demonstration
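This notebook compares a simple RNN and an LSTM on a synthetic task where the output depends only on the **first** token of the sequence. Because the prediction is read from the final time step, the model has to carry that single bit of information across every later step — a minimal long-term dependency.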

In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

## 📦 Generate Synthetic Data

In [None]:
def generate_data(n=1000, seq_len=20, seed=None):
    """Build a random binary sequence dataset labelled by its first token.

    Args:
        n: Number of sequences to generate.
        seq_len: Length of each sequence; must be at least 1 because the
            label is read from position 0.
        seed: Optional integer. When given, sampling uses a dedicated
            ``torch.Generator`` seeded with this value, so the same data is
            produced on every call and the global RNG state is left alone.
            When ``None`` (the default) the global RNG is used, as before.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a float tensor of shape
        ``(n, seq_len, 1)`` containing 0.0/1.0 values and ``y`` is a float
        tensor of shape ``(n,)`` equal to the first element of each sequence.
    """
    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)

    X = torch.randint(0, 2, (n, seq_len), generator=generator).float()
    y = X[:, 0]  # label = first element
    # Add a trailing feature axis so each time step carries one input feature.
    return X.unsqueeze(-1), y


# Fixed seed so that re-running the notebook trains on the same dataset.
X, y = generate_data(seed=0)

## 🧠 Define RNN and LSTM Models

In [None]:
class RNNModel(nn.Module):
    """Single-layer vanilla RNN with a sigmoid read-out of the last time step.

    Args:
        input_size: Number of features per time step (default 1).
        hidden_size: Width of the recurrent hidden state (default 16).
    """

    def __init__(self, input_size=1, hidden_size=16):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        # The head consumes one hidden state, so its width must match hidden_size.
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one probability per sequence.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in (0, 1).
        """
        out, _ = self.rnn(x)
        # Only the hidden state at the final time step feeds the classifier.
        return torch.sigmoid(self.fc(out[:, -1, :]))

class LSTMModel(nn.Module):
    """Single-layer LSTM with a sigmoid read-out of the last time step.

    Args:
        input_size: Number of features per time step (default 1).
        hidden_size: Width of the LSTM hidden state (default 16).
    """

    def __init__(self, input_size=1, hidden_size=16):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        # The head consumes one hidden state, so its width must match hidden_size.
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one probability per sequence.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in (0, 1).
        """
        out, _ = self.lstm(x)
        # Only the hidden state at the final time step feeds the classifier.
        return torch.sigmoid(self.fc(out[:, -1, :]))

## 🏋️ Train and Compare Both Models

In [None]:
def train_model(model, X, y, epochs=20, lr=0.01):
    """Train ``model`` with full-batch Adam on binary cross-entropy.

    Args:
        model: Module mapping ``X`` to probabilities of shape ``(batch, 1)``.
        X: Input tensor passed to the model unchanged on every epoch.
        y: Float target tensor of shape ``(batch,)`` with values in [0, 1].
        epochs: Number of full-batch gradient steps.
        lr: Adam learning rate (default 0.01).

    Returns:
        A list of ``epochs`` Python floats: the loss measured before each
        optimizer step.
    """
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    losses = []
    for _ in range(epochs):
        # Drop only the trailing feature axis. A bare squeeze() would also
        # collapse a batch of size 1 to a 0-d tensor, which no longer matches
        # the shape of y and makes BCELoss raise.
        pred = model(X).squeeze(-1)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return losses

# Seed the global RNG before building the models so their initial weights,
# and therefore the loss curves, are the same on every Restart & Run All.
torch.manual_seed(0)

rnn_model = RNNModel()
lstm_model = LSTMModel()

# Both models are trained on the same data with the same default settings.
rnn_losses = train_model(rnn_model, X, y)
lstm_losses = train_model(lstm_model, X, y)

## 📉 Loss Comparison

In [None]:
# Overlay both training curves on a single set of axes for direct comparison.
fig, ax = plt.subplots()
for curve, curve_label in ((rnn_losses, 'RNN Loss'), (lstm_losses, 'LSTM Loss')):
    ax.plot(curve, label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('RNN vs LSTM on Long-Term Dependency Task')
ax.legend()
ax.grid(True)
plt.show()

## ✅ Conclusion
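- The RNN typically struggles to reduce the loss: the label is set by the first token, but the gradient linking the final hidden state back to that token shrinks as it is propagated through all 20 time steps, so the network effectively **forgets** it.
- The LSTM typically learns the task more effectively: its gated cell state can carry the first token forward largely unchanged, demonstrating its ability to retain **long-term dependencies**.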