# Week 3 Day 11: Language Modeling Datasets & Perplexity

## Overview
In this notebook, we'll explore language modeling datasets, learn how to calculate perplexity, and train a simple model to see these concepts in action.

In [None]:
# Import necessary libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
import re
import requests
import random
import math
from collections import Counter
from typing import List, Dict, Tuple, Optional

# Seed Python's `random`, NumPy's legacy global RNG, and PyTorch with the same
# value so batch sampling and weight initialisation repeat across fresh runs.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch reports one is available, otherwise fall back to CPU.
# `device` is read by the batching and training code further down the notebook.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

## 1. Creating a Toy Dataset

We'll download classic literature from Project Gutenberg to create our toy dataset.

In [None]:
def download_text(url: str, timeout: float = 30.0) -> str:
    """Fetch `url` over HTTP and return the response body decoded as UTF-8.

    Args:
        url: Address of the text file to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        The response body as a string.

    Raises:
        requests.RequestException: On connection failures, timeouts, or a
            non-success HTTP status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx: otherwise the HTML error page would be returned
    # and silently tokenized as if it were the book.
    response.raise_for_status()
    # Force UTF-8 instead of trusting the encoding guessed from the headers.
    response.encoding = 'utf-8'
    return response.text

def clean_gutenberg_text(text: str) -> str:
    """Strip the Project Gutenberg header/footer and normalise line breaks.

    The body is taken to start on the line after the first start marker that
    occurs in `text` and to end at the earliest end marker found after that
    point. If no start marker is present the body starts at the beginning; if
    no end marker is present it runs to the end of `text`.

    Args:
        text: Raw file contents, including any licence boilerplate.

    Returns:
        The body text, stripped of surrounding whitespace, with `\\r\\n`
        converted to `\\n` and runs of three or more newlines collapsed to a
        single blank line.
    """
    start_markers = ["*** START OF THIS PROJECT GUTENBERG", "***START OF THE PROJECT GUTENBERG", "*** START OF THE PROJECT GUTENBERG"]
    end_markers = ["*** END OF THIS PROJECT GUTENBERG", "***END OF THE PROJECT GUTENBERG", "*** END OF THE PROJECT GUTENBERG", "End of the Project Gutenberg"]

    start_pos = 0
    for marker in start_markers:
        marker_pos = text.find(marker)
        if marker_pos != -1:
            newline_pos = text.find('\n', marker_pos)
            # A marker line with no newline after it is the last line of the
            # file, so nothing follows it. (Using `find(...) + 1` directly
            # would yield 0 here and keep the entire header.)
            start_pos = len(text) if newline_pos == -1 else newline_pos + 1
            break

    # Search only after the body start, and cut at whichever end marker comes
    # first, so a plain "End of the Project Gutenberg ..." line that precedes
    # the "*** END ..." banner is not left inside the body.
    end_hits = [hit for hit in (text.find(marker, start_pos) for marker in end_markers) if hit != -1]
    end_pos = min(end_hits, default=len(text))

    content = text[start_pos:end_pos].strip()
    content = re.sub(r'\r\n', '\n', content)
    content = re.sub(r'\n{3,}', '\n\n', content)
    return content

# Plain-text files hosted on gutenberg.org, keyed by the short book name that
# the rest of the notebook uses to look up data.
book_urls = {
    "alice": "https://www.gutenberg.org/files/11/11-0.txt",
    "sherlock": "https://www.gutenberg.org/files/1661/1661-0.txt",
    "frankenstein": "https://www.gutenberg.org/files/84/84-0.txt"
}

# Maps book name -> cleaned text. Only books that were fetched and cleaned
# without an exception end up in this dict.
books = {}
for name, url in book_urls.items():
    try:
        print(f"Downloading {name}...")
        text = download_text(url)
        books[name] = clean_gutenberg_text(text)
        print(f"Downloaded {name}: {len(books[name])} characters")
    except Exception as e:
        # Best-effort: report the failure and carry on with the remaining books.
        print(f"Error downloading {name}: {e}")

## 2. Simple Tokenization & Data Preparation

In [None]:
class CharacterTokenizer:
    """Character-level tokenizer whose vocabulary is every distinct character in `texts`.

    Attributes:
        chars: Sorted list of the distinct characters seen.
        vocab_size: Number of entries in `chars`.
        char_to_idx: Character -> integer id (its position in `chars`).
        idx_to_char: Integer id -> character.
    """

    def __init__(self, texts: List[str]):
        corpus = ''.join(texts)
        self.chars = sorted(set(corpus))
        self.vocab_size = len(self.chars)
        self.char_to_idx = {symbol: position for position, symbol in enumerate(self.chars)}
        self.idx_to_char = dict(enumerate(self.chars))

    def encode(self, text: str) -> List[int]:
        """Return the id of each character of `text`; an unseen character raises KeyError."""
        lookup = self.char_to_idx
        return [lookup[symbol] for symbol in text]

    def decode(self, indices: List[int]) -> str:
        """Return the string spelled by `indices`; an unknown id raises KeyError."""
        lookup = self.idx_to_char
        return ''.join(lookup[position] for position in indices)

# Build a single vocabulary from every book in `books`, so all books share the
# same character ids.
tokenizer = CharacterTokenizer(list(books.values()))
print(f"Vocabulary size: {tokenizer.vocab_size}")

def create_train_val_split(text: str, val_ratio: float = 0.1) -> Tuple[torch.Tensor, torch.Tensor]:
    """Encode `text` with the notebook-level `tokenizer` and split it into train/validation ids.

    The split is positional: the leading `1 - val_ratio` share of the tokens
    (rounded down) is the training part, the remainder is validation.

    Args:
        text: Text to encode.
        val_ratio: Fraction of tokens placed in the validation part.

    Returns:
        `(train_data, val_data)` as 1-D `torch.long` tensors.
    """
    token_ids = tokenizer.encode(text)
    cut = int(len(token_ids) * (1 - val_ratio))
    head, tail = token_ids[:cut], token_ids[cut:]
    return torch.tensor(head, dtype=torch.long), torch.tensor(tail, dtype=torch.long)

# Maps book name -> (train_data, val_data) as returned by create_train_val_split.
processed_data = {name: create_train_val_split(text) for name, text in books.items()}

def get_batch(split, book_name, seq_len, batch_size):
    """Sample a random batch of input/target windows from one book.

    Args:
        split: 'train' selects the training ids; any other value selects the
            validation ids.
        book_name: Key into `processed_data`.
        seq_len: Number of tokens per window.
        batch_size: Number of windows to sample.

    Returns:
        `(x, y)` tensors of shape (batch_size, seq_len) on `device`, where `y`
        is `x` shifted one position to the right in the underlying sequence.
    """
    source = processed_data[book_name][0 if split == 'train' else 1]
    # Highest start is len(source) - seq_len - 1, so the shifted target window
    # still ends inside the sequence.
    starts = torch.randint(len(source) - seq_len, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + seq_len])
        targets.append(source[start + 1:start + seq_len + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

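## 3. Language Model and Perplexity Calculation

Perplexity is the exponential of the average per-token cross-entropy (negative log-likelihood) that the model assigns to a sequence of $N$ tokens: $\mathrm{PPL} = \exp\left(-\frac{1}{N}\sum_{i=1}^{N}\log p(x_i \mid x_{<i})\right)$. Lower is better; a model that guesses uniformly over a vocabulary of $V$ characters has a perplexity of exactly $V$.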

In [None]:
class SimpleCharLM(nn.Module):
    """Character language model: embedding -> multi-layer LSTM -> linear projection.

    Dropout is applied to the embeddings, between LSTM layers, and to the LSTM
    outputs before the projection.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden state size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used in all three places above.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to logits of shape (batch, seq_len, vocab_size)."""
        token_vectors = self.dropout(self.embedding(x))
        # The LSTM's final (h, c) state is not needed; only the per-step outputs are used.
        hidden_states, _final_state = self.lstm(token_vectors)
        return self.fc(self.dropout(hidden_states))

@torch.no_grad()
def calculate_perplexity(model, data_loader):
    """Return the perplexity of `model` over every `(x, y)` batch in `data_loader`.

    Perplexity is `exp(total cross-entropy / number of target tokens)`, with the
    cross-entropy summed over all tokens of all batches so that batches of
    different sizes are weighted correctly.

    The model is put in eval mode for the duration of the call and is returned
    to whatever mode (train or eval) it was in beforehand, even if evaluation
    raises.

    Args:
        model: Module mapping a batch of token ids to logits whose last
            dimension is the vocabulary.
        data_loader: Iterable of `(inputs, targets)` tensor pairs.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: If `data_loader` yields no target tokens.
    """
    was_training = model.training
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    try:
        for x, y in data_loader:
            logits = model(x)
            # Read the vocabulary size from the logits themselves rather than
            # from a model attribute, so any module with this output layout works.
            flat_logits = logits.reshape(-1, logits.size(-1))
            loss = F.cross_entropy(flat_logits, y.reshape(-1), reduction='sum')
            total_loss += loss.item()
            total_tokens += y.numel()
    finally:
        # Restore the caller's mode instead of forcing train mode.
        model.train(was_training)
    if total_tokens == 0:
        raise ValueError("calculate_perplexity needs at least one target token; data_loader was empty")
    return math.exp(total_loss / total_tokens)

## 4. Training and Evaluation

In [None]:
def train_model(book_name, epochs=3, seq_len=64, batch_size=32, lr=0.001, steps_per_epoch=100):
    """Train a fresh `SimpleCharLM` on one book and track perplexity per epoch.

    Each epoch runs `steps_per_epoch` Adam updates on randomly sampled training
    batches, then records two perplexities: an approximate training value
    computed on the last training batch only, and a validation value computed
    over consecutive non-overlapping windows of the book's validation ids.

    Args:
        book_name: Key into `processed_data`.
        epochs: Number of epochs to run.
        seq_len: Tokens per training/validation window.
        batch_size: Windows per training batch.
        lr: Adam learning rate.
        steps_per_epoch: Optimizer updates per epoch.

    Returns:
        `(model, history)` where `history` has keys 'train_ppl' and 'val_ppl',
        each a list with one float per epoch.

    Raises:
        ValueError: If `steps_per_epoch` is below 1, or the validation split is
            too short to form a single window.
    """
    if steps_per_epoch < 1:
        raise ValueError(f"steps_per_epoch must be at least 1, got {steps_per_epoch}")

    val_data = processed_data[book_name][1]
    # A window needs seq_len inputs plus one extra token for the shifted targets.
    # Checking up front avoids training a whole epoch before evaluation fails.
    if len(val_data) < seq_len + 1:
        raise ValueError(
            f"validation split for {book_name!r} has {len(val_data)} tokens; "
            f"need at least {seq_len + 1} for seq_len={seq_len}"
        )

    model = SimpleCharLM(vocab_size=tokenizer.vocab_size).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    history = {'train_ppl': [], 'val_ppl': []}

    # Non-overlapping validation windows, each as a batch of one. The last
    # valid start is len(val_data) - seq_len - 1, so the exclusive range bound
    # is len(val_data) - seq_len.
    val_loader = [
        (
            val_data[i:i + seq_len].unsqueeze(0).to(device),
            val_data[i + 1:i + seq_len + 1].unsqueeze(0).to(device),
        )
        for i in range(0, len(val_data) - seq_len, seq_len)
    ]

    for epoch in range(epochs):
        model.train()
        for _ in range(steps_per_epoch):
            xb, yb = get_batch('train', book_name, seq_len, batch_size)
            logits = model(xb)
            loss = F.cross_entropy(logits.view(-1, tokenizer.vocab_size), yb.view(-1))
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

        # Train perplexity is only an approximation: it is measured on the
        # final batch of the epoch, not on the whole training split.
        train_ppl = calculate_perplexity(model, [(xb, yb)])
        val_ppl = calculate_perplexity(model, val_loader)
        history['train_ppl'].append(train_ppl)
        history['val_ppl'].append(val_ppl)
        print(f'Epoch {epoch+1}, Book: {book_name}, Train PPL: {train_ppl:.2f}, Val PPL: {val_ppl:.2f}')

    return model, history

# Train on one book
trained_model, history = train_model('alice')

# Plot results
plt.figure(figsize=(10, 6))
plt.plot(history['train_ppl'], label='Train Perplexity')
plt.plot(history['val_ppl'], label='Validation Perplexity')
plt.title('Perplexity over Training Epochs')
plt.xlabel('Epoch')
plt.ylabel('Perplexity')
plt.legend()
plt.grid(True)
plt.show()