## Character-level RNN for Text Generation

Train a small character-level language model on a single text file, then sample from it to generate new text.

## Setup & Config
- imports, device, seed
- simple config dictionary for reproducibility

In [6]:
import math
import os
import random
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, the PyTorch CPU generator and all
    CUDA generators with the same value, then switches cuDNN into its
    deterministic mode so repeated runs pick the same kernels.

    Args:
        seed: Value handed to each generator.
    """
    # Same seed for Python, PyTorch CPU and PyTorch GPU generators, in that order.
    for seed_fn in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True   # always choose deterministic cuDNN algorithms
    cudnn.benchmark = False      # turn off the cuDNN auto-tuner

set_seed(42)    # fix all RNG seeds before anything random runs

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")   # prefer the GPU when PyTorch can see one

# All hyperparameters and paths for the notebook live in this one dictionary.
config = {
    "data_path": None,      # path to text file (e.g., "data/shakespeare.txt"); None uses built-in sample
    "seq_len": 128,         # sequence length for training (input and target chunks)
    "batch_size": 128,      # number of sequences per training batch
    "embedding_dim": 256,   # size of character embedding vectors
    "hidden_dim": 256,      # size of hidden state in RNN (GRU or LSTM)
    "num_layers": 1,        # number of stacked RNN layers
    "dropout": 0.1,         # dropout probability between layers
    "rnn_type": "GRU",      # type of RNN: "GRU" or "LSTM"
    "num_epochs": 5,        # number of full passes through the training dataset
    "learning_rate": 2e-3,  # initial learning rate for the optimizer
    "grad_clip": 1,         # gradient clipping threshold to prevent exploding gradients
    "log_every": 100,       # how often (in steps) to print training loss
    "sample_every": 100,    # how often (in steps) to generate sample text
    "max_generate": 400,    # number of characters to generate during sampling
    "temperature": 0.9,     # sampling temperature (controls randomness in output)
    "top_k": 40,            # top-k sampling: consider only the top k most probable characters
    "top_p": 0.9,           # top-p (nucleus) sampling: consider top tokens whose probabilities sum to p
    "val_fraction": 0.05,   # fraction of the dataset to use for validation
    "overlap_step": None,   # stride between chunk starts; None means non-overlapping (e.g., seq_len // 2 for 50% overlap)
    "save_path": "char_rnn_checkpoint.pt",  # where to save the trained model checkpoint
}

## Load Data
- provide a path to the corpus (plain .txt) in config["data_path"]
- if not provided, we use a built-in snippet

In [8]:
# Load the training corpus into `text`.
# - config["data_path"] set and present on disk: read that file as UTF-8.
# - config["data_path"] unset (None / empty): use the built-in sample.
# - config["data_path"] set but missing: warn, then use the built-in sample,
#   so a mistyped path is not silently replaced by a 347-character snippet.
import warnings

if config["data_path"] and os.path.exists(config["data_path"]):
    with open(config["data_path"], "r", encoding="utf-8") as f:
        text = f.read()
else:
    if config["data_path"]:
        warnings.warn(
            f"data_path {config['data_path']!r} does not exist; "
            "falling back to the built-in sample text"
        )
    # snippet for testing
    text = (
        "ROMEO:\nBut soft, what light through yonder window breaks?\n"
        "It is the east, and Juliet is the sun.\nArise, fair sun, and kill the envious moon,\n"
        "Who is already sick and pale with grief.\n\n"
        "JULIET:\nO Romeo, Romeo! wherefore art thou Romeo?\n"
        "Deny thy father and refuse thy name;\nOr, if thou wilt not, be but sworn my love,\n"
        "And I'll no longer be a Capulet.\n"
    )

## Character Vocabulary
- build stoi and itos
    - stoi encodes text into IDs for the model, and itos decodes model outputs back into readable text
    - stoi (string-to-index): a dict mapping each character to an integer
    - itos (index-to-string): a list mapping each ID back to its character
- encode/decode utilities

In [27]:
class CharVocab:
    """Character-level vocabulary built from a reference text.

    Every distinct character of the text gets an integer id equal to its
    position in the sorted list of characters.
    """

    def __init__(self, text: str):
        # itos: id -> character (sorted list of the distinct characters)
        self.itos = sorted(set(text))
        # stoi: character -> id (inverse of itos)
        self.stoi = {symbol: index for index, symbol in enumerate(self.itos)}

    def encode(self, s: str) -> List[int]:
        """Return the ids of the characters of *s*, in order.

        Characters that are not in the vocabulary are skipped.
        """
        looked_up = (self.stoi.get(symbol) for symbol in s)
        return [index for index in looked_up if index is not None]

    def decode(self, ids: List[int]) -> str:
        """Return the string spelled by the character ids in *ids*."""
        pieces = [self.itos[index] for index in ids]
        return "".join(pieces)

vocab = CharVocab(text)         # vocabulary over every distinct character in the corpus
vocab_size = len(vocab.itos)    # number of distinct characters (45 for the built-in sample)

## Encode & Split, Dataset & DataLoader
- encode the full text into integer IDs
- split into train/val by fraction
- create chunked dataset returning (x, y) where y is the next-char targets
### Additional Notes
for the built-in Shakespeare snippet (default config):
- train/val split
    - len(text) = 347
        - the original text
    - len(vocab.encode(text)) = 347
        - the list of ids for each char
    - n_total = len(data_ids) = 347
        - data_ids is the encoded text stored as a 1D tensor
    - n_val = max(1, int(347 * 0.05)) = 17

- CharChunkDataset
    - chunk: one training example (128 consecutive characters)
    - input: those 128 characters
    - target: the same 128-character window shifted one position ahead
        - the model learns to predict the next character at each step
    - len(train_ds) = train_ds.num_chunks = (330 - 1 - 128) // 128 + 1 = 2
    - self.starts
        - list of starting indices where each chunk begins
        - makes getitem fast


In [45]:
# Encode the entire corpus as a 1-D tensor of character ids.
data_ids = torch.tensor(vocab.encode(text), dtype=torch.long)

# Train/val split: the last n_val ids are held out for validation.
n_total = len(data_ids)
n_val = max(1, int(n_total * config["val_fraction"]))   # at least one token for validation
train_ids = data_ids[:-n_val]   # everything before the held-out tail (330 ids for the built-in sample)
val_ids = data_ids[-n_val:]     # held-out tail (17 ids for the built-in sample)

# Splits a long 1-D tensor of token ids into (input, target) chunk pairs.
# Each sample is x of length T and y of length T, where y is x shifted by one
# position (next-character prediction).
class CharChunkDataset(Dataset):
    """Fixed-length (input, target) windows over a 1-D tensor of ids.

    Chunk ``k`` starts at ``k * step``. Its input is ``ids[s : s + T]`` and
    its target is ``ids[s + 1 : s + 1 + T]``, so a chunk needs ``T + 1`` ids.
    Only complete chunks are exposed; when ``ids`` is too short for even one
    chunk the dataset is empty (``len(ds) == 0``).

    Args:
        ids: 1-D tensor of token ids.
        seq_len: Chunk length ``T``; must be at least 1.
        step: Stride between chunk starts. ``None`` means ``seq_len``
            (non-overlapping chunks). Must be at least 1 when given.

    Raises:
        ValueError: If ``seq_len`` or ``step`` is smaller than 1.
    """

    def __init__(self, ids: torch.Tensor, seq_len: int, step: Optional[int] = None):
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")
        stride = seq_len if step is None else step
        if stride < 1:
            raise ValueError(f"step must be >= 1, got {stride}")

        self.ids = ids          # 1-D tensor of all character ids
        self.T = seq_len        # chunk length
        self.step = stride      # distance between consecutive chunk starts

        # A start s is valid when s + 1 + T <= len(ids), i.e. s <= len(ids) - 1 - T.
        # Clamp to zero when no start is valid: plain floor division would go
        # negative for short inputs and make len(self) raise.
        last_start = len(ids) - 1 - self.T
        self.num_chunks = last_start // self.step + 1 if last_start >= 0 else 0
        # Precomputed start offsets so __getitem__ is a plain lookup.
        self.starts = [i * self.step for i in range(self.num_chunks)]

    def __len__(self):
        """Return the number of complete chunks."""
        return self.num_chunks

    def __getitem__(self, idx):
        """Return ``(x, y)`` for chunk ``idx``; ``y`` is ``x`` shifted by one id."""
        s = self.starts[idx]
        x = self.ids[s : s + self.T]
        y = self.ids[s + 1 : s + 1 + self.T]
        return x, y

# Chunked datasets over the train and validation splits (same seq_len and stride).
train_ds = CharChunkDataset(train_ids, config["seq_len"], config["overlap_step"])
val_ds = CharChunkDataset(val_ids, config["seq_len"], config["overlap_step"])

# Drop the ragged final batch only when the split holds at least one full
# batch. Unconditional drop_last=True yields zero batches for a split with
# fewer chunks than batch_size (the built-in sample has 2 training chunks
# against a batch size of 128), so nothing would be trained or evaluated.
train_loader = DataLoader(
    train_ds,
    batch_size=config["batch_size"],
    shuffle=True,
    drop_last=len(train_ds) >= config["batch_size"],
)
val_loader = DataLoader(
    val_ds,
    batch_size=config["batch_size"],
    shuffle=False,
    drop_last=len(val_ds) >= config["batch_size"],
)