## Character-level RNN for Text Generation

Train a small character-level language model on a single text file and use it to generate new text.

## Setup & Config
- imports, device, seed
- simple config dictionary for reproducibility

In [6]:
import math
import os
import random
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

def set_seed(seed: int = 42):
    """Seed every random number generator this script relies on.

    Applies ``seed`` to Python's ``random`` module and to PyTorch (the CPU
    generator and all CUDA devices), then switches cuDNN to its
    deterministic, non-benchmarking configuration.

    Args:
        seed: Value handed to each seeding function.
    """
    # In order: Python's random module, PyTorch CPU ops, PyTorch GPU ops.
    for seed_fn in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True  # cuDNN algorithm choice
    cudnn.benchmark = False     # cuDNN auto-tuner

# Fix the RNG seeds for this run.
set_seed(seed=42)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# All hyperparameters and run settings, gathered in a single dictionary.
config = dict(
    # --- Data source ---
    # Path to a text file (e.g., "data/shakespeare.text"); None uses the built-in sample.
    data_path=None,

    # --- Batching ---
    # Sequence length for training (input and target chunks).
    seq_len=128,
    # Number of sequences per training batch.
    batch_size=128,

    # --- Model ---
    # Size of the character embedding vectors.
    embedding_dim=256,
    # Size of the hidden state in the RNN (GRU or LSTM).
    hidden_dim=256,
    # Number of stacked RNN layers.
    num_layers=1,
    # Dropout probability between layers.
    dropout=0.1,
    # Type of RNN: "GRU" or "LSTM".
    rnn_type="GRU",

    # --- Optimization ---
    # Number of full passes through the training dataset.
    num_epochs=5,
    # Initial learning rate for the optimizer.
    learning_rate=2e-3,
    # Gradient clipping threshold to prevent exploding gradients.
    grad_clip=1,

    # --- Logging ---
    # How often (in steps) to print the training loss.
    log_every=100,
    # How often (in steps) to generate sample text.
    sample_every=100,

    # --- Sampling ---
    # Number of characters to generate during sampling.
    max_generate=400,
    # Sampling temperature (controls randomness in the output).
    temperature=0.9,
    # Top-k sampling: consider only the k most probable characters.
    top_k=40,
    # Top-p (nucleus) sampling: consider the top tokens whose probabilities sum to p.
    top_p=0.9,

    # --- Data split ---
    # Fraction of the dataset to use for validation.
    val_fraction=0.05,
    # If set, use overlapping training chunks (e.g., step size = seq_len // 2).
    overlap_step=None,

    # --- Checkpointing ---
    # Where to save the trained model checkpoint.
    save_path="char_rnn_checkpoint.pt",
)