<a href="https://colab.research.google.com/github/bythyag/smolgpt/blob/main/GPT_scratchpad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!python -m nltk.downloader all -q



In [8]:
#------------------------------------
# Snippet 1: Setup and Google Drive Mount
#------------------------------------
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.cuda.amp import GradScaler, autocast # For mixed precision

import math
import os
import requests # For downloading data
import nltk # For word tokenization
from collections import Counter
import pickle # To save/load vocabulary
import time
from tqdm import tqdm # Progress bar

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# --- Configuration ---
DRIVE_SAVE_DIR = "/content/drive/MyDrive/gpt2_scratch_wordlevel_tiny" # CHANGE AS NEEDED
if not os.path.exists(DRIVE_SAVE_DIR):
    os.makedirs(DRIVE_SAVE_DIR)

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Download NLTK tokenizer data (if not already downloaded)
nltk.download('punkt')
print("NLTK punkt downloaded.")

print("Setup Complete.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
NLTK punkt downloaded.
Setup Complete.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
#------------------------------------
# Snippet 2: Configuration
#------------------------------------

# --- Model Hyperparameters --- Choose one preset ---
# Preset: 'nano' (very small, fast to train, low quality)
#n_layer = 3
#n_head = 3
#n_embd = 48

# Preset: 'micro' (slightly larger nano)
#n_layer = 4
#n_head = 4
#n_embd = 128

# Preset: 'tiny' (closer to small models, feasible on T4)
n_layer = 6
n_head = 6
n_embd = 384 # Embedding dimension must be divisible by n_head

# Preset: 'small' (might push T4 limits, reduce batch_size if needed)
# n_layer = 12
# n_head = 12
# n_embd = 768 # Original GPT-2 small size - LIKELY TOO BIG FOR T4 FREE TIER

# --- Training Hyperparameters ---
block_size = 128      # Max context length for predictions (sequence length)
batch_size = 32       # How many sequences process in parallel? Reduce if OOM.
max_iters = 5000      # Total training iterations
eval_interval = 250   # How often to evaluate on validation set
learning_rate = 3e-4  # Learning rate
eval_iters = 100      # Number of batches to average for validation loss
dropout = 0.1         # Dropout rate
use_amp = True        # Use Automatic Mixed Precision (highly recommended on T4)

# --- Data ---
DATA_URL = "https://www.gutenberg.org/files/1661/1661-0.txt" # Sherlock Holmes
DATA_PATH = "sherlock_holmes.txt"
VOCAB_PATH = os.path.join(DRIVE_SAVE_DIR, "word_vocab.pkl")
TRAIN_MODEL_PATH = os.path.join(DRIVE_SAVE_DIR, "gpt2_word_level.pth")
MIN_WORD_FREQ = 3     # Minimum frequency for a word to be included in vocab

# Derived parameters
assert n_embd % n_head == 0, "Embedding dimension must be divisible by number of heads"

config = {
    'n_layer': n_layer,
    'n_head': n_head,
    'n_embd': n_embd,
    'block_size': block_size,
    'batch_size': batch_size,
    'max_iters': max_iters,
    'eval_interval': eval_interval,
    'learning_rate': learning_rate,
    'eval_iters': eval_iters,
    'dropout': dropout,
    'use_amp': use_amp,
    'vocab_size': -1, # Will be set after data loading
    'device': device,
    'min_word_freq': MIN_WORD_FREQ,
    'vocab_path': VOCAB_PATH,
    'model_path': TRAIN_MODEL_PATH
}

print("Configuration:")
for key, val in config.items():
    print(f"{key}: {val}")

Configuration:
n_layer: 6
n_head: 6
n_embd: 384
block_size: 128
batch_size: 32
max_iters: 5000
eval_interval: 250
learning_rate: 0.0003
eval_iters: 100
dropout: 0.1
use_amp: True
vocab_size: -1
device: cuda
min_word_freq: 3
vocab_path: /content/drive/MyDrive/gpt2_scratch_wordlevel_tiny/word_vocab.pkl
model_path: /content/drive/MyDrive/gpt2_scratch_wordlevel_tiny/gpt2_word_level.pth


In [10]:
#------------------------------------
# Snippet 3: Data Preparation
#------------------------------------

# --- Download Data ---
if not os.path.exists(DATA_PATH):
    print(f"Downloading data from {DATA_URL}...")
    try:
        response = requests.get(DATA_URL)
        response.raise_for_status() # Raise an exception for bad status codes
        # The downloaded text often has BOM (Byte Order Mark) and needs decoding
        text = response.content.decode('utf-8-sig')
        # Basic cleaning: remove Gutenberg header/footer (heuristic)
        start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
        end_marker = "*** END OF THIS PROJECT GUTENBERG EBOOK"
        start_idx = text.find(start_marker)
        if start_idx != -1:
            start_idx += len(start_marker)
            text = text[start_idx:]
        end_idx = text.find(end_marker)
        if end_idx != -1:
            text = text[:end_idx]

        text = text.strip() # Remove leading/trailing whitespace
        print(f"Data downloaded and saved to {DATA_PATH}. Length: {len(text)} characters.")
        with open(DATA_PATH, 'w', encoding='utf-8') as f:
            f.write(text)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading data: {e}")
        # Handle error appropriately, maybe exit or use cached data if available
        text = "" # Ensure 'text' exists even on failure
    except Exception as e:
        print(f"An error occurred during data processing: {e}")
        text = ""
else:
    print(f"Data file {DATA_PATH} already exists.")
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        text = f.read()
    print(f"Loaded data from {DATA_PATH}. Length: {len(text)} characters.")


# --- Tokenization & Vocabulary ---
print("Tokenizing text...")
# Use NLTK for word tokenization. Consider lowercasing for smaller vocab.
tokens = nltk.word_tokenize(text.lower()) # Lowercasing reduces vocab size
print(f"Total tokens: {len(tokens)}")

# Build Vocabulary
print("Building vocabulary...")
word_counts = Counter(tokens)
# Keep words that appear at least MIN_WORD_FREQ times
filtered_word_counts = {word: count for word, count in word_counts.items() if count >= config['min_word_freq']}

# Create mapping from word to integer index
# Add special tokens: PAD (optional but good practice) and UNK
# We won't explicitly use PAD here for simplicity with causal masking,
# but UNK is important.
# <PAD> = 0 , <UNK> = 1
# Start actual words from index 2
# Note: For generation, sometimes people add <SOS> and <EOS> (Start/End of Sentence)
# but for standard GPT pretraining on long texts, they are less common.
stoi = {word: i+2 for i, word in enumerate(filtered_word_counts)}
stoi['<PAD>'] = 0 # Padding token index
stoi['<UNK>'] = 1 # Unknown token index

itos = {i: word for word, i in stoi.items()}
vocab_size = len(stoi)
config['vocab_size'] = vocab_size # Update config

print(f"Vocabulary size: {vocab_size}")
print(f"Saving vocabulary to {config['vocab_path']}...")
with open(config['vocab_path'], 'wb') as f:
    pickle.dump({'stoi': stoi, 'itos': itos}, f)
print("Vocabulary saved.")

# --- Encode/Decode Functions ---
def encode(text_string):
    words = nltk.word_tokenize(text_string.lower())
    return [stoi.get(word, stoi['<UNK>']) for word in words]

def decode(indices):
    return ' '.join([itos.get(i, '?') for i in indices]) # Use '?' for unexpected indices

# --- Create Data Tensors ---
print("Encoding entire dataset...")
full_data = torch.tensor(encode(text), dtype=torch.long)
print(f"Encoded data shape: {full_data.shape}")

# Split data into train and validation sets
n = len(full_data)
train_data = full_data[:int(n*0.9)]
val_data = full_data[int(n*0.9):]

print(f"Train set size: {len(train_data)} tokens")
print(f"Validation set size: {len(val_data)} tokens")

# --- Dataset Class ---
class TextDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # We can start a sequence at almost any point
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # Grab a chunk of data for context (x) and target (y)
        chunk = self.data[idx:idx + self.block_size + 1]
        x = chunk[:-1]
        y = chunk[1:]
        return x, y

# Example usage (optional):
# test_dataset = TextDataset(train_data, config['block_size'])
# x_ex, y_ex = test_dataset[0]
# print("Example x:", x_ex)
# print("Example y:", y_ex)
# print("Decoded x:", decode(x_ex.tolist()))
# print("Decoded y:", decode(y_ex.tolist()))

print("Data Preparation Complete.")

Data file sherlock_holmes.txt already exists.
Loaded data from sherlock_holmes.txt. Length: 581421 characters.
Tokenizing text...
Total tokens: 128528
Building vocabulary...
Vocabulary size: 3291
Saving vocabulary to /content/drive/MyDrive/gpt2_scratch_wordlevel_tiny/word_vocab.pkl...
Vocabulary saved.
Encoding entire dataset...
Encoded data shape: torch.Size([128528])
Train set size: 115675 tokens
Validation set size: 12853 tokens
Data Preparation Complete.


In [11]:
#------------------------------------
# Snippet 4: GPT-2 Model Components
#------------------------------------

# --- Layer Normalization ---
# Simplified LayerNorm implementation for understanding,
# but using nn.LayerNorm is standard and often optimized.
class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
    def __init__(self, ndim, bias=True):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        # Calculate mean and variance along the last dimension (embedding dimension)
        # Keep dimension for broadcasting
        mean = input.mean(dim=-1, keepdim=True)
        var = input.var(dim=-1, keepdim=True, unbiased=False) # Use population variance

        # Normalize
        # Add eps for numerical stability (avoid division by zero)
        normalized_input = (input - mean) / torch.sqrt(var + 1e-5)

        # Scale and shift
        output = normalized_input * self.weight
        if self.bias is not None:
            output = output + self.bias
        return output

# --- Causal Self-Attention Head ---
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config['n_embd'] % config['n_head'] == 0
        # K, Q, V projections for all heads, but in a batch
        self.c_attn = nn.Linear(config['n_embd'], 3 * config['n_embd'])
        # Output projection
        self.c_proj = nn.Linear(config['n_embd'], config['n_embd'])
        # Regularization
        self.attn_dropout = nn.Dropout(config['dropout'])
        self.resid_dropout = nn.Dropout(config['dropout'])
        # Causal mask
        # Not a parameter, assigned to buffer
        # Uses a lower triangular matrix for masking future positions
        self.register_buffer("bias", torch.tril(torch.ones(config['block_size'], config['block_size']))
                                      .view(1, 1, config['block_size'], config['block_size']))
        self.n_head = config['n_head']
        self.n_embd = config['n_embd']
        self.dropout = config['dropout']

    def forward(self, x):
        B, T, C = x.size() # Batch size, sequence length, embedding dimensionality (n_embd)

        # Calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # q, k, v shape: (B, n_head, T, head_size)
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        head_size = C // self.n_head
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)

        # Causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        # Manual implementation of attention
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_size))
        # Apply causal mask (upper triangular part is masked)
        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        # Apply softmax
        att = F.softmax(att, dim=-1)
        # Apply dropout to attention weights
        att = self.attn_dropout(att)

        # Weighted sum of values
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        # Re-assemble all head outputs side by side
        y = y.transpose(1, 2).contiguous().view(B, T, C) # (B, T, C)

        # Output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

# --- Feed Forward Network (MLP) ---
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config['n_embd'], 4 * config['n_embd'])
        # GELU activation function is standard in GPT-2
        # For compatibility or simplicity, ReLU could be used but GELU is preferred
        self.gelu    = nn.GELU()
        self.c_proj  = nn.Linear(4 * config['n_embd'], config['n_embd'])
        self.dropout = nn.Dropout(config['dropout'])

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

# --- Transformer Block ---
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config['n_embd']) # Or nn.LayerNorm(config['n_embd'])
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config['n_embd']) # Or nn.LayerNorm(config['n_embd'])
        self.mlp = MLP(config)

    def forward(self, x):
        # Forward pass through the transformer block
        # Residual connections are crucial
        x = x + self.attn(self.ln_1(x)) # Attention path
        x = x + self.mlp(self.ln_2(x))  # MLP path
        return x

print("Model Components Defined (LayerNorm, CausalSelfAttention, MLP, Block).")

Model Components Defined (LayerNorm, CausalSelfAttention, MLP, Block).


In [12]:
#------------------------------------
# Snippet 5: Full GPT-2 Model
#------------------------------------

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            # Token embeddings
            wte = nn.Embedding(config['vocab_size'], config['n_embd']),
            # Positional embeddings (learned)
            wpe = nn.Embedding(config['block_size'], config['n_embd']),
            # Dropout after embedding + positional encoding
            drop = nn.Dropout(config['dropout']),
            # Stack of transformer blocks
            h = nn.ModuleList([Block(config) for _ in range(config['n_layer'])]),
            # Final layer normalization before the output head
            ln_f = LayerNorm(config['n_embd']), # Or nn.LayerNorm(config['n_embd'])
        ))
        # Language modeling head (maps embeddings to vocabulary logits)
        self.lm_head = nn.Linear(config['n_embd'], config['vocab_size'], bias=False)

        # Tie the weights between the token embeddings and the final linear layer
        # This improves performance and reduces parameters
        self.transformer.wte.weight = self.lm_head.weight

        # Initialize weights (important for transformer stability)
        self.apply(self._init_weights)

        # Apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config['n_layer']))

        # Report number of parameters
        n_params = sum(p.numel() for p in self.parameters())
        print(f"Model Parameter Count: {n_params/1e6:.2f} M")


    def _init_weights(self, module):
        # Initialize Linear and Embedding layers
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)


    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size() # Batch size, sequence length
        assert t <= self.config['block_size'], f"Cannot forward sequence of length {t}, block size is only {self.config['block_size']}"

        # --- Forward pass through the transformer ---
        # 1. Get token embeddings
        tok_emb = self.transformer.wte(idx) # Shape: (b, t, n_embd)

        # 2. Get positional embeddings
        # Create position IDs: tensor of [0, 1, ..., t-1]
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # Shape: (1, t)
        pos_emb = self.transformer.wpe(pos) # Shape: (1, t, n_embd)

        # 3. Add token and positional embeddings
        x = self.transformer.drop(tok_emb + pos_emb)

        # 4. Pass through transformer blocks
        for block in self.transformer.h:
            x = block(x)

        # 5. Final layer normalization
        x = self.transformer.ln_f(x) # Shape: (b, t, n_embd)

        # --- Language Modeling Head ---
        if targets is not None:
            # If we are given some desired targets also calculate the loss
            logits = self.lm_head(x) # Shape: (b, t, vocab_size)
            # Calculate loss using cross-entropy
            # Need to reshape for CrossEntropyLoss: expects (N, C) and (N)
            # N = b * t, C = vocab_size
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=0) # Ignore PAD token if targets contain it
        else:
            # Inference-time configuration: only forward the lm_head on the very last position
            # This is slightly more efficient during generation
            logits = self.lm_head(x[:, [-1], :]) # Note: using list [-1] keeps the T dimension
            loss = None

        return logits, loss

    # --- Generation Method ---
    @torch.no_grad() # IMPORTANT: Disable gradient calculation during generation
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        self.eval() # Set model to evaluation mode

        for _ in range(max_new_tokens):
            # If the sequence context is growing too long, crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config['block_size'] else idx[:, -self.config['block_size']:]

            # Forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond) # We don't need the loss here

            # Pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature # Shape: (b, vocab_size)

            # Optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                # Set logits not in the top k to -infinity
                logits[logits < v[:, [-1]]] = -float('Inf')

            # Apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1) # Shape: (b, vocab_size)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # Shape: (b, 1)

            # Append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1) # Shape: (b, t+1)

        self.train() # Set model back to train mode if needed later
        return idx

# --- Instantiate the model ---
# Ensure vocab size is set in config
if config['vocab_size'] == -1:
     raise ValueError("Vocabulary size not set. Run Snippet 3 first.")

model = GPT(config)
model.to(device)

print("GPT Model Instantiated.")
# Optional: Print model structure
# print(model)

Model Parameter Count: 11.96 M
GPT Model Instantiated.


In [13]:
#------------------------------------
# Snippet 6: Training Loop
#------------------------------------

# --- DataLoader ---
train_dataset = TextDataset(train_data, config['block_size'])
val_dataset = TextDataset(val_data, config['block_size'])

# Use pin_memory=True if data fits in CPU RAM and using GPU for faster transfer
# num_workers > 0 can speed up data loading but might cause issues in Colab sometimes
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False, num_workers=2, pin_memory=True)

# --- Optimizer ---
# AdamW is a standard optimizer for transformers
optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'])

# --- Mixed Precision Scaler ---
# Used only if config['use_amp'] is True
scaler = GradScaler(enabled=config['use_amp'])

# --- Learning Rate Scheduler (Optional but recommended) ---
# Example: Cosine decay schedule
# scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config['max_iters'], eta_min=config['learning_rate']/10)

# --- Estimate Validation Loss Function ---
@torch.no_grad() # Disable gradient calculation for evaluation
def estimate_loss(model_to_eval, eval_iters):
    out = {}
    model_to_eval.eval() # Set model to evaluation mode
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        loader = train_loader if split == 'train' else val_loader
        loader_iter = iter(loader)
        for k in range(eval_iters):
            try:
                X, Y = next(loader_iter)
            except StopIteration: # Reset iterator if needed
                 loader_iter = iter(loader)
                 X, Y = next(loader_iter)

            X, Y = X.to(device), Y.to(device)
            with autocast(enabled=config['use_amp']): # Enable AMP context
                 logits, loss = model_to_eval(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model_to_eval.train() # Set model back to training mode
    return out

# --- Training Loop ---
print(f"Starting training for {config['max_iters']} iterations...")
best_val_loss = float('inf')
start_time = time.time()

# Resume from checkpoint if exists
if os.path.exists(config['model_path']):
    print(f"Resuming training from checkpoint: {config['model_path']}")
    checkpoint = torch.load(config['model_path'], map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    # Careful: Optimizer state might need adjustment if hyperparameters changed
    # For simplicity here, we re-initialize optimizer, but loading it is better practice
    # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # best_val_loss = checkpoint.get('best_val_loss', float('inf')) # Load best loss if saved
    # start_iter = checkpoint.get('iter', 0) + 1 # Resume iteration count
    print("Loaded model weights. Re-initializing optimizer.")
else:
    print("Starting training from scratch.")
    start_iter = 0


# Use an iterator for the training data loader
train_iter = iter(train_loader)

for iter_num in range(start_iter, config['max_iters']):
    # Every eval_interval evaluate loss on train and val sets
    if iter_num % config['eval_interval'] == 0 or iter_num == config['max_iters'] - 1:
        losses = estimate_loss(model, config['eval_iters'])
        current_time = time.time()
        elapsed_time = current_time - start_time
        print(f"Step {iter_num}: Train loss {losses['train']:.4f}, Val loss {losses['val']:.4f}, Time: {elapsed_time:.2f}s")

        # Save model checkpoint if validation loss improved
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            print(f"New best validation loss: {best_val_loss:.4f}. Saving model...")
            checkpoint = {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(), # Save optimizer state too
                'config': config,
                'best_val_loss': best_val_loss,
                'iter': iter_num
            }
            torch.save(checkpoint, config['model_path'])
            print(f"Model saved to {config['model_path']}")

    # Sample a batch of data
    try:
        X_batch, Y_batch = next(train_iter)
    except StopIteration:
        # Epoch finished, start new epoch
        train_iter = iter(train_loader)
        X_batch, Y_batch = next(train_iter)

    X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)

    # Forward pass & loss calculation, using mixed precision context
    with autocast(enabled=config['use_amp']):
        logits, loss = model(X_batch, Y_batch)

    # Backward pass & optimization
    optimizer.zero_grad(set_to_none=True) # More efficient zeroing
    scaler.scale(loss).backward()         # Scale loss for mixed precision backward pass
    # Optional: Gradient Clipping - helps prevent exploding gradients
    # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    scaler.step(optimizer)                # Optimizer step (unscales gradients internally)
    scaler.update()                       # Update scaler for next iteration

    # Update learning rate (if using scheduler)
    # scheduler.step()


end_time = time.time()
print(f"\nTraining finished in {end_time - start_time:.2f} seconds.")
print(f"Best validation loss achieved: {best_val_loss:.4f}")
print(f"Final model saved to {config['model_path']}")

Starting training for 5000 iterations...
Starting training from scratch.


  scaler = GradScaler(enabled=config['use_amp'])
  with autocast(enabled=config['use_amp']): # Enable AMP context


Step 0: Train loss 8.2298, Val loss 8.2239, Time: 4.00s
New best validation loss: 8.2239. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_tiny/gpt2_word_level.pth


  with autocast(enabled=config['use_amp']):


Step 250: Train loss 4.2291, Val loss 4.5362, Time: 25.53s
New best validation loss: 4.5362. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_tiny/gpt2_word_level.pth
Step 500: Train loss 3.7176, Val loss 4.4019, Time: 45.14s
New best validation loss: 4.4019. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_tiny/gpt2_word_level.pth
Step 750: Train loss 3.1563, Val loss 4.4389, Time: 64.19s
Step 1000: Train loss 2.4839, Val loss 4.6733, Time: 82.29s
Step 1250: Train loss 1.7021, Val loss 5.0798, Time: 100.41s
Step 1500: Train loss 1.0136, Val loss 5.5482, Time: 118.78s
Step 1750: Train loss 0.5694, Val loss 6.0136, Time: 137.16s
Step 2000: Train loss 0.3424, Val loss 6.4148, Time: 156.01s
Step 2250: Train loss 0.2554, Val loss 6.7523, Time: 174.22s
Step 2500: Train loss 0.2120, Val loss 6.9744, Time: 192.41s
Step 2750: Train loss 0.1912, Val loss 7.1061, Time: 212.28s
Step 3000: Train loss 0.1733, Val loss 7.1965, Time: 230.48s
Ste

In [14]:
#------------------------------------
# Snippet 7: Generation / Inference
#------------------------------------

# --- Load Model and Vocab ---
print("Loading model and vocabulary for generation...")

# Load vocabulary
try:
    with open(config['vocab_path'], 'rb') as f:
        saved_vocab = pickle.load(f)
        stoi = saved_vocab['stoi']
        itos = saved_vocab['itos']
    print(f"Vocabulary loaded from {config['vocab_path']}")
    # Update config with loaded vocab size if necessary (should match)
    config['vocab_size'] = len(stoi)
except FileNotFoundError:
    print(f"ERROR: Vocabulary file not found at {config['vocab_path']}. Cannot generate.")
    # Exit or handle error appropriately
    stoi, itos = {}, {} # Avoid crashing later code

# Re-create model architecture using saved config (or current config if running sequentially)
# Important: Make sure the config matches the one used for the saved weights!
# If loading a checkpoint, the config is usually saved within it.
# Here we assume the 'config' dictionary is still available and matches.
if config['vocab_size'] > 0: # Only proceed if vocab loaded
    gen_model = GPT(config)
    gen_model.to(device)
    print("Model architecture created.")

    # Load trained weights
    try:
        checkpoint = torch.load(config['model_path'], map_location=device)
        gen_model.load_state_dict(checkpoint['model_state_dict'])
        gen_model.eval() # Set model to evaluation mode
        print(f"Loaded trained model weights from {config['model_path']}")

        # --- Generate Text ---
        print("\n--- Generating Text ---")

        # Starting context (prompt)
        # Make sure words are likely in the vocabulary or use <UNK>
        start_text = "My dear Watson,"
        # start_text = "The game is"
        # start_text = "It was a dark and stormy" # May contain words not in vocab if MIN_WORD_FREQ was high

        print(f"Starting prompt: '{start_text}'")
        start_ids = encode(start_text)
        # Context needs to be a batch (even if batch size is 1)
        x = torch.tensor(start_ids, dtype=torch.long, device=device).unsqueeze(0) # Add batch dimension

        # Generate!
        max_tokens_to_generate = 100
        temperature = 0.8 # Lower -> less random; Higher -> more random
        top_k = 50        # Consider only top 50 words

        # Run generation within torch.no_grad() context
        with torch.no_grad():
            with autocast(enabled=config['use_amp']): # Use AMP for generation too (optional, less critical)
                y = gen_model.generate(x, max_tokens_to_generate, temperature=temperature, top_k=top_k)

        # Decode the generated sequence
        generated_ids = y[0].tolist() # Get the list of IDs from the batch
        generated_text = decode(generated_ids)

        print("\nGenerated Text:")
        print(generated_text)
        print("-" * 30)

    except FileNotFoundError:
        print(f"ERROR: Model checkpoint file not found at {config['model_path']}. Train the model first (Snippet 6).")
    except Exception as e:
        print(f"An error occurred during generation: {e}")

else:
    print("Skipping generation because vocabulary could not be loaded.")

Loading model and vocabulary for generation...
Vocabulary loaded from /content/drive/MyDrive/gpt2_scratch_wordlevel_tiny/word_vocab.pkl
Model Parameter Count: 11.96 M
Model architecture created.
Loaded trained model weights from /content/drive/MyDrive/gpt2_scratch_wordlevel_tiny/gpt2_word_level.pth

--- Generating Text ---
Starting prompt: 'My dear Watson,'


  with autocast(enabled=config['use_amp']): # Use AMP for generation too (optional, less critical)



Generated Text:
my dear watson , ” “ i shall have your name <UNK> ” “ then i am not <UNK> ” “ oh , i am , ” i asked . “ he remarked for it . “ you think , ” lestrade , “ you would <UNK> ” “ not say that is in so . “ yes , that , or it is very quietly . i shall not be <UNK> ” “ no one is a matter and i shall not think , and have you find your hands. ” “ but the <UNK> ? ” “ they have the <UNK> ”
------------------------------


In [7]:
#----- nano gpt2 training and val loss-------

Starting training for 5000 iterations...
Starting training from scratch.
<ipython-input-7-5a45c5a8127e>:20: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
  scaler = GradScaler(enabled=config['use_amp'])
<ipython-input-7-5a45c5a8127e>:43: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(enabled=config['use_amp']): # Enable AMP context
Step 0: Train loss 8.1136, Val loss 8.1158, Time: 2.41s
New best validation loss: 8.1158. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
<ipython-input-7-5a45c5a8127e>:107: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(enabled=config['use_amp']):
Step 250: Train loss 5.5643, Val loss 5.6511, Time: 8.98s
New best validation loss: 5.6511. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 500: Train loss 5.0102, Val loss 5.1285, Time: 14.27s
New best validation loss: 5.1285. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 750: Train loss 4.6698, Val loss 4.8321, Time: 20.34s
New best validation loss: 4.8321. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 1000: Train loss 4.5097, Val loss 4.7091, Time: 25.61s
New best validation loss: 4.7091. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 1250: Train loss 4.3960, Val loss 4.6327, Time: 32.04s
New best validation loss: 4.6327. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 1500: Train loss 4.2993, Val loss 4.5837, Time: 37.59s
New best validation loss: 4.5837. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 1750: Train loss 4.2251, Val loss 4.5558, Time: 43.33s
New best validation loss: 4.5558. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 2000: Train loss 4.1589, Val loss 4.5330, Time: 49.61s
New best validation loss: 4.5330. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 2250: Train loss 4.1200, Val loss 4.5168, Time: 55.11s
New best validation loss: 4.5168. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 2500: Train loss 4.0658, Val loss 4.5007, Time: 61.25s
New best validation loss: 4.5007. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 2750: Train loss 4.0075, Val loss 4.4830, Time: 66.65s
New best validation loss: 4.4830. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 3000: Train loss 3.9664, Val loss 4.4647, Time: 72.81s
New best validation loss: 4.4647. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 3250: Train loss 3.9211, Val loss 4.4585, Time: 78.12s
New best validation loss: 4.4585. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 3500: Train loss 3.8668, Val loss 4.4474, Time: 84.09s
New best validation loss: 4.4474. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 3750: Train loss 3.8305, Val loss 4.4408, Time: 89.74s
New best validation loss: 4.4408. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 4000: Train loss 3.7955, Val loss 4.4431, Time: 95.35s
Step 4250: Train loss 3.7554, Val loss 4.4286, Time: 101.25s
New best validation loss: 4.4286. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 4500: Train loss 3.7198, Val loss 4.4271, Time: 106.63s
New best validation loss: 4.4271. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 4750: Train loss 3.6804, Val loss 4.4246, Time: 112.78s
New best validation loss: 4.4246. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth
Step 4999: Train loss 3.6473, Val loss 4.4231, Time: 118.12s
New best validation loss: 4.4231. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth

Training finished in 118.17 seconds.
Best validation loss achieved: 4.4231
Final model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel/gpt2_word_level.pth





# --------micro gpt2 training and val loss------


Starting training for 5000 iterations...
Starting training from scratch.
<ipython-input-12-5a45c5a8127e>:20: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
  scaler = GradScaler(enabled=config['use_amp'])
<ipython-input-12-5a45c5a8127e>:43: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(enabled=config['use_amp']): # Enable AMP context
Step 0: Train loss 8.1091, Val loss 8.1132, Time: 2.19s
New best validation loss: 8.1132. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_micro/gpt2_word_level.pth
<ipython-input-12-5a45c5a8127e>:107: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(enabled=config['use_amp']):
Step 250: Train loss 4.6725, Val loss 4.8251, Time: 9.66s
New best validation loss: 4.8251. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_micro/gpt2_word_level.pth
Step 500: Train loss 4.3028, Val loss 4.5740, Time: 16.31s
New best validation loss: 4.5740. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_micro/gpt2_word_level.pth
Step 750: Train loss 4.0856, Val loss 4.4823, Time: 23.69s
New best validation loss: 4.4823. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_micro/gpt2_word_level.pth
Step 1000: Train loss 3.8919, Val loss 4.4057, Time: 30.46s
New best validation loss: 4.4057. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_micro/gpt2_word_level.pth
Step 1250: Train loss 3.7193, Val loss 4.3860, Time: 38.10s
New best validation loss: 4.3860. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_micro/gpt2_word_level.pth
Step 1500: Train loss 3.5488, Val loss 4.3806, Time: 44.89s
New best validation loss: 4.3806. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_micro/gpt2_word_level.pth
Step 1750: Train loss 3.4059, Val loss 4.4012, Time: 53.10s
Step 2000: Train loss 3.2641, Val loss 4.4179, Time: 60.18s
Step 2250: Train loss 3.1045, Val loss 4.4706, Time: 67.02s
Step 2500: Train loss 2.9557, Val loss 4.5053, Time: 74.52s
Step 2750: Train loss 2.7976, Val loss 4.5889, Time: 81.70s
Step 3000: Train loss 2.6644, Val loss 4.6525, Time: 89.00s
Step 3250: Train loss 2.5169, Val loss 4.7121, Time: 95.61s
Step 3500: Train loss 2.3694, Val loss 4.8029, Time: 102.88s
Step 3750: Train loss 2.2329, Val loss 4.8874, Time: 109.60s
Step 4000: Train loss 2.0989, Val loss 4.9620, Time: 116.89s
Step 4250: Train loss 1.9689, Val loss 5.0316, Time: 123.61s
Step 4500: Train loss 1.8493, Val loss 5.1189, Time: 130.93s
Step 4750: Train loss 1.7385, Val loss 5.2209, Time: 137.88s
Step 4999: Train loss 1.6245, Val loss 5.2630, Time: 144.81s


#------------tiny gpt2 training and val loss-------------------------


Starting training for 5000 iterations...
Starting training from scratch.
<ipython-input-13-5a45c5a8127e>:20: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
  scaler = GradScaler(enabled=config['use_amp'])
<ipython-input-13-5a45c5a8127e>:43: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(enabled=config['use_amp']): # Enable AMP context
Step 0: Train loss 8.2298, Val loss 8.2239, Time: 4.00s
New best validation loss: 8.2239. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_tiny/gpt2_word_level.pth
<ipython-input-13-5a45c5a8127e>:107: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(enabled=config['use_amp']):
Step 250: Train loss 4.2291, Val loss 4.5362, Time: 25.53s
New best validation loss: 4.5362. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_tiny/gpt2_word_level.pth
Step 500: Train loss 3.7176, Val loss 4.4019, Time: 45.14s
New best validation loss: 4.4019. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_tiny/gpt2_word_level.pth
Step 750: Train loss 3.1563, Val loss 4.4389, Time: 64.19s
Step 1000: Train loss 2.4839, Val loss 4.6733, Time: 82.29s
Step 1250: Train loss 1.7021, Val loss 5.0798, Time: 100.41s
Step 1500: Train loss 1.0136, Val loss 5.5482, Time: 118.78s
Step 1750: Train loss 0.5694, Val loss 6.0136, Time: 137.16s
Step 2000: Train loss 0.3424, Val loss 6.4148, Time: 156.01s
Step 2250: Train loss 0.2554, Val loss 6.7523, Time: 174.22s
Step 2500: Train loss 0.2120, Val loss 6.9744, Time: 192.41s
Step 2750: Train loss 0.1912, Val loss 7.1061, Time: 212.28s
Step 3000: Train loss 0.1733, Val loss 7.1965, Time: 230.48s
Step 3250: Train loss 0.1665, Val loss 7.3343, Time: 249.06s
Step 3500: Train loss 0.1585, Val loss 7.4599, Time: 267.30s
Step 3750: Train loss 0.1487, Val loss 7.6057, Time: 285.73s
Step 4000: Train loss 0.1414, Val loss 7.6944, Time: 305.11s
Step 4250: Train loss 0.1387, Val loss 7.7234, Time: 323.30s
Step 4500: Train loss 0.1346, Val loss 7.8634, Time: 341.61s
Step 4750: Train loss 0.1274, Val loss 7.9672, Time: 360.48s
Step 4999: Train loss 0.1274, Val loss 7.9480, Time: 378.61s

Training finished in 378.66 seconds.
Best validation loss achieved: 4.4019
Final model saved to /content/drive/MyDrive/gpt2_scratch_wordlevel_tiny/gpt2_word_level.pth

SyntaxError: invalid decimal literal (<ipython-input-7-8d5439708259>, line 3)

In [18]:
#------------------------------------
# Snippet 1: Setup and Google Drive Mount (Character Level)
#------------------------------------
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.cuda.amp import GradScaler, autocast # For mixed precision

import math
import os
import requests # For downloading data
# import nltk # No longer needed for basic character tokenization
from collections import Counter # Still useful for analysis, but not vocab building here
import pickle # To save/load vocabulary
import time
from tqdm import tqdm # Progress bar

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# --- Configuration ---
# !! CHANGE PATHS FOR CHARACTER MODEL !!
DRIVE_SAVE_DIR = "/content/drive/MyDrive/gpt2_scratch_charlevel_tiny" # CHANGE AS NEEDED
if not os.path.exists(DRIVE_SAVE_DIR):
    os.makedirs(DRIVE_SAVE_DIR)

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# NLTK download no longer needed for basic char level
# try:
#     nltk.data.find('tokenizers/punkt')
# except nltk.downloader.DownloadError:
#     print("Downloading NLTK punkt tokenizer...")
#     nltk.download('punkt')
#     print("NLTK punkt downloaded.")

print("Setup Complete.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Setup Complete.


In [19]:
#------------------------------------
# Snippet 2: Configuration (Character Level)
#------------------------------------

# --- Model Hyperparameters --- Choose one preset ---
# Character models can sometimes benefit from larger embedding dimensions relative
# to vocab size, but we keep similar sizes to start.
# Preset: 'nano'
#n_layer = 3
#n_head = 3
#n_embd = 48

# Preset: 'micro'
#n_layer = 4
#n_head = 4
#n_embd = 128

# Preset: 'tiny' (Good starting point for T4)
n_layer = 6
n_head = 6
n_embd = 384 # Embedding dimension must be divisible by n_head

# Preset: 'small' (Monitor memory closely)
# n_layer = 12
# n_head = 12
# n_embd = 768

# --- Training Hyperparameters ---
# Character models often use larger block sizes as characters are less informative
# than words. Increase if memory allows. 256 is common.
block_size = 128      # Max context length (sequence of characters)
batch_size = 64       # Can often use slightly larger batch size due to smaller emb table lookup
max_iters = 5000      # Total training iterations (might need more for chars)
eval_interval = 250   # How often to evaluate
learning_rate = 3e-4  # Learning rate (might need tuning)
eval_iters = 100      # Number of batches for validation loss avg
dropout = 0.1         # Dropout rate
use_amp = True        # Use Automatic Mixed Precision

# --- Data ---
DATA_URL = "https://www.gutenberg.org/files/1661/1661-0.txt" # Sherlock Holmes
DATA_PATH = "sherlock_holmes_char.txt" # Keep data separate if desired
# !! CHANGE PATHS FOR CHARACTER MODEL !!
VOCAB_PATH = os.path.join(DRIVE_SAVE_DIR, "char_vocab.pkl")
TRAIN_MODEL_PATH = os.path.join(DRIVE_SAVE_DIR, "gpt2_char_level.pth")
# MIN_WORD_FREQ = 3     # No longer needed for characters

# Derived parameters
assert n_embd % n_head == 0, "Embedding dimension must be divisible by number of heads"

config = {
    'n_layer': n_layer,
    'n_head': n_head,
    'n_embd': n_embd,
    'block_size': block_size,
    'batch_size': batch_size,
    'max_iters': max_iters,
    'eval_interval': eval_interval,
    'learning_rate': learning_rate,
    'eval_iters': eval_iters,
    'dropout': dropout,
    'use_amp': use_amp,
    'vocab_size': -1, # Will be set after data loading (character vocab size)
    'device': device,
    # 'min_word_freq': MIN_WORD_FREQ, # Removed
    'vocab_path': VOCAB_PATH,
    'model_path': TRAIN_MODEL_PATH
}

print("Character-Level Configuration:")
for key, val in config.items():
    print(f"{key}: {val}")

Character-Level Configuration:
n_layer: 6
n_head: 6
n_embd: 384
block_size: 128
batch_size: 64
max_iters: 5000
eval_interval: 250
learning_rate: 0.0003
eval_iters: 100
dropout: 0.1
use_amp: True
vocab_size: -1
device: cuda
vocab_path: /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/char_vocab.pkl
model_path: /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth


In [20]:
#------------------------------------
# Snippet 3: Data Preparation (Character Level)
#------------------------------------

# --- Download Data ---
# (Using a different DATA_PATH to avoid potential conflicts if needed, but content is the same)
if not os.path.exists(DATA_PATH):
    print(f"Downloading data from {DATA_URL}...")
    try:
        response = requests.get(DATA_URL)
        response.raise_for_status()
        text = response.content.decode('utf-8-sig')
        start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
        end_marker = "*** END OF THIS PROJECT GUTENBERG EBOOK"
        start_idx = text.find(start_marker)
        if start_idx != -1:
            start_idx += len(start_marker)
            text = text[start_idx:]
        end_idx = text.find(end_marker)
        if end_idx != -1:
            text = text[:end_idx]
        text = text.strip()
        print(f"Data downloaded and saved to {DATA_PATH}. Length: {len(text)} characters.")
        with open(DATA_PATH, 'w', encoding='utf-8') as f:
            f.write(text)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading data: {e}")
        text = ""
    except Exception as e:
        print(f"An error occurred during data processing: {e}")
        text = ""
else:
    print(f"Data file {DATA_PATH} already exists.")
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        text = f.read()
    print(f"Loaded data from {DATA_PATH}. Length: {len(text)} characters.")

# --- Tokenization & Vocabulary (Character Level) ---
print("Building character vocabulary...")
# Find all unique characters in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)
config['vocab_size'] = vocab_size # Update config

print(f"Character vocabulary size: {vocab_size}")
print(f"Vocabulary: {''.join(chars)}") # Print the actual characters

# Create mapping from character to integer index and vice-versa
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

print(f"Saving vocabulary to {config['vocab_path']}...")
with open(config['vocab_path'], 'wb') as f:
    pickle.dump({'stoi': stoi, 'itos': itos}, f)
print("Vocabulary saved.")

# --- Encode/Decode Functions (Character Level) ---
def encode(text_string):
    # Convert string to list of character indices
    return [stoi[c] for c in text_string] # Assumes chars in text_string are in vocab

def decode(indices):
    # Convert list of indices back to string
    return ''.join([itos[i] for i in indices])

# --- Create Data Tensors ---
print("Encoding entire dataset (character level)...")
full_data = torch.tensor(encode(text), dtype=torch.long)
print(f"Encoded data shape: {full_data.shape}") # Shape will be (total_chars,)

# Split data into train and validation sets
n = len(full_data)
train_data = full_data[:int(n*0.9)]
val_data = full_data[int(n*0.9):]

print(f"Train set size: {len(train_data)} characters")
print(f"Validation set size: {len(val_data)} characters")

# --- Dataset Class (Unchanged logic, works for chars too) ---
class TextDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # We can start a sequence at almost any point
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # Grab a chunk of data for context (x) and target (y)
        chunk = self.data[idx:idx + self.block_size + 1]
        x = chunk[:-1]
        y = chunk[1:]
        return x, y

# Example usage (optional):
# test_dataset = TextDataset(train_data, config['block_size'])
# x_ex, y_ex = test_dataset[0]
# print("Example x:", x_ex)
# print("Example y:", y_ex)
# print("Decoded x:", decode(x_ex.tolist()))
# print("Decoded y:", decode(y_ex.tolist()))

print("Character Data Preparation Complete.")

Data file sherlock_holmes_char.txt already exists.
Loaded data from sherlock_holmes_char.txt. Length: 581421 characters.
Building character vocabulary...
Character vocabulary size: 97
Vocabulary: 
 !#$%&()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz£½àâæèéœ—‘’“”•™
Saving vocabulary to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/char_vocab.pkl...
Vocabulary saved.
Encoding entire dataset (character level)...
Encoded data shape: torch.Size([581421])
Train set size: 523278 characters
Validation set size: 58143 characters
Character Data Preparation Complete.


In [21]:
#------------------------------------
# Snippet 4: GPT-2 Model Components (Unchanged)
#------------------------------------
# This snippet defining LayerNorm, CausalSelfAttention, MLP, Block
# does NOT need modification. The components operate on embedding
# dimensions and sequence lengths, independent of whether the
# initial tokens were words or characters.

# --- Layer Normalization ---
class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
    def __init__(self, ndim, bias=True):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        mean = input.mean(dim=-1, keepdim=True)
        var = input.var(dim=-1, keepdim=True, unbiased=False)
        normalized_input = (input - mean) / torch.sqrt(var + 1e-5)
        output = normalized_input * self.weight
        if self.bias is not None:
            output = output + self.bias
        return output

# --- Causal Self-Attention Head ---
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config['n_embd'] % config['n_head'] == 0
        self.c_attn = nn.Linear(config['n_embd'], 3 * config['n_embd'])
        self.c_proj = nn.Linear(config['n_embd'], config['n_embd'])
        self.attn_dropout = nn.Dropout(config['dropout'])
        self.resid_dropout = nn.Dropout(config['dropout'])
        self.register_buffer("bias", torch.tril(torch.ones(config['block_size'], config['block_size']))
                                      .view(1, 1, config['block_size'], config['block_size']))
        self.n_head = config['n_head']
        self.n_embd = config['n_embd']
        self.dropout = config['dropout']

    def forward(self, x):
        B, T, C = x.size() # Batch size, sequence length, embedding dimensionality (n_embd)
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        head_size = C // self.n_head
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_size))
        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # (B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y

# --- Feed Forward Network (MLP) ---
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config['n_embd'], 4 * config['n_embd'])
        self.gelu    = nn.GELU()
        self.c_proj  = nn.Linear(4 * config['n_embd'], config['n_embd'])
        self.dropout = nn.Dropout(config['dropout'])

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

# --- Transformer Block ---
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config['n_embd'])
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config['n_embd'])
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x)) # Attention path
        x = x + self.mlp(self.ln_2(x))  # MLP path
        return x

print("Model Components Defined (Unchanged - LayerNorm, CausalSelfAttention, MLP, Block).")

Model Components Defined (Unchanged - LayerNorm, CausalSelfAttention, MLP, Block).


In [22]:
#------------------------------------
# Snippet 5: Full GPT-2 Model
#------------------------------------

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            # Token embeddings
            wte = nn.Embedding(config['vocab_size'], config['n_embd']),
            # Positional embeddings (learned)
            wpe = nn.Embedding(config['block_size'], config['n_embd']),
            # Dropout after embedding + positional encoding
            drop = nn.Dropout(config['dropout']),
            # Stack of transformer blocks
            h = nn.ModuleList([Block(config) for _ in range(config['n_layer'])]),
            # Final layer normalization before the output head
            ln_f = LayerNorm(config['n_embd']), # Or nn.LayerNorm(config['n_embd'])
        ))
        # Language modeling head (maps embeddings to vocabulary logits)
        self.lm_head = nn.Linear(config['n_embd'], config['vocab_size'], bias=False)

        # Tie the weights between the token embeddings and the final linear layer
        # This improves performance and reduces parameters
        self.transformer.wte.weight = self.lm_head.weight

        # Initialize weights (important for transformer stability)
        self.apply(self._init_weights)

        # Apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config['n_layer']))

        # Report number of parameters
        n_params = sum(p.numel() for p in self.parameters())
        print(f"Model Parameter Count: {n_params/1e6:.2f} M")


    def _init_weights(self, module):
        # Initialize Linear and Embedding layers
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)


    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size() # Batch size, sequence length
        assert t <= self.config['block_size'], f"Cannot forward sequence of length {t}, block size is only {self.config['block_size']}"

        # --- Forward pass through the transformer ---
        # 1. Get token embeddings
        tok_emb = self.transformer.wte(idx) # Shape: (b, t, n_embd)

        # 2. Get positional embeddings
        # Create position IDs: tensor of [0, 1, ..., t-1]
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # Shape: (1, t)
        pos_emb = self.transformer.wpe(pos) # Shape: (1, t, n_embd)

        # 3. Add token and positional embeddings
        x = self.transformer.drop(tok_emb + pos_emb)

        # 4. Pass through transformer blocks
        for block in self.transformer.h:
            x = block(x)

        # 5. Final layer normalization
        x = self.transformer.ln_f(x) # Shape: (b, t, n_embd)

        # --- Language Modeling Head ---
        if targets is not None:
            # If we are given some desired targets also calculate the loss
            logits = self.lm_head(x) # Shape: (b, t, vocab_size)
            # Calculate loss using cross-entropy
            # Need to reshape for CrossEntropyLoss: expects (N, C) and (N)
            # N = b * t, C = vocab_size
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=0) # Ignore PAD token if targets contain it
        else:
            # Inference-time configuration: only forward the lm_head on the very last position
            # This is slightly more efficient during generation
            logits = self.lm_head(x[:, [-1], :]) # Note: using list [-1] keeps the T dimension
            loss = None

        return logits, loss

    # --- Generation Method ---
    @torch.no_grad() # IMPORTANT: Disable gradient calculation during generation
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        self.eval() # Set model to evaluation mode

        for _ in range(max_new_tokens):
            # If the sequence context is growing too long, crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config['block_size'] else idx[:, -self.config['block_size']:]

            # Forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond) # We don't need the loss here

            # Pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature # Shape: (b, vocab_size)

            # Optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                # Set logits not in the top k to -infinity
                logits[logits < v[:, [-1]]] = -float('Inf')

            # Apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1) # Shape: (b, vocab_size)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # Shape: (b, 1)

            # Append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1) # Shape: (b, t+1)

        self.train() # Set model back to train mode if needed later
        return idx

# --- Instantiate the model ---
# Ensure vocab size is set in config
if config['vocab_size'] == -1:
     raise ValueError("Vocabulary size not set. Run Snippet 3 first.")

model = GPT(config)
model.to(device)

print("GPT Model Instantiated.")
# Optional: Print model structure
# print(model)

Model Parameter Count: 10.73 M
GPT Model Instantiated.


In [23]:
#------------------------------------
# Snippet 6: Training Loop (Using Char Paths)
#------------------------------------

# --- DataLoader ---
# Using the same TextDataset class, but with character data
train_dataset = TextDataset(train_data, config['block_size'])
val_dataset = TextDataset(val_data, config['block_size'])

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False, num_workers=2, pin_memory=True)

# --- Optimizer ---
optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'])

# --- Mixed Precision Scaler ---
scaler = GradScaler(enabled=config['use_amp'])

# --- Estimate Validation Loss Function (Unchanged Logic) ---
@torch.no_grad()
def estimate_loss(model_to_eval, eval_iters):
    out = {}
    model_to_eval.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        loader = train_loader if split == 'train' else val_loader
        loader_iter = iter(loader)
        for k in range(eval_iters):
            try: X, Y = next(loader_iter)
            except StopIteration:
                 loader_iter = iter(loader)
                 X, Y = next(loader_iter)
            X, Y = X.to(device), Y.to(device)
            with autocast(enabled=config['use_amp']):
                 logits, loss = model_to_eval(X, Y)
            # Check for valid loss (not NaN or Inf)
            if torch.isnan(loss) or torch.isinf(loss):
                print(f"Warning: Encountered {loss.item()} loss in {split} evaluation step {k}. Skipping.")
                losses[k] = losses[k-1] if k > 0 else 50.0 # Use previous or a high value
            else:
                losses[k] = loss.item()
        out[split] = losses[losses != 0].mean() # Avoid averaging zeros if issues occurred
    model_to_eval.train()
    return out

# --- Training Loop ---
print(f"Starting character-level training for {config['max_iters']} iterations...")
best_val_loss = float('inf')
start_time = time.time()
train_iter = iter(train_loader) # Initialize train iterator

# Resume from checkpoint if exists (using the CHARACTER model path)
if os.path.exists(config['model_path']):
    print(f"Resuming training from checkpoint: {config['model_path']}")
    try:
        checkpoint = torch.load(config['model_path'], map_location=device)
        # Ensure loaded config vocab size matches current config
        # This is a basic check; more robust checks might compare more config keys
        if checkpoint['config']['vocab_size'] != config['vocab_size']:
             print(f"Warning: Checkpoint vocab size ({checkpoint['config']['vocab_size']}) differs "
                   f"from current config ({config['vocab_size']}). Loading weights may fail or lead to errors.")
        model.load_state_dict(checkpoint['model_state_dict'])
        # Load optimizer state if needed (recommended for longer training)
        # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        # best_val_loss = checkpoint.get('best_val_loss', float('inf'))
        # start_iter = checkpoint.get('iter', 0) + 1
        print("Loaded model weights. Re-initializing optimizer/iteration count for simplicity.")
        start_iter = 0 # Or load from checkpoint if desired
    except Exception as e:
        print(f"Error loading checkpoint: {e}. Starting training from scratch.")
        start_iter = 0
else:
    print("Starting training from scratch.")
    start_iter = 0


for iter_num in range(start_iter, config['max_iters']):
    # Evaluation
    if iter_num > 0 and (iter_num % config['eval_interval'] == 0 or iter_num == config['max_iters'] - 1):
        losses = estimate_loss(model, config['eval_iters'])
        current_time = time.time()
        elapsed_time = current_time - start_time
        print(f"Step {iter_num}: Train loss {losses['train']:.4f}, Val loss {losses['val']:.4f}, Time: {elapsed_time:.2f}s")

        # Save checkpoint (using CHARACTER model path)
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            print(f"New best validation loss: {best_val_loss:.4f}. Saving model...")
            checkpoint = {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'config': config, # Save config used for this checkpoint
                'best_val_loss': best_val_loss,
                'iter': iter_num
            }
            torch.save(checkpoint, config['model_path'])
            print(f"Model saved to {config['model_path']}")

    # Sample batch
    try:
        X_batch, Y_batch = next(train_iter)
    except StopIteration:
        train_iter = iter(train_loader)
        X_batch, Y_batch = next(train_iter)

    X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)

    # Forward/Backward pass with AMP
    with autocast(enabled=config['use_amp']):
        logits, loss = model(X_batch, Y_batch)

    if torch.isnan(loss) or torch.isinf(loss):
        print(f"Warning: NaN or Inf loss detected at iteration {iter_num}. Skipping backward step.")
        optimizer.zero_grad(set_to_none=True) # Still zero grads
        continue # Skip optimizer step and scaler update if loss is invalid

    optimizer.zero_grad(set_to_none=True)
    scaler.scale(loss).backward()
    # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # Optional clipping
    scaler.step(optimizer)
    scaler.update()

    # Optional LR scheduler step here

end_time = time.time()
print(f"\nTraining finished in {end_time - start_time:.2f} seconds.")
print(f"Best validation loss achieved: {best_val_loss:.4f}")
print(f"Final character model saved to {config['model_path']}")

Starting character-level training for 5000 iterations...


  scaler = GradScaler(enabled=config['use_amp'])
  with autocast(enabled=config['use_amp']):


Starting training from scratch.


  with autocast(enabled=config['use_amp']):


Step 250: Train loss 2.1820, Val loss 2.2673, Time: 31.31s
New best validation loss: 2.2673. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 500: Train loss 1.7160, Val loss 1.7539, Time: 62.38s
New best validation loss: 1.7539. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 750: Train loss 1.4598, Val loss 1.5471, Time: 93.20s
New best validation loss: 1.5471. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 1000: Train loss 1.3112, Val loss 1.4290, Time: 124.36s
New best validation loss: 1.4290. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 1250: Train loss 1.2058, Val loss 1.3561, Time: 155.27s
New best validation loss: 1.3561. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 1500: Train loss

In [24]:
#------------------------------------
# Snippet 7: Generation / Inference (Character Level)
#------------------------------------

# --- Load Model and Vocab ---
print("Loading character model and vocabulary for generation...")

# Load CHARACTER vocabulary
try:
    with open(config['vocab_path'], 'rb') as f: # Uses char vocab path from config
        saved_vocab = pickle.load(f)
        stoi = saved_vocab['stoi']
        itos = saved_vocab['itos']
    print(f"Character vocabulary loaded from {config['vocab_path']}")
    loaded_vocab_size = len(stoi)
    # Update config vocab size based on loaded vocab if needed (should match)
    if config['vocab_size'] != loaded_vocab_size:
        print(f"Updating config vocab size from {config['vocab_size']} to loaded {loaded_vocab_size}")
        config['vocab_size'] = loaded_vocab_size
except FileNotFoundError:
    print(f"ERROR: Character vocabulary file not found at {config['vocab_path']}. Cannot generate.")
    stoi, itos = {}, {} # Avoid crashing
    config['vocab_size'] = 0 # Indicate vocab load failure

# Re-create model architecture using config
# Ensure config matches the saved model's config (esp. vocab_size)
if config['vocab_size'] > 0:
    gen_model = GPT(config)
    gen_model.to(device)
    print("Model architecture created.")

    # Load trained CHARACTER weights
    try:
        checkpoint = torch.load(config['model_path'], map_location=device) # Uses char model path

        # Optional: Load config from checkpoint for robustness
        # loaded_config = checkpoint['config']
        # gen_model = GPT(loaded_config) # Re-create model with exact saved config
        # gen_model.to(device)
        # print("Model architecture created from checkpoint config.")

        gen_model.load_state_dict(checkpoint['model_state_dict'])
        gen_model.eval()
        print(f"Loaded trained character model weights from {config['model_path']}")

        # --- Generate Text ---
        print("\n--- Generating Character-Level Text ---")

        # Starting context (prompt) - must only contain characters from the vocab
        start_text = "Sherlock Holmes was"
        # start_text = "My dear Watson"
        # start_text = "It was a"

        print(f"Starting prompt: '{start_text}'")
        # Use the character encode function
        start_ids = encode(start_text)
        x = torch.tensor(start_ids, dtype=torch.long, device=device).unsqueeze(0)

        # Generate! Increase max_tokens_to_generate for characters.
        max_tokens_to_generate = 500 # Generate more characters
        temperature = 0.75          # Adjust temperature as needed
        top_k = 40                  # Adjust top-k as needed

        with torch.no_grad():
            with autocast(enabled=config['use_amp']):
                y = gen_model.generate(x, max_tokens_to_generate, temperature=temperature, top_k=top_k)

        # Decode the generated sequence using character decoder
        generated_ids = y[0].tolist()
        generated_text = decode(generated_ids) # Decodes directly to characters

        print("\nGenerated Text:")
        print(generated_text)
        print("-" * 30)

    except FileNotFoundError:
        print(f"ERROR: Character model checkpoint file not found at {config['model_path']}. Train the model first (Snippet 6).")
    except KeyError as e:
         print(f"ERROR: Mismatch between loaded checkpoint and model architecture (KeyError: {e}). "
               "Ensure the config used for generation matches the training config.")
    except Exception as e:
        print(f"An error occurred during generation: {e}")
        import traceback
        traceback.print_exc()


else:
    print("Skipping generation because character vocabulary could not be loaded.")

Loading character model and vocabulary for generation...
Character vocabulary loaded from /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/char_vocab.pkl
Model Parameter Count: 10.73 M
Model architecture created.
Loaded trained character model weights from /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth

--- Generating Character-Level Text ---
Starting prompt: 'Sherlock Holmes was'


  with autocast(enabled=config['use_amp']):



Generated Text:
------------------------------


In [25]:
#---- nano ----
Starting character-level training for 5000 iterations...
<ipython-input-13-76ccea85ce5a>:17: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
  scaler = GradScaler(enabled=config['use_amp'])
<ipython-input-13-76ccea85ce5a>:109: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(enabled=config['use_amp']):
Starting training from scratch.
<ipython-input-13-76ccea85ce5a>:34: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(enabled=config['use_amp']):
Step 250: Train loss 2.7152, Val loss 2.7607, Time: 5.76s
New best validation loss: 2.7607. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 500: Train loss 2.4923, Val loss 2.5561, Time: 11.37s
New best validation loss: 2.5561. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 750: Train loss 2.4134, Val loss 2.4872, Time: 17.52s
New best validation loss: 2.4872. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 1000: Train loss 2.3665, Val loss 2.4399, Time: 22.98s
New best validation loss: 2.4399. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 1250: Train loss 2.2930, Val loss 2.3684, Time: 29.34s
New best validation loss: 2.3684. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 1500: Train loss 2.2239, Val loss 2.2957, Time: 34.74s
New best validation loss: 2.2957. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 1750: Train loss 2.1572, Val loss 2.2205, Time: 40.98s
New best validation loss: 2.2205. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 2000: Train loss 2.0883, Val loss 2.1549, Time: 47.28s
New best validation loss: 2.1549. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 2250: Train loss 2.0420, Val loss 2.1034, Time: 53.59s
New best validation loss: 2.1034. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 2500: Train loss 1.9959, Val loss 2.0562, Time: 59.03s
New best validation loss: 2.0562. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 2750: Train loss 1.9591, Val loss 2.0119, Time: 64.99s
New best validation loss: 2.0119. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 3000: Train loss 1.9238, Val loss 1.9800, Time: 70.74s
New best validation loss: 1.9800. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 3250: Train loss 1.8994, Val loss 1.9518, Time: 76.17s
New best validation loss: 1.9518. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 3500: Train loss 1.8701, Val loss 1.9168, Time: 82.88s
New best validation loss: 1.9168. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 3750: Train loss 1.8524, Val loss 1.8998, Time: 88.38s
New best validation loss: 1.8998. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 4000: Train loss 1.8321, Val loss 1.8812, Time: 94.63s
New best validation loss: 1.8812. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 4250: Train loss 1.8113, Val loss 1.8606, Time: 100.06s
New best validation loss: 1.8606. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 4500: Train loss 1.7933, Val loss 1.8377, Time: 106.36s
New best validation loss: 1.8377. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 4750: Train loss 1.7806, Val loss 1.8230, Time: 111.82s
New best validation loss: 1.8230. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth
Step 4999: Train loss 1.7624, Val loss 1.8109, Time: 117.99s
New best validation loss: 1.8109. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth

Training finished in 118.07 seconds.
Best validation loss achieved: 1.8109
Final character model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_nano/gpt2_char_level.pth

#---------micro--------------
Starting character-level training for 5000 iterations...
<ipython-input-6-76ccea85ce5a>:17: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
  scaler = GradScaler(enabled=config['use_amp'])
<ipython-input-6-76ccea85ce5a>:109: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(enabled=config['use_amp']):
Starting training from scratch.
<ipython-input-6-76ccea85ce5a>:34: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(enabled=config['use_amp']):
Step 250: Train loss 2.4005, Val loss 2.4682, Time: 9.72s
New best validation loss: 2.4682. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 500: Train loss 2.1778, Val loss 2.2607, Time: 18.42s
New best validation loss: 2.2607. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 750: Train loss 1.9528, Val loss 2.0107, Time: 29.18s
New best validation loss: 2.0107. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 1000: Train loss 1.8069, Val loss 1.8660, Time: 40.45s
New best validation loss: 1.8660. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 1250: Train loss 1.7006, Val loss 1.7459, Time: 53.30s
New best validation loss: 1.7459. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 1500: Train loss 1.6125, Val loss 1.6709, Time: 64.34s
New best validation loss: 1.6709. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 1750: Train loss 1.5468, Val loss 1.6165, Time: 74.02s
New best validation loss: 1.6165. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 2000: Train loss 1.4991, Val loss 1.5807, Time: 84.97s
New best validation loss: 1.5807. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 2250: Train loss 1.4537, Val loss 1.5423, Time: 95.82s
New best validation loss: 1.5423. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 2500: Train loss 1.4188, Val loss 1.5158, Time: 105.18s
New best validation loss: 1.5158. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 2750: Train loss 1.3765, Val loss 1.4844, Time: 113.92s
New best validation loss: 1.4844. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 3000: Train loss 1.3579, Val loss 1.4723, Time: 123.36s
New best validation loss: 1.4723. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 3250: Train loss 1.3325, Val loss 1.4458, Time: 132.51s
New best validation loss: 1.4458. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 3500: Train loss 1.3128, Val loss 1.4284, Time: 141.68s
New best validation loss: 1.4284. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 3750: Train loss 1.2939, Val loss 1.4247, Time: 150.95s
New best validation loss: 1.4247. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 4000: Train loss 1.2784, Val loss 1.4114, Time: 160.23s
New best validation loss: 1.4114. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 4250: Train loss 1.2635, Val loss 1.3985, Time: 169.50s
New best validation loss: 1.3985. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 4500: Train loss 1.2510, Val loss 1.3881, Time: 178.32s
New best validation loss: 1.3881. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 4750: Train loss 1.2382, Val loss 1.3793, Time: 187.29s
New best validation loss: 1.3793. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth
Step 4999: Train loss 1.2219, Val loss 1.3679, Time: 196.48s
New best validation loss: 1.3679. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth

Training finished in 196.58 seconds.
Best validation loss achieved: 1.3679
Final character model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_micro/gpt2_char_level.pth



# -----------tiny--------------------

Starting character-level training for 5000 iterations...
<ipython-input-23-76ccea85ce5a>:17: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
  scaler = GradScaler(enabled=config['use_amp'])
<ipython-input-23-76ccea85ce5a>:109: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(enabled=config['use_amp']):
Starting training from scratch.
<ipython-input-23-76ccea85ce5a>:34: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(enabled=config['use_amp']):
Step 250: Train loss 2.1820, Val loss 2.2673, Time: 31.31s
New best validation loss: 2.2673. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 500: Train loss 1.7160, Val loss 1.7539, Time: 62.38s
New best validation loss: 1.7539. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 750: Train loss 1.4598, Val loss 1.5471, Time: 93.20s
New best validation loss: 1.5471. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 1000: Train loss 1.3112, Val loss 1.4290, Time: 124.36s
New best validation loss: 1.4290. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 1250: Train loss 1.2058, Val loss 1.3561, Time: 155.27s
New best validation loss: 1.3561. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 1500: Train loss 1.1288, Val loss 1.3267, Time: 186.15s
New best validation loss: 1.3267. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 1750: Train loss 1.0602, Val loss 1.3150, Time: 217.42s
New best validation loss: 1.3150. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 2000: Train loss 0.9976, Val loss 1.3072, Time: 248.26s
New best validation loss: 1.3072. Saving model...
Model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth
Step 2250: Train loss 0.9275, Val loss 1.3082, Time: 279.19s
Step 2500: Train loss 0.8579, Val loss 1.3319, Time: 309.53s
Step 2750: Train loss 0.7940, Val loss 1.3464, Time: 339.82s
Step 3000: Train loss 0.7223, Val loss 1.3872, Time: 370.37s
Step 3250: Train loss 0.6523, Val loss 1.4332, Time: 400.78s
Step 3500: Train loss 0.5894, Val loss 1.4881, Time: 431.11s
Step 3750: Train loss 0.5221, Val loss 1.5423, Time: 461.64s
Step 4000: Train loss 0.4595, Val loss 1.6109, Time: 492.05s
Step 4250: Train loss 0.4077, Val loss 1.6873, Time: 522.32s
Step 4500: Train loss 0.3642, Val loss 1.7249, Time: 552.82s
Step 4750: Train loss 0.3190, Val loss 1.8078, Time: 583.23s
Step 4999: Train loss 0.2891, Val loss 1.8470, Time: 613.53s

Training finished in 613.62 seconds.
Best validation loss achieved: 1.3072
Final character model saved to /content/drive/MyDrive/gpt2_scratch_charlevel_tiny/gpt2_char_level.pth

SyntaxError: invalid decimal literal (<ipython-input-25-ec81f8f81ffb>, line 2)