# Training Experiments - Quick and Dirty

Trying different training approaches, lots of trial and error here!
This notebook is where I'm figuring out training loops, hyperparameters, etc.

In [None]:
# More messy imports
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import json
import pickle
from datetime import datetime
import time
from tqdm import tqdm
import random
from collections import defaultdict
import seaborn as sns

# Disable warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# Environment report: TensorFlow build in use and number of GPUs it can see.
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {len(tf.config.list_physical_devices('GPU'))}")

# Seed every RNG the notebook touches (Python, NumPy, TensorFlow) so that weight
# initialisation, dataset shuffling and sampling are repeatable after a
# Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Project paths
# The original machine-specific location is kept as the default; set the
# GPT_PROJECT_ROOT environment variable to run the notebook from another checkout.
PROJECT_ROOT = os.environ.get('GPT_PROJECT_ROOT', '/home/akshat/GPT_from_scratch')
NOTEBOOK_DIR = f'{PROJECT_ROOT}/notebooks'
print(f"Working from: {PROJECT_ROOT}")

## Load Previous Experiment Data

Loading tokenizers and data from previous experiments...

In [None]:
# Load tokenizer from previous experiment
tokenizer_path = f"{NOTEBOOK_DIR}/tokenizer_experiments/char_tokenizer_v1.json"
try:
    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_data = json.load(f)

    char_to_id = tokenizer_data['char_to_id']
    # JSON object keys are always strings, so this mapping is keyed by "0", "1", ...
    # (the rest of the notebook looks ids up with str(token_id)).
    id_to_char = tokenizer_data['id_to_char']
    vocab = tokenizer_data['vocab']
    vocab_size = len(vocab)

    print(f"✓ Loaded tokenizer: {vocab_size} tokens")
    print(f"Sample vocab: {vocab[:10]}")

except FileNotFoundError:
    print("❌ Tokenizer not found, creating a quick one...")
    # Quick fallback tokenizer: two special tokens followed by every distinct
    # character of the story, in sorted order.
    text_path = f'{PROJECT_ROOT}/text_data/alice_story.txt'
    with open(text_path, 'r', encoding='utf-8') as f:
        text = f.read()

    chars = sorted(set(text))
    vocab = ['<PAD>', '<UNK>'] + chars
    char_to_id = {ch: i for i, ch in enumerate(vocab)}
    # Use string keys so the fallback has the same shape as the JSON-loaded
    # tokenizer; int keys here made the later id_to_char[str(...)] lookups fail.
    id_to_char = {str(i): ch for i, ch in enumerate(vocab)}
    vocab_size = len(vocab)
    print(f"✓ Created fallback tokenizer: {vocab_size} tokens")

# Load some text data
alice_path = f'{PROJECT_ROOT}/text_data/alice_story.txt'
with open(alice_path, 'r', encoding='utf-8') as f:
    training_text = f.read()

print(f"\nTraining text: {len(training_text):,} characters")
print(f"Preview: {training_text[:100]}...")

## Quick Model Definitions

Copy-pasting and modifying model code for quick experiments...

In [None]:
# Copy of attention layer with some modifications
class QuickAttention(tf.keras.layers.Layer):
    """Multi-head self-attention with a built-in causal (lower-triangular) mask.

    Queries, keys and values are all projected from the same input. The
    `mask` argument of `call` is accepted but never read: the layer always
    builds and applies its own causal mask.
    """

    def __init__(self, d_model, num_heads, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.num_heads = num_heads
        self.dropout_rate = dropout

        # Each head works on an equal slice of the model dimension.
        assert d_model % num_heads == 0
        self.depth = d_model // num_heads

        # Query / key / value projections, then the output projection.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)
        self.dropout = tf.keras.layers.Dropout(dropout)

    def split_heads(self, x, batch_size):
        """Reshape to (batch_size, -1, num_heads, depth), then move heads to axis 1."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, x, mask=None, training=None):
        """Apply causal self-attention to `x`; the result is reshaped to (batch, -1, d_model)."""
        n_batch = tf.shape(x)[0]
        n_steps = tf.shape(x)[1]

        queries = self.split_heads(self.wq(x), n_batch)
        keys = self.split_heads(self.wk(x), n_batch)
        values = self.split_heads(self.wv(x), n_batch)

        # Scaled dot-product attention
        scale = tf.math.sqrt(tf.cast(self.depth, tf.float32))
        scores = tf.matmul(queries, keys, transpose_b=True) / scale

        # Causal mask for GPT: the lower triangle (incl. diagonal) stays at 0,
        # everything above it gets a large negative bias before the softmax.
        lower_triangle = tf.linalg.band_part(tf.ones((n_steps, n_steps)), -1, 0)
        causal_bias = tf.where(lower_triangle == 0, -1e9, 0.0)
        scores += causal_bias

        weights = tf.nn.softmax(scores, axis=-1)
        weights = self.dropout(weights, training=training)

        # Weighted sum of values, then merge the heads back together.
        context = tf.matmul(weights, values)
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        merged = tf.reshape(context, (n_batch, -1, self.d_model))

        return self.dense(merged)

# Quick transformer block
class QuickTransformerBlock(tf.keras.layers.Layer):
    """Post-norm transformer block: attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, dff, dropout=0.1, **kwargs):
        super().__init__(**kwargs)

        self.att = QuickAttention(d_model, num_heads, dropout)
        # Position-wise feed-forward network: expand to dff, project back to d_model.
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(dropout)
        self.dropout2 = tf.keras.layers.Dropout(dropout)

    def call(self, x, training=None):
        """Run attention then the feed-forward network, each with residual + layer norm."""
        attended = self.dropout1(self.att(x, training=training), training=training)
        after_attention = self.layernorm1(x + attended)

        transformed = self.dropout2(self.ffn(after_attention), training=training)
        return self.layernorm2(after_attention + transformed)

print("✓ Defined quick model components")

In [None]:
# Quick GPT model for experiments
class ExperimentalGPT(tf.keras.Model):
    """Small decoder-only model: embeddings, a stack of transformer blocks, vocabulary logits.

    Positions are encoded with a learned embedding table of size
    `maximum_position_encoding`, indexed by 0 .. seq_len-1.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, dff, maximum_position_encoding, dropout=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.pos_encoding = tf.keras.layers.Embedding(maximum_position_encoding, d_model)

        self.dec_layers = [
            QuickTransformerBlock(d_model, num_heads, dff, dropout)
            for _ in range(num_layers)
        ]

        # No activation: the model returns raw logits over the vocabulary.
        self.final_layer = tf.keras.layers.Dense(vocab_size)
        self.dropout = tf.keras.layers.Dropout(dropout)

    def call(self, x, training=None):
        """Map token ids `x` to per-position logits of size vocab_size."""
        n_steps = tf.shape(x)[1]

        # Embeddings + positional encoding (token embeddings scaled by sqrt(d_model))
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        hidden = self.embedding(x) * scale
        hidden = hidden + self.pos_encoding(tf.range(n_steps))

        hidden = self.dropout(hidden, training=training)

        # Pass through transformer blocks
        for block in self.dec_layers:
            hidden = block(hidden, training=training)

        return self.final_layer(hidden)

print("✓ Defined experimental GPT model")

## Training Data Preparation

Quick data prep for experiments...

In [None]:
# Quick data preparation function
def prepare_data_quick(text, tokenizer, seq_length=64, batch_size=32):
    """Turn raw text into a shuffled, batched next-token-prediction dataset.

    Args:
        text: string to tokenize character by character.
        tokenizer: dict mapping a character to its token id. Characters that
            are missing map to the '<UNK>' id, or 0 if '<UNK>' is absent too.
        seq_length: number of tokens per training window.
        batch_size: number of windows per batch.

    Returns:
        (dataset, num_sequences): a `tf.data.Dataset` of (inputs, targets)
        batches, where targets are the inputs shifted one token to the right,
        and the number of windows created.

    Raises:
        ValueError: if `seq_length` is smaller than 1, or if `text` is too
            short to produce a single window (it needs more than `seq_length`
            tokens).
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    print(f"Preparing data: seq_len={seq_length}, batch_size={batch_size}")

    # Tokenize (look the fallback id up once instead of once per character)
    unk_id = tokenizer.get('<UNK>', 0)
    tokens = [tokenizer.get(ch, unk_id) for ch in text]
    print(f"Tokenized to {len(tokens)} tokens")

    # Sliding window with 75% overlap. max(1, ...) keeps the stride valid for
    # seq_length < 4, where seq_length // 4 would be 0 and range() would fail.
    step_size = max(1, seq_length // 4)
    # The upper bound leaves room for the one-token-shifted target window.
    starts = range(0, len(tokens) - seq_length, step_size)
    if len(starts) == 0:
        raise ValueError(
            f"text is too short: got {len(tokens)} tokens but need more than "
            f"seq_length={seq_length} to build one (input, target) pair"
        )

    # Create sequences and convert to numpy
    inputs = np.array([tokens[i:i + seq_length] for i in starts])
    targets = np.array([tokens[i + 1:i + seq_length + 1] for i in starts])

    print(f"Created {len(inputs)} sequences")

    # Create dataset. The shuffle buffer covers every window so the shuffle is
    # uniform regardless of dataset size (the data is already in memory).
    dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))
    dataset = dataset.shuffle(len(inputs)).batch(batch_size)

    return dataset, len(inputs)

# Prepare training data
SEQ_LENGTH = 64
BATCH_SIZE = 16

# Use only part of text for quick experiments
sample_text = training_text[:10000]  # First 10k chars
train_dataset, num_examples = prepare_data_quick(sample_text, char_to_id, SEQ_LENGTH, BATCH_SIZE)
# The dataset is batched without dropping the remainder, so the last partial
# batch counts as a step too: round up instead of flooring.
steps_per_epoch = (num_examples + BATCH_SIZE - 1) // BATCH_SIZE

print(f"\nDataset ready: {num_examples} examples, {steps_per_epoch} steps/epoch")

## Model Training Experiments

Trying different model sizes and hyperparameters...

In [None]:
# Experiment 1: Small model
print("=== Experiment 1: Small Model ===")

# Smallest configuration: 2 blocks, 4 heads, 64-dim embeddings.
# The position table is sized to the training window length.
model_small = ExperimentalGPT(
    vocab_size=vocab_size,
    d_model=64,
    num_heads=4,
    num_layers=2,
    dff=128,
    maximum_position_encoding=SEQ_LENGTH,
    dropout=0.1,
)

# Optimizer and loss (the model returns logits, hence from_logits=True)
optimizer_small = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Quick test: one forward pass on a dummy batch, which also creates the weights
# so they can be counted below.
test_input = tf.constant([[1, 2, 3, 4, 5]], dtype=tf.int32)  # Dummy input
test_output = model_small(test_input)
print(f"Small model output shape: {test_output.shape}")

# Count parameters
total_params = sum(tf.size(weight).numpy() for weight in model_small.trainable_variables)
print(f"Small model parameters: {total_params:,}")

In [None]:
# Manual training loop with debugging
def train_model_experimental(model, dataset, optimizer, epochs=3, model_name="model", loss_function=None):
    """Train `model` with a hand-written loop and print verbose progress.

    Args:
        model: Keras model called as `model(inputs, training=True)`; its output
            is treated as logits over the vocabulary.
        dataset: iterable of (inputs, targets) batches, iterated once per epoch.
        optimizer: Keras optimizer used to apply the gradients.
        epochs: number of passes over `dataset`.
        model_name: label used only in the progress header.
        loss_function: optional callable `(targets, predictions) -> scalar loss`.
            Defaults to sparse categorical cross-entropy on logits, so the
            function does not rely on a notebook-level `loss_fn` being defined.

    Returns:
        dict with 'loss' and 'accuracy' lists holding one mean value per epoch.

    Raises:
        ValueError: if `dataset` yields no batches.
    """
    print(f"\nTraining {model_name} for {epochs} epochs...")

    if loss_function is None:
        loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # Track metrics
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

    history = {'loss': [], 'accuracy': []}

    def _reset_metric(metric):
        # `reset_states` is the deprecated spelling of `reset_state`; prefer the
        # new name and fall back only when it does not exist.
        reset = getattr(metric, 'reset_state', None) or metric.reset_states
        reset()

    def _token_text(token_id):
        # id_to_char is keyed by strings when loaded from JSON; also accept a
        # mapping keyed by ints.
        token_id = int(token_id)
        key = str(token_id)
        return id_to_char[key] if key in id_to_char else id_to_char[token_id]

    @tf.function
    def train_step(inputs, targets):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            loss = loss_function(targets, predictions)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        train_loss(loss)
        train_accuracy(targets, predictions)

        return loss, predictions

    # Training loop
    for epoch in range(epochs):
        start = time.time()

        _reset_metric(train_loss)
        _reset_metric(train_accuracy)

        # Progress tracking
        step_losses = []

        for step, (inputs, targets) in enumerate(dataset):
            loss, predictions = train_step(inputs, targets)
            step_loss = loss.numpy()
            step_losses.append(step_loss)

            # Print progress every 20 steps
            if step % 20 == 0:
                running_acc = train_accuracy.result().numpy()
                print(f"  Epoch {epoch+1}, Step {step:3d}: loss={step_loss:.4f}, acc={running_acc:.4f}")

                # Show sample prediction
                if step == 0:  # Only on first step
                    sample_pred = tf.nn.softmax(predictions[0, 0, :])  # First token of first example
                    top_3_preds = tf.nn.top_k(sample_pred, 3)
                    pred_chars = [_token_text(idx) for idx in top_3_preds.indices.numpy()]
                    target_char = _token_text(targets[0, 0].numpy())
                    print(f"    Sample prediction: {pred_chars} (target: '{target_char}')")

        if not step_losses:
            # Nothing was trained; fail with a clear message instead of an
            # IndexError on the loss-trend line below.
            raise ValueError("dataset yielded no batches; nothing to train on")

        # Epoch summary
        epoch_time = time.time() - start
        final_loss = train_loss.result().numpy()
        final_acc = train_accuracy.result().numpy()

        history['loss'].append(final_loss)
        history['accuracy'].append(final_acc)

        print(f"\n  Epoch {epoch+1} Summary:")
        print(f"    Loss: {final_loss:.4f}")
        print(f"    Accuracy: {final_acc:.4f}")
        print(f"    Time: {epoch_time:.2f}s")
        print(f"    Loss trend: {step_losses[0]:.4f} -> {step_losses[-1]:.4f}")
        print("-" * 50)

    return history

# Train small model (3 epochs) and keep the per-epoch loss/accuracy history
history_small = train_model_experimental(
    model_small,
    train_dataset,
    optimizer_small,
    epochs=3,
    model_name="Small",
)

In [None]:
# Experiment 2: Medium model
print("\n=== Experiment 2: Medium Model ===")

# Twice the width, heads and depth of the small model; same position table size.
model_medium = ExperimentalGPT(
    vocab_size=vocab_size,
    d_model=128,
    num_heads=8,
    num_layers=4,
    dff=256,
    maximum_position_encoding=SEQ_LENGTH,
    dropout=0.1,
)

optimizer_medium = tf.keras.optimizers.Adam(learning_rate=0.0005)  # Lower LR for bigger model

# Test model: a forward pass on the dummy batch creates the weights
test_output = model_medium(test_input)
print(f"Medium model output shape: {test_output.shape}")

total_params = sum(tf.size(weight).numpy() for weight in model_medium.trainable_variables)
print(f"Medium model parameters: {total_params:,}")

# Train medium model (fewer epochs due to size)
history_medium = train_model_experimental(
    model_medium,
    train_dataset,
    optimizer_medium,
    epochs=2,
    model_name="Medium",
)

## Text Generation Testing

Quick tests of text generation from trained models...

In [None]:
# Quick text generation function
def generate_quick(model, prompt, max_length=50, temperature=0.8):
    """Sample a continuation of `prompt` one character at a time.

    Args:
        model: model called as `model(ids, training=False)` that returns
            per-position logits over the vocabulary.
        prompt: seed text, tokenized character by character with `char_to_id`
            (unknown characters map to '<UNK>', or 0 if that is missing).
        max_length: number of new tokens to generate.
        temperature: logits are divided by this before sampling. Values <= 0
            select the most likely token (greedy) instead of dividing by zero.

    Returns:
        The prompt followed by the generated characters, as one string.
    """
    print(f"Generating from prompt: '{prompt}'")

    def _decode(token_ids):
        # id_to_char is keyed by strings when loaded from JSON; also accept a
        # mapping keyed by ints.
        pieces = []
        for token_id in token_ids:
            key = str(token_id)
            pieces.append(id_to_char[key] if key in id_to_char else id_to_char[token_id])
        return ''.join(pieces)

    # Tokenize prompt
    unk_id = char_to_id.get('<UNK>', 0)
    input_ids = [char_to_id.get(ch, unk_id) for ch in prompt]

    # Generate
    for i in range(max_length):
        # Feed at most the last SEQ_LENGTH tokens, without padding. The
        # attention is causal and has no padding mask, so left-padding with id 0
        # made the model attend to pad tokens it never saw during training.
        # An empty prompt falls back to a single id-0 token, the id the padding
        # used before.
        current_input = input_ids[-SEQ_LENGTH:] or [0]

        # Get prediction
        input_tensor = tf.constant([current_input])
        predictions = model(input_tensor, training=False)

        # Sample next token from the logits at the last position
        logits = predictions[0, -1, :]
        if temperature > 0:
            scaled = logits[tf.newaxis, :] / temperature
            predicted_id = int(tf.random.categorical(scaled, 1)[0, 0].numpy())
        else:
            predicted_id = int(tf.argmax(logits).numpy())

        input_ids.append(predicted_id)

        # Convert to text for progress (first 5 steps, then every 10th)
        if i % 10 == 0 or i < 5:
            current_text = _decode(input_ids)
            print(f"  Step {i:2d}: {current_text[-20:]}")

    # Final result
    return _decode(input_ids)

# Test generation with small model
print("=== Small Model Generation ===")
prompts = ["Alice", "The cat", "Once upon"]

for prompt in prompts:
    print(f"\n--- Prompt: '{prompt}' ---")
    try:
        generated = generate_quick(model_small, prompt, max_length=30, temperature=0.8)
        print(f"Generated: '{generated}'")
    except Exception as e:
        # Best-effort: keep going with the next prompt, but include the
        # exception class. str(e) alone is uninformative for e.g. a KeyError,
        # whose message is just the missing key.
        print(f"Error: {type(e).__name__}: {e}")
    print()

In [None]:
# Compare models side by side
print("=== Model Comparison ===")
test_prompt = "Alice was"

print(f"\nPrompt: '{test_prompt}'")
print("-" * 60)

# The lambdas defer the model lookup until inside the try block, so a model
# that was never built is reported for that entry only instead of aborting the cell.
comparison_models = [
    ("Small model", lambda: model_small),
    ("Medium model", lambda: model_medium),
]
comparison_outputs = {}

for position, (label, get_model) in enumerate(comparison_models):
    if position:
        print()  # Blank line between models
    try:
        print(f"{label}:")
        comparison_outputs[label] = generate_quick(get_model(), test_prompt, max_length=40, temperature=0.7)
        print(f"  Result: '{comparison_outputs[label]}'")
    except Exception as e:
        # Best-effort comparison: report the failure, including the exception
        # class, and move on to the next model.
        print(f"  Error: {type(e).__name__}: {e}")

# Keep the per-model result names available (None when generation failed).
small_gen = comparison_outputs.get("Small model")
medium_gen = comparison_outputs.get("Medium model")

## Quick Performance Analysis

Visualizing training results and model comparison...

In [None]:
# Plot training curves
fig, axes = plt.subplots(2, 2, figsize=(15, 10))


def _plot_history_metric(ax, metric_key, title, ylabel):
    """Draw the per-epoch small/medium curves for one history key on `ax`."""
    ax.plot(history_small[metric_key], 'b-o', label='Small Model', linewidth=2)
    if len(history_medium[metric_key]) > 0:
        ax.plot(history_medium[metric_key], 'r-s', label='Medium Model', linewidth=2)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.grid(True, alpha=0.3)


# Loss comparison
_plot_history_metric(axes[0, 0], 'loss', 'Training Loss Comparison', 'Loss')

# Accuracy comparison
_plot_history_metric(axes[0, 1], 'accuracy', 'Training Accuracy Comparison', 'Accuracy')

# Parameter count comparison
small_params = sum([tf.size(var).numpy() for var in model_small.trainable_variables])
medium_params = sum([tf.size(var).numpy() for var in model_medium.trainable_variables])

models = ['Small', 'Medium']
param_counts = [small_params, medium_params]

bars = axes[1, 0].bar(models, param_counts, color=['skyblue', 'lightcoral'], alpha=0.7)
axes[1, 0].set_title('Model Size Comparison')
axes[1, 0].set_ylabel('Parameters')

# Add parameter count labels on bars
for bar, count in zip(bars, param_counts):
    height = bar.get_height()
    axes[1, 0].text(bar.get_x() + bar.get_width()/2., height,
                   f'{count:,}', ha='center', va='bottom')

# Final performance summary (an empty history falls back to 0)
final_metrics = {
    'Small Model': {
        'Final Loss': history_small['loss'][-1] if history_small['loss'] else 0,
        'Final Accuracy': history_small['accuracy'][-1] if history_small['accuracy'] else 0,
        'Parameters': small_params
    },
    'Medium Model': {
        'Final Loss': history_medium['loss'][-1] if history_medium['loss'] else 0,
        'Final Accuracy': history_medium['accuracy'][-1] if history_medium['accuracy'] else 0,
        'Parameters': medium_params
    }
}

# Table-like visualization: one pre-formatted row per model
table_data = [
    [
        model_name,
        f"{metrics['Final Loss']:.4f}",
        f"{metrics['Final Accuracy']:.4f}",
        f"{metrics['Parameters']:,}",
    ]
    for model_name, metrics in final_metrics.items()
]

axes[1, 1].axis('tight')
axes[1, 1].axis('off')
table = axes[1, 1].table(cellText=table_data,
                        colLabels=['Model', 'Final Loss', 'Final Acc', 'Parameters'],
                        cellLoc='center',
                        loc='center')
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.5)
axes[1, 1].set_title('Final Metrics Summary')

plt.tight_layout()
plt.show()

# Text summary. The final losses come from `final_metrics`, so an empty history
# prints 0 like the table does instead of raising IndexError on [-1].
print("\n📊 Training Experiments Summary:")
print(f"- Small Model: {small_params:,} params, final loss: {final_metrics['Small Model']['Final Loss']:.4f}")
print(f"- Medium Model: {medium_params:,} params, final loss: {final_metrics['Medium Model']['Final Loss']:.4f}")
print(f"- Training data: {num_examples} examples, {steps_per_epoch} steps/epoch")
print(f"- Sequence length: {SEQ_LENGTH}, Vocabulary: {vocab_size} tokens")

In [None]:
# Save models and results for later use
experiment_dir = f"{NOTEBOOK_DIR}/training_experiments"
os.makedirs(experiment_dir, exist_ok=True)

# Save models (weights only), small first, then medium
for size_tag, trained_model in (("small", model_small), ("medium", model_medium)):
    trained_model.save_weights(f"{experiment_dir}/{size_tag}_model_weights")

# Save training history together with the configurations that produced it.
# NOTE: the model_configs values are written out by hand here; they are not
# read back from the model objects.
results_payload = {
    'small_history': history_small,
    'medium_history': history_medium,
    'model_configs': {
        'small': {'d_model': 64, 'num_heads': 4, 'num_layers': 2, 'dff': 128},
        'medium': {'d_model': 128, 'num_heads': 8, 'num_layers': 4, 'dff': 256}
    },
    'training_config': {
        'seq_length': SEQ_LENGTH,
        'batch_size': BATCH_SIZE,
        'vocab_size': vocab_size,
        'num_examples': num_examples
    }
}
with open(f"{experiment_dir}/training_history.pkl", 'wb') as f:
    pickle.dump(results_payload, f)

# Quick experiment log
# NOTE: only the values in braces are filled in from this run. The
# "Observations" and "Next Steps" sections are static, hand-written text.
experiment_log = f"""
Training Experiments Log - {datetime.now()}
==========================================

Models Tested:
1. Small Model: 64d, 4h, 2l -> {small_params:,} params
   - Final Loss: {history_small['loss'][-1]:.4f}
   - Final Accuracy: {history_small['accuracy'][-1]:.4f}

2. Medium Model: 128d, 8h, 4l -> {medium_params:,} params
   - Final Loss: {history_medium['loss'][-1]:.4f}
   - Final Accuracy: {history_medium['accuracy'][-1]:.4f}

Training Setup:
- Data: Alice story (first 10k chars)
- Sequence Length: {SEQ_LENGTH}
- Batch Size: {BATCH_SIZE}
- Vocabulary: {vocab_size} character tokens
- Examples: {num_examples}

Observations:
- Both models converged quickly on this small dataset
- Medium model shows better final metrics but much larger
- Text generation works but needs more training data
- Character-level tokenization works well for this dataset size

Next Steps:
- Scale up training data
- Try different learning rate schedules
- Implement better text generation
- Add proper evaluation metrics
- Try different tokenization approaches
"""

with open(f"{experiment_dir}/experiment_log.txt", 'w') as f:
    f.write(experiment_log)

print(f"✓ Saved experiment results to {experiment_dir}")
print("\n🎉 Training experiments complete!")
print("Ready to implement clean training pipeline in organized modules...")