# GPT Experimental Development - Messy Prototype

This is my experimental notebook where I'm figuring out how to build a GPT from scratch. 
Lots of trial and error, debugging prints, and quick hacks here.

**TODO**: Clean this up later into proper modules

## Import Libraries and Setup

Just importing everything I might need... will clean this up later

In [None]:
# Let me just import everything I might need
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os
import sys
import json
import pickle
import re
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# Some imports I'm not sure I need but keeping just in case
from sklearn.metrics import accuracy_score
import math
from datetime import datetime
import random

print("Tensorflow version:", tf.__version__)
print("GPU available:", tf.config.list_physical_devices('GPU'))

# Project location. Defaults to the original hardcoded path, but can be
# overridden with the GPT_PROJECT_PATH environment variable so the notebook
# also runs on machines with a different directory layout.
PROJECT_PATH = os.environ.get('GPT_PROJECT_PATH', '/home/akshat/GPT_from_scratch')
DATA_PATH = f'{PROJECT_PATH}/text_data'
print(f"Working from: {PROJECT_PATH}")

## Messy Data Loading and Exploration

Let me just load some data and see what we're working with...

In [None]:
# Load some text data - using Alice for now because it's small
alice_path = f'{DATA_PATH}/alice_story.txt'

try:
    with open(alice_path, 'r', encoding='utf-8') as f:
        text_data = f.read()
    print("✓ Loaded Alice story")
except (OSError, UnicodeDecodeError) as load_error:
    # Only problems with the file itself (missing, unreadable, not valid
    # UTF-8) trigger the fallback. A bare `except:` would also swallow
    # KeyboardInterrupt and plain coding mistakes.
    print("❌ Failed to load Alice story")
    print(f"   Reason: {load_error}")
    # Let me try a different file
    alice_path = f'{DATA_PATH}/alice_extended.txt'
    with open(alice_path, 'r', encoding='utf-8') as f:
        text_data = f.read()
    print("✓ Loaded Alice extended")

print(f"Text length: {len(text_data)} characters")
print(f"First 200 chars: {text_data[:200]}")
print(f"Last 200 chars: {text_data[-200:]}")

# Quick stats. The characters are sorted so the preview below is the same on
# every run; iterating a raw set of strings gives an order that changes with
# Python's per-process hash randomization.
unique_chars = sorted(set(text_data))
print(f"\nUnique characters: {len(unique_chars)}")
print(f"Characters: {unique_chars[:20]}...")  # Just show first 20

# Word-level stats too. Words are whitespace-delimited, so punctuation stays
# attached to the neighbouring word.
words = text_data.split()
print(f"\nTotal words: {len(words)}")
print(f"Unique words: {len(set(words))}")
print(f"First 10 words: {words[:10]}")

## Quick and Dirty Tokenizer Experiments

Let me try different ways to tokenize this...

In [None]:
# Character-level tokenizer - simplest approach
def char_tokenizer(text):
    """Build a character-level vocabulary from ``text``.

    Returns a ``(char_to_id, id_to_char, chars)`` tuple: the forward lookup
    table, the reverse lookup table, and the sorted list of distinct
    characters. A character's id is its position in that sorted list.
    """
    alphabet = sorted(set(text))
    forward = {}
    backward = {}
    for index, symbol in enumerate(alphabet):
        forward[symbol] = index
        backward[index] = symbol
    return forward, backward, alphabet

char_to_id, id_to_char, vocab = char_tokenizer(text_data)
print(f"Character vocab size: {len(vocab)}")
print(f"Vocab: {vocab}")

# Round-trip a short sample: encode it, then decode the ids again.
# Characters missing from the vocab are encoded as id 0.
test_text = "Hello Alice!"
tokens = [char_to_id.get(symbol, 0) for symbol in test_text]
reconstructed = ''.join(map(lambda token_id: id_to_char.get(token_id, '?'), tokens))
print(f"\nTest: '{test_text}'")
print(f"Tokens: {tokens}")
print(f"Reconstructed: '{reconstructed}'")

# Let's also try a simple word tokenizer
def simple_word_tokenizer(text):
    """Build a lowercase word-level vocabulary from ``text``.

    Every character other than ASCII letters, digits, whitespace and
    ``. , ! ?`` is replaced by a space before the text is lowercased and
    split on whitespace.

    Returns a ``(word_to_id, id_to_word, vocab)`` tuple. ``vocab`` is the
    sorted list of distinct words; ids start at 1 in that order, and id 0 is
    reserved for the ``'<UNK>'`` entry, which is present in both lookup
    tables but not in ``vocab``.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s.,!?]', ' ', text)
    vocab = sorted(set(cleaned.lower().split()))

    # Real words occupy ids 1..len(vocab); the unknown marker is added last.
    word_to_id = dict(zip(vocab, range(1, len(vocab) + 1)))
    word_to_id['<UNK>'] = 0
    id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))

    return word_to_id, id_to_word, vocab

word_to_id, id_to_word, word_vocab = simple_word_tokenizer(text_data)
print(f"\nWord vocab size: {len(word_vocab)}")
print(f"First 20 words: {word_vocab[:20]}")

# Encode a sample sentence; words missing from the vocab map to 0 (<UNK>)
test_words = "hello alice how are you?".split()
word_tokens = list(map(lambda word: word_to_id.get(word, 0), test_words))
print(f"\nWord test: {test_words}")
print(f"Word tokens: {word_tokens}")

## Rough Model Architecture Prototyping

Let me build some basic transformer layers... this is going to be messy.

In [None]:
# Let me try to build a simple attention mechanism first
class SimpleAttention(tf.keras.layers.Layer):
    """Multi-head causal self-attention layer.

    Projects the input to queries, keys and values, splits them into
    ``num_heads`` heads of width ``d_model // num_heads``, applies scaled
    dot-product attention under a lower-triangular (causal) mask, then merges
    the heads and applies a final linear projection.

    Args:
        d_model: Feature width of the input and output. Must be a multiple
            of ``num_heads``.
        num_heads: Number of attention heads (positive).
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model`` evenly.
    """

    def __init__(self, d_model, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Fail early: with a truncated head_dim the reshapes in call() would
        # only break later, with a much less helpful shape error.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be a multiple of a positive num_heads ({num_heads})"
            )

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        print(f"Creating attention with d_model={d_model}, heads={num_heads}, head_dim={self.head_dim}")

        # Linear projections for Q, K, V and the output
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.wo = tf.keras.layers.Dense(d_model)

    def call(self, x, mask=None):
        """Apply causal self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, seq, d_model).
            mask: Optional padding mask of shape (batch, seq) following the
                Keras convention that truthy entries mark real tokens. Masked
                positions are excluded as attention keys. With ``None`` (the
                default) only the causal mask is applied.

        Returns:
            Tensor of shape (batch, seq, d_model).
        """
        input_shape = tf.shape(x)
        batch_size, seq_len = input_shape[0], input_shape[1]

        # Get Q, K, V: each (batch, seq, d_model)
        q = self.wq(x)
        k = self.wk(x)
        v = self.wv(x)

        # Split the feature axis into heads, then move heads ahead of the
        # sequence axis: (batch, heads, seq, head_dim)
        per_head = (batch_size, seq_len, self.num_heads, self.head_dim)
        q = tf.transpose(tf.reshape(q, per_head), [0, 2, 1, 3])
        k = tf.transpose(tf.reshape(k, per_head), [0, 2, 1, 3])
        v = tf.transpose(tf.reshape(v, per_head), [0, 2, 1, 3])

        # Scaled dot-product attention: (batch, heads, seq, seq)
        scores = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(tf.cast(self.head_dim, tf.float32))

        # Causal mask for GPT: the lower triangle (including the diagonal) is
        # kept, every future position gets a large negative score.
        mask_val = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        mask_val = tf.where(mask_val == 0, -1e9, 0.0)
        scores += mask_val

        # Optional padding mask: push masked keys to a large negative score
        # for every head and every query position.
        if mask is not None:
            key_is_real = tf.cast(mask, tf.bool)[:, tf.newaxis, tf.newaxis, :]
            scores += tf.where(key_is_real, 0.0, -1e9)

        # Softmax over the key axis
        attention_weights = tf.nn.softmax(scores, axis=-1)

        # Apply attention to values
        out = tf.matmul(attention_weights, v)

        # Concatenate heads back into (batch, seq, d_model)
        out = tf.transpose(out, [0, 2, 1, 3])
        out = tf.reshape(out, (batch_size, seq_len, self.d_model))

        # Final linear layer
        return self.wo(out)

# Test the attention layer
print("Testing attention layer...")
# Smoke test on random data shaped (batch=2, seq=10, features=64);
# the layer must return a tensor of the same shape.
attention = SimpleAttention(64, num_heads=4)
test_input = tf.random.normal(shape=(2, 10, 64))
output = attention(test_input)
print(f"Input shape: {test_input.shape}")
print(f"Output shape: {output.shape}")
print("✓ Attention layer works!")

In [None]:
# Now let me build a simple transformer block
class SimpleTransformerBlock(tf.keras.layers.Layer):
    """Transformer block: self-attention followed by a two-layer MLP.

    Each sub-layer output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalized.

    Args:
        d_model: Feature width of the input and output.
        num_heads: Number of heads for the attention sub-layer.
        ff_dim: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate applied to both sub-layer outputs.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, d_model, num_heads, ff_dim, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        print(f"Creating transformer block: d_model={d_model}, heads={num_heads}, ff_dim={ff_dim}")

        layers = tf.keras.layers

        self.attention = SimpleAttention(d_model, num_heads)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)

        # Feed-forward network: expand to ff_dim with ReLU, project back
        expand = layers.Dense(ff_dim, activation='relu')
        project = layers.Dense(d_model)
        self.ff = tf.keras.Sequential([expand, project])

        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, x, training=None):
        """Run the block on ``x``; ``training`` toggles the dropout layers."""
        # Attention sub-layer with residual connection
        attended = self.dropout1(self.attention(x), training=training)
        hidden = self.norm1(x + attended)

        # Feed-forward sub-layer with residual connection
        transformed = self.dropout2(self.ff(hidden), training=training)
        return self.norm2(hidden + transformed)

# Test transformer block
print("\nTesting transformer block...")
# Smoke test with dropout active (training=True) on random
# (batch=2, seq=10, features=64) data.
block = SimpleTransformerBlock(64, 4, ff_dim=128)
test_input = tf.random.normal(shape=(2, 10, 64))
output = block(test_input, training=True)
print(f"Input shape: {test_input.shape}")
print(f"Output shape: {output.shape}")
print("✓ Transformer block works!")

In [None]:
# Now let me build a simple GPT model
class SimpleGPT(tf.keras.Model):
    """Small decoder-only language model.

    Token ids are embedded, summed with learned positional embeddings, passed
    through ``num_layers`` transformer blocks and a final layer norm, and
    projected to per-position vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per block.
        num_layers: Number of transformer blocks.
        ff_dim: Hidden width of each block's feed-forward network.
        max_len: Number of positions the positional embedding table holds;
            input sequences must not be longer than this.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, ff_dim, max_len=512, **kwargs):
        super().__init__(**kwargs)
        print(f"Creating GPT: vocab={vocab_size}, d_model={d_model}, layers={num_layers}")

        self.d_model = d_model
        self.vocab_size = vocab_size
        self.max_len = max_len

        # Embeddings
        self.token_embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.pos_embedding = tf.keras.layers.Embedding(max_len, d_model)

        # Transformer blocks
        self.blocks = [SimpleTransformerBlock(d_model, num_heads, ff_dim)
                      for _ in range(num_layers)]

        # Output head
        self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.head = tf.keras.layers.Dense(vocab_size)

    def call(self, x, training=None):
        """Return logits of shape (batch, seq, vocab_size) for token ids ``x``.

        Args:
            x: Integer tensor of token ids, shape (batch, seq).
            training: Forwarded to the transformer blocks (controls dropout).

        Raises:
            tf.errors.InvalidArgumentError: If the sequence is longer than
                ``max_len``.
        """
        seq_len = tf.shape(x)[1]

        # The positional table only has max_len rows; reject longer inputs
        # explicitly instead of relying on the embedding lookup to complain.
        tf.debugging.assert_less_equal(
            seq_len,
            self.max_len,
            message="Input sequence is longer than max_len; no positional embedding exists for the extra positions.",
        )

        # Embeddings
        positions = tf.range(seq_len)
        token_emb = self.token_embedding(x)
        pos_emb = self.pos_embedding(positions)

        # Add embeddings; pos_emb (seq, d_model) broadcasts over the batch
        x = token_emb + pos_emb

        # Apply transformer blocks
        for i, block in enumerate(self.blocks):
            x = block(x, training=training)
            # Debug print for the first block. Only possible in eager mode:
            # a symbolic tensor has no concrete value to format with ':.4f'.
            if i == 0 and tf.executing_eagerly():
                print(f"After block {i}: shape {x.shape}, mean {tf.reduce_mean(x):.4f}")

        # Final norm and head
        x = self.norm(x)
        return self.head(x)

# Test the full model
print("\nTesting full GPT model...")
vocab_size = len(char_to_id)  # Use our character vocab
model = SimpleGPT(vocab_size, 64, 4, num_layers=2, ff_dim=128, max_len=128)

# Encode a short prompt and right-pad it with id 0 up to 20 tokens,
# wrapped in an outer list to form a batch of one.
test_sequence = [char_to_id.get(ch, 0) for ch in "Hello Alice"]
test_input = tf.constant([test_sequence + [0 for _ in range(20 - len(test_sequence))]])
print(f"Test input shape: {test_input.shape}")
print(f"Test input: {test_input}")

# Forward pass in inference mode (dropout disabled)
output = model(test_input, training=False)
print(f"Model output shape: {output.shape}")
print(f"Output sample: {output[0, 0, :5]}")
print("✓ Full model works!")

## Hacky Training Loop with Print Debugging

Let me write a quick training loop with lots of debug prints...

In [None]:
# Prepare some training data quickly
def prepare_training_data(text, tokenizer_dict, seq_len=32):
    """Turn ``text`` into overlapping next-token training pairs.

    Each character is mapped to an id through ``tokenizer_dict`` (characters
    missing from the dict become id 0). Windows of ``seq_len`` ids are taken
    every ``seq_len // 2`` positions (at least 1); the target of each window
    is the same window shifted one position to the right.

    Args:
        text: Raw text to tokenize.
        tokenizer_dict: Mapping from character to integer id.
        seq_len: Window length; must be at least 1.

    Returns:
        ``(inputs, targets)`` as two integer NumPy arrays of shape
        ``(num_examples, seq_len)``. When the text is too short for a single
        window both arrays have shape ``(0, seq_len)``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be at least 1, got {seq_len}")

    print(f"Preparing training data with seq_len={seq_len}")

    # Tokenize the entire text
    tokens = [tokenizer_dict.get(ch, 0) for ch in text]
    print(f"Total tokens: {len(tokens)}")

    # Half-window stride gives overlapping examples. max() keeps the stride
    # valid for seq_len == 1, where seq_len // 2 would be 0.
    stride = max(1, seq_len // 2)

    # Stopping before len(tokens) - seq_len guarantees that both the window
    # and its one-step-shifted target are full length.
    starts = range(0, len(tokens) - seq_len, stride)
    inputs = [tokens[i:i + seq_len] for i in starts]
    targets = [tokens[i + 1:i + seq_len + 1] for i in starts]

    print(f"Created {len(inputs)} training examples")

    if not inputs:
        # Keep the 2-D integer layout even when there is nothing to return.
        return np.empty((0, seq_len), dtype=int), np.empty((0, seq_len), dtype=int)
    return np.array(inputs), np.array(targets)

# Prepare data
# Window length shared by data preparation, the small model and generation
seq_length = 32
# Use only first 5k chars for speed
X_train, y_train = prepare_training_data(text_data[:5000], char_to_id, seq_len=seq_length)
print(f"Training data shapes: X={X_train.shape}, y={y_train.shape}")
print(f"Sample input: {X_train[0][:10]}")
print(f"Sample target: {y_train[0][:10]}")

# Wrap the arrays in a batched tf.data pipeline (original order, no shuffling)
batch_size = 8
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
print(f"Created dataset with batch_size={batch_size}")

In [None]:
# Create a small model for quick testing
# Tiny configuration so each epoch finishes quickly
small_model = SimpleGPT(
    vocab_size,
    32,  # d_model
    2,   # num_heads
    2,   # num_layers
    64,  # ff_dim
    max_len=seq_length,
)

# Loss and optimizer for the manual training loop. The model emits raw
# logits, hence from_logits=True. No model.compile() call is involved.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

print("Model compiled!")

# Manual training loop with lots of debugging
def train_step(inputs, targets):
    """Run one optimization step on the notebook-level ``small_model``.

    Computes the loss with ``loss_fn`` on a forward pass in training mode,
    backpropagates, and applies the gradients with ``optimizer``.

    Returns:
        ``(loss, predictions)`` for the batch, where ``predictions`` are the
        model outputs from before the weight update.
    """
    with tf.GradientTape() as tape:
        logits = small_model(inputs, training=True)
        batch_loss = loss_fn(targets, logits)

    # Looked up after the forward pass, which is where the original code
    # reads the variable list as well.
    weights = small_model.trainable_variables
    grads = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    return batch_loss, logits

# Training loop
epochs = 3
losses = []  # average loss of each finished epoch

print(f"\nStarting training for {epochs} epochs...")
for epoch in range(epochs):
    print(f"\n=== EPOCH {epoch+1}/{epochs} ===")
    epoch_losses = []

    for step, (batch_inputs, batch_targets) in enumerate(train_dataset):
        loss, predictions = train_step(batch_inputs, batch_targets)
        loss_value = loss.numpy()
        epoch_losses.append(loss_value)

        # Only report every 10th step
        if step % 10 != 0:
            continue

        print(f"  Step {step:3d}: loss = {loss_value:.4f}")

        # Sanity check: most likely next character for the first position of
        # the first example, next to the character that actually follows.
        sample_pred = tf.nn.softmax(predictions[0, 0, :])
        top_pred = tf.argmax(sample_pred).numpy()
        pred_char = id_to_char.get(top_pred, '?')
        target_char = id_to_char.get(batch_targets[0, 0].numpy(), '?')
        print(f"    Predicted '{pred_char}' (confidence: {sample_pred[top_pred]:.3f}), Target: '{target_char}'")

    avg_loss = np.mean(epoch_losses)
    losses.append(avg_loss)
    print(f"  Epoch {epoch+1} average loss: {avg_loss:.4f}")

print("\n✓ Training completed!")
print(f"Final loss: {losses[-1]:.4f}")

## Model Testing with Manual Validation

Let me test the model with some manual examples...

In [None]:
# Test text generation
def generate_text(model, start_text, max_length=50, temperature=1.0):
    """Sample a continuation of ``start_text`` from ``model``, one character at a time.

    Uses the notebook-level ``char_to_id`` / ``id_to_char`` tables and the
    ``seq_length`` context window. Prompt characters missing from
    ``char_to_id`` are encoded as id 0.

    Args:
        model: Callable as ``model(tokens, training=False)`` returning logits
            of shape (batch, seq, vocab).
        start_text: Prompt the generation starts from.
        max_length: Number of characters to generate after the prompt.
        temperature: Positive divisor applied to the logits before sampling;
            smaller values make the sampling more deterministic.

    Returns:
        The prompt followed by the generated characters.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    print(f"Generating text starting with: '{start_text}'")

    # Never feed more tokens than the training window, nor more than the
    # model has positional embeddings for (when it exposes max_len).
    context_len = min(seq_length, getattr(model, 'max_len', seq_length))

    # Convert start text to tokens
    generated = [char_to_id.get(ch, 0) for ch in start_text]

    for i in range(max_length):
        # Feed only real context. Left-padding a short prompt with id 0 would
        # put a run of an actual vocabulary character in front of it, which
        # the model never saw during training. An empty prompt falls back to
        # a single id-0 token so there is a position to predict from.
        current_seq = generated[-context_len:] or [0]
        input_tensor = tf.constant([current_seq])

        # Logits for the token that follows the last context position
        predictions = model(input_tensor, training=False)
        next_token_logits = predictions[0, -1, :] / temperature

        # Sample from the distribution
        sampled = tf.random.categorical(tf.expand_dims(next_token_logits, 0), 1)
        next_token = int(sampled[0, 0].numpy())
        generated.append(next_token)

        # Convert to character and print progress
        if i % 10 == 0:
            next_char = id_to_char.get(next_token, '?')
            current_text = ''.join(id_to_char.get(tok, '?') for tok in generated)
            print(f"  Step {i:2d}: '{current_text[-20:]}' (next: '{next_char}')")

    # Convert back to text
    return ''.join(id_to_char.get(tok, '?') for tok in generated)

# Test generation
# Prompts to sample continuations for
test_prompts = ["Alice", "The", "Once"]

for prompt in test_prompts:
    print("\n" + '=' * 50)  # separator line between prompts
    generated = generate_text(small_model, prompt, 30, temperature=0.8)
    print(f"\nGenerated: '{generated}'")
    print(f"Length: {len(generated)} characters")

In [None]:
# Quick accuracy test
def test_model_accuracy(model, test_inputs, test_targets, num_samples=50):
    """Compute token-level accuracy on the first ``num_samples`` examples.

    Also prints up to three decoded input / target / prediction examples
    using the notebook-level ``id_to_char`` table.

    Args:
        model: Callable as ``model(tokens, training=False)`` returning logits
            of shape (batch, seq, vocab).
        test_inputs: Integer token ids, shape (num_examples, seq).
        test_targets: Expected next-token ids, same shape as ``test_inputs``.
        num_samples: Maximum number of examples to evaluate.

    Returns:
        Fraction of positions where the arg-max prediction equals the target,
        as a ``numpy.float32`` scalar.

    Raises:
        ValueError: If there are no samples to evaluate.
    """
    # Take a subset for quick testing
    test_X = test_inputs[:num_samples]
    test_y = np.asarray(test_targets[:num_samples])
    sample_count = len(test_X)
    if sample_count == 0:
        raise ValueError("test_model_accuracy needs at least one sample")

    print(f"Testing model accuracy on {sample_count} samples...")

    predictions = model(test_X, training=False)
    # Bring the predictions back to NumPy. Iterating an eager tensor yields
    # scalar tensors, which are unhashable and therefore cannot be used as
    # keys for the id_to_char lookups below.
    predicted_tokens = tf.argmax(predictions, axis=-1).numpy()

    # Calculate token-level accuracy
    accuracy = np.mean(predicted_tokens == test_y, dtype=np.float32)

    print(f"Token-level accuracy: {accuracy:.4f}")

    # Show some examples (never more than were actually evaluated)
    shown_inputs = np.asarray(test_X)
    for i in range(min(3, sample_count)):
        input_text = ''.join(id_to_char.get(tok, '?') for tok in shown_inputs[i])
        target_text = ''.join(id_to_char.get(tok, '?') for tok in test_y[i])
        pred_text = ''.join(id_to_char.get(tok, '?') for tok in predicted_tokens[i])

        print(f"\nExample {i+1}:")
        print(f"  Input:  '{input_text[:30]}...'")
        print(f"  Target: '{target_text[:30]}...'")
        print(f"  Pred:   '{pred_text[:30]}...'")

    return accuracy

# Test accuracy
# Measured on the first 50 training sequences, i.e. data the model was
# trained on, so this is not a held-out score.
acc = test_model_accuracy(small_model, X_train, y_train, 50)
print(f"\nFinal model accuracy: {acc:.4f}")

## Performance Analysis with Ad-hoc Visualizations

Let me make some quick plots to see how we did...

In [None]:
# Plot training loss
from collections import Counter  # cell-local import, used for the frequency panels

plt.figure(figsize=(10, 6))

# Panel 1: training loss per epoch, each point annotated with its value
plt.subplot(2, 2, 1)
plt.plot(losses, 'b-', linewidth=2, marker='o')
plt.title('Training Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)
for i, loss in enumerate(losses):
    plt.annotate(f'{loss:.3f}', (i, loss), textcoords="offset points", xytext=(0,10), ha='center')

# Panel 2: the 20 most frequent characters in the first 1000 chars.
# most_common() ranks by count; slicing the dict keys would only give the
# first 20 distinct characters encountered, whatever their frequency.
plt.subplot(2, 2, 2)
token_counts = Counter(text_data[:1000])
top_tokens = token_counts.most_common(20)
chars = [ch for ch, _ in top_tokens]
counts = [count for _, count in top_tokens]
plt.bar(range(len(chars)), counts)
plt.title('Top 20 Character Frequencies')
plt.xlabel('Character')
plt.ylabel('Count')
plt.xticks(range(len(chars)), chars, rotation=45)

# Panel 3: model parameter count (rough estimate).
# NOTE: the pie only covers the first 5 trainable variables, labelled as
# "layers"; the title carries the total over all variables.
plt.subplot(2, 2, 3)
total_params = sum([tf.size(var).numpy() for var in small_model.trainable_variables])
layer_sizes = [tf.size(var).numpy() for var in small_model.trainable_variables[:5]]
layer_names = [f'Layer {i+1}' for i in range(len(layer_sizes))]

plt.pie(layer_sizes, labels=layer_names, autopct='%1.1f%%')
plt.title(f'Model Parameters Distribution\nTotal: {total_params:,} params')

# Panel 4: the 15 most frequent characters in a freshly generated sample
plt.subplot(2, 2, 4)
sample_generated = generate_text(small_model, "Alice", max_length=100, temperature=0.8)
gen_char_counts = Counter(sample_generated)
top_generated = gen_char_counts.most_common(15)
gen_chars = [ch for ch, _ in top_generated]
gen_counts = [count for _, count in top_generated]
plt.bar(range(len(gen_chars)), gen_counts, alpha=0.7, color='orange')
plt.title('Generated Text Character Dist.')
plt.xlabel('Character')
plt.ylabel('Count')
plt.xticks(range(len(gen_chars)), gen_chars, rotation=45)

plt.tight_layout()
plt.show()

print(f"\nModel Summary:")
print(f"- Vocabulary size: {vocab_size}")
print(f"- Total parameters: {total_params:,}")
print(f"- Final training loss: {losses[-1]:.4f}")
print(f"- Token accuracy: {acc:.4f}")
print(f"- Training data size: {len(X_train)} sequences")

In [None]:
# Let me save this model quickly in case I want to use it later
import os

# Everything from this run goes into one checkpoint folder
checkpoint_dir = f"{PROJECT_PATH}/notebooks/messy_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# 1) The trained model
model_path = f"{checkpoint_dir}/quick_gpt_prototype.keras"
small_model.save(model_path)
print(f"✓ Saved model to: {model_path}")

# 2) The character tokenizer.
# NOTE: JSON object keys are always strings, so the integer keys of
# id_to_char are written as strings and need converting back after loading.
tokenizer_path = f"{checkpoint_dir}/char_tokenizer.json"
tokenizer_payload = {
    'char_to_id': char_to_id,
    'id_to_char': id_to_char,
    'vocab_size': vocab_size
}
with open(tokenizer_path, 'w') as f:
    f.write(json.dumps(tokenizer_payload))
print(f"✓ Saved tokenizer to: {tokenizer_path}")

# 3) A short plain-text summary of this run
notes = f"""
Quick GPT Prototype - {datetime.now().strftime('%Y-%m-%d %H:%M')}
==================================================
Model specs:
- d_model: 32
- num_heads: 2  
- num_layers: 2
- vocab_size: {vocab_size}
- sequence_length: {seq_length}

Training:
- epochs: {epochs}
- batch_size: {batch_size}
- final_loss: {losses[-1]:.4f}
- token_accuracy: {acc:.4f}

TODO:
- Clean up the architecture
- Better data preprocessing
- More training epochs
- Proper evaluation metrics
- Hyperparameter tuning
"""

notes_path = f"{checkpoint_dir}/experiment_notes.txt"
with open(notes_path, 'w') as f:
    f.write(notes)
print(f"✓ Saved notes to: {notes_path}")

print("\n🎉 Experiment complete! Time to clean this up into proper modules...")