# Get the GPT-2 checkpoints to use in our C code

In [None]:
# Install requirements
!pip install transformers torch

In [None]:
import torch
import numpy as np
import struct
from transformers import GPT2Model
import hashlib
import json

In [None]:
# Python script to convert weights
import torch
from transformers import GPT2Model, GPT2Config

# Checkpoint to export. Available sizes:
#   'gpt2' (124M), 'gpt2-medium' (355M), 'gpt2-large' (774M), 'gpt2-xl' (1.5B)
model_name = 'gpt2'
model = GPT2Model.from_pretrained(model_name)

# Dump the raw state dict next to the notebook as "<model_name>_weights.bin".
torch.save(model.state_dict(), model_name + '_weights.bin')

In [None]:
# A bare expression as the last line of a notebook cell is displayed, so this
# shows the repr of the model loaded in the previous cell.
model

In [None]:
# Layout constants; they have to agree with the values used on the C side.
FLOAT_SIZE = 4    # bytes per element, i.e. sizeof(float)
CACHE_ALIGN = 64  # alignment boundary, in bytes

In [None]:
def align_up(n, alignment=CACHE_ALIGN):
    """Round a float count up so that its byte size is a multiple of `alignment`.

    `n` is a number of floats; the return value is also a number of floats
    (the rounded-up byte size divided by FLOAT_SIZE).
    """
    # Push the byte size past the next boundary, then drop the remainder:
    # x - x % a is the same as (x // a) * a.
    padded = n * FLOAT_SIZE + alignment - 1
    padded -= padded % alignment
    return padded // FLOAT_SIZE

In [None]:
def create_bump_weights(model_name='gpt2', output_file='gpt2_bump.weights'):
    """
    Convert a HuggingFace GPT-2 checkpoint to the bump-aligned weight format.

    The output file is a 128-byte header followed by float32 tensors. The
    tensors are written in the order that is meant to mirror
    layout_transformer() on the C side; runtime buffers and canaries that the
    C layout interleaves between weights are not written. A JSON sidecar,
    ``<output_file>.map.json``, records the offset and size of every tensor.

    Padding is applied per axis: each tensor is copied into the top-left
    corner of a zero-filled array of its padded shape, so a row always starts
    at a multiple of the padded row length. This matches how
    verify_bump_weights() reads the token embeddings (row stride
    ``aligned_embed_dim``). When ``embed_dim * FLOAT_SIZE`` is already a
    multiple of CACHE_ALIGN there is no padding and the bytes written are the
    same as a plain flat dump.
    NOTE(review): the padded shapes used for the 2-D weight matrices assume
    the C code indexes them with the padded row stride - confirm against
    layout_transformer().

    Args:
        model_name: Name passed to ``GPT2Model.from_pretrained``.
        output_file: Path of the weight file to create (overwritten).

    Returns:
        ``output_file``.

    Raises:
        ValueError: If a tensor does not fit into its padded shape.
        RuntimeError: If the assembled header is not exactly 128 bytes.
    """

    print(f"Loading {model_name} from HuggingFace...")
    model = GPT2Model.from_pretrained(model_name)
    state_dict = model.state_dict()
    config = model.config

    # Extract dimensions
    n_layers = config.n_layer
    n_heads = config.n_head
    embed_dim = config.n_embd
    vocab_size = config.vocab_size
    context_len = config.n_positions
    head_dim = embed_dim // n_heads

    # Calculate aligned dimensions (matching C code)
    aligned_embed_dim = align_up(embed_dim)
    aligned_head_dim = align_up(head_dim)
    aligned_context = align_up(context_len)

    print(f"\nModel Configuration:")
    print(f"  Layers:              {n_layers}")
    print(f"  Embedding:           {embed_dim} -> aligned: {aligned_embed_dim * FLOAT_SIZE} bytes")
    print(f"  Heads:               {n_heads}")
    print(f"  Head dim:            {head_dim} -> aligned: {aligned_head_dim * FLOAT_SIZE} bytes")
    print(f"  Vocab:               {vocab_size}")
    print(f"  Context:             {context_len} -> aligned: {aligned_context * FLOAT_SIZE} bytes")

    # ============================================================================
    # HEADER STRUCTURE (128 bytes total for nice alignment)
    # ============================================================================
    # Magic:           8 bytes   "BUMPWGT2"
    # Version:         4 bytes   (2)
    # Model Type:      4 bytes   (0=GPT2, 1=LLAMA, 2=MISTRAL, etc)
    # Hyperparams:     6 * 4 = 24 bytes
    # Aligned dims:    3 * 8 = 24 bytes (using size_t/uint64)
    # Checksum:        32 bytes  (SHA256 of everything after the header)
    # Reserved:        32 bytes  (for future use)
    # ============================================================================
    header_size = 128

    weight_map = {}                 # name -> offset/size record, for the sidecar
    current_offset = header_size    # file offset of the next tensor
    hasher = hashlib.sha256()       # fed as tensors are written; avoids re-reading the file

    # Padded shapes shared by most tensors.
    vec_shape = (aligned_embed_dim,)
    mat_shape = (aligned_embed_dim, aligned_embed_dim)

    with open(output_file, 'wb') as f:
        # Reserve space for the header; it is filled in once the checksum is known.
        f.write(b'\x00' * header_size)

        def write_tensor_aligned(tensor, name, padded_shape):
            """Zero-pad `tensor` to `padded_shape`, append it to the file and record it."""
            nonlocal current_offset

            # Convert to a float32 numpy array.
            if hasattr(tensor, 'detach'):
                data = tensor.detach().cpu().numpy().astype(np.float32)
            else:
                data = np.asarray(tensor, dtype=np.float32)

            padded_shape = tuple(padded_shape)
            if data.ndim != len(padded_shape) or any(
                have > room for have, room in zip(data.shape, padded_shape)
            ):
                raise ValueError(
                    f"{name}: shape {data.shape} does not fit padded shape {padded_shape}"
                )

            # Pad every axis (not just the tail of the flattened data) so that
            # rows keep the padded stride.
            aligned_buffer = np.zeros(padded_shape, dtype=np.float32)
            aligned_buffer[tuple(slice(0, extent) for extent in data.shape)] = data

            payload = aligned_buffer.tobytes()  # C (row-major) order
            bytes_written = f.write(payload)
            hasher.update(payload)

            expected_floats = aligned_buffer.size
            weight_map[name] = {
                'offset': current_offset,
                'actual_floats': data.size,
                'aligned_floats': expected_floats,
                'bytes': bytes_written,
                'shape': list(data.shape)
            }

            current_offset += bytes_written
            print(f"  {name}: {data.shape} -> {expected_floats} floats ({bytes_written} bytes)")

            return bytes_written

        # ============================================================================
        # WRITE WEIGHTS IN EXACT BUMP ORDER
        # This order must match layout_transformer() in C exactly!
        # ============================================================================

        print("\n📝 Writing embeddings...")

        # 1. Token embeddings [vocab_size × aligned_embed_dim]
        write_tensor_aligned(
            state_dict['wte.weight'],
            'token_embeddings',
            (vocab_size, aligned_embed_dim)
        )

        # 2. Position embeddings [context_len × aligned_embed_dim]
        write_tensor_aligned(
            state_dict['wpe.weight'],
            'position_embeddings',
            (context_len, aligned_embed_dim)
        )

        # 3. embedded_input_offset is a runtime (activation) buffer on the C
        #    side, so nothing is written for it.

        print("\n🔧 Writing layer weights...")

        # 4. Layer weights
        for layer_idx in range(n_layers):
            print(f"\n  Layer {layer_idx + 1}/{n_layers}:")
            prefix = f'h.{layer_idx}'
            tag = f'layer_{layer_idx}'

            # (layer_start_canary is a debug marker, not a weight)

            # LayerNorm 1
            write_tensor_aligned(state_dict[f'{prefix}.ln_1.weight'], f'{tag}_ln1_weight', vec_shape)
            write_tensor_aligned(state_dict[f'{prefix}.ln_1.bias'], f'{tag}_ln1_bias', vec_shape)

            # (ln1 mean/rstd, layer input and ln1 output are runtime buffers)

            # The checkpoint stores Q, K and V in one combined tensor.
            # After the transpose they are stacked along dim 0.
            qkv_weight = state_dict[f'{prefix}.attn.c_attn.weight'].T  # Transpose for C
            qkv_bias = state_dict[f'{prefix}.attn.c_attn.bias']

            # Written as q_weight, q_bias, k_weight, k_bias, v_weight, v_bias.
            for part_idx, part in enumerate('qkv'):
                rows = slice(part_idx * embed_dim, (part_idx + 1) * embed_dim)
                write_tensor_aligned(qkv_weight[rows, :], f'{tag}_{part}_weight', mat_shape)
                write_tensor_aligned(qkv_bias[rows], f'{tag}_{part}_bias', vec_shape)
                # (q/k/v output buffers are runtime buffers)

            # (attention scores are a runtime buffer)

            # Attention output projection
            write_tensor_aligned(
                state_dict[f'{prefix}.attn.c_proj.weight'].T,  # Transpose
                f'{tag}_proj_weight',
                mat_shape
            )
            write_tensor_aligned(state_dict[f'{prefix}.attn.c_proj.bias'], f'{tag}_proj_bias', vec_shape)

            # (attention output and residual1 output are runtime buffers)

            # LayerNorm 2
            write_tensor_aligned(state_dict[f'{prefix}.ln_2.weight'], f'{tag}_ln2_weight', vec_shape)
            write_tensor_aligned(state_dict[f'{prefix}.ln_2.bias'], f'{tag}_ln2_bias', vec_shape)

            # (ln2 mean/rstd/output are runtime buffers)

            # MLP, first linear: transposed weight is [4*embed_dim, embed_dim]
            write_tensor_aligned(
                state_dict[f'{prefix}.mlp.c_fc.weight'].T,  # Transpose
                f'{tag}_fc1_weight',
                (4 * aligned_embed_dim, aligned_embed_dim)
            )
            write_tensor_aligned(
                state_dict[f'{prefix}.mlp.c_fc.bias'],
                f'{tag}_fc1_bias',
                (4 * aligned_embed_dim,)
            )
            # (fc1 output is a runtime buffer)

            # MLP, second linear: transposed weight is [embed_dim, 4*embed_dim]
            write_tensor_aligned(
                state_dict[f'{prefix}.mlp.c_proj.weight'].T,  # Transpose
                f'{tag}_fc2_weight',
                (aligned_embed_dim, 4 * aligned_embed_dim)
            )
            write_tensor_aligned(state_dict[f'{prefix}.mlp.c_proj.bias'], f'{tag}_fc2_bias', vec_shape)

            # (mlp output, residual2 output and layer_end_canary are not weights)

        # 5. Final LayerNorm
        print("\n🏁 Writing final LayerNorm...")
        write_tensor_aligned(state_dict['ln_f.weight'], 'final_ln_weight', vec_shape)
        write_tensor_aligned(state_dict['ln_f.bias'], 'final_ln_bias', vec_shape)

        # (final LN mean/rstd, final output and logits are runtime buffers)

        # All weight bytes have been hashed; go back and fill in the header.
        checksum = hasher.digest()
        header = b''.join([
            b'BUMPWGT2',                      # magic, 8 bytes
            struct.pack('I', 2),              # version 2, 4 bytes
            struct.pack('I', 0),              # model_type: 0=GPT2, 4 bytes
            struct.pack('6I', n_layers, vocab_size, embed_dim,
                        context_len, n_heads, head_dim),           # 24 bytes
            struct.pack('3Q', aligned_embed_dim, aligned_head_dim,
                        aligned_context),                          # 24 bytes
            checksum,                         # SHA256, 32 bytes
            b'\x00' * 32,                     # reserved for future use
        ])
        if len(header) != header_size:
            raise RuntimeError(
                f"header is {len(header)} bytes, expected {header_size}"
            )
        f.seek(0)
        f.write(header)

    # Save weight map for debugging
    with open(output_file + '.map.json', 'w') as f:
        json.dump({
            'header': {
                'model': model_name,
                'layers': n_layers,
                'embed_dim': embed_dim,
                'aligned_embed_dim': aligned_embed_dim,
                'vocab_size': vocab_size,
                'context_len': context_len,
                'n_heads': n_heads,
                'head_dim': head_dim,
                'aligned_head_dim': aligned_head_dim
            },
            'weights': weight_map,
            'total_bytes': current_offset,
            'checksum': checksum.hex()
        }, f, indent=2)

    print(f"\n✅ Success!")
    print(f"  Output file: {output_file}")
    print(f"  Total size: {current_offset / (1024**3):.2f} GB")
    print(f"  Weight map: {output_file}.map.json")
    print(f"  Checksum: {checksum.hex()[:16]}...")

    return output_file

In [None]:
# Build the bump-format weight file for the base GPT-2 checkpoint
# (written to the function's default output path).
create_bump_weights(model_name='gpt2')

In [None]:
#if __name__ == '__main__':
#    import argparse
#    parser = argparse.ArgumentParser()
#    parser.add_argument('--model', default='gpt2',
#                       choices=['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'])
#    parser.add_argument('--output', default=None)
#    args = parser.parse_args()
    
#    output = args.output or f'{args.model}_bump.weights'
#    create_bump_weights(args.model, output)

In [None]:
def verify_bump_weights(weight_file='gpt2_bump.weights', map_file='gpt2_bump.weights.map.json'):
    """
    Verify the bump weight file structure and compare with the original model.

    Prints the contents of the JSON map and of the binary header, checks the
    magic string and the SHA256 checksum of the weight data, prints a few
    sample statistics, and finally compares the token embeddings stored in
    the file with those of the HuggingFace model named in the map (falling
    back to 'gpt2' when the map does not name one).

    Args:
        weight_file: Path of the binary weight file.
        map_file: Path of the JSON map written next to the weight file.

    Returns:
        The parsed JSON map (a dict).

    Raises:
        ValueError: If the file is too short to contain the header or the
            token embedding table.
    """
    header_size = 128  # fixed header length of the weight file format

    print("🔍 Verifying Bump Weight File\n")
    print("=" * 60)

    # Load the map file
    with open(map_file, 'r') as f:
        weight_map = json.load(f)

    print("📊 File Structure:")
    print(f"  Total size: {weight_map['total_bytes'] / (1024**3):.2f} GB")
    print(f"  Checksum: {weight_map['checksum'][:16]}...")
    print(f"  Model config:")
    for key, value in weight_map['header'].items():
        print(f"    {key}: {value}")

    # Read and verify header
    with open(weight_file, 'rb') as f:
        header = f.read(header_size)
        if len(header) < header_size:
            raise ValueError(
                f"{weight_file}: only {len(header)} bytes, shorter than the {header_size}-byte header"
            )

        # Header layout: magic[0:8], version/model_type[8:16],
        # six uint32 hyperparameters[16:40], three uint64 aligned dims[40:64],
        # SHA256[64:96], reserved[96:128].
        magic = header[:8]
        version, model_type = struct.unpack_from('2I', header, 8)
        (num_layers, vocab_size, embed_dim,
         context_len, num_heads, head_dim) = struct.unpack_from('6I', header, 16)
        stored_checksum = header[64:96]

        print(f"\n📦 Header Verification:")
        print(f"  Magic: {magic.decode('ascii', errors='ignore')}")
        print(f"  Version: {version}")
        print(f"  Model type: {model_type} (0=GPT2)")
        if magic != b'BUMPWGT2':
            print("  ⚠️  Unexpected magic (expected BUMPWGT2)")

        print(f"\n  Hyperparameters from header:")
        print(f"    Layers: {num_layers}")
        print(f"    Vocab: {vocab_size}")
        print(f"    Embed dim: {embed_dim}")
        print(f"    Context: {context_len}")
        print(f"    Heads: {num_heads}")
        print(f"    Head dim: {head_dim}")

        # Sample some weights. The file position is now at the first weight.
        print(f"\n🔬 Weight Samples:")
        chunk = np.frombuffer(f.read(4 * 10000), dtype=np.float32)
        print(f"  Token embeddings [0:10]: {chunk[:10]}")

        # Check value ranges
        print(f"  Value statistics (first 10k floats):")
        print(f"    Min: {chunk.min():.6f}")
        print(f"    Max: {chunk.max():.6f}")
        print(f"    Mean: {chunk.mean():.6f}")
        print(f"    Std: {chunk.std():.6f}")

        # Recompute the checksum over everything after the header, reading in
        # 1 MiB pieces so large files are not loaded into memory at once.
        f.seek(header_size)
        hasher = hashlib.sha256()
        for block in iter(lambda: f.read(1 << 20), b''):
            hasher.update(block)

    print(f"\n🔐 Checksum Verification:")
    if hasher.digest() != stored_checksum:
        print("  ⚠️  Weight data does not match the checksum stored in the header")
    elif hasher.hexdigest() != weight_map['checksum']:
        print("  ⚠️  Weight data does not match the checksum stored in the map file")
    else:
        print("  ✅ Checksum matches!")

    # Compare with original model
    print(f"\n🔄 Comparing with original GPT-2 model...")
    model_name = weight_map['header'].get('model', 'gpt2')
    model = GPT2Model.from_pretrained(model_name)

    # The embedding table is stored as vocab_size rows of aligned_embed_dim
    # floats; only the first embed_dim entries of each row carry data.
    aligned_embed_dim = weight_map['header']['aligned_embed_dim']
    vocab_size = weight_map['header']['vocab_size']
    table_floats = vocab_size * aligned_embed_dim

    with open(weight_file, 'rb') as f:
        f.seek(header_size)  # Skip header
        raw = np.frombuffer(f.read(4 * table_floats), dtype=np.float32)

    if raw.size != table_floats:
        raise ValueError(
            f"{weight_file}: token embedding table is truncated "
            f"({raw.size} of {table_floats} floats)"
        )

    # Take only actual dims, not padding
    file_token_emb = raw.reshape(vocab_size, aligned_embed_dim)[:, :embed_dim]
    model_token_emb = model.wte.weight.detach().cpu().numpy()

    diff = np.abs(file_token_emb - model_token_emb).max()
    print(f"  Max difference in token embeddings: {diff:.2e}")

    if diff < 1e-5:
        print("  ✅ Token embeddings match!")
    else:
        print("  ⚠️  Token embeddings have differences")

    print("\n✅ Verification complete!")
    return weight_map

def create_tokenizer_files(output_dir='.'):
    """
    Create tokenizer files for the C runtime.

    Writes two UTF-8 text files into `output_dir`:

    * ``gpt2_vocab.txt`` - one line per token id in the form
      ``<id>\\t<space flag>\\t<token>``, where the middle field is a single
      space when the token carried the 'Ġ' (leading space) prefix and empty
      otherwise. The prefix itself is stripped from the token.
    * ``gpt2_merges.txt`` - one BPE merge pair per line, in the iteration
      order of ``tokenizer.bpe_ranks``.

    Args:
        output_dir: Existing directory to write the files into.

    Returns:
        The loaded ``GPT2Tokenizer`` instance.
    """
    # Imported locally: this function runs before any cell that imports
    # GPT2Tokenizer at module level.
    from transformers import GPT2Tokenizer

    print("\n🔤 Creating Tokenizer Files for C Runtime\n")
    print("=" * 60)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Save vocabulary
    vocab_file = f'{output_dir}/gpt2_vocab.txt'
    with open(vocab_file, 'w', encoding='utf-8') as f:
        for token_id in range(tokenizer.vocab_size):
            token = tokenizer.convert_ids_to_tokens(token_id)
            if token.startswith('Ġ'):
                # 'Ġ' marks a leading space: drop it and flag the space in
                # the middle column instead.
                f.write(f"{token_id}\t \t{token[1:]}\n")
            else:
                f.write(f"{token_id}\t\t{token}\n")

    print(f"  Saved vocabulary to {vocab_file}")
    print(f"  Vocab size: {tokenizer.vocab_size}")

    # Save merges (BPE rules)
    merges_file = f'{output_dir}/gpt2_merges.txt'
    with open(merges_file, 'w', encoding='utf-8') as f:
        for first, second in tokenizer.bpe_ranks:
            f.write(f"{first} {second}\n")

    print(f"  Saved BPE merges to {merges_file}")

    return tokenizer

def tokenize_for_c(text, tokenizer=None, output_file='input_tokens.bin'):
    """
    Tokenize text and save it in the formats used by the C runtime.

    Two files are written:

    * `output_file` - binary: a native ``unsigned int`` token count followed
      by the token ids as native ``int`` values.
    * a C header next to it (same path with the extension replaced by ``.h``)
      defining ``NUM_TOKENS`` and an ``int32_t input_tokens[]`` array.

    Args:
        text: The string to tokenize.
        tokenizer: Object with ``encode(text)`` and ``decode(ids)`` methods.
            When None, the pretrained 'gpt2' ``GPT2Tokenizer`` is loaded.
        output_file: Path of the binary file to write.

    Returns:
        The list of token ids.
    """
    import os

    if tokenizer is None:
        # Imported locally: this function can run before any cell that
        # imports GPT2Tokenizer at module level.
        from transformers import GPT2Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    print(f"\n📝 Tokenizing: '{text}'")

    # Tokenize
    tokens = tokenizer.encode(text)

    print(f"  Token IDs: {tokens}")
    print(f"  Tokens: {[tokenizer.decode([t]) for t in tokens]}")
    print(f"  Length: {len(tokens)} tokens")

    # Save as binary file for C: count first, then all ids in one pack call.
    with open(output_file, 'wb') as f:
        f.write(struct.pack('I', len(tokens)))
        f.write(struct.pack(f'{len(tokens)}i', *tokens))

    print(f"  Saved to {output_file}")

    # Also create a C header file. Only the extension is swapped, so a path
    # without '.bin' can never resolve to the binary file itself.
    header_file = os.path.splitext(output_file)[0] + '.h'
    if header_file == output_file:
        header_file = output_file + '.h'

    with open(header_file, 'w', encoding='utf-8') as f:
        # repr() keeps the text on one line and never ends in a backslash,
        # either of which would break a `//` comment.
        f.write(f"// Auto-generated tokenization for: {text!r}\n")
        f.write("#include <stdint.h>\n")  # provides int32_t
        f.write(f"#define NUM_TOKENS {len(tokens)}\n")
        f.write(f"int32_t input_tokens[NUM_TOKENS] = {{\n    ")
        f.write(", ".join(str(t) for t in tokens))
        f.write("\n};\n")

    print(f"  C header saved to {header_file}")

    return tokens

def visualize_weight_layout(map_file='gpt2_bump.weights.map.json'):
    """
    Print a text visualization of the weight memory layout.

    Reads the JSON map, groups its ``'weights'`` entries into embeddings,
    per-layer weights (names starting with ``layer_<n>_``) and final weights
    (names containing ``final``), and prints offsets, sizes and a bar chart
    of the share each group takes.

    The number of layers is taken from the map itself (the larger of
    ``header['layers']`` and the highest layer index seen), so maps of any
    model size are handled.

    Args:
        map_file: Path of the JSON map file.

    Returns:
        None.
    """
    with open(map_file, 'r') as f:
        weight_map = json.load(f)

    print("\n📊 Weight Memory Layout Visualization\n")
    print("=" * 60)

    weights = weight_map['weights']

    # Group by layer
    embeddings = {}
    layers = {}
    final = {}

    for name, info in weights.items():
        if name.startswith('layer_'):
            layer_num = int(name.split('_')[1])
            layers.setdefault(layer_num, {})[name] = info
        elif 'final' in name:
            final[name] = info
        else:
            embeddings[name] = info

    declared_layers = int(weight_map.get('header', {}).get('layers', 0))
    n_layers = max(declared_layers, max(layers, default=-1) + 1)

    def format_size(num_bytes):
        """Render a byte count with a B/KB/MB/GB suffix."""
        if num_bytes < 1024:
            return f"{num_bytes}B"
        elif num_bytes < 1024**2:
            return f"{num_bytes/1024:.1f}KB"
        elif num_bytes < 1024**3:
            return f"{num_bytes/1024**2:.1f}MB"
        else:
            return f"{num_bytes/1024**3:.2f}GB"

    def group_bytes(group):
        """Total size in bytes of all entries in one group."""
        return sum(w['bytes'] for w in group.values())

    # Print layout
    print("EMBEDDINGS:")
    for name, info in embeddings.items():
        print(f"  {name:30} @ 0x{info['offset']:08x} [{format_size(info['bytes']):>8}]")

    print("\nLAYERS:")
    for layer_num in range(n_layers):
        layer_entries = layers.get(layer_num, {})
        print(f"  Layer {layer_num:2d}: {format_size(group_bytes(layer_entries)):>8}")
        if layer_num == 0:  # Show details for first layer
            for name, info in sorted(layer_entries.items(), key=lambda x: x[1]['offset']):
                short_name = name.replace(f'layer_{layer_num}_', '')
                print(f"    {short_name:20} [{format_size(info['bytes']):>8}]")

    print("\nFINAL:")
    for name, info in final.items():
        print(f"  {name:30} @ 0x{info['offset']:08x} [{format_size(info['bytes']):>8}]")

    # Memory map bar chart
    print("\n📊 Memory Usage by Component:")

    emb_size = group_bytes(embeddings)
    layer_size = sum(group_bytes(entries) for entries in layers.values())
    final_size = group_bytes(final)

    total = emb_size + layer_size + final_size

    def share(size):
        """Fraction of the total taken by `size`; 0 for an empty map."""
        return size / total if total else 0.0

    def bar(size, width=40):
        n = int(share(size) * width)
        return '█' * n + '░' * (width - n)

    print(f"  Embeddings: {bar(emb_size)} {format_size(emb_size):>8} ({share(emb_size)*100:.1f}%)")
    print(f"  Layers:     {bar(layer_size)} {format_size(layer_size):>8} ({share(layer_size)*100:.1f}%)")
    print(f"  Final:      {bar(final_size)} {format_size(final_size):>8} ({share(final_size)*100:.1f}%)")

def test_simple_inference():
    """
    Create simple test cases for the C runtime.

    Tokenizes a fixed list of sample strings with the pretrained 'gpt2'
    tokenizer and hands each one to tokenize_for_c(), which writes
    ``test_<i>.bin`` (and its companion header) into the current directory.

    Returns:
        None.
    """
    # Imported locally: this function can run before any cell that imports
    # GPT2Tokenizer at module level.
    from transformers import GPT2Tokenizer

    print("\n🧪 Creating Test Case for C Runtime\n")
    print("=" * 60)

    # Test texts
    test_texts = [
        "Hello world",
        "The quick brown fox",
        "Once upon a time",
        "def hello():",
        "import numpy as np"
    ]

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    for i, text in enumerate(test_texts):
        print(f"\nTest {i+1}: '{text}'")
        tokens = tokenize_for_c(text, tokenizer, f'test_{i}.bin')

        # Also show what the expected output shape would be
        # (768 and 50257 are hard-coded here for the base 'gpt2' model).
        print(f"  Expected output shape: [{len(tokens)} x 768] -> [{len(tokens)} x 50257] logits")

if __name__ == '__main__':
    # Step 1: check the weight file that was just written and show its layout.
    weight_map = verify_bump_weights()
    visualize_weight_layout()

    # Step 2: export the tokenizer tables, keeping the tokenizer for reuse.
    tokenizer = create_tokenizer_files()

    # Step 3: generate the canned test inputs.
    test_simple_inference()

    # Step 4: tokenize one custom prompt as a usage example.
    print(
        "\n" + "=" * 60,
        "💡 Example: Tokenize your own text",
        "=" * 60,
        sep="\n",
    )

    custom_text = "The artificial intelligence model"
    tokens = tokenize_for_c(
        custom_text,
        tokenizer=tokenizer,
        output_file='custom_input.bin',
    )

    # Closing summary with the hand-off instructions for the C side.
    print(
        "\n✅ All verification and preprocessing complete!",
        "\nTo use in C:",
        "  1. Load weights: gpt2_bump.weights",
        "  2. Load tokens: custom_input.bin",
        "  3. Run inference!",
        sep="\n",
    )

In [None]:
## Let's decode

In [None]:
# Turn a list of token ids back into text with the HuggingFace tokenizer.
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Token ids to decode.
tokens = [
    1, 2, 3,
    42225, 19820, 39356, 40127, 45816,
    9928, 16847, 38608, 13960, 27840,
]

text = tokenizer.decode(tokens)
print(text)