# Data Preprocessing Experiments - Messy Version

Trying different ways to process text data for GPT training.
This is all experimental code, lots of dead ends and quick hacks!

In [None]:
# Messy imports again
import json
import os
import pickle
import random
import re
from collections import Counter, defaultdict
from datetime import datetime
from typing import List, Dict, Tuple

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
from tqdm import tqdm

# Project paths. The default keeps the original hardcoded layout; set the
# GPT_PROJECT_ROOT environment variable to run from a different checkout.
PROJECT_ROOT = os.environ.get('GPT_PROJECT_ROOT', '/home/akshat/GPT_from_scratch')
TEXT_DIR = f'{PROJECT_ROOT}/text_data'
print(f"Working from {PROJECT_ROOT}")

# Check what text files we have.
# os.listdir returns entries in arbitrary order, so sort them: a later cell
# falls back to "the first available text", which must not vary between runs.
text_files = sorted(f for f in os.listdir(TEXT_DIR) if f.endswith('.txt'))
print(f"Available text files: {text_files}")

## Loading and Basic Text Analysis

Let me load different text files and see what we're working with...

In [None]:
# Read every discovered .txt file into memory and record basic size statistics.
# `texts` maps filename -> full file content; `stats` maps filename -> counts.
texts = {}
stats = {}

for filename in text_files:
    try:
        with open(f'{TEXT_DIR}/{filename}', 'r', encoding='utf-8') as f:
            content = f.read()
        texts[filename] = content

        # Quick stats (words are whitespace-separated; unique words are case-folded)
        stats[filename] = dict(
            chars=len(content),
            words=len(content.split()),
            lines=content.count('\n') + 1,  # same value as len(content.split('\n'))
            unique_chars=len(set(content)),
            unique_words=len(set(content.lower().split())),
        )

        print(f"✓ Loaded {filename}: {stats[filename]['chars']:,} chars, {stats[filename]['words']:,} words")
        print(f"  Preview: {content[:100]}...")

    except Exception as e:
        # Best-effort loading: report the failure and continue with the next file.
        print(f"❌ Failed to load {filename}: {e}")

print(f"\nLoaded {len(texts)} text files")

In [None]:
# Choose the corpus used by the rest of the notebook.
# Using Alice as base since it's clean
if not texts:
    # Fail with an actionable message instead of an IndexError from the fallback below.
    raise RuntimeError("No text files were loaded; cannot choose a main text")

if 'alice_story.txt' in texts:
    main_text = texts['alice_story.txt']
elif 'alice_extended.txt' in texts:
    main_text = texts['alice_extended.txt']
else:
    # Just use the first available text
    main_text = next(iter(texts.values()))

print(f"Using main text with {len(main_text):,} characters")
print(f"Sample: {main_text[:200]}")

# Character analysis: frequency of every character in the chosen text
char_counts = Counter(main_text)
print(f"\nTop 20 characters:")
for char, count in char_counts.most_common(20):
    # repr() already supplies quotes for control characters; quote the rest by
    # hand so each character is wrapped exactly once (previously: ''\n'').
    char_display = repr(char) if char in '\n\t\r' else f"'{char}'"
    print(f"  {char_display}: {count:,} ({count/len(main_text)*100:.2f}%)")

## Tokenizer Experiments

Trying different tokenization strategies...

In [None]:
# Character tokenizer with special tokens
def build_char_tokenizer_v1(text):
    """Build a character-level vocabulary from ``text``.

    The vocabulary is the four special tokens ``<PAD>``, ``<UNK>``,
    ``<START>``, ``<END>`` (ids 0-3) followed by every distinct character of
    ``text`` in sorted order.

    Returns:
        A tuple ``(char_to_id, id_to_char, vocab)``: token -> id dict,
        id -> token dict, and the ordered vocabulary list.
    """
    reserved = ['<PAD>', '<UNK>', '<START>', '<END>']
    vocab = reserved + sorted(set(text))

    # Fill both lookup directions in a single pass over the vocabulary.
    char_to_id = {}
    id_to_char = {}
    for index, symbol in enumerate(vocab):
        char_to_id[symbol] = index
        id_to_char[index] = symbol

    return char_to_id, id_to_char, vocab

# Build the character tokenizer from the full main text.
char_to_id_v1, id_to_char_v1, vocab_v1 = build_char_tokenizer_v1(main_text)
print(f"Character tokenizer v1: {len(vocab_v1)} tokens")
print(f"Vocab: {vocab_v1[:20]}...")

# Round-trip check: characters missing from the vocabulary encode to <UNK>,
# so the decoded string shows '<UNK>' wherever the corpus lacks a character.
test_text = "Hello Alice!"
encoded = []
for ch in test_text:
    encoded.append(char_to_id_v1.get(ch, char_to_id_v1['<UNK>']))
decoded = ''.join(map(id_to_char_v1.__getitem__, encoded))
print(f"\nTest: '{test_text}' -> {encoded} -> '{decoded}'")

In [None]:
# Word tokenizer with frequency filtering
def build_word_tokenizer_v2(text, min_freq=2, max_vocab=5000):
    """Build a lowercase, whitespace-split word vocabulary with frequency filtering.

    Characters other than ASCII letters, digits, whitespace and ``.,!?;:`` are
    replaced by spaces before splitting, so the kept punctuation stays attached
    to its word (``"cat."`` and ``"cat"`` are different entries).

    Args:
        text: Raw text to build the vocabulary from.
        min_freq: Minimum number of occurrences for a word to be kept.
        max_vocab: Upper bound on the vocabulary size, including the four
            special tokens. When more words qualify than fit, the most
            frequent ones are kept (ties broken alphabetically).

    Returns:
        A tuple ``(word_to_id, id_to_word, vocab)`` where ``vocab`` is the four
        special tokens followed by the kept words in alphabetical order.
    """
    special_tokens = ['<PAD>', '<UNK>', '<START>', '<END>']

    # Basic cleaning
    text = re.sub(r'[^a-zA-Z0-9\s.,!?;:]', ' ', text)
    words = text.lower().split()

    # Count frequencies
    word_counts = Counter(words)
    print(f"Total unique words: {len(word_counts)}")

    # Filter by frequency, then rank by descending count so that the size cap
    # discards the rarest words. (Truncating the alphabetically sorted list
    # would drop frequent words that merely sort late.)
    ranked = sorted(
        (item for item in word_counts.items() if item[1] >= min_freq),
        key=lambda item: (-item[1], item[0]),
    )
    # Leave room for special tokens; never let the bound go negative, which
    # would silently slice words off the end instead of capping the size.
    capacity = max(max_vocab - len(special_tokens), 0)
    frequent_words = sorted(word for word, _ in ranked[:capacity])

    print(f"Words with freq >= {min_freq}: {len(frequent_words)}")

    # Build vocabulary
    vocab = special_tokens + frequent_words

    word_to_id = {word: i for i, word in enumerate(vocab)}
    id_to_word = {i: word for i, word in enumerate(vocab)}

    return word_to_id, id_to_word, vocab

# Build the word tokenizer from the full main text.
word_to_id_v2, id_to_word_v2, vocab_v2 = build_word_tokenizer_v2(main_text, min_freq=2)
print(f"\nWord tokenizer v2: {len(vocab_v2)} tokens")
print(f"Sample vocab: {vocab_v2[4:24]}...")  # Skip special tokens

# Round-trip check on a whitespace-split sentence; out-of-vocabulary words
# (including ones with attached punctuation such as "today?") map to <UNK>.
test_words = "hello alice how are you today?".split()
encoded_words = []
for word in test_words:
    encoded_words.append(word_to_id_v2.get(word, word_to_id_v2['<UNK>']))
decoded_words = list(map(id_to_word_v2.__getitem__, encoded_words))
print(f"\nWord test: {test_words}")
print(f"Encoded: {encoded_words}")
print(f"Decoded: {decoded_words}")

In [None]:
# Subword tokenizer experiment (BPE-like)
def simple_bpe_tokenizer(text, vocab_size=1000, iterations=500):
    """Learn a toy byte-pair-style vocabulary by repeatedly merging adjacent tokens.

    Starts from ``<PAD>``, ``<UNK>`` and the sorted distinct characters of
    ``text``. Each round merges the most frequent adjacent token pair
    (first-seen pair wins ties) into a new token appended to the vocabulary.
    Merging stops after ``min(iterations, vocab_size - initial vocab size)``
    rounds, or earlier once no pair occurs at least twice.

    Returns:
        A tuple ``(token_to_id, id_to_token, vocab)``.
    """
    print(f"Building BPE tokenizer with {vocab_size} vocab size...")

    # Seed the vocabulary with the special tokens plus every character seen.
    vocab = ['<PAD>', '<UNK>']
    vocab.extend(sorted(set(text)))

    # The working sequence starts as one token per character.
    tokens = list(text)

    print(f"Starting with {len(vocab)} character tokens")

    merge_budget = min(iterations, vocab_size - len(vocab))
    for iteration in range(merge_budget):
        if iteration % 100 == 0:
            print(f"  BPE iteration {iteration}, vocab size: {len(vocab)}")

        # Frequency of every adjacent pair; insertion order follows first occurrence.
        pair_freq = Counter(zip(tokens, tokens[1:]))
        if not pair_freq:
            break

        # max() returns the first maximal entry, i.e. the earliest-seen pair on ties.
        best_pair, best_count = max(pair_freq.items(), key=lambda item: item[1])
        if best_count < 2:  # nothing left that repeats
            break

        left, right = best_pair
        merged = left + right
        vocab.append(merged)

        # Greedy left-to-right rewrite: replace each non-overlapping occurrence.
        rewritten = []
        pos = 0
        last = len(tokens) - 1
        while pos <= last:
            if pos < last and tokens[pos] == left and tokens[pos + 1] == right:
                rewritten.append(merged)
                pos += 2
            else:
                rewritten.append(tokens[pos])
                pos += 1
        tokens = rewritten

    print(f"Final vocab size: {len(vocab)}")

    # Build both lookup directions from the final vocabulary order.
    token_to_id = {}
    id_to_token = {}
    for idx, tok in enumerate(vocab):
        token_to_id[tok] = idx
        id_to_token[idx] = tok

    return token_to_id, id_to_token, vocab

# Test BPE on a smaller text sample
# NOTE: a later cell rebinds `sample_text` to a 1000-character slice, so this
# 2000-character value only holds until that cell runs.
sample_text = main_text[:2000]  # Use first 2000 chars for speed
# With vocab_size=200 the number of merges is capped at 200 minus the initial
# character vocabulary size, and never exceeds iterations=100.
bpe_to_id, id_to_bpe, bpe_vocab = simple_bpe_tokenizer(sample_text, vocab_size=200, iterations=100)

# Merged tokens are appended in the order they are learned, so the tail of the
# vocabulary holds the most recent merges.
print(f"\nBPE vocab sample: {bpe_vocab[-10:]}")
# Multi-character entries also include the special tokens '<PAD>' and '<UNK>'.
print(f"Some learned subwords: {[tok for tok in bpe_vocab if len(tok) > 1][:10]}")

## TFRecord Creation Experiments

Trying different ways to create training data...

In [None]:
# Quick TFRecord creation function
def create_tfrecord_v1(text, tokenizer_dict, seq_len=64, output_file='test.tfrecord'):
    """Tokenize ``text`` and write overlapping next-token windows to a TFRecord file.

    Args:
        text: Raw text to encode.
        tokenizer_dict: Mapping from token string to integer id. If every
            entry other than ``<...>``-style special tokens is a single
            character, ``text`` is encoded character by character; otherwise
            it is lowercased and split on whitespace. Tokens missing from the
            mapping get the ``'<UNK>'`` id, or 0 if there is no such entry.
        seq_len: Length of each input/target window.
        output_file: Path of the TFRecord file to write.

    Returns:
        The number of examples written. Each example holds ``input_ids``
        (``seq_len`` ids), ``target_ids`` (the same window shifted right by
        one token) and ``length`` (``[seq_len]``), all as int64 lists.
    """
    print(f"Creating TFRecord with seq_len={seq_len}")

    def _is_special(key):
        # '<PAD>', '<UNK>', ... but not the literal one-character '<' or '>'.
        return len(key) > 2 and key.startswith('<') and key.endswith('>')

    # Decide the tokenization mode from the whole vocabulary. Looking only at
    # the first key is wrong: special tokens such as '<PAD>' come first, which
    # made every character vocabulary look like a word vocabulary.
    regular_keys = [k for k in tokenizer_dict if isinstance(k, str) and not _is_special(k)]
    is_char_level = bool(regular_keys) and all(len(k) == 1 for k in regular_keys)

    unk_id = tokenizer_dict.get('<UNK>', 0)
    if is_char_level:
        # Character tokenizer
        units = text
    else:
        # Word tokenizer - need to split first.
        # NOTE(review): no punctuation cleanup happens here, so words carrying
        # characters that the vocabulary builder strips will map to <UNK>.
        units = text.lower().split()
    tokens = [tokenizer_dict.get(unit, unk_id) for unit in units]

    print(f"Tokenized {len(text)} chars into {len(tokens)} tokens")

    # Overlapping windows advancing by half a sequence. The stride is clamped
    # to 1 so that seq_len=1 does not hand range() a zero step.
    stride = max(1, seq_len // 2)
    examples = []
    # Stopping before len(tokens) - seq_len guarantees a full shifted target window.
    for start in range(0, len(tokens) - seq_len, stride):
        input_seq = tokens[start:start + seq_len]
        target_seq = tokens[start + 1:start + seq_len + 1]
        examples.append((input_seq, target_seq))

    print(f"Created {len(examples)} training examples")

    # Write TFRecord
    with tf.io.TFRecordWriter(output_file) as writer:
        for inputs, targets in tqdm(examples, desc="Writing TFRecord"):
            feature = {
                'input_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=inputs)),
                'target_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=targets)),
                'length': tf.train.Feature(int64_list=tf.train.Int64List(value=[seq_len]))
            }

            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())

    print(f"✓ Saved {len(examples)} examples to {output_file}")
    return len(examples)

# Output locations for the two test TFRecord files.
output_dir = f"{PROJECT_ROOT}/notebooks/tfrecords_test"
char_file = f"{output_dir}/char_test.tfrecord"
word_file = f"{output_dir}/word_test.tfrecord"
os.makedirs(output_dir, exist_ok=True)

# Both runs encode the same 5000-character prefix of the main text.
# Test with character tokenizer
char_examples = create_tfrecord_v1(
    text=main_text[:5000],
    tokenizer_dict=char_to_id_v1,
    seq_len=32,
    output_file=char_file,
)

# Test with word tokenizer
word_examples = create_tfrecord_v1(
    text=main_text[:5000],
    tokenizer_dict=word_to_id_v2,
    seq_len=16,
    output_file=word_file,
)

print(f"\nCreated TFRecord files:")
print(f"  Character: {char_examples} examples")
print(f"  Word: {word_examples} examples")

In [None]:
# Test reading the TFRecord files
def read_tfrecord_test(filename, num_examples=3):
    """Print the first few examples stored in a TFRecord file.

    Expects each record to carry int64-list features ``input_ids`` and
    ``target_ids`` plus a one-element int64 ``length`` feature.

    Args:
        filename: Path of the TFRecord file to read.
        num_examples: How many leading examples to print.
    """
    print(f"\nReading {filename}...")

    # The id features are parsed as variable-length int64 lists because the
    # window length is not known here; `length` is a single int64 value.
    feature_spec = {
        'input_ids': tf.io.VarLenFeature(tf.int64),
        'target_ids': tf.io.VarLenFeature(tf.int64),
        'length': tf.io.FixedLenFeature([1], tf.int64)
    }

    def _parse_function(proto):
        return tf.io.parse_single_example(proto, feature_spec)

    dataset = tf.data.TFRecordDataset(filename)
    dataset = dataset.map(_parse_function)

    # Show first few examples
    for i, example in enumerate(dataset.take(num_examples)):
        # VarLenFeature yields sparse tensors; densify before converting to numpy.
        inputs = tf.sparse.to_dense(example['input_ids']).numpy()
        targets = tf.sparse.to_dense(example['target_ids']).numpy()
        length = example['length'].numpy()[0]

        print(f"  Example {i+1}: length={length}")
        print(f"    Inputs:  {inputs[:10]}...")
        print(f"    Targets: {targets[:10]}...")

# Test reading
# Print the first examples (default: 3) from each file written by the cell
# above, as a quick check that the records parse with the expected features.
read_tfrecord_test(char_file)
read_tfrecord_test(word_file)

## Data Analysis and Visualization

Let me analyze the different tokenization approaches...

In [None]:
# Compare tokenization efficiency on a fixed 1000-character sample.
sample_text = main_text[:1000]
print(f"Analyzing sample text ({len(sample_text)} chars)...")

# Encode the sample with each tokenizer first, then assemble the metrics.
# Character tokenization
char_tokens = list(map(lambda ch: char_to_id_v1.get(ch, char_to_id_v1['<UNK>']), sample_text))

# Word tokenization (lowercased, whitespace-split)
sample_words = sample_text.lower().split()
word_tokens = list(map(lambda word: word_to_id_v2.get(word, word_to_id_v2['<UNK>']), sample_words))

# BPE tokenization (approximate): no BPE encoder exists in this notebook, so
# the token count is a rough guess of three characters per token.
bpe_approx_tokens = len(sample_text) // 3  # Rough estimate

# Metrics per method; insertion order is the display and plotting order.
tokenization_results = {
    'Character': {
        'tokens': len(char_tokens),
        'vocab_size': len(vocab_v1),
        'compression_ratio': len(sample_text) / len(char_tokens),
        'avg_token_length': 1.0
    },
    'Word': {
        'tokens': len(word_tokens),
        'vocab_size': len(vocab_v2),
        'compression_ratio': len(sample_text) / len(word_tokens),
        'avg_token_length': len(sample_text) / len(word_tokens)
    },
    'BPE (approx)': {
        'tokens': bpe_approx_tokens,
        'vocab_size': len(bpe_vocab),
        'compression_ratio': len(sample_text) / bpe_approx_tokens,
        'avg_token_length': 3.0  # Rough estimate
    },
}

print("\nTokenization Comparison:")
print(f"{'Method':<15} {'Tokens':<8} {'Vocab':<8} {'Compression':<12} {'Avg Len':<8}")
print("-" * 55)
for method, results in tokenization_results.items():
    print(f"{method:<15} {results['tokens']:<8} {results['vocab_size']:<8} {results['compression_ratio']:<12.2f} {results['avg_token_length']:<8.2f}")

In [None]:
# Visualize tokenization statistics
# One 2x2 figure: token counts, vocabulary sizes, top character frequencies,
# and compression ratios. Reads `tokenization_results` and `char_counts`
# produced by earlier cells.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Token count comparison
# `methods` fixes the bar order for every per-method panel below.
methods = list(tokenization_results.keys())
token_counts = [tokenization_results[m]['tokens'] for m in methods]
vocab_sizes = [tokenization_results[m]['vocab_size'] for m in methods]

# The "1000 chars" in the title is hardcoded; it matches the 1000-character
# sample slice taken in the comparison cell.
axes[0, 0].bar(methods, token_counts, alpha=0.7, color='skyblue')
axes[0, 0].set_title('Number of Tokens (1000 chars)')
axes[0, 0].set_ylabel('Token Count')
axes[0, 0].tick_params(axis='x', rotation=45)

# Vocabulary size comparison
axes[0, 1].bar(methods, vocab_sizes, alpha=0.7, color='lightcoral')
axes[0, 1].set_title('Vocabulary Size')
axes[0, 1].set_ylabel('Vocab Size')
axes[0, 1].tick_params(axis='x', rotation=45)

# Character frequency in original text
# Unlike the other panels, this one is computed over the whole main text
# (char_counts), not the 1000-character sample.
char_freqs = char_counts.most_common(20)
# Unzip (char, count) pairs into parallel tuples; this unpacking fails if
# char_counts is empty.
chars, freqs = zip(*char_freqs)
# Show whitespace characters via repr() so they are visible as tick labels.
char_labels = [repr(c) if c in '\n\t\r ' else c for c in chars]

axes[1, 0].bar(range(len(chars)), freqs, alpha=0.7, color='lightgreen')
axes[1, 0].set_title('Top 20 Character Frequencies')
axes[1, 0].set_xlabel('Character')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_xticks(range(len(chars)))
axes[1, 0].set_xticklabels(char_labels, rotation=45)

# Compression ratio comparison
compression_ratios = [tokenization_results[m]['compression_ratio'] for m in methods]
axes[1, 1].bar(methods, compression_ratios, alpha=0.7, color='gold')
axes[1, 1].set_title('Compression Ratio (chars/token)')
axes[1, 1].set_ylabel('Ratio')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Print summary
# Vocabulary sizes and example counts come from the tokenizer and TFRecord
# cells earlier in the notebook.
print("\n📊 Data Processing Analysis Summary:")
print(f"- Original text: {len(main_text):,} characters")
print(f"- Character vocab: {len(vocab_v1)} tokens")
print(f"- Word vocab: {len(vocab_v2)} tokens") 
print(f"- BPE vocab: {len(bpe_vocab)} tokens")
print(f"- Character examples created: {char_examples}")
print(f"- Word examples created: {word_examples}")

In [None]:
# Save tokenizers for later use (quick and dirty)
tokenizer_dir = f"{PROJECT_ROOT}/notebooks/tokenizer_experiments"
os.makedirs(tokenizer_dir, exist_ok=True)

# One JSON file per tokenizer.
# NOTE: JSON object keys are always strings, so the integer keys of the id_to_*
# maps are stored as "0", "1", ... and need int() conversion after loading.
tokenizer_payloads = {
    # Character tokenizer
    'char_tokenizer_v1.json': {
        'char_to_id': char_to_id_v1,
        'id_to_char': id_to_char_v1,
        'vocab': vocab_v1,
        'type': 'character'
    },
    # Word tokenizer
    'word_tokenizer_v2.json': {
        'word_to_id': word_to_id_v2,
        'id_to_word': id_to_word_v2,
        'vocab': vocab_v2,
        'type': 'word'
    },
    # BPE tokenizer
    'bpe_tokenizer_simple.json': {
        'token_to_id': bpe_to_id,
        'id_to_token': id_to_bpe,
        'vocab': bpe_vocab,
        'type': 'bpe'
    },
}
for payload_name, payload in tokenizer_payloads.items():
    with open(f"{tokenizer_dir}/{payload_name}", 'w') as f:
        json.dump(payload, f, indent=2)

print(f"✓ Saved tokenizers to {tokenizer_dir}")

# Quick experiment notes
# `datetime` here is the class (from datetime import datetime); referencing it
# without that import raised NameError and aborted this cell.
experiment_notes = f"""
Data Preprocessing Experiments - {datetime.now()}
============================================

Text Sources:
{', '.join(texts.keys())}

Tokenization Results:
- Character: {len(vocab_v1)} vocab, {char_examples} examples
- Word: {len(vocab_v2)} vocab, {word_examples} examples  
- BPE: {len(bpe_vocab)} vocab (experimental)

Best Approach:
Character tokenization seems most stable for this dataset size.
Word tokenization has large vocab but good compression.
BPE needs more work but shows promise.

Next Steps:
- Implement proper BPE
- Try SentencePiece
- Compare model performance
- Scale up to larger datasets
"""

# Explicit UTF-8: the notes embed source filenames, which may not be
# representable in the platform's default encoding.
with open(f"{tokenizer_dir}/experiment_notes.txt", 'w', encoding='utf-8') as f:
    f.write(experiment_notes)

print("\n🎉 Data preprocessing experiments complete!")
print("Ready to clean up into proper modules...")