In [None]:
# Import required libraries
import json
import os
import re
import sys
from collections import Counter

sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from transformers import GPT2Tokenizer

# Notebook-wide plotting defaults: base style sheet, colour palette, then
# the default figure size and font size applied in one rcParams update.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

## 1. Dataset Loading and Basic Statistics

In [None]:
# Load datasets
# Both corpora are fetched with datasets.load_dataset and share the same
# on-disk cache directory (../data/processed).
print("Loading WikiText-103...")
wikitext = load_dataset(
    path="wikitext",
    name="wikitext-103-v1",
    cache_dir="../data/processed",
)

print("Loading TinyStories...")
tinystories = load_dataset(
    path="roneneldan/TinyStories",
    cache_dir="../data/processed",
)

print("Datasets loaded successfully!")

In [None]:
# Basic dataset statistics
def get_dataset_stats(dataset, name):
    """Compute per-split size statistics over the non-blank ``text`` entries.

    Args:
        dataset: Mapping of split name -> object whose ``['text']`` item is an
            iterable of strings.
        name: Display name of the dataset. Kept for call-site compatibility;
            it does not influence the computation.

    Returns:
        dict: ``{split: {...}}`` where each inner dict has the keys
        ``num_examples``, ``total_chars``, ``avg_chars_per_example``,
        ``median_chars_per_example``, ``total_words`` and
        ``avg_words_per_example``. Texts that are empty or whitespace-only are
        ignored. For a split with no remaining texts the counts are 0 and the
        averages/median are ``nan``.
    """
    stats = {}

    for split in dataset.keys():
        # Measure each text exactly once; the character and word counts are
        # reused for every statistic below instead of being recomputed.
        char_counts = []
        word_counts = []
        for text in dataset[split]['text']:
            if not text.strip():
                continue  # skip empty / whitespace-only rows
            char_counts.append(len(text))
            word_counts.append(len(text.split()))

        if char_counts:
            avg_chars = float(np.mean(char_counts))
            median_chars = float(np.median(char_counts))
            avg_words = float(np.mean(word_counts))
        else:
            # Same values numpy would produce for an empty list, but without
            # triggering its "Mean of empty slice" RuntimeWarning.
            avg_chars = median_chars = avg_words = float('nan')

        stats[split] = {
            'num_examples': len(char_counts),
            'total_chars': sum(char_counts),
            'avg_chars_per_example': avg_chars,
            'median_chars_per_example': median_chars,
            'total_words': sum(word_counts),
            'avg_words_per_example': avg_words
        }

    return stats

wikitext_stats = get_dataset_stats(wikitext, "WikiText-103")
tinystories_stats = get_dataset_stats(tinystories, "TinyStories")

# Both datasets are reported with the same layout, so drive a single loop
# from (banner, per-split stats) pairs instead of repeating the print block.
for banner, per_split in (
    ("=== WikiText-103 Statistics ===", wikitext_stats),
    ("\n=== TinyStories Statistics ===", tinystories_stats),
):
    print(banner)
    for split, stats in per_split.items():
        summary_lines = [
            f"\n{split.upper()}:",
            f"  Examples: {stats['num_examples']:,}",
            f"  Total characters: {stats['total_chars']:,}",
            f"  Avg chars/example: {stats['avg_chars_per_example']:.1f}",
            f"  Total words: {stats['total_words']:,}",
            f"  Avg words/example: {stats['avg_words_per_example']:.1f}",
        ]
        print("\n".join(summary_lines))

## 2. Sample Text Analysis

In [None]:
# Show sample texts
# Slice the rows first and only then select the 'text' column: indexing the
# column first (train['text'][:1000]) can read every row of the split just to
# keep the first 1000. The resulting samples are the same.
wikitext_samples = [text for text in wikitext['train'][:1000]['text'] if text.strip()]
tinystories_samples = [text for text in tinystories['train'][:1000]['text'] if text.strip()]

for heading, samples in (
    ("=== WikiText-103 Samples ===", wikitext_samples),
    ("\n\n=== TinyStories Samples ===", tinystories_samples),
):
    print(heading)
    for i, sample in enumerate(samples[:3]):
        print(f"\nSample {i+1} ({len(sample)} chars, {len(sample.split())} words):")
        # Truncate long samples to 500 characters and mark the cut with "..."
        preview = (sample[:500] + "...") if len(sample) > 500 else sample
        print(preview)
        print("-" * 50)

## 3. Text Length Distributions

In [None]:
# Analyze text length distributions
def _summarize_lengths(lengths):
    """Summarise a non-empty sequence of lengths as a dict of built-in numbers."""
    values = np.asarray(lengths)
    q25, q75, q95, q99 = np.percentile(values, [25, 75, 95, 99])
    # Cast to int/float: NumPy integer scalars (what np.min/np.max return for
    # int input) are rejected by the json module.
    return {
        'min': int(values.min()),
        'max': int(values.max()),
        'mean': float(values.mean()),
        'median': float(np.median(values)),
        'std': float(values.std()),
        'q25': float(q25),
        'q75': float(q75),
        'q95': float(q95),
        'q99': float(q99)
    }


def analyze_length_distribution(dataset, name, max_samples=10000):
    """Measure character and word lengths of the first training texts.

    Args:
        dataset: Object whose ``['train']`` split supports row slicing
            (``split[:n]``) returning a mapping with a ``'text'`` list, as a
            Hugging Face ``Dataset`` does.
        name: Display name of the dataset; used only in the error message.
        max_samples: Number of leading rows to inspect before blank texts are
            dropped.

    Returns:
        tuple: ``(char_lengths, word_lengths, char_stats, word_stats)``. The
        first two are lists of ints (one entry per non-blank text); the last
        two are dicts with keys ``min``, ``max``, ``mean``, ``median``,
        ``std``, ``q25``, ``q75``, ``q95``, ``q99`` holding plain Python
        numbers.

    Raises:
        ValueError: If none of the inspected rows contains non-blank text.
    """
    # Slice rows before selecting the column so that only ``max_samples`` rows
    # are read rather than the whole split.
    texts = [text for text in dataset['train'][:max_samples]['text'] if text.strip()]
    if not texts:
        raise ValueError(
            f"No non-empty texts in the first {max_samples} training rows of {name}"
        )

    # Calculate lengths
    char_lengths = [len(text) for text in texts]
    word_lengths = [len(text.split()) for text in texts]

    return (
        char_lengths,
        word_lengths,
        _summarize_lengths(char_lengths),
        _summarize_lengths(word_lengths),
    )

# Analyze both datasets
wiki_char_lens, wiki_word_lens, wiki_char_stats, wiki_word_stats = analyze_length_distribution(
    wikitext, "WikiText-103"
)
tiny_char_lens, tiny_word_lens, tiny_char_stats, tiny_word_stats = analyze_length_distribution(
    tinystories, "TinyStories"
)

# Print statistics
# One loop over (title, stats) pairs replaces four identical print blocks;
# the order matches the original report.
print("=== Length Distribution Statistics ===")
for title, summary in (
    ("WikiText-103 Character Lengths", wiki_char_stats),
    ("TinyStories Character Lengths", tiny_char_stats),
    ("WikiText-103 Word Lengths", wiki_word_stats),
    ("TinyStories Word Lengths", tiny_word_stats),
):
    print(f"\n{title}:")
    for key, value in summary.items():
        print(f"  {key}: {value:.1f}")

In [None]:
# Plot length distributions
def _plot_length_panel(ax, wiki_values, tiny_values, title, xlabel, xlim=None, log_y=False):
    """Overlay the WikiText-103 and TinyStories density histograms of one length measure on ``ax``.

    ``xlim`` (a ``(low, high)`` pair) zooms the x-axis when given; ``log_y``
    switches the y-axis to a log scale.
    """
    ax.hist(wiki_values, bins=50, alpha=0.7, label='WikiText-103', density=True)
    ax.hist(tiny_values, bins=50, alpha=0.7, label='TinyStories', density=True)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Density')
    if log_y:
        ax.set_yscale('log')
    ax.legend()
    if xlim is not None:
        ax.set_xlim(*xlim)  # Zoom in for better visibility


fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Top row: linear y-axis, zoomed x-axis
_plot_length_panel(axes[0, 0], wiki_char_lens, tiny_char_lens,
                   'Character Length Distribution', 'Characters', xlim=(0, 2000))
_plot_length_panel(axes[0, 1], wiki_word_lens, tiny_word_lens,
                   'Word Length Distribution', 'Words', xlim=(0, 400))

# Bottom row: same data over the full x-range with a log-scaled y-axis
_plot_length_panel(axes[1, 0], wiki_char_lens, tiny_char_lens,
                   'Character Length Distribution (Log Scale)', 'Characters', log_y=True)
_plot_length_panel(axes[1, 1], wiki_word_lens, tiny_word_lens,
                   'Word Length Distribution (Log Scale)', 'Words', log_y=True)

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
# (e.g. on a fresh checkout) before writing the figure.
figure_path = '../results/figures/dataset_length_distributions.png'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

## 4. Tokenization Analysis

In [None]:
# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Report the tokenizer's vocabulary size and its special-token mapping.
for label, value in (
    ("Tokenizer vocabulary size", tokenizer.vocab_size),
    ("Special tokens", tokenizer.special_tokens_map),
):
    print(f"{label}: {value}")

In [None]:
# Analyze tokenization for both datasets
def analyze_tokenization(texts, tokenizer, name, max_samples=1000):
    """Tokenize a sample of texts and report token-count statistics.

    Args:
        texts: Sliceable sequence of strings.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            that returns a sized sequence of tokens.
        name: Display name used in the printed header and error message.
        max_samples: Number of leading texts to inspect before blank texts
            are dropped.

    Returns:
        tuple: ``(token_lengths, compression_ratios, token_stats)``.
        ``token_lengths`` holds the token count of each sampled text,
        ``compression_ratios`` the characters-per-token ratio of each text
        (0 when a text yields no tokens), and ``token_stats`` a dict with keys
        ``mean``, ``median``, ``std``, ``min``, ``max``, ``q95``, ``q99``
        holding plain Python numbers.

    Raises:
        ValueError: If the sample contains no non-blank text.
    """
    print(f"\n=== Tokenization Analysis: {name} ===")

    # Sample texts for analysis
    sample_texts = [text for text in texts[:max_samples] if text.strip()]
    if not sample_texts:
        raise ValueError(
            f"No non-empty texts in the first {max_samples} entries of {name}"
        )

    token_lengths = []
    compression_ratios = []

    for text in sample_texts:
        num_tokens = len(tokenizer.encode(text, add_special_tokens=False))
        token_lengths.append(num_tokens)
        # chars per token; guard against texts that produce no tokens
        compression_ratios.append(len(text) / num_tokens if num_tokens > 0 else 0)

    # Convert once and reuse for every statistic and coverage threshold.
    lengths = np.asarray(token_lengths)
    ratios = np.asarray(compression_ratios, dtype=float)
    q95, q99 = np.percentile(lengths, [95, 99])

    # Cast to built-in numbers: NumPy integer scalars (what np.min/np.max
    # return for int input) are rejected by the json module.
    token_stats = {
        'mean': float(lengths.mean()),
        'median': float(np.median(lengths)),
        'std': float(lengths.std()),
        'min': int(lengths.min()),
        'max': int(lengths.max()),
        'q95': float(q95),
        'q99': float(q99)
    }

    compression_stats = {
        'mean': float(ratios.mean()),
        'median': float(np.median(ratios)),
        'std': float(ratios.std())
    }

    print(f"Token Length Statistics:")
    for key, value in token_stats.items():
        print(f"  {key}: {value:.1f}")

    print(f"\nCompression Ratio (chars/token):")
    for key, value in compression_stats.items():
        print(f"  {key}: {value:.2f}")

    # Share of sampled texts whose token count fits within each sequence length
    seq_lengths = [128, 256, 512, 1024, 2048]
    print(f"\nSequence Length Coverage:")
    for seq_len in seq_lengths:
        coverage = np.mean(lengths <= seq_len) * 100
        print(f"  {seq_len} tokens: {coverage:.1f}% of examples")

    return token_lengths, compression_ratios, token_stats

# Analyze tokenization
# Hand the function only the rows it will look at. Slicing the rows first and
# then selecting 'text' avoids reading the whole training column just to keep
# its first entries; the sampled texts are the same.
TOKENIZATION_MAX_SAMPLES = 1000

wiki_tokens, wiki_compression, wiki_token_stats = analyze_tokenization(
    wikitext['train'][:TOKENIZATION_MAX_SAMPLES]['text'],
    tokenizer,
    "WikiText-103",
    max_samples=TOKENIZATION_MAX_SAMPLES,
)

tiny_tokens, tiny_compression, tiny_token_stats = analyze_tokenization(
    tinystories['train'][:TOKENIZATION_MAX_SAMPLES]['text'],
    tokenizer,
    "TinyStories",
    max_samples=TOKENIZATION_MAX_SAMPLES,
)

In [None]:
# Plot token length distributions
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Token length distributions
axes[0, 0].hist(wiki_tokens, bins=50, alpha=0.7, label='WikiText-103', density=True)
axes[0, 0].hist(tiny_tokens, bins=50, alpha=0.7, label='TinyStories', density=True)
axes[0, 0].set_title('Token Length Distribution')
axes[0, 0].set_xlabel('Number of Tokens')
axes[0, 0].set_ylabel('Density')
axes[0, 0].legend()
axes[0, 0].set_xlim(0, 1000)  # Zoom in

# Compression ratio distributions
axes[0, 1].hist(wiki_compression, bins=50, alpha=0.7, label='WikiText-103', density=True)
axes[0, 1].hist(tiny_compression, bins=50, alpha=0.7, label='TinyStories', density=True)
axes[0, 1].set_title('Compression Ratio Distribution (chars/token)')
axes[0, 1].set_xlabel('Characters per Token')
axes[0, 1].set_ylabel('Density')
axes[0, 1].legend()

# Sequence length coverage: percentage of examples whose token count fits
# within each candidate maximum length.
seq_lengths = [128, 256, 512, 1024, 2048]
# Convert the token-count lists once instead of once per threshold.
wiki_token_arr = np.asarray(wiki_tokens)
tiny_token_arr = np.asarray(tiny_tokens)
wiki_coverage = [np.mean(wiki_token_arr <= seq_len) * 100 for seq_len in seq_lengths]
tiny_coverage = [np.mean(tiny_token_arr <= seq_len) * 100 for seq_len in seq_lengths]

x = np.arange(len(seq_lengths))
width = 0.35

axes[1, 0].bar(x - width/2, wiki_coverage, width, label='WikiText-103', alpha=0.8)
axes[1, 0].bar(x + width/2, tiny_coverage, width, label='TinyStories', alpha=0.8)
axes[1, 0].set_title('Sequence Length Coverage')
axes[1, 0].set_xlabel('Maximum Sequence Length')
axes[1, 0].set_ylabel('Coverage (%)')
axes[1, 0].set_xticks(x)
axes[1, 0].set_xticklabels(seq_lengths)
axes[1, 0].legend()
axes[1, 0].set_ylim(0, 100)

# Sample tokenization examples, rendered as text in the last panel
axes[1, 1].axis('off')
sample_text = "Once upon a time, there was a little girl named Alice."
tokens = tokenizer.encode(sample_text)
decoded_tokens = [tokenizer.decode([token]) for token in tokens]

example_text = f"Sample tokenization:\n\nOriginal: {sample_text}\n\n"
example_text += f"Tokens ({len(tokens)}): {tokens}\n\n"
example_text += "Decoded tokens:\n"
example_text += "".join(
    f"{i:2d}: {token:5d} -> '{decoded}'\n"
    for i, (token, decoded) in enumerate(zip(tokens, decoded_tokens))
)

axes[1, 1].text(0.1, 0.9, example_text, transform=axes[1, 1].transAxes,
                fontsize=10, verticalalignment='top', fontfamily='monospace')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
# (e.g. on a fresh checkout) before writing the figure.
figure_path = '../results/figures/tokenization_analysis.png'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

## 5. Vocabulary Analysis

In [None]:
# Analyze vocabulary usage
def analyze_vocabulary_usage(texts, tokenizer, name, max_samples=5000):
    """Count token frequencies over a sample of texts and print a summary.

    Args:
        texts: Sliceable sequence of strings.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``,
            ``decode(list_of_ids)`` and a ``vocab_size`` attribute.
        name: Display name used in the printed header.
        max_samples: Number of leading texts to inspect before blank texts
            are dropped.

    Returns:
        tuple: ``(token_counts, total_tokens, unique_tokens)`` where
        ``token_counts`` is a ``Counter`` mapping token id -> occurrences,
        ``total_tokens`` the number of tokens seen and ``unique_tokens`` the
        number of distinct token ids. When the sample yields no tokens the
        result is ``(Counter(), 0, 0)`` and the type-token ratio is printed
        as 0.
    """
    print(f"\n=== Vocabulary Analysis: {name} ===")

    # Feed each text's tokens straight into the counter; there is no need to
    # hold every token of the sample in one list just to count it.
    token_counts = Counter()
    for text in texts[:max_samples]:
        if text.strip():
            token_counts.update(tokenizer.encode(text, add_special_tokens=False))

    # Statistics
    total_tokens = sum(token_counts.values())
    unique_tokens = len(token_counts)
    # Avoid dividing by zero when the sample produced no tokens at all.
    type_token_ratio = unique_tokens / total_tokens if total_tokens else 0.0

    print(f"Total tokens: {total_tokens:,}")
    print(f"Unique tokens: {unique_tokens:,}")
    print(f"Vocabulary coverage: {unique_tokens/tokenizer.vocab_size*100:.1f}%")
    print(f"Type-token ratio: {type_token_ratio:.4f}")

    # Sort once; both listings below are slices of the same ranking.
    ranked = token_counts.most_common()

    # Most common tokens (newlines and spaces made visible)
    print(f"\nMost common tokens:")
    for token, count in ranked[:10]:
        decoded = tokenizer.decode([token]).replace('\n', '\\n').replace(' ', '·')
        print(f"  {token:5d}: '{decoded:15s}' ({count:,} times, {count/total_tokens*100:.2f}%)")

    # Least common tokens
    print(f"\nLeast common tokens:")
    for token, count in ranked[-10:]:
        decoded = tokenizer.decode([token]).replace('\n', '\\n').replace(' ', '·')
        print(f"  {token:5d}: '{decoded:15s}' ({count} times)")

    return token_counts, total_tokens, unique_tokens

# Analyze vocabulary for both datasets
# Hand the function only the rows it will look at. Slicing the rows first and
# then selecting 'text' avoids reading the whole training column just to keep
# its first entries; the sampled texts are the same.
VOCAB_MAX_SAMPLES = 5000

wiki_vocab, wiki_total, wiki_unique = analyze_vocabulary_usage(
    wikitext['train'][:VOCAB_MAX_SAMPLES]['text'],
    tokenizer,
    "WikiText-103",
    max_samples=VOCAB_MAX_SAMPLES,
)

tiny_vocab, tiny_total, tiny_unique = analyze_vocabulary_usage(
    tinystories['train'][:VOCAB_MAX_SAMPLES]['text'],
    tokenizer,
    "TinyStories",
    max_samples=VOCAB_MAX_SAMPLES,
)

In [None]:
# Plot vocabulary statistics
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Token frequency distributions (log scale)
wiki_freqs = list(wiki_vocab.values())
tiny_freqs = list(tiny_vocab.values())

# The x-axis is logarithmic, so use log-spaced bin edges shared by both
# datasets. Equal-width bins would put almost every token into the first bin
# and make the two histograms incomparable. Frequencies start at 1; the upper
# edge is clamped to at least 2 so the edges are never all identical.
max_freq = max(max(wiki_freqs, default=1), max(tiny_freqs, default=1))
freq_bins = np.logspace(0, np.log10(max(max_freq, 2)), 51)

axes[0, 0].hist(wiki_freqs, bins=freq_bins, alpha=0.7, label='WikiText-103', density=True)
axes[0, 0].hist(tiny_freqs, bins=freq_bins, alpha=0.7, label='TinyStories', density=True)
axes[0, 0].set_title('Token Frequency Distribution')
axes[0, 0].set_xlabel('Token Frequency')
axes[0, 0].set_ylabel('Density')
axes[0, 0].set_xscale('log')
axes[0, 0].set_yscale('log')
axes[0, 0].legend()

# Zipf's law visualization: frequency against rank on log-log axes
wiki_sorted = sorted(wiki_freqs, reverse=True)
tiny_sorted = sorted(tiny_freqs, reverse=True)

ranks = np.arange(1, len(wiki_sorted) + 1)
axes[0, 1].loglog(ranks, wiki_sorted, 'o-', alpha=0.7, label='WikiText-103', markersize=2)

ranks = np.arange(1, len(tiny_sorted) + 1)
axes[0, 1].loglog(ranks, tiny_sorted, 'o-', alpha=0.7, label='TinyStories', markersize=2)

axes[0, 1].set_title("Zipf's Law: Token Rank vs Frequency")
axes[0, 1].set_xlabel('Rank')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Vocabulary coverage comparison
datasets = ['WikiText-103', 'TinyStories']
total_tokens = [wiki_total, tiny_total]
unique_tokens = [wiki_unique, tiny_unique]
coverage = [wiki_unique/tokenizer.vocab_size*100, tiny_unique/tokenizer.vocab_size*100]

x = np.arange(len(datasets))
width = 0.25

axes[1, 0].bar(x - width, [t/1000 for t in total_tokens], width, label='Total Tokens (K)', alpha=0.8)
axes[1, 0].bar(x, unique_tokens, width, label='Unique Tokens', alpha=0.8)
axes[1, 0].bar(x + width, coverage, width, label='Vocab Coverage (%)', alpha=0.8)

axes[1, 0].set_title('Vocabulary Statistics Comparison')
axes[1, 0].set_xlabel('Dataset')
axes[1, 0].set_ylabel('Count / Percentage')
axes[1, 0].set_xticks(x)
axes[1, 0].set_xticklabels(datasets)
axes[1, 0].legend()
axes[1, 0].set_yscale('log')

# Summary statistics table
# The last two rows come from the per-example token-count statistics, so they
# are labelled as tokens per example rather than as a "token length".
axes[1, 1].axis('off')
summary_data = [
    ['Metric', 'WikiText-103', 'TinyStories'],
    ['Total tokens', f'{wiki_total:,}', f'{tiny_total:,}'],
    ['Unique tokens', f'{wiki_unique:,}', f'{tiny_unique:,}'],
    ['Vocab coverage', f'{wiki_unique/tokenizer.vocab_size*100:.1f}%',
     f'{tiny_unique/tokenizer.vocab_size*100:.1f}%'],
    ['Type-token ratio', f'{wiki_unique/wiki_total:.4f}', f'{tiny_unique/tiny_total:.4f}'],
    ['Avg tokens/example', f'{wiki_token_stats["mean"]:.1f}', f'{tiny_token_stats["mean"]:.1f}'],
    ['p95 tokens/example', f'{wiki_token_stats["q95"]:.1f}', f'{tiny_token_stats["q95"]:.1f}']
]

table_text = "\n".join([f"{row[0]:<20} {row[1]:<15} {row[2]:<15}" for row in summary_data])
axes[1, 1].text(0.1, 0.9, table_text, transform=axes[1, 1].transAxes,
                fontsize=11, verticalalignment='top', fontfamily='monospace')
axes[1, 1].set_title('Dataset Comparison Summary')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
# (e.g. on a fresh checkout) before writing the figure.
figure_path = '../results/figures/vocabulary_analysis.png'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

## 6. Dataset Recommendations for Training

In [None]:
# Provide recommendations based on analysis
# Coverage figures quoted in section 1: percentage of sampled examples whose
# token count fits within the suggested maximum length.
wiki_cov_1024 = np.mean(np.array(wiki_tokens) <= 1024) * 100
tiny_cov_512 = np.mean(np.array(tiny_tokens) <= 512) * 100

recommendation_sections = [
    ("1. SEQUENCE LENGTH:", [
        f"   - For WikiText-103: Use max_length=1024 (covers ~{wiki_cov_1024:.1f}% of examples)",
        f"   - For TinyStories: Use max_length=512 (covers ~{tiny_cov_512:.1f}% of examples)",
    ]),
    ("2. BATCH SIZE RECOMMENDATIONS:", [
        "   - WikiText-103: Start with smaller batches (8-16) due to longer sequences",
        "   - TinyStories: Can use larger batches (16-32) due to shorter sequences",
    ]),
    ("3. DATASET CHARACTERISTICS:", [
        "   - WikiText-103: More diverse vocabulary, longer contexts, encyclopedia-style text",
        "   - TinyStories: Simpler vocabulary, shorter contexts, narrative-style text",
    ]),
    ("4. MASKING STRATEGY IMPLICATIONS:", [
        "   - WikiText-103: Aggressive masking may hurt performance due to long-range dependencies",
        "   - TinyStories: More tolerant to attention masking due to simpler structure",
    ]),
    ("5. RECOMMENDED TRAINING ORDER:", [
        "   1. Start with TinyStories for quick experimentation and hyperparameter tuning",
        "   2. Test masking schedules on TinyStories first",
        "   3. Apply best configurations to WikiText-103 for final evaluation",
    ]),
]

print("=== Dataset Recommendations for Training ===")
print()

# Each section is a heading, its bullet lines, then one blank line.
for heading, bullets in recommendation_sections:
    print(heading)
    for bullet in bullets:
        print(bullet)
    print()

# Save analysis results
def _json_default(value):
    """Convert NumPy scalars and arrays, which json cannot encode, to built-in Python values."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


analysis_results = {
    'wikitext_stats': {
        'char_stats': wiki_char_stats,
        'word_stats': wiki_word_stats,
        'token_stats': wiki_token_stats,
        'vocab_coverage': wiki_unique/tokenizer.vocab_size*100,
        'type_token_ratio': wiki_unique/wiki_total
    },
    'tinystories_stats': {
        'char_stats': tiny_char_stats,
        'word_stats': tiny_word_stats,
        'token_stats': tiny_token_stats,
        'vocab_coverage': tiny_unique/tokenizer.vocab_size*100,
        'type_token_ratio': tiny_unique/tiny_total
    },
    'recommendations': {
        'wikitext_max_length': 1024,
        'tinystories_max_length': 512,
        'wikitext_batch_size': [8, 16],
        'tinystories_batch_size': [16, 32]
    }
}

# The stats dicts may hold NumPy scalars (np.min/np.max on integer data return
# NumPy integers), which json refuses to encode without a `default` hook.
# Serialize to a string first so a serialization error cannot leave a
# half-written file behind.
serialized_results = json.dumps(analysis_results, indent=2, default=_json_default)

with open('../data/dataset_analysis.json', 'w') as f:
    f.write(serialized_results)

print("Analysis results saved to ../data/dataset_analysis.json")