In [None]:
# Import required libraries
import json
import os
import sys
import time

sys.path.append('../src')

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import GPT2LMHeadModel, GPT2Tokenizer

from src.dataset import prepare_dataset, get_dataloader
from src.evaluate import ModelEvaluator, load_model

# Fix the random seeds up front: the text-generation section samples with
# do_sample=True, so without a seed every run prints different samples.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set device (prefer the GPU when one is visible to PyTorch)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Set style: global matplotlib/seaborn defaults used by every figure below
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

## 1. Load Baseline Models

In [None]:
# Load baseline models (assuming they've been trained)
# baseline_models maps a short name to {'model', 'tokenizer', 'path'}.
baseline_models = {}
model_paths = {
    'baseline_tiny': '../models/baseline_gpt2_tiny',
    'baseline_wiki': '../models/baseline_gpt2_wiki'
}

# Try to load each checkpoint; if anything goes wrong, fall back to the stock
# pretrained GPT-2 so the rest of the notebook can still run.
for name, path in model_paths.items():
    try:
        print(f"Loading {name} from {path}...")
        model, tokenizer = load_model(path, 'baseline')
        baseline_models[name] = {'model': model, 'tokenizer': tokenizer, 'path': path}
        print(f"✓ Loaded {name}")
    except Exception as e:
        print(f"⚠ Could not load {name}: {e}")
        print("Using pretrained GPT-2 instead...")
        model = GPT2LMHeadModel.from_pretrained('gpt2')
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT-2 ships without a pad token; reuse EOS so padding works.
        tokenizer.pad_token = tokenizer.eos_token
        # 'path' records 'gpt2' so later output shows the fallback was used.
        baseline_models[name] = {'model': model, 'tokenizer': tokenizer, 'path': 'gpt2'}
        print(f"✓ Using pretrained GPT-2 for {name}")

print(f"\nLoaded {len(baseline_models)} baseline models")

## 2. Model Information

In [None]:
# Display model information: parameter counts, in-memory size and config
for name, model_info in baseline_models.items():
    model = model_info['model']

    # Materialise once so the three aggregates below share a single traversal.
    parameters = list(model.parameters())
    total_params = sum(p.numel() for p in parameters)
    trainable_params = sum(p.numel() for p in parameters if p.requires_grad)
    # Use each tensor's actual element width instead of assuming float32, so
    # half-precision checkpoints are not reported at twice their real size.
    model_size_mb = sum(p.numel() * p.element_size() for p in parameters) / 1024**2

    print(f"=== {name.upper()} ===")
    print(f"Model path: {model_info['path']}")
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Model size: {model_size_mb:.1f} MB")
    print(f"Config: {model.config}")
    print()

## 3. Prepare Test Datasets

In [None]:
# Prepare test datasets
print("Preparing test datasets...")

# TinyStories dataset (sequences capped at 512 tokens)
tiny_datasets, tiny_tokenizer = prepare_dataset('tiny_stories', max_length=512)
tiny_test_loader = get_dataloader(tiny_datasets['test'], batch_size=8, shuffle=False)

# WikiText-103 dataset (sequences capped at 1024 tokens, smaller batch)
wiki_datasets, wiki_tokenizer = prepare_dataset('wikitext-103', max_length=1024)
wiki_test_loader = get_dataloader(wiki_datasets['test'], batch_size=4, shuffle=False)

# Keyed by dataset name; the evaluation cell iterates over this mapping.
test_datasets = {
    'tiny_stories': tiny_test_loader,
    'wikitext': wiki_test_loader
}

print("✓ Prepared test datasets")
print(f"  TinyStories test: {len(tiny_datasets['test'])} examples")
print(f"  WikiText test: {len(wiki_datasets['test'])} examples")

## 4. Comprehensive Model Evaluation

In [None]:
# Evaluate models on both datasets
# evaluation_results[model_name][dataset_name] holds the dict returned by
# ModelEvaluator.full_evaluation, or None when that pair failed.
evaluation_results = {}

for model_name, model_info in baseline_models.items():
    model = model_info['model']
    tokenizer = model_info['tokenizer']
    
    print(f"\n{'='*50}")
    print(f"EVALUATING: {model_name.upper()}")
    print(f"{'='*50}")
    
    evaluator = ModelEvaluator(model, tokenizer, device)
    evaluation_results[model_name] = {}
    
    # Evaluate on both datasets
    for dataset_name, test_loader in test_datasets.items():
        print(f"\n--- Evaluating on {dataset_name} ---")
        
        # The summary printing deliberately stays inside the try block: a
        # result missing any of the keys read below is replaced by None, and
        # extract_metrics skips None entries instead of failing on them.
        try:
            # Full evaluation
            results = evaluator.full_evaluation(test_loader)
            evaluation_results[model_name][dataset_name] = results
            
            # Print summary
            print(f"Perplexity: {results['perplexity']:.2f}")
            print(f"Throughput: {results['throughput']['tokens_per_second']:.1f} tokens/sec")
            print(f"Latency (mean): {results['latency']['length_512']['mean_latency_ms']:.1f} ms")
            
            # GPU memory is optional in the results.
            if 'gpu_memory_allocated_mb' in results['memory']:
                print(f"GPU Memory: {results['memory']['gpu_memory_allocated_mb']:.1f} MB")
            
        except Exception as e:
            print(f"Error evaluating {model_name} on {dataset_name}: {e}")
            evaluation_results[model_name][dataset_name] = None

print("\n✓ Model evaluation completed")

## 5. Results Visualization

In [None]:
# Extract results for visualization
def extract_metrics(results):
    """Flatten nested evaluation results into one row per (model, dataset).

    Args:
        results: Mapping ``{model_name: {dataset_name: result_or_None}}``.
            Each non-None result must provide ``'perplexity'``,
            ``'throughput']['tokens_per_second'``,
            ``'latency']['length_512']['mean_latency_ms'`` and a ``'memory'``
            mapping.

    Returns:
        pandas.DataFrame with columns model, dataset, perplexity, throughput,
        latency and memory. Entries whose result is None are skipped; memory
        defaults to 0 when ``'gpu_memory_allocated_mb'`` is absent.
    """
    column_names = ('model', 'dataset', 'perplexity', 'throughput', 'latency', 'memory')

    # One tuple per successful evaluation, in the same order as column_names.
    rows = [
        (
            model_name,
            dataset_name,
            outcome['perplexity'],
            outcome['throughput']['tokens_per_second'],
            outcome['latency']['length_512']['mean_latency_ms'],
            outcome['memory'].get('gpu_memory_allocated_mb', 0),
        )
        for model_name, per_dataset in results.items()
        for dataset_name, outcome in per_dataset.items()
        if outcome is not None
    ]

    # Transpose the rows into a column -> list mapping so the frame always has
    # every column, even when there are no rows.
    columns = {
        column: [row[position] for row in rows]
        for position, column in enumerate(column_names)
    }
    return pd.DataFrame(columns)

# Flatten the evaluation results into a table and show it as plain text.
metrics_df = extract_metrics(evaluation_results)
print("Evaluation Metrics:", metrics_df, sep="\n")

In [None]:
# Plot evaluation results: a 2x2 grid of per-dataset bar charts, one bar per model
if not metrics_df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # (axis, metrics_df column, title, y-label) for the metrics that are always present
    panels = [
        (axes[0, 0], 'perplexity', 'Perplexity Comparison', 'Perplexity (lower is better)'),
        (axes[0, 1], 'throughput', 'Throughput Comparison', 'Tokens/Second (higher is better)'),
        (axes[1, 0], 'latency', 'Latency Comparison', 'Latency ms (lower is better)'),
    ]
    for ax, column, title, ylabel in panels:
        sns.barplot(data=metrics_df, x='dataset', y=column, hue='model', ax=ax)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
    
    # Memory usage comparison
    memory_ax = axes[1, 1]
    if metrics_df['memory'].sum() > 0:  # Only plot if we have GPU memory data
        sns.barplot(data=metrics_df, x='dataset', y='memory', hue='model', ax=memory_ax)
        memory_ax.set_ylabel('Memory MB')
    else:
        # extract_metrics records 0 when no GPU figure was reported, so an
        # all-zero column means there is nothing meaningful to draw.
        memory_ax.text(0.5, 0.5, 'GPU Memory\nData Not Available', 
                       transform=memory_ax.transAxes, ha='center', va='center')
    memory_ax.set_title('GPU Memory Usage')
    
    plt.tight_layout()
    # Create the output directory first; savefig raises FileNotFoundError otherwise.
    figure_path = '../results/figures/baseline_evaluation.png'
    os.makedirs(os.path.dirname(figure_path), exist_ok=True)
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("No evaluation results to plot")

## 6. Text Generation Analysis

In [None]:
# Generate sample texts to analyze quality
def generate_sample_texts(model, tokenizer, prompts, max_length=100, num_samples=3):
    """Sample continuations for each prompt, print them and return them all.

    Args:
        model: Object exposing ``eval()`` and ``generate(input_ids, **kwargs)``.
        tokenizer: Object exposing ``encode``, ``decode`` and ``eos_token_id``.
        prompts: Iterable of prompt strings.
        max_length: Number of tokens allowed on top of the prompt length.
        num_samples: Number of independent samples drawn per prompt.

    Returns:
        Flat list of decoded strings, ordered prompt by prompt, sample by sample.

    Note:
        Input ids are moved to the notebook-level ``device``.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for prompt in prompts:
            print(f"\nPrompt: '{prompt}'")
            print("Generated texts:")

            # Tokenise the prompt once and reuse it for every sample.
            prompt_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

            for sample_no in range(1, num_samples + 1):
                sampling_options = dict(
                    # generate() counts the prompt, so add its length to the budget.
                    max_length=prompt_ids.shape[1] + max_length,
                    num_return_sequences=1,
                    temperature=0.8,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id,
                )
                sequences = model.generate(prompt_ids, **sampling_options)

                text = tokenizer.decode(sequences[0], skip_special_tokens=True)
                collected.append(text)

                print(f"  {sample_no}: {text}")

            print("-" * 80)

    return collected

# Prompt sets: story-style openers for the TinyStories model, factual
# openers for every other model.
prompts = dict(
    tiny_stories=[
        "Once upon a time, there was a",
        "The little girl wanted to",
        "In the forest, they found",
    ],
    general=[
        "The theory of relativity",
        "Machine learning is",
        "Climate change affects",
    ],
)

# Generate samples for each model
for model_name, model_info in baseline_models.items():
    model = model_info['model'].to(device)
    tokenizer = model_info['tokenizer']

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"TEXT GENERATION: {model_name.upper()}")
    print(f"{banner}")

    # Pick the prompt set from the model's name.
    prompt_set = 'tiny_stories' if 'tiny' in model_name else 'general'
    test_prompts = prompts[prompt_set]

    generated_texts = generate_sample_texts(
        model,
        tokenizer,
        test_prompts,
        max_length=50,
        num_samples=2,
    )

## 7. Attention Pattern Analysis

In [None]:
# Analyze attention patterns in baseline model
def extract_attention_patterns(model, tokenizer, text, layer_idx=11):
    """Run ``text`` through ``model`` and return one layer's attention weights.

    Args:
        model: Callable accepting ``(input_ids, output_attentions=True)`` whose
            result has an ``attentions`` sequence indexable by layer.
        tokenizer: Callable returning a mapping with an ``'input_ids'`` tensor.
        text: Input string; tokenisation truncates it to 512 tokens.
        layer_idx: Index into ``attentions`` (negative indices count from the end).

    Returns:
        Tuple ``(attention, inputs)``: the selected layer's weights as a NumPy
        array, and the tokenizer output unchanged.

    Note:
        Input ids are moved to the notebook-level ``device``.
    """
    model.eval()

    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    token_ids = encoded['input_ids'].to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        forward_result = model(token_ids, output_attentions=True)
        layer_attention = forward_result.attentions[layer_idx]

    return layer_attention.cpu().numpy(), encoded

# Visualize attention patterns
def visualize_attention(attention, tokens, layer_idx, head_idx=0, tokenizer=None):
    """Draw one attention head as a heatmap and return the figure.

    Args:
        attention: Array indexed as ``[batch, head, query, key]``; only the
            first batch element is plotted.
        tokens: Iterable of token ids; each is decoded on its own to label the
            axes.
        layer_idx: Layer number shown in the title (not used for indexing).
        head_idx: Which head of ``attention`` to plot.
        tokenizer: Object with ``decode(list_of_ids)`` used for tick labels.
            When omitted, the notebook-level ``tokenizer`` variable is used,
            which is what this function always did before the parameter
            existed.

    Returns:
        The current matplotlib figure containing the heatmap.
    """
    if tokenizer is None:
        # Backward-compatible fallback for callers that rely on the global.
        tokenizer = globals()['tokenizer']

    head_weights = attention[0, head_idx]  # [seq_len, seq_len]

    # Decode once and reuse for both axes; this also keeps one-shot iterables
    # (e.g. generators) from being exhausted before the second axis.
    labels = [tokenizer.decode([token]) for token in tokens]
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        head_weights,
        xticklabels=labels,
        yticklabels=labels,
        cmap='Blues',
        cbar=True
    )
    plt.title(f'Attention Pattern - Layer {layer_idx}, Head {head_idx}')
    plt.xlabel('Key Tokens')
    plt.ylabel('Query Tokens')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    return plt.gcf()

# Analyze attention patterns for one model (the first entry of baseline_models)
if baseline_models:
    model_name = next(iter(baseline_models))
    model_info = baseline_models[model_name]
    model = model_info['model'].to(device)
    # visualize_attention reads this notebook-level `tokenizer`, so it must be
    # assigned before the call below.
    tokenizer = model_info['tokenizer']
    
    sample_text = "The quick brown fox jumps over the lazy dog."
    
    print(f"Analyzing attention patterns for {model_name}...")
    print(f"Sample text: '{sample_text}'")
    
    # Extract attention from last layer
    attention, inputs = extract_attention_patterns(model, tokenizer, sample_text, layer_idx=-1)
    tokens = inputs['input_ids'][0].cpu().numpy()
    
    print(f"Attention shape: {attention.shape}")
    print(f"Tokens: {[tokenizer.decode([token]) for token in tokens]}")
    
    # Visualize attention for first head
    fig = visualize_attention(attention, tokens, layer_idx=-1, head_idx=0)
    # Create the output directory first; savefig raises FileNotFoundError otherwise.
    figure_path = '../results/figures/baseline_attention_pattern.png'
    os.makedirs(os.path.dirname(figure_path), exist_ok=True)
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()
    
    # Calculate attention statistics over every head of the extracted layer
    attention_stats = {
        'mean_attention': np.mean(attention),
        'max_attention': np.max(attention),
        # Entropy of each query row (summed over the last axis), then averaged;
        # the 1e-10 keeps log() finite for zero weights.
        'attention_entropy': -np.sum(attention * np.log(attention + 1e-10), axis=-1).mean(),
        'attention_sparsity': np.mean(attention < 0.01)  # Fraction of near-zero weights
    }
    
    print(f"\nAttention Statistics:")
    for key, value in attention_stats.items():
        print(f"  {key}: {value:.4f}")

## 8. Performance Summary

In [None]:
# Create comprehensive performance summary
rule = "=" * 80
print("\n" + rule)
print("BASELINE MODEL PERFORMANCE SUMMARY")
print(rule)

if metrics_df.empty:
    print("No evaluation results available for summary")
else:
    # Mean and standard deviation of every metric, one row per dataset.
    metric_columns = ('perplexity', 'throughput', 'latency', 'memory')
    summary_stats = (
        metrics_df
        .groupby('dataset')
        .agg({column: ['mean', 'std'] for column in metric_columns})
        .round(2)
    )

    print("\nPerformance by Dataset:")
    print(summary_stats)

    # Best performing model per metric
    print("\nBest Models per Metric:")

    # (label, column, lower is better?, number format, unit suffix)
    best_specs = (
        ('Best Perplexity', 'perplexity', True, '.2f', ''),
        ('Best Throughput', 'throughput', False, '.1f', ' tokens/sec'),
        ('Best Latency', 'latency', True, '.1f', ' ms'),
    )

    for dataset in metrics_df['dataset'].unique():
        dataset_df = metrics_df[metrics_df['dataset'] == dataset]
        print(f"\n{dataset.upper()}:")

        for label, column, lower_is_better, number_format, unit in best_specs:
            values = dataset_df[column]
            best_index = values.idxmin() if lower_is_better else values.idxmax()
            best_row = dataset_df.loc[best_index]
            print(f"  {label}: {best_row['model']} ({best_row[column]:{number_format}}{unit})")

# Save all results
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy / PyTorch values to builtin Python types."""

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj`` or defer to the base class."""
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, torch.Tensor):
            return obj.detach().cpu().tolist()
        return super().default(obj)


results_summary = {
    'evaluation_results': evaluation_results,
    'metrics_summary': metrics_df.to_dict() if not metrics_df.empty else {},
    'evaluation_date': time.strftime('%Y-%m-%d %H:%M:%S')
}

results_path = '../results/baseline/comprehensive_evaluation.json'
# open(..., 'w') does not create missing parent directories.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# Serialise before opening the file: if a value cannot be encoded, the error
# is raised here and an existing results file is not truncated.
serialized_results = json.dumps(results_summary, indent=2, cls=NumpyEncoder)
with open(results_path, 'w') as f:
    f.write(serialized_results)

print("\n✓ Evaluation results saved to ../results/baseline/comprehensive_evaluation.json")
print("\n🎯 Baseline evaluation completed! Use these results as reference for masked model comparison.")