# HHPF Data Exploration

This notebook explores the datasets and demonstrates the HHPF pipeline.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.utils import load_config
from src.data_preparation.dataset_loaders import get_loader
from src.data_preparation.process_datasets import process_dataset

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'whitegrid' style for the plots drawn below.
sns.set_style('whitegrid')

## 1. Load Configuration

In [None]:
# Load the 'datasets' configuration and list every dataset entry it defines.
config = load_config('datasets')

print("Available datasets:")
# Loop variables in a notebook cell become kernel globals. The names used here
# deliberately avoid `domain`, which later cells use to select the dataset
# being explored, so re-running this cell cannot silently overwrite it.
for dataset_key, dataset_info in config['datasets'].items():
    print(f"  - {dataset_key}: {dataset_info['name']} ({dataset_info['domain']})")

## 2. Load Dataset (Math Example)

In [None]:
# Start with math dataset (easiest to work with)
domain = 'math'

# Only the loading calls are inside the `try`, so the FileNotFoundError handler
# reports missing data files and nothing else.
try:
    loader = get_loader(domain)
    df = loader.load_dataset(domain)
    df = loader.get_prompt_and_answer(domain, df)
except FileNotFoundError as e:
    print(f"Dataset not found: {e}")
    print("\nPlease place your GSM8K dataset as 'data/raw/gsm8k.csv'")
    print("Expected columns: 'question', 'answer'")
else:
    print(f"Loaded {len(df)} examples")
    print(f"\nColumns: {list(df.columns)}")

    # Display sample. A bare `df.head()` is only rendered when it is the last
    # top-level expression of a cell; nested in a try/else block it would be
    # evaluated and discarded, so it is shown explicitly with `display`
    # (available as a builtin inside the IPython kernel).
    display(df.head())

## 3. Explore Dataset Statistics

In [None]:
if 'df' in locals():
    # Two size measures per prompt: characters and whitespace-separated words.
    df['prompt_length'] = df['prompt'].str.len()
    df['word_count'] = df['prompt'].str.split().str.len()

    # Side-by-side histograms, one panel per measure.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    length_ax, words_ax = axes

    length_ax.hist(df['prompt_length'], bins=30, edgecolor='black')
    length_ax.set(
        xlabel='Prompt Length (characters)',
        ylabel='Frequency',
        title='Prompt Length Distribution',
    )

    words_ax.hist(df['word_count'], bins=30, edgecolor='black', color='coral')
    words_ax.set(
        xlabel='Word Count',
        ylabel='Frequency',
        title='Prompt Word Count Distribution',
    )

    plt.tight_layout()
    plt.show()

    # Numeric summary to accompany the figure.
    mean_chars = df['prompt_length'].mean()
    mean_words = df['word_count'].mean()
    print("\nPrompt Statistics:")
    print(f"  Mean length: {mean_chars:.0f} characters")
    print(f"  Mean words: {mean_words:.1f}")

## 4. Sample Prompts and Answers

In [None]:
if 'df' in locals():
    # Display random samples. The draw is seeded so a Restart & Run All shows
    # the same rows every time, and capped at the frame size because
    # `DataFrame.sample` raises ValueError when asked for more rows than exist
    # (without replacement).
    n_examples = min(3, len(df))
    samples = df.sample(n=n_examples, random_state=42)

    preview_chars = 200
    separator = "=" * 60

    for idx, row in samples.iterrows():
        prompt = row['prompt']
        # Append the ellipsis only when text was actually cut off, so short
        # prompts are not shown as if they had been truncated.
        if len(prompt) > preview_chars:
            preview = prompt[:preview_chars] + "..."
        else:
            preview = prompt

        print(separator)
        # NOTE(review): `idx + 1` assumes the frame has an integer index - confirm.
        print(f"Example {idx + 1}")
        print(separator)
        print(f"Prompt: {preview}")
        print(f"\nGround Truth: {row['ground_truth']}")
        print()

## 5. Process Dataset for Pipeline

In [None]:
# Process the math dataset and summarise the result. The row counts come from
# the 'split' column of the frame returned by `process_dataset`.
try:
    processed_df = process_dataset(domain='math')

    # The check mark is written as the real U+2713 character; it was previously
    # stored as mis-decoded UTF-8 bytes and printed as garbage.
    print("\n✓ Dataset processed successfully!")
    print("  Saved to: data/processed/math_processed.csv")
    print(f"  Total samples: {len(processed_df)}")
    print(f"  Train samples: {(processed_df['split'] == 'train').sum()}")
    print(f"  Test samples: {(processed_df['split'] == 'test').sum()}")

except Exception as e:
    # Deliberately broad: any failure is reported as a message rather than a
    # traceback, so the notebook keeps running.
    print(f"Error processing dataset: {e}")

## 6. Next Steps

Now that the dataset is processed, you can:

1. **Generate responses**: Run `python -m src.inference.response_generator --dataset math --limit 100` (start with a small sample)
2. **Extract features**: Run `python -m src.features.feature_aggregator --responses data/features/responses_math_processed.csv --output data/features/math_features.csv`
3. **Train model**: Run `python -m src.classifier.xgboost_model --features data/features/math_features.csv`

Or use the end-to-end pipeline:
```bash
python run_pipeline.py --domain math --limit 100
```

## 7. Cost Estimation

Estimate API costs before running inference:

In [None]:
if 'processed_df' in locals():
    # Inputs to the estimate.
    n_prompts = len(processed_df)
    n_samples = 10  # Stochastic samples per prompt
    avg_tokens = 500  # Estimated average

    total_tokens = n_prompts * n_samples * avg_tokens
    # Prices below are quoted per million tokens, so convert once and reuse.
    tokens_in_millions = total_tokens / 1_000_000

    # Together AI pricing
    cost_8b = tokens_in_millions * 0.20  # $0.20 per 1M tokens
    cost_70b = tokens_in_millions * 0.88  # $0.88 per 1M tokens

    # Build the whole report, then emit it in a single print.
    report_lines = [
        "Estimated API Costs:",
        f"  Prompts: {n_prompts:,}",
        f"  Total tokens: {total_tokens:,}",
        f"  Llama-3-8B: ${cost_8b:.2f}",
        f"  Llama-3-70B: ${cost_70b:.2f}",
        "\nRecommendation: Start with 8B model (cheaper, faster)",
    ]
    print("\n".join(report_lines))