# Example Count Cost Analysis

Test how different N_EXAMPLES values affect enrichment costs.

This notebook:
- Tests multiple N_EXAMPLES values (1, 2, 3, 5)
- Tracks actual costs from OpenAI API
- Saves results for analysis
- Visualizes cost trade-offs

## Setup

In [None]:
import sys
from pathlib import Path

# Add project root to path
sys.path.insert(0, '..')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import the new test script functions
from scripts.maintenance.test_example_counts import (
    enrich_with_n_examples,
    calculate_cost,
)

# Set up plotting style
# Global plotting defaults for the notebook: the seaborn "whitegrid" theme is
# applied first, then matplotlib's default figure size is set to (10, 6).
# Cells that pass their own figsize to plt.subplots override this default.
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

## Configuration

In [None]:
# File to save/load results
# Pickle file; the "Save Results" cell merges new rows with whatever is
# already stored here and writes the combined table back.
RESULTS_FILE = Path("../data/example_count_results.pkl")

# Model to use
# Passed unchanged to the enrichment calls made by this notebook.
MODEL = "gpt-4o-2024-08-06"

# Example counts to test
# Every test word is enriched once for each value in this list.
EXAMPLE_COUNTS = [1, 2, 3, 5]

## Define Test Words

Add words to test here. Mix different parts of speech (POS) for a comprehensive comparison.

In [None]:
# (dutch_word, english_hint) pairs; each pair is enriched once per entry in
# EXAMPLE_COUNTS.
test_words = [
    ("lopen", "to walk"),
    ("huis", "house"),
    ("mooi", "beautiful"),
]

# Announce the size of the sweep before any API call is made.
n_words = len(test_words)
n_settings = len(EXAMPLE_COUNTS)

print(f"Testing {n_words} words with {n_settings} different N_EXAMPLES values")
print(f"Total enrichments: {n_words * n_settings}")
print()
print("Test words:")
for dutch, english in test_words:
    print(f"  - {dutch} ({english})")

## Run Comparison

This calls `enrich_with_n_examples` for each test word at each N_EXAMPLES value and collects the results (including any errors).

In [None]:
# Run enrichment tests
#
# Each (word, n_examples) pair yields exactly one row. A failed call is kept as
# a row whose cost/token fields are empty and whose "error" field holds the
# exception text, so a single failure never aborts the sweep.

# Fixed schema: every column exists in df_results even when all calls fail
# (or all succeed), so later cells can select these columns without KeyError.
RESULT_COLUMNS = [
    "dutch_word",
    "english_hint",
    "lemma",
    "pos",
    "n_examples",
    "phase1_cost",
    "phase1_input",
    "phase1_output",
    "phase2_cost",
    "phase2_input",
    "phase2_output",
    "total_cost",
    "duration",
    "error",
]

results = []
rule = "=" * 60

for dutch, english in test_words:
    print(f"\n{rule}")
    print(f"Testing: {dutch} ({english})")
    print(rule)

    for n in EXAMPLE_COUNTS:
        print(f"  n_examples={n}...", end=" ", flush=True)

        # Identifying fields are recorded for successes and failures alike.
        row = {"dutch_word": dutch, "english_hint": english, "n_examples": n}

        try:
            result, cost, duration = enrich_with_n_examples(dutch, english, n, MODEL)
            # Format the status line and build the payload before touching
            # `row`, so any failure here leaves `row` as a clean error row.
            status = f"✓ ${cost:.5f} ({duration:.1f}s)"
            row.update({
                "lemma": result["lemma"],
                "pos": result["pos"],
                "phase1_cost": result["phase1_cost"],
                "phase1_input": result["phase1_tokens"]["input"],
                "phase1_output": result["phase1_tokens"]["output"],
                "phase2_cost": result["phase2_cost"],
                "phase2_input": result["phase2_tokens"]["input"],
                "phase2_output": result["phase2_tokens"]["output"],
                "total_cost": cost,
                "duration": duration,
            })
        except Exception as e:
            # Deliberately broad: this is a best-effort sweep, and the failure
            # is recorded in the row rather than swallowed.
            row["error"] = str(e)
            print(f"✗ Error: {e}")
        else:
            print(status)

        results.append(row)

# Convert to DataFrame (missing fields become NaN thanks to the fixed schema)
df_results = pd.DataFrame(results, columns=RESULT_COLUMNS)

n_failed = int(df_results["error"].notna().sum())

print(f"\n{rule}")
print(f"Completed {len(df_results)} enrichments")
if n_failed:
    print(f"  ({n_failed} failed - see the 'error' column)")
print(rule)

## Save Results

Merge with existing results (if any) and save to a pickle file.

In [None]:
# Persist this run's results, folding them into any previously saved table.
import pickle

RESULTS_FILE.parent.mkdir(exist_ok=True)

if not RESULTS_FILE.exists():
    # First run: nothing on disk to merge with.
    df_all = df_results
    print(f"New results: {len(df_all)} total")
else:
    # NOTE(review): unpickling can execute arbitrary code; this is only safe
    # as long as the results file is written exclusively by this notebook.
    with RESULTS_FILE.open('rb') as fh:
        previous = pickle.load(fh)
    combined = pd.concat([previous, df_results], ignore_index=True)
    # When a (word, n_examples) pair was measured more than once, the most
    # recent row wins.
    df_all = combined.drop_duplicates(subset=["dutch_word", "n_examples"], keep="last")
    print(f"Merged with existing: {len(df_all)} total results")

# Write the merged table back to disk
with RESULTS_FILE.open('wb') as fh:
    pickle.dump(df_all, fh)

print(f"Saved to {RESULTS_FILE}")

## View Results DataFrame

In [None]:
# Show the identifying, cost, and timing columns of every stored result.
# The bare expression on the last line is what the notebook renders.
overview_columns = [
    "dutch_word",
    "pos",
    "n_examples",
    "phase1_cost",
    "phase2_cost",
    "total_cost",
    "duration",
]
df_all[overview_columns]

## Summary Statistics

In [None]:
# Summarise cost and duration per N_EXAMPLES value and project the cost of
# enriching 100 words.
rule = "=" * 80
print(rule)
print("COST ANALYSIS BY N_EXAMPLES")
print(rule)
print()

# Failed enrichments are stored as rows without cost data. Leaving them in
# would inflate "Count" (which counts dutch_word) while the cost means silently
# skip them, so only rows with a recorded total cost are analysed.
succeeded = df_all[df_all["total_cost"].notna()]
n_excluded = len(df_all) - len(succeeded)
if n_excluded:
    print(f"Excluded {n_excluded} failed enrichment(s) without cost data")
    print()

if succeeded.empty:
    # Nothing to aggregate; avoid building a summary table from no data.
    summary = pd.DataFrame()
    print("No successful enrichments to summarise.")
else:
    # Group by n_examples
    summary = succeeded.groupby("n_examples").agg({
        "total_cost": ["mean", "std", "min", "max"],
        "phase1_cost": "mean",
        "phase2_cost": "mean",
        "duration": "mean",
        "dutch_word": "count"
    }).round(5)

    # Flatten the two-level column index produced by the dict-style agg
    summary.columns = ["Avg Total", "Std", "Min", "Max", "Avg P1", "Avg P2", "Avg Duration", "Count"]
    print(summary)
    print()

    # Cost per 100 words: average per-word cost scaled linearly
    print("PROJECTED COST FOR 100 WORDS:")
    for n in sorted(succeeded["n_examples"].unique()):
        avg = succeeded.loc[succeeded["n_examples"] == n, "total_cost"].mean()
        print(f"  N={n}: ${avg * 100:.2f}")

## Visualization: Cost Comparison

In [None]:
# Bar chart: Monolithic vs Modular costs per word
# Draws two side-by-side bars per row of `valid_results`, one per approach.
#
# NOTE(review): `valid_results` is not assigned by any cell visible in this
# notebook, and the results cell above records phase1_cost / phase2_cost /
# total_cost rather than 'monolithic_cost' / 'modular_cost'. As written this
# cell presumably fails with NameError; it looks like a leftover from an
# earlier monolithic-vs-modular comparison - confirm, then remove or rewrite.
fig, ax = plt.subplots(figsize=(12, 6))

x = range(len(valid_results))
width = 0.35

# Shift each series by half a bar width so the pair straddles the tick
ax.bar([i - width/2 for i in x], valid_results['monolithic_cost'], width, label='Monolithic', alpha=0.8)
ax.bar([i + width/2 for i in x], valid_results['modular_cost'], width, label='Modular', alpha=0.8)

ax.set_xlabel('Word')
ax.set_ylabel('Cost ($)')
ax.set_title('Cost Comparison: Monolithic vs Modular Enrichment')
ax.set_xticks(x)
ax.set_xticklabels(valid_results['dutch_word'], rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Visualization: Cost Savings by POS

In [None]:
# Box plot: Cost savings by POS
# One box per part of speech, with a dashed red line at 0%.
#
# NOTE(review): `valid_results` and its 'cost_savings_pct' column are not
# created by any cell visible in this notebook, so this cell presumably fails
# with NameError - likely a leftover from an earlier comparison; confirm.
fig, ax = plt.subplots(figsize=(10, 6))

valid_results.boxplot(column='cost_savings_pct', by='pos', ax=ax)
ax.set_xlabel('Part of Speech')
ax.set_ylabel('Cost Savings (%)')
ax.set_title('Cost Savings Distribution by Part of Speech')
# The label below is only shown if a legend is drawn; none is requested here
ax.axhline(y=0, color='r', linestyle='--', alpha=0.5, label='Break-even')
plt.suptitle('')  # Remove automatic title

plt.tight_layout()
plt.show()

## Visualization: Cost vs Time Trade-off

In [None]:
# Scatter plot: Cost difference vs Time difference
# One point per row of `valid_results`, coloured by part of speech, with
# dashed zero lines on both axes.
#
# NOTE(review): `valid_results` and its 'cost_difference' / 'time_difference'
# columns are not created by any cell visible in this notebook, so this cell
# presumably fails with NameError - likely a leftover from an earlier
# comparison; confirm.
fig, ax = plt.subplots(figsize=(10, 6))

scatter = ax.scatter(
    valid_results['cost_difference'],
    valid_results['time_difference'],
    # Integer category codes stand in for the POS labels as colour values
    c=valid_results['pos'].astype('category').cat.codes,
    s=100,
    alpha=0.6,
    cmap='viridis'
)

ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
ax.axvline(x=0, color='gray', linestyle='--', alpha=0.5)

ax.set_xlabel('Cost Difference (Modular - Monolithic) ($)')
ax.set_ylabel('Time Difference (Modular - Monolithic) (s)')
ax.set_title('Cost vs Time Trade-off')

# Add legend for POS
# NOTE(review): legend colours are computed as viridis(i / n_unique) over
# unique() order, while the scatter colours come from category codes; the two
# mappings may not line up, so legend swatches may not match the points - verify.
from matplotlib.lines import Line2D
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=plt.cm.viridis(i/len(valid_results['pos'].unique())), 
           markersize=10, label=pos)
    for i, pos in enumerate(valid_results['pos'].unique())
]
ax.legend(handles=legend_elements, title='POS')

plt.tight_layout()
plt.show()

## Projection: 1,000 Words

In [None]:
# Project the cost of 1,000 words for each approach by scaling the mean
# per-word cost, overall and then per part of speech.
#
# NOTE(review): `valid_results` and its 'monolithic_cost' / 'modular_cost'
# columns are not created by any cell visible in this notebook, so this cell
# presumably fails with NameError - likely a leftover from an earlier
# comparison; confirm.
print("="*80)
print("COST PROJECTION FOR 1,000 WORDS")
print("="*80)
print()

avg_mono_cost = valid_results['monolithic_cost'].mean()
avg_mod_cost = valid_results['modular_cost'].mean()

# Linear scaling of the average per-word cost
mono_1k = avg_mono_cost * 1000
mod_1k = avg_mod_cost * 1000
savings_1k = mono_1k - mod_1k
# Savings are expressed relative to the monolithic cost
savings_pct = (savings_1k / mono_1k) * 100

print(f"Monolithic: ${mono_1k:.2f}")
print(f"Modular:    ${mod_1k:.2f}")
print(f"Savings:    ${savings_1k:.2f} ({savings_pct:.1f}%)")
print()

# By POS
print("By Part of Speech:")
for pos in valid_results['pos'].unique():
    pos_data = valid_results[valid_results['pos'] == pos]
    pos_mono = pos_data['monolithic_cost'].mean() * 1000
    pos_mod = pos_data['modular_cost'].mean() * 1000
    pos_savings = pos_mono - pos_mod
    pos_savings_pct = (pos_savings / pos_mono) * 100
    print(f"  {pos:12s} - Mono: ${pos_mono:.2f}, Mod: ${pos_mod:.2f}, Savings: ${pos_savings:.2f} ({pos_savings_pct:.1f}%)")

## Next Steps

1. Add more test words above and re-run the comparison
2. Results are automatically merged with previous runs
3. Analyze consistency issues (lemma/POS/translation mismatches)
4. Consider POS-specific cost patterns

---

# Part 2: Example Count Optimization

Test how the number of examples (N_EXAMPLES) affects cost for the modular approach.

In [None]:
from scripts.enrichment.compare_utils import compare_example_counts

# Configuration
# File in which the Part 2 example-count comparison results are kept between
# runs; separate from RESULTS_FILE used earlier in the notebook.
EXAMPLE_RESULTS_FILE = Path("../data/example_count_comparison.pkl")

## Define Test Parameters

In [None]:
# Words for the example-count sweep; a small set keeps the run quick.
example_test_words = [("lopen", "to walk"), ("huis", "house"), ("mooi", "beautiful")]

# N_EXAMPLES settings to sweep: 1 through 4 inclusive.
example_counts = list(range(1, 5))

# Report the size of the sweep; the call estimate assumes two API calls
# (Phase 1 and Phase 2) per enrichment.
word_total = len(example_test_words)
count_total = len(example_counts)

print(f"Testing {word_total} words with {count_total} different N_EXAMPLES values")
print(f"Total API calls: {word_total * count_total * 2} (Phase 1 + Phase 2)")
print(f"\nExample counts to test: {example_counts}")

## Run Example Count Comparison

**WARNING**: This will make multiple API calls and cost money!

In [None]:
# Sweep every test word across every example count. This is the cell that
# makes the API calls the warning above refers to.
example_results = compare_example_counts(
    example_test_words, example_counts, model=MODEL, verbose=True
)

rule = "=" * 80
print("\n" + rule)
print(f"Completed {len(example_results)} enrichments")
print(rule)

## Save Results

In [None]:
# Load existing results (if any)
# No load/save helper is imported or defined anywhere in this notebook, so the
# table is read and written with pandas' own pickle support instead.
if EXAMPLE_RESULTS_FILE.exists():
    # NOTE(review): unpickling can execute arbitrary code; this is only safe
    # as long as the file is written exclusively by this notebook.
    existing_example_results = pd.read_pickle(EXAMPLE_RESULTS_FILE)
else:
    # An empty frame makes the "nothing stored yet" case fall through below
    existing_example_results = pd.DataFrame()

# Merge with new results
if len(existing_example_results) > 0:
    all_example_results = pd.concat([existing_example_results, example_results], ignore_index=True)
    # Re-measured combinations replace their older rows
    all_example_results = all_example_results.drop_duplicates(
        subset=["dutch_word", "english_hint", "n_examples", "model_used"],
        keep="last"
    )
    print(f"Merged with existing results: {len(all_example_results)} total")
else:
    all_example_results = example_results
    print(f"No existing results, starting with {len(all_example_results)} new results")

# Save to file (create the data directory on first use)
EXAMPLE_RESULTS_FILE.parent.mkdir(parents=True, exist_ok=True)
all_example_results.to_pickle(EXAMPLE_RESULTS_FILE)
print(f"Saved {len(all_example_results)} results to {EXAMPLE_RESULTS_FILE}")

## View Results

In [None]:
# Bare expression: the notebook renders the merged results table as the cell output
all_example_results

## Cost Summary by N_EXAMPLES

In [None]:
# Summarise cost and duration per N_EXAMPLES value, then show how the average
# cost changes from one tested example count to the next.
rule = "=" * 80
print(rule)
print("COST BY N_EXAMPLES")
print(rule)
print()

# Rows without a cost cannot contribute to the statistics. Later cells reuse
# this filtered frame.
valid_example_results = all_example_results[all_example_results["cost"].notna()]

summary_by_n = valid_example_results.groupby("n_examples").agg({
    "cost": ["mean", "std", "min", "max"],
    "duration": "mean",
    "dutch_word": "count"
}).round(5)

# Flatten the two-level column index produced by the dict-style agg
summary_by_n.columns = ["Avg Cost", "Std Cost", "Min Cost", "Max Cost", "Avg Duration (s)", "Count"]
print(summary_by_n)
print()

# Calculate cost increase per example
# Adjacent *tested* counts are compared, so gaps (e.g. 3 -> 5 after merging
# runs with different settings) are reported instead of silently skipped.
print("Cost increase per additional example:")
costs_by_n = summary_by_n["Avg Cost"].to_dict()
tested_counts = sorted(costs_by_n)
for prev_n, n in zip(tested_counts, tested_counts[1:]):
    prev_cost = costs_by_n[prev_n]
    increase = costs_by_n[n] - prev_cost
    if prev_cost:
        pct_text = f"{(increase / prev_cost) * 100:+.1f}%"
    else:
        # A zero baseline has no meaningful percentage change
        pct_text = "n/a"
    line = f"  {prev_n} → {n} examples: +${increase:.5f} ({pct_text})"
    step = n - prev_n
    if step > 1:
        # Spread the jump over the skipped counts for a comparable figure
        line += f" [${increase / step:.5f} per example over {step} steps]"
    print(line)

## Visualization: Cost vs N_EXAMPLES

In [None]:
# Line plot: Average cost by N_EXAMPLES
# Left panel: mean cost per N_EXAMPLES with +/- one standard deviation.
# Right panel: one cost line per word.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Cost by N_EXAMPLES
# Kept in its own variable so the summary table built in the previous cell
# (summary_by_n) is not overwritten by a differently shaped frame.
cost_stats = valid_example_results.groupby("n_examples")["cost"].agg(["mean", "std"])
ax1.errorbar(
    cost_stats.index,
    cost_stats["mean"],
    yerr=cost_stats["std"],
    marker="o",
    linewidth=2,
    markersize=8,
    capsize=5
)
ax1.set_xlabel("N_EXAMPLES")
ax1.set_ylabel("Average Cost ($)")
ax1.set_title("Cost vs Number of Examples")
ax1.grid(alpha=0.3)

# Plot 2: Cost by word and N_EXAMPLES
for word in valid_example_results["dutch_word"].unique():
    word_data = valid_example_results[valid_example_results["dutch_word"] == word]
    # Merged results are stored in run order, not by example count; sort so
    # each line runs left to right instead of zigzagging.
    word_data = word_data.sort_values("n_examples")
    ax2.plot(word_data["n_examples"], word_data["cost"], marker="o", label=word, linewidth=2)

ax2.set_xlabel("N_EXAMPLES")
ax2.set_ylabel("Cost ($)")
ax2.set_title("Cost by Word and N_EXAMPLES")
ax2.legend()
ax2.grid(alpha=0.3)

# Only the tested counts are meaningful x positions, so tick exactly those
for axis in (ax1, ax2):
    axis.set_xticks(list(cost_stats.index))

plt.tight_layout()
plt.show()

## Visualization: Cost by POS and N_EXAMPLES

In [None]:
# Box plot of cost for every (POS, N_EXAMPLES) combination in the valid results
fig, ax = plt.subplots(figsize=(12, 6))

grouping = ["pos", "n_examples"]
valid_example_results.boxplot(column="cost", by=grouping, ax=ax, figsize=(12, 6))

# Blank out the super-title that the grouped boxplot adds on its own
plt.suptitle("")

ax.set_title("Cost Distribution by POS and N_EXAMPLES")
ax.set_xlabel("POS and N_EXAMPLES")
ax.set_ylabel("Cost ($)")

# Combined group labels are long; tilt them so they stay readable
plt.xticks(rotation=45, ha="right")

plt.tight_layout()
plt.show()

## Projection: 1,000 Words with Different N_EXAMPLES

In [None]:
# Project the cost of enriching 1,000 words at each tested N_EXAMPLES value
# (mean per-word cost scaled linearly), then name the cheapest setting.
rule = "=" * 80
print(rule)
print("COST PROJECTION FOR 1,000 WORDS")
print(rule)
print()

counts_tested = sorted(valid_example_results["n_examples"].unique())
for n in counts_tested:
    rows_for_n = valid_example_results[valid_example_results["n_examples"] == n]
    per_word = rows_for_n["cost"].mean()
    projected = per_word * 1000
    print(f"N_EXAMPLES = {n}: ${projected:.2f} (${per_word:.5f} per word)")

print()
print("Recommendation:")
# The setting with the lowest mean cost per word
mean_cost_by_n = valid_example_results.groupby("n_examples")["cost"].mean()
optimal_n = mean_cost_by_n.idxmin()
print(f"  Most cost-effective: N_EXAMPLES = {optimal_n}")
print("  Current setting: N_EXAMPLES = 2 (default)")

## Recommendations

Based on the results:
1. **Lower N_EXAMPLES** = Lower cost, but potentially less comprehensive examples
2. **Higher N_EXAMPLES** = Higher cost, but more learning material per word
3. Consider trade-off between cost and learning value
4. Could use different N_EXAMPLES for different POS types (verbs may benefit from more examples)