# Enrichment Cost Comparison

Compare the **actual costs** of monolithic vs modular enrichment approaches.

This notebook:
- Runs both approaches on multiple test words
- Tracks actual costs reported by the OpenAI API
- Saves results to pickle for later analysis
- Visualizes cost comparisons and consistency

## Setup

In [None]:
import sys
from pathlib import Path

# Add project root to path
sys.path.insert(0, '..')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D

from scripts.enrichment.compare_utils import (
    compare_example_counts,
    compare_multiple_words,
    save_results,
    load_results,
    load_or_create,
    merge_results,
)

# Plot defaults applied to every figure created after this cell runs
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

## Configuration

In [None]:
# Pickle file that comparison results are saved to and loaded from
RESULTS_FILE = Path("..") / "data" / "enrichment_comparison_results.pkl"

# Model name handed to the comparison helpers
MODEL = "gpt-4o-2024-08-06"

## Define Test Words

Add words to test here. Mix different part-of-speech (POS) types for a comprehensive comparison.

In [None]:
# Words to compare, as (dutch_word, english_hint) pairs.
# Order is preserved: it is the order in which the words are processed and listed.
test_words = [
    ("lopen", "to walk"),
    ("huis", "house"),
    ("mooi", "beautiful"),
    ("eten", "to eat"),
    ("groot", "big"),
]

# Echo the selection so the run is self-describing
print(f"Testing {len(test_words)} words")
for pair in test_words:
    print("  - {} ({})".format(*pair))

## Run Comparison

This will call both enrichment approaches for each word and collect results.

In [None]:
# Compare both approaches for every test word.
# NOTE: this performs real API calls and therefore costs money on every run.
new_results = compare_multiple_words(test_words, model=MODEL, verbose=True)

# Completion banner: blank line, rule, message, rule
print(
    "\n" + "=" * 80,
    f"Completed {len(new_results)} comparisons",
    "=" * 80,
    sep="\n",
)

## Save Results

Merge with existing results (if any) and save to pickle file.

In [None]:
# Whatever was stored by earlier runs (length 0 means there is nothing to merge)
existing_results = load_or_create(RESULTS_FILE)

if len(existing_results) == 0:
    # First run: the new results are the whole data set
    all_results = new_results
    print(f"No existing results, starting with {len(all_results)} new results")
else:
    # Later runs: fold the new rows into the stored ones, dropping duplicates
    all_results = merge_results(existing_results, new_results, drop_duplicates=True)
    print(f"Merged {len(existing_results)} existing + {len(new_results)} new = {len(all_results)} total results")

# Persist the combined results for the next session
save_results(all_results, RESULTS_FILE)

## View Results DataFrame

In [None]:
# Display the full results table: the rows saved above, i.e. previously stored
# results merged with this run's new results (or only the new ones on a first run).
all_results

## Summary Statistics

In [None]:
print("=" * 80)
print("SUMMARY STATISTICS")
print("=" * 80)
print()

# A comparison counts as successful when a monolithic cost was recorded for it
has_cost = all_results['monolithic_cost'].notna()

# Overall stats
print(f"Total words tested: {len(all_results)}")
print(f"Successful comparisons: {has_cost.sum()}")
print(f"Errors: {(~has_cost).sum()}")
print()

# Cost stats (excluding errors). `valid_results` is reused by the cells below.
valid_results = all_results[has_cost]

print("💰 COST STATISTICS:")
print(f"  Monolithic - Avg: ${valid_results['monolithic_cost'].mean():.5f}, Total: ${valid_results['monolithic_cost'].sum():.5f}")
print(f"  Modular    - Avg: ${valid_results['modular_cost'].mean():.5f}, Total: ${valid_results['modular_cost'].sum():.5f}")
# cost_difference is negated here so that a positive number means modular is cheaper
print(f"  Savings    - Avg: ${-valid_results['cost_difference'].mean():.5f} ({valid_results['cost_savings_pct'].mean():.1f}%)")
print()

# Consistency stats: how often the two approaches agree on each field
print("✓ CONSISTENCY:")
for label, column in [
    ("Lemma matches", 'lemma_match'),
    ("POS matches", 'pos_match'),
    ("Translation matches", 'translation_match'),
]:
    matches = valid_results[column]
    heading = f"{label}:"
    print(f"  {heading:<20} {matches.sum()} / {len(valid_results)} ({matches.mean()*100:.1f}%)")
print()

# By POS. Named aggregation ties each output column to its source column and
# function, instead of renaming the aggregated columns by position afterwards.
print("📊 BY PART OF SPEECH:")
pos_summary = valid_results.groupby('pos').agg(**{
    'Count': ('dutch_word', 'count'),
    'Mono Avg Cost': ('monolithic_cost', 'mean'),
    'Mod Avg Cost': ('modular_cost', 'mean'),
    'Savings %': ('cost_savings_pct', 'mean'),
}).round(5)
print(pos_summary)

## Visualization: Cost Comparison

In [None]:
# Bar chart: Monolithic vs Modular costs per word
fig, ax = plt.subplots(figsize=(12, 6))

x = range(len(valid_results))
width = 0.35

ax.bar([i - width/2 for i in x], valid_results['monolithic_cost'], width, label='Monolithic', alpha=0.8)
ax.bar([i + width/2 for i in x], valid_results['modular_cost'], width, label='Modular', alpha=0.8)

ax.set_xlabel('Word')
ax.set_ylabel('Cost ($)')
ax.set_title('Cost Comparison: Monolithic vs Modular Enrichment')
ax.set_xticks(x)
ax.set_xticklabels(valid_results['dutch_word'], rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Visualization: Cost Savings by POS

In [None]:
# Box plot: Cost savings by POS
fig, ax = plt.subplots(figsize=(10, 6))

valid_results.boxplot(column='cost_savings_pct', by='pos', ax=ax)
ax.set_xlabel('Part of Speech')
ax.set_ylabel('Cost Savings (%)')
ax.set_title('Cost Savings Distribution by Part of Speech')
ax.axhline(y=0, color='r', linestyle='--', alpha=0.5, label='Break-even')
plt.suptitle('')  # Remove automatic title

plt.tight_layout()
plt.show()

## Visualization: Cost vs Time Trade-off

In [None]:
# Scatter plot: Cost difference vs Time difference
fig, ax = plt.subplots(figsize=(10, 6))

scatter = ax.scatter(
    valid_results['cost_difference'],
    valid_results['time_difference'],
    c=valid_results['pos'].astype('category').cat.codes,
    s=100,
    alpha=0.6,
    cmap='viridis'
)

ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
ax.axvline(x=0, color='gray', linestyle='--', alpha=0.5)

ax.set_xlabel('Cost Difference (Modular - Monolithic) ($)')
ax.set_ylabel('Time Difference (Modular - Monolithic) (s)')
ax.set_title('Cost vs Time Trade-off')

# Add legend for POS
from matplotlib.lines import Line2D
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=plt.cm.viridis(i/len(valid_results['pos'].unique())), 
           markersize=10, label=pos)
    for i, pos in enumerate(valid_results['pos'].unique())
]
ax.legend(handles=legend_elements, title='POS')

plt.tight_layout()
plt.show()

## Projection: 1,000 Words

In [None]:
# Number of words the measured per-word averages are scaled up to
PROJECTION_WORDS = 1000

print("=" * 80)
print(f"COST PROJECTION FOR {PROJECTION_WORDS:,} WORDS")
print("=" * 80)
print()

avg_mono_cost = valid_results['monolithic_cost'].mean()
avg_mod_cost = valid_results['modular_cost'].mean()

mono_1k = avg_mono_cost * PROJECTION_WORDS
mod_1k = avg_mod_cost * PROJECTION_WORDS
savings_1k = mono_1k - mod_1k
# A zero baseline has no meaningful percentage; report nan instead of dividing by zero
savings_pct = (savings_1k / mono_1k) * 100 if mono_1k else float('nan')

print(f"Monolithic: ${mono_1k:.2f}")
print(f"Modular:    ${mod_1k:.2f}")
print(f"Savings:    ${savings_1k:.2f} ({savings_pct:.1f}%)")
print()

# By POS: one grouped pass instead of re-filtering the frame for every POS.
# sort=False keeps the order in which each POS first appears; rows without a POS
# are left out by groupby rather than breaking the string formatting below.
print("By Part of Speech:")
pos_projection = (
    valid_results
    .groupby('pos', sort=False)[['monolithic_cost', 'modular_cost']]
    .mean()
    * PROJECTION_WORDS
)
for pos, pos_mono, pos_mod in pos_projection.itertuples(name=None):
    pos_savings = pos_mono - pos_mod
    pos_savings_pct = (pos_savings / pos_mono) * 100 if pos_mono else float('nan')
    print(f"  {str(pos):12s} - Mono: ${pos_mono:.2f}, Mod: ${pos_mod:.2f}, Savings: ${pos_savings:.2f} ({pos_savings_pct:.1f}%)")

## Next Steps

1. Add more test words above and re-run the comparison
2. Results are automatically merged with previous runs
3. Analyze consistency issues (lemma/POS/translation mismatches)
4. Consider POS-specific cost patterns

---

# Part 2: Example Count Optimization

Test how the number of examples (N_EXAMPLES) affects cost for the modular approach.

In [None]:
# Configuration for Part 2.
# Pickle file that the example-count comparison results are saved to and loaded from.
# (`compare_example_counts` is imported in the notebook's import cell at the top.)
EXAMPLE_RESULTS_FILE = Path("../data/example_count_comparison.pkl")

## Define Test Parameters

In [None]:
# Test words (use a subset for faster testing)
example_test_words = [
    ("lopen", "to walk"),
    ("huis", "house"),
    ("mooi", "beautiful"),
]

# Example counts to test
example_counts = [1, 2, 3, 4]

print(f"Testing {len(example_test_words)} words with {len(example_counts)} different N_EXAMPLES values")
print(f"Total API calls: {len(example_test_words) * len(example_counts) * 2} (Phase 1 + Phase 2)")
print(f"\nExample counts to test: {example_counts}")

## Run Example Count Comparison

**WARNING**: This will make multiple API calls and cost money!

In [None]:
# Run comparison
example_results = compare_example_counts(
    example_test_words, 
    example_counts, 
    model=MODEL, 
    verbose=True
)

print("\n" + "="*80)
print(f"Completed {len(example_results)} enrichments")
print("="*80)

## Save Results

In [None]:
# Load existing results (if any)
existing_example_results = load_or_create(EXAMPLE_RESULTS_FILE)

# Merge with new results
if len(existing_example_results) > 0:
    all_example_results = pd.concat([existing_example_results, example_results], ignore_index=True)
    all_example_results = all_example_results.drop_duplicates(
        subset=["dutch_word", "english_hint", "n_examples", "model_used"], 
        keep="last"
    )
    print(f"Merged with existing results: {len(all_example_results)} total")
else:
    all_example_results = example_results
    print(f"No existing results, starting with {len(all_example_results)} new results")

# Save to file
save_results(all_example_results, EXAMPLE_RESULTS_FILE)

## View Results

In [None]:
# Display the example-count results table saved above: stored rows merged with
# this run's rows (or only this run's rows when nothing was stored before).
all_example_results

## Cost Summary by N_EXAMPLES

In [None]:
print("=" * 80)
print("COST BY N_EXAMPLES")
print("=" * 80)
print()

# Only rows with a recorded cost take part; reused by the cells below
valid_example_results = all_example_results[all_example_results["cost"].notna()]

# Named aggregation ties each output column to its source column and function,
# instead of renaming the aggregated columns by position afterwards.
summary_by_n = valid_example_results.groupby("n_examples").agg(**{
    "Avg Cost": ("cost", "mean"),
    "Std Cost": ("cost", "std"),
    "Min Cost": ("cost", "min"),
    "Max Cost": ("cost", "max"),
    "Avg Duration (s)": ("duration", "mean"),
    "Count": ("dutch_word", "count"),
}).round(5)
print(summary_by_n)
print()

# Calculate the cost increase from each tested N_EXAMPLES value to the next one.
# - Unrounded means are used so the percentages are not distorted by the 5-decimal
#   rounding of the table above.
# - Neighbouring *tested* values are compared, so a sweep such as [1, 2, 4] still
#   reports every step instead of silently skipping gaps.
print("Cost increase per additional example:")
avg_cost_by_n = valid_example_results.groupby("n_examples")["cost"].mean()  # index is sorted ascending
steps = list(avg_cost_by_n.items())
for (prev_n, prev_cost), (n, cost) in zip(steps, steps[1:]):
    increase = cost - prev_cost
    sign = "+" if increase >= 0 else "-"
    # A zero baseline has no meaningful relative change
    pct_text = f"{(increase / prev_cost) * 100:+.1f}%" if prev_cost else "n/a"
    print(f"  {prev_n} → {n} examples: {sign}${abs(increase):.5f} ({pct_text})")

## Visualization: Cost vs N_EXAMPLES

In [None]:
# Line plot: Average cost by N_EXAMPLES
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Cost by N_EXAMPLES
summary_by_n = valid_example_results.groupby("n_examples")["cost"].agg(["mean", "std"])
ax1.errorbar(
    summary_by_n.index, 
    summary_by_n["mean"], 
    yerr=summary_by_n["std"],
    marker="o", 
    linewidth=2, 
    markersize=8,
    capsize=5
)
ax1.set_xlabel("N_EXAMPLES")
ax1.set_ylabel("Average Cost ($)")
ax1.set_title("Cost vs Number of Examples")
ax1.grid(alpha=0.3)

# Plot 2: Cost by word and N_EXAMPLES
for word in valid_example_results["dutch_word"].unique():
    word_data = valid_example_results[valid_example_results["dutch_word"] == word]
    ax2.plot(word_data["n_examples"], word_data["cost"], marker="o", label=word, linewidth=2)

ax2.set_xlabel("N_EXAMPLES")
ax2.set_ylabel("Cost ($)")
ax2.set_title("Cost by Word and N_EXAMPLES")
ax2.legend()
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## Visualization: Cost by POS and N_EXAMPLES

In [None]:
# Box plot: Cost distribution by POS and N_EXAMPLES
fig, ax = plt.subplots(figsize=(12, 6))

valid_example_results.boxplot(
    column="cost", 
    by=["pos", "n_examples"], 
    ax=ax,
    figsize=(12, 6)
)
ax.set_xlabel("POS and N_EXAMPLES")
ax.set_ylabel("Cost ($)")
ax.set_title("Cost Distribution by POS and N_EXAMPLES")
plt.suptitle("")  # Remove automatic title
plt.xticks(rotation=45, ha="right")

plt.tight_layout()
plt.show()

## Projection: 1,000 Words with Different N_EXAMPLES

In [None]:
# Number of words the measured per-word averages are scaled up to
PROJECTION_WORDS = 1000

print("=" * 80)
print(f"COST PROJECTION FOR {PROJECTION_WORDS:,} WORDS")
print("=" * 80)
print()

# Average cost per word for each N_EXAMPLES, computed once and reused for both the
# projection lines and the recommendation (index is sorted ascending).
avg_cost_by_n = valid_example_results.groupby("n_examples")["cost"].mean()

for n, avg_cost in avg_cost_by_n.items():
    cost_1k = avg_cost * PROJECTION_WORDS
    print(f"N_EXAMPLES = {n}: ${cost_1k:.2f} (${avg_cost:.5f} per word)")

print()
print("Recommendation:")
if avg_cost_by_n.empty:
    # idxmin() raises on an empty series, e.g. when every run failed
    print("  No valid results available - run the comparison first")
else:
    # NOTE(review): this selects the *cheapest* setting by average cost only; it
    # does not weigh the learning value of having more examples per word.
    optimal_n = avg_cost_by_n.idxmin()
    print(f"  Most cost-effective: N_EXAMPLES = {optimal_n}")
print("  Current setting: N_EXAMPLES = 2 (default)")

## Recommendations

Based on the results:
1. **Lower N_EXAMPLES** = Lower cost, but potentially less comprehensive examples
2. **Higher N_EXAMPLES** = Higher cost, but more learning material per word
3. Consider trade-off between cost and learning value
4. Could use different N_EXAMPLES for different POS types (verbs may benefit from more examples)