# 02 - Retrieval Evaluation (BEIR Metrics)

This notebook evaluates the retrieval system using BEIR-style metrics.

## Objectives
- Load evaluation tasks with ground truth chunk IDs
- Run retrieval for each query
- Compute BEIR metrics (Recall@K, Precision@K, nDCG@K)
- Analyze retrieval performance

In [None]:
# Import required libraries
import sys
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import logging

# Add src to path.
# The directory is resolved against the current working directory once, so the
# entry keeps pointing at the same place if the working directory changes
# later. The membership check makes the cell idempotent: re-running it does
# not pile up duplicate entries in sys.path.
_src_dir = str(Path('../src').resolve())
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# Import raglab modules
from core.io import DataLoader, RunManager
from indexing.index import RAGRetriever, EmbeddingProvider
from beir_metrics import compute_beir_metrics
from core.interfaces import EvaluationExample

# Logging: create this notebook's named logger and configure root logging
# at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

print("‚úÖ Imports successful")

## Configuration

In [None]:
# Evaluation settings
K_VALUES = [1, 3, 5, 10]  # cut-offs at which every metric is computed
EMBEDDING_DIM = 768


# Embedding function; it has to be the same one the index was built with
def your_embedding_function(texts: list) -> np.ndarray:
    """
    Placeholder embedder: swap in the embedding function from notebook 01.

    Returns an array with one row per input text and EMBEDDING_DIM columns.
    The values are random, so retrieval results carry no meaning until this
    function is replaced.
    """
    n_texts = len(texts)
    output_shape = (n_texts, EMBEDDING_DIM)
    return np.random.random(output_shape)


# Echo the active settings so they are visible in the notebook output
config_summary = (
    "üìã Evaluation configuration:",
    f"   K values: {K_VALUES}",
    f"   Embedding dimension: {EMBEDDING_DIM}",
    "‚ö†Ô∏è  Ensure embedding function matches the one used for indexing",
)
for summary_line in config_summary:
    print(summary_line)

## Load Data and Retriever

In [None]:
# Read the evaluation tasks from disk
loader = DataLoader(base_path='..')
tasks = loader.load_tasks('data/tasks.jsonl')

print(f"üìö Loaded {len(tasks)} evaluation tasks")

# Wrap every raw task record in an EvaluationExample.
# 'beir_failure_scale_factor' is optional in a record and defaults to 1.0.
evaluation_examples = [
    EvaluationExample(
        example_id=record['example_id'],
        question=record['question'],
        reference_answer=record['reference_answer'],
        ground_truth_chunk_ids=record['ground_truth_chunk_ids'],
        beir_failure_scale_factor=record.get('beir_failure_scale_factor', 1.0),
    )
    for record in tasks
]

print(f"‚úÖ Created {len(evaluation_examples)} evaluation examples")

# Preview the first three examples
print("\nüìñ Sample evaluation tasks:")
for sample in evaluation_examples[:3]:
    print(f"  {sample.example_id}: {sample.question}")
    print(f"    Ground truth chunks: {sample.ground_truth_chunk_ids}")

In [None]:
# Build the retriever from the saved docstore and FAISS index files
embedding_provider = EmbeddingProvider(your_embedding_function)

docstore_file = '../artifacts/docstore.parquet'
index_file = '../artifacts/faiss.index'
retriever = RAGRetriever(
    embedding_provider=embedding_provider,
    docstore_path=docstore_file,
    index_path=index_file,
)

print("‚úÖ Loaded retriever with pre-built index")

# Smoke test: run the first evaluation question through the retriever
first_question = evaluation_examples[0].question
test_results = retriever.retrieve(first_question, k=3)
print(f"üîç Test query returned {len(test_results)} results")

## Run Retrieval Evaluation

In [None]:
# Run retrieval evaluation for different k values.
# Each question is retrieved once at the largest cut-off; the smaller
# cut-offs are scored on prefixes of that same result list.
# NOTE(review): the prefix slicing presumably relies on retrieve() returning
# results ordered best-first - confirm against RAGRetriever.
retrieval_results = []

# The largest cut-off is the same for every example, so compute it once
# instead of on every loop iteration.
max_k = max(K_VALUES)

for example in evaluation_examples:
    print(f"üîç Evaluating: {example.example_id}")

    retrieved_chunks = retriever.retrieve(example.question, k=max_k)

    # One flat record per example: identifying fields first, then one
    # recall/precision/nDCG triple per cut-off.
    example_results = {
        'example_id': example.example_id,
        'question': example.question,
        'ground_truth_chunk_ids': example.ground_truth_chunk_ids,
        'retrieved_chunk_ids': [chunk.chunk_id for chunk in retrieved_chunks]
    }

    for k in K_VALUES:
        # Compute BEIR metrics on the top-k prefix of the retrieved list
        metrics = compute_beir_metrics(
            retrieved_chunks[:k],
            example.ground_truth_chunk_ids,
            k=k
        )

        example_results[f'recall_at_{k}'] = metrics.recall_at_k
        example_results[f'precision_at_{k}'] = metrics.precision_at_k
        example_results[f'ndcg_at_{k}'] = metrics.ndcg_at_k

    retrieval_results.append(example_results)

print(f"\n‚úÖ Completed retrieval evaluation for {len(evaluation_examples)} examples")

## Analyze Results

In [None]:
# Tabulate the per-example records for analysis
results_df = pd.DataFrame(retrieval_results)

print("üìä Retrieval Results:")
print(results_df.head())

# Mean of every metric over all examples, reported per cut-off
print("\nüìà Average BEIR Metrics:")

for k in K_VALUES:
    metric_columns = (f'recall_at_{k}', f'precision_at_{k}', f'ndcg_at_{k}')
    avg_recall, avg_precision, avg_ndcg = (
        results_df[column].mean() for column in metric_columns
    )

    print(f"\n  K={k}:")
    print(f"    Recall@{k}:    {avg_recall:.3f}")
    print(f"    Precision@{k}: {avg_precision:.3f}")
    print(f"    nDCG@{k}:      {avg_ndcg:.3f}")

In [None]:
# Detailed per-example analysis at a single reporting cut-off.
# k=5 is preferred. When 5 is not one of the evaluated cut-offs, the
# recall_at_5 / precision_at_5 / ndcg_at_5 columns do not exist, so fall back
# to the largest evaluated k instead of failing with a KeyError.
report_k = 5 if 5 in K_VALUES else max(K_VALUES)

print("üìã Per-Example Results:")
print("-" * 80)

for _, row in results_df.iterrows():
    top_ids = row['retrieved_chunk_ids'][:report_k]

    print(f"\n{row['example_id']}: {row['question'][:50]}...")
    print(f"  Ground truth: {row['ground_truth_chunk_ids']}")
    print(f"  Retrieved: {top_ids}")

    # Show metrics at the reporting cut-off
    recall = row[f'recall_at_{report_k}']
    precision = row[f'precision_at_{report_k}']
    ndcg = row[f'ndcg_at_{report_k}']
    print(f"  Metrics@{report_k}: R={recall:.3f}, P={precision:.3f}, nDCG={ndcg:.3f}")

    # Check if ground truth chunks are in the top-k retrieved results
    retrieved_set = set(top_ids)
    gt_set = set(row['ground_truth_chunk_ids'])
    found = gt_set.intersection(retrieved_set)

    if found:
        print(f"  ‚úÖ Found ground truth chunks: {list(found)}")
    else:
        print(f"  ‚ùå No ground truth chunks found in top-{report_k}")

## Visualization

In [None]:
# One panel per metric, plotting its mean over all examples against K
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

metrics = ['recall', 'precision', 'ndcg']
colors = ['blue', 'green', 'red']

for panel, metric, line_color in zip(axes, metrics, colors):
    k_axis = list(K_VALUES)
    mean_by_k = [results_df[f'{metric}_at_{k}'].mean() for k in K_VALUES]

    panel.plot(k_axis, mean_by_k, marker='o', color=line_color, linewidth=2, markersize=8)
    panel.set_title(f'{metric.capitalize()}@K', fontsize=14, fontweight='bold')
    panel.set_xlabel('K')
    panel.set_ylabel(f'{metric.capitalize()}')
    panel.grid(True, alpha=0.3)
    # Same fixed y-range on every panel
    panel.set_ylim(0, 1.0)

# The figure title is added after tight_layout and placed slightly above the
# axes area (y=1.02).
plt.tight_layout()
plt.suptitle('BEIR Retrieval Metrics by K Value', fontsize=16, fontweight='bold', y=1.02)
plt.show()

In [None]:
# Heatmap of every metric (rows) for every example (columns).
# Column order: for each K in turn -> recall, precision, nDCG.
metric_cols = [
    column
    for k in K_VALUES
    for column in (f'recall_at_{k}', f'precision_at_{k}', f'ndcg_at_{k}')
]

selected_columns = ['example_id'] + metric_cols
heatmap_data = results_df[selected_columns].set_index('example_id')

plt.figure(figsize=(12, 8))
# Transposed so metrics run down the y-axis and examples along the x-axis
sns.heatmap(
    heatmap_data.T,
    annot=True,
    cmap='RdYlGn',
    vmin=0,
    vmax=1,
    cbar_kws={'label': 'Metric Value'},
    fmt='.3f',
)
plt.title('Retrieval Metrics Heatmap by Example', fontsize=14, fontweight='bold')
plt.ylabel('Metric@K')
plt.xlabel('Example ID')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Save Results

In [None]:
# Persist this evaluation run: configuration, per-example outputs and
# averaged metrics all go into a fresh run directory.
run_manager = RunManager()
run_dir = run_manager.create_run_dir('retrieval_evaluation')

# Settings that describe this run
config = {
    'evaluation_type': 'retrieval_beir_metrics',
    'k_values': K_VALUES,
    'num_examples': len(evaluation_examples),
    'embedding_dim': EMBEDDING_DIM
}
run_manager.save_config(config, run_dir)

# Per-example records
run_manager.save_outputs(retrieval_results, run_dir)

# Averages over all examples, one recall/precision/nDCG triple per K
summary_metrics = {}
for k in K_VALUES:
    summary_metrics.update({
        f'avg_recall_at_{k}': results_df[f'recall_at_{k}'].mean(),
        f'avg_precision_at_{k}': results_df[f'precision_at_{k}'].mean(),
        f'avg_ndcg_at_{k}': results_df[f'ndcg_at_{k}'].mean(),
    })

run_manager.save_metrics(summary_metrics, run_dir)

print(f"‚úÖ Saved retrieval evaluation results to: {run_dir}")
print("üìÅ Run directory contains:")
for entry in Path(run_dir).iterdir():
    print(f"   {entry.name}")

## Summary

In [None]:
# Final summary of the retrieval evaluation
print("üéØ Retrieval Evaluation Summary:")
print(f"   Examples evaluated: {len(evaluation_examples)}")
print(f"   K values tested: {K_VALUES}")

# Best performing K.
# The per-K means are computed once and reused both for picking the best K
# and for printing its value. On ties, max() keeps the first K in K_VALUES.
mean_recall = {k: results_df[f'recall_at_{k}'].mean() for k in K_VALUES}
mean_ndcg = {k: results_df[f'ndcg_at_{k}'].mean() for k in K_VALUES}
best_recall_k = max(K_VALUES, key=mean_recall.get)
best_ndcg_k = max(K_VALUES, key=mean_ndcg.get)

print("\nüìä Best Performance:")
print(f"   Best Recall@K: K={best_recall_k} ({mean_recall[best_recall_k]:.3f})")
print(f"   Best nDCG@K: K={best_ndcg_k} ({mean_ndcg[best_ndcg_k]:.3f})")

# Identify problematic examples at the reporting cut-off.
# k=5 is preferred. When 5 is not one of the evaluated cut-offs, the
# recall_at_5 column does not exist, so fall back to the largest evaluated k
# instead of failing with a KeyError.
report_k = 5 if 5 in K_VALUES else max(K_VALUES)
recall_col = f'recall_at_{report_k}'

poor_examples = results_df[results_df[recall_col] < 0.5]
if len(poor_examples) > 0:
    print(f"\n‚ö†Ô∏è  Examples with low recall@{report_k} (< 0.5): {len(poor_examples)}")
    for _, row in poor_examples.iterrows():
        print(f"     {row['example_id']}: recall={row[recall_col]:.3f}")
else:
    print(f"\n‚úÖ All examples have recall@{report_k} ‚â• 0.5")

print("\nüéâ Ready for notebook 03_agent_eval.ipynb!")