# 03 - Agent Evaluation (LLM-as-Judge + Meta-Eval)

This notebook evaluates the complete RAG agent using LLM-as-Judge and meta-evaluation.

## Objectives
- Set up RAG agent with retrieval + generation
- Run complete evaluation pipeline (BEIR + Judge + Meta-eval)
- Analyze judge performance and reliability
- Generate comprehensive evaluation report

In [None]:
# Import required libraries
import sys
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import logging
from datetime import datetime

# Add src to path
sys.path.append('../src')

# Import raglab modules
from core.io import DataLoader
from indexing.index import RAGRetriever, EmbeddingProvider
from eval import RAGEvaluationPipeline, run_evaluation, print_evaluation_summary
from core.interfaces import EvaluationExample

# Set up logging
# INFO level on the root logger so that progress messages from imported
# modules are shown in the notebook output.
logging.basicConfig(level=logging.INFO)
# Logger for this notebook's own messages, named after the current module.
logger = logging.getLogger(__name__)

# Only reached when every import above succeeded.
print("‚úÖ Imports successful")

## Configuration and Setup

In [None]:
# Configuration
# Evaluation-wide settings, defined once here and read by the cells below.
EMBEDDING_DIM = 768   # width of the vectors returned by the embedding function
RETRIEVAL_K = 5       # number of chunks requested from the retriever
TEMPERATURE = 0.1     # Low temperature for reproducible evaluation
MAX_TOKENS = 500      # generation budget reported for the LLM

# Echo the settings so that each run's output records what it was run with.
print("üìã Agent Evaluation Configuration:")
for setting_name, setting_value in (
    ("Retrieval K", RETRIEVAL_K),
    ("LLM Temperature", TEMPERATURE),
    ("Max Tokens", MAX_TOKENS),
):
    print(f"   {setting_name}: {setting_value}")

In [None]:
# Your LLM and embedding functions
def your_llm_function(prompt: str, temperature: float = 0.1, max_tokens: int = 500) -> str:
    """
    Placeholder LLM call used by the LLM-as-Judge evaluation.

    Swap the body for a real API call before running a meaningful
    evaluation, for example:
    - OpenAI: openai.ChatCompletion.create(...)
    - Azure OpenAI: azure_openai.ChatCompletion.create(...)
    - Anthropic: anthropic.messages.create(...)

    Args:
        prompt: Text that would be sent to the model (ignored by the mock).
        temperature: Sampling temperature, echoed back in the mock reply.
        max_tokens: Token limit, echoed back in the mock reply.

    Returns:
        A fixed mock string that embeds the two sampling settings.
    """
    # Mock implementation: report the settings instead of calling a model.
    sampling_settings = f"temp={temperature}, max_tokens={max_tokens}"
    return f"Mock LLM response to prompt ({sampling_settings})"

def your_embedding_function(texts: list) -> np.ndarray:
    """
    Placeholder embedding function.

    Replace this with the same embedding function used in previous
    notebooks. The mock ignores the text content and returns random values,
    so two calls with the same input give different vectors.

    Args:
        texts: Items to embed; only the number of items is used.

    Returns:
        Array of shape (len(texts), EMBEDDING_DIM) with uniform random
        values in [0, 1).
    """
    output_shape = (len(texts), EMBEDDING_DIM)
    return np.random.random(output_shape)

def your_generator_function(query: str, context_chunks: list) -> str:
    """
    Placeholder RAG generation function; replace with a real generator.

    Args:
        query: User question
        context_chunks: List of retrieved text chunks

    Returns:
        A mock answer that quotes the first 30 characters of the query and
        the first 100 characters of the space-joined context, or
        "No context" when no chunks were supplied.
    """
    # Mock implementation
    if context_chunks:
        joined_context = " ".join(context_chunks)
        context_preview = joined_context[:100]
    else:
        context_preview = "No context"

    query_preview = query[:30]
    return f"Mock answer to '{query_preview}...' based on context: {context_preview}..."

# Confirm the cell ran, and warn that the three functions defined above are
# mocks: results from the rest of the notebook are not meaningful until they
# are replaced with real implementations.
print("‚úÖ Function placeholders defined")
print("‚ö†Ô∏è  Remember to replace mock functions with real implementations")

## Load Data and Create Retriever

In [None]:
# Load evaluation examples
# Tasks are read from data/tasks.jsonl through a loader rooted at the parent
# directory.
loader = DataLoader(base_path='..')
tasks = loader.load_tasks('data/tasks.jsonl')

# Wrap every task record in an EvaluationExample. The BEIR failure scale
# factor is optional in the task file and defaults to 1.0.
evaluation_examples = [
    EvaluationExample(
        example_id=task['example_id'],
        question=task['question'],
        reference_answer=task['reference_answer'],
        ground_truth_chunk_ids=task['ground_truth_chunk_ids'],
        beir_failure_scale_factor=task.get('beir_failure_scale_factor', 1.0),
    )
    for task in tasks
]

print(f"üìö Loaded {len(evaluation_examples)} evaluation examples")

# Display sample examples
# Show the first two as a quick sanity check, with the reference answer
# truncated to 80 characters.
print("\nüìñ Sample evaluation examples:")
preview_examples = evaluation_examples[:2]
for example in preview_examples:
    reference_preview = example.reference_answer[:80]
    print(f"  {example.example_id}: {example.question}")
    print(f"    Reference: {reference_preview}...")

In [None]:
# Create retriever
# The embedding callable is wrapped in the project's EmbeddingProvider
# before being handed to the retriever.
embedding_provider = EmbeddingProvider(your_embedding_function)

# The retriever is pointed at artifacts on disk (a parquet docstore and a
# FAISS index file), presumably produced by an earlier notebook -- confirm
# that these files exist before running this cell.
retriever = RAGRetriever(
    embedding_provider=embedding_provider,
    docstore_path='../artifacts/docstore.parquet',
    index_path='../artifacts/faiss.index'
)

print("‚úÖ Created retriever with pre-built index")

# Test retrieval
# Smoke test with the first example's question. Note that this indexes
# evaluation_examples[0], so it raises IndexError if no examples were loaded.
test_query = evaluation_examples[0].question
# k=3 is used for this quick check only; the evaluation itself uses RETRIEVAL_K.
test_results = retriever.retrieve(test_query, k=3)
print(f"üîç Test retrieval for '{test_query[:40]}...': {len(test_results)} results")

## Test RAG Generation

In [None]:
# Test the complete RAG pipeline
# End-to-end smoke test on the first example: retrieve, generate, then show
# the generated answer next to the reference answer.
test_example = evaluation_examples[0]
print(f"üß™ Testing RAG pipeline with example: {test_example.example_id}")
print(f"Query: {test_example.question}")

# Retrieve relevant chunks
retrieved_chunks = retriever.retrieve(test_example.question, k=RETRIEVAL_K)
context_texts = [hit.chunk_text for hit in retrieved_chunks]

# Preview at most the first three hits, numbered from 1, each with its text
# truncated to 60 characters.
print(f"\nüì• Retrieved {len(retrieved_chunks)} chunks:")
preview_hits = retrieved_chunks[:3]
for rank, hit in enumerate(preview_hits, start=1):
    text_preview = hit.chunk_text[:60]
    print(f"  {rank}. {hit.chunk_id}: {text_preview}...")

# Generate answer
generated_answer = your_generator_function(test_example.question, context_texts)
print(f"\nü§ñ Generated answer: {generated_answer}")

print(f"\nüìù Reference answer: {test_example.reference_answer}")

print("\n‚úÖ RAG pipeline test complete")

## Run Complete Evaluation Pipeline

In [None]:
# Create evaluation run name with timestamp
# datetime.now() is called without a timezone, so this is local wall-clock
# time at minute resolution (format: YYYY-MM-DD_HHMM).
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
run_name = f"{timestamp}_agent_evaluation"

print(f"üöÄ Starting complete evaluation pipeline...")
print(f"   Run name: {run_name}")
print(f"   Examples: {len(evaluation_examples)}")
print(f"   This may take several minutes...")

# Run evaluation
# One call drives the whole pipeline with the retriever, the generator and
# the judge LLM defined above. It returns the per-example results together
# with a run directory, which the analysis cells below read from.
# NOTE(review): what exactly is written to run_dir is decided inside
# run_evaluation -- see the eval module.
results, run_dir = run_evaluation(
    examples=evaluation_examples,
    retriever=retriever,
    generator_function=your_generator_function,
    llm_function=your_llm_function,
    run_name=run_name,
    retrieval_k=RETRIEVAL_K
)

print(f"\n‚úÖ Evaluation complete!")
print(f"üìÅ Results saved to: {run_dir}")
print(f"üìä Evaluated {len(results)} examples")

## Analyze Results

In [None]:
# Print evaluation summary
# Delegates to the eval module's summary printer, passing the run directory
# returned by run_evaluation above.
print_evaluation_summary(run_dir)

In [None]:
# Detailed analysis
from eval import load_evaluation_results

# Reload the per-example outputs and the aggregate metrics from the run
# directory.
outputs, metrics = load_evaluation_results(run_dir)

print(f"üìä Detailed Results Analysis:")
print(f"\nüéØ Overall Performance:")
print(f"   Total Examples: {metrics['total_examples']}")
print(f"   Valid Examples: {metrics['valid_examples']}")
print(f"   Error Rate: {metrics['error_rate']:.2%}")

# Convert outputs to DataFrame for analysis
results_df = pd.DataFrame(outputs)


def _flatten_output(output):
    """Collapse one nested evaluation output into a flat row for judge_df."""
    judge_output = output['judge_output']
    meta_output = output['meta_eval_output']
    beir_output = output['beir_metrics']

    return {
        'example_id': output['example_id'],
        'correctness': judge_output['correctness_binary'],
        'hallucination': judge_output['hallucination_binary'],
        # The risk fields are optional; a missing key becomes None.
        'risk_direction': judge_output.get('risk_direction'),
        'risk_impact': judge_output.get('risk_impact'),
        'judge_correct': meta_output['judge_correct'],
        'recall_at_k': beir_output['recall_at_k'],
        'precision_at_k': beir_output['precision_at_k'],
        'ndcg_at_k': beir_output['ndcg_at_k'],
    }


# Extract judge verdicts
judge_data = [_flatten_output(output) for output in outputs]

judge_df = pd.DataFrame(judge_data)
print(f"\nüìã Judge Analysis DataFrame created with {len(judge_df)} examples")

In [None]:
# Per-example detailed results
# Prints one short report per row of judge_df: the judge's verdicts, the
# meta-evaluator's verdict on the judge, the retrieval metrics and, when the
# judge supplied one, the risk assessment.
print("üìã Per-Example Results:")
print("-" * 100)

# Readable names for the judge's risk_direction codes. Built once here
# rather than on every loop iteration.
risk_direction_names = {
    -1: "Care Avoidance Risk",
    0: "No Clear Direction",
    1: "Unexpected Cost Risk",
}

for _, row in judge_df.iterrows():
    print(f"\n{row['example_id']}:")
    print(f"  Correctness: {'‚úÖ' if row['correctness'] else '‚ùå'}")
    print(f"  Hallucination: {'‚ö†Ô∏è' if row['hallucination'] else '‚úÖ'}")
    print(f"  Judge Correct: {'‚úÖ' if row['judge_correct'] else '‚ùå'}")
    print(f"  BEIR Metrics: R={row['recall_at_k']:.3f}, P={row['precision_at_k']:.3f}, nDCG={row['ndcg_at_k']:.3f}")

    # A missing risk_direction is stored as NaN (not None) whenever the
    # column also holds numbers, so an `is not None` check lets those rows
    # through and prints "Unknown (impact: nan)". pd.notna() rejects both
    # None and NaN.
    if pd.notna(row['risk_direction']):
        risk_desc = risk_direction_names.get(row['risk_direction'], "Unknown")
        print(f"  Risk: {risk_desc} (impact: {row['risk_impact']})")

## Visualizations

In [None]:
# Create comprehensive evaluation dashboard
# 2x3 grid: three pie charts for the binary verdicts on the top row, then
# retrieval metrics, the risk breakdown and a retrieval-vs-correctness
# scatter plot on the bottom row.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# 1. Judge Performance Distribution
judge_performance = [
    ('Correct', sum(judge_df['correctness'])),
    ('Incorrect', len(judge_df) - sum(judge_df['correctness']))
]
labels, values = zip(*judge_performance)
axes[0,0].pie(values, labels=labels, autopct='%1.1f%%', colors=['lightgreen', 'lightcoral'])
axes[0,0].set_title('Judge Correctness Distribution')

# 2. Hallucination Detection
hallucination_data = [
    ('No Hallucination', len(judge_df) - sum(judge_df['hallucination'])),
    ('Hallucination Detected', sum(judge_df['hallucination']))
]
labels, values = zip(*hallucination_data)
axes[0,1].pie(values, labels=labels, autopct='%1.1f%%', colors=['lightblue', 'orange'])
axes[0,1].set_title('Hallucination Detection')

# 3. Meta-Evaluator Performance
meta_performance = [
    ('Judge Correct', sum(judge_df['judge_correct'])),
    ('Judge Incorrect', len(judge_df) - sum(judge_df['judge_correct']))
]
labels, values = zip(*meta_performance)
axes[0,2].pie(values, labels=labels, autopct='%1.1f%%', colors=['mediumseagreen', 'tomato'])
axes[0,2].set_title('Meta-Evaluator Assessment')

# 4. BEIR Metrics Distribution
beir_metrics = ['recall_at_k', 'precision_at_k', 'ndcg_at_k']
beir_values = [judge_df[metric].mean() for metric in beir_metrics]
metric_names = ['Recall@K', 'Precision@K', 'nDCG@K']
bars = axes[1,0].bar(metric_names, beir_values, color=['skyblue', 'lightgreen', 'coral'])
axes[1,0].set_title('Average BEIR Metrics')
axes[1,0].set_ylim(0, 1.0)
axes[1,0].set_ylabel('Score')
# Add value labels on bars
for bar, value in zip(bars, beir_values):
    axes[1,0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                  f'{value:.3f}', ha='center', va='bottom')

# 5. Risk Direction Distribution (if available)
risk_directions = judge_df['risk_direction'].dropna()
if len(risk_directions) > 0:
    risk_counts = risk_directions.value_counts().sort_index()
    risk_labels = {-1: 'Care\nAvoidance', 0: 'No Clear\nDirection', 1: 'Unexpected\nCost'}
    # Colours are keyed by risk code rather than by bar position. With a
    # positional list, a run that is missing one category (for example no
    # -1 values) would shift every remaining bar onto the wrong colour.
    risk_colors = {-1: 'lightcoral', 0: 'lightgray', 1: 'orange'}
    labels = [risk_labels.get(idx, f'Risk {idx}') for idx in risk_counts.index]
    # Codes outside the known set get a neutral fallback colour.
    bar_colors = [risk_colors.get(idx, 'lightsteelblue') for idx in risk_counts.index]
    axes[1,1].bar(labels, risk_counts.values, color=bar_colors)
    axes[1,1].set_title('Risk Direction Distribution')
    axes[1,1].set_ylabel('Count')
else:
    # No example carried a risk direction: show a placeholder message
    # instead of an empty chart.
    axes[1,1].text(0.5, 0.5, 'No Risk Data\nAvailable', ha='center', va='center', transform=axes[1,1].transAxes)
    axes[1,1].set_title('Risk Direction Distribution')

# 6. Correlation: Retrieval Quality vs Judge Performance
axes[1,2].scatter(judge_df['recall_at_k'], judge_df['correctness'], alpha=0.7, color='purple')
axes[1,2].set_xlabel('Recall@K')
axes[1,2].set_ylabel('Correctness (0/1)')
axes[1,2].set_title('Retrieval vs Judge Performance')
axes[1,2].grid(True, alpha=0.3)

# The overall title is added after tight_layout and placed slightly above
# the figure (y=1.02) so that it does not collide with the top-row titles.
plt.tight_layout()
plt.suptitle(f'RAG Agent Evaluation Dashboard - {run_name}', fontsize=16, fontweight='bold', y=1.02)
plt.show()

In [None]:
# Example-level performance heatmap
# One column per example and one row per metric; cell values are annotated
# with three decimals.
performance_cols = ['correctness', 'hallucination', 'judge_correct', 'recall_at_k', 'precision_at_k', 'ndcg_at_k']
# Cast everything to float before transposing. The binary indicators may be
# stored as bool or as object (when a value is missing); transposing them
# together with the float metric columns would otherwise produce an
# object-dtype frame that the heatmap cannot colour or format with '.3f'.
# Missing values become NaN.
performance_data = (
    judge_df[['example_id'] + performance_cols]
    .set_index('example_id')
    .astype(float)
)

plt.figure(figsize=(12, 8))
sns.heatmap(performance_data.T, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Score'}, fmt='.3f')
plt.title('Per-Example Performance Heatmap', fontsize=14, fontweight='bold')
plt.ylabel('Metrics')
plt.xlabel('Example ID')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Summary and Insights

In [None]:
# Generate insights and recommendations
# Turns the averages of judge_df into a short text report: headline rates,
# a banded assessment of each rate, then any recommendations whose
# threshold was crossed.
print("üîç Evaluation Insights and Recommendations:")
print("=" * 80)

# 1. Overall Performance
correctness_rate, hallucination_rate, judge_accuracy, avg_recall = (
    judge_df[column].mean()
    for column in ('correctness', 'hallucination', 'judge_correct', 'recall_at_k')
)

print(f"\nüìä Key Performance Indicators:")
print(f"   Correctness Rate: {correctness_rate:.1%}")
print(f"   Hallucination Rate: {hallucination_rate:.1%}")
print(f"   Judge Accuracy: {judge_accuracy:.1%}")
print(f"   Average Recall@{RETRIEVAL_K}: {avg_recall:.3f}")

# 2. Performance Analysis
print(f"\nüéØ Performance Analysis:")

# Correctness bands: >=80% strong, >=60% moderate, otherwise low.
if correctness_rate >= 0.8:
    correctness_verdict = "   ‚úÖ Strong correctness performance (‚â•80%)"
elif correctness_rate >= 0.6:
    correctness_verdict = "   ‚ö†Ô∏è  Moderate correctness performance (60-80%)"
else:
    correctness_verdict = "   ‚ùå Low correctness performance (<60%) - needs improvement"
print(correctness_verdict)

# Hallucination bands: <=20% low, <=40% moderate, otherwise high.
if hallucination_rate <= 0.2:
    hallucination_verdict = "   ‚úÖ Low hallucination rate (‚â§20%)"
elif hallucination_rate <= 0.4:
    hallucination_verdict = "   ‚ö†Ô∏è  Moderate hallucination rate (20-40%)"
else:
    hallucination_verdict = "   ‚ùå High hallucination rate (>40%) - requires attention"
print(hallucination_verdict)

# Judge reliability has a single 80% cut-off.
if judge_accuracy >= 0.8:
    judge_verdict = "   ‚úÖ High judge reliability (‚â•80%)"
else:
    judge_verdict = "   ‚ö†Ô∏è  Judge reliability could be improved (<80%)"
print(judge_verdict)

# 3. Recommendations
print(f"\nüí° Recommendations:")

# Each entry pairs a trigger condition with the lines to print when it holds.
recommendations = (
    (
        avg_recall < 0.7,
        (
            "   üîÑ Consider improving retrieval:",
            "     - Try different embedding models",
            "     - Adjust chunk size and overlap",
            "     - Experiment with query expansion",
        ),
    ),
    (
        hallucination_rate > 0.3,
        (
            "   üõ°Ô∏è  Reduce hallucinations:",
            "     - Improve prompt engineering",
            "     - Add stricter grounding instructions",
            "     - Implement confidence scoring",
        ),
    ),
    (
        correctness_rate < 0.7,
        (
            "   üìà Improve answer quality:",
            "     - Enhance generation prompts",
            "     - Increase retrieval context",
            "     - Consider few-shot examples",
        ),
    ),
)
for is_triggered, advice_lines in recommendations:
    if is_triggered:
        for advice_line in advice_lines:
            print(advice_line)

print(f"\nüìÅ Full evaluation details available in: {run_dir}")
print(f"üéâ Agent evaluation complete!")