# RAG Evaluation Notebook

In [7]:
# Setup cell: imports, evaluator construction and dataset discovery.
import inspect
import json
import os
import re
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Fix import path (same as before).
# Guarded so that re-running this cell does not keep prepending duplicates.
project_root = os.path.abspath('../..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from evaluations.runners.rag_evaluator import RAGEvaluator

print("✅ Imports successful!")

# Initialize evaluator
evaluator = RAGEvaluator()
print(f"📁 Results directory: {evaluator.results_dir}")

# Find the most recent dataset files
dataset_dir = Path("../datasets/rag_evaluation")
chunks_files = list(dataset_dir.glob("chunks_*.json"))
queries_files = list(dataset_dir.glob("queries_*.json"))

if chunks_files and queries_files:
    # Dataset filenames end in a YYYYMMDD_HHMMSS stamp. Rank by that stamp
    # (filename as tie-breaker) rather than by the whole name, so that e.g.
    # "chunks_seattle_50listings_..." cannot outrank a newer
    # "chunks_seattle_100listings_..." just because "5" sorts after "1".
    # Files without a stamp rank lowest.
    stamp_re = re.compile(r"\d{8}_\d{6}")
    chunks_file = str(max(chunks_files, key=lambda p: (stamp_re.findall(p.stem)[-1:], p.name)))
    queries_file = str(max(queries_files, key=lambda p: (stamp_re.findall(p.stem)[-1:], p.name)))

    # The two files are picked independently, so flag a pair whose stamps differ.
    if stamp_re.findall(Path(chunks_file).stem)[-1:] != stamp_re.findall(Path(queries_file).stem)[-1:]:
        print("⚠️ Chunks and queries files have different timestamps; they may come from different dataset builds.")

    print("\nUsing dataset files:")
    print(f"📁 Chunks: {chunks_file}")
    print(f"📁 Queries: {queries_file}")

    # Load to check dataset size.
    # Explicit UTF-8 so the read does not depend on the platform's locale encoding.
    with open(chunks_file, 'r', encoding='utf-8') as f:
        chunks_data = json.load(f)
    with open(queries_file, 'r', encoding='utf-8') as f:
        queries_data = json.load(f)

    print("\nDataset info:")
    print(f"  Total chunks: {len(chunks_data)}")
    print(f"  Total queries: {len(queries_data)}")

    # Check if the new parameter exists on the installed evaluator
    sig = inspect.signature(RAGEvaluator.run_evaluation)
    if 'use_llm_judge' in sig.parameters:
        print("🧠 use_llm_judge parameter available!")
    else:
        print("❌ use_llm_judge parameter NOT found")

else:
    print("❌ No dataset files found. Please run dataset_creation.ipynb first.")
    # Later cells test these for truthiness before running an evaluation.
    chunks_file = None
    queries_file = None

✅ Imports successful!
📁 Results directory: evaluations/results/rag_evaluation

Using dataset files:
📁 Chunks: ../datasets/rag_evaluation/chunks_seattle_100listings_20250529_204239.json
📁 Queries: ../datasets/rag_evaluation/queries_seattle_100listings_20250529_204239.json

Dataset info:
  Total chunks: 250
  Total queries: 596
🧠 use_llm_judge parameter available!


In [12]:
# Re-create the evaluator and re-discover the dataset files.
import re

# Initialize evaluator
evaluator = RAGEvaluator()
print(f"Results will be saved to: {evaluator.results_dir}")

# Find the most recent dataset files
dataset_dir = Path("../datasets/rag_evaluation")
chunks_files = list(dataset_dir.glob("chunks_*.json"))
queries_files = list(dataset_dir.glob("queries_*.json"))

if chunks_files and queries_files:
    # Dataset filenames end in a YYYYMMDD_HHMMSS stamp. Rank by that stamp
    # (filename as tie-breaker) rather than by the whole name, so that e.g.
    # "chunks_seattle_50listings_..." cannot outrank a newer
    # "chunks_seattle_100listings_..." just because "5" sorts after "1".
    # Files without a stamp rank lowest.
    stamp_re = re.compile(r"\d{8}_\d{6}")
    chunks_file = str(max(chunks_files, key=lambda p: (stamp_re.findall(p.stem)[-1:], p.name)))
    queries_file = str(max(queries_files, key=lambda p: (stamp_re.findall(p.stem)[-1:], p.name)))

    # The two files are picked independently, so flag a pair whose stamps differ.
    if stamp_re.findall(Path(chunks_file).stem)[-1:] != stamp_re.findall(Path(queries_file).stem)[-1:]:
        print("⚠️ Chunks and queries files have different timestamps; they may come from different dataset builds.")

    print("Using dataset files:")
    print(f"📁 Chunks: {chunks_file}")
    print(f"📁 Queries: {queries_file}")

    # Load to check dataset size.
    # Explicit UTF-8 so the read does not depend on the platform's locale encoding.
    with open(chunks_file, 'r', encoding='utf-8') as f:
        chunks_data = json.load(f)
    with open(queries_file, 'r', encoding='utf-8') as f:
        queries_data = json.load(f)

    print("\nDataset info:")
    print(f"  Total chunks: {len(chunks_data)}")
    print(f"  Total queries: {len(queries_data)}")
else:
    print("❌ No dataset files found. Please run dataset_creation.ipynb first.")
    # Later cells test these for truthiness before running an evaluation.
    chunks_file = None
    queries_file = None

Results will be saved to: evaluations/results/rag_evaluation
Using dataset files:
📁 Chunks: ../datasets/rag_evaluation/chunks_seattle_100listings_20250529_204239.json
📁 Queries: ../datasets/rag_evaluation/queries_seattle_100listings_20250529_204239.json

Dataset info:
  Total chunks: 250
  Total queries: 596


In [13]:
# Smoke test: evaluate only 5 queries before committing to a longer run.
if not (chunks_file and queries_file):
    # Without both dataset paths there is nothing to evaluate.
    print("❌ Cannot run evaluation without dataset files.")
    test_results_file = None
else:
    print("🧪 Running test evaluation (5 queries)...")

    # run_evaluation returns the path of the results file it wrote.
    test_results_file = evaluator.run_evaluation(
        chunks_file=chunks_file,
        queries_file=queries_file,
        chunk_size=100,
        max_queries=5,  # keep in sync with the message printed above
    )

    print("\n✅ Test evaluation completed!")
    print(f"📊 Results saved to: {test_results_file}")

🧪 Running test evaluation (5 queries)...
Loading evaluation dataset...
Evaluating 5 queries...
Processing query 1/5: What type of property is this?...
Processing query 2/5: How many bedrooms does this place have?...
Processing query 3/5: What amenities are available?...
Processing query 4/5: What neighborhood is this located in?...
Processing query 5/5: How many people can this accommodate?...

Evaluation completed!
Results saved to: evaluations/results/rag_evaluation/rag_evaluation_20250529_212804.json
Summary statistics:
  total_queries: 5
  successful_evaluations: 5
  error_rate: 0.000
  avg_response_length: 364.600
  contains_expected_rate: 0.000
  avg_word_overlap_score: 0.073
  error_response_rate: 0.000
  empty_response_rate: 0.000

✅ Test evaluation completed!
📊 Results saved to: evaluations/results/rag_evaluation/rag_evaluation_20250529_212804.json


In [14]:
# Load the smoke-test results file and print its summary block.
if test_results_file:
    # Load results. Explicit UTF-8 so the read does not depend on the
    # platform's locale encoding.
    with open(test_results_file, 'r', encoding='utf-8') as f:
        evaluation_data = json.load(f)

    # The three top-level sections of the results file; `results` is reused
    # by the sample-responses cell.
    metadata = evaluation_data['metadata']
    summary = evaluation_data['summary']
    results = evaluation_data['results']

    print("📋 EVALUATION SUMMARY:")
    print("="*50)
    for key, value in summary.items():
        # Floats are shown with three decimals; everything else is printed as is.
        if isinstance(value, float):
            print(f"{key:25}: {value:.3f}")
        else:
            print(f"{key:25}: {value}")

    print(f"\n⏱️  Runtime: {metadata['total_runtime_seconds']:.2f} seconds")
    print(f"🔧 Chunk size: {metadata['chunk_size']} words")
else:
    print("❌ No results to analyze.")

📋 EVALUATION SUMMARY:
total_queries            : 5
successful_evaluations   : 5
error_rate               : 0.000
avg_response_length      : 364.600
contains_expected_rate   : 0.000
avg_word_overlap_score   : 0.073
error_response_rate      : 0.000
empty_response_rate      : 0.000

⏱️  Runtime: 9.49 seconds
🔧 Chunk size: 100 words


In [15]:
# Print up to three example answers next to their expected values.
if test_results_file and results:
    df_results = pd.DataFrame(results)

    # Keep only rows whose answer text does not contain "Error";
    # na=False makes missing answers count as non-matches, so they are kept.
    successful_results = df_results[~df_results['actual_answer'].str.contains('Error', na=False)]

    if successful_results.empty:
        print("❌ No successful results to display.")
    else:
        print("🔍 SAMPLE RAG RESPONSES:")
        print("=" * 80)

        preview_rows = successful_results.head(3)
        for example_no, (_, result) in enumerate(preview_rows.iterrows(), start=1):
            print(f"\n📝 EXAMPLE {example_no}:")
            print(f"❓ Query: {result['query']}")
            print(f"🎯 Expected: {result['expected_answer']}")
            print(f"🤖 RAG Answer: {result['actual_answer']}")
            print(f"📊 Word Overlap: {result['word_overlap_score']:.3f}")
            print(f"✅ Contains Expected: {result['contains_expected']}")
            # 'category' may be absent from the results, hence the fallback.
            print(f"📂 Category: {result.get('category', 'unknown')}")
            print("-" * 80)

🔍 SAMPLE RAG RESPONSES:

📝 EXAMPLE 1:
❓ Query: What type of property is this?
🎯 Expected: Entire guest suite
🤖 RAG Answer: Based on the description, it appears that this is a basement suite or an in-law suite, likely located in a single-family home. The mention of "2 Huge Private Rooms" and "Common Area" suggests that the property has multiple living spaces, which is consistent with a multi-unit dwelling such as a duplex or a house with an annex.
📊 Word Overlap: 0.333
✅ Contains Expected: False
📂 Category: property_details
--------------------------------------------------------------------------------

📝 EXAMPLE 2:
❓ Query: How many bedrooms does this place have?
🎯 Expected: 2.0
🤖 RAG Answer: This listing doesn't explicitly mention the number of bedrooms, but it mentions "2 Huge Private Rooms". This suggests that there are at least two private rooms, likely bedrooms. However, without further information, we can't confirm the exact total number of bedrooms in the entire space.
📊 Word O

## Evaluation with LLM as Judge

In [4]:
# Sanity-check the LLM judge on one hand-written example before using it in a run.
from evaluations.utils.llm_judge import LLMJudge

print("🧪 Testing LLM Judge...")

# Initialize judge
judge = LLMJudge()

# One query whose actual answer plainly contains the expected answer.
test_result = judge.evaluate_response(
    query="How many bedrooms does this place have?",
    expected_answer="2",
    actual_answer="This listing features 2 comfortable bedrooms with queen-sized beds.",
)

print("✅ LLM Judge Test Results:")
for key, value in test_result.items():
    # Only the judge's own fields are of interest here.
    if not key.startswith('llm_judge'):
        continue
    print(f"  {key}: {value}")

# A truthy 'llm_judge_error' entry means the judge reported a problem.
judge_failed = bool(test_result.get('llm_judge_error'))
if not judge_failed:
    print("🎉 LLM Judge working correctly!")
else:
    print("❌ LLM Judge has errors - check your Ollama setup")

🧪 Testing LLM Judge...
✅ LLM Judge Test Results:
  llm_judge_relevance: 1.0
  llm_judge_completeness: 1.0
  llm_judge_intent: 1.0
  llm_judge_correctness: 1.0
  llm_judge_average: 1.0
  llm_judge_raw_response: {
  "relevance": 1.0,
  "completeness": 1.0,
  "intent": 1.0,
  "correctness": 1.0
}
  llm_judge_error: None
  llm_judge_attempts: 1
🎉 LLM Judge working correctly!


In [9]:
# Evaluate a small sample (10 queries) with the LLM judge switched on.
if not (chunks_file and queries_file):
    print("❌ Need dataset files first")
    judge_results_file = None
else:
    print("🧠 Running evaluation WITH LLM Judge (this will take longer)...")

    # run_evaluation returns the path of the results file it wrote.
    judge_results_file = evaluator.run_evaluation(
        chunks_file=chunks_file,
        queries_file=queries_file,
        chunk_size=100,
        max_queries=10,      # start small for testing
        use_llm_judge=True,  # enable the LLM judge
    )

    print("✅ LLM Judge evaluation completed!")
    print(f"📊 Results saved to: {judge_results_file}")

🧠 Running evaluation WITH LLM Judge (this will take longer)...
Loading evaluation dataset...
Evaluating 10 queries...
🧠 LLM Judge evaluation enabled (this will take longer)
Processing query 1/10: What type of property is this?...
  🧠 Running LLM judge for query 1...
Processing query 2/10: How many bedrooms does this place have?...
  🧠 Running LLM judge for query 2...
Processing query 3/10: What amenities are available?...
  🧠 Running LLM judge for query 3...
Processing query 4/10: What neighborhood is this located in?...
  🧠 Running LLM judge for query 4...
Processing query 5/10: How many people can this accommodate?...
  🧠 Running LLM judge for query 5...
Processing query 6/10: What is the room type?...
  🧠 Running LLM judge for query 6...
Processing query 7/10: What type of property is this?...
  🧠 Running LLM judge for query 7...
Processing query 8/10: How many bedrooms does this place have?...
  🧠 Running LLM judge for query 8...
Processing query 9/10: What amenities are available?

In [10]:
# Quick summary for just the averages
def quick_judge_summary(results_file_path):
    """Print the LLM-judge averages stored in an evaluation results file.

    Args:
        results_file_path: Path to a results JSON file with a top-level
            ``summary`` object. The judge figures are read from its
            ``llm_judge_success_rate`` and ``avg_llm_judge_*`` entries.

    Returns:
        None. The summary is written to stdout.

    Raises:
        KeyError: If the file has no top-level ``summary`` entry.
    """
    # Explicit UTF-8 so the read does not depend on the platform's locale encoding.
    with open(results_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    summary = data['summary']

    print("🧠 LLM JUDGE QUICK SUMMARY")
    print("-" * 30)

    success_rate = summary.get('llm_judge_success_rate')
    # Compare against None rather than truthiness: a success rate of 0.0 is a
    # real (if disappointing) result and must not be reported as "no results".
    if success_rate is None:
        print("No LLM judge results available")
        return

    print(f"Success Rate: {success_rate:.1%}")

    metric_rows = (
        ("Relevance:", 'avg_llm_judge_relevance'),
        ("Completeness:", 'avg_llm_judge_completeness'),
        ("Intent:", 'avg_llm_judge_intent'),
        ("Correctness:", 'avg_llm_judge_correctness'),
        ("Overall:", 'avg_llm_judge_overall'),
    )
    for label, key in metric_rows:
        value = summary.get(key)
        # An average may be missing or null; show a placeholder instead of
        # raising KeyError/TypeError.
        rendered = f"{value:.3f}" if isinstance(value, (int, float)) else "n/a"
        # Labels are padded to 14 columns so the values line up.
        print(f"{label:<14}{rendered}")

# Summarize the LLM-judge run, but only if this session produced one.
# (At notebook top level the cell's local namespace is the global namespace.)
if globals().get('judge_results_file'):
    quick_judge_summary(judge_results_file)

🧠 LLM JUDGE QUICK SUMMARY
------------------------------
Success Rate: 100.0%
Relevance:    0.840
Completeness: 0.660
Intent:       0.810
Correctness:  0.900
Overall:      0.802


### Evaluation with Phi-4 as Judge

In [12]:
# Sanity-check the Phi-4 judge on one hand-written example.
from evaluations.utils.llm_judge_phi4 import LLMJudgePhi4

print("🔬 Testing Phi-4 Judge...")

try:
    # Construction and the test call both sit inside the try block, so a
    # failure in either ends up in the hint printed by the handler below.
    phi4_judge = LLMJudgePhi4()
    print("✅ Phi-4 Judge initialized")

    # One query whose actual answer plainly contains the expected answer.
    phi4_test_result = phi4_judge.evaluate_response(
        query="How many bedrooms does this place have?",
        expected_answer="2",
        actual_answer="This listing features 2 comfortable bedrooms with queen-sized beds.",
    )

    print("✅ Phi-4 Judge Test Results:")
    for key, value in phi4_test_result.items():
        # Only the judge's own fields are of interest here.
        if not key.startswith('phi4_judge'):
            continue
        print(f"  {key}: {value}")

    # A truthy 'phi4_judge_error' entry means the judge reported a problem.
    reported_error = phi4_test_result.get('phi4_judge_error')
    if not reported_error:
        print("🎉 Phi-4 judge working correctly!")
    else:
        print(f"❌ Error: {reported_error}")

except Exception as exc:
    print(f"❌ Error: {exc}")
    print("💡 Make sure you have Phi-4 installed:")
    print("   ollama pull phi4")

🔬 Testing Phi-4 Judge...
✅ Phi-4 Judge initialized
✅ Phi-4 Judge Test Results:
  phi4_judge_relevance: 1.0
  phi4_judge_completeness: 0.8
  phi4_judge_intent: 1.0
  phi4_judge_correctness: 1.0
  phi4_judge_average: 0.95
  phi4_judge_raw_response: ```json
{
  "relevance": 1.0,
  "completeness": 0.8,
  "intent": 1.0,
  "correctness": 1.0
}
```

**Explanation:**

- **Relevance (1.0):** The actual answer directly addresses the question about the number of bedrooms, making it highly relevant.

- **Completeness (0.8):** While the answer provides additional details about the type and size of beds, which is useful, it goes slightly beyond what was minimally required to answer the specific question. However, this extra information can be seen as enhancing the completeness in a practical context.

- **Intent (1.0):** The user's intent was to know the number of bedrooms, and the answer fulfills this need by confirming there are 2 bedrooms.

- **Correctness (1.0):** The factual accuracy is maintai

In [15]:
# Use Phi-4 judge on existing results manually:
# re-score the first few LLM-judged results with Phi-4 and compare the averages.
if 'judge_results_file' in locals() and judge_results_file:
    # Load existing results. Explicit UTF-8 so the read does not depend on the
    # platform's locale encoding.
    with open(judge_results_file, 'r', encoding='utf-8') as f:
        existing_data = json.load(f)

    print("🔬 Running Phi-4 judge on existing results...")

    # Initialize Phi-4 judge
    from evaluations.utils.llm_judge_phi4 import LLMJudgePhi4
    phi4_judge = LLMJudgePhi4()

    # Run Phi-4 on a few results
    results = existing_data['results']
    phi4_sample = results[:3]  # just the first 3
    for i, result in enumerate(phi4_sample, start=1):
        # Skip entries flagged as failed.
        if result.get('error') or result.get('is_error'):
            continue

        # Use the real sample size in the label; the file may hold fewer than 3 results.
        print(f"  Processing result {i}/{len(phi4_sample)}...")

        phi4_scores = phi4_judge.evaluate_response(
            query=result['query'],
            expected_answer=result.get('expected_answer', ''),
            actual_answer=result['actual_answer']
        )

        # Add phi4 scores to result (in memory only; the file is not rewritten)
        result.update(phi4_scores)

    # Show summary
    phi4_results = [r for r in phi4_sample if r.get('phi4_judge_average') is not None]
    if phi4_results:
        avg_phi4 = sum(r['phi4_judge_average'] for r in phi4_results) / len(phi4_results)
        print(f"✅ Phi-4 average score: {avg_phi4:.3f}")

        # Show comparison, using only results that BOTH judges scored.
        # Treating a missing LLM score as 0 would drag the LLM average down,
        # and a null score would make sum() raise.
        scored_by_both = [r for r in phi4_results if r.get('llm_judge_average') is not None]
        if scored_by_both:
            avg_llm = sum(r['llm_judge_average'] for r in scored_by_both) / len(scored_by_both)
            avg_phi4_paired = sum(r['phi4_judge_average'] for r in scored_by_both) / len(scored_by_both)
            print(f"🧠 LLM average score: {avg_llm:.3f}")
            print(f"📊 Difference: {abs(avg_phi4_paired - avg_llm):.3f}")
        else:
            print("🧠 LLM average score: n/a (no LLM judge scores for these results)")
        

🔬 Running Phi-4 judge on existing results...
  Processing result 1/3...
  Processing result 2/3...
  Processing result 3/3...
✅ Phi-4 average score: 0.733
🧠 LLM average score: 0.700
📊 Difference: 0.033


In [16]:
# Use Phi-4 judge on existing results with detailed summary
if 'judge_results_file' in locals() and judge_results_file:
    # Load existing results. Explicit UTF-8 so the read does not depend on the
    # platform's locale encoding.
    with open(judge_results_file, 'r', encoding='utf-8') as f:
        existing_data = json.load(f)

    print("🔬 Running Phi-4 judge on existing results...")

    # Initialize Phi-4 judge
    from evaluations.utils.llm_judge_phi4 import LLMJudgePhi4
    phi4_judge = LLMJudgePhi4()

    # Run Phi-4 on a few results
    results = existing_data['results']
    phi4_sample = results[:3]  # just the first 3
    phi4_results = []
    phi4_attempts = 0  # number of results actually sent to the judge

    for i, result in enumerate(phi4_sample, start=1):
        # Skip entries flagged as failed; they are not judge attempts.
        if result.get('error') or result.get('is_error'):
            continue

        # Use the real sample size in the label; the file may hold fewer than 3 results.
        print(f"  Processing result {i}/{len(phi4_sample)}...")
        phi4_attempts += 1

        phi4_scores = phi4_judge.evaluate_response(
            query=result['query'],
            expected_answer=result.get('expected_answer', ''),
            actual_answer=result['actual_answer']
        )

        # Collect successful phi4 results
        if phi4_scores.get('phi4_judge_average') is not None:
            phi4_results.append(phi4_scores)

    # Calculate detailed summary like the LLM judge
    if phi4_results:
        print("\n🔬 PHI-4 JUDGE QUICK SUMMARY")
        print("-" * 30)

        # Divide by the number of judge calls actually made rather than a
        # hard-coded 3, which is wrong when the file holds fewer than three
        # results or some were skipped. phi4_attempts >= 1 here because
        # phi4_results is non-empty.
        success_rate = len(phi4_results) / phi4_attempts
        scored_count = len(phi4_results)
        avg_relevance = sum(r['phi4_judge_relevance'] for r in phi4_results) / scored_count
        avg_completeness = sum(r['phi4_judge_completeness'] for r in phi4_results) / scored_count
        avg_intent = sum(r['phi4_judge_intent'] for r in phi4_results) / scored_count
        avg_correctness = sum(r['phi4_judge_correctness'] for r in phi4_results) / scored_count
        avg_overall = sum(r['phi4_judge_average'] for r in phi4_results) / scored_count

        print(f"Success Rate: {success_rate:.1%}")
        print(f"Relevance:    {avg_relevance:.3f}")
        print(f"Completeness: {avg_completeness:.3f}")
        print(f"Intent:       {avg_intent:.3f}")
        print(f"Correctness:  {avg_correctness:.3f}")
        print(f"Overall:      {avg_overall:.3f}")
    else:
        print("❌ No successful Phi-4 results")

🔬 Running Phi-4 judge on existing results...
  Processing result 1/3...
