# Contextual RAG Evaluation

This notebook evaluates the effectiveness of our Contextual RAG system by:

1. Loading questions generated in the previous notebook
2. Querying the RAG system to get answers
3. Comparing the generated answers to ground truth
4. Scoring the correctness of responses
5. Analyzing performance across different question types

## 0. Prerequisites

In [None]:
%load_ext autoreload
%autoreload 2

# Install required packages
%pip install ipywidgets python-dotenv tqdm pandas matplotlib

# Import basic dependencies
import os
import sys
import json
import time
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Create the directory that later cells write result and report files into.
os.makedirs("output", exist_ok=True)

# Load environment variables from a local .env file when python-dotenv is
# available. Only the import is guarded, so the "not installed" message cannot
# mask an unrelated failure raised while reading the file.
try:
    from dotenv import load_dotenv
except ImportError:
    print("python-dotenv not installed, skipping .env loading")
else:
    # load_dotenv() returns False when it set nothing (file missing or empty);
    # report that instead of claiming success unconditionally.
    if load_dotenv('.env'):
        print("Environment variables loaded from .env file")
    else:
        print("No variables loaded from .env (file missing or empty); using the existing environment")

## 1. Configure Evaluation Parameters

In [None]:
# --- Evaluation inputs (edit these to change what is evaluated) ---
qa_file = 'output/bedrock-ug_sample_questions.jsonl'
document_name = 'bedrock-ug'
chunk_size = 1000
use_contextual = True
num_questions = 10  # Number of questions to evaluate (-1 evaluates all of them)

# --- Derived names ---
# The index name must follow the scheme used by the file_processor notebook:
# <prefix>[contextual_]<document>_<chunk size>
index_prefix = "aws_"
index_name = index_prefix + ("contextual_" if use_contextual else "") + f"{document_name}_{chunk_size}"

# Per-question results are written here by the evaluation cell.
results_file = "output/eval_results_{}_{}.json".format(
    document_name,
    "contextual" if use_contextual else "standard",
)

# Echo the effective configuration so the run is self-describing.
print("\n".join([
    "Evaluation configuration:",
    f"- Questions file: {qa_file}",
    f"- Document: {document_name}",
    f"- Chunk size: {chunk_size}",
    f"- Using contextual retrieval: {use_contextual}",
    f"- OpenSearch index: {index_name}",
    f"- Results will be saved to: {results_file}",
]))

## 2. Initialize Services

In [None]:
try:
    # Project-local services and configuration used by the rest of the notebook.
    from config import Config
    from libs.bedrock_service import BedrockService
    from libs.contextual_rag_service import ContextualRAGService
    from libs.opensearch_service import OpensearchService
    from libs.reranker import RerankerService

    config = Config.load()

    # (config section, attribute, environment variable): when the variable is
    # set it replaces the loaded value, otherwise the loaded value is kept.
    env_overrides = (
        ("aws", "region", "AWS_DEFAULT_REGION"),
        ("aws", "profile", "AWS_PROFILE"),
        ("bedrock", "model_id", "BEDROCK_MODEL_ID"),
        ("bedrock", "embed_model_id", "EMBED_MODEL_ID"),
        ("opensearch", "prefix", "OPENSEARCH_PREFIX"),
        ("opensearch", "domain_name", "OPENSEARCH_DOMAIN_NAME"),
        ("opensearch", "user", "OPENSEARCH_USER"),
        ("opensearch", "password", "OPENSEARCH_PASSWORD"),
        ("reranker", "reranker_model_id", "RERANKER_MODEL_ID"),
    )
    for section_name, attr_name, env_var in env_overrides:
        section = getattr(config, section_name)
        setattr(section, attr_name, os.environ.get(env_var, getattr(section, attr_name)))

    # Summarise the effective settings (credentials are deliberately not printed).
    print("Configuration loaded successfully")
    print(f"- LLM Model: {config.bedrock.model_id}")
    print(f"- Embedding Model: {config.bedrock.embed_model_id}")
    print(f"- OpenSearch Domain: {config.opensearch.domain_name}")
    print(f"- Reranker Model: {config.reranker.reranker_model_id or 'Not configured'}")

except ImportError as import_error:
    print(f"❌ Error importing required modules: {import_error!s}")
    print("Make sure all dependencies are installed and the paths are correct")
    # Make the parent directory importable so a second run of this cell can
    # find the project modules; the error is still re-raised for this run.
    sys.path.append('..')
    print("Added parent directory to Python path. Try running the cell again.")
    raise
except Exception as config_error:
    print(f"❌ Error loading configuration: {config_error!s}")
    raise

In [None]:
try:
    # Build every service object the evaluation needs, in dependency order.
    print("Initializing services...")

    # Bedrock service (arguments are positional; order must match the constructor)
    bedrock_service = BedrockService(
        config.aws.region,
        config.aws.profile,
        config.bedrock.retries,
        config.bedrock.embed_model_id,
        config.bedrock.model_id,
        config.model.max_tokens,
        config.model.temperature,
        config.model.top_p,
    )
    print("✅ Bedrock service initialized")

    # OpenSearch service
    opensearch_service = OpensearchService(
        config.aws.region,
        config.aws.profile,
        config.opensearch.prefix,
        config.opensearch.domain_name,
        config.opensearch.document_name,
        config.opensearch.user,
        config.opensearch.password,
    )
    print("✅ OpenSearch service initialized")

    # Reranker is optional: without a configured model id it is left as None.
    if not config.reranker.reranker_model_id:
        reranker_service = None
        print("ℹ️ Reranker service not configured, will use default ranking")
    else:
        reranker_service = RerankerService(
            config.reranker.aws_region,
            config.reranker.aws_profile,
            config.reranker.reranker_model_id,
            config.bedrock.retries,
        )
        print("✅ Reranker service initialized")

    # Contextual RAG service composed from the services created above
    rag_service = ContextualRAGService(
        bedrock_service=bedrock_service,
        opensearch_service=opensearch_service,
        reranker_service=reranker_service,
    )
    print("✅ Contextual RAG service initialized")

    # Confirm the target index exists before any question is sent to it.
    search_client = opensearch_service.opensearch_client
    if search_client.indices.exists(index=index_name):
        print(f"✅ Index '{index_name}' exists")
    else:
        print(f"⚠️ Warning: OpenSearch index '{index_name}' does not exist!")
        print("Please make sure you've run the file_processor notebook and created the index.")

        # Show what is available so the user can spot a naming mismatch.
        index_rows = search_client.cat.indices(format="json")
        index_names = [row['index'] for row in index_rows]
        print("\nAvailable indices:")
        for name in index_names:
            print(f"- {name}")

        # Interactive confirmation: anything other than "y" aborts the cell.
        answer = input("\nDo you want to continue anyway? (y/n): ")
        if answer.lower() != 'y':
            raise ValueError(f"Index '{index_name}' not found. Please create it first.")

except Exception as init_error:
    print(f"❌ Error initializing services: {init_error!s}")
    raise

## 3. Define Evaluation Methods

In [None]:
# Define the system prompt for evaluation
evaluate_system_prompt = """
Evaluate the correctness of the generation on a continuous scale from 0 to 1. A generation can be considered correct (Score: 1) if it includes all the key facts from the ground truth and if every fact presented in the generation is factually supported by the ground truth or common sense.

Example:
Query: Can eating carrots improve your vision?
Answer: Yes, eating carrots significantly improves your vision, especially at night. This is why people who eat lots of carrots never need glasses. Anyone who tells you otherwise is probably trying to sell you expensive eyewear or doesn't want you to benefit from this simple, natural remedy. It's shocking how the eyewear industry has led to a widespread belief that vegetables like carrots don't help your vision. People are so gullible to fall for these money-making schemes.
Ground truth: Well, yes and no. Carrots won't improve your visual acuity if you have less than perfect vision. A diet of carrots won't give a blind person 20/20 vision. But, the vitamins found in the vegetable can help promote overall eye health. Carrots contain beta-carotene, a substance that the body converts to vitamin A, an important nutrient for eye health. An extreme lack of vitamin A can cause blindness. Vitamin A can prevent the formation of cataracts and macular degeneration, the world's leading cause of blindness. However, if your vision problems aren't related to vitamin A, your vision won't change no matter how many carrots you eat.
Score: 0.1
Reasoning: While the generation mentions that carrots can improve vision, it fails to outline the reason for this phenomenon and the circumstances under which this is the case. The rest of the response contains misinformation and exaggerations regarding the benefits of eating carrots for vision improvement. It deviates significantly from the more accurate and nuanced explanation provided in the ground truth.
"""

# Define tool configuration for evaluation
eval_tools = {
    "tools": [
        {
            "toolSpec": {
                "name": "CorrectressGrader",
                "description": "Evaluate the correctness of the answer on a continuous scale from 0 to 1, and reasoning why the score is. A generation can be considered correct (Score: 1) if it includes all the key facts from the ground truth and if every fact presented in the generation is factually supported by the ground truth.",
                "inputSchema": {
                    "json": {
                        "type": "object",
                        "properties": {
                            "score": {
                                "type": "number",
                                "description": "The correctress score [0.0, 1.0]"
                            },
                            "reason": {
                                "type": "string",
                                "description": "The reason about the score"
                            }
                        },
                        "required": ["score", "reason"]
                    }
                }
            }
        }
    ]
}

## 4. Run Evaluation

In [None]:
try:
    # Verify questions file exists
    if not os.path.exists(qa_file):
        raise FileNotFoundError(f"Questions file '{qa_file}' not found! Please run the question generation notebook first.")

    # Load questions (one JSON object per line). Blank lines are dropped so they
    # neither use up the num_questions budget nor show up as parse errors.
    with open(qa_file, 'r', encoding='utf-8') as f:
        lines = [line for line in f if line.strip()]

    total_questions = len(lines)
    print(f"Loaded {total_questions} questions from {qa_file}")

    # Select the questions to evaluate. The configured num_questions is only
    # read, never overwritten, so re-running this cell (for example after the
    # questions file grew) still honours the value set in the configuration cell.
    if num_questions <= 0 or num_questions > total_questions:
        eval_lines = lines
    else:
        eval_lines = lines[:num_questions]

    print(f"Will evaluate {len(eval_lines)} questions")

    # Initialize results storage
    results = []
    token_usage_total = {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0}

    # Process each question; a failure on one question is reported and skipped
    # so the remaining questions are still evaluated.
    print("\nStarting evaluation...")
    for i, line in enumerate(tqdm(eval_lines, desc="Processing questions")):
        try:
            # Parse question data
            question_data = json.loads(line)
            question = question_data['question']
            ground_truth = question_data['ground_truth']
            question_type = question_data.get('question_type', 'unknown')

            print(f"\n[{i+1}/{len(eval_lines)}] Evaluating {question_type} question: {question}")

            # 1. Query the RAG system
            start_time = time.time()
            generated = rag_service.do(
                question=question,
                document_name=document_name,
                index_name=index_name,
                chunk_size=chunk_size,
                use_hybrid=True,
                use_contextual=use_contextual,
                search_limit=5
            )
            rag_time = time.time() - start_time

            # Track token usage of the RAG call (the grading call is not counted).
            # token_usage is reset per question so a stale value is never reused.
            token_usage = generated['usage'] if 'usage' in generated else None
            if token_usage:
                for key in token_usage:
                    if key in token_usage_total:
                        token_usage_total[key] += token_usage[key]

            # 2. Prepare evaluation prompt
            evaluate_user_template = f"""
            Query: {question}
            Answer: {generated['answer']}
            Ground Truth: {ground_truth}
            """

            user_prompt = [{"role": "user", "content": [{"text": evaluate_user_template}]}]
            temperature = 0.0
            top_p = 0.5

            # 3. Evaluate the answer
            eval_start_time = time.time()
            response = bedrock_service.converse_with_tools(
                messages=user_prompt,
                system_prompt=evaluate_system_prompt,
                tools=eval_tools,
                temperature=temperature,
                top_p=top_p,
                max_tokens=4096
            )
            eval_time = time.time() - eval_start_time

            # 4. Process evaluation results
            stop_reason = response['stopReason']

            if stop_reason == 'tool_use':
                tool_requests = response['output']['message']['content']

                for tool_request in [x for x in tool_requests if 'toolUse' in x]:
                    if tool_request['toolUse']['name'] == 'CorrectressGrader':
                        eval_result = tool_request['toolUse']['input']

                        # Normalise the grade BEFORE recording it: a non-numeric
                        # score raises here and skips the question instead of
                        # leaving an unusable value in the results.
                        score = float(eval_result['score'])
                        reason = eval_result['reason']

                        # Create result record
                        result = {
                            "question": question,
                            "question_type": question_type,
                            "generated_answer": generated['answer'],
                            "ground_truth": ground_truth,
                            "score": score,
                            "reason": reason,
                            "rag_time_seconds": round(rag_time, 2),
                            "eval_time_seconds": round(eval_time, 2),
                            "token_usage": token_usage
                        }

                        results.append(result)
                        print(f"Score: {score:.2f}")
                        print(f"Reason: {str(reason)[:150]}...")
            else:
                print(f"⚠️ Warning: Evaluation stopped with reason '{stop_reason}' instead of 'tool_use'")

            # Save results after each question in case of interruption
            with open(results_file, 'w') as f:
                json.dump(results, f, indent=2)

        except Exception as e:
            print(f"❌ Error processing question {i+1}: {str(e)}")
            continue

    print(f"\n✅ Evaluation complete! Processed {len(results)} questions.")

    # Final save: the per-question save is skipped whenever an iteration fails,
    # so write once more here to guarantee the file matches `results`. Nothing
    # is written (and nothing is claimed) when no question was scored.
    if results:
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=2)
        print(f"Results saved to {results_file}")
    else:
        print("⚠️ No question was scored, so no results were saved.")

    # Print token usage summary
    print(f"\nToken Usage Summary:")
    print(f"- Input tokens: {token_usage_total['inputTokens']}")
    print(f"- Output tokens: {token_usage_total['outputTokens']}")
    print(f"- Total tokens: {token_usage_total['totalTokens']}")

except Exception as e:
    print(f"❌ Error during evaluation: {str(e)}")
    raise

## 5. Analyze Results

In [None]:
try:
    # `results` normally comes from the evaluation cell. On a fresh kernel the
    # name does not exist yet, and after an unsuccessful run it is empty; in
    # both cases fall back to the results file written by an earlier run.
    if not globals().get('results'):
        if os.path.exists(results_file):
            with open(results_file, 'r') as f:
                results = json.load(f)
            print(f"Loaded {len(results)} results from {results_file}")
        else:
            raise ValueError("No results available for analysis")

    # A results file can legitimately hold an empty list; fail with a clear
    # message rather than a KeyError on the missing 'score' column.
    if not results:
        raise ValueError("No results available for analysis")

    # Convert to DataFrame for analysis (one row per scored question)
    df = pd.DataFrame(results)

    # Print summary statistics
    print("\n=== Overall Performance ===")
    print(f"Average score: {df['score'].mean():.4f}")
    print(f"Median score: {df['score'].median():.4f}")
    print(f"Min score: {df['score'].min():.4f}")
    print(f"Max score: {df['score'].max():.4f}")

    # Break down by question type, keeping the order in which types first appear
    if 'question_type' in df.columns:
        print("\n=== Performance by Question Type ===")

        for q_type, type_df in df.groupby('question_type', sort=False):
            type_scores = type_df['score']
            print(f"\n{str(q_type).capitalize()} Questions ({len(type_df)} total):")
            print(f"- Average score: {type_scores.mean():.4f}")
            print(f"- Median score: {type_scores.median():.4f}")
            print(f"- Min score: {type_scores.min():.4f}")
            print(f"- Max score: {type_scores.max():.4f}")

except Exception as e:
    # Best effort: report the problem without interrupting the notebook.
    print(f"❌ Error analyzing results: {str(e)}")

In [None]:
# Visualize results: score histogram (left) and per-type box plot (right)
try:
    # Create both axes up front and draw on them explicitly. In particular,
    # DataFrame.boxplot(by=...) opens a NEW figure unless it is given `ax=`,
    # which would leave the right-hand panel of this figure empty.
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Plot scores histogram
    hist_ax.hist(df['score'], bins=10, alpha=0.7, color='blue')
    hist_ax.set_title('Distribution of Scores')
    hist_ax.set_xlabel('Score')
    hist_ax.set_ylabel('Count')
    hist_ax.grid(True, alpha=0.3)

    # Plot scores by question type
    if 'question_type' in df.columns:
        df.boxplot(column='score', by='question_type', ax=box_ax)
        box_ax.set_title('Scores by Question Type')
        box_ax.set_ylabel('Score')
        fig.suptitle('')  # Remove pandas-generated title
        box_ax.grid(True, alpha=0.3)
    else:
        # Nothing to group by: hide the unused panel instead of showing empty axes.
        box_ax.set_visible(False)

    fig.tight_layout()
    plt.show()

except Exception as e:
    # Best effort: a plotting problem should not stop the notebook.
    print(f"❌ Error creating visualizations: {str(e)}")

## 6. View Individual Examples

In [None]:
# Show the best- and worst-scored question side by side for manual inspection
try:
    # Rows holding the maximum and minimum score (first occurrence on ties)
    high_score = df.loc[df['score'].idxmax()]
    low_score = df.loc[df['score'].idxmin()]

    # (section title, heading for the grader's reasoning, row to display)
    sections = (
        ("=== Highest Scoring Example ===", "\nReason for high score:\n", high_score),
        ("=== Lowest Scoring Example ===", "\nReason for low score:\n", low_score),
    )

    for position, (title, reason_heading, row) in enumerate(sections):
        if position:
            # Horizontal rule between the two examples
            print("\n" + "-"*80 + "\n")

        print(title)
        print(f"Score: {row['score']:.2f}")
        print(f"Question ({row['question_type']}): {row['question']}")
        print(f"\nGenerated Answer:\n{row['generated_answer']}")
        print(f"\nGround Truth:\n{row['ground_truth']}")
        print(f"{reason_heading}{row['reason']}")

except Exception as display_error:
    print(f"❌ Error displaying examples: {display_error!s}")

## 7. Conclusion

This evaluation has measured the performance of our Contextual RAG system against a variety of question types. The scores reflect how well the system's answers align with the ground truth.

### Next Steps

1. **Improve performance**: Analyze low-scoring responses to identify patterns of failure and opportunities for improvement
2. **Compare configurations**: Run this evaluation with different settings (standard vs. contextual, different chunk sizes)
3. **Expand test set**: Generate more questions to get a more comprehensive evaluation
4. **Fine-tune parameters**: Adjust retrieval parameters, number of chunks returned, or LLM prompting
5. **Benchmark**: Compare performance against other RAG implementations or baseline approaches

## 8. Optional: Try a Custom Question

In [None]:
# Ask the RAG system a single ad-hoc question and show the answer
custom_question = "What are the key features of Amazon Bedrock?"  # Change this to your own question

try:
    print(f"Question: {custom_question}\n")

    # Same retrieval settings as the evaluation run
    query_args = dict(
        question=custom_question,
        document_name=document_name,
        index_name=index_name,
        chunk_size=chunk_size,
        use_hybrid=True,
        use_contextual=use_contextual,
        search_limit=5,
    )

    # Time only the RAG call itself
    started_at = time.time()
    response = rag_service.do(**query_args)
    elapsed_time = time.time() - started_at

    print(f"Answer:\n{response['answer']}\n")

    # Metadata: latency, plus token count when the service reports usage
    print(f"Response generated in {elapsed_time:.2f} seconds")
    if 'usage' in response:
        print(f"Token usage: {response['usage']['totalTokens']} tokens")

    # Retrieved contexts are shown only on request; the prompt appears only
    # when the response actually contains contexts.
    if 'contexts' in response:
        show_contexts = input("\nShow retrieved contexts? (y/n): ").lower() == 'y'
        if show_contexts:
            print("\n=== Retrieved Contexts ===")
            for position, ctx in enumerate(response['contexts'], start=1):
                print(f"\nContext {position}:")
                # Long contexts are cut to 300 characters and marked with an ellipsis
                if len(ctx) > 300:
                    print(f"{ctx[:300]}...")
                else:
                    print(ctx)

except Exception as question_error:
    print(f"❌ Error processing custom question: {question_error!s}")

## 9. Save Final Results

The following cell merges the analysis with the results and saves everything to a comprehensive report file.

In [None]:
# Create and save a comprehensive report (configuration, statistics, raw results)
try:
    def _json_number(value):
        """Return a pandas/NumPy scalar as a plain float, or None when it is missing.

        json cannot serialise NumPy integers (what min()/max() return for an
        integer score column), and NaN (the standard deviation of a single
        result) is not valid JSON, so both are normalised here.
        """
        return None if pd.isna(value) else float(value)

    def _score_stats(scores):
        """Summarise a Series of scores with JSON-safe mean/median/min/max."""
        return {
            "mean_score": _json_number(scores.mean()),
            "median_score": _json_number(scores.median()),
            "min_score": _json_number(scores.min()),
            "max_score": _json_number(scores.max()),
        }

    # Token totals come from the evaluation cell; when that cell was not run in
    # this kernel, rebuild them from the per-question usage stored in `results`.
    token_totals = globals().get('token_usage_total')
    if token_totals is None:
        token_totals = {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0}
        for record in results:
            for key, value in (record.get('token_usage') or {}).items():
                if key in token_totals:
                    token_totals[key] += value

    run_time = time.localtime()  # one timestamp for both the report body and its filename

    overall = _score_stats(df['score'])
    overall["std_dev"] = _json_number(df['score'].std())

    # Calculate summary statistics
    summary = {
        "evaluation_config": {
            "document_name": document_name,
            "index_name": index_name,
            "chunk_size": chunk_size,
            "use_contextual": use_contextual,
            "questions_evaluated": len(results),
            "evaluation_date": time.strftime("%Y-%m-%d %H:%M:%S", run_time),
            "llm_model": config.bedrock.model_id,
            "embedding_model": config.bedrock.embed_model_id,
            "reranker_model": config.reranker.reranker_model_id if config.reranker.reranker_model_id else "Not used"
        },
        "overall_performance": overall,
        "token_usage": token_totals,
        "question_type_breakdown": {},
        "detailed_results": results
    }

    # Add question type breakdown if available (types in order of first appearance)
    if 'question_type' in df.columns:
        for q_type, type_df in df.groupby('question_type', sort=False):
            breakdown = {"count": len(type_df)}
            breakdown.update(_score_stats(type_df['score']))
            summary["question_type_breakdown"][str(q_type)] = breakdown

    # Serialise first, then write: a serialisation error can no longer leave a
    # truncated report file behind.
    report_json = json.dumps(summary, indent=2)
    report_file = f"output/evaluation_report_{document_name}_{'contextual' if use_contextual else 'standard'}_{time.strftime('%Y%m%d_%H%M%S', run_time)}.json"
    with open(report_file, 'w') as f:
        f.write(report_json)

    print(f"✅ Comprehensive evaluation report saved to {report_file}")

except Exception as e:
    # Best effort: report the problem without interrupting the notebook.
    print(f"❌ Error creating final report: {str(e)}")