# Step 6: Evaluation

This notebook evaluates the results of the full IDP pipeline with the `EvaluationService` class, assessing accuracy and generating a comprehensive report.

**Inputs:**
- Document object with all processing results from Step 5
- Evaluation configuration
- Optional ground truth data for accuracy assessment

**Outputs:**
- Comprehensive evaluation report (Markdown and JSON)
- Accuracy metrics for each processing step
- Performance analysis and recommendations

## 1. Load Previous Step Data

In [None]:
import os
import json
import time
import logging
import boto3
from pathlib import Path
from datetime import datetime
from IPython.display import Markdown, display

# Import IDP libraries
from idp_common.models import Document, Status, Section
from idp_common import evaluation

# Logging setup: WARNING by default, INFO for the evaluation service logger
evaluation_logger_name = 'idp_common.evaluation.service'
logging.basicConfig(level=logging.WARNING)
logging.getLogger(evaluation_logger_name).setLevel(logging.INFO)

print("Libraries imported successfully")

In [None]:
# Load document from previous step (reads the step 5 summarization output directory)
summarization_data_dir = Path(".data/step5_summarization")

# Rebuild the Document object from its JSON serialization
document_path = summarization_data_dir / "document.json"
with open(document_path, 'r') as f:
    document = Document.from_json(f.read())

# Load configuration directly from config files
import yaml
config_dir = Path("config")
CONFIG = {}

# Files are merged into CONFIG in this order; later files win on duplicate top-level keys
config_files = [
    "evaluation.yaml",
    "classes.yaml"
]

for config_file in config_files:
    config_path = config_dir / config_file
    if config_path.exists():
        with open(config_path, 'r') as f:
            # yaml.safe_load returns None for an empty file; fall back to {}
            # so that CONFIG.update() does not raise a TypeError.
            file_config = yaml.safe_load(f) or {}
        CONFIG.update(file_config)
        print(f"Loaded {config_file}")
    else:
        # A missing file is reported but does not stop the notebook
        print(f"Warning: {config_file} not found")

# Load environment info saved alongside the document
env_path = summarization_data_dir / "environment.json"
with open(env_path, 'r') as f:
    env_info = json.load(f)

# Set environment variables
os.environ['AWS_REGION'] = env_info['region']
os.environ['METRIC_NAMESPACE'] = 'IDP-Modular-Pipeline'

print(f"Loaded document: {document.id}")
print(f"Document status: {document.status.value}")
print(f"Number of sections: {len(document.sections) if document.sections else 0}")
print(f"Loaded configuration sections: {list(CONFIG.keys())}")
print(f"Processing complete - ready for evaluation")

## 2. Helper Functions for Evaluation

In [None]:
# Helper function to parse S3 URIs and load JSON
def parse_s3_uri(uri):
    """Split an S3 URI into its bucket name and object key.

    Only a leading "s3://" scheme is stripped, so an object key that itself
    contains the text "s3://" is returned unchanged. A URI without the scheme
    is still accepted and split at its first "/".

    Args:
        uri: S3 location such as "s3://bucket/path/to/object.json".

    Returns:
        tuple: (bucket, key). The key is "" when the URI names only a bucket.
    """
    scheme = "s3://"
    path = uri[len(scheme):] if uri.startswith(scheme) else uri
    # Everything before the first "/" is the bucket; the remainder is the key
    bucket, _, key = path.partition("/")
    return bucket, key

def load_json_from_s3(uri):
    """Download the S3 object named by `uri` and return its body parsed as JSON.

    The body is decoded as UTF-8 before parsing.
    """
    client = boto3.client('s3')
    bucket_name, object_key = parse_s3_uri(uri)
    s3_object = client.get_object(Bucket=bucket_name, Key=object_key)
    raw_text = s3_object['Body'].read().decode('utf-8')
    return json.loads(raw_text)

def _expected_result_key(key):
    """Return the S3 key under which the expected results for `key` are stored.

    A key ending in "/result.json" maps to the sibling "/expected.json". Any other
    key gets ".expected.json" appended, so the returned key never equals the input
    and the actual extraction result is never overwritten.
    """
    result_suffix = "/result.json"
    if key.endswith(result_suffix):
        return key[:-len(result_suffix)] + "/expected.json"
    return key + ".expected.json"


def create_ground_truth_document(source_document, expected_results_dict):
    """Creates a ground truth document for evaluation from an existing document and expected results.

    The sections and pages of `source_document` are mirrored into a new Document.
    For every section ID in `expected_results_dict` that has an extraction result
    URI, the stored extraction result is read from S3, its "inference_result" is
    replaced by the expected data (or the whole content, if that key is absent),
    the outcome is written to a separate S3 key, and the ground truth section is
    pointed at that new object.

    Failures while reading or writing S3 for a section are printed and skipped;
    that section then keeps the URI of the actual extraction result.

    Args:
        source_document: The original document to copy structure from
        expected_results_dict: Dictionary mapping section IDs to expected attribute values

    Returns:
        Document: A document with the same structure but with expected results
    """
    # Create a new document with the same core attributes
    ground_truth = Document(
        id=source_document.id,
        input_bucket=source_document.input_bucket,
        input_key=source_document.input_key,
        output_bucket=source_document.output_bucket,
        status=Status.COMPLETED
    )

    # Mirror the sections; the URI initially points at the actual extraction result
    for section in source_document.sections or []:
        expected_section = Section(
            section_id=section.section_id,
            classification=section.classification,
            confidence=1.0,
            page_ids=section.page_ids.copy(),
            extraction_result_uri=section.extraction_result_uri
        )
        ground_truth.sections.append(expected_section)

    # Pages are shared with the source document (same objects, not copies)
    for page_id, page in (source_document.pages or {}).items():
        ground_truth.pages[page_id] = page

    # Store expected results to S3 for sections that have extraction results
    s3_client = boto3.client('s3')
    for section_id, expected_data in expected_results_dict.items():
        for section in ground_truth.sections:
            if section.section_id != section_id or not section.extraction_result_uri:
                continue

            # Use the original extraction result as the template
            bucket, key = parse_s3_uri(section.extraction_result_uri)

            try:
                response = s3_client.get_object(Bucket=bucket, Key=key)
                result_data = json.loads(response['Body'].read().decode('utf-8'))

                if "inference_result" in result_data:
                    # Keep the surrounding structure, swap in the expected values
                    result_data["inference_result"] = expected_data
                else:
                    # No inference_result key: the expected data becomes the whole content
                    result_data = expected_data

                # Write under a distinct key so the actual result stays untouched
                expected_key = _expected_result_key(key)
                s3_client.put_object(
                    Bucket=bucket,
                    Key=expected_key,
                    Body=json.dumps(result_data).encode('utf-8')
                )

                # Point the ground truth section at the expected data
                section.extraction_result_uri = f"s3://{bucket}/{expected_key}"
                print(f"Stored expected results for section {section_id} at {section.extraction_result_uri}")
            except Exception as e:
                # Best effort: report the problem and continue with the other sections
                print(f"Error storing expected results for section {section_id}: {e}")

    return ground_truth

# Confirmation message printed once the helper definitions in this cell have run
print("Helper functions defined")

## 3. Configure Evaluation Service

In [None]:
# Extract the LLM-method settings of the evaluation configuration.
# `or {}` covers keys that exist in the config but were left empty (None).
evaluation_config = (CONFIG.get('evaluation') or {}).get('llm_method') or {}
print("Evaluation Configuration:")
print(f"Model: {evaluation_config.get('model')}")
print(f"Temperature: {evaluation_config.get('temperature')}")
print(f"Max Tokens: {evaluation_config.get('max_tokens')}")
print(f"Default Confidence Threshold: {evaluation_config.get('default_confidence_threshold')}")
print("*"*50)
print(f"System Prompt:\n{evaluation_config.get('system_prompt')}")
print("*"*50)
print(f"Task Prompt:\n{evaluation_config.get('task_prompt')}")
print("*"*50)

# Initialize evaluation service; on failure the service is set to None so that
# later cells can detect it and skip the evaluation.
try:
    evaluation_service = evaluation.EvaluationService(config=CONFIG)
    print("\n✅ EvaluationService initialized successfully")
    # evaluation_config already is the 'llm_method' section, so read 'model' directly
    print(f"Service configured with: {evaluation_config.get('model', 'default model')}")
except Exception as e:
    print(f"\n❌ Error initializing EvaluationService: {e}")
    evaluation_service = None

## 4. Create Ground Truth Data (Optional)

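For demonstration purposes, we'll create sample ground truth data derived from the pipeline's own extraction results (for at most the first three sections), so the resulting scores will be close to perfect and only illustrate the workflow. In a real scenario, you would load actual ground truth data from your test dataset.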

In [None]:
# Gather every section that points at a stored extraction result
sections_with_extractions = [
    section
    for section in document.sections
    if getattr(section, 'extraction_result_uri', None)
]

print(f"Found {len(sections_with_extractions)} sections with extraction results")

# Start empty so the mapping exists whether or not any section qualifies
expected_results = {}

if not sections_with_extractions:
    print("\nNo extraction results found - skipping ground truth creation")
else:
    # Sample ground truth is derived from the actual extraction results
    print("\nCreating sample ground truth based on actual extractions...")

    for section in sections_with_extractions[:3]:  # Only the first 3 sections are used
        try:
            stored_result = load_json_from_s3(section.extraction_result_uri)

            # Results without an inference_result are silently skipped
            if 'inference_result' not in stored_result:
                continue

            inferred = stored_result['inference_result']
            baseline = inferred if isinstance(inferred, dict) else {}

            # Strings longer than 10 characters get their whitespace normalized
            # (double spaces collapsed once, ends stripped); every other value,
            # numbers included, is kept exactly as extracted.
            expected_results[section.section_id] = {
                field: (
                    field_value.replace("  ", " ").strip()
                    if isinstance(field_value, str) and len(field_value) > 10
                    else field_value
                )
                for field, field_value in baseline.items()
            }
            print(f"  Created ground truth for section {section.section_id} ({section.classification})")

        except Exception as e:
            print(f"  Error creating ground truth for section {section.section_id}: {e}")

    print(f"\nCreated ground truth for {len(expected_results)} sections")

## 5. Run Evaluation Using EvaluationService

In [None]:
# Assume "not completed" until an evaluation actually succeeds, so later cells
# can always read this flag even if this cell stops early with an error.
evaluation_completed = False

if not evaluation_service:
    print("⚠️ EvaluationService not available - skipping evaluation")
elif not expected_results:
    print("⚠️ No ground truth data available - skipping evaluation")
else:
    print("=== Running Document Evaluation ===")

    # Build the expected (ground truth) counterpart of the processed document
    print("Creating ground truth document...")
    expected_document = create_ground_truth_document(document, expected_results)

    # Run evaluation using EvaluationService and time the call
    print("\nRunning evaluation with EvaluationService...")
    start_time = time.time()

    try:
        # The service returns the document that the following cells read the
        # evaluation report URI and evaluation result from.
        document = evaluation_service.evaluate_document(
            actual_document=document,
            expected_document=expected_document
        )
    except Exception as e:
        # `document` keeps its previous value when the evaluation fails
        print(f"❌ Error running evaluation: {e}")
    else:
        evaluation_time = time.time() - start_time
        print(f"✅ Evaluation completed in {evaluation_time:.2f} seconds")
        evaluation_completed = True

## 6. Display Evaluation Results

In [None]:
# Only show results when the evaluation succeeded and produced a report location
if evaluation_completed and getattr(document, 'evaluation_report_uri', None):
    report_uri = document.evaluation_report_uri
    banner = "=" * 60

    print("=== Evaluation Results ===")

    print(f"📊 Evaluation report URI: {report_uri}")

    # Metrics summary: overall first, then one entry per section
    if not getattr(document, 'evaluation_result', None):
        print("❌ No evaluation result found in document")
    else:
        eval_result = document.evaluation_result
        print(f"\n📈 Evaluation Summary - Overall:")
        print(f"{eval_result.overall_metrics}")
        print(f"\n📈 Evaluation Summary - Per Section:")
        for section_result in eval_result.section_results:
            print(f"\n📈 Evaluation Summary - Section {section_result.section_id}:")
            print(f"{section_result.metrics}")

    try:
        # Fetch the report text from S3 and render it as Markdown
        print("Reading evaluation report from S3...")
        report_bucket, report_key = parse_s3_uri(report_uri)
        s3_object = boto3.client('s3').get_object(Bucket=report_bucket, Key=report_key)
        report_content = s3_object['Body'].read().decode('utf-8')

        print(f"📄 Successfully loaded report from {report_uri}")

        print("\n" + banner)
        print("📋 EVALUATION REPORT")
        print(banner)

        display(Markdown(report_content))

    except Exception as e:
        print(f"❌ Error loading evaluation report: {e}")
        

## 7. Save Final Evaluation Results

In [None]:
# Make sure the output directory for this step exists
data_dir = Path(".data/step6_evaluation")
data_dir.mkdir(parents=True, exist_ok=True)

# Mark the document as completed before it is written out
document.status = Status.COMPLETED

# Output locations inside the step directory
document_path = data_dir / "document.json"
config_path = data_dir / "config.json"
env_path = data_dir / "environment.json"

# The document provides its own JSON serialization
with open(document_path, 'w') as out_file:
    out_file.write(document.to_json())

# Configuration and environment info are passed through as indented JSON
for target_path, payload in ((config_path, CONFIG), (env_path, env_info)):
    with open(target_path, 'w') as out_file:
        json.dump(payload, out_file, indent=2)

for label, saved_path in (
    ("final document", document_path),
    ("configuration", config_path),
    ("environment info", env_path),
):
    print(f"Saved {label} to: {saved_path}")


## 8. Final Summary

In [None]:
# Collect the whole summary first, then print it line by line
summary_lines = [
    "=== Step 6: Evaluation Complete ===",
    f"✅ Document processed: {document.id}",
    f"✅ Pages processed: {getattr(document, 'num_pages', 0)}",
    f"✅ Sections identified: {len(document.sections) if document.sections else 0}",
]

# The flag only exists if the evaluation cell ran; a missing flag counts as "not completed"
if locals().get('evaluation_completed', False):
    summary_lines.append("✅ EvaluationService analysis completed successfully")
    if getattr(document, 'evaluation_report_uri', None):
        summary_lines.append(f"✅ Evaluation report generated: {document.evaluation_report_uri}")
else:
    summary_lines.append("✅ Performance analysis completed (no ground truth evaluation)")

summary_lines.append("✅ Results saved to: .data/step6_evaluation/")

# Static closing text: pipeline recap, benefits, and suggested next steps
summary_lines.extend([
    "\n=== 🎉 MODULAR IDP PIPELINE COMPLETE! 🎉 ===",
    "\nAll steps have been successfully executed:",
    "  0️⃣ Setup - Environment and document initialization",
    "  1️⃣ OCR - Text and image extraction from PDF",
    "  2️⃣ Classification - Document type identification",
    "  3️⃣ Extraction - Structured data extraction",
    "  4️⃣ Assessment - Confidence evaluation",
    "  5️⃣ Summarization - Content summarization",
    "  6️⃣ Evaluation - Final analysis and reporting",
    "\n📊 Key Benefits of Modular Approach:",
    "  • Independent step execution and testing",
    "  • Modular configuration management",
    "  • Step-by-step result persistence",
    "  • Easy experimentation with different configurations",
    "  • Comprehensive evaluation and reporting",
    "  • Professional-grade EvaluationService integration",
    "\n🔧 Next Steps for Experimentation:",
    "  • Modify config files to try different models or parameters",
    "  • Add new document classes in classes.yaml",
    "  • Run individual steps with different configurations",
    "  • Compare results across different pipeline runs",
    "  • Experiment with different confidence thresholds",
    "  • Provide real ground truth data for accuracy evaluation",
])

for summary_line in summary_lines:
    print(summary_line)