# Step 3: Information Extraction with Custom Lambda Demonstration

This notebook demonstrates the **custom prompt generator Lambda feature** for Patterns 2 and 3. It shows how to:

- Configure custom Lambda functions for extraction prompt generation
- Compare default vs custom prompt extraction results
- Inspect Lambda payloads and responses
- Handle errors and monitor performance

**Prerequisites:**
- Completed Step 2 (Classification)
- AWS Lambda permissions to create/invoke functions
- Demo Lambda function deployed (Section 3 below checks for it and prints the deploy command if it is missing)

**Key Feature:**
The `custom_prompt_lambda_arn` configuration field allows you to inject custom business logic into the extraction process while leveraging the existing IDP infrastructure.

## 1. Setup and Import Libraries

In [None]:
import copy
import json
import logging
import os
import time
from pathlib import Path

import boto3
import yaml

# Import IDP libraries
from idp_common.models import Document, Status
from idp_common import extraction

# Emit INFO-level records so Lambda invocation details show up in the cell output.
logging.basicConfig(level=logging.INFO)

# Pin the extraction and Bedrock-client loggers to INFO explicitly, independent
# of whatever level the root logger ends up with.
for logger_name in ("idp_common.extraction", "idp_common.bedrock.client"):
    logging.getLogger(logger_name).setLevel(logging.INFO)

print("Libraries imported successfully")

## 2. Load Previous Step Data

In [None]:
# Load the artifacts stored under the Step 2 (classification) data directory.
classification_data_dir = Path(".data/step2_classification")

# Rebuild the Document object from its JSON serialization.
document_path = classification_data_dir / "document.json"
with open(document_path, 'r', encoding='utf-8') as f:
    document = Document.from_json(f.read())

# Build CONFIG by merging the top-level keys of each YAML file, in order.
# A key present in more than one file takes the value from the later file.
config_dir = Path("config")
CONFIG = {}

config_files = [
    "extraction.yaml",
    "classes.yaml",
]

for config_file in config_files:
    config_path = config_dir / config_file
    if config_path.exists():
        with open(config_path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty file; fall back to an empty
            # mapping so CONFIG.update() does not raise TypeError.
            file_config = yaml.safe_load(f) or {}
        CONFIG.update(file_config)
        print(f"Loaded {config_file}")
    else:
        # A missing file is reported but not fatal; later cells simply see
        # fewer configuration sections.
        print(f"Warning: {config_file} not found")

# Load the environment info saved alongside the classified document.
env_path = classification_data_dir / "environment.json"
with open(env_path, 'r', encoding='utf-8') as f:
    env_info = json.load(f)

# Export the settings this demo passes on through environment variables.
os.environ['AWS_REGION'] = env_info['region']
os.environ['METRIC_NAMESPACE'] = 'IDP-Custom-Lambda-Demo'

print(f"Loaded document: {document.id}")
print(f"Document status: {document.status.value}")
print(f"Number of sections: {len(document.sections) if document.sections else 0}")
print(f"Loaded configuration sections: {list(CONFIG.keys())}")

## 3. Configure Lambda ARN (Set Your Function ARN Here)

In [None]:
# 🔧 CONFIGURATION: Set your custom Lambda ARN here
# Replace None below with your actual Lambda function ARN for live testing.
# When an ARN is set here it is used as-is and the auto-discovery of the demo
# function is skipped; when left as None the demo function is looked up by name.

# Example ARNs (replace with your actual ARN):
# DEMO_LAMBDA_ARN = "arn:aws:lambda:us-east-1:123456789012:function:GENAIIDP-notebook-demo-extractor"
# DEMO_LAMBDA_ARN = "arn:aws:lambda:us-east-1:123456789012:function:GENAIIDP-my-custom-extractor"
DEMO_LAMBDA_ARN = None

# Name of the demo function that auto-discovery looks for.
DEMO_FUNCTION_NAME = 'GENAIIDP-notebook-demo-extractor'

lambda_client = boto3.client('lambda')

if not DEMO_LAMBDA_ARN:
    # No ARN supplied manually: check whether the demo function exists.
    try:
        response = lambda_client.get_function(FunctionName=DEMO_FUNCTION_NAME)
        DEMO_LAMBDA_ARN = response['Configuration']['FunctionArn']
        print(f"✅ Found demo Lambda function: {DEMO_LAMBDA_ARN}")
    except lambda_client.exceptions.ResourceNotFoundException:
        print(f"⚠️  Demo Lambda function not found: {DEMO_FUNCTION_NAME}")
        print("💡 Deploy using: cd notebooks/examples/demo-lambda && sam deploy --guided")
    except Exception as e:
        # Any other failure (credentials, permissions, networking, ...) is
        # reported and the notebook continues without a custom Lambda.
        print(f"Error checking Lambda function: {e}")

if not DEMO_LAMBDA_ARN:
    print("⚠️  No custom Lambda ARN configured")
    print("💡 This demo will show standard extraction and simulate custom Lambda behavior")
    print("🔧 To test with a real Lambda, set DEMO_LAMBDA_ARN above")
else:
    print(f"✅ Custom Lambda ARN configured: {DEMO_LAMBDA_ARN}")
    print("🚀 This demo will use your custom Lambda function")

## 4. Extraction Comparison: Default vs Custom Lambda

### 4.1 Default Extraction (Without Custom Lambda)

In [None]:
# Create configuration WITHOUT custom Lambda.
# Deep-copy CONFIG: a shallow copy would share the nested 'extraction' dict,
# so removing the key below would also remove it from CONFIG itself.
config_default = copy.deepcopy(CONFIG)
config_default.get('extraction', {}).pop('custom_prompt_lambda_arn', None)

print("=== DEFAULT EXTRACTION CONFIGURATION ===")
print(f"Model: {config_default.get('extraction', {}).get('model')}")
print(f"Custom Lambda: {config_default.get('extraction', {}).get('custom_prompt_lambda_arn', 'None')}")

# Create extraction service with default config
extraction_service_default = extraction.ExtractionService(config=config_default)
print("\n✅ Default extraction service initialized")

In [None]:
# Run default extraction on the first section.
# Define the result names on every path so later cells can test them for None
# instead of hitting a NameError when the document has no sections.
default_section_result = None
default_extraction_time = None

if document.sections:
    first_section = document.sections[0]
    print(f"🔄 Processing section {first_section.section_id} with DEFAULT prompts")
    print(f"Classification: {first_section.classification}")
    print(f"Pages: {first_section.page_ids}")

    # Work on a JSON round-trip copy so `document` stays as loaded and can be
    # reused as the starting point for the custom-Lambda run.
    document_default = Document.from_json(document.to_json())

    # Process with default extraction, timing the call for later comparison.
    start_time = time.time()
    document_default = extraction_service_default.process_document_section(
        document=document_default,
        section_id=first_section.section_id
    )
    default_extraction_time = time.time() - start_time

    print(f"✅ Default extraction completed in {default_extraction_time:.2f} seconds")

    # Keep the processed section for comparison (None if it is not found).
    default_section_result = next(
        (
            section
            for section in document_default.sections
            if section.section_id == first_section.section_id
        ),
        None,
    )
else:
    print("⚠️ No sections found in document")

### 4.2 Custom Lambda Extraction

In [None]:
if DEMO_LAMBDA_ARN:
    # Create configuration WITH custom Lambda.
    # Deep-copy CONFIG: a shallow copy would share the nested 'extraction'
    # dict, so the ARN written below would leak into CONFIG and into any
    # other shallow copy of it (including the default configuration).
    config_custom = copy.deepcopy(CONFIG)
    # setdefault keeps this working even when no 'extraction' section was loaded.
    config_custom.setdefault('extraction', {})['custom_prompt_lambda_arn'] = DEMO_LAMBDA_ARN

    print("=== CUSTOM LAMBDA EXTRACTION CONFIGURATION ===")
    print(f"Model: {config_custom.get('extraction', {}).get('model')}")
    print(f"Custom Lambda: {DEMO_LAMBDA_ARN}")
    # The function name is the last ':'-separated component of the ARN.
    print(f"Lambda Function Name: {DEMO_LAMBDA_ARN.split(':')[-1]}")

    # Create extraction service with custom Lambda config
    extraction_service_custom = extraction.ExtractionService(config=config_custom)

    print("\n✅ Custom Lambda extraction service initialized")

else:
    print("⚠️ No custom Lambda ARN configured - skipping custom Lambda demonstration")
    config_custom = None
    extraction_service_custom = None

In [None]:
# Run custom Lambda extraction on the first section.
if DEMO_LAMBDA_ARN and document.sections:
    first_section = document.sections[0]
    print(f"🔄 Processing section {first_section.section_id} with CUSTOM LAMBDA prompts")
    print(f"Classification: {first_section.classification}")
    print(f"Pages: {first_section.page_ids}")

    # Create a fresh document copy for custom processing.
    document_custom = Document.from_json(document.to_json())

    # Results stay None unless the extraction call succeeds.
    custom_section_result = None
    custom_extraction_time = None

    start_time = time.time()

    # Only the extraction call sits inside the try, so a problem in the
    # reporting code below is never misreported as a Lambda failure and never
    # discards a successful result.
    try:
        document_custom = extraction_service_custom.process_document_section(
            document=document_custom,
            section_id=first_section.section_id
        )
    except Exception as e:
        print(f"❌ Custom Lambda extraction failed: {e}")
        print("\n🔍 This demonstrates the fail-fast error handling behavior")
    else:
        custom_extraction_time = time.time() - start_time

        print(f"✅ Custom Lambda extraction completed in {custom_extraction_time:.2f} seconds")

        # Keep the processed section for comparison (None if it is not found).
        custom_section_result = next(
            (
                section
                for section in document_custom.sections
                if section.section_id == first_section.section_id
            ),
            None,
        )

        # Performance comparison. The baseline is only available when the
        # default-extraction cell has run; a missing or zero baseline would
        # make the subtraction or the percentage fail, so skip it then.
        baseline_time = globals().get('default_extraction_time')
        if baseline_time:
            overhead = custom_extraction_time - baseline_time
            print("\n📊 Performance Comparison:")
            print(f"   Default: {baseline_time:.2f}s")
            print(f"   Custom:  {custom_extraction_time:.2f}s")
            print(f"   Lambda Overhead: {overhead:.2f}s ({overhead/baseline_time*100:.1f}% increase)")
        else:
            print("\n📊 Performance comparison skipped: no default extraction timing available")

else:
    print("⚠️ Skipping custom Lambda extraction (no Lambda configured or no sections)")
    document_custom = None
    custom_section_result = None
    custom_extraction_time = None

## 5. Results and Summary

In [None]:
# Final summary of what the demo did, plus next steps.
print("=== DEMO COMPLETE: SUMMARY ===")

# Only the first section is processed by this demo.
sections_processed = 1 if document.sections else 0
lambda_used = DEMO_LAMBDA_ARN is not None

print("\n✅ DEMO RESULTS:")
print(f"   📄 Document processed: {document.id}")
print(f"   📊 Sections processed: {sections_processed}")
print(f"   🔧 Custom Lambda used: {'Yes' if lambda_used else 'No (simulated)'}")

# Either timing may be undefined (cell not run) or None (run skipped or
# failed); report the overhead only when both are real numbers.
summary_default_time = globals().get('default_extraction_time')
summary_custom_time = globals().get('custom_extraction_time')
if lambda_used and summary_default_time is not None and summary_custom_time is not None:
    print(f"   ⏱️  Performance overhead: {summary_custom_time - summary_default_time:.2f}s")

print("\n🚀 TO IMPLEMENT CUSTOM LAMBDA IN PRODUCTION:")
print("   1. 📝 Create your Lambda function with GENAIIDP-* naming")
print("   2. 🔐 Deploy with appropriate IAM role and permissions")
print("   3. ⚙️  Add 'custom_prompt_lambda_arn' to your extraction config")
print("   4. 🧪 Test with your actual documents and use cases")
print("   5. 📊 Monitor CloudWatch logs for performance and errors")

print("\n📚 RESOURCES:")
print("   📖 Documentation: notebooks/examples/demo-lambda/README.md")
print("   🔧 Demo Lambda: notebooks/examples/demo-lambda/GENAIIDP-notebook-demo-extractor.py")
print("   ☁️  Deploy: cd notebooks/examples/demo-lambda && sam deploy --guided")

print("\n📌 CONTINUE TO: step4_assessment.ipynb")