# Step 5: Summarization

This notebook performs document summarization using Amazon Bedrock to create comprehensive summaries.

**Inputs:**
- Document object with assessment results from Step 4
- Summarization configuration
- Document sections for content summarization

**Outputs:**
- Document with summarization results
- Section-level summaries and combined document summary
- Both JSON and Markdown formatted summaries

## 1. Load Previous Step Data

In [None]:
import os
import json
import time
import logging
import boto3
import copy
from pathlib import Path
from IPython.display import Markdown, display

# Import IDP libraries
from idp_common.models import Document, Status
from idp_common import summarization
from idp_common import s3

# Logging: root logger at WARNING, the two idp_common loggers used in this step at INFO
logging.basicConfig(level=logging.WARNING)
for logger_name in ('idp_common.summarization', 'idp_common.bedrock.client'):
    logging.getLogger(logger_name).setLevel(logging.INFO)

print("Libraries imported successfully")

In [None]:
# Load the Step 4 outputs (document + environment info) and the local YAML configuration.
assessment_data_dir = Path(".data/step4_assessment")

# Rebuild the Document object from its JSON serialization
document_path = assessment_data_dir / "document.json"
with open(document_path, 'r') as f:
    document = Document.from_json(f.read())

# Load configuration directly from config files
import yaml
config_dir = Path("config")
CONFIG = {}

# Files are merged in list order; on duplicate top-level keys the later file wins
config_files = [
    "summarization.yaml",
    "classes.yaml"
]

for config_file in config_files:
    config_path = config_dir / config_file
    if config_path.exists():
        with open(config_path, 'r') as f:
            # safe_load() returns None for an empty (or comment-only) file;
            # fall back to {} so CONFIG.update() does not raise TypeError
            file_config = yaml.safe_load(f) or {}
            CONFIG.update(file_config)
        print(f"Loaded {config_file}")
    else:
        # A missing file is reported but not fatal; CONFIG simply lacks its keys
        print(f"Warning: {config_file} not found")

# Load environment info
env_path = assessment_data_dir / "environment.json"
with open(env_path, 'r') as f:
    env_info = json.load(f)

# Export settings as environment variables for the rest of the notebook session
os.environ['AWS_REGION'] = env_info['region']
os.environ['METRIC_NAMESPACE'] = 'IDP-Modular-Pipeline'

print(f"Loaded document: {document.id}")
print(f"Document status: {document.status.value}")
print(f"Number of sections: {len(document.sections) if document.sections else 0}")
print(f"Loaded configuration sections: {list(CONFIG.keys())}")

## 2. Configure Summarization Service

In [None]:
# Pull the 'summarization' section out of the merged configuration ({} if absent)
summarization_config = CONFIG.get('summarization', {})
print("Summarization Configuration:")

# Scalar settings, printed as "<label>: <value>" (None when the key is missing)
for label, key in (
    ("Model", 'model'),
    ("Temperature", 'temperature'),
    ("Max Tokens", 'max_tokens'),
    ("Default Confidence Threshold", 'default_confidence_threshold'),
):
    print(f"{label}: {summarization_config.get(key)}")

# Prompts are multi-line, so each one is fenced by a row of asterisks
separator = "*"*50
print(separator)
for label, key in (
    ("System Prompt", 'system_prompt'),
    ("Task Prompt", 'task_prompt'),
):
    print(f"{label}:\n{summarization_config.get(key)}")
    print(separator)

# List the sections that the following cells can summarize
if not document.sections:
    print("\nNo sections available for summarization")
else:
    print("\nSections available for summarization:")
    for section in document.sections:
        print(f"  - Section {section.section_id}: {section.classification} (Pages: {section.page_ids})")

In [None]:
# Create summarization service.
# Note: the service receives the full merged CONFIG dict, not only the
# 'summarization' sub-section extracted into summarization_config.
summarization_service = summarization.SummarizationService(config=CONFIG)

print("Summarization service initialized")

## 3. Individual Section Summarization

In [None]:
print("=== PART 1: Processing Individual Sections ===")

if document.sections:
    # One entry per processed section: ids, elapsed time and the metering returned by the service
    section_summarization_results = []

    # Demo shortcut: only the first three sections (or fewer) are summarized here
    n = min(3, len(document.sections))
    print(f"Processing first {n} of {len(document.sections)} sections...")

    # The slice is taken once up front; `document` itself is rebound inside the loop
    for position, section in enumerate(document.sections[:n], start=1):
        print(f"\n--- Processing Section {position}/{n} ---")
        print(f"Section ID: {section.section_id}")
        print(f"Classification: {section.classification}")
        print(f"Pages: {section.page_ids}")

        # Summarize this section and keep the document returned by the service
        started_at = time.time()
        document, section_metering = summarization_service.process_document_section(
            document=document,
            section_id=section.section_id
        )
        summarization_time = time.time() - started_at

        print(f"Summarization completed in {summarization_time:.2f} seconds")

        section_summarization_results.append(dict(
            section_id=section.section_id,
            classification=section.classification,
            processing_time=summarization_time,
            metering=section_metering,
        ))

    print(f"\nSection summarization complete for {n} sections.")
else:
    print("No sections found in document. Cannot proceed with summarization.")

## 4. Display Individual Section Summaries

In [None]:
def _preview(value, limit=300):
    """Return ``value`` as text, cut to ``limit`` characters plus "..." when longer.

    Non-string values are converted with ``str()`` first, so slicing and
    concatenation cannot fail on dict/list payloads.
    """
    text = value if isinstance(value, str) else str(value)
    return text[:limit] + "..." if len(text) > limit else text


print("=== Individual Section Summaries ===")

if document.sections:
    # Same demo limit as the processing cell: look at the first three sections at most
    n = min(3, len(document.sections))

    for section in document.sections[:n]:
        print(f"\n--- Section {section.section_id} ({section.classification}) ---")

        # A section counts as summarized when its attributes carry a 'summary_uri'
        if section.attributes and 'summary_uri' in section.attributes:
            summary_uri = section.attributes['summary_uri']
            summary_md_uri = section.attributes.get('summary_md_uri')

            print(f"JSON Summary URI: {summary_uri}")
            if summary_md_uri:
                print(f"Markdown Summary URI: {summary_md_uri}")

            # Get and display JSON summary
            try:
                summary_content = s3.get_json_content(summary_uri)
                print("\nJSON Summary Content:")

                if isinstance(summary_content, dict):
                    # Prefer the 'summary' field, then 'content', else dump the whole object
                    if 'summary' in summary_content:
                        print(_preview(summary_content['summary']))
                    elif 'content' in summary_content:
                        print(_preview(summary_content['content']))
                    else:
                        print(_preview(json.dumps(summary_content, indent=2)))
                else:
                    print(summary_content)
            except Exception as e:
                # Best-effort display: report the problem and continue with the next item
                print(f"Error retrieving JSON summary: {e}")

            # Get and display Markdown summary if available
            if summary_md_uri:
                try:
                    markdown_content = s3.get_text_content(summary_md_uri)
                    print("\nMarkdown Summary (first 300 chars):")
                    print(_preview(markdown_content))

                    # Only short summaries (<= 500 chars) are rendered, to keep the output readable
                    if len(markdown_content) <= 500:
                        print("\nRendered Markdown Summary:")
                        display(Markdown(markdown_content))
                except Exception as e:
                    print(f"Error retrieving markdown summary: {e}")
        else:
            print("No summary available for this section")
else:
    print("No sections to display")

## 5. Document-Level Summarization

In [None]:
print("=== PART 2: Processing Document with Sections ===")

# Run on a deep copy; `document` is rebound only after process_document() has returned
document_with_sections = copy.deepcopy(document)

# Summarize the whole document (section-based) and time the call
run_started = time.time()
document_with_sections = summarization_service.process_document(
    document=document_with_sections,
    store_results=True,
)
document_summarization_time = time.time() - run_started

print(f"Document summarization completed in {document_summarization_time:.2f} seconds")

# From here on the notebook works with the processed document
document = document_with_sections

## 6. Display Combined Document Summary

In [None]:
print("=== Combined Document Summary ===")

if document.summary_report_uri:
    print(f"Combined Summary Report URI: {document.summary_report_uri}")

    try:
        # Split "s3://<bucket>/<key>" into bucket and key
        uri_parts = document.summary_report_uri.replace("s3://", "").split("/", 1)
        bucket = uri_parts[0]
        key = uri_parts[1]

        # Use boto3 to get the object directly
        s3_client = boto3.client('s3')
        response = s3_client.get_object(Bucket=bucket, Key=key)
        markdown_content = response['Body'].read().decode('utf-8')

        # Display a preview of the summary
        print("\nSummary Preview (first 500 chars):")
        print(markdown_content[:500] + "..." if len(markdown_content) > 500 else markdown_content)

        # Render at most the first 2000 characters, and announce truncation
        # only when something was actually cut off
        render_limit = 2000
        display(Markdown(markdown_content[:render_limit]))
        if len(markdown_content) > render_limit:
            print("...<truncated>")

        # Look for a JSON summary under the same key with "summary.md" swapped for "summary.json"
        json_key = key.replace("summary.md", "summary.json")
        try:
            json_response = s3_client.get_object(Bucket=bucket, Key=json_key)
            summary_json = json.loads(json_response['Body'].read().decode('utf-8'))

            # Check for section summaries
            if 'metadata' in summary_json and 'section_summaries' in summary_json['metadata']:
                section_summaries = summary_json['metadata']['section_summaries']
                print(f"\nDocument contains summaries for sections: {list(section_summaries.keys())}")
        except Exception as e:
            # The JSON companion is optional; report and carry on
            print(f"Note: JSON summary not found or couldn't be parsed: {e}")

    except Exception as e:
        # Covers malformed URIs (e.g. no key part) as well as S3 access errors
        print(f"Error retrieving combined summary: {e}")
else:
    print("No combined document summary available")

## 7. Save Results for Next Step

In [None]:
# Output directory for this step (created on demand, parents included)
data_dir = Path(".data/step5_summarization")
data_dir.mkdir(parents=True, exist_ok=True)

# Persist the updated document object as JSON
document_path = data_dir / "document.json"
with document_path.open('w') as f:
    f.write(document.to_json())

# Configuration is passed through unchanged
config_path = data_dir / "config.json"
with config_path.open('w') as f:
    json.dump(CONFIG, f, indent=2)

# Environment info is passed through unchanged
env_path = data_dir / "environment.json"
with env_path.open('w') as f:
    json.dump(env_info, f, indent=2)

# Earlier cells may have been skipped, so check which of their results exist
have_section_results = 'section_summarization_results' in locals()
have_document_timing = 'document_summarization_time' in locals()
all_sections = document.sections or []

# One record per section describing whether (and where) a summary was stored
section_summary_index = []
for section in all_sections:
    attrs = section.attributes
    section_summary_index.append({
        'section_id': section.section_id,
        'classification': section.classification,
        'has_summary': bool(attrs) and 'summary_uri' in attrs,
        'summary_uri': attrs.get('summary_uri') if attrs else None,
        'summary_md_uri': attrs.get('summary_md_uri') if attrs else None,
    })

# Summarization-specific results summary for this step
summarization_summary = {
    'model_used': summarization_config.get('model'),
    'section_based_summarization': summarization_config.get('section_based_summarization'),
    'output_formats': summarization_config.get('output_formats', []),
    'sections_processed': len(section_summarization_results) if have_section_results else 0,
    'total_sections': len(all_sections),
    'document_processing_time': document_summarization_time if have_document_timing else None,
    'section_results': section_summarization_results if have_section_results else [],
    'document_summary_uri': document.summary_report_uri,
    'sections_with_summaries': section_summary_index,
}

summarization_summary_path = data_dir / "summarization_summary.json"
with summarization_summary_path.open('w') as f:
    json.dump(summarization_summary, f, indent=2)

print(f"Saved document to: {document_path}")
print(f"Saved configuration to: {config_path}")
print(f"Saved environment info to: {env_path}")
print(f"Saved summarization summary to: {summarization_summary_path}")

## 8. Summary

In [None]:
# Gather the figures reported below
all_sections = document.sections or []
total_sections = len(all_sections)

# The per-section cell may not have run, in which case nothing was processed
sections_processed = len(section_summarization_results) if 'section_summarization_results' in locals() else 0
sections_with_summaries = len([
    section for section in all_sections
    if section.attributes and 'summary_uri' in section.attributes
])
has_document_summary = document.summary_report_uri is not None
document_summary_flag = 'Yes' if has_document_summary else 'No'

# Final report for this step
print("=== Step 5: Summarization Complete ===")
print(f"✅ Document processed: {document.id}")
print(f"✅ Individual sections processed: {sections_processed} of {total_sections}")
print(f"✅ Sections with summaries: {sections_with_summaries}")
print(f"✅ Document-level summary: {document_summary_flag}")
print(f"✅ Model used: {summarization_config.get('model')}")
print(f"✅ Output formats: {', '.join(summarization_config.get('output_formats', []))}")
print("✅ Data saved to: .data/step5_summarization/")
print("\n📌 Next step: Run step6_evaluation.ipynb")