# Step 2: Document Classification

This notebook classifies a document using Amazon Bedrock, identifying the document type of each page and grouping the pages into sections.

**Inputs:**
- Document object with OCR results from Step 1
- Classification configuration
- Document class definitions

**Outputs:**
- Document with classification results
- Identified document sections and their types
- Page-level classifications

## 1. Load Previous Step Data

In [None]:
import os
import json
import time
import logging
from pathlib import Path

# Import IDP libraries
from idp_common.models import Document, Status
from idp_common import classification

# Logging: WARNING by default, with the two idp_common loggers listed below
# raised to INFO.
logging.basicConfig(level=logging.WARNING)
for logger_name in ('idp_common.classification', 'idp_common.bedrock.client'):
    logging.getLogger(logger_name).setLevel(logging.INFO)

print("Libraries imported successfully")

In [None]:
# Load the Step 1 outputs (document + environment info) and this step's YAML
# configuration, then export the environment variables set below.
import yaml

# Load document from previous step
ocr_data_dir = Path(".data/step1_ocr")

# Load document object from JSON
document_path = ocr_data_dir / "document.json"
with open(document_path, 'r') as f:
    document = Document.from_json(f.read())

# Load configuration directly from config files; the top-level keys of each
# file are merged into a single CONFIG dict (later files win on key clashes).
config_dir = Path("config")
CONFIG = {}

config_files = [
    "classification.yaml",
    "classes.yaml",
]

for config_file in config_files:
    config_path = config_dir / config_file
    if not config_path.exists():
        # Best-effort: a missing file is reported but does not stop the cell.
        print(f"Warning: {config_file} not found")
        continue

    # YAML streams are Unicode (UTF-8 unless a BOM says otherwise), so do not
    # rely on the platform's default encoding when reading them.
    with open(config_path, 'r', encoding='utf-8') as f:
        file_config = yaml.safe_load(f)

    # safe_load() yields None for an empty file; CONFIG.update(None) would
    # raise an opaque TypeError, so report it and move on instead.
    if file_config is None:
        print(f"Warning: {config_file} is empty")
        continue

    CONFIG.update(file_config)
    print(f"Loaded {config_file}")

# Load environment info
env_path = ocr_data_dir / "environment.json"
with open(env_path, 'r') as f:
    env_info = json.load(f)

# Set environment variables
os.environ['AWS_REGION'] = env_info['region']
os.environ['METRIC_NAMESPACE'] = 'IDP-Modular-Pipeline'

print(f"Loaded document: {document.id}")
print(f"Document status: {document.status.value}")
print(f"Number of pages: {document.num_pages}")
print(f"Loaded configuration sections: {list(CONFIG.keys())}")

## 2. Configure Classification Service

In [None]:
# Pull out the 'classification' section and print the settings it carries
classification_config = CONFIG.get('classification', {})
print("Classification Configuration:")
for label, key in (
    ("Model", 'model'),
    ("Classification Method", 'classificationMethod'),
    ("Temperature", 'temperature'),
    ("Max Tokens", 'max_tokens'),
):
    print(f"{label}: {classification_config.get(key)}")

# Prompts are printed between rows of asterisks so they are easy to spot
separator = "*"*50
print(separator)
print(f"System Prompt:\n{classification_config.get('system_prompt')}")
print(separator)
print(f"Task Prompt:\n{classification_config.get('task_prompt')}")
print(separator)

# List every configured document class with its description
classes = CONFIG.get('classes', [])
print(f"\nAvailable Document Classes: {len(classes)}")
for cls in classes:
    class_name = cls['name']
    class_description = cls['description']
    print(f"- {class_name}: {class_description}")

# Echo the configured classification method once more so it stands out;
# this only prints the value, it does not validate it.
print("\n*****************************************************************")
print(f'CONFIG classificationMethod: {classification_config.get("classificationMethod")}')
print("*****************************************************************")

In [None]:
# Build the classification service from the merged CONFIG, selecting the
# "bedrock" backend.
classification_service = classification.ClassificationService(config=CONFIG, backend="bedrock")

print("Classification service initialized")

## 3. Classify the Document

In [None]:
# Classify the document and report how long the call took.
print("Classifying document...")
# perf_counter() is monotonic, so the measured duration cannot be skewed (or go
# negative) if the system clock is adjusted while classification is running.
start_time = time.perf_counter()

# classify_document returns the document; rebind it so later cells see the result
document = classification_service.classify_document(document)

classification_time = time.perf_counter() - start_time
print(f"Classification completed in {classification_time:.2f} seconds")
print(f"Document status: {document.status.value}")

## 4. Display Classification Results

In [None]:
# Section-level results: one entry per detected section, with its pages and,
# when the section carries one, the reason for the classification.
if not document.sections:
    print("\nNo sections detected")
else:
    print("\nDetected sections:")
    for section in document.sections:
        print(f"Section {section.section_id}: {section.classification}")
        print(f"  Pages: {section.page_ids}")
        reason = getattr(section, 'reason', None)
        if reason:
            print(f"  Reason: {reason}")
        print()

# Page-level results, listed in sorted page-id order
print("\nPage-level classifications:")
for page_id in sorted(document.pages):
    page = document.pages[page_id]
    print(f"Page {page_id}: {page.classification}")

## 5. Save Results for Next Step

In [None]:
# Everything the next notebook needs is written under this step's directory
data_dir = Path(".data/step2_classification")
data_dir.mkdir(parents=True, exist_ok=True)

# The classified document, serialized through its own to_json()
document_path = data_dir / "document.json"
with document_path.open('w') as f:
    f.write(document.to_json())

# Configuration and environment info are passed through unchanged
config_path = data_dir / "config.json"
env_path = data_dir / "environment.json"
for target_path, payload in ((config_path, CONFIG), (env_path, env_info)):
    with target_path.open('w') as f:
        json.dump(payload, f, indent=2)

# Summary of what this step produced: timing, method/model, sections, pages
section_summaries = []
for section in (document.sections or []):
    section_summaries.append({
        'section_id': section.section_id,
        'classification': section.classification,
        'page_ids': section.page_ids,
        'reason': getattr(section, 'reason', None),
    })

page_classifications = {}
for page_id, page in document.pages.items():
    page_classifications[page_id] = page.classification

classification_results = {
    'processing_time_seconds': classification_time,
    'classification_method': classification_config.get('classificationMethod'),
    'model_used': classification_config.get('model'),
    'num_sections': len(document.sections or []),
    'sections': section_summaries,
    'page_classifications': page_classifications,
}

classification_results_path = data_dir / "classification_results.json"
with classification_results_path.open('w') as f:
    json.dump(classification_results, f, indent=2)

print(f"Saved document to: {document_path}")
print(f"Saved configuration to: {config_path}")
print(f"Saved environment info to: {env_path}")
print(f"Saved classification results to: {classification_results_path}")

## 6. Summary

In [None]:
# Final recap of this step: what was classified, how, how long it took,
# where the outputs were written, and which notebook to run next.
summary_lines = [
    "=== Step 2: Classification Complete ===",
    f"✅ Document classified: {document.id}",
    f"✅ Sections identified: {len(document.sections) if document.sections else 0}",
    f"✅ Processing time: {classification_time:.2f} seconds",
    f"✅ Method used: {classification_config.get('classificationMethod')}",
    f"✅ Model used: {classification_config.get('model')}",
    "✅ Data saved to: .data/step2_classification/",
    "\n📌 Next step: Run step3_extraction.ipynb",
]
print(*summary_lines, sep="\n")