# In [None]:
"""
# Hybrid PDF Processing with Docling and GPT Vision

This notebook demonstrates the hybrid PDF processing approach using Docling for native PDFs
and GPT Vision for complex or scanned documents.
"""

# %%
# Import required libraries
import os
import sys
import logging
from pathlib import Path
from dotenv import load_dotenv

# Logging setup: INFO-level records rendered as "time - level - message"
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables via python-dotenv
load_dotenv()

# Make the document processing package importable: append the directory two
# levels above the current working directory to the module search path.
# NOTE(review): this is the grandparent of the cwd, not its parent — confirm
# it matches the directory the notebook is launched from.
_working_dir = os.path.abspath(".")
sys.path.append(os.path.dirname(os.path.dirname(_working_dir)))

# Import document processing modules
from doc_processing.config import get_settings, ensure_directories_exist
from doc_processing.document_pipeline import DocumentPipeline
from doc_processing.models.schema import Document as DocumentSchema

# %%
# Fail fast when the OpenAI API key is missing (or empty) in the environment
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    _missing_key_message = "OpenAI API key not found. Please set the 'OPENAI_API_KEY' environment variable."
    raise ValueError(_missing_key_message)

# Only reached when a non-empty key was read
print("OpenAI API key found.")

# %%
# Fetch the project settings, run the directory-creation helper, then report
# the configured input and output locations
settings = get_settings()
ensure_directories_exist()

for _label, _directory in (
    ("Input PDF directory", settings.PDF_INPUT_DIR),
    ("Output directory", settings.OUTPUT_DIR),
):
    print(f"{_label}: {_directory}")

# %%
# Collect the PDFs found directly inside the input directory.
# Path.glob yields entries in no guaranteed order, so the matches are sorted:
# the preview below and any later "first file" selection then stay
# reproducible across runs and machines.
MAX_LISTED_PDFS = 5  # number of file names shown before summarising the rest

pdf_files = sorted(Path(settings.PDF_INPUT_DIR).glob('*.pdf'))
print(f"Found {len(pdf_files)} PDF files in input directory:")
for pdf_file in pdf_files[:MAX_LISTED_PDFS]:
    print(f"- {pdf_file.name}")

# Summarise whatever did not fit in the preview
hidden_count = len(pdf_files) - MAX_LISTED_PDFS
if hidden_count > 0:
    print(f"... and {hidden_count} more")

# %%
# Assemble the hybrid pipeline configuration one section at a time

# Section stored under 'docling_config'
docling_options = {
    'use_easyocr': True,
    'extract_tables': True,
    'extract_figures': True,
}

# Section stored under 'pdf_processor_config'
pdf_processor_options = {
    'model': 'gpt-4o',
    'max_tokens': 1500,
    'max_retries': 3,
    'concurrent_pages': 2,
    'resolution_scale': 2,
    'prompt_template': 'pdf_extraction.j2',
}

# Section stored under 'struct_extractor_config' (Instructor settings plus
# the system prompt)
instructor_options = {
    'model': 'gpt-4o',
    'max_tokens': 4000,
    'temperature': 0.2,
}
struct_extractor_options = {
    'instructor_config': instructor_options,
    'system_prompt': "You are an expert document analyzer. Extract structured information from the document.",
}

pipeline_config = {
    'docling_config': docling_options,
    'pdf_processor_config': pdf_processor_options,
    'struct_extractor_config': struct_extractor_options,
}

# Build the pipeline from the configuration and set it up for the
# PDF-to-structured-data flow
pipeline = DocumentPipeline(pipeline_config)
pipeline.configure_pdf_to_structured_pipeline()

print("Hybrid pipeline configured for PDF processing")

# %%
# Run the pipeline on one PDF and print a summary of what came back


def _print_fields(heading, fields, skipped):
    """Print *heading*, then one "- key: value" line per entry of *fields*.

    Entries whose key appears in *skipped* are left out.
    """
    print(heading)
    for name, item in fields.items():
        if name in skipped:
            continue
        print(f"- {name}: {item}")


if not pdf_files:
    print("No PDF files found to process")
else:
    # The first discovered file serves as the demonstration sample
    sample_pdf = pdf_files[0]
    print(f"Processing PDF: {sample_pdf}")

    result = pipeline.process_document(sample_pdf)

    # Processing method reported in the result, when present
    if 'processing_method' in result:
        print(f"\nProcessing method used: {result['processing_method']}")

    # Metadata, without the 'content', 'chunks' and 'pages' entries
    if 'metadata' in result:
        _print_fields("\nDocument Metadata:", result['metadata'], ('content', 'chunks', 'pages'))

    # Structured data, without the 'content' entry
    if 'structured_data' in result:
        _print_fields("\nExtracted Structured Data:", result['structured_data'], ('content',))

    # Table count, only when the result carries a non-empty 'tables' entry
    if 'tables' in result:
        extracted_tables = result['tables']
        if extracted_tables:
            print(f"\nExtracted {len(extracted_tables)} tables")