# In [None]:
"""
# PDF to Markdown Processing Pipeline Demo

This notebook demonstrates how to use the document processing pipeline to convert PDF documents to markdown format.
"""

# %%
# Import required libraries
import os
import sys
import logging
from pathlib import Path
from pprint import pprint
from dotenv import load_dotenv

# Configure logging
# INFO level; each record is rendered as "<time> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
# (python-dotenv; presumably picks up a local .env file so that the
# OPENAI_API_KEY lookup further down can succeed -- confirm the .env location)
load_dotenv()

# Add parent directory to path to import document processing package
# NOTE(review): abspath(".") is the current working directory and dirname() is
# applied twice, so the entry appended here is two levels above the CWD, not
# its direct parent -- confirm that is where the doc_processing package lives.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath("."))))

# Import document processing modules
from doc_processing.config import get_settings, ensure_directories_exist
from doc_processing.document_pipeline import DocumentPipeline

# %%
# Fail fast when the environment carries no OpenAI credentials
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key:
    print("OpenAI API key found.")
else:
    # An unset or empty variable both count as "missing"
    raise ValueError("OpenAI API key not found. Please set the 'OPENAI_API_KEY' environment variable.")

# %%
# Define input and output directories
# get_settings() supplies the settings object whose PDF_INPUT_DIR and
# MARKDOWN_OUTPUT_DIR attributes are read throughout this script.
settings = get_settings()
# Presumably creates the configured directories when they are missing --
# confirm against doc_processing.config.
ensure_directories_exist()

print(f"Input PDF directory: {settings.PDF_INPUT_DIR}")
print(f"Output Markdown directory: {settings.MARKDOWN_OUTPUT_DIR}")

# %%
# Check if input directory contains PDFs
# Path.glob yields entries in whatever order the filesystem returns them, so
# sort the matches: the preview below and any "first file" selection are then
# repeatable from run to run.
pdf_files = sorted(Path(settings.PDF_INPUT_DIR).glob('*.pdf'))
print(f"Found {len(pdf_files)} PDF files in input directory:")
for pdf_file in pdf_files[:5]:  # Show first 5 files
    print(f"- {pdf_file.name}")
if len(pdf_files) > 5:
    print(f"... and {len(pdf_files) - 5} more")

# %%
# Configure pipeline for PDF to markdown conversion
# Options are grouped under one key per component (loader / processor /
# transformer, judging by the key names) and handed to DocumentPipeline as-is.
pipeline_config = dict(
    pdf_loader_config=dict(
        extract_metadata=True,
        check_password=True,
    ),
    pdf_processor_config=dict(
        model='gpt-4o',
        max_tokens=1500,
        max_retries=3,
        concurrent_pages=2,  # Process 2 pages concurrently
        resolution_scale=2,  # Higher resolution for better OCR
        prompt_template='pdf_extraction.j2',
        preserve_page_boundaries=True,
    ),
    markdown_transformer_config=dict(
        markdown_template='markdown_template.j2',
        generate_toc=True,
        detect_headings=True,
        extract_metadata=True,
        output_path=settings.MARKDOWN_OUTPUT_DIR,
    ),
)

# Build the pipeline from that configuration, then select the
# PDF -> Markdown setup
pipeline = DocumentPipeline(pipeline_config)
pipeline.configure_pdf_to_markdown_pipeline()

print("Pipeline configured for PDF to Markdown conversion")

# %%
# Process a single PDF file
def process_single_pdf(file_path):
    """Run one PDF through the module-level ``pipeline``.

    Progress is reported with ``print``. Any exception raised while
    processing is printed rather than propagated.

    Returns:
        Whatever ``pipeline.process_document`` returned, or ``None`` when an
        exception was caught.
    """
    print(f"Processing PDF: {file_path}")
    try:
        outcome = pipeline.process_document(file_path)
        print(f"Processing complete. Output saved to: {settings.MARKDOWN_OUTPUT_DIR}")
    except Exception as exc:
        print(f"Error processing PDF: {exc!s}")
        outcome = None
    return outcome

# Select the first PDF file to process
# Bind `result` before branching: the cells further down all test
# `if result ...`, which would raise NameError if the name were only created
# inside the `if pdf_files:` branch and the input directory were empty.
result = None
if pdf_files:
    sample_pdf = pdf_files[0]
    result = process_single_pdf(sample_pdf)

    # Show metadata
    if result and 'metadata' in result:
        print("\nDocument Metadata:")
        for key, value in result['metadata'].items():
            # Skip the bulky payload entries; only scalar-ish metadata is listed
            if key not in ['content', 'chunks', 'pages']:
                print(f"- {key}: {value}")
else:
    print("No PDF files found to process")

# %%
# Preview the generated markdown and report where it was written
if not (result and 'markdown' in result):
    print("No markdown content generated")
else:
    markdown_content = result['markdown']
    print("\nExtracted Markdown Content (first 1000 characters):")
    print("-" * 80)
    print(markdown_content[:1000] + "...")
    print("-" * 80)

    # The output name is the source filename with its extension swapped for .md
    if 'metadata' in result and 'filename' in result['metadata']:
        output_file = Path(settings.MARKDOWN_OUTPUT_DIR) / (Path(result['metadata']['filename']).stem + ".md")
        print(f"\nMarkdown saved to: {output_file}")
        if output_file.exists():
            print(f"File size: {output_file.stat().st_size / 1024:.2f} KB")

# %%
# Process all PDF files in the directory
def process_all_pdfs():
    """Run every PDF in ``settings.PDF_INPUT_DIR`` through the pipeline.

    Relies on the module-level ``pdf_files``, ``pipeline`` and ``settings``.

    Returns:
        The value returned by ``pipeline.process_directory``; ``None`` when
        there are no PDFs or when processing raised (the error is printed).
    """
    if not pdf_files:
        print("No PDF files found to process")
        return None

    print(f"Processing {len(pdf_files)} PDF files...")
    try:
        processed = pipeline.process_directory(settings.PDF_INPUT_DIR)
        print(f"Processing complete. {len(processed)} files processed.")
    except Exception as exc:
        print(f"Error processing PDF directory: {exc!s}")
        processed = None
    return processed

# Uncomment to process all PDFs
# all_results = process_all_pdfs()

# %%
# Define a function to extract document statistics
def get_document_stats(result):
    """Summarise a processing result as a flat dict of statistics.

    Args:
        result: Mapping produced by the pipeline. The keys read are
            ``metadata`` (itself a mapping), ``content``, ``markdown`` and
            ``error``; all of them are optional.

    Returns:
        ``{}`` for a falsy ``result``; otherwise a dict with the keys
        ``title``, ``filename``, ``num_pages``, ``num_processed_pages``,
        ``content_length``, ``markdown_length`` and ``processing_status``.
    """
    if not result:
        return {}

    # `or` fallbacks cover keys that are present but set to None, which a
    # plain .get(key, default) would pass straight through.
    metadata = result.get('metadata') or {}
    content = result.get('content') or ''
    markdown = result.get('markdown') or ''

    if 'error' in result:
        status = f"Error: {result['error']}"
    else:
        status = 'Success'

    return {
        'title': metadata.get('title', 'Unknown'),
        'filename': metadata.get('filename', 'Unknown'),
        'num_pages': metadata.get('num_pages', 0),
        'num_processed_pages': metadata.get('num_processed_pages', 0),
        'content_length': len(content),
        'markdown_length': len(markdown),
        'processing_status': status,
    }

# Print the statistics for the sample document, one "- name: value" line each
if result:
    stats = get_document_stats(result)
    print("\nDocument Statistics:")
    for stat_name, stat_value in stats.items():
        print(f"- {stat_name}: {stat_value}")

# %%
# Custom modifications for specific documents
def customize_markdown(markdown_content, title):
    """Wrap markdown in a title banner and a closing footer.

    Args:
        markdown_content: Markdown body (a ``str``) placed between the two.
        title: Text rendered as the level-1 heading of the banner.

    Returns:
        Banner, body and footer concatenated into one string.
    """
    banner_lines = [
        f"# {title}",
        "",
        "*Document processed with Document Processing Pipeline*",
        "",
        "---",
        "",
        "",  # trailing empty entries leave a blank line before the body
    ]
    closing_lines = [
        "",
        "",
        "---",
        "",
        "*End of document*",
        "",  # final entry gives the footer its terminating newline
    ]
    return "\n".join(banner_lines) + markdown_content + "\n".join(closing_lines)

# Example of customizing markdown
if result and 'markdown' in result:
    original_markdown = result['markdown']
    title = result.get('metadata', {}).get('title', 'Untitled Document')
    custom_markdown = customize_markdown(original_markdown, title)

    # The output name is derived from the source filename, so only save when
    # the result actually carries one; indexing unconditionally would raise
    # KeyError for a result without metadata or without a filename.
    if 'metadata' in result and 'filename' in result['metadata']:
        # Save to custom file
        custom_output_file = Path(settings.MARKDOWN_OUTPUT_DIR) / f"{Path(result['metadata']['filename']).stem}_custom.md"

        with open(custom_output_file, 'w', encoding='utf-8') as f:
            f.write(custom_markdown)

        print(f"\nCustomized markdown saved to: {custom_output_file}")
    else:
        print("\nCustomized markdown not saved: result metadata has no 'filename'")

# %%
# Function to analyze document content
def analyze_document_content(document):
    """Compute simple text statistics for a processed document.

    Args:
        document: Mapping whose ``content`` entry holds the extracted text.

    Returns:
        The string ``"No content to analyze"`` when ``document`` is falsy or
        has no usable ``content`` (missing or ``None``). Otherwise a dict with
        ``word_count``, ``character_count``, ``average_word_length`` and
        ``number_of_paragraphs``.
    """
    if not document or 'content' not in document or document['content'] is None:
        return "No content to analyze"

    content = document['content']
    words = content.split()  # split once; reused for both count and lengths
    word_count = len(words)

    # A paragraph is a non-blank chunk between blank-line separators. Counting
    # chunks (rather than separators + 1) reports 0 for empty text and does
    # not inflate the total when several blank lines appear in a row.
    paragraph_count = sum(1 for chunk in content.split('\n\n') if chunk.strip())

    return {
        'word_count': word_count,
        'character_count': len(content),
        # max(1, ...) keeps the division defined for text without words
        'average_word_length': sum(len(word) for word in words) / max(1, word_count),
        'number_of_paragraphs': paragraph_count,
    }

# Print the content analysis; float metrics are shown with two decimals
if result and 'content' in result:
    analysis = analyze_document_content(result)
    print("\nDocument Content Analysis:")
    for metric, measured in analysis.items():
        if isinstance(measured, float):
            print(f"- {metric}: {measured:.2f}")
        else:
            print(f"- {metric}: {measured}")

# %%
# Extract headings to create a table of contents
def extract_toc(markdown_content):
    """Collect ATX headings (``# Title``) from markdown text.

    A line counts as a heading when it starts at column 0 with one to six
    ``#`` characters followed by whitespace and some text. Lines inside
    fenced code blocks are ignored, so ``# comment`` lines in code samples do
    not show up as headings, and neither do ``#hashtag``-style lines.

    Args:
        markdown_content: Markdown text; ``None`` or ``""`` yields ``[]``.

    Returns:
        List of ``(level, heading_text)`` tuples in document order.
    """
    if not markdown_content:
        return []

    toc = []
    open_fence = None  # fence marker of the code block we are inside, if any
    for line in markdown_content.split('\n'):
        fence_marker = line.lstrip()[:3]
        if fence_marker in ('```', '~~~'):
            if open_fence is None:
                open_fence = fence_marker
            elif fence_marker == open_fence:
                # Only the same marker kind closes the block
                open_fence = None
            continue
        if open_fence is not None or not line.startswith('#'):
            continue

        # Heading level is the length of the leading run of '#'
        level = len(line) - len(line.lstrip('#'))
        remainder = line[level:]
        if level > 6 or (remainder and not remainder[0].isspace()):
            continue

        heading_text = remainder.strip()
        if heading_text:
            toc.append((level, heading_text))

    return toc

# Print the headings as a nested bullet list, two spaces of indent per level
if result and 'markdown' in result:
    toc = extract_toc(result['markdown'])
    if not toc:
        print("No headings found in the document")
    else:
        print("\nTable of Contents:")
        for level, heading in toc:
            print("  " * (level - 1) + f"- {heading}")

# %%
# Create a batch processing function with progress tracking
def batch_process_with_progress(file_paths, batch_size=3):
    """Process files in batches, printing progress after every file.

    Each file goes through the module-level ``pipeline``. A failure on one
    file is printed and the loop moves on to the next.

    Args:
        file_paths: Sized, sliceable sequence of paths. Entries may be
            ``pathlib.Path`` objects or plain path strings.
        batch_size: Number of files per batch; must be at least 1.

    Returns:
        List with the pipeline result of every file that was processed
        without raising (``[]`` for empty input).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 and there is at least
            one file to process.
    """
    if not file_paths:
        print("No files to process")
        return []

    # A non-positive step would either make range() fail with an unrelated
    # message or silently skip every file.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    total_files = len(file_paths)
    processed_files = 0
    results = []

    print(f"Starting batch processing of {total_files} files...")

    # Process in batches
    for batch_number, start in enumerate(range(0, total_files, batch_size), start=1):
        batch = file_paths[start:start + batch_size]
        print(f"\nProcessing batch {batch_number} ({len(batch)} files)...")

        for file_path in batch:
            # Path() accepts both str and Path, so the display name is
            # available in the error handler even for plain string inputs.
            display_name = Path(file_path).name
            try:
                print(f"Processing: {display_name}")
                result = pipeline.process_document(file_path)
                results.append(result)
                processed_files += 1
                print(f"Completed {processed_files}/{total_files} files ({processed_files/total_files*100:.1f}%)")

            except Exception as e:
                print(f"Error processing {display_name}: {str(e)}")

    print(f"\nBatch processing complete. {processed_files}/{total_files} files processed successfully.")
    return results

# Uncomment to run batch processing with progress tracking
# batch_results = batch_process_with_progress(pdf_files[:5])  # Process first 5 files

# %%
# Function to check processing quality
def check_processing_quality(result):
    """Run heuristic checks on extracted text and grade the outcome.

    Args:
        result: Mapping whose ``content`` entry holds the extracted text.

    Returns:
        Dict with ``quality`` (``"Good"``, ``"Fair"``, ``"Poor"`` or
        ``"Unknown"``) and ``issues`` (list of human-readable findings).
        ``"Unknown"`` is returned when ``result`` is falsy or has no usable
        ``content`` (missing or ``None``).
    """
    # Imported here so the function works no matter which other cells have
    # run; it must not depend on a later, conditional module-level import.
    import re

    if not result or 'content' not in result or result['content'] is None:
        return {'quality': 'Unknown', 'issues': ['No content found']}

    content = result['content']
    issues = []

    # Check for common issues
    if len(content) < 100:
        issues.append("Content is very short")

    # U+FFFD is what decoders substitute for bytes they could not decode
    if "\ufffd" in content:
        issues.append("Contains encoding issues (replacement characters)")

    # Check for common OCR errors: stand-alone l / I / 0 / O tokens
    ocr_error_patterns = [r'\bl\b', r'\bI\b', r'\b0\b', r'\bO\b']
    for pattern in ocr_error_patterns:
        if len(re.findall(pattern, content)) > 50:  # Too many occurrences might indicate OCR issues
            issues.append(f"Potential OCR error with pattern {pattern}")

    # Check for layout issues
    if '\n\n\n\n' in content:
        issues.append("Contains excessive line breaks")

    # Determine overall quality: none -> Good, one or two -> Fair, more -> Poor
    if not issues:
        quality = "Good"
    elif len(issues) <= 2:
        quality = "Fair"
    else:
        quality = "Poor"

    return {'quality': quality, 'issues': issues}

# Grade the sample document and list whatever the checks flagged
if result:
    # check_processing_quality's pattern checks use the `re` module
    import re
    quality_check = check_processing_quality(result)
    print("\nProcessing Quality Check:")
    print(f"- Overall quality: {quality_check['quality']}")
    if not quality_check['issues']:
        print("- No issues found")
    else:
        print("- Issues found:")
        for finding in quality_check['issues']:
            print(f"  - {finding}")

# %%
# Function to compare different processing configurations
def compare_processing_configs(pdf_path):
    """Process one PDF under several processor configurations.

    For every configuration a fresh ``DocumentPipeline`` is built, set up with
    ``configure_pdf_to_text_pipeline`` and run on ``pdf_path``; the extracted
    content is then graded with ``check_processing_quality``.

    Args:
        pdf_path: Path to an existing PDF file.

    Returns:
        ``{}`` when ``pdf_path`` is empty or does not exist. Otherwise a dict
        keyed by configuration name whose values hold ``content_length``,
        ``quality`` and ``issues``, or just ``error`` when that run raised.
    """
    if not (pdf_path and Path(pdf_path).exists()):
        print("Invalid PDF path")
        return {}

    # Every variant starts from the same baseline and overrides or extends it
    baseline = {'model': 'gpt-4o', 'resolution_scale': 2}
    variants = {
        'default': {
            'pdf_processor_config': dict(baseline),
        },
        'high_res': {
            'pdf_processor_config': {**baseline, 'resolution_scale': 3},  # Higher resolution
        },
        'detailed_prompt': {
            'pdf_processor_config': {
                **baseline,
                'preserve_page_boundaries': True,
                'extract_columns': True,
            },
        },
    }

    comparison = {}
    for label, variant in variants.items():
        print(f"\nProcessing with configuration: {label}")

        # Pipeline construction sits outside the try block: a failure here
        # propagates instead of being recorded per configuration.
        candidate = DocumentPipeline(variant)
        candidate.configure_pdf_to_text_pipeline()

        try:
            outcome = candidate.process_document(pdf_path)
            extracted_chars = len(outcome.get('content', ''))
            verdict = check_processing_quality(outcome)

            comparison[label] = {
                'content_length': extracted_chars,
                'quality': verdict['quality'],
                'issues': verdict['issues'],
            }

            print(f"- Content length: {extracted_chars}")
            print(f"- Quality: {verdict['quality']}")

        except Exception as exc:
            print(f"Error with {label} configuration: {exc!s}")
            comparison[label] = {'error': str(exc)}

    return comparison

# Uncomment to compare different configurations on a sample PDF
# if pdf_files:
#     comparison = compare_processing_configs(pdf_files[0])
#     print("\nConfiguration Comparison Results:")
#     for config_name, result in comparison.items():
#         print(f"\n{config_name.upper()}:")
#         for key, value in result.items():
#             if key == 'issues':
#                 print(f"- issues: {', '.join(value) if value else 'None'}")
#             else:
#                 print(f"- {key}: {value}")

# %%
# Save all templates to the template directory
def create_templates() -> bool:
    """Create the two Jinja templates used by this demo.

    Registers ``pdf_extraction.j2`` through
    ``PromptTemplateManager.create_prompt_template`` and
    ``markdown_template.j2`` through
    ``PromptTemplateManager.create_output_template``.

    Returns:
        True when both calls complete; False when either raises (the error
        is printed, not propagated).
    """
    # Imported lazily so the rest of the script does not need the template
    # package unless this helper is actually invoked.
    from doc_processing.templates.prompt_manager import PromptTemplateManager
    
    # Create template manager
    template_manager = PromptTemplateManager()
    
    # Define PDF extraction template
    # Variables referenced by the template: page_number, plus the optional
    # config flags preserve_page_boundaries, extract_columns,
    # handle_rotated_text and extract_tables_only.
    pdf_extraction_template = """{# Template for PDF extraction with GPT-4 Vision #}
Extract all text from this image of page {{ page_number }} of a PDF document. Follow these specific instructions:

1. Extract ALL visible text from the image, including:
   - Main body text
   - Headers and footers
   - Page numbers
   - Table contents
   - Figure captions
   - Footnotes and endnotes
   - Sidebar text
   - References and citations

2. Maintain the original formatting structure as closely as possible:
   - Preserve paragraph breaks
   - Maintain heading levels (indicated with # for Markdown)
   - Preserve bullet points and numbered lists
   - Keep table structure using Markdown table format when possible

3. For tables:
   - If the table is complex, try to maintain its structure using Markdown table syntax
   - If that's not possible, describe the table structure briefly and extract all text content

4. For mathematical equations and formulas:
   - Render using LaTeX notation inside $ $ delimiters when possible
   - If the equation is complex, describe it briefly

5. For special content:
   - Clearly label footnotes as [Footnote: ...]
   - Indicate page numbers as [Page X]
   - Mark figure captions as [Figure: ...]

6. Error correction:
   - Fix obvious OCR errors
   - Correct hyphenation issues at line breaks
   - Join words split across lines

7. Return ONLY the extracted text content.
   - Do NOT include explanations, descriptions, or commentary about the extraction process
   - Do NOT include placeholders for images or other non-text elements beyond brief labels
   - Do NOT include your own analysis of the content

8. Special instructions:
{% if config.preserve_page_boundaries %}
   - Start the extraction with "=== Page {{ page_number }} ===" to clearly mark page boundaries
{% endif %}
{% if config.extract_columns %}
   - Handle multi-column layouts by extracting text from left to right, column by column
{% endif %}
{% if config.handle_rotated_text %}
   - Extract any rotated or sideways text, noting its orientation
{% endif %}
{% if config.extract_tables_only %}
   - Focus on extracting tables and their data, minimizing extraction of body text
{% endif %}

Return only the extracted text, formatted cleanly and ready for use."""
    
    # Define markdown template
    # Variables referenced by the template: title, metadata (a mapping), toc,
    # sections (items exposing heading.level, heading.text and content) and a
    # callable now().
    # NOTE(review): the {% for %}/{% if %} tags sit on their own lines between
    # the table rows; unless the Jinja environment trims block newlines
    # (trim_blocks / lstrip_blocks) the rendered rows will be separated by
    # blank lines -- confirm how PromptTemplateManager configures Jinja.
    # NOTE(review): now() has to be supplied to the template at render time --
    # confirm the renderer registers it.
    markdown_template = """{# Template for Markdown document generation #}
# {{ title }}

{% if metadata %}
## Document Metadata

| Property | Value |
|----------|-------|
{% for key, value in metadata.items() %}
{% if key not in ['content', 'chunks', 'pages', 'error'] and value is not none %}
| {{ key }} | {{ value }} |
{% endif %}
{% endfor %}
{% endif %}

{% if toc %}
{{ toc }}
{% endif %}

{% for section in sections %}
{% if section.heading %}
{{'#' * section.heading.level }} {{ section.heading.text }}
{% endif %}

{{ section.content }}

{% endfor %}

---

*Document processed on {{ now().strftime('%Y-%m-%d %H:%M:%S') }}*"""
    
    # Create templates
    try:
        # Create prompt template
        template_manager.create_prompt_template("pdf_extraction.j2", pdf_extraction_template)
        print("PDF extraction template created")
        
        # Create output template
        template_manager.create_output_template("markdown_template.j2", markdown_template)
        print("Markdown template created")
        
        return True
    except Exception as e:
        # Either create_* call failing lands here; the first template may
        # already have been created when the second one fails.
        print(f"Error creating templates: {str(e)}")
        return False

# Uncomment to create templates
# create_templates()

# %%
# Closing recap, emitted as one print call with a line per argument
print(
    "\n" + "="*80,
    "SUMMARY: PDF to Markdown Processing Pipeline Demo",
    "="*80,
    "\nThis notebook demonstrated how to:",
    "1. Configure and set up a document processing pipeline",
    "2. Process PDF documents to extract text content",
    "3. Convert text to structured Markdown format",
    "4. Analyze document content and quality",
    "5. Customize output formats and templates",
    "\nNext steps could include:",
    "- Integrating with Weaviate vector database",
    "- Adding support for other document types",
    "- Implementing batch processing with progress tracking",
    "- Creating a web interface for document processing",
    "="*80,
    sep="\n",
)