### Hugging Face Authentication

In [None]:
#!pip install easyocr


In [9]:
#from huggingface_hub import notebook_login
#access_token = ""
#notebook_login()

### Imports and Configuration

In [None]:
"""
GraphRAG PDF Processing Module (Optimized + Full Functionality)

Handles PDF processing using Docling with EasyOCR. 
Improved with:
1. Content-based hashing (deduplication)
2. Visual noise filtering (removes icons/lines)
3. Native markdown page slicing (efficient O(N) extraction)
"""

import hashlib
import json
import logging
import warnings
import io
from pathlib import Path
from typing import Any, Dict, List, Optional

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Configuration Constants
DEFAULT_IMAGE_SCALE = 2.0
DEFAULT_OCR_LANGUAGE = ["en", "fr"]
MIN_IMAGE_WIDTH = 120   # Threshold to ignore icons/logos
MIN_IMAGE_HEIGHT = 80   # Threshold to ignore horizontal lines

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class PDFProcessingError(Exception):
    """Custom exception for PDF processing failures."""
    pass


def setup_converter(
    image_scale: float = DEFAULT_IMAGE_SCALE,
    extract_tables: bool = True,
    extract_figures: bool = True,
    extract_images: bool = False,
) -> DocumentConverter:
    """Configures the Docling converter with your original parameters."""
    try:
        pipeline_options = PdfPipelineOptions()
        pipeline_options.images_scale = image_scale
        pipeline_options.generate_page_images = extract_images
        pipeline_options.generate_table_images = extract_tables
        pipeline_options.generate_picture_images = extract_figures
        
        pipeline_options.do_ocr = True
        pipeline_options.ocr_options = EasyOcrOptions(lang=DEFAULT_OCR_LANGUAGE)
        
        return DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )
    except Exception as e:
        raise PDFProcessingError(f"Failed to initialize converter: {e}")


def _get_image_hash(pil_img) -> str:
    """Generates a stable hash based on image pixel data."""
    img_byte_arr = io.BytesIO()
    pil_img.save(img_byte_arr, format='PNG')
    return hashlib.md5(img_byte_arr.getvalue()).hexdigest()[:16]


def _save_image(pil_img, output_path: Path) -> Optional[Dict[str, Any]]:
    """Save PIL image and return dimensions."""
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        pil_img.save(output_path, format='PNG', optimize=True)
        return {"width": pil_img.width, "height": pil_img.height}
    except Exception as e:
        logger.warning(f"Failed to save image to {output_path}: {e}")
        return None


def process_pdf(
    input_path: Path,
    output_dir: Optional[Path] = None,
    image_scale: float = DEFAULT_IMAGE_SCALE,
    extract_tables: bool = True,
    extract_figures: bool = True,
    extract_images: bool = False,
) -> Optional[Dict[str, Any]]:
    """
    Main entry point for PDF processing.
    Restores original parameters while fixing logic flaws.
    """
    if not input_path.exists():
        raise FileNotFoundError(f"File not found: {input_path}")
    
    # Setup Paths
    output_dir = output_dir or input_path.parent.parent / f"{input_path.stem}_output"
    images_dir = output_dir / "images" / input_path.stem
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        converter = setup_converter(image_scale, extract_tables, extract_figures, extract_images)
        logger.info(f"Converting: {input_path.name}...")
        conv_result = converter.convert(input_path)
        doc = conv_result.document
        
        # --- 1. Map Figures/Tables to Pages (Single Pass) ---
        page_assets = {}
        
        # Combine all structural elements
        elements = []
        if extract_figures:
            elements.extend([(p, "figure") for p in getattr(doc, 'pictures', [])])
        if extract_tables:
            elements.extend([(t, "table") for t in getattr(doc, 'tables', [])])

        for idx, (el, el_type) in enumerate(elements):
            pil_img = el.image.pil_image
            
            # Filter noise unless it's a table (tables usually important regardless of size)
            if el_type == "figure" and (pil_img.width < MIN_IMAGE_WIDTH or pil_img.height < MIN_IMAGE_HEIGHT):
                continue

            page_idx = el.prov[0].page_no + 1 if el.prov else 1
            img_hash = _get_image_hash(pil_img)
            
            filename = f"page_{page_idx:03d}_{el_type}_{idx}_{img_hash}.png"
            target_path = images_dir / filename
            
            if size := _save_image(pil_img, target_path):
                asset_data = {
                    "id": img_hash,
                    "filename": filename,
                    "filepath": str(target_path),
                    "description": getattr(el, 'caption', f"{el_type.capitalize()} {idx+1}"),
                    "type": el_type,
                    "size": size
                }
                page_assets.setdefault(page_idx, []).append(asset_data)

        # --- 2. Construct Page Data with Full-Page Image support ---
        pages_data = []
        for i, page_obj in enumerate(conv_result.pages):
            page_num = i + 1
            current_page_images = page_assets.get(page_num, [])

            # Handle optional full-page screenshots
            if extract_images and hasattr(page_obj, 'image') and page_obj.image:
                p_pil = page_obj.image.pil_image
                p_hash = _get_image_hash(p_pil)
                p_filename = f"page_{page_num:03d}_full_{p_hash}.png"
                p_path = images_dir / p_filename
                
                if p_size := _save_image(p_pil, p_path):
                    current_page_images.insert(0, {
                        "id": p_hash,
                        "filename": p_filename,
                        "filepath": str(p_path),
                        "description": f"Full page {page_num} scan",
                        "type": "page_scan",
                        "size": p_size
                    })

            pages_data.append({
                "page_num": page_num,
                "markdown": doc.export_to_markdown(page_no=i), # Fixed O(N) extraction
                "images": current_page_images
            })

        # --- 3. Final Result & Cleanup ---
        result = {
            "document_id": conv_result.input.document_hash,
            "filename": input_path.name,
            "total_pages": len(pages_data),
            "pages": pages_data
        }

        # Save JSON
        with open(output_dir / f"{input_path.stem}_result.json", 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        logger.info(f"Success. Processed {len(pages_data)} pages.")
        return result

    except Exception as e:
        logger.error(f"Processing failed for {input_path.name}: {e}")
        raise PDFProcessingError(f"Pipeline failure: {e}")

In [None]:
"""
GraphRAG PDF Processing Module (Optimized + Full Functionality)

Handles PDF processing using Docling with EasyOCR. 
Improved with:
1. Content-based hashing (deduplication)
2. Visual noise filtering (removes icons/lines)
3. Native markdown page slicing (efficient O(N) extraction)
"""

import hashlib
import json
import logging
import warnings
import io
from pathlib import Path
from typing import Any, Dict, List, Optional

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Configuration Constants
DEFAULT_IMAGE_SCALE = 2.0
DEFAULT_OCR_LANGUAGE = ["en", "fr"]
MIN_IMAGE_WIDTH = 120   # Threshold to ignore icons/logos
MIN_IMAGE_HEIGHT = 80   # Threshold to ignore horizontal lines

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# EasyOCR/Docling sets pin_memory=True by default (assuming GPU might be available)
# I chose to disable this warning to avoid unnecessary noise in the logs. 
warnings.filterwarnings('ignore', message='.*pin_memory.*')



class PDFProcessingError(Exception):
    """Custom exception for PDF processing failures."""
    pass


def setup_converter(
    image_scale: float = DEFAULT_IMAGE_SCALE,
    extract_tables: bool = True,
    extract_figures: bool = True,
    extract_images: bool = False,
) -> DocumentConverter:
    """Configures the Docling converter with your original parameters."""
    try:
        pipeline_options = PdfPipelineOptions()
        pipeline_options.images_scale = image_scale
        pipeline_options.generate_page_images = extract_images
        pipeline_options.generate_table_images = extract_tables
        pipeline_options.generate_picture_images = extract_figures
        
        pipeline_options.do_ocr = True
        pipeline_options.ocr_options = EasyOcrOptions(lang=DEFAULT_OCR_LANGUAGE)
        
        return DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )
    except Exception as e:
        raise PDFProcessingError(f"Failed to initialize converter: {e}")


def _get_image_hash(pil_img) -> str:
    """Generates a stable hash based on image pixel data."""
    img_byte_arr = io.BytesIO()
    pil_img.save(img_byte_arr, format='PNG')
    return hashlib.md5(img_byte_arr.getvalue()).hexdigest()[:16]


def _save_image(pil_img, output_path: Path) -> Optional[Dict[str, Any]]:
    """Save PIL image and return dimensions."""
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        pil_img.save(output_path, format='PNG', optimize=True)
        return {"width": pil_img.width, "height": pil_img.height}
    except Exception as e:
        logger.warning(f"Failed to save image to {output_path}: {e}")
        return None


def process_pdf(
    input_path: Path,
    output_dir: Optional[Path] = None,
    image_scale: float = DEFAULT_IMAGE_SCALE,
    extract_tables: bool = True,
    extract_figures: bool = True,
    extract_images: bool = False,
) -> Optional[Dict[str, Any]]:
    """
    Main entry point for PDF processing.
    Restores original parameters while fixing logic flaws.
    """
    if not input_path.exists():
        raise FileNotFoundError(f"File not found: {input_path}")
    
    # Setup Paths
    output_dir = output_dir or input_path.parent.parent / f"{input_path.stem}_output"
    images_dir = output_dir / "images" / input_path.stem
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        converter = setup_converter(image_scale, extract_tables, extract_figures, extract_images)
        logger.info(f"Converting: {input_path.name}...")
        conv_result = converter.convert(input_path)
        doc = conv_result.document

        full_markdown=""
        
        # --- 1. Map Figures/Tables to Pages (Single Pass) ---
        page_assets = {}
        
        # Combine all structural elements
        elements = []
        if extract_figures:
            elements.extend([(p, "figure") for p in getattr(doc, 'pictures', [])])
        if extract_tables:
            elements.extend([(t, "table") for t in getattr(doc, 'tables', [])])

        for idx, (el, el_type) in enumerate(elements):
            pil_img = el.image.pil_image
            
            # Filter noise unless it's a table (tables usually important regardless of size)
            if el_type == "figure" and (pil_img.width < MIN_IMAGE_WIDTH or pil_img.height < MIN_IMAGE_HEIGHT):
                continue

            page_idx = el.prov[0].page_no + 1 if el.prov else 1
            img_hash = _get_image_hash(pil_img)
            
            filename = f"page_{page_idx:03d}_{el_type}_{idx}_{img_hash}.png"
            target_path = images_dir / filename
            
            if size := _save_image(pil_img, target_path):
                asset_data = {
                    "id": img_hash,
                    "filename": filename,
                    "filepath": str(target_path),
                    "description": getattr(el, 'caption', f"{el_type.capitalize()} {idx+1}"),
                    "type": el_type,
                    "size": size
                }
                page_assets.setdefault(page_idx, []).append(asset_data)

        # --- 2. Construct Page Data with Full-Page Image support ---
        pages_data = []
        for i, page_obj in enumerate(conv_result.pages):
            page_num = i + 1
            current_page_images = page_assets.get(page_num, [])

            # Handle optional full-page screenshots
            if extract_images and hasattr(page_obj, 'image') and page_obj.image:
                p_pil = page_obj.image.pil_image
                p_hash = _get_image_hash(p_pil)
                p_filename = f"page_{page_num:03d}_full_{p_hash}.png"
                p_path = images_dir / p_filename
                
                if p_size := _save_image(p_pil, p_path):
                    current_page_images.insert(0, {
                        "id": p_hash,
                        "filename": p_filename,
                        "filepath": str(p_path),
                        "description": f"Full page {page_num} scan",
                        "type": "page_scan",
                        "size": p_size
                    })
            markdown_page_results=doc.export_to_markdown(page_no=i)
            full_markdown+=markdown_page_results
            pages_data.append({
                "page_num": page_num,
                "markdown": markdown_page_results, # Fixed O(N) extraction
                "images": current_page_images
            })

        # --- 3. Final Result & Cleanup ---
        result = {
            "document_id": conv_result.input.document_hash,
            "filename": input_path.name,
            "full_markdown":full_markdown,
            "total_pages": len(pages_data),
            "pages": pages_data
        }

        # Save JSON
        with open(output_dir / f"{input_path.stem}_result.json", 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        logger.info(f"Success. Processed {len(pages_data)} pages.")
        return result

    except Exception as e:
        logger.error(f"Processing failed for {input_path.name}: {e}")
        raise PDFProcessingError(f"Pipeline failure: {e}")
    


pdf_path = Path("dummy_pdfs/0010514.pdf")

try:
    result = process_pdf(pdf_path, image_scale=2.0, extract_tables=True, extract_figures=True, extract_images=False)
    if result:
        print(f"Processed {result['total_pages']} pages")
        print(f"Document hash: {result['document_hash']}")
except Exception as e:
    logger.error(f"Processing failed: {e}")

INFO:__main__:Converting: 0010514.pdf...
INFO:docling.datamodel.document:detected formats: [<InputFormat.PDF: 'pdf'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash eb35847499f704a2741279d41d6d5e6c
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document 0010514.pdf
INFO:docling.document_converter:Finished converting document 0010514.pdf in 160.49 sec.
INFO:__main__:Success. Processed 14 pages.
ERROR:__main__:Processing failed: 'document_hash'


Processed 14 pages


### Setup Converter Function

### Process PDF Function

```json
{
    "document_id": "hash",
    "filename": "pdf_name.pdf",
    "total_pages": int,
    "pages": [
        {
            "page_num": 1,
            "markdown": "...",
            "images": [
                {
                    "id": "hash",
                    "filename": "page_001_img_0.png",
                    "filepath": "images/pdf_name/page_001_img_0.png",
                    "description": "Figure description",
                    "size": {"width": int, "height": int}
                }
            ]
        }
    ]
}
```