In [3]:
from pathlib import Path
import time
import logging
from typing import Dict

from docling_core.types.doc import TextItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
# from docling.datamodel.format_options import PdfFormatOption
from docling.document_converter import DocumentConverter, PdfFormatOption

logger = logging.getLogger(__name__)
MARKDOWN_DIR = Path("markdown_out")
PROCESSED_DIR = Path("processed_pdfs")

def setup_converter() -> DocumentConverter:
    # Enable inline OCR, table structure, AND picture-level OCR
    opts = PdfPipelineOptions()
    opts.do_ocr = True
    opts.do_table_structure = True
    opts.table_structure_options.do_cell_matching = True
    
    # Generate page images for picture OCR
    opts.generate_page_images = True
    opts.images_scale = 2  # increase resolution for better accuracy
    
    # You already had EasyOCR configured:
    # opts.ocr_options = EasyOcrOptions()
    opts.ocr_options.lang = ["en"]
    
    return DocumentConverter(format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=opts)
    })

def load_and_split(state: Dict) -> Dict:
    pdf_path = Path(state["pdf_path"])
    converter = state.get("converter") or setup_converter()
    
    # Prepare folders
    MARKDOWN_DIR.mkdir(exist_ok=True)
    PROCESSED_DIR.mkdir(exist_ok=True)
    
    processed = PROCESSED_DIR / pdf_path.name
    if processed.exists():
        logger.info(f"Skipping already processed: {pdf_path.name}")
        return state

    md_output = MARKDOWN_DIR / pdf_path.with_suffix(".md").name
    logger.info(f"Processing: {pdf_path.name}")

    try:
        # Convert (with inline OCR, tables, and page-image OCR)
        start = time.time()
        result = converter.convert(str(pdf_path))
        logger.info(f"Converted in {time.time() - start:.2f} s")

        doc = result.document

        # Export base Markdown (inline text + tables)
        md_base = doc.export_to_markdown()

        # Run OCR on each picture / page-image and append to MD
        extra_imgs_md = []
        for picture in doc.pictures:
            ocr_lines = []
            for item, _ in doc.iterate_items(root=picture, traverse_pictures=True):
                if isinstance(item, TextItem):
                    ocr_lines.append(item.text.strip())
            if ocr_lines:
                extra_imgs_md.append(
                    f"\n\n### OCR from image: {picture.caption_text(doc)}\n\n"
                    + "\n".join(f"> {line}" for line in ocr_lines)
                )

        final_md = md_base + "\n".join(extra_imgs_md)

        md_output.write_text(final_md, encoding="utf‑8")
        logger.info(f"Markdown + OCR appended to: {md_output}")
        return final_md
        # chunks = chunk_markdown(final_md)
        # logger.info(f"Split into {len(chunks)} chunks")
        # logger.debug(f"Sample chunks: {chunks[:2]}")
        
        # return {**state, "converter": converter, "markdown": final_md, "chunks": chunks}
    
    except Exception as e:
        logger.error(f"Failed to process {pdf_path.name}: {e}")
        with open("failed_files.log", "a", encoding="utf‑8") as lf:
            lf.write(f"[{time.ctime()}] {pdf_path.name}: {e}\n")
        return state

In [7]:
import time
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
# from docling.datamodel.format_options import PdfFormatOption
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import TextItem

# ---- CONFIGURE THIS ----
PDF_PATH = r"/Users/alistermarcdomilies/Desktop/Electronics-Data-Extraction/processed/MMBT3906.pdf"
# ------------------------

# Setup converter with OCR + image processing
opts = PdfPipelineOptions()
opts.do_ocr = True
opts.do_table_structure = True
opts.table_structure_options.do_cell_matching = True
opts.generate_page_images = True
opts.images_scale = 2
# opts.ocr_options = EasyOcrOptions()
opts.ocr_options.lang = ["en"]

converter = DocumentConverter(format_options={
    InputFormat.PDF: PdfFormatOption(pipeline_options=opts)
})

pdf = Path(PDF_PATH)
print(f"Processing: {pdf.name}")

start = time.time()
result = converter.convert(str(pdf))
doc = result.document
print(f"Converted in {time.time() - start:.2f} seconds")

# Export base markdown from text and tables
md = doc.export_to_markdown()

# Append OCR results from images
extra_md = []
for picture in doc.pictures:
    lines = []
    print(f"Processing picture: {picture.caption_text(doc)}")
    for item, _ in doc.iterate_items(root=picture, traverse_pictures=True):
        if isinstance(item, TextItem):
            lines.append(item.text.strip())
    if lines:
        section = f"\n\n### OCR from image: {picture.caption_text(doc)}\n\n" + "\n".join(f"> {line}" for line in lines)
        extra_md.append(section)
    print(lines)
final_md = md + "\n".join(extra_md)

# Save or print result
output_path = Path("output.md")
output_path.write_text(final_md, encoding="utf-8")
print(f"Markdown written to: {output_path.resolve()}")


Processing: MMBT3906.pdf
Converted in 18.99 seconds
Processing picture: 
[]
Processing picture: 
['MMBT3906', '40 V, 200 mA PNP switching transistor', '10 April 2025']
Processing picture: 
['1', '2', '3', 'SOT23']
Processing picture: 
['sym132', 'E', 'C', 'B']
Processing picture: 
[]
Processing picture: 
['0', '400', '600', '200', 'mhc459', '- 10 - 1', '- 1', '- 10', 'I', 'C (mA)', 'hFE', '- 10 2', '- 10 3', '(1)', '(3)', '(2)']
Processing picture: Fig. 3. Base-emitter voltage as a function of collector current; typical values
['Fig. 3. Base-emitter voltage as a function of collector current; typical values', 'mhc461', '- 600', '- 800', '- 400', '- 1000', '- 1200', 'VBE', '(mV)', '- 200', 'I', 'C (mA)', '- 10 - 1', '- 10 3', '- 10 2', '- 1', '- 10', '(1)', '(2)', '(3)', 'VCE = -1 V', '(1) Tamb = -55 °C', '(2) Tamb = 25 °C', '(3) Tamb = 150 °C']
Processing picture: 
['0', '-10', '-250', '0', '-50', '-100', '-150', '-200', '-2', 'VCE (V)', 'I', 'C', '(mA)', '-4', '-6', '-8', 'mhc460', 'I

In [6]:
lines

['solder lands',
 'solder resist',
 'occupied area',
 'preferred transport direction during soldering',
 'sot023_fw',
 '2.8',
 '4.5',
 '1.4',
 '4.6',
 '1.4',
 '(2×)',
 '1.2',
 '(2×)',
 '2.2',
 '2.6',
 'Dimensions in mm',
 'Fig. 9. Wave soldering footprint for SOT23']

In [10]:
import time
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
# from docling.datamodel.format_options import PdfFormatOption
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import TextItem

# === CONFIGURATION ===
PDF_PATH = r"/Users/alistermarcdomilies/Desktop/Electronics-Data-Extraction/processed/MMBT3906.pdf"  # <- Replace this
OUTPUT_PATH = "output.md"
# ======================

# --- Setup the converter ---
opts = PdfPipelineOptions()
opts.do_ocr = True
opts.do_table_structure = True
opts.table_structure_options.do_cell_matching = True
opts.generate_page_images = True
opts.images_scale = 2
opts.force_full_page_ocr = True  # Ensure all text is processed, even if not in tables   

opts.ocr_options = EasyOcrOptions()
opts.ocr_options.lang = ["en"]

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
)

# --- Convert the PDF ---
pdf = Path(PDF_PATH)
print(f"Processing: {pdf.name}")

start = time.time()
result = converter.convert(str(pdf))
doc = result.document
print(f"Converted in {time.time() - start:.2f} seconds")

# --- Export base Markdown (text + tables) ---
md = doc.export_to_markdown()

# --- Append OCR results from pictures ---
extra_md = []
for picture in doc.pictures:
    lines = []
    for item, _ in doc.iterate_items(root=picture, traverse_pictures=True):
        if isinstance(item, TextItem):
            lines.append(item.text.strip())

    if lines:
        # Fallback caption generation
        caption = picture.caption_text(doc)
        if not caption.strip():
            page_number = getattr(picture, "page_number", None)
            page_str = f"Page {page_number + 1}" if page_number is not None else "Unknown Page"
            pic_idx = doc.pictures.index(picture) + 1
            caption = f"{page_str} - Picture {pic_idx}"

        section = f"\n\n### OCR from image: {caption}\n\n" + "\n".join(f"> {line}" for line in lines)
        extra_md.append(section)

# --- Combine all content ---
final_md = md + "\n".join(extra_md)

# --- Save the Markdown file ---
output_path = Path(OUTPUT_PATH)
output_path.write_text(final_md, encoding="utf-8")
print(f"Markdown written to: {output_path.resolve()}")


ValueError: "PdfPipelineOptions" object has no field "force_full_page_ocr"