In [7]:
from pathlib import Path

In [8]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

In [9]:
# PDF to run through the docling conversion pipeline.
input_doc = Path("Commercial Policy.pdf")

# OCR backend: the Tesseract command-line engine with full-page OCR forced on.
# Alternative option classes (EasyOcrOptions, TesseractOcrOptions,
# OcrMacOptions (Mac only), RapidOcrOptions) accept the same
# force_full_page_ocr=True flag and can be swapped in once imported.
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)

# Enable OCR and table-structure recovery, and plug in the OCR backend.
pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    do_table_structure=True,
    ocr_options=ocr_options,
)
pipeline_options.table_structure_options.do_cell_matching = True

# Register the pipeline configuration for PDF inputs only.
pdf_format = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

# Convert, then dump the recovered document as markdown.
doc = converter.convert(input_doc).document
md = doc.export_to_markdown()
print(md)

<!-- image -->

<!-- image -->

## COMMERCIAL POLICY CHANGE REQUEST

<!-- image -->

Page 1 of 2

© 1991-2015 ACORD CORPORATION.

19 fe | 9025

CARRIER

AGENCY

ATTENTION

POLICY NUMBER

[ Si. | 2 uy 4

ACCOUNT NUMBER

99296 GS2 jOO0¢Q

EFFECTIVE DATE OF CHANGE [ 5 OF] 2025

POLICY INCEPTION DATE

\_

POLICY EXPIRATION DATE

CONTACT

ADONESS:

eee

iam

cove:

S023.

SUBCODE:

lle

[20

as

2]

©€[

202)

AGENCY CUSTOMER BD:

Poucy

PROPERTY

‘AUTO

|\_\_|

workers comp

NAMED INSURED

INLAND MARINE

TRUCKERS:

UMBRELLA |\_|

Motor carriers

|\_|

INSURED'S

NAME

AND

MAILING

ADDRESS,

IF

CHANGED

(INC

ZiPr4)

GENERAL LeiuiTy

|\_|

BusINEss

owners

|]

THIS IS AN ACKNOWLEDGEMENT OF YOUR REQUEST. UPON APFROVAL, THE COMPANY'S RECORDS WILL BE ADJUSTED ACCORDINGLY, AND IF A PREMIUM ADJUSTMENT IS REQUIRED, IT WILL BE DONE AT PREMIUM AUDIT OR BY ENDORSEMENT.

Ajay»

Gachibewl?

+ydera

CITY LIMITS

INSIDE

OUTSIDE

INTEREST

OWNER

DELETE

Loc #

BLD?

DELETE

ci

1

SYM/

AGE

OTC

SY

In [10]:
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import (
    ConversionResult,
    DocumentConverter,
    InputFormat,
    PdfFormatOption,
)
from docling_ocr_onnxtr import OnnxtrOcrOptions


def main(source: str = "Commercial Policy.pdf") -> None:
    """Convert a PDF with docling using the OnnxTR OCR plugin and print it as markdown.

    Args:
        source: Document to convert, passed straight to
            ``DocumentConverter.convert``. Defaults to the sample
            "Commercial Policy.pdf" so that calling ``main()`` with no
            arguments behaves as before.
    """
    # Available detection & recognition models can be found at
    # https://github.com/felixdittrich92/OnnxTR
    #
    # Alternatively, pick a model from the Hugging Face Hub collection:
    # https://huggingface.co/collections/Felix92/onnxtr-66bf213a9f88f7346c90e842
    ocr_options = OnnxtrOcrOptions(
        # Text detection model
        det_arch="db_mobilenet_v3_large",
        # Text recognition model - from Hugging Face Hub
        reco_arch="Felix92/onnxtr-parseq-multilingual-v1",
        # Set to `True` to auto-correct the orientation of the pages
        auto_correct_orientation=False,
    )

    pipeline_options = PdfPipelineOptions(
        ocr_options=ocr_options,
    )
    # OnnxtrOcrOptions is imported from the separate docling_ocr_onnxtr
    # package, so external plugins have to be explicitly allowed.
    pipeline_options.allow_external_plugins = True

    # Apply the pipeline configuration to PDF inputs only.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            ),
        },
    )

    # Convert the document and print its markdown rendering.
    conversion_result: ConversionResult = converter.convert(source=source)
    doc = conversion_result.document
    md = doc.export_to_markdown()
    print(md)


# Run the conversion only when this code is executed as the main module.
if __name__ == "__main__":
    main()

Downloading https://github.com/felixdittrich92/OnnxTR/releases/download/v0.2.0/db_mobilenet_v3_large-4987e7bd.onnx to /home/akshaymambakam/.cache/onnxtr/models/db_mobilenet_v3_large-4987e7bd.onnx


16077824it [00:02, 7163358.25it/s]                               


<!-- image -->

ACORD 175 (2016/03)

AGENCY

!ź/#\_ /2ø¿$

CARRIER

NAIC CODE

ATTENTION

9.b

CONIACT.

POLICY! NUMBER

NAME:

|}~

PHONE.

Ł %&amp;'( {\_ô\_ £

12-144

LAIç,-no,Æ Ext):\_

ACCOUNT NUMBER

FAX.

¥45.! Noì:\_

]^\_ 9

'iv 24 ylabiubì empu mi

1132645210072

ADDRESS:\_

EFFECTIVE DĄTE OF( CHANGE 6 |0+ 2025

POLICY! INCEPTIONI DATE

CODE;

Te2.ï 3

SUBCODE:

2./:

POLICY,EXPIRATION.D DATE

lo

6

2ebi

lx

AGENCYC CUSTOMERI ID;

POLICY TYPE

PROPERTY

AUTO.

°6|

2o)2

NAMEDI INSURED

WORKERS comp

INLANDI MARINE

TRUCKERS

unshine

UMBRELLA

kawiin

MOTOR CARRIERS

Żicckton.ì&lt;

INSURED'S

NAME/

ANDI

MAILING./

ADDRESS,!

IFA

CHANGED

(INCZ

CZIP+4)

GENERALI LIABILITY

BUSINESS

OWNERS

Å

THIS IS AN. ACKNOWLEDGEMENT OF YOUR REQUEST. UPON APFROVAL, THE COMPANY'S RECORDS WILL BE ADJUSTED ACCORDINGLY,\_ AND\_ IF A PREMIUM. ADJUSTMENT IS ¡ITI WILLI BEI DONE. REQUIRED,! AT PREMIUM, AUDIT ORI BY ENDORSEMENT.

Gachì|öÄî

#4##ç.í%[!

Aja9

SHORT

DESCRIPTION\_

OF

CHANGE