In [31]:
import json
import logging
import time
from pathlib import Path

In [32]:
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions

In [33]:
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

In [34]:
_log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [35]:
# Configure the PDF pipeline: OCR on, table-structure recovery on, with
# table cells matched back to the PDF text cells.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
# OCR language is set to Spanish.
# NOTE(review): confirm "es" matches the language of the input document.
pipeline_options.ocr_options.lang = ["es"]
# Keep the rendered page images on the converted document. The table export
# cell calls `element.get_image(...)`, which crops the table out of its page
# image; without page images that call returns None and `.save(...)` fails.
pipeline_options.generate_page_images = True
# Optional accelerator tuning. NOTE(review): AcceleratorOptions / Device are
# not imported in the visible cells -- import them before enabling this.
# pipeline_options.accelerator_options = AcceleratorOptions(
#     num_threads=4, device=Device.AUTO
# )

# Converter that applies the options above to PDF inputs.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)


In [36]:
# PDF handed to doc_converter.convert() in the conversion cell. The path is
# relative, so it is resolved against the kernel's current working directory.
input_doc_path = "ambuja test.pdf"

In [None]:
# Convert the document and report how long it took. perf_counter() is a
# monotonic clock, so the measured interval cannot be skewed by system clock
# adjustments the way a time.time() difference can.
start_time = time.perf_counter()
conv_result = doc_converter.convert(input_doc_path)
# Despite its name, `end_time` holds the elapsed duration in seconds.
end_time = time.perf_counter() - start_time
# Lazy %-formatting: the message is only built if the record is emitted.
_log.info("Document converted in %.2f seconds.", end_time)


In [39]:
## Export results
# Folder that receives the exported files; created (with any missing parents)
# on first run and reused afterwards.
output_dir = Path("scratch")
output_dir.mkdir(exist_ok=True, parents=True)
# File name of the converted input without its extension, used as the prefix
# for exported file names.
_source_file = conv_result.input.file
doc_filename = _source_file.stem

In [None]:
# Save every table found in the converted document as a PNG named
# "<doc_filename>-table-<n>.png" inside output_dir.
table_counter = 0
# Not updated in this cell. NOTE(review): presumably reserved for a picture
# export step -- confirm or remove.
picture_counter = 0
for element, _level in conv_result.document.iterate_items():
    if not isinstance(element, TableItem):
        continue
    # Count every table, even ones that end up skipped, so file numbers keep
    # matching the table's position in the document.
    table_counter += 1
    # Fetch the image BEFORE opening the output file: get_image() can return
    # None (e.g. when the conversion did not keep page images), and opening
    # first would leave an empty .png behind and then crash on `.save`.
    table_image = element.get_image(conv_result.document)
    if table_image is None:
        _log.warning(
            "No image available for table %d; skipping "
            "(was the pipeline run with generate_page_images=True?)",
            table_counter,
        )
        continue
    element_image_filename = (
        output_dir / f"{doc_filename}-table-{table_counter}.png"
    )
    with element_image_filename.open("wb") as fp:
        table_image.save(fp, "PNG")

In [None]:
# Debugging probe: display the type of whatever get_image() returns.
# `element` is the loop variable left over from the table export loop, i.e.
# the last item yielded by iterate_items() (not necessarily a TableItem); it
# is undefined if that loop has not run or yielded nothing.
type(element.get_image(conv_result.document))