In [1]:
import datetime
import logging
import time
from pathlib import Path

import pandas as pd

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.export import generate_multimodal_pages
from docling.utils.utils import create_hash

  from pandas.core import (


In [2]:
_log = logging.getLogger(__name__)
IMAGE_RESOLUTION_SCALE = 2.0

In [3]:
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./paciente.pdf")
output_dir = Path("scratch")

# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# scale=1 correspond of a standard 72 DPI image
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_page_images = True

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

start_time = time.time()

conv_res = doc_converter.convert(input_doc_path)

output_dir.mkdir(parents=True, exist_ok=True)

rows = []
for (
    content_text,
    content_md,
    content_dt,
    page_cells,
    page_segments,
    page,
) in generate_multimodal_pages(conv_res):

    dpi = page._default_image_scale * 72

    rows.append(
        {
            "document": conv_res.input.file.name,
            "hash": conv_res.input.document_hash,
            "page_hash": create_hash(
                conv_res.input.document_hash + ":" + str(page.page_no - 1)
            ),
            "image": {
                "width": page.image.width,
                "height": page.image.height,
                "bytes": page.image.tobytes(),
            },
            "cells": page_cells,
            "contents": content_text,
            "contents_md": content_md,
            "contents_dt": content_dt,
            "segments": page_segments,
            "extra": {
                "page_num": page.page_no + 1,
                "width_in_points": page.size.width,
                "height_in_points": page.size.height,
                "dpi": dpi,
            },
        }
    )

# Generate one parquet from all documents
df = pd.json_normalize(rows)
now = datetime.datetime.now()
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
df.to_parquet(output_filename)

end_time = time.time() - start_time

_log.info(
    f"Document converted and multimodal pages generated in {end_time:.2f} seconds."
)

INFO:docling.document_converter:Going to convert document batch...


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

INFO:docling.pipeline.base_pipeline:Processing document paciente.pdf
INFO:docling.document_converter:Finished converting document paciente.pdf in 99.36 sec.
INFO:__main__:Document converted and multimodal pages generated in 99.96 seconds.


In [5]:
# This block demonstrates how the file can be opened with the HF datasets library
from datasets import Dataset
from PIL import Image
multimodal_df = pd.read_parquet(output_filename)

# Convert pandas DataFrame to Hugging Face Dataset and load bytes into image
dataset = Dataset.from_pandas(multimodal_df)
def transforms(examples):
    examples["image"] = Image.frombytes('RGB', (examples["image.width"], examples["image.height"]), examples["image.bytes"], 'raw')
    return examples
dataset = dataset.map(transforms)

print(multimodal_df)


Map:   0%|          | 0/23 [00:00<?, ? examples/s]

        document                                               hash  \
0   paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
1   paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
2   paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
3   paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
4   paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
5   paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
6   paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
7   paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
8   paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
9   paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
10  paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
11  paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
12  paciente.pdf  2c4dfcf2d685d0801a894a1ef60b02ba055f43d9bd70e4...   
13  pa