In [None]:
# Load IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell execution, so edits to the project package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging
from docling.datamodel.pipeline_options_vlm_model import ResponseFormat
from pathlib import Path

from archaeo_super_prompt.dataset.load import MagohDataset
from archaeo_super_prompt.modeling.pdf_to_text.stream_ocr_manual import (
    converter,
    vllm_vlm_options
)

# 🦆📃 PDF complete ingestion with Docling preprocessing

We try the young framework Docling, and its use of VLLM with models from HuggingFace repositories, to achieve these tasks:

- document OCR with Italian language analysis (VLLM)
- document chunking with these features:
    - layout-aware
    - smart tokenization

All of this is made possible by incorporating several open ML models into the Docling pipeline.

In [None]:
# Sampling settings.
# NOTE(review): neither name is referenced by any other cell visible in this
# notebook, and SEED is a float rather than an integer — confirm they are
# still needed.
MAX_SAMPLES_FETCHED = 300
SEED = 0.5

# Hand-picked record ids. The list form is given to MagohDataset and the set
# form to get_files_for_batch; `inputs` is later read through its "filepath"
# column.
_selected_ids = [35983, 31298]
dataset = MagohDataset(_selected_ids)
selected_ids = set(_selected_ids)
inputs = dataset.get_files_for_batch(selected_ids)

In [None]:
# Display the file paths that main() will hand to the converter.
inputs["filepath"].tolist()

In [None]:
def main():
    """Convert every file listed in ``inputs`` with the Docling VLM pipeline.

    Sets up INFO-level logging, builds a converter through the project's
    ``converter`` / ``vllm_vlm_options`` helpers for the Granite Vision model
    used with VLLM (Markdown response format), and runs ``convert_all`` over
    the file paths with ``page_range=(1, 2)`` and ``raises_on_error=False``.

    Returns:
        list: every item produced by ``doc_converter.convert_all``, in order.
    """
    logging.basicConfig(level=logging.INFO)

    # Value passed as `allowed_timeout`; presumably seconds, i.e. three
    # minutes per page — TODO confirm against vllm_vlm_options.
    per_page_timeout = 3 * 60

    # Example using the Granite Vision model with VLLM.
    vlm_options = vllm_vlm_options(
        model="ibm-granite/granite-vision-3.3-2b",
        prompt="OCR the full page for markdown-based processing.",
        # Original author's note: DocTags is only supported by Docling's own
        # VLLM setup for now — presumably why Markdown is requested here.
        response_format=ResponseFormat.MARKDOWN,
        allowed_timeout=per_page_timeout,
    )
    doc_converter = converter(vlm_options)

    pdf_paths = [Path(row.filepath) for row in inputs.itertuples()]
    conversions = doc_converter.convert_all(
        pdf_paths,
        page_range=(1, 2),
        raises_on_error=False,
    )
    return list(conversions)

In [None]:
# Run the conversion once; the cells below index this list as result[k][1][0]
# to reach the document object of the k-th input.
result = main()

## Inspect the results

We export the results to Markdown in order to display them.

In [None]:
from IPython.display import Markdown

# Render the Markdown export of the document at result[1][1][0] as rich
# cell output.
Markdown(result[1][1][0].export_to_markdown())

In [None]:
from docling.chunking import HierarchicalChunker

# Layout-aware chunking of one converted document with Docling's
# HierarchicalChunker; the iterator is materialised into `chunks` right away.
# NOTE(review): this chunks result[0][1][0], whereas the Markdown display
# cells and the hybrid chunker use result[1][1][0] — confirm which document
# is intended.
chunker = HierarchicalChunker()
chunk_iter = chunker.chunk(dl_doc=result[0][1][0])
chunks = list(chunk_iter)

In [None]:
# Dump every hierarchical chunk: raw text first, then the contextualized text.
# The "tokens" figures are counts of single-space-separated pieces, not
# model tokens.
for i, chunk in enumerate(chunks):
    print(f"=== {i} ===")

    # Splitting on " " yields exactly one more piece than there are spaces.
    txt_tokens = chunk.text.rstrip().count(" ") + 1
    print(f"chunk.text ({txt_tokens} tokens):\n{chunk.text!r}")

    ser_txt = chunker.contextualize(chunk=chunk)
    ser_tokens = ser_txt.rstrip().count(" ") + 1
    # The extra newline reproduces the blank separator line between chunks.
    print(
        f"chunker.contextualize(chunk) ({ser_tokens} tokens):\n{ser_txt!r}",
        end="\n\n",
    )

In [None]:
from IPython.display import Markdown

# Render result[1][1][0] again: this is the document the hybrid chunker in
# the next section splits.
Markdown(result[1][1][0].export_to_markdown())

## Hybrid-Chunking

Hybrid chunking applies a token-aware chunking pass on top of a first layout-aware chunking pass. It relies on a tokenizer built from the embedding model that will later be used for chunk selection, thesaurus match comparisons, etc.

This has two advantages:
1. The chunks are re-split into sizes better suited to processing by the embedding model.
2. The chunks can be made shorter without losing contextual information, even when the original layout consists of large paragraphs that offer no natural split points.

In [None]:
from docling.chunking import HybridChunker

from docling_core.transforms.chunker.tokenizer.huggingface import (
    HuggingFaceTokenizer,
)
from transformers import AutoTokenizer


# Embedding model whose tokenizer drives the token-aware splitting.
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
# Token budget handed to the tokenizer wrapper below.
MAX_TOKENS = 512

hf_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)
# `max_tokens` is optional; per the original note it is otherwise derived
# from the HuggingFace tokenizer itself.
tokenizer = HuggingFaceTokenizer(tokenizer=hf_tokenizer, max_tokens=MAX_TOKENS)

chunker = HybridChunker(tokenizer=tokenizer)
# Iterator over the hybrid chunks of result[1][1][0]; consumed in a later cell.
chunk_iter = chunker.chunk(dl_doc=result[1][1][0])

In [None]:
# Build the chunk iterator inside this cell instead of consuming `chunk_iter`
# from the previous one: an iterator can only be read once, so re-running
# this cell on its own used to silently produce an empty list.
hybrid_chunk_list = list(chunker.chunk(dl_doc=result[1][1][0]))

In [None]:
# Dump every hybrid chunk: raw text first, then the contextualized text.
# Token counts come from the embedding-model tokenizer configured above, so
# the figures can be compared directly with MAX_TOKENS (a whitespace split
# would only approximate a word count).
for i, chunk in enumerate(hybrid_chunk_list):
    print(f"=== {i} ===")
    txt_tokens = tokenizer.count_tokens(chunk.text)
    print(f"chunk.text ({txt_tokens} tokens):\n{chunk.text!r}")

    # The contextualized form is the chunk text as enriched by the chunker;
    # presumably this is the text that gets embedded — verify against usage.
    ser_txt = chunker.contextualize(chunk=chunk)
    ser_tokens = tokenizer.count_tokens(ser_txt)
    print(f"chunker.contextualize(chunk) ({ser_tokens} tokens):\n{ser_txt!r}")

    print()