In [1]:
from qazure import get_llm, get_embedder
import logging
import sys

import qdrant_client
from IPython.display import Markdown, display
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import Settings

from llama_index.node_parser.docling import DoclingNodeParser
from llama_index.readers.docling import DoclingReader

  from .autonotebook import tqdm as notebook_tqdm


In [52]:
dir_path = "docs"

In [2]:
logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)  # logging.DEBUG for more verbose output
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [54]:
llm = get_llm()
embed_model = get_embedder()

In [55]:
Settings.llm = llm
Settings.embed_model = embed_model

In [56]:
client = qdrant_client.QdrantClient(
    host="localhost",
    port=6333
)

INFO:httpx:HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"


In [57]:
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
node_parser = DoclingNodeParser()

In [58]:
dir_reader = SimpleDirectoryReader(
    input_dir=dir_path,
    file_extractor={".pdf": reader},
)

In [59]:
vector_store = QdrantVectorStore(client=client, collection_name="individual_upload_test")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=reader.load_data("docs/2408.09869.pdf"),
    transformations=[node_parser],
    storage_context=storage_context
)

INFO:httpx:HTTP Request: GET http://localhost:6333/collections/individual_upload_test/exists "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333/collections/individual_upload_test/exists "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333/collections/individual_upload_test/exists "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333/collections/individual_upload_test/exists "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333/collections/individual_upload_test/exists "HTTP/1.1 200 OK"
INFO:docling.document_converter:Going to convert document batch...
Going to convert document batch...
Going to convert document batch...
Going to convert document batch...
Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 70041f74270850b7bedf7c8f5c2dcede
Initializing pipeline for StandardPdfPipeline with options hash 70041f74270850b7bedf7c8f5c2dcede
Initializing pipeline for StandardPdfPipeline with optio



INFO:docling.document_converter:Finished converting document 2408.09869.pdf in 95.90 sec.
Finished converting document 2408.09869.pdf in 95.90 sec.
Finished converting document 2408.09869.pdf in 95.90 sec.
Finished converting document 2408.09869.pdf in 95.90 sec.
Finished converting document 2408.09869.pdf in 95.90 sec.
INFO:httpx:HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://eval

In [40]:
vector_store = QdrantVectorStore(client=client, collection_name="docling_test")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=dir_reader.load_data(),
    transformations=[node_parser],
    storage_context=storage_context
)

INFO:httpx:HTTP Request: GET http://localhost:6333/collections/docling_test/exists "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333/collections/docling_test/exists "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333/collections/docling_test/exists "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333/collections/docling_test/exists "HTTP/1.1 200 OK"
INFO:docling.document_converter:Going to convert document batch...
Going to convert document batch...
Going to convert document batch...
Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 70041f74270850b7bedf7c8f5c2dcede
Initializing pipeline for StandardPdfPipeline with options hash 70041f74270850b7bedf7c8f5c2dcede
Initializing pipeline for StandardPdfPipeline with options hash 70041f74270850b7bedf7c8f5c2dcede
Initializing pipeline for StandardPdfPipeline with options hash 70041f74270850b7bedf7c8f5c2dcede
INFO:docling.utils.accelerator_utils:



INFO:docling.document_converter:Finished converting document 2408.09869.pdf in 90.06 sec.
Finished converting document 2408.09869.pdf in 90.06 sec.
Finished converting document 2408.09869.pdf in 90.06 sec.
Finished converting document 2408.09869.pdf in 90.06 sec.
INFO:httpx:HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/te

In [41]:
# set Logging to DEBUG for more detailed outputs
query = "Which are the main AI models in Docling?"
query_engine = index.as_query_engine()
result = query_engine.query(query)

INFO:httpx:HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/docling_test/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:6333/collections/docling_test/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:6333/collections/docling_test/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:6333/collect

In [42]:
print(f"Q: {query}\nA: {result.response.strip()}\n\nSources:")
display([(n.text, n.metadata) for n in result.source_nodes])

Q: Which are the main AI models in Docling?
A: The main AI models in Docling are a layout analysis model, which is an accurate object-detector for page elements, and TableFormer, a state-of-the-art table structure recognition model.

Sources:


[('As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',
  {'file_path': '/home/akshaymambakam/genai/llama_index/docs/2408.09869.pdf',
   'file_name': '2408.09869.pdf',
   'file_type': 'application/pdf',
   'file_size': 5566575,
   'creation_date': '2025-06-06',
   'last_modified_date': '2025-06-06',
   'schema_name': 'docling_core.transforms.chunker.DocMeta',
   'version': '1.0.0',
   'doc_items': [{'self_ref': '#/texts/50',
     'parent'

In [43]:
from pathlib import Path
from tempfile import mkdtemp
import requests

SOURCE = "https://arxiv.org/pdf/2408.09869"
tmp_dir_path = Path(mkdtemp())
r = requests.get(SOURCE)
with open(tmp_dir_path / f"{Path(SOURCE).name}.pdf", "wb") as out_file:
    out_file.write(r.content)

In [None]:
index = VectorStoreIndex.from_documents(
    documents=dir_reader.load_data(),
    transformations=[node_parser],
    storage_context=storage_context
)

Loading files:   0%|          | 0/2 [00:00<?, ?file/s]

INFO:docling.document_converter:Going to convert document batch...
Going to convert document batch...
Going to convert document batch...
Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document 2408.09869.pdf
Processing document 2408.09869.pdf
Processing document 2408.09869.pdf
Processing document 2408.09869.pdf




INFO:docling.document_converter:Finished converting document 2408.09869.pdf in 69.22 sec.
Finished converting document 2408.09869.pdf in 69.22 sec.
Finished converting document 2408.09869.pdf in 69.22 sec.
Finished converting document 2408.09869.pdf in 69.22 sec.


Loading files:  50%|█████     | 1/2 [01:09<01:09, 69.24s/file]

INFO:docling.document_converter:Going to convert document batch...
Going to convert document batch...
Going to convert document batch...
Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document 2506.04899v1.pdf
Processing document 2506.04899v1.pdf
Processing document 2506.04899v1.pdf
Processing document 2506.04899v1.pdf
INFO:docling.document_converter:Finished converting document 2506.04899v1.pdf in 42.59 sec.
Finished converting document 2506.04899v1.pdf in 42.59 sec.
Finished converting document 2506.04899v1.pdf in 42.59 sec.
Finished converting document 2506.04899v1.pdf in 42.59 sec.


Loading files: 100%|██████████| 2/2 [01:51<00:00, 55.93s/file]


INFO:httpx:HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https

In [3]:
# Test OCR from image

In [4]:
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
node_parser = DoclingNodeParser()

In [5]:
handler = reader.aload_data('form_docling.png')

In [16]:
import json
for i, document in enumerate(documents):
    print('--------------------------')
    print('Document:', i)
    print(json.dumps(document.dict(),indent=4))

--------------------------
Document: 0
{
    "id_": "7ad5e06f-9e2e-4e11-a475-a1a59dc5c69a",
    "embedding": null,
    "metadata": {},
    "excluded_embed_metadata_keys": [],
    "excluded_llm_metadata_keys": [],
    "relationships": {},
    "metadata_template": "{key}: {value}",
    "metadata_separator": "\n",
    "text_resource": {
        "embeddings": null,
        "text": "{\"schema_name\": \"DoclingDocument\", \"version\": \"1.3.0\", \"name\": \"form_docling\", \"origin\": {\"mimetype\": \"application/pdf\", \"binary_hash\": 14045974798944755523, \"filename\": \"form_docling.png\"}, \"furniture\": {\"self_ref\": \"#/furniture\", \"children\": [], \"content_layer\": \"furniture\", \"name\": \"_root_\", \"label\": \"unspecified\"}, \"body\": {\"self_ref\": \"#/body\", \"children\": [{\"$ref\": \"#/texts/0\"}, {\"$ref\": \"#/groups/0\"}, {\"$ref\": \"#/tables/0\"}, {\"$ref\": \"#/texts/91\"}], \"content_layer\": \"body\", \"name\": \"_root_\", \"label\": \"unspecified\"}, \"groups\"