In [1]:
from qazure import get_llm, get_embedder
import logging
import sys

import qdrant_client
from IPython.display import display
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import Settings

from llama_index.node_parser.docling import DoclingNodeParser
from llama_index.readers.docling import DoclingReader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Route INFO-level log records to stdout so they show up in the cell output
# (use logging.DEBUG instead for more verbose output).
#
# basicConfig() already attaches a stdout StreamHandler to the root logger, so
# no further handler is registered here: a second stdout handler makes every
# record appear twice (once formatted, once as the bare message) and keeps
# accumulating extra copies each time the cell is re-run.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [3]:
# Docling reader set to JSON export, plus the Docling node parser that is
# handed to the index as a transformation in a later cell.
reader = DoclingReader(
    export_type=DoclingReader.ExportType.JSON,
)
node_parser = DoclingNodeParser()

In [4]:
# Run the Docling conversion on the form image; the recorded run below took
# roughly 47 seconds on CPU.
ocr_documents = reader.load_data("form_docling.png")

INFO:docling.document_converter:Going to convert document batch...
Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 70041f74270850b7bedf7c8f5c2dcede
Initializing pipeline for StandardPdfPipeline with options hash 70041f74270850b7bedf7c8f5c2dcede
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
Accelerator device: 'cpu'
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
Loading plugin 'doc



INFO:docling.document_converter:Finished converting document form_docling.png in 46.55 sec.
Finished converting document form_docling.png in 46.55 sec.


In [5]:
# Obtain the chat model and the embedding model from the `qazure` helpers
# imported in the first cell.
# NOTE(review): presumably these wrap the Azure OpenAI deployments that appear
# in the request logs further down (gpt-4o and text-embedding-ada-002) —
# confirm against the qazure module.
llm = get_llm()
embed_model = get_embedder()

In [6]:
# Register both models on the llama_index Settings object in a single
# statement (LLM first, then embedder) so later llama_index calls can use them.
Settings.llm, Settings.embed_model = llm, embed_model

In [7]:
# Connect to the Qdrant server listening on localhost, port 6333.
client = qdrant_client.QdrantClient(host="localhost", port=6333)

INFO:httpx:HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"


In [8]:
# Back the index with the "ocr_test" collection on the Qdrant client.
vector_store = QdrantVectorStore(collection_name="ocr_test", client=client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index from the OCR documents: the Docling node parser is the only
# transformation applied, and the result is persisted through the Qdrant-backed
# storage context (the log below shows the embedding requests this triggers).
index = VectorStoreIndex.from_documents(
    storage_context=storage_context,
    transformations=[node_parser],
    documents=ocr_documents,
)

INFO:httpx:HTTP Request: GET http://localhost:6333/collections/ocr_test/exists "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333/collections/ocr_test/exists "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1

In [15]:
# Query the index through its default query engine.
# Tip: switch the logging level to DEBUG for more detailed output.
query_engine = index.as_query_engine()
query = "Fetch the policy expiration date."
result = query_engine.query(query)

INFO:httpx:HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/ocr_test/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:6333/collections/ocr_test/points/search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://evaln-openai.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-05-01-preview "HTTP/1.1 200 OK"


In [16]:
# Print the question with the whitespace-trimmed answer, then render the
# retrieved source nodes as (text, metadata) pairs.
answer = result.response.strip()
print(f"Q: {query}\nA: {answer}\n\nSources:")

sources = [(node.text, node.metadata) for node in result.source_nodes]
display(sources)

Q: Fetch the policy expiration date.
A: The policy expiration date is 04/01/2023.

Sources:


[('POLICY EXPRIATION ShIz',
  {'schema_name': 'docling_core.transforms.chunker.DocMeta',
   'version': '1.0.0',
   'doc_items': [{'self_ref': '#/texts/17',
     'parent': {'$ref': '#/groups/0'},
     'children': [],
     'content_layer': 'body',
     'label': 'text',
     'prov': [{'page_no': 1,
       'bbox': {'l': 782.0,
        't': 1166.6666666666667,
        'r': 933.6666666666666,
        'b': 1150.6666666666667,
        'coord_origin': 'BOTTOMLEFT'},
       'charspan': [0, 22]}]}],
   'headings': ['ACORD COMMERCIAL POLICY CHANGE REQUEST'],
   'origin': {'mimetype': 'application/pdf',
    'binary_hash': 14045974798944755523,
    'filename': 'form_docling.png'}}),
 ('04/01/2023',
  {'schema_name': 'docling_core.transforms.chunker.DocMeta',
   'version': '1.0.0',
   'doc_items': [{'self_ref': '#/texts/18',
     'parent': {'$ref': '#/groups/0'},
     'children': [],
     'content_layer': 'body',
     'label': 'text',
     'prov': [{'page_no': 1,
       'bbox': {'l': 613.666666666666