In [1]:
from qdrant_client import models, QdrantClient
# from sentence_transformers import SentenceTransformer
from docling.chunking import HybridChunker
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
from pathlib import Path

# Project directories, located two levels above the notebook's working
# directory. The join is parenthesized so that `.resolve()` visibly applies to
# the whole "cwd / relative" path and the ".." segments are collapsed once.
output_dir = (Path.cwd() / "../../outputs").resolve()
document_dir = (Path.cwd() / "../../documents").resolve()

# Folder holding the input documents that the conversion cell reads from.
folder_path = document_dir.joinpath("midi_synthesizers", "input", "test")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the Qdrant collection used by the indexing and query cells.
COLLECTION_NAME = "docling"

# VLM pipeline options built from docling's GRANITEDOCLING_MLX model spec.
# NOTE(review): the "MLX" spec presumably requires Apple-silicon hardware —
# confirm before running this notebook elsewhere.
pipeline_options = VlmPipelineOptions(vlm_options=vlm_model_specs.GRANITEDOCLING_MLX)

# Only the PDF input format is given an explicit format option; it is routed
# through the VLM pipeline with the options above.
pdf_format_option = PdfFormatOption(
    pipeline_cls=VlmPipeline,
    pipeline_options=pipeline_options,
)
doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

# Qdrant client opened on the ":memory:" location, then configured with the
# embedding model and the sparse model it should use.
client = QdrantClient(location=":memory:")
client.set_model("jinaai/jina-embeddings-v3")
client.set_sparse_model("Qdrant/bm25")

In [None]:
# Convert the source document with the configured docling converter.
# NOTE(review): the file name below has no ".pdf" extension — confirm that it
# matches the file on disk.
# TODO: also add "Faderfox EC4 quick start guide.pdf"
source_path = folder_path / "Digitone-2-User-Manual_ENG_OS1.10D_251022"
result = doc_converter.convert(source_path)

# Materialize the chunks once so the text list and the metadata list are built
# from the same sequence and stay index-aligned.
chunks = list(HybridChunker().chunk(result.document))
documents = [chunk.text for chunk in chunks]
metadatas = [chunk.meta.export_json_dict() for chunk in chunks]

# Store the chunk texts with their metadata in the collection. The return
# value is bound to `_` so it is not echoed as the cell's output.
_ = client.add(
    collection_name=COLLECTION_NAME,
    documents=documents,
    metadata=metadatas,
    batch_size=64,
)

In [None]:
# Ask a natural-language question against the indexed collection, requesting
# at most 10 results.
question = "What does it mean to link encoders?"
points = client.query(
    collection_name=COLLECTION_NAME,
    query_text=question,
    limit=10,
)

# Print every hit as: a numbered header line, the stored document text, and a
# blank separator line.
for i, hit in enumerate(points):
    print(f"=== {i} ===", hit.document, "", sep="\n")