In [None]:
from rich.pretty import pprint
from rich.console import Console
from rich.markdown import Markdown

from qdrant_client import models

# from rage.embeddings import IonosEmbeddings
from langchain_openai import OpenAIEmbeddings

from rage.loaders import PDFMarkdownLoaeder
from rage.splitters import MarkdownSplitter
from rage.retriever import Retriever

In [None]:
# Shared rich Console; used further down to render a text chunk as Markdown.
console = Console()

In [None]:
# Pipeline components: the loader is used below to read the source PDF and the
# splitter to break the loaded documents into text chunks.
# NOTE(review): "PDFMarkdownLoaeder" looks like a misspelling of "Loader", but it
# matches the name imported from rage.loaders, so it has to stay as-is here.
loader = PDFMarkdownLoaeder()
splitter = MarkdownSplitter()

In [None]:
# Absolute path of the PDF to ingest.
# NOTE(review): presumably a mounted/container path - confirm it exists in the
# environment where this notebook is run.
file_path = "/resources/pdf/zaratustra.pdf"
# load() is awaited directly at cell level, which relies on the notebook
# kernel supporting top-level await.
documents = await loader.load(source_path=file_path)

# Sanity check: number of loaded documents, plus a look at the first one.
print(f"documents => {len(documents)}")
pprint(documents[0])

In [None]:
# Split the loaded documents into text chunks.
text_chunks = splitter.split_documents(documents=documents)

# Sanity check: total number of chunks and a preview of the first three.
print(f"text_chunks => {len(text_chunks)}")
pprint(text_chunks[:3])

In [None]:
# Render one chunk as Markdown to eyeball what the splitter produced.
# 57 is a hand-picked sample position; it is clamped to the last available
# chunk so the cell still runs when the document yields fewer than 58 chunks
# (an unguarded text_chunks[57] raises IndexError in that case).
if len(text_chunks) > 0:
    sample_chunk = text_chunks[min(57, len(text_chunks) - 1)]
    console.print(Markdown(sample_chunk.text))
else:
    # Nothing to preview: say so instead of failing the whole run.
    console.print("text_chunks is empty - nothing to preview")

In [None]:
# Dense embedding model: OpenAI "text-embedding-3-large", configured with
# dimensions=256.
# NOTE(review): no API key is passed here, so it is presumably picked up from
# the environment - confirm.
dense_embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=256,
)

# Alternative embedding backend, disabled (its import is commented out as well).
# dense_embeddings = IonosEmbeddings()
# The retriever is constructed with the embedding model chosen above.
retriever = Retriever(dense_embeddings=dense_embeddings)

In [None]:
# Name of the collection that the insert and the later search cells operate on.
collection_name = "zaratustra"
# Alternative collection name, disabled.
# NOTE(review): presumably paired with the commented-out IonosEmbeddings - confirm.
# collection_name = "zaratustra-ie"

# Create the collection, then insert the text chunks into it.
# NOTE(review): these two calls are not awaited, unlike dense_search and
# hybrid_search further down - confirm they are synchronous.
# NOTE(review): re-running this cell repeats both calls; verify that
# create_collection tolerates an already existing collection and that
# insert_text_chunks does not duplicate entries.
retriever.create_collection(collection_name=collection_name)
retriever.insert_text_chunks(
    collection_name=collection_name,
    text_chunks=text_chunks,
)

In [None]:
query = "Que quiere el gran Dragón?"
retriever_items = await retriever.dense_search(
    collection_name=collection_name,
    query=query,
    k=5,
    score_threshold=0.5,
)

pprint(retriever_items)

In [None]:
retriever_items = await retriever.hybrid_search(
    collection_name=collection_name,
    query=query,
    k=5,
    # score_threshold=0.5,
)

pprint(retriever_items)

In [None]:
# Exact-value match on one specific chunk id.
chunk_id_match = models.MatchValue(value="7c1c61cb243b57b1fa5609ab2b1afecd")
# Condition applying that match to the "metadata.chunk_id" field.
chunk_id_condition = models.FieldCondition(
    key="metadata.chunk_id",
    match=chunk_id_match,
)
# Filter in which the condition is mandatory ("must").
search_filter = models.Filter(must=[chunk_id_condition])

# Scroll the collection with that filter and a limit of 10.
# NOTE(review): not awaited, unlike dense_search / hybrid_search - confirm that
# scroll is synchronous.
retriever_items = retriever.scroll(
    scroll_filter=search_filter,
    collection_name=collection_name,
    limit=10,
)

# Inspect whatever the scroll returned.
pprint(retriever_items)

In [None]:
# retriever_items = await retriever.retrieve(
#     collection_name=collection_name,
#     query=query,
# )

# pprint(retriever_items)