In [None]:
import ollama
from langchain_community.document_loaders import PyPDFLoader
from qdrant_client import QdrantClient, models

from plum_chatbot.configs.folders import PDF_DIR
from plum_chatbot.configs.settings import Settings

In [None]:
pages = []
files = list(filter(lambda x: "merged" not in str(x), PDF_DIR.glob("*.pdf")))

for document in files:
    loader = PyPDFLoader(PDF_DIR / document)
    content = ""
    async for page in loader.alazy_load():
        # pages.append(page)
        content += page.page_content + "\n"
    pages.append(content)

In [None]:
# Name of the Qdrant collection this notebook works with.
COLLECTION_NAME = "FAQ2"

# Project settings; the Qdrant URL and API key below are read from it.
settings = Settings()

# Ollama client pointed at host.docker.internal.
oclient = ollama.Client(host="host.docker.internal")

# Qdrant client configured from the settings object.
qclient = QdrantClient(
    url=settings.QDRANT_URL,
    api_key=settings.QDRANT_API_KEY,
)

In [None]:
# Embed each document's text with the Ollama model, one request per document.
responses = [oclient.embeddings(model="llama3.2", prompt=content) for content in pages]
embeddings = [response["embedding"] for response in responses]

# Create the collection if it doesn't already exist
if not qclient.collection_exists(COLLECTION_NAME):
    # Take the vector size from the first non-empty embedding. Indexing
    # embeddings[0] directly raises a bare IndexError when there are no
    # documents and gives size 0 when the first embedding is empty.
    vector_size = next(
        (len(embedding) for embedding in embeddings if embedding), None
    )
    if vector_size is None:
        raise ValueError(
            f"Cannot create collection {COLLECTION_NAME!r}: "
            "no non-empty embeddings were generated."
        )

    qclient.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=vector_size, distance=models.Distance.COSINE
        ),
    )

In [None]:
# Upload the vectors to the collection along with the original text as payload.
# Point ids are the document's position in `pages`; documents whose embedding
# is empty are skipped, so the ids may have gaps.
points = [
    models.PointStruct(id=i, vector=embedding, payload={"page_content": text})
    for i, (text, embedding) in enumerate(zip(pages, embeddings))
    if embedding
]

if points:
    qclient.upsert(collection_name=COLLECTION_NAME, points=points)
else:
    # Avoid sending an empty update request when nothing was embedded.
    print("No non-empty embeddings to upload; skipping upsert.")