In [1]:
from uuid import uuid4

import ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from qdrant_client import QdrantClient, models

from plum_chatbot.configs.folders import PDF_DIR
from plum_chatbot.configs.settings import Settings

In [None]:
pages = []
files = list(filter(lambda x: "merged" not in str(x), PDF_DIR.glob("*.pdf")))

for document in files:
    loader = PyPDFLoader(PDF_DIR / document)
    content = ""
    async for page in loader.alazy_load():
        # pages.append(page)
        content += page.page_content + "\n"
    pages.append(content)
pages.sort(key=lambda x: int(x.split(".")[0]))

In [None]:
# Splitter shared by the chunking step below: it breaks text on line breaks
# first and on periods second, targeting a chunk size of 500 with an overlap
# of 50 between neighbouring chunks.
# Alternative separator list kept for reference: ["\n\n", "\n", ".", " "]
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n", "."],
    chunk_overlap=50,
    chunk_size=500,
)

In [None]:
# Build one record per text chunk across all PDFs in `files`. Each record
# carries a fresh random ID, the chunk text, and metadata naming the source
# file and the chunk's position within that file.
all_chunks = []
for document in files:
    loader = PyPDFLoader(document)
    docs = loader.load()
    # Split the loaded pages into chunks with the shared `text_splitter`.
    splits = text_splitter.split_documents(docs)
    for i, split in enumerate(splits):
        all_chunks.append(
            {
                "id": str(uuid4()),
                "text": split.page_content,
                "metadata": {
                    "source": document.name,
                    # Index restarts at 0 for every document.
                    "chunk_index": i,
                    # `stem` drops only the final suffix, regardless of its
                    # case, and leaves any ".pdf" inside the name untouched.
                    "title": document.stem,
                },
            }
        )

In [None]:
# Name of the Qdrant collection this notebook writes to.
COLLECTION_NAME = "FAQ2"

# Project settings; they supply the Qdrant URL and API key used below.
settings = Settings()

# Ollama client used to generate embeddings.
# NOTE(review): "host.docker.internal" suggests the notebook runs inside a
# container and talks to an Ollama server on the Docker host -- confirm.
oclient = ollama.Client(host="host.docker.internal")

# Qdrant client used to create the collection and upload vectors.
qclient = QdrantClient(
    url=settings.QDRANT_URL,
    api_key=settings.QDRANT_API_KEY,
)

In [None]:
# Embed the text of every chunk with Ollama, then make sure the target Qdrant
# collection exists before anything is uploaded.
EMBEDDING_MODEL = "llama3.2"

# One embeddings request per chunk, in the same order as `all_chunks`.
responses = [
    oclient.embeddings(model=EMBEDDING_MODEL, prompt=content["text"])
    for content in all_chunks
]
embeddings = [response["embedding"] for response in responses]

# Create the collection if it doesn't already exist
if not qclient.collection_exists(COLLECTION_NAME):
    # Size the collection from the first non-empty embedding. The upload step
    # skips empty embeddings, so the first entry is not guaranteed to be usable,
    # and `embeddings` is empty altogether when there are no chunks.
    vector_size = next((len(vector) for vector in embeddings if vector), None)
    if vector_size is None:
        raise ValueError(
            f"Cannot create collection {COLLECTION_NAME!r}: "
            "no non-empty embeddings were produced to determine the vector size."
        )
    qclient.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=vector_size, distance=models.Distance.COSINE
        ),
    )

In [None]:
# Store every chunk in the collection: the embedding becomes the point's
# vector and the whole chunk record becomes its payload. Chunks whose
# embedding is empty are left out.
# NOTE(review): the point ID is the chunk's position in `all_chunks`, while the
# payload carries its own "id" field -- confirm which one is meant to identify
# the point.
indexed_pairs = enumerate(zip(all_chunks, embeddings))
points = [
    models.PointStruct(id=position, vector=vector, payload=chunk)
    for position, (chunk, vector) in indexed_pairs
    if vector
]

qclient.upsert(collection_name=COLLECTION_NAME, points=points)