In [None]:
import json
import os
import pathlib

import azure.identity
import openai
import pymupdf4llm
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
import qdrant_client.http.models as qmodels

In [None]:
# Name of the Qdrant collection the embedded chunks are upserted into.
COLLECTION_NAME = "BEES"
# PDF to ingest; the relative path is resolved against the notebook's working directory.
filename = "../RAG/data/Western_honey_bee.pdf"

In [None]:
# Convert the PDF at `filename` to Markdown text with pymupdf4llm.
md_text = pymupdf4llm.to_markdown(filename)

In [None]:
# Sanity check: length of the extracted Markdown text.
len(md_text)

In [None]:
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential

# Import the settings by name instead of with `*`: this shows where each value
# comes from and prevents config_qdrant from silently overwriting names defined
# in earlier cells (e.g. `filename`, `COLLECTION_NAME`).
# HOST and API_KEY are used by the QdrantClient cell further down.
from config_qdrant import API_KEY, AZURE_OPENAI_ENDPOINT, HOST

# Project client authenticated through DefaultAzureCredential.
project = AIProjectClient(
    endpoint=AZURE_OPENAI_ENDPOINT,
    credential=DefaultAzureCredential(),
)

# OpenAI client obtained from the project; used below to create embeddings.
client = project.get_openai_client(api_version="2024-10-21")

In [None]:
# Empty list placeholder; no other cell visible in this notebook reads it.
all_chunks = list()

In [None]:
# Split the Markdown into overlapping chunks: chunk_size=500 with chunk_overlap=125,
# measured with the tiktoken encoder selected by `model_name`.
# NOTE(review): lengths are counted with the "gpt-4o" tokenizer, while the chunks are
# later embedded with "text-embedding-3-small" — presumably a different encoding;
# confirm whether the mismatch matters for this use case.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-4o", chunk_size=500, chunk_overlap=125
)
# `texts` holds one document per chunk; the chunk text is read via `.page_content` below.
texts = text_splitter.create_documents([md_text])

In [None]:
# Number of chunks produced by the splitter.
len(texts)

In [None]:
# Inspect the first chunk as a spot check of the splitter output.
texts[0]

In [None]:
import uuid

In [None]:
# Build three parallel lists, one entry per chunk, that are later passed to
# upload_documents: point ids, embedding vectors and payloads.
id_list = []
embeddings_all = []
payload_list = []

for section in texts:
    chunk_text = section.page_content
    # One embeddings request per chunk; the vector is read from the first item of `.data`.
    response = client.embeddings.create(model="text-embedding-3-small", input=chunk_text)
    # A fresh random UUID identifies each chunk.
    id_list.append(str(uuid.uuid4()))
    embeddings_all.append(response.data[0].embedding)
    # The payload keeps the source file and the raw chunk text next to the vector.
    payload_list.append({"sourcefile": filename, "content": chunk_text})

In [None]:
# Preview only the first few payloads; displaying the whole list would dump the
# full text of every chunk into the notebook output.
payload_list[:3]

In [None]:
from qdrant_client import QdrantClient

In [None]:
# Client for the Qdrant instance. HOST and API_KEY are not defined in this notebook's
# own cells; they come from the config_qdrant import above.
# NOTE: upload_documents reads this module-level name directly, so renaming it
# requires updating that function as well.
qdrant_client = QdrantClient(
    url=HOST,
    api_key=API_KEY
)

In [None]:
def upload_documents(id_list,
                     embeddings_all,
                     payload_list,
                     chunk_size=20):
    """Upsert points into the Qdrant collection in fixed-size batches.

    The three lists are parallel: element ``k`` of each list describes the same
    point. Uses the module-level ``qdrant_client`` and ``COLLECTION_NAME``.

    Args:
        id_list: Point ids, one per vector.
        embeddings_all: Embedding vectors, one per point.
        payload_list: Payload dicts, one per point.
        chunk_size: Maximum number of points sent per upsert request.

    Raises:
        ValueError: If the three lists differ in length or ``chunk_size`` is
            not a positive integer.
    """
    total = len(embeddings_all)
    if not (len(id_list) == total == len(payload_list)):
        raise ValueError(
            "id_list, embeddings_all and payload_list must have the same length"
        )
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    for start in range(0, total, chunk_size):
        # `stop` is exclusive, matching Python slice semantics, so the last
        # element of every batch is included in the upload.
        stop = min(start + chunk_size, total)
        # The log line reports the inclusive index range of the batch.
        print("Inserting chunk", start, "to", stop - 1)
        qdrant_client.upsert(
            collection_name=COLLECTION_NAME,
            points=qmodels.Batch(
                ids=id_list[start:stop],
                vectors=embeddings_all[start:stop],
                payloads=payload_list[start:stop],
            ),
        )

In [None]:
# Push every embedded chunk (ids, vectors and payloads) to the Qdrant collection.
upload_documents(
    id_list=id_list,
    embeddings_all=embeddings_all,
    payload_list=payload_list,
)