In [1]:
import json
import os
import pathlib

import azure.identity
import openai
import pymupdf4llm
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
import qdrant_client.http.models as qmodels

Consider using the pymupdf_layout package for a greatly improved page layout analysis.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ingestion settings: the Qdrant collection that receives the chunks, and the
# PDF (path relative to this notebook) they are extracted from.
COLLECTION_NAME = "BEES"
filename = "../RAG/data/Western_honey_bee.pdf"

In [3]:
# Extract the whole PDF as a single Markdown string.
md_text = pymupdf4llm.to_markdown(filename)

In [4]:
# Sanity check: length of the extracted Markdown, in characters.
len(md_text)

133554

In [5]:
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential

# NOTE(review): wildcard import hides which names it binds. AZURE_OPENAI_ENDPOINT
# below (and presumably HOST / API_KEY used for Qdrant later) are expected to
# come from this module — confirm and consider importing them explicitly.
from config_qdrant import *

# Project-level client for the endpoint, authenticated with DefaultAzureCredential.
project = AIProjectClient(
    endpoint=AZURE_OPENAI_ENDPOINT,
    credential=DefaultAzureCredential(),
)

# OpenAI client obtained from the project, pinned to a fixed API version; it is
# the object used further down to request embeddings.
client = project.get_openai_client(api_version="2024-10-21")

In [6]:
# NOTE(review): this list is not referenced anywhere else in the visible
# notebook — possibly a leftover; confirm before removing.
all_chunks =[]

In [7]:
# Break the Markdown into overlapping chunks. Lengths are measured with the
# tiktoken encoding for the given model, so chunk_size / chunk_overlap are
# token counts rather than character counts.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=125,
    chunk_size=500,
    model_name="gpt-4o",
)

# One Document per chunk; the text of each lives in `.page_content`.
texts = text_splitter.create_documents([md_text])

In [8]:
# Sanity check: number of chunks produced by the splitter.
len(texts)

109

In [9]:
# Inspect the first chunk to verify the split looks reasonable.
texts[0]

Document(metadata={}, page_content='# **Western honey bee**\n\nThe **western honey bee** or **European honey bee** ( _**Apis**_\n_**mellifera**_ ) is the most common of the 7–12 species of [honey](https://en.wikipedia.org/wiki/Honey_bee)\n[bees worldwide](https://en.wikipedia.org/wiki/Honey_bee) ~~.~~ [[3][4]] The [genus name](https://en.wikipedia.org/wiki/Genus) _Apis_ is [Latin](https://en.wikipedia.org/wiki/Latin) for \'bee\',\nand _mellifera_ is the Latin for \'honey-bearing\' or \'honeycarrying\', referring to the species\' production of honey. [[5]]\n\n\n[Like all honey bee species, the western honey bee is eusocial,](https://en.wikipedia.org/wiki/Eusociality)\ncreating [colonies with a single fertile female (or "queen"),](https://en.wikipedia.org/wiki/Beehive)\nmany normally non-reproductive females or "workers", and a\n[small proportion of fertile males or "drones". Individual](https://en.wikipedia.org/wiki/Drone_(bee))\ncolonies can house tens of thousands of bees. Colony acti

In [10]:
import uuid

In [11]:
# Three parallel lists: entry k of each one describes the same chunk.
id_list, embeddings_all, payload_list = [], [], []

for doc in texts:
    chunk_text = doc.page_content

    # One embeddings request per chunk; keep only the vector from the response.
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=chunk_text,
    )
    vector = response.data[0].embedding

    # A fresh random UUID serves as the point id for this chunk.
    id_list.append(str(uuid.uuid4()))
    embeddings_all.append(vector)
    # Payload stored next to the vector: originating file and the raw chunk text.
    payload_list.append({"sourcefile": filename, "content": chunk_text})

In [12]:
# Inspect the payloads built above.
# NOTE(review): this displays the entire list (one entry per chunk); consider
# showing only a slice to keep the notebook output small.
payload_list

[{'sourcefile': '../RAG/data/Western_honey_bee.pdf',
  'content': '# **Western honey bee**\n\nThe **western honey bee** or **European honey bee** ( _**Apis**_\n_**mellifera**_ ) is the most common of the 7–12 species of [honey](https://en.wikipedia.org/wiki/Honey_bee)\n[bees worldwide](https://en.wikipedia.org/wiki/Honey_bee) ~~.~~ [[3][4]] The [genus name](https://en.wikipedia.org/wiki/Genus) _Apis_ is [Latin](https://en.wikipedia.org/wiki/Latin) for \'bee\',\nand _mellifera_ is the Latin for \'honey-bearing\' or \'honeycarrying\', referring to the species\' production of honey. [[5]]\n\n\n[Like all honey bee species, the western honey bee is eusocial,](https://en.wikipedia.org/wiki/Eusociality)\ncreating [colonies with a single fertile female (or "queen"),](https://en.wikipedia.org/wiki/Beehive)\nmany normally non-reproductive females or "workers", and a\n[small proportion of fertile males or "drones". Individual](https://en.wikipedia.org/wiki/Drone_(bee))\ncolonies can house tens of

In [13]:
from qdrant_client import QdrantClient

In [14]:
# Connect to the Qdrant instance. HOST and API_KEY are presumably provided by
# the wildcard import from config_qdrant — confirm.
# NOTE(review): the variable name is the same as the `qdrant_client` package
# name used in the import statements.
qdrant_client = QdrantClient(
    url=HOST,
    api_key=API_KEY
)

In [15]:
def upload_documents(id_list,
                     embeddings_all,
                     payload_list,
                     batch_size=20):
    """Upsert points into the Qdrant collection in fixed-size batches.

    The three lists are parallel: element ``k`` of each describes the same
    point. Points are written to ``COLLECTION_NAME`` through the module-level
    ``qdrant_client`` in consecutive batches of at most ``batch_size`` items,
    and a progress line is printed for every batch.

    Args:
        id_list: Point ids, one per embedding.
        embeddings_all: Embedding vectors, one per point.
        payload_list: Payload dicts, one per point.
        batch_size: Maximum number of points per upsert call. Defaults to 20,
            the value that used to be hard-coded.

    Raises:
        ValueError: If the three lists differ in length, or if ``batch_size``
            is smaller than 1.
    """
    total = len(embeddings_all)
    if not (len(id_list) == total == len(payload_list)):
        raise ValueError(
            "id_list, embeddings_all and payload_list must have the same length"
        )
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    for start in range(0, total, batch_size):
        # `stop` is an EXCLUSIVE bound, as slicing expects. Using the inclusive
        # last index as the slice end would silently drop the final point of
        # every batch.
        stop = min(start + batch_size, total)

        # The progress message reports the inclusive index range of the batch.
        print("Inserting chunk", start, "to", stop - 1)
        qdrant_client.upsert(
            collection_name=COLLECTION_NAME,
            points=qmodels.Batch(
                ids=id_list[start:stop],
                vectors=embeddings_all[start:stop],
                payloads=payload_list[start:stop],
            ),
        )

In [16]:
# Send the collected ids, embeddings and payloads to Qdrant in batches.
upload_documents(id_list,embeddings_all,payload_list)

Inserting chunk 0 to 19
Inserting chunk 20 to 39
Inserting chunk 40 to 59
Inserting chunk 60 to 79
Inserting chunk 80 to 99
Inserting chunk 100 to 108
