## Smart Chunking & Metadata Tagging

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/devayushrout/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from nltk.tokenize import sent_tokenize
import os
from tqdm import tqdm
import tiktoken 

In [3]:
tokenizer = tiktoken.get_encoding("p50k_base")

def count_tokens(text):
    return len(tokenizer.encode(text))

In [4]:
def smart_chunk(text, source, max_tokens=500):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    current_tokens = 0

    for sentence in sentences:
        tokens = count_tokens(sentence)
        if current_tokens + tokens > max_tokens:
            if current_chunk:
                chunks.append({
                    "text": current_chunk.strip(),
                    "metadata": {
                        "source": source
                    }
                })
            current_chunk = sentence
            current_tokens = tokens
        else:
            current_chunk += " " + sentence
            current_tokens += tokens

    # Add last chunk
    if current_chunk:
        chunks.append({
            "text": current_chunk.strip(),
            "metadata": {
                "source": source
            }
        })

    return chunks

In [5]:
knowledge_dir = "Baymax_KnowledgeBase"
all_chunks = []

for filename in tqdm(os.listdir(knowledge_dir)):
    filepath = os.path.join(knowledge_dir, filename)
    with open(filepath, "r", encoding="utf-8") as f:
        raw_text = f.read()

    source_name = filename.replace(".txt", "")
    chunks = smart_chunk(raw_text, source=source_name)
    all_chunks.extend(chunks)

print(f"Total chunks created: {len(all_chunks)}")

100%|██████████| 5/5 [00:03<00:00,  1.33it/s]

Total chunks created: 7435





In [6]:
print(all_chunks[0])

{'text': 'Section\nObjectives of IPHS for HWC-PHC | 5 \nObjectives of IPHS \nfor HWC-PHC\n3\nThe broad objectives of the Indian Public Health Standards (IPHS) for PHC in rural and urban areas include \nthe following:\n1. To define uniform benchmark to ensure high quality services that are accountable, responsive, and \nsensitive to the needs of the community. 2. To specify the minimum assured (essential) and achievable (desirable) services that are expected to \nbe provided at different levels of public health facilities. 3. To provide guidance on health systems strengthening components which includes architectural \ndesign of facilities, human resources for health, drugs, diagnostics, equipment, administrative and \nlogistical support services to improve the overall health related outcomes \n4. To achieve and maintain an acceptable standard of the quality of care at public facilities\n5. To facilitate monitoring and supervision of the facilities\n6. To provide guidance and tools for g

In [7]:
import json

with open("baymax_chunks.json", "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, indent=2, ensure_ascii=False)

In [8]:
from langchain.docstore.document import Document
import os

docs = []

base_path = "Baymax_KnowledgeBase"

# Manual source labels for each file
file_to_source = {
    "rural_care.txt": "rural_care",
    "cmdt.txt": "clinical_guidelines",
    "symptom_flow.txt": "consultation_flow",
    "iphs.txt": "protocol_guidelines",
    "nfi.txt": "medication_safety"
}

# Optional: Priority settings (used later for filtering)
file_priority = {
    "rural_care.txt": 1,
    "symptom_flow.txt": 2,
    "iphs.txt": 2,
    "cmdt.txt": 3,
    "nfi.txt": 3
}

# Loop and load
for filename in os.listdir(base_path):
    if filename.endswith(".txt"):
        filepath = os.path.join(base_path, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()
            docs.append(Document(
                page_content=content,
                metadata={
                    "source": file_to_source.get(filename, "unknown"),
                    "priority": file_priority.get(filename, 3),
                    "filename": filename
                }
            ))

## Embeddings

In [9]:
from langchain.embeddings import HuggingFaceEmbeddings

In [10]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm
