## Another Method

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/devayushrout/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
SYMPTOM_KEYWORDS = [
    "fever", "cough", "cold", "pain", "headache", "vomiting", "diarrhea",
    "sore throat", "rash", "swelling", "chills", "fatigue", "nausea",
    "shortness of breath", "wheezing", "bleeding", "dizziness", "burns",
]

In [12]:
import os
import json
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

SYMPTOM_KEYWORDS = [
    "fever", "cough", "cold", "pain", "headache", "vomiting", "diarrhea",
    "sore throat", "rash", "swelling", "chills", "fatigue", "nausea",
    "shortness of breath", "wheezing", "bleeding", "dizziness", "burns",
]

def find_symptoms(sentence):
    symptoms = [s for s in SYMPTOM_KEYWORDS if s.lower() in sentence.lower()]
    return symptoms

def smart_chunk_text(text, source_name, priority=1, chunk_size=4):
    sentences = sent_tokenize(text)
    chunks = []
    i = 0

    while i < len(sentences):
        group = sentences[i:i+chunk_size]
        combined = " ".join(group)
        found = []

        for sent in group:
            found += find_symptoms(sent)

        found = list(set(found))  # Remove duplicates

        if found:
            chunk = {
                "text": combined,
                "symptoms": found,
                "source": source_name,
                "type": "rural_remedy" if "no_doctor" in source_name else "clinical",
                "priority": priority
            }
            chunks.append(chunk)

        i += chunk_size

    return chunks

# 🔄 Loop over all text files in your knowledge base
all_chunks = []

root_folder = "Baymax_KnowledgeBase"  # ← your base folder with 5 sources

for folder in os.listdir(root_folder):
    folder_path = os.path.join(root_folder, folder)
    if os.path.isdir(folder_path):
        priority = 1 if "no_doctor" in folder else 2 if "iphs" in folder else 3
        for file in os.listdir(folder_path):
            if file.endswith(".txt"):
                with open(os.path.join(folder_path, file), "r", encoding="utf-8") as f:
                    text = f.read()
                    chunks = smart_chunk_text(text, folder, priority)
                    all_chunks.extend(chunks)

# 💾 Save all chunks as a JSONL file
with open("symptom_chunks.jsonl", "w", encoding="utf-8") as f:
    for chunk in all_chunks:
        f.write(json.dumps(chunk) + "\n")

print(f"✅ Done. Total chunks created: {len(all_chunks)}")

✅ Done. Total chunks created: 4599


In [13]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
import json

# Load your symptom-tagged chunks
chunks = []
with open("symptom_chunks.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        chunks.append(
            Document(
                page_content=data["text"],
                metadata={
                    "symptoms": data["symptoms"],
                    "source": data["source"],
                    "type": data["type"],
                    "priority": data["priority"]
                }
            )
        )

print(f"Loaded {len(chunks)} chunks.")

# Set up embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build FAISS vectorstore from chunks
vectorstore = FAISS.from_documents(chunks, embedding_model)

# Save it to disk
vectorstore.save_local("baymax_vectorstore")
print("✅ FAISS vectorstore saved to 'baymax_vectorstore/'")

Loaded 4599 chunks.
✅ FAISS vectorstore saved to 'baymax_vectorstore/'


In [15]:
from langchain.vectorstores import FAISS

# Load saved vectorstore
vs = FAISS.load_local("baymax_vectorstore", embedding_model, allow_dangerous_deserialization=True)

# Test a query
docs = vs.similarity_search("What to do if someone has fever and headache?", k=5)

for doc in docs:
    print("\n---")
    print("Text:", doc.page_content[:300])
    print("Metadata:", doc.metadata)


---
Text: Where There Is No Doctor 2011
162
HEADACHES AND MIGRAINES
SIMPLE HEADACHE can be helped by rest 
and aspirin. It often helps to put a cloth 
soaked in hot water on the back of the 
neck and to massage (rub) the neck 
and shoulders gently. Some other home 
remedies also seem to help. Headache is comm
Metadata: {'symptoms': ['fever', 'headache'], 'source': 'rural_care', 'type': 'clinical', 'priority': 3}

---
Text: Let 
the air reach his body. This will help the fever go 
down (see p. 76). True. It helps.
Metadata: {'symptoms': ['fever'], 'source': 'rural_care', 'type': 'clinical', 'priority': 3}

---
Text: 4. Pour cool (not cold) water over him, or put cloths soaked in cool water on his 
chest and forehead. Fan the cloths and change them often to keep them cool. Continue to do this until the fever goes down (below 38°).
Metadata: {'symptoms': ['cold', 'fever'], 'source': 'rural_care', 'type': 'clinical', 'priority': 3}

---
Text: Bring the fever down as soon 
as you can and 