# Notebook for testing how we can do RAG (retrieval-augmented generation) directly in MongoDB

In [4]:
# --- 🔌 Connect to MongoDB ---
from pymongo import MongoClient
from pprint import pprint
from dotenv import load_dotenv
import os
load_dotenv()  # pull variables from a local .env file into the process environment
MONGODB_URI = os.getenv("MONGO_URI")
# Fail fast when the URI is missing: MongoClient(None) would silently target
# the default localhost:27017 host instead of the intended cluster.
if not MONGODB_URI:
    raise RuntimeError("❌ Missing MONGO_URI in environment (.env).")
DATABASE_NAME = "AIDoc"
COLLECTION_NAME = "diseases"
client = MongoClient(MONGODB_URI)

In [None]:
# Read from MongoDB: resolve the database and collection handles used below
db = client[DATABASE_NAME]
collection = db[COLLECTION_NAME]

# --- 📄 Fetch and print 1 sample document (limit(1)) to inspect its shape ---
print("📄 Sample documents from 'diseases':\n")
for doc in collection.find().limit(1):
    pprint(doc)
    # visual separator after each printed document
    print("\n" + "-"*60 + "\n")

📄 Sample documents from 'diseases':

{'_id': ObjectId('687b0122aeb398e52bb28ca8'),
 'chapter': {'authors': 'K Ravi, Vallish Shenoy',
             'end_page': 857,
             'number': 6,
             'start_page': 834,
             'title': 'Arboviral Infections'},
 'component': {'editor': 'Dilip Mathai',
               'id': '1C',
               'title': 'Viral Infections'},
 'created_at': datetime.datetime(2025, 7, 19, 2, 21, 22, 60000),
 'extraction_status': 'Success',
 'medical_content': {'investigations': ['CBC with differential',
                                        'Platelet count',
                                        'Hematocrit serial determinations',
                                        'AST/ALT levels',
                                        'NS1 antigen test',
                                        'RT-PCR and serotyping',
                                        'IgM and IgG antibodies by MAC-ELISA '
                                        'or indirect IgG-ELI

In [None]:
# --- Topics we'll extract from 'medical_content' ---
# Every topic key that holds a non-empty list yields one chunk per document.
TOPICS = [
    "symptoms",
    "investigations",
    "treatment_steps",
    "red_flags",
    "patient_advice",
    "prognosis_followup"
]

# --- Format document into natural-language RAG chunks ---
def format_chunks_from_doc(doc):
    """Turn one `diseases` document into a list of natural-language RAG chunks.

    For each key in TOPICS whose value under ``doc["medical_content"]`` is a
    non-empty list, builds one sentence of the form
    "<Topic> for <disease> include: <item>, <item>." and wraps it in a dict.

    Args:
        doc: A mapping with optional "chapter", "component" and
            "medical_content" sub-documents. Missing or ``None`` sub-documents
            are treated as empty.

    Returns:
        A list of dicts with keys "disease", "topic", "text", "source" and
        "tags", in TOPICS order. Empty when no topic has a non-empty list.
    """
    # `or {}` also covers keys that are present but explicitly set to None
    chapter_info = doc.get("chapter") or {}
    component_info = doc.get("component") or {}
    medical = doc.get("medical_content") or {}

    disease = chapter_info.get("title", "Unknown Disease")
    chapter = chapter_info.get("number", "")
    component = component_info.get("title") or ""
    source = f"AIDoc > {component} > Chapter {chapter}"

    chunks = []
    for topic in TOPICS:
        items = medical.get(topic)
        if not (isinstance(items, list) and items):
            continue

        # str() so that numeric or other non-string list items cannot break the join
        joined = ", ".join(str(item) for item in items)
        readable_topic = topic.replace('_', ' ')
        sentence = f"{readable_topic.capitalize()} for {disease} include: {joined}."

        chunks.append({
            "disease": disease,
            "topic": topic,
            "text": sentence,
            "source": source,
            "tags": [component.lower(), topic.lower()]
        })
    return chunks

# --- Preview chunks from the first few documents ---
PREVIEW_DOC_LIMIT = 2  # how many source documents to preview
total = 0
docs_seen = 0  # counted from the cursor: the collection may hold fewer docs than the limit
for doc in collection.find().limit(PREVIEW_DOC_LIMIT):
    docs_seen += 1
    chunks = format_chunks_from_doc(doc)
    for chunk in chunks:
        pprint(chunk)
        print("\n---\n")
    total += len(chunks)

# Report the number of documents actually read rather than a hard-coded "2"
print(f"✅ Formatted {total} chunks from {docs_seen} documents.")

{'disease': 'Arboviral Infections',
 'source': 'AIDoc > Viral Infections > Chapter 6',
 'tags': ['viral infections', 'symptoms'],
 'text': 'Symptoms for Arboviral Infections include: Fever, Headache, '
         'Vomiting, Pain on moving eyes, Muscle pains, Joint pains, Macular or '
         'maculopapular rash, Leukopenia, Neck stiffness, Drowsiness, '
         'Disorientation, Stupor or coma, Convulsions, Absent or irregular '
         'deep tendon reflexes, Extensor plantar reflex, Erythrocytic '
         'diapedesis in skin lesions, Edema, Hemorrhage, Thrombocytopenia, '
         'Oligo- or polyarticular arthritis with joint swelling and redness, '
         'Retro-orbital pain, Transient macular rash, Tachycardia, Narrow '
         'pulse pressure, Persistent vomiting, Severe abdominal pain, Tender '
         'hepatomegaly, Ascites, Pleural effusion, Mucosal bleeding, Seizures, '
         'Encephalopathy, Aseptic meningitis/meningoencephalitis, Intracranial '
         'hemorrhage, H

In [9]:
"""
Test run: format + embed ONLY 3 chunks from `diseases`, then insert into `disease_chunks_vector`.
Uses dotenv for env variables.

Env vars expected in your .env (both required):
- MONGO_URI=mongodb+srv://...
- OPENAI_API_KEY=sk-...

Hard-coded in this cell (NOT read from the environment):
- DB_NAME="AIDoc"
- SOURCE_COLLECTION="diseases"
- TARGET_COLLECTION="disease_chunks_vector"
- MAX_SAMPLES=3   # number of chunks to embed/insert
"""

from pymongo import MongoClient
from openai import OpenAI
from dotenv import load_dotenv
from pprint import pprint
from datetime import datetime
import os
import sys
from time import sleep

# ----------------------------
# Load environment variables
# ----------------------------
load_dotenv()

MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = "AIDoc"
SOURCE_COLLECTION = "diseases"
TARGET_COLLECTION = "disease_chunks_vector"
MAX_SAMPLES = 3  # number of chunks to embed/insert in this test run

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Name the variables exactly as they are read above, and report only the
# ones that are actually missing, so the message can be acted on directly.
missing_vars = [
    name
    for name, value in (("MONGO_URI", MONGO_URI), ("OPENAI_API_KEY", OPENAI_API_KEY))
    if not value
]
if missing_vars:
    missing_list = ", ".join(missing_vars)
    print(f"❌ Missing {missing_list} in environment.")
    sys.exit(1)

# ----------------------------
# Connect clients
# ----------------------------
mongo = MongoClient(MONGO_URI)
db = mongo[DB_NAME]
src = db[SOURCE_COLLECTION]  # collection the raw disease documents are read from
dst = db[TARGET_COLLECTION]  # collection the embedded chunks are written to

oa = OpenAI()  # reads OPENAI_API_KEY from env
EMBED_MODEL = "text-embedding-3-small"

# ----------------------------
# Formatting config
# ----------------------------
# Keys of `medical_content` that are turned into chunks (one chunk per key).
TOPICS = [
    "symptoms",
    "investigations",
    "treatment_steps",
    "red_flags",
    "patient_advice",
    "prognosis_followup",
]

def safe_lower(s: str) -> str:
    """Return `s` stripped and lower-cased; "general" when `s` is None or blank."""
    return (s or "").strip().lower() or "general"

def format_chunks_from_doc(doc):
    """Turn one `diseases` doc into multiple natural-language RAG chunks.

    For each key in TOPICS whose value under ``doc["medical_content"]`` is a
    non-empty list, one chunk dict is produced.

    Args:
        doc: A mapping with optional "_id", "chapter", "component" and
            "medical_content" entries. Sub-documents that are missing or
            ``None`` are treated as empty.

    Returns:
        A list of dicts with keys "disease", "topic", "text", "source",
        "tags", "source_doc_id" and "created_at" (timezone-aware UTC),
        in TOPICS order.
    """
    # Local import: the cell's import block only brings in `datetime` itself.
    from datetime import timezone

    # `or {}` also covers keys that exist but are explicitly None
    chapter_info = doc.get("chapter") or {}
    component_info = doc.get("component") or {}
    medical = doc.get("medical_content") or {}

    disease = chapter_info.get("title") or "Unknown Disease"
    chapter_no = chapter_info.get("number", "")
    component_title = component_info.get("title") or ""
    source_str = f"AIDoc > {component_title} > Chapter {chapter_no}".strip()

    # datetime.utcnow() is deprecated and yields a naive value; use an aware
    # UTC timestamp, taken once so all chunks of a document share it.
    created_at = datetime.now(timezone.utc)

    chunks = []
    for topic in TOPICS:
        items = medical.get(topic)
        if not (isinstance(items, list) and items):
            continue
        joined = ", ".join(str(x) for x in items)
        readable_topic = topic.replace("_", " ")
        sentence = f"{readable_topic.capitalize()} for {disease} include: {joined}."
        chunks.append({
            "disease": disease,
            "topic": topic,
            "text": sentence,
            "source": source_str,
            "tags": [safe_lower(component_title), topic.lower()],
            "source_doc_id": doc.get("_id"),
            "created_at": created_at,
        })
    return chunks

def embed_text(text: str):
    """Request an embedding for `text` from the OpenAI embeddings endpoint.

    Uses the module-level client `oa` with the model named by `EMBED_MODEL`,
    and returns the `embedding` attribute of the first entry in the
    response's `data` list.
    """
    response = oa.embeddings.create(input=text, model=EMBED_MODEL)
    first_entry = response.data[0]
    return first_entry.embedding

# ----------------------------
# Main: gather 3 chunks, embed, insert
# ----------------------------
# Walks source documents in cursor order, formats each into chunks, embeds
# every chunk's text and writes it to `dst`, until MAX_SAMPLES chunks have
# been written or the cursor (capped at 50 documents) is exhausted.
# NOTE(review): this uses insert_one, so re-running the cell appears to add the
# same chunks again; consider an upsert keyed on (source_doc_id, topic).
inserted = 0      # chunks successfully embedded and written
scanned_docs = 0  # source documents pulled from the cursor

# Stream documents and stop when we've inserted MAX_SAMPLES chunks
# The projection keeps only the fields the formatter reads; `_id` is still
# returned (MongoDB includes it unless explicitly excluded).
for doc in src.find({}, {"chapter": 1, "component": 1, "medical_content": 1}).limit(50):
    scanned_docs += 1
    chunks = format_chunks_from_doc(doc)
    for ch in chunks:
        # Checked before each chunk so a document with many topics cannot overshoot the cap
        if inserted >= MAX_SAMPLES:
            break
        try:
            ch["embedding"] = embed_text(ch["text"])
            res = dst.insert_one(ch)
            inserted += 1
            print(f"✅ Inserted [{inserted}/{MAX_SAMPLES}] {ch['disease']} - {ch['topic']}  _id={res.inserted_id}")
        except Exception as e:
            # NOTE(review): the failed chunk is skipped, not retried; the sleep
            # only delays the next chunk's attempt.
            print(f"❌ Error inserting chunk for doc {doc.get('_id')}: {e}")
            # small backoff in case of rate limits
            sleep(1)
    # The inner `break` only leaves the chunk loop; leave the document loop too
    if inserted >= MAX_SAMPLES:
        break

print(f"\n🎉 Done. Scanned docs: {scanned_docs}, Inserted chunks: {inserted}/{MAX_SAMPLES} into '{TARGET_COLLECTION}'.")


✅ Inserted [1/3] Arboviral Infections - symptoms  _id=689eb79e77b9b3ae434bf88c
✅ Inserted [2/3] Arboviral Infections - investigations  _id=689eb79f77b9b3ae434bf88d
✅ Inserted [3/3] Arboviral Infections - treatment_steps  _id=689eb7a077b9b3ae434bf88e

🎉 Done. Scanned docs: 1, Inserted chunks: 3/3 into 'disease_chunks_vector'.


In [10]:
from pymongo import MongoClient
from openai import OpenAI
from dotenv import load_dotenv
from pprint import pprint
import os

# Pull settings from the .env file
load_dotenv()

MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = "AIDoc"
COLL = "disease_chunks_vector"
INDEX_NAME = "vector_index"

# MongoDB handles for the chunk collection
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
col = db[COLL]

# OpenAI client and the embedding model used for the query
openai_client = OpenAI()
EMBED_MODEL = "text-embedding-3-small"

# 1. Embed the user's question with the same model name as the stored chunks
question = "What are the danger signs in dengue?"
query_response = openai_client.embeddings.create(model=EMBED_MODEL, input=question)
embedding = query_response.data[0].embedding

# 2. Build the aggregation: a vector-search stage followed by a projection
vector_search_stage = {
    "$vectorSearch": {
        "index": INDEX_NAME,
        "path": "embedding",
        "queryVector": embedding,
        "numCandidates": 100,
        "limit": 3,
        # "filter": { "disease": "Arboviral Infections" }  # optional
    }
}
projection_stage = {
    "$project": {
        "_id": 0,
        "disease": 1,
        "topic": 1,
        "text": 1,
        "tags": 1,
        "score": {"$meta": "vectorSearchScore"},
    }
}
pipeline = [vector_search_stage, projection_stage]

# 3. Execute the pipeline and show each hit
results = list(col.aggregate(pipeline))
print(f"\n📌 Top {len(results)} chunks for: '{question}'\n")
for hit in results:
    pprint(hit)
    print("—" * 60)



📌 Top 3 chunks for: 'What are the danger signs in dengue?'

{'disease': 'Arboviral Infections',
 'score': 0.7567188739776611,
 'tags': ['viral infections', 'symptoms'],
 'text': 'Symptoms for Arboviral Infections include: Fever, Headache, '
         'Vomiting, Pain on moving eyes, Muscle pains, Joint pains, Macular or '
         'maculopapular rash, Leukopenia, Neck stiffness, Drowsiness, '
         'Disorientation, Stupor or coma, Convulsions, Absent or irregular '
         'deep tendon reflexes, Extensor plantar reflex, Erythrocytic '
         'diapedesis in skin lesions, Edema, Hemorrhage, Thrombocytopenia, '
         'Oligo- or polyarticular arthritis with joint swelling and redness, '
         'Retro-orbital pain, Transient macular rash, Tachycardia, Narrow '
         'pulse pressure, Persistent vomiting, Severe abdominal pain, Tender '
         'hepatomegaly, Ascites, Pleural effusion, Mucosal bleeding, Seizures, '
         'Encephalopathy, Aseptic meningitis/meningoencephalitis

In [11]:
# Collect the top retrieved texts into one context block, blank-line separated
context = "\n\n".join(doc["text"] for doc in results)

# Build a system + user prompt
system_prompt = "You are a medical assistant. Answer based only on the given context."
# Reuse `question` from the retrieval step so the prompt always matches the
# query that produced `results` (previously the text was duplicated here).
user_prompt = f"""Context:\n{context}\n\nQuestion: {question}"""

# Call OpenAI
response = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ],
    temperature=0.3
)

# `content` may be None (e.g. when the model returns no text), so fall back to ""
answer = response.choices[0].message.content or ""

print("\n💬 Answer:\n")
print(answer.strip())



💬 Answer:

The danger signs in dengue include:

- Severe abdominal pain
- Persistent vomiting
- Rapid breathing
- Bleeding gums
- Blood in vomit
- Black tarry stools
- Fatigue or restlessness
- Signs of dehydration
- Decreased level of consciousness

These signs indicate a potential progression to severe dengue, which requires immediate medical attention.
