# This is a notebook for testing and exploring how we can do RAG directly in MongoDB

In [4]:
# --- 🔌 Connect to MongoDB ---
from pymongo import MongoClient
from pprint import pprint
from dotenv import load_dotenv
import os
# Call load_dotenv() before the environment lookup below; the order matters
# because MONGO_URI is expected to come from the dotenv configuration.
load_dotenv()

# Connection settings: the URI is read from the environment (None when the
# variable is unset); the database and collection names are fixed.
MONGODB_URI = os.environ.get("MONGO_URI")
DATABASE_NAME, COLLECTION_NAME = "AIDoc", "diseases"

# NOTE(review): when MONGODB_URI is None, PyMongo presumably falls back to its
# default host instead of failing -- confirm that is the intended behavior.
client = MongoClient(host=MONGODB_URI)

In [None]:
# Resolve the database and collection handles used by the cells below
db = client.get_database(DATABASE_NAME)
collection = db.get_collection(COLLECTION_NAME)

# --- 📄 Fetch and print 1 sample document (the cursor is limited to one) ---
print("📄 Sample documents from 'diseases':\n")
separator = f"\n{'-' * 60}\n"  # blank line, 60 dashes, blank line
for doc in collection.find().limit(1):
    pprint(doc)
    print(separator)

📄 Sample documents from 'diseases':

{'_id': ObjectId('687b0122aeb398e52bb28ca8'),
 'chapter': {'authors': 'K Ravi, Vallish Shenoy',
             'end_page': 857,
             'number': 6,
             'start_page': 834,
             'title': 'Arboviral Infections'},
 'component': {'editor': 'Dilip Mathai',
               'id': '1C',
               'title': 'Viral Infections'},
 'created_at': datetime.datetime(2025, 7, 19, 2, 21, 22, 60000),
 'extraction_status': 'Success',
 'medical_content': {'investigations': ['CBC with differential',
                                        'Platelet count',
                                        'Hematocrit serial determinations',
                                        'AST/ALT levels',
                                        'NS1 antigen test',
                                        'RT-PCR and serotyping',
                                        'IgM and IgG antibodies by MAC-ELISA '
                                        'or indirect IgG-ELI

In [None]:
# --- Topics we'll extract from 'medical_content' ---
# Each entry is a key looked up in a document's 'medical_content' mapping;
# one chunk is produced per topic whose value is a non-empty list.
TOPICS = [
    "symptoms",
    "investigations",
    "treatment_steps",
    "red_flags",
    "patient_advice",
    "prognosis_followup",
]


# --- Format document into natural-language RAG chunks ---
def format_chunks_from_doc(doc):
    """Build one natural-language chunk per populated topic of a document.

    Args:
        doc: A mapping that may contain 'chapter' (with 'title' and
            'number'), 'component' (with 'title') and 'medical_content'
            (topic name -> list of items). Any of them may be missing or
            null.

    Returns:
        A list of dicts with the keys 'disease', 'topic', 'text', 'source'
        and 'tags', in TOPICS order. Topics that are absent, not a list, or
        contain no usable items are skipped, so the list may be empty.
    """
    # `or {}` instead of a .get() default: a key that is present but holds
    # null would otherwise come back as None and break the nested lookups.
    chapter_info = doc.get("chapter") or {}
    component_info = doc.get("component") or {}
    medical = doc.get("medical_content") or {}

    disease = chapter_info.get("title") or "Unknown Disease"
    chapter = chapter_info.get("number")
    if chapter is None:  # explicit check so a chapter number of 0 survives
        chapter = ""
    component = str(component_info.get("title") or "")
    source = f"AIDoc > {component} > Chapter {chapter}"

    chunks = []
    for topic in TOPICS:
        items = medical.get(topic)
        if not isinstance(items, list):
            continue

        # Coerce to text so numeric items do not break the join, and drop
        # null / blank entries so they do not leave stray ", " separators.
        entries = [
            str(item)
            for item in items
            if item is not None and str(item).strip()
        ]
        if not entries:
            continue

        readable_topic = topic.replace("_", " ").capitalize()
        joined = ", ".join(entries)
        chunks.append(
            {
                "disease": disease,
                "topic": topic,
                "text": f"{readable_topic} for {disease} include: {joined}.",
                "source": source,
                "tags": [component.lower(), topic.lower()],
            }
        )
    return chunks

# --- Preview chunks from the first 2 documents ---
# Prints every chunk built from up to two documents, then a summary line.
total = 0
doc_count = 0  # counted, not assumed: the cursor may yield fewer than 2 documents
for doc in collection.find().limit(2):
    chunks = format_chunks_from_doc(doc)
    for chunk in chunks:
        pprint(chunk)
        print("\n---\n")
    total += len(chunks)
    doc_count += 1

print(f"✅ Formatted {total} chunks from {doc_count} documents.")

{'disease': 'Arboviral Infections',
 'source': 'AIDoc > Viral Infections > Chapter 6',
 'tags': ['viral infections', 'symptoms'],
 'text': 'Symptoms for Arboviral Infections include: Fever, Headache, '
         'Vomiting, Pain on moving eyes, Muscle pains, Joint pains, Macular or '
         'maculopapular rash, Leukopenia, Neck stiffness, Drowsiness, '
         'Disorientation, Stupor or coma, Convulsions, Absent or irregular '
         'deep tendon reflexes, Extensor plantar reflex, Erythrocytic '
         'diapedesis in skin lesions, Edema, Hemorrhage, Thrombocytopenia, '
         'Oligo- or polyarticular arthritis with joint swelling and redness, '
         'Retro-orbital pain, Transient macular rash, Tachycardia, Narrow '
         'pulse pressure, Persistent vomiting, Severe abdominal pain, Tender '
         'hepatomegaly, Ascites, Pleural effusion, Mucosal bleeding, Seizures, '
         'Encephalopathy, Aseptic meningitis/meningoencephalitis, Intracranial '
         'hemorrhage, H