In [1]:
from langgraph.graph import StateGraph, END 
from typing import TypedDict, List
from langchain_core.documents import Document



In [2]:

from dotenv import load_dotenv
import os
# Read the project's .env file into the process environment, then fetch the
# Gemini key from it. The lookup yields None when the variable is not set.
load_dotenv(dotenv_path="E:\\AI-Buildathon\\.env")
api_key = os.environ.get("GOOGLE_GENAI_API_KEY")




In [3]:
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model used to generate the final answers; api_key comes from the
# environment lookup in the previous cell.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.7,
    api_key=api_key,
)


 



In [4]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def load_pdf_documents(file_path):
    """Load every PDF matched by ``**/*.pdf`` under ``file_path``.

    Args:
        file_path: Directory handed to ``DirectoryLoader`` as the search root.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns when each matched file is
        parsed with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(file_path, glob="**/*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Load every PDF under the project data folder via load_pdf_documents().
document=load_pdf_documents("E:\\AI-Buildathon\\data")

# Bare expression: the notebook displays how many items the loader returned.
len(document)
 




Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 76 0 (offset 0)


2031

In [5]:
from pathlib import PureWindowsPath

# Enhanced metadata mapping based on actual file names.
# Keys are PDF file names; each value carries the publisher label ("source")
# and document category ("type") stamped onto every page loaded from that file.
source_mapping = {
    "9789241505550_eng.pdf": {"source": "WHO", "type": "Clinical Guidelines"},
    "brb-mn-21-01-guideline-2017-eng-guidelines-for-antenatal-care-in-barbados-revised-feb-2017.pdf": {"source": "Barbados Ministry of Health", "type": "Antenatal Care Guidelines"},
    "dokumen.pub_the-pregnancy-encyclopedia-9780241660119-9780241731628.pdf": {"source": "Pregnancy Encyclopedia", "type": "Comprehensive Reference"},
    "Exclusive-breastfeeding-guide-2022_1-August.pdf": {"source": "Breastfeeding Authority", "type": "Breastfeeding Guide"},
    "Maternal Nutrition UNICEF.pdf": {"source": "UNICEF", "type": "Nutrition Guidelines"},
    "pregnancy-and-childbirth-expecting-a-baby-pregnancy-guide-pregnancy-what-to-expect-pregnancy-health-pregnancy-eating-and-recipes.pdf": {"source": "Pregnancy Health Guide", "type": "Patient Education"},
    "WHO.pdf": {"source": "WHO", "type": "General Guidelines"},
    "who_postnatal.pdf": {"source": "WHO", "type": "Postnatal Care Guidelines"}
}

for doc in document:
    # This loop overwrites metadata["source"] with the publisher label, so on a
    # second run of the cell the original path is gone. Prefer the file name
    # recorded by the earlier run; otherwise a re-run would look up e.g. "WHO"
    # as a file name and re-tag every page as "Unknown".
    filename = doc.metadata.get("filename", "")

    if not filename:
        # Get source and handle both string and list types
        source = doc.metadata.get("source", "")

        # If source is a list, take the first element
        if isinstance(source, list):
            source = source[0] if source else ""

        # PureWindowsPath accepts both "\\" and "/" as separators, so the file
        # name is extracted whichever style the loader reported.
        filename = PureWindowsPath(str(source)).name if source else ""

    # Files missing from the mapping fall back to a generic label.
    source_info = source_mapping.get(filename, {"source": "Unknown", "type": "General"})

    doc.metadata.update({
        "source": source_info["source"],
        "document_type": source_info["type"],
        "filename": filename,
        "region": "Global/Bangladesh",
        "language": "en",
        "confidence_level": "primary",
        "domain": "maternal_health"
    })


In [6]:
def text_splitter(documents):
    """Split ``documents`` into overlapping chunks and tag every chunk.

    The splitter targets 800-character chunks with a 200-character overlap,
    measured with ``len``, and tries the separators in the listed order
    (paragraph break, line break, sentence end, comma, space, then anything).

    Args:
        documents: Documents passed straight to ``split_documents``.

    Returns:
        The list of chunks, each with ``chunk_id`` (its position in the list)
        and ``chunk_size`` (length of its ``page_content``) added to its
        metadata.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", ", ", " ", ""],
        chunk_size=800,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = splitter.split_documents(documents)

    # Record position and length on each chunk for later tracking.
    for position, piece in enumerate(chunks):
        piece.metadata.update(chunk_id=position, chunk_size=len(piece.page_content))

    return chunks

# Chunk the loaded pages, then report how many chunks there are and their
# mean length in characters.
text_chunk = text_splitter(document)
chunk_count = len(text_chunk)
total_characters = sum(chunk.metadata['chunk_size'] for chunk in text_chunk)
print(f"Number of Chunks: {chunk_count}")
print(f"Average Chunk Size: {total_characters / chunk_count:.0f} characters")

Number of Chunks: 6457
Average Chunk Size: 640 characters


In [7]:
from langchain_huggingface import HuggingFaceEmbeddings

# Using a more powerful embedding model optimized for semantic understanding
# Options: 
# 1. "sentence-transformers/all-mpnet-base-v2" - Better semantic understanding (768 dim)
# 2. "BAAI/bge-base-en-v1.5" - State-of-the-art retrieval performance (768 dim)
# 3. "sentence-transformers/multi-qa-mpnet-base-dot-v1" - Optimized for Q&A (768 dim)

# BGE base (English) embeddings, run on the CPU; switch the device to 'cuda'
# when a GPU is available. Vectors are normalised at encode time.
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    encode_kwargs=dict(normalize_embeddings=True),
    model_kwargs=dict(device='cpu'),
)


In [8]:
# Read the project's .env file again and fetch the Pinecone key; the lookup
# yields None when the variable is not set.
load_dotenv(dotenv_path="E:\\AI-Buildathon\\.env")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

In [9]:
from pinecone import Pinecone

# Create the Pinecone client with the key read from the environment, then
# open a handle on the index named "test".
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index("test")

In [10]:
from langchain_pinecone import PineconeVectorStore

# Embed the chunks and upload them to the "test" Pinecone index.
#
# Every vector gets a deterministic id built from the chunk_id that
# text_splitter() stored in the chunk metadata. Without explicit ids each run
# of this cell inserts a fresh copy of every chunk, so the index fills up with
# duplicates that then crowd the retriever's results.
# NOTE(review): chunk_id is positional, so ids shift if the PDF set changes;
# clear the index before re-ingesting a different corpus.
docstore = PineconeVectorStore.from_documents(
    documents=text_chunk,
    embedding=embedding_model,
    index_name="test",
    ids=[f"chunk-{chunk.metadata['chunk_id']}" for chunk in text_chunk],
)


In [11]:
# Attach to the already-populated "test" index, embedding queries with the
# same model that was used for the chunks.
docsearch = PineconeVectorStore.from_existing_index(
    index_name="test",
    embedding=embedding_model,
)

In [12]:
# MMR (Maximal Marginal Relevance) retriever: fetch 30 candidates, then keep
# the 8 that best balance relevance against diversity. lambda_mult runs from
# 0.0 (diversity) to 1.0 (relevance).
retriever = docsearch.as_retriever(
    search_kwargs=dict(k=8, fetch_k=30, lambda_mult=0.7),
    search_type="mmr",
)

# Alternative: Use similarity search with score threshold for high-precision retrieval
# retriever = docsearch.as_retriever(
#     search_type="similarity_score_threshold",
#     search_kwargs={
#         "k": 8,
#         "score_threshold": 0.7  # Only return chunks with similarity > 0.7
#     }
# )


In [13]:
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain

# System prompt for the assistant. Template variables filled at invoke time:
#   {language} - language the answer must be written in
#   {context}  - retrieved chunks inserted by the stuff-documents chain
# Line breaks are real "\n" escapes: the previous "\\n" spelling sent a literal
# backslash-n to the model and collapsed the whole prompt onto one line.
system_prompt = (
    "You are MaBondhu AI, an expert maternal health assistant providing evidence-based prenatal care guidance.\n\n"

    "# YOUR MISSION\n"
    "Deliver safe, accurate, culturally-appropriate maternal health advice for expecting mothers in {language}.\n\n"

    "# RESPONSE PROTOCOL\n"
    "## Emergency Detection (Priority 1)\n"
    "- Identify danger signs: severe bleeding, severe headache, blurred vision, severe abdominal pain, reduced fetal movement, high fever, seizures, severe swelling\n"
    "- If detected: Start with '⚠️ EMERGENCY' and recommend IMMEDIATE medical attention\n"
    "- List nearest action: 'Go to emergency room NOW' or 'Call emergency services'\n\n"

    "## Medical Uncertainty (Priority 2)\n"
    "- If context insufficient or condition complex, state: 'I recommend consulting with a healthcare provider for proper evaluation'\n"
    "- List specific tests/vitals needed (e.g., blood pressure, ultrasound, blood sugar)\n\n"

    "## Standard Guidance (Priority 3)\n"
    "For routine questions about nutrition, prenatal care, symptoms, lifestyle:\n"
    "1. **Main Answer**: Provide clear, detailed explanation using retrieved context\n"
    "2. **Evidence**: Cite sources explicitly (e.g., [WHO Guidelines] or [UNICEF Nutrition Guide])\n"
    "3. **Actionable Steps**: List 2-3 specific actions with details:\n"
    "   - What to do (e.g., 'Take 60mg iron supplement daily')\n"
    "   - When to do it (e.g., 'with meals to reduce nausea')\n"
    "   - Why it matters (e.g., 'prevents anemia during pregnancy')\n"
    "4. **Timeline/Schedule**: If relevant, provide checkup schedules or timelines\n"
    "5. **Safety Note**: Mention any precautions or warning signs to watch\n\n"

    "## Information Not Available\n"
    "- If question outside your knowledge: 'I don't have specific information about this. Please consult with your healthcare provider for accurate guidance.'\n"
    "- Never make up medical information\n\n"

    "# RESPONSE QUALITY STANDARDS\n"
    "✓ Evidence-based: Use only information from retrieved context\n"
    "✓ Specific: Provide concrete numbers, dosages, timelines\n"
    "✓ Empathetic: Use reassuring, supportive language\n"
    "✓ Cultural: Consider Bangladesh context and local healthcare access\n"
    "✓ Clear: Write in simple {language} avoiding complex medical jargon\n"
    "✓ Structured: Use bullet points, numbered lists for readability\n\n"

    "# EXAMPLE RESPONSES\n"
    "Q: 'What foods should I eat during pregnancy?'\n"
    "A: '**Nutritious Foods for Healthy Pregnancy** [UNICEF Nutrition Guide]\n\n"
    "Focus on these food groups:\n"
    "1. **Iron-rich foods**: Red meat, spinach, lentils (prevents anemia)\n"
    "2. **Calcium sources**: Milk, yogurt, cheese (builds baby's bones)\n"
    "3. **Folate-rich**: Leafy greens, beans, fortified cereals (prevents birth defects)\n"
    "4. **Protein**: Eggs, fish, chicken, dal (supports baby's growth)\n\n"
    "**Action Steps:**\n"
    "- Eat 3 balanced meals + 2 snacks daily\n"
    "- Take prenatal vitamin with 400mcg folic acid\n"
    "- Drink 8-10 glasses of water daily\n"
    "- Avoid raw/undercooked meat and unpasteurized dairy\n\n"
    "Next checkup: Discuss any dietary concerns with your doctor.'\n\n"

    "# CONTEXT FROM KNOWLEDGE BASE\n"
    "{context}\n\n"

    "# YOUR RESPONSE\n"
    "Now answer the user's question following the protocol above in {language}. Be thorough yet concise."
)

# Two-message chat template: the system prompt above, then the user's question.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [14]:
# Wire the pipeline together: the inner chain puts the retrieved documents
# into the prompt and calls the LLM; the outer chain runs the retriever first
# and feeds its results to the inner chain.
question_answering_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answering_chain,
)


In [15]:
# Query preprocessing function to improve retrieval
def preprocess_query(query: str) -> str:
    """Rewrite a user question so it retrieves better from the knowledge base.

    Two things happen:

    1. Known medical abbreviations (``bp``, ``anc``, ``c-section``, ...) are
       replaced by their full terms. An abbreviation is only replaced when it
       stands alone as a token, in any letter case, so "BP", "Bp" and "bp" are
       all expanded while the "anc" inside "pregnancy" or "financial" is left
       untouched.
    2. If the original question mentions none of the pregnancy keywords, the
       phrase "during pregnancy" is appended.

    Args:
        query: The question as typed by the user.

    Returns:
        The rewritten query with surrounding whitespace stripped.
    """
    import re  # local import: keeps this cell independent of import order

    # Common medical term mappings
    term_expansion = {
        "bp": "blood pressure",
        "hb": "hemoglobin",
        "anc": "antenatal care",
        "pnc": "postnatal care",
        "ttv": "tetanus toxoid vaccine",
        "ifa": "iron folic acid",
        "usg": "ultrasound",
        "c-section": "cesarean section",
        "ob-gyn": "obstetrician gynecologist"
    }

    # One alternation, longest abbreviation first. The lookarounds require a
    # non-word character (or the string edge) on both sides, which also works
    # for hyphenated keys such as "c-section".
    alternatives = "|".join(
        re.escape(abbrev) for abbrev in sorted(term_expansion, key=len, reverse=True)
    )
    abbreviation_pattern = re.compile(rf"(?<!\w)({alternatives})(?!\w)", re.IGNORECASE)

    # A single pass means already-expanded text is never scanned again.
    expanded = abbreviation_pattern.sub(
        lambda match: term_expansion[match.group(1).lower()], query
    )

    # Add pregnancy context if not present (checked on the question as typed)
    pregnancy_keywords = ["pregnancy", "pregnant", "prenatal", "antenatal", "maternal", "expecting"]
    query_lower = query.lower()
    if not any(keyword in query_lower for keyword in pregnancy_keywords):
        expanded = f"{expanded} during pregnancy"

    return expanded.strip()


In [16]:
# Response validation and safety check
def validate_response(response: dict, query: str) -> dict:
    """Apply keyword-based safety checks to a chain response.

    If the user's question contains an emergency keyword and the answer does
    not already contain the word ``EMERGENCY``, a warning is put in front of
    the answer. A disclaimer footer is then appended unless it is already
    there. Calling the function again on its own output changes nothing.

    Args:
        response: Chain output; its ``'answer'`` entry is read and, when
            something is added, overwritten in place. A missing or ``None``
            answer is treated as an empty string.
        query: The question as typed by the user (matched case-insensitively).

    Returns:
        The same ``response`` dict that was passed in.
    """
    answer = response.get('answer') or ''

    # Substring matches against the lower-cased question. Both word orders of
    # "severe headache" are listed, and the danger signs named in the system
    # prompt ("severe abdominal pain", "reduced fetal movement") are covered.
    # Broad terms such as 'blood' deliberately err on the side of warning.
    emergency_keywords = [
        'bleeding', 'blood', 'severe pain', 'severe abdominal pain',
        'severe headache', 'headache severe',
        'blurred vision', 'seizure', 'convulsion', 'unconscious',
        'baby not moving', 'reduced movement', 'reduced fetal movement',
        'high fever', 'water broke', 'contractions', 'severe swelling'
    ]

    query_lower = query.lower()
    is_emergency = any(keyword in query_lower for keyword in emergency_keywords)

    safety_prefix = (
        "⚠️ IMPORTANT: This may require medical attention. "
        "If you're experiencing severe symptoms, please contact your healthcare provider immediately.\n\n"
    )

    # Skip the prefix when the model already flagged an emergency, or when an
    # earlier validation pass already added it.
    if is_emergency and 'EMERGENCY' not in answer and not answer.startswith(safety_prefix):
        answer = safety_prefix + answer
        response['answer'] = answer

    # Add disclaimer footer
    disclaimer = (
        "\n\n---\n"
        "ℹ️ *This advice is for informational purposes. "
        "Always consult your healthcare provider for medical decisions.*"
    )

    if disclaimer not in answer:
        response['answer'] = answer + disclaimer

    return response


In [22]:
# Enhanced query interface with preprocessing and validation

# Interactive entry point: read one question, run it through the RAG chain,
# apply the safety post-processing and list the sources that were retrieved.
print("🤰 MaBondhu AI - Your Maternal Health Assistant")
print("="*50)
inp = input("\nEnter your question: ").strip()
language_choice = input("Language (English/Bengali): ").strip() or "English"

if not inp:
    # A blank question would otherwise be rewritten by preprocess_query into
    # the bare phrase "during pregnancy" and sent to the chain.
    print("\nNo question entered - nothing to look up.")
else:
    # Preprocess query for better retrieval
    processed_query = preprocess_query(inp)
    print(f"\n🔍 Processing: {processed_query}...\n")

    # "language" fills the {language} placeholders of the system prompt.
    response = rag_chain.invoke({
        "input": processed_query,
        "language": language_choice
    })

    # Safety checks look at the question as typed, not the expanded query.
    validated_response = validate_response(response, inp)

    # Display answer
    print("\n" + "="*50)
    print("📋 MaBondhu AI Response:")
    print("="*50)
    print(validated_response['answer'])

    # Optionally show source documents (de-duplicated, alphabetical)
    if 'context' in validated_response:
        print("\n📚 Sources Used:")
        sources = set()
        for doc in validated_response['context']:
            source = doc.metadata.get('source', 'Unknown')
            doc_type = doc.metadata.get('document_type', '')
            sources.add(f"- {source} ({doc_type})")
        for source in sorted(sources):
            print(source)

ü§∞ MaBondhu AI - Your Maternal Health Assistant

üîç Processing: What should I do at my first antenatal appointment...


üìã MaBondhu AI Response:
**Your First Antenatal Appointment** [Context from Knowledge Base]

Your first antenatal appointment, often called the booking appointment, is a comprehensive visit usually scheduled when you are 8-12 weeks pregnant. This appointment is crucial for setting up your prenatal care plan and gathering essential health information.

**What to Expect at Your First Appointment:**

1.  **Information and Discussion:**
    *   You will receive information about diet, lifestyle considerations, available pregnancy care services, and maternity benefits. This is an opportunity to ask questions and discuss any concerns you have.
    *   You'll be given information to help you make informed decisions about screening tests.

2.  **Health Checks and Measurements:**
    *   Your Body Mass Index (BMI) and blood pressure (BP) will be measured.
    *   A urine