In [None]:
import os
# Work from the project root so relative paths such as 'Data/' resolve.
# The guard keeps this cell idempotent: once the working directory already
# contains Data/, re-running the cell no longer climbs another level up.
if not os.path.isdir('Data'):
    os.chdir('../')

In [None]:
from datetime import datetime, timezone

from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec

In [None]:
# Load variables from a .env file into the process environment so the
# API keys can be read through os.environ in the next cell.
load_dotenv()

In [None]:
# Read the API keys from the process environment and fail fast with a readable
# message when one is absent. os.environ only accepts string values, so writing
# a missing (None) key back into it would otherwise raise an opaque TypeError.
_missing_keys = [key for key in ("PINECONE_API_KEY", "GROQ_API_KEY") if not os.environ.get(key)]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
        + ". Define them in your .env file or shell before running this notebook."
    )

# The values already live in os.environ, so no write-back is needed.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
GROQ_API_KEY = os.environ["GROQ_API_KEY"]

In [None]:
# Extract data from the PDF files
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; files matching ``*.pdf`` are
            read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory
        (the loaded documents).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load the PDFs under Data/ (path is relative to the current working directory).
extracted_data=load_pdf_file(data='Data/')

In [None]:
# Preview only the first few documents; displaying the whole list floods the
# cell output and bloats the saved notebook.
extracted_data[:3]

In [None]:
# Split the data into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf_file``).
        chunk_size: Chunk size handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [None]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(f"Length of Text Chunks {len(text_chunks)}")

In [None]:
# Preview only the first few chunks; displaying the whole list floods the
# cell output and bloats the saved notebook.
text_chunks[:3]

In [None]:
# Download the embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Create the embedding model used throughout this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.

    NOTE(review): the Pinecone index in this notebook is created with
    dimension=384, presumably to match this model's vector size - confirm
    before swapping the model.
    """
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Instantiate the embedding model once; the same object is reused below for the
# sanity check and for both vector-store cells.
embeddings = download_hugging_face_embeddings()

In [None]:
# Sanity check: embed a short sentence and report the length of the vector.
query_result = embeddings.embed_query("Hello world")
print(f"Length {len(query_result)}")

In [None]:
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medibot"

# Create the index only when it does not exist yet, so this cell can be
# re-run (e.g. after a kernel restart) without failing on an existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Should equal the embedding vector length reported by the sanity
        # check on embed_query above - confirm if the embedding model changes.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [55]:
# Embed each chunk and upsert the embeddings into the Pinecone index.
# Only needed the first time the index is populated; afterwards use the
# "load existing index" cell instead.
# NOTE(review): re-running this cell presumably upserts the same chunks again
# under new IDs - confirm before executing it more than once.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [None]:
# Load the existing index
from langchain_pinecone import PineconeVectorStore
# Connect to the already-populated Pinecone index. Nothing is passed in to be
# embedded here; `embeddings` is supplied for use by the vector store itself.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [57]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x7052ba761b20>

In [58]:
# Wrap the vector store as a retriever using similarity search with k=3
# (presumably the number of chunks returned per query - confirm).
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [59]:
# Smoke-test the retriever with a sample question before wiring it into a chain.
retrieved_docs = retriever.invoke("What is Acetaminophen?")

In [60]:
# Inspect the retrieved documents (page content plus page/source metadata).
retrieved_docs

[Document(id='d2879b96-ec6f-42ec-949b-a7d138532c62', metadata={'page': 31.0, 'source': 'Data/Medical_book.pdf'}, page_content='tion. 330 C Street SW, Washington, DC 20447. (800) 392-\n3366.\nOTHER\nElder Abuse Prevention. <http://www.oaktrees.org/elder>.\nNational Institute on Drug Abuse. <http://www.nida.nih.gov>.\nLaith Farid Gulli, M.D.\nBilal Nasser, M.Sc.\nAcceleration-deceleration cervical injury\nsee Whiplash\nACE inhibitors see Angiotensin-converting\nenzyme inhibitors\nAcetaminophen\nDefinition\nAcetaminophen is a medicine used to relieve pain\nand reduce fever.\nPurpose\nAcetaminophen is used to relieve many kinds of'),
 Document(id='f883c252-1657-414d-872f-2a6e629d60a6', metadata={'page': 32.0, 'source': 'Data/Medical_book.pdf'}, page_content='immediate medical attention.\nInteractions\nAcetaminophen may interact with a variety of other\nmedicines. When this happens, the effects of one or both\nof the drugs may change or the risk of side effects may\nbe greater. Among the dr

In [None]:
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_groq import ChatGroq
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.output_parsers import StrOutputParser
import json

class DocChat:
    """Retrieval-backed medical Q&A helper built on a Groq chat model.

    Each query retrieves documents through ``retriever``, stuffs them into a
    prompt selected by ``query_type`` and records the question/answer pair in
    ``chat_history``.

    The answer text is returned exactly as the model produced it. With the
    default model the notebook output shows answers that start with a
    ``<think>`` reasoning section; it is not stripped here.
    """

    def __init__(self, groq_api_key: str, retriever, model_name: str = "deepseek-r1-distill-qwen-32b"):
        """Initialize DocChat with document retriever and model parameters.
        
        Args:
            groq_api_key (str): API key for Groq
            retriever: Document retriever for fetching relevant content
            model_name (str): Model name to use (default: deepseek-r1-distill-qwen-32b)

        Raises:
            Exception: If the Groq model or the QA chain cannot be created.
        """
        self.retriever = retriever
        self.llm = None
        self.qa_chain = None
        self.chat_history = []
        self.prompts = self._create_prompts()
        self.initialize_groq_components(groq_api_key, model_name)

    def _create_prompts(self):
        """Create prompt templates for different query types
        
        Returns:
            dict: Prompt templates keyed by "general", "medical",
                "educational" and "detailed". Every template shares the same
                base system message; the specialized ones append extra
                instructions to it.
        """
        # Base system message template for all queries
        base_system_template = """You are MedBot, an advanced AI assistant specialized in medical knowledge.

CAPABILITIES:
- Provide accurate, evidence-based medical information
- Explain medical terminology in clear, accessible language
- Interpret symptoms and medical conditions with precision
- Reference medical literature and guidelines when appropriate

CONSTRAINTS:
- You are NOT a replacement for professional medical diagnosis or treatment
- Always clarify that users should consult healthcare providers for personal medical advice
- Maintain strict medical accuracy - if unsure, acknowledge limitations
- Avoid making definitive diagnostic statements

RESPONSE FORMAT:
- Begin with a clear, direct answer to the question
- Provide context and additional relevant information
- Use bullet points for symptoms, treatments, or key facts when appropriate
- Include brief mention of relevant medical guidelines or consensus when applicable
- End with a reminder about consulting healthcare professionals when appropriate

DOCUMENT CONTEXT BELOW:
{context}

Remember: Base your responses on the document context provided. If the context doesn't contain relevant information, acknowledge this limitation.
"""

        # Human template focusing on the question
        human_template = """
{input}
"""

        # Extra system instructions appended to the base template per query type
        medical_instructions = """
ADDITIONAL MEDICAL INSTRUCTIONS:
- Use proper medical terminology with layperson explanations
- Include ICD codes when identifying specific conditions
- Reference standard treatment protocols when applicable
- Mention both conventional and evidence-based alternative approaches
- Clarify levels of evidence (e.g., RCT, meta-analysis, case studies)
"""

        educational_instructions = """
EDUCATIONAL FOCUS:
- Structure responses with clear learning objectives
- Define all medical terms when first introduced
- Build explanations from basic concepts to more complex ideas
- Use anatomical references and physiological processes to explain mechanisms
- Incorporate mnemonics or memory aids when helpful
"""

        detailed_instructions = """
DETAILED ANALYSIS REQUIREMENTS:
- Provide in-depth coverage of the topic with subsections
- Include epidemiological data when relevant
- Discuss pathophysiology in detail
- Cover differential diagnosis considerations
- Elaborate on diagnostic criteria and testing modalities
- Detail treatment approaches with medication classes/options
- Address prognosis and complications
"""

        def build_prompt(extra_instructions: str = ""):
            # One system message (base + optional extras) followed by the user question
            return ChatPromptTemplate.from_messages([
                SystemMessagePromptTemplate.from_template(base_system_template + extra_instructions),
                HumanMessagePromptTemplate.from_template(human_template)
            ])

        return {
            "general": build_prompt(),
            "medical": build_prompt(medical_instructions),
            "educational": build_prompt(educational_instructions),
            "detailed": build_prompt(detailed_instructions),
        }

    def initialize_groq_components(self, groq_api_key: str, model_name: str):
        """Initialize Groq chat components
        
        Args:
            groq_api_key (str): API key for Groq
            model_name (str): Model name to use

        Raises:
            Exception: If the model or the QA chain cannot be created; the
                original error is attached as ``__cause__``.
        """
        try:
            self.llm = ChatGroq(
                api_key=groq_api_key,
                model_name=model_name,
                temperature=0.2,  # Lower temperature for more consistent medical responses
                max_tokens=2048    # Ensure sufficient tokens for detailed answers
            )
            self._initialize_qa_chain()
        except Exception as e:
            # Chain the original error so the full traceback stays available
            raise Exception(f"Failed to initialize Groq components: {str(e)}") from e

    def _build_retrieval_chain(self, query_type: str):
        """Build a retrieval + answering chain for one query type.

        Args:
            query_type (str): Key into ``self.prompts``.

        Returns:
            The chain returned by ``create_retrieval_chain``, combining
            ``self.retriever`` with a stuff-documents chain that uses the
            selected prompt.
        """
        question_answer_chain = create_stuff_documents_chain(
            self.llm,
            self.prompts[query_type],
            document_variable_name="context"
        )
        return create_retrieval_chain(self.retriever, question_answer_chain)

    def _initialize_qa_chain(self):
        """Initialize document-based retrieval and answering chain

        Builds the default chain from the "general" prompt and stores it in
        ``self.qa_chain``.

        Raises:
            Exception: If the chain cannot be built; the original error is
                attached as ``__cause__``.
        """
        try:
            self.qa_chain = self._build_retrieval_chain("general")
        except Exception as e:
            raise Exception(f"Failed to initialize QA chain: {str(e)}") from e

    def query(self, question: str, query_type: str = "general") -> dict:
        """Execute chat query
        
        Args:
            question (str): User question
            query_type (str): Type of query (medical, educational, detailed, or general)
            
        Returns:
            dict: Response from the QA chain, extended with "query_type" and
                an ISO-8601 UTC "timestamp". If the chain fails, a dict with
                "answer" (the error message), "error": True and "query_type"
                is returned instead of raising.

        Raises:
            ValueError: If the QA chain has not been initialized.
        """
        if not self.qa_chain:
            raise ValueError("QA chain not initialized. Make sure to initialize components first.")
        
        # Verify the query type exists or default to general
        if query_type not in self.prompts:
            print(f"Warning: Query type '{query_type}' not found. Using 'general' instead.")
            query_type = "general"
        
        try:
            # Build the chain per call so it always reflects the current
            # retriever, model and prompt for this query type
            specialized_retrieval_chain = self._build_retrieval_chain(query_type)
            
            # NOTE(review): the prompt templates only declare {context} and
            # {input}, so "chat_history" does not appear to reach the model -
            # confirm whether conversational memory is intended.
            response = specialized_retrieval_chain.invoke({
                "input": question,
                "chat_history": self.get_chat_history()
            })
            
            # Add metadata to the response
            response["query_type"] = query_type
            response["timestamp"] = datetime.now(timezone.utc).isoformat()
            
            # Update chat history
            self._update_chat_history(question, response["answer"])
            return response
        except Exception as e:
            error_msg = f"Error executing query: {str(e)}"
            print(error_msg)
            return {"answer": error_msg, "error": True, "query_type": query_type}

    def get_chat_history(self) -> list:
        """Get formatted chat history for LangChain

        Scans the history for a human message immediately followed by an AI
        message. Other entries (for example a lone message added through
        ``add_context``) are skipped without disturbing the pairing of the
        messages that follow them.
        
        Returns:
            list: List of (human_message, ai_message) tuples
        """
        formatted_history = []
        history = self.chat_history
        position = 0

        while position < len(history) - 1:
            current, following = history[position], history[position + 1]
            if isinstance(current, HumanMessage) and isinstance(following, AIMessage):
                formatted_history.append((current.content, following.content))
                position += 2  # both messages of the pair are consumed
            else:
                position += 1  # re-sync on the next message
                
        return formatted_history

    def _update_chat_history(self, question: str, answer: str):
        """Update conversation history
        
        Args:
            question (str): Human question
            answer (str): AI answer
        """
        self.chat_history.extend([
            HumanMessage(content=question),
            AIMessage(content=answer)
        ])

    def reset_chat_history(self):
        """Reset the chat history"""
        self.chat_history = []

    def add_context(self, context: str, context_type: str = "system"):
        """Add additional context to the DocChat

        The context is appended to ``chat_history`` as a single message.
        
        Args:
            context (str): Additional context to consider
            context_type (str): Type of context (system, human, ai)

        Raises:
            ValueError: If ``context_type`` is not one of the supported types.
        """
        message_types = {
            "system": SystemMessage,
            "human": HumanMessage,
            "ai": AIMessage,
        }
        if context_type not in message_types:
            raise ValueError(f"Invalid context type: {context_type}. Use 'system', 'human', or 'ai'.")
        self.chat_history.append(message_types[context_type](content=context))

def create_doc_chat(api_key, retriever, model_name="deepseek-r1-distill-qwen-32b"):
    """Build a DocChat instance, reporting (not raising) construction errors.

    Args:
        api_key (str): Groq API key
        retriever: Document retriever
        model_name (str): Model name to use

    Returns:
        DocChat: The new instance, or None when construction raised; in that
            case the error message is printed.
    """
    try:
        instance = DocChat(groq_api_key=api_key, retriever=retriever, model_name=model_name)
    except Exception as exc:
        print(f"Failed to create DocChat: {str(exc)}")
        return None
    return instance

In [70]:
# Build the chat helper; create_doc_chat returns None if setup failed
doc_chat = create_doc_chat(api_key=GROQ_API_KEY, retriever=retriever)

# Ask a first question using the "medical" prompt variant
if doc_chat:
    response = doc_chat.query(question="What is Achalasia?", query_type="medical")
    print(response["answer"])

<think>
Okay, so I need to figure out what Achalasia is. I remember hearing the term before, but I'm not exactly sure what it entails. Let me start by breaking down the word. "Achalasia" sounds Greek, and I think "chalia" means something like "to let go" or "to relax." So maybe "Achalasia" means "failure to relax." That makes sense because I think it's related to muscles not relaxing properly.

From the document, I see that Achalasia is a disorder of the esophagus. The esophagus is the tube that carries food from the throat to the stomach. So if Achalasia affects this, it probably has something to do with how food moves down the esophagus.

The document mentions that the lower esophageal sphincter doesn't relax properly. The sphincter is a ring of muscle that acts like a valve. Normally, when you swallow, this sphincter relaxes to let food into the stomach and then tightens again. In Achalasia, this doesn't happen correctly, so food can't pass through easily.

Symptoms include difficul

In [71]:
# Follow-up question on the same DocChat instance ("medical" prompt variant)
response = doc_chat.query(question="How to Diagnosis Achalasia?", query_type="medical")
print(response["answer"])

<think>
Okay, so I need to figure out how to diagnose achalasia. I remember that achalasia is a condition affecting the esophagus, but I'm not exactly sure about the specifics. Let me start by recalling what I know about achalasia. It has something to do with difficulty swallowing, right? I think it's related to the muscles in the esophagus not working properly.

From the document provided, I see that symptoms include dysphagia, which is difficulty swallowing both solids and liquids. There's also mention of chest pain that can be mistaken for heart-related pain. Other symptoms might include regurgitation, weight loss, and nighttime cough or recurrent pneumonia due to food entering the airways.

The diagnosis process begins with a careful medical history. The doctor would look into the timing of symptoms and try to rule out other conditions that cause similar symptoms. So, the first step is taking a detailed patient history to understand the progression of symptoms and eliminate other p

In [72]:
# Question on a different topic, again with the "medical" prompt variant
response = doc_chat.query(question="What is Breech birth?", query_type="medical")
print(response["answer"])

<think>
Okay, so I need to figure out what a breech birth is. I remember hearing the term before, but I'm not exactly sure what it entails. Let me start by breaking down the word. "Breech" sounds like it might relate to the position of the baby during birth. 

From what I recall, most babies are born head-first. So, a breech birth must be when the baby is in a different position. Maybe the baby is coming out feet or bottom first instead of head first. That makes sense because the document mentioned that it's the delivery of a fetus hind end first. 

I think the document said that between 3-4% of fetuses start labor in the breech position. That's a small percentage, but it's still significant enough that it's a known issue. I wonder why some babies end up in this position. The document mentioned that between 32-36 weeks, the fetus becomes too large to move around as freely, so maybe they get stuck in the breech position. 

There are different types of breech positions, right? I think th