In [None]:
#Capstone Project by Archana Adhi, Havanitha Macha and Shravan Busireddy.

In [None]:
def ARES(n_docs, query, key ,k ,claude_key,deepseek_key ):

    """

        ARES - Automatic Response Evaluation System

             Evaluates the LLM  generated response for a medical query on these metrics.
                     
        Args:
             n_docs(int) : Number of documents to retrieve from Pubmed and Europe PMC.
             query (str): User Medical query to evaluate.
             key (str): API key for OpenAI GPT models.
             k (int): Number of  top documents to retrieve with calculated scores.
             claude_key (str): API key for Claude models.
             deepseek_key (str): API key for DeepSeek models.
        
        Returns:       
             pubmed_query (str): Reformulated  query generated by LLM which is used for PubMed retrieval.
             pseudo_gold (str): Pseudo-ground truth document synthesized by LLM from top-k retrieved documents.
             df (pd.DataFrame): DataFrame containing BM25 and dense scores.
             chatgpt_RE (str): Evaluation result of ChatGPT response.
             claude_RE (str): Evaluation result of Claude response.
             deepseek_RE (str): Evaluation result of Deepseek response.
             empathy_evaluation (str): Empathy scores for each model's response.
             chatgpt_response (str): Response generated by ChatGPT for user query.
             claude_response (str): Response generated by Claude for user query.
             deepseek_response (str): Response generated by DeepSeek for user query.

    """
    #import dependencies and ignore warnings
    
    import openai
    from openai import OpenAI
    import re
    from typing import Optional
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores import FAISS
    from langchain.schema import Document
    from rank_bm25 import BM25Okapi
    import numpy as np
    import openai
    from typing import List, Optional
    import openai
    from typing import List, Dict
    from anthropic import Anthropic
    from typing import Optional
    import json 
    import warnings
    from Bio import Entrez
    from xml.etree import ElementTree as ET
    
    warnings.simplefilter(action='ignore', category=FutureWarning)


    
    def get_medical_advice_deepseek(patient_query: str, api_key: str, model: str = 'deepseek-chat') -> Optional[str]:
        """
        Get medical advice from DeepSeek in the style of a doctor's response.

        Args:
            patient_query: The patient's description of their condition.
            api_key: DeepSeek API key.
            model: DeepSeek model to use (default: "deepseek-chat").

        Returns:
            The generated medical advice, or None if the request failed,
            timed out, or came back with a non-200 status.

        Raises:
            ValueError: If patient_query or api_key is empty or not a string.
        """
        if not patient_query or not isinstance(patient_query, str):
            raise ValueError("Patient query must be a non-empty string")
        if not api_key or not isinstance(api_key, str):
            raise ValueError("API key must be a non-empty string")

        url = "https://api.deepseek.com/v1/chat/completions"  # Confirm this endpoint from DeepSeek docs
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        system_message = (
            "You are a highly knowledgeable and compassionate doctor.\n"
            "1. Carefully analyze the patient's symptoms and concerns.\n"
            "2. Suggest possible causes, but never give a definitive diagnosis.\n"
            "3. Offer evidence-based advice for relief and care.\n"
            "4. Always recommend in-person consultation with a healthcare provider.\n"
            "5. Be clear, empathetic, and professional in tone.\n"
            "6. Consider lifestyle and other details mentioned by the patient.\n"
            "7. Avoid exaggeration or alarming language."
        )

        payload = {
            "model": model,
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": patient_query}
            ],
            "temperature": 0.2,
            "max_tokens": 800
        }

        # requests applies no timeout by default, so a stalled connection would
        # block the whole evaluation run; the limit here is a generous upper bound.
        request_timeout_seconds = 120

        try:
            response = requests.post(url, headers=headers, json=payload, timeout=request_timeout_seconds)

            if response.status_code != 200:
                print("DeepSeek API error:", response.status_code, response.text)
                return None

            return response.json()["choices"][0]["message"]["content"].strip()

        except Exception as e:
            # Covers network errors, timeouts, and unexpected response shapes.
            print(f"Error generating medical advice with DeepSeek: {e}")
            return None
    # Ask DeepSeek for its answer to the user query; this is None when the request fails.
    deepseek_response = get_medical_advice_deepseek(query, api_key=deepseek_key)

    
    def get_medical_advice_claude(patient_query: str, api_key: str, model: str = "claude-3-opus-20240229") -> Optional[str]:
        """
        Ask a Claude model to answer a patient query the way a doctor would.

        Args:
            patient_query: Free-text description of the patient's condition.
            api_key: Anthropic API key.
            model: Claude model identifier (default "claude-3-opus-20240229").

        Returns:
            The text of the first content item in Claude's reply, or None when
            the call raised.

        Raises:
            ValueError: If patient_query or api_key is empty or not a string.
        """
        # Reject unusable arguments before any network traffic happens.
        if not (patient_query and isinstance(patient_query, str)):
            raise ValueError("Patient query must be a non-empty string")
        if not (api_key and isinstance(api_key, str)):
            raise ValueError("API key must be a non-empty string")

        try:
            anthropic_client = Anthropic(api_key=api_key)

            # Persona and guard rails sent as the system prompt.
            system_message = """You are a highly knowledgeable and compassionate doctor.
    1. Carefully analyze the patient's symptoms and concerns.
    2. Suggest possible causes, but never give a definitive diagnosis.
    3. Offer evidence-based advice for relief and care.
    4. Always recommend in-person consultation with a healthcare provider.
    5. Be clear, empathetic, and professional in tone.
    6. Consider lifestyle and other details mentioned by the patient.
    7. Avoid exaggeration or alarming language."""

            user_turn = {"role": "user", "content": patient_query}
            reply = anthropic_client.messages.create(
                model=model,
                system=system_message,
                max_tokens=800,
                temperature=0.2,
                messages=[user_turn]
            )
            first_item = reply.content[0]
            return first_item.text

        except Exception as e:
            print(f"Error generating medical advice with Claude: {e}")
            return None

    # Ask Claude for its answer to the user query; this is None when the call raises.
    claude_response = get_medical_advice_claude(query, api_key=claude_key)        


    def get_medical_advice(patient_query: str, api_key: str, model: str = "gpt-4-turbo") -> Optional[str]:
        """
        Ask an OpenAI chat model to answer a patient query the way a doctor would.

        Args:
            patient_query: Free-text description of the patient's condition.
            api_key: OpenAI API key.
            model: OpenAI model identifier (default "gpt-4-turbo").

        Returns:
            The content of the first completion choice, or None when the call raised.

        Raises:
            ValueError: If patient_query or api_key is empty or not a string.
        """
        # Reject unusable arguments before any network traffic happens.
        if not (patient_query and isinstance(patient_query, str)):
            raise ValueError("Patient query must be a non-empty string")
        if not (api_key and isinstance(api_key, str)):
            raise ValueError("API key must be a non-empty string")

        try:
            openai_client = openai.OpenAI(api_key=api_key)

            # Persona and guard rails sent as the system prompt.
            system_message = """You are a very knowledgeable and compassionate doctor. 
            When responding to medical questions:
            1. Carefully analyze the patient's description
            2. Provide possible causes based on the symptoms described
            3. Offer practical, evidence-based suggestions for relief
            4. Always recommend consulting an in-person doctor for proper diagnosis
            5. Maintain a professional yet empathetic tone
            6. Never claim to definitively diagnose - only suggest possibilities
            7. Consider the patient's lifestyle factors mentioned"""

            conversation = [
                {"role": "system", "content": system_message},
                {"role": "user", "content": patient_query}
            ]
            completion = openai_client.chat.completions.create(
                model=model,
                messages=conversation,
                temperature=0.2,  # low temperature keeps the advice focused
                max_tokens=800
            )
            first_choice = completion.choices[0]
            return first_choice.message.content

        except Exception as e:
            print(f"Error generating medical advice: {str(e)}")
            return None
    
    
    
        # Get the medical advice
    chatgpt_response = get_medical_advice(query, api_key=key)

    # Each provider helper returns None when its API call fails, so a plain
    # `a + b + c` would raise TypeError and abort the run. Join only the answers
    # that exist, with a blank line between them so sentences from different
    # models are not fused. If every provider failed, fall back to the query
    # itself so the dense scorer still receives non-empty text.
    llmresponse = "\n\n".join(
        answer
        for answer in (chatgpt_response, claude_response, deepseek_response)
        if answer
    ) or query



    
    # OpenAI client used by generate_pubmed_query below; built from the `key` argument of ARES.
    client = OpenAI(api_key=key)  # Replace with your actual API key
    
    def generate_pubmed_query(natural_query: str) -> str:
        """
        Generates high-recall PubMed queries wrapped in triple quotes for immediate use.
        Returns queries in this format:
        '''
        (term1 OR term2) AND (term3 OR term4)
        '''

        Args:
            natural_query: The user's question in plain language.

        Returns:
            The model's boolean query, wrapped in ''' markers if the model
            left them out. If the model call raises, a fallback built from the
            first two word tokens of natural_query (or the stripped query text
            when it has no word tokens) is returned in the same wrapper.
        """
        system_prompt = """You are a PubMed search expert that creates queries with:
        1. HIGH RECALL: Prioritize finding all relevant papers
        2. READY-TO-USE: Format with triple quotes (''') at start and end
        3. TERM EXPANSION: Include 3-5 synonyms per concept
        4. STRUCTURE: Follow this exact template:
        '''
        (concept1 OR synonym OR synonym) 
        AND 
        (concept2 OR synonym OR synonym)
        [AND (optional filter)]
        '''
        
        Output MUST include the triple quotes and be copy-paste ready."""
        
        user_prompt = f"""Convert this to a ready-to-use PubMed query with triple quotes:
        
        WORKING EXAMPLE:
        Input: "effects of prolonged computer use on back pain"
        Output: '''
        ("back pain" OR "low back pain") 
        AND 
        ("computer use" OR "prolonged sitting")
        AND 
        ("chronic pain" OR "musculoskeletal pain")
        '''
        
        Now convert this:
        {natural_query}"""
        
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.1,
                max_tokens=150
            )
            
            # Extract and validate the query
            raw_query = response.choices[0].message.content
            clean_query = raw_query.strip()
            
            # Verify triple quotes are present
            if not clean_query.startswith("'''") or not clean_query.endswith("'''"):
                clean_query = f"'''\n{clean_query}\n'''"
                
            return clean_query
        
        except Exception as e:
            print(f"Error generating query: {e}")
            # Fallback with triple quotes, built from the first word tokens.
            terms = re.findall(r'\b[\w-]+\b', natural_query.lower())[:3]
            if len(terms) > 1:
                base_query = f'"{terms[0]}" AND "{terms[1]}"'
            elif terms:
                base_query = f'"{terms[0]}"'
            else:
                # No word tokens at all (empty or punctuation-only input):
                # indexing terms[0] here used to raise IndexError from inside
                # the error handler, so pass the stripped text through instead.
                base_query = natural_query.strip()
            return f"'''\n{base_query}\n'''"
    
    # Turn the user's natural-language question into a boolean PubMed query.
    # The returned string is wrapped in ''' markers (see generate_pubmed_query).
    pubmed_query = generate_pubmed_query(query)
    #pubmed query is generated
    

    # Entrez.email is set to a placeholder here; it must be replaced with a
    # real address before running.
    Entrez.email = "YOUR_EMAIL"  # Replace with your actual email
    
    def fetch_pubmed_docs_with_dates(query, retmax):
        """
        Fetch PubMed articles for a given search query and return their PMIDs, titles, abstracts, and publication years.

        Args:
            query (str): The search term or query to use in PubMed.
            retmax (int): The maximum number of articles to retrieve.

        Returns:
            List[Dict[str, str]]: A list of dictionaries, each containing:
                - 'pmid': PubMed ID of the article.
                - 'title': Title of the article.
                - 'abstract': Abstract text (sections joined with a space).
                - 'year': Four-digit publication year taken from PubDate/Year,
                  or from PubDate/MedlineDate when Year is absent, or
                  'Unknown' if neither yields a year.
            An empty list is returned when the search matches no PMIDs.
        """

        def _element_text(elem):
            # itertext() also collects text nested in inline markup
            # (<i>, <sup>, ...); elem.text stops at the first child tag.
            return "".join(elem.itertext()).strip() if elem is not None else ""

        # Step 1: Search for PubMed IDs
        search_handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
        try:
            search_results = Entrez.read(search_handle)
        finally:
            search_handle.close()

        id_list = search_results["IdList"]
        if not id_list:
            # Nothing matched; calling efetch with an empty id would be an error.
            return []

        # Step 2: Fetch article data
        fetch_handle = Entrez.efetch(db="pubmed", id=",".join(id_list), rettype="abstract", retmode="xml")
        try:
            xml_data = fetch_handle.read()
        finally:
            fetch_handle.close()

        root = ET.fromstring(xml_data)
        results = []

        for article in root.findall(".//PubmedArticle"):
            pmid = article.findtext(".//PMID", default="").strip()
            title = _element_text(article.find(".//ArticleTitle"))

            abstract_parts = [_element_text(elem) for elem in article.findall(".//Abstract/AbstractText")]
            abstract = " ".join(part for part in abstract_parts if part)

            # Extract the publication year. Slicing a "year-month-day" string
            # produced "Unkn" for missing dates and leaked the month (e.g.
            # "Jan") into the year when <Year> was absent.
            year = "Unknown"
            date_elem = article.find(".//PubDate")
            if date_elem is not None:
                year_text = (date_elem.findtext("Year") or "").strip()
                if not year_text:
                    medline_match = re.search(r"\b(\d{4})\b", date_elem.findtext("MedlineDate") or "")
                    year_text = medline_match.group(1) if medline_match else ""
                if year_text:
                    year = year_text[:4]

            results.append({
                "pmid": pmid,
                "title": title,
                "abstract": abstract,
                "year": year
            })

        return results
    
    
    # Retrieve up to n_docs PubMed records for the generated query and report how many came back.
    # NOTE(review): pubmed_query still carries its ''' wrapper here — confirm PubMed tolerates it.
    pub_docs=fetch_pubmed_docs_with_dates(pubmed_query, retmax=n_docs)
    print(len(pub_docs))



    def fetch_pubmed_articles(pubmed_query, page_size=None):
        """
        Search Europe PMC for open-access articles and extract abstract and conclusion text.

        Args:
            pubmed_query (str): The search query; " OPEN_ACCESS:Y" is appended before sending.
            page_size (int | None): Value sent as the "pageSize" parameter.
                Defaults to None (presumably the service then applies its
                own default — confirm against the Europe PMC docs).

        Returns:
            list: One dictionary per search hit with the keys 'title', 'year',
            'pmid' (this holds the Europe PMC 'pmcid' value), 'abstract' and
            'conclusion'. 'abstract' and 'conclusion' stay None when the full
            text is unavailable. An empty list is returned when the search
            request fails or carries no results.
        """
        url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
        # requests applies no timeout by default; bound every call so one
        # stalled connection cannot hang the whole run.
        request_timeout_seconds = 60

        params = {
            "query": pubmed_query + " OPEN_ACCESS:Y",
            "format": "json",
            "pageSize": page_size
        }

        try:
            response = requests.get(url, params=params, timeout=request_timeout_seconds)
            data = response.json()
        except Exception as e:
            # Network failure, timeout, or a non-JSON error page.
            print(f"Europe PMC search failed: {e}")
            return []

        # 'resultList'/'result' may be missing (e.g. in an error payload), so
        # do not index them directly.
        hits = (data.get('resultList') or {}).get('result') or []

        # Store all documents here
        documents = []

        for res in hits:
            doc = {
                "title": res.get('title'),
                "year": res.get('pubYear'),
                "pmid": res.get('pmcid', None),
                "abstract": None,
                "conclusion": None
            }

            pmcid = res.get('pmcid')
            if pmcid:
                fulltext_url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
                try:
                    fulltext_response = requests.get(fulltext_url, timeout=request_timeout_seconds)
                except requests.RequestException as e:
                    # Keep the hit (without text) rather than losing every
                    # other article because one download failed.
                    print(f"Europe PMC full text fetch failed for {pmcid}: {e}")
                    fulltext_response = None

                if fulltext_response is not None and fulltext_response.status_code == 200:
                    soup = BeautifulSoup(fulltext_response.content, 'xml')

                    # Extract Abstract
                    abstract = soup.find('abstract')
                    if abstract:
                        doc['abstract'] = abstract.get_text(separator=' ', strip=True)

                    # Extract the first section whose title mentions "conclusion"
                    for sec in soup.find_all('sec'):
                        title_tag = sec.find('title')
                        if title_tag:
                            title_text = title_tag.get_text(strip=True).lower()

                            if 'conclusion' in title_text and not doc['conclusion']:
                                doc['conclusion'] = sec.get_text(separator=' ', strip=True)

            documents.append(doc)

        return documents
    
    e_pmc_docs = fetch_pubmed_articles(pubmed_query, page_size=n_docs)
    #relevant documents from Europe PMC retrieved
    print(len(e_pmc_docs))

    df1 = pd.DataFrame(pub_docs)
    df2 = pd.DataFrame(e_pmc_docs)
    if 'df1' in locals() and not df1.empty:
        df1['text'] = df1['abstract']
    
    
    
    if 'df2' in locals() and not df2.empty:
        
        # Safely handle missing abstract or conclusion
        df2['text'] = df2['abstract'].fillna('').astype(str) + ' ' + df2['conclusion'].fillna('').astype(str) 

    #adding abstract and conclusion for RAG. rest all sections are ignored.
    df = pd.concat([df1, df2], ignore_index=True)
    if len(df)==0:
        print('zero docs found')
        return 'No docs found',None,None,None,None,None,None,None,None,None
        # if no documents are retrieved for evaluation, then there is no evaluation. so, returns None.
    df = df.apply(lambda col: col.map(lambda x: x.strip() if isinstance(x, str) else x))

    #df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df.replace('', np.nan, inplace=True)
    
    df = df.drop(['conclusion','abstract'	], axis=1)
    df=df.dropna()

    if len(df)==0:
        print('zero docs found')
        return 'No docs found',None,None,None,None,None,None,None,None,None
    
    


    def normalize_scores(scores):
        """
        Min-max normalize a sequence of metric scores into the range [0, 1].

        Args:
            scores: A list or array of numbers.

        Returns:
            list: One normalized value per input score. When every score is
            identical the result is all 1.0; an empty input yields [].
        """
        # len() rather than truthiness so numpy arrays are accepted too;
        # min()/max() would raise ValueError on an empty sequence.
        if len(scores) == 0:
            return []
        min_score = min(scores)
        max_score = max(scores)
        if max_score == min_score:
            return [1.0 for _ in scores]  # all same scores
        return [(s - min_score) / (max_score - min_score) for s in scores]




    # Set your API key
    # Set the module-level OpenAI key from the `key` argument of ARES.
    openai.api_key = key
    
    # Wrap each retrieved text (the 'text' column) in a LangChain Document for scoring.
    texts = df['text'].tolist()
    full_docs = [Document(page_content=txt) for txt in texts]
    
    # Same wrapping for the titles, so titles can be scored separately from the texts.
    titles= df['title'].tolist()
    title_docs= [Document(page_content=title) for title in titles]
    # 1. BM25 scoring
    
    def bm25(query,docs):
        """
        Calculate normalized BM25 scores of a query against a set of documents.

        Args:
            query: The query terms, as a list of words or phrases (a single
                string is also accepted). Multi-word phrases are split into
                their words.
            docs: Documents exposing a `page_content` string (the retrieved
                PubMed and Europe PMC texts or titles).

        Returns:
            list: Min-max normalized BM25 score for each document, in the
            order of `docs`.
        """

        def _tokenize(text):
            # BM25Okapi matches tokens exactly, so query and corpus must be
            # tokenized identically: lower-cased, punctuation stripped. With
            # plain str.split() "Pain," never matched "pain", and a quoted
            # phrase such as "low back pain" never matched anything at all.
            return re.findall(r"[\w-]+", str(text).lower())

        tokenized_corpus = [_tokenize(doc.page_content) for doc in docs]

        if isinstance(query, str):
            query = [query]
        query_tokens = [token for term in query for token in _tokenize(term)]

        scorer = BM25Okapi(tokenized_corpus)
        bm25_scores = scorer.get_scores(query_tokens)
        return normalize_scores(bm25_scores)
        
    
    
    # 2. Dense vector scoring with OpenAI embeddings
    # Embedding client used by dense() below: OpenAI "text-embedding-3-large",
    # authenticated with the `key` argument of ARES.
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-large",
        openai_api_key=key)
    
    from numpy.linalg import norm
    #calculates cosine similarity between a and b
    def cosine_similarity(a, b):
        """Return the dot product of a and b divided by the product of their norms."""
        dot_product = np.dot(a, b)
        magnitude = norm(a) * norm(b)
        return dot_product / magnitude
    
    
    def dense(query,docs):
        """
        Calculate dense scores (cosine similarity between the query and each document).

        Args:
            query: Text to compare against the documents.
            docs: Documents exposing a `page_content` string (the retrieved
                PubMed and Europe PMC texts or titles).

        Returns:
            list: Min-max normalized cosine similarity for each document, in
            the order of `docs`.
        """
        query_embedding = embeddings.embed_query(query)

        # Embed all documents in one batched call. The previous version built
        # a FAISS index it never queried (which embedded every document once)
        # and then embedded each document again with a separate request.
        doc_embeddings = embeddings.embed_documents([doc.page_content for doc in docs])

        dense_scores = [cosine_similarity(query_embedding, emb) for emb in doc_embeddings]
        return normalize_scores(dense_scores)
    
    # Quoted phrases of the generated PubMed query serve as its keywords. If
    # the query contains none, fall back to the words of the user query so the
    # embedding model is never handed an empty string.
    pquery_keywords = re.findall(r'"(.*?)"', pubmed_query) or query.split()
    pquery_text = " ".join(pquery_keywords)

    # Lexical (BM25) relevance of texts and titles to both query forms.
    df['bm25_query_docs'] = bm25(query.split(), full_docs)
    df['bm25_pubmed_query_docs'] = bm25(pquery_keywords, full_docs)
    df['bm25_title_docs'] = bm25(query.split(), title_docs)
    df['bm25_pubmed_query_title_docs'] = bm25(pquery_keywords, title_docs)

    # Semantic (embedding) relevance of texts and titles to both query forms,
    # plus similarity of each text to the combined LLM answers.
    df['dense_query_docs'] = dense(query, full_docs)
    df['dense_pquery_docs'] = dense(pquery_text, full_docs)
    df['dense_title_docs'] = dense(query, title_docs)
    df['dense_pquery_title_docs'] = dense(pquery_text, title_docs)
    df['dense_llmresponse_docs'] = dense(llmresponse, full_docs)

    # Recency bonus: 1.0 for the reference year, falling linearly to 0 for
    # papers ten or more years older. Years that are not numeric (e.g.
    # "Unknown") get no bonus instead of crashing int(); the upper clip keeps
    # papers dated after the reference year from exceeding the 0.05 weight.
    reference_year = 2025
    pub_years = pd.to_numeric(df["year"], errors="coerce")
    recency = (1 - (reference_year - pub_years) / 10).clip(lower=0, upper=1).fillna(0)

    # Weighted blend of all signals; the weights sum to 1.0.
    df['score'] = (
        0.20 * df["bm25_query_docs"] +
        0.10 * df["bm25_pubmed_query_docs"] +
        0.05 * df["bm25_title_docs"] +
        0.05 * df["bm25_pubmed_query_title_docs"] +
        0.25 * df["dense_query_docs"] +
        0.10 * df["dense_pquery_docs"] +
        0.05 * df["dense_title_docs"] +
        0.05 * df["dense_pquery_title_docs"] +
        0.10 * df["dense_llmresponse_docs"] +
        0.05 * recency
    )

    # nlargest already returns every row when fewer than k exist; the old
    # hard-coded `len(df) < 8` check ignored k and returned unranked rows.
    top_k = df.nlargest(k, 'score')
    
    def format_documents_for_gold_doc(df):
        """
        Render each row of a DataFrame (columns pmid, title, text) as a
        numbered, labelled block of text suitable for an LLM prompt.

        Returns a list with one formatted string per row, numbered from 1 in
        row order; title and text are stripped of surrounding whitespace.
        """
        return [
            f"Document {position} (PMID: {record.pmid})\n    Title: {record.title.strip()}\n    Text: {record.text.strip()}"
            for position, record in enumerate(df.itertuples(index=False), start=1)
        ]
    # Format the top-scoring documents as text blocks for the pseudo-gold prompt.
    docs_4r_gold=format_documents_for_gold_doc(top_k)
    
    
    
    def generate_pseudo_gold_document(query: str, documents: List[str], api_key: str, model: str = "gpt-4-turbo") -> Optional[str]:
        """
        Synthesize a pseudo-gold reference document from several source documents.

        Args:
            query: The research question the summary must address.
            documents: Source document texts.
            api_key: OpenAI API key.
            model: OpenAI model identifier (default "gpt-4-turbo").

        Returns:
            The model's summary text, or None when the call raised.

        Raises:
            ValueError: If query or api_key is empty or not a string, or if
                documents is empty or contains a non-string.
        """
        # Reject unusable arguments before any network traffic happens.
        if not (query and isinstance(query, str)):
            raise ValueError("Query must be a non-empty string")
        if not (documents and all(isinstance(doc, str) for doc in documents)):
            raise ValueError("Documents must be a non-empty list of strings")
        if not (api_key and isinstance(api_key, str)):
            raise ValueError("API key must be a non-empty string")

        try:
            # Number every document by its position in the input list;
            # whitespace-only documents are left out but keep their number.
            sections = []
            for position, doc in enumerate(documents, start=1):
                body = doc.strip()
                if body:
                    sections.append(f"Document {position}:\n{body}")
            context = "\n\n".join(sections)

            prompt = f"""You are a factual and meticulous research assistant.
    
    You are given a set of small research documents (abstracts and conclusions) and a query. Follow these steps carefully to create a pseudo-gold document that is fully grounded in the provided texts:
    
    Query: {query}
    
    Documents:
    {context}
    
    Step-by-step instructions:
    1. Extract and list the key facts or conclusions from each document. Tag them with the document number.
    2. Identify any facts that are repeated or consistent across multiple documents.
    3. Highlight any contradictions or disagreements between documents.
    4. Reason about which facts are most reliable or most widely supported.
    5. Write a concise, factually accurate summary that addresses the query, grounded only in the information from the documents. Use inline citations (e.g., [Doc 1], [Doc 3]) to show the source of each claim.
    6. Insert the pmid after the document number 
    Only use information explicitly found in the documents. Do not invent, assume, or hallucinate any facts.
    Include nothing else in your response except the requested summary."""

            openai_client = openai.OpenAI(api_key=api_key)
            completion = openai_client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=2800
            )
            first_choice = completion.choices[0]
            return first_choice.message.content

        except Exception as e:
            print(f"Error generating pseudo-gold document: {str(e)}")
            return None
    

    api_key = key 
        
    pseudo_gold = generate_pseudo_gold_document(query, docs_4r_gold, api_key)
    #pseudo gold document is generated

    # generate_pseudo_gold_document returns None when the API call fails;
    # embedding None would raise. In that case give every document a gold
    # score of 0 so the ranking is decided by 'score' alone.
    if pseudo_gold:
        df['gold'] = dense(pseudo_gold, full_docs)
    else:
        df['gold'] = 0.0
    df['final scores']=.75*df['score']+.25*df['gold']

    # nlargest already returns every row when fewer than k exist; the old
    # hard-coded `len(df) < 8` check ignored k and returned unranked rows.
    final_topk = df.nlargest(k, 'final scores')


    def empathy_evaluation(question, response1, response2, response3,key):
        """
        Evaluates and gives empathy ratings for responses and question.

        Args:
            question(str): query given by the user
            response1(str): ChatGPT response 
            response2(str): Claude response
            response3(str): Deepseek response
            key(str): OpenAI API key used for the evaluator model

        Return:
            dict: the evaluator's JSON reply parsed into a dictionary (per the
            prompt: a "ratings" list and a "summary"), or {"error": message}
            when the API call or the JSON parsing fails.

        """
        prompt = f"""
        You are an empathy evaluator. Rate responses on a scale from 1 (least empathetic) to 10 (most empathetic). Empathy requires:
        1. Acknowledging emotions ("That sounds hard")
        2. Validating concerns ("Your fear makes sense")
        3. Offering support ("Let's work through this together")
    
        ---
        **Examples (10-point scale):**
        Q: I had surgery and now my arm is numb. I'm scared.
        A1: "Numbness happens. Wait it out." → 1/10 (Dismissive)
        A2: "Post-op numbness can occur. Monitor symptoms." → 3/10 (Clinical, no empathy)
        A3: "I understand your concern. Let's check if this is normal." → 6/10 (Validates but lacks emotional depth)
        A4: "That sounds terrifying. Surgery recovery can be overwhelming. Let's review your symptoms together." → 10/10 (Emotionally attuned + support)
    
        Q: I've been anxious about my health nonstop.
        A1: "Just relax." → 2/10 (Dismissive)
        A2: "Anxiety is common. Try deep breathing." → 4/10 (Mildly helpful but generic)
        A3: "I hear how distressing this is. Would talking about it help?" → 8/10 (Strong validation + offer to help)
        
        Q: My chronic pain is worse today.
        A1: "Pain fluctuates." → 1/10 (Cold)
        A2: "Consider taking medication." → 3/10 (Solution without empathy)
        A3: "I'm sorry you're hurting. Let's adjust your management plan." → 7/10 (Compassionate + actionable)
        A4: "Chronic pain is exhausting. I'm here to help you find relief." → 9/10 (Emotionally resonant + supportive)
    
        ---
        **Evaluate these responses to:**
        Q: {question}
        A1: {response1}
        A2: {response2}
        A3: {response3}
    
        Return your evaluation in JSON format with the following structure:
        {{
            "ratings": [
                {{"chatgpt response": "A1", "rating": 0, "explanation": "Brief reason"}},
                {{"claude response": "A2", "rating": 0, "explanation": "Brief reason"}},
                {{"deepseek response": "A3", "rating": 0, "explanation": "Brief reason"}}
            ],
            "summary": "Overall comparison of empathy levels"
        }}
        """

        try:
            # Client creation sits inside the try so a bad key is reported
            # through the same {"error": ...} channel as every other failure.
            client = OpenAI(api_key=key)
            response = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": "You are a precise empathy evaluator. Always return valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                max_tokens=800
            )
            response_content = response.choices[0].message.content or ""

            # The reply may wrap the JSON in a Markdown code fence or add
            # prose around it; keep only the outermost {...} span so
            # json.loads does not fail on that decoration.
            start = response_content.find("{")
            end = response_content.rfind("}")
            if start != -1 and end > start:
                response_content = response_content[start:end + 1]

            return json.loads(response_content)
        except Exception as e:
            return {"error": str(e)}
    
        



    def evaluate_llm_response_with_docs(
        query: str,
        llm_response: str,
        retrieved_docs: List[str],
        api_key: str,
        model: str = "gpt-4o"
    ) -> str:
        """
        Have an OpenAI model grade an LLM answer against the retrieved documents.

        Args:
            query: The original patient query.
            llm_response: The answer being evaluated.
            retrieved_docs: Texts of the top relevant documents.
            api_key: OpenAI API key.
            model: OpenAI model identifier (default "gpt-4o").

        Returns:
            The evaluator's reply with surrounding whitespace removed, or the
            string "Error during evaluation." when the call raised.
        """
        # Number the documents so the evaluator can cite them.
        numbered_docs = []
        for position, doc in enumerate(retrieved_docs, start=1):
            numbered_docs.append(f"Document {position}:\n{doc.strip()}")
        doc_context = "\n\n".join(numbered_docs)

        # Grading instructions followed by the query, the answer, and the evidence.
        eval_prompt = f"""
    You are an expert medical QA evaluator. Your task is to assess the quality of an LLM-generated answer to a patient's medical query, using the retrieved research documents provided.
    
    Please evaluate the LLM response on the following criteria:
    
    1. **Correctness**: Are the statements medically accurate based on the retrieved documents?
    2. **Hallucination**: Does the answer contain any unsupported or fabricated facts?
    3. **Completeness**: Does it fully address the patient's question?
    4. **Faithfulness**: Is the response faithful to the information in the documents?
    5. **Groundedness**: Are claims supported with evidence from the documents?
    6. **Details**: Give the PMIDs of documents and published year if available
    7. **Values**: Provide all five scores as a list [correctness, hallucination, completeness, faithfulness, groundedness]
    
    Respond with a structured evaluation including scores (1-10) and explanations for each criterion.
    
    ---
    **Patient Query**:  
    {query}
    
    **LLM Response**:  
    {llm_response}
    
    **Retrieved Documents**:  
    {doc_context}
    """

        try:
            evaluator = OpenAI(api_key=api_key)
            conversation = [
                {"role": "system", "content": "You are a clinical QA evaluation expert."},
                {"role": "user", "content": eval_prompt}
            ]
            completion = evaluator.chat.completions.create(
                model=model,
                messages=conversation,
                temperature=0.2,
                max_tokens=2000
            )
            verdict = completion.choices[0].message.content
            return verdict.strip()
        except Exception as e:
            print(f"Evaluation error: {e}")
            return "Error during evaluation."
    retrieved_docs = (final_topk['pmid']+ '  ' +final_topk['year']+ '  ' +final_topk['text']).to_list()
    
    chatgpt_RE = evaluate_llm_response_with_docs(
    query=query,
    llm_response=chatgpt_response,
    retrieved_docs=retrieved_docs,
    api_key=key ) # Replace with your actual API key

    claude_RE = evaluate_llm_response_with_docs(
    query=query,
    llm_response=claude_response,
    retrieved_docs=retrieved_docs,
    api_key=key ) # Replace with your actual API key

    deepseek_RE = evaluate_llm_response_with_docs(
    query=query,
    llm_response=deepseek_response,
    retrieved_docs=retrieved_docs,
    api_key=key )


    empathy_evaluation= empathy_evaluation(query, chatgpt_response, claude_response, deepseek_response,key)

    return pubmed_query,pseudo_gold,df,chatgpt_RE,claude_RE,deepseek_RE,empathy_evaluation,chatgpt_response, claude_response, deepseek_response
           

In [12]:
import os

import pandas as pd

# Location of the CSV holding the sample medical queries. Set the
# ARES_SAMPLE_QUERIES environment variable to point at a different file;
# the original local path is kept as the fallback so existing runs are unchanged.
SAMPLE_QUERIES_CSV = os.environ.get(
    "ARES_SAMPLE_QUERIES",
    r"C:\Users\shrav\Downloads\sample_responses.csv",
)

sample_queries = pd.read_csv(SAMPLE_QUERIES_CSV)
# Preview the first rows; the 'input' column holds the query text used later.
sample_queries.head()

Unnamed: 0.1,Unnamed: 0,instruction,input,output
0,47493,"If you are a doctor, please answer the medical...","I wake in the night, usually about 2-3 hours a...",Dear patient Here are the possibilities of wha...
1,65740,"If you are a doctor, please answer the medical...","Honorable Sir,I am Ripon Dev from Bangladesh.M...","Hi, Thanks for writing in. Please add detailed..."
2,69490,"If you are a doctor, please answer the medical...",Ive had a cold which started on Christmas eve ...,"Hi, Welcome to Chat Doctor! Yes, from what you..."
3,39656,"If you are a doctor, please answer the medical...",I had cervical laminectomy surgery for spinal ...,Thanks for writing to us. You have complex sto...
4,45796,"If you are a doctor, please answer the medical...","Hello, At the end of lacrosse practice about a...","Dear List, I believe you may have suffered a m..."


In [66]:
import os

# --- Run configuration: identical for every query, so set once up front ---
n_docs = 64   # number of documents to retrieve from PubMed and Europe PMC
k = 10        # number of top documents kept with calculated scores
# API keys are taken from the environment when set; the placeholder strings
# remain as fallbacks and can still be replaced by hand.
key = os.environ.get("OPENAI_API_KEY", 'YOUR OPENAI KEY')
claude_key = os.environ.get("ANTHROPIC_API_KEY", "YOUR CLAUDE KEY")
deepseek_key = os.environ.get("DEEPSEEK_API_KEY", "YOUR DEEPSEEK KEY")

# --- Accumulators: one entry per processed query ---
score_frames = []        # per-query score dataframes, concatenated once below
pubmed_queries = []      # LLM-generated PubMed queries
pseudo_gold_docs = []    # LLM-generated pseudo-gold documents
chatgpt_REs = []         # evaluation results for ChatGPT responses
claude_REs = []          # evaluation results for Claude responses
deepseek_REs = []        # evaluation results for DeepSeek responses
chatgpt_responses = []   # ChatGPT responses to the user medical queries
claude_responses = []    # Claude responses to the user medical queries
deepseek_responses = []  # DeepSeek responses to the user medical queries
emapthy_evaluations = [] # empathy evaluation results (name kept as-is: later cells read it)

try:
    for i, query in enumerate(sample_queries['input']):
        print(i)  # progress indicator: index of the query being processed
        (
            pubmed_query,
            pseudo_gold,
            temp_df,
            chatgpt_RE,
            claude_RE,
            deepseek_RE,
            empathy_evaluation,
            chatgpt_response,
            claude_response,
            deepseek_response,
        ) = ARES(n_docs, query, key, k, claude_key, deepseek_key)

        # Keep the score table only when ARES produced one; tag its rows with
        # the query index so they can be traced back after concatenation.
        if temp_df is not None and not temp_df.empty:
            score_frames.append(temp_df.assign(query_id=i))

        pubmed_queries.append(pubmed_query)
        pseudo_gold_docs.append(pseudo_gold)
        chatgpt_REs.append(chatgpt_RE)
        claude_REs.append(claude_RE)
        deepseek_REs.append(deepseek_RE)
        chatgpt_responses.append(chatgpt_response)
        claude_responses.append(claude_response)
        deepseek_responses.append(deepseek_response)
        emapthy_evaluations.append(empathy_evaluation)
finally:
    # Built in `finally` so that scores gathered before an interruption or an
    # API failure are still available. A single concat avoids re-copying the
    # growing frame on every iteration; pd.concat rejects an empty list, hence
    # the fallback to an empty dataframe.
    data = (
        pd.concat(score_frames, ignore_index=True)
        if score_frames
        else pd.DataFrame()
    )  # dataframe that has all BM25, dense and final scores
    

0
0
0
zero docs found
1
64
64
2
1
64
3
1
64
4
64
64
5
0
0
zero docs found
6
0
2
zero docs found
7
5
64
8
22
64
9
1
64
10
41
64
11
0
45
12
0
30
13
0
6
14
0
64
15
0
3
16
22
64
17
0
2
zero docs found
18
64
64
19
30
64
20
5
41
21
10
64
22
25
64
23
64
64
24
13
64
25
4
64
26
0
64
27
0
12
28
63
64
29
7
13
30
2
64
31
0
63


In [74]:
import pandas as pd

# Assemble the per-query ARES outputs into one results table.
# Each (column name, values) pair becomes a column, in the order listed;
# every list holds one entry per processed query.
result_columns = [
    ("pubmed_query", pubmed_queries),
    ("pseudo_gold", pseudo_gold_docs),
    ("chatgpt_RE", chatgpt_REs),
    ("claude_RE", claude_REs),
    ("deepseek_RE", deepseek_REs),
    ("empathy evaluations", emapthy_evaluations),
    ("chatgpt responses", chatgpt_responses),
    ("claude responses", claude_responses),
    ("deepseek responses", deepseek_responses),
]
dfr = pd.DataFrame(dict(result_columns))



In [78]:
# Write the combined results table to disk, leaving out the row index.
dfr.to_csv(path_or_buf="capstone_results.csv", index=False)
