In [2]:
import pandas as pd

import numpy as np
import scipy as sp

from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import gensim
import sklearn

In [4]:
import google.generativeai as genai
import os
from dotenv import load_dotenv
load_dotenv()

GEMINI_API_KEY =  os.getenv('GEMINI_API_KEY')
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel("gemini-1.5-flash")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
df_medquad = pd.read_csv("MedQuad-MedicalQnADataset.csv")
df_medquad.head()

Unnamed: 0.1,Unnamed: 0,qtype,Question,Answer
0,0,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...
1,1,symptoms,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...
2,2,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...
3,3,exams and tests,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos..."
4,4,treatment,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen..."


In [6]:
df_pubmed = pd.read_csv("PubMed_20k_RCT/train.csv")
df_pubmed.head()

Unnamed: 0,abstract_id,line_id,abstract_text,line_number,total_lines,target
0,24293578,24293578_0_12,To investigate the efficacy of 6 weeks of dail...,0,12,OBJECTIVE
1,24293578,24293578_1_12,A total of 125 patients with primary knee OA w...,1,12,METHODS
2,24293578,24293578_2_12,Outcome measures included pain reduction and i...,2,12,METHODS
3,24293578,24293578_3_12,Pain was assessed using the visual analog pain...,3,12,METHODS
4,24293578,24293578_4_12,Secondary outcome measures included the Wester...,4,12,METHODS


# Preprocessing

In [7]:
df_pubmed = df_pubmed.sort_values(by=['line_id'])
df_pubmed = df_pubmed.groupby('abstract_id').agg({'abstract_text':' '.join}).reset_index()
df_pubmed.head()

Unnamed: 0,abstract_id,abstract_text
0,24219770,The purpose of this study was to determine the...
1,24219814,Repeated courses of intravenous ( IV ) aminogl...
2,24219842,To determine whether the Pediatric Asthma Cont...
3,24219852,Despite the benefits of endoscopic nasobiliary...
4,24219882,We assessed the impact of hot flashes and vari...


In [8]:
# add qa id to medquad
df_medquad['qa_id'] = range(1, len(df_medquad) + 1)
df_medquad = df_medquad[['qa_id', 'Question', 'Answer', 'qtype']]
df_medquad.head()

Unnamed: 0,qa_id,Question,Answer,qtype
0,1,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...,susceptibility
1,2,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...,symptoms
2,3,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...,susceptibility
3,4,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos...",exams and tests
4,5,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen...",treatment


# Word Embedding

In [9]:
model_path = 'GoogleNews-vectors-negative300.bin'
word2vec = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [10]:
def preprocess(sentence):
    return [word for word in sentence.lower().split() if word in word2vec]

def get_sentence_vector(sentence):
    words = preprocess(sentence)
    if words:
        return np.mean([word2vec[word] for word in words], axis=0)
    else:
        return np.zeros(word2vec.vector_size)

In [11]:
df_pubmed['abstract_vector'] = df_pubmed['abstract_text'].apply(get_sentence_vector)
df_medquad['question_vector'] = df_medquad['Question'].apply(get_sentence_vector)

In [12]:
df_pubmed

Unnamed: 0,abstract_id,abstract_text,abstract_vector
0,24219770,The purpose of this study was to determine the...,"[-0.026591245, 0.056547694, -0.0050117737, 0.0..."
1,24219814,Repeated courses of intravenous ( IV ) aminogl...,"[-0.03645952, 0.07120251, 0.021919908, 0.11220..."
2,24219842,To determine whether the Pediatric Asthma Cont...,"[-0.0047280365, 0.041275, -0.00969958, 0.08767..."
3,24219852,Despite the benefits of endoscopic nasobiliary...,"[-0.003629674, 0.07113389, 0.027456231, 0.0854..."
4,24219882,We assessed the impact of hot flashes and vari...,"[-0.020188205, 0.07125171, -0.013315455, 0.093..."
...,...,...,...
14995,26521572,To explore the impacts on the cognitive level ...,"[0.009622574, 0.053992737, 0.025816005, 0.0657..."
14996,26521581,To observe the efficacy of acupuncture on pain...,"[-0.019650908, 0.06335896, 0.009455694, 0.0609..."
14997,26521582,To explore the clinical effect of Erlong Xizhu...,"[0.009212655, 0.03145588, 0.010136922, 0.06303..."
14998,26521589,To observe the myocardial protective effect of...,"[-0.023100011, 0.06358117, 0.022341112, 0.0516..."


In [13]:
df_medquad

Unnamed: 0,qa_id,Question,Answer,qtype,question_vector
0,1,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...,susceptibility,"[-0.011978149, 0.038289387, 0.054300945, 0.098..."
1,2,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...,symptoms,"[0.017163087, 0.12464599, 0.05180664, 0.102880..."
2,3,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...,susceptibility,"[-0.011978149, 0.038289387, 0.054300945, 0.098..."
3,4,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos...",exams and tests,"[0.034423828, 0.15397136, 0.036051434, 0.01643..."
4,5,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen...",treatment,"[-0.011118571, 0.09565226, 0.08443197, 0.07938..."
...,...,...,...,...,...
16402,16403,What are the symptoms of Familial visceral myo...,What are the signs and symptoms of Familial vi...,symptoms,"[-0.006930881, 0.0062052407, -0.021565756, 0.0..."
16403,16404,What is (are) Pseudopelade of Brocq ?,Pseudopelade of Brocq (PBB) is a slowly progre...,information,"[0.073349, -0.03970337, 0.19335938, 0.04766845..."
16404,16405,What are the symptoms of Pseudopelade of Brocq ?,What are the signs and symptoms of Pseudopelad...,symptoms,"[0.027374268, 0.09770203, 0.074645996, 0.11676..."
16405,16406,What are the treatments for Pseudopelade of Br...,Is there treatment or a cure for pseudopelade ...,treatment,"[-0.008605957, 0.06829834, 0.109228514, 0.0857..."


# Information Retrieval

In [14]:
def find_top_k_abstracts(query_vector, abstract_vectors_col, k=3, df=df_pubmed):
    cosine_scores = cosine_similarity([query_vector], np.array(df[abstract_vectors_col]).tolist())[0]
    top_k_indices = np.argsort(cosine_scores)[-k:][::-1]
    return df.iloc[top_k_indices]['abstract_id'].values, cosine_scores[top_k_indices]

def find_top_k_answers(query_vector, question_vectors_col, k=3, df=df_medquad):
    cosine_scores = cosine_similarity([query_vector], np.array(df_medquad[question_vectors_col]).tolist())[0]
    top_k_indices = np.argsort(cosine_scores)[-k:][::-1]
    return df.iloc[top_k_indices]['qa_id'].values, cosine_scores[top_k_indices]

def get_qa_pairs(qa_ids):
    return df_medquad[df_medquad['qa_id'].isin(qa_ids)][['qa_id', 'Question', 'Answer']]


def get_abstracts(abstract_ids):
    return df_pubmed[df_pubmed['abstract_id'].isin(abstract_ids)][['abstract_id', 'abstract_text']]

def query_top_k_answers_and_abstracts(query, k=10):
    query_vector = get_sentence_vector(query)

    qa_ids, question_scores = find_top_k_answers(query_vector, 'question_vector', k)
    abstract_ids, abstract_scores = find_top_k_abstracts(query_vector, 'abstract_vector', k)

    answers_df = get_qa_pairs(qa_ids)
    abstracts_df = get_abstracts(abstract_ids)

    answers_df['Similarity Score'] = question_scores
    abstracts_df['Similarity Score'] = abstract_scores

    return answers_df, abstracts_df

In [15]:
queries = [
    "What is the incubation period of COVID-19?",
    "Cure for fever?",
    "Who is at risk for Lymphocytic Choriomeningit",
    "What are the symptoms of Ligma?"
]

In [16]:
for query in queries:
    print(f"Query: {query}")
    answers_df, abstracts_df = query_top_k_answers_and_abstracts(query)
    print("Top Answers:")
    display(answers_df)
    print("Top Abstracts:")
    display(abstracts_df)
    print("\n")

Query: What is the incubation period of COVID-19?
Top Answers:


Unnamed: 0,qa_id,Question,Answer,Similarity Score
123,124,what is yersiniosis for Yersinia ?,Yersiniosis is an infectious disease caused by...,0.667042
252,253,what is the treatment for vancomycin-resistant...,On this Page General Information What is vanco...,0.647081
392,393,What is the outlook for Agenesis of the Corpus...,Prognosis depends on the extent and severity o...,0.637043
7678,7679,What is (are) paramyotonia congenita ?,Paramyotonia congenita is a disorder that affe...,0.637043
8918,8919,What is (are) pachyonychia congenita ?,Pachyonychia congenita is a condition that pri...,0.637043
10753,10754,What is (are) dyskeratosis congenita ?,Dyskeratosis congenita is a disorder that can ...,0.637043
11895,11896,What is (are) Anonychia congenita ?,Anonychia congenita is an extremely rare nail ...,0.637043
12612,12613,What is (are) Chancroid ?,Chancroid is a bacterial infection that is spr...,0.637043
12772,12773,What is (are) Paramyotonia congenita ?,Paramyotonia congenita is an inherited conditi...,0.635448
15727,15728,What is (are) Pachyonychia congenita ?,Pachyonychia congenita (PC) is a rare inherite...,0.633984


Top Abstracts:


Unnamed: 0,abstract_id,abstract_text,Similarity Score
2506,24460808,Standing invoked change in QT interval has bee...,0.664326
2728,24479729,Most randomised clinical trials ( RCTs ) testi...,0.655015
4148,24618483,To study the effectiveness of the addition of ...,0.653418
6802,24884585,Umbilical cord blood ( UCB ) is an important s...,0.650773
7842,24981366,Preterm birth is the most important single det...,0.6468
7950,24993861,"It is estimated that about 275,000 inhabitants...",0.646501
10374,25267326,We recently completed a randomized controlled ...,0.645833
10728,25312005,Much is to be learned about what implementatio...,0.645678
12239,25502179,We investigated the method of switching EPO to...,0.645599
14054,25881022,Specialized Early Intervention services ( SEI ...,0.645421




Query: Cure for fever?
Top Answers:


Unnamed: 0,qa_id,Question,Answer,Similarity Score
827,828,What are the treatments for Metachromatic Leuk...,There is no cure for MLD. Bone marrow transpla...,0.604811
1081,1082,What are the treatments for Myotonia Congenita ?,Most people with myotonia congenita dont requi...,0.604055
1327,1328,What are the treatments for Myotonia ?,"Treatment for myotonia may include mexiletine,...",0.604055
2383,2384,What are the treatments for Indigestion ?,Some people may experience relief from symptom...,0.601695
6162,6163,What are the treatments for potassium-aggravat...,These resources address the diagnosis or manag...,0.601124
9122,9123,What are the treatments for Alzheimer disease ?,These resources address the diagnosis or manag...,0.601124
9412,9413,What are the treatments for retinitis pigmento...,These resources address the diagnosis or manag...,0.601097
9647,9648,What are the treatments for Parkinson disease ?,These resources address the diagnosis or manag...,0.599236
10822,10823,"What are the treatments for neuropathy, ataxia...",These resources address the diagnosis or manag...,0.599236
10887,10888,What are the treatments for myotonia congenita ?,These resources address the diagnosis or manag...,0.599208


Top Abstracts:


Unnamed: 0,abstract_id,abstract_text,Similarity Score
1774,24384878,To evaluate long-term cure rates and late comp...,0.543007
2264,24438351,People with multi-drug resistant tuberculosis ...,0.525999
2596,24467861,Meaning-focused coping may be at the core of a...,0.518751
4792,24673608,Long-duration beta-lactam antibiotics are used...,0.517926
7265,24925094,Hot flushes and night sweats ( vasomotor sympt...,0.514821
7927,24990296,The aim of this study is to compare the microl...,0.511487
8111,25011730,UK National Institute of Health and Clinical E...,0.511086
8472,25052161,Psoriasis causes worldwide concern because of ...,0.51023
10113,25238873,Malignant pleural mesothelioma is an incurable...,0.506584
12313,25516016,Although uncomplicated cystitis is often self-...,0.505786




Query: Who is at risk for Lymphocytic Choriomeningit
Top Answers:


Unnamed: 0,qa_id,Question,Answer,Similarity Score
0,1,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...,1.0
2,3,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...,1.0
1576,1577,Who is at risk for Diverticular Disease? ?,Diverticulosis becomes more common as people a...,0.951077
3135,3136,Who is at risk for Chronic Lymphocytic Leukemi...,Older age can affect the risk of developing ch...,0.948863
4101,4102,Who is at risk for Cardiogenic Shock? ?,The most common risk factor for cardiogenic sh...,0.947501
4161,4162,Who is at risk for Alpha-1 Antitrypsin Deficie...,Alpha-1 antitrypsin (AAT) deficiency occurs in...,0.943087
4379,4380,Who is at risk for Hemolytic Anemia? ?,Hemolytic anemia can affect people of all ages...,0.942907
4492,4493,Who is at risk for Thrombocythemia and Thrombo...,Primary Thrombocythemia\n \nThr...,0.942187
4546,4547,Who is at risk for Bronchopulmonary Dysplasia? ?,The more premature an infant is and the lower ...,0.940178
4560,4561,Who is at risk for Aplastic Anemia? ?,Aplastic anemia is a rare but serious blood di...,0.938165


Top Abstracts:


Unnamed: 0,abstract_id,abstract_text,Similarity Score
3331,24533664,Little is known about the long-term effects of...,0.81117
4094,24613817,To examine whether socioeconomic position ( SE...,0.781331
4377,24637951,Previous studies suggest cross-sectional assoc...,0.774183
4948,24686885,There is little evidence to inform the targete...,0.773389
6353,24834935,Risk factors associated with increased carriag...,0.773151
7258,24924304,High-dose oestrogen treatment has been used to...,0.764485
8800,25092278,Hypertrophic cardiomyopathy ( HCM ) is the mos...,0.761479
11980,25467619,To determine the perceived risk of type 2 diab...,0.758602
13319,25683204,Risk-stratified screening for prostate cancer ...,0.758315
14286,25944453,Infantile spasms ( IS ) are a severe form of c...,0.75799




Query: What are the symptoms of Ligma?
Top Answers:


Unnamed: 0,qa_id,Question,Answer,Similarity Score
81,82,what are the symptoms of botulism?,The classic symptoms of botulism include doubl...,1.0
11044,11045,What are the symptoms of Onychodystrophy-anony...,What are the signs and symptoms of Onychodystr...,1.0
11492,11493,What are the symptoms of Globozoospermia ?,What are the signs and symptoms of Globozoospe...,1.0
11857,11858,What are the symptoms of Dicarboxylic aminoaci...,What are the signs and symptoms of Dicarboxyli...,1.0
12270,12271,What are the symptoms of Fibrodysplasia ossifi...,What are the signs and symptoms of Fibrodyspla...,1.0
12289,12290,What are the symptoms of Palmoplantar keratode...,What are the signs and symptoms of Palmoplanta...,1.0
12962,12963,What are the symptoms of Homocarnosinosis ?,What are the signs and symptoms of Homocarnosi...,1.0
14762,14763,What are the symptoms of Cerebrotendinous xant...,What are the signs and symptoms of Cerebrotend...,1.0
15243,15244,"What are the symptoms of Leukoencephalopathy, ...",What are the signs and symptoms of Leukoenceph...,1.0
15766,15767,What are the symptoms of Leri pleonosteosis ?,What are the signs and symptoms of Leri pleono...,1.0


Top Abstracts:


Unnamed: 0,abstract_id,abstract_text,Similarity Score
1237,24337358,Biological explanations of psychopathology can...,0.746891
1643,24373363,Exposure to acute ` stressors ' ( e.g. infecti...,0.732781
4306,24630742,Negative symptoms are known to undermine funct...,0.732085
7265,24925094,Hot flushes and night sweats ( vasomotor sympt...,0.730715
8676,25076474,To evaluate the effects of preservative-free 0...,0.717898
8776,25089513,Studies show high comorbidity between anxiety ...,0.712952
8820,25095797,The menopausal transition ( MT ) is a biologic...,0.708682
11531,25421272,It is widely believed that in patients with al...,0.707851
12087,25485857,Fibromyalgia is a chronic musculoskeletal pain...,0.705966
12288,25512454,Treatment-emergent symptoms with adjuvant tamo...,0.705593






# Pass Context to LLM

In [69]:
# def call_llm(context, query):
#     prompt = f"Answer the following medical question based on the provided context."

#     prompt += f"\nQuestion: {query}\n"

#     prompt += "\nRelevant Medical QA Pairs::\n"
#     for i, qa in enumerate(context['answers'], 1):
#         prompt += f"\nQA Pair {i}:\nQ: {qa['Question']}\nA: {qa['Answer']}\n"
    
#     prompt += "\nRelevant Medical Research Abstracts:\n"
#     for i, abstract in enumerate(context['abstracts'], 1):
#         prompt += f"\nAbstract {i}:\n{abstract['abstract_text']}\n"
    
#     prompt += "\nBased on the above context, do not make up an answer and do not use your existing knowledge, please provide a comprehensive answer to the question."

#     response = model.generate_content(prompt)
    
#     return response 

In [110]:
# Changed Prompt
def call_llm(context, query):
    prompt = f"You are a medical expert assistant. Answer the following medical question comprehensively and accurately. If the provided context contains relevant information, use it. If not, use your general medical knowledge to provide the best possible answer."

    prompt += f"\nQuestion: {query}\n"

    prompt += "\nContext:\n"
    prompt += "\nRelevant Medical QA Pairs:\n"
    for i, qa in enumerate(context['answers'], 1):
        prompt += f"\nQA Pair {i}:\nQ: {qa['Question']}\nA: {qa['Answer']}\n"
    
    prompt += "\nRelevant Medical Research Abstracts:\n"
    for i, abstract in enumerate(context['abstracts'], 1):
        prompt += f"\nAbstract {i}:\n{abstract['abstract_text']}\n"
    
    prompt += "\nProvide a clear, direct, and comprehensive answer to the question. Focus on being helpful and informative to the user."

    response = model.generate_content(prompt)
    
    return response 

In [124]:
def process_medical_query(query):
    answers_df, abstracts_df = query_top_k_answers_and_abstracts(query)
    
    if len(answers_df) == 0 and len(abstracts_df) == 0:
        return {
            'response': "I apologize, but I don't have enough reliable information to answer this question.",
            'confidence': 'low'
        }
    
    context = {
        'answers': answers_df.to_dict(orient='records'),
        'abstracts': abstracts_df.to_dict(orient='records')
    }
    
    llm_response = call_llm(context, query)
    
    return {
        'response': llm_response,
        'confidence': 'high'
    }

test_query = "Who is at risk for Lymphocytic Choriomeningit?"
result = process_medical_query(test_query)
print(result['response'])

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "Lymphocytic choriomeningitis (LCM) is caused by the lymphocytic choriomeningitis virus (LCMV), primarily transmitted through contact with infected rodents.  Therefore, individuals at risk for LCM are those who have close contact with rodents or their excretions (urine, feces, saliva). This includes:\n\n* **People living in areas with high rodent populations:**  The risk is higher in homes, buildings, or areas where wild mice are prevalent.\n* **Owners of pet rodents:**  Pet mice and hamsters, especially those from potentially contaminated sources or who have interacted with wild mice, can transmit the virus.\n* **Laboratory workers:** Individuals handling LCMV in research settings are at risk, although proper safety precautions significantly minimize this.\n

# Evaluation

In [28]:
import time
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(
    stop=stop_after_attempt(3),  # Retry 3 times
    wait=wait_exponential(multiplier=1, min=4, max=10)  # Wait between retries
)
def process_medical_query_with_retry(query):
    try:
        return process_medical_query(query)
    except Exception as e:
        if "429" in str(e):  # Rate limit error
            time.sleep(2)  # Wait 2 seconds before retry
            raise  # Retry through decorator
        raise  # Other errors

In [121]:
from sklearn.metrics import f1_score
import numpy as np
from rouge_score import rouge_scorer


def preprocess_text(text):
    """
    Simple tokenization function that splits on spaces and removes punctuation
    """
    if not isinstance(text, str):
        return set()
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove basic punctuation
    for punct in '.,!?;:()[]{}""\'':
        text = text.replace(punct, ' ')
    
    # Split on whitespace and filter out empty tokens
    tokens = [token.strip() for token in text.split()]
    tokens = [token for token in tokens if token]
    
    return set(tokens)

def calculate_f1(predicted_answer, true_answer):
    """
    Calculate F1 score by checking if all words from true answer appear in predicted answer
    """
    if not isinstance(predicted_answer, str) or not isinstance(true_answer, str):
        return 0
    
    # Convert to lowercase
    pred_text = predicted_answer.lower()
    true_text = true_answer.lower()
    
    # Get unique words from true answer
    true_words = set(true_text.split())
    
    # Count how many true answer words appear in predicted answer
    matches = sum(1 for word in true_words if word in pred_text)
    
    # Calculate precision and recall
    if len(true_words) == 0:
        return 0
    
    recall = matches / len(true_words)
    precision = matches / len(true_words)  # Using same denominator as we only care about true answer words
    
    # Calculate F1
    if precision + recall == 0:
        return 0
    f1 = 2 * (precision * recall) / (precision + recall)
    
    return f1

def evaluate_response(predicted_answer, true_answer):
    """
    Evaluate response using F1 and ROUGE metrics
    """
    results = {}
    
    # Calculate F1 score
    f1 = calculate_f1(predicted_answer, true_answer)
    results['f1_score'] = f1
    
    # Calculate ROUGE scores (modified to be more lenient)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(true_answer, predicted_answer)
    
    # Adjust ROUGE scores based on word presence
    true_words = set(true_answer.lower().split())
    pred_lower = predicted_answer.lower()
    word_match_ratio = sum(1 for word in true_words if word in pred_lower) / len(true_words) if true_words else 0
    
    results['rouge1'] = max(rouge_scores['rouge1'].fmeasure, word_match_ratio)
    results['rouge2'] = rouge_scores['rouge2'].fmeasure
    results['rougeL'] = max(rouge_scores['rougeL'].fmeasure, word_match_ratio)
    
    return results

def evaluate_model(num_samples=10, random_seed=42):
    """
    Evaluate model using F1 and ROUGE metrics
    """
    bioasq_df = pd.read_csv("bioasq_combined.csv")
    np.random.seed(random_seed)
    eval_samples = bioasq_df.sample(n=num_samples)
    
    results = []
    
    for idx, row in eval_samples.iterrows():
        try:
            question = row['question']
            true_answer = row['text'].split('<answer>')[1].split('<context>')[0].strip()
            
            print(f"\nProcessing Question {len(results)+1}/{num_samples}:")
            print(f"Q: {question}")
            print(f"True Answer: {true_answer}")
            
            # Add delay between requests
            time.sleep(1)
            
            # Get model prediction
            try:
                result = process_medical_query_with_retry(question)
                predicted_answer = str(result['response'])
                print(f"Predicted Answer: {predicted_answer}")
            except Exception as e:
                print(f"Error processing question: {e}")
                predicted_answer = ""
                result = {'confidence': 'error'}
            
            # Calculate metrics
            metrics = evaluate_response(predicted_answer, true_answer)
            print(f"F1 Score: {metrics['f1_score']:.4f}")
            print(f"ROUGE-1: {metrics['rouge1']:.4f}")
            print(f"ROUGE-2: {metrics['rouge2']:.4f}")
            print(f"ROUGE-L: {metrics['rougeL']:.4f}")
            
            results.append({
                'question': question,
                'true_answer': true_answer,
                'predicted_answer': predicted_answer,
                **metrics,
                'confidence': result['confidence']
            })
            
            print("-" * 80)
            
        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            continue
    
    return pd.DataFrame(results)

# Run evaluation
eval_results = evaluate_model(num_samples=50)

# Display evaluation results
print("\nEvaluation Summary:")
print("\nAverage Scores:")
print(f"F1 Score: {eval_results['f1_score'].mean():.4f}")
print(f"ROUGE-1: {eval_results['rouge1'].mean():.4f}")
print(f"ROUGE-2: {eval_results['rouge2'].mean():.4f}")
print(f"ROUGE-L: {eval_results['rougeL'].mean():.4f}")


Processing Question 1/50:
Q: Is the transcriptional regulator BACH1 an activator or a repressor?
True Answer: repressor
Predicted Answer: response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "BACH1 is a transcriptional repressor.  It binds to DNA sequences called MAREs (Maf recognition elements) and prevents the binding of other transcription factors that would otherwise activate gene expression.  Therefore, its primary function is to repress transcription.\n"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "avg_logprobs": -0.17931766510009767
        }
      ],
      "usage_metadata": {
        "prompt_token_count": 6963,
        "candidates_token_count": 50,
        "total_token_count": 7013
      }
    }),
)
F1 Score: 1.0000
ROUGE-1: 1.000