In [None]:
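# Uncomment on a fresh environment to install the embedding dependency
# (%pip installs into the environment of the running kernel):
# %pip install -q sentence-transformers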


In [11]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
from pathlib import Path

# ---------------------------------------------------------------------------
# Configuration: input files and column names
# ---------------------------------------------------------------------------

# Ground-truth dataset and the per-model result files (relative to the
# notebook's working directory).
DATA_CSV = Path("data/medquestions_subset_150.csv")
GEMINI_CSV = Path("results/gemini/results_gemini_medquestions150.csv")
GPT_CSV = Path("results/gpt/results_gpt_medquestions150.csv")

# Column names expected in the base dataset CSV.
QUESTION_COL = "question"
TRUTH_COL = "answer"

# Column names expected in each model's results CSV.
RESULT_QUESTION_COL = "question"
RESULT_ANSWER_COL = "model_answer"

# ---------------------------------------------------------------------------
# Base dataset
# ---------------------------------------------------------------------------
print(f"Loading dataset: {DATA_CSV}")
base = pd.read_csv(DATA_CSV)
print("Dataset loaded.")
print("Dataset columns:", base.columns.tolist())
print(base.head(3), "\n")

# Fail fast if a required column is absent; the first missing one (in the
# order question, answer) is the one reported.
missing_cols = [c for c in (QUESTION_COL, TRUTH_COL) if c not in base.columns]
if missing_cols:
    raise ValueError(
        f"Column '{missing_cols[0]}' not found in dataset CSV. "
        f"Available columns: {base.columns}"
    )

# ---------------------------------------------------------------------------
# Sentence-embedding model (shared by the scoring function in a later cell)
# ---------------------------------------------------------------------------
print("Loading embedding model 'all-MiniLM-L6-v2'...")
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Embedding model loaded.\n")

Loading dataset: data\medquestions_subset_150.csv
Dataset loaded.
Dataset columns: ['qid', 'question', 'answer']
                     qid                                           question  \
0  medquestions-150-0000  What are the genetic changes related to famili...   
1  medquestions-150-0001      What are the treatments for Noonan syndrome ?   
2  medquestions-150-0002  How to diagnose National Hormone and Pituitary...   

                                              answer  
0  Mutations in the APC gene cause both classic a...  
1  These resources address the diagnosis or manag...  
2  CJD is usually diagnosed based on signs and sy...   

Loading embedding model 'all-MiniLM-L6-v2'...
Embedding model loaded.



In [12]:
def add_semantic_scores(df_merged, pred_col, truth_col=TRUTH_COL, prefix="", threshold=0.75):
    """
    Adds embedding-based similarity and a thresholded correctness flag to a frame.

    Two columns are written onto df_merged IN PLACE (and the same object is
    returned):
      - '<prefix>semantic_sim'     : cosine similarity between the embedding of
                                     the ground-truth text and the model answer
      - '<prefix>semantic_correct' : 1 if the similarity is >= threshold, else 0

    df_merged : DataFrame with ground-truth + model answer columns
    pred_col  : column name for model answers (e.g., 'gemini_answer')
    truth_col : column name for ground-truth answers (e.g., 'answer')
    prefix    : 'gemini_' or 'gpt_' to prefix new metric columns
    threshold : minimum cosine similarity for an answer to count as correct
                (defaults to 0.75, the value previously hard-coded here)

    Missing values in either text column are encoded as the empty string.
    Embeddings come from the module-level `model` defined in the setup cell.
    """
    sim_col     = f"{prefix}semantic_sim"
    correct_col = f"{prefix}semantic_correct"

    if df_merged.empty:
        print(f"Dataframe is empty in add_semantic_scores for prefix '{prefix}'.")
        # Create the columns with the same dtypes the non-empty path produces,
        # so downstream code sees a consistent schema either way.
        df_merged[sim_col] = pd.Series(index=df_merged.index, dtype="float64")
        df_merged[correct_col] = pd.Series(index=df_merged.index, dtype=int)
        return df_merged

    # NaN -> "" so every row yields a string the encoder can accept.
    truth_texts = df_merged[truth_col].fillna("").astype(str).tolist()
    pred_texts  = df_merged[pred_col].fillna("").astype(str).tolist()

    # Encodes both lists into embeddings
    print(f"Encoding embeddings for prefix '{prefix}'...")
    truth_emb = model.encode(truth_texts, convert_to_tensor=True, show_progress_bar=True)
    pred_emb  = model.encode(pred_texts,  convert_to_tensor=True, show_progress_bar=True)

    # Cosine similarity row by row (row i of truth vs row i of predictions);
    # moved to CPU/numpy so it can be stored as a DataFrame column.
    sims = F.cosine_similarity(truth_emb, pred_emb)
    df_merged[sim_col] = sims.detach().cpu().numpy()

    df_merged[correct_col] = (df_merged[sim_col] >= threshold).astype(int)

    return df_merged

In [17]:
def evaluate_model(results_csv: Path, model_label: str):
    """
    Scores one model's answers against the base dataset.

    results_csv : path to model results CSV
    model_label : short name for model, e.g. 'gemini' or 'gpt'

    Returns (merged, sem_acc): the joined frame with the similarity and
    correctness columns added, and the mean of the correctness column.
    Returns (None, None) when the join with the base dataset has no rows.
    Raises ValueError if the results CSV lacks the question or answer column.

    The join is an inner join on the question text, so a question that
    appears more than once on either side produces multiple merged rows.
    """
    label_upper = model_label.upper()
    col_prefix = f"{model_label}_"
    answer_col = f"{col_prefix}answer"

    print(f"Evaluating {label_upper} from {results_csv}")

    results = pd.read_csv(results_csv)
    print("Results columns:", results.columns.tolist())
    print(results.head(3), "\n")

    # Report the first required column that is missing, if any.
    absent = next(
        (c for c in (RESULT_QUESTION_COL, RESULT_ANSWER_COL) if c not in results.columns),
        None,
    )
    if absent is not None:
        raise ValueError(
            f"Column '{absent}' not found in {results_csv}. "
            f"Available columns: {results.columns}"
        )

    # Give the answer column a model-specific name, then keep only the two
    # columns needed for the join.
    renamed = results.rename(columns={RESULT_ANSWER_COL: answer_col})
    answers_only = renamed[[RESULT_QUESTION_COL, answer_col]]

    merged = base.merge(
        answers_only,
        how="inner",
        left_on=QUESTION_COL,
        right_on=RESULT_QUESTION_COL,
    )

    print(f"{label_upper} merged shape:", merged.shape)

    if merged.empty:
        print(f"No overlapping questions between dataset and {model_label} results.")
        return None, None

    # Append '<label>_semantic_sim' and '<label>_semantic_correct'.
    merged = add_semantic_scores(merged, pred_col=answer_col, prefix=col_prefix)

    sim_col = f"{col_prefix}semantic_sim"
    correct_col = f"{col_prefix}semantic_correct"

    # Fraction of rows flagged correct.
    sem_acc = merged[correct_col].mean()
    print(f"\n{label_upper} semantic accuracy: {sem_acc:.3f}\n")

    # A handful of rows for eyeballing the scores.
    print("Sample rows:")
    sample = merged[[QUESTION_COL, TRUTH_COL, answer_col, sim_col, correct_col]].head(5)
    print(sample, "\n")

    return merged, sem_acc

In [18]:
# Score each model's answers against the ground truth.
gemini_merged, gemini_sem_acc = evaluate_model(GEMINI_CSV, "gemini")
gpt_merged, gpt_sem_acc = evaluate_model(GPT_CSV, "gpt")

# Headline numbers (a model with no merged rows reports "no data").
print("FINAL SEMANTIC ACCURACIES")
if gemini_sem_acc is None:
    print("Gemini: no data")
else:
    print(f"Gemini semantic accuracy: {gemini_sem_acc:.3f}")
if gpt_sem_acc is None:
    print("GPT: no data")
else:
    print(f"GPT semantic accuracy: {gpt_sem_acc:.3f}")

# Side-by-side comparison table, only when both evaluations produced rows.
if gemini_merged is None or gpt_merged is None:
    print("Skipping combined table because one of the models had no merged data.")
else:
    # The ground-truth column is carried by the Gemini side only, so it
    # appears once in the combined frame.
    gemini_eval = gemini_merged[
        [
            QUESTION_COL,
            TRUTH_COL,
            "gemini_answer",
            "gemini_semantic_sim",
            "gemini_semantic_correct",
        ]
    ]
    gpt_eval = gpt_merged[
        [
            QUESTION_COL,
            "gpt_answer",
            "gpt_semantic_sim",
            "gpt_semantic_correct",
        ]
    ]

    both = gemini_eval.merge(gpt_eval, how="inner", on=QUESTION_COL)

    print("Combined comparison shape:", both.shape)
    print(both.head(5), "\n")

    # Persist the combined table, creating the output directory if needed.
    out_path = Path("results/eval_medquestions150_semantic.csv")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    both.to_csv(out_path, index=False)
    print(f"Saved combined semantic evaluation to: {out_path}")

Evaluating GEMINI from results\gemini\results_gemini_medquestions150.csv
Results columns: ['qid', 'model', 'question', 'model_answer', 'latency_ms', 'status', 'error']
                     qid             model  \
0  medquestions-500-0000  gemini-2.5-flash   
1  medquestions-500-0001  gemini-2.5-flash   
2  medquestions-500-0002  gemini-2.5-flash   

                                            question  \
0  What are the genetic changes related to famili...   
1      What are the treatments for Noonan syndrome ?   
2  How to diagnose National Hormone and Pituitary...   

                                        model_answer  latency_ms status  error  
0               Germline mutation in the *APC* gene.         818     ok    NaN  
1  Treatment for Noonan syndrome is supportive an...        1731     ok    NaN  
2  NHPP is a program providing information for in...        2260     ok    NaN   

GEMINI merged shape: (150, 4)
Encoding embeddings for prefix 'gemini_'...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


GEMINI semantic accuracy: 0.360

Sample rows:
                                            question  \
0  What are the genetic changes related to famili...   
1      What are the treatments for Noonan syndrome ?   
2  How to diagnose National Hormone and Pituitary...   
3     Is Spastic diplegia cerebral palsy inherited ?   
4              Is restless legs syndrome inherited ?   

                                              answer  \
0  Mutations in the APC gene cause both classic a...   
1  These resources address the diagnosis or manag...   
2  CJD is usually diagnosed based on signs and sy...   
3  Is spastic diplegia cerebral palsy inherited? ...   
4  The inheritance pattern of restless legs syndr...   

                                       gemini_answer  gemini_semantic_sim  \
0               Germline mutation in the *APC* gene.             0.559506   
1  Treatment for Noonan syndrome is supportive an...             0.727129   
2  NHPP is a program providing information for i

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


GPT semantic accuracy: 0.560

Sample rows:
                                            question  \
0  What are the genetic changes related to famili...   
1      What are the treatments for Noonan syndrome ?   
2  How to diagnose National Hormone and Pituitary...   
3     Is Spastic diplegia cerebral palsy inherited ?   
4              Is restless legs syndrome inherited ?   

                                              answer  \
0  Mutations in the APC gene cause both classic a...   
1  These resources address the diagnosis or manag...   
2  CJD is usually diagnosed based on signs and sy...   
3  Is spastic diplegia cerebral palsy inherited? ...   
4  The inheritance pattern of restless legs syndr...   

                                          gpt_answer  gpt_semantic_sim  \
0  Familial adenomatous polyposis (FAP) is primar...          0.855486   
1  Treatment for Noonan syndrome is symptomatic a...          0.722646   
2  To diagnose conditions related to the National...        