In [83]:
import pandas as pd
import os

In [84]:
# Project root and the folder holding one answer CSV per respondent;
# DATA_DIR is listed and read from in the "Summaries" and test cells below.
MAIN_DIR = "."
DATA_DIR = os.path.join(MAIN_DIR, "answers")

In [106]:
# Integer code for each appropriateness category; applied with Series.replace
# to both the predicted scores and the answer key before computing accuracy.
classification_dict = {
    "UA": 0,   # USUALLY APPROPRIATE
    "MBA": 1,  # MAY BE APPROPRIATE
    "UNA": 2,  # USUALLY NOT APPROPRIATE
    "ICI": 3,  # INSUFFICIENT INFORMATION
}

In [112]:
# NOTE(review): scratch cell. In the visible notebook `result_df` is first
# assigned inside the "Summaries" loop further down, so on a fresh
# top-to-bottom run this cell has nothing to read. The same two statements
# are repeated inside evaluate_results(); consider deleting this cell.
# Map the category codes in both columns to their integer ids.
preds = result_df["Approp Score"].replace(classification_dict)
labels = result_df["ANSKEY1"].replace(classification_dict)

In [115]:
from sklearn.metrics import accuracy_score

In [123]:
# NOTE(review): scratch cell. Depends on the module-level `preds` / `labels`
# from the cell above; evaluate_results() computes the same figure per file.
# Accuracy expressed as a percentage, rounded to 3 decimals.
accuracy = round(accuracy_score(preds, labels) * 100, 3)

In [120]:
def preprocess(result_df):
    """Normalise one respondent's answer sheet and return it.

    Steps, in order:
      1. strip surrounding whitespace from every column name;
      2. replace the long-form labels in "Approp Score" with their short codes
         (exact, whole-value matches only);
      3. strip surrounding whitespace from the "ANSKEY1" values.

    The DataFrame passed in is modified directly (no copy is taken) and that
    same object is returned.
    """
    long_to_code = {
        "USUALLY APPROPRIATE": "UA",
        "USUALLY NOT APPROPRIATE": "UNA",
        "MAY BE APPROPRIATE": "MBA",
        "INSUFFICIENT INFORMATION": "ICI",
    }

    result_df.columns = result_df.columns.str.strip()

    # One dict-based replace instead of four chained calls; none of the
    # replacement values is itself a key, so the outcome is the same.
    score_col = result_df["Approp Score"]
    result_df["Approp Score"] = score_col.replace(long_to_code)

    key_col = result_df["ANSKEY1"]
    result_df["ANSKEY1"] = key_col.str.strip()
    return result_df
    
def _class_rate(group_col, correct, cls):
    """Percentage (3 d.p.) of the rows where ``group_col == cls`` that are correct.

    Returns the integer 0 when the class has no correct rows (which includes
    the case where the class does not occur at all), so no division by zero
    can happen.
    """
    in_class = group_col == cls
    hits = (in_class & correct).sum()
    if hits == 0:
        return 0
    return round(hits / in_class.sum() * 100, 3)


def evaluate_results(result_df, expected_cases=70):
    """Compute accuracy and per-class precision / recall for one answer sheet.

    The frame is first passed through ``preprocess``. Predictions are read
    from "Approp Score" and the reference answers from "ANSKEY1".

    Parameters
    ----------
    result_df : pandas.DataFrame
        Raw answer sheet containing the "Approp Score" and "ANSKEY1" columns.
    expected_cases : int or None, default 70
        Required number of rows. Pass ``None`` to skip the check.

    Returns
    -------
    dict
        ``accuracy`` plus ``precision_<cls>`` and ``recall_<cls>`` for
        ``ua``, ``mba``, ``una`` and ``ici``; all values are percentages
        rounded to 3 decimals. A class with no correct predictions scores 0.

    Raises
    ------
    AssertionError
        If the number of rows differs from ``expected_cases``.

    Notes
    -----
    Per-class counts are taken directly from the two label columns. They no
    longer go through ``groupby(...)["Match"].count()``, which required an
    unrelated "Match" column and silently skipped rows where it was empty.
    Missing classes are handled explicitly instead of through bare ``except:``
    blocks, so genuine errors (for example a missing column) are not swallowed.
    """
    result_df = preprocess(result_df)
    if expected_cases is not None:
        assert len(result_df) == expected_cases, f"Number of testcases must be {expected_cases}."

    preds = result_df["Approp Score"].replace(classification_dict)
    labels = result_df["ANSKEY1"].replace(classification_dict)

    # accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the
    # value is the same as before, but the call now follows the documented order.
    accuracy = round(accuracy_score(labels, preds) * 100, 3)

    predicted = result_df["Approp Score"]
    actual = result_df["ANSKEY1"]
    correct = predicted == actual

    metrics = {"accuracy": accuracy}
    # Precision is grouped by the predicted label, recall by the true label.
    for prefix, group_col in (("precision", predicted), ("recall", actual)):
        for cls in ("UA", "MBA", "UNA", "ICI"):
            metrics[f"{prefix}_{cls.lower()}"] = _class_rate(group_col, correct, cls)
    return metrics

# Summaries

In [86]:
summaries = []
# os.listdir() returns entries in arbitrary order and includes anything in the
# folder (checkpoint directories, hidden files, ...). Keep only CSV files and
# sort them so the run order, and therefore the "master" file and the row /
# column order of the pairwise matrices built later, is reproducible.
answer_files = sorted(
    name for name in os.listdir(DATA_DIR) if name.lower().endswith(".csv")
)

for answer_file in answer_files:
    print(answer_file)
    # splitext drops only the final extension, matching the `[:-4]` naming
    # used for the same files in the later cells.
    summary = {"respondent": os.path.splitext(answer_file)[0]}
    answer_path = os.path.join(DATA_DIR, answer_file)
    result_df = pd.read_csv(answer_path)
    evaluations = evaluate_results(result_df)
    summary.update(evaluations)
    summaries.append(summary)

Human_CLPRad.csv
Human_HiokRes.csv
Human_KGYRad.csv
Human_NYHOrtho.csv
Human_SudRes.csv
Naive_RAG_LLM.csv
No_RAG_LLM.csv
Optimized_cLLM.csv


In [None]:
# One row per respondent with the metrics returned by evaluate_results();
# written to the working directory (the DataFrame index is included in the CSV).
summary_df = pd.DataFrame(summaries)
summary_df.to_csv("result_summaries.csv")

# Pairwise Significance Tests (McNemar and Fisher Exact)

In [92]:
# Build a table with one row per clinical case and, for every respondent,
# a boolean column telling whether that respondent matched the ground truth.
answer_matching_df = None

for answer_file in answer_files:
    respondent = answer_file[:-4]  # file name without its 4-character extension
    answer_path = os.path.join(DATA_DIR, answer_file)
    result_df = preprocess(pd.read_csv(answer_path))

    if answer_matching_df is None:
        # The first file provides the case list and the reference answer key.
        print('master:', answer_file)
        answer_matching_df = result_df[["Clinical File", "ANSKEY1"]].rename(
            columns={"ANSKEY1": "Ground Truth"}
        )

    # Rows are paired by index label with the master file's rows.
    answer_matching_df[respondent] = result_df["Approp Score"]
    answer_matching_df[respondent] = (
        answer_matching_df[respondent] == answer_matching_df["Ground Truth"]
    )

    # Check groundtruth: print the file name when its answer key does not
    # agree with the master's on all 70 rows.
    match_gt = (answer_matching_df["Ground Truth"] == result_df["ANSKEY1"]).sum()
    if match_gt != 70:
        print(answer_file)

master: Human_CLPRad.csv


In [95]:
from statsmodels.stats.contingency_tables import mcnemar

# Pairwise exact McNemar test on the per-case correctness of every pair of
# respondents; the diagonal is left as None.
respondents = [answer_file[:-4] for answer_file in answer_files]
mcnemar_matrix = [[None] * len(respondents) for _ in range(len(respondents))]

for row_idx, row_name in enumerate(respondents):
    row_correct = answer_matching_df[row_name]
    for col_idx, col_name in enumerate(respondents):
        if row_idx == col_idx:
            continue
        col_correct = answer_matching_df[col_name]
        # 2x2 table: [both correct, only row correct], [only column correct, both wrong]
        confusion_matrix = [
            [(row_correct & col_correct).sum(), (row_correct & ~col_correct).sum()],
            [(~row_correct & col_correct).sum(), (~row_correct & ~col_correct).sum()],
        ]
        mcnemar_results = mcnemar(confusion_matrix, exact=True)
        mcnemar_matrix[row_idx][col_idx] = round(mcnemar_results.pvalue, 5)

In [96]:
# Label the p-value matrix with respondent names on both axes and display it.
# Values were rounded to 5 decimals when the matrix was filled, so very small
# p-values show as 0.0.
mcnemar_df = pd.DataFrame(mcnemar_matrix, columns=respondents, index=respondents)
mcnemar_df

Unnamed: 0,Human_CLPRad,Human_HiokRes,Human_KGYRad,Human_NYHOrtho,Human_SudRes,Naive_RAG_LLM,No_RAG_LLM,Optimized_cLLM
Human_CLPRad,,0.02006,0.09625,0.01067,1.0,1.0,0.00107,0.04329
Human_HiokRes,0.02006,,0.0002,1.0,0.01916,0.0072,0.22952,2e-05
Human_KGYRad,0.09625,0.0002,,2e-05,0.06357,0.10775,0.0,0.69004
Human_NYHOrtho,0.01067,1.0,2e-05,,0.01612,0.01612,0.32694,1e-05
Human_SudRes,1.0,0.01916,0.06357,0.01612,,1.0,0.00119,0.02896
Naive_RAG_LLM,1.0,0.0072,0.10775,0.01612,1.0,,9e-05,0.0169
No_RAG_LLM,0.00107,0.22952,0.0,0.32694,0.00119,9e-05,,0.0
Optimized_cLLM,0.04329,2e-05,0.69004,1e-05,0.02896,0.0169,0.0,


In [None]:
from scipy.stats import fisher_exact

# Pairwise Fisher exact test on the same 2x2 correctness tables; the diagonal
# is left as None.
# NOTE(review): on this paired table Fisher's test measures association
# between two respondents' correctness, not a difference in accuracy (that is
# what McNemar tests) - confirm this is the intended question.
respondents = [answer_file[:-4] for answer_file in answer_files]
fisher_matrix = [[None]*len(respondents) for _ in range(len(respondents))]

for row_idx in range(len(respondents)):
    row_correct = answer_matching_df[respondents[row_idx]]
    for col_idx in range(len(respondents)):
        if row_idx != col_idx:
            col_correct = answer_matching_df[respondents[col_idx]]
            # 2x2 table: [both correct, only row correct], [only column correct, both wrong]
            confusion_matrix = [
                [(row_correct & col_correct).sum(), (row_correct & ~col_correct).sum()],
                [(~row_correct & col_correct).sum(), (~row_correct & ~col_correct).sum()],
            ]
            # Only the p-value is kept. The former `print()` on p < 0.05 emitted
            # nothing but blank lines and has been removed.
            _odds_ratio, p_value = fisher_exact(confusion_matrix)
            fisher_matrix[row_idx][col_idx] = round(p_value, 5)

In [102]:
# Show the raw nested list of rounded Fisher p-values (None on the diagonal);
# rows and columns follow the order of `respondents`.
fisher_matrix

[[None, 0.44623, 0.00762, 0.3035, 0.00251, 0.78536, 0.79731, 0.73144],
 [0.44623, None, 0.76763, 0.47274, 0.04339, 0.00032, 0.02644, 1.0],
 [0.00762, 0.76763, None, 0.03896, 0.00943, 0.52548, 0.76443, 0.10478],
 [0.3035, 0.47274, 0.03896, None, 0.20361, 0.20361, 0.04964, 1.0],
 [0.00251, 0.04339, 0.00943, 0.20361, None, 0.10269, 0.79496, 1.0],
 [0.78536, 0.00032, 0.52548, 0.20361, 0.10269, None, 0.00346, 0.15897],
 [0.79731, 0.02644, 0.76443, 0.04964, 0.79496, 0.00346, None, 0.04137],
 [0.73144, 1.0, 0.10478, 1.0, 1.0, 0.15897, 0.04137, None]]