In [1]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score

import pandas as pd
import os

In [2]:
MAIN_DIR = "."
DATA_DIR = os.path.join(MAIN_DIR, "answers")

In [3]:
def preprocess(result_df):
    result_df.columns = result_df.columns.str.strip()
    result_df["Approp Score"] = result_df["Approp Score"].replace("USUALLY APPROPRIATE", "UA")
    result_df["Approp Score"] = result_df["Approp Score"].replace("USUALLY NOT APPROPRIATE", "UNA")
    result_df["Approp Score"] = result_df["Approp Score"].replace("MAY BE APPROPRIATE", "MBA")
    result_df["Approp Score"] = result_df["Approp Score"].replace("INSUFFICIENT INFORMATION", "ICI")
    result_df["ANSKEY1"] = result_df["ANSKEY1"].str.strip()
    return result_df
    
def evaluate_results(result_df, classification_dict):
    result_df = preprocess(result_df)
    assert len(result_df) == 70, "Number of testcases must be 70."

    preds = result_df["Approp Score"].replace(classification_dict)
    labels = result_df["ANSKEY1"].replace(classification_dict)

    accuracy = accuracy_score(labels, preds)
    macro_precision = precision_score(labels, preds, average = "macro")
    weighted_precision = precision_score(labels, preds, average = "weighted")
    macro_recall = recall_score(labels, preds, average = "macro")
    weighted_recall = recall_score(labels, preds, average = "weighted")
    macro_f1 = f1_score(labels, preds, average = "macro")
    weighted_f1 = f1_score(labels, preds, average = "weighted")
    cohen_kappa = cohen_kappa_score(labels, preds)
    
    return {
        "accuracy": accuracy,
        "macro_precision": macro_precision,
        "weighted_precision": weighted_precision,
        "macro_recall": macro_recall,
        "weighted_recall": weighted_recall,
        "macro_f1": macro_f1,
        "weighted_f1": weighted_f1,
        "cohen_kappa": cohen_kappa
    }

# 4 Labels: UA, MBA, UNA, ICI

In [4]:
classification_dict = {"UA": 0, "MBA": 1, "UNA": 2, "ICI": 3}

## Summaries

In [5]:
answer_files = ['Human_rad1(CLP).csv',
 'Human_rad2(KGY).csv',
 'Human_trainee1(CHY).csv',
 'Human_trainee2(SUD).csv',
 'Human_ortho(NYH).csv',
 'Non_RAG_LLM.csv',
 'Naive_RAG_cLLM.csv',
 'Optimized_cLLM.csv'
 ]

In [9]:
summaries = []

for answer_file in answer_files:
    print(answer_file)
    summary = {"respondent": answer_file.split(".")[0]}
    answer_path = os.path.join(DATA_DIR, answer_file)
    result_df = pd.read_csv(answer_path)
    evaluations = evaluate_results(result_df, classification_dict)
    summary.update(evaluations)
    summaries.append(summary)

Human_rad1(CLP).csv
Human_rad2(KGY).csv
Human_trainee1(CHY).csv
Human_trainee2(SUD).csv
Human_ortho(NYH).csv
Non_RAG_LLM.csv
Naive_RAG_cLLM.csv
Optimized_cLLM.csv


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
summary_df = pd.DataFrame(summaries)
summary_df.to_csv("result_summaries_4classes.csv")

In [32]:
for answer_file in answer_files:
    answer_path = os.path.join(DATA_DIR, answer_file)
    result_df = preprocess(pd.read_csv(answer_path))
    # result_df = result_df.replace({"ANSKEY1": classification_dict, "Approp Score": classification_dict})
    count_df = result_df.groupby("ANSKEY1")["Approp Score"].value_counts().reset_index(name="count")
    count_df.to_csv(os.path.join(".", "data", "4class_"+answer_file))

## McNemar Tests

In [11]:
answer_matching_df = None

for answer_file in answer_files:
    answer_path = os.path.join(DATA_DIR, answer_file)
    result_df = preprocess(pd.read_csv(answer_path))
    if answer_matching_df is None:
        print('master:', answer_file)
        answer_matching_df = result_df[["Clinical File", "ANSKEY1", "Approp Score"]]
        answer_matching_df = answer_matching_df.rename(columns={"ANSKEY1": "Ground Truth", "Approp Score": answer_file[:-4]})
        answer_matching_df[answer_file[:-4]+"_match"] = (answer_matching_df[answer_file[:-4]] == answer_matching_df["Ground Truth"])
    else:
        answer_matching_df[answer_file[:-4]] = result_df["Approp Score"]
        answer_matching_df[answer_file[:-4]+"_match"] = (answer_matching_df[answer_file[:-4]] == answer_matching_df["Ground Truth"])
    # Check groundtruth
    match_gt = (answer_matching_df["Ground Truth"] == result_df["ANSKEY1"]).sum()
    if match_gt != 70:
        print(answer_file)

master: Human_rad1(CLP).csv


In [12]:
answer_matching_df.to_csv("answers_matching_4classes.csv")

In [13]:
from statsmodels.stats.contingency_tables import mcnemar

respondents = [answer_file[:-4]+"_match" for answer_file in answer_files]
mcnemar_matrix = [[None]*len(respondents) for _ in range(len(respondents))]

for row_idx in range(len(respondents)):
    for col_idx in range(len(respondents)):
        if row_idx != col_idx:
            confusion_matrix = [[0,0],[0,0]]
            confusion_matrix[0][0] = (answer_matching_df[respondents[row_idx]] & answer_matching_df[respondents[col_idx]]).sum()
            confusion_matrix[0][1] = (answer_matching_df[respondents[row_idx]] & ~answer_matching_df[respondents[col_idx]]).sum()
            confusion_matrix[1][0] = (~answer_matching_df[respondents[row_idx]] & answer_matching_df[respondents[col_idx]]).sum()
            confusion_matrix[1][1] = (~answer_matching_df[respondents[row_idx]] & ~answer_matching_df[respondents[col_idx]]).sum()
            mcnemar_results = mcnemar(confusion_matrix, exact=True)
            mcnemar_matrix[row_idx][col_idx] = round(mcnemar_results.pvalue, 5)

In [14]:
mcnemar_df = pd.DataFrame(mcnemar_matrix,
                          columns=[answer_file[:-4] for answer_file in answer_files],
                          index=[answer_file[:-4] for answer_file in answer_files])
mcnemar_df.to_csv("mcnemar_4classes.csv")

## Fisher's Exact Test

In [None]:
from scipy.stats import fisher_exact

respondents = [answer_file[:-4]+"_match" for answer_file in answer_files]
fisher_matrix = [[None]*len(respondents) for _ in range(len(respondents))]

for row_idx in range(len(respondents)):
    for col_idx in range(len(respondents)):
        if row_idx != col_idx:
            confusion_matrix = [[0,0],[0,0]]
            confusion_matrix[0][0] = (answer_matching_df[respondents[row_idx]] & answer_matching_df[respondents[col_idx]]).sum()
            confusion_matrix[0][1] = (answer_matching_df[respondents[row_idx]] & ~answer_matching_df[respondents[col_idx]]).sum()
            confusion_matrix[1][0] = (~answer_matching_df[respondents[row_idx]] & answer_matching_df[respondents[col_idx]]).sum()
            confusion_matrix[1][1] = (~answer_matching_df[respondents[row_idx]] & ~answer_matching_df[respondents[col_idx]]).sum()
            odd_ratio, p_value = fisher_exact(confusion_matrix)
            if p_value < 0.05:
                print()
            fisher_matrix[row_idx][col_idx] = round(p_value, 5)

In [16]:
fisher_df = pd.DataFrame(fisher_matrix,
                         columns=[answer_file[:-4] for answer_file in answer_files],
                         index=[answer_file[:-4] for answer_file in answer_files])
fisher_df.to_csv("fisher_4classes.csv")

## Breakdown by classes

# 3 Labels: UA/MBA, UNA, ICI

In [6]:
# classification_dict = {"UA": 0, "MBA": 0, "UNA": 1, "ICI": 2}
classification_dict = {"UA": "UA/MBA", "MBA": "UA/MBA", "UNA": "UNA", "ICI": "ICI"}

In [7]:
classification_dict

{'UA': 'UA/MBA', 'MBA': 'UA/MBA', 'UNA': 'UNA', 'ICI': 'ICI'}

## Summaries

In [47]:
summaries = []

for answer_file in answer_files:
    print(answer_file)
    summary = {"respondent": answer_file.split(".")[0]}
    answer_path = os.path.join(DATA_DIR, answer_file)
    result_df = pd.read_csv(answer_path)
    evaluations = evaluate_results(result_df, classification_dict)
    summary.update(evaluations)
    summaries.append(summary)

Human_rad1(CLP).csv
Human_rad2(KGY).csv
Human_trainee1(CHY).csv
Human_trainee2(SUD).csv
Human_ortho(NYH).csv
Non_RAG_LLM.csv
Naive_RAG_cLLM.csv
Optimized_cLLM.csv


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
summaries = []

for answer_file in answer_files:
    answer_path = os.path.join(DATA_DIR, answer_file)
    result_df = pd.read_csv(answer_path)

In [21]:
summary_df = pd.DataFrame(summaries)
summary_df.to_csv("result_summaries_3classes.csv")

In [34]:
for answer_file in answer_files:
    answer_path = os.path.join(DATA_DIR, answer_file)
    result_df = preprocess(pd.read_csv(answer_path))
    result_df = result_df.replace({"ANSKEY1": classification_dict, "Approp Score": classification_dict})
    count_df = result_df.groupby("ANSKEY1")["Approp Score"].value_counts().reset_index(name="count")
    count_df.to_csv(os.path.join(".", "data", "3class_"+answer_file))

## McNemar Tests

In [48]:
answer_matching_df = None

for answer_file in answer_files:
    answer_path = os.path.join(DATA_DIR, answer_file)
    result_df = preprocess(pd.read_csv(answer_path))
    result_df = result_df.replace({"ANSKEY1": classification_dict, "Approp Score": classification_dict})
    if answer_matching_df is None:
        print('master:', answer_file)
        answer_matching_df = result_df[["Clinical File", "ANSKEY1", "Approp Score"]]
        answer_matching_df = answer_matching_df.rename(columns={"ANSKEY1": "Ground Truth", "Approp Score": answer_file[:-4]})
        answer_matching_df[answer_file[:-4]+"_match"] = (answer_matching_df[answer_file[:-4]] == answer_matching_df["Ground Truth"])
    else:
        answer_matching_df[answer_file[:-4]] = result_df["Approp Score"]
        answer_matching_df[answer_file[:-4]+"_match"] = (answer_matching_df[answer_file[:-4]] == answer_matching_df["Ground Truth"])
    # Check groundtruth
    match_gt = (answer_matching_df["Ground Truth"] == result_df["ANSKEY1"]).sum()
    if match_gt != 70:
        print(answer_file)

master: Human_rad1(CLP).csv


In [30]:
answer_matching_df.to_csv("answers_matching_3classes.csv")

In [49]:
from statsmodels.stats.contingency_tables import mcnemar
from math import comb
import numpy as np

def calculate_midp_mcnemar(
    confusion_matrix
):
    confusion_matrix = _make_df_square(confusion_matrix)
    confusion_matrix = np.confusion_matrix(table, dtype=np.float64)
    n1, n2 = confusion_matrix[0, 1], confusion_matrix[1, 0]
    statistic = np.minimum(n1, n2)
    total_sum = n1 + n2
    mcnemar_results = mcnemar(confusion_matrix, exact=True)
    mcnemar_pvalue = mcnemar_results.pvalue
    midp_mcnemar_pvalue = mcnemar_pvalue - comb(total_sum, statistic) * (0.5 ** total_sum)
    return midp_mcnemar_pvalue

respondents = [answer_file[:-4]+"_match" for answer_file in answer_files]
mcnemar_matrix = [[None]*len(respondents) for _ in range(len(respondents))]

for row_idx in range(len(respondents)):
    for col_idx in range(len(respondents)):
        if row_idx != col_idx:
            confusion_matrix = [[0,0],[0,0]]
            confusion_matrix[0][0] = (answer_matching_df[respondents[row_idx]] & answer_matching_df[respondents[col_idx]]).sum()
            confusion_matrix[0][1] = (answer_matching_df[respondents[row_idx]] & ~answer_matching_df[respondents[col_idx]]).sum()
            confusion_matrix[1][0] = (~answer_matching_df[respondents[row_idx]] & answer_matching_df[respondents[col_idx]]).sum()
            confusion_matrix[1][1] = (~answer_matching_df[respondents[row_idx]] & ~answer_matching_df[respondents[col_idx]]).sum()
            # mcnemar_results = mcnemar(confusion_matrix, exact=True)
            # mcnemar_matrix[row_idx][col_idx] = round(mcnemar_results.pvalue, 5)
            midp_mcnemar_pvalue = calculate_midp_mcnemar(confusion_matrix)
            mcnemar_matrix[row_idx][col_idx] = round(midp_mcnemar_pvalue, 5)

TypeError: list indices must be integers or slices, not tuple

In [31]:
mcnemar_df = pd.DataFrame(mcnemar_matrix,
                          columns=[answer_file[:-4] for answer_file in answer_files],
                          index=[answer_file[:-4] for answer_file in answer_files])
mcnemar_df.to_csv("mcnemar_3classes.csv")

## Fisher's Exact Test

In [25]:
from scipy.stats import fisher_exact

respondents = [answer_file[:-4]+"_match" for answer_file in answer_files]
fisher_matrix = [[None]*len(respondents) for _ in range(len(respondents))]

for row_idx in range(len(respondents)):
    for col_idx in range(len(respondents)):
        if row_idx != col_idx:
            confusion_matrix = [[0,0],[0,0]]
            confusion_matrix[0][0] = (answer_matching_df[respondents[row_idx]] & answer_matching_df[respondents[col_idx]]).sum()
            confusion_matrix[0][1] = (answer_matching_df[respondents[row_idx]] & ~answer_matching_df[respondents[col_idx]]).sum()
            confusion_matrix[1][0] = (~answer_matching_df[respondents[row_idx]] & answer_matching_df[respondents[col_idx]]).sum()
            confusion_matrix[1][1] = (~answer_matching_df[respondents[row_idx]] & ~answer_matching_df[respondents[col_idx]]).sum()
            odd_ratio, p_value = fisher_exact(confusion_matrix)
            fisher_matrix[row_idx][col_idx] = round(p_value, 5)

In [32]:
fisher_df = pd.DataFrame(fisher_matrix,
                         columns=[answer_file[:-4] for answer_file in answer_files],
                         index=[answer_file[:-4] for answer_file in answer_files])
fisher_df.to_csv("fisher_3classes.csv")

## Counts

In [13]:
match_df_3 = pd.read_csv("answers_matching_3classes.csv")

In [None]:
for human_name in ['Human_rad1(CLP)_match', 'Human_rad2(KGY)_match', 'Human_trainee1(CHY)_match', 'Human_trainee2(SUD)_match', 'Human_ortho(NYH)_match']:
    print(human_name)
    sub_df = match_df_3[["Ground Truth", human_name]]
    print(sub_df.groupby("Ground Truth").sum())

In [22]:
compare_df = match_df_3[["Human_trainee2(SUD)_match", "Optimized_cLLM_match"]]

In [24]:
compare_df.groupby("Human_trainee2(SUD)_match")["Optimized_cLLM_match"].value_counts()

Human_trainee2(SUD)_match  Optimized_cLLM_match
False                      True                    14
                           False                    3
True                       True                    48
                           False                    5
Name: Optimized_cLLM_match, dtype: int64