In [1]:
import os
from math import comb

import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from statsmodels.genmod.bayes_mixed_glm import BinomialBayesMixedGLM
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.proportion import proportion_confint

In [2]:
# Directory layout, resolved from the notebook's working directory:
# MAIN_DIR is the parent of the current working directory, and the answer
# sheets / generated artifacts live in fixed sub-folders beneath it.
MAIN_DIR = os.path.split(os.getcwd())[0]
DATA_DIR = os.path.join(MAIN_DIR, "data", "answers")
ARTIFACT_DIR = os.path.join(MAIN_DIR, "artifacts")

In [3]:
# Study roster as (participant code, seniority group, answer-sheet file name).
# Insertion order is preserved, so downstream loops visit participants in the
# order listed here.
_PARTICIPANT_ROSTER = (
    ("CHY", "Novice", "CHY- Senior Resident radiologist.csv"),
    ("CLP", "Expert", "CLP- MSK Radiologist.csv"),
    ("Gita", "Expert", "Gita- MSK Radiologist.csv"),
    ("Sud", "Novice", "Sud- Resident radiologist.csv"),
    ("YH", "Novice", "Yeong Huei- Ortho.csv"),
)

participants = {
    code: {"Seniority": seniority, "Answers": answers_file}
    for code, seniority, answers_file in _PARTICIPANT_ROSTER
}

In [73]:
# Read every participant's answer sheet and derive, per participant:
#   * 0/1 correctness vectors for the AI, the human alone and the human with AI
#     (stored in `correctness_dict`, names tracked in `respondents`);
#   * two long-format frames (without / with AI) appended to `all_df_list`;
#   * macro precision/recall/F1 and accuracy of the Human-AI answers (`scores`);
#   * the number of correct Human-AI answers per class (`counts`).
all_df_list = []
correctness_dict = {}
respondents = ["ai"]
scores = {
    "participant": [],
    "accuracy": [], "precision": [], "recall": [], "f1": []
}
counts = {
    "participant": [],
    "Total": [], "UA/MBA": [], "UNA": [], "ICI": []
}

# ---- loop-invariant configuration (hoisted out of the participant loop) ----
# Every answer sheet must contain exactly this many questions.
N_QUESTIONS = 70

# Answer columns to normalise, mapped to the tag used in validation messages.
ANSWER_COLUMNS = {
    "GPT Answer": "GPT-answer",
    "Human-AI Answer": "HumanAI-answer",
    "Correct Answer": "Correct-answer",
    "Human Answer": "Human-answer",
}

# Collapse the raw labels into the three analysed classes.
rename_dict = {
    "USUALLY APPROPRIATE": "UA/MBA",
    "MAY BE APPROPRIATE": "UA/MBA",
    "MBA": "UA/MBA", "UA": "UA/MBA",
    "USUALLY NOT APPROPRIATE": "UNA",
    "INSUFFICIENT INFORMATION": "ICI"
}
ALLOWED_CLASSES = ["UA/MBA", "UNA", "ICI", "NO ANSWER"]
CLASS2IDX = {"UA/MBA": 0, "UNA": 1, "ICI": 2, 'NO ANSWER': 3}

# "NO ANSWER" (index 3) is excluded from the macro-averaged metrics and counts.
SCORED_LABELS = [0, 1, 2]
COUNTED_CLASSES = ["UA/MBA", "UNA", "ICI"]
MACRO_METRICS = (
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1", f1_score),
)

for participant in participants:
    answer_path = os.path.join(DATA_DIR, participants[participant]["Answers"])
    answer_df = pd.read_csv(answer_path)

    # Fail early with a readable message instead of a pandas length-mismatch
    # error when the frames below are assembled.
    if len(answer_df) != N_QUESTIONS:
        raise ValueError(
            f"{participant}: expected {N_QUESTIONS} rows in {answer_path}, "
            f"found {len(answer_df)}"
        )

    # Normalise each answer column (trim, upper-case, collapse labels, mark
    # blanks as "NO ANSWER") and report any label outside the allowed set.
    for column, tag in ANSWER_COLUMNS.items():
        answer_df[column] = (
            answer_df[column]
            .str.strip()
            .str.upper()
            .replace(rename_dict)
            .fillna("NO ANSWER")
        )
        for category in answer_df[column].unique():
            if category not in ALLOWED_CLASSES:
                print(f"{participant}-{tag}: {category} not in allowed class")

    ground_truths = answer_df["Correct Answer"]
    ai_correctness = (answer_df["GPT Answer"] == ground_truths).astype(int).values
    human_correctness = (answer_df["Human Answer"] == ground_truths).astype(int).values
    human_ai_correctness = (answer_df["Human-AI Answer"] == ground_truths).astype(int).values

    # The "ai" entry is overwritten on every iteration, so the last sheet wins.
    # NOTE(review): presumably the GPT answers and the answer key are identical
    # in every participant's sheet - confirm.
    correctness_dict["ai"] = ai_correctness
    correctness_dict[f"{participant}_noai"] = human_correctness
    correctness_dict[f"{participant}_withai"] = human_ai_correctness
    respondents.extend([f"{participant}_noai", f"{participant}_withai"])

    # Long-format rows: one frame per AI-usage condition.
    shared_columns = {
        "question_no": list(range(1, N_QUESTIONS + 1)),
        "participant": participant,
        "seniority": participants[participant]["Seniority"],
        "ai_correctness": ai_correctness,
    }
    human_only_df = pd.DataFrame(
        {**shared_columns, "human_correctness": human_correctness, "ai_usage": 0}
    )
    humanai_df = pd.DataFrame(
        {**shared_columns, "human_correctness": human_ai_correctness, "ai_usage": 1}
    )
    all_df_list.extend([human_only_df, humanai_df])

    # Integer-encode the labels for sklearn. `.map` is used instead of
    # `.replace` so the result does not depend on pandas' deprecated implicit
    # object->int downcasting; a label missing from CLASS2IDX becomes NaN and is
    # rejected by the metric functions.
    answer_number_df = answer_df.copy()
    for column in ANSWER_COLUMNS:
        answer_number_df[column] = answer_number_df[column].map(CLASS2IDX)

    y_true = answer_number_df["Correct Answer"]
    y_pred = answer_number_df["Human-AI Answer"]
    scores["participant"].append(participant)
    scores["accuracy"].append(accuracy_score(y_true, y_pred))
    for metric_name, metric_fn in MACRO_METRICS:
        scores[metric_name].append(
            metric_fn(y_true, y_pred, labels=SCORED_LABELS, average="macro")
        )

    # Correct Human-AI answers per class. A class with no correct answer is
    # absent from the groupby result, hence the `.get(..., 0)`.
    correct_df = answer_df[answer_df["Correct Answer"] == answer_df["Human-AI Answer"]]
    count_dict = correct_df.groupby("Correct Answer")["Human-AI Answer"].count().to_dict()
    per_class_counts = {cls: count_dict.get(cls, 0) for cls in COUNTED_CLASSES}
    counts["participant"].append(participant)
    counts["Total"].append(sum(per_class_counts.values()))
    for cls, n_correct in per_class_counts.items():
        counts[cls].append(n_correct)

In [75]:
# Save the per-participant counts of correct Human-AI answers by class.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(ARTIFACT_DIR, exist_ok=True)
pd.DataFrame(counts).to_csv(os.path.join(ARTIFACT_DIR, "counts.csv"))

# Binomial proportion confidence interval

In [52]:
# Wilson score interval for each respondent's proportion of correct answers.
alpha = 0.05
lci = []
uci = []
observed_accuracy = []

for respondent in respondents:
    correctness_list = correctness_dict[respondent]
    wilson_stats = proportion_confint(correctness_list.sum(), len(correctness_list), alpha=alpha, method="wilson")
    lci.append(wilson_stats[0])
    uci.append(wilson_stats[1])
    # Point estimate = observed fraction correct. The Wilson interval is not
    # centred on this value (its midpoint is pulled toward 0.5), so the
    # midpoint must not be reported as the accuracy.
    observed_accuracy.append(correctness_list.mean())

wilson_df = pd.DataFrame(
    {
        "respondent": respondents,
        "lci": lci, "uci": uci
    }
)
wilson_df["accuracy"] = observed_accuracy

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(ARTIFACT_DIR, exist_ok=True)
wilson_df.to_csv(os.path.join(ARTIFACT_DIR, "ci.csv"))

# Pairwise McNemar Statistical Tests

In [31]:
def _make_df_square(table):
    
    if not isinstance(table, pd.DataFrame):
        return table

    if not table.index.equals(table.columns):
        ix = list(set(table.index) | set(table.columns))
        ix.sort()
        table = table.reindex(index=ix, columns=ix, fill_value=0)

    table = table.reindex(table.columns)

    return table

def calculate_midp_mcnemar(
    confusion_matrix
):
    """Mid-p variant of the exact McNemar test for a 2x2 paired table.

    The exact two-sided McNemar p-value is computed with statsmodels and the
    binomial point probability of the smaller discordant count is subtracted
    from it.

    Parameters
    ----------
    confusion_matrix : array-like or DataFrame, 2x2
        Paired contingency table; only the off-diagonal (discordant) cells
        ``[0, 1]`` and ``[1, 0]`` are read here.

    Returns
    -------
    float
        The mid-p value, or 1.0 when there are no discordant pairs.
    """
    table = _make_df_square(confusion_matrix)
    # `np.int` was removed from NumPy (1.24); the builtin `int` is equivalent.
    table = np.asarray(table, dtype=int)
    n1, n2 = int(table[0, 1]), int(table[1, 0])
    statistic = min(n1, n2)
    total_sum = n1 + n2

    # No discordant pairs means no evidence of a difference. Without this guard
    # the subtraction below would give 1 - C(0, 0) * 0.5**0 = 0.
    if total_sum == 0:
        return 1.0

    mcnemar_results = mcnemar(confusion_matrix, exact=True)
    mcnemar_pvalue = mcnemar_results.pvalue
    # NOTE(review): when n1 == n2 this evaluates to (exact p) - pmf; confirm
    # that this matches the intended mid-p definition for ties.
    midp_mcnemar_pvalue = mcnemar_pvalue - comb(total_sum, statistic) * (0.5 ** total_sum)
    return midp_mcnemar_pvalue

In [39]:
# Pairwise mid-p McNemar p-values between all respondents. The diagonal
# (a respondent against itself) is left as None.
mcnemar_matrix = [[None]*len(respondents) for _ in range(len(respondents))]

for row_idx, row_name in enumerate(respondents):
    # The correctness vectors hold 0/1 integers; `~` on integers is bitwise NOT
    # (giving -1/-2), which corrupts the "both wrong" cell. Cast to bool so
    # that `~` is a logical negation.
    row_correct = correctness_dict[row_name].astype(bool)
    for col_idx, col_name in enumerate(respondents):
        if row_idx == col_idx:
            continue
        col_correct = correctness_dict[col_name].astype(bool)
        # Rows: row respondent correct / wrong; columns: column respondent
        # correct / wrong.
        confusion_matrix = [
            [(row_correct & col_correct).sum(), (row_correct & ~col_correct).sum()],
            [(~row_correct & col_correct).sum(), (~row_correct & ~col_correct).sum()],
        ]
        midp_mcnemar_pvalue = calculate_midp_mcnemar(confusion_matrix)
        mcnemar_matrix[row_idx][col_idx] = midp_mcnemar_pvalue

In [41]:
# Label the pairwise p-value matrix with respondent names and export it.
mcnemar_df = pd.DataFrame(mcnemar_matrix,
                          columns=respondents, index=respondents)

# to_csv does not create missing directories, so make sure the target exists.
mcnemar_dir = os.path.join(ARTIFACT_DIR, "mcnemar")
os.makedirs(mcnemar_dir, exist_ok=True)
mcnemar_df.to_csv(os.path.join(mcnemar_dir, "mcnemar_3classes.csv"))

# Generalized Linear Mixed Effects Model

In [5]:
# Stack the per-participant frames (one without AI and one with AI for each
# participant) into a single long-format table. The original row indices are
# kept, so index values repeat across the stacked frames.
combined_df = pd.concat(all_df_list)
# Optional export of the modelling dataset (disabled):
# combined_df.to_csv(os.path.join(ARTIFACT_DIR, "glmm", "mixed_effects_dataset.csv"))

In [6]:
# Preview the first rows of the stacked dataset.
combined_df.head()

Unnamed: 0,question_no,participant,seniority,ai_correctness,human_correctness,ai_usage
0,1,CHY,Novice,1,1,0
1,2,CHY,Novice,1,1,0
2,3,CHY,Novice,0,1,0
3,4,CHY,Novice,1,0,0
4,5,CHY,Novice,1,1,0


In [7]:
# Encode seniority as a binary covariate: Novice -> 0, Expert -> 1.
# Values that are already encoded are left untouched, so re-running is safe.
seniority_codes = {"Novice": 0, "Expert": 1}
combined_df["seniority"] = combined_df["seniority"].replace(seniority_codes)

In [8]:
# Plain NumPy views of the modelling columns: the binary outcome, the fixed
# effect covariates and the grouping (random effect) identifiers.
fixed_effect_columns = ["seniority", "ai_correctness", "ai_usage"]
random_effect_columns = ["question_no", "participant"]

outcome = combined_df["human_correctness"].to_numpy()
fixed_effects = combined_df[fixed_effect_columns].to_numpy()
random_effects = combined_df[random_effect_columns].to_numpy()

In [9]:
# Random-effects structure: one variance component per question ("a") and one
# per participant ("b"), each given as a formula without an intercept.
# NOTE: the name `random` would shadow the stdlib `random` module if that
# module were ever imported in this notebook.
random = {"a": '0 + C(question_no)', "b": '0 + C(participant)'}

# Binomial mixed GLM of answer correctness on seniority, AI usage and whether
# the AI itself was correct, with the random effects declared above.
# BinomialBayesMixedGLM is imported in the notebook's top import cell.
model = BinomialBayesMixedGLM.from_formula(
    'human_correctness ~ seniority + ai_usage + ai_correctness',
    random,
    combined_df
)

In [14]:
# Fit the mixed model with statsmodels' variational Bayes routine (fit_vb).
result = model.fit_vb()

In [16]:
# Display the posterior summary table of the fitted model.
result.summary()

0,1,2,3,4,5,6
,Type,Post. Mean,Post. SD,SD,SD (LB),SD (UB)
Intercept,M,-0.0247,0.0990,,,
seniority,M,0.7634,0.1736,,,
ai_usage,M,0.9640,0.1563,,,
ai_correctness,M,0.8621,0.1072,,,
a,V,-0.0888,0.0843,0.915,0.773,1.083
b,V,-1.2065,0.3413,0.299,0.151,0.592
