In [1]:
import numpy as np
import os
import pandas as pd

from collections import Counter
from config import MAIN_DIR
from math import comb
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score, confusion_matrix
from typing import Sequence, Dict, Any

from statsmodels.stats import inter_rater as irr
from statsmodels.stats.contingency_tables import mcnemar

In [2]:
# Relative path (one level above the working directory) to the answers data folder.
DATA_DIR = os.path.join("..", "data", "answers")

# Input workbooks, both located in the "artifacts" folder under MAIN_DIR
# (MAIN_DIR is imported from the project's config module).
human_result_path = os.path.join(MAIN_DIR, "artifacts", "human_responses.xlsx")
ai_result_path = os.path.join(MAIN_DIR, "artifacts", "ai_results_summary.xlsx")
# NOTE(review): unlike the two inputs above, this path is relative ("..") rather
# than built from MAIN_DIR -- confirm both resolve to the same "artifacts" folder.
save_folder = os.path.join("..", "artifacts")

In [3]:
def majority_vote(
    data: Sequence
) -> Any:
    """Return the value that occurs most often in ``data``.

    Args:
        data: Iterable of hashable votes (for example the answers of one row).

    Returns:
        The most frequent vote. When several values share the highest count,
        the one that appears first in ``data`` wins.

    Raises:
        ValueError: If ``data`` is empty.
    """
    tallies = Counter(data)
    # Counter keeps first-seen order and max() returns the first maximum,
    # so ties resolve to the earliest value encountered.
    winner, _ = max(tallies.items(), key=lambda vote_and_count: vote_and_count[1])
    return winner

def rename_column(
    column: pd.Series,
    rename_dict: Dict = {
        "USUALLY APPROPRIATE": "UA/MBA",
        "MAY BE APPROPRIATE": "UA/MBA",
        "USUALLY NOT APPROPRIATE": "UNA",
        "INSUFFICIENT INFORMATION": "ICI",
        "UA": "UA/MBA",
        "MBA": "UA/MBA",
    }
) -> pd.Series:
    """Map the raw labels in ``column`` onto the condensed label set.

    Args:
        column: Series holding the labels to translate.
        rename_dict: Mapping from a raw label to its replacement. The default
            maps the long and short "appropriate" variants to ``"UA/MBA"``,
            ``"USUALLY NOT APPROPRIATE"`` to ``"UNA"`` and
            ``"INSUFFICIENT INFORMATION"`` to ``"ICI"``.

    Returns:
        A new Series produced by ``Series.replace``; entries that are not keys
        of ``rename_dict`` are left unchanged.
    """
    relabelled = column.replace(to_replace=rename_dict)
    return relabelled

def evaluate_results(
    labels: Sequence, preds: Sequence
) -> Dict:
    """Score predictions against reference labels.

    Args:
        labels: Reference (ground-truth) labels.
        preds: Predicted labels, aligned with ``labels``.

    Returns:
        Dict with the keys ``accuracy``, ``macro_precision``,
        ``weighted_precision``, ``macro_recall``, ``weighted_recall``,
        ``macro_f1`` and ``weighted_f1``, in that order.
    """
    # (result key, scikit-learn scorer, averaging mode), in output order.
    averaged_metrics = (
        ("macro_precision", precision_score, "macro"),
        ("weighted_precision", precision_score, "weighted"),
        ("macro_recall", recall_score, "macro"),
        ("weighted_recall", recall_score, "weighted"),
        ("macro_f1", f1_score, "macro"),
        ("weighted_f1", f1_score, "weighted"),
    )

    scores = {"accuracy": accuracy_score(labels, preds)}
    for key, scorer, averaging in averaged_metrics:
        # zero_division=np.nan is handed to scikit-learn unchanged for every metric.
        scores[key] = scorer(labels, preds, average=averaging, zero_division=np.nan)
    return scores

# Analysis

In [None]:
# AI result tables, keyed by the sheet name (AI mode) they are read from.
df_dict = dict()
# Summary tables, keyed by human respondent or AI mode.
summary_results = dict()

In [12]:
human_labels = [
    'Human_rad1(CLP)',
    'Human_rad2(KGY)',
    'Human_trainee1(CHY)',
    'Human_trainee2(SUD)',
    'Human_ortho(NYH)'
]

# Class order used for both axes of every confusion matrix (loop-invariant).
labels = ["UA/MBA", "UNA", "ICI"]

for human_label in human_labels:
    # Each respondent's answers live on a sheet named after the respondent.
    human_df = pd.read_excel(human_result_path, sheet_name=human_label)

    # The default mapping of rename_column already covers every label variant
    # needed here, so both columns go through the same single definition.
    human_df["human_gt"] = rename_column(human_df["human_gt"])
    human_df["answer"] = rename_column(human_df["answer"])

    summary_df = human_df[["human_gt", "answer"]]
    # First argument (human_gt) indexes the rows, second (answer) the columns.
    # Values that are not listed in `labels` are not counted by confusion_matrix.
    confusion_arr = confusion_matrix(
        summary_df["human_gt"], summary_df["answer"], labels=labels
    )

    summary_results[human_label] = pd.DataFrame(confusion_arr, columns=labels, index=labels)

In [13]:
ai_modes = [
    "NoRAG", "BaseRAG", "CombinedRAG",
]

# Number of repeated AI runs per question; the sheets hold answer_1 .. answer_N.
N_RUNS = 5
answer_columns = [f"answer_{run}" for run in range(1, N_RUNS + 1)]

# Class order used for both axes of every confusion matrix.
labels = ["UA/MBA", "UNA", "ICI"]

# Per-run metrics for each AI mode: {mode: {metric name: [value per run]}}.
# Kept in its own dict because summary_results[ai_mode] holds the
# majority-vote confusion matrix; storing both under the same key made the
# confusion matrix overwrite (and discard) the per-run metrics.
ai_run_metrics = {}

for ai_mode in ai_modes:
    ai_df = pd.read_excel(ai_result_path, sheet_name=ai_mode, index_col="No")
    df_dict[ai_mode] = ai_df
    ai_df["human_gt"] = rename_column(ai_df["human_gt"])

    # Score every run separately against the ground truth.
    run_metrics = {}
    for answer_column in answer_columns:
        ai_df[answer_column] = rename_column(ai_df[answer_column])
        result_metrics = evaluate_results(ai_df["human_gt"], ai_df[answer_column])
        for metric_name, metric_value in result_metrics.items():
            run_metrics.setdefault(metric_name, []).append(metric_value)
    ai_run_metrics[ai_mode] = run_metrics

    # Collapse the runs into one answer per question (row-wise majority vote).
    ai_df["majority_vote"] = ai_df[answer_columns].apply(majority_vote, axis=1)

    summary_df = ai_df[["human_gt", "majority_vote"]]
    # First argument (human_gt) indexes the rows, second (majority_vote) the columns.
    confusion_arr = confusion_matrix(
        summary_df["human_gt"], summary_df["majority_vote"], labels=labels
    )

    summary_results[ai_mode] = pd.DataFrame(confusion_arr, columns=labels, index=labels)

In [15]:
# Confusion matrix for Human_rad1(CLP): rows = human_gt class, columns = respondent's answer.
summary_results['Human_rad1(CLP)']

Unnamed: 0,UA/MBA,UNA,ICI
UA/MBA,25,10,0
UNA,5,20,0
ICI,0,1,9


In [16]:
# Confusion matrix for Human_rad2(KGY): rows = human_gt class, columns = respondent's answer.
summary_results['Human_rad2(KGY)']

Unnamed: 0,UA/MBA,UNA,ICI
UA/MBA,28,7,0
UNA,2,23,0
ICI,0,3,7


In [17]:
# Confusion matrix for Human_trainee1(CHY): rows = human_gt class, columns = respondent's answer.
summary_results['Human_trainee1(CHY)']

Unnamed: 0,UA/MBA,UNA,ICI
UA/MBA,28,7,0
UNA,16,9,0
ICI,5,1,4


In [18]:
# Confusion matrix for Human_trainee2(SUD): rows = human_gt class, columns = respondent's answer.
summary_results['Human_trainee2(SUD)']

Unnamed: 0,UA/MBA,UNA,ICI
UA/MBA,23,12,0
UNA,5,20,0
ICI,0,0,10


In [19]:
# Confusion matrix for Human_ortho(NYH): rows = human_gt class, columns = respondent's answer.
summary_results['Human_ortho(NYH)']

Unnamed: 0,UA/MBA,UNA,ICI
UA/MBA,24,11,0
UNA,8,17,0
ICI,4,6,0


In [14]:
# Confusion matrix for the NoRAG mode: rows = human_gt class, columns = majority vote over the AI runs.
summary_results["NoRAG"]

Unnamed: 0,UA/MBA,UNA,ICI
UA/MBA,31,4,0
UNA,18,7,0
ICI,10,0,0


In [6]:
# Confusion matrix for the BaseRAG mode: rows = human_gt class, columns = majority vote over the AI runs.
summary_results["BaseRAG"]

Unnamed: 0,UA/MBA,UNA,ICI
UA/MBA,33,2,0
UNA,16,9,0
ICI,8,1,1


In [7]:
# Confusion matrix for the CombinedRAG mode: rows = human_gt class, columns = majority vote over the AI runs.
summary_results["CombinedRAG"]

Unnamed: 0,UA/MBA,UNA,ICI
UA/MBA,33,0,2
UNA,3,22,0
ICI,0,0,10
