# Setup

In [1]:
import pandas as pd
import os

In [2]:
# All project paths hang off the parent of the current working directory.
MAIN_DIR = os.path.dirname(os.getcwd())
DATA_DIR, ARTIFACT_DIR = (
    os.path.join(MAIN_DIR, subdir) for subdir in ("data", "artifacts")
)

# Load Data

In [59]:
def classify(score):
    """Collapse a 0-3 score into a binary class.

    Parameters
    ----------
    score : int or float
        A score of 0, 1, 2 or 3, or -1 as the sentinel for "no score".

    Returns
    -------
    int
        0 for scores 0 or 1, 1 for scores 2 or 3, and -1 for the -1 sentinel.

    Raises
    ------
    ValueError
        If ``score`` is none of the values above (NaN included).
    """
    if score in (0, 1):
        return 0
    if score in (2, 3):
        return 1
    if score == -1:
        return -1
    # The exception has to be raised explicitly; merely constructing it would
    # let invalid scores fall through and silently return None.
    raise ValueError(f"Invalid Score: {score!r}")

In [56]:
low_score_folder = "bbps-0-1"
data_folder = os.path.join(DATA_DIR, "hyper-kvasir")

# testcases.txt is read as one image path per line.
with open(os.path.join(data_folder, "testcases.txt"), 'r') as fp:
    data = fp.read()

# Drop blank lines (e.g. the empty entry a trailing newline would leave behind):
# an empty path has no parent-folder component, so split("/")[-2] below would
# raise IndexError on it.
all_file_paths = [line.strip() for line in data.splitlines() if line.strip()]

# The label comes from each path's parent folder: the low-score folder maps
# to 0, every other folder maps to 1.
gt_scores = [0 if path.split("/")[-2] == low_score_folder else 1 for path in all_file_paths]
filenames = [path.split("/")[-1] for path in all_file_paths]

assert len(gt_scores) == len(filenames)

In [60]:
# Load the run_1 results, keeping only the columns used below; the
# "fs_text_score" column is exposed as "gpt_score".
result_df = pd.read_csv(
    os.path.join(ARTIFACT_DIR, "hyper-kvasir", "run_1", "result_0-1794.csv"),
    usecols=["filename", "fs_text_raw_answer", "fs_text_score"],
).rename(columns={"fs_text_score": "gpt_score"})

GT_DIR = os.path.join(DATA_DIR, "hyper-kvasir", "ground_truths")

# Map each ground-truth image file name to the score of the "BBPS <score>"
# folder that contains it.
gt_dict = {}
for gt_score in range(4):
    img_folder = f"BBPS {gt_score}"
    img_files = os.listdir(os.path.join(GT_DIR, img_folder))
    gt_dict.update(dict.fromkeys(img_files, gt_score))

# Rows with a missing model score get the -1 sentinel.
result_df["gpt_score"] = result_df["gpt_score"].fillna(-1)
# Direct indexing: a result file name absent from the ground-truth folders
# raises KeyError.
result_df["gt_score"] = [gt_dict[name] for name in result_df["filename"]]

# Binary classes derived from the scores via classify().
result_df["gt_class"] = result_df["gt_score"].apply(classify)
result_df["gpt_class"] = result_df["gpt_score"].apply(classify)

result_df.head()

Unnamed: 0,filename,fs_text_raw_answer,gpt_score,gt_score,gt_class,gpt_class
0,f69bfb02-30c2-477c-905f-4c219dba30b1.jpg,"Based on the image provided, the bowel prepara...",1.0,1,0,0
1,21ab075e-3ac3-4e8f-a455-6ca78dc5a248.jpg,"Based on the image provided, the mucosa of the...",3.0,3,1,1
2,be4dff9c-3f5d-40c2-8628-ee83c597b653.jpg,"Based on the image provided, the mucosa of the...",2.0,1,0,1
3,a70e990c-9ae8-44bc-8e04-92071ee88039.jpg,"Based on the image provided, the mucosa of the...",3.0,3,1,1
4,c6aae080-89f2-46e7-8fa9-266d57309b9c.jpg,The image shows a colon segment with clear vis...,3.0,3,1,1


# Evaluate

In [66]:
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, classification_report, cohen_kappa_score

## Score (0, 1, 2, 3)

In [72]:
# 4-level comparison: model scores against ground-truth scores.
preds = result_df["gpt_score"]
labels = result_df["gt_score"]

# The -1 sentinel occurs only in the predictions, so that class has zero
# support and its recall is undefined. zero_division=0 reports 0 for it (the
# same value the default yields) without emitting the undefined-metric warnings.
print(classification_report(labels, preds, digits=5, zero_division=0))

              precision    recall  f1-score   support

        -1.0    0.00000   0.00000   0.00000         0
         0.0    0.61386   0.49600   0.54867       125
         1.0    0.82692   0.42744   0.56356       503
         2.0    0.38527   0.73118   0.50464       186
         3.0    0.87996   0.95000   0.91364       980

    accuracy                        0.74916      1794
   macro avg    0.54120   0.52092   0.50610      1794
weighted avg    0.79526   0.74916   0.74765      1794



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
# Restrict the comparison to rows where the model produced a score
# (-1 is the sentinel filled in for missing scores).
answered = result_df["gpt_score"] != -1
no_na_preds = result_df.loc[answered, "gpt_score"]
# Labels must come from the ground-truth column, filtered by the same mask
# so predictions and labels stay row-aligned.
no_na_labels = result_df.loc[answered, "gt_score"]

# Report on the filtered series, not on the unfiltered labels/preds.
print(classification_report(no_na_labels, no_na_preds, digits=5))

In [70]:
# Agreement between the binary ground-truth classes and the model's classes.
labels, preds = result_df["gt_class"], result_df["gpt_class"]

print(cohen_kappa_score(labels, preds))

0.6189504360186617


In [77]:
# Count of each model score within every ground-truth score, as a flat table.
(
    result_df
    .groupby("gt_score")["gpt_score"]
    .value_counts()
    .reset_index()
)

Unnamed: 0,gt_score,gpt_score,count
0,0,0.0,62
1,0,1.0,41
2,0,3.0,10
3,0,-1.0,7
4,0,2.0,5
5,1,1.0,215
6,1,2.0,177
7,1,3.0,68
8,1,0.0,39
9,1,-1.0,4
