# Setup

In [1]:
import pandas as pd
import os

In [2]:
# The project root is the parent of the current working directory; the data and
# artifact folders are looked up directly beneath it.
MAIN_DIR = os.path.split(os.getcwd())[0]
DATA_DIR, ARTIFACT_DIR = (
    os.path.join(MAIN_DIR, subdir) for subdir in ("data", "artifacts")
)

# Load Data

In [3]:
def classify(score):
    """Collapse a 0-3 score into a binary class.

    Args:
        score: One of 0, 1, 2 or 3, or the sentinel -1 (used in this notebook
            for rows whose score was missing).

    Returns:
        0 for scores 0 and 1, 1 for scores 2 and 3, and -1 for -1.

    Raises:
        ValueError: If ``score`` is not one of -1, 0, 1, 2, 3.
    """
    if score in (0, 1):
        return 0
    elif score in (2, 3):
        return 1
    elif score == -1:
        return -1
    else:
        # The exception has to be raised: merely constructing it lets the
        # function fall through and silently return None for bad input.
        raise ValueError(f"Invalid Score: {score!r}")

In [4]:
# Folder holding the hyper-kvasir files used by this notebook.
data_folder = os.path.join(DATA_DIR, "hyper-kvasir")

# testcases.txt is read whole and treated as one file path per line.
with open(os.path.join(data_folder, "testcases.txt"), 'r') as fp:
    data = fp.read()

# Drop empty entries: splitting on "\n" yields a trailing "" when the file ends
# with a newline, which would otherwise become a bogus path and file name.
all_file_paths = [path for path in data.split("\n") if path]

# Keep only the last "/"-separated component of each path.
filenames = [path.split("/")[-1] for path in all_file_paths]

In [36]:
# Read only the columns needed for evaluation and give the model's score
# column a shorter name.
result_path = os.path.join(ARTIFACT_DIR, "hyper-kvasir", "run_3", "result_0-1794.csv")
result_df = pd.read_csv(
    result_path,
    usecols=["filename", "fs_text_raw_answer", "fs_text_score"],
).rename(columns={"fs_text_score": "gpt_score"})

GT_DIR = os.path.join(DATA_DIR, "hyper-kvasir", "ground_truths")

# Map every image file name to the score encoded in its folder name
# ("BBPS 0" ... "BBPS 3").
gt_dict = {}
for score in range(4):
    score_folder = os.path.join(GT_DIR, "BBPS " + str(score))
    gt_dict.update(dict.fromkeys(os.listdir(score_folder), score))

# Missing model scores become -1; the ground-truth score is looked up by file
# name (a KeyError here means a result row has no ground-truth image); both
# scores are then collapsed into classes with classify().
result_df = result_df.assign(
    gpt_score=lambda frame: frame["gpt_score"].fillna(-1),
    gt_score=lambda frame: [gt_dict[name] for name in frame["filename"]],
    gt_class=lambda frame: frame["gt_score"].apply(classify),
    gpt_class=lambda frame: frame["gpt_score"].apply(classify),
)

result_df.head()

Unnamed: 0,filename,fs_text_raw_answer,gpt_score,gt_score,gt_class,gpt_class
0,f69bfb02-30c2-477c-905f-4c219dba30b1.jpg,"Based on the image provided, the mucosa of the...",0.0,1,0,0
1,21ab075e-3ac3-4e8f-a455-6ca78dc5a248.jpg,"Based on the image provided, the mucosa of the...",3.0,3,1,1
2,be4dff9c-3f5d-40c2-8628-ee83c597b653.jpg,"I'm sorry, but I cannot provide assistance wit...",-1.0,1,0,-1
3,a70e990c-9ae8-44bc-8e04-92071ee88039.jpg,"Based on the image provided, the mucosa of the...",3.0,3,1,1
4,c6aae080-89f2-46e7-8fa9-266d57309b9c.jpg,"Based on the image provided, the mucosa of the...",3.0,3,1,1


# Evaluate

In [6]:
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, classification_report, cohen_kappa_score

## Scores (0, 1, 2, 3) and Binary Classes (0, 1)

In [42]:
# Distribution of the model's scores; -1 is the fill value used for rows whose
# score was missing.
result_df["gpt_score"].value_counts()

gpt_score
 3.0    655
-1.0    510
 2.0    267
 0.0    185
 1.0    177
Name: count, dtype: int64

In [66]:
# Per-score report: the model's scores against the ground-truth scores.
preds = result_df["gpt_score"]
labels = result_df["gt_score"]

# The -1 fill value occurs only among the predictions, so that label has no
# true samples and its recall is undefined. zero_division=0 reports the same
# 0.0 the default would, without emitting an undefined-metric warning.
print(classification_report(labels, preds, digits=5, zero_division=0))

              precision    recall  f1-score   support

        -1.0    0.00000   0.00000   0.00000         0
         0.0    0.21081   0.31200   0.25161       125
         1.0    0.68927   0.24254   0.35882       503
         2.0    0.38202   0.54839   0.45033       186
         3.0    0.84885   0.56735   0.68012       980

    accuracy                        0.45652      1794
   macro avg    0.42619   0.33406   0.34818      1794
weighted avg    0.71125   0.45652   0.53636      1794



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
# Per-class report: the model's classes against the ground-truth classes.
preds = result_df["gpt_class"]
labels = result_df["gt_class"]

# The -1 class occurs only among the predictions, so that label has no true
# samples and its recall is undefined. zero_division=0 reports the same 0.0
# the default would, without emitting an undefined-metric warning.
print(classification_report(labels, preds, digits=5, zero_division=0))

              precision    recall  f1-score   support

          -1    0.00000   0.00000   0.00000         0
           0    0.91160   0.52548   0.66667       628
           1    0.86117   0.68096   0.76054      1166

    accuracy                        0.62653      1794
   macro avg    0.59092   0.40215   0.47573      1794
weighted avg    0.87882   0.62653   0.72768      1794



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
# Cohen's kappa between the ground-truth classes and the model's classes.
labels, preds = (result_df[column] for column in ("gt_class", "gpt_class"))

print(cohen_kappa_score(labels, preds))

0.37267751121057513


In [43]:
# Distribution of the model's classes; -1 is what classify() returns for the
# -1 fill value used for missing scores.
result_df["gpt_class"].value_counts()

gpt_class
 1    922
-1    510
 0    362
Name: count, dtype: int64

In [67]:
# For each ground-truth score, count how often each model score occurs, then
# flatten the result to one row per (gt_score, gpt_score) pair.
result_df.groupby("gt_score")["gpt_score"].value_counts().reset_index()

Unnamed: 0,gt_score,gpt_score,count
0,0,-1.0,51
1,0,0.0,39
2,0,1.0,25
3,0,3.0,8
4,0,2.0,2
5,1,0.0,144
6,1,1.0,122
7,1,-1.0,119
8,1,3.0,62
9,1,2.0,56
