In [None]:
import os

from openai import AzureOpenAI                      

import pandas as pd
import numpy as np
import sys, time, random, math, os, json, spacy
import IRAEUtils

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, make_scorer, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

In [None]:
# Enable IPython's autoreload extension so edits to local modules (e.g. IRAEUtils)
# are picked up without restarting the kernel. Mode 2 reloads all modules before
# each cell execution.
%load_ext autoreload
%autoreload 2
# NOTE(review): presumably kept so that re-running this cell re-initialises the
# extension instead of only reporting that it is already loaded - confirm it is needed.
%reload_ext autoreload

# Show every column and the full (untruncated) cell contents when displaying data frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Config parameters

In [None]:
# Azure OpenAI deployment used for every LLM call; also part of the output folder and file names.
#GPT_DEPLOYMENT = "OAI05-GPT35Turbo16-0613_061823"
#GPT_DEPLOYMENT = "V04-GPT4Turbo-2024-04-09"
GPT_DEPLOYMENT = "V05-GPT4o"

# Single-threshold run: a patient gets an irAE label when the number of notes
# predicted positive for that label is strictly greater than IRAE_LABEL_COUNT.
#IRAE_LABEL_COUNT = 0
IRAE_LABEL_COUNT = 1

# Multi-threshold run: thresholds 0 .. IRAE_LABEL_COUNT_MAX - 1 are evaluated.
IRAE_LABEL_COUNT_MAX = 11

# Annotation batch identifier; selects the annotation map and tags the output files.
BATCH = "TEST"
#BATCH = "04"

# Fail early with a readable message instead of the opaque
# "unsupported operand type(s) for +: 'NoneType' and 'str'" raised when the variable is unset.
_vu_proj_path = os.getenv("VU_PROJ_PATH")
if _vu_proj_path is None:
    raise RuntimeError(
        "Environment variable VU_PROJ_PATH is not set; it must point to the project root "
        "and end with a path separator."
    )

project_path = _vu_proj_path + "immunotoxicity/"
notes_path = project_path + "out/llm/notes/" 
#notes_path = project_path + "out/llm/notes/notestest/" 

irae_anno_map_path = f"{project_path}out/llm/notes/batches/frozen/Map.person-level.Batch.{BATCH}.irAE.csv"
irae_synset_map_path = project_path + "out/llm/notes/batches/frozen/Map.irae.synset.FINAL.csv"

#path_eval = f"{project_path}out/llm/eval-patient-level/test/{GPT_DEPLOYMENT}/"
path_eval = f"{project_path}out/llm/eval-patient-level/{GPT_DEPLOYMENT}/"

# Every result file is written under path_eval; create it up front so a missing
# directory is not discovered only after LLM calls have already been made.
os.makedirs(path_eval, exist_ok=True)

# Azure OpenAI client; credentials and endpoint are read from the environment.
llm = AzureOpenAI(
  api_key = os.getenv("AZURE_OPENAI05_API_KEY"),  
  api_version =  "2024-07-01-preview",  
  azure_endpoint = os.getenv("AZURE_OPENAI05_ENDPOINT")
)

# spaCy pipeline handed to IRAEUtils when converting yes/no answers to 1/0.
nlp = spacy.load('en_core_web_sm')

print(irae_anno_map_path)

## Load data

In [None]:
# Patient-level annotation table.
# Header: 'GRID', 'PID', 'AgeFirstICI', 'RaceEth', 'Gender', 'ICIType', 'CancerType', 'FolderName', [irAEs]
df_irae_data = pd.read_csv(irae_anno_map_path)

# irAE labels are every column after the 8 leading non-irAE columns listed above
list_irae_label = list(df_irae_data.columns)[8:]

# Manual/gold irAE labels keyed by patient GRID
df_manual_GRID_IRAE = df_irae_data[['GRID', *list_irae_label]]

# ICI names (with brand names / abbreviations) interpolated into the system prompt
list_ici_label = "atezolizumab (tecentriq, atezo), avelumab (bavencio), durvalumab (imfinzi), ipilimumab (yervoy), nivolumab (opdivo, nivo), pembrolizumab (keytruda, pembro)"

# Dictionary mapping each irAE label to its variations (read from columns 0 and 1 of the synset map)
map_irae_label_expand = IRAEUtils.read2cols_2dict_all(irae_synset_map_path, 0, 1, "|")

In [None]:
def folder_size() :
    """Return the total number of entries reported by IRAEUtils.file_names over all
    folders listed in df_irae_data['FolderName'] (looked up under notes_path)."""
    return sum(
        len(IRAEUtils.file_names(notes_path + "/" + patient_folder_name))
        for patient_folder_name in df_irae_data['FolderName'].tolist()
    )

print(folder_size())

## IRAE prompt

In [None]:
# Prompt instructions for each irAE type. The output is requested in JSON format
# Replaces get_irae_list_json
def json_format_irae_list(list_irae_label):
    """Build the JSON-like answer template that is embedded in the system prompt.

    Each label contributes one line of the form `"<label>": <instruction>,` where the
    instruction mentions the label's expanded wording from map_irae_label_expand.
    The lines are wrapped in a single pair of braces; the result is a prompt
    template, not parseable JSON.
    """
    label_instructions = [
        f'''\n"{irae_label}": Output 'Yes' if the patient has experienced {map_irae_label_expand[irae_label]} because of exposure to one or more immune checkpoint inhibitors. Otherwise, output 'No'.,'''
        for irae_label in list_irae_label
    ]
    return '{' + ''.join(label_instructions) + '}'

#print(get_irae_list_json(irae_label_list))

# Zero-shot prompt template 
def irae_prompt(note_text, list_ici_label, list_irae_label):
    """Build the zero-shot chat messages used to ask the LLM about irAEs in one note.

    Args:
        note_text: text of the patient note, interpolated into the user message.
        list_ici_label: ICI names, interpolated verbatim into the system message.
        list_irae_label: irAE labels; passed to json_format_irae_list to build the
            requested answer structure.

    Returns:
        A list with two dicts ("role"/"content"): the system message followed by
        the user message.
    """
    # Per-label answer template appended to the end of the system message
    irae_json_format = json_format_irae_list(list_irae_label)
    messages = []
    # NOTE: the indentation of the continuation lines below is inside the f-strings,
    # so it is part of the text sent to the model.
    messages.append({"role": "system", "content": f"""You are a clinical expert in identifying immune-related adverse events (irAEs) caused by immune checkpoint inhibitors (ICIs).                                           
                     You will receive as input a patient note corresponding to a patient who was treated or is currently treated with one or multiple immune checkpoint inhibitors (ICIs) from the following ICI list: {list_ici_label}. 
                     Your task is to determine if the patient note describes any of the immune-related adverse events (irAEs) experienced by the patient and caused by immune checkpoint inhibitors.
                     Output your response in a JSON format using the following structure: 
                     {irae_json_format}"""})
    messages.append({"role": "user", "content": f"""Does the following patient note describe immune-related adverse 
                     events experienced by the patient? 
                     Patient note: {note_text}"""})                     
    return messages

# Show the prompt produced for a placeholder note (sanity check of the template)
print(irae_prompt(" .. note .. ", list_ici_label, list_irae_label))

## LLM output processing

In [None]:
# in:  [note-level] dictionary of yes/no responses by LLM for each irAE label
# out: [note-level] dictionary of 1/0 responses by LLM for each irAE label
#
# note: yes/no conversion to 1/0 involves NLP processing
def convert_llmresponse_yn_to_01(json_dict_irae_llmreponse_yn, list_irae_label, nlp) :
    """Convert one note's parsed LLM answer from yes/no text to 1/0 per irAE label.

    Args:
        json_dict_irae_llmreponse_yn: value obtained by parsing the LLM's JSON answer;
            expected to be a dict with exactly one entry per irAE label.
        list_irae_label: irAE labels that must all be present in the answer.
        nlp: spaCy pipeline forwarded to IRAEUtils.convert_llm_response_to_01.

    Returns:
        (ok, dict_llmresponse01). ok is False when the answer is not a dict, has a
        different number of entries than list_irae_label, misses a label, or contains
        a value that IRAEUtils.convert_llm_response_to_01 maps to -1. On failure the
        dict holds whatever was converted before the problem was found.
    """
    dict_llmresponse01 = dict()

    # A syntactically valid JSON answer may still be a list, string, number or null;
    # reject those instead of crashing in len() or in the key lookup below.
    if not isinstance(json_dict_irae_llmreponse_yn, dict) :
        return False, dict_llmresponse01

    if len(list_irae_label) != len(json_dict_irae_llmreponse_yn):
        return False, dict_llmresponse01

    for irae_label in list_irae_label :
        if irae_label not in json_dict_irae_llmreponse_yn :            
            return False, dict_llmresponse01

        llmresponse01 = IRAEUtils.convert_llm_response_to_01(json_dict_irae_llmreponse_yn[irae_label], nlp)
        dict_llmresponse01[irae_label] = llmresponse01

        # -1 marks an answer that could not be mapped to 1/0: discard the whole note
        if llmresponse01 == -1 :
            return False, dict_llmresponse01

    return True, dict_llmresponse01


# in:  [patient-level] list of LLM responses in yes/no JSON format corresponding to all notes of a given patient 
# out: [patient-level] data frame with 1/0 responses. Each row corresponds to a note. Each column corresponds to an irAE label.
def convert_list_llmresponse_yn_to_01(list_note_fnames, list_note_llmresponse_yn, list_irae_label, nlp) :
    """Convert all LLM answers of one patient into a note-by-label 1/0 data frame.

    Answers that are not valid JSON (per IRAEUtils.is_json) or that
    convert_llmresponse_yn_to_01 rejects are skipped, together with their file name.

    Args:
        list_note_fnames: note file names, parallel to list_note_llmresponse_yn.
        list_note_llmresponse_yn: raw LLM answers (strings), optionally wrapped in a
            ```json ... ``` fence.
        list_irae_label: irAE labels; they become the columns of the result.
        nlp: spaCy pipeline forwarded to convert_llmresponse_yn_to_01.

    Returns:
        (patient_df_notefname, patient_multirow_df_llmresponse_01): a one-column
        ('FileName') frame with the notes that were kept, and the frame of their 1/0
        responses in the same row order.
    """
    list_note_fnames_filter = []
    list_note_dict_llmresponse01 = []

    for note_fname, llmresponse_yn in zip(list_note_fnames, list_note_llmresponse_yn) :
        # drop the markdown code fence some answers are wrapped in
        json_llm_response_yn = llmresponse_yn.removeprefix("```json").removesuffix("```").strip()
        if not IRAEUtils.is_json(json_llm_response_yn) :
            continue

        json_dict_irae_reponse = json.loads(json_llm_response_yn)
        flag, dict_llm_response01 = convert_llmresponse_yn_to_01(json_dict_irae_reponse, list_irae_label, nlp)
        if flag :
            list_note_fnames_filter.append(note_fname)
            list_note_dict_llmresponse01.append(dict_llm_response01)

    # Build both frames once from the collected records rather than growing a frame
    # with pd.concat for every note.
    patient_df_notefname = pd.DataFrame(list_note_fnames_filter, columns=['FileName'])
    patient_multirow_df_llmresponse_01 = pd.DataFrame(list_note_dict_llmresponse01, columns=list_irae_label)
    return patient_df_notefname, patient_multirow_df_llmresponse_01

# in: multi-row df of binary LLM responses for all patient notes
# out: one-row df with collapsed LLM responses
#
# Note: an irAE label is assigned at patient level when it was predicted for MORE THAN
# `threshold` notes (threshold 0 = at least one note, threshold 1 = at least two notes, ...)
def collapse_llresponses01_notes_to_patient(patient_multirow_df_llmresponse_01, threshold=None) :
    """Collapse note-level 1/0 predictions into a single patient-level row.

    Args:
        patient_multirow_df_llmresponse_01: data frame with one row per note and one
            1/0 column per irAE label.
        threshold: a label is set to 1 when its column sum is strictly greater than
            this value. Defaults to the module-level IRAE_LABEL_COUNT.

    Returns:
        One-row data frame with the same columns holding the collapsed 1/0 values.
    """
    if threshold is None :
        threshold = IRAE_LABEL_COUNT

    dict_collapsed_llmrespose01 = {
        colname: 1 if patient_multirow_df_llmresponse_01[colname].sum() > threshold else 0
        for colname in patient_multirow_df_llmresponse_01.columns
    }
    return pd.DataFrame([dict_collapsed_llmrespose01])

# generalization of collapse_llresponses01_notes_to_patient for multiple threshold values
def collapse_llresponses01_notes_to_patient_multi_th(patient_multirow_df_llmresponse_01, label_count_max=None) :
    """Collapse note-level 1/0 predictions to patient level for a range of thresholds.

    Args:
        patient_multirow_df_llmresponse_01: data frame with one row per note and one
            1/0 column per irAE label.
        label_count_max: number of thresholds to evaluate (0 .. label_count_max - 1).
            Defaults to the module-level IRAE_LABEL_COUNT_MAX.

    Returns:
        List of one-row data frames; element `t` has value 1 for every label whose
        column sum is strictly greater than `t`, and 0 otherwise.
    """
    if label_count_max is None :
        label_count_max = IRAE_LABEL_COUNT_MAX

    # The column sums do not depend on the threshold: compute them once.
    dict_column_sum = {
        colname: patient_multirow_df_llmresponse_01[colname].sum()
        for colname in patient_multirow_df_llmresponse_01.columns
    }

    return [
        pd.DataFrame([{colname: 1 if column_sum > threshold else 0
                       for colname, column_sum in dict_column_sum.items()}])
        for threshold in range(label_count_max)
    ]

## Run LLMs - multiple thresholds

In [None]:
# Patient-level run evaluated for thresholds 0 .. IRAE_LABEL_COUNT_MAX - 1.
# For every patient: query the LLM once per note, persist the raw yes/no answers,
# convert them to 1/0, collapse them to one row per threshold, and write running
# ("CRT") label files and evaluation reports covering all patients processed so far.
#
# NOTE(review): every to_csv below writes under path_eval; nothing in this cell
# creates that directory.
exception_list = []   # one entry per failed LLM call
list_patient_id = []  # GRIDs of the patients processed so far, in processing order
patient_counter = 0
# patient_df_llmresponse_01 > list_patient_df_llmresponse01_multi_th
# One accumulating patient-level frame per threshold (index in the list == threshold)
list_patient_df_llmresponse01_multi_th = [] # init w/ empty dfs
for threshold in range(IRAE_LABEL_COUNT_MAX):
    list_patient_df_llmresponse01_multi_th.append(pd.DataFrame(columns=list_irae_label))

for patient_id, patient_folder in zip (df_irae_data['GRID'], df_irae_data['FolderName']) :
    list_patient_id.append(patient_id)
    patient_counter += 1
    print(f"{patient_counter} - {patient_id} - {patient_folder}")
    list_patient_note_llmresponse_yn = []
    list_patient_note_fname = IRAEUtils.file_names(notes_path + patient_folder)
    for patient_note_fname in  list_patient_note_fname:
        # progress indicator: one dot per note
        print('.', end='', flush=True)
        #print("Patient note:" + patient_note_fname)
        note = IRAEUtils.read(notes_path + patient_folder + "/" + patient_note_fname)
        #print(irae_prompt(note, list_ici_label, list_irae_label))

        try:
            llm_response = llm.chat.completions.create(model = GPT_DEPLOYMENT,
                                                       temperature=0.0, max_tokens=1000, n = 1,
                                                       frequency_penalty=0, presence_penalty=0, seed = 13,     
                                                       #top_p=1, ## reco: alter this param or temp but not both https://platform.openai.com/docs/api-reference/chat/create            
                                                       messages = irae_prompt(note, list_ici_label, list_irae_label))
            #print(llm_response.choices[0].message.content.strip())
            list_patient_note_llmresponse_yn.append(llm_response.choices[0].message.content.strip())
        except Exception as e:
            # Best effort: a failed call is recorded as an "LLMException: ..." placeholder so the
            # response list stays aligned with the file-name list; the run continues.
            message = str(e).strip().replace('\n', ' ')
            print("LLMException: "+message)
            list_patient_note_llmresponse_yn.append("LLMException: "+message)
            exception_list.append(f"LLMException: [{patient_note_fname}] {message}")

    # write NoteFName - llm-responses in yes/no JSON format
    df_notes_fname_llm_yn = pd.DataFrame({'FileName' : list_patient_note_fname, 'LLM_yn' : list_patient_note_llmresponse_yn})
    df_notes_fname_llm_yn.to_csv(f"{path_eval}OUT.{patient_counter}.{patient_id}.note-level-llm-yn.B.{BATCH}.csv", index=False)

    # convert the llm yes/no llm responses into binary responses        
    # (the returned frames only contain the notes whose answer could be converted)
    patient_df_filter_notefname, patient_df_filter_llmresponse_01 = convert_list_llmresponse_yn_to_01(list_patient_note_fname, list_patient_note_llmresponse_yn, list_irae_label, nlp)
    df_notes_fname_llm_01 = pd.concat([patient_df_filter_notefname, patient_df_filter_llmresponse_01], axis=1)    
    df_notes_fname_llm_01.to_csv(f"{path_eval}OUT.{patient_counter}.{patient_id}.note-level-llm-01.B.{BATCH}.csv", index=False)

    # collapse the binary llm responses from note- to patient-level
    #
    # naming, relative to the single-threshold run:
    # patient_onerow_df_llmresponse_01 >> list_patient_onerow_df_llmresponse01_multi_th
    # patient_df_llmresponse_01 >> list_patient_df_llmresponse01_multi_th
    #
    list_patient_onerow_df_llmresponse01_multi_th = collapse_llresponses01_notes_to_patient_multi_th(patient_df_filter_llmresponse_01)

    for threshold in range(IRAE_LABEL_COUNT_MAX):
        list_patient_onerow_df_llmresponse01_multi_th[threshold].to_csv(f"{path_eval}OUT.{patient_counter}.{patient_id}.patient-level-llm-01.TH.{threshold}.B.{BATCH}.csv", index=False)

    # append patient llm response
    for threshold in range(IRAE_LABEL_COUNT_MAX):
        list_patient_df_llmresponse01_multi_th[threshold] = pd.concat([list_patient_df_llmresponse01_multi_th[threshold], list_patient_onerow_df_llmresponse01_multi_th[threshold]], ignore_index=True)
    
    # write the current llm responses (all patients processed so far, GRID column first)
    current_df_patient_id = pd.DataFrame(list_patient_id, columns=['GRID'])
    
    list_current_patient_df_llmresponse01_multi_th = []
    for threshold in range(IRAE_LABEL_COUNT_MAX):
        list_current_patient_df_llmresponse01_multi_th.append(pd.concat([current_df_patient_id, list_patient_df_llmresponse01_multi_th[threshold]], axis=1))
    
    for threshold in range(IRAE_LABEL_COUNT_MAX):
        list_current_patient_df_llmresponse01_multi_th[threshold].to_csv(f"{path_eval}IRAE-LABELS.CRT.{patient_counter}.LLM.patient-level.TH.{threshold}.B.{BATCH}.{GPT_DEPLOYMENT}.csv", index=False)

    # write the current gold/manual responses
    current_df_manual_GRID_IRAE = pd.merge(current_df_patient_id, df_manual_GRID_IRAE, on='GRID', how='inner')
    current_df_manual_GRID_IRAE.to_csv(f"{path_eval}IRAE-LABELS.CRT.{patient_counter}.GOLD.patient-level.B.{BATCH}.{GPT_DEPLOYMENT}.csv", index=False)

    # sanity check: gold and LLM frames must list the same GRIDs row by row before they are compared
    for threshold in range(IRAE_LABEL_COUNT_MAX):
        if (current_df_manual_GRID_IRAE['GRID'] != list_current_patient_df_llmresponse01_multi_th[threshold]['GRID']).any() :
            raise Exception(f"GRID columns are not equal patient counter({patient_counter}) threshold({threshold})")


    # current eval
    current_df_y_gold = current_df_manual_GRID_IRAE[list_irae_label].to_numpy().astype(int)

    for threshold in range(IRAE_LABEL_COUNT_MAX):
        current_df_y_llm = list_current_patient_df_llmresponse01_multi_th[threshold][list_irae_label].to_numpy().astype(int)

        # classif report
        current_clf_report = classification_report(current_df_y_gold, current_df_y_llm, target_names=list_irae_label, zero_division=0, output_dict=True)
        current_df_report = pd.DataFrame(current_clf_report).transpose()
        current_df_report.to_csv(f"{path_eval}EVAL.CRT.{patient_counter}.CLF-REPORT.TH.{threshold}.B.{BATCH}.{GPT_DEPLOYMENT}.csv", index=True)

        # write detailed eval
        current_df_irae_eval = IRAEUtils.irae_eval(current_df_y_gold, current_df_y_llm, list_irae_label)
        current_df_irae_eval.to_csv(f"{path_eval}EVAL.CRT.{patient_counter}.DETAILED-REPORT.TH.{threshold}.B.{BATCH}.{GPT_DEPLOYMENT}.csv", index=False)

# Final patient-level LLM responses (1/0): one frame per threshold, GRID column first
df_grid = df_irae_data[['GRID']]

list_df_llm_data_multi_th = [
    pd.concat([df_grid, list_patient_df_llmresponse01_multi_th[threshold]], axis=1)
    for threshold in range(IRAE_LABEL_COUNT_MAX)
]

for threshold in range(IRAE_LABEL_COUNT_MAX):
    list_df_llm_data_multi_th[threshold].to_csv(
        f"{path_eval}IRAE-LABELS.FINAL.Y_LLM.TH.{threshold}.B.{BATCH}.patient-level.{GPT_DEPLOYMENT}.csv",
        index=False,
    )


# Final gold irAE labels at patient level
df_manual_GRID_IRAE.to_csv(
    f"{path_eval}IRAE-LABELS.FINAL.Y_GOLD.B.{BATCH}.patient-level.{GPT_DEPLOYMENT}.csv",
    index=False,
)

# Final classification report and detailed evaluation, one pair of files per threshold
df_y_gold = df_irae_data[list_irae_label].to_numpy().astype(int)

for threshold in range(IRAE_LABEL_COUNT_MAX):
    df_y_llm = list_df_llm_data_multi_th[threshold][list_irae_label].to_numpy().astype(int)

    final_clf_report = classification_report(
        df_y_gold, df_y_llm, target_names=list_irae_label, zero_division=0, output_dict=True
    )
    final_df_report = pd.DataFrame(final_clf_report).T
    final_df_report.to_csv(
        f"{path_eval}_FINAL-EVAL.CLF-REPORT.TH.{threshold}.B.{BATCH}.{GPT_DEPLOYMENT}.csv",
        index=True,
    )

    final_df_irae_eval = IRAEUtils.irae_eval(df_y_gold, df_y_llm, list_irae_label)
    final_df_irae_eval.to_csv(
        f"{path_eval}_FINAL-EVAL.CRT.DETAILED-REPORT.TH.{threshold}.B.{BATCH}.{GPT_DEPLOYMENT}.csv",
        index=False,
    )

## Run LLMs - single threshold

In [None]:
# Patient-level run for the single threshold IRAE_LABEL_COUNT.
# For every patient: query the LLM once per note, persist the raw yes/no answers,
# convert them to 1/0, collapse them to one patient-level row, and write running
# ("CRT") label files and evaluation reports covering all patients processed so far.
#
# NOTE(review): every to_csv below writes under path_eval; nothing in this cell
# creates that directory.
exception_list = []   # one entry per failed LLM call
list_patient_id = []  # GRIDs of the patients processed so far, in processing order
patient_counter = 0
# accumulating patient-level frame: one row per processed patient
patient_df_llmresponse_01 = pd.DataFrame(columns=list_irae_label)
for patient_id, patient_folder in zip (df_irae_data['GRID'], df_irae_data['FolderName']) :
    list_patient_id.append(patient_id)
    patient_counter += 1
    print(f"{patient_counter} - {patient_id} - {patient_folder}")
    list_patient_note_llmresponse_yn = []
    list_patient_note_fname = IRAEUtils.file_names(notes_path + patient_folder)
    for patient_note_fname in  list_patient_note_fname:
        # progress indicator: one dot per note
        print('.', end='', flush=True)
        #print("Patient note:" + patient_note_fname)
        note = IRAEUtils.read(notes_path + patient_folder + "/" + patient_note_fname)
        #print(irae_prompt(note, list_ici_label, list_irae_label))

        try:
            llm_response = llm.chat.completions.create(model = GPT_DEPLOYMENT,
                                                       temperature=0.0, max_tokens=1000, n = 1,
                                                       frequency_penalty=0, presence_penalty=0, seed = 13,     
                                                       #top_p=1, ## reco: alter this param or temp but not both https://platform.openai.com/docs/api-reference/chat/create            
                                                       messages = irae_prompt(note, list_ici_label, list_irae_label))
            #print(llm_response.choices[0].message.content.strip())
            list_patient_note_llmresponse_yn.append(llm_response.choices[0].message.content.strip())
        except Exception as e:
            # Best effort: a failed call is recorded as an "LLMException: ..." placeholder so the
            # response list stays aligned with the file-name list; the run continues.
            message = str(e).strip().replace('\n', ' ')
            print("LLMException: "+message)
            list_patient_note_llmresponse_yn.append("LLMException: "+message)
            exception_list.append(f"LLMException: [{patient_note_fname}] {message}")

    # write NoteFName - llm-responses in yes/no JSON format
    df_notes_fname_llm_yn = pd.DataFrame({'FileName' : list_patient_note_fname, 'LLM_yn' : list_patient_note_llmresponse_yn})
    df_notes_fname_llm_yn.to_csv(f"{path_eval}OUT.{patient_counter}.{patient_id}.note-level-llm-yn.TH.{IRAE_LABEL_COUNT}.B.{BATCH}.csv", index=False)

    # convert the llm yes/no llm responses into binary responses        
    # (the returned frames only contain the notes whose answer could be converted)
    patient_df_filter_notefname, patient_df_filter_llmresponse_01 = convert_list_llmresponse_yn_to_01(list_patient_note_fname, list_patient_note_llmresponse_yn, list_irae_label, nlp)
    df_notes_fname_llm_01 = pd.concat([patient_df_filter_notefname, patient_df_filter_llmresponse_01], axis=1)    
    df_notes_fname_llm_01.to_csv(f"{path_eval}OUT.{patient_counter}.{patient_id}.note-level-llm-01.TH.{IRAE_LABEL_COUNT}.B.{BATCH}.csv", index=False)

    # collapse the binary llm responses from note- to patient-level
    patient_onerow_df_llmresponse_01 = collapse_llresponses01_notes_to_patient(patient_df_filter_llmresponse_01)
    patient_onerow_df_llmresponse_01.to_csv(f"{path_eval}OUT.{patient_counter}.{patient_id}.patient-level-llm-01.TH.{IRAE_LABEL_COUNT}.B.{BATCH}.csv", index=False)

    # append patient llm response
    patient_df_llmresponse_01 = pd.concat([patient_df_llmresponse_01, patient_onerow_df_llmresponse_01], ignore_index=True)

    # write the current llm responses (all patients processed so far, GRID column first)
    current_df_patient_id = pd.DataFrame(list_patient_id, columns=['GRID'])
    current_patient_df_llmresponse_01 = pd.concat([current_df_patient_id, patient_df_llmresponse_01], axis=1)
    current_patient_df_llmresponse_01.to_csv(f"{path_eval}IRAE-LABELS.CRT.{patient_counter}.LLM.patient-level.TH.{IRAE_LABEL_COUNT}.B.{BATCH}.{GPT_DEPLOYMENT}.csv", index=False)

    # write the current gold/manual responses
    current_df_manual_GRID_IRAE = pd.merge(current_df_patient_id, df_manual_GRID_IRAE, on='GRID', how='inner')
    current_df_manual_GRID_IRAE.to_csv(f"{path_eval}IRAE-LABELS.CRT.{patient_counter}.GOLD.patient-level.TH.{IRAE_LABEL_COUNT}.B.{BATCH}.{GPT_DEPLOYMENT}.csv", index=False)

    # sanity check: gold and LLM frames must list the same GRIDs row by row before they are compared
    if (current_df_manual_GRID_IRAE['GRID'] != current_patient_df_llmresponse_01['GRID']).any() :
        raise Exception(f"GRID columns are not equal ({patient_counter})")

    # current data for eval
    current_df_y_gold = current_df_manual_GRID_IRAE[list_irae_label].to_numpy().astype(int)
    current_df_y_llm = current_patient_df_llmresponse_01[list_irae_label].to_numpy().astype(int)

    # write classif report
    current_clf_report = classification_report(current_df_y_gold, current_df_y_llm, target_names=list_irae_label, zero_division=0, output_dict=True)
    current_df_report = pd.DataFrame(current_clf_report).transpose()
    current_df_report.to_csv(f"{path_eval}EVAL.CRT.{patient_counter}.CLF-REPORT.TH.{IRAE_LABEL_COUNT}.B.{BATCH}.{GPT_DEPLOYMENT}.csv", index=True)

    # write detailed eval
    current_df_irae_eval = IRAEUtils.irae_eval(current_df_y_gold, current_df_y_llm, list_irae_label)
    current_df_irae_eval.to_csv(f"{path_eval}EVAL.CRT.{patient_counter}.DETAILED-REPORT.TH.{IRAE_LABEL_COUNT}.B.{BATCH}.{GPT_DEPLOYMENT}.csv", index=False)

# Final patient-level LLM responses (1/0), GRID column first
df_grid = df_irae_data[['GRID']]
df_llm_data = pd.concat([df_grid, patient_df_llmresponse_01], axis=1)
df_llm_data.to_csv(
    f"{path_eval}IRAE-LABELS.FINAL.Y_LLM.TH.{IRAE_LABEL_COUNT}.B.{BATCH}.patient-level.{GPT_DEPLOYMENT}.csv",
    index=False,
)

# Final gold irAE labels at patient level
df_manual_GRID_IRAE.to_csv(
    f"{path_eval}IRAE-LABELS.FINAL.Y_GOLD.TH.{IRAE_LABEL_COUNT}.B.{BATCH}.patient-level.{GPT_DEPLOYMENT}.csv",
    index=False,
)

# Final classification report and detailed evaluation
df_y_gold = df_irae_data[list_irae_label].to_numpy().astype(int)
df_y_llm = df_llm_data[list_irae_label].to_numpy().astype(int)

final_clf_report = classification_report(
    df_y_gold, df_y_llm, target_names=list_irae_label, zero_division=0, output_dict=True
)
final_df_report = pd.DataFrame(final_clf_report).T
final_df_report.to_csv(
    f"{path_eval}_FINAL-EVAL.CLF-REPORT.TH.{IRAE_LABEL_COUNT}.B.{BATCH}.{GPT_DEPLOYMENT}.csv",
    index=True,
)

final_df_irae_eval = IRAEUtils.irae_eval(df_y_gold, df_y_llm, list_irae_label)
final_df_irae_eval.to_csv(
    f"{path_eval}_FINAL-EVAL.CRT.DETAILED-REPORT.TH.{IRAE_LABEL_COUNT}.B.{BATCH}.{GPT_DEPLOYMENT}.csv",
    index=False,
)

In [None]:
# Recompute and print the detailed evaluation for the most recent running state.
# NOTE(review): current_df_y_gold / current_df_y_llm are leftovers from whichever run
# loop was executed last, so this cell fails on a fresh kernel unless one of them ran.
current_df_irae_eval = IRAEUtils.irae_eval(current_df_y_gold, current_df_y_llm, list_irae_label)
print(current_df_irae_eval)