In [None]:
import os

from openai import AzureOpenAI                      

import pandas as pd
import numpy as np
import sys, time, random, math, os, json, spacy, re
import matplotlib.pyplot as plt
import IRAEUtils

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, make_scorer, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

In [None]:
# Load IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Autoreload mode 2.
%autoreload 2
# Reload the extension (relevant when this cell is re-run in a live kernel).
%reload_ext autoreload

# Do not truncate columns or cell contents when DataFrames are displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Uncomment to also lift the limit on displayed rows.
#pd.set_option('display.max_rows', None)

## Config params

In [None]:
## config params
##
# Deployment whose predictions are evaluated; it also names the evaluation
# sub-folder (see path_eval). Alternative deployments are kept for reference.
#GPT_DEPLOYMENT = "OAI05-GPT35Turbo16-0613_061823"
#GPT_DEPLOYMENT = "V04-GPT4Turbo-2024-04-09"
GPT_DEPLOYMENT = "V05-GPT4o"

# Number of thresholds (0 .. IRAE_LABEL_COUNT_MAX - 1) evaluated when
# note-level predictions are collapsed to patient level.
IRAE_LABEL_COUNT_MAX = 101

# Annotation batches that make up the evaluated cohort (load order matters:
# it defines the row order of the concatenated gold table).
list_batch = ['70', '71', '72', '73', '01', '02', '03', '04', '07', '08']

## paths
##
# Fail with an explicit message instead of an opaque
# "unsupported operand type(s) for +: 'NoneType' and 'str'".
_vu_proj_path = os.getenv("VU_PROJ_PATH")
if _vu_proj_path is None:
    raise RuntimeError("Environment variable VU_PROJ_PATH is not set; it must point to the project root directory.")

# NOTE(review): plain concatenation is kept, so VU_PROJ_PATH is presumably
# expected to end with a path separator -- confirm.
project_path = _vu_proj_path + "immunotoxicity/"
notes_path = project_path + "out/llm/notes/" 
irae_anno_map_path = f"{project_path}out/llm/notes/batches/frozen/"
path_eval = f"{project_path}out/llm/eval-patient-level/{GPT_DEPLOYMENT}/"

## Load resources: maps & manual annotations (df_gold_full)

In [None]:
## maps: IRAE labels: full | norm -> large categories
##
dict_map_full2norm = IRAEUtils.read2cols_2dict(f"{project_path}in/data4llm/IRAE.labels.reverse.csv", 1, 2, ",", True)
dict_map_norm2large = IRAEUtils.read2cols_2dict(f"{project_path}out/llm/eval-patient-level/map-specific2generic/IRAE.map.refined-large.03.edits.csv", 0, 1, ",", True)

# Compose the two maps (full -> norm -> large); the 'None' label has no
# large category and is skipped. A norm label missing from
# dict_map_norm2large raises KeyError.
dict_map_full2large = {
    label_full: dict_map_norm2large[label_norm]
    for label_full, label_norm in dict_map_full2norm.items()
    if label_full != 'None'
}

# Distinct large categories, sorted so the column order is reproducible
list_coll_irae_large = IRAEUtils.read1col_2list_skip1(f"{project_path}out/llm/eval-patient-level/map-specific2generic/IRAE.map.refined-large.03.edits.csv", 1, ",")
set_irae_large = set(list_coll_irae_large)
sorted_list_irae_large = sorted(set_irae_large)

# Debug helpers
#print(f"dict_map_full2norm: {len(dict_map_full2norm)} dict_map_norm2large: {len(dict_map_norm2large)}")
#print(f"dict_map_full2large: {dict_map_full2large}")

## Load manual annotations + patient characteristics (concatenate batches)
##
## Header: 'GRID', 'PID', 'AgeFirstICI', 'RaceEth', 'Gender', 'ICIType', 'CancerType', 'FolderName', [irAEs - full labels]
# Read every batch and concatenate once, instead of growing the frame batch by batch.
df_gold_full = pd.concat(
    [pd.read_csv(f"{irae_anno_map_path}Map.person-level.Batch.{batch}.irAE.csv") for batch in list_batch],
    ignore_index=True,
)
#display(df_gold_full)

## Build irAE full list
##
# The first 8 columns are the patient characteristics listed in the header
# comment above; everything after them is an irAE label column.
list_irae_full = df_gold_full.columns.tolist()[8:]

## Build irAE full filter list (exclude null irAEs)
##
# Keep only the labels annotated for at least one patient.
filter_list_irae_full = [irae_full for irae_full in list_irae_full if df_gold_full[irae_full].sum() > 0]

print(f"list_irae_full:{len(list_irae_full)} -- filter_list_irae_full:{len(filter_list_irae_full)}")

## Convert full to large irAEs

In [None]:
## init row dictionaries for converting specific to large irAE categs
## 
def init_dict_large_irae(list_irae_large) : 
    """Return a new dict mapping every label in `list_irae_large` to 0.

    A fresh dict is built on each call, so callers may mutate the result.
    """
    return {irae_large: 0 for irae_large in list_irae_large}

## convert specific to large irAE patient dataframe
## 
def convert_specific_large(df_irae_full, dict_map_full2large, list_irae_full, list_irae_large) :
    """Convert a patient x specific-irAE 0/1 table into a patient x large-category table.

    For every row of `df_irae_full`, a large category is set to 1 when at least
    one label in `list_irae_full` that maps to it (via `dict_map_full2large`)
    equals 1 in that row; all other categories in `list_irae_large` are 0.

    Parameters:
        df_irae_full: DataFrame holding (at least) the columns in `list_irae_full`.
        dict_map_full2large: dict, specific irAE label -> large category.
        list_irae_full: specific irAE columns to inspect.
        list_irae_large: large categories used as output columns (in this order).

    Returns:
        DataFrame with one row per input row and a fresh 0..n-1 index.

    Raises:
        KeyError: if a label flagged with 1 is missing from `dict_map_full2large`.
    """
    # Collect one record per patient and build the frame once: growing a
    # DataFrame with pd.concat inside the loop is quadratic and concatenating
    # onto an empty frame triggers a pandas FutureWarning.
    list_dict_rows = []
    for _, row in df_irae_full.iterrows():
        dict_row = dict.fromkeys(list_irae_large, 0)
        for irae_full in list_irae_full :
            if row[irae_full] == 1 :
                dict_row[dict_map_full2large[irae_full]] = 1
        list_dict_rows.append(dict_row)

    # No input rows: keep the expected columns on the empty result
    if not list_dict_rows :
        return pd.DataFrame(columns = list_irae_large)

    return pd.DataFrame(list_dict_rows)

## Gold standard at the large-category level, derived from the full-label gold table
##
df_gold_large = convert_specific_large(
    df_gold_full[list_irae_full],
    dict_map_full2large,
    list_irae_full,
    sorted_list_irae_large,
)
#display(df_gold_large)

## Large categories carrying at least one gold label (null categories are dropped)
##
filter_list_irae_large = [
    irae_large
    for irae_large in sorted_list_irae_large
    if df_gold_large[irae_large].sum() > 0
]

print(f"sorted_list_irae_large:{len(sorted_list_irae_large)} -- filter_list_irae_large:{len(filter_list_irae_large)}")
# Large categories that never occur in the gold annotations
print(set(sorted_list_irae_large).difference(filter_list_irae_large))

## Table 1: Dataset characteristics

In [None]:
## Extract: number of notes/patients/notes per patient
##
def dataset_size() :
    """Return the total number of entries reported by IRAEUtils.file_names
    over the folders listed in df_gold_full['FolderName'] (located under notes_path)."""
    return sum(
        len(IRAEUtils.file_names(notes_path + "/" + folder_name))
        for folder_name in df_gold_full['FolderName'].tolist()
    )

# Cohort size: notes, patients and average number of notes per patient
total_notes = dataset_size()
total_patients = len(df_gold_full)

print(f"Total patients = {total_patients}")
print(f"Total notes = {total_notes}")
print(f"Notes per patient = {total_notes/total_patients}")

# Age at first ICI
print(f"Age at first ICI, mean={df_gold_full['AgeFirstICI'].mean()} std={df_gold_full['AgeFirstICI'].std()}")

# Count/percentage tables for the categorical patient characteristics
for characteristic in ('Gender', 'RaceEth', 'CancerType', 'ICIType'):
    display(IRAEUtils.df_count_perc2(df_gold_full, characteristic))

# Patients with no irAEs (ie, with 'None' label): rows whose large-category labels sum to 0
series_irae_large_count = df_gold_large[sorted_list_irae_large].sum(axis=1)
none_count = sum(1 for irae_count in series_irae_large_count if irae_count == 0)

# Single-row frame appended to both count tables below
none_row = pd.DataFrame({'Count': none_count}, index=['None'])

# irAE full counts distribution (labels with at least one gold annotation + 'None')
irae_full_sums = df_gold_full[filter_list_irae_full].sum(axis=0)
irae_full_sums_df = pd.concat(
    [pd.DataFrame(irae_full_sums, columns=['Count']), none_row],
    ignore_index=False,
)
irae_full_sums_df['Percentage'] = (irae_full_sums_df['Count'] / total_patients) * 100
display(irae_full_sums_df)
irae_full_sums_df.to_csv(project_path + "/out/llm/notes/batches/IRAE.counts.patient-level.cohort-patient-subset.csv")

# irAE large counts distribution (all large categories + 'None')
irae_large_sums = df_gold_large[sorted_list_irae_large].sum(axis=0)
irae_large_sums_df = pd.concat(
    [pd.DataFrame(irae_large_sums, columns=['Count']), none_row],
    ignore_index=False,
)
irae_large_sums_df['Percentage'] = (irae_large_sums_df['Count'] / total_patients) * 100

# Column totals, appended as a final row
count_sum = irae_large_sums_df['Count'].sum()
perc_sum = irae_large_sums_df['Percentage'].sum()
row = pd.DataFrame({'Count': count_sum, 'Percentage':perc_sum}, index=['irAE large Total'])
irae_large_sums_df = pd.concat([irae_large_sums_df, row], ignore_index=False)

display(irae_large_sums_df)

# Full -> large map as a frame sorted by full label (kept for inspection)
df_map_full2large = pd.DataFrame.from_dict(dict_map_full2large, orient='index')
df_sorted_map_full2large = df_map_full2large.sort_index()
#display(df_sorted_map_full2large)

# Categ - irAE map: one row per large category with the full labels mapped to it
dict_large2list_full = {}
for irae_full, irae_large in dict_map_full2large.items():
    dict_large2list_full.setdefault(irae_large, []).append(irae_full)

rows = [{'value': v, 'keys': dict_large2list_full[v]} for v in set(dict_map_full2large.values())]
df = pd.DataFrame(rows)
df = df.sort_values(by='value')
display(df)

## Conversion from note-level llm predictions to patient-level 

In [None]:
## generalization of collapse_llresponses01_notes_to_patient for multiple threshold values
## in: df with note level llm predictions
## out: list where each element is a df with patient-level llm predictions for a given threshold
def collapse_llresponses01_notes_to_patient_multi_th(patient_multirow_df_llmresponse_01, n_thresholds=None) :
    """Collapse one patient's note-level predictions to patient level, once per threshold.

    For each threshold t in 0 .. n_thresholds - 1, a column (label) is set to 1
    when the sum of that column over all rows (notes) is strictly greater than
    t, and to 0 otherwise.

    Parameters:
        patient_multirow_df_llmresponse_01: DataFrame with one row per note and
            one column per label (values are summed per column).
        n_thresholds: number of thresholds to evaluate; when None (default) the
            module-level IRAE_LABEL_COUNT_MAX is used, as before.

    Returns:
        list of n_thresholds one-row DataFrames; element t holds the
        patient-level 0/1 predictions for threshold t.
    """
    if n_thresholds is None :
        n_thresholds = IRAE_LABEL_COUNT_MAX

    # Per-label note counts; computed once and shared by all thresholds
    dict_column_sum = {
        colname: patient_multirow_df_llmresponse_01[colname].sum()
        for colname in patient_multirow_df_llmresponse_01.columns
    }

    # One single-row frame per threshold
    return [
        pd.DataFrame([{
            colname: 1 if column_sum > threshold else 0
            for colname, column_sum in dict_column_sum.items()
        }])
        for threshold in range(n_thresholds)
    ]

# collect llm predictions at note level
list_all_notelevel_files = []
for batch in list_batch :
    pattern = fr'^.*\.note-level-llm-01\.B\.{batch}\.csv$'
    notelevel_files = [(batch, f) for f in os.listdir(f"{path_eval}final-batches/batch-{batch}/") if re.match(pattern, f)] # Extract all file names that match the regex pattern
    list_all_notelevel_files.extend(notelevel_files)
#print(list_all_notelevel_files)

# build dict: grid - file_name (note_level)
dict_grid_notelevelfile = dict()
for batch, file_name in list_all_notelevel_files:
    toks = file_name.split(".")
    grid = toks[2] 
    dict_grid_notelevelfile[grid] = (batch, file_name)
#print(dict_grid_notelevelfile)
print(f"dict_grid_notelevelfile:{len(dict_grid_notelevelfile)}")

# compute llm predictions at patient level - build {grid - multi_llm_patient} dict 
# Note: account only for the irAEs in the gold dataset (ie, with at least one label) by using filter_list_irae_full
dict_grid_df_multi_patientlevel = dict()
for index, row in df_gold_full.iterrows():
    grid = row['GRID']
    batch, file_name = dict_grid_notelevelfile[grid]
    df_note_level = pd.read_csv(f"{path_eval}final-batches/batch-{batch}/{file_name}")    
    list_patient_onerow_df_llmresponse01_multi_th = collapse_llresponses01_notes_to_patient_multi_th(df_note_level[filter_list_irae_full])
    dict_grid_df_multi_patientlevel[grid] = list_patient_onerow_df_llmresponse01_multi_th

print(f"dict_grid_df_multi_patientlevel:{len(dict_grid_df_multi_patientlevel)}")

## Evaluation: filtered irAE full labels + all threshold values

In [None]:

# Gold labels as a (patients x irAEs) 0/1 matrix, rows in df_gold_full order
df_y_gold = df_gold_full[filter_list_irae_full].to_numpy().astype(int)

# Patient ids in gold order, so predictions are stacked in the same row order
list_gold_grid = df_gold_full['GRID'].tolist()

# One micro-average record per threshold. Records are collected in a list and
# turned into a frame once; the per-threshold prediction table is likewise
# built with a single concat instead of one concat per patient.
list_micro_rows = []
for threshold in range(IRAE_LABEL_COUNT_MAX):
    df_threshold = pd.concat(
        [dict_grid_df_multi_patientlevel[grid][threshold] for grid in list_gold_grid],
        ignore_index=True,
    )
    df_y_llm = df_threshold.to_numpy().astype(int)

    final_clf_report = classification_report(df_y_gold, df_y_llm, target_names = filter_list_irae_full, zero_division=0, output_dict=True)
    dict_row = {'TH' : threshold}
    dict_row.update(final_clf_report['micro avg'])
    list_micro_rows.append(dict_row)

df_micro_full = pd.DataFrame(list_micro_rows, columns=['TH', 'precision', 'recall', 'f1-score', 'support'])

display(df_micro_full)
df_micro_full.to_csv(f"{path_eval}PatientEval.micro.all_th.irAE-full-filter-{len(filter_list_irae_full)}.{GPT_DEPLOYMENT}.csv", index=False)

# Find the maximum F1 score and its corresponding threshold
# (idxmax returns the first row reaching the maximum)
max_f1_idx = df_micro_full['f1-score'].idxmax()
max_f1 = df_micro_full['f1-score'].max()
# Plain int: the threshold is later used to index the per-patient prediction lists
max_f1_threshold = int(df_micro_full.loc[max_f1_idx, 'TH'])

IRAEUtils.write(f"{path_eval}PatientEval.best-th.{GPT_DEPLOYMENT}.txt", f"max_f1_idx:{max_f1_idx} max_micro-f1:{max_f1} max_micro-f1_threshold:{max_f1_threshold}")

In [None]:
# Micro-averaged precision, recall and F1 of the GPT model as a function of the threshold

plt.figure(figsize=(10, 6))

# One curve per metric: (df_micro_full column, legend label)
for metric_col, metric_label in (
    ('precision', 'Micro-Precision'),
    ('recall', 'Micro-Recall'),
    ('f1-score', 'Micro-F1'),
):
    plt.plot(df_micro_full['TH'], df_micro_full[metric_col], marker='o', label=metric_label)

# Mark the best micro-F1 point on top of the curves
plt.scatter(max_f1_threshold, max_f1, color='red', s=100, label=f'Best Micro-F1 ({max_f1:.2f})', edgecolors='black', zorder=5)

# The title is hard-coded per deployment; alternatives used for other runs:
#plt.title(GPT_DEPLOYMENT)
#plt.title("GPT-3.5", fontsize=18)
#plt.title("GPT-4", fontsize=18)
plt.title("GPT-4o", fontsize=18)

# Axes, ticks, grid and legend
plt.xlabel('Threshold', fontsize=16)
plt.ylabel('Score', fontsize=16)
plt.xlim(0,100)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.grid(True)
plt.legend(fontsize=13)

# Save before show()
plt.savefig(f"{path_eval}PatientEval.score-trends.{GPT_DEPLOYMENT}.png", dpi=300)
plt.show()

print(f"max_f1_idx:{max_f1_idx} max_f1:{max_f1} max_f1_threshold:{max_f1_threshold}")

## Evaluation [full]: filtered irAE full labels + best threshold 

In [None]:
# Extract detailed results for a given threshold
# Gold labels as a (patients x irAEs) 0/1 matrix, rows in df_gold_full order
df_y_gold = df_gold_full[filter_list_irae_full].to_numpy().astype(int)

best_threshold = max_f1_threshold

# Stack the best-threshold single-row predictions of all patients (gold row
# order) with one concat, rather than concatenating patient by patient onto
# an empty frame.
df_llm_full_best_threshold = pd.concat(
    [dict_grid_df_multi_patientlevel[grid][best_threshold] for grid in df_gold_full['GRID'].tolist()],
    ignore_index=True,
)

df_y_llm = df_llm_full_best_threshold.to_numpy().astype(int)    

# Per-label and averaged precision/recall/F1 as a frame (one row per label or average)
final_clf_report = classification_report(df_y_gold, df_y_llm, target_names = filter_list_irae_full, zero_division=0, output_dict=True)
final_clf_report = pd.DataFrame(final_clf_report).transpose()
display(final_clf_report)

# Project-specific detailed evaluation table
final_df_irae_eval = IRAEUtils.irae_eval(df_y_gold, df_y_llm, filter_list_irae_full)
display(final_df_irae_eval)

final_clf_report.to_csv(f"{path_eval}PATIENT-EVAL.FULL.CLF-REPORT.{GPT_DEPLOYMENT}.csv", index=True)
final_df_irae_eval.to_csv(f"{path_eval}PATIENT-EVAL.FULL.DETAILED-REPORT.{GPT_DEPLOYMENT}.csv", index=False)

## Evaluation [large]: filtered irAE large labels + best threshold 

In [None]:
# Map the best-threshold LLM predictions from full labels to large categories
df_llm_large_best_threshold = convert_specific_large(
    df_llm_full_best_threshold,
    dict_map_full2large,
    filter_list_irae_full,
    filter_list_irae_large,
)

# Gold and predicted 0/1 matrices restricted to the large categories present in gold
df_y_gold = df_gold_large[filter_list_irae_large].to_numpy().astype(int)
df_y_llm = df_llm_large_best_threshold[filter_list_irae_large].to_numpy().astype(int)

# Classification report as a frame (one row per category or average)
final_clf_report = pd.DataFrame(
    classification_report(df_y_gold, df_y_llm, target_names = filter_list_irae_large, zero_division=0, output_dict=True)
).transpose()
display(final_clf_report)

# Project-specific detailed evaluation table
final_df_irae_eval = IRAEUtils.irae_eval(df_y_gold, df_y_llm, filter_list_irae_large)
display(final_df_irae_eval)

# Persist both reports next to the other evaluation outputs
final_clf_report.to_csv(f"{path_eval}PATIENT-EVAL-LARGE.CLF-REPORT.{GPT_DEPLOYMENT}.csv", index=True)
final_df_irae_eval.to_csv(f"{path_eval}PATIENT-EVAL-LARGE.DETAILED-REPORT.{GPT_DEPLOYMENT}.csv", index=False)