In [None]:
from openai import AzureOpenAI

import pandas as pd
import numpy as np
import os, json, spacy, math
import IRAEUtils

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import tiktoken

In [None]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before each
# cell runs, so edits to imported project modules are picked up without a kernel restart.
# NOTE(review): %reload_ext right after %load_ext looks redundant - confirm it is needed.
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

# Never truncate columns or cell contents when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Config parameters

In [None]:
# Azure OpenAI deployment under evaluation (alternatives kept for reference).
#GPT_DEPLOYMENT = "OAI05-GPT35Turbo16-0613_061823"
#GPT_DEPLOYMENT = "V04-GPT4Turbo-2024-04-09"
GPT_DEPLOYMENT = "V05-GPT4o"

# Fail fast with a readable message instead of "unsupported operand type(s) for +: 'NoneType' and 'str'".
_vu_proj_root = os.getenv("VU_PROJ_PATH")
if _vu_proj_root is None:
    raise EnvironmentError("Environment variable VU_PROJ_PATH is not set; it must point to the project root directory.")

# The trailing "" makes os.path.join end the path with exactly one separator,
# whether or not VU_PROJ_PATH itself ends with one; the paths below rely on that.
project_path = os.path.join(_vu_proj_root, "immunotoxicity", "")
notes_path = project_path + "in/data4llm/"

annotations_path = project_path  +"in/data4llm/Map.File-irAELabels.csv"
irae_synsets_path = project_path + "map_irae_prompt_detail.csv"
# Evaluation outputs are stored per deployment.
path_eval = f"{project_path}out/llm/eval-note-level/{GPT_DEPLOYMENT}/"

# Specific ("full") irAE labels that are sent to the LLM and evaluated.
filter_list_irae_full = ['Neuropathy', 'Hypothyroid', 'Myasthenia gravis (MG)', 'Rash', 'Colitis', 'Adrenal insufficiency', 'Hepatitis', 'Arthralgia', 'Duodenitis', 'Pancreatitis', 'Hypophysitis', 'Mucositis', 'Arthritis', 'Pneumonitis', 'Joint pain', 'Fever', 'Myalgia'] 
list_fname_irae_full = ['FileName'] + filter_list_irae_full

# Credentials and endpoint come from the environment; nothing secret is stored in the notebook.
llm = AzureOpenAI(
  api_key = os.getenv("AZURE_OPENAI05_API_KEY"),  
  api_version =  "2024-07-01-preview",
  azure_endpoint = os.getenv("AZURE_OPENAI05_ENDPOINT")
)

# spaCy pipeline handed to the IRAEUtils helpers that parse the LLM answers.
nlp = spacy.load('en_core_web_sm')

## Load resources: maps & manual annotations

In [None]:
## maps: IRAE labels: full -> norm and norm -> large categories
##
dict_map_full2norm = IRAEUtils.read2cols_2dict(f"{project_path}in/data4llm/IRAE.labels.reverse.csv", 1, 2, ",", True)
dict_map_norm2large = IRAEUtils.read2cols_2dict(f"{project_path}out/llm/eval-patient-level/map-specific2generic/IRAE.map.refined-large.03.edits.csv", 0, 1, ",", True)

# Compose the two maps into full -> large; the 'None' key is left out.
dict_map_full2large = {
    full_label: dict_map_norm2large[norm_label]
    for full_label, norm_label in dict_map_full2norm.items()
    if full_label != 'None'
}

# Distinct large categories (column 1 of the norm -> large file), in alphabetical order.
list_coll_irae_large = IRAEUtils.read1col_2list_skip1(f"{project_path}out/llm/eval-patient-level/map-specific2generic/IRAE.map.refined-large.03.edits.csv", 1, ",")
set_irae_large = set(list_coll_irae_large)
sorted_list_irae_large = list(set_irae_large)
sorted_list_irae_large.sort()

# [2] load df[notes & irAE annotations]
#
# The annotation table is parsed once with pandas and its FileName column drives
# the note loading. This keeps file names consistent with the merge key below
# (quoted fields and blank lines are handled by the CSV parser) and reads every
# note exactly once, so a repeated file name cannot multiply rows in the merge.
df_labels = pd.read_csv(annotations_path)

notelist = []
for note_file_name in df_labels['FileName'].dropna().drop_duplicates():
    #print(os.path.join(notes_path, str(note_file_name)))
    with open(os.path.join(notes_path, str(note_file_name)), 'r') as datafile:
        notelist.append({"FileName": note_file_name, "text": datafile.read()})
# Explicit columns keep the merge working even when no note was loaded.
df_notes = pd.DataFrame(notelist, columns=["FileName", "text"])

# One row per annotated note: label columns plus the note text.
df_gold_full = pd.merge(df_labels, df_notes, on='FileName')
# Small development subset (first 10 notes).
df10 = df_gold_full.head(10)
#df_gold_full = df_gold_full.head(10)

#display(df_gold_full)
#display(df_gold_full[list_fname_irae_full])
#display(df_gold_full[filter_list_irae_full])
#print(df_gold_full[filter_list_irae_full].sum(axis=0))
#print(df10[filter_list_irae_full].sum(axis=0))
print(f"Number of notes: {len(df_gold_full)}")

# [3] load irAE synset map (irAE label -> text substituted into the prompt questions)
#
list_irae_synsets =  IRAEUtils.read2cols_2list_all(irae_synsets_path, 0, 1, "|")

# Later entries overwrite earlier ones that share the same label.
dict_irae_synsets = {synset_pair[0]: synset_pair[1] for synset_pair in list_irae_synsets}
#print(dict_irae_synsets)

# [4] immune checkpoint inhibitors named in the system prompt (see prompt_specific_json)
#
ici_list = "atezolizumab (tecentriq, atezo), avelumab (bavencio), durvalumab (imfinzi), ipilimumab (yervoy), nivolumab (opdivo, nivo), pembrolizumab (keytruda, pembro)"

In [None]:
## irAE counts - note level
# Every column except the identifier/text columns is treated as an irAE label.
list_all_irae_full = [irae_label for irae_label in df_gold_full.columns if irae_label not in ['FileName', 'text', 'GRID']]
#df_gold_full['GRID'] = df_gold_full['FileName'].apply(lambda x: x.split('.')[3])

# Per-label totals, computed once and reused for both the export and the display.
irae_note_counts = df_gold_full[list_all_irae_full].sum()
# project_path already ends with a separator, so the relative part must not start
# with "/" (the previous form produced ".../immunotoxicity//out/...").
irae_note_counts.to_csv(project_path + "out/llm/notes/batches/IRAE.counts.note-level.cohort-note-subset.csv")

display(irae_note_counts)

# Labels that have at least one positive annotation in this note subset.
filter2_list_irae_full = [irae_full for irae_full in list_all_irae_full if df_gold_full[irae_full].sum() > 0]

print(f'Total irAE annotated notes: {len(df_gold_full)}')
print(list_all_irae_full)
# Printed side by side to compare the data-driven list with the configured one.
print(sorted(filter2_list_irae_full))
print(sorted(filter_list_irae_full))

#display(df_gold_full[['GRID'] + list_fname_irae_full])
#display([list_fname_irae_full])
#display(df_gold_full[list_fname_irae_full])

## Convert full to large irAEs

In [None]:
## Row template used when converting specific irAEs to large irAE categories
##
def init_dict_large_irae(list_irae_large) :
    """Return a new dict that maps every label in `list_irae_large` to 0.

    Key order follows the input; a repeated label yields a single key.
    """
    return dict.fromkeys(list_irae_large, 0)

## convert specific to large irAE patient dataframe
## 
def convert_specific_large(df_irae_full, dict_map_full2large, list_irae_full, list_irae_large) :
    """Collapse specific (full) irAE indicator columns into large-category indicators.

    Parameters:
        df_irae_full: DataFrame with one 0/1 column per label in `list_irae_full`.
        dict_map_full2large: maps each full label to its large category.
        list_irae_full: full labels (columns of `df_irae_full`) to inspect.
        list_irae_large: large categories that become the output columns.

    Returns:
        DataFrame with a fresh 0..n-1 index and one row per input row; a large
        category is 1 when at least one of its full labels equals 1, else 0.

    Raises:
        KeyError: if a positive full label has no entry in `dict_map_full2large`.

    The rows are collected in a list and the frame is built once. Growing a frame
    with pd.concat inside the loop copies it on every iteration and depends on
    concatenation with an empty frame, which also left the result with object dtype.
    """
    rows_large = []
    for _, row in df_irae_full.iterrows():
        # Every large category starts at 0 for this row.
        dict_row = dict.fromkeys(list_irae_large, 0)
        for irae_full in list_irae_full :
            if row[irae_full] == 1 :
                dict_row[dict_map_full2large[irae_full]] = 1
        rows_large.append(dict_row)

    if not rows_large :
        # Keep the expected columns for an empty input.
        return pd.DataFrame(columns = list_irae_large)
    return pd.DataFrame(rows_large)

## Gold standard at the large-category level, derived from the filtered full labels
##
df_gold_large = convert_specific_large(df_gold_full[filter_list_irae_full], dict_map_full2large, filter_list_irae_full, sorted_list_irae_large)
display(df_gold_large)

## Large categories that have at least one positive gold annotation
##
filter_list_irae_large = [
    irae_large
    for irae_large in sorted_list_irae_large
    if df_gold_large[irae_large].sum() > 0
]

print(f"sorted_list_irae_large:{len(sorted_list_irae_large)} -- filter_list_irae_large:{len(filter_list_irae_large)}")
# Categories dropped because no note is annotated with them.
print(set(sorted_list_irae_large).difference(filter_list_irae_large))

## IRAE prompt

In [None]:
def get_irae_list_json(irae_list, synsets=None):
    """Build the JSON-like answer template that is embedded in the system prompt.

    For every label in `irae_list` the template holds one line of the form
    `"<label>": Output 'Yes' if ... <synset text> ... Otherwise, output 'No'.,`
    and all lines are wrapped in `{` ... `}`. The text is an instruction for the
    model, not valid JSON.

    Parameters:
        irae_list: irAE labels, in the order they should appear.
        synsets: optional mapping label -> text inserted into the question.
            Defaults to the notebook-level `dict_irae_synsets`, so existing
            calls behave as before; passing it explicitly removes the
            dependency on global notebook state.

    Returns:
        The template string; `'{}'` for an empty `irae_list`.

    Raises:
        KeyError: if a label is missing from the synset mapping.
    """
    if synsets is None :
        synsets = dict_irae_synsets

    questions = [
        f'''\n"{irae_label}": Output 'Yes' if the patient has experienced {synsets[irae_label]} because of exposure to one or more immune checkpoint inhibitors. Otherwise, output 'No'.,'''
        for irae_label in irae_list
    ]
    return '{' + ''.join(questions) + '}'

# Preview the answer template that prompt_specific_json embeds in the system message.
print(get_irae_list_json(filter_list_irae_full))

def prompt_specific_json(note_text, ici_list, irae_list):
    """Build the chat messages asking the LLM which irAEs a patient note describes.

    Parameters:
        note_text: the patient note; interpolated into the user message.
        ici_list: text listing the immune checkpoint inhibitors; interpolated
            into the system message.
        irae_list: irAE labels for which a Yes/No answer is requested.

    Returns:
        A list with two dicts: a "system" message (task description plus the
        answer template from get_irae_list_json) and a "user" message (the
        question followed by the note).
    """
    # One Yes/No instruction per irAE label, wrapped in braces.
    irae_json_format = get_irae_list_json(irae_list)
    messages = []
    # The line breaks and indentation inside the triple-quoted strings are part
    # of the text sent to the model; keep them unchanged to reproduce results.
    messages.append({"role": "system", "content": f"""You are a clinical expert in identifying immune-related adverse events (irAEs) caused by immune checkpoint inhibitors (ICIs).                                           
                     You will receive as input a patient note corresponding to a patient who was treated or is currently treated with one or multiple immune checkpoint inhibitors (ICIs) from the following ICI list: {ici_list}. 
                     Your task is to determine if the patient note describes any of the immune-related adverse events (irAEs) experienced by the patient and caused by immune checkpoint inhibitors.
                     Output your response in a JSON format using the following structure: 
                     {irae_json_format}"""})
    messages.append({"role": "user", "content": f"""Does the following patient note describe immune-related adverse 
                     events experienced by the patient? 
                     Patient note: {note_text}"""})                     
    return messages

#print(prompt_specific_json(" .. test .. ", ici_list, filter_list_irae_full))

## Eval + error analysis

In [None]:
# Multi-label evaluation - error analysis
#df_irae_full_eval = IRAEUtils.irae_eval(y_all_filter, y_llmresponses_filter, filter_list_irae_full)
def _safe_ratio(numerator, denominator) :
    """Return numerator / denominator, or 0 when the denominator is 0."""
    if denominator == 0 :
        return 0
    return numerator / denominator


def _contingency_scores(tp, fp, fn, tn) :
    """Return (precision, recall, specificity, F1, accuracy) for one 2x2 contingency table."""
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    specificity = _safe_ratio(tn, tn + fp)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)
    accuracy = _safe_ratio(tp + tn, tp + fp + fn + tn)
    return precision, recall, specificity, f1, accuracy


def irae_eval_error(df_files, df_y_gold, df_y_llm, list_irae_label) :
    """Evaluate multi-label 0/1 predictions per irAE and collect the misclassified files.

    Parameters:
        df_files: sequence of file identifiers, aligned with the rows of the label matrices.
        df_y_gold: 2-D array (rows = notes, columns = labels in `list_irae_label`
            order) with the gold 0/1 labels.
        df_y_llm: 2-D array of the same shape with the LLM 0/1 labels.
        list_irae_label: label names, one per matrix column.

    Returns:
        (df_eval, dict_fp, dict_fn):
        df_eval has one row per label (TP, FP, FN, TN, Precision, Recall,
        Specificity, F1, Accuracy, Support), followed by an empty separator row,
        a 'micro avg' row and a 'macro avg' row. Both average rows carry the
        pooled counts; the macro row averages the per-label scores.
        dict_fp / dict_fn map each label to the files that are false positives /
        false negatives for it.

    Every ratio with a zero denominator is reported as 0.

    The counts are computed directly for the classes 0 and 1, so a label whose
    gold and LLM vectors contain a single class is still handled correctly.
    The earlier confusion_matrix-based version got a 1x1 matrix in that case,
    booked all samples as TN (even when all were positive) and left that label
    out of the micro-averaged totals.
    """
    print(f"Eval: df_file[{len(df_files)}] df_y_gold[{len(df_y_gold)}] df_y_llm[{len(df_y_llm)}]")

    y_gold = np.asarray(df_y_gold)
    y_llm = np.asarray(df_y_llm)

    dict_fp = {irae : [] for irae in list_irae_label} # key: irAE, vals: list of files
    dict_fn = {irae : [] for irae in list_irae_label} # key: irAE, vals: list of files

    eval_columns = ['irAE', 'TP', 'FP', 'FN', 'TN', 'Precision', 'Recall', 'Specificity', 'F1', 'Accuracy', 'Support']
    eval_rows = []

    # Pooled counts over all labels (micro average) and per-label scores (macro average).
    TP = FP = FN = TN = 0
    list_precision = []
    list_recall = []
    list_specificity = []
    list_f1 = []
    list_acc = []

    # build the contingency table for each irAE and compute evaluation measures
    for index, irae in enumerate(list_irae_label):
        y_gold_vector = y_gold[:, index].astype(int)
        y_llm_vector = y_llm[:, index].astype(int)

        tp = int(np.sum((y_gold_vector == 1) & (y_llm_vector == 1)))
        fp = int(np.sum((y_gold_vector == 0) & (y_llm_vector == 1)))
        fn = int(np.sum((y_gold_vector == 1) & (y_llm_vector == 0)))
        tn = int(np.sum((y_gold_vector == 0) & (y_llm_vector == 0)))

        # Remember which files were misclassified, for the manual error analysis.
        for file_name, gold_01response, llm_01response in zip(df_files, y_gold_vector, y_llm_vector) :
            if gold_01response == 0 and llm_01response == 1 :
                dict_fp[irae].append(file_name)
            elif gold_01response == 1 and llm_01response == 0 :
                dict_fn[irae].append(file_name)

        TP += tp
        FP += fp
        FN += fn
        TN += tn

        precision, recall, specificity, f1, accuracy = _contingency_scores(tp, fp, fn, tn)
        list_precision.append(precision)
        list_recall.append(recall)
        list_specificity.append(specificity)
        list_f1.append(f1)
        list_acc.append(accuracy)

        print(f"irAE[{irae}] TP[{tp}] FP[{fp}] FN[{fn}] TN[{tn}] ")
        # Same layout as a 2x2 confusion matrix: rows = gold 0/1, columns = LLM 0/1.
        print(np.array([[tn, fp], [fn, tp]]))

        eval_rows.append({'irAE' : irae, 'TP' : tp, 'FP' : fp, 'FN' : fn, 'TN' : tn, 'Precision' : precision, 'Recall' : recall, 'Specificity' : specificity, 'F1' : f1, 'Accuracy' : accuracy, 'Support' : tp + fn})

    # macro/micro averaged results
    Positives = TP + FN
    microPrecision, microRecall, microSpecificity, microF1, microAcc = _contingency_scores(TP, FP, FN, TN)

    def _macro(scores) :
        # Mean of the per-label scores; 0 when there are no labels at all.
        return np.mean(scores) if scores else 0

    # add empty row (visual separator between per-label rows and averages)
    eval_rows.append({column : '' for column in eval_columns})

    ## micro-average
    eval_rows.append({'irAE' : 'micro avg', 'TP': TP, 'FP' : FP, 'FN' : FN, 'TN' : TN, 'Precision' : microPrecision, 'Recall' : microRecall, 'Specificity' : microSpecificity, 'F1' : microF1, 'Accuracy' : microAcc, 'Support' : Positives})

    ## macro-average
    eval_rows.append({'irAE' : 'macro avg', 'TP': TP, 'FP' : FP, 'FN' : FN, 'TN' : TN, 'Precision' : _macro(list_precision), 'Recall' : _macro(list_recall), 'Specificity' : _macro(list_specificity), 'F1' : _macro(list_f1), 'Accuracy' : _macro(list_acc), 'Support' : Positives})

    # Build the frame once instead of concatenating row by row.
    df_eval = pd.DataFrame(eval_rows, columns=eval_columns)

    return df_eval, dict_fp, dict_fn



## LLM output processing

In [None]:
## Returns <Flag , list01> True if the LLM response is successfully parsed
##
def convert_yn_dict_to_01_list(dict_irae_reponse, irae_list, nlp) :
    """Convert the parsed LLM answer into a list of per-irAE codes.

    Parameters:
        dict_irae_reponse: parsed JSON answer, expected to map irAE label -> answer.
        irae_list: labels to read, in output order.
        nlp: spaCy pipeline forwarded to IRAEUtils.convert_llm_response_to_01.

    Returns:
        (ok, codes). `ok` is True when every label was converted. Conversion
        stops at the first label that is missing from the answer or whose
        answer converts to -1; `ok` is then False and `codes` holds the values
        gathered so far, ending with -1. An answer that is not a dict yields
        (False, []).

    A missing label or a non-object answer used to raise KeyError/TypeError and
    abort the whole evaluation; it is now reported as an invalid response like
    any other unparseable answer.
    """
    if not isinstance(dict_irae_reponse, dict) :
        return False, []

    response_01_list = []
    for irae_elem in irae_list :
        if irae_elem not in dict_irae_reponse :
            # The model left this label out: same outcome as an unparseable answer.
            response_01_list.append(-1)
            return False, response_01_list

        response_01_elem = IRAEUtils.convert_llm_response_to_01(dict_irae_reponse[irae_elem], nlp)
        #print(f"{irae_elem} : {dict_irae_reponse[irae_elem]} : {response_01_elem}")
        response_01_list.append(response_01_elem)
        if response_01_elem == -1 :
            return False, response_01_list
    
    return True, response_01_list

## Filter out data points corresponding to invalid LLM results 
##
def filter_invalid_llm_responses(files_gold, y_gold, y_llm_yn, irae_list, nlp) :
    """Keep only the notes whose LLM answer could be parsed for every irAE label.

    Parameters:
        files_gold: file identifiers, aligned with `y_gold` and `y_llm_yn`.
        y_gold: gold label rows, each with one value per label in `irae_list`.
        y_llm_yn: raw LLM answers as strings (optionally wrapped in a ```json fence).
        irae_list: irAE labels expected in every answer.
        nlp: spaCy pipeline forwarded to convert_yn_dict_to_01_list.

    Returns:
        (files, y_gold_kept, y_llm_kept): the kept file identifiers and two
        arrays with one row per kept note and len(irae_list) columns. With no
        kept note both arrays are float arrays of shape (0, len(irae_list)).

    An answer is dropped when it is not valid JSON, when the JSON is not an
    object, or when convert_yn_dict_to_01_list reports a failure.
    """
    #print(f"before: filter_invalid_llm_responses: {len(y_gold)} <> {len(y_llm_yn)}")

    files_gold_filter = []
    rows_y_gold = []
    rows_y_llm = []

    for row_file_gold, row_y_gold, llm_response_yn in zip(files_gold, y_gold, y_llm_yn) :
        # Strip the optional markdown code fence around the JSON payload.
        json_llm_response_yn = llm_response_yn.strip().removeprefix("```json").removesuffix("```").strip()
        if not IRAEUtils.is_json(json_llm_response_yn) :
            continue

        dict_irae_reponse = json.loads(json_llm_response_yn)
        if not isinstance(dict_irae_reponse, dict) :
            # Valid JSON but not an object (e.g. a list or a bare string): no per-label answers.
            continue

        flag, llm_response_01 = convert_yn_dict_to_01_list(dict_irae_reponse, irae_list, nlp)
        if flag :
            files_gold_filter.append(row_file_gold)
            rows_y_gold.append(row_y_gold)
            rows_y_llm.append(llm_response_01)

    # One stack per matrix instead of re-allocating inside the loop; the leading
    # empty block preserves the (0, n_labels) float result when nothing is kept.
    empty_block = np.empty((0, len(irae_list)))
    y_gold_filter = np.vstack([empty_block, *rows_y_gold])
    y_llm_filter = np.vstack([empty_block, *rows_y_llm])

    #print(f"after: filter_invalid_llm_responses: {len(y_gold_filter)} <> {len(y_llm_filter)}")
    return files_gold_filter, y_gold_filter, y_llm_filter

## Run LLMs

In [None]:
# Inputs for the run: note texts, gold label matrix and file names of the 10-note subset.
# X_all and F_all are selected as 1-D arrays so that every element is the plain
# string. With df10[["text"]].values each `note` was a 1-element ndarray, and the
# prompt then contained its printed form (brackets, quotes, escaped newlines)
# instead of the note text; file names in the error reports had the same problem.
# NOTE: this changes the text sent to the model, so results may differ from earlier runs.
X_all = df10["text"].values
y_all = df10[filter_list_irae_full].values
F_all = df10["FileName"].values

#X_all = df_gold_full["text"].values
#y_all = df_gold_full[filter_list_irae_full].values
#F_all = df_gold_full["FileName"].values

exception_list = []
y_llmresponses = []

# Run the LLM for each note in the dataset and collect its response
#for note in X_all:
for index, note in enumerate(X_all):
    try:
        llm_response = llm.chat.completions.create(model = GPT_DEPLOYMENT,
            temperature=0.0, max_tokens=500, n = 1,
            frequency_penalty=0, presence_penalty=0, seed = 13,     
            #top_p=1, ## reco: alter this param or temp but not both https://platform.openai.com/docs/api-reference/chat/create            
            #messages = prompt_func(note))
            messages = prompt_specific_json(note, ici_list, filter_list_irae_full))                                
        
        print("Note: "+str(index))
        #print("Prompt: "+str(prompt_func(note, ici_list, irae_list)))
        llm_answer = llm_response.choices[0].message.content.strip()
        print(llm_answer)
        y_llmresponses.append(llm_answer)
    except Exception as e:            
        # Best effort: keep y_llmresponses aligned with the notes by storing a marker
        # string; it is not valid JSON, so the note is dropped later by
        # filter_invalid_llm_responses.
        llm_error = "LLMException: "+str(e).strip().replace('\n', ' ')
        print(llm_error)
        y_llmresponses.append(llm_error)
        exception_list.append(llm_error)


## Evaluation [full]: filtered irAE full labels

In [None]:
# Evaluation at irAE type level

# Drop the notes whose LLM answer could not be parsed for every label.
F_all_filter, y_all_filter, y_llmresponses_filter = filter_invalid_llm_responses(F_all, y_all, y_llmresponses, filter_list_irae_full, nlp)

# scikit-learn multi-label report (per-label rows plus the averaged rows).
clf_report = classification_report(y_all_filter, y_llmresponses_filter, target_names = filter_list_irae_full, zero_division=0, output_dict=True)
df_clf_report = pd.DataFrame(clf_report).transpose()
display(df_clf_report)

# Project-specific detailed report.
df_irae_full_eval = IRAEUtils.irae_eval(y_all_filter, y_llmresponses_filter, filter_list_irae_full)
display(df_irae_full_eval)

# path_eval is specific to GPT_DEPLOYMENT and may not exist yet for a new deployment;
# create it so the exports do not fail after the (costly) LLM run.
os.makedirs(path_eval, exist_ok=True)
df_clf_report.to_csv(f"{path_eval}EVAL-FULL.CLF-REPORT.{GPT_DEPLOYMENT}.csv", index=True)
df_irae_full_eval.to_csv(f"{path_eval}EVAL-FULL.DETAILED-REPORT.{GPT_DEPLOYMENT}.csv", index=False)

In [None]:
df_irae_full_eval, dict_full_fp, dict_full_fn  = irae_eval_error(F_all_filter, y_all_filter, y_llmresponses_filter, filter_list_irae_full)

display(df_irae_full_eval)

# Both reports share one layout: an "irAE: <label>" header per label, followed by
# the misclassified file names, one per line. False positives are written first.
error_reports = (
    (f"{path_eval}ErrorAnalysis_FP_NEW.csv", dict_full_fp),
    (f"{path_eval}ErrorAnalysis_FN_NEW.csv", dict_full_fn),
)
for report_path, files_by_irae in error_reports:
    with open(report_path, "w") as file:
        for irae in filter_list_irae_full:
            file.write(f"\n\nirAE: {irae}\n")
            file.writelines(f"\n{file_name}" for file_name in files_by_irae[irae])


## Evaluation [large]: filtered irAE large labels

In [None]:
# Evaluation at irAE category level

## Convert np.array with binary labels for irAE full to df with binary labels for irAE large
##
def convert_np_full_to_df_large(np_irae_full, dict_map_full2large, list_irae_full, list_irae_large) :
    """Collapse a matrix of specific (full) irAE labels into large-category indicators.

    Parameters:
        np_irae_full: 2-D array-like; column i holds the 0/1 label for list_irae_full[i].
        dict_map_full2large: maps each full label to its large category.
        list_irae_full: full labels, in the column order of `np_irae_full`.
        list_irae_large: large categories that become the output columns.

    Returns:
        DataFrame with one row per input row; a large category is 1 when at
        least one of its full labels equals 1, else 0.

    Raises:
        KeyError: if a positive full label has no entry in `dict_map_full2large`.

    The rows are collected in a list and the frame is built once. Growing a frame
    with pd.concat inside the loop copies it on every iteration and depends on
    concatenation with an empty frame.
    """
    rows_large = []
    for row in np_irae_full :
        # Every large category starts at 0 for this row.
        dict_row = dict.fromkeys(list_irae_large, 0)
        for index_irae_full, label_irae_full in enumerate(list_irae_full):
            if row[index_irae_full] == 1 :
                dict_row[dict_map_full2large[label_irae_full]] = 1
        rows_large.append(dict_row)

    if not rows_large :
        # Keep the expected columns for an empty input.
        return pd.DataFrame(columns = list_irae_large)
    return pd.DataFrame(rows_large)


# Project gold and LLM label matrices (full labels) onto the large categories.
df_y_all_filter, df_y_llmresponses_filter = (
    convert_np_full_to_df_large(np_labels_full, dict_map_full2large, filter_list_irae_full, filter_list_irae_large)
    for np_labels_full in (y_all_filter, y_llmresponses_filter)
)

# Integer matrices, as consumed by the evaluation helpers below.
np_y_all_filter = df_y_all_filter.to_numpy().astype(int)
np_y_llmresponses_filter = df_y_llmresponses_filter.to_numpy().astype(int)

# scikit-learn multi-label report at the large-category level.
final_clf_report = pd.DataFrame(
    classification_report(np_y_all_filter, np_y_llmresponses_filter, target_names = filter_list_irae_large, zero_division=0, output_dict=True)
).transpose()
display(final_clf_report)

# Project-specific detailed report at the large-category level.
final_df_irae_large_eval = IRAEUtils.irae_eval(np_y_all_filter, np_y_llmresponses_filter, filter_list_irae_large)
display(final_df_irae_large_eval)

# Persist both reports next to the full-label ones.
final_clf_report.to_csv(f"{path_eval}EVAL-LARGE.CLF-REPORT.{GPT_DEPLOYMENT}.csv", index=True)
final_df_irae_large_eval.to_csv(f"{path_eval}EVAL-LARGE.DETAILED-REPORT.{GPT_DEPLOYMENT}.csv", index=False)

In [None]:
# Scratch cell: show how tiktoken splits a short string into tokens.
# NOTE(review): "cl100k_base" may not be the encoding used by the configured
# GPT_DEPLOYMENT - confirm before relying on these token counts.

s = "Output 'Yes' if the patient has experienced Neuropathy (Neurotox, Neurotoxicity) because of"
s2 = "Rash"
encoding = tiktoken.get_encoding("cl100k_base")

# Only s2 is tokenized here; s is kept as an alternative sample.
tokens = encoding.encode(s2)

print("Tokens:", tokens)
# Decode every token id on its own to see the text piece it stands for.
print("Decoded tokens:", list(map(lambda token_id: encoding.decode([token_id]), tokens)))
