In [2]:
# from sklearn.metrics import cohen_kappa_score
import pandas as pd
import numpy as np
import os

In [3]:
base_dir = "/home/beatrice/Documents/ProviderSentiment2024/tiny_sample_01"

os.listdir(base_dir)

['liam.jsonl', 'kalaila.jsonl', 'aarti.jsonl']

In [32]:
class NoteCorpus:
    def __init__(self, file_list):
        self.file_dict = self.get_file_dict(file_list)
        self.annotation_dict = self.get_annotation_dict()
        self.df = list(self.file_dict.values())[0].drop(columns=["label"]).copy()
        self.label_dict = None

    def load_label_dict(self, label_dict=None):
        if label_dict is None:
            self.label_dict = label_dict = {
                "FAMILY: Positive": 0,
                "FAMILY: Negative": 1,
                "PROGNOSIS: Positive": 2,
                "PROGNOSIS: Negative": 3,
                "PATIENT: Positive": 4,
                "PATIENT: Negative": 5,
                "EVENT: Family/Goals of Care Meeting": 6,
                "EVENT: Care Withdrawn/Comfort Measures Only": 7
            }
        else:
            self.label_dict = label_dict


    def get_file_dict(self, file_list):
        file_dict = load_all_files(file_list)
        common_id_set = get_common_ids(file_dict)
        return filter_files(file_dict, common_id_set)
    
    def get_annotation_dict(self, ann_col="label"):
        annotation_dict = dict()
        for annotator, data in self.file_dict.items():
            annotation_list = data[ann_col]
            annotation_dict[annotator] = annotation_list

        return annotation_dict
    
    def retrieve_annotations(self, idx):
        annotation_dict_slice = dict()
        for annotator, annotation_list in self.annotation_dict.items():
            annotation_dict_slice[annotator] = annotation_list[idx]

        return annotation_dict_slice

    def build_corpus(self):
        for i, data in self.df.iterrows():
            note = ClinicalNote(data, self.label_dict, self.retrieve_annotations(i))
            break




def load_all_files(file_list):
    file_dict = dict()

    for _file in file_list:
        if _file.endswith(".jsonl"):
            annotator_name = _file.split(".")[0].title()
            file_dict[annotator_name] = pd.read_json(os.path.join(base_dir, _file), lines=True)

    return file_dict

def get_common_ids(file_dict, id_col="ROW_ID"):
    id_list = [set(_file[id_col]) for _file in file_dict.values()]
    common_ids = set.intersection(*id_list)

    return common_ids

def filter_files(file_dict, common_ids, id_col="ROW_ID"):
    filtered_dict = dict()

    for annotator, data in file_dict.items():
        filtered_data = data[data[id_col].isin(common_ids)]
        filtered_dict[annotator] = filtered_data

    return filtered_dict

    
class ClinicalNote:
    def __init__(self, data, label_dict: dict, annotation_dict: dict):
        self.data = data
        self.label_dict = label_dict
        self.n_labels = len(label_dict)
        self.text_len = len(self.data.text)

        self.label_array = None
        self.binary_label_dict = None

        print(annotation_dict)

    def parse_label(self):
        # extract label list
        label_list = self.data.label

        self.label_array = np.zeros((self.text_len, self.n_labels))
        for label in label_list:
            # unpack the elements of the label
            start_idx, end_idx, label = label

            # set the index
            self.label_array[start_idx:end_idx, self.label_dict[label]] = 1

    def binarize_labels(self):
        # get a reverse of the label dict
        idx_dict = {v:k for (k,v) in self.label_dict.items()}
        pred_dict = {k:False for k in self.label_dict.keys()}

        for idx in range(self.n_labels):
            # check if any indexes were predicted to have a label
            label = idx_dict[idx]
            label_pred = np.sum(self.label_array[:, idx]) > 0

            pred_dict[label] = label_pred

        self.binary_label_dict = pred_dict


label_dict = {
    "FAMILY: Positive": 0,
    "FAMILY: Negative": 1,
    "PROGNOSIS: Positive": 2,
    "PROGNOSIS: Negative": 3,
    "PATIENT: Positive": 4,
    "PATIENT: Negative": 5,
    "EVENT: Family/Goals of Care Meeting": 6,
    "EVENT: Care Withdrawn/Comfort Measures Only": 7
}


In [33]:
file_name_list = os.listdir(base_dir)
file_list = [_file for _file in file_name_list if _file.endswith(".jsonl")]

corpus = NoteCorpus(file_list)
corpus.load_label_dict()


In [34]:
corpus.build_corpus()

{'Liam': [[347, 363, 'EVENT: Care Withdrawn/Comfort Measures Only'], [384, 410, 'PROGNOSIS: Negative']], 'Kalaila': [[280, 296, 'PROGNOSIS: Negative'], [346, 380, 'EVENT: Care Withdrawn/Comfort Measures Only'], [383, 410, 'PROGNOSIS: Negative'], [483, 593, 'FAMILY: Positive']], 'Aarti': [[332, 356, 'EVENT: Family/Goals of Care Meeting'], [383, 428, 'PROGNOSIS: Negative']]}


In [25]:
for k, v in corpus.annotation_dict.items():
    print(k)
    print(type(v[0]))

Liam
<class 'list'>
Kalaila
<class 'list'>
Aarti
<class 'list'>


In [5]:
kalaila_df = pd.read_json(os.path.join(base_dir, 'kalaila.jsonl'), lines=True)

kalaila_list = []

for i, data in kalaila_df.iterrows():
    note_obj = ClinicalNote(data, label_dict)
    note_obj.parse_label()
    note_obj.binarize_labels()

    kalaila_list.append(note_obj.binary_label_dict)

In [14]:
kalaila_df

Unnamed: 0,id,text,ROW_ID,SUBJECT_ID,HADM_ID,CATEGORY,DESCRIPTION,DESCRIPTION_1,DRG_CODE,label,Comments
0,721,"Pt is a 69M with h/o HTN, CAD s/p CABG, AVR wi...",415056,79501,119347,Nursing,Nursing Progress Note,INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...,65,"[[280, 296, PROGNOSIS: Negative], [346, 380, E...",[]
1,722,A/P 55 yo RHM with a left hemiparesis (leg>arm...,552822,53725,188088,Nursing,Nursing Transfer Note,ACUTE ISCHEMIC STROKE W USE OF THROMBOLYTIC AG...,63,"[[1154, 1178, PROGNOSIS: Positive]]",[]
2,723,Ms. [**Known lastname 2211**] is a 76-year-old...,544396,47906,128544,Nursing,Nursing Progress Note,ACUTE ISCHEMIC STROKE W USE OF THROMBOLYTIC AG...,61,[],[]
3,724,45 yo male w/ PMH: R steel plate in leg; hyper...,557478,90746,115306,Nursing,Nursing Progress Note,INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...,64,"[[1746, 1774, PROGNOSIS: Negative]]",[]
4,725,[**Age over 90 **] y/o M ([**Hospital1 328**] ...,532283,86907,125407,Nursing,Nursing Transfer Note,INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...,64,"[[1010, 1018, PATIENT: Negative], [1023, 1049,...",[]
5,726,RESP CARE NOTE\nPT REMAINS ON SAME VENT SETTIN...,1652831,30334,170088,Nursing/other,Report,INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...,64,[],[]
6,727,***[** 610**] NURSING PROGRESS NOTE 7A-7P***\n...,1645712,29626,150680,Nursing/other,Report,INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...,64,"[[619, 651, PROGNOSIS: Negative], [1477, 1492,...",[]
7,728,nsg admit note\nPT TO SICU FROM EW. THIS 27 YO...,1641938,29300,105082,Nursing/other,Report,INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...,65,[],[]
8,729,nursing Progress note\nEvents: temp 102.5-bloo...,1645710,29626,150680,Nursing/other,Report,INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...,64,"[[1328, 1354, FAMILY: Positive]]",[]
9,730,"Respiratory Care\nPt remains stable on vent, w...",1645109,29534,112562,Nursing/other,Report,INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...,64,[],[]


In [10]:
kalaila_df.ROW_ID.nunique()

40

In [6]:
aarti_df = pd.read_json(os.path.join(base_dir, 'aarti.jsonl'), lines=True)

aarti_list = []

for i, data in aarti_df.iterrows():
    note_obj = ClinicalNote(data, label_dict)
    note_obj.parse_label()
    note_obj.binarize_labels()

    aarti_list.append(note_obj.binary_label_dict)

In [7]:
liam_df = pd.read_json(os.path.join(base_dir, 'liam.jsonl'), lines=True)

liam_list = []

for i, data in liam_df.iterrows():
    note_obj = ClinicalNote(data, label_dict)
    note_obj.parse_label()
    note_obj.binarize_labels()

    liam_list.append(note_obj.binary_label_dict)

In [58]:
def compare_binary_labels(ann1, ann2):
    # Extracting the list of keys from the first dictionary as all dictionaries are assumed to have the same keys
    keys = ann1[0].keys()

    # Calculating Cohen's Kappa for each key
    kappa_scores = {}
    for key in keys:
        ratings1 = [dic[key] for dic in ann1]
        ratings2 = [dic[key] for dic in ann2]
        kappa = cohen_kappa_score(ratings1, ratings2)
        kappa_scores[key] = kappa

    # Print Cohen's Kappa scores for each class
    for key, kappa in kappa_scores.items():
        print(f"Cohen's Kappa for {key}: {kappa:.2f}")

    return kappa_scores


In [59]:
_ = compare_binary_labels(kalaila_list, aarti_list)

Cohen's Kappa for FAMILY: Positive: 0.12
Cohen's Kappa for FAMILY: Negative: 0.19
Cohen's Kappa for PROGNOSIS: Positive: 0.64
Cohen's Kappa for PROGNOSIS: Negative: 0.42
Cohen's Kappa for PATIENT: Positive: -0.06
Cohen's Kappa for PATIENT: Negative: -0.06
Cohen's Kappa for EVENT: Family/Goals of Care Meeting: 0.83
Cohen's Kappa for EVENT: Care Withdrawn/Comfort Measures Only: 0.00


In [60]:
_ = compare_binary_labels(kalaila_list, liam_list)

Cohen's Kappa for FAMILY: Positive: 0.09
Cohen's Kappa for FAMILY: Negative: 0.47
Cohen's Kappa for PROGNOSIS: Positive: 0.84
Cohen's Kappa for PROGNOSIS: Negative: 0.41
Cohen's Kappa for PATIENT: Positive: 0.00
Cohen's Kappa for PATIENT: Negative: -0.05
Cohen's Kappa for EVENT: Family/Goals of Care Meeting: 0.40
Cohen's Kappa for EVENT: Care Withdrawn/Comfort Measures Only: 0.79


In [61]:
_ = compare_binary_labels(liam_list, aarti_list)

Cohen's Kappa for FAMILY: Positive: 0.58
Cohen's Kappa for FAMILY: Negative: 0.46
Cohen's Kappa for PROGNOSIS: Positive: 0.79
Cohen's Kappa for PROGNOSIS: Negative: 0.44
Cohen's Kappa for PATIENT: Positive: 0.00
Cohen's Kappa for PATIENT: Negative: 0.36
Cohen's Kappa for EVENT: Family/Goals of Care Meeting: 0.40
Cohen's Kappa for EVENT: Care Withdrawn/Comfort Measures Only: 0.00


In [None]:
import customtkinter

customtkinter.CTkTextbox.tag_config