In [2]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import collections
import random
from random import sample
random.seed(42)


In [10]:
# Filesystem locations shared by the rest of the notebook.
# sample_dir is the preprocessed-data folder; input_file is the pickle inside it.
main_dir = "/home/"
sample_dir = main_dir + "dataset/mimiciv/3.1/preprocessed/"
input_file = sample_dir + "mimicIV_icd10"

In [11]:
# Load the preprocessed dataset from the pickle at `input_file`.
# Later cells iterate it with .iterrows() and read its "text" and "icd10_diag" columns.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
data =  pd.read_pickle(input_file)

In [None]:
# Flatten the per-row ICD-10 diagnosis lists into one long list (duplicates kept,
# row order preserved). `extend` appends in place; the previous
# `diagnosis = diagnosis + row[...]` copied the whole list on every row, which is
# quadratic in the total number of codes.
# Assumes every "icd10_diag" value is a list of codes - TODO confirm.
diagnosis = []
for codes in tqdm(data["icd10_diag"], total=len(data)):
    diagnosis.extend(codes)

# Distinct codes across the whole dataset.
unique_diagnosis = set(diagnosis)
unique_diagnosis

In [13]:
# Dump the textual repr of the unique-code set to a text file.
unique_diagnosis_path = main_dir + 'dataset/mimiciv/3.1/preprocessed/unique_diagnosis_icd10.txt'
with open(unique_diagnosis_path, 'w') as out_file:
    out_file.write(str(unique_diagnosis))

In [None]:
# Count how many times each diagnosis code occurs in `diagnosis`.
# A defaultdict(int) is kept (rather than a Counter) so that the repr written to
# disk by the next cell keeps its current format. The `collections` module is
# already imported at the top of the notebook, so no extra import is needed here.
labels_histogram = collections.defaultdict(int)
for code in diagnosis:
    labels_histogram[code] += 1
labels_histogram

In [16]:
# Dump the textual repr of the code histogram to a text file.
histogram_path = main_dir + 'dataset/mimiciv/3.1/preprocessed/diagnosis_histogram_icd10.txt'
with open(histogram_path, 'w') as out_file:
    out_file.write(str(labels_histogram))

In [None]:
# Codes ordered from most to least frequent; ties are broken by the code itself.
[code for code, count in sorted(labels_histogram.items(), key=lambda item: (-item[1], item[0]))]

In [None]:
# Frequency of every diagnosis code.
occurrences = collections.Counter(diagnosis)

# "Rare" codes are those tied for the lowest count and "common" codes are those
# tied for the highest count. Looking the extremes up once makes this linear in
# the number of distinct codes; the previous all(...) scan compared every code
# against every other code. `default=0` keeps an empty Counter from raising
# (both lists are then simply empty).
lowest_count = min(occurrences.values(), default=0)
highest_count = max(occurrences.values(), default=0)
rare_diagnosis = [code for code, count in occurrences.items() if count == lowest_count]
common_diagnosis = [code for code, count in occurrences.items() if count == highest_count]
print("rare_diagnosis: ", rare_diagnosis)
print("common_diagnosis: ", common_diagnosis)

In [None]:
# Section markers that must ALL appear in a note for it to be kept.
REQUIRED_SECTION_MARKERS = (
    "chief complaint:",
    "history of present illness:",
    "major surgical or invasive procedure:",
    "social history:",
    "physical exam:",
    "pertinent results:",
    "brief hospital course",
    "medications on admission:",
    "discharge medications:",
    "discharge diagnosis:",
    "discharge instructions:",
)
# Notes containing any of these phrases are dropped.
EXCLUDED_PHRASES = ("deceased", "patient expired")

diagnosis_sample = rare_diagnosis + common_diagnosis
print("length of diagnosis_sample: ", len(diagnosis_sample))
# Set copy for O(1) membership tests inside the loop.
diagnosis_sample_lookup = set(diagnosis_sample)

# Both lists hold row POSITIONS (not index labels) because they are consumed by
# .iloc below; index labels only coincide with positions for a default RangeIndex.
selected_indices = []
balanced_indices = []
for position, (row_label, row) in enumerate(tqdm(data.iterrows(), total=len(data))):
    discharge_text = row["text"]
    if not all(marker in discharge_text for marker in REQUIRED_SECTION_MARKERS):
        continue
    if any(phrase in discharge_text for phrase in EXCLUDED_PHRASES):
        continue
    selected_indices.append(position)
    # A kept note is also "balanced" when it carries at least one rare or common code.
    if any(code in diagnosis_sample_lookup for code in row["icd10_diag"]):
        balanced_indices.append(position)

print("length of selected_indices: ", len(selected_indices))
print("length of balanced_indices: ", len(balanced_indices))

mimicIV_sample = data.iloc[selected_indices]
mimicIV_sample_balanced = data.iloc[balanced_indices]

In [37]:
# Renumber the filtered notes from 0 and save them as CSV.
filtered_csv_path = sample_dir + "mimicIV_sample_filtered_icd10.csv"
mimicIV_sample = mimicIV_sample.reset_index(drop=True)
mimicIV_sample.to_csv(filtered_csv_path)

In [38]:
# Renumber the balanced subset from 0 and save it as CSV.
balanced_csv_path = sample_dir + "mimicIV_sample_filtered_icd10_balanced.csv"
mimicIV_sample_balanced = mimicIV_sample_balanced.reset_index(drop=True)
mimicIV_sample_balanced.to_csv(balanced_csv_path)

In [None]:
# Set copies of the code lists for O(1) membership tests.
rare_lookup = set(rare_diagnosis)
common_lookup = set(common_diagnosis)

# Row POSITIONS within mimicIV_sample_balanced of notes that carry at least one
# rare / common code. Positions are collected (rather than index labels) because
# these lists are later passed to .iloc. A note can appear in both lists.
rare_indices = []
common_indices = []
balanced_code_lists = mimicIV_sample_balanced["icd10_diag"]
for position, codes in enumerate(tqdm(balanced_code_lists, total=len(mimicIV_sample_balanced))):
    if any(code in rare_lookup for code in codes):
        rare_indices.append(position)
    if any(code in common_lookup for code in codes):
        common_indices.append(position)

print("length of rare_indices: ", len(rare_indices))
print("length of common_indices: ", len(common_indices))

In [42]:
# Draw 500 common-code rows without replacement; reseeding makes the draw repeatable.
random.seed(42)
final_common_indices = random.sample(common_indices, k=500)

In [None]:
random.seed(42)
# Rows that carry both a rare and a common code.
rare_index_set = set(rare_indices)
intersect_indices = list(rare_index_set.intersection(common_indices))
len(intersect_indices)

In [None]:
# The reseed does not influence this cell's result; it is kept so the global RNG
# state after this cell is unchanged from before.
random.seed(42)
# Keep only rare rows that do not also carry a common code. The lookup is a set so
# each membership test is O(1) instead of a scan over the intersection list.
intersect_lookup = set(intersect_indices)
filtered_rare_indices = [i for i in rare_indices if i not in intersect_lookup]
len(filtered_rare_indices)

In [None]:
# Draw 500 rare-only rows (repeatable via the reseed), then put them ahead of the
# previously drawn common rows.
random.seed(42)
final_rare_indices = random.sample(filtered_rare_indices, k=500)
final_filtered_indices = [*final_rare_indices, *final_common_indices]
len(final_filtered_indices)

In [None]:
# Pick the sampled rows by position, renumber them from 0, and save the result as CSV.
mimicIV_final_sample_balanced = (
    mimicIV_sample_balanced
    .iloc[final_filtered_indices]
    .reset_index(drop=True)
)
mimicIV_final_sample_balanced.to_csv(sample_dir + "mimicIV_1000_balanced_sample_filtered_icd10.csv")

In [None]:
# Display the final balanced sample as the cell output.
mimicIV_final_sample_balanced

In [7]:
def _extract_section(text, start_marker, end_marker=None):
    """Return the stripped text after ``start_marker`` and before ``end_marker``.

    Returns "" when ``start_marker`` is absent. When ``end_marker`` is None or
    absent, everything after ``start_marker`` is returned.
    """
    _, found, remainder = text.partition(start_marker)
    if not found:
        return ""
    if end_marker is not None:
        remainder = remainder.partition(end_marker)[0]
    return remainder.strip()


def create_discharge_text_data(data, discharge_data_file):
    """Split each discharge note into named sections and save the result as CSV.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns "text", "note_id", "subject_id", "_id",
        "icd10_proc" and "icd10_diag". "text" is searched for literal,
        lower-case section markers such as 'chief complaint:'.
    discharge_data_file : str
        Path of the CSV file to write.

    Returns
    -------
    pandas.DataFrame
        One row per input row: the identifier/code columns copied through plus
        'chief_complaint', 'history', 'physical_exam', 'results',
        'discharge diagnosis', 'discharge condition' and
        'discharge instructions'.

    Notes
    -----
    A section whose opening marker is missing becomes an empty string instead of
    raising IndexError, so one malformed note no longer aborts the whole run.
    The first four sections are looked up only in the text that precedes
    'brief hospital course:'.
    """
    print("creating discharge texts")
    columns = [
        'note_id',
        'subject_id',
        '_id',
        'icd10_proc',
        'icd10_diag',
        'chief_complaint',
        'history',
        'physical_exam',
        'results',
        'discharge diagnosis',
        'discharge condition',
        'discharge instructions',
    ]
    records = []

    for _, row in tqdm(data.iterrows(), total=len(data)):
        discharge_text = row["text"]
        # Everything before the hospital-course section; the whole note when the
        # marker is absent.
        pre_diagnosis_text = discharge_text.partition('brief hospital course:')[0]
        records.append({
            'note_id': row["note_id"],
            'subject_id': row["subject_id"],
            '_id': row["_id"],
            'icd10_proc': row["icd10_proc"],
            'icd10_diag': row["icd10_diag"],
            'chief_complaint': _extract_section(
                pre_diagnosis_text, 'chief complaint:', 'major surgical or invasive procedure:'),
            'history': _extract_section(
                pre_diagnosis_text, 'history of present illness:', 'social history:'),
            'physical_exam': _extract_section(
                pre_diagnosis_text, 'physical exam:', 'pertinent results:'),
            'results': _extract_section(pre_diagnosis_text, 'pertinent results:'),
            'discharge diagnosis': _extract_section(
                discharge_text, 'discharge diagnosis:', 'discharge condition:'),
            'discharge condition': _extract_section(
                discharge_text, 'discharge condition:', 'discharge instructions:'),
            'discharge instructions': _extract_section(
                discharge_text, 'discharge instructions:', 'followup instructions:'),
        })

    # Passing `columns` keeps the header stable even when `data` has no rows.
    extracted_data = pd.DataFrame(records, columns=columns)
    extracted_data.to_csv(discharge_data_file)
    return extracted_data

In [None]:
# NOTE: this rebinds main_dir (it was "/home/" in the path-definition cell) to the
# preprocessed folder; the cells below build their paths from this new value.
main_dir = "/home/dataset/mimiciv/3.1/preprocessed/"
final_path = f"{main_dir}mimicIV_1000_balanced_sample_filtered_parsed_icd10.csv"
# Parse the balanced 1000-note sample into sections and write it to final_path.
create_discharge_text_data(data=mimicIV_final_sample_balanced, discharge_data_file=final_path)

In [None]:
# Reproducible random draw of 1000 filtered notes, renumbered from 0.
random_subset = (
    mimicIV_sample
    .sample(n=1000, random_state=13)
    .reset_index(drop=True)
)
random_subset

In [87]:
# Save the random subset in the preprocessed folder. The path is built from
# sample_dir rather than main_dir: main_dir only points at that folder after a
# later cell rebinds it, so relying on it made this cell depend on which cells
# had already been run.
random_subset.to_csv(sample_dir + "mimicIV_1000_random_sample_filtered_icd10.csv")

In [None]:
# Parse the random subset into sections and write it to the preprocessed folder.
# The path is built from sample_dir rather than main_dir: main_dir only points at
# that folder after a later cell rebinds it, so relying on it made this cell
# depend on which cells had already been run.
random_final_path = sample_dir + "mimicIV_1000_random_sample_filtered_parsed_icd10.csv"
create_discharge_text_data(random_subset, random_final_path)