# Imports

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from datetime import timedelta
import importlib
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
tqdm.pandas()  
from pathlib import Path

sys.path.append(str(Path.cwd().parents[1]))


%load_ext autoreload
%autoreload 2

In [None]:
import utils.cohort_utils
import utils.lab_utils
from config.constants import MIMIC_DIR

# Extract cohort

Extract cancer cohort

In [None]:
# Disease label handed to the cohort extractor (this section extracts the cancer cohort).
icd_code = 'C'

# Column names used throughout the notebook.
subject_col = 'subject_id'
visit_col = 'hadm_id'
admit_col = 'admittime'
disch_col = 'dischtime'
death_col = 'dod'
adm_visit_col = 'hadm_id'

# Build the disease cohort from the MIMIC directory using the column names above.
cancer_cohort = utils.cohort_utils.extract_disease_cohort(
    MIMIC_DIR,
    disease_label=icd_code,
    subject_col=subject_col,
    visit_col=visit_col,
    admit_col=admit_col,
    disch_col=disch_col,
)

Extract cancer chemo cohort

In [None]:
# Derive the chemo cohort from the cancer cohort extracted above.
cancer_chemo_cohort = utils.cohort_utils.extract_chemo_cohort(
    cancer_cohort,
    MIMIC_DIR,
)

Save Cancer chemo cohort

In [None]:
# Create the output folder if needed, then write the chemo cohort as a gzipped CSV.
os.makedirs('../saved_data/cohorts', exist_ok=True)
cancer_chemo_cohort.to_csv(
    '../saved_data/cohorts/mimic_cancer_chemo_cohort.csv.gz',
    index=False,
    compression='gzip',
)
print("[SUCCESSFULLY SAVED COHORT DATA]")

# Extract Lab for target cohort

Load Target Cohort

In [None]:
# Reload the saved chemo cohort from disk and work on a copy of it.
cancer_chemo_cohort = pd.read_csv(
    '../saved_data/cohorts/mimic_cancer_chemo_cohort.csv.gz',
    header=0,
    compression='gzip',
)
target_cohort = cancer_chemo_cohort.copy()

Extract Lab data for the target cohort


In [None]:
# Pull the lab rows for the target cohort, then apply the unit-of-measure filter (0.95 argument).
labs_df = utils.lab_utils.extract_cohort_labs(
    MIMIC_DIR,
    target_cohort,
    'charttime',
    dtypes=None,
    usecols=None,
)
labs_df = utils.lab_utils.drop_wrong_uom(labs_df, 0.95)

# Only these columns are written to the features file.
os.makedirs('../saved_data/features', exist_ok=True)
labs_to_save = labs_df[['subject_id', 'hadm_id', 'charttime', 'itemid','valuenum']]
labs_to_save.to_csv(
    '../saved_data/features/mimic_cancer_chemo_cohort_labs.csv.gz',
    index=False,
    compression='gzip',
)
print("[SUCCESSFULLY SAVED LABS DATA]")

***
# Aplasia Occurrence
(only based on single ANC<0.5)

**Note:** The unit for ANC is K/uL (thousands per microliter), so a threshold of ANC < 500 cells/µL should be converted to ANC < 0.5 K/uL.

Find Aplasia cases from Lab markers (ANC < 0.5) \
In dataset max = 208.18, min = 0.0, mean = 4.33
***

Load Target cohort and corresponding Lab measurements

In [None]:
# Load the saved cohort with its date columns parsed as datetimes.
cancer_chemo_cohort = pd.read_csv(
    '../saved_data/cohorts/mimic_cancer_chemo_cohort.csv.gz',
    header=0,
    compression='gzip',
    parse_dates=['admittime','dischtime','dod'],
)
# Load the matching lab measurements with charttime parsed as a datetime.
cancer_chemo_labs = pd.read_csv(
    '../saved_data/features/mimic_cancer_chemo_cohort_labs.csv.gz',
    header=0,
    compression='gzip',
    parse_dates=['charttime'],
)

Choose target cohort and the number of days to look for aplasia occurrence

In [None]:
# Look-ahead window (in days) used when searching for aplasia after discharge.
days = 45
# Work on a copy so the loaded cohort frame is left untouched.
target_cohort = cancer_chemo_cohort.copy()

### Aplasia based on ANC

In [None]:
# Look up the lab itemids that match the ANC label.
ANC_label = 'absolute neutrophil count'
ANC_itemids = utils.lab_utils.find_itemid(MIMIC_DIR, ANC_label)
print('ANC itemids are: ', ANC_itemids)

In [None]:
# Filter lab measurements to the ANC-related item IDs and add a binary flag column
# 'ANC<0.5' (1 when valuenum < 0.5, else 0).

# Collapse repeated measurements of the same item at the same time to their maximum value.
# NOTE(review): groupby drops rows whose key is NaN by default, so labs with a missing
# hadm_id are discarded here — confirm this is intended for the post-discharge lab window.
labs_per_timepoint = (
    cancer_chemo_labs
    .groupby(['subject_id', 'hadm_id', 'itemid', 'charttime'])["valuenum"]
    .max()
    .reset_index()
)

# Take an explicit copy so the flag column is added to an independent frame rather than
# to a filtered slice (the original triggered a SettingWithCopyWarning that the global
# warnings filter was hiding).
ANC_lab_df = labs_per_timepoint[labs_per_timepoint['itemid'].isin(ANC_itemids)].copy()

# Missing values compare as False and are therefore flagged 0, as before.
mask = (ANC_lab_df['valuenum'] < 0.5)
ANC_lab_df['ANC<0.5'] = mask.astype(int)

# Release the large lab frames; only ANC_lab_df is needed from here on.
del cancer_chemo_labs, labs_per_timepoint

### Aplasia based on Transfusion

In [None]:
# Find ICD-10 procedure codes whose long title mentions a transfusion of
# platelet | red blood cell | RBC.
proc_icd_long_titles = utils.cohort_utils.get_procedures_definition(MIMIC_DIR)

long_titles = proc_icd_long_titles['long_title']
is_icd10 = proc_icd_long_titles['icd_version'] == 10
mentions_transfusion = long_titles.str.contains('transfusion', case=False, na=False)
mentions_blood_product = long_titles.str.contains('platelet|red blood cell|RBC', case=False, na=False)

transfusion_codes = proc_icd_long_titles[is_icd10 & mentions_transfusion & mentions_blood_product]

# Create the docs folder first, like the other save cells do, so the export
# does not fail on a fresh checkout where the folder is missing.
os.makedirs('../aplasia/docs', exist_ok=True)
transfusion_codes.to_csv('../aplasia/docs/icd_procedures_RBC_platelet_transfusion.csv', index=False) # --> 40 codes

# Keep only the code column for the membership test in the next cell.
transfusion_codes = transfusion_codes['icd_code']

In [None]:
# Label target cohort admissions for transfusion encounter
mimic_procedures = utils.cohort_utils.get_procedures(MIMIC_DIR)

# Keep only transfusion procedures, reduced to admission id and procedure date.
is_transfusion = mimic_procedures['icd_code'].isin(transfusion_codes)
transfusion_procedures = (
    mimic_procedures[is_transfusion][['hadm_id', 'chartdate']]
    .rename(columns={'chartdate': 'transfusion_date'})
)

# Left merge: an admission with several transfusion procedures yields one row per procedure.
target_cohort = target_cohort.merge(transfusion_procedures, how='left', on='hadm_id')
target_cohort['transfusion'] = target_cohort['transfusion_date'].notna().astype(int)
target_cohort['transfusion_date'] = pd.to_datetime(target_cohort['transfusion_date'])

transfused_rows = target_cohort[target_cohort['transfusion'] == 1]
num_transfused_admissions = transfused_rows['hadm_id'].nunique()
print(f"Number of unique admissions with transfusion: {num_transfused_admissions}")
print(f"Number of total transfusion: ", target_cohort.transfusion.sum())

Find **CURRENT** admission Aplasia occurrence based on lab measurements between admittime and dischtime

In [None]:
def current_aplasia_occurrence(x,days,labs):
    """Flag whether aplasia occurred during the admission described by row ``x``.

    Parameters
    ----------
    x : row of the cohort frame; reads ``subject_id``, ``admittime``,
        ``dischtime`` and ``transfusion``.
    days : unused here; kept so existing callers keep working.
    labs : frame with ``subject_id``, ``charttime`` and the 0/1 column ``ANC<0.5``.

    Returns
    -------
    int
        1 if the admission has a transfusion, or if any of the subject's labs
        charted between admittime and dischtime (inclusive) has ``ANC<0.5`` set;
        otherwise 0 (including when there are no labs in that window).
    """
    # A transfusion decides the outcome on its own, so check it before touching
    # the lab frame at all; this skips a full scan for every transfused row.
    if x.transfusion == 1:
        return 1

    during_stay = (
        (labs["subject_id"] == x.subject_id)
        & (labs["charttime"] >= x.admittime)
        & (labs["charttime"] <= x.dischtime)
    )
    # No sort needed: only the existence of a low value matters.
    # .any() on an empty selection is False, which covers the "no labs" case.
    return 1 if labs.loc[during_stay, "ANC<0.5"].any() else 0
        
# Row-wise evaluation with a tqdm progress bar; each row is checked against the ANC labs.
target_cohort["current_aplasia"] = target_cohort.progress_apply(
    lambda row: current_aplasia_occurrence(row, days, ANC_lab_df),
    axis=1,
)

Find **AFTER** admission Aplasia occurrence based on the lab measurement within certain days after discharge

In [None]:
def after_admission_aplasia_occurrence(x,target_cohort,days,labs):
    """Find the first aplasia event within ``days`` days after discharge of row ``x``.

    Two sources count as an event, both limited to the same subject and to the
    window [dischtime, dischtime + days] (inclusive):
      * a lab row with ``ANC<0.5`` set, timed by its ``charttime``;
      * another admission flagged ``transfusion == 1``, timed by its ``admittime``.

    Parameters
    ----------
    x : cohort row; reads ``subject_id``, ``hadm_id`` and ``dischtime``.
    target_cohort : frame with ``subject_id``, ``hadm_id``, ``admittime``
        and ``transfusion``.
    days : length of the look-ahead window in days.
    labs : frame with ``subject_id``, ``charttime`` and ``ANC<0.5``.

    Returns
    -------
    tuple
        ``(0, None)`` when no event is found, otherwise ``(1, earliest_event_time)``.
    """
    window_end = x.dischtime + timedelta(days=days)

    # Charttimes of low-ANC labs for this subject inside the window.
    low_anc_times = labs.loc[
        (labs["subject_id"] == x.subject_id)
        & (labs["charttime"] >= x.dischtime)
        & (labs["charttime"] <= window_end)
        & (labs["ANC<0.5"] == True),
        "charttime",
    ]

    # Admit times of OTHER transfused admissions inside the window. The index
    # admission is excluded explicitly: with a zero length of stay its own
    # admittime equals dischtime and would otherwise be counted as a
    # post-discharge event (split_aplasia_cases applies the same exclusion).
    transfusion_times = target_cohort.loc[
        (target_cohort["subject_id"] == x.subject_id)
        & (target_cohort["hadm_id"] != x.hadm_id)
        & (target_cohort["admittime"] >= x.dischtime)
        & (target_cohort["admittime"] <= window_end)
        & (target_cohort["transfusion"] == 1),
        "admittime",
    ]

    # Earliest time per source; min() replaces the former sort + first-row lookup.
    times = []
    if not low_anc_times.empty:
        times.append(low_anc_times.min())
    if not transfusion_times.empty:
        times.append(transfusion_times.min())

    if not times:  # no aplasia within the window after discharge
        return 0, None
    return 1, min(times)  # earliest aplasia occurrence time
        


# Row-wise evaluation with a progress bar; each result tuple is expanded into two columns.
next_aplasia_results = target_cohort.progress_apply(
    lambda row: pd.Series(after_admission_aplasia_occurrence(row, target_cohort, days, ANC_lab_df)),
    axis=1,
)
target_cohort[["next_aplasia", "next_aplasia_time"]] = next_aplasia_results
# Store the flag column as integers.
target_cohort["next_aplasia"] = target_cohort["next_aplasia"].astype(int)


Split Negative and Positive cases for target cohort

In [None]:
def split_aplasia_cases(x, days,target_cohort):
    """Assign an aplasia case code to the admission in row ``x``.

    Return codes
    ------------
    0 : not considered / not determined
    1 : negative case (no aplasia within ``days`` days after discharge)
    2 : positive case (aplasia within the window, before any next chemo admission)

    Only admissions with ``chemo == 1``, ``current_aplasia == 0`` and
    ``hospital_expire_flag == 0`` are eligible; everything else is 0. The
    original fell through and returned ``None`` for some of these rows (e.g.
    in-hospital deaths), which left NaN in ``aplasia_case``; every such path
    now returns 0 explicitly.

    Parameters
    ----------
    x : cohort row; reads ``chemo``, ``current_aplasia``, ``hospital_expire_flag``,
        ``subject_id``, ``hadm_id``, ``dischtime``, ``dod``, ``next_aplasia``
        and ``next_aplasia_time``.
    days : length of the follow-up window in days.
    target_cohort : frame with ``subject_id``, ``hadm_id``, ``admittime`` and ``chemo``.
    """
    # No chemo, aplasia already present during the stay, or in-hospital death:
    # the admission is not considered.
    eligible = x.chemo == 1 and x.current_aplasia == 0 and x.hospital_expire_flag == 0
    if not eligible:
        return 0

    window_end = x.dischtime + timedelta(days=days)

    # Other admissions of the same subject that start inside the follow-up window.
    sub = target_cohort[
        (target_cohort["subject_id"] == x.subject_id) &
        (target_cohort["hadm_id"]  != x.hadm_id) &  # avoid selecting the same admission as next where the los for admission is 0
        (target_cohort["admittime"]  >= x.dischtime) &
        (target_cohort["admittime"] <= window_end)
    ].sort_values("admittime")

    # Drop admissions where the patient died within the follow-up window without
    # being readmitted (a missing dod compares as False and is kept).
    if sub.empty and x.dod <= window_end:
        return 0

    if sub.empty or (sub["chemo"] == 0).all():
        # No readmission, or no chemo in any readmission: label by next_aplasia alone.
        if x["next_aplasia"] == 1:
            return 2
        if x["next_aplasia"] == 0:
            return 1
    elif (sub["chemo"] == 1).any():
        # A later chemo admission exists inside the window.
        if x["next_aplasia"] == 0:  # no aplasia within the window
            return 1
        if x["next_aplasia"] == 1:
            first_chemo_time = sub.loc[sub["chemo"] == 1, "admittime"].min()
            # Positive only if the aplasia precedes the next chemo; otherwise not considered.
            return 2 if x.next_aplasia_time < first_chemo_time else 0

    # Anything that matched none of the rules above is explicitly "not determined".
    return 0
                

# Row-wise case assignment with a progress bar (0 = not considered, 1 = negative, 2 = positive).
target_cohort["aplasia_case"] = target_cohort.progress_apply(
    lambda row: split_aplasia_cases(row, days, target_cohort),
    axis=1,
)

In [None]:
# Split the cohort by case code: 2 = positive, 1 = negative, 0 = not determined.
case_code = target_cohort["aplasia_case"]
pos_case = target_cohort[case_code == 2]
neg_case = target_cohort[case_code == 1]
none_case = target_cohort[case_code == 0]

# Count distinct admissions once per group and reuse the numbers in the report.
n_pos = pos_case.hadm_id.nunique()
n_neg = neg_case.hadm_id.nunique()
n_none = none_case.hadm_id.nunique()

separator = '-----------------------------------'
print(separator)
print('positive admissions',n_pos)
print(separator)
print('negative admissions',n_neg)
print(separator)
print('not detemined admissions',n_none)
print(separator)
print('Psitive percentage: ',n_pos/(n_pos + n_neg)*100)

Save Cohort

In [None]:
# Attach the binary label: 1 for positive cases, 0 for negative cases.
# .assign builds new frames instead of writing into the filtered slices via .loc,
# which avoids the SettingWithCopy pattern; the names are rebound so pos_case and
# neg_case still carry the 'label' column afterwards.
pos_case = pos_case.assign(label=1)
neg_case = neg_case.assign(label=0)

# Output name encodes the follow-up window, e.g. "mimic_cohort_aplasia_45_days".
cohort_output = "mimic_cohort" + "_" + "aplasia" + "_" + str(days) + "_days"
cohort = pd.concat([pos_case, neg_case], axis=0)

# Drop the helper columns that were only needed to derive the label.
helper_columns = [
    'hospital_expire_flag',
    'chemo',
    'current_aplasia',
    'next_aplasia',
    'next_aplasia_time',
    'aplasia_case',
    'transfusion',
    'transfusion_date',
]
cohort = cohort.drop(columns=helper_columns)

cohort.to_csv("../saved_data/cohorts/"+cohort_output+".csv.gz", index=False, compression='gzip')
print("[ COHORT SUCCESSFULLY SAVED ]")

print(cohort_output)