In [1]:
import pandas as pd
import numpy as np
from customTransformers import DurationTransformer, DateTimeTransformer, EthnicityTransformer

In [2]:
# 1. Load data
# All three raw tables are read from the same MIMIC-III (v1.4) directory.
data_dir = 'data/physionet.org/files/mimiciii/1.4/'
patient_file = 'PATIENTS.csv'
admissions_file = 'ADMISSIONS.csv'
icustays_file = 'ICUSTAYS.csv'

# patient info
df_patients = pd.read_csv(f'{data_dir}{patient_file}')
# admissions info
df_admissions = pd.read_csv(f'{data_dir}{admissions_file}')
# icu stays info
df_icustays = pd.read_csv(f'{data_dir}{icustays_file}')

In [3]:
# work on df_patients
# Drop the row id and the death-related columns, then run DOB through
# DateTimeTransformer (later cells use the .dt accessor on DOB).
patient_cols_to_drop = ['ROW_ID', 'DOD', 'DOD_HOSP', 'DOD_SSN', 'EXPIRE_FLAG']
df_patients = df_patients.drop(columns=patient_cols_to_drop)
df_patients = DateTimeTransformer(['DOB']).fit_transform(df_patients)

In [4]:
# work on df_admissions
# columns that won't be used for predicting ICU stays
admission_cols_to_drop = ['ROW_ID', 'DEATHTIME', 'DISCHARGE_LOCATION',
                          'EDREGTIME', 'EDOUTTIME']
df_admissions = df_admissions.drop(columns=admission_cols_to_drop)

# per-column placeholder used wherever a categorical value is missing
values = {
    'RELIGION': 'NOT SPECIFIED',
    'LANGUAGE': '***',
    'MARITAL_STATUS': 'UNKNOWN (DEFAULT)',
    'ETHNICITY': 'UNKNOWN',
    'INSURANCE': 'Not Specified',
    'ADMISSION_LOCATION': '** INFO NOT AVAILABLE **',
    'ADMISSION_TYPE': 'UNKNOWN',
}
df_admissions = df_admissions.fillna(value=values)

# convert admit/discharge time strings to datetime
admit_time_trns = DateTimeTransformer(['ADMITTIME', 'DISCHTIME'])
df_admissions = admit_time_trns.fit_transform(df_admissions)

# convert ethnicities to ubergroups
ethnicity_trns = EthnicityTransformer()
df_admissions['ETHNICITY'] = ethnicity_trns.fit_transform(df_admissions['ETHNICITY'])

# fold the redundant 'UNOBTAINABLE' religion value into 'NOT SPECIFIED'
religion = df_admissions['RELIGION']
df_admissions['RELIGION'] = religion.str.replace('UNOBTAINABLE', 'NOT SPECIFIED')

# length of hospital stay, computed from the (DISCHTIME, ADMITTIME) pair
adm_dur_trns = DurationTransformer([('DISCHTIME', 'ADMITTIME')], ['HOSPITAL_DAYS'])
df_admissions = adm_dur_trns.fit_transform(df_admissions)

# negative admit days = dead on arrival, remove those rows by index label
doa_idx = df_admissions.loc[df_admissions['HOSPITAL_DAYS'] < 0].index
df_admissions = df_admissions.drop(index=doa_idx)

In [5]:
# merge patient and admissions df and clean
df_patient_admit = df_patients.merge(df_admissions, how='inner', on='SUBJECT_ID')

# calculate age at admit, in completed years
# can't use DurationTransformer as is because of int overflow on total_seconds,
# so the age is derived from calendar components instead.
# A bare year difference is one too high for anyone admitted before their
# birthday in the admit year (e.g. born 31 Dec, admitted 1 Jan -> 1 instead of 0),
# so subtract one year in that case.
admit_time = df_patient_admit['ADMITTIME']
birth_date = df_patient_admit['DOB']
before_birthday = (admit_time.dt.month < birth_date.dt.month) | (
    (admit_time.dt.month == birth_date.dt.month)
    & (admit_time.dt.day < birth_date.dt.day)
)
df_patient_admit['ADMIT_AGE'] = (
    (admit_time.dt.year - birth_date.dt.year) - before_birthday.astype(int)
)

# no longer need DOB, remove
df_patient_admit = df_patient_admit.drop(columns=['DOB'])

# convert ages >200 to 100 because these ages are obscured (purposefully to prevent
# PHI from being released) --> should be ok because we will use nonlinear classifiers
df_patient_admit.loc[df_patient_admit['ADMIT_AGE'] > 200, 'ADMIT_AGE'] = 100

In [6]:
# deal with icustays
# The first/last care unit, first/last ward id and OUTTIME columns describe the
# ICU stay itself and therefore will not be helpful in predicting ICU stays.
# ICUSTAY length-of-stay (LOS) is kept in case we want to use it as an outcome variable.
icu_cols_to_drop = ['ROW_ID', 'FIRST_CAREUNIT', 'LAST_CAREUNIT',
                    'FIRST_WARDID', 'LAST_WARDID', 'OUTTIME']
df_icustays = df_icustays.drop(columns=icu_cols_to_drop)

In [7]:
# convert the ICU INTIME strings to datetime
icu_time_trns = DateTimeTransformer(['INTIME'])
df_icustays = icu_time_trns.fit_transform(df_icustays)

In [8]:
# join patients, admissions, and icustays and create more duration columns
icu_join_keys = ['SUBJECT_ID', 'HADM_ID']
df_patient_admit_icu = df_patient_admit.merge(df_icustays, how='inner', on=icu_join_keys)

# days from hospital admission to icu admission, from the (INTIME, ADMITTIME) pair
adm_dur_trns = DurationTransformer([('INTIME', 'ADMITTIME')], ['DAYS_ADM_TO_ICU'])
df_patient_admit_icu = adm_dur_trns.fit_transform(df_patient_admit_icu)

# remove any rows whose icu intime was before admittime (negative delay)
icu_first_idx = df_patient_admit_icu.loc[df_patient_admit_icu['DAYS_ADM_TO_ICU'] < 0].index
df_patient_admit_icu = df_patient_admit_icu.drop(index=icu_first_idx)

# NOTE: rows with NaN in DAYS_ADM_TO_ICU are not removed; the dropna below is disabled.
# # drop rows with nan for DAYS_ADM_TO_ICU
# df_patient_admit_icu = df_patient_admit_icu.dropna(axis=0, subset=['DAYS_ADM_TO_ICU'])

In [9]:
# create additional candidate outcome columns
def get_time_bins(val,edges):
    """Return the index of the first entry of ``edges`` that is >= ``val``.

    Parameters
    ----------
    val : scalar
        Value to place into a bin.
    edges : array-like
        Bin edges, each compared against ``val`` with ``val <= edge``.
        With ascending edges, entry ``i`` acts as the upper edge of bin ``i``.

    Returns
    -------
    int
        Index of the first edge satisfying ``val <= edge``. A value larger
        than every edge is placed in the last bin instead of failing.

    Raises
    ------
    IndexError
        If ``edges`` is empty or ``val`` is NaN, since no bin can be chosen.
        The exception type matches what the bare ``[0][0]`` lookup raised.
    """
    edges = np.asarray(edges)
    hits = np.where(val <= edges)[0]
    if hits.size:
        return int(hits[0])

    # No edge is >= val: decide between "beyond the last edge" and bad input.
    if edges.size == 0:
        raise IndexError('edges is empty; cannot assign a bin')
    if np.isnan(val):
        raise IndexError('val is NaN; cannot assign a bin')
    # Larger than every edge (e.g. edges computed on a different sample).
    return int(edges.size - 1)

# various admit bins, all derived from the admit-to-ICU delay column
days_to_icu = df_patient_admit_icu['DAYS_ADM_TO_ICU']

# 1st through 100th percentiles of the delay
adm_to_ic_100p = np.percentile(days_to_icu, range(1, 101))
# 3201 evenly spaced points between the observed min and max; the first point
# (the minimum) is discarded so 3200 edges remain
adm_to_ic_90m = np.linspace(days_to_icu.min(), days_to_icu.max(), 3201)[1:]

# categorical outcome variables for the time from hospital admit to icu stay
# admits <24 hrs: 1 when the delay is at most one day, otherwise 0
df_patient_admit_icu['SAMEDAY_ADM_TO_ICU'] = days_to_icu.apply(
    lambda days: 1 if days <= 1 else 0)

# admits in percentiles
df_patient_admit_icu['ADM_TO_ICU_100p'] = days_to_icu.apply(
    lambda days: get_time_bins(days, adm_to_ic_100p))

# admit times in ~1.5-hour time bins (3200); the real bin width depends on
# the observed min/max of the delay
df_patient_admit_icu['ADM_TO_ICU_90m'] = days_to_icu.apply(
    lambda days: get_time_bins(days, adm_to_ic_90m))



# create 4-category urgency bins
def get_time_bin_cats(val,edges=np.array([1/24, 1, 5, 300]),labels=['immediate','urgent','questionable','stable']):
    """Map ``val`` to the label of the first edge that is >= ``val``.

    With the defaults, and ``val`` given in days as in this notebook, the
    result is 'immediate' up to 1/24 (one hour), 'urgent' up to 1,
    'questionable' up to 5 and 'stable' above that.

    Parameters
    ----------
    val : scalar
        Value to categorise.
    edges : array-like
        Edges compared against ``val`` with ``val <= edge``.
    labels : sequence
        Label for each edge position. The default objects are shared between
        calls but are only read here, never modified.

    Returns
    -------
    The entry of ``labels`` at the chosen bin. A value larger than every edge
    gets the label at the last edge position instead of failing.

    Raises
    ------
    IndexError
        If ``edges`` is empty or ``val`` is NaN, since no category can be
        chosen. The exception type matches what the bare ``[0][0]`` lookup raised.
    """
    edges = np.asarray(edges)
    hits = np.where(val <= edges)[0]
    if hits.size:
        time_bin = int(hits[0])
    elif edges.size == 0:
        raise IndexError('edges is empty; cannot assign a category')
    elif np.isnan(val):
        raise IndexError('val is NaN; cannot assign a category')
    else:
        # Beyond the last edge: use the final category.
        time_bin = int(edges.size - 1)

    return labels[time_bin]

# label every admit-to-ICU delay with its urgency category (default edges/labels)
days_adm_to_icu = df_patient_admit_icu['DAYS_ADM_TO_ICU']
df_patient_admit_icu['ICU_URGENCY'] = days_adm_to_icu.apply(get_time_bin_cats)

In [10]:
# Persist the complete (unsplit) frame. This snapshot is written before the
# LOS imputation performed in the following cell.
all_pickle_path = './data/pickle/preproc/df_patient_admit_icu__20210126_ALL_final.pkl'
df_patient_admit_icu.to_pickle(all_pickle_path)

In [11]:
from sklearn.impute import SimpleImputer

# Fill missing LOS values with the column median.
# NOTE(review): the median is fit on the whole frame, before the train/test
# split in the next cell, so test rows contribute to it; confirm this is acceptable.
los_frame = df_patient_admit_icu[['LOS']]
simp = SimpleImputer(strategy='median')
simp.fit(los_frame)
df_patient_admit_icu[['LOS']] = simp.transform(los_frame)

In [12]:
from sklearn.model_selection import train_test_split

labels = ['immediate','urgent','questionable','stable']

# 80/20 split, stratified on the urgency class with a fixed seed.
# X is the full frame, so it still contains ICU_URGENCY itself.
urgency_target = df_patient_admit_icu['ICU_URGENCY']
X_train, X_test, y_train, y_test = train_test_split(
    df_patient_admit_icu,
    urgency_target,
    test_size=0.2,
    random_state=4,
    stratify=urgency_target,
)

In [13]:
import pickle

# Save the training split as a single (X_train, y_train) tuple.
# The with-block closes the file handle once the dump finishes (or fails);
# the bare open(...) used previously was never explicitly closed.
file = './data/pickle/preproc/df_patient_admit_icu__20210130_TRAIN_final.pkl'
with open(file, 'wb') as train_fh:
    pickle.dump((X_train, y_train), train_fh)

In [14]:
# Save the held-out split as a single (X_test, y_test) tuple.
# The with-block closes the file handle once the dump finishes (or fails);
# the bare open(...) used previously was never explicitly closed.
file = './data/pickle/preproc/df_patient_admit_icu__20210130_TEST_final.pkl'
with open(file, 'wb') as test_fh:
    pickle.dump((X_test, y_test), test_fh)