In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# 1. Load patients
# Folder holding the MIMIC-III 1.4 CSV files. It keeps its trailing slash
# because file names are appended to it by plain string concatenation.
data_dir = 'data/physionet.org/files/mimiciii/1.4/'
patient_file = 'PATIENTS.csv'

# read the patient table
patients_path = f'{data_dir}{patient_file}'
df_patients = pd.read_csv(patients_path)

In [3]:
# remove the row id, the DOD* columns and EXPIRE_FLAG from the patient table
patient_cols_to_drop = ['ROW_ID', 'DOD', 'DOD_HOSP', 'DOD_SSN', 'EXPIRE_FLAG']
df_patients = df_patients.drop(columns=patient_cols_to_drop)

In [4]:
# parse the DOB strings into datetimes; values that do not match the expected
# layout are coerced to NaT instead of raising
df_patients['DOB'] = pd.to_datetime(
    df_patients['DOB'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)

In [5]:
# read the admissions table from the same data folder as the patient table
admissions_file = 'ADMISSIONS.csv'
admissions_path = f'{data_dir}{admissions_file}'
df_admissions = pd.read_csv(admissions_path)

In [6]:
# discard admission columns that won't be used for predicting ICU stays
admission_cols_to_drop = ['ROW_ID', 'DEATHTIME', 'DISCHARGE_LOCATION', 'EDREGTIME', 'EDOUTTIME']
df_admissions = df_admissions.drop(columns=admission_cols_to_drop)

In [7]:
# parse the admission and discharge timestamps; strings that do not match the
# expected layout are coerced to NaT instead of raising
for time_col in ('ADMITTIME', 'DISCHTIME'):
    df_admissions[time_col] = pd.to_datetime(
        df_admissions[time_col],
        format='%Y-%m-%d %H:%M:%S',
        errors='coerce',
    )

In [8]:
# Collapse the detailed ETHNICITY labels into a small set of broad groups.
# Each key is the group name; its list holds the raw labels assigned to it.
ethnicites_dict = {
    'WHITE': [
        'WHITE',
        'WHITE - RUSSIAN',
        'WHITE - OTHER EUROPEAN',
        'WHITE - EASTERN EUROPEAN',
        'WHITE - BRAZILIAN',
        'PORTUGUESE',
    ],
    'BLACK': [
        'BLACK/AFRICAN AMERICAN',
        'BLACK/AFRICAN',
        'BLACK/HAITIAN',
        'BLACK/CAPE VERDEAN',
    ],
    'UNKNOWN': [
        'UNKNOWN/NOT SPECIFIED',
        'PATIENT DECLINED TO ANSWER',
        'UNABLE TO OBTAIN',
    ],
    'ASIAN': [
        'ASIAN',
        'ASIAN - CHINESE',
        'ASIAN - VIETNAMESE',
        'ASIAN - CAMBODIAN',
        'ASIAN - FILIPINO',
        'ASIAN - KOREAN',
        'ASIAN - THAI',
        'ASIAN - JAPANESE',
        'ASIAN - OTHER',
    ],
    'INDIAN': ['ASIAN - ASIAN INDIAN'],
    'OTHER': ['OTHER', 'SOUTH AMERICAN', 'CARIBBEAN ISLAND'],
    'HISPANIC/LATINO': [
        'HISPANIC OR LATINO',
        'HISPANIC/LATINO - GUATEMALAN',
        'HISPANIC/LATINO - PUERTO RICAN',
        'HISPANIC/LATINO - DOMINICAN',
        'HISPANIC/LATINO - SALVADORAN',
        'HISPANIC/LATINO - COLOMBIAN',
        'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)',
        'HISPANIC/LATINO - HONDURAN',
        'HISPANIC/LATINO - CUBAN',
        'HISPANIC/LATINO - MEXICAN',
    ],
    'MULTIRACE': ['MULTI RACE ETHNICITY'],
    'MIDDLE EASTERN': ['MIDDLE EASTERN'],
    'AMERICAN NATIVE': [
        'AMERICAN INDIAN/ALASKA NATIVE',
        'AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE',
        'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER',
    ],
}

def replace_ethval_todictval(x,ethnicites_dict):
    """Return the group name whose label list contains ``x``.

    Parameters
    ----------
    x : object
        Raw label to look up.
    ethnicites_dict : dict
        Maps a group name to the list of raw labels belonging to that group.

    Returns
    -------
    The first key (in dictionary order) whose list contains ``x``, or ``None``
    when no list contains it.
    """
    matching_groups = (
        group for group, labels in ethnicites_dict.items() if x in labels
    )
    return next(matching_groups, None)

In [9]:
# map every raw ETHNICITY label to its simple-form group (the matching key of
# ethnicites_dict); labels found in none of the lists come back as None
df_admissions['ETHNICITY'] = df_admissions['ETHNICITY'].apply(
    lambda raw_label: replace_ethval_todictval(raw_label, ethnicites_dict)
)

In [10]:
# fill missing categorical values with an explicit placeholder per column
values = {
    'RELIGION': 'NOT SPECIFIED',
    'LANGUAGE': '***',
    'MARITAL_STATUS': 'UNKNOWN (DEFAULT)',
    'ETHNICITY': 'UNKNOWN',
    'INSURANCE': 'Not Specified',
    'ADMISSION_LOCATION': '** INFO NOT AVAILABLE **',
    'ADMISSION_TYPE': 'UNKNOWN',
}
df_admissions = df_admissions.fillna(values)

# fold the redundant 'UNOBTAINABLE' religion label into 'NOT SPECIFIED'
religion = df_admissions['RELIGION']
df_admissions['RELIGION'] = religion.str.replace('UNOBTAINABLE', 'NOT SPECIFIED')

In [11]:
# length of the hospital stay in (fractional) days
seconds_per_day = 24 * 60 * 60
stay_length = df_admissions['DISCHTIME'] - df_admissions['ADMITTIME']
df_admissions['HOSPITAL_DAYS'] = stay_length.dt.total_seconds() / seconds_per_day

# a negative stay length is treated as dead on arrival; remove those admissions
is_doa = df_admissions['HOSPITAL_DAYS'] < 0
doa_idx = is_doa[is_doa].index
df_admissions = df_admissions.drop(index=doa_idx)

In [12]:
# attach each patient's admissions; the left join keeps patients that have no
# matching admission row
df_patient_admit = pd.merge(df_patients, df_admissions, how='left', on='SUBJECT_ID')

In [13]:
# calculate age at admit, in completed years
# A plain difference of calendar years is one too high whenever the admission
# falls before that year's birthday, so that case is subtracted out. Only the
# calendar fields are compared; the two timestamps are never subtracted directly.
admit_time = df_patient_admit['ADMITTIME']
birth_date = df_patient_admit['DOB']
birthday_pending = (
    (admit_time.dt.month < birth_date.dt.month)
    | (
        (admit_time.dt.month == birth_date.dt.month)
        & (admit_time.dt.day < birth_date.dt.day)
    )
)
# rows with a missing ADMITTIME or DOB compare as False above, and their year
# difference is NaN, so they stay NaN here
df_patient_admit['ADMIT_AGE'] = (
    admit_time.dt.year - birth_date.dt.year - birthday_pending.astype('int64')
)
# no longer need DOB, remove
df_patient_admit = df_patient_admit.drop(['DOB'], axis=1)

In [14]:
# 2. Remove patients <age

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
# not removing any patient at this point (for saving purposes), can load 
# in and make modifications later as desired
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 

# minimum age to keep; -1 means no row is removed at this point
age = -1
child_idx = df_patient_admit[df_patient_admit['ADMIT_AGE']<age].index
# child_idx holds index *labels*, so it has to be used with label-based .loc;
# positional .iloc only gives the same rows while the index is a default
# 0..n-1 range
child_patients = df_patient_admit.loc[child_idx, 'SUBJECT_ID'].unique()
df_patient_admit = df_patient_admit.drop(child_idx, axis=0)

# convert ages >200 to nan because these ages are obscured (purposefully to prevent
# PHI from being released)
df_patient_admit.loc[df_patient_admit['ADMIT_AGE']>200,'ADMIT_AGE'] = np.nan

In [15]:
# 3. Load icustays
# read the ICU stay table from the same data folder as the other tables
icustays_file = 'ICUSTAYS.csv'
icustays_path = f'{data_dir}{icustays_file}'
df_icustays = pd.read_csv(icustays_path)

# discard the ICU stays of every patient listed in child_patients
is_child_stay = df_icustays['SUBJECT_ID'].isin(child_patients)
child_idx = is_child_stay[is_child_stay].index
df_icustays = df_icustays.drop(index=child_idx)

# The first/last care units, ward ids and out times describe the ICU stay
# itself, so they will not be helpful in predicting ICU stays.
# ICUSTAY length-of-stay (LOS) is kept in case it is wanted as an outcome variable.
icu_cols_to_drop = [
    'ROW_ID',
    'FIRST_CAREUNIT',
    'LAST_CAREUNIT',
    'FIRST_WARDID',
    'LAST_WARDID',
    'OUTTIME',
]
df_icustays = df_icustays.drop(columns=icu_cols_to_drop)

# parse the ICU in-time strings; values that do not match the layout become NaT
df_icustays['INTIME'] = pd.to_datetime(
    df_icustays['INTIME'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)

In [16]:
# join patients + admissions with their ICU stays; the left join keeps
# admissions that have no ICU stay
df_patient_admit_icu = pd.merge(
    df_patient_admit,
    df_icustays,
    how='left',
    on=['SUBJECT_ID', 'HADM_ID'],
)

In [17]:
# (fractional) days elapsed between hospital admission and ICU admission
seconds_per_day = 24 * 60 * 60
admit_to_icu = df_patient_admit_icu['INTIME'] - df_patient_admit_icu['ADMITTIME']
df_patient_admit_icu['DAYS_ADM_TO_ICU'] = admit_to_icu.dt.total_seconds() / seconds_per_day

In [18]:
# remove every row whose icu intime was before its hospital admittime
icu_came_first = df_patient_admit_icu['DAYS_ADM_TO_ICU'] < 0
icu_first_idx = icu_came_first[icu_came_first].index
df_patient_admit_icu = df_patient_admit_icu.drop(index=icu_first_idx)

In [19]:
# not dropping now in case we want to simulate capacity
# # drop time columns no longer needed
# df_patient_admit_icu = df_patient_admit_icu.drop(['DISCHTIME','INTIME'],axis=1)

In [20]:
# keep only the rows that have a DAYS_ADM_TO_ICU value
has_icu_delay = df_patient_admit_icu['DAYS_ADM_TO_ICU'].notna()
df_patient_admit_icu = df_patient_admit_icu.loc[has_icu_delay].copy()

In [21]:
# 1st through 100th percentile of the admit-to-ICU delay
percentile_ranks = np.arange(1, 101)
adm_to_ic_100p = np.percentile(df_patient_admit_icu['DAYS_ADM_TO_ICU'], percentile_ranks)

In [22]:
def get_time_bins(val,edges):
    """Return the index of the first entry of ``edges`` that is >= ``val``.

    Parameters
    ----------
    val : scalar
        Value to place into a bin.
    edges : numpy.ndarray
        Bin edges compared element-wise against ``val``.

    Returns
    -------
    Integer position of the first edge satisfying ``val <= edge``.

    Raises
    ------
    IndexError
        If no edge satisfies ``val <= edge``.
    """
    at_or_above = val <= edges
    candidate_positions = np.nonzero(at_or_above)[0]
    # the first hit is the bin; indexing an empty result raises IndexError
    return candidate_positions[0]

In [23]:
# categorical outcome variables for the time from hospital admit to icu stay
days_to_icu = df_patient_admit_icu['DAYS_ADM_TO_ICU']

# 1 when the ICU admission came at most one day after the hospital admission, else 0
df_patient_admit_icu['SAMEDAY_ADM_TO_ICU'] = (days_to_icu <= 1).astype('int64')

# position of each delay among the percentile edges in adm_to_ic_100p
df_patient_admit_icu['ADM_TO_ICU_100p'] = days_to_icu.apply(
    lambda delay: get_time_bins(delay, adm_to_ic_100p)
)

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
# # not doing now, since can do this with np.array([1,3,5,7,9]) as bins on 'ADM_TO_ICU_10'
# # admits in 5-bin quantiles
# df_patient_admit_icu['ADM_TO_ICU_5'] = df_patient_admit_icu['ADM_TO_ICU_10'].apply(get_time_bins, 
#                                                                                    args=(np.array([1,3,5,7,9]),))
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 

In [24]:
# admits in 3200 equal-width time bins spanning the observed range
# (~1.5 hours each according to the original note)
delay_min = df_patient_admit_icu['DAYS_ADM_TO_ICU'].min()
delay_max = df_patient_admit_icu['DAYS_ADM_TO_ICU'].max()
# 3201 evenly spaced edges; the first one (the minimum itself) is dropped so
# each remaining edge is the upper bound of one bin
adm_to_ic_90m = np.linspace(delay_min, delay_max, 3201)[1:]
df_patient_admit_icu['ADM_TO_ICU_90m'] = df_patient_admit_icu['DAYS_ADM_TO_ICU'].apply(
    lambda delay: get_time_bins(delay, adm_to_ic_90m)
)

In [50]:
# persist the preprocessed frame to disk
pickle_path = './data/pickle/preproc/df_patient_admit_icu__20210119.pkl'
# to_pickle does not create missing folders, so make sure the target folder
# exists before writing (otherwise the write fails after all the work above)
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
df_patient_admit_icu.to_pickle(pickle_path)

In [32]:
# show the (rows, columns) dimensions of the final preprocessed frame
# NOTE(review): the output recorded under this cell is a column Index that
# still lists EXPIRE_FLAG (dropped from df_patients earlier), so it does not
# belong to this expression -- re-run the cell to refresh it
df_patient_admit_icu.shape

Index(['SUBJECT_ID', 'GENDER', 'EXPIRE_FLAG', 'HADM_ID', 'ADMITTIME',
       'DISCHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE',
       'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'DIAGNOSIS',
       'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA', 'HOSPITAL_DAYS',
       'ADMIT_AGE', 'ICUSTAY_ID', 'DBSOURCE', 'INTIME', 'LOS',
       'DAYS_ADM_TO_ICU', 'SAMEDAY_ADM_TO_ICU', 'ADM_TO_ICU_100p',
       'ADM_TO_ICU_90m'],
      dtype='object')

In [1]:
# print the quotient of each (numerator, denominator) pair, one per line
ratio_inputs = [
    (104, 8.2),
    (264, 52.7),
    (488, 554.2),
    (776, 576.0),
    (1128, 602.4),
    (1544, 640.9),
    (2024, 682.8),
    (2568, 730.6),
    (3176, 784.4),
    (3848, 844.3),
    (4584, 917.9),
]
for numerator, denominator in ratio_inputs:
    print(numerator / denominator)

12.682926829268293
5.009487666034155
0.8805485384337783
1.3472222222222223
1.8725099601593627
2.409112185988454
2.9642647920328065
3.514919244456611
4.0489546149923505
4.557621698448419
4.994008061880379
