In [1]:
import pandas as pd

In [2]:
#1. Load patients
# import patient info
data_dir = 'data/physionet.org/files/mimiciii/1.4/'
patient_file = 'PATIENTS.csv'
df_patients = pd.read_csv(data_dir + patient_file)

In [3]:
df_patients=df_patients.drop(['ROW_ID','DOD_HOSP','DOD_SSN'],axis=1)

In [4]:
# convert date strings to datetime
df_patients.DOB = pd.to_datetime(df_patients.DOB,format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
df_patients.DOD = pd.to_datetime(df_patients.DOD,format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')

In [5]:
# import admissions info
admissions_file = 'ADMISSIONS.csv'
df_admissions = pd.read_csv(data_dir + admissions_file)
df_admissions = df_admissions.drop(['ROW_ID','RELIGION','LANGUAGE','MARITAL_STATUS','ETHNICITY'],axis=1)

In [6]:
# convert time strings to datetime
df_admissions.ADMITTIME = pd.to_datetime(df_admissions.ADMITTIME,format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
df_admissions.DISCHTIME = pd.to_datetime(df_admissions.DISCHTIME,format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
df_admissions.EDREGTIME = pd.to_datetime(df_admissions.EDREGTIME,format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
df_admissions.EDOUTTIME = pd.to_datetime(df_admissions.EDOUTTIME,format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')

In [7]:
# calculate length of hospital stay
df_admissions['HOSPITAL_DAYS'] = (df_admissions['DISCHTIME'] - df_admissions['ADMITTIME']).dt.total_seconds()/(24*60*60)
# negative admit days = dead on arrival, remove
doa_idx = df_admissions[df_admissions['HOSPITAL_DAYS']<0].index
df_admissions = df_admissions.drop(doa_idx,axis=0)

In [8]:
# merge patient and admissions df
df_patient_admit = df_patients.merge(df_admissions,how='left',left_on=['SUBJECT_ID'],right_on=['SUBJECT_ID'])

In [9]:
# calculate age at admit, age at discharge
df_patient_admit['ADMIT_AGE'] = df_patient_admit['ADMITTIME'].dt.year - df_patient_admit['DOB'].dt.year
df_patient_admit['DISCH_AGE'] = df_patient_admit['DISCHTIME'].dt.year - df_patient_admit['DOB'].dt.year

In [10]:
# 2. Remove patients <X
# remove patients <X yo from df
age = 10
child_idx = df_patient_admit[df_patient_admit['ADMIT_AGE']<age].index
child_patients = df_patient_admit.iloc[child_idx]['SUBJECT_ID'].unique()
df_patient_admit = df_patient_admit.drop(child_idx, axis=0)

In [11]:
# 3. Load icustays
# import icu stays info
icustays_file = 'ICUSTAYS.csv'
df_icustays = pd.read_csv(data_dir + icustays_file)

child_idx = df_icustays[df_icustays['SUBJECT_ID'].isin(child_patients)].index
df_icustays = df_icustays.drop(child_idx,axis=0)

df_icustays = df_icustays.drop(['ROW_ID'],axis=1)
df_icustays.INTIME = pd.to_datetime(df_icustays.INTIME,format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
df_icustays.OUTTIME = pd.to_datetime(df_icustays.OUTTIME,format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')

In [12]:
#join admissions and icu stays
df_admit_icu = df_admissions.merge(df_icustays,how='left',left_on=['SUBJECT_ID','HADM_ID'],right_on=['SUBJECT_ID','HADM_ID'])

In [13]:
cols_to_rmv = list(set(df_admit_icu.columns) & set(df_patient_admit))
# keep SUBJECT_ID and HADM_ID for merge
sbj_idx = cols_to_rmv.index('SUBJECT_ID')
cols_to_rmv.pop(sbj_idx)
h_idx = cols_to_rmv.index('HADM_ID')
cols_to_rmv.pop(h_idx)

df_admit_icu = df_admit_icu.drop(cols_to_rmv,axis=1)

In [14]:
df_patient_admit_icu = df_patient_admit.merge(df_admit_icu,how='left',left_on=['SUBJECT_ID','HADM_ID'],right_on=['SUBJECT_ID','HADM_ID'])

In [15]:
df_patient_admit_icu['DAYS_ADM_TO_ICU'] = (df_patient_admit_icu['INTIME'] - df_patient_admit_icu['ADMITTIME']).dt.total_seconds()/(24*60*60)

In [16]:
# 4. Load prescriptions
# import prescriptions info
# data_dir = 'data/physionet.org/files/mimiciii/1.4/'
prescrips_file = 'PRESCRIPTIONS.csv'
df_prescrips = pd.read_csv(data_dir + prescrips_file,low_memory=False)

df_prescrips = df_prescrips.drop(['ROW_ID','GSN','DRUG','DRUG_NAME_POE','DRUG_NAME_GENERIC','FORMULARY_DRUG_CD'],axis=1)
ndc_nan_idx = df_prescrips[df_prescrips['NDC'].isna()].index
df_prescrips=df_prescrips.drop(ndc_nan_idx,axis=0)

df_prescrips['NDC'] = df_prescrips['NDC'].astype('int64')

In [17]:
child_idx = df_prescrips[df_prescrips['SUBJECT_ID'].isin(child_patients)].index
df_prescrips = df_prescrips.drop(child_idx,axis=0)

In [18]:
df_prescrips.ENDDATE = pd.to_datetime(df_prescrips.ENDDATE,format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
df_prescrips.STARTDATE = pd.to_datetime(df_prescrips.STARTDATE,format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')

In [19]:
df_patient_admit_icu_prescrip = df_patient_admit_icu.merge(df_prescrips,how='left',left_on=['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID'],right_on=['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID'])

In [20]:
df_patient_admit_icu_prescrip['DAYS_ADM_TO_DRUG'] = (df_patient_admit_icu_prescrip['STARTDATE'] - df_patient_admit_icu_prescrip['ADMITTIME']).dt.total_seconds()/(24*60*60)

In [21]:
df_patient_admit_icu_prescrip['DAYS_DRUG_BEFORE_ICU'] = df_patient_admit_icu_prescrip['DAYS_ADM_TO_ICU'] - df_patient_admit_icu_prescrip['DAYS_ADM_TO_DRUG']

In [22]:
df_patient_admit_icu_prescrip_drugsfirst = df_patient_admit_icu_prescrip[df_patient_admit_icu_prescrip['DAYS_DRUG_BEFORE_ICU']>0]

In [23]:
prescrips_by_combo_before_icu = df_patient_admit_icu_prescrip_drugsfirst.groupby(['SUBJECT_ID','HADM_ID','ICUSTAY_ID']).agg({'NDC': ['count', list]})

In [24]:
prescrips_by_combo_before_icu.sort_values(by=('NDC','count'),ascending=False).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NDC,NDC
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,list
SUBJECT_ID,HADM_ID,ICUSTAY_ID,Unnamed: 3_level_2,Unnamed: 4_level_2
60122,122883.0,237390.0,133,"[409672924.0, 2821501.0, 63323001302.0, 633230..."
65610,132065.0,299319.0,123,"[61553008348.0, 61553008348.0, 10019002710.0, ..."
78532,177956.0,240761.0,122,"[517760425.0, 409662502.0, 10019016312.0, 6155..."
92947,190453.0,201667.0,118,"[517293025.0, 641049325.0, 904516561.0, 517293..."
98525,168505.0,294539.0,114,"[55390000401.0, 0.0, 944049101.0, 944049101.0,..."


In [25]:
# find all subject, hadm, icu_stay where count ==1 and remove (since no interaction)
prescrips_by_combo_before_icu = prescrips_by_combo_before_icu[prescrips_by_combo_before_icu[('NDC',  'count')]>1]

df_keep_ix = prescrips_by_combo_before_icu.index
df_keep_subjs = [x[0] for x in df_keep_ix]
df_keep_hadm = [x[1] for x in df_keep_ix]
df_keep_icustay = [x[2] for x in df_keep_ix]

df_ix1 = df_patient_admit_icu_prescrip_drugsfirst['SUBJECT_ID'].apply(lambda x: x in df_keep_subjs)
df_ix2 = df_patient_admit_icu_prescrip_drugsfirst['HADM_ID'].apply(lambda x: x in df_keep_hadm)
df_ix3 = df_patient_admit_icu_prescrip_drugsfirst['ICUSTAY_ID'].apply(lambda x: x in df_keep_icustay)

df_patient_admit_icu_prescrip_drugsfirst = df_patient_admit_icu_prescrip_drugsfirst[df_ix1 & df_ix2 & df_ix3]

In [26]:
# look at all Y-   of drugs and relationship to 'DAYS_DRUG_BEFORE_ICU'
drugs = df_patient_admit_icu_prescrip['NDC'].unique()

In [27]:
drugs = [x for x in drugs if x==x and x!=0.0]

In [None]:
drug_combos = []
mn_days_drug_b4_icu = []
se_days_drug_b4_icu = []
combo_total = []
for ix1, drug1 in enumerate(drugs):
    if ix1 % 100 == 0:
        print('Iteration {} out of {}...'.format(ix1,len(drugs)))
    for ix2 in range(ix1+1,len(drugs)):
        if ix2 % 500 == 0:
            print('     Subiteration {} out of {}...'.format(ix2,len(drugs)))
            
        drug2 = drugs[ix2]
        drug_combos.append((drug1,drug2))
        
        drg1_in = prescrips_by_combo_before_icu[('NDC',  'list')].apply(lambda x: drug1 in x)
        drg2_in = prescrips_by_combo_before_icu[('NDC',  'list')].apply(lambda x: drug2 in x)
        both_ix = prescrips_by_combo_before_icu.index[drg1_in & drg2_in]
        
        both_subjs = [x[0] for x in both_ix]
        both_hadm = [x[1] for x in both_ix]
        both_icustay = [x[2] for x in both_ix]
        
        df_ix1 = df_patient_admit_icu_prescrip_drugsfirst['SUBJECT_ID'].apply(lambda x: x in both_subjs)
        df_ix2 = df_patient_admit_icu_prescrip_drugsfirst['HADM_ID'].apply(lambda x: x in both_hadm)
        df_ix3 = df_patient_admit_icu_prescrip_drugsfirst['ICUSTAY_ID'].apply(lambda x: x in both_icustay)
        
        combo_total = sum(df_ix1 & df_ix2 & df_ix3)
        mn_days_drug_b4_icu.append(df_patient_admit_icu_prescrip_drugsfirst[df_ix1 & df_ix2 & df_ix3].DAYS_DRUG_BEFORE_ICU.mean())
        se_days_drug_b4_icu.append(df_patient_admit_icu_prescrip_drugsfirst[df_ix1 & df_ix2 & df_ix3].DAYS_DRUG_BEFORE_ICU.sem())

        

Iteration 0 out of 3853...
     Subiteration 500 out of 3853...
     Subiteration 1000 out of 3853...
     Subiteration 1500 out of 3853...
     Subiteration 2000 out of 3853...
     Subiteration 2500 out of 3853...
     Subiteration 3000 out of 3853...
     Subiteration 3500 out of 3853...
     Subiteration 500 out of 3853...
     Subiteration 1000 out of 3853...
     Subiteration 1500 out of 3853...
     Subiteration 2000 out of 3853...
     Subiteration 2500 out of 3853...
     Subiteration 3000 out of 3853...
     Subiteration 3500 out of 3853...
     Subiteration 500 out of 3853...
     Subiteration 1000 out of 3853...


In [39]:
df_patient_admit_icu_prescrip_columnsgsfirst['NDC'] = df_patient_admit_icu_prescrip_drugsfirst['NDC'].astype('int64').to_string()

ValueError: invalid literal for int() with base 10: '357          338004903\n358          338004904\n359                  0\n360          641040025\n361           74176230\n362          517260225\n363          517391025\n364          310030011\n365    

In [36]:
sum(df_patient_admit_icu_prescrip['DRUG_BEFORE_ICU'])/len(df_patient_admit_icu_prescrip)

0.1958167939957583

In [15]:
# keep any row where ICUSTAY_ID is NaN (i.e.,)
df_prescrips['ICUSTAY_ID'].isna()

0.34830696724243115

In [11]:
child_idx = df_prescrips[df_prescrips['SUBJECT_ID'].isin(child_patients)].index
df_prescrips = df_prescrips.drop(child_idx,axis=0)

In [None]:
# convert time strings to datetime
df_prescrips.STARTDATE = pd.to_datetime(df_prescrips.STARTDATE,format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
df_prescrips.ENDDATE = pd.to_datetime(df_prescrips.ENDDATE,format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')

In [None]:
# Get only admits that occur 
