In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import functools
import logging
import os
import pathlib

import numpy as np
import pandas as pd

import clue

In [None]:
# Build a name -> path lookup from the 'paths' column of ../directory_paths.csv.
# Each cell is evaluated as a Python expression and the results are passed to dict(),
# so every entry presumably evaluates to a (key, path) pair -- TODO confirm the file format.
# NOTE(review): eval() executes whatever code the CSV contains; if the cells are plain
# literals, ast.literal_eval would be a safer replacement -- confirm before switching.
dirs_dct = dict(list(pd.read_csv('../directory_paths.csv')['paths'].apply(eval)))


In [3]:
# Directory that receives the split CSVs written further down in this notebook.
output_dir = pathlib.Path(dirs_dct["data_dir"])

In [4]:
# Load the admissions table from hadms.csv through the project loader.
hadms_csv_path = os.path.join(dirs_dct['data_dir'], 'hadms.csv')
all_hadms = clue.load_hadms(hadms_csv_path)

In [5]:
# Read the metabolic-syndrome cohort definition from the data directory.
metabolic_cohort_path = os.path.join(dirs_dct['data_dir'], 'metabolic_syndrome_full_cohort.csv')
metabolic_cohort = pd.read_csv(metabolic_cohort_path)

In [6]:
# Keep only admissions that also appear in the metabolic cohort (inner join on the
# admission id) and carry over the cohort's label and chart time. The original
# all_hadms columns come first, followed by the label, the two ids and CHARTTIME.
cohort_fields = metabolic_cohort[['new_HADM', 'label', 'CHARTTIME']]
joined_hadms = all_hadms.reset_index().merge(
    cohort_fields,
    left_on='hadm_id',
    right_on='new_HADM',
    how='inner',
)
kept_columns = list(all_hadms.columns) + ['label', 'hadm_id', 'subject_id', 'CHARTTIME']
metabolic_hadms = joined_hadms[kept_columns]

In [7]:
# Expose the cohort chart time under the 'timestamp' column name as well.
metabolic_hadms = metabolic_hadms.assign(timestamp=metabolic_hadms['CHARTTIME'])

In [8]:
# One row per admission: keep the first occurrence of every hadm_id.
metabolic_hadms = metabolic_hadms.drop_duplicates(subset='hadm_id', keep='first')

In [9]:
# Persist the (hadm_id, timestamp) pairs; this file is what the stratifier is pointed at later.
metabolic_hadms_path = os.path.join(dirs_dct['data_dir'], 'metabolic_hadms.csv')
metabolic_hadms[['hadm_id', 'timestamp']].to_csv(metabolic_hadms_path, index=False)

In [10]:
# Index by (subject_id, hadm_id) and keep the all_hadms columns plus the label.
feature_columns = list(all_hadms.columns) + ['label']
metabolic_hadms = metabolic_hadms.set_index(['subject_id', 'hadm_id'])[feature_columns]

In [11]:
# Threshold handed to the bucket search for both columns.
govf_threshold = 0.9

# find_optimal_buckets returns a pair; only its second element (the bucket edges) is used here.
_discarded, age_buckets = clue.find_optimal_buckets(
    metabolic_hadms, "adjusted_age", govf_threshold
)
print('Determined (adjusted) age buckets as:', age_buckets)

_discarded, oasis_buckets = clue.find_optimal_buckets(
    metabolic_hadms, "adjusted_oasis", govf_threshold
)
print('Determined (adjusted) oasis buckets as:', oasis_buckets)

Determined (adjusted) age buckets as: [15.049966, 38.3436, 56.071182, 71.92882, 90.0]
Determined (adjusted) oasis buckets as: [0.0, 11.0, 30.0, 64.0]


In [12]:
# Sanity check on metabolic_hadms.csv: total rows vs. distinct admission ids.
# The file is parsed once and the same column is reused for both counts; the two
# numbers are equal when every hadm_id occurs exactly once.
metabolic_ids_on_disk = pd.read_csv(os.path.join(dirs_dct['data_dir'], 'metabolic_hadms.csv'))['hadm_id']
(len(metabolic_ids_on_disk), len(metabolic_ids_on_disk.unique()))

(16571, 16571)

In [13]:
# The outermost edges found by the bucket search are swapped for fixed bounds
# (0 and 99 for age, 0 and 999 for OASIS); only the interior edges are data-driven.
metabolic_age_edges = [0] + age_buckets[1:-1] + [99]
metabolic_oasis_edges = [0] + oasis_buckets[1:-1] + [999]

# One mapping per column handed to the stratifier.
buckets = {
    'adjusted_age': clue.BucketMapping(metabolic_age_edges),
    'gender': clue.CategoryMapping(['F', 'M']),
    'icu': clue.CategoryMapping(['CCU', 'CSRU', 'MICU', 'SICU', 'TSICU', 'NICU']),
    'adjusted_oasis': clue.BucketMapping(metabolic_oasis_edges),
    'ethnicity': clue.CategoryMapping(
        ['ASIAN', 'BLACK', 'HISPANIC/LATINO', 'OTHER', 'UNKNOWN', 'WHITE']
    ),
    'insurance': clue.CategoryMapping(
        ['Government', 'Medicaid', 'Medicare', 'Private', 'Self Pay']
    ),
    'admission_type': clue.CategoryMapping(['ELECTIVE', 'EMERGENCY', 'URGENT']),
    'admission_location': clue.CategoryMapping(
        ['CLINIC REFERRAL', 'OTHER', 'PHYS REFERRAL', 'TRANSFER FROM HOSP']
    ),
    'label': clue.BooleanMapping(),
}

# Named splits with their weights (presumably relative proportions, i.e. 8:1:1 -- TODO confirm).
split_weights = {'train': 8, 'test': 1, 'valid': 1}
splitter = clue.NamedStratifier(split_weights)

# Pre-bind the admissions table and the splitter; callers supply the CSV path and the mappings.
stratify = functools.partial(clue.stratify, hadms=metabolic_hadms, splitter=splitter)

In [14]:
# Write one CSV per split returned by the stratifier, named metabolic.<split>.csv.
metabolic_hadms_csv = os.path.join(dirs_dct['data_dir'], 'metabolic_hadms.csv')
for split, df in stratify(metabolic_hadms_csv, buckets):
    # A column called 'index' (if reset_index produces one) is relabelled to 'hadm_id'.
    split_frame = df.reset_index().rename(columns={'index': 'hadm_id'})
    split_frame.to_csv(output_dir / f'metabolic.{split}.csv', index=False)

In [15]:
def _load_metabolic_split(split_name, subject_lookup):
    """Read one written metabolic split and attach subject_id to each admission.

    Args:
        split_name: Split label used in the file name ('train', 'valid' or 'test').
        subject_lookup: Frame holding the 'subject_id' and 'hadm_id' columns.

    Returns:
        The split frame left-joined to ``subject_lookup`` on 'hadm_id'.
    """
    split_frame = pd.read_csv(output_dir / f'metabolic.{split_name}.csv')
    return split_frame.merge(subject_lookup, on='hadm_id', how='left')


# Derived once and shared by all three loads instead of being rebuilt per split.
hadm_subject_lookup = all_hadms.reset_index()[['subject_id', 'hadm_id']]

train_metabolic = _load_metabolic_split('train', hadm_subject_lookup)
val_metabolic = _load_metabolic_split('valid', hadm_subject_lookup)
test_metabolic = _load_metabolic_split('test', hadm_subject_lookup)

In [16]:
# Subjects present in both the validation and the test split; an empty set means no overlap.
# NOTE(review): the training split is not part of this comparison.
val_subjects = set(val_metabolic['subject_id'])
test_subjects = set(test_metabolic['subject_id'])
val_subjects & test_subjects
# (len(train_metabolic),len(val_metabolic),len(test_metabolic))

set()

In [17]:
# Row count vs. distinct admission ids in the training split.
train_hadm_ids = train_metabolic['hadm_id']
(len(train_hadm_ids), len(train_hadm_ids.unique()))

(13218, 13218)

In [18]:
# Read the NICU cohort definition from the data directory.
nicu_cohort_path = os.path.join(dirs_dct['data_dir'], 'nicu_full_cohort.csv')
nicu_cohort = pd.read_csv(nicu_cohort_path)


In [19]:
# Raw MIMIC tables used to enrich the NICU cohort.
mimic_dir = dirs_dct['mimic_dir']

admissions_df = pd.read_csv(os.path.join(mimic_dir, 'ADMISSIONS.csv'))

# PATIENTS.csv backs both names below. It is parsed a single time and the second
# name receives its own copy, so the two frames remain independent objects.
patients_df = pd.read_csv(os.path.join(mimic_dir, 'PATIENTS.csv'))
subjects_df = patients_df.copy()

In [20]:
# ICU stays, reduced to the first listed stay per hospital admission.
icustays_path = os.path.join(dirs_dct['mimic_dir'], 'ICUSTAYS.csv')
stays_df = pd.read_csv(icustays_path).drop_duplicates(subset=['HADM_ID'], keep='first')

In [21]:
# OASIS table, reduced to the first listed row per hospital admission.
oasis_path = os.path.join(dirs_dct['data_dir'], 'oasis.csv')
oasis_df = pd.read_csv(oasis_path).drop_duplicates(subset=['hadm_id'], keep='first')

In [22]:
# Attach ICU admission time and first care unit to each cohort admission (left join).
stay_fields = stays_df[['HADM_ID', 'INTIME', 'FIRST_CAREUNIT']]
nicu_cohort = nicu_cohort.merge(
    stay_fields,
    how='left',
    left_on='new_HADM',
    right_on='HADM_ID',
)

In [23]:
# Row count vs. distinct admission ids after the ICU-stay join.
cohort_hadm_ids = nicu_cohort['new_HADM']
(len(cohort_hadm_ids), len(cohort_hadm_ids.unique()))

(7614, 7614)

In [24]:
# Attach admission-level attributes from ADMISSIONS to each cohort admission (left join).
admission_fields = admissions_df[
    ['HADM_ID', 'ADMITTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'ETHNICITY', 'INSURANCE']
]
nicu_cohort = nicu_cohort.merge(
    admission_fields,
    how='left',
    left_on='new_HADM',
    right_on='HADM_ID',
)

In [25]:
# Attach the OASIS value to each cohort admission (left join, so unmatched rows stay).
oasis_fields = oasis_df[['hadm_id', 'oasis']]
nicu_cohort = nicu_cohort.merge(
    oasis_fields,
    how='left',
    left_on='new_HADM',
    right_on='hadm_id',
)

In [26]:
# Columns carried forward into the NICU admissions table, in output order.
nicu_hadm_columns = [
    'new_HADM',
    'SUBJECT_ID',
    'AGE',
    'GENDER',
    'ADMITTIME',
    'INTIME',
    'FIRST_CAREUNIT',
    'ETHNICITY',
    'INSURANCE',
    'ADMISSION_TYPE',
    'ADMISSION_LOCATION',
    'oasis',
    'label',
]
nicu_hadms = nicu_cohort[nicu_hadm_columns]

In [27]:
# Row count vs. distinct admission ids in nicu_hadms.csv.
# Both counts come from the copy under data_dir (the location this notebook writes to),
# not from a file of the same name in the working directory, and the CSV is parsed once.
# NOTE(review): nicu_hadms.csv is written by a to_csv call further down, so on a fresh
# kernel this cell relies on a file left behind by an earlier run.
nicu_ids_on_disk = pd.read_csv(os.path.join(dirs_dct['data_dir'], 'nicu_hadms.csv'))['hadm_id']
(len(nicu_ids_on_disk), len(nicu_ids_on_disk.unique()))

(7614, 7614)

In [28]:
# Row count vs. distinct admission ids in data_dir/nicu_hadms.csv, parsed a single time.
# NOTE(review): nicu_hadms.csv is written by a to_csv call further down, so on a fresh
# kernel this cell relies on a file left behind by an earlier run.
nicu_ids_recheck = pd.read_csv(os.path.join(dirs_dct['data_dir'], 'nicu_hadms.csv'))['hadm_id']
(len(nicu_ids_recheck), len(nicu_ids_recheck.unique()))

(7614, 7614)

In [29]:
# Aliases applied after lower-casing, so the keys are the lower-case column names.
nicu_column_aliases = {
    'new_hadm': 'hadm_id',
    'intime': 'icutime',
    'first_careunit': 'icu',
}
lowercased_nicu = nicu_hadms.rename(columns=str.lower)
nicu_hadms = lowercased_nicu.rename(columns=nicu_column_aliases)

In [30]:
# Store admission ids as integers.
nicu_hadms = nicu_hadms.assign(hadm_id=nicu_hadms['hadm_id'].astype(int))

In [31]:
# Index the NICU table by (subject_id, hadm_id).
nicu_index_keys = ['subject_id', 'hadm_id']
nicu_hadms = nicu_hadms.set_index(nicu_index_keys)

In [32]:
# Disabled step, kept for reference: it would divide every 'age' value by 365
# (presumably a days-to-years conversion -- TODO confirm the unit of 'age').
# nicu_hadms['age']=nicu_hadms['age'].apply(lambda x: x/365)

In [33]:
# Threshold handed to the bucket search for both columns.
govf_threshold = 0.9

# find_optimal_buckets returns a pair; only its second element (the bucket edges) is used here.
# NOTE(review): the messages say "(adjusted)" although the columns passed are 'age' and 'oasis'.
_discarded, age_buckets = clue.find_optimal_buckets(
    nicu_hadms, "age", govf_threshold
)
print('Determined (adjusted) age buckets as:', age_buckets)

_discarded, oasis_buckets = clue.find_optimal_buckets(
    nicu_hadms, "oasis", govf_threshold
)
print('Determined (adjusted) oasis buckets as:', oasis_buckets)

Determined (adjusted) age buckets as: [0.0, 0.51805556, 3.7013888, 13.657639, 29.665277]
Determined (adjusted) oasis buckets as: [6.0, nan, nan]


  sdcm = sum([np.sum((classified - classified.mean()) ** 2) for classified in array_sort])
  ret = ret.dtype.type(ret / rcount)


In [34]:
# Inspect the gender column of the NICU table.
nicu_hadms.loc[:, 'gender']

subject_id  hadm_id
28711       104132     M
1291        141087     F
476         161042     F
28319       158388     F
27526       134575     F
                      ..
11400       109331     M
24254       118562     M
13243       143841     M
25913       147240     M
8982        192942     M
Name: gender, Length: 7614, dtype: object

In [35]:
# Raw ethnicity string -> coarse category
# (ASIAN, BLACK, HISPANIC/LATINO, OTHER, UNKNOWN, WHITE).
ethnicities_dict = {
    'ASIAN - OTHER': 'ASIAN',
    'UNKNOWN/NOT SPECIFIED': 'UNKNOWN',
    'ASIAN': 'ASIAN',
    'WHITE': 'WHITE',
    'BLACK/AFRICAN AMERICAN': 'BLACK',
    'AMERICAN INDIAN/ALASKA NATIVE': 'OTHER',
    'UNABLE TO OBTAIN': 'UNKNOWN',
    'OTHER': 'OTHER',
    'MULTI RACE ETHNICITY': 'OTHER',
    'HISPANIC OR LATINO': 'HISPANIC/LATINO',
    'ASIAN - CHINESE': 'ASIAN',
    'BLACK/HAITIAN': 'BLACK',
    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 'OTHER',
    'PATIENT DECLINED TO ANSWER': 'UNKNOWN',
    'BLACK/CAPE VERDEAN': 'BLACK',
    'HISPANIC/LATINO - PUERTO RICAN': 'HISPANIC/LATINO',
    'PORTUGUESE': 'WHITE',
    'BLACK/AFRICAN': 'BLACK',
    'ASIAN - ASIAN INDIAN': 'ASIAN',
    'WHITE - OTHER EUROPEAN': 'WHITE',
    'MIDDLE EASTERN': 'WHITE',
    'WHITE - BRAZILIAN': 'WHITE',
    'HISPANIC/LATINO - MEXICAN': 'HISPANIC/LATINO',
    'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)': 'HISPANIC/LATINO',
    'ASIAN - VIETNAMESE': 'ASIAN',
    'ASIAN - FILIPINO': 'ASIAN',
    'ASIAN - JAPANESE': 'ASIAN',
    'HISPANIC/LATINO - GUATEMALAN': 'HISPANIC/LATINO',
    'HISPANIC/LATINO - SALVADORAN': 'HISPANIC/LATINO',
    'SOUTH AMERICAN': 'HISPANIC/LATINO',
    'ASIAN - KOREAN': 'ASIAN',
}

In [36]:
# Collapse raw ethnicity strings onto the coarse categories listed in ethnicities_dict.
# Values that already are a target category map to themselves, so running this cell a
# second time is harmless (a plain dictionary lookup would fail on e.g. 'BLACK').
_ethnicity_lookup = {category: category for category in ethnicities_dict.values()}
_ethnicity_lookup.update(ethnicities_dict)

_mapped_ethnicity = nicu_hadms['ethnicity'].map(_ethnicity_lookup)
_unmapped_ethnicities = nicu_hadms.loc[_mapped_ethnicity.isna(), 'ethnicity'].unique()
if len(_unmapped_ethnicities):
    # Same exception type as a failed dictionary lookup, but reporting every offender at once.
    raise KeyError(f'Unmapped ethnicity values: {list(_unmapped_ethnicities)}')

nicu_hadms['ethnicity'] = _mapped_ethnicity

In [37]:
# Distinct admission types present in the NICU table, in order of first appearance.
pd.unique(nicu_hadms['admission_type'])

array(['NEWBORN', 'EMERGENCY', 'URGENT', 'ELECTIVE'], dtype=object)

In [38]:
# Persist (hadm_id, timestamp) pairs for the NICU cohort, with the ICU admission time as timestamp.
nicu_hadms_path = os.path.join(dirs_dct['data_dir'], 'nicu_hadms.csv')
nicu_timestamps = nicu_hadms.reset_index().rename(columns={'icutime': 'timestamp'})
nicu_timestamps[['hadm_id', 'timestamp']].to_csv(nicu_hadms_path, index=False)


In [39]:
# All edges found by the search except the last one are kept; the upper bound is fixed at 30.
nicu_age_edges = age_buckets[0:-1] + [30]

# One mapping per column handed to the stratifier. The 'icu' and 'adjusted_oasis'
# mappings are left disabled below.
buckets = {
    'age': clue.BucketMapping(nicu_age_edges),
    'gender': clue.CategoryMapping(['F', 'M']),
#     'icu': clue.CategoryMapping(['NICU']),
# #     'adjusted_oasis': clue.BucketMapping([0] + oasis_buckets[1:-1] + [999]),
    'ethnicity': clue.CategoryMapping(
        ['ASIAN', 'BLACK', 'HISPANIC/LATINO', 'OTHER', 'UNKNOWN', 'WHITE']
    ),
    'insurance': clue.CategoryMapping(
        ['Government', 'Medicaid', 'Medicare', 'Private', 'Self Pay']
    ),
    'admission_type': clue.CategoryMapping(['NEWBORN', 'EMERGENCY', 'URGENT', 'ELECTIVE']),
    'admission_location': clue.CategoryMapping(
        [
            'PHYS REFERRAL/NORMAL DELI',
            'CLINIC REFERRAL/PREMATURE',
            '** INFO NOT AVAILABLE **',
            'HMO REFERRAL/SICK',
            'TRANSFER FROM HOSP/EXTRAM',
        ]
    ),
    'label': clue.BooleanMapping(),
}

# Named splits with their weights (presumably relative proportions, i.e. 2:1:1 -- TODO confirm).
split_weights = {'train': 2, 'test': 1, 'valid': 1}
splitter = clue.NamedStratifier(split_weights)

# Pre-bind the NICU table and the splitter; callers supply the CSV path and the mappings.
stratify = functools.partial(clue.stratify, hadms=nicu_hadms, splitter=splitter)

In [40]:
# Write one CSV per split returned by the stratifier, named nicu.<split>.csv.
nicu_hadms_csv = os.path.join(dirs_dct['data_dir'], 'nicu_hadms.csv')
for split, df in stratify(nicu_hadms_csv, buckets):
    # A column called 'index' (if reset_index produces one) is relabelled to 'hadm_id'.
    split_frame = df.reset_index().rename(columns={'index': 'hadm_id'})
    split_frame.to_csv(output_dir / f'nicu.{split}.csv', index=False)

In [41]:
# Read back the NICU training split that was just written, for inspection.
nicu_train_path = output_dir / 'nicu.train.csv'
pd.read_csv(nicu_train_path)

Unnamed: 0,hadm_id,timestamp
0,104692,2100-08-03 08:55:52
1,108473,2100-08-25 09:30:55
2,135605,2100-08-29 22:58:46
3,111007,2100-09-09 13:31:08
4,119916,2100-09-17 15:11:18
...,...,...
3775,187343,2201-05-15 08:47:39
3776,161900,2201-05-21 08:43:00
3777,189638,2201-05-21 14:14:20
3778,122461,2201-06-07 14:44:41


In [42]:
# Display the admissions table returned by clue.load_hadms.
all_hadms

Unnamed: 0_level_0,Unnamed: 1_level_0,age,gender,admittime,icutime,icu,oasis,ethnicity,insurance,admission_type,admission_location,adjusted_oasis,adjusted_age
subject_id,hadm_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
142,131357,48.087611,F,2143-04-01 07:15:00,2143-04-01 14:19:37,CSRU,27,WHITE,Private,ELECTIVE,PHYS REFERRAL,27,48.087611
161,121804,72.000000,M,2173-05-26 16:03:00,2173-06-09 21:54:27,SICU,27,OTHER,Medicare,EMERGENCY,CLINIC REFERRAL,0,72.000000
256,153771,79.969884,M,2166-07-21 23:38:00,2166-07-22 01:10:31,MICU,21,WHITE,Medicare,EMERGENCY,OTHER,21,79.969884
518,161635,49.004791,M,2111-09-21 16:00:00,2111-09-21 16:01:26,MICU,17,BLACK,Medicare,EMERGENCY,OTHER,17,49.004791
704,151175,80.240931,M,2176-03-24 14:26:00,2176-03-31 02:44:00,CCU,40,UNKNOWN,Medicare,URGENT,TRANSFER FROM HOSP,0,80.240931
...,...,...,...,...,...,...,...,...,...,...,...,...,...
98514,101117,74.666667,M,2121-09-08 20:12:00,2121-09-08 20:13:13,CCU,57,WHITE,Medicare,EMERGENCY,OTHER,57,74.666667
99384,168087,85.552361,M,2117-12-15 18:12:00,2117-12-15 18:13:36,TSICU,38,WHITE,Medicare,EMERGENCY,OTHER,38,90.000000
99423,174021,50.173854,F,2116-08-14 12:15:00,2116-08-17 20:56:15,TSICU,25,WHITE,Private,ELECTIVE,PHYS REFERRAL,0,50.173854
99458,121483,67.214237,M,2169-04-03 08:14:00,2169-04-03 08:15:10,MICU,25,WHITE,Medicare,EMERGENCY,CLINIC REFERRAL,25,67.214237
