# Data Preprocessing

In [None]:
#imports
import torch
import pandas as pd
from datetime import datetime
from datetime import timedelta
from math import ceil
import gzip

print(pd.__version__)

1.3.5


## Load MIMIC-III tables



In [None]:
# from google.colab import drive
# drive.mount('/gdrive')
# %cd /gdrive/MyDrive/598-DLH/mimic
# # %ls

In [None]:
#for local
%cd mimic-iii-clinical-database-1.4/
%ls

/mnt/d/CS598DLH/mimic-iii-clinical-database-1.4
[0m[01;32mADMISSIONS.csv.gz[0m*          [01;32mOUTPUTEVENTS.csv.gz[0m*
[01;32mCALLOUT.csv.gz[0m*             [01;32mPATIENTS.csv.gz[0m*
[01;32mCAREGIVERS.csv.gz[0m*          [01;32mPRESCRIPTIONS.csv.gz[0m*
[01;32mCHARTEVENTS.csv.gz[0m*         [01;32mPROCEDUREEVENTS_MV.csv.gz[0m*
[01;32mCPTEVENTS.csv.gz[0m*           [01;32mPROCEDURES_ICD.csv.gz[0m*
[01;32mDATETIMEEVENTS.csv.gz[0m*      [01;32mREADME.md[0m*
[01;32mDIAGNOSES_ICD.csv.gz[0m*       [01;32mSERVICES.csv.gz[0m*
[01;32mDRGCODES.csv.gz[0m*            [01;32mSHA256SUMS.txt[0m*
[01;32mD_CPT.csv.gz[0m*               [01;32mTRANSFERS.csv.gz[0m*
[01;32mD_ICD_DIAGNOSES.csv.gz[0m*     [01;32mchart.pkl[0m*
[01;32mD_ICD_PROCEDURES.csv.gz[0m*    [01;32mchart_tensor_filled_final.pt.gz[0m*
[01;32mD_ITEMS.csv[0m*                [01;32mchecksum_md5_unzipped.txt[0m*
[01;32mD_ITEMS.csv.gz[0m*             [01;32mchecksum_md5_zipped.txt[0m*
[01

In [None]:
# Divisor applied to each table's row budget: 1 reads the full budget,
# N reads only the first 1/N of it.
READ_PROPORTION = 1

# Row budget per table (presumably the full row count of each file -- confirm).
_table_row_budget = {
    'ADMISSIONS.csv.gz': 58976,
    'PATIENTS.csv.gz': 46520,
    'ICUSTAYS.csv.gz': 61532,
    'DIAGNOSES_ICD.csv.gz': 651047,
}
ad, patients, icu, diagnoses = (
    pd.read_csv(_filename, nrows=_rows // READ_PROPORTION, compression='gzip')
    for _filename, _rows in _table_row_budget.items()
)
# Dictionary table should be loaded entirely
# d_items = pd.read_csv('D_ITEMS.csv.gz', nrows=12487 // 1, compression='gzip') 
# d_icd_diag = pd.read_csv('D_ICD_DIAGNOSES.csv.gz', nrows=14710 // 1, compression='gzip')

# READ_PROPORTION_FOR_NOTES = 5
# READ_PROPORTION_FOR_CHARTEVENT = 200
# chart = pd.read_csv('CHARTEVENTS.csv.gz', nrows=330712483 // READ_PROPORTION_FOR_CHARTEVENT, compression='gzip')
# notes = pd.read_csv('NOTEEVENTS.csv.gz', nrows=2083180 // READ_PROPORTION_FOR_NOTES, compression='gzip') #only 1619465 has charttime

## Utility functions

In [None]:
def timedelta_hours(diff: timedelta):
    """Return the whole hours contained in *diff*, rounded toward -infinity.

    The result is a float (e.g. ``2.0``) because it comes from floor division
    of the float returned by ``total_seconds()``.
    """
    whole_hours, _leftover_seconds = divmod(diff.total_seconds(), 3600)
    return whole_hours

def timedelta_hours_roundup(diff: timedelta):
    """Return the duration of *diff* in hours, rounded up to the next integer."""
    seconds_per_hour = 3600
    fractional_hours = diff.total_seconds() / seconds_per_hour
    return ceil(fractional_hours)

def get_index_dict(values_pdSeries):
    """Map each distinct value of a pandas Series to a 0-based position.

    Positions follow the order in which ``Series.unique()`` returns the
    values, i.e. order of first appearance.
    """
    position_of = {}
    for position, value in enumerate(values_pdSeries.unique()):
        position_of[value] = position
    return position_of

In [None]:
## Use the following function to get the itemid of the important features

# d_items.columns = d_items.columns.str.lower()

# def search_feature(search_str, mode='contains'):
#   if mode == 'contains':
#     search_result = d_items[d_items['label'].str.contains("(?i)^.*" + search_str + ".*$")==True]
#   elif mode == 'exact':
#     search_result = d_items[d_items['label'].str.contains("(?i)^" + search_str + "$")==True]

#   # search_result
#   print('Matching labels:')
#   match_labels = list(search_result['label'])
#   if len(match_labels)<=10:
#     print(match_labels)
#   else:
#     n = len(match_labels) // 10
#     m = len(match_labels) % 10
#     for i in range(n):
#       # for j in range(10):
#       print(match_labels[i*10 : (i+1)*10])
#     print(match_labels[n*10:])
    
#   return search_result[['itemid', 'label', 'dbsource']]

# search_feature('platelets', mode='contains')


## Patients, Admissions

To create Patients objects and Hadm objects, we first need to filter the tables (by age, length of stay, etc.)

### Get is_sepsis for each Hadm

In [None]:
# Normalise column names, then flag diagnosis rows carrying one of the two
# ICD-9 codes this notebook treats as sepsis.
diagnoses.columns = diagnoses.columns.str.lower()
diagnoses['is_sepsis'] = diagnoses['icd9_code'].isin(['99591', '99592'])

# An admission is septic if any of its diagnosis rows is flagged
# (max over booleans acts as "any").
sepsis_by_hadm = diagnoses.groupby('hadm_id')['is_sepsis'].max()
hadm_sepsis = sepsis_by_hadm.reset_index()
hadm_set_diagnoses = set(hadm_sepsis['hadm_id'])

In [None]:
hadm_sepsis

Unnamed: 0,hadm_id,is_sepsis
0,100009,False
1,100023,False
2,100044,False
3,100045,False
4,100055,False
...,...,...
3545,199833,False
3546,199901,False
3547,199917,False
3548,199943,True


### Filter patients, admissions, icustays


#### Get Patients
We need the date of birth (DOB) to compute each patient's age, which is then used for filtering.

In [None]:
# Parse the date of birth into datetimes so that age can be computed later.
patients.columns = patients.columns.str.lower()
# .copy() makes the filtered frame independent of the original, so adding the
# dob2 column below does not trigger pandas' SettingWithCopyWarning.
patients = patients[patients['subject_id'].notna()].copy()
patients['dob2'] = patients['dob'].map(datetime.fromisoformat)

#### Filter Hadm by age



In [None]:
ad.columns = ad.columns.str.lower()
# Parse admission/discharge times into datetimes for the age calculation later.
ad['admittime2'] = ad['admittime'].map(datetime.fromisoformat)
ad['dischtime2'] = ad['dischtime'].map(datetime.fromisoformat)

# Keep admissions whose admission diagnosis text does not mention sepsis.
# na=False treats a missing diagnosis as "no match", so those admissions are
# kept; comparing the raw result with `== False` silently dropped them because
# NaN == False evaluates to False.
mentions_sepsis = ad['diagnosis'].str.contains('sepsis', case=False, na=False)
ad_filtered = ad.loc[
    ~mentions_sepsis,
    ['subject_id', 'hadm_id', 'admittime2', 'dischtime2', 'deathtime', 'diagnosis'],
]

# Attach gender and date of birth to every remaining admission.
ad_patient = pd.merge(ad_filtered, patients[['subject_id', 'gender', 'dob2']], how='inner', on='subject_id')

#function to calculate age, adopted from https://www.codingem.com/how-to-calculate-age-in-python/
def age(dob, as_of_date):
    """Return the age in completed years on *as_of_date* for someone born on *dob*.

    Any computed age of 300 or more is reported as 91.
    """
    years = as_of_date.year - dob.year
    # The birthday has not yet occurred in the as-of year: one year less.
    if (as_of_date.month, as_of_date.day) < (dob.month, dob.day):
        years -= 1
    return 91 if years >= 300 else years

# Age at admission, computed row by row from the parsed timestamps.
admit_and_birth = ad_patient[['admittime2', 'dob2']]
ad_patient['age'] = admit_and_birth.apply(lambda row: age(row['dob2'], row['admittime2']), axis=1)

# Keep ages strictly between 18 and 90, and only the columns used later.
age_in_range = (ad_patient['age'] < 90) & (ad_patient['age'] > 18)
ad_patient = ad_patient.loc[age_in_range, ['hadm_id', 'gender', 'age', 'admittime2']]

# hadm_set_ad_patient = set(ad_patient['hadm_id'])

In [None]:
ad_patient

Unnamed: 0,hadm_id,gender,age,admittime2
0,165315,F,64,2196-04-09 12:26:00
1,152223,M,71,2153-09-03 07:15:00
2,124321,M,75,2157-10-18 19:34:00
3,161859,M,39,2139-06-06 16:14:00
4,129635,M,58,2160-11-02 02:06:00
...,...,...,...,...
57163,190603,M,78,2127-11-07 11:00:00
57164,105447,M,87,2132-12-24 20:06:00
57165,191113,F,19,2131-03-30 21:13:00
57166,101071,F,83,2151-03-05 20:00:00


#### Filter ICUStay by length of stay

In [None]:
icu.columns = icu.columns.str.lower()
# Drop stays missing an in-time, out-time or stay id. .copy() makes the result
# an independent frame so that adding the parsed-time columns below does not
# trigger pandas' SettingWithCopyWarning.
has_required_fields = icu['intime'].notnull() & icu['outtime'].notnull() & icu['icustay_id'].notna()
icu = icu[has_required_fields].copy()
icu['intime2'] = icu['intime'].map(datetime.fromisoformat)
icu['outtime2'] = icu['outtime'].map(datetime.fromisoformat)
#icu stays between 8 hours and 1 month (los is compared in days: 8/24 and 30)
los_in_range = (icu['los'] > 8 / 24) & (icu['los'] < 30)
icu_filtered = icu.loc[los_in_range, ['hadm_id', 'icustay_id', 'intime2', 'outtime2', 'los']]
hadm_set_icu = set(icu_filtered['hadm_id'])

In [None]:
icu_filtered

Unnamed: 0,hadm_id,icustay_id,intime2,outtime2,los
0,110404,280836,2198-02-14 23:27:38,2198-02-18 05:26:11,3.2490
1,106296,206613,2170-11-05 11:05:29,2170-11-08 17:46:57,3.2788
2,188028,220345,2128-06-24 15:05:20,2128-06-27 12:32:29,2.8939
3,173727,249196,2120-08-07 23:12:42,2120-08-10 00:39:04,2.0600
4,164716,210407,2186-12-25 21:08:04,2186-12-27 12:01:13,1.6202
...,...,...,...,...,...
3071,145300,228293,2191-09-28 19:46:00,2191-10-07 06:25:00,8.4438
3072,187607,219553,2137-08-06 12:28:23,2137-08-08 17:06:51,2.1934
3073,106076,212319,2119-02-08 17:56:38,2119-02-16 20:55:25,8.1242
3074,196785,238621,2197-04-05 11:21:22,2197-04-08 19:49:37,3.3530


#### Compose demographic features

In [None]:
# One row per ICU stay of every admission that passed the age filter.
demog = ad_patient.merge(icu_filtered, how='inner', on='hadm_id')

# Whole hours between hospital admission and ICU entry.
admit_to_icu_in = demog['intime2'] - demog['admittime2']
demog['hours_admit_icuin'] = admit_to_icu_in.map(timedelta_hours)

# Encode gender numerically: 'F' -> 0, anything else -> 1.
demog['gender'] = demog['gender'].map(lambda g: int(g != 'F'))

demog_columns = ['hadm_id', 'gender', 'age', 'icustay_id', 'los', 'hours_admit_icuin']
demog = demog[demog_columns]
demog

Unnamed: 0,hadm_id,gender,age,icustay_id,los,hours_admit_icuin
0,165315,0,64,204798,1.1438,0.0
1,152223,1,71,227807,1.2641,2.0
2,124321,1,75,234044,1.1862,64.0
3,161859,1,39,262236,0.5124,0.0
4,129635,1,58,203487,3.5466,1.0
...,...,...,...,...,...,...
47588,190603,1,78,262848,2.2888,-1.0
47589,105447,1,87,244147,1.2382,0.0
47590,191113,0,19,210188,0.8778,0.0
47591,101071,0,83,294783,0.6202,0.0


### Create data tensor

#### Load chartevents in batches

In [None]:
# CHARTEVENTS itemid values to keep; rows whose itemid is not in this list are
# discarded while the file is read in batches. The position of an itemid in
# this list is later used as its index on the feature axis of the chart tensor.
features = [211, 220045, 220277, 676, 223762, 51, 220050, 53, 8368, 220051, 618, 220210, 1817, \
            74, 224828, 812, 3420, 2981, 7459, 6003, 778, 834, 770, 220587, 1162, 5876, 225624, 3728, 225612, \
            1523, 1525, 220615, 4948, 225651, 1529, 220621, 1531, 225668, 1532, 220635, 4381, 1535, 227442, \
            225690, 851, 227429, 3761, 220545, 814, 220228, 1533, 227466, 1542, 220546, 1528, 227468, 828, 30006] #removed 225170, 225925, 228640 since they actually don't exist in data
# added 30006 for sepsis label calculation
# Features that don't actually have data in chartevents and should be removed before rerun: [5876, 4948, 4381, 30006]

In [None]:
# itemid -> 0-based position on the feature axis (order of first appearance in `features`).
features_idx_dict = get_index_dict(pd.Series(features))
# features_idx_dict

In [None]:
# Batching plan for CHARTEVENTS: fixed-size batches and the row offset at
# which each batch starts.
batch_size = 22000000
num_batch = 330712483 // batch_size + 1
skip = [batch_number * batch_size for batch_number in range(num_batch)]

# Read a single row just to learn the column names.
header_probe = pd.read_csv('CHARTEVENTS.csv.gz', nrows=1, compression='gzip')
colnames = header_probe.columns.str.lower()
header_probe.columns = colnames

# Columns that are not needed downstream.
drop_labels = ['storetime', 'cgid', 'valueuom', 'warning', 'error', 'resultstatus', 'stopped']

# Start from an empty frame that has only the retained columns.
chart = header_probe.drop(columns=drop_labels).drop(index=0)
chart

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,itemid,charttime,value,valuenum


In [None]:
%%time

# Stream CHARTEVENTS once, batch_size rows at a time, keeping only the rows
# whose itemid is one of the selected features.
#
# Using `chunksize` lets pandas consume the real header line. Passing
# `names=` together with `skiprows=0` turned the header into a data row for
# the first batch, which made `itemid` a string column at the start of the file
# so those rows never matched the integer ids in `features`. A single reader
# also avoids decompressing the gzip stream from the beginning for every batch.
important_batches = []
chart_reader = pd.read_csv('CHARTEVENTS.csv.gz', chunksize=batch_size, compression='gzip')
for i, batch in enumerate(chart_reader):
    print(f'Batch {i+1}...')
    batch.columns = batch.columns.str.lower()
    batch = batch.drop(labels=drop_labels, axis=1)
    important_batches.append(batch[batch['itemid'].isin(features)])

# Concatenate once at the end instead of re-copying the growing frame on every
# batch. If nothing was read, `chart` keeps its current (empty) value.
if important_batches:
    chart = pd.concat(important_batches)
del important_batches

Batch 1...


  exec(code_obj, self.user_global_ns, self.user_ns)


Batch 2...


  exec(code_obj, self.user_global_ns, self.user_ns)


Batch 3...
Batch 4...
Batch 5...
Batch 6...
Batch 7...
Batch 8...
Batch 9...


  exec(code_obj, self.user_global_ns, self.user_ns)


Batch 10...
Batch 11...
Batch 12...


  exec(code_obj, self.user_global_ns, self.user_ns)


Batch 13...
Batch 14...
Batch 15...
Batch 16...


In [None]:
chart.shape

(29921842, 8)

In [None]:
# Earlier attempt at cleaning the free-text `value` column, kept for reference:
# chart = chart[chart['value'].str.contains("[^\d\.\-\+]+")==False] #remove rows contains characters other than numeric
# chart = chart[chart['value'].str.contains("(?i)^[\D]+$")==False] #remove rows with only signs and no number
# chart['value'] = chart['value'].str.replace(r'(\D)\1+', r'\1') #replace repeated '+-.' chars
# chart = chart[chart['value'].str.contains("(?i)^[+-]*[\d.]+$")] #keep only rows that matches number format
# Only the numeric column is used from here on; store it as float32.
chart = chart.astype({'valuenum': 'float32'})

##### Backup and save to disk

In [None]:
# The filtered chart events are restored from a pickle written by the
# commented-out line below, instead of being recomputed.
# NOTE(review): read_pickle executes arbitrary code from the file -- only load
# pickles produced by this notebook.
# chart_backup = chart.copy()
# chart.to_pickle('chart.pkl') #save backup to disk
chart = pd.read_pickle('chart.pkl')

In [None]:
chart

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,itemid,charttime,value,valuenum
65537,61912,502,116367,209737.0,220210,2143-10-30 20:00:00,24.0,24.000000
65538,61913,502,116367,209737.0,220277,2143-10-30 20:00:00,93.0,93.000000
65539,61914,502,116367,209737.0,223762,2143-10-30 20:00:00,37.7,37.700001
65542,61917,502,116367,209737.0,220045,2143-10-30 21:00:00,110.0,110.000000
65543,61918,502,116367,209737.0,220050,2143-10-30 21:00:00,146.0,146.000000
...,...,...,...,...,...,...,...,...
13874593,255088864,32216,158834,231184.0,8368,2198-07-23 22:00:00,64,64.000000
13874599,255088870,32216,158834,231184.0,8368,2198-07-23 23:00:00,57,57.000000
13874628,255089862,32216,158834,231184.0,8368,2198-07-27 05:15:00,64,64.000000
13874635,255089869,32216,158834,231184.0,8368,2198-07-27 06:00:00,70,70.000000


#### Filter demog and chart to subset of hadm_ids from notes processing

In [None]:
#subset of hadm_id from noteevents processing, used to filter the rest of data
hadm_id_subset = {}

# len(hadm_id_subset)

hadm_id_subset_df = pd.DataFrame(hadm_id_subset)
hadm_id_subset_df.columns=['hadm_id']
hadm_id_subset_df

Unnamed: 0,hadm_id
0,131077.0
1,131089.0
2,131094.0
3,196632.0
4,163883.0
...,...
6065,131035.0
6066,131050.0
6067,131052.0
6068,196592.0


In [None]:
#filter demog to only hadm_ids in the subset
demog_subset = pd.merge(hadm_id_subset_df, demog, how='inner', on='hadm_id')
demog_subset

Unnamed: 0,hadm_id,gender,age,icustay_id,los,hours_admit_icuin
0,131118.0,1,60,207606,25.2547,480.0
1,131118.0,1,60,207393,2.9419,1443.0
2,131118.0,1,60,228140,1.8972,2191.0
3,131118.0,1,60,264919,4.4695,2467.0
4,131118.0,1,60,293299,18.2440,2718.0
...,...,...,...,...,...,...
1796,196517.0,1,59,213157,4.0027,1685.0
1797,130982.0,0,55,291718,2.2827,0.0
1798,131003.0,0,48,217305,1.8843,87.0
1799,196558.0,1,75,280368,1.8248,116.0


##### Final filtered icustay_ids and hadm_ids in demographics to calc tensor dimensions

In [None]:
# Distinct ICU stay ids that have at least one retained chart event.
unique_icustay_ids_from_chart = chart['icustay_id'].drop_duplicates().reset_index(drop=True)

# Final set of stays: in the demographic subset AND present in the chart data.
final_icustay_ids = demog_subset[['icustay_id']].merge(
    unique_icustay_ids_from_chart, how='inner', on='icustay_id'
)
# final_icustay_ids

# Full icustays rows for the final stays, kept as a lookup table.
final_icustays_details = final_icustay_ids.merge(icu, how='inner', on='icustay_id')
final_icustays_details

##### Backup and save to disk

In [None]:
# Both frames are restored from pickles written by the commented-out lines
# below, replacing whatever was computed in the previous cell.
# NOTE(review): read_pickle executes arbitrary code from the file -- only load
# pickles produced by this notebook.
# final_icustays_details.to_pickle('final_icustays_details.pkl') #save backup to disk
# final_icustay_ids.to_pickle('final_icustay_ids.pkl') #save backup to disk
final_icustays_details = pd.read_pickle('final_icustays_details.pkl')
final_icustay_ids = pd.read_pickle('final_icustay_ids.pkl')

In [None]:
# Demographic rows limited to the final set of ICU stays.
demog_final = demog_subset.merge(final_icustay_ids, how='inner', on='icustay_id')
demog_final

In [None]:
# Store icustay_id as int (presumably it became float in the merges above,
# since the chart-derived ids are displayed as floats -- TODO confirm).
demog_final = demog_final.astype({'icustay_id': int})

#### Create index lookup dictionary for populating data tensor

In [None]:
# icustays_4_dict_creation = pd.merge(chart_icuintime_final.groupby(['icustay_id']).size().reset_index()['icustay_id'], final_icustays_details, how='inner', on='icustay_id')
# icustays_4_dict_creation

#create index lookup dict to structure data tensors
# hadm_id -> {icustay_id -> position of that stay within the admission}
hadmid_icustay_idx_dict = {}
# icustay_id -> hadm_id, taken from the icustays details table
icustayid_to_hadmid_lookup_dict = {}

# Visiting stays ordered by admission and ICU in-time makes the position of a
# stay equal to its chronological rank within the admission.
stays_in_time_order = final_icustays_details.sort_values(['hadm_id', 'intime2'])
for _, stay in stays_in_time_order.iterrows():
  stay_hadm_id = stay['hadm_id']
  stay_icustay_id = stay['icustay_id']
  icustayid_to_hadmid_lookup_dict[stay_icustay_id] = stay_hadm_id
  stay_positions = hadmid_icustay_idx_dict.setdefault(stay_hadm_id, {})
  # The next free position is the number of stays recorded so far.
  stay_positions[stay_icustay_id] = len(stay_positions)
  
# hadmid_icustay_idx_dict
# icustayid_to_hadmid_lookup_dict

In [None]:
# verify if icustays are indexed in the time order
# print(hadmid_icustay_idx_dict[125487])
# final_icustays_details[final_icustays_details['hadm_id']==125487]

{222474: 0}


Unnamed: 0,icustay_id,row_id,subject_id,hadm_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,intime,outtime,los,intime2,outtime2
1477,222474,58056,89800,125487,metavision,CSRU,CSRU,15,15,2114-02-26 18:23:27,2114-02-27 18:11:54,0.992,2114-02-26 18:23:27,2114-02-27 18:11:54


#### Create demographics tensor

In [None]:
# calculate dimensions
# Size of the admission axis.
num_hadm_ids = final_icustays_details['hadm_id'].nunique()

# Size of the ICU-stay axis: the largest number of stays in any admission.
icu_stays_per_admission = final_icustays_details.groupby('hadm_id')['icustay_id'].count()
max_num_icustays = icu_stays_per_admission.max()

# Size of the hour axis: the longest stay, in hours rounded up.
# max_num_hours_from_icu_in = chart_icuintime['hour_from_icu_in'].max()
stay_durations = final_icustays_details['outtime2'] - final_icustays_details['intime2']
icuin_icuout_hours = stay_durations.map(timedelta_hours_roundup)
max_num_hours_in_icustay = icuin_icuout_hours.max()

# gender, age, los, hours_admit_icuin
num_demog_features = 4

# Position lookups for the admission axis and for individual stays.
hadm_id_idx_dict = get_index_dict(final_icustays_details['hadm_id'])
icustay_idx_dict = get_index_dict(final_icustay_ids['icustay_id'])

In [None]:
# (admission, icu stay, demographic feature); stays an admission does not have remain zero.
demog_tensor = torch.zeros(num_hadm_ids, max_num_icustays, num_demog_features)
# demog_tensor.shape

demog_feature_columns = ['gender', 'age', 'los', 'hours_admit_icuin']
for _, stay_row in demog_final.iterrows():
  admission_pos = hadm_id_idx_dict[stay_row['hadm_id']]
  stay_pos = hadmid_icustay_idx_dict[stay_row['hadm_id']][stay_row['icustay_id']]
  demog_tensor[admission_pos, stay_pos] = torch.tensor(stay_row[demog_feature_columns].values)

# Insert an hour axis and repeat the (time-invariant) demographics along it.
demog_tensor_whourdim = demog_tensor.unsqueeze(2).repeat(1, 1, int(max_num_hours_in_icustay), 1)

In [None]:
# All-ones mask with the same shape as the hour-expanded demographics tensor.
demog_tensor_mask = torch.ones_like(demog_tensor_whourdim)

#### Problem of multiple hadm_ids linked to 1 icustay_id
Solution: use the hadm_id ↔ icustay_id mapping from the icu table as the source of truth

In [None]:
# print(demog_icustayswchartevents['hadm_id'].nunique())
# print(chart_icuintime_final['hadm_id'].nunique())

# print(demog_icustayswchartevents['icustay_id'].nunique())
# print(chart_icuintime_final['icustay_id'].nunique())

# check = pd.merge(pd.merge(demog[['hadm_id', 'icustay_id']], final_icustay_ids, how='inner', on='icustay_id'), chart_icuintime_final[['hadm_id', 'icustay_id']], how='inner', on='icustay_id')
# print(check[check['hadm_id_x']!=check['hadm_id_y']]['icustay_id'].unique())

# demog_icustayswchartevents[demog_icustayswchartevents['icustay_id']==256504]

In [None]:
# chart[chart['icustay_id']==256504]['hadm_id'].unique()

In [None]:
# hadm_icustayid_check = pd.merge(chart.groupby(['hadm_id', 'icustay_id']).size().reset_index(), icu[['icustay_id', 'hadm_id']], how='inner', on='icustay_id')
# hadm_icustayid_check[hadm_icustayid_check['hadm_id_x']!=hadm_icustayid_check['hadm_id_y']]

In [None]:
# icu[icu['icustay_id']==214830]

In [None]:
# chart[(chart['icustay_id']==214830) & (chart['hadm_id']!=109444)]

#### Does larger icustay_id mean intime is later? No

In [None]:
# Admissions that have more than one ICU stay.
has_multiple_stays = icu.groupby('hadm_id')['icustay_id'].count() > 1
hadm_morethan1icu = has_multiple_stays[has_multiple_stays].reset_index()
# hadm_morethan1icu

# List their stays ordered by icustay_id to compare id order with intime order.
multi_stay_rows = icu.merge(hadm_morethan1icu['hadm_id'], how='inner', on='hadm_id')
multi_stay_rows.sort_values(['hadm_id', 'icustay_id'])[['hadm_id', 'icustay_id', 'intime']].head(50)

Unnamed: 0,hadm_id,icustay_id,intime
159,100055,215944,2150-07-06 12:43:34
160,100055,245659,2150-07-08 12:49:43
37,101757,237024,2133-01-09 12:18:30
36,101757,261027,2133-01-03 06:34:40
87,101829,205588,2179-10-11 11:39:21
85,101829,247844,2179-09-29 18:46:50
86,101829,289060,2179-10-07 23:51:40
10,102024,232807,2142-06-01 18:50:29
9,102024,240251,2142-05-20 17:40:15
328,102152,257079,2110-12-05 12:53:06


#### Create chartevents tensor

In [None]:
##### Attach each stay's ICU in-time to its chart events to derive hour_from_icu_in
stay_intimes = final_icustays_details[['icustay_id', 'intime2']]
chart_final = chart.merge(stay_intimes, how='inner', on='icustay_id')

# Parse the chart time, then count whole hours elapsed since ICU admission.
chart_final['charttime2'] = chart_final['charttime'].map(datetime.fromisoformat)
elapsed_since_icu_in = chart_final['charttime2'] - chart_final['intime2']
chart_final['hour_from_icu_in'] = elapsed_since_icu_in.map(timedelta_hours)

chart_final_columns = ['hadm_id', 'icustay_id', 'itemid', 'valuenum', 'charttime2', 'hour_from_icu_in']
chart_final = chart_final[chart_final_columns]

chart_final

Unnamed: 0,hadm_id,icustay_id,itemid,valuenum,charttime2,hour_from_icu_in
0,158591,298509.0,220277,95.0,2114-04-10 17:00:00,11.0
1,158591,298509.0,220045,95.0,2114-04-10 18:00:00,12.0
2,158591,298509.0,220210,21.0,2114-04-10 18:00:00,12.0
3,158591,298509.0,220277,96.0,2114-04-10 18:00:00,12.0
4,158591,298509.0,220045,87.0,2114-04-10 19:00:00,13.0
...,...,...,...,...,...,...
1233377,116756,299674.0,8368,78.0,2167-01-26 15:00:00,42.0
1233378,116756,299674.0,8368,90.0,2167-01-26 16:00:00,43.0
1233379,116756,299674.0,8368,94.0,2167-01-26 17:00:00,44.0
1233380,116756,299674.0,8368,43.0,2167-01-29 21:00:00,120.0


In [None]:
chart_final[chart_final['valuenum']==0]

Unnamed: 0,hadm_id,icustay_id,itemid,valuenum,charttime2,hour_from_icu_in
250,175734,286445.0,224828,0.0,2142-05-27 04:38:00,276.0
342,175734,286445.0,224828,0.0,2142-05-22 04:30:00,156.0
610,175734,286445.0,224828,0.0,2142-05-17 05:10:00,36.0
624,175734,286445.0,220210,0.0,2142-05-18 01:18:00,56.0
723,175734,286445.0,220210,0.0,2142-05-26 20:30:00,268.0
...,...,...,...,...,...,...
1228679,194801,279848.0,224828,0.0,2180-02-21 01:02:00,149.0
1228701,194801,279848.0,224828,0.0,2180-02-19 12:10:00,113.0
1228780,194801,279848.0,224828,0.0,2180-02-20 09:21:00,134.0
1228863,194801,279848.0,224828,0.0,2180-02-19 06:07:00,107.0


In [None]:
num_features = len(features)

# (admission, icu stay, hour since ICU in-time, feature), all zeros to start.
chart_tensor = torch.zeros(num_hadm_ids, max_num_icustays, max_num_hours_in_icustay, num_features)

# Same-shaped zero tensor that will be set to 1 wherever a value is written.
chart_tensor_mask = torch.zeros_like(chart_tensor)

chart_tensor.shape

torch.Size([1541, 5, 720, 58])

In [None]:
%%time
# Loop through the chart events and write each one into chart_tensor, setting
# the matching mask entry to 1. When several events land in the same cell, the
# one that comes last in chart_final wins.
num_hour_slots = chart_tensor.shape[2]
for i, r in chart_final.iterrows():
    value = r['valuenum']
    hour = r['hour_from_icu_in']
    # Skip events without a numeric value: a NaN stored with mask=1 would be
    # carried forward into later hours by the sample-and-hold step.
    if pd.isna(value) or pd.isna(hour):
        continue
    hour = int(hour)
    # Skip events outside the tensor's hour range. A negative hour (charted
    # before ICU in-time) would otherwise index from the END of the hour axis,
    # and an hour past the last slot would raise an IndexError.
    if not 0 <= hour < num_hour_slots:
        continue
    icustay_id = r['icustay_id']
    # Use the hadm_id recorded for the stay in the icustays table, not the one
    # on the chart row.
    real_hadm_id = icustayid_to_hadmid_lookup_dict[icustay_id]
    hadm_idx = hadm_id_idx_dict.get(real_hadm_id)
    if hadm_idx is None:
        continue
    stay_idx = hadmid_icustay_idx_dict[real_hadm_id][icustay_id]
    feature_idx = features_idx_dict[r['itemid']]
    chart_tensor[hadm_idx, stay_idx, hour, feature_idx] = value
    chart_tensor_mask[hadm_idx, stay_idx, hour, feature_idx] = 1

CPU times: user 1min 10s, sys: 0 ns, total: 1min 10s
Wall time: 1min 10s


#### Sample and hold

In [None]:
# Fill a copy so that chart_tensor keeps the raw, unfilled values.
chart_tensor_filled = chart_tensor.clone()

In [None]:
%%time

I, J, K, L = list(chart_tensor_filled.size())
# I, J, K, L 

# Sample and hold: every missing hour takes the most recent earlier value of the
# same (admission, stay, feature); hours before the first value stay 0.
# Vectorised over admissions, stays and hours, one feature at a time to bound
# memory. It produces the same result as visiting every element in a Python loop.
hour_index = torch.arange(K, device=chart_tensor_filled.device).view(1, 1, K)
hour_zero = torch.zeros_like(hour_index)

for l in range(L):
    print(f'feature {l}/{L}. ', end='')
    values = chart_tensor_filled[:, :, :, l]
    # "Missing" = a zero that was never written (mask is 0).
    missing = (values == 0) & (chart_tensor_mask[:, :, :, l] == 0)
    # For each hour, the index of the latest non-missing hour so far. If there
    # is none yet this is 0, and hour 0 is then itself missing and holds 0,
    # which is exactly the initial "last value".
    last_seen_hour = torch.where(missing, hour_zero, hour_index).cummax(dim=2).values
    chart_tensor_filled[:, :, :, l] = torch.gather(values, 2, last_seen_hour)

CPU times: user 1h 21min 4s, sys: 0 ns, total: 1h 21min 4s
Wall time: 2h 1min 13s


In [None]:
# #fix features that are actually 0 but mistakenly filled

# %%time

# for i, r in chart_final[chart_final['valuenum']==0].iterrows():
#     if hadm_id_idx_dict.get(r['hadm_id']) is None:
#         continue
#     chart_tensor[hadm_id_idx_dict[icustayid_to_hadmid_lookup_dict[r['icustay_id']]], hadmid_icustay_idx_dict[icustayid_to_hadmid_lookup_dict[r['icustay_id']]][r['icustay_id']], int(r['hour_from_icu_in']), features_idx_dict[r['itemid']]] = r['valuenum']
#     chart_tensor_mask[hadm_id_idx_dict[icustayid_to_hadmid_lookup_dict[r['icustay_id']]], hadmid_icustay_idx_dict[icustayid_to_hadmid_lookup_dict[r['icustay_id']]][r['icustay_id']], int(r['hour_from_icu_in']), features_idx_dict[r['itemid']]] = 1



In [None]:
print(chart_tensor[666, 0, 95:97, 1])
print(chart_tensor_filled[666, 0, 95:97, 1])
print(chart_tensor_mask[666, 0, 95:97, 1])

tensor([97.,  0.])
tensor([97., 97.])
tensor([1., 0.])


#### Population average

In [None]:
# Mean valuenum of every selected itemid over all retained chart events
# (NaN when an itemid has no usable values).
population_avg = {
    itemid: chart.loc[chart['itemid'] == itemid, 'valuenum'].mean()
    for itemid in features
}

In [None]:
chart[chart['itemid']==5876]['valuenum']
# chart_backup[chart_backup['itemid']==5876]['value']

5162516   NaN
Name: valuenum, dtype: float32

In [None]:
%%time

I, J, K, L = list(chart_tensor_filled.size())
# I, J, K, L 

# Replace the leading run of missing hours (before the first observed or
# non-zero value) of every (admission, stay, feature) with that feature's
# population average. Vectorised per feature; it produces the same result as
# scanning each series hour by hour and stopping at the first non-missing entry.
for l in range(L):
    print(f'feature {l}/{L}. ', end='')
    values = chart_tensor_filled[:, :, :, l]
    missing = (values == 0) & (chart_tensor_mask[:, :, :, l] == 0)
    # The cumulative product stays 1 only while every hour so far is missing.
    leading_gap = torch.cumprod(missing.to(torch.int64), dim=2).bool()
    # `values` is a view, so this writes through to chart_tensor_filled.
    values[leading_gap] = float(population_avg[features[l]])

feature 0/58. feature 1/58. feature 2/58. feature 3/58. feature 4/58. feature 5/58. feature 6/58. feature 7/58. feature 8/58. feature 9/58. feature 10/58. feature 11/58. feature 12/58. feature 13/58. feature 14/58. feature 15/58. feature 16/58. feature 17/58. feature 18/58. feature 19/58. feature 20/58. feature 21/58. feature 22/58. feature 23/58. feature 24/58. feature 25/58. feature 26/58. feature 27/58. feature 28/58. feature 29/58. feature 30/58. feature 31/58. feature 32/58. feature 33/58. feature 34/58. feature 35/58. feature 36/58. feature 37/58. feature 38/58. feature 39/58. feature 40/58. feature 41/58. feature 42/58. feature 43/58. feature 44/58. feature 45/58. feature 46/58. feature 47/58. feature 48/58. feature 49/58. feature 50/58. feature 51/58. feature 52/58. feature 53/58. feature 54/58. feature 55/58. feature 56/58. feature 57/58. CPU times: user 1h 14min 42s, sys: 0 ns, total: 1h 14min 42s
Wall time: 1h 14min 42s


In [None]:
# Spot check: print raw values, filled values and mask for one admission's
# first ICU stay over a range of hours.
# NOTE(review): chart_tensor_filled_final is only assigned in a later cell
# ("Remove features with no data"), so this cell fails when the notebook is run
# top to bottom. That tensor also has features removed, so feature_idx may not
# select the same feature in it as in chart_tensor / chart_tensor_mask.
hadm_id_test = 344
range1 = 0
range2 = 100
feature_idx = 25
print(chart_tensor[hadm_id_test, 0, range1:range2, feature_idx])
print(chart_tensor_filled_final[hadm_id_test, 0, range1:range2, feature_idx])
print(chart_tensor_mask[hadm_id_test, 0, range1:range2, feature_idx])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])
tensor([12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12.,
        12., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
        16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 15., 15., 15.,
        15., 15., 15., 15., 15., 15., 15., 15., 15., 15., 15., 15., 15., 15.,
        15., 15., 15., 15., 15., 15., 11., 11., 11., 11., 11., 11., 11., 11.,
        11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
        11., 11., 11., 14., 14., 14., 14., 14., 14., 14., 14., 14., 14., 14.,
        14., 

#### Remove features with no data from chart_tensor

In [None]:
# population_avg
# itemids listed as having no data in chartevents, and their positions on the feature axis.
features_no_data = [5876, 4948, 4381, 30006]
[features_idx_dict[f] for f in features_no_data]

[25, 32, 40, 57]

In [None]:
len(set(features_idx_dict.values()))

58

In [None]:
# Feature-axis positions to keep: all positions minus those of the itemids
# that have no data. Looking up an itemid that is not in features_idx_dict
# raises KeyError.
no_data_positions = {features_idx_dict[f] for f in features_no_data}

# sorted() fixes the column order of the final tensor; the iteration order of
# a set is not guaranteed.
features_idx_to_keep = sorted(set(features_idx_dict.values()) - no_data_positions)
len(features_idx_to_keep)

54

In [None]:
# itemids that remain, in the same order as the kept feature positions.
final_features = list(map(features.__getitem__, features_idx_to_keep))
print(len(final_features))
# Sanity check: prints one flag per no-data itemid telling whether it is still present.
print([itemid in final_features for itemid in features_no_data])

54

In [None]:
# Keep only the selected positions on the last (feature) axis.
chart_tensor_filled_final = chart_tensor_filled[:, :, :, features_idx_to_keep]

##### Backup and save to disk

In [None]:
# Write the tensor through a gzip stream. The `with` block closes the stream,
# which flushes the remaining compressed data and the gzip trailer; leaving
# the GzipFile unclosed can produce a truncated archive.
with gzip.open("chart_tensor_filled_final.pt.gz", "wb") as gz_file:
    torch.save(chart_tensor_filled_final, gz_file)

#### Concatenate demog and chartevents tensors

In [None]:
print(demog_tensor_whourdim.shape)
print(chart_tensor_filled_final.shape)

torch.Size([1541, 5, 720, 4])
torch.Size([1541, 5, 720, 54])


In [None]:
# Join demographics and chart features along the last (feature) axis,
# with the demographic features first.
data_tensor = torch.cat([demog_tensor_whourdim, chart_tensor_filled_final], dim=-1)
data_tensor.shape

torch.Size([1541, 5, 720, 58])

In [None]:
# torch.save(data_tensor, gzip.GzipFile("data_tensor_chart_demog.pt.gz", "wb"))
# Restore the combined tensor from disk; the `with` block closes the gzip
# stream once loading is done instead of leaking the open file handle.
with gzip.open("data_tensor_chart_demog.pt.gz", "rb") as gz_file:
    data_tensor = torch.load(gz_file)

#### Load processed notes

In [None]:
# Load the pre-computed note features from a pickle.
# NOTE(review): read_pickle executes arbitrary code from the file -- only load
# pickles produced by this project.
note_features_df = pd.read_pickle('text_bert_1874862_2083180.pkl')

In [None]:
# Parse the note chart time into datetimes.
note_features_df['charttime2'] = note_features_df['charttime'].map(datetime.fromisoformat)
# Display only: the sorted frame is not assigned back.
note_features_df.sort_values(by=['hadm_id', 'charttime2'])

Unnamed: 0,subject_id,hadm_id,chartdate,charttime,text_bert,charttime2
164303,54610,100003.0,2150-04-19,2150-04-19 09:54:00,"[tensor(-0.0613), tensor(-0.3600), tensor(-0.3...",2150-04-19 09:54:00
11616,14509,100029.0,2185-04-17,2185-04-17 13:43:00,"[tensor(0.0471), tensor(-0.3467), tensor(-0.50...",2185-04-17 13:43:00
10735,14509,100029.0,2185-04-17,2185-04-17 15:39:00,"[tensor(0.1259), tensor(-0.1368), tensor(-0.21...",2185-04-17 15:39:00
10736,14509,100029.0,2185-04-17,2185-04-17 18:36:00,"[tensor(0.1892), tensor(-0.1211), tensor(-0.31...",2185-04-17 18:36:00
10737,14509,100029.0,2185-04-18,2185-04-18 02:31:00,"[tensor(0.1183), tensor(-0.1319), tensor(-0.33...",2185-04-18 02:31:00
...,...,...,...,...,...,...
133656,25768,199918.0,2111-05-28,2111-05-28 09:59:00,"[tensor(0.1250), tensor(-0.1454), tensor(-0.14...",2111-05-28 09:59:00
133657,25768,199918.0,2111-05-28,2111-05-28 14:01:00,"[tensor(0.1786), tensor(-0.0010), tensor(-0.34...",2111-05-28 14:01:00
169787,49225,199948.0,2102-02-25,2102-02-25 06:50:00,"[tensor(-0.0447), tensor(-0.3101), tensor(-0.3...",2102-02-25 06:50:00
15315,13446,199954.0,2120-11-08,2120-11-08 11:59:00,"[tensor(0.1722), tensor(-0.1987), tensor(-0.25...",2120-11-08 11:59:00


In [None]:
# list(hadm_id_idx_dict.keys())
note_features_df[note_features_df['hadm_id']==131118].sort_values(['hadm_id', 'charttime2']).iloc[0:2]

Unnamed: 0,subject_id,hadm_id,chartdate,charttime,text_bert,charttime2
170041,66508,131118.0,2165-03-18,2165-03-18 09:55:00,"[tensor(0.0197), tensor(-0.2639), tensor(-0.32...",2165-03-18 09:55:00
187875,66508,131118.0,2165-03-22,2165-03-22 05:54:00,"[tensor(0.0439), tensor(-0.2389), tensor(-0.31...",2165-03-22 05:54:00


In [None]:
# Spot check: stay positions of one admission and the in/out times of its first stay.
# NOTE(review): icustay_inouttime_dict is only built in a later cell, so this
# cell fails when the notebook is run top to bottom.
print(hadmid_icustay_idx_dict[131118])
icustay_inouttime_dict[207606]

{207606: 0, 207393: 1, 228140: 2, 264919: 3, 293299: 4}


(Timestamp('2165-03-17 12:08:06'), Timestamp('2165-04-11 18:14:56'))

In [None]:
#create hadmid_icustay_idx_dict_2 to lookup easily

# hadmid_icustay_idx_dict_2 = {}

# for i, r in final_icustays_details.sort_values(['hadm_id', 'intime2']).iterrows():
#     hadm_id = r['hadm_id']
#     icustay_id = r['icustay_id']
    
#     if hadmid_icustay_idx_dict_2.get(hadm_id) is None:
#         hadmid_icustay_idx_dict_2[hadm_id] = {}
#         hadmid_icustay_idx_dict_2[hadm_id][0] = icustay_id
#     else:
#         hadmid_icustay_idx_dict_2[hadm_id][len(hadmid_icustay_idx_dict_2[hadm_id])] = icustay_id

# For every admission, keep the ICU stay ids as a plain list, in the same
# order in which they appear in hadmid_icustay_idx_dict[hadm_id].
hadmid_icustay_idx_dict_2 = {
    admission_id: list(stay_to_idx)
    for admission_id, stay_to_idx in hadmid_icustay_idx_dict.items()
}

In [None]:
hadmid_icustay_idx_dict_2[105720], hadmid_icustay_idx_dict[105720]

([291794, 200855, 214341], {291794: 0, 200855: 1, 214341: 2})

In [None]:
# Lookup table: ICU stay id -> (intime2, outtime2) of that stay.
icustay_inouttime_dict = {
    stay['icustay_id']: (stay['intime2'], stay['outtime2'])
    for _, stay in final_icustays_details.iterrows()
}

list(icustay_inouttime_dict.items())[:5]

[(207606,
  (Timestamp('2165-03-17 12:08:06'), Timestamp('2165-04-11 18:14:56'))),
 (207393,
  (Timestamp('2165-04-26 15:10:35'), Timestamp('2165-04-29 13:46:56'))),
 (228140,
  (Timestamp('2165-05-27 19:08:50'), Timestamp('2165-05-29 16:40:44'))),
 (264919,
  (Timestamp('2165-06-08 06:56:35'), Timestamp('2165-06-12 18:12:40'))),
 (293299,
  (Timestamp('2165-06-18 17:50:21'), Timestamp('2165-07-06 23:41:45')))]

In [None]:
(icustay_inouttime_dict[207606][1] - icustay_inouttime_dict[207606][0]).total_seconds()
 # ,  icustay_inouttime_dict[207393][1] - icustay_inouttime_dict[207393][0], icustay_inouttime_dict[228140][0] - icustay_inouttime_dict[228140][1]]

2182010.0

In [None]:
l=[1,2,3,-1,-2,3]
l.index(min([i for i in l if i>0]))

0

In [None]:
notes_num_features = 768  # length of the vector stored per note slot

# Leading (admission, ICU stay, hour) axes shared by the note tensor and its mask.
notes_grid_shape = (num_hadm_ids, max_num_icustays, max_num_hours_in_icustay)

notes_tensor = torch.zeros(*notes_grid_shape, notes_num_features, dtype=torch.float16)
# notes_tensor = torch.zeros([1541, 5, 720, 768], dtype=torch.float16)

# Companion mask over the same three leading axes, initialised to 0.
notes_tensor_mask = torch.zeros(notes_grid_shape, dtype=torch.float16)
# Flat per-note flag vector; the length is hard-coded.
# NOTE(review): presumably sized to cover every index label of
# note_features_df -- confirm before reusing with other data.
notes_checklist = torch.zeros(208265, dtype=torch.float16)

In [None]:
notes_tensor.shape

torch.Size([1541, 5, 720, 768])

In [None]:
counter = 0

# Write every note vector into notes_tensor[admission, ICU stay, hour] and set
# the matching notes_tensor_mask entry to 1.
#
# Placement rules (notes are visited ordered by hadm_id, then charttime2):
#   * admission unknown to hadmid_icustay_idx_dict_2      -> note skipped
#   * note earlier than the first ICU stay's intime        -> note skipped
#   * note later than the last ICU stay's outtime          -> out-hour of last stay
#   * note inside a stay (boundaries included)             -> hour since that stay's intime
#   * note between two stays                               -> out-hour of the stay
#                                                             that ended most recently
for i, r in note_features_df.sort_values(['hadm_id', 'charttime2']).iterrows():
    if counter % 10000 == 0:
        print(f'{counter}', end=' ')
    counter += 1

    note_time = r['charttime2']
    hadm_id = r['hadm_id']

    icustay_ids = hadmid_icustay_idx_dict_2.get(hadm_id)
    if icustay_ids is None:
        continue

    first_icu_intime = icustay_inouttime_dict[icustay_ids[0]][0]
    if note_time < first_icu_intime:
        continue

    last_icu = icustay_ids[-1]
    last_icu_intime, last_icu_outtime = icustay_inouttime_dict[last_icu]

    # notes_checklist is only flagged for notes placed inside or between
    # stays, exactly as before; notes after the last stay are not flagged.
    # NOTE(review): confirm whether that asymmetry is intentional.
    mark_checked = True

    if last_icu_outtime < note_time:
        picked_icu = last_icu
        picked_span = last_icu_outtime - last_icu_intime
        mark_checked = False
    else:
        picked_icu = None
        for icustay in icustay_ids:
            icu_in, icu_out = icustay_inouttime_dict[icustay]
            # Inclusive bounds: a note stamped exactly at an intime used to
            # match no stay and crash below on min() of an empty list.
            if icu_in <= note_time <= icu_out:
                picked_icu = icustay
                picked_span = note_time - icu_in
                break

        if picked_icu is None:
            # Between two stays: attach to the stay whose outtime is the
            # latest one not after the note (first such stay on ties).
            ended_stays = [
                stay for stay in icustay_ids
                if icustay_inouttime_dict[stay][1] <= note_time
            ]
            if not ended_stays:
                # No stay has ended yet; nothing sensible to attach to.
                continue
            picked_icu = max(ended_stays, key=lambda stay: icustay_inouttime_dict[stay][1])
            icu_in, icu_out = icustay_inouttime_dict[picked_icu]
            picked_span = icu_out - icu_in

    hadm_idx = hadm_id_idx_dict[hadm_id]
    # Always resolve the stay axis through the index dict (the between-stays
    # branch previously used the raw list position instead).
    icu_idx = hadmid_icustay_idx_dict[hadm_id][picked_icu]
    # int() in every branch: tensor indices must be integers.
    hour_idx = int(timedelta_hours(picked_span))

    notes_tensor[hadm_idx, icu_idx, hour_idx] = r['text_bert']
    notes_tensor_mask[hadm_idx, icu_idx, hour_idx] = 1
    if mark_checked:
        notes_checklist[i] = 1


0 10000 20000 30000 40000 50000 60000 70000 80000 90000 100000 110000 120000 130000 140000 150000 160000 170000 180000 190000 200000 

In [None]:
# counter = 0

# for i, r in note_features_df.sort_values(['hadm_id', 'charttime2']).iterrows():
#     if counter % 10000 == 0:
#         print(f'{counter}', end=' ')
#     # print(f'{counter}', end=' ')
#     counter+=1
        
#     note_time = r['charttime2']
#     hadm_id = r['hadm_id']
    
#     if hadmid_icustay_idx_dict_2.get(hadm_id) is None:
#         continue
    
#     first_icu = hadmid_icustay_idx_dict_2[hadm_id][0]
#     last_icu = hadmid_icustay_idx_dict_2[hadm_id][-1]
#     first_icu_intime = icustay_inouttime_dict[first_icu][0]
#     last_icu_outtime = icustay_inouttime_dict[last_icu][1]
        
#     if note_time < first_icu_intime:
#         continue
#     elif last_icu_outtime < note_time:
#         #put notes in the outtime hour of last icu
#         last_icu_outtime_hour = int(timedelta_hours(last_icu_outtime - icustay_inouttime_dict[last_icu][0]))
#         notes_tensor[hadm_id_idx_dict[hadm_id], hadmid_icustay_idx_dict[hadm_id][last_icu], last_icu_outtime_hour] = r['text_bert']
#         notes_tensor_mask[hadm_id_idx_dict[hadm_id], hadmid_icustay_idx_dict[hadm_id][last_icu], last_icu_outtime_hour] = 1
#     else:
#         icu_ins = []
#         icu_outs = []
#         location_found = False
#         for icustay in hadmid_icustay_idx_dict_2[hadm_id]:
#             icu_in = icustay_inouttime_dict[icustay][0]
#             icu_out = icustay_inouttime_dict[icustay][1]
#             if icu_in < note_time and note_time<icu_out:
#                 note_hour = timedelta_hours(note_time - icu_in)
#                 location_found = True
#                 notes_checklist[i] = 1
#                 break
#             else:
#                 icu_ins.append(icu_in)
#                 icu_outs.append(icu_out)
#         if location_found == False:
#             diffs = [(note_time-icu_out).total_seconds() for icu_out in icu_outs]
#             picked_icu_idx = diffs.index(min([d for d in diffs if d>=0]))
#             picked_icu_outtime_hour = timedelta_hours(icu_outs[picked_icu_idx] - icu_ins[picked_icu_idx])
#             notes_tensor[hadm_id_idx_dict[hadm_id], picked_icu_idx, picked_icu_outtime_hour] = r['text_bert']
#             notes_tensor_mask[hadm_id_idx_dict[hadm_id], picked_icu_idx, picked_icu_outtime_hour] = 1
#             notes_checklist[i] = 1


0 10000 20000 30000 40000 50000 60000 70000 80000 90000 100000 110000 120000 130000 140000 150000 160000 170000 180000 190000 200000 

##### Backup and save to disk

In [None]:
# Use a context manager so the gzip stream is closed (and its trailer
# written) as soon as the tensor has been serialised.
with gzip.open("notes_tensor_real.pt.gz", "wb") as gz_file:
    torch.save(notes_tensor, gz_file)

In [None]:
# %%time

I, J, K, L = list(notes_tensor.size())

# Fill the hours that carry no note, separately for every
# (admission, ICU stay) sequence.  The hour axis is scanned from the end:
#   * the latest hour that has a note is copied into every hour after it;
#   * every earlier hour without a note receives the nearest note that
#     comes after it on the hour axis.
# Sequences without any note are left untouched.
# NOTE(review): empty hours therefore take their value from a *later* note,
# not an earlier one -- confirm that this direction is intended.
for hadm_idx in range(I):
    if hadm_idx % 100 == 0:
        print(f'{hadm_idx}', end='  ')
    for stay_idx in range(J):
        carried = None  # most recently visited note vector (scanning backwards)
        for hour in range(K - 1, -1, -1):
            has_note = notes_tensor_mask[hadm_idx, stay_idx, hour] != 0
            if has_note:
                if carried is None:
                    # Latest note of the sequence: extend it to the end.
                    tail = notes_tensor[hadm_idx, stay_idx, hour].unsqueeze(dim=0)
                    tail = tail.repeat(K - hour, 1)
                    notes_tensor[hadm_idx, stay_idx, hour:] = tail
                carried = notes_tensor[hadm_idx, stay_idx, hour]
            elif carried is not None:
                notes_tensor[hadm_idx, stay_idx, hour] = carried

0  100  200  300  400  500  600  700  800  900  1000  1100  1200  1300  1400  1500  

In [None]:
notes_tensor[0,0,110:117]

tensor([[ 0.0439, -0.2389, -0.3154,  ..., -0.0068,  0.1559,  0.3254],
        [ 0.0439, -0.2389, -0.3154,  ..., -0.0068,  0.1559,  0.3254],
        [ 0.0439, -0.2389, -0.3154,  ..., -0.0068,  0.1559,  0.3254],
        ...,
        [ 0.0212, -0.2715, -0.3533,  ..., -0.0333,  0.1637,  0.3484],
        [ 0.0212, -0.2715, -0.3533,  ..., -0.0333,  0.1637,  0.3484],
        [ 0.0212, -0.2715, -0.3533,  ..., -0.0333,  0.1637,  0.3484]],
       dtype=torch.float16)

In [None]:
# torch.save(notes_tensor, gzip.GzipFile("notes_tensor_filled.pt.gz", "wb"))
# Read the tensor back through a context manager so the gzip handle is
# closed once loading has finished.
with gzip.open("notes_tensor_filled.pt.gz", "rb") as gz_file:
    notes_tensor = torch.load(gz_file)

#### Concatenate notes and data tensors

In [None]:
data_tensor = data_tensor.to(torch.float16)

In [None]:
print(data_tensor.shape)
print(notes_tensor.shape)
data_tensor.dtype, notes_tensor.dtype

torch.Size([1541, 5, 720, 58])
torch.Size([1541, 5, 720, 768])


(torch.float16, torch.float16)

In [None]:
data_tensor_final = torch.cat([data_tensor, notes_tensor], dim=-1)
data_tensor_final.shape

torch.Size([1541, 5, 720, 826])

In [None]:
# Context manager guarantees the gzip stream is flushed and closed.
with gzip.open("data_tensor_final.pt.gz", "wb") as gz_file:
    torch.save(data_tensor_final, gz_file)
# data_tensor_final = torch.load(gzip.GzipFile("data_tensor_final.pt.gz", "rb"))

### Get filtered hadm set
This set contains the IDs of the hospital admissions that remain after filtering.

In [None]:
# Admissions that appear in all four id sets.
filtered_hadm_set = hadm_set_diagnoses.intersection(
    hadm_set_notes,
    hadm_set_ad_patient,
    hadm_set_icu,
)
print(f"number of hadm after filtering: {len(filtered_hadm_set)}")

number of hadm after filtering: 84


### Create Patients and Hadms
Use objects to store the information and features.

In [None]:
import numpy as np
from datetime import timedelta
import torch

def hour_diff(diff: timedelta):
    """Return the number of whole hours in `diff`, floored, as a float.

    Negative durations floor towards minus infinity (e.g. -30 minutes
    gives -1.0).
    """
    seconds_per_hour = 3600
    elapsed_seconds = diff.total_seconds()
    return elapsed_seconds // seconds_per_hour

class Patient:
  """A patient: identifier, date of birth, and the ids of their admissions."""

  def __init__(self, id, date_of_birth):
    self.id, self.dob = id, date_of_birth
    # Admission ids are appended here after construction.
    self.hadm_list = list()

class Hadm:
  """One hospital admission with its timed observations and ICU stay ids.

  Observations are collected with add_feature() / add_embedding() as
  (time, value) pairs and later turned into one row per hour since
  admission by fill_feature_sequences_to_icu_stays().
  """

  def __init__(self, id, admit_time, discharge_time, is_sepsis):
    self.id = id
    self.admit_time = admit_time
    self.discharge_time = discharge_time
    self.is_sepsis = is_sepsis
    self.icu_stay_list = []
    # Empty containers so that add_embedding() and the fill step do not fail
    # with AttributeError when init_feature_lists() was never called.
    self.features = []
    self.embeddings = []

  def init_feature_lists(self, feature_num):
    """Reset storage to `feature_num` empty feature lists and no embeddings."""
    self.features = [[] for _ in range(feature_num)]
    self.embeddings = []

  def add_feature(self, feature_index, time_value_pair):
    """Append a (time, value) pair to the feature at `feature_index`."""
    self.features[feature_index].append(time_value_pair)

  def add_embedding(self, time_embedding_pair):
    """Append a (time, embedding) pair."""
    self.embeddings.append(time_embedding_pair)

  @staticmethod
  def _clamp_hour(hour, total_hours):
    """Clip an hour offset into the valid row range [0, total_hours - 1]."""
    return min(total_hours - 1, max(0, hour))

  def fill_feature_sequences_to_icu_stays(self, feature_len, embedding_size):
    """Build the hourly feature table and hand each ICU stay its slice.

    Args:
      feature_len: number of leading columns reserved for self.features.
      embedding_size: number of columns that follow, filled from each
        stored embedding's first `embedding_size` entries.

    Side effects:
      Calls set_feature_seq() on every ICU stay of this admission (looked
      up in the module-level icu_stay_dict) and, when the admission is
      flagged as sepsis, set_sepsis(True) on the last stay in
      icu_stay_list.
    """
    # At least one row, even if discharge_time is not after admit_time;
    # otherwise the row indexing below would hit an empty table.
    total_hours_hadm = max(1, int(hour_diff(self.discharge_time - self.admit_time) + 1))
    width = feature_len + embedding_size
    hourly_rows = [[np.nan] * width for _ in range(total_hours_hadm)]

    # Observations outside the admission window are written to the first or
    # last hour; a later observation in the same hour overwrites an earlier one.
    for idx, series in enumerate(self.features):
      for time, value in series:
        hr = self._clamp_hour(int(hour_diff(time - self.admit_time)), total_hours_hadm)
        hourly_rows[hr][idx] = value

    for time, emb in self.embeddings:
      hr = self._clamp_hour(int(hour_diff(time - self.admit_time)), total_hours_hadm)
      for idx in range(embedding_size):
        hourly_rows[hr][feature_len + idx] = emb[idx]

    features_dataframe = pd.DataFrame(hourly_rows)
    # Sample-and-hold: carry the last observed value forward; hours before
    # the first observation get the column mean (0 for columns that were
    # never observed).  Same result as the former cell-by-cell loop.
    features_mean = features_dataframe.mean(axis=0).fillna(0)
    features_dataframe = features_dataframe.ffill().fillna(features_mean)

    for icu_id in self.icu_stay_list:
      icu_tmp = icu_stay_dict[icu_id]
      # Clamp the window like the observation times above: an ICU intime
      # before admit_time would otherwise give a negative slice start, which
      # counts from the end of the table and selects the wrong rows.
      icu_tmp_start_time = self._clamp_hour(
          int(hour_diff(icu_tmp.intime - self.admit_time)), total_hours_hadm)
      icu_tmp_end_time = self._clamp_hour(
          int(hour_diff(icu_tmp.outtime - self.admit_time)), total_hours_hadm)
      icu_tmp.set_feature_seq(features_dataframe.iloc[icu_tmp_start_time: icu_tmp_end_time + 1])

    # Only the last listed stay is labelled; skip when there is no stay at all.
    if self.is_sepsis and self.icu_stay_list:
      icu_tmp = icu_stay_dict[self.icu_stay_list[-1]]
      icu_tmp.set_sepsis(True)


class Icustay:
  """One ICU stay: ids, in/out times, length of stay, label and features."""

  def __init__(self, id, hadm_id, intime, outtime, los):
    self.id, self.hadm_id = id, hadm_id
    self.intime, self.outtime = intime, outtime
    self.los = los
    # Stays start unlabelled; set_sepsis() changes this later.
    self.is_sepsis = False

  def set_sepsis(self, is_sepsis):
    """Store the sepsis label of this stay."""
    self.is_sepsis = is_sepsis

  def set_feature_seq(self, seq):
    """Store `seq.values` as a tensor in self.seq."""
    raw_values = seq.values
    self.seq = torch.tensor(raw_values)
  

In [None]:
patient_dict = {}
hadm_dict = {}
icu_stay_dict = {}

# add patients
for _, patient_row in patients.iterrows():
  subject_id = patient_row['subject_id']
  patient_dict[subject_id] = Patient(subject_id, patient_row['dob2'])

# add hadms (only admissions that survived filtering)
ad_sepsis = pd.merge(ad_patient, hadm_sepsis, how='inner', on='hadm_id')
for _, ad_row in ad_sepsis.iterrows():
  if ad_row['hadm_id'] in filtered_hadm_set:
    hadm_id = ad_row['hadm_id']
    hadm_dict[hadm_id] = Hadm(hadm_id, ad_row['admittime2'], ad_row['dischtime2'], ad_row['is_sepsis'])
    patient_dict[ad_row['subject_id']].hadm_list.append(hadm_id)

# remove patients with empty hadm
subjects_without_hadm = [
    sid for sid, patient in patient_dict.items() if len(patient.hadm_list) == 0
]
for sid in subjects_without_hadm:
  del patient_dict[sid]

In [None]:
# add icustays
# Register every ICU stay of a retained admission, both in icu_stay_dict
# and on the owning Hadm object.
for _, stay_row in icu_filtered.iterrows():
  if stay_row['hadm_id'] in filtered_hadm_set:
    stay_id = stay_row['icustay_id']
    icu_stay_dict[stay_id] = Icustay(
        stay_id, stay_row['hadm_id'], stay_row['intime2'], stay_row['outtime2'], stay_row['los'])
    hadm_dict[stay_row['hadm_id']].icu_stay_list.append(stay_id)


### Add text embeddings to Hadm objects

In [None]:
notes.columns = notes.columns.str.lower()
## filter out rows with hadm_id == NA or not in the filtered_hadm_set
notes = notes[notes['hadm_id'].notna()]
#notes = notes[notes['hadm_id'].map(lambda x: x in filtered_hadm_set)]
## convert 'hadm_id' to int
#notes['hadm_id'] = notes['hadm_id'].astype('int64')
# .copy() makes notes_filtered an independent frame, so adding the column
# below does not assign into a slice of `notes` (pandas SettingWithCopyWarning).
notes_filtered = notes[notes['charttime'].notnull()].copy()
# Parse the ISO-formatted charttime strings into datetime values.
notes_filtered['charttime2'] = notes_filtered['charttime'].map(datetime.fromisoformat)
# Displayed only: the re-indexed frame is not assigned back.
notes_filtered.reset_index(drop=True)

Unnamed: 0,row_id,subject_id,hadm_id,chartdate,charttime,storetime,category,description,cgid,iserror,text,charttime2


In [None]:
# Lower-case the column names, drop notes without hadm_id, keep only notes
# that have a charttime, and print how many remain.
# Note: this rebinds notes_filtered without adding a 'charttime2' column.
notes.columns = notes.columns.str.lower()
notes = notes[notes['hadm_id'].notna()]
notes_filtered = notes[notes['charttime'].notnull()]
print(len(notes_filtered))
2083180

1619465


In [None]:
import re
import nltk
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [None]:
nltk.download('punkt')

def remove_special_characters(lst):
    """Return a new list in which every character other than ASCII letters,
    digits and the space is replaced by a space, string by string."""
    not_allowed = re.compile('[^a-zA-Z0-9 ]')
    return [not_allowed.sub(' ', item) for item in lst]

def stemming(lst):
    """Return the Porter stem of every word in `lst`, in order."""
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in lst]

def remove_multiple_spaces(text):
    """Collapse every run of whitespace in `text` to a single space and trim
    leading/trailing whitespace."""
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', text).strip()

def tokenize_str(text):
    """Normalise `text` and return it as one space-joined string of stems.

    Steps: lower-case and collapse whitespace, remove stop words, tokenize,
    replace special characters inside tokens by spaces, stem each token,
    drop empty tokens, and join the rest with single spaces.
    """
    # (The former first statement read `words` before it was assigned and
    # made every call fail with UnboundLocalError; it has been removed.)

    # 1. Removing multiple spaces
    text = remove_multiple_spaces(text.lower())

    # 2. Removing stop words
    text = remove_stopwords(text)

    # 3. Tokenization
    words = word_tokenize(text)

    # 4. Removing special characters
    words = remove_special_characters(words)

    # 5. Stemming
    words = stemming(words)

    # Drop empty tokens before joining.
    words = list(filter(None, words))
    sentence = ' '.join(words)
    return sentence

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
!pip3 install transformers # => need to pip install at the first time
import torch
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer and the model published as "emilyalsentzer/Bio_ClinicalBERT".
# NOTE(review): the name `model` is rebound to the RNN classifier further
# down in this notebook, so this object is only reachable under that name
# until that cell has run.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 8.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 50.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 46.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.5 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def get_embedding(input):
  """Return the model's [CLS] hidden vector for `input` as a list of floats.

  The previous body returned `tokenizer.encode(...)`, i.e. vocabulary ids
  padded to length 768, which are not an embedding; 768 also exceeds the
  512 positions a BERT model accepts.  The text is now truncated to 512
  tokens, run through the encoder without gradients, and the hidden state
  of the first token is returned.

  NOTE(review): relies on the module-level `tokenizer` and `model` loaded
  from Bio_ClinicalBERT; `model` is rebound later in the notebook, so call
  this before that cell runs.  The vector length is the model's hidden
  size (presumably 768 for this checkpoint -- confirm).
  """
  encoded = tokenizer(input, truncation=True, max_length=512, return_tensors='pt')
  with torch.no_grad():
    hidden_states = model(**encoded).last_hidden_state
  return hidden_states[0, 0].tolist()

def remove_stopwords_special_char(text):
  """Return `text` after cleaning by tokenize_str (thin wrapper)."""
  return tokenize_str(text)

def get_document_wise_embeddings(text):
  """Average the per-line vectors of a document.

  The text is split on newlines; each line is cleaned with
  remove_stopwords_special_char and passed to get_embedding.  The result
  is the column-wise mean over all lines, as a pandas Series.
  """
  line_vectors = [
      get_embedding(remove_stopwords_special_char(line))
      for line in text.split('\n')
  ]
  per_line = pd.DataFrame(line_vectors)
  return per_line.mean(axis=0)

print(notes_filtered.iloc[0:10,:])
#notes_filtered['text_embedding'] = notes_filtered['text'].map(get_document_wise_embedding)

Empty DataFrame
Columns: [row_id, subject_id, hadm_id, chartdate, charttime, storetime, category, description, cgid, iserror, text, charttime2]
Index: []


In [None]:
print(notes_filtered.head())

        row_id  subject_id  hadm_id   chartdate            charttime  \
308820  316097         384   122988  2163-03-09  2163-03-09 23:09:00   
308821  316098         384   122988  2163-03-09  2163-03-09 23:09:00   
308822  316099         384   122988  2163-03-09  2163-03-09 23:40:00   
308823  316100         384   122988  2163-03-09  2163-03-09 23:40:00   
308824  316101         384   122988  2163-03-10  2163-03-09 23:40:00   

                  storetime    category                         description  \
308820  2163-03-09 23:10:03  Physician   Physician Attending Admission Note   
308821  2163-03-09 23:15:25  Physician   Physician Attending Admission Note   
308822  2163-03-09 23:40:51  Physician    Physician Resident Admission Note   
308823  2163-03-09 23:45:57  Physician    Physician Resident Admission Note   
308824  2163-03-10 00:06:55  Physician    Physician Resident Admission Note   

           cgid  iserror                                               text  \
308820  19777

In [None]:
## Add text embeddings to Hadm object
# Attach each note's (charttime2, text_embedding) pair to its admission;
# notes of admissions outside filtered_hadm_set are ignored.
for _, note_row in notes_filtered.iterrows():
  if note_row['hadm_id'] in filtered_hadm_set:
    timed_embedding = (note_row['charttime2'], note_row['text_embedding'])
    hadm_dict[note_row['hadm_id']].add_embedding(timed_embedding)

## Copy features from each hadm object to its ICU stay objects and fill missing values

In [None]:
# Build the hourly feature table of every admission and pass the slices on
# to its ICU stays (768 = embedding_size argument).
for hadm in hadm_dict.values():
  hadm.fill_feature_sequences_to_icu_stays(len(features), 768)

In [None]:
print(len(hadm_dict))

84


# DataLoader

In [None]:
from torch.utils.data import Dataset
icu_stay_dict_keys = list(icu_stay_dict.keys())

class CustomDataset(Dataset):
  """Dataset over the ICU stays held in the module-level icu_stay_dict.

  Item `index` is the stay whose id is icu_stay_dict_keys[index]; it is
  returned as the pair (stay.seq, stay.is_sepsis).
  """

  def __init__(self):
    pass

  def __len__(self):
    return len(icu_stay_dict)

  def __getitem__(self, index):
    stay = icu_stay_dict[icu_stay_dict_keys[index]]
    return stay.seq, stay.is_sepsis

dataset = CustomDataset()

In [None]:
from torch.utils.data.dataset import random_split

split = int(len(dataset) * 0.8)
lengths = [split, len(dataset) - split]
train_dataset, test_dataset = random_split(dataset, lengths)
print("length of train dataset:", len(train_dataset))
print("length of test dataset:", len(test_dataset))

length of train dataset: 73
length of test dataset: 19


In [None]:
def collate_fn(data):
    """Zero-pad a batch of (sequence, label) pairs to a common length.

    Args:
        data: iterable of (sequence, label); each sequence is a 2-D tensor
            whose first axis is the step axis, and all sequences share the
            size of the second axis.

    Returns:
        x: float tensor (batch, longest sequence, feature size), zero padded.
        y: float tensor with the labels.
        l: long tensor with the original length of every sequence.
    """
    sequences, labels = zip(*data)
    y = torch.tensor(labels, dtype=torch.float)

    seq_lengths = [seq.shape[0] for seq in sequences]
    feature_dim = sequences[0].shape[1]
    x = torch.zeros((len(sequences), max(seq_lengths), feature_dim), dtype=torch.float)
    for row, (seq, n_steps) in enumerate(zip(sequences, seq_lengths)):
        # Copy the whole sequence at once; the rest of the row stays zero.
        x[row, :n_steps, :] = seq.float()

    l = torch.tensor(seq_lengths, dtype=torch.long)
    return x, y, l
  
def get_last_visit(hidden_states, length):
    """For every batch row b, return hidden_states[b, length[b] - 1, :]."""
    batch_rows = list(range(hidden_states.shape[0]))
    final_steps = length - 1
    return hidden_states[batch_rows, final_steps]

In [None]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=10, collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=10, collate_fn=collate_fn)

# RNN Model

In [None]:
import torch.nn as nn
import torch.nn.functional as F
class RNN(nn.Module):
  """LSTM classifier over padded sequences.

  Args:
    input_size: size of one input step.  Defaults to
      len(features) + 768, read from the module-level `features` list at
      construction time (the previous hard-coded value).
    hidden_size: LSTM hidden size (default 800).
    num_classes: number of output classes (default 2).

  forward() returns softmax *probabilities*, one row per batch element.
  Losses that expect raw scores (such as nn.CrossEntropyLoss) therefore
  must not be fed this output directly.
  """

  def __init__(self, input_size=None, hidden_size=800, num_classes=2):
    super().__init__()
    if input_size is None:
      input_size = len(features) + 768
    self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, num_classes)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x, length):
    """Classify each sequence from the LSTM output at its last real step.

    Args:
      x: batch-first input passed to the LSTM.
      length: per-sequence number of valid steps, used to pick the output
        at step length - 1 instead of the padded end.
    """
    output, _ = self.rnn(x)
    true_h_n = get_last_visit(output, length)
    logits = self.fc(true_h_n)
    probs = self.softmax(logits)
    return probs

model = RNN()
model

RNN(
  (rnn): LSTM(828, 800, batch_first=True)
  (fc): Linear(in_features=800, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
)

In [None]:
sum(p.numel() for p in model.parameters() if p.requires_grad) 

5217602

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
from sklearn.metrics import *

#input: Y_score,Y_pred,Y_true
#output: accuracy, auc, precision, recall, f1-score
def classification_metrics(Y_score, Y_pred, Y_true):
    """Compute accuracy, ROC AUC, precision, recall and F1.

    AUC is computed from the scores (Y_score); the other four metrics are
    computed from the hard predictions (Y_pred).  Returns the five values
    as a tuple in that order.
    """
    acc = accuracy_score(Y_true, Y_pred)
    auc = roc_auc_score(Y_true, Y_score)
    precision = precision_score(Y_true, Y_pred)
    recall = recall_score(Y_true, Y_pred)
    f1score = f1_score(Y_true, Y_pred)
    return acc, auc, precision, recall, f1score


#input: model, loader
def evaluate(model, loader):
    """Print and return classification metrics of `model` on `loader`.

    Args:
        model: called as model(x, l); expected to return one probability
            row per batch element with one column per class.
        loader: iterable yielding (x, y, l) batches.

    Returns:
        (acc, auc, precision, recall, f1) as computed by
        classification_metrics.

    Note: the model is switched to eval mode and left there.
    """
    model.eval()
    true_batches, pred_batches, score_batches = [], [], []

    # No gradients are needed while scoring.
    with torch.no_grad():
        for x, y, l in loader:
            probs = model(x, l)
            # Column 1 is the positive class: the training cell encodes a
            # positive label as [0, 1].  Column 0 (used before) is the
            # probability of the negative class and inverted every metric.
            y_score = probs.select(dim=1, index=1)
            y_pred = (y_score > 0.5).type(torch.float)

            true_batches.append(y.to('cpu'))
            pred_batches.append(y_pred.to('cpu'))
            score_batches.append(y_score.to('cpu'))

    all_y_true = torch.cat(true_batches, dim=0)
    all_y_pred = torch.cat(pred_batches, dim=0)
    all_y_score = torch.cat(score_batches, dim=0)

    acc, auc, precision, recall, f1 = classification_metrics(all_y_score.detach().numpy(),
                                                             all_y_pred.detach().numpy(),
                                                             all_y_true.detach().numpy())
    print(f"acc: {acc:.3f}, auc: {auc:.3f}, precision: {precision:.3f}, recall: {recall:.3f}, f1: {f1:.3f}")
    return acc, auc, precision, recall, f1

In [None]:
n_epochs = 20

for epoch in range(n_epochs):
    # evaluate() leaves the model in eval mode, so training mode has to be
    # re-enabled at the start of every epoch, not just once.
    model.train()

    train_loss = 0
    for x, y, l in train_loader:
        # Step 1. clear gradients
        optimizer.zero_grad()
        # Step 2. forward pass; the model returns softmax probabilities
        y_hat = model(x, l)
        # Step 3. loss.  `criterion` is nn.CrossEntropyLoss, which applies
        # log-softmax to its input.  Passing probabilities directly would
        # apply softmax twice; passing log-probabilities is exact, because
        # log-softmax leaves already-normalised log-probabilities unchanged.
        # The clamp only guards against log(0).
        log_probs = torch.log(y_hat.clamp_min(1e-8))
        # Class-index targets: 1 for a positive label, 0 otherwise
        # (equivalent to the former one-hot rows [0, 1] / [1, 0]).
        y_true = (y > 0.5).long()

        loss = criterion(log_probs, y_true)
        # Step 4. backward pass
        loss.backward()
        # Step 5. optimization
        optimizer.step()
        # Step 6. record loss
        train_loss += loss.item()

    train_loss = train_loss / len(train_loader)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))
    evaluate(model, train_loader)
    evaluate(model, test_loader)

Epoch: 1 	Training Loss: 0.393281
acc: 0.096, auc: 0.006, precision: 0.096, recall: 1.000, f1: 0.175
acc: 0.105, auc: 0.412, precision: 0.105, recall: 1.000, f1: 0.190
Epoch: 2 	Training Loss: 0.390338
acc: 0.096, auc: 0.006, precision: 0.096, recall: 1.000, f1: 0.175
acc: 0.105, auc: 0.382, precision: 0.105, recall: 1.000, f1: 0.190
Epoch: 3 	Training Loss: 0.385798
acc: 0.082, auc: 0.006, precision: 0.083, recall: 0.857, f1: 0.152
acc: 0.105, auc: 0.353, precision: 0.105, recall: 1.000, f1: 0.190
Epoch: 4 	Training Loss: 0.381758
acc: 0.068, auc: 0.004, precision: 0.070, recall: 0.714, f1: 0.128
acc: 0.105, auc: 0.294, precision: 0.105, recall: 1.000, f1: 0.190
Epoch: 5 	Training Loss: 0.371092
acc: 0.068, auc: 0.004, precision: 0.070, recall: 0.714, f1: 0.128
acc: 0.105, auc: 0.265, precision: 0.105, recall: 1.000, f1: 0.190
Epoch: 6 	Training Loss: 0.366690
acc: 0.027, auc: 0.000, precision: 0.029, recall: 0.286, f1: 0.053
acc: 0.105, auc: 0.294, precision: 0.105, recall: 1.000, f1