In [1]:
%load_ext autoreload

In [2]:
from __future__ import print_function, division

In [3]:
%autoreload

import copy, math, os, pickle, time, pandas as pd, numpy as np, scipy.stats as ss

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

import torch, torch.utils.data as utils, torch.nn as nn, torch.nn.functional as F, torch.optim as optim
from torch.autograd import Variable
from torch.nn.parameter import Parameter



In [4]:
DATA_FILEPATH     = '/home/kha104/scratch/mimic_workspace/all_hourly_data.h5'

GAP_TIME          = 6  # In hours
WINDOW_SIZE       = 24 # In hours
SEED              = 1
ID_COLS           = ['subject_id', 'hadm_id', 'icustay_id']

np.random.seed(SEED)


In [5]:
class DictDist():
    def __init__(self, dict_of_rvs): self.dict_of_rvs = dict_of_rvs
    def rvs(self, n):
        a = {k: v.rvs(n) for k, v in self.dict_of_rvs.items()}
        out = []
        for i in range(n): out.append({k: vs[i] for k, vs in a.items()})
        return out
    
class Choice():
    def __init__(self, options): self.options = options
    def rvs(self, n): return [self.options[i] for i in ss.randint(0, len(self.options)).rvs(n)]

In [6]:
%%time
data_full_lvl2 = pd.read_hdf(DATA_FILEPATH, 'vitals_labs')

statics        = pd.read_hdf(DATA_FILEPATH, 'patients')

CPU times: user 4.53 s, sys: 4.92 s, total: 9.45 s
Wall time: 52.2 s


In [7]:
data_full_lvl2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,albumin,albumin,albumin,albumin ascites,albumin ascites,albumin ascites,albumin pleural,...,white blood cell count,white blood cell count urine,white blood cell count urine,white blood cell count urine,ph,ph,ph,ph urine,ph urine,ph urine
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std,count,mean,std,count,mean,std,count,...,std,count,mean,std,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
3,145834,211552,0,2.0,25.0,0.0,2.0,1.8,0.0,0.0,,,0.0,...,4.012837,0.0,,,9.0,7.4,0.147733,1.0,5.0,
3,145834,211552,1,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,2,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,3.0,7.26,0.0,0.0,,
3,145834,211552,3,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,4,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,


In [33]:
statics.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,ethnicity,age,insurance,admittime,diagnosis_at_admission,dischtime,discharge_location,fullcode_first,dnr_first,...,outtime,los_icu,admission_type,first_careunit,mort_icu,mort_hosp,hospital_expire_flag,hospstay_seq,readmission_30,max_hours
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3,145834,211552,M,WHITE,76.526792,Medicare,2101-10-20 19:08:00,HYPOTENSION,2101-10-31 13:58:00,SNF,1.0,0.0,...,2101-10-26 20:43:09,6.06456,EMERGENCY,MICU,0,0,0,1,0,145
4,185777,294638,F,WHITE,47.845047,Private,2191-03-16 00:28:00,"FEVER,DEHYDRATION,FAILURE TO THRIVE",2191-03-23 18:41:00,HOME WITH HOME IV PROVIDR,1.0,0.0,...,2191-03-17 16:46:31,1.678472,EMERGENCY,MICU,0,0,0,1,0,40
6,107064,228232,F,WHITE,65.942297,Medicare,2175-05-30 07:15:00,CHRONIC RENAL FAILURE/SDA,2175-06-15 16:00:00,HOME HEALTH CARE,1.0,0.0,...,2175-06-03 13:39:54,3.672917,ELECTIVE,SICU,0,0,0,1,0,88
9,150750,220597,M,UNKNOWN/NOT SPECIFIED,41.790228,Medicaid,2149-11-09 13:06:00,HEMORRHAGIC CVA,2149-11-14 10:15:00,DEAD/EXPIRED,1.0,0.0,...,2149-11-14 20:52:14,5.323056,EMERGENCY,MICU,1,1,1,1,0,127
11,194540,229441,F,WHITE,50.148295,Private,2178-04-16 06:18:00,BRAIN MASS,2178-05-11 19:00:00,HOME HEALTH CARE,1.0,0.0,...,2178-04-17 20:21:05,1.58441,EMERGENCY,SICU,0,0,0,1,0,38


In [38]:
statics.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 34472 entries, (3, 145834, 211552) to (99999, 113369, 246512)
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   gender                  34472 non-null  category      
 1   ethnicity               34472 non-null  category      
 2   age                     34472 non-null  float64       
 3   insurance               34472 non-null  object        
 4   admittime               34472 non-null  datetime64[ns]
 5   diagnosis_at_admission  34471 non-null  object        
 6   dischtime               34472 non-null  datetime64[ns]
 7   discharge_location      34472 non-null  object        
 8   fullcode_first          28162 non-null  float64       
 9   dnr_first               28162 non-null  float64       
 10  fullcode                28162 non-null  float64       
 11  dnr                     28162 non-null  float64       
 12  dnr_first_

In [10]:
def simple_imputer(df):
    idx = pd.IndexSlice
    df = df.copy()
    if len(df.columns.names) > 2: df.columns = df.columns.droplevel(('label', 'LEVEL1', 'LEVEL2'))
    
    df_out = df.loc[:, idx[:, ['mean', 'count']]]
    icustay_means = df_out.loc[:, idx[:, 'mean']].groupby(ID_COLS).mean()
    
    df_out.loc[:,idx[:,'mean']] = df_out.loc[:,idx[:,'mean']].groupby(ID_COLS).fillna(
        method='ffill'
    ).groupby(ID_COLS).fillna(icustay_means).fillna(0)
    
    df_out.loc[:, idx[:, 'count']] = (df.loc[:, idx[:, 'count']] > 0).astype(float)
    df_out.rename(columns={'count': 'mask'}, level='Aggregation Function', inplace=True)
    
    is_absent = (1 - df_out.loc[:, idx[:, 'mask']])
    hours_of_absence = is_absent.cumsum()
    time_since_measured = hours_of_absence - hours_of_absence[is_absent==0].fillna(method='ffill')
    time_since_measured.rename(columns={'mask': 'time_since_measured'}, level='Aggregation Function', inplace=True)

    df_out = pd.concat((df_out, time_since_measured), axis=1)
    df_out.loc[:, idx[:, 'time_since_measured']] = df_out.loc[:, idx[:, 'time_since_measured']].fillna(100)
    
    df_out.sort_index(axis=1, inplace=True)
    return df_out

In [11]:
Ys = statics[statics.max_hours > WINDOW_SIZE + GAP_TIME][['mort_hosp', 'mort_icu', 'los_icu', 'max_hours']]
Ys['los_3'] = Ys['los_icu'] > 3
Ys['los_7'] = Ys['los_icu'] > 7
Ys.drop(columns=['los_icu'], inplace=True)
Ys.astype(float)
Ys

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mort_hosp,mort_icu,max_hours,los_3,los_7
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,145834,211552,0,0,145,True,False
4,185777,294638,0,0,40,False,False
6,107064,228232,0,0,88,True,False
9,150750,220597,1,1,127,True,False
11,194540,229441,0,0,38,False,False
...,...,...,...,...,...,...,...
99973,150202,275083,0,0,65,False,False
99982,151454,221194,0,0,190,True,True
99991,151118,226241,0,0,75,True,False
99992,197084,242052,0,0,47,False,False


In [12]:
df = data_full_lvl2
lvl2 = df[
    (df.index.get_level_values('icustay_id').isin(set(Ys.index.get_level_values('icustay_id')))) &
    (df.index.get_level_values('hours_in') < WINDOW_SIZE)]
lvl2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,albumin,albumin,albumin,albumin ascites,albumin ascites,albumin ascites,albumin pleural,...,white blood cell count,white blood cell count urine,white blood cell count urine,white blood cell count urine,ph,ph,ph,ph urine,ph urine,ph urine
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std,count,mean,std,count,mean,std,count,...,std,count,mean,std,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
3,145834,211552,0,2.0,25.0,0.0,2.0,1.8,0.0,0.0,,,0.0,...,4.012837,0.0,,,9.0,7.4,0.147733,1.0,5.0,
3,145834,211552,1,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,2,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,3.0,7.26,0.0,0.0,,
3,145834,211552,3,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,4,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,


In [13]:
lvl2.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 574656 entries, (3, 145834, 211552, 0) to (99995, 137810, 229633, 23)
Columns: 312 entries, ('alanine aminotransferase', 'count') to ('ph urine', 'std')
dtypes: float64(312)
memory usage: 1.3 GB


In [17]:
lvl2_subj_idx, Ys_subj_idx = [df.index.get_level_values('subject_id') for df in (lvl2, Ys)]
lvl2_subjects = set(lvl2_subj_idx)
assert lvl2_subjects == set(Ys_subj_idx), "Subject ID pools differ!"

In [18]:
np.random.seed(SEED)
subjects, N = np.random.permutation(list(lvl2_subjects)), len(lvl2_subjects)
print(N)

23944


In [19]:
subjects[0:1]

array([11036])

In [20]:
[lvl2_total, Ys_total] = [df[df.index.get_level_values('subject_id').isin(subjects)] for df in (lvl2, Ys)]


In [25]:
lvl2_total.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,albumin,albumin,albumin,albumin ascites,albumin ascites,albumin ascites,albumin pleural,...,white blood cell count,white blood cell count urine,white blood cell count urine,white blood cell count urine,ph,ph,ph,ph urine,ph urine,ph urine
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std,count,mean,std,count,mean,std,count,...,std,count,mean,std,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
3,145834,211552,0,2.0,-0.255569,0.0,2.0,-1.984729,0.0,0.0,,,0.0,...,4.012837,0.0,,,9.0,0.378818,0.147733,1.0,-0.862561,
3,145834,211552,1,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,2,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,3.0,-1.403833,0.0,0.0,,
3,145834,211552,3,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,4,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,


In [27]:
Ys_total.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mort_hosp,mort_icu,max_hours,los_3,los_7
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,145834,211552,0,0,145,True,False
4,185777,294638,0,0,40,False,False
6,107064,228232,0,0,88,True,False
9,150750,220597,1,1,127,True,False
11,194540,229441,0,0,38,False,False


In [28]:
Ys_total.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 23944 entries, (3, 145834, 211552) to (99995, 137810, 229633)
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   mort_hosp  23944 non-null  int64
 1   mort_icu   23944 non-null  int64
 2   max_hours  23944 non-null  int64
 3   los_3      23944 non-null  bool 
 4   los_7      23944 non-null  bool 
dtypes: bool(2), int64(3)
memory usage: 4.7 MB


In [22]:
idx = pd.IndexSlice
lvl2_means, lvl2_stds = lvl2_total.loc[:, idx[:,'mean']].mean(axis=0), lvl2_total.loc[:, idx[:,'mean']].std(axis=0)

lvl2_total.loc[:, idx[:,'mean']] = (lvl2_total.loc[:, idx[:,'mean']] - lvl2_means)/lvl2_stds


In [23]:
lvl2_means

LEVEL2                        Aggregation Function
alanine aminotransferase      mean                    252.785336
albumin                       mean                      3.170015
albumin ascites               mean                      1.811594
albumin pleural               mean                      1.786154
albumin urine                 mean                     45.446835
                                                         ...    
weight                        mean                     82.138024
white blood cell count        mean                     12.479288
white blood cell count urine  mean                     22.099319
ph                            mean                      7.370250
ph urine                      mean                      5.799118
Length: 104, dtype: float64

In [24]:
lvl2_total

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,albumin,albumin,albumin,albumin ascites,albumin ascites,albumin ascites,albumin pleural,...,white blood cell count,white blood cell count urine,white blood cell count urine,white blood cell count urine,ph,ph,ph,ph urine,ph urine,ph urine
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std,count,mean,std,count,mean,std,count,...,std,count,mean,std,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
3,145834,211552,0,2.0,-0.255569,0.0,2.0,-1.984729,0.0,0.0,,,0.0,...,4.012837,0.0,,,9.0,0.378818,0.147733,1.0,-0.862561,
3,145834,211552,1,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,2,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,3.0,-1.403833,0.000000,0.0,,
3,145834,211552,3,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,4,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,137810,229633,19,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,1.0,1.142811,,0.0,,
99995,137810,229633,20,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
99995,137810,229633,21,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
99995,137810,229633,22,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,


In [25]:
lvl2_total = simple_imputer(lvl2_total)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [26]:
lvl2_flat = lvl2_total.pivot_table(index=['subject_id', 'hadm_id', 'icustay_id'], columns=['hours_in'])


In [27]:
lvl2_flat

Unnamed: 0_level_0,Unnamed: 1_level_0,LEVEL2,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,...,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine
Unnamed: 0_level_1,Unnamed: 1_level_1,Aggregation Function,mask,mask,mask,mask,mask,mask,mask,mask,mask,mask,...,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured
Unnamed: 0_level_2,Unnamed: 1_level_2,hours_in,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
subject_id,hadm_id,icustay_id,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3
3,145834,211552,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0
4,185777,294638,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,30.0,31.0,32.0,33.0,34.0,35.0,36.0,37.0,38.0,39.0
6,107064,228232,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.0,55.0,56.0,57.0,58.0,59.0,60.0,61.0,62.0,63.0
9,150750,220597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,78.0,79.0,80.0,81.0,82.0,83.0,84.0,85.0,86.0,87.0
11,194540,229441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,102.0,103.0,104.0,105.0,106.0,107.0,108.0,109.0,110.0,111.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99973,150202,275083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,110.0,111.0,112.0,113.0,114.0,115.0,116.0,117.0,118.0,119.0
99982,151454,221194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,134.0,135.0,136.0,137.0,138.0,139.0,140.0,141.0,142.0,143.0
99991,151118,226241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0
99992,197084,242052,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.0,39.0,40.0,41.0,42.0,43.0,44.0,45.0,46.0,47.0


In [28]:
lvl2_flat.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 23944 entries, (3, 145834, 211552) to (99995, 137810, 229633)
Columns: 7488 entries, ('alanine aminotransferase', 'mask', 0) to ('white blood cell count urine', 'time_since_measured', 23)
dtypes: float64(7488)
memory usage: 1.3 GB


In [29]:
lvl2_flat.index

MultiIndex([(    3, 145834, 211552),
            (    4, 185777, 294638),
            (    6, 107064, 228232),
            (    9, 150750, 220597),
            (   11, 194540, 229441),
            (   12, 112213, 232669),
            (   13, 143045, 263738),
            (   17, 194023, 277042),
            (   19, 109235, 273430),
            (   21, 109451, 217847),
            ...
            (99944, 185654, 221067),
            (99955, 108494, 243255),
            (99957, 148574, 257127),
            (99965, 101083, 257338),
            (99966, 167228, 252173),
            (99973, 150202, 275083),
            (99982, 151454, 221194),
            (99991, 151118, 226241),
            (99992, 197084, 242052),
            (99995, 137810, 229633)],
           names=['subject_id', 'hadm_id', 'icustay_id'], length=23944)

In [30]:
lvl2_flat.to_hdf('lvl2_flat.h5', key='lvl2_flat')

  check_attribute_name(name)
  check_attribute_name(name)


In [32]:
df_vital = pd.read_hdf('lvl2_flat.h5', 'lvl2_flat')

In [33]:
df_vital

Unnamed: 0_level_0,Unnamed: 1_level_0,LEVEL2,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,...,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine,white blood cell count urine
Unnamed: 0_level_1,Unnamed: 1_level_1,Aggregation Function,mask,mask,mask,mask,mask,mask,mask,mask,mask,mask,...,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured
Unnamed: 0_level_2,Unnamed: 1_level_2,hours_in,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
subject_id,hadm_id,icustay_id,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3
3,145834,211552,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0
4,185777,294638,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,30.0,31.0,32.0,33.0,34.0,35.0,36.0,37.0,38.0,39.0
6,107064,228232,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.0,55.0,56.0,57.0,58.0,59.0,60.0,61.0,62.0,63.0
9,150750,220597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,78.0,79.0,80.0,81.0,82.0,83.0,84.0,85.0,86.0,87.0
11,194540,229441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,102.0,103.0,104.0,105.0,106.0,107.0,108.0,109.0,110.0,111.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99973,150202,275083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,110.0,111.0,112.0,113.0,114.0,115.0,116.0,117.0,118.0,119.0
99982,151454,221194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,134.0,135.0,136.0,137.0,138.0,139.0,140.0,141.0,142.0,143.0
99991,151118,226241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0
99992,197084,242052,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.0,39.0,40.0,41.0,42.0,43.0,44.0,45.0,46.0,47.0


In [29]:
Ys_total.to_hdf('Ys_total.h5', key='Ys_total')

In [30]:
df_label = pd.read_hdf('Ys_total.h5', 'Ys_total')

In [31]:
df_label

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mort_hosp,mort_icu,max_hours,los_3,los_7
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,145834,211552,0,0,145,True,False
4,185777,294638,0,0,40,False,False
6,107064,228232,0,0,88,True,False
9,150750,220597,1,1,127,True,False
11,194540,229441,0,0,38,False,False
...,...,...,...,...,...,...,...
99973,150202,275083,0,0,65,False,False
99982,151454,221194,0,0,190,True,True
99991,151118,226241,0,0,75,True,False
99992,197084,242052,0,0,47,False,False


In [35]:
statics.to_hdf('statics.h5', key='statics', format="table")

In [36]:
df_statics = pd.read_hdf('statics.h5', 'statics')

In [37]:
df_statics

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,ethnicity,age,insurance,admittime,diagnosis_at_admission,dischtime,discharge_location,fullcode_first,dnr_first,...,outtime,los_icu,admission_type,first_careunit,mort_icu,mort_hosp,hospital_expire_flag,hospstay_seq,readmission_30,max_hours
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3,145834,211552,M,WHITE,76.526792,Medicare,2101-10-20 19:08:00,HYPOTENSION,2101-10-31 13:58:00,SNF,1.0,0.0,...,2101-10-26 20:43:09,6.064560,EMERGENCY,MICU,0,0,0,1,0,145
4,185777,294638,F,WHITE,47.845047,Private,2191-03-16 00:28:00,"FEVER,DEHYDRATION,FAILURE TO THRIVE",2191-03-23 18:41:00,HOME WITH HOME IV PROVIDR,1.0,0.0,...,2191-03-17 16:46:31,1.678472,EMERGENCY,MICU,0,0,0,1,0,40
6,107064,228232,F,WHITE,65.942297,Medicare,2175-05-30 07:15:00,CHRONIC RENAL FAILURE/SDA,2175-06-15 16:00:00,HOME HEALTH CARE,1.0,0.0,...,2175-06-03 13:39:54,3.672917,ELECTIVE,SICU,0,0,0,1,0,88
9,150750,220597,M,UNKNOWN/NOT SPECIFIED,41.790228,Medicaid,2149-11-09 13:06:00,HEMORRHAGIC CVA,2149-11-14 10:15:00,DEAD/EXPIRED,1.0,0.0,...,2149-11-14 20:52:14,5.323056,EMERGENCY,MICU,1,1,1,1,0,127
11,194540,229441,F,WHITE,50.148295,Private,2178-04-16 06:18:00,BRAIN MASS,2178-05-11 19:00:00,HOME HEALTH CARE,1.0,0.0,...,2178-04-17 20:21:05,1.584410,EMERGENCY,SICU,0,0,0,1,0,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99983,117390,286606,M,UNKNOWN/NOT SPECIFIED,78.576624,Medicare,2193-04-26 11:35:00,ST ELEVATION MYOCARDIAL INFARCTION;CORONARY AR...,2193-04-29 13:30:00,HOME,,,...,2193-04-27 12:33:22,1.039942,EMERGENCY,CCU,0,0,0,1,0,24
99991,151118,226241,M,WHITE,47.729259,Private,2184-12-24 08:30:00,DIVERTICULITIS/SDA,2185-01-05 12:15:00,HOME,1.0,0.0,...,2184-12-31 20:56:20,3.142616,ELECTIVE,TSICU,0,0,0,1,0,75
99992,197084,242052,F,WHITE,65.772155,Medicare,2144-07-25 18:03:00,RETROPERITONEAL HEMORRHAGE,2144-07-28 17:56:00,SNF,1.0,0.0,...,2144-07-27 17:27:55,1.974456,EMERGENCY,MICU,0,0,0,1,0,47
99995,137810,229633,F,WHITE,88.698942,Medicare,2147-02-08 08:00:00,ABDOMINAL AORTIC ANEURYSM/SDA,2147-02-11 13:15:00,HOME,1.0,0.0,...,2147-02-10 17:46:30,2.161481,ELECTIVE,CSRU,0,0,0,1,0,51
