# H5 Data Analysis

Adapted from the MIMIC-Extract GRU-D baseline notebook (mortality and length-of-stay prediction): https://github.com/MLforHealth/MIMIC_Extract/blob/master/notebooks/Baselines%20for%20Mortality%20and%20LOS%20prediction%20-%20GRU-D.ipynb

Create an environment: `/opt/miniconda3/bin/conda create --name=mimic-extract-2`

Install packages:
- Analysis: `conda install pandas pytables scipy numpy ipython ipykernel pytorch`

In [None]:
# Sanity-check cell: prints a fixed string.
print("hello world")

In [None]:
import copy, math, os, pickle, time, pandas as pd, numpy as np, scipy.stats as ss, torch

In [None]:
# --- Notebook configuration -------------------------------------------------
# HDF5 store read by the loading cell.
DATA_FILEPATH = '/home/iandre3/mimic-extract-data/all_hourly_data.h5'
# Both durations are expressed in hours.
GAP_TIME, WINDOW_SIZE = 2, 48
# Seed shared by the NumPy and PyTorch random number generators.
SEED = 1
# Index level names used as grouping keys.
ID_COLS = ['subject_id', 'hadm_id', 'icustay_id']
# Value exported as CUDA_VISIBLE_DEVICES.
GPU = '2'

os.environ.update(CUDA_VISIBLE_DEVICES=GPU)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
class DictDist():
    """Joint sampler over a dictionary of named random variables.

    Every value of ``dict_of_rvs`` must provide ``rvs(n)`` returning an
    indexable sequence of ``n`` draws.
    """

    def __init__(self, dict_of_rvs):
        self.dict_of_rvs = dict_of_rvs

    def rvs(self, n):
        """Return a list of ``n`` dicts; the i-th dict holds the i-th draw of each variable."""
        # Draw all n samples per variable first, then regroup them per sample.
        draws = {}
        for key, dist in self.dict_of_rvs.items():
            draws[key] = dist.rvs(n)
        return [
            {key: values[i] for key, values in draws.items()}
            for i in range(n)
        ]
    
class Choice():
    """Sampler that picks entries from a fixed sequence of options."""

    def __init__(self, options):
        self.options = options

    def rvs(self, n):
        """Return a list of ``n`` options chosen by random integer indices (with replacement)."""
        # scipy's randint draws integers in [0, len(options)).
        picks = ss.randint(0, len(self.options)).rvs(n)
        return [self.options[pick] for pick in picks]

In [None]:
%%time
# Read the 'vitals_labs' and 'patients' tables from the same HDF5 store.
data_full_lvl2, statics = (
    pd.read_hdf(DATA_FILEPATH, key) for key in ('vitals_labs', 'patients')
)

In [None]:
# Preview the first rows of the 'vitals_labs' table.
data_full_lvl2.head()

In [None]:
# Preview the first rows of the 'patients' table.
statics.head()

In [None]:
%%time 
def simple_imputer(df: pd.DataFrame) -> pd.DataFrame:
    """Impute the 'mean' columns and derive 'mask' / 'time_since_measured' columns.

    Works on a copy of ``df``; the caller's frame is not modified.

    Args:
        df: Frame whose columns are a MultiIndex with an
            'Aggregation Function' level containing at least 'mean' and
            'count'. Extra column levels 'label', 'LEVEL1' and 'LEVEL2' are
            dropped when more than two levels are present. The rows are
            grouped by ``ID_COLS`` (presumably index levels - TODO confirm
            against the caller).

    Returns:
        A frame with, per variable, the columns 'mean' (imputed), 'mask'
        (1.0 where count > 0, else 0.0) and 'time_since_measured', with the
        column axis sorted.
    """
    idx = pd.IndexSlice
    df = df.copy()
    # Reduce the column index to its two remaining levels.
    if len(df.columns.names) > 2: df.columns = df.columns.droplevel(('label', 'LEVEL1', 'LEVEL2'))
    
    # Keep only the 'mean' and 'count' aggregations of every variable.
    # NOTE(review): df_out is a selection of df and is written to via .loc
    # below; depending on the pandas version this may emit a
    # SettingWithCopyWarning - confirm.
    df_out = df.loc[:, idx[:, ['mean', 'count']]]
    # Per-group (ID_COLS) average of each 'mean' column, used as a fallback fill.
    icustay_means = df_out.loc[:, idx[:, 'mean']].groupby(ID_COLS).mean()
    
    # Imputation cascade for the 'mean' columns:
    #   1. forward-fill within each ID_COLS group,
    #   2. fill what is still missing using icustay_means (how pandas aligns
    #      this value inside each group is not visible here - TODO confirm),
    #   3. fill anything left with 0.
    # NOTE(review): fillna(method=...) is deprecated in recent pandas releases.
    df_out.loc[:,idx[:,'mean']] = df_out.loc[:,idx[:,'mean']].groupby(ID_COLS).fillna(
        method='ffill'
    ).groupby(ID_COLS).fillna(icustay_means).fillna(0)
    
    # Turn the counts into a float indicator (1.0 = at least one measurement)
    # and relabel that column as 'mask'.
    df_out.loc[:, idx[:, 'count']] = (df.loc[:, idx[:, 'count']] > 0).astype(float)
    df_out.rename(columns={'count': 'mask'}, level='Aggregation Function', inplace=True)
    
    # Rows-since-last-measurement: running total of absent rows minus the
    # running total carried forward from the most recent measured row.
    # NOTE(review): unlike the imputation above, cumsum/ffill here are not
    # grouped by ID_COLS, so they run over the whole frame - confirm intended.
    is_absent = (1 - df_out.loc[:, idx[:, 'mask']])
    hours_of_absence = is_absent.cumsum()
    time_since_measured = hours_of_absence - hours_of_absence[is_absent==0].fillna(method='ffill')
    time_since_measured.rename(columns={'mask': 'time_since_measured'}, level='Aggregation Function', inplace=True)

    df_out = pd.concat((df_out, time_since_measured), axis=1)
    # Rows with no earlier measured row to carry forward are NaN; they get the constant 100.
    df_out.loc[:, idx[:, 'time_since_measured']] = df_out.loc[:, idx[:, 'time_since_measured']].fillna(100)
    
    # Sort the column axis so the three columns of each variable sit together.
    df_out.sort_index(axis=1, inplace=True)
    return df_out

Data Loaded - Start cleaning

In [None]:
%%time

# Outcome labels for stays longer than the observation window plus the gap.
# `.copy()` detaches the selection from `statics`, so the column assignments
# below operate on an independent frame rather than on a slice of `statics`.
Ys = statics[statics.max_hours > WINDOW_SIZE + GAP_TIME][['mort_hosp', 'mort_icu', 'los_icu']].copy()
Ys['los_3'] = Ys['los_icu'] > 3
Ys['los_7'] = Ys['los_icu'] > 7
Ys.drop(columns=['los_icu'], inplace=True)
# astype() returns a new frame; rebinding is required for the cast to take effect.
Ys = Ys.astype(float)

# Keep only the labelled stays and only rows with hours_in < WINDOW_SIZE.
lvl2 = data_full_lvl2[
    (data_full_lvl2.index.get_level_values('icustay_id').isin(set(Ys.index.get_level_values('icustay_id')))) &
    (data_full_lvl2.index.get_level_values('hours_in') < WINDOW_SIZE)
]

train_frac, test_frac, validate_frac = 0.8, 0.1, 0.1
lvl2_subj_idx, Ys_subj_idx = [df.index.get_level_values('subject_id') for df in (lvl2, Ys)]
lvl2_subjects = set(lvl2_subj_idx)
assert lvl2_subjects == set(Ys_subj_idx), "Subject ID pools differ!"

# Split at subject level so every row of one subject lands in a single partition.
np.random.seed(SEED)
subjects, N = np.random.permutation(list(lvl2_subjects)), len(lvl2_subjects)
N_train, N_test, N_validate = int(train_frac * N), int(test_frac * N), int(validate_frac * N)
train_subj = subjects[:N_train]
test_subj = subjects[N_train:N_train + N_test]
# The validation partition takes the remainder (N_validate is not used for
# slicing), so subjects lost to int() truncation end up here.
validate_subj = subjects[N_train + N_test:]

[(lvl2_train, lvl2_test, lvl2_validate), (Ys_train, Ys_test, Ys_validate)] = [
    [df[df.index.get_level_values('subject_id').isin(s)] for s in (train_subj, test_subj, validate_subj)]
    for df in (lvl2, Ys)
]

# Standardise the 'mean' columns with statistics taken from the training
# partition only, and apply those same statistics to all three partitions.
idx = pd.IndexSlice
lvl2_means, lvl2_stds = lvl2_train.loc[:, idx[:, 'mean']].mean(axis=0), lvl2_train.loc[:, idx[:, 'mean']].std(axis=0)

lvl2_train.loc[:, idx[:, 'mean']] = (lvl2_train.loc[:, idx[:, 'mean']] - lvl2_means) / lvl2_stds
lvl2_test.loc[:, idx[:, 'mean']] = (lvl2_test.loc[:, idx[:, 'mean']] - lvl2_means) / lvl2_stds
lvl2_validate.loc[:, idx[:, 'mean']] = (lvl2_validate.loc[:, idx[:, 'mean']] - lvl2_means) / lvl2_stds

In [None]:
%%time

# Run the same imputation over each partition; all three results are
# produced before any of the names is rebound.
lvl2_train, lvl2_test, lvl2_validate = map(
    simple_imputer, (lvl2_train, lvl2_test, lvl2_validate)
)

# No missing value may remain in any partition after imputation.
for df in (lvl2_train, lvl2_test, lvl2_validate):
    assert not df.isnull().any().any()

In [None]:
# Rebuild the labels with only the two mortality outcomes.
# astype() returns a new frame, so the cast is applied in the same expression
# that binds `Ys`; calling it as a bare statement would have no effect.
Ys = statics[statics.max_hours > WINDOW_SIZE + GAP_TIME][['mort_hosp', 'mort_icu']].astype(float)

# Partition the labels with the existing subject-level split.
Ys_train, Ys_test, Ys_validate = [
    Ys[Ys.index.get_level_values('subject_id').isin(s)]
    for s in (train_subj, test_subj, validate_subj)
]

In [None]:
# Preview the first rows of the training-partition labels.
Ys_train.head()

In [None]:
# Preview the first rows of the training-partition features.
lvl2_train.head()

In [None]:
# Persist every partition as data/<name>.pkl.
# open() does not create missing directories, so make sure the target exists.
os.makedirs('data', exist_ok=True)
for name, frame in {
    "Ys_train": Ys_train,
    "Ys_test": Ys_test,
    "Ys_validate": Ys_validate,
    "lvl2_train": lvl2_train,
    "lvl2_test": lvl2_test,
    "lvl2_validate": lvl2_validate,
}.items():
    # Write-only binary mode is sufficient; the file is never read back here.
    with open(f'data/{name}.pkl', 'wb') as f:
        pickle.dump(frame, f)