# Setup

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import logging

# Import

In [2]:
demographics = pd.read_csv('data/demographics.csv')
# demographics.set_index(["stay_id", "subject_id"], inplace=True)

In [3]:
# urine output
uo = pd.read_csv('data/uo.csv')
# coerce datetime
uo['charttime'] = pd.to_datetime(uo['charttime'])
# set indices
uo.set_index(['stay_id'], inplace=True)
# sort values
uo.sort_values(by=['stay_id', 'charttime'], inplace=True)

In [4]:
blood_count = pd.read_csv('data/blood_count.csv')
# coerce datetime
blood_count['charttime'] = pd.to_datetime(blood_count['charttime'])
# set indices
blood_count.set_index(['subject_id'], inplace=True)
# sort values
blood_count.sort_values(by=['subject_id', 'charttime'], inplace=True)

In [5]:
chemistry = pd.read_csv('data/chemistry.csv')
# coerce datetime
chemistry['charttime'] = pd.to_datetime(chemistry['charttime'])
# set indices
chemistry.set_index(['subject_id'], inplace=True)
# sort values
chemistry.sort_values(by=['subject_id', 'charttime'], inplace=True)


In [6]:
coagulation = pd.read_csv('data/coagulation.csv')
# coerce datetime
coagulation['charttime'] = pd.to_datetime(coagulation['charttime'])
# set indices
coagulation.set_index(['subject_id'], inplace=True)
# sort values
coagulation.sort_values(by=['subject_id', 'charttime'], inplace=True)

In [7]:
creatinine = pd.read_csv('data/creatinine.csv')
# coerce datetime
creatinine['charttime'] = pd.to_datetime(creatinine['charttime'])
# set indices
creatinine.set_index(['stay_id'], inplace=True)
# sort values
creatinine.sort_values(by=['stay_id', 'charttime'], inplace=True)


In [8]:
crp = pd.read_csv('data/crp.csv')
# coerce datetime
crp['charttime'] = pd.to_datetime(crp['charttime'])
# set indices
crp.set_index(['subject_id'], inplace=True)
# sort values
crp.sort_values(by=['subject_id', 'charttime'], inplace=True)


In [9]:
dobutamine = pd.read_csv('data/dobutamine.csv')
# coerce datetime
dobutamine['starttime'] = pd.to_datetime(dobutamine['starttime'])
# set indices
dobutamine.set_index(['stay_id'], inplace=True)
# sort values
dobutamine.sort_values(by=['stay_id', 'starttime'], inplace=True)


In [10]:
dopamine = pd.read_csv('data/dopamine.csv')
# coerce datetime
dopamine['starttime'] = pd.to_datetime(dopamine['starttime'])
# set indices
dopamine.set_index(['stay_id'], inplace=True)
# sort values
dopamine.sort_values(by=['stay_id', 'starttime'], inplace=True)


In [11]:
enzyme = pd.read_csv('data/enzyme.csv')
# coerce datetime
enzyme['charttime'] = pd.to_datetime(enzyme['charttime'])
# set indices
enzyme.set_index(['subject_id'], inplace=True)
# sort values
enzyme.sort_values(by=['subject_id', 'charttime'], inplace=True)

In [12]:
epinephrine = pd.read_csv('data/epinephrine.csv')
# coerce datetime
epinephrine['starttime'] = pd.to_datetime(epinephrine['starttime'])
# set indices
epinephrine.set_index(['stay_id'], inplace=True)
# sort values
epinephrine.sort_values(by=['stay_id', 'starttime'], inplace=True)

In [13]:
milrinone = pd.read_csv('data/milrinone.csv')
# coerce datetime
milrinone['starttime'] = pd.to_datetime(milrinone['starttime'])
# set indices
milrinone.set_index(['stay_id'], inplace=True)
# sort values
milrinone.sort_values(by=['stay_id', 'starttime'], inplace=True)

In [14]:
norepinephrine = pd.read_csv('data/norepinephrine.csv')
# coerce datetime
norepinephrine['starttime'] = pd.to_datetime(norepinephrine['starttime'])
# set indices
norepinephrine.set_index(['stay_id'], inplace=True)
# sort values
norepinephrine.sort_values(by=['stay_id', 'starttime'], inplace=True)

In [15]:
phenylephrine = pd.read_csv('data/phenylephyrine.csv')
# coerce datetime
phenylephrine['starttime'] = pd.to_datetime(phenylephrine['starttime'])
# set indices
phenylephrine.set_index(['stay_id'], inplace=True)
# sort values
phenylephrine.sort_values(by=['stay_id', 'starttime'], inplace=True)


In [16]:
ventilation_settings = pd.read_csv('data/ventilation_settings.csv')
# coerce datetime
ventilation_settings['charttime'] = pd.to_datetime(ventilation_settings['charttime'])
# set indices
ventilation_settings.set_index(['stay_id'], inplace=True)
# sort values
ventilation_settings.sort_values(by=['stay_id', 'charttime'], inplace=True)


In [17]:
vitalsign = pd.read_csv('data/vitalsign.csv')
# coerce datetime
vitalsign['charttime'] = pd.to_datetime(vitalsign['charttime'])
# set indices
vitalsign.set_index(['stay_id'], inplace=True)
# sort values
vitalsign.sort_values(by=['stay_id', 'charttime'], inplace=True)

In [18]:
bg = pd.read_csv('data/bg.csv')
# coerce datetime
bg['charttime'] = pd.to_datetime(bg['charttime'])
# set indices
bg.set_index(['subject_id'], inplace=True)
# sort values
bg.sort_values(by=['subject_id', 'charttime'], inplace=True)


# Define Cohort

In [19]:
# identify patients within the first, non-consecutive ICU stay
cohort_stay_ids = demographics[(demographics.hospstay_seq == 1) & (demographics.icustay_seq == 1)].stay_id.unique()
cohort_subject_ids = demographics[(demographics.hospstay_seq == 1) & (demographics.icustay_seq == 1)].subject_id.unique()

In [20]:
# convert cohort_subject_ids to pd.Index
cohort_subject_ids = pd.Index(cohort_subject_ids)
# convert cohort_stay_ids to pd.Index
cohort_stay_ids = pd.Index(cohort_stay_ids)
cohort_subject_ids.name = 'subject_id'
cohort_stay_ids.name = 'stay_id'

In [21]:
# create target dataframe with cohort stay ids and subject ids
target = pd.DataFrame(index=[cohort_stay_ids, cohort_subject_ids], )

In [22]:
target.head()

stay_id,subject_id
39553978,10000032
39765666,10000980
37510196,10001884
39638202,10002223
32610785,10002348


In [23]:
uo = uo[uo.index.get_level_values('stay_id').isin(cohort_stay_ids)]

# Upsampling

## Urine Output

In [24]:
uo.reset_index(inplace=True)
uo.set_index(['stay_id', 'charttime'], inplace=True)
uo.sort_index(inplace=True)

In [25]:
uo.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,urineoutput
stay_id,charttime,Unnamed: 2_level_1
30000153,2174-09-29 12:12:00,280.0
30000153,2174-09-29 14:00:00,45.0
30000153,2174-09-29 15:00:00,50.0
30000153,2174-09-29 16:00:00,50.0
30000153,2174-09-29 17:00:00,45.0


In [26]:
uo_resampled = uo.groupby('stay_id').resample('H', level='charttime').sum()
uo_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,urineoutput
stay_id,charttime,Unnamed: 2_level_1
30000153,2174-09-29 12:00:00,280.0
30000153,2174-09-29 13:00:00,0.0
30000153,2174-09-29 14:00:00,45.0
30000153,2174-09-29 15:00:00,50.0
30000153,2174-09-29 16:00:00,50.0


In [27]:
# show length of cohort indices
print(len(cohort_stay_ids))
print(len(cohort_subject_ids))

53569
53569


## Function Definition

In [34]:
def resample_df(df: pd.DataFrame, demographics: pd.DataFrame, cohort_stay_ids: pd.Index, sample_indicator: str = "subject_id", time_indicator: str = "charttime", fill_method: str = "ffill") -> pd.DataFrame:
    """
    Resamples a dataframe to hourly intervals.
    
    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to resample.
    demographics : pd.DataFrame
        The demographics dataframe.
    cohort_stay_ids : pd.Index
        The stay ids of the cohort.
    sample_indicator : str, optional
        The column name of the sample indicator, by default "subject_id".
    time_indicator : str, optional
        The column name of the time indicator, by default "charttime".
    fill_method : str, optional
        The method to fill missing values, by default "ffill".
    Returns
    -------
    pd.DataFrame
        The resampled dataframe.
    """
    
    ### DATA PREPARATION ###
    # merge stay_id, subject_id, icu_intime and icu_outtime to df
    df = df.merge(demographics[['icu_intime', 'icu_outtime', 'stay_id', 'subject_id']], how='left', on=sample_indicator)
    # drop values without stay_id
    df = df.dropna(subset=['stay_id'])
    # convert stay_id to int
    df['stay_id'] = df['stay_id'].astype(int)
    # convert subject_id to int
    df['subject_id'] = df['subject_id'].astype(int)
    # coerce icu_intime to datetime
    df['icu_intime'] = pd.to_datetime(df['icu_intime'])
    # coerce icu_outtime to datetime
    df['icu_outtime'] = pd.to_datetime(df['icu_outtime'])
    # downsize to cohort based on stay_id
    df = df[df['stay_id'].isin(cohort_stay_ids)].copy()
    # sort
    df = df.sort_values(by=['stay_id', time_indicator])
    # drop values outside of icu_intime and icu_outtime
    df = df[(df[time_indicator] >= df['icu_intime']) & (df[time_indicator] <= df['icu_outtime'])].copy()
    # set indices
    df = df.set_index(['stay_id', time_indicator])
    
    
    ### RESAMPLING ###
    df_resampled = df.groupby('stay_id').resample('H', level=time_indicator).mean(numeric_only=True)
    if time_indicator == 'starttime':
        # rename index to charttime
        df_resampled.index = df_resampled.index.rename('charttime', level=-1)
    # fill missing values with forward fill
    df_resampled = df_resampled.fillna(method=fill_method)
    return df_resampled


## Blood Count

In [29]:
blood_count_resampled = resample_df(blood_count, demographics, cohort_stay_ids)
blood_count_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,subject_id,hematocrit,hemoglobin,platelet,wbc
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
30000153,2174-09-29 12:00:00,12466550.0,35.0,,,
30000153,2174-09-29 13:00:00,12466550.0,35.0,,,
30000153,2174-09-29 14:00:00,12466550.0,35.0,,,
30000153,2174-09-29 15:00:00,12466550.0,31.7,10.8,173.0,17.0
30000153,2174-09-29 16:00:00,12466550.0,31.7,10.8,173.0,17.0


## Chemistry

In [30]:
chemistry.head()

Unnamed: 0_level_0,charttime,albumin,bicarbonate,bun
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000032,2180-03-23 11:51:00,,,
10000032,2180-03-23 11:51:00,3.3,27.0,13.0
10000032,2180-05-06 22:25:00,3.3,27.0,25.0
10000032,2180-05-07 05:05:00,,28.0,25.0
10000032,2180-06-03 12:00:00,,29.0,22.0


In [31]:
chemistry_resampled = resample_df(chemistry, demographics, cohort_stay_ids)
chemistry_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,subject_id,albumin,bicarbonate,bun
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
30000153,2174-09-29 15:00:00,12466550.0,,19.0,22.0
30000153,2174-09-29 16:00:00,12466550.0,,19.0,22.0
30000153,2174-09-29 17:00:00,12466550.0,,19.0,22.0
30000153,2174-09-29 18:00:00,12466550.0,,19.0,22.0
30000153,2174-09-29 19:00:00,12466550.0,,19.0,22.0


## Blood Gas

In [32]:
bg_resampled = resample_df(bg, demographics, cohort_stay_ids, sample_indicator='subject_id', time_indicator='charttime')
bg_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,subject_id,po2,pco2,ph,baseexcess,calcium,potassium,sodium
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
30000153,2174-09-29 13:00:00,12466550.0,221.0,45.0,7.3,-3.0,1.16,4.4,141.0
30000153,2174-09-29 14:00:00,12466550.0,263.0,45.0,7.3,-3.0,1.1,4.2,142.0
30000153,2174-09-29 15:00:00,12466550.0,263.0,45.0,7.3,-3.0,1.1,4.2,142.0
30000153,2174-09-29 16:00:00,12466550.0,215.0,42.0,7.31,-4.0,1.1,4.2,142.0
30000646,2194-04-29 10:00:00,12207593.0,71.0,30.0,7.44,-1.0,1.1,4.2,142.0


## Coagulation

In [35]:
coagulation.head()

Unnamed: 0_level_0,charttime,d_dimer,fibrinogen,inr,ptt
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10000032,2180-03-23 11:51:00,,,1.4,
10000032,2180-05-06 22:25:00,,156.0,1.6,30.9
10000032,2180-05-07 05:05:00,,,1.5,32.3
10000032,2180-06-22 11:15:00,,,1.4,
10000032,2180-06-27 05:10:00,,,1.5,


In [36]:
coagulation_resampled = resample_df(coagulation, demographics, cohort_stay_ids)
coagulation_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,subject_id,d_dimer,fibrinogen,inr,ptt
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
30000153,2174-09-29 15:00:00,12466550.0,,183.0,1.1,25.3
30000646,2194-04-29 02:00:00,12207593.0,,183.0,1.3,37.4
30000646,2194-04-29 03:00:00,12207593.0,,183.0,1.3,37.4
30000646,2194-04-29 04:00:00,12207593.0,,183.0,1.3,37.4
30000646,2194-04-29 05:00:00,12207593.0,,183.0,1.3,37.4


## Creatinine

In [37]:
creatinine_resampled = resample_df(creatinine, demographics, cohort_stay_ids, sample_indicator="stay_id")
creatinine_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,creat,subject_id
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1
30000153,2174-09-29 15:00:00,0.9,12466550.0
30000153,2174-09-29 16:00:00,0.9,12466550.0
30000153,2174-09-29 17:00:00,0.9,12466550.0
30000153,2174-09-29 18:00:00,0.9,12466550.0
30000153,2174-09-29 19:00:00,0.9,12466550.0


## CRP

In [38]:
crp_resampled = resample_df(crp, demographics, cohort_stay_ids)
crp_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,subject_id,crp
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1
30019654,2136-08-18 21:00:00,11224618.0,0.5
30023732,2191-12-28 13:00:00,12002285.0,57.1
30024161,2185-08-24 23:00:00,19635799.0,17.4
30030111,2112-10-23 04:00:00,17441237.0,1.6
30037325,2144-03-24 02:00:00,17232630.0,156.2


## Dobutamine

In [39]:
dobutamine_resampled = resample_df(dobutamine, demographics, cohort_stay_ids, sample_indicator="stay_id", time_indicator="starttime")
dobutamine_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,vaso_amount,vaso_rate,subject_id
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30014019,2119-07-11 04:00:00,1.530175,0.500057,14475287.0
30014019,2119-07-11 05:00:00,34.455297,0.999991,14475287.0
30014019,2119-07-11 06:00:00,34.455297,0.999991,14475287.0
30014019,2119-07-11 07:00:00,34.455297,0.999991,14475287.0
30014019,2119-07-11 08:00:00,34.455297,0.999991,14475287.0


In [40]:
# rename columns
dobutamine_resampled.columns = ['dobutamine_' + str(col) for col in dobutamine_resampled.columns]
dobutamine_resampled.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 28606 entries, (30014019, Timestamp('2119-07-11 04:00:00')) to (39981262, Timestamp('2138-06-07 14:00:00'))
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   dobutamine_vaso_amount  28606 non-null  float64
 1   dobutamine_vaso_rate    28606 non-null  float64
 2   dobutamine_subject_id   28606 non-null  float64
dtypes: float64(3)
memory usage: 2.0 MB


## Dopamine

In [41]:
dopamine_resampled = resample_df(dopamine, demographics, cohort_stay_ids, sample_indicator="stay_id", time_indicator="starttime")
dopamine_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,vaso_amount,vaso_rate,subject_id
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30001446,2186-04-12 05:00:00,19.701492,5.004316,16513856.0
30001446,2186-04-12 06:00:00,23.582089,2.502158,16513856.0
30001446,2186-04-12 07:00:00,7.640512,2.001391,16513856.0
30001446,2186-04-12 08:00:00,31.924036,2.500884,16513856.0
30007565,2141-07-09 03:00:00,25.13896,5.270617,16828280.0


In [42]:
# rename columns
dopamine_resampled.columns = ['dopamine_' + col for col in dopamine_resampled.columns]
dopamine_resampled.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 33385 entries, (30001446, Timestamp('2186-04-12 05:00:00')) to (39996783, Timestamp('2126-06-29 06:00:00'))
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   dopamine_vaso_amount  33385 non-null  float64
 1   dopamine_vaso_rate    33385 non-null  float64
 2   dopamine_subject_id   33385 non-null  float64
dtypes: float64(3)
memory usage: 2.2 MB


## Enzyme

In [43]:
enzyme_resampled = resample_df(enzyme, demographics, cohort_stay_ids)
enzyme_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,subject_id,alt,ast,bilirubin_total
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
30000153,2174-09-29 15:00:00,12466550.0,,,
30000153,2174-09-29 16:00:00,12466550.0,,,
30000153,2174-09-29 17:00:00,12466550.0,,,
30000153,2174-09-29 18:00:00,12466550.0,,,
30000153,2174-09-29 19:00:00,12466550.0,,,


## Epinephrine

In [44]:
epinephrine_resampled = resample_df(epinephrine, demographics, cohort_stay_ids, sample_indicator="stay_id", time_indicator="starttime")
epinephrine_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,vaso_amount,vaso_rate,subject_id
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30003749,2120-11-05 16:00:00,7.9104,0.502343,12227720.0
30003749,2120-11-05 17:00:00,7.9104,0.502343,12227720.0
30003749,2120-11-05 18:00:00,7.9104,0.502343,12227720.0
30003749,2120-11-05 19:00:00,7.9104,0.502343,12227720.0
30003749,2120-11-05 20:00:00,7.9104,0.502343,12227720.0


In [45]:
# rename columns
epinephrine_resampled.columns = ['epinephrine_' + str(col) for col in epinephrine_resampled.columns]
epinephrine_resampled.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 49949 entries, (30003749, Timestamp('2120-11-05 16:00:00')) to (39996783, Timestamp('2126-06-29 06:00:00'))
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   epinephrine_vaso_amount  49949 non-null  float64
 1   epinephrine_vaso_rate    49949 non-null  float64
 2   epinephrine_subject_id   49949 non-null  float64
dtypes: float64(3)
memory usage: 2.9 MB


## Milrinone

In [46]:
milrinone_resampled = resample_df(milrinone, demographics, cohort_stay_ids, sample_indicator="stay_id", time_indicator="starttime")
milrinone_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,vaso_amount,vaso_rate,subject_id
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30009339,2145-05-10 10:00:00,20.000001,0.271739,15993831.0
30009339,2145-05-10 11:00:00,20.000001,0.271739,15993831.0
30009339,2145-05-10 12:00:00,20.000001,0.271739,15993831.0
30009339,2145-05-10 13:00:00,20.000001,0.271739,15993831.0
30009339,2145-05-10 14:00:00,20.000001,0.271739,15993831.0


In [47]:
# rename columns
milrinone_resampled.columns = ['milrinone_' + str(col) for col in milrinone_resampled.columns]
milrinone_resampled.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 41861 entries, (30009339, Timestamp('2145-05-10 10:00:00')) to (39996073, Timestamp('2175-09-14 22:00:00'))
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   milrinone_vaso_amount  41861 non-null  float64
 1   milrinone_vaso_rate    41861 non-null  float64
 2   milrinone_subject_id   41861 non-null  float64
dtypes: float64(3)
memory usage: 2.5 MB


## Norepinephrine

In [48]:
norepinephrine_resampled = resample_df(norepinephrine, demographics, cohort_stay_ids, sample_indicator="stay_id", time_indicator="starttime")
norepinephrine_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,vaso_amount,vaso_rate,subject_id
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30001446,2186-04-12 09:00:00,0.267303,0.040011,16513856.0
30001446,2186-04-12 10:00:00,1.069213,0.080021,16513856.0
30001446,2186-04-12 11:00:00,1.069213,0.080021,16513856.0
30001446,2186-04-12 12:00:00,0.501014,0.059994,16513856.0
30001446,2186-04-12 13:00:00,0.643911,0.039981,16513856.0


In [49]:
# rename columns
norepinephrine_resampled.columns = ['norepinephrine_' + str(col) for col in norepinephrine_resampled.columns]
norepinephrine_resampled.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 532170 entries, (30001446, Timestamp('2186-04-12 09:00:00')) to (39999230, Timestamp('2147-09-01 18:00:00'))
Data columns (total 3 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   norepinephrine_vaso_amount  532170 non-null  float64
 1   norepinephrine_vaso_rate    532170 non-null  float64
 2   norepinephrine_subject_id   532170 non-null  float64
dtypes: float64(3)
memory usage: 26.5 MB


## Phenylephrine

In [50]:
phenylephrine_resampled = resample_df(phenylephrine, demographics, cohort_stay_ids, sample_indicator="stay_id", time_indicator="starttime")
phenylephrine_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,vaso_amount,vaso_rate,subject_id
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30000646,2194-04-29 08:00:00,0.535032,0.500217,12207593.0
30000646,2194-04-29 09:00:00,27.562899,0.799936,12207593.0
30000646,2194-04-29 10:00:00,27.562899,0.799936,12207593.0
30000646,2194-04-29 11:00:00,27.562899,0.799936,12207593.0
30000646,2194-04-29 12:00:00,27.562899,0.799936,12207593.0


In [51]:
# rename columns
phenylephrine_resampled.columns = ['phenylephrine_' + str(col) for col in phenylephrine_resampled.columns]
phenylephrine_resampled.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 360442 entries, (30000646, Timestamp('2194-04-29 08:00:00')) to (39999552, Timestamp('2186-07-17 21:00:00'))
Data columns (total 3 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   phenylephrine_vaso_amount  360442 non-null  float64
 1   phenylephrine_vaso_rate    360442 non-null  float64
 2   phenylephrine_subject_id   360442 non-null  float64
dtypes: float64(3)
memory usage: 20.9 MB


## Ventilation Settings

In [52]:
ventilation_settings_resampled = resample_df(ventilation_settings, demographics, cohort_stay_ids, time_indicator="charttime")
ventilation_settings_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,subject_id,respiratory_rate_set,respiratory_rate_spontaneous,respiratory_rate_total,tidal_volume_observed,plateau_pressure,peep,fio2,flow_rate
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
30000153,2174-09-29 12:00:00,12466550.0,,,,,,,50.0,
30000153,2174-09-29 13:00:00,12466550.0,,,,,,,50.0,
30000153,2174-09-29 14:00:00,12466550.0,,,,,,,50.0,
30000153,2174-09-29 15:00:00,12466550.0,14.0,0.0,14.0,575.0,16.0,5.0,50.0,
30000153,2174-09-29 16:00:00,12466550.0,14.0,0.0,14.0,575.0,16.0,5.0,50.0,


## Vitalsign

In [53]:
vitalsign_resampled = resample_df(vitalsign, demographics, cohort_stay_ids, time_indicator="charttime")
vitalsign_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,subject_id,heart_rate,sbp,dbp,mbp,sbp_ni,dbp_ni,mbp_ni,resp_rate,temperature,spo2,glucose
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
30000153,2174-09-29 12:00:00,12466550.0,,,,,,,,18.0,,,
30000153,2174-09-29 13:00:00,12466550.0,104.0,132.0,74.5,84.0,113.0,77.0,84.0,16.0,,100.0,158.0
30000153,2174-09-29 14:00:00,12466550.0,83.0,131.0,61.0,80.0,113.0,77.0,84.0,16.0,37.28,100.0,176.0
30000153,2174-09-29 15:00:00,12466550.0,92.0,123.0,65.0,84.0,113.0,77.0,84.0,14.0,37.28,100.0,192.0
30000153,2174-09-29 16:00:00,12466550.0,83.0,109.0,55.0,71.0,113.0,77.0,84.0,16.0,37.5,100.0,175.0


# Data Export

In [54]:
# export all resampled dataframe to csv
blood_count_resampled.to_csv('data/resampled/blood_count_resampled.csv')
chemistry_resampled.to_csv('data/resampled/chemistry_resampled.csv')
coagulation_resampled.to_csv('data/resampled/coagulation_resampled.csv')
creatinine_resampled.to_csv('data/resampled/creatinine_resampled.csv')
crp_resampled.to_csv('data/resampled/crp_resampled.csv')
dobutamine_resampled.to_csv('data/resampled/dobutamine_resampled.csv')
dopamine_resampled.to_csv('data/resampled/dopamine_resampled.csv')
enzyme_resampled.to_csv('data/resampled/enzyme_resampled.csv')
epinephrine_resampled.to_csv('data/resampled/epinephrine_resampled.csv')
milrinone_resampled.to_csv('data/resampled/milrinone_resampled.csv')
norepinephrine_resampled.to_csv('data/resampled/norepinephrine_resampled.csv')
phenylephrine_resampled.to_csv('data/resampled/phenylephrine_resampled.csv')
ventilation_settings_resampled.to_csv('data/resampled/ventilation_settings_resampled.csv')
vitalsign_resampled.to_csv('data/resampled/vitalsign_resampled.csv')
uo_resampled.to_csv('data/resampled/uo_resampled.csv')

# Data Joining

In [55]:
# merge all resampled dataframes with uo
df = uo_resampled.merge(blood_count_resampled.drop(columns=["subject_id"]), how='left', on=['stay_id', 'charttime'])

In [56]:
df = df.merge(chemistry_resampled.drop(columns=["subject_id"]), how='left', on=['stay_id', 'charttime'])

In [57]:
df = df.merge(coagulation_resampled.drop(columns=["subject_id"]), how='left', on=['stay_id', 'charttime'])

In [58]:
df = df.merge(creatinine_resampled.drop(columns=["subject_id"]), how='left', on=['stay_id', 'charttime'])

In [59]:
df = df.merge(bg_resampled.drop(columns=["subject_id"]), how='left', on=['stay_id', 'charttime'])

In [60]:
df = df.merge(crp_resampled.drop(columns=["subject_id"]), how='left', on=['stay_id', 'charttime'])

In [61]:
df = df.merge(enzyme_resampled.drop(columns=["subject_id"]), how='left', on=['stay_id', 'charttime'])

In [62]:
df = df.merge(ventilation_settings_resampled.drop(columns=["subject_id"]), how='left', on=['stay_id', 'charttime'])

In [63]:
df = df.merge(vitalsign_resampled.drop(columns=["subject_id"]), how='left', on=['stay_id', 'charttime'])

In [64]:
df = df.merge(dobutamine_resampled["dobutamine_vaso_rate"], how='left', on=['stay_id', 'charttime'])
df = df.merge(dopamine_resampled["dopamine_vaso_rate"], how='left', on=['stay_id', 'charttime'])
df = df.merge(epinephrine_resampled["epinephrine_vaso_rate"], how='left', on=['stay_id', 'charttime'])
df = df.merge(milrinone_resampled["milrinone_vaso_rate"], how='left', on=['stay_id', 'charttime'])
df = df.merge(norepinephrine_resampled["norepinephrine_vaso_rate"], how='left', on=['stay_id', 'charttime'])
df = df.merge(phenylephrine_resampled["phenylephrine_vaso_rate"], how='left', on=['stay_id', 'charttime'])

In [65]:
df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,urineoutput,hematocrit,hemoglobin,platelet,wbc,albumin,bicarbonate,bun,d_dimer,fibrinogen,...,resp_rate,temperature,spo2,glucose,dobutamine_vaso_rate,dopamine_vaso_rate,epinephrine_vaso_rate,milrinone_vaso_rate,norepinephrine_vaso_rate,phenylephrine_vaso_rate
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
30000153,2174-09-29 12:00:00,280.0,35.0,,,,,,,,,...,18.0,,,,,,,,,
30000153,2174-09-29 13:00:00,0.0,35.0,,,,,,,,,...,16.0,,100.0,158.0,,,,,,
30000153,2174-09-29 14:00:00,45.0,35.0,,,,,,,,,...,16.0,37.28,100.0,176.0,,,,,,
30000153,2174-09-29 15:00:00,50.0,31.7,10.8,173.0,17.0,,19.0,22.0,,183.0,...,14.0,37.28,100.0,192.0,,,,,,
30000153,2174-09-29 16:00:00,50.0,31.7,10.8,173.0,17.0,,19.0,22.0,,,...,16.0,37.5,100.0,175.0,,,,,,
30000153,2174-09-29 17:00:00,45.0,31.7,10.8,173.0,17.0,,19.0,22.0,,,...,20.0,37.5,100.0,175.0,,,,,,
30000153,2174-09-29 18:00:00,70.0,31.7,10.8,173.0,17.0,,19.0,22.0,,,...,19.0,37.5,99.0,175.0,,,,,,
30000153,2174-09-29 19:00:00,0.0,32.1,10.8,173.0,17.0,,19.0,22.0,,,...,21.0,37.5,96.0,175.0,,,,,,
30000153,2174-09-29 20:00:00,0.0,32.1,10.8,173.0,17.0,,19.0,22.0,,,...,21.0,38.22,98.0,185.0,,,,,,
30000153,2174-09-29 21:00:00,80.0,32.1,10.8,173.0,17.0,,19.0,22.0,,,...,22.0,38.22,96.0,185.0,,,,,,


### Second Ffill

In [66]:
# check for missing values in urine output
df.urineoutput.isna().sum()

0

In [67]:
# repeat forward fill for all columns for each patient
df.fillna(method='ffill', axis=0, inplace=True)
df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,urineoutput,hematocrit,hemoglobin,platelet,wbc,albumin,bicarbonate,bun,d_dimer,fibrinogen,...,resp_rate,temperature,spo2,glucose,dobutamine_vaso_rate,dopamine_vaso_rate,epinephrine_vaso_rate,milrinone_vaso_rate,norepinephrine_vaso_rate,phenylephrine_vaso_rate
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
30000153,2174-09-29 12:00:00,280.0,35.0,,,,,,,,,...,18.0,,,,,,,,,
30000153,2174-09-29 13:00:00,0.0,35.0,,,,,,,,,...,16.0,,100.0,158.0,,,,,,
30000153,2174-09-29 14:00:00,45.0,35.0,,,,,,,,,...,16.0,37.28,100.0,176.0,,,,,,
30000153,2174-09-29 15:00:00,50.0,31.7,10.8,173.0,17.0,,19.0,22.0,,183.0,...,14.0,37.28,100.0,192.0,,,,,,
30000153,2174-09-29 16:00:00,50.0,31.7,10.8,173.0,17.0,,19.0,22.0,,183.0,...,16.0,37.5,100.0,175.0,,,,,,
30000153,2174-09-29 17:00:00,45.0,31.7,10.8,173.0,17.0,,19.0,22.0,,183.0,...,20.0,37.5,100.0,175.0,,,,,,
30000153,2174-09-29 18:00:00,70.0,31.7,10.8,173.0,17.0,,19.0,22.0,,183.0,...,19.0,37.5,99.0,175.0,,,,,,
30000153,2174-09-29 19:00:00,0.0,32.1,10.8,173.0,17.0,,19.0,22.0,,183.0,...,21.0,37.5,96.0,175.0,,,,,,
30000153,2174-09-29 20:00:00,0.0,32.1,10.8,173.0,17.0,,19.0,22.0,,183.0,...,21.0,38.22,98.0,185.0,,,,,,
30000153,2174-09-29 21:00:00,80.0,32.1,10.8,173.0,17.0,,19.0,22.0,,183.0,...,22.0,38.22,96.0,185.0,,,,,,


In [68]:
# for vasopressors fill nan with 0
vasopressors = ['dobutamine_vaso_rate', 'dopamine_vaso_rate', 'epinephrine_vaso_rate', 'milrinone_vaso_rate', 'norepinephrine_vaso_rate', 'phenylephrine_vaso_rate']
df[vasopressors] = df[vasopressors].fillna(0)
df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,urineoutput,hematocrit,hemoglobin,platelet,wbc,albumin,bicarbonate,bun,d_dimer,fibrinogen,...,resp_rate,temperature,spo2,glucose,dobutamine_vaso_rate,dopamine_vaso_rate,epinephrine_vaso_rate,milrinone_vaso_rate,norepinephrine_vaso_rate,phenylephrine_vaso_rate
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
30000153,2174-09-29 12:00:00,280.0,35.0,,,,,,,,,...,18.0,,,,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 13:00:00,0.0,35.0,,,,,,,,,...,16.0,,100.0,158.0,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 14:00:00,45.0,35.0,,,,,,,,,...,16.0,37.28,100.0,176.0,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 15:00:00,50.0,31.7,10.8,173.0,17.0,,19.0,22.0,,183.0,...,14.0,37.28,100.0,192.0,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 16:00:00,50.0,31.7,10.8,173.0,17.0,,19.0,22.0,,183.0,...,16.0,37.5,100.0,175.0,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 17:00:00,45.0,31.7,10.8,173.0,17.0,,19.0,22.0,,183.0,...,20.0,37.5,100.0,175.0,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 18:00:00,70.0,31.7,10.8,173.0,17.0,,19.0,22.0,,183.0,...,19.0,37.5,99.0,175.0,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 19:00:00,0.0,32.1,10.8,173.0,17.0,,19.0,22.0,,183.0,...,21.0,37.5,96.0,175.0,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 20:00:00,0.0,32.1,10.8,173.0,17.0,,19.0,22.0,,183.0,...,21.0,38.22,98.0,185.0,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 21:00:00,80.0,32.1,10.8,173.0,17.0,,19.0,22.0,,183.0,...,22.0,38.22,96.0,185.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
# export df to csv
df.to_csv('data/resampled/merged_resampled.csv')

In [70]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,urineoutput,hematocrit,hemoglobin,platelet,wbc,albumin,bicarbonate,bun,d_dimer,fibrinogen,...,resp_rate,temperature,spo2,glucose,dobutamine_vaso_rate,dopamine_vaso_rate,epinephrine_vaso_rate,milrinone_vaso_rate,norepinephrine_vaso_rate,phenylephrine_vaso_rate
stay_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
30000153,2174-09-29 12:00:00,280.0,35.0,,,,,,,,,...,18.0,,,,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 13:00:00,0.0,35.0,,,,,,,,,...,16.0,,100.0,158.0,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 14:00:00,45.0,35.0,,,,,,,,,...,16.0,37.28,100.0,176.0,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 15:00:00,50.0,31.7,10.8,173.0,17.0,,19.0,22.0,,183.0,...,14.0,37.28,100.0,192.0,0.0,0.0,0.0,0.0,0.0,0.0
30000153,2174-09-29 16:00:00,50.0,31.7,10.8,173.0,17.0,,19.0,22.0,,183.0,...,16.0,37.5,100.0,175.0,0.0,0.0,0.0,0.0,0.0,0.0
