### Import packages

In [1]:
from tableone import TableOne
import pandas as pd
import numpy as np

import ydata_profiling
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Impute the vitalsign data and baseline data

In [None]:
# Read the raw vital-sign data.
df = pd.read_csv('raw_data/vitalsign.csv')
# Columns whose gaps are filled from neighbouring observations of the same stay.
imputed_columns = ['label_hosp', 'label_icu', 'icu_timestep', 'icu_timestep_back', 'hosp_timestep_back', 'Heart_rate', 'NI_SBP', 'NI_DBP', 'NI_MBP', 'Glucose', 'tempture_C', 'sofa', 'kdigo_creat']
# Columns related to urine output.
urine_columns = ['urineoutput_6hr', 'urineoutput_12hr', 'urineoutput_24hr']
all_columns = ['stay_id'] + imputed_columns + urine_columns

# Impute within each stay: carry the last observation forward first, then fill what is
# still missing (leading gaps) from the next observation.
# `GroupBy.ffill()` / `GroupBy.bfill()` are used instead of `fillna(method=...)`, which is
# deprecated in recent pandas releases.
# NOTE(review): the fill follows the row order of the CSV; this assumes the rows of a stay
# are already in chronological order — confirm.
df[imputed_columns] = df.groupby('stay_id')[imputed_columns].ffill()
df[imputed_columns] = df.groupby('stay_id')[imputed_columns].bfill()

# For urine output only the last observation may be reused; values that are still missing
# afterwards are set to 0 instead of being filled from a later observation.
df[urine_columns] = df.groupby('stay_id')[urine_columns].ffill().fillna(0)

# Keep only the identifier and the processed columns.
df = df[all_columns]

# Remove every stay_id that still has a NaN value after the imputation above.
stay_id_missing = df['stay_id'].isnull()
ids_with_null_total = set()
sumup = {}  # per-column number of stays with a missing value (diagnostic only)
for col in all_columns:
    # A row is incomplete for this column when its stay_id or the column itself is missing.
    incomplete_rows = stay_id_missing | df[col].isnull()
    ids_with_null = set(df.loc[incomplete_rows, 'stay_id'].tolist())
    ids_with_null_total.update(ids_with_null)
    sumup[col] = len(ids_with_null)

# Uncomment to print the per-column breakdown:
# for k, v in sumup.items():
#     print(f'{k}: {v}')
print(len(ids_with_null_total))

# Drop all rows of the affected stays, not just the incomplete rows.
complete_stay_rows = ~df['stay_id'].isin(ids_with_null_total)
df = df[complete_stay_rows]

# Process the urine output: divide by the patient's weight and by the number of hours
# covered by each column.
df_b = pd.read_csv('./raw_data/baseline3.csv')  # baseline table that contains the patient weight
df_weight = df_b[['stay_id', 'weight']]
# Inner join: stays without a row in the baseline table are dropped here.
df = df.merge(df_weight, how='inner', on='stay_id')

# Normalize the urine output.
# NOTE(review): a weight of 0 or a missing weight would produce inf/NaN here — confirm
# that the baseline table rules this out.
for window_hours in (6, 12, 24):
    urine_col = f'urineoutput_{window_hours}hr'
    df[urine_col] = df[urine_col] / df['weight'] / window_hours

df = df.drop(columns='weight')
# The index is written as well, so the file gets an extra unnamed first column.
df.to_csv('data/vitalsign_processed.csv')
print(len(df['stay_id'].unique()))


# Baseline part: join the baseline tables and keep only the feature columns.
df = pd.read_csv('./raw_data/baseline.csv')     # level 1 and 2 baseline data
df_3 = pd.read_csv('./raw_data/baseline3.csv')  # level 3 baseline data
df_all = df.merge(df_3, how='inner', on='stay_id')

# Any column whose name starts with one of these prefixes is left out. Prefix matching is
# used, so suffixed variants of a name (e.g. from overlapping columns in the merge) match too.
excluded_prefixes = (
    'subject_id', 'hadm_id', 'stay_id', 'intime', 'outtime', 'dischtime', 'dod',
    'label_icu_', 'label_hosp_', 'icu_timestep_back_', 'hosp_timestep_back_',
    'ectopy_type', 'ectopy_frequency', 'ectopy_type_secondary', 'ectopy_frequency_secondary',
)
not_columns = [col for col in df_all.columns if col.startswith(excluded_prefixes)]
excluded_names = set(not_columns)
# 'stay_id' itself matches a prefix above, so it is put back explicitly as the first column.
columns = ['stay_id'] + [col for col in df_all.columns if col not in excluded_names]
df_all = df_all[columns]

# Align with the vital-sign data: drop the stays that were removed there for missing values.
kept_stays = ~df_all['stay_id'].isin(ids_with_null_total)
df_all = df_all[kept_stays]

# Deal with categorical data: transform categorical columns into integer codes.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
label_maps = {}
for cat_col in ('gender', 'heart_rhythm'):
    # The encoder is re-fitted for each column, so its mapping is captured right away.
    df_all[cat_col] = le.fit_transform(df_all[cat_col])
    label_maps[cat_col] = dict(zip(le.classes_, le.transform(le.classes_)))
correspondance_gender = label_maps['gender']
correspondance_heart_rhythm = label_maps['heart_rhythm']

# Show which original category each integer code stands for.
print(correspondance_gender)
print(correspondance_heart_rhythm)

df_all.to_csv('data/baseline_processed.csv')


## Data processing for training of LSTM

### Separate the alive and dead patients
To balance the amount of training data for the LSTM, we first separate the data into surviving and deceased patients.

In [None]:
import pandas as pd
# Split the processed vital signs into deceased (label_hosp == 1) and surviving
# (label_hosp == 0) patients.
df = pd.read_csv('./data/vitalsign_processed.csv')
# 'Unnamed: 0' is the index column that was written along with the CSV.
vitals = df.drop(columns='Unnamed: 0')

dead_df = vitals[vitals['label_hosp'] == 1].reset_index(drop=True)
alive_df = vitals[vitals['label_hosp'] == 0].reset_index(drop=True)

# Re-arrange the column order by position: the column at position 3 is moved behind the
# columns at positions 4 and 5; everything else keeps its place.
# NOTE(review): positional indexing assumes the column layout of the processed CSV is
# unchanged — confirm when columns are added or removed upstream.
cols = list(alive_df.columns)
new_order = cols[0:3] + cols[4:6] + cols[3:4] + cols[6:]
alive_df = alive_df[new_order]
dead_df = dead_df[new_order]

# Save the result.
dead_df.to_csv('./data/dead_vitalsign.csv', index=False)
alive_df.to_csv('./data/alive_vitalsign.csv', index=False)

In [2]:
import pandas as pd
import numpy as np



# Load both cohorts and sort each by stay and by hour after the patient is admitted to the ICU.
sort_keys = ['stay_id', 'icu_timestep']
alive_df = pd.read_csv('./data/alive_vitalsign.csv').sort_values(by=sort_keys)
dead_df = pd.read_csv('./data/dead_vitalsign.csv').sort_values(by=sort_keys)

# For patient with vitalsign records less than 24 hours, interpolate upto at least 24 hours
def interpolate_up_to_24_rows(group, target_rows=24, rng=None):
    """Pad one stay with synthetic rows until it has ``target_rows`` rows.

    New ``icu_timestep`` values are drawn uniformly between the stay's smallest and
    largest recorded time step. All other columns of the new rows are filled by linear
    interpolation between the neighbouring recorded rows.

    The new rows are appended with ``pd.concat`` rather than outer-merged on
    ``icu_timestep``. A merge collapses rows whose key collides with an existing time
    step, so a stay with a single record (min == max) used to come back with fewer than
    ``target_rows`` rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single stay; must contain an ``icu_timestep`` column.
    target_rows : int, default 24
        Minimum number of rows of the returned frame.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the global ``np.random`` state is used.

    Returns
    -------
    pandas.DataFrame
        The original rows plus the synthetic ones, sorted by ``icu_timestep``.
        The index is not meaningful; callers are expected to reset it.
    """
    num_to_interpolate = max(target_rows - len(group), 0)
    result_df = group.reset_index(drop=True)

    if num_to_interpolate > 0:
        min_hour = group['icu_timestep'].min()
        max_hour = group['icu_timestep'].max()
        sampler = np.random if rng is None else rng
        new_hours = sampler.uniform(min_hour, max_hour, num_to_interpolate)
        interpolated_df = pd.DataFrame({'icu_timestep': new_hours})
        # Recorded rows go first so that, together with the stable sort below, a recorded
        # row always precedes synthetic rows that share its time step.
        result_df = pd.concat([result_df, interpolated_df], ignore_index=True)

    result_df = result_df.sort_values(by=['icu_timestep'], kind='stable')

    # 'linear' treats consecutive rows as equally spaced (it does not weight by
    # icu_timestep). Trailing gaps are filled with the last valid value.
    result_df = result_df.interpolate(method='linear')

    return result_df


# Bring every stay_id up to at least 24 time steps; stays that are long enough are kept as is.
def _pad_if_short(group):
    """Return the group unchanged, or interpolated when it has fewer than 24 rows."""
    if len(group) < 24:
        return interpolate_up_to_24_rows(group)
    return group


alive_df = alive_df.groupby('stay_id').apply(_pad_if_short).reset_index(drop=True)
dead_df = dead_df.groupby('stay_id').apply(_pad_if_short).reset_index(drop=True)




# Add two indicator columns to both cohorts:
#   die_24hr   = 1 when icu_timestep_back <= 24 and label_icu == 1, else 0
#   alive_24hr = 1 when icu_timestep_back <= 24 and label_icu == 0, else 0
# NOTE(review): presumably icu_timestep_back is the number of hours left until the patient
# leaves the ICU — confirm against the data dictionary.
for frame in (alive_df, dead_df):
    within_24hr = frame['icu_timestep_back'] <= 24
    frame['die_24hr'] = np.where(within_24hr & (frame['label_icu'] == 1.0), 1, 0)
    frame['alive_24hr'] = np.where(within_24hr & (frame['label_icu'] == 0.0), 1, 0)

# Modify the column order: the two new indicator columns are moved from the end to
# directly after the first five columns.
current_cols = list(alive_df.columns)
new_order = current_cols[:5] + current_cols[-2:] + current_cols[5:-2]
alive_df = alive_df[new_order]
dead_df = dead_df[new_order]

# Sampling mode switch read by uniformly_sample_windows.
# When test_label is True, a single window is kept per stay: the LAST `window_size` rows
# (start index = len(group) - window_size), not the first ones.
# When it is False, windows are taken at randomly chosen start positions instead.
test_label = True
# NOTE(review): this module-level value is not read by uniformly_sample_windows, which
# computes its own start index; it appears to be a leftover.
test_timestep_index = 0

# Function to retrieve data in a sliding window
def uniformly_sample_windows(group, window_size=24, sample_ratio=1, test_mode=None,
                             n_label_cols=7, rng=None):
    """Cut fixed-length windows of consecutive rows out of one stay.

    The first ``n_label_cols`` columns of ``group`` are treated as label columns and the
    remaining columns as features. For each window the features of all ``window_size``
    rows are returned together with the label columns of the window's last row.

    NOTE(review): the positional split assumes ``group`` still contains every column of
    the frame, including the grouping column, when called through ``groupby().apply`` —
    confirm for the pandas version in use.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single stay, already in the desired order.
    window_size : int, default 24
        Number of consecutive rows per window.
    sample_ratio : float, default 1
        Fraction of all possible windows to draw (without replacement) in sampling mode.
    test_mode : bool, optional
        True keeps only the last window of the stay; False samples random windows.
        When omitted, the module-level ``test_label`` flag decides (original behaviour).
    n_label_cols : int, default 7
        Number of leading columns that make up the label part.
    rng : numpy.random.Generator, optional
        Source of randomness for the sampling mode. Defaults to the global ``np.random``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(windows, labels)`` with shapes ``(k, window_size, n_features)`` and
        ``(k, n_label_cols)``. Both are empty 1-D arrays when the stay is too short.
    """
    last_window_only = test_label if test_mode is None else test_mode
    total_windows = len(group) - window_size + 1

    if last_window_only:
        # Only the final window is needed, so no random draw is made in this mode.
        start_indices = [total_windows - 1] if total_windows > 0 else []
    else:
        sampled_windows = int(total_windows * sample_ratio)
        if sampled_windows > 0:
            sampler = np.random if rng is None else rng
            start_indices = sampler.choice(total_windows, sampled_windows, replace=False)
        else:
            start_indices = []

    windows = []
    labels = []
    for start in start_indices:
        stop = start + window_size
        windows.append(group.iloc[start:stop, n_label_cols:].values)   # training data
        labels.append(group.iloc[stop - 1, 0:n_label_cols].values)     # label of the last row

    return np.array(windows), np.array(labels)

# Apply the sliding window function to each patient group.
result_alive = alive_df.groupby('stay_id').apply(uniformly_sample_windows)
result_dead = dead_df.groupby('stay_id').apply(uniformly_sample_windows)


def _collect_valid_windows(per_stay_result):
    """Gather the (X, y) pairs whose arrays have the expected shapes.

    X must be (k, 24, 12) and y must be (k, 7); anything else (e.g. the empty arrays
    returned for stays without a window) is skipped.
    """
    kept_X, kept_y = [], []
    for X, y in per_stay_result.apply(lambda pair: (pair[0], pair[1])):
        has_window_shape = X.ndim == 3 and X.shape[1] == 24 and X.shape[2] == 12
        has_label_shape = y.ndim == 2 and y.shape[1] == 7
        if has_window_shape and has_label_shape:
            kept_X.append(X)
            kept_y.append(y)
    return kept_X, kept_y


# Combine all sliding windows.
data_alive_X, data_alive_y = _collect_valid_windows(result_alive)
data_dead_X, data_dead_y = _collect_valid_windows(result_dead)




from sklearn.model_selection import train_test_split

# Save the concatenation of the sliding windows as .npy files.
# The per-stay lists are stacked once here; both modes below need the stacked arrays.
data_alive_X, data_alive_y = np.concatenate(data_alive_X, axis=0), np.concatenate(data_alive_y, axis=0)
data_dead_X, data_dead_y = np.concatenate(data_dead_X, axis=0), np.concatenate(data_dead_y, axis=0)

if not test_label:
    # Randomly choose as many alive windows as there are dead windows to balance the classes.
    # The size is capped at the number of available alive windows, because sampling without
    # replacement raises when more samples than available are requested.
    num_of_data = min(data_dead_X.shape[0], data_alive_X.shape[0])
    selected_indice = np.random.choice(data_alive_X.shape[0], size=num_of_data, replace=False)
    data_alive_X, data_alive_y = data_alive_X[selected_indice], data_alive_y[selected_indice]

    # NOTE(review): the split is done per window, so windows of the same stay can land in
    # both the training and the test set — confirm this is intended.
    data_alive_X_train, data_alive_X_test, data_alive_y_train, data_alive_y_test = train_test_split(data_alive_X, data_alive_y, test_size=0.2, random_state=42)
    data_dead_X_train, data_dead_X_test, data_dead_y_train, data_dead_y_test = train_test_split(data_dead_X, data_dead_y, test_size=0.2, random_state=42)

    # Alive windows first, dead windows second, in both the training and the test arrays.
    data_X_train, data_X_test = np.concatenate((data_alive_X_train, data_dead_X_train), axis=0), np.concatenate((data_alive_X_test, data_dead_X_test), axis=0)
    data_y_train, data_y_test = np.concatenate((data_alive_y_train, data_dead_y_train), axis=0), np.concatenate((data_alive_y_test, data_dead_y_test), axis=0)

    np.save('./data_npy/data_X_train_new.npy', data_X_train)
    np.save('./data_npy/data_y_train_new.npy', data_y_train)
    np.save('./data_npy/data_X_test_new.npy', data_X_test)
    np.save('./data_npy/data_y_test_new.npy', data_y_test)

else:
    # One window per stay: store all alive windows followed by all dead windows, unsplit.
    data_X = np.concatenate((data_alive_X, data_dead_X), axis=0)
    data_y = np.concatenate((data_alive_y, data_dead_y), axis=0)

    # np.save('./data_npy/data_X_final.npy', data_X)
    # np.save('./data_npy/data_y_final.npy', data_y)

    np.save('./data_npy/data_X_final_least.npy', data_X)
    np.save('./data_npy/data_y_final_least.npy', data_y)


## Data statistics for Level 1 & 2 baseline features

In [None]:
from tableone import TableOne
import pandas as pd
import numpy as np

import ydata_profiling
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Data statistics for level 1 baseline features

In [None]:
# Summary table of the level 1 baseline features, grouped by the target label.
data = pd.read_csv('./data/baseline_level_1.csv')
not_cols = ['subject_id', 'hadm_id', 'stay_id', 'intime']  # columns left out of the table
columns = data.columns[~data.columns.isin(not_cols)].tolist()
categorical = ['label_hosp', 'gender', 'insurance', 'race', 'admission_type', 'label_icu']  # baseline1
group_column = 'label_icu'  # Target label. Label icu or Label hosp
my_table = TableOne(
    data=data,
    columns=columns,
    categorical=categorical,
    groupby=group_column,
    pval=True,
)
print(my_table)

### Data statistics for level 2 baseline features

In [None]:
# Summary table of the level 2 baseline features, grouped by the target label.
data = pd.read_csv('./data/baseline_level_2.csv')
not_cols = ['subject_id', 'hadm_id', 'stay_id', 'intime']  # columns left out of the table
columns = data.columns[~data.columns.isin(not_cols)].tolist()
categorical = ['label_hosp', 'label_icu', 'sepsis3']  # baseline2
group_column = 'label_icu'  # Target label. Label icu or Label hosp
my_table = TableOne(
    data=data,
    columns=columns,
    categorical=categorical,
    groupby=group_column,
    pval=True,
)
print(my_table)

### Data statistics for Combined_comorbidity

In [None]:
# Summary table of the combined comorbidity features, grouped by the target label.
data = pd.read_csv('./data/Combined_comorbidity.csv')
# Columns left out of the table.
not_cols = ['subject_id', 'hadm_id', 'stay_id', 'intime', 'outtime', 'dod', 'subject_id_1', 'hadm_id_1', 'stay_id_1']
columns = data.columns[~data.columns.isin(not_cols)].tolist()
categorical = ['label_icu', 'label_hosp']
group_column = 'label_icu'  # Target label. Label icu or Label hosp
my_table = TableOne(
    data=data,
    columns=columns,
    categorical=categorical,
    groupby=group_column,
    pval=True,
)
print(my_table)

### Data statistics for APSIII

In [None]:
# Summary table of the apsiii features, grouped by the target label.
data = pd.read_csv('./data/apsiii.csv')
# Columns left out of the table.
not_cols = ['subject_id', 'hadm_id', 'stay_id', 'intime', 'outtime', 'dod', 'subject_id_1', 'hadm_id_1', 'stay_id_1']
columns = data.columns[~data.columns.isin(not_cols)].tolist()
categorical = ['label_icu', 'label_hosp']
group_column = 'label_icu'  # Target label. Label icu or Label hosp
my_table = TableOne(
    data=data,
    columns=columns,
    categorical=categorical,
    groupby=group_column,
    pval=True,
)
print(my_table)