# eICU Dataset

In [None]:
import pandas as pd
import os
import IPython.display as ipd
import numpy as np
import time
from tqdm import tqdm

def count_patients(df):
    """Return the number of distinct patients in *df*.

    Args:
        df: DataFrame with a ``pid`` column holding one patient id per row.

    Returns:
        The count of distinct, non-missing ``pid`` values as an ``int``.
    """
    # nunique() counts in vectorized code and skips NaN; a Python set would
    # count every NaN as its own patient because NaN never equals NaN.
    return int(df["pid"].nunique())

# Load the vital-sign table, index it by elapsed time, and drop every patient
# that has at least one observation with a negative offset.
base_path = "./Dataset/physionet.org/files/eicu-crd/2.0"
target_file = "vitalPeriodic.csv.gz" # full version: 146,671,642 rows × 19 columns
print(f"File name: {target_file} ")
start = time.time()
df = pd.read_csv(f"{base_path}/{target_file}",
                 compression='gzip',
                 usecols=['observationoffset', 'patientunitstayid', 'heartrate', 'sao2', 'respiration'])
print(f"Elapsed time...: {time.time()-start}")

df.rename(columns={'patientunitstayid': 'pid'}, inplace=True)
df['pid'] = df['pid'].astype(np.int32)

# Interpret the offsets as minutes and use them as a time index.
# pd.to_timedelta replaces TimedeltaIndex(..., unit=...), whose `unit`
# argument is deprecated in recent pandas releases.
df['observationoffset'] = pd.to_timedelta(df['observationoffset'], unit='min')
df.set_index('observationoffset', inplace=True)
df.sort_index(inplace=True)

# Keep the offset available as a plain integer column (whole minutes).
df['offset'] = (df.index.total_seconds() / 60).to_numpy().astype(np.int32)

# A patient is kept only when all of its offsets are >= 0, i.e. when its
# minimum offset is >= 0. The transform broadcasts that minimum back to every
# row, which avoids calling a Python lambda once per patient.
min_offset = df.groupby('pid')['offset'].transform('min')
df = df[(min_offset >= 0).to_numpy()]
print(f"{count_patients(df)}/{len(df)}")
ipd.display(df)

# Resampling

In [None]:
# Resample every patient to 15-minute bins (median per bin) and discard
# patients whose recording contains an empty bin.
df_resample = []
start = time.time()
# Iterating the groupby yields each patient's rows directly (in ascending pid
# order); selecting with df[df.pid == pid] would rescan the whole table once
# per patient.
for _, df_pid in tqdm(df.groupby('pid')):
    # '15min' is the non-deprecated spelling of the '15T' offset alias.
    df_pid_resample = df_pid.resample('15min').median()

    # A bin without any measurement comes out as NaN in every column, pid
    # included, so a NaN pid marks a gap in the recording: skip the patient.
    if df_pid_resample.pid.isna().any():
        continue

    df_resample.append(df_pid_resample)
print(f"Elapsed time...: {time.time()-start}")
df = pd.concat(df_resample)
print(f"{count_patients(df)}/{len(df)}")
ipd.display(df)

# Filtering

In [None]:
# Keep only patients whose first 20 rows exist and are fully measured,
# printing "patients/rows" after each criterion is applied.
start = time.time()
# 5 hours, the last 1 hour is used for labelling
df = df.groupby('pid').filter(lambda grp: len(grp) >= 20)
print(f"{count_patients(df)}/{len(df)}")
# Same check for each vital sign in turn: no missing value in the first 20 rows.
for vital in ('respiration', 'sao2', 'heartrate'):
    df = df.groupby('pid').filter(lambda grp: not grp[vital].iloc[:20].isna().any())
    print(f"{count_patients(df)}/{len(df)}")
print(f"Elapsed time...: {time.time()-start}")

# Cropping & Labelling & Writing

In [None]:
# Crop each patient to its first 16 rows, derive five boolean labels from
# rows 16-19, and write both to disk as one .npy file per patient.

# Create each output folder independently so that a partially created
# "preprocessed" tree (e.g. from an interrupted run) is completed instead of
# making np.save fail later.
for subdir in ("sequences", "labels"):
    os.makedirs(f"{base_path}/preprocessed/{subdir}", exist_ok=True)

df_cropped = []
start = time.time()
# Iterating the groupby yields each patient's rows directly (in ascending pid
# order) instead of rescanning the whole table with df[df.pid == pid].
for pid, df_pid in tqdm(df.groupby('pid')):
    observed = df_pid.iloc[:16]    # model input window
    horizon = df_pid.iloc[16:20]   # following rows, used only for labels
    df_cropped.append(observed)

    # Columns are addressed by position: 1, 2 and 3.
    # NOTE(review): presumably sao2, heartrate, respiration (the order comes
    # from the CSV file) — confirm before relying on it.
    seq = np.asarray(observed.iloc[:, 1:4])
    label = np.asarray([(horizon.iloc[:, 1] < 95).any(),
                        (horizon.iloc[:, 2] < 70).any(),
                        (horizon.iloc[:, 2] > 100).any(),
                        (horizon.iloc[:, 3] < 13).any(),
                        (horizon.iloc[:, 3] > 20).any()])
    np.save(f"{base_path}/preprocessed/sequences/{str(int(pid))}.npy", seq)
    np.save(f"{base_path}/preprocessed/labels/{str(int(pid))}.npy", label)
print(f"Elapsed time...: {time.time()-start}")
df = pd.concat(df_cropped)
print(f"{count_patients(df)}/{len(df)}")
ipd.display(df)