In [None]:
# Shell escape: list the contents of the benchmark's root/test directory.
! ls ../mimic3-benchmarks/data/root/test

In [None]:
# Shell escape: list the contents of the in-hospital-mortality/test directory.
! ls ../mimic3-benchmarks/data/in-hospital-mortality/test

In [None]:
import csv
import pandas as pd
import numpy as np
import os

In [None]:
# The data is organized as follows
# in /home/iandre3/PACE/data, there are two subfolders,
# measurements and icu.
# 
# measurements contains files with time series of health measurements made throughout the first 48 hours of a single ICU admission. Only ICU admissions longer than 48 hours are in the folder, but only the first 48 hours of measurements are included.
# measurements also has a file called labels.csv which lists each time series file name along with its label, 0 for no mortality outcome and 1 for mortality outcome.
# For each ICU patient time series in measurements/, there is a corresponding folder in icu/ containing admission episodes, where each episode has information on diagnosis.

In [None]:
# After dataset loading, the features are going to be
 
# Age  Capillary refill rate  Diagnosis 00845  Diagnosis 0389  Diagnosis 07054  Diagnosis 2449  Diagnosis 25000  Diagnosis 25040  Diagnosis 25060  Diagnosis 2639  Diagnosis 2720  Diagnosis 2724  Diagnosis 2749  Diagnosis 2760  Diagnosis 2761  Diagnosis 2762  Diagnosis 2763  Diagnosis 27651  Diagnosis 27652  Diagnosis 2767  Diagnosis 2768  Diagnosis 27800  Diagnosis 27801  Diagnosis 2800  Diagnosis 2809  Diagnosis 2851  Diagnosis 28521  Diagnosis 28529  Diagnosis 2859  Diagnosis 2869  Diagnosis 2875  Diagnosis 28860  Diagnosis 2930  Diagnosis 2948  Diagnosis 30000  Diagnosis 3004  Diagnosis 30500  Diagnosis 3051  Diagnosis 311  Diagnosis 32723  Diagnosis 34590  Diagnosis 3485  Diagnosis 3572  Diagnosis 36201  Diagnosis 4019  Diagnosis 40390  Diagnosis 40391  Diagnosis 41071  Diagnosis 4111  Diagnosis 412  Diagnosis 4139  Diagnosis 41400  Diagnosis 41401  Diagnosis 4168  Diagnosis 4240  Diagnosis 4241  Diagnosis 4254  Diagnosis 4271  Diagnosis 42731  Diagnosis 42732  Diagnosis 4275  Diagnosis 42789  Diagnosis 4280  Diagnosis 42822  Diagnosis 42823  Diagnosis 42832  Diagnosis 42833  Diagnosis 431  Diagnosis 4439  Diagnosis 45829  Diagnosis 4589  Diagnosis 486  Diagnosis 49121  Diagnosis 49390  Diagnosis 496  Diagnosis 5070  Diagnosis 5119  Diagnosis 5180  Diagnosis 51881  Diagnosis 53081  Diagnosis 5601  Diagnosis 56210  Diagnosis 570  Diagnosis 5712  Diagnosis 5715  Diagnosis 5723  Diagnosis 5770  Diagnosis 5789  Diagnosis 5845  Diagnosis 5849  Diagnosis 5856  Diagnosis 5859  Diagnosis 5990  Diagnosis 60000  Diagnosis 70703  Diagnosis 71590  Diagnosis 73300  Diagnosis 78039  Diagnosis 78551  Diagnosis 78552  Diagnosis 78791  Diagnosis 78820  Diagnosis 7907  Diagnosis 79092  Diagnosis 79902  Diagnosis 99591  Diagnosis 99592  Diagnosis 99662  Diagnosis 9971  Diagnosis 99811  Diagnosis 99812  Diagnosis 99859  Diagnosis E8497  Diagnosis E8782  Diagnosis E8788  Diagnosis E8798  Diagnosis E8889  Diagnosis V103  Diagnosis V1046  Diagnosis V1251  Diagnosis V1254  Diagnosis 
# V1582  Diagnosis V4501  Diagnosis V4581  Diagnosis V4582  Diagnosis V4986  Diagnosis V5861  Diagnosis V5865  Diagnosis V5867  Diagnosis V667  Diastolic blood pressure  Ethnicity  Fraction inspired oxygen  Gender Glascow coma scale eye opening Glascow coma scale motor response  Glascow coma scale total Glascow coma scale verbal response  Glucose  Heart Rate  Height     Hours  Mean blood pressure  Oxygen saturation  Respiratory rate  Systolic blood pressure  Temperature     Weight   pH

In [119]:
import torch
import datetime
from torch.utils.data import Dataset

def get_episode_from_stay_name(stay_name):
    """Derive the two episode file names that belong to a time-series file name.

    The name is split on underscores and only the first two pieces are used,
    e.g. ``"10011_episode1_timeseries.csv"`` yields
    ``("10011/episode1.csv", "10011_episode1.csv")``.

    Args:
        stay_name: File name whose first two underscore-separated pieces are
            combined into the result.

    Returns:
        A 2-tuple ``(nested, flat)``: ``nested`` joins the two pieces with
        ``/`` and ``flat`` joins them with ``_``; both end in ``.csv``.

    Raises:
        IndexError: If ``stay_name`` contains no underscore.
    """
    parts = stay_name.split("_")
    subject_part = parts[0]
    episode_part = parts[1]
    nested_name = subject_part + "/" + episode_part + ".csv"
    flat_name = subject_part + "_" + episode_part + ".csv"
    return nested_name, flat_name

def drop_leaking_data_from_episode(episode_df):
    """Remove the columns this notebook treats as leaking from an episode table.

    The ``Icustay``, ``Length of Stay`` and ``Mortality`` entries are deleted
    from ``episode_df`` itself (the argument is modified in place), in that
    order.

    Args:
        episode_df: Mapping-like table (e.g. a DataFrame) holding all three
            columns.

    Returns:
        The same ``episode_df`` object, now without the three columns.

    Raises:
        KeyError: If one of the columns is missing; columns earlier in the
            order above have already been removed by then.
    """
    for leaking_column in ("Icustay", "Length of Stay", "Mortality"):
        del episode_df[leaking_column]
    return episode_df

def cast(x):
    """Best-effort conversion of a single cell value to ``int``.

    Strings are cut at the first space and the leading piece is parsed as a
    number, so ``"4 Spontaneously"`` becomes ``4`` and ``"1.0"`` becomes ``1``.
    Numeric inputs are truncated toward zero. Anything that cannot be
    converted (non-numeric text, ``None``, NaN, infinity) maps to ``0``.

    Args:
        x: The value to convert.

    Returns:
        The integer value, or ``0`` when ``x`` cannot be interpreted as a
        finite number.
    """
    if isinstance(x, str):
        # Only the piece before the first space is expected to be numeric.
        x = x.split(" ")[0]
    try:
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: non-numeric
        # text or NaN; OverflowError: infinity. Other exceptions propagate.
        return 0
        
        

class MortalityDataset(Dataset):
    """Map-style dataset of per-stay measurement tables with a mortality label.

    Construction is eager: for every row of ``labels.csv`` the stay's time
    series is merged with its episode row, converted to integers and written
    to ``output_dir`` as one CSV. Items are afterwards read back lazily from
    those CSVs, so only file paths and labels are kept in memory.

    Args:
        timeseries_dir: Directory containing ``labels.csv`` (columns ``stay``
            and ``y_true``) and the time-series CSV named by each ``stay``.
        episodes_dir: Directory containing the ``<id>/<episode>.csv`` files
            produced by ``get_episode_from_stay_name``.
        output_dir: Directory the preprocessed CSVs are written to; it is
            created when missing.
        transform: Optional callable applied to the feature tensor of an item.
        target_transform: Optional callable applied to the label of an item.
    """

    def __init__(self, timeseries_dir, episodes_dir,
                 output_dir="/home/iandre3/PACE/data/preprocessed/",
                 transform=None, target_transform=None):
        self.transform = transform
        self.target_transform = target_transform
        os.makedirs(output_dir, exist_ok=True)

        overview = pd.read_csv(os.path.join(timeseries_dir, "labels.csv"))

        # One (preprocessed CSV path, label) pair per stay, in labels.csv order.
        self.data = []
        for index, row in overview.iterrows():
            stay = row['stay']
            y_true = row['y_true']
            measurements = pd.read_csv(os.path.join(timeseries_dir, stay))
            num_measurements = len(measurements.index)
            fs_name, new_fs_name = get_episode_from_stay_name(stay)
            episode_codes = drop_leaking_data_from_episode(
                pd.read_csv(os.path.join(episodes_dir, fs_name)))
            # Repeat the episode row(s) so there is one per measurement row,
            # then let combine_first add the episode columns to the series.
            episode_codes = pd.DataFrame(
                np.repeat(episode_codes.values, num_measurements, axis=0),
                columns=episode_codes.columns)
            measurements = measurements.combine_first(episode_codes)
            measurements["y_true"] = y_true
            measurements = measurements.sort_values(by="Hours", ascending=True)
            measurements = measurements.fillna(0)
            # First pass converts what it can; columns that fail stay as-is.
            measurements = measurements.astype(int, errors='ignore')

            # Remaining object columns are converted value by value with cast().
            for column in measurements.select_dtypes(include=["object"]).columns:
                measurements[column] = measurements[column].map(cast)
            measurements = measurements.astype(int)

            output_path = os.path.join(output_dir, new_fs_name)
            measurements.to_csv(output_path)
            self.data.append((output_path, int(y_true)))
            if index % 500 == 0:
                print(f"{datetime.datetime.now()}: Done loading sample {index+1}/{len(overview)}, wrote to {new_fs_name}")

    def __len__(self):
        """Return the number of preprocessed stays."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load one preprocessed stay.

        Returns:
            ``(features, label)`` where ``features`` is a float32 tensor with
            one row per CSV row and one column per CSV column except
            ``y_true``, and ``label`` is the stay's ``y_true`` value (after
            ``target_transform`` when one was given).
        """
        csv_path, label = self.data[idx]
        # to_csv stored the row index as the first column.
        frame = pd.read_csv(csv_path, index_col=0)
        # y_true is stored in the CSV too; it must not end up in the features.
        features = torch.tensor(
            frame.drop(columns=["y_true"]).to_numpy(), dtype=torch.float32)
        if self.transform:
            features = self.transform(features)
        if self.target_transform:
            label = self.target_transform(label)
        return features, label
    
# Building the dataset runs the preprocessing loop over every stay in labels.csv.
m = MortalityDataset(
    timeseries_dir="/home/iandre3/PACE/data/measurements/",
    episodes_dir="/home/iandre3/PACE/data/icu/",
)

2022-04-07 15:18:42.617195: Done loading sample 1/20030, wrote to 10011_episode1.csv
2022-04-07 15:19:25.476045: Done loading sample 501/20030, wrote to 17614_episode1.csv
2022-04-07 15:20:08.210144: Done loading sample 1001/20030, wrote to 24459_episode1.csv
2022-04-07 15:20:51.446712: Done loading sample 1501/20030, wrote to 31526_episode1.csv
2022-04-07 15:21:34.329666: Done loading sample 2001/20030, wrote to 5586_episode1.csv
2022-04-07 15:22:17.494390: Done loading sample 2501/20030, wrote to 77469_episode1.csv
2022-04-07 15:23:00.849964: Done loading sample 3001/20030, wrote to 98057_episode1.csv
2022-04-07 15:23:44.266172: Done loading sample 3501/20030, wrote to 28973_episode1.csv
2022-04-07 15:24:28.413067: Done loading sample 4001/20030, wrote to 19995_episode1.csv
2022-04-07 15:25:12.454241: Done loading sample 4501/20030, wrote to 58075_episode1.csv
2022-04-07 15:25:55.780396: Done loading sample 5001/20030, wrote to 25904_episode1.csv
2022-04-07 15:26:39.357922: Done load