In [17]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 reloads modules before each
# cell executes, so edits to imported .py files are picked up.
%reload_ext autoreload
%autoreload 2

In [18]:
# NOTE(review): `os`, `np`, `pd` and `random` are used in this notebook without
# an explicit import -- presumably they are provided by the fastai star
# imports below; confirm before removing or reordering these lines.
from fastai.structured import *
from fastai.column_data import *
# Abbreviate printed NumPy arrays: summarise arrays with more than 50
# elements, showing 20 items at each edge.
np.set_printoptions(threshold=50, edgeitems=20)
import yaml
from pandas import DataFrame, Series
import shutil

In [19]:
# Working directory of the kernel, captured with the %pwd magic; used below as
# the anchor for locating the mimic3-benchmarks checkout.
NB_DIR = %pwd

In [20]:
# Data locations. The interim and processed folders are addressed relative to
# the raw-data folder (one level up from it).
RAW_DATA = '/data1/MIMIC-III/RAW/'
INTERIM_DATA = RAW_DATA + '/../interim/'
PROCESSED_DATA = RAW_DATA + '/../processed/'

In [21]:
# Location of the mimic3-benchmarks checkout, next to the notebook directory.
# The trailing slash matters: later paths are built by appending directly to it.
MIMIC3_BENCHMARK_LOCATION = '{}/../mimic3-benchmarks/'.format(NB_DIR)

# Split train and test

This section is based on the `split_train_and_test.py` script from the mimic3-benchmarks repository: https://github.com/YerevaNN/mimic3-benchmarks/blob/master/scripts/split_train_and_test.py

In [22]:
# First-column values of the testset.csv rows flagged with 1; filled in by the
# following cell and compared against patient folder names further down.
testset = set()

In [23]:
# Each row of testset.csv is "<id>,<flag>"; keep the ids whose flag equals 1.
with open(f"{MIMIC3_BENCHMARK_LOCATION}resources/testset.csv", "r") as test_set_file:
    rows = (row.split(',') for row in test_set_file)
    testset.update(subject_id for subject_id, is_test in rows if int(is_test) == 1)

In [25]:
def copy_to_partition(patients, partition):
    """Copy patient folders from INTERIM_DATA into PROCESSED_DATA/<partition>.

    Args:
        patients: iterable of patient folder names, relative to INTERIM_DATA.
        partition: name of the destination sub-directory (e.g. "train", "test").

    A patient whose destination folder already exists is skipped, so the
    function can be re-run without re-copying. Note that a folder left
    half-copied by an interrupted run is skipped as well.
    """
    partition_dir = os.path.join(PROCESSED_DATA, partition)
    # makedirs(exist_ok=True) also creates PROCESSED_DATA itself when it is
    # missing and avoids the check-then-create race of exists() + mkdir().
    os.makedirs(partition_dir, exist_ok=True)
    for patient in patients:
        dest = os.path.join(partition_dir, patient)
        if not os.path.exists(dest):
            shutil.copytree(os.path.join(INTERIM_DATA, patient), dest)

In [26]:
# Patient folders are the all-digit entries of INTERIM_DATA; split them by
# membership in the test set.
folders = [name for name in os.listdir(INTERIM_DATA) if name.isdigit()]
train_patients = [patient for patient in folders if patient not in testset]
test_patients = [patient for patient in folders if patient in testset]

# Sanity check: no patient may land in both partitions.
assert set(train_patients).isdisjoint(test_patients)

In [27]:
# Peek at the first few patient folder names.
folders[:5]

['27501', '41625', '75430', '7874', '20242']

In [28]:
# Materialise both partitions on disk (train first, then test).
for partition_name, members in (("train", train_patients), ("test", test_patients)):
    copy_to_partition(members, partition_name)

# In-hospital mortality

In [29]:
# Seed Python's global RNG so that the random.shuffle of the train listing in
# process_partition gives the same order on every run.
random.seed(49297)

In [30]:
# Output root for the in-hospital-mortality task, inside the processed data.
IHM_DATA = '{}/in-hospital-mortality/'.format(PROCESSED_DATA)

In [31]:
# Create the in-hospital-mortality output root (and any missing parents).
# exist_ok=True keeps the cell safe to re-run without a separate exists() check.
os.makedirs(IHM_DATA, exist_ok=True)

In [34]:
def process_partition(partition, eps=1e-6, n_hours=48):
    """Build the in-hospital-mortality files for one partition.

    For every all-digit patient folder under PROCESSED_DATA/<partition>, each
    file whose name contains "timeseries" is paired with its label file (the
    same name with "_timeseries" removed). An episode is skipped when:

    * the label file has no rows,
    * the length of stay is missing or shorter than ``n_hours``,
    * the timeseries file is empty, or
    * no event falls inside the ``n_hours`` window.

    Kept episodes are written to IHM_DATA/<partition>/<patient>_<ts_filename>,
    reduced to the header plus the rows inside the window, and recorded in
    IHM_DATA/<partition>/listfile.csv together with the mortality label.

    Args:
        partition: sub-directory name. The "train" listing is shuffled with
            the global ``random`` state; the "test" listing is sorted.
        eps: tolerance used in the time comparisons.
        n_hours: length of the observation window, in hours.
    """
    output_dir = os.path.join(IHM_DATA, partition)
    # Also creates IHM_DATA when missing, and is a no-op on re-runs.
    os.makedirs(output_dir, exist_ok=True)

    partition_dir = os.path.join(PROCESSED_DATA, partition)
    xy_pairs = []
    patients = list(filter(str.isdigit, os.listdir(partition_dir)))
    for (patient_index, patient) in enumerate(patients):
        patient_folder = os.path.join(partition_dir, patient)
        patient_ts_files = [name for name in os.listdir(patient_folder)
                            if "timeseries" in name]

        for ts_filename in patient_ts_files:
            lb_filename = ts_filename.replace("_timeseries", "")
            label_df = pd.read_csv(os.path.join(patient_folder, lb_filename))

            # empty label file
            if label_df.shape[0] == 0:
                continue

            mortality = int(label_df.iloc[0]["Mortality"])
            los = 24.0 * label_df.iloc[0]['Length of Stay']  # in hours
            if pd.isnull(los):
                print("\n\t(length of stay is missing)", patient, ts_filename)
                continue

            # stay is shorter than the observation window
            if los < n_hours - eps:
                continue

            # Only read the timeseries once the label checks have passed.
            with open(os.path.join(patient_folder, ts_filename)) as tsfile:
                ts_lines = tsfile.readlines()

            # A zero-byte file has no header line; skip it instead of
            # failing with IndexError.
            if not ts_lines:
                print("\n\t(empty timeseries file) ", patient, ts_filename)
                continue

            header = ts_lines[0]
            # Keep only the events whose time (first column) lies in the window.
            ts_lines = [line for line in ts_lines[1:]
                        if -eps < float(line.split(',')[0]) < n_hours + eps]

            # no measurements in ICU
            if len(ts_lines) == 0:
                print("\n\t(no events in ICU) ", patient, ts_filename)
                continue

            output_ts_filename = patient + "_" + ts_filename
            with open(os.path.join(output_dir, output_ts_filename), "w") as outfile:
                outfile.write(header)
                outfile.writelines(ts_lines)

            xy_pairs.append((output_ts_filename, mortality))

        if (patient_index + 1) % 500 == 0:
            print("\rprocessed {} / {} patients".format(patient_index + 1, len(patients)))

    print("\n", len(xy_pairs))
    if partition == "train":
        # Uses the global RNG; seed `random` beforehand for a reproducible order.
        random.shuffle(xy_pairs)
    if partition == "test":
        xy_pairs = sorted(xy_pairs)

    with open(os.path.join(output_dir, "listfile.csv"), "w") as listfile:
        listfile.write('stay,y_true\n')
        for (x, y) in xy_pairs:
            listfile.write("%s,%d\n" % (x, y))

In [35]:
# Build the test partition first, then the train partition.
for partition_name in ("test", "train"):
    process_partition(partition_name)

processed 100 / 5070 patients
processed 200 / 5070 patients
processed 300 / 5070 patients
processed 400 / 5070 patients
processed 500 / 5070 patients
processed 600 / 5070 patients
processed 700 / 5070 patients
processed 800 / 5070 patients
processed 900 / 5070 patients
processed 1000 / 5070 patients
processed 1100 / 5070 patients
processed 1200 / 5070 patients
processed 1300 / 5070 patients
processed 1400 / 5070 patients
processed 1500 / 5070 patients
processed 1600 / 5070 patients
processed 1700 / 5070 patients
processed 1800 / 5070 patients
processed 1900 / 5070 patients
processed 2000 / 5070 patients
processed 2100 / 5070 patients
processed 2200 / 5070 patients
processed 2300 / 5070 patients
processed 2400 / 5070 patients
processed 2500 / 5070 patients
processed 2600 / 5070 patients
processed 2700 / 5070 patients
processed 2800 / 5070 patients
processed 2900 / 5070 patients
processed 3000 / 5070 patients
processed 3100 / 5070 patients
processed 3200 / 5070 patients
processed 3300 / 