1. Parse outcomes

In [None]:

import pandas as pd
import numpy as np
import os


# Column names applied to every outcome file (they replace each file's own header row).
outcome_columns = ["RecordID", "SAPS-I", "SOFA", "Length_of_stay", "Survival", "In-hospital_death"]
outcome_files = [
    '../datasets/p12/rawdata/Outcomes-a.txt',
    '../datasets/p12/rawdata/Outcomes-b.txt',
    '../datasets/p12/rawdata/Outcomes-c.txt',
]

# Read the three outcome tables in a, b, c order.
df_outcomes_a, df_outcomes_b, df_outcomes_c = [
    pd.read_csv(outcome_file, sep=",", header=0, names=outcome_columns)
    for outcome_file in outcome_files
]

for frame in (df_outcomes_a, df_outcomes_b, df_outcomes_c):
    print(frame.head(n=5))

# Plain arrays are what gets stacked and stored below.
arr_outcomes_a, arr_outcomes_b, arr_outcomes_c = [
    np.array(frame) for frame in (df_outcomes_a, df_outcomes_b, df_outcomes_c)
]

n_a, n_b, n_c = len(arr_outcomes_a), len(arr_outcomes_b), len(arr_outcomes_c)
print('n_a = %d, n_b = %d, n_c = %d' % (n_a, n_b, n_c))

# Stack the three subsets row-wise, keeping the a, b, c order.
arr_outcomes = np.vstack([arr_outcomes_a, arr_outcomes_b, arr_outcomes_c])
n = arr_outcomes.shape[0]
print(arr_outcomes.shape)

# The last column is the in-hospital death flag.
y_inhospdeath = arr_outcomes[:, -1]
print("Percentage of in-hosp death: %.2f%%" % (np.sum(y_inhospdeath)/n*100))
print(y_inhospdeath.shape)

os.makedirs('../datasets/p12/processed_data', exist_ok=True)

# Persist the stacked outcomes in npy format.
np.save('../datasets/p12/processed_data/arr_outcomes.npy', arr_outcomes)
print('arr_outcomes.npy saved')


2. extract all parameters encountered across all patients

In [None]:
def extract_unq_params(path):
    """Return the distinct parameter names found in the ``.txt`` files of ``path``.

    Every file is read as CSV with ``header=1`` and explicit column names
    (time, param, value): the row at index 1 is consumed as the header and
    replaced by those names, so parsing starts at the third line of the file.

    Args:
        path: directory holding the per-patient ``.txt`` files; entries with
            any other extension are ignored.

    Returns:
        A sorted list of the unique values of the ``param`` column, with
        missing (NaN) entries dropped. The list is sorted because plain set
        iteration order over strings changes from one Python process to the
        next, which would make everything derived from it irreproducible.
    """
    params_all = set()
    for fname in sorted(os.listdir(path)):
        if not fname.endswith('.txt'):
            continue
        # os.path.join copes with ``path`` given with or without a trailing slash.
        df_temp = pd.read_csv(os.path.join(path, fname), sep=",", header=1,
                              names=["time", "param", "value"])
        params_all.update(df_temp["param"].tolist())

    # Empty parameter cells are parsed as NaN; they are not variable names.
    named_params = [p for p in params_all if str(p) != 'nan']
    return sorted(named_params, key=str)


# Collect the parameter names seen in each of the three raw-data subsets.
params_a = extract_unq_params('../datasets/p12/rawdata/set-a/')
params_b = extract_unq_params('../datasets/p12/rawdata/set-b/')
params_c = extract_unq_params('../datasets/p12/rawdata/set-c/')

params = params_a + params_b + params_c
# De-duplicate, then sort: iteration order of a set of strings depends on the
# per-process hash seed, and this list is later saved and used to assign the
# feature columns, so an unsorted list would change column order between runs.
param_list = sorted(set(params), key=str)
print('#params:', len(param_list))


3. Remove the 5 static fields (Gender, Height, Weight, Age, ICUType) from the time-series parameter list

In [None]:
# Static (per-patient) descriptors; their names are stored separately from the
# time-series variable names.
static_param_list = ['Age', 'Gender', 'Height', 'ICUType', 'Weight']

# Filter into a new list instead of calling .remove() five times: the cell can
# then be re-run without raising ValueError on the already-removed names.
param_list = [p for p in param_list if p not in static_param_list]

print("Parameters: ", param_list)
print("Number of total parameters:", len(param_list))

# save variable names
np.save('../datasets/p12/processed_data/ts_params.npy', param_list)
# Report the actual count rather than a hard-coded number.
print('ts_params.npy: the names of %d variables' % len(param_list))

np.save('../datasets/p12/processed_data/static_params.npy', static_param_list)
print('save names of static descriptors: static_params.npy')


4. parse variables

In [None]:

def parse_all(path, params=None):
    """Parse every ``.txt`` patient file in ``path`` into a list of dictionaries.

    Files are processed in sorted filename order. Each file is read as CSV with
    ``header=1`` and column names (time, param, value), so parsing starts at the
    third line of the file. The first five parsed rows supply the static values;
    the remaining rows are the time series.

    Args:
        path: directory containing the patient files; entries not ending in
            ``.txt`` are skipped.
        params: iterable of variable names to keep in the time series. When
            omitted, the notebook-level ``param_list`` is used, which matches
            the behaviour of calling ``parse_all(path)``.

    Returns:
        A list with one dict per file:
            'id'     - integer taken from the filename before the first '.',
            'static' - 5-tuple of the value column of the first five rows,
            'ts'     - list of (hrs, mins, totalmins, param, value) tuples for
                       the rows whose parameter is in ``params``.

    Raises:
        IndexError: if a file has fewer than five parsed rows.
        ValueError: if a filename stem is not an integer or a time stamp does
            not start with ``HH:MM``.
    """
    # A set gives constant-time membership tests inside the per-row loop.
    wanted = set(param_list if params is None else params)

    P_list = []
    for fname in sorted(os.listdir(path)):
        if not fname.endswith('.txt'):
            continue

        df = pd.read_csv(os.path.join(path, fname), sep=",", header=1,
                         names=["time", "param", "value"])
        arr_demogr = df.iloc[0:5].values
        arr_data = df.iloc[5:].values

        my_dict = {'id': int(fname.split('.')[0])}
        my_dict['static'] = tuple(arr_demogr[row, 2] for row in range(5))

        # time-series
        ts_list = []
        for ts, param, value in arr_data:
            if param not in wanted:
                continue
            # Time stamps are 'HH:MM' strings: characters 0-1 are hours, 3-4 minutes.
            hrs, mins = float(ts[0:2]), float(ts[3:5])
            totalmins = 60.0*hrs + mins
            ts_list.append((hrs, mins, totalmins, param, value))
        my_dict['ts'] = ts_list

        P_list.append(my_dict)
    return P_list

# Parse the three raw-data subsets in a, b, c order.
subset_dirs = [
    '../datasets/p12/rawdata/set-a/',
    '../datasets/p12/rawdata/set-b/',
    '../datasets/p12/rawdata/set-c/',
]
p_list_a, p_list_b, p_list_c = [parse_all(subset_dir) for subset_dir in subset_dirs]

# One flat list holding every patient dictionary.
P_list = [*p_list_a, *p_list_b, *p_list_c]
print('Length of P_list', len(P_list))

np.save('../datasets/p12/processed_data/P_list.npy', P_list)
print('P_list.npy saved')


In [None]:

# Quick summary: patient count, then size and content of both name lists.
print('number of samples: ', len(P_list))
for name_list in (param_list, static_param_list):
    print(len(name_list), name_list)


Compute the maximum number of unique time points per patient (within the first 48 hours)

In [None]:
# For every patient, count the distinct time stamps (total minutes) that lie
# strictly before the 48-hour cut-off, then report the largest count.
max_tmins = 48*60
n = len(P_list)

len_ts = [
    len({sample[2] for sample in P_list[ind]['ts'] if sample[2] < max_tmins})
    for ind in range(n)
]
print('max unique time series length:', np.max(len_ts))


5. Group all patient time series into arrays

In [None]:
# Names of the static features after one-hot encoding Gender (2 columns) and
# ICUType (4 columns).
extended_static_list = ['Age', 'Gender=0', 'Gender=1', 'Height', 'ICUType=1', 'ICUType=2', 'ICUType=3', 'ICUType=4', 'Weight']
np.save('../datasets/p12/processed_data/extended_static_params.npy', extended_static_list)

# Reload the artefacts written by the earlier cells.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files that were
# produced by this notebook.
P_list = np.load('../datasets/p12/processed_data/P_list.npy', allow_pickle=True)
arr_outcomes = np.load('../datasets/p12/processed_data/arr_outcomes.npy', allow_pickle=True)

ts_params = np.load('../datasets/p12/processed_data/ts_params.npy', allow_pickle=True)
static_params = np.load('../datasets/p12/processed_data/static_params.npy', allow_pickle=True)

max_tmins = 48*60  # only measurements strictly before this minute are kept
n = len(P_list)
max_len = 215      # rows allocated per patient; must cover the largest number of unique time points
F = len(ts_params)

# Column index of every time-series variable, resolved once instead of scanning
# ts_params for each measurement. setdefault keeps the first occurrence.
param_col = {}
for col, name in enumerate(ts_params.tolist()):
    param_col.setdefault(name, col)

# Position inside extended_static that is set to 1 for each categorical value;
# values outside these tables leave all of their one-hot columns at 0.
gender_slot = {0: 1, 1: 2}
icutype_slot = {1: 4, 2: 5, 3: 6, 4: 7}

PTdict_list = []
for ind in range(n):
    ID = P_list[ind]['id']
    static = P_list[ind]['static']
    ts = P_list[ind]['ts']

    # Row index of every unique time stamp, in order of first appearance.
    time_row = {}
    for sample in ts:
        tmins = sample[2]
        if tmins < max_tmins and tmins not in time_row:
            time_row[tmins] = len(time_row)

    length = len(time_row)
    if length > max_len:
        # Fail with an explicit message rather than an IndexError deep in the fill loop.
        raise ValueError('patient %s has %d unique time points, more than max_len = %d'
                         % (ID, length, max_len))

    # one-hot encoding of categorical static variables
    extended_static = [static[0], 0, 0, static[2], 0, 0, 0, 0, static[4]]
    for slot in (gender_slot.get(static[1]), icutype_slot.get(static[3])):
        if slot is not None:
            extended_static[slot] = 1

    # Arrays of maximal size; rows past `length` stay zero.
    Parr = np.zeros((max_len, F))
    Tarr = np.zeros((max_len, 1))

    # Store each measurement at (time row, variable column); a repeated
    # (time, variable) pair is overwritten by the later measurement.
    for sample in ts:
        tmins, param, value = sample[2], sample[-2], sample[-1]
        if tmins < max_tmins:
            time_id = time_row[tmins]
            Parr[time_id, param_col[param]] = value
            Tarr[time_id, 0] = tmins

    # construct dictionary
    my_dict = {'id': ID, 'static': static, 'extended_static': extended_static, 'arr': Parr, 'time': Tarr, 'length': length}

    # add array into list
    PTdict_list.append(my_dict)

print(len(PTdict_list))
np.save('../datasets/p12/processed_data/PTdict_list.npy', PTdict_list)
print('PTdict_list.npy saved', PTdict_list[0].keys())


6. Remove blacklisted patients (12 record IDs) from the patient list and the outcomes

In [None]:
import numpy as np


# Remove the 12 blacklisted patients from both the patient list and the outcomes.
PTdict_list = np.load('../datasets/p12/processed_data/PTdict_list.npy', allow_pickle=True)
arr_outcomes = np.load('../datasets/p12/processed_data/arr_outcomes.npy', allow_pickle=True)

# remove blacklist patients
blacklist = [140501, 150649, 140936, 143656, 141264, 145611, 142998, 147514, 142731, 150309, 155655, 156254]
blacklisted_ids = set(blacklist)

# Both arrays are filtered with the same row mask, so they must line up one-to-one.
assert len(PTdict_list) == len(arr_outcomes), 'patient list and outcomes differ in length'

# A mask checks every patient exactly once. Deleting inside an index loop and
# then advancing the index skips the element that slides into the freed slot,
# so two adjacent blacklisted patients would not both be removed.
keep = np.array([patient['id'] not in blacklisted_ids for patient in PTdict_list], dtype=bool)
PTdict_list = PTdict_list[keep]
arr_outcomes = arr_outcomes[keep]
n = len(PTdict_list)

print(len(PTdict_list), arr_outcomes.shape)

np.save('../datasets/p12/processed_data/PTdict_list.npy', PTdict_list)
np.save('../datasets/p12/processed_data/arr_outcomes.npy', arr_outcomes)


7. generate split

In [None]:
import pickle as pkl
from torch_geometric.seed import seed_everything


# Use an 8:1:1 train/validation/test split.
p_train = 0.80
p_val = 0.10
p_test = 0.10

n = len(PTdict_list)  # original 12000 patients, remove 12 outliers
n_train = round(n*p_train)
n_val = round(n*p_val)
n_test = n - (n_train+n_val)  # the test set takes whatever is left after rounding
Nsplits = 5

# The split files are written into this directory; create it so the cell also
# works on a fresh checkout where it does not exist yet.
os.makedirs('../datasets/p12/splits', exist_ok=True)

for seed in range(10):
    # Seed once per seed value; the Nsplits permutations drawn below then
    # follow deterministically from it.
    seed_everything(seed)
    for split in range(Nsplits):
        p = np.random.permutation(n)
        idx_train = p[:n_train]
        idx_val = p[n_train:n_train+n_val]
        idx_test = p[n_train+n_val:]
        with open(f'../datasets/p12/splits/seed{seed}_split{split}.pkl', 'wb') as wbfile:
            print(f'seed{seed}_split{split}', len(idx_train), len(idx_val), len(idx_test))
            pkl.dump((idx_train, idx_val, idx_test), wbfile)

print('split IDs saved')
