## This script creates the final train/test X and y for AdaBoost in *.mat files
* Set db_name variabele accordingly before running this script. db_name$\in${'ism', 'stm', 'banner'}

In [3]:
import pedAKI_predictor as ppaki
import os
import numpy as np
import itertools
import pandas as pd
import pickle
from scipy import signal, stats, io
import stm_utilities as stm

In [2]:
db_name = 'ism'
fileDir = os.path.dirname("__file__")
cut_off = 0
fill_mode = 'no_fill'
ref_type = 'onset'
test_size = 0.3
cv = 5

timelag_all = list(-1*np.arange(25))
timewin_all = [12, 6]

combination = [(x,y) for x in timelag_all for y in timewin_all]
mask = [abs(x)>=abs(y) for (x,y) in combination]
combination = list(itertools.compress(combination,mask))

## Create ISM train-test data set (no filling NaN values)

In [None]:
if db_name=='ism':
    reload(ppaki)
    # dirFrom_ism = 'io_ism5'
    # dirTo_ism = 'train_test_ism7_nofill_only_level'
    dirFrom_ism = 'io_ism3'
    dirTo_ism = 'train_test_ism_nofill'

    if not os.path.exists(os.path.join(fileDir, dirTo_ism)):
        os.makedirs(os.path.join(fileDir, dirTo_ism))

    # ceid_group_ism = pickle.load(open(os.path.join(fileDir, 'io_ism5', 'ceid_group_ism.pkl'), 'rb'))
    # only_level = True
    # only_rate = False

    for tlag, twin in np.abs(combination):

    #     Check if test-train files exist
        fname_tt_pkl = "ism_onset_tt_tlag{:03d}_twin{:03d}.pkl".format(tlag, twin)
        fname_tt_pkl = os.path.join(fileDir, dirTo_ism, fname_tt_pkl)
        fname_tt_mat = 'ism_onset_tt_tlag{:03d}_twin{:03d}.mat'.format(tlag, twin)
        fname_tt_mat = os.path.join(fileDir, dirTo_ism, fname_tt_mat)

        if os.path.isfile(fname_tt_pkl) and os.path.isfile(fname_tt_mat):
            pass
        else:
    #         try:
            fname_aki = "ism_onset_io_tlag{:03d}_twin{:03d}_aki.pkl".format(tlag, twin)
            fname_con = "ism_onset_io_tlag{:03d}_twin{:03d}_con.pkl".format(tlag, twin)
            fname_aki = os.path.join(fileDir, dirFrom_ism, fname_aki)
            fname_con = os.path.join(fileDir, dirFrom_ism, fname_con)
            io_mat_aki = pd.read_pickle(fname_aki)
    #         if only_level:
    #             io_mat_aki = io_mat_aki.loc[np.in1d(io_mat_aki.encounter_id, ceid_group_ism['ceidAKILevel']), :]
    #         elif only_rate:
    #             io_mat_aki = io_mat_aki.loc[np.in1d(io_mat_aki.encounter_id, ceid_group_ism['ceidAKIRate']), :]

            io_mat_con = pd.read_pickle(fname_con)
            io_mat = pd.concat([io_mat_aki, io_mat_con], axis=0)

            lr_pred = ppaki.AKI_predictor_log(io_mat, False, cutoff=cut_off, fill_mode=fill_mode, ref_type=ref_type, cv=cv, 
                                                      timelag=tlag, timewindow=twin, do_balance=False)

            X_train = lr_pred.X_train.as_matrix()
            y_train = lr_pred.y_train
            X_test = lr_pred.X_test.as_matrix()
            y_test = lr_pred.y_test
            predictors = lr_pred.cols

            f = open(fname_tt_pkl, 'wb')
            pickle.dump({'X_train': X_train, 
                         'y_train': y_train, 
                         'X_test': X_test,
                         'y_test': y_test, 
                         'predictors': predictors}, f)
            f.close()

            io.savemat(fname_tt_mat, {'X_train':X_train, 'y_train': y_train, 
                                   'X_test': X_test, 'y_test': y_test, 
                                   'predictors': predictors})
    #         except:
    #             pass

## Create STM train-test data set (no filling NaN values)
> UOMs of lactic_acid, creatinine, glucose, albumin are converted to be consistent with those of  ISM UOMs

In [None]:
if db_name=='stm'
    dirFrom_stm = 'io_stm3'
    dirTo_stm = 'train_test_stm_nofill'
    if not os.path.exists(os.path.join(fileDir, dirTo_stm)):
        os.makedirs(os.path.join(fileDir, dirTo_stm))

    # ceid_group_stm = pickle.load(open(os.path.join(fileDir, 'io_stm5', 'ceid_group_stm.pkl'), 'rb'))
    # only_level = True
    # only_rate = False

    for tlag, twin in np.abs(combination):

    #     Check if test-train files exist
        fname_tt_pkl = "stm_onset_tt_tlag{:03d}_twin{:03d}.pkl".format(tlag, twin)
        fname_tt_pkl = os.path.join(fileDir, dirTo_stm, fname_tt_pkl)
        fname_tt_mat = 'stm_onset_tt_tlag{:03d}_twin{:03d}.mat'.format(tlag, twin)
        fname_tt_mat = os.path.join(fileDir, dirTo_stm, fname_tt_mat)

    #     if os.path.isfile(fname_tt_pkl) and os.path.isfile(fname_tt_mat):
    #         pass
    #     else:
    #         try:
        fname_aki = "stm_onset_io_tlag{:03d}_twin{:03d}_aki.pkl".format(tlag, twin)
        fname_con = "stm_onset_io_tlag{:03d}_twin{:03d}_con.pkl".format(tlag, twin)
        fname_aki = os.path.join(fileDir, dirFrom_stm, fname_aki)
        fname_con = os.path.join(fileDir, dirFrom_stm, fname_con)
        io_mat_aki = pd.read_pickle(fname_aki)
    #     if only_level:
    #         io_mat_aki = io_mat_aki.loc[np.in1d(io_mat_aki.encounter_id, ceid_group_stm['ceidAKILevel']), :]
    #     elif only_rate:
    #         io_mat_aki = io_mat_aki.loc[np.in1d(io_mat_aki.encounter_id, ceid_group_stm['ceidAKIRate']), :]

        io_mat_con = pd.read_pickle(fname_con)
        io_mat = pd.concat([io_mat_aki, io_mat_con], axis=0)
        for ft in io_mat.columns:
            if 'lactic_acid' in ft:
                io_mat[ft] = io_mat[ft]*9.009        
            #     elif 'creatinine' in ft:
            #         _io_onset[ft] = _io_onset[ft]*0.01131
            elif 'glucose' in ft:
                io_mat[ft] = io_mat[ft]*18.0182
            elif 'albumin' in ft:
                io_mat[ft] = io_mat[ft]*0.1



    #             io_mat = stm.uomConvert('stm', 'ism', io_mat)

        lr_pred = ppaki.AKI_predictor_log(io_mat, ready=False, cutoff=cut_off, fill_mode=fill_mode, 
                                          ref_type=ref_type, cv=cv, 
                                          timelag=tlag, timewindow=twin, do_balance=False)

        X_train = lr_pred.X_train.as_matrix()
        y_train = lr_pred.y_train
        X_test = lr_pred.X_test.as_matrix()
        y_test = lr_pred.y_test
        predictors = lr_pred.cols        

        f = open(fname_tt_pkl, 'wb')
        pickle.dump({'X_train': X_train, 
                     'y_train': y_train, 
                     'X_test': X_test,
                     'y_test': y_test, 
                     'predictors': predictors}, f)
        f.close()

        io.savemat(fname_tt_mat, {'X_train':X_train, 'y_train': y_train, 
                                  'X_test': X_test, 'y_test': y_test, 
                                  'predictors': predictors})
    #         except:
    #             pass

## Create Banner train-test data set (no filling NaN values)
> UOMs of lactic_acid is converted to be consistent with that of ISM UOM

In [1]:
if db_name=='banner'
    dirFrom_banner = 'io_banner3'
    dirTo_banner = 'train_test_banner_nofill'
    # ceid_group_stm = pickle.load(open(os.path.join(fileDir, 'io_stm5', 'ceid_group_stm.pkl'), 'rb'))
    # only_level = True
    # only_rate = False
    if not os.path.exists(os.path.join(fileDir, dirTo_banner)):
        os.makedirs(os.path.join(fileDir, dirTo_banner))

    for tlag, twin in np.abs(combination):

    #     Check if test-train files exist
        fname_tt_pkl = "banner_onset_tt_tlag{:03d}_twin{:03d}.pkl".format(tlag, twin)
        fname_tt_pkl = os.path.join(fileDir, dirTo_banner, fname_tt_pkl)
        fname_tt_mat = 'banner_onset_tt_tlag{:03d}_twin{:03d}.mat'.format(tlag, twin)
        fname_tt_mat = os.path.join(fileDir, dirTo_banner, fname_tt_mat)

    #     if os.path.isfile(fname_tt_pkl) and os.path.isfile(fname_tt_mat):
    #         pass
    #     else:
    #         try:
        fname_aki = "banner_onset_io_tlag{:03d}_twin{:03d}_aki.pkl".format(tlag, twin)
        fname_con = "banner_onset_io_tlag{:03d}_twin{:03d}_con.pkl".format(tlag, twin)
        fname_aki = os.path.join(fileDir, dirFrom_banner, fname_aki)
        fname_con = os.path.join(fileDir, dirFrom_banner, fname_con)
        io_mat_aki = pd.read_pickle(fname_aki)
    #     if only_level:
    #         io_mat_aki = io_mat_aki.loc[np.in1d(io_mat_aki.encounter_id, ceid_group_stm['ceidAKILevel']), :]
    #     elif only_rate:
    #         io_mat_aki = io_mat_aki.loc[np.in1d(io_mat_aki.encounter_id, ceid_group_stm['ceidAKIRate']), :]

        io_mat_con = pd.read_pickle(fname_con)
        io_mat = pd.concat([io_mat_aki, io_mat_con], axis=0)
        for ft in io_mat.columns:
            if 'lactic_acid' in ft:
                io_mat[ft] = io_mat[ft]*9.009        
            #     elif 'creatinine' in ft:
            #         _io_onset[ft] = _io_onset[ft]*0.01131
    #         elif 'glucose' in ft:
    #             io_mat[ft] = io_mat[ft]*18.0182
    #         elif 'albumin' in ft:
    #             io_mat[ft] = io_mat[ft]*0.1



    #             io_mat = stm.uomConvert('stm', 'ism', io_mat)

        lr_pred = ppaki.AKI_predictor_log(io_mat, ready=False, cutoff=cut_off, fill_mode=fill_mode, 
                                          ref_type=ref_type, cv=cv, 
                                          timelag=tlag, timewindow=twin, do_balance=False)

        X_train = lr_pred.X_train.as_matrix()
        y_train = lr_pred.y_train
        X_test = lr_pred.X_test.as_matrix()
        y_test = lr_pred.y_test
        predictors = lr_pred.cols        

        f = open(fname_tt_pkl, 'wb')
        pickle.dump({'X_train': X_train, 
                     'y_train': y_train, 
                     'X_test': X_test,
                     'y_test': y_test, 
                     'predictors': predictors}, f)
        f.close()

        io.savemat(fname_tt_mat, {'X_train':X_train, 'y_train': y_train, 
                                  'X_test': X_test, 'y_test': y_test, 
                                  'predictors': predictors})
    #         except:
    #             pass

SyntaxError: invalid syntax (<ipython-input-1-2a4cb325b0c9>, line 1)

## Create ISM-STM across-institute train-test data set (Inner join)
> * For common predictos in both ISM and STM: no filling NaN values
> * For missing predictors in STM: Ignored
> * UOMs of lactic_acid, creatinine, glucose, albumin are converted to be consistent with those of  ISM UOMs
> * ph, glucose, ratio_pao2_flo2 ignored since the distributions are different
> * ph in STM is gastric ph whereas ph in ISM is blood ph. Gastric ph is significantly lower than blood ph.
> * glucose in STM seems to be lower than that in ISM
> * PF ratio in ISM and STM are not in the same dynamic range. PF ratio given in STM is not consistent
> with PF ratio calculated by PaO2 and FiO2
> * Class balanced
> * Not normalized

In [None]:
# ex_fts = ['ph', 'glucose', 'ratio_pao2_flo2']
# suffices = ['min', 'max', 'mean', 'median', 'last']
# ex_fts_full = [ft+"_"+suff for ft in ex_fts for suff in suffices]

# for tlag, twin in np.abs(combination):
    
# #     Check if test-train files exist
#     fname_tt_pkl = "across_inner_onset_tt_tlag{:03d}_twin{:03d}.pkl".format(tlag, twin)
#     fname_tt_pkl = os.path.join(fileDir, "train_test_across_inner_nofill", fname_tt_pkl)
#     fname_tt_mat = 'across_inner_onset_tt_tlag{:03d}_twin{:03d}.mat'.format(tlag, twin)
#     fname_tt_mat = os.path.join(fileDir, 'train_test_across_inner_nofill', fname_tt_mat)
    
#     if os.path.isfile(fname_tt_pkl) and os.path.isfile(fname_tt_mat):
#         pass
#     else:
# #         try:
#             fname_ism_aki = "ism_onset_io_tlag{:03d}_twin{:03d}_aki.pkl".format(tlag, twin)
#             fname_ism_con = "ism_onset_io_tlag{:03d}_twin{:03d}_con.pkl".format(tlag, twin)
#             fname_ism_aki = os.path.join(fileDir, "io_ism", fname_ism_aki)
#             fname_ism_con = os.path.join(fileDir, "io_ism", fname_ism_con)
#             io_mat_ism_aki = pd.read_pickle(fname_ism_aki)
#             io_mat_ism_con = pd.read_pickle(fname_ism_con)
#             io_mat_ism = pd.concat([io_mat_ism_aki, io_mat_ism_con], axis=0)

#             fname_stm_aki = "stm_onset_io_tlag{:03d}_twin{:03d}_aki.pkl".format(tlag, twin)
#             fname_stm_con = "stm_onset_io_tlag{:03d}_twin{:03d}_con.pkl".format(tlag, twin)
#             fname_stm_aki = os.path.join(fileDir, "io_stm", fname_stm_aki)
#             fname_stm_con = os.path.join(fileDir, "io_stm", fname_stm_con)
#             io_mat_stm_aki = pd.read_pickle(fname_stm_aki)
#             io_mat_stm_con = pd.read_pickle(fname_stm_con)
#             io_mat_stm = pd.concat([io_mat_stm_aki, io_mat_stm_con], axis=0)
#             io_mat_stm = stm.uomConvert('stm', 'ism', io_mat_stm)

#             io_mat = pd.concat([io_mat_ism, io_mat_stm], axis=0, join='inner')
#             valcol = [col for col in io_mat.columns if col not in ex_fts_full]
#             io_mat = io_mat.loc[:, valcol]

#             lr_pred = ppaki.AKI_predictor_log(io_mat, ready=False, cutoff=cut_off, fill_mode=fill_mode,
#                                               ref_type=ref_type, cv=cv, 
#                                               timelag=tlag, timewindow=twin)

#             X_train = lr_pred.X_train.as_matrix()
#             y_train = lr_pred.y_train
#             X_test = lr_pred.X_test.as_matrix()
#             y_test = lr_pred.y_test
#             predictors = lr_pred.cols

#             f = open(fname_tt_pkl, 'wb')
#             pickle.dump({'X_train': X_train, 
#                          'y_train': y_train, 
#                          'X_test': X_test,
#                          'y_test': y_test, 
#                          'predictors': predictors}, f)
#             f.close()

#             io.savemat(fname_tt_mat, {'X_train':X_train, 'y_train': y_train, 
#                                    'X_test': X_test, 'y_test': y_test, 
#                                    'predictors': predictors})
# #         except:
# #             pass