In [None]:
from __future__ import print_function, division
%load_ext autoreload

In [None]:
%autoreload

import copy, math, os, pickle, time, sys, os, random, argparse, pandas as pd, numpy as np, scipy.stats as ss
import pathlib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, precision_score, f1_score, recall_score
from sklearn import preprocessing

import torch, torch.utils.data as utils, torch.nn as nn, torch.nn.functional as F, torch.optim as optim
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from torch.nn.utils import weight_norm
#from mmd_grud_utils import *

import multiprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

from sklearn.model_selection import StratifiedKFold

In [None]:
'''
Single CUDA device, comment for directions on multiple GPU
'''
# Restrict this process to GPU 0 through the CUDA_VISIBLE_DEVICES environment
# variable. To expose several GPUs, use the commented-out assignment instead.
# NOTE(review): torch is already imported above; presumably this only takes
# effect if it runs before the first CUDA call - confirm.
#os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
'''
See CUDA versions, CUDA must be available for GPU acceleration
'''
# One value per line, in order: PyTorch version, the CUDA version PyTorch was
# built against, and whether a CUDA device is usable right now.
print(
    torch.__version__,
    torch.version.cuda,
    torch.cuda.is_available(),
    sep="\n",
)

In [None]:
'''
Show the name of the visible CUDA device (skipped on CPU-only machines)
'''
# torch.cuda.get_device_name raises when no CUDA device is available, which
# would stop a "Run All" on a CPU-only machine, so the call is guarded.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CUDA not available - running on CPU"
# torch.cuda.get_device_name(1)
# torch.cuda.get_device_name(2)
# torch.cuda.get_device_name(3)

In [None]:
'''
See github directions on getting data access from Physionet.org
'''
# Path of the HDF5 file read with pd.read_hdf (keys 'vitals_labs' and 'patients').
DATA_FILEPATH     = "./data/all_hourly_data.h5"
#RAW_DATA_FILEPATH = './all_hourly_data.h5'
# Only stays with max_hours > WINDOW_SIZE + GAP_TIME are kept as the cohort.
GAP_TIME          = 6  # In hours
# Only rows with hours_in < WINDOW_SIZE are used as model input.
WINDOW_SIZE       = 24 # In hours
# Seed passed to set_primary_seeds and to the cross-validation splitter.
SEED              = 1
# Index levels that identify one ICU stay; used as group-by keys in simple_imputer.
ID_COLS           = ['subject_id', 'hadm_id', 'icustay_id']
# Prefix for the result files written by the evaluation loop.
RESULTS_DIR     = "./results/"
PROCESSED_DATA_DIR = "./processed_data/"


def set_primary_seeds(seed):
    """Seed every random number generator used by this notebook.

    Seeds PyTorch (CPU and all CUDA devices), NumPy's global generator and
    Python's ``random`` module, and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed. A falsy value (``None`` or ``0``) falls back to 1.

    The previous implementation ignored ``seed`` and always used the
    module-level ``SEED`` constant; the argument is now honoured.
    """
    print("Setting primary seeds...")
    if not seed:
        seed = 1

    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # for multiGPUs.
    # Disable the cuDNN autotuner and request deterministic kernels so that
    # repeated runs with the same seed follow the same code path.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
set_primary_seeds(SEED)

In [None]:
'''
Some tools from: https://github.com/MLforHealth/MIMIC_Extract/tree/master/notebooks
'''
class DictDist():
    """Joint sampler over a dictionary of independent distributions.

    ``dict_of_rvs`` maps a name to any object exposing ``rvs(n)``.
    """
    def __init__(self, dict_of_rvs):
        self.dict_of_rvs = dict_of_rvs

    def rvs(self, n):
        """Return a list of ``n`` dicts, each holding one draw per name."""
        # Draw all n values of each distribution first, in dictionary order ...
        columns = {}
        for name, dist in self.dict_of_rvs.items():
            columns[name] = dist.rvs(n)
        # ... then regroup them so that sample `row` collects the row-th draw
        # of every distribution.
        return [
            {name: drawn[row] for name, drawn in columns.items()}
            for row in range(n)
        ]
    
class Choice():
    """Sampler that picks entries of ``options`` uniformly at random."""
    def __init__(self, options):
        self.options = options

    def rvs(self, n):
        """Return a list of ``n`` options, drawn with replacement."""
        positions = ss.randint(0, len(self.options)).rvs(n)
        return [self.options[position] for position in positions]

In [None]:
%%time
# Hourly vitals/labs table, read from the 'vitals_labs' key of the HDF5 file.
data_full_lvl2 = pd.read_hdf(DATA_FILEPATH, 'vitals_labs')
#data_full_raw  = pd.read_hdf(RAW_DATA_FILEPATH, 'vitals_labs') 
# Static table read from the 'patients' key; later cells use its max_hours,
# mort_hosp, mort_icu, los_icu and admittime columns.
statics        = pd.read_hdf(DATA_FILEPATH, 'patients')

In [None]:
'''
Some tools from: https://github.com/MLforHealth/MIMIC_Extract/tree/master/notebooks

“Simple Imputation” scheme outlined in Che et al.:

Zhengping Che, Sanjay Purushotham, Kyunghyun Cho, David Sontag, and Yan
Liu. 2018. Recurrent Neural Networks for Multivariate Time Series with Missing
Values. Scientific Reports 8, 1 (2018).

'''
def simple_imputer(df):
    """Impute missing measurements and derive mask / time-since-measured features.

    Works on a copy of ``df``. For every variable the result holds three
    columns on the 'Aggregation Function' column level: 'mean' (imputed value),
    'mask' (1.0 where the hourly count was > 0, else 0.0) and
    'time_since_measured'. Columns are sorted before returning.

    Relies on the module-level ``ID_COLS`` as group-by keys.
    """
    idx = pd.IndexSlice
    df = df.copy()
    # Reduce the column index when it carries more than two named levels.
    if len(df.columns.names) > 2: df.columns = df.columns.droplevel(('label', 'LEVEL1', 'LEVEL2'))
    
    # Keep only the 'mean' and 'count' aggregations of every variable.
    df_out = df.loc[:, idx[:, ['mean', 'count']]]
    # Mean of each variable within each ID_COLS group.
    icustay_means = df_out.loc[:, idx[:, 'mean']].groupby(ID_COLS).mean()
    
    # Fill order: forward-fill within each group, then icustay_means, then 0.
    # NOTE(review): presumably the second fillna aligns icustay_means to the
    # rows by the ID_COLS levels - confirm against the pandas version in use.
    df_out.loc[:,idx[:,'mean']] = df_out.loc[:,idx[:,'mean']].groupby(ID_COLS).fillna(
        method='ffill'
    ).groupby(ID_COLS).fillna(icustay_means).fillna(0)
    
    # Turn the hourly counts into a 0/1 observation mask and rename the level.
    df_out.loc[:, idx[:, 'count']] = (df.loc[:, idx[:, 'count']] > 0).astype(float)
    df_out.rename(columns={'count': 'mask'}, level='Aggregation Function', inplace=True)
    
    # Time since last measurement = cumulative count of absent hours minus the
    # value that cumulative count had at the most recent observed row.
    # NOTE(review): cumsum and the forward fill below are not grouped by
    # ID_COLS, so they run across the whole frame in row order - confirm that
    # carrying values over from the preceding stay is intended.
    is_absent = (1 - df_out.loc[:, idx[:, 'mask']])
    hours_of_absence = is_absent.cumsum()
    time_since_measured = hours_of_absence - hours_of_absence[is_absent==0].fillna(method='ffill')
    time_since_measured.rename(columns={'mask': 'time_since_measured'}, level='Aggregation Function', inplace=True)

    df_out = pd.concat((df_out, time_since_measured), axis=1)
    # Rows with no earlier observation to measure from are still NaN; set them to 100.
    df_out.loc[:, idx[:, 'time_since_measured']] = df_out.loc[:, idx[:, 'time_since_measured']].fillna(100)
    
    df_out.sort_index(axis=1, inplace=True)
    
    return df_out

In [None]:
'''
Data preprocessing to define 3-day and 7-day length of stay outcome labels
'''
# Cohort: stays long enough to cover the observation window plus the gap.
# .copy() gives Ys its own data, so the column assignments below do not write
# into a slice of `statics` (avoids pandas' SettingWithCopyWarning).
Ys = statics.loc[
    statics.max_hours > WINDOW_SIZE + GAP_TIME,
    ['mort_hosp', 'mort_icu', 'los_icu']
].copy()
Ys['los_3'] = Ys['los_icu'] > 3
Ys['los_7'] = Ys['los_icu'] > 7
Ys.drop(columns=['los_icu'], inplace=True)
# The former bare `Ys.astype(float)` here discarded its result and was not the
# cell's last expression, so it did nothing and has been removed; labels are
# cast to int64 when the dataloaders are built.

# Keep the rows of cohort stays that fall inside the observation window.
# The filter is evaluated once; `raw` is an independent copy of the same
# selection so later in-place edits of `lvl2` do not leak into it.
cohort_stay_ids = set(Ys.index.get_level_values('icustay_id'))
in_cohort_window = (
    data_full_lvl2.index.get_level_values('icustay_id').isin(cohort_stay_ids) &
    (data_full_lvl2.index.get_level_values('hours_in') < WINDOW_SIZE)
)
lvl2 = data_full_lvl2[in_cohort_window]
raw = lvl2.copy()

### START: LOAD GRU-D SOURCE

In [None]:
'''
GRU-D Source from: https://github.com/MLforHealth/MIMIC_Extract/tree/master/notebooks
'''

import time
def to_3D_tensor(df):
    """Stack the per-hour slices of ``df`` along a third axis.

    ``df`` must have a four-level row index whose last level is 'hours_in'.
    For every distinct hour (ascending) the rows of that hour are selected
    and the resulting 2-D blocks are stacked depth-wise, giving an array of
    shape (rows per hour, n_columns, n_hours).

    The slices are passed to ``np.dstack`` as a list: NumPy deprecated passing
    a generator to its stack functions and newer releases reject it.
    """
    idx = pd.IndexSlice
    hours = sorted(set(df.index.get_level_values('hours_in')))
    return np.dstack([df.loc[idx[:, :, :, hour], :].values for hour in hours])
def prepare_dataloader(df, Ys, batch_size, shuffle=True):
    """Wrap a feature frame and its labels in a PyTorch DataLoader.

    Args:
        df: frame indexed by (subject, hadm, icustay, hours_in); converted to
            a float32 tensor with ``to_3D_tensor``.
        Ys: labels, one row per stay; converted to int64.
        batch_size: batch size. Cast with ``int()`` so values sampled from
            NumPy/SciPy (e.g. ``numpy.int64``) are accepted, matching the
            second definition of this function later in the notebook.
        shuffle: whether the loader reshuffles every epoch.

    Returns:
        A DataLoader over (features, label) pairs. ``drop_last=True`` discards
        the final incomplete batch.
    """
    X     = torch.from_numpy(to_3D_tensor(df).astype(np.float32))
    label = torch.from_numpy(Ys.values.astype(np.int64))
    dataset = utils.TensorDataset(X, label)

    return utils.DataLoader(dataset, batch_size=int(batch_size), shuffle=shuffle, drop_last = True)

class FilterLinear(nn.Module):
    """Linear layer whose weight is multiplied element-wise by a fixed filter.

    ``forward`` computes ``F.linear(x, filter_square_matrix * weight, bias)``,
    so weight entries where the filter is 0 never contribute to the output.
    """

    def __init__(self, in_features, out_features, filter_square_matrix, bias=True):
        '''
        filter_square_matrix : tensor of 0/1 entries multiplied element-wise
                               with the weight in every forward pass.
        '''
        super(FilterLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features

        assert in_features > 1 and out_features > 1, "Passing in nonsense sizes"

        # GPU placement is switched off in this notebook, so the filter is
        # kept where the caller created it; it is not a trainable parameter.
        self.filter_square_matrix = Variable(filter_square_matrix, requires_grad=False)

        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Draw weight, then bias, uniformly from [-1/sqrt(in), 1/sqrt(in)]."""
        bound = 1. / math.sqrt(self.weight.size(1))
        for param in (self.weight, self.bias):
            if param is not None:
                param.data.uniform_(-bound, bound)

    def forward(self, x):
        """Apply the filtered linear map to ``x``."""
        filtered_weight = self.filter_square_matrix.mul(self.weight)
        return F.linear(x, filtered_weight, self.bias)

    def __repr__(self):
        return '{}(in_features={}, out_features={}, bias={})'.format(
            self.__class__.__name__,
            str(self.in_features),
            str(self.out_features),
            str(self.bias is not None),
        )
        
class GRUD(nn.Module):
    def __init__(self, input_size, cell_size, hidden_size, X_mean, batch_size = 0, output_last = False):
        """
        With minor modifications from https://github.com/zhiyongc/GRU-D/
        Recurrent Neural Networks for Multivariate Times Series with Missing Values
        GRU-D: GRU exploit two representations of informative missingness patterns, i.e., masking and time interval.

        Implemented based on the paper:
        @article{che2018recurrent,
          title={Recurrent neural networks for multivariate time series with missing values},
          author={Che, Zhengping and Purushotham, Sanjay and Cho, Kyunghyun and Sontag, David and Liu, Yan},
          journal={Scientific reports},
          volume={8},
          number={1},
          pages={6085},
          year={2018},
          publisher={Nature Publishing Group}
        }

        Args:
            input_size: number of input variables per time step (also the size
                of the mask and delta vectors).
            cell_size: accepted for interface compatibility; not used.
            hidden_size: dimension of the hidden state.
            X_mean: per-feature means, indexed as X_mean[:, t, :] in forward,
                i.e. expected with a leading dimension of 1 and one row per
                time step.
            batch_size: only used to size the legacy ``zeros`` / ``zeros_h``
                attributes; the forward pass no longer depends on it, so any
                batch size can be fed to the model.
            output_last: stored on the module and read by Train_Model; forward
                always returns the 2-way output of the final time step.
        """

        super(GRUD, self).__init__()

        self.hidden_size = hidden_size
        self.delta_size = input_size
        self.mask_size = input_size

        # All tensors are created on the CPU, as in the rest of this notebook.
        self.identity = torch.eye(input_size)
        # Legacy attributes kept for backward compatibility; step() uses relu
        # instead of an element-wise max against these fixed-size zeros.
        self.zeros = torch.zeros(batch_size, input_size)
        self.zeros_h = torch.zeros(batch_size, self.hidden_size)
        self.X_mean = torch.Tensor(X_mean)

        self.zl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size) # Wz, Uz are part of the same network. the bias is bz
        self.rl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size) # Wr, Ur are part of the same network. the bias is br
        self.hl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size) # W, U are part of the same network. the bias is b

        # Input decay: the identity filter restricts the weight to its diagonal.
        self.gamma_x_l = FilterLinear(self.delta_size, self.delta_size, self.identity)

        # Hidden-state decay maps the delta vector to hidden_size.
        self.gamma_h_l = nn.Linear(self.delta_size, self.hidden_size)

        self.output_last = output_last

        # Two-class head applied to the final hidden state.
        self.fc = nn.Linear(self.hidden_size, 2)
        self.bn= torch.nn.BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True)
        self.drop=nn.Dropout(p=0.5, inplace=False)

    def step(self, x, x_last_obsv, x_mean, h, mask, delta):
        """
        Advance the recurrent state by one time step.

        Inputs:
            x: input tensor
            x_last_obsv: input tensor with forward fill applied
            x_mean: the mean of each feature (one row; repeated over the batch)
            h: the hidden state of the network
            mask: the mask of whether or not the current value is observed
            delta: the tensor indicating the number of steps since the last time a feature was observed.

        Returns:
            h: the updated hidden state of the network
        """
        batch_size = x.size()[0]

        # Decay factors exp(-max(0, gamma(delta))). relu is the same rectifier
        # as the former torch.max against pre-allocated zeros, without tying
        # the model to the batch size given to the constructor.
        delta_x = torch.exp(-F.relu(self.gamma_x_l(delta)))
        delta_h = torch.exp(-F.relu(self.gamma_h_l(delta)))

        x_mean = x_mean.repeat(batch_size, 1)

        # Observed values pass through; missing ones decay from the last
        # observation towards the feature mean.
        x = mask * x + (1 - mask) * (delta_x * x_last_obsv + (1 - delta_x) * x_mean)
        h = delta_h * h

        combined = torch.cat((x, h, mask), 1)
        z = torch.sigmoid(self.zl(combined)) #sigmoid(W_z*x_t + U_z*h_{t-1} + V_z*m_t + bz)
        r = torch.sigmoid(self.rl(combined)) #sigmoid(W_r*x_t + U_r*h_{t-1} + V_r*m_t + br)
        combined_new = torch.cat((x, r*h, mask), 1)
        h_tilde = torch.tanh(self.hl(combined_new)) #tanh(W*x_t +U(r_t*h_{t-1}) + V*m_t) + b
        h = (1 - z) * h + z * h_tilde
        return h

    def forward(self, X, X_last_obsv, Mask, Delta):
        """
        Run the recurrence over all time steps and classify the final state.

        All four inputs are indexed as [batch, time step, feature].

        Returns:
            Tensor of shape (batch, 2): dropout(batch_norm(fc(final hidden state))).
            No softmax is applied.
        """
        batch_size = X.size(0)
        step_size = X.size(1) # num timepoints

        Hidden_State = self.initHidden(batch_size)

        for i in range(step_size):
            Hidden_State = self.step(
                X[:, i, :],
                X_last_obsv[:, i, :],
                self.X_mean[:, i, :],
                Hidden_State,
                Mask[:, i, :],
                Delta[:, i, :],
            )

        # we want to predict a binary outcome
        #Apply 50% dropout and batch norm here
        return self.drop(self.bn(self.fc(Hidden_State)))

    def initHidden(self, batch_size):
        """Return an all-zero initial hidden state of shape (batch_size, hidden_size) on the CPU."""
        return torch.zeros(batch_size, self.hidden_size)

        
def _split_grud_batch(X):
    """Split one stacked batch into the four GRU-D inputs.

    Along dimension 1 of ``X`` every third channel starting at 0 is read as the
    mask, starting at 1 as the measurement and starting at 2 as the time since
    last measurement. Each part is cast to float32 and dimensions 1 and 2 are
    swapped. The measurement doubles as "last observed value".

    Returns:
        (measurement, measurement_last_obsv, mask, time_)
    """
    mask        = X[:, 0::3, :].float().transpose(1, 2)
    measurement = X[:, 1::3, :].float().transpose(1, 2)
    time_       = X[:, 2::3, :].float().transpose(1, 2)
    return measurement, measurement, mask, time_


def Train_Model(
    model, train_dataloader, valid_dataloader, num_epochs = 300, patience = 3, min_delta = 1e-5, learning_rate=1e-3, batch_size=None
):
    """Train ``model`` with Adam and cross-entropy loss, with early stopping.

    After every training batch one validation batch is evaluated (the
    validation loader is cycled when exhausted). At the end of an epoch the
    mean validation loss decides whether the model improved.

    Args:
        model: module called as ``model(X, X_last_obsv, Mask, Delta)`` that
            exposes ``output_last`` (on its last layer for ``nn.Sequential``).
        train_dataloader, valid_dataloader: loaders yielding (X, labels) in
            the stacked layout expected by ``_split_grud_batch``.
        num_epochs: maximum number of epochs.
        patience: number of consecutive non-improving epochs before stopping.
        min_delta: minimum decrease of the validation loss that counts as an
            improvement.
        learning_rate: Adam learning rate. (It used to be overwritten with
            0.001 inside the function; the argument is now honoured.)
        batch_size: if given, every training batch must have this size.
            ``None`` disables the check (it used to fail unconditionally).

    Returns:
        (best_model, [losses_train, losses_valid, losses_epochs_train,
        losses_epochs_valid]). ``best_model`` is a deep copy taken at the best
        epoch, not a reference to the still-training ``model``.

    Raises:
        NotImplementedError: if ``output_last`` is falsy.
        ValueError: if ``train_dataloader`` yields no batches.
    """
    print('Model Structure: ', model)
    print('Start Training ... epochs: ', num_epochs)

    if (type(model) == nn.modules.container.Sequential):
        output_last = model[-1].output_last
        print('Output type dermined by the last layer')
    else:
        output_last = model.output_last
        print('Output type dermined by the model')

    # Only last-step classification is implemented; the former sequence
    # branch referenced undefined names and could never run.
    if not output_last:
        raise NotImplementedError("Should be output last!")

    loss_CEL = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    losses_train = []
    losses_valid = []
    losses_epochs_train = []
    losses_epochs_valid = []

    pre_time = time.time()

    # Variables for Early Stopping
    is_best_model = 0
    patient_epoch = 0
    best_model = model
    min_loss_epoch_valid = 10000.0

    model.train()
    for epoch in range(num_epochs):

        valid_dataloader_iter = iter(valid_dataloader)

        losses_epoch_train = []
        losses_epoch_valid = []

        for X, labels in train_dataloader:
            X, X_last_obsv, Mask, Delta = _split_grud_batch(X)

            if batch_size is not None:
                assert X.size()[0] == batch_size, "Batch Size doesn't match! %s" % str(X.size())

            optimizer.zero_grad()
            prediction = model(X, X_last_obsv, Mask, Delta)
            loss_train = loss_CEL(torch.squeeze(prediction), torch.squeeze(labels))
            loss_train.backward()
            optimizer.step()

            losses_train.append(loss_train.data)
            losses_epoch_train.append(loss_train.data)

            # validation: take the next batch, restarting the loader when it runs out
            try:
                X_val, labels_val = next(valid_dataloader_iter)
            except StopIteration:
                valid_dataloader_iter = iter(valid_dataloader)
                X_val, labels_val = next(valid_dataloader_iter)
            X_val, X_last_obsv_val, Mask_val, Delta_val = _split_grud_batch(X_val)

            # Evaluate in eval mode without autograd so that dropout is off and
            # BatchNorm running statistics are not updated with validation data.
            model.eval()
            with torch.no_grad():
                prediction_val = model(X_val, X_last_obsv_val, Mask_val, Delta_val)
                loss_valid = loss_CEL(torch.squeeze(prediction_val), torch.squeeze(labels_val))
            model.train()

            losses_valid.append(loss_valid.data)
            losses_epoch_valid.append(loss_valid.data)

        if not losses_epoch_train:
            raise ValueError("train_dataloader yielded no batches")

        avg_losses_epoch_train = sum(losses_epoch_train).cpu().numpy() / float(len(losses_epoch_train))
        avg_losses_epoch_valid = sum(losses_epoch_valid).cpu().numpy() / float(len(losses_epoch_valid))
        losses_epochs_train.append(avg_losses_epoch_train)
        losses_epochs_valid.append(avg_losses_epoch_valid)

        # Early Stopping
        if epoch == 0:
            is_best_model = 1
            best_model = copy.deepcopy(model)
            if avg_losses_epoch_valid < min_loss_epoch_valid:
                min_loss_epoch_valid = avg_losses_epoch_valid
        else:
            if min_loss_epoch_valid - avg_losses_epoch_valid > min_delta:
                is_best_model = 1
                # Snapshot the weights; keeping a reference would silently
                # follow the model through later (worse) epochs.
                best_model = copy.deepcopy(model)
                min_loss_epoch_valid = avg_losses_epoch_valid
                patient_epoch = 0
            else:
                is_best_model = 0
                patient_epoch += 1
                if patient_epoch >= patience:
                    print('Early Stopped at Epoch:', epoch)
                    break

        # Print training parameters
        cur_time = time.time()
        print('Epoch: {}, train_loss: {}, valid_loss: {}, time: {}, best model: {}'.format( \
                    epoch, \
                    np.around(avg_losses_epoch_train, decimals=8),\
                    np.around(avg_losses_epoch_valid, decimals=8),\
                    np.around([cur_time - pre_time] , decimals=2),\
                    is_best_model) )
        pre_time = cur_time

    return best_model, [losses_train, losses_valid, losses_epochs_train, losses_epochs_valid]

def predict_proba(model, dataloader):
    """Run ``model`` over every batch of ``dataloader`` in evaluation mode.

    Input:
        model: module called as ``model(X, X_last_obsv, Mask, Delta)``
        dataloader: yields (X, label); along dimension 1 of X every third
            channel starting at 0 / 1 / 2 is read as mask / measurement /
            time since last measurement
    Returns:
        probabilities: list with one NumPy array per batch holding the
            squeezed model output. No softmax is applied here, so the values
            are whatever the model returns.
        labels: list with one NumPy array per batch holding the squeezed labels

    The model is switched to eval mode and left there. Inference runs under
    ``torch.no_grad()`` so no autograd graph is built.
    """
    model.eval()

    probabilities = []
    labels        = []
    with torch.no_grad():
        for X, label in dataloader:
            mask        = X[:, 0::3, :].float().transpose(1, 2)
            measurement = X[:, 1::3, :].float().transpose(1, 2)
            time_       = X[:, 2::3, :].float().transpose(1, 2)
            # The measurement is passed twice: it also serves as the
            # "last observed value" input.
            prob = model(measurement, measurement, mask, time_)
            probabilities.append(torch.squeeze(prob).detach().cpu().numpy())
            labels.append(torch.squeeze(label).detach().cpu().numpy())

    return probabilities, labels

## Model Training & Evaluation

In [None]:
# Temporal split: stays admitted before 2180-01-01 form the internal set,
# stays admitted on or after that date the external set.
split_date = np.datetime64('2180-01-01')

# Attach the admission time on a temporary frame (aligned on the row index),
# so Ys itself keeps exactly its label columns.
Ys_dated = Ys.assign(
    admittime=statics.loc[statics.max_hours > WINDOW_SIZE + GAP_TIME, 'admittime']
)
subjects_int = Ys_dated[Ys_dated['admittime'] < split_date].index
subjects_ext = Ys_dated[Ys_dated['admittime'] >= split_date].index
del Ys_dated

# Display the labels as floats (the result is shown, not stored).
Ys.astype(float)

In [None]:
# Check that the feature table and the label table cover the same subjects.
lvl2_subj_idx,  Ys_subj_idx = [df.index.get_level_values('subject_id') for df in (lvl2, Ys)]
lvl2_subjects = set(lvl2_subj_idx)
assert lvl2_subjects == set(Ys_subj_idx), "Subject ID pools differ!"

# shuffle the dataset
# (this draw advances NumPy's global random state; N is reassigned by later cells)
subjects, N = np.random.permutation(list(lvl2_subjects)), len(lvl2_subjects)

# standardize the whole dataset
# NOTE(review): mean and std are computed over all rows of lvl2, before the
# internal/external split made in a later cell - confirm that using statistics
# from the external set is acceptable for the evaluation.
idx = pd.IndexSlice
lvl2_means, lvl2_stds = lvl2.loc[:, idx[:,'mean']].mean(axis=0), lvl2.loc[:, idx[:,'mean']].std(axis=0)
lvl2.loc[:, idx[:,'mean']] = (lvl2.loc[:, idx[:,'mean']] - lvl2_means)/lvl2_stds

# impute missing values
# (rebinds lvl2 to the imputed frame, so re-running this cell would standardize
# and impute the already processed data)
lvl2 = simple_imputer(lvl2)

### Split internal and external datasets

In [None]:
# Select features and labels of the internal and external cohorts by subject id.
idx = pd.IndexSlice
X_int = lvl2.loc[idx[subjects_int.get_level_values('subject_id')],:]
X_ext = lvl2.loc[idx[subjects_ext.get_level_values('subject_id')],:]
Y_int = Ys.loc[idx[subjects_int.get_level_values('subject_id')],:]
Y_ext = Ys.loc[idx[subjects_ext.get_level_values('subject_id')],:]

# Shuffled list of internal subject ids (advances NumPy's global random state).
all_int_subjects = list(
    np.random.permutation(Y_int.index.get_level_values('subject_id').values)
)

# Report the split sizes once. The feature frames hold one row per hour, so
# the row count is divided by WINDOW_SIZE (previously a hard-coded 24) to get
# the number of stays, assuming every stay contributes WINDOW_SIZE rows.
print("X internal shape: (%d,%d)"%(X_int.shape[0]/WINDOW_SIZE,X_int.shape[1]))
print("X external shape: (%d,%d)"%(X_ext.shape[0]/WINDOW_SIZE,X_ext.shape[1]))
print("Y internal shape: (%d,%d)"%(Y_int.shape[0],Y_int.shape[1]))
print("Y external shape: (%d,%d)"%(Y_ext.shape[0],Y_ext.shape[1]))

In [None]:
'''
Some helper data structures to store and save predictions
'''
class Logger():
    """Accumulates predictions of several tasks/folds in one DataFrame (``self.df``).

    Fixed columns: task_name, fold, prediction_no, index, y_true, y_score,
    censoring. ``optional_cols`` adds further columns that are filled from the
    ``optional_dict`` passed to ``append_logger``.
    """
    def __init__(self, optional_cols=None):
        self.columns=['task_name','fold','prediction_no','index','y_true','y_score','censoring']
        if (optional_cols is None):
            self.df=pd.DataFrame(columns=self.columns)
            self.has_optional_cols=False
        else:
            self.df=pd.DataFrame(columns=self.columns+optional_cols)
            self.has_optional_cols=True
            self.optional_cols=optional_cols
        self._rocs=[]
        # Running counter: every append_logger call gets its own number.
        self._prediction_no=0

    def append_logger(self,indices, y_true, y_score, label, censoring=None, optional_dict=None,fold=0):
        """Append one batch of predictions to ``self.df``.

        Args:
            indices: identifiers of the predicted samples.
            y_true: true labels (cast to int).
            y_score: predicted scores (cast to float).
            label: task name stored in the 'task_name' column.
            censoring: optional per-sample values; NaN for every row if omitted.
            optional_dict: column -> value mapping for the optional columns;
                ignored unless the logger was created with ``optional_cols``.
            fold: fold number stored with every row.

        Raises:
            ValueError: if ``y_true`` and ``y_score`` differ in length.

        The rows are assembled through one NumPy array of mixed content, so
        the fixed columns are stored as strings; ``preds_df_to_int`` casts the
        integer ones back.
        """
        y_true=np.array(y_true).astype(int)
        y_score=np.array(y_score).astype(float)

        if ((y_true.shape[0]!=y_score.shape[0])):
            raise ValueError("Shapes of input matrices must match")

        self._n=y_true.shape[0]

        if(censoring is None):
            cens = self._n*[math.nan]
        else:
            cens=censoring

        arr=np.array([self._n*[label],
                      self._n*[fold],
                      self._n*[self._prediction_no],
                      list(indices),
                      list(y_true),
                      list(y_score),
                      list(cens)
              ]).transpose()

        to_append=pd.DataFrame(arr, columns=self.columns)
        if(self.has_optional_cols):
            # Tolerate a missing optional_dict instead of failing on None.items().
            for column, value in (optional_dict or {}).items():
                to_append.loc[:,column]=value

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works on old and new versions.
        self.df=pd.concat([self.df, to_append])
        self._prediction_no=self._prediction_no+1
        
def preds_df_to_int(df):
    """Return a copy of ``df`` with 'fold', 'prediction_no' and 'y_true' cast to int64."""
    integer_columns = ('fold', 'prediction_no', 'y_true')
    return df.astype(dtype={column: 'int64' for column in integer_columns})

In [None]:
def to_3D_tensor(df):
    """Stack the per-hour slices of ``df`` along a third axis.

    Re-defines the helper of the same name from the GRU-D source cell.
    ``df`` must have a four-level row index whose last level is 'hours_in';
    the result has shape (rows per hour, n_columns, n_hours), hours ascending.

    The slices are passed to ``np.dstack`` as a list: NumPy deprecated passing
    a generator to its stack functions and newer releases reject it.
    """
    idx = pd.IndexSlice
    hours = sorted(set(df.index.get_level_values('hours_in')))
    return np.dstack([df.loc[idx[:, :, :, hour], :].values for hour in hours])

def to_3D_tensor2(df):
    """Same stacking as ``to_3D_tensor``, returned as a torch tensor.

    ``df`` must have a four-level row index whose last level is 'hours_in';
    the result has shape (rows per hour, n_columns, n_hours), hours ascending.
    The slices are handed to ``np.dstack`` as a list because newer NumPy
    releases reject generators.
    """
    idx = pd.IndexSlice
    hours = sorted(set(df.index.get_level_values('hours_in')))
    stacked = np.dstack([df.loc[idx[:, :, :, hour], :].values for hour in hours])
    return torch.from_numpy(stacked)

def to_2D_tensor(df):
    """Stack the entries of the 'data' column into one array.

    Every row label of ``df`` is looked up in the 'data' column and the
    retrieved values are stacked along a new first axis with ``np.stack``.
    """
    per_row = [df.loc[row_label, 'data'] for row_label in df.index]
    return np.stack(tuple(per_row))

def seed_worker(worker_id):
    """Re-seed NumPy and ``random`` from torch's initial seed.

    The seed is ``torch.initial_seed()`` reduced modulo 2**32;
    ``worker_id`` is accepted but not used.
    """
    derived_seed = torch.initial_seed() % (2 ** 32)
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)

def prepare_dataloader(df, Ys, batch_size, shuffle=True):
    """Build a DataLoader from a feature frame and its labels.

    ``df`` is indexed by (subject, hadm, icustay, hours_in) and is turned into
    a float32 tensor with ``to_3D_tensor``; ``Ys`` holds one label row per stay
    and is cast to int64. ``batch_size`` is converted with ``int()`` and the
    last incomplete batch is dropped.
    """
    features = to_3D_tensor(df).astype(np.float32)
    targets = Ys.values.astype(np.int64)
    dataset = utils.TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))
    return utils.DataLoader(dataset, batch_size=int(batch_size), shuffle=shuffle, drop_last = True)

def prepare_2d_dataloader(df, Ys, batch_size, shuffle=True):
    """Build a DataLoader from a frame carrying a 'data' column.

    The features come from ``to_2D_tensor(df)`` cast to float32; ``Ys`` holds
    the labels and is cast to int64. ``batch_size`` is converted with
    ``int()`` and the last incomplete batch is dropped.
    """
    features = to_2D_tensor(df).astype(np.float32)
    targets = Ys.values.astype(np.int64)
    dataset = utils.TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))
    return utils.DataLoader(dataset, batch_size=int(batch_size), shuffle=shuffle, drop_last = True)


def stack_dataframe(df):
    """Flatten each stay's rows into a single list, time-major.

    ``df`` has a four-level row index; rows are grouped by the first three
    levels. For every group the values are laid out row by row (all columns
    of the first row, then all columns of the second row, ...).

    Returns:
        DataFrame indexed by the distinct first-three-level keys (in order of
        first appearance) with a single object column 'data' holding the
        flattened list of each group ('' for a key without a group).

    Unlike the previous version, the caller's ``df`` is left untouched (its
    column index used to be overwritten with '_'-joined names, which made a
    second call on the same frame misbehave), and the values are flattened
    with NumPy instead of per-cell Python loops.
    """
    stay_index = df.index.droplevel(level=3).drop_duplicates()
    flattened = {
        stay_key: list(stay_rows.to_numpy().ravel())
        for stay_key, stay_rows in df.groupby(level=[0, 1, 2])
    }
    # dtype=object keeps every list as one cell instead of expanding it.
    column = pd.Series(
        [flattened.get(stay_key, '') for stay_key in stay_index],
        index=stay_index,
        dtype=object,
    )
    return pd.DataFrame({'data': column})

def stack_dataframe_alt(df):
    """Flatten each stay's rows into a single list, column-major.

    ``df`` has a four-level row index; rows are grouped by the first three
    levels. For every group the values are laid out column by column (all
    rows of the first column, then all rows of the second column, ...).

    Returns:
        DataFrame indexed by the distinct first-three-level keys (in order of
        first appearance) with a single object column 'data' holding the
        flattened list of each group ('' for a key without a group).

    Unlike the previous version, the caller's ``df`` is left untouched (its
    column index used to be overwritten with '_'-joined names, which made a
    second call on the same frame misbehave), and the values are flattened
    with NumPy instead of per-value Python loops.
    """
    stay_index = df.index.droplevel(level=3).drop_duplicates()
    flattened = {
        stay_key: list(stay_rows.to_numpy().ravel(order='F'))
        for stay_key, stay_rows in df.groupby(level=[0, 1, 2])
    }
    # dtype=object keeps every list as one cell instead of expanding it.
    column = pd.Series(
        [flattened.get(stay_key, '') for stay_key in stay_index],
        index=stay_index,
        dtype=object,
    )
    return pd.DataFrame({'data': column})

def ps_spawn(i, shared_dict, df):
    """Store ``stack_dataframe_alt(df)`` in ``shared_dict`` under key ``i``."""
    shared_dict[i] = stack_dataframe_alt(df)

In [None]:
# Number of hyperparameter configurations to sample.
# (N is also assigned in the shuffling cell above and inside the evaluation loop.)
N = 10
early_stop_frac = 0.1
hyperparams_fixed = {
    'early_stop_frac': early_stop_frac,
    'seed': SEED,
}

# Search space: one scipy.stats distribution per hyperparameter.
# NOTE(review): scipy.stats.uniform takes (loc, scale) and samples from
# [loc, loc + scale], so the learning rate is drawn from [0.002, 0.102] and
# early_stop_frac from [0.05, 0.15] - confirm these are the intended ranges.
GRU_D_dist = DictDist({
    'cell_size': ss.randint(50, 75),
    'hidden_size': ss.randint(65, 95), 
    'learning_rate': ss.uniform(2e-3, 1e-1),
    'num_epochs': ss.randint(15, 150),
    #'num_epochs': ss.randint(1, 2),
    'patience': ss.randint(3, 7),
    'early_stop_frac': ss.uniform(0.05, 0.1),
    'seed': ss.randint(1, 10000),
})
# Re-seed NumPy's global generator right before sampling so the drawn
# configurations do not depend on how much randomness earlier cells consumed.
np.random.seed(SEED)
GRU_D_hyperparams_list = GRU_D_dist.rvs(N)
RESULTS_PATH = './results/extraction_baselines_gru-d.pkl'
#with open(RESULTS_PATH, mode='rb') as f: results = pickle.load(f)
results = {}
print(GRU_D_hyperparams_list)

In [None]:
# Sanity check: display how many hyperparameter sets were sampled.
len(GRU_D_hyperparams_list)

In [None]:
# Unique subject ids present in the internal and the external feature frames.
subjects_int_alt = set(X_int.index.get_level_values('subject_id'))
subjects_ext_alt = set(X_ext.index.get_level_values('subject_id'))
for subject_ids in (subjects_int_alt, subjects_ext_alt):
    print(len(subject_ids))

In [None]:
%%capture cap --no-stderr
hyperparams_list = GRU_D_hyperparams_list.copy()
sss = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
outcomes = ['los_3', 'los_7', 'mort_icu', 'mort_hosp']
preds_int = Logger()
preds_ext = Logger()
subjects_int_alt = list(subjects_int_alt)
subjects_ext_alt = list(subjects_ext_alt)
batch_size=16
for t in outcomes:
    print("Outcome:", t)
    int_save_str = RESULTS_DIR + '10Fold_GRU_int_' + str(t)
    ext_save_str = RESULTS_DIR + '10Fold_GRU_ext_' + str(t)
    fold=1
    best_fold_F1, best_fold_rmse, best_fold_auc, best_fold_auprc = -np.Inf, -np.Inf, -np.Inf, -np.Inf
    best_fold_model_name = "N/A"
    best_fold = -1

    for train_subj_idx, test_subj_idx in sss.split(np.zeros(len(subjects_int_alt)), Y_int[t]):
        train_subj = list(np.array(subjects_int_alt)[train_subj_idx])
        test_subj  = list(np.array(subjects_int_alt)[test_subj_idx])
        # Internal: 10-fold cross validation for training split
        best_F1, best_rmse, best_auc, best_auprc = -np.Inf, -np.Inf, -np.Inf, -np.Inf
        best_hyperparams = None
        early_stop_frac = hyperparams_fixed['early_stop_frac']
        best_preds = []
        print("Evaluating GRU for Outcome: %s, Fold: %d"%(t,fold))

        
        set_primary_seeds(SEED)
        all_train_subjects, N = np.random.permutation(list(train_subj)), len(train_subj)
        N_early_stop        = int(len(all_train_subjects) * early_stop_frac)
        train_subjects      = all_train_subjects[:-N_early_stop]
        early_stop_subjects = all_train_subjects[-N_early_stop:]
        print("Train subjects length: ", len(train_subjects))
        print("ES/valid subjects length: ", len(early_stop_subjects))
        
        [(X_train_obs_tmp, X_train_early_stop_tmp, X_test_tmp), (Ys_train_obs, Ys_train_early_stop, Ys_test)] = [
            [df[df.index.get_level_values('subject_id').isin(s)] for s in (train_subjects, early_stop_subjects,  test_subj)] \
            for df in (X_int, Y_int)
        ]

        if t in ['mort_icu', 'los_3', 'mort_hosp', 'los_7']:
            output_last = True
        elif t=='los_icu':
            output_last = False
        else:
            print("invalid label for 'output_last' check")
        print("X_train_obs nan count: ", X_train_obs_tmp.isna().sum().sum())
        X_mean = np.nanmean(
            to_3D_tensor(
                X_train_obs_tmp.loc[:, pd.IndexSlice[:, 'mean']] * 
                np.where((X_train_obs_tmp.loc[:, pd.IndexSlice[:, 'mask']] == 1).values, 1, np.NaN)
            ),
            axis=0, keepdims=True
        ).transpose([0, 2, 1])
        
        X_mean = np.nan_to_num(X_mean, copy=True, nan=0, posinf=X_mean.max(), neginf=X_mean.min())

        base_params = {'X_mean': X_mean, 'output_last': True, 'input_size': X_mean.shape[2]}
        print("Total number of hyperparameters combos: ", len(hyperparams_list))
        for i, hyperparams in enumerate(hyperparams_list):
            model_name = "Fold%d_ParamSet%d"%(fold,i)
            print("Beginning Evaluation for: %s"%(model_name))

            train_dataloader      = prepare_dataloader(X_train_obs_tmp, Ys_train_obs[t], batch_size=batch_size)
            early_stop_dataloader = prepare_dataloader(X_train_early_stop_tmp, Ys_train_early_stop[t], batch_size=batch_size)
            test_dataloader        = prepare_dataloader(X_test_tmp, Ys_test[t], batch_size=batch_size)

            print("init dataloaders complete")
            batch_size = 16
            set_primary_seeds(SEED)
            model_hyperparams = copy.copy(base_params)
            model_hyperparams.update(
                {k: v for k, v in hyperparams.items() if k in ('cell_size', 'hidden_size', 'batch_size')}
            )
            model_hyperparams['batch_size'] = batch_size
            model = GRUD(**model_hyperparams)
            
            model_hyperparams.update(
                {k: v for k, v in hyperparams.items() if k in ('learning_rate', 'num_epochs', 'patience', 'early_stop_frac')}
            )
            #model = model.to(torch.device('cuda'))

            best_model, _ = Train_Model(
                model, train_dataloader, early_stop_dataloader,
                **{k: v for k, v in model_hyperparams.items() if k in (
                    'num_epochs', 'patience', 'learning_rate', 'batch_size'
                )}
            )


            
            if test_dataloader is not None:
                set_primary_seeds(SEED)
                probabilities_test, labels_test = predict_proba(best_model, test_dataloader)
                y_scores      = np.concatenate(probabilities_test)
                y_true        = np.concatenate(labels_test)
                subject_idx = list(range(0,len(y_scores)))
                print("y_true:", y_true.shape)
                print("y_scores:", y_scores.shape)
                
                print("Internal validation testing for our best model: %s, on target %s" % (model_name, t))
                if output_last:
                    y_pred = []
                    m = nn.Softmax(dim=1)
                    y_scores_sm = m(torch.from_numpy(y_scores))
                    y_pred = np.argmax(y_scores_sm, 1).numpy()
                    y_score = y_scores_sm.numpy()[:,1]
                    print("y_score:",y_score.shape)

                    auc = roc_auc_score(y_true, y_score)
                    auprc = average_precision_score(y_true, y_score)
                    acc   = accuracy_score(y_true, y_pred)
                    prec = precision_score(y_true, y_pred)
                    rec = recall_score(y_true, y_pred)
                    F1    = f1_score(y_true, y_pred)
                    print("auc->%f, auprc->%f, acc->%f, prec->%f, rec->%f, F1->%f" % (auc, auprc, acc, prec, rec, F1))
                    if auc > best_auc:
                        best_auc, best_hyperparams = auc, model_hyperparams
#                         print("New Best AUC within Fold (%d): %.2f @ hyperparams = %s" % (fold, 100*best_auc, repr((best_hyperparams))))
                        print("New Best AUC within Fold (%d): %.2f" % (fold, 100*best_auc))
                        # save our best model just in case we want it later
#                         torch.save(best_model.module.state_dict(), 'results/best_model.pt')
                        best_preds = y_score
                else:
                    mse = mean_squared_error(y_score, labels_test)
                    rmse = sqrt(mse)
                    mae = mean_absolute_error(y_score, labels_test)
                    pred_mean = np.mean(labels_test)
                    pred_std = np.std(labels_test)
                    label_mean = np.mean(y_score)
                    label_std = np.std(y_score)
                    print("mse->%f, rmse->%f, MAE->%f, (pred mean, pred_std)->(%f,%f), (label_mean, label_std)->(%f,%f)" % (mse, rmse, mae, pred_mean, pred_std, label_mean, label_std))
#                     results[model_name][t][n] = None, model_hyperparams, mse, rmse
                    if rmse > best_rmse:
                        best_rmse, best_hyperparams = rmse, model_hyperparams
#                         print("New Best RMSE within Fold (%d): %.2f @ hyperparams = %s" % (fold, best_rmse, repr((best_hyperparams))))
                        print("New Best RMSE within Fold (%d): %.2f" % (fold, best_rmse))
                        # save our best model just in case we want it later
#                         torch.save(best_model.module.state_dict(), 'results/best_model.pt')
                        best_preds = y_score
        if output_last:
            if best_auc > best_fold_auc:
                best_fold_auc, best_fold_hyperparams = best_auc, best_hyperparams
                best_fold_model_name = model_name
#                 print("New Best AUC across all folds: %.2f @ hyperparams = %s" % (100*best_fold_auc, repr((best_fold_hyperparams))))
                print("New Best AUC across all folds: %.2f" % (100*best_fold_auc))
                # save our best model just in case we want it later
#                 torch.save(best_model.module.state_dict(), 'results/best_overall_model.pt')
                best_fold = fold
        else:
            if best_rmse > best_fold_rmse:
                best_fold_rmse, best_fold_hyperparams = best_rmse, best_hyperparams
                best_fold_model_name = model_name
                print("New Best RMSE across all folds: %.2f "% (best_fold_rmse))
#                 print("New Best RMSE across all folds: %.2f @ hyperparams = %s" % (best_fold_rmse, repr((best_fold_hyperparams))))
                # save our best model just in case we want it later
    #                 torch.save(best_model.module.state_dict(), 'results/best_overall_model.pt')
                best_fold = fold
        assert len(y_true) == len(best_preds), "Labels (%d) and preds lengths (%d) dont match"%(len(y_true),len(best_preds))
        print("Appending best predictions from Fold %d. Length: %d"%(fold, len(best_preds)))
        preds_int.append_logger(subject_idx, y_true, best_preds, label=t, fold=fold)
        fold+=1
        print("Best hyperparams set: %d / %d (hyperparams = %s)" % (i+1, len(hyperparams_list), repr((hyperparams)))) 
        print()
        
    # External: train/validation and testing on all data
    print("Begin external testing")
    print("X internal shape: (%d,%d)"%(X_int.shape[0]/24,X_int.shape[1]))
    print("X external shape: (%d,%d)"%(X_ext.shape[0]/24,X_ext.shape[1]))
    print("Y internal shape: (%d,%d)"%(Y_int.shape[0],Y_int.shape[1]))
    print("Y external shape: (%d,%d)"%(Y_ext.shape[0],Y_ext.shape[1]))

    train_subj = list(np.array(subjects_int_alt))
    test_subj  = list(np.array(subjects_ext_alt))
    print("External evaluation for GRU-D for Outcome: %s"%(t))
    set_primary_seeds(SEED)
    all_train_subjects, N = np.random.permutation(list(train_subj)) ,len(train_subj)
    all_test_subjects = np.random.permutation(list(test_subj))
    N_early_stop        = int(len(all_train_subjects) * early_stop_frac)
    train_subjects      = all_train_subjects[:-N_early_stop]
    early_stop_subjects = all_train_subjects[-N_early_stop:]
    print("Train subjects length: ", len(train_subjects))
    print("ES/valid subjects length: ", len(early_stop_subjects))  
    
    [(X_train_obs_tmp, X_train_early_stop_tmp), (Ys_train_obs, Ys_train_early_stop)] = [
        [df[df.index.get_level_values('subject_id').isin(s)] for s in (train_subjects, early_stop_subjects)] \
        for df in (X_int, Y_int)
    ]

    X_test_tmp = X_ext.copy()
    Ys_test = Y_ext.copy()
#     [(X_test_tmp), (Ys_test)] = [
#         [df[df.index.get_level_values('subject_id').isin([s])] for s in (list(all_test_subjects))] \
#         for df in (X_ext, Y_ext)
#     ]

    model_hyperparams = copy.copy(hyperparams_fixed)
    model_hyperparams.update(
        {k: v for k, v in best_fold_hyperparams.items()}
    )
    batch_size = 16
    
    # MULTIVARIATE DL
    train_dataloader      = prepare_dataloader(X_train_obs_tmp, Ys_train_obs[t], batch_size=batch_size)
    early_stop_dataloader = prepare_dataloader(X_train_early_stop_tmp, Ys_train_early_stop[t], batch_size=batch_size)
    test_dataloader        = prepare_dataloader(X_test_tmp, Ys_test[t], batch_size=batch_size)
    print("init dataloaders complete")
    set_primary_seeds(SEED)
    model_hyperparams = copy.copy(base_params)
    model_hyperparams.update(
        {k: v for k, v in best_fold_hyperparams.items() if k in ('cell_size', 'hidden_size', 'batch_size')}
    )
    model_hyperparams['batch_size'] = batch_size
    model = GRUD(**model_hyperparams)

    model_hyperparams.update(
        {k: v for k, v in best_fold_hyperparams.items() if k in ('learning_rate', 'num_epochs', 'patience', 'early_stop_frac')}
    )
    #model = model.to(torch.device('cuda'))
    print("Hyperparam: ", model_hyperparams)
    best_model, _ = Train_Model(
        model, train_dataloader, early_stop_dataloader,
        **{k: v for k, v in model_hyperparams.items() if k in (
            'num_epochs', 'patience', 'learning_rate', 'batch_size'
        )}
    )

    if test_dataloader is not None:
        set_primary_seeds(SEED)
        probabilities_test, labels_test = predict_proba(best_model, test_dataloader)
        y_scores      = np.concatenate(probabilities_test)
        y_true        = np.concatenate(labels_test)
        subject_idx = list(range(0,len(y_scores)))
        print("y_true:", y_true.shape)
        print("y_scores:", y_scores.shape)
        print("External validation testing for our best model: %s, on target %s" % (model_name, t))
        if output_last:
            y_pred = []
            m = nn.Softmax(dim=1)
            y_scores_sm = m(torch.from_numpy(y_scores))
            y_pred = np.argmax(y_scores_sm, 1).numpy()
            y_score = y_scores_sm.numpy()[:,1]
            print("y_score:",y_score.shape)
            auc = roc_auc_score(y_true, y_score)
            auprc = average_precision_score(y_true, y_score)
            acc   = accuracy_score(y_true, y_pred)
            prec = precision_score(y_true, y_pred)
            rec = recall_score(y_true, y_pred)
            F1    = f1_score(y_true, y_pred)
            print("auc->%f, auprc->%f, acc->%f, prec->%f, rec->%f, F1->%f" % (auc, auprc, acc, prec, rec, F1))
        else:
            mse = mean_squared_error(y_score, labels_test)
            rmse = sqrt(mse)
            mae = mean_absolute_error(y_score, labels_test)
            pred_mean = np.mean(labels_test)
            pred_std = np.std(labels_test)
            label_mean = np.mean(y_score)
            label_std = np.std(y_score)
            print("mse->%f, rmse->%f, MAE->%f, (pred mean, pred_std)->(%f,%f), (label_mean, label_std)->(%f,%f)" % (mse, rmse, mae, pred_mean, pred_std, label_mean, label_std))
    assert len(y_true) == len(y_score), "Labels (%d) and y_score lengths (%d) dont match"%(len(y_true),len(y_score))
    preds_ext.append_logger(subject_idx, y_true, y_score, label=t, fold=fold)
    print()
preds_int.df = preds_df_to_int(preds_int.df)
preds_int.df.to_csv(RESULTS_DIR+"GRU_internal_test_preds.csv")
preds_ext.df = preds_df_to_int(preds_ext.df)
preds_ext.df.to_csv(RESULTS_DIR+"GRU_external_test_preds.csv")

In [None]:
'''
Persist the stdout captured from the sweep cell (`cap`) to a dedicated log
file, then drop `cap` to release the captured text.
'''
write_file = RESULTS_DIR + 'GRU_main_output_hyperparam_sweep_val.txt'
with open(write_file, mode='w') as f:
    print(cap.stdout, end='', file=f)
del cap