In [1]:
'''
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/598-DLH/tensor_exports
%ls
'''

"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n%cd /content/drive/MyDrive/598-DLH/tensor_exports\n%ls\n"

In [2]:
import torch
import pandas as pd
from datetime import datetime
from datetime import timedelta
from math import ceil
import gzip

In [3]:
final_tensor = torch.load('../tensor_exports/data_tensor_chart_demog.pt')

In [4]:
print(final_tensor.shape)
BATCH_SIZE=100
FEATURE_SIZE = final_tensor.shape[-1]
LEARNING_RATE = 0.001

torch.Size([1541, 5, 720, 58])


# Try to get the hadm_idx_to_hadm_id first
so that we can get the is_sepsis label for that hadm_id

In [5]:
'''
%cd ../mimic
%ls
'''

'\n%cd ../mimic\n%ls\n'

In [6]:
final_icustays_details = pd.read_pickle('final_icustays_details.pkl')
def get_idx_to_hadm_index_dict(values_pdSeries):
    return {i:v for i, v in enumerate(list(values_pdSeries.unique()))}
    
hadm_idx_id_dict = get_idx_to_hadm_index_dict(final_icustays_details['hadm_id'])
hadm_sepsis = pd.read_pickle('hadm_sepsis.pkl')

In [7]:
hadm_id_isSepsis_dict = dict(zip(hadm_sepsis.hadm_id, hadm_sepsis.is_sepsis))

# DataLoadder

In [8]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, data_tensor, hadm_id_isSepsis_dict, hadm_idx_id_dict):
        self.data_tensor = data_tensor
        self.hadm_id_isSepsis_dict = hadm_id_isSepsis_dict
        self.hadm_idx_id_dict = hadm_idx_id_dict
        self._get_available_icu_stays()
        return

    def __len__(self):
        return len(self.available_icu_stays)

    def _get_available_icu_stays(self): ## ICU stays that are not padded
        self.available_icu_stays = []
        for hadm_idx in range(self.data_tensor.shape[0]):
            for icu_idx in range(self.data_tensor.shape[1]):
                if self.data_tensor[hadm_idx, icu_idx, 0, 2] != 0: ## If the first ICU stay's length of stay (los) is zero, then it's padded
                    self.available_icu_stays.append((hadm_idx, icu_idx))

    def _get_sofa_score(self, index, value):
        if index == 57: ## Platelets
            if value >= 150:
                return 0
            elif value >= 100:
                return 1
            else:
                return 2
        elif index == 34: ## creatinie  # => seems to over estimate
            if value < 1.2:
                return 0
            elif value < 2:
                return 1
            else:
                return 2
        elif index == 20: ## FiO2
            return 2 if value >=53.3 else 0
        else:
            return 0
        
       

    def _get_sepsis_onset_hour_or_last_hour(self, x):
        hour_idx = 0
        for hour in x:
            SOFA_score = self._get_sofa_score(57, hour[57]) + self._get_sofa_score(20, hour[20]) + self._get_sofa_score(34, hour[34])
            if SOFA_score >= 2:
                return hour_idx, torch.tensor(1, dtype=torch.float32)
            hour_idx += 1
        return int(x[0][2]*24), torch.tensor(0, dtype=torch.float32) ## Get the first hour's 3rd feature, which is length of stay of this icu
  
    def __getitem__(self, index):
        target_hadm_idx, target_icu_idx = self.available_icu_stays[index]
        x = self.data_tensor[target_hadm_idx, target_icu_idx] 

        l = int(x[0][2]*24) ## Get the first hour's 3rd feature, which is length of stay of this icu
        target_hadm_id = self.hadm_idx_id_dict[target_hadm_idx]
        if self.hadm_id_isSepsis_dict[target_hadm_id] == False:
            y = torch.tensor(0, dtype=torch.float32)
        else:
            l, y = self._get_sepsis_onset_hour_or_last_hour(x)
        return x, y, l

dataset = CustomDataset(final_tensor, hadm_id_isSepsis_dict, hadm_idx_id_dict)

In [9]:
from torch.utils.data.dataset import random_split

split = int(len(dataset) * 0.8)
lengths = [split, len(dataset) - split]
train_dataset, test_dataset = random_split(dataset, lengths)
print("length of train dataset:", len(train_dataset))
print("length of test dataset:", len(test_dataset))

length of train dataset: 1434
length of test dataset: 359


In [10]:
def collate_fn(data):
    sequences, labels, lengths = zip(*data)
    x = torch.stack(sequences, dim=0)
    y = torch.tensor(labels, dtype=torch.float)  
    l = torch.tensor(lengths, dtype=torch.long)
    return x, y, l
  
def get_last_visit(hidden_states, length):
    return hidden_states[range(hidden_states.shape[0]), length - 1, :]

In [11]:

## test
from torch.utils.data import DataLoader
loader = DataLoader(train_dataset, batch_size=2, collate_fn=collate_fn)
loader_iter = iter(loader)
x, y, l = next(loader_iter)

print(x.shape)
print(y.shape)
print(l)


torch.Size([2, 720, 58])
torch.Size([2])
tensor([24, 77])


In [12]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [13]:
def get_last_visit(hidden_states, length):
    return hidden_states[range(hidden_states.shape[0]), length - 1, :]

In [14]:
'''
## TEST 
import random
max_num_visits = 10
batch_size = 16
hidden_dim = 100

hidden_states = torch.randn((batch_size, max_num_visits, hidden_dim))
lengths = torch.tensor([random.randint(1, max_num_visits) for _ in range(batch_size)])
out = get_last_visit(hidden_states, lengths)
'''

'\n## TEST \nimport random\nmax_num_visits = 10\nbatch_size = 16\nhidden_dim = 100\n\nhidden_states = torch.randn((batch_size, max_num_visits, hidden_dim))\nlengths = torch.tensor([random.randint(1, max_num_visits) for _ in range(batch_size)])\nout = get_last_visit(hidden_states, lengths)\n'

In [15]:


import torch.nn as nn
import torch.nn.functional as F
class RNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.rnn = nn.LSTM(input_size = FEATURE_SIZE, hidden_size = 800, batch_first=True)
        self.fc = nn.Linear(800, 2)
        self.softmax = nn.Softmax(dim=1)
  
    def forward(self, x, length):
        batch_size = x.shape[0]
        output, _ = self.rnn(x)
        true_h_n = get_last_visit(output, length)
        logits =torch.relu(self.fc(true_h_n))
        probs = self.softmax(logits)
        '''
        pred = []
        for res in probs:
          pred.append(res)
        '''
        return probs

model = RNN()
model

RNN(
  (rnn): LSTM(58, 800, batch_first=True)
  (fc): Linear(in_features=800, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
)

In [16]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [17]:
from sklearn.metrics import *

#input: Y_score,Y_pred,Y_true
#output: accuracy, auc, precision, recall, f1-score
def classification_metrics(Y_score, Y_pred, Y_true):
    auc, precision, recall =   roc_auc_score(Y_true, Y_score), \
                               precision_score(Y_true, Y_pred), \
                                recall_score(Y_true, Y_pred)
    return auc, precision, recall


#input: model, loader
def evaluate(model, loader):
    model.eval()
    all_y_true = torch.LongTensor()
    all_y_pred = torch.LongTensor()
    all_y_score = torch.FloatTensor()
    
    for x, y, l in loader:
        # pass the input through the model
        y_hat = model(x, l)
        y_hat = y_hat.select(dim=1, index=0)
        # convert shape from [batch size, 1] to [batch size]
        y_pred = (y_hat > 0.5).type(torch.float)
        
        y_hat = torch.nan_to_num(y_hat)

        all_y_true = torch.cat((all_y_true, y.to('cpu')), dim=0)
        all_y_pred = torch.cat((all_y_pred,  y_pred.to('cpu')), dim=0)
        all_y_score = torch.cat((all_y_score,  y_hat.to('cpu')), dim=0)
    
    all_y_true = torch.cat((torch.tensor([1]), all_y_true.to('cpu')), dim=0) ## According to the paper, it up-scaling the septic cases
    all_y_pred = torch.cat((torch.tensor([1]), all_y_pred.to('cpu')), dim=0)
    all_y_score = torch.cat((torch.tensor([1]), all_y_score.to('cpu')), dim=0)

    auc, precision, recall = classification_metrics(all_y_score.detach().numpy(), 
                                                             all_y_pred.detach().numpy(), 
                                                             all_y_true.detach().numpy())
    print(f"auc: {auc:.3f}, precision: {precision:.3f}, recall: {recall:.3f}")
    return

In [18]:
train_init = evaluate(model, train_loader)

auc: 0.547, precision: 0.375, recall: 0.078


In [None]:
n_epochs = 50

# prep model for training
for epoch in range(n_epochs):
    model.train()
    train_loss = 0
    for x, y, l in train_loader:
        x = torch.nan_to_num(x)
        """ Step 1. clear gradients """
        optimizer.zero_grad()
        """  Step 2. perform forward pass using `model`, save the output to y_hat """
        y_hat = model(x, l)
        """ Step 3. calculate the loss using `criterion`, save the output to loss. """
        '''
        y_list = list(y)
        for i in range(len(y_list)):
            if y_list[i] > 0.5:
                y_list[i] = [float(0), float(1)]
            else:
                y_list[i] = [float(1), float(0)]
        y_true = torch.tensor(y_list,dtype=torch.float)
        '''
        
        #y_hat = torch.nan_to_num(y_hat)
        y_hat = torch.select(y_hat, index=0, dim=1)
        
        loss = criterion(y_hat, y)
        """ Step 4. backward pass """
        loss.backward()
        """ Step 5. optimization """
        optimizer.step()
        """ Step 6. record loss """
        train_loss += loss.item()
        
    train_loss = train_loss / len(train_loader)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))
    evaluate(model, train_loader)
    evaluate(model, test_loader)

Epoch: 1 	Training Loss: 0.503746
auc: 0.731, precision: 0.733, recall: 0.036
auc: 0.725, precision: 0.818, recall: 0.122
Epoch: 2 	Training Loss: 0.490070
auc: 0.750, precision: 0.706, recall: 0.039
auc: 0.741, precision: 0.778, recall: 0.095
Epoch: 3 	Training Loss: 0.465966
auc: 0.779, precision: 0.800, recall: 0.039
auc: 0.770, precision: 0.750, recall: 0.081
Epoch: 4 	Training Loss: 0.444488
auc: 0.806, precision: 0.812, recall: 0.042
auc: 0.798, precision: 0.778, recall: 0.095
Epoch: 5 	Training Loss: 0.442244
auc: 0.832, precision: 0.724, recall: 0.068
auc: 0.820, precision: 0.818, recall: 0.122
Epoch: 6 	Training Loss: 0.436112
auc: 0.834, precision: 0.792, recall: 0.062
auc: 0.825, precision: 0.778, recall: 0.095
Epoch: 7 	Training Loss: 0.425274
auc: 0.850, precision: 0.800, recall: 0.091
auc: 0.848, precision: 0.800, recall: 0.162
Epoch: 8 	Training Loss: 0.432507
auc: 0.845, precision: 0.828, recall: 0.078
auc: 0.841, precision: 0.778, recall: 0.095
Epoch: 9 	Training Loss: