In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/598-DLH/Draft
%ls

ModuleNotFoundError: No module named 'google.colab'

In [None]:
import torch
import pandas as pd
from datetime import datetime
from datetime import timedelta
from math import ceil
import gzip

In [None]:
# Change the working directory to the MIMIC-III folder (relative to the current
# directory) and list its contents.
%cd mimic-iii-clinical-database-1.4/
%ls

/mnt/d/CS598DLH/mimic-iii-clinical-database-1.4
[0m[01;32mADMISSIONS.csv.gz[0m*          [01;32mPATIENTS.csv.gz[0m*
[01;32mCALLOUT.csv.gz[0m*             [01;32mPRESCRIPTIONS.csv.gz[0m*
[01;32mCAREGIVERS.csv.gz[0m*          [01;32mPROCEDUREEVENTS_MV.csv.gz[0m*
[01;32mCHARTEVENTS.csv.gz[0m*         [01;32mPROCEDURES_ICD.csv.gz[0m*
[01;32mCPTEVENTS.csv.gz[0m*           [01;32mREADME.md[0m*
[01;32mDATETIMEEVENTS.csv.gz[0m*      [01;32mSERVICES.csv.gz[0m*
[01;32mDIAGNOSES_ICD.csv.gz[0m*       [01;32mSHA256SUMS.txt[0m*
[01;32mDRGCODES.csv.gz[0m*            [01;32mTRANSFERS.csv.gz[0m*
[01;32mD_CPT.csv.gz[0m*               [01;32mchart.pkl[0m*
[01;32mD_ICD_DIAGNOSES.csv.gz[0m*     [01;32mchart_tensor_filled_final.pt.gz[0m*
[01;32mD_ICD_PROCEDURES.csv.gz[0m*    [01;32mchecksum_md5_unzipped.txt[0m*
[01;32mD_ITEMS.csv[0m*                [01;32mchecksum_md5_zipped.txt[0m*
[01;32mD_ITEMS.csv.gz[0m*             [01;32mcleaned_text.pkl[0m*
[01;32

In [None]:
# Change into the folder holding the exported tensors; the relative file names
# used by later cells are resolved against the current working directory.
%cd tensor_exports

/content/drive/.shortcut-targets-by-id/14VE6jxu6gzBuYO9WY79TIloM87C-NZXP/598-DLH/Draft/tensor_exports


In [None]:
# Load the feature tensor from the current working directory.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
final_tensor = torch.load('data_tensor_final.pt')

In [None]:
# Show the tensor dimensions; later cells index it as [hadm_idx, icu_idx, hour, feature].
print(final_tensor.shape)

torch.Size([1541, 5, 720, 826])


In [None]:
# Hyperparameters shared by the model, the data loaders and the optimizer.
FEATURE_SIZE = int(final_tensor.size(3))  # size of the last (feature) axis
BATCH_SIZE = 100
LEARNING_RATE = 1e-2

# Build the hadm_idx-to-hadm_id mapping first
so that we can look up the is_sepsis label for each hadm_id

In [None]:
'''
%cd ../mimic
%ls
'''

'\n%cd ../mimic\n%ls\n'

In [None]:
# Load the ICU-stay details table; its 'hadm_id' column is used below to map
# tensor row indices back to admission ids.
# NOTE(review): read_pickle unpickles the file, so only load trusted files.
final_icustays_details = pd.read_pickle('final_icustays_details.pkl')
def get_idx_to_hadm_index_dict(values_pdSeries):
    """Number the distinct values of a Series in order of first appearance.

    Args:
        values_pdSeries: pandas Series (here: a column of admission ids).

    Returns:
        dict mapping 0, 1, 2, ... to the distinct values of the Series, in the
        order ``Series.unique()`` yields them.
    """
    distinct_values = values_pdSeries.unique()
    return dict(enumerate(distinct_values))
    
# Position -> hadm_id lookup, numbered by first appearance in the details table.
# NOTE(review): assumes this order matches the first axis of final_tensor — confirm.
hadm_idx_id_dict = get_idx_to_hadm_index_dict(final_icustays_details['hadm_id'])
# Table with 'hadm_id' and 'is_sepsis' columns (both are read in the next cell).
hadm_sepsis = pd.read_pickle('hadm_sepsis.pkl')

In [None]:
# hadm_id -> is_sepsis lookup; if a hadm_id appears more than once, the last row wins.
hadm_id_isSepsis_dict = dict(zip(hadm_sepsis.hadm_id, hadm_sepsis.is_sepsis))

# DataLoader

In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """One sample per non-padded ICU stay of a padded admissions tensor.

    ``data_tensor`` is indexed as ``[hadm_idx, icu_idx, hour, feature]``.
    Every item is a triple ``(x, y, l)``:

    * ``x`` - the ``[hour, feature]`` slice of one ICU stay,
    * ``y`` - float32 scalar tensor, 1 when the admission is flagged septic
      and a SOFA-based onset hour is found in ``x``, otherwise 0,
    * ``l`` - number of leading hours of ``x`` to use, always within
      ``[1, number of hours in x]`` so that ``l - 1`` is a valid hour index.

    Args:
        data_tensor: feature tensor as described above.
        hadm_id_isSepsis_dict: hadm_id -> sepsis flag.
        hadm_idx_id_dict: position along the first tensor axis -> hadm_id.
    """

    # Feature columns read by this class.
    LOS_FEATURE_IDX = 2  # ICU length of stay; multiplied by 24 to get hours (presumably days)
    FIO2_FEATURE_IDX = 20
    CREATININE_FEATURE_IDX = 34
    PLATELETS_FEATURE_IDX = 57
    # Summed sub-score at which an hour counts as the sepsis onset.
    SOFA_ONSET_THRESHOLD = 2

    def __init__(self, data_tensor, hadm_id_isSepsis_dict, hadm_idx_id_dict):
        self.data_tensor = data_tensor
        self.hadm_id_isSepsis_dict = hadm_id_isSepsis_dict
        self.hadm_idx_id_dict = hadm_idx_id_dict
        self._get_available_icu_stays()

    def __len__(self):
        return len(self.available_icu_stays)

    def _get_available_icu_stays(self):
        """Collect the ``(hadm_idx, icu_idx)`` pairs of stays that are not padding.

        A stay counts as padding when the length-of-stay feature of its first
        hour is zero. Pairs are stored in row-major order (hadm_idx first).
        """
        stay_present = self.data_tensor[:, :, 0, self.LOS_FEATURE_IDX] != 0
        # nonzero() returns indices sorted lexicographically, which matches a
        # nested loop over hadm_idx then icu_idx.
        self.available_icu_stays = [tuple(pair) for pair in stay_present.nonzero().tolist()]

    def _get_sofa_score(self, index, value):
        """Return the simplified SOFA sub-score of one measurement.

        Args:
            index: feature column the value was read from.
            value: the measurement (Python number or 0-dim tensor).

        Returns:
            0, 1 or 2. Columns other than platelets, creatinine and FiO2
            score 0. A NaN (missing) measurement also scores 0; without this
            guard every comparison is False and the value would fall through
            to the worst score.
        """
        if value != value:  # NaN is the only value that differs from itself
            return 0
        if index == self.PLATELETS_FEATURE_IDX:
            if value >= 150:
                return 0
            elif value >= 100:
                return 1
            else:
                return 2
        elif index == self.CREATININE_FEATURE_IDX:  # original note: seems to over estimate
            if value < 1.2:
                return 0
            elif value < 2:
                return 1
            else:
                return 2
        elif index == self.FIO2_FEATURE_IDX:
            return 2 if value >= 53.3 else 0
        else:
            return 0

    def _get_stay_length_hours(self, x):
        """Length of stay in hours, read from the first hour of ``x`` (truncated to int)."""
        return int(x[0][self.LOS_FEATURE_IDX] * 24)

    def _get_sepsis_onset_hour_or_last_hour(self, x):
        """Find the first hour whose summed SOFA sub-scores reach the threshold.

        Returns:
            ``(hour_idx, tensor(1.))`` for the first qualifying hour, or
            ``(length of stay in hours, tensor(0.))`` when no hour qualifies.
        """
        scored_columns = (
            self.PLATELETS_FEATURE_IDX,
            self.FIO2_FEATURE_IDX,
            self.CREATININE_FEATURE_IDX,
        )
        # NOTE(review): every hour of x is scanned, including any padded hours
        # after the end of the stay; confirm that padding cannot reach the threshold.
        for hour_idx, hour in enumerate(x):
            sofa_score = sum(self._get_sofa_score(col, hour[col]) for col in scored_columns)
            if sofa_score >= self.SOFA_ONSET_THRESHOLD:
                return hour_idx, torch.tensor(1, dtype=torch.float32)
        return self._get_stay_length_hours(x), torch.tensor(0, dtype=torch.float32)

    def __getitem__(self, index):
        target_hadm_idx, target_icu_idx = self.available_icu_stays[index]
        x = self.data_tensor[target_hadm_idx, target_icu_idx]

        target_hadm_id = self.hadm_idx_id_dict[target_hadm_idx]
        if not self.hadm_id_isSepsis_dict[target_hadm_id]:
            l = self._get_stay_length_hours(x)
            y = torch.tensor(0, dtype=torch.float32)
        else:
            l, y = self._get_sepsis_onset_hour_or_last_hour(x)

        # Consumers index hour ``l - 1``. An onset at hour 0 would give l == 0
        # (index -1 wraps to the last, padded hour) and a stay longer than the
        # tensor would index past its end, so keep l inside [1, number of hours].
        l = max(1, min(int(l), x.shape[0]))
        return x, y, l

# Only the first 200 entries along the first (admission) axis are used here; cast to float32.
dataset = CustomDataset(final_tensor[:200,:,:,:].float(), hadm_id_isSepsis_dict, hadm_idx_id_dict)

In [None]:
from torch.utils.data.dataset import random_split

# 80/20 train/test split over ICU stays. The seeded generator keeps the split
# identical across kernel restarts, so metrics are comparable between runs.
# NOTE(review): the split is per ICU stay, so two stays of the same admission
# can land on different sides of the split — confirm this is acceptable.
split = int(len(dataset) * 0.8)
lengths = [split, len(dataset) - split]
train_dataset, test_dataset = random_split(
    dataset, lengths, generator=torch.Generator().manual_seed(42)
)
print("length of train dataset:", len(train_dataset))
print("length of test dataset:", len(test_dataset))

length of train dataset: 194
length of test dataset: 49


In [None]:
def collate_fn(data):
    """Merge a list of ``(sequence, label, length)`` samples into batch tensors.

    Args:
        data: iterable of ``(sequence, label, length)`` triples; all sequences
            must share one shape so they can be stacked.

    Returns:
        ``(x, y, l)``: the sequences stacked along a new first axis, the labels
        as a float tensor and the lengths as a long tensor.
    """
    stays, targets, valid_hours = zip(*data)
    return (
        torch.stack(stays, dim=0),
        torch.tensor(targets, dtype=torch.float),
        torch.tensor(valid_hours, dtype=torch.long),
    )
  
def get_last_visit(hidden_states, length):
    """Select, for each sequence in the batch, the hidden state of its last valid step.

    Args:
        hidden_states: tensor indexed as ``[batch, step, hidden]``.
        length: integer tensor with one entry per sequence giving the number
            of valid steps.

    Returns:
        Tensor indexed as ``[batch, hidden]`` holding ``hidden_states[i, length[i] - 1]``.
        The step index is clamped to the valid range: a length of 0 selects
        step 0 instead of wrapping around to the final (padded) step, and a
        length beyond the sequence selects the final step instead of raising.
    """
    final_step = hidden_states.shape[1] - 1
    last_step = (length - 1).clamp(min=0, max=final_step)
    batch_index = torch.arange(hidden_states.shape[0], device=hidden_states.device)
    return hidden_states[batch_index, last_step, :]

In [None]:

## test
from torch.utils.data import DataLoader
# Smoke test: pull one batch of two samples through collate_fn and inspect
# the batch shapes and the per-sample lengths.
loader = DataLoader(train_dataset, batch_size=2, collate_fn=collate_fn)
loader_iter = iter(loader)
x, y, l = next(loader_iter)

print(x.shape)
print(y.shape)
print(l)


torch.Size([2, 720, 826])
torch.Size([2])
tensor([306,  54])


In [None]:
from torch.utils.data import DataLoader
# Training batches are reshuffled every epoch; test batches keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [None]:
def get_last_visit(hidden_states, length):
    """Select, for each sequence in the batch, the hidden state of its last valid step.

    Args:
        hidden_states: tensor indexed as ``[batch, step, hidden]``.
        length: integer tensor with one entry per sequence giving the number
            of valid steps.

    Returns:
        Tensor indexed as ``[batch, hidden]`` holding ``hidden_states[i, length[i] - 1]``.
        The step index is clamped to the valid range: a length of 0 selects
        step 0 instead of wrapping around to the final (padded) step, and a
        length beyond the sequence selects the final step instead of raising.
    """
    final_step = hidden_states.shape[1] - 1
    last_step = (length - 1).clamp(min=0, max=final_step)
    batch_index = torch.arange(hidden_states.shape[0], device=hidden_states.device)
    return hidden_states[batch_index, last_step, :]

In [None]:
'''
## TEST 
import random
max_num_visits = 10
batch_size = 16
hidden_dim = 100

hidden_states = torch.randn((batch_size, max_num_visits, hidden_dim))
lengths = torch.tensor([random.randint(1, max_num_visits) for _ in range(batch_size)])
out = get_last_visit(hidden_states, lengths)
'''

'\n## TEST \nimport random\nmax_num_visits = 10\nbatch_size = 16\nhidden_dim = 100\n\nhidden_states = torch.randn((batch_size, max_num_visits, hidden_dim))\nlengths = torch.tensor([random.randint(1, max_num_visits) for _ in range(batch_size)])\nout = get_last_visit(hidden_states, lengths)\n'

In [None]:


import torch.nn as nn
import torch.nn.functional as F
class RNN(nn.Module):
    """LSTM classifier that reads the hidden state at each sequence's last valid step.

    Args:
        input_size: number of features per time step; defaults to the
            notebook-level ``FEATURE_SIZE``.
        hidden_size: width of the LSTM hidden state.
        num_classes: number of output classes.
    """

    def __init__(self, input_size=None, hidden_size=800, num_classes=2):
        super().__init__()
        if input_size is None:
            input_size = FEATURE_SIZE
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, length):
        """Return class probabilities for a batch of sequences.

        Args:
            x: input indexed as ``[batch, step, feature]``.
            length: number of valid steps per sequence, as accepted by
                ``get_last_visit``.

        Returns:
            Tensor indexed as ``[batch, class]``; each row sums to 1.
        """
        output, _ = self.rnn(x)
        true_h_n = get_last_visit(output, length)
        # The linear output feeds softmax directly. A ReLU in between clips
        # negative logits to zero, which blocks their gradient and can leave
        # the network stuck predicting the same probability for every sample.
        logits = self.fc(true_h_n)
        return self.softmax(logits)

# Instantiate the classifier; the bare name on the last line displays its layer summary.
model = RNN()
model

RNN(
  (rnn): LSTM(826, 800, batch_first=True)
  (fc): Linear(in_features=800, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
)

In [None]:
# BCELoss takes probabilities (not logits); the training loop feeds it column 0
# of the model's softmax output.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
from sklearn.metrics import *

#input: Y_score, Y_pred, Y_true
#output: auc, precision, recall
def classification_metrics(Y_score, Y_pred, Y_true):
    """Compute ROC AUC, precision and recall.

    Args:
        Y_score: continuous scores, used for the AUC.
        Y_pred: hard predictions, used for precision and recall.
        Y_true: ground-truth labels.

    Returns:
        Tuple ``(auc, precision, recall)``.
    """
    auc = roc_auc_score(Y_true, Y_score)
    precision = precision_score(Y_true, Y_pred)
    recall = recall_score(Y_true, Y_pred)
    return auc, precision, recall


#input: model, loader
#output: (auc, precision, recall); the same values are also printed
def evaluate(model, loader):
    """Run ``model`` over ``loader`` and report AUC, precision and recall.

    Column 0 of the model output is used as the score and thresholded at 0.5
    to get hard predictions, matching the column the training loop optimises.

    Args:
        model: module called as ``model(x, l)`` that returns per-class probabilities.
        loader: iterable yielding ``(x, y, l)`` batches.

    Returns:
        Tuple ``(auc, precision, recall)``.
    """
    model.eval()

    # NOTE(review): one synthetic sample with label, prediction and score all
    # equal to 1 is prepended, as in the original code ("According to the
    # paper, it up-scaling the septic cases"). It guarantees a positive is
    # present, but it also counts as a true positive in every metric — confirm
    # this is intended before reporting these numbers.
    true_chunks = [torch.tensor([1], dtype=torch.long)]
    pred_chunks = [torch.tensor([1], dtype=torch.long)]
    score_chunks = [torch.tensor([1.0])]

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for x, y, l in loader:
            # Sanitise inputs exactly as the training loop does; otherwise NaN
            # features propagate into NaN scores.
            x = torch.nan_to_num(x)
            y_hat = model(x, l).select(dim=1, index=0)
            y_hat = torch.nan_to_num(y_hat)
            y_pred = (y_hat > 0.5).long()

            true_chunks.append(y.to('cpu').long())
            pred_chunks.append(y_pred.to('cpu'))
            score_chunks.append(y_hat.to('cpu').float())

    all_y_true = torch.cat(true_chunks, dim=0)
    all_y_pred = torch.cat(pred_chunks, dim=0)
    all_y_score = torch.cat(score_chunks, dim=0)

    auc, precision, recall = classification_metrics(all_y_score.numpy(),
                                                    all_y_pred.numpy(),
                                                    all_y_true.numpy())
    print(f"auc: {auc:.3f}, precision: {precision:.3f}, recall: {recall:.3f}")
    return auc, precision, recall

In [None]:
# Metrics on the training set before the training loop below has run
# (assuming the cells are executed top to bottom).
train_init = evaluate(model, train_loader)

In [None]:
n_epochs = 50
for epoch in range(n_epochs):
    # prep model for training
    model.train()
    train_loss = 0
    for x, y, l in train_loader:
        # Replace NaN/inf features so they cannot poison the loss.
        x = torch.nan_to_num(x)

        # Step 1. clear gradients
        optimizer.zero_grad()
        # Step 2. forward pass; keep column 0 of the class probabilities
        y_hat = torch.select(model(x, l), index=0, dim=1)
        # Step 3. calculate the loss using `criterion`
        loss = criterion(y_hat, y)
        # Step 4. backward pass
        loss.backward()
        # Step 5. optimization
        optimizer.step()
        # Step 6. record loss
        train_loss += loss.item()

    # Mean loss per batch for this epoch.
    train_loss = train_loss / len(train_loader)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))

    # Report train and test metrics every second epoch (epochs 1, 3, 5, ...).
    if epoch % 2 == 0:
        evaluate(model, train_loader)
        evaluate(model, test_loader)

Epoch: 1 	Training Loss: 0.647068
auc: 0.559, precision: 1.000, recall: 0.018
auc: 0.560, precision: 1.000, recall: 0.067
Epoch: 2 	Training Loss: 0.656496
Epoch: 3 	Training Loss: 0.606286
auc: 0.497, precision: 1.000, recall: 0.018
auc: 0.389, precision: 1.000, recall: 0.067
Epoch: 4 	Training Loss: 0.589493
Epoch: 5 	Training Loss: 0.599730
auc: 0.512, precision: 1.000, recall: 0.018
auc: 0.501, precision: 1.000, recall: 0.067
Epoch: 6 	Training Loss: 0.584686
Epoch: 7 	Training Loss: 0.585993
auc: 0.645, precision: 1.000, recall: 0.018
auc: 0.657, precision: 1.000, recall: 0.067
Epoch: 8 	Training Loss: 0.584771
Epoch: 9 	Training Loss: 0.585220
auc: 0.600, precision: 1.000, recall: 0.018
auc: 0.625, precision: 1.000, recall: 0.067
Epoch: 10 	Training Loss: 0.584723
Epoch: 11 	Training Loss: 0.585908
auc: 0.583, precision: 1.000, recall: 0.018
auc: 0.545, precision: 1.000, recall: 0.067
Epoch: 12 	Training Loss: 0.583198
Epoch: 13 	Training Loss: 0.584104
auc: 0.545, precision: 1.0