In [1]:
from __future__ import division
import argparse
import math
import numpy as np
import os

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import pickle

from sklearn.metrics import roc_auc_score, classification_report
from tqdm import tqdm

In [2]:
emb_path = 'E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/Output/'
data_path = 'E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/MIMIC_Processed/'

In [3]:
AE_emb_weights = torch.tensor(np.load(emb_path + 'AE_embedding_weights.npy',allow_pickle=True))
print(AE_emb_weights.size())

torch.Size([4428, 175])


In [4]:
n_epochs = 5
vocabsize = 175

vocabsize_icd = 942
vocabsize_meds = 3202
vocabsize_labs = 284


input_seqs_icd = pickle.load(open(data_path + 'MIMICIIIPROCESSED.3digitICD9.seqs', 'rb'))
input_seqs_meds = pickle.load(open(data_path + 'MIMICIIIPROCESSED.meds.seqs', 'rb'))
input_seqs_labs = pickle.load(open(data_path + 'MIMICIIIPROCESSED.abnlabs.seqs', 'rb'))

labels = pickle.load(open(data_path +'MIMICIIIPROCESSED.morts', 'rb'))

### Dataset built-up
1. Convert all input_seqs to one-hot vectors [num_visits, batchsize=1, vocabsize of the seq]
2. Concat all input_seqs of features by vocabsize dimension
3. Multiply AE embedding weights to the concatenated input_seqs for each patient [num_visits, batchsize=1, vocabsize=175]

In [5]:
#input_seqs is a list of embeddings weighted by the AE_emb_weights. 
#The len of list = number of patients, each patient in the list is an numpy array [number of visits, emb_size=175]

input_seqs = []

def convert_to_one_hot(code_seqs, vocab):
    new_code_seqs = torch.zeros((len(code_seqs),1,vocab))
    for i, code_seq in enumerate(code_seqs):
        for code in code_seq:
            new_code_seqs[i][0][code] = 1
    return new_code_seqs

for i in range(len(input_seqs_icd)):
    icd_onehot = convert_to_one_hot(input_seqs_icd[i], vocabsize_icd)
    med_onehot = convert_to_one_hot(input_seqs_meds[i], vocabsize_meds)
    lab_onehot = convert_to_one_hot(input_seqs_labs[i], vocabsize_labs)

    latent_inputs_oh = torch.concat((icd_onehot, med_onehot, lab_onehot), 2)
#     print(latent_inputs_oh.shape)
    latent_inputs = torch.matmul(latent_inputs_oh, AE_emb_weights)
    input_seqs.append(latent_inputs)

In [6]:
print(len(input_seqs))
print(input_seqs[0].size())

7537
torch.Size([2, 1, 175])


### Define the model 

In [7]:
class MyRNN(nn.Module):
    def __init__(self, epochs, batchsize, vocabsize):
        super(MyRNN, self).__init__()
        self.epochs = 5
        self.batchsize = batchsize
        self.vocabsize = vocabsize
        self.rnn = nn.LSTM(input_size=vocabsize, hidden_size=vocabsize, num_layers=1)
        self.out = nn.Linear(vocabsize, 1)
        self.sig = nn.Sigmoid()
    
    def forward(self, x, hidden=None, force=True, steps=0):
        if force or steps == 0: steps = len(x)
        outputs = Variable(torch.zeros(steps, 1, 1))
        x = F.relu(x)
        x = x.view(x.size()[0],1,x.size()[1])
        outputs, hidden = self.rnn(x, hidden)
        outputs = self.out(outputs)
        return outputs.squeeze(), hidden
    
    def predict(self, x):
        out, hid = self.forward(x, None)
        return self.sig(out)

### Train the model and predict using validated and test data

In [8]:
trainratio = 0.7
validratio = 0.1
testratio = 0.2

trainlindex = int(len(input_seqs)*trainratio)
validlindex = int(len(input_seqs)*(trainratio + validratio))
batchsize = 50

In [None]:
best_aucrocs = []
for run in range(10):
    print('Run', run)
    perm = np.random.permutation(len(input_seqs))
    rinput_seqs = [input_seqs[i] for i in perm]
    rlabels = [labels[i] for i in perm]

    rlabels = torch.tensor(rlabels)
    train_input_seqs = rinput_seqs[:trainlindex]
    train_labels = rlabels[:trainlindex]
    train_labels = train_labels.reshape(train_labels.shape[0],1)

    valid_input_seqs = rinput_seqs[trainlindex:validlindex]
    valid_labels = rlabels[trainlindex:validlindex]

    test_input_seqs = rinput_seqs[validlindex:]
    test_labels = rlabels[validlindex:]

    n_iters = len(train_input_seqs)

    model = MyRNN(n_epochs, 1, vocabsize)
    criterion = nn.BCEWithLogitsLoss(reduction='sum')
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    aucrocs = []

    n_epochs = 5
    for epoch in range(n_epochs):

        for i in (range(0, n_iters, batchsize)):
            batch_icd = train_input_seqs[i:i+batchsize]
            batch_train_labels = train_labels[i:i+batchsize]

            optimizer.zero_grad()
            losses = []

            for iter in range(len(batch_icd)):
                icd_inputs = Variable(batch_icd[iter].float())
                icd_inputs = icd_inputs.sum(axis=0)
                targets = Variable(batch_train_labels[iter].float())

                # Use teacher forcing 50% of the time
                force = random.random() < 0.5
                outputs, hidden = model(icd_inputs, None, force)
                outputs = outputs.reshape((1))
                #print outputs[-1], targets
                losses.append(criterion(outputs, targets))

            loss = sum(losses)/len(batch_icd)
            loss.backward()
            optimizer.step()

        ## Validation phase
        vpredictions = np.zeros(len(valid_input_seqs))
        for i in range(len(valid_input_seqs)):
            test_seq = valid_input_seqs[i]
            icd_inputs = Variable(test_seq.float())
            icd_inputs = icd_inputs.sum(axis=0)
            vpredictions[i] = model.predict(icd_inputs)

        ## Testing phase
        predictions = np.zeros(len(test_input_seqs))
        for i in range(len(test_input_seqs)):
            test_seq = test_input_seqs[i]
            icd_inputs = Variable(test_seq.float())
            icd_inputs = icd_inputs.sum(axis=0)
            predictions[i] = model.predict(icd_inputs)
        print("Test AUC_ROC: ", roc_auc_score(test_labels, predictions))
        actual_predictions = (predictions>0.5)*1
        # print classification_report(test_labels, actual_predictions)

        aucrocs.append(roc_auc_score(test_labels, predictions))
best_aucrocs.append(max(aucrocs))

print("Average AUCROC:", np.mean(best_aucrocs), "+/-", np.std(best_aucrocs))

Run 0
Test AUC_ROC:  0.7037027855748109
Test AUC_ROC:  0.7033233158408592
Test AUC_ROC:  0.7030563522089334
Test AUC_ROC:  0.7024728459848669
Test AUC_ROC:  0.7003981571881864
Run 1
Test AUC_ROC:  0.6831080870361231
Test AUC_ROC:  0.6817918316954287
Test AUC_ROC:  0.6812900953390802
Test AUC_ROC:  0.6792776464070358
Test AUC_ROC:  0.6777128160159382
Run 2
Test AUC_ROC:  0.6865809977474983
Test AUC_ROC:  0.6894501680144751
Test AUC_ROC:  0.6910952328200584
Test AUC_ROC:  0.6933865071452309
Test AUC_ROC:  0.6939902514678188
Run 3
Test AUC_ROC:  0.6835964604675109
Test AUC_ROC:  0.6845141723059934
Test AUC_ROC:  0.6845365555215662
Test AUC_ROC:  0.6884648098545837
Test AUC_ROC:  0.6802669571510644
Run 4
Test AUC_ROC:  0.6730317864806867
Test AUC_ROC:  0.6716030340963282
Test AUC_ROC:  0.6680413984263233
Test AUC_ROC:  0.6651559519551741
Test AUC_ROC:  0.6647331008583691
Run 5
Test AUC_ROC:  0.6799725938181742
Test AUC_ROC:  0.6839329376754062
Test AUC_ROC:  0.6838802334795875
Test AUC_ROC