In [1]:
from __future__ import division

import argparse
import math
import os
import pickle
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import roc_auc_score, classification_report, roc_curve
from torch.autograd import Variable
from tqdm import tqdm

### Model construction

In [9]:
class RNN(nn.Module):
    """Single-layer LSTM that scores a sequence of latent vectors.

    Each input row is passed through ReLU, fed to the LSTM as one time step
    of a batch of size 1, and projected to a single logit per step.

    Parameters
    ----------
    epochs : int
        Stored on the instance as ``self.epochs``; not used by the model itself.
    batchsize : int
        Stored on the instance as ``self.batchsize``; not used by the model itself.
    vocabsize : int
        Stored on the instance as ``self.vocabsize``; not used by the model itself.
    embsize : int
        Width of each input vector and of the LSTM hidden state.
    """

    def __init__(self, epochs=5, batchsize=50, vocabsize=5, embsize=100):
        super(RNN, self).__init__()
        # Keep the value the caller passed instead of a hard-coded constant.
        self.epochs = epochs
        self.batchsize = batchsize
        self.vocabsize = vocabsize
        self.embsize = embsize

        self.rnn = nn.LSTM(input_size=embsize, hidden_size=embsize, num_layers=1)
        self.out = nn.Linear(embsize, 1)
        self.sig = nn.Sigmoid()

    def forward(self, input_latent, hidden=None, force=True, steps=0):
        """Return one logit per time step together with the final LSTM state.

        Parameters
        ----------
        input_latent : torch.Tensor
            2-D tensor of shape ``(seq_len, embsize)``.
        hidden : tuple of torch.Tensor or None
            Initial LSTM state; ``None`` lets the LSTM start from zeros.
        force, steps :
            Accepted for interface compatibility; they do not influence the
            computation.

        Returns
        -------
        (torch.Tensor, tuple)
            A 1-D tensor of ``seq_len`` logits (no sigmoid applied) and the
            LSTM ``(h_n, c_n)`` state.
        """
        inputs = F.relu(input_latent)

        # The LSTM expects (seq_len, batch, features); insert a batch axis of 1.
        inputs = inputs.view(inputs.size(0), 1, inputs.size(1))
        outputs, hidden = self.rnn(inputs, hidden)
        outputs = self.out(outputs)

        # Flatten (seq_len, 1, 1) -> (seq_len,). Unlike squeeze(), this keeps a
        # 1-D result for a one-step sequence so callers can still index [-1].
        return outputs.reshape(-1), hidden

    def predict(self, input_latent):
        """Return the sigmoid of the last-step logit as a 0-d tensor."""
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            out, _ = self(input_latent, None)
            return self.sig(out[-1])

### Loading Data

In [10]:
# Directories holding the processed MIMIC inputs and the CAE outputs.
# The defaults are the original author's local paths; set the environment
# variables PAPER290_DATA_PATH / PAPER290_CAE_PATH to run on another machine.
# os.path.join(path, '') guarantees a trailing separator, because the file
# names are appended to these strings with '+'.
DATA_PATH = os.path.join(
    os.environ.get(
        'PAPER290_DATA_PATH',
        'E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/MIMIC_Processed/',
    ),
    '',
)
CAE_PATH = os.path.join(
    os.environ.get(
        'PAPER290_CAE_PATH',
        'E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/Output/',
    ),
    '',
)

In [20]:
def _load_pickle(path):
    """Unpickle the file at ``path`` and close the handle afterwards.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    you produced yourself or otherwise trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


n_epochs = 1

# Vocabulary size per code type; their sum is the width of the concatenated
# multi-hot vector that is projected through the CAE embedding weights.
vocabsize_icd = 942
vocabsize_meds = 3202
vocabsize_labs = 284 #all 681
vocabsize = vocabsize_icd+vocabsize_meds+vocabsize_labs

# Width of the latent vectors; also used as the LSTM input/hidden size.
embsize_latent = 175
embsize = embsize_latent

input_seqs_icd = _load_pickle(DATA_PATH + 'MIMICIIIPROCESSED.3digitICD9.seqs')
input_seqs_meds = _load_pickle(DATA_PATH + 'MIMICIIIPROCESSED.meds.seqs')
input_seqs_labs = _load_pickle(DATA_PATH + 'MIMICIIIPROCESSED.abnlabs.seqs')
input_seqs_fullicd = _load_pickle(DATA_PATH + 'MIMICIIIPROCESSED.seqs')

input_seqs_latent = _load_pickle(CAE_PATH + 'CAE_Embeddings.seqs')
# allow_pickle=True also unpickles object arrays; same trust caveat as above.
CAE_emb_weights = torch.tensor(np.load(CAE_PATH + 'CAE_embedding_weights.npy',allow_pickle=True))
print(CAE_emb_weights.size())

# Fail early with a readable message: the training cell multiplies a
# (visits, vocabsize) matrix by these weights and feeds the result to an LSTM
# whose input size is embsize.
assert tuple(CAE_emb_weights.size()) == (vocabsize, embsize), (
    'CAE embedding weights have shape %s, expected (%d, %d)'
    % (tuple(CAE_emb_weights.size()), vocabsize, embsize)
)

labels = _load_pickle(DATA_PATH + 'MIMICIIIPROCESSED.morts')

torch.Size([4428, 175])


In [21]:
print('Data loaded..')

# Fractions of the records assigned to the train / validation / test splits.
trainratio, validratio, testratio = 0.7, 0.1, 0.2

# Slice boundaries: [0, trainlindex) is train, [trainlindex, validlindex) is
# validation, and everything from validlindex onwards is test.
trainlindex = int(trainratio * len(input_seqs_icd))
validlindex = int((trainratio + validratio) * len(input_seqs_icd))

# Helper functions used during training and evaluation: multi-hot encoding of each code sequence, and bucketing a record by its average inner-sequence length (used for the per-bucket correct/total counts).
def convert_to_one_hot(code_seqs, len_):
    """Encode each code sequence as a multi-hot row of length ``len_``.

    Parameters
    ----------
    code_seqs : iterable of iterables of int
        One inner sequence per output row; every code is used as a column
        index and must be valid for an axis of size ``len_``.
    len_ : int
        Number of columns (vocabulary size).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(number of sequences, len_)`` with 1 at every
        listed code and 0 elsewhere. Repeated codes still yield 1. An empty
        ``code_seqs`` gives shape ``(0, len_)`` so the result can always be
        concatenated along axis 1 with other encodings.

    Raises
    ------
    IndexError
        If a code is out of range for ``len_``.
    """
    rows = list(code_seqs)
    # Allocate the whole matrix up front so the result is 2-D even with no rows.
    encoded = np.zeros((len(rows), len_))
    for row_idx, codes in enumerate(rows):
        for code in codes:
            encoded[row_idx, code] = 1
    return encoded

def get_avg(seqs, type_):
    """Map the mean inner-sequence length of ``seqs`` to a bucket index 0-4.

    The mean length is rounded to the nearest integer and divided by a bucket
    width of 5 when ``type_ == 'i'`` and 50 for any other ``type_``; the
    truncated quotient is capped at 4.

    Raises ZeroDivisionError when ``seqs`` is empty.
    """
    total_len = sum(len(seq) for seq in seqs)
    mean_len = round(total_len*1.0/len(seqs))
    bucket_width = 5 if type_ == 'i' else 50
    return min(4, int(mean_len/bucket_width))

Data loaded..


### Model Training

In [22]:
def encode_visits(icd_visits, med_visits, lab_visits, emb_weights):
    """Project one record's code sequences into the latent space.

    Each of the three inputs is multi-hot encoded with ``convert_to_one_hot``,
    the encodings are concatenated column-wise (ICD, meds, labs) and the
    result is multiplied by ``emb_weights``.

    Parameters
    ----------
    icd_visits, med_visits, lab_visits : sequences of code sequences
        Must all contain the same number of inner sequences.
    emb_weights : numpy.ndarray
        Matrix with ``vocabsize`` rows.

    Returns
    -------
    torch.FloatTensor with one row per inner sequence.
    """
    multi_hot = np.concatenate(
        (
            convert_to_one_hot(icd_visits, vocabsize_icd),
            convert_to_one_hot(med_visits, vocabsize_meds),
            convert_to_one_hot(lab_visits, vocabsize_labs),
        ),
        1,
    )
    return torch.from_numpy(np.dot(multi_hot, emb_weights)).float()


# process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()

print('Starting training..')

batchsize = 50
n_runs = 10

# Convert the embedding weights to a NumPy array once instead of on every
# np.dot call inside the loops.
cae_weights_np = np.asarray(CAE_emb_weights)

best_aucrocs = []
for run in range(n_runs):
    print('Run', run)

    # Fresh random split for every run; the same permutation is applied to
    # every aligned input list and to the labels.
    perm = np.random.permutation(len(input_seqs_icd))
    rinput_seqs_icd = [input_seqs_icd[i] for i in perm]
    rinput_seqs_meds = [input_seqs_meds[i] for i in perm]
    rinput_seqs_labs = [input_seqs_labs[i] for i in perm]
    rinput_seqs_latent = input_seqs_latent[perm]
    rinput_seqs_fullicd = [input_seqs_fullicd[i] for i in perm]
    rlabels = torch.tensor([labels[i] for i in perm])

    train_input_seqs_icd = rinput_seqs_icd[:trainlindex]
    train_input_seqs_meds = rinput_seqs_meds[:trainlindex]
    train_input_seqs_labs = rinput_seqs_labs[:trainlindex]
    train_input_seqs_latent = rinput_seqs_latent[:trainlindex]
    train_labels = rlabels[:trainlindex]
    # Shape (N, 1) so that train_labels[k] is a length-1 target tensor.
    train_labels = train_labels.reshape(train_labels.shape[0],1)

    valid_input_seqs_icd = rinput_seqs_icd[trainlindex:validlindex]
    valid_input_seqs_meds = rinput_seqs_meds[trainlindex:validlindex]
    valid_input_seqs_labs = rinput_seqs_labs[trainlindex:validlindex]
    valid_input_seqs_latent = rinput_seqs_latent[trainlindex:validlindex]
    valid_labels = rlabels[trainlindex:validlindex]

    test_input_seqs_icd = rinput_seqs_icd[validlindex:]
    test_input_seqs_meds = rinput_seqs_meds[validlindex:]
    test_input_seqs_labs = rinput_seqs_labs[validlindex:]
    test_input_seqs_latent = rinput_seqs_latent[validlindex:]
    test_input_seqs_fullicd = rinput_seqs_fullicd[validlindex:]
    test_labels = rlabels[validlindex:]

    n_iters = len(train_input_seqs_icd)

    model = RNN(n_epochs, 1, vocabsize, embsize)
    criterion = nn.BCEWithLogitsLoss(reduction='sum')
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    aucrocs = []        # test AUC after each epoch
    valid_aucrocs = []  # validation AUC after each epoch

    for epoch in range(n_epochs):
        epoch_loss = 0
        print('Epoch', (epoch+1))

        for i in range(0, n_iters, batchsize):
            batch_icd = train_input_seqs_icd[i:i+batchsize]
            batch_meds = train_input_seqs_meds[i:i+batchsize]
            batch_labs = train_input_seqs_labs[i:i+batchsize]
            batch_train_labels = train_labels[i:i+batchsize]

            optimizer.zero_grad()
            losses = []

            # Records have different sequence lengths, so each one is run
            # through the model separately and the losses are averaged.
            for j in range(len(batch_icd)):
                latent_inputs = encode_visits(
                    batch_icd[j], batch_meds[j], batch_labs[j], cae_weights_np)
                targets = batch_train_labels[j].float()

                # Randomly chosen flag forwarded to the model as `force`.
                # NOTE(review): confirm the model actually acts on it.
                force = random.random() < 0.5
                outputs, hidden = model(latent_inputs, None, force)

                # Only the last step's logit is scored. reshape(-1)[-1:] gives
                # a length-1 tensor whether the model returned a 0-d or 1-d
                # tensor.
                loss = criterion(outputs.reshape(-1)[-1:], targets)
                losses.append(loss)

            loss = sum(losses)/len(batch_icd)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.detach()

        print(epoch, epoch_loss)

        # Evaluation needs no gradients.
        with torch.no_grad():
            ## Validation phase
            vpredictions = np.zeros(len(valid_input_seqs_icd))
            for i in range(len(valid_input_seqs_icd)):
                valid_input_latent = encode_visits(
                    valid_input_seqs_icd[i], valid_input_seqs_meds[i],
                    valid_input_seqs_labs[i], cae_weights_np)
                vpredictions[i] = model.predict(valid_input_latent)

            valid_auc = roc_auc_score(valid_labels, vpredictions)
            print("Validation AUC_ROC: ", valid_auc)

            ## Testing phase
            predictions = np.zeros(len(test_input_seqs_icd))

            # Correct / total counts per bucket returned by get_avg.
            ICD_wise_corr = np.zeros(5)
            meds_wise_corr = np.zeros(5)
            labs_wise_corr = np.zeros(5)
            ICD_wise_tot = np.zeros(5)
            meds_wise_tot = np.zeros(5)
            labs_wise_tot = np.zeros(5)

            for i in range(len(test_input_seqs_icd)):
                test_input_latent = encode_visits(
                    test_input_seqs_icd[i], test_input_seqs_meds[i],
                    test_input_seqs_labs[i], cae_weights_np)
                predictions[i] = model.predict(test_input_latent)

                # 1 when the 0.5-thresholded prediction matches the label.
                correct = int((predictions[i]>0.5)*1 == test_labels[i])

                icd_bucket = get_avg(test_input_seqs_icd[i], 'i')
                ICD_wise_corr[icd_bucket] += correct
                ICD_wise_tot[icd_bucket] += 1

                meds_bucket = get_avg(test_input_seqs_meds[i], 'm')
                meds_wise_corr[meds_bucket] += correct
                meds_wise_tot[meds_bucket] += 1

                labs_bucket = get_avg(test_input_seqs_labs[i], 'l')
                labs_wise_corr[labs_bucket] += correct
                labs_wise_tot[labs_bucket] += 1

        test_auc = roc_auc_score(test_labels, predictions)
        print("Test AUC_ROC: ", test_auc)

        valid_aucrocs.append(valid_auc)
        aucrocs.append(test_auc)

        # The ROC file is overwritten on every epoch of every run, so it ends
        # up holding the curve of the last evaluation.
        fpr, tpr, _ = roc_curve(test_labels, predictions)
        with open(CAE_PATH+'roc_clout_cornn.p', 'wb') as roc_file:
            pickle.dump({"FPR":fpr, "TPR":tpr}, roc_file)

        actual_predictions = (predictions>0.5)*1
        print(classification_report(test_labels, actual_predictions))

    # Report the test AUC of the epoch that did best on the validation split.
    # Taking max() over the test AUCs would select the model on the test set
    # and bias the reported number upwards.
    best_aucrocs.append(aucrocs[int(np.argmax(valid_aucrocs))])

print("Average AUCROC:", np.mean(best_aucrocs), "+/-", np.std(best_aucrocs))


end = time.process_time()
print('The training is complete!')
print('The time used is: ', end - start)

Starting training..
Run 0
Epoch 1
0 tensor(61.3961)
Validation AUC_ROC:  0.7469008264462811
Test AUC_ROC:  0.755851886510443
              precision    recall  f1-score   support

           0       0.72      0.85      0.78       927
           1       0.65      0.47      0.54       581

    accuracy                           0.70      1508
   macro avg       0.69      0.66      0.66      1508
weighted avg       0.69      0.70      0.69      1508

Run 1
Epoch 1
0 tensor(61.6677)
Validation AUC_ROC:  0.778673215016557
Test AUC_ROC:  0.7576651599830776
              precision    recall  f1-score   support

           0       0.71      0.87      0.78       926
           1       0.67      0.43      0.53       582

    accuracy                           0.70      1508
   macro avg       0.69      0.65      0.65      1508
weighted avg       0.69      0.70      0.68      1508

Run 2
Epoch 1
0 tensor(62.0193)
Validation AUC_ROC:  0.7480368079558546
Test AUC_ROC:  0.7809256019405458
          