In [1]:
from __future__ import division
import math
import numpy as np

import os
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import pickle
from tqdm import tqdm

In [2]:
# Vocabulary sizes for the three code families used as model inputs.

# Diagnoses only: 3-digit ICD codes.
vocabsize_icd = 942

# Medication codes.
vocabsize_meds = 3202

# Abnormal lab codes.
vocabsize_labs = 284

In [3]:
# Directory holding the pre-processed MIMIC-III sequence pickles.
DATA_PATH = 'E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/MIMIC_Processed/'


def _load_pickle(filename):
    """Unpickle `filename` from DATA_PATH and return the stored object.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes (or fails) instead of being left to the garbage collector.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only point this at files produced by a trusted preprocessing run.
    with open(os.path.join(DATA_PATH, filename), 'rb') as handle:
        return pickle.load(handle)


icd_seqs = _load_pickle('MIMICIIIPROCESSED.3digitICD9.seqs')
med_seqs = _load_pickle('MIMICIIIPROCESSED.meds.seqs')
lab_seqs = _load_pickle('MIMICIIIPROCESSED.abnlabs.seqs')

In [4]:
def combine_encounter(seqs, vocab):
    """Collapse a sequence of encounters into one per-code count vector.

    Each encounter in `seqs` is an iterable of integer code indices. Every
    encounter is first turned into a multi-hot row of width `vocab` (a code
    repeated inside one encounter still counts once), and the rows are then
    summed, so entry `k` of the result is the number of encounters that
    contain code `k`.

    Returns a float array of shape (vocab,).
    """
    multi_hot = np.zeros((len(seqs), vocab))
    # Iterating a 2-D array yields row views, so writes land in `multi_hot`.
    for row, codes in zip(multi_hot, seqs):
        for code in codes:
            row[code] = 1
    return multi_hot.sum(axis=0)

In [18]:
def _count_matrix(all_seqs, vocab):
    """Stack one combine_encounter() vector per entry of `all_seqs` into a tensor."""
    rows = []
    for idx in range(len(all_seqs)):
        rows.append(combine_encounter(all_seqs[idx], vocab))
    return torch.tensor(np.array(rows))


# Diagnosis (ICD-9) features.
input_icd = _count_matrix(icd_seqs, vocabsize_icd)

# Medication features.
input_med = _count_matrix(med_seqs, vocabsize_meds)

# Abnormal-lab features.
input_lab = _count_matrix(lab_seqs, vocabsize_labs)


In [19]:
# Report the shape of each modality tensor, in ICD / medication / lab order.
for modality_tensor in (input_icd, input_med, input_lab):
    print(modality_tensor.shape)

torch.Size([7537, 942])
torch.Size([7537, 3202])
torch.Size([7537, 284])


In [22]:
# Join the three modality matrices column-wise into a single feature matrix.
input_full = torch.cat([input_icd, input_med, input_lab], dim=1)
input_full.shape

torch.Size([7537, 4428])

In [35]:
class AE(nn.Module):
    """Single-hidden-layer autoencoder over concatenated ICD, medication and lab features.

    The three modality matrices are concatenated column-wise, projected to
    `embsize` dimensions by `self.emb` (followed by ReLU), and reconstructed
    by `self.out` (followed by ReLU). Training minimises the mean squared
    error between the reconstruction and the concatenated input.

    Args:
        epochs: maximum number of passes over the data made by `fit`.
        batchsize: number of rows per mini-batch in `fit`.
        embsize: width of the hidden (embedding) layer.
        input_dim: total width of the concatenated input. When omitted it
            defaults to `vocabsize_icd + vocabsize_meds + vocabsize_labs`
            taken from the notebook namespace.
    """

    def __init__(self, epochs, batchsize, embsize, input_dim=None):
        super(AE, self).__init__()
        self.epochs = epochs
        self.batchsize = batchsize
        self.embsize = embsize

        if input_dim is None:
            # Fall back to the notebook-level vocabulary sizes.
            input_dim = vocabsize_icd + vocabsize_meds + vocabsize_labs
        self.input_dim = input_dim

        self.emb = nn.Linear(input_dim, self.embsize)

        self.out = nn.Linear(self.embsize, input_dim)

        # Mean reduction is MSELoss's default; this replaces the deprecated
        # `size_average=True` argument, which requested the same thing.
        self.reconloss = nn.MSELoss()

    def forward(self, input_icd, input_med, input_lab):
        """Encode and reconstruct one batch.

        Args:
            input_icd, input_med, input_lab: float tensors with the same
                number of rows; they are concatenated along dimension 1.

        Returns:
            `[output_full, hidden_full]`: the reconstruction of the
            concatenated input and the hidden-layer activations.
        """
        input_full = torch.cat((input_icd, input_med, input_lab), 1)

        hidden_full = F.relu(self.emb(input_full))

        output_full = F.relu(self.out(hidden_full))

        return [output_full, hidden_full]

    def get_encodings(self, ICD_data, Lab_data, Med_data=None):
        """Return the hidden-layer activations for the given feature matrices.

        Args:
            ICD_data, Lab_data, Med_data: NumPy arrays or tensors holding the
                ICD, lab and medication features. `Med_data` is listed last
                (with a placeholder default) only to keep the original
                positional order of the first two parameters; it is required.

        Raises:
            TypeError: if `Med_data` is not supplied. The encoder's input
                layer spans all three modalities, so it cannot run without it.
        """
        if Med_data is None:
            raise TypeError(
                'get_encodings() requires Med_data: the encoder input is the '
                'concatenation of ICD, medication and lab features'
            )
        # as_tensor accepts both NumPy arrays and tensors.
        icd, med, lab = (torch.as_tensor(part).float()
                         for part in (ICD_data, Med_data, Lab_data))
        # forward() expects (ICD, medication, lab) order.
        return self.forward(icd, med, lab)[-1]

    def fit(self, ICDs, Meds, Labs, lr=0.01, tol=0.00005):
        """Train the autoencoder with Adam on mini-batches.

        Rows are reshuffled (with NumPy's global RNG) at the start of every
        epoch. Training stops after `self.epochs` epochs, or earlier once the
        mean batch loss changes by less than `tol` between consecutive epochs.

        Args:
            ICDs, Meds, Labs: tensors with one row per sample and matching
                row counts.
            lr: Adam learning rate.
            tol: early-stopping threshold on the change in epoch loss.
        """
        optimizer = optim.Adam(self.parameters(), lr)

        # Infinity guarantees the first epoch can never trigger early stopping.
        prev_loss = float('inf')
        for epoch in range(self.epochs):
            print('Epoch:', epoch)

            perm = np.random.permutation(ICDs.shape[0])
            # The same permutation is applied to all three so rows stay aligned.
            ICDs, Meds, Labs = ICDs[perm], Meds[perm], Labs[perm]

            losses = []

            for start in range(0, ICDs.shape[0], self.batchsize):
                stop = start + self.batchsize
                icd_batch = ICDs[start:stop].float()
                med_batch = Meds[start:stop].float()
                lab_batch = Labs[start:stop].float()

                reconstruction = self.forward(icd_batch, med_batch, lab_batch)[0]
                target = torch.cat((icd_batch, med_batch, lab_batch), 1)

                loss = self.reconloss(reconstruction, target)

                # .item() stores a plain Python float rather than a tensor.
                losses.append(loss.item())

                optimizer.zero_grad()

                loss.backward()

                optimizer.step()

            epoch_loss = np.mean(losses)
            print('Epoch loss:', epoch_loss)

            if abs(epoch_loss - prev_loss) < tol:
                break

            prev_loss = epoch_loss


In [36]:
# Seed the RNGs used by weight initialisation (torch) and by the per-epoch
# shuffle in AE.fit (NumPy) so that re-running this cell reproduces the
# same training run.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Up to 10 epochs, mini-batches of 50 rows, 175-dimensional embedding.
model = AE(epochs=10, batchsize=50, embsize=175)

model.fit(input_icd, input_med, input_lab)

Epoch: 0
Epoch loss: 0.047882527
Epoch: 1
Epoch loss: 0.0480279
Epoch: 2
Epoch loss: 0.04901274
Epoch: 3
Epoch loss: 0.046973605
Epoch: 4
Epoch loss: 0.047023993
Epoch: 5
Epoch loss: 0.04575749
Epoch: 6
Epoch loss: 0.04655977
Epoch: 7
Epoch loss: 0.046920467
Epoch: 8
Epoch loss: 0.04828365
Epoch: 9
Epoch loss: 0.048332077


In [38]:
# nn.Linear stores its weight as (out_features, in_features); transposing
# gives one row per input code and one column per embedding dimension.
# The array is a view that shares memory with the model's parameters.
emb_weights = model.emb.weight.detach().numpy().T
print('Pickled embedding weights. Shape:', emb_weights.shape)

Pickled embedding weights. Shape: (4428, 175)


In [39]:
# Directory that receives the trained embedding matrix.
Out_path = 'E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/Output/'

# NOTE: despite the .npy extension this file is written with pickle, not
# np.save, so it must be read back with pickle.load rather than np.load.
# The `with` block closes the handle, ensuring the data is flushed to disk.
with open(os.path.join(Out_path, 'AE_embedding_weights.npy'), 'wb') as handle:
    pickle.dump(emb_weights, handle)