In [1]:
import sys
sys.path.append("./../../src")
import pandas as pd
import numpy as np

import torch
from torch import nn, optim

In [2]:
from vae_with_dataloader import VAE, VAETrainerWithDataLoader
from dataloader import PatientSparseSimilarityDataset
from visualizer_helper import Visualizer

import pickle

from config_reader import Config
import os
import math
from sklearn.model_selection import train_test_split

In [3]:
# --- Configuration ---
# Parse the experiment settings from the INI file next to this notebook and
# keep the experiment name in its own variable, since later cells use it for
# the dataset index and checkpoint file names.
config = Config('./config.ini')
experiment_name = config.experiment_name

# Display every parsed setting as the cell output.
vars(config)

{'patient_icd_path': './../../data/PATIENT_ICD_BINARY_SPARSE_CSR.p',
 'icd9codes_path': '/data1/andrew/meng/mixehr/data/Mimic/mimic-iii-clinical-database-1.4/D_ICD_DIAGNOSES.csv.gz',
 'subject_ids_path': './../../data/PATIENT_ICD_SUBJECT_IDS.csv',
 'experiment_name': '200214_patient_similarity_clusters_default',
 'encoder_dim': [(250, 500), (500, 250), (250, 100)],
 'latent_dim': 25,
 'decoder_dim': [(250, 500), (500, 250), (250, 100)],
 'use_relu_encoder': True,
 'use_relu_decoder': True,
 'kld_beta': 1.0}

In [4]:
# --- Data ---
# Build the patient similarity dataset from the sparse patient x ICD matrix
# named in the config; the experiment name is passed through so the dataset
# can locate its per-experiment artifacts (the cell output reports an ANNOY
# index being loaded).
sparse_similarity_dataset = PatientSparseSimilarityDataset(
    experiment_name=experiment_name,
    csr_data_path=config.patient_icd_path,
)

Loaded CSR Dataset w/ dim (46520, 6984)
Loaded existing ANNOY index from ./../../../large_data_files/200214_patient_similarity_clusters_default.ann


In [5]:
# NOTE: ICD code lookup (an ICDAnalysisHelper built from config.icd9codes_path)
# is currently disabled in this notebook; only the visualizer is set up here.

# Later cells refer to the dataset as `patient_icd_dataset`; it is the same
# object as `sparse_similarity_dataset`, not a copy.
patient_icd_dataset = sparse_similarity_dataset
visualizer = Visualizer()

In [6]:
# --- Model ---
# Architecture comes from the config file; the input width is taken from the
# dataset so the model always matches the loaded feature matrix.
vae_settings = dict(
    feature_dim=patient_icd_dataset.get_feat_dim(),
    encoder_dim=config.encoder_dim,
    latent_dim=config.latent_dim,
    decoder_dim=config.decoder_dim,
    use_relu_encoder=config.use_relu_encoder,
    use_relu_decoder=config.use_relu_decoder,
)

# Cast the parameters to float64 right after construction.
model = VAE(**vae_settings).double()

# Adam over all model parameters with a fixed learning rate.
optimizer = optim.Adam(model.parameters(), lr=5e-3)

# --- Trainer ---
# Bundles the model and optimizer with the experiment name and the KL weight
# (kld_beta) read from the config.
trainer = VAETrainerWithDataLoader(
    experiment_name=config.experiment_name,
    kld_beta=config.kld_beta,
    model=model,
    optimizer=optimizer,
)

In [7]:
# Mini-batch loader used for training: shuffled batches of 32 samples.
training_params = dict(batch_size=32, shuffle=True)
training_generator = torch.utils.data.DataLoader(
    dataset=sparse_similarity_dataset,
    **training_params,
)

In [None]:
# --- Training ---
# Run 80 epochs over the shuffled training loader with gradient clipping
# switched off; save_model_interval=2 is passed through to the trainer
# (presumably a checkpoint every second epoch -- confirm in the trainer).
train_options = dict(
    training_generator=training_generator,
    epochs=80,
    save_model_interval=2,
    clip_gradients=False,
)
trainer.train(**train_options)

In [None]:
# Plot the curves recorded by the trainer, one call per metric. Going by the
# method names these are the ELBO, the BCE term and the KLD term -- presumably
# total objective, reconstruction loss and KL divergence; confirm in the
# trainer implementation.
trainer.plot_elbo()
trainer.plot_bce()
# Kept as the last expression so any value it returns is what the cell shows.
trainer.plot_kld()

In [8]:
###Load pre-trained model
# Epoch of the saved checkpoint to restore; later cells reuse `epoch` when
# naming the encoded-output files.
epoch = 6
checkpoint_path = "./VAE_exp_{}_epoch_{}.pkl".format(experiment_name, epoch)

# map_location="cpu" lets a checkpoint written from GPU tensors be read on a
# machine without CUDA. load_state_dict then copies the values into the
# model's existing parameters, so the model stays on its current device.
state_dict = torch.load(checkpoint_path, map_location="cpu")
model.load_state_dict(state_dict)

# Make sure the trainer operates on the restored weights.
trainer.model = model

In [None]:
###Encode data
# Unshuffled loader so the encoded rows keep the dataset's sample order.
encoding_params = {'batch_size':64, 'shuffle': False}
encoding_generator = torch.utils.data.DataLoader(patient_icd_dataset, **encoding_params)

# The trainer returns three objects, unpacked here as the latent values, the
# means and the variances (names as used by this notebook).
latent, means, var = trainer.encode_data(encoding_generator)

# Persist each result under a name tagged with the checkpoint epoch. The files
# are opened in `with` blocks so every handle is flushed and closed even if
# pickling fails part-way through.
for encoded, filename_template in (
    (latent, "latent_epochs_{}.p"),
    (means, "means_epochs_{}.p"),
    (var, "vars_epochs_{}.p"),
):
    with open(filename_template.format(epoch), 'wb') as out_file:
        pickle.dump(encoded, out_file)

In [None]:
###Get UMAP representations
# Each encoded tensor is moved to the CPU, detached from the autograd graph
# and converted to a NumPy array before being handed to the visualizer, which
# is asked for a 3-component embedding.
X_umap = visualizer.umap_embedding(latent.cpu().detach().numpy(), n_components=3)
mean_umap = visualizer.umap_embedding(means.cpu().detach().numpy(), n_components=3)
var_umap = visualizer.umap_embedding(var.cpu().detach().numpy(), n_components=3)

# Save the embeddings. The files are opened in `with` blocks so every handle
# is flushed and closed even if pickling fails part-way through.
for embedding, filename in (
    (X_umap, "X_umap_3d.p"),
    (mean_umap, "mean_umap_3d.p"),
    (var_umap, "var_umap_3d.p"),
):
    with open(filename, 'wb') as out_file:
        pickle.dump(embedding, out_file)