Here we quickly demonstrate how to use the different modules in our package.
We will go through how to use the package in 2 major sections:
1. Process data through our dataloader 
2. Specify parameters and train
***
* The parts where you can take the most control (modify to suit your needs) will have explanations highlighted in **bold**. 
* Hyperparameters should be self-explanatory with details in options() function. 
* You can also find comments at the beginning of each cell describing its general functionality.

### Step0. Load libraries and our modules

In [5]:
"""
Created on Wed Nov 28 12:57:40 2018
@author: ginnyzhu
"""
from __future__ import print_function, division
from io import open
import string
import re
import random

import os
import argparse
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

# Prefer cPickle when it can be imported (Python 2); otherwise fall back to the
# standard pickle module. Only ImportError is caught so that unrelated failures
# (e.g. KeyboardInterrupt) are not silently swallowed.
try:
    import cPickle as pickle
except ImportError:
    import pickle
    
#import self-defined modules
#models, utils, and Dataloader
#sys.path.insert() only for jupyter notebook imports
import sys
sys.path.insert(0, '../ehr_pytorch')
import models as model 
from EHRDataloader import EHRdataFromPickles, EHRdataloader 
import utils as ut #:)))) 
from EHREmb import EHREmbeddings

#silly ones
from termcolor import colored

# Detect whether CUDA can be used and pick the matching torch device.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [71]:
#args, slightly modified from main.py file to be more compatible with jupyter notebook 
#all args provide default values, so you can run the whole notebook without changing/providing any args
#args ordered by dataloader, model, and training sections
def options(argv=None):
    """Build the hyperparameters used throughout this notebook.

    Options are grouped by dataloader, model and training sections, and every
    option has a default, so the notebook runs without supplying anything.

    Args:
        argv: optional list of command-line style strings to parse, e.g.
            ``['-which_model', 'RNN']``. Defaults to an empty list, so all
            options take their default values and the notebook kernel's own
            ``sys.argv`` is never parsed.

    Returns:
        argparse.Namespace with one attribute per option.
    """
    def str2bool(value):
        # argparse's ``type=bool`` converts any non-empty string (including
        # 'False') to True, so boolean options are parsed explicitly.
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError('boolean value expected, got %r' % (value,))

    parser = argparse.ArgumentParser(description='Predictive Analytics on EHR with Pytorch')
    
    #EHRdataloader 
    parser.add_argument('-root_dir', type = str, default = '../data/' , 
                        help='the path to the folders with pickled file(s)')
    parser.add_argument('-file', type = str, default = 'hf.train' , 
                        help='the name of pickled files')
    parser.add_argument('-test_ratio', type = float, default = 0.2, 
                        help='test data size [default: 0.2]')
    parser.add_argument('-valid_ratio', type = float, default = 0.1, 
                        help='validation data size [default: 0.1]')
    
    #EHRmodel
    parser.add_argument('-which_model', type = str, default = 'DRNN', 
                        help='choose from {"RNN","DRNN","QRNN","LR"}') 
    parser.add_argument('-cell_type', type = str, default = 'GRU', 
                        help='For RNN based models, choose from {"RNN", "GRU", "LSTM", "QRNN" (for QRNN model only)}')
    # ``type=list`` would split the argument string into single characters;
    # one or more integers are collected into a list instead.
    parser.add_argument('-input_size', type = int, nargs = '+', default =[15817], 
                        help='''input dimension(s), decide which embedding types to use. 
                        If len of 1, then  1 embedding; 
                        len of 3, embedding medical, diagnosis and others separately (3 embeddings) 
                        [default:[15817]]''')
    parser.add_argument('-embed_dim', type=int, default=128, 
                        help='number of embedding dimension [default: 128]')
    parser.add_argument('-hidden_size', type=int, default=128, 
                        help='size of hidden layers [default: 128]')
    parser.add_argument('-dropout_r', type=float, default=0.1, 
                        help='the probability for dropout[default: 0.1]')
    parser.add_argument('-n_layers', type=int, default=3, 
                        help='''number of Layers, 
                        for Dilated RNNs, dilations will increase exponentialy with mumber of layers [default: 3]''')
    parser.add_argument('-bii', type=str2bool, default=False, 
                        help='indicator of whether Bi-directin is activated. [default: False]')
    parser.add_argument('-time', type=str2bool, default=False, 
                        help='indicator of whether time is incorporated into embedding. [default: False]')
    parser.add_argument('-preTrainEmb', type= str, default='', 
                        help="path to pretrained embeddings file. [default: '']")
    parser.add_argument("-output_dir",type=str, default= '../models/', 
                        help="The output directory where the best model will be saved and logs written [default: we will create'../models/'] ")
    
    # training 
    parser.add_argument('-lr', type=float, default=10**-4, 
                        help='learning rate [default: 0.0001]')
    parser.add_argument('-L2', type=float, default=10**-4, 
                        help='L2 regularization [default: 0.0001]')
    parser.add_argument('-epochs', type=int, default= 100, 
                        help='number of epochs for training [default: 100]')
    parser.add_argument('-patience', type=int, default= 20, 
                        help='number of stagnant epochs to wait before terminating training [default: 20]')
    parser.add_argument('-batch_size', type=int, default=128, 
                        help='batch size for training, validation or test [default: 128]')
    # ``str.lower`` normalizes the value before it is checked against
    # ``choices``, which makes the option case-insensitive as the help promises.
    parser.add_argument('-optimizer', type=str.lower, default='adam', 
                        choices=  ['adam','adadelta','adagrad', 'adamax', 'asgd','rmsprop', 'rprop', 'sgd'], 
                        help='Select which optimizer to train [default: adam]. Upper/lower case does not matter') 
    #parser.add_argument('-cuda', type= bool, default=True, help='whether GPU is available [default:True]')
    args = parser.parse_args([] if argv is None else argv)
    return args 

### StepX: You can modify parameters here to suit your own needs

* All parameters have explanations in the cell above

In [70]:
# Start from the default options, then override whichever ones you want.
args = options()
# --- example overrides: edit or remove entries to suit your needs ---
overrides = {
    'which_model': 'RNN',
    'cell_type': 'GRU',
    'embed_dim': 256,
    'hidden_size': 256,
    'dropout_r': 0.2,
    'n_layers': 2,
}
for option_name, option_value in overrides.items():
    setattr(args, option_name, option_value)
# --- end of example overrides ---
print(args)

Namespace(L2=0.0001, batch_size=128, bii=False, cell_type='GRU', dropout_r=0.2, embed_dim=256, epochs=100, file='hf.train', hidden_size=256, input_size=[15817], lr=0.0001, n_layers=2, optimizer='adam', output_dir='../models/', patience=20, preTrainEmb='', root_dir='../data/', test_ratio=0.2, time=False, valid_ratio=0.1, which_model='RNN')


### Step1. Data preparation

In [72]:
#### Step1. Data preparation
# Load the pickled data first.
# sort=False is passed so the data is not sorted (on visit length) before it is split.
# Pass your own test/validation ratios here if you do not want the default ones.
print(colored("\nLoading and preparing data...", 'green'))
pickle_options = dict(
    root_dir=args.root_dir,
    file=args.file,
    sort=False,
    test_ratio=args.test_ratio,
    valid_ratio=args.valid_ratio,
)
data = EHRdataFromPickles(**pickle_options)

[32m
Loading and preparing data...[0m


In [73]:
# Show one example record from the pickled data (40 is the record index).
# With seeDescription=True a formatted table is printed as well, describing
# what each value means and how the values are organized in the file.
print(data.__getitem__(40, seeDescription=True))

# Split the dataset into train, test and validation subsets.
train, test, valid = data.__splitdata__()
# The rest of this cell is purely informational and can be commented out.
print(colored("\nSample data after split:", 'green'))
# Last record of the train, test and validation subsets, one per line.
named_subsets = [("train", train), ("test", test), ("validation", valid)]
print("\n".join("{}: {}".format(label, subset[-1]) for label, subset in named_subsets))
print(colored("\nSample data lengths for train, test and validation:", 'green'))
subset_sizes = [len(subset) for _, subset in named_subsets]
print(*subset_sizes)

| data_description   | data                                                                |
|--------------------+---------------------------------------------------------------------|
| patient_id         | 310831                                                              |
| label              | 0                                                                   |
| visit_time         | [list([0])]                                                         |
| visit_codes        | [list([38, 2108, 171, 263, 16, 47, 260, 42, 141, 52, 244, 28, 30])] |
[310831, 0, [[[0], [38, 2108, 171, 263, 16, 47, 260, 42, 141, 52, 244, 28, 30]]]]
[32m
Sample data after split:[0m
train: [12966320, 0, [[[0], [38, 2196, 124, 222, 16, 105, 91, 88, 181, 52, 311, 108, 30]]]]
test: [12884531, 0, [[[0], [38, 2034, 196, 377, 16, 105, 23, 42, 181, 52, 215, 108, 30]]]]
validation: [13486254, 0, [[[0], [38, 2010, 1997, 130, 33, 180, 16, 47, 103, 42, 181, 52, 183, 108, 30]]]]
[32m
Sample data lengths for train

In [75]:
# Build one loader per subset (train, validation, test).
# If your subsets live in different files, load each file separately with
# EHRdataFromPickles() and then wrap each result in EHRdataloader().
# Per the package notes, the loader by default sorts the data by visit length
# and then splits it into batches (default batch_size, or one of your choice).
trainloader, validloader, testloader = (
    EHRdataloader(subset) for subset in (train, valid, test)
)

### Step2. Model loading 

In [76]:
# Each architecture accepts a slightly different set of hyperparameters.
# Bi-directionality is forced off (bii=False) for the DRNN and QRNN choices
# below, regardless of args.bii.

# Arguments shared by every model.
embedding_kwargs = dict(
    input_size=args.input_size,
    embed_dim=args.embed_dim,
    preTrainEmb=args.preTrainEmb,
)
# Additional arguments shared by the recurrent models.
recurrent_kwargs = dict(
    embedding_kwargs,
    hidden_size=args.hidden_size,
    n_layers=args.n_layers,
    dropout_r=args.dropout_r,
    time=args.time,
)

if args.which_model == 'RNN':
    ehr_model = model.EHR_RNN(cell_type=args.cell_type,
                              bii=args.bii,
                              **recurrent_kwargs)
elif args.which_model == 'DRNN':
    ehr_model = model.EHR_DRNN(cell_type=args.cell_type,
                               bii=False,
                               **recurrent_kwargs)
elif args.which_model == 'QRNN':
    # NOTE(review): this branch instantiates EHR_DRNN with cell_type='QRNN';
    # confirm against the models module whether a dedicated QRNN class was intended.
    ehr_model = model.EHR_DRNN(cell_type='QRNN',
                               bii=False,
                               **recurrent_kwargs)
else:
    # Any other choice falls back to the embedding-based LR model.
    ehr_model = model.EHR_LR_emb(**embedding_kwargs)

# Move the model to the GPU when one is available.
if use_cuda:
    ehr_model = ehr_model.cuda()

# Optimizer selection; the name is matched case-insensitively.
optimizer_class_names = {
    'adam': 'Adam',
    'adadelta': 'Adadelta',
    'adagrad': 'Adagrad',
    'adamax': 'Adamax',
    'asgd': 'ASGD',
    'rmsprop': 'RMSprop',
    'rprop': 'Rprop',
    'sgd': 'SGD',
}
optimizer_key = args.optimizer.lower()
if optimizer_key not in optimizer_class_names:
    raise NotImplementedError
optimizer_kwargs = {'lr': args.lr}
# Rprop is the only optimizer here that is built without weight_decay.
if optimizer_key != 'rprop':
    optimizer_kwargs['weight_decay'] = args.L2
optimizer_class = getattr(optim, optimizer_class_names[optimizer_key])
optimizer = optimizer_class(ehr_model.parameters(), **optimizer_kwargs)

### Step3. Train, validation and test

In [77]:
# Notes from the package:
#   - by default the data is sorted on visit length
#   - by default batches are shuffled (shuffle = True)
#   - the best model is saved in the directory given by args.output_dir
# Training can be stopped early with a keyboard interrupt.
run_kwargs = dict(
    train=trainloader,
    valid=validloader,
    test=testloader,
    model=ehr_model,
    optimizer=optimizer,
    shuffle=True,
    batch_size=args.batch_size,
    which_model=args.which_model,
    patience=args.patience,
    output_dir=args.output_dir,
)
try:
    ut.epochs_run(args.epochs, **run_kwargs)
except KeyboardInterrupt:
    # A keyboard interrupt ends training with a short notice instead of a traceback.
    separator = '-' * 89
    print(colored(separator, 'green'))
    print(colored('Exiting from training early', 'green'))

[32m
Current running on Epoch (0), Average_loss (0.42608730097611747)[0m
[32mTrain_auc (0.6913143931598248), Valid_auc (0.6145069648093842)[0m
[32mTrain_time (1m 7s), Valid_time (0m 3s)[0m
[32m
Current running on Epoch (1), Average_loss (0.3597108868757884)[0m
[32mTrain_auc (0.7589733388285548), Valid_auc (0.6701910740469208)[0m
[32mTrain_time (0m 58s), Valid_time (0m 3s)[0m
[32m
Current running on Epoch (2), Average_loss (0.33940000454584757)[0m
[32mTrain_auc (0.8010523896013336), Valid_auc (0.7169240285923754)[0m
[32mTrain_time (1m 0s), Valid_time (0m 6s)[0m
[32m
Current running on Epoch (3), Average_loss (0.32462459921836845)[0m
[32mTrain_auc (0.8278247528693398), Valid_auc (0.7545065065982405)[0m
[32mTrain_time (1m 3s), Valid_time (0m 3s)[0m
[32m-----------------------------------------------------------------------------------------[0m
[32mExiting from training early[0m


In [94]:
# To reuse a previously trained model, load the saved model object and then
# its saved state dict from args.output_dir.
# - os.path.join works whether or not output_dir ends with a path separator.
# - map_location=device lets a checkpoint saved on a GPU be loaded on a
#   CPU-only machine.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
best_model = torch.load(os.path.join(args.output_dir, 'EHRmodel.pth'),
                        map_location=device)
best_model.load_state_dict(torch.load(os.path.join(args.output_dir, 'EHRmodel.st'),
                                      map_location=device))
# Switch to evaluation mode.
best_model.eval()

EHR_RNN(
  (embed): Embedding(15817, 256, padding_idx=0)
  (rnn_c): GRU(256, 256, dropout=0.1, bidirectional=1)
  (out): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

### StepExtra: Singly use our dataloader for data preparation purposes 

In [99]:
from EHRDataloader import EHRdataFromPickles, EHRdataloader, iter_batch2
# Use the dataloader on its own: load the pickled data, then wrap it in a loader.
pickle_options2 = dict(
    root_dir=args.root_dir,
    file=args.file,
    sort=False,
    test_ratio=args.test_ratio,
    valid_ratio=args.valid_ratio,
)
data2 = EHRdataFromPickles(**pickle_options2)
loader2 = EHRdataloader(data2)

In [None]:
# If you want to shuffle the batches before using them, add this line
# (in utils this is what setting shuffle = True is for).
# Both arguments are passed positionally: a positional argument may not follow
# a keyword argument in a Python call.
loader2 = iter_batch2(loader2, len(loader2))

# Otherwise, iterate over the loader directly.
for i, batch in enumerate(loader2):
    # Feed the batch to your own processing here.
    pass