Here we quickly demonstrate how to use the different modules in our package.
We will go through how to use our packages for 2 major sections:
1. Process data through our dataloader 
2. Specify parameters and train
***
* The parts where you can take the most control (modify to suit your needs) will have explanations highlighted in **bold**. 
* Hyperparameters should be self-explanatory with details in options() function. 
* You can also find comments at the beginning of each cell describing its general functionality.

### Step0. Load libraries and our modules

In [1]:
#!pip3.7 install cupy pynvrtc git+https://github.com/salesforce/pytorch-qrnn --user

In [13]:
"""
Created on Wed Nov 28 12:57:40 2018
@author: ginnyzhu
Last reviewed and updated Lrasmy Feb 21 2020
"""
from __future__ import print_function, division
from io import open
import string
import re
import random

import os
import argparse
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

# Prefer cPickle where it exists (Python 2); fall back to the standard pickle module.
try:
    import cPickle as pickle
except ImportError:
    # Only a missing module should trigger the fallback; a bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit raised while importing.
    import pickle
    
#import self-defined modules
#models, utils, and Dataloader
#sys.path.insert() only for jupyter notebook imports
import sys
sys.path.insert(0, '../ehr_pytorch')
import models as model 
from EHRDataloader import EHRdataFromPickles,EHRdataFromLoadedPickles, EHRdataloader 
import utils as ut #:)))) 
from EHREmb import EHREmbeddings

#silly ones
from termcolor import colored
from tqdm import tqdm
# check GPU availability
use_cuda = torch.cuda.is_available()
#device = torch.device("cuda:0" if use_cuda else "cpu")

In [14]:
#args, slightly modified from main.py file to be more compatible with jupyter notebook 
#all args provide default values, so you can run the whole notebook without changing/providing any args
#args ordered by dataloader, model, and training sections
def options(argv=None):
    """Build the namespace of dataloader, model and training hyperparameters.

    Every argument has a default, so the notebook runs end to end without
    supplying anything.

    Args:
        argv: optional list of command-line style tokens to parse, e.g.
            ``['-n_layers', '2', '-bii', 'True']``. When None (the default)
            an empty list is parsed, so only the defaults are returned and
            the kernel's own ``sys.argv`` is never consulted.

    Returns:
        argparse.Namespace holding one attribute per argument defined below.
    """
    def _str2bool(value):
        # argparse's type=bool is a trap: bool('False') is True because any
        # non-empty string is truthy. Parse the text explicitly instead.
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got {!r}'.format(value))

    parser = argparse.ArgumentParser(description='Predictive Analytics on EHR with Pytorch')
    
    #EHRdataloader 
    parser.add_argument('-root_dir', type = str, default = '../data/' , 
                        help='the path to the folders with pickled file(s)')
    parser.add_argument('-file', type = str, default = 'toy.train' , 
                        help='the name of pickled files')
    parser.add_argument('-test_ratio', type = float, default = 0.2, 
                        help='test data size [default: 0.2]')
    parser.add_argument('-valid_ratio', type = float, default = 0.1, 
                        help='validation data size [default: 0.1]')
    
    #EHRmodel
    parser.add_argument('-which_model', type = str, default = 'DRNN', 
                        help='choose from {"RNN","DRNN","QRNN","TLSTM","RETAIN","LR"}') 
    parser.add_argument('-cell_type', type = str, default = 'GRU', 
                        help='For RNN based models, choose from {"RNN", "GRU", "LSTM"}')
    # type=list would split a string into single characters; take one or more
    # integers instead so the value is always a list of ints.
    parser.add_argument('-input_size', type = int, nargs = '+', default =[15817], 
                        help='''input dimension(s), decide which embedding types to use. 
                        If len of 1, then  1 embedding; 
                        len of 3, embedding medical, diagnosis and others separately (3 embeddings) 
                        [default:[15817]]''') ###multiple embeddings not effective in this release
    parser.add_argument('-embed_dim', type=int, default=128, 
                        help='number of embedding dimension [default: 128]')
    parser.add_argument('-hidden_size', type=int, default=128, 
                        help='size of hidden layers [default: 128]')
    parser.add_argument('-dropout_r', type=float, default=0.1, 
                        help='the probability for dropout[default: 0.1]')
    parser.add_argument('-n_layers', type=int, default=3, 
                        help='''number of Layers, 
                        for Dilated RNNs, dilations will increase exponentialy with mumber of layers [default: 3]''')
    parser.add_argument('-bii', type=_str2bool, default=False, 
                        help='indicator of whether Bi-directin is activated. [default: False]')
    parser.add_argument('-time', type=_str2bool, default=False, 
                        help='indicator of whether time is incorporated into embedding. [default: False]')
    parser.add_argument('-preTrainEmb', type= str, default='', 
                        help="path to pretrained embeddings file. [default: '']")
    parser.add_argument("-output_dir",type=str, default= '../models/', 
                        help="The output directory where the best model will be saved and logs written [default: we will create'../models/'] ")
    
    # training 
    parser.add_argument('-lr', type=float, default=10**-4, 
                        help='learning rate [default: 0.0001]')
    parser.add_argument('-L2', type=float, default=10**-4, 
                        help='L2 regularization [default: 0.0001]')
    parser.add_argument('-epochs', type=int, default= 100, 
                        help='number of epochs for training [default: 100]')
    parser.add_argument('-patience', type=int, default= 20, 
                        help='number of stagnant epochs to wait before terminating training [default: 20]')
    parser.add_argument('-batch_size', type=int, default=128, 
                        help='batch size for training, validation or test [default: 128]')
    parser.add_argument('-optimizer', type=str, default='adam', 
                        choices=  ['adam','adadelta','adagrad', 'adamax', 'asgd','rmsprop', 'rprop', 'sgd'], 
                        help='Select which optimizer to train [default: adam]. Upper/lower case does not matter') 
    #parser.add_argument('-cuda', type= bool, default=True, help='whether GPU is available [default:True]')
    # Parse an explicit list (empty by default) rather than sys.argv, which in a
    # notebook holds the kernel's own launch arguments.
    args = parser.parse_args([] if argv is None else list(argv))
    return args 

### StepX: You can modify parameters here to suit your own need

* All parameters have explanations in the cell above

In [15]:
# Start from the defaults returned by options(), then override selected fields.
args = options()
## Example overrides: edit, extend or empty this table if you do not want the defaults
example_overrides = {
    'which_model': 'RNN',
    'cell_type': 'GRU',
    'embed_dim': 128,
    'hidden_size': 128,
    'dropout_r': 0.2,
    'n_layers': 2,
    'input_size': [30000],
    'patience': 3,
}
for option_name, option_value in example_overrides.items():
    setattr(args, option_name, option_value)
## End of example overrides
print(args)

Namespace(L2=0.0001, batch_size=128, bii=False, cell_type='GRU', dropout_r=0.2, embed_dim=128, epochs=100, file='toy.train', hidden_size=128, input_size=[30000], lr=0.0001, n_layers=2, optimizer='adam', output_dir='../models/', patience=3, preTrainEmb='', root_dir='../data/', test_ratio=0.2, time=False, valid_ratio=0.1, which_model='RNN')


### Step1. Data preparation

In [16]:
#### Step1. Data preparation
# Load the pickled data first. sort=False is passed so that the records are not
# sorted (on visit length) before any splitting takes place.
# The split ratios come from args; set them there if you want a non-default split.
print(colored("\nLoading and preparing data...", 'green'))
split_settings = dict(sort=False,
                      test_ratio=args.test_ratio,
                      valid_ratio=args.valid_ratio)
data = EHRdataFromPickles(root_dir=args.root_dir,
                          file=args.file,
                          **split_settings)

[32m
Loading and preparing data...[0m


In [17]:
# Look at one example record from the pickled data; 24 is the record index.
# seeDescription=True is requested so that a formatted table describing each
# value and how it is organized in the file is shown as well.
sample_index = 24
print(data.__getitem__(sample_index, seeDescription=True))

# Split the dataset into its train / test / validation parts
train, test, valid = data.__splitdata__()
# Everything below is informational only; comment it out if you do not need it
print(colored("\nSample data after split:", 'green'))
# Show the last record of the train, test and validation parts
sample_lines = [
    "{}: {}".format(split_label, split_part[-1])
    for split_label, split_part in (("train", train),
                                    ("test", test),
                                    ("validation", valid))
]
print("\n".join(sample_lines))
print(colored("\nSample data lengths for train, test and validation:", 'green'))
split_sizes = [len(split_part) for split_part in (train, test, valid)]
print(*split_sizes)

| data_description   | data                                |
|--------------------+-------------------------------------|
| patient_id         | 24                                  |
| label              | 1                                   |
| visit_time         | [list([9358])]                      |
| visit_codes        | [list([16422, 17589, 8301, 17447])] |
[24, 1, [[[9358], [16422, 17589, 8301, 17447]]]]
[32m
Sample data after split:[0m
train: [215, 0, [[[2094], [2709, 12635, 17202, 12404, 7548, 16477, 18066, 8872, 11111, 18996, 9296, 14756, 7263, 8968]]]]
test: [5733, 1, [[[10213], [7940, 6237, 12609, 8603, 758, 2066, 3914, 10758, 16268, 17491, 10553, 16013, 17513, 14205, 13604, 17346, 8807, 1558, 9898, 9635, 2356, 11765, 10720, 12938, 7196, 17190, 17493, 15507, 10740, 17982, 17527, 9718, 13312, 2447, 16315, 5174, 18163, 10339, 8880, 2782, 4949, 10292, 19569, 7362, 17950, 18022, 12886]]]]
validation: [6237, 1, [[[5678], [1645, 15544, 6377, 11201, 3330, 19591, 14144, 8792, 188

### Step2. Model loading 

In [18]:
# Build the model named by args.which_model. Not every model takes the same
# parameters: DRNN, QRNN and TLSTM are always built with bii=False here
# (whatever args.bii says), QRNN and TLSTM use their own fixed cell_type, and
# any unrecognized name falls through to the embedding-based LR model.

# Keyword arguments shared by all the recurrent models and RETAIN
core_kwargs = dict(input_size=args.input_size,
                   embed_dim=args.embed_dim,
                   hidden_size=args.hidden_size,
                   n_layers=args.n_layers)
# Extra keyword arguments shared by the RNN / DRNN / QRNN / TLSTM constructors
recurrent_kwargs = dict(core_kwargs,
                        dropout_r=args.dropout_r,
                        time=args.time,
                        preTrainEmb=args.preTrainEmb)

chosen_model = args.which_model
if chosen_model == 'RNN':
    ehr_model = model.EHR_RNN(cell_type=args.cell_type,
                              bii=args.bii,
                              **recurrent_kwargs)
elif chosen_model == 'DRNN':
    ehr_model = model.EHR_DRNN(cell_type=args.cell_type,
                               bii=False,
                               **recurrent_kwargs)
elif chosen_model == 'QRNN':
    ehr_model = model.EHR_QRNN(cell_type='QRNN',  # normal cell types are not supported
                               bii=False,         # QRNN doesn't support bi
                               **recurrent_kwargs)
elif chosen_model == 'TLSTM':
    ehr_model = model.EHR_TLSTM(cell_type='TLSTM',  # normal cell types are not supported
                                bii=False,
                                **recurrent_kwargs)
elif chosen_model == 'RETAIN':
    ehr_model = model.RETAIN(**core_kwargs)
else:
    ehr_model = model.EHR_LR_emb(input_size=args.input_size,
                                 embed_dim=args.embed_dim,
                                 preTrainEmb=args.preTrainEmb)

# pack_pad is True for the plain RNN model only; it is handed to the dataloader later
pack_pad = chosen_model == 'RNN'


# Move the model to the GPU when CUDA is available
if use_cuda:
    ehr_model = ehr_model.cuda()

# Optimizers to choose from, keyed by lower-case name; args.optimizer is
# lower-cased before the lookup, so upper/lower case does not matter.
optimizer_classes = {
    'adam': optim.Adam,
    'adadelta': optim.Adadelta,
    'adagrad': optim.Adagrad,
    'adamax': optim.Adamax,
    'asgd': optim.ASGD,
    'rmsprop': optim.RMSprop,
    'rprop': optim.Rprop,
    'sgd': optim.SGD,
}
optimizer_name = args.optimizer.lower()
if optimizer_name not in optimizer_classes:
    raise NotImplementedError

optimizer_kwargs = {'lr': args.lr}
# Rprop is the one optimizer here that is built without the L2 weight decay
if optimizer_name != 'rprop':
    optimizer_kwargs['weight_decay'] = args.L2
optimizer = optimizer_classes[optimizer_name](ehr_model.parameters(),
                                              **optimizer_kwargs)

In [19]:
##### One loader each for train, validation and test
# If your splits live in different files, load each file separately with
# EHRdataFromPickles() and then wrap each result in EHRdataloader().
# Per the original notes: by default the dataloader sorts the data on visit length
# and then cuts it into batches of the default / chosen batch_size.
# The minibatch lists are created once here, before the epochs run; shuffling
# then happens within the epochs.
loader_settings = dict(batch_size=args.batch_size, packPadMode=pack_pad)
train_mbs = list(tqdm(EHRdataloader(train, **loader_settings)))
print (' creating the list of valid minibatches')
valid_mbs = list(tqdm(EHRdataloader(valid, **loader_settings)))
print (' creating the list of test minibatches')
test_mbs = list(tqdm(EHRdataloader(test, **loader_settings)))

100%|██████████| 55/55 [00:07<00:00,  7.07it/s]
  0%|          | 0/8 [00:00<?, ?it/s]

 creating the list of valid minibatches


100%|██████████| 8/8 [00:01<00:00,  7.87it/s]
  0%|          | 0/16 [00:00<?, ?it/s]

 creating the list of test minibatches


100%|██████████| 16/16 [00:02<00:00,  6.73it/s]


### Step3. Train, validation and test

In [20]:
# Notes (from the original authors): data is sorted on visit length by default,
# and batches are shuffled (shuffle = True).
# The best model is saved in the directory given by args.output_dir.
# Training can be stopped from the keyboard; the interrupt is caught below.
try:
    run_settings = dict(train=train_mbs,
                        valid=valid_mbs,
                        test=test_mbs,
                        model=ehr_model,
                        optimizer=optimizer,
                        shuffle=True,
                        which_model=args.which_model,
                        patience=args.patience,
                        output_dir=args.output_dir)
    ut.epochs_run(args.epochs, **run_settings)
except KeyboardInterrupt:
    # A keyboard interrupt ends training early with a short message instead of a traceback
    separator_line = '-' * 89
    print(colored(separator_line, 'green'))
    print(colored('Exiting from training early','green'))   

[32m
 Epoch (0): Train_auc (0.5578475101672433), Valid_auc (0.5103293868854034) ,Training Average_loss (0.6936373829841613), Train_time (0m 0s), Eval_time (0m 0s)[0m
[33m
 Test_AUC (0.482506255630067) , Test_eval_time (0m 0s) [0m
[32m
 Epoch (1): Train_auc (0.5946469756966579), Valid_auc (0.5133092487864273) ,Training Average_loss (0.6881005471402948), Train_time (0m 0s), Eval_time (0m 0s)[0m
[33m
 Test_AUC (0.484342908617756) , Test_eval_time (0m 0s) [0m
[32m
 Epoch (2): Train_auc (0.6275864439150787), Valid_auc (0.5170380813534341) ,Training Average_loss (0.6836279966614464), Train_time (0m 0s), Eval_time (0m 0s)[0m
[33m
 Test_AUC (0.4873571214092683) , Test_eval_time (0m 0s) [0m
[32m
 Epoch (3): Train_auc (0.6587956501447891), Valid_auc (0.5232100802640222) ,Training Average_loss (0.6783660476857964), Train_time (0m 0s), Eval_time (0m 0s)[0m
[33m
 Test_AUC (0.48973676308677805) , Test_eval_time (0m 0s) [0m
[32m
 Epoch (4): Train_auc (0.6870815213766601), Valid_auc (

In [22]:
# To reuse a previously trained model: load the saved model object, then load
# the saved state dict into it, and switch it to evaluation mode.
saved_model_path = args.output_dir + 'dhf.trainEHRmodel.pth'
saved_state_path = args.output_dir + 'dhf.trainEHRmodel.st'
best_model = torch.load(saved_model_path)
saved_state = torch.load(saved_state_path)
best_model.load_state_dict(saved_state)
best_model.eval()

EHR_RNN(
  (embed): Embedding(30000, 128, padding_idx=0)
  (rnn_c): GRU(128, 128, num_layers=2, batch_first=True, dropout=0.2)
  (out): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

### StepExtra: Use our dataloader on its own for data preparation purposes 

In [23]:
from EHRDataloader import EHRdataFromPickles, EHRdataloader, iter_batch2
# Build a second dataset from the same pickled file and wrap it in a
# dataloader that uses the loader's default settings.
dataset_settings = dict(root_dir=args.root_dir,
                        file=args.file,
                        sort=False,
                        test_ratio=args.test_ratio,
                        valid_ratio=args.valid_ratio)
data2 = EHRdataFromPickles(**dataset_settings)
loader2 = EHRdataloader(data2)

In [29]:
# If you want to shuffle the batches before using them, keep this line
# (per the original note, utils does the same when shuffle = True is set).
loader2 = iter_batch2(loader2, len(loader2))

# Otherwise, iterate over the loader directly.
for i, batch in enumerate(loader2):
    # Feed `batch` to your own code here.
    # A loop body made only of comments is a SyntaxError, so this demo prints
    # a marker and stops after the first batch.
    print('EOF')
    break

EOF
