In [1]:
##################
# Document Maker #
################################################
# Desc   : A Nice RNN for Generating Documents #
# Author : Abe Hoffman                         #
# Date   : Aug 21 2017                         #
################################################

In [2]:
from __future__ import print_function
from collections import namedtuple
import string, textract
import tensorflow as tf
import numpy as np
import glob, time

In [3]:
###################
# I. Architecture #
###################
# A. Load Document (from the Corpus)
# B. Pre-process Document
# C. Encode the Document
# D. Define Batch Generator
# E. Define Model Inputs
# F. Define LSTM Cell
# G. Define Model Output
# H. Define Model Loss
# I. Define Model Optimizer
# J. The LSTM Network

######################
# II. Implementation #
######################
# A. Define Hyperparameters
# B. Model Training and Checkpoints
# C. Sampling

In [4]:
# I. Architecture

In [5]:
# A. Load the Document

In [6]:
# A1. Sanitized Patient Document
printable = set(string.printable)
adoc = textract.process('12345678.rtf')
text = filter(lambda x: x in printable, adoc)

In [7]:
text[0:1000]

' \nPatient: MRN:  FIN:  \nAge: 86 years Sex: M DOB: \nAssociated Diagnoses: None \nAuthor: \n \nBasic Information \nTime seen: Date & time 11/15/2014 02:27:00. \nHistory source: Patient. \nArrival mode: Private vehicle. \nHistory limitation: None. \nAdditional information: Chief Complaint from Nursing Triage Note : Chief Complaint ED \n11/15/2014 2:16 PST Chief Complaint ED SHORTNESS OF BREATH , PCP is Jennifer Cook. \n \nHistory of Present Illness \nThe patient presents with shortness of breath. Patient is an 86 year old male with history of CHF, presenting to the ED with shortness of breath intermittent for one week. Patient states his shortness of breath has not become worse but he was afraid to fall back to sleep. Not on home O2. His shortness of breath is worse with walking. Denies recent fever, cough, chest pain, abdominal pain, nausea, vomiting, diarrhea. Dr. Wong at UC Davis is his cardiac surgeon, and he is supposed to have aortic valve surgery. Just mvoed here from Crescent 

In [8]:
# B. Pre-process Document

In [9]:
# B1. Set the Vocabulary : Unique set of characters that are found in the document
vocab = set(text)
# B2. Assign each unique character an integer (starting from 0) : {'<character>': 0, ... }
vocab_to_int = {c: i for i, c in enumerate(vocab)}
# B3. Flip assignment so integer indicates character : {<integer>: '<character>', ...}
int_to_vocab = dict(enumerate(vocab))

In [10]:
# C. Encode the Document

In [11]:
# C1. For every occurence of a character in the document, assign the integer
# (Use syntactic sugar and convert to a Numpy array)
encoded = np.array([vocab_to_int[c] for c in text], dtype=np.int32)
# C2. This results in our entire document encoded as character : integers, based on the defined vocabulary
print('Encoded document shape: {}'.format(encoded.shape))
encoded[0:10]

Encoded document shape: (7786,)


array([ 2,  1, 41, 51, 71, 58, 54, 65, 71, 23], dtype=int32)

In [12]:
# D. Define Batch Generator

In [13]:
# D1. A Generator Function that returns an Iterator for Batches
def get_batches(arr, n_seqs, n_steps):
    '''
    Desc : Generate batches of size n_seqs * n_steps
    Variables :
        - arr     : Input array
        - n_seqs  : Sequences per batch (batch size)
        - n_steps : Sequence steps per batch
    '''
    # Get the batch size and number of batches we can make
    batch_size = n_seqs * n_steps 
    n_batches  = len(arr) // batch_size
    
    # Keep only enough characters to make full batches
    arr =  arr[:n_batches * batch_size]
    
    # Reshape into n_seqs rows
    arr = arr.reshape((n_seqs,-1))
    
    for n in range(0, arr.shape[1], n_steps):
        # The features
        x = arr[:,n:n+n_steps]
        # The targets, shifted by one
        y = np.zeros(x.shape)
        y[:,:-1],y[:,-1] = x[:,1:] ,x[:,0]
        yield x, y

In [14]:
# D2. Validate the Function
batches = get_batches(encoded, 10, 50)
x, y = next(batches)

In [15]:
# E. Define Model Inputs

In [16]:
# E1. Shaping input placeholders and preparing for optimization
def build_inputs(batch_size, num_steps):
    '''
    Desc : Tensorflow Placeholders
    Variables :
        - batch_size : Number of sequences per batch
        - num_steps  : Sequence steps per batch
    '''
    # Graph placeholders
    inputs = tf.placeholder(tf.int32,[batch_size,num_steps],name="inputs")
    targets = tf.placeholder(tf.int32,[batch_size,num_steps],name="targets")
    
    # Retain probability placeholder for drop out layers
    keep_prob = tf.placeholder(tf.float32,name="keep_prob") # Scalar
    
    return inputs, targets, keep_prob

In [17]:
# F. Define LSTM Cell

In [18]:
# F1. Basic LSTM Cells
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    '''
    Desc : LSTM Cell for Hidden Layers
    Variables :
        - keep_prob  : Dropout optimization (scalar placeholder)
        - lstm_size  : Size of the hidden layers in the LSTM cells
        - num_layers : Number of LSTM layers
        - batch_size : Batch size
    '''

    # LSTM cell and dropout to the cell outputs
    # Stack LSTM layers, for vector ops (syntactic sugar)
    cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper\
                                       (tf.contrib.rnn.BasicLSTMCell(lstm_size)) \
                                        for _ in range(num_layers)])
    # Fill the initial state with Zeros
    initial_state = cell.zero_state(batch_size,tf.float32)
    
    return cell, initial_state

In [19]:
# G. Define Model Output

In [20]:
# G1. RNN Softmax Output Layer
def build_output(lstm_output, in_size, out_size):
    '''
    Desc : Softmax layer that returns softmax output and logits (logarithm of the odds p/(1 − p))
    Variables: 
        - lstm_output : Output tensor list from LSTM layer
        - in_size     : Size of the input tensor, for example, size of the LSTM cells
        - out_size    : Size of this softmax layer
    '''
    # Reshape output: one row for each step for each sequence.
    # Concatenate lstm_output over axis 1 (the columns)
    seq_output = tf.concat(lstm_output,axis=1)
    # Reshape seq_output to 2D tensor with lstm_size columns
    x = tf.reshape(seq_output,[-1,in_size])
    
    # Connect RNN outputs to Softmax Layer
    with tf.variable_scope('softmax'):
        # Weight and Bias
        softmax_w = tf.Variable(tf.truncated_normal((in_size, out_size),stddev=0.1))
        softmax_b = tf.Variable(tf.zeros([out_size]))
    
    # Outputs are RNN cell output rows, therefore logits are output rows (one for each step and sequence)
    logits =  tf.add(tf.matmul(x,softmax_w),softmax_b) 
    
    # Softmax for predicted character probabilities
    out = tf.nn.softmax(logits,name ="out")
    print(out)
    return out, logits

In [21]:
# H. Define Model Loss

In [22]:
# H1. Discover mean to calculate loss
def build_loss(logits, targets, lstm_size, num_classes):
    '''
    Desc : Calculate loss from logits and targets
    Variables : 
        - logits      : Logits from final fully connected layer
        - targets     : Targets for supervised learning
        - lstm_size   : Number of LSTM hidden units
        - num_classes : Number of classes in targets
    '''
    # One-hot encode targets and reshape to match logits, one row per sequence per step
    y_one_hot = tf.one_hot(targets,num_classes)
    y_reshaped =  tf.reshape(y_one_hot,logits.get_shape())
    
    # Softmax cross entropy loss
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=y_reshaped))
    
    return loss

In [23]:
# I. Define Model Optimizer

In [24]:
# I1. Clip Gradients to Ensure non-exponential growth
def build_optimizer(loss, learning_rate, grad_clip):
    '''
    Desc : Build optmizer for training, using gradient clipping
    Variables : 
        - loss: Network loss
        - learning_rate: Learning rate for optimizer
    '''
    
    # Optimizer for training (clipping the exploding gradients)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    
    return optimizer

In [25]:
# J. The LSTM Network

In [26]:
# J1. A class to define our model
class CharRNN:
    
    def __init__(self,
                 num_classes,
                 batch_size=64,
                 num_steps=50, 
                 lstm_size=128,
                 num_layers=2,
                 learning_rate=0.001, 
                 grad_clip=5,
                 sampling=False):
    
        # Sampling: Pass one character at a time
        if sampling == True:
            batch_size, num_steps = 1, 1
        else:
            batch_size, num_steps = batch_size, num_steps

        tf.reset_default_graph()
        
        # Build the input placeholder tensors
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size,num_steps)
        # Build the LSTM cell
        cell, self.initial_state = build_lstm(lstm_size,num_layers,batch_size,self.keep_prob)
        # (Run the data through the RNN layers)
        # One-hot encode the input tokens
        x_one_hot = tf.one_hot(self.inputs,num_classes)
        
        # Run each sequence step through the RNN 
        outputs, state = tf.nn.dynamic_rnn(cell,x_one_hot,initial_state=self.initial_state)
        self.final_state = state
        
        # Softmax predictions and logits
        self.prediction, self.logits = build_output(outputs,lstm_size,num_classes)
        
        # Loss and optimizer (with gradient clipping)
        self.loss =  build_loss(self.logits,self.targets,lstm_size,num_classes)
        self.optimizer = build_optimizer(self.loss,learning_rate,grad_clip)

In [27]:
# II. Implementation

In [28]:
# A. Define Hyperparameters

In [29]:
# A1. Larger Networks + Dropout Values (0.0 - 1.0)
batch_size = 100        # Sequences per batch
num_steps  = 150        # Sequence steps per batch
lstm_size  = 550        # LSTMs' Hidden layer sizes
num_layers = 2          # LSTM layers
learning_rate = 0.001   # Learning rate
keep_prob  = 0.5        # Dropout 'keep_prob' probability

In [30]:
# B. Model Training and Checkpoints

In [31]:
# B1. Declare Epochs
epochs = 40

In [32]:
# B2. Savepoints every n iterations
save_every_n = 200

In [33]:
# B3. Try first with one GPU 
with tf.device("/gpu:0"):
    model = CharRNN(len(vocab),
                    batch_size=batch_size,
                    num_steps=num_steps,
                    lstm_size=lstm_size,
                    num_layers=num_layers, 
                    learning_rate=learning_rate)

saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Load a checkpoint and resume training: saver.restore(sess, 'checkpoints/______.ckpt')
    counter = 0
    for e in range(epochs):
        # Train network
        new_state = sess.run(model.initial_state)
        loss = 0
        for x, y in get_batches(encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.loss, 
                                                 model.final_state, 
                                                 model.optimizer], 
                                                 feed_dict=feed)
            
            end = time.time()
            print('Epoch: {}/{}... '.format(e+1, epochs),
                  'Training Step: {}... '.format(counter),
                  'Training loss: {:.4f}... '.format(batch_loss),
                  '{:.4f} sec/batch'.format((end-start)))
        
            if (counter % save_every_n == 0):
                saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))
    
    saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib

Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.
Instructions for updating:
contrib/learn/dataframe/** is deprecated.




Tensor("out:0", shape=(15000, 76), dtype=float32)
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'dict' object has no attribute 'name'


In [34]:
# C. Sampling

In [35]:
# C1. Next Char
def pick_top_n(preds, vocab_size, top_n=5):
    '''
    Desc : Feed in char, receive next char.
    Variables:
        - preds      : Char to predict
        - vocab_size : Length of the vocab
        - top_n      : Return the top selection 'n' of chars
    '''
    p = np.squeeze(preds)
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c

In [36]:
# C2. Sample based on Checkpoint
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    '''
    Desc : Resturn a sample from a checkpoint restore
    Variables:
        - checkpoint : ckpt file to restore
        - n_samples  : Number of samples to return
        - lstm_size  : LSTM size
        - vocab_size : Vocabulary size
        - prime      : Prime the results with a string
    '''
    samples = [c for c in prime]
    model = CharRNN(len(vocab), lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        for c in prime:
            x = np.zeros((1, 1))
            x[0,0] = vocab_to_int[c]
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state], 
                                         feed_dict=feed)

        c = pick_top_n(preds, len(vocab))
        samples.append(int_to_vocab[c])

        for i in range(n_samples):
            x[0,0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state], 
                                         feed_dict=feed)

            c = pick_top_n(preds, len(vocab))
            samples.append(int_to_vocab[c])
        
    return ''.join(samples)

In [37]:
checkpoint = tf.train.latest_checkpoint('checkpoints')
samp = sample(checkpoint, 2000, lstm_size, len(vocab), prime="Far")
print(samp)

Tensor("out:0", shape=(1, 76), dtype=float32)
INFO:tensorflow:Restoring parameters from checkpoints/i0_l550.ckpt
Far'6yTrC<<PTT-jyyhU)Mh1OUOy1yFllUUUlUjj4UUl1U1UU(Ul11%%11%jjvv))psvUpsb1RhbP1obb'aPTvvaCaaTcjc-bcBb1DbvoDorv0vobo%b%8a28vamBDcUcBvbpDobpUUooUjVbjaV(aaajUUh1UU1OUjbj1jcl1l1c1U1v1ctb1c1b1bbbv'vov0o8bbm88BDBDTvvXDvvo4444		a	8	aAbDhAhTPTA-:ryywVVyLaaUaU5<hO-hhjj:Vl)lUV1Vl1bbUU1XHHXXKjKl1lGX1lmGXLGmmmWmnmnmYUWYFYfFcfftc/BVV
Lc

D
CSSCW%Okx%bFy.7NZa.+obb(oVfajaVjTT1VBbUbUcb1BvvvccvpbDD1oo818TTvPaPBOaFc-cbb:bDbD(oor8D%88B-&Dbv&bXCXC<FFvDByrr.byTyTVyrCbPCkCy<OCy.j-Zg&rr		bD0((o(o+PDaVVVajc<cc-h1-cch&&
hb
S
D
SM
SWC%4%4%55CaCCk.-k.c<e2-ZV9
wRRRlllfIlfwwII1(1wwwLLxxQUh+)Uh00VmOFVVa
Xa
-c
-bX


b 6 MX%%%jj -2-aa9Vyywh<hhrrhVVyyyVOy

R

yRyn<<Cj<UUj(jVK))((KU9BBH77(GvvvGvmcmmbvTTT1vb9b	tt	a0aaccDcc11ccc1cbtLLbaoaaabbojjAT:+rUTrrr8VV<<jq<jUjUyPy1pp1U1U1bPPUjvjvv1pp)1OOSTt1HbFvFHTaTv+:bamccVbDcob1bVDoboaaaojcUjUVAAAVaac-f-+bch
XhD&DX&X&nFlFY%Nl%nlBlldUtwlIwLUlllUwUhw1wlUlL%UUjpl1ppVUn11F