In [1]:
##################
# Document Maker #
################################################
# Desc   : A Nice RNN for Generating Documents #
# Author : Abe Hoffman                         #
# Date   : Aug 21 2017                         #
################################################

In [2]:
from __future__ import print_function
from collections import namedtuple
import tensorflow as tf
import numpy as np

In [3]:
################
# Architecture #
################
# A. Load Document (from the Corpus)
# B. Pre-process Document
# C. Encode the Document
# D. Define Batch Generator
# E. Define Model Inputs
# F. Define LSTM Cell
# G. Define Output/

In [4]:
# A. Load the Document

In [5]:
# A1. Open the Document and Read it
with open('anna.txt', 'r') as f: 
    text = f.read()

In [6]:
# B. Pre-process Document

In [7]:
# B1. Set the Vocabulary : Unique set of characters that are found in the document
vocab = set(text)
# B2. Assign each unique character an integer (starting from 0) : {'<character>': 0, ... }
vocab_to_int = {c: i for i, c in enumerate(vocab)}
# B3. Flip assignment so integer indicates character : {<integer>: '<character>', ...}
int_to_vocab = dict(enumerate(vocab))

In [8]:
# C. Encode the Document

In [9]:
# C1. For every occurence of a character in the document, assign the integer
# (Use syntactic sugar and convert to a Numpy array)
encoded = np.array([vocab_to_int[c] for c in text], dtype=np.int32)
# C2. This results in our entire document encoded as character : integers, based on the defined vocabulary
print('Encoded document shape: {}'.format(encoded.shape))
encoded[0:10]

Encoded document shape: (2025486,)


array([31, 66, 57, 74, 78, 61, 76,  3, 16,  1], dtype=int32)

In [10]:
# D. Define Batch Generator

In [11]:
# D1. A Generator Function that returns an Iterator for Batches
def get_batches(arr, n_seqs, n_steps):
    '''
    Desc : Generate batches of size n_seqs * n_steps
    Variables :
        - arr     : Input array
        - n_seqs  : Sequences per batch (batch size)
        - n_steps : Sequence steps per batch
    '''
    # Get the batch size and number of batches we can make
    batch_size = n_seqs * n_steps 
    n_batches  = len(arr) // batch_size
    
    # Keep only enough characters to make full batches
    arr =  arr[:n_batches * batch_size]
    
    # Reshape into n_seqs rows
    arr = arr.reshape((n_seqs,-1))
    
    for n in range(0, arr.shape[1], n_steps):
        # The features
        x = arr[:,n:n+n_steps]
        # The targets, shifted by one
        y = np.zeros(x.shape)
        y[:,:-1],y[:,-1] = x[:,1:] ,x[:,0]
        yield x, y

In [12]:
# D2. Validate the Function
batches = get_batches(encoded, 10, 50)
x, y = next(batches)

In [13]:
# E. Define Model Inputs

In [14]:
# E1. Shaping input placeholders and preparing for optimization
def build_inputs(batch_size, num_steps):
    '''
    Desc : Tensorflow Placeholders
    Variables :
        - batch_size : Number of sequences per batch
        - num_steps  : Sequence steps per batch
    '''
    # Graph placeholders
    inputs = tf.placeholder(tf.int32,[batch_size,num_steps],name="inputs")
    targets = tf.placeholder(tf.int32,[batch_size,num_steps],name="targets")
    
    # Retain probability placeholder for drop out layers
    keep_prob = tf.placeholder(tf.float32,name="keep_prob") # Scalar
    
    return inputs, targets, keep_prob

In [15]:
# F. Define LSTM Cell

In [16]:
# F1. Use a basic LSTM Cell
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    '''
    Desc : LSTM Cell for Hidden Layers
    Variables :
        - keep_prob  : Dropout optimization (scalar placeholder)
        - lstm_size  : Size of the hidden layers in the LSTM cells
        - num_layers : Number of LSTM layers
        - batch_size : Batch size
    '''

    # LSTM cell and dropout to the cell outputs
    # Stack LSTM layers, for vector ops (syntactic sugar)
    cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper\
                                       (tf.contrib.rnn.BasicLSTMCell(lstm_size)) \
                                        for _ in range(num_layers)])
    # Fill the initial state with Zeros
    initial_state = cell.zero_state(batch_size,tf.float32)
    
    return cell, initial_state