In [4]:
import time
import string
import math
import itertools
import numpy as np
import tensorflow as tf
from collections import namedtuple

# Load and Encode
First we are going to open and load the text from anna.txt file.
Then we would like to convert it into integers for our network to use. 
Here I'm creating a couple dictionaries to convert the characters to and from integers. 
Encoding the characters as integers makes it easier to use as input in the network.

In [5]:
with open ('anna.txt','r') as file:
    text_data = file.read()
# remove the duplicates and sort the characters in a list
vocabulary_set = sorted(set(text_data))
char_to_int = {char:i for i,char in enumerate(vocabulary_set)}
int_to_char = dict(enumerate(vocabulary_set))
encoded = np.array([char_to_int[char] for char in text_data], dtype=np.int32)

In [32]:
print('size of our text data:',len(text_data))
print('the first 50 characters of the text data:')
print('------------------------- characters from the book -------------------------------------------')
text_data[:50]


size of our text data: 1985223
the first 50 characters of the text data:
------------------------- characters from the book -------------------------------------------


'Chapter 1\n\n\nHappy families are all alike; every un'

In [33]:
print('-------------------------- Same Encoded 50 Character -----------------------------------------')
encoded[:50]

-------------------------- Same Encoded 50 Character -----------------------------------------


array([31, 64, 57, 72, 76, 61, 74,  1, 16,  0,  0,  0, 36, 57, 72, 72, 81,
        1, 62, 57, 69, 65, 68, 65, 61, 75,  1, 57, 74, 61,  1, 57, 68, 68,
        1, 57, 68, 65, 67, 61, 26,  1, 61, 78, 61, 74, 81,  1, 77, 70], dtype=int32)

In [8]:
print('As you can see, character ',int_to_char[70], 'is decoded to ',char_to_int['n'])

As you can see, character  n is decoded to  70


In [31]:
print('Number of Classes in our data set:', len(vocabulary_set))
print('the first 50 characters of the sorted vocabulary set:')
print(set(itertools.islice(vocabulary_set, 50)))

Number of Classes in our data set: 83
the first 50 characters of the sorted vocabulary set:
{',', ' ', '"', '-', '9', 'F', '(', 'I', '0', 'N', '8', 'G', '4', '?', '&', 'M', '%', '!', 'J', 'H', 'B', 'O', 'A', '*', ':', 'K', '$', 'T', '7', 'R', '@', '3', '/', '6', '5', ';', ')', 'S', "'", 'C', 'E', 'D', '2', 'L', 'U', 'Q', 'P', '\n', '.', '1'}


In [10]:
all_set = set(string.printable)
# print(all_set)
print('there are',len(all_set)-len(vocabulary_set),'unused characters in this text book which they are:')
print(all_set - set(text_data))

there are 17 unused characters in this text book which they are:
{'<', '\r', ']', '\\', '[', '=', '#', '+', '~', '\x0c', '}', '^', '|', '\t', '\x0b', '{', '>'}


# Training Mini-Batches

A mini batch generator is defined to yeild x and y in mini batches. 
We would like to cut the length of our data sequence to have K total batches of size N. The number of characters per batch is M. 
Sine we want to predict the next character in the sequence, y is simply the same as x, but shifted by one in our trainig set.

In [37]:
# N is number of sequences = batch_size
# M = num_steps
# M * N = number of characters per batch
# K is total number of batches
def batch_generator(array, batch_size, num_steps):
    # number of charachters per batch
    n_char_per_batch = batch_size * num_steps
    num_batches = math.floor(len(array) / n_char_per_batch)
    array = array[:n_char_per_batch * num_batches]
    array = np.reshape(array, (batch_size,-1))
    # split array into batch_size of sequences
    for n in range(num_batches):
        x = array[:,n:n+num_steps]
#         y_temp = array[:,n+1:n+num_steps+1]
        y = np.zeros(x.shape, dtype=x.dtype)
#         y[:,:y_temp.shape[1]] = y_temp
        y[:, :-1], y[: ,-1] = x[:, 1:] , x[:, 0]
        yield x,y

In [38]:
batches = batch_generator(encoded, 10, 50)
x,y = next(batches)
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])
print('x\n', x.shape)
print('\ny\n', y.shape)

x
 [[31 64 57 72 76 61 74  1 16  0]
 [ 1 57 69  1 70 71 76  1 63 71]
 [78 65 70 13  0  0  3 53 61 75]
 [70  1 60 77 74 65 70 63  1 64]
 [ 1 65 76  1 65 75 11  1 75 65]
 [ 1 37 76  1 79 57 75  0 71 70]
 [64 61 70  1 59 71 69 61  1 62]
 [26  1 58 77 76  1 70 71 79  1]
 [76  1 65 75 70  7 76 13  1 48]
 [ 1 75 57 65 60  1 76 71  1 64]]

y
 [[64 57 72 76 61 74  1 16  0  0]
 [57 69  1 70 71 76  1 63 71 65]
 [65 70 13  0  0  3 53 61 75 11]
 [ 1 60 77 74 65 70 63  1 64 65]
 [65 76  1 65 75 11  1 75 65 74]
 [37 76  1 79 57 75  0 71 70 68]
 [61 70  1 59 71 69 61  1 62 71]
 [ 1 58 77 76  1 70 71 79  1 75]
 [ 1 65 75 70  7 76 13  1 48 64]
 [75 57 65 60  1 76 71  1 64 61]]
x
 (10, 50)

y
 (10, 50)


In [40]:
# placeholders for x,y and keep_prob of dropout layers
def input_generator(batch_size, num_steps):
    inputs = tf.placeholder(tf.int32, shape=(batch_size, num_steps),name='x')
    targets = tf.placeholder(tf.int32, shape=(batch_size,num_steps),name='y')
    keep_prob = tf.placeholder(tf.float32,name='keep_prob')
    return inputs, targets, keep_prob

# Building Model

### LSTM Cells

Here we build one cell of LSTM and stack them up into as many as needed in one layer.
We can have multiple hidden layers of LSTM cells.

In [29]:
# Build LSTM cell
# num_layers : number of hidden layers (verical number of LSTM cells)
# lstm_size : number of LAST cells horizontally in each hidden layer. This should be equal to number of steps that
#we mini batch by.

def buil_LSTM_Cells(lstm_size, num_layers, batch_size, keep_prob):
    # one lstm cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    # one cell wrapped with dropout layer
    dropped = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    # stack of lstm cells in the hidden layer
    cell = tf.contrib.rnn.MultiRNNCell([dropped]*num_layers)
    initial_state = cell.zero_state(batch_size, tf.flout32)
    return cell, initial_state

### Output Layer

The output of RNN cells(hidden Layers) will be fully connected to output layer through softmax to produce predictions. So the size of this layer should be the same as size of our data set characters which is 83.
So if we have N sequences of inputs, each with M steps, when they pass through L number of lstm cells in our hidden layer, the output will be size N . M . L. This is a 3D tensor object that we need to reshape in to a 2D tensor of shape (N . M) . L.
Size of output of softmax layer is the same as size of logits or number of classes.

### Training Loss 

### Optimizer and Gradient Exploding Fix
The optimizer will take in the loss and learning rate and use a threshold to clip the gradients, if they grow bigger than the threshold. This will avoid the problem of gradient exploding.
Adamoptimizer has been used, which optionally can perform "learning decay" if required. 

### RNN Network
Following is defined RNN class that initializes the one-hot-encoded input, lstm cells, output layer. It needs to use the last/final state of LSTM for the mini-batch, so the next batch continous the state from the previous batch.
Then it will calculate the Loss and do the optimization.
Out RNN network needs number of classes, batch size, number of steps per batch, lstm cell size, number of hidden layer, gradient threshold and learning rate as input arguments.

### Hyperparameters of Netwrok
* `batch_size` - Number of sequences running through the network in one pass.
* `num_steps` - Number of characters in the sequence the network is trained on. Larger is better typically, the network will learn more long range dependencies. But it takes longer to train. 100 is typically a good number here.
* `lstm_size` - The number of units in the hidden layers.
* `num_layers` - Number of hidden LSTM layers to use
* `learning_rate` - Learning rate for training
* `keep_prob` - The dropout keep probability when training. If you're network is overfitting, try decreasing this.

### Training Network
To train the network we creat a model and pass it inputs and targets and run the optimizer.
every often checkpoints are save with the following formats:
i{iteration number}_l{# hidden layer units}.ckpt
