In [1]:
# Generative text analysis of Anna Karenina with a character-level LSTM network

In [29]:
# imports
import os
import time
from collections import namedtuple

import numpy as np
import tensorflow as tf

In [30]:
# Read the full text of the novel into a single string.
with open('anna.txt', 'r') as f:
    text = f.read()

# Every distinct character, in sorted order, forms the vocabulary.
vocab = sorted(set(text))

# Lookup tables between characters and their integer ids (the position in vocab).
int_to_vocab = {index: char for index, char in enumerate(vocab)}
vocab_to_int = {char: index for index, char in int_to_vocab.items()}

# The whole text as an int32 array of character ids.
encoded = np.fromiter((vocab_to_int[char] for char in text), dtype=np.int32, count=len(text))

In [31]:
# Let's take a look at the raw text and the vocabulary built from it.
# A short excerpt of the text (characters 313 up to, but not including, 414).
print('sample text:\n', text[313:414], '\n')
# Number of distinct characters found in the text.
print('length of vocab is:\n', len(vocab), '\n')
# The sorted characters, then the id -> character lookup table.
print('vocab:\n', vocab, '\n')
print('enumerated vocab:\n', int_to_vocab)

sample text:
 hat she could not go on living in the same house with him.
This position of affairs had now lasted th 

length of vocab is:
 83 

vocab:
 ['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 

enumerated vocab:
 {0: '\n', 1: ' ', 2: '!', 3: '"', 4: '$', 5: '%', 6: '&', 7: "'", 8: '(', 9: ')', 10: '*', 11: ',', 12: '-', 13: '.', 14: '/', 15: '0', 16: '1', 17: '2', 18: '3', 19: '4', 20: '5', 21: '6', 22: '7', 23: '8', 24: '9', 25: ':', 26: ';', 27: '?', 28: '@', 29: 'A', 30: 'B', 31: 'C', 32: 'D', 33: 'E', 34: 'F', 35: 'G', 36: 'H', 37: 'I', 38: 'J', 39: 'K', 40: 'L', 41: 'M', 42: 'N', 43: 'O', 44: 'P', 45: 'Q', 46: 

In [32]:
# We need to split the data into batches so that we can train our model on it.
def get_batches(arr, n_seqs, n_steps):
    """
    This function is a generator that yields (x, y) batches of shape
    (n_seqs, n_steps).

    The targets y are the inputs x read one character later in the text, so
    the target at every position, including the last step of each window, is
    the character that actually follows the input character.

    Args:
    arr: 1-D array (or sequence) of encoded characters to make batches from.
    n_seqs: Batch size, the number of sequences per batch
    n_steps: Number of sequence steps per batch

    Yields:
    x: View into the kept data holding the input characters of one batch
    y: New array holding the next character for every position of x
    """

    arr = np.asarray(arr)

    # Get characters per batch and the number of full batches available
    characters_per_batch = n_seqs * n_steps
    n_batches = len(arr) // characters_per_batch
    n_kept = n_batches * characters_per_batch

    # Not enough data for a single full batch: there is nothing to yield
    if n_kept == 0:
        return

    # Keep only enough characters to make full batches, reshaped into n_seqs rows
    inputs = arr[:n_kept].reshape((n_seqs, -1))

    # Targets are the same stretch of text shifted one character ahead.
    if len(arr) > n_kept:
        shifted = arr[1:n_kept + 1]
    else:
        # Nothing follows the last kept character, so wrap around to the
        # very first character for that single position.
        shifted = np.concatenate((arr[1:n_kept], arr[:1]))
    targets = shifted.reshape((n_seqs, -1))

    for n in range(0, inputs.shape[1], n_steps):

        # features
        x = inputs[:, n: n + n_steps]

        # shifted targets; copied so the caller gets an array of its own
        y = targets[:, n: n + n_steps].copy()
        yield x, y

In [33]:
# Pull one batch of 13 sequences x 49 steps from the generator to inspect it
batches = get_batches(encoded, 13, 49)
x, y = next(batches)

In [34]:
# Show the top-left 10x10 corner of the inputs (x) and of the targets (y)
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[31 64 57 72 76 61 74  1 16  0]
 [79 60 13  1 47 76 61 72 57 70]
 [75 65 63 64 76  1 71 62  1 68]
 [81  1 75 65 63 64 11  1 75 64]
 [61  1 79 57 75 64 61 60 11  1]
 [ 1 68 57 75 76  1 76 65 69 61]
 [13  1 29 70 60  0 62 71 74  1]
 [61 75 75 65 71 70  1 62 71 74]
 [61 80 72 74 61 75 75  1 76 64]
 [63 57 65 70  1 64 61 74 75 61]]

y
 [[64 57 72 76 61 74  1 16  0  0]
 [60 13  1 47 76 61 72 57 70  1]
 [65 63 64 76  1 71 62  1 68 71]
 [ 1 75 65 63 64 11  1 75 64 61]
 [ 1 79 57 75 64 61 60 11  1 76]
 [68 57 75 76  1 76 65 69 61 13]
 [ 1 29 70 60  0 62 71 74  1 77]
 [75 75 65 71 70  1 62 71 74 27]
 [80 72 74 61 75 75  1 76 64 65]
 [57 65 70  1 64 61 74 75 61 68]]


In [35]:
# Sweet, now we have to build the model. We're going to use a network of stacked LSTM layers.

In [36]:
# Inputs
def build_inputs(batch_size, num_steps):
    """
    Create the placeholders the network is fed through.

    Args:
    batch_size: Batch size, the number of sequences per batch
    num_steps: Number of steps in each sequence

    Returns:
    inputs: int32 placeholder of shape [batch_size, num_steps], named 'inputs'
    targets: int32 placeholder of the same shape, named 'targets'
    keep_prob: float32 placeholder named 'keep_prob', meant for the dropout layers
    """

    # Inputs and targets share one shape: a row per sequence, a column per step
    batch_shape = [batch_size, num_steps]
    inputs = tf.placeholder(tf.int32, batch_shape, name='inputs')
    targets = tf.placeholder(tf.int32, batch_shape, name='targets')

    # No shape is declared for the dropout probability placeholder
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    return inputs, targets, keep_prob

In [37]:
# LSTM Cell
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    """
    Build a stack of LSTM layers with dropout applied to each layer's output.

    Args:
    lstm_size: Number of units in each LSTM layer
    num_layers: Number of LSTM layers to stack
    batch_size: Batch size, used to size the zero initial state
    keep_prob: Keep probability handed to the dropout wrapper of every layer

    Returns:
    cell: The stacked multi-layer cell
    initial_state: float32 zero state of that cell for batch_size sequences
    """

    def dropout_lstm_layer():
        # One layer: a basic LSTM cell whose outputs pass through dropout
        return tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.BasicLSTMCell(lstm_size),
            output_keep_prob=keep_prob)

    # A separate cell object is created for every layer of the stack
    layers = []
    for _ in range(num_layers):
        layers.append(dropout_lstm_layer())
    stacked = tf.contrib.rnn.MultiRNNCell(layers)

    zero_state = stacked.zero_state(batch_size, tf.float32)
    return stacked, zero_state

In [49]:
# Outputs:

def build_output(lstm_output, in_size, out_size):
    """
    Build the softmax layer on top of the RNN output.

    Args:
    lstm_output: Output tensor of the RNN layers
    in_size: Size of the last dimension of the RNN output
    out_size: Number of values to predict for every row

    Returns:
    out: Softmax of the logits, named 'predictions'
    logits: The values fed into the softmax, one row of out_size per input row
    """

    # Flatten the RNN output into rows of in_size values,
    # one row for each step in each sequence
    flat_output = tf.reshape(tf.concat(lstm_output, axis=1), [-1, in_size])

    # Weights and bias of the fully connected layer feeding the softmax.
    # The weights are created before the bias, inside the 'softmax' scope.
    with tf.variable_scope('softmax'):
        weights = tf.Variable(tf.truncated_normal((in_size, out_size), stddev=0.1))
        bias = tf.Variable(tf.zeros(out_size))

    # A plain linear layer: one row of logits per flattened row
    logits = tf.matmul(flat_output, weights) + bias

    # The softmax turns each row of logits into probabilities
    predictions = tf.nn.softmax(logits, name='predictions')

    return predictions, logits

In [50]:
# Training Loss:
def build_loss(logits, targets, lstm_size, num_classes):
    """
    Mean softmax cross-entropy between the logits and the one-hot targets.

    Args:
    logits: Logits from the final fully connected layer
    targets: Targets for supervised learning, as integer class ids
    lstm_size: Number of LSTM hidden units (not used by this computation)
    num_classes: Number of classes in targets

    Returns:
    The cross-entropy loss averaged over all rows of logits
    """

    # One-hot encode the targets, then give them the static shape of the logits
    target_shape = logits.get_shape()
    one_hot_targets = tf.reshape(tf.one_hot(targets, num_classes), target_shape)

    # Cross entropy per row, reduced to a single mean value
    per_row_loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                           labels=one_hot_targets)
    return tf.reduce_mean(per_row_loss)

In [51]:
# Optimization:
def build_optimizer(loss, learning_rate, grad_clip):
    """
    Build the training op: Adam applied to gradients clipped by global norm.

    Args:
    loss: Network loss
    learning_rate: Learning rate given to the Adam optimizer
    grad_clip: Bound passed to the global-norm clipping of the gradients

    Returns:
    The op that applies the clipped gradients to the trainable variables
    """

    trainable = tf.trainable_variables()

    # Gradients of the loss w.r.t. every trainable variable, clipped together;
    # the global norm returned alongside them is not needed here
    raw_grads = tf.gradients(loss, trainable)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, grad_clip)

    adam = tf.train.AdamOptimizer(learning_rate)
    return adam.apply_gradients(zip(clipped_grads, trainable))

In [52]:
# Network Construction:

class CharRNN:
    """
    Character-level RNN assembled from the build_* helper functions.

    Creating an instance resets the default TensorFlow graph and then builds
    the whole network in it, so any graph built earlier is discarded.

    Attributes set by the constructor:
    inputs, targets, keep_prob: Placeholders returned by build_inputs
    initial_state: Zero state of the stacked LSTM cell
    final_state: State of the RNN after processing the inputs
    prediction, logits: Outputs of build_output
    loss: Training loss from build_loss
    optimizer: Training op from build_optimizer
    """

    def __init__(self, num_classes, batch_size = 32, num_steps = 50, lstm_size = 128,
                 num_layers = 3, learning_rate = 0.001, grad_clip = 5, sampling = False):
        """
        Build the network graph.

        Args:
        num_classes: Number of classes the inputs and targets are one-hot encoded into
        batch_size: Number of sequences per batch
        num_steps: Number of steps per sequence
        lstm_size: Number of units per LSTM layer
        num_layers: Number of LSTM layers
        learning_rate: Learning rate of the optimizer
        grad_clip: Bound used when clipping the gradients
        sampling: When true, batch_size and num_steps are both forced to 1
        """

        # When using this network for sampling we'll pass one character at a time,
        # so the batch holds a single sequence of a single step.
        if sampling:
            batch_size, num_steps = 1, 1

        # Reset the network: start from an empty default graph.
        tf.reset_default_graph()

        # Build the placeholder tensors:
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)

        # Build LSTM nodes:
        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)

        # To run data through the RNN layers:
        # One hot encode the input tokens
        x_one_hot = tf.one_hot(self.inputs, num_classes)

        # Run each sequence step through the RNN and grab the outputs:
        outputs, state = tf.nn.dynamic_rnn(cell, x_one_hot, initial_state = self.initial_state)
        self.final_state = state

        # Get the predictions and logits:
        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)

        # Loss and optimizer:
        self.loss = build_loss(self.logits, self.targets, lstm_size, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)
            


In [83]:
# Hyperparameters:
batch_size = 100        # Sequences per batch
num_steps = 100         # Number of sequence steps per batch
lstm_size = 256         # Size of hidden layers in LSTMs
num_layers = 3          # Number of LSTM layers
learning_rate = 0.001   # Learning rate
keep_prob = 0.5         # Dropout keep probability
epochs = 20             # Number of passes over the training data
save_every_n = 200      # Save a checkpoint every 200 training steps

In [84]:
# Network training:
model = CharRNN(len(vocab), batch_size=batch_size, num_steps=num_steps, lstm_size=lstm_size,
               num_layers=num_layers, learning_rate=learning_rate)
# Saver (keeps at most the 50 most recent checkpoints):
saver = tf.train.Saver(max_to_keep = 50)

# Make sure the checkpoint folder exists before training starts, so that a
# missing folder cannot make the first save fail after 200 steps of training.
os.makedirs('checkpoints', exist_ok=True)

# Tensorflow session:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Use the line below to load a checkpoint and resume training
    # saver.restore(sess, 'checkpoints/___ckpt')
    
    # Number of training steps (batches) run so far, across all epochs
    counter = 0
    
    for e in range(epochs):
        # Training: every epoch starts again from the zero state
        new_state = sess.run(model.initial_state)
        
        for x, y in get_batches(encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            
            # The state left by the previous batch is fed in as the initial state
            feed = {model.inputs: x, model.targets: y, model.keep_prob: keep_prob,
                   model.initial_state: new_state}
            
            batch_loss, new_state, _ = sess.run([model.loss, model.final_state, model.optimizer],
                                               feed_dict = feed)
            
            end = time.time()
            
            print('Epoch: {}/{}... '.format(e+1, epochs),
                  'Training Step: {}... '.format(counter),
                  'Training loss: {:.4f}... '.format(batch_loss),
                  '{:.4f} sec/batch'.format((end-start)))
            
            if (counter % save_every_n == 0):
                saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))
    
    # Always save the final state of the network once training is done
    saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

Epoch: 1/20...  Training Step: 1...  Training loss: 4.4198...  1.6721 sec/batch
Epoch: 1/20...  Training Step: 2...  Training loss: 4.3861...  1.6324 sec/batch
Epoch: 1/20...  Training Step: 3...  Training loss: 4.2907...  1.6992 sec/batch
Epoch: 1/20...  Training Step: 4...  Training loss: 3.9213...  1.7818 sec/batch
Epoch: 1/20...  Training Step: 5...  Training loss: 3.7292...  1.7914 sec/batch
Epoch: 1/20...  Training Step: 6...  Training loss: 3.6439...  1.7800 sec/batch
Epoch: 1/20...  Training Step: 7...  Training loss: 3.5213...  1.6933 sec/batch
Epoch: 1/20...  Training Step: 8...  Training loss: 3.4899...  1.8991 sec/batch
Epoch: 1/20...  Training Step: 9...  Training loss: 3.4413...  2.0655 sec/batch
Epoch: 1/20...  Training Step: 10...  Training loss: 3.4159...  1.9083 sec/batch
Epoch: 1/20...  Training Step: 11...  Training loss: 3.3590...  2.0699 sec/batch
Epoch: 1/20...  Training Step: 12...  Training loss: 3.3642...  1.9999 sec/batch
Epoch: 1/20...  Training Step: 13... 

In [85]:
# Show the checkpoint state recorded for the 'checkpoints' directory
tf.train.get_checkpoint_state('checkpoints')

model_checkpoint_path: "checkpoints\\i3960_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i200_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i400_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i600_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i800_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1000_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1200_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1400_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1600_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1800_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2000_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2200_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2400_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2600_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2800_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i3000_l256.ckpt"
all_model_checkpoint_paths: "checkpoints\\i3200_l256.ckpt"
all_mo

In [86]:
# Sampling:

def pick_top_n(preds, vocab_size, top_n=5):
    """
    Randomly pick a character id among the top_n most likely predictions.

    All but the top_n largest probabilities are set to zero, the rest are
    renormalised to sum to one, and one id is drawn from that distribution.
    The preds array passed in is left untouched.

    Args:
    preds: Predicted probabilities; once squeezed it must hold vocab_size values
    vocab_size: Number of ids to choose from
    top_n: How many of the most likely ids stay eligible

    Returns:
    The sampled id
    """
    # Work on a float64 copy: np.squeeze returns a view, so zeroing entries
    # in place would overwrite the caller's predictions.
    p = np.array(np.squeeze(preds), dtype=np.float64)

    # argsort is ascending, so everything before the last top_n is dropped
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    c = np.random.choice(vocab_size, 1, p = p)[0]
    return c

In [87]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    """
    Generate text with a network restored from a checkpoint.

    The network is first run over every character of prime to warm up its
    state. One character is then sampled from the resulting prediction, and
    n_samples further characters are sampled by feeding each picked character
    back in, so n_samples + 1 characters are appended to prime in total.

    The network is built with CharRNN's default number of layers, so the
    checkpoint must come from a network of that same depth.

    Args:
    checkpoint: Path of the checkpoint to restore
    n_samples: Number of sampling steps run after the first generated character
    lstm_size: LSTM size used to rebuild the network
    vocab_size: Number of characters in the vocabulary
    prime: Non-empty text used to warm up the network; it starts the result

    Returns:
    prime followed by the generated characters, as one string

    Raises:
    ValueError: If prime is empty, since there is then nothing to predict from
    """
    if not prime:
        raise ValueError("prime must contain at least one character")

    samples = list(prime)
    model = CharRNN(vocab_size, lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)

        def run_step(char_id, state):
            # Feed a single character id (a 1x1 int32 batch) with dropout
            # disabled and return the prediction plus the updated state.
            x = np.full((1, 1), char_id, dtype=np.int32)
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: state}
            return sess.run([model.prediction, model.final_state],
                            feed_dict=feed)

        # Warm up the state on the priming text; only the last prediction is used
        for c in prime:
            preds, new_state = run_step(vocab_to_int[c], new_state)

        c = pick_top_n(preds, vocab_size)
        samples.append(int_to_vocab[c])

        # Keep feeding the character just picked back into the network
        for _ in range(n_samples):
            preds, new_state = run_step(c, new_state)

            c = pick_top_n(preds, vocab_size)
            samples.append(int_to_vocab[c])

    return ''.join(samples)

In [88]:
# Path of the most recent checkpoint in the 'checkpoints' directory
tf.train.latest_checkpoint('checkpoints')

'checkpoints\\i3960_l256.ckpt'

In [91]:
# Generate text from the most recent checkpoint, primed with "Andy Miller"
checkpoint = tf.train.latest_checkpoint('checkpoints')
samp = sample(checkpoint, 2000, lstm_size, len(vocab), prime="Andy Miller")
print(samp)

Andy Miller and the most servenase, and he had not their
terabinate of seeil which she had been saight in who asked the struck to
the toucher, he saw to him, with his husband and what he could brought to him that the mother said that she saw it was said on his beat and tell, and had to she had been still happened in her heart, the standing towards, and then the came an one and walked him on weak of whom was always thought about to some of a
condision of those hand, he carriage of a memal time too with that
which he had
seen over the stattory and what she had not seemed that they they had
no an account and to she was the
patituon.

"And you have been see all this
mother? I'm not stopped him, to me of saying at the
stabes, what there then
all he has at once hear the peasant.

"And I'm shuller, and I consent to see a state, because they were so thas how see him."

"Yes, I said
what was see a lefs this what's so," she was sating to the part of whan she had so hand to him. "If I'll not are 