In [1]:
# Analysis of Anna Karenina with a character-level generative text model

In [16]:
# imports
import time
from collections import namedtuple
import numpy as np
import tensorflow as tf

In [17]:
# Read the whole novel into memory as a single string
with open('anna.txt', 'r') as f:
    text = f.read()

# Character vocabulary: every distinct character in the text, in sorted order
vocab = list(set(text))
vocab.sort()

# Lookup tables between characters and their integer ids (id = position in vocab)
int_to_vocab = {index: char for index, char in enumerate(vocab)}
vocab_to_int = {char: index for index, char in int_to_vocab.items()}

# The full text as an int32 array of character ids
encoded = np.array([vocab_to_int[char] for char in text], dtype = np.int32)

In [18]:
# Let's take a look at a slice of the raw text and at the vocabulary built from it.
print('sample text:\n', text[313:414], '\n')
print('length of vocab is:\n', len(vocab), '\n')
print('vocab:\n', vocab, '\n')
print('enumerated vocab:\n', int_to_vocab)

sample text:
 hat she could not go on living in the same house with him.
This position of affairs had now lasted th 

length of vocab is:
 83 

vocab:
 ['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 

enumerated vocab:
 {0: '\n', 1: ' ', 2: '!', 3: '"', 4: '$', 5: '%', 6: '&', 7: "'", 8: '(', 9: ')', 10: '*', 11: ',', 12: '-', 13: '.', 14: '/', 15: '0', 16: '1', 17: '2', 18: '3', 19: '4', 20: '5', 21: '6', 22: '7', 23: '8', 24: '9', 25: ':', 26: ';', 27: '?', 28: '@', 29: 'A', 30: 'B', 31: 'C', 32: 'D', 33: 'E', 34: 'F', 35: 'G', 36: 'H', 37: 'I', 38: 'J', 39: 'K', 40: 'L', 41: 'M', 42: 'N', 43: 'O', 44: 'P', 45: 'Q', 46: 

In [19]:
# We need to split the encoded text into batches so that we can train on our data.
def get_batches(arr, n_seqs, n_steps):
    """
    Generator that yields (x, y) batches, each of shape (n_seqs, n_steps).

    The array is truncated to a whole number of batches, reshaped into n_seqs
    rows, and then walked left to right in windows of n_steps columns.

    Args:
    arr: 1-D array to make batches from.
    n_seqs: Batch size, i.e. the number of sequences (rows) per batch.
    n_steps: Number of steps (columns) per batch.

    Yields:
    x: The input window (a view into the reshaped array).
    y: The targets, i.e. x shifted one step to the left. The last column is
       the element that follows the window in the same row; only the final
       window, which has no following element, wraps around to its own
       first column.
    """

    # Get characters per batch
    characters_per_batch = n_seqs * n_steps
    n_batches = len(arr) // characters_per_batch

    # Keep only enough characters to make full batches
    arr = arr[:n_batches * characters_per_batch]

    # Reshape into n_seqs rows
    arr = arr.reshape((n_seqs, -1))
    total_steps = arr.shape[1]

    for n in range(0, total_steps, n_steps):

        # features
        x = arr[:, n: n + n_steps]

        # targets: the features shifted one step to the left
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]

        next_step = n + n_steps
        if next_step < total_steps:
            # The true target of the last step is the first element of the
            # next window, not the first element of this one.
            y[:, -1] = arr[:, next_step]
        else:
            # Final window: nothing follows it in the row, so wrap around.
            y[:, -1] = x[:, 0]

        yield x, y

In [20]:
# Create the batch generator (13 sequences of 49 steps each) and pull the first (x, y) batch
batches = get_batches(encoded, 13, 49)
x, y = next(batches)

In [21]:
# Preview the top-left corner (at most 10 x 10) of the inputs and of the targets
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[31 64 57 72 76 61 74  1 16  0]
 [79 60 13  1 47 76 61 72 57 70]
 [75 65 63 64 76  1 71 62  1 68]
 [81  1 75 65 63 64 11  1 75 64]
 [61  1 79 57 75 64 61 60 11  1]
 [ 1 68 57 75 76  1 76 65 69 61]
 [13  1 29 70 60  0 62 71 74  1]
 [61 75 75 65 71 70  1 62 71 74]
 [61 80 72 74 61 75 75  1 76 64]
 [63 57 65 70  1 64 61 74 75 61]]

y
 [[64 57 72 76 61 74  1 16  0  0]
 [60 13  1 47 76 61 72 57 70  1]
 [65 63 64 76  1 71 62  1 68 71]
 [ 1 75 65 63 64 11  1 75 64 61]
 [ 1 79 57 75 64 61 60 11  1 76]
 [68 57 75 76  1 76 65 69 61 13]
 [ 1 29 70 60  0 62 71 74  1 77]
 [75 75 65 71 70  1 62 71 74 27]
 [80 72 74 61 75 75  1 76 64 65]
 [57 65 70  1 64 61 74 75 61 68]]


In [15]:
# Sweet, now we have to build the model. We're going to use a network of LSTMs.

In [22]:
# Inputs
def build_inputs(batch_size, num_steps):
    """
    Create the placeholders for the inputs, the targets and the dropout keep probability

    Args:
    batch_size: Batch size, i.e. the number of sequences per batch
    num_steps: Number of steps in each sequence of a batch

    Returns:
    The tuple (inputs, targets, keep_prob)
    """

    # Inputs and targets are int32 and share one shape: batch_size x num_steps
    batch_shape = [batch_size, num_steps]
    inputs = tf.placeholder(tf.int32, batch_shape, name = 'inputs')
    targets = tf.placeholder(tf.int32, batch_shape, name = 'targets')

    # float32 placeholder with no fixed shape, fed with the keep probability of the dropout layers
    keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')

    return (inputs, targets, keep_prob)

In [23]:
# LSTM Cell
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    """
    Build a stacked LSTM cell with dropout and its all-zero initial state

    Args:
    lstm_size: Number of units in each LSTM layer
    num_layers: Number of LSTM layers to stack
    batch_size: Batch size, used to size the initial state
    keep_prob: Keep probability applied to the output of every layer

    Returns:
    The tuple (cell, initial_state)
    """

    layers = []
    for _ in range(num_layers):
        # One basic LSTM layer ...
        base_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)

        # ... wrapped so that dropout is applied to its outputs
        layers.append(tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob = keep_prob))

    # Stack the layers into a single multi-layer cell
    cell = tf.contrib.rnn.MultiRNNCell(layers)

    # float32 zero state for a batch of batch_size sequences
    initial_state = cell.zero_state(batch_size, tf.float32)

    return cell, initial_state

In [24]:
# Outputs:

def build_output(lstm_output, in_size, out_size):
    """
    Build the softmax layer and return its predictions and logits

    Args:
    lstm_output: Output tensor (or list of tensors) from the LSTM layers
    in_size: Size of each row fed to the softmax layer (the LSTM size)
    out_size: Size of the softmax output, i.e. the number of classes

    Returns:
    out: Softmax of the logits, named 'predictions'
    logits: One row of unnormalised scores per row of the reshaped LSTM output
    """

    # Reshape the output so it's a bunch of rows.
    # One row for each step in each sequence
    # Shape is ( M * N ) X L; batch_size * num_steps by LSTM size
    seq_output = tf.concat(lstm_output, axis = 1)
    x = tf.reshape(seq_output, [-1, in_size])

    # We need to connect the RNN network with the softmax layer:
    with tf.variable_scope('softmax'):
        # The keyword is `stddev`; the misspelled `stdev` raises a TypeError.
        softmax_w = tf.Variable(tf.truncated_normal((in_size, out_size), stddev = 0.1))
        softmax_b = tf.Variable(tf.zeros(out_size))

    # The logits are a plain affine map of the LSTM rows: x . W + b,
    # giving one row of logits for each step of each sequence.
    logits = tf.matmul(x, softmax_w) + softmax_b

    # Softmax turns the logits into probabilities for the predicted characters
    out = tf.nn.softmax(logits, name = 'predictions')

    return out, logits

In [26]:
# Training Loss:
def build_loss(logits, targets, lstm_size, num_classes):
    """
    Compute the mean softmax cross-entropy loss between the logits and the targets

    Args:
    logits: Logits from the final fully connected layer
    targets: Integer targets for supervised learning
    lstm_size: Number of LSTM hidden units (not used by this function)
    num_classes: Number of classes in targets

    Returns:
    The loss averaged over all rows of the logits
    """

    # Expand the integer targets to one-hot vectors, then flatten them
    # to the static shape of the logits: ( N * M ) X C; C = num_classes
    one_hot_targets = tf.one_hot(targets, num_classes)
    labels = tf.reshape(one_hot_targets, logits.get_shape())

    # Softmax cross entropy per row, reduced to its mean
    per_row_loss = tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = labels)
    return tf.reduce_mean(per_row_loss)

In [None]:
# Optimization:
