In [1]:
# Character-level generative text modelling of Anna Karenina

In [2]:
# imports
import time
from collections import namedtuple
import numpy as np
import tensorflow as tf

In [5]:
# Load the novel and build the character <-> integer lookup tables.
# The encoding is passed explicitly so the decoded text (and therefore the
# vocabulary) does not depend on the platform's default locale encoding.
with open('anna.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Sorted so that each character is assigned the same integer id on every run.
vocab = sorted(set(text))
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = dict(enumerate(vocab))

# The whole text as an array of character ids.
encoded = np.array([vocab_to_int[c] for c in text], dtype=np.int32)

In [6]:
# Sanity-check the load: show a short excerpt of the text and the vocabulary.
excerpt = text[313:414]
for heading, payload in (
    ('sample text:\n', excerpt),
    ('length of vocab is:\n', len(vocab)),
    ('vocab:\n', vocab),
):
    print(heading, payload, '\n')
print('enumerated vocab:\n', int_to_vocab)

sample text:
 hat she could not go on living in the same house with him.
This position of affairs had now lasted th 

length of vocab is:
 83 

vocab:
 ['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 

enumerated vocab:
 {0: '\n', 1: ' ', 2: '!', 3: '"', 4: '$', 5: '%', 6: '&', 7: "'", 8: '(', 9: ')', 10: '*', 11: ',', 12: '-', 13: '.', 14: '/', 15: '0', 16: '1', 17: '2', 18: '3', 19: '4', 20: '5', 21: '6', 22: '7', 23: '8', 24: '9', 25: ':', 26: ';', 27: '?', 28: '@', 29: 'A', 30: 'B', 31: 'C', 32: 'D', 33: 'E', 34: 'F', 35: 'G', 36: 'H', 37: 'I', 38: 'J', 39: 'K', 40: 'L', 41: 'M', 42: 'N', 43: 'O', 44: 'P', 45: 'Q', 46: 

In [11]:
# Slice the encoded text into mini-batches of (input, target) pairs for training.
def get_batches(arr, n_seqs, n_steps):
    """
    Generate (x, y) training batches of shape (n_seqs, n_steps) from arr.

    The first n_batches * n_seqs * n_steps elements of arr are laid out as
    n_seqs contiguous rows, and windows of n_steps columns are yielded from
    left to right. Each target y[i, j] is the element that immediately follows
    x[i, j] in arr, including in the last column of every window.

    Args:
    arr: 1-D array (or sequence) to make batches from.
    n_seqs: Batch size, i.e. the number of sequences (rows) per batch.
    n_steps: Number of sequence steps (columns) per batch.

    Yields:
    x: Inputs, a slice of the reshaped array with shape (n_seqs, n_steps).
    y: Targets, a new array with the same shape and dtype as x.

    Raises:
    ValueError: If arr has fewer than n_seqs * n_steps elements, so not even
        one full batch can be built. As with any generator, the error
        surfaces when the first batch is requested.
    """
    arr = np.asarray(arr)

    # Get characters per batch
    characters_per_batch = n_seqs * n_steps
    n_batches = len(arr) // characters_per_batch
    if n_batches == 0:
        raise ValueError(
            'arr has {} elements but one batch needs n_seqs * n_steps = {}'.format(
                len(arr), characters_per_batch))

    # Keep only enough characters to make full batches
    n_kept = n_batches * characters_per_batch
    inputs = arr[:n_kept]

    # Targets are the inputs shifted left by one position. np.roll returns a
    # new array whose final slot wraps around to inputs[0]; when arr still has
    # an element beyond the kept prefix, use that real successor instead.
    targets = np.roll(inputs, -1)
    if len(arr) > n_kept:
        targets[-1] = arr[n_kept]

    # Reshape into n_seqs rows. Rows are contiguous runs of arr, so shifting
    # the flat stream before reshaping keeps every target correct across both
    # window boundaries and row boundaries.
    inputs = inputs.reshape((n_seqs, -1))
    targets = targets.reshape((n_seqs, -1))

    for n in range(0, inputs.shape[1], n_steps):
        # features
        x = inputs[:, n:n + n_steps]

        # shifted targets (copied so callers may modify y freely)
        y = targets[:, n:n + n_steps].copy()
        yield x, y

In [12]:
# Build the batch generator (13 sequences of 49 steps each) and pull the
# first (inputs, targets) pair from it.
batches = get_batches(arr=encoded, n_seqs=13, n_steps=49)
x, y = next(batches)

In [13]:
# Show the top-left 10x10 corner of the inputs and of their shifted targets.
preview = (slice(None, 10), slice(None, 10))
print('x\n', x[preview])
print('\ny\n', y[preview])

x
 [[31 64 57 72 76 61 74  1 16  0]
 [79 60 13  1 47 76 61 72 57 70]
 [75 65 63 64 76  1 71 62  1 68]
 [81  1 75 65 63 64 11  1 75 64]
 [61  1 79 57 75 64 61 60 11  1]
 [ 1 68 57 75 76  1 76 65 69 61]
 [13  1 29 70 60  0 62 71 74  1]
 [61 75 75 65 71 70  1 62 71 74]
 [61 80 72 74 61 75 75  1 76 64]
 [63 57 65 70  1 64 61 74 75 61]]

y
 [[64 57 72 76 61 74  1 16  0  0]
 [60 13  1 47 76 61 72 57 70  1]
 [65 63 64 76  1 71 62  1 68 71]
 [ 1 75 65 63 64 11  1 75 64 61]
 [ 1 79 57 75 64 61 60 11  1 76]
 [68 57 75 76  1 76 65 69 61 13]
 [ 1 29 70 60  0 62 71 74  1 77]
 [75 75 65 71 70  1 62 71 74 27]
 [80 72 74 61 75 75  1 76 64 65]
 [57 65 70  1 64 61 74 75 61 68]]


In [15]:
# Sweet, now we have to build the model. We're going to use a network of LSTMs.

In [None]:
# Inputs
