## Text-gen RNN

### Import relevant libraries

In [1]:
import time
from collections import namedtuple

import numpy as np
import tensorflow as tf

### Load text into a one huge string (millions of chars)

In [2]:
with open('anna.txt', 'r') as f:
    text=f.read()

In [3]:
print(text[:100])
print(len(text))

Chapter 1


Happy families are all alike; every unhappy family is unhappy in its own
way.

Everythin
1985223


### Create vocabulary : a set of all chars of which 'text' consists

In [4]:
vocab = set(text)

In [5]:
print(vocab)


{'N', 'Q', 'S', 'v', 'w', 'D', '6', 'c', 's', 'K', 'J', 'i', '!', ' ', ';', '&', ':', '1', 'M', '*', 'g', 'F', '_', 'x', 'P', 'B', 'z', "'", 'n', 'V', 't', '5', '0', 'X', 'u', 'h', ')', 'k', '4', 'r', '/', 'Z', '-', '3', ',', 'C', 'q', 'G', '8', 'O', '`', 'b', 'p', 'I', '9', 'T', 'f', '(', 'o', 'R', 'E', '2', 'y', 'L', 'U', '@', 'j', 'm', '"', 'W', 'A', '%', '?', 'l', '\n', 'a', 'e', '$', '.', 'd', 'Y', '7', 'H'}


### Create vocab_to_int and int_to_vocab. These are dictionaries. You won't need it. Hopefully

In [6]:
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = dict(enumerate(vocab))

In [7]:
print(vocab_to_int)
print()
print(int_to_vocab)

{'N': 0, 'Q': 1, 'S': 2, 'v': 3, 'w': 4, '6': 6, 'R': 59, 'c': 7, 's': 8, 'K': 9, 'i': 11, '!': 12, ' ': 13, 'n': 28, ';': 14, '&': 15, ':': 16, '1': 17, 'M': 18, 'U': 64, 'D': 5, 'L': 63, 'g': 20, '0': 32, 'B': 25, 'x': 23, 'P': 24, "'": 27, 'V': 29, '5': 31, 'X': 33, 'e': 76, 'G': 47, 'z': 26, 'u': 34, '4': 38, ')': 36, 'k': 37, '/': 40, 'r': 39, 'Z': 41, '-': 42, '3': 43, ',': 44, '.': 78, 'C': 45, 'j': 66, 'q': 46, 'O': 49, 'b': 51, 't': 30, 'I': 53, 'h': 35, '*': 19, 'T': 55, 'f': 56, '(': 57, 'o': 58, 'E': 60, 'Y': 80, 'y': 62, 'A': 70, '@': 65, 'm': 67, 'W': 69, '"': 68, '%': 71, '9': 54, '8': 48, 'F': 21, 'l': 73, '\n': 74, 'p': 52, '_': 22, 'a': 75, 'J': 10, '2': 61, 'd': 79, '$': 77, '`': 50, '?': 72, '7': 81, 'H': 82}

{0: 'N', 1: 'Q', 2: 'S', 3: 'v', 4: 'w', 5: 'D', 6: '6', 7: 'c', 8: 's', 9: 'K', 10: 'J', 11: 'i', 12: '!', 13: ' ', 14: ';', 15: '&', 16: ':', 17: '1', 18: 'M', 19: '*', 20: 'g', 21: 'F', 22: '_', 23: 'x', 24: 'P', 25: 'B', 26: 'z', 27: "'", 28: 'n', 29: 'V',

### Create an iteger representation of 'text' (millions of chars as ints)

unichr(x) - char from unicode int
ord(x) - byte or in from chr() or unichar() respectively

In [8]:
vocab_int = np.array([ord(c) for c in vocab], dtype=np.int32)
print(vocab_int)

print(len(vocab))
print(len(vocab_int))

[ 78  81  83 118 119  68  54  99 115  75  74 105  33  32  59  38  58  49
  77  42 103  70  95 120  80  66 122  39 110  86 116  53  48  88 117 104
  41 107  52 114  47  90  45  51  44  67 113  71  56  79  96  98 112  73
  57  84 102  40 111  82  69  50 121  76  85  64 106 109  34  87  65  37
  63 108  10  97 101  36  46 100  89  55  72]
83
83


In [9]:
#chars = np.array([ord(c) for c in text], dtype=np.int32)
chars = np.array([vocab_to_int[c] for c in text], dtype=np.int32)
print(chars[:100])

[45 35 75 52 30 76 39 13 17 74 74 74 82 75 52 52 62 13 56 75 67 11 73 11 76
  8 13 75 39 76 13 75 73 73 13 75 73 11 37 76 14 13 76  3 76 39 62 13 34 28
 35 75 52 52 62 13 56 75 67 11 73 62 13 11  8 13 34 28 35 75 52 52 62 13 11
 28 13 11 30  8 13 58  4 28 74  4 75 62 78 74 74 60  3 76 39 62 30 35 11 28]


## Data split

In [10]:
def split_data(chars_vector,
               samples_per_batch,
               sample_length,
               split_frac=0.9):

    x = chars_vector[:-1]
    y = chars_vector[1:]

    sample_count = len(x)-sample_length+1
    
    x_batches = []
    y_batches = []
    
    start_range = range(0, sample_count, sample_length)

    x_samples = np.array([x[start:start+sample_length] for start in start_range])
    y_samples = np.array([y[start:start+sample_length] for start in start_range])

    if sample_count > samples_per_batch:

        batch_count = len(x_samples) // samples_per_batch
        new_length = batch_count * samples_per_batch
        end_crop_count = len(x_samples)-new_length

        if end_crop_count != 0:
            x_samples = x_samples[:-end_crop_count]
            y_samples = y_samples[:-end_crop_count]

        x_batches = np.array(np.split(x_samples, batch_count))
        y_batches = np.array(np.split(y_samples, batch_count))
        
    else:
        
        x_batches = x_samples
        y_batches = y_samples
        
    
    split_idx = int(len(x_batches)*split_frac)

    train_x, train_y = x_batches[:split_idx], y_batches[:split_idx]
    val_x, val_y = x_batches[split_idx:], y_batches[split_idx:]
    
    return train_x, train_y, val_x, val_y

In [11]:
vector = np.arange(17)
samples_in_batch = 5
sample_length = 3
split_frac = 1.

tx, ty, _, _ = split_data(vector, samples_in_batch, sample_length, split_frac)

print('\ntx:')
print(tx)
print('\nty:')
print(ty)

print(tx.shape)


tx:
[[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]
  [ 9 10 11]
  [12 13 14]]]

ty:
[[[ 1  2  3]
  [ 4  5  6]
  [ 7  8  9]
  [10 11 12]
  [13 14 15]]]
(1, 5, 3)


In [12]:
def get_batch(tx, ty):
    for x, y in zip(tx, ty):
        yield x, y

#### Creating training and validation sets using function defined above

In [13]:
train_x, train_y, val_x, val_y = split_data(chars_vector=chars, 
                                            samples_per_batch=100, 
                                            sample_length=100)

In [14]:
print('train_x.shape: {}'.format(train_x.shape))
print('train_y.shape: {}'.format(train_y.shape))



train_x.shape: (178, 100, 100)
train_y.shape: (178, 100, 100)


In [15]:
train_x[0,:2,:]

array([[45, 35, 75, 52, 30, 76, 39, 13, 17, 74, 74, 74, 82, 75, 52, 52, 62,
        13, 56, 75, 67, 11, 73, 11, 76,  8, 13, 75, 39, 76, 13, 75, 73, 73,
        13, 75, 73, 11, 37, 76, 14, 13, 76,  3, 76, 39, 62, 13, 34, 28, 35,
        75, 52, 52, 62, 13, 56, 75, 67, 11, 73, 62, 13, 11,  8, 13, 34, 28,
        35, 75, 52, 52, 62, 13, 11, 28, 13, 11, 30,  8, 13, 58,  4, 28, 74,
         4, 75, 62, 78, 74, 74, 60,  3, 76, 39, 62, 30, 35, 11, 28],
       [20, 13,  4, 75,  8, 13, 11, 28, 13,  7, 58, 28, 56, 34,  8, 11, 58,
        28, 13, 11, 28, 13, 30, 35, 76, 13, 49, 51, 73, 58, 28,  8, 37, 62,
         8, 27, 13, 35, 58, 34,  8, 76, 78, 13, 55, 35, 76, 13,  4, 11, 56,
        76, 13, 35, 75, 79, 74, 79, 11,  8,  7, 58,  3, 76, 39, 76, 79, 13,
        30, 35, 75, 30, 13, 30, 35, 76, 13, 35, 34,  8, 51, 75, 28, 79, 13,
         4, 75,  8, 13,  7, 75, 39, 39, 62, 11, 28, 20, 13, 58, 28]], dtype=int32)

In [16]:
train_y[0, :2, :]

array([[35, 75, 52, 30, 76, 39, 13, 17, 74, 74, 74, 82, 75, 52, 52, 62, 13,
        56, 75, 67, 11, 73, 11, 76,  8, 13, 75, 39, 76, 13, 75, 73, 73, 13,
        75, 73, 11, 37, 76, 14, 13, 76,  3, 76, 39, 62, 13, 34, 28, 35, 75,
        52, 52, 62, 13, 56, 75, 67, 11, 73, 62, 13, 11,  8, 13, 34, 28, 35,
        75, 52, 52, 62, 13, 11, 28, 13, 11, 30,  8, 13, 58,  4, 28, 74,  4,
        75, 62, 78, 74, 74, 60,  3, 76, 39, 62, 30, 35, 11, 28, 20],
       [13,  4, 75,  8, 13, 11, 28, 13,  7, 58, 28, 56, 34,  8, 11, 58, 28,
        13, 11, 28, 13, 30, 35, 76, 13, 49, 51, 73, 58, 28,  8, 37, 62,  8,
        27, 13, 35, 58, 34,  8, 76, 78, 13, 55, 35, 76, 13,  4, 11, 56, 76,
        13, 35, 75, 79, 74, 79, 11,  8,  7, 58,  3, 76, 39, 76, 79, 13, 30,
        35, 75, 30, 13, 30, 35, 76, 13, 35, 34,  8, 51, 75, 28, 79, 13,  4,
        75,  8, 13,  7, 75, 39, 39, 62, 11, 28, 20, 13, 58, 28, 13]], dtype=int32)

### Building the model

In [17]:
def build_rnn(num_classes,
              samples_per_batch=50,
              sample_length=50,
              lstm_size=128, 
              num_layers=2,
              learning_rate=0.001, 
              grad_clip=5, 
              sampling=False):
    
    # When we're using this network for sampling later, we'll be passing in
    # one character at a time, so providing an option for that
    if sampling == True:
        samples_per_batch, sample_length = 1, 1

    tf.reset_default_graph()
    
    # Declare placeholders we'll feed into the graph
    inputs = tf.placeholder(tf.int32, [samples_per_batch, sample_length], name='inputs')
    targets = tf.placeholder(tf.int32, [samples_per_batch, sample_length], name='targets')
    
    # Keep probability placeholder for drop out layers
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    # One-hot encoding the input and target characters
    x_one_hot = tf.one_hot(inputs, num_classes)
    y_one_hot = tf.one_hot(targets, num_classes)

    ### Build the RNN layers
    # Use a basic LSTM cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)
    initial_state = cell.zero_state(samples_per_batch, tf.float32)

    ### Run the data through the RNN layers
    # This makes a list where each element is on step in the sequence
    rnn_inputs = [tf.squeeze(i, squeeze_dims=[1]) for i in tf.split(x_one_hot, sample_length, 1)]
    
    # Run each sequence step through the RNN and collect the outputs
    outputs, state = tf.contrib.rnn.static_rnn(cell, rnn_inputs, initial_state=initial_state)
    final_state = state
    
    # Reshape output so it's a bunch of rows, one output row for each step for each batch
    seq_output = tf.concat(outputs, axis=1)
    output = tf.reshape(seq_output, [-1, lstm_size])
    
    # Now connect the RNN outputs to a softmax layer
    with tf.variable_scope('softmax'):
        softmax_w = tf.Variable(tf.truncated_normal((lstm_size, num_classes), stddev=0.1))
        softmax_b = tf.Variable(tf.zeros(num_classes))
    
    # Since output is a bunch of rows of RNN cell outputs, logits will be a bunch
    # of rows of logit outputs, one for each step and batch
    logits = tf.matmul(output, softmax_w) + softmax_b
    
    # Use softmax to get the probabilities for predicted characters
    preds = tf.nn.softmax(logits, name='predictions')
    
    # Reshape the targets to match the logits
    y_reshaped = tf.reshape(y_one_hot, [-1, num_classes])
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped)
    cost = tf.reduce_mean(loss)

    # Optimizer for training, using gradient clipping to control exploding gradients
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    
    # Export the nodes
    # NOTE: I'm using a namedtuple here because I think they are cool
    export_nodes = ['inputs', 'targets', 'initial_state', 'final_state',
                    'keep_prob', 'cost', 'preds', 'optimizer']
    Graph = namedtuple('Graph', export_nodes)
    local_dict = locals()
    graph = Graph(*[local_dict[each] for each in export_nodes])
    
    return graph

## Training

In [18]:
samples_per_batch = 100
sample_length = 100 
lstm_size = 512
num_layers = 2
learning_rate = 0.002
keep_prob = 0.5

In [19]:
epoch_count = 1
# Save every N iterations
save_every_n = 50

print('Generating training and validation data...')
train_x, train_y, val_x, val_y = split_data(chars, samples_per_batch, sample_length)
print('train_x len: {}'.format(len(train_x)))
print('train_y len: {}'.format(len(train_y)))
print('val_x len: {}'.format(len(val_x)))
print('val_y len: {}'.format(len(val_y)))
print('Data generation complete.')

print('Building model...')
model = build_rnn(len(vocab_int), 
                  samples_per_batch=samples_per_batch,
                  sample_length=sample_length,
                  learning_rate=learning_rate,
                  lstm_size=lstm_size,
                  num_layers=num_layers)
print('Model built')

saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    n_batches = len(train_x)
    print('total batches: {}'.format(n_batches))
    
    iteration_count = n_batches * epoch_count
    print('total iteration count: {}'.format(iteration_count))
    
    for epoch in range(epoch_count):
        
        new_state = sess.run(model.initial_state)
        loss = 0
        
        for i, (x, y) in enumerate(get_batch(train_x, train_y)):
#             print(train_x[i])
#             print(x.shape)
            iteration = epoch*n_batches + i + 1
            start = time.time()
            
            feed = {model.inputs: x, 
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            
            batch_loss, new_state, _ = sess.run([model.cost, model.final_state, 
                                                 model.optimizer], 
                                                 feed_dict=feed)
            loss += batch_loss
            end = time.time()
            print('Epoch {}/{} '.format(epoch, epoch_count-1),
                  'Iteration {}/{}'.format(iteration, iteration_count),
                  'Training loss: {:.4f}'.format(loss/(i+1)),
                  '{:.4f} sec/batch'.format((end-start)))
            
            if (iteration%save_every_n == 0) or (iteration == iteration_count):
                # Check performance, notice dropout has been set to 1
                val_loss = []
                new_state = sess.run(model.initial_state)
                for x, y in get_batch(val_x, val_y):
                    feed = {model.inputs: x,
                            model.targets: y,
                            model.keep_prob: 1.,
                            model.initial_state: new_state}
                    batch_loss, new_state = sess.run([model.cost, model.final_state], feed_dict=feed)
                    val_loss.append(batch_loss)

                print('Validation loss:', np.mean(val_loss),
                      'Saving checkpoint!')
                saver.save(sess, "checkpoints/i{}_l{}_v{:.3f}.ckpt".format(iteration, lstm_size, np.mean(val_loss)))
        

Generating training and validation data...
train_x len: 178
train_y len: 178
val_x len: 20
val_y len: 20
Data generation complete.
Building model...
Model built
total batches: 178
total iteration count: 178
Epoch 0/0  Iteration 0/178 Training loss: 4.4236 6.4256 sec/batch
Validation loss: 4.27024 Saving checkpoint!
Epoch 0/0  Iteration 1/178 Training loss: 4.3506 5.3474 sec/batch
Epoch 0/0  Iteration 2/178 Training loss: 4.6816 5.5415 sec/batch
Epoch 0/0  Iteration 3/178 Training loss: 4.5882 5.6118 sec/batch
Epoch 0/0  Iteration 4/178 Training loss: 4.4483 5.8159 sec/batch
Epoch 0/0  Iteration 5/178 Training loss: 4.3196 5.6982 sec/batch
Epoch 0/0  Iteration 6/178 Training loss: 4.2095 5.6530 sec/batch
Epoch 0/0  Iteration 7/178 Training loss: 4.1144 5.7437 sec/batch
Epoch 0/0  Iteration 8/178 Training loss: 4.0389 5.6312 sec/batch
Epoch 0/0  Iteration 9/178 Training loss: 3.9698 5.6350 sec/batch
Epoch 0/0  Iteration 10/178 Training loss: 3.9030 5.6098 sec/batch
Epoch 0/0  Iteration 1

In [20]:
tf.train.get_checkpoint_state('checkpoints')

model_checkpoint_path: "checkpoints/i150_l512_v2.423.ckpt"
all_model_checkpoint_paths: "checkpoints/i0_l512_v4.270.ckpt"
all_model_checkpoint_paths: "checkpoints/i50_l512_v3.083.ckpt"
all_model_checkpoint_paths: "checkpoints/i100_l512_v2.788.ckpt"
all_model_checkpoint_paths: "checkpoints/i150_l512_v2.423.ckpt"

## Sampling

In [21]:
def pick_top_n(preds, vocab_size, top_n=5):
    p = np.squeeze(preds)
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c

In [22]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    samples = [c for c in prime]
    model = build_rnn(vocab_size, lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        for c in prime:
            x = np.zeros((1, 1))
            #x[0,0] = ord(c)# vocab_to_int[c]
            x[0,0] = vocab_to_int[c]
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.preds, model.final_state], 
                                         feed_dict=feed)

        c = pick_top_n(preds, len(vocab), 1)
        #samples.append(chr(c))#(int_to_vocab[c])
        samples.append(int_to_vocab[c])

        for i in range(n_samples):
            x[0,0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.preds, model.final_state], 
                                         feed_dict=feed)

            c = pick_top_n(preds, len(vocab))
            #samples.append(chr(c))# (int_to_vocab[c])
            samples.append(int_to_vocab[c])
        
    return ''.join(samples)

In [24]:
checkpoint = "checkpoints/i150_l512_v2.423.ckpt"
samp = sample(checkpoint, 2000, lstm_size, len(vocab), prime="The")
print(samp)

The sard solend the th ware hhe salede held sad and ont as tham tire wer orad whe hares atd ont to mer tot his the sat to lo the cer had anler ortant hes he tharse sos had whas, word
thes when whor athen. Ad anly onter, an sotise at to tele tinle the sorente he whan to simas athe anse the wolt he sot ot as te al thereran her of ard wal the he ardensit orat ant otin the te thin tislese she thar simas athang wel so sothe tha sar at hire at hor wore she she couthe he the asser sind afthe whr sos han and sard af and
athers, bath the ans ald warteses aler as olly. Anlderyinn hot wersid too tor the se the sore thes ase she the colles ortaring on anerand and and wald whes sh mer he shers wot ar ant an tha sar ond,
be so sar at ald wit her at hom
ther hit houg har ante wol sith the whr wan hire therensens he cererent an ald ald,.
"""
 heut sote siman asese wild hor wan toun tar too lethen att and so tal tint an hersothin wasd, and and onle and as otel te sere sis atito hhe san sorere ad sor ha