## Text-gen RNN

### Import relevant libraries

In [1]:
import time
from collections import namedtuple

import numpy as np
import tensorflow as tf

### Load text into a one huge string (millions of chars)

In [2]:
with open('anna.txt', 'r') as f:
    text=f.read()

In [3]:
print(text[:100])
print(len(text))

Chapter 1


Happy families are all alike; every unhappy family is unhappy in its own
way.

Everythin
1966145


### Create vocabulary : a set of all chars of which 'text' consists

In [4]:
vocab = set(text)

In [5]:
print(vocab)


{'1', 'p', 'k', 'v', 'j', 'I', 'u', 'g', '9', 'M', 'i', 'q', '3', ')', '`', '(', 'b', '_', 'X', '!', 'V', 'y', '\n', 'G', '8', '0', 'Y', 'S', "'", 'E', 'T', 'f', 'w', '?', 'm', '.', 'B', '2', 'h', 'A', '7', 'Q', 'l', '6', ';', ':', 'e', '"', 'Z', 'z', 'x', 'a', 'W', 'R', 'd', 'H', 'F', ',', 'U', 'J', 'O', 'o', '5', 'C', 't', 'K', 'r', 'c', 'n', '-', 'P', 'N', 'D', 'L', '4', 's', ' '}


### Create vocab_to_int and int_to_vocab. These are dictionaries. You won't need it. Hopefully

In [6]:
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = dict(enumerate(vocab))

In [7]:
print(vocab_to_int)
print()
print(int_to_vocab)

{'1': 0, 'v': 3, 'D': 72, 'R': 53, 'j': 4, 'I': 5, 'u': 6, 'g': 7, 'p': 1, 'M': 9, 'i': 10, 'q': 11, '3': 12, ')': 13, '`': 14, '(': 15, 'E': 29, '-': 69, 'b': 16, 'X': 18, '!': 19, 'V': 20, 'y': 21, '\n': 22, '0': 25, '4': 74, 'Y': 26, 'w': 32, 'U': 58, 'S': 27, "'": 28, 'G': 23, 'r': 66, 'T': 30, '8': 24, 'F': 56, 'f': 31, '?': 33, '2': 37, 'm': 34, '.': 35, 'B': 36, 'h': 38, '7': 40, 'Q': 41, 'l': 42, '6': 43, ';': 44, 'c': 67, ':': 45, 'e': 46, '9': 8, 'z': 49, 'x': 50, 'a': 51, 'W': 52, 'd': 54, 'H': 55, '"': 47, ',': 57, 'O': 60, 'o': 61, 'A': 39, 'J': 59, '5': 62, 'L': 73, 'C': 63, 't': 64, 's': 75, 'K': 65, 'k': 2, 'P': 70, 'n': 68, 'N': 71, '_': 17, 'Z': 48, ' ': 76}

{0: '1', 1: 'p', 2: 'k', 3: 'v', 4: 'j', 5: 'I', 6: 'u', 7: 'g', 8: '9', 9: 'M', 10: 'i', 11: 'q', 12: '3', 13: ')', 14: '`', 15: '(', 16: 'b', 17: '_', 18: 'X', 19: '!', 20: 'V', 21: 'y', 22: '\n', 23: 'G', 24: '8', 25: '0', 26: 'Y', 27: 'S', 28: "'", 29: 'E', 30: 'T', 31: 'f', 32: 'w', 33: '?', 34: 'm', 35: '.'

### Create an iteger representation of 'text' (millions of chars as ints)

In [8]:
#chars = np.array([ord(c) for c in text], dtype=np.int32)
chars = np.array([vocab_to_int[c] for c in text], dtype=np.int32)
print(chars[:100])

[63 38 51  1 64 46 66 76  0 22 22 22 55 51  1  1 21 76 31 51 34 10 42 10 46
 75 76 51 66 46 76 51 42 42 76 51 42 10  2 46 44 76 46  3 46 66 21 76  6 68
 38 51  1  1 21 76 31 51 34 10 42 21 76 10 75 76  6 68 38 51  1  1 21 76 10
 68 76 10 64 75 76 61 32 68 22 32 51 21 35 22 22 29  3 46 66 21 64 38 10 68]


## Data split

In [9]:
def split_data(chars, batch_size, num_steps, split_frac=0.9):
    """ 
    Split character data into training and validation sets, inputs and targets for each set.
    
    Arguments
    ---------
    chars: character array
    batch_size: Size of examples in each of batch
    num_steps: Number of sequence steps to keep in the input and pass to the network
    split_frac: Fraction of batches to keep in the training set
    
    
    Returns train_x, train_y, val_x, val_y
    """
    
    
    slice_size = batch_size * num_steps
    n_batches = int(len(chars) / slice_size)
    
    # Drop the last few characters to make only full batches
    x = chars[: n_batches*slice_size]
    y = chars[1: n_batches*slice_size + 1]
    
    # Split the data into batch_size slices, then stack them into a 2D matrix 
    x = np.stack(np.split(x, batch_size))
    y = np.stack(np.split(y, batch_size))
    
    # Now x and y are arrays with dimensions batch_size x n_batches*num_steps
    
    # Split into training and validation sets, keep the virst split_frac batches for training
    split_idx = int(n_batches*split_frac)
    train_x, train_y= x[:, :split_idx*num_steps], y[:, :split_idx*num_steps]
    val_x, val_y = x[:, split_idx*num_steps:], y[:, split_idx*num_steps:]
    
    return train_x, train_y, val_x, val_y

In [10]:
train_x, train_y, val_x, val_y = split_data(chars, 10, 200)

train_x.shape

(10, 176800)

In [11]:
def get_batch(arrs, num_steps):
    batch_size, slice_size = arrs[0].shape
    
    n_batches = int(slice_size/num_steps)
    for b in range(n_batches):
        yield [x[:, b*num_steps: (b+1)*num_steps] for x in arrs]

#### Creating training and validation sets using function defined above

### Building the model

In [12]:
def build_rnn(num_classes, batch_size=50, num_steps=50, lstm_size=128, num_layers=2,
              learning_rate=0.001, grad_clip=5, sampling=False):
        
    if sampling == True:
        batch_size, num_steps = 1, 1

    tf.reset_default_graph()
    
    # Declare placeholders we'll feed into the graph
    with tf.name_scope('inputs'):
        inputs = tf.placeholder(tf.int32, [batch_size, num_steps], name='inputs')
        x_one_hot = tf.one_hot(inputs, num_classes, name='x_one_hot')
    
    with tf.name_scope('targets'):
        targets = tf.placeholder(tf.int32, [batch_size, num_steps], name='targets')
        y_one_hot = tf.one_hot(targets, num_classes, name='y_one_hot')
        y_reshaped = tf.reshape(y_one_hot, [-1, num_classes])
    
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    # Build the RNN layers
    with tf.name_scope("RNN_layers"):
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
        cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)
    
    with tf.name_scope("RNN_init_state"):
        initial_state = cell.zero_state(batch_size, tf.float32)

    # Run the data through the RNN layers
    with tf.name_scope("RNN_forward"):
        rnn_inputs = [tf.squeeze(i, squeeze_dims=[1]) for i in tf.split(x_one_hot, num_steps, 1)]
        outputs, state = tf.contrib.rnn.static_rnn(cell, rnn_inputs, initial_state=initial_state)
    
    final_state = state
    
    # Reshape output so it's a bunch of rows, one row for each cell output
    with tf.name_scope('sequence_reshape'):
        seq_output = tf.concat(outputs, axis=1,name='seq_output')
        output = tf.reshape(seq_output, [-1, lstm_size], name='graph_output')
    
    # Now connect the RNN putputs to a softmax layer and calculate the cost
    with tf.name_scope('logits'):
        softmax_w = tf.Variable(tf.truncated_normal((lstm_size, num_classes), stddev=0.1),
                               name='softmax_w')
        softmax_b = tf.Variable(tf.zeros(num_classes), name='softmax_b')
        logits = tf.matmul(output, softmax_w) + softmax_b
        tf.summary.histogram('softmax_w', softmax_w)
        tf.summary.histogram('softmax_b', softmax_b)

    with tf.name_scope('predictions'):
        preds = tf.nn.softmax(logits, name='predictions')
        tf.summary.histogram('predictions', preds)
    
    
    with tf.name_scope('cost'):
        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped, name='loss')
        cost = tf.reduce_mean(loss, name='cost')
        tf.summary.scalar('cost', cost)

    # Optimizer for training, using gradient clipping to control exploding gradients
    with tf.name_scope('train'):
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)
        train_op = tf.train.AdamOptimizer(learning_rate)
        optimizer = train_op.apply_gradients(zip(grads, tvars))
    
    merged = tf.summary.merge_all()
    
    # Export the nodes 
    export_nodes = ['inputs', 'targets', 'initial_state', 'final_state',
                    'keep_prob', 'cost', 'preds', 'optimizer']
    Graph = namedtuple('Graph', export_nodes)
    local_dict = locals()
    graph = Graph(*[local_dict[each] for each in export_nodes])
    
    return graph

## Training

In [13]:
batch_size = 100
num_steps = 100
lstm_size = 512
num_layers = 2
learning_rate = 0.002
keep_prob = 0.5

### Write out the graph for TensorBoard

In [16]:
!mkdir -p checkpoints/anna

In [27]:
def train(model, epochs, log_string):
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train_writer = tf.summary.FileWriter('./logs/anna/train/{}'.format(log_string),
                                             sess.graph)
        test_writer = tf.summary.FileWriter('./logs/anna/test/{}'.format(log_string))

        # Use the line below to load a checkpoint and resume training
        #saver.restore(sess, 'checkpoints/anna20.ckpt')

        n_batches = int(train_x.shape[1]/num_steps)
        iterations = n_batches * epochs
        for e in range(epochs):

            # Train network
            new_state = sess.run(model.initial_state)
            loss = 0
            for b, (x, y) in enumerate(get_batch([train_x, train_y], num_steps), 1):
                iteration = e*n_batches + b
                start = time.time()
                feed = {model.inputs: x,
                        model.targets: y,
                        model.keep_prob: keep_prob,
                        model.initial_state: new_state}
                summary, batch_loss, new_state, _ = sess.run([model.merged, model.cost, 
                                                              model.final_state, model.optimizer], 
                                                              feed_dict=feed)
                
                loss += batch_loss
                end = time.time()
                print('Epoch {}/{} '.format(e+1, epochs),
                      'Iteration {}/{}'.format(iteration, iterations),
                      'Training loss: {:.4f}'.format(loss/b),
                      '{:.4f} sec/batch'.format((end-start)))

                train_writer.add_summary(summary, iteration)
                
                if (iteration%save_every_n == 0) or (iteration == iterations):
                    # Check performance, notice dropout has been set to 1
                    val_loss = []
                    new_state = sess.run(model.initial_state)
                    for x, y in get_batch([val_x, val_y], num_steps):
                        feed = {model.inputs: x,
                                model.targets: y,
                                model.keep_prob: 1.,
                                model.initial_state: new_state}
                        summary, batch_loss, new_state = sess.run([model.merged,
                                                                   model.cost, 
                                                                   model.final_state], 
                                                                  feed_dict=feed)
                        val_loss.append(batch_loss)

                    test_writer.add_summary(summary, iteration)

                    print('Validation loss:', np.mean(val_loss),
                          'Saving checkpoint!')
                    # Below command is commented out in Mat's version
                    saver.save(sess, 
                               "checkpoints/anna/i{}_l{}_{:.3f}.ckpt".format(iteration, 
                                                                             lstm_size, 
                                                                             np.mean(val_loss)))

In [20]:
epochs = 20
batch_size = 100
num_steps = 100
train_x, train_y, val_x, val_y = split_data(chars, batch_size, num_steps)

for lstm_size in [256,512]:
    for num_layers in [1, 2]:
        for learning_rate in [0.002]:
            log_string = 'lr={},rl={},ru={}'.format(learning_rate, num_layers, lstm_size)
            model = build_rnn(len(vocab), 
                    batch_size=batch_size,
                    num_steps=num_steps,
                    learning_rate=learning_rate,
                    lstm_size=lstm_size,
                    num_layers=num_layers)
            
            train(model, epochs, log_string)

Generating training and validation data...
Data generation complete.
Building model...
Model built
Epoch 1/1  Iteration 1/176 Training loss: 4.3479 6.4462 sec/batch
Epoch 1/1  Iteration 2/176 Training loss: 4.2779 5.6878 sec/batch
Epoch 1/1  Iteration 3/176 Training loss: 4.8322 6.1623 sec/batch
Epoch 1/1  Iteration 4/176 Training loss: 4.6433 6.3655 sec/batch
Epoch 1/1  Iteration 5/176 Training loss: 4.4711 6.5034 sec/batch
Epoch 1/1  Iteration 6/176 Training loss: 4.3322 5.7222 sec/batch
Epoch 1/1  Iteration 7/176 Training loss: 4.2070 5.6751 sec/batch
Epoch 1/1  Iteration 8/176 Training loss: 4.1048 5.9416 sec/batch
Epoch 1/1  Iteration 9/176 Training loss: 4.0189 5.8041 sec/batch
Epoch 1/1  Iteration 10/176 Training loss: 3.9459 5.7832 sec/batch
Epoch 1/1  Iteration 11/176 Training loss: 3.8844 5.6367 sec/batch
Epoch 1/1  Iteration 12/176 Training loss: 3.8335 5.6381 sec/batch
Epoch 1/1  Iteration 13/176 Training loss: 3.7908 5.6543 sec/batch
Epoch 1/1  Iteration 14/176 Training lo

In [22]:
tf.train.get_checkpoint_state('checkpoints/anna')

model_checkpoint_path: "checkpoints/anna/i176_l512_2.227.ckpt"
all_model_checkpoint_paths: "checkpoints/anna/i50_l512_3.062.ckpt"
all_model_checkpoint_paths: "checkpoints/anna/i100_l512_2.719.ckpt"
all_model_checkpoint_paths: "checkpoints/anna/i150_l512_2.322.ckpt"
all_model_checkpoint_paths: "checkpoints/anna/i176_l512_2.227.ckpt"

## Sampling

In [23]:
def pick_top_n(preds, vocab_size, top_n=5):
    p = np.squeeze(preds)
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c

In [24]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    samples = [c for c in prime]
    model = build_rnn(vocab_size, lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        for c in prime:
            x = np.zeros((1, 1))
            #x[0,0] = ord(c)# vocab_to_int[c]
            x[0,0] = vocab_to_int[c]
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.preds, model.final_state], 
                                         feed_dict=feed)

        c = pick_top_n(preds, len(vocab), 1)
        #samples.append(chr(c))#(int_to_vocab[c])
        samples.append(int_to_vocab[c])

        for i in range(n_samples):
            x[0,0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.preds, model.final_state], 
                                         feed_dict=feed)

            c = pick_top_n(preds, len(vocab))
            #samples.append(chr(c))# (int_to_vocab[c])
            samples.append(int_to_vocab[c])
        
    return ''.join(samples)

In [25]:
checkpoint = "checkpoints/anna/i176_l512_2.227.ckpt"
samp = sample(checkpoint, 2000, lstm_size, len(vocab), prime="Fra")
print(samp)

Frant ho hat wing the ande at tile there shen so ad onte thet and hom want on the her and ale the hing than the
ceraled tin ates tise toet the her thes tor the shars hom he adere soud, and her wald her wos timesen it hin hes sith thit and
saling hat han
tasile of ther arend. The chrate so se the wering the ather the some thet anteste at her hit he thise ans the thes has this whad ale he an on therens wes one ho wend here wout ot he sees and ane sarering the the sonl asin at than wer shat se wall of, here salening womhe thaterene and him to silad ant thicilg tha thar he was the went on aled ta has sore se wand ther andines. 
he aldit on outes he shad his aned that se sas it the had
ther wers on thar sat ard hese her so ware tir an on anting to she seen has wils and he the
hesrad thas sis onter taed ther had se thas the
chrins his sisthing the somint tha has hisesinge the hat whe she hade sad,, whis hes tind as te the sarer ons andithitg the sant his anden an tous, an he sasting had and 