# Char RNN: character-level language modelling

We will work with the CharRNN model for character-level language modelling. In a nutshell, this model aims at modelling the probability distribution of the next character given the sequence of previous characters. These models can be trained using any text source, since the text itself is the supervision signal for the task. Please see [this blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) for more examples.

The code in this tutorial is based on [this implementation](https://github.com/crazydonkey200/tensorflow-char-rnn) by *crazydonkey200*.

## Imports

In [None]:
import tensorflow as tf
import tensorflow.contrib.layers as layers

from util.text_tools import *

import time
import numpy as np

## Define the model

We will create a model consisting of an embedding layer, an RNN (a single GRU layer) and a fully connected layer. It is defined in a class with three methods:

- `__init__` instantiates the class and is in charge of creating the graph. It does not perform any computation.
- `run_epoch` runs the model once on all the data. We will use it for training the weights, although it can be used for evaluation on a validation set as well by setting `is_training=False`.
- `sample_seq` takes a start text and samples a sequence of characters from the model. It is used for inference on user-provided text.

In [None]:
class CharRNN(object):
    """Character-level RNN language model: embedding -> GRU -> linear layer + softmax.

    The constructor only builds the TensorFlow graph. Computation happens in
    `run_epoch` (one pass over a dataset, with or without weight updates) and
    in `sample_seq` (text generation).
    """

    def __init__(self, is_training, batch_size, num_unrollings, vocab_size, hidden_size=128, max_grad_norm=0.2,
                 embedding_size=15, learning_rate=0.01):
        """Build the graph of the model.

        Args:
            is_training: if True, `self.train_op` (Adam with gradient clipping) is also created.
            batch_size: number of sequences per batch (fixed in the placeholder shapes).
            num_unrollings: number of time steps per sequence (fixed in the placeholder shapes).
            vocab_size: number of distinct characters, i.e. number of output classes.
            hidden_size: number of units of the GRU cell.
            max_grad_norm: threshold used to clip the global norm of the gradients.
            embedding_size: dimensionality of the character embeddings.
            learning_rate: learning rate given to the Adam optimizer.
        """
        self.batch_size = batch_size
        self.num_unrollings = num_unrollings
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.max_grad_norm = max_grad_norm
        self.embedding_size = embedding_size
        self.input_size = embedding_size

        # Placeholder to feed in input and targets/labels data
        self.input_data = tf.placeholder(tf.int64, [self.batch_size, self.num_unrollings], name='inputs')
        self.targets = tf.placeholder(tf.int64, [self.batch_size, self.num_unrollings], name='targets')

        # Embeddings layer
        with tf.name_scope('embedding_layer'):
            self.embedding = tf.get_variable('embedding', [self.vocab_size, self.embedding_size])
            inputs = tf.nn.embedding_lookup(self.embedding, self.input_data)

        # Create a single-layer GRU cell.
        cell = tf.contrib.rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse)

        # The initial state for the RNN is the null vector
        with tf.name_scope('initial_state'):
            self.zero_state = cell.zero_state(self.batch_size, tf.float32)

        # Create a placeholder to propagate the RNN state between batches
        self.initial_state = tf.placeholder(tf.float32, self.zero_state.get_shape())

        # Create the graph for the RNN by unrolling it in time
        rnn_outputs, self.final_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=self.initial_state,
                                                          dtype=tf.float32)

        # Classification layer on top of the RNN
        with tf.variable_scope('softmax'):
            self.logits = layers.linear(inputs=rnn_outputs, num_outputs=vocab_size)
            self.probs = tf.nn.softmax(self.logits)

        # Compute mean cross entropy loss for each output
        with tf.name_scope('loss'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.targets)
            self.mean_loss = tf.reduce_mean(loss)

        # Track metrics (cross-entropy loss and perplexity) using TensorBoard.
        # The accumulators are bookkeeping, not model weights, so they are kept out of
        # tf.trainable_variables() (and therefore out of the gradient computation below).
        with tf.name_scope('loss_monitor'):
            count = tf.Variable(1.0, name='count', trainable=False)
            sum_mean_loss = tf.Variable(1.0, name='sum_mean_loss', trainable=False)

            self.reset_loss_monitor = tf.group(sum_mean_loss.assign(0.0), count.assign(0.0), name='reset_loss_monitor')
            self.update_loss_monitor = tf.group(sum_mean_loss.assign(sum_mean_loss + self.mean_loss),
                                                count.assign(count + 1), name='update_loss_monitor')
            # Fetching the average (or the perplexity) first accumulates the loss of the current batch
            with tf.control_dependencies([self.update_loss_monitor]):
                self.average_loss = sum_mean_loss / count
                self.ppl = tf.exp(self.average_loss)
        average_loss_summary = tf.summary.scalar("average_loss", self.average_loss)
        ppl_summary = tf.summary.scalar("perplexity", self.ppl)
        self.summaries = tf.summary.merge([average_loss_summary, ppl_summary], name='loss_monitor')

        # Track number of SGD steps (a counter, hence not trainable)
        self.global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0.0),
                                           trainable=False)

        # Create training op: (1) compute gradients, (2) clip their norm, and (3) update weights
        if is_training:
            optimizer = tf.train.AdamOptimizer(tf.constant(learning_rate))
            trainable_variables = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.mean_loss, trainable_variables), self.max_grad_norm)
            self.train_op = optimizer.apply_gradients(zip(grads, trainable_variables), global_step=self.global_step)

    def run_epoch(self, session, data_size, batch_generator, is_training, summary_writer=None, epoch_number=None):
        """Run the model on the given data for one full pass, optionally training the model weights.

        Args:
            session: TensorFlow session used to run the graph.
            data_size: number of characters in the data; determines the number of steps.
            batch_generator: object whose `next()` method returns one batch per call.
            is_training: if True, `self.train_op` is run at every step (the model must have been
                built with `is_training=True`).
            summary_writer: optional summary writer; summaries are added every 100 steps.
            epoch_number: optional epoch index, only used as a prefix in the printed progress.

        Returns:
            A tuple `(ppl, summary_str)` with the perplexity (exponential of the running average
            loss) after the last step and the serialized summaries of the last step.

        Raises:
            ValueError: if `data_size` does not yield at least one step.
        """
        chars_per_step = self.batch_size * self.num_unrollings

        # Ceiling division: a trailing partial chunk of data still counts as one step
        epoch_size = data_size // chars_per_step
        if data_size % chars_per_step != 0:
            epoch_size += 1
        if epoch_size <= 0:
            raise ValueError("data_size must be positive to run an epoch, got %r" % (data_size,))

        # Only fetch the training op when training; this avoids adding a new no-op node
        # to the graph on every evaluation call.
        fetches = [self.average_loss, self.final_state, self.summaries]
        if is_training:
            fetches.append(self.train_op)

        # Prepare initial state and reset the average loss computation.
        state = session.run(self.zero_state)
        self.reset_loss_monitor.run()
        start_time = time.time()
        for step in range(epoch_size):
            # Generate the batch and use [:-1] as inputs and [1:] as targets.
            # The transpose presumably turns time-major data into [batch, time] - verify in the generator.
            data = batch_generator.next()
            inputs = np.array(data[:-1]).transpose()
            targets = np.array(data[1:]).transpose()

            # The final state of this batch is fed as the initial state of the next one
            feed_dict = {self.input_data: inputs, self.targets: targets, self.initial_state: state}
            average_loss, state, summary_str = session.run(fetches, feed_dict)[:3]

            if summary_writer and (step+1) % 100 == 0:
                summary_writer.add_summary(summary_str, session.run(self.global_step))

            ppl = np.exp(average_loss)
            if (step+1) % 500 == 0:
                if epoch_number is not None:
                    print("[Epoch %d] " % epoch_number, end="")
                print("%.1f%%, step:%d, perplexity: %.3f, speed: %.0f words per sec" %
                      ((step + 1) * 1.0 / epoch_size * 100, step, ppl,
                      (step + 1) * chars_per_step /
                      (time.time() - start_time)))

        if epoch_number is not None:
            print("[Epoch %d] " % epoch_number, end="")
        print("Perplexity: %.3f, speed: %.0f words per sec\n" %
              (ppl, epoch_size * chars_per_step / (time.time() - start_time)))
        return ppl, summary_str

    def sample_seq(self, session, length, start_text, vocab_index_dict,
                 index_vocab_dict, temperature=1.0, deterministic=True):
        """Generate `length` characters, one at a time, optionally continuing `start_text`.

        The inputs fed to the graph have shape (1, 1), so the model must have been built
        with `batch_size=1` and `num_unrollings=1`.

        Args:
            session: TensorFlow session used to run the graph.
            length: number of characters to generate.
            start_text: text used to warm up the RNN state. If None or empty, a random
                character index is used as the first input instead.
            vocab_index_dict: lookup table passed to `char2id` to encode characters.
            index_vocab_dict: lookup table passed to `id2char` to decode sampled indices.
            temperature: softmax temperature used when sampling; ignored when `deterministic` is True.
            deterministic: if True, always pick the most likely character; otherwise sample
                from the temperature-scaled distribution.

        Returns:
            `start_text` (if given) followed by the generated characters, as a single string.

        Raises:
            ValueError: if sampling is requested with a non-positive temperature.
        """
        if not deterministic and temperature <= 0:
            raise ValueError("temperature must be positive when sampling, got %r" % (temperature,))

        state = session.run(self.zero_state)

        # use start_text to warm up the RNN.
        if start_text is not None and len(start_text) > 0:
            seq = list(start_text)
            for char in start_text[:-1]:
                x = np.array([[char2id(char, vocab_index_dict)]])
                state = session.run(self.final_state,
                                    {self.input_data: x,
                                     self.initial_state: state})
            # The last character is not consumed here: it is the first input of the sampling loop
            x = np.array([[char2id(start_text[-1], vocab_index_dict)]])
        else:
            vocab_size = len(vocab_index_dict)
            x = np.array([[np.random.randint(0, vocab_size)]])
            seq = []

        for i in range(length):
            state, logits = session.run([self.final_state,
                                         self.logits],
                                        {self.input_data: x,
                                         self.initial_state: state})
            # logits is indexed as [batch][time][vocab]; keep the vector for the single step
            step_logits = logits[0][0]

            if deterministic:
                # Temperature scaling does not change the most likely character
                sample = np.argmax(step_logits)
            else:
                # Subtract the maximum before exponentiating for numerical stability
                unnormalized_probs = np.exp((step_logits - np.max(step_logits)) / temperature)
                probs = unnormalized_probs / np.sum(unnormalized_probs)
                sample = np.random.choice(self.vocab_size, 1, p=probs)[0]

            seq.append(id2char(sample, index_vocab_dict))
            # Feed the sampled character back as the next input
            x = np.array([[sample]])
        return ''.join(seq)

## Hyperparameters

In [None]:
# Number of sequences per training batch
batch_size = 32
# Number of time steps (characters) in each training sequence
num_unrollings = 30
# Number of full passes over the training text
num_epochs = 5

# Path to the plain-text file used as training data
text_file = 'data/tiny_shakespeare.txt'

## Load data

In [None]:
def create_vocab(text):
    """Build the character <-> index lookup tables for `text`.

    Characters are indexed in sorted order, so the same text always yields the
    same mapping (iterating over a plain `set` of strings can give a different
    order on every interpreter run).

    Args:
        text: string whose distinct characters form the vocabulary.

    Returns:
        A tuple `(vocab_index_dict, index_vocab_dict, vocab_size)`: the
        character -> index table, the index -> character table and the number
        of distinct characters.
    """
    unique_chars = sorted(set(text))
    vocab_index_dict = {char: i for i, char in enumerate(unique_chars)}
    index_vocab_dict = dict(enumerate(unique_chars))
    return vocab_index_dict, index_vocab_dict, len(unique_chars)

# Read the whole training text into memory as a single string
with open(text_file, 'r', encoding='utf-8') as f:
    train_text = f.read()
# Build the character <-> index lookup tables from the characters present in the text
vocab_index_dict, index_vocab_dict, vocab_size = create_vocab(train_text)
print("Vocabulary (", vocab_size, "elements ): ", sorted(vocab_index_dict.keys()))

# Batch provider for training; BatchGenerator is expected to come from util.text_tools (star import above)
train_batches = BatchGenerator(train_text, batch_size, num_unrollings, vocab_size, vocab_index_dict, index_vocab_dict)

## Create graph

In [None]:
# Reset graph in case we already created one and want to change hyperparameters
tf.reset_default_graph()

# Training graph: built with the training batch size and sequence length, and with a train op
with tf.name_scope('training'):
    train_model = CharRNN(is_training=True, batch_size=batch_size, num_unrollings=num_unrollings, 
                          vocab_size=vocab_size)

# Both models share weights: from here on, tf.get_variable returns the variables
# created by the training model instead of creating new ones
tf.get_variable_scope().reuse_variables()

# Evaluation graph: no train op, and it processes a single character at a time
# (batch_size=1, num_unrollings=1), which is the input shape sample_seq feeds
with tf.name_scope('evaluation'):
    test_model = CharRNN(is_training=False, batch_size=1, num_unrollings=1, vocab_size=vocab_size)

## Train loop

In [None]:
# Create session (it is left open on purpose: the sampling cell below reuses it)
sess = tf.InteractiveSession()

# Initialize variables
sess.run(tf.global_variables_initializer())

# TensorBoard: create an empty logdir and the writer
# (any logs from a previous run stored in logdir are deleted first)
logdir = 'tensorboard_logs/char_rnn'
if tf.gfile.Exists(logdir):
    tf.gfile.DeleteRecursively(logdir)
if not tf.gfile.Exists(logdir):
    tf.gfile.MakeDirs(logdir)
summary_writer = tf.summary.FileWriter(logdir)

# Train the model: one call to run_epoch per pass over the training text
for epoch in range(num_epochs):
    train_model.run_epoch(sess, len(train_text), train_batches, is_training=True, 
                          summary_writer=summary_writer, epoch_number=epoch+1)
# Make sure all pending summaries are written to disk before closing the writer
summary_writer.flush()
summary_writer.close()

print("Done!")

## Sample sentences from the trained model

In [None]:
# Number of characters to generate after the start text
num_samples = 150
# If True, always pick the most likely character; if False, sample from the predicted distribution
deterministic = False
# Temperature dividing the logits before the softmax when sampling
softmax_temperature = 0.5
# Text used to warm up the RNN state; the generated characters are appended to it
start_text = "The meaning of life is "

# The evaluation model is used because it processes one character at a time
sample = test_model.sample_seq(sess, num_samples, start_text, vocab_index_dict, index_vocab_dict, 
                               softmax_temperature, deterministic)
print('Sampled text is:\n%s' % sample)

## Possible extensions

- Reduce the learning rate when the loss plateaus.
- Split data into train/val/test for a better experimental setup. Tune hyperparameters to maximize performance on the validation set.
- Add more capacity to the model: more layers (check `tf.contrib.rnn.MultiRNNCell`) and more units per layer.
- If the model capacity is increased, regularization may be needed to avoid overfitting. For instance, dropout between RNN layers (check `tf.contrib.rnn.DropoutWrapper`).
- Train the model on your own data!