# Character-level Language Models
As seen in the previous example, RNNs work well on sequential data. In this notebook, we train character-level RNN language models, i.e. we give the RNN a huge chunk of text and ask it to model the probability distribution of the next character given the sequence of previous characters. This then allows us to generate new text one character at a time.

We recall the vanilla-RNNs dynamics
$$
\begin{array}{rl}
h_t &= \tanh\left(x_t\times W_{xh} + h_{t-1}\times W_{hh} + b_{h}\right)\\
o_t &= \mathrm{softmax}\left(h_t\times W_{ho} + b_{o}\right)
\end{array}
$$
where 
* $x_t$ is one-hot encoding of an input character
* $W_{xh}$ is the input-to-hidden weight matrix
* $W_{hh}$ is the hidden-to-hidden weight matrix
* $W_{ho}$ is the hidden-to-output weight matrix
* $b_h$ and $b_o$ are the biases

Here we use $o_t$ to model the  conditional distribution
$$
P(x_{t+1}=j| x_{\leq t}) = o_t[j]
$$

First we import the libraries we need and define the dataset

In [None]:
import tensorflow as tf
import numpy as np
import sys
import time

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

if '../common' not in sys.path:
    sys.path.insert(0, '../common')

from rnn.mrnn import BasicMRNNCell                

# Training dataset
We will train RNNs model on Anna Karenina (~2Mb).

## Pre-processing
First we need to do the following pre-processing
* get the set of all characters
* get the map character to ids and vice-versa
* convert text to ids



In [None]:
# read the whole corpus into memory
with open('../common/data/anna.txt', 'r') as f:
    text = f.read()

# get all unique characters
# NOTE: the vocabulary is sorted on purpose. Iteration order of a set of
# strings changes from one Python process to the next (hash randomisation),
# so an unsorted vocabulary would give different char <-> id maps in every
# kernel session and checkpoints restored later would decode to garbage.
vocabs = sorted(set(text))

# get the map char-to-id and vice-versa (both derived from the same ordering)
vocab_to_id = {c: i for i, c in enumerate(vocabs)}
id_to_vocab = dict(enumerate(vocabs))

# convert text-input into ids
char_ids = np.array([vocab_to_id[c] for c in text], dtype=np.int32)

Let's check out the first 50 characters in text & ids

In [None]:
# show the first 50 characters and the ids they were encoded to
print(text[:50])
print(char_ids[:50])

## Mini-batches
Now we want to split the data into mini-batches and into training and validation sets. We implement this in the following helper functions.

In [None]:
def split_data(char_ids, batch_size, seq_len, split_frac = 0.9):
    """Arrange a 1-D id sequence into batch rows and split train/validation.

    The sequence is truncated to a whole number of (batch_size * seq_len)
    slices, keeping at least one trailing id so that every input has a
    target. Inputs and targets (inputs shifted by one position) are each
    cut into `batch_size` contiguous rows. The first
    int(nb_batches * split_frac) * seq_len columns form the training set,
    the remaining columns the validation set.

    Returns the tuple (train_x, train_y, val_x, val_y).
    """
    ids_per_batch = batch_size * seq_len
    n_batches = (len(char_ids) - 1) // ids_per_batch
    usable = n_batches * ids_per_batch

    def as_rows(flat):
        # one contiguous stretch of the sequence per batch row
        return np.stack(np.split(flat, batch_size))

    inputs = as_rows(char_ids[:usable])
    # targets are the inputs shifted one position to the right
    targets = as_rows(char_ids[1:usable + 1])

    # the column where the validation part starts (a multiple of seq_len)
    cut = int(n_batches * split_frac) * seq_len

    training = (inputs[:, :cut], targets[:, :cut])
    validation = (inputs[:, cut:], targets[:, cut:])
    return training + validation

def get_batches(train_inputs, train_targets, seq_len):
    """Yield consecutive (inputs, targets) windows of `seq_len` columns.

    Both arrays are sliced along their second axis with the same window.
    Trailing columns that do not fill a whole window are not yielded.
    """
    n_windows = train_inputs.shape[1] // seq_len
    for start in range(0, n_windows * seq_len, seq_len):
        stop = start + seq_len
        yield train_inputs[:, start:stop], train_targets[:, start:stop]
        
def pick_top_idx(top_prob, top_idx):
    """Randomly pick one entry of `top_idx`, weighted by `top_prob`.

    The weights are renormalised to sum to one before sampling, so
    `top_prob` does not have to be a complete distribution.
    """
    weights = top_prob / np.sum(top_prob)
    (position,) = np.random.choice(len(top_prob), 1, p=weights)
    return top_idx[position]

# Assemble an RNN model
As in the previous post, we will use TensorFlow to create an RNN model using the following functions
* [`tf.one_hot`](https://www.tensorflow.org/api_docs/python/tf/one_hot) to convert target into one-hot representation
* [`tf.contrib.rnn.BasicRNNCell`](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/BasicRNNCell) to model a basic RNN cell
* [`tf.nn.dynamic_rnn`](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn) to perform fully dynamic unrolling of our rnn i.e we compute the final state of our RNN

In [None]:
# RNN character-level language model
class CharRnn(object):
    """Character-level RNN language model living in its own TensorFlow graph.

    Constructing an instance builds the complete graph: one-hot encoded
    inputs/targets, a (possibly stacked) recurrent network of the requested
    cell type, a softmax output layer with cross-entropy cost, a
    gradient-clipped Adam training op, a saver, and the top-3 sampling ops.
    """

    def __init__(self, vocabs, vocab_to_id, id_to_vocab,
                 cell_type, rnn_size, batch_size, seq_len,
                 num_factors=3, num_layers=2, learning_rate=0.001):
        """Store the hyper-parameters and build the graph.

        Args:
            vocabs: collection of all characters; only its length is used.
            vocab_to_id: mapping character -> integer id.
            id_to_vocab: mapping integer id -> character.
            cell_type: one of 'rnn', 'mrnn', 'lstm', 'gru'.
            rnn_size: number of hidden units per recurrent layer.
            batch_size: fixed first dimension of the input placeholders.
            seq_len: window length used when iterating over batches.
            num_factors: second argument of BasicMRNNCell ('mrnn' only).
            num_layers: number of stacked recurrent layers (>= 1).
            learning_rate: learning rate of the Adam optimizer.

        Raises:
            AssertionError: on an unknown cell_type or num_layers < 1.
        """
        # set input
        self._vocabs = vocabs
        self._vocabs_size = len(vocabs)

        self._vocab_to_id = vocab_to_id
        self._id_to_vocab = id_to_vocab
        self._rnn_size = rnn_size
        self._batch_size = batch_size
        self._seq_len = seq_len
        self._cell_type = cell_type
        self._num_factors = num_factors
        self._num_layers = num_layers
        self._lr = learning_rate

        # check input
        assert self._cell_type in ['rnn', 'mrnn', 'lstm', 'gru'], \
            'unknown cell_type: {!r}'.format(self._cell_type)
        assert self._num_layers >= 1, 'num_layers must be >= 1'

        # build graph
        self.build_graph()

    def build_graph(self):
        """Create a fresh graph and add every layer/op of the model to it."""
        self._graph = tf.Graph()

        # create placeholder for input/target
        self._create_placeholder()

        # create rnn layers
        self._create_rnn()

        # create loss/cost layers
        self._create_loss()

        # create train-op & saver
        self._create_train_op_saver()

        # create sample
        self._create_sample()

    def _create_placeholder(self):
        """Create the input/target/keep-prob placeholders and one-hot views."""
        with self._graph.as_default():
            # input & target has shape [batch_size, seq_len]; the sequence
            # dimension is left open so that sampling can feed one step
            self._inputs = tf.placeholder(tf.int32, [self._batch_size, None], name='inputs')
            self._targets = tf.placeholder(tf.int32, [self._batch_size, None], name='targets')

            # convert to one-hot encoding
            self._inputs_one_hot = tf.one_hot(self._inputs, self._vocabs_size)
            self._targets_one_hot = tf.one_hot(self._targets, self._vocabs_size)

            # Keep probability placeholder for drop out layers
            self._keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    def _make_cell(self):
        """Return a new, unwrapped recurrent cell of the configured type."""
        if self._cell_type == 'rnn':
            return tf.contrib.rnn.BasicRNNCell(self._rnn_size)
        if self._cell_type == 'mrnn':
            return BasicMRNNCell(self._rnn_size, self._num_factors)
        if self._cell_type == 'lstm':
            return tf.contrib.rnn.BasicLSTMCell(self._rnn_size)
        # __init__ has already restricted cell_type, so this is 'gru'
        return tf.contrib.rnn.GRUCell(self._rnn_size)

    def _create_rnn(self):
        """Build the recurrent network and unroll it over the inputs."""
        with self._graph.as_default():
            with tf.variable_scope('rnn_scopes') as vs:
                if self._num_layers == 1:
                    # a single layer is used as-is, without dropout
                    self._cell = self._make_cell()
                else:
                    # One cell object per layer: handing the same object to
                    # MultiRNNCell several times makes the layers share (or
                    # fail to create) their weights in TensorFlow >= 1.1.
                    layers = [
                        tf.contrib.rnn.DropoutWrapper(self._make_cell(),
                                                      output_keep_prob=self._keep_prob)
                        for _ in range(self._num_layers)
                    ]
                    self._cell = tf.contrib.rnn.MultiRNNCell(layers)

                # get initial_state
                self._initial_state = self._cell.zero_state(self._batch_size, dtype=tf.float32)

                # run rnn through inputs to create outputs & final-state
                self._outputs, self._final_state = tf.nn.dynamic_rnn(
                    self._cell,
                    self._inputs_one_hot,
                    initial_state=self._initial_state)

                # Retrieve just the RNNs variables.
                self._rnn_variables = [v for v in tf.global_variables()
                                       if v.name.startswith(vs.name)]

    def _create_loss(self):
        """Add the softmax layer and the mean cross-entropy cost."""
        with self._graph.as_default():
            # create softmax-weight & biases
            init_stddev = 1.0 / np.sqrt(self._vocabs_size)
            self._softmax_weights = tf.Variable(
                tf.truncated_normal([self._rnn_size, self._vocabs_size], stddev=init_stddev),
                name='softmax_w')
            self._softmax_biases = tf.Variable(tf.zeros(self._vocabs_size), name='softmax_b')

            # flatten batch and time into one axis so that a single matmul
            # and softmax_cross_entropy_with_logits cover every time-step
            outputs_flat = tf.reshape(self._outputs, [-1, self._rnn_size])
            targets_flat = tf.reshape(self._targets_one_hot, [-1, self._vocabs_size])

            # compute logits (input to softmax)
            self._logits = tf.matmul(outputs_flat, self._softmax_weights) + self._softmax_biases

            # compute the cross-entropy loss at each time-step
            self._loss = tf.nn.softmax_cross_entropy_with_logits(logits=self._logits,
                                                                 labels=targets_flat)

            # cost is the reduce_mean of loss at all time-step
            self._cost = tf.reduce_mean(self._loss)

    def _create_train_op_saver(self):
        """Add the gradient-clipped Adam training op and the checkpoint saver."""
        with self._graph.as_default():
            # apply gradient clipping to control exploding gradients
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(self._cost, tvars), 5.0)

            # create train-op with gradient clipping
            optimizer = tf.train.AdamOptimizer(learning_rate=self._lr)
            self._train_op = optimizer.apply_gradients(zip(grads, tvars))

            # create saver
            self._saver = tf.train.Saver(max_to_keep=100)

    def _validation_loss(self, sess, val_inputs, val_targets):
        """Return the mean per-batch cost over the validation set.

        Dropout is disabled (keep_prob = 1.0) and the recurrent state is
        carried from one validation batch to the next. Returns NaN when the
        validation set holds no complete batch instead of dividing by zero.
        """
        val_nb_batches = val_inputs.shape[1] // self._seq_len
        if val_nb_batches == 0:
            return float('nan')

        # reset state for validation set
        val_state = sess.run(self._initial_state)
        val_loss = 0.
        for val_x, val_y in get_batches(val_inputs, val_targets, self._seq_len):
            batch_loss, val_state = sess.run([self._cost, self._final_state],
                                             feed_dict={self._inputs: val_x,
                                                        self._targets: val_y,
                                                        self._keep_prob: 1.0,
                                                        self._initial_state: val_state})
            val_loss += batch_loss
        return val_loss / val_nb_batches

    def train(self, train_inputs, train_targets,
              val_inputs, val_targets,
              epochs, save_every=50,
              save_dir='checkpoints', keep_prob=0.5):
        """Train the model and periodically validate and checkpoint it.

        Args:
            train_inputs, train_targets: 2-D id arrays, iterated in windows
                of `seq_len` columns.
            val_inputs, val_targets: same layout, used for validation.
            epochs: number of passes over the training arrays.
            save_every: validate and save every this many iterations (and
                always at the very last iteration).
            save_dir: directory receiving the checkpoints; it is created if
                it does not exist.
            keep_prob: dropout keep probability fed during training.
        """
        # local import: the notebook's import cell does not provide `os`
        import os

        # the saver is not relied upon to create missing directories
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)

        with tf.Session(graph=self._graph) as sess:
            # initialize variable
            sess.run(tf.global_variables_initializer())

            # compute nb of iterations
            nb_batches = train_inputs.shape[1] // self._seq_len
            nb_iters = epochs * nb_batches

            iteration = 0
            for e in range(epochs):
                # reset initial-state to 0
                new_state = sess.run(self._initial_state)
                train_loss = 0.
                b = 0
                for inputs, targets in get_batches(train_inputs, train_targets, self._seq_len):
                    # run the training-op
                    # note that the final state of one batch should be used
                    # as initial-state of next batch
                    start = time.time()
                    batch_loss, new_state, _ = sess.run(
                        [self._cost, self._final_state, self._train_op],
                        feed_dict={self._inputs: inputs,
                                   self._targets: targets,
                                   self._keep_prob: keep_prob,
                                   self._initial_state: new_state})
                    end = time.time()

                    train_loss += batch_loss
                    b += 1
                    iteration += 1

                    sys.stdout.write('\rEpoch {}/{}'.format(e+1, epochs) +
                                     ' Iteration {}/{}'.format(iteration, nb_iters) +
                                     ' Training loss: {:.4f}'.format(train_loss/b) +
                                     ' Running {:.4f} sec/batch'.format((end-start)))

                    if (iteration % save_every == 0) or (iteration == nb_iters):
                        # run rnn and measure the loss on validation set
                        val_loss = self._validation_loss(sess, val_inputs, val_targets)

                        # report validation loss & save down checkpoints
                        print('\nValidation loss: {:.4f}'.format(val_loss), 'Saving checkpoint!\n')
                        save_path = '{}/cell_{}_i{}_l{}_v{:.4f}.ckpt'.format(save_dir,
                                                                             self._cell_type,
                                                                             iteration,
                                                                             self._rnn_size,
                                                                             val_loss)
                        self._saver.save(sess, save_path)

    def _create_sample(self):
        """Add ops giving the three most probable next ids and their probabilities."""
        with self._graph.as_default():
            dist = tf.nn.softmax(self._logits)
            top_probs, top_indices = tf.nn.top_k(dist, k=3)
            self._top_probs = tf.reshape(top_probs, [-1])
            self._top_indices = tf.reshape(top_indices, [-1])

    def load_checkpoint(self, checkpoint):
        """Open a session on this graph and restore `checkpoint` into it.

        The caller owns the returned session and should close it when done.
        """
        sess = tf.Session(graph=self._graph)
        self._saver.restore(sess, checkpoint)
        return sess

    def _sample_step(self, sess, c_id, state):
        """Feed one character id and return (new_state, top_prob, top_idx)."""
        inputs = np.array([c_id]).reshape([1, 1])
        return sess.run([self._final_state, self._top_probs, self._top_indices],
                        feed_dict={self._inputs: inputs,
                                   self._keep_prob: 1.0,
                                   self._initial_state: state})

    def sample_text(self, sess, sample_len, prime='The '):
        '''
        Generate new text that continues the given text (prime).

        The model is first run over every character of `prime` starting from
        the zero state; after that each generated character is drawn among
        the three most probable ones and fed back as the next input.

        The inputs fed here have shape [1, 1], so the model must have been
        built with batch_size = 1.

        Args:
            sess: session holding the trained variables (see load_checkpoint).
            sample_len: number of characters to generate; at least one
                character is always produced.
            prime: non-empty seed text; every character must be a key of
                vocab_to_id.

        Returns:
            The generated characters as one string (without the prime).

        Raises:
            ValueError: if `prime` is empty, since there is then no
                prediction to start sampling from.
        '''
        if not prime:
            raise ValueError('prime must contain at least one character')

        new_state = sess.run(self._initial_state)

        # warm up the recurrent state on the prime text
        for c in prime:
            new_state, top_prob, top_idx = self._sample_step(sess, self._vocab_to_id[c], new_state)

        # the first sample comes from the prediction after the last prime char
        c_id = pick_top_idx(top_prob, top_idx)
        samples = [self._id_to_vocab[c_id]]

        for _ in range(sample_len - 1):
            # forward a single time-step on the previously sampled character
            new_state, top_prob, top_idx = self._sample_step(sess, c_id, new_state)
            c_id = pick_top_idx(top_prob, top_idx)
            samples.append(self._id_to_vocab[c_id])

        return ''.join(samples)
            

# Training RNNs
In this section, we will train our RNNs with various cell-type
* [`BasicRNNCell`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/rnn/BasicRNNCell)
* [`BasicLSTMCell`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/rnn/BasicLSTMCell)
* [`GRUCell`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/rnn/GRUCell)
* [`BasicMRNNCell`](http://www.icml-2011.org/papers/524_icmlpaper.pdf)

First create checkpoint directory so we can store trained-model's checkpoint.

In [None]:
# create dir to store checkpoints
!mkdir checkpoints/crnn

## Train with BasicRNNCell
Before training with BasicRNNCell, we inspect the shapes of the variables

In [None]:
# hyper parameters of the vanilla-RNN model
cell_type = 'rnn'
num_layers = 2
rnn_size, batch_size, seq_len = 256, 128, 64

# build the model; building it is enough to inspect the variable shapes
crnn = CharRnn(vocabs, vocab_to_id, id_to_vocab, cell_type,
               rnn_size=rnn_size,
               batch_size=batch_size,
               seq_len=seq_len,
               num_layers=num_layers)

# report the shapes (only done for stacked models, whose state is a tuple)
if num_layers > 1:
    state_report = 'initial_state is a tuple of len {} each has shape \n\t{} i.e (batch_size, rnn_size)\n'
    first_state_shape = crnn._initial_state[0].get_shape()
    print(state_report.format(len(crnn._initial_state), first_state_shape))

    print('rnn weights and biases:')
    for var in crnn._rnn_variables:
        var_shape = var.get_shape()
        print('\t{:<65} rank {} shape {}'.format(var.name, var_shape.ndims, var_shape.as_list()))

    print('at each layer:\n\tweights should has shape [input_dim + hidden_dim, hidden_dim]',
          '\n\tbiases should has shape  [hidden_dim]')
    
    

## Train with MRNN cell
We try out the MRNN cell

In [None]:
# hyper parameters of the multiplicative-RNN model
cell_type = 'mrnn'
num_layers, num_factors = 1, 3
rnn_size, batch_size, seq_len = 256, 128, 64

# build the model graph
crnn = CharRnn(vocabs, vocab_to_id, id_to_vocab, cell_type,
               rnn_size=rnn_size,
               batch_size=batch_size,
               seq_len=seq_len,
               num_factors=num_factors,
               num_layers=num_layers)

# arrange the encoded text into batch rows and split off the validation part
train_x, train_y, val_x, val_y = split_data(char_ids, batch_size, seq_len)


It's time to train MRNN cell

In [None]:
# training settings: dropout keep probability and number of passes
keep_prob = 0.5
epochs = 50

# validate and checkpoint every 500 iterations
crnn.train(train_x, train_y, val_x, val_y,
           epochs=epochs, keep_prob=keep_prob,
           save_every=500, save_dir='checkpoints/crnn')

Let's use this to generate some new text

In [None]:
# Sampling feeds one character at a time, so the graph is rebuilt with
# batch_size = 1. Every other hyper-parameter (including num_factors) is
# forwarded so the variables match the ones stored in the checkpoint.
val_crnn = CharRnn(vocabs, vocab_to_id, id_to_vocab, cell_type,
                   num_factors=num_factors, rnn_size=rnn_size, batch_size=1,
                   seq_len=seq_len, num_layers=num_layers)

# Same naming scheme as CharRnn.train: cell_<type>_i<iteration>_l<rnn_size>_v<val_loss>.
# The iteration and validation loss below come from one particular run;
# replace them with the values printed by your own training run.
checkpoint = 'checkpoints/crnn/cell_{}_i{}_l{}_v{:.4f}.ckpt'.format(
    cell_type, 10850, rnn_size, 1.7655)

prime = 'Happy families are '
sess = val_crnn.load_checkpoint(checkpoint)
try:
    new_text = val_crnn.sample_text(sess, 200, prime=prime)
finally:
    # release the session's resources even if sampling fails
    sess.close()
print ('Prime:  {}\nSample: {}'.format(prime, new_text))

## Train with LSTM cell
Let's create an RNN model so that we can train it on the given dataset

In [None]:
# hyper parameters of the stacked LSTM model
cell_type = 'lstm'
num_layers = 2
rnn_size, batch_size, seq_len = 256, 128, 64

# build the model graph
crnn = CharRnn(vocabs, vocab_to_id, id_to_vocab, cell_type,
               rnn_size=rnn_size,
               batch_size=batch_size,
               seq_len=seq_len,
               num_layers=num_layers)

# arrange the encoded text into batch rows and split off the validation part
train_x, train_y, val_x, val_y = split_data(char_ids, batch_size, seq_len)

Time for training, we pass train/validation dataset to the `train` function

In [None]:
# training settings: dropout keep probability and number of passes
keep_prob = 0.5
epochs = 50

# validate and checkpoint every 500 iterations
crnn.train(train_x, train_y, val_x, val_y,
           epochs=epochs, keep_prob=keep_prob,
           save_every=500, save_dir='checkpoints/crnn')


# Evaluation
Now that the RNN is trained, we want to use it to generate some new text