In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from time import time
from reader import ptb_raw_data

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the posts and keep only the rows that actually have text
happy = pd.read_csv('data/happy.csv', encoding='utf-8')
text = happy['text'].dropna()

# Build one flat character stream:
#   * a space is appended to every post so consecutive posts stay separated
#   * every space is then encoded as the special character underscore
chars = [u'_' if ch == u' ' else ch
         for post in text
         for ch in list(post) + [u' ']]

# Vocabulary: characters seen more than 10000 times, ids follow the
# (sorted) group order; every other character shares one "unknown" id
df_chars = pd.DataFrame({'char': chars})
char_count = df_chars.groupby('char')['char'].count()
reduced_chars = char_count[char_count > 10000]
frequent_ids = np.arange(len(reduced_chars), dtype=int)
char_to_id = dict(zip(reduced_chars.index, frequent_ids))
unknown_char_id = len(char_to_id)

# Encode the stream (rare chars -> unknown id) and keep the first 5M ids
encoded_chars = [char_to_id.get(ch, unknown_char_id) for ch in chars]
train_data = np.array(encoded_chars)[:5000000]
# Register the unknown id in the vocabulary only after encoding
char_to_id[u'<unk>'] = unknown_char_id

In [2]:
#np.savetxt('data/happy_chars.txt', train_data, fmt='%d')
#np.save('data/char_to_id.npy', char_to_id)

In [12]:
# Reload the encoded corpus and the vocabulary from disk;
# only the first 500000 ids of the saved corpus are kept
full_data = np.loadtxt('data/happy_chars.txt', dtype=int)[:500000]
char_to_id = np.load('data/char_to_id.npy').item()
# Reverse lookup: id -> character
id_to_char = dict((char_id, char) for char, char_id in char_to_id.iteritems())

# Contiguous 80/10/10 split into train/val/test
n_total = full_data.shape[0]
train_split = int(n_total * 0.8)
val_split = int(n_total * 0.9)
train_data, valid_data, test_data = (full_data[:train_split],
                                     full_data[train_split:val_split],
                                     full_data[val_split:])

IOError: [Errno 2] No such file or directory: 'data/happy_chars.txt'

In [2]:
# Load the character-level corpus splits together with the vocabulary
train_data, valid_data, test_data, char_to_id = ptb_raw_data('data_chars', char=True)
# Reverse lookup: id -> character
id_to_char = dict((char_id, char) for char, char_id in char_to_id.iteritems())

# Hyper-parameters shared by the model and the training loop
config = {
    # Vocabulary
    'char_to_id': char_to_id,
    'id_to_char': id_to_char,
    'vocab_size': len(char_to_id),
    # Mini-batch layout
    'batch_size': 20,
    'num_steps': 20,
    # Network
    'hidden_size': 200,
    'num_layers': 2,  # Number of stacked LSTMs
    'dropout': 0.9,  # Proba to keep neurons
    'init_scale': 0.1,  # Weights initialization scale
    # Optimisation
    'max_grad_norm': 5.0,  # Maximum norm of gradient
    'initial_lr': 1.0,
    'lr_decay': 0.5,
    'max_epoch_no_decay': 4,  # Number of epochs without decaying learning rate
    'nb_epochs': 10,  # Maximum number of epochs
}

In [3]:
class CharModel():
    """Character-level language model: one-hot inputs -> stacked LSTMs -> softmax.

    Instantiating the class adds the complete graph to the default TensorFlow
    graph and exposes the tensors/ops needed to run it:

    * ``input_data`` / ``target``: int32 placeholders of shape
      ``[batch_size, num_steps]``.
    * ``keep_prob``: float placeholder, dropout keep probability of the LSTM
      outputs (feed 1.0 to disable dropout).
    * ``learning_rate``: non-trainable variable used by the SGD optimizer.
    * ``initial_state`` / ``final_state``: LSTM state before / after the
      ``num_steps`` unrolled steps.
    * ``loss``: cross-entropy averaged over the batch and summed over time.
    * ``train_step``: SGD update with gradients clipped by global norm.
    * ``predict``: flattened int32 argmax predictions.
    * ``accuracy``: mean of ``predict == target``.
    """
    def __init__(self, config):
        """Build the graph from the hyper-parameters stored in `config`.

        `config` must provide 'batch_size', 'num_steps', 'vocab_size',
        'hidden_size', 'num_layers', 'dropout', 'max_grad_norm' and
        'initial_lr'. It is kept by reference in ``self.config``.
        """
        self.config = config
        # 'dropout' is looked up here but not used while building the graph:
        # the keep probability is fed at run time through `self.keep_prob`.
        (batch_size, num_steps, vocab_size, hidden_size, num_layers,
         dropout, max_grad_norm, initial_lr) = [config[key] for key in (
             'batch_size', 'num_steps', 'vocab_size', 'hidden_size', 'num_layers',
             'dropout', 'max_grad_norm', 'initial_lr')]

        # ---- Inputs ----
        io_shape = [batch_size, num_steps]
        self.input_data = tf.placeholder(tf.int32, io_shape)
        self.target = tf.placeholder(tf.int32, io_shape)
        self.learning_rate = tf.Variable(initial_lr, trainable=False)
        # Fed at run time so that dropout can be switched off for evaluation
        self.keep_prob = tf.placeholder(tf.float32)

        # Characters are one-hot encoded; the learned-embedding alternative
        # is kept below for reference.
        #embedding = tf.get_variable('embedding', [vocab_size, hidden_size])
        #input_embed = tf.nn.embedding_lookup(embedding, self.input_data)
        #input_embed_dropout = tf.nn.dropout(input_embed, self.keep_prob)
        input_data_one_hot = tf.one_hot(self.input_data, vocab_size)

        # ---- Stacked LSTM layers, each with dropout on its output ----
        def make_layer():
            base_cell = tf.contrib.rnn.BasicLSTMCell(num_units=hidden_size, forget_bias=1.0)
            return tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=self.keep_prob)

        layers = [make_layer() for _ in range(num_layers)]
        cell = tf.contrib.rnn.MultiRNNCell(layers, state_is_tuple=True)
        self.initial_state = cell.zero_state(batch_size, tf.float32)

        # ---- Unroll the network over num_steps time steps ----
        state = self.initial_state
        outputs = []
        with tf.variable_scope('RNN'):
            for step in range(num_steps):
                if step > 0:
                    # Every step after the first shares the LSTM weights
                    tf.get_variable_scope().reuse_variables()
                step_output, state = cell(input_data_one_hot[:, step, :], state)
                outputs.append(step_output)
        self.final_state = state

        # ---- Softmax projection applied to every (batch, step) output ----
        stacked_outputs = tf.stack(outputs, axis=1)
        flat_outputs = tf.reshape(stacked_outputs, [-1, hidden_size])
        W_softmax = tf.get_variable('W_softmax', [hidden_size, vocab_size])
        b_softmax = tf.get_variable('b_softmax', [vocab_size])
        flat_logits = tf.matmul(flat_outputs, W_softmax) + b_softmax
        logits = tf.reshape(flat_logits, [batch_size, num_steps, vocab_size])

        # ---- Loss: averaged over the batch, summed across time steps ----
        loss_weights = tf.ones([batch_size, num_steps])
        loss_per_step = tf.contrib.seq2seq.sequence_loss(
            logits, self.target, weights=loss_weights,
            average_across_batch=True, average_across_timesteps=False)
        self.loss = tf.reduce_sum(loss_per_step)

        # ---- SGD with gradients clipped by their global norm ----
        trainable_vars = tf.trainable_variables()
        raw_grads = tf.gradients(self.loss, trainable_vars)
        clipped_grads, _ = tf.clip_by_global_norm(raw_grads, max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        grads_and_vars = zip(clipped_grads, trainable_vars)
        global_step = tf.contrib.framework.get_or_create_global_step()
        self.train_step = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # ---- Predictions and accuracy on the flattened (batch * step) axis ----
        best_ids = tf.argmax(tf.reshape(logits, [-1, vocab_size]), 1)
        self.predict = tf.cast(best_ids, tf.int32)
        flat_target = tf.reshape(self.target, [-1])
        correct_pred = tf.equal(self.predict, flat_target)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [4]:
def run_model(sess, model, data, is_training, show_loss_graph=False):
    """Run `model` over `data` for ``model.config['nb_epochs']`` epochs.

    `data` (1-D array of character ids) is cut into ``batch_size`` contiguous
    rows; each iteration feeds a ``[batch_size, num_steps]`` window and uses
    the same window shifted by one character as the target. The LSTM state is
    carried from one iteration to the next within an epoch.

    Args:
        sess: session in which the model's variables live.
        model: object exposing ``config``, ``input_data``, ``target``,
            ``keep_prob``, ``learning_rate``, ``initial_state``,
            ``final_state``, ``loss``, ``accuracy`` and ``train_step``.
        data: 1-D numpy array of ids.
        is_training: when True, runs ``model.train_step`` with dropout and
            prints progress roughly every 10% of an epoch; when False, runs
            without dropout or weight updates and prints one summary line
            per epoch.
        show_loss_graph: when True (training only), plots the per-mini-batch
            perplexity after each epoch.

    Raises:
        ValueError: if `data` is too short to form a single
            ``[batch_size, num_steps]`` input/target pair.
    """
    config = model.config
    batch_size = config['batch_size']
    num_steps = config['num_steps']
    dropout = config['dropout']
    initial_lr = config['initial_lr']
    lr_decay = config['lr_decay']
    max_epoch_no_decay = config['max_epoch_no_decay']
    nb_epochs = config['nb_epochs']

    # Floor division keeps these values integral regardless of the
    # interpreter's division semantics (they are used as shapes and ranges).
    batch_len = data.shape[0] // batch_size
    data = data[:batch_len * batch_size].reshape((batch_size, batch_len))
    epoch_size = (batch_len - 1) // num_steps
    if epoch_size < 1:
        raise ValueError(
            'data is too short: got {0} ids, need at least {1} for '
            'batch_size={2} and num_steps={3}'.format(
                data.size, batch_size * (num_steps + 1), batch_size, num_steps))

    if is_training:
        # Iteration to print at
        print_iter = list(np.linspace(0, epoch_size - 1, 11).astype(int))
        dropout_param = dropout
        ops = [model.final_state, model.loss, model.accuracy, model.train_step]
    else:
        dropout_param = 1.0
        ops = [model.final_state, model.loss, model.accuracy, tf.no_op()]

    # Build the learning-rate update once instead of adding a new assign op
    # to the graph at every epoch.
    lr_value = tf.placeholder(tf.float32, shape=[])
    lr_update = tf.assign(model.learning_rate, lr_value)

    for e in range(nb_epochs):
        print('Epoch: {0}'.format(e + 1))
        # Decay factor for this epoch. It is kept separate from the base
        # `lr_decay`: overwriting the base would turn it into 1.0 during the
        # first epoch and the learning rate would never decrease.
        epoch_decay = lr_decay ** max(e + 1 - max_epoch_no_decay, 0)
        sess.run(lr_update, feed_dict={lr_value: initial_lr * epoch_decay})

        total_loss = 0.0
        total_accuracy = 0.0
        nb_iter = 0.0
        perplexity_history = []
        numpy_state = sess.run(model.initial_state)
        t0 = time()
        for i in range(epoch_size):
            curr_input = data[:, i * num_steps: (i + 1) * num_steps]
            # Target is the input shifted in time by 1
            curr_target = data[:, i * num_steps + 1: (i + 1) * num_steps + 1]
            numpy_state, curr_loss, curr_acc, _ = sess.run(
                ops,
                feed_dict={model.input_data: curr_input,
                           model.target: curr_target,
                           model.initial_state: numpy_state,
                           model.keep_prob: dropout_param})
            total_loss += curr_loss
            total_accuracy += curr_acc
            nb_iter += num_steps
            perplexity_history.append(np.exp(curr_loss / num_steps))

            if is_training and i in print_iter:
                # Guard against a zero elapsed time on coarse clocks
                elapsed = max(time() - t0, 1e-12)
                print('{0:.0f}% perplexity = {1:.3f}, accuracy = {2:.3f}, speed = {3:.0f} cps'
                      .format(print_iter.index(i) * 10,
                              np.exp(total_loss / nb_iter), total_accuracy / (i + 1),
                              (nb_iter * batch_size) / elapsed))
        if not is_training:
            elapsed = max(time() - t0, 1e-12)
            print('Perplexity = {0:.3f}, accuracy = {1:.3f}, speed = {2:.0f} cps'
                  .format(np.exp(total_loss / nb_iter), total_accuracy / epoch_size,
                          (nb_iter * batch_size) / elapsed))

        if is_training and show_loss_graph:
            plt.plot(perplexity_history)
            plt.grid(True)
            plt.title('Epoch {0}'.format(e + 1))
            plt.xlabel('Mini-batch number')
            plt.ylabel('Perplexity per mini-batch')
            plt.show()
            
def generate_chars(sess, model, first_char, max_iteration):
    """Greedily generate `max_iteration` character ids starting from `first_char`.

    At every step the current character is fed to the model together with the
    LSTM state of the previous step; the predicted id is recorded and becomes
    the input of the next step.

    Args:
        sess: session in which the model's variables live.
        model: model built with batch_size = 1 and num_steps = 1, exposing
            ``input_data``, ``keep_prob``, ``initial_state``, ``final_state``
            and ``predict``.
        first_char: numpy array of shape (1, 1) holding the id of the seed
            character; it is copied, not modified.
        max_iteration: number of characters to generate.

    Returns:
        List of the `max_iteration` predicted ids (the seed is not included).
    """
    ops = [model.final_state, model.predict]
    current_char = first_char.copy()
    numpy_state = sess.run(model.initial_state)
    preds = []
    for _ in range(max_iteration):
        numpy_state, pred = sess.run(ops, feed_dict={model.input_data: current_char,
                                                     model.initial_state: numpy_state,
                                                     model.keep_prob: 1.0})
        preds.append(pred[0])
        # Feed the prediction back as the next input. The previous code
        # assigned it to a misspelled name, so the seed was fed at every step.
        current_char = pred.reshape((1, 1))
    return preds

In [None]:
# Train for one epoch, build the validation/test graphs that share the trained
# weights, then generate 100 characters greedily and save them to disk.
#
# Each model gets its own copy of `config`. Mutating the shared dict would
# leave batch_size = num_steps = 1 behind, so re-running this cell would
# train on single characters, and it would also silently alter
# `m_train.config`, which is stored by reference.
tf.reset_default_graph()
with tf.Session() as sess:
    print('Training:')
    init_scale = config['init_scale']
    initializer = tf.random_uniform_initializer(-init_scale, init_scale)
    # Single-epoch run
    run_config = dict(config, nb_epochs=1)
    with tf.variable_scope('Model', reuse=None, initializer=initializer):
        m_train = CharModel(run_config)
    sess.run(tf.global_variables_initializer())
    run_model(sess, m_train, train_data, is_training=True)
    print('\nValidation:')
    with tf.variable_scope('Model', reuse=True):
        m_valid = CharModel(run_config)
    #run_model(sess, m_valid, valid_data, is_training=False)
    print('\nTest:')
    with tf.variable_scope('Model', reuse=True):
        m_test = CharModel(run_config)
    #run_model(sess, m_test, test_data, is_training=False)
    print('\nCharacters generation')
    # Generation feeds one character at a time
    gen_config = dict(run_config, batch_size=1, num_steps=1)
    with tf.variable_scope('Model', reuse=True):
        m_gen = CharModel(gen_config)
    # Seed the generation with the character whose id is 4
    first_char = np.array([[4]])
    preds = generate_chars(sess, m_gen, first_char, 100)
    generated_chars = [config['id_to_char'][x] for x in preds]
    np.save('generated_chars.npy', np.array(generated_chars))
    print('Finished')

Training:
Epoch: 1
0% perplexity = 49.962, accuracy = 0.072, speed = 1182 cps


In [17]:
# Reload the generated characters and display them as text,
# turning the underscore placeholder back into a space
gene_chars = np.load('generated_chars.npy')
generated_text = ''.join(gene_chars.tolist())
generated_text.replace('_', ' ')

u'                                                                                                    '