# Vocabulary Generator Using LSTM

This script implements an RNN using LSTM cells in TensorFlow. The network is trained on a list of English words (https://github.com/dwyl/english-words); the resulting network can produce reasonable, vocabulary-like words given the first few characters.

In [1]:
# Import needed libraries
import numpy as np
import tensorflow as tf

In [2]:
# Define the function to load the dictionary
def load_words(path='words_alpha.txt'):
    """
    Load the word list used to build the training data.
    Inputs: path -> text file holding whitespace-separated words (default: 'words_alpha.txt')
    Return: list of the words, in file order
    """
    with open(path) as word_file:
        # split() without arguments splits on any whitespace and drops empty entries
        return word_file.read().split()

In [3]:
# Define the mapper that maps character to its one hot representation
class one_hot_mapper():
    """
    Converts class indices to one hot lists and back.

    Sample Usage:
    >>> mapper = one_hot_mapper(classes = 3)
    >>> one_hot = mapper.transform(2)
    >>> one_hot
    [0, 0, 1]
    >>> one_hot_rev = mapper.reverse(one_hot)
    >>> print(one_hot_rev)
    2
    >>> mapper.reverse([[0, 1, 0], [1, 0, 0]], int_to_char = {0: 'a', 1: 'b', 2: 'c'})
    'ba'
    """
    def __init__(self, classes):
        # classes -> length of every one hot list produced by transform()
        self.classes = classes

    def transform(self, value):
        """
        Inputs: value -> index of the class to encode
        Return: list of length `classes` holding a 1 at position `value` and 0 elsewhere
        """
        arr = [0]*self.classes
        arr[value] = 1
        return arr

    def reverse(self, array, int_to_char = None):
        """
        Inputs: array -> a one hot (or probability) vector, or a sequence of such vectors when
                int_to_char is given, int_to_char -> optional mapping from class index to character
        Return: without int_to_char, the argmax index of `array`; with a non-empty int_to_char,
                the string made of the character looked up for the argmax of each element
        """
        if int_to_char:
            # join() avoids re-allocating the string for every decoded character
            return ''.join(int_to_char[np.argmax(element)] for element in array)
        return np.argmax(array)

In [4]:
# Create relevant mappers
# Alphabet of the model: '\n' at index 0 followed by the lowercase letters
values = '\nabcdefghijklmnopqrstuvwxyz'
mapper = one_hot_mapper(classes = len(values))

# character -> one hot list, and class index -> character
char_to_one_hot = {char: mapper.transform(index) for index, char in enumerate(values)}
int_to_char = {mapper.reverse(vector): char for char, vector in char_to_one_hot.items()}

english_words = load_words()
max_word_length = max(len(word) for word in english_words)

In [5]:
# Now load the data from the dictionary
# Set to True to reuse the array cached in 'one_hot_v2.npy' instead of re-encoding
use_load_file = False

if use_load_file:
    one_hot_encoded = np.load('one_hot_v2.npy')
else:
    change_line_int = char_to_one_hot['\n']
    # All-zero vector; not used further in this cell
    null_int = list(change_line_int)
    null_int[0] = 0
    # Encode every word and pad it with '\n' vectors up to max_word_length + 1 steps,
    # so each word ends with at least one '\n'
    one_hot_encoded = np.array([
        np.concatenate(([char_to_one_hot[char] for char in word],
                        [change_line_int] * (max_word_length + 1 - len(word))),
                       axis = 0)
        for word in english_words
    ])
    np.save('one_hot_v2.npy', one_hot_encoded)

In [6]:
# Axes: (number of words, max_word_length + 1, number of classes)
one_hot_encoded.shape

(370099, 32, 27)

In [7]:
def get_batches(arr, batch_size):
    """
    Generator over (x, y) mini batches of `batch_size` rows taken from arr.
    arr is shuffled in place along its first axis when iteration starts; rows that do not
    fill a complete batch are dropped. y is x shifted one step to the left along axis 1,
    with zeros in the last step.
    Sample Usage:
    for x,y in get_batches(arr, batch_size):
    """
    np.random.shuffle(arr)
    usable_rows = (arr.shape[0] // batch_size) * batch_size

    for start in range(0, usable_rows, batch_size):
        x = arr[start:start + batch_size]
        # The target at step t is the input at step t+1; the final step stays all zeros
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]

        yield x, y

In [8]:
def pick_top_n(preds, values, numclasses, top_n=5):
    """
    Sample one value, restricted to the top_n most probable classes.
    Inputs: preds -> probability array (left unmodified), values -> corresponding values array,
    numclasses -> number of classes, typically numclasses = len(values),
    top_n -> sample from the top_n most probable results (default:5)
    Usage: pick_top_n(preds, values, numclasses, top_n = 3)
    Return: the sampled value
    """
    # squeeze() returns a view of preds, so copy before zeroing entries;
    # otherwise the caller's array would be overwritten
    p = np.squeeze(preds).copy()
    # Zero every class except the top_n largest, then renormalise so p sums to 1
    p[np.argsort(p)[:-top_n]] = 0
    p = p/np.sum(p)
    c = np.random.choice(numclasses, 1, p=p)[0]
    return values[c]

In [9]:
def sample(checkpoint, max_samples, num_classes, prime = 'a'):
    """
    Generate a word with the network restored from the given checkpoint.
    The characters of prime are fed through the network first, then characters are sampled
    one at a time with pick_top_n until a newline is sampled or the limit is reached.
    Input: checkpoint -> checkpoint path to restore,
           max_samples -> maximum number of sampling steps after the first sampled character,
           num_classes -> number of classes handed to pick_top_n,
           prime -> non-empty starting string (default:'a')
    Return: prime followed by the sampled characters
    Raises: ValueError if prime is empty
    Sample Usage:
    samp = sample(checkpoint, max_samples, num_classes, prime = 'jupyt')
    """
    if not prime:
        # Without a prime character there are no predictions to sample from
        raise ValueError('prime must contain at least one character')

    model = VocabRNN(learning_rate = learning_rate, sampling = True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)

        def step(char, state):
            # Feed a single character; returns [next-character probabilities, new state]
            x = np.array([char_to_one_hot[char]]).reshape((1,1,-1))
            feed_dict = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: state}
            return sess.run([model.prediction, model.final_state], feed_dict = feed_dict)

        new_state = sess.run(model.initial_state)
        for c in prime:
            preds, new_state = step(c, new_state)

        # The first character is sampled from the prediction made after the whole prime
        generated = [pick_top_n(preds, values, num_classes)]
        for _ in range(max_samples):
            if generated[-1] == '\n':
                break
            preds, new_state = step(generated[-1], new_state)
            generated.append(pick_top_n(preds, values, num_classes))
    return prime + ''.join(generated)

## Build the RNN
To build the RNN we first define a build_cell() function that returns a basic LSTM cell with dropout. Then, in the RNN class, we use tf.contrib.rnn.MultiRNNCell() together with build_cell() to stack multiple layers. For the optimizer we use Adam with gradient clipping.

In [10]:
def build_cell(lstm_size, keep_prob):
    """
    Builds one LSTM layer wrapped with dropout on its outputs.
    Inputs: lstm_size -> size passed to the LSTM cell, keep_prob -> output keep probability
    Output: tensorflow LSTMCell wrapped in a DropoutWrapper
    """
    return tf.contrib.rnn.DropoutWrapper(
        tf.nn.rnn_cell.LSTMCell(lstm_size, name = 'basic_lstm_cell'),
        output_keep_prob = keep_prob)
def grad_clip_adam(loss, learning_rate, grad_clip):
    """
    Builds the Adam training op with gradients clipped by their global norm.
    Inputs: loss, learning_rate, grad_clip
    Output: the op returned by AdamOptimizer.apply_gradients
    """
    trainables = tf.trainable_variables()
    raw_grads = tf.gradients(loss, trainables)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, grad_clip)
    adam = tf.train.AdamOptimizer(learning_rate)
    return adam.apply_gradients(zip(clipped_grads, trainables))

In [11]:
class VocabRNN:
    """
    Multi-layer LSTM network that predicts the next character of a word.
    Constructing an instance resets the default tensorflow graph and builds the whole
    network in it.
    Inputs: num_classes -> size of the one hot character vectors,
            batch_size, num_steps -> shape of one training batch (both forced to 1 when sampling),
            learning_rate, grad_clip -> passed to grad_clip_adam,
            sampling -> True to build the network for feeding one character at a time,
            lstm_size -> size of each LSTM layer (default:128),
            num_layers -> number of stacked LSTM layers (default:3)
    Attributes: inputs, targets, keep_prob (placeholders), initial_state, final_state,
                logits, prediction (softmax of logits), loss (mean cross entropy), optimizer
    """
    def __init__(self, num_classes = 27, batch_size = 256, num_steps = 32, 
                    learning_rate = 0.001, grad_clip = 5, sampling = False,
                    lstm_size = 128, num_layers = 3):
        if sampling:
            batch_size, num_steps = 1, 1

        tf.reset_default_graph()

        self.inputs = tf.placeholder(tf.float32, [batch_size, num_steps, num_classes], name='inputs')
        self.targets = tf.placeholder(tf.float32, [batch_size, num_steps, num_classes], name='targets')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        cell = tf.contrib.rnn.MultiRNNCell([build_cell(lstm_size, self.keep_prob) for _ in range(num_layers)])
        self.initial_state = cell.zero_state(batch_size, tf.float32)

        outputs, state = tf.nn.dynamic_rnn(cell, self.inputs, initial_state = self.initial_state)
        self.final_state = state

        # NOTE(review): outputs is presumably already a single tensor here, so this concat
        # looks like a leftover - confirm before removing
        seq_output = tf.concat(outputs, axis=1)
        # Collapse the batch and step axes so the dense layer sees one row per character
        x = tf.reshape(seq_output, [-1, lstm_size])

        self.logits = tf.layers.dense(x, num_classes)
        self.prediction = tf.nn.softmax(self.logits)

        self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.logits, labels = self.targets)
        self.loss = tf.reduce_mean(self.loss)
        self.optimizer = grad_clip_adam(self.loss, learning_rate, grad_clip)

## Set the Hyper-parameters and Initialize Model

In [12]:
num_classes = len(values)  # one class per character in `values`
learning_rate = 0.0001  # passed to VocabRNN when the model is built
keep_prob = 0.8  # fed to model.keep_prob during training

epochs = 20  # number of passes over one_hot_encoded

save_freq = 2000  # a checkpoint is written every save_freq training steps

In [13]:
# Build the training graph (VocabRNN resets the default graph first)
model = VocabRNN(learning_rate = learning_rate)

# The saver must be created after the model so that the model's variables exist
saver = tf.train.Saver(max_to_keep=100)

## Train

In [15]:
# Train the network
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # To continue from an earlier checkpoint, uncomment the following line
    #saver.restore(sess, 'checkpoints/i28900_l128.ckpt')
    fetches = [model.loss, model.final_state, model.optimizer]
    counter = 0
    for e in range(epochs):
        # Each epoch starts from the zero state; within an epoch the final state of one
        # batch is fed as the initial state of the next
        new_state = sess.run(model.initial_state)
        loss = 0
        for x, y in get_batches(one_hot_encoded, 256):
            counter += 1
            batch_loss, new_state, _ = sess.run(fetches,
                                                feed_dict = {model.inputs: x,
                                                             model.targets: y,
                                                             model.keep_prob: keep_prob,
                                                             model.initial_state: new_state})
            if not counter % 200:
                print('Epoch: {}/{}...'.format(e+1, epochs),
                      'Training Step: {}...'.format(counter),
                      'Training loss: {:.4f}... '.format(batch_loss))
            if not counter % save_freq:
                saver.save(sess, 'checkpoints_v3/i{}_l{}.ckpt'.format(counter, 128)) #lstm_size
        # Also checkpoint at the end of every epoch
        saver.save(sess, 'checkpoints_v3/i{}_l{}.ckpt'.format(counter, 128)) #lstm size

Epoch: 1/20... Training Step: 200... Training loss: 1.3836... 
Epoch: 1/20... Training Step: 400... Training loss: 1.3742... 
Epoch: 1/20... Training Step: 600... Training loss: 1.3709... 
Epoch: 1/20... Training Step: 800... Training loss: 1.3593... 
Epoch: 1/20... Training Step: 1000... Training loss: 1.3253... 
Epoch: 1/20... Training Step: 1200... Training loss: 1.3181... 
Epoch: 1/20... Training Step: 1400... Training loss: 1.3749... 
Epoch: 2/20... Training Step: 1600... Training loss: 1.2949... 
Epoch: 2/20... Training Step: 1800... Training loss: 1.3323... 
Epoch: 2/20... Training Step: 2000... Training loss: 1.3319... 
Epoch: 2/20... Training Step: 2200... Training loss: 1.3755... 
Epoch: 2/20... Training Step: 2400... Training loss: 1.3269... 
Epoch: 2/20... Training Step: 2600... Training loss: 1.3569... 
Epoch: 2/20... Training Step: 2800... Training loss: 1.3459... 
Epoch: 3/20... Training Step: 3000... Training loss: 1.2603... 
Epoch: 3/20... Training Step: 3200... Traini

Epoch: 18/20... Training Step: 25400... Training loss: 0.7149... 
Epoch: 18/20... Training Step: 25600... Training loss: 0.6997... 
Epoch: 18/20... Training Step: 25800... Training loss: 0.7127... 
Epoch: 18/20... Training Step: 26000... Training loss: 0.7284... 
Epoch: 19/20... Training Step: 26200... Training loss: 0.6939... 
Epoch: 19/20... Training Step: 26400... Training loss: 0.7198... 
Epoch: 19/20... Training Step: 26600... Training loss: 0.7105... 
Epoch: 19/20... Training Step: 26800... Training loss: 0.6949... 
Epoch: 19/20... Training Step: 27000... Training loss: 0.6938... 
Epoch: 19/20... Training Step: 27200... Training loss: 0.7066... 
Epoch: 19/20... Training Step: 27400... Training loss: 0.6961... 
Epoch: 20/20... Training Step: 27600... Training loss: 0.7192... 
Epoch: 20/20... Training Step: 27800... Training loss: 0.7095... 
Epoch: 20/20... Training Step: 28000... Training loss: 0.6997... 
Epoch: 20/20... Training Step: 28200... Training loss: 0.7224... 
Epoch: 20/

## Sample

In [16]:
# Show the most recent checkpoint found in 'checkpoints_v3'
tf.train.latest_checkpoint('checkpoints_v3')

'checkpoints_v3/i28900_l128.ckpt'

In [17]:
# Checkpoint restored by sample() and get_feature() below
checkpoint = tf.train.latest_checkpoint('checkpoints_v3')

In [18]:
# sample(checkpoint, max_samples, num_classes, prime = 'a')
max_samples = 30
for prime in ('predeca', 'exec', 'nomencl'):
    samp = sample(checkpoint, max_samples, num_classes, prime = prime)
    print(samp)

INFO:tensorflow:Restoring parameters from checkpoints_v3/i28900_l128.ckpt
predecasse

INFO:tensorflow:Restoring parameters from checkpoints_v3/i28900_l128.ckpt
execnlitlan

INFO:tensorflow:Restoring parameters from checkpoints_v3/i28900_l128.ckpt
nomenclintes



## Here we can extract the features of each word; these features could probably be useful in future applications

In [19]:
def get_feature(checkpoint, num_classes, prime = 'a'):
    """
    Extract a feature vector for the string prime from the network restored from checkpoint.
    The characters of prime are fed one at a time; the features are taken after the last one.
    Input: checkpoint -> checkpoint path to restore,
           num_classes -> not used by this function,
           prime -> non-empty string to encode (default:'a')
    Return: 1-D array holding the flattened prediction followed, for every LSTM layer in
            order, by its flattened c state and then its flattened h state
    Raises: ValueError if prime is empty
    """
    if not prime:
        # Without at least one character the network produces nothing to extract
        raise ValueError('prime must contain at least one character')

    model = VocabRNN(learning_rate = learning_rate, sampling = True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        for c in prime:
            x = np.array([char_to_one_hot[c]]).reshape((1,1,-1))
            feed_dict = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state], feed_dict = feed_dict)

    # Only the values after the last character are returned, so assemble the vector once
    # here instead of rebuilding it for every character
    parts = [preds.flatten()]
    for layer_state in new_state:
        parts.append(layer_state.c.flatten())
        parts.append(layer_state.h.flatten())
    return np.concatenate(parts)

In [20]:
# Feature vector for the one-character word 'a'
features = get_feature(checkpoint, num_classes, prime = 'a')
features.shape

INFO:tensorflow:Restoring parameters from checkpoints_v3/i28900_l128.ckpt


(795,)

In [21]:
# The vector starts with the flattened prediction, followed by the LSTM states
print(features)

[ 3.89879793e-02  3.95795926e-02  3.60612459e-02  4.18184958e-02
  4.17242423e-02  4.09130864e-02  3.47757377e-02  4.29286435e-02
  3.14239264e-02  3.61453444e-02  2.76271738e-02  3.30974348e-02
  4.53513265e-02  3.90439965e-02  5.00101224e-02  3.91344652e-02
  3.57991941e-02  2.72507947e-02  3.78436446e-02  4.55554873e-02
  4.70614284e-02  3.52772065e-02  3.18934396e-02  2.97207832e-02
  2.68001761e-02  3.05545554e-02  3.36205065e-02 -4.83612865e-02
 -4.81348261e-02 -2.43934002e-02  3.53001654e-02 -3.81354950e-02
 -2.79355068e-02  2.57578082e-02 -3.87167186e-02 -2.71251239e-02
 -6.50828481e-02 -1.29774302e-01 -1.51702268e-02 -6.42467232e-04
 -5.31159043e-02  5.60663827e-02  4.09913808e-02  4.13453169e-02
  2.57574897e-02  2.63117719e-02  4.37232107e-03  6.00210391e-02
  5.23426160e-02 -1.90418810e-02 -4.82846722e-02  3.97134274e-02
  1.37419045e-01  4.54110280e-03 -5.55290431e-02  5.76084815e-02
 -7.20720291e-02  4.30800058e-02 -2.59220570e-01 -1.32795824e-02
  4.23586458e-01 -1.12903