Where is natural language generation going next?
===================

In [None]:
# HTML is not a notebook builtin: it has to be imported explicitly, otherwise
# this cell raises NameError on a fresh kernel (Restart & Run All).
from IPython.display import HTML, Video

# Embed the clip with a raw <video> tag so it loops and autoplays inline;
# the HTML object is the cell's last expression, so Jupyter renders it.
HTML('<video controls loop autoplay width=800><source src="nlp_models.mp4" type="video/mp4"></video>')

In [None]:
# HTML is not a notebook builtin: it has to be imported explicitly, otherwise
# this cell raises NameError on a fresh kernel (Restart & Run All).
from IPython.display import HTML, Video

# Embed the clip with a raw <video> tag so it loops and autoplays inline;
# the HTML object is the cell's last expression, so Jupyter renders it.
HTML('<video controls loop autoplay width=720><source src="nlg_next_gen.mp4" type="video/mp4"></video>')

B0RK - The Next Generation in Natural Language Generation
===================

## Test Data 



In [13]:
# `bork` is a project-local helper; its implementation is not part of this notebook.
from notebooks.experimentation.nlg_library_runner import bork

# NOTE(review): this path is assigned but not passed to any call in this cell.
# Confirm whether `bork` picks it up some other way or whether it is unused.
data_source = './test_data.csv'

# Show the test data via the helper (the captured output below lists
# "Sentence,Generated Text" pairs).
bork.display_test_data()

Sentence,Generated Text
To whose house are you going?,B0rk ... B0rk.
The people marched for justice.,"B0rk b0rk b0rk , B0rk ... B0rk . B0rk b0rk , B0rk ... B0rk!"
These questions are easy to answer.,B0rk b0rk b0rk ... B0rk b0rk - B0rk‽
Brad came to dinner with us.,B0rk ... B0rk b0rk b0rk ... B0rk b0rk?
He loves fish tacos.,B0rk ... B0rk b0rk‽
"In the end, we all felt like we ate too much.",B0rk ... B0rk b0rk b0rk b0rk ... B0rk b0rk b0rk ... B0rk.
We all agreed; it was a magnificent evening.,"B0rk , B0rk b0rk b0rk b0rk - B0rk b0rk b0rk b0rk - B0rk - B0rk b0rk - B0rk b0rk b0rk."
"I hope that, when I've built up my savings, I'll be able to travel to Mexico.","B0rk b0rk b0rk b0rk b0rk b0rk b0rk - B0rk ... B0rk b0rk , B0rk - B0rk ... B0rk b0rk - B0rk - B0rk , B0rk b0rk."
"Did you know that, along with gorgeous architecture, it's home to the largest tamale?",B0rk b0rk b0rk b0rk b0rk b0rk b0rk b0rk b0rk b0rk ... B0rk.
Wouldn't it be lovely to enjoy a week soaking up the culture?,"B0rk b0rk b0rk b0rk b0rk b0rk - B0rk b0rk , B0rk ... B0rk ... B0rk b0rk ! B0rk b0rk - B0rk b0rk b0rk."


# Build Model

In [None]:
from keras.optimizers import RMSprop
from keras.layers import Input, Embedding, Dense, LSTM, Bidirectional
from keras.layers import CuDNNLSTM, concatenate, Reshape, SpatialDropout1D
from keras.models import Model
from keras import backend as K
from notebooks.experimentation.AttentionWeightedAverage import AttentionWeightedAverage


def textgenrnn_model(num_classes, cfg, context_size=None,
                     weights_path=None,
                     dropout=0.0,
                     optimizer=None):
    '''
    Builds the model architecture for textgenrnn and
    loads the specified weights for the model.

    Arguments:
        num_classes: size of the vocabulary; used as the embedding input
            dimension and as the width of every softmax output layer.
        cfg: configuration mapping. Keys read here: 'max_length',
            'dim_embeddings', 'rnn_layers' (new_rnn reads further keys).
        context_size: width of the optional context input. When None, a
            single-input / single-output model is built; otherwise a second
            input named 'context_input' and a second softmax head named
            'context_output' are added.
        weights_path: optional path passed to Model.load_weights
            (by_name=True) before the model is compiled.
        dropout: rate for a SpatialDropout1D layer applied to the embedding;
            the layer is only added when the rate is greater than 0.
        optimizer: optimizer passed to Model.compile. When None (the
            default), a fresh RMSprop(lr=4e-3, rho=0.99) is created for this
            call, so models built with the default never share one
            optimizer instance.

    Returns:
        The compiled Keras Model.
    '''

    # Build the default per call instead of once at definition time: a
    # default-argument instance would be shared by every model built here.
    if optimizer is None:
        optimizer = RMSprop(lr=4e-3, rho=0.99)

    # Local is not called `input` so the builtin is not shadowed; the layer
    # itself keeps the name 'input', which load_weights(by_name=True) relies on.
    text_input = Input(shape=(cfg['max_length'],), name='input')
    embedded = Embedding(num_classes, cfg['dim_embeddings'],
                         input_length=cfg['max_length'],
                         name='embedding')(text_input)

    if dropout > 0.0:
        embedded = SpatialDropout1D(dropout, name='dropout')(embedded)

    # Stack the recurrent layers: the first consumes the embedding, each
    # later one consumes the previous layer's output sequence.
    rnn_layer_list = []
    prev_layer = embedded
    for i in range(cfg['rnn_layers']):
        prev_layer = new_rnn(cfg, i + 1)(prev_layer)
        rnn_layer_list.append(prev_layer)

    # The embedding and every RNN output are concatenated before attention.
    seq_concat = concatenate([embedded] + rnn_layer_list, name='rnn_concat')
    attention = AttentionWeightedAverage(name='attention')(seq_concat)
    output = Dense(num_classes, name='output', activation='softmax')(attention)

    if context_size is None:
        model = Model(inputs=[text_input], outputs=[output])
        if weights_path is not None:
            model.load_weights(weights_path, by_name=True)
        model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    else:
        context_input = Input(
            shape=(context_size,), name='context_input')
        context_reshape = Reshape((context_size,),
                                  name='context_reshape')(context_input)
        merged = concatenate([attention, context_reshape], name='concat')
        main_output = Dense(num_classes, name='context_output',
                            activation='softmax')(merged)

        model = Model(inputs=[text_input, context_input],
                      outputs=[main_output, output])
        if weights_path is not None:
            model.load_weights(weights_path, by_name=True)
        # Loss weights follow the output order: 0.8 for the context-aware
        # head (main_output), 0.2 for the text-only head (output).
        model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                      loss_weights=[0.8, 0.2])

    return model


In [None]:
'''
Create a new LSTM layer per parameters. Unfortunately,
each combination of parameters must be hardcoded.
The normal LSTMs use sigmoid recurrent activations
for parity with CuDNNLSTM:
https://github.com/keras-team/keras/issues/8860
'''


def new_rnn(cfg, layer_num):
    """Return a new, not-yet-applied recurrent layer named 'rnn_<layer_num>'.

    CuDNNLSTM is used when the backend reports at least one GPU; otherwise a
    plain LSTM with recurrent_activation='sigmoid' is used. When
    cfg['rnn_bidirectional'] is truthy the layer is wrapped in Bidirectional
    and the wrapper carries the name, while the inner layer is left unnamed.
    The layer width is cfg['rnn_size'] and sequences are always returned.
    """
    layer_name = 'rnn_{}'.format(layer_num)

    gpu_present = len(K.tensorflow_backend._get_available_gpus()) > 0
    if gpu_present:
        layer_cls, extra_kwargs = CuDNNLSTM, {}
    else:
        # Only the plain LSTM is given an explicit recurrent activation.
        layer_cls, extra_kwargs = LSTM, {'recurrent_activation': 'sigmoid'}

    if cfg['rnn_bidirectional']:
        inner = layer_cls(cfg['rnn_size'],
                          return_sequences=True,
                          **extra_kwargs)
        return Bidirectional(inner, name=layer_name)

    return layer_cls(cfg['rnn_size'],
                     return_sequences=True,
                     name=layer_name,
                     **extra_kwargs)

In [None]:
from keras.callbacks import LearningRateScheduler, Callback
from keras.models import Model, load_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import Sequence
from keras import backend as K
import numpy as np


def generate_sequences_from_texts(texts, indices_list,
                                  textgenrnn, context_labels,
                                  batch_size=128):
    """Endlessly yield training batches built from (text, position) pairs.

    Each row of indices_list is read as (text_index, end_index). For a row,
    the input is the slice of the text ending at end_index (reaching back at
    most max_length positions) and the target is the token at end_index + 1.
    Rows whose target is not in textgenrnn.vocab are skipped.

    indices_list is shuffled IN PLACE at the start of every pass. Samples
    left over at the end of a pass (fewer than a full batch) are discarded.

    Yields (X, Y) when context_labels is None, otherwise
    ([X, context], [Y, Y]); all arrays are passed through np.squeeze.
    """
    settings = textgenrnn.config
    word_level = settings['word_level']
    single_text = settings['single_text']
    window = settings['max_length']
    boundary_token = textgenrnn.META_TOKEN
    use_context = context_labels is not None

    if word_level:
        # Word-level texts get a throwaway tokenizer that reuses the model vocab.
        tokenizer = Tokenizer(filters='', char_level=True)
        tokenizer.word_index = textgenrnn.vocab
    else:
        tokenizer = textgenrnn.tokenizer

    while True:
        np.random.shuffle(indices_list)

        inputs, targets, contexts = [], [], []

        for row in range(indices_list.shape[0]):
            text_index = indices_list[row, 0]
            end_index = indices_list[row, 1]

            text = texts[text_index]
            if not single_text:
                # Bracket every text with the meta token on both sides.
                text = [boundary_token] + list(text) + [boundary_token]

            start = end_index - window if end_index > window else 0
            x = text[start: end_index + 1]
            y = text[end_index + 1]

            if y not in textgenrnn.vocab:
                continue

            inputs.append(process_sequence([x], textgenrnn, tokenizer))
            targets.append(textgenrnn_encode_cat([y], textgenrnn.vocab))
            if use_context:
                contexts.append(context_labels[text_index])

            # Keep accumulating until the batch is full.
            if len(inputs) % batch_size != 0:
                continue

            X_batch = np.squeeze(np.array(inputs))
            Y_batch = np.squeeze(np.array(targets))

            if use_context:
                context_batch = np.squeeze(np.array(contexts))
                yield ([X_batch, context_batch], [Y_batch, Y_batch])
            else:
                yield (X_batch, Y_batch)

            inputs, targets, contexts = [], [], []

In [15]:
def process_sequence(X, textgenrnn, new_tokenizer):
    """Tokenize X and pad the result to the model's configured length.

    X is converted with new_tokenizer.texts_to_sequences, then handed to
    sequence.pad_sequences with maxlen=textgenrnn.config['max_length'].
    The padded result is returned as-is.
    """
    token_ids = new_tokenizer.texts_to_sequences(X)
    return sequence.pad_sequences(
        token_ids, maxlen=textgenrnn.config['max_length'])

# Train the model

In [43]:
# Load the sample CSV through the project `bork` helper.
# NOTE(review): this section is titled "Train the model", but the only calls
# visible here are load_data/transform on data named `test_data`. Confirm
# where (or whether) training actually happens.
test_data = bork.load_data(test_data='./data/sample_data.csv',
                        inferSchema='true')

# Pass the loaded data through bork.transform and keep the result.
predictions = bork.transform(test_data)

         Accuracy...          0.17504514342445496
         Accuracy...          0.240147125608564
         Accuracy...          0.15569511809619957
         Accuracy...          0.21865983192115399
         Accuracy...          0.24603765114562964
         Accuracy...          0.2567083637775462
         Accuracy...          0.28859242935426915
         Accuracy...          0.29074978441940785
         Accuracy...          0.32325568419024864
         Accuracy...          0.2883431327000034
         Accuracy...          0.3302649882828238
         Accuracy...          0.42950356142288654
         Accuracy...          0.3501365709800032
         Accuracy...          0.25100247673469744
         Accuracy...          0.2634292817188696
         Accuracy...          0.34887789147129
         Accuracy...          0.3674782448026786
         Accuracy...          0.3969257970258558
         Accuracy...          0.33477848866991605
         Accuracy...          0.34878385884471574
         Acc

In [32]:
# Presumably renders an AUC metric/plot for the run above. The helper's
# implementation is not shown in this notebook, so verify before relying on it.
bork.display_auc()

# Testing the model

In [9]:
# Despite the name, this is ONE multi-line string (one sentence per indented
# line), not a Python list. How it is split is up to bork.generate_sentences.
list_of_sentences = """
    My mom drove me to school fifteen minutes late on Tuesday.
    The girl wore her hair in two braids, tied with two blue bows.
    The mouse was so hungry he ran across the kitchen floor without even looking for humans.
    The tape got stuck on my lips so I couldn't talk anymore.
    The door slammed down on my hand and I screamed like a little baby.
    My shoes are blue with yellow stripes and green stars on the front.
    The mailbox was bent and broken and looked like someone had knocked it over on purpose.
    I was so thirsty I couldn't wait to get a drink of water.
    I found a gold coin on the playground after school today.
    The chocolate chip cookies smelled so good that I ate one without asking.
    My bandaid wasn't sticky any more so it fell off on the way to school.
    He had a sore throat so I gave him my bottle of water and told him to keep it.
    The church was white and brown and looked very old.
    I was so scared to go to a monster movie but my dad said he would sit with me so we went last night.
    Your mom is so nice she gave me a ride home today.
    I fell in the mud when I was walking home from school today.
    This dinner is so delicious I can't stop eating.
    The school principal was so mean that all the children were scared of him.
    I went to the dentist the other day and he let me pick a prize out of the prize box.
    The box was small and wrapped in paper with tiny silver and red glitter dots.
    My dad is so funny that he told us jokes all night long and we never fell asleep.
    The camping trip was so awesome that I didn't want to come home.
    Are you going to have a blue birthday cake for your next birthday?
    How did you know that I was going to have a peanut butter sandwich for lunch?
    That boy is so mean that he doesn't care if a door slams in your face or if he cuts in line.
    The moms and dads all sat around drinking coffee and eating donuts.
    My mom made a milkshake with frozen bananas and chocolate sauce.
    My pen broke and leaked blue ink all over my new dress.
    I got my haircut today and they did it way too short.
    My pet turtle, Jim, got out of his cage and I could not find him anywhere.
    I would like to go to the library.
    Soon, I'll tell you some good news.
    The man and the woman fell in love.
    I would like to order a cheese omelette.
    """
# Generate text for the sentences above via the project `bork` helper; the
# captured output below shows "Sentence,Generated Text" rows.
bork.generate_sentences(list_of_sentences)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Sentence,Generated Text
My mom drove me to school fifteen minutes late on Tuesday.,B0rk - B0rk?
"The girl wore her hair in two braids, tied with two blue bows.","B0rk b0rk ? B0rk b0rk ... B0rk - B0rk , B0rk b0rk b0rk b0rk b0rk ... B0rk b0rk ‽ B0rk!"
The mouse was so hungry he ran across the kitchen floor without even looking for humans.,"B0rk b0rk b0rk , B0rk ... B0rk b0rk b0rk ? B0rk b0rk b0rk , B0rk ... B0rk , B0rk b0rk . B0rk ... B0rk , B0rk , B0rk b0rk - B0rk ‽ B0rk b0rk b0rk - B0rk ... B0rk b0rk!"
The tape got stuck on my lips so I couldn't talk anymore.,"B0rk - B0rk b0rk ‽ B0rk , B0rk b0rk - B0rk - B0rk b0rk ... B0rk b0rk b0rk b0rk b0rk b0rk ... B0rk b0rk ‽ B0rk b0rk!"
The door slammed down on my hand and I screamed like a little baby.,"B0rk ? B0rk , B0rk ? B0rk , B0rk b0rk b0rk - B0rk b0rk b0rk b0rk b0rk b0rk b0rk - B0rk , B0rk b0rk b0rk b0rk b0rk , B0rk b0rk b0rk?"
My shoes are blue with yellow stripes and green stars on the front.,B0rk b0rk?
The mailbox was bent and broken and looked like someone had knocked it over on purpose.,"B0rk - B0rk b0rk , B0rk b0rk b0rk - B0rk b0rk b0rk ... B0rk , B0rk?"
I was so thirsty I couldn't wait to get a drink of water.,B0rk ... B0rk ... B0rk ... B0rk ... B0rk.
I found a gold coin on the playground after school today.,"B0rk b0rk - B0rk - B0rk b0rk , B0rk ... B0rk b0rk b0rk b0rk b0rk b0rk b0rk b0rk ? B0rk ... B0rk b0rk ... B0rk ... B0rk b0rk - B0rk b0rk , B0rk!"
The chocolate chip cookies smelled so good that I ate one without asking.,"B0rk b0rk b0rk b0rk b0rk b0rk b0rk b0rk , B0rk b0rk b0rk , B0rk b0rk - B0rk - B0rk b0rk , B0rk!"
