### Train a stacked LSTM to generate jokes in the forward or reverse direction, selected by a control bit

In [1]:
import pickle
import importlib
from library import data_preprocess as dp
importlib.reload(dp)
import random
from time import time

import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Input
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, Bidirectional
from keras.layers import Embedding, TimeDistributed, Flatten, Merge, Concatenate
from keras import regularizers
from keras.metrics import sparse_categorical_accuracy, sparse_categorical_crossentropy
from keras.models import load_model
from keras.optimizers import Adam
from keras.models import Model

from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
import tensorflow as tf
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Variables

In [6]:
# --- Files -------------------------------------------------------------------
DATA_PATH = './datasets/jokes.pickle'
VOCAB_PATH = './datasets/jokes_vocabulary.pickle'
MODELS_PATH = './models/'
# GloVe vectors come from http://nlp.stanford.edu/data/glove.6B.zip
GLOVE_PATH = './data/glove.6B.200d.txt'

# --- Run identification (used to build tokenizer / checkpoint file names) -----
MODEL_PREFIX = 'jokes_controlled_stacked_lstm_glove'
RUN_INDEX = 1

# --- Data ---------------------------------------------------------------------
MAX_SEQUENCE_LENGTH = 13
VALIDATION_SPLIT = 0.2
DATA_PERCENT = 0.1

# --- Model sizes --------------------------------------------------------------
GLOVE_EMBEDDING_DIM = 200
EMBEDDING_DIM1 = EMBEDDING_DIM2 = 256
HIDDEN_DIM1 = 512
HIDDEN_DIM2 = 256
DEEPER_DIM = 256

# --- Optimisation / regularisation --------------------------------------------
DROPOUT_FACTOR = 0.2
REGULARIZATION = 0.00001
LEARNING_RATE = 0.003

In [7]:
# Read the pickled joke sentences and the pickled vocabulary list.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
loaded_objects = []
for pickle_path in (DATA_PATH, VOCAB_PATH):
    with open(pickle_path, 'rb') as handle:
        loaded_objects.append(pickle.load(handle))
sentences, vocab = loaded_objects

# In-place shuffle of the corpus (unseeded, so the order differs between runs).
random.shuffle(sentences)

print("Number of sentences = ", len(sentences))
print(sentences[:2])
print("Vocab size = ", len(vocab))
print(vocab[:10])

Number of sentences =  96910
['sos what do you call a person with no arms and no legs in the middle of the ocean ? fucked . eos', 'sos how does lady gaga like her meat ? raw raw raw raw raw eos']
Vocab size =  8922
['sos', 'did', 'you', 'hear', 'about', 'the', 'new', 'corduroy', 'pillows', '?']


In [8]:
# tokenize data
# No character filtering and a plain space split: every space-separated token
# of the corpus becomes a vocabulary entry.
tokenizer = Tokenizer(num_words=None, filters='', lower=True, split=' ',
                      char_level=False, oov_token=None)
tokenizer.fit_on_texts(sentences)

# Sanity check: the fitted tokenizer must agree with the pickled vocabulary size.
num_words = len(vocab)
assert num_words == len(tokenizer.word_index)

encoded_sentences = tokenizer.texts_to_sequences(sentences)
print(encoded_sentences[:5])

# One more than the number of words: id 0 is not assigned to any word and is
# used as the padding value further down.
VOCAB_SIZE = len(tokenizer.word_index) + 1
print(VOCAB_SIZE)

[[1, 17, 20, 9, 66, 4, 210, 29, 62, 591, 12, 62, 303, 14, 5, 596, 19, 5, 931, 6, 1009, 3, 2], [1, 32, 64, 551, 3775, 33, 61, 625, 6, 2747, 2747, 2747, 2747, 2747, 2], [1, 409, 22, 26, 35, 1399, 134, 9, 95, 33, 22, 8, 1272, 9, 3, 2], [1, 13, 141, 720, 16, 14, 5, 835, 15, 40, 2569, 7, 398, 8, 148, 16, 14, 61, 427, 70, 452, 61, 298, 3, 2], [1, 9, 67, 94, 199, 220, 34, 9, 80, 111, 5, 76, 74, 4, 667, 1761, 12, 50, 1904, 3, 2]]
8923


In [9]:
# Persist the fitted tokenizer so generation can reuse the exact word -> id mapping.
tokenizer_path = '{}{}_tokenizer_{}.pickle'.format(MODELS_PATH, MODEL_PREFIX, RUN_INDEX)
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

### Preparing Training Data

In [10]:
# Width of the one-hot direction tag fed at every time step:
# class 0 marks forward samples, class 1 marks reversed samples.
TAG_SIZE = 2
X_data = []
y_data = []
for sentence in encoded_sentences:
    # The window is capped at MAX_SEQUENCE_LENGTH; shorter sentences use len - 3,
    # so each usable sentence yields at least three (input, next-token target) windows.
    window = min(len(sentence) - 3, MAX_SEQUENCE_LENGTH)
    if window < 1:
        # Sentences with three tokens or fewer would give empty or misaligned
        # slices (zero / negative slice bounds) that pad to all-zero rows.
        continue
    for start in range(len(sentence) - window):
        X_data.append(sentence[start:start + window])
        # Target is the same window shifted one token to the right.
        y_data.append(sentence[start + 1:start + window + 1])

print("Total training data size = ", len(X_data))
MAX_SEQ_LEN = max([len(seq) for seq in X_data])
print("Max seq len = ", MAX_SEQ_LEN)

# Forward samples: predict the next token, tag class 0 at every time step.
forward_X_data = pad_sequences(X_data, maxlen=MAX_SEQ_LEN, padding='pre')
forward_y_data = pad_sequences(y_data, maxlen=MAX_SEQ_LEN, padding='pre').reshape(-1, MAX_SEQ_LEN, 1)
forward_tag = to_categorical(np.full((forward_X_data.shape[0], MAX_SEQ_LEN), 0), TAG_SIZE)

# Reverse samples: the reversed target window becomes the input and the reversed
# input window becomes the target, i.e. predict the previous token; tag class 1.
reverse_X_data = pad_sequences([item[::-1] for item in y_data], maxlen=MAX_SEQ_LEN, padding='pre')
reverse_y_data = pad_sequences([item[::-1] for item in X_data], maxlen=MAX_SEQ_LEN, padding='pre').reshape(-1, MAX_SEQ_LEN, 1)
reverse_tag = to_categorical(np.full((reverse_X_data.shape[0], MAX_SEQ_LEN), 1), TAG_SIZE)

X_data = np.concatenate((forward_X_data, reverse_X_data), axis=0)
y_data = np.concatenate((forward_y_data, reverse_y_data), axis=0)
tag_data = np.concatenate((forward_tag, reverse_tag), axis=0)

# shuffle (one shared permutation keeps inputs, targets and tags aligned)
perm = np.random.permutation(X_data.shape[0])
X_data = X_data[perm]
y_data = y_data[perm]
tag_data = tag_data[perm]
print(X_data.shape)
print(X_data[:4])
print(y_data.shape)
print(y_data[:4])
print(tag_data.shape)
print(tag_data[:4])

Total training data size =  833131
Max seq len =  13
(1666262, 13)
[[  10  464   14  396   12  365   33    5  109   65   37   39    3]
 [   2   87 5671   15  144   35   19  561    5 1012    8   96    9]
 [   8  450  736  139   53 7257 3279    6  977   10   28   24 1121]
 [  54  200    5  202  212  677  828 7409   16    5 7594  202  212]]
(1666262, 13, 1)
[[[ 464]
  [  14]
  [ 396]
  [  12]
  [ 365]
  [  33]
  [   5]
  [ 109]
  [  65]
  [  37]
  [  39]
  [   3]
  [   2]]

 [[  87]
  [5671]
  [  15]
  [ 144]
  [  35]
  [  19]
  [ 561]
  [   5]
  [1012]
  [   8]
  [  96]
  [   9]
  [  67]]

 [[ 450]
  [ 736]
  [ 139]
  [  53]
  [7257]
  [3279]
  [   6]
  [ 977]
  [  10]
  [  28]
  [  24]
  [1121]
  [   4]]

 [[ 200]
  [   5]
  [ 202]
  [ 212]
  [ 677]
  [ 828]
  [7409]
  [  16]
  [   5]
  [7594]
  [ 202]
  [ 212]
  [   2]]]
(1666262, 13, 2)
[[[1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]]

 [[0. 1.]
  [0. 1.

In [7]:
print('Indexing glove word vectors')
# Glove Vectors: each line is "<word> <v1> <v2> ...".
glove_embeddings_index = {}
# The context manager closes the file even if parsing fails; the GloVe text
# files are UTF-8, so do not depend on the platform default encoding.
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_embeddings_index[word] = coefs
print('Total %s word vectors.' % len(glove_embeddings_index))

Indexing glove word vectors
Total 400000 word vectors.


In [8]:
print('Preparing glove embedding matrix')
# Row r holds the GloVe vector of the word whose tokenizer id is r. Rows start
# as zeros, so words missing from the GloVe index keep an all-zero vector.
glove_embedding_matrix = np.zeros((VOCAB_SIZE, GLOVE_EMBEDDING_DIM))
for token, row in tokenizer.word_index.items():
    vector = glove_embeddings_index.get(token)
    if vector is None:
        continue
    glove_embedding_matrix[row] = vector
# Counts every row whose components sum to zero (row 0, which no word maps to, included).
zero_sum_rows = glove_embedding_matrix.sum(axis=1) == 0
print('Null word embeddings: %d' % np.sum(zero_sum_rows))
print(glove_embedding_matrix.shape)

Preparing glove embedding matrix
Null word embeddings: 255
(8923, 200)


In [22]:
# define model
def StackedLSTM(vocab_size, glove_embedding_dim, glove_embedding_matrix, embedding_dim1, embedding_dim2,
           hidden_dim1, hidden_dim2, deeper_dim, max_seq_len, tag_size,
           dropout_factor=0.5, regularization=0.00001, learning_rate=0.001):
    
    inputs = Input(shape=(None,))
    tag_inputs = Input(shape=(None,tag_size))
    
    glove_embedding = Embedding(vocab_size, glove_embedding_dim, #input_length=max_seq_len,
                                  weights=[glove_embedding_matrix],
                                  mask_zero=True,trainable=False)(inputs)
    
    word_embedding = Embedding(vocab_size, embedding_dim1, #input_length=max_seq_len, 
                               mask_zero=True, embeddings_regularizer=regularizers.l2(regularization))(inputs)
    
    #tag_embedding = Embedding(tag_size, tag_size, embeddings_regularizer=regularizers.l2(regularization))(tag_inputs)
    
    concat_embeds = Concatenate(axis=-1)([glove_embedding, word_embedding, tag_inputs])
    
    final_embed = Dense(units=embedding_dim2, activation='tanh',
                        kernel_regularizer=regularizers.l2(regularization))(concat_embeds)
    
    lstm1 = LSTM(hidden_dim1, activation='tanh', 
                   kernel_regularizer=regularizers.l2(regularization), 
                   recurrent_regularizer=regularizers.l2(regularization), #unroll=True, 
                   return_sequences = True, dropout=dropout_factor, recurrent_dropout=dropout_factor)(final_embed)
    
    lstm2 = LSTM(hidden_dim2, activation='tanh', 
                   kernel_regularizer=regularizers.l2(regularization), 
                   recurrent_regularizer=regularizers.l2(regularization), #unroll=True, 
                   return_sequences = True, dropout=dropout_factor, recurrent_dropout=dropout_factor)(lstm1)
    
    timedist_dropout = TimeDistributed(Dropout(dropout_factor))(lstm2)
    
    deep_dense = Dense(units=deeper_dim, activation='tanh', 
                       kernel_regularizer=regularizers.l2(regularization))(timedist_dropout)
    
    dropout_layer1 = Dropout(dropout_factor)(deep_dense)
    
    outputs = Dense(units=vocab_size, activation='softmax', 
                    kernel_regularizer=regularizers.l2(regularization))(dropout_layer1)
    
    model = Model(inputs=[inputs,tag_inputs], outputs=outputs)
    
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=learning_rate),
                  metrics=[sparse_categorical_crossentropy, sparse_categorical_accuracy], sample_weight_mode='temporal')
    return model

In [23]:
# Reset the Keras backend and register a new TensorFlow session before building.
K.clear_session()
sess = tf.Session()
K.set_session(sess)

# All hyper-parameters come from the constants cell / data preparation cell.
model_kwargs = dict(
    vocab_size=VOCAB_SIZE,
    glove_embedding_dim=GLOVE_EMBEDDING_DIM,
    glove_embedding_matrix=glove_embedding_matrix,
    embedding_dim1=EMBEDDING_DIM1,
    embedding_dim2=EMBEDDING_DIM2,
    hidden_dim1=HIDDEN_DIM1,
    hidden_dim2=HIDDEN_DIM2,
    deeper_dim=DEEPER_DIM,
    max_seq_len=MAX_SEQ_LEN,
    tag_size=TAG_SIZE,
    dropout_factor=DROPOUT_FACTOR,
    regularization=REGULARIZATION,
    learning_rate=LEARNING_RATE,
)
model = StackedLSTM(**model_kwargs)
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    1784600     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 256)    2284288     input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 2)      0                                            
__________________________________________________________________________________________________
concatenat

In [24]:
class TB(TensorBoard):
    """TensorBoard callback that also writes scalar summaries during an epoch.

    Every ``log_every`` batches, each entry of the batch ``logs`` (except
    ``'batch'`` and ``'size'``) is written as a scalar summary whose step is the
    running batch counter.

    Args:
        log_every: Write batch-level summaries every this many batches.
        **kwargs: Forwarded unchanged to ``TensorBoard.__init__``.
    """

    def __init__(self, log_every=1, **kwargs):
        super().__init__(**kwargs)
        self.log_every = log_every
        # Batches seen since the callback was created; never reset, so it spans epochs.
        self.counter = 0

    def on_batch_end(self, batch, logs=None):
        """Write the current batch metrics when the counter hits a multiple of ``log_every``."""
        # `logs` defaults to None; iterating it directly would raise AttributeError.
        logs = logs or {}
        self.counter += 1
        if self.counter % self.log_every == 0:
            for name, value in logs.items():
                if name in ['batch', 'size']:
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                # Values may be numpy scalars (which have .item()) or plain Python numbers.
                summary_value.simple_value = value.item() if hasattr(value, 'item') else value
                summary_value.tag = name
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()

        super().on_batch_end(batch, logs)

In [None]:
start_time = time()
# Each run logs into its own timestamped sub-directory; batch metrics every 10 batches.
tensorboard = TB(log_dir="./logs/" + MODEL_PREFIX + "/{}".format(time()), 
                          histogram_freq=0, write_graph=True, write_images=False, log_every=10)

callbacks=[tensorboard, 
           EarlyStopping(patience=5, monitor='val_loss'),
           # One checkpoint file per epoch (epoch number and val_loss in the file name) ...
           ModelCheckpoint(filepath=MODELS_PATH + 'checkpoints/'+ MODEL_PREFIX + '_gen' + str(RUN_INDEX) + '.{epoch:02d}-{val_loss:.2f}.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1), 
           # ... plus a single file that is only overwritten when val_loss improves.
           ModelCheckpoint(filepath=MODELS_PATH + MODEL_PREFIX + '_gen'+str(RUN_INDEX)+'.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1, save_best_only=True)]

# Use the VALIDATION_SPLIT constant instead of repeating its value here, so that
# editing the constants cell actually changes the hold-out fraction.
model.fit([X_data, tag_data], y_data, epochs=10, batch_size=1024, shuffle=True, verbose=1,
          validation_split=VALIDATION_SPLIT, callbacks=callbacks)

print("Total elapsed time: ", time()-start_time)

Train on 1333009 samples, validate on 333253 samples
Epoch 1/10

Epoch 00001: saving model to ./models/checkpoints/jokes_controlled_stacked_lstm_glove_gen1.01-4.59.hdf5

Epoch 00001: val_loss improved from inf to 4.59085, saving model to ./models/jokes_controlled_stacked_lstm_glove_gen1.hdf5
Epoch 2/10

Epoch 00002: saving model to ./models/checkpoints/jokes_controlled_stacked_lstm_glove_gen1.02-4.39.hdf5

Epoch 00002: val_loss improved from 4.59085 to 4.38835, saving model to ./models/jokes_controlled_stacked_lstm_glove_gen1.hdf5
Epoch 3/10

Epoch 00003: saving model to ./models/checkpoints/jokes_controlled_stacked_lstm_glove_gen1.03-4.31.hdf5

Epoch 00003: val_loss improved from 4.38835 to 4.31136, saving model to ./models/jokes_controlled_stacked_lstm_glove_gen1.hdf5
Epoch 4/10

Epoch 00004: saving model to ./models/checkpoints/jokes_controlled_stacked_lstm_glove_gen1.04-4.27.hdf5

Epoch 00004: val_loss improved from 4.31136 to 4.27095, saving model to ./models/jokes_controlled_stac

In [2]:
# generate a sequence from a language model
def generate_categorical(model, tokenizer, seed_text, maxlen, probabilistic=False, exploration_factor=1.0, tag=0):
    
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
    seq = tokenizer.texts_to_sequences([seed_text])[0]
    print(seq)
    
    while True:
        encoded_seq = seq
        if len(seq) > MAX_SEQ_LEN:
            encoded_seq = encoded_seq[-1*MAX_SEQ_LEN:]
            
        #padded_seq = pad_sequences([encoded_seq], maxlen=MAX_SEQ_LEN, padding='pre')
        padded_seq = np.array([seq])
        tags = to_categorical(np.full((1, padded_seq[0].shape[0]), tag), TAG_SIZE)
        y_prob = model.predict([padded_seq,tags])[0][-1].reshape(1,-1)#[3:].reshape(-1,1)
        
        if random.random() <= exploration_factor:
            probabilistic = True
        else:
            probabilistic = False
            
        if probabilistic:
            y_class = np.argmax(np.random.multinomial(1,y_prob[0]/(np.sum(y_prob[0])+1e-5),1))
        else:
            y_class = y_prob.argmax(axis=-1)[0]
        
        if y_class == 0:
            break
        out_word = reverse_word_map[y_class]
        seq.append(y_class)
        if out_word == 'eos' or len(seq) > maxlen or out_word == 'sos':
            break
    
    words = [reverse_word_map[idx] for idx in seq]
    
    return ' '.join(words)

In [11]:
# New backend session, then restore the model file written by the
# save_best_only checkpoint and the tokenizer pickled before training.
K.clear_session()
sess = tf.Session()
K.set_session(sess)

model_file = 'models/jokes_controlled_stacked_lstm_glove_gen1.hdf5'
tokenizer_file = 'models/jokes_controlled_stacked_lstm_glove_tokenizer_1.pickle'

model = load_model(model_file)
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(tokenizer_file, 'rb') as handle:
    tokenizer = pickle.load(handle)

In [12]:
# Forward generation (tag class 0): continue the joke from its opening words.
seed_text = "sos i had to use"
joke = generate_categorical(model, tokenizer, seed_text, maxlen=40, tag=0)
print(joke)

[1, 7, 84, 8, 258]
sos i had to use opposite keller while she came home from kitchen . and got tired . eos


In [13]:
def bigrams_list(sentence):
    """Return every pair of adjacent words in ``sentence`` as ``'w1 w2'`` strings.

    Words are produced by splitting on single spaces; a sentence with fewer
    than two words gives an empty list.
    """
    tokens = sentence.split(' ')
    return [left + ' ' + right for left, right in zip(tokens, tokens[1:])]

print(bigrams_list("sos hello , i'm a dinosaur . eos"))

['sos hello', 'hello ,', ", i'm", "i'm a", 'a dinosaur', 'dinosaur .', '. eos']


In [14]:
# Bigram list for every training sentence, in the same order as `sentences`.
sentence_bigrams = list(map(bigrams_list, sentences))
print(sentence_bigrams[:2])

[['sos what', 'what do', 'do you', 'you call', 'call a', 'a person', 'person with', 'with no', 'no arms', 'arms and', 'and no', 'no legs', 'legs in', 'in the', 'the middle', 'middle of', 'of the', 'the ocean', 'ocean ?', '? fucked', 'fucked .', '. eos'], ['sos how', 'how does', 'does lady', 'lady gaga', 'gaga like', 'like her', 'her meat', 'meat ?', '? raw', 'raw raw', 'raw raw', 'raw raw', 'raw raw', 'raw eos']]


In [15]:
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    Order and duplicates of ``lst1`` are preserved.
    """
    lookup = set(lst2)
    return [value for value in lst1 if value in lookup]

def similarity_score(lst1, lst2):
    """Return the fraction of ``lst1``'s items that occur in ``lst2``.

    The score is normalised by ``len(lst1)`` only, so it is not symmetric.
    An empty ``lst1`` scores 0.0 instead of raising ZeroDivisionError.
    """
    if not lst1:
        return 0.0
    return (1.0 * len(intersection(lst1, lst2))) / len(lst1)
 
def print_closest_sentences(sentence, sentence_bigrams, top_k=3, corpus=None):
    """Print the ``top_k`` corpus sentences sharing the most bigrams with ``sentence``.

    Each output line has the form ``<score>  ->  <corpus sentence>``, best score first.

    Args:
        sentence: Space-separated sentence to look up.
        sentence_bigrams: One bigram list per corpus sentence, index-aligned with ``corpus``.
        top_k: Maximum number of matches to print; fewer are printed when the
            corpus is smaller, and nothing is printed for ``top_k <= 0``.
        corpus: Sentences to print. Defaults to the notebook-level ``sentences`` list.
    """
    if corpus is None:
        corpus = sentences
    bigrams = bigrams_list(sentence)
    scores = np.array([similarity_score(bigrams, sbigrams)
                       for sbigrams in sentence_bigrams])
    # Clamp so that asking for more matches than there are sentences cannot
    # index past the end of the score array.
    top_k = min(top_k, len(scores))
    if top_k <= 0:
        return
    top_k_indices = scores.argsort()[-1*top_k:][::-1]
    top_k_scores = scores[top_k_indices]
    for k in range(top_k):
        print(top_k_scores[k], " -> ", corpus[top_k_indices[k]])

In [16]:
# List the ten training jokes that share the largest fraction of the generated joke's bigrams.
print_closest_sentences(joke, sentence_bigrams, top_k=10)

0.2777777777777778  ->  sos i used to own a motorcycle shop , but i had to sell it . i was always two tired . eos
0.2777777777777778  ->  sos i had to use my glasses when playing tennis . because its a no contact sport . eos
0.2777777777777778  ->  sos i had to put my dog down today . my arms were getting tired . eos
0.2222222222222222  ->  sos i saw two kids fighting in the elementary school playground this morning . being the only adult around , i had to step in . they did not stand a chance . eos
0.2222222222222222  ->  sos i had to turn off my carbon monoxide detector threedots threedots the constant beeping was giving me a headache and making me feel sick . eos
0.2222222222222222  ->  sos i was seeing a therapist for trust issues , but i had to quit going when i found out he was seeing other patients . eos
0.2222222222222222  ->  sos i had to put my dog down last night he's just too darn heavy to carry around anymore . eos
0.2222222222222222  ->  sos i had to clean out my spice ra

In [21]:
# Reverse generation (tag class 1): the seed is the joke's ending in reversed
# word order; the generated words are flipped back before printing.
backward_joke = generate_categorical(model, tokenizer, "eos bastard racist you", maxlen=40, tag=1)
joke = ' '.join(reversed(backward_joke.split(' ')))
print(joke)
print_closest_sentences(joke, sentence_bigrams, top_k=10)

[2, 2458, 389, 9]
sos if you got the potter off my car back threedots threedots you racist bastard eos
0.26666666666666666  ->  sos if you wear a radioactive belt threedots threedots you end up with nuclear waist . eos
0.26666666666666666  ->  sos if you put your right ear really close to your left knee and you listen threedots threedots you can hear a voice say ' what the fuck are you doing ? ' eos
0.26666666666666666  ->  sos if you ain't muslim threedots threedots you ain't shiite eos
0.26666666666666666  ->  sos if you walk in to a room and find a man having a stroke threedots threedots you probably should have knocked . eos
0.2  ->  sos girl , if you don't stop touching my crotch , threedots threedots you might feel a small prick . eos
0.2  ->  sos if you ever find yourself being attacked by a gang of clowns threedots threedots go for the juggler . eos
0.2  ->  sos if you laid out all of the people in the world who were ever mean to me , i could then drive my car over them . eos
0

In [22]:
# Reverse generation (tag class 1) from a reversed punchline ending;
# the generated words are flipped back into reading order before printing.
backward_joke = generate_categorical(model, tokenizer, "eos . sport contact", maxlen=40, tag=1)
joke = ' '.join(reversed(backward_joke.split(' ')))
print(joke)
print_closest_sentences(joke, sentence_bigrams, top_k=10)

[2, 3, 2082, 1698]
sos what do you call a bunch of sauce on a plane ? the whiskey contact sport . eos
0.5555555555555556  ->  sos what do you call a bunch of male pornstars on a flight together ? snakes on a plane . eos
0.5555555555555556  ->  sos what do you call a bunch of white guys sitting on a bench ? the nba . eos
0.5  ->  sos what do you call a black man flying a plane ? the pilot you racist fuck . eos
0.5  ->  sos what do you call a middle eastern man flying a plane ? the pilot , you racist . eos
0.5  ->  sos what do you call a bunch of white guys on a bench ? the nba eos
0.5  ->  sos what do you call a bunch of white guys sitting on a bench ? the nba eos
0.5  ->  sos what do you call a gay guy flying a plane ? the pilot . eos
0.5  ->  sos what do you call a muslim on a plane ? a passenger threedots you racist bastard . eos
0.5  ->  sos what do you call a black man flying a plane ? the pilot . eos
0.5  ->  sos what do you call a bunch of white dudes sitting on a bench ? the nba

In [24]:
# Forward generation (tag class 0) from a common joke opening, followed by the
# ten training jokes with the highest bigram overlap.
seed_text = "sos what do you call"
joke = generate_categorical(model, tokenizer, seed_text, maxlen=40, tag=0)
print(joke)
print_closest_sentences(joke, sentence_bigrams, top_k=10)

[1, 17, 20, 9, 66]
sos what do you call a man who doesn't expecting back out with the using ? both make sure . eos
0.4  ->  sos what do you call a farmer who is really good at his job ? a man who is outstanding in his field . eos
0.4  ->  sos what do you call a man who expects to have sex on the second date ? slow . eos
0.4  ->  sos what do you call a man who comes in through the letter box ? bill . eos
0.4  ->  sos what do you call a man who has lost the lower parts of his legs , but still somehow has his feet ? tony . eos
0.4  ->  sos what do you call a man who can't stop stealing ? nick . eos
0.4  ->  sos what do you call a man who has been dead and buried for thousands of years ? pete . eos
0.4  ->  sos what do you call a man who is too proud of his balls ? ego - testicle . eos
0.4  ->  sos what do you call a man who loves a woman for her brains ? a zombie . eos
0.4  ->  sos what do you call a man who expects to have sex on the second date ? patient . eos
0.4  ->  sos what do you c