In [1]:
import pickle
import importlib
from library import data_preprocess as dp
importlib.reload(dp)
import random
from time import time

import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, Bidirectional
from keras.layers import Embedding
from keras import regularizers
from keras.metrics import sparse_categorical_accuracy, sparse_categorical_crossentropy

from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Variables

In [42]:
# --- Files -----------------------------------------------------------------
# Pickled inputs read by the loading cell, and the folder prefix used for the
# ModelCheckpoint file names.
DATA_PATH = './datasets/jokes.pickle'
VOCAB_PATH = './datasets/jokes_vocabulary.pickle'
MODELS_PATH = './models/'

# --- Data preparation ------------------------------------------------------
# Upper bound on the sliding-window (context) length used to build X_data.
MAX_SEQUENCE_LENGTH = 10
# NOTE(review): not referenced by the visible cells -- model.fit() hard-codes
# validation_split=0.2. Confirm which value is intended.
VALIDATION_SPLIT = 0.1
# NOTE(review): not referenced by any visible cell.
DATA_PERCENT = 0.1

# --- Model hyper-parameters (passed to BiLSTM) -------------------------------
EMBEDDING_DIM, HIDDEN_DIM = 400, 1600
DROPOUT_FACTOR = 0.333
REGULARIZATION = 1e-5

In [3]:
# SECURITY: pickle.load can execute arbitrary code while unpickling -- only
# point DATA_PATH / VOCAB_PATH at files you created or otherwise trust.
with open(DATA_PATH, 'rb') as data_file:
    sentences = pickle.load(data_file)

with open(VOCAB_PATH, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# Shuffle with a dedicated, seeded generator so that Restart & Run All yields
# the same sentence order every time. The order matters downstream: Keras
# carves the validation split from the *end* of the arrays built from it.
# A private Random instance leaves the global `random` state untouched.
random.Random(42).shuffle(sentences)

print("Number of sentences = ", len(sentences))
print(sentences[:2])
print("Vocab size = ", len(vocab))
print(vocab[:10])

Number of sentences =  96910
['sos i had to use my glasses when playing tennis . because its a no contact sport . eos', 'sos why did the japanese funeral home have to turn away new business ? they ran out of san storage eos']
Vocab size =  8922
['sos', 'did', 'you', 'hear', 'about', 'the', 'new', 'corduroy', 'pillows', '?']


In [4]:
# tokenize data
# Expected vocabulary size, taken from the pre-built vocabulary list.
num_words = len(vocab)

# filters='' with split=' ': nothing is stripped and text is split on single
# spaces, so punctuation tokens such as '.' and '?' stay in the vocabulary
# (they appear in the printed vocab sample). num_words=None is passed, i.e. no
# explicit cap on the vocabulary is requested here.
tokenizer = Tokenizer(num_words=None, filters='', lower=True, split=' ', 
                      char_level=False, oov_token=None)
tokenizer.fit_on_texts(sentences)
# Sanity check: the tokenizer must have learned exactly as many distinct words
# as the vocabulary file lists.
assert num_words == len(tokenizer.word_index)

# Each sentence becomes a list of integer word ids.
encoded_sentences = tokenizer.texts_to_sequences(sentences)
print(encoded_sentences[:5])

# One more than the number of words: the printed encodings start at id 1, so
# presumably id 0 is reserved for padding (pad_sequences output / mask_zero=True
# in the Embedding layer) -- TODO confirm.
VOCAB_SIZE = len(tokenizer.word_index) + 1
print(VOCAB_SIZE)

[[1, 7, 84, 8, 258, 13, 1286, 34, 562, 1726, 3, 40, 206, 4, 62, 1683, 2075, 3, 2], [1, 25, 36, 5, 1124, 898, 227, 27, 8, 376, 259, 142, 659, 6, 37, 638, 52, 19, 5530, 6924, 2], [1, 4, 265, 292, 132, 95, 54, 4, 1287, 14, 5, 335, 3, 2], [1, 58, 225, 28, 351, 72, 35, 144, 924, 8, 720, 75, 3, 2], [1, 11, 7, 95, 1880, 60, 45, 50, 5950, 21, 11, 42, 107, 986, 8, 53, 556, 396, 19, 266, 49, 5531, 3, 2]]
8923


### Preparing Training Data

In [47]:
# Build (context window -> next word) training pairs with a sliding window.
X_data = []
y_data = []
step_size = 1  # stride between consecutive windows (loop-invariant)
for sentence in encoded_sentences:
    sentence_len = len(sentence)
    # The window is capped at MAX_SEQUENCE_LENGTH and always leaves at least
    # three tokens of the sentence to serve as prediction targets.
    sliding_window_length = min(sentence_len - 3, MAX_SEQUENCE_LENGTH)
    if sliding_window_length < 1:
        # Sentences of three tokens or fewer would yield empty (or, for a
        # negative length, wrapped-around) contexts with meaningless targets.
        continue
    for i in range(0, sentence_len - sliding_window_length, step_size):
        X_data.append(sentence[i:i+sliding_window_length])
        y_data.append(sentence[i+sliding_window_length])

if not X_data:
    raise ValueError("No training windows were produced: every sentence has "
                     "three tokens or fewer.")

print("Total training data size = ", len(X_data))
MAX_SEQ_LEN = max(len(seq) for seq in X_data)
print("Max seq len = ", MAX_SEQ_LEN)
# Left-pad shorter windows with zeros so every row has MAX_SEQ_LEN columns.
X_data = pad_sequences(X_data, maxlen=MAX_SEQ_LEN, padding='pre')
# Column vector of target word ids (one integer label per window).
y_data = np.array(y_data).reshape(-1,1)
print(X_data.shape)
print(X_data[:2])
print(y_data.shape)
print(y_data[:2])

Total training data size =  1073602
Max seq len =  10
(1073602, 10)
[[   1    7   84    8  258   13 1286   34  562 1726]
 [   7   84    8  258   13 1286   34  562 1726    3]]
(1073602, 1)
[[ 3]
 [40]]


In [73]:
# define model
def BiLSTM(vocab_size, embedding_dim, hidden_dim, max_seq_len, 
           dropout_factor=0.5, regularization=0.00001):
    """Build and compile the next-word prediction network.

    Layer stack: Embedding (mask_zero=True) -> Bidirectional(LSTM) -> Dropout
    -> Dense softmax over ``vocab_size`` classes. Every weight matrix that
    takes a regularizer gets an L2 penalty of strength ``regularization``.

    Args:
        vocab_size: number of embedding rows and of output classes.
        embedding_dim: width of the word embeddings.
        hidden_dim: units of the LSTM wrapped by ``Bidirectional``.
        max_seq_len: accepted for interface compatibility; not used, because
            the Embedding layer is created without a fixed ``input_length``.
        dropout_factor: rate handed to the Dropout layer.
        regularization: L2 coefficient.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, cross-entropy and accuracy as metrics).
    """
    def l2_penalty():
        # A fresh regularizer object per use, exactly one per weight group.
        return regularizers.l2(regularization)

    embedding_layer = Embedding(vocab_size, embedding_dim,
                                mask_zero=True,
                                embeddings_regularizer=l2_penalty())
    recurrent_layer = Bidirectional(
        LSTM(hidden_dim,
             activation='tanh',
             kernel_regularizer=l2_penalty(),
             recurrent_regularizer=l2_penalty()))
    dropout_layer = Dropout(dropout_factor)
    output_layer = Dense(units=vocab_size, activation='softmax',
                         kernel_regularizer=l2_penalty())

    model = Sequential()
    for layer in (embedding_layer, recurrent_layer, dropout_layer, output_layer):
        model.add(layer)

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=[sparse_categorical_crossentropy,
                           sparse_categorical_accuracy])
    return model

In [74]:
model = BiLSTM(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM,
              max_seq_len=MAX_SEQ_LEN, dropout_factor=DROPOUT_FACTOR, regularization=REGULARIZATION)
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, None, 400)         3569200   
_________________________________________________________________
bidirectional_14 (Bidirectio (None, 3200)              25612800  
_________________________________________________________________
dropout_13 (Dropout)         (None, 3200)              0         
_________________________________________________________________
dense_13 (Dense)             (None, 8923)              28562523  
Total params: 57,744,523
Trainable params: 57,744,523
Non-trainable params: 0
_________________________________________________________________
None


In [75]:
class TB(TensorBoard):
    """TensorBoard callback that also writes scalar logs every N batches.

    On every ``log_every``-th batch, each entry of the batch ``logs`` (except
    the bookkeeping keys 'batch' and 'size') is written as a scalar summary
    tagged with the entry name, using a running batch counter as the step.

    Args:
        log_every: write batch-level scalars every this many batches (>= 1).
        **kwargs: forwarded unchanged to ``TensorBoard``.

    Raises:
        ValueError: if ``log_every`` is smaller than 1.
    """

    def __init__(self, log_every=1, **kwargs):
        if log_every < 1:
            # Fail at construction instead of with a ZeroDivisionError on the
            # first batch of a long training run.
            raise ValueError("log_every must be >= 1, got {}".format(log_every))
        super().__init__(**kwargs)
        self.log_every = log_every
        # Batches seen since the callback was created (not reset per epoch).
        self.counter = 0

    def on_batch_end(self, batch, logs=None):
        # The signature allows logs=None; treat that as "nothing to log".
        logs = logs or {}
        self.counter += 1
        if self.counter % self.log_every == 0:
            for name, value in logs.items():
                if name in ('batch', 'size'):
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                # float() accepts numpy scalars as well as plain Python
                # numbers, which have no .item() method.
                summary_value.simple_value = float(value)
                summary_value.tag = name
                # self.writer is not created in this class; presumably the
                # parent TensorBoard callback sets it up -- confirm.
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()

        super().on_batch_end(batch, logs)

In [None]:
# Train the model, logging to TensorBoard and checkpointing along the way.
start_time = time()
# Each run logs into its own directory named after the current timestamp;
# batch-level scalars are written every 10 batches (log_every=10).
tensorboard = TB(log_dir="./logs/jokes_bilstm/{}".format(time()), 
                          histogram_freq=0, write_graph=True, write_images=False, log_every=10)

# EarlyStopping and ModelCheckpoint both monitor 'val_loss'; the checkpoint
# file name embeds the epoch number and the validation loss.
# NOTE(review): MODELS_PATH is only used as a prefix here; nothing in this
# cell creates the directory -- confirm it exists before a long run.
callbacks=[tensorboard, 
           EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath=MODELS_PATH + 'jokes_bilstm_gen.{epoch:02d}-{val_loss:.2f}.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1)]

# NOTE(review): validation_split is hard-coded to 0.2 although the config cell
# defines VALIDATION_SPLIT = 0.1 (unused). Confirm which value is intended.
model.fit(X_data, y_data, epochs=10, batch_size=1024, shuffle=True, verbose=1, validation_split=0.2, 
          callbacks=callbacks)

print("Total elapsed time: ", time()-start_time)

Train on 858881 samples, validate on 214721 samples
Epoch 1/10
109568/858881 [==>...........................] - ETA: 19:08 - loss: 6.3838 - sparse_categorical_crossentropy: 6.2614 - sparse_categorical_accuracy: 0.0985

In [68]:
# generate a sequence from a language model
def generate(model, tokenizer, seed_text, maxlen, context_len=None):
    """Greedily extend ``seed_text`` word by word with a next-word model.

    The seed is encoded with ``tokenizer``; then the most probable next word
    id is appended repeatedly until one of the following happens:

    * the sequence holds ``maxlen`` tokens (seed included),
    * the model predicts id 0 (not a word in the tokenizer's index), or
    * the word 'eos' has been appended.

    Args:
        model: object whose ``predict`` takes an integer array of shape
            (1, n_tokens) and returns per-class scores for that one row.
        tokenizer: object providing ``word_index`` (word -> id) and
            ``texts_to_sequences``.
        seed_text: space-separated words to start from.
        maxlen: maximum number of tokens in the result. A seed that is already
            this long (or longer) is returned without any prediction.
        context_len: if given (and non-zero), only the last ``context_len``
            tokens are fed to the model at each step, e.g. the window length
            used for training. ``None`` feeds the whole sequence.

    Returns:
        The generated token sequence joined with single spaces.

    Raises:
        ValueError: if ``seed_text`` encodes to an empty sequence.
    """
    reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}
    seq = tokenizer.texts_to_sequences([seed_text])[0]
    print(seq)
    if not seq:
        # Without this the model would be handed a zero-width input array.
        raise ValueError("seed_text contains no words known to the tokenizer")

    # Checking the length *before* predicting keeps the result within maxlen
    # (the previous post-append '>' test allowed maxlen + 1 tokens).
    while len(seq) < maxlen:
        context = seq[-context_len:] if context_len else seq
        y_prob = model.predict(np.array([context]))
        # Plain int so that seq stays a list of Python integers.
        y_class = int(y_prob.argmax(axis=-1)[0])
        if y_class == 0:
            break
        seq.append(y_class)
        if reverse_word_map[y_class] == 'eos':
            break

    return ' '.join(reverse_word_map[idx] for idx in seq)

In [69]:
# Generate a joke from a five-word seed, capped at maxlen=40.
# NOTE(review): the recorded traceback below names `embedding_12_input` with a
# fixed input length of 10, while the model summary above shows
# `embedding_14` -- presumably stale output from an earlier model; re-run.
joke = generate(model, tokenizer, "sos i had to use", maxlen=40)
print(joke)

[1, 7, 84, 8, 258]


ValueError: Error when checking : expected embedding_12_input to have shape (10,) but got array with shape (5,)

In [55]:
def bigrams_list(sentence):
    """Return the adjacent word pairs of ``sentence`` as 'w1 w2' strings.

    The sentence is split on single spaces; each word is paired with its
    successor, in order. A sentence with fewer than two words gives [].
    """
    tokens = sentence.split(' ')
    return [left + ' ' + right for left, right in zip(tokens, tokens[1:])]

# Quick visual check of bigrams_list on a hand-written sentence.
print(bigrams_list("sos hello , i'm a dinosaur . eos"))

['sos hello', 'hello ,', ", i'm", "i'm a", 'a dinosaur', 'dinosaur .', '. eos']


In [56]:
# Bigram list for every corpus sentence, index-aligned with `sentences`
# (print_closest_sentences relies on that alignment).
sentence_bigrams = [bigrams_list(s) for s in sentences]
print(sentence_bigrams[:2])

[['sos i', 'i had', 'had to', 'to use', 'use my', 'my glasses', 'glasses when', 'when playing', 'playing tennis', 'tennis .', '. because', 'because its', 'its a', 'a no', 'no contact', 'contact sport', 'sport .', '. eos'], ['sos why', 'why did', 'did the', 'the japanese', 'japanese funeral', 'funeral home', 'home have', 'have to', 'to turn', 'turn away', 'away new', 'new business', 'business ?', '? they', 'they ran', 'ran out', 'out of', 'of san', 'san storage', 'storage eos']]


In [57]:
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    The order of ``lst1`` is kept, and so are its duplicates.
    """
    lookup = set(lst2)
    kept = []
    for item in lst1:
        if item in lookup:
            kept.append(item)
    return kept

def similarity_score(lst1, lst2):
    """Fraction of the items of ``lst1`` that also occur in ``lst2``.

    Repeated items of ``lst1`` each count. The score is normalised by
    ``len(lst1)`` only, so it is not symmetric in its arguments.

    Returns:
        A float in [0.0, 1.0]; 0.0 when ``lst1`` is empty (for instance the
        bigram list of a sentence with fewer than two words), instead of
        raising ZeroDivisionError.
    """
    if not lst1:
        return 0.0
    members = set(lst2)
    shared = sum(1 for value in lst1 if value in members)
    return (1.0 * shared) / len(lst1)
 
def print_closest_sentences(sentence, sentence_bigrams, top_k=3):
    """Print the corpus sentences whose bigrams best cover those of ``sentence``.

    Each corpus entry is scored with ``similarity_score(bigrams of sentence,
    entry)``; the ``top_k`` highest-scoring entries are printed, best first,
    as ``score  ->  sentence``.

    Args:
        sentence: space-separated text to look up.
        sentence_bigrams: one bigram list per corpus sentence. It must be
            index-aligned with the notebook-level ``sentences`` list, which
            this function reads to print the matching text.
        top_k: maximum number of matches to print (non-negative). Fewer lines
            are printed when the corpus holds fewer sentences.
    """
    bigrams = bigrams_list(sentence)
    scores = np.array([similarity_score(bigrams, sbigrams)
                       for sbigrams in sentence_bigrams])
    # Reverse first, then slice: same order as taking the last top_k entries
    # of the ascending sort, but top_k == 0 now selects nothing and a top_k
    # larger than the corpus simply selects everything.
    top_k_indices = scores.argsort()[::-1][:top_k]
    # Iterate over the indices actually selected rather than range(top_k),
    # which raised IndexError when fewer than top_k sentences were available.
    for idx in top_k_indices:
        print(scores[idx], " -> ", sentences[idx])

In [62]:
# Show the ten training sentences that share the most bigrams with the
# generated joke, to gauge how much of it is copied from the corpus.
print_closest_sentences(joke, sentence_bigrams, 10)

0.6363636363636364  ->  sos i made up a new word . plagiarism . eos
0.6363636363636364  ->  sos i just invented a new word . plagiarism . eos
0.6363636363636364  ->  sos i invented a new word . plagiarism . eos
0.6363636363636364  ->  sos i just invented a new joke . i just invented a new word . plagiarism . eos
0.6363636363636364  ->  sos i made a new joke . i made a new word . plagiarism . eos
0.5454545454545454  ->  sos i recently invented a new word to describe a lot of the jokes on the subreddit . plagiarism . eos
0.5454545454545454  ->  sos i invented a new word the other day . plagiarism . eos
0.5454545454545454  ->  sos hey people , i've invented a new word . plagiarism . eos
0.5454545454545454  ->  sos i created a new word today . plagiarism . eos
0.45454545454545453  ->  sos i had to use my glasses when playing tennis . because its a no contact sport . eos


In [65]:
# Second experiment: generate from a different seed, then list the ten
# training sentences sharing the most bigrams with the result.
joke = generate(model, tokenizer, "sos what do you call", maxlen=40)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 17, 20, 9, 66]
sos what do you call a vegetarian ? a rip - off . eos
0.8461538461538461  ->  sos what do you call a cheap circumcision ? a rip - off . eos
0.8461538461538461  ->  sos what do you call a cheap circumcision ? a rip - off . well , you can't blame them . they don't make much money , they just keep the tips . eos
0.8461538461538461  ->  sos what do you call a bad hairdresser who is also very expensive ? a rip - off . eos
0.6923076923076923  ->  sos what do you call a cheap circumcision ? rip - off . eos
0.6923076923076923  ->  sos what do you call a cheap circumcision ? a rip - off ! eos
0.6923076923076923  ->  sos what do you call a gay vegetarian ? a vegetarian . eos
0.6923076923076923  ->  sos what do you call a bad circumcision ? a rip - off eos
0.6923076923076923  ->  sos what do you call a bad circumcision ? what do you call a bad circumcision ? a rip off . eos
0.6923076923076923  ->  sos what do you call a discount circumcision ? a rip off . eos
0.6923076923076923

In [64]:
# Peek at the first ten sentences (in the shuffled order produced when the
# data was loaded).
print(sentences[:10])

['sos i had to use my glasses when playing tennis . because its a no contact sport . eos', 'sos why did the japanese funeral home have to turn away new business ? they ran out of san storage eos', 'sos a world without women would be a pain in the ass . eos', 'sos not everyone that comes into your life needs to stay there . eos', 'sos " i would absolutely say i\'m an introvert ! " - guy screaming to his table full of friends at brunch . eos', 'sos cashier : " would you like to donate to charity today or are you a giant piece of shit ? " eos', "sos what do you call a muslim girl dating an agnostic guy ? for safety purposes , i don't know if i should tell you her name threedots eos", "sos math is so communist threedots threedots there's class struggle for marx eos", "sos i used to be a bodybuilder threedots or ' the dr frankenstein grave robber ' as the press preferred to call me . eos", "sos i can't believe this paper went to college , let alone thought it ruled eos"]
