In [24]:
import pickle
import importlib
from library import data_preprocess as dp
importlib.reload(dp)
import random
from time import time

import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, Bidirectional
from keras.layers import Embedding
from keras import regularizers
from keras.metrics import sparse_categorical_accuracy, sparse_categorical_crossentropy
from keras.models import load_model

from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
import tensorflow as tf

### Variables

In [87]:
# Input artefacts: the pickled joke sentences and the pickled vocabulary list.
DATA_PATH = './datasets/jokes.pickle'
VOCAB_PATH = './datasets/jokes_vocabulary.pickle'
# Directory that receives the pickled tokenizer and the model checkpoints.
MODELS_PATH = './models/'

# Upper bound on the sliding-window (context) length used when building
# the (context, next-word) training pairs.
MAX_SEQUENCE_LENGTH = 13
# NOTE(review): not referenced by any cell visible in this notebook; the
# model.fit call passes a literal validation_split=0.2 instead - confirm
# which value is intended.
VALIDATION_SPLIT = 0.1

# Hyper-parameters forwarded to BiLSTM(...) when the model is instantiated.
EMBEDDING_DIM = 128
HIDDEN_DIM = 512
DROPOUT_FACTOR = 0.333
REGULARIZATION = 0.00001
DEEPER_DIM = 256

# NOTE(review): not referenced by any cell visible in this notebook.
DATA_PERCENT = 0.1

# Run identifier embedded in the tokenizer pickle and checkpoint file names.
RUN_INDEX = 4

In [73]:
# Load the joke sentences and the vocabulary from their pickle files.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files from a trusted source.
with open(DATA_PATH, 'rb') as pickleFile:
    sentences = pickle.load(pickleFile)

with open(VOCAB_PATH, 'rb') as pickleFile:
    vocab = pickle.load(pickleFile)
    
# In-place shuffle; no random seed is set in the visible cells, so the order
# differs on every execution of this cell. Anything that is index-aligned with
# `sentences` (e.g. `sentence_bigrams`) must be rebuilt after re-running it.
random.shuffle(sentences)

# Quick look at what was loaded.
print("Number of sentences = ", len(sentences))
print(sentences[:2])
print("Vocab size = ", len(vocab))
print(vocab[:10])

Number of sentences =  96910
["sos what gifts do squirrels exchange on valentine's day ? forget - me - nuts . eos", "sos i have no problem getting women into the sack threedots threedots it's getting the sack into the back of my van that's the problem . eos"]
Vocab size =  8922
['sos', 'did', 'you', 'hear', 'about', 'the', 'new', 'corduroy', 'pillows', '?']


In [74]:
# Tokenize the corpus: fit a Keras Tokenizer on the sentences and convert
# every sentence into a list of integer word ids.
num_words = len(vocab)

# filters='' turns off the Tokenizer's character filtering and split=' ' splits
# on single spaces, so punctuation tokens already separated in the corpus are
# kept as words. num_words=None keeps the whole vocabulary.
tokenizer = Tokenizer(num_words=None, filters='', lower=True, split=' ', 
                      char_level=False, oov_token=None)
tokenizer.fit_on_texts(sentences)
# Sanity check: the tokenizer must have found exactly as many distinct words
# as the vocabulary file contains.
assert num_words == len(tokenizer.word_index)

encoded_sentences = tokenizer.texts_to_sequences(sentences)
print(encoded_sentences[:5])

# +1 because Keras word ids start at 1 (id 0 is reserved and is used below as
# the padding value), so the embedding and softmax layers need one extra slot.
VOCAB_SIZE = len(tokenizer.word_index) + 1
print(VOCAB_SIZE)

[[1, 17, 3335, 20, 3858, 4805, 26, 1962, 104, 6, 749, 42, 22, 42, 468, 3, 2], [1, 7, 27, 62, 469, 199, 132, 72, 5, 1904, 15, 15, 57, 199, 5, 1904, 72, 5, 131, 19, 13, 2747, 156, 5, 469, 3, 2], [1, 17, 20, 9, 66, 4, 222, 195, 1254, 26, 4, 430, 6, 1701, 2], [1, 292, 2885, 17, 95, 202, 65, 20, 30, 4, 2150, 2], [1, 4019, 997, 130, 72, 4, 97, 134, 5, 315, 18, 4928, 1726, 15, 2229, 5, 82, 15, 2]]
8923


In [75]:
# Persist the fitted tokenizer next to the models, tagged with RUN_INDEX, so
# the same word <-> id mapping can be reloaded together with a saved model.
with open(MODELS_PATH + 'jokes_tokenizer_' + str(RUN_INDEX) + '.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Preparing Training Data

In [76]:
# Build (context, next-word) training pairs by sliding a window over every
# encoded sentence: the window contents are the input, the token right after
# the window is the target.
X_data = []
y_data = []
step_size = 1  # loop-invariant, so defined once outside the loop
for sentence in encoded_sentences:
    sentence_len = len(sentence)
    # The window is capped at MAX_SEQUENCE_LENGTH; the "- 3" guarantees that
    # every sentence yields at least three (context, target) pairs.
    sliding_window_length = min(sentence_len - 3, MAX_SEQUENCE_LENGTH)
    if sliding_window_length < 1:
        # Sentences with fewer than 4 tokens would give a zero or negative
        # window: empty contexts, wrap-around negative slicing, or an
        # IndexError for sentences of length <= 1. Skip them.
        continue
    for i in range(0, sentence_len - sliding_window_length, step_size):
        X_data.append(sentence[i:i+sliding_window_length])
        y_data.append(sentence[i+sliding_window_length])

print("Total training data size = ", len(X_data))
# Longest context actually produced; used as the model's input length.
MAX_SEQ_LEN = max([len(seq) for seq in X_data])
print("Max seq len = ", MAX_SEQ_LEN)
# Shorter contexts are left-padded with 0 so all rows share MAX_SEQ_LEN columns.
X_data = pad_sequences(X_data, maxlen=MAX_SEQ_LEN, padding='pre')
# Targets as a column vector of integer class ids (sparse labels).
y_data = np.array(y_data).reshape(-1,1)
print(X_data.shape)
print(X_data[:2])
print(y_data.shape)
print(y_data[:2])

Total training data size =  833131
Max seq len =  13
(833131, 13)
[[   1   17 3335   20 3858 4805   26 1962  104    6  749   42   22]
 [  17 3335   20 3858 4805   26 1962  104    6  749   42   22   42]]
(833131, 1)
[[ 42]
 [468]]


In [95]:
# define model
def BiLSTM(vocab_size, embedding_dim, hidden_dim, deeper_dim, max_seq_len, 
           dropout_factor=0.5, regularization=0.00001):
    """Build and compile a next-word prediction network.

    Layer stack, in order: masked Embedding -> Bidirectional LSTM (tanh,
    unrolled, final state only) -> Dropout -> Dense(elu) -> Dropout ->
    Dense(softmax over ``vocab_size`` classes). Every weight matrix that takes
    a regularizer gets its own L2 penalty of strength ``regularization``.

    Args:
        vocab_size: number of word ids (embedding rows and softmax outputs).
        embedding_dim: width of the word embedding.
        hidden_dim: number of LSTM units per direction.
        deeper_dim: number of units of the intermediate Dense layer.
        max_seq_len: fixed length of the (padded) input sequences.
        dropout_factor: rate used by both Dropout layers.
        regularization: L2 coefficient applied to embeddings and kernels.

    Returns:
        The compiled ``Sequential`` model (adam optimizer, sparse categorical
        cross-entropy loss, cross-entropy and accuracy as metrics).
    """
    def l2_penalty():
        # A fresh regularizer instance per weight, exactly one per use site.
        return regularizers.l2(regularization)

    # Layers are instantiated left to right, i.e. in stacking order.
    network = Sequential([
        Embedding(vocab_size, embedding_dim,
                  input_length=max_seq_len,
                  mask_zero=True,
                  embeddings_regularizer=l2_penalty()),
        Bidirectional(LSTM(hidden_dim,
                           activation='tanh',
                           kernel_regularizer=l2_penalty(),
                           recurrent_regularizer=l2_penalty(),
                           unroll=True)),
        Dropout(dropout_factor),
        Dense(units=deeper_dim, activation='elu',
              kernel_regularizer=l2_penalty()),
        Dropout(dropout_factor),
        Dense(units=vocab_size, activation='softmax',
              kernel_regularizer=l2_penalty()),
    ])

    tracked_metrics = [sparse_categorical_crossentropy, sparse_categorical_accuracy]
    network.compile(loss='sparse_categorical_crossentropy',
                    optimizer='adam',
                    metrics=tracked_metrics)
    return network

In [96]:
# Instantiate the network with the hyper-parameters from the config cell and
# the input length derived from the training data.
model = BiLSTM(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, deeper_dim=DEEPER_DIM,
              max_seq_len=MAX_SEQ_LEN, dropout_factor=DROPOUT_FACTOR, regularization=REGULARIZATION)
# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 13, 128)           1142144   
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 1024)              2625536   
_________________________________________________________________
dropout_23 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_23 (Dense)             (None, 256)               262400    
_________________________________________________________________
dropout_24 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 8923)              2293211   
Total params: 6,323,291
Trainable params: 6,323,291
Non-trainable params: 0
_________________________________________________________________


In [97]:
class TB(TensorBoard):
    """TensorBoard callback that also writes per-batch scalar summaries.

    On every ``log_every``-th batch, each entry of ``logs`` (except the
    bookkeeping keys ``'batch'`` and ``'size'``) is written as a scalar
    summary whose step is the number of batches seen by this callback.

    Args:
        log_every: write the batch-level scalars every this many batches.
        **kwargs: forwarded unchanged to ``TensorBoard.__init__``.
    """

    def __init__(self, log_every=1, **kwargs):
        super().__init__(**kwargs)
        self.log_every = log_every
        # Running count of batches across all epochs; used as the summary step.
        self.counter = 0

    def on_batch_end(self, batch, logs=None):
        """Write the batch metrics every ``log_every`` batches, then defer to the parent."""
        self.counter += 1
        if self.counter % self.log_every == 0:
            # `logs` defaults to None; treat that as "nothing to log" instead
            # of failing on None.items().
            for name, value in (logs or {}).items():
                if name in ('batch', 'size'):
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                # numpy scalars expose .item(); plain Python numbers do not.
                summary_value.simple_value = value.item() if hasattr(value, 'item') else value
                summary_value.tag = name
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()

        super().on_batch_end(batch, logs)

In [None]:
# Train the model and report the wall-clock time of the whole cell.
start_time = time()
# Custom TensorBoard callback; each run logs to a fresh directory named after
# the current timestamp, and batch-level scalars are written every 10 batches.
tensorboard = TB(log_dir="./logs/jokes_bilstm/{}".format(time()), 
                          histogram_freq=0, write_graph=True, write_images=False, log_every=10)

# Callbacks, in order:
#   - TensorBoard logging (above);
#   - early stopping once val_loss has not improved for 5 epochs;
#   - a checkpoint written every epoch, with epoch number and val_loss in the
#     file name, under MODELS_PATH + 'checkpoints/'
#     (NOTE(review): presumably this directory has to exist already - verify);
#   - a single "best so far" checkpoint (save_best_only=True) at a fixed path.
callbacks=[tensorboard, 
           EarlyStopping(patience=5, monitor='val_loss'),
           ModelCheckpoint(filepath=MODELS_PATH + 'checkpoints/jokes_bilstm_gen'+str(RUN_INDEX)+'.{epoch:02d}-{val_loss:.2f}.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1), 
           ModelCheckpoint(filepath=MODELS_PATH + 'jokes_bilstm_gen'+str(RUN_INDEX)+'.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1, save_best_only=True)]

# NOTE(review): validation_split is the literal 0.2 here although the config
# cell defines VALIDATION_SPLIT = 0.1 - confirm which one is intended.
model.fit(X_data, y_data, epochs=10, batch_size=2048, shuffle=True, verbose=1, validation_split=0.2, 
          callbacks=callbacks)

print("Total elapsed time: ", time()-start_time)

Train on 666504 samples, validate on 166627 samples
Epoch 1/10

In [99]:
# generate a sequence from a language model
def generate(model, tokenizer, seed_text, maxlen):
    """Greedily extend ``seed_text`` one word at a time with ``model``.

    The seed is encoded with ``tokenizer`` (the encoded ids are printed), and
    the most probable next word id is repeatedly appended. Only the last
    ``MAX_SEQ_LEN`` ids (module-level constant) are fed to the model,
    left-padded with zeros.

    Generation stops when the model predicts id 0, when the word ``'eos'`` has
    been appended, or when the sequence holds more than ``maxlen`` ids.

    Args:
        model: object whose ``predict`` returns per-class scores for a batch.
        tokenizer: fitted tokenizer providing ``word_index`` and
            ``texts_to_sequences``.
        seed_text: space-separated start of the sentence.
        maxlen: length limit (in tokens) after which generation stops.

    Returns:
        The seed plus the generated words, joined by single spaces.
    """
    # Inverse vocabulary: word id -> word.
    index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}

    token_ids = tokenizer.texts_to_sequences([seed_text])[0]
    print(token_ids)

    finished = False
    while not finished:
        # Keep only the most recent MAX_SEQ_LEN tokens as context.
        context = token_ids[-1 * MAX_SEQ_LEN:] if len(token_ids) > MAX_SEQ_LEN else token_ids
        model_input = pad_sequences([context], maxlen=MAX_SEQ_LEN, padding='pre')
        next_id = model.predict(model_input).argmax(axis=-1)[0]
        if next_id == 0:
            # Id 0 is the padding value and maps to no word: stop here.
            finished = True
        else:
            next_word = index_to_word[next_id]
            token_ids.append(next_id)
            finished = next_word == 'eos' or len(token_ids) > maxlen

    return ' '.join(index_to_word[idx] for idx in token_ids)

In [100]:
# Generate a joke from a short seed (stops once it exceeds 40 tokens) and show it.
joke = generate(model, tokenizer, "sos i had to use", maxlen=40)
print(joke)

[1, 7, 84, 8, 258]
sos i had to use a lot of money . plagiarism . i don't know what i have to be . eos


In [54]:
def bigrams_list(sentence):
    """Return the consecutive word pairs of ``sentence`` as "w1 w2" strings.

    The sentence is split on single spaces; a sentence with fewer than two
    tokens yields an empty list.
    """
    tokens = sentence.split(' ')
    # Pair every token with its successor.
    return [left + ' ' + right for left, right in zip(tokens, tokens[1:])]

# Smoke test of bigrams_list on a small example sentence.
print(bigrams_list("sos hello , i'm a dinosaur . eos"))

['sos hello', 'hello ,', ", i'm", "i'm a", 'a dinosaur', 'dinosaur .', '. eos']


In [55]:
# Pre-compute the bigram list of every corpus sentence.
# sentence_bigrams[i] belongs to sentences[i]; this cell must be re-run
# whenever `sentences` is reloaded or reshuffled, otherwise the two lists no
# longer line up.
sentence_bigrams = [bigrams_list(s) for s in sentences]
print(sentence_bigrams[:2])

[['sos they', 'they say', 'say a', "a woman's", "woman's work", 'work is', 'is never', 'never done', 'done threedots', 'threedots maybe', "maybe that's", "that's why", 'why they', 'they get', 'get paid', 'paid less', 'less than', 'than men', 'men .', '. eos'], ['sos going', 'going to', 'to mcdonalds', 'mcdonalds for', 'for a', 'a salad', 'salad is', 'is like', 'like going', 'going to', 'to a', 'a hooker', 'hooker for', 'for a', 'a hug', 'hug .', '. eos']]


In [56]:
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    Order and duplicates of ``lst1`` are preserved; ``lst2`` is only used for
    membership tests.
    """
    members = set(lst2)
    common = []
    for item in lst1:
        if item in members:
            common.append(item)
    return common

def similarity_score(lst1, lst2):
    """Fraction of the items of ``lst1`` that also appear in ``lst2``.

    The score is asymmetric: it is normalised by ``len(lst1)`` only, not by
    the size of the union of both lists.

    Returns:
        A float in [0, 1]; 0.0 when ``lst1`` is empty (nothing can be shared),
        which avoids a ZeroDivisionError for e.g. one-word sentences whose
        bigram list is empty.
    """
    if len(lst1) == 0:
        return 0.0
    intersection_len = len(intersection(lst1, lst2))
    return (1.0*intersection_len)/len(lst1)
 
def print_closest_sentences(sentence, sentence_bigrams, top_k=3, corpus=None):
    """Print the ``top_k`` corpus sentences sharing the most bigrams with ``sentence``.

    Each output line is ``<score>  ->  <corpus sentence>``, best match first,
    where the score comes from ``similarity_score`` on the bigram lists.

    Args:
        sentence: space-separated query sentence.
        sentence_bigrams: one bigram list per corpus sentence.
        top_k: maximum number of matches to print.
        corpus: sentences to print, index-aligned with ``sentence_bigrams``.
            Defaults to the module-level ``sentences`` list.

    ``corpus`` and ``sentence_bigrams`` must be in the same order: if
    ``sentences`` is reshuffled after ``sentence_bigrams`` was built, the
    scores are still correct but the printed sentences are the wrong ones.
    """
    if corpus is None:
        corpus = sentences
    bigrams = bigrams_list(sentence)
    scores = np.array([similarity_score(bigrams, sbigrams)
                       for sbigrams in sentence_bigrams])
    # argsort is ascending: take the last top_k entries and reverse them.
    top_k_indices = scores.argsort()[-1*top_k:][::-1]
    top_k_scores = scores[top_k_indices]
    # Never index past the available candidates (top_k may exceed the corpus
    # size, and top_k == 0 makes the slice above return every index).
    for k in range(min(top_k, len(top_k_indices))):
        print(top_k_scores[k], " -> ", corpus[top_k_indices[k]])

In [57]:
# List the 10 corpus jokes that share the largest fraction of the generated
# joke's bigrams, to see how much of the output is copied from training data.
print_closest_sentences(joke, sentence_bigrams, 10)

0.5  ->  sos i had to use my glasses when playing tennis . because its a no contact sport . eos
0.4  ->  sos i had to stop drinking . i kept getting that thing where you feel sick and your head hurts threedots depression . eos
0.4  ->  sos i found a stray cat today . sadly , my dad is allergic to them so i had to put him down . at least i still have the cat for comfort . eos
0.4  ->  sos ran out of toilet paper , so i had to use leaves . just kidding , but my son learned a big lesson about leaving his clothes in the bathroom . eos
0.4  ->  sos i used to date a girl with a lazy eye . i had to break up with her . she was seeing somebody on the side . eos
0.4  ->  sos i recently broke up with my long term japanese girlfriend threedots she didn't seem to understand so i had to drop the bomb on her twice . eos
0.4  ->  sos i had a vasectomy today , and my wife keeps asking how i feel threedots i've had to tell her over and over that it's not that bad , and that i don't notice much of a vas 

In [101]:
# Generate from another seed and list its nearest corpus jokes.
# NOTE(review): the recorded output below shows matches that share almost no
# bigrams with the joke, which looks like `sentences` having been reshuffled
# after `sentence_bigrams` was built - rebuild sentence_bigrams before
# trusting this listing.
joke = generate(model, tokenizer, "sos what do you call", maxlen=40)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 17, 20, 9, 66]
sos what do you call ? a stick up a cliff . a woman . eos
0.5333333333333333  ->  sos i was told that exercise helps with your decision making . it's true . after going to the gym earlier i've decided i'm never going again . eos
0.5333333333333333  ->  sos what do toilet paper and the star ship enterprise have in common they both circle uranus and battle klingons eos
0.4666666666666667  ->  sos did you hear about the white guy who got pulled over ? me neither . eos
0.4666666666666667  ->  sos i wasn't dropped as a baby , but i've been making up for it ever since . eos
0.4666666666666667  ->  sos why are native americans the best strippers ? because when they dance they make it rain . eos
0.4666666666666667  ->  sos what did the eskimo children sing when their principal was leaving ? freeze a jolly good fellow . eos
0.4666666666666667  ->  sos i seriously hate it when a couple starts having an argument in front of you . they could have least waited until i got dressed

In [21]:
# Peek at the first ten corpus sentences in their current (shuffled) order.
print(sentences[:10])

["sos a woman was arrested when her boyfriend's body was found in a freezer in their living room . who the hell puts a freezer in the living room ? eos", 'sos what does heroin make you feel like ? more heroin . eos', "sos why couldn't the physicist understand how boats work ? he thought nothing could possibly travel faster than sea . eos", "sos at what age do you tell a highway it's adopted ? eos", "sos russians dolls . they're so full of themselves eos", "sos how many chocolate bunnies can you put into an empty easter basket ? one . after that the basket won't be empty . eos", "sos nurse pops her head into the doctor's office threedots nurse : ' doctor , there's an invisible man in the waiting room . ' doctor : ' tell him i can't see him . ' eos", 'sos what kind of file makes a hole bigger ? a pedophile eos', "sos what is a paranoid man's favorite food ? who wants to know ? eos", "sos friends invited me to a meteor shower party , but i couldn't make it . they were crushed . eos"]


In [103]:
# Generate from a longer seed and list its nearest corpus jokes; the listing
# is only meaningful while sentence_bigrams is index-aligned with sentences.
joke = generate(model, tokenizer, "sos what's the difference between being hungry", maxlen=40)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 56, 5, 76, 74, 167, 859]
sos what's the difference between being hungry ? a stick in a tank and a busty crustacean . eos
0.5  ->  sos why couldn't moses adopt a kitten from the animal shelter ? because the shelter was non prophet . eos
0.5  ->  sos men : don't lie to your woman , she'll catch you . don't tell her the truth , she'll be pissed . just pray for a brick to fall on your head . eos
0.5  ->  sos hockey : because running on knives makes sense . eos
0.5  ->  sos a riddle : what's the only hole that i can't put my dick in ? answer : a donut hole ! eos
0.5  ->  sos what tea is hardest to swallow ? reality eos
0.5  ->  sos what happens when you don't pay your exorcist ? threedots you get repossessed . eos
0.5  ->  sos " you suck . " " no , you suck . " " really , you suck . " " please , you suck . " " you suck , i insist . " - polite vampires . eos
0.5  ->  sos why are pot heads always so condescending ? they always think they're higher then others eos
0.5  ->  sos i know a goo

In [29]:
# Load a specific earlier checkpoint from disk.
# NOTE(review): `model1` is not used by any visible cell (the generate calls
# below still pass `model`), and the path is hard-coded rather than built from
# MODELS_PATH / RUN_INDEX - confirm whether this cell is still needed.
model1 =  load_model('models/checkpoints/jokes_bilstm_gen2.08-4.39.hdf5')

In [104]:
# Generate from a third seed with the in-memory `model` and list its nearest
# corpus jokes; the listing is only meaningful while sentence_bigrams is
# index-aligned with sentences.
joke = generate(model, tokenizer, "sos a guy finds", maxlen=40)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 4, 107, 1837]
sos a guy finds up . period is it . eos
0.4  ->  sos what did the baby milk say to his older sister ? you're spoiled ! eos
0.4  ->  sos girl are you a dishwasher ? because i would like to fill you with my dirty load in the evening , turn you on , and fall asleep before you finish eos
0.4  ->  sos if a shark attacks you , do not punch him in the nose . be the bigger person and just ignore him . eos
0.3  ->  sos when you have the opportunity to become a bigger person , take it because cake is delicious . eos
0.3  ->  sos what's the difference between hitler and michael phelps ? michael phelps could finish a race . eos
0.3  ->  sos the difference between " like " " love " and " in love " is the same as the difference between " for now " " for a while " and " forever " eos
0.3  ->  sos what do engineers use for birth control ? personality . eos
0.3  ->  sos what's the only thing working out at the gym ? the business plan . eos
0.3  ->  sos my wife thinks i'm cheating on h