In [1]:
import pickle
import importlib
from library import data_preprocess as dp
importlib.reload(dp)
import random
from time import time

import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, Bidirectional
from keras.layers import Embedding, TimeDistributed, Flatten
from keras import regularizers
from keras.metrics import sparse_categorical_accuracy, sparse_categorical_crossentropy
from keras.models import load_model
from keras.optimizers import Adam

from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Variables

In [2]:
DATA_PATH = './datasets/jokes.pickle'
VOCAB_PATH = './datasets/jokes_vocabulary.pickle'
MODELS_PATH = './models/'

MAX_SEQUENCE_LENGTH = 13
VALIDATION_SPLIT = 0.1

EMBEDDING_DIM = 256
HIDDEN_DIM1 = 512
HIDDEN_DIM2 = 256
DEEPER_DIM = 256
DROPOUT_FACTOR = 0.25
REGULARIZATION = 0.00001
LEARNING_RATE = 0.001

DATA_PERCENT = 0.1

RUN_INDEX = 5

In [3]:
with open(DATA_PATH, 'rb') as pickleFile:
    sentences = pickle.load(pickleFile)

with open(VOCAB_PATH, 'rb') as pickleFile:
    vocab = pickle.load(pickleFile)
    
random.shuffle(sentences)

print("Number of sentences = ", len(sentences))
print(sentences[:2])
print("Vocab size = ", len(vocab))
print(vocab[:10])

Number of sentences =  96910
["sos how does an alabama girl know she's in for a crazy night ? her daddy says he wants her in bed by ten . eos", 'sos my friend works at a circumcision clinic i asked him if he charges alot for his circumcisions he said " no , i just keep the tips . " eos']
Vocab size =  8922
['sos', 'did', 'you', 'hear', 'about', 'the', 'new', 'corduroy', 'pillows', '?']


In [4]:
# tokenize data
num_words = len(vocab)

tokenizer = Tokenizer(num_words=None, filters='', lower=True, split=' ', 
                      char_level=False, oov_token=None)
tokenizer.fit_on_texts(sentences)
assert num_words == len(tokenizer.word_index)

encoded_sentences = tokenizer.texts_to_sequences(sentences)
print(encoded_sentences[:5])

VOCAB_SIZE = len(tokenizer.word_index) + 1
print(VOCAB_SIZE)

[[1, 32, 64, 50, 3028, 124, 67, 294, 14, 30, 4, 712, 168, 6, 61, 1187, 105, 31, 446, 61, 14, 323, 122, 609, 3, 2], [1, 13, 157, 840, 49, 4, 2059, 2342, 7, 149, 103, 38, 31, 4676, 3927, 30, 53, 5734, 31, 79, 11, 62, 10, 7, 44, 218, 5, 2314, 3, 11, 2], [1, 144, 144, 268, 75, 21, 6156, 21, 6156, 71, 6, 6156, 689, 2366, 5, 7215, 21, 2], [1, 7, 190, 8, 5, 201, 29, 7515, 1604, 10, 31, 1424, 93, 31, 408, 13, 2261, 4019, 3, 7, 106, 179, 527, 5, 1311, 41, 7, 180, 240, 181, 4, 164, 3, 2], [1, 13, 141, 12, 7, 39, 192, 720, 41, 45, 230, 666, 45, 4, 107, 2]]
8923


In [5]:
# saving
with open(MODELS_PATH + 'jokes_tokenizer_' + str(RUN_INDEX) + '.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Preparing Training Data

In [6]:
X_data = []
y_data = []
for sentence in encoded_sentences:
    l = len(sentence)
    sliding_window_length = min(l-3, MAX_SEQUENCE_LENGTH)
    step_size = 1
    for i in range(0, l - sliding_window_length, step_size):
        X_data.append(sentence[i:i+sliding_window_length])
        y_data.append(sentence[i+1:i+sliding_window_length+1])
        
print("Total training data size = ", len(X_data))
MAX_SEQ_LEN = max([len(seq) for seq in X_data])
print("Max seq len = ", MAX_SEQ_LEN)
X_data = pad_sequences(X_data, maxlen=MAX_SEQ_LEN, padding='pre')
y_data = pad_sequences(y_data, maxlen=MAX_SEQ_LEN, padding='pre').reshape(-1, MAX_SEQ_LEN, 1)
#y_data = np.array(y_data).reshape(-1,1)
print(X_data.shape)
print(X_data[:2])
print(y_data.shape)
print(y_data[:2])

Total training data size =  833131
Max seq len =  13
(833131, 13)
[[   1   32   64   50 3028  124   67  294   14   30    4  712  168]
 [  32   64   50 3028  124   67  294   14   30    4  712  168    6]]
(833131, 13, 1)
[[[  32]
  [  64]
  [  50]
  [3028]
  [ 124]
  [  67]
  [ 294]
  [  14]
  [  30]
  [   4]
  [ 712]
  [ 168]
  [   6]]

 [[  64]
  [  50]
  [3028]
  [ 124]
  [  67]
  [ 294]
  [  14]
  [  30]
  [   4]
  [ 712]
  [ 168]
  [   6]
  [  61]]]


In [7]:
# define model
def BiLSTM(vocab_size, embedding_dim, hidden_dim1, hidden_dim2, deeper_dim, max_seq_len, 
           dropout_factor=0.5, regularization=0.00001, learning_rate=0.001):
    
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, #input_length=max_seq_len, 
                        mask_zero=True, embeddings_regularizer=regularizers.l2(regularization)))
    model.add(LSTM(hidden_dim1, activation='tanh', 
                   kernel_regularizer=regularizers.l2(regularization), 
                   recurrent_regularizer=regularizers.l2(regularization), #unroll=True, 
                   return_sequences = True, dropout=dropout_factor, recurrent_dropout=dropout_factor))
    model.add(LSTM(hidden_dim2, activation='tanh', 
                   kernel_regularizer=regularizers.l2(regularization), 
                   recurrent_regularizer=regularizers.l2(regularization), #unroll=True, 
                   return_sequences = True, dropout=dropout_factor, recurrent_dropout=dropout_factor))
    model.add(TimeDistributed(Dropout(dropout_factor)))
    model.add(Dense(units=deeper_dim, activation='tanh', kernel_regularizer=regularizers.l2(regularization)))
    model.add(Dense(units=vocab_size, activation='softmax', 
              kernel_regularizer=regularizers.l2(regularization)))
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=learning_rate),
                  metrics=[sparse_categorical_crossentropy, sparse_categorical_accuracy], sample_weight_mode='temporal')
    return model

In [8]:
model = BiLSTM(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_dim1=HIDDEN_DIM1, hidden_dim2=HIDDEN_DIM2,
               deeper_dim=DEEPER_DIM, max_seq_len=MAX_SEQ_LEN, dropout_factor=DROPOUT_FACTOR, 
               regularization=REGULARIZATION, learning_rate=LEARNING_RATE)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 256)         2284288   
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 512)         1574912   
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 256)         787456    
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 256)         0         
_________________________________________________________________
dense_1 (Dense)              (None, None, 256)         65792     
_________________________________________________________________
dense_2 (Dense)              (None, None, 8923)        2293211   
Total params: 7,005,659
Trainable params: 7,005,659
Non-trainable params: 0
_________________________________________________________________


In [9]:
class TB(TensorBoard):
    def __init__(self, log_every=1, **kwargs):
        super().__init__(**kwargs)
        self.log_every = log_every
        self.counter = 0
    
    def on_batch_end(self, batch, logs=None):
        self.counter+=1
        if self.counter%self.log_every==0:
            for name, value in logs.items():
                if name in ['batch', 'size']:
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                summary_value.simple_value = value.item()
                summary_value.tag = name
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()
        
        super().on_batch_end(batch, logs)

In [11]:
start_time = time()
tensorboard = TB(log_dir="./logs/jokes_stacked_lstm/{}".format(time()), 
                          histogram_freq=0, write_graph=True, write_images=False, log_every=10)

callbacks=[tensorboard, 
           EarlyStopping(patience=5, monitor='val_loss'),
           ModelCheckpoint(filepath=MODELS_PATH + 'checkpoints/jokes_stacked_lstm_gen'+str(RUN_INDEX)+'.{epoch:02d}-{val_loss:.2f}.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1), 
           ModelCheckpoint(filepath=MODELS_PATH + 'jokes_stacked_lstm_gen'+str(RUN_INDEX)+'.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1, save_best_only=True)]

model.fit(X_data, y_data, epochs=20, batch_size=512, shuffle=True, verbose=1, validation_split=0.2, callbacks=callbacks)

print("Total elapsed time: ", time()-start_time)

Train on 666504 samples, validate on 166627 samples
Epoch 1/20

Epoch 00001: saving model to ./models/checkpoints/jokes_stacked_lstm_gen5.01-5.14.hdf5

Epoch 00001: val_loss improved from inf to 5.14123, saving model to ./models/jokes_stacked_lstm_gen5.hdf5
Epoch 2/20

Epoch 00002: saving model to ./models/checkpoints/jokes_stacked_lstm_gen5.02-4.61.hdf5

Epoch 00002: val_loss improved from 5.14123 to 4.61422, saving model to ./models/jokes_stacked_lstm_gen5.hdf5
Epoch 3/20

Epoch 00003: saving model to ./models/checkpoints/jokes_stacked_lstm_gen5.03-4.49.hdf5

Epoch 00003: val_loss improved from 4.61422 to 4.49015, saving model to ./models/jokes_stacked_lstm_gen5.hdf5
Epoch 4/20

Epoch 00004: saving model to ./models/checkpoints/jokes_stacked_lstm_gen5.04-4.43.hdf5

Epoch 00004: val_loss improved from 4.49015 to 4.43371, saving model to ./models/jokes_stacked_lstm_gen5.hdf5
Epoch 5/20

Epoch 00005: saving model to ./models/checkpoints/jokes_stacked_lstm_gen5.05-4.40.hdf5

Epoch 00005:


Epoch 00018: saving model to ./models/checkpoints/jokes_stacked_lstm_gen5.18-4.32.hdf5

Epoch 00018: val_loss improved from 4.31945 to 4.31735, saving model to ./models/jokes_stacked_lstm_gen5.hdf5
Epoch 19/20

Epoch 00019: saving model to ./models/checkpoints/jokes_stacked_lstm_gen5.19-4.32.hdf5

Epoch 00019: val_loss improved from 4.31735 to 4.31702, saving model to ./models/jokes_stacked_lstm_gen5.hdf5
Epoch 20/20

Epoch 00020: saving model to ./models/checkpoints/jokes_stacked_lstm_gen5.20-4.31.hdf5

Epoch 00020: val_loss improved from 4.31702 to 4.31181, saving model to ./models/jokes_stacked_lstm_gen5.hdf5
Total elapsed time:  10290.730658531189


In [25]:
# generate a sequence from a language model
def generate(model, tokenizer, seed_text, maxlen):
    
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
    seq = tokenizer.texts_to_sequences([seed_text])[0]
    print(seq)
    
    while True:
        if len(seq) > MAX_SEQ_LEN:
            encoded_seq = seq[-1*MAX_SEQ_LEN:]
        else:
            encoded_seq = seq
        padded_seq = pad_sequences([encoded_seq], maxlen=MAX_SEQ_LEN, padding='pre')
        #padded_seq = np.array([seq])
        y_prob = model.predict(padded_seq)[0][-1].reshape(1,-1)#[3:].reshape(-1,1)
        #print(y_prob.shape)
        #y_class = y_prob.argmax(axis=-1)[0]
        y_class = np.argmax(np.random.multinomial(1,y_prob.squeeze(axis=0),1))
        #print(y_prob)
        #print(y_class)
        if y_class == 0:
            break
        out_word = reverse_word_map[y_class]
        seq.append(y_class)
        if out_word == 'eos' or len(seq) > maxlen:
            break
    
    words = [reverse_word_map[idx] for idx in seq]
    
    return ' '.join(words)

In [26]:
joke = generate(model, tokenizer, "sos i had to use", maxlen=40)
print(joke)

[1, 7, 84, 8, 258]
(1, 8923)
(1, 8923)
(1, 8923)
(1, 8923)
(1, 8923)
(1, 8923)


ValueError: sum(pvals[:-1]) > 1.0

In [24]:
def bigrams_list(sentence):
    words = sentence.split(' ')
    bigrams = []
    for i in range(0, len(words)-1):
        bigrams.append(words[i]+' '+words[i+1])
    return bigrams

print(bigrams_list("sos hello , i'm a dinosaur . eos"))

['sos hello', 'hello ,', ", i'm", "i'm a", 'a dinosaur', 'dinosaur .', '. eos']


In [15]:
sentence_bigrams = [bigrams_list(s) for s in sentences]
print(sentence_bigrams[:2])

[['sos how', 'how does', 'does an', 'an alabama', 'alabama girl', 'girl know', "know she's", "she's in", 'in for', 'for a', 'a crazy', 'crazy night', 'night ?', '? her', 'her daddy', 'daddy says', 'says he', 'he wants', 'wants her', 'her in', 'in bed', 'bed by', 'by ten', 'ten .', '. eos'], ['sos my', 'my friend', 'friend works', 'works at', 'at a', 'a circumcision', 'circumcision clinic', 'clinic i', 'i asked', 'asked him', 'him if', 'if he', 'he charges', 'charges alot', 'alot for', 'for his', 'his circumcisions', 'circumcisions he', 'he said', 'said "', '" no', 'no ,', ', i', 'i just', 'just keep', 'keep the', 'the tips', 'tips .', '. "', '" eos']]


In [16]:
def intersection(lst1, lst2):
    temp = set(lst2)
    lst3 = [value for value in lst1 if value in temp]
    return lst3

def similarity_score(lst1, lst2):
    intersection_len = len(intersection(lst1, lst2))
    return (1.0*intersection_len)/len(lst1)#+len(lst2)-intersection_len)
 
def print_closest_sentences(sentence, sentence_bigrams, top_k=3):
    bigrams = bigrams_list(sentence)
    scores = np.array([similarity_score(bigrams, sbigrams)
                       for sbigrams in sentence_bigrams])
    top_k_indices = scores.argsort()[-1*top_k:][::-1]
    top_k_scores = scores[top_k_indices]
    for k in range(top_k):
        print(top_k_scores[k], " -> ", sentences[top_k_indices[k]])

In [17]:
print_closest_sentences(joke, sentence_bigrams, 10)

0.6666666666666666  ->  sos i bought some shoes from a drug dealer . i don't know what he laced them with , but i've been tripping all day . eos
0.6666666666666666  ->  sos i bought a pair of shoes from a drug dealer today . i don't know what he laced them with , but i've been tripping all day . eos
0.6666666666666666  ->  sos i recently bought some shoes from a drug dealer . i don't know what he laced them with , but i've been tripping all day . eos
0.6666666666666666  ->  sos i bought some shoes from my dealer . i don't know what he laced them with , but i've been tripping all day . eos
0.6296296296296297  ->  sos my drug dealer gave me new shoes today . i don't know what he laced them with , but i've been tripping all day . eos
0.6296296296296297  ->  sos i recently bought some shoes from a drug dealer threedots i don't know what he laced them with , but i've been tripping all day . eos
0.6296296296296297  ->  sos i bought shoes from my drug dealer today i don't know what he laced t

In [18]:
joke = generate(model, tokenizer, "sos what do you call", maxlen=40)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 17, 20, 9, 66]
sos what do you call a mexican midget ? a paragraph , because he's not a full essay . eos
0.8947368421052632  ->  sos what do you call a mexican midget ? a paragraph , because he's not a full essay eos
0.8947368421052632  ->  sos what do you call a mexican midget ? a paragraph , because he's not a full ese . eos
0.8947368421052632  ->  sos what do you call a mexican midget ? a paragraph because he's not a full essay . eos
0.8947368421052632  ->  sos what do you call a mexican midget ? a paragraph , because he is not a full essay . eos
0.8947368421052632  ->  sos what do you call a mexican with no legs ? a paragraph , because he's not a full essay . eos
0.7894736842105263  ->  sos what do you call a mexican midget ? a paragraph , because he's too short to be an essay . eos
0.7894736842105263  ->  sos what do you call a mexican midget ? a paragraph , because he's too short to be an essay . eos
0.7894736842105263  ->  sos what do you call a mexican midget ? a paragraph 

In [21]:
print(sentences[:10])

["sos a woman was arrested when her boyfriend's body was found in a freezer in their living room . who the hell puts a freezer in the living room ? eos", 'sos what does heroin make you feel like ? more heroin . eos', "sos why couldn't the physicist understand how boats work ? he thought nothing could possibly travel faster than sea . eos", "sos at what age do you tell a highway it's adopted ? eos", "sos russians dolls . they're so full of themselves eos", "sos how many chocolate bunnies can you put into an empty easter basket ? one . after that the basket won't be empty . eos", "sos nurse pops her head into the doctor's office threedots nurse : ' doctor , there's an invisible man in the waiting room . ' doctor : ' tell him i can't see him . ' eos", 'sos what kind of file makes a hole bigger ? a pedophile eos', "sos what is a paranoid man's favorite food ? who wants to know ? eos", "sos friends invited me to a meteor shower party , but i couldn't make it . they were crushed . eos"]


In [19]:
joke = generate(model, tokenizer, "sos what's the difference between being", maxlen=40)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 56, 5, 76, 74, 167]
sos what's the difference between being a pedophile and a rapist ? a pedophile comes up to a bar . eos
0.4  ->  sos a rapist , a pedophile and a priest walks into a bar . eos
0.4  ->  sos a priest , a rapist , and a pedophile walk in to a bar . and he orders a drink . eos
0.4  ->  sos what's the difference between a peeping tom and a rapist ? a rapist doesn't waste time beating around the bush eos
0.4  ->  sos a priest , a pedophile and a rapist walk into a bar . he orders a drink . eos
0.4  ->  sos a rapist , a pedophile and a priest walk into a bar . he orders a drink . eos
0.35  ->  sos what's the difference between a slut and a bitch ? a slut fucks everyone threedots a bitch fucks everyone but you . eos
0.35  ->  sos what's the difference between a bmw driver and a porcupine ? a porcupine has the pricks on the outside . eos
0.35  ->  sos what's the difference between a blonde and a mosquito ? a mosquito will stop sucking after you slap it . eos
0.35  ->  sos

In [29]:
model1 =  load_model('models/checkpoints/jokes_bilstm_gen2.08-4.39.hdf5')

In [20]:
joke = generate(model, tokenizer, "sos a guy finds", maxlen=40)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 4, 107, 1832]
sos a guy finds a rectal thermometer in his pocket and says " i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos i recently bought some shoes from a drug dealer . i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos i recently bought shoes from a drug dealer i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos my drug dealer gave me new shoes today . i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos i bought some shoes from a drug dealer . i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos i bought shoes from my drug dealer today i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos shoes from a drug dealer i bought some shoes from a drug dealer . i don't know w