In [1]:
import pickle
import importlib
from library import data_preprocess as dp
importlib.reload(dp)
import random
from time import time

import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Input
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, Bidirectional
from keras.layers import Embedding, TimeDistributed, Flatten, Merge, Concatenate
from keras import regularizers
from keras.metrics import sparse_categorical_accuracy, sparse_categorical_crossentropy
from keras.models import load_model
from keras.optimizers import Adam, RMSprop
from keras.models import Model

from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
import tensorflow as tf
from keras import backend as K
from keras.utils import multi_gpu_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# List the GPU devices visible to the TensorFlow backend (uses a private Keras backend helper).
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

### Variables

In [2]:
# Input pickles (sentences, vocabulary) and the directory models/tokenizers are written to.
DATA_PATH = './datasets/jokes.pickle'
VOCAB_PATH = './datasets/jokes_vocabulary.pickle'
MODELS_PATH = './models/'

# Upper bound on the sliding-window length used when slicing sentences into training samples.
MAX_SEQUENCE_LENGTH = 13
# Fraction of the training samples intended to be held out for validation.
VALIDATION_SPLIT = 0.2

# Prefix used in the tokenizer, checkpoint and TensorBoard log paths.
MODEL_PREFIX = 'jokes_stacked_lstm'

# Hyper-parameters passed to StackedLSTM(): embedding width, sizes of the two LSTM layers,
# size of the dense layer after them, dropout rate, L2 factor and optimizer learning rate.
EMBEDDING_DIM = 512
HIDDEN_DIM1 = 1024
HIDDEN_DIM2 = 512
DEEPER_DIM = 512
DROPOUT_FACTOR = 0.2
REGULARIZATION = 0.00001
LEARNING_RATE = 0.003

# NOTE(review): not referenced in any cell visible here.
DATA_PERCENT = 0.1

# Run number embedded in the saved tokenizer and checkpoint file names.
RUN_INDEX = 6

In [3]:
# Load the pre-processed sentences and the vocabulary.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by a trusted pipeline.
with open(DATA_PATH, 'rb') as pickleFile:
    sentences = pickle.load(pickleFile)

with open(VOCAB_PATH, 'rb') as pickleFile:
    vocab = pickle.load(pickleFile)

# Shuffle with a dedicated, seeded generator so that re-running this cell always yields the
# same order. Anything derived from `sentences` by position (e.g. per-sentence bigram lists)
# then stays aligned with it across re-runs, and the training/validation split is reproducible.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(sentences)

print("Number of sentences = ", len(sentences))
print(sentences[:2])
print("Vocab size = ", len(vocab))
print(vocab[:10])

Number of sentences =  96910
['sos i finally told her those three magical words every woman wish to hear threedots i give up ! eos', 'sos which cheeseburger makes a big hit in baseball ? a double ! eos']
Vocab size =  8922
['sos', 'did', 'you', 'hear', 'about', 'the', 'new', 'corduroy', 'pillows', '?']


In [4]:
# tokenize data
# Expected vocabulary size, taken from the pickled vocabulary; only used by the sanity check below.
num_words = len(vocab)

# filters='' turns off the tokenizer's character filtering and split=' ' splits on single spaces;
# num_words=None places no cap on the vocabulary size.
tokenizer = Tokenizer(num_words=None, filters='', lower=True, split=' ', 
                      char_level=False, oov_token=None)
tokenizer.fit_on_texts(sentences)
# Sanity check: the fitted tokenizer must contain exactly as many distinct words as `vocab`.
assert num_words == len(tokenizer.word_index)

# Replace every word by its integer id.
encoded_sentences = tokenizer.texts_to_sequences(sentences)
print(encoded_sentences[:5])

# One more than the number of words: word ids start at 1, which leaves id 0 free for padding
# (generate() also treats a predicted 0 as "stop").
VOCAB_SIZE = len(tokenizer.word_index) + 1
print(VOCAB_SIZE)

[[1, 7, 583, 123, 61, 320, 253, 4222, 522, 174, 138, 474, 8, 98, 15, 7, 181, 55, 21, 2], [1, 278, 4349, 304, 4, 229, 339, 14, 1245, 6, 4, 1065, 21, 2], [1, 32, 39, 536, 12, 2075, 1941, 6, 37, 39, 192, 293, 122, 313, 42, 3927, 1701, 3, 2], [1, 85, 3395, 86, 8, 4, 8479, 1286, 2], [1, 38, 13, 265, 2314, 24, 4, 270, 15, 15, 7, 95, 54, 5, 4019, 2]]
8923


In [8]:
# Persist the fitted tokenizer next to the models so the same word -> id mapping can be
# reloaded later for generation.
tokenizer_path = MODELS_PATH + MODEL_PREFIX + '_tokenizer_' + str(RUN_INDEX) + '.pickle'
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

### Preparing Training Data

In [5]:
# Slice every encoded sentence into (input, target) windows. The target is the input window
# shifted one token to the right, i.e. the next token for every input position.
X_data = []
y_data = []
step_size = 1
for sentence in encoded_sentences:
    sentence_len = len(sentence)
    # Window of at most MAX_SEQUENCE_LENGTH tokens. Clamped to >= 1 because for sentences
    # shorter than 4 tokens `sentence_len - 3` is <= 0, which used to produce empty inputs or
    # an input whose target was empty (all padding). For sentence_len >= 4 nothing changes.
    sliding_window_length = max(1, min(sentence_len - 3, MAX_SEQUENCE_LENGTH))
    for i in range(0, sentence_len - sliding_window_length, step_size):
        X_data.append(sentence[i:i+sliding_window_length])
        y_data.append(sentence[i+1:i+sliding_window_length+1])

print("Total training data size = ", len(X_data))
MAX_SEQ_LEN = max([len(seq) for seq in X_data])
print("Max seq len = ", MAX_SEQ_LEN)
# Left-pad shorter windows with 0 so all samples share one length; targets get a trailing
# axis of size 1, giving shape (samples, MAX_SEQ_LEN, 1).
X_data = pad_sequences(X_data, maxlen=MAX_SEQ_LEN, padding='pre')
y_data = pad_sequences(y_data, maxlen=MAX_SEQ_LEN, padding='pre').reshape(-1, MAX_SEQ_LEN, 1)
print(X_data.shape)
print(X_data[:2])
print(y_data.shape)
print(y_data[:2])

Total training data size =  833131
Max seq len =  13
(833131, 13)
[[   1    7  583  123   61  320  253 4222  522  174  138  474    8]
 [   7  583  123   61  320  253 4222  522  174  138  474    8   98]]
(833131, 13, 1)
[[[   7]
  [ 583]
  [ 123]
  [  61]
  [ 320]
  [ 253]
  [4222]
  [ 522]
  [ 174]
  [ 138]
  [ 474]
  [   8]
  [  98]]

 [[ 583]
  [ 123]
  [  61]
  [ 320]
  [ 253]
  [4222]
  [ 522]
  [ 174]
  [ 138]
  [ 474]
  [   8]
  [  98]
  [  15]]]


In [10]:
# define model
def StackedLSTM(vocab_size, embedding_dim, hidden_dim1, hidden_dim2, deeper_dim, max_seq_len, 
           dropout_factor=0.5, regularization=0.00001, learning_rate=0.001):
    """Build and compile the two-layer LSTM next-token model.

    Layer stack, in order: masked Embedding -> LSTM(hidden_dim1) -> LSTM(hidden_dim2)
    -> TimeDistributed(Dropout) -> Dense(deeper_dim, tanh) -> Dropout
    -> Dense(vocab_size, softmax). Both LSTMs return full sequences, so the model emits one
    distribution over the vocabulary per input position.

    Args:
        vocab_size: size of the embedding table and of the softmax output.
        embedding_dim: width of the word embeddings.
        hidden_dim1: units of the first LSTM layer.
        hidden_dim2: units of the second LSTM layer.
        deeper_dim: units of the tanh dense layer that follows the LSTMs.
        max_seq_len: accepted but not used; the Embedding layer is created without a fixed
            input length.
        dropout_factor: rate used for every dropout (input, recurrent and standalone layers).
        regularization: L2 factor applied to embedding, kernel and recurrent weights.
        learning_rate: learning rate of the RMSprop optimizer.

    Returns:
        The compiled Sequential model.
    """

    def l2_penalty():
        # A fresh regularizer instance per weight tensor.
        return regularizers.l2(regularization)

    def recurrent_layer(units):
        # Both LSTM layers share every setting except their width.
        return LSTM(units, activation='tanh',
                    kernel_regularizer=l2_penalty(),
                    recurrent_regularizer=l2_penalty(),
                    return_sequences=True,
                    dropout=dropout_factor, recurrent_dropout=dropout_factor)

    layers = [
        Embedding(vocab_size, embedding_dim, mask_zero=True,
                  embeddings_regularizer=l2_penalty()),
        recurrent_layer(hidden_dim1),
        recurrent_layer(hidden_dim2),
        TimeDistributed(Dropout(dropout_factor)),
        Dense(units=deeper_dim, activation='tanh', kernel_regularizer=l2_penalty()),
        Dropout(dropout_factor),
        Dense(units=vocab_size, activation='softmax', kernel_regularizer=l2_penalty()),
    ]
    model = Sequential(layers)

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=RMSprop(lr=learning_rate),
                  metrics=[sparse_categorical_crossentropy, sparse_categorical_accuracy],
                  sample_weight_mode='temporal')
    return model

In [11]:
# Start from a clean Keras/TensorFlow state so re-running this cell does not pile up graphs.
K.clear_session()
sess = tf.Session()
K.set_session(sess)
model = StackedLSTM(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_dim1=HIDDEN_DIM1, hidden_dim2=HIDDEN_DIM2, 
                    deeper_dim=DEEPER_DIM, max_seq_len=MAX_SEQ_LEN, dropout_factor=DROPOUT_FACTOR, 
                    regularization=REGULARIZATION, learning_rate=LEARNING_RATE)
# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 512)         4568576   
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 1024)        6295552   
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 512)         3147776   
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 512)         0         
_________________________________________________________________
dense_1 (Dense)              (None, None, 512)         262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 512)         0         
_________________________________________________________________
dense_2 (Dense)              (None, None, 8923)        4577499   
Total para

In [None]:
class TB(TensorBoard):
    """TensorBoard callback that additionally writes per-batch scalar summaries.

    On every `log_every`-th batch, each entry of the batch `logs` (except the bookkeeping
    keys 'batch' and 'size') is written as a scalar summary, using the running batch counter
    as the step.

    Args:
        log_every: write summaries every this many batches; must be >= 1.
        **kwargs: forwarded unchanged to the TensorBoard constructor.

    Raises:
        ValueError: if `log_every` is smaller than 1.
    """

    def __init__(self, log_every=1, **kwargs):
        if log_every < 1:
            # A value of 0 would otherwise surface later as a ZeroDivisionError in on_batch_end.
            raise ValueError("log_every must be >= 1, got {}".format(log_every))
        super().__init__(**kwargs)
        self.log_every = log_every
        # Total number of batches seen since construction (not reset per epoch).
        self.counter = 0

    def on_batch_end(self, batch, logs=None):
        # The default logs=None used to crash on .items().
        logs = logs or {}
        self.counter += 1
        if self.counter % self.log_every == 0:
            for name, value in logs.items():
                if name in ('batch', 'size'):
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                # Accept numpy scalars as well as plain Python numbers.
                summary_value.simple_value = value.item() if hasattr(value, 'item') else value
                summary_value.tag = name
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()

        super().on_batch_end(batch, logs)

In [None]:
# Train the model with TensorBoard logging, early stopping and checkpointing.
start_time = time()
# Per-batch scalars are written every 10 batches into a time-stamped log directory.
tensorboard = TB(log_dir="./logs/" + MODEL_PREFIX + "/{}".format(time()), 
                          histogram_freq=0, write_graph=True, write_images=False, log_every=10)

# Callbacks, in order: TensorBoard logging; early stopping on val_loss (patience 5);
# one checkpoint per epoch (epoch and val_loss in the file name); the best-val_loss model
# only, always written to the same file.
callbacks=[tensorboard, 
           EarlyStopping(patience=5, monitor='val_loss'),
           ModelCheckpoint(filepath=MODELS_PATH + 'checkpoints/'+ MODEL_PREFIX + '_gen'+str(RUN_INDEX)+'.{epoch:02d}-{val_loss:.2f}.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1), 
           ModelCheckpoint(filepath=MODELS_PATH + MODEL_PREFIX + '_gen'+str(RUN_INDEX)+'.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1, save_best_only=True)]

# Use the configured VALIDATION_SPLIT instead of repeating the literal, so the configuration
# cell stays the single place to change it.
model.fit(X_data, y_data, epochs=25, batch_size=1024, shuffle=True, verbose=1,
          validation_split=VALIDATION_SPLIT, callbacks=callbacks)

print("Total elapsed time: ", time()-start_time)

Train on 666504 samples, validate on 166627 samples
Epoch 1/25

Epoch 00001: saving model to ./models/checkpoints/jokes_stacked_lstm_gen6.01-4.59.hdf5

Epoch 00001: val_loss improved from inf to 4.59488, saving model to ./models/jokes_stacked_lstm_gen6.hdf5
Epoch 2/25

Epoch 00002: saving model to ./models/checkpoints/jokes_stacked_lstm_gen6.02-4.50.hdf5

Epoch 00002: val_loss improved from 4.59488 to 4.49845, saving model to ./models/jokes_stacked_lstm_gen6.hdf5
Epoch 3/25

In [6]:
# generate a sequence from a language model
def generate(model, tokenizer, seed_text, maxlen, probabilistic=False, exploration_factor=0.0):
    """Extend `seed_text` one token at a time with `model` and return the resulting text.

    On every step the model is run on the token ids generated so far and the distribution
    predicted for the last position is used to pick the next token. Generation stops when
    the model predicts id 0, emits 'eos' or 'sos', or the sequence grows beyond `maxlen`
    tokens.

    Args:
        model: object whose `predict` accepts an int array of shape (1, steps) and returns
            an array indexable as [0][-1] to obtain the last step's probabilities.
        tokenizer: object providing `word_index` (word -> id) and `texts_to_sequences`.
        seed_text: space-separated words that start the sequence.
        maxlen: generation stops once the sequence holds more than this many tokens.
        probabilistic: when True, every token is sampled from the predicted distribution.
            (This flag used to be overwritten on each step and had no effect.)
        exploration_factor: probability, per step, of sampling the token instead of taking
            the most likely one when `probabilistic` is False.

    Returns:
        The seed plus the generated words, joined by single spaces.

    Raises:
        ValueError: if `seed_text` contains no token the tokenizer can encode.
    """
    reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}
    seq = tokenizer.texts_to_sequences([seed_text])[0]
    print(seq)
    if not seq:
        raise ValueError("seed_text must contain at least one word known to the tokenizer")

    while True:
        # The complete history is fed to the model, exactly as before: the earlier truncation
        # to the training window length was computed but never used, so it has been removed.
        model_input = np.array([seq])
        y_prob = model.predict(model_input)[0][-1].reshape(1, -1)

        # random.random() lies in [0, 1), so a strict '<' gives exactly `exploration_factor`
        # probability of sampling and never samples when the factor is 0.0.
        sample_next = probabilistic or random.random() < exploration_factor

        if sample_next:
            # Dividing by (sum + 1e-5) keeps the probabilities' sum just below 1, which
            # np.random.multinomial requires.
            normalised = y_prob[0] / (np.sum(y_prob[0]) + 1e-5)
            y_class = int(np.argmax(np.random.multinomial(1, normalised, 1)))
        else:
            y_class = int(y_prob.argmax(axis=-1)[0])

        # Id 0 is not a word (it is the padding id), so treat it as "stop".
        if y_class == 0:
            break
        out_word = reverse_word_map[y_class]
        seq.append(y_class)
        if out_word == 'eos' or len(seq) > maxlen or out_word == 'sos':
            break

    return ' '.join(reverse_word_map[idx] for idx in seq)

In [7]:
# Peek at the first ten training sentences in their current (shuffled) order.
print(sentences[:10])

['sos i finally told her those three magical words every woman wish to hear threedots i give up ! eos', 'sos which cheeseburger makes a big hit in baseball ? a double ! eos', "sos how are america and mcdonald's similar ? they are both run by red - headed clowns . eos", 'sos two toilets go to a potty lol eos', 'sos if my friends circle was a pizza threedots threedots i would be the crust eos', 'sos my buddy told me this one : " people think i\'m ugly until they see my bank account " " then they think i\'m ugly and poor . " eos', 'sos my dog always walked backwards threedots so i named him god . eos', 'sos i dated this muslim girl once threedots she was the bomb ! eos', "sos trying to make a list of things that are worse than mondays and all i've got so far is hitler and christian rock . eos", "sos after sex i always get out my phone and order my wife a dozen roses . my girlfriend thinks i'm hilarious . eos"]


In [8]:
# Reload the best checkpoint and the tokenizer written for this run. The paths are built from
# the same configuration constants used when saving, so changing RUN_INDEX or MODEL_PREFIX
# cannot leave this cell pointing at files from another run.
# NOTE: pickle.load can execute arbitrary code; only load tokenizer files you created yourself.
model = load_model(MODELS_PATH + MODEL_PREFIX + '_gen' + str(RUN_INDEX) + '.hdf5')
with open(MODELS_PATH + MODEL_PREFIX + '_tokenizer_' + str(RUN_INDEX) + '.pickle', 'rb') as pickleFile:
    tokenizer = pickle.load(pickleFile)

In [47]:
# Continue the seed text with the trained model, allowing up to 40 tokens and an exploration
# factor of 0.2 for sampled (rather than most likely) next tokens.
# NOTE(review): the recorded output begins with 'sos in the zoo', which does not match this
# seed, so it appears to come from an earlier run with a different seed -- re-run to refresh.
joke = generate(model, tokenizer, "sos i like ", maxlen=40, exploration_factor=0.2)
print(joke)

[1, 14, 5, 571]
sos in the zoo , the only animal in the entire zoo is a dog . it's a shitzu . eos


In [12]:
def bigrams_list(sentence):
    """Return the consecutive word pairs of `sentence`, each joined by one space.

    The sentence is split on single spaces; a sentence with fewer than two words yields [].
    """
    tokens = sentence.split(' ')
    return [left + ' ' + right for left, right in zip(tokens, tokens[1:])]

print(bigrams_list("sos hello , i'm a dinosaur . eos"))

['sos hello', 'hello ,', ", i'm", "i'm a", 'a dinosaur', 'dinosaur .', '. eos']


In [13]:
# Pre-compute the bigram list of every training sentence. Entry i belongs to sentences[i],
# so this must be rebuilt whenever `sentences` is reloaded or reshuffled.
sentence_bigrams = [bigrams_list(s) for s in sentences]
print(sentence_bigrams[:2])

[['sos i', 'i finally', 'finally told', 'told her', 'her those', 'those three', 'three magical', 'magical words', 'words every', 'every woman', 'woman wish', 'wish to', 'to hear', 'hear threedots', 'threedots i', 'i give', 'give up', 'up !', '! eos'], ['sos which', 'which cheeseburger', 'cheeseburger makes', 'makes a', 'a big', 'big hit', 'hit in', 'in baseball', 'baseball ?', '? a', 'a double', 'double !', '! eos']]


In [14]:
def intersection(lst1, lst2):
    """Return the items of `lst1` that also occur in `lst2`.

    Order and duplicates of `lst1` are preserved; membership is tested against a set built
    from `lst2`.
    """
    members = set(lst2)
    common = []
    for item in lst1:
        if item in members:
            common.append(item)
    return common

def similarity_score(lst1, lst2):
    """Return the fraction of the items of `lst1` that also occur in `lst2`.

    The score is normalised by len(lst1) only, so it is not symmetric in its arguments.
    An empty `lst1` (e.g. the bigrams of a one-word sentence) scores 0.0 instead of raising
    ZeroDivisionError.
    """
    if not lst1:
        return 0.0
    intersection_len = len(intersection(lst1, lst2))
    return (1.0*intersection_len)/len(lst1)
 
def print_closest_sentences(sentence, sentence_bigrams, top_k=3, corpus=None):
    """Print the `top_k` corpus sentences sharing the most bigrams with `sentence`.

    Each printed line has the form '<score>  ->  <corpus sentence>', best score first, where
    the score is the fraction of `sentence`'s bigrams found in the corpus sentence.

    Args:
        sentence: query text (space-separated words).
        sentence_bigrams: one bigram list per corpus sentence, in corpus order.
        top_k: number of sentences to print; fewer are printed if the corpus is smaller,
            nothing is printed if it is <= 0.
        corpus: the sentences `sentence_bigrams` was built from. Defaults to the notebook
            global `sentences`; pass it explicitly to avoid depending on that global still
            being in the order the bigrams were computed from.

    Raises:
        ValueError: if `corpus` and `sentence_bigrams` differ in length, which means they
            cannot be position-aligned.
    """
    if corpus is None:
        corpus = sentences
    if len(corpus) != len(sentence_bigrams):
        raise ValueError(
            "corpus has {} sentences but sentence_bigrams has {} entries".format(
                len(corpus), len(sentence_bigrams)))
    if top_k <= 0:
        return

    bigrams = bigrams_list(sentence)
    scores = np.array([similarity_score(bigrams, sbigrams)
                       for sbigrams in sentence_bigrams])
    # Indices of the highest scores, best first. Iterating over the indices actually
    # available (rather than range(top_k)) avoids an IndexError on small corpora.
    top_k_indices = scores.argsort()[-1*top_k:][::-1]
    for index in top_k_indices:
        print(scores[index], " -> ", corpus[index])

In [38]:
# Show the 10 training jokes sharing the largest fraction of the generated joke's bigrams.
print_closest_sentences(joke, sentence_bigrams, 10)

0.3181818181818182  ->  sos my neighbour complained to me about my dog chasing people on a bike . my dog can't even ride a bike . eos
0.2727272727272727  ->  sos my dog used to chase people on a bike a lot . it got so bad that i finally had to take his bike away . eos
0.2727272727272727  ->  sos a man walks into a zoo threedots the only animal in the entire zoo is a dog . it's a shitzu . eos
0.2727272727272727  ->  sos my dog used to chase people on a bike a lot it got so bad , finally i had to take his bike away . eos
0.2727272727272727  ->  sos my best friend won't talk to me . it's because he is a dog . eos
0.2727272727272727  ->  sos a man walks into a zoo threedots threedots the only animal in the whole entire zoo is a dog . it's a shitzu . eos
0.2727272727272727  ->  sos a man walks into a zoo only to find out the only animal in the entire zoo is a dog . it's a shitzu . eos
0.2727272727272727  ->  sos a man goes to a zoo but the only animal there is a dog . it's a shitzu . eos
0.

In [27]:
# Generate from a "what do you call" seed with an exploration factor of 0.3, then list the
# 10 training jokes with the highest bigram overlap.
joke = generate(model, tokenizer, "sos what do you call", maxlen=40, exploration_factor=0.3)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 17, 20, 9, 66]
sos what do you call he when he just wheel it ? a carrot . eos
0.4666666666666667  ->  sos what do you call the sound a ghost makes when he calls you ? a phone moan . eos
0.4666666666666667  ->  sos what do you call a porno with aliens in it ? a science friction movie . eos
0.4666666666666667  ->  sos what do you call an immature carrot ? a baby carrot . eos
0.4666666666666667  ->  sos what do you call a black brick with three holes in it ? a polish bowling ball . eos
0.4666666666666667  ->  sos what do you call it when you sit on a high chair to see if you want to buy it ? a stool sample . eos
0.4666666666666667  ->  sos what do you call keanu reeves when he served under hitler ? a neo nazi . eos
0.4  ->  sos what do you call a human that's now a cactus ? a transplant . eos
0.4  ->  sos what do you call a baby donkey in spanish ? a burrito . eos
0.4  ->  sos what do you call a person who drinks way too much soda ? a coke - o - nut . eos
0.4  ->  sos what do you call

In [28]:
# Peek at the first ten training sentences again.
# NOTE(review): the recorded output differs from the earlier print of the same slice, so
# `sentences` was reloaded/reshuffled in between; `sentence_bigrams` has to be rebuilt after
# that for the closest-sentence lookups to print the right sentences.
print(sentences[:10])

['sos save the whales . collect the whole set . eos', 'sos mickey mouse q : what kind of mouse walks on two legs ? a : mickey mouse q : what kind of duck walks on two legs ? a : they all do ! eos', 'sos did you know diarrhoea is hereditary ? it runs in your jeans . eos', 'sos i thought i saw jesus in a cookie . but i was wrong . it was just a guy who looked like him . eos', "sos i'm a feminist so is my wife , but because i'm a man , i'm better at it . eos", "sos what's the difference between a good joke and click bait ? this isn't a good joke . eos", 'sos when jokes go to parties , where do they wait for drinks ? in the punchlines . eos', 'sos when i found out top gear was being cancelled threedots i was so shocked it was like a punch in the face . eos', "sos waiter asked if i wanted the soup or salad me : yes , i'll take the super salad eos", 'sos president trump ! what about the aliens from space ? we need a roof ! eos']


In [38]:
# Generate with an exploration factor of 0.0 and list the 10 training jokes with the highest
# bigram overlap.
# NOTE(review): the recorded seed ids hold five tokens, more than this two-word seed encodes
# to, so the saved output looks like it came from a longer seed -- re-run to refresh.
joke = generate(model, tokenizer, "sos i ", maxlen=40, exploration_factor=0.0)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 7, 33, 4, 124]
sos i like a girl with a lazy eye . i just can't find her pencil . eos
0.5294117647058824  ->  sos i used to date a girl with a lazy eye . i had to break up with her . she was seeing somebody on the side . eos
0.47058823529411764  ->  sos i was dating a girl with a lazy eye . unfortunately i had to break up with her because she was seeing someone else on the side . eos
0.4117647058823529  ->  sos i dated a girl with a lazy eye once threedots caught her seeing someone on the side though . eos
0.4117647058823529  ->  sos i once dated a girl with a lazy eye , it was going well for a few months until i realized she was seeing someone else . eos
0.4117647058823529  ->  sos i dated a girl with a lazy eye , but it didn't work out . she was seeing someone else on the side . eos
0.4117647058823529  ->  sos i used to date a girl with a lazy eye threedots turns out she was seeing someone else the whole time . eos
0.4117647058823529  ->  sos i used to date a girl with a lazy eye

In [33]:
# Load a checkpoint of a differently named model ('jokes_bilstm') into `model1`.
# NOTE(review): the recorded run was interrupted (KeyboardInterrupt) and `model1` is not used
# by any cell visible here -- presumably a leftover experiment; confirm before keeping.
model1 =  load_model('models/checkpoints/jokes_bilstm_gen2.08-4.39.hdf5')

KeyboardInterrupt: 

In [20]:
# Generate with the default settings (no exploration) and list the 10 training jokes with the
# highest bigram overlap.
joke = generate(model, tokenizer, "sos a guy finds", maxlen=40)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 4, 107, 1832]
sos a guy finds a rectal thermometer in his pocket and says " i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos i recently bought some shoes from a drug dealer . i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos i recently bought shoes from a drug dealer i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos my drug dealer gave me new shoes today . i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos i bought some shoes from a drug dealer . i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos i bought shoes from my drug dealer today i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos shoes from a drug dealer i bought some shoes from a drug dealer . i don't know w