In [3]:
import pickle
import importlib
from library import data_preprocess as dp
importlib.reload(dp)
import random
from time import time

import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Input
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, Bidirectional
from keras.layers import Embedding, TimeDistributed, Flatten, Merge, Concatenate
from keras import regularizers
from keras.metrics import sparse_categorical_accuracy, sparse_categorical_crossentropy
from keras.models import load_model
from keras.optimizers import Adam
from keras.models import Model

from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
import tensorflow as tf
from keras import backend as K
from keras.utils import multi_gpu_model

In [2]:
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0',
 '/job:localhost/replica:0/task:0/device:GPU:1']

### Variables

In [17]:
DATA_PATH = './datasets/jokes.pickle'
VOCAB_PATH = './datasets/jokes_vocabulary.pickle'
MODELS_PATH = './models/'

MAX_SEQUENCE_LENGTH = 13
VALIDATION_SPLIT = 0.2

MODEL_PREFIX = 'jokes_stacked_lstm'

EMBEDDING_DIM = 512
HIDDEN_DIM1 = 1024
HIDDEN_DIM2 = 512
DEEPER_DIM = 512
DROPOUT_FACTOR = 0.2
REGULARIZATION = 0.00001
LEARNING_RATE = 0.003

DATA_PERCENT = 0.1

RUN_INDEX = 6

In [5]:
with open(DATA_PATH, 'rb') as pickleFile:
    sentences = pickle.load(pickleFile)

with open(VOCAB_PATH, 'rb') as pickleFile:
    vocab = pickle.load(pickleFile)
    
random.shuffle(sentences)

print("Number of sentences = ", len(sentences))
print(sentences[:2])
print("Vocab size = ", len(vocab))
print(vocab[:10])

Number of sentences =  96910
["sos my credit card was stolen today i don't think i'm gonna do anything about it . so far he's spending way less than my wife does . eos", "sos if i ever start with ' this one time i went jogging threedots ' i am not telling the truth . eos"]
Vocab size =  8922
['sos', 'did', 'you', 'hear', 'about', 'the', 'new', 'corduroy', 'pillows', '?']


In [6]:
# tokenize data
num_words = len(vocab)

tokenizer = Tokenizer(num_words=None, filters='', lower=True, split=' ', 
                      char_level=False, oov_token=None)
tokenizer.fit_on_texts(sentences)
assert num_words == len(tokenizer.word_index)

encoded_sentences = tokenizer.texts_to_sequences(sentences)
print(encoded_sentences[:5])

VOCAB_SIZE = len(tokenizer.word_index) + 1
print(VOCAB_SIZE)

[[1, 13, 1173, 928, 24, 1245, 136, 7, 47, 106, 45, 348, 20, 298, 51, 16, 3, 43, 699, 179, 2804, 150, 780, 151, 13, 108, 64, 3, 2], [1, 38, 7, 143, 314, 29, 81, 69, 46, 90, 7, 190, 4552, 15, 81, 7, 237, 58, 508, 5, 1880, 3, 2], [1, 17, 36, 46, 664, 73, 60, 8, 5, 91, 6, 802, 33, 1984, 2], [1, 217, 23, 56, 174, 4222, 279, 787, 6, 42, 4, 23, 253, 299, 2467, 21, 2], [1, 17, 36, 5, 2935, 2059, 1012, 60, 6, 114, 9, 1356, 22, 6, 2]]
8923


In [7]:
# saving
with open(MODELS_PATH + MODEL_PREFIX + '_tokenizer_' + str(RUN_INDEX) + '.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Preparing Training Data

In [8]:
X_data = []
y_data = []
for sentence in encoded_sentences:
    l = len(sentence)
    sliding_window_length = min(l-3, MAX_SEQUENCE_LENGTH)
    step_size = 1
    for i in range(0, l - sliding_window_length, step_size):
        X_data.append(sentence[i:i+sliding_window_length])
        y_data.append(sentence[i+1:i+sliding_window_length+1])
        
print("Total training data size = ", len(X_data))
MAX_SEQ_LEN = max([len(seq) for seq in X_data])
print("Max seq len = ", MAX_SEQ_LEN)
X_data = pad_sequences(X_data, maxlen=MAX_SEQ_LEN, padding='pre')
y_data = pad_sequences(y_data, maxlen=MAX_SEQ_LEN, padding='pre').reshape(-1, MAX_SEQ_LEN, 1)
#y_data = np.array(y_data).reshape(-1,1)
print(X_data.shape)
print(X_data[:2])
print(y_data.shape)
print(y_data[:2])

Total training data size =  833131
Max seq len =  13
(833131, 13)
[[   1   13 1173  928   24 1245  136    7   47  106   45  348   20]
 [  13 1173  928   24 1245  136    7   47  106   45  348   20  298]]
(833131, 13, 1)
[[[  13]
  [1173]
  [ 928]
  [  24]
  [1245]
  [ 136]
  [   7]
  [  47]
  [ 106]
  [  45]
  [ 348]
  [  20]
  [ 298]]

 [[1173]
  [ 928]
  [  24]
  [1245]
  [ 136]
  [   7]
  [  47]
  [ 106]
  [  45]
  [ 348]
  [  20]
  [ 298]
  [  51]]]


In [27]:
# define model
def StackedLSTM(vocab_size, embedding_dim, hidden_dim1, hidden_dim2, deeper_dim, max_seq_len, 
           dropout_factor=0.5, regularization=0.00001, learning_rate=0.001):
    
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, #input_length=max_seq_len, 
                        mask_zero=True, embeddings_regularizer=regularizers.l2(regularization)))
    model.add(LSTM(hidden_dim1, activation='tanh', 
                   kernel_regularizer=regularizers.l2(regularization), 
                   recurrent_regularizer=regularizers.l2(regularization), #unroll=True, 
                   return_sequences = True, dropout=dropout_factor, recurrent_dropout=dropout_factor))
    model.add(LSTM(hidden_dim2, activation='tanh', 
                   kernel_regularizer=regularizers.l2(regularization), 
                   recurrent_regularizer=regularizers.l2(regularization), #unroll=True, 
                   return_sequences = True, dropout=dropout_factor, recurrent_dropout=dropout_factor))
    model.add(TimeDistributed(Dropout(dropout_factor)))
    model.add(Dense(units=deeper_dim, activation='tanh', kernel_regularizer=regularizers.l2(regularization)))
    model.add(Dropout(dropout_factor))
    model.add(Dense(units=vocab_size, activation='softmax', 
              kernel_regularizer=regularizers.l2(regularization)))
    #model = multi_gpu_model(model)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=learning_rate),
                  metrics=[sparse_categorical_crossentropy, sparse_categorical_accuracy], sample_weight_mode='temporal')
    return model

In [28]:
K.clear_session()
sess = tf.Session()
K.set_session(sess)
model = StackedLSTM(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_dim1=HIDDEN_DIM1, hidden_dim2=HIDDEN_DIM2, 
                    deeper_dim=DEEPER_DIM, max_seq_len=MAX_SEQ_LEN, dropout_factor=DROPOUT_FACTOR, 
                    regularization=REGULARIZATION, learning_rate=LEARNING_RATE)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 512)         4568576   
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 1024)        6295552   
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 512)         3147776   
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 512)         0         
_________________________________________________________________
dense_1 (Dense)              (None, None, 512)         262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 512)         0         
_________________________________________________________________
dense_2 (Dense)              (None, None, 8923)        4577499   
Total para

In [29]:
class TB(TensorBoard):
    def __init__(self, log_every=1, **kwargs):
        super().__init__(**kwargs)
        self.log_every = log_every
        self.counter = 0
    
    def on_batch_end(self, batch, logs=None):
        self.counter+=1
        if self.counter%self.log_every==0:
            for name, value in logs.items():
                if name in ['batch', 'size']:
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                summary_value.simple_value = value.item()
                summary_value.tag = name
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()
        
        super().on_batch_end(batch, logs)

In [None]:
start_time = time()
tensorboard = TB(log_dir="./logs/" + MODEL_PREFIX + "/{}".format(time()), 
                          histogram_freq=0, write_graph=True, write_images=False, log_every=10)

callbacks=[tensorboard, 
           EarlyStopping(patience=5, monitor='val_loss'),
           ModelCheckpoint(filepath=MODELS_PATH + 'checkpoints/'+ MODEL_PREFIX + '_gen'+str(RUN_INDEX)+'.{epoch:02d}-{val_loss:.2f}.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1), 
           ModelCheckpoint(filepath=MODELS_PATH + MODEL_PREFIX + '_gen'+str(RUN_INDEX)+'.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1, save_best_only=True)]

model.fit(X_data, y_data, epochs=25, batch_size=1024, shuffle=True, verbose=1, validation_split=0.2, callbacks=callbacks)

print("Total elapsed time: ", time()-start_time)

Train on 666504 samples, validate on 166627 samples
Epoch 1/25

Epoch 00001: saving model to ./models/checkpoints/jokes_stacked_lstm_gen6.01-4.72.hdf5

Epoch 00001: val_loss improved from inf to 4.71779, saving model to ./models/jokes_stacked_lstm_gen6.hdf5
Epoch 2/25

Epoch 00002: saving model to ./models/checkpoints/jokes_stacked_lstm_gen6.02-4.51.hdf5

Epoch 00002: val_loss improved from 4.71779 to 4.51020, saving model to ./models/jokes_stacked_lstm_gen6.hdf5
Epoch 3/25

Epoch 00003: saving model to ./models/checkpoints/jokes_stacked_lstm_gen6.03-4.46.hdf5

Epoch 00003: val_loss improved from 4.51020 to 4.46278, saving model to ./models/jokes_stacked_lstm_gen6.hdf5
Epoch 4/25

Epoch 00004: saving model to ./models/checkpoints/jokes_stacked_lstm_gen6.04-4.44.hdf5

Epoch 00004: val_loss improved from 4.46278 to 4.44359, saving model to ./models/jokes_stacked_lstm_gen6.hdf5


In [2]:
# generate a sequence from a language model
def generate(model, tokenizer, seed_text, maxlen, probabilistic=False, exploration_factor=1.0):
    
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
    seq = tokenizer.texts_to_sequences([seed_text])[0]
    print(seq)
    
    while True:
        encoded_seq = seq
        if len(seq) > MAX_SEQ_LEN:
            encoded_seq = encoded_seq[-1*MAX_SEQ_LEN:]
            
        #padded_seq = pad_sequences([encoded_seq], maxlen=MAX_SEQ_LEN, padding='pre')
        padded_seq = np.array([seq])
        y_prob = model.predict(padded_seq)[0][-1].reshape(1,-1)#[3:].reshape(-1,1)
        
        if random.random() <= exploration_factor:
            probabilistic = True
        else:
            probabilistic = False
            
        if probabilistic:
            y_class = np.argmax(np.random.multinomial(1,y_prob[0]/(np.sum(y_prob[0])+1e-5),1))
        else:
            y_class = y_prob.argmax(axis=-1)[0]
        
        if y_class == 0:
            break
        out_word = reverse_word_map[y_class]
        seq.append(y_class)
        if out_word == 'eos' or len(seq) > maxlen or out_word == 'sos':
            break
    
    words = [reverse_word_map[idx] for idx in seq]
    
    return ' '.join(words)

In [None]:
model = load_model('models/jokes_stacked_lstm_gen5.hdf5')
with open('models/jokes_tokenizer_5.pickle', 'rb') as pickleFile:
    tokenizer = pickle.load(pickleFile)

In [26]:
joke = generate(model, tokenizer, "sos i had to use", maxlen=40)
print(joke)

[1, 7, 84, 8, 258]
(1, 8923)
(1, 8923)
(1, 8923)
(1, 8923)
(1, 8923)
(1, 8923)


ValueError: sum(pvals[:-1]) > 1.0

In [24]:
def bigrams_list(sentence):
    words = sentence.split(' ')
    bigrams = []
    for i in range(0, len(words)-1):
        bigrams.append(words[i]+' '+words[i+1])
    return bigrams

print(bigrams_list("sos hello , i'm a dinosaur . eos"))

['sos hello', 'hello ,', ", i'm", "i'm a", 'a dinosaur', 'dinosaur .', '. eos']


In [15]:
sentence_bigrams = [bigrams_list(s) for s in sentences]
print(sentence_bigrams[:2])

[['sos how', 'how does', 'does an', 'an alabama', 'alabama girl', 'girl know', "know she's", "she's in", 'in for', 'for a', 'a crazy', 'crazy night', 'night ?', '? her', 'her daddy', 'daddy says', 'says he', 'he wants', 'wants her', 'her in', 'in bed', 'bed by', 'by ten', 'ten .', '. eos'], ['sos my', 'my friend', 'friend works', 'works at', 'at a', 'a circumcision', 'circumcision clinic', 'clinic i', 'i asked', 'asked him', 'him if', 'if he', 'he charges', 'charges alot', 'alot for', 'for his', 'his circumcisions', 'circumcisions he', 'he said', 'said "', '" no', 'no ,', ', i', 'i just', 'just keep', 'keep the', 'the tips', 'tips .', '. "', '" eos']]


In [16]:
def intersection(lst1, lst2):
    temp = set(lst2)
    lst3 = [value for value in lst1 if value in temp]
    return lst3

def similarity_score(lst1, lst2):
    intersection_len = len(intersection(lst1, lst2))
    return (1.0*intersection_len)/len(lst1)#+len(lst2)-intersection_len)
 
def print_closest_sentences(sentence, sentence_bigrams, top_k=3):
    bigrams = bigrams_list(sentence)
    scores = np.array([similarity_score(bigrams, sbigrams)
                       for sbigrams in sentence_bigrams])
    top_k_indices = scores.argsort()[-1*top_k:][::-1]
    top_k_scores = scores[top_k_indices]
    for k in range(top_k):
        print(top_k_scores[k], " -> ", sentences[top_k_indices[k]])

In [17]:
print_closest_sentences(joke, sentence_bigrams, 10)

0.6666666666666666  ->  sos i bought some shoes from a drug dealer . i don't know what he laced them with , but i've been tripping all day . eos
0.6666666666666666  ->  sos i bought a pair of shoes from a drug dealer today . i don't know what he laced them with , but i've been tripping all day . eos
0.6666666666666666  ->  sos i recently bought some shoes from a drug dealer . i don't know what he laced them with , but i've been tripping all day . eos
0.6666666666666666  ->  sos i bought some shoes from my dealer . i don't know what he laced them with , but i've been tripping all day . eos
0.6296296296296297  ->  sos my drug dealer gave me new shoes today . i don't know what he laced them with , but i've been tripping all day . eos
0.6296296296296297  ->  sos i recently bought some shoes from a drug dealer threedots i don't know what he laced them with , but i've been tripping all day . eos
0.6296296296296297  ->  sos i bought shoes from my drug dealer today i don't know what he laced t

In [18]:
joke = generate(model, tokenizer, "sos what do you call", maxlen=40)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 17, 20, 9, 66]
sos what do you call a mexican midget ? a paragraph , because he's not a full essay . eos
0.8947368421052632  ->  sos what do you call a mexican midget ? a paragraph , because he's not a full essay eos
0.8947368421052632  ->  sos what do you call a mexican midget ? a paragraph , because he's not a full ese . eos
0.8947368421052632  ->  sos what do you call a mexican midget ? a paragraph because he's not a full essay . eos
0.8947368421052632  ->  sos what do you call a mexican midget ? a paragraph , because he is not a full essay . eos
0.8947368421052632  ->  sos what do you call a mexican with no legs ? a paragraph , because he's not a full essay . eos
0.7894736842105263  ->  sos what do you call a mexican midget ? a paragraph , because he's too short to be an essay . eos
0.7894736842105263  ->  sos what do you call a mexican midget ? a paragraph , because he's too short to be an essay . eos
0.7894736842105263  ->  sos what do you call a mexican midget ? a paragraph 

In [21]:
print(sentences[:10])

["sos a woman was arrested when her boyfriend's body was found in a freezer in their living room . who the hell puts a freezer in the living room ? eos", 'sos what does heroin make you feel like ? more heroin . eos', "sos why couldn't the physicist understand how boats work ? he thought nothing could possibly travel faster than sea . eos", "sos at what age do you tell a highway it's adopted ? eos", "sos russians dolls . they're so full of themselves eos", "sos how many chocolate bunnies can you put into an empty easter basket ? one . after that the basket won't be empty . eos", "sos nurse pops her head into the doctor's office threedots nurse : ' doctor , there's an invisible man in the waiting room . ' doctor : ' tell him i can't see him . ' eos", 'sos what kind of file makes a hole bigger ? a pedophile eos', "sos what is a paranoid man's favorite food ? who wants to know ? eos", "sos friends invited me to a meteor shower party , but i couldn't make it . they were crushed . eos"]


In [19]:
joke = generate(model, tokenizer, "sos what's the difference between being", maxlen=40)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 56, 5, 76, 74, 167]
sos what's the difference between being a pedophile and a rapist ? a pedophile comes up to a bar . eos
0.4  ->  sos a rapist , a pedophile and a priest walks into a bar . eos
0.4  ->  sos a priest , a rapist , and a pedophile walk in to a bar . and he orders a drink . eos
0.4  ->  sos what's the difference between a peeping tom and a rapist ? a rapist doesn't waste time beating around the bush eos
0.4  ->  sos a priest , a pedophile and a rapist walk into a bar . he orders a drink . eos
0.4  ->  sos a rapist , a pedophile and a priest walk into a bar . he orders a drink . eos
0.35  ->  sos what's the difference between a slut and a bitch ? a slut fucks everyone threedots a bitch fucks everyone but you . eos
0.35  ->  sos what's the difference between a bmw driver and a porcupine ? a porcupine has the pricks on the outside . eos
0.35  ->  sos what's the difference between a blonde and a mosquito ? a mosquito will stop sucking after you slap it . eos
0.35  ->  sos

In [29]:
model1 =  load_model('models/checkpoints/jokes_bilstm_gen2.08-4.39.hdf5')

In [20]:
joke = generate(model, tokenizer, "sos a guy finds", maxlen=40)
print(joke)
print_closest_sentences(joke, sentence_bigrams, 10)

[1, 4, 107, 1832]
sos a guy finds a rectal thermometer in his pocket and says " i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos i recently bought some shoes from a drug dealer . i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos i recently bought shoes from a drug dealer i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos my drug dealer gave me new shoes today . i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos i bought some shoes from a drug dealer . i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos i bought shoes from my drug dealer today i don't know what he laced them with , but i've been tripping all day . eos
0.5517241379310345  ->  sos shoes from a drug dealer i bought some shoes from a drug dealer . i don't know w