In [35]:
import pickle
import importlib
from library import data_preprocess as dp
importlib.reload(dp)
import random
from time import time

import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, Bidirectional
from keras.layers import Embedding
from keras import regularizers
from keras.metrics import sparse_categorical_accuracy, sparse_categorical_crossentropy
from keras.models import load_model

from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
import tensorflow as tf

from keras import backend as K
from keras.engine.topology import Layer
#from keras import initializations
from keras import initializers, regularizers, constraints

In [36]:
class Attention(Layer):
    """Attention pooling over the time axis of a sequence of feature vectors.

    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]:
    every timestep gets a scalar score ``tanh(x_t . W + b_t)``, the
    exponentiated scores are normalised over the time axis, and the output is
    the weighted sum of the timesteps. Accepts an input mask and does not
    propagate it to the following layers.

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Put it on top of an RNN layer (GRU/LSTM/SimpleRNN) created with
    ``return_sequences=True``. The feature dimension is read from the input
    shape in `build`; the number of timesteps has to be passed as `step_dim`.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim=MAX_SEQUENCE_LENGTH))
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        :param step_dim: number of timesteps (axis 1) of the input tensor.
        :param W_regularizer: regularizer identifier/instance for the score weights.
        :param b_regularizer: regularizer identifier/instance for the per-step bias.
        :param W_constraint: constraint identifier/instance for the score weights.
        :param b_constraint: constraint identifier/instance for the per-step bias.
        :param bias: whether to add a learned bias (one value per timestep).
        :param kwargs: forwarded to the base `Layer` initializer.
        """
        # Run the base initializer first so that nothing assigned below can be
        # reset by it.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0

    def build(self, input_shape):
        """Create the score weights (one per feature) and the optional bias (one per step)."""
        assert len(input_shape) == 3

        # Keyword arguments avoid depending on the positional order of add_weight.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: the mask is consumed here and not passed to the next layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # K.dot(x, self.W) on the 3D tensor is not supported by the TF backend,
        # so flatten to (samples * steps, features), multiply by W as a column
        # vector, and fold the result back to (samples, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))),
                        (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # (samples, steps) -> (samples, steps, 1) so the weights broadcast over features.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output drops the time axis: (samples, features)."""
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild this layer."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

### Variables

In [13]:
# --- Paths -----------------------------------------------------------------
DATA_PATH = './datasets/jokes.pickle'               # pickled list of sentences
VOCAB_PATH = './datasets/jokes_vocabulary.pickle'   # pickled vocabulary list
MODELS_PATH = './models/'                           # prefix for saved tokenizer / models
EMBEDDING_FILE = "./data/glove/glove.6B.100d.txt"   # pre-trained GloVe vectors

# --- Data preparation ------------------------------------------------------
MAX_SEQUENCE_LENGTH = 13    # upper bound on the sliding-window length of a training sample
MAX_NB_WORDS = 200000       # word indices at or above this are skipped when filling the embedding matrix
SEQUENCE_STEP = 1           # NOTE(review): not referenced in the visible cells
VALIDATION_SPLIT = 0.1      # NOTE(review): the visible fit() call passes 0.2 directly
DATA_PERCENT = 0.1          # NOTE(review): not referenced in the visible cells

# --- Model hyper-parameters ------------------------------------------------
EMBEDDING_DIM = 100         # width of the embedding matrix rows
HIDDEN_DIM = 512            # LSTM units per direction
DEEPER_DIM = 256            # units of the dense layer after attention
DROPOUT_FACTOR = 0.333
REGULARIZATION = 0.00001    # L2 factor

# --- Run bookkeeping -------------------------------------------------------
RUN_INDEX = 4               # suffix used in the saved tokenizer / model file names
# #VALIDATION_SPLIT = 0.1

In [4]:
# Load the pre-processed sentences and the vocabulary.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(DATA_PATH, 'rb') as sentences_file:
    sentences = pickle.load(sentences_file)

with open(VOCAB_PATH, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# In-place shuffle; no seed is set in the visible cells, so the order differs between runs.
random.shuffle(sentences)

# Quick sanity check of what was loaded.
print("Number of sentences = ", len(sentences))
print(sentences[:2])
print("Vocab size = ", len(vocab))
print(vocab[:10])

Number of sentences =  96910
['sos a man walks into an eye doctor and asks to see the doctor threedots the nurse replies , " not with that eye ! " eos', 'sos last night i got a handjob from a blind girl she said , " you\'ve got the biggest dick i\'ve ever put my hands on . " i said , " nah , you\'re just pulling my leg . " eos']
Vocab size =  8922
['sos', 'did', 'you', 'hear', 'about', 'the', 'new', 'corduroy', 'pillows', '?']


In [11]:
# tokenize data
num_words = len(vocab)

# Split on single spaces only and filter nothing, so punctuation tokens
# already present in the sentences are kept as words.
tokenizer = Tokenizer(
    num_words=None,
    filters='',
    lower=True,
    split=' ',
    char_level=False,
    oov_token=None,
)
tokenizer.fit_on_texts(sentences)

# The fitted tokenizer must contain exactly the words of the pickled vocabulary.
assert num_words == len(tokenizer.word_index)

# word to index
word_index = tokenizer.word_index
encoded_sentences = tokenizer.texts_to_sequences(sentences)
print(encoded_sentences[:5])

# One more than the number of words: index 0 is left free (the embedding
# layer below is built with mask_zero=True).
VOCAB_SIZE = len(word_index) + 1
print(VOCAB_SIZE)

[[1, 4, 73, 130, 72, 50, 466, 201, 12, 288, 8, 118, 5, 201, 15, 5, 1701, 650, 10, 11, 58, 29, 28, 466, 21, 11, 2], [1, 154, 168, 7, 77, 4, 4120, 78, 4, 298, 124, 70, 79, 10, 11, 395, 77, 5, 1032, 248, 153, 143, 148, 13, 384, 26, 3, 11, 7, 79, 10, 11, 1761, 10, 94, 44, 1984, 13, 564, 3, 11, 2], [1, 32, 20, 9, 67, 5, 3028, 24, 205, 14, 2366, 6, 1919, 498, 16, 2502, 166, 5, 11, 6156, 11, 2], [1, 13, 2537, 1254, 71, 24, 2701, 78, 1157, 1630, 259, 672, 31, 799, 31, 24, 1148, 2], [1, 7, 24, 348, 88, 4, 208, 82, 15, 727, 261, 16, 3, 2]]
8923


In [8]:
# saving
# Persist the fitted tokenizer next to the models, tagged with the run index.
tokenizer_path = MODELS_PATH + 'jokes_tokenizer_' + str(RUN_INDEX) + '.pickle'
with open(tokenizer_path, 'wb') as out_file:
    pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

### Preparing Training Data

In [9]:
def _sliding_windows(token_ids, max_window, step=1):
    """Return (context, next_token) pairs cut from one encoded sentence.

    The window length is ``min(len(token_ids) - 3, max_window)``: sentences
    shorter than ``max_window + 3`` get a shorter window so that they still
    yield three samples. Sentences with three tokens or fewer would end up
    with a zero or negative window (empty or wrapped-around contexts), so
    they produce no samples at all.

    :param token_ids: list of word indices for one sentence.
    :param max_window: maximum number of context tokens per sample.
    :param step: distance between the starts of consecutive windows.
    :return: list of ``(context_ids, target_id)`` tuples.
    """
    window = min(len(token_ids) - 3, max_window)
    if window < 1:
        return []
    return [(token_ids[start:start + window], token_ids[start + window])
            for start in range(0, len(token_ids) - window, step)]


X_data = []
y_data = []
for sentence in encoded_sentences:
    for context_ids, target_id in _sliding_windows(sentence, MAX_SEQUENCE_LENGTH):
        X_data.append(context_ids)
        y_data.append(target_id)

print("Total training data size = ", len(X_data))
MAX_SEQ_LEN = max([len(seq) for seq in X_data])
print("Max seq len = ", MAX_SEQ_LEN)
# Left-pad the shorter contexts so every row has MAX_SEQ_LEN entries.
X_data = pad_sequences(X_data, maxlen=MAX_SEQ_LEN, padding='pre')
# One target index per row, as a column vector.
y_data = np.array(y_data).reshape(-1,1)
print(X_data.shape)
print(X_data[:2])
print(y_data.shape)
print(y_data[:2])

Total training data size =  833131
Max seq len =  13
(833131, 13)
[[  1   4  73 130  72  50 466 201  12 288   8 118   5]
 [  4  73 130  72  50 466 201  12 288   8 118   5 201]]
(833131, 1)
[[201]
 [ 15]]


In [10]:
print('Indexing word vectors')
# GloVe vectors: each line is a word followed by its coefficients.
embeddings_index = {}
# The context manager closes the file even if parsing fails; the explicit
# encoding keeps the read independent of the platform's default codec.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Total %s word vectors.' % len(embeddings_index))

Indexing word vectors
Total 400000 word vectors.


In [15]:
print('Preparing embedding matrix')
# Row r holds the pre-trained vector of the word whose tokenizer index is r.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for token, row in word_index.items():
    if row < MAX_NB_WORDS:
        vector = embeddings_index.get(token)
        # words not found in embedding index will be all-zeros.
        if vector is not None:
            embedding_matrix[row] = vector
# Count the rows whose entries sum to zero.
print('Null word embeddings: %d' % (embedding_matrix.sum(axis=1) == 0).sum())

Preparing embedding matrix
Null word embeddings: 255


In [37]:
# define model
def BiLSTM(vocab_size, embedding_dim, hidden_dim, deeper_dim, max_seq_len,
           dropout_factor=0.5, regularization=0.00001, embedding_weights=None):
    """Build and compile the next-word prediction model.

    Architecture: frozen pre-trained embedding (index 0 masked) ->
    bidirectional LSTM returning the full sequence -> dropout -> attention
    pooling over time -> dense ELU layer -> dropout -> softmax over the
    vocabulary.

    :param vocab_size: number of rows of the embedding and size of the softmax.
    :param embedding_dim: width of the embedding vectors.
    :param hidden_dim: LSTM units per direction.
    :param deeper_dim: units of the dense layer after attention.
    :param max_seq_len: length of the (padded) input sequences.
    :param dropout_factor: dropout rate used after the LSTM and the dense layer.
    :param regularization: L2 factor for the LSTM and dense kernels.
    :param embedding_weights: embedding matrix of shape
        ``(vocab_size, embedding_dim)``; when omitted, the notebook-level
        ``embedding_matrix`` is used.
    :return: compiled Keras ``Sequential`` model.
    """
    if embedding_weights is None:
        embedding_weights = embedding_matrix

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_seq_len,
                        weights=[embedding_weights], mask_zero=True, trainable=False))
    model.add(Bidirectional(LSTM(hidden_dim,
                                 activation='tanh',
                                 kernel_regularizer=regularizers.l2(regularization),
                                 recurrent_regularizer=regularizers.l2(regularization),
                                 unroll=True, return_sequences=True
                                )))
    model.add(Dropout(dropout_factor))
    # Size the attention from the function argument, not from a notebook
    # global, so the layer always matches the embedding's input_length.
    model.add(Attention(max_seq_len))
    model.add(Dense(units=deeper_dim, activation='elu',
              kernel_regularizer=regularizers.l2(regularization)))
    model.add(Dropout(dropout_factor))
    model.add(Dense(units=vocab_size, activation='softmax',
              kernel_regularizer=regularizers.l2(regularization)))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
                  metrics=[sparse_categorical_crossentropy, sparse_categorical_accuracy])
    return model

In [38]:
model = BiLSTM(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, deeper_dim=DEEPER_DIM,
              max_seq_len=MAX_SEQ_LEN, dropout_factor=DROPOUT_FACTOR, regularization=REGULARIZATION)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 13, 100)           892300    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 13, 1024)          2510848   
_________________________________________________________________
dropout_4 (Dropout)          (None, 13, 1024)          0         
_________________________________________________________________
attention_1 (Attention)      (None, 1024)              1037      
_________________________________________________________________
dense_3 (Dense)              (None, 256)               262400    
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 8923)              2293211   
Total para

In [20]:
class TB(TensorBoard):
    """TensorBoard callback that additionally writes batch-level scalars.

    Every `log_every` batches, each entry of the batch logs (except 'batch'
    and 'size') is written as a scalar summary, using the running batch
    counter as the step.
    """

    def __init__(self, log_every=1, **kwargs):
        """
        :param log_every: write batch-level summaries every this many batches.
        :param kwargs: forwarded to `TensorBoard`.
        """
        super().__init__(**kwargs)
        self.log_every = log_every
        # Number of batches seen since the callback was created.
        self.counter = 0

    def on_batch_end(self, batch, logs=None):
        """Write the batch logs as scalar summaries when the counter hits a multiple of `log_every`."""
        # The signature allows logs=None; treat that as "nothing to log".
        logs = logs or {}
        self.counter += 1
        if self.counter % self.log_every == 0:
            for name, value in logs.items():
                if name in ('batch', 'size'):
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                # Unwrap numpy scalars; plain Python numbers are used as they are.
                summary_value.simple_value = value.item() if hasattr(value, 'item') else value
                summary_value.tag = name
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()

        super().on_batch_end(batch, logs)

In [39]:
start_time = time()

# Batch-level TensorBoard logging into a directory named after the current time.
tensorboard = TB(
    log_dir="./logs/jokes_bilstm/{}".format(time()),
    histogram_freq=0,
    write_graph=True,
    write_images=False,
    log_every=10,
)

# Stop once val_loss has not improved for 5 epochs.
early_stopping = EarlyStopping(patience=5, monitor='val_loss')

# One checkpoint file per epoch, named with the epoch number and val_loss.
epoch_checkpoint = ModelCheckpoint(
    filepath=MODELS_PATH + 'checkpoints/jokes_bilstm_gen'+str(RUN_INDEX)+'.{epoch:02d}-{val_loss:.2f}.hdf5',
    monitor='val_loss', verbose=1, mode='auto', period=1)

# A single file that is overwritten only when val_loss improves.
best_checkpoint = ModelCheckpoint(
    filepath=MODELS_PATH + 'jokes_bilstm_gen'+str(RUN_INDEX)+'.hdf5',
    monitor='val_loss', verbose=1, mode='auto', period=1, save_best_only=True)

callbacks = [tensorboard, early_stopping, epoch_checkpoint, best_checkpoint]

model.fit(
    X_data, y_data,
    epochs=1,
    batch_size=2048,
    shuffle=True,
    verbose=1,
    validation_split=0.2,
    callbacks=callbacks,
)

print("Total elapsed time: ", time()-start_time)

Train on 666504 samples, validate on 166627 samples
Epoch 1/1

Epoch 00001: saving model to ./models/checkpoints/jokes_bilstm_gen4.01-5.28.hdf5

Epoch 00001: val_loss improved from inf to 5.27568, saving model to ./models/jokes_bilstm_gen4.hdf5
Total elapsed time:  179.44890713691711


In [24]:
# generate a sequence from a language model
def generate(model, tokenizer, seed_text, maxlen):
    """Greedily extend `seed_text` one word at a time and return the full text.

    At every step the last MAX_SEQ_LEN token ids are left-padded and fed to
    `model`; the index with the highest predicted probability is appended.
    Generation stops when index 0 is predicted (it is not appended), when
    'eos' is produced, or once the sequence holds more than `maxlen` tokens.

    :param model: model whose `predict` returns per-word probabilities.
    :param tokenizer: fitted tokenizer providing `word_index` and `texts_to_sequences`.
    :param seed_text: space-separated words to start from.
    :param maxlen: token count after which generation stops.
    :return: the seed and generated words joined by single spaces.
    """
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    seq = tokenizer.texts_to_sequences([seed_text])[0]
    print(seq)

    finished = False
    while not finished:
        # Only the most recent MAX_SEQ_LEN tokens are used as context.
        context = seq[-1*MAX_SEQ_LEN:] if len(seq) > MAX_SEQ_LEN else seq
        model_input = pad_sequences([context], maxlen=MAX_SEQ_LEN, padding='pre')
        next_id = model.predict(model_input).argmax(axis=-1)[0]
        if next_id == 0:
            break
        next_word = index_to_word[next_id]
        seq.append(next_id)
        finished = next_word == 'eos' or len(seq) > maxlen

    return ' '.join(index_to_word[idx] for idx in seq)

In [31]:
# Generate one joke from a fixed seed with the freshly trained model.
joke = generate(model=model, tokenizer=tokenizer,
                seed_text="sos i had to use napkin and apple", maxlen=40)
print(joke)

[1, 7, 84, 8, 258, 5259, 12, 519]
sos i had to use napkin and apple . eos


In [26]:
def bigrams_list(sentence):
    """Return every pair of adjacent words in `sentence`, joined by one space.

    Words are obtained by splitting on single spaces; a sentence with fewer
    than two words yields an empty list.
    """
    tokens = sentence.split(' ')
    return [left + ' ' + right for left, right in zip(tokens, tokens[1:])]

print(bigrams_list("sos hello , i'm a dinosaur . eos"))

['sos hello', 'hello ,', ", i'm", "i'm a", 'a dinosaur', 'dinosaur .', '. eos']


In [27]:
# Bigrams of every training sentence, in the same order as `sentences`.
sentence_bigrams = list(map(bigrams_list, sentences))
print(sentence_bigrams[:2])

[['sos a', 'a man', 'man walks', 'walks into', 'into an', 'an eye', 'eye doctor', 'doctor and', 'and asks', 'asks to', 'to see', 'see the', 'the doctor', 'doctor threedots', 'threedots the', 'the nurse', 'nurse replies', 'replies ,', ', "', '" not', 'not with', 'with that', 'that eye', 'eye !', '! "', '" eos'], ['sos last', 'last night', 'night i', 'i got', 'got a', 'a handjob', 'handjob from', 'from a', 'a blind', 'blind girl', 'girl she', 'she said', 'said ,', ', "', '" you\'ve', "you've got", 'got the', 'the biggest', 'biggest dick', "dick i've", "i've ever", 'ever put', 'put my', 'my hands', 'hands on', 'on .', '. "', '" i', 'i said', 'said ,', ', "', '" nah', 'nah ,', ", you're", "you're just", 'just pulling', 'pulling my', 'my leg', 'leg .', '. "', '" eos']]


In [28]:
def intersection(lst1, lst2):
    """Return the items of `lst1` that also occur in `lst2`.

    Order and duplicates of `lst1` are preserved.
    """
    members = set(lst2)
    return [value for value in lst1 if value in members]

def similarity_score(lst1, lst2):
    """Return the fraction of items of `lst1` that also occur in `lst2`.

    The score is normalised by ``len(lst1)`` only (not by the size of the
    union), so it is not symmetric. An empty `lst1` scores 0.0 instead of
    raising ZeroDivisionError; this happens for a single-word sentence,
    which has no bigrams.
    """
    if len(lst1) == 0:
        return 0.0
    return (1.0 * len(intersection(lst1, lst2))) / len(lst1)
 
def print_closest_sentences(sentence, sentence_bigrams, top_k=3):
    """Print the `top_k` training sentences sharing the most bigrams with `sentence`.

    Each candidate is scored with `similarity_score` against the bigrams of
    `sentence`; results are printed best first as ``score  ->  sentence``.
    The printed text is looked up in the notebook-level `sentences` list, so
    `sentence_bigrams` must be index-aligned with it.

    :param sentence: space-separated text to compare.
    :param sentence_bigrams: list of bigram lists, one per candidate sentence.
    :param top_k: number of matches to print; fewer are printed when there
        are fewer candidates, and nothing is printed for values below 1.
    """
    bigrams = bigrams_list(sentence)
    scores = np.array([similarity_score(bigrams, sbigrams)
                       for sbigrams in sentence_bigrams])
    # Best first, then cut to top_k; slicing (rather than indexing
    # range(top_k)) cannot run past the end when there are fewer candidates.
    top_k_indices = scores.argsort()[::-1][:max(top_k, 0)]
    for index in top_k_indices:
        print(scores[index], " -> ", sentences[index])

In [29]:
# Show the ten training sentences that share the most bigrams with the generated joke.
print_closest_sentences(joke, sentence_bigrams, top_k=10)

0.45454545454545453  ->  sos i was getting a massage and i asked the masseuse if it was normal for a man to get an erection he replied that it was . so i asked if he could get it out of my face . eos
0.45454545454545453  ->  sos i told my boyfriend that i felt like i had been forgetting a lot of things lately . he said , " because i've been fucking your brains out . " i've never laughed so hard . eos
0.45454545454545453  ->  sos i had to take the batteries out of my carbon monoxide detector . all the beeping was giving me a headache and making me sleepy . eos
0.45454545454545453  ->  sos i had to use my glasses when playing tennis . because its a no contact sport . eos
0.45454545454545453  ->  sos i recently came into a lot of money . the bank teller wasn't happy about having to use gloves . eos
0.45454545454545453  ->  sos i stepped on an ant hill today and realized i had probably killed a lot of innocent ants . i also killed all the ant rapists so , i'm a hero . eos
0.363636363636363

In [29]:
# custom_objects lets Keras resolve the notebook's Attention class if this
# checkpoint contains such a layer; it is ignored otherwise.
# NOTE(review): model1 is not referenced in the visible cells that follow — confirm it is still needed.
model1 = load_model('models/checkpoints/jokes_bilstm_gen2.08-4.39.hdf5',
                    custom_objects={'Attention': Attention})

In [104]:
# Generate from a short seed and list the closest training sentences.
# NOTE(review): this uses `model`, not the `model1` checkpoint loaded above — confirm that is intended.
joke = generate(model=model, tokenizer=tokenizer, seed_text="sos a guy finds", maxlen=40)
print(joke)
print_closest_sentences(joke, sentence_bigrams, top_k=10)

[1, 4, 107, 1837]
sos a guy finds up . period is it . eos
0.4  ->  sos what did the baby milk say to his older sister ? you're spoiled ! eos
0.4  ->  sos girl are you a dishwasher ? because i would like to fill you with my dirty load in the evening , turn you on , and fall asleep before you finish eos
0.4  ->  sos if a shark attacks you , do not punch him in the nose . be the bigger person and just ignore him . eos
0.3  ->  sos when you have the opportunity to become a bigger person , take it because cake is delicious . eos
0.3  ->  sos what's the difference between hitler and michael phelps ? michael phelps could finish a race . eos
0.3  ->  sos the difference between " like " " love " and " in love " is the same as the difference between " for now " " for a while " and " forever " eos
0.3  ->  sos what do engineers use for birth control ? personality . eos
0.3  ->  sos what's the only thing working out at the gym ? the business plan . eos
0.3  ->  sos my wife thinks i'm cheating on h