In [19]:
import importlib
import os
import pickle
import random
from time import time

import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, Bidirectional
from keras.layers import Embedding
from keras import regularizers
from keras.metrics import sparse_categorical_accuracy, sparse_categorical_crossentropy
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
import tensorflow as tf

from library import data_preprocess as dp
importlib.reload(dp)

### Variables

In [2]:
# --- Filesystem locations -------------------------------------------------
DATA_PATH = './datasets/jokes.pickle'              # pickled joke sentences
VOCAB_PATH = './datasets/jokes_vocabulary.pickle'  # pickled vocabulary
MODELS_PATH = './models/'                          # prefix for saved checkpoints

# --- Data preparation -----------------------------------------------------
# Upper bound on the number of context tokens in one training window.
MAX_SEQUENCE_LENGTH = 10
# NOTE(review): not referenced by the visible cells (fit() passes its own
# validation_split) -- confirm whether it is still needed.
VALIDATION_SPLIT = 0.1
# NOTE(review): not referenced by the visible cells; presumably the fraction
# of the corpus to use -- confirm.
DATA_PERCENT = 0.1

# --- Model hyper-parameters (passed to BiLSTM) ----------------------------
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
DROPOUT_FACTOR = 0.333
REGULARIZATION = 1e-5

In [3]:
# Fixed seed for the shuffle below, so that a fresh "Restart & Run All"
# reproduces the same sentence order and therefore the same downstream data.
SHUFFLE_SEED = 42

# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising.
# Only load pickle files that come from a trusted source.
with open(DATA_PATH, 'rb') as data_file:
    sentences = pickle.load(data_file)

with open(VOCAB_PATH, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# Shuffle in place with a dedicated, seeded generator: the order is
# deterministic and the global `random` state is left untouched.
random.Random(SHUFFLE_SEED).shuffle(sentences)

# Quick sanity check of what was loaded.
print("Number of sentences = ", len(sentences))
print(sentences[:2])
print("Vocab size = ", len(vocab))
print(vocab[:10])

Number of sentences =  96910
["sos how many nazi's does it take to screw in a lightbulb ? nein eos", "sos interviewer to me : what is your weakness ? i replied : honesty . interviewer : but honesty is not a weakness that's a good thing . i replied : i do not give a fuck what you think . eos"]
Vocab size =  8922
['sos', 'did', 'you', 'hear', 'about', 'the', 'new', 'corduroy', 'pillows', '?']


In [4]:
# Fit a word-level tokenizer on the corpus and encode every sentence as a
# list of integer word indices.
num_words = len(vocab)

tokenizer = Tokenizer(
    num_words=None,    # no cap on the vocabulary size
    filters='',        # empty filter string: no characters are stripped
    lower=True,
    split=' ',
    char_level=False,
    oov_token=None,
)
tokenizer.fit_on_texts(sentences)

# The tokenizer must discover exactly as many words as the stored vocabulary.
assert len(tokenizer.word_index) == num_words

encoded_sentences = tokenizer.texts_to_sequences(sentences)
print(encoded_sentences[:5])

# One slot more than the number of known words (index 0 is tested as a
# stop/padding value by the generation code further down).
VOCAB_SIZE = 1 + len(tokenizer.word_index)
print(VOCAB_SIZE)

[[1, 32, 112, 8025, 64, 16, 83, 8, 352, 14, 4, 322, 6, 3071, 2], [1, 2178, 8, 22, 23, 17, 18, 35, 3470, 6, 7, 672, 23, 3927, 3, 2178, 23, 41, 3927, 18, 58, 4, 3470, 156, 4, 109, 146, 3, 7, 672, 23, 7, 20, 58, 181, 4, 261, 17, 9, 106, 3, 2], [1, 17, 20, 9, 66, 85, 211, 29, 62, 591, 12, 62, 303, 1254, 147, 4, 682, 6, 4454, 12, 5353, 2], [1, 45, 58, 388, 41, 15, 35, 412, 387, 7515, 3, 11, 32, 18, 28, 388, 6, 11, 7, 79, 7, 329, 388, 3, 9, 89, 941, 3, 8026, 316, 3, 2], [1, 4676, 30, 531, 23, 11, 39, 9, 537, 19, 2179, 29, 65, 9, 401, 67, 6, 11, 2]]
8923


### Preparing Training Data

In [6]:
# Build (context window -> next token) pairs by sliding a window over every
# encoded sentence.
X_data = []
y_data = []
for sentence in encoded_sentences:
    sentence_len = len(sentence)
    # Short sentences get a shorter window (length - 3) so they still produce
    # several targets. The window is clamped to at least one token: a zero or
    # negative window would emit empty contexts (inputs made of padding only)
    # and, through negative slicing, wrong targets or an IndexError.
    window = max(1, min(sentence_len - 3, MAX_SEQUENCE_LENGTH))
    # range() is empty when the sentence is not longer than the window, so
    # sentences that are too short simply contribute no pairs.
    for start in range(sentence_len - window):
        X_data.append(sentence[start:start + window])
        y_data.append(sentence[start + window])

print("Total training data size = ", len(X_data))
assert X_data, "no training windows were produced - is encoded_sentences empty?"

# Longest context actually produced; every shorter context is left-padded
# with zeros up to this length.
MAX_SEQ_LEN = max(len(seq) for seq in X_data)
print("Max seq len = ", MAX_SEQ_LEN)
X_data = pad_sequences(X_data, maxlen=MAX_SEQ_LEN, padding='pre')
# Targets as a column vector of word indices (one integer label per window).
y_data = np.array(y_data).reshape(-1, 1)
print(X_data.shape)
print(X_data[:2])
print(y_data.shape)
print(y_data[:2])

Total training data size =  1073602
Max seq len =  10
(1073602, 10)
[[   1   32  112 8025   64   16   83    8  352   14]
 [  32  112 8025   64   16   83    8  352   14    4]]
(1073602, 1)
[[  4]
 [322]]


In [7]:
# define model
def BiLSTM(vocab_size, embedding_dim, hidden_dim, max_seq_len,
           dropout_factor=0.5, regularization=0.00001):
    """Build and compile the next-word prediction network.

    The network is a Sequential stack of: a zero-masking Embedding, a
    Bidirectional LSTM, Dropout, and a softmax Dense layer with one unit per
    vocabulary entry.

    Args:
        vocab_size: size of the embedding table and of the softmax output.
        embedding_dim: width of each embedding vector.
        hidden_dim: number of units of the wrapped LSTM.
        max_seq_len: length of the input sequences.
        dropout_factor: rate handed to the Dropout layer.
        regularization: L2 factor applied to the embedding, LSTM kernel,
            LSTM recurrent and Dense kernel weights.

    Returns:
        The model, compiled with the adam optimizer and the
        sparse categorical cross-entropy loss.
    """
    def l2_penalty():
        # A separate regularizer object for every weight tensor.
        return regularizers.l2(regularization)

    embedding = Embedding(vocab_size, embedding_dim, input_length=max_seq_len,
                          mask_zero=True, embeddings_regularizer=l2_penalty())
    recurrent = Bidirectional(LSTM(hidden_dim,
                                   kernel_regularizer=l2_penalty(),
                                   recurrent_regularizer=l2_penalty(),
                                   unroll=True))
    dropout = Dropout(dropout_factor)
    classifier = Dense(units=vocab_size, activation='softmax',
                       kernel_regularizer=l2_penalty())

    network = Sequential()
    for layer in (embedding, recurrent, dropout, classifier):
        network.add(layer)

    # Report the un-regularised cross-entropy and the accuracy next to the loss.
    tracked_metrics = [sparse_categorical_crossentropy, sparse_categorical_accuracy]
    network.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
                    metrics=tracked_metrics)
    return network

In [8]:
# Instantiate the network with the hyper-parameters from the configuration cell.
model = BiLSTM(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM,
               max_seq_len=MAX_SEQ_LEN, dropout_factor=DROPOUT_FACTOR, regularization=REGULARIZATION)
# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" to the output).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 128)           1142144   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               788480    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 8923)              4577499   
Total params: 6,508,123
Trainable params: 6,508,123
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
class TB(TensorBoard):
    """TensorBoard callback that additionally logs batch-level scalars.

    Every ``log_every`` batches, each entry of the batch ``logs`` (except the
    bookkeeping keys ``'batch'`` and ``'size'``) is written as a scalar
    summary, using a running batch counter as the step.

    Args:
        log_every: write the batch logs once every this many batches (>= 1).
        **kwargs: forwarded unchanged to the base ``TensorBoard`` callback.

    Raises:
        ValueError: if ``log_every`` is smaller than 1.
    """

    def __init__(self, log_every=1, **kwargs):
        super().__init__(**kwargs)
        if log_every < 1:
            # Fail here with a clear message instead of a ZeroDivisionError
            # on the first batch.
            raise ValueError("log_every must be >= 1, got {!r}".format(log_every))
        self.log_every = log_every
        # Total number of batches seen by this callback; used as the summary step.
        self.counter = 0

    def on_batch_end(self, batch, logs=None):
        """Write the batch logs as scalar summaries, then defer to the base class."""
        self.counter += 1
        if self.counter % self.log_every == 0:
            # `logs` defaults to None, so guard before iterating.
            for name, value in (logs or {}).items():
                if name in ('batch', 'size'):
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                # Accept numpy scalars as well as plain Python numbers.
                summary_value.simple_value = value.item() if hasattr(value, 'item') else value
                summary_value.tag = name
                # NOTE(review): self.writer is not created in this class; it is
                # presumably set up by the base TensorBoard callback - verify.
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()

        # Hand the untouched `logs` object to the base implementation.
        super().on_batch_end(batch, logs)

In [24]:
start_time = time()

# Make sure the checkpoint directory exists before the long training run
# starts, so that the first checkpoint write cannot fail on a missing folder.
os.makedirs(MODELS_PATH, exist_ok=True)

# One log directory per run (timestamped); batch scalars every 10 batches.
tensorboard = TB(log_dir="./logs/jokes_bilstm/{}".format(time()),
                 histogram_freq=0, write_graph=True, write_images=False, log_every=10)

callbacks = [tensorboard,
             EarlyStopping(patience=4, monitor='val_loss'),
             ModelCheckpoint(filepath=MODELS_PATH + 'jokes_bilstm_gen.{epoch:02d}-{val_loss:.2f}.hdf5',
                             monitor='val_loss', verbose=1, mode='auto', period=1)]

# NOTE(review): the configuration cell defines VALIDATION_SPLIT = 0.1, but
# this call hard-codes 0.2 - confirm which value is intended.
model.fit(X_data, y_data, epochs=5, batch_size=256, shuffle=True, verbose=1, validation_split=0.2,
          callbacks=callbacks)

print("Total elapsed time: ", time()-start_time)

Train on 858881 samples, validate on 214721 samples
Epoch 1/5
 66048/858881 [=>............................] - ETA: 46:11 - loss: 4.7209 - sparse_categorical_crossentropy: 4.5004 - sparse_categorical_accuracy: 0.2561

KeyboardInterrupt: 

In [27]:
# generate a sequence from a language model
def generate(model, tokenizer, seed_text, maxlen):
    """Greedily extend ``seed_text`` word by word with the language model.

    At every step the last ``MAX_SEQ_LEN`` token indices are left-padded to
    ``MAX_SEQ_LEN`` and fed to ``model.predict``; the index with the highest
    score is appended to the sequence.

    Generation stops when
      * the model predicts index 0,
      * the predicted word is ``'eos'`` (which is kept in the output), or
      * the sequence holds ``maxlen`` tokens.

    Args:
        model: object with a ``predict`` method returning per-class scores.
        tokenizer: object providing ``word_index`` and ``texts_to_sequences``.
        seed_text: text the generated sequence starts with.
        maxlen: maximum number of tokens in the result, seed included.

    Returns:
        The seed and generated words joined by single spaces.
    """
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    seq = tokenizer.texts_to_sequences([seed_text])[0]

    # Checking the length before predicting caps the result at exactly
    # `maxlen` tokens (checking after the append would allow maxlen + 1).
    while len(seq) < maxlen:
        # A negative-start slice already returns the whole list when it is
        # shorter than MAX_SEQ_LEN, so no explicit length test is needed.
        context = seq[-MAX_SEQ_LEN:]
        padded_seq = pad_sequences([context], maxlen=MAX_SEQ_LEN, padding='pre')
        y_prob = model.predict(padded_seq)
        # Plain int so the sequence holds one consistent element type.
        y_class = int(y_prob.argmax(axis=-1)[0])
        if y_class == 0:
            # Index 0 has no entry in the word map; treat it as "stop".
            break
        seq.append(y_class)
        if index_to_word[y_class] == 'eos':
            break

    return ' '.join(index_to_word[idx] for idx in seq)

In [30]:
# Generate one sequence of at most 40 tokens, seeded with the "sos" marker.
generated_joke = generate(model, tokenizer, "sos", maxlen=40)
print(generated_joke)

[1]
sos the guy says " i said , " i don't know how i am . " eos
