In [1]:
import pickle
import importlib
from library import data_preprocess as dp
importlib.reload(dp)
import random
from time import time

import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Input
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, Bidirectional
from keras.layers import Embedding, TimeDistributed, Flatten, Merge, Concatenate
from keras import regularizers
from keras.metrics import sparse_categorical_accuracy, sparse_categorical_crossentropy
from keras.models import load_model
from keras.optimizers import Adam
from keras.models import Model

from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
import tensorflow as tf
from keras import backend as K
from keras.utils import multi_gpu_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Input data: pickled corpus and vocabulary, plus the directory prefix for saved artefacts.
DATA_PATH = './datasets/combined.pickle'
VOCAB_PATH = './datasets/combined_vocabulary.pickle'
MODELS_PATH = './models/'

# Pretrained GloVe vectors (archive: http://nlp.stanford.edu/data/glove.6B.zip).
GLOVE_PATH = './data/glove/glove.6B.200d.txt'

# Prefix shared by the tokenizer, TensorBoard log and checkpoint file names.
MODEL_PREFIX = 'combined_jokes_quote_controlled_stacked_lstm_glove'

# Upper bound on the sliding-window length used when building training samples.
MAX_SEQUENCE_LENGTH = 13
# Validation fraction (same value as the validation_split passed to model.fit).
VALIDATION_SPLIT = 0.2

# Width of the GloVe vectors; must agree with the file named in GLOVE_PATH (200d).
GLOVE_EMBEDDING_DIM = 200
# Width of the trainable word embedding.
EMBEDDING_DIM1 = 512
# Units of the dense projection applied to the concatenated embeddings.
EMBEDDING_DIM2 = 512
# Units of the first and second LSTM layers.
HIDDEN_DIM1 = 1024
HIDDEN_DIM2 = 512
# Units of the dense layer that follows the LSTM stack.
DEEPER_DIM = 512
# Dropout rate, L2 regularisation coefficient and Adam learning rate passed to StackedLSTM.
DROPOUT_FACTOR = 0.2
REGULARIZATION = 0.00001
LEARNING_RATE = 0.003

# NOTE(review): not referenced by any cell visible in this notebook.
DATA_PERCENT = 0.1

# Run identifier embedded in the tokenizer and checkpoint file names.
RUN_INDEX = 1

In [3]:
# Read the pickled [label, text] pairs and keep everything except label 2.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(DATA_PATH, 'rb') as data_file:
    sentences = [entry for entry in pickle.load(data_file) if entry[0] != 2]

# Read the pickled vocabulary.
with open(VOCAB_PATH, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# In-place shuffle; no seed is set, so the order differs between runs.
random.shuffle(sentences)

print("Number of sentences = ", len(sentences))
print(sentences[:2])
print("Vocab size = ", len(vocab))
print(vocab[:10])

Number of sentences =  140293
[[0, 'sos the lady at the bakery who draws her eyebrows on is looking extra surprised today eos'], [0, 'sos do you know why one side of the v is slightly larger when birds fly together ? because there are more birds on that side . eos']]
Vocab size =  12614
['liberate', 'savings', 'clip', 'rhino', 'cross', 'encouragement', 'viewed', 'womens', 'lap', 'challenges']


In [4]:
# Each entry of `sentences` is a [label, text] pair: label 0 is a joke, label 1 a quote.
# (Label 2, the tweets, is filtered out when the corpus is loaded.)
CLASSES = [item[0] for item in sentences]
NUM_JOKES = CLASSES.count(0)
NUM_QUOTES = CLASSES.count(1)
print("Number of Jokes: ", NUM_JOKES)
print("Number of Quotes: ", NUM_QUOTES)

# Inverse-frequency weights: the rarer class receives the larger weight.
TOTAL_SENTENCES = NUM_JOKES + NUM_QUOTES
JOKES_WEIGHT = TOTAL_SENTENCES / NUM_JOKES
QUOTES_WEIGHT = TOTAL_SENTENCES / NUM_QUOTES
print("JOKES_WEIGHT: ", JOKES_WEIGHT)
print("QUOTES_WEIGHT: ", QUOTES_WEIGHT)

# Indexed by class label.
CLASS_WEIGHTS = [JOKES_WEIGHT, QUOTES_WEIGHT]

# From here on `sentences` holds only the text of each pair.
# NOTE: this rebinding makes the cell non-idempotent; re-run the loading cell before re-running it.
sentences = [item[1] for item in sentences]

Number of Jokes:  96910
Number of Quotes:  43383
JOKES_WEIGHT:  1.4476627798988753
QUOTES_WEIGHT:  3.233824309061153


In [10]:
# tokenize data
num_words = len(vocab)

# Load a previously pickled tokenizer instead of building a new one.
# NOTE(review): this path differs from the location the saving cell writes to
# (MODELS_PATH + MODEL_PREFIX + '_tokenizer_' ...); confirm which file is authoritative.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("../combined_controlled_stacked_lstm_glove_tokenizer_1.pickle","rb") as fip:
    tokenizer = pickle.load(fip)
# Original from-scratch construction, kept for reference:
# tokenizer = Tokenizer(num_words=None, filters='', lower=True, split=' ', 
#                       char_level=False, oov_token=None)
# NOTE(review): fit_on_texts is called on the *loaded* tokenizer. Presumably the pickle was
# fitted on this same corpus, so the word indices stay as they were; if the corpus differs,
# the indices may change and stop matching a model trained with the pickled tokenizer.
# TODO confirm.
tokenizer.fit_on_texts(sentences)
# Sanity check: the tokenizer indexes exactly as many words as the vocabulary file lists.
assert num_words == len(tokenizer.word_index)

# Convert every sentence to its list of integer word indices.
encoded_sentences = tokenizer.texts_to_sequences(sentences)
print(encoded_sentences[:5])

# One slot more than the number of indexed words; the embedding layers use mask_zero=True,
# so index 0 is presumably reserved for padding.
VOCAB_SIZE = len(tokenizer.word_index) + 1
print(VOCAB_SIZE)

[[1, 4, 797, 39, 4, 4094, 70, 7971, 85, 3415, 26, 12, 271, 1371, 1508, 80, 2], [1, 24, 11, 66, 46, 53, 433, 14, 4, 3242, 12, 2616, 3441, 38, 1706, 750, 474, 10, 61, 74, 35, 94, 1706, 26, 20, 433, 3, 2], [1, 285, 8, 100, 4, 174, 314, 2848, 162, 11, 66, 10, 2], [1, 1716, 12, 129, 1308, 7, 1553, 21, 4, 5468, 11, 3661, 420, 3, 2], [1, 1075, 3533, 46, 24, 6048, 486, 1075, 3533, 10, 61, 4, 999, 3533, 35, 7, 242, 3, 2]]
12615


In [11]:
# Persist the fitted tokenizer next to the models so generation can reuse the same word indices.
tokenizer_path = MODELS_PATH + MODEL_PREFIX + '_tokenizer_' + str(RUN_INDEX) + '.pickle'
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
TAG_SIZE = 2  # number of class tags (0 and 1); width of the one-hot tag input
X_data = []
y_data = []
tag_data = []
weight_data = []

# Slide a window over every encoded sentence. Each input window is paired with the same
# window shifted right by one token (next-token targets), the sentence's class tag and
# the weight of that class.
for idx, sentence in enumerate(encoded_sentences):
    l = len(sentence)
    sliding_window_length = min(l-3, MAX_SEQUENCE_LENGTH)
    if sliding_window_length < 1:
        # Sentences of three tokens or fewer would only yield empty or misaligned windows
        # (all-padding rows once padded), so they contribute no samples.
        continue
    step_size = 1
    for i in range(0, l - sliding_window_length, step_size):
        X_data.append(sentence[i:i+sliding_window_length])
        y_data.append(sentence[i+1:i+sliding_window_length+1])
        tag_data.append(CLASSES[idx])
        weight_data.append(CLASS_WEIGHTS[CLASSES[idx]])

print("Total training data size = ", len(X_data))
MAX_SEQ_LEN = max([len(seq) for seq in X_data])
print("Max seq len = ", MAX_SEQ_LEN)

# Left-pad with zeros so every sample has MAX_SEQ_LEN steps; targets get a trailing unit axis
# as expected by the sparse categorical loss.
X_data = pad_sequences(X_data, maxlen=MAX_SEQ_LEN, padding='pre')
y_data = pad_sequences(y_data, maxlen=MAX_SEQ_LEN, padding='pre').reshape(-1, MAX_SEQ_LEN, 1)
# Repeat each sample's tag along the time axis -> (samples, MAX_SEQ_LEN, 1) before one-hot
# encoding. Built with numpy rather than a nested Python list to avoid millions of tiny lists.
tag_data = to_categorical(np.repeat(np.asarray(tag_data).reshape(-1, 1, 1), MAX_SEQ_LEN, axis=1), TAG_SIZE)
weight_data = np.array(weight_data).reshape(-1,)

# shuffle all four arrays with the same permutation so samples stay aligned
perm = np.random.permutation(X_data.shape[0])
X_data = X_data[perm]
y_data = y_data[perm]
tag_data = tag_data[perm]
weight_data = weight_data[perm]
print(X_data.shape)
print(X_data[:5])
print(y_data.shape)
print(y_data[:5])
print(tag_data.shape)
print(tag_data[:5])
print(weight_data.shape)
print(weight_data[:5])

Total training data size =  1398826
Max seq len =  13
(1398826, 13)
[[  104    13  5908     3    32    65    15    27     5   970    86    54
     34]
 [  235     6    62    20    19   554    14   529     8     3    19     6
    213]
 [   92  2961    46    58  2096  1892     5    68    14   517     9  1358
      3]
 [10117     4   698     8   628     8     9   134    37   129    14   384
    104]
 [  111    17    15   855 12134     8     9   218  3910  4031    20    22
    120]]
(1398826, 13, 1)
[[[   13]
  [ 5908]
  [    3]
  [   32]
  [   65]
  [   15]
  [   27]
  [    5]
  [  970]
  [   86]
  [   54]
  [   34]
  [  524]]

 [[    6]
  [   62]
  [   20]
  [   19]
  [  554]
  [   14]
  [  529]
  [    8]
  [    3]
  [   19]
  [    6]
  [  213]
  [    6]]

 [[ 2961]
  [   46]
  [   58]
  [ 2096]
  [ 1892]
  [    5]
  [   68]
  [   14]
  [  517]
  [    9]
  [ 1358]
  [    3]
  [   49]]

 [[    4]
  [  698]
  [    8]
  [  628]
  [    8]
  [    9]
  [  134]
  [   37]
  [  129]
  [   14]
  [

In [8]:
print('Indexing glove word vectors')
# Map every GloVe token to its pretrained vector.
glove_embeddings_index = {}
# The GloVe text files are UTF-8 encoded; decode explicitly instead of relying on the platform
# default, and use a context manager so the handle is closed even if parsing fails.
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ...".
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_embeddings_index[word] = coefs
print('Total %s word vectors.' % len(glove_embeddings_index))

Indexing glove word vectors
Total 400000 word vectors.


In [9]:
print('Preparing glove embedding matrix')
# Row k holds the GloVe vector of the word with tokenizer index k. Rows stay all-zero for
# words missing from the GloVe index (and for index 0, which no word uses).
glove_embedding_matrix = np.zeros((VOCAB_SIZE, GLOVE_EMBEDDING_DIM))
for token, row in tokenizer.word_index.items():
    vector = glove_embeddings_index.get(token)
    if vector is None:
        continue
    glove_embedding_matrix[row] = vector
# Count the rows whose components sum to zero.
row_is_null = np.sum(glove_embedding_matrix, axis=1) == 0
print('Null word embeddings: %d' % np.sum(row_is_null))
print(glove_embedding_matrix.shape)

Preparing glove embedding matrix
Null word embeddings: 298
(12615, 200)


In [10]:
from keras.layers import merge
from keras.layers.core import Lambda
from keras.models import Model

import tensorflow as tf

def make_parallel(model, gpu_count):
    """Build a data-parallel wrapper that runs `model` on `gpu_count` GPUs.

    Every input batch is cut into `gpu_count` contiguous slices along the batch axis, each
    slice is pushed through `model` on its own GPU, and the per-GPU outputs are concatenated
    back along the batch axis on the CPU.

    Args:
        model: Keras model to replicate.
        gpu_count: number of GPUs (devices '/gpu:0' .. '/gpu:<gpu_count-1>') to use.

    Returns:
        A new Keras Model with the same inputs as `model`, whose outputs are the merged
        per-GPU outputs.
    """
    def get_slice(data, idx, parts):
        """Return the idx-th of `parts` contiguous batch slices of `data`."""
        shape = tf.shape(data)
        batch_size = shape[:1]
        step = batch_size // parts
        if idx == parts - 1:
            # The last slice also takes the remainder, so no sample is dropped when the
            # batch size is not a multiple of `parts`.
            slice_batch = batch_size - step * idx
        else:
            slice_batch = step
        size = tf.concat([slice_batch, shape[1:]], axis=0)
        # Only the batch axis is offset; every other axis starts at 0.
        stride = tf.concat([step, shape[1:]*0], axis=0)
        start = stride * idx
        return tf.slice(data, start, size)

    # One list of per-GPU results for every model output.
    outputs_all = [[] for _ in range(len(model.outputs))]

    #Place a copy of the model on each GPU, each getting a slice of the batch
    for i in range(gpu_count):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('tower_%d' % i):

                inputs = []
                #Slice each input into a piece for processing on this GPU
                for x in model.inputs:
                    input_shape = tuple(x.get_shape().as_list())[1:]
                    slice_n = Lambda(get_slice, output_shape=input_shape, arguments={'idx':i,'parts':gpu_count})(x)
                    inputs.append(slice_n)

                outputs = model(inputs)

                if not isinstance(outputs, list):
                    outputs = [outputs]

                #Save all the outputs for merging back together later
                for out_idx, out in enumerate(outputs):
                    outputs_all[out_idx].append(out)

    # merge outputs on CPU
    with tf.device('/cpu:0'):
        merged = []
        for outputs in outputs_all:
            if len(outputs) == 1:
                # A single replica has nothing to concatenate.
                merged.append(outputs[0])
            else:
                # Keras 2 replacement for the legacy merge(..., mode='concat', concat_axis=0).
                merged.append(Concatenate(axis=0)(outputs))

        return Model(inputs=model.inputs, outputs=merged)

In [11]:
# define model
def StackedLSTM(vocab_size, glove_embedding_dim, glove_embedding_matrix, embedding_dim1, embedding_dim2,
           hidden_dim1, hidden_dim2, deeper_dim, max_seq_len, tag_size,
           dropout_factor=0.5, regularization=0.00001, learning_rate=0.001):
    """Build and compile the tag-conditioned, two-layer LSTM next-token model.

    The token ids are embedded twice (frozen GloVe weights and a trainable embedding), the
    two embeddings are concatenated with the per-step tag vector, projected with a tanh
    dense layer, passed through two sequence-returning LSTMs and a tanh dense layer (each
    followed by dropout), and finally mapped to a softmax over the vocabulary at every step.

    Args:
        vocab_size: number of rows of both embeddings and width of the softmax output.
        glove_embedding_dim: width of the GloVe vectors.
        glove_embedding_matrix: initial (non-trainable) weights of the GloVe embedding.
        embedding_dim1: width of the trainable embedding.
        embedding_dim2: units of the dense projection over the concatenated embeddings.
        hidden_dim1: units of the first LSTM.
        hidden_dim2: units of the second LSTM.
        deeper_dim: units of the dense layer after the LSTMs.
        max_seq_len: accepted for interface compatibility; not used (inputs have a
            variable time dimension).
        tag_size: width of the per-step tag vector.
        dropout_factor: rate used for every dropout (input, recurrent and standalone).
        regularization: L2 coefficient applied to embeddings, kernels and recurrent kernels.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        The compiled Keras Model taking [token ids, tag vectors] as inputs.
    """
    def l2_penalty():
        # Fresh L2 regularizer instance for each place one is needed.
        return regularizers.l2(regularization)

    def recurrent_block(units, incoming):
        # Sequence-returning tanh LSTM with the same dropout on inputs and recurrent state.
        layer = LSTM(units, activation='tanh',
                     kernel_regularizer=l2_penalty(),
                     recurrent_regularizer=l2_penalty(),
                     return_sequences=True,
                     dropout=dropout_factor, recurrent_dropout=dropout_factor)
        return layer(incoming)

    # Variable-length inputs: token ids and one tag vector per time step.
    token_ids = Input(shape=(None,))
    tag_vectors = Input(shape=(None, tag_size))

    # Frozen pretrained embedding; index 0 is masked.
    frozen_glove = Embedding(vocab_size, glove_embedding_dim,
                             weights=[glove_embedding_matrix],
                             mask_zero=True, trainable=False)(token_ids)

    # Trainable embedding learnt alongside the frozen one.
    learned_words = Embedding(vocab_size, embedding_dim1,
                              mask_zero=True,
                              embeddings_regularizer=l2_penalty())(token_ids)

    # Join both embeddings and the tag vector along the feature axis, then project.
    joined = Concatenate(axis=-1)([frozen_glove, learned_words, tag_vectors])
    projected = Dense(units=embedding_dim2, activation='tanh',
                      kernel_regularizer=l2_penalty())(joined)

    # Two stacked LSTMs, both returning the full sequence.
    first_pass = recurrent_block(hidden_dim1, projected)
    second_pass = recurrent_block(hidden_dim2, first_pass)

    # Per-step dropout, a deeper tanh layer, dropout again, then the vocabulary softmax.
    dropped_sequence = TimeDistributed(Dropout(dropout_factor))(second_pass)
    deeper = Dense(units=deeper_dim, activation='tanh',
                   kernel_regularizer=l2_penalty())(dropped_sequence)
    dropped_deeper = Dropout(dropout_factor)(deeper)
    next_token_probs = Dense(units=vocab_size, activation='softmax',
                             kernel_regularizer=l2_penalty())(dropped_deeper)

    model = Model(inputs=[token_ids, tag_vectors], outputs=next_token_probs)
    # Targets are integer ids, hence the sparse loss and metrics.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(lr=learning_rate),
                  metrics=[sparse_categorical_crossentropy, sparse_categorical_accuracy])
    return model

In [12]:
# Start from a clean Keras/TensorFlow state and register a session that may fall back to
# another device when an op cannot be placed where requested.
K.clear_session()
config = tf.ConfigProto(allow_soft_placement=True)
sess = tf.Session(config=config)
K.set_session(sess)

model = StackedLSTM(vocab_size=VOCAB_SIZE, glove_embedding_dim=GLOVE_EMBEDDING_DIM,
                    glove_embedding_matrix=glove_embedding_matrix, 
                    embedding_dim1=EMBEDDING_DIM1, embedding_dim2=EMBEDDING_DIM2,
                    hidden_dim1=HIDDEN_DIM1, hidden_dim2=HIDDEN_DIM2,
                    deeper_dim=DEEPER_DIM, max_seq_len=MAX_SEQ_LEN, dropout_factor=DROPOUT_FACTOR, 
                    regularization=REGULARIZATION, learning_rate=LEARNING_RATE, tag_size=TAG_SIZE)
# summary() prints the table itself and returns None, so wrapping it in print() would only
# append a stray "None" line.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    2523000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 512)    6458880     input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 2)      0                                            
__________________________________________________________________________________________________
concatenat

In [13]:
class TB(TensorBoard):
    """TensorBoard callback that also writes the batch-level metrics every `log_every` batches.

    Args:
        log_every: write the batch metrics once every this many batches.
        **kwargs: forwarded unchanged to the TensorBoard constructor.
    """
    def __init__(self, log_every=1, **kwargs):
        super().__init__(**kwargs)
        self.log_every = log_every
        # Number of batches seen since the callback was created; used as the summary step.
        self.counter = 0

    def on_batch_end(self, batch, logs=None):
        """Write one scalar summary per metric in `logs` on every `log_every`-th batch."""
        self.counter += 1
        if self.counter % self.log_every == 0:
            # `logs` defaults to None; treat that as "no metrics" rather than failing.
            for name, value in (logs or {}).items():
                if name in ['batch', 'size']:
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                # Numpy scalars are unwrapped with .item(); plain Python numbers pass through.
                summary_value.simple_value = value.item() if hasattr(value, 'item') else value
                summary_value.tag = name
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()

        super().on_batch_end(batch, logs)

In [14]:
start_time = time()
# Batch-level TensorBoard logging (every 10th batch) into a fresh, timestamped directory.
tensorboard = TB(log_dir="./logs/" + MODEL_PREFIX + "/{}".format(time()), 
                          histogram_freq=0, write_graph=True, write_images=False, log_every=10)

# Stop after 5 epochs without val_loss improvement; keep one checkpoint per epoch and,
# separately, the best model seen so far.
callbacks=[tensorboard, 
           EarlyStopping(patience=5, monitor='val_loss'),
           ModelCheckpoint(filepath=MODELS_PATH + 'checkpoints/'+ MODEL_PREFIX + '_gen' + str(RUN_INDEX) + '.{epoch:02d}-{val_loss:.2f}.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1), 
           ModelCheckpoint(filepath=MODELS_PATH + MODEL_PREFIX + '_gen'+str(RUN_INDEX)+'.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1, save_best_only=True)]

try:
    # The validation fraction comes from the config cell instead of a duplicated literal.
    model.fit([X_data, tag_data], y_data, epochs=10, batch_size=1024, shuffle=True, 
              verbose=1, validation_split=VALIDATION_SPLIT, callbacks=callbacks, sample_weight=weight_data)
finally:
    # Report the elapsed time even when training is interrupted or fails.
    print("Total elapsed time: ", time()-start_time)

Train on 1119060 samples, validate on 279766 samples
Epoch 1/10

Epoch 00001: saving model to ./models/checkpoints/combined_jokes_quote_controlled_stacked_lstm_glove_gen1.01-10.00.hdf5

Epoch 00001: val_loss improved from inf to 10.00103, saving model to ./models/combined_jokes_quote_controlled_stacked_lstm_glove_gen1.hdf5
Epoch 2/10

Epoch 00002: saving model to ./models/checkpoints/combined_jokes_quote_controlled_stacked_lstm_glove_gen1.02-9.36.hdf5

Epoch 00002: val_loss improved from 10.00103 to 9.36054, saving model to ./models/combined_jokes_quote_controlled_stacked_lstm_glove_gen1.hdf5
Epoch 3/10

Epoch 00003: saving model to ./models/checkpoints/combined_jokes_quote_controlled_stacked_lstm_glove_gen1.03-9.11.hdf5

Epoch 00003: val_loss improved from 9.36054 to 9.11086, saving model to ./models/combined_jokes_quote_controlled_stacked_lstm_glove_gen1.hdf5
Epoch 4/10

Epoch 00004: saving model to ./models/checkpoints/combined_jokes_quote_controlled_stacked_lstm_glove_gen1.04-8.96.

KeyboardInterrupt: 

In [5]:
# Earlier manual weight snapshot, kept for reference:
#model.save_weights(MODELS_PATH + 'checkpoints/'+ MODEL_PREFIX + '_gen' + str(RUN_INDEX)+'_epoch1.hdf5')
# Restore a complete model from a per-epoch checkpoint (file name follows the
# '.{epoch:02d}-{val_loss:.2f}.hdf5' pattern of the per-epoch ModelCheckpoint).
# NOTE(review): the path is hard-coded to one specific checkpoint rather than built from
# MODELS_PATH / MODEL_PREFIX / RUN_INDEX; update it when the run or epoch changes.
model = load_model('./models/checkpoints/combined_jokes_quote_controlled_stacked_lstm_glove_gen1.08-8.66.hdf5')

In [6]:
# generate a sequence from a language model
def generate_categorical(model, tokenizer, seed_text, maxlen, probabilistic=False, exploration_factor=1.0, tag=0):
    """Extend `seed_text` token by token with the model until a stop condition is reached.

    At every step the whole sequence generated so far and a matching one-hot tag sequence
    are fed to the model, and the distribution predicted for the last position is used to
    pick the next token. Generation stops when the padding index 0 is predicted, when
    'eos' or 'sos' is produced, or when the sequence grows beyond `maxlen` tokens.

    Args:
        model: object whose predict([token_ids, tags]) returns per-step probabilities.
        tokenizer: fitted tokenizer providing `word_index` and `texts_to_sequences`.
        seed_text: text to start from; must contain at least one known token.
        maxlen: generation stops once the sequence is longer than this many tokens.
        probabilistic: when True, every token is sampled from the predicted distribution.
        exploration_factor: when `probabilistic` is False, probability (per step) of
            sampling instead of taking the most likely token.
        tag: class index encoded into the tag input.

    Returns:
        The seed plus the generated tokens as one space-joined string.

    Raises:
        ValueError: if `seed_text` contains no token known to the tokenizer.
    """
    reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}
    seq = tokenizer.texts_to_sequences([seed_text])[0]
    print(seq)
    if not seq:
        # An empty sequence cannot be fed to the model; fail with a clear message.
        raise ValueError("seed_text contains no token known to the tokenizer: %r" % (seed_text,))

    while True:
        # The full, unpadded sequence is fed to the model (its inputs have a variable time
        # dimension). The former truncation to MAX_SEQ_LEN was computed but never used.
        model_input = np.array([seq])
        tags = to_categorical(np.full((1, len(seq)), tag), TAG_SIZE)
        # Keep only the distribution predicted for the last time step.
        y_prob = model.predict([model_input, tags])[0][-1].reshape(1, -1)

        # Honour `probabilistic` instead of overwriting it on every step.
        if probabilistic or np.random.rand() <= exploration_factor:
            # The small epsilon keeps the float32 probabilities from summing above 1,
            # which np.random.multinomial rejects.
            y_class = np.argmax(np.random.multinomial(1, y_prob[0]/(np.sum(y_prob[0])+1e-5), 1))
        else:
            y_class = y_prob.argmax(axis=-1)[0]

        if y_class == 0:
            # Index 0 is the padding slot and has no word; stop here.
            break
        out_word = reverse_word_map[y_class]
        seq.append(y_class)
        if out_word == 'eos' or len(seq) > maxlen or out_word == 'sos':
            break

    return ' '.join(reverse_word_map[idx] for idx in seq)

In [28]:
# Generate from a fixed seed with the tag input set to class 0 (the joke label).
# exploration_factor=0.1: about 10% of the tokens are sampled, the rest are taken greedily.
joke = generate_categorical(model, tokenizer, "sos yo mama", maxlen=40, tag=0, exploration_factor=0.1)
print(joke)

[1, 873, 1182]
sos yo mama so fat she had to drop her fist in the oven . she says " i don't know , but i think i was going to get a little bit of a lighter . eos


In [26]:
# Same seed, but with the tag input set to class 1 (the quote label).
# exploration_factor=0.2: about 20% of the tokens are sampled, the rest are taken greedily.
joke = generate_categorical(model, tokenizer, "sos yo mama", maxlen=40, tag=1, exploration_factor=0.2)
print(joke)

[1, 873, 1182]
sos yo mama is so expressive . she has always been the kind of guy that i love in my life , but i have a lot of friends , and i don't think she'd ever seen him . eos


In [29]:
### Generate 20 jokes, each seeded with the first third of a randomly chosen sentence

In [30]:
# Draw 100 sentence indices independently and uniformly (duplicates are possible).
# NOTE(review): this rebinds `random`, shadowing the stdlib `random` module imported in the
# first cell; re-running the data-loading cell (random.shuffle) after this one would fail.
# A rename must be applied together with the cell that iterates over `random[:20]`.
random = np.random.randint(0,len(sentences),100)
random.shape

(100,)

In [36]:
# For the first 20 sampled sentences, use the leading third of the sentence's tokens as
# the seed and store (seed text, generated text) pairs.
joke_list = []
for i in random[:20]:
    seed_tokens = sentences[i].split()
    seed_text = ' '.join(seed_tokens[:len(seed_tokens) // 3])
    generated = generate_categorical(model, tokenizer, seed_text, maxlen=40, tag=0, exploration_factor=0.3)
    joke_list.append((seed_text, generated))

[1, 695, 162, 695, 47, 19, 23, 99, 1904, 503, 21]
[1, 23, 24, 11, 42, 38, 11, 361, 325, 3349, 382]
[1, 6, 1163, 1199]
[1, 6, 24, 32, 111, 13, 239, 95]
[1, 100, 732, 9]
[1, 4798, 4991, 34, 3146, 221, 3, 485, 2610, 54]
[1, 4767, 56, 459, 93, 5, 1778, 124, 37, 117]
[1, 1698, 3]
[1, 6, 1366, 1033, 4]
[1, 23, 24, 11, 107, 5]
[1, 31, 3589, 110, 6, 15, 4, 6763, 8, 65, 134, 65, 899]
[1, 50, 24, 11]
[1, 6, 106, 466, 72]
[1, 6, 36, 13, 184]
[1, 6, 269, 7, 265, 2050, 2013, 8, 89, 3]
[1, 6, 79, 7, 64, 42]
[1, 2571, 99, 998]
[1, 569, 732, 9, 83]
[1, 100, 4, 146, 125, 5, 3739]
[1, 6, 56, 10768, 11, 8, 772, 1172, 12, 5, 10751]


In [37]:
# Display the (seed text, generated text) pairs.
joke_list

[('sos blonde joke blonde : " what does idk stand for',
  'sos blonde joke blonde : " what does idk stand for ? " " a : " little johnny , " they said it was one of the few . eos'),
 ('sos what do you get when you cross hot chicks coming',
  'sos what do you get when you cross hot chicks coming from a gay bar ? a : storage eos'),
 ('sos i bet cats',
  "sos i bet cats are when golfers have gender questions . they don't know how to argue with it . eos"),
 ('sos i do not see my family life',
  "sos i do not see my family life . i don't know what i told them was , but i think they misunderstood me when i said i was pregnant . eos"),
 ("sos what's green and",
  "sos what's green and smells like pork ? kermit the frog's finger and says - wow , i think you should have seen the coolest guy in fb . eos"),
 ('sos edward snowden just joined twitter . almost immediately he',
  'sos edward snowden just joined twitter . almost immediately he had no gaming . i was just a bit of a racist . eos'),
 ('so

In [38]:
# Print only the generated text of each (seed, generated) pair, one per line.
for _seed, generated_text in joke_list:
    print(generated_text)

sos blonde joke blonde : " what does idk stand for ? " " a : " little johnny , " they said it was one of the few . eos
sos what do you get when you cross hot chicks coming from a gay bar ? a : storage eos
sos i bet cats are when golfers have gender questions . they don't know how to argue with it . eos
sos i do not see my family life . i don't know what i told them was , but i think they misunderstood me when i said i was pregnant . eos
sos what's green and smells like pork ? kermit the frog's finger and says - wow , i think you should have seen the coolest guy in fb . eos
sos edward snowden just joined twitter . almost immediately he had no gaming . i was just a bit of a racist . eos
sos gambling can turn into a dangerous two - way - to - stop - motion . eos
sos weed . i was his mother and i was like the same way i forgot to take a picture of her gone . eos
sos i accidentally wet the paper in the shower threedots threedots but i couldn't find it . eos
sos what do you call a black guy 