In [29]:
import pickle
import importlib
from library import data_preprocess as dp
importlib.reload(dp)
import random
from time import time

import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation,Input
from keras.layers import LSTM, Bidirectional
from keras.layers import Embedding
from keras import regularizers
from keras.metrics import sparse_categorical_accuracy, sparse_categorical_crossentropy
from keras.models import load_model
from keras.layers.merge import Concatenate
from keras.utils.np_utils import to_categorical   
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
import tensorflow as tf

from keras import backend as K
from keras.engine.topology import Layer
#from keras import initializations
from keras import initializers, regularizers, constraints

In [2]:
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
        """
        self.supports_masking = True
        #self.init = initializations.get('glorot_uniform')
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # eij = K.dot(x, self.W) TF backend doesn't support it

        # features_dim = self.W.shape[0]
        # step_dim = x._keras_shape[1]

        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
    #print weigthted_input.shape
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        #return input_shape[0], input_shape[-1]
        return input_shape[0],  self.features_dim

### Variables

In [27]:
DATA_PATH = './datasets/combined.pickle'
VOCAB_PATH = './datasets/combined_vocabulary.pickle'
MODELS_PATH = './models/'
EMBEDDING_FILE = "./data/glove/glove.6B.100d.txt"
MAX_SEQ_LEN = 13
VALIDATION_SPLIT = 0.1

EMBEDDING_DIM = 100

HIDDEN_DIM = 512
DROPOUT_FACTOR = 0.333
REGULARIZATION = 0.00001
DEEPER_DIM = 256
DATA_PERCENT = 0.1
RUN_INDEX = 4
TAG_DIM = 3
# EMBEDDING_FILE = "./data/glove/glove.6B.100d.txt"
# TRAIN_DATA_FILE = "./datasets/combined.pickle"
# VOCABULARY_FILE = "./datasets/combined_vocabulary.pickle"
# MAX_SEQUENCE_LENGTH = 10
MAX_NB_WORDS = 200000
# EMBEDDING_DIM = 100
SEQUENCE_STEP = 1
# #VALIDATION_SPLIT = 0.1

In [6]:
with open(DATA_PATH, 'rb') as pickleFile:
    tags_and_sentences = pickle.load(pickleFile)

with open(VOCAB_PATH, 'rb') as pickleFile:
    vocab = pickle.load(pickleFile)
    
random.shuffle(sentences)

print("Number of sentences = ", len(sentences))
print(sentences[:2])
print("Vocab size = ", len(vocab))
print(vocab[:10])

Number of sentences =  270543
[[2, "sos thinking about all the guys who have screwed me over . i just want to find someone who's real . eos"], [0, 'sos what comes after l ? bow eos']]
Vocab size =  12614
['liberate', 'savings', 'clip', 'rhino', 'cross', 'encouragement', 'viewed', 'womens', 'lap', 'challenges']


In [21]:
#Specially process combined data (separate texts and index)
combined_data = np.array(tags_and_sentences)
print(combined_data.shape)
sentences = np.array(combined_data[:,1])
tags = to_categorical(np.array(combined_data[:,0]))
print(sentences.shape)
print(tags.shape)
#print(sentences)

(270543, 2)
(270543,)
(270543, 3)


In [13]:
# tokenize data
num_words = len(vocab)

tokenizer = Tokenizer(num_words=None, filters='', lower=True, split=' ', 
                      char_level=False, oov_token=None)
tokenizer.fit_on_texts(sentences)
assert num_words == len(tokenizer.word_index)
#word to index
word_index = tokenizer.word_index
encoded_sentences = tokenizer.texts_to_sequences(sentences)
print(encoded_sentences[:5])

VOCAB_SIZE = len(tokenizer.word_index) + 1
print(VOCAB_SIZE)

[[1, 72, 14, 205, 56, 4, 104, 4254, 4975, 13, 282, 309, 4114, 1263, 16, 2], [1, 7, 1704, 6, 3990, 8, 2731, 8, 9, 3468, 3, 323, 53, 101, 716, 3, 2], [1, 49, 1195, 11, 2086, 12, 12, 113, 11, 973, 1490, 13, 2], [1, 5, 1567, 2822, 85, 40, 4, 1948, 28, 46, 858, 7, 945, 3, 2], [1, 50, 34, 4, 3231, 2841, 6, 4, 367, 13, 71, 296, 7, 4384, 3, 2]]
12615


In [14]:
# saving
with open(MODELS_PATH + 'combined_tokenizer_' + str(RUN_INDEX) + '.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Preparing Training Data

In [7]:
X_data = []
y_data = []
for sentence in encoded_sentences:
    l = len(sentence)
    sliding_window_length = min(l-3, MAX_SEQUENCE_LENGTH)
    step_size = 1
    for i in range(0, l - sliding_window_length, step_size):
        X_data.append(sentence[i:i+sliding_window_length])
        y_data.append(sentence[i+sliding_window_length])
        
print("Total training data size = ", len(X_data))
MAX_SEQ_LEN = max([len(seq) for seq in X_data])
print("Max seq len = ", MAX_SEQ_LEN)
X_data = pad_sequences(X_data, maxlen=MAX_SEQ_LEN, padding='pre')
y_data = np.array(y_data).reshape(-1,1)
print(X_data.shape)
print(X_data[:2])
print(y_data.shape)
print(y_data[:2])

Total training data size =  833131
Max seq len =  13
(833131, 13)
[[   1  217   23   25   36  256 3028   73  159  138    6    4   23]
 [ 217   23   25   36  256 3028   73  159  138    6    4   23   31]]
(833131, 1)
[[ 31]
 [135]]


In [15]:
print('Indexing word vectors')
#Glove Vectors
embeddings_index = {}
f = open(EMBEDDING_FILE)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Total %s word vectors.' % len(embeddings_index))

Indexing word vectors
Total 400000 word vectors.


In [16]:
print('Preparing embedding matrix')
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word,i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 298


In [35]:
# define model
def BiLSTM(vocab_size, embedding_dim, hidden1_dim,hidden_dim2,hidden_dim3 max_seq_len, 
           tag_dim,dropout_factor=0.2,regularization=0.00001):
    
    model = Sequential()
    model.add(Embedding(vocab_size,embedding_dim,input_length=max_seq_len,weights=[embedding_matrix],mask_zero=True,trainable=False))
    tag_input = Input(shape=(tag_dim,), name='aux_input')
    model.add(Concatenate([model, tag_input]))
    model.add(LSTM(hidden_dim1, 
                                 activation='tanh',
                                 kernel_regularizer=regularizers.l2(regularization),
                                 recurrent_regularizer=regularizers.l2(regularization), unroll=True, return_sequences = True
                                ))
    model.add(LSTM(hidden_dim2, activation='tanh', 
                   kernel_regularizer=regularizers.l2(regularization), 
                   recurrent_regularizer=regularizers.l2(regularization), #unroll=True, 
                   return_sequences = True, dropout=dropout_factor, recurrent_dropout=dropout_factor))
    
     model.add(LSTM(hidden_dim3, activation='tanh', 
                   kernel_regularizer=regularizers.l2(regularization), 
                   recurrent_regularizer=regularizers.l2(regularization), #unroll=True, 
                   return_sequences = True, dropout=dropout_factor, recurrent_dropout=dropout_factor))
    
    
    model.add(Dropout(dropout_factor))
    model.add(Attention(MAX_SEQUENCE_LENGTH))
    model.add(Dense(units=vocab_size, activation='softmax', 
              kernel_regularizer=regularizers.l2(regularization)))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', 
                  metrics=[sparse_categorical_crossentropy, sparse_categorical_accuracy])
    return model

In [36]:
model = BiLSTM(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_dim1=512,hidden_dim2=256,hidden_dim3=128,
              max_seq_len=MAX_SEQ_LEN,tag_dim=TAG_DIM,dropout_factor=DROPOUT_FACTOR, regularization=REGULARIZATION)
print(model.summary())

ValueError: A `Concatenate` layer should be called on a list of at least 2 inputs

In [12]:
class TB(TensorBoard):
    def __init__(self, log_every=1, **kwargs):
        super().__init__(**kwargs)
        self.log_every = log_every
        self.counter = 0
    
    def on_batch_end(self, batch, logs=None):
        self.counter+=1
        if self.counter%self.log_every==0:
            for name, value in logs.items():
                if name in ['batch', 'size']:
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                summary_value.simple_value = value.item()
                summary_value.tag = name
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()
        
        super().on_batch_end(batch, logs)

In [None]:
start_time = time()
tensorboard = TB(log_dir="./logs/jokes_bilstm/{}".format(time()), 
                          histogram_freq=0, write_graph=True, write_images=False, log_every=10)

callbacks=[tensorboard, 
           EarlyStopping(patience=5, monitor='val_loss'),
           ModelCheckpoint(filepath=MODELS_PATH + 'checkpoints/jokes_bilstm_gen'+str(RUN_INDEX)+'.{epoch:02d}-{val_loss:.2f}.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1), 
           ModelCheckpoint(filepath=MODELS_PATH + 'jokes_bilstm_gen'+str(RUN_INDEX)+'.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1, save_best_only=True)]

model.fit(X_data, y_data, epochs=50, batch_size=2048, shuffle=True, verbose=1, validation_split=0.2, 
          callbacks=callbacks)

print("Total elapsed time: ",time()-start_time)

Train on 666504 samples, validate on 166627 samples
Epoch 1/50
 81920/666504 [==>...........................] - ETA: 2:11 - loss: 5.1130 - sparse_categorical_crossentropy: 5.0824 - sparse_categorical_accuracy: 0.2048

In [1]:
# generate a sequence from a language model
def generate(model, tokenizer, seed_text, maxlen):
    
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
    seq = tokenizer.texts_to_sequences([seed_text])[0]
    print(seq)
    
    while True:
        if len(seq) > MAX_SEQ_LEN:
            encoded_seq = seq[-1*MAX_SEQ_LEN:]
        else:
            encoded_seq = seq
        padded_seq = pad_sequences([encoded_seq], maxlen=MAX_SEQ_LEN, padding='pre')
        #padded_seq = np.array([seq])
        y_prob = model.predict(padded_seq)
        #y_class = y_prob.argmax(axis=-1)[0]
        y_class_max = y_prob.argmax(axis=-1)[0]
        y_class_sample = np.argmax(np.random.multinomial(1,y_prob.squeeze(axis=0),1))
        y_class = y_class_sample
        if y_class == 0:
            break
        out_word = reverse_word_map[y_class]
        seq.append(y_class)
        if out_word == 'eos' or len(seq) > maxlen:
            break
    words = [reverse_word_map[idx] for idx in seq]
    return ' '.join(words)

In [29]:
model1 =  load_model('models/checkpoints/jokes_bilstm_gen2.08-4.39.hdf5')

In [2]:
joke = generate(model, tokenizer, "sos a guy finds", maxlen=40)
print(joke)

NameError: name 'model' is not defined

In [19]:
## Document Similarity 

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
joke = generate(model, tokenizer,' '.join(sentences[12].split()[:3]), maxlen=40)
print(joke)
tfidf_vectorizer = TfidfVectorizer()
X_tf_idf_data = tfidf_vectorizer.fit_transform(sentences)
joke_vector = tfidf_vectorizer.transform([joke])
d = cosine_distances(X_tf_idf_data,joke_vector)
sentences[np.argmax(d)]

[1, 159, 92]
sos before we ipad most dyslexics tang blankets beyonce sings restroom hugs presidency vision frame opinion daily itch monkeys care daughter's harry quietly burns archaeologists glove recipe bruno honour minor wimbledon insensitive answers jewish teenagers motherfucking rousey fowl judged marvel la


'sos phew phew phew phew phew phew phew phew phew phew phew phew phew phew phew phew phew phew phew phew phew phew the chosen phew eos'