#### Following https://www.kaggle.com/qqgeogor/keras-lstm-attention-glove840b-lb-0-043

In [1]:
import csv
import codecs
import numpy as np
import pandas as pd
import pickle 

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Bidirectional
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy

from keras import backend as K
from keras.engine.topology import Layer
#from keras import initializations
from keras import initializers, regularizers, constraints

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
class Attention(Layer):
    """Keras layer implementing an attention mechanism for temporal data.

    Supports masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    # Arguments
        step_dim: number of time steps of the input; used to reshape the
            attention scores to `(samples, step_dim)`.
        W_regularizer, b_regularizer: optional regularizers for the score
            weights and the per-step bias.
        W_constraint, b_constraint: optional constraints for the same weights.
        bias: whether to add a learned per-step bias to the scores.

    Put it on top of an RNN layer (GRU/LSTM/SimpleRNN) with
    return_sequences=True. The feature dimension is taken from the input
    shape when the layer is built.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim))
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Run the base initializer first so that none of the attributes set
        # below (in particular `supports_masking`) can be overwritten by it.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # filled in by build()

    def build(self, input_shape):
        """Create the score weights (one per feature) and the optional per-step bias."""
        assert len(input_shape) == 3

        # Keyword arguments keep the call independent of the positional
        # order of `add_weight`, which differs between Keras versions.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the steps axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # eij = K.dot(x, self.W) is not supported by the TF backend, so the
        # input is flattened to 2D, multiplied by W as a column vector and
        # reshaped back to one score per (sample, step).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))),
                        (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # (samples, steps) -> (samples, steps, 1) so it broadcasts over features
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from its config."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

## set directories and parameters

In [3]:
# Locations of the pretrained GloVe vectors and of the pickled training
# corpus / vocabulary, relative to the notebook's working directory.
EMBEDDING_FILE = "./data/glove/glove.6B.100d.txt"
TRAIN_DATA_FILE = "./datasets/jokes.pickle"
VOCABULARY_FILE = "./datasets/jokes_vocabulary.pickle"
MAX_SEQUENCE_LENGTH = 10  # number of word ids in each input window
MAX_NB_WORDS = 200000  # word cap handed to the Tokenizer
EMBEDDING_DIM = 100  # width of an embedding row; matches the 100d GloVe file above
SEQUENCE_STEP = 1  # stride between the starts of consecutive training windows
#VALIDATION_SPLIT = 0.1

In [4]:
# Layer sizes and dropout rates for the network.
num_lstm, num_dense = 300, 256
rate_drop_lstm = rate_drop_dense = 0.25

act = 'relu'
#re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

# Run label assembled from the hyper-parameters above.
STAMP = 'lstm_%d_%d_%.2f_%.2f' % (
    num_lstm,
    num_dense,
    rate_drop_lstm,
    rate_drop_dense,
)


## Load Glove vectors

In [5]:
print('Indexing word vectors')

# GloVe vectors: each line is a word followed by its coefficients,
# separated by whitespace. Map word -> float32 vector.
embeddings_index = {}
# `with` closes the file even if parsing fails. The encoding is explicit so
# the result does not depend on the platform default (the GloVe text files
# are expected to be UTF-8).
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

Indexing word vectors


In [6]:
# Report how many words were read from the embedding file.
print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


## Load Training data

In [7]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only
# load files that come from a trusted source.
# Context managers make sure both file handles are closed after reading.
with open(TRAIN_DATA_FILE, "rb") as data_file:
    data = pickle.load(data_file)
with open(VOCABULARY_FILE, "rb") as vocabulary_file:
    vocabulary = pickle.load(vocabulary_file)
print("#training jokes/quotes: ",len(data))
print("#Vocabulary ",len(vocabulary))

#training jokes/quotes:  94195
#Vocabulary  8338


## Check OOV words that are not present in glove vocabulary

In [9]:
# Vocabulary words that have no pretrained GloVe vector.
oov = [word for word in vocabulary if word not in embeddings_index]
print("#non vocabulary words: ",len(oov))
# Show at most the first 100 entries. The previous bound,
# max(len(oov), 100), was never smaller than the list and so printed it all.
print(oov[:100])

#non vocabulary words:  108
['shouldnt', 'threedots', 'sleevies', 'theyd', 'tifu', 'howd', 'wifes', 'dumbass', 'shitzu', 'kylo', 'biebers', 'theyll', 'youve', 'selfie', 'everyones', 'blowjob', 'draculas', 'harambe', 'tsss', 'schrodingers', 'redditor', 'hadnt', 'werent', 'pubes', 'brexit', 'whered', 'moaner', 'teethbrush', 'idk', 'unfollow', 'mustve', 'theyve', 'cmon', 'hahaha', 'itll', 'cums', 'necrophiliacs', 'downvote', 'shouldve', 'redditors', 'pornstars', 'clickbait', 'whatd', 'tennish', 'cumference', 'neverlands', 'updog', 'nsfw', 'roamin', 'farted', 'jehovahs', 'maam', 'pussies', 'whatll', 'nobodys', 'titties', 'sjws', 'anyones', 'exs', 'beethovens', 'friendzone', 'reddits', 'spaghetto', 'handjob', 'neckbeard', 'wouldve', 'retweet', 'upvotes', 'beastiality', 'crossfitter', 'douchebag', 'hurty', 'motherfucking', 'hodor', 'shits', 'walmarts', 'badum', 'turds', 'reposts', 'subreddit', 'selfies', 'neckbeards', 'pornhub', 'unfollowed', 'tauntaun', 'thatll', 'beiber', 'ofurniture', 'ar

## Reducing data for faster iterations

In [10]:
# Share of the corpus to keep, expressed as a fraction (0.1 keeps one tenth).
percentage = 0.1
# Keep the leading slice of `data`; nothing is shuffled before the cut.
data_reduced = data[:int(percentage*len(data))]

## Tokenizing sentences to create sequences of integers and building word-to-id mappings

In [11]:
# `num_words` is the Keras 2 name of this argument; `nb_words` is the
# legacy Keras 1 spelling.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(data_reduced)
# One list of word ids per input text.
sequences = tokenizer.texts_to_sequences(data_reduced)
# word to index
word_index = tokenizer.word_index
# index to word (inverse of word_index)
index_word = {index: word for word, index in word_index.items()}
print('Found %s unique tokens.' % len(word_index))
print(len(sequences),len(data_reduced))
assert len(data_reduced) == len(sequences)



Found 7150 unique tokens.
9419 9419


In [30]:
# Vocabulary size: the capped number of known words plus one.
nb_words = 1 + min(MAX_NB_WORDS, len(word_index))
# Flatten the per-sentence id lists into one long stream of word ids.
word_index_list = [token_id for sentence in sequences for token_id in sentence]
print("#Total Sequence Length:", len(word_index_list))

#Total Sequence Length: 173427


## Prepare training data sequences

In [31]:
#create sequences
X_train = []
y_train = np.zeros((len(word_index_list)-MAX_SEQUENCE_LENGTH,nb_words),dtype=np.bool)
print(SEQUENCE_STEP,MAX_SEQUENCE_LENGTH)
for i in range(0, len(word_index_list) - MAX_SEQUENCE_LENGTH, SEQUENCE_STEP):
    X_train.append(word_index_list[i: i + MAX_SEQUENCE_LENGTH])
    y_train[i][word_index_list[i + MAX_SEQUENCE_LENGTH]] = 1
X_train = np.array(X_train)
print('nb sequences:', len(X_train))
print(X_train.shape,y_train.shape)
assert X_train.shape[0] == y_train.shape[0]

1 10
nb sequences: 173417
(173417, 10) (173417, 7151)


## prepare embeddings

In [32]:
print('Preparing embedding matrix')
# Row r holds the pretrained vector of the word whose id is r; rows that are
# never assigned keep their initial zeros.
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for token, row in word_index.items():
    if row < MAX_NB_WORDS:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[row] = vector

# Count the rows whose entries sum to zero.
row_sums = np.sum(embedding_matrix, axis=1)
print('Null word embeddings: %d' % np.sum(row_sums == 0))

Preparing embedding matrix
Null word embeddings: 91


## Define the model structure

In [39]:
# Frozen embedding lookup initialised from the pretrained matrix.
embedding_layer = Embedding(nb_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
# Bidirectional LSTM that returns one output per step, as the attention
# layer needs a 3D (samples, steps, features) input.
lstm_layer = Bidirectional(LSTM(num_lstm,
                                activation="relu",
                                return_sequences=True,
                                dropout=rate_drop_lstm,
                                recurrent_dropout=rate_drop_lstm),
                           input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))

data_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(data_input)
x = lstm_layer(embedded_sequences)
x = Dropout(rate_drop_dense)(x)
# Add attention: collapse the steps axis into a single feature vector.
x = Attention(MAX_SEQUENCE_LENGTH)(x)
# Distribution over the vocabulary for the next word.
preds = Dense(nb_words, activation='softmax')(x)

model = Model(inputs=[data_input], outputs=preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=[categorical_accuracy])
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_9 (Embedding)      (None, 10, 100)           715100    
_________________________________________________________________
bidirectional_9 (Bidirection (None, 10, 600)           962400    
_________________________________________________________________
dropout_5 (Dropout)          (None, 10, 600)           0         
_________________________________________________________________
attention_4 (Attention)      (None, 600)               610       
_________________________________________________________________
dense_3 (Dense)              (None, 7151)              4297751   
Total params: 5,975,861
Trainable params: 5,260,761
Non-trainable params: 715,100
____________________________________________________________

In [40]:
# Rebinds STAMP (first built from the LSTM/dense sizes in an earlier cell);
# this value is the one used to name the checkpoint file below.
STAMP = 'simple_lstm_glove_vectors_%.2f_%.2f'%(rate_drop_lstm,rate_drop_dense)
print(STAMP)

simple_lstm_glove_vectors_0.25_0.25


In [41]:
# Weights file named after the current run label.
bst_model_path = STAMP + '.h5'

# Stop training once val_loss has failed to improve for 2 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)

# Keep only the best weights seen so far (weights only, not the full model).
model_checkpoint = ModelCheckpoint(
    bst_model_path,
    save_best_only=True,
    save_weights_only=True,
)

## Train the model

In [None]:
# Train for up to 5 epochs, holding out 10% of the samples for validation;
# the callbacks handle early stopping and checkpointing.
hist = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=256,
    shuffle=True,
    callbacks=[early_stopping, model_checkpoint],
    validation_split=0.1,
)

Train on 156075 samples, validate on 17342 samples
Epoch 1/5


In [70]:
# Restore the weights stored at the checkpoint path.
model.load_weights(bst_model_path)
# Lowest validation loss in the fit history; needs `hist` from the training
# cell to exist in the current kernel session.
bst_val_score = min(hist.history['val_loss'])
print(bst_val_score)

In [None]:
## Test Prediction

In [86]:
# Sanity check on the first 10 training windows (not held-out data).
predict = model.predict([X_train[0:10]])
# Most probable word id for each window.
idx = np.argmax(predict,axis=1)
# `word_id` avoids shadowing the builtin `id`. The fallback covers ids that
# have no entry in index_word, which would otherwise raise KeyError.
for word_id in idx:
    print(index_word.get(int(word_id), '<unk>'))