In [1]:
import csv
import codecs
import numpy as np
import pandas as pd
import pickle 

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Bidirectional
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [36]:
########################################
## Paths and global settings
########################################
# Input artefacts: pre-trained word vectors, pickled corpus, pickled vocabulary.
EMBEDDING_FILE = "./data/glove/glove.6B.100d.txt"
TRAIN_DATA_FILE = "./datasets/jokes.pickle"
VOCABULARY_FILE = "./datasets/jokes_vocabulary.pickle"

# Sequence-modelling settings.
MAX_SEQUENCE_LENGTH = 10   # number of tokens in each input window
SEQUENCE_STEP = 1          # stride between consecutive windows
MAX_NB_WORDS = 200000      # upper bound on the vocabulary size
EMBEDDING_DIM = 100        # width of one word vector (the 100d embedding file)
#VALIDATION_SPLIT = 0.1

In [3]:
# Layer widths.
num_lstm, num_dense = 300, 256
# Dropout rates for the recurrent and the dense part of the network.
rate_drop_lstm = rate_drop_dense = 0.25

act = 'relu'
#re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

# Run tag that encodes the hyper-parameters chosen above.
_stamp_fields = (num_lstm, num_dense, rate_drop_lstm, rate_drop_dense)
STAMP = 'lstm_%d_%d_%.2f_%.2f' % _stamp_fields


## Load GloVe vectors

In [5]:
print('Indexing word vectors')

# Build a word -> vector lookup from the embedding text file.
# Each line is parsed as: the token first, then its coefficients.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps the read independent of the platform's default.
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

Indexing word vectors


In [6]:
# Report how many word vectors were indexed.
n_word_vectors = len(embeddings_index)
print('Total %s word vectors.' % n_word_vectors)

Total 400000 word vectors.


## Load Training data

In [99]:
# Load the pickled training texts and the pickled vocabulary.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# `with` guarantees both file handles are closed once loading finishes.
with open(TRAIN_DATA_FILE, "rb") as train_file:
    data = pickle.load(train_file)
with open(VOCABULARY_FILE, "rb") as vocabulary_file:
    vocabulary = pickle.load(vocabulary_file)
print("#training jokes/quotes: ",len(data))
print("#Vocabulary ",len(vocabulary))

#training jokes/quotes:  94195
#Vocabulary  8338


In [None]:
## Check for out-of-vocabulary (OOV) words: vocabulary words that have no GloVe vector

In [103]:
# Collect every vocabulary word that has no entry in the embedding index.
oov = [word for word in vocabulary if word not in embeddings_index]
print("#non vocabulary words: ",len(oov))
# Show at most the first 100 entries. The previous max(len(oov), 100) bound
# could never truncate the list, so the whole list was always printed.
print(oov[:100])

#non vocabulary words:  108
['shouldnt', 'threedots', 'sleevies', 'theyd', 'tifu', 'howd', 'wifes', 'dumbass', 'shitzu', 'kylo', 'biebers', 'theyll', 'youve', 'selfie', 'everyones', 'blowjob', 'draculas', 'harambe', 'tsss', 'schrodingers', 'redditor', 'hadnt', 'werent', 'pubes', 'brexit', 'whered', 'moaner', 'teethbrush', 'idk', 'unfollow', 'mustve', 'theyve', 'cmon', 'hahaha', 'itll', 'cums', 'necrophiliacs', 'downvote', 'shouldve', 'redditors', 'pornstars', 'clickbait', 'whatd', 'tennish', 'cumference', 'neverlands', 'updog', 'nsfw', 'roamin', 'farted', 'jehovahs', 'maam', 'pussies', 'whatll', 'nobodys', 'titties', 'sjws', 'anyones', 'exs', 'beethovens', 'friendzone', 'reddits', 'spaghetto', 'handjob', 'neckbeard', 'wouldve', 'retweet', 'upvotes', 'beastiality', 'crossfitter', 'douchebag', 'hurty', 'motherfucking', 'hodor', 'shits', 'walmarts', 'badum', 'turds', 'reposts', 'subreddit', 'selfies', 'neckbeards', 'pornhub', 'unfollowed', 'tauntaun', 'thatll', 'beiber', 'ofurniture', 'ar

## Reducing data for faster iterations

In [23]:
# Fraction of the corpus to keep (0.05 keeps the first 5% of the items).
percentage = 0.05
n_items_kept = int(percentage * len(data))
data_reduced = data[:n_items_kept]

## Tokenizing sentences into sequences of integers and building the word-to-id mappings

In [95]:
# Fit a tokenizer on the reduced corpus and turn every text into a list of
# integer word ids. `num_words` is the Keras 2 spelling of the vocabulary cap;
# the old `nb_words` keyword is deprecated.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(data_reduced)
sequences = tokenizer.texts_to_sequences(data_reduced)
# word -> id
word_index = tokenizer.word_index
# id -> word (inverse of word_index)
index_word = {token_id: word for word, token_id in word_index.items()}
print('Found %s unique tokens.' % len(word_index))
print(len(sequences),len(data_reduced))
# One id sequence must exist for every input text.
assert len(data_reduced) == len(sequences)

Found 5928 unique tokens.




In [32]:
# Concatenate all per-text id sequences into one long stream of word ids.
word_index_list = [token_id for sentence in sequences for token_id in sentence]
print("#Total Sequence Length:", len(word_index_list))

## Prepare training data sequences

In [104]:
# Create fixed-length input windows and one-hot next-word targets.

# Vocabulary size is computed here so this cell does not rely on a later cell
# having been executed first.
nb_words = min(MAX_NB_WORDS, len(word_index))

print(SEQUENCE_STEP,MAX_SEQUENCE_LENGTH)

# Start offset of every window in the flat id stream.
window_starts = range(0, len(word_index_list) - MAX_SEQUENCE_LENGTH, SEQUENCE_STEP)

# Each input row holds MAX_SEQUENCE_LENGTH consecutive word ids.
X_train = np.array([word_index_list[start: start + MAX_SEQUENCE_LENGTH]
                    for start in window_starts])

# One target row per window (not per raw offset), so X_train and y_train stay
# aligned for any SEQUENCE_STEP. The builtin `bool` replaces the `np.bool`
# alias, which recent NumPy releases no longer provide.
y_train = np.zeros((len(window_starts), nb_words + 1), dtype=bool)
for row, start in enumerate(window_starts):
    # The target is the word id that immediately follows the window.
    y_train[row, word_index_list[start + MAX_SEQUENCE_LENGTH]] = True

print('nb sequences:', len(X_train))
print(X_train.shape,y_train.shape)
assert X_train.shape[0] == y_train.shape[0]

1 10
nb sequences: 86892
(86892, 10) (86892, 5929)


## Prepare embeddings

In [41]:
print('Preparing embedding matrix')
nb_words = min(MAX_NB_WORDS, len(word_index)) #Vocabulary size
# Row i holds the pre-trained vector of the word whose id is i.
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # The matrix has nb_words rows, so skip any id it cannot hold. Comparing
    # against MAX_NB_WORDS instead let id == len(word_index) through, which
    # raises IndexError as soon as that word has a pre-trained vector.
    # NOTE(review): this leaves the word with the largest id without a row;
    # confirm whether the matrix and the Embedding layer built from it should
    # both be sized nb_words + 1 instead.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# Count rows that are entirely zero (testing the row sum could miscount a
# non-zero vector whose components cancel out).
print('Null word embeddings: %d' % np.count_nonzero(~embedding_matrix.any(axis=1)))

Preparing embedding matrix
Null word embeddings: 72


## Define the model structure

In [105]:
# Frozen embedding lookup initialised from the pre-computed embedding matrix.
# NOTE(review): input_dim is nb_words, so valid ids are 0..nb_words-1. If the
# tokenizer hands out ids starting at 1, the largest id has no embedding row --
# confirm whether input_dim and the matrix should be nb_words + 1.
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
# Bidirectional LSTM encoder with input and recurrent dropout.
lstm_layer = Bidirectional(LSTM(num_lstm, activation="relu",dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm),input_shape=(MAX_SEQUENCE_LENGTH,EMBEDDING_DIM))

# Wire the graph: word ids -> embeddings -> BiLSTM -> dropout -> softmax.
data_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(data_input)
x = lstm_layer(embedded_sequences)
x = Dropout(rate_drop_dense)(x)
# One output unit per column of the one-hot targets (nb_words + 1 classes).
preds = Dense(nb_words+1, activation='softmax')(x)

model = Model(inputs=[data_input],
        outputs=preds)
model.compile(loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=[categorical_accuracy])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 10, 100)           592800    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 600)               962400    
_________________________________________________________________
dropout_4 (Dropout)          (None, 600)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 5929)              3563329   
Total params: 5,118,529
Trainable params: 4,525,729
Non-trainable params: 592,800
_________________________________________________________________
None


In [55]:
# Tag for this run, built from the two dropout rates.
dropout_rates = (rate_drop_lstm, rate_drop_dense)
STAMP = 'simple_lstm_glove_vectors_%.2f_%.2f' % dropout_rates
print(STAMP)

simple_lstm_glove_vectors_0.25_0.25


In [56]:
# Weights file name is derived from the run tag.
bst_model_path = STAMP + '.h5'
# Stop training when the validation loss has not improved for 2 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
# Keep only the best weights seen so far on disk.
model_checkpoint = ModelCheckpoint(
    bst_model_path,
    save_best_only=True,
    save_weights_only=True,
)

## Train the model

In [69]:
# Fit the model, holding out 10% of the samples for validation.
fit_callbacks = [early_stopping, model_checkpoint]
hist = model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=5,
    shuffle=True,
    validation_split=0.1,
    callbacks=fit_callbacks,
)

Train on 78202 samples, validate on 8690 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [70]:
# Restore the checkpointed weights and record the lowest validation loss.
model.load_weights(bst_model_path)
val_loss_per_epoch = hist.history['val_loss']
bst_val_score = min(val_loss_per_epoch)

In [71]:
# Show the lowest validation loss recorded during training.
print(bst_val_score)

5.311095283824363


In [73]:
# Inspect the word-id sequence of the first text.
sequences[0]

[1,
 13,
 30,
 4,
 476,
 57,
 6,
 4,
 791,
 122,
 53,
 4080,
 112,
 1815,
 40,
 118,
 12,
 20,
 9,
 374,
 2]

In [86]:
# Run the model on the first ten training windows.
first_windows = X_train[:10]
predict = model.predict([first_windows])

In [92]:
# Most probable class (word id) for each predicted row.
idx = predict.argmax(axis=1)

In [93]:
# Sanity check: one predicted id per input row.
idx.shape

(10,)

In [94]:
# Display the predicted word ids.
idx

array([ 4, 17,  3,  3,  2,  2,  2,  2,  4,  2])

In [96]:
# Map each predicted id back to its word. The loop variable is named
# `word_id` so it does not shadow the builtin `id`.
for word_id in idx:
    print(index_word[word_id])

the
was
a
a
eos
eos
eos
eos
the
eos
