In [None]:
import csv
import codecs
import numpy as np
import pandas as pd
import pickle 

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Bidirectional
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy

In [34]:
########################################
## set directories and parameters
########################################
# Pre-trained GloVe vectors in text form: one token per line followed by its coefficients.
EMBEDDING_FILE = "./data/glove/glove.6B.100d.txt"
# Pickled training corpus and pickled vocabulary.
TRAIN_DATA_FILE = "./datasets/jokes.pickle"
VOCABULARY_FILE = "./datasets/jokes_vocabulary.pickle"
# Length of each integer input sequence fed to the model.
MAX_SEQUENCE_LENGTH = 10
# Upper bound on the number of rows in the embedding matrix.
MAX_NB_WORDS = 200000
# Width of each embedding vector; presumably must match the GloVe file above (100d) -- TODO confirm.
EMBEDDING_DIM = 100
#VALIDATION_SPLIT = 0.1

In [37]:
# Layer widths: LSTM units and dense units.
num_lstm, num_dense = 300, 256
# Dropout rates for the LSTM and for the dense part of the network.
rate_drop_lstm, rate_drop_dense = 0.25, 0.25

# Activation name kept alongside the other hyper-parameters.
act = 'relu'
#re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

# Run identifier that encodes the hyper-parameters above.
STAMP = 'lstm_%d_%d_%.2f_%.2f' % (
    num_lstm, num_dense, rate_drop_lstm, rate_drop_dense)


In [12]:
# Load GloVe vectors

In [10]:
print('Indexing word vectors')

# Build a {token: float32 vector} lookup from the GloVe text file.
# Each line holds a token followed by its whitespace-separated coefficients.
embeddings_index = {}
# The encoding is given explicitly so parsing does not depend on the platform's
# default locale, and the context manager closes the file even if a line fails
# to parse.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

Indexing word vectors


In [11]:
# Report how many tokens received a vector in embeddings_index.
print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [13]:
# Load the training data and the vocabulary

In [16]:
# Load the pickled corpus and vocabulary. Context managers ensure both file
# handles are closed instead of being left open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load these files if they come from a trusted source.
with open(TRAIN_DATA_FILE, "rb") as data_file:
    data = pickle.load(data_file)
with open(VOCABULARY_FILE, "rb") as vocabulary_file:
    vocabulary = pickle.load(vocabulary_file)

In [17]:
# Number of entries in the loaded training corpus.
len(data)

94195

In [18]:
# Number of entries in the loaded vocabulary.
len(vocabulary)

8338

In [21]:
# Out-of-vocabulary words: vocabulary entries that have no vector in embeddings_index.
oov = [word for word in vocabulary if word not in embeddings_index]

In [23]:
# How many vocabulary words are missing from embeddings_index.
len(oov)

108

In [24]:
# Preview the first ten corpus entries.
data[:10]

['sos what did the duck say to the cashier after she purchased some lipstick ? " just put it on my bill " eos',
 'sos i dont know why this bottle of whiskey tastes so good , but im going to get to the bottom of it . eos',
 'sos hell yeah im a catholic ive been addicted to cats my whole life eos',
 'sos are your parents retarded ? threedots because youre one special lady ! eos',
 'sos apparently there are midget prostitutes in thailand who cost less than a dollar . clearly theyre selling themselves short . eos',
 'sos have you seen www . yawn . com ? yes but im a bit tired of it . eos',
 'sos what do you get when you cross the italian mafia with the russian mafia ? thrown in a lake . eos',
 'sos restaurant a friend said she heard theres a wonderful restaurant on the moon , but nobody goes there because theres no atmosphere threedots eos',
 'sos whats the difference between oral sex and anal sex ? one makes your whole week , the other makes your hole weak . eos',
 'sos if the opposite of

In [30]:
# Fraction of the corpus (taken from the start of `data`) to tokenise.
percentage = 0.05
# Flatten the selected sentences into one list of whitespace-separated tokens,
# keeping corpus order and repeated tokens.
word_list = [
    token
    for sentence in data[:int(percentage * len(data))]
    for token in sentence.split()
]

In [31]:
# Total number of tokens collected in word_list (repeats included).
len(word_list)

98135

In [32]:
########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')

# One row per position in word_list, capped at MAX_NB_WORDS. A row stays
# all-zero when its token has no vector in embeddings_index.
# NOTE(review): word_list contains every token occurrence (duplicates included),
# so row i holds the vector of the i-th token rather than of a word id --
# confirm this matches the integer encoding fed to the Embedding layer.
nb_words = min(MAX_NB_WORDS, len(word_list))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
# Slice to the cap instead of enumerating (and skipping) every token past it.
for i, word in enumerate(word_list[:nb_words]):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Count rows whose entries are all zero. Comparing the row *sum* with 0 would
# also count a non-zero vector whose components happen to cancel out.
print('Null word embeddings: %d' % np.count_nonzero(~embedding_matrix.any(axis=1)))

Preparing embedding matrix
Null word embeddings: 1163


In [35]:
########################################
## define the model structure
########################################
# Non-trainable lookup layer whose weights are initialised from embedding_matrix.
embedding_layer = Embedding(
    input_dim=nb_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [40]:
# Recurrent cell: relu activation, with rate_drop_lstm used for both the input
# dropout and the recurrent dropout.
recurrent_cell = LSTM(
    num_lstm,
    activation="relu",
    dropout=rate_drop_lstm,
    recurrent_dropout=rate_drop_lstm,
)
# Wrap the cell so the sequence is processed in both directions.
lstm_layer = Bidirectional(
    recurrent_cell,
    input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM),
)

In [None]:
# Wire the graph: integer sequence -> embedding lookup -> bidirectional LSTM
# -> dropout -> softmax output.
data_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences= embedding_layer(data_input)
x = lstm_layer(embedded_sequences)
x = Dropout(rate_drop_dense)(x)
# NOTE(review): vocab_size is not defined in any cell shown here -- confirm it is
# set before this cell runs (presumably the number of output classes/words).
preds = Dense(vocab_size, activation='softmax')(x)

In [None]:
########################################
## train the model
########################################
model = Model(inputs=[data_input], outputs=preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
model.summary()

# Run identifier; also used as the stem of the checkpoint file name.
STAMP = 'simple_lstm_glove_vectors_%.2f_%.2f'%(rate_drop_lstm,rate_drop_dense)
print(STAMP)

# Stop once val_loss has not improved for 2 epochs, and keep only the weights
# of the best epoch (by val_loss) on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss',
                                   save_best_only=True, save_weights_only=True)

# NOTE(review): data_train and labels_train are not defined in the cells shown
# here -- confirm they are created before this cell runs.
hist = model.fit(data_train, labels_train,
                 epochs=5, batch_size=256, shuffle=True,
                 callbacks=[early_stopping, model_checkpoint],
                 validation_split=0.1)

# Restore the best checkpointed weights and record the lowest validation loss.
model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])