CNN-RNN Model with Glove Embeddings (no POS Tags)

Based heavily on models from:

https://github.com/Smerity/keras_snli

https://github.com/bradleypallen/keras-quora-question-pairs/blob/master/keras-quora-question-pairs.py

In [1]:
from __future__ import print_function
import numpy as np
import csv, datetime, time, json
from zipfile import ZipFile
from os.path import expanduser, exists
import re
import pickle
import pandas as pd

In [2]:
# leverage keras for faster development
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.data_utils import get_file
from keras import backend as K
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [7]:
# Initialize global variables
KERAS_DATASETS_DIR = 'Data/'
QUESTION_PAIRS_FILE_URL = 'Data/'
QUESTION_PAIRS_FILE = 'train_lite.df.pkl'
GLOVE_ZIP_FILE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
GLOVE_ZIP_FILE = 'glove.840B.300d.zip'
GLOVE_FILE = 'glove.840B.300d.txt'
Q1_TRAINING_GLOVE_FILE = 'q1_train_GLOVE.npy'
Q2_TRAINING_GLOVE_FILE = 'q2_train_GLOVE.npy'
LABEL_TRAINING_GLOVE_FILE = 'label_train_GLOVE.npy'
WORD_EMBEDDING_MATRIX_GLOVE_FILE = 'word_embedding_matrix_GLOVE.npy'
NB_WORDS_GLOVE_FILE = 'nb_words_glove.json'
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25
EMBEDDING_DIM = 300
MODEL_WEIGHTS_FILE = 'question_pairs_weights_glove.h5'
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.1
RNG_SEED = 13371447
NB_EPOCHS = 25
#NB_EPOCHS = 1
DROPOUT = 0.1
BATCH_SIZE = 32
OPTIMIZER = 'adam'

In [8]:
# build questions with POS tags
def build_sent(q, q_pos):
    q = str(q).split()
    q_pos = str(q_pos).split()
    if len(q) > MAX_SEQUENCE_LENGTH/2:
        # cut it off
        q = q[0:MAX_SEQUENCE_LENGTH]
    else:
        # pad to max_time
        amt_to_pad = int((MAX_SEQUENCE_LENGTH/2) - len(q))
        q.extend([0] * amt_to_pad)
    if len(q_pos) > MAX_SEQUENCE_LENGTH/2:
        # cut it off
        q_pos = q_pos[0:MAX_SEQUENCE_LENGTH]
    else:
        # pad to max_time
        amt_to_pad = int((MAX_SEQUENCE_LENGTH/2) - len(q_pos))
        q_pos.extend([0] * amt_to_pad)
    
    q_final = q + q_pos
    q_final = ' '.join(str(w) for w in q_final)
    return q_final

In [9]:
# from word embedding example provided by instructors
def pretty_timedelta(fmt="%d:%02d:%02d", since=None, until=None):
    """Pretty-print a timedelta, using the given format string."""
    since = since or time.time()
    until = until or time.time()
    delta_s = until - since
    hours, remainder = divmod(delta_s, 3600)
    minutes, seconds = divmod(remainder, 60)
    return fmt % (hours, minutes, seconds)

In [10]:
# load questions from POS tag process
print("Processing Questions")

question1 = []
question2 = []
is_duplicate = []
    
f = "Data/train0.df.pkl.gz"
dev = pd.read_pickle(f)
    
t0 = time.time()
train_pos = pd.DataFrame()
i = 1000
while i < 364000:
    f = "Data/train"+str(i)+".df.pkl.gz"
    print("Starting ", f, i, i+1000, pretty_timedelta(since=t0))
    x = pd.read_pickle(f)
    train_pos = pd.concat([train_pos, x])
    print("Finished ", f, i, i+1000, pretty_timedelta(since=t0))
    i += 1000
    if i == 26000:
        i = 27000 # skip bad file
    
# lets add in dev rows
f = "Data/dev0.df.pkl.gz"
dev = pd.read_pickle(f)
    
t0 = time.time()
i = 1000
while i < 41000:
    f = "Data/dev"+str(i)+".df.pkl.gz"
    print("Starting ", f, i, i+1000, pretty_timedelta(since=t0))
    x = pd.read_pickle(f)
    train_pos = pd.concat([train_pos, x])
    print("Finished ", f, i, i+1000, pretty_timedelta(since=t0))
    i += 1000
    if i == 34000:
        i = 35000 # skip bad file
    
    
for index, row in train_pos.iterrows():
    q1=row["question1"]
    # q1_pos = row["question1_pos"]
    q1_pos = ""
    q2=row["question2"]
    # q2_pos = row["question2_pos"]
    q2_pos = ""
    question1.append(build_sent(q1,q1_pos))
    question2.append(build_sent(q2,q2_pos))
    
    is_duplicate.append(row['is_duplicate'])

print('Question pairs: %d' % len(question1))
# end reading and building sentences
    
    
# Build tokenized word index
questions = question1 + question2
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

# Download and process GloVe embeddings
if not exists(KERAS_DATASETS_DIR + GLOVE_ZIP_FILE):
    zipfile = ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL))
    zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)

print("Processing", GLOVE_FILE)

embeddings_index = {}
with open(KERAS_DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

# Prepare word embedding matrix
nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector
        
print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))

# Prepare training data tensors
q1_data = pad_sequences(question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(is_duplicate, dtype=int)
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

# Persist training and configuration data to files
np.save(open(Q1_TRAINING_GLOVE_FILE, 'wb'), q1_data)
np.save(open(Q2_TRAINING_GLOVE_FILE, 'wb'), q2_data)
np.save(open(LABEL_TRAINING_GLOVE_FILE, 'wb'), labels)
np.save(open(WORD_EMBEDDING_MATRIX_GLOVE_FILE, 'wb'), word_embedding_matrix)
with open(NB_WORDS_GLOVE_FILE, 'w') as f:
    json.dump({'nb_words': nb_words}, f)


Processing Questions
Starting  Data/train1000.df.pkl.gz 1000 2000 0:00:00
Finished  Data/train1000.df.pkl.gz 1000 2000 0:00:00
Starting  Data/train2000.df.pkl.gz 2000 3000 0:00:00
Finished  Data/train2000.df.pkl.gz 2000 3000 0:00:00
Starting  Data/train3000.df.pkl.gz 3000 4000 0:00:00
Finished  Data/train3000.df.pkl.gz 3000 4000 0:00:00
Starting  Data/train4000.df.pkl.gz 4000 5000 0:00:00
Finished  Data/train4000.df.pkl.gz 4000 5000 0:00:00
Starting  Data/train5000.df.pkl.gz 5000 6000 0:00:00
Finished  Data/train5000.df.pkl.gz 5000 6000 0:00:00
Starting  Data/train6000.df.pkl.gz 6000 7000 0:00:00
Finished  Data/train6000.df.pkl.gz 6000 7000 0:00:00
Starting  Data/train7000.df.pkl.gz 7000 8000 0:00:00
Finished  Data/train7000.df.pkl.gz 7000 8000 0:00:00
Starting  Data/train8000.df.pkl.gz 8000 9000 0:00:00
Finished  Data/train8000.df.pkl.gz 8000 9000 0:00:00
Starting  Data/train9000.df.pkl.gz 9000 10000 0:00:00
Finished  Data/train9000.df.pkl.gz 9000 10000 0:00:00
Starting  Data/train100

Finished  Data/train77000.df.pkl.gz 77000 78000 0:00:04
Starting  Data/train78000.df.pkl.gz 78000 79000 0:00:04
Finished  Data/train78000.df.pkl.gz 78000 79000 0:00:05
Starting  Data/train79000.df.pkl.gz 79000 80000 0:00:05
Finished  Data/train79000.df.pkl.gz 79000 80000 0:00:05
Starting  Data/train80000.df.pkl.gz 80000 81000 0:00:05
Finished  Data/train80000.df.pkl.gz 80000 81000 0:00:05
Starting  Data/train81000.df.pkl.gz 81000 82000 0:00:05
Finished  Data/train81000.df.pkl.gz 81000 82000 0:00:05
Starting  Data/train82000.df.pkl.gz 82000 83000 0:00:05
Finished  Data/train82000.df.pkl.gz 82000 83000 0:00:05
Starting  Data/train83000.df.pkl.gz 83000 84000 0:00:05
Finished  Data/train83000.df.pkl.gz 83000 84000 0:00:05
Starting  Data/train84000.df.pkl.gz 84000 85000 0:00:05
Finished  Data/train84000.df.pkl.gz 84000 85000 0:00:05
Starting  Data/train85000.df.pkl.gz 85000 86000 0:00:05
Finished  Data/train85000.df.pkl.gz 85000 86000 0:00:05
Starting  Data/train86000.df.pkl.gz 86000 87000 

Finished  Data/train148000.df.pkl.gz 148000 149000 0:00:13
Starting  Data/train149000.df.pkl.gz 149000 150000 0:00:13
Finished  Data/train149000.df.pkl.gz 149000 150000 0:00:13
Starting  Data/train150000.df.pkl.gz 150000 151000 0:00:13
Finished  Data/train150000.df.pkl.gz 150000 151000 0:00:13
Starting  Data/train151000.df.pkl.gz 151000 152000 0:00:13
Finished  Data/train151000.df.pkl.gz 151000 152000 0:00:13
Starting  Data/train152000.df.pkl.gz 152000 153000 0:00:13
Finished  Data/train152000.df.pkl.gz 152000 153000 0:00:13
Starting  Data/train153000.df.pkl.gz 153000 154000 0:00:13
Finished  Data/train153000.df.pkl.gz 153000 154000 0:00:14
Starting  Data/train154000.df.pkl.gz 154000 155000 0:00:14
Finished  Data/train154000.df.pkl.gz 154000 155000 0:00:14
Starting  Data/train155000.df.pkl.gz 155000 156000 0:00:14
Finished  Data/train155000.df.pkl.gz 155000 156000 0:00:14
Starting  Data/train156000.df.pkl.gz 156000 157000 0:00:14
Finished  Data/train156000.df.pkl.gz 156000 157000 0:00:

Finished  Data/train218000.df.pkl.gz 218000 219000 0:00:25
Starting  Data/train219000.df.pkl.gz 219000 220000 0:00:25
Finished  Data/train219000.df.pkl.gz 219000 220000 0:00:25
Starting  Data/train220000.df.pkl.gz 220000 221000 0:00:25
Finished  Data/train220000.df.pkl.gz 220000 221000 0:00:26
Starting  Data/train221000.df.pkl.gz 221000 222000 0:00:26
Finished  Data/train221000.df.pkl.gz 221000 222000 0:00:26
Starting  Data/train222000.df.pkl.gz 222000 223000 0:00:26
Finished  Data/train222000.df.pkl.gz 222000 223000 0:00:26
Starting  Data/train223000.df.pkl.gz 223000 224000 0:00:26
Finished  Data/train223000.df.pkl.gz 223000 224000 0:00:26
Starting  Data/train224000.df.pkl.gz 224000 225000 0:00:26
Finished  Data/train224000.df.pkl.gz 224000 225000 0:00:27
Starting  Data/train225000.df.pkl.gz 225000 226000 0:00:27
Finished  Data/train225000.df.pkl.gz 225000 226000 0:00:27
Starting  Data/train226000.df.pkl.gz 226000 227000 0:00:27
Finished  Data/train226000.df.pkl.gz 226000 227000 0:00:

Finished  Data/train288000.df.pkl.gz 288000 289000 0:00:41
Starting  Data/train289000.df.pkl.gz 289000 290000 0:00:41
Finished  Data/train289000.df.pkl.gz 289000 290000 0:00:42
Starting  Data/train290000.df.pkl.gz 290000 291000 0:00:42
Finished  Data/train290000.df.pkl.gz 290000 291000 0:00:42
Starting  Data/train291000.df.pkl.gz 291000 292000 0:00:42
Finished  Data/train291000.df.pkl.gz 291000 292000 0:00:42
Starting  Data/train292000.df.pkl.gz 292000 293000 0:00:42
Finished  Data/train292000.df.pkl.gz 292000 293000 0:00:42
Starting  Data/train293000.df.pkl.gz 293000 294000 0:00:42
Finished  Data/train293000.df.pkl.gz 293000 294000 0:00:43
Starting  Data/train294000.df.pkl.gz 294000 295000 0:00:43
Finished  Data/train294000.df.pkl.gz 294000 295000 0:00:43
Starting  Data/train295000.df.pkl.gz 295000 296000 0:00:43
Finished  Data/train295000.df.pkl.gz 295000 296000 0:00:43
Starting  Data/train296000.df.pkl.gz 296000 297000 0:00:43
Finished  Data/train296000.df.pkl.gz 296000 297000 0:00:

Finished  Data/train358000.df.pkl.gz 358000 359000 0:01:01
Starting  Data/train359000.df.pkl.gz 359000 360000 0:01:01
Finished  Data/train359000.df.pkl.gz 359000 360000 0:01:02
Starting  Data/train360000.df.pkl.gz 360000 361000 0:01:02
Finished  Data/train360000.df.pkl.gz 360000 361000 0:01:02
Starting  Data/train361000.df.pkl.gz 361000 362000 0:01:02
Finished  Data/train361000.df.pkl.gz 361000 362000 0:01:02
Starting  Data/train362000.df.pkl.gz 362000 363000 0:01:02
Finished  Data/train362000.df.pkl.gz 362000 363000 0:01:03
Starting  Data/train363000.df.pkl.gz 363000 364000 0:01:03
Finished  Data/train363000.df.pkl.gz 363000 364000 0:01:03
Starting  Data/dev1000.df.pkl.gz 1000 2000 0:00:00
Finished  Data/dev1000.df.pkl.gz 1000 2000 0:00:00
Starting  Data/dev2000.df.pkl.gz 2000 3000 0:00:00
Finished  Data/dev2000.df.pkl.gz 2000 3000 0:00:00
Starting  Data/dev3000.df.pkl.gz 3000 4000 0:00:00
Finished  Data/dev3000.df.pkl.gz 3000 4000 0:00:01
Starting  Data/dev4000.df.pkl.gz 4000 5000 0:

In [11]:
#print('Word embeddings: ' % word_embedding_matrix.shape)
print('Shape of embedding matrix:', word_embedding_matrix.shape)
print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of embedding matrix: (94380, 300)
Null word embeddings: 28741
Shape of question1 data tensor: (400290, 25)
Shape of question2 data tensor: (400290, 25)
Shape of label tensor: (400290,)


In [12]:
# Partition the dataset into train and test sets
X = np.stack((q1_data, q2_data), axis=1)
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_test = X_test[:,0]
Q2_test = X_test[:,1]

In [13]:
# Define the model
question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

# embedding
q1 = Embedding(nb_words + 1, 
                 EMBEDDING_DIM, 
                 weights=[word_embedding_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)(question1)
# print ("q1 shape:", q1.shape)
q1 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(q1)
# max pooling
q1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(q1)
# print ("q1 shape:", q1.shape)
q2 = Embedding(nb_words + 1, 
                 EMBEDDING_DIM, 
                 weights=[word_embedding_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)(question2)
q2 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(q2)
q2 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(q2)

# merge and send into RNN
merged = concatenate([q1,q2])
# print(merged.shape)
merged = Dense(200, activation='relu')(merged)
# print(merged.shape)

# 4 layer
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1,question2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer=OPTIMIZER, metrics=['accuracy'])


In [14]:
# Train the model, checkpointing weights with best validation accuracy
print("Starting training at", datetime.datetime.now())
t0 = time.time()
callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True)]
history = model.fit([Q1_train, Q2_train],
                    y_train,
                    epochs=NB_EPOCHS,
                    validation_split=VALIDATION_SPLIT,
                    verbose=2,
                    batch_size=BATCH_SIZE,
                    callbacks=callbacks)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2017-08-21 00:35:44.425403
Train on 324234 samples, validate on 36027 samples
Epoch 1/25
1055s - loss: 0.5518 - acc: 0.7206 - val_loss: 0.5097 - val_acc: 0.7464
Epoch 2/25
1043s - loss: 0.4951 - acc: 0.7576 - val_loss: 0.5064 - val_acc: 0.7422
Epoch 3/25
1042s - loss: 0.4674 - acc: 0.7753 - val_loss: 0.4623 - val_acc: 0.7802
Epoch 4/25
1037s - loss: 0.4470 - acc: 0.7875 - val_loss: 0.4550 - val_acc: 0.7840
Epoch 5/25
1041s - loss: 0.4289 - acc: 0.7991 - val_loss: 0.4519 - val_acc: 0.7885
Epoch 6/25
1044s - loss: 0.4140 - acc: 0.8085 - val_loss: 0.4637 - val_acc: 0.7742
Epoch 7/25
862s - loss: 0.3987 - acc: 0.8180 - val_loss: 0.4338 - val_acc: 0.7966
Epoch 8/25
570s - loss: 0.3864 - acc: 0.8247 - val_loss: 0.4244 - val_acc: 0.8003
Epoch 9/25
569s - loss: 0.3758 - acc: 0.8316 - val_loss: 0.4265 - val_acc: 0.7992
Epoch 10/25
569s - loss: 0.3640 - acc: 0.8385 - val_loss: 0.4252 - val_acc: 0.7987
Epoch 11/25
569s - loss: 0.3535 - acc: 0.8440 - val_loss: 0.4378 - val_acc

In [15]:
model.load_weights(MODEL_WEIGHTS_FILE)
loss, accuracy = model.evaluate([Q1_test, Q2_test], y_test, verbose=0)
print('Test loss = {0:.4f}, test accuracy = {1:.4f}'.format(loss, accuracy))

Test loss = 0.4325, test accuracy = 0.8022


In [16]:
model.save('RNN_noPOS_GloveEmbed.h5')