# Quora question pairs: training and prediction

## Import packages

In [5]:
%matplotlib inline
from __future__ import print_function

import csv
import datetime, time, json
import os

import numpy as np
import pandas as pd
import keras
from keras import backend as K
from keras.callbacks import Callback, ModelCheckpoint
from keras.layers import Embedding, Dense, Dropout, Reshape, Merge, BatchNormalization, TimeDistributed, Lambda
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l2
from sklearn.model_selection import train_test_split

## Initialize global variables

In [6]:
# --- Input files read by this notebook ---
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'

# --- Output file: weights checkpointed during training ---
MODEL_WEIGHTS_FILE = 'question_pairs_weights.h5'

# --- Model dimensions ---
MAX_SEQUENCE_LENGTH = 25   # input_length of each question's Embedding layer
EMBEDDING_DIM = 300        # width of the embedding and per-token Dense layers

# --- Data split and training schedule ---
TEST_SPLIT = 0.1           # fraction passed to train_test_split as test_size
VALIDATION_SPLIT = 0.1     # fraction passed to model.fit as validation_split
RNG_SEED = 13371447        # random_state for the train/test split
NB_EPOCHS = 17


## Load the dataset, embedding matrix and word count

In [7]:
# Load the training arrays straight from their paths: np.load opens and
# closes the file itself, whereas np.load(open(...)) left every handle open.
q1_data = np.load(Q1_TRAINING_DATA_FILE)
q2_data = np.load(Q2_TRAINING_DATA_FILE)
labels = np.load(LABEL_TRAINING_DATA_FILE)
word_embedding_matrix = np.load(WORD_EMBEDDING_MATRIX_FILE)

# The JSON file stores the vocabulary size under the 'nb_words' key.
with open(NB_WORDS_DATA_FILE, 'r') as f:
    nb_words = json.load(f)['nb_words']

## Partition the dataset into train and test sets

In [4]:
# Stack the two question arrays along a new axis 1 so that a single split
# keeps both questions of a pair (and their label) in the same partition.
X = np.stack([q1_data, q2_data], axis=1)

print(q1_data.shape, q2_data.shape)
y = labels
print(y.shape)

# Hold out TEST_SPLIT of the pairs; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)

# Un-stack into the separate per-question inputs fed to the two model branches.
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_test, Q2_test = X_test[:, 0], X_test[:, 1]

(404290, 25) (404290, 25)
(404290,)


## Define the model

In [8]:
def _build_question_encoder(vocab_size, embedding_matrix, embedding_dim, sequence_length):
    """Build the encoder branch applied to one question of a pair.

    The branch is: frozen Embedding initialised from `embedding_matrix`,
    a relu Dense layer applied at every position (TimeDistributed), then a
    max over axis 1, yielding one `embedding_dim`-wide vector per question.

    vocab_size: number of words; the Embedding gets `vocab_size + 1` rows.
    embedding_matrix: initial (non-trainable) Embedding weights.
    embedding_dim: width of the embedding and of the per-position Dense layer.
    sequence_length: input_length of the Embedding layer.
    Returns the Sequential encoder.
    """
    encoder = Sequential()
    encoder.add(Embedding(vocab_size + 1,
                          embedding_dim,
                          weights=[embedding_matrix],
                          input_length=sequence_length,
                          trainable=False))
    encoder.add(TimeDistributed(Dense(embedding_dim, activation='relu')))
    encoder.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(embedding_dim, )))
    return encoder


# Two structurally identical branches with separate weights, built in the
# same order as before so auto-generated layer names stay unchanged.
Q1 = _build_question_encoder(nb_words, word_embedding_matrix, EMBEDDING_DIM, MAX_SEQUENCE_LENGTH)
Q2 = _build_question_encoder(nb_words, word_embedding_matrix, EMBEDDING_DIM, MAX_SEQUENCE_LENGTH)

model = Sequential()
# NOTE(review): Merge is called without `mode`, so it uses the library
# default (believed to be 'sum' in Keras 1.x, not 'concat') -- confirm that
# this is intended. It is left untouched so existing weight files still load.
model.add(Merge([Q1, Q2]))
model.add(BatchNormalization())
# Four identical hidden blocks: Dense(200, relu) followed by BatchNormalization.
for hidden_block in range(4):
    model.add(Dense(200, activation='relu'))
    model.add(BatchNormalization())
# Single sigmoid unit paired with the binary cross-entropy loss below.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', 'precision', 'recall', 'fbeta_score'])

## Train the model, checkpointing weights with best validation accuracy

In [6]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()

# Write MODEL_WEIGHTS_FILE only on epochs where the monitored 'val_acc' is the best so far.
checkpoint = ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True)
callbacks = [checkpoint]
fit_options = dict(nb_epoch=NB_EPOCHS,
                   validation_split=VALIDATION_SPLIT,
                   verbose=2,
                   callbacks=callbacks)
history = model.fit([Q1_train, Q2_train], y_train, **fit_options)

t1 = time.time()
print("Training ended at", datetime.datetime.now())
elapsed_minutes = (t1 - t0) / 60.
print("Minutes elapsed: %f" % elapsed_minutes)

Starting training at 2017-04-26 12:41:15.901224
Train on 327474 samples, validate on 36387 samples
Epoch 1/17
638s - loss: 0.6218 - acc: 0.6780 - precision: 0.6545 - recall: 0.2920 - fbeta_score: 0.3878 - val_loss: 0.6183 - val_acc: 0.6873 - val_precision: 0.6785 - val_recall: 0.2837 - val_fbeta_score: 0.3854
Epoch 2/17
634s - loss: 0.6162 - acc: 0.6830 - precision: 0.6722 - recall: 0.2939 - fbeta_score: 0.3960 - val_loss: 0.6123 - val_acc: 0.6838 - val_precision: 0.7341 - val_recall: 0.2146 - val_fbeta_score: 0.3189
Epoch 3/17
632s - loss: 0.6112 - acc: 0.6893 - precision: 0.6888 - recall: 0.3010 - fbeta_score: 0.4070 - val_loss: 0.6076 - val_acc: 0.6929 - val_precision: 0.6624 - val_recall: 0.3378 - val_fbeta_score: 0.4336
Epoch 4/17
633s - loss: 0.6060 - acc: 0.6957 - precision: 0.7129 - recall: 0.3064 - fbeta_score: 0.4167 - val_loss: 0.6122 - val_acc: 0.6970 - val_precision: 0.7442 - val_recall: 0.2695 - val_fbeta_score: 0.3807
Epoch 5/17
628s - loss: 0.6006 - acc: 0.7013 - precis

KeyboardInterrupt: 

## Plot training and validation accuracy

In [None]:
# One row per completed epoch (1-based) with training and validation accuracy.
acc = pd.DataFrame({'epoch': [epoch + 1 for epoch in history.epoch],
                    'training': history.history['acc'],
                    'validation': history.history['val_acc']})
# figsize must be an ordered (width, height) tuple. The previous value was the
# set literal {5, 8}, which has no defined order (CPython happens to iterate it
# as 8, 5, so (8, 5) keeps the size that was actually rendered). The deprecated
# no-op `.ix[:, :]` indexer is dropped.
ax = acc.plot(x='epoch', figsize=(8, 5), grid=True)
ax.set_ylabel("accuracy")
ax.set_ylim([0.0, 1.0]);

## Print best validation accuracy and epoch

In [None]:
val_acc_by_epoch = history.history['val_acc']
# Compare (accuracy, index) tuples so that ties resolve to the latest epoch.
max_val_acc, idx = max(zip(val_acc_by_epoch, range(len(val_acc_by_epoch))))
print('Maximum accuracy at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(max_val_acc))

## Evaluate the model with best validation accuracy on the test partition

In [None]:
# Restore the checkpointed weights before scoring the held-out partition.
model.load_weights(MODEL_WEIGHTS_FILE)
test_scores = model.evaluate([Q1_test, Q2_test], y_test)
# evaluate() returns the loss followed by the metrics in the order given to compile().
loss, accuracy, precision, recall, fbeta_score = test_scores
print('')
for template, value in (('loss      = {0:.4f}', loss),
                        ('accuracy  = {0:.4f}', accuracy),
                        ('precision = {0:.4f}', precision),
                        ('recall    = {0:.4f}', recall),
                        ('F         = {0:.4f}', fbeta_score)):
    print(template.format(value))

# Prediction

## Load the test data

In [9]:
# --- Files used by the prediction section ---
QUESTION_PAIRS_FILE = 'test.csv'
GLOVE_FILE = 'glove.840B.300d.txt'
Q1_TESTING_DATA_FILE = 'q1_test.npy'
Q2_TESTING_DATA_FILE = 'q2_test.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'
MODEL_WEIGHTS_FILE = 'question_pairs_weights.h5'

# --- Tokenisation / embedding dimensions ---
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25
EMBEDDING_DIM = 300

# Restore the checkpointed weights into the model defined earlier in the notebook.
model.load_weights(MODEL_WEIGHTS_FILE)


In [10]:
# Collect the two question columns of the test CSV as parallel lists.
question1 = []
question2 = []
# newline='' is what the csv module requires of files it reads, so that line
# breaks embedded inside quoted fields are parsed correctly.
with open(QUESTION_PAIRS_FILE, encoding='utf-8', newline='') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=',')
    for row in reader:
        question1.append(row['question1'])
        question2.append(row['question2'])
print('Question pairs: %d' % len(question1))

Question pairs: 2345796


In [11]:
# NOTE(review): this fits a brand-new Tokenizer on the *test* questions, so the
# integer assigned to each word comes from test-set frequencies. The model's
# Embedding layers were initialised from WORD_EMBEDDING_MATRIX_FILE, which was
# presumably built from a different (training) word index. If so, these ids
# select the wrong embedding rows and may exceed the Embedding's input size.
# Confirm, and reuse the training tokenizer / word index here instead.
questions = question1 + question2
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

Words in index: 101312


In [12]:
print("Processing", GLOVE_FILE)

# Map each token to its vector: every line is the token followed by its
# coefficients, all separated by single spaces.
embeddings_index = {}
with open(GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        word, *coefficients = line.split(' ')
        embeddings_index[word] = np.asarray(coefficients, dtype='float32')

print('Word embeddings: %d' % len(embeddings_index))

Processing glove.840B.300d.txt
Word embeddings: 2196016


In [13]:
# NOTE(review): this rebinds `nb_words` and `word_embedding_matrix`, which
# earlier in the notebook hold the values loaded for training and used to
# build the model. Nothing visible below consumes the new matrix -- confirm
# whether this cell is still needed.
nb_words = min(MAX_NB_WORDS, len(word_index))
# Row i holds the vector of the word with index i; rows without a GloVe
# vector (and row 0, which no word index maps to) stay all-zero.
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

# Count rows that are entirely zero. Testing `sum(row) == 0` would also count
# a non-zero vector whose components happen to cancel out.
null_rows = ~word_embedding_matrix.any(axis=1)
print('Null word embeddings: %d' % np.count_nonzero(null_rows))

Null word embeddings: 31446


In [14]:
# Pad/truncate every tokenised question to MAX_SEQUENCE_LENGTH ids.
# NOTE: this rebinds q1_data / q2_data, which held the training sequences
# earlier in the notebook. The test set has no labels, so none are built here.
q1_data = pad_sequences(question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)

Shape of question1 data tensor: (2345796, 25)
Shape of question2 data tensor: (2345796, 25)


In [14]:
# Persist the padded test sequences. Passing the path lets np.save open and
# close the file itself; np.save(open(...)) never closed the handle explicitly.
# Only the two test arrays are written: the label, embedding-matrix and
# nb_words files are not rewritten by this cell.
np.save(Q1_TESTING_DATA_FILE, q1_data)
np.save(Q2_TESTING_DATA_FILE, q2_data)

In [15]:
# Reload the padded test sequences from disk (np.load on a path closes the
# file itself; np.load(open(...)) left the handle open).
q1_data = np.load(Q1_TESTING_DATA_FILE)
q2_data = np.load(Q2_TESTING_DATA_FILE)
# NOTE: X, Q1_test and Q2_test are rebound here; earlier in the notebook they
# refer to the held-out slice of the training data, so the evaluation cell
# must not be re-run after this point without re-running the partition cell.
X = np.stack((q1_data, q2_data), axis=1)
Q1_test = X[:, 0]
Q2_test = X[:, 1]

In [16]:
# Run the model on the two padded test-question arrays.
pred = model.predict([Q1_test, Q2_test])

In [36]:
name = "competition1.csv"
PREDICTION_FILE = 'prediction.npy'
# `pred` normally comes from model.predict above. Cached predictions are
# reused when the file exists, but a missing cache (it is only written by a
# later cell) no longer aborts a fresh top-to-bottom run.
if os.path.exists(PREDICTION_FILE):
    pred = np.load(PREDICTION_FILE)
print('pred shape: ', pred.shape)


def write(pred, name):
    """Write predictions to `name` as a two-column CSV file.

    The file starts with the header ``test_id,is_duplicate``; each following
    line holds a row's zero-based position in `pred` and the string form of
    that row's first element.

    pred: sequence of indexable rows (e.g. an array with one column).
    name: path of the file to create; an existing file is overwritten.
    """
    # `with` guarantees the file is closed even if a row cannot be formatted.
    with open(name, 'w') as f:
        f.write('test_id,is_duplicate\n')
        f.writelines('%d,%s\n' % (test_id, row[0])
                     for test_id, row in enumerate(pred))
# Write the predictions to the CSV file named by `name`.
write(pred, name)

pred shape:  (2345796, 1)


In [19]:
# Sanity check: show the shape of the prediction array.
print(pred.shape)

(2345796, 1)


In [22]:
# Cache the predictions on disk; np.save appends '.npy', so this writes
# 'prediction.npy', the file name that the submission cell reads.
np.save('prediction', pred)