In [1]:
from collections import Counter
import pickle

import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils

from keras.layers import *
from keras.layers import TimeDistributed, Lambda
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text

Using TensorFlow backend.


In [2]:
# Enumerate the compute devices TensorFlow can see; the returned list is the
# cell's displayed value.
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 11163153185555723425, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 2446397763255536660
 physical_device_desc: "device: XLA_CPU device", name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 12380700044005288048
 physical_device_desc: "device: XLA_GPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 15597548340
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 3177498499026159870
 physical_device_desc: "device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1e.0, compute capability: 7.0"]

In [3]:
# Read the tab-separated dataset. `header=None` means the first line of the
# file is treated as data, so the column names are supplied explicitly.
data = pd.read_csv(
    '../data/data.tsv',
    sep='\t',
    header=None,
    names=['query_id', 'query_text', 'passage_text', 'label', 'passage_id'],
)
data.head()

Unnamed: 0,query_id,query_text,passage_text,label,passage_id
0,0,. what is a corporation?,A company is incorporated in a specific nation...,0,0
1,0,. what is a corporation?,"Today, there is a growing community of more th...",0,1
2,0,. what is a corporation?,"Corporation definition, an association of indi...",0,2
3,0,. what is a corporation?,Examples of corporation in a Sentence. 1 He w...,0,3
4,0,. what is a corporation?,1: a government-owned corporation (as a utilit...,0,4


In [4]:
# Extract the label column as a NumPy array, then tally rows per class.
y = data['label'].values
Counter(y)

Counter({0: 4717692, 1: 524188})

In [None]:
# Fixed sequence lengths (in tokens) for queries and passages.
max_len_q = 12
max_len_p = 20

# A single tokenizer is fitted on queries and passages together so both
# inputs share one vocabulary.
tk = text.Tokenizer(num_words=200000)
tk.fit_on_texts(
    list(data['query_text'].values.astype(str))
    + list(data['passage_text'].values.astype(str))
)

# Queries -> integer id sequences, brought to length max_len_q.
x1 = sequence.pad_sequences(
    tk.texts_to_sequences(data['query_text'].values.astype(str)),
    maxlen=max_len_q,
)

# Passages -> integer id sequences, brought to length max_len_p.
x2 = sequence.pad_sequences(
    tk.texts_to_sequences(data['passage_text'].values.astype(str)),
    maxlen=max_len_p,
)
x1.shape

In [None]:
# Serialise the fitted tokenizer to disk.
with open('../data/tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tk, tokenizer_file)

In [None]:
# Inspect the first row of x1 (the tokenised, padded form of the first query).
x1[0]

In [None]:
# Inspect the first row of x2 (the tokenised, padded form of the first passage).
x2[0]

In [None]:
# Word -> integer index mapping learned by the tokenizer.
word_index = tk.word_index

# One-hot encoding of the labels.
# NOTE(review): no other cell visible in this notebook reads `ytrain_enc`
# (training is fed `y` directly); kept in case something outside this view
# depends on it.
ytrain_enc = np_utils.to_categorical(y)

# Pre-trained GloVe vectors, keyed by token. `emb_size` selects which GloVe
# file is read and is reused as the embedding width by the model cells.
embeddings_index = {}
emb_size = 100

# Each line is parsed as: first whitespace-separated field = token, remaining
# fields = float32 vector components.
# The context manager guarantees the file is closed even if parsing a line
# raises part-way through (the previous open()/close() pair leaked the handle
# in that case).
with open('../glove.6B/glove.6B.%sd.txt' % emb_size, encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
print('Found %s word vectors.' % len(embeddings_index))

# Row i receives the GloVe vector of the word whose tokenizer index is i.
# Words without a GloVe entry keep their all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, emb_size))
for word, i in tqdm(word_index.items()):
    glove_vector = embeddings_index.get(word)
    if glove_vector is None:
        continue
    embedding_matrix[i] = glove_vector

In [None]:
# Hyper-parameters for the model cells.
# nb_filter / filter_length are passed to the 1-D convolution layers;
# max_features and pool_length are not referenced by any other cell visible
# in this notebook.
max_features = 200000
nb_filter = 64
filter_length = 5
pool_length = 4

print('Build model...')
model = Sequential()

In [None]:
def _glove_embedding(input_length):
    """Return a frozen Embedding layer initialised from `embedding_matrix`.

    Args:
        input_length: number of token ids in each input sequence.

    Returns:
        An Embedding layer with `len(word_index) + 1` rows of width
        `emb_size`, non-trainable.
    """
    return Embedding(len(word_index) + 1,
                     emb_size,
                     weights=[embedding_matrix],
                     input_length=input_length,
                     trainable=False)


def _sum_tower(input_length):
    """Build the 'sum of projected embeddings' encoder.

    Frozen GloVe embedding -> per-timestep Dense(relu) -> sum over the
    time axis, giving one `emb_size` vector per sequence.

    Args:
        input_length: number of token ids in each input sequence.

    Returns:
        A Sequential model.
    """
    tower = Sequential()
    tower.add(_glove_embedding(input_length))
    tower.add(TimeDistributed(Dense(emb_size, activation='relu')))
    # Collapse the time axis (axis 1) by summation.
    tower.add(Lambda(lambda x: K.sum(x, axis=1), output_shape=(emb_size,)))
    return tower


def _conv_tower(input_length):
    """Build the convolutional encoder.

    Frozen GloVe embedding -> two 1-D convolutions (with dropout between
    them) -> global max pooling -> Dense(emb_size) -> BatchNormalization.

    Args:
        input_length: number of token ids in each input sequence.

    Returns:
        A Sequential model.
    """
    tower = Sequential()
    tower.add(_glove_embedding(input_length))
    # Keras 2 argument names (filters / kernel_size / padding / strides)
    # replace the deprecated Keras 1 spellings used previously.
    tower.add(Convolution1D(filters=nb_filter,
                            kernel_size=filter_length,
                            padding='valid',
                            activation='relu',
                            strides=1))
    tower.add(Dropout(0.2))

    tower.add(Convolution1D(filters=nb_filter,
                            kernel_size=filter_length,
                            padding='valid',
                            activation='relu',
                            strides=1))

    tower.add(GlobalMaxPooling1D())
    tower.add(Dropout(0.2))

    tower.add(Dense(emb_size))
    tower.add(Dropout(0.2))
    tower.add(BatchNormalization())
    return tower


def _lstm_tower(input_length):
    """Build the recurrent encoder.

    Trainable, randomly initialised embedding -> dropout on the embedded
    sequence -> LSTM with input and recurrent dropout.

    Args:
        input_length: number of token ids in each input sequence.

    Returns:
        A Sequential model.
    """
    tower = Sequential()
    tower.add(Embedding(len(word_index) + 1, emb_size, input_length=input_length))
    # Keras 2 discards the legacy `dropout=` argument of Embedding (it only
    # emits a warning), so the intended 0.2 dropout was silently not applied.
    # SpatialDropout1D right after the embedding is the replacement that the
    # Keras warning recommends.
    tower.add(SpatialDropout1D(0.2))
    # `dropout` / `recurrent_dropout` are the Keras 2 names for the former
    # `dropout_W` / `dropout_U`.
    tower.add(LSTM(emb_size, dropout=0.2, recurrent_dropout=0.2))
    return tower


# One encoder of each kind for the query (length max_len_q) and one for the
# passage (length max_len_p). Odd-numbered models take queries, even-numbered
# models take passages.
model1 = _sum_tower(max_len_q)
model2 = _sum_tower(max_len_p)

model3 = _conv_tower(max_len_q)
model4 = _conv_tower(max_len_p)

model5 = _lstm_tower(max_len_q)
model6 = _lstm_tower(max_len_p)

In [None]:
# Gather the input and output tensors of the six encoders in a fixed order;
# the training call must supply its arrays in this same order.
submodels = [model1, model2, model3, model4, model5, model6]
submodel_inputs = [m.input for m in submodels]
submodel_outputs = [m.output for m in submodels]

# https://stackoverflow.com/questions/45979848/merge-2-sequential-models-in-keras
# Join the six encoder outputs along the feature axis and normalise.
mergedout = Concatenate(axis=1)(submodel_outputs)
mergedout = BatchNormalization()(mergedout)

# Four identical Dense -> PReLU -> Dropout -> BatchNormalization stages.
for _stage in range(4):
    mergedout = Dense(emb_size)(mergedout)
    mergedout = PReLU()(mergedout)
    mergedout = Dropout(0.2)(mergedout)
    mergedout = BatchNormalization()(mergedout)

# Single sigmoid unit for the binary label.
mergedout = Dense(1)(mergedout)
mergedout = Activation('sigmoid')(mergedout)

# Keep only the weights of the epoch with the best monitored value.
checkpoint = ModelCheckpoint(
    '../data/siamese-model9to9.h5',
    monitor='val_acc',
    save_best_only=True,
    verbose=2,
)

In [None]:
# Assemble the full network: six inputs (one per encoder) -> merged head.
newModel = Model(submodel_inputs, mergedout)

# The metric is requested as 'acc' (not 'accuracy') on purpose: Keras >= 2.3
# logs a metric under exactly the name given here, while older Keras 2
# releases always log it as 'acc'. Using 'acc' therefore yields a `val_acc`
# log entry on every Keras 2.x version, which is what the checkpoint callback
# monitors. With 'accuracy' the log key becomes `val_accuracy` on newer
# versions and the checkpoint would never be written.
# NOTE(review): about 90% of the labels are 0 (see the Counter output above),
# so accuracy is a weak model-selection signal here - consider monitoring
# val_loss or an AUC metric instead.
newModel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

# Inputs are listed in encoder order: query, passage, query, passage, ...
# validation_split holds out the trailing 10% of the rows (taken before
# shuffling) for validation.
newModel.fit(
    x=[x1, x2, x1, x2, x1, x2],
    y=y,
    batch_size=1024,
    epochs=5,
    verbose=1,
    validation_split=0.1,
    shuffle=True,
    callbacks=[checkpoint],
)