In [1]:
from collections import Counter
import pickle

import pandas as pd
import numpy as np
from tqdm import tqdm

import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils

from keras.layers import *
from keras.layers import TimeDistributed, Lambda
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text

Using TensorFlow backend.


In [2]:
# List the compute devices TensorFlow reports on this machine.
# The import is done inline here rather than in the import cell at the top.
from tensorflow.python.client import device_lib ; device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 15393703127507441911, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 11021298170085022729
 physical_device_desc: "device: XLA_CPU device", name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 3132006109820702862
 physical_device_desc: "device: XLA_GPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 15597548340
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 6994167729943694577
 physical_device_desc: "device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1e.0, compute capability: 7.0"]

In [3]:
# Read the tab-separated query/passage file into a DataFrame.
data = pd.read_csv('../data/data_p.tsv', sep='\t')
# data = pd.read_csv('../data/data_p.tsv', sep='\t', names=['query_id', 'query_text', 'passage_text', 'label', 'passage_id'])

# Coerce the label column to a numeric dtype, then preview up to 50 rows.
data['label'] = pd.to_numeric(data['label'])
data.head(n=50)

Unnamed: 0,query_id,query_text,passage_text,label,passage_id
0,0,. what is a corporation ?,a company is incorporated in a specific nation...,0,0
1,0,. what is a corporation ?,"today , there is a growing community of more ...",0,1
2,0,. what is a corporation ?,"corporation ddefinition , an association of i...",0,2
3,0,. what is a corporation ?,examples of corporation in a sentence . 1 he...,0,3
4,0,. what is a corporation ?,1 : a government - owned corporation ( as ...,0,4


In [4]:
# Pull the labels out as an array and tally how many rows each class has.
y = data.label.values
Counter(y)

Counter({0: 4717692, 1: 524188})

### Downsampling the negative class to reduce class imbalance and avoid out-of-memory (OOM) errors

In [5]:
# Downsample the majority class (label 0) while keeping every label-1 row.
# Both `sample` calls are seeded so the selected subset -- and everything
# derived from it (tokenizer, padded arrays, trained model) -- is the same on
# every run.
# NOTE: this cell overwrites `data`; re-run the loading cell first if you need
# to execute it again, otherwise label 0 is downsampled a second time.
RESAMPLE_SEED = 42
frac_for_0 = 3/9  # fraction of label-0 rows to keep
result1 = data[data.label==1]
result0 = data[data.label==0].sample(frac=frac_for_0, random_state=RESAMPLE_SEED)
# sample(frac=1) returns all rows in shuffled order, interleaving the classes.
data = pd.concat([result1, result0], axis=0).sample(frac=1, random_state=RESAMPLE_SEED)
data.head()

Unnamed: 0,query_id,query_text,passage_text,label,passage_id
1517028,124131,how much vacation time do boeing employees get,online and mobile banking will be unavailable ...,0,7
1213916,750068,cost of laminate flooring for bathrooms,durable laminate flooring is a low - cost op...,0,8
4414644,414673,what tv show is captain john on,also pictured are actors alan alda and loretta...,1,3
187126,38148,define burn candle both ends,burning the candle at both ends - meaning ...,1,6
1171654,734968,what time does ebenefits update,we greatly appreciate the veterans and other e...,0,7


In [6]:
# Recompute the label array from the resampled frame and tally the classes again.
y = data.label.values
Counter(y)

Counter({0: 1572564, 1: 524188})

In [7]:
# Sequence lengths (in tokens) that queries and passages are padded/truncated to.
max_len_q = 12
max_len_p = 20

# Convert each text column to plain Python strings exactly once.
# `Series.astype(str).tolist()` avoids the fixed-width NumPy unicode arrays that
# `.values.astype(str)` creates (every entry padded to the longest text), which
# are far larger and were previously rebuilt for every use.
query_texts = data.query_text.astype(str).tolist()
passage_texts = data.passage_text.astype(str).tolist()

# num_words=None keeps the whole vocabulary. The previous huge literal cap
# could never be reached, so the resulting sequences are unchanged.
tk = text.Tokenizer(num_words=None)
# Fit on queries and passages together so both share one word index.
tk.fit_on_texts(query_texts + passage_texts)

x1 = sequence.pad_sequences(tk.texts_to_sequences(query_texts), maxlen=max_len_q)
x2 = sequence.pad_sequences(tk.texts_to_sequences(passage_texts), maxlen=max_len_p)
x1.shape

(2096752, 12)

In [8]:
# Serialize the fitted tokenizer to ../data/tokenizer.pkl.
with open('../data/tokenizer.pkl', 'wb') as f:
    pickle.dump(tk, f)

In [9]:
# Inspect the first padded query sequence.
x1[0]

array([    0,     0,     0,     0,    29,    85,  3351,    55,    43,
       10979,   891,    93], dtype=int32)

In [10]:
# Inspect the first padded passage sequence.
x2[0]

array([  203,  1658,   275,   500,    39,   203,   399,  2009,  1568,
           8,  4288,  2095,    86, 26544,     8,   102, 20360,    28,
          49,   128], dtype=int32)

In [11]:
# Word -> integer index mapping learned by the tokenizer.
word_index = tk.word_index

# ytrain_enc = np_utils.to_categorical(y)

# Embedding dimensionality; it is also interpolated into the vector file name below.
emb_size = 300 #100
file_path = '../glove.840B/glove.840B.%sd.txt'%emb_size
# file_path = '../glove.6B/glove.6B.%sd.txt'%emb_size

def load_embed(file):
    """Load a text file of word vectors into a ``{word: vector}`` dict.

    Each line is split on single spaces; the first field is the word and the
    remaining fields are its vector components.

    Args:
        file: Path of the embedding file. A file whose extension is ``vec`` is
            read as UTF-8 and lines of 100 characters or fewer are skipped
            (presumably to drop a short header line -- confirm). Any other
            file is read with the ``latin`` codec and no line is skipped.

    Returns:
        dict mapping each word to a 1-D float32 ``np.ndarray``. When a word
        appears on several lines, the last one wins.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    is_vec = file.split('.')[-1] == 'vec'
    # NOTE(review): non-.vec files are decoded as Latin-1, so non-ASCII words
    # in a UTF-8 encoded file will not match tokenizer words -- confirm intended.
    encoding = 'utf-8' if is_vec else 'latin'

    # The context manager closes the file once the dict has been built; the
    # bare open() used before left the handle open.
    with open(file, encoding=encoding) as handle:
        embeddings_index = dict(
            get_coefs(*line.split(" "))
            for line in tqdm(handle)
            if not is_vec or len(line) > 100
        )

    return embeddings_index

# Parse the vector file selected above into a word -> vector dict.
embeddings_index = load_embed(file_path)

2196017it [02:44, 13318.67it/s]


In [12]:
print('Found %s word vectors.' % len(embeddings_index))

# Row i holds the pretrained vector of the word whose tokenizer index is i.
# Rows stay all-zero for words without a pretrained vector; the extra row (+1)
# presumably accounts for index 0, which the padded sequences use as filler.
# float32 is used because Keras stores layer weights as float32 by default, so
# the default float64 would only double the memory of this large matrix.
embedding_matrix = np.zeros((len(word_index) + 1, emb_size), dtype='float32')
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    # Skip unknown words and any malformed vector of the wrong length, which
    # would otherwise raise or be silently broadcast across the row.
    if embedding_vector is not None and len(embedding_vector) == emb_size:
        embedding_matrix[i] = embedding_vector

  8%|▊         | 54250/677173 [00:00<00:02, 260647.62it/s]

Found 2196016 word vectors.


100%|██████████| 677173/677173 [00:01<00:00, 457173.20it/s]


In [13]:
# Hyperparameters; `filter_length` and `nb_filter` are the kernel size and
# filter count of the Conv1D layers built in the next cell.
# NOTE(review): `max_features` and `pool_length` are not referenced by any
# visible cell -- confirm whether they are still needed.
max_features = 200000
filter_length = 5
nb_filter = 64
pool_length = 4

# NOTE(review): this empty Sequential model is not used by the visible cells below.
model = Sequential()
print('Build model...')

Build model...


In [14]:
# How the forward and backward LSTM outputs are combined in the BiLSTM branches.
blstm_merge_mode = 'ave' #'concat'


def _pretrained_embedding(seq_len):
    """Return a frozen Embedding layer initialised from `embedding_matrix`.

    Args:
        seq_len: Length of the padded input sequences.
    """
    return Embedding(len(word_index) + 1,
                     emb_size,
                     weights=[embedding_matrix],
                     input_length=seq_len,
                     trainable=False)


def _summed_dense_encoder(seq_len):
    """Frozen embedding -> per-timestep Dense(relu) -> sum over the time axis.

    Args:
        seq_len: Length of the padded input sequences.

    Returns:
        A Sequential model whose output has `emb_size` features.
    """
    encoder = Sequential()
    encoder.add(_pretrained_embedding(seq_len))
    encoder.add(TimeDistributed(Dense(emb_size, activation='relu')))
    encoder.add(Lambda(lambda x: K.sum(x, axis=1), output_shape=(emb_size,)))
    return encoder


def _cnn_encoder(seq_len):
    """Frozen embedding -> two Conv1D layers -> global max pool -> Dense.

    Args:
        seq_len: Length of the padded input sequences.

    Returns:
        A Sequential model whose output has `emb_size` features.
    """
    encoder = Sequential()
    encoder.add(_pretrained_embedding(seq_len))
    encoder.add(Conv1D(filters=nb_filter,
                       kernel_size=filter_length,
                       padding='valid',
                       activation='relu',
                       strides=1))
    encoder.add(Dropout(0.2))

    encoder.add(Conv1D(filters=nb_filter,
                       kernel_size=filter_length,
                       padding='valid',
                       activation='relu',
                       strides=1))

    encoder.add(GlobalMaxPooling1D())
    encoder.add(Dropout(0.2))

    encoder.add(Dense(emb_size))
    encoder.add(Dropout(0.2))
    encoder.add(BatchNormalization())
    return encoder


def _bilstm_encoder(seq_len):
    """Trainable (randomly initialised) embedding -> dropout -> bidirectional LSTM.

    Keras 2 discards the Keras 1 `dropout` argument of `Embedding` with a
    warning, so the embedding dropout is applied explicitly through
    `SpatialDropout1D`. The LSTM uses the Keras 2 argument names `dropout` /
    `recurrent_dropout` in place of the deprecated `dropout_W` / `dropout_U`.

    NOTE(review): unlike the other branches this embedding does not start from
    `embedding_matrix` and is trainable -- confirm that is intended.

    Args:
        seq_len: Length of the padded input sequences.

    Returns:
        A Sequential model; with merge_mode 'ave' the output has `emb_size`
        features.
    """
    encoder = Sequential()
    encoder.add(Embedding(len(word_index) + 1, emb_size, input_length=seq_len))
    encoder.add(SpatialDropout1D(0.2))
    encoder.add(Bidirectional(LSTM(emb_size, dropout=0.2, recurrent_dropout=0.2),
                              merge_mode=blstm_merge_mode))
    return encoder


# Odd-numbered models take query-length input, even-numbered ones passage-length
# input. They are built in the same order as before so that auto-generated
# layer names keep their numbering.
model1 = _summed_dense_encoder(max_len_q)
model2 = _summed_dense_encoder(max_len_p)

model3 = _cnn_encoder(max_len_q)
model4 = _cnn_encoder(max_len_p)

model5 = _bilstm_encoder(max_len_q)
model6 = _bilstm_encoder(max_len_p)



In [15]:
# submodel_inputs = [model.input for model in [model1, model2, model3, model4, model5, model6]]
# submodel_outputs = [model.output for model in [model1, model2, model3, model4, model5, model6]]
# # https://stackoverflow.com/questions/45979848/merge-2-sequential-models-in-keras
# mergedout = Concatenate(axis=1)(submodel_outputs)

# mergedout = BatchNormalization()(mergedout)

# mergedout = Dense(emb_size)(mergedout)
# mergedout = PReLU()(mergedout)
# mergedout = Dropout(0.2)(mergedout)
# mergedout = BatchNormalization()(mergedout)

# mergedout = Dense(emb_size)(mergedout)
# mergedout = PReLU()(mergedout)
# mergedout = Dropout(0.2)(mergedout)
# mergedout = BatchNormalization()(mergedout)

# mergedout = Dense(1)(mergedout)
# mergedout = Activation('sigmoid')(mergedout)

# checkpoint = ModelCheckpoint('../data/siamese.{epoch:02d}-{val_acc:.2f}.hdf5', monitor='val_acc', save_best_only=False, verbose=2)
# newModel = Model(submodel_inputs, mergedout)
# newModel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
# The six branch encoders, in the order their inputs are later fed to `fit`.
branches = [model1, model2, model3, model4, model5, model6]
submodel_inputs = [branch.input for branch in branches]
submodel_outputs = [branch.output for branch in branches]
# Show each branch's output tensor so the shapes can be checked by eye.
for branch_output in submodel_outputs:
    print(branch_output)

# https://stackoverflow.com/questions/45979848/merge-2-sequential-models-in-keras
# mergedout = Concatenate(axis=1)(submodel_outputs)
# Average the three query-length branches and the three passage-length
# branches separately, then combine the two averages element-wise.
query_branch_outputs = [model1.output, model3.output, model5.output]
passage_branch_outputs = [model2.output, model4.output, model6.output]
mergedout1 = Average()(query_branch_outputs)
mergedout2 = Average()(passage_branch_outputs)
mergedout = Multiply()([mergedout1, mergedout2])

for merged_tensor in (mergedout1, mergedout2, mergedout):
    print(merged_tensor.shape)

# Classification head: BatchNorm -> Dense/PReLU/Dropout/BatchNorm -> sigmoid unit.
mergedout = BatchNormalization()(mergedout)

mergedout = Dense(emb_size)(mergedout)
mergedout = PReLU()(mergedout)
mergedout = Dropout(0.2)(mergedout)
mergedout = BatchNormalization()(mergedout)

mergedout = Dense(1)(mergedout)
mergedout = Activation('sigmoid')(mergedout)

# Write a checkpoint after every epoch (save_best_only=False), with the epoch
# number and validation accuracy in the file name.
checkpoint = ModelCheckpoint(
    '../data/siamese.{epoch:02d}-{val_acc:.2f}.hdf5',
    monitor='val_acc',
    save_best_only=False,
    verbose=2,
)
newModel = Model(submodel_inputs, mergedout)
newModel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Tensor("lambda_1/Sum:0", shape=(?, 300), dtype=float32)
Tensor("lambda_2/Sum:0", shape=(?, 300), dtype=float32)
Tensor("batch_normalization_1/cond/Merge:0", shape=(?, 300), dtype=float32)
Tensor("batch_normalization_2/cond/Merge:0", shape=(?, 300), dtype=float32)
Tensor("bidirectional_1/truediv:0", shape=(?, 300), dtype=float32)
Tensor("bidirectional_2/truediv:0", shape=(?, 300), dtype=float32)
(?, 300)
(?, 300)
(?, 300)


In [17]:
# Show host memory usage via the shell before starting training.
!free -m

              total        used        free      shared  buff/cache   available
Mem:          61406       12259       25054        2062       24092       46506
Swap:             0           0           0


In [18]:
# Per-class loss weights: class 1 is weighted by the inverse of the fraction of
# class-0 rows kept during downsampling; class 0 keeps weight 1.
# NOTE(review): this equals the actual class-0/class-1 count ratio only for the
# current data and fraction -- confirm if either changes.
class_weights = {0: 1., 1: float(1/frac_for_0)}
class_weights

{0: 1.0, 1: 3.0}

In [None]:
# Train the merged model. x1 (queries) and x2 (passages) are repeated three
# times because the model has six inputs, ordered as in `submodel_inputs`:
# query, passage, query, passage, query, passage.
# 10% of the samples are held out for validation and a checkpoint is written
# through the `checkpoint` callback.
newModel.fit(x=[x1, x2, x1, x2, x1, x2], y=y, batch_size=512, epochs=5, 
             verbose=1, validation_split=0.1, shuffle=True, 
             class_weight=class_weights, callbacks=[checkpoint])

In [None]:
# tf.enable_eager_execution()

# a = tf.convert_to_tensor(
#     [[1,2], [3,4]],
#     dtype=None,
#     name=None,
#     preferred_dtype=None
# )

# b = tf.convert_to_tensor(
#     [[1,2], [3,4]],
#     dtype=None,
#     name=None,
#     preferred_dtype=None
# )

# Dot(axes=1)([a,b])
# Multiply()([a,b])