In [1]:
# https://www.kaggle.com/lystdo/lb-0-18-lstm-with-glove-and-magic-features/code

In [5]:
'''
Example of an LSTM model with GloVe embeddings along with magic features

Tested under Keras 2.0 with Tensorflow 1.0 backend

Single model may achieve LB scores at around 0.18+, average ensembles can get 0.17+
'''

########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from string import punctuation
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import StandardScaler

import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

import logging

FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(filename='LSTM implementation by lystdo v3.log', format=FORMAT, level=logging.DEBUG)
logger = logging.getLogger('kaggle-quora')

In [6]:
logger.info('Initial log message')
print('Initial print message')

Initial print message


In [7]:
def log_print(x):
    logger.info(x)
    print(x)

In [99]:
%%time
########################################
## set directories and parameters
########################################
BASE_DIR = '../input/'
EMBEDDING_FILE = BASE_DIR + 'glove.840B.300d.txt'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 60
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

fasttest_vec = True
num_lstm = 64  # np.random.randint(175, 275)
num_dense = 300  # np.random.randint(100, 150)
rate_drop_lstm = 0.1  # 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.1  # 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 12.2 µs


In [9]:
%%time
########################################
## index word vectors
########################################
log_print('Indexing word vectors')

embeddings_index = {}
f = open(EMBEDDING_FILE)
count = 0
for ix, line in enumerate(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    if ix % 100000 == 0:
        log_print('Embedding found: {}'.format(ix + 1))
f.close()

log_print('Found %d word vectors of glove.' % len(embeddings_index))

Indexing word vectors
Embedding found: 1
Embedding found: 100001
Embedding found: 200001
Embedding found: 300001
Embedding found: 400001
Embedding found: 500001
Embedding found: 600001
Embedding found: 700001
Embedding found: 800001
Embedding found: 900001
Embedding found: 1000001
Embedding found: 1100001
Embedding found: 1200001
Embedding found: 1300001
Embedding found: 1400001
Embedding found: 1500001
Embedding found: 1600001
Embedding found: 1700001
Embedding found: 1800001
Embedding found: 1900001
Embedding found: 2000001
Embedding found: 2100001
Found 2196016 word vectors of glove.
CPU times: user 1min 11s, sys: 1.37 s, total: 1min 12s
Wall time: 1min 12s


In [10]:
%%time
########################################
## process texts in datasets
########################################
log_print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

texts_1 = [] 
texts_2 = []
labels = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts_1.append(text_to_wordlist(values[3]))
        texts_2.append(text_to_wordlist(values[4]))
        labels.append(int(values[5]))
log_print('Found %s texts in train.csv' % len(texts_1))

test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
log_print('Found %s texts in test.csv' % len(test_texts_1))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
log_print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
log_print('Shape of data tensor: {}'.format(data_1.shape))
log_print('Shape of label tensor: {}'.format(labels.shape))

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)

Processing text dataset
Found 404290 texts in train.csv
Found 2345796 texts in test.csv
Found 120500 unique tokens
Shape of data tensor: (404290, 60)
Shape of label tensor: (404290,)
CPU times: user 4min 7s, sys: 988 ms, total: 4min 8s
Wall time: 4min 8s


In [11]:
%%time
########################################
## generate leaky features
########################################

train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)

ques = pd.concat([train_df[['question1', 'question2']], \
        test_df[['question1', 'question2']]], axis=0).reset_index(drop='index')
q_dict = defaultdict(set)
for i in range(ques.shape[0]):
        q_dict[ques.question1[i]].add(ques.question2[i])
        q_dict[ques.question2[i]].add(ques.question1[i])

def q1_freq(row):
    return(len(q_dict[row['question1']]))
    
def q2_freq(row):
    return(len(q_dict[row['question2']]))
    
def q1_q2_intersect(row):
    return(len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']]))))

log_print('Processing leaky features for training data')
train_df['q1_q2_intersect'] = train_df.apply(q1_q2_intersect, axis=1, raw=True)
train_df['q1_freq'] = train_df.apply(q1_freq, axis=1, raw=True)
train_df['q2_freq'] = train_df.apply(q2_freq, axis=1, raw=True)

log_print('Processing leaky features for test data')
test_df['q1_q2_intersect'] = test_df.apply(q1_q2_intersect, axis=1, raw=True)
test_df['q1_freq'] = test_df.apply(q1_freq, axis=1, raw=True)
test_df['q2_freq'] = test_df.apply(q2_freq, axis=1, raw=True)

leaks = train_df[['q1_q2_intersect', 'q1_freq', 'q2_freq']]
test_leaks = test_df[['q1_q2_intersect', 'q1_freq', 'q2_freq']]

ss = StandardScaler()
ss.fit(np.vstack((leaks, test_leaks)))
leaks = ss.transform(leaks)
test_leaks = ss.transform(test_leaks)

log_print('Done processing leaky features')

Processing leaky features for training data
Processing leaky features for test data
Done processing leaky features
CPU times: user 3min 53s, sys: 1.41 s, total: 3min 54s
Wall time: 3min 52s


In [64]:
if fasttest_vec:
    import fasttext
    fasttext_model = fasttext.load_model('model_full_data.bin')

In [67]:
%%time
########################################
## prepare embeddings
########################################
log_print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if not fasttest_vec:
        embedding_vector = embeddings_index.get(word)
    else:
        embedding_vector = np.array(fasttext_model[word], dtype=np.float32)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
log_print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 1
CPU times: user 2.44 s, sys: 36 ms, total: 2.47 s
Wall time: 2.46 s


In [68]:
########################################
## sample train/validation data
########################################
#np.random.seed(1234)
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
leaks_train = np.vstack((leaks[idx_train], leaks[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
leaks_val = np.vstack((leaks[idx_val], leaks[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

weight_val = np.ones(len(labels_val))
if re_weight:
    weight_val *= 0.472001959
    weight_val[labels_val==0] = 1.309028344

In [69]:
from keras.layers.wrappers import Bidirectional

In [100]:
%%time
########################################
## define the model structure
########################################
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)

lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)
# lstm_layer = Bidirectional(lstm_layer)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

leaks_input = Input(shape=(leaks.shape[1],))
leaks_dense = Dense(num_dense/2, activation=act)(leaks_input)

merged = concatenate([x1, y1, leaks_dense])
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = BatchNormalization()(merged)
merged = Dense(num_dense/2, activation=act)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

preds = Dense(1, activation='sigmoid')(merged)

CPU times: user 2.23 s, sys: 552 ms, total: 2.78 s
Wall time: 2.29 s


In [101]:
%%time
########################################
## add class weight
########################################
if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.01 µs


In [102]:
%%time
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()
log_print(STAMP)

STAMP += 'non-bidirectional-fasttext-vec'

early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
        validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
        epochs=200, batch_size=1000, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

lstm_64_300_0.10_0.10
Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
CPU times: user 22min 14s, sys: 1min 54s, total: 24min 8s
Wall time: 23min 49s


In [77]:
%%time
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()
log_print(STAMP)

STAMP += 'non-bidirectional-fasttext-vec'

early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
        validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
        epochs=200, batch_size=1000, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

lstm_128_512_0.10_0.10non-bidirectional-fasttext-vec
Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
CPU times: user 28min 23s, sys: 4min 18s, total: 32min 41s
Wall time: 27min 4s


In [88]:
%%time
hist = model.fit([data_2_train, data_1_train, leaks_train], labels_train, \
        validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
        epochs=200, batch_size=1000, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
 87000/727722 [==>...........................] - ETA: 152s - loss: 0.1174 - acc: 0.9215

KeyboardInterrupt: 

In [89]:
model_overfit = model

True

In [91]:
model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])
bst_val_score

0.18810520809438736

In [18]:
%%time
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()
log_print(STAMP)

early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
        validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
        epochs=200, batch_size=500, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

lstm_256_512_0.30_0.30
Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
CPU times: user 2h 53min 28s, sys: 22min 6s, total: 3h 15min 35s
Wall time: 2h 57min 6s


In [48]:
%%time
hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
        validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
        epochs=20, batch_size=500, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

Train on 727722 samples, validate on 80858 samples
Epoch 1/20
Epoch 2/20
120500/727722 [===>..........................] - ETA: 553s - loss: 0.1667 - acc: 0.8818

KeyboardInterrupt: 

In [19]:
bst_val_score

0.19677223692829843

In [15]:
%%time
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()
log_print(STAMP)

early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
        validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
        epochs=200, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

lstm_190_122_0.29_0.16
Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
CPU times: user 17min 41s, sys: 3min 59s, total: 21min 41s
Wall time: 21min 36s


In [None]:
%%time
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['acc'])
#model.summary()
log_print(STAMP)

early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
        validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
        epochs=200, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

lstm_128_128_0.30_0.30
Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
CPU times: user 28min 57s, sys: 6min 18s, total: 35min 16s
Wall time: 32min 9s


In [94]:
%%time
########################################
## make the submission
########################################
log_print('Start making the submission before fine-tuning')

test_batch_size = 2048
preds1 = model.predict([test_data_1, test_data_2, test_leaks], batch_size=test_batch_size, verbose=1)
preds2 = model.predict([test_data_2, test_data_1, test_leaks], batch_size=test_batch_size, verbose=1)
preds = (preds1 + preds2) / 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

log_print('Submission file created')

Start making the submission before fine-tuning
Submission file created
CPU times: user 5min 5s, sys: 1min 46s, total: 6min 51s
Wall time: 6min 51s


In [96]:
preds.mean(), preds1.mean(), preds2.mean()

(0.093413979, 0.093444876, 0.093383066)

In [90]:
%%time
########################################
## make the submission
########################################
log_print('Start making the submission before fine-tuning')

test_batch_size = 2048
preds1 = model.predict([test_data_1, test_data_2, test_leaks], batch_size=test_batch_size, verbose=1)
preds2 = model.predict([test_data_2, test_data_1, test_leaks], batch_size=test_batch_size, verbose=1)
preds = (preds1 + preds2) / 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'-overfit' + '.csv', index=False)

log_print('Submission file created')

Start making the submission before fine-tuning
Submission file created
CPU times: user 5min 5s, sys: 1min 46s, total: 6min 52s
Wall time: 6min 51s


In [92]:
preds.mean(), preds1.mean(), preds2.mean()

(0.13072345, 0.13069223, 0.13075462)

In [26]:
preds.mean()

0.1178443

In [25]:
test_df[:10]

Unnamed: 0,test_id,question1,question2,q1_q2_intersect,q1_freq,q2_freq
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...,0,1,1
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?,0,2,2
2,2,What but is the best way to send money from Ch...,What you send money to China?,0,1,1
3,3,Which food not emulsifiers?,What foods fibre?,0,1,1
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?,0,1,1
5,5,How are the two wheeler insurance from Bharti ...,I admire I am considering of buying insurance ...,0,1,1
6,6,How can I reduce my belly fat through a diet?,How can I reduce my lower belly fat in one month?,26,28,30
7,7,"By scrapping the 500 and 1000 rupee notes, how...",How will the recent move to declare 500 and 10...,0,1,1
8,8,What are the how best books of all time?,What are some of the military history books of...,0,1,1
9,9,After 12th years old boy and I had sex with a ...,Can a 14 old guy date a 12 year old girl?,0,1,2


In [97]:
dsf = pd.read_csv('../predictions/xgb_special_feats.csv')
dsf_v2 = pd.read_csv('../predictions/xgb_special_feats_v2.csv')
dlm = pd.read_csv('0.1902_lstm_190_122_0.29_0.16.csv')
dlm_v2 = pd.read_csv('0.1968_lstm_256_512_0.30_0.30.csv')

In [98]:
dlm.is_duplicate.mean(), dlm_v2.is_duplicate.mean()

(0.10843242556048154, 0.11784429806897431)

In [30]:
((dsf.is_duplicate + dsf_v2.is_duplicate)/2).mean()

0.091609956882722082

In [35]:
((dlm.is_duplicate + dsf.is_duplicate)/2).mean()

0.099839756069026606

In [31]:
dsf.mean()

test_id         1.172898e+06
is_duplicate    9.124709e-02
dtype: float64

In [103]:
dsf_v2.mean()

test_id         1.172898e+06
is_duplicate    9.197283e-02
dtype: float64

In [36]:
dsf.is_duplicate.corr(dlm.is_duplicate)

0.71886675922714249

In [46]:
(dsf > 0.5).mean()

test_id         1.000000
is_duplicate    0.030325
dtype: float64

In [45]:
(dsf_v2 > 0.5).mean()

test_id         1.000000
is_duplicate    0.032715
dtype: float64

In [47]:
(dlm > 0.5).mean()

is_duplicate    0.049402
test_id         1.000000
dtype: float64