In [1]:
# Import packages
import numpy as np
import os.path
import pandas as pd

np.random.seed(246)

## Merge submission files 
#### Only run this section when one of the models is updated

In [2]:
# Create df of merged predictions from each model (functional approach, only one df ever exists in memory)
model_predictions = pd.read_csv('data/test_1.csv').merge(pd.read_csv('data/test_2.csv'), how = 'outer')


# This block is for dev purposes until submission files are pushed to the repo
# model_predictions = model_predictions.merge(
#                         pd.read_csv('data/sample_submission.csv').rename(columns = {'is_duplicate' : 'cnn_duplicate'}),
#                         on = 'test_id', 
#                         how = 'outer').merge(
#                         pd.read_csv('data/sample_submission.csv').rename(columns = {'is_duplicate' : 'lstm_duplicate'}),
#                         on = 'test_id', 
#                         how = 'outer').merge(
#                         pd.read_csv('data/sample_submission.csv').rename(columns = {'is_duplicate' : 'xgb_duplicate'}),
#                         on = 'test_id', 
#                         how = 'outer')


# CNN
if os.path.isfile('CNN_submission_to_kaggle.csv'):
    model_predictions = model_predictions.merge(
                        pd.read_csv('CNN_submission_to_kaggle.csv').rename(columns = {'is_duplicate' : 'cnn_duplicate'}),
                        on = 'test_id', 
                        how = 'outer')

# LSTM 
if os.path.isfile('LSTM_submission_to_kaggle.csv'):
    model_predictions = model_predictions.merge(
                    pd.read_csv('LSTM_submission_to_kaggle.csv').rename(columns = {'is_duplicate' : 'lstm_duplicate'}),
                    on = 'test_id', 
                    how = 'outer')

# XGBoost
if os.path.isfile('XGBOOST_submission_to_kaggle.csv'):
    model_predictions = model_predictions.merge(
                    pd.read_csv('XGBOOST_submission_to_kaggle.csv').rename(columns = {'is_duplicate' : 'xgb_duplicate'}),
                    on = 'test_id', 
                    how = 'outer')


# Sanity Check
col_names = list(model_predictions)
num_rows = float(len(model_predictions))
print "Column Names:\t\t" + str(col_names)
print "Number of Rows:\t\t" + str(num_rows)

Column Names:		['test_id', 'question1', 'question2', 'cnn_duplicate', 'lstm_duplicate', 'xgb_duplicate']
Number of Rows:		2345796.0


In [3]:
def sameLabel(row, col_1, col_2):
    return int(round(row[col_1], 0) == round(row[col_2], 0))

model_predictions['cnn=lstm'] = \
    model_predictions.apply(lambda row: sameLabel(row, 'cnn_duplicate', 'lstm_duplicate'), axis = 'columns')

model_predictions['cnn=xgb'] = \
    model_predictions.apply(lambda row: sameLabel(row, 'cnn_duplicate', 'xgb_duplicate'), axis = 'columns')
    
model_predictions['lstm=xgb'] = \
    model_predictions.apply(lambda row: sameLabel(row, 'lstm_duplicate', 'xgb_duplicate'), axis = 'columns')
    
model_predictions['cnn=lstm=xgb'] = \
    model_predictions.apply(lambda row: row['cnn=lstm'] * row['cnn=xgb'], axis = 'columns')

In [4]:
# Store merged results in a csv file for future use 
# (this will allow us to skip this section in the future)
model_predictions.to_csv('./merged_submissions.csv')
!du -m ./merged_submissions.csv

431	./merged_submissions.csv


## Inter-model comparison 
#### Start here unless one of the models was updated

In [5]:
# Read in merged submission files
model_predictions = pd.read_csv('./merged_submissions.csv')
model_predictions.head(10)

Unnamed: 0.1,Unnamed: 0,test_id,question1,question2,cnn_duplicate,lstm_duplicate,xgb_duplicate,cnn=lstm,cnn=xgb,lstm=xgb,cnn=lstm=xgb
0,0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...,0.269722,0.071591,0.008846,1,1,1,1
1,1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?,0.149937,0.276783,0.652604,1,0,0,0
2,2,2,What but is the best way to send money from Ch...,What you send money to China?,0.385697,0.247155,0.662696,1,0,0,0
3,3,3,Which food not emulsifiers?,What foods fibre?,0.043343,0.044906,0.001848,1,1,1,1
4,4,4,"How ""aberystwyth"" start reading?",How their can I start reading?,0.018432,0.475214,0.194673,1,1,1,1
5,5,5,How are the two wheeler insurance from Bharti ...,I admire I am considering of buying insurance ...,0.012012,0.118515,0.011312,1,1,1,1
6,6,6,How can I reduce my belly fat through a diet?,How can I reduce my lower belly fat in one month?,0.807612,0.569757,0.608412,1,1,1,1
7,7,7,"By scrapping the 500 and 1000 rupee notes, how...",How will the recent move to declare 500 and 10...,0.949626,0.942861,0.682999,1,1,1,1
8,8,8,What are the how best books of all time?,What are some of the military history books of...,0.307617,0.607435,0.707892,0,0,1,0
9,9,9,After 12th years old boy and I had sex with a ...,Can a 14 old guy date a 12 year old girl?,0.074474,0.037147,0.00847,1,1,1,1


In [6]:
# Evaluate model similarity
print "CNN/LSTM Similarity:\t\t%f" % (sum(model_predictions['cnn=lstm']) / num_rows)

print "CNN/XGB Similarity:\t\t%f" % (sum(model_predictions['cnn=xgb']) / num_rows)

print "LSTM/XGB Similarity:\t\t%f" % (sum(model_predictions['lstm=xgb']) / num_rows)

print "CNN/LSTM/XGB Similarity:\t%f" % (sum(model_predictions['cnn=lstm=xgb']) / num_rows)

CNN/LSTM Similarity:		0.905390
CNN/XGB Similarity:		0.734830
LSTM/XGB Similarity:		0.736386
CNN/LSTM/XGB Similarity:	0.688303


In [7]:
# Explore CNN/LSTM differences
# Print ~20 rows (small, but representative sample) and compare/contrast
condition_1 =  (model_predictions['cnn_duplicate'] >= 0.5) & (model_predictions['lstm_duplicate'] < 0.5)
condition_2 =  (model_predictions['cnn_duplicate'] < 0.5) & (model_predictions['lstm_duplicate'] >= 0.5)

print "Pairs that CNN marked as duplicates and LSTM did not.\n"
for key in np.random.choice(model_predictions[condition_1]['test_id'], size = 20, replace = False):
    print "Q1: " + str(model_predictions['question1'][key])
    print "Q2: " + str(model_predictions['question2'][key]) + "\n"

print "".join(['=' for i in range(115)])
print "\nPairs that LSTM marked as duplicates and CNN did not.\n"
for key in np.random.choice(model_predictions[condition_2]['test_id'], size = 20, replace = False):
    print "Q1: " + str(model_predictions['question1'][key])
    print "Q2: " + str(model_predictions['question2'][key]) + "\n"

Pairs that CNN marked as duplicates and LSTM did not.

Q1: What than is travel insurance?
Q2: How useful is travel have insurance?

Q1: When download the Pokemon Go arriving in India?
Q2: Is Pokemon go going to be released in India? Or will effect never be…

Q1: How to creat a blog on right?
Q2: How can I diving: blog in Quora?

Q1: What should we do to clear shouldn gate exams?
Q2: What is can I crack gate exam?

Q1: What is a proper self introduction on the first day of work?
Q2: How do I give an effective self introduction?

Q1: What's the life expectancy of 650,000 a MacBook Pro?
Q2: What is the life expectancy of a denigrate MacBook Pro in 2016?

Q1: Why is "Japan's Marine Day" unknown in Kiribati?
Q2: Why is "Japan's Marine Day" unknown in The Bahamas?

Q1: Which programming language in the world as of now?
Q2: Which languages are there in the world?

Q1: What request is The Scope of Chemical Engineering in Pakistan?
Q2: Is there less scope of of chemical engineering in India?

Q

In [8]:
# Explore CNN/XGB differences
# Print ~20 rows (small, but representative sample) and compare/contrast
condition_1 =  (model_predictions['cnn_duplicate'] >= 0.5) & (model_predictions['xgb_duplicate'] < 0.5)
condition_2 =  (model_predictions['cnn_duplicate'] < 0.5) & (model_predictions['xgb_duplicate'] >= 0.5)

print "Pairs that CNN marked as duplicates and XGBoost did not.\n"
for key in np.random.choice(model_predictions[condition_1]['test_id'], size = 20, replace = False):
    print "Q1: " + str(model_predictions['question1'][key])
    print "Q2: " + str(model_predictions['question2'][key]) + "\n"

print "".join(['=' for i in range(115)])
print "\nPairs that XGBoost marked as duplicates and CNN did not.\n"
for key in np.random.choice(model_predictions[condition_2]['test_id'], size = 20, replace = False):
    print "Q1: " + str(model_predictions['question1'][key])
    print "Q2: " + str(model_predictions['question2'][key]) + "\n"

Pairs that CNN marked as duplicates and XGBoost did not.

Q1: How would check affect indian economically as well as politically?
Q2: I have Indian economy?

Q1: Should IPL mouse banned?
Q2: Should IPL be doesn abolished?

Q1: Why are there so css irrelevant and frankly stupid questions on Quora?
Q2: What is goa the #1 reason people ask questions on Quora?

Q1: What rid aurora borealis mean?
Q2: What is the phenomena risk aurora borealis?

Q1: Does the illuminati studies in the US?
Q2: Is there evidence herself that the illuminati exists?

Q1: Can a nuclear examples stop global warmings?
Q2: Can we stop islands warming with nuclear bombs?

Q1: What scraping different between NDT level I,II and level III?
Q2: How good is to do an NDT most after a mechanical engineering? What´s the scope of NDT?

Q1: What's the best (preferably to sell some used electronics/gadgets?
Q2: What would be the best place to developer used electronic devices?

Q1: Can transgender men questions on Quora when they

In [9]:
# Explore LSTM/XGB differences
# Print ~20 rows (small, but representative sample) and compare/contrast
condition_1 =  (model_predictions['lstm_duplicate'] >= 0.5) & (model_predictions['xgb_duplicate'] < 0.5)
condition_2 =  (model_predictions['lstm_duplicate'] < 0.5) & (model_predictions['xgb_duplicate'] >= 0.5)

print "Pairs that LSTM marked as duplicates and XGBoost did not.\n"
for key in np.random.choice(model_predictions[condition_1]['test_id'], size = 20, replace = False):
    print "Q1: " + str(model_predictions['question1'][key])
    print "Q2: " + str(model_predictions['question2'][key]) + "\n"

print "".join(['=' for i in range(115)])
print "\nPairs that XGBoost marked as duplicates and LSTM did not.\n"
for key in np.random.choice(model_predictions[condition_2]['test_id'], size = 20, replace = False):
    print "Q1: " + str(model_predictions['question1'][key])
    print "Q2: " + str(model_predictions['question2'][key]) + "\n"

Pairs that LSTM marked as duplicates and XGBoost did not.

Q1: How do I much save money while shopping?
Q2: What are some tips and tricks to save with train shopping?

Q1: Emoticons: What will win in UP elections?
Q2: Which Political write win 2017 UP Assembly Election?

Q1: What is your reaction about the slower on Rs. 500 and Rs. 1000 notes? Won't it create a chaos and harm the economy?
Q2: What are your views about Narendra e decision to stop circulation of 500 and 1000 denomination notes?

Q1: How can I into change my Gmail password?
Q2: How can I reset my Gmail password keep I don't remember my recovery Email and current password?

Q1: How can best improve my english?
Q2: How can artificial learn fluent in English?

Q1: Is it possible to love more someone one person at the same time?
Q2: Have you bitcoin ever fallen in love with two people at the same time?

Q1: Why is so many people say Hillary Clinton is evil?
Q2: Why is Clinton so hated?

Q1: What is the most efficient way to l

# Error Analysis For Individual Model 

### Convolutional Neural Net

In [None]:
if os.path.isfile('data/cnn_train_predictions.csv'):
    train_predictions = pd.read_csv('data/cnn_train_predictions.csv')
    
# Print ~25 rows (small, but representative sample) of correct predictions and explore
# Print ~25 rows (small, but representative sample) of incorrect predictions and explore

### LSTM Neural Net

In [None]:
if os.path.isfile('data/lstm_train_predictions.csv'):
    train_predictions = pd.read_csv('data/lstm_train_predictions.csv')
    
# Print ~25 rows (small, but representative sample) of correct predictions and explore
# Print ~25 rows (small, but representative sample) of incorrect predictions and explore

### XGBoost Model

In [None]:
if os.path.isfile('data/xgb_train_predictions.csv'):
    train_predictions = pd.read_csv('data/xgb_train_predictions.csv')
    
# Print ~25 rows (small, but representative sample) of correct predictions and explore
# Print ~25 rows (small, but representative sample) of incorrect predictions and explore