In [None]:
# Import packages
import numpy as np
import os.path
import pandas as pd

# Seed NumPy's global RNG so the random row samples drawn in the comparison cells further down
# come out the same on every fresh top-to-bottom run.
np.random.seed(1234)

# Inter-model comparison 

In [None]:
# Build the merged prediction frame. Every step rebinds `model_predictions`, so no other
# named frame is left behind in the kernel.
# The two test files are outer-merged without `on`, i.e. on whatever columns they share.
model_predictions = pd.read_csv('data/test_1.csv').merge(pd.read_csv('data/test_2.csv'), how = 'outer')

# (model tag, submission file) pairs; a model whose file is absent is skipped, leaving the
# frame without that model's `<tag>_duplicate` column.
# Dev note: until the real submission files are pushed to the repo, each path below can be
# pointed at 'data/sample_submission.csv' to get placeholder columns for all three models.
submission_files = [
    ('cnn', 'data/cnn_submission.csv'),      # CNN
    ('lstm', 'data/lstm_submission.csv'),    # LSTM
    ('xgb', 'data/xgb_submission.csv'),      # XGBoost
]

for model_tag, submission_path in submission_files:
    if not os.path.isfile(submission_path):
        continue
    # Rename the generic `is_duplicate` column so each model gets its own column after the merge.
    model_predictions = model_predictions.merge(
        pd.read_csv(submission_path).rename(columns = {'is_duplicate' : model_tag + '_duplicate'}),
        on = 'test_id',
        how = 'outer')


# Sanity check: which columns made it into the frame, and how many rows it has.
# `col_names` and `num_rows` stay defined at notebook level for later cells.
col_names = list(model_predictions.columns)
num_rows = len(model_predictions)
print("Column Names:\t\t" + str(col_names))
print("Number of Rows:\t\t" + str(num_rows))

In [None]:
# Evaluate model similarity: the share of rows on which the models' predictions are equal.
# NOTE: NaN never compares equal to anything, so a row with a missing prediction (e.g. one
# introduced by the outer merges when loading) is counted as a disagreement.
cnn_lstm = model_predictions['cnn_duplicate'] == model_predictions['lstm_duplicate']
cnn_xgb = model_predictions['cnn_duplicate'] == model_predictions['xgb_duplicate']
lstm_xgb = model_predictions['lstm_duplicate'] == model_predictions['xgb_duplicate']
# cnn == lstm together with cnn == xgb already implies lstm == xgb.
cnn_lstm_xgb = cnn_lstm & cnn_xgb

# The mean of a boolean mask is the fraction of True rows, as a float. Dividing a row count
# by a row count with `/` is integer division under Python 2 and would floor every ratio
# to 0 or 1.
print("CNN/LSTM Similarity:\t\t%f" % cnn_lstm.mean())

print("CNN/XGB Similarity:\t\t%f" % cnn_xgb.mean())

print("LSTM/XGB Similarity:\t\t%f" % lstm_xgb.mean())

print("CNN/LSTM/XGB Similarity:\t%f" % cnn_lstm_xgb.mean())

In [None]:
# Explore CNN/LSTM differences
# Print ~20 rows (small, but representative sample; up to 10 per direction) and compare/contrast
condition_1 =  (model_predictions['cnn_duplicate'] == 1) & (model_predictions['lstm_duplicate'] == 0)
condition_2 =  (model_predictions['cnn_duplicate'] == 0) & (model_predictions['lstm_duplicate'] == 1)

sections = [
    ("Pairs that CNN marked as duplicates and LSTM did not.\n", condition_1),
    ("\nPairs that LSTM marked as duplicates and CNN did not.\n", condition_2),
]

for section_number, (header, condition) in enumerate(sections):
    if section_number > 0:
        # Visual separator between the two directions of disagreement.
        print("=" * 115)
    print(header)

    disagreements = model_predictions[condition]
    # Sample rows by position inside the filtered frame. Using a test_id value as an index
    # label on `model_predictions` only finds the right row if the index happens to equal
    # test_id. Slicing a permutation also works when fewer than 10 rows disagree, where
    # drawing exactly 10 without replacement would raise.
    sampled = disagreements.iloc[np.random.permutation(len(disagreements))[:10]]
    for question_1, question_2 in zip(sampled['question1'], sampled['question2']):
        print("Q1: " + str(question_1))
        print("Q2: " + str(question_2) + "\n")

In [None]:
# Explore CNN/XGB differences
# Print ~20 rows (small, but representative sample; up to 10 per direction) and compare/contrast
condition_1 =  (model_predictions['cnn_duplicate'] == 1) & (model_predictions['xgb_duplicate'] == 0)
condition_2 =  (model_predictions['cnn_duplicate'] == 0) & (model_predictions['xgb_duplicate'] == 1)

sections = [
    ("Pairs that CNN marked as duplicates and XGBoost did not.\n", condition_1),
    ("\nPairs that XGBoost marked as duplicates and CNN did not.\n", condition_2),
]

for section_number, (header, condition) in enumerate(sections):
    if section_number > 0:
        # Visual separator between the two directions of disagreement.
        print("=" * 115)
    print(header)

    disagreements = model_predictions[condition]
    # Sample rows by position inside the filtered frame. Using a test_id value as an index
    # label on `model_predictions` only finds the right row if the index happens to equal
    # test_id. Slicing a permutation also works when fewer than 10 rows disagree, where
    # drawing exactly 10 without replacement would raise.
    sampled = disagreements.iloc[np.random.permutation(len(disagreements))[:10]]
    for question_1, question_2 in zip(sampled['question1'], sampled['question2']):
        print("Q1: " + str(question_1))
        print("Q2: " + str(question_2) + "\n")

In [None]:
# Explore LSTM/XGB differences
# Print ~20 rows (small, but representative sample; up to 10 per direction) and compare/contrast
condition_1 =  (model_predictions['lstm_duplicate'] == 1) & (model_predictions['xgb_duplicate'] == 0)
condition_2 =  (model_predictions['lstm_duplicate'] == 0) & (model_predictions['xgb_duplicate'] == 1)

sections = [
    ("Pairs that LSTM marked as duplicates and XGBoost did not.\n", condition_1),
    ("\nPairs that XGBoost marked as duplicates and LSTM did not.\n", condition_2),
]

for section_number, (header, condition) in enumerate(sections):
    if section_number > 0:
        # Visual separator between the two directions of disagreement.
        print("=" * 115)
    print(header)

    disagreements = model_predictions[condition]
    # Sample rows by position inside the filtered frame. Using a test_id value as an index
    # label on `model_predictions` only finds the right row if the index happens to equal
    # test_id. Slicing a permutation also works when fewer than 10 rows disagree, where
    # drawing exactly 10 without replacement would raise.
    sampled = disagreements.iloc[np.random.permutation(len(disagreements))[:10]]
    for question_1, question_2 in zip(sampled['question1'], sampled['question2']):
        print("Q1: " + str(question_1))
        print("Q2: " + str(question_2) + "\n")

In [None]:
# Remove df to conserve memory: drops the notebook-level reference to the merged frame.
# NOTE: the comparison cells above all read `model_predictions`, so the loading cell has to
# be re-run before any of them can be run again.
del model_predictions

# Error Analysis for Individual Models

### Convolutional Neural Net

In [None]:
# Load the CNN prediction file used for error analysis (presumably predictions on the
# training data, going by the file name - confirm against the script that writes it).
# When the file is absent, reset `train_predictions` to None so a frame left in the kernel
# by another model's section is never analysed here by mistake.
if os.path.isfile('data/cnn_train_predictions.csv'):
    train_predictions = pd.read_csv('data/cnn_train_predictions.csv')
else:
    train_predictions = None

# TODO: Print ~25 rows (small, but representative sample) of correct predictions and explore
# TODO: Print ~25 rows (small, but representative sample) of incorrect predictions and explore

### LSTM Neural Net

In [None]:
# Load the LSTM prediction file used for error analysis (presumably predictions on the
# training data, going by the file name - confirm against the script that writes it).
# When the file is absent, reset `train_predictions` to None so a frame left in the kernel
# by another model's section is never analysed here by mistake.
if os.path.isfile('data/lstm_train_predictions.csv'):
    train_predictions = pd.read_csv('data/lstm_train_predictions.csv')
else:
    train_predictions = None

# TODO: Print ~25 rows (small, but representative sample) of correct predictions and explore
# TODO: Print ~25 rows (small, but representative sample) of incorrect predictions and explore

### XGBoost Model

In [None]:
# Load the XGBoost prediction file used for error analysis (presumably predictions on the
# training data, going by the file name - confirm against the script that writes it).
# When the file is absent, reset `train_predictions` to None so a frame left in the kernel
# by another model's section is never analysed here by mistake.
if os.path.isfile('data/xgb_train_predictions.csv'):
    train_predictions = pd.read_csv('data/xgb_train_predictions.csv')
else:
    train_predictions = None

# TODO: Print ~25 rows (small, but representative sample) of correct predictions and explore
# TODO: Print ~25 rows (small, but representative sample) of incorrect predictions and explore