In [177]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce

from nltk.corpus import stopwords
stops = stopwords.words('english')

In [147]:
def read_dataset(filename):
    data = pd.read_table(filename, sep='\t', encoding='utf-8',
                     names=['qid','truth', 'prediction'])
    return data

In [148]:
df_test = pd.read_table('Quora_question_pair_partition/test.tsv', sep='\t', encoding='utf-8',
                     names=['is_duplicate','question1', 'question2', 'qid'])

In [149]:
df_5k = read_dataset('/Users/andrada/Thesis_notebooks/ensemble/predictions.quora.ensemble.reverse_5k.tsv')
df_10k = read_dataset('/Users/andrada/Thesis_notebooks/ensemble/predictions.quora.ensemble.reverse_10k.tsv')
df_15k = read_dataset('/Users/andrada/Thesis_notebooks/ensemble/predictions.quora.ensemble.reverse_15k.tsv')
df_20k = read_dataset('/Users/andrada/Thesis_notebooks/ensemble/predictions.quora.ensemble.reverse_20k.tsv')
df_25k = read_dataset('/Users/andrada/Thesis_notebooks/ensemble/predictions.quora.ensemble.reverse_25k.tsv')
df_30k = read_dataset('/Users/andrada/Thesis_notebooks/ensemble/predictions.quora.ensemble.reverse_30k.tsv')

In [174]:
import nltk

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

def word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['q1_lemma']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row['q2_lemma']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return round(R,2)

In [160]:
def preprocess(df, name):
    column = 'prediction_' + name
    df[column] = df['prediction'].apply(lambda x: x.replace('[',''))
    df[column] = df[column].apply(lambda x: x.replace(']',''))
    df[column] = df[column].apply(pd.to_numeric)
    return df

def errors(df,name):
    column = 'prediction_' + name
    new_col = 'label_'+ name
    df[new_col] = df[column].apply(lambda x: int(x > 0.5))
    return df

def prepare_df(df, name):
    df = preprocess(df, name)
    df = errors(df,name)
    return df

In [161]:
errors_5k = prepare_df(df_5k, '5k')
errors_10k = prepare_df(df_10k, '10k')
errors_15k = prepare_df(df_15k, '15k')
errors_20k = prepare_df(df_20k, '20k')
errors_25k = prepare_df(df_25k, '25k')
errors_30k = prepare_df(df_30k, '30k')

In [163]:
df = [df_test, errors_5k, errors_10k, errors_15k, errors_20k, errors_25k, errors_30k]
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['qid'], how='inner'), df)

In [165]:
df_merged.drop(columns=['truth_x', 'truth_y', 'prediction_x', 'prediction_y'], inplace=True)

In [178]:
df_merged['q1_lemma'] = df_merged['question1'].apply(lemmatize_text)
df_merged['q2_lemma'] = df_merged['question2'].apply(lemmatize_text)
df_merged['word_overlap'] = df_merged.apply( word_match_share, axis=1, raw=True)

In [179]:
mask1 = (df_merged['is_duplicate'] != df_merged['label_5k'])
mask2 = (df_merged['is_duplicate'] != df_merged['label_10k'])
mask3 = (df_merged['is_duplicate'] != df_merged['label_15k'])
mask4 = (df_merged['is_duplicate'] != df_merged['label_20k'])
mask5 = (df_merged['is_duplicate'] != df_merged['label_25k'])
mask6 = (df_merged['is_duplicate'] != df_merged['label_30k'])

In [183]:
df_errors = df_merged[mask1 & mask2 & mask3 & mask4 & mask5 & mask6]
df_errors['word_overlap'].mean(), df_errors['word_overlap'].std()

(0.6456360708534616, 0.17936480105663222)

In [193]:
df_errors.groupby('is_duplicate')

Unnamed: 0_level_0,question1,question2,qid,prediction_5k,label_5k,prediction_10k,label_10k,prediction_15k,label_15k,prediction_20k,label_20k,prediction_25k,label_25k,prediction_30k,label_30k,q1_lemma,q2_lemma,word_overlap
is_duplicate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,429,429,429,429,429,429,429,429,429,429,429,429,429,429,429,429,429,429
1,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192


In [187]:
df_merged[mask1]['word_overlap'].mean()

0.6193433931484515

In [188]:
df_merged[mask2]['word_overlap'].mean()

0.6201648351648362

In [189]:
df_merged[mask3]['word_overlap'].mean()

0.6246259124087599

In [190]:
df_merged[mask4]['word_overlap'].mean()

0.6254375569735656

In [191]:
df_merged[mask5]['word_overlap'].mean()

0.6226815050344467

In [192]:
df_merged[mask6]['word_overlap'].mean()

0.6270961718020562