In [1]:
import pandas as pd
import numpy as np

In [2]:
# configure bootstrap
# N_ITERATIONS: number of bootstrap resamples passed to bootstrap_conf
# ALPHA: confidence level of the reported interval (0.95 -> 95% interval)
N_ITERATIONS = 100
ALPHA = .95

In [3]:
# read the quora question (sampled) dataset with 1000 question pairs
QUORA_DATA = 'data/quora_questions.csv'
N_QUESTIONS = 1000

df = pd.read_csv(QUORA_DATA)

# fail fast: the accuracy loops further down address rows 0..N_QUESTIONS-1
# by label, so a shorter file would otherwise only fail there with a KeyError
if len(df) < N_QUESTIONS:
    raise ValueError(
        'expected at least {} question pairs in {}, found {}'.format(
            N_QUESTIONS, QUORA_DATA, len(df)))

In [4]:
# sanity check: (rows, columns) of the loaded frame
print(df.shape)

(1000, 4)


In [5]:
# preview the first rows to see the available columns
print(df.head())

   Unnamed: 0                                          question1  \
0      250366  What are the tips for clearing Google Summer o...   
1      112801    How does social security rule monocular vision?   
2       13679  Which AMD FX series laptop is equal to Intel i...   
3      207849                  What is an addictive personality?   
4      171197  What are the most critical metrics To measure ...   

                                           question2  is_duplicate  
0        How can I crack GSOC-Google Summer of Code?             1  
1  Will you be approved for social security with ...             1  
2                     Which is better: AMD FX vs i5?             0  
3               What is a non-addictive personality?             0  
4  What instrumental does Dr. Dre use in his comm...             0  


In [6]:
# total of the is_duplicate column, i.e. the number of pairs flagged as
# duplicates provided the labels are 0/1 as in the preview above
n_sim = df['is_duplicate'].sum()
print(n_sim)

356


In [7]:
# define our vector similarity scoring function based on cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

def score(vector1, vector2, threshold):
    """Classify a pair of vectors as similar (1) or not similar (0).

    The two vectors are compared with cosine similarity; the pair counts as
    similar when that similarity is greater than or equal to ``threshold``.
    """
    # cosine_similarity works on 2-D inputs and returns a matrix, so wrap each
    # vector as a single row and read the only cell of the 1x1 result
    similarity = cosine_similarity([vector1], [vector2])[0, 0]
    if similarity >= threshold:
        return 1
    return 0

In [8]:
# bootstrap accuracy confidence interval and compute average accuracy

from sklearn.utils import resample

def bootstrap_conf(vectors1, vectors2, is_similar, threshold, n_iterations, alpha,
                   random_state=None):
    """Bootstrap a confidence interval and the mean accuracy of ``score``.

    Parameters
    ----------
    vectors1, vectors2 : pandas.DataFrame
        Row ``j`` of each frame holds the vector of the first / second
        question of pair ``j``. Rows are addressed by position.
    is_similar : sequence of int
        Gold label for each pair, compared against the output of ``score``.
    threshold : float
        Similarity threshold forwarded to ``score``.
    n_iterations : int
        Number of bootstrap resamples (must be at least 1).
    alpha : float
        Confidence level as a fraction, e.g. 0.95 for a 95% interval.
    random_state : None, int or numpy.random.RandomState, optional
        Seed or generator for the resampling. ``None`` (the default) keeps
        the previous behaviour of drawing from numpy's global random state.

    Returns
    -------
    (lower, upper, avg_acc)
        Percentile bounds of the bootstrap accuracies, clipped to [0, 1],
        and the mean accuracy over all bootstrap samples.

    Raises
    ------
    ValueError
        If there are no pairs, fewer labels / second vectors than pairs,
        or ``n_iterations`` is smaller than 1.
    """
    n = vectors1.shape[0]
    labels = np.asarray(is_similar)[:n]
    if n == 0 or n_iterations < 1:
        raise ValueError('bootstrap_conf needs at least one pair and one iteration')
    if labels.shape[0] < n or vectors2.shape[0] < n:
        raise ValueError('vectors2 and is_similar must cover every row of vectors1')

    # Whether a pair is misclassified does not depend on the bootstrap sample,
    # so score every pair exactly once instead of once per sampled index.
    # iloc keeps the row lookup positional, matching the positional labels.
    predictions = np.array([score(vectors1.iloc[j, :], vectors2.iloc[j, :], threshold)
                            for j in range(n)])
    is_error = predictions != labels

    # An int seed must be turned into one generator up front; handing the same
    # int to every resample() call would reproduce the same sample each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # compute the bootstrap samples
    print('Computing bootstrap samples: ', end='')
    stats = []
    for i in range(n_iterations):
        print(i,end=" ")
        # draw n pairs with replacement and count the misclassified ones
        sampled_errors = resample(is_error,
                                  replace=True,
                                  n_samples=n,
                                  random_state=rng)
        errors = int(sampled_errors.sum())

        acc = 1 - errors/n

        # save the score for this bootstrap sample
        stats.append(acc)

    print('')
    # confidence intervals: cut (1 - alpha) / 2 off each tail of the distribution
    p = ((1.0-alpha)/2.0) * 100
    lower = max(0.0, np.percentile(stats, p))
    p = (alpha+((1.0-alpha)/2.0)) * 100
    upper = min(1.0, np.percentile(stats, p))
    # average performance
    avg_acc = sum(stats)/n_iterations

    return lower, upper, avg_acc

In [9]:
# grab our data for processing: plain Python lists, one entry per question pair
questions1 = df['question1'].tolist()
questions2 = df['question2'].tolist()
is_similar = df['is_duplicate'].tolist()

## Syntactic Features

In [10]:
# build a vocabulary based on the first set of questions
from sklearn.feature_extraction.text import CountVectorizer

questions = questions1.copy()
# min_df=2 keeps only words that occur in at least two questions
vocab_vectorizer = CountVectorizer(analyzer='word',
                                   binary=True,
                                   min_df=2,
                                   stop_words='english')
vocab_vectorizer.fit(questions)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases
if hasattr(vocab_vectorizer, 'get_feature_names_out'):
    vocab = list(vocab_vectorizer.get_feature_names_out())
else:
    vocab = list(vocab_vectorizer.get_feature_names())

In [11]:
print(len(vocab))

798


In [12]:
# build docterm for questions1
q1_vectorizer = CountVectorizer(
    vocabulary=vocab,  # use our vocabulary
    analyzer='word',
    binary=True,
)
# dense 0/1 matrix wrapped in a frame whose columns are the vocabulary terms
q1_docarray = q1_vectorizer.fit_transform(questions1).toarray()
q1_docterm = pd.DataFrame(data=q1_docarray, columns=vocab)

In [13]:
# build docterm for questions2
q2_vectorizer = CountVectorizer(
    vocabulary=vocab,  # use our vocabulary
    analyzer='word',
    binary=True,
)
# dense 0/1 matrix wrapped in a frame whose columns are the vocabulary terms
q2_docarray = q2_vectorizer.fit_transform(questions2).toarray()
q2_docterm = pd.DataFrame(data=q2_docarray, columns=vocab)

In [14]:
# both document-term frames should have one row per question and one column per vocabulary term
print(q1_docterm.shape)
print(q2_docterm.shape)

(1000, 798)
(1000, 798)


In [15]:
# experimentally determined similarity threshold for syntactic features
# (score() labels a pair as similar when its cosine similarity is >= this value)
# NOTE: sim_threshold is reassigned in the semantic section, so re-run this
# cell before re-running the syntactic evaluation cells
sim_threshold = 0.75

In [16]:
# compute the accuracy for all N_QUESTIONS question pairs
# an error is a pair whose predicted label differs from the gold label
errors = sum(
    1
    for i in range(N_QUESTIONS)
    if score(q1_docterm.loc[i, :], q2_docterm.loc[i, :], sim_threshold) != is_similar[i]
)

acc = 1 - errors/N_QUESTIONS

print("Accuracy: {:.2f}%".format(acc*100))

Accuracy: 65.50%


In [17]:
# accuracy confidence intervals and average accuracy

lb, ub, ave_acc = bootstrap_conf(
    vectors1=q1_docterm,
    vectors2=q2_docterm,
    is_similar=is_similar,
    threshold=sim_threshold,
    n_iterations=N_ITERATIONS,
    alpha=ALPHA,
)
print('{:.2f}% confidence interval {:.2f}% and {:.2f}%'.format(ALPHA*100, lb*100, ub*100))
print("Average accuracy: {:.2f}%".format(ave_acc*100))

Computing bootstrap samples: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 
95.00% confidence interval 62.25% and 68.05%
Average accuracy: 65.46%


## Semantic Features

In [18]:
# Load the spaCy model whose document vectors serve as semantic features

import spacy

# NOTE: for performance reasons disable everything in the pipeline except the tokenizer
# (only the .vector of each processed text is read from this model in this notebook)
nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner', 'textcat'])

In [19]:
# process questions for semantic features -- compute embedding vectors for question texts

def embed(X):
    '''
    Compute one embedding vector per text.

    X is a list of strings. Each entry is converted with str() and run
    through the module-level spaCy pipeline `nlp`; the resulting document
    vector is collected. The shape of X (as a numpy array) is printed first.

    Returns a pandas DataFrame with one row per entry of X, holding that
    entry's vector.
    '''
    texts = np.array(X)
    print(texts.shape)

    n_texts = texts.shape[0]
    rows = [nlp(str(texts[k])).vector for k in range(n_texts)]

    return pd.DataFrame(rows)

# embed both question lists; row i of vectors1 and of vectors2 belongs to question pair i
vectors1 = embed(questions1)
vectors2 = embed(questions2)

(1000,)
(1000,)


In [20]:
# experimentally determined similarity threshold for semantic features
# (score() labels a pair as similar when its cosine similarity is >= this value)
# NOTE: this overwrites the value used for the syntactic features above
sim_threshold = 0.95

In [21]:
# compute the accuracy for all N_QUESTIONS question pairs
# an error is a pair whose predicted label differs from the gold label
errors = sum(
    1
    for i in range(N_QUESTIONS)
    if score(vectors1.loc[i, :], vectors2.loc[i, :], sim_threshold) != is_similar[i]
)
acc = 1 - errors/N_QUESTIONS

print("Accuracy: {:.2f}%".format(acc*100))

Accuracy: 69.00%


In [22]:
# accuracy confidence intervals and average accuracy

lb, ub, ave_acc = bootstrap_conf(
    vectors1=vectors1,
    vectors2=vectors2,
    is_similar=is_similar,
    threshold=sim_threshold,
    n_iterations=N_ITERATIONS,
    alpha=ALPHA,
)
print('{:.2f}% confidence interval {:.2f}% and {:.2f}%'.format(ALPHA*100, lb*100, ub*100))
print("Average accuracy: {:.2f}%".format(ave_acc*100))

Computing bootstrap samples: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 
95.00% confidence interval 66.24% and 71.66%
Average accuracy: 68.98%
