In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the sampled Quora question-pair dataset.
# Path is relative to the notebook's working directory.
QUORA_DATA = 'data/quora_questions.csv'
# Number of question pairs the sampled file is expected to contain.
N_QUESTIONS = 1000

# NOTE(review): the file appears to carry a saved index column, which pandas
# surfaces as 'Unnamed: 0' in the head() output — confirm whether it is needed.
df = pd.read_csv(QUORA_DATA)

In [3]:
# Sanity check: (rows, columns) of the loaded frame.
print(df.shape)

(1000, 4)


In [4]:
# Peek at the first rows to see the column layout.
print(df.head())

   Unnamed: 0                                          question1  \
0      250366  What are the tips for clearing Google Summer o...   
1      112801    How does social security rule monocular vision?   
2       13679  Which AMD FX series laptop is equal to Intel i...   
3      207849                  What is an addictive personality?   
4      171197  What are the most critical metrics To measure ...   

                                           question2  is_duplicate  
0        How can I crack GSOC-Google Summer of Code?             1  
1  Will you be approved for social security with ...             1  
2                     Which is better: AMD FX vs i5?             0  
3               What is a non-addictive personality?             0  
4  What instrumental does Dr. Dre use in his comm...             0  


In [5]:
# Sum of the is_duplicate column; the column appears to hold 0/1 labels,
# in which case this is the number of pairs marked as duplicates.
n_sim = df['is_duplicate'].sum()
print(n_sim)

356


## Syntactic Features

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Question pairs and their duplicate labels as plain Python lists.
questions1 = list(df['question1'])
questions2 = list(df['question2'])
is_similar = list(df['is_duplicate'])

# The vocabulary is learned from question1 only; the line below that would
# also feed question2 into the fit is intentionally left disabled.
questions = questions1.copy()
#questions.extend(questions2)

# Binary word-level vectorizer: drop English stop words and keep only terms
# that occur in at least two documents (min_df=2).
vocab_vectorizer = CountVectorizer(analyzer='word',
                                   binary=True,
                                   min_df=2,
                                   stop_words='english')
vocab_vectorizer.fit(questions)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vocab_vectorizer, 'get_feature_names_out'):
    vocab = list(vocab_vectorizer.get_feature_names_out())
else:
    vocab = list(vocab_vectorizer.get_feature_names())

In [7]:
# Number of terms kept in the learned vocabulary.
print(len(vocab))

798


In [8]:
# Binary document-term matrix for question1, restricted to the shared vocabulary
# (one row per question, one column per vocabulary term).
q1_vectorizer = CountVectorizer(vocabulary=vocab, binary=True, analyzer='word')
q1_docarray = q1_vectorizer.fit_transform(questions1).toarray()
q1_docterm = pd.DataFrame(data=q1_docarray, columns=vocab)

In [9]:
# Binary document-term matrix for question2, restricted to the shared vocabulary
# (one row per question, one column per vocabulary term).
q2_vectorizer = CountVectorizer(vocabulary=vocab, binary=True, analyzer='word')
q2_docarray = q2_vectorizer.fit_transform(questions2).toarray()
q2_docterm = pd.DataFrame(data=q2_docarray, columns=vocab)

In [10]:
# Both frames were built with columns=vocab, so they should share the same
# column count and have one row per question.
print(q1_docterm.shape)
print(q2_docterm.shape)

(1000, 798)
(1000, 798)


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Minimum cosine similarity for a pair to be predicted as a duplicate.
SIM_THRESHOLD = 0.75

# Take the number of pairs from the data instead of a hard-coded constant, and
# fail loudly if the feature matrices and labels are not aligned.
n_pairs = len(is_similar)
assert len(q1_docterm) == n_pairs and len(q2_docterm) == n_pairs, \
    "feature matrices and labels must have the same number of rows"

errors = 0
for i in range(n_pairs):
    # Positional access: row i of both matrices belongs to pair i regardless
    # of how the frames happen to be indexed.
    vector1 = q1_docterm.iloc[i]
    vector2 = q2_docterm.iloc[i]
    sim_score = 1 if cosine_similarity([vector1], [vector2])[0, 0] >= SIM_THRESHOLD else 0

    # A prediction that disagrees with the label counts as an error.
    if sim_score != is_similar[i]:
        errors += 1
acc = 1 - errors/n_pairs

print("Accuracy: {: 6.2f}%".format(acc*100))

Accuracy:  65.50%


## Semantic Features

In [12]:
# Load Spacy semantic model

import spacy

# NOTE: for performance reasons the parser, tagger, NER and textcat components
# are disabled; the visible cells below only read `.vector` from processed text.
# NOTE(review): newer spaCy pipelines may ship further components that this
# list does not disable — confirm against the installed model.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner', 'textcat'])

In [13]:
# preprocess text for semantic features

# Pull the question pairs and their labels out of the frame as plain Python lists.
questions1 = df['question1'].tolist()
questions2 = df['question2'].tolist()
is_similar = df['is_duplicate'].tolist()

def embed(X):
    '''
    Compute one spaCy document vector per input text.

    Parameters
    ----------
    X : list of strings
        Texts to embed. Every element is passed through str() before
        being handed to the pipeline.

    Returns
    -------
    pandas.DataFrame
        One row per element of X, in input order; row i holds the
        `.vector` of the processed i-th text.

    Notes
    -----
    Uses the module-level `nlp` pipeline and prints the shape of the
    input (after conversion to a numpy array) as a progress/sanity check.
    '''
    text_array = np.array(X)

    print(text_array.shape)

    # Keep the original per-element coercion to text.
    texts = [str(text) for text in text_array]

    # nlp.pipe streams the texts through the pipeline in batches, avoiding
    # the per-call overhead of invoking nlp() once for every question.
    vectors = [doc.vector for doc in nlp.pipe(texts)]

    return pd.DataFrame(vectors)

# Embed each side of the question pairs; both lists come from the same frame,
# so row i of vectors1 and row i of vectors2 belong to the same pair.
vectors1 = embed(questions1)
vectors2 = embed(questions2)

(1000,)
(1000,)


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Minimum cosine similarity for a pair to be predicted as a duplicate.
SIM_THRESHOLD = 0.95

# Take the number of pairs from the data instead of a hard-coded constant, and
# fail loudly if the embeddings and labels are not aligned.
n_pairs = len(is_similar)
assert len(vectors1) == n_pairs and len(vectors2) == n_pairs, \
    "embedding frames and labels must have the same number of rows"

errors = 0
for i in range(n_pairs):
    # Positional access: row i of both frames belongs to pair i regardless
    # of how the frames happen to be indexed.
    vector1 = vectors1.iloc[i]
    vector2 = vectors2.iloc[i]
    sim_score = 1 if cosine_similarity([vector1], [vector2])[0, 0] >= SIM_THRESHOLD else 0
    # A prediction that disagrees with the label counts as an error.
    if sim_score != is_similar[i]:
        errors += 1
acc = 1 - errors/n_pairs

print("Accuracy: {: 6.2f}%".format(acc*100))

Accuracy:  69.00%
