#### Load Dataset

We are using the **Quora question pairs** dataset. It contains pairs of questions and a label indicating whether the two questions are semantically the same or not.

In [None]:
!wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv

In [None]:
# List the working directory (e.g. to check that the downloaded file is there).
!ls -l

In [None]:
import pandas as pd
import numpy as np

In [None]:
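# Note: the file is tab-separated, so reading it with the default comma separator
# (as in the line below) is not appropriate; see the next cell, which passes sep='\t'.
# df = pd.read_csv('quora_duplicate_questions.tsv')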

In [None]:
# Read the tab-separated file into a DataFrame.
df = pd.read_csv('quora_duplicate_questions.tsv', sep='\t')

In [None]:
# Show 5 randomly chosen rows (no seed is passed, so the selection changes per run).
df.sample(n=5)

In [None]:
# Write the full DataFrame to test.csv (comma-separated, index included by default).
# NOTE(review): test.csv is not read anywhere in the visible part of the notebook —
# confirm this export is still needed.
df.to_csv("test.csv")

In [None]:
# (rows, columns) of the full dataset.
df.shape

In [None]:
# Let's consider only 5000 records for the demo here.
# random_state pins the sample so that "Restart & Run All" works on the same rows
# (and therefore reproduces the same vocabulary and accuracy) on every run.
# reset_index(drop=True) renumbers the rows 0..4999 so that .loc[idx] lookups with
# a random integer below hit a valid label.
small_df = df.sample(n=5000, random_state=42).reset_index(drop=True)

In [None]:
# (rows, columns) of the sampled subset.
small_df.shape

Check question pairs

In [None]:
# Pick one random row and print both raw questions together with their label.
idx = np.random.randint(0, small_df.shape[0])

for caption, column in (
    ('First Question:', 'question1'),
    ('Second Question:', 'question2'),
    ('Are questions duplicate?:', 'is_duplicate'),
):
    print(caption, small_df.loc[idx, column])

In [None]:
# How many duplicate vs. non-duplicate pairs are in the sample.
# value_counts() gives one row count per label. groupby(...).count() instead
# reported the non-null count of every other column, which is wider than needed
# and under-counts pairs whenever a column contains missing values.
small_df['is_duplicate'].value_counts()

#### Text Preprocessing

In [None]:
import re

Function to clean up text. More steps can be added here, e.g. lemmatization.

In [None]:
def clean_str(text):
    """Strip everything except ASCII letters and whitespace, then lower-case.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()`` first, so non-string
        input is accepted (a float NaN, for example, becomes the text ``'nan'``).

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # The character class must be A-Z: the range A-z also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    pattern = r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', str(text))
    return text.lower()

Apply the above function on question pairs

In [None]:
# Add one cleaned column per raw question column (target column -> source column).
cleaned_columns = {
    'clean_question1': 'question1',
    'clean_question2': 'question2',
}
for target, source in cleaned_columns.items():
    small_df[target] = small_df[source].apply(clean_str)

In [None]:
# Show 5 random rows, now including the clean_question1 / clean_question2 columns.
small_df.sample(n=5)

Check question pairs with the cleaned text

In [None]:
# Pick one random row and print both cleaned questions together with their label.
idx = np.random.randint(0, small_df.shape[0])

for caption, column in (
    ('First Question:', 'clean_question1'),
    ('Second Question:', 'clean_question2'),
    ('Are questions duplicate?:', 'is_duplicate'),
):
    print(caption, small_df.loc[idx, column])

#### Vectorization

We are using TF-IDF vectorization here

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit the TF-IDF vocabulary on every cleaned question (both columns together).
all_questions = (
    small_df['clean_question1'].tolist()
    + small_df['clean_question2'].tolist()
)
vect = TfidfVectorizer(stop_words='english')
vect.fit(all_questions)

# Number of distinct terms learned by the vectorizer.
len(vect.get_feature_names_out())

#### Check Similarity using TF-IDF

In [None]:
# Turn each cleaned question column into a TF-IDF matrix (one row per question).
question1, question2 = (
    vect.transform(small_df[column].tolist())
    for column in ('clean_question1', 'clean_question2')
)

In [None]:
# Shapes of the two TF-IDF matrices.
question1.shape, question2.shape

We are using cosine similarity here to measure the similarity between two vectors. Other options include Euclidean distance, the Jaccard index, Manhattan distance, Word Mover's Distance (WMD), etc.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pick one random row, print the cleaned pair with its label, then the cosine
# similarity of the two TF-IDF rows.
idx = np.random.randint(0, small_df.shape[0])

for caption, column in (
    ('First Question:', 'clean_question1'),
    ('Second Question:', 'clean_question2'),
    ('Are questions duplicate?:', 'is_duplicate'),
):
    print(caption, small_df.loc[idx, column])

pair_similarity = cosine_similarity(question1[idx], question2[idx])
print('Cosine Similarity:', pair_similarity)

Function to calculate similarity based on a threshold

In [None]:
def check_similarity_tfidf(row, threshold=0.5):
    """Predict whether a question pair is a duplicate from TF-IDF cosine similarity.

    Parameters
    ----------
    row : pandas.Series
        One row of the question DataFrame; must contain the
        ``clean_question1`` and ``clean_question2`` entries.
    threshold : float, default 0.5
        Minimum cosine similarity for the pair to count as duplicate.

    Returns
    -------
    int
        1 if the similarity is at least ``threshold``, otherwise 0.

    Notes
    -----
    Uses the notebook-level fitted vectorizer ``vect``.
    """
    # Look the columns up by name: positional access (row[6], row[7]) silently
    # breaks when columns are added or reordered, and integer keys on a
    # label-indexed Series are deprecated in pandas.
    q1_vec = vect.transform([str(row['clean_question1'])])
    q2_vec = vect.transform([str(row['clean_question2'])])

    # cosine_similarity returns a 1x1 matrix; pull out the scalar before comparing.
    similarity = cosine_similarity(q1_vec, q2_vec)[0, 0]
    return 1 if similarity >= threshold else 0

In [None]:
# Apply the function above to every row (axis=1) and store its 0/1 result
# in a new 'tfidf_similarity' column.
small_df['tfidf_similarity'] = small_df.apply(check_similarity_tfidf, axis=1)

In [None]:
# Show 5 random rows, now including the tfidf_similarity column.
small_df.sample(n=5)

In [None]:
# Save the sample including the TF-IDF predictions (index written by default).
# NOTE(review): the file name "tdfidf.csv" looks like a typo for "tfidf.csv";
# left unchanged in case something outside this notebook reads it.
small_df.to_csv("tdfidf.csv")

Calculate accuracy of this approach

In [None]:
# Element-wise comparison of the true label with the TF-IDF prediction
# (True where they agree).
small_df['is_duplicate'] == small_df['tfidf_similarity']

In [None]:
# Accuracy = share of rows where the prediction equals the true label.
prediction_matches = small_df['is_duplicate'] == small_df['tfidf_similarity']
prediction_matches.mean()

#### Check Similarity using pre-trained word embeddings (GloVe, similar to Word2Vec)

In [None]:
import gensim.downloader as api

In [None]:
# Load the pre-trained GloVe vectors (similar to Word2Vec) through gensim's downloader.
# The "-50" in the model name matches the embedding_size=50 default used by the
# sentence-embedding functions below — presumably 50-dimensional vectors; TODO confirm.
model = api.load('glove-wiki-gigaword-50')

In [None]:
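#Model vocabulary (list of known words)
#gensim >= 4.0 exposes it as model.index_to_key; the older name was model.index2word
#model.index_to_key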

In [None]:
def sentence2vec(model, sentence, embedding_size=50):
    """Embed a sentence as the plain average of its known word vectors.

    Parameters
    ----------
    model : mapping-like word-vector model
        Must support ``word in model`` and ``model[word]`` (e.g. gensim
        ``KeyedVectors``).
    sentence : str
        Text to embed; it is split on whitespace.
    embedding_size : int, default 50
        Length of the word vectors returned by ``model``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, embedding_size)``. It is all zeros when no word
        of the sentence is known to the model.
    """
    #Initialize sentence vector with zeros
    sent2vec = np.zeros(embedding_size)
    sentence_length = 0

    for word in sentence.split():
        # Membership is tested on the model itself: `model.index2word` no longer
        # exists in gensim >= 4.0, and scanning that list cost O(vocabulary) per
        # word, whereas `in` on the model is a hash lookup.
        if word in model:
            sent2vec = np.add(sent2vec, model[word])
            sentence_length += 1

    #Average features (divide by the number of words actually found)
    if sentence_length > 0:
        sent2vec = np.divide(sent2vec, sentence_length)

    # Add a leading axis so the result can be passed straight to cosine_similarity.
    return np.expand_dims(sent2vec, axis=0)

Find Cosine similarity between Question embeddings

In [None]:
# Pick one random row, print the cleaned pair with its label, then compare the
# averaged word-embedding vectors of the two questions.
idx = np.random.randint(0, small_df.shape[0])
for caption, column in (
    ('First Question:', 'clean_question1'),
    ('Second Question:', 'clean_question2'),
    ('Are questions duplicate?:', 'is_duplicate'),
):
    print(caption, small_df.loc[idx, column])

#Get Sentence embeddings
q1_embed, q2_embed = (
    sentence2vec(model, small_df.loc[idx, column])
    for column in ('clean_question1', 'clean_question2')
)
question_similarity = cosine_similarity(q1_embed, q2_embed)
print('Cosine Similarity:',question_similarity)

#### Check Similarity using Word2Vec with SIF (Smooth Inverse Frequency)

In [None]:
from collections import Counter
import itertools
from nltk.tokenize import word_tokenize

In [None]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look for the
# 'punkt_tab' resource instead of 'punkt', so request both; a resource that an
# installed NLTK version does not know is reported by nltk.download and skipped.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

Building word frequency map

In [None]:
def map_word_frequency(document):
    """Count how often each token occurs across a collection of token lists.

    ``document`` is an iterable of iterables of tokens; the result is a
    ``Counter`` mapping each token to its total number of occurrences.
    """
    every_token = (token for tokens in document for token in tokens)
    return Counter(every_token)

In [None]:
# Tokenize every cleaned question into a list of words.
tokenized_question1 = list(map(word_tokenize, small_df['clean_question1'].tolist()))
tokenized_question2 = list(map(word_tokenize, small_df['clean_question2'].tolist()))

# Count word occurrences over both question columns together.
all_tokenized_questions = tokenized_question1 + tokenized_question2
word_count_map = map_word_frequency(all_tokenized_questions)

In [None]:
# Show only the 20 most frequent words instead of dumping the whole counter,
# which has one entry per distinct word and floods the cell output.
word_count_map.most_common(20)

In [None]:
# Occurrence count of one example word (a Counter returns 0 for an absent word).
word_count_map['nickname']

In [None]:
def sentence2vec_sif(model, sentence, word_count_map, embedding_size=50, smoothing=0.01):
    """Embed a sentence as a frequency-weighted average of its known word vectors.

    Each word vector is scaled by ``smoothing / (smoothing + count)`` before
    averaging, so that frequent words contribute less than rare ones.

    Parameters
    ----------
    model : mapping-like word-vector model
        Must support ``word in model`` and ``model[word]`` (e.g. gensim
        ``KeyedVectors``).
    sentence : str
        Text to embed; it is split on whitespace.
    word_count_map : mapping of str to number
        Occurrence count per word. A word missing from the map is treated as
        having count 0 (i.e. it gets the maximum weight of 1).
    embedding_size : int, default 50
        Length of the word vectors returned by ``model``.
    smoothing : float, default 0.01
        The constant ``a`` in the weight ``a / (a + count)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, embedding_size)``. It is all zeros when no word
        of the sentence is known to the model.

    Notes
    -----
    NOTE(review): the weight uses the raw occurrence count. The SIF formulation
    is usually stated with the word's relative frequency instead — confirm
    whether the counts should be normalised by the corpus size.
    """
    #Initialize sentence vector with zeros
    sent2vec = np.zeros(embedding_size)
    sentence_length = 0

    for word in sentence.split():
        # Membership is tested on the model itself: `model.index2word` no longer
        # exists in gensim >= 4.0, and scanning that list cost O(vocabulary) per
        # word, whereas `in` on the model is a hash lookup.
        if word in model:
            #A word importance, high for less frequent words
            word_importance = smoothing / (smoothing + word_count_map.get(word, 0))
            word_embed = np.multiply(model[word], word_importance)

            sent2vec = np.add(sent2vec, word_embed)
            sentence_length += 1

    #Average features (divide by the number of words actually found)
    if sentence_length > 0:
        sent2vec = np.divide(sent2vec, sentence_length)

    # Add a leading axis so the result can be passed straight to cosine_similarity.
    return np.expand_dims(sent2vec, axis=0)

In [None]:
# Pick one random row, print the cleaned pair with its label, then compare the
# frequency-weighted (SIF-style) sentence embeddings of the two questions.
idx = np.random.randint(0, small_df.shape[0])
for caption, column in (
    ('First Question:', 'clean_question1'),
    ('Second Question:', 'clean_question2'),
    ('Are questions duplicate?:', 'is_duplicate'),
):
    print(caption, small_df.loc[idx, column])

#Get Sentence embeddings
q1_embed, q2_embed = (
    sentence2vec_sif(model, small_df.loc[idx, column], word_count_map)
    for column in ('clean_question1', 'clean_question2')
)
question_similarity = cosine_similarity(q1_embed, q2_embed)
print('Cosine Similarity:',question_similarity)