In [3]:
import pandas as pd
import numpy as np
import re
import nltk
#nltk.download()

In [4]:
text='''Following is total compensation for other presidents at private colleges in Ohio in 2015:

Grant Cornwell, College of Wooster (left in 2015): $911,651
Marvin Krislov, Oberlin College (left in 2016):  $829,913
Mark Roosevelt, Antioch College, (left in 2015): $507,672
Laurie Joyner, Wittenberg University (left in 2015): $463,504
Richard Giese, University of Mount Union (left in 2015): $453,800'''

In [5]:
def extract(text):
    result = re.findall(r'(\w+\s*\w*(?=,)),\s+(.*(?=\s+\()).*(\d{4})\)\:\s+\$(\d*,\d*)',text)
    return result
extract(text)          

[('Grant Cornwell', 'College of Wooster', '2015', '911,651'),
 ('Marvin Krislov', 'Oberlin College', '2016', '829,913'),
 ('Mark Roosevelt', 'Antioch College,', '2015', '507,672'),
 ('Laurie Joyner', 'Wittenberg University', '2015', '463,504'),
 ('Richard Giese', 'University of Mount Union', '2015', '453,800')]

In [6]:
df = pd.read_csv('quora_duplicate_question_500.csv',header = 0)
df

Unnamed: 0,q1,q2,is_duplicate
0,How do you take a screenshot on a Mac laptop?,How do I take a screenshot on my MacBook Pro? ...,1.0
1,Is the US election rigged?,Was the US election rigged?,1.0
2,How scary is it to drive on the road to Hana g...,Do I need a four-wheel-drive car to drive all ...,0.0
3,What should I do when my friends betray me?,What can I do when my friend betrayed me?,1.0
4,How can I see who blocked me at Facebook?,Will I see if someone blocked me of Facebook?,1.0
5,Why India does not have friendly relations wit...,How are India's relations with the U.S.A.?,0.0
6,Why are my bestfriend still ignoring me?,What should I do when my bestfriend is ignorin...,0.0
7,Did Trump win the election?,Does Donald Trump have any chance of winning t...,1.0
8,I am a straight A student but have no motivati...,My fiancÃ©e died recently and it pains my hear...,0.0
9,Who are the Rohingya Muslims?,Who are the Rohingya people?,1.0


In [7]:
import nltk
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en')
def tokenize(text,lemmatized = False,no_stopword = False):
    doc = nlp(text)
    stop_words = stopwords.words('english')
    if lemmatized :
        tokens = []
        for i in doc:
            tokens.append(i.lemma_)
        if no_stopword:
            tmp = tokens.copy()
            for i in tmp:
                if i in stop_words:
                    tokens.remove(i)
            return tokens
        else:
            return tokens
    else:
        tokens = [i.text for i in doc]
        if no_stopword:
            tmp = tokens.copy()
            for i in tmp:
                if i in stop_words:
                        tokens.remove(i)
            return tokens
        else:
            return tokens
tokenize("Why should she be dancing?",lemmatized=True,no_stopword=True)

['-PRON-', 'dance', '?']

In [8]:
from scipy.spatial import distance
from sklearn.preprocessing import normalize
def get_similarity(q1,q2,lemmatized = False,no_stopword = False):
    sim = None
    docs = q1+q2
    print(docs)
    token_count = {}
    for i,j in enumerate(docs):
        tokens = tokenize(j,lemmatized=lemmatized,no_stopword=no_stopword)
        freq = nltk.FreqDist(tokens)
        token_count[i] = dict(freq)
    df = pd.DataFrame.from_dict(token_count,orient = 'index')
    df = df.fillna(0)
    tf = df.values
    doc_len = tf.sum(axis = 1)
    tf = np.divide(tf, doc_len[:,None])
    dfeq = np.where(tf>0,1,0)
    smoothed_idf = np.log(np.divide(len(docs)+1, np.sum(dfeq, axis=0)+1))+1
    smoothed_tf_idf = normalize(tf*smoothed_idf)
    sim=1-distance.squareform(distance.pdist(smoothed_tf_idf, 'cosine'))
    sim_list = []
    for i in range(0,500):
        sim_list.append(sim[i][i+500])
    return sim_list

In [9]:
def predict(sim,ground_truth,threshold=0.5):
    predict = []
    count = np.sum(np.where(ground_truth>0,1,0),axis=0)
    count_same = 0
    for i in range(len(sim)):
        if sim[i]>threshold:
            predict.append(1.0)
        else:
            predict.append(0.0)
    predict = np.array(predict)
    for i in range(len(predict)):
        if predict[i] == ground_truth[i] == 1.0:
            count_same+=1
    recall = count_same/count
    return predict,recall    

In [10]:
def evaluate(sim,ground_truth,threshold=0.5):
    predict_this,recall = predict(sim,ground_truth,threshold=threshold)
    correct_count = 0
    count = np.sum(np.where(predict_this>0,1,0))
    for i in range(len(predict_this)):
        if predict_this[i] == ground_truth[i] == 1.0:
            correct_count+=1
    precision = correct_count/count
    return precision,recall

In [11]:
if __name__ == "__main__": 
    # Test Q1
    text='''Following is total compensation for other presidents at private colleges in Ohio in 2015:
            
            Grant Cornwell, College of Wooster (left in 2015): $911,651
            Marvin Krislov, Oberlin College (left in 2016):  $829,913
            Mark Roosevelt, Antioch College, (left in 2015): $507,672
            Laurie Joyner, Wittenberg University (left in 2015): $463,504
            Richard Giese, University of Mount Union (left in 2015): $453,800'''
    print("Test Q1")
    print(extract(text))
    
    data=pd.read_csv("quora_duplicate_question_500.csv",header=0)
    q1 = data["q1"].values.tolist()
    q2 = data["q2"].values.tolist()
    # Test Q2
    
    print("Test Q1")
    print("\nlemmatized: No, no_stopword: No")
    sim = get_similarity(q1,q2)
    pred, recall=predict(sim, data["is_duplicate"].values) 
    print(recall)
    
    print("\nlemmatized: Yes, no_stopword: No")
    sim = get_similarity(q1,q2, True)
    pred, recall=predict(sim, data["is_duplicate"].values) 
    print(recall)
    
    print("\nlemmatized: No, no_stopword: Yes")
    sim = get_similarity(q1,q2, False, True)
    pred, recall=predict(sim, data["is_duplicate"].values) 
    print(recall)
    print("\nlemmatized: Yes, no_stopword: Yes")
    sim = get_similarity(q1,q2, True, True)
    pred, recall=predict(sim, data["is_duplicate"].values) 
    print(recall)
    # Test Q3. Get similarity score, set threshold, and then
    sim = get_similarity(q1,q2)
    prec, rec = evaluate(sim, data["is_duplicate"].values, 0.5)
    print("\nlemmatized: No, no_stopword: No")
    print(prec,rec)
    
    sim = get_similarity(q1,q2,True)
    prec, rec = evaluate(sim, data["is_duplicate"].values, 0.5)
    print("\nlemmatized: Yes, no_stopword: No")
    print(prec,rec)
    
    sim = get_similarity(q1,q2,False,True)
    prec, rec = evaluate(sim, data["is_duplicate"].values, 0.5)
    print("\nlemmatized: No, no_stopword: Yes")
    print(prec,rec)
    
    sim = get_similarity(q1,q2,True,True)
    prec, rec = evaluate(sim, data["is_duplicate"].values, 0.5)
    print("\nlemmatized: Yes, no_stopword: Yes")
    print(prec,rec)

Test Q1
[('Grant Cornwell', 'College of Wooster', '2015', '911,651'), ('Marvin Krislov', 'Oberlin College', '2016', '829,913'), ('Mark Roosevelt', 'Antioch College,', '2015', '507,672'), ('Laurie Joyner', 'Wittenberg University', '2015', '463,504'), ('Richard Giese', 'University of Mount Union', '2015', '453,800')]
Test Q1

lemmatized: No, no_stopword: No
['How do you take a screenshot on a Mac laptop?', 'Is the US election rigged?', 'How scary is it to drive on the road to Hana given all of the turns?', 'What should I do when my friends betray me?', 'How can I see who blocked me at Facebook?', "Why India does not have friendly relations with it's neighbouring countries?", 'Why are my bestfriend still ignoring me?', 'Did Trump win the election?', 'I am a straight A student but have no motivation or will whatsoever to go to school. Can someone help me fight this?', 'Who are the Rohingya Muslims?', 'What are natural numbers?', 'Can we control our actions in a dream?', 'What is the differ

0.6304347826086957

lemmatized: Yes, no_stopword: No
['How do you take a screenshot on a Mac laptop?', 'Is the US election rigged?', 'How scary is it to drive on the road to Hana given all of the turns?', 'What should I do when my friends betray me?', 'How can I see who blocked me at Facebook?', "Why India does not have friendly relations with it's neighbouring countries?", 'Why are my bestfriend still ignoring me?', 'Did Trump win the election?', 'I am a straight A student but have no motivation or will whatsoever to go to school. Can someone help me fight this?', 'Who are the Rohingya Muslims?', 'What are natural numbers?', 'Can we control our actions in a dream?', 'What is the difference between a turkey and a chicken?', 'What is the opera song with a choir used for horror movies?', 'Did Ronald Reagan have a mannerism in his speech?', 'Who is the most powerful character in Skyrim apart from Alduin?', 'Where can I buy meldonium in Canada?', 'Who is israil friend?', "What words rank t

0.782608695652174

lemmatized: No, no_stopword: Yes
['How do you take a screenshot on a Mac laptop?', 'Is the US election rigged?', 'How scary is it to drive on the road to Hana given all of the turns?', 'What should I do when my friends betray me?', 'How can I see who blocked me at Facebook?', "Why India does not have friendly relations with it's neighbouring countries?", 'Why are my bestfriend still ignoring me?', 'Did Trump win the election?', 'I am a straight A student but have no motivation or will whatsoever to go to school. Can someone help me fight this?', 'Who are the Rohingya Muslims?', 'What are natural numbers?', 'Can we control our actions in a dream?', 'What is the difference between a turkey and a chicken?', 'What is the opera song with a choir used for horror movies?', 'Did Ronald Reagan have a mannerism in his speech?', 'Who is the most powerful character in Skyrim apart from Alduin?', 'Where can I buy meldonium in Canada?', 'Who is israil friend?', "What words rank th

0.6358695652173914

lemmatized: Yes, no_stopword: Yes
['How do you take a screenshot on a Mac laptop?', 'Is the US election rigged?', 'How scary is it to drive on the road to Hana given all of the turns?', 'What should I do when my friends betray me?', 'How can I see who blocked me at Facebook?', "Why India does not have friendly relations with it's neighbouring countries?", 'Why are my bestfriend still ignoring me?', 'Did Trump win the election?', 'I am a straight A student but have no motivation or will whatsoever to go to school. Can someone help me fight this?', 'Who are the Rohingya Muslims?', 'What are natural numbers?', 'Can we control our actions in a dream?', 'What is the difference between a turkey and a chicken?', 'What is the opera song with a choir used for horror movies?', 'Did Ronald Reagan have a mannerism in his speech?', 'Who is the most powerful character in Skyrim apart from Alduin?', 'Where can I buy meldonium in Canada?', 'Who is israil friend?', "What words rank 

0.7934782608695652
['How do you take a screenshot on a Mac laptop?', 'Is the US election rigged?', 'How scary is it to drive on the road to Hana given all of the turns?', 'What should I do when my friends betray me?', 'How can I see who blocked me at Facebook?', "Why India does not have friendly relations with it's neighbouring countries?", 'Why are my bestfriend still ignoring me?', 'Did Trump win the election?', 'I am a straight A student but have no motivation or will whatsoever to go to school. Can someone help me fight this?', 'Who are the Rohingya Muslims?', 'What are natural numbers?', 'Can we control our actions in a dream?', 'What is the difference between a turkey and a chicken?', 'What is the opera song with a choir used for horror movies?', 'Did Ronald Reagan have a mannerism in his speech?', 'Who is the most powerful character in Skyrim apart from Alduin?', 'Where can I buy meldonium in Canada?', 'Who is israil friend?', "What words rank the highest on Dictionary.com's dif


lemmatized: No, no_stopword: No
0.5523809523809524 0.6304347826086957
['How do you take a screenshot on a Mac laptop?', 'Is the US election rigged?', 'How scary is it to drive on the road to Hana given all of the turns?', 'What should I do when my friends betray me?', 'How can I see who blocked me at Facebook?', "Why India does not have friendly relations with it's neighbouring countries?", 'Why are my bestfriend still ignoring me?', 'Did Trump win the election?', 'I am a straight A student but have no motivation or will whatsoever to go to school. Can someone help me fight this?', 'Who are the Rohingya Muslims?', 'What are natural numbers?', 'Can we control our actions in a dream?', 'What is the difference between a turkey and a chicken?', 'What is the opera song with a choir used for horror movies?', 'Did Ronald Reagan have a mannerism in his speech?', 'Who is the most powerful character in Skyrim apart from Alduin?', 'Where can I buy meldonium in Canada?', 'Who is israil friend?', 


lemmatized: Yes, no_stopword: No
0.51985559566787 0.782608695652174
['How do you take a screenshot on a Mac laptop?', 'Is the US election rigged?', 'How scary is it to drive on the road to Hana given all of the turns?', 'What should I do when my friends betray me?', 'How can I see who blocked me at Facebook?', "Why India does not have friendly relations with it's neighbouring countries?", 'Why are my bestfriend still ignoring me?', 'Did Trump win the election?', 'I am a straight A student but have no motivation or will whatsoever to go to school. Can someone help me fight this?', 'Who are the Rohingya Muslims?', 'What are natural numbers?', 'Can we control our actions in a dream?', 'What is the difference between a turkey and a chicken?', 'What is the opera song with a choir used for horror movies?', 'Did Ronald Reagan have a mannerism in his speech?', 'Who is the most powerful character in Skyrim apart from Alduin?', 'Where can I buy meldonium in Canada?', 'Who is israil friend?', "W


lemmatized: No, no_stopword: Yes
0.5391705069124424 0.6358695652173914
['How do you take a screenshot on a Mac laptop?', 'Is the US election rigged?', 'How scary is it to drive on the road to Hana given all of the turns?', 'What should I do when my friends betray me?', 'How can I see who blocked me at Facebook?', "Why India does not have friendly relations with it's neighbouring countries?", 'Why are my bestfriend still ignoring me?', 'Did Trump win the election?', 'I am a straight A student but have no motivation or will whatsoever to go to school. Can someone help me fight this?', 'Who are the Rohingya Muslims?', 'What are natural numbers?', 'Can we control our actions in a dream?', 'What is the difference between a turkey and a chicken?', 'What is the opera song with a choir used for horror movies?', 'Did Ronald Reagan have a mannerism in his speech?', 'Who is the most powerful character in Skyrim apart from Alduin?', 'Where can I buy meldonium in Canada?', 'Who is israil friend?',


lemmatized: Yes, no_stopword: Yes
0.5104895104895105 0.7934782608695652
