In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import SnowballStemmer
import re
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import os
# prints current conda environment
print(os.environ['CONDA_DEFAULT_ENV'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ajay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


base


In [2]:
df = pd.read_csv("data/question-pairs-dataset/questions.csv")
df = df.dropna(axis=0,how="any")
# df =df.loc[0:350000,:]
df.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [3]:
df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [4]:
print(df.is_duplicate.value_counts())
255045/len(df)

0    255042
1    149306
Name: is_duplicate, dtype: int64


0.6307561803199224

In [5]:
# taking look at some questions
print("Not duplicate")
print(df.question1[0])
print(df.question2[0])
print()
print("Not duplicate")
print(df.question1[1])
print(df.question2[1])
print()
print(df.question1[5])
print(df.question2[5])

Not duplicate
What is the step by step guide to invest in share market in india?
What is the step by step guide to invest in share market?

Not duplicate
What is the story of Kohinoor (Koh-i-Noor) Diamond?
What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?


In [6]:
def review_to_wordlist(review,remove_stopwords=True):
    # convert sentence to lower case and split it in separate words vector
    words = review.lower().split()
    if remove_stopwords:
        # Optionally remove stop words
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    review_text = " ".join(words)
    # clean the text
    # review_text = "AhbhbhBBjnj2121!??\\'ve@3AB\\'sggFF'll"
    review_text = re.sub("[^A-Za-z0-9(),!.?\'\']"," ",review_text)
    review_text = re.sub("\'s"," 's ",review_text)
    review_text = re.sub(r"\'ve", " 've ", review_text)
    review_text = re.sub(r"n\'t", " 't ", review_text)
    review_text = re.sub(r"\'re", " 're ", review_text)
    review_text = re.sub(r"\'d", " 'd ", review_text)
    review_text = re.sub(r"\'ll", " 'll ", review_text)
    review_text = re.sub(r",", " ", review_text)
    review_text = re.sub(r"\.", " ", review_text)
    review_text = re.sub(r"!", " ", review_text)
    review_text = re.sub(r"\(", " ( ", review_text)
    review_text = re.sub(r"\)", " ) ", review_text)
    review_text = re.sub(r"\?", " ", review_text)
    review_text = re.sub(r"\s{2,}", " ", review_text)
    words = review_text.split()
    # shorten words to their stems
    stemmer = SnowballStemmer("english")
    stemmed_words = [stemmer.stem(word) for word in words]
    review_text = " ".join(stemmed_words)
    return(review_text)


In [7]:
def process_questions(question_list, questions,question_list_name):
    for question in questions:
        question_list.append(review_to_wordlist(question))
        if len(question_list) % 10000 == 0:
            progress=(len(question_list)/len(df))*100
            print("{} is {} % complete".format(question_list_name,progress))


In [None]:


questions1 = []
process_questions(question_list=questions1,
                 questions=df.question1,
                 question_list_name="question 1")
print()
questions2 = []
process_questions(question_list=questions2,
                 questions=df.question2,
                 question_list_name="question 2")

question 1 is 2.4731172158635633 % complete
question 1 is 4.9462344317271265 % complete
question 1 is 7.41935164759069 % complete
question 1 is 9.892468863454253 % complete
question 1 is 12.365586079317815 % complete
question 1 is 14.83870329518138 % complete
question 1 is 17.311820511044942 % complete
question 1 is 19.784937726908506 % complete


In [None]:
# Take a look at first 5 pairs of questions
for i in range(5):
    print(questions1[i])
    print(questions2[i])
    print()

In [None]:
# stores the indices of unusable question
invalid_questions = []
for i in range(len(questions1)):
    if not re.search('[aeiouyAEIOUY]',questions1[i]) or not re.search("[aeiouyAEIOUY]",questions2[i]):
        print(i-len(invalid_questions))
        invalid_questions.append(i-len(invalid_questions))
print(len(invalid_questions))

In [None]:
# remove invalid questions
for index in invalid_questions:
    df = df[df.id != index]
    questions1.pop(index)
    questions2.pop(index)

In [None]:
# These questions are also unusable, but were not detected initially.
# They were found when the function 'cosine_sim' stopped due to an error

unexpected_invalid_questions = [36459,42272,65936,89323,304866,306827,353917,304865,306826,353916]
unexpected_invalid_questions = np.sort(unexpected_invalid_questions)
for index in unexpected_invalid_questions:
    df = df[df.id != index]
    questions1.pop(index)
    questions2.pop(index)


In [None]:
# use tfidfVectorizer() to transform the questions into vectors
# then compute their cosine similarity
vectorizer = TfidfVectorizer()
def cosine_sim(text1,text2):
    tfidf = vectorizer.fit_transform([text1,text2])
    return ((tfidf*tfidf.T).A)[0,1]

In [None]:
tfidf_scores= []
for i in range(len(questions1)):
    try:
        score = cosine_sim(questions1[i],questions2[i])
        tfidf_scores.append(score)
        if i % 10000 == 0:
            progress = (i/len(questions1))*100
            print("Similarity score is {} % complete".format(round(progress,2)))
    except:
        print("index is {},question1 is {}, questions2 is {}".format(i,questions1[i],questions2[i]))
        

In [None]:
text1 = "connect use vpn internet"
text2 =  "internet speed speed speed increas hack dns"
print(text1)
print(text2)
# print(((tfidf*tfidf.T).A)[0,1])
tfidf = vectorizer.fit_transform([text1,text2])

In [None]:
questions1[42272]

In [56]:
unexpected_invalid_questions = [36459,42272,65936,89323,304866,306827,353917,304865,306826,353916]


In [65]:
xx=unexpected_invalid_questions.sort()

In [64]:
xx

<function list.sort(*, key=None, reverse=False)>

In [66]:
print(xx)

None


array([ 36459,  42272,  65936,  89323, 304865, 304866, 306826, 306827,
       353916, 353917])