In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from bs4 import BeautifulSoup

In [67]:
df = pd.read_csv('/content/train.csv')

In [68]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404290 non-null  int64 
 1   qid1          404290 non-null  int64 
 2   qid2          404290 non-null  int64 
 3   question1     404289 non-null  object
 4   question2     404288 non-null  object
 5   is_duplicate  404290 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [70]:
df.sample(30000, random_state = 2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
398782,398782,496695,532029,What is the best marketing automation tool for...,What is the best marketing automation tool for...,1
115086,115086,187729,187730,I am poor but I want to invest. What should I do?,I am quite poor and I want to be very rich. Wh...,0
327711,327711,454161,454162,I am from India and live abroad. I met a guy f...,T.I.E.T to Thapar University to Thapar Univers...,0
367788,367788,498109,491396,Why do so many people in the U.S. hate the sou...,My boyfriend doesnt feel guilty when he hurts ...,0
151235,151235,237843,50930,Consequences of Bhopal gas tragedy?,What was the reason behind the Bhopal gas trag...,0
...,...,...,...,...,...,...
243932,243932,26193,356455,What are some good web scraping tutorials?,What are some good web scraping programs?,1
91980,91980,154063,154064,Can I apply for internet banking in SBI withou...,I have internet banking kit of SBI but it's no...,0
266955,266955,133017,384210,How much HE laundry detergent do you use in a ...,Can I use regular Dawn dishsoap in my dishwash...,0
71112,71112,122427,122428,What is the best way to understand and learn m...,What are some of the best ways to learn math?,1


In [71]:
def preprocess(q):

    q = str(q).lower().strip()

    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')

    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')

    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    # Decontracting words
    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953
    contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()

    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()


    return q

In [72]:
preprocess("I am Zeyam Ali matharu @ g,<p>ail$%$00</p>")

'i am zeyam ali matharu at g ail dollar percent dollar 00'

In [73]:
df['question1'] = df['question1'].apply(preprocess)
df['question2'] = df['question2'].apply(preprocess)

  q = BeautifulSoup(q)


In [74]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when 23 24 math is divi...,0
4,4,9,10,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,0


In [75]:
#no.of charchters in each question
df['q1-len'] = df['question1'].str.len()
df['q2_len'] = df['question2'].str.len()

In [76]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1-len,q2_len
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,65,56
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,50,87
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,72,58
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when 23 24 math is divi...,0,49,58
4,4,9,10,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,0,75,38


In [77]:
#words in each question
def words(s):
  list = []
  for word in s.split():
    list.append(word)



  return list

In [78]:
df['q1_words'] = pd.DataFrame({'q1_words': df['question1'].apply(words)})
df['q2_words'] = pd.DataFrame({'q2_words': df['question2'].apply(words)})

In [79]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1-len,q2_len,q1_words,q2_words
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,65,56,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv..."
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,50,87,"[what, is, the, story, of, kohinoor, koh, i, n...","[what, would, happen, if, the, indian, governm..."
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,72,58,"[how, can, i, increase, the, speed, of, my, in...","[how, can, internet, speed, be, increased, by,..."
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when 23 24 math is divi...,0,49,58,"[why, am, i, mentally, very, lonely, how, can,...","[find, the, remainder, when, 23, 24, math, is,..."
4,4,9,10,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,0,75,38,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]"


In [80]:
#No.of words in each question
def no_of_words(s):
  list = []
  for word in s.split():
    list.append(word)



  return len(list)

In [81]:
s = "i am zeyam ali"
no_of_words(s)

4

In [92]:
df['no_q1_words'] = pd.DataFrame(df['question1'].apply(no_of_words))
df['no_q2_words'] = pd.DataFrame(df['question2'].apply(no_of_words))


In [93]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1-len,q2_len,q1_words,q2_words,no_q1_words,no_q12_words,no_q2_words,common_words
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,65,56,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv...",14,12,12,12
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,50,87,"[what, is, the, story, of, kohinoor, koh, i, n...","[what, would, happen, if, the, indian, governm...",10,15,15,11
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,72,58,"[how, can, i, increase, the, speed, of, my, in...","[how, can, internet, speed, be, increased, by,...",14,10,10,14
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when 23 24 math is divi...,0,49,58,"[why, am, i, mentally, very, lonely, how, can,...","[find, the, remainder, when, 23, 24, math, is,...",11,12,12,11
4,4,9,10,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,0,75,38,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]",13,7,7,14


In [100]:
#common words
def common_words(w):
    w1 = set(map(lambda word: word.lower().strip() , w['question1'].split(" ")))
    w2 = set(map(lambda word : word.lower().strip() , w['question2'].split(" ") ))
    return len(w1 & w2)

In [102]:
df['common_words'] = df.apply(common_words, axis =1)

In [103]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1-len,q2_len,q1_words,q2_words,no_q1_words,no_q12_words,no_q2_words,common_words
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,65,56,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv...",14,12,12,11
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,50,87,"[what, is, the, story, of, kohinoor, koh, i, n...","[what, would, happen, if, the, indian, governm...",10,15,15,8
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,72,58,"[how, can, i, increase, the, speed, of, my, in...","[how, can, internet, speed, be, increased, by,...",14,10,10,4
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when 23 24 math is divi...,0,49,58,"[why, am, i, mentally, very, lonely, how, can,...","[find, the, remainder, when, 23, 24, math, is,...",11,12,12,1
4,4,9,10,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,0,75,38,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]",13,7,7,4


In [97]:
df.head(398783)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1-len,q2_len,q1_words,q2_words,no_q1_words,no_q12_words,no_q2_words,common_words
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,65,56,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv...",14,12,12,12
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,50,87,"[what, is, the, story, of, kohinoor, koh, i, n...","[what, would, happen, if, the, indian, governm...",10,15,15,11
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,72,58,"[how, can, i, increase, the, speed, of, my, in...","[how, can, internet, speed, be, increased, by,...",14,10,10,14
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when 23 24 math is divi...,0,49,58,"[why, am, i, mentally, very, lonely, how, can,...","[find, the, remainder, when, 23, 24, math, is,...",11,12,12,11
4,4,9,10,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,0,75,38,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]",13,7,7,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398778,398778,65294,105882,why does quora say your question needs improve...,why does my question need to be improved and h...,1,106,62,"[why, does, quora, say, your, question, needs,...","[why, does, my, question, need, to, be, improv...",20,13,13,19
398779,398779,175323,240565,how does one build a website from scratch,how can i make a website from scratch,1,41,37,"[how, does, one, build, a, website, from, scra...","[how, can, i, make, a, website, from, scratch]",8,8,8,8
398780,398780,532025,532026,how large typically is the entourage that acco...,how large is the u s presidents entourage when...,1,89,77,"[how, large, typically, is, the, entourage, th...","[how, large, is, the, u, s, presidents, entour...",15,14,14,15
398781,398781,532027,532028,who is amartya sen,what is the contribution of amartya sen in wel...,0,18,60,"[who, is, amartya, sen]","[what, is, the, contribution, of, amartya, sen...",4,10,10,4


In [106]:
def total_words(s):
  w1 = set(map(lambda word: word.lower().strip(), s['question1'].split(" ")))
  w2 = set(map(lambda word: word.lower().strip(), s['question2'].split(" ")))
  return (len(w1) + len(w2))


In [107]:
df['total_word_in_pair'] = df.apply(total_words,axis =1)

In [108]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1-len,q2_len,q1_words,q2_words,no_q1_words,no_q12_words,no_q2_words,common_words,total_word_in_pair
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,65,56,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv...",14,12,12,11,23
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,50,87,"[what, is, the, story, of, kohinoor, koh, i, n...","[what, would, happen, if, the, indian, governm...",10,15,15,8,26
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,72,58,"[how, can, i, increase, the, speed, of, my, in...","[how, can, internet, speed, be, increased, by,...",14,10,10,4,24
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when 23 24 math is divi...,0,49,58,"[why, am, i, mentally, very, lonely, how, can,...","[find, the, remainder, when, 23, 24, math, is,...",11,12,12,1,22
4,4,9,10,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,0,75,38,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]",13,7,7,4,21


In [115]:
df['word_share'] = round(df['common_words']/df['total_word_in_pair'],2)

In [116]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1-len,q2_len,q1_words,q2_words,no_q1_words,no_q12_words,no_q2_words,common_words,total_word_in_pair,word_share
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,65,56,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv...",14,12,12,11,23,0.48
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,50,87,"[what, is, the, story, of, kohinoor, koh, i, n...","[what, would, happen, if, the, indian, governm...",10,15,15,8,26,0.31
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,72,58,"[how, can, i, increase, the, speed, of, my, in...","[how, can, internet, speed, be, increased, by,...",14,10,10,4,24,0.17
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when 23 24 math is divi...,0,49,58,"[why, am, i, mentally, very, lonely, how, can,...","[find, the, remainder, when, 23, 24, math, is,...",11,12,12,1,22,0.05
4,4,9,10,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,0,75,38,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]",13,7,7,4,21,0.19
