In [4]:
pip install pandas numpy nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting regex>=2021.8.3
  Downloading regex-2022.7.25-cp38-cp38-macosx_11_0_arm64.whl (282 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m282.6/282.6 KB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: regex, nltk
[0mSuccessfully installed nltk-3.7 regex-2022.7.25
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


# Data cleaning

We do some basic data cleaning including stemming (i.e. removing suffixes) and removing common words, tagging parts of speech, and finding the duplicate words between given question pairs

In [53]:
import pandas as pd
import numpy as np

from nltk.stem.porter import *
from nltk.tokenize import *
from nltk.corpus import stopwords

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

data_file_location = "./quora_duplicate_questions.tsv"
data = pd.read_csv(
    data_file_location,
    sep='\t',
)

stemmed_q1s = []
tagged_q1s = []
stemmed_q2s = []
tagged_q2s = []
dups_all = []


common_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

for index, question in data.iterrows():
    question1 = question.question1
    question2 = question.question2

    try:
        tokens1 = [token for token in wordpunct_tokenize(question1) if token not in common_words]
        stemmed1 = [stemmer.stem(word) for word in tokens1]
        tagged1 = nltk.pos_tag(stemmed1)

        tokens2 = [token for token in wordpunct_tokenize(question2) if token not in common_words]
        stemmed2 = [stemmer.stem(word) for word in tokens2]
        tagged2 = nltk.pos_tag(stemmed2)
        
        dups = [word for word in stemmed1 if word in stemmed2]
    except:
        data.drop([index],inplace=True)
        print("Ran into problem with data, removing question:")
        print(question)
        continue
    stemmed_q1s.append(stemmed1)
    tagged_q1s.append(tagged1)
    stemmed_q2s.append(stemmed2)
    tagged_q2s.append(tagged2)
    dups_all.append(dups)

data.insert(4,'q1_stems',stemmed_q1s)
data.insert(5,'q1_tags',tagged_q1s)
data.insert(7,'q2_stems',stemmed_q2s)
data.insert(8,'q2_tags',tagged_q2s)
data.insert(9,'duplicates',dups_all)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/annguilinger/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/annguilinger/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/annguilinger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Ran into problem with data, removing question:
id                                      105780
qid1                                    174363
qid2                                    174364
question1       How can I develop android app?
question2                                  NaN
is_duplicate                                 0
Name: 105780, dtype: object
Ran into problem with data, removing question:
id                                        201841
qid1                                      303951
qid2                                      174364
question1       How can I create an Android app?
question2                                    NaN
is_duplicate                                   0
Name: 201841, dtype: object
Ran into problem with data, removing question:
id                                                         363362
qid1                                                       493340
qid2                                                       493341
question1                    

In [56]:
print(data.head())

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                            q1_stems  \
0  [what, step, step, guid, invest, share, market...   
1  [what, stori, kohinoor, (, koh, -, -, noor, ),...   
2  [how, i, increas, speed, internet, connect, us...   
3         [whi, i, mental, lone, ?, how, i, solv, ?]   
4  [which, one, dissolv, water, quikli, sugar, ,,...   

                                             q1_tags  \
0  [(what, WP), (step, VB), (step, NN), (guid, NN...   
1  [(what, WP), (stori, VBD), (kohinoor, NN), ((,...   
2  [(how, WRB), (i, JJ), (increas, VBP), (speed, ...   
3  [(w