In [1]:
import pandas as pd
import nltk
from nltk import word_tokenize

In [2]:
# Configure lemmatizer
# nltk uses treebank tags. WNLemmatizer uses ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
# use get_wordnet_pos to convert treebank tags to wordnet tags before lemmatizing
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 'n'

In [3]:
df = pd.read_csv('../data/train.csv', index_col="id")

In [4]:
df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


id - the id of a training set question pair

qid1, qid2 - unique ids of each question (only available in train.csv)

question1, question2 - the full text of each question
is_duplicate - the target variable, set to 1 if question1 and question2 have essentially the same meaning, and 0 otherwise.

In [5]:
df.shape

(404290, 5)

In [6]:
df.is_duplicate.sum()

149263

In [7]:
# percent duplicate questions == 36.9%
df.is_duplicate.sum() / df.shape[0]

0.369197853026293

In [8]:
# tokenize, lower, tag, lemmatize question text
question1_raw = df.question1

In [9]:
tokens = word_tokenize(question1_raw[0])
tokens

['What',
 'is',
 'the',
 'step',
 'by',
 'step',
 'guide',
 'to',
 'invest',
 'in',
 'share',
 'market',
 'in',
 'india',
 '?']

In [15]:
def tokenize_q(raw_q):
    return word_tokenize(question1_raw[0])

In [10]:
lowered = [t.lower() for t in tokens]

In [16]:
def lower(tokenized_q):
    return [t.lower() for t in tokenized_q]

In [11]:
tags = nltk.pos_tag(lowered)
tags

[('what', 'WP'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('step', 'NN'),
 ('by', 'IN'),
 ('step', 'NN'),
 ('guide', 'RB'),
 ('to', 'TO'),
 ('invest', 'VB'),
 ('in', 'IN'),
 ('share', 'NN'),
 ('market', 'NN'),
 ('in', 'IN'),
 ('india', 'NN'),
 ('?', '.')]

In [22]:
def tag(lowered_q):
    return nltk.pos_tag(lowered_q)

In [12]:
for x in lowered:
    print(lemmatizer.lemmatize(x, wordnet.VERB))

what
be
the
step
by
step
guide
to
invest
in
share
market
in
india
?


In [13]:
wnl_pos = []
for x in tags:
    wnl_pos.append((x[0], get_wordnet_pos(x[1])))

In [14]:
for x in wnl_pos:
    print(lemmatizer.lemmatize(x[0], x[1]))

what
be
the
step
by
step
guide
to
invest
in
share
market
in
india
?


In [27]:
# wrap functions up
def get_tags(question_str):
    return tag(lower(tokenize_q(question_str)))

In [24]:
# test functions on 2nd question
question2_raw = df.question1[1]

In [28]:
get_lemmas_tags(question2_raw)

[('what', 'WP'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('step', 'NN'),
 ('by', 'IN'),
 ('step', 'NN'),
 ('guide', 'RB'),
 ('to', 'TO'),
 ('invest', 'VB'),
 ('in', 'IN'),
 ('share', 'NN'),
 ('market', 'NN'),
 ('in', 'IN'),
 ('india', 'NN'),
 ('?', '.')]

In [43]:
def get_lemmas(question_str):
    word_tag_pairs = tag(lower(tokenize_q(question_str)))
    wordnet_pos_converted = [(x[0], get_wordnet_pos(x[1])) for x in word_tag_pairs]
    lemmas_list = [lemmatizer.lemmatize(x[0], x[1]) for x in wordnet_pos_converted]
    return ' '.join(lemmas_list)

In [44]:
get_lemmas(question2_raw)

'what be the step by step guide to invest in share market in india ?'

In [45]:
# lemmatize all questions in question1 col
question1_lemmas = [get_lemmas(question) for question in df.question1]

In [46]:
question1_lemmas[:5]

['what be the step by step guide to invest in share market in india ?',
 'what be the step by step guide to invest in share market in india ?',
 'what be the step by step guide to invest in share market in india ?',
 'what be the step by step guide to invest in share market in india ?',
 'what be the step by step guide to invest in share market in india ?']

In [49]:
len(question1_lemmas)

404290

In [50]:
len(set(question1_lemmas))

1

In [51]:
test = [question for question in df.question1]

In [52]:
test[:5]

['What is the step by step guide to invest in share market in india?',
 'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
 'How can I increase the speed of my internet connection while using a VPN?',
 'Why am I mentally very lonely? How can I solve it?',
 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?']