In [1]:
import pandas as pd
import nltk
from nltk import word_tokenize

In [2]:
# Configure the lemmatizer.
# nltk's POS tagger emits Treebank tags, whereas WordNetLemmatizer expects the
# WordNet POS constants: ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'.
# Use get_wordnet_pos (defined below) to convert a Treebank tag to a WordNet
# tag before lemmatizing.
from nltk.stem.wordnet import WordNetLemmatizer
# Single lemmatizer instance reused by the lemmatization cells further down.
lemmatizer = WordNetLemmatizer()

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Any other tag yields 'n', so
    unrecognised tags are lemmatized as nouns.
    """
    # (tag prefix, name of the wordnet attribute) pairs, tested in this order.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            # Resolve the wordnet constant only once a prefix has matched.
            return getattr(wordnet, attr_name)
    return 'n'

In [3]:
# Load the training question pairs; the CSV's "id" column becomes the index.
df = pd.read_csv('../data/train.csv', index_col="id")

In [4]:
# Peek at the first rows to confirm the columns and the "id" index loaded as expected.
df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


id - the id of a training set question pair

qid1, qid2 - unique ids of each question (only available in train.csv)

question1, question2 - the full text of each question
is_duplicate - the target variable, set to 1 if question1 and question2 have essentially the same meaning, and 0 otherwise.

In [5]:
# (rows, columns); "id" is the index, so it is not counted among the columns.
df.shape

(404290, 5)

In [6]:
# Number of pairs labelled as duplicates (is_duplicate is 1 for a duplicate pair, 0 otherwise).
df.is_duplicate.sum()

149263

In [7]:
# Share of question pairs labelled duplicate, as a fraction of all rows
# (about 0.369, i.e. 36.9%).
df.is_duplicate.sum() / len(df)

0.369197853026293

In [8]:
# Pull out the raw question1 text; the steps explored next are
# tokenizing, lowercasing, POS-tagging and lemmatizing.
question1_raw = df["question1"]

In [9]:
# Lowercased text of the question at id 0 (the result is displayed, not stored).
question1_raw[0].lower()

'what is the step by step guide to invest in share market in india?'

In [10]:
# Tokenize the question at id 0. Note this uses the original-cased text, not the
# lowercased string from the previous cell, so 'What' keeps its capital.
tokens = word_tokenize(question1_raw[0])
tokens

['What',
 'is',
 'the',
 'step',
 'by',
 'step',
 'guide',
 'to',
 'invest',
 'in',
 'share',
 'market',
 'in',
 'india',
 '?']

In [11]:
# Tag each token with its part of speech; yields (token, Treebank tag) pairs.
tags = nltk.pos_tag(tokens)
tags

[('What', 'WP'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('step', 'NN'),
 ('by', 'IN'),
 ('step', 'NN'),
 ('guide', 'RB'),
 ('to', 'TO'),
 ('invest', 'VB'),
 ('in', 'IN'),
 ('share', 'NN'),
 ('market', 'NN'),
 ('in', 'IN'),
 ('india', 'NN'),
 ('?', '.')]

In [12]:
# Baseline: lemmatize every token as a verb, regardless of its actual POS tag.
for token in tokens:
    verb_lemma = lemmatizer.lemmatize(token, wordnet.VERB)
    print(verb_lemma)

What
be
the
step
by
step
guide
to
invest
in
share
market
in
india
?


In [13]:
# Pair each token with the WordNet POS derived from its Treebank tag.
wnl_pos = [(word, get_wordnet_pos(treebank_tag)) for word, treebank_tag in tags]

In [14]:
# Lemmatize each token using the WordNet POS that was paired with it.
for word, wn_pos in wnl_pos:
    print(lemmatizer.lemmatize(word, wn_pos))

What
be
the
step
by
step
guide
to
invest
in
share
market
in
india
?
