In [1]:
import nltk
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer

In [2]:
# Import training data into dataframe
TRAIN = pd.read_csv('../data.gi/train.csv')
TRAIN.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# Create lookup table
lookup_df = TRAIN[['id', 'qid1', 'qid2', 'is_duplicate']]

In [4]:
# Separate q1 and q2 into respective dataframes
# Then stack, sort, and reindex new dataframe
q1_df = TRAIN[['id', 'qid1', 'question1']]
q1_df.columns = ['pid', 'qid', 'question']
q2_df = TRAIN[['id', 'qid2', 'question2']]
q2_df.columns = ['pid', 'qid', 'question']
questions_df = pd.concat([q1_df, q2_df], ignore_index=True).sort_values(by=['pid', 'qid']).reset_index(drop=True)
questions_df.head()

Unnamed: 0,pid,qid,question
0,0,1,What is the step by step guide to invest in sh...
1,0,2,What is the step by step guide to invest in sh...
2,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,1,4,What would happen if the Indian government sto...
4,2,5,How can I increase the speed of my internet co...


In [5]:
# Parse to string, lowercase, tokenize, filter out stopwords, and stem
#def cleanTokenizeFilterStem(questions):
#    tokenizer = RegexpTokenizer(r'\w+')
#    stemmer = SnowballStemmer('english')
#    stopwords = set('for a of the and to in'.split(' '))
#    cleaned = [str(question).lower() for question in questions]
#    tokenized = [tokenizer.tokenize(question) for question in cleaned]
#    filtered = [[token for token in tokens if token not in stopwords] for tokens in tokenized]
#    [[stemmer.stem(token) for token in tokens if token not in stopwords] for tokens in filtered]
#
#cleanTokenizeFilterStem(questions_df['question'])

In [6]:
# Parse questions to lowercase string
q_cleaned = [str(question).lower() for question in questions_df['question']]
q_cleaned

['what is the step by step guide to invest in share market in india?',
 'what is the step by step guide to invest in share market?',
 'what is the story of kohinoor (koh-i-noor) diamond?',
 'what would happen if the indian government stole the kohinoor (koh-i-noor) diamond back?',
 'how can i increase the speed of my internet connection while using a vpn?',
 'how can internet speed be increased by hacking through dns?',
 'why am i mentally very lonely? how can i solve it?',
 'find the remainder when [math]23^{24}[/math] is divided by 24,23?',
 'which one dissolve in water quikly sugar, salt, methane and carbon di oxide?',
 'which fish would survive in salt water?',
 'astrology: i am a capricorn sun cap moon and cap rising...what does that say about me?',
 "i'm a triple capricorn (sun, moon and ascendant in capricorn) what does this say about me?",
 'should i buy tiago?',
 'what keeps childern active and far from phone and video games?',
 'how can i be a good geologist?',
 'what should 

In [7]:
# Remove puncutation and tokenize
tokenizer = RegexpTokenizer(r'\w+')
q_tokenized = [tokenizer.tokenize(question) for question in q_cleaned]
q_tokenized

[['what',
  'is',
  'the',
  'step',
  'by',
  'step',
  'guide',
  'to',
  'invest',
  'in',
  'share',
  'market',
  'in',
  'india'],
 ['what',
  'is',
  'the',
  'step',
  'by',
  'step',
  'guide',
  'to',
  'invest',
  'in',
  'share',
  'market'],
 ['what',
  'is',
  'the',
  'story',
  'of',
  'kohinoor',
  'koh',
  'i',
  'noor',
  'diamond'],
 ['what',
  'would',
  'happen',
  'if',
  'the',
  'indian',
  'government',
  'stole',
  'the',
  'kohinoor',
  'koh',
  'i',
  'noor',
  'diamond',
  'back'],
 ['how',
  'can',
  'i',
  'increase',
  'the',
  'speed',
  'of',
  'my',
  'internet',
  'connection',
  'while',
  'using',
  'a',
  'vpn'],
 ['how',
  'can',
  'internet',
  'speed',
  'be',
  'increased',
  'by',
  'hacking',
  'through',
  'dns'],
 ['why',
  'am',
  'i',
  'mentally',
  'very',
  'lonely',
  'how',
  'can',
  'i',
  'solve',
  'it'],
 ['find',
  'the',
  'remainder',
  'when',
  'math',
  '23',
  '24',
  'math',
  'is',
  'divided',
  'by',
  '24',
  '23']

In [8]:
# Filter out stopwords
stopwords = set('for a of the and to in'.split(' '))
q_filtered = [[token for token in tokens if token not in stopwords] for tokens in q_tokenized]
q_filtered

[['what',
  'is',
  'step',
  'by',
  'step',
  'guide',
  'invest',
  'share',
  'market',
  'india'],
 ['what', 'is', 'step', 'by', 'step', 'guide', 'invest', 'share', 'market'],
 ['what', 'is', 'story', 'kohinoor', 'koh', 'i', 'noor', 'diamond'],
 ['what',
  'would',
  'happen',
  'if',
  'indian',
  'government',
  'stole',
  'kohinoor',
  'koh',
  'i',
  'noor',
  'diamond',
  'back'],
 ['how',
  'can',
  'i',
  'increase',
  'speed',
  'my',
  'internet',
  'connection',
  'while',
  'using',
  'vpn'],
 ['how',
  'can',
  'internet',
  'speed',
  'be',
  'increased',
  'by',
  'hacking',
  'through',
  'dns'],
 ['why',
  'am',
  'i',
  'mentally',
  'very',
  'lonely',
  'how',
  'can',
  'i',
  'solve',
  'it'],
 ['find',
  'remainder',
  'when',
  'math',
  '23',
  '24',
  'math',
  'is',
  'divided',
  'by',
  '24',
  '23'],
 ['which',
  'one',
  'dissolve',
  'water',
  'quikly',
  'sugar',
  'salt',
  'methane',
  'carbon',
  'di',
  'oxide'],
 ['which', 'fish', 'would', 'su

In [9]:
# Stem tokens
stemmer = SnowballStemmer('english')
q_stemmed = [[stemmer.stem(token) for token in tokens if token not in stopwords] for tokens in q_filtered]
q_stemmed

[['what',
  'is',
  'step',
  'by',
  'step',
  'guid',
  'invest',
  'share',
  'market',
  'india'],
 ['what', 'is', 'step', 'by', 'step', 'guid', 'invest', 'share', 'market'],
 ['what', 'is', 'stori', 'kohinoor', 'koh', 'i', 'noor', 'diamond'],
 ['what',
  'would',
  'happen',
  'if',
  'indian',
  'govern',
  'stole',
  'kohinoor',
  'koh',
  'i',
  'noor',
  'diamond',
  'back'],
 ['how',
  'can',
  'i',
  'increas',
  'speed',
  'my',
  'internet',
  'connect',
  'while',
  'use',
  'vpn'],
 ['how',
  'can',
  'internet',
  'speed',
  'be',
  'increas',
  'by',
  'hack',
  'through',
  'dns'],
 ['whi', 'am', 'i', 'mental', 'veri', 'lone', 'how', 'can', 'i', 'solv', 'it'],
 ['find',
  'remaind',
  'when',
  'math',
  '23',
  '24',
  'math',
  'is',
  'divid',
  'by',
  '24',
  '23'],
 ['which',
  'one',
  'dissolv',
  'water',
  'quik',
  'sugar',
  'salt',
  'methan',
  'carbon',
  'di',
  'oxid'],
 ['which', 'fish', 'would', 'surviv', 'salt', 'water'],
 ['astrolog',
  'i',
  'am

In [12]:
questions_df['tokens'] = q_stemmed
questions_df.head()

Unnamed: 0,pid,qid,question,tokens
0,0,1,What is the step by step guide to invest in sh...,"[what, is, step, by, step, guid, invest, share..."
1,0,2,What is the step by step guide to invest in sh...,"[what, is, step, by, step, guid, invest, share..."
2,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...,"[what, is, stori, kohinoor, koh, i, noor, diam..."
3,1,4,What would happen if the Indian government sto...,"[what, would, happen, if, indian, govern, stol..."
4,2,5,How can I increase the speed of my internet co...,"[how, can, i, increas, speed, my, internet, co..."
