In [2]:
import pandas as pd
import datetime as dt
import nltk
import pickle
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument



In [3]:
# Import training and test data
TRAIN = pd.read_csv('../data.gi/train.csv')
TEST = pd.read_csv('../data.gi/test.csv')

In [5]:
# Create lookup table
train_lookup_df = TRAIN[['id', 'qid1', 'qid2', 'is_duplicate']]
train_lookup_df.to_pickle('./pickles.gi/train_lookup_df.pkl')

In [6]:
# Separate q1 and q2 into respective dataframes
# Then stack, sort, and reindex new dataframe
train_q1_df = TRAIN[['id', 'qid1', 'question1']]
train_q1_df.columns = ['pid', 'qid', 'question']
train_q2_df = TRAIN[['id', 'qid2', 'question2']]
train_q2_df.columns = ['pid', 'qid', 'question']
train_questions_df = pd.concat([train_q1_df, train_q2_df], ignore_index=True).sort_values(by=['pid', 'qid']).reset_index(drop=True)
# Add test set flag
values = [0] * len(train_questions_df.index)
train_questions_df = train_questions_df.assign(test=values)
train_questions_df.head()

Unnamed: 0,pid,qid,question,test
0,0,1,What is the step by step guide to invest in sh...,0
1,0,2,What is the step by step guide to invest in sh...,0
2,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...,0
3,1,4,What would happen if the Indian government sto...,0
4,2,5,How can I increase the speed of my internet co...,0


In [7]:
# Add qid's for question1 and question2
odd_range = pd.Series(range(1, len(TEST.index) * 2 + 1, 2))
even_range = pd.Series(range(2, len(TEST.index) * 2 + 1, 2))
TEST = TEST.assign(qid1=odd_range, qid2=even_range)
# Separate q1 and q2 into respective dataframes
# Then stack, sort, and reindex new dataframe
test_q1_df = TEST[['test_id', 'qid1', 'question1']]
test_q1_df.columns = ['pid', 'qid', 'question']
test_q2_df = TEST[['test_id', 'qid2', 'question2']]
test_q2_df.columns = ['pid', 'qid', 'question']
test_questions_df = pd.concat([test_q1_df, test_q2_df], ignore_index=True).sort_values(by=['pid', 'qid']).reset_index(drop=True)
# Add test set flag
values = [1] * len(test_questions_df.index)
test_questions_df = test_questions_df.assign(test=values)
test_questions_df.head()

Unnamed: 0,pid,qid,question,test
0,0,1,How does the Surface Pro himself 4 compare wit...,1
1,0,2,Why did Microsoft choose core m3 and not core ...,1
2,1,3,Should I have a hair transplant at age 24? How...,1
3,1,4,How much cost does hair transplant require?,1
4,2,5,What but is the best way to send money from Ch...,1


In [10]:
# Combine train and test sets
# Move test flag column to first position
questions_df = pd.concat([train_questions_df, test_questions_df], ignore_index=True).sort_values(by=['test', 'pid', 'qid']).reset_index(drop=True)
cols = questions_df.columns.tolist()
cols = cols[-1:] + cols[:-1]
questions_df = questions_df[cols]
questions_df.to_pickle('./pickles.gi/questions_df.pkl')
questions_df.head()

Unnamed: 0,test,pid,qid,question
0,0,0,1,What is the step by step guide to invest in sh...
1,0,0,2,What is the step by step guide to invest in sh...
2,0,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,0,1,4,What would happen if the Indian government sto...
4,0,2,5,How can I increase the speed of my internet co...


In [24]:
def preprocessText(questions):
    print("Preprocessing Started @ %s" % dt.datetime.now())
    print("----------------------------------------------------------")
    lowercased = [str(question).lower() for question in questions]
    print("Lowercasing Completed @ %s" % dt.datetime.now())
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized = [tokenizer.tokenize(question) for question in lowercased]
    print("Tokenizing Completed @ %s" % dt.datetime.now())
    stopset = set(stopwords.words('english'))
    filtered = [[token for token in tokens if token not in stopset] for tokens in tokenized]
    print("Stopword Filtering Completed @ %s" % dt.datetime.now())
    lemmatizer = WordNetLemmatizer()
    lemmatized = [[lemmatizer.lemmatize(token) for token in tokens] for tokens in filtered]
    print("Lemmatizing Completed @ %s" % dt.datetime.now())
    print("----------------------------------------------------------")
    print("Preprocessing Ended @ %s" % dt.datetime.now())
    return lemmatized

In [25]:
# Preprocess text for tagging and vectorization
questions_df['token_list'] = preprocessText(questions_df['question'])
questions_df.to_pickle('./pickles.gi/questions_with_tokens_df.pkl')
questions_df.head()

Preprocessing Started @ 2017-05-01 12:49:48.688175
----------------------------------------------------------
Lowercasing Completed @ 2017-05-01 12:49:56.983005
Tokenizing Completed @ 2017-05-01 12:50:39.149221
Stopword Filtering Completed @ 2017-05-01 12:51:21.952501
Lemmatizing Completed @ 2017-05-01 12:57:00.707373
----------------------------------------------------------
Preprocessing Ended @ 2017-05-01 12:57:00.719374


Unnamed: 0,test,pid,qid,question,token_list
0,0,0,1,What is the step by step guide to invest in sh...,"[step, step, guide, invest, share, market, india]"
1,0,0,2,What is the step by step guide to invest in sh...,"[step, step, guide, invest, share, market]"
2,0,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...,"[story, kohinoor, koh, noor, diamond]"
3,0,1,4,What would happen if the Indian government sto...,"[would, happen, indian, government, stole, koh..."
4,0,2,5,How can I increase the speed of my internet co...,"[increase, speed, internet, connection, using,..."


In [50]:
# Get list of tagged documents
train_tagged_doclist = [TaggedDocument(row[5], 'pid_' + str(row[2])) for row in questions_df.itertuples() if row[1] == 0]
test_tagged_doclist = [TaggedDocument(row[5], 'pid_' + str(row[2])) for row in questions_df.itertuples() if row[1] == 1]
tagged_doclist = train_tagged_docs + test_tagged_docs
pd.to_pickle(obj=tagged_doclist, path='./pickles.gi/tagged_doclist.pkl')
shuffled_tagged_doclist = tagged_doclist[:]  # for reshuffling per training pass
shuffled_tagged_doclist[:5]

[TaggedDocument(words=['step', 'step', 'guide', 'invest', 'share', 'market', 'india'], tags='pid0'),
 TaggedDocument(words=['step', 'step', 'guide', 'invest', 'share', 'market'], tags='pid0'),
 TaggedDocument(words=['story', 'kohinoor', 'koh', 'noor', 'diamond'], tags='pid1'),
 TaggedDocument(words=['would', 'happen', 'indian', 'government', 'stole', 'kohinoor', 'koh', 'noor', 'diamond', 'back'], tags='pid1'),
 TaggedDocument(words=['increase', 'speed', 'internet', 'connection', 'using', 'vpn'], tags='pid2')]