## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline

## Config

In [3]:
# Widen the per-cell display limit so long question strings are shown in
# full when DataFrames are rendered.
pd.options.display.max_colwidth = 2000

In [4]:
# Root directory for the input CSVs and the `preproc/` outputs. The trailing
# slash matters: every path below is built by plain string concatenation.
data_folder = '../data/'

In [5]:
# First value of the sequential `qid`s generated for test-set questions.
# NOTE(review): presumably chosen to stay clear of the training qids -- confirm.
TEST_ID_OFFSET = 10 ** 6

## Read Data

In [6]:
# Load the training question pairs; later cells read its qid1/question1 and
# qid2/question2 columns.
questions_train = pd.read_csv(data_folder + 'train.csv')

In [7]:
# Load the test question pairs; later cells read its test_id, question1 and
# question2 columns.
questions_test = pd.read_csv(data_folder + 'test.csv')

## Remove Duplicate Questions

### Train

In [8]:
# Project each side of the training pairs onto a common (qid, question)
# schema so the two halves can be stacked afterwards. Despite the `_unique`
# suffix, no de-duplication happens at this point.
train_q1_unique = (
    questions_train.loc[:, ['qid1', 'question1']]
    .rename(columns={'qid1': 'qid', 'question1': 'question'})
    .sort_values(by='qid')
)

train_q2_unique = (
    questions_train.loc[:, ['qid2', 'question2']]
    .rename(columns={'qid2': 'qid', 'question2': 'question'})
    .sort_values(by='qid')
)

In [9]:
# Stack both halves, discard rows with a missing value, collapse exact
# (qid, question) repeats, then order by qid under a fresh 0..n-1 index.
train_questions_unique = (
    pd.concat([train_q1_unique, train_q2_unique])
    .dropna()
    .drop_duplicates()
    .sort_values(by='qid')
    .reset_index(drop=True)
)

In [10]:
# Pin the column order to (qid, question) before previewing and saving.
train_questions_unique = train_questions_unique.loc[:, ['qid', 'question']]

In [11]:
# Preview the first five de-duplicated training questions.
train_questions_unique.head(n=5)

Unnamed: 0,qid,question
0,1,What is the step by step guide to invest in share market in india?
1,2,What is the step by step guide to invest in share market?
2,3,What is the story of Kohinoor (Koh-i-Noor) Diamond?
3,4,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?
4,5,How can I increase the speed of my internet connection while using a VPN?


In [12]:
# Persist the de-duplicated training questions with a header row and without
# the row index. `index=False` is the documented way to omit the index; the
# previous `index=None` only worked because None happens to be falsy.
# The `preproc/` directory must already exist -- to_csv does not create it.
train_questions_unique.to_csv(
    data_folder + 'preproc/unique_questions_train.csv',
    header=True,
    index=False,
)

### Test

In [13]:
# Stack question1 and question2 of the test pairs into one long
# (test_id, question) frame: all question1 rows first, then all question2 rows.
test_questions_unique = pd.concat([
    questions_test[['test_id', column]].rename(columns={column: 'question'})
    for column in ('question1', 'question2')
])

In [14]:
# Clean the stacked test questions:
#   * drop rows with a missing value;
#   * drop exact (test_id, question) repeats, e.g. a pair whose two questions
#     are identical -- the same text under different test_ids is kept;
#   * order by test_id with a stable sort, so that within a pair question1
#     always precedes question2 and the positional qids assigned afterwards
#     are reproducible (the default quicksort gives no guarantee on the order
#     of tied keys, and every test_id is tied between its two questions);
#   * renumber the index 0..n-1.
test_questions_unique = (
    test_questions_unique
    .dropna()
    .drop_duplicates()
    .sort_values('test_id', kind='mergesort')
    .reset_index(drop=True)
)

In [15]:
# Give every test question a sequential qid starting at TEST_ID_OFFSET.
# The Series is aligned on the index, so this relies on the frame having the
# default 0..n-1 index.
test_questions_unique['qid'] = pd.Series(
    range(TEST_ID_OFFSET, TEST_ID_OFFSET + len(test_questions_unique))
)

In [16]:
# Reorder the columns so the generated qid comes first.
test_questions_unique = test_questions_unique.loc[:, ['qid', 'test_id', 'question']]

In [17]:
# Preview the first five test questions together with their generated qids.
test_questions_unique.head(n=5)

Unnamed: 0,qid,test_id,question
0,1000000,0,How does the Surface Pro himself 4 compare with iPad Pro?
1,1000001,0,Why did Microsoft choose core m3 and not core i3 home Surface Pro 4?
2,1000002,1,Should I have a hair transplant at age 24? How much would it cost?
3,1000003,1,How much cost does hair transplant require?
4,1000004,2,What but is the best way to send money from China to the US?


In [18]:
# Persist the test questions (qid, test_id, question) with a header row and
# without the row index. `index=False` is the documented way to omit the
# index; the previous `index=None` only worked because None happens to be falsy.
# The `preproc/` directory must already exist -- to_csv does not create it.
test_questions_unique.to_csv(
    data_folder + 'preproc/unique_questions_test.csv',
    header=True,
    index=False,
)

## Merge Train + Test Into a Single Dataset

In [19]:
# Stack the train and test questions (train first), keeping only the shared
# (qid, question) columns. The original row indices are kept as-is, so index
# labels repeat across the two parts.
all_questions_unique = pd.concat([
    frame[['qid', 'question']]
    for frame in (train_questions_unique, test_questions_unique)
])

In [20]:
# Persist the combined train + test questions with a header row and without
# the row index. `index=False` is the documented way to omit the index; the
# previous `index=None` only worked because None happens to be falsy.
# The `preproc/` directory must already exist -- to_csv does not create it.
all_questions_unique.to_csv(
    data_folder + 'preproc/unique_questions_all.csv',
    header=True,
    index=False,
)

## Save Questions as Text Corpora

In [21]:
# Dump the training questions as a plain-text corpus, newline-separated with
# no trailing newline (one question per line, assuming no question contains an
# embedded line break). The encoding is pinned to UTF-8 so the file does not
# depend on the platform's default locale encoding, which can raise or garble
# on non-ASCII question text.
with open(data_folder + 'preproc/unique_questions_train.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(train_questions_unique.question.values))

In [22]:
# Dump the test questions as a plain-text corpus, newline-separated with no
# trailing newline (one question per line, assuming no question contains an
# embedded line break). The encoding is pinned to UTF-8 so the file does not
# depend on the platform's default locale encoding, which can raise or garble
# on non-ASCII question text.
with open(data_folder + 'preproc/unique_questions_test.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(test_questions_unique.question.values))

In [23]:
# Dump the combined train + test questions as a plain-text corpus,
# newline-separated with no trailing newline (one question per line, assuming
# no question contains an embedded line break). The encoding is pinned to
# UTF-8 so the file does not depend on the platform's default locale encoding,
# which can raise or garble on non-ASCII question text.
with open(data_folder + 'preproc/unique_questions_all.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_questions_unique.question.values))