## Load Dependencies

In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd 
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


### Load Dataset

In [3]:
q1 = pd.read_csv("/home/abhishek/Desktop/Quora-Duplicate-Question-Pairs/data/train.csv")
q2 = pd.read_csv("/home/abhishek/Desktop/Quora-Duplicate-Question-Pairs/data/test.csv")

In [4]:
# prepare question dataframes

ques1 = q1.question1
ques2 = q1.question2
is_dup = q1.is_duplicate
ques = ques1 + ques2
ques2[201841] = '';ques2[105780] = '';ques[105780] = '';ques[201841] = ''

In [16]:
# fit the text file to tokenize data
# reference: http://www.orbifold.net/default/2017/01/10/embedding-and-tokenizer-in-keras/

tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(ques)
q1_word_sequences = tokenizer.texts_to_sequences(ques1)
q2_word_sequences = tokenizer.texts_to_sequences(ques2)

### Load GloVe embeddings

In [7]:
def get_embeddings():
    GloVe = open("glove.6B.200d.txt", "r")
    embeddings = {}
    print "processing GloVe"
    for line in GloVe.readlines():
        val = line.split(' ')
        word = val[0]
        embeddings[word] = np.asarray(val[1:], dtype='float32')

    np.save('embeddings.npy', embeddings) 
    GloVe.close()
    return embeddings
    
embeddings = get_embeddings()

processing GloVe


In [24]:
embedding_dim = 200
word_index = tokenizer.word_index

In [33]:
# reference: http://www.orbifold.net/default/2017/01/10/embedding-and-tokenizer-in-keras/

embedding_mat = np.zeros((len(word_index)+1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_mat[i] = embedding_vector[:embedding_dim]

print('Null word embeddings: %d' % np.sum(np.sum(embedding_mat, axis=1) == 0))

Null word embeddings: 36170


### create train question data

In [34]:
q1_data = pad_sequences(q1_word_sequences, maxlen=20)
q2_data = pad_sequences(q2_word_sequences, maxlen=20)
labels = np.array(is_dup, dtype=int)
print('question1', q1_data.shape)
print('question2', q2_data.shape)
print('label tensor', labels.shape)

('question1', (404290, 20))
('question2', (404290, 20))
('label tensor', (404290,))


### save preprocessed files

In [35]:
np.save('q1_train.npy', q1_data)
np.save('q2_train.npy', q2_data)
np.save('is_dup.npy', labels)
np.save('embedding_mat.npy', embedding_mat)