# Quora question pairs: data preparation

## Import packages

In [4]:
from __future__ import print_function

import numpy as np
import csv, json
from zipfile import ZipFile
from os.path import expanduser, exists

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file
import pandas as pd

## Initialize global variables

In [2]:
DATASETS_DIR = '.\data\\'
GLOVE_FILE = 'glove.6B.300d.txt'
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25
EMBEDDING_DIM = 300

## Download and extract questions pairs data

In [9]:
train = pd.read_csv(DATASETS_DIR+'train.csv')
print(train.head())

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  
2  How can Internet speed be increased by hacking...             0  
3  Find the remainder when [math]23^{24}[/math] i...             0  
4            Which fish would survive in salt water?             0  


In [10]:
train.dtypes

id               int64
qid1             int64
qid2             int64
question1       object
question2       object
is_duplicate     int64
dtype: object

In [11]:
train = train.dropna()

## Build tokenized word index

In [43]:
questions = train['question1'] + train['question2']
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(train['question1'])
question2_word_sequences = tokenizer.texts_to_sequences(train['question2'])
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

Words in index: 96492


In [52]:
import pickle


with open('word_index.pickle', 'wb') as handle:
    pickle.dump(word_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('word_index.pickle', 'rb') as handle:
    word_indices = pickle.load(handle)
word_indices

In [14]:
question1_word_sequences = tokenizer.texts_to_sequences('Hi man hows you')
question1_word_sequences

[[1979],
 [5],
 [],
 [618],
 [6],
 [828],
 [],
 [1979],
 [1950],
 [2222],
 [225],
 [],
 [1312],
 [1950],
 [305]]

## Download and process GloVe embeddings

In [16]:

embeddings_index = {}
with open(DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

Word embeddings: 400000


## Prepare word embedding matrix

In [17]:
nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))

Null word embeddings: 36148


## Prepare training data tensors

In [47]:
q1_data = pad_sequences(question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(train['is_duplicate'], dtype=int)
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (404287, 25)
Shape of question2 data tensor: (404287, 25)
Shape of label tensor: (404287,)


In [48]:
q1_data

array([[    0,     0,     0, ...,   383,     8,    35],
       [    0,     0,     0, ...,     5, 21311,  4564],
       [    0,     0,     0, ...,   146,     6,  2773],
       ...,
       [    0,     0,     0, ...,     3,    49,  4395],
       [    0,     0,     0, ...,    32,    82,   234],
       [    0,     0,     0, ...,   155,    29,  4533]])

## Persist training and configuration data to files

In [40]:
np.save(open(Q1_TRAINING_DATA_FILE, 'wb'), q1_data)
np.save(open(Q2_TRAINING_DATA_FILE, 'wb'), q2_data)
np.save(open(LABEL_TRAINING_DATA_FILE, 'wb'), labels)
np.save(open(WORD_EMBEDDING_MATRIX_FILE, 'wb'), word_embedding_matrix)
with open(NB_WORDS_DATA_FILE, 'w') as f:
    json.dump({'nb_words': nb_words}, f)

In [18]:
word_embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.04656   ,  0.21318001, -0.0074364 , ...,  0.0090611 ,
        -0.20988999,  0.053913  ],
       [-0.20017   ,  0.14302   ,  0.052055  , ...,  0.034939  ,
        -0.12599   ,  0.21863   ],
       ...,
       [-0.020654  ,  0.051946  , -0.19756   , ..., -0.1902    ,
         0.27513999,  0.45159   ],
       [ 0.31079   ,  0.57249999,  0.10701   , ...,  0.14535999,
         0.57359999,  0.59401   ],
       [-0.43546   , -0.14072999, -0.26552999, ...,  0.42638001,
        -0.03747   ,  0.26030001]])

## Testing

In [51]:
import keras

with open('word_index.pickle', 'rb') as handle:
    word_indices = pickle.load(handle)
word_indices

def word2vec(question):
    word_seq = keras.preprocessing.text.text_to_word_sequence(question)
    vec_sequence = [word_indices[w] for w in word_seq]
    vec_sequence = pad_sequences([vec_sequence], maxlen= 25)
    return vec_sequence
word2vec('Hi my man')   

[3840, 17, 296]


array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        3840,   17,  296]])