# Quora question pairs: data preparation

## Import packages

In [24]:
from __future__ import print_function

import numpy as np
import csv, json
from zipfile import ZipFile
from os.path import expanduser, exists

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file

import sys
import io
import pandas as pd
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

## Initialize global variables

In [25]:
KERAS_DATASETS_DIR = expanduser('~/.keras/datasets/') #/home/yuliang/.keras/datasets/

TRAIN_PAIRS_FILE = 'cor_train.csv'
TEST_PAIRS_FILE = 'cor_test.csv'

GLOVE_ZIP_FILE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
GLOVE_ZIP_FILE = 'glove.840B.300d.zip'
# GLOVE_FILE = 'glove.840B.300d.txt'
GLOVE_FILE = 'quora-vectors.txt'

Q1_TRAINING_DATA_FILE = 'q1_train-quora-glove.npy'
Q2_TRAINING_DATA_FILE = 'q2_train-quora-glove.npy'
LABEL_TRAINING_DATA_FILE = 'label_train-quora-glove.npy'

Q1_TEST_DATA_FILE = 'q1_test-quora-glove.npy'
Q2_TEST_DATA_FILE = 'q2_test-quora-glove.npy'

WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix-quora-glove.npy'
NB_WORDS_DATA_FILE = 'nb_words-quora-glove.json'
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25
# EMBEDDING_DIM = 300
EMBEDDING_DIM =200

## Extract questions pairs data

In [26]:
train_data = pd.read_csv(KERAS_DATASETS_DIR + TRAIN_PAIRS_FILE)
question1 = train_data['question1']
question2 = train_data['question2']
is_duplicate = train_data['is_duplicate']

print('Train Question pairs: %d' % len(question1))

Train Question pairs: 404290


In [27]:
test_data = pd.read_csv(KERAS_DATASETS_DIR + TEST_PAIRS_FILE)
test_question1 = test_data['question1']
test_question2 = test_data['question2']

print('Test Question pairs: %d' % len(test_question1))

Test Question pairs: 2345796


## Build tokenized word index

In [28]:
questions = question1 + question2
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

Words in index: 96500


In [29]:
test_question1_word_sequences = tokenizer.texts_to_sequences(test_question1)
test_question2_word_sequences = tokenizer.texts_to_sequences(test_question2)

In [52]:
max_num = 0
min_num = 200
ave_num = 0
i=0
for q in question1_word_sequences:
    ave_num += len(q)
    if len(q)>max_num:
        max_num = len(q)
    if len(q)<min_num:
        min_num = len(q)
    if len(q)>20:
        i = i + 1
print(max_num,min_num,ave_num*1.0/404290)
print(i)

127 0 11.0336515867
26612


## Download and process GloVe embeddings

In [31]:
if not exists(KERAS_DATASETS_DIR + GLOVE_ZIP_FILE):
    zipfile = ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL))
    zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)
    
print("Processing", GLOVE_FILE)

embeddings_index = {}
with io.open(KERAS_DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

Processing quora-vectors.txt
Word embeddings: 162405


## Prepare word embedding matrix

In [32]:
nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))

Null word embeddings: 45487


## Prepare training data tensors

In [33]:
q1_data = pad_sequences(question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
q1_test_data = pad_sequences(test_question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
q2_test_data = pad_sequences(test_question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = np.array(is_duplicate, dtype=int)
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of question1_test data tensor:', q1_test_data.shape)
print('Shape of question2_test data tensor:', q2_test_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (404290, 25)
Shape of question2 data tensor: (404290, 25)
Shape of question1_test data tensor: (2345796, 25)
Shape of question2_test data tensor: (2345796, 25)
Shape of label tensor: (404290,)


## Persist training and configuration data to files

In [34]:
np.save(open(Q1_TRAINING_DATA_FILE, 'wb'), q1_data)
np.save(open(Q2_TRAINING_DATA_FILE, 'wb'), q2_data)
np.save(open(Q1_TEST_DATA_FILE, 'wb'), q1_test_data)
np.save(open(Q2_TEST_DATA_FILE, 'wb'), q2_test_data)
np.save(open(LABEL_TRAINING_DATA_FILE, 'wb'), labels)
np.save(open(WORD_EMBEDDING_MATRIX_FILE, 'wb'), word_embedding_matrix)
with open(NB_WORDS_DATA_FILE, 'w') as f:
    json.dump({'nb_words': nb_words}, f)