## Imports

In [1]:
import os

In [2]:
# Configure CUDA device visibility through environment variables before the
# Keras/TensorFlow import below. An empty CUDA_VISIBLE_DEVICES exposes no GPU.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "",
})

In [3]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


## Config

In [4]:
# Root data directory: the `data` folder next to the notebook's parent
# directory, as an absolute path with a trailing separator so file names can
# be appended with plain string concatenation.
data_folder = os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')) + os.path.sep

# Sub-folders of the data root, each also terminated with a separator.
aux_data_folder, preproc_data_folder, features_data_folder = (
    os.path.join(data_folder, subfolder) + os.path.sep
    for subfolder in ('aux', 'preproc', 'features')
)

In [5]:
# Vocabulary cap: passed to the Tokenizer as `num_words` and used as the upper
# bound when sizing the embedding matrix.
MAX_VOCAB_SIZE = 125000
# Fixed length (in tokens) passed as `maxlen` when padding question sequences.
MAX_SEQUENCE_LENGTH = 30

In [6]:
# NOTE(review): defined but not referenced by any visible cell in this
# notebook — confirm whether it is still needed.
RANDOM_SEED = 42

## Read Data

In [7]:
# Load the preprocessed question tokens for the train and test sets, in that order.
# NOTE(review): `load_json` is not defined or imported in any visible cell —
# confirm where it comes from, otherwise this fails on a fresh kernel.
question_tokens_train, question_tokens_test = (
    load_json(preproc_data_folder + tokens_file_name)
    for tokens_file_name in (
        'question_tokens_no_stopwords_train.json',
        'question_tokens_no_stopwords_test.json',
    )
)

In [8]:
# Locations of the GloVe vector table and its vocabulary lookup.
glove_vectors_path = aux_data_folder + 'glove.42B.300d.vectors.pickle'
glove_vocab_path = aux_data_folder + 'glove.42B.300d.vocab.pickle'

# NOTE(review): `load` is not defined or imported in any visible cell. Given
# the `.pickle` extension it presumably unpickles the file — only use it on
# trusted files, since unpickling can execute arbitrary code.
embedding_vectors = load(glove_vectors_path)
embedding_vocab = load(glove_vocab_path)

In [9]:
# Embedding width = size of the last axis of the loaded vector table
# (assumes embedding_vectors is laid out as (vocab, dim) — TODO confirm).
EMBEDDING_DIM = embedding_vectors.shape[-1]

## Build Features

### Collect all texts

In [10]:
def _join_question_tokens(token_pairs, question_key):
    """Return one space-joined string per pair, built from `pair[question_key]`."""
    return [' '.join(token_pair[question_key]) for token_pair in token_pairs]


# Rebuild plain-text questions from the token lists so they can be fed to the
# Keras tokenizer.
question_texts_1_train = _join_question_tokens(question_tokens_train, 'question1')
question_texts_2_train = _join_question_tokens(question_tokens_train, 'question2')

question_texts_1_test = _join_question_tokens(question_tokens_test, 'question1')
question_texts_2_test = _join_question_tokens(question_tokens_test, 'question2')

In [11]:
# Distinct question texts across both questions of the train and test sets.
# The result is sorted because the iteration order of a set of strings changes
# between kernel restarts (string hashing is randomized per process). This
# list is what the tokenizer is fitted on, so a stable order keeps that input
# identical on every run.
# `set.union` accepts the lists directly, which avoids concatenating four
# large lists into a temporary one first.
unique_question_texts = sorted(
    set(question_texts_1_train).union(
        question_texts_2_train,
        question_texts_1_test,
        question_texts_2_test,
    )
)

### Create question sequences

In [12]:
# Tokenizer settings: vocabulary capped at MAX_VOCAB_SIZE, lower-cased text,
# split on single spaces, word-level (not character-level) tokens.
tokenizer_settings = {
    'num_words': MAX_VOCAB_SIZE,
    'split': ' ',
    'lower': True,
    'char_level': False,
}
tokenizer = Tokenizer(**tokenizer_settings)

In [13]:
# Fit the vocabulary on the de-duplicated train + test question texts.
tokenizer.fit_on_texts(unique_question_texts)

In [14]:
# Encode both train question columns as sequences of tokenizer word indices.
question_sequences_1_train, question_sequences_2_train = (
    tokenizer.texts_to_sequences(train_texts)
    for train_texts in (question_texts_1_train, question_texts_2_train)
)

In [15]:
# Encode both test question columns with the same fitted tokenizer.
question_sequences_1_test, question_sequences_2_test = (
    tokenizer.texts_to_sequences(test_texts)
    for test_texts in (question_texts_1_test, question_texts_2_test)
)

### Create embedding matrix

In [16]:
# Number of vocabulary rows to allocate: the fitted vocabulary size, capped at
# MAX_VOCAB_SIZE.
num_words = min(MAX_VOCAB_SIZE, len(tokenizer.word_index))

In [17]:
# Zero-initialised embedding table with one row per word index plus one extra
# row (presumably because tokenizer indices start at 1 and index 0 is the
# padding value — confirm). Words without a pretrained vector keep all zeros.
# Requires `import numpy as np` in the imports cell.
embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM))

In [18]:
# Collects vocabulary words that have no entry in the pretrained embedding vocab.
missing_words = set()

In [19]:
# Copy the pretrained vector of every vocabulary word into the matrix row
# addressed by its tokenizer index; record words with no pretrained vector.
#
# The matrix row count is derived from a vocabulary size capped at
# MAX_VOCAB_SIZE, while `tokenizer.word_index` can hold more entries than
# that. Indices past the last row are skipped so the assignment cannot raise
# IndexError; those words have no row and are not counted as missing.
last_row_index = embedding_matrix.shape[0] - 1
for word, index in tokenizer.word_index.items():
    if index > last_row_index:
        continue
    if word in embedding_vocab:
        # embedding_vocab[word] is the position of the word's vector in embedding_vectors.
        embedding_matrix[index] = embedding_vectors[embedding_vocab[word]]
    else:
        missing_words.add(word)

In [20]:
# Report how many vocabulary words had no pretrained vector (their rows stay zero).
print('# Unique missing words:', len(missing_words))

# Unique missing words: 8272


In [21]:
# Persist the filtered embedding matrix to the aux data folder.
# NOTE(review): `save` is not defined or imported in any visible cell —
# confirm where it comes from.
save(embedding_matrix, aux_data_folder + 'embedding_weights_glove_filtered_no_stopwords.pickle')

### Create sequence matrices

In [22]:
# File-name suffixes for the question-1 / question-2 feature files; prefixed
# with 'X_train_' or 'X_test_' when the padded matrices are saved.
q1_suffix = 'nn_glove_q1_filtered_no_stopwords.pickle'
q2_suffix = 'nn_glove_q2_filtered_no_stopwords.pickle'

In [23]:
# Pad/truncate both train sequence lists to MAX_SEQUENCE_LENGTH, using the
# pad_sequences defaults for every other option.
question_sequences_1_train_padded, question_sequences_2_train_padded = (
    pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for train_sequences in (question_sequences_1_train, question_sequences_2_train)
)

In [24]:
# Output locations for the padded train matrices (question 1 and question 2).
x_train_q1_path = features_data_folder + 'X_train_' + q1_suffix
x_train_q2_path = features_data_folder + 'X_train_' + q2_suffix

# NOTE(review): `save` is not defined or imported in any visible cell —
# confirm where it comes from.
save(question_sequences_1_train_padded, x_train_q1_path)
save(question_sequences_2_train_padded, x_train_q2_path)

In [25]:
# Pad/truncate both test sequence lists to MAX_SEQUENCE_LENGTH, using the
# pad_sequences defaults for every other option.
question_sequences_1_test_padded, question_sequences_2_test_padded = (
    pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for test_sequences in (question_sequences_1_test, question_sequences_2_test)
)

In [26]:
# Output locations for the padded test matrices (question 1 and question 2).
x_test_q1_path = features_data_folder + 'X_test_' + q1_suffix
x_test_q2_path = features_data_folder + 'X_test_' + q2_suffix

# NOTE(review): `save` is not defined or imported in any visible cell —
# confirm where it comes from.
save(question_sequences_1_test_padded, x_test_q1_path)
save(question_sequences_2_test_padded, x_test_q2_path)