## Imports

In [1]:
import os

In [2]:
from collections import Counter

In [3]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer

## Config

In [4]:
# All data lives under ../data, resolved against the current working directory.
# Joining with a final '' component leaves a trailing path separator on each
# folder, so filenames can be appended with plain string concatenation.
data_folder = os.path.join(os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')), '')
aux_data_folder = os.path.join(data_folder, 'aux', '')
preproc_data_folder = os.path.join(data_folder, 'preproc', '')

In [5]:
# Path of the vocabulary list inside the aux folder (the folder string already
# ends with a path separator).
vocab_file = f'{aux_data_folder}fasttext.wiki.en.vocab.txt'

## Read Data

In [6]:
# Set of entries read from `vocab_file`; used further down for O(1) membership
# checks when splitting word counts into known vs. missing.
# NOTE(review): `load_lines` is not imported in any visible cell -- presumably
# a project helper returning the file's lines; confirm where it comes from.
known_vocab = set(load_lines(vocab_file))

In [19]:
# Training pairs from train.csv. Every missing cell, in any column, is replaced
# with the literal string 'none'.
questions_train = pd.read_csv(data_folder + 'train.csv').fillna('none')

In [20]:
# Test pairs from test.csv. Every missing cell, in any column, is replaced with
# the literal string 'none'.
questions_test = pd.read_csv(data_folder + 'test.csv').fillna('none')

## Tokenize

In [7]:
# Tokens are maximal runs of word characters; whitespace and punctuation act as
# separators and are dropped (so "what's" becomes "what", "s").
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [8]:
# Maps each plain-text input file to the JSON file that receives its tokenized
# form. Both names share the same stem and differ only in extension.
files_to_tokenize = {
    f'unique_questions_{subset}.txt': f'unique_questions_{subset}.json'
    for subset in ('train', 'test', 'all')
}

In [9]:
# For every (input, output) pair: read the lines, lowercase them, tokenize each
# line, and store the resulting list of token lists as JSON.
for input_filename, output_filename in progressbar(files_to_tokenize.items()):
    source_path = preproc_data_folder + input_filename
    target_path = preproc_data_folder + output_filename
    lines = [line.lower() for line in load_lines(source_path)]
    save_json(tokenizer.tokenize_sents(lines), target_path)

## Count Words

In [10]:
# Three independent, initially empty tallies: every word, words found in the
# known vocabulary, and words absent from it.
(
    all_word_counter,
    existing_word_counter,
    missing_word_counter,
) = (Counter() for _ in range(3))

In [11]:
# Reload the tokenized questions written by the tokenization step; the loop
# below iterates it as a sequence of questions, each a sequence of words.
questions_all_tokenized = load_json(f'{preproc_data_folder}unique_questions_all.json')

In [12]:
# Tally every token once in the overall counter and once in either the
# "existing" or the "missing" counter, depending on vocabulary membership.
# The counters are created in a separate cell, so re-running only this cell
# adds the counts a second time.
for question in progressbar(questions_all_tokenized):
    for word in question:
        all_word_counter[word] += 1
        word_bucket = existing_word_counter if word in known_vocab else missing_word_counter
        word_bucket[word] += 1

In [13]:
# Dump every token with its frequency, most frequent first. Each line is the
# token left-aligned in a 40-character column followed by the count (tokens of
# 40 or more characters run straight into the count).
# Tokens can contain non-ASCII characters, so the encoding is pinned to UTF-8
# rather than left to the platform default.
with open(preproc_data_folder + 'words_all.txt', 'w', encoding='utf-8') as f:
    for word, count in all_word_counter.most_common():
        print(f'{word:40s}{count}', file=f)

In [14]:
# Dump the tokens that are absent from the known vocabulary, most frequent
# first, in the same layout: token left-aligned in a 40-character column
# followed by the count.
# Tokens can contain non-ASCII characters, so the encoding is pinned to UTF-8
# rather than left to the platform default.
with open(preproc_data_folder + 'words_missing.txt', 'w', encoding='utf-8') as f:
    for word, count in missing_word_counter.most_common():
        print(f'{word:40s}{count}', file=f)

In [23]:
# Inspect a single training pair by its index label. `.loc` replaces `.ix`,
# which was removed in pandas 1.0.
questions_train.loc[18789]

id                                                                                   18789
qid1                                                                                 35567
qid2                                                                                 27091
question1       What are the differences between speed and velocity? How are the measured?
question2                                What's the difference between speed and velocity?
is_duplicate                                                                             1
Name: 18789, dtype: object