## Imports

In [1]:
import os

In [2]:
import enchant
import pylev
import nltk

## Config

In [3]:
# Data lives in a `data` directory that is a sibling of the current working
# directory. Joining with a trailing '' component makes every folder path end
# with the platform separator, so file names can be appended directly.
data_folder = os.path.join(
    os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')),
    '',
)
aux_data_folder = os.path.join(data_folder, 'aux', '')
preproc_data_folder = os.path.join(data_folder, 'preproc', '')

## Load Data

In [4]:
# Read the training questions, then replace every missing value with the
# placeholder string 'none' so later string operations never see NaN.
questions_train = pd.read_csv(data_folder + 'train.csv')
questions_train = questions_train.fillna('none')

In [5]:
# Custom stopword vocabulary, stored as a set because it is only used for
# membership tests when filtering tokens.
# NOTE(review): `load_lines` is defined outside this view; presumably it
# returns the file's lines as strings -- confirm.
stopwords = set(load_lines(aux_data_folder + 'stopwords_custom_quora.vocab'))

## Load Tools

In [6]:
# Log file that receives one "token ---> correction" line per applied
# spelling correction. It stays open across cells and is closed explicitly
# once processing has finished.
# The encoding is pinned to UTF-8: tokens come from a `\w+` regex, which can
# match non-ASCII word characters, and the platform default encoding could
# fail to encode them.
spellcheck_log = open(preproc_data_folder + 'spellcheck.log', 'w', encoding='utf-8')

In [7]:
# Regex tokenizer built on the pattern `\w+`: tokens are runs of word
# characters, so punctuation and whitespace never appear in the output.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

In [8]:
# Spellchecker for the "en_US" dictionary, extended with a personal word list
# read from the fastText wiki vocabulary file.
# NOTE(review): words in that list are presumably accepted as correctly
# spelled and offered as suggestions -- confirm against the pyenchant docs.
spellchecker = enchant.DictWithPWL("en_US", aux_data_folder + 'fasttext.wiki.en.vocab')

## Process

In [9]:
def translate(text, translation):
    """Replace every occurrence of each key of `translation` in `text`.

    Each replacement is padded with one space on both sides so it cannot fuse
    with neighbouring characters. Runs of spaces (including those created by
    the padding) are then collapsed to a single space. Leading and trailing
    single spaces are kept.

    Replacements are applied one key after another in the mapping's iteration
    order, so text inserted for an earlier key can be matched by a later key.

    Args:
        text: The string to transform.
        translation: Mapping from substring to its replacement string.

    Returns:
        The transformed string with no two consecutive spaces.
    """
    for token, replacement in translation.items():
        text = text.replace(token, ' ' + replacement + ' ')

    # A single replace('  ', ' ') pass turns three spaces into two, so repeat
    # until no double space is left.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text

In [10]:
def spell_digits(text):
    """Spell out every decimal digit in `text` as an English word.

    Digits are replaced one character at a time (e.g. "42" becomes
    "four two"), with the spacing handled by `translate`.

    Args:
        text: The string to transform.

    Returns:
        The string with each of the characters 0-9 replaced by its name.
    """
    translation = {
        '0': 'zero',
        '1': 'one',
        '2': 'two',
        '3': 'three',
        '4': 'four',
        '5': 'five',
        '6': 'six',
        '7': 'seven',
        # Previously '8' mapped to 'nine' and '9' to 'ten'.
        '8': 'eight',
        '9': 'nine',
    }
    return translate(text, translation)

In [11]:
def expand_negations(text):
    """Expand contracted negations ("don't" -> "do not") in `text`.

    Irregular contractions, whose stem changes when expanded, are handled
    first through an explicit table. Every remaining "n't" is then rewritten
    to " not". Matching is case-sensitive and only covers the plain ASCII
    apostrophe.

    Args:
        text: The string to transform.

    Returns:
        The string with negation contractions expanded.
    """
    translation = {
        "can't": 'can not',
        # "won't" contracts "will not" (previously expanded to "would not").
        "won't": 'will not',
        "shan't": 'shall not',
    }
    text = translate(text, translation)
    return text.replace("n't", " not")

In [12]:
def get_best_suggestion(word):
    """Pick the spellchecker suggestion closest to `word`.

    Closeness is the Damerau-Levenshtein distance between `word` and the
    suggestion. When several suggestions share the smallest distance, the one
    listed first by the spellchecker wins.

    Args:
        word: The (presumably misspelled) token to find a replacement for.

    Returns:
        The closest suggestion, or None when the spellchecker offers no
        suggestions or the closest one is more than 4 edits away.
    """
    closest = None
    closest_distance = None

    for candidate in spellchecker.suggest(word):
        distance = pylev.damerau_levenshtein(word, candidate)
        # Strict comparison keeps the earliest candidate on ties.
        if closest_distance is None or distance < closest_distance:
            closest, closest_distance = candidate, distance

    if closest_distance is None or closest_distance > 4:
        return None
    return closest

In [13]:
def correct_spelling(text):
    """Tokenize `text` and replace misspelled tokens with their best suggestion.

    Tokens accepted by the spellchecker, and rejected tokens with no
    acceptable suggestion, are kept unchanged. Each applied correction is
    lower-cased in the output and written to the spellcheck log as
    "token ---> correction".

    Args:
        text: The string to correct.

    Returns:
        The resulting tokens joined by single spaces.
    """
    output = []

    for word in tokenizer.tokenize(text):
        # Only ask for suggestions when the spellchecker rejects the word.
        fix = None if spellchecker.check(word) else get_best_suggestion(word)
        if not fix:
            output.append(word)
            continue

        output.append(fix.lower())
        print('{} ---> {}'.format(word, fix), file=spellcheck_log, flush=True)

    return ' '.join(output)

In [14]:
def get_question_tokens(question):
    """Turn a raw question string into a list of cleaned tokens.

    The question is lower-cased, then passed through digit spelling, negation
    expansion and spelling correction, in that order. The result is tokenized
    and tokens found in the stopword set are dropped.

    Args:
        question: The raw question text.

    Returns:
        The list of remaining tokens, in their original order.
    """
    normalized = question.lower()
    for step in (spell_digits, expand_negations, correct_spelling):
        normalized = step(normalized)

    kept = []
    for token in tokenizer.tokenize(normalized):
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [15]:
# Accumulator for the tokenized training rows; the processing loop appends one
# dict per row. Re-run this cell before re-running the loop, otherwise rows
# are appended a second time.
tokenized_train = []

In [16]:
# Tokenize both questions of every training row and collect the results.
# NOTE(review): `progressbar` and `save_json` are defined outside this view;
# presumably the former wraps the iterator to show progress (with `size` as
# the total) and the latter serialises the list to the given path -- confirm.
for index, row in progressbar(questions_train.iterrows(), size=len(questions_train)):
    tokenized_train.append({
        'id': row.id,
        'question1': get_question_tokens(row.question1),
        'question2': get_question_tokens(row.question2),
    })
    
    # Periodic checkpoint: rewrite the whole list whenever the row index is a
    # multiple of 20000. This is also true for index 0, so the first save
    # happens right after the first row.
    if index % 20000 == 0:
        save_json(tokenized_train, preproc_data_folder + 'question_tokens_train.json')

Widget Javascript not detected.  It may not be installed or enabled properly.


In [17]:
# Processing is finished: release the spellcheck log file handle.
spellcheck_log.close()

In [18]:
# Final save of the complete token list. The loop only checkpoints on row
# indices that are multiples of 20000, so the last rows are not yet on disk.
save_json(tokenized_train, preproc_data_folder + 'question_tokens_train.json')