In [1]:
import time
import functools

import pandas as pd
import pkg_resources
from symspellpy import SymSpell, Verbosity

# Reference on word lengths by language: http://www.ravi.io/language-word-lengths

# Upper bound on the edit distance: used when building the SymSpell index and
# as the cap for the per-word lookup budget in find_closest_word.
MAX_DICT_DISTANCE = 5

start = time.time()
sym_spell = SymSpell(max_dictionary_edit_distance=MAX_DICT_DISTANCE, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt"
)
# load_dictionary signals a missing file by returning False instead of raising;
# fail loudly so the correction step never runs against an empty index.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(
        "SymSpell dictionary not found: {}".format(dictionary_path)
    )
print("Loaded dictionary in: ", time.time() - start)


def find_closest_sentence(sentence):
    """Spell-correct a sentence word by word.

    The input is split on whitespace, every token is passed through
    ``find_closest_word`` and tokens for which that returns an empty string
    are dropped.

    Args:
        sentence: Text to correct. Non-string values are converted with
            ``str``; missing values (``None``, ``NaN``, ``pd.NA``) are treated
            as empty text.

    Returns:
        The corrected tokens joined by single spaces, or ``""`` when the input
        is missing, blank, or none of its tokens produced a correction.
    """
    # Without this guard str() would turn a missing value into the literal
    # token "nan"/"None" and spell-correct it as if it were real text.
    if pd.api.types.is_scalar(sentence) and pd.isna(sentence):
        return ""

    corrected = (find_closest_word(token) for token in str(sentence).split())
    return " ".join(word for word in corrected if word)

@functools.lru_cache(maxsize=None)
def find_closest_word(word):
    """Return the SymSpell suggestion for ``word``, or ``""`` if there is none.

    The allowed edit distance grows with the word: one edit per three
    characters, never more than ``MAX_DICT_DISTANCE``. Results are memoised
    without a size limit, so each distinct word is looked up only once.

    Args:
        word: A single token to look up.

    Returns:
        The term of the suggestion when the lookup yields exactly one result,
        otherwise an empty string.
    """
    allowed_distance = min(len(word) // 3, MAX_DICT_DISTANCE)
    matches = sym_spell.lookup(
        word, Verbosity.TOP, max_edit_distance=allowed_distance
    )

    # Unmatched words are reported as "" (returning `word` itself here would
    # keep them unchanged instead).
    return matches[0].term if len(matches) == 1 else ""

# Read every column as text, then swap the "text" column for its
# spell-corrected version produced by find_closest_sentence.
df = pd.read_csv("../data_exploration/train_input_data_eng.csv", dtype=str).assign(
    text=lambda frame: frame["text"].apply(find_closest_sentence)
)

Loaded dictionary in:  8.827191829681396


In [2]:
# Discard rows that have a missing value in any column, then persist the frame
# without the index column.
# NOTE(review): find_closest_sentence returns "" (not NaN) when no word matched,
# so such rows survive dropna and are written with an empty text field —
# confirm that is intended.
df = df.dropna(axis=0, how="any")
df.to_csv(
    path_or_buf="train_input_data_eng_after_spell_correction_only_found.csv",
    index=False,
)

In [3]:
# Show hit/miss statistics of the lru_cache wrapped around find_closest_word.
find_closest_word.cache_info()

CacheInfo(hits=993784, misses=366311, maxsize=None, currsize=366311)