Merge ee98140 into 7892dc3
barrust committed Apr 19, 2019
2 parents 7892dc3 + ee98140 commit f433dc3
Showing 2 changed files with 84 additions and 10 deletions.
61 changes: 51 additions & 10 deletions spellchecker/spellchecker.py
@@ -23,12 +23,20 @@ class SpellChecker(object):
             frequency dictionary; if provided, no language will be loaded
         distance (int): The edit distance to use. Defaults to 2 """
 
-    __slots__ = ["_distance", "_word_frequency"]
+    __slots__ = ["_distance", "_word_frequency", "_tokenizer"]
 
-    def __init__(self, language="en", local_dictionary=None, distance=2):
+    def __init__(
+        self, language="en", local_dictionary=None, distance=2, tokenizer=None
+    ):
         self._distance = None
         self.distance = distance  # use the setter value check
-        self._word_frequency = WordFrequency()
+
+        self._tokenizer = _parse_into_words
+        if tokenizer is not None:
+            self._tokenizer = tokenizer
+
+        self._word_frequency = WordFrequency(self._tokenizer)
 
         if local_dictionary:
             self._word_frequency.load_dictionary(local_dictionary)
         elif language:
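
In effect, SpellChecker now accepts any callable that maps a string to an iterable of tokens, defaulting to the module-level _parse_into_words regex splitter. A minimal sketch of the new keyword argument (whitespace_tokens is a hypothetical tokenizer written for this example, not part of the library):

    from spellchecker import SpellChecker

    # Hypothetical tokenizer: plain whitespace split, so punctuation stays
    # attached to tokens (the library's default regex splitter separates it).
    def whitespace_tokens(text):
        for token in text.split():
            yield token

    spell = SpellChecker(tokenizer=whitespace_tokens)
    # split_words() (changed below) now routes through the same callable;
    # wrap it in list() since a generator tokenizer yields lazily.
    print(list(spell.split_words("a storm at sea.")))  # ['a', 'storm', 'at', 'sea.']
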
@@ -79,15 +87,15 @@ def distance(self, val):
             pass
         self._distance = tmp
 
-    @staticmethod
-    def split_words(text):
-        """ Split text into individual `words` using a simple whitespace regex
+    def split_words(self, text):
+        """ Split text into individual `words` using either a simple whitespace
+            regex or the passed in tokenizer
             Args:
                 text (str): The text to split into individual words
             Returns:
                 list(str): A listing of all words in the provided text """
-        return _parse_into_words(text)
+        return self._tokenizer(text)
 
     def export(self, filepath, encoding="utf-8", gzipped=True):
         """ Export the word frequency list for import in the future
@@ -238,14 +246,24 @@ class WordFrequency(object):
     """ Store the `dictionary` as a word frequency list while allowing for
         different methods to load the data and update over time """
 
-    __slots__ = ["_dictionary", "_total_words", "_unique_words", "_letters"]
+    __slots__ = [
+        "_dictionary",
+        "_total_words",
+        "_unique_words",
+        "_letters",
+        "_tokenizer",
+    ]
 
-    def __init__(self):
+    def __init__(self, tokenizer=None):
         self._dictionary = Counter()
         self._total_words = 0
         self._unique_words = 0
         self._letters = set()
 
+        self._tokenizer = _parse_into_words
+        if tokenizer is not None:
+            self._tokenizer = tokenizer
+
     def __contains__(self, key):
         """ turn on contains """
         return key.lower() in self._dictionary
@@ -297,6 +315,18 @@ def letters(self):
             Not settable """
         return self._letters
 
+    def tokenize(self, text):
+        """ Tokenize the provided string object into individual words
+            Args:
+                text (str): The string object to tokenize
+            Yields:
+                str: The next `word` in the tokenized string
+            Note:
+                This is the same as `spellchecker.split_words()` """
+        for x in self._tokenizer(text):
+            yield x.lower()
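
A quick sketch of the new method; unlike split_words(), tokenize() lower-cases each token as it yields it (output shown assumes the default regex tokenizer):

    from spellchecker import SpellChecker

    spell = SpellChecker()
    # tokenize() is a generator, so wrap it in list() to materialize
    print(list(spell.word_frequency.tokenize("The Storm at Sea")))
    # -> ['the', 'storm', 'at', 'sea']
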

     def keys(self):
         """ Iterator over the keys of the dictionary
@@ -317,6 +347,17 @@ def words(self):
         for word in self._dictionary.keys():
             yield word
 
+    def items(self):
+        """ Iterator over the words in the dictionary
+            Yields:
+                str: The next word in the dictionary
+                int: The number of instances in the dictionary
+            Note:
+                This is the same as `dict.items()` """
+        for word in self._dictionary.keys():
+            yield word, self._dictionary[word]
+
     def load_dictionary(self, filename, encoding="utf-8"):
         """ Load in a pre-built word frequency list
@@ -349,7 +390,7 @@ def load_text(self, text, tokenizer=None):
         if tokenizer:
             words = [x.lower() for x in tokenizer(text)]
         else:
-            words = _parse_into_words(text)
+            words = self.tokenize(text)
 
         self._dictionary.update(words)
         self._update_dictionary()
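
These last two hunks dovetail: load_text() now falls back to the instance's tokenize() rather than the module-level splitter, and items() (added above) yields (word, count) pairs from the underlying Counter. A small combined sketch, assuming the default tokenizer:

    from spellchecker import SpellChecker

    spell = SpellChecker(language=None)  # no language file, start empty
    spell.word_frequency.load_text("a storm a sea a whale")
    for word, count in spell.word_frequency.items():
        print(word, count)  # a 3 / storm 1 / sea 1 / whale 1
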
33 changes: 33 additions & 0 deletions tests/spellchecker_test.py
@@ -203,6 +203,22 @@ def test_remove_by_threshold(self):
                 cnt += 1
         self.assertEqual(cnt, 0)
 
+    def test_remove_by_threshold_using_items(self):
+        ''' test removing everything below a certain threshold; using items to test '''
+        spell = SpellChecker()
+        cnt = 0
+        for _, val in spell.word_frequency.items():
+            if val < 7:
+                cnt += 1
+        self.assertGreater(cnt, 0)
+        spell.word_frequency.remove_by_threshold(7)
+        cnt = 0
+        for _, val in spell.word_frequency.items():  # synonym for keys
+            if val < 7:
+                cnt += 1
+        self.assertEqual(cnt, 0)
+
     def test_add_word(self):
         ''' test adding a word '''
         spell = SpellChecker()
@@ -316,3 +332,20 @@ def tokens(txt):
         self.assertFalse('awesome' in spell)
         self.assertTrue(spell['whale'])
         self.assertTrue('sea.' in spell)
+
+    def test_tokenizer_provided(self):
+        """ Test passing in a tokenizer """
+        def tokens(txt):
+            for x in txt.split():
+                yield x
+
+        here = os.path.dirname(__file__)
+        filepath = '{}/resources/small_doc.txt'.format(here)
+        spell = SpellChecker(language=None, tokenizer=tokens)  # just from this doc!
+        spell.word_frequency.load_text_file(filepath)
+        self.assertEqual(spell['a'], 3)
+        self.assertEqual(spell['storm'], 1)
+        self.assertEqual(spell['storm.'], 1)
+        self.assertFalse('awesome' in spell)
+        self.assertTrue(spell['whale'])
+        self.assertTrue('sea.' in spell)
