diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bac588..520409e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # pyspellchecker +## Version 0.1.4 (unreleased) +* Remove words based on threshold +* Add ability to iterate over words (keys) in the dictionary +* Add setting to reduce the edit distance check +[see PR #17](https://github.com/barrust/pyspellchecker/pull/17) Thanks [@mrjamesriley](https://github.com/mrjamesriley) + ## Version 0.1.3 * Better handle punctuation and numbers as the word to check diff --git a/spellchecker/__init__.py b/spellchecker/__init__.py index 0d4185e..62c20cf 100644 --- a/spellchecker/__init__.py +++ b/spellchecker/__init__.py @@ -1,7 +1,7 @@ ''' SpellChecker Module ''' from . spellchecker import SpellChecker, WordFrequency -from . info import (__author__, __maintainer__, __email__, __license__, - __version__, __credits__, __url__, __bugtrack_url__) +from . info import (__author__, __maintainer__, __email__, __license__, # noqa: F401 + __version__, __credits__, __url__, __bugtrack_url__) # noqa: F401 __all__ = ['SpellChecker', 'WordFrequency'] diff --git a/spellchecker/spellchecker.py b/spellchecker/spellchecker.py index 4a30d55..50b8c91 100644 --- a/spellchecker/spellchecker.py +++ b/spellchecker/spellchecker.py @@ -169,6 +169,7 @@ def _check_if_should_check(word): return True + class WordFrequency(object): ''' Store the `dictionary` as a word frequency list while allowing for different methods to load the data and update over time ''' @@ -221,6 +222,26 @@ def letters(self): Not settable ''' return self._letters + def keys(self): + ''' Iterator over the key of the dictionary + + Yields: + str: The next key in the dictionary + Note: + This is the same as `spellchecker.words()` ''' + for key in self._dictionary.keys(): + yield key + + def words(self): + ''' Iterator over the words in the dictionary + + Yields: + str: The next word in the dictionary + Note: + This is the same as `spellchecker.keys()` ''' + for word in 
self._dictionary.keys(): + yield word + def load_dictionary(self, filename): ''' Load in a pre-built word frequency list @@ -284,6 +305,18 @@ def remove(self, word): self._dictionary.pop(word.lower()) self._update_dictionary() + def remove_by_threshold(self, threshold=5): + ''' Remove all words at, or below, the provided threshold + + Args: + threshold (int): The threshold at which a word is to be \ + removed ''' + keys = [x.lower() for x in self._dictionary.keys()] + for key in keys: + if self._dictionary[key] <= threshold: + self._dictionary.pop(key) + self._update_dictionary() + def _update_dictionary(self): ''' Update the word frequency object ''' self._total_words = sum(self._dictionary.values()) diff --git a/tests/spellchecker_test.py b/tests/spellchecker_test.py index 48c450c..80aebcb 100644 --- a/tests/spellchecker_test.py +++ b/tests/spellchecker_test.py @@ -156,9 +156,34 @@ def test_remove_word(self): spell.word_frequency.remove('teh') self.assertEqual(spell['teh'], 0) + def test_remove_by_threshold(self): + ''' test removing everything below a certain threshold ''' + spell = SpellChecker() + cnt = 0 + for key in spell.word_frequency.keys(): + if spell.word_frequency[key] < 7: + cnt += 1 + self.assertGreater(cnt, 0) + spell.word_frequency.remove_by_threshold(7) + cnt = 0 + for key in spell.word_frequency.words(): # synonym for keys + if spell.word_frequency[key] < 7: + cnt += 1 + self.assertEqual(cnt, 0) + def test_add_word(self): ''' test adding a word ''' spell = SpellChecker() self.assertEqual(spell['meh'], 0) spell.word_frequency.add('meh') self.assertEqual(spell['meh'], 1) + + def test_checking_odd_word(self): + ''' test checking a word that is really a number ''' + spell = SpellChecker() + self.assertEqual(spell.edit_distance_1('12345'), {'12345'}) + + def test_unique_words(self): + ''' test the unique word count ''' + spell = SpellChecker() + self.assertEqual(spell.word_frequency.unique_words, len(list(spell.word_frequency.keys())))