Merge 56a8df7 into 31147fe

barrust · Oct 6, 2018 · ad8435d · ad8435d
2 parents 31147fe + 56a8df7
commit ad8435d
Show file tree

Hide file tree

Showing 6 changed files with 124 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,14 @@
 # pyspellchecker
 
-## Version 0.1.5 (unreleased)
+## Version 0.1.5
 * Remove words based on threshold
 * Add ability to iterate over words (keys) in the dictionary
 * Add setting to reduce the edit distance check
-[see PR #17](https://github.com/barrust/pyspellchecker/pull/17) Thanks [@mrjamesriley](https://github.com/mrjamesriley) 
+[see PR #17](https://github.com/barrust/pyspellchecker/pull/17) Thanks [@mrjamesriley](https://github.com/mrjamesriley)
+* Added Export functionality:
+   * json
+   * gzip
+* Updated dictionary loading to use either `language` or `local_dictionary`; if a local dictionary is provided, no language is loaded
 
 ## Version 0.1.4
 * Ability to easily remove words

diff --git a/README.rst b/README.rst
@@ -24,6 +24,10 @@ German, and French. Dictionaries were generated using the `WordFrequency project
 ``pyspellchecker`` supports **Python 3**. It may work for Python 2.7 but it is not
 guaranteed (especially for Non-English dictionaries)!
 
+``pyspellchecker`` allows the maximum Levenshtein Distance to be configured.
+For longer words, it is highly recommended to use a distance of 1 instead of
+the default of 2. See the quickstart for how to change the distance parameter.
+
 
 Installation
 -------------------------------------------------------------------------------
@@ -86,10 +90,20 @@ text to generate a more appropriate list for your use case.
     spell.known(['microsoft', 'google'])  # will return both now!
 
 
+If the words that you wish to check are long, it is recommended to reduce the
+``distance`` to 1. This can be done either when initializing the
+``SpellChecker`` class or afterwards by setting the ``distance`` property.
+
+.. code:: python
+
+    from spellchecker import SpellChecker
+
+    spell = SpellChecker(distance=1)  # set at initialization
+
+    # do some work on longer words
 
+    spell.distance = 2  # set the distance parameter back to the default
 
-More work in storing and loading word frequency lists is planned; stay
-tuned.
 
 
 Additional Methods

diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
@@ -66,6 +66,18 @@ Once a word is identified as misspelled, you can find the likeliest replacement:
         spell.correction(word)  # 'happening'
 
 
+.. code:: python
+
+    from spellchecker import SpellChecker
+
+    spell = SpellChecker(distance=1)  # set the Levenshtein Distance parameter
+
+    # do additional work
+
+    # now for shorter words, we can revert to Levenshtein Distance of 2!
+    spell.distance = 2
+
+
 Or if the word identified as the likeliest is not correct, a list of candidates
 can also be pulled:
 

diff --git a/spellchecker/info.py b/spellchecker/info.py
@@ -5,7 +5,7 @@
 __maintainer__ = 'Tyler Barrus'
 __email__ = 'barrust@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.1.4'
+__version__ = '0.1.5'
 __credits__ = ['Peter Norvig']
 __url__ = 'https://github.com/barrust/pyspellchecker'
 __bugtrack_url__ = '{0}/issues'.format(__url__)
diff --git a/spellchecker/spellchecker.py b/spellchecker/spellchecker.py
@@ -20,16 +20,16 @@ class SpellChecker(object):
             for no dictionary. Supported languages are `en`, `es`, `de`, and \
             `fr`. Defaults to `en`
             local_dictionary (str): The path to a locally stored word \
-            frequency dictionary
+            frequency dictionary; if provided, no language will be loaded
             distance (int): The edit distance to use. Defaults to 2'''
 
-
     def __init__(self, language='en', local_dictionary=None, distance=2):
-        self._distance = distance
+        self._distance = None
+        self.distance = distance  # use the setter value check
         self._word_frequency = WordFrequency()
         if local_dictionary:
             self._word_frequency.load_dictionary(local_dictionary)
-        if language:
+        elif language:
             filename = '{}.json.gz'.format(language)
             here = os.path.dirname(__file__)
             full_filename = os.path.join(here, 'resources', filename)
@@ -55,6 +55,27 @@ def word_frequency(self):
                 Not settable '''
         return self._word_frequency
 
+    @property
+    def distance(self):
+        ''' int: The maximum edit distance to calculate
+
+            Note:
+                Valid values are 1 or 2; if an invalid value is passed, \
+                defaults to 2 '''
+        return self._distance
+
+    @distance.setter
+    def distance(self, val):
+        ''' set the distance parameter '''
+        tmp = 2
+        try:
+            int(val)
+            if val > 0 and val <= 2:
+                tmp = val
+        except (ValueError, TypeError):
+            pass
+        self._distance = tmp
+
     @staticmethod
     def words(text):
         ''' Split text into individual `words` using a simple whitespace regex
@@ -65,6 +86,20 @@ def words(text):
                 list(str): A listing of all words in the provided text '''
         return _words(text)
 
+    def export(self, filepath, gzipped=True):
+        ''' Export the word frequency list for import in the future
+
+             Args:
+                filepath (str): The filepath to the exported dictionary
+                gzipped (bool): Whether to gzip the dictionary or not '''
+        data = json.dumps(self.word_frequency.dictionary, sort_keys=True)
+        if gzipped:
+            with gzip.open(filepath, 'wt') as fobj:
+                fobj.write(data)
+        else:
+            with open(filepath, 'w') as fobj:
+                fobj.write(data)
+
     def word_probability(self, word, total_words=None):
         ''' Calculate the probability of the `word` being the desired, correct
             word
@@ -100,7 +135,8 @@ def candidates(self, word):
                 set: The set of words that are possible candidates '''
 
         return (self.known([word]) or self.known(self.edit_distance_1(word)) or
-                (self._distance == 2 and self.known(self.edit_distance_2(word))) or {word})
+                (self._distance == 2 and
+                 self.known(self.edit_distance_2(word))) or {word})
 
     def known(self, words):
         ''' The subset of `words` that appear in the dictionary of words

diff --git a/tests/spellchecker_test.py b/tests/spellchecker_test.py
@@ -123,6 +123,22 @@ def test_edit_distance_one(self):
         spell = SpellChecker(language=None, local_dictionary=filepath, distance=1)
         self.assertEqual(spell.candidates('hike'), {'bike'})
 
+    def test_edit_distance_one_property(self):
+        ''' check the property setting of the distance property '''
+        spell = SpellChecker(distance=1)
+        self.assertEqual(spell.distance, 1)
+        spell.distance = 2
+        self.assertEqual(spell.distance, 2)
+
+    def test_edit_distance_invalud(self):
+        ''' check the property setting of the distance property on invalid inputs '''
+        spell = SpellChecker(distance=None)
+        self.assertEqual(spell.distance, 2)
+        spell.distance = 1
+        self.assertEqual(spell.distance, 1)
+        spell.distance = 'string'
+        self.assertEqual(spell.distance, 2)
+
     def test_edit_distance_two(self):
         ''' test a case where edit distance must be two '''
         here = os.path.dirname(__file__)
@@ -187,3 +203,35 @@ def test_unique_words(self):
         ''' test the unique word count '''
         spell = SpellChecker()
         self.assertEqual(spell.word_frequency.unique_words, len(list(spell.word_frequency.keys())))
+
+    def test_import_export_json(self):
+        ''' test the export functionality as json '''
+        here = os.path.dirname(__file__)
+        filepath = '{}/resources/small_dictionary.json'.format(here)
+
+        spell = SpellChecker(language=None, local_dictionary=filepath)
+        spell.word_frequency.add('meh')
+        new_filepath = '{}/resources/small_dictionary_new.json'.format(here)
+        spell.export(new_filepath, gzipped=False)
+
+        sp = SpellChecker(language=None, local_dictionary=new_filepath)
+        self.assertTrue('meh' in sp)
+        self.assertFalse('bananna' in sp)
+
+        os.remove(new_filepath)
+
+    def test_import_export_gzip(self):
+        ''' test the export functionality as gzip '''
+        here = os.path.dirname(__file__)
+        filepath = '{}/resources/small_dictionary.json'.format(here)
+
+        spell = SpellChecker(language=None, local_dictionary=filepath)
+        spell.word_frequency.add('meh')
+        new_filepath = '{}/resources/small_dictionary_new.json.gz'.format(here)
+        spell.export(new_filepath, gzipped=True)
+
+        sp = SpellChecker(language=None, local_dictionary=new_filepath)
+        self.assertTrue('meh' in sp)
+        self.assertFalse('bananna' in sp)
+
+        os.remove(new_filepath)