v0.5.0 PR (#46)

* add better python 2.7 file reading support
* update changelog
* version bump
* update documentation
barrust · Jul 11, 2019 · 1ca044d · 1ca044d
1 parent ac687b1
commit 1ca044d
Show file tree

Hide file tree

Showing 5 changed files with 34 additions and 11 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # pyspellchecker
 
+## Version 0.5.0
+* Add tokenizer to the Spell object
+* Add Support for local dictionaries to be case sensitive
+[see PR #44](https://github.com/barrust/pyspellchecker/pull/44) Thanks [@davido-brainlabs ](https://github.com/davido-brainlabs)
+* Better python 2.7 support for reading gzipped files
+
 ## Version 0.4.0
 * Add support for a tokenizer for splitting words into tokens
 

diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
@@ -193,7 +193,8 @@ dictionary:
 
     from spellchecker import SpellChecker
 
-    spell = SpellChecker(language=None)  # turn off loading a built language dictionary
+    # turn off loading a built language dictionary, case sensitive on (if desired)
+    spell = SpellChecker(language=None, case_sensitive=True)
 
     # if you have a dictionary...
     spell.word_frequency.load_dictionary('./path-to-my-json-dictionary.json')

diff --git a/spellchecker/info.py b/spellchecker/info.py
@@ -5,7 +5,7 @@
 __maintainer__ = "Tyler Barrus"
 __email__ = "barrust@gmail.com"
 __license__ = "MIT"
-__version__ = "0.4.0"
+__version__ = "0.5.0"
 __credits__ = ["Peter Norvig"]
 __url__ = "https://github.com/barrust/pyspellchecker"
 __bugtrack_url__ = "{0}/issues".format(__url__)
diff --git a/spellchecker/spellchecker.py b/spellchecker/spellchecker.py
@@ -22,12 +22,19 @@ class SpellChecker(object):
             local_dictionary (str): The path to a locally stored word \
             frequency dictionary; if provided, no language will be loaded
             distance (int): The edit distance to use. Defaults to 2.
-            case_sensitive (bool): Flag to use a case sensitive dictionary or not."""
+            case_sensitive (bool): Flag to use a case sensitive dictionary or \
+            not, only available when not using a language dictionary.
+        Note:
+            Using a case sensitive dictionary can be slow to correct words."""
 
     __slots__ = ["_distance", "_word_frequency", "_tokenizer", "_case_sensitive"]
 
     def __init__(
-        self, language="en", local_dictionary=None, distance=2, tokenizer=None, \
+        self,
+        language="en",
+        local_dictionary=None,
+        distance=2,
+        tokenizer=None,
         case_sensitive=False,
     ):
         self._distance = None
@@ -183,7 +190,11 @@ def unknown(self, words):
             Returns:
                 set: The set of those words from the input that are not in \
                 the corpus """
-        tmp = [w if self._case_sensitive else w.lower() for w in words if self._check_if_should_check(w)]
+        tmp = [
+            w if self._case_sensitive else w.lower()
+            for w in words
+            if self._check_if_should_check(w)
+        ]
         return set(w for w in tmp if w not in self._word_frequency.dictionary)
 
     def edit_distance_1(self, word):
@@ -332,8 +343,8 @@ def tokenize(self, text):
                 str: The next `word` in the tokenized string
             Note:
                 This is the same as the `spellchecker.split_words()` """
-        for x in self._tokenizer(text):
-            yield x if self._case_sensitive else x.lower()
+        for word in self._tokenizer(text):
+            yield word if self._case_sensitive else word.lower()
 
     def keys(self):
         """ Iterator over the key of the dictionary
@@ -409,7 +420,9 @@ def load_words(self, words):
 
             Args:
                 words (list): The list of words to be loaded """
-        self._dictionary.update([word if self._case_sensitive else word.lower() for word in words])
+        self._dictionary.update(
+            [word if self._case_sensitive else word.lower() for word in words]
+        )
         self._update_dictionary()
 
     def add(self, word):

diff --git a/spellchecker/utils.py b/spellchecker/utils.py
@@ -6,9 +6,12 @@
 
 if sys.version_info < (3, 0):
     import io  # python 2 text file encoding support
-
+    READMODE = 'rb'
+    WRITEMODE = 'wb'
     OPEN = io.open  # hijack this
 else:
+    READMODE = 'rt'
+    WRITEMODE = 'wt'
     OPEN = open
 
 
@@ -24,7 +27,7 @@ def load_file(filename, encoding):
             str: The string data from the file read
     """
     try:
-        with gzip.open(filename, mode="rt") as fobj:
+        with gzip.open(filename, mode=READMODE) as fobj:
             yield fobj.read()
     except (OSError, IOError):
         with OPEN(filename, mode="r", encoding=encoding) as fobj:
@@ -42,7 +45,7 @@ def write_file(filepath, encoding, gzipped, data):
             data (str): The data to be written out
     """
     if gzipped:
-        with gzip.open(filepath, "wt") as fobj:
+        with gzip.open(filepath, WRITEMODE) as fobj:
             fobj.write(data)
     else:
         with OPEN(filepath, "w", encoding=encoding) as fobj: