Skip to content

Commit

Permalink
v0.5.0 PR (#46)
Browse files Browse the repository at this point in the history
* add better python 2.7 file reading support
* update changelog
* version bump
* update documentation
  • Loading branch information
barrust committed Jul 11, 2019
1 parent ac687b1 commit 1ca044d
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 11 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# pyspellchecker

## Version 0.5.0
* Add tokenizer to the Spell object
* Add support for local dictionaries to be case sensitive
[see PR #44](https://github.com/barrust/pyspellchecker/pull/44). Thanks [@davido-brainlabs](https://github.com/davido-brainlabs)
* Better python 2.7 support for reading gzipped files

## Version 0.4.0
* Add support for a tokenizer for splitting words into tokens

Expand Down
3 changes: 2 additions & 1 deletion docs/source/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,8 @@ dictionary:
from spellchecker import SpellChecker
spell = SpellChecker(language=None) # turn off loading a built language dictionary
# turn off loading a built language dictionary, case sensitive on (if desired)
spell = SpellChecker(language=None, case_sensitive=True)
# if you have a dictionary...
spell.word_frequency.load_dictionary('./path-to-my-json-dictionary.json')
Expand Down
2 changes: 1 addition & 1 deletion spellchecker/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
__maintainer__ = "Tyler Barrus"
__email__ = "barrust@gmail.com"
__license__ = "MIT"
__version__ = "0.4.0"
__version__ = "0.5.0"
__credits__ = ["Peter Norvig"]
__url__ = "https://github.com/barrust/pyspellchecker"
__bugtrack_url__ = "{0}/issues".format(__url__)
25 changes: 19 additions & 6 deletions spellchecker/spellchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,19 @@ class SpellChecker(object):
local_dictionary (str): The path to a locally stored word \
frequency dictionary; if provided, no language will be loaded
distance (int): The edit distance to use. Defaults to 2.
case_sensitive (bool): Flag to use a case sensitive dictionary or not."""
case_sensitive (bool): Flag to use a case sensitive dictionary or \
not, only available when not using a language dictionary.
Note:
Using a case sensitive dictionary can be slow to correct words."""

__slots__ = ["_distance", "_word_frequency", "_tokenizer", "_case_sensitive"]

def __init__(
self, language="en", local_dictionary=None, distance=2, tokenizer=None, \
self,
language="en",
local_dictionary=None,
distance=2,
tokenizer=None,
case_sensitive=False,
):
self._distance = None
Expand Down Expand Up @@ -183,7 +190,11 @@ def unknown(self, words):
Returns:
set: The set of those words from the input that are not in \
the corpus """
tmp = [w if self._case_sensitive else w.lower() for w in words if self._check_if_should_check(w)]
tmp = [
w if self._case_sensitive else w.lower()
for w in words
if self._check_if_should_check(w)
]
return set(w for w in tmp if w not in self._word_frequency.dictionary)

def edit_distance_1(self, word):
Expand Down Expand Up @@ -332,8 +343,8 @@ def tokenize(self, text):
str: The next `word` in the tokenized string
Note:
This is the same as the `spellchecker.split_words()` """
for x in self._tokenizer(text):
yield x if self._case_sensitive else x.lower()
for word in self._tokenizer(text):
yield word if self._case_sensitive else word.lower()

def keys(self):
""" Iterator over the key of the dictionary
Expand Down Expand Up @@ -409,7 +420,9 @@ def load_words(self, words):
Args:
words (list): The list of words to be loaded """
self._dictionary.update([word if self._case_sensitive else word.lower() for word in words])
self._dictionary.update(
[word if self._case_sensitive else word.lower() for word in words]
)
self._update_dictionary()

def add(self, word):
Expand Down
9 changes: 6 additions & 3 deletions spellchecker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@

if sys.version_info < (3, 0):
import io # python 2 text file encoding support

READMODE = 'rb'
WRITEMODE = 'wb'
OPEN = io.open # hijack this
else:
READMODE = 'rt'
WRITEMODE = 'wt'
OPEN = open


Expand All @@ -24,7 +27,7 @@ def load_file(filename, encoding):
str: The string data from the file read
"""
try:
with gzip.open(filename, mode="rt") as fobj:
with gzip.open(filename, mode=READMODE) as fobj:
yield fobj.read()
except (OSError, IOError):
with OPEN(filename, mode="r", encoding=encoding) as fobj:
Expand All @@ -42,7 +45,7 @@ def write_file(filepath, encoding, gzipped, data):
data (str): The data to be written out
"""
if gzipped:
with gzip.open(filepath, "wt") as fobj:
with gzip.open(filepath, WRITEMODE) as fobj:
fobj.write(data)
else:
with OPEN(filepath, "w", encoding=encoding) as fobj:
Expand Down

0 comments on commit 1ca044d

Please sign in to comment.