# Review notes. This file is a git patch whose line breaks have been collapsed
# into spaces; the patch text below these notes is left unchanged.
#
# What the patch does, per its hunks:
#   * .gitignore: adds the entries /dist/ and *#*.
#   * cities.gz: added as a new binary file (its contents are not part of
#     this text).
#   * fuzzyset/__init__.py, imports: adds `import Levenshtein`.
#   * fuzzyset/__init__.py, __getitem__: after the existing descending sort,
#     takes the first 50 entries of `results`, replaces each entry's first
#     element with Levenshtein.distance(matched, value), and sorts ascending.
#     The entries whose first element equals results[0][0] are still what is
#     returned, so that element is now an edit distance (smaller is closer)
#     instead of the earlier match_score-based value.
#   * fuzzyset/__init__.py, test code: replaces the old
#     `if __name__ == '__main__':` header with _interactive_test() and
#     _other_test(), reads ./cities.gz through gzip.GzipFile instead of
#     opening ./cities, changes `while False:` to `while True:`, and appends a
#     new __main__ guard that calls _interactive_test() (the _other_test()
#     call is present only as a comment).
#   * setup.py: adds install_requires=['python-levenshtein'].
#
# NOTE(review): `f` appears to be assigned inside _interactive_test(), while
#   _other_test() calls f.get(...). Unless a module-level `f` exists outside
#   these hunks, _other_test() would raise NameError once enabled -- confirm.
# NOTE(review): the test helpers use `raw_input` and the `print` statement, so
#   they presumably target Python 2 only -- confirm the supported versions.
diff --git a/.gitignore b/.gitignore index b04accb..41bed9e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ *.egg-info +/dist/ /lib/ /bin/ /include/ *~ *.pyc +*#* diff --git a/cities.gz b/cities.gz new file mode 100644 index 0000000..a9406cb Binary files /dev/null and b/cities.gz differ diff --git a/fuzzyset/__init__.py b/fuzzyset/__init__.py index aeb9604..39166d5 100644 --- a/fuzzyset/__init__.py +++ b/fuzzyset/__init__.py @@ -1,6 +1,7 @@ import re import math import collections +import Levenshtein _non_word_re = re.compile(r'[^\w, ]+') @@ -44,6 +45,9 @@ def __getitem__(self, value): results = [(match_score / (norm * self.items[idx][0]), self.items[idx][1]) for idx, match_score in matches.items()] results.sort(reverse=True) + results = [(Levenshtein.distance(matched, value), matched) + for _, matched in results[:50]] + results.sort() if results: return [result for result in results if result[0] == results[0][0]] @@ -67,14 +71,16 @@ def _iterate_grams(value, gram_size=2): for i in range(len(simplified) - gram_size + 1): yield simplified[i:i + gram_size] -if __name__ == '__main__': - with open('./cities') as input_file: +def _interactive_test(): + import gzip + with gzip.GzipFile('./cities.gz') as input_file: f = FuzzySet((line.strip() for line in input_file), gram_size=2) - while False: + while True: town = raw_input("Enter town name: ") print f[town] +def _other_test(): with open('./origin_cities') as cities: for line in cities: result = f.get(line.strip()) @@ -82,3 +88,7 @@ def _iterate_grams(value, gram_size=2): print "{}: Could not find".format(line.strip()) elif isinstance(result, list): print "{}: {}".format(line.strip(), result) + +if __name__ == '__main__': + _interactive_test() + #_other_test() diff --git a/setup.py b/setup.py index 05433db..d464b08 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ def read(fname): url = "https://github.com/axiak/fuzzyset/", packages=['fuzzyset'], long_description=read('README.rst'), + 
install_requires=['python-levenshtein'], classifiers=[ "Development Status :: 3 - Alpha", "License :: OSI Approved :: BSD License",