From 5436804346b16e543fd18080b29cc50aac0c1687 Mon Sep 17 00:00:00 2001 From: Allison Parrish Date: Sun, 5 Jan 2020 23:59:16 -0500 Subject: [PATCH 1/7] initial backends implementation + updated docs --- README.rst | 80 +++++++++++++---- requirements.txt | 1 - setup.py | 8 +- simpleneighbors/__init__.py | 87 ++++++++++++------- simpleneighbors/backends/__init__.py | 28 ++++++ simpleneighbors/backends/annoy_.py | 32 +++++++ simpleneighbors/backends/base.py | 23 +++++ .../backends/bruteforcepurepython.py | 76 ++++++++++++++++ simpleneighbors/backends/sklearn_.py | 72 +++++++++++++++ simpleneighbors/benchmark.py | 46 ++++++++++ tests/test_simpleneighbors.py | 26 +++--- 11 files changed, 412 insertions(+), 67 deletions(-) create mode 100644 simpleneighbors/backends/__init__.py create mode 100644 simpleneighbors/backends/annoy_.py create mode 100644 simpleneighbors/backends/base.py create mode 100644 simpleneighbors/backends/bruteforcepurepython.py create mode 100644 simpleneighbors/backends/sklearn_.py create mode 100644 simpleneighbors/benchmark.py diff --git a/README.rst b/README.rst index aaed333..736e3f1 100644 --- a/README.rst +++ b/README.rst @@ -11,8 +11,12 @@ Simple Neighbors :target: https://pypi.python.org/pypi/simpleneighbors Simple Neighbors is a clean and easy interface for performing nearest-neighbor -lookups on items from a corpus. For example, here's how to find the most -similar color to a color in the `xkcd colors list +lookups on items from a corpus. To install the package:: + + pip install simpleneighbors[annoy] + +Here's a quick example, showing how to find the names of colors most similar to +'pink' in the `xkcd colors list `_:: >>> from simpleneighbors import SimpleNeighbors @@ -26,7 +30,16 @@ similar color to a color in the `xkcd colors list >>> list(sim.neighbors('pink', 5)) ['pink', 'bubblegum pink', 'pale magenta', 'dark mauve', 'light plum'] -Read the documentation here: https://simpleneighbors.readthedocs.org. +For a more complete example, refer to my `Understanding Word Vectors notebook +`_, +which shows how to use Simple Neighbors to perform similarity lookups on word +vectors. + +Read the complete Simple Neighbors documentation here: +https://simpleneighbors.readthedocs.org. + +Why Simple Neighbors? +--------------------- Approximate nearest-neighbor lookups are a quick way to find the items in your data set that are closest (or most similar to) any other item in your data, or @@ -36,28 +49,57 @@ in a 300-dimensional space. You could always perform pairwise distance calculations to find nearest neighbors in your data, but for data of any appreciable size and complexity, -this kind of calculation is unbearably slow. This library uses `Annoy -`_ behind the scenes for approximate -nearest-neighbor lookups, which are ultimately a little less accurate than -pairwise calculations but much, much faster. +this kind of calculation is unbearably slow. Simple Neighbors uses one of a +handful of libraries behind the scenes to provide approximate nearest-neighbor +lookups, which are ultimately a little less accurate than pairwise calculations +but much, much faster. The library also keeps track of your data, sparing you the extra step of -mapping each item in your data to its integer index in Annoy (at the potential -cost of some redundancy in data storage, depending on your application). +mapping each item in your data to its integer index (at the potential cost of +some redundancy in data storage, depending on your application). + +I made Simple Neighbors because I use nearest neighbor lookups all the time and +found myself writing and rewriting the same bits of wrapper code over and over +again. I wanted to hide a little bit of the complexity of using these libraries +to make it easier to build small prototypes and teach workshops using +nearest-neighbor lookups. + +Multiple backend support +------------------------ + +Simple Neighbors relies on the approximate nearest neighbor index +implementations found in other libraries. By default, Simple Neighbors will +choose the best backend based on the packages installed in your environment. +(You can also specify which backend to use by hand, or create your own.) + +Currently supported backend libraries include: + +* ``Annoy``: Erik Bernhardsson's `Annoy `_ library +* ``Sklearn``: `scikit-learn's NearestNeighbors `_ +* ``BruteForcePurePython``: Pure Python brute-force search (included in package) + +When you install Simple Neighbors, you can direct ``pip`` to install the +required packages for a given backend. For example, to install Simple Neighbors +with Annoy:: + + pip install simpleneighbors[annoy] + +Annoy is highly recommended! This is the preferred way to use Simple Neighbors. -I made Simple Neighbors because I use Annoy all the time and found myself -writing and rewriting the same bits of wrapper code over and over again. I -wanted to hide a little bit of the complexity of using Annoy to make it easier -to build small prototypes and teach workshops using nearest-neighbor lookups. +To install Simple Neighbors alongside scikit-learn to use the ``Sklearn`` +backend (which makes use of scikit-learn's `NearestNeighbors` class):: -Installation ------------- + pip install simpleneighbors[sklearn] -Install with pip like so:: +If you can't install Annoy or scikit-learn on your platform, you can also use a +pure Python backend:: - pip install simpleneighbors + pip install simpleneighbors[purepython] -You can also download the source code and install manually:: +Note that the pure Python version uses a brute force search and is therefore +very slow. In general, it's not suitable for datasets with more than a few +thousand items (or more than a handful of dimensions). - python setup.py install +See the documentation for the ``SimpleNeighbors`` class for more information on +specifying backends. diff --git a/requirements.txt b/requirements.txt index 1eb98b6..e69de29 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +0,0 @@ -annoy>=1.12.0 diff --git a/setup.py b/setup.py index bdb870e..2cff5a5 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name='simpleneighbors', - version='0.0.1', + version='0.1.0', author='Allison Parrish', author_email='allison@decontextualize.com', url='https://github.com/aparrish/simpleneighbors', @@ -26,8 +26,12 @@ package_dir={'simpleneighbors': 'simpleneighbors'}, packages=['simpleneighbors'], install_requires=[ - 'annoy' ], + extras_require={ + 'annoy': ['annoy>=1.16.0'], + 'sklearn': ['scikit-learn>=0.20'], + 'purepython': [] + }, platforms='any', test_suite='tests' ) diff --git a/simpleneighbors/__init__.py b/simpleneighbors/__init__.py index 108c014..0661a76 100644 --- a/simpleneighbors/__init__.py +++ b/simpleneighbors/__init__.py @@ -1,32 +1,44 @@ import pickle -import annoy +from simpleneighbors.backends import select_best __author__ = 'Allison Parrish' __email__ = 'allison@decontextualize.com' -__version__ = '0.0.1' +__version__ = '0.1.0' class SimpleNeighbors: """A Simple Neighbors index. - You need to specify the number of dimensions in your data (i.e., the - length of the list or array you plan to provide for each item) and the - distance metric you want to use. (The default is "angular" distance, - i.e., cosine distance. You might also want to try "euclidean" for - Euclidean distance.) Both of these parameters are passed directly to - Annoy; see `the Annoy documentation `_ - for more details. + This class wraps backend implementations of approximate nearest neighbors + indexes with a user-friendly API. When you instantiate this class, it will + automatically select a backend implementation based on packages installed + in your environment. It is HIGHLY RECOMMENDED that you install Annoy (``pip + install annoy``) to enable the Annoy backend! (The alternatives are + slower and not as accurate.) Alternatively, you can specify a backend of + your choosing with the ``backend`` parameter. + + Specify the number of dimensions in your data (i.e., the length of the list + or array you plan to provide for each item) and the distance metric you + want to use. The default is ``angular`` distance, an approximation of + cosine distance. This metric is supported by all backends, as is + ``euclidean`` (for Euclidean distance). Both of these parameters are passed + directly to the backend; see the backend documentation for more details. :param dims: the number of dimensions in your data :param metric: the distance metric to use + :param backend: the nearest neighbors backend to use (default is annoy) """ - def __init__(self, dims, metric="angular"): + def __init__(self, dims, metric="angular", backend=None): + + if backend is None: + backend = select_best() + self.dims = dims self.metric = metric self.id_map = {} self.corpus = [] - self.annoy = annoy.AnnoyIndex(dims, metric=metric) + self.backend = backend(dims, metric=metric) self.i = 0 self.built = False @@ -53,7 +65,7 @@ def add_one(self, item, vector): """ assert self.built is False, "Index already built; can't add new items." - self.annoy.add_item(self.i, vector) + self.backend.add_item(self.i, vector) self.id_map[item] = self.i self.corpus.append(item) self.i += 1 @@ -88,20 +100,25 @@ def feed(self, items): for item, vector in items: self.add_one(item, vector) - def build(self, n=10): + def build(self, n=10, params=None): """Build the index. - After adding all of your items, call this method to build - the index. The specified parameter controls the number of trees in the - underlying Annoy index; a higher number will take longer to build but - provide more precision when querying. + After adding all of your items, call this method to build the index. + The meaning of parameter ``n`` is different for each backend + implementation. For the Annoy backend, it specifies the number of trees + in the underlying Annoy index (a higher number will take longer to + build but provide more precision when querying). For the Sklearn + backend, the number specifies the leaf size when building the ball + tree. (The Brute Force Pure Python backend ignores this value + entirely.) After you call build, you'll no longer be able to add new items to the index. :param n: number of trees + :param params: dictionary with extra parameters to pass to backend """ - self.annoy.build(n) + self.backend.build(n, params) self.built = True def nearest(self, vec, n=12): @@ -130,7 +147,7 @@ def nearest(self, vec, n=12): """ return [self.corpus[idx] for idx - in self.annoy.get_nns_by_vector(vec, n)] + in self.backend.get_nns_by_vector(vec, n)] def neighbors(self, item, n=12): """Returns the items nearest another item in the index. @@ -234,10 +251,10 @@ def dist(self, a, b): :param b: second item :returns: distance between ``a`` and ``b`` """ - return self.annoy.get_distance(self.id_map[a], self.id_map[b]) + return self.backend.get_distance(self.id_map[a], self.id_map[b]) def vec(self, item): - """Returns the vector for an item + """Returns the vector for an item. This method returns the vector that was originally provided when indexing the specified item. (Depending on how it was originally @@ -247,7 +264,7 @@ def vec(self, item): :param item: item to lookup :returns: vector for item """ - return self.annoy.get_item_vector(self.id_map[item]) + return self.backend.get_item_vector(self.id_map[item]) def __len__(self): """Returns the number of items in the vector""" @@ -256,12 +273,14 @@ def __len__(self): def save(self, prefix): """Saves the index to disk. - This method saves the index to disk. Annoy indexes can't be serialized - with `pickle`, so this method produces two files: the serialized Annoy - index, and a pickle with the other data from the object. This method's - parameter specifies the "prefix" to use for these files. The Annoy - index will be saved as ``.annoy`` and the object data will be - saved as ``-data.pkl``. + This method saves the index to disk. Each backend manages serialization + a little bit differently: consult the documentation and source code for + more details. For example, because Annoy indexes can't be serialized + with `pickle`, the Annoy backend's implementation produces two files: + the serialized Annoy index, and a pickle with the other data from the + object. + + This method's parameter specifies the "prefix" to use for these files. :param prefix: filename prefix for Annoy index and object data :returns: None @@ -275,9 +294,10 @@ def save(self, prefix): 'i': self.i, 'built': self.built, 'metric': self.metric, - 'dims': self.dims + 'dims': self.dims, + '_backend_class': self.backend.__class__ }, fh) - self.annoy.save(prefix + ".annoy") + self.backend.save(prefix + ".idx") @classmethod def load(cls, prefix): @@ -286,7 +306,7 @@ def load(cls, prefix): This class method restores a previously-saved index using the specified file prefix. - :param prefix: prefix for AnnoyIndex file and object data pickle + :param prefix: prefix used when saving :returns: SimpleNeighbors object restored from specified files """ @@ -294,11 +314,12 @@ def load(cls, prefix): data = pickle.load(fh) newobj = cls( dims=data['dims'], - metric=data['metric'] + metric=data['metric'], + backend=data['_backend_class'] ) newobj.id_map = data['id_map'] newobj.corpus = data['corpus'] newobj.i = data['i'] newobj.built = data['built'] - newobj.annoy.load(prefix + ".annoy") + newobj.backend.load(prefix + ".idx") return newobj diff --git a/simpleneighbors/backends/__init__.py b/simpleneighbors/backends/__init__.py new file mode 100644 index 0000000..82bc1bc --- /dev/null +++ b/simpleneighbors/backends/__init__.py @@ -0,0 +1,28 @@ +import warnings + +from .annoy_ import Annoy +from .bruteforcepurepython import BruteForcePurePython +from .sklearn_ import Sklearn + +brute_force_message = """ +Using BruteForcePurePython backend (no alternatives available). This backend is +very slow and not appropriate for datasets with more than a few thousand items +(or more than a handful of dimensions). This backend is provided only as a last +resort for users who are not able to install the packages necessary to use the +other (faster and better) backends. + +It is HIGHLY RECOMMENDED that you install Annoy (pip install annoy) or +scikit-learn (pip install scikit-learn). Doing so will make the corresponding +backends available to you and will improve performance dramatically. +""" + +def select_best(): + for b in (Annoy, Sklearn): + if b.available(): + return b + warnings.warn(brute_force_message) + return BruteForcePurePython + +def available(): + return [Annoy, Sklearn, BruteForcePurePython] + diff --git a/simpleneighbors/backends/annoy_.py b/simpleneighbors/backends/annoy_.py new file mode 100644 index 0000000..406f484 --- /dev/null +++ b/simpleneighbors/backends/annoy_.py @@ -0,0 +1,32 @@ +from simpleneighbors.backends.base import BaseBackend + +class Annoy(BaseBackend): + @classmethod + def available(cls): + try: + import annoy + except ImportError: + return False + return True + def __init__(self, dims, metric): + import annoy + self.annoy = annoy.AnnoyIndex(dims, metric=metric) + def add_item(self, idx, vector): + self.annoy.add_item(idx, vector) + def build(self, n, params=None): + self.annoy.build(n) + def get_nns_by_vector(self, vec, n): + return self.annoy.get_nns_by_vector(vec, n) + def get_distance(self, a_idx, b_idx): + return self.annoy.get_distance(a_idx, b_idx) + def get_item_vector(self, idx): + return self.annoy.get_item_vector(idx) + def save(self, fname): + """ + Saves the Annoy index as ``.annoy`` and the object data will be + saved as ``-data.pkl``. + """ + self.annoy.save(fname) + def load(self, fname): + self.annoy.load(fname) + diff --git a/simpleneighbors/backends/base.py b/simpleneighbors/backends/base.py new file mode 100644 index 0000000..7364619 --- /dev/null +++ b/simpleneighbors/backends/base.py @@ -0,0 +1,23 @@ +class BaseBackend: + @classmethod + def available(cls): + return False + def __init__(self, dims, metric): + raise NotImplementedError + def add_item(self, idx, vector): + raise NotImplementedError + def build(self, n, params=None): + raise NotImplementedError + def get_nns_by_vector(self, vec, n): + raise NotImplementedError + def get_distance(self, a_idx, b_idx): + raise NotImplementedError + def get_item_vector(self, idx): + raise NotImplementedError + def save(self, fname): + raise NotImplementedError + @classmethod + def load(cls, fname): + raise NotImplementedError + + diff --git a/simpleneighbors/backends/bruteforcepurepython.py b/simpleneighbors/backends/bruteforcepurepython.py new file mode 100644 index 0000000..fcdd04d --- /dev/null +++ b/simpleneighbors/backends/bruteforcepurepython.py @@ -0,0 +1,76 @@ +from simpleneighbors.backends.base import BaseBackend +from math import sqrt +import pickle +try: + from functools import lru_cache +except: + # for python 2, NOP lru_cache + from functools import wraps + def lru_cache(maxsize=10000): + def deco(fn): + @wraps(fn) + def wrapper(*args): + return fn(*args) + return wrapper + return deco + +@lru_cache(maxsize=10000) +def distance(coord1, coord2): + return sqrt(sum([(i - j)**2 for i, j in zip(coord1, coord2)])) + +def norm(vec): + return sqrt(sum([item**2 for item in vec])) + +@lru_cache(maxsize=10000) +def normalize(vec): + norm_val = norm(vec) + return tuple(item / norm_val for item in vec) + +@lru_cache(maxsize=10000) +def norm_dist(v1, v2): + return distance(normalize(v1), normalize(v2)) + +class BruteForcePurePython(BaseBackend): + + @classmethod + def available(cls): + return True + + def __init__(self, dims, metric): + self.items = [] + assert metric in ('angular', 'euclidean') + if metric == 'angular': + self.dist_fn = norm_dist + elif metric == 'euclidean': + self.dist_fn = distance + else: + raise NotImplementedError('no metric %s for this backend' % metric) + + def add_item(self, idx, vector): + self.items.append(tuple(float(d) for d in vector)) + + def build(self, n, params=None): + return + + def get_nns_by_vector(self, vec, n): + w_idx = sorted( + enumerate(self.items), + key=lambda x: self.dist_fn(x[1], tuple(vec)))[:n] + return [item[0] for item in w_idx] + + def get_distance(self, a_idx, b_idx): + return self.dist_fn(self.items[a_idx], self.items[b_idx]) + + def get_item_vector(self, idx): + return list(self.items[idx]) + + def save(self, fname): + with open(fname, "wb") as fh: + pickle.dump(self, fh) + + def load(self, fname): + with open(fname, "rb") as fh: + obj = pickle.load(fh) + self.items = obj.items + self.dist_fn = obj.dist_fn + diff --git a/simpleneighbors/backends/sklearn_.py b/simpleneighbors/backends/sklearn_.py new file mode 100644 index 0000000..a530656 --- /dev/null +++ b/simpleneighbors/backends/sklearn_.py @@ -0,0 +1,72 @@ +from simpleneighbors.backends.base import BaseBackend +import pickle + +class Sklearn(BaseBackend): + + @classmethod + def available(cls): + try: + from sklearn.neighbors import NearestNeighbors + import numpy as np + except ImportError: + return False + return True + + def __init__(self, dims, metric): + self.items = [] + self.metric = metric + + def add_item(self, idx, vector): + self.items.append([float(d) for d in vector]) + + def build(self, n, params=None): + from sklearn.neighbors import NearestNeighbors + from sklearn.preprocessing import normalize + import numpy as np + data = np.array(self.items) + if self.metric == 'angular': + data = normalize(data, norm='l2') + metric = 'minkowski' # equivalent to euclidean + else: + metric = self.metric + if params is None: + params = {} + self.nn = NearestNeighbors( + algorithm='auto', + leaf_size=n, + metric=metric, + n_jobs=-1, + **params) + self.nn.fit(data) + + def get_nns_by_vector(self, vec, n): + indices = self.nn.kneighbors([vec], n, return_distance=False) + return [item for item in indices[0]] + + def get_distance(self, a_idx, b_idx): + from sklearn.neighbors import DistanceMetric + from sklearn.preprocessing import normalize + import numpy as np + X = np.array([self.items[a_idx], self.items[b_idx]]) + if self.metric == 'angular': + X= normalize(X, norm='l2') + metric = 'minkowski' + else: + metric = self.metric + dist = DistanceMetric.get_metric(metric) + return dist.pairwise(X)[0][1] + + def get_item_vector(self, idx): + return self.items[idx] + + def save(self, fname): + with open(fname, "wb") as fh: + pickle.dump((self.items, self.nn), fh) + + def load(self, fname): + with open(fname, "rb") as fh: + obj = pickle.load(fh) + self.items = obj[0] + self.nn = obj[1] + + diff --git a/simpleneighbors/benchmark.py b/simpleneighbors/benchmark.py new file mode 100644 index 0000000..282a742 --- /dev/null +++ b/simpleneighbors/benchmark.py @@ -0,0 +1,46 @@ +from simpleneighbors import SimpleNeighbors +from simpleneighbors.backends import available + +def benchmark(n=10000, dims=300, query_count=10, metric='angular'): + import numpy as np + from time import time + data = np.random.randn(n, dims) + for backend in available(): + start = time() + print("benchmarking", backend, "at", start) + sim = SimpleNeighbors(dims, metric, backend=backend) + labels = list(range(n)) + print("feeding data") + sim.feed(zip(labels, data)) + print("building index") + sim.build(50) + to_build = time() + print("querying") + for i in range(query_count): + sim.nearest(np.random.randn(dims)) + nearest_query = time() + print(backend, "%0.2f sec to build, %0.2f sec to query %d items" % + (to_build - start, nearest_query - start, query_count)) + +if __name__ == '__main__': + + import argparse + parser = argparse.ArgumentParser( + description='Benchmarks for SimpleNeighbors backends') + parser.add_argument( + "--n", + type=int, + default=10000, + help='number of random data items to generate') + parser.add_argument( + "--dims", + type=int, + default=128, + help='number of dimensions in random data') + parser.add_argument( + "--query-count", + type=int, + default=10, + help='number of queries to perform') + args = parser.parse_args() + benchmark(args.n, args.dims, args.query_count) diff --git a/tests/test_simpleneighbors.py b/tests/test_simpleneighbors.py index 0345199..77d682b 100644 --- a/tests/test_simpleneighbors.py +++ b/tests/test_simpleneighbors.py @@ -4,6 +4,8 @@ from shutil import rmtree from simpleneighbors import SimpleNeighbors +from simpleneighbors.backends import BruteForcePurePython, Annoy, Sklearn +from simpleneighbors.backends.base import BaseBackend data = [ ('mahogany', (74, 1, 0)), @@ -39,15 +41,17 @@ def setUpClass(cls): def tearDownClass(cls): rmtree(cls.tmpdir) - def make_sim(self): - sim = SimpleNeighbors(3) + def make_sim(self, backend=None): + sim = SimpleNeighbors(3, metric='angular', backend=backend) sim.feed(data) sim.add_one(*one_more) sim.build(20) return sim def workflow(self, sim): - + + print("running backend", sim.backend) + self.assertRaises(AssertionError, sim.add_one, *one_more) # +1 because of the call to test .add_one above @@ -76,16 +80,14 @@ def workflow(self, sim): "%0.5f" % sim.dist('topaz', 'dusk'), "0.45335") - def test_workflow(self): - sim = self.make_sim() - self.workflow(sim) - - def test_save_load(self): - sim = self.make_sim() - sim.save(opj(self.tmpdir, 'neighbortest')) - sim2 = SimpleNeighbors.load(opj(self.tmpdir, 'neighbortest')) - self.workflow(sim2) + for backend in Annoy, BruteForcePurePython, Sklearn: + sim = self.make_sim(backend) + self.workflow(sim) + sim.save(opj(self.tmpdir, 'neighbortest')) + sim2 = SimpleNeighbors.load(opj(self.tmpdir, 'neighbortest')) + self.workflow(sim2) + if __name__ == '__main__': unittest.main() From a47c8820a0177d926afa939b22cf365625be9b51 Mon Sep 17 00:00:00 2001 From: Allison Parrish Date: Sun, 12 Jan 2020 17:18:23 -0500 Subject: [PATCH 2/7] tweaks and testing select_best --- simpleneighbors/__init__.py | 2 +- tests/test_select_best.py | 24 ++++++++++++++++++++++++ tests/test_simpleneighbors.py | 1 - 3 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 tests/test_select_best.py diff --git a/simpleneighbors/__init__.py b/simpleneighbors/__init__.py index 0661a76..0d7ea5b 100644 --- a/simpleneighbors/__init__.py +++ b/simpleneighbors/__init__.py @@ -115,7 +115,7 @@ def build(self, n=10, params=None): After you call build, you'll no longer be able to add new items to the index. - :param n: number of trees + :param n: backend-dependent (for Annoy: number of trees) :param params: dictionary with extra parameters to pass to backend """ self.backend.build(n, params) diff --git a/tests/test_select_best.py b/tests/test_select_best.py new file mode 100644 index 0000000..20a88a6 --- /dev/null +++ b/tests/test_select_best.py @@ -0,0 +1,24 @@ +import unittest +try: + from unittest import mock +except: + import mock +import warnings +from simpleneighbors.backends import select_best +from simpleneighbors.backends import Annoy, Sklearn, BruteForcePurePython + +class TestSelectBest(unittest.TestCase): + + def test_select_best(self): + self.assertEqual(select_best(), Annoy) + with mock.patch.dict('sys.modules', {'annoy': None}): + self.assertEqual(select_best(), Sklearn) + with mock.patch.dict('sys.modules', + {'annoy': None, 'sklearn.neighbors': None}): + with warnings.catch_warnings(record=True) as w: + self.assertEqual(select_best(), BruteForcePurePython) + self.assertIn("very slow", str(w[-1].message)) + self.assertIn("not appropriate", str(w[-1].message)) + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_simpleneighbors.py b/tests/test_simpleneighbors.py index 77d682b..c704a8b 100644 --- a/tests/test_simpleneighbors.py +++ b/tests/test_simpleneighbors.py @@ -5,7 +5,6 @@ from simpleneighbors import SimpleNeighbors from simpleneighbors.backends import BruteForcePurePython, Annoy, Sklearn -from simpleneighbors.backends.base import BaseBackend data = [ ('mahogany', (74, 1, 0)), From 1cd89a6b2fa101cd6c64cc95025e635866db529b Mon Sep 17 00:00:00 2001 From: Allison Parrish Date: Sun, 12 Jan 2020 17:51:09 -0500 Subject: [PATCH 3/7] flake8 compliance --- simpleneighbors/__init__.py | 4 ++-- simpleneighbors/backends/__init__.py | 3 ++- simpleneighbors/backends/annoy_.py | 13 +++++++++++-- simpleneighbors/backends/base.py | 11 +++++++++-- simpleneighbors/backends/bruteforcepurepython.py | 10 ++++++++-- simpleneighbors/backends/sklearn_.py | 13 ++++++------- simpleneighbors/benchmark.py | 4 +++- tests/test_select_best.py | 6 ++++-- tests/test_simpleneighbors.py | 13 ++++++------- 9 files changed, 51 insertions(+), 26 deletions(-) diff --git a/simpleneighbors/__init__.py b/simpleneighbors/__init__.py index 0d7ea5b..86c85db 100644 --- a/simpleneighbors/__init__.py +++ b/simpleneighbors/__init__.py @@ -13,7 +13,7 @@ class SimpleNeighbors: indexes with a user-friendly API. When you instantiate this class, it will automatically select a backend implementation based on packages installed in your environment. It is HIGHLY RECOMMENDED that you install Annoy (``pip - install annoy``) to enable the Annoy backend! (The alternatives are + install annoy``) to enable the Annoy backend! (The alternatives are slower and not as accurate.) Alternatively, you can specify a backend of your choosing with the ``backend`` parameter. @@ -279,7 +279,7 @@ def save(self, prefix): with `pickle`, the Annoy backend's implementation produces two files: the serialized Annoy index, and a pickle with the other data from the object. - + This method's parameter specifies the "prefix" to use for these files. :param prefix: filename prefix for Annoy index and object data diff --git a/simpleneighbors/backends/__init__.py b/simpleneighbors/backends/__init__.py index 82bc1bc..a644408 100644 --- a/simpleneighbors/backends/__init__.py +++ b/simpleneighbors/backends/__init__.py @@ -16,6 +16,7 @@ backends available to you and will improve performance dramatically. """ + def select_best(): for b in (Annoy, Sklearn): if b.available(): @@ -23,6 +24,6 @@ def select_best(): warnings.warn(brute_force_message) return BruteForcePurePython + def available(): return [Annoy, Sklearn, BruteForcePurePython] - diff --git a/simpleneighbors/backends/annoy_.py b/simpleneighbors/backends/annoy_.py index 406f484..ee25061 100644 --- a/simpleneighbors/backends/annoy_.py +++ b/simpleneighbors/backends/annoy_.py @@ -1,32 +1,41 @@ from simpleneighbors.backends.base import BaseBackend + class Annoy(BaseBackend): + @classmethod def available(cls): try: - import annoy + import annoy # noqa: F401 except ImportError: return False return True + def __init__(self, dims, metric): import annoy self.annoy = annoy.AnnoyIndex(dims, metric=metric) + def add_item(self, idx, vector): self.annoy.add_item(idx, vector) + def build(self, n, params=None): self.annoy.build(n) + def get_nns_by_vector(self, vec, n): return self.annoy.get_nns_by_vector(vec, n) + def get_distance(self, a_idx, b_idx): return self.annoy.get_distance(a_idx, b_idx) + def get_item_vector(self, idx): return self.annoy.get_item_vector(idx) + def save(self, fname): """ Saves the Annoy index as ``.annoy`` and the object data will be saved as ``-data.pkl``. """ self.annoy.save(fname) + def load(self, fname): self.annoy.load(fname) - diff --git a/simpleneighbors/backends/base.py b/simpleneighbors/backends/base.py index 7364619..6435052 100644 --- a/simpleneighbors/backends/base.py +++ b/simpleneighbors/backends/base.py @@ -1,23 +1,30 @@ class BaseBackend: + @classmethod def available(cls): return False + def __init__(self, dims, metric): raise NotImplementedError + def add_item(self, idx, vector): raise NotImplementedError + def build(self, n, params=None): raise NotImplementedError + def get_nns_by_vector(self, vec, n): raise NotImplementedError + def get_distance(self, a_idx, b_idx): raise NotImplementedError + def get_item_vector(self, idx): raise NotImplementedError + def save(self, fname): raise NotImplementedError + @classmethod def load(cls, fname): raise NotImplementedError - - diff --git a/simpleneighbors/backends/bruteforcepurepython.py b/simpleneighbors/backends/bruteforcepurepython.py index fcdd04d..ed683f6 100644 --- a/simpleneighbors/backends/bruteforcepurepython.py +++ b/simpleneighbors/backends/bruteforcepurepython.py @@ -1,11 +1,13 @@ from simpleneighbors.backends.base import BaseBackend from math import sqrt import pickle + try: from functools import lru_cache -except: +except ImportError: # for python 2, NOP lru_cache from functools import wraps + def lru_cache(maxsize=10000): def deco(fn): @wraps(fn) @@ -14,22 +16,27 @@ def wrapper(*args): return wrapper return deco + @lru_cache(maxsize=10000) def distance(coord1, coord2): return sqrt(sum([(i - j)**2 for i, j in zip(coord1, coord2)])) + def norm(vec): return sqrt(sum([item**2 for item in vec])) + @lru_cache(maxsize=10000) def normalize(vec): norm_val = norm(vec) return tuple(item / norm_val for item in vec) + @lru_cache(maxsize=10000) def norm_dist(v1, v2): return distance(normalize(v1), normalize(v2)) + class BruteForcePurePython(BaseBackend): @classmethod @@ -73,4 +80,3 @@ def load(self, fname): obj = pickle.load(fh) self.items = obj.items self.dist_fn = obj.dist_fn - diff --git a/simpleneighbors/backends/sklearn_.py b/simpleneighbors/backends/sklearn_.py index a530656..1232125 100644 --- a/simpleneighbors/backends/sklearn_.py +++ b/simpleneighbors/backends/sklearn_.py @@ -1,13 +1,14 @@ from simpleneighbors.backends.base import BaseBackend import pickle + class Sklearn(BaseBackend): @classmethod def available(cls): try: - from sklearn.neighbors import NearestNeighbors - import numpy as np + from sklearn.neighbors import NearestNeighbors # noqa: F401 + import numpy as np # noqa: F401 except ImportError: return False return True @@ -18,7 +19,7 @@ def __init__(self, dims, metric): def add_item(self, idx, vector): self.items.append([float(d) for d in vector]) - + def build(self, n, params=None): from sklearn.neighbors import NearestNeighbors from sklearn.preprocessing import normalize @@ -26,7 +27,7 @@ def build(self, n, params=None): data = np.array(self.items) if self.metric == 'angular': data = normalize(data, norm='l2') - metric = 'minkowski' # equivalent to euclidean + metric = 'minkowski' # equivalent to euclidean else: metric = self.metric if params is None: @@ -49,7 +50,7 @@ def get_distance(self, a_idx, b_idx): import numpy as np X = np.array([self.items[a_idx], self.items[b_idx]]) if self.metric == 'angular': - X= normalize(X, norm='l2') + X = normalize(X, norm='l2') metric = 'minkowski' else: metric = self.metric @@ -68,5 +69,3 @@ def load(self, fname): obj = pickle.load(fh) self.items = obj[0] self.nn = obj[1] - - diff --git a/simpleneighbors/benchmark.py b/simpleneighbors/benchmark.py index 282a742..a562575 100644 --- a/simpleneighbors/benchmark.py +++ b/simpleneighbors/benchmark.py @@ -1,6 +1,7 @@ from simpleneighbors import SimpleNeighbors from simpleneighbors.backends import available + def benchmark(n=10000, dims=300, query_count=10, metric='angular'): import numpy as np from time import time @@ -20,7 +21,8 @@ def benchmark(n=10000, dims=300, query_count=10, metric='angular'): sim.nearest(np.random.randn(dims)) nearest_query = time() print(backend, "%0.2f sec to build, %0.2f sec to query %d items" % - (to_build - start, nearest_query - start, query_count)) + (to_build - start, nearest_query - start, query_count)) + if __name__ == '__main__': diff --git a/tests/test_select_best.py b/tests/test_select_best.py index 20a88a6..a2ab7f3 100644 --- a/tests/test_select_best.py +++ b/tests/test_select_best.py @@ -1,12 +1,13 @@ import unittest try: from unittest import mock -except: +except ImportError: import mock import warnings from simpleneighbors.backends import select_best from simpleneighbors.backends import Annoy, Sklearn, BruteForcePurePython + class TestSelectBest(unittest.TestCase): def test_select_best(self): @@ -14,11 +15,12 @@ def test_select_best(self): with mock.patch.dict('sys.modules', {'annoy': None}): self.assertEqual(select_best(), Sklearn) with mock.patch.dict('sys.modules', - {'annoy': None, 'sklearn.neighbors': None}): + {'annoy': None, 'sklearn.neighbors': None}): with warnings.catch_warnings(record=True) as w: self.assertEqual(select_best(), BruteForcePurePython) self.assertIn("very slow", str(w[-1].message)) self.assertIn("not appropriate", str(w[-1].message)) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_simpleneighbors.py b/tests/test_simpleneighbors.py index c704a8b..d085402 100644 --- a/tests/test_simpleneighbors.py +++ b/tests/test_simpleneighbors.py @@ -30,6 +30,7 @@ one_more = ('purpley', (135, 86, 228)) + class TestSimpleNeighbors(unittest.TestCase): @classmethod @@ -39,7 +40,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): rmtree(cls.tmpdir) - + def make_sim(self, backend=None): sim = SimpleNeighbors(3, metric='angular', backend=backend) sim.feed(data) @@ -51,8 +52,8 @@ def workflow(self, sim): print("running backend", sim.backend) - self.assertRaises(AssertionError, - sim.add_one, *one_more) + self.assertRaises(AssertionError, sim.add_one, *one_more) + # +1 because of the call to test .add_one above self.assertEqual(len(sim), len(data) + 1) @@ -67,12 +68,11 @@ def workflow(self, sim): sim.nearest([100, 100, 200], 3), ['dusk', 'purpley', 'french blue']) - nm = list(sim.neighbors_matching('mint', 1, - lambda x: 'a' in x)) + nm = list(sim.neighbors_matching('mint', 1, lambda x: 'a' in x)) self.assertEqual(nm[0], 'battleship grey') nm = list(sim.nearest_matching([100, 100, 200], 1, - lambda x: x.startswith('p'))) + lambda x: x.startswith('p'))) self.assertEqual(nm[0], 'purpley') self.assertEqual( @@ -90,4 +90,3 @@ def test_workflow(self): if __name__ == '__main__': unittest.main() - From a71333a5c852395329e5cdc1e2c6ce4dd659d11f Mon Sep 17 00:00:00 2001 From: Allison Parrish Date: Sun, 12 Jan 2020 17:52:45 -0500 Subject: [PATCH 4/7] build stuff --- .travis.yml | 8 ++++---- Makefile | 5 ----- setup.py | 9 ++++++++- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index ae7b826..2e66665 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,8 @@ language: python python: - "pypy3" - "pypy" + - "3.8" + - "3.7" - "3.6" - "3.5" - "3.4" @@ -14,7 +16,7 @@ sudo: false # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors install: - - pip install -r requirements.txt + - pip install -e .[dev] - pip install coverage # command to run tests, e.g. python setup.py test @@ -28,6 +30,4 @@ after_success: after_script: - coverage report # show coverage on cmd line -- pip install pycodestyle pyflakes -- pyflakes . | tee >(wc -l) # static analysis -- pycodestyle --statistics --count . # static analysis +- flake8 simpleneighbors tests diff --git a/Makefile b/Makefile index 9879798..8576764 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,6 @@ help: @echo "clean-test - remove test and coverage artifacts" @echo "lint - check style with flake8" @echo "test - run tests quickly with the default Python" - @echo "test-all - run tests on every Python version with tox" @echo "coverage - check code coverage quickly with the default Python" @echo "docs - generate Sphinx HTML documentation, including API docs" @echo "release - package and upload a release" @@ -30,7 +29,6 @@ clean-pyc: find . -name '__pycache__' -exec rm -fr {} + clean-test: - rm -fr .tox/ rm -f .coverage rm -fr htmlcov/ @@ -41,9 +39,6 @@ test: python setup.py test python -m doctest simpleneighbors/__init__.py -test-all: - tox - coverage: coverage run --source simpleneighbors setup.py test coverage report -m diff --git a/setup.py b/setup.py index 2cff5a5..1a5a550 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,14 @@ extras_require={ 'annoy': ['annoy>=1.16.0'], 'sklearn': ['scikit-learn>=0.20'], - 'purepython': [] + 'purepython': [], + 'dev': [ + 'annoy>=1.16.0', + 'scikit-learn>=0.20', + 'mock;python_version<="2.7"', + 'coverage', + 'flake8', + ] }, platforms='any', test_suite='tests' From f917272e204ceb95da2d1e28b35c5534bf19688c Mon Sep 17 00:00:00 2001 From: Allison Parrish Date: Sun, 12 Jan 2020 18:37:28 -0500 Subject: [PATCH 5/7] workaround for numpy weirdness? --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1a5a550..4b7a8ba 100644 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ 'sklearn': ['scikit-learn>=0.20'], 'purepython': [], 'dev': [ + 'numpy==1.15.4', # see https://github.com/numpy/numpy/issues/14012 'annoy>=1.16.0', 'scikit-learn>=0.20', 'mock;python_version<="2.7"', From c19bd342139524ef2ce71a6c149e77b9566da8c9 Mon Sep 17 00:00:00 2001 From: Allison Parrish Date: Sun, 12 Jan 2020 21:52:12 -0500 Subject: [PATCH 6/7] another attempt at getting travis to work, ugh --- .travis.yml | 2 +- requirements.txt | 0 setup.py | 1 - tox.ini | 23 ----------------------- 4 files changed, 1 insertion(+), 25 deletions(-) delete mode 100644 requirements.txt delete mode 100644 tox.ini diff --git a/.travis.yml b/.travis.yml index 2e66665..3d3c7af 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ install: # command to run tests, e.g. python setup.py test script: - - coverage run --source simpleneighbors setup.py test --verbose + - coverage run --source simpleneighbors tests/test_simpleneighbors.py --verbose - python -m doctest simpleneighbors/__init__.py after_success: diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index e69de29..0000000 diff --git a/setup.py b/setup.py index 4b7a8ba..1a5a550 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,6 @@ 'sklearn': ['scikit-learn>=0.20'], 'purepython': [], 'dev': [ - 'numpy==1.15.4', # see https://github.com/numpy/numpy/issues/14012 'annoy>=1.16.0', 'scikit-learn>=0.20', 'mock;python_version<="2.7"', diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 96a8f23..0000000 --- a/tox.ini +++ /dev/null @@ -1,23 +0,0 @@ -[tox] -skipsdist = True -usedevelop = True -envlist = py{27,36,37}, - flake8 - -[testenv] -install_command = pip install -e ".[dev]" -U {packages} -basepython = - py27: python2.7 - py36: python3.6 - py37: python3.7 -deps = - -r{toxinidir}/requirements.txt -commands = python setup.py test - -[testenv:flake8] -basepython = - python3.6 -deps = - flake8 -commands = - flake8 simpleneighbors From 6ccc9dc1d7c976ec067325605fc0a53e06f556cd Mon Sep 17 00:00:00 2001 From: Allison Parrish Date: Sun, 12 Jan 2020 21:56:57 -0500 Subject: [PATCH 7/7] not even going to try, sorry --- .travis.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3d3c7af..9deb4cb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,8 +2,6 @@ language: python python: - - "pypy3" - - "pypy" - "3.8" - "3.7" - "3.6"