From 5436804346b16e543fd18080b29cc50aac0c1687 Mon Sep 17 00:00:00 2001
From: Allison Parrish <allison@decontextualize.com>
Date: Sun, 5 Jan 2020 23:59:16 -0500
Subject: [PATCH 1/7] initial backends implementation + updated docs

---
 README.rst                                    | 80 +++++++++++++----
 requirements.txt                              |  1 -
 setup.py                                      |  8 +-
 simpleneighbors/__init__.py                   | 87 ++++++++++++-------
 simpleneighbors/backends/__init__.py          | 28 ++++++
 simpleneighbors/backends/annoy_.py            | 32 +++++++
 simpleneighbors/backends/base.py              | 23 +++++
 .../backends/bruteforcepurepython.py          | 76 ++++++++++++++++
 simpleneighbors/backends/sklearn_.py          | 72 +++++++++++++++
 simpleneighbors/benchmark.py                  | 46 ++++++++++
 tests/test_simpleneighbors.py                 | 26 +++---
 11 files changed, 412 insertions(+), 67 deletions(-)
 create mode 100644 simpleneighbors/backends/__init__.py
 create mode 100644 simpleneighbors/backends/annoy_.py
 create mode 100644 simpleneighbors/backends/base.py
 create mode 100644 simpleneighbors/backends/bruteforcepurepython.py
 create mode 100644 simpleneighbors/backends/sklearn_.py
 create mode 100644 simpleneighbors/benchmark.py

diff --git a/README.rst b/README.rst
index aaed333..736e3f1 100644
--- a/README.rst
+++ b/README.rst
@@ -11,8 +11,12 @@ Simple Neighbors
         :target: https://pypi.python.org/pypi/simpleneighbors
 
 Simple Neighbors is a clean and easy interface for performing nearest-neighbor
-lookups on items from a corpus. For example, here's how to find the most
-similar color to a color in the `xkcd colors list
+lookups on items from a corpus. To install the package::
+
+    pip install simpleneighbors[annoy]
+
+Here's a quick example, showing how to find the names of colors most similar to
+'pink' in the `xkcd colors list
 <https://github.com/dariusk/corpora/blob/master/data/colors/xkcd.json>`_::
 
     >>> from simpleneighbors import SimpleNeighbors
@@ -26,7 +30,16 @@ similar color to a color in the `xkcd colors list
     >>> list(sim.neighbors('pink', 5))
     ['pink', 'bubblegum pink', 'pale magenta', 'dark mauve', 'light plum']
 
-Read the documentation here: https://simpleneighbors.readthedocs.org.
+For a more complete example, refer to my `Understanding Word Vectors notebook
+<https://github.com/aparrish/rwet/blob/master/understanding-word-vectors.ipynb>`_,
+which shows how to use Simple Neighbors to perform similarity lookups on word
+vectors.
+
+Read the complete Simple Neighbors documentation here:
+https://simpleneighbors.readthedocs.org.
+
+Why Simple Neighbors?
+---------------------
 
 Approximate nearest-neighbor lookups are a quick way to find the items in your
 data set that are closest (or most similar to) any other item in your data, or
@@ -36,28 +49,57 @@ in a 300-dimensional space.
 
 You could always perform pairwise distance calculations to find nearest
 neighbors in your data, but for data of any appreciable size and complexity,
-this kind of calculation is unbearably slow. This library uses `Annoy
-<https://pypi.org/project/annoy/>`_ behind the scenes for approximate
-nearest-neighbor lookups, which are ultimately a little less accurate than
-pairwise calculations but much, much faster.
+this kind of calculation is unbearably slow. Simple Neighbors uses one of a
+handful of libraries behind the scenes to provide approximate nearest-neighbor
+lookups, which are ultimately a little less accurate than pairwise calculations
+but much, much faster.
 
 The library also keeps track of your data, sparing you the extra step of
-mapping each item in your data to its integer index in Annoy (at the potential
-cost of some redundancy in data storage, depending on your application).
+mapping each item in your data to its integer index (at the potential cost of
+some redundancy in data storage, depending on your application).
+
+I made Simple Neighbors because I use nearest neighbor lookups all the time and
+found myself writing and rewriting the same bits of wrapper code over and over
+again. I wanted to hide a little bit of the complexity of using these libraries
+to make it easier to build small prototypes and teach workshops using
+nearest-neighbor lookups.
+
+Multiple backend support
+------------------------
+
+Simple Neighbors relies on the approximate nearest neighbor index
+implementations found in other libraries. By default, Simple Neighbors will
+choose the best backend based on the packages installed in your environment.
+(You can also specify which backend to use by hand, or create your own.)
+
+Currently supported backend libraries include:
+
+* ``Annoy``: Erik Bernhardsson's `Annoy <https://pypi.org/project/annoy/>`_ library
+* ``Sklearn``: `scikit-learn's NearestNeighbors <https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html#sklearn.neighbors.NearestNeighbors>`_
+* ``BruteForcePurePython``: Pure Python brute-force search (included in package)
+
+When you install Simple Neighbors, you can direct ``pip`` to install the
+required packages for a given backend. For example, to install Simple Neighbors
+with Annoy::
+
+    pip install simpleneighbors[annoy]
+
+Annoy is highly recommended! This is the preferred way to use Simple Neighbors.
 
-I made Simple Neighbors because I use Annoy all the time and found myself
-writing and rewriting the same bits of wrapper code over and over again. I
-wanted to hide a little bit of the complexity of using Annoy to make it easier
-to build small prototypes and teach workshops using nearest-neighbor lookups.
+To install Simple Neighbors alongside scikit-learn to use the ``Sklearn``
+backend (which makes use of scikit-learn's ``NearestNeighbors`` class)::
 
-Installation
-------------
+    pip install simpleneighbors[sklearn]
 
-Install with pip like so::
+If you can't install Annoy or scikit-learn on your platform, you can also use a
+pure Python backend::
 
-    pip install simpleneighbors
+    pip install simpleneighbors[purepython]
 
-You can also download the source code and install manually::
+Note that the pure Python version uses a brute force search and is therefore
+very slow. In general, it's not suitable for datasets with more than a few
+thousand items (or more than a handful of dimensions).
 
-    python setup.py install
+See the documentation for the ``SimpleNeighbors`` class for more information on
+specifying backends.
 
diff --git a/requirements.txt b/requirements.txt
index 1eb98b6..e69de29 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +0,0 @@
-annoy>=1.12.0
diff --git a/setup.py b/setup.py
index bdb870e..2cff5a5 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name='simpleneighbors',
-    version='0.0.1',
+    version='0.1.0',
     author='Allison Parrish',
     author_email='allison@decontextualize.com',
     url='https://github.com/aparrish/simpleneighbors',
@@ -26,8 +26,12 @@
     package_dir={'simpleneighbors': 'simpleneighbors'},
     packages=['simpleneighbors'],
     install_requires=[
-        'annoy'
     ],
+    extras_require={
+        'annoy': ['annoy>=1.16.0'],
+        'sklearn': ['scikit-learn>=0.20'],
+        'purepython': []
+    },
     platforms='any',
     test_suite='tests'
 )
diff --git a/simpleneighbors/__init__.py b/simpleneighbors/__init__.py
index 108c014..0661a76 100644
--- a/simpleneighbors/__init__.py
+++ b/simpleneighbors/__init__.py
@@ -1,32 +1,44 @@
 import pickle
-import annoy
+from simpleneighbors.backends import select_best
 
 __author__ = 'Allison Parrish'
 __email__ = 'allison@decontextualize.com'
-__version__ = '0.0.1'
+__version__ = '0.1.0'
 
 
 class SimpleNeighbors:
     """A Simple Neighbors index.
 
-    You need to specify the number of dimensions in your data (i.e., the
-    length of the list or array you plan to provide for each item) and the
-    distance metric you want to use. (The default is "angular" distance,
-    i.e., cosine distance. You might also want to try "euclidean" for
-    Euclidean distance.) Both of these parameters are passed directly to
-    Annoy; see `the Annoy documentation <https://pypi.org/project/annoy/>`_
-    for more details.
+    This class wraps backend implementations of approximate nearest neighbors
+    indexes with a user-friendly API. When you instantiate this class, it will
+    automatically select a backend implementation based on packages installed
+    in your environment. It is HIGHLY RECOMMENDED that you install Annoy (``pip
+    install annoy``) to enable the Annoy backend! (The alternatives are 
+    slower and not as accurate.) Alternatively, you can specify a backend of
+    your choosing with the ``backend`` parameter.
+
+    Specify the number of dimensions in your data (i.e., the length of the list
+    or array you plan to provide for each item) and the distance metric you
+    want to use. The default is ``angular`` distance, an approximation of
+    cosine distance. This metric is supported by all backends, as is
+    ``euclidean`` (for Euclidean distance). Both of these parameters are passed
+    directly to the backend; see the backend documentation for more details.
 
     :param dims: the number of dimensions in your data
     :param metric: the distance metric to use
+    :param backend: backend class to use (default: best available backend)
     """
 
-    def __init__(self, dims, metric="angular"):
+    def __init__(self, dims, metric="angular", backend=None):
+
+        if backend is None:
+            backend = select_best()
+
         self.dims = dims
         self.metric = metric
         self.id_map = {}
         self.corpus = []
-        self.annoy = annoy.AnnoyIndex(dims, metric=metric)
+        self.backend = backend(dims, metric=metric)
         self.i = 0
         self.built = False
 
@@ -53,7 +65,7 @@ def add_one(self, item, vector):
         """
 
         assert self.built is False, "Index already built; can't add new items."
-        self.annoy.add_item(self.i, vector)
+        self.backend.add_item(self.i, vector)
         self.id_map[item] = self.i
         self.corpus.append(item)
         self.i += 1
@@ -88,20 +100,25 @@ def feed(self, items):
         for item, vector in items:
             self.add_one(item, vector)
 
-    def build(self, n=10):
+    def build(self, n=10, params=None):
         """Build the index.
 
-        After adding all of your items, call this method to build
-        the index. The specified parameter controls the number of trees in the
-        underlying Annoy index; a higher number will take longer to build but
-        provide more precision when querying.
+        After adding all of your items, call this method to build the index.
+        The meaning of parameter ``n`` is different for each backend
+        implementation. For the Annoy backend, it specifies the number of trees
+        in the underlying Annoy index (a higher number will take longer to
+        build but provide more precision when querying). For the Sklearn
+        backend, the number specifies the leaf size when building the ball
+        tree. (The Brute Force Pure Python backend ignores this value
+        entirely.)
 
         After you call build, you'll no longer be able to add new items to the
         index.
 
         :param n: number of trees
+        :param params: dictionary with extra parameters to pass to backend
         """
-        self.annoy.build(n)
+        self.backend.build(n, params)
         self.built = True
 
     def nearest(self, vec, n=12):
@@ -130,7 +147,7 @@ def nearest(self, vec, n=12):
         """
 
         return [self.corpus[idx] for idx
-                in self.annoy.get_nns_by_vector(vec, n)]
+                in self.backend.get_nns_by_vector(vec, n)]
 
     def neighbors(self, item, n=12):
         """Returns the items nearest another item in the index.
@@ -234,10 +251,10 @@ def dist(self, a, b):
         :param b: second item
         :returns: distance between ``a`` and ``b``
         """
-        return self.annoy.get_distance(self.id_map[a], self.id_map[b])
+        return self.backend.get_distance(self.id_map[a], self.id_map[b])
 
     def vec(self, item):
-        """Returns the vector for an item
+        """Returns the vector for an item.
 
         This method returns the vector that was originally provided when
         indexing the specified item. (Depending on how it was originally
@@ -247,7 +264,7 @@ def vec(self, item):
         :param item: item to lookup
         :returns: vector for item
         """
-        return self.annoy.get_item_vector(self.id_map[item])
+        return self.backend.get_item_vector(self.id_map[item])
 
     def __len__(self):
         """Returns the number of items in the vector"""
@@ -256,12 +273,14 @@ def __len__(self):
     def save(self, prefix):
         """Saves the index to disk.
 
-        This method saves the index to disk. Annoy indexes can't be serialized
-        with `pickle`, so this method produces two files: the serialized Annoy
-        index, and a pickle with the other data from the object. This method's
-        parameter specifies the "prefix" to use for these files. The Annoy
-        index will be saved as ``<prefix>.annoy`` and the object data will be
-        saved as ``<prefix>-data.pkl``.
+        This method saves the index to disk. Each backend manages serialization
+        a little bit differently: consult the documentation and source code for
+        more details. For example, because Annoy indexes can't be serialized
+        with `pickle`, the Annoy backend's implementation produces two files:
+        the serialized Annoy index, and a pickle with the other data from the
+        object.
+        
+        This method's parameter specifies the "prefix" to use for these files.
 
         :param prefix: filename prefix for Annoy index and object data
         :returns: None
@@ -275,9 +294,10 @@ def save(self, prefix):
                 'i': self.i,
                 'built': self.built,
                 'metric': self.metric,
-                'dims': self.dims
+                'dims': self.dims,
+                '_backend_class': self.backend.__class__
             }, fh)
-        self.annoy.save(prefix + ".annoy")
+        self.backend.save(prefix + ".idx")
 
     @classmethod
     def load(cls, prefix):
@@ -286,7 +306,7 @@ def load(cls, prefix):
         This class method restores a previously-saved index using the specified
         file prefix.
 
-        :param prefix: prefix for AnnoyIndex file and object data pickle
+        :param prefix: prefix used when saving
         :returns: SimpleNeighbors object restored from specified files
         """
 
@@ -294,11 +314,12 @@ def load(cls, prefix):
             data = pickle.load(fh)
         newobj = cls(
             dims=data['dims'],
-            metric=data['metric']
+            metric=data['metric'],
+            backend=data['_backend_class']
         )
         newobj.id_map = data['id_map']
         newobj.corpus = data['corpus']
         newobj.i = data['i']
         newobj.built = data['built']
-        newobj.annoy.load(prefix + ".annoy")
+        newobj.backend.load(prefix + ".idx")
         return newobj
diff --git a/simpleneighbors/backends/__init__.py b/simpleneighbors/backends/__init__.py
new file mode 100644
index 0000000..82bc1bc
--- /dev/null
+++ b/simpleneighbors/backends/__init__.py
@@ -0,0 +1,28 @@
+import warnings
+
+from .annoy_ import Annoy
+from .bruteforcepurepython import BruteForcePurePython
+from .sklearn_ import Sklearn
+
+brute_force_message = """
+Using BruteForcePurePython backend (no alternatives available). This backend is
+very slow and not appropriate for datasets with more than a few thousand items
+(or more than a handful of dimensions). This backend is provided only as a last
+resort for users who are not able to install the packages necessary to use the
+other (faster and better) backends.
+
+It is HIGHLY RECOMMENDED that you install Annoy (pip install annoy) or
+scikit-learn (pip install scikit-learn). Doing so will make the corresponding
+backends available to you and will improve performance dramatically.
+"""
+
+def select_best():
+    for b in (Annoy, Sklearn):
+        if b.available():
+            return b
+    warnings.warn(brute_force_message)
+    return BruteForcePurePython
+
+def available():
+    return [Annoy, Sklearn, BruteForcePurePython]
+
diff --git a/simpleneighbors/backends/annoy_.py b/simpleneighbors/backends/annoy_.py
new file mode 100644
index 0000000..406f484
--- /dev/null
+++ b/simpleneighbors/backends/annoy_.py
@@ -0,0 +1,32 @@
+from simpleneighbors.backends.base import BaseBackend
+
+class Annoy(BaseBackend):
+    @classmethod
+    def available(cls):
+        try:
+            import annoy
+        except ImportError:
+            return False
+        return True
+    def __init__(self, dims, metric):
+        import annoy
+        self.annoy = annoy.AnnoyIndex(dims, metric=metric)
+    def add_item(self, idx, vector):
+        self.annoy.add_item(idx, vector)
+    def build(self, n, params=None):
+        self.annoy.build(n)
+    def get_nns_by_vector(self, vec, n):
+        return self.annoy.get_nns_by_vector(vec, n)
+    def get_distance(self, a_idx, b_idx):
+        return self.annoy.get_distance(a_idx, b_idx)
+    def get_item_vector(self, idx):
+        return self.annoy.get_item_vector(idx)
+    def save(self, fname):
+        """
+        Saves the Annoy index as ``<prefix>.annoy`` and the object data will be
+        saved as ``<prefix>-data.pkl``.
+        """
+        self.annoy.save(fname)
+    def load(self, fname):
+        self.annoy.load(fname)
+
diff --git a/simpleneighbors/backends/base.py b/simpleneighbors/backends/base.py
new file mode 100644
index 0000000..7364619
--- /dev/null
+++ b/simpleneighbors/backends/base.py
@@ -0,0 +1,23 @@
+class BaseBackend:
+    @classmethod
+    def available(cls):
+        return False
+    def __init__(self, dims, metric):
+        raise NotImplementedError
+    def add_item(self, idx, vector):
+        raise NotImplementedError
+    def build(self, n, params=None):
+        raise NotImplementedError
+    def get_nns_by_vector(self, vec, n):
+        raise NotImplementedError
+    def get_distance(self, a_idx, b_idx):
+        raise NotImplementedError
+    def get_item_vector(self, idx):
+        raise NotImplementedError
+    def save(self, fname):
+        raise NotImplementedError
+    @classmethod
+    def load(cls, fname):
+        raise NotImplementedError
+
+
diff --git a/simpleneighbors/backends/bruteforcepurepython.py b/simpleneighbors/backends/bruteforcepurepython.py
new file mode 100644
index 0000000..fcdd04d
--- /dev/null
+++ b/simpleneighbors/backends/bruteforcepurepython.py
@@ -0,0 +1,76 @@
+from simpleneighbors.backends.base import BaseBackend
+from math import sqrt
+import pickle
+try:
+    from functools import lru_cache
+except:
+    # for python 2, NOP lru_cache
+    from functools import wraps
+    def lru_cache(maxsize=10000):
+        def deco(fn):
+            @wraps(fn)
+            def wrapper(*args):
+                return fn(*args)
+            return wrapper
+        return deco
+
+@lru_cache(maxsize=10000)
+def distance(coord1, coord2):
+    return sqrt(sum([(i - j)**2 for i, j in zip(coord1, coord2)]))
+
+def norm(vec):
+    return sqrt(sum([item**2 for item in vec]))
+
+@lru_cache(maxsize=10000)
+def normalize(vec):
+    norm_val = norm(vec)
+    return tuple(item / norm_val for item in vec)
+
+@lru_cache(maxsize=10000)
+def norm_dist(v1, v2):
+    return distance(normalize(v1), normalize(v2))
+
+class BruteForcePurePython(BaseBackend):
+
+    @classmethod
+    def available(cls):
+        return True
+
+    def __init__(self, dims, metric):
+        self.items = []
+        assert metric in ('angular', 'euclidean')
+        if metric == 'angular':
+            self.dist_fn = norm_dist
+        elif metric == 'euclidean':
+            self.dist_fn = distance
+        else:
+            raise NotImplementedError('no metric %s for this backend' % metric)
+
+    def add_item(self, idx, vector):
+        self.items.append(tuple(float(d) for d in vector))
+
+    def build(self, n, params=None):
+        return
+
+    def get_nns_by_vector(self, vec, n):
+        w_idx = sorted(
+                    enumerate(self.items),
+                    key=lambda x: self.dist_fn(x[1], tuple(vec)))[:n]
+        return [item[0] for item in w_idx]
+
+    def get_distance(self, a_idx, b_idx):
+        return self.dist_fn(self.items[a_idx], self.items[b_idx])
+
+    def get_item_vector(self, idx):
+        return list(self.items[idx])
+
+    def save(self, fname):
+        with open(fname, "wb") as fh:
+            pickle.dump(self, fh)
+
+    def load(self, fname):
+        with open(fname, "rb") as fh:
+            obj = pickle.load(fh)
+        self.items = obj.items
+        self.dist_fn = obj.dist_fn
+
diff --git a/simpleneighbors/backends/sklearn_.py b/simpleneighbors/backends/sklearn_.py
new file mode 100644
index 0000000..a530656
--- /dev/null
+++ b/simpleneighbors/backends/sklearn_.py
@@ -0,0 +1,72 @@
+from simpleneighbors.backends.base import BaseBackend
+import pickle
+
+class Sklearn(BaseBackend):
+
+    @classmethod
+    def available(cls):
+        try:
+            from sklearn.neighbors import NearestNeighbors
+            import numpy as np
+        except ImportError:
+            return False
+        return True
+
+    def __init__(self, dims, metric):
+        self.items = []
+        self.metric = metric
+
+    def add_item(self, idx, vector):
+        self.items.append([float(d) for d in vector])
+        
+    def build(self, n, params=None):
+        from sklearn.neighbors import NearestNeighbors
+        from sklearn.preprocessing import normalize
+        import numpy as np
+        data = np.array(self.items)
+        if self.metric == 'angular':
+            data = normalize(data, norm='l2')
+            metric = 'minkowski' # equivalent to euclidean
+        else:
+            metric = self.metric
+        if params is None:
+            params = {}
+        self.nn = NearestNeighbors(
+                algorithm='auto',
+                leaf_size=n,
+                metric=metric,
+                n_jobs=-1,
+                **params)
+        self.nn.fit(data)
+
+    def get_nns_by_vector(self, vec, n):
+        indices = self.nn.kneighbors([vec], n, return_distance=False)
+        return [item for item in indices[0]]
+
+    def get_distance(self, a_idx, b_idx):
+        from sklearn.neighbors import DistanceMetric
+        from sklearn.preprocessing import normalize
+        import numpy as np
+        X = np.array([self.items[a_idx], self.items[b_idx]])
+        if self.metric == 'angular':
+            X= normalize(X, norm='l2')
+            metric = 'minkowski'
+        else:
+            metric = self.metric
+        dist = DistanceMetric.get_metric(metric)
+        return dist.pairwise(X)[0][1]
+
+    def get_item_vector(self, idx):
+        return self.items[idx]
+
+    def save(self, fname):
+        with open(fname, "wb") as fh:
+            pickle.dump((self.items, self.nn), fh)
+
+    def load(self, fname):
+        with open(fname, "rb") as fh:
+            obj = pickle.load(fh)
+        self.items = obj[0]
+        self.nn = obj[1]
+
+
diff --git a/simpleneighbors/benchmark.py b/simpleneighbors/benchmark.py
new file mode 100644
index 0000000..282a742
--- /dev/null
+++ b/simpleneighbors/benchmark.py
@@ -0,0 +1,46 @@
+from simpleneighbors import SimpleNeighbors
+from simpleneighbors.backends import available
+
+def benchmark(n=10000, dims=300, query_count=10, metric='angular'):
+    import numpy as np
+    from time import time
+    data = np.random.randn(n, dims)
+    for backend in available():
+        start = time()
+        print("benchmarking", backend, "at", start)
+        sim = SimpleNeighbors(dims, metric, backend=backend)
+        labels = list(range(n))
+        print("feeding data")
+        sim.feed(zip(labels, data))
+        print("building index")
+        sim.build(50)
+        to_build = time()
+        print("querying")
+        for i in range(query_count):
+            sim.nearest(np.random.randn(dims))
+        nearest_query = time()
+        print(backend, "%0.2f sec to build, %0.2f sec to query %d items" %
+                (to_build - start, nearest_query - start, query_count))
+
+if __name__ == '__main__':
+
+    import argparse
+    parser = argparse.ArgumentParser(
+            description='Benchmarks for SimpleNeighbors backends')
+    parser.add_argument(
+            "--n",
+            type=int,
+            default=10000,
+            help='number of random data items to generate')
+    parser.add_argument(
+            "--dims",
+            type=int,
+            default=128,
+            help='number of dimensions in random data')
+    parser.add_argument(
+            "--query-count",
+            type=int,
+            default=10,
+            help='number of queries to perform')
+    args = parser.parse_args()
+    benchmark(args.n, args.dims, args.query_count)
diff --git a/tests/test_simpleneighbors.py b/tests/test_simpleneighbors.py
index 0345199..77d682b 100644
--- a/tests/test_simpleneighbors.py
+++ b/tests/test_simpleneighbors.py
@@ -4,6 +4,8 @@
 from shutil import rmtree
 
 from simpleneighbors import SimpleNeighbors
+from simpleneighbors.backends import BruteForcePurePython, Annoy, Sklearn
+from simpleneighbors.backends.base import BaseBackend
 
 data = [
     ('mahogany', (74, 1, 0)),
@@ -39,15 +41,17 @@ def setUpClass(cls):
     def tearDownClass(cls):
         rmtree(cls.tmpdir)
        
-    def make_sim(self):
-        sim = SimpleNeighbors(3)
+    def make_sim(self, backend=None):
+        sim = SimpleNeighbors(3, metric='angular', backend=backend)
         sim.feed(data)
         sim.add_one(*one_more)
         sim.build(20)
         return sim
 
     def workflow(self, sim):
-        
+
+        print("running backend", sim.backend)
+
         self.assertRaises(AssertionError,
                 sim.add_one, *one_more)
         # +1 because of the call to test .add_one above
@@ -76,16 +80,14 @@ def workflow(self, sim):
                 "%0.5f" % sim.dist('topaz', 'dusk'),
                 "0.45335")
 
-
     def test_workflow(self):
-        sim = self.make_sim()
-        self.workflow(sim)
-
-    def test_save_load(self):
-        sim = self.make_sim()
-        sim.save(opj(self.tmpdir, 'neighbortest'))
-        sim2 = SimpleNeighbors.load(opj(self.tmpdir, 'neighbortest'))
-        self.workflow(sim2)
+        for backend in Annoy, BruteForcePurePython, Sklearn:
+            sim = self.make_sim(backend)
+            self.workflow(sim)
+            sim.save(opj(self.tmpdir, 'neighbortest'))
+            sim2 = SimpleNeighbors.load(opj(self.tmpdir, 'neighbortest'))
+            self.workflow(sim2)
+
 
 if __name__ == '__main__':
     unittest.main()

From a47c8820a0177d926afa939b22cf365625be9b51 Mon Sep 17 00:00:00 2001
From: Allison Parrish <allison@decontextualize.com>
Date: Sun, 12 Jan 2020 17:18:23 -0500
Subject: [PATCH 2/7] tweaks and testing select_best

---
 simpleneighbors/__init__.py   |  2 +-
 tests/test_select_best.py     | 24 ++++++++++++++++++++++++
 tests/test_simpleneighbors.py |  1 -
 3 files changed, 25 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_select_best.py

diff --git a/simpleneighbors/__init__.py b/simpleneighbors/__init__.py
index 0661a76..0d7ea5b 100644
--- a/simpleneighbors/__init__.py
+++ b/simpleneighbors/__init__.py
@@ -115,7 +115,7 @@ def build(self, n=10, params=None):
         After you call build, you'll no longer be able to add new items to the
         index.
 
-        :param n: number of trees
+        :param n: backend-dependent (for Annoy: number of trees)
         :param params: dictionary with extra parameters to pass to backend
         """
         self.backend.build(n, params)
diff --git a/tests/test_select_best.py b/tests/test_select_best.py
new file mode 100644
index 0000000..20a88a6
--- /dev/null
+++ b/tests/test_select_best.py
@@ -0,0 +1,24 @@
+import unittest
+try:
+    from unittest import mock
+except:
+    import mock
+import warnings
+from simpleneighbors.backends import select_best
+from simpleneighbors.backends import Annoy, Sklearn, BruteForcePurePython
+
+class TestSelectBest(unittest.TestCase):
+
+    def test_select_best(self):
+        self.assertEqual(select_best(), Annoy)
+        with mock.patch.dict('sys.modules', {'annoy': None}):
+            self.assertEqual(select_best(), Sklearn)
+        with mock.patch.dict('sys.modules',
+                {'annoy': None, 'sklearn.neighbors': None}):
+            with warnings.catch_warnings(record=True) as w:
+                self.assertEqual(select_best(), BruteForcePurePython)
+                self.assertIn("very slow", str(w[-1].message))
+                self.assertIn("not appropriate", str(w[-1].message))
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_simpleneighbors.py b/tests/test_simpleneighbors.py
index 77d682b..c704a8b 100644
--- a/tests/test_simpleneighbors.py
+++ b/tests/test_simpleneighbors.py
@@ -5,7 +5,6 @@
 
 from simpleneighbors import SimpleNeighbors
 from simpleneighbors.backends import BruteForcePurePython, Annoy, Sklearn
-from simpleneighbors.backends.base import BaseBackend
 
 data = [
     ('mahogany', (74, 1, 0)),

From 1cd89a6b2fa101cd6c64cc95025e635866db529b Mon Sep 17 00:00:00 2001
From: Allison Parrish <allison@decontextualize.com>
Date: Sun, 12 Jan 2020 17:51:09 -0500
Subject: [PATCH 3/7] flake8 compliance

---
 simpleneighbors/__init__.py                      |  4 ++--
 simpleneighbors/backends/__init__.py             |  3 ++-
 simpleneighbors/backends/annoy_.py               | 13 +++++++++++--
 simpleneighbors/backends/base.py                 | 11 +++++++++--
 simpleneighbors/backends/bruteforcepurepython.py | 10 ++++++++--
 simpleneighbors/backends/sklearn_.py             | 13 ++++++-------
 simpleneighbors/benchmark.py                     |  4 +++-
 tests/test_select_best.py                        |  6 ++++--
 tests/test_simpleneighbors.py                    | 13 ++++++-------
 9 files changed, 51 insertions(+), 26 deletions(-)

diff --git a/simpleneighbors/__init__.py b/simpleneighbors/__init__.py
index 0d7ea5b..86c85db 100644
--- a/simpleneighbors/__init__.py
+++ b/simpleneighbors/__init__.py
@@ -13,7 +13,7 @@ class SimpleNeighbors:
     indexes with a user-friendly API. When you instantiate this class, it will
     automatically select a backend implementation based on packages installed
     in your environment. It is HIGHLY RECOMMENDED that you install Annoy (``pip
-    install annoy``) to enable the Annoy backend! (The alternatives are 
+    install annoy``) to enable the Annoy backend! (The alternatives are
     slower and not as accurate.) Alternatively, you can specify a backend of
     your choosing with the ``backend`` parameter.
 
@@ -279,7 +279,7 @@ def save(self, prefix):
         with `pickle`, the Annoy backend's implementation produces two files:
         the serialized Annoy index, and a pickle with the other data from the
         object.
-        
+
         This method's parameter specifies the "prefix" to use for these files.
 
         :param prefix: filename prefix for Annoy index and object data
diff --git a/simpleneighbors/backends/__init__.py b/simpleneighbors/backends/__init__.py
index 82bc1bc..a644408 100644
--- a/simpleneighbors/backends/__init__.py
+++ b/simpleneighbors/backends/__init__.py
@@ -16,6 +16,7 @@
 backends available to you and will improve performance dramatically.
 """
 
+
 def select_best():
     for b in (Annoy, Sklearn):
         if b.available():
@@ -23,6 +24,6 @@ def select_best():
     warnings.warn(brute_force_message)
     return BruteForcePurePython
 
+
 def available():
     return [Annoy, Sklearn, BruteForcePurePython]
-
diff --git a/simpleneighbors/backends/annoy_.py b/simpleneighbors/backends/annoy_.py
index 406f484..ee25061 100644
--- a/simpleneighbors/backends/annoy_.py
+++ b/simpleneighbors/backends/annoy_.py
@@ -1,32 +1,41 @@
 from simpleneighbors.backends.base import BaseBackend
 
+
 class Annoy(BaseBackend):
+
     @classmethod
     def available(cls):
         try:
-            import annoy
+            import annoy  # noqa: F401
         except ImportError:
             return False
         return True
+
     def __init__(self, dims, metric):
         import annoy
         self.annoy = annoy.AnnoyIndex(dims, metric=metric)
+
     def add_item(self, idx, vector):
         self.annoy.add_item(idx, vector)
+
     def build(self, n, params=None):
         self.annoy.build(n)
+
     def get_nns_by_vector(self, vec, n):
         return self.annoy.get_nns_by_vector(vec, n)
+
     def get_distance(self, a_idx, b_idx):
         return self.annoy.get_distance(a_idx, b_idx)
+
     def get_item_vector(self, idx):
         return self.annoy.get_item_vector(idx)
+
     def save(self, fname):
         """
         Saves the Annoy index as ``<prefix>.annoy`` and the object data will be
         saved as ``<prefix>-data.pkl``.
         """
         self.annoy.save(fname)
+
     def load(self, fname):
         self.annoy.load(fname)
-
diff --git a/simpleneighbors/backends/base.py b/simpleneighbors/backends/base.py
index 7364619..6435052 100644
--- a/simpleneighbors/backends/base.py
+++ b/simpleneighbors/backends/base.py
@@ -1,23 +1,30 @@
 class BaseBackend:
+
     @classmethod
     def available(cls):
         return False
+
     def __init__(self, dims, metric):
         raise NotImplementedError
+
     def add_item(self, idx, vector):
         raise NotImplementedError
+
     def build(self, n, params=None):
         raise NotImplementedError
+
     def get_nns_by_vector(self, vec, n):
         raise NotImplementedError
+
     def get_distance(self, a_idx, b_idx):
         raise NotImplementedError
+
     def get_item_vector(self, idx):
         raise NotImplementedError
+
     def save(self, fname):
         raise NotImplementedError
+
     @classmethod
     def load(cls, fname):
         raise NotImplementedError
-
-
diff --git a/simpleneighbors/backends/bruteforcepurepython.py b/simpleneighbors/backends/bruteforcepurepython.py
index fcdd04d..ed683f6 100644
--- a/simpleneighbors/backends/bruteforcepurepython.py
+++ b/simpleneighbors/backends/bruteforcepurepython.py
@@ -1,11 +1,13 @@
 from simpleneighbors.backends.base import BaseBackend
 from math import sqrt
 import pickle
+
 try:
     from functools import lru_cache
-except:
+except ImportError:
     # for python 2, NOP lru_cache
     from functools import wraps
+
     def lru_cache(maxsize=10000):
         def deco(fn):
             @wraps(fn)
@@ -14,22 +16,27 @@ def wrapper(*args):
             return wrapper
         return deco
 
+
 @lru_cache(maxsize=10000)
 def distance(coord1, coord2):
     return sqrt(sum([(i - j)**2 for i, j in zip(coord1, coord2)]))
 
+
 def norm(vec):
     return sqrt(sum([item**2 for item in vec]))
 
+
 @lru_cache(maxsize=10000)
 def normalize(vec):
     norm_val = norm(vec)
     return tuple(item / norm_val for item in vec)
 
+
 @lru_cache(maxsize=10000)
 def norm_dist(v1, v2):
     return distance(normalize(v1), normalize(v2))
 
+
 class BruteForcePurePython(BaseBackend):
 
     @classmethod
@@ -73,4 +80,3 @@ def load(self, fname):
             obj = pickle.load(fh)
         self.items = obj.items
         self.dist_fn = obj.dist_fn
-
diff --git a/simpleneighbors/backends/sklearn_.py b/simpleneighbors/backends/sklearn_.py
index a530656..1232125 100644
--- a/simpleneighbors/backends/sklearn_.py
+++ b/simpleneighbors/backends/sklearn_.py
@@ -1,13 +1,14 @@
 from simpleneighbors.backends.base import BaseBackend
 import pickle
 
+
 class Sklearn(BaseBackend):
 
     @classmethod
     def available(cls):
         try:
-            from sklearn.neighbors import NearestNeighbors
-            import numpy as np
+            from sklearn.neighbors import NearestNeighbors  # noqa: F401
+            import numpy as np  # noqa: F401
         except ImportError:
             return False
         return True
@@ -18,7 +19,7 @@ def __init__(self, dims, metric):
 
     def add_item(self, idx, vector):
         self.items.append([float(d) for d in vector])
-        
+
     def build(self, n, params=None):
         from sklearn.neighbors import NearestNeighbors
         from sklearn.preprocessing import normalize
@@ -26,7 +27,7 @@ def build(self, n, params=None):
         data = np.array(self.items)
         if self.metric == 'angular':
             data = normalize(data, norm='l2')
-            metric = 'minkowski' # equivalent to euclidean
+            metric = 'minkowski'  # equivalent to euclidean
         else:
             metric = self.metric
         if params is None:
@@ -49,7 +50,7 @@ def get_distance(self, a_idx, b_idx):
         import numpy as np
         X = np.array([self.items[a_idx], self.items[b_idx]])
         if self.metric == 'angular':
-            X= normalize(X, norm='l2')
+            X = normalize(X, norm='l2')
             metric = 'minkowski'
         else:
             metric = self.metric
@@ -68,5 +69,3 @@ def load(self, fname):
             obj = pickle.load(fh)
         self.items = obj[0]
         self.nn = obj[1]
-
-
diff --git a/simpleneighbors/benchmark.py b/simpleneighbors/benchmark.py
index 282a742..a562575 100644
--- a/simpleneighbors/benchmark.py
+++ b/simpleneighbors/benchmark.py
@@ -1,6 +1,7 @@
 from simpleneighbors import SimpleNeighbors
 from simpleneighbors.backends import available
 
+
 def benchmark(n=10000, dims=300, query_count=10, metric='angular'):
     import numpy as np
     from time import time
@@ -20,7 +21,8 @@ def benchmark(n=10000, dims=300, query_count=10, metric='angular'):
             sim.nearest(np.random.randn(dims))
         nearest_query = time()
         print(backend, "%0.2f sec to build, %0.2f sec to query %d items" %
-                (to_build - start, nearest_query - start, query_count))
+              (to_build - start, nearest_query - start, query_count))
+
 
 if __name__ == '__main__':
 
diff --git a/tests/test_select_best.py b/tests/test_select_best.py
index 20a88a6..a2ab7f3 100644
--- a/tests/test_select_best.py
+++ b/tests/test_select_best.py
@@ -1,12 +1,13 @@
 import unittest
 try:
     from unittest import mock
-except:
+except ImportError:
     import mock
 import warnings
 from simpleneighbors.backends import select_best
 from simpleneighbors.backends import Annoy, Sklearn, BruteForcePurePython
 
+
 class TestSelectBest(unittest.TestCase):
 
     def test_select_best(self):
@@ -14,11 +15,12 @@ def test_select_best(self):
         with mock.patch.dict('sys.modules', {'annoy': None}):
             self.assertEqual(select_best(), Sklearn)
         with mock.patch.dict('sys.modules',
-                {'annoy': None, 'sklearn.neighbors': None}):
+                             {'annoy': None, 'sklearn.neighbors': None}):
             with warnings.catch_warnings(record=True) as w:
                 self.assertEqual(select_best(), BruteForcePurePython)
                 self.assertIn("very slow", str(w[-1].message))
                 self.assertIn("not appropriate", str(w[-1].message))
 
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_simpleneighbors.py b/tests/test_simpleneighbors.py
index c704a8b..d085402 100644
--- a/tests/test_simpleneighbors.py
+++ b/tests/test_simpleneighbors.py
@@ -30,6 +30,7 @@
 
 one_more = ('purpley', (135, 86, 228))
 
+
 class TestSimpleNeighbors(unittest.TestCase):
 
     @classmethod
@@ -39,7 +40,7 @@ def setUpClass(cls):
     @classmethod
     def tearDownClass(cls):
         rmtree(cls.tmpdir)
-       
+
     def make_sim(self, backend=None):
         sim = SimpleNeighbors(3, metric='angular', backend=backend)
         sim.feed(data)
@@ -51,8 +52,8 @@ def workflow(self, sim):
 
         print("running backend", sim.backend)
 
-        self.assertRaises(AssertionError,
-                sim.add_one, *one_more)
+        self.assertRaises(AssertionError, sim.add_one, *one_more)
+
         # +1 because of the call to test .add_one above
         self.assertEqual(len(sim), len(data) + 1)
 
@@ -67,12 +68,11 @@ def workflow(self, sim):
             sim.nearest([100, 100, 200], 3),
             ['dusk', 'purpley', 'french blue'])
 
-        nm = list(sim.neighbors_matching('mint', 1,
-            lambda x: 'a' in x))
+        nm = list(sim.neighbors_matching('mint', 1, lambda x: 'a' in x))
         self.assertEqual(nm[0], 'battleship grey')
 
         nm = list(sim.nearest_matching([100, 100, 200], 1,
-            lambda x: x.startswith('p')))
+                  lambda x: x.startswith('p')))
         self.assertEqual(nm[0], 'purpley')
 
         self.assertEqual(
@@ -90,4 +90,3 @@ def test_workflow(self):
 
 if __name__ == '__main__':
     unittest.main()
-

From a71333a5c852395329e5cdc1e2c6ce4dd659d11f Mon Sep 17 00:00:00 2001
From: Allison Parrish <allison@decontextualize.com>
Date: Sun, 12 Jan 2020 17:52:45 -0500
Subject: [PATCH 4/7] build stuff: add dev extra, flake8 in CI, drop tox targets

---
 .travis.yml | 8 ++++----
 Makefile    | 5 -----
 setup.py    | 9 ++++++++-
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index ae7b826..2e66665 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,8 @@ language: python
 python:
   - "pypy3"
   - "pypy"
+  - "3.8"
+  - "3.7"
   - "3.6"
   - "3.5"
   - "3.4"
@@ -14,7 +16,7 @@ sudo: false
 
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 install:
-  - pip install -r requirements.txt
+  - pip install -e .[dev]
   - pip install coverage
 
 # command to run tests, e.g. python setup.py test
@@ -28,6 +30,4 @@ after_success:
 
 after_script:
 - coverage report                     # show coverage on cmd line
-- pip install pycodestyle pyflakes
-- pyflakes . | tee >(wc -l)           # static analysis
-- pycodestyle --statistics --count .  # static analysis
+- flake8 simpleneighbors tests
diff --git a/Makefile b/Makefile
index 9879798..8576764 100644
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,6 @@ help:
 	@echo "clean-test - remove test and coverage artifacts"
 	@echo "lint - check style with flake8"
 	@echo "test - run tests quickly with the default Python"
-	@echo "test-all - run tests on every Python version with tox"
 	@echo "coverage - check code coverage quickly with the default Python"
 	@echo "docs - generate Sphinx HTML documentation, including API docs"
 	@echo "release - package and upload a release"
@@ -30,7 +29,6 @@ clean-pyc:
 	find . -name '__pycache__' -exec rm -fr {} +
 
 clean-test:
-	rm -fr .tox/
 	rm -f .coverage
 	rm -fr htmlcov/
 
@@ -41,9 +39,6 @@ test:
 	python setup.py test
 	python -m doctest simpleneighbors/__init__.py
 
-test-all:
-	tox
-
 coverage:
 	coverage run --source simpleneighbors setup.py test
 	coverage report -m
diff --git a/setup.py b/setup.py
index 2cff5a5..1a5a550 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,14 @@
     extras_require={
         'annoy': ['annoy>=1.16.0'],
         'sklearn': ['scikit-learn>=0.20'],
-        'purepython': []
+        'purepython': [],
+        'dev': [
+            'annoy>=1.16.0',
+            'scikit-learn>=0.20',
+            'mock;python_version<="2.7"',
+            'coverage',
+            'flake8',
+        ]
     },
     platforms='any',
     test_suite='tests'

From f917272e204ceb95da2d1e28b35c5534bf19688c Mon Sep 17 00:00:00 2001
From: Allison Parrish <allison@decontextualize.com>
Date: Sun, 12 Jan 2020 18:37:28 -0500
Subject: [PATCH 5/7] workaround for numpy weirdness?

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 1a5a550..4b7a8ba 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,7 @@
         'sklearn': ['scikit-learn>=0.20'],
         'purepython': [],
         'dev': [
+            'numpy==1.15.4',  # see https://github.com/numpy/numpy/issues/14012
             'annoy>=1.16.0',
             'scikit-learn>=0.20',
             'mock;python_version<="2.7"',

From c19bd342139524ef2ce71a6c149e77b9566da8c9 Mon Sep 17 00:00:00 2001
From: Allison Parrish <allison@decontextualize.com>
Date: Sun, 12 Jan 2020 21:52:12 -0500
Subject: [PATCH 6/7] another attempt at getting travis to work, ugh
 (run tests directly, remove tox.ini and requirements.txt, unpin numpy)

---
 .travis.yml      |  2 +-
 requirements.txt |  0
 setup.py         |  1 -
 tox.ini          | 23 -----------------------
 4 files changed, 1 insertion(+), 25 deletions(-)
 delete mode 100644 requirements.txt
 delete mode 100644 tox.ini

diff --git a/.travis.yml b/.travis.yml
index 2e66665..3d3c7af 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,7 +21,7 @@ install:
 
 # command to run tests, e.g. python setup.py test
 script:
-  - coverage run --source simpleneighbors setup.py test --verbose
+  - coverage run --source simpleneighbors tests/test_simpleneighbors.py --verbose
   - python -m doctest simpleneighbors/__init__.py
 
 after_success:
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index e69de29..0000000
diff --git a/setup.py b/setup.py
index 4b7a8ba..1a5a550 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,6 @@
         'sklearn': ['scikit-learn>=0.20'],
         'purepython': [],
         'dev': [
-            'numpy==1.15.4',  # see https://github.com/numpy/numpy/issues/14012
             'annoy>=1.16.0',
             'scikit-learn>=0.20',
             'mock;python_version<="2.7"',
diff --git a/tox.ini b/tox.ini
deleted file mode 100644
index 96a8f23..0000000
--- a/tox.ini
+++ /dev/null
@@ -1,23 +0,0 @@
-[tox]
-skipsdist = True
-usedevelop = True
-envlist = py{27,36,37},
-          flake8
-
-[testenv]
-install_command = pip install -e ".[dev]" -U {packages}
-basepython =
-    py27: python2.7
-    py36: python3.6
-    py37: python3.7
-deps =
-    -r{toxinidir}/requirements.txt
-commands = python setup.py test
-
-[testenv:flake8]
-basepython =
-    python3.6
-deps =
-    flake8
-commands =
-    flake8 simpleneighbors

From 6ccc9dc1d7c976ec067325605fc0a53e06f556cd Mon Sep 17 00:00:00 2001
From: Allison Parrish <allison@decontextualize.com>
Date: Sun, 12 Jan 2020 21:56:57 -0500
Subject: [PATCH 7/7] not even going to try, sorry (drop pypy/pypy3 from Travis)

---
 .travis.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 3d3c7af..9deb4cb 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,8 +2,6 @@
 
 language: python
 python:
-  - "pypy3"
-  - "pypy"
   - "3.8"
   - "3.7"
   - "3.6"