benfred · benfred · Oct 11, 2017 · Oct 10, 2017
diff --git a/docs/als.rst b/docs/als.rst
@@ -0,0 +1,6 @@
+AlternatingLeastSquares
+=======================
+
+.. autoclass:: implicit.als.AlternatingLeastSquares
+   :members:
+   :inherited-members:
diff --git a/docs/ann.rst b/docs/ann.rst
@@ -0,0 +1,29 @@
+Approximate Alternating Least Squares
+=====================================
+
+This library supports using a couple of different approximate nearest neighbours libraries
+to speed up the recommend and similar_items methods of the AlternatingLeastSquares model.
+
+The potential speedup of using these methods can be quite significant, at the risk of
+potentially missing relevant results:
+
+.. image:: recommendperf.png
+
+See `this post comparing the different ANN libraries 
+<http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/>`_ for
+more details.
+
+NMSLibAlternatingLeastSquares
+-----------------------------
+.. autoclass:: implicit.approximate_als.NMSLibAlternatingLeastSquares
+   :members:
+
+AnnoyAlternatingLeastSquares
+----------------------------
+.. autoclass:: implicit.approximate_als.AnnoyAlternatingLeastSquares
+   :members:
+
+FaissAlternatingLeastSquares
+-----------------------------
+.. autoclass:: implicit.approximate_als.FaissAlternatingLeastSquares
+   :members:
diff --git a/docs/index.rst b/docs/index.rst
@@ -20,8 +20,9 @@ Filtering <https://pdfs.semanticscholar.org/bfdf/7af6cf7fd7bb5e6b6db5bbd91be1159
    :caption: Contents:
 
     Quickstart <quickstart>
-    API Reference <models>
-
+    RecommenderBase <models>
+    Alternating Least Squares <als>
+    Approximate Alternating Least Squares <ann>
 
 Indices and tables
 ==================

diff --git a/docs/models.rst b/docs/models.rst
@@ -1,19 +1,6 @@
-Models
+RecommenderBase
 ===============
 
-RecommenderBase
----------------
 .. autoclass:: implicit.recommender_base.RecommenderBase
    :members:
    :undoc-members:
-
-AlternatingLeastSquares
------------------------
-.. autoclass:: implicit.als.AlternatingLeastSquares
-   :members:
-   :inherited-members:
-
-AnnoyAlternatingLeastSquares
-----------------------------
-.. autoclass:: implicit.annoy_als.AnnoyAlternatingLeastSquares
-   :members:
diff --git a/docs/recommendperf.png b/docs/recommendperf.png
diff --git a/examples/lastfm.py b/examples/lastfm.py
@@ -21,15 +21,45 @@
 from scipy.sparse import coo_matrix
 
 from implicit.als import AlternatingLeastSquares
-from implicit.annoy_als import AnnoyAlternatingLeastSquares
+from implicit.approximate_als import (AnnoyAlternatingLeastSquares, NMSLibAlternatingLeastSquares,
+                                      FaissAlternatingLeastSquares)
 from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                          TFIDFRecommender, bm25_weight)
 
 
+# maps command line model argument to class name
+MODELS = {"als":  AlternatingLeastSquares,
+          "nmslib_als": NMSLibAlternatingLeastSquares,
+          "annoy_als": AnnoyAlternatingLeastSquares,
+          "faiss_als": FaissAlternatingLeastSquares,
+          "tfidf": TFIDFRecommender,
+          "cosine": CosineRecommender,
+          "bm25": BM25Recommender}
+
+
+def get_model(model_name):
+    model_class = MODELS.get(model_name)
+    if not model_class:
+        raise ValueError("Unknown Model '%s'" % model_name)
+
+    # some default params
+    if issubclass(model_class, AlternatingLeastSquares):
+        params = {'factors': 50, 'dtype': numpy.float32}
+    elif model_name == "bm25":
+        params = {'K1': 100, 'B': 0.5}
+    else:
+        params = {}
+
+    return model_class(**params)
+
+
 def read_data(filename):
     """ Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
     and a sparse matrix of artist/user/playcount """
     # read in triples of user/artist/playcount from the input dataset
+    # time the read so its duration can be logged once the data is loaded
+    start = time.time()
+    logging.debug("reading data from %s", filename)
     data = pandas.read_table(filename,
                              usecols=[0, 2, 3],
                              names=['user', 'artist', 'plays'])
@@ -43,62 +73,38 @@ def read_data(filename):
                        (data['artist'].cat.codes.copy(),
                         data['user'].cat.codes.copy())))
 
+    logging.debug("read data file in %s", time.time() - start)
     return data, plays
 
 
-def calculate_similar_artists(input_filename, output_filename,
-                              model_name="als",
-                              factors=50, regularization=0.01,
-                              iterations=15,
-                              exact=False,
-                              use_native=True,
-                              dtype=numpy.float64,
-                              cg=False):
-    logging.debug("Calculating similar artists. This might take a while")
-
-    # read in the input data file
-    logging.debug("reading data from %s", input_filename)
-    start = time.time()
+def calculate_similar_artists(input_filename, output_filename, model_name="als"):
+    """ generates a list of similar artists in lastfm by utilizing the 'similar_items'
+    api of the models """
     df, plays = read_data(input_filename)
-    logging.debug("read data file in %s", time.time() - start)
 
-    # generate a recommender model based off the input params
-    if model_name == "als":
-        if exact:
-            model = AlternatingLeastSquares(factors=factors, regularization=regularization,
-                                            use_native=use_native, use_cg=cg,
-                                            dtype=dtype, iterations=iterations)
-        else:
-            model = AnnoyAlternatingLeastSquares(factors=factors, regularization=regularization,
-                                                 use_native=use_native, use_cg=cg,
-                                                 dtype=dtype, iterations=iterations)
+    # create a model from the input data
+    model = get_model(model_name)
 
+    # if we're training an ALS based model, weight input for last.fm
+    # by bm25
+    if issubclass(model.__class__, AlternatingLeastSquares):
         # lets weight these models by bm25weight.
         logging.debug("weighting matrix by bm25_weight")
         plays = bm25_weight(plays, K1=100, B=0.8)
 
-    elif model_name == "tfidf":
-        model = TFIDFRecommender()
-
-    elif model_name == "cosine":
-        model = CosineRecommender()
+        # also disable building approximate recommend index
+        model.approximate_recommend = False
 
-    elif model_name == "bm25":
-        model = BM25Recommender(K1=100, B=0.5)
-
-    else:
-        raise NotImplementedError("TODO: model %s" % model_name)
-
-    # train the model
     logging.debug("training model %s", model_name)
     start = time.time()
     model.fit(plays)
-    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
+    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)
 
     # write out similar artists by popularity
+    artists = dict(enumerate(df['artist'].cat.categories))
+    start = time.time()
     logging.debug("calculating top artists")
     user_count = df.groupby('artist').size()
-    artists = dict(enumerate(df['artist'].cat.categories))
     to_generate = sorted(list(artists), key=lambda x: -user_count[x])
 
     # write out as a TSV of artistid, otherartistid, score
@@ -108,43 +114,64 @@ def calculate_similar_artists(input_filename, output_filename,
             for other, score in model.similar_items(artistid, 11):
                 o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
 
+    logging.debug("generated similar artists in %0.2fs",  time.time() - start)
+
+
+def calculate_recommendations(input_filename, output_filename, model_name="als"):
+    """ Generates artist recommendations for each user in the dataset """
+    # read in the input data file
+    df, plays = read_data(input_filename)
+
+    # create a model from the input data
+    model = get_model(model_name)
+
+    # if we're training an ALS based model, weight input for last.fm
+    # by bm25
+    if issubclass(model.__class__, AlternatingLeastSquares):
+        # lets weight these models by bm25weight.
+        logging.debug("weighting matrix by bm25_weight")
+        plays = bm25_weight(plays, K1=100, B=0.8)
+
+        # also disable building approximate similar_items index
+        model.approximate_similar_items = False
+
+    logging.debug("training model %s", model_name)
+    start = time.time()
+    model.fit(plays)
+    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)
+
+    # generate recommendations for each user and write out to a file
+    artists = dict(enumerate(df['artist'].cat.categories))
+    start = time.time()
+    user_plays = plays.T.tocsr()
+    with open(output_filename, "w") as o:
+        for userid, username in enumerate(df['user'].cat.categories):
+            for artistid, score in model.recommend(userid, user_plays):
+                o.write("%s\t%s\t%s\n" % (username, artists[artistid], score))
+    logging.debug("generated recommendations in %0.2fs",  time.time() - start)
+
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Generates related artists on the last.fm dataset",
+    parser = argparse.ArgumentParser(description="Generates similar artists on the last.fm dataset"
+                                     " or generates personalized recommendations for each user",
                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
     parser.add_argument('--input', type=str,
                         dest='inputfile', help='last.fm dataset file', required=True)
     parser.add_argument('--output', type=str, default='similar-artists.tsv',
                         dest='outputfile', help='output file name')
     parser.add_argument('--model', type=str, default='als',
-                        dest='model', help='model to calculate (als/bm25/tfidf/cosine)')
-    parser.add_argument('--factors', type=int, default=50, dest='factors',
-                        help='Number of factors to calculate')
-    parser.add_argument('--reg', type=float, default=0.8, dest='regularization',
-                        help='regularization weight')
-    parser.add_argument('--iter', type=int, default=15, dest='iterations',
-                        help='Number of ALS iterations')
-    parser.add_argument('--exact', help='compute exact distances (slow)', action="store_true")
-    parser.add_argument('--purepython',
-                        help='dont use cython extension (slow)',
-                        action="store_true")
-    parser.add_argument('--float32',
-                        help='use 32 bit floating point numbers',
-                        action="store_true")
-    parser.add_argument('--cg',
-                        help='use CG optimizer',
+                        dest='model', help='model to calculate (%s)' % "/".join(MODELS.keys()))
+    parser.add_argument('--recommend',
+                        help='Recommend items for each user rather than calculate similar_items',
                         action="store_true")
+    parser.add_argument('--param', action='append',
+                        help="Parameters to pass to the model, formatted as 'KEY=VALUE'")
+
     args = parser.parse_args()
 
     logging.basicConfig(level=logging.DEBUG)
 
-    calculate_similar_artists(args.inputfile, args.outputfile,
-                              model_name=args.model,
-                              factors=args.factors,
-                              regularization=args.regularization,
-                              exact=args.exact,
-                              iterations=args.iterations,
-                              use_native=not args.purepython,
-                              dtype=numpy.float32 if args.float32 else numpy.float64,
-                              cg=args.cg)
+    if args.recommend:
+        calculate_recommendations(args.inputfile, args.outputfile, model_name=args.model)
+    else:
+        calculate_similar_artists(args.inputfile, args.outputfile, model_name=args.model)
diff --git a/implicit/__init__.py b/implicit/__init__.py
@@ -3,6 +3,6 @@
 from . import nearest_neighbours
 from . import als
 
-__version__ = '0.2.6'
+__version__ = '0.2.7'
 
 __all__ = [alternating_least_squares, als, nearest_neighbours, __version__]
diff --git a/implicit/annoy_als.py b/implicit/annoy_als.py