Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/als.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
AlternatingLeastSquares
=======================

.. autoclass:: implicit.als.AlternatingLeastSquares
   :members:
   :inherited-members:
29 changes: 29 additions & 0 deletions docs/ann.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Approximate Alternating Least Squares
=====================================

This library supports using a couple of different approximate nearest neighbours libraries
to speed up the recommend and similar_items methods of the AlternatingLeastSquares model.

The potential speedup of using these methods can be quite significant, at the risk of
potentially missing relevant results:

.. image:: recommendperf.png

See `this post comparing the different ANN libraries
<http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/>`_ for
more details.

NMSLibAlternatingLeastSquares
-----------------------------
.. autoclass:: implicit.approximate_als.NMSLibAlternatingLeastSquares
   :members:

AnnoyAlternatingLeastSquares
----------------------------
.. autoclass:: implicit.approximate_als.AnnoyAlternatingLeastSquares
   :members:

FaissAlternatingLeastSquares
-----------------------------
.. autoclass:: implicit.approximate_als.FaissAlternatingLeastSquares
   :members:
5 changes: 3 additions & 2 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ Filtering <https://pdfs.semanticscholar.org/bfdf/7af6cf7fd7bb5e6b6db5bbd91be1159
:caption: Contents:

Quickstart <quickstart>
API Reference <models>

RecommenderBase <models>
Alternating Least Squares <als>
Approximate Alternating Least Squares <ann>

Indices and tables
==================
Expand Down
15 changes: 1 addition & 14 deletions docs/models.rst
Original file line number Diff line number Diff line change
@@ -1,19 +1,6 @@
Models
RecommenderBase
===============

RecommenderBase
---------------
.. autoclass:: implicit.recommender_base.RecommenderBase
:members:
:undoc-members:

AlternatingLeastSquares
-----------------------
.. autoclass:: implicit.als.AlternatingLeastSquares
:members:
:inherited-members:

AnnoyAlternatingLeastSquares
----------------------------
.. autoclass:: implicit.annoy_als.AnnoyAlternatingLeastSquares
:members:
Binary file added docs/recommendperf.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
159 changes: 93 additions & 66 deletions examples/lastfm.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,45 @@
from scipy.sparse import coo_matrix

from implicit.als import AlternatingLeastSquares
from implicit.annoy_als import AnnoyAlternatingLeastSquares
from implicit.approximate_als import (AnnoyAlternatingLeastSquares, NMSLibAlternatingLeastSquares,
FaissAlternatingLeastSquares)
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
TFIDFRecommender, bm25_weight)


# Lookup table from the value of the --model command line argument to the
# recommender class that implements it. Key order is kept as is, since the
# keys are joined together to build the --model help text.
MODELS = {
    "als": AlternatingLeastSquares,
    "nmslib_als": NMSLibAlternatingLeastSquares,
    "annoy_als": AnnoyAlternatingLeastSquares,
    "faiss_als": FaissAlternatingLeastSquares,
    "tfidf": TFIDFRecommender,
    "cosine": CosineRecommender,
    "bm25": BM25Recommender,
}


def get_model(model_name):
    """ Instantiates the model registered under 'model_name' in MODELS, passing
    in some default parameters for that kind of model.

    Raises a ValueError if 'model_name' isn't a key of MODELS. """
    cls = MODELS.get(model_name)
    if not cls:
        raise ValueError("Unknown Model '%s'" % model_name)

    # every ALS variant (exact or approximate) shares the same defaults
    if issubclass(cls, AlternatingLeastSquares):
        return cls(factors=50, dtype=numpy.float32)

    # bm25 is the only other model that gets non-default parameters
    if model_name == "bm25":
        return cls(K1=100, B=0.5)

    return cls()


def read_data(filename):
""" Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
and a sparse matrix of artist/user/playcount """
# read in triples of user/artist/playcount from the input dataset
# get a model based off the input params
start = time.time()
logging.debug("reading data from %s", filename)
data = pandas.read_table(filename,
usecols=[0, 2, 3],
names=['user', 'artist', 'plays'])
Expand All @@ -43,62 +73,38 @@ def read_data(filename):
(data['artist'].cat.codes.copy(),
data['user'].cat.codes.copy())))

logging.debug("read data file in %s", time.time() - start)
return data, plays


def calculate_similar_artists(input_filename, output_filename,
model_name="als",
factors=50, regularization=0.01,
iterations=15,
exact=False,
use_native=True,
dtype=numpy.float64,
cg=False):
logging.debug("Calculating similar artists. This might take a while")

# read in the input data file
logging.debug("reading data from %s", input_filename)
start = time.time()
def calculate_similar_artists(input_filename, output_filename, model_name="als"):
""" generates a list of similar artists in lastfm by utilizing the 'similar_items'
api of the models """
df, plays = read_data(input_filename)
logging.debug("read data file in %s", time.time() - start)

# generate a recommender model based off the input params
if model_name == "als":
if exact:
model = AlternatingLeastSquares(factors=factors, regularization=regularization,
use_native=use_native, use_cg=cg,
dtype=dtype, iterations=iterations)
else:
model = AnnoyAlternatingLeastSquares(factors=factors, regularization=regularization,
use_native=use_native, use_cg=cg,
dtype=dtype, iterations=iterations)
# create a model from the input data
model = get_model(model_name)

# if we're training an ALS based model, weight input for last.fm
# by bm25
if issubclass(model.__class__, AlternatingLeastSquares):
# lets weight these models by bm25weight.
logging.debug("weighting matrix by bm25_weight")
plays = bm25_weight(plays, K1=100, B=0.8)

elif model_name == "tfidf":
model = TFIDFRecommender()

elif model_name == "cosine":
model = CosineRecommender()
# also disable building approximate recommend index
model.approximate_recommend = False

elif model_name == "bm25":
model = BM25Recommender(K1=100, B=0.5)

else:
raise NotImplementedError("TODO: model %s" % model_name)

# train the model
logging.debug("training model %s", model_name)
start = time.time()
model.fit(plays)
logging.debug("trained model '%s' in %s", model_name, time.time() - start)
logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

# write out similar artists by popularity
artists = dict(enumerate(df['artist'].cat.categories))
start = time.time()
logging.debug("calculating top artists")
user_count = df.groupby('artist').size()
artists = dict(enumerate(df['artist'].cat.categories))
to_generate = sorted(list(artists), key=lambda x: -user_count[x])

# write out as a TSV of artistid, otherartistid, score
Expand All @@ -108,43 +114,64 @@ def calculate_similar_artists(input_filename, output_filename,
for other, score in model.similar_items(artistid, 11):
o.write("%s\t%s\t%s\n" % (artist, artists[other], score))

logging.debug("generated similar artists in %0.2fs", time.time() - start)


def calculate_recommendations(input_filename, output_filename, model_name="als"):
    """ Generates artist recommendations for each user in the dataset, and writes
    them to 'output_filename' as a TSV of username, artist, score.

    input_filename: path of the last.fm dataset file (handed to read_data)
    output_filename: path of the TSV file to write, opened in "w" mode
    model_name: key of MODELS selecting which model to train """
    # load the dataset: a dataframe and a sparse artist/user matrix of plays
    df, plays = read_data(input_filename)

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input for last.fm
    # by bm25
    if isinstance(model, AlternatingLeastSquares):
        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # only recommend() is called below, so disable building the
        # approximate similar_items index
        model.approximate_similar_items = False

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # generate recommendations for each user and write out to a file
    artists = dict(enumerate(df['artist'].cat.categories))
    start = time.time()
    # transpose the artist/user matrix into user/artist rows before handing
    # it to recommend()
    user_plays = plays.T.tocsr()
    with open(output_filename, "w") as o:
        for userid, username in enumerate(df['user'].cat.categories):
            for artistid, score in model.recommend(userid, user_plays):
                o.write("%s\t%s\t%s\n" % (username, artists[artistid], score))
    logging.debug("generated recommendations in %0.2fs", time.time() - start)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generates related artists on the last.fm dataset",
parser = argparse.ArgumentParser(description="Generates similar artists on the last.fm dataset"
" or generates personalized recommendations for each user",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument('--input', type=str,
dest='inputfile', help='last.fm dataset file', required=True)
parser.add_argument('--output', type=str, default='similar-artists.tsv',
dest='outputfile', help='output file name')
parser.add_argument('--model', type=str, default='als',
dest='model', help='model to calculate (als/bm25/tfidf/cosine)')
parser.add_argument('--factors', type=int, default=50, dest='factors',
help='Number of factors to calculate')
parser.add_argument('--reg', type=float, default=0.8, dest='regularization',
help='regularization weight')
parser.add_argument('--iter', type=int, default=15, dest='iterations',
help='Number of ALS iterations')
parser.add_argument('--exact', help='compute exact distances (slow)', action="store_true")
parser.add_argument('--purepython',
help='dont use cython extension (slow)',
action="store_true")
parser.add_argument('--float32',
help='use 32 bit floating point numbers',
action="store_true")
parser.add_argument('--cg',
help='use CG optimizer',
dest='model', help='model to calculate (%s)' % "/".join(MODELS.keys()))
parser.add_argument('--recommend',
help='Recommend items for each user rather than calculate similar_items',
action="store_true")
parser.add_argument('--param', action='append',
help="Parameters to pass to the model, formatted as 'KEY=VALUE'")

args = parser.parse_args()

logging.basicConfig(level=logging.DEBUG)

calculate_similar_artists(args.inputfile, args.outputfile,
model_name=args.model,
factors=args.factors,
regularization=args.regularization,
exact=args.exact,
iterations=args.iterations,
use_native=not args.purepython,
dtype=numpy.float32 if args.float32 else numpy.float64,
cg=args.cg)
if args.recommend:
calculate_recommendations(args.inputfile, args.outputfile, model_name=args.model)
else:
calculate_similar_artists(args.inputfile, args.outputfile, model_name=args.model)
2 changes: 1 addition & 1 deletion implicit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
from . import nearest_neighbours
from . import als

__version__ = '0.2.6'
__version__ = '0.2.7'

__all__ = [alternating_least_squares, als, nearest_neighbours, __version__]
102 changes: 0 additions & 102 deletions implicit/annoy_als.py

This file was deleted.

Loading