diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4a0a0e64..3a38850c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8 isort cpplint + pip install flake8 isort cpplint black pip install -r requirements.txt - name: Lint with flake8 run: | @@ -36,6 +36,9 @@ jobs: # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=11 --max-line-length=127 --statistics flake8 --filename='*.pyx,*.px*' --ignore E901,E225,E226,E227,E402,E999 + - name: Lint with black + run: | + black --check . - name: Lint with isort run: | isort -c . diff --git a/benchmarks/benchmark_als.py b/benchmarks/benchmark_als.py index 211b89e4..bcdda46b 100644 --- a/benchmarks/benchmark_als.py +++ b/benchmarks/benchmark_als.py @@ -17,6 +17,7 @@ try: import implicit.gpu # noqa + has_cuda = True except ImportError: has_cuda = False @@ -30,25 +31,29 @@ def inner(iteration, elapsed): loss = calculate_loss(plays, model.item_factors, model.user_factors, 0) print("model %s iteration %i loss %.5f" % (name, iteration, loss)) output[name].append(loss) + return inner for steps in [2, 3, 4]: - model = AlternatingLeastSquares(factors=100, use_native=True, use_cg=True, regularization=0, - iterations=25) + model = AlternatingLeastSquares( + factors=100, use_native=True, use_cg=True, regularization=0, iterations=25 + ) model.cg_steps = steps - model.fit_callback = store_loss(model, 'cg%i' % steps) + model.fit_callback = store_loss(model, "cg%i" % steps) model.fit(plays) if has_cuda: - model = AlternatingLeastSquares(factors=100, use_native=True, use_gpu=True, - regularization=0, iterations=25) - model.fit_callback = store_loss(model, 'gpu') + model = AlternatingLeastSquares( + factors=100, use_native=True, use_gpu=True, regularization=0, iterations=25 + ) + model.fit_callback = store_loss(model, "gpu") model.use_gpu = True model.fit(plays) - model = AlternatingLeastSquares(factors=100, use_native=True, use_cg=False, regularization=0, - iterations=25) - model.fit_callback = store_loss(model, 'cholesky') + model = AlternatingLeastSquares( + factors=100, use_native=True, use_cg=False, regularization=0, iterations=25 + ) + model.fit_callback = store_loss(model, "cholesky") model.fit(plays) return output @@ -61,99 +66,122 @@ def store_time(model, name): def inner(iteration, elapsed): print(name, model.factors, iteration, elapsed) times[name][model.factors].append(elapsed) + return inner output = defaultdict(list) for factors in range(32, 257, 32): for steps in [2, 3, 4]: - model = AlternatingLeastSquares(factors=factors, use_native=True, use_cg=True, - regularization=0, iterations=iterations) - model.fit_callback = store_time(model, 'cg%i' % steps) + model = AlternatingLeastSquares( + factors=factors, + use_native=True, + use_cg=True, + regularization=0, + iterations=iterations, + ) + model.fit_callback = store_time(model, "cg%i" % steps) model.cg_steps = steps model.fit(plays) - model = AlternatingLeastSquares(factors=factors, use_native=True, use_cg=False, - regularization=0, iterations=iterations) - model.fit_callback = store_time(model, 'cholesky') + model = AlternatingLeastSquares( + factors=factors, use_native=True, use_cg=False, regularization=0, iterations=iterations + ) + model.fit_callback = store_time(model, "cholesky") model.fit(plays) if has_cuda: - model = AlternatingLeastSquares(factors=factors, 
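The store_loss/store_time helpers above both rely on the model's `fit_callback` hook, which implicit invokes after every ALS iteration with `(iteration, elapsed)`. A minimal self-contained sketch of that pattern, with a tiny random matrix standing in for the benchmark's real `plays` data (the data and parameters here are illustrative only):

```python
from collections import defaultdict

import numpy as np
import scipy.sparse as sparse

from implicit.als import AlternatingLeastSquares

# tiny random item-user matrix standing in for the real `plays` data
plays = sparse.random(500, 2000, density=0.01, format="csr", dtype=np.float32)

times = defaultdict(list)


def record(iteration, elapsed):
    # implicit calls fit_callback once per ALS iteration with (iteration, elapsed)
    times["cg3"].append(elapsed)


model = AlternatingLeastSquares(
    factors=100, use_native=True, use_cg=True, regularization=0, iterations=25
)
model.cg_steps = 3
model.fit_callback = record
model.fit(plays)
print("fastest iteration: %.3fs" % min(times["cg3"]))
```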
use_native=True, use_gpu=True, - regularization=0, iterations=iterations) - model.fit_callback = store_time(model, 'gpu') + model = AlternatingLeastSquares( + factors=factors, + use_native=True, + use_gpu=True, + regularization=0, + iterations=iterations, + ) + model.fit_callback = store_time(model, "gpu") model.fit(plays) # take the min time for the output - output['factors'].append(factors) + output["factors"].append(factors) for name, stats in times.items(): output[name].append(min(stats[factors])) return output -LABELS = {'cg2': 'CG (2 Steps/Iteration)', - 'cg3': 'CG (3 Steps/Iteration)', - 'cg4': 'CG (4 Steps/Iteration)', - 'gpu': 'GPU', - 'cholesky': 'Cholesky'} - -COLOURS = {'cg2': "#2ca02c", - 'cg3': "#ff7f0e", - 'cg4': "#c5b0d5", - 'gpu': "#1f77b4", - 'cholesky': "#d62728"} - - -def generate_speed_graph(data, filename="als_speed.png", keys=['gpu', 'cg2', 'cg3', 'cholesky'], - labels=None, colours=None): +LABELS = { + "cg2": "CG (2 Steps/Iteration)", + "cg3": "CG (3 Steps/Iteration)", + "cg4": "CG (4 Steps/Iteration)", + "gpu": "GPU", + "cholesky": "Cholesky", +} + +COLOURS = { + "cg2": "#2ca02c", + "cg3": "#ff7f0e", + "cg4": "#c5b0d5", + "gpu": "#1f77b4", + "cholesky": "#d62728", +} + + +def generate_speed_graph( + data, + filename="als_speed.png", + keys=["gpu", "cg2", "cg3", "cholesky"], + labels=None, + colours=None, +): labels = labels or {} colours = colours or {} seaborn.set() fig, ax = plt.subplots() - factors = data['factors'] + factors = data["factors"] for key in keys: - ax.plot(factors, data[key], - color=colours.get(key, COLOURS.get(key)), - marker='o', markersize=6) + ax.plot( + factors, data[key], color=colours.get(key, COLOURS.get(key)), marker="o", markersize=6 + ) ax.text(factors[-1] + 5, data[key][-1], labels.get(key, LABELS[key]), fontsize=10) ax.set_ylabel("Seconds per Iteration") ax.set_xlabel("Factors") - plt.savefig(filename, bbox_inches='tight', dpi=300) + plt.savefig(filename, bbox_inches="tight", dpi=300) -def generate_loss_graph(data, filename="als_speed.png", keys=['gpu', 'cg2', 'cg3', 'cholesky']): +def generate_loss_graph(data, filename="als_speed.png", keys=["gpu", "cg2", "cg3", "cholesky"]): seaborn.set() fig, ax = plt.subplots() - iterations = range(1, len(data['cholesky']) + 1) + iterations = range(1, len(data["cholesky"]) + 1) for key in keys: - ax.plot(iterations, data[key], - color=COLOURS[key], - marker='o', markersize=6) + ax.plot(iterations, data[key], color=COLOURS[key], marker="o", markersize=6) ax.text(iterations[-1] + 1, data[key][-1], LABELS[key], fontsize=10) ax.set_ylabel("Mean Squared Error") ax.set_xlabel("Iteration") - plt.savefig(filename, bbox_inches='tight', dpi=300) + plt.savefig(filename, bbox_inches="tight", dpi=300) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark CG version against Cholesky", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument('--input', type=str, required=True, - dest='inputfile', help='dataset file in matrix market format') - parser.add_argument('--graph', help='generates graphs', - action="store_true") - parser.add_argument('--loss', help='test training loss', - action="store_true") - parser.add_argument('--speed', help='test training speed', - action="store_true") + parser = argparse.ArgumentParser( + description="Benchmark CG version against Cholesky", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "--input", + type=str, + required=True, + dest="inputfile", + help="dataset file in matrix market 
format", + ) + parser.add_argument("--graph", help="generates graphs", action="store_true") + parser.add_argument("--loss", help="test training loss", action="store_true") + parser.add_argument("--speed", help="test training speed", action="store_true") args = parser.parse_args() if not (args.speed or args.loss): diff --git a/benchmarks/benchmark_qmf.py b/benchmarks/benchmark_qmf.py index faab73d8..adc80b1e 100644 --- a/benchmarks/benchmark_qmf.py +++ b/benchmarks/benchmark_qmf.py @@ -32,15 +32,24 @@ def benchmark_implicit(matrix, factors, reg, iterations): def benchmark_qmf(qmfpath, matrix, factors, reg, iterations): matrix = matrix.tocoo() datafile = "qmf_data.txt" - open(datafile, "w").write("\n".join("%s %s %s" % vals - for vals in zip(matrix.row, matrix.col, matrix.data))) + open(datafile, "w").write( + "\n".join("%s %s %s" % vals for vals in zip(matrix.row, matrix.col, matrix.data)) + ) def get_qmf_command(nepochs): - return [qmfpath, "--train_dataset", datafile, - "--nfactors", str(factors), - "--confidence_weight", "1", - "--nepochs", str(nepochs), - "--regularization_lambda", str(reg)] + return [ + qmfpath, + "--train_dataset", + datafile, + "--nfactors", + str(factors), + "--confidence_weight", + "1", + "--nepochs", + str(nepochs), + "--regularization_lambda", + str(reg), + ] # ok, so QMF needs to read the data in - and including # that in the timing isn't fair. So run it once with no iterations @@ -58,8 +67,9 @@ def get_qmf_command(nepochs): def run_benchmark(args): plays = bm25_weight(scipy.io.mmread(args.inputfile)) - qmf_time = benchmark_qmf(args.qmfpath, plays, args.factors, args.regularization, - args.iterations) + qmf_time = benchmark_qmf( + args.qmfpath, plays, args.factors, args.regularization, args.iterations + ) implicit_time = benchmark_implicit(plays, args.factors, args.regularization, args.iterations) @@ -69,19 +79,25 @@ def run_benchmark(args): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Generates Benchmark", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument('--input', type=str, - dest='inputfile', help='dataset file in matrix market format') - parser.add_argument('--qmfpath', type=str, - dest='qmfpath', help='full path to qmf wals.bin file', required=True) - parser.add_argument('--factors', type=int, default=50, dest='factors', - help='Number of factors to calculate') - parser.add_argument('--reg', type=float, default=0.8, dest='regularization', - help='regularization weight') - parser.add_argument('--iter', type=int, default=15, dest='iterations', - help='Number of ALS iterations') + parser = argparse.ArgumentParser( + description="Generates Benchmark", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--input", type=str, dest="inputfile", help="dataset file in matrix market format" + ) + parser.add_argument( + "--qmfpath", type=str, dest="qmfpath", help="full path to qmf wals.bin file", required=True + ) + parser.add_argument( + "--factors", type=int, default=50, dest="factors", help="Number of factors to calculate" + ) + parser.add_argument( + "--reg", type=float, default=0.8, dest="regularization", help="regularization weight" + ) + parser.add_argument( + "--iter", type=int, default=15, dest="iterations", help="Number of ALS iterations" + ) args = parser.parse_args() logging.basicConfig(level=logging.DEBUG) diff --git a/benchmarks/benchmark_spark.py b/benchmarks/benchmark_spark.py index ca0c2a04..5a07bc10 100644 --- a/benchmarks/benchmark_spark.py +++ 
b/benchmarks/benchmark_spark.py @@ -20,19 +20,19 @@ def convert_sparse_to_dataframe(spark, context, sparse_matrix): """ Converts a scipy sparse matrix to a spark dataframe """ m = sparse_matrix.tocoo() - data = context.parallelize(numpy.array([m.row, m.col, m.data]).T, - numSlices=len(m.row)/1024) - return spark.createDataFrame(data.map(lambda p: Row(row=int(p[0]), - col=int(p[1]), - data=float(p[2])))) + data = context.parallelize(numpy.array([m.row, m.col, m.data]).T, numSlices=len(m.row) / 1024) + return spark.createDataFrame( + data.map(lambda p: Row(row=int(p[0]), col=int(p[1]), data=float(p[2]))) + ) def benchmark_spark(ratings, factors, iterations=5): - conf = (SparkConf() - .setAppName("implicit_benchmark") - .setMaster('local[*]') - .set('spark.driver.memory', '16G') - ) + conf = ( + SparkConf() + .setAppName("implicit_benchmark") + .setMaster("local[*]") + .set("spark.driver.memory", "16G") + ) context = SparkContext(conf=conf) spark = SparkSession(context) @@ -41,14 +41,20 @@ def benchmark_spark(ratings, factors, iterations=5): ratings = convert_sparse_to_dataframe(spark, context, ratings) for rank in factors: - als = ALS(rank=rank, maxIter=iterations, - alpha=1, implicitPrefs=True, - userCol="row", itemCol="col", ratingCol="data") + als = ALS( + rank=rank, + maxIter=iterations, + alpha=1, + implicitPrefs=True, + userCol="row", + itemCol="col", + ratingCol="data", + ) start = time.time() als.fit(ratings) elapsed = time.time() - start times[rank] = elapsed / iterations - print("spark. factors=%i took %.3f" % (rank, elapsed/iterations)) + print("spark. factors=%i took %.3f" % (rank, elapsed / iterations)) finally: spark.stop() @@ -59,15 +65,15 @@ def benchmark_implicit(ratings, factors, iterations=5, use_gpu=False): ratings = ratings.tocsr() times = {} for rank in factors: - model = implicit.als.AlternatingLeastSquares(factors=rank, - iterations=iterations, - use_gpu=use_gpu) + model = implicit.als.AlternatingLeastSquares( + factors=rank, iterations=iterations, use_gpu=use_gpu + ) start = time.time() model.fit(ratings) elapsed = time.time() - start # take average time over iterations to be consistent with spark timings times[rank] = elapsed / iterations - print("implicit. factors=%i took %.3f" % (rank, elapsed/iterations)) + print("implicit. 
factors=%i took %.3f" % (rank, elapsed / iterations)) return times @@ -76,22 +82,24 @@ def generate_graph(times, factors, filename="spark_speed.png"): fig, ax = plt.subplots() for key in times: current = [times[key][f] for f in factors] - ax.plot(factors, current, marker='o', markersize=6) + ax.plot(factors, current, marker="o", markersize=6) ax.text(factors[-1] + 5, current[-1], key, fontsize=10) ax.set_ylabel("Seconds per Iteration") ax.set_xlabel("Factors") - plt.savefig(filename, bbox_inches='tight', dpi=300) + plt.savefig(filename, bbox_inches="tight", dpi=300) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark Spark against implicit", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument('--input', type=str, required=True, - help='dataset file in matrix market format') - parser.add_argument('--output', type=str, required=True, - help='output file location') + parser = argparse.ArgumentParser( + description="Benchmark Spark against implicit", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "--input", type=str, required=True, help="dataset file in matrix market format" + ) + parser.add_argument("--output", type=str, required=True, help="output file location") args = parser.parse_args() if not (args.speed or args.loss): print("must specify at least one of --speed or --loss") @@ -102,9 +110,9 @@ def generate_graph(times, factors, filename="spark_speed.png"): times = {} factors = list(range(64, 257, 64)) - times['Implicit (GPU)'] = benchmark_implicit(m, factors, use_gpu=True) - times['Spark MLlib'] = benchmark_spark(m, factors) - times['Implicit (CPU)'] = benchmark_implicit(m, factors, use_gpu=False) + times["Implicit (GPU)"] = benchmark_implicit(m, factors, use_gpu=True) + times["Spark MLlib"] = benchmark_spark(m, factors) + times["Implicit (CPU)"] = benchmark_implicit(m, factors, use_gpu=False) print(times) generate_graph(times, factors, filename=args.output + ".png") diff --git a/cuda_setup.py b/cuda_setup.py index c6914bf9..cd8cf8ea 100644 --- a/cuda_setup.py +++ b/cuda_setup.py @@ -29,55 +29,62 @@ def locate_cuda(): If nvcc can't be found, this returns None """ - nvcc_bin = 'nvcc' + nvcc_bin = "nvcc" if sys.platform.startswith("win"): - nvcc_bin = 'nvcc.exe' + nvcc_bin = "nvcc.exe" # first check if the CUDAHOME env variable is in use - if 'CUDAHOME' in os.environ: - home = os.environ['CUDAHOME'] - nvcc = os.path.join(home, 'bin', nvcc_bin) - elif 'CUDA_PATH' in os.environ: - home = os.environ['CUDA_PATH'] - nvcc = os.path.join(home, 'bin', nvcc_bin) + if "CUDAHOME" in os.environ: + home = os.environ["CUDAHOME"] + nvcc = os.path.join(home, "bin", nvcc_bin) + elif "CUDA_PATH" in os.environ: + home = os.environ["CUDA_PATH"] + nvcc = os.path.join(home, "bin", nvcc_bin) else: # otherwise, search the PATH for NVCC - nvcc = find_in_path(nvcc_bin, os.environ['PATH']) + nvcc = find_in_path(nvcc_bin, os.environ["PATH"]) if nvcc is None: - logging.warning('The nvcc binary could not be located in your $PATH. Either add it to ' - 'your path, or set $CUDAHOME to enable CUDA extensions') + logging.warning( + "The nvcc binary could not be located in your $PATH. 
Either add it to " + "your path, or set $CUDAHOME to enable CUDA extensions" + ) return None home = os.path.dirname(os.path.dirname(nvcc)) if not os.path.exists(os.path.join(home, "include")): logging.warning("Failed to find cuda include directory, attempting /usr/local/cuda") home = "/usr/local/cuda" - cudaconfig = {'home': home, - 'nvcc': nvcc, - 'include': os.path.join(home, 'include'), - 'lib64': os.path.join(home, 'lib64')} - - post_args = ["-arch=sm_50", - "-gencode=arch=compute_50,code=sm_50", - "-gencode=arch=compute_52,code=sm_52", - "-gencode=arch=compute_60,code=sm_60", - "-gencode=arch=compute_61,code=sm_61", - "-gencode=arch=compute_70,code=sm_70", - "-gencode=arch=compute_70,code=compute_70", - "--ptxas-options=-v", "-O2"] + cudaconfig = { + "home": home, + "nvcc": nvcc, + "include": os.path.join(home, "include"), + "lib64": os.path.join(home, "lib64"), + } + + post_args = [ + "-arch=sm_50", + "-gencode=arch=compute_50,code=sm_50", + "-gencode=arch=compute_52,code=sm_52", + "-gencode=arch=compute_60,code=sm_60", + "-gencode=arch=compute_61,code=sm_61", + "-gencode=arch=compute_70,code=sm_70", + "-gencode=arch=compute_70,code=compute_70", + "--ptxas-options=-v", + "-O2", + ] if sys.platform == "win32": - cudaconfig['lib64'] = os.path.join(home, 'lib', 'x64') - post_args += ['-Xcompiler', '/MD'] + cudaconfig["lib64"] = os.path.join(home, "lib", "x64") + post_args += ["-Xcompiler", "/MD"] else: - post_args += ['-c', '--compiler-options', "'-fPIC'"] + post_args += ["-c", "--compiler-options", "'-fPIC'"] for k, v in cudaconfig.items(): if not os.path.exists(v): - logging.warning('The CUDA %s path could not be located in %s', k, v) + logging.warning("The CUDA %s path could not be located in %s", k, v) return None - cudaconfig['post_args'] = post_args + cudaconfig["post_args"] = post_args return cudaconfig @@ -85,45 +92,55 @@ def locate_cuda(): # https://github.com/cupy/cupy/blob/master/cupy_setup_build.py class _UnixCCompiler(unixccompiler.UnixCCompiler): src_extensions = list(unixccompiler.UnixCCompiler.src_extensions) - src_extensions.append('.cu') + src_extensions.append(".cu") def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): # For sources other than CUDA C ones, just call the super class method. - if os.path.splitext(src)[1] != '.cu': + if os.path.splitext(src)[1] != ".cu": return unixccompiler.UnixCCompiler._compile( - self, obj, src, ext, cc_args, extra_postargs, pp_opts) + self, obj, src, ext, cc_args, extra_postargs, pp_opts + ) # For CUDA C source files, compile them with NVCC. _compiler_so = self.compiler_so try: - nvcc_path = CUDA['nvcc'] - post_args = CUDA['post_args'] + nvcc_path = CUDA["nvcc"] + post_args = CUDA["post_args"] # TODO? 
base_opts = build.get_compiler_base_options() - self.set_executable('compiler_so', nvcc_path) + self.set_executable("compiler_so", nvcc_path) return unixccompiler.UnixCCompiler._compile( - self, obj, src, ext, cc_args, post_args, pp_opts) + self, obj, src, ext, cc_args, post_args, pp_opts + ) finally: self.compiler_so = _compiler_so class _MSVCCompiler(msvccompiler.MSVCCompiler): - _cu_extensions = ['.cu'] + _cu_extensions = [".cu"] src_extensions = list(unixccompiler.UnixCCompiler.src_extensions) src_extensions.extend(_cu_extensions) - def _compile_cu(self, sources, output_dir=None, macros=None, - include_dirs=None, debug=0, extra_preargs=None, - extra_postargs=None, depends=None): + def _compile_cu( + self, + sources, + output_dir=None, + macros=None, + include_dirs=None, + debug=0, + extra_preargs=None, + extra_postargs=None, + depends=None, + ): # Compile CUDA C files, mainly derived from UnixCCompiler._compile(). - macros, objects, extra_postargs, pp_opts, _build = \ - self._setup_compile(output_dir, macros, include_dirs, sources, - depends, extra_postargs) + macros, objects, extra_postargs, pp_opts, _build = self._setup_compile( + output_dir, macros, include_dirs, sources, depends, extra_postargs + ) - compiler_so = CUDA['nvcc'] + compiler_so = CUDA["nvcc"] cc_args = self._get_cc_args(pp_opts, debug, extra_preargs) - post_args = CUDA['post_args'] + post_args = CUDA["post_args"] for obj in objects: try: @@ -131,7 +148,7 @@ def _compile_cu(self, sources, output_dir=None, macros=None, except KeyError: continue try: - self.spawn([compiler_so] + cc_args + [src, '-o', obj] + post_args) + self.spawn([compiler_so] + cc_args + [src, "-o", obj] + post_args) except errors.DistutilsExecError as e: raise errors.CompileError(str(e)) @@ -142,14 +159,13 @@ def compile(self, sources, **kwargs): cu_sources = [] other_sources = [] for source in sources: - if os.path.splitext(source)[1] == '.cu': + if os.path.splitext(source)[1] == ".cu": cu_sources.append(source) else: other_sources.append(source) # Compile source files other than CUDA C ones. - other_objects = msvccompiler.MSVCCompiler.compile( - self, other_sources, **kwargs) + other_objects = msvccompiler.MSVCCompiler.compile(self, other_sources, **kwargs) # Compile CUDA C sources. cu_objects = self._compile_cu(cu_sources, **kwargs) @@ -163,22 +179,24 @@ class cuda_build_ext(setuptools_build_ext): def run(self): if CUDA is not None: + def wrap_new_compiler(func): def _wrap_new_compiler(*args, **kwargs): try: return func(*args, **kwargs) except errors.DistutilsPlatformError: - if not sys.platform == 'win32': + if not sys.platform == "win32": CCompiler = _UnixCCompiler else: CCompiler = _MSVCCompiler - return CCompiler( - None, kwargs['dry_run'], kwargs['force']) + return CCompiler(None, kwargs["dry_run"], kwargs["force"]) + return _wrap_new_compiler + ccompiler.new_compiler = wrap_new_compiler(ccompiler.new_compiler) # Intentionally causes DistutilsPlatformError in # ccompiler.new_compiler() function to hook. - self.compiler = 'nvidia' + self.compiler = "nvidia" setuptools_build_ext.run(self) diff --git a/docs/conf.py b/docs/conf.py index 92d0ed01..8c3560fe 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -30,25 +30,24 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
-extensions = ['sphinx.ext.autodoc', - 'sphinx.ext.napoleon'] +extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon"] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'Implicit' -copyright = u'2017, Ben Frederickson' -author = u'Ben Frederickson' +project = u"Implicit" +copyright = u"2017, Ben Frederickson" +author = u"Ben Frederickson" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -57,7 +56,7 @@ # The short X.Y version. import implicit # noqa -version = '.'.join(implicit.__version__.split('.')[:2]) +version = ".".join(implicit.__version__.split(".")[:2]) # The full version, including alpha/beta/rc tags. release = implicit.__version__ @@ -72,10 +71,10 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -86,7 +85,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -97,7 +96,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Custom sidebar templates, must be a dictionary that maps document names # to template names. @@ -105,12 +104,12 @@ # This is required for the alabaster theme # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars html_sidebars = { - '**': [ - 'about.html', - 'navigation.html', - 'relations.html', # needs 'show_related': True theme option to display - 'searchbox.html', - 'donate.html', + "**": [ + "about.html", + "navigation.html", + "relations.html", # needs 'show_related': True theme option to display + "searchbox.html", + "donate.html", ] } @@ -118,7 +117,7 @@ # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. -htmlhelp_basename = 'Implicitdoc' +htmlhelp_basename = "Implicitdoc" # -- Options for LaTeX output --------------------------------------------- @@ -127,15 +126,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. 
# # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -145,8 +141,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'Implicit.tex', u'Implicit Documentation', - u'Ben Frederickson', 'manual'), + (master_doc, "Implicit.tex", u"Implicit Documentation", u"Ben Frederickson", "manual"), ] @@ -154,10 +149,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'implicit', u'Implicit Documentation', - [author], 1) -] +man_pages = [(master_doc, "implicit", u"Implicit Documentation", [author], 1)] # -- Options for Texinfo output ------------------------------------------- @@ -166,7 +158,13 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'Implicit', u'Implicit Documentation', - author, 'Implicit', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "Implicit", + u"Implicit Documentation", + author, + "Implicit", + "One line description of project.", + "Miscellaneous", + ), ] diff --git a/examples/lastfm.py b/examples/lastfm.py index 65d52c74..ff1afa2d 100644 --- a/examples/lastfm.py +++ b/examples/lastfm.py @@ -15,24 +15,33 @@ import tqdm from implicit.als import AlternatingLeastSquares -from implicit.approximate_als import (AnnoyAlternatingLeastSquares, FaissAlternatingLeastSquares, - NMSLibAlternatingLeastSquares) +from implicit.approximate_als import ( + AnnoyAlternatingLeastSquares, + FaissAlternatingLeastSquares, + NMSLibAlternatingLeastSquares, +) from implicit.bpr import BayesianPersonalizedRanking from implicit.datasets.lastfm import get_lastfm from implicit.lmf import LogisticMatrixFactorization -from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender, - TFIDFRecommender, bm25_weight) +from implicit.nearest_neighbours import ( + BM25Recommender, + CosineRecommender, + TFIDFRecommender, + bm25_weight, +) # maps command line model argument to class name -MODELS = {"als": AlternatingLeastSquares, - "nmslib_als": NMSLibAlternatingLeastSquares, - "annoy_als": AnnoyAlternatingLeastSquares, - "faiss_als": FaissAlternatingLeastSquares, - "tfidf": TFIDFRecommender, - "cosine": CosineRecommender, - "bpr": BayesianPersonalizedRanking, - "lmf": LogisticMatrixFactorization, - "bm25": BM25Recommender} +MODELS = { + "als": AlternatingLeastSquares, + "nmslib_als": NMSLibAlternatingLeastSquares, + "annoy_als": AnnoyAlternatingLeastSquares, + "faiss_als": FaissAlternatingLeastSquares, + "tfidf": TFIDFRecommender, + "cosine": CosineRecommender, + "bpr": BayesianPersonalizedRanking, + "lmf": LogisticMatrixFactorization, + "bm25": BM25Recommender, +} def get_model(model_name): @@ -43,13 +52,13 @@ def get_model(model_name): # some default params if model_name.endswith("als"): - params = {'factors': 64, 'dtype': np.float32} + params = {"factors": 64, "dtype": np.float32} elif model_name == "bm25": - params = {'K1': 100, 'B': 0.5} + params = {"K1": 100, "B": 0.5} elif model_name == "bpr": - params = {'factors': 63} + params = {"factors": 63} elif model_name == "lmf": - params = {'factors': 30, "iterations": 40, "regularization": 1.5} + params = {"factors": 30, "iterations": 40, "regularization": 1.5} else: params = {} @@ -57,8 +66,8 @@ def get_model(model_name): def calculate_similar_artists(output_filename, model_name="als"): - """ generates a list of similar artists in lastfm by 
utilizing the 'similar_items' - api of the models """ + """generates a list of similar artists in lastfm by utilizing the 'similar_items' + api of the models""" artists, users, plays = get_lastfm() # create a model from the input data @@ -99,7 +108,7 @@ def calculate_similar_artists(output_filename, model_name="als"): o.write("%s\t%s\t%s\n" % (artist, artists[other], score)) progress.update(1) - logging.debug("generated similar artists in %0.2fs", time.time() - start) + logging.debug("generated similar artists in %0.2fs", time.time() - start) def calculate_recommendations(output_filename, model_name="als"): @@ -137,22 +146,37 @@ def calculate_recommendations(output_filename, model_name="als"): for artistid, score in model.recommend(userid, user_plays): o.write("%s\t%s\t%s\n" % (username, artists[artistid], score)) progress.update(1) - logging.debug("generated recommendations in %0.2fs", time.time() - start) + logging.debug("generated recommendations in %0.2fs", time.time() - start) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Generates similar artists on the last.fm dataset" - " or generates personalized recommendations for each user", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--output', type=str, default='similar-artists.tsv', - dest='outputfile', help='output file name') - parser.add_argument('--model', type=str, default='als', - dest='model', help='model to calculate (%s)' % "/".join(MODELS.keys())) - parser.add_argument('--recommend', - help='Recommend items for each user rather than calculate similar_items', - action="store_true") - parser.add_argument('--param', action='append', - help="Parameters to pass to the model, formatted as 'KEY=VALUE") + parser = argparse.ArgumentParser( + description="Generates similar artists on the last.fm dataset" + " or generates personalized recommendations for each user", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--output", + type=str, + default="similar-artists.tsv", + dest="outputfile", + help="output file name", + ) + parser.add_argument( + "--model", + type=str, + default="als", + dest="model", + help="model to calculate (%s)" % "/".join(MODELS.keys()), + ) + parser.add_argument( + "--recommend", + help="Recommend items for each user rather than calculate similar_items", + action="store_true", + ) + parser.add_argument( + "--param", action="append", help="Parameters to pass to the model, formatted as 'KEY=VALUE" + ) args = parser.parse_args() diff --git a/examples/movielens.py b/examples/movielens.py index 68f7ae9b..7ecb66c4 100644 --- a/examples/movielens.py +++ b/examples/movielens.py @@ -24,15 +24,17 @@ from implicit.bpr import BayesianPersonalizedRanking from implicit.datasets.movielens import get_movielens from implicit.lmf import LogisticMatrixFactorization -from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender, - TFIDFRecommender, bm25_weight) +from implicit.nearest_neighbours import ( + BM25Recommender, + CosineRecommender, + TFIDFRecommender, + bm25_weight, +) log = logging.getLogger("implicit") -def calculate_similar_movies(output_filename, - model_name="als", min_rating=4.0, - variant='20m'): +def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"): # read in the input data file start = time.time() titles, ratings = get_movielens(variant) @@ -95,22 +97,44 @@ def calculate_similar_movies(output_filename, if __name__ == "__main__": - parser = 
argparse.ArgumentParser(description="Generates related movies from the MovieLens 20M " - "dataset (https://grouplens.org/datasets/movielens/20m/)", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument('--output', type=str, default='similar-movies.tsv', - dest='outputfile', help='output file name') - parser.add_argument('--model', type=str, default='als', - dest='model', help='model to calculate (als/bm25/tfidf/cosine)') - parser.add_argument('--variant', type=str, default='20m', dest='variant', - help='Whether to use the 20m, 10m, 1m or 100k movielens dataset') - parser.add_argument('--min_rating', type=float, default=4.0, dest='min_rating', - help='Minimum rating to assume that a rating is positive') + parser = argparse.ArgumentParser( + description="Generates related movies from the MovieLens 20M " + "dataset (https://grouplens.org/datasets/movielens/20m/)", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "--output", + type=str, + default="similar-movies.tsv", + dest="outputfile", + help="output file name", + ) + parser.add_argument( + "--model", + type=str, + default="als", + dest="model", + help="model to calculate (als/bm25/tfidf/cosine)", + ) + parser.add_argument( + "--variant", + type=str, + default="20m", + dest="variant", + help="Whether to use the 20m, 10m, 1m or 100k movielens dataset", + ) + parser.add_argument( + "--min_rating", + type=float, + default=4.0, + dest="min_rating", + help="Minimum rating to assume that a rating is positive", + ) args = parser.parse_args() logging.basicConfig(level=logging.DEBUG) - calculate_similar_movies(args.outputfile, - model_name=args.model, - min_rating=args.min_rating, variant=args.variant) + calculate_similar_movies( + args.outputfile, model_name=args.model, min_rating=args.min_rating, variant=args.variant + ) diff --git a/implicit/__init__.py b/implicit/__init__.py index 7a869a7a..b3f203f4 100644 --- a/implicit/__init__.py +++ b/implicit/__init__.py @@ -1,5 +1,5 @@ from . import als, approximate_als, bpr, lmf, nearest_neighbours -__version__ = '0.4.4' +__version__ = "0.4.4" __all__ = [als, approximate_als, bpr, nearest_neighbours, lmf, __version__] diff --git a/implicit/als.py b/implicit/als.py index 4c59c291..ced62558 100644 --- a/implicit/als.py +++ b/implicit/als.py @@ -16,7 +16,7 @@ def AlternatingLeastSquares( num_threads=0, random_state=None, ): - """ Alternating Least Squares + """Alternating Least Squares A Recommendation Model based off the algorithms described in the paper 'Collaborative Filtering for Implicit Feedback Datasets' with performance optimizations described in diff --git a/implicit/approximate_als.py b/implicit/approximate_als.py index 53cecc6f..5395c67c 100644 --- a/implicit/approximate_als.py +++ b/implicit/approximate_als.py @@ -15,7 +15,7 @@ def augment_inner_product_matrix(factors): - """ This function transforms a factor matrix such that an angular nearest neighbours search + """This function transforms a factor matrix such that an angular nearest neighbours search will return top related items of the inner product. This involves transforming each row by adding one extra dimension as suggested in the paper: @@ -24,7 +24,7 @@ def augment_inner_product_matrix(factors): Basically this involves transforming each feature vector so that they have the same norm, which means the cosine of this transformed vector is proportional to the dot product (if the other - vector in the cosine has a 0 in the extra dimension). 
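A small numpy sketch of the augmentation that docstring describes: every row gets one extra coordinate so that all rows share the same norm, and a query with a 0 in that coordinate has cosine similarity proportional to the original inner product. The function name and the extra-dimension step below are reconstructed from the paper's description for illustration, not copied from `augment_inner_product_matrix` itself:

```python
import numpy as np


def augment_for_angular_search(factors):
    # give every row the same norm by appending sqrt(max_norm^2 - ||row||^2)
    norms = np.linalg.norm(factors, axis=1)
    max_norm = norms.max()
    extra = np.sqrt(max_norm ** 2 - norms ** 2)
    return max_norm, np.append(factors, extra.reshape(-1, 1), axis=1)


item_factors = np.random.rand(1000, 64).astype(np.float32)
max_norm, augmented = augment_for_angular_search(item_factors)

# queries get a 0 in the extra dimension, so the cosine against an augmented
# row is proportional to the inner product with the original row
query = np.append(np.random.rand(64).astype(np.float32), 0)
```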
""" + vector in the cosine has a 0 in the extra dimension).""" norms = numpy.linalg.norm(factors, axis=1) max_norm = norms.max() @@ -36,7 +36,7 @@ def augment_inner_product_matrix(factors): class NMSLibAlternatingLeastSquares(AlternatingLeastSquares): - """ Speeds up the base :class:`~implicit.als.AlternatingLeastSquares` model by using + """Speeds up the base :class:`~implicit.als.AlternatingLeastSquares` model by using `NMSLib `_ to create approximate nearest neighbours indices of the latent factors. @@ -67,14 +67,21 @@ class NMSLibAlternatingLeastSquares(AlternatingLeastSquares): item_factors """ - def __init__(self, - approximate_similar_items=True, approximate_recommend=True, - method='hnsw', index_params=None, query_params=None, - random_state=None, *args, **kwargs): + def __init__( + self, + approximate_similar_items=True, + approximate_recommend=True, + method="hnsw", + index_params=None, + query_params=None, + random_state=None, + *args, + **kwargs + ): if index_params is None: - index_params = {'M': 16, 'post': 0, 'efConstruction': 400} + index_params = {"M": 16, "post": 0, "efConstruction": 400} if query_params is None: - query_params = {'ef': 90} + query_params = {"ef": 90} self.similar_items_index = None self.recommend_index = None @@ -86,14 +93,14 @@ def __init__(self, self.index_params = index_params self.query_params = query_params - super(NMSLibAlternatingLeastSquares, self).__init__(*args, - random_state=random_state, - **kwargs) + super(NMSLibAlternatingLeastSquares, self).__init__( + *args, random_state=random_state, **kwargs + ) def fit(self, Ciu, show_progress=True): # nmslib can be a little chatty when first imported, disable some of # the logging - logging.getLogger('nmslib').setLevel(logging.WARNING) + logging.getLogger("nmslib").setLevel(logging.WARNING) import nmslib # train the model @@ -102,8 +109,7 @@ def fit(self, Ciu, show_progress=True): # create index for similar_items if self.approximate_similar_items: log.debug("Building nmslib similar items index") - self.similar_items_index = nmslib.init( - method=self.method, space='cosinesimil') + self.similar_items_index = nmslib.init(method=self.method, space="cosinesimil") # there are some numerical instability issues here with # building a cosine index with vectors with 0 norms, hack around this @@ -116,18 +122,15 @@ def fit(self, Ciu, show_progress=True): ids = ids[norms != 0] self.similar_items_index.addDataPointBatch(item_factors, ids=ids) - self.similar_items_index.createIndex(self.index_params, - print_progress=show_progress) + self.similar_items_index.createIndex(self.index_params, print_progress=show_progress) self.similar_items_index.setQueryTimeParams(self.query_params) # build up a separate index for the inner product (for recommend # methods) if self.approximate_recommend: log.debug("Building nmslib recommendation index") - self.max_norm, extra = augment_inner_product_matrix( - self.item_factors) - self.recommend_index = nmslib.init( - method='hnsw', space='cosinesimil') + self.max_norm, extra = augment_inner_product_matrix(self.item_factors) + self.recommend_index = nmslib.init(method="hnsw", space="cosinesimil") self.recommend_index.addDataPointBatch(extra) self.recommend_index.createIndex(self.index_params, print_progress=show_progress) self.recommend_index.setQueryTimeParams(self.query_params) @@ -136,17 +139,26 @@ def similar_items(self, itemid, N=10): if not self.approximate_similar_items: return super(NMSLibAlternatingLeastSquares, self).similar_items(itemid, N) - neighbours, distances = 
self.similar_items_index.knnQuery( - self.item_factors[itemid], N) + neighbours, distances = self.similar_items_index.knnQuery(self.item_factors[itemid], N) return zip(neighbours, 1.0 - distances) - def recommend(self, userid, user_items, N=10, filter_already_liked_items=True, - filter_items=None, recalculate_user=False): + def recommend( + self, + userid, + user_items, + N=10, + filter_already_liked_items=True, + filter_items=None, + recalculate_user=False, + ): if not self.approximate_recommend: - return super(NMSLibAlternatingLeastSquares, - self).recommend(userid, user_items, N=N, - filter_items=filter_items, - recalculate_user=recalculate_user) + return super(NMSLibAlternatingLeastSquares, self).recommend( + userid, + user_items, + N=N, + filter_items=filter_items, + recalculate_user=recalculate_user, + ) user = self._user_factor(userid, user_items, recalculate_user) @@ -202,12 +214,20 @@ class AnnoyAlternatingLeastSquares(AlternatingLeastSquares): item_factors """ - def __init__(self, approximate_similar_items=True, approximate_recommend=True, - n_trees=50, search_k=-1, random_state=None, *args, **kwargs): - - super(AnnoyAlternatingLeastSquares, self).__init__(*args, - random_state=random_state, - **kwargs) + def __init__( + self, + approximate_similar_items=True, + approximate_recommend=True, + n_trees=50, + search_k=-1, + random_state=None, + *args, + **kwargs + ): + + super(AnnoyAlternatingLeastSquares, self).__init__( + *args, random_state=random_state, **kwargs + ) self.similar_items_index = None self.recommend_index = None @@ -230,8 +250,7 @@ def fit(self, Ciu, show_progress=True): if self.approximate_similar_items: log.debug("Building annoy similar items index") - self.similar_items_index = annoy.AnnoyIndex( - self.item_factors.shape[1], 'angular') + self.similar_items_index = annoy.AnnoyIndex(self.item_factors.shape[1], "angular") for i, row in enumerate(self.item_factors): self.similar_items_index.add_item(i, row) self.similar_items_index.build(self.n_trees) @@ -241,7 +260,7 @@ def fit(self, Ciu, show_progress=True): if self.approximate_recommend: log.debug("Building annoy recommendation index") self.max_norm, extra = augment_inner_product_matrix(self.item_factors) - self.recommend_index = annoy.AnnoyIndex(extra.shape[1], 'angular') + self.recommend_index = annoy.AnnoyIndex(extra.shape[1], "angular") for i, row in enumerate(extra): self.recommend_index.add_item(i, row) self.recommend_index.build(self.n_trees) @@ -250,19 +269,29 @@ def similar_items(self, itemid, N=10): if not self.approximate_similar_items: return super(AnnoyAlternatingLeastSquares, self).similar_items(itemid, N) - neighbours, dist = self.similar_items_index.get_nns_by_item(itemid, N, - search_k=self.search_k, - include_distances=True) + neighbours, dist = self.similar_items_index.get_nns_by_item( + itemid, N, search_k=self.search_k, include_distances=True + ) # transform distances back to cosine from euclidean distance return zip(neighbours, 1 - (numpy.array(dist) ** 2) / 2) - def recommend(self, userid, user_items, N=10, filter_already_liked_items=True, - filter_items=None, recalculate_user=False): + def recommend( + self, + userid, + user_items, + N=10, + filter_already_liked_items=True, + filter_items=None, + recalculate_user=False, + ): if not self.approximate_recommend: - return super(AnnoyAlternatingLeastSquares, - self).recommend(userid, user_items, N=N, - filter_items=filter_items, - recalculate_user=recalculate_user) + return super(AnnoyAlternatingLeastSquares, self).recommend( + userid, + 
user_items, + N=N, + filter_items=filter_items, + recalculate_user=recalculate_user, + ) user = self._user_factor(userid, user_items, recalculate_user) @@ -276,8 +305,9 @@ def recommend(self, userid, user_items, N=10, filter_already_liked_items=True, count = N + len(liked) query = numpy.append(user, 0) - ids, dist = self.recommend_index.get_nns_by_vector(query, count, include_distances=True, - search_k=self.search_k) + ids, dist = self.recommend_index.get_nns_by_vector( + query, count, include_distances=True, search_k=self.search_k + ) # convert the distances from euclidean to cosine distance, # and then rescale the cosine distance to go back to inner product @@ -288,7 +318,7 @@ def recommend(self, userid, user_items, N=10, filter_already_liked_items=True, class FaissAlternatingLeastSquares(AlternatingLeastSquares): - """ Speeds up the base :class:`~implicit.als.AlternatingLeastSquares` model by using + """Speeds up the base :class:`~implicit.als.AlternatingLeastSquares` model by using `Faiss `_ to create approximate nearest neighbours indices of the latent factors. @@ -321,9 +351,17 @@ class FaissAlternatingLeastSquares(AlternatingLeastSquares): item_factors """ - def __init__(self, approximate_similar_items=True, approximate_recommend=True, - nlist=400, nprobe=20, use_gpu=implicit.gpu.HAS_CUDA, random_state=None, - *args, **kwargs): + def __init__( + self, + approximate_similar_items=True, + approximate_recommend=True, + nlist=400, + nprobe=20, + use_gpu=implicit.gpu.HAS_CUDA, + random_state=None, + *args, + **kwargs + ): self.similar_items_index = None self.recommend_index = None @@ -335,9 +373,9 @@ def __init__(self, approximate_similar_items=True, approximate_recommend=True, self.nlist = nlist self.nprobe = nprobe self.use_gpu = use_gpu - super(FaissAlternatingLeastSquares, self).__init__(*args, - random_state=random_state, - **kwargs) + super(FaissAlternatingLeastSquares, self).__init__( + *args, random_state=random_state, **kwargs + ) def fit(self, Ciu, show_progress=True): import faiss @@ -350,18 +388,20 @@ def fit(self, Ciu, show_progress=True): if self.use_gpu: self.gpu_resources = faiss.StandardGpuResources() - item_factors = self.item_factors.astype('float32') + item_factors = self.item_factors.astype("float32") if self.approximate_recommend: log.debug("Building faiss recommendation index") # build up a inner product index here if self.use_gpu: - index = faiss.GpuIndexIVFFlat(self.gpu_resources, self.factors, self.nlist, - faiss.METRIC_INNER_PRODUCT) + index = faiss.GpuIndexIVFFlat( + self.gpu_resources, self.factors, self.nlist, faiss.METRIC_INNER_PRODUCT + ) else: - index = faiss.IndexIVFFlat(self.quantizer, self.factors, self.nlist, - faiss.METRIC_INNER_PRODUCT) + index = faiss.IndexIVFFlat( + self.quantizer, self.factors, self.nlist, faiss.METRIC_INNER_PRODUCT + ) index.train(item_factors) index.add(item_factors) @@ -376,13 +416,15 @@ def fit(self, Ciu, show_progress=True): norms = numpy.linalg.norm(item_factors, axis=1) norms[norms == 0] = 1e-10 - normalized = (item_factors.T / norms).T.astype('float32') + normalized = (item_factors.T / norms).T.astype("float32") if self.use_gpu: - index = faiss.GpuIndexIVFFlat(self.gpu_resources, self.factors, self.nlist, - faiss.METRIC_INNER_PRODUCT) + index = faiss.GpuIndexIVFFlat( + self.gpu_resources, self.factors, self.nlist, faiss.METRIC_INNER_PRODUCT + ) else: - index = faiss.IndexIVFFlat(self.quantizer, self.factors, self.nlist, - faiss.METRIC_INNER_PRODUCT) + index = faiss.IndexIVFFlat( + self.quantizer, self.factors, self.nlist, 
faiss.METRIC_INNER_PRODUCT + ) index.train(normalized) index.add(normalized) @@ -395,17 +437,28 @@ def similar_items(self, itemid, N=10): factors = self.item_factors[itemid] factors /= numpy.linalg.norm(factors) - (dist,), (ids,) = self.similar_items_index.search(factors.reshape(1, -1).astype('float32'), - N) + (dist,), (ids,) = self.similar_items_index.search( + factors.reshape(1, -1).astype("float32"), N + ) return zip(ids, dist) - def recommend(self, userid, user_items, N=10, filter_already_liked_items=True, - filter_items=None, recalculate_user=False): + def recommend( + self, + userid, + user_items, + N=10, + filter_already_liked_items=True, + filter_items=None, + recalculate_user=False, + ): if not self.approximate_recommend: - return super(FaissAlternatingLeastSquares, - self).recommend(userid, user_items, N=N, - filter_items=filter_items, - recalculate_user=recalculate_user) + return super(FaissAlternatingLeastSquares, self).recommend( + userid, + user_items, + N=N, + filter_items=filter_items, + recalculate_user=recalculate_user, + ) user = self._user_factor(userid, user_items, recalculate_user) @@ -421,14 +474,17 @@ def recommend(self, userid, user_items, N=10, filter_already_liked_items=True, # the GPU variant of faiss doesn't support returning more than 1024 results. # fall back to the exact match when this happens if self.use_gpu and count >= 1024: - return super(FaissAlternatingLeastSquares, - self).recommend(userid, user_items, N=N, - filter_items=filter_items, - recalculate_user=recalculate_user) + return super(FaissAlternatingLeastSquares, self).recommend( + userid, + user_items, + N=N, + filter_items=filter_items, + recalculate_user=recalculate_user, + ) # faiss expects multiple queries - convert query to a matrix # and results back to single vectors - query = user.reshape(1, -1).astype('float32') + query = user.reshape(1, -1).astype("float32") (dist,), (ids,) = self.recommend_index.search(query, count) # convert the distances from euclidean to cosine distance, diff --git a/implicit/bpr.py b/implicit/bpr.py index a54daf52..80cc258c 100644 --- a/implicit/bpr.py +++ b/implicit/bpr.py @@ -15,7 +15,7 @@ def BayesianPersonalizedRanking( verify_negative_samples=True, random_state=None, ): - """ Bayesian Personalized Ranking + """Bayesian Personalized Ranking A recommender model that learns a matrix factorization embedding based off minimizing the pairwise ranking loss described in the paper `BPR: Bayesian Personalized Ranking from Implicit diff --git a/implicit/cpu/_als.pyx b/implicit/cpu/_als.pyx index 79ae1fc2..f0048b98 100644 --- a/implicit/cpu/_als.pyx +++ b/implicit/cpu/_als.pyx @@ -6,6 +6,7 @@ from cython cimport floating, integral from cython.parallel import parallel, prange cimport scipy.linalg.cython_blas as cython_blas + # requires scipy v0.16 cimport scipy.linalg.cython_lapack as cython_lapack from libc.stdlib cimport free, malloc diff --git a/implicit/cpu/als.py b/implicit/cpu/als.py index b89a47ee..0aa85a91 100644 --- a/implicit/cpu/als.py +++ b/implicit/cpu/als.py @@ -17,7 +17,7 @@ class AlternatingLeastSquares(MatrixFactorizationBase): - """ Alternating Least Squares + """Alternating Least Squares A Recommendation Model based off the algorithms described in the paper 'Collaborative Filtering for Implicit Feedback Datasets' with performance optimizations described in @@ -56,10 +56,18 @@ class AlternatingLeastSquares(MatrixFactorizationBase): Array of latent factors for each user in the training set """ - def __init__(self, factors=100, regularization=0.01, 
dtype=np.float32, - use_native=True, use_cg=True, - iterations=15, calculate_training_loss=False, num_threads=0, - random_state=None): + def __init__( + self, + factors=100, + regularization=0.01, + dtype=np.float32, + use_native=True, + use_cg=True, + iterations=15, + calculate_training_loss=False, + num_threads=0, + random_state=None, + ): super(AlternatingLeastSquares, self).__init__() @@ -86,7 +94,7 @@ def __init__(self, factors=100, regularization=0.01, dtype=np.float32, check_blas_config() def fit(self, item_users, show_progress=True): - """ Factorizes the item_users matrix. + """Factorizes the item_users matrix. After calling this method, the members 'user_factors' and 'item_factors' will be initialized with a latent factor model of the input data. @@ -150,15 +158,30 @@ def fit(self, item_users, show_progress=True): # alternate between learning the user_factors from the item_factors and vice-versa for iteration in range(self.iterations): s = time.time() - solver(Cui, self.user_factors, self.item_factors, self.regularization, - num_threads=self.num_threads) - solver(Ciu, self.item_factors, self.user_factors, self.regularization, - num_threads=self.num_threads) + solver( + Cui, + self.user_factors, + self.item_factors, + self.regularization, + num_threads=self.num_threads, + ) + solver( + Ciu, + self.item_factors, + self.user_factors, + self.regularization, + num_threads=self.num_threads, + ) progress.update(1) if self.calculate_training_loss: - loss = _als.calculate_loss(Cui, self.user_factors, self.item_factors, - self.regularization, num_threads=self.num_threads) + loss = _als.calculate_loss( + Cui, + self.user_factors, + self.item_factors, + self.regularization, + num_threads=self.num_threads, + ) progress.set_postfix({"loss": loss}) if self.fit_callback: @@ -170,17 +193,27 @@ def fit(self, item_users, show_progress=True): self._check_fit_errors() def recalculate_user(self, userid, user_items): - return user_factor(self.item_factors, self.YtY, - user_items.tocsr(), userid, - self.regularization, self.factors) + return user_factor( + self.item_factors, + self.YtY, + user_items.tocsr(), + userid, + self.regularization, + self.factors, + ) def recalculate_item(self, itemid, react_users): - return item_factor(self.user_factors, self.XtX, - react_users.tocsr(), itemid, - self.regularization, self.factors) + return item_factor( + self.user_factors, + self.XtX, + react_users.tocsr(), + itemid, + self.regularization, + self.factors, + ) def explain(self, userid, user_items, itemid, user_weights=None, N=10): - """ Provides explanations for why the item is liked by the user. + """Provides explanations for why the item is liked by the user. Parameters --------- @@ -211,9 +244,9 @@ def explain(self, userid, user_items, itemid, user_weights=None, N=10): # from section 5 of the paper CF for Implicit Feedback Datasets user_items = user_items.tocsr() if user_weights is None: - A, _ = user_linear_equation(self.item_factors, self.YtY, - user_items, userid, - self.regularization, self.factors) + A, _ = user_linear_equation( + self.item_factors, self.YtY, user_items, userid, self.regularization, self.factors + ) user_weights = scipy.linalg.cho_factor(A) seed_item = self.item_factors[itemid] @@ -265,13 +298,12 @@ def XtX(self): def alternating_least_squares(Ciu, factors, **kwargs): - """ factorizes the matrix Cui using an implicit alternating least squares + """factorizes the matrix Cui using an implicit alternating least squares algorithm. 
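For context on the solves that `fit()` alternates between: each pass computes, for every user u, x_u = (YᵀC_uY + λI)⁻¹ YᵀC_u p(u), where p(u) is 1 for observed items. A hedged pure-numpy sketch of that single-user solve, mirroring the slow reference `least_squares()` path further down rather than the Cython solver (function name is illustrative):

```python
import numpy as np


def user_factor_reference(Cui, u, Y, regularization):
    # Cui: user x item CSR matrix of confidences, Y: item factor matrix
    # x_u = (YtY + Yt(Cu - I)Y + reg * I)^-1 (Yt Cu p(u))
    n_factors = Y.shape[1]
    A = Y.T.dot(Y) + regularization * np.eye(n_factors)
    b = np.zeros(n_factors)
    for i, confidence in zip(Cui[u].indices, Cui[u].data):
        # only observed items contribute the (confidence - 1) correction
        A += (confidence - 1) * np.outer(Y[i], Y[i])
        b += confidence * Y[i]
    return np.linalg.solve(A, b)
```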
Note: this method is deprecated, consider moving to the AlternatingLeastSquares class instead """ - log.warning("This method is deprecated. Please use the AlternatingLeastSquares" - " class instead") + log.warning("This method is deprecated. Please use the AlternatingLeastSquares class instead") model = AlternatingLeastSquares(factors=factors, **kwargs) model.fit(Ciu) @@ -279,7 +311,7 @@ def alternating_least_squares(Ciu, factors, **kwargs): def least_squares(Cui, X, Y, regularization, num_threads=0): - """ For each user in Cui, calculate factors Xu for them + """For each user in Cui, calculate factors Xu for them using least squares on Y. Note: this is at least 10 times slower than the cython version included @@ -341,7 +373,7 @@ def least_squares_cg(Cui, X, Y, regularization, num_threads=0, cg_steps=3): r += (confidence - (confidence - 1) * Y[i].dot(x)) * Y[i] else: confidence *= -1 - r += - (confidence - 1) * Y[i].dot(x) * Y[i] + r += -(confidence - 1) * Y[i].dot(x) * Y[i] p = r.copy() rsold = r.dot(r) diff --git a/implicit/datasets/_download.py b/implicit/datasets/_download.py index 53b75f53..0b252279 100644 --- a/implicit/datasets/_download.py +++ b/implicit/datasets/_download.py @@ -11,15 +11,17 @@ def download_file(url, local_filename): - """ Simple wrapper around urlretrieve that uses tqdm to display a progress - bar of download progress """ + """Simple wrapper around urlretrieve that uses tqdm to display a progress + bar of download progress""" local_filename = os.path.abspath(local_filename) path = os.path.dirname(local_filename) if not os.path.isdir(path): os.makedirs(path) - with tqdm(unit='B', unit_scale=True) as progress: + with tqdm(unit="B", unit_scale=True) as progress: + def report(chunk, chunksize, total): progress.total = total progress.update(chunksize) + return urlretrieve(url, local_filename, reporthook=report) diff --git a/implicit/datasets/lastfm.py b/implicit/datasets/lastfm.py index d6fc638d..ff2938eb 100644 --- a/implicit/datasets/lastfm.py +++ b/implicit/datasets/lastfm.py @@ -11,12 +11,12 @@ log = logging.getLogger("implicit") -URL = 'https://github.com/benfred/recommender_data/releases/download/v1.0/lastfm_360k.hdf5' +URL = "https://github.com/benfred/recommender_data/releases/download/v1.0/lastfm_360k.hdf5" def get_lastfm(): - """ Returns the lastfm360k dataset, downloading locally if necessary. - Returns a tuple of (artistids, userids, plays) where plays is a CSR matrix """ + """Returns the lastfm360k dataset, downloading locally if necessary. 
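A short usage sketch for this dataset helper, condensed from examples/lastfm.py earlier in this diff; the weighting call and its parameters are illustrative choices, not a prescribed recipe:

```python
from implicit.als import AlternatingLeastSquares
from implicit.datasets.lastfm import get_lastfm
from implicit.nearest_neighbours import bm25_weight

artists, users, plays = get_lastfm()  # plays: artist x user CSR matrix
model = AlternatingLeastSquares(factors=64)
model.fit(bm25_weight(plays, K1=100, B=0.8))

# ten artists most similar to the first artist in the dataset
for other, score in model.similar_items(0, 10):
    print(artists[other], score)
```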
+ Returns a tuple of (artistids, userids, plays) where plays is a CSR matrix""" filename = os.path.join(_download.LOCAL_CACHE_DIR, "lastfm_360k.hdf5") if not os.path.isfile(filename): @@ -25,14 +25,14 @@ def get_lastfm(): else: log.info("Using cached dataset at '%s'", filename) - with h5py.File(filename, 'r') as f: - m = f.get('artist_user_plays') - plays = csr_matrix((m.get('data'), m.get('indices'), m.get('indptr'))) - return np.array(f['artist']), np.array(f['user']), plays + with h5py.File(filename, "r") as f: + m = f.get("artist_user_plays") + plays = csr_matrix((m.get("data"), m.get("indices"), m.get("indptr"))) + return np.array(f["artist"]), np.array(f["user"]), plays def generate_dataset(filename, outputfilename): - """ Generates a hdf5 lastfm datasetfile from the raw datafiles found at: + """Generates a hdf5 lastfm datasetfile from the raw datafiles found at: http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html You shouldn't have to run this yourself, and can instead just download the @@ -55,14 +55,13 @@ def _read_dataframe(filename): # get a model based off the input params start = time.time() log.debug("reading data from %s", filename) - data = pandas.read_table(filename, - usecols=[0, 2, 3], - names=['user', 'artist', 'plays'], - na_filter=False) + data = pandas.read_table( + filename, usecols=[0, 2, 3], names=["user", "artist", "plays"], na_filter=False + ) # map each artist and user to a unique numeric value - data['user'] = data['user'].astype("category") - data['artist'] = data['artist'].astype("category") + data["user"] = data["user"].astype("category") + data["artist"] = data["artist"].astype("category") # store as a CSR matrix log.debug("read data file in %s", time.time() - start) @@ -72,21 +71,24 @@ def _read_dataframe(filename): def _hfd5_from_dataframe(data, outputfilename): # create a sparse matrix of all the users/plays - plays = coo_matrix((data['plays'].astype(np.float32), - (data['artist'].cat.codes.copy(), - data['user'].cat.codes.copy()))).tocsr() + plays = coo_matrix( + ( + data["plays"].astype(np.float32), + (data["artist"].cat.codes.copy(), data["user"].cat.codes.copy()), + ) + ).tocsr() with h5py.File(outputfilename, "w") as f: - g = f.create_group('artist_user_plays') + g = f.create_group("artist_user_plays") g.create_dataset("data", data=plays.data) g.create_dataset("indptr", data=plays.indptr) g.create_dataset("indices", data=plays.indices) dt = h5py.special_dtype(vlen=str) - artist = list(data['artist'].cat.categories) - dset = f.create_dataset('artist', (len(artist),), dtype=dt) + artist = list(data["artist"].cat.categories) + dset = f.create_dataset("artist", (len(artist),), dtype=dt) dset[:] = artist - user = list(data['user'].cat.categories) - dset = f.create_dataset('user', (len(user),), dtype=dt) + user = list(data["user"].cat.categories) + dset = f.create_dataset("user", (len(user),), dtype=dt) dset[:] = user diff --git a/implicit/datasets/million_song_dataset.py b/implicit/datasets/million_song_dataset.py index a11d06af..ea880725 100644 --- a/implicit/datasets/million_song_dataset.py +++ b/implicit/datasets/million_song_dataset.py @@ -12,11 +12,11 @@ log = logging.getLogger("implicit") -URL = 'https://github.com/benfred/recommender_data/releases/download/v1.0/msd_taste_profile.hdf5' +URL = "https://github.com/benfred/recommender_data/releases/download/v1.0/msd_taste_profile.hdf5" def get_msd_taste_profile(): - """ Returns the taste profile subset from the million song dataset: + """Returns the taste profile subset from the million song 
dataset: https://labrosa.ee.columbia.edu/millionsong/tasteprofile Data returned is a tuple of (trackinfo, user, plays) where @@ -35,15 +35,18 @@ def get_msd_taste_profile(): else: log.info("Using cached dataset at '%s'", filename) - with h5py.File(filename, 'r') as f: - m = f.get('track_user_plays') - plays = csr_matrix((m.get('data'), m.get('indices'), m.get('indptr'))) - return np.array(f['track']), np.array(f['user']), plays + with h5py.File(filename, "r") as f: + m = f.get("track_user_plays") + plays = csr_matrix((m.get("data"), m.get("indices"), m.get("indptr"))) + return np.array(f["track"]), np.array(f["user"]), plays -def generate_dataset(triplets_filename, summary_filename="msd_summary_file.h5", - outputfilename="msd_taste_profile.hdf5"): - """ Generates a hdf5 datasetfile from the raw datafiles: +def generate_dataset( + triplets_filename, + summary_filename="msd_summary_file.h5", + outputfilename="msd_taste_profile.hdf5", +): + """Generates a hdf5 datasetfile from the raw datafiles: You will need to download the train_triplets from here: https://labrosa.ee.columbia.edu/millionsong/tasteprofile#getting @@ -68,11 +71,11 @@ def _read_triplets_dataframe(filename): # get a model based off the input params start = time.time() log.debug("reading data from %s", filename) - data = pandas.read_table("train_triplets.txt", names=['user', 'track', 'plays']) + data = pandas.read_table("train_triplets.txt", names=["user", "track", "plays"]) # map each artist and user to a unique numeric value - data['user'] = data['user'].astype("category") - data['track'] = data['track'].astype("category") + data["user"] = data["user"].astype("category") + data["track"] = data["track"].astype("category") # store as a CSR matrix log.debug("read data file in %s", time.time() - start) @@ -85,12 +88,12 @@ def _join_summary_file(data, summary_filename="msd_summary_file.h5"): msd = h5py.File(summary_filename) # create a lookup table of trackid -> position - track_lookup = dict((t.encode("utf8"), i) for i, t in enumerate(data['track'].cat.categories)) + track_lookup = dict((t.encode("utf8"), i) for i, t in enumerate(data["track"].cat.categories)) # join on trackid to the summary file to get the artist/album/songname track_info = np.empty(shape=(len(track_lookup), 4), dtype=np.object) with tqdm(total=len(track_info)) as progress: - for song in msd['metadata']['songs']: + for song in msd["metadata"]["songs"]: trackid = song[17] if trackid in track_lookup: pos = track_lookup[trackid] @@ -102,20 +105,23 @@ def _join_summary_file(data, summary_filename="msd_summary_file.h5"): def _hfd5_from_dataframe(data, track_info, outputfilename): # create a sparse matrix of all the users/plays - plays = coo_matrix((data['plays'].astype(np.float32), - (data['track'].cat.codes.copy(), - data['user'].cat.codes.copy()))).tocsr() + plays = coo_matrix( + ( + data["plays"].astype(np.float32), + (data["track"].cat.codes.copy(), data["user"].cat.codes.copy()), + ) + ).tocsr() with h5py.File(outputfilename, "w") as f: - g = f.create_group('track_user_plays') + g = f.create_group("track_user_plays") g.create_dataset("data", data=plays.data) g.create_dataset("indptr", data=plays.indptr) g.create_dataset("indices", data=plays.indices) dt = h5py.special_dtype(vlen=str) - dset = f.create_dataset('track', track_info.shape, dtype=dt) + dset = f.create_dataset("track", track_info.shape, dtype=dt) dset[:] = track_info - user = list(data['user'].cat.categories) - dset = f.create_dataset('user', (len(user),), dtype=dt) + user = 
list(data["user"].cat.categories) + dset = f.create_dataset("user", (len(user),), dtype=dt) dset[:] = user diff --git a/implicit/datasets/movielens.py b/implicit/datasets/movielens.py index 47375d17..777af8f7 100644 --- a/implicit/datasets/movielens.py +++ b/implicit/datasets/movielens.py @@ -10,11 +10,11 @@ log = logging.getLogger("implicit") -URL_BASE = 'https://github.com/benfred/recommender_data/releases/download/v1.0/' +URL_BASE = "https://github.com/benfred/recommender_data/releases/download/v1.0/" def get_movielens(variant="20m"): - """ Gets movielens datasets + """Gets movielens datasets Parameters --------- @@ -39,14 +39,14 @@ def get_movielens(variant="20m"): else: log.info("Using cached dataset at '%s'", path) - with h5py.File(path, 'r') as f: - m = f.get('movie_user_ratings') - plays = csr_matrix((m.get('data'), m.get('indices'), m.get('indptr'))) - return np.array(f['movie']), plays + with h5py.File(path, "r") as f: + m = f.get("movie_user_ratings") + plays = csr_matrix((m.get("data"), m.get("indices"), m.get("indptr"))) + return np.array(f["movie"]), plays -def generate_dataset(path, variant='20m', outputpath="."): - """ Generates a hdf5 movielens datasetfile from the raw datafiles found at: +def generate_dataset(path, variant="20m", outputpath="."): + """Generates a hdf5 movielens datasetfile from the raw datafiles found at: https://grouplens.org/datasets/movielens/20m/ You shouldn't have to run this yourself, and can instead just download the @@ -54,9 +54,9 @@ def generate_dataset(path, variant='20m', outputpath="."): """ filename = os.path.join(outputpath, "movielens_%s.hdf5" % variant) - if variant == '20m': + if variant == "20m": ratings, movies = _read_dataframes_20M(path) - elif variant == '100k': + elif variant == "100k": ratings, movies = _read_dataframes_100k(path) else: ratings, movies = _read_dataframes(path) @@ -78,43 +78,52 @@ def _read_dataframes_100k(path): """ reads in the movielens 100k dataset""" import pandas - ratings = pandas.read_table(os.path.join(path, "u.data"), - names=['userId', 'movieId', 'rating', 'timestamp']) + ratings = pandas.read_table( + os.path.join(path, "u.data"), names=["userId", "movieId", "rating", "timestamp"] + ) - movies = pandas.read_csv(os.path.join(path, "u.item"), - names=['movieId', 'title'], - usecols=[0, 1], - delimiter='|', - encoding='ISO-8859-1') + movies = pandas.read_csv( + os.path.join(path, "u.item"), + names=["movieId", "title"], + usecols=[0, 1], + delimiter="|", + encoding="ISO-8859-1", + ) return ratings, movies def _read_dataframes(path): import pandas - ratings = pandas.read_csv(os.path.join(path, "ratings.dat"), delimiter="::", - names=['userId', 'movieId', 'rating', 'timestamp']) - movies = pandas.read_table(os.path.join(path, "movies.dat"), delimiter="::", - names=['movieId', 'title', 'genres']) + ratings = pandas.read_csv( + os.path.join(path, "ratings.dat"), + delimiter="::", + names=["userId", "movieId", "rating", "timestamp"], + ) + + movies = pandas.read_table( + os.path.join(path, "movies.dat"), delimiter="::", names=["movieId", "title", "genres"] + ) return ratings, movies def _hfd5_from_dataframe(ratings, movies, outputfilename): # transform ratings dataframe into a sparse matrix - m = coo_matrix((ratings['rating'].astype(np.float32), - (ratings['movieId'], ratings['userId']))).tocsr() + m = coo_matrix( + (ratings["rating"].astype(np.float32), (ratings["movieId"], ratings["userId"])) + ).tocsr() with h5py.File(outputfilename, "w") as f: # write out the ratings matrix - g = 
f.create_group('movie_user_ratings') + g = f.create_group("movie_user_ratings") g.create_dataset("data", data=m.data) g.create_dataset("indptr", data=m.indptr) g.create_dataset("indices", data=m.indices) # write out the titles as a numpy array - titles = np.empty(shape=(movies.movieId.max()+1,), dtype=np.object) + titles = np.empty(shape=(movies.movieId.max() + 1,), dtype=np.object) titles[movies.movieId] = movies.title dt = h5py.special_dtype(vlen=str) - dset = f.create_dataset('movie', (len(titles),), dtype=dt) + dset = f.create_dataset("movie", (len(titles),), dtype=dt) dset[:] = titles diff --git a/implicit/datasets/reddit.py b/implicit/datasets/reddit.py index 1a54d0bf..2d5f373d 100644 --- a/implicit/datasets/reddit.py +++ b/implicit/datasets/reddit.py @@ -11,17 +11,17 @@ log = logging.getLogger("implicit") -URL = 'https://github.com/benfred/recommender_data/releases/download/v1.0/reddit.hdf5' +URL = "https://github.com/benfred/recommender_data/releases/download/v1.0/reddit.hdf5" def get_reddit(): - """ Returns the reddit dataset, downloading locally if necessary. + """Returns the reddit dataset, downloading locally if necessary. This dataset was released here: https://www.reddit.com/r/redditdev/comments/dtg4j/want_to_help_reddit_build_a_recommender_a_public/ and contains 23M up/down votes from 44K users on 3.4M links. - Returns a CSR matrix of (item, user, rating """ + Returns a CSR matrix of (item, user, rating""" filename = os.path.join(_download.LOCAL_CACHE_DIR, "reddit.hdf5") if not os.path.isfile(filename): @@ -30,13 +30,13 @@ def get_reddit(): else: log.info("Using cached dataset at '%s'", filename) - with h5py.File(filename, 'r') as f: - m = f.get('item_user_ratings') - return csr_matrix((m.get('data'), m.get('indices'), m.get('indptr'))) + with h5py.File(filename, "r") as f: + m = f.get("item_user_ratings") + return csr_matrix((m.get("data"), m.get("indices"), m.get("indptr"))) def generate_dataset(filename, outputfilename): - """ Generates a hdf5 reddit datasetfile from the raw datafiles found at: + """Generates a hdf5 reddit datasetfile from the raw datafiles found at: https://www.reddit.com/r/redditdev/comments/dtg4j/want_to_help_reddit_build_a_recommender_a_public/ You shouldn't have to run this yourself, and can instead just download the @@ -55,11 +55,11 @@ def _read_dataframe(filename): # get a model based off the input params start = time.time() log.debug("reading data from %s", filename) - data = pandas.read_table(filename, usecols=[0, 1, 3], names=['user', 'item', 'rating']) + data = pandas.read_table(filename, usecols=[0, 1, 3], names=["user", "item", "rating"]) # map each artist and user to a unique numeric value - data['user'] = data['user'].astype("category") - data['item'] = data['item'].astype("category") + data["user"] = data["user"].astype("category") + data["item"] = data["item"].astype("category") # store as a CSR matrix log.debug("read data file in %s", time.time() - start) @@ -67,15 +67,18 @@ def _read_dataframe(filename): def _hfd5_from_dataframe(data, outputfilename): - ratings = coo_matrix((data['rating'].astype(np.float32), - (data['item'].cat.codes.copy(), - data['user'].cat.codes.copy()))).tocsr() + ratings = coo_matrix( + ( + data["rating"].astype(np.float32), + (data["item"].cat.codes.copy(), data["user"].cat.codes.copy()), + ) + ).tocsr() print(repr(ratings)) print(repr(ratings.indices)) print(repr(ratings.indptr)) with h5py.File(outputfilename, "w") as f: - g = f.create_group('item_user_ratings') + g = f.create_group("item_user_ratings") 
g.create_dataset("data", data=ratings.data) g.create_dataset("indptr", data=ratings.indptr) g.create_dataset("indices", data=ratings.indices) diff --git a/implicit/datasets/sketchfab.py b/implicit/datasets/sketchfab.py index 5ee53225..b8fc3b96 100644 --- a/implicit/datasets/sketchfab.py +++ b/implicit/datasets/sketchfab.py @@ -11,17 +11,17 @@ log = logging.getLogger("implicit") -URL = 'https://github.com/benfred/recommender_data/releases/download/v1.0/sketchfab.hdf5' +URL = "https://github.com/benfred/recommender_data/releases/download/v1.0/sketchfab.hdf5" def get_sketchfab(): - """ Returns the sketchfab dataset, downloading locally if necessary. + """Returns the sketchfab dataset, downloading locally if necessary. This dataset contains about 632K likes from 62K users on 28k items collected from the sketchfab website, as described here: http://blog.ethanrosenthal.com/2016/10/09/likes-out-guerilla-dataset/ - Returns a tuple of (items, users, likes) where likes is a CSR matrix """ + Returns a tuple of (items, users, likes) where likes is a CSR matrix""" filename = os.path.join(_download.LOCAL_CACHE_DIR, "sketchfab.hdf5") if not os.path.isfile(filename): @@ -30,10 +30,10 @@ def get_sketchfab(): else: log.info("Using cached dataset at '%s'", filename) - with h5py.File(filename, 'r') as f: - m = f.get('item_user_likes') - plays = csr_matrix((m.get('data'), m.get('indices'), m.get('indptr'))) - return np.array(f['item']), np.array(f['user']), plays + with h5py.File(filename, "r") as f: + m = f.get("item_user_likes") + plays = csr_matrix((m.get("data"), m.get("indices"), m.get("indptr"))) + return np.array(f["item"]), np.array(f["user"]), plays def generate_dataset(filename, outputfilename): @@ -49,11 +49,11 @@ def _read_dataframe(filename): # get a model based off the input params start = time.time() log.debug("reading data from %s", filename) - data = pandas.read_csv(filename, delimiter='|', quotechar='\\') + data = pandas.read_csv(filename, delimiter="|", quotechar="\\") # map each artist and user to a unique numeric value - data['uid'] = data['uid'].astype("category") - data['mid'] = data['mid'].astype("category") + data["uid"] = data["uid"].astype("category") + data["mid"] = data["mid"].astype("category") # store as a CSR matrix log.debug("read data file in %s", time.time() - start) @@ -61,24 +61,24 @@ def _read_dataframe(filename): def _hfd5_from_dataframe(data, outputfilename): - items = data['mid'].cat.codes.copy() - users = data['uid'].cat.codes.copy() + items = data["mid"].cat.codes.copy() + users = data["uid"].cat.codes.copy() values = np.ones(len(items)).astype(np.float32) # create a sparse matrix of all the item/users/likes likes = coo_matrix((values, (items, users))).astype(np.float32).tocsr() with h5py.File(outputfilename, "w") as f: - g = f.create_group('item_user_likes') + g = f.create_group("item_user_likes") g.create_dataset("data", data=likes.data) g.create_dataset("indptr", data=likes.indptr) g.create_dataset("indices", data=likes.indices) dt = h5py.special_dtype(vlen=str) - item = list(data['mid'].cat.categories) - dset = f.create_dataset('item', (len(item),), dtype=dt) + item = list(data["mid"].cat.categories) + dset = f.create_dataset("item", (len(item),), dtype=dt) dset[:] = item - user = list(data['uid'].cat.categories) - dset = f.create_dataset('user', (len(user),), dtype=dt) + user = list(data["uid"].cat.categories) + dset = f.create_dataset("user", (len(user),), dtype=dt) dset[:] = user diff --git a/implicit/gpu/__init__.py b/implicit/gpu/__init__.py index 
67833936..f82f7c41 100644 --- a/implicit/gpu/__init__.py +++ b/implicit/gpu/__init__.py @@ -4,6 +4,7 @@ import cupy # noqa from ._cuda import * # noqa + HAS_CUDA = True except ImportError: HAS_CUDA = False diff --git a/implicit/gpu/als.py b/implicit/gpu/als.py index c94c56d0..3aec6680 100644 --- a/implicit/gpu/als.py +++ b/implicit/gpu/als.py @@ -19,7 +19,7 @@ class AlternatingLeastSquares(MatrixFactorizationBase): - """ Alternating Least Squares + """Alternating Least Squares A Recommendation Model based off the algorithms described in the paper 'Collaborative Filtering for Implicit Feedback Datasets' with performance optimizations described in @@ -48,9 +48,14 @@ class AlternatingLeastSquares(MatrixFactorizationBase): Array of latent factors for each user in the training set """ - def __init__(self, factors=64, regularization=0.01, - iterations=15, calculate_training_loss=False, - random_state=None): + def __init__( + self, + factors=64, + regularization=0.01, + iterations=15, + calculate_training_loss=False, + random_state=None, + ): if not implicit.gpu.HAS_CUDA: raise ValueError("No CUDA extension has been built, can't train on GPU.") @@ -62,8 +67,12 @@ def __init__(self, factors=64, regularization=0.01, # the 'dot' function in 'implicit/gpu/utils.cuh) if factors % 32: padding = 32 - factors % 32 - log.warning("GPU training requires factor size to be a multiple of 32." - " Increasing factors from %i to %i.", factors, factors + padding) + log.warning( + "GPU training requires factor size to be a multiple of 32." + " Increasing factors from %i to %i.", + factors, + factors + padding, + ) factors += padding # parameters on how to factorize @@ -78,7 +87,7 @@ def __init__(self, factors=64, regularization=0.01, self.cg_steps = 3 def fit(self, item_users, show_progress=True): - """ Factorizes the item_users matrix. + """Factorizes the item_users matrix. After calling this method, the members 'user_factors' and 'item_factors' will be initialized with a latent factor model of the input data. 
@@ -126,10 +135,10 @@ def fit(self, item_users, show_progress=True): # Initialize the variables randomly if they haven't already been set if self.user_factors is None: - self.user_factors = (random_state.rand(users, self.factors, dtype=cp.float32) - .5) + self.user_factors = random_state.rand(users, self.factors, dtype=cp.float32) - 0.5 self.user_factors /= self.factors if self.item_factors is None: - self.item_factors = (random_state.rand(items, self.factors, dtype=cp.float32) - .5) + self.item_factors = random_state.rand(items, self.factors, dtype=cp.float32) - 0.5 self.item_factors /= self.factors log.debug("Initialized factors in %s", time.time() - s) diff --git a/implicit/gpu/bpr.py b/implicit/gpu/bpr.py index 25a41835..ea200242 100644 --- a/implicit/gpu/bpr.py +++ b/implicit/gpu/bpr.py @@ -15,7 +15,7 @@ class BayesianPersonalizedRanking(MatrixFactorizationBase): - """ Bayesian Personalized Ranking + """Bayesian Personalized Ranking A recommender model that learns a matrix factorization embedding based off minimizing the pairwise ranking loss described in the paper `BPR: Bayesian Personalized Ranking from Implicit @@ -46,16 +46,29 @@ class BayesianPersonalizedRanking(MatrixFactorizationBase): user_factors : ndarray Array of latent factors for each user in the training set """ - def __init__(self, factors=100, learning_rate=0.01, regularization=0.01, dtype=np.float32, - iterations=100, verify_negative_samples=True, random_state=None): + + def __init__( + self, + factors=100, + learning_rate=0.01, + regularization=0.01, + dtype=np.float32, + iterations=100, + verify_negative_samples=True, + random_state=None, + ): super(BayesianPersonalizedRanking, self).__init__() if not implicit.gpu.HAS_CUDA: raise ValueError("No CUDA extension has been built, can't train on GPU.") if (factors + 1) % 32: padding = 32 - (factors + 1) % 32 - log.warning("GPU training requires factor size to be a multiple of 32 - 1." - " Increasing factors from %i to %i.", factors, factors + padding) + log.warning( + "GPU training requires factor size to be a multiple of 32 - 1." 
+ " Increasing factors from %i to %i.", + factors, + factors + padding, + ) factors += padding self.factors = factors @@ -66,7 +79,7 @@ def __init__(self, factors=100, learning_rate=0.01, regularization=0.01, dtype=n self.random_state = random_state def fit(self, item_users, show_progress=True): - """ Factorizes the item_users matrix + """Factorizes the item_users matrix Parameters ---------- @@ -102,7 +115,7 @@ def fit(self, item_users, show_progress=True): # Note: the final dimension is for the item bias term - which is set to a 1 for all users # this simplifies interfacing with approximate nearest neighbours libraries etc if self.item_factors is None: - self.item_factors = rs.rand(items, self.factors + 1, dtype=cp.float32) - .5 + self.item_factors = rs.rand(items, self.factors + 1, dtype=cp.float32) - 0.5 self.item_factors /= self.factors # set factors to all zeros for items without any ratings @@ -110,7 +123,7 @@ def fit(self, item_users, show_progress=True): self.item_factors[item_counts == 0] = cp.zeros(self.factors + 1) if self.user_factors is None: - self.user_factors = rs.rand(users, self.factors + 1, dtype=cp.float32) - .5 + self.user_factors = rs.rand(users, self.factors + 1, dtype=cp.float32) - 0.5 self.user_factors /= self.factors # set factors to all zeros for users without any ratings @@ -130,14 +143,23 @@ def fit(self, item_users, show_progress=True): log.debug("Running %i BPR training epochs", self.iterations) with tqdm(total=self.iterations, disable=not show_progress) as progress: for epoch in range(self.iterations): - correct, skipped = implicit.gpu.cu_bpr_update(userids, itemids, indptr, - X, Y, self.learning_rate, - self.regularization, - rs.randint(2**31), - self.verify_negative_samples) + correct, skipped = implicit.gpu.cu_bpr_update( + userids, + itemids, + indptr, + X, + Y, + self.learning_rate, + self.regularization, + rs.randint(2 ** 31), + self.verify_negative_samples, + ) progress.update(1) total = len(user_items.data) if total != 0 and total != skipped: progress.set_postfix( - {"correct": "%.2f%%" % (100.0 * correct / (total - skipped)), - "skipped": "%.2f%%" % (100.0 * skipped / total)}) + { + "correct": "%.2f%%" % (100.0 * correct / (total - skipped)), + "skipped": "%.2f%%" % (100.0 * skipped / total), + } + ) diff --git a/implicit/gpu/matrix_factorization_base.py b/implicit/gpu/matrix_factorization_base.py index 6dfcc41f..0f8718e1 100644 --- a/implicit/gpu/matrix_factorization_base.py +++ b/implicit/gpu/matrix_factorization_base.py @@ -11,7 +11,7 @@ class MatrixFactorizationBase(RecommenderBase): - """ Base class for MF models running on the GPU. + """Base class for MF models running on the GPU. This adds support for inference to run on the GPU as well as training. Factors are stored as cupy arrays. @@ -30,8 +30,15 @@ def __init__(self): self._item_norms = None self._user_norms = None - def recommend(self, userid, user_items, - N=10, filter_already_liked_items=True, filter_items=None, recalculate_user=False): + def recommend( + self, + userid, + user_items, + N=10, + filter_already_liked_items=True, + filter_items=None, + recalculate_user=False, + ): if recalculate_user: raise NotImplementedError("recalculate_user isn't support on GPU yet") @@ -130,7 +137,7 @@ def check_random_state(random_state): if isinstance(random_state, np.random.RandomState): # we need to convert from numpy random state to cupy random state. 
- return cp.random.RandomState(random_state.randint(2**63)) + return cp.random.RandomState(random_state.randint(2 ** 63)) # otherwise try to initialize a new one, and let it fail through # on the numpy side if it doesn't work diff --git a/implicit/nearest_neighbours.py b/implicit/nearest_neighbours.py index 97a1bfeb..1aa95ad0 100644 --- a/implicit/nearest_neighbours.py +++ b/implicit/nearest_neighbours.py @@ -10,7 +10,7 @@ class ItemItemRecommender(RecommenderBase): - """ Base class for Item-Item Nearest Neighbour recommender models + """Base class for Item-Item Nearest Neighbour recommender models here. Parameters @@ -22,6 +22,7 @@ class ItemItemRecommender(RecommenderBase): The number of threads to use for fitting the model. Specifying 0 means to default to the number of cores on the machine. """ + def __init__(self, K=20, num_threads=0): self.similarity = None self.K = K @@ -30,13 +31,20 @@ def __init__(self, K=20, num_threads=0): def fit(self, weighted, show_progress=True): """ Computes and stores the similarity matrix """ - self.similarity = all_pairs_knn(weighted, self.K, - show_progress=show_progress, - num_threads=self.num_threads).tocsr() + self.similarity = all_pairs_knn( + weighted, self.K, show_progress=show_progress, num_threads=self.num_threads + ).tocsr() self.scorer = NearestNeighboursScorer(self.similarity) - def recommend(self, userid, user_items, - N=10, filter_already_liked_items=True, filter_items=None, recalculate_user=False): + def recommend( + self, + userid, + user_items, + N=10, + filter_already_liked_items=True, + filter_items=None, + recalculate_user=False, + ): """ returns the best N recommendations for a user given its id""" if userid >= user_items.shape[0]: raise ValueError("userid is out of bounds of the user_items matrix") @@ -46,9 +54,14 @@ def recommend(self, userid, user_items, if filter_items: items += len(filter_items) - indices, data = self.scorer.recommend(userid, user_items.indptr, user_items.indices, - user_items.data, K=items, - remove_own_likes=filter_already_liked_items) + indices, data = self.scorer.recommend( + userid, + user_items.indptr, + user_items.indices, + user_items.data, + K=items, + remove_own_likes=filter_already_liked_items, + ) best = sorted(zip(indices, data), key=lambda x: -x[1]) if not filter_items: @@ -90,7 +103,7 @@ def similar_items(self, itemid, N=10): def __getstate__(self): state = self.__dict__.copy() # scorer isn't picklable - del state['scorer'] + del state["scorer"] return state def __setstate__(self, state): @@ -102,8 +115,9 @@ def __setstate__(self, state): def save(self, filename): m = self.similarity - numpy.savez(filename, data=m.data, indptr=m.indptr, indices=m.indices, shape=m.shape, - K=self.K) + numpy.savez( + filename, data=m.data, indptr=m.indptr, indices=m.indices, shape=m.shape, K=self.K + ) @classmethod def load(cls, filename): @@ -112,17 +126,18 @@ def load(cls, filename): filename = filename + ".npz" m = numpy.load(filename) - similarity = csr_matrix((m['data'], m['indices'], m['indptr']), shape=m['shape']) + similarity = csr_matrix((m["data"], m["indices"], m["indptr"]), shape=m["shape"]) ret = cls() ret.similarity = similarity ret.scorer = NearestNeighboursScorer(similarity) - ret.K = m['K'] + ret.K = m["K"] return ret class CosineRecommender(ItemItemRecommender): """ An Item-Item Recommender on Cosine distances between items """ + def fit(self, counts, show_progress=True): # cosine distance is just the dot-product of a normalized matrix ItemItemRecommender.fit(self, normalize(counts), show_progress) 
@@ -130,6 +145,7 @@ def fit(self, counts, show_progress=True): class TFIDFRecommender(ItemItemRecommender): """ An Item-Item Recommender on TF-IDF distances between items """ + def fit(self, counts, show_progress=True): weighted = normalize(tfidf_weight(counts)) ItemItemRecommender.fit(self, weighted, show_progress) @@ -137,7 +153,8 @@ def fit(self, counts, show_progress=True): class BM25Recommender(ItemItemRecommender): """ An Item-Item Recommender on BM25 distance between items """ - def __init__(self, K=20, K1=1.2, B=.75, num_threads=0): + + def __init__(self, K=20, K1=1.2, B=0.75, num_threads=0): super(BM25Recommender, self).__init__(K, num_threads) self.K1 = K1 self.B = B @@ -161,8 +178,8 @@ def tfidf_weight(X): def normalize(X): - """ equivalent to scipy.preprocessing.normalize on sparse matrices - , but lets avoid another depedency just for a small utility function """ + """equivalent to scipy.preprocessing.normalize on sparse matrices + , but lets avoid another depedency just for a small utility function""" X = coo_matrix(X) X.data = X.data / sqrt(bincount(X.row, X.data ** 2))[X.row] return X diff --git a/implicit/utils.py b/implicit/utils.py index 5587571e..ec9e69af 100644 --- a/implicit/utils.py +++ b/implicit/utils.py @@ -6,7 +6,7 @@ def nonzeros(m, row): """ returns the non zeroes of a row in csr_matrix """ - for index in range(m.indptr[row], m.indptr[row+1]): + for index in range(m.indptr[row], m.indptr[row + 1]): yield m.indices[index], m.data[index] @@ -14,21 +14,25 @@ def nonzeros(m, row): def check_blas_config(): - """ checks to see if using OpenBlas/Intel MKL. If so, warn if the number of threads isn't set - to 1 (causes severe perf issues when training - can be 10x slower) """ + """checks to see if using OpenBlas/Intel MKL. If so, warn if the number of threads isn't set + to 1 (causes severe perf issues when training - can be 10x slower)""" # don't warn repeatedly global _checked_blas_config if _checked_blas_config: return _checked_blas_config = True - if np.__config__.get_info('openblas_info') and os.environ.get('OPENBLAS_NUM_THREADS') != '1': - logging.warning("OpenBLAS detected. Its highly recommend to set the environment variable " - "'export OPENBLAS_NUM_THREADS=1' to disable its internal multithreading") - if np.__config__.get_info('blas_mkl_info') and os.environ.get('MKL_NUM_THREADS') != '1': - logging.warning("Intel MKL BLAS detected. Its highly recommend to set the environment " - "variable 'export MKL_NUM_THREADS=1' to disable its internal " - "multithreading") + if np.__config__.get_info("openblas_info") and os.environ.get("OPENBLAS_NUM_THREADS") != "1": + logging.warning( + "OpenBLAS detected. Its highly recommend to set the environment variable " + "'export OPENBLAS_NUM_THREADS=1' to disable its internal multithreading" + ) + if np.__config__.get_info("blas_mkl_info") and os.environ.get("MKL_NUM_THREADS") != "1": + logging.warning( + "Intel MKL BLAS detected. 
Its highly recommend to set the environment " + "variable 'export MKL_NUM_THREADS=1' to disable its internal " + "multithreading" + ) def check_random_state(random_state): diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..aa4949aa --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.black] +line-length = 100 diff --git a/setup.cfg b/setup.cfg index e94bcbc5..b6f6e242 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,11 +11,14 @@ max-line-length = 100 exclude = build,.eggs,.tox [isort] +multi_line_output = 3 +include_trailing_comma = True +force_grid_wrap = 0 +use_parentheses = True +ensure_newline_before_comments = True known_first_party = implicit known_third_party = scipy,annoy,numpy,cython,pandas line_length = 100 -balanced_wrapping = True -indent = ' ' skip = build,.eggs,.tox [bumpversion:file:implicit/__init__.py] diff --git a/setup.py b/setup.py index ec4a62f3..f4ef2f21 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ from cuda_setup import CUDA, build_ext -NAME = 'implicit' +NAME = "implicit" VERSION = "0.4.4" @@ -21,17 +21,17 @@ def define_extensions(): if sys.platform.startswith("win"): # compile args from # https://msdn.microsoft.com/en-us/library/fwkeyyhe.aspx - compile_args = ['/O2', '/openmp'] + compile_args = ["/O2", "/openmp"] link_args = [] else: gcc = extract_gcc_binaries() if gcc is not None: - rpath = '/usr/local/opt/gcc/lib/gcc/' + gcc[-1] + '/' - link_args = ['-Wl,-rpath,' + rpath] + rpath = "/usr/local/opt/gcc/lib/gcc/" + gcc[-1] + "/" + link_args = ["-Wl,-rpath," + rpath] else: link_args = [] - compile_args = ['-Wno-unused-function', '-Wno-maybe-uninitialized', '-O3', '-ffast-math'] + compile_args = ["-Wno-unused-function", "-Wno-maybe-uninitialized", "-O3", "-ffast-math"] if use_openmp: compile_args.append("-fopenmp") link_args.append("-fopenmp") @@ -45,38 +45,60 @@ def define_extensions(): # except ImportError: # raise ValueError("numpy is required to build from source") - src_ext = '.pyx' - modules = [Extension("implicit." + name, - [os.path.join("implicit", name + src_ext)], - language='c++', - extra_compile_args=compile_args, - extra_link_args=link_args) - for name in ['_nearest_neighbours', 'lmf', 'evaluation']] - modules.extend([Extension("implicit.cpu." + name, - [os.path.join("implicit", "cpu", name + src_ext)], - language='c++', - extra_compile_args=compile_args, - extra_link_args=link_args) - for name in ['_als', 'bpr']]) - modules.append(Extension("implicit." + 'recommender_base', - [os.path.join("implicit", 'recommender_base' + src_ext), - os.path.join("implicit", 'topnc.cpp')], - language='c++', - extra_compile_args=compile_args, - extra_link_args=link_args)) + src_ext = ".pyx" + modules = [ + Extension( + "implicit." + name, + [os.path.join("implicit", name + src_ext)], + language="c++", + extra_compile_args=compile_args, + extra_link_args=link_args, + ) + for name in ["_nearest_neighbours", "lmf", "evaluation"] + ] + modules.extend( + [ + Extension( + "implicit.cpu." + name, + [os.path.join("implicit", "cpu", name + src_ext)], + language="c++", + extra_compile_args=compile_args, + extra_link_args=link_args, + ) + for name in ["_als", "bpr"] + ] + ) + modules.append( + Extension( + "implicit." 
+ "recommender_base", + [ + os.path.join("implicit", "recommender_base" + src_ext), + os.path.join("implicit", "topnc.cpp"), + ], + language="c++", + extra_compile_args=compile_args, + extra_link_args=link_args, + ) + ) if CUDA: - modules.append(Extension("implicit.gpu._cuda", - [os.path.join("implicit", "gpu", "_cuda" + src_ext), - os.path.join("implicit", "gpu", "als.cu"), - os.path.join("implicit", "gpu", "bpr.cu"), - os.path.join("implicit", "gpu", "matrix.cu")], - language="c++", - extra_compile_args=compile_args, - extra_link_args=link_args, - library_dirs=[CUDA['lib64']], - libraries=['cudart', 'cublas', 'curand'], - include_dirs=[CUDA['include'], '.'])) + modules.append( + Extension( + "implicit.gpu._cuda", + [ + os.path.join("implicit", "gpu", "_cuda" + src_ext), + os.path.join("implicit", "gpu", "als.cu"), + os.path.join("implicit", "gpu", "bpr.cu"), + os.path.join("implicit", "gpu", "matrix.cu"), + ], + language="c++", + extra_compile_args=compile_args, + extra_link_args=link_args, + library_dirs=[CUDA["lib64"]], + libraries=["cudart", "cublas", "curand"], + include_dirs=[CUDA["include"], "."], + ) + ) else: print("Failed to find CUDA toolkit. Building without GPU acceleration.") @@ -86,13 +108,16 @@ def define_extensions(): # set_gcc copied from glove-python project # https://github.com/maciejkula/glove-python + def extract_gcc_binaries(): """Try to find GCC on OSX for OpenMP support.""" - patterns = ['/opt/local/bin/g++-mp-[0-9]*.[0-9]*', - '/opt/local/bin/g++-mp-[0-9]*', - '/usr/local/bin/g++-[0-9]*.[0-9]*', - '/usr/local/bin/g++-[0-9]*'] - if platform.system() == 'Darwin': + patterns = [ + "/opt/local/bin/g++-mp-[0-9]*.[0-9]*", + "/opt/local/bin/g++-mp-[0-9]*", + "/usr/local/bin/g++-[0-9]*.[0-9]*", + "/usr/local/bin/g++-[0-9]*", + ] + if platform.system() == "Darwin": gcc_binaries = [] for pattern in patterns: gcc_binaries += glob.glob(pattern) @@ -109,7 +134,7 @@ def extract_gcc_binaries(): def set_gcc(): """Try to use GCC on OSX for OpenMP support.""" # For macports and homebrew - if platform.system() == 'Darwin': + if platform.system() == "Darwin": gcc = extract_gcc_binaries() if gcc is not None: @@ -119,8 +144,9 @@ def set_gcc(): else: global use_openmp use_openmp = False - logging.warning('No GCC available. Install gcc from Homebrew ' - 'using brew install gcc.') + logging.warning( + "No GCC available. Install gcc from Homebrew " "using brew install gcc." 
+ ) set_gcc() @@ -136,32 +162,31 @@ def read(file_name): setup( name=NAME, version=VERSION, - description='Collaborative Filtering for Implicit Feedback Datasets', + description="Collaborative Filtering for Implicit Feedback Datasets", long_description=read("README.md"), long_description_content_type="text/markdown", - url='http://github.com/benfred/implicit/', - author='Ben Frederickson', - author_email='ben@benfrederickson.com', - license='MIT', + url="http://github.com/benfred/implicit/", + author="Ben Frederickson", + author_email="ben@benfrederickson.com", + license="MIT", classifiers=[ - 'Development Status :: 4 - Beta', - 'Natural Language :: English', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 3', - 'Programming Language :: Cython', - 'Operating System :: OS Independent', - 'Topic :: Software Development :: Libraries :: Python Modules'], - - keywords='Matrix Factorization, Implicit Alternating Least Squares, ' - 'Collaborative Filtering, Recommender Systems', - + "Development Status :: 4 - Beta", + "Natural Language :: English", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 3", + "Programming Language :: Cython", + "Operating System :: OS Independent", + "Topic :: Software Development :: Libraries :: Python Modules", + ], + keywords="Matrix Factorization, Implicit Alternating Least Squares, " + "Collaborative Filtering, Recommender Systems", packages=find_packages(), - install_requires=['numpy', 'scipy>=0.16', 'tqdm>=4.27'], + install_requires=["numpy", "scipy>=0.16", "tqdm>=4.27"], setup_requires=["Cython>=0.24"], ext_modules=define_extensions(), - cmdclass={'build_ext': build_ext}, + cmdclass={"build_ext": build_ext}, test_suite="tests", ) diff --git a/tests/als_test.py b/tests/als_test.py index 0292752b..fda1c628 100644 --- a/tests/als_test.py +++ b/tests/als_test.py @@ -12,35 +12,37 @@ class ALSTest(unittest.TestCase, TestRecommenderBaseMixin): - def _get_model(self): - return AlternatingLeastSquares(factors=3, regularization=0, use_gpu=False, - random_state=23) + return AlternatingLeastSquares(factors=3, regularization=0, use_gpu=False, random_state=23) def test_cg_nan(self): # test issue with CG code that was causing NaN values in output: # https://github.com/benfred/implicit/issues/19#issuecomment-283164905 - raw = [[0.0, 2.0, 1.5, 1.33333333, 1.25, 1.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25, 1.2], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] + raw = [ + [0.0, 2.0, 1.5, 1.33333333, 1.25, 1.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + 
[0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25, 1.2], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + ] counts = csr_matrix(raw, dtype=np.float64) for use_native in (True, False): - model = AlternatingLeastSquares(factors=3, - regularization=0.01, - dtype=np.float64, - use_native=use_native, - use_cg=True, - use_gpu=False, - random_state=23) + model = AlternatingLeastSquares( + factors=3, + regularization=0.01, + dtype=np.float64, + use_native=use_native, + use_cg=True, + use_gpu=False, + random_state=23, + ) model.fit(counts, show_progress=False) rows, cols = model.item_factors, model.user_factors @@ -49,38 +51,57 @@ def test_cg_nan(self): def test_cg_nan2(self): # test out Nan appearing in CG code (from https://github.com/benfred/implicit/issues/106) - Ciu = random(m=100, n=100, density=0.0005, format='coo', dtype=np.float32, - random_state=42, data_rvs=None).T.tocsr() - - configs = [{'use_native': True, 'use_gpu': False}, {'use_native': False, 'use_gpu': False}] + Ciu = random( + m=100, + n=100, + density=0.0005, + format="coo", + dtype=np.float32, + random_state=42, + data_rvs=None, + ).T.tocsr() + + configs = [{"use_native": True, "use_gpu": False}, {"use_native": False, "use_gpu": False}] if HAS_CUDA: - configs.append({'use_gpu': True}) + configs.append({"use_gpu": True}) for options in configs: - model = AlternatingLeastSquares(factors=32, regularization=10, iterations=10, - dtype=np.float32, random_state=23, - **options) + model = AlternatingLeastSquares( + factors=32, + regularization=10, + iterations=10, + dtype=np.float32, + random_state=23, + **options + ) model.fit(Ciu, show_progress=False) self.assertTrue(np.isfinite(model.item_factors).all()) self.assertTrue(np.isfinite(model.user_factors).all()) def test_factorize(self): - counts = csr_matrix([[1, 1, 0, 1, 0, 0], - [0, 1, 1, 1, 0, 0], - [1, 0, 1, 0, 0, 0], - [1, 1, 0, 0, 0, 0], - [0, 0, 1, 1, 0, 1], - [0, 1, 0, 0, 0, 1], - [0, 0, 0, 0, 1, 1]], dtype=np.float64) + counts = csr_matrix( + [ + [1, 1, 0, 1, 0, 0], + [0, 1, 1, 1, 0, 0], + [1, 0, 1, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 1, 1, 0, 1], + [0, 1, 0, 0, 0, 1], + [0, 0, 0, 0, 1, 1], + ], + dtype=np.float64, + ) user_items = counts * 2 # try all 8 variants of native/python, cg/cholesky, and # 64 vs 32 bit factors - options = [(dtype, cg, native, False) - for dtype in (np.float32, np.float64) - for cg in (False, True) - for native in (False, True)] + options = [ + (dtype, cg, native, False) + for dtype in (np.float32, np.float64) + for cg in (False, True) + for native in (False, True) + ] # also try out GPU support if available if HAS_CUDA: @@ -88,49 +109,61 @@ def test_factorize(self): for dtype, use_cg, use_native, use_gpu in options: try: - model = AlternatingLeastSquares(factors=6, - regularization=0, - dtype=dtype, - use_native=use_native, - use_cg=use_cg, - use_gpu=use_gpu, - random_state=23) + model = AlternatingLeastSquares( + factors=6, + regularization=0, + dtype=dtype, + use_native=use_native, + 
use_cg=use_cg, + use_gpu=use_gpu, + random_state=23, + ) model.fit(user_items, show_progress=False) rows, cols = model.item_factors, model.user_factors except Exception as e: - self.fail(msg="failed to factorize matrix. Error=%s" - " dtype=%s, cg=%s, native=%s gpu=%s" - % (e, dtype, use_cg, use_native, use_gpu)) + self.fail( + msg="failed to factorize matrix. Error=%s" + " dtype=%s, cg=%s, native=%s gpu=%s" % (e, dtype, use_cg, use_native, use_gpu) + ) reconstructed = rows.dot(cols.T) for i in range(counts.shape[0]): for j in range(counts.shape[1]): - self.assertAlmostEqual(counts[i, j], reconstructed[i, j], - delta=0.0001, - msg="failed to reconstruct row=%s, col=%s," - " value=%.5f, dtype=%s, cg=%s, native=%s gpu=%s" - % (i, j, reconstructed[i, j], dtype, use_cg, - use_native, use_gpu)) + self.assertAlmostEqual( + counts[i, j], + reconstructed[i, j], + delta=0.0001, + msg="failed to reconstruct row=%s, col=%s," + " value=%.5f, dtype=%s, cg=%s, native=%s gpu=%s" + % (i, j, reconstructed[i, j], dtype, use_cg, use_native, use_gpu), + ) def test_explain(self): - counts = csr_matrix([[1, 1, 0, 1, 0, 0], - [0, 1, 1, 1, 0, 0], - [1, 4, 1, 0, 7, 0], - [1, 1, 0, 0, 0, 0], - [9, 0, 4, 1, 0, 1], - [0, 1, 0, 0, 0, 1], - [0, 0, 2, 0, 1, 1]], dtype=np.float64) + counts = csr_matrix( + [ + [1, 1, 0, 1, 0, 0], + [0, 1, 1, 1, 0, 0], + [1, 4, 1, 0, 7, 0], + [1, 1, 0, 0, 0, 0], + [9, 0, 4, 1, 0, 1], + [0, 1, 0, 0, 0, 1], + [0, 0, 2, 0, 1, 1], + ], + dtype=np.float64, + ) user_items = counts * 2 item_users = user_items.T - model = AlternatingLeastSquares(factors=4, - regularization=20, - use_native=False, - use_cg=False, - use_gpu=False, - iterations=100, - random_state=23) + model = AlternatingLeastSquares( + factors=4, + regularization=20, + use_native=False, + use_cg=False, + use_gpu=False, + iterations=100, + random_state=23, + ) model.fit(user_items, show_progress=False) userid = 0 @@ -154,7 +187,8 @@ def test_explain(self): # Assert explanation with precomputed user weights is correct top_score_explained, top_contributions, W = model.explain( - userid, item_users, itemid=top_rec, user_weights=W, N=2) + userid, item_users, itemid=top_rec, user_weights=W, N=2 + ) top_scores = [s for _, s in top_contributions] top_items = [i for i, _ in top_contributions] self.assertEqual(2, len(top_contributions)) @@ -180,20 +214,22 @@ def test_recommend_all(self): offset = 2 recs = model.recommend_all( - user_items[[2, 3, 4]], - N=1, - show_progress=False, - users_items_offset=offset) + user_items[[2, 3, 4]], N=1, show_progress=False, users_items_offset=offset + ) for userid in range(2, 5): - self.assertEqual(len(recs[userid-offset]), 1) - self.assertEqual(recs[userid-offset][0], userid) + self.assertEqual(len(recs[userid - offset]), 1) + self.assertEqual(recs[userid - offset][0], userid) # try asking for more items than possible self.assertRaises(ValueError, model.recommend_all, user_items, N=10000, show_progress=False) self.assertRaises( - ValueError, model.recommend_all, user_items, filter_items=list(range(10000)), - show_progress=False) + ValueError, + model.recommend_all, + user_items, + filter_items=list(range(10000)), + show_progress=False, + ) # filter recommended items using an additional filter list recs = model.recommend_all(user_items, N=1, filter_items=[0], show_progress=False) @@ -201,10 +237,11 @@ def test_recommend_all(self): if HAS_CUDA: + class GPUALSTest(unittest.TestCase, TestRecommenderBaseMixin): def _get_model(self): - return AlternatingLeastSquares(factors=32, regularization=0, - 
random_state=23) + return AlternatingLeastSquares(factors=32, regularization=0, random_state=23) + if __name__ == "__main__": unittest.main() diff --git a/tests/approximate_als_test.py b/tests/approximate_als_test.py index 90a2d391..09b48981 100644 --- a/tests/approximate_als_test.py +++ b/tests/approximate_als_test.py @@ -2,8 +2,11 @@ import unittest -from implicit.approximate_als import (AnnoyAlternatingLeastSquares, FaissAlternatingLeastSquares, - NMSLibAlternatingLeastSquares) +from implicit.approximate_als import ( + AnnoyAlternatingLeastSquares, + FaissAlternatingLeastSquares, + NMSLibAlternatingLeastSquares, +) from implicit.gpu import HAS_CUDA from .recommender_base_test import TestRecommenderBaseMixin @@ -14,13 +17,13 @@ class AnnoyALSTest(unittest.TestCase, TestRecommenderBaseMixin): def _get_model(self): - return AnnoyAlternatingLeastSquares(factors=2, regularization=0, - random_state=23) + return AnnoyAlternatingLeastSquares(factors=2, regularization=0, random_state=23) def test_pickle(self): # pickle isn't supported on annoy indices pass + except ImportError: pass @@ -29,14 +32,15 @@ def test_pickle(self): class NMSLibALSTest(unittest.TestCase, TestRecommenderBaseMixin): def _get_model(self): - return NMSLibAlternatingLeastSquares(factors=2, regularization=0, - index_params={'post': 2}, - random_state=23) + return NMSLibAlternatingLeastSquares( + factors=2, regularization=0, index_params={"post": 2}, random_state=23 + ) def test_pickle(self): # pickle isn't supported on nmslib indices pass + except ImportError: pass @@ -45,21 +49,28 @@ def test_pickle(self): class FaissALSTest(unittest.TestCase, TestRecommenderBaseMixin): def _get_model(self): - return FaissAlternatingLeastSquares(nlist=1, nprobe=1, factors=2, regularization=0, - use_gpu=False, random_state=23) + return FaissAlternatingLeastSquares( + nlist=1, nprobe=1, factors=2, regularization=0, use_gpu=False, random_state=23 + ) def test_pickle(self): # pickle isn't supported on faiss indices pass if HAS_CUDA: + class FaissALSGPUTest(unittest.TestCase, TestRecommenderBaseMixin): __regularization = 0 def _get_model(self): - return FaissAlternatingLeastSquares(nlist=1, nprobe=1, factors=32, - regularization=self.__regularization, - use_gpu=True, random_state=23) + return FaissAlternatingLeastSquares( + nlist=1, + nprobe=1, + factors=32, + regularization=self.__regularization, + use_gpu=True, + random_state=23, + ) def test_similar_items(self): # For the GPU version, we currently have to have factors be a multiple of 32 @@ -89,6 +100,7 @@ def test_pickle(self): # pickle isn't supported on faiss indices pass + except ImportError: pass diff --git a/tests/bpr_test.py b/tests/bpr_test.py index 82835706..7c4db32c 100644 --- a/tests/bpr_test.py +++ b/tests/bpr_test.py @@ -9,10 +9,10 @@ class BPRTest(unittest.TestCase, TestRecommenderBaseMixin): - def _get_model(self): - return BayesianPersonalizedRanking(factors=3, regularization=0, use_gpu=False, - random_state=42) + return BayesianPersonalizedRanking( + factors=3, regularization=0, use_gpu=False, random_state=42 + ) # Test issue #264 causing crashes on empty matrices def test_fit_empty_matrix(self): @@ -26,11 +26,13 @@ def test_fit_almost_empty_matrix(self): if HAS_CUDA: - class BPRGPUTest(unittest.TestCase, TestRecommenderBaseMixin): + class BPRGPUTest(unittest.TestCase, TestRecommenderBaseMixin): def _get_model(self): - return BayesianPersonalizedRanking(factors=31, regularization=0, use_gpu=True, - learning_rate=0.02, random_state=42) + return BayesianPersonalizedRanking( + 
factors=31, regularization=0, use_gpu=True, learning_rate=0.02, random_state=42 + ) + if __name__ == "__main__": unittest.main() diff --git a/tests/knn_test.py b/tests/knn_test.py index 07448b0c..97fa7e9e 100644 --- a/tests/knn_test.py +++ b/tests/knn_test.py @@ -27,13 +27,18 @@ def _get_model(self): class NearestNeighboursTest(unittest.TestCase): def test_all_pairs_knn(self): - counts = csr_matrix([[5, 1, 0, 9, 0, 0], - [0, 2, 1, 1, 0, 0], - [7, 0, 3, 0, 0, 0], - [1, 8, 0, 0, 0, 0], - [0, 0, 4, 4, 0, 0], - [0, 3, 0, 0, 0, 2], - [0, 0, 0, 0, 6, 0]], dtype=np.float64) + counts = csr_matrix( + [ + [5, 1, 0, 9, 0, 0], + [0, 2, 1, 1, 0, 0], + [7, 0, 3, 0, 0, 0], + [1, 8, 0, 0, 0, 0], + [0, 0, 4, 4, 0, 0], + [0, 3, 0, 0, 0, 2], + [0, 0, 0, 0, 6, 0], + ], + dtype=np.float64, + ) counts = implicit.nearest_neighbours.tfidf_weight(counts).tocsr() # compute all neighbours using matrix dot product @@ -48,9 +53,13 @@ def test_all_pairs_knn(self): # make sure top K selected row = all_neighbours[rowid] - self.assertEqual(set(knn[rowid].indices), - set(colid for colid, _ in - sorted(zip(row.indices, row.data), key=lambda x: -x[1])[:K])) + self.assertEqual( + set(knn[rowid].indices), + set( + colid + for colid, _ in sorted(zip(row.indices, row.data), key=lambda x: -x[1])[:K] + ), + ) if __name__ == "__main__": diff --git a/tests/lmf_test.py b/tests/lmf_test.py index d9983a86..26a952aa 100644 --- a/tests/lmf_test.py +++ b/tests/lmf_test.py @@ -7,8 +7,9 @@ class LMFTest(unittest.TestCase, TestRecommenderBaseMixin): def _get_model(self): - return LogisticMatrixFactorization(factors=3, regularization=0, use_gpu=False, - random_state=43) + return LogisticMatrixFactorization( + factors=3, regularization=0, use_gpu=False, random_state=43 + ) if __name__ == "__main__": diff --git a/tests/recommender_base_test.py b/tests/recommender_base_test.py index 7f1bf815..8900fa79 100644 --- a/tests/recommender_base_test.py +++ b/tests/recommender_base_test.py @@ -12,8 +12,8 @@ class TestRecommenderBaseMixin(object): - """ Mixin to test a bunch of common functionality in models - deriving from RecommenderBase """ + """Mixin to test a bunch of common functionality in models + deriving from RecommenderBase""" def _get_model(self): raise NotImplementedError() @@ -59,8 +59,9 @@ def test_recalculate_user(self): # we should get the same item if we recalculate_user try: - recs_from_liked = model.recommend(userid=0, user_items=user_vector, - N=1, recalculate_user=True) + recs_from_liked = model.recommend( + userid=0, user_items=user_vector, N=1, recalculate_user=True + ) self.assertEqual(recs[0][0], recs_from_liked[0][0]) # TODO: if we set regularization for the model to be sufficiently high, the @@ -80,8 +81,9 @@ def test_evaluation(self): # we've withheld the diagnoal for testing, and have verified that in test_recommend # it is returned for each user. So p@1 should be 1.0 - p = precision_at_k(model, user_items.tocsr(), csr_matrix(np.eye(50)), K=1, - show_progress=False) + p = precision_at_k( + model, user_items.tocsr(), csr_matrix(np.eye(50)), K=1, show_progress=False + ) self.assertEqual(p, 1) def test_similar_users(self): @@ -168,9 +170,9 @@ def test_pickle(self): pickle.loads(pickled) def get_checker_board(self, X): - """ Returns a 'checkerboard' matrix: where every even userid has liked + """Returns a 'checkerboard' matrix: where every even userid has liked every even itemid and every odd userid has liked every odd itemid. 
- The diagonal is withheld for testing recommend methods """ + The diagonal is withheld for testing recommend methods""" ret = np.zeros((X, X)) for i in range(X): for j in range(i % 2, X, 2):