
Implement Conjugate Gradient ALS

Implement the algorithm described in the paper "Applications of the
Conjugate Gradient Method for Implicit Feedback Collaborative Filtering".

More details in the blog post here:
http://www.benfrederickson.com/fast-implicit-matrix-factorization/,
but this change leads to a 3x to 19x speedup in training, depending
on the number of factors in the model, with identical results.
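(For each user u, the ALS update solves the linear system (YᵀY + Yᵀ(Cᵘ − I)Y + λI)·xᵤ = YᵀCᵘpᵤ; rather than factorizing this matrix with a Cholesky solve, the conjugate gradient method approximates xᵤ in a few matrix-free steps, warm-started from the previous iteration's value.)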
benfred committed Dec 12, 2016
1 parent da1a7fa commit 4139e0ae19eb227c9d3a1358fefc4dff57f627b5
Showing with 304 additions and 32 deletions.
  1. +11 −4 README.md
  2. +146 −0 examples/benchmark_cg.py
  3. +9 −3 examples/lastfm.py
  4. +85 −1 implicit/_implicit.pyx
  5. +41 −5 implicit/implicit.py
  6. +1 −1 setup.py
  7. +11 −18 tests/implicit_test.py
@@ -6,8 +6,10 @@ Implicit
Fast Python Collaborative Filtering for Implicit Datasets.

-This project provides a fast Python implementation of the algorithm described in the paper
-[Collaborative Filtering for Implicit Feedback Datasets](http://yifanhu.net/PUB/cf.pdf).
+This project provides fast Python implementations of the algorithms described in the paper
+[Collaborative Filtering for Implicit Feedback Datasets](http://yifanhu.net/PUB/cf.pdf) and in
+[Applications of the Conjugate Gradient Method for Implicit Feedback Collaborative Filtering](https://pdfs.semanticscholar.org/bfdf/7af6cf7fd7bb5e6b6db5bbd91be11597eaf0.pdf).

To install:
@@ -29,7 +31,7 @@ last.fm dataset](https://github.com/benfred/implicit/blob/master/examples/lastfm

#### Requirements

This library requires SciPy version 0.16 or later. Running on OSX requires an OpenMP compiler,
which can be installed with homebrew: ```brew install gcc```.

#### Why Use This?
@@ -44,7 +46,12 @@ libraries distributed with SciPy. This leads to extremely fast matrix factorization

On a simple [benchmark](https://github.com/benfred/implicit/blob/master/examples/benchmark.py), this
library is about 1.8 times faster than the multithreaded C++ implementation provided by Quora's
-[QMF Library](https://github.com/quora/qmf) and at least 60,000 times faster than [implicit-mf](https://github.com/MrChrisJohnson/implicit-mf).
+[QMF Library](https://github.com/quora/qmf) and at least 60,000 times faster than
+[implicit-mf](https://github.com/MrChrisJohnson/implicit-mf).
+
+A [follow up post](http://www.benfrederickson.com/fast-implicit-matrix-factorization/) describes
+further performance improvements based on the Conjugate Gradient method, boosting training speed
+by 3x to over 19x depending on the number of factors used.

This library has been tested with Python 2.7 and 3.5. Running 'tox' will
run unittests on both versions, and verify that all python files pass flake8.
@@ -0,0 +1,146 @@
""" test script to verify the CG method works, and time it versus cholesky """
from __future__ import print_function
import argparse
import functools
import json
import logging
import time
from collections import defaultdict
import numpy
from implicit._implicit import calculate_loss, least_squares, least_squares_cg
from lastfm import bm25_weight, read_data
def benchmark_solver(Cui, factors, solver, callback, iterations=7, dtype=numpy.float64,
regularization=0.00, num_threads=0):
users, items = Cui.shape
# have to explode out most of the alternating_least_squares call here
X = numpy.random.rand(users, factors).astype(dtype) * 0.01
Y = numpy.random.rand(items, factors).astype(dtype) * 0.01
Cui, Ciu = Cui.tocsr(), Cui.T.tocsr()
for iteration in range(iterations):
s = time.time()
solver(Cui, X, Y, regularization, num_threads=num_threads)
solver(Ciu, Y, X, regularization, num_threads=num_threads)
callback(time.time() - s, X, Y)
logging.debug("finished iteration %i in %s", iteration, time.time() - s)
return X, Y
def benchmark_accuracy(plays):
output = defaultdict(list)
benchmark_solver(plays, 100,
least_squares,
lambda _, X, Y: output['cholesky'].append(calculate_loss(plays, X, Y,
0)),
iterations=25)
for steps in [2, 3, 4]:
benchmark_solver(plays, 100, functools.partial(least_squares_cg, cg_steps=steps),
lambda _, X, Y: output['cg%i' % steps].append(calculate_loss(plays, X, Y,
0)),
iterations=25)
return output
def benchmark_times(plays):
output = defaultdict(list)
for factors in [50, 100, 150, 200, 250]:
output['factors'].append(factors)
for steps in [2, 3, 4]:
current = []
benchmark_solver(plays, factors,
functools.partial(least_squares_cg, cg_steps=steps),
lambda elapsed, X, Y: current.append(elapsed),
iterations=3)
print("cg%i: %i factors : %ss" % (steps, factors, min(current)))
output['cg%i' % steps].append(min(current))
current = []
benchmark_solver(plays, factors, least_squares,
lambda elapsed, X, Y: current.append(elapsed),
iterations=3)
output['cholesky'].append(min(current))
print("cholesky: %i factors : %ss" % (factors, min(current)))
return output
def generate_speed_graph(data, filename="cg_training_speed.html"):
from bokeh.plotting import figure, save
p = figure(title="Training Time", x_axis_label='Factors', y_axis_label='Seconds')
to_plot = [(data['cg2'], "CG (2 Steps/Iteration)", "#2ca02c"),
(data['cg3'], "CG (3 Steps/Iteration)", "#ff7f0e"),
# (data['cg4'], "CG (4 Steps/Iteration)", "#d62728"),
(data['cholesky'], "Cholesky", "#1f77b4")]
p = figure(title="Training Speed", x_axis_label='Factors', y_axis_label='Time / Iteration (s)')
for current, label, colour in to_plot:
p.line(data['factors'], current, legend=label, line_color=colour, line_width=1)
p.circle(data['factors'], current, legend=label, line_color=colour, size=6,
fill_color="white")
p.legend.location = "top_left"
save(p, filename, title="CG ALS Training Speed")
def generate_loss_graph(data, filename):
from bokeh.plotting import figure, save
iterations = range(1, len(data['cholesky']) + 1)
to_plot = [(data['cg2'], "CG (2 Steps/Iteration)", "#2ca02c"),
(data['cg3'], "CG (3 Steps/Iteration)", "#ff7f0e"),
# (data['cg4'], "CG (4 Steps/Iteration)", "#d62728"),
(data['cholesky'], "Cholesky", "#1f77b4")]
p = figure(title="Training Loss", x_axis_label='Iteration', y_axis_label='MSE')
for loss, label, colour in to_plot:
p.line(iterations, loss, legend=label, line_color=colour, line_width=1)
p.circle(iterations, loss, legend=label, line_color=colour, size=6, fill_color="white")
save(p, filename, title="CG ALS Training Loss")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark CG version against Cholesky",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--input', type=str,
dest='inputfile', help='last.fm dataset file', required=True)
parser.add_argument('--graph', help='generates graphs (requires bokeh)',
action="store_true")
parser.add_argument('--loss', help='test training loss',
action="store_true")
parser.add_argument('--speed', help='test training speed',
action="store_true")
args = parser.parse_args()
if not (args.speed or args.loss):
print("must specify at least one of --speed or --loss")
parser.print_help()
else:
plays = bm25_weight(read_data(args.inputfile)[1]).tocsr()
logging.basicConfig(level=logging.DEBUG)
if args.loss:
acc = benchmark_accuracy(plays)
json.dump(acc, open("cg_accuracy.json", "w"))
if args.graph:
generate_loss_graph(acc, "cg_accuracy.html")
if args.speed:
speed = benchmark_times(plays)
json.dump(speed, open("cg_speed.json", "w"))
if args.graph:
generate_speed_graph(speed, "cg_speed.html")
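With a local copy of the last.fm dataset, the script above can be run as, for example, `python benchmark_cg.py --input /path/to/lastfm_dataset.tsv --speed --loss --graph` (the path here is just a placeholder for wherever the dataset lives); `--graph` writes the bokeh plots alongside the JSON output.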
@@ -92,7 +92,8 @@ def calculate_similar_artists(input_filename, output_filename,
                               iterations=15,
                               exact=False, trees=20,
                               use_native=True,
-                              dtype=numpy.float64):
+                              dtype=numpy.float64,
+                              cg=False):
    logging.debug("Calculating similar artists. This might take a while")
    logging.debug("reading data from %s", input_filename)
    start = time.time()
@@ -109,7 +110,8 @@ def calculate_similar_artists(input_filename, output_filename,
                              regularization=regularization,
                              iterations=iterations,
                              use_native=use_native,
-                             dtype=dtype)
+                             dtype=dtype,
+                             use_cg=cg)
    logging.debug("calculated factors in %s", time.time() - start)

    # write out artists by popularity
@@ -154,6 +156,9 @@ def calculate_similar_artists(input_filename, output_filename,
    parser.add_argument('--float32',
                        help='use 32 bit floating point numbers',
                        action="store_true")
    parser.add_argument('--cg',
                        help='use CG optimizer',
                        action="store_true")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG)
@@ -163,5 +168,6 @@ def calculate_similar_artists(input_filename, output_filename,
                              exact=args.exact, trees=args.treecount,
                              iterations=args.iterations,
                              use_native=not args.purepython,
-                             dtype=numpy.float32 if args.float32 else numpy.float64)
+                             dtype=numpy.float32 if args.float32 else numpy.float64,
+                             cg=args.cg)
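As a usage sketch, here is where the new option ends up: a hypothetical driver in which the dataset path is a placeholder, and `use_cg` is the keyword the diff above passes through to `implicit.alternating_least_squares`:

```python
# hypothetical usage sketch - the dataset path below is a placeholder
import implicit
from lastfm import bm25_weight, read_data

plays = bm25_weight(read_data("/path/to/lastfm_dataset.tsv")[1]).tocsr()

# use_cg=True selects the conjugate gradient solver added in this commit
# instead of the Cholesky-based least_squares solver
artist_factors, user_factors = implicit.alternating_least_squares(
    plays, factors=50, regularization=0.01, iterations=15, use_cg=True)
```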
@@ -28,6 +28,12 @@ cdef inline floating dot(int *n, floating *sx, int *incx, floating *sy, int *incy) nogil:
    else:
        return cython_blas.sdot(n, sx, incx, sy, incy)

cdef inline void scal(int *n, floating *sa, floating *sx, int *incx) nogil:
    if floating is double:
        cython_blas.dscal(n, sa, sx, incx)
    else:
        cython_blas.sscal(n, sa, sx, incx)

cdef inline void posv(char * u, int * n, int * nrhs, floating * a, int * lda, floating * b, int * ldb, int * info) nogil:
    if floating is double:
        cython_lapack.dposv(u, n, nrhs, a, lda, b, ldb, info)
@@ -42,7 +48,7 @@ cdef inline void gesv(int * n, int * nrhs, floating * a, int * lda, int * piv, floating * b, int * ldb, int * info) nogil:

@cython.boundscheck(False)
-def least_squares(Cui, floating [:, :] X, floating [:, :] Y, double regularization, int num_threads):
+def least_squares(Cui, floating [:, :] X, floating [:, :] Y, double regularization, int num_threads=0):
    dtype = numpy.float64 if floating is double else numpy.float32

    cdef int [:] indptr = Cui.indptr, indices = Cui.indices
@@ -107,6 +113,84 @@ def least_squares(Cui, floating [:, :] X, floating [:, :] Y, double regularization, int num_threads):
            free(pivot)


@cython.cdivision(True)
@cython.boundscheck(False)
def least_squares_cg(Cui, floating [:, :] X, floating [:, :] Y, float regularization,
                     int num_threads=0, int cg_steps=3):
    dtype = numpy.float64 if floating is double else numpy.float32
    cdef int [:] indptr = Cui.indptr, indices = Cui.indices
    cdef double [:] data = Cui.data

    cdef int users = X.shape[0], N = X.shape[1], u, i, index, one = 1, it
    cdef floating confidence, temp, alpha, rsnew, rsold
    cdef floating zero = 0.

    # YtY also folds in the regularization term here
    cdef floating[:, :] YtY = numpy.dot(numpy.transpose(Y), Y) + regularization * numpy.eye(N, dtype=dtype)

    cdef floating * x
    cdef floating * p
    cdef floating * r
    cdef floating * Ap

    with nogil, parallel(num_threads=num_threads):
        # allocate temp memory for each thread
        Ap = <floating *> malloc(sizeof(floating) * N)
        p = <floating *> malloc(sizeof(floating) * N)
        r = <floating *> malloc(sizeof(floating) * N)
        try:
            for u in prange(users, schedule='guided'):
                # start from the previous iteration's solution
                x = &X[u, 0]

                # calculate residual r = YtCuPu - (YtCuY + regularization * I).dot(Xu),
                # without ever computing YtCuY
                temp = -1.0
                symv("U", &N, &temp, &YtY[0, 0], &N, x, &one, &zero, r, &one)
                for index in range(indptr[u], indptr[u + 1]):
                    i = indices[index]
                    confidence = data[index]
                    temp = confidence - (confidence - 1) * dot(&N, &Y[i, 0], &one, x, &one)
                    axpy(&N, &temp, &Y[i, 0], &one, r, &one)

                memcpy(p, r, sizeof(floating) * N)
                rsold = dot(&N, r, &one, r, &one)

                for it in range(cg_steps):
                    # calculate Ap = YtCuY.dot(p) - without actually calculating YtCuY
                    temp = 1.0
                    symv("U", &N, &temp, &YtY[0, 0], &N, p, &one, &zero, Ap, &one)
                    for index in range(indptr[u], indptr[u + 1]):
                        i = indices[index]
                        confidence = data[index]
                        temp = (confidence - 1) * dot(&N, &Y[i, 0], &one, p, &one)
                        axpy(&N, &temp, &Y[i, 0], &one, Ap, &one)

                    # alpha = rsold / p.dot(Ap)
                    alpha = rsold / dot(&N, p, &one, Ap, &one)

                    # x += alpha * p
                    axpy(&N, &alpha, p, &one, x, &one)

                    # r -= alpha * Ap
                    temp = alpha * -1
                    axpy(&N, &temp, Ap, &one, r, &one)

                    rsnew = dot(&N, r, &one, r, &one)

                    # p = r + (rsnew/rsold) * p
                    temp = rsnew / rsold
                    scal(&N, &temp, p, &one)
                    temp = 1.0
                    axpy(&N, &temp, r, &one, p, &one)

                    rsold = rsnew
        finally:
            free(p)
            free(r)
            free(Ap)


@cython.cdivision(True)
@cython.boundscheck(False)
def calculate_loss(Cui, floating [:, :] X, floating [:, :] Y, float regularization, int num_threads=0):
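For readers less familiar with BLAS calls, the per-user solve in `least_squares_cg` corresponds to the following NumPy sketch (an illustration of the same recurrence, not the shipped implementation; `item_ids` and `confidences` stand in for one CSR row of Cui):

```python
import numpy as np

def cg_solve_user(YtY, Y, item_ids, confidences, x, cg_steps=3):
    """Solve (YtY + Yt(Cu - I)Y).dot(x) = YtCuPu for one user by CG.
    YtY is assumed to already include the regularization term, as above."""
    # residual r = YtCuPu - (YtCuY + regularization * I).dot(x),
    # computed without ever forming the dense YtCuY
    r = -YtY.dot(x)
    for i, confidence in zip(item_ids, confidences):
        r += (confidence - (confidence - 1) * Y[i].dot(x)) * Y[i]

    p = r.copy()
    rsold = r.dot(r)
    for _ in range(cg_steps):
        # Ap = YtCuY.dot(p), again built from YtY plus a sparse correction
        Ap = YtY.dot(p)
        for i, confidence in zip(item_ids, confidences):
            Ap += (confidence - 1) * Y[i].dot(p) * Y[i]

        alpha = rsold / p.dot(Ap)    # step size along the search direction
        x += alpha * p               # update the solution in place
        r -= alpha * Ap              # update the residual
        rsnew = r.dot(r)
        p = r + (rsnew / rsold) * p  # next conjugate search direction
        rsold = rsnew
    return x
```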
