Skip to content

Commit

Permalink
Implement Conjugate Gradient ALS
Browse files Browse the repository at this point in the history
Implement the algorithm described in the paper "Applications of the
Conjugate Gradient Method for Implicit Feedback Collaborative Filtering".

More details in the blog post here:
http://www.benfrederickson.com/fast-implicit-matrix-factorization/ ,
but in short, this gives a 3x to 19x speedup in training, depending
on the number of factors in the model, with identical results.
  • Loading branch information
benfred committed Dec 12, 2016
1 parent da1a7fa commit 4139e0a
Show file tree
Hide file tree
Showing 7 changed files with 304 additions and 32 deletions.
15 changes: 11 additions & 4 deletions README.md
Expand Up @@ -6,8 +6,10 @@ Implicit

Fast Python Collaborative Filtering for Implicit Datasets.

This project provides a fast Python implementation of the algorithm described in the paper [Collaborative Filtering for Implicit Feedback Datasets](
http://yifanhu.net/PUB/cf.pdf).
This project provides fast Python implementations of the algorithms described in the paper [Collaborative Filtering for Implicit Feedback Datasets](
http://yifanhu.net/PUB/cf.pdf) and in [Applications of the Conjugate Gradient Method for Implicit
Feedback Collaborative
Filtering](https://pdfs.semanticscholar.org/bfdf/7af6cf7fd7bb5e6b6db5bbd91be11597eaf0.pdf).


To install:
Expand All @@ -29,7 +31,7 @@ last.fm dataset](https://github.com/benfred/implicit/blob/master/examples/lastfm
#### Requirements

This library requires SciPy version 0.16 or later. Running on OSX requires an OpenMP compiler,
which can be installed with homebrew: ```brew install gcc```.
which can be installed with homebrew: ```brew install gcc```.

#### Why Use This?

Expand All @@ -44,7 +46,12 @@ libraries distributed with SciPy. This leads to extremely fast matrix factorizat

On a simple [benchmark](https://github.com/benfred/implicit/blob/master/examples/benchmark.py), this
library is about 1.8 times faster than the multithreaded C++ implementation provided by Quora's
[QMF Library](https://github.com/quora/qmf) and at least 60,000 times faster than [implicit-mf](https://github.com/MrChrisJohnson/implicit-mf).
[QMF Library](https://github.com/quora/qmf) and at least 60,000 times faster than
[implicit-mf](https://github.com/MrChrisJohnson/implicit-mf).

A [follow up post](http://www.benfrederickson.com/fast-implicit-matrix-factorization/) describes
further performance improvements based on the Conjugate Gradient method, which speed up training
by 3x to over 19x depending on the number of factors used.

This library has been tested with Python 2.7 and 3.5. Running 'tox' will
run unittests on both versions, and verify that all python files pass flake8.
Expand Down
146 changes: 146 additions & 0 deletions examples/benchmark_cg.py
@@ -0,0 +1,146 @@
""" test script to verify the CG method works, and time it versus cholesky """

from __future__ import print_function

import argparse
import functools
import json
import logging
import time
from collections import defaultdict

import numpy
from implicit._implicit import calculate_loss, least_squares, least_squares_cg

from lastfm import bm25_weight, read_data


def benchmark_solver(Cui, factors, solver, callback, iterations=7, dtype=numpy.float64,
                     regularization=0.00, num_threads=0):
    """ Runs alternating least squares with the given solver, timing each iteration.

    Parameters:
        Cui: matrix exposing ``shape``, ``tocsr()`` and ``T`` (the callers in
            this script pass a CSR matrix of weighted play counts)
        factors: number of latent factors (columns of the returned matrices)
        solver: callable invoked as
            ``solver(C, A, B, regularization, num_threads=num_threads)``; it is
            expected to update ``A`` in place
        callback: callable invoked after every iteration as
            ``callback(elapsed_seconds, X, Y)``
        iterations: number of full (user + item) sweeps to run
        dtype: dtype of the factor matrices
        regularization: value forwarded unchanged to ``solver``
        num_threads: value forwarded unchanged to ``solver``

    Returns:
        The ``(X, Y)`` factor matrices, of shape ``(users, factors)`` and
        ``(items, factors)``.
    """
    users, items = Cui.shape

    # have to explode out most of the alternating_least_squares call here
    X = numpy.random.rand(users, factors).astype(dtype) * 0.01
    Y = numpy.random.rand(items, factors).astype(dtype) * 0.01

    Cui, Ciu = Cui.tocsr(), Cui.T.tocsr()

    for iteration in range(iterations):
        start = time.time()
        solver(Cui, X, Y, regularization, num_threads=num_threads)
        solver(Ciu, Y, X, regularization, num_threads=num_threads)

        # measure once, before the callback runs: the callback may be expensive
        # (e.g. computing the loss) and must not be counted as solver time
        elapsed = time.time() - start
        callback(elapsed, X, Y)
        logging.debug("finished iteration %i in %s", iteration, elapsed)

    return X, Y


def benchmark_accuracy(plays):
    """ Tracks the training loss per iteration for the Cholesky and CG solvers.

    Each solver is run for 25 iterations with 100 factors. Returns a
    defaultdict mapping 'cholesky', 'cg2', 'cg3' and 'cg4' to the list of
    losses reported by calculate_loss after each iteration.
    """
    losses = defaultdict(list)

    def track(key):
        # builds a benchmark_solver callback that appends the current loss under `key`
        def record(_, X, Y):
            losses[key].append(calculate_loss(plays, X, Y, 0))
        return record

    benchmark_solver(plays, 100, least_squares, track('cholesky'), iterations=25)

    for cg_steps in (2, 3, 4):
        cg_solver = functools.partial(least_squares_cg, cg_steps=cg_steps)
        benchmark_solver(plays, 100, cg_solver, track('cg%i' % cg_steps), iterations=25)

    return losses


def benchmark_times(plays):
    """ Times one training iteration of each solver across several factor counts.

    For 50 to 250 factors, runs 3 iterations of CG (2, 3 and 4 steps) and of the
    Cholesky solver, keeping the fastest iteration of each. Returns a
    defaultdict with the factor counts under 'factors' and the matching best
    times under 'cg2', 'cg3', 'cg4' and 'cholesky'.
    """
    timings = defaultdict(list)

    def best_time(factors, solver):
        # fastest of 3 iterations, to reduce noise from other system activity
        elapsed_times = []
        benchmark_solver(plays, factors, solver,
                         lambda elapsed, X, Y: elapsed_times.append(elapsed),
                         iterations=3)
        return min(elapsed_times)

    for factors in (50, 100, 150, 200, 250):
        timings['factors'].append(factors)

        for cg_steps in (2, 3, 4):
            cg_solver = functools.partial(least_squares_cg, cg_steps=cg_steps)
            fastest = best_time(factors, cg_solver)
            print("cg%i: %i factors : %ss" % (cg_steps, factors, fastest))
            timings['cg%i' % cg_steps].append(fastest)

        fastest = best_time(factors, least_squares)
        timings['cholesky'].append(fastest)
        print("cholesky: %i factors : %ss" % (factors, fastest))

    return timings


def generate_speed_graph(data, filename="cg_training_speed.html"):
    """ Plots time per iteration against the number of factors for each solver.

    Parameters:
        data: mapping with a 'factors' list (x values) and equally long 'cg2',
            'cg3' and 'cholesky' lists of timings, as produced by benchmark_times
        filename: path of the html file written by bokeh

    Requires bokeh, which is imported lazily so that the rest of the script
    works without it.
    """
    from bokeh.plotting import figure, save

    to_plot = [(data['cg2'], "CG (2 Steps/Iteration)", "#2ca02c"),
               (data['cg3'], "CG (3 Steps/Iteration)", "#ff7f0e"),
               # (data['cg4'], "CG (4 Steps/Iteration)", "#d62728"),
               (data['cholesky'], "Cholesky", "#1f77b4")]

    # a single figure is enough: an earlier "Training Time" figure used to be
    # created here and then immediately discarded
    p = figure(title="Training Speed", x_axis_label='Factors', y_axis_label='Time / Iteration (s)')
    for current, label, colour in to_plot:
        p.line(data['factors'], current, legend=label, line_color=colour, line_width=1)
        p.circle(data['factors'], current, legend=label, line_color=colour, size=6,
                 fill_color="white")
    p.legend.location = "top_left"
    save(p, filename, title="CG ALS Training Speed")


def generate_loss_graph(data, filename):
    """ Plots the training loss per iteration for the CG and Cholesky solvers.

    `data` maps 'cg2', 'cg3' and 'cholesky' to per-iteration losses (the output
    of benchmark_accuracy); the graph is written to `filename` as html.
    Requires bokeh.
    """
    from bokeh.plotting import figure, save

    styles = (('cg2', "CG (2 Steps/Iteration)", "#2ca02c"),
              ('cg3', "CG (3 Steps/Iteration)", "#ff7f0e"),
              # ('cg4', "CG (4 Steps/Iteration)", "#d62728"),
              ('cholesky', "Cholesky", "#1f77b4"))

    # iterations are numbered from 1 on the x axis
    x_values = range(1, len(data['cholesky']) + 1)
    curves = [(data[key], name, color) for key, name, color in styles]

    plot = figure(title="Training Loss", x_axis_label='Iteration', y_axis_label='MSE')
    for y_values, name, color in curves:
        plot.line(x_values, y_values, legend=name, line_color=color, line_width=1)
        plot.circle(x_values, y_values, legend=name, line_color=color, size=6,
                    fill_color="white")

    save(plot, filename, title="CG ALS Training Loss")


if __name__ == "__main__":
    # Command line entry point: benchmarks the CG solver against the Cholesky
    # solver on the last.fm dataset, writing results as json (and optionally
    # as bokeh graphs) into the current directory.
    parser = argparse.ArgumentParser(description="Benchmark CG version against Cholesky",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input', type=str,
                        dest='inputfile', help='last.fm dataset file', required=True)
    parser.add_argument('--graph', help='generates graphs (requires bokeh)',
                        action="store_true")
    parser.add_argument('--loss', help='test training loss',
                        action="store_true")
    parser.add_argument('--speed', help='test training speed',
                        action="store_true")

    args = parser.parse_args()
    if not (args.speed or args.loss):
        print("must specify at least one of --speed or --loss")
        parser.print_help()

    else:
        # configure logging before doing any work: the module level logging
        # functions implicitly call basicConfig() when the root logger has no
        # handlers, after which this call would silently do nothing
        logging.basicConfig(level=logging.DEBUG)

        plays = bm25_weight(read_data(args.inputfile)[1]).tocsr()

        if args.loss:
            acc = benchmark_accuracy(plays)
            # use a context manager so the file is flushed and closed promptly
            with open("cg_accuracy.json", "w") as output_file:
                json.dump(acc, output_file)
            if args.graph:
                generate_loss_graph(acc, "cg_accuracy.html")

        if args.speed:
            speed = benchmark_times(plays)
            with open("cg_speed.json", "w") as output_file:
                json.dump(speed, output_file)
            if args.graph:
                generate_speed_graph(speed, "cg_speed.html")
12 changes: 9 additions & 3 deletions examples/lastfm.py
Expand Up @@ -92,7 +92,8 @@ def calculate_similar_artists(input_filename, output_filename,
iterations=15,
exact=False, trees=20,
use_native=True,
dtype=numpy.float64):
dtype=numpy.float64,
cg=False):
logging.debug("Calculating similar artists. This might take a while")
logging.debug("reading data from %s", input_filename)
start = time.time()
Expand All @@ -109,7 +110,8 @@ def calculate_similar_artists(input_filename, output_filename,
regularization=regularization,
iterations=iterations,
use_native=use_native,
dtype=dtype)
dtype=dtype,
use_cg=cg)
logging.debug("calculated factors in %s", time.time() - start)

# write out artists by popularity
Expand Down Expand Up @@ -154,6 +156,9 @@ def calculate_similar_artists(input_filename, output_filename,
parser.add_argument('--float32',
help='use 32 bit floating point numbers',
action="store_true")
parser.add_argument('--cg',
help='use CG optimizer',
action="store_true")
args = parser.parse_args()

logging.basicConfig(level=logging.DEBUG)
Expand All @@ -163,5 +168,6 @@ def calculate_similar_artists(input_filename, output_filename,
exact=args.exact, trees=args.treecount,
iterations=args.iterations,
use_native=not args.purepython,
dtype=numpy.float32 if args.float32 else numpy.float64)
dtype=numpy.float32 if args.float32 else numpy.float64,
cg=args.cg)

86 changes: 85 additions & 1 deletion implicit/_implicit.pyx
Expand Up @@ -28,6 +28,12 @@ cdef inline floating dot(int *n, floating *sx, int *incx, floating *sy, int *inc
else:
return cython_blas.sdot(n, sx, incx, sy, incy)

# Scales the vector sx in place by the scalar sa (sx = sa * sx), dispatching to
# the double (dscal) or single (sscal) precision BLAS routine depending on the
# fused type. All arguments are pointers, following the BLAS calling convention.
cdef inline void scal(int *n, floating *sa, floating *sx, int *incx) nogil:
    if floating is double:
        cython_blas.dscal(n, sa, sx, incx)
    else:
        cython_blas.sscal(n, sa, sx, incx)

cdef inline void posv(char * u, int * n, int * nrhs, floating * a, int * lda, floating * b, int * ldb, int * info) nogil:
if floating is double:
cython_lapack.dposv(u, n, nrhs, a, lda, b, ldb, info)
Expand All @@ -42,7 +48,7 @@ cdef inline void gesv(int * n, int * nrhs, floating * a, int * lda, int * piv, f


@cython.boundscheck(False)
def least_squares(Cui, floating [:, :] X, floating [:, :] Y, double regularization, int num_threads):
def least_squares(Cui, floating [:, :] X, floating [:, :] Y, double regularization, int num_threads=0):
dtype = numpy.float64 if floating is double else numpy.float32

cdef int [:] indptr = Cui.indptr, indices = Cui.indices
Expand Down Expand Up @@ -107,6 +113,84 @@ def least_squares(Cui, floating [:, :] X, floating [:, :] Y, double regularizati
free(pivot)


@cython.cdivision(True)
@cython.boundscheck(False)
def least_squares_cg(Cui, floating [:, :] X, floating [:, :] Y, double regularization, int num_threads=0, int cg_steps=3):
    """ Updates X in place using a few conjugate gradient steps per row, with Y held fixed.

    For every row u this approximately solves the linear system

        (YtY + regularization * I + Yt(Cu - I)Y) x = YtCu p(u)

    starting from the current value of X[u], without ever materializing the
    matrix YtCuY.

    Parameters:
        Cui: CSR style matrix whose ``indptr`` and ``indices`` are C int
            buffers and whose ``data`` is a float64 buffer of confidences
        X: factors to update, modified in place (one row per row of Cui)
        Y: fixed factors (one row per column of Cui)
        regularization: weight added to the diagonal of YtY
        num_threads: forwarded to cython.parallel.parallel
        cg_steps: maximum number of conjugate gradient steps per row
    """
    dtype = numpy.float64 if floating is double else numpy.float32
    cdef int [:] indptr = Cui.indptr, indices = Cui.indices
    cdef double [:] data = Cui.data

    cdef int users = X.shape[0], N = X.shape[1], u, i, index, one = 1, it
    cdef floating confidence, temp, alpha, rsnew, rsold
    cdef floating zero = 0.

    cdef floating[:, :] YtY = numpy.dot(numpy.transpose(Y), Y) + regularization * numpy.eye(N, dtype=dtype)

    cdef floating * x
    cdef floating * p
    cdef floating * r
    cdef floating * Ap

    with nogil, parallel(num_threads = num_threads):

        # allocate temp memory for each thread
        Ap = <floating *> malloc(sizeof(floating) * N)
        p = <floating *> malloc(sizeof(floating) * N)
        r = <floating *> malloc(sizeof(floating) * N)
        try:
            for u in prange(users, schedule='guided'):
                # start from previous iteration
                x = &X[u, 0]

                # calculate residual r = YtCuPu - (YtCuY + regularization * I).dot(Xu)
                temp = -1.0
                symv("U", &N, &temp, &YtY[0, 0], &N, x, &one, &zero, r, &one)

                for index in range(indptr[u], indptr[u + 1]):
                    i = indices[index]
                    confidence = data[index]
                    temp = confidence - (confidence - 1) * dot(&N, &Y[i, 0], &one, x, &one)
                    axpy(&N, &temp, &Y[i, 0], &one, r, &one)

                memcpy(p, r, sizeof(floating) * N)
                rsold = dot(&N, r, &one, r, &one)

                # already converged: p is (numerically) zero, so p.dot(Ap) would
                # be zero as well, and with cdivision enabled the 0 / 0 below
                # would silently write NaN into X[u]
                if rsold < 1e-20:
                    continue

                for it in range(cg_steps):
                    # calculate Ap = YtCuYp - without actually calculating YtCuY
                    temp = 1.0
                    symv("U", &N, &temp, &YtY[0, 0], &N, p, &one, &zero, Ap, &one)

                    for index in range(indptr[u], indptr[u + 1]):
                        i = indices[index]
                        confidence = data[index]
                        temp = (confidence - 1) * dot(&N, &Y[i, 0], &one, p, &one)
                        axpy(&N, &temp, &Y[i, 0], &one, Ap, &one)

                    # alpha = rsold / p.dot(Ap);
                    alpha = rsold / dot(&N, p, &one, Ap, &one)

                    # x += alpha * p
                    axpy(&N, &alpha, p, &one, x, &one)

                    # r -= alpha * Ap
                    temp = alpha * -1
                    axpy(&N, &temp, Ap, &one, r, &one)

                    rsnew = dot(&N, r, &one, r, &one)

                    # converged early: stop before the next step divides by a
                    # vanishing residual
                    if rsnew < 1e-20:
                        break

                    # p = r + (rsnew/rsold) * p
                    temp = rsnew / rsold
                    scal(&N, &temp, p, &one)
                    temp = 1.0
                    axpy(&N, &temp, r, &one, p, &one)

                    rsold = rsnew
        finally:
            free(p)
            free(r)
            free(Ap)


@cython.cdivision(True)
@cython.boundscheck(False)
def calculate_loss(Cui, floating [:, :] X, floating [:, :] Y, float regularization, int num_threads=0):
Expand Down

0 comments on commit 4139e0a

Please sign in to comment.