
Implement Conjugate Gradient ALS

Implement the algorithm described in the paper "Applications of the
Conjugate Gradient Method for Implicit Feedback Collaborative Filtering".

More details in the blog post here:
http://www.benfrederickson.com/fast-implicit-matrix-factorization/,
but this change leads to a 3x to 19x speedup in training, depending
on the number of factors in the model, with identical results.
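(For each user u, the ALS update solves the linear system (YᵀY + Yᵀ(Cᵘ − I)Y + λI)·xᵤ = YᵀCᵘpᵤ; rather than factorizing this matrix with a Cholesky solve, the conjugate gradient method approximates xᵤ in a few matrix-free steps, warm-started from the previous iteration's value.)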
benfred committed Dec 12, 2016
1 parent da1a7fa commit 4139e0ae19eb227c9d3a1358fefc4dff57f627b5
Showing with 304 additions and 32 deletions.
  1. +11 −4 README.md
  2. +146 −0 examples/benchmark_cg.py
  3. +9 −3 examples/lastfm.py
  4. +85 −1 implicit/_implicit.pyx
  5. +41 −5 implicit/implicit.py
  6. +1 −1 setup.py
  7. +11 −18 tests/implicit_test.py
@@ -6,8 +6,10 @@ Implicit
Fast Python Collaborative Filtering for Implicit Datasets.

-This project provides a fast Python implementation of the algorithm described in the paper
-[Collaborative Filtering for Implicit Feedback Datasets](http://yifanhu.net/PUB/cf.pdf).
+This project provides fast Python implementations of the algorithms described in the paper
+[Collaborative Filtering for Implicit Feedback Datasets](http://yifanhu.net/PUB/cf.pdf) and in
+[Applications of the Conjugate Gradient Method for Implicit Feedback Collaborative Filtering](https://pdfs.semanticscholar.org/bfdf/7af6cf7fd7bb5e6b6db5bbd91be11597eaf0.pdf).

To install:
@@ -29,7 +31,7 @@ last.fm dataset](https://github.com/benfred/implicit/blob/master/examples/lastfm

#### Requirements

This library requires SciPy version 0.16 or later. Running on OSX requires an OpenMP compiler,
which can be installed with homebrew: ```brew install gcc```.

#### Why Use This?
@@ -44,7 +46,12 @@ libraries distributed with SciPy. This leads to extremely fast matrix factorization

On a simple [benchmark](https://github.com/benfred/implicit/blob/master/examples/benchmark.py), this
library is about 1.8 times faster than the multithreaded C++ implementation provided by Quora's
-[QMF Library](https://github.com/quora/qmf) and at least 60,000 times faster than [implicit-mf](https://github.com/MrChrisJohnson/implicit-mf).
+[QMF Library](https://github.com/quora/qmf) and at least 60,000 times faster than
+[implicit-mf](https://github.com/MrChrisJohnson/implicit-mf).
+
+A [follow up post](http://www.benfrederickson.com/fast-implicit-matrix-factorization/) describes
+further performance improvements based on the Conjugate Gradient method, boosting training speed
+by 3x to over 19x depending on the number of factors used.

This library has been tested with Python 2.7 and 3.5. Running 'tox' will
run unittests on both versions, and verify that all python files pass flake8.
@@ -0,0 +1,146 @@
""" test script to verify the CG method works, and time it versus cholesky """
from __future__ import print_function
import argparse
import functools
import json
import logging
import time
from collections import defaultdict
import numpy
from implicit._implicit import calculate_loss, least_squares, least_squares_cg
from lastfm import bm25_weight, read_data
def benchmark_solver(Cui, factors, solver, callback, iterations=7, dtype=numpy.float64,
regularization=0.00, num_threads=0):
users, items = Cui.shape
# have to explode out most of the alternating_least_squares call here
X = numpy.random.rand(users, factors).astype(dtype) * 0.01
Y = numpy.random.rand(items, factors).astype(dtype) * 0.01
Cui, Ciu = Cui.tocsr(), Cui.T.tocsr()
for iteration in range(iterations):
s = time.time()
solver(Cui, X, Y, regularization, num_threads=num_threads)
solver(Ciu, Y, X, regularization, num_threads=num_threads)
callback(time.time() - s, X, Y)
logging.debug("finished iteration %i in %s", iteration, time.time() - s)
return X, Y
def benchmark_accuracy(plays):
output = defaultdict(list)
benchmark_solver(plays, 100,
least_squares,
lambda _, X, Y: output['cholesky'].append(calculate_loss(plays, X, Y,
0)),
iterations=25)
for steps in [2, 3, 4]:
benchmark_solver(plays, 100, functools.partial(least_squares_cg, cg_steps=steps),
lambda _, X, Y: output['cg%i' % steps].append(calculate_loss(plays, X, Y,
0)),
iterations=25)
return output
def benchmark_times(plays):
output = defaultdict(list)
for factors in [50, 100, 150, 200, 250]:
output['factors'].append(factors)
for steps in [2, 3, 4]:
current = []
benchmark_solver(plays, factors,
functools.partial(least_squares_cg, cg_steps=steps),
lambda elapsed, X, Y: current.append(elapsed),
iterations=3)
print("cg%i: %i factors : %ss" % (steps, factors, min(current)))
output['cg%i' % steps].append(min(current))
current = []
benchmark_solver(plays, factors, least_squares,
lambda elapsed, X, Y: current.append(elapsed),
iterations=3)
output['cholesky'].append(min(current))
print("cholesky: %i factors : %ss" % (factors, min(current)))
return output
def generate_speed_graph(data, filename="cg_training_speed.html"):
from bokeh.plotting import figure, save
p = figure(title="Training Time", x_axis_label='Factors', y_axis_label='Seconds')
to_plot = [(data['cg2'], "CG (2 Steps/Iteration)", "#2ca02c"),
(data['cg3'], "CG (3 Steps/Iteration)", "#ff7f0e"),
# (data['cg4'], "CG (4 Steps/Iteration)", "#d62728"),
(data['cholesky'], "Cholesky", "#1f77b4")]
p = figure(title="Training Speed", x_axis_label='Factors', y_axis_label='Time / Iteration (s)')
for current, label, colour in to_plot:
p.line(data['factors'], current, legend=label, line_color=colour, line_width=1)
p.circle(data['factors'], current, legend=label, line_color=colour, size=6,
fill_color="white")
p.legend.location = "top_left"
save(p, filename, title="CG ALS Training Speed")
def generate_loss_graph(data, filename):
from bokeh.plotting import figure, save
iterations = range(1, len(data['cholesky']) + 1)
to_plot = [(data['cg2'], "CG (2 Steps/Iteration)", "#2ca02c"),
(data['cg3'], "CG (3 Steps/Iteration)", "#ff7f0e"),
# (data['cg4'], "CG (4 Steps/Iteration)", "#d62728"),
(data['cholesky'], "Cholesky", "#1f77b4")]
p = figure(title="Training Loss", x_axis_label='Iteration', y_axis_label='MSE')
for loss, label, colour in to_plot:
p.line(iterations, loss, legend=label, line_color=colour, line_width=1)
p.circle(iterations, loss, legend=label, line_color=colour, size=6, fill_color="white")
save(p, filename, title="CG ALS Training Loss")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark CG version against Cholesky",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--input', type=str,
dest='inputfile', help='last.fm dataset file', required=True)
parser.add_argument('--graph', help='generates graphs (requires bokeh)',
action="store_true")
parser.add_argument('--loss', help='test training loss',
action="store_true")
parser.add_argument('--speed', help='test training speed',
action="store_true")
args = parser.parse_args()
if not (args.speed or args.loss):
print("must specify at least one of --speed or --loss")
parser.print_help()
else:
plays = bm25_weight(read_data(args.inputfile)[1]).tocsr()
logging.basicConfig(level=logging.DEBUG)
if args.loss:
acc = benchmark_accuracy(plays)
json.dump(acc, open("cg_accuracy.json", "w"))
if args.graph:
generate_loss_graph(acc, "cg_accuracy.html")
if args.speed:
speed = benchmark_times(plays)
json.dump(speed, open("cg_speed.json", "w"))
if args.graph:
generate_speed_graph(speed, "cg_speed.html")
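With a local copy of the last.fm dataset, the script above can be run as, for example, `python benchmark_cg.py --input /path/to/lastfm_dataset.tsv --speed --loss --graph` (the path here is just a placeholder for wherever the dataset lives); `--graph` writes the bokeh plots alongside the JSON output.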
@@ -92,7 +92,8 @@ def calculate_similar_artists(input_filename, output_filename,
                               iterations=15,
                               exact=False, trees=20,
                               use_native=True,
-                              dtype=numpy.float64):
+                              dtype=numpy.float64,
+                              cg=False):
    logging.debug("Calculating similar artists. This might take a while")
    logging.debug("reading data from %s", input_filename)
    start = time.time()
@@ -109,7 +110,8 @@ def calculate_similar_artists(input_filename, output_filename,
                              regularization=regularization,
                              iterations=iterations,
                              use_native=use_native,
-                             dtype=dtype)
+                             dtype=dtype,
+                             use_cg=cg)
    logging.debug("calculated factors in %s", time.time() - start)

    # write out artists by popularity
@@ -154,6 +156,9 @@ def calculate_similar_artists(input_filename, output_filename,
    parser.add_argument('--float32',
                        help='use 32 bit floating point numbers',
                        action="store_true")
    parser.add_argument('--cg',
                        help='use CG optimizer',
                        action="store_true")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG)
@@ -163,5 +168,6 @@ def calculate_similar_artists(input_filename, output_filename,
                              exact=args.exact, trees=args.treecount,
                              iterations=args.iterations,
                              use_native=not args.purepython,
-                             dtype=numpy.float32 if args.float32 else numpy.float64)
+                             dtype=numpy.float32 if args.float32 else numpy.float64,
+                             cg=args.cg)
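As a usage sketch, here is where the new option ends up: a hypothetical driver in which the dataset path is a placeholder, and `use_cg` is the keyword the diff above passes through to `implicit.alternating_least_squares`:

```python
# hypothetical usage sketch - the dataset path below is a placeholder
import implicit
from lastfm import bm25_weight, read_data

plays = bm25_weight(read_data("/path/to/lastfm_dataset.tsv")[1]).tocsr()

# use_cg=True selects the conjugate gradient solver added in this commit
# instead of the Cholesky-based least_squares solver
artist_factors, user_factors = implicit.alternating_least_squares(
    plays, factors=50, regularization=0.01, iterations=15, use_cg=True)
```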
@@ -28,6 +28,12 @@ cdef inline floating dot(int *n, floating *sx, int *incx, floating *sy, int *incy) nogil:
    else:
        return cython_blas.sdot(n, sx, incx, sy, incy)

cdef inline void scal(int *n, floating *sa, floating *sx, int *incx) nogil:
    if floating is double:
        cython_blas.dscal(n, sa, sx, incx)
    else:
        cython_blas.sscal(n, sa, sx, incx)

cdef inline void posv(char * u, int * n, int * nrhs, floating * a, int * lda, floating * b, int * ldb, int * info) nogil:
    if floating is double:
        cython_lapack.dposv(u, n, nrhs, a, lda, b, ldb, info)
@@ -42,7 +48,7 @@ cdef inline void gesv(int * n, int * nrhs, floating * a, int * lda, int * piv, floating * b, int * ldb, int * info) nogil:

@cython.boundscheck(False)
-def least_squares(Cui, floating [:, :] X, floating [:, :] Y, double regularization, int num_threads):
+def least_squares(Cui, floating [:, :] X, floating [:, :] Y, double regularization, int num_threads=0):
    dtype = numpy.float64 if floating is double else numpy.float32

    cdef int [:] indptr = Cui.indptr, indices = Cui.indices
@@ -107,6 +113,84 @@ def least_squares(Cui, floating [:, :] X, floating [:, :] Y, double regularization, int num_threads):
            free(pivot)


@cython.cdivision(True)
@cython.boundscheck(False)
def least_squares_cg(Cui, floating [:, :] X, floating [:, :] Y, float regularization,
                     int num_threads=0, int cg_steps=3):
    dtype = numpy.float64 if floating is double else numpy.float32
    cdef int [:] indptr = Cui.indptr, indices = Cui.indices
    cdef double [:] data = Cui.data

    cdef int users = X.shape[0], N = X.shape[1], u, i, index, one = 1, it
    cdef floating confidence, temp, alpha, rsnew, rsold
    cdef floating zero = 0.

    # YtY also folds in the regularization term here
    cdef floating[:, :] YtY = numpy.dot(numpy.transpose(Y), Y) + regularization * numpy.eye(N, dtype=dtype)

    cdef floating * x
    cdef floating * p
    cdef floating * r
    cdef floating * Ap

    with nogil, parallel(num_threads=num_threads):
        # allocate temp memory for each thread
        Ap = <floating *> malloc(sizeof(floating) * N)
        p = <floating *> malloc(sizeof(floating) * N)
        r = <floating *> malloc(sizeof(floating) * N)
        try:
            for u in prange(users, schedule='guided'):
                # start from the previous iteration's solution
                x = &X[u, 0]

                # calculate residual r = YtCuPu - (YtCuY + regularization * I).dot(Xu),
                # without ever computing YtCuY
                temp = -1.0
                symv("U", &N, &temp, &YtY[0, 0], &N, x, &one, &zero, r, &one)
                for index in range(indptr[u], indptr[u + 1]):
                    i = indices[index]
                    confidence = data[index]
                    temp = confidence - (confidence - 1) * dot(&N, &Y[i, 0], &one, x, &one)
                    axpy(&N, &temp, &Y[i, 0], &one, r, &one)

                memcpy(p, r, sizeof(floating) * N)
                rsold = dot(&N, r, &one, r, &one)

                for it in range(cg_steps):
                    # calculate Ap = YtCuY.dot(p) - without actually calculating YtCuY
                    temp = 1.0
                    symv("U", &N, &temp, &YtY[0, 0], &N, p, &one, &zero, Ap, &one)
                    for index in range(indptr[u], indptr[u + 1]):
                        i = indices[index]
                        confidence = data[index]
                        temp = (confidence - 1) * dot(&N, &Y[i, 0], &one, p, &one)
                        axpy(&N, &temp, &Y[i, 0], &one, Ap, &one)

                    # alpha = rsold / p.dot(Ap)
                    alpha = rsold / dot(&N, p, &one, Ap, &one)

                    # x += alpha * p
                    axpy(&N, &alpha, p, &one, x, &one)

                    # r -= alpha * Ap
                    temp = alpha * -1
                    axpy(&N, &temp, Ap, &one, r, &one)

                    rsnew = dot(&N, r, &one, r, &one)

                    # p = r + (rsnew/rsold) * p
                    temp = rsnew / rsold
                    scal(&N, &temp, p, &one)
                    temp = 1.0
                    axpy(&N, &temp, r, &one, p, &one)

                    rsold = rsnew
        finally:
            free(p)
            free(r)
            free(Ap)


@cython.cdivision(True)
@cython.boundscheck(False)
def calculate_loss(Cui, floating [:, :] X, floating [:, :] Y, float regularization, int num_threads=0):
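For readers less familiar with BLAS calls, the per-user solve in `least_squares_cg` corresponds to the following NumPy sketch (an illustration of the same recurrence, not the shipped implementation; `item_ids` and `confidences` stand in for one CSR row of Cui):

```python
import numpy as np

def cg_solve_user(YtY, Y, item_ids, confidences, x, cg_steps=3):
    """Solve (YtY + Yt(Cu - I)Y).dot(x) = YtCuPu for one user by CG.
    YtY is assumed to already include the regularization term, as above."""
    # residual r = YtCuPu - (YtCuY + regularization * I).dot(x),
    # computed without ever forming the dense YtCuY
    r = -YtY.dot(x)
    for i, confidence in zip(item_ids, confidences):
        r += (confidence - (confidence - 1) * Y[i].dot(x)) * Y[i]

    p = r.copy()
    rsold = r.dot(r)
    for _ in range(cg_steps):
        # Ap = YtCuY.dot(p), again built from YtY plus a sparse correction
        Ap = YtY.dot(p)
        for i, confidence in zip(item_ids, confidences):
            Ap += (confidence - 1) * Y[i].dot(p) * Y[i]

        alpha = rsold / p.dot(Ap)    # step size along the search direction
        x += alpha * p               # update the solution in place
        r -= alpha * Ap              # update the residual
        rsnew = r.dot(r)
        p = r + (rsnew / rsold) * p  # next conjugate search direction
        rsold = rsnew
    return x
```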
