diff --git a/implicit/als.py b/implicit/als.py index 560b8863..723a0ccf 100644 --- a/implicit/als.py +++ b/implicit/als.py @@ -1,9 +1,11 @@ """ Implicit Alternating Least Squares """ +import heapq import itertools import logging import time import numpy as np +import scipy from . import _als from .recommender_base import RecommenderBase @@ -58,7 +60,6 @@ def fit(self, item_users): """ Ciu, Cui = item_users.tocsr(), item_users.T.tocsr() items, users = Ciu.shape - self._YtY = None # Initialize the variables randomly if they haven't already been set if self.user_factors is None: @@ -66,8 +67,9 @@ def fit(self, item_users): if self.item_factors is None: self.item_factors = np.random.rand(items, self.factors).astype(self.dtype) * 0.01 - # invalidate cached norms + # invalidate cached norms and squared factors self._item_norms = None + self._YtY = None solver = self.solver @@ -105,12 +107,46 @@ def recommend(self, userid, user_items, N=10, filter_items=None, recalculate_use def _user_factor(self, userid, user_items, recalculate_user=False): if not recalculate_user: return self.user_factors[userid] - Y = self.item_factors - if self._YtY is None: - self._YtY = Y.T.dot(Y) - return user_factor(Y, self._YtY, user_items, userid, + return user_factor(self.item_factors, self.YtY, + user_items.tocsr(), userid, self.regularization, self.factors) + def explain(self, userid, user_items, itemid, user_weights=None, N=10): + """ Returns the predicted rating for a user x item pair, + the explanation (the contribution from the top N items the user liked), + and a user latent factor weight that can be cached if you want to + get more than one explanation for the same user. 
+ """ + # user_weights = Cholesky decomposition of Wu^-1 + # from section 5 of the paper CF for Implicit Feedback Datasets + user_items = user_items.tocsr() + if user_weights is None: + A, _ = user_linear_equation(self.item_factors, self.YtY, + user_items, userid, + self.regularization, self.factors) + user_weights = scipy.linalg.cho_factor(A) + seed_item = self.item_factors[itemid] + + # weighted_item = y_i^t W_u + weighted_item = scipy.linalg.cho_solve(user_weights, seed_item) + + total_score = 0.0 + h = [] + for i, (itemid, confidence) in enumerate(nonzeros(user_items, userid)): + factor = self.item_factors[itemid] + # s_u^ij = (y_i^t W^u) y_j + score = weighted_item.dot(factor) * confidence + total_score += score + contribution = (score, itemid) + if i < N: + heapq.heappush(h, contribution) + else: + heapq.heappushpop(h, contribution) + + items = (heapq.heappop(h) for i in range(len(h))) + top_contributions = list((i, s) for s, i in items)[::-1] + return total_score, top_contributions, user_weights + def similar_items(self, itemid, N=10): """ Return the top N similar items for itemid. 
""" scores = self.item_factors.dot(self.item_factors[itemid]) / self.item_norms @@ -129,6 +165,13 @@ def solver(self): return _als.least_squares_cg if self.use_native else least_squares_cg return _als.least_squares if self.use_native else least_squares + @property + def YtY(self): + if self._YtY is None: + Y = self.item_factors + self._YtY = Y.T.dot(Y) + return self._YtY + def alternating_least_squares(Ciu, factors, **kwargs): """ factorizes the matrix Cui using an implicit alternating least squares @@ -158,7 +201,10 @@ def least_squares(Cui, X, Y, regularization, num_threads=0): X[u] = user_factor(Y, YtY, Cui, u, regularization, n_factors) -def user_factor(Y, YtY, Cui, u, regularization, n_factors): +def user_linear_equation(Y, YtY, Cui, u, regularization, n_factors): + # Xu = (YtCuY + regularization * I)^-1 (YtCuPu) + # YtCuY + regularization * I = YtY + regularization * I + Yt(Cu-I)Y + # accumulate YtCuY + regularization*I in A A = YtY + regularization * np.eye(n_factors) @@ -169,8 +215,12 @@ def user_factor(Y, YtY, Cui, u, regularization, n_factors): factor = Y[i] A += (confidence - 1) * np.outer(factor, factor) b += confidence * factor + return A, b + +def user_factor(Y, YtY, Cui, u, regularization, n_factors): # Xu = (YtCuY + regularization * I)^-1 (YtCuPu) + A, b = user_linear_equation(Y, YtY, Cui, u, regularization, n_factors) return np.linalg.solve(A, b) diff --git a/tests/als_test.py b/tests/als_test.py index a82282ae..d8ca60a4 100644 --- a/tests/als_test.py +++ b/tests/als_test.py @@ -50,6 +50,7 @@ def test_factorize(self): [0, 0, 1, 1, 0, 1], [0, 1, 0, 0, 0, 1], [0, 0, 0, 0, 1, 1]], dtype=np.float64) + user_items = counts * 2 # try all 8 variants of native/python, cg/cholesky, and # 64 vs 32 bit factors @@ -63,7 +64,7 @@ def test_factorize(self): use_native=use_native, use_cg=use_cg) np.random.seed(23) - model.fit(counts * 2) + model.fit(user_items) rows, cols = model.item_factors, model.user_factors except Exception as e: @@ -81,6 +82,54 @@ def 
test_factorize(self): % (i, j, reconstructed[i, j], dtype, use_cg, use_native)) + def test_explain(self): + counts = csr_matrix([[1, 1, 0, 1, 0, 0], + [0, 1, 1, 1, 0, 0], + [1, 4, 1, 0, 7, 0], + [1, 1, 0, 0, 0, 0], + [9, 0, 4, 1, 0, 1], + [0, 1, 0, 0, 0, 1], + [0, 0, 2, 0, 1, 1]], dtype=np.float64) + user_items = counts * 2 + item_users = user_items.T + + model = AlternatingLeastSquares(factors=4, + regularization=20, + use_native=False, + use_cg=False, + iterations=100) + np.random.seed(23) + model.fit(user_items) + + userid = 0 + + # Assert recommendation is the same if we recompute user vectors + recs = model.recommend(userid, item_users, N=10) + recalculated_recs = model.recommend(userid, item_users, N=10, recalculate_user=True) + for (item1, score1), (item2, score2) in zip(recs, recalculated_recs): + self.assertEqual(item1, item2) + self.assertAlmostEqual(score1, score2, 4) + + # Assert explanation makes sense + top_rec, score = recalculated_recs[0] + score_explained, contributions, W = model.explain(userid, item_users, itemid=top_rec) + scores = [s for _, s in contributions] + items = [i for i, _ in contributions] + self.assertAlmostEqual(score, score_explained, 4) + self.assertAlmostEqual(score, sum(scores), 4) + self.assertEqual(scores, sorted(scores, reverse=True), "Scores not in order") + self.assertEqual([0, 2, 3, 4], sorted(items), "Items not seen by user") + + # Assert explanation with precomputed user weights is correct + top_score_explained, top_contributions, W = model.explain( + userid, item_users, itemid=top_rec, user_weights=W, N=2) + top_scores = [s for _, s in top_contributions] + top_items = [i for i, _ in top_contributions] + self.assertEqual(2, len(top_contributions)) + self.assertAlmostEqual(score, top_score_explained, 4) + self.assertEqual(scores[:2], top_scores) + self.assertEqual(items[:2], top_items) + if __name__ == "__main__": unittest.main()