64 changes: 57 additions & 7 deletions implicit/als.py
@@ -1,9 +1,11 @@
""" Implicit Alternating Least Squares """
import heapq
import itertools
import logging
import time

import numpy as np
import scipy

from . import _als
from .recommender_base import RecommenderBase
@@ -58,16 +60,16 @@ def fit(self, item_users):
"""
Ciu, Cui = item_users.tocsr(), item_users.T.tocsr()
items, users = Ciu.shape
self._YtY = None

# Initialize the variables randomly if they haven't already been set
if self.user_factors is None:
self.user_factors = np.random.rand(users, self.factors).astype(self.dtype) * 0.01
if self.item_factors is None:
self.item_factors = np.random.rand(items, self.factors).astype(self.dtype) * 0.01

# invalidate cached norms
# invalidate cached norms and squared factors
self._item_norms = None
self._YtY = None

solver = self.solver

@@ -105,12 +107,46 @@ def recommend(self, userid, user_items, N=10, filter_items=None, recalculate_use
def _user_factor(self, userid, user_items, recalculate_user=False):
if not recalculate_user:
return self.user_factors[userid]
Y = self.item_factors
if self._YtY is None:
self._YtY = Y.T.dot(Y)
return user_factor(Y, self._YtY, user_items, userid,
return user_factor(self.item_factors, self.YtY,
user_items.tocsr(), userid,
self.regularization, self.factors)

def explain(self, userid, user_items, itemid, user_weights=None, N=10):
""" Returns the predicted rating for an user x item pair,
the explanation (the contribution from the top N items the user liked),
and a user latent factor weight that can be cached if you want to
get more than one explanation for the same user.
"""
# user_weights = Cholesky decomposition of Wu^-1
# from section 5 of the paper "Collaborative Filtering for Implicit Feedback Datasets"
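# In that paper's notation, the quantities computed below are, roughly:
#   W^u = (Yt Cu Y + regularization * I)^-1
#   s_u^ij = y_i^t W^u y_j
#   p_ui = sum_j c_uj * s_u^ij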
user_items = user_items.tocsr()
if user_weights is None:
A, _ = user_linear_equation(self.item_factors, self.YtY,
user_items, userid,
self.regularization, self.factors)
user_weights = scipy.linalg.cho_factor(A)
seed_item = self.item_factors[itemid]

# weighted_item = y_i^t W_u
weighted_item = scipy.linalg.cho_solve(user_weights, seed_item)

total_score = 0.0
h = []
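# keep a size-N min-heap of the largest contributions while summing the total predicted score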
for i, (itemid, confidence) in enumerate(nonzeros(user_items, userid)):
factor = self.item_factors[itemid]
# s_u^ij = (y_i^t W^u) y_j
score = weighted_item.dot(factor) * confidence
total_score += score
contribution = (score, itemid)
if i < N:
heapq.heappush(h, contribution)
else:
heapq.heappushpop(h, contribution)

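# the heap pops its smallest element first, so reverse to list contributions in descending score order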
items = (heapq.heappop(h) for i in range(len(h)))
top_contributions = list((i, s) for s, i in items)[::-1]
return total_score, top_contributions, user_weights

def similar_items(self, itemid, N=10):
""" Return the top N similar items for itemid. """
scores = self.item_factors.dot(self.item_factors[itemid]) / self.item_norms
@@ -129,6 +165,13 @@ def solver(self):
return _als.least_squares_cg if self.use_native else least_squares_cg
return _als.least_squares if self.use_native else least_squares

@property
def YtY(self):
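# lazily compute and cache YtY; fit() resets the cache whenever the item factors change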
if self._YtY is None:
Y = self.item_factors
self._YtY = Y.T.dot(Y)
return self._YtY


def alternating_least_squares(Ciu, factors, **kwargs):
""" factorizes the matrix Cui using an implicit alternating least squares
@@ -158,7 +201,10 @@ def least_squares(Cui, X, Y, regularization, num_threads=0):
X[u] = user_factor(Y, YtY, Cui, u, regularization, n_factors)


def user_factor(Y, YtY, Cui, u, regularization, n_factors):
def user_linear_equation(Y, YtY, Cui, u, regularization, n_factors):
# Xu = (YtCuY + regularization * I)^-1 (YtCuPu)
# YtCuY + regularization * I = YtY + regularization * I + Yt(Cu-I)Y

# accumulate YtCuY + regularization*I in A
A = YtY + regularization * np.eye(n_factors)

@@ -169,8 +215,12 @@ def user_factor(Y, YtY, Cui, u, regularization, n_factors):
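# (Cu - I) is zero for items the user has no interactions with, so only the
# nonzero entries of the user's row contribute to the loop below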
factor = Y[i]
A += (confidence - 1) * np.outer(factor, factor)
b += confidence * factor
return A, b


def user_factor(Y, YtY, Cui, u, regularization, n_factors):
# Xu = (YtCuY + regularization * I)^-1 (YtCuPu)
A, b = user_linear_equation(Y, YtY, Cui, u, regularization, n_factors)
return np.linalg.solve(A, b)


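For context, a minimal usage sketch of the new explain method (the toy data and parameter values below are illustrative only): fit a model, take the top recommendation for a user, explain it, and reuse the returned user_weights to explain a second item for the same user.

import numpy as np
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

# toy user x item matrix (rows = users, columns = items); values are confidences
user_items = csr_matrix([[1, 1, 0, 1, 0, 0],
                         [0, 1, 1, 1, 0, 0],
                         [1, 4, 1, 0, 7, 0],
                         [1, 1, 0, 0, 0, 0],
                         [9, 0, 4, 1, 0, 1]], dtype=np.float64)

model = AlternatingLeastSquares(factors=4, regularization=20, iterations=50)
model.fit(user_items.T)  # fit expects an item x user matrix

userid = 0
recs = model.recommend(userid, user_items, N=3)

# explain the top recommendation; W holds the cached Cholesky factor for this user
top_item, top_score = recs[0]
score, contributions, W = model.explain(userid, user_items, itemid=top_item)

# reuse W to explain a second recommendation for the same user more cheaply
second_item, _ = recs[1]
score2, contributions2, _ = model.explain(userid, user_items, itemid=second_item, user_weights=W, N=2)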
51 changes: 50 additions & 1 deletion tests/als_test.py
@@ -50,6 +50,7 @@ def test_factorize(self):
[0, 0, 1, 1, 0, 1],
[0, 1, 0, 0, 0, 1],
[0, 0, 0, 0, 1, 1]], dtype=np.float64)
user_items = counts * 2

# try all 8 variants of native/python, cg/cholesky, and
# 64 vs 32 bit factors
@@ -63,7 +64,7 @@
use_native=use_native,
use_cg=use_cg)
np.random.seed(23)
model.fit(counts * 2)
model.fit(user_items)
rows, cols = model.item_factors, model.user_factors

except Exception as e:
@@ -81,6 +82,54 @@ def test_factorize(self):
% (i, j, reconstructed[i, j], dtype, use_cg,
use_native))

def test_explain(self):
counts = csr_matrix([[1, 1, 0, 1, 0, 0],
[0, 1, 1, 1, 0, 0],
[1, 4, 1, 0, 7, 0],
[1, 1, 0, 0, 0, 0],
[9, 0, 4, 1, 0, 1],
[0, 1, 0, 0, 0, 1],
[0, 0, 2, 0, 1, 1]], dtype=np.float64)
user_items = counts * 2
item_users = user_items.T

model = AlternatingLeastSquares(factors=4,
regularization=20,
use_native=False,
use_cg=False,
iterations=100)
np.random.seed(23)
model.fit(user_items)

userid = 0

# Assert recommendations are the same if we recompute user vectors
recs = model.recommend(userid, item_users, N=10)
recalculated_recs = model.recommend(userid, item_users, N=10, recalculate_user=True)
for (item1, score1), (item2, score2) in zip(recs, recalculated_recs):
self.assertEqual(item1, item2)
self.assertAlmostEqual(score1, score2, 4)

# Assert explanation makes sense
top_rec, score = recalculated_recs[0]
score_explained, contributions, W = model.explain(userid, item_users, itemid=top_rec)
scores = [s for _, s in contributions]
items = [i for i, _ in contributions]
self.assertAlmostEqual(score, score_explained, 4)
self.assertAlmostEqual(score, sum(scores), 4)
self.assertEqual(scores, sorted(scores, reverse=True), "Scores not in order")
self.assertEqual([0, 2, 3, 4], sorted(items), "Items not seen by user")

# Assert explanation with precomputed user weights is correct
top_score_explained, top_contributions, W = model.explain(
userid, item_users, itemid=top_rec, user_weights=W, N=2)
top_scores = [s for _, s in top_contributions]
top_items = [i for i, _ in top_contributions]
self.assertEqual(2, len(top_contributions))
self.assertAlmostEqual(score, top_score_explained, 4)
self.assertEqual(scores[:2], top_scores)
self.assertEqual(items[:2], top_items)


if __name__ == "__main__":
unittest.main()