Fix ordering in .recommend for KNN models (#629)

#482 introduced a bug where the output of the knn models wasn't strictly ordered by score. Fix this and add a test that would have caught this bug
benfred · Nov 21, 2022 · 4254b6a · 4254b6a
1 parent ba87a1c
commit 4254b6a
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 2 deletions.
diff --git a/.pylintrc b/.pylintrc
@@ -17,7 +17,6 @@ disable=fixme,
     trailing-whitespace,
     invalid-name,
     import-error,
-    no-self-use,
 
     # disable code-complexity check
     too-many-function-args,

diff --git a/implicit/_nearest_neighbours.pyx b/implicit/_nearest_neighbours.pyx
@@ -5,11 +5,14 @@ import warnings
 import cython
 import numpy as np
 import scipy.sparse
+
 from cython cimport floating, integral
+
 from cython.operator import dereference
 from cython.parallel import parallel, prange
 
 from libcpp cimport bool
+from libcpp.algorithm cimport sort_heap
 from libcpp.utility cimport pair
 from libcpp.vector cimport vector
 
@@ -18,10 +21,16 @@ from tqdm.auto import tqdm
 from implicit.utils import check_csr
 
 
+cdef extern from "<functional>" namespace "std" nogil:
+    cdef cppclass greater[T=*]:
+        greater() except +
+
+
 cdef extern from "implicit/nearest_neighbours.h" namespace "implicit" nogil:
     cdef cppclass TopK[Index, Value]:
         TopK(size_t K)
         vector[pair[Value, Index]] results
+        greater[pair[Value, Index]] heap_order
 
     cdef cppclass SparseMatrixMultiplier[Index, Value]:
         SparseMatrixMultiplier(Index item_count)
@@ -83,6 +92,7 @@ cdef class NearestNeighboursScorer(object):
             indices = ret_indices = np.zeros(count, dtype=np.int32)
             data = ret_data = np.zeros(count)
 
+            sort_heap(topK.results.begin(), topK.results.end(), topK.heap_order)
             with nogil:
                 i = 0
                 for result in topK.results:
@@ -143,6 +153,7 @@ def all_pairs_knn(users, unsigned int K=100, int num_threads=0, show_progress=Tr
                 neighbours.foreach(dereference(topk))
 
                 index2 = K * i
+
                 for result in topk.results:
                     rows[index2] = i
                     cols[index2] = result.second

diff --git a/tests/recommender_base_test.py b/tests/recommender_base_test.py
@@ -7,12 +7,14 @@
 import numpy as np
 import pytest
 from numpy.testing import assert_array_equal
-from scipy.sparse import csr_matrix
+from scipy.sparse import coo_matrix, csr_matrix
 
 from implicit.evaluation import precision_at_k
 from implicit.nearest_neighbours import ItemItemRecommender
 from implicit.utils import ParameterWarning
 
+# pylint: disable=too-many-public-methods
+
 
 def get_checker_board(X):
     """Returns a 'checkerboard' matrix: where every even userid has liked
@@ -97,6 +99,7 @@ def test_recommend_batch(self):
         ids, scores = model.recommend(
             userids, user_items[userids], N=5, filter_already_liked_items=False
         )
+
         for userid in range(50):
             ids_user, scores_user = model.recommend(
                 userid, user_items[userid], N=5, filter_already_liked_items=False
@@ -310,6 +313,26 @@ def test_fit_non_csr_matrix(self):
         with pytest.warns(ParameterWarning):
             model.fit(user_items.tolil(), show_progress=False)
 
+    def test_fit_ordering(self):
+        # models should return scores that are decreasing in value
+        samples = 1000
+        user_count = 100
+        item_count = 200
+
+        rng = np.random.RandomState(10)
+        itemids = rng.randint(0, item_count, size=samples, dtype=np.int32)
+        userids = rng.randint(0, user_count, size=samples, dtype=np.int32)
+
+        likes = coo_matrix((np.ones(samples), (userids, itemids))).tocsr()
+
+        model = self._get_model()
+
+        model.fit(likes, show_progress=False)
+        for userid in userids:
+            ids, scores = model.recommend(userid, likes[userid])
+            print(ids, scores)
+            assert np.all(np.diff(scores) <= 0)
+
     def test_dtype(self):
         # models should be able to accept input of either float32 or float64
         item_users = get_checker_board(50)