Skip to content

Commit

Permalink
Fix ordering in .recommend for KNN models (#629)
Browse files Browse the repository at this point in the history
#482 introduced a bug
where the output of the knn models wasn't strictly ordered by score.

Fix this and add a test that would have caught this bug
  • Loading branch information
benfred committed Nov 21, 2022
1 parent ba87a1c commit 4254b6a
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 2 deletions.
1 change: 0 additions & 1 deletion .pylintrc
Expand Up @@ -17,7 +17,6 @@ disable=fixme,
trailing-whitespace,
invalid-name,
import-error,
no-self-use,

# disable code-complexity check
too-many-function-args,
Expand Down
11 changes: 11 additions & 0 deletions implicit/_nearest_neighbours.pyx
Expand Up @@ -5,11 +5,14 @@ import warnings
import cython
import numpy as np
import scipy.sparse

from cython cimport floating, integral

from cython.operator import dereference
from cython.parallel import parallel, prange

from libcpp cimport bool
from libcpp.algorithm cimport sort_heap
from libcpp.utility cimport pair
from libcpp.vector cimport vector

Expand All @@ -18,10 +21,16 @@ from tqdm.auto import tqdm
from implicit.utils import check_csr


cdef extern from "<functional>" namespace "std" nogil:
cdef cppclass greater[T=*]:
greater() except +


cdef extern from "implicit/nearest_neighbours.h" namespace "implicit" nogil:
cdef cppclass TopK[Index, Value]:
TopK(size_t K)
vector[pair[Value, Index]] results
greater[pair[Value, Index]] heap_order

cdef cppclass SparseMatrixMultiplier[Index, Value]:
SparseMatrixMultiplier(Index item_count)
Expand Down Expand Up @@ -83,6 +92,7 @@ cdef class NearestNeighboursScorer(object):
indices = ret_indices = np.zeros(count, dtype=np.int32)
data = ret_data = np.zeros(count)

sort_heap(topK.results.begin(), topK.results.end(), topK.heap_order)
with nogil:
i = 0
for result in topK.results:
Expand Down Expand Up @@ -143,6 +153,7 @@ def all_pairs_knn(users, unsigned int K=100, int num_threads=0, show_progress=Tr
neighbours.foreach(dereference(topk))

index2 = K * i

for result in topk.results:
rows[index2] = i
cols[index2] = result.second
Expand Down
25 changes: 24 additions & 1 deletion tests/recommender_base_test.py
Expand Up @@ -7,12 +7,14 @@
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix, csr_matrix

from implicit.evaluation import precision_at_k
from implicit.nearest_neighbours import ItemItemRecommender
from implicit.utils import ParameterWarning

# pylint: disable=too-many-public-methods


def get_checker_board(X):
"""Returns a 'checkerboard' matrix: where every even userid has liked
Expand Down Expand Up @@ -97,6 +99,7 @@ def test_recommend_batch(self):
ids, scores = model.recommend(
userids, user_items[userids], N=5, filter_already_liked_items=False
)

for userid in range(50):
ids_user, scores_user = model.recommend(
userid, user_items[userid], N=5, filter_already_liked_items=False
Expand Down Expand Up @@ -310,6 +313,26 @@ def test_fit_non_csr_matrix(self):
with pytest.warns(ParameterWarning):
model.fit(user_items.tolil(), show_progress=False)

def test_fit_ordering(self):
# models should return scores that are decreasing in value
samples = 1000
user_count = 100
item_count = 200

rng = np.random.RandomState(10)
itemids = rng.randint(0, item_count, size=samples, dtype=np.int32)
userids = rng.randint(0, user_count, size=samples, dtype=np.int32)

likes = coo_matrix((np.ones(samples), (userids, itemids))).tocsr()

model = self._get_model()

model.fit(likes, show_progress=False)
for userid in userids:
ids, scores = model.recommend(userid, likes[userid])
print(ids, scores)
assert np.all(np.diff(scores) <= 0)

def test_dtype(self):
# models should be able to accept input of either float32 or float64
item_users = get_checker_board(50)
Expand Down

0 comments on commit 4254b6a

Please sign in to comment.