Skip to content

Commit

Permalink
Add leave one out split to evaluation module. (#420)
Browse files Browse the repository at this point in the history
* Added 'recommender_split' function to 'evaluation' module.

* Added docstrings, new tests.

* Fixed linting errors.

* Added 'recommender_split' function to 'evaluation' module.

* Added docstrings, new tests.

* Fixed linting errors.

* Improve `leave_k_out_split` performance, particularly for k > 1.

* Fixed linting issue.

* Update evaluation_test.py

* Remove auto-generated cpp & header files.

* Add back nearest neighbour header file.

Authored-by: Mark Douthwaite <mark@douthwaite.io>
  • Loading branch information
markdouthwaite committed Feb 10, 2021
1 parent 80f0e4d commit ba673a9
Show file tree
Hide file tree
Showing 3 changed files with 260 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,4 @@ target/
.ipynb_checkpoints

.vscode/
.idea/
185 changes: 184 additions & 1 deletion implicit/evaluation.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import cython
import numpy as np
from cython.operator import dereference
from cython.parallel import parallel, prange
from scipy.sparse import coo_matrix, csr_matrix
from tqdm.auto import tqdm
Expand Down Expand Up @@ -53,6 +52,190 @@ def train_test_split(ratings, train_percentage=0.8, random_state=None):
return train, test


cdef _choose(rng, int n, float frac):
    """Draw roughly ``frac`` of the integers in ``range(n)`` without replacement.

    Parameters
    ----------
    rng : RandomState-like
        Object exposing ``choice(n, size=..., replace=...)``. It is used as-is:
        no seeding or conversion from ``None``/``int`` happens in this helper.
    n : int
        Exclusive upper bound of the range to sample from.
    frac : float
        Fraction of the range to draw.

    Returns
    -------
    ndarray
        The result of ``rng.choice`` for ``max(1, int(n * frac))`` values drawn
        from ``range(n)`` without replacement.
    """
    sample_size = int(n * frac)
    if sample_size < 1:
        # Always draw at least one value, however small the fraction is.
        sample_size = 1
    return rng.choice(n, size=sample_size, replace=False)


cdef _take_tails(arr, int n, return_complement=False, shuffled=False):
    """Return the indices of ``n`` occurrences (the "tail") of every distinct
    integer in ``arr``.

    The array is sorted by value and, for each distinct value, the last ``n``
    positions of its run in the sorted order are selected. Concretely, given an
    array of 25 integers in the range 0->4:

        arr = [4 0 1 2 0 3 1 3 3 2 0 3 2 3 3 1 1 2 3 2 3 4 4 0 4]

    a call with ``n=2`` returns 10 indices ``idx`` such that:

        arr[idx] = [0 0 1 1 2 2 3 3 4 4]

    Parameters
    ----------
    arr : ndarray
        1-D array of non-negative integers (a requirement of ``np.bincount``).
        The values do not have to be consecutive: integers that never occur are
        ignored.
    n : int
        The number of elements to take per distinct value. No check is made that
        every value occurs at least ``n`` times; if one does not, the selection
        spills over into neighbouring groups or raises an IndexError.
    return_complement : bool
        If True, also return the indices that were *not* selected (the heads).
        Default is False (only the tails are returned).
    shuffled : bool
        If True, the order inside each group of equal values is randomised
        before the tail is taken, so the selected occurrences are random.
        The noise is drawn from numpy's global ``np.random`` state.

    Returns
    -------
    output : ndarray or tuple of ndarray
        The tail indices into ``arr``, grouped by value in ascending order. If
        ``return_complement`` is True, a tuple ``(tails, heads)``.
    """

    idx = arr.argsort()
    sorted_arr = arr[idx]

    # Position (in the sorted order) of the last element of each group. Bins
    # with a zero count are dropped: keeping them would repeat the previous
    # group's end (or wrap around to -1) and yield duplicate tail indices.
    counts = np.bincount(sorted_arr)
    end = counts[counts > 0].cumsum() - 1

    # Row i holds, for every group, the position ``end - (n - 1 - i)``; the
    # column-major ravel therefore lists each group's n tail positions together.
    offsets = np.arange(n - 1, -1, -1)
    ranges = end[np.newaxis, :] - offsets[:, np.newaxis]
    positions = np.ravel(ranges, order="F")

    if shuffled:
        # Noise in [0, 1) keeps the groups in order (values are integers) while
        # randomly permuting the elements inside every group.
        shuffled_idx = (sorted_arr + np.random.random(arr.shape)).argsort()
        tails = shuffled_idx[positions]
    else:
        tails = positions

    if not return_complement:
        return idx[tails]

    # Heads are all sorted positions that were not picked as tails.
    is_tail = np.zeros(len(idx), dtype=bool)
    is_tail[tails] = True
    heads = np.flatnonzero(~is_tail)
    return idx[tails], idx[heads]


cpdef leave_k_out_split(
    ratings, int K=1, float train_only_size=0.0, random_state=None
):
    """Implements the 'leave-k-out' split protocol for a ratings matrix. Default
    parameters will produce a 'leave-one-out' split.

    This will create two matrices: each eligible user (i.e. a user with more than
    K + 1 ratings) has K randomly chosen ratings held in the test set, and all
    other ratings held in the train set. Optionally, a fraction of the users can
    be reserved to appear _only_ in the train set. By default, all eligible users
    may appear in the test set.

    Parameters
    ----------
    ratings : csr_matrix
        The input ratings matrix to be split (anything exposing ``tocoo()``).
    K : int
        The number of ratings per eligible user to be 'left out' in the test set.
    train_only_size : float
        The size (as a fraction) of the users set that should appear *only* in the
        training matrix.
    random_state : int, None or RandomState
        Passed through ``check_random_state``; the resulting generator drives both
        the choice of train-only users and the choice of held-out ratings.

    Returns
    -------
    (train, test) : csr_matrix, csr_matrix
        A tuple of CSR matrices corresponding to training/testing matrices, both
        with the shape and dtype of the input.

    Raises
    ------
    ValueError
        If ``K < 1`` or ``train_only_size`` is outside ``[0.0, 1.0)``.
    """

    if K < 1:
        raise ValueError("The 'K' must be >= 1.")
    if not 0.0 <= train_only_size < 1.0:
        raise ValueError("The 'train_only_size' must be in the range (0.0 <= x < 1.0).")

    ratings = ratings.tocoo()  # this will sort row/cols unless ratings is COO.
    random_state = check_random_state(random_state)

    users = ratings.row
    items = ratings.col
    data = ratings.data

    unique_users, counts = np.unique(users, return_counts=True)

    # only users with more than K + 1 interactions are eligible for the test set
    candidate_mask = counts > K + 1

    # keep a given subset of users _only_ in the training set.
    if train_only_size > 0.0 and len(unique_users) > 0:
        # _choose returns *positions* into unique_users (values in range(len)),
        # not user ids, so the mask is updated by position. Comparing positions
        # against ids would reserve the wrong users whenever some rows are empty.
        train_only_positions = _choose(
            random_state, len(unique_users), train_only_size
        )
        candidate_mask[train_only_positions] = False

    # flag every rating that belongs to a user eligible for the test set
    full_candidate_mask = np.isin(users, unique_users[candidate_mask])

    # get all users, items and ratings that match specified requirements to be
    # included in test set.
    candidate_users = users[full_candidate_mask]
    candidate_items = items[full_candidate_mask]
    candidate_data = data[full_candidate_mask]

    # Shuffle the candidate ratings with the caller's generator so that the
    # held-out ratings are random yet reproducible for a given random_state.
    # NOTE(review): assumes check_random_state returns an object exposing
    # ``permutation`` (e.g. numpy RandomState) -- confirm.
    order = random_state.permutation(len(candidate_users))
    candidate_users = candidate_users[order]
    candidate_items = candidate_items[order]
    candidate_data = candidate_data[order]

    # _take_tails documents that its input must hold consecutive integers;
    # candidate user ids generally have gaps, so map them to dense codes first.
    _, user_codes = np.unique(candidate_users, return_inverse=True)
    test_idx = _take_tails(user_codes, K, shuffled=False)

    # every candidate rating not held out goes back to the training set
    in_test = np.zeros(len(candidate_users), dtype=bool)
    in_test[test_idx] = True
    in_train = ~in_test

    # build test matrix
    test_mat = csr_matrix(
        (
            candidate_data[in_test],
            (candidate_users[in_test], candidate_items[in_test]),
        ),
        shape=ratings.shape,
        dtype=ratings.dtype,
    )

    # build training matrix: ratings of non-candidate users plus the remaining
    # ratings of candidate users
    train_users = np.r_[users[~full_candidate_mask], candidate_users[in_train]]
    train_items = np.r_[items[~full_candidate_mask], candidate_items[in_train]]
    train_data = np.r_[data[~full_candidate_mask], candidate_data[in_train]]
    train_mat = csr_matrix(
        (train_data, (train_users, train_items)),
        shape=ratings.shape,
        dtype=ratings.dtype,
    )

    return train_mat, test_mat


@cython.boundscheck(False)
def precision_at_k(model, train_user_items, test_user_items, int K=10,
show_progress=True, int num_threads=1):
Expand Down
79 changes: 75 additions & 4 deletions tests/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,93 @@
import unittest

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import csr_matrix, random

from implicit.evaluation import train_test_split
from implicit.evaluation import leave_k_out_split, train_test_split


class EvaluationTest(unittest.TestCase):
    """Tests for ``train_test_split`` and ``leave_k_out_split``."""

    @staticmethod
    def _get_sample_matrix():
        """Return a random 10x10 CSR matrix of 0.0/1.0 values (float64)."""
        return csr_matrix((np.random.random((10, 10)) > 0.5).astype(np.float64))

    @staticmethod
    def _get_matrix():
        """Return a random 100x100 float32 matrix of density 0.5 in COO format."""
        mat = random(100, 100, density=0.5, format="csr", dtype=np.float32)
        return mat.tocoo()

    def test_train_test_split(self):
        """
        Test that splitting twice with the same seed yields the same train matrix.
        """

        seed = np.random.randint(1000)
        mat = self._get_sample_matrix()
        train, test = train_test_split(mat, 0.8, seed)
        train2, test2 = train_test_split(mat, 0.8, seed)
        self.assertTrue(np.all(train.todense() == train2.todense()))

    def test_leave_k_out_returns_correct_shape(self):
        """
        Test that the output matrices are of the same shape as the input matrix.
        """

        mat = self._get_matrix()
        train, test = leave_k_out_split(mat, K=1)
        self.assertEqual(train.shape, mat.shape)
        self.assertEqual(test.shape, mat.shape)

    def test_leave_k_out_outputs_produce_input(self):
        """
        Test that the sum of the output matrices is equal to the input matrix (i.e.
        that summing the output matrices produces the input matrix).
        """

        mat = self._get_matrix()
        train, test = leave_k_out_split(mat, K=1)
        self.assertEqual(((train + test) - mat).nnz, 0)

    def test_leave_k_split_is_reservable(self):
        """
        Test that the split is reversible: the train, test and input matrices are
        all non-empty, and the train and test sets sum back to the input.
        """

        mat = self._get_matrix()
        train, test = leave_k_out_split(mat, K=1)

        # check all matrices are positive, non-zero
        self.assertGreater(mat.sum(), 0)
        self.assertGreater(test.sum(), 0)
        self.assertGreater(train.sum(), 0)

        # check sum of train + test = input
        self.assertEqual(((train + test) - mat).nnz, 0)

    def test_leave_k_out_gets_correct_train_only_shape(self):
        """Test that the correct number of users appear *only* in the train set."""

        mat = self._get_matrix()
        train, test = leave_k_out_split(mat, K=1, train_only_size=0.8)
        train_only = ~np.isin(np.unique(train.tocoo().row), test.tocoo().row)
        self.assertEqual(train_only.sum(), int(train.shape[0] * 0.8))

    def test_leave_k_out_raises_error_for_k_less_than_zero(self):
        """
        Test that an error is raised when K < 1 (exercised here with K=0).
        """

        # The ratings argument is None: validation is expected to fail first.
        with self.assertRaises(ValueError):
            leave_k_out_split(None, K=0)

    def test_leave_k_out_raises_error_for_invalid_train_only_size_lower_bound(self):
        """
        Test that an error is raised when train_only_size < 0.
        """

        with self.assertRaises(ValueError):
            leave_k_out_split(None, K=1, train_only_size=-1.0)

    def test_leave_k_out_raises_error_for_invalid_train_only_size_upper_bound(self):
        """
        Test that an error is raised when train_only_size >= 1.
        """

        with self.assertRaises(ValueError):
            leave_k_out_split(None, K=1, train_only_size=1.0)


# Run the unittest runner when this test module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit ba673a9

Please sign in to comment.