Add leave one out split to evaluation module. (#420)

* Added 'recommender_split' function to 'evaluation' module.
* Added docstrings, new tests.
* Fixed linting errors.
* Added 'recommender_split' function to 'evaluation' module.
* Added docstrings, new tests.
* Fixed linting errors.
* Improve `leave_k_out_split` performance, particularly for k > 1.
* Fixed linting issue.
* Update evaluation_test.py
* Remove auto-generated cpp & header files.
* Add back nearest neighbour header file.

Authored-by: Mark Douthwaite <mark@douthwaite.io>
benfred · Feb 10, 2021 · ba673a9 · ba673a9
1 parent 80f0e4d
commit ba673a9
Show file tree

Hide file tree

Showing 3 changed files with 260 additions and 5 deletions.
diff --git a/.gitignore b/.gitignore
@@ -73,3 +73,4 @@ target/
 .ipynb_checkpoints
 
 .vscode/
+.idea/
diff --git a/implicit/evaluation.pyx b/implicit/evaluation.pyx
@@ -3,7 +3,6 @@
 
 import cython
 import numpy as np
-from cython.operator import dereference
 from cython.parallel import parallel, prange
 from scipy.sparse import coo_matrix, csr_matrix
 from tqdm.auto import tqdm
@@ -53,6 +52,190 @@ def train_test_split(ratings, train_percentage=0.8, random_state=None):
     return train, test
 
 
+cdef _choose(rng, int n, float frac):
+    """Draw roughly a fraction 'frac' of the integers in range(n), _without_
+    replacement.
+
+    Parameters
+    ----------
+    rng : RandomState
+        The random number generator to draw from. Its 'choice' method is called
+        directly, so this must already be a generator object (e.g. a numpy
+        RandomState) rather than a seed.
+    n: int
+        The size of the population. Values are drawn from range(n).
+    frac: float
+        The fraction of the population to draw. The sample size is int(n * frac),
+        raised to 1 whenever that product truncates to zero.
+
+    Returns
+    -------
+    ndarray
+        An array of distinct, randomly sampled integers taken from range(n).
+
+    """
+
+    n_samples = int(n * frac)
+    if n_samples < 1:
+        # always draw at least one element, however small 'frac' is.
+        n_samples = 1
+    return rng.choice(n, size=n_samples, replace=False)
+
+
+cdef _take_tails(arr, int n, return_complement=False, shuffled=False):
+    """
+    Given an array of non-negative integer labels, take the indices of 'n'
+    occurrences of each distinct label: the last 'n' in sorted order (the 'tail'), or
+    'n' random ones when 'shuffled' is set.
+
+    Concretely, given an array of 25 integers in the range 0->4:
+
+    arr = [4 0 1 2 0 3 1 3 3 2 0 3 2 3 3 1 1 2 3 2 3 4 4 0 4]
+
+    Return the index of 'n' elements for each unique integer. For n=2, one possible
+    result with shuffled=True is:
+
+    idx = [ 4 23 16 15  9  3 18 20 21 24]
+
+    Such that:
+
+    arr[idx] = [0 0 1 1 2 2 3 3 4 4]
+
+    Parameters
+    ----------
+    arr: ndarray
+        The input array of non-negative integers (it is passed to 'np.bincount').
+        Labels do not have to be consecutive: values that never occur are ignored.
+    n: int
+        The number of elements to take per distinct label.
+    return_complement: bool
+        If True, also returns the complement (i.e. heads) of the tail indices.
+        Default is False (only returns tails).
+    shuffled: bool
+        If True, the occurrences of each label are randomly reordered before the tail
+        is taken, which randomises the elements selected. The noise comes from the
+        global 'np.random' state, so seed that if reproducibility is needed.
+
+    Returns
+    -------
+    output: ndarray or tuple
+        If 'return_complement' is False, an array of indices into 'arr' holding 'n'
+        entries per distinct label, grouped by ascending label. Otherwise a tuple
+        '(tails, heads)', where 'heads' holds every remaining index of 'arr'.
+
+    Raises
+    ------
+    ValueError
+        If some label occurs fewer than 'n' times.
+
+    """
+
+    order = arr.argsort()
+    sorted_arr = arr[order]
+
+    # occurrences per label; labels that never occur are dropped so that a gap in the
+    # label range cannot make two groups resolve to the same positions.
+    counts = np.bincount(sorted_arr)
+    counts = counts[counts > 0]
+    if counts.size and counts.min() < n:
+        raise ValueError("Every value in 'arr' must occur at least 'n' times.")
+
+    # position (in the sorted array) of the last occurrence of each label, then the
+    # 'n' positions ending there, laid out group by group.
+    last = counts.cumsum() - 1
+    offsets = np.arange(n - 1, -1, -1)
+    tails = (last[:, None] - offsets[None, :]).ravel()
+
+    if shuffled:
+        # adding noise in [0, 1) to integer labels reorders elements within a label
+        # without moving them across labels.
+        shuffled_idx = (sorted_arr + np.random.random(arr.shape)).argsort()
+        tails = shuffled_idx[tails]
+
+    if return_complement:
+        is_tail = np.zeros(len(order), dtype=bool)
+        is_tail[tails] = True
+        return order[tails], order[~is_tail]
+    return order[tails]
+
+
+cpdef leave_k_out_split(
+    ratings, int K=1, float train_only_size=0.0, random_state=None
+):
+    """Implements the 'leave-k-out' split protocol for a ratings matrix. Default
+    parameters will produce a 'leave-one-out' split.
+
+    This will create two matrices. Each eligible user (i.e. a user with more than
+    K + 1 ratings who has not been reserved for training) has K randomly chosen
+    ratings held in the test set, and all other ratings held in the train set. Every
+    rating of a non-eligible user is held in the train set. Optionally, a fraction of
+    users can be reserved to appear _only_ in the train set. By default, all eligible
+    users appear in the test set.
+
+    Parameters
+    ----------
+    ratings : csr_matrix
+        The input ratings matrix to be split. It is converted with 'tocoo()'.
+    K : int
+        The number of ratings per eligible user to be 'left out' in the test set.
+    train_only_size : float
+        The size (as a fraction) of the set of users with at least one rating that
+        should appear *only* in the training matrix.
+    random_state : int, None or RandomState
+        Passed to 'check_random_state'. Every random choice made by this function is
+        drawn from the resulting generator.
+
+    Returns
+    -------
+    (train, test) : csr_matrix, csr_matrix
+        A tuple of CSR matrices corresponding to training/testing matrices, both with
+        the shape and dtype of the input.
+
+    Raises
+    ------
+    ValueError
+        If 'K' is less than 1 or 'train_only_size' is outside [0.0, 1.0).
+
+    """
+
+    if K < 1:
+        raise ValueError("The 'K' must be >= 1.")
+    if not 0.0 <= train_only_size < 1.0:
+        raise ValueError("The 'train_only_size' must be in the range (0.0 <= x < 1.0).")
+
+    ratings = ratings.tocoo()  # this will sort row/cols unless ratings is COO.
+    random_state = check_random_state(random_state)
+
+    users = ratings.row
+    items = ratings.col
+    data = ratings.data
+
+    unique_users, counts = np.unique(users, return_counts=True)
+
+    # only users with more than K + 1 interactions may contribute to the test set.
+    candidate_mask = counts > K + 1
+
+    # keep a given subset of users _only_ in the training set. '_choose' returns
+    # positions into 'unique_users' (not user ids), so the mask is set by position.
+    if train_only_size > 0.0 and len(unique_users) > 0:
+        reserved = _choose(random_state, len(unique_users), train_only_size)
+        train_only_mask = np.zeros(len(unique_users), dtype=bool)
+        train_only_mask[reserved] = True
+        candidate_mask = candidate_mask & ~train_only_mask
+
+    # positions (in the COO arrays) of every interaction that belongs to a user who
+    # will appear in the test set.
+    full_candidate_mask = np.isin(users, unique_users[candidate_mask])
+    candidate_pos = np.flatnonzero(full_candidate_mask)
+
+    if len(candidate_pos) > 0:
+        # shuffle the candidate interactions with 'random_state' so that the K taken
+        # per user are random yet reproducible for a given seed.
+        shuffle = random_state.permutation(len(candidate_pos))
+        # relabel the remaining users as consecutive integers; filtering leaves gaps
+        # in the ids, and '_take_tails' expects consecutive labels.
+        _, user_codes = np.unique(users[candidate_pos[shuffle]], return_inverse=True)
+        test_idx = shuffle[_take_tails(user_codes, K)]
+    else:
+        test_idx = np.empty(0, dtype=np.intp)
+
+    # flag the held-out interactions; everything else stays in the training set.
+    in_test = np.zeros(len(users), dtype=bool)
+    in_test[candidate_pos[test_idx]] = True
+    in_train = ~in_test
+
+    # build test matrix
+    test_mat = csr_matrix(
+        (data[in_test], (users[in_test], items[in_test])),
+        shape=ratings.shape,
+        dtype=ratings.dtype,
+    )
+
+    # build training matrix
+    train_mat = csr_matrix(
+        (data[in_train], (users[in_train], items[in_train])),
+        shape=ratings.shape,
+        dtype=ratings.dtype,
+    )
+
+    return train_mat, test_mat
+
+
 @cython.boundscheck(False)
 def precision_at_k(model, train_user_items, test_user_items, int K=10,
                    show_progress=True, int num_threads=1):

diff --git a/tests/evaluation_test.py b/tests/evaluation_test.py
@@ -3,22 +3,93 @@
 import unittest
 
 import numpy as np
-from scipy.sparse import csr_matrix
+from scipy.sparse import csr_matrix, random
 
-from implicit.evaluation import train_test_split
+from implicit.evaluation import leave_k_out_split, train_test_split
 
 
 class EvaluationTest(unittest.TestCase):
+    """Tests for 'train_test_split' and 'leave_k_out_split'."""
+
-    def _get_sample_matrix(self):
+    @staticmethod
+    def _get_sample_matrix():
+        """Return a random 10x10 CSR matrix whose stored entries are all 1.0."""
         return csr_matrix((np.random.random((10, 10)) > 0.5).astype(np.float64))
 
-    def test_split(self):
+    @staticmethod
+    def _get_matrix():
+        """Return a random 100x100 float32 matrix of density 0.5, in COO format."""
+        mat = random(100, 100, density=0.5, format="csr", dtype=np.float32)
+        return mat.tocoo()
+
+    def test_train_test_split(self):
+        """
+        Test that two splits made with the same seed give the same train matrix.
+        """
         seed = np.random.randint(1000)
         mat = self._get_sample_matrix()
         train, test = train_test_split(mat, 0.8, seed)
         train2, test2 = train_test_split(mat, 0.8, seed)
         self.assertTrue(np.all(train.todense() == train2.todense()))
 
+    def test_leave_k_out_returns_correct_shape(self):
+        """
+        Test that the output matrices are of the same shape as the input matrix.
+        """
+
+        mat = self._get_matrix()
+        train, test = leave_k_out_split(mat, K=1)
+        self.assertTrue(train.shape == mat.shape)
+        self.assertTrue(test.shape == mat.shape)
+
+    def test_leave_k_out_outputs_produce_input(self):
+        """
+        Test that the sum of the output matrices is equal to the input matrix (i.e.
+        that summing the output matrices produces the input matrix).
+        """
+
+        mat = self._get_matrix()
+        train, test = leave_k_out_split(mat, K=1)
+        self.assertTrue(((train + test) - mat).nnz == 0)
+
+    def test_leave_k_split_is_reservable(self):
+        """
+        Test that the input, train and test matrices all have a positive sum, and
+        that the sum of the train and test set equals the input.
+        """
+
+        mat = self._get_matrix()
+        train, test = leave_k_out_split(mat, K=1)
+
+        # check all matrices are positive, non-zero
+        self.assertTrue(mat.sum() > 0)
+        self.assertTrue(test.sum() > 0)
+        self.assertTrue(train.sum() > 0)
+
+        # check sum of train + test = input
+        self.assertTrue(((train + test) - mat).nnz == 0)
+
+    def test_leave_k_out_gets_correct_train_only_shape(self):
+        """Test that the correct number of users appear *only* in the train set."""
+
+        mat = self._get_matrix()
+        train, test = leave_k_out_split(mat, K=1, train_only_size=0.8)
+        # users that have rows in the train matrix but none in the test matrix
+        train_only = ~np.isin(np.unique(train.tocoo().row), test.tocoo().row)
+        self.assertTrue(train_only.sum() == int(train.shape[0] * 0.8))
+
+    def test_leave_k_out_raises_error_for_k_less_than_zero(self):
+        """
+        Test that an error is raised when K < 1 (K=0 is used here).
+        """
+
+        self.assertRaises(ValueError, leave_k_out_split, None, K=0)
+
+    def test_leave_k_out_raises_error_for_invalid_train_only_size_lower_bound(self):
+        """
+        Test that an error is raised when train_only_size < 0.
+        """
+
+        self.assertRaises(ValueError, leave_k_out_split, None, K=1, train_only_size=-1.0)
+
+    def test_leave_k_out_raises_error_for_invalid_train_only_size_upper_bound(self):
+        """
+        Test that an error is raised when train_only_size >= 1.
+        """
+
+        self.assertRaises(ValueError, leave_k_out_split, None, K=1, train_only_size=1.0)
+
 
 if __name__ == "__main__":
     unittest.main()