-
Notifications
You must be signed in to change notification settings - Fork 95
/
_subsample.pyx
71 lines (58 loc) · 2.2 KB
/
_subsample.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -----------------------------------------------------------------------------
# Copyright (c) 2011-2017, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------
import numpy as np
cimport numpy as cnp
def _subsample(arr, n, with_replacement, rng):
"""Subsample non-zero values of a sparse array
Parameters
----------
arr : {csr_matrix, csc_matrix}
A 1xM sparse vector
n : int
Number of items to subsample from `arr`
with_replacement : bool
Whether to permute or use multinomial sampling
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng
Returns
-------
ndarray
Subsampled data
Notes
-----
This code was adapted from scikit-bio (`skbio.math._subsample`)
"""
cdef:
cnp.int64_t counts_sum
cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
cnp.ndarray[cnp.int64_t, ndim=1] data_i = arr.data.astype(np.int64)
cnp.ndarray[cnp.float64_t, ndim=1] result
cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked, r
cnp.float64_t cnt
Py_ssize_t i, j, length
for i in range(indptr.shape[0] - 1):
start, end = indptr[i], indptr[i+1]
length = end - start
counts_sum = data[start:end].sum()
if with_replacement:
pvals = data[start:end] / counts_sum
data[start:end] = rng.multinomial(n, pvals)
else:
if counts_sum < n:
data[start:end] = 0
continue
r = np.arange(length, dtype=np.int32)
unpacked = np.repeat(r, data_i[start:end])
permuted = rng.permutation(unpacked)[:n]
result = np.zeros(length, dtype=np.float64)
for idx in range(permuted.shape[0]):
result[permuted[idx]] += 1
data[start:end] = result