Merge pull request #70 from bashtage/relax-sign-test

BUG: Correct handling of nans in distributions
bashtage · Apr 10, 2019 · fea5a6c · fea5a6c
2 parents 5d0d8fd + b731988
commit fea5a6c
Show file tree

Hide file tree

Showing 14 changed files with 749 additions and 594 deletions.
diff --git a/README.rst b/README.rst
diff --git a/doc/source/change-log.rst b/doc/source/change-log.rst
@@ -1,6 +1,11 @@
 Change Log
 ----------
 
+Since v1.16.2
+=============
+- Added broadcasting to multinomial (see
+  `NumPy issue 9710 <https://github.com/numpy/numpy/pull/9710>`_)
+
 v1.16.2
 =======
 - Updated Xoroshiro120 to use AUthor's latest parameterization

diff --git a/randomgen/common.pxd b/randomgen/common.pxd
@@ -16,6 +16,7 @@ cdef enum ConstraintType:
     CONS_NONE
     CONS_NON_NEGATIVE
     CONS_POSITIVE
+    CONS_POSITIVE_NOT_NAN
     CONS_BOUNDED_0_1
     CONS_BOUNDED_0_1_NOTNAN
     CONS_BOUNDED_GT_0_1

diff --git a/randomgen/common.pyx b/randomgen/common.pyx
@@ -297,52 +297,60 @@ cdef uint64_t MAXSIZE = <uint64_t>sys.maxsize
 
 cdef int check_array_constraint(np.ndarray val, object name, constraint_type cons) except -1:
     if cons == CONS_NON_NEGATIVE:
-        if np.any(np.signbit(val)) or np.any(np.isnan(val)):
+        if np.any(np.logical_and(np.logical_not(np.isnan(val)), np.signbit(val))):
             raise ValueError(name + " < 0")
-    elif cons == CONS_POSITIVE:
-        if not np.all(np.greater(val, 0)):
+    elif cons == CONS_POSITIVE or cons == CONS_POSITIVE_NOT_NAN:
+        if cons == CONS_POSITIVE_NOT_NAN and np.any(np.isnan(val)):
+            raise ValueError(name + " must not be NaN")
+        elif np.any(np.less_equal(val, 0)):
             raise ValueError(name + " <= 0")
     elif cons == CONS_BOUNDED_0_1:
         if not np.all(np.greater_equal(val, 0)) or \
                 not np.all(np.less_equal(val, 1)):
-            raise ValueError(name + " < 0 or " + name + " > 1")
+            raise ValueError("{0} < 0 , {0} > 1 or {0} contains NaNs".format(name))
     elif cons == CONS_BOUNDED_GT_0_1:
         if not np.all(np.greater(val, 0)) or not np.all(np.less_equal(val, 1)):
-            raise ValueError(name + " <= 0 or " + name + " > 1")
+            raise ValueError("{0} <= 0 , {0} > 1 or {0} contains NaNs".format(name))
     elif cons == CONS_GT_1:
         if not np.all(np.greater(val, 1)):
-            raise ValueError(name + " <= 1")
+            raise ValueError("{0} <= 1 or {0} contains NaNs".format(name))
     elif cons == CONS_GTE_1:
         if not np.all(np.greater_equal(val, 1)):
-            raise ValueError(name + " < 1")
+            raise ValueError("{0} < 1 or {0} contains NaNs".format(name))
     elif cons == CONS_POISSON:
         if not np.all(np.less_equal(val, POISSON_LAM_MAX)):
-            raise ValueError(name + " value too large")
-        if not np.all(np.greater_equal(val, 0.0)):
-            raise ValueError(name + " < 0")
+            raise ValueError("{0} value too large".format(name))
+        elif not np.all(np.greater_equal(val, 0.0)):
+            raise ValueError("{0} < 0 or {0} contains NaNs".format(name))
 
     return 0
 
 
 cdef int check_constraint(double val, object name, constraint_type cons) except -1:
+    cdef bint is_nan
     if cons == CONS_NON_NEGATIVE:
-        if np.signbit(val) or np.isnan(val):
+        if not np.isnan(val) and np.signbit(val):
             raise ValueError(name + " < 0")
-    elif cons == CONS_POSITIVE:
-        if not (val > 0):
+    elif cons == CONS_POSITIVE or cons == CONS_POSITIVE_NOT_NAN:
+        if cons == CONS_POSITIVE_NOT_NAN and np.isnan(val):
+            raise ValueError(name + " must not be NaN")
+        elif val <= 0:
             raise ValueError(name + " <= 0")
     elif cons == CONS_BOUNDED_0_1:
         if not (val >= 0) or not (val <= 1):
-            raise ValueError(name + " < 0 or " + name + " > 1")
+            raise ValueError("{0} < 0 , {0} > 1 or {0} is NaN".format(name))
+    elif cons == CONS_BOUNDED_GT_0_1:
+        if not val >0 or not val <= 1:
+            raise ValueError("{0} <= 0 , {0} > 1 or {0} contains NaNs".format(name))
     elif cons == CONS_GT_1:
         if not (val > 1):
-            raise ValueError(name + " <= 1")
+            raise ValueError("{0} <= 1 or {0} is NaN".format(name))
     elif cons == CONS_GTE_1:
         if not (val >= 1):
-            raise ValueError(name + " < 1")
+            raise ValueError("{0} < 1 or {0} is NaN".format(name))
     elif cons == CONS_POISSON:
         if not (val >= 0):
-            raise ValueError(name + " < 0")
+            raise ValueError("{0} < 0 or {0} is NaN".format(name))
         elif not (val <= POISSON_LAM_MAX):
             raise ValueError(name + " value too large")
 

diff --git a/randomgen/distributions.pxd b/randomgen/distributions.pxd
@@ -144,3 +144,5 @@ cdef extern from "src/distributions/distributions.h":
                                   np.npy_bool off, np.npy_bool rng, np.npy_intp cnt,
                                   bint use_masked,
                                   np.npy_bool *out) nogil
+    void random_multinomial(brng_t *brng_state, int64_t n, int64_t *mnix,
+                            double *pix, np.npy_intp d, binomial_t *binomial) nogil
diff --git a/randomgen/generator.pyx b/randomgen/generator.pyx
@@ -25,7 +25,6 @@ import randomgen.pickle
 
 np.import_array()
 
-
 cdef class RandomGenerator:
     """
     RandomGenerator(brng=None)
@@ -908,7 +907,7 @@ cdef class RandomGenerator:
         Parameters
         ----------
         d0, d1, ..., dn : int, optional
-            The dimensions of the returned array, should all be positive.
+            The dimensions of the returned array, must be non-negative.
             If no argument is given a single Python float is returned.
         dtype : {str, dtype}, optional
             Desired dtype of the result, either 'd' (or 'float64') or 'f'
@@ -958,7 +957,7 @@ cdef class RandomGenerator:
         Parameters
         ----------
         d0, d1, ..., dn : int, optional
-            The dimensions of the returned array, should be all positive.
+            The dimensions of the returned array, must be non-negative.
             If no argument is given a single Python float is returned.
         dtype : {str, dtype}, optional
             Desired dtype of the result, either 'd' (or 'float64') or 'f'
@@ -1447,7 +1446,7 @@ cdef class RandomGenerator:
 
         Samples are drawn from an F distribution with specified parameters,
         `dfnum` (degrees of freedom in numerator) and `dfden` (degrees of
-        freedom in denominator), where both parameters should be greater than
+        freedom in denominator), where both parameters must be greater than
         zero.
 
         The random variate of the F distribution (also known as the
@@ -1458,9 +1457,9 @@ cdef class RandomGenerator:
         Parameters
         ----------
         dfnum : float or array_like of floats
-            Degrees of freedom in numerator, should be > 0.
+            Degrees of freedom in numerator, must be > 0.
         dfden : float or array_like of float
-            Degrees of freedom in denominator, should be > 0.
+            Degrees of freedom in denominator, must be > 0.
         size : int or tuple of ints, optional
             Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
             ``m * n * k`` samples are drawn.  If size is ``None`` (default),
@@ -1541,15 +1540,15 @@ cdef class RandomGenerator:
         Parameters
         ----------
         dfnum : float or array_like of floats
-            Numerator degrees of freedom, should be > 0.
+            Numerator degrees of freedom, must be > 0.
 
             .. versionchanged:: 1.14.0
                Earlier NumPy versions required dfnum > 1.
         dfden : float or array_like of floats
-            Denominator degrees of freedom, should be > 0.
+            Denominator degrees of freedom, must be > 0.
         nonc : float or array_like of floats
             Non-centrality parameter, the sum of the squares of the numerator
-            means, should be >= 0.
+            means, must be >= 0.
         size : int or tuple of ints, optional
             Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
             ``m * n * k`` samples are drawn.  If size is ``None`` (default),
@@ -1618,7 +1617,7 @@ cdef class RandomGenerator:
         Parameters
         ----------
         df : float or array_like of floats
-             Number of degrees of freedom, should be > 0.
+             Number of degrees of freedom, must be > 0.
         size : int or tuple of ints, optional
             Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
             ``m * n * k`` samples are drawn.  If size is ``None`` (default),
@@ -1684,12 +1683,12 @@ cdef class RandomGenerator:
         Parameters
         ----------
         df : float or array_like of floats
-            Degrees of freedom, should be > 0.
+            Degrees of freedom, must be > 0.
 
             .. versionchanged:: 1.10.0
                Earlier NumPy versions required dfnum > 1.
         nonc : float or array_like of floats
-            Non-centrality, should be non-negative.
+            Non-centrality, must be non-negative.
         size : int or tuple of ints, optional
             Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
             ``m * n * k`` samples are drawn.  If size is ``None`` (default),
@@ -1830,7 +1829,7 @@ cdef class RandomGenerator:
         Parameters
         ----------
         df : float or array_like of floats
-            Degrees of freedom, should be > 0.
+            Degrees of freedom, must be > 0.
         size : int or tuple of ints, optional
             Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
             ``m * n * k`` samples are drawn.  If size is ``None`` (default),
@@ -2020,7 +2019,7 @@ cdef class RandomGenerator:
         Parameters
         ----------
         a : float or array_like of floats
-            Shape of the distribution. Must all be positive.
+            Shape of the distribution. Must be positive.
         size : int or tuple of ints, optional
             Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
             ``m * n * k`` samples are drawn.  If size is ``None`` (default),
@@ -2836,9 +2835,9 @@ cdef class RandomGenerator:
             Lower limit.
         mode : float or array_like of floats
             The value where the peak of the distribution occurs.
-            The value should fulfill the condition ``left <= mode <= right``.
+            The value must fulfill the condition ``left <= mode <= right``.
         right : float or array_like of floats
-            Upper limit, should be larger than `left`.
+            Upper limit, must be larger than `left`.
         size : int or tuple of ints, optional
             Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
             ``m * n * k`` samples are drawn.  If size is ``None`` (default),
@@ -3133,7 +3132,7 @@ cdef class RandomGenerator:
 
         """
         return disc(&random_negative_binomial, self._brng, size, self.lock, 2, 0,
-                    n, 'n', CONS_POSITIVE,
+                    n, 'n', CONS_POSITIVE_NOT_NAN,
                     p, 'p', CONS_BOUNDED_0_1,
                     0.0, '', CONS_NONE)
 
@@ -3149,7 +3148,7 @@ cdef class RandomGenerator:
         Parameters
         ----------
         lam : float or array_like of floats
-            Expectation of interval, should be >= 0. A sequence of expectation
+            Expectation of interval, must be >= 0. A sequence of expectation
             intervals must be broadcastable over the requested size.
         size : int or tuple of ints, optional
             Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
@@ -3488,7 +3487,7 @@ cdef class RandomGenerator:
 
         Notes
         -----
-        The probability density for the Log Series distribution is
+        The probability mass function for the Log Series distribution is
 
         .. math:: P(k) = \\frac{-p^k}{k \\ln(1-p)},
 
@@ -3702,7 +3701,7 @@ cdef class RandomGenerator:
         x.shape = tuple(final_shape)
         return x
 
-    def multinomial(self, np.npy_intp n, object pvals, size=None):
+    def multinomial(self, object n, object pvals, size=None):
         """
         multinomial(n, pvals, size=None)
 
@@ -3718,7 +3717,7 @@ cdef class RandomGenerator:
 
         Parameters
         ----------
-        n : int
+        n : int or array-like of ints
             Number of experiments.
         pvals : sequence of floats, length p
             Probabilities of each of the ``p`` different outcomes.  These
@@ -3757,6 +3756,18 @@ cdef class RandomGenerator:
         For the first run, we threw 3 times 1, 4 times 2, etc.  For the second,
         we threw 2 times 1, 4 times 2, etc.
 
+        Now, do one experiment throwing the dice 10 time, and 10 times again,
+        and another throwing the dice 20 times, and 20 times again:
+
+        >>> randomgen.generator.multinomial([[10], [20]], [1/6.]*6, size=2)
+        array([[[2, 4, 0, 1, 2, 1],
+                [1, 3, 0, 3, 1, 2]],
+               [[1, 4, 4, 4, 4, 3],
+                [3, 3, 2, 5, 5, 2]]])  # random
+
+        The first array shows the outcomes of throwing the dice 10 times, and
+        the second shows the outcomes from throwing the dice 20 times.
+
         A loaded die is more likely to land on number 6:
 
         >>> randomgen.generator.multinomial(100, [1/7.]*5 + [2/7.])
@@ -3777,19 +3788,43 @@ cdef class RandomGenerator:
         array([100,   0])
 
         """
-        cdef np.npy_intp d, i, j, dn, sz
-        cdef np.ndarray parr "arrayObject_parr", mnarr "arrayObject_mnarr"
+
+        cdef np.npy_intp d, i, sz, offset
+        cdef np.ndarray parr, mnarr, on, temp_arr
         cdef double *pix
         cdef int64_t *mnix
-        cdef double Sum
+        cdef int64_t ni
+        cdef np.broadcast it
 
         d = len(pvals)
+        on = <np.ndarray>np.PyArray_FROM_OTF(n, np.NPY_INT64, np.NPY_ALIGNED)
         parr = <np.ndarray>np.PyArray_FROM_OTF(pvals, np.NPY_DOUBLE, np.NPY_ALIGNED)
         pix = <double*>np.PyArray_DATA(parr)
 
         if kahan_sum(pix, d-1) > (1.0 + 1e-12):
             raise ValueError("sum(pvals[:-1]) > 1.0")
 
+        if np.PyArray_NDIM(on) != 0: # vector
+            if size is None:
+                it = np.PyArray_MultiIterNew1(on)
+            else:
+                temp = np.empty(size, dtype=np.int8)
+                temp_arr = <np.ndarray>temp
+                it = np.PyArray_MultiIterNew2(on, temp_arr)
+            shape = it.shape + (d,)
+            multin = np.zeros(shape, dtype=np.int64)
+            mnarr = <np.ndarray>multin
+            mnix = <int64_t*>np.PyArray_DATA(mnarr)
+            offset = 0
+            sz = it.size
+            with self.lock, nogil:
+                for i in range(sz):
+                    ni = (<int64_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
+                    random_multinomial(self._brng, ni, &mnix[offset], pix, d, self._binomial)
+                    offset += d
+                    np.PyArray_MultiIter_NEXT(it)
+            return multin
+
         if size is None:
             shape = (d,)
         else:
@@ -3802,23 +3837,12 @@ cdef class RandomGenerator:
         mnarr = <np.ndarray>multin
         mnix = <int64_t*>np.PyArray_DATA(mnarr)
         sz = np.PyArray_SIZE(mnarr)
-
+        ni = n
+        offset = 0
         with self.lock, nogil:
-            i = 0
-            while i < sz:
-                Sum = 1.0
-                dn = n
-                for j in range(d-1):
-                    mnix[i+j] = random_binomial(self._brng, pix[j]/Sum, dn,
-                                                self._binomial)
-                    dn = dn - mnix[i+j]
-                    if dn <= 0:
-                        break
-                    Sum = Sum - pix[j]
-                if dn > 0:
-                    mnix[i+d-1] = dn
-
-                i = i + d
+            for i in range(sz // d):
+                random_multinomial(self._brng, ni, &mnix[offset], pix, d, self._binomial)
+                offset += d
 
         return multin