Add ignore_nan option to mad_std

(this is a squash of 38 commits with messages below:) make mad_std nan-compatible mad_std too add changelog entry np->numpy in docstrings & check numpy version (using 1.9, because officially it was added to numpy then according to their docs...) use astropy.utils.compat (thanks @bsipocz) 1_9, not 1P9 bump to 1.10 to avoid np 1.9 weirdness attempt to work around older numpy versions using ma.median add tests for scalarness and use np.ma.median except when axis=None. Also, change 'a' to 'data'. 'a' is a bad variable name and led to a few mistakes add another test of madstd with nan & axes add a warning about NaNs in arrays for np<1.10. Remove a test for those versions. update docstrings & changelog. Add deprecation of old argument correct the code to match the comment change order of bool checking to do expensive one last Test appropriate numpy versions, check warnings, and use asked array comparison switch to catch_warnigns for numpy 1.13, treatment of NaNs in masked arrays changed add GE_13 to __ALL__ xfail the masked array test to avoid supporting incorrect behavior see #5232 (review) udpate the special cases as per @mwcraig's helpful chart! fix the tests: the behavior of the function when given masked arrays vs non-masked arrays has changed: we're now forcing the correct behavior for earlier versions implement @juliantaylor's suggetions test should no longer use np.ma.allclose (though I don't know why ma wouldn't just work....) this commit will fail: I've fixed the underlying issue but kept the tests flexible using the wrong data type... tests for array type double backticks a second instance of bacticks had a case wrong: one of the warn cases should *not* return a NaN for np<1.11 (which is why we're catching a warning there) numpy 1.10 np.ma.median([1,2,nan,4,5]) returns nan by default no more old numpy supports. Mixed feelings about this fix an import cleanup to address the numpy <=1.8 deprecation more cleanup fix changelog add a note about keepdims whitespace fix fix imports trailing whitespace?!!?!?!
astropy · May 15, 2017 · e96efa7 · e96efa7
1 parent 5e6c349
commit e96efa7
Show file tree

Hide file tree

Showing 3 changed files with 123 additions and 10 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -597,6 +597,10 @@ New Features
   - Added ``axis`` keyword to ``biweight_location`` and
     ``biweight_midvariance``. [#5127, #5158]
 
+  - ``median_absolute_deviation`` and ``mad_std`` have ``ignore_nan`` option
+    that will use ``np.ma.median`` with nans masked out or ``np.nanmedian``
+    instead of ``np.median`` when computing the median. [#5232]
+
 - ``astropy.table``
 
   - Allow renaming mixin columns. [#5469]

diff --git a/astropy/stats/funcs.py b/astropy/stats/funcs.py
@@ -15,6 +15,10 @@
 import math
 import numpy as np
 
+from warnings import warn
+
+from ..utils.compat import NUMPY_LT_1_10
+from ..utils.decorators import deprecated_renamed_argument
 from ..utils import isiterable
 from ..extern.six.moves import range
 
@@ -718,22 +722,28 @@ def poisson_conf_interval(n, interval='root-n', sigma=1, background=0,
     return conf_interval
 
 
-def median_absolute_deviation(a, axis=None, func=None):
+@deprecated_renamed_argument('a', 'data', '2.0')
+def median_absolute_deviation(data, axis=None, func=None, ignore_nan=False):
     """
     Calculate the median absolute deviation (MAD).
 
     The MAD is defined as ``median(abs(a - median(a)))``.
 
     Parameters
     ----------
-    a : array-like
+    data : array-like
         Input array or object that can be converted to an array.
     axis : {int, sequence of int, None}, optional
         Axis along which the MADs are computed.  The default (`None`) is
         to compute the MAD of the flattened array.
     func : callable, optional
         The function used to compute the median. Defaults to `numpy.ma.median`
         for masked arrays, otherwise to `numpy.median`.
+    ignore_nan : bool
+        Ignore NaN values (treat them as if they are not in the array) when
+        computing the median.  This will use `numpy.ma.median` if ``axis`` is
+        specified, or `numpy.nanmedian` if ``axis==None`` and numpy's version
+        is >1.10 because nanmedian is slightly faster in this case.
 
     Returns
     -------
@@ -765,26 +775,52 @@ def median_absolute_deviation(a, axis=None, func=None):
         # See https://github.com/numpy/numpy/issues/7330 why using np.ma.median
         # for normal arrays should not be done (summary: np.ma.median always
         # returns an masked array even if the result should be scalar). (#4658)
-        if isinstance(a, np.ma.MaskedArray):
+        if isinstance(data, np.ma.MaskedArray):
+            is_masked = True
             func = np.ma.median
+            if ignore_nan:
+                data = np.ma.masked_invalid(data)
+        elif ignore_nan:
+            is_masked = False
+            func = np.nanmedian
         else:
+            is_masked = False
             func = np.median
+    else:
+        is_masked = None
 
-    a = np.asanyarray(a)
-    a_median = func(a, axis=axis)
+    if not ignore_nan and NUMPY_LT_1_10 and np.any(np.isnan(data)):
+        warn("Numpy versions <1.10 will return a number rather than NaN for "
+             "the median of arrays containing NaNs.  This behavior is "
+             "unlikely to be what you expect.")
+
+    data = np.asanyarray(data)
+    # np.nanmedian has `keepdims`, which is a good option if we're not allowing
+    # user-passed functions here
+    data_median = func(data, axis=axis)
 
     # broadcast the median array before subtraction
     if axis is not None:
         if isiterable(axis):
             for ax in sorted(list(axis)):
-                a_median = np.expand_dims(a_median, axis=ax)
+                data_median = np.expand_dims(data_median, axis=ax)
         else:
-            a_median = np.expand_dims(a_median, axis=axis)
+            data_median = np.expand_dims(data_median, axis=axis)
+
+    result = func(np.abs(data - data_median), axis=axis, overwrite_input=True)
+
+    if axis is None and np.ma.isMaskedArray(result):
+        # return scalar version
+        result = result.item()
+    elif np.ma.isMaskedArray(result) and is_masked == False:
+        # if the input array was not a masked array, we don't want to return a
+        # masked array
+        result = result.filled(fill_value=np.nan)
 
-    return func(np.abs(a - a_median), axis=axis)
+    return result
 
 
-def mad_std(data, axis=None, func=None):
+def mad_std(data, axis=None, func=None, ignore_nan=False):
     r"""
     Calculate a robust standard deviation using the `median absolute
     deviation (MAD)
@@ -811,6 +847,11 @@ def mad_std(data, axis=None, func=None):
     func : callable, optional
         The function used to compute the median. Defaults to `numpy.ma.median`
         for masked arrays, otherwise to `numpy.median`.
+    ignore_nan : bool
+        Ignore NaN values (treat them as if they are not in the array) when
+        computing the median.  This will use `numpy.ma.median` if ``axis`` is
+        specified, or `numpy.nanmedian` if ``axis=None`` and numpy's version is
+        >1.10 because nanmedian is slightly faster in this case.
 
     Returns
     -------
@@ -834,7 +875,8 @@ def mad_std(data, axis=None, func=None):
     """
 
     # NOTE: 1. / scipy.stats.norm.ppf(0.75) = 1.482602218505602
-    return median_absolute_deviation(data, axis=axis, func=func) * 1.482602218505602
+    MAD = median_absolute_deviation(data, axis=axis, func=func, ignore_nan=ignore_nan)
+    return MAD * 1.482602218505602
 
 
 def signal_to_noise_oir_ccd(t, source_eps, sky_eps, dark_eps, rd, npix,

diff --git a/astropy/stats/tests/test_funcs.py b/astropy/stats/tests/test_funcs.py
@@ -7,6 +7,8 @@
 from numpy.random import randn, normal
 from numpy.testing import assert_equal
 from numpy.testing.utils import assert_allclose
+from astropy.utils.compat import NUMPY_LT_1_10, NUMPY_LT_1_11
+from ...tests.helper import catch_warnings
 
 try:
     import scipy  # pylint: disable=W0611
@@ -329,6 +331,48 @@ def test_mad_std():
         data = normal(5, 2, size=(100, 100))
         assert_allclose(funcs.mad_std(data), 2.0, rtol=0.05)
 
+@pytest.mark.xfail('not NUMPY_LT_1_10')
+def test_mad_std_scalar_return():
+    with NumpyRNGContext(12345):
+        data = normal(5, 2, size=(10, 10))
+        # make a masked array with no masked points
+        data = np.ma.masked_where(np.isnan(data), data)
+        rslt = funcs.mad_std(data)
+        # want a scalar result, NOT a masked array
+        assert np.isscalar(rslt)
+
+        data[5,5] = np.nan
+        rslt = funcs.mad_std(data, ignore_nan=True)
+        assert np.isscalar(rslt)
+        with catch_warnings() as warns:
+            rslt = funcs.mad_std(data)
+            assert np.isscalar(rslt)
+            assert not np.isnan(rslt)
+
+def test_mad_std_warns():
+    with NumpyRNGContext(12345):
+        data = normal(5, 2, size=(10, 10))
+        data[5,5] = np.nan
+
+        with catch_warnings() as warns:
+            rslt = funcs.mad_std(data, ignore_nan=False)
+            if NUMPY_LT_1_10:
+                w = warns[0]
+                assert str(w.message).startswith("Numpy versions <1.10 will return")
+            else:
+                assert np.isnan(rslt)
+
+def test_mad_std_withnan():
+    with NumpyRNGContext(12345):
+        data = np.empty([102,102])
+        data[:] = np.nan
+        data[1:-1,1:-1] = normal(5, 2, size=(100, 100))
+        assert_allclose(funcs.mad_std(data, ignore_nan=True), 2.0, rtol=0.05)
+
+    if not NUMPY_LT_1_10:
+        assert np.isnan(funcs.mad_std([1, 2, 3, 4, 5, np.nan]))
+    assert_allclose(funcs.mad_std([1, 2, 3, 4, 5, np.nan],  ignore_nan=True),
+                    1.482602218505602)
 
 def test_mad_std_with_axis():
     data = np.array([[1, 2, 3, 4],
@@ -340,6 +384,29 @@ def test_mad_std_with_axis():
     assert_allclose(funcs.mad_std(data, axis=0), result_axis0)
     assert_allclose(funcs.mad_std(data, axis=1), result_axis1)
 
+def test_mad_std_with_axis_and_nan():
+    data = np.array([[1, 2, 3, 4, np.nan],
+                     [4, 3, 2, 1, np.nan]])
+    # results follow data symmetry
+    result_axis0 = np.array([2.22390333, 0.74130111, 0.74130111,
+                             2.22390333, np.nan])
+    result_axis1 = np.array([1.48260222, 1.48260222])
+
+    assert_allclose(funcs.mad_std(data, axis=0, ignore_nan=True), result_axis0)
+    assert_allclose(funcs.mad_std(data, axis=1, ignore_nan=True), result_axis1)
+
+def test_mad_std_with_axis_and_nan_array_type():
+    # mad_std should return a masked array if given one, and not otherwise
+    data = np.array([[1, 2, 3, 4, np.nan],
+                     [4, 3, 2, 1, np.nan]])
+
+    result = funcs.mad_std(data, axis=0, ignore_nan=True)
+    assert not np.ma.isMaskedArray(result)
+
+    data = np.ma.masked_where(np.isnan(data), data)
+    result = funcs.mad_std(data, axis=0, ignore_nan=True)
+    assert np.ma.isMaskedArray(result)
+
 
 def test_gaussian_fwhm_to_sigma():
     fwhm = (2.0 * np.sqrt(2.0 * np.log(2.0)))