From e96efa7913e221d00d97a326e4996a75f2f6e858 Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Thu, 11 Aug 2016 10:28:14 +0200 Subject: [PATCH] Add ignore_nan option to mad_std (this is a squash of 38 commits with messages below:) make mad_std nan-compatible mad_std too add changelog entry np->numpy in docstrings & check numpy version (using 1.9, because officially it was added to numpy then according to their docs...) use astropy.utils.compat (thanks @bsipocz) 1_9, not 1P9 bump to 1.10 to avoid np 1.9 weirdness attempt to work around older numpy versions using ma.median add tests for scalarness and use np.ma.median except when axis=None. Also, change 'a' to 'data'. 'a' is a bad variable name and led to a few mistakes add another test of madstd with nan & axes add a warning about NaNs in arrays for np<1.10. Remove a test for those versions. update docstrings & changelog. Add deprecation of old argument correct the code to match the comment change order of bool checking to do expensive one last Test appropriate numpy versions, check warnings, and use asked array comparison switch to catch_warnigns for numpy 1.13, treatment of NaNs in masked arrays changed add GE_13 to __ALL__ xfail the masked array test to avoid supporting incorrect behavior see https://github.com/astropy/astropy/pull/5232#pullrequestreview-11305740 udpate the special cases as per @mwcraig's helpful chart! fix the tests: the behavior of the function when given masked arrays vs non-masked arrays has changed: we're now forcing the correct behavior for earlier versions implement @juliantaylor's suggetions test should no longer use np.ma.allclose (though I don't know why ma wouldn't just work....) this commit will fail: I've fixed the underlying issue but kept the tests flexible using the wrong data type... tests for array type double backticks a second instance of bacticks had a case wrong: one of the warn cases should *not* return a NaN for np<1.11 (which is why we're catching a warning there) numpy 1.10 np.ma.median([1,2,nan,4,5]) returns nan by default no more old numpy supports. Mixed feelings about this fix an import cleanup to address the numpy <=1.8 deprecation more cleanup fix changelog add a note about keepdims whitespace fix fix imports trailing whitespace?!!?!?! --- CHANGES.rst | 4 ++ astropy/stats/funcs.py | 62 +++++++++++++++++++++++----- astropy/stats/tests/test_funcs.py | 67 +++++++++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 10 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 9d61355487b..4947592b01b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -597,6 +597,10 @@ New Features - Added ``axis`` keyword to ``biweight_location`` and ``biweight_midvariance``. [#5127, #5158] + - ``median_absolute_deviation`` and ``mad_std`` have ``ignore_nan`` option + that will use ``np.ma.median`` with nans masked out or ``np.nanmedian`` + instead of ``np.median`` when computing the median. [#5232] + - ``astropy.table`` - Allow renaming mixin columns. [#5469] diff --git a/astropy/stats/funcs.py b/astropy/stats/funcs.py index 2091ec1b1e4..7d81dbeefff 100644 --- a/astropy/stats/funcs.py +++ b/astropy/stats/funcs.py @@ -15,6 +15,10 @@ import math import numpy as np +from warnings import warn + +from ..utils.compat import NUMPY_LT_1_10 +from ..utils.decorators import deprecated_renamed_argument from ..utils import isiterable from ..extern.six.moves import range @@ -718,7 +722,8 @@ def poisson_conf_interval(n, interval='root-n', sigma=1, background=0, return conf_interval -def median_absolute_deviation(a, axis=None, func=None): +@deprecated_renamed_argument('a', 'data', '2.0') +def median_absolute_deviation(data, axis=None, func=None, ignore_nan=False): """ Calculate the median absolute deviation (MAD). @@ -726,7 +731,7 @@ def median_absolute_deviation(a, axis=None, func=None): Parameters ---------- - a : array-like + data : array-like Input array or object that can be converted to an array. axis : {int, sequence of int, None}, optional Axis along which the MADs are computed. The default (`None`) is @@ -734,6 +739,11 @@ def median_absolute_deviation(a, axis=None, func=None): func : callable, optional The function used to compute the median. Defaults to `numpy.ma.median` for masked arrays, otherwise to `numpy.median`. + ignore_nan : bool + Ignore NaN values (treat them as if they are not in the array) when + computing the median. This will use `numpy.ma.median` if ``axis`` is + specified, or `numpy.nanmedian` if ``axis==None`` and numpy's version + is >1.10 because nanmedian is slightly faster in this case. Returns ------- @@ -765,26 +775,52 @@ def median_absolute_deviation(a, axis=None, func=None): # See https://github.com/numpy/numpy/issues/7330 why using np.ma.median # for normal arrays should not be done (summary: np.ma.median always # returns an masked array even if the result should be scalar). (#4658) - if isinstance(a, np.ma.MaskedArray): + if isinstance(data, np.ma.MaskedArray): + is_masked = True func = np.ma.median + if ignore_nan: + data = np.ma.masked_invalid(data) + elif ignore_nan: + is_masked = False + func = np.nanmedian else: + is_masked = False func = np.median + else: + is_masked = None - a = np.asanyarray(a) - a_median = func(a, axis=axis) + if not ignore_nan and NUMPY_LT_1_10 and np.any(np.isnan(data)): + warn("Numpy versions <1.10 will return a number rather than NaN for " + "the median of arrays containing NaNs. This behavior is " + "unlikely to be what you expect.") + + data = np.asanyarray(data) + # np.nanmedian has `keepdims`, which is a good option if we're not allowing + # user-passed functions here + data_median = func(data, axis=axis) # broadcast the median array before subtraction if axis is not None: if isiterable(axis): for ax in sorted(list(axis)): - a_median = np.expand_dims(a_median, axis=ax) + data_median = np.expand_dims(data_median, axis=ax) else: - a_median = np.expand_dims(a_median, axis=axis) + data_median = np.expand_dims(data_median, axis=axis) + + result = func(np.abs(data - data_median), axis=axis, overwrite_input=True) + + if axis is None and np.ma.isMaskedArray(result): + # return scalar version + result = result.item() + elif np.ma.isMaskedArray(result) and is_masked == False: + # if the input array was not a masked array, we don't want to return a + # masked array + result = result.filled(fill_value=np.nan) - return func(np.abs(a - a_median), axis=axis) + return result -def mad_std(data, axis=None, func=None): +def mad_std(data, axis=None, func=None, ignore_nan=False): r""" Calculate a robust standard deviation using the `median absolute deviation (MAD) @@ -811,6 +847,11 @@ def mad_std(data, axis=None, func=None): func : callable, optional The function used to compute the median. Defaults to `numpy.ma.median` for masked arrays, otherwise to `numpy.median`. + ignore_nan : bool + Ignore NaN values (treat them as if they are not in the array) when + computing the median. This will use `numpy.ma.median` if ``axis`` is + specified, or `numpy.nanmedian` if ``axis=None`` and numpy's version is + >1.10 because nanmedian is slightly faster in this case. Returns ------- @@ -834,7 +875,8 @@ def mad_std(data, axis=None, func=None): """ # NOTE: 1. / scipy.stats.norm.ppf(0.75) = 1.482602218505602 - return median_absolute_deviation(data, axis=axis, func=func) * 1.482602218505602 + MAD = median_absolute_deviation(data, axis=axis, func=func, ignore_nan=ignore_nan) + return MAD * 1.482602218505602 def signal_to_noise_oir_ccd(t, source_eps, sky_eps, dark_eps, rd, npix, diff --git a/astropy/stats/tests/test_funcs.py b/astropy/stats/tests/test_funcs.py index eb1a33c0c74..6d11fcf12aa 100644 --- a/astropy/stats/tests/test_funcs.py +++ b/astropy/stats/tests/test_funcs.py @@ -7,6 +7,8 @@ from numpy.random import randn, normal from numpy.testing import assert_equal from numpy.testing.utils import assert_allclose +from astropy.utils.compat import NUMPY_LT_1_10, NUMPY_LT_1_11 +from ...tests.helper import catch_warnings try: import scipy # pylint: disable=W0611 @@ -329,6 +331,48 @@ def test_mad_std(): data = normal(5, 2, size=(100, 100)) assert_allclose(funcs.mad_std(data), 2.0, rtol=0.05) +@pytest.mark.xfail('not NUMPY_LT_1_10') +def test_mad_std_scalar_return(): + with NumpyRNGContext(12345): + data = normal(5, 2, size=(10, 10)) + # make a masked array with no masked points + data = np.ma.masked_where(np.isnan(data), data) + rslt = funcs.mad_std(data) + # want a scalar result, NOT a masked array + assert np.isscalar(rslt) + + data[5,5] = np.nan + rslt = funcs.mad_std(data, ignore_nan=True) + assert np.isscalar(rslt) + with catch_warnings() as warns: + rslt = funcs.mad_std(data) + assert np.isscalar(rslt) + assert not np.isnan(rslt) + +def test_mad_std_warns(): + with NumpyRNGContext(12345): + data = normal(5, 2, size=(10, 10)) + data[5,5] = np.nan + + with catch_warnings() as warns: + rslt = funcs.mad_std(data, ignore_nan=False) + if NUMPY_LT_1_10: + w = warns[0] + assert str(w.message).startswith("Numpy versions <1.10 will return") + else: + assert np.isnan(rslt) + +def test_mad_std_withnan(): + with NumpyRNGContext(12345): + data = np.empty([102,102]) + data[:] = np.nan + data[1:-1,1:-1] = normal(5, 2, size=(100, 100)) + assert_allclose(funcs.mad_std(data, ignore_nan=True), 2.0, rtol=0.05) + + if not NUMPY_LT_1_10: + assert np.isnan(funcs.mad_std([1, 2, 3, 4, 5, np.nan])) + assert_allclose(funcs.mad_std([1, 2, 3, 4, 5, np.nan], ignore_nan=True), + 1.482602218505602) def test_mad_std_with_axis(): data = np.array([[1, 2, 3, 4], @@ -340,6 +384,29 @@ def test_mad_std_with_axis(): assert_allclose(funcs.mad_std(data, axis=0), result_axis0) assert_allclose(funcs.mad_std(data, axis=1), result_axis1) +def test_mad_std_with_axis_and_nan(): + data = np.array([[1, 2, 3, 4, np.nan], + [4, 3, 2, 1, np.nan]]) + # results follow data symmetry + result_axis0 = np.array([2.22390333, 0.74130111, 0.74130111, + 2.22390333, np.nan]) + result_axis1 = np.array([1.48260222, 1.48260222]) + + assert_allclose(funcs.mad_std(data, axis=0, ignore_nan=True), result_axis0) + assert_allclose(funcs.mad_std(data, axis=1, ignore_nan=True), result_axis1) + +def test_mad_std_with_axis_and_nan_array_type(): + # mad_std should return a masked array if given one, and not otherwise + data = np.array([[1, 2, 3, 4, np.nan], + [4, 3, 2, 1, np.nan]]) + + result = funcs.mad_std(data, axis=0, ignore_nan=True) + assert not np.ma.isMaskedArray(result) + + data = np.ma.masked_where(np.isnan(data), data) + result = funcs.mad_std(data, axis=0, ignore_nan=True) + assert np.ma.isMaskedArray(result) + def test_gaussian_fwhm_to_sigma(): fwhm = (2.0 * np.sqrt(2.0 * np.log(2.0)))