From e96efa7913e221d00d97a326e4996a75f2f6e858 Mon Sep 17 00:00:00 2001
From: "Adam Ginsburg (keflavich)" <keflavich@gmail.com>
Date: Thu, 11 Aug 2016 10:28:14 +0200
Subject: [PATCH] Add ignore_nan option to mad_std

(this is a squash of 38 commits with messages below:)

make mad_std nan-compatible

mad_std too

add changelog entry

np->numpy in docstrings & check numpy version (using 1.9, because
officially it was added to numpy then according to their docs...)

use astropy.utils.compat (thanks @bsipocz)

1_9, not 1P9

bump to 1.10 to avoid np 1.9 weirdness

attempt to work around older numpy versions using ma.median

add tests for scalarness and use np.ma.median except when axis=None.
Also, change 'a' to 'data'.  'a' is a bad variable name and led to a few
mistakes

add another test of madstd with nan & axes

add a warning about NaNs in arrays for np<1.10.  Remove a test for those
versions.

update docstrings & changelog.  Add deprecation of old argument

correct the code to match the comment

change order of bool checking to do expensive one last

Test appropriate numpy versions, check warnings, and use asked array comparison

switch to catch_warnigns

for numpy 1.13, treatment of NaNs in masked arrays changed

add GE_13 to __ALL__

xfail the masked array test to avoid supporting incorrect behavior
see
https://github.com/astropy/astropy/pull/5232#pullrequestreview-11305740

udpate the special cases as per @mwcraig's helpful chart!

fix the tests: the behavior of the function when given masked arrays vs
non-masked arrays has changed: we're now forcing the correct behavior
for earlier versions

implement @juliantaylor's  suggetions

test should no longer use np.ma.allclose (though I don't know why ma
wouldn't just work....)

this commit will fail: I've fixed the underlying issue but kept the
tests flexible using the wrong data type...

tests for array type

double backticks

a second instance of bacticks

had a case wrong: one of the warn cases should *not* return a NaN for
np<1.11 (which is why we're catching a warning there)

numpy 1.10 np.ma.median([1,2,nan,4,5]) returns nan by default

no more old numpy supports.  Mixed feelings about this

fix an import

cleanup to address the numpy <=1.8 deprecation

more cleanup

fix changelog

add a note about keepdims

whitespace fix

fix imports

trailing whitespace?!!?!?!
---
 CHANGES.rst                       |  4 ++
 astropy/stats/funcs.py            | 62 +++++++++++++++++++++++-----
 astropy/stats/tests/test_funcs.py | 67 +++++++++++++++++++++++++++++++
 3 files changed, 123 insertions(+), 10 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 9d61355487b..4947592b01b 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -597,6 +597,10 @@ New Features
   - Added ``axis`` keyword to ``biweight_location`` and
     ``biweight_midvariance``. [#5127, #5158]
 
+  - ``median_absolute_deviation`` and ``mad_std`` have ``ignore_nan`` option
+    that will use ``np.ma.median`` with nans masked out or ``np.nanmedian``
+    instead of ``np.median`` when computing the median. [#5232]
+
 - ``astropy.table``
 
   - Allow renaming mixin columns. [#5469]
diff --git a/astropy/stats/funcs.py b/astropy/stats/funcs.py
index 2091ec1b1e4..7d81dbeefff 100644
--- a/astropy/stats/funcs.py
+++ b/astropy/stats/funcs.py
@@ -15,6 +15,10 @@
 import math
 import numpy as np
 
+from warnings import warn
+
+from ..utils.compat import NUMPY_LT_1_10
+from ..utils.decorators import deprecated_renamed_argument
 from ..utils import isiterable
 from ..extern.six.moves import range
 
@@ -718,7 +722,8 @@ def poisson_conf_interval(n, interval='root-n', sigma=1, background=0,
     return conf_interval
 
 
-def median_absolute_deviation(a, axis=None, func=None):
+@deprecated_renamed_argument('a', 'data', '2.0')
+def median_absolute_deviation(data, axis=None, func=None, ignore_nan=False):
     """
     Calculate the median absolute deviation (MAD).
 
@@ -726,7 +731,7 @@ def median_absolute_deviation(a, axis=None, func=None):
 
     Parameters
     ----------
-    a : array-like
+    data : array-like
         Input array or object that can be converted to an array.
     axis : {int, sequence of int, None}, optional
         Axis along which the MADs are computed.  The default (`None`) is
@@ -734,6 +739,11 @@ def median_absolute_deviation(a, axis=None, func=None):
     func : callable, optional
         The function used to compute the median. Defaults to `numpy.ma.median`
         for masked arrays, otherwise to `numpy.median`.
+    ignore_nan : bool
+        Ignore NaN values (treat them as if they are not in the array) when
+        computing the median.  This will use `numpy.ma.median` if ``axis`` is
+        specified, or `numpy.nanmedian` if ``axis==None`` and numpy's version
+        is >1.10 because nanmedian is slightly faster in this case.
 
     Returns
     -------
@@ -765,26 +775,52 @@ def median_absolute_deviation(a, axis=None, func=None):
         # See https://github.com/numpy/numpy/issues/7330 why using np.ma.median
         # for normal arrays should not be done (summary: np.ma.median always
         # returns an masked array even if the result should be scalar). (#4658)
-        if isinstance(a, np.ma.MaskedArray):
+        if isinstance(data, np.ma.MaskedArray):
+            is_masked = True
             func = np.ma.median
+            if ignore_nan:
+                data = np.ma.masked_invalid(data)
+        elif ignore_nan:
+            is_masked = False
+            func = np.nanmedian
         else:
+            is_masked = False
             func = np.median
+    else:
+        is_masked = None
 
-    a = np.asanyarray(a)
-    a_median = func(a, axis=axis)
+    if not ignore_nan and NUMPY_LT_1_10 and np.any(np.isnan(data)):
+        warn("Numpy versions <1.10 will return a number rather than NaN for "
+             "the median of arrays containing NaNs.  This behavior is "
+             "unlikely to be what you expect.")
+
+    data = np.asanyarray(data)
+    # np.nanmedian has `keepdims`, which is a good option if we're not allowing
+    # user-passed functions here
+    data_median = func(data, axis=axis)
 
     # broadcast the median array before subtraction
     if axis is not None:
         if isiterable(axis):
             for ax in sorted(list(axis)):
-                a_median = np.expand_dims(a_median, axis=ax)
+                data_median = np.expand_dims(data_median, axis=ax)
         else:
-            a_median = np.expand_dims(a_median, axis=axis)
+            data_median = np.expand_dims(data_median, axis=axis)
+
+    result = func(np.abs(data - data_median), axis=axis, overwrite_input=True)
+
+    if axis is None and np.ma.isMaskedArray(result):
+        # return scalar version
+        result = result.item()
+    elif np.ma.isMaskedArray(result) and is_masked == False:
+        # if the input array was not a masked array, we don't want to return a
+        # masked array
+        result = result.filled(fill_value=np.nan)
 
-    return func(np.abs(a - a_median), axis=axis)
+    return result
 
 
-def mad_std(data, axis=None, func=None):
+def mad_std(data, axis=None, func=None, ignore_nan=False):
     r"""
     Calculate a robust standard deviation using the `median absolute
     deviation (MAD)
@@ -811,6 +847,11 @@ def mad_std(data, axis=None, func=None):
     func : callable, optional
         The function used to compute the median. Defaults to `numpy.ma.median`
         for masked arrays, otherwise to `numpy.median`.
+    ignore_nan : bool
+        Ignore NaN values (treat them as if they are not in the array) when
+        computing the median.  This will use `numpy.ma.median` if ``axis`` is
+        specified, or `numpy.nanmedian` if ``axis=None`` and numpy's version is
+        >1.10 because nanmedian is slightly faster in this case.
 
     Returns
     -------
@@ -834,7 +875,8 @@ def mad_std(data, axis=None, func=None):
     """
 
     # NOTE: 1. / scipy.stats.norm.ppf(0.75) = 1.482602218505602
-    return median_absolute_deviation(data, axis=axis, func=func) * 1.482602218505602
+    MAD = median_absolute_deviation(data, axis=axis, func=func, ignore_nan=ignore_nan)
+    return MAD * 1.482602218505602
 
 
 def signal_to_noise_oir_ccd(t, source_eps, sky_eps, dark_eps, rd, npix,
diff --git a/astropy/stats/tests/test_funcs.py b/astropy/stats/tests/test_funcs.py
index eb1a33c0c74..6d11fcf12aa 100644
--- a/astropy/stats/tests/test_funcs.py
+++ b/astropy/stats/tests/test_funcs.py
@@ -7,6 +7,8 @@
 from numpy.random import randn, normal
 from numpy.testing import assert_equal
 from numpy.testing.utils import assert_allclose
+from astropy.utils.compat import NUMPY_LT_1_10, NUMPY_LT_1_11
+from ...tests.helper import catch_warnings
 
 try:
     import scipy  # pylint: disable=W0611
@@ -329,6 +331,48 @@ def test_mad_std():
         data = normal(5, 2, size=(100, 100))
         assert_allclose(funcs.mad_std(data), 2.0, rtol=0.05)
 
+@pytest.mark.xfail('not NUMPY_LT_1_10')
+def test_mad_std_scalar_return():
+    with NumpyRNGContext(12345):
+        data = normal(5, 2, size=(10, 10))
+        # make a masked array with no masked points
+        data = np.ma.masked_where(np.isnan(data), data)
+        rslt = funcs.mad_std(data)
+        # want a scalar result, NOT a masked array
+        assert np.isscalar(rslt)
+
+        data[5,5] = np.nan
+        rslt = funcs.mad_std(data, ignore_nan=True)
+        assert np.isscalar(rslt)
+        with catch_warnings() as warns:
+            rslt = funcs.mad_std(data)
+            assert np.isscalar(rslt)
+            assert not np.isnan(rslt)
+
+def test_mad_std_warns():
+    with NumpyRNGContext(12345):
+        data = normal(5, 2, size=(10, 10))
+        data[5,5] = np.nan
+
+        with catch_warnings() as warns:
+            rslt = funcs.mad_std(data, ignore_nan=False)
+            if NUMPY_LT_1_10:
+                w = warns[0]
+                assert str(w.message).startswith("Numpy versions <1.10 will return")
+            else:
+                assert np.isnan(rslt)
+
+def test_mad_std_withnan():
+    with NumpyRNGContext(12345):
+        data = np.empty([102,102])
+        data[:] = np.nan
+        data[1:-1,1:-1] = normal(5, 2, size=(100, 100))
+        assert_allclose(funcs.mad_std(data, ignore_nan=True), 2.0, rtol=0.05)
+
+    if not NUMPY_LT_1_10:
+        assert np.isnan(funcs.mad_std([1, 2, 3, 4, 5, np.nan]))
+    assert_allclose(funcs.mad_std([1, 2, 3, 4, 5, np.nan],  ignore_nan=True),
+                    1.482602218505602)
 
 def test_mad_std_with_axis():
     data = np.array([[1, 2, 3, 4],
@@ -340,6 +384,29 @@ def test_mad_std_with_axis():
     assert_allclose(funcs.mad_std(data, axis=0), result_axis0)
     assert_allclose(funcs.mad_std(data, axis=1), result_axis1)
 
+def test_mad_std_with_axis_and_nan():
+    data = np.array([[1, 2, 3, 4, np.nan],
+                     [4, 3, 2, 1, np.nan]])
+    # results follow data symmetry
+    result_axis0 = np.array([2.22390333, 0.74130111, 0.74130111,
+                             2.22390333, np.nan])
+    result_axis1 = np.array([1.48260222, 1.48260222])
+
+    assert_allclose(funcs.mad_std(data, axis=0, ignore_nan=True), result_axis0)
+    assert_allclose(funcs.mad_std(data, axis=1, ignore_nan=True), result_axis1)
+
+def test_mad_std_with_axis_and_nan_array_type():
+    # mad_std should return a masked array if given one, and not otherwise
+    data = np.array([[1, 2, 3, 4, np.nan],
+                     [4, 3, 2, 1, np.nan]])
+
+    result = funcs.mad_std(data, axis=0, ignore_nan=True)
+    assert not np.ma.isMaskedArray(result)
+
+    data = np.ma.masked_where(np.isnan(data), data)
+    result = funcs.mad_std(data, axis=0, ignore_nan=True)
+    assert np.ma.isMaskedArray(result)
+
 
 def test_gaussian_fwhm_to_sigma():
     fwhm = (2.0 * np.sqrt(2.0 * np.log(2.0)))