astropy · bsipocz · Nov 13, 2018 · Oct 26, 2018 · Oct 28, 2018 · Oct 28, 2018
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -1497,6 +1497,10 @@ astropy.visualization
 - Fixed a bug that caused an error when plotting grids multiple times
   with grid_type='contours'. [#7927]
 
+- Put an upper limit on the number of bins in ``hist`` and ``histogram`` and
+  factor out calculation of bin edges into public function
+  ``calculate_bin_edges``. [#7991]
+
 astropy.vo
 ^^^^^^^^^^
 

diff --git a/astropy/stats/histogram.py b/astropy/stats/histogram.py
@@ -10,7 +10,90 @@
 from . import bayesian_blocks
 
 __all__ = ['histogram', 'scott_bin_width', 'freedman_bin_width',
-           'knuth_bin_width']
+           'knuth_bin_width', 'calculate_bin_edges']
+
+
+def calculate_bin_edges(a, bins=10, range=None, weights=None):
+    """
+    Calculate histogram bin edges like `numpy.histogram_bin_edges`.
+
+    Parameters
+    ----------
+
+    a : array_like
+        Input data. The bin edges are calculated over the flattened array.
+
+    bins : int or list or str (optional)
+        If ``bins`` is an int, it is the number of bins. If it is a list
+        it is taken to be the bin edges. If it is a string, it must be one
+        of  'blocks', 'knuth', 'scott' or 'freedman'. See
+        `~astropy.stats.histogram` for a description of each method.
+
+    range : tuple or None (optional)
+        The minimum and maximum range for the histogram.  If not specified,
+        it will be (x.min(), x.max()). However, if bins is a list it is
+        returned unmodified regardless of the range argument.
+
+    weights : array_like, optional
+        An array the same shape as ``a``. If given, the histogram accumulates
+        the value of the weight corresponding to ``a`` instead of returning the
+        count of values. This argument does not affect determination of bin
+        edges.
+    """
+    # if range is specified, we need to truncate the data for
+    # the bin-finding routines
+    if range is not None:
+        a = a[(a >= range[0]) & (a <= range[1])]
+
+    # if bins is a string, first compute bin edges with the desired heuristic
+    if isinstance(bins, str):
+        a = np.asarray(a).ravel()
+
+        # TODO: if weights is specified, we need to modify things.
+        #       e.g. we could use point measures fitness for Bayesian blocks
+        if weights is not None:
+            raise NotImplementedError("weights are not yet supported "
+                                      "for the enhanced histogram")
+
+        if bins == 'blocks':
+            bins = bayesian_blocks(a)
+        elif bins == 'knuth':
+            da, bins = knuth_bin_width(a, True)
+        elif bins == 'scott':
+            da, bins = scott_bin_width(a, True)
+        elif bins == 'freedman':
+            da, bins = freedman_bin_width(a, True)
+        else:
+            raise ValueError("unrecognized bin code: '{}'".format(bins))
+
+        if range:
+            # Check that the upper and lower edges are what was requested.
+            # The current implementation of the bin width estimators does not
+            # guarantee this, it only ensures that data outside the range is
+            # excluded from calculation of the bin widths.
+            if bins[0] != range[0]:
+                bins[0] = range[0]
+            if bins[-1] != range[1]:
+                bins[-1] = range[1]
+
+    elif np.ndim(bins) == 0:
+        # Number of bins was given
+        try:
+            # This works for numpy 1.15 or later
+            bins = np.histogram_bin_edges(a, bins,
+                                          range=range,
+                                          weights=weights)
+        except AttributeError:
+            # In this case only (integer number of bins, older version of
+            # numpy) then calculate like the bin edges manually.
+            if range is not None:
+                lower, upper = range
+            else:
+                lower = a.min()
+                upper = a.max()
+            bins = np.linspace(lower, upper, bins + 1, endpoint=True)
+
+    return bins
 
 
 def histogram(a, bins=10, range=None, weights=None, **kwargs):
@@ -42,7 +125,10 @@ def histogram(a, bins=10, range=None, weights=None, **kwargs):
         it will be (x.min(), x.max())
 
     weights : array_like, optional
-        Not Implemented
+        An array the same shape as ``a``. If given, the histogram accumulates
+        the value of the weight corresponding to ``a`` instead of returning the
+        count of values. This argument does not affect determination of bin
+        edges.
 
     other keyword arguments are described in numpy.histogram().
 
@@ -58,32 +144,8 @@ def histogram(a, bins=10, range=None, weights=None, **kwargs):
     --------
     numpy.histogram
     """
-    # if bins is a string, first compute bin edges with the desired heuristic
-    if isinstance(bins, str):
-        a = np.asarray(a).ravel()
-
-        # TODO: if weights is specified, we need to modify things.
-        #       e.g. we could use point measures fitness for Bayesian blocks
-        if weights is not None:
-            raise NotImplementedError("weights are not yet supported "
-                                      "for the enhanced histogram")
-
-        # if range is specified, we need to truncate the data for
-        # the bin-finding routines
-        if range is not None:
-            a = a[(a >= range[0]) & (a <= range[1])]
-
-        if bins == 'blocks':
-            bins = bayesian_blocks(a)
-        elif bins == 'knuth':
-            da, bins = knuth_bin_width(a, True)
-        elif bins == 'scott':
-            da, bins = scott_bin_width(a, True)
-        elif bins == 'freedman':
-            da, bins = freedman_bin_width(a, True)
-        else:
-            raise ValueError("unrecognized bin code: '{}'".format(bins))
 
+    bins = calculate_bin_edges(a, bins=bins, range=range, weights=weights)
     # Now we call numpy's histogram with the resulting bin edges
     return np.histogram(a, bins=bins, range=range, weights=weights, **kwargs)
 

diff --git a/astropy/stats/tests/test_histogram.py b/astropy/stats/tests/test_histogram.py
@@ -4,7 +4,8 @@
 import numpy as np
 from numpy.testing import assert_allclose
 
-from .. import histogram, scott_bin_width, freedman_bin_width, knuth_bin_width
+from .. import (histogram, calculate_bin_edges,
+                scott_bin_width, freedman_bin_width, knuth_bin_width)
 
 try:
     import scipy  # pylint: disable=W0611
@@ -80,24 +81,47 @@ def test_knuth_histogram(N=1000, rseed=0):
     assert (len(counts) == len(bins) - 1)
 
 
-def test_histogram(N=1000, rseed=0):
+_bin_types_to_test = [30, 'scott', 'freedman', 'blocks']
+
+if HAS_SCIPY:
+    _bin_types_to_test += ['knuth']
+
+
+@pytest.mark.parametrize('bin_type',
+                         _bin_types_to_test + [np.linspace(-5, 5, 31)])
+def test_histogram(bin_type, N=1000, rseed=0):
     rng = np.random.RandomState(rseed)
     x = rng.randn(N)
 
-    for bins in [30, np.linspace(-5, 5, 31),
-                 'scott', 'freedman', 'blocks']:
-        counts, bins = histogram(x, bins)
-        assert (counts.sum() == len(x))
-        assert (len(counts) == len(bins) - 1)
+    counts, bins = histogram(x, bin_type)
+    assert (counts.sum() == len(x))
+    assert (len(counts) == len(bins) - 1)
 
 
-def test_histogram_range(N=1000, rseed=0):
+# Don't include a list of bins as a bin_type here because the effect
+# of range is different in that case
+@pytest.mark.parametrize('bin_type', _bin_types_to_test)
+def test_histogram_range(bin_type, N=1000, rseed=0):
+    # Regression test for #8010
     rng = np.random.RandomState(rseed)
     x = rng.randn(N)
     range = (0.1, 0.8)
 
-    for bins in ['scott', 'freedman', 'blocks']:
-        counts, bins = histogram(x, bins, range=range)
+    bins = calculate_bin_edges(x, bin_type, range=range)
+    assert bins.max() == range[1]
+    assert bins.min() == range[0]
+
+
+def test_histogram_range_with_bins_list(N=1000, rseed=0):
+    # The expected result when the input bins is a list is
+    # the same list on output.
+    rng = np.random.RandomState(rseed)
+    x = rng.randn(N)
+    range = (0.1, 0.8)
+
+    input_bins = np.linspace(-5, 5, 31)
+    bins = calculate_bin_edges(x, input_bins, range=range)
+    assert all(bins == input_bins)
 
 
 @pytest.mark.skipif('not HAS_SCIPY')

diff --git a/astropy/visualization/hist.py b/astropy/visualization/hist.py
@@ -3,12 +3,12 @@
 
 import numpy as np
 from inspect import signature
-from ..stats import histogram
+from ..stats.histogram import calculate_bin_edges
 
 __all__ = ['hist']
 
 
-def hist(x, bins=10, ax=None, **kwargs):
+def hist(x, bins=10, ax=None, max_bins=1e5, **kwargs):
     """Enhanced histogram function
 
     This is a histogram function that enables the use of more sophisticated
@@ -38,6 +38,12 @@ def hist(x, bins=10, ax=None, **kwargs):
         specify the Axes on which to draw the histogram.  If not specified,
         then the current active axes will be used.
 
+    max_bins : int (optional)
+        Maximum number of bins allowed. With more than a few thousand bins
+        the performance of matplotlib will not be great. If the number of
+        bins is large *and* the number of input data points is large then
+        the it will take a very long time to compute the histogram.
+
     **kwargs :
         other keyword arguments are described in ``plt.hist()``.
 
@@ -49,10 +55,17 @@ def hist(x, bins=10, ax=None, **kwargs):
     --------
     astropy.stats.histogram
     """
-    # arguments of np.histogram should be passed to astropy.stats.histogram
-    arglist = list(signature(np.histogram).parameters.keys())[1:]
-    np_hist_kwds = dict((key, kwargs[key]) for key in arglist if key in kwargs)
-    hist, bins = histogram(x, bins, **np_hist_kwds)
+    # Note that we only calculate the bin edges...matplotlib will calculate
+    # the actual histogram.
+    range = kwargs.get('range', None)
+    weights = kwargs.get('weights', None)
+    bins = calculate_bin_edges(x, bins, range=range, weights=weights)
+
+    if len(bins) > max_bins:
+        raise ValueError('Histogram has too many bins: '
+                         '{nbin}. Use max_bins to increase the number '
+                         'of allowed bins or range to restrict '
+                         'the histogram range.'.format(nbin=len(bins)))
 
     if ax is None:
         # optional dependency; only import if strictly needed.

diff --git a/astropy/visualization/tests/test_histogram.py b/astropy/visualization/tests/test_histogram.py
@@ -67,3 +67,16 @@ def test_hist_autobin(rseed=0):
             n2, bins2, patches = hist(x, bintype, range=range)
             assert_allclose(n1, n2)
             assert_allclose(bins1, bins2)
+
+
+def test_histogram_pathological_input():
+    # Regression test for https://github.com/astropy/astropy/issues/7758
+
+    # The key feature of the data below is that one of the points is very,
+    # very different than the rest. That leads to a large number of bins.
+    data = [9.99999914e+05, -8.31312483e-03, 6.52755852e-02, 1.43104653e-03,
+            -2.26311017e-02, 2.82660007e-03, 1.80307521e-02, 9.26294279e-03,
+            5.06606026e-02, 2.05418011e-03]
+
+    with pytest.raises(ValueError):
+        hist(data, bins='freedman', max_bins=10000)