Skip to content

Commit

Permalink
Merge pull request #7991 from mwcraig/fix-7758
Browse files Browse the repository at this point in the history
Add upper limit to number of bins in hist and refactor bin edge calculation
  • Loading branch information
bsipocz committed Nov 13, 2018
2 parents e3838cd + ce61ee1 commit 35dc079
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 43 deletions.
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1800,6 +1800,10 @@ astropy.visualization
- Fixed a bug that caused an error when plotting grids multiple times
with grid_type='contours'. [#7927]

- Put an upper limit on the number of bins in ``hist`` and ``histogram`` and
factor out calculation of bin edges into public function
``calculate_bin_edges``. [#7991]

astropy.vo
^^^^^^^^^^

Expand Down
116 changes: 89 additions & 27 deletions astropy/stats/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,90 @@
from . import bayesian_blocks

__all__ = ['histogram', 'scott_bin_width', 'freedman_bin_width',
'knuth_bin_width']
'knuth_bin_width', 'calculate_bin_edges']


def calculate_bin_edges(a, bins=10, range=None, weights=None):
"""
Calculate histogram bin edges like `numpy.histogram_bin_edges`.
Parameters
----------
a : array_like
Input data. The bin edges are calculated over the flattened array.
bins : int or list or str (optional)
If ``bins`` is an int, it is the number of bins. If it is a list
it is taken to be the bin edges. If it is a string, it must be one
of 'blocks', 'knuth', 'scott' or 'freedman'. See
`~astropy.stats.histogram` for a description of each method.
range : tuple or None (optional)
The minimum and maximum range for the histogram. If not specified,
it will be (a.min(), a.max()). However, if bins is a list it is
returned unmodified regardless of the range argument.
weights : array_like, optional
An array the same shape as ``a``. If given, the histogram accumulates
the value of the weight corresponding to ``a`` instead of returning the
count of values. This argument does not affect determination of bin
edges, though they may be used in the future as new methods are added.
"""
# if range is specified, we need to truncate the data for
# the bin-finding routines
if range is not None:
a = a[(a >= range[0]) & (a <= range[1])]

# if bins is a string, first compute bin edges with the desired heuristic
if isinstance(bins, str):
a = np.asarray(a).ravel()

# TODO: if weights is specified, we need to modify things.
# e.g. we could use point measures fitness for Bayesian blocks
if weights is not None:
raise NotImplementedError("weights are not yet supported "
"for the enhanced histogram")

if bins == 'blocks':
bins = bayesian_blocks(a)
elif bins == 'knuth':
da, bins = knuth_bin_width(a, True)
elif bins == 'scott':
da, bins = scott_bin_width(a, True)
elif bins == 'freedman':
da, bins = freedman_bin_width(a, True)
else:
raise ValueError("unrecognized bin code: '{}'".format(bins))

if range:
# Check that the upper and lower edges are what was requested.
# The current implementation of the bin width estimators does not
# guarantee this, it only ensures that data outside the range is
# excluded from calculation of the bin widths.
if bins[0] != range[0]:
bins[0] = range[0]
if bins[-1] != range[1]:
bins[-1] = range[1]

elif np.ndim(bins) == 0:
# Number of bins was given
try:
# This works for numpy 1.15 or later
bins = np.histogram_bin_edges(a, bins,
range=range,
weights=weights)
except AttributeError:
# In this case only (integer number of bins, older version of
# numpy) then calculate like the bin edges manually.
if range is not None:
lower, upper = range
else:
lower = a.min()
upper = a.max()
bins = np.linspace(lower, upper, bins + 1, endpoint=True)

return bins


def histogram(a, bins=10, range=None, weights=None, **kwargs):
Expand Down Expand Up @@ -42,7 +125,10 @@ def histogram(a, bins=10, range=None, weights=None, **kwargs):
it will be (x.min(), x.max())
weights : array_like, optional
Not Implemented
An array the same shape as ``a``. If given, the histogram accumulates
the value of the weight corresponding to ``a`` instead of returning the
count of values. This argument does not affect determination of bin
edges.
other keyword arguments are described in numpy.histogram().
Expand All @@ -58,32 +144,8 @@ def histogram(a, bins=10, range=None, weights=None, **kwargs):
--------
numpy.histogram
"""
# if bins is a string, first compute bin edges with the desired heuristic
if isinstance(bins, str):
a = np.asarray(a).ravel()

# TODO: if weights is specified, we need to modify things.
# e.g. we could use point measures fitness for Bayesian blocks
if weights is not None:
raise NotImplementedError("weights are not yet supported "
"for the enhanced histogram")

# if range is specified, we need to truncate the data for
# the bin-finding routines
if range is not None:
a = a[(a >= range[0]) & (a <= range[1])]

if bins == 'blocks':
bins = bayesian_blocks(a)
elif bins == 'knuth':
da, bins = knuth_bin_width(a, True)
elif bins == 'scott':
da, bins = scott_bin_width(a, True)
elif bins == 'freedman':
da, bins = freedman_bin_width(a, True)
else:
raise ValueError("unrecognized bin code: '{}'".format(bins))

bins = calculate_bin_edges(a, bins=bins, range=range, weights=weights)
# Now we call numpy's histogram with the resulting bin edges
return np.histogram(a, bins=bins, range=range, weights=weights, **kwargs)

Expand Down
44 changes: 34 additions & 10 deletions astropy/stats/tests/test_histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import numpy as np
from numpy.testing import assert_allclose

from .. import histogram, scott_bin_width, freedman_bin_width, knuth_bin_width
from .. import (histogram, calculate_bin_edges,
scott_bin_width, freedman_bin_width, knuth_bin_width)

try:
import scipy # pylint: disable=W0611
Expand Down Expand Up @@ -80,24 +81,47 @@ def test_knuth_histogram(N=1000, rseed=0):
assert (len(counts) == len(bins) - 1)


def test_histogram(N=1000, rseed=0):
_bin_types_to_test = [30, 'scott', 'freedman', 'blocks']

if HAS_SCIPY:
_bin_types_to_test += ['knuth']


@pytest.mark.parametrize('bin_type',
_bin_types_to_test + [np.linspace(-5, 5, 31)])
def test_histogram(bin_type, N=1000, rseed=0):
rng = np.random.RandomState(rseed)
x = rng.randn(N)

for bins in [30, np.linspace(-5, 5, 31),
'scott', 'freedman', 'blocks']:
counts, bins = histogram(x, bins)
assert (counts.sum() == len(x))
assert (len(counts) == len(bins) - 1)
counts, bins = histogram(x, bin_type)
assert (counts.sum() == len(x))
assert (len(counts) == len(bins) - 1)


def test_histogram_range(N=1000, rseed=0):
# Don't include a list of bins as a bin_type here because the effect
# of range is different in that case
@pytest.mark.parametrize('bin_type', _bin_types_to_test)
def test_histogram_range(bin_type, N=1000, rseed=0):
# Regression test for #8010
rng = np.random.RandomState(rseed)
x = rng.randn(N)
range = (0.1, 0.8)

for bins in ['scott', 'freedman', 'blocks']:
counts, bins = histogram(x, bins, range=range)
bins = calculate_bin_edges(x, bin_type, range=range)
assert bins.max() == range[1]
assert bins.min() == range[0]


def test_histogram_range_with_bins_list(N=1000, rseed=0):
# The expected result when the input bins is a list is
# the same list on output.
rng = np.random.RandomState(rseed)
x = rng.randn(N)
range = (0.1, 0.8)

input_bins = np.linspace(-5, 5, 31)
bins = calculate_bin_edges(x, input_bins, range=range)
assert all(bins == input_bins)


@pytest.mark.skipif('not HAS_SCIPY')
Expand Down
25 changes: 19 additions & 6 deletions astropy/visualization/hist.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@

import numpy as np
from inspect import signature
from ..stats import histogram
from ..stats.histogram import calculate_bin_edges

__all__ = ['hist']


def hist(x, bins=10, ax=None, **kwargs):
def hist(x, bins=10, ax=None, max_bins=1e5, **kwargs):
"""Enhanced histogram function
This is a histogram function that enables the use of more sophisticated
Expand Down Expand Up @@ -38,6 +38,12 @@ def hist(x, bins=10, ax=None, **kwargs):
specify the Axes on which to draw the histogram. If not specified,
then the current active axes will be used.
max_bins : int (optional)
Maximum number of bins allowed. With more than a few thousand bins
the performance of matplotlib will not be great. If the number of
bins is large *and* the number of input data points is large then
the it will take a very long time to compute the histogram.
**kwargs :
other keyword arguments are described in ``plt.hist()``.
Expand All @@ -49,10 +55,17 @@ def hist(x, bins=10, ax=None, **kwargs):
--------
astropy.stats.histogram
"""
# arguments of np.histogram should be passed to astropy.stats.histogram
arglist = list(signature(np.histogram).parameters.keys())[1:]
np_hist_kwds = dict((key, kwargs[key]) for key in arglist if key in kwargs)
hist, bins = histogram(x, bins, **np_hist_kwds)
# Note that we only calculate the bin edges...matplotlib will calculate
# the actual histogram.
range = kwargs.get('range', None)
weights = kwargs.get('weights', None)
bins = calculate_bin_edges(x, bins, range=range, weights=weights)

if len(bins) > max_bins:
raise ValueError('Histogram has too many bins: '
'{nbin}. Use max_bins to increase the number '
'of allowed bins or range to restrict '
'the histogram range.'.format(nbin=len(bins)))

if ax is None:
# optional dependency; only import if strictly needed.
Expand Down
13 changes: 13 additions & 0 deletions astropy/visualization/tests/test_histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,16 @@ def test_hist_autobin(rseed=0):
n2, bins2, patches = hist(x, bintype, range=range)
assert_allclose(n1, n2)
assert_allclose(bins1, bins2)


def test_histogram_pathological_input():
# Regression test for https://github.com/astropy/astropy/issues/7758

# The key feature of the data below is that one of the points is very,
# very different than the rest. That leads to a large number of bins.
data = [9.99999914e+05, -8.31312483e-03, 6.52755852e-02, 1.43104653e-03,
-2.26311017e-02, 2.82660007e-03, 1.80307521e-02, 9.26294279e-03,
5.06606026e-02, 2.05418011e-03]

with pytest.raises(ValueError):
hist(data, bins='freedman', max_bins=10000)

0 comments on commit 35dc079

Please sign in to comment.