add a new module: pycraf.mc containing MC helpers

bwinkel · Aug 1, 2017 · 232d4a3 · 232d4a3
1 parent 3db3118
commit 232d4a3
Show file tree

Hide file tree

Showing 7 changed files with 289 additions and 2 deletions.
diff --git a/docs/index.rst b/docs/index.rst
@@ -44,6 +44,7 @@ Available modules
    geospatial/index
    satellite/index
    geometry/index
+   mc/index
 
 .. automodapi:: pycraf
     :no-heading:
@@ -64,7 +65,7 @@ Project details
 Acknowledgments
 ***************
 
-This code is makes use of the excellent work provided by the
+This code makes use of the excellent work provided by the
 `Astropy <http://www.astropy.org/>`__ community. pycraf uses the Astropy package and also the
 `Astropy Package Template <https://github.com/astropy/package-template>`__
 for the packaging.

diff --git a/docs/mc/index.rst b/docs/mc/index.rst
@@ -0,0 +1,26 @@
+.. pycraf-mc:
+
+*********************************
+Monte-Carlo helpers (`pycraf.mc`)
+*********************************
+
+.. currentmodule:: pycraf.mc
+
+Introduction
+============
+
+The `~pycraf.mc` subpackage
+
+
+See Also
+========
+
+- `Kernel density estimation (KDE) <https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/>`__, blog by Jake Vanderplas.
+- `Scipy gaussian_kde <https://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.gaussian_kde.html#scipy.stats.gaussian_kde>`__
+
+Reference/API
+=============
+
+.. automodapi:: pycraf.mc
+    :no-inheritance-diagram:
+    :include-all-objects:
diff --git a/pycraf/__init__.py b/pycraf/__init__.py
@@ -21,7 +21,8 @@
     from . import conversions
     from . import geometry
     from . import geospatial
-    from . import utils
+    from . import mc
     from . import pathprof
     from . import protection
     from . import satellite
+    from . import utils
diff --git a/pycraf/mc/__init__.py b/pycraf/mc/__init__.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+'''
+This subpackage contains various little convenience functions to
+help with Monte-Carlo type compatiblity studies.
+'''
+
+from .sampler import *
diff --git a/pycraf/mc/sampler.py b/pycraf/mc/sampler.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+
+__all__ = ['HistogramSampler']
+
+
+class HistogramSampler(object):
+    '''
+    Sampler to get random values obeying a discrete(!) density distribution.
+
+    With this class, one can use discrete densities (think of them as
+    binned entities, aka histograms) to get random samples that follow
+    the same (binned) distribution as the histogram. For simplicity
+    the returned values are the N-dim indices that need to be fed
+    into the histogram bin-ranges if one wants to convert them
+    to physical values (see examples, below).
+
+    Parameters
+    ----------
+    histvals : N-D `~numpy.ndarray`
+        Discrete density distribution. (This is the histogram array, which
+        one would get out of `~numpy.histogram` functions.)
+
+    Returns
+    -------
+    hist_sampler : `~pycraf.mc.HistogramSampler`
+        A `~pycraf.mc.HistogramSampler` instance.
+
+    Examples
+    --------
+
+    A trivial one-dimensional use case::
+
+        >>> import numpy as np
+        >>> from astropy.utils.misc import NumpyRNGContext
+        >>> from pycraf import mc
+
+        >>> with NumpyRNGContext(1):
+        ...     x = np.random.normal(0, 1, 100)
+
+        >>> hist, bins = np.histogram(x, bins=16, range=(-4, 4))
+        >>> mid_points = (bins[1:] + bins[:-1]) / 2
+
+        >>> my_sampler = mc.HistogramSampler(hist)
+        >>> with NumpyRNGContext(1):
+        ...     indices = my_sampler.sample(10)
+        >>> print(indices)
+        [7 9 3 7 6 5 6 7 7 8]
+
+        >>> print(mid_points[indices])
+        [-0.25  0.75 -2.25 -0.25 -0.75 -1.25 -0.75 -0.25 -0.25  0.25]
+
+    Works equally simple in 2D::
+
+        >>> with NumpyRNGContext(1):
+        ...     x = np.random.normal(0, 1, 1000)
+        ...     y = np.random.normal(2, 2, 1000)
+
+        >>> hist2d, xbins, ybins = np.histogram2d(
+        ...     x, y, bins=(16, 16), range=((-4, 4), (-6, 10))
+        ...     )
+        >>> xmids = (xbins[1:] + xbins[:-1]) / 2
+        >>> ymids = (ybins[1:] + ybins[:-1]) / 2
+
+        >>> my_sampler = mc.HistogramSampler(hist2d)
+        >>> with NumpyRNGContext(1):
+        ...     indices = my_sampler.sample(10)
+        >>> print(list(zip(*indices)))
+        [(7, 8), (9, 6), (1, 7), (7, 5), (6, 4),
+         (5, 7), (6, 7), (7, 6), (7, 8), (8, 6)]
+
+        >>> print(list(zip(xmids[indices[0]], ymids[indices[1]])))
+        [(-0.25, 2.5), (0.75, 0.5), (-3.25, 1.5), (-0.25, -0.5),
+         (-0.75, -1.5), (-1.25, 1.5), (-0.75, 1.5), (-0.25, 0.5),
+         (-0.25, 2.5), (0.25, 0.5)]
+
+    It is also easily possible to apply weights. Just assume
+    that one bin was observed exceptionally frequent::
+
+        >>> weights = np.ones_like(x)
+        >>> weights[500] = 1000
+
+        >>> hist2d, xbins, ybins = np.histogram2d(
+        ...     x, y, bins=(16, 16), range=((-4, 4), (-6, 10)),
+        ...     weights=weights
+        ...     )
+
+        >>> my_sampler = mc.HistogramSampler(hist2d)
+        >>> with NumpyRNGContext(1):
+        ...     indices = my_sampler.sample(10)
+        >>> print(list(zip(xmids[indices[0]], ymids[indices[1]])))
+        [(-1.75, 4.5), (-0.25, 3.5), (-3.25, 1.5), (-1.75, 4.5),
+         (-1.75, 4.5), (-1.75, 4.5), (-1.75, 4.5), (-1.75, 4.5),
+         (-1.75, 4.5), (-1.25, 0.5)]
+
+    As can be seen, the value `((1.25, -1.5))` is now exceptionally
+    often sampled from the distribution.
+
+    As discussed in the notes, for some use-cases a KDE might
+    be the better tool:
+
+        >>> from scipy.stats import gaussian_kde
+
+        >>> kernel = gaussian_kde((x, y))
+        >>> with NumpyRNGContext(1):
+        ...     values = kernel.resample(10)
+
+        >>> print('\\n'.join(
+        ...     '({:9.6f}, {:9.6f})'.format(*t)
+        ...     for t in zip(values[0], values[1])
+        ...     ))
+        (-1.470808,  0.730811)
+        ( 0.088397,  3.407584)
+        ...
+        (-2.097790, -0.251477)
+        ( 0.261941, -0.936229)
+
+
+    Notes
+    -----
+    Even if you have continuous data to start with, using the histogram
+    approach obviously works, as well (as is demonstrated in the examples).
+    However, one looses the "continuous" property in the process.
+    The resulting samples will always be just be able to work
+    out the bin, but no continuous quantity can be reconstructed.
+    For many use cases this is probably fine, but in others one
+    might be better of by using Kernel Density Estimation.
+    There is a function for this in `~scipy.stats`
+    (`~scipy.stats.gaussian_kde`), which even allows works with
+    multi-variate data and allows one to sample from the KDE PDF
+    (see also the examples). Unfortunately, one cannot work with
+    weighted data.
+    '''
+
+    def __init__(self, histvals):
+
+        histvals = np.atleast_1d(histvals)
+
+        self._hshape = histvals.shape
+        self._ndim = histvals.ndim
+        # cdf is flat, will need to unravel indices later
+        self._cdf = np.cumsum(
+            histvals.flatten().astype(np.float64, copy=False)
+            )
+        self._cdf /= self._cdf[-1]
+
+    def sample(self, n):
+        '''
+        Sample from the (discrete) density distribution.
+
+        Parameters
+        ----------
+        n : int
+            Number of samples to draw.
+
+        Returns
+        -------
+        Indices : tuple of `~numpy.ndarray`
+            The indices of the drawn samples. See
+            `~pycraf.mc.HistogramSampler` for examples of use.
+        '''
+
+        rsamples = np.random.rand(n)
+        rbins = np.searchsorted(self._cdf, rsamples)
+
+        indices = np.unravel_index(rbins, self._hshape)
+
+        if self._ndim == 1:
+            return indices[0]
+        else:
+            return indices
diff --git a/pycraf/mc/tests/__init__.py b/pycraf/mc/tests/__init__.py
diff --git a/pycraf/mc/tests/test_sampler.py b/pycraf/mc/tests/test_sampler.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import pytest
+import numpy as np
+from numpy.testing import assert_equal, assert_allclose
+from astropy.tests.helper import assert_quantity_allclose, remote_data
+from astropy.utils.misc import NumpyRNGContext
+from astropy import units as apu
+from ...utils import check_astro_quantities
+from ... import mc
+
+
+TOL_KWARGS = {'atol': 1.e-4, 'rtol': 1.e-4}
+
+
+class TestHistogramSamper():
+
+    def setup(self):
+
+        with NumpyRNGContext(1):
+            self.x = np.random.normal(0, 1, 1000)
+            self.y = np.random.normal(2, 2, 1000)
+
+        self.xbins = np.linspace(-4, 4, 17)
+        self.ybins = np.linspace(-6, 10, 17)
+        self.xmids = (self.xbins[1:] + self.xbins[:-1]) / 2
+        self.ymids = (self.ybins[1:] + self.ybins[:-1]) / 2
+
+    def test_sample1d(self):
+
+        hist, _ = np.histogram(self.x, bins=self.xbins)
+
+        my_sampler = mc.HistogramSampler(hist)
+        with NumpyRNGContext(1):
+            indices = my_sampler.sample(10)
+        assert_equal(
+            indices,
+            np.array([7, 9, 1, 7, 6, 5, 6, 7, 7, 8]),
+            )
+
+        assert_allclose(
+            self.xmids[indices],
+            np.array([-0.25, 0.75, -3.25, -0.25, -0.75,
+                      -1.25, -0.75, -0.25, -0.25, 0.25]),
+            )
+
+    def test_sample2d(self):
+
+        hist, *_ = np.histogram2d(
+            self.x, self.y, bins=[self.xbins, self.ybins],
+            )
+
+        my_sampler = mc.HistogramSampler(hist)
+        with NumpyRNGContext(1):
+            indices = my_sampler.sample(10)
+        assert_equal(
+            indices,
+            np.array([
+                [7, 9, 1, 7, 6, 5, 6, 7, 7, 8],
+                [8, 6, 7, 5, 4, 7, 7, 6, 8, 6]
+                ]),
+            )
+
+        assert_allclose(
+            self.xmids[indices[0]],
+            np.array([-0.25, 0.75, -3.25, -0.25, -0.75,
+                      -1.25, -0.75, -0.25, -0.25, 0.25]),
+            )
+
+        assert_allclose(
+            self.ymids[indices[1]],
+            np.array([2.5, 0.5, 1.5, -0.5, -1.5,
+                      1.5, 1.5, 0.5, 2.5, 0.5]),
+            )