Merge 6c5b4e8 into 5b41d87

arviz-devs · Sep 9, 2018 · f4e6dfd · f4e6dfd
2 parents 5b41d87 + 6c5b4e8
commit f4e6dfd
Show file tree

Hide file tree

Showing 10 changed files with 100 additions and 70 deletions.
diff --git a/arviz/data/centered_eight.nc b/arviz/data/centered_eight.nc
diff --git a/arviz/data/non_centered_eight.nc b/arviz/data/non_centered_eight.nc
diff --git a/arviz/inference_data.py b/arviz/inference_data.py
@@ -58,13 +58,16 @@ def from_netcdf(filename):
                 groups[group] = data
         return InferenceData(**groups)
 
-    def to_netcdf(self, filename):
+    def to_netcdf(self, filename, compress=True):
         """Write InferenceData to file using netcdf4.
 
         Parameters
         ----------
         filename : str
             Location to write to
+        compress : bool
+            Whether to compress result. Note this saves disk space, but may make
+            saving and loading somewhat slower (default: True).
 
         Returns
         -------
@@ -74,7 +77,10 @@ def to_netcdf(self, filename):
         mode = 'w' # overwrite first, then append
         for group in self._groups:
             data = getattr(self, group)
-            data.to_netcdf(filename, mode=mode, group=group)
+            kwargs = {}
+            if compress:
+                kwargs['encoding'] = {var_name: {'zlib': True} for var_name in data.variables}
+            data.to_netcdf(filename, mode=mode, group=group, **kwargs)
             data.close()
             mode = 'a'
         return filename
diff --git a/arviz/plots/autocorrplot.py b/arviz/plots/autocorrplot.py
@@ -6,8 +6,7 @@
 from ..stats.diagnostics import autocorr
 
 
-def autocorrplot(data, var_names=None, max_lag=100, combined=False,
-                 figsize=None, textsize=None):
+def autocorrplot(data, var_names=None, max_lag=100, combined=False, figsize=None, textsize=None):
     """Bar plot of the autocorrelation function for a sequence of data.
 
     Useful in particular for posteriors from MCMC samples which may display correlation.

diff --git a/arviz/plots/ppcplot.py b/arviz/plots/ppcplot.py
@@ -4,81 +4,104 @@
 from .plot_utils import _scale_text, _create_axes_grid, default_grid
 
 
-def ppcplot(data, ppc_sample, kind='kde', mean=True, figsize=None, textsize=None, ax=None):
+def ppcplot(data, kind='kde', alpha=0.2, mean=True, figsize=None, textsize=None):
     """
     Plot for Posterior Predictive checks.
 
+    Note that this plot will flatten out any dimensions in the posterior predictive variables.
+
     Parameters
     ----------
     data : Array-like
         Observed values
-    ppc_samples : dict
-        Posterior predictive check samples
     kind : str
         Type of plot to display (kde or cumulative)
+    alpha : float
+        Opacity of posterior predictive density curves
     mean : bool
-        Whether or not to plot the mean ppc distribution. Defaults to True
+        Whether or not to plot the mean posterior predictive distribution. Defaults to True
     figsize : figure size tuple
         If None, size is (6, 5)
     textsize: int
         Text size for labels. If None it will be auto-scaled based on figsize.
-    ax: axes
-        Matplotlib axes
 
     Returns
     -------
-    ax : matplotlib axes
+    axes : matplotlib axes
     """
-    rows, cols = default_grid(len(ppc_sample))
-
-    if figsize is None:
-        figsize = (7, 5)
+    for group in ('posterior_predictive', 'observed_data'):
+        if not hasattr(data, group):
+            raise TypeError(
+                '`data` argument must have the group "{group}" for ppcplot'.format(group=group))
 
-    _, ax = _create_axes_grid(len(ppc_sample), rows, cols, figsize=figsize)
+    observed = data.observed_data
+    posterior_predictive = data.posterior_predictive
 
-    textsize, linewidth, _ = _scale_text(figsize, textsize, 2)
+    rows, cols = default_grid(len(observed.data_vars))
+    if figsize is None:
+        figsize = (7 * cols, 5 * rows)
+    _, axes = _create_axes_grid(len(observed.data_vars), rows, cols, figsize=figsize)
 
-    for ax_, (var, ppss) in zip(np.atleast_1d(ax), ppc_sample.items()):
+    textsize, linewidth, _ = _scale_text(figsize, textsize)
+    for ax, var_name in zip(np.atleast_1d(axes), observed.data_vars):
         if kind == 'kde':
-            kdeplot(data, label='{}'.format(var),
+            kdeplot(observed[var_name].values.flatten(), label='Observed {}'.format(var_name),
                     plot_kwargs={'color': 'k', 'linewidth': linewidth, 'zorder': 3},
                     fill_kwargs={'alpha': 0},
-                    ax=ax_)
-            for pps in ppss:
-                kdeplot(pps,
-                        plot_kwargs={'color': 'C5', 'linewidth': 0.5 * linewidth},
-                        fill_kwargs={'alpha': 0},
-                        ax=ax_)
-            ax_.plot([], color='C5', label='{}_pps'.format(var))
+                    ax=ax)
+            for _, chain_vals in posterior_predictive[var_name].groupby('chain'):
+                for _, vals in chain_vals.groupby('draw'):
+                    kdeplot(vals,
+                            plot_kwargs={'color': 'C4',
+                                         'alpha': alpha,
+                                         'linewidth': 0.5 * linewidth},
+                            fill_kwargs={'alpha': 0},
+                            ax=ax)
+            ax.plot([], color='C4', label='Posterior predictive {}'.format(var_name))
             if mean:
-                kdeplot(ppss,
+                kdeplot(posterior_predictive[var_name].values.flatten(),
                         plot_kwargs={'color': 'C0',
                                      'linestyle': '--',
                                      'linewidth': linewidth,
                                      'zorder': 2},
-                        label='mean {}_pps'.format(var),
-                        ax=ax_)
-            ax_.set_xlabel(var, fontsize=textsize)
-            ax_.set_yticks([])
+                        label='Posterior predictive mean {}'.format(var_name),
+                        ax=ax)
+            ax.set_xlabel(var_name, fontsize=textsize)
+            ax.set_yticks([])
 
         elif kind == 'cumulative':
-            ax_.plot(*_ecdf(data), color='k', lw=linewidth, label='{}'.format(var), zorder=3)
-            for pps in ppss:
-                ax_.plot(*_ecdf(pps), alpha=0.2, color='C5', lw=linewidth)
-            ax_.plot([], color='C5', label='{}_pps'.format(var))
+            ax.plot(*_empirical_cdf(observed[var_name].values.flatten()),
+                    color='k',
+                    linewidth=linewidth,
+                    label='Observed {}'.format(var_name),
+                    zorder=3)
+            for _, chain_vals in posterior_predictive[var_name].groupby('chain'):
+                for _, vals in chain_vals.groupby('draw'):
+                    ax.plot(*_empirical_cdf(vals), alpha=alpha, color='C4', linewidth=linewidth)
+            ax.plot([], color='C4', label='Posterior predictive {}'.format(var_name))
             if mean:
-                ax_.plot(*_ecdf(ppss.flatten()), color='C0', ls='--', lw=linewidth,
-                         label='mean {}_pps'.format(var))
-            ax_.set_xlabel(var, fontsize=textsize)
-            ax_.set_yticks([0, 0.5, 1])
-        ax_.legend(fontsize=textsize)
-        ax_.tick_params(labelsize=textsize)
+                ax.plot(*_empirical_cdf(posterior_predictive[var_name].values.flatten()),
+                        color='C0',
+                        linestyle='--',
+                        linewidth=linewidth,
+                        label='Posterior predictive mean {}'.format(var_name))
+            ax.set_xlabel(var_name, fontsize=textsize)
+            ax.set_yticks([0, 0.5, 1])
+        ax.legend(fontsize=textsize)
+    return axes
+
 
-    return ax
+def _empirical_cdf(data):
+    """Compute empirical cdf of a numpy array.
 
+    Parameters
+    ----------
+    data : np.array
+        1d array
 
-def _ecdf(data):
-    len_data = len(data)
-    data_s = np.sort(data)
-    cdf = np.arange(1, len_data + 1) / len_data
-    return data_s, cdf
+    Returns
+    -------
+    np.array, np.array
+        x and y coordinates for the empirical cdf of the data
+    """
+    return np.sort(data), np.linspace(0, 1, len(data))
diff --git a/arviz/tests/test_plots.py b/arviz/tests/test_plots.py
@@ -5,6 +5,7 @@
 import pytest
 
 from .helpers import eight_schools_params, load_cached_models, BaseArvizTest
+from ..utils import pymc3_to_inference_data
 from ..plots import (densityplot, traceplot, energyplot, posteriorplot, autocorrplot, forestplot,
                      parallelplot, pairplot, jointplot, ppcplot, violintraceplot)
 
@@ -80,8 +81,10 @@ def test_pairplot(self):
         pairplot(self.short_trace, kind='hexbin', var_names=['theta'],
                  coords={'theta_dim_0': [0, 1]}, plot_kwargs={'cmap': 'viridis'}, textsize=20)
 
-    def test_ppcplot(self):
-        ppcplot(self.data['y'], self.sample_ppc)
+    @pytest.mark.parametrize('kind', ['kde', 'cumulative'])
+    def test_ppcplot(self, kind):
+        data = pymc3_to_inference_data(trace=self.short_trace, posterior_predictive=self.sample_ppc)
+        ppcplot(data, kind=kind)
 
     def test_violintraceplot(self):
         for obj in (self.short_trace, self.fit):

diff --git a/arviz/utils/utils.py b/arviz/utils/utils.py
@@ -385,7 +385,7 @@ def load_arviz_data(dataset):
                 NUTS in PyMC3.  Features named coordinates for each of the eight schools.
             ''',
             'path': os.path.join(data_path, 'non_centered_eight.nc')
-        }
+        },
     }
     if dataset in datasets_available:
         return InferenceData.from_netcdf(datasets_available[dataset]['path'])

diff --git a/arviz/utils/xarray_utils.py b/arviz/utils/xarray_utils.py
@@ -282,9 +282,13 @@ def posterior_to_xarray(self):
     @requires('trace')
     def sample_stats_to_xarray(self):
         """Extract sample_stats from PyMC3 trace."""
+        rename_key = {
+            'model_logp' : 'lp',
+        }
         data = {}
         for stat in self.trace.stat_names:
-            data[stat] = np.array(self.trace.get_sampler_stats(stat, combine=False))
+            name = rename_key.get(stat, stat)
+            data[name] = np.array(self.trace.get_sampler_stats(stat, combine=False))
         return dict_to_dataset(data)
 
     @requires('posterior_predictive')

diff --git a/examples/ppcplot.py b/examples/ppcplot.py
@@ -5,25 +5,8 @@
 _thumb: .6, .5
 """
 import arviz as az
-import numpy as np
-import pymc3 as pm
 
 az.style.use('arviz-darkgrid')
 
-# Data of the Eight Schools Model
-J = 8
-y = np.array([28.,  8., -3.,  7., -1.,  1., 18., 12.])
-sigma = np.array([15., 10., 16., 11.,  9., 11., 10., 18.])
-
-
-with pm.Model() as centered_eight:
-    mu = pm.Normal('mu', mu=0, sd=5)
-    tau = pm.HalfCauchy('tau', beta=5)
-    theta = pm.Normal('theta', mu=mu, sd=tau, shape=J)
-    obs = pm.Normal('obs', mu=theta, sd=sigma, observed=y)
-    centered_eight_trace = pm.sample()
-
-with centered_eight:
-    ppc_samples = pm.sample_ppc(centered_eight_trace, samples=100)
-
-az.ppcplot(y, ppc_samples)
+data = az.load_arviz_data('non_centered_eight')
+az.ppcplot(data, alpha=0.03, figsize=(12, 6), textsize=14);
diff --git a/examples/ppcplot_cumulative.py b/examples/ppcplot_cumulative.py
@@ -0,0 +1,12 @@
+"""
+Posterior Predictive Check Cumulative Plot
+==========================================
+
+_thumb: .6, .5
+"""
+import arviz as az
+
+az.style.use('arviz-darkgrid')
+
+data = az.load_arviz_data('non_centered_eight')
+az.ppcplot(data, alpha=0.03, kind='cumulative', figsize=(12, 6), textsize=14)