Skip to content

Commit

Permalink
Merge 6c5b4e8 into 5b41d87
Browse files Browse the repository at this point in the history
  • Loading branch information
ColCarroll committed Sep 9, 2018
2 parents 5b41d87 + 6c5b4e8 commit f4e6dfd
Show file tree
Hide file tree
Showing 10 changed files with 100 additions and 70 deletions.
Binary file modified arviz/data/centered_eight.nc
Binary file not shown.
Binary file modified arviz/data/non_centered_eight.nc
Binary file not shown.
10 changes: 8 additions & 2 deletions arviz/inference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,16 @@ def from_netcdf(filename):
groups[group] = data
return InferenceData(**groups)

def to_netcdf(self, filename):
def to_netcdf(self, filename, compress=True):
"""Write InferenceData to file using netcdf4.
Parameters
----------
filename : str
Location to write to
compress : bool
Whether to compress result. Note this saves disk space, but may make
saving and loading somewhat slower (default: True).
Returns
-------
Expand All @@ -74,7 +77,10 @@ def to_netcdf(self, filename):
mode = 'w' # overwrite first, then append
for group in self._groups:
data = getattr(self, group)
data.to_netcdf(filename, mode=mode, group=group)
kwargs = {}
if compress:
kwargs['encoding'] = {var_name: {'zlib': True} for var_name in data.variables}
data.to_netcdf(filename, mode=mode, group=group, **kwargs)
data.close()
mode = 'a'
return filename
3 changes: 1 addition & 2 deletions arviz/plots/autocorrplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
from ..stats.diagnostics import autocorr


def autocorrplot(data, var_names=None, max_lag=100, combined=False,
figsize=None, textsize=None):
def autocorrplot(data, var_names=None, max_lag=100, combined=False, figsize=None, textsize=None):
"""Bar plot of the autocorrelation function for a sequence of data.
Useful in particular for posteriors from MCMC samples which may display correlation.
Expand Down
109 changes: 66 additions & 43 deletions arviz/plots/ppcplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,81 +4,104 @@
from .plot_utils import _scale_text, _create_axes_grid, default_grid


def ppcplot(data, ppc_sample, kind='kde', mean=True, figsize=None, textsize=None, ax=None):
def ppcplot(data, kind='kde', alpha=0.2, mean=True, figsize=None, textsize=None):
"""
Plot for Posterior Predictive checks.
Note that this plot will flatten out any dimensions in the posterior predictive variables.
Parameters
----------
data : Array-like
Observed values
ppc_samples : dict
Posterior predictive check samples
kind : str
Type of plot to display (kde or cumulative)
alpha : float
Opacity of posterior predictive density curves
mean : bool
Whether or not to plot the mean ppc distribution. Defaults to True
Whether or not to plot the mean posterior predictive distribution. Defaults to True
figsize : figure size tuple
If None, size is (6, 5)
textsize: int
Text size for labels. If None it will be auto-scaled based on figsize.
ax: axes
Matplotlib axes
Returns
-------
ax : matplotlib axes
axes : matplotlib axes
"""
rows, cols = default_grid(len(ppc_sample))

if figsize is None:
figsize = (7, 5)
for group in ('posterior_predictive', 'observed_data'):
if not hasattr(data, group):
raise TypeError(
'`data` argument must have the group "{group}" for ppcplot'.format(group=group))

_, ax = _create_axes_grid(len(ppc_sample), rows, cols, figsize=figsize)
observed = data.observed_data
posterior_predictive = data.posterior_predictive

textsize, linewidth, _ = _scale_text(figsize, textsize, 2)
rows, cols = default_grid(len(observed.data_vars))
if figsize is None:
figsize = (7 * cols, 5 * rows)
_, axes = _create_axes_grid(len(observed.data_vars), rows, cols, figsize=figsize)

for ax_, (var, ppss) in zip(np.atleast_1d(ax), ppc_sample.items()):
textsize, linewidth, _ = _scale_text(figsize, textsize)
for ax, var_name in zip(np.atleast_1d(axes), observed.data_vars):
if kind == 'kde':
kdeplot(data, label='{}'.format(var),
kdeplot(observed[var_name].values.flatten(), label='Observed {}'.format(var_name),
plot_kwargs={'color': 'k', 'linewidth': linewidth, 'zorder': 3},
fill_kwargs={'alpha': 0},
ax=ax_)
for pps in ppss:
kdeplot(pps,
plot_kwargs={'color': 'C5', 'linewidth': 0.5 * linewidth},
fill_kwargs={'alpha': 0},
ax=ax_)
ax_.plot([], color='C5', label='{}_pps'.format(var))
ax=ax)
for _, chain_vals in posterior_predictive[var_name].groupby('chain'):
for _, vals in chain_vals.groupby('draw'):
kdeplot(vals,
plot_kwargs={'color': 'C4',
'alpha': alpha,
'linewidth': 0.5 * linewidth},
fill_kwargs={'alpha': 0},
ax=ax)
ax.plot([], color='C4', label='Posterior predictive {}'.format(var_name))
if mean:
kdeplot(ppss,
kdeplot(posterior_predictive[var_name].values.flatten(),
plot_kwargs={'color': 'C0',
'linestyle': '--',
'linewidth': linewidth,
'zorder': 2},
label='mean {}_pps'.format(var),
ax=ax_)
ax_.set_xlabel(var, fontsize=textsize)
ax_.set_yticks([])
label='Posterior predictive mean {}'.format(var_name),
ax=ax)
ax.set_xlabel(var_name, fontsize=textsize)
ax.set_yticks([])

elif kind == 'cumulative':
ax_.plot(*_ecdf(data), color='k', lw=linewidth, label='{}'.format(var), zorder=3)
for pps in ppss:
ax_.plot(*_ecdf(pps), alpha=0.2, color='C5', lw=linewidth)
ax_.plot([], color='C5', label='{}_pps'.format(var))
ax.plot(*_empirical_cdf(observed[var_name].values.flatten()),
color='k',
linewidth=linewidth,
label='Observed {}'.format(var_name),
zorder=3)
for _, chain_vals in posterior_predictive[var_name].groupby('chain'):
for _, vals in chain_vals.groupby('draw'):
ax.plot(*_empirical_cdf(vals), alpha=alpha, color='C4', linewidth=linewidth)
ax.plot([], color='C4', label='Posterior predictive {}'.format(var_name))
if mean:
ax_.plot(*_ecdf(ppss.flatten()), color='C0', ls='--', lw=linewidth,
label='mean {}_pps'.format(var))
ax_.set_xlabel(var, fontsize=textsize)
ax_.set_yticks([0, 0.5, 1])
ax_.legend(fontsize=textsize)
ax_.tick_params(labelsize=textsize)
ax.plot(*_empirical_cdf(posterior_predictive[var_name].values.flatten()),
color='C0',
linestyle='--',
linewidth=linewidth,
label='Posterior predictive mean {}'.format(var_name))
ax.set_xlabel(var_name, fontsize=textsize)
ax.set_yticks([0, 0.5, 1])
ax.legend(fontsize=textsize)
return axes


return ax
def _empirical_cdf(data):
"""Compute empirical cdf of a numpy array.
Parameters
----------
data : np.array
1d array
def _ecdf(data):
len_data = len(data)
data_s = np.sort(data)
cdf = np.arange(1, len_data + 1) / len_data
return data_s, cdf
Returns
-------
np.array, np.array
x and y coordinates for the empirical cdf of the data
"""
return np.sort(data), np.linspace(0, 1, len(data))
7 changes: 5 additions & 2 deletions arviz/tests/test_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest

from .helpers import eight_schools_params, load_cached_models, BaseArvizTest
from ..utils import pymc3_to_inference_data
from ..plots import (densityplot, traceplot, energyplot, posteriorplot, autocorrplot, forestplot,
parallelplot, pairplot, jointplot, ppcplot, violintraceplot)

Expand Down Expand Up @@ -80,8 +81,10 @@ def test_pairplot(self):
pairplot(self.short_trace, kind='hexbin', var_names=['theta'],
coords={'theta_dim_0': [0, 1]}, plot_kwargs={'cmap': 'viridis'}, textsize=20)

def test_ppcplot(self):
ppcplot(self.data['y'], self.sample_ppc)
@pytest.mark.parametrize('kind', ['kde', 'cumulative'])
def test_ppcplot(self, kind):
data = pymc3_to_inference_data(trace=self.short_trace, posterior_predictive=self.sample_ppc)
ppcplot(data, kind=kind)

def test_violintraceplot(self):
for obj in (self.short_trace, self.fit):
Expand Down
2 changes: 1 addition & 1 deletion arviz/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ def load_arviz_data(dataset):
NUTS in PyMC3. Features named coordinates for each of the eight schools.
''',
'path': os.path.join(data_path, 'non_centered_eight.nc')
}
},
}
if dataset in datasets_available:
return InferenceData.from_netcdf(datasets_available[dataset]['path'])
Expand Down
6 changes: 5 additions & 1 deletion arviz/utils/xarray_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,9 +282,13 @@ def posterior_to_xarray(self):
@requires('trace')
def sample_stats_to_xarray(self):
"""Extract sample_stats from PyMC3 trace."""
rename_key = {
'model_logp' : 'lp',
}
data = {}
for stat in self.trace.stat_names:
data[stat] = np.array(self.trace.get_sampler_stats(stat, combine=False))
name = rename_key.get(stat, stat)
data[name] = np.array(self.trace.get_sampler_stats(stat, combine=False))
return dict_to_dataset(data)

@requires('posterior_predictive')
Expand Down
21 changes: 2 additions & 19 deletions examples/ppcplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,8 @@
_thumb: .6, .5
"""
import arviz as az
import numpy as np
import pymc3 as pm

az.style.use('arviz-darkgrid')

# Data of the Eight Schools Model
J = 8
y = np.array([28., 8., -3., 7., -1., 1., 18., 12.])
sigma = np.array([15., 10., 16., 11., 9., 11., 10., 18.])


with pm.Model() as centered_eight:
mu = pm.Normal('mu', mu=0, sd=5)
tau = pm.HalfCauchy('tau', beta=5)
theta = pm.Normal('theta', mu=mu, sd=tau, shape=J)
obs = pm.Normal('obs', mu=theta, sd=sigma, observed=y)
centered_eight_trace = pm.sample()

with centered_eight:
ppc_samples = pm.sample_ppc(centered_eight_trace, samples=100)

az.ppcplot(y, ppc_samples)
data = az.load_arviz_data('non_centered_eight')
az.ppcplot(data, alpha=0.03, figsize=(12, 6), textsize=14);
12 changes: 12 additions & 0 deletions examples/ppcplot_cumulative.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
Posterior Predictive Check Cumulative Plot
==========================================
_thumb: .6, .5
"""
import arviz as az

az.style.use('arviz-darkgrid')

data = az.load_arviz_data('non_centered_eight')
az.ppcplot(data, alpha=0.03, kind='cumulative', figsize=(12, 6), textsize=14)

0 comments on commit f4e6dfd

Please sign in to comment.