Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft plot_bpv (bayesian p-value) #1222

Merged
merged 12 commits into from
Jun 29, 2020
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
* `plot_trace` now supports multiple aesthetics to identify chain and variable
shape and support matplotlib aliases (#1253)
* `plot_hdi` can now take already computed HDI values (#1241)
* `plot_bpv`. A new plot for Bayesian p-values (#1222)

### Maintenance and fixes
* Include data from `MultiObservedRV` to `observed_data` when using
Expand Down
2 changes: 2 additions & 0 deletions arviz/plots/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Plotting functions."""
from .autocorrplot import plot_autocorr
from .bpvplot import plot_bpv
from .compareplot import plot_compare
from .densityplot import plot_density
from .distplot import plot_dist
Expand All @@ -26,6 +27,7 @@

__all__ = [
"plot_autocorr",
"plot_bpv",
"plot_compare",
"plot_density",
"plot_dist",
Expand Down
167 changes: 167 additions & 0 deletions arviz/plots/backends/bokeh/bpvplot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
"""Bokeh Bayesian p-value Posterior predictive plot."""
import numpy as np
from bokeh.models import BoxAnnotation
from bokeh.models.annotations import Title
from scipy import stats

from . import backend_kwarg_defaults
from .. import show_layout
from ...kdeplot import plot_kde
from ...plot_utils import (
_create_axes_grid,
sample_reference_distribution,
is_valid_quantile,
)
from ....numeric_utils import _fast_kde


def plot_bpv(
ax,
length_plotters,
rows,
cols,
obs_plotters,
pp_plotters,
total_pp_samples,
kind,
t_stat,
bpv,
plot_mean,
reference,
n_ref,
hdi_prob,
color,
figsize,
ax_labelsize,
markersize,
linewidth,
plot_ref_kwargs,
backend_kwargs,
show,
):
"""Bokeh bpv plot."""
if backend_kwargs is None:
backend_kwargs = {}

backend_kwargs = {
**backend_kwarg_defaults(("dpi", "plot.bokeh.figure.dpi"),),
**backend_kwargs,
}
if ax is None:
_, axes = _create_axes_grid(
length_plotters,
rows,
cols,
figsize=figsize,
backend="bokeh",
backend_kwargs=backend_kwargs,
)
else:
axes = np.atleast_2d(ax)

if len([item for item in axes.ravel() if not None]) != length_plotters:
raise ValueError(
"Found {} variables to plot but {} axes instances. They must be equal.".format(
length_plotters, len(axes)
)
)

for i, ax_i in enumerate((item for item in axes.flatten() if item is not None)):
var_name, _, obs_vals = obs_plotters[i]
pp_var_name, _, pp_vals = pp_plotters[i]

obs_vals = obs_vals.flatten()
pp_vals = pp_vals.reshape(total_pp_samples, -1)

if kind == "p_value":
tstat_pit = np.mean(pp_vals <= obs_vals, axis=-1)
tstat_pit_dens, xmin, xmax = _fast_kde(tstat_pit)
x_s = np.linspace(xmin, xmax, len(tstat_pit_dens))
ax_i.line(x_s, tstat_pit_dens, line_width=linewidth, color=color)
# ax_i.set_yticks([])
if reference is not None:
dist = stats.beta(obs_vals.size / 2, obs_vals.size / 2)
if reference == "analytical":
lwb = dist.ppf((1 - 0.9999) / 2)
upb = 1 - lwb
x = np.linspace(lwb, upb, 500)
dens_ref = dist.pdf(x)
ax_i.line(x, dens_ref, **plot_ref_kwargs)
elif reference == "samples":
x_ss, u_dens = sample_reference_distribution(
dist, (n_ref, tstat_pit_dens.size,)
)
for x_ss_i, u_dens_i in zip(x_ss.T, u_dens.T):
ax_i.line(x_ss_i, u_dens_i, line_width=linewidth, **plot_ref_kwargs)

elif kind == "u_value":
tstat_pit = np.mean(pp_vals <= obs_vals, axis=0)
tstat_pit_dens, xmin, xmax = _fast_kde(tstat_pit)
x_s = np.linspace(xmin, xmax, len(tstat_pit_dens))
ax_i.line(x_s, tstat_pit_dens, color=color)
if reference is not None:
if reference == "analytical":
n_obs = obs_vals.size
hdi = stats.beta(n_obs / 2, n_obs / 2).ppf((1 - hdi_prob) / 2)
hdi_odds = (hdi / (1 - hdi), (1 - hdi) / hdi)
ax_i.add_layout(
BoxAnnotation(
bottom=hdi_odds[1],
top=hdi_odds[0],
fill_alpha=plot_ref_kwargs.pop("alpha"),
fill_color=plot_ref_kwargs.pop("color"),
**plot_ref_kwargs,
)
)

elif reference == "samples":
dist = stats.uniform(0, 1)
x_ss, u_dens = sample_reference_distribution(dist, (tstat_pit_dens.size, n_ref))
for x_ss_i, u_dens_i in zip(x_ss.T, u_dens.T):
ax_i.line(x_ss_i, u_dens_i, line_width=linewidth, **plot_ref_kwargs)
# ax_i.set_ylim(0, None)
# ax_i.set_xlim(0, 1)
else:
if t_stat in ["mean", "median", "std"]:
if t_stat == "mean":
tfunc = np.mean
elif t_stat == "median":
tfunc = np.median
elif t_stat == "std":
tfunc = np.std
obs_vals = tfunc(obs_vals)
pp_vals = tfunc(pp_vals, axis=1)
elif hasattr(t_stat, "__call__"):
obs_vals = t_stat(obs_vals.flatten())
pp_vals = t_stat(pp_vals)
elif is_valid_quantile(t_stat):
t_stat = float(t_stat)
obs_vals = np.quantile(obs_vals, q=t_stat)
pp_vals = np.quantile(pp_vals, q=t_stat, axis=1)
else:
raise ValueError(f"T statistics {t_stat} not implemented")

plot_kde(pp_vals, ax=ax_i, plot_kwargs={"color": color}, backend="bokeh", show=False)
# ax_i.set_yticks([])
if bpv:
p_value = np.mean(pp_vals <= obs_vals)
ax_i.line(0, 0, legend_label=f"bpv={p_value:.2f}", alpha=0)

if plot_mean:
ax_i.circle(
obs_vals.mean(), 0, fill_color=color, line_color="black", size=markersize
)

if var_name != pp_var_name:
xlabel = "{} / {}".format(var_name, pp_var_name)
else:
xlabel = var_name
_title = Title()
_title.text = xlabel
ax_i.title = _title
size = str(int(ax_labelsize))
ax_i.title.text_font_size = f"{size}pt"

show_layout(axes, show)

return axes
1 change: 1 addition & 0 deletions arviz/plots/backends/matplotlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def backend_show(show):


from .autocorrplot import plot_autocorr
from .bpvplot import plot_bpv
from .compareplot import plot_compare
from .densityplot import plot_density
from .distplot import plot_dist
Expand Down
140 changes: 140 additions & 0 deletions arviz/plots/backends/matplotlib/bpvplot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""Matplotib Bayesian p-value Posterior predictive plot."""
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from . import backend_show
from ...kdeplot import plot_kde
from ...plot_utils import (
make_label,
_create_axes_grid,
sample_reference_distribution,
is_valid_quantile,
)
from ....numeric_utils import _fast_kde


def plot_bpv(
ax,
length_plotters,
rows,
cols,
obs_plotters,
pp_plotters,
total_pp_samples,
kind,
t_stat,
bpv,
plot_mean,
reference,
n_ref,
hdi_prob,
color,
figsize,
ax_labelsize,
markersize,
linewidth,
plot_ref_kwargs,
backend_kwargs,
show,
):
"""Matplotlib bpv plot."""
if ax is None:
_, axes = _create_axes_grid(
length_plotters, rows, cols, figsize=figsize, backend_kwargs=backend_kwargs
)
else:
axes = np.ravel(ax)
if len(axes) != length_plotters:
raise ValueError(
"Found {} variables to plot but {} axes instances. They must be equal.".format(
length_plotters, len(axes)
)
)

for i, ax_i in enumerate(axes):
var_name, selection, obs_vals = obs_plotters[i]
pp_var_name, _, pp_vals = pp_plotters[i]

obs_vals = obs_vals.flatten()
pp_vals = pp_vals.reshape(total_pp_samples, -1)

if kind == "p_value":
tstat_pit = np.mean(pp_vals <= obs_vals, axis=-1)
tstat_pit_dens, xmin, xmax = _fast_kde(tstat_pit)
x_s = np.linspace(xmin, xmax, len(tstat_pit_dens))
ax_i.plot(x_s, tstat_pit_dens, linewidth=linewidth, color=color)
ax_i.set_yticks([])
if reference is not None:
dist = stats.beta(obs_vals.size / 2, obs_vals.size / 2)
if reference == "analytical":
lwb = dist.ppf((1 - 0.9999) / 2)
upb = 1 - lwb
x = np.linspace(lwb, upb, 500)
dens_ref = dist.pdf(x)
ax_i.plot(x, dens_ref, **plot_ref_kwargs)
elif reference == "samples":
x_ss, u_dens = sample_reference_distribution(
dist, (n_ref, tstat_pit_dens.size,)
)
ax_i.plot(x_ss, u_dens, linewidth=linewidth, **plot_ref_kwargs)

elif kind == "u_value":
tstat_pit = np.mean(pp_vals <= obs_vals, axis=0)
tstat_pit_dens, xmin, xmax = _fast_kde(tstat_pit)
x_s = np.linspace(xmin, xmax, len(tstat_pit_dens))
ax_i.plot(x_s, tstat_pit_dens, color=color)
if reference is not None:
if reference == "analytical":
n_obs = obs_vals.size
hdi = stats.beta(n_obs / 2, n_obs / 2).ppf((1 - hdi_prob) / 2)
hdi_odds = (hdi / (1 - hdi), (1 - hdi) / hdi)
ax_i.axhspan(*hdi_odds, **plot_ref_kwargs)
elif reference == "samples":
dist = stats.uniform(0, 1)
x_ss, u_dens = sample_reference_distribution(dist, (tstat_pit_dens.size, n_ref))
ax_i.plot(x_ss, u_dens, linewidth=linewidth, **plot_ref_kwargs)
ax_i.set_ylim(0, None)
ax_i.set_xlim(0, 1)
else:
if t_stat in ["mean", "median", "std"]:
if t_stat == "mean":
tfunc = np.mean
elif t_stat == "median":
tfunc = np.median
elif t_stat == "std":
tfunc = np.std
obs_vals = tfunc(obs_vals)
pp_vals = tfunc(pp_vals, axis=1)
elif hasattr(t_stat, "__call__"):
obs_vals = t_stat(obs_vals.flatten())
pp_vals = t_stat(pp_vals)
elif is_valid_quantile(t_stat):
t_stat = float(t_stat)
obs_vals = np.quantile(obs_vals, q=t_stat)
pp_vals = np.quantile(pp_vals, q=t_stat, axis=1)
else:
raise ValueError(f"T statistics {t_stat} not implemented")

plot_kde(pp_vals, ax=ax_i, plot_kwargs={"color": color})
ax_i.set_yticks([])
if bpv:
p_value = np.mean(pp_vals <= obs_vals)
ax_i.plot(0, 0, label=f"bpv={p_value:.2f}", alpha=0)
ax_i.legend()

if plot_mean:
ax_i.plot(
obs_vals.mean(), 0, "o", color=color, markeredgecolor="k", markersize=markersize
)

if var_name != pp_var_name:
xlabel = "{} / {}".format(var_name, pp_var_name)
else:
xlabel = var_name
ax_i.set_title(make_label(xlabel, selection), fontsize=ax_labelsize)

if backend_show(show):
plt.show()

return axes
Loading