In [None]:
from astropy.io import ascii
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('apw-notebook.mplstyle')
%matplotlib inline
from scipy.stats import scoreatpercentile
import emcee
import corner

We'll subclass this helper class below to define our probabilistic models:

In [None]:
class ProbModel(object):
    """ 
    Base class for a probabilistic model of data ``y`` with uncertainties 
    ``y_err`` measured at ``x``.

    Subclasses implement ``ln_prior`` and ``ln_likelihood``; instances are 
    callable and return the (unnormalized) log-posterior, so they can be 
    handed directly to a sampler as the log-probability function.
    """
    
    def __init__(self, x, y, y_err):
        """ 
        We store the data as attributes of the object so we don't have to 
        keep passing it in to the methods that compute the probabilities.
        """
        self.x = np.asarray(x)
        self.y = np.asarray(y)
        self.y_err = np.asarray(y_err)

    def ln_likelihood(self, pars):
        """ 
        Log-likelihood of the data given ``pars``. May return either one 
        value per datum or an already-summed scalar. Must be overridden.
        """
        raise NotImplementedError()

    def ln_prior(self, pars):
        """ 
        Log-prior evaluated at ``pars`` (a scalar). Must be overridden.
        """
        raise NotImplementedError()
        
    def ln_posterior(self, pars):
        """ 
        Up to a normalization constant, the log of the posterior pdf is just 
        the sum of the log likelihood plus the log prior.

        Returns ``-inf`` whenever the result would otherwise be NaN.
        """
        lnp = self.ln_prior(pars)
        if np.isinf(lnp): # short-circuit if the prior is infinite (don't bother computing likelihood)
            return lnp
        if np.isnan(lnp): # a NaN prior can only ever produce a NaN posterior
            return -np.inf

        # np.sum (rather than the .sum() method) also accepts plain Python 
        # floats or lists returned by a subclass
        lnL = np.sum(self.ln_likelihood(pars))
        lnprob = lnp + lnL

        if np.isnan(lnprob):
            return -np.inf

        return lnprob
    
    def __call__(self, pars):
        """ Evaluate the log-posterior at ``pars``. """
        return self.ln_posterior(pars)

# Marginalization in simple probabilistic models

In both cases below, let's assume we are given $N$ measurements of the flux of a source, $f_n$, at times $t_n$ and are handed Gaussian uncertainties $\sigma_n$ for each datum.


## Case 1

We would like to measure the mean or true flux of the source, $f_0$. But when we plot the data, we notice that the scatter in the flux measurements looks much larger than the reported uncertainties. The error bars are either *underestimated*, or the source has some *intrinsic scatter* or noise that we need to take into account in our model. (BTW: In practice, disentangling these two possibilities is impossible without other data.) We'll build a model that adds this extra scatter (whether it's because of underestimated error bars or intrinsic scatter) under the assumption that the unaccounted for noise is also Gaussian. Then, we'll want to marginalize our posterior pdf over the nuisance parameter (the extra variance).

If the extra noise is correlated in some way (e.g., stellar turbulence for light curves of stars, AGN variability, nuisance features in a spectrum) then this is *not* an unbiased way to infer the mean model for the source.

Let's first read some data (also in the repo) that we'll be using:

In [None]:
# Load the Case 1 measurements and show which columns the table provides.
case1_path = "case1.csv"
tbl1 = ascii.read(case1_path)
tbl1.colnames

The first thing we can do is just estimate the mean flux and uncertainty on the mean using a maximum-likelihood estimator (ignoring the extra variance). We find that:

In [None]:
# Inverse-variance weighted mean of the Case 1 fluxes: each datum is weighted
# by 1/sigma_n^2, and the variance of the mean is 1 / sum(weights).
flux_ivar = 1 / tbl1['flux_err']**2
ivar_total = np.sum(flux_ivar)

mean_flux = np.sum(tbl1['flux']*flux_ivar) / ivar_total
mean_flux_err = np.sqrt(1 / ivar_total)

# For reference, the true flux used to generate the data is 10.
summary = 'Mean flux: {:.2f} ± {:.2f}'.format(mean_flux, mean_flux_err)
print(summary)

Clearly the uncertainty we get on the mean flux is too small! This estimate is many sigma away from the truth (10).

In [None]:
# Case 1 data with the inverse-variance weighted mean (dashed line) and its
# 1-sigma uncertainty (grey band).
plt.figure(figsize=(8,6))
plt.errorbar(tbl1['time'], tbl1['flux'], tbl1['flux_err'], 
             linestyle='none', marker='o')
plt.axhline(mean_flux, linestyle='--')

# axhspan's xmin/xmax are fractions of the axes width (0 to 1), not data
# coordinates, so the defaults already stretch the band across the whole plot.
plt.axhspan(mean_flux-mean_flux_err, mean_flux+mean_flux_err, 
            color='#aaaaaa')

plt.xlabel('$t$')
plt.ylabel('$f$')

We'll now instead add a parameter to our model to account for this intrinsic scatter: a variance $V$. Following our derivation from the board, our likelihood (for a single observation) with this extra parameter is:

$$
p(f_n \,|\, f_0, V, \sigma_n^2) = \mathcal{N}(f_n \,|\, f_0, \sigma_n^2 + V) \\
\ln p(f_n \,|\, f_0, V, \sigma_n^2) = -\frac{1}{2}\frac{(f_n - f_0)^2}{\sigma_n^2 + V} - 
\frac{1}{2}\ln\left[2\pi \, (\sigma_n^2 + V) \right]
$$

If we want to use MCMC and do Bayesian inference, we also need to specify prior probability distributions for both $f_0$ and $V$. For $f_0$, we'll use a uniform prior over some large range of positive values (say, from 0 to 100). For the extra variance $V$, we'll use a prior that is uniform in log-space over some large range, e.g.:

$$
p(\ln V) = \mathcal{U}(-2, 2)
$$

This choice comes from using a [Jeffreys prior](https://en.wikipedia.org/wiki/Jeffreys_prior). When using a log-uniform prior like the above, it is usually easier to just sample in the log of the variable. That is, we'll use $\ln V$ as our parameter instead of $V$:

In [None]:
class Model1(ProbModel):
    """ 
    Constant-flux model with extra Gaussian variance.

    Parameters are ``(f0, lnV)``: the mean flux and the natural log of the 
    variance ``V`` that is added in quadrature to each reported uncertainty.
    """

    def ln_prior(self, pars):
        """ 
        Uniform prior on ``f0`` over (0, 100) and uniform prior on ``lnV`` 
        over (-2, 2). The constant normalization is dropped, so the value is 
        0 inside the allowed box and ``-inf`` outside it.
        """
        f0, lnV = pars
        
        if not (0. < f0 < 100.):
            return -np.inf

        if not (-2. < lnV < 2.):
            return -np.inf

        return 0.
    
    def ln_likelihood(self, pars):
        """ 
        Per-datum Gaussian log-likelihood with total variance 
        ``y_err**2 + V``. Returns one value per observation; the base class 
        sums them when forming the posterior.
        """
        f0, lnV = pars
        
        # we sample in ln(V), so convert back before adding to the 
        # reported measurement variances
        total_var = self.y_err**2 + np.exp(lnV)
        resid = self.y - f0
        return -0.5 * (resid**2 / total_var + np.log(2*np.pi*total_var))

In [None]:
# Bind the Case 1 data (times, fluxes, flux uncertainties) to the model.
model1 = Model1(
    x=tbl1['time'],
    y=tbl1['flux'],
    y_err=tbl1['flux_err'],
)

In [None]:
# Ensemble sampler for Case 1: 16 walkers exploring the 2 parameters (f0, ln V),
# with the model instance itself used as the log-probability callable.
n_walkers, n_dim = 16, 2
sampler = emcee.EnsembleSampler(n_walkers, n_dim, model1)

In [None]:
# generate initial conditions for the sampler
p0 = [1., 0.5]
p0 = emcee.utils.sample_ball(p0, np.full_like(p0, 1E-3), size=n_walkers)

In [None]:
n_burn, n_steps = 1024, 4096

# Burn-in: run the walkers, keep only where they ended up, and discard the
# samples collected along the way.
pos, _, _ = sampler.run_mcmc(p0, n_burn)
sampler.reset()

# Production run, restarted from the final burn-in positions.
_ = sampler.run_mcmc(pos, n_steps)

In [None]:
# Corner plot of the Case 1 posterior samples, with the reference ("truth")
# values f0 = 10 and ln V = ln(0.5^2) marked.
case1_labels = ['$f_0$', r'$\ln V$']
case1_truths = [10., np.log(0.5**2)]
fig = corner.corner(sampler.flatchain, labels=case1_labels, truths=case1_truths)

The marginalization we'd like to do is, defining $a = \ln V$:

$$
p(f_0 \,|\, \{f_n\}) = \int {\rm d}a \, p(f_0, a \,|\, \{f_n\})
$$

From running MCMC, we have samples from the posterior pdf $p(f_0, a \,|\, \{f_n\})$. To get samples from the marginal distribution, it turns out all we need to do is ignore the column of values for $\ln V$! That is, MCMC gives us samples but also implicitly does the marginalizations we need!

In [None]:
# Case 1 data with the true flux (black line) and the marginal posterior on
# f0: median (blue line) and 16th-84th percentile interval (blue band).
plt.figure(figsize=(8,6))
plt.errorbar(tbl1['time'], tbl1['flux'], tbl1['flux_err'], 
             linestyle='none', marker='o')

plt.axhline(10, color='k', zorder=0)

# mean flux: marginalizing over ln V just means using the f0 column alone
f0_chain = sampler.flatchain[:,0]
f0_median = np.median(f0_chain)
f0_quantiles = np.percentile(f0_chain, [16, 84])
plt.axhline(f0_median, color='#3182bd', zorder=-1)

# axhspan's xmin/xmax are fractions of the axes width (0 to 1), not data
# coordinates, so the defaults already stretch the band across the whole plot.
plt.axhspan(f0_quantiles[0], f0_quantiles[1], 
            color='#3182bd', alpha=0.25, zorder=-10)

plt.xlabel('$t$')
plt.ylabel('$f$')

In [None]:
# Summarize the marginal posterior on f0 by its median and standard deviation.
f0_samples = sampler.flatchain[:,0]
f0_center = np.median(f0_samples)
f0_spread = np.std(f0_samples)
print('MCMC estimated f_0: {:.2f} ± {:.2f}'.format(f0_center, f0_spread))

---

## Case 2

We again would like to measure the mean or true flux $f_0$ of a source under the assumption that the source is not varying. However, we have some reason to believe that there was a problem with some fraction of our data and there will be outliers. Here we'll construct a model to handle this situation.

Let's first read some data (also in the repo) that we'll be using:

In [None]:
# Load the Case 2 measurements and show which columns the table provides.
case2_path = "case2.csv"
tbl2 = ascii.read(case2_path)
tbl2.colnames

We'll again estimate the mean flux and uncertainty on the mean using a simple maximum-likelihood estimator, ignoring the issue of outliers:

In [None]:
# Inverse-variance weighted mean of the Case 2 fluxes, treating every point
# (outliers included) as a good measurement.
flux_ivar = 1 / tbl2['flux_err']**2
ivar_total = np.sum(flux_ivar)

mean_flux = np.sum(tbl2['flux']*flux_ivar) / ivar_total
mean_flux_err = np.sqrt(1 / ivar_total)

# For reference, the true flux used to generate the data is 10.
summary = 'Mean flux: {:.2f} ± {:.2f}'.format(mean_flux, mean_flux_err)
print(summary)

The true flux is 10

In [None]:
# Case 2 data with the inverse-variance weighted mean (dashed line) and its
# 1-sigma uncertainty (grey band).
plt.figure(figsize=(8,6))
plt.errorbar(tbl2['time'], tbl2['flux'], tbl2['flux_err'], 
             linestyle='none', marker='o')
plt.axhline(mean_flux, linestyle='--')

# axhspan's xmin/xmax are fractions of the axes width (0 to 1), not data
# coordinates, so the defaults already stretch the band across the whole plot.
plt.axhspan(mean_flux-mean_flux_err, mean_flux+mean_flux_err, 
            color='#aaaaaa')

plt.xlabel('$t$')
plt.ylabel('$f$')

In [None]:
class Model2(ProbModel):
    """
    Two-component (inlier + outlier) model for the Case 2 data.

    The parameter vector is ``(f0, Q, out_mu, out_lnV)``. The prior and the
    two per-component log-likelihoods are stubs that still have to be filled
    in; only the code that combines the two components is provided.
    """

    def ln_prior(self, pars):
        """
        Log-prior for ``(f0, Q, out_mu, out_lnV)`` -- not implemented yet.

        NOTE(review): as written this returns None; fill it in before
        handing the model to a sampler.
        """
        f0, Q, out_mu, out_lnV = pars
        
        # FILL IN HERE
    
    def ln_likelihood_inlier(self, pars):
        """
        Log-likelihood of the inlier component -- not implemented yet.

        Only ``f0`` and ``Q`` are unpacked here. NOTE(review): presumably this
        should return one value per datum and include the log of the inlier
        mixture weight, since ``ln_likelihood`` adds the two components with
        ``logaddexp`` without applying any weights itself -- confirm.
        """
        f0, Q, _, _ = pars
        
        # FILL IN HERE
    
    def ln_likelihood_outlier(self, pars):
        """
        Log-likelihood of the outlier component -- not implemented yet.

        Only ``Q``, ``out_mu`` and ``out_lnV`` are unpacked here.
        NOTE(review): presumably this should return one value per datum and
        include the log of the outlier mixture weight -- confirm.
        """
        _, Q, out_mu, out_lnV = pars
        
        # FILL IN HERE
        
    def ln_likelihood(self, pars):       
        """
        Total log-likelihood: the two component log-likelihoods are combined
        element-wise with ``logaddexp`` and the result is summed to a scalar.
        """
        # the outlier likelihood:
        ll_out = self.ln_likelihood_outlier(pars)

        # the inlier likelihood:
        ll_in = self.ln_likelihood_inlier(pars)

        # Combine these using log-add-exp for numerical stability.
        ll = np.sum(np.logaddexp(ll_out, ll_in))

        return ll

In [None]:
# Bind the Case 2 data (times, fluxes, flux uncertainties) to the model.
model2 = Model2(
    x=tbl2['time'],
    y=tbl2['flux'],
    y_err=tbl2['flux_err'],
)

In [None]:
# Ensemble sampler for Case 2: 32 walkers exploring the 4 parameters
# (f0, Q, out_mu, out_lnV), with the model instance as the log-probability.
n_walkers, n_dim = 32, 4
sampler = emcee.EnsembleSampler(n_walkers, n_dim, model2)

In [None]:
# generate initial conditions
p0 = [10., 0.1, 10., 0.]
p0 = emcee.utils.sample_ball(p0, np.full_like(p0, 1E-3), size=n_walkers)

In [None]:
n_burn, n_steps = 1024, 4096

# Burn-in: run the walkers, keep only where they ended up, and discard the
# samples collected along the way.
pos, _, _ = sampler.run_mcmc(p0, n_burn)
sampler.reset()

# Production run, restarted from the final burn-in positions.
_ = sampler.run_mcmc(pos, n_steps)

In [None]:
# Trace plot of the last parameter (index 3), one faint line per walker.
last_par_traces = sampler.chain[...,3]
for trace in last_par_traces:
    plt.plot(trace, marker='', drawstyle='steps-mid', alpha=0.1)

In [None]:
# Corner plot of the Case 2 posterior samples. Reference values are given for
# f0 and Q only; the two outlier-distribution parameters are left as NaN.
case2_labels = ['$f_0$', '$Q$', r'$\mu_{\rm bad}$', r'$\ln V_{\rm bad}$']
case2_truths = [10., 0.75, np.nan, np.nan]
fig = corner.corner(sampler.flatchain, labels=case2_labels, truths=case2_truths)

One of the things it looks like we lose by marginalizing over the per-observation outlier flag (the $q_n$'s) is the ability to identify likely outlier points. We can reconstruct this after the fact using the samples above:

In [None]:
# Half of the total number of flattened posterior samples.
len(sampler.flatchain) // 2

In [None]:
# Per-datum probability of belonging to the inlier component, averaged over
# posterior samples: for each sample theta this is
# L_in / (L_in + L_out), evaluated in log space for numerical stability.

# only use some of the samples -- capped at the chain length so that a shorter
# run (fewer walkers or steps) does not index past the end of the chain
K = min(65536, len(sampler.flatchain))

post_prob = np.zeros(len(tbl2))
for theta in sampler.flatchain[:K]:
    ll_in = model2.ln_likelihood_inlier(theta)
    ll_out = model2.ln_likelihood_outlier(theta)
    post_prob += np.exp(ll_in - np.logaddexp(ll_in, ll_out))

post_prob /= K

In [None]:
# Complement of post_prob for every datum, printed to three decimals.
outlier_prob = 1-post_prob
print(", ".join("{0:.3f}".format(p) for p in outlier_prob))

In [None]:
# Boolean mask: True where post_prob is below 0.5 (<50% chance).
outlier_idx = np.less(post_prob, 0.5)

In [None]:
# Case 2 data with the flagged outliers in red, plus the marginal posterior
# on f0: median (blue line) and 16th-84th percentile interval (blue band).
plt.figure(figsize=(8,6))
plt.errorbar(tbl2['time'][~outlier_idx], tbl2['flux'][~outlier_idx], tbl2['flux_err'][~outlier_idx], 
             linestyle='none', marker='o')
plt.errorbar(tbl2['time'][outlier_idx], tbl2['flux'][outlier_idx], tbl2['flux_err'][outlier_idx], 
             linestyle='none', marker='o', color='r')

f0_chain = sampler.flatchain[:,0]
f0_median = np.median(f0_chain)
f0_quantiles = np.percentile(f0_chain, [16, 84])
plt.axhline(f0_median, color='#3182bd', zorder=-1)

# axhspan's xmin/xmax are fractions of the axes width (0 to 1), not data
# coordinates, so the defaults already stretch the band across the whole plot.
plt.axhspan(f0_quantiles[0], f0_quantiles[1], 
            color='#3182bd', alpha=0.25, zorder=-10)

plt.xlabel('$t$')
plt.ylabel('$f$')