# Mock Data Pipeline

Alex Malz

In [None]:
import daft
import numpy as np
import bisect
import astropy.cosmology as cosmology
import scipy.stats as sps
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rc
# Global matplotlib text settings: serif 12pt font, with text rendering
# handed to TeX (the axis labels below are raw TeX strings such as r'$z$').
rc("font", family="serif", size=12)
rc("text", usetex=True)
# from scipy.stats import norm
# import emcee
# from datetime import datetime

# One single-letter matplotlib colour code per supernova type; indexed by
# type index in the plots below (colors[t]).
colors = 'rbgcymk'

[abstract]

In [None]:
# Build the probabilistic graphical model (PGM) on a 5 x 6 canvas.
pgm = daft.PGM([5, 6], origin=[0, 0])

# Each entry is (name, TeX label, x, y, extra Node keyword arguments).
# Order is: hyperparameters, per-supernova latent variables, observed data.
node_specs = [
    # desired hyperparameters
    ("cosmology", r"$\vec{\theta}$", 1., 5.5, {}),
    ("dist", r"$\underline{\phi}$", 2.5, 5.5, {}),
    # ("rates", r"$\vec{R}$", 3., 5.5, {"fixed": True}),
    # latent variables/parameters
    ("distance", r"$\mu_{n}$", 1., 4., {}),
    ("redshift", r"$z_{n}$", 2., 4.5, {}),
    ("type", r"$t_{n}$", 3., 4.5, {}),
    # data
    ("lightcurve", r"$\underline{\ell}_{n}$", 1.5, 3., {"observed": True}),
    ("photometry", r"$\vec{m}_{n}$", 3., 3., {"observed": True}),
]
for node_name, node_label, x_pos, y_pos, node_kwargs in node_specs:
    pgm.add_node(daft.Node(node_name, node_label, x_pos, y_pos, **node_kwargs))

# Directed edges as (parent, child) pairs.
edge_specs = [
    ("dist", "type"),
    ("cosmology", "distance"),
    ("dist", "redshift"),
    ("redshift", "distance"),
    # ("distance", "photometry"),
    ("distance", "lightcurve"),
    ("redshift", "photometry"),
    ("redshift", "lightcurve"),
    ("type", "lightcurve"),
]
for parent, child in edge_specs:
    pgm.add_edge(parent, child)

# A single plate enclosing the per-supernova nodes.
pgm.add_plate(daft.Plate([0.5, 2., 3., 3.], label=r"$n = 1, \cdots, N$"))

# Render the graph and display the resulting figure.
pgm.render()
pgm.figure.show()

## Choosing true parameters

[the true redshift-dependent type rate distribution, with plot of three functions]

In [None]:
# Supernova classes and the fraction of the full sample in each class.
types = ['Ia', 'Ibc', 'II']#{'a': 0, 'b': 1, 'c': 2}
n_types = len(types)
frac_types = np.array([0.2, 0.3, 0.5])
assert np.isclose(frac_types.sum(), 1.)

# Redshift grid: n_zs equal-width bins between min_z and max_z.
n_zs = 20
min_z, max_z = 0.01, 1.
z_range = max_z - min_z
z_bins = np.linspace(min_z, max_z, num=n_zs + 1, endpoint=True)
z_difs = z_bins[1:] - z_bins[:-1]
z_dif = z_difs.mean()
z_mids = 0.5 * (z_bins[:-1] + z_bins[1:])

# Per-type redshift density: a Gaussian (width 0.5) evaluated at the bin
# midpoints, one peak location per type, each row normalised to unit
# integral over the grid.
type_peaks = (0.75, 0.5, 0.25)
n_of_z = np.array([sps.norm(loc=peak, scale=0.5).pdf(z_mids) for peak in type_peaks])
bin_widths = z_difs[np.newaxis, :]
n_of_z /= np.sum(n_of_z * bin_widths, axis=1)[:, np.newaxis]

# Joint (type, z) density: weight each row by its type fraction, then
# normalise the whole table to unit integral.
true_n_of_z = frac_types[:, np.newaxis] * n_of_z# / z_range
true_n_of_z /= np.sum(true_n_of_z * bin_widths)
assert np.isclose(np.sum(true_n_of_z * bin_widths), 1.)

for t, type_name in enumerate(types):
    plt.plot(z_mids, true_n_of_z[t], color=colors[t], label=type_name)
plt.xlabel(r'$z$')
plt.ylabel(r'relative rate')
plt.legend()

[samples of t, z from the true redshift-dependent type rate distribution, with histograms]

In [None]:
def sample_discrete(dist, N):
    """Draw N (type, redshift) pairs from a binned joint density.

    Parameters
    ----------
    dist : 2-D array
        Density over (type index, redshift bin). It is multiplied by the
        module-level bin widths ``z_difs`` and the result must sum to 1.
    N : int
        Number of samples to draw.

    Returns
    -------
    list of dict
        One dict per sample with keys ``'t'`` (an entry of the module-level
        ``types`` list) and ``'z'`` (drawn uniformly inside the chosen bin
        of the module-level ``z_bins``).
    """
    norm_dist = dist * z_difs[np.newaxis, :]
    assert np.isclose(np.sum(norm_dist), 1.)
    dist_shape = np.shape(norm_dist)
    cdf = np.cumsum(norm_dist.flatten())
    last_cell = len(cdf) - 1

    out_info = []
    for n in range(N):
        r = np.random.random()
        # Rounding in the cumulative sum can leave cdf[-1] slightly below 1,
        # in which case bisect would return len(cdf) for r just under 1 and
        # unravel_index would fail; clamp to the last valid cell.
        k = min(bisect.bisect(cdf, r), last_cell)
        (t_ind, z_ind) = np.unravel_index(k, dist_shape)
        out_info.append({
            't': types[t_ind],
            'z': np.random.uniform(low=z_bins[z_ind], high=z_bins[z_ind + 1]),
        })
    return out_info

# Number of mock supernovae to draw.
n_sne = 10

# Catalogue of true (type, redshift) values, one dict per supernova.
true_params = sample_discrete(true_n_of_z, n_sne)

# Sampled redshifts grouped by type, in the order of `types`.
to_plot = [[d['z'] for d in true_params if d['t'] == types[t]] for t in range(n_types)]
for t in range(n_types):
    # `density=True` replaces the `normed=True` keyword, which current
    # matplotlib releases no longer accept.
    plt.hist(to_plot[t], color=colors[t], alpha=1./3., label=types[t], density=True)
plt.xlabel(r'$z$')
plt.ylabel(r'relative rate')
plt.legend()

[the true cosmology, the true $\mu$ for each $(t, z)$, and a plot of the Hubble diagram]

In [None]:
# Parameters of the true (flat Lambda-CDM) cosmology.
H0 = 72
Om0 = 0.3

true_cosmo = cosmology.FlatLambdaCDM(H0=H0, Om0=Om0)

# Attach the true distance modulus to every catalogue entry, so the plot
# below can read d['mu'] for each supernova.
for sn in true_params:
    sn['mu'] = true_cosmo.distmod(sn['z']).value

# Hubble diagram: distance modulus against redshift, coloured by type.
to_plot_x = [[d['z'] for d in true_params if d['t'] == types[t]] for t in range(n_types)]
to_plot_y = [[d['mu'] for d in true_params if d['t'] == types[t]] for t in range(n_types)]
for t in range(n_types):
    plt.scatter(to_plot_x[t], to_plot_y[t], color=colors[t], label=types[t])
plt.xlabel(r'$z$')
plt.ylabel(r'$\mu$')
# The scatter calls carry labels; show them, as the other plots do.
plt.legend()

## Making likelihoods

[the confusion matrix]

In [None]:
# Confusion matrix between supernova types: 0.5 on the diagonal and 0.25
# elsewhere.
conf_matrix = 0.25 + 0.25 * np.eye(3)
# Check the row sums directly. Comparing `.all()` of two arrays only compares
# their truthiness and cannot detect a mis-normalised matrix.
# NOTE(review): the fit_* functions index this matrix by column
# (conf_matrix[:, true_type]); confirm whether rows or columns are meant to
# be normalised -- the current symmetric matrix satisfies both.
assert np.allclose(np.sum(conf_matrix, axis=1), 1.)

[set up mu parametrization]

In [None]:
# Distance-modulus grid: the same number of equal-width bins as the redshift
# grid, spanning a fixed range.
n_mus = n_zs
min_mu, max_mu = 35., 45.#min([s['mu'] for s in true_params]), max([s['mu'] for s in true_params])
mu_bins = np.linspace(min_mu, max_mu, num=n_mus + 1, endpoint=True)#true_cosmo.distmod(z_bins).value
mu_difs = mu_bins[1:] - mu_bins[:-1]
mu_dif = mu_difs.mean()
mu_range = mu_bins.max() - mu_bins.min()
mu_mids = 0.5 * (mu_bins[:-1] + mu_bins[1:])

# z_mu_grid[i, j] is the pair (z_mids[i], mu_mids[j]); the trailing axis of
# length 2 holds the (z, mu) coordinates.
z_mu_grid = np.stack(np.meshgrid(z_mids, mu_mids, indexing='ij'), axis=-1)
cake_shape = z_mu_grid.shape

[the functions taking the true type, true $z$, and true $\mu$ and returning $p(t, z, \mu \mid \hat{t})$]

In [None]:
# Settings for the mock fits below. Each *_var pair is multiplied by a 2x2
# identity matrix to form a diagonal covariance over (z, mu).
# Variances used by fit_Ia.
Ia_Ia_var = np.array([0.01, 0.05])
# fit_Ibc centres its Gaussian on (z, mu - Ibc_Ia_delta).
Ibc_Ia_delta = 1.
Ibc_Ia_var = np.array([0.01, 0.01])
# fit_II centres its Gaussian on (z, II_Ia_delta), i.e. on the mean of the
# mu bin midpoints rather than on the true mu.
II_Ia_delta = np.mean(mu_mids)
II_Ia_var = np.array([0.01, 0.1])

def _fit_as_Ia(center, var, true_type_index):
    """Build one (n_types, n_zs, n_mus) array shared by fit_Ia/fit_Ibc/fit_II.

    Parameters
    ----------
    center : sequence of two floats
        (z, mu) location about which the fitted Gaussian centre is scattered.
    var : array of two floats
        Diagonal variances in (z, mu); used both for the scatter of the
        centre and for the Gaussian evaluated on the grid.
    true_type_index : int
        Column of the module-level ``conf_matrix`` used to weight the slabs.

    Returns
    -------
    numpy.ndarray
        Slab 0 holds the Gaussian density evaluated on ``z_mu_grid``; every
        other slab is constant at 1 / (number of grid cells); slab i is then
        multiplied by ``conf_matrix[i, true_type_index]``.
    """
    cov = var * np.eye(2)
    # Draw a noisy centre first, then evaluate a Gaussian of the same
    # covariance around that draw (a single random draw per call).
    [z_samp, mu_samp] = sps.multivariate_normal(mean=np.array(center), cov=cov).rvs()
    fitted = sps.multivariate_normal(mean=np.array([z_samp, mu_samp]), cov=cov)

    cake = np.zeros((n_types, n_zs, n_mus))
    cake[0] = fitted.pdf(z_mu_grid.reshape(-1, cake_shape[-1])).reshape(cake_shape[:-1])
    cake[1:] = np.ones(cake_shape[:-1]) / np.prod(cake_shape[:-1])
    cake *= conf_matrix[:, true_type_index, np.newaxis, np.newaxis]
    return cake


def fit_Ia(z, mu):
    """Mock fit for a true type Ia: Gaussian centred near (z, mu)."""
    return _fit_as_Ia([z, mu], Ia_Ia_var, 0)


def fit_Ibc(z, mu):
    """Mock fit for a true type Ibc: Gaussian centred near (z, mu - Ibc_Ia_delta)."""
    return _fit_as_Ia([z, mu - Ibc_Ia_delta], Ibc_Ia_var, 1)


def fit_II(z, mu):
    """Mock fit for a true type II: Gaussian centred near (z, II_Ia_delta).

    ``mu`` is accepted for a signature parallel to the other fits but is not
    used; the Gaussian is centred on the fixed ``II_Ia_delta`` instead.
    """
    return _fit_as_Ia([z, II_Ia_delta], II_Ia_var, 2)
    
def fit_any(true_vals):
    """Run the mock fit matching a supernova's true type.

    Parameters
    ----------
    true_vals : dict
        Must provide ``'t'`` (one of 'Ia', 'Ibc', 'II'), ``'z'`` and ``'mu'``.

    Returns
    -------
    The array returned by fit_Ia, fit_Ibc or fit_II for that supernova.

    Raises
    ------
    ValueError
        If ``true_vals['t']`` is not one of the three known types.
    """
    sn_type = true_vals['t']
    if sn_type == 'Ia':
        return fit_Ia(true_vals['z'], true_vals['mu'])
    if sn_type == 'Ibc':
        return fit_Ibc(true_vals['z'], true_vals['mu'])
    if sn_type == 'II':
        return fit_II(true_vals['z'], true_vals['mu'])
    # Fail with a clear message instead of hitting an unbound local below.
    raise ValueError("unknown supernova type: %r" % (sn_type,))

def fit_all(catalog):
    """Apply fit_any to every entry of `catalog`, in order.

    Returns the per-entry results collected into a single numpy array whose
    leading axis runs over the catalogue.
    """
    return np.array([fit_any(entry) for entry in catalog])

# One (n_types, n_zs, n_mus) array per supernova, stacked along axis 0.
sheet_cake = fit_all(true_params)
print(np.shape(sheet_cake))

# Grid of panels: one row per supernova, one column per candidate class.
fig = plt.figure(figsize=(n_types*10, n_sne*10))
p = 0
for s in range(n_sne):
    for t in range(n_types):
        p += 1
        ax = fig.add_subplot(n_sne, n_types, p)
        # Each slab is indexed [z, mu], but pcolormesh(X, Y, C) expects
        # C[row=y, col=x]; transpose so z runs along x and mu along y,
        # matching the scatter marker below. Passing the bin edges gives
        # each cell its true extent.
        ax.pcolormesh(z_bins, mu_bins, sheet_cake[s][t].T, cmap='viridis')
        ax.scatter(true_params[s]['z'], true_params[s]['mu'], color='k')
        # The axes already carry data coordinates, so the automatic ticks are
        # kept; overriding only the tick labels with the bin midpoints would
        # attach the wrong values to them.
        ax.set_xlabel(r'$z$')
        ax.set_ylabel(r'$\mu$')
        ax.set_title('true '+true_params[s]['t']+', class '+types[t])

[make p(z)s]

In [None]:
# Width used both for scattering each p(z) centre about the true redshift
# and for the Gaussian p(z) itself.
pz_sigma = 0.03

# True redshifts in catalogue order.
sn_redshifts = [true_params[s]['z'] for s in range(n_sne)]
# One random draw per supernova: the centre of its p(z).
pz_means = [sps.norm(loc=true_z, scale=pz_sigma).rvs() for true_z in sn_redshifts]
# Each row is that supernova's p(z) evaluated at the redshift bin midpoints.
pzs = np.array([sps.norm(loc=pz_mean, scale=pz_sigma).pdf(z_mids) for pz_mean in pz_means])

# Overlay each p(z) with a vertical line at the true redshift.
for s, pz in enumerate(pzs):
    plt.plot(z_mids, pz)
    plt.vlines(true_params[s]['z'], 0., 15.)

[interim priors to make interim posteriors]

In [None]:
# Interim n(z): flat over every (type, z-bin) cell, normalised to unit
# integral over the redshift grid.
interim_n_of_z = np.ones((n_types, n_zs))
interim_n_of_z /= np.sum(interim_n_of_z * z_difs[np.newaxis, :])
assert np.isclose(np.sum(interim_n_of_z * z_difs[np.newaxis, :]), 1.)

# Interim (flat Lambda-CDM) cosmology.
interim_H0 = 70
interim_Om0 = 0.25

interim_cosmo = cosmology.FlatLambdaCDM(H0=interim_H0, Om0=interim_Om0)

# Interim prior over (type, z, mu): for each redshift bin, all of the n(z)
# weight goes into the single mu bin containing the interim-cosmology
# distance modulus at that bin's midpoint.
interim_prior = np.zeros((n_types, n_zs, n_mus))
for z in range(n_zs):
    # The distance modulus does not depend on type, so compute it once per z.
    mu = interim_cosmo.distmod(z_mids[z]).value
    # Locate the bin from the bin EDGES. Bisecting on the midpoints would put
    # any mu in the upper half of a bin into the next bin up.
    k = bisect.bisect(mu_bins, mu) - 1
    # Values outside [min_mu, max_mu] go to the nearest edge bin rather than
    # wrapping around (k = -1) or indexing past the end (k = n_mus).
    k = min(max(k, 0), n_mus - 1)
    interim_prior[:, z, k] += interim_n_of_z[:, z]
interim_prior /= mu_difs[np.newaxis, np.newaxis, :]
assert np.isclose(np.sum(interim_prior * z_difs[np.newaxis, :, np.newaxis] * mu_difs[np.newaxis, np.newaxis, :]), 1.)

[write data to file]