# Validate BayesFlow Posterior with ABC

In this notebook, we validate the posterior obtained from BayesFlow by comparing it to a posterior generated with approximate Bayesian computation (ABC) using pyABC.

In [None]:
import os
from datetime import timedelta
from functools import partial
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyabc
from tqdm import tqdm

In [None]:
# Select the model to validate: the trailing index picks one entry of the tuple.
model_name = (
    'fröhlich-simple',
    'fröhlich-detailed',
    'fröhlich-sde',
    'pharmacokinetic_model',
    'clairon_small_model',
)[2]
# Both values are forwarded to the model constructor (as `network_idx` / `load_best`).
network_idx, load_best_network = 0, True

## Load individual model


In [None]:
# Import only the module of the selected model. Every branch brings the model
# class and its `batch_simulator` into scope; the pharmacokinetic and Clairon
# models additionally provide `convert_bf_to_observables`.
_shared_model_kwargs = {'network_idx': network_idx, 'load_best': load_best_network}

if model_name == 'fröhlich-simple':
    from models.froehlich_model_simple import FroehlichModelSimple, batch_simulator
    individual_model = FroehlichModelSimple(**_shared_model_kwargs)

elif model_name == 'fröhlich-detailed':
    from models.froehlich_model_detailed import FroehlichModelDetailed, batch_simulator
    individual_model = FroehlichModelDetailed(**_shared_model_kwargs)

elif model_name == 'fröhlich-sde':
    from models.froehlich_model_sde import FroehlichModelSDE, batch_simulator
    individual_model = FroehlichModelSDE(**_shared_model_kwargs)

elif model_name == 'pharmacokinetic_model':
    from models.pharmacokinetic_model import PharmacokineticModel, batch_simulator, convert_bf_to_observables
    individual_model = PharmacokineticModel(**_shared_model_kwargs)

elif model_name == 'clairon_small_model':
    from models.clairon_small_model import ClaironSmallModel, batch_simulator, convert_bf_to_observables
    # the Clairon model is the only one that takes a prior type
    prior_type = ['normal', 'uniform'][0]
    individual_model = ClaironSmallModel(prior_type=prior_type, **_shared_model_kwargs)

else:
    raise NotImplementedError('model not implemented')

# load network
network_path = '../networks/' + individual_model.network_name
trainer = individual_model.build_trainer(network_path)

## Load Data

In [None]:
# Load the observations of the selected model (synthetic or not, depending on the flag).
load_synthetic = False
obs_data = individual_model.load_data(synthetic=load_synthetic)

# Pick 10 random individuals/cells with a fixed seed.
# Indices are drawn with replacement, so the same individual can appear more than once.
np.random.seed(42)
individual_ids = np.random.randint(0, len(obs_data), size=10)  # obs_data can be list or numpy array
obs_data = [obs_data[idx] for idx in individual_ids]

if load_synthetic:
    # For synthetic data the generating parameters are known; the file depends on the model.
    _synthetic_param_files = {
        'fröhlich-sde': '../data/synthetic/synthetic_individual_cell_params_sde_model.csv',
        'fröhlich-detailed': '../data/synthetic/synthetic_individual_cell_params_detailed_model.csv',
    }
    _param_file = _synthetic_param_files.get(model_name,
                                             '../data/synthetic/synthetic_individual_cell_params.csv')
    # NOTE(review): the table is loaded for all cells and is not restricted to
    # `individual_ids` here - confirm this is intended.
    cell_param_log = pd.read_csv(_param_file, index_col=0, header=0)

## Examine Posterior for a Single Individual/Cell

In [None]:
# use observations to get a first look at the posterior
n_bayesflow_samples = 1000
obs_data_posterior_samples = individual_model.draw_posterior_samples(data=obs_data, n_samples=n_bayesflow_samples)

In [None]:
# One panel per selected individual, each drawn by the model's own plotting
# helper from the first 100 posterior samples of that individual.
rows = 4
cols = int(np.ceil(len(obs_data) / rows))
fig, ax = plt.subplots(rows, cols, tight_layout=True, figsize=(10, rows*3),
                       sharex='row', sharey='all')
axis = ax.flatten()

for p_id, data_indv in enumerate(tqdm(obs_data)):
    samples_indv = obs_data_posterior_samples[p_id, :100]
    axis[p_id] = individual_model.prepare_plotting(data_indv, samples_indv, axis[p_id])
    # the labels of the last panel are reused for the shared figure legend
    _, labels = axis[p_id].get_legend_handles_labels()

# remove the panels of the grid that were not needed
for _ax in axis[len(obs_data):]:
    _ax.remove()

fig.legend(labels, ncol=3, loc='upper center', bbox_to_anchor=(0.5, 1))
plt.show()

## Prepare ABC Posterior

In [None]:
# Index into `obs_data` (the list of selected individuals), used for the ABC comparison below.
individual_id = 1  # patient 5 for pharma, fro-detailed 0
# observations of the individual that ABC is run on
obs_data_indv = obs_data[individual_id]

In [None]:
# Prepare, depending on the model, the three things pyABC needs:
#   simulator   - `batch_simulator` with the individual-specific arguments fixed
#   observation - dict with the observed data of the chosen individual
#   abc_model   - callable mapping a pyABC parameter dict to simulated data
#                 under the same keys as `observation`
def _parameter_vector(parameter: dict) -> np.ndarray:
    """Return the values of `parameter` ordered like `individual_model.param_names`.

    The simulator takes a plain vector (it is also called with
    `individual_model.prior_mean` below), so the entries must be in the order
    of `param_names`. Values are looked up by name instead of relying on the
    key order of the dict handed over by pyABC.
    """
    return np.array([parameter[name] for name in individual_model.param_names], dtype=float)


if 'Froehlich' in individual_model.name:
    # prepare simulator, data should be on log-scale
    simulator = partial(batch_simulator,
                        n_obs=180,
                        with_noise=True)
    obs_data_indv_prepared = obs_data_indv.flatten()  # just one measurement per time point, already on log-scale
    observation = {"data": obs_data_indv_prepared}

    # pyABC
    def abc_model(parameter: dict):
        return {"data": simulator(_parameter_vector(parameter))}
elif 'Pharma' in individual_model.name:
    # prepare simulator, data should be on log-scale
    obs_data_indv_prepared, t_measurement, doses_time_points, dos, wt = convert_bf_to_observables(obs_data_indv)
    simulator = partial(batch_simulator,
                        t_measurement=t_measurement,
                        t_doses=doses_time_points,
                        wt=wt,
                        dos=dos,
                        with_noise=True,
                        convert_to_bf_batch=False)
    # the two observable columns are passed to pyABC as separate entries
    observation = {"y1": obs_data_indv_prepared[:, 0],
                   "y2": obs_data_indv_prepared[:, 1]}

    # pyABC
    def abc_model(parameter: dict):
        data = simulator(_parameter_vector(parameter))
        return {"y1": data[:, 0],
                "y2": data[:, 1]}
elif 'Clairon' in individual_model.name:
    # prepare simulator, data should be on linear scale
    # NOTE(review): `dose_amount` is unpacked but not passed to the simulator -
    # confirm that `batch_simulator` does not need it.
    obs_data_indv_prepared, t_measurements, doses_time_points, dose_amount = convert_bf_to_observables(obs_data_indv)
    simulator = partial(batch_simulator,
                        t_measurements=t_measurements,
                        t_doses=doses_time_points,
                        with_noise=True,
                        convert_to_bf_batch=False)
    observation = {"data": obs_data_indv_prepared}

    # pyABC
    def abc_model(parameter: dict):
        return {"data": simulator(_parameter_vector(parameter))}
else:
    raise NotImplementedError('model not implemented')

# sanity check: one simulation at the prior mean must have the shape of the prepared data
assert simulator(individual_model.prior_mean).shape == obs_data_indv_prepared.shape, 'simulator output shape does not match data shape' 

In [None]:
# build dict with name and mean adn std of parameters
param_dict = {}
for p_i, p in enumerate(individual_model.param_names):
    param_dict[p] = pyabc.RV("norm", loc=individual_model.prior_mean[p_i], scale=individual_model.prior_std[p_i])
prior = pyabc.Distribution(param_dict)

In [None]:
# Configure the ABC-SMC sampler: adaptive p-norm distance with p=1,
# 10000 particles per population, sampled on a single core.
_abc_distance = pyabc.distance.AdaptivePNormDistance(p=1)
_abc_sampler = pyabc.sampler.SingleCoreSampler()
abc = pyabc.ABCSMC(
    abc_model,
    prior,
    distance_function=_abc_distance,
    population_size=10000,
    sampler=_abc_sampler,
)

# one database file per model and individual
_db_file = f'abc_{individual_model.name}_individual_{individual_id}.db'
db_path = os.path.join('sampling_results', _db_file)

In [None]:
# Reuse a stored ABC run if the database file already exists, otherwise run ABC-SMC now.
db_url = "sqlite:///" + db_path
if os.path.exists(db_path):
    # the second argument is the id of the stored run to load
    history = abc.load(db_url, 1)
else:
    # SQLite creates the database file but not its parent directory,
    # so make sure the directory exists before starting a new run
    db_dir = os.path.dirname(db_path)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)
    abc.new(db_url, observation)
    # stop once the wall time is used up or the acceptance rate drops below 1%
    max_walltime = timedelta(hours=0.1)
    history = abc.run(min_acceptance_rate=1e-2, max_walltime=max_walltime)

In [None]:
# Weighted ABC posterior sample as returned by the history (default arguments).
abc_samples_raw, abc_weights_raw = history.get_distribution()
# The samples come as a DataFrame with one column per parameter name. Select
# the columns explicitly in the order of `param_names`, because the cells
# below address parameters by their position in `param_names` and the
# DataFrame's own column order need not match it.
abc_samples_raw = abc_samples_raw[list(individual_model.param_names)].to_numpy()

# Compare BayesFlow and ABC

In [None]:
# Bring both posteriors to the same number of samples.
n_samples = min(len(obs_data_posterior_samples[individual_id]), len(abc_samples_raw))
bayes_flow_samples = obs_data_posterior_samples[individual_id, :n_samples]

# Thin the ABC samples: draw n_samples distinct row indices uniformly at random
# and keep the matching weights alongside the samples.
abc_index = np.random.choice(abc_samples_raw.shape[0], n_samples, replace=False)
abc_samples, abc_weights = abc_samples_raw[abc_index], abc_weights_raw[abc_index]

In [None]:
# Marginal posterior histograms, one panel per parameter:
# BayesFlow samples in blue, weighted ABC samples overlaid in red.
n_cols = int(np.ceil(individual_model.n_params/2))
fig, ax = plt.subplots(nrows=2, ncols=n_cols, tight_layout=True, figsize=(16,12))
axis = ax.flatten()
bins = 40
for i, name in enumerate(individual_model.param_names):
    panel = axis[i]
    panel.set_title('log '+name)
    panel.hist(bayes_flow_samples[:, i], bins=bins, density=True, label='BayesFlow', color='blue')
    panel.hist(abc_samples[:, i], weights=abc_weights,
               bins=bins, density=True, label='ABC', alpha=0.6, color='red')
    panel.legend()

# remove the panels of the grid that were not needed
for _ax in axis[individual_model.n_params:]:
    _ax.remove()
# uncomment to save the figure
#plt.savefig(f'../plots/abc/posterior_validation_{individual_model.name}_individual_{individual_id}.png', dpi=600)
plt.show()

# fig, ax = plt.subplots(nrows=2, ncols=int(np.ceil(individual_model.n_params/2)), tight_layout=True, figsize=(16,12))
# axis = ax.flatten()
# for i, name in enumerate(individual_model.param_names):
#     axis[i].set_title(name)
#     axis[i].hist(np.exp(bayes_flow_samples[:, i]), bins=bins, density=True, label='BayesFlow', color='blue')
# 
#     axis[i].hist(np.exp(abc_samples[:, i]), weights=abc_weights,
#                  bins=bins, density=True, label='ABC', alpha=0.6, color='red')
#     axis[i].legend()
# 
# for _ax in axis[individual_model.n_params:]:
#     _ax.remove()
# plt.show()

In [None]:
# Side-by-side comparison for the chosen individual, drawn by the model's
# plotting helper: BayesFlow posterior samples on the left, ABC samples on the right.
fig, ax = plt.subplots(1, 2, tight_layout=True, figsize=(16, 6),
                       sharex='row', sharey='all')

ax[0] = individual_model.prepare_plotting(obs_data_indv, obs_data_posterior_samples[individual_id], ax[0])
ax[1] = individual_model.prepare_plotting(obs_data_indv, abc_samples, ax[1])
# one shared legend for both panels, built from the labels of the left one
_, labels = ax[0].get_legend_handles_labels()
ax[1].set_ylabel('')  # the y-axis is shared, so the label on the left panel is enough

fig.legend(labels, ncol=3, loc='lower center', bbox_to_anchor=(0.5, -0.01))
ax[0].set_title('BayesFlow Posterior Predictive')
# the right panel shows the ABC samples (the title used to say 'MCMC')
ax[1].set_title('ABC Posterior Predictive')
#plt.savefig(f'../plots/abc/posterior_simulation_{individual_model.name}_individual_{individual_id}.png', dpi=600)
plt.show()

## Dimensionality Reduction

To check visually whether the two sets of samples differ, we map the posterior samples into a two-dimensional space using UMAP.

In [None]:
import umap
from sklearn.preprocessing import StandardScaler

In [None]:
# normalize samples
all_samples = np.concatenate((bayes_flow_samples, abc_samples), axis=0)
scaled_samples = StandardScaler().fit_transform(all_samples)

# create umap
reducer = umap.UMAP(random_state=42, n_jobs=1,   # for reproducibility 
                    #densmap=True,  # preserve local density
                    ) 
umap_embedding = reducer.fit_transform(scaled_samples)

In [None]:
# Scatter plot of the two-dimensional UMAP embedding.
# The embedded samples are the BayesFlow samples followed by the ABC samples,
# so the first n_samples rows belong to BayesFlow and the remaining rows to ABC.
fig = plt.figure(tight_layout=True, figsize=(8, 6))
plt.scatter(
    umap_embedding[:n_samples, 0],
    umap_embedding[:n_samples, 1], label='BayesFlow', alpha=0.7, color='blue')
# these are the ABC samples (the label used to say 'MCMC')
plt.scatter(
    umap_embedding[n_samples:, 0],
    umap_embedding[n_samples:, 1], label='ABC', alpha=0.7, color='red')
plt.legend()
plt.title('Umap Based Representation of Posterior Distributions')

#plt.savefig(f'../plots/abc/posterior_umap_{individual_model.name}_individual_{individual_id}.png', dpi=600)
plt.show()