In [1]:
import random
from enum import Enum
from multiprocessing import Pool, cpu_count

import numpy as np
from scipy.spatial.distance import pdist, cdist
from scipy.stats import rankdata

In [2]:
# Distance labels
class Distance(Enum):
    """Identifiers for the distance used by each ABC variant."""

    CVM = 0   # ABC-CvM
    MMD = 1   # ABC-MMD
    WASS = 2  # ABC-Wass
    STAT = 3  # ABC-Stat


# Short display label for each distance, listed in enum order.
DISTANCE_LABELS = {
    Distance.CVM: "CvM",
    Distance.MMD: "MMD",
    Distance.WASS: "Wass",
    Distance.STAT: "Stat",
}

In [3]:
# Functions to define the distance metrics

# Common function to all examples
def cramer_von_mises_distance(obs, sim):
    """Two-sample Cramer-von Mises statistic computed from pooled ranks.

    Parameters
    ----------
    obs, sim : array-like
        One-dimensional samples; they may have different lengths.

    Returns
    -------
    float
        The rank-based statistic. Ties in the pooled sample receive
        `rankdata`'s default (average) ranks.
    """
    n_obs, n_sim = len(obs), len(sim)
    total = n_obs + n_sim

    # Rank both samples jointly, then split the ranks back per sample.
    pooled_ranks = rankdata(np.concatenate((obs, sim)))

    # Deviation of each sample's sorted pooled ranks from its within-sample ranks 1..size.
    obs_dev = np.sort(pooled_ranks[:n_obs]) - np.arange(1, n_obs + 1)
    sim_dev = np.sort(pooled_ranks[n_obs:]) - np.arange(1, n_sim + 1)

    weighted_sq = n_obs * np.sum(obs_dev ** 2) + n_sim * np.sum(sim_dev ** 2)
    return weighted_sq / (n_obs * n_sim * total) - (4 * n_obs * n_sim - 1) / (6 * total)

# Common function to all examples
def wasserstein_distance(obs, sim):
    """1-Wasserstein distance between two one-dimensional samples.

    For equally sized samples this is the mean absolute difference between
    the sorted values. Samples of different sizes are delegated to
    `scipy.stats.wasserstein_distance`.
    """
    if len(obs) != len(sim):
        # Imported locally under an alias because this function shares scipy's name.
        from scipy.stats import wasserstein_distance as scipy_wasserstein
        return scipy_wasserstein(obs, sim)

    order_stat_gaps = np.sort(obs) - np.sort(sim)
    return np.mean(np.abs(order_stat_gaps))

# Common function to all examples
def gaussian_kernel(sq_distances, sigma):
    """Evaluate exp(-sq_distances / (2 * sigma)) element-wise.

    NOTE(review): `sigma` enters the exponent un-squared, whereas the usual
    Gaussian kernel divides by 2 * sigma**2. Confirm this scaling is intended
    before changing it, since it affects every MMD value.
    """
    scale = 2 * sigma
    return np.exp(-sq_distances / scale)

# Common function to all examples
def maximum_mean_discrepancy(obs, sim, obs_sq_dist=None, sigma=None):
    """Kernel-based discrepancy between two one-dimensional samples.

    Parameters
    ----------
    obs, sim : np.ndarray
        Samples; each is reshaped into a single column.
    obs_sq_dist : np.ndarray, optional
        Condensed squared pairwise distances of `obs` (as returned by `pdist`).
        Computed here when omitted; pass it to avoid recomputation when `obs`
        is compared against many simulated samples.
    sigma : float, optional
        Kernel bandwidth passed to `gaussian_kernel`. Defaults to the square
        root of the median of `obs_sq_dist`.

    Returns
    -------
    float
        mean k(obs, obs) + mean k(sim, sim) - 2 * mean k(obs, sim), where the
        within-sample means run over distinct pairs only (condensed `pdist`
        output) and the cross term runs over all pairs.
    """
    obs_col = obs.reshape(-1, 1)
    sim_col = sim.reshape(-1, 1)

    if obs_sq_dist is None:
        obs_sq_dist = pdist(obs_col, 'sqeuclidean')
    if sigma is None:
        sigma = np.median(obs_sq_dist) ** 0.5

    within_obs = np.mean(gaussian_kernel(obs_sq_dist, sigma))
    within_sim = np.mean(gaussian_kernel(pdist(sim_col, 'sqeuclidean'), sigma))
    between = np.mean(gaussian_kernel(cdist(obs_col, sim_col, 'sqeuclidean'), sigma))
    return within_obs + within_sim - 2 * between

# Specific function for ABC-Stat in the normal example: absolute distance between sample means
def stat_distance(obs, sim):
    """Absolute difference between the sample means of `obs` and `sim`."""
    return np.abs(np.mean(obs) - np.mean(sim))

In [5]:
# Function to simulate datasets from each model. It simulates the same number of datasets for each model.
# The simulated datasets are used to compare H0: theta0 = 0 against H1: theta0 != 0.
def simulate_datasets(n_sim=10**3, sample_size=100, prior_mu_var=100):
    """Simulate the ABC reference table, half from each of the two models.

    Model 0 draws every dataset from Normal(0, 1). Model 1 first draws
    mu ~ Normal(0, prior_mu_var) and then the dataset from Normal(mu, 1).

    Parameters
    ----------
    n_sim : int
        Requested number of datasets. Each model contributes ``n_sim // 2``
        datasets, so an odd `n_sim` yields ``n_sim - 1`` rows.
    sample_size : int
        Number of observations per dataset.
    prior_mu_var : float
        Prior variance of mu under Model 1.

    Returns
    -------
    sims : np.ndarray, shape (2 * (n_sim // 2), sample_size)
        Simulated datasets, Model 0 rows first.
    mus : np.ndarray
        Mean used for each row (zero for Model 0 rows).
    models : np.ndarray of int
        Model index (0 or 1) of each row.
    """
    half = n_sim // 2

    # Model 0: Normal(0,1)
    mus0 = np.zeros(half)
    sim0 = np.random.normal(0, 1, size=(half, sample_size))
    models0 = np.zeros(half, dtype=int)

    # Model 1: Normal(mu,1), mu ~ Normal(0, prior_mu_var)
    mus1 = np.random.normal(0, prior_mu_var ** 0.5, size=half)
    # One broadcast call instead of a Python loop over every mu: row r uses
    # mean mus1[r]. Values are filled row by row, so the draws match what the
    # per-row loop would produce from the same seed.
    sim1 = np.random.normal(mus1[:, np.newaxis], 1, size=(half, sample_size))
    models1 = np.ones(half, dtype=int)

    # Combine
    sims = np.vstack((sim0, sim1))
    mus = np.concatenate((mus0, mus1))
    models = np.concatenate((models0, models1))

    return sims, mus, models

In [6]:
# Compute distances
def compute_distances(observed_sample, sims):
    """Compute all four ABC distances between one observed sample and every simulated dataset.

    Parameters
    ----------
    observed_sample : np.ndarray
        Observed data; reshaped into a single column for the pairwise distances.
    sims : np.ndarray
        Simulated datasets, one per row.

    Returns
    -------
    tuple of np.ndarray
        ``(distances_cvm, distances_mmd, distances_wass, distances_stat)``,
        each of length ``sims.shape[0]``. Note the order: MMD precedes Wasserstein.
    """
    n_sim = sims.shape[0]

    # The observed-sample pairwise distances and bandwidth do not depend on the
    # simulated dataset, so compute them once and pass them to every MMD call.
    obs_sq_dist = pdist(observed_sample.reshape(-1, 1), 'sqeuclidean')
    sigma = np.median(obs_sq_dist) ** 0.5

    distances_cvm = np.zeros(n_sim)
    distances_wass = np.zeros(n_sim)
    distances_mmd = np.zeros(n_sim)
    distances_stat = np.zeros(n_sim)

    for i, sim_sample in enumerate(sims):
        distances_cvm[i] = cramer_von_mises_distance(observed_sample, sim_sample)
        distances_wass[i] = wasserstein_distance(observed_sample, sim_sample)
        distances_mmd[i] = maximum_mean_discrepancy(observed_sample, sim_sample, obs_sq_dist, sigma)
        distances_stat[i] = stat_distance(observed_sample, sim_sample)

    return distances_cvm, distances_mmd, distances_wass, distances_stat

In [7]:
# Extract results relative to the q% smallest distances. 
def summarize_percentile(mus, models, distances, percentile):
    """Summarise the simulations whose distances fall in the smallest `percentile` percent.

    Parameters
    ----------
    mus, models, distances : np.ndarray
        Aligned arrays with one entry per simulated dataset.
    percentile : float
        Percentage (0-100 scale) of simulations to keep; at least one is always kept.

    Returns
    -------
    (float, float)
        Proportion of kept simulations that came from model 0, and the mean
        of `mus` over the kept simulations.
    """
    n_keep = max(1, round(len(distances) * percentile / 100))
    closest = np.argsort(distances)[:n_keep]
    frac_model0 = np.mean(models[closest] == 0)
    avg_mu = np.mean(mus[closest])
    return frac_model0, avg_mu

In [8]:
# Run ABC on 1 dataset
def run_abc_for_one_observed(args):
    """Run ABC with every distance for a single observed dataset.

    Parameters
    ----------
    args : tuple
        ``(observed_sample, sims, mus, models, percentiles)``, packed into one
        argument so the function can be used with ``Pool.map``.

    Returns
    -------
    dict
        Maps each distance label to ``{'prop_model0': [...], 'mean_mu': [...]}``,
        with one list entry per value in `percentiles`.
    """
    observed_sample, sims, mus, models, percentiles = args
    cvm, mmd, wass, stat = compute_distances(observed_sample, sims)
    distances_by_metric = {
        Distance.CVM: cvm,
        Distance.MMD: mmd,
        Distance.WASS: wass,
        Distance.STAT: stat,
    }

    results = {}
    for metric in Distance:
        summaries = [
            summarize_percentile(mus, models, distances_by_metric[metric], perc)
            for perc in percentiles
        ]
        results[DISTANCE_LABELS[metric]] = {
            'prop_model0': [prop0 for prop0, _ in summaries],
            'mean_mu': [mean_mu for _, mean_mu in summaries],
        }
    return results

In [9]:
# Main function
def main():
    """Run the ABC model-choice experiment for the normal example and save the summaries.

    Simulates the reference table, simulates the observed datasets from H0,
    runs ABC for each observed dataset in a process pool, and writes the
    per-dataset summaries to ``normal_example_m0.npz``.
    """
    np.random.seed(42)
    sample_size = 100
    n_sim = 10**6
    n_observed = 100
    # summarize_percentile divides these by 100, so they are percentages.
    percentiles = [0.1, 0.05, 0.01]

    print("Simulating datasets...")
    sims, mus, models = simulate_datasets(n_sim=n_sim, sample_size=sample_size)

    # Separate seed for the observed datasets.
    seed = 12345
    np.random.seed(seed)
    random.seed(seed)

    print(f"Simulating {n_observed} observed datasets...")
    observed_datasets = [np.random.normal(0, 1, sample_size) for _ in range(n_observed)] # Simulating data from H0; change this depending on the true model

    print(f"Starting parallel ABC with {cpu_count()} cores...")
    # The reference table is large (n_sim x sample_size floats). Hand it to each
    # worker once through the pool initializer instead of packing it into every
    # task, so only the small observed samples travel with the tasks.
    with Pool(initializer=_init_abc_worker,
              initargs=(sims, mus, models, percentiles)) as pool:
        all_results = pool.map(_run_abc_in_worker, observed_datasets)

    # Aggregate results in arrays
    distance_names = list(DISTANCE_LABELS.values())
    n_dist = len(distance_names)
    n_perc = len(percentiles)

    prop_model0_summary = np.zeros((n_observed, n_dist, n_perc))
    mean_mu_summary = np.zeros((n_observed, n_dist, n_perc))

    for i, res in enumerate(all_results):
        for d_idx, dist_name in enumerate(distance_names):
            prop_model0_summary[i, d_idx, :] = res[dist_name]['prop_model0']
            mean_mu_summary[i, d_idx, :] = res[dist_name]['mean_mu']

    # Save to npz
    np.savez('normal_example_m0.npz', # Change name of model
             prop_model0=prop_model0_summary,
             mean_mu=mean_mu_summary,
             percentiles=percentiles,
             distance_names=distance_names)

    print("ABC summaries saved to normal_example_m0.npz")


# Per-process storage for the reference table, filled by the pool initializer.
_ABC_WORKER_SHARED = {}


def _init_abc_worker(sims, mus, models, percentiles):
    """Store the shared ABC inputs in this worker process (called once per worker)."""
    _ABC_WORKER_SHARED['inputs'] = (sims, mus, models, percentiles)


def _run_abc_in_worker(observed_sample):
    """Run ABC for one observed dataset against the inputs stored by `_init_abc_worker`."""
    return run_abc_for_one_observed((observed_sample,) + _ABC_WORKER_SHARED['inputs'])


if __name__ == "__main__":
    main()

Simulating datasets...
Simulating 2 observed datasets...
Starting parallel ABC with 8 cores...
ABC summaries saved to abc_100obs_summary_with_stat.npz
