In [1]:
import numpy as np
from scipy.stats import rankdata
from scipy.spatial.distance import pdist, cdist
from enum import Enum
from multiprocessing import Pool, cpu_count

In [2]:
# Distance labels
class Distance(Enum):
    """Identifiers for the four ABC discrepancy measures compared in this notebook."""

    CVM = 0   # Cramer-von Mises distance (ABC-CvM)
    MMD = 1   # maximum mean discrepancy (ABC-MMD)
    WASS = 2  # Wasserstein distance (ABC-Wass)
    STAT = 3  # summary-statistic distance (ABC-Stat)

# Short display name for each distance; run_abc_for_one_observed uses these
# strings as the keys of its per-distance results.
DISTANCE_LABELS = {
    Distance.CVM: "CvM",
    Distance.MMD: "MMD",
    Distance.WASS: "Wass",
    Distance.STAT: "Stat",
}

In [3]:
# Functions to compute the distance metrics

# Common to all examples
def cramer_von_mises_distance(obs, sim):
    """Two-sample Cramer-von Mises statistic between `obs` and `sim`.

    The two samples are ranked jointly with `rankdata`. For each sample the
    sorted pooled ranks are compared with the within-sample positions
    1..size, and the squared gaps are combined into the statistic.

    Parameters
    ----------
    obs, sim : 1-D array-likes (they are passed to `np.concatenate`).

    Returns
    -------
    The statistic as a float.
    """
    n_obs, n_sim = len(obs), len(sim)
    pooled_ranks = rankdata(np.concatenate((obs, sim)))

    def _squared_rank_gap(sample_ranks):
        # Sum over the ordered sample of (pooled rank - within-sample position)^2.
        positions = np.arange(1, len(sample_ranks) + 1)
        return np.sum((np.sort(sample_ranks) - positions) ** 2)

    u_stat = (n_obs * _squared_rank_gap(pooled_ranks[:n_obs])
              + n_sim * _squared_rank_gap(pooled_ranks[n_obs:]))
    scale = n_obs * n_sim * (n_obs + n_sim)
    offset = (4 * n_obs * n_sim - 1) / (6 * (n_obs + n_sim))
    return u_stat / scale - offset

# Common to all examples
def wasserstein_distance(obs, sim):
    """1-D Wasserstein distance between two samples.

    When both samples have the same length the distance is the mean absolute
    difference between their sorted values. Otherwise the computation is
    delegated to `scipy.stats.wasserstein_distance`.
    """
    if len(obs) != len(sim):
        # Unequal sizes: defer to SciPy's general implementation.
        from scipy.stats import wasserstein_distance as scipy_wasserstein
        return scipy_wasserstein(obs, sim)

    # Equal sizes: pair up the order statistics and average their absolute gaps.
    gaps = np.sort(obs) - np.sort(sim)
    return np.mean(np.abs(gaps))

# Common to all examples
def gaussian_kernel(sq_distances, sigma):
    """Gaussian (RBF) kernel evaluated on squared distances.

    Computes exp(-d^2 / (2 * sigma^2)) element-wise.

    Parameters
    ----------
    sq_distances : array-like of squared distances d^2.
    sigma : kernel bandwidth, on the same scale as the (unsquared) distances.
        The callers in this file pass the square root of the median squared
        pairwise distance, i.e. a length scale, so it has to be squared here
        for the exponent to be dimensionless.

    Returns
    -------
    Kernel values with the same shape as `sq_distances`.
    """
    return np.exp(-sq_distances / (2 * sigma ** 2))

# Common to all examples
def maximum_mean_discrepancy(obs, sim, obs_sq_dist=None, sigma=None):
    """Maximum mean discrepancy estimate between two 1-D samples.

    Parameters
    ----------
    obs, sim : 1-D numpy arrays (each is reshaped to a single column).
    obs_sq_dist : optional precomputed condensed squared pairwise distances
        of `obs`; computed with `pdist` when omitted.
    sigma : optional kernel bandwidth forwarded to `gaussian_kernel`;
        defaults to the square root of the median of `obs_sq_dist`.

    Returns
    -------
    mean k(obs, obs) + mean k(sim, sim) - 2 * mean k(obs, sim), where the
    within-sample means run over the `pdist` pairs and the cross mean runs
    over every (obs, sim) pair from `cdist`.
    """
    obs_col = obs.reshape(-1, 1)
    sim_col = sim.reshape(-1, 1)

    if obs_sq_dist is None:
        obs_sq_dist = pdist(obs_col, 'sqeuclidean')
    if sigma is None:
        sigma = np.median(obs_sq_dist) ** 0.5

    within_obs = np.mean(gaussian_kernel(obs_sq_dist, sigma))
    within_sim = np.mean(gaussian_kernel(pdist(sim_col, 'sqeuclidean'), sigma))
    across = np.mean(gaussian_kernel(cdist(obs_col, sim_col, 'sqeuclidean'), sigma))
    return within_obs + within_sim - 2 * across

# Function specific for the quantile distribution example for ABC-Stat
def stat_distance(obs, sim):
    """Summary-statistic distance used by ABC-Stat in the quantile example.

    Each sample is summarised by its 0.1 and 0.9 quantiles; the distance is
    the L1 norm (sum of absolute differences) between the two summaries.
    """
    probs = [0.1, 0.9]
    quantile_gap = np.quantile(obs, probs) - np.quantile(sim, probs)
    return np.abs(quantile_gap).sum()

In [4]:
# Quantile function for the g-and-k distribution
def g_and_k_quantile(z, A=0, B=1, g=0, k=0, c=0.8):
    """Quantile function of the g-and-k distribution.

    Evaluates
        Q(z) = A + B * (1 + c * tanh(g * z / 2)) * (1 + z**2)**k * z
    element-wise.

    Parameters
    ----------
    z : scalar or array of standard-normal quantiles.
    A, B : additive and multiplicative constants of the formula above.
    g : coefficient inside the tanh term (g = 0 removes that term).
    k : exponent applied to (1 + z**2).
    c : weight of the tanh term.

    Returns
    -------
    Values with the same shape as `z`.
    """
    # (1 - exp(-g z)) / (1 + exp(-g z)) is exactly tanh(g z / 2). The tanh
    # form is used because the exponential form overflows to inf/inf = nan
    # when g * z is very negative.
    skew_term = 1 + c * np.tanh(g * z / 2)
    tail_term = (1 + z**2) ** k
    return A + B * skew_term * tail_term * z

# Generate the simulated datasets, drawing the same number of datasets from each of the two models.
def simulate_datasets(n_sim=10**3, sample_size=100):
    """Simulate g-and-k datasets from two candidate models with prior-drawn parameters.

    `n_sim // 2` datasets are generated per model, so an odd `n_sim` yields
    `n_sim - 1` datasets in total. A = 0, B = 1 and c = 0.8 are fixed.

    Model 0: g = 0,               k ~ Uniform(-0.5, 5)
    Model 1: g ~ Uniform(0, 4),   k ~ Uniform(-0.5, 5)

    Returns
    -------
    sims : array with one simulated dataset of length `sample_size` per row
        (model 0 rows first, then model 1 rows).
    thetas : array with one (g, k) pair per row, aligned with `sims`.
    models : integer model label (0 or 1) per row, aligned with `sims`.
    """
    per_model = n_sim // 2
    loc, scale, skew_weight = 0, 1, 0.8  # A, B, c of the g-and-k distribution

    def _draw_samples(g_draws, k_draws):
        # One row of standard-normal draws per (g, k) pair, pushed through the quantile function.
        z = np.random.normal(0, 1, size=(per_model, sample_size))
        return np.array([
            g_and_k_quantile(z_row, A=loc, B=scale, g=g_i, k=k_i, c=skew_weight)
            for z_row, g_i, k_i in zip(z, g_draws, k_draws)
        ])

    # Model 0 (the random draws are made in the order k, then z)
    k0 = np.random.uniform(-0.5, 5, size=per_model)
    g0 = np.zeros(per_model)
    sim0 = _draw_samples(g0, k0)

    # Model 1 (the random draws are made in the order g, k, then z)
    g1 = np.random.uniform(0, 4, size=per_model)
    k1 = np.random.uniform(-0.5, 5, size=per_model)
    sim1 = _draw_samples(g1, k1)

    # Stack model 0 on top of model 1 in every output
    sims = np.vstack((sim0, sim1))
    thetas = np.vstack((np.concatenate((g0, g1)), np.concatenate((k0, k1)))).T
    models = np.concatenate((np.zeros(per_model, dtype=int), np.ones(per_model, dtype=int)))

    return sims, thetas, models

In [5]:
# Compute distances for one observed sample
def compute_distances(observed_sample, sims):
    """Compute all four ABC distances between one observed sample and every simulated dataset.

    Parameters
    ----------
    observed_sample : 1-D numpy array holding the observed dataset.
    sims : 2-D numpy array with one simulated dataset per row.

    Returns
    -------
    Four 1-D arrays of length `sims.shape[0]`, in the order
    (CvM, MMD, Wasserstein, Stat).
    """
    n_sim = sims.shape[0]

    # The observed-side pairwise distances and the bandwidth do not depend on
    # the simulated sample, so they are computed once and reused by every MMD call.
    obs_sq_dist = pdist(observed_sample.reshape(-1, 1), 'sqeuclidean')
    sigma = np.median(obs_sq_dist) ** 0.5

    distances_cvm = np.zeros(n_sim)
    distances_wass = np.zeros(n_sim)
    distances_mmd = np.zeros(n_sim)
    distances_stat = np.zeros(n_sim)

    for i in range(n_sim):
        sim_sample = sims[i]
        distances_cvm[i] = cramer_von_mises_distance(observed_sample, sim_sample)
        distances_wass[i] = wasserstein_distance(observed_sample, sim_sample)
        distances_mmd[i] = maximum_mean_discrepancy(observed_sample, sim_sample, obs_sq_dist, sigma)
        distances_stat[i] = stat_distance(observed_sample, sim_sample)

    # Note the return order: MMD comes before Wasserstein, matching the Distance enum.
    return distances_cvm, distances_mmd, distances_wass, distances_stat

In [6]:
# Summarize lowest percentile
def summarize_percentile(thetas, models, distances, percentile):
    """Summarise the simulations whose distances fall in the lowest `percentile` percent.

    Parameters
    ----------
    thetas : array with one parameter vector per simulation (rows).
    models : array with the model label of each simulation.
    distances : array with the distance of each simulation to the observed data.
    percentile : share of simulations to keep, in percent; at least one
        simulation is always kept.

    Returns
    -------
    (prop_common_model, mean_theta) where `prop_common_model` is the share of
    kept simulations that belong to the most frequent model among them, and
    `mean_theta` is the mean parameter vector over the kept simulations of
    that model. Both are NaN when nothing is kept.
    """
    total = len(distances)
    keep_count = max(1, round(total * percentile / 100))
    nearest = np.argsort(distances)[:keep_count]

    kept_thetas = thetas[nearest]
    kept_models = models[nearest]

    # Nothing was kept, so there is nothing to summarise.
    if len(kept_models) == 0:
        return np.nan, np.array([np.nan, np.nan])

    # Most frequent model label among the kept simulations
    labels, tallies = np.unique(kept_models, return_counts=True)
    winner = labels[np.argmax(tallies)]
    is_winner = kept_models == winner

    # Parameters are averaged only over the kept simulations of the winning model
    winner_thetas = kept_thetas[is_winner]
    if len(winner_thetas) > 0:
        mean_theta = np.mean(winner_thetas, axis=0)
    else:
        mean_theta = np.array([np.nan, np.nan])

    return np.mean(is_winner), mean_theta

In [7]:
# Run ABC for one observed dataset
def run_abc_for_one_observed(args):
    """Run the ABC summaries for a single observed dataset.

    Parameters
    ----------
    args : tuple `(observed_sample, sims, thetas, models, percentiles)`; a
        single packed argument so the function can be used with `Pool.map`.

    Returns
    -------
    dict keyed by distance label. Each value is a dict with two lists that
    hold one entry per requested percentile: 'prop_model' and 'mean_theta'.
    """
    observed_sample, sims, thetas, models, percentiles = args
    cvm, mmd, wass, stat = compute_distances(observed_sample, sims)

    results = {}
    # The tuple below is ordered like the Distance enum members.
    for metric, metric_distances in zip(Distance, (cvm, mmd, wass, stat)):
        summaries = [
            summarize_percentile(thetas, models, metric_distances, perc)
            for perc in percentiles
        ]
        results[DISTANCE_LABELS[metric]] = {
            'prop_model': [prop for prop, _ in summaries],
            'mean_theta': [mean_theta for _, mean_theta in summaries],
        }

    return results

In [8]:
def g_and_k_quantile(z, A=0, B=1, c=0.8, g=0, k=2):
    """Quantile function of the g-and-k distribution (tanh form).

    NOTE(review): this redefines the `g_and_k_quantile` declared earlier in
    the file and replaces it once this cell has run. The positional order
    differs (`c` comes before `g` and `k` here) and `k` defaults to 2 instead
    of 0, so pass the parameters by keyword.

    Returns A + B * (1 + c * tanh(g * z / 2)) * z * (1 + z**2)**k, element-wise.
    """
    skew_term = 1 + c * np.tanh(g * z / 2)
    tail_term = (1 + z**2) ** k
    return A + B * skew_term * z * tail_term

def sample_g_and_k(n, A=0, B=1, c=0.8, g=0, k=2):
    """Draw `n` values from a g-and-k distribution.

    Standard-normal draws from numpy's global random state are transformed
    with `g_and_k_quantile` using the given parameters.
    """
    standard_normals = np.random.normal(0, 1, size=n)
    sample = g_and_k_quantile(standard_normals, A=A, B=B, c=c, g=g, k=k)
    return sample

# Inputs shared by every ABC task; set inside each worker process by _init_abc_worker.
_ABC_SHARED = None


def _init_abc_worker(sims, thetas, models, percentiles):
    """Pool initializer: keep the task-invariant inputs in a per-process global."""
    global _ABC_SHARED
    _ABC_SHARED = (sims, thetas, models, percentiles)


def _abc_task(observed_sample):
    """Run ABC for one observed dataset against the inputs stored by the initializer."""
    return run_abc_for_one_observed((observed_sample, *_ABC_SHARED))


def main():
    """Run the g-and-k ABC experiment and save the summaries to 'gk_example_g0.npz'.

    Steps: seed numpy's global RNG, simulate the reference table, simulate
    the observed datasets (g = 0, k = 2), run the ABC summaries for every
    observed dataset in a process pool, collect the results into arrays and
    write them with `np.savez`.
    """
    np.random.seed(42)
    sample_size = 100
    n_sim = 10**6
    n_observed = 100
    percentiles = [0.1, 0.05, 0.01] # Change this to the percentile of interest

    print("Simulating datasets...")
    sims, thetas, models = simulate_datasets(n_sim=n_sim, sample_size=sample_size)

    # g = 0 switches off the skewness term; simulate_datasets labels that model 0.
    print(f"Simulating {n_observed} observed datasets from g-and-k with g=0, k=2...")
    observed_datasets = [sample_g_and_k(sample_size, A=0, B=1, c=0.8, g=0, k=2)
                         for _ in range(n_observed)]

    print(f"Starting parallel ABC with {cpu_count()} cores...")
    # The reference table is identical for every task, so it is handed to the
    # workers once through the pool initializer. Only the small observed
    # samples travel through the task queue, instead of the full simulation
    # array being re-sent with every chunk of tasks.
    with Pool(initializer=_init_abc_worker,
              initargs=(sims, thetas, models, percentiles)) as pool:
        all_results = pool.map(_abc_task, observed_datasets)

    # Initialize result containers
    distance_names = list(DISTANCE_LABELS.values())
    n_dist = len(distance_names)
    n_perc = len(percentiles)

    prop_model_summary = np.zeros((n_observed, n_dist, n_perc))
    mean_theta_summary = np.zeros((n_observed, n_dist, n_perc, 2))  # For [g, k]

    for i, res in enumerate(all_results):
        for d_idx, dist_name in enumerate(distance_names):
            prop_model_summary[i, d_idx, :] = res[dist_name]['prop_model']
            mean_theta_summary[i, d_idx, :, :] = res[dist_name]['mean_theta']

    # Save results
    np.savez('gk_example_g0.npz',
             prop_model=prop_model_summary,
             mean_theta=mean_theta_summary,
             percentiles=percentiles,
             distance_names=distance_names)

    print("ABC summaries saved to gk_example_g0.npz")

# Run the experiment only when this code is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()

Simulating datasets...
Simulating 10 observed datasets from g-and-k with g=0, k=2...
Starting parallel ABC with 8 cores...
ABC summaries saved to gk_example_results.npz
