In [1]:
import numpy as np
from scipy.stats import rankdata
from scipy.spatial.distance import pdist, cdist
from enum import Enum
from multiprocessing import Pool, cpu_count
import pprint

import random
import math
import time
from scipy.stats import levy_stable
from typing import List, Tuple

In [2]:
# Distance labels
class Distance(Enum):
    CVM = 0
    MMD = 1
    WASS = 2
    STAT = 3  # New distance: Euclidean distance between means

DISTANCE_LABELS = {Distance.CVM: "CvM", Distance.MMD: "MMD", Distance.WASS: "Wass", Distance.STAT: "Stat"}

In [3]:
# Functions to compute the distances

# Common to all examples
def cramer_von_mises_distance(obs, sim):
    n = len(obs)
    m = len(sim)
    if n == 0 or m == 0:
        return np.nan  # or return 0 or a large constant depending on use case

    combined = np.concatenate((obs, sim))
    ranks = rankdata(combined)
    obs_ranks = np.sort(ranks[:n])
    sim_ranks = np.sort(ranks[n:])
    i = np.arange(1, n + 1)
    j = np.arange(1, m + 1)
    term1 = n * np.sum((obs_ranks - i) ** 2)
    term2 = m * np.sum((sim_ranks - j) ** 2)
    denom = n * m * (n + m)
    distance = (term1 + term2) / denom - (4 * n * m - 1) / (6 * (n + m))
    return distance

# Common to all examples
def wasserstein_distance(obs, sim):
    if len(obs) == len(sim):
        sorted_obs = np.sort(obs)
        sorted_sim = np.sort(sim)
        return np.mean(np.abs(sorted_obs - sorted_sim))
    else:
        # fallback to scipy implementation if different sizes
        from scipy.stats import wasserstein_distance as ws
        return ws(obs, sim)

# Common to all examples
def gaussian_kernel(sq_distances, sigma):
    return np.exp(-sq_distances / (2 * sigma))

# Common to all examples
def maximum_mean_discrepancy(obs, sim, obs_sq_dist=None, sigma=None):
    if obs_sq_dist is None:
        obs_sq_dist = pdist(obs.reshape(-1,1), 'sqeuclidean')
    if sigma is None:
        sigma = np.median(obs_sq_dist) ** 0.5
    sim_sq_dist = pdist(sim.reshape(-1,1), 'sqeuclidean')
    mixed_sq_dist = cdist(obs.reshape(-1,1), sim.reshape(-1,1), 'sqeuclidean')
    k_xx = np.mean(gaussian_kernel(obs_sq_dist, sigma))
    k_yy = np.mean(gaussian_kernel(sim_sq_dist, sigma))
    k_xy = np.mean(gaussian_kernel(mixed_sq_dist, sigma))
    return k_xx + k_yy - 2 * k_xy

# Specific to the toad example
def statistical_distance(x, y):
    q = np.linspace(0.0, 1.0, 11)
    xq = np.quantile(x, q)
    yq = np.quantile(y, q)
    with np.errstate(divide='ignore'):
        logdiff = np.abs(np.log1p(xq) - np.log1p(yq))
    return np.nansum(logdiff)

In [4]:
# Functions to simulate for the three toad movement models
class Model(Enum):
    RANDOM = 0
    NEAREST = 1
    DISTANCE = 2
    
def distance_based_probs(position: float, refuge_locations: np.ndarray, p0: float, d0: float) -> np.ndarray:
    # Calculating individual return probabilties based on the current position compared to the 
    # refuge locations for the distance-based return model
    refuge_distances = np.abs(position - refuge_locations)
    
    return p0 * np.exp(-refuge_distances / d0)

def toad_movement_sample(model: Model, alpha: float, gamma: float, p0: float, d0: float = None, num_toads: int = 66, num_days: int = 63) -> np.ndarray:
    
    # Storing 0 as initial refuge for distance-based return
    toad_positions = np.zeros((num_days, num_toads))
    if model == Model.DISTANCE:
        refuge_counts = np.ones(num_toads, dtype=int)
        refuge_locations = np.zeros((num_days, num_toads))
    else:
        no_return_probs = 1 - p0
    
    # Simulation of step sizes for each toad over the tracking period
    steps = levy_stable.rvs(alpha, 0, scale=gamma, size=(num_days - 1, num_toads))
    
    # Main loop, all toads are handled in one loop to make use of vectorised calculations
    for i in range(1, num_days):
        # Calculating new position
        new_pos = toad_positions[i - 1] + steps[i - 1]
        
        # Calculating no return probability for distance-based return model (not constant)
        if model == Model.DISTANCE:
            refuge_probs = [distance_based_probs(new_pos[j], refuge_locations[:refuge_counts[j], j], p0, d0) for j in range(num_toads)]
            no_return_probs = np.array([np.prod(1 - refuge_probs[j]) for j in range(num_toads)])
        
        # Separating toads which are return and not returning for the current day
        no_return_flag = np.random.uniform(size=num_toads) < no_return_probs
        no_return_ids = np.nonzero(no_return_flag)[0]
        return_ids = np.nonzero(~no_return_flag)[0]
        
        # Updating toad position for non returning toads to th new positions
        toad_positions[i, no_return_ids] = new_pos[no_return_ids]
        
        if model == Model.RANDOM:
            # Randomly selecting a location among all previous locations for returning toads
            return_location_ids = np.random.randint(0, i, size=return_ids.shape)
            toad_positions[i, return_ids] = toad_positions[return_location_ids, return_ids]
        elif model == Model.NEAREST:
            # Determining nearest return location for each return toad
            return_location_ids = np.argmin(np.abs(new_pos[return_ids] - toad_positions[:i, return_ids]), axis=0)
            toad_positions[i, return_ids] = toad_positions[return_location_ids, return_ids]
        else:
            # Randomly selecting previous location using distance-based probabilities for returning toads
            # and updating refuge locations and counts for non-return toads
            return_location_ids = [np.random.choice(list(range(refuge_counts[j])), p=refuge_probs[j] / np.sum(refuge_probs[j])) for j in return_ids]
            toad_positions[i, return_ids] = refuge_locations[return_location_ids, return_ids]
            refuge_locations[refuge_counts[no_return_ids], no_return_ids] = new_pos[no_return_ids]
            refuge_counts[no_return_ids] += 1
            
    return toad_positions

# Compute the 48 summaries that will represent the observed dataset
def compute_displacement_summaries(Y: np.ndarray, lags=[1,2,4,8], threshold=10.0):
    summaries = []
    n_days, n_toads = Y.shape
    for lag in lags:
        displacements = np.abs(Y[lag:, :] - Y[:-lag, :]).flatten()
        returns = np.sum(displacements < threshold)
        non_returns = displacements[displacements >= threshold]
        summaries.append({'returns': returns, 'non_returns': non_returns})
    return dict(zip(lags, summaries))

In [5]:
# Compute distances - this function has similarity with the same function for other examples, but, given the different structure of 
# the input dataset, compute distances for each lag and type of statistics - so it is specific for the toad example
def compute_lag_distances(obs_summary, sim_summaries, omega=0.2):
    lag_keys = [1, 2, 4, 8]
    n_sim = len(sim_summaries)
    all_results = []

    for sim_idx, sim_summary in enumerate(sim_summaries):
        abs_diff_sum = 0
        cvm_vals = []
        wass_vals = []
        mmd_vals = []
        stat_vals = []

        for lag in lag_keys:
            ret_obs = obs_summary[lag]['returns']
            ret_sim = sim_summary[lag]['returns']
            nonret_obs = obs_summary[lag]['non_returns']
            nonret_sim = sim_summary[lag]['non_returns']

            abs_diff_sum += abs(ret_obs - ret_sim)

            if len(nonret_obs) > 0 and len(nonret_sim) > 0:
                cvm_vals.append(cramer_von_mises_distance(nonret_obs, nonret_sim))
                wass_vals.append(wasserstein_distance(nonret_obs, nonret_sim))
                mmd_vals.append(maximum_mean_discrepancy(nonret_obs, nonret_sim))
                stat_vals.append(statistical_distance(nonret_obs, nonret_sim))
            else:
                cvm_vals.append(np.nan)
                wass_vals.append(np.nan)
                mmd_vals.append(np.nan)
                stat_vals.append(np.nan)

        sim_result = {
            'CvM': {'return': abs_diff_sum, 'non_return': np.nanmean(cvm_vals)},
            'Wass': {'return': abs_diff_sum, 'non_return': np.nanmean(wass_vals)},
            'MMD': {'return': abs_diff_sum, 'non_return': np.nanmean(mmd_vals)},
            'Stat': {'return': abs_diff_sum, 'non_return': np.nanmean(stat_vals)}
        }

        all_results.append(sim_result)

    return all_results

In [6]:
# Function to combine the 8 distances (2*number of lags) computed with the compute_lag_distances() function
def combine_distances(dist_results, omega=0.2):
    n_sim = len(dist_results)
    metrics = dist_results[0].keys()

    combined_per_metric = {}

    for metric in metrics:
        ret_dists = np.array([d[metric]['return'] for d in dist_results])
        nonret_dists = np.array([d[metric]['non_return'] for d in dist_results])

        max_ret = ret_dists.max() if ret_dists.max() > 0 else 1e-10
        max_nonret = nonret_dists.max() if nonret_dists.max() > 0 else 1e-10

        # Weighted normalized distance per simulation
        combined = omega * (ret_dists / max_ret) + (1 - omega) * (nonret_dists / max_nonret)
        combined_per_metric[metric] = combined

    return combined_per_metric

In [7]:
# Given the increased computational time to simulate datasets and extract summaries, we write 
# functions specific for the toad example, making use of parallel simulation.
def simulate_one(args):
    model = args
    if model == Model.DISTANCE:
        alpha = np.random.uniform(1, 2.0)
        gamma = np.random.uniform(10, 100)
        p0 = np.random.uniform(0, 1)
        d0 = np.random.uniform(20, 2000)
        Y = toad_movement_sample(model, alpha, gamma, p0, d0)
        thetas = (alpha, gamma, p0, d0)
    else:
        alpha = np.random.uniform(1, 2.0)
        gamma = np.random.uniform(10, 100)
        p0 = np.random.uniform(0, 1)
        Y = toad_movement_sample(model, alpha, gamma, p0)
        thetas = (alpha, gamma, p0, np.nan)

    summary = compute_displacement_summaries(Y)
    return summary, thetas, model.value

def simulate_datasets_parallel(n_sim: int, processes: int = None):
    if processes is None:
        processes = cpu_count()

    model_list = ([Model.RANDOM] * (n_sim // 3) +
                  [Model.NEAREST] * (n_sim // 3) +
                  [Model.DISTANCE] * (n_sim // 3))

    with Pool(processes) as pool:
        results = pool.map(simulate_one, [m for m in model_list])

    summaries, thetas, models = zip(*results)
    return np.array(summaries), np.array(thetas), np.array(models)

In [8]:
# Function to extract the models and parameters relative to the q% smallest distances - we need a new function 
# because we are using dictionaries for this example
from collections import defaultdict

def summarize_percentile(sim_thetas, sim_models, dists, percentile=1.0):
    dists = np.asarray(dists)
    sim_thetas = np.asarray(sim_thetas)
    sim_models = np.asarray(sim_models)

    # Determine threshold
    threshold = np.percentile(dists, percentile * 100)
    accepted_idx = np.where(dists <= threshold)[0]

    # Log how many are accepted
    print(f"Percentile {percentile:.2f}: accepted {len(accepted_idx)} / {len(dists)} simulations")

    if len(accepted_idx) == 0:
        num_models = len(np.unique(sim_models))
        num_thetas = sim_thetas.shape[1] if sim_thetas.ndim > 1 else 1
        return (
            np.full(num_models, np.nan),
            np.full(num_thetas, np.nan),
        )

    accepted_models = sim_models[accepted_idx]
    accepted_thetas = sim_thetas[accepted_idx]

    # Model probabilities
    unique_models, counts = np.unique(accepted_models, return_counts=True)
    model_probs = np.zeros(len(np.unique(sim_models)))
    model_probs[unique_models] = counts / counts.sum()

    # Mean theta
    theta_means = np.nanmean(accepted_thetas, axis=0)

    return model_probs, theta_means

In [21]:
# Function to run ABC for one observed dataset
def run_abc_for_one_observed(i, obs_data, sim_summaries, sim_thetas, sim_models, percentiles, output_dir):
    obs_summary = compute_displacement_summaries(obs_data)
    lag_distances = compute_lag_distances(obs_summary, sim_summaries)
    combined_all = combine_distances(lag_distances, omega=0.2)

    results = {}
    for dist_enum in Distance:
        dist_name = DISTANCE_LABELS[dist_enum]
        combined_dist = combined_all[dist_name]

        model_probs_list = []
        theta_means_list = []

        for perc in percentiles:
            model_probs, theta_means = summarize_percentile(sim_thetas, sim_models, combined_dist, perc)

            if model_probs is not None and theta_means is not None:
                model_probs_list.append(np.array(model_probs[:3]))
                theta_means_list.append(np.array(theta_means[:3]))
            else:
                model_probs_list.append(np.full(3, np.nan))
                theta_means_list.append(np.full(3, np.nan))

        results[dist_name] = {
            'model_probs': model_probs_list,  # shape: (len(percentiles), 3)
            'theta_means': theta_means_list   
        }

    out_path = Path(output_dir) / f"toad_result_random_{i+1}.npz"
    np.savez(
        out_path,
        result=results,
        index=i,
        percentiles=percentiles
    )
    return results

In [28]:
import os
from multiprocessing import Pool, cpu_count
from pathlib import Path
from functools import partial
import numpy as np

def run_one(indexed_obs, sim_summaries, sim_thetas, sim_models, percentiles, output_dir):
    i, obs = indexed_obs
    return run_abc_for_one_observed(i, obs, sim_summaries, sim_thetas, sim_models, percentiles, output_dir)

# Main function
def main():
    np.random.seed(42)

    # Parameters
    n_observed = 100
    n_simulations = 100000
    percentiles = [0.01, 0.005, 0.001]
    output_dir = Path("toad/random/") # change to your folder
    output_dir.mkdir(parents=True, exist_ok=True)

    # Step 1: Simulate training data
    print("Generating simulations...")
    sims, thetas, models = simulate_datasets_parallel(n_sim=n_simulations)
    sim_summaries = sims

    # Step 2: Simulate observed datasets
    print("Simulating observed datasets...")
    observed_raw = [toad_movement_sample(Model.RANDOM, alpha=1.7, gamma=34, p0=0.6) for _ in range(n_observed)] # Simulation for random return model (Model M1)

    # Step 3: Run ABC
    print("Running ABC in parallel...")
    func = partial(
        run_one,
        sim_summaries=sim_summaries,
        sim_thetas=thetas,
        sim_models=models,
        percentiles=percentiles,
        output_dir=output_dir
    )

    with Pool(processes=cpu_count()) as pool:
        indexed_obs = list(enumerate(observed_raw))
        all_results = pool.map(func, indexed_obs)

    # Step 4: Aggregate results
    distance_names = list(DISTANCE_LABELS.values())
    n_dist = len(distance_names)
    n_perc = len(percentiles)
    n_theta = len(thetas[0]) if len(thetas) > 0 else 0

    model_probs_summary = np.zeros((n_observed, n_dist, 3, n_perc))
    theta_means_summary = np.zeros((n_observed, n_dist, 3, n_perc))

    for i, result in enumerate(all_results):
        for j, dist_name in enumerate(distance_names):
            model_probs = np.array(result[dist_name]['model_probs'])  # shape: (n_perc, 3)
            theta_means = np.array(result[dist_name]['theta_means'])  # shape: (n_perc, 3)

            # Transpose to (3, n_perc)
            model_probs_summary[i, j] = model_probs.T
            theta_means_summary[i, j] = theta_means.T

    # Step 5: Save results
    np.save(output_dir / "model_probs_summary.npy", model_probs_summary)
    np.save(output_dir / "theta_means_summary.npy", theta_means_summary)

    return model_probs_summary, theta_means_summary

if __name__ == "__main__":
    main()

Generating simulations...
Simulating observed datasets...
Running ABC in parallel...
Percentile 0.01: accepted 12 / 999 simulations
Percentile 0.01: accepted 5 / 999 simulations
Percentile 0.00: accepted 1 / 999 simulations
Percentile 0.01: accepted 14 / 999 simulations
Percentile 0.01: accepted 6 / 999 simulations
Percentile 0.00: accepted 2 / 999 simulations
Percentile 0.01: accepted 10 / 999 simulations
Percentile 0.01: accepted 10 / 999 simulations
Percentile 0.00: accepted 2 / 999 simulations
Percentile 0.01: accepted 10 / 999 simulations
Percentile 0.01: accepted 6 / 999 simulations
Percentile 0.00: accepted 2 / 999 simulations
Percentile 0.01: accepted 11 / 999 simulations
Percentile 0.01: accepted 6 / 999 simulations
Percentile 0.00: accepted 2 / 999 simulations
Percentile 0.01: accepted 11 / 999 simulations
Percentile 0.01: accepted 6 / 999 simulations
Percentile 0.00: accepted 2 / 999 simulations
Percentile 0.01: accepted 10 / 999 simulations
Percentile 0.01: accepted 10 / 99