In [1]:
# Cell 1 — imports + parameters
import os, glob
from pathlib import Path
import numpy as np
import pandas as pd

# folders
# Input folders: each one is globbed for "*.csv" files by the functions below.
FOLDERS = ["paths1", "paths2"]
# Output directory for the per-folder results CSVs; created here if missing.
OUTDIR = Path("AVMs4")
OUTDIR.mkdir(exist_ok=True)

# (d) settings
B_LIST = [50, 100, 200, 500, 1000, 2000]   # block lengths b (edit if you want)
STRIDE_S = 100                              # keep every STRIDE_S-th row of each file when pooling the ensemble reference
N_BLOCKS_PER_B = 10                         # how many random blocks per b per file
Q_GRID_N = 4001                             # number of reference quantiles used as the integration grid
# Seed for the generator below.
SEED = 123

# One generator shared by every block-sampling call, so the drawn block
# positions depend on the order in which folders/files/variables are processed.
rng = np.random.default_rng(SEED)


In [2]:
# Cell 2 — I/O + ensemble reference construction (thinned) + AVM via trapz

def read_csv_4cols(file_path: str) -> np.ndarray:
    """
    Load a CSV and return its first four columns as a float array of shape (n, 4).

    The columns are taken positionally and are expected to be
    theta1, w1, theta2, w2. Raises ValueError when the file has fewer
    than four columns.
    """
    frame = pd.read_csv(file_path)
    n_cols = frame.shape[1]
    if n_cols < 4:
        raise ValueError(f"Expected >=4 columns, got {n_cols} in {file_path}")
    leading_four = frame.iloc[:, :4]
    return leading_four.to_numpy(dtype=float, copy=False)

def build_ensemble_reference(folder: str, stride: int) -> dict:
    """
    Pool a thinned sample of every CSV in `folder` into one sorted reference per variable.

    Every `stride`-th row of each file is kept, non-finite values are dropped,
    and the surviving values of all files are concatenated and sorted ascending.

    Returns dict: {0: ref_sorted_theta1, 1: ref_sorted_w1, 2: ref_sorted_theta2, 3: ref_sorted_w2}
    Raises FileNotFoundError when the folder contains no "*.csv" file.
    """
    csv_paths = sorted(glob.glob(os.path.join(folder, "*.csv")))
    if len(csv_paths) == 0:
        raise FileNotFoundError(f"No CSV files found in {folder}/")

    # One list of per-file chunks for each of the four columns.
    pooled = {col: [] for col in range(4)}
    for csv_path in csv_paths:
        thinned = read_csv_4cols(csv_path)[::stride, :]
        for col in range(4):
            column = thinned[:, col]
            finite_vals = column[np.isfinite(column)]
            if finite_vals.size > 0:
                pooled[col].append(finite_vals)

    ref_sorted = {}
    for col, chunks in pooled.items():
        if chunks:
            merged = np.concatenate(chunks)
        else:
            merged = np.array([], dtype=float)
        ref_sorted[col] = np.sort(merged[np.isfinite(merged)])

    return ref_sorted

def make_quantile_grid(ref_sorted: np.ndarray, q_grid_n: int) -> np.ndarray:
    """
    Build the x-space integration grid from `q_grid_n` evenly spaced
    quantile levels (0 to 1 inclusive) of the reference sample.

    An empty reference yields an empty grid. The result is forced to be
    non-decreasing so it is safe to integrate over.
    """
    if not ref_sorted.size:
        return np.array([], dtype=float)

    levels = np.linspace(0.0, 1.0, num=q_grid_n)
    grid = np.quantile(ref_sorted, levels, method="linear")
    # A running maximum removes any tiny decreases between neighbouring quantiles.
    return np.maximum.accumulate(grid)

def ecdf_at_grid(sample_sorted: np.ndarray, x_grid: np.ndarray) -> np.ndarray:
    """
    Right-continuous empirical CDF of an ascending-sorted sample, evaluated
    at each point of x_grid (fraction of sample values <= grid point).

    An empty sample gives an all-zero array shaped like x_grid.
    """
    count = sample_sorted.size
    if count > 0:
        n_at_or_below = np.searchsorted(sample_sorted, x_grid, side="right")
        return n_at_or_below / count
    return np.zeros_like(x_grid)

def avm_trapz(sample: np.ndarray, ref_sorted: np.ndarray, x_grid: np.ndarray) -> float:
    """
    AVM = ∫ |F_sample(x) - F_ref(x)| dx.

    The part of the integral inside [x_grid[0], x_grid[-1]] is approximated with
    the trapezoidal rule on x_grid. Sample mass lying outside the reference
    support is added exactly (see below), so a sample that sticks out of the
    reference range is no longer silently truncated.

    Parameters
    ----------
    sample     : values of the block being scored; non-finite entries are ignored.
    ref_sorted : ensemble reference, sorted ascending.
    x_grid     : non-decreasing integration grid (e.g. from make_quantile_grid).

    Returns
    -------
    float; NaN when the grid, the reference, or the finite part of the sample is empty.
    """
    if x_grid.size == 0 or ref_sorted.size == 0:
        return np.nan

    s = np.asarray(sample, dtype=float)
    s = np.sort(s[np.isfinite(s)])  # sorted copy; the caller's array is left untouched
    if s.size == 0:
        return np.nan

    # Right-continuous ECDFs on the grid (same rule as ecdf_at_grid).
    Fs = np.searchsorted(s, x_grid, side="right") / s.size
    Fr = np.searchsorted(ref_sorted, x_grid, side="right") / ref_sorted.size

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # use whichever the installed NumPy provides.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    total = float(integrate(np.abs(Fs - Fr), x_grid))

    # Tail corrections. Left of the reference minimum F_ref = 0, so
    # ∫ |F_s - F_ref| = ∫ F_s = mean(max(g_lo - s, 0)); right of the reference
    # maximum F_ref = 1, so the integrand is 1 - F_s and integrates to
    # mean(max(s - g_hi, 0)). Each term is applied only when the grid end
    # really covers the reference on that side, and is 0 when the sample
    # stays inside the grid.
    g_lo, g_hi = x_grid[0], x_grid[-1]
    if g_lo <= ref_sorted[0]:
        total += float(np.clip(g_lo - s, 0.0, None).mean())
    if g_hi >= ref_sorted[-1]:
        total += float(np.clip(s - g_hi, 0.0, None).mean())

    return total


In [3]:
# Cell 3 — per-file block sampling + run folder + write CSVs

def mean_block_avm_for_series(x: np.ndarray, ref_sorted: np.ndarray, x_grid: np.ndarray,
                              b: int, n_blocks: int, rng: np.random.Generator) -> float:
    """
    Average AVM over `n_blocks` randomly placed blocks of length `b`.

    Non-finite entries of x are dropped first, so a block is contiguous in
    the filtered series. Block start positions are drawn uniformly (with
    replacement) from `rng`. Returns NaN, without drawing from `rng`, when
    b <= 1 or fewer than b finite values are available; NaN block scores
    are ignored by the mean.
    """
    finite = x[np.isfinite(x)]
    length = finite.size
    if b <= 1 or length < b:
        return np.nan

    # Valid start offsets are 0 .. length - b inclusive.
    offsets = rng.integers(0, length - b + 1, size=n_blocks)
    scores = np.array(
        [avm_trapz(finite[start:start + b], ref_sorted, x_grid) for start in offsets],
        dtype=float,
    )
    return float(np.nanmean(scores))

def process_folder(folder: str):
    """
    Score every CSV in `folder` and write one results table for the folder.

    Steps: build the thinned ensemble reference and its quantile grid once,
    then for each file, each of the four variables and each block length in
    B_LIST compute the mean block AVM. The table (one row per file, columns
    file_name, W1_<b> .. W4_<b>) is written to OUTDIR/<folder>_results.csv
    after all files have been processed.

    Raises FileNotFoundError when the folder contains no "*.csv" file.
    """
    csv_paths = sorted(glob.glob(os.path.join(folder, "*.csv")))
    if len(csv_paths) == 0:
        raise FileNotFoundError(f"No CSV files found in {folder}/")

    # Reference and integration grids are shared by all files of the folder.
    print(f"[{folder}] Building ensemble reference with stride={STRIDE_S} ...")
    ref_sorted = build_ensemble_reference(folder, stride=STRIDE_S)
    x_grids = {v: make_quantile_grid(ref_sorted[v], Q_GRID_N) for v in range(4)}

    # Column order: file_name, then W1_* for every b, W2_* ..., W4_*.
    col_names = ["file_name"] + [f"W{i}_{b}" for i in range(1, 5) for b in B_LIST]

    n_files = len(csv_paths)
    records = []
    for position, csv_path in enumerate(csv_paths, start=1):
        base_name = os.path.basename(csv_path)
        if position % 10 == 0:
            print(f"[{folder}] Processing file {position}/{n_files}: {base_name}")

        data = read_csv_4cols(csv_path)
        record = {"file_name": base_name}

        # Variable-major, then block length: this order also fixes the order
        # in which the shared generator is consumed.
        for v in range(4):
            series = data[:, v]
            for b in B_LIST:
                record[f"W{v+1}_{b}"] = mean_block_avm_for_series(
                    x=series,
                    ref_sorted=ref_sorted[v],
                    x_grid=x_grids[v],
                    b=b,
                    n_blocks=N_BLOCKS_PER_B,
                    rng=rng,
                )

        records.append(record)

    out_path = OUTDIR / f"{folder}_results.csv"
    pd.DataFrame(records, columns=col_names).to_csv(out_path, index=False)
    print(f"[{folder}] Wrote: {out_path}")

# Driver: process each configured folder in turn. process_folder prints its
# progress and writes one results CSV per folder under OUTDIR.
for folder in FOLDERS:
    process_folder(folder)


[paths1] Building ensemble reference with stride=100 ...
[paths1] Processing file 10/1000: -0.022268896275924988-3.73193616057468--0.32064977268139616--8.012785629954287.csv
[paths1] Processing file 20/1000: -0.10308041856142358-0.008957013283678261-2.8342852915331997--4.010556567926857.csv
[paths1] Processing file 30/1000: -0.1415415725923368-1.2710069788687761-1.2540042176602633--5.249039073807581.csv
[paths1] Processing file 40/1000: -0.17787207980484965-2.7408234864908234-1.4383135537256466-3.526792122351696.csv
[paths1] Processing file 50/1000: -0.2005972134229177--2.447456763549089--2.3070685478486515-1.6154600372322605.csv
[paths1] Processing file 60/1000: -0.22922367685167666--1.5415818700770414--0.47935702634937183--4.097519506450378.csv
[paths1] Processing file 70/1000: -0.27094500328458304--0.728186821144921--0.11325565382848701--5.11855674507118.csv
[paths1] Processing file 80/1000: -0.3192326554422049-2.533436561925863--2.8793038811671603--0.4887602401603175.csv
[paths1] P

## Using the scipy Wasserstein function

In [5]:
# Cell 1 — imports + parameters
import os, glob
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance

# folders
# Input folders: each one is globbed for "*.csv" files by the functions below.
FOLDERS = ["paths1", "paths2"]
# Output directory for the per-folder results CSVs; created here if missing.
OUTDIR = Path("AVMs4a")
OUTDIR.mkdir(exist_ok=True)

# (d) settings
B_LIST = [50, 100, 200, 500, 1000, 2000]   # block lengths b (edit if you want)
STRIDE_S = 100                              # keep every STRIDE_S-th row of each file when pooling the ensemble reference
N_BLOCKS_PER_B = 10                         # how many random blocks per b per file
# Seed for the generator below.
SEED = 123

# One generator shared by every block-sampling call, so the drawn block
# positions depend on the order in which folders/files/variables are processed.
rng = np.random.default_rng(SEED)

# Cell 2 — I/O + ensemble reference construction (thinned) + Wasserstein distance

def read_csv_4cols(file_path: str) -> np.ndarray:
    """
    Load a CSV and return its first four columns as a float array of shape (n, 4).

    The columns are taken positionally and are expected to be
    theta1, w1, theta2, w2. Raises ValueError when the file has fewer
    than four columns.
    """
    frame = pd.read_csv(file_path)
    n_cols = frame.shape[1]
    if n_cols < 4:
        raise ValueError(f"Expected >=4 columns, got {n_cols} in {file_path}")
    leading_four = frame.iloc[:, :4]
    return leading_four.to_numpy(dtype=float, copy=False)

def build_ensemble_reference(folder: str, stride: int) -> dict:
    """
    Pool a thinned sample of every CSV in `folder` into one reference array per variable.

    Every `stride`-th row of each file is kept, non-finite values are dropped,
    and the surviving values of all files are concatenated in file order
    (the arrays are not sorted).

    Returns dict: {0: ref_theta1, 1: ref_w1, 2: ref_theta2, 3: ref_w2}
    Raises FileNotFoundError when the folder contains no "*.csv" file.
    """
    csv_paths = sorted(glob.glob(os.path.join(folder, "*.csv")))
    if len(csv_paths) == 0:
        raise FileNotFoundError(f"No CSV files found in {folder}/")

    # One list of per-file chunks for each of the four columns.
    pooled = {col: [] for col in range(4)}
    for csv_path in csv_paths:
        thinned = read_csv_4cols(csv_path)[::stride, :]
        for col in range(4):
            column = thinned[:, col]
            finite_vals = column[np.isfinite(column)]
            if finite_vals.size > 0:
                pooled[col].append(finite_vals)

    ref = {}
    for col, chunks in pooled.items():
        if chunks:
            merged = np.concatenate(chunks)
        else:
            merged = np.array([], dtype=float)
        ref[col] = merged[np.isfinite(merged)]
    return ref

def avm_wasserstein(sample: np.ndarray, ref: np.ndarray) -> float:
    """
    1D Wasserstein distance between the finite values of `sample` and of the
    ensemble reference `ref`, computed by scipy.stats.wasserstein_distance
    with its default (equal) weights.

    Returns NaN when either side has no finite value.
    """
    if not ref.size:
        return np.nan

    finite_sample = sample[np.isfinite(sample)]
    finite_ref = ref[np.isfinite(ref)]
    if min(finite_sample.size, finite_ref.size) == 0:
        return np.nan

    return float(wasserstein_distance(finite_sample, finite_ref))

# Cell 3 — per-file block sampling + run folder + write CSVs

def mean_block_avm_for_series(x: np.ndarray, ref: np.ndarray,
                              b: int, n_blocks: int, rng: np.random.Generator) -> float:
    """
    Average Wasserstein distance over `n_blocks` randomly placed blocks of length `b`.

    Non-finite entries of x are dropped first, so a block is contiguous in
    the filtered series. Block start positions are drawn uniformly (with
    replacement) from `rng`. Returns NaN, without drawing from `rng`, when
    b <= 1 or fewer than b finite values are available; NaN block scores
    are ignored by the mean.
    """
    finite = x[np.isfinite(x)]
    length = finite.size
    if b <= 1 or length < b:
        return np.nan

    # Valid start offsets are 0 .. length - b inclusive.
    offsets = rng.integers(0, length - b + 1, size=n_blocks)
    scores = np.array(
        [avm_wasserstein(finite[start:start + b], ref) for start in offsets],
        dtype=float,
    )
    return float(np.nanmean(scores))

def process_folder(folder: str):
    """
    Score every CSV in `folder` and stream the results to OUTDIR/<folder>_results.csv.

    The thinned ensemble reference is built once. For each file, each of the
    four variables and each block length in B_LIST, the mean block Wasserstein
    distance is computed. Any previous results file is deleted first; rows are
    then appended in chunks of 10 files (header written with the first chunk
    only), so an interrupted run keeps the rows already flushed.

    Raises FileNotFoundError when the folder contains no "*.csv" file.
    """
    csv_paths = sorted(glob.glob(os.path.join(folder, "*.csv")))
    if len(csv_paths) == 0:
        raise FileNotFoundError(f"No CSV files found in {folder}/")

    # Reference is shared by all files of the folder.
    print(f"[{folder}] Building ensemble reference with stride={STRIDE_S} ...")
    ref = build_ensemble_reference(folder, stride=STRIDE_S)

    # Column order: file_name, then W1_* for every b, W2_* ..., W4_*.
    col_names = ["file_name"] + [f"W{i}_{b}" for i in range(1, 5) for b in B_LIST]

    # Start from a clean file because every write below is an append.
    out_path = OUTDIR / f"{folder}_results.csv"
    if out_path.exists():
        out_path.unlink()

    def _append_rows(records, with_header):
        # Append one chunk of rows to the results file.
        pd.DataFrame(records, columns=col_names).to_csv(
            out_path, mode="a", header=with_header, index=False
        )

    pending = []
    header_pending = True
    n_files = len(csv_paths)

    for position, csv_path in enumerate(csv_paths, start=1):
        base_name = os.path.basename(csv_path)
        if position % 10 == 0:
            print(f"[{folder}] Processing file {position}/{n_files}: {base_name}")

        data = read_csv_4cols(csv_path)
        record = {"file_name": base_name}

        # Variable-major, then block length: this order also fixes the order
        # in which the shared generator is consumed.
        for v in range(4):
            series = data[:, v]
            for b in B_LIST:
                record[f"W{v+1}_{b}"] = mean_block_avm_for_series(
                    x=series,
                    ref=ref[v],
                    b=b,
                    n_blocks=N_BLOCKS_PER_B,
                    rng=rng,
                )

        pending.append(record)

        # Periodic flush every 10 rows.
        if len(pending) >= 10:
            _append_rows(pending, header_pending)
            header_pending = False
            pending = []

    # Flush whatever is left after the last full chunk.
    if pending:
        _append_rows(pending, header_pending)

    print(f"[{folder}] Wrote: {out_path}")

# Driver: process each configured folder in turn. process_folder prints its
# progress and appends to one results CSV per folder under OUTDIR.
for folder in FOLDERS:
    process_folder(folder)


[paths1] Building ensemble reference with stride=100 ...
[paths1] Processing file 10/1000: -0.022268896275924988-3.73193616057468--0.32064977268139616--8.012785629954287.csv
[paths1] Processing file 20/1000: -0.10308041856142358-0.008957013283678261-2.8342852915331997--4.010556567926857.csv
[paths1] Processing file 30/1000: -0.1415415725923368-1.2710069788687761-1.2540042176602633--5.249039073807581.csv
[paths1] Processing file 40/1000: -0.17787207980484965-2.7408234864908234-1.4383135537256466-3.526792122351696.csv
[paths1] Processing file 50/1000: -0.2005972134229177--2.447456763549089--2.3070685478486515-1.6154600372322605.csv
[paths1] Processing file 60/1000: -0.22922367685167666--1.5415818700770414--0.47935702634937183--4.097519506450378.csv
[paths1] Processing file 70/1000: -0.27094500328458304--0.728186821144921--0.11325565382848701--5.11855674507118.csv
[paths1] Processing file 80/1000: -0.3192326554422049-2.533436561925863--2.8793038811671603--0.4887602401603175.csv
[paths1] P

KeyboardInterrupt: 

## Corrected Approach for the $b$-thinning

In [3]:
# Cell 1 — imports + parameters
import os, glob
from pathlib import Path
import numpy as np
import pandas as pd

# folders
# Input folders: each one is globbed for "*.csv" files by the functions below.
FOLDERS = ["paths1", "paths2"]
# Output directory for the per-folder results CSVs; created here if missing.
OUTDIR = Path("AVMs4b")
OUTDIR.mkdir(exist_ok=True)

# stride-b settings (b plays the role of m)
# Each series is thinned to X_0, X_b, X_2b, ... for every b in this list.
B_LIST = [50, 100, 200, 500, 1000, 2000]

# ensemble proxy settings
# Keep every STRIDE_S-th row of each file when pooling the ensemble reference.
STRIDE_S = 100

# trapezoid grid resolution (accuracy vs speed)
GRID_SIZE = 5000   # raise if you want better accuracy, lower if slow

# NOTE(review): none of the stride-b cells shown in this section draws from
# `rng`; it appears to be kept only for parity with the earlier sections.
SEED = 123
rng = np.random.default_rng(SEED)


In [4]:
# Cell 2 — I/O + ensemble reference construction (thinned)

def read_csv_4cols(file_path: str) -> np.ndarray:
    """
    Load a CSV and return its first four columns as a float array of shape (n, 4).

    The columns are taken positionally and are expected to be
    theta1, w1, theta2, w2. Raises ValueError when the file has fewer
    than four columns.
    """
    frame = pd.read_csv(file_path)
    n_cols = frame.shape[1]
    if n_cols < 4:
        raise ValueError(f"Expected >=4 columns, got {n_cols} in {file_path}")
    leading_four = frame.iloc[:, :4]
    return leading_four.to_numpy(dtype=float, copy=False)

def build_ensemble_reference(folder: str, stride: int) -> dict:
    """
    Pool a thinned sample of every CSV in `folder` into one reference array per variable.

    Every `stride`-th row of each file is kept, non-finite values are dropped,
    and the surviving values of all files are concatenated in file order
    (the arrays are not sorted).

    Returns dict: {0: ref_theta1, 1: ref_w1, 2: ref_theta2, 3: ref_w2}
    Raises FileNotFoundError when the folder contains no "*.csv" file.
    """
    csv_paths = sorted(glob.glob(os.path.join(folder, "*.csv")))
    if len(csv_paths) == 0:
        raise FileNotFoundError(f"No CSV files found in {folder}/")

    # One list of per-file chunks for each of the four columns.
    pooled = {col: [] for col in range(4)}
    for csv_path in csv_paths:
        thinned = read_csv_4cols(csv_path)[::stride, :]  # thinning for ensemble proxy
        for col in range(4):
            column = thinned[:, col]
            finite_vals = column[np.isfinite(column)]
            if finite_vals.size > 0:
                pooled[col].append(finite_vals)

    ref = {}
    for col, chunks in pooled.items():
        if chunks:
            merged = np.concatenate(chunks)
        else:
            merged = np.array([], dtype=float)
        ref[col] = merged[np.isfinite(merged)]
    return ref


In [5]:
# Cell 3 — trapezoidal W1 approximation on a grid

def ecdf_on_grid(sorted_sample: np.ndarray, grid: np.ndarray) -> np.ndarray:
    """
    Right-continuous empirical CDF evaluated at the grid points
    (fraction of sample values <= grid point).

    sorted_sample must be 1D and sorted ascending. An empty sample gives
    a float array of NaN shaped like grid.
    """
    count = sorted_sample.size
    if count > 0:
        n_at_or_below = np.searchsorted(sorted_sample, grid, side="right")
        return n_at_or_below / count
    return np.full_like(grid, np.nan, dtype=float)

def w1_trapz(sample: np.ndarray, ref: np.ndarray, grid_size: "int | None" = None) -> float:
    """
    Approximate W1(sample, ref) = ∫ |F_s(x) - F_r(x)| dx.

    Inside the reference support the integral is evaluated with the
    trapezoidal rule on a grid of `grid_size` reference quantiles (this keeps
    the grid small even if ref is huge). Outside the reference support the
    integrand depends on the sample only, so those two tail pieces are added
    in closed form instead of stretching the outermost trapezoid across them.

    Parameters
    ----------
    sample    : values to score; non-finite entries are ignored.
    ref       : ensemble reference; non-finite entries are ignored.
    grid_size : number of quantile grid points (>= 2). None (default) means
                "use the module-level GRID_SIZE", looked up at call time so
                that editing the config cell takes effect without re-running
                this cell.

    Returns
    -------
    float; NaN when either side has no finite value, 0.0 when all finite
    values of both arrays are equal.

    Raises
    ------
    ValueError if grid_size < 2.
    """
    if grid_size is None:
        grid_size = GRID_SIZE
    if grid_size < 2:
        raise ValueError(f"grid_size must be >= 2, got {grid_size}")

    s = np.asarray(sample, dtype=float)
    r = np.asarray(ref, dtype=float)
    s = s[np.isfinite(s)]
    r = r[np.isfinite(r)]
    if s.size == 0 or r.size == 0:
        return np.nan

    s_sorted = np.sort(s)
    r_sorted = np.sort(r)
    r_lo, r_hi = r_sorted[0], r_sorted[-1]

    # Everything sits on one point: the distance is exactly zero.
    if min(s_sorted[0], r_lo) == max(s_sorted[-1], r_hi):
        return 0.0

    # Grid: reference quantiles, with the ends pinned to the reference min/max
    # so the tail terms below start exactly where the grid stops.
    probs = np.linspace(0.0, 1.0, grid_size)
    grid = np.quantile(r_sorted, probs)
    grid[0] = r_lo
    grid[-1] = r_hi

    # Right-continuous ECDFs on the grid (same rule as ecdf_on_grid).
    Fs = np.searchsorted(s_sorted, grid, side="right") / s_sorted.size
    Fr = np.searchsorted(r_sorted, grid, side="right") / r_sorted.size

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # use whichever the installed NumPy provides.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    interior = float(integrate(np.abs(Fs - Fr), grid))

    # Left of r_lo F_r = 0, so the integrand is F_s and integrates to
    # mean(max(r_lo - s, 0)); right of r_hi F_r = 1, so the integrand is
    # 1 - F_s and integrates to mean(max(s - r_hi, 0)). Both are 0 when the
    # sample stays inside the reference range.
    left_tail = float(np.clip(r_lo - s_sorted, 0.0, None).mean())
    right_tail = float(np.clip(s_sorted - r_hi, 0.0, None).mean())

    return interior + left_tail + right_tail


In [6]:
# Cell 4 — stride-b thinning statistic (UNSCALED) + run folder + write CSVs

def stride_thinned_series(x: np.ndarray, b: int) -> np.ndarray:
    """
    Keep every b-th finite value: X_0, X_b, X_{2b}, ..., X_{qb} with
    q = floor((n-1)/b), where n counts the finite entries of x.

    Non-finite entries are removed before striding. An empty array is
    returned when b <= 0, when no finite value exists, or when q < 1
    (i.e. the thinned series would hold X_0 only).
    """
    finite = x[np.isfinite(x)]
    count = finite.size
    nothing = np.array([], dtype=float)

    if count == 0 or b <= 0:
        return nothing

    q = (count - 1) // b
    if q >= 1:
        # Stop one past index q*b so that X_{qb} itself is included.
        return finite[: q * b + 1 : b]
    return nothing

def avm_stride_unscaled_for_series(x: np.ndarray, ref: np.ndarray, b: int) -> float:
    """
    UN-SCALED trapezoid approximation of the Wasserstein-1 distance between
    the stride-b thinned series and the reference:
    W1(mu_{X_0, X_b, ..., X_{qb}}, mu_ref).

    Returns NaN when the thinned series is empty.
    """
    thinned = stride_thinned_series(x, b=b)
    return w1_trapz(thinned, ref) if thinned.size else np.nan

def process_folder(folder: str):
    """
    Score every CSV in `folder` and stream the results to OUTDIR/<folder>_results.csv.

    The thinned ensemble reference is built once. For each file, each of the
    four variables and each stride b in B_LIST, the unscaled stride-thinned
    W1 statistic is computed. Any previous results file is deleted first; rows
    are then appended in chunks of 10 files (header written with the first
    chunk only), so an interrupted run keeps the rows already flushed.

    Raises FileNotFoundError when the folder contains no "*.csv" file.
    """
    csv_paths = sorted(glob.glob(os.path.join(folder, "*.csv")))
    if len(csv_paths) == 0:
        raise FileNotFoundError(f"No CSV files found in {folder}/")

    # Reference is shared by all files of the folder.
    print(f"[{folder}] Building ensemble reference with stride={STRIDE_S} ...")
    ref = build_ensemble_reference(folder, stride=STRIDE_S)

    # Column order: file_name, then W1_* for every b, W2_* ..., W4_*.
    col_names = ["file_name"] + [f"W{i}_{b}" for i in range(1, 5) for b in B_LIST]

    # Start from a clean file because every write below is an append.
    out_path = OUTDIR / f"{folder}_results.csv"
    if out_path.exists():
        out_path.unlink()

    def _append_rows(records, with_header):
        # Append one chunk of rows to the results file.
        pd.DataFrame(records, columns=col_names).to_csv(
            out_path, mode="a", header=with_header, index=False
        )

    pending = []
    header_pending = True
    n_files = len(csv_paths)

    for position, csv_path in enumerate(csv_paths, start=1):
        base_name = os.path.basename(csv_path)
        if position % 10 == 0:
            print(f"[{folder}] Processing file {position}/{n_files}: {base_name}")

        data = read_csv_4cols(csv_path)
        record = {"file_name": base_name}

        for v in range(4):
            series = data[:, v]
            for b in B_LIST:
                record[f"W{v+1}_{b}"] = avm_stride_unscaled_for_series(x=series, ref=ref[v], b=b)

        pending.append(record)

        # Flush every 10 rows.
        if len(pending) >= 10:
            _append_rows(pending, header_pending)
            header_pending = False
            pending = []

    # Flush whatever is left after the last full chunk.
    if pending:
        _append_rows(pending, header_pending)

    print(f"[{folder}] Wrote: {out_path}")

# Driver: process each configured folder in turn. process_folder prints its
# progress and appends to one results CSV per folder under OUTDIR.
for folder in FOLDERS:
    process_folder(folder)


[paths1] Building ensemble reference with stride=100 ...
[paths1] Processing file 10/1000: -0.022268896275924988-3.73193616057468--0.32064977268139616--8.012785629954287.csv
[paths1] Processing file 20/1000: -0.10308041856142358-0.008957013283678261-2.8342852915331997--4.010556567926857.csv
[paths1] Processing file 30/1000: -0.1415415725923368-1.2710069788687761-1.2540042176602633--5.249039073807581.csv
[paths1] Processing file 40/1000: -0.17787207980484965-2.7408234864908234-1.4383135537256466-3.526792122351696.csv
[paths1] Processing file 50/1000: -0.2005972134229177--2.447456763549089--2.3070685478486515-1.6154600372322605.csv
[paths1] Processing file 60/1000: -0.22922367685167666--1.5415818700770414--0.47935702634937183--4.097519506450378.csv
[paths1] Processing file 70/1000: -0.27094500328458304--0.728186821144921--0.11325565382848701--5.11855674507118.csv
[paths1] Processing file 80/1000: -0.3192326554422049-2.533436561925863--2.8793038811671603--0.4887602401603175.csv
[paths1] P