In [2]:
import os
import glob
import numpy as np
import pandas as pd


def _read_4cols_csv(path):
    """
    Read first 4 columns as numeric, drop non-numeric rows.
    Returns: (arr, n) where arr is shape (n,4) float64.
    """
    df = pd.read_csv(path, header=None)
    if df.shape[1] < 4:
        return np.empty((0, 4), dtype=np.float64), 0

    sub = df.iloc[:, :4].apply(pd.to_numeric, errors="coerce").dropna()
    arr = sub.to_numpy(dtype=np.float64)
    return arr, arr.shape[0]


def _reservoir_sample_4cols(files, sample_size=2_000_000, seed=0):
    """
    One-pass reservoir sampling for each of 4 columns independently.
    Returns a list of 4 sorted samples [s0,s1,s2,s3].
    """
    rng = np.random.default_rng(seed)

    k = int(sample_size)
    res = [np.empty(k, dtype=np.float64) for _ in range(4)]
    filled = [0, 0, 0, 0]
    seen = [0, 0, 0, 0]

    for fp in files:
        X, n = _read_4cols_csv(fp)
        if n == 0:
            continue

        # Process each column independently (same rows, separate reservoirs)
        for c in range(4):
            x = X[:, c]
            m = x.size
            if m == 0:
                continue

            # Fill reservoir initially
            if filled[c] < k:
                take = min(k - filled[c], m)
                res[c][filled[c]:filled[c] + take] = x[:take]
                filled[c] += take
                seen[c] += take

                x = x[take:]
                m = x.size
                if m == 0:
                    continue

            # Exact reservoir replacement for remaining stream items
            # For t = 1..m, j = seen + t, draw r ~ Unif{0,...,j-1}
            t = np.arange(1, m + 1, dtype=np.int64)
            r = (rng.random(m) * (seen[c] + t)).astype(np.int64)
            mask = r < k
            if np.any(mask):
                res[c][r[mask]] = x[mask]
            seen[c] += m

    out = []
    for c in range(4):
        if filled[c] == 0:
            raise RuntimeError("No numeric data found across the provided files.")
        out.append(np.sort(res[c][:filled[c]]))
    return out


def _ecdf_from_sorted_sample(sorted_sample):
    """
    Empirical CDF Fhat(x) from sorted sample.
    Mid-rank: F(x) = (rank_left + rank_right)/(2N).
    """
    ss = sorted_sample
    N = ss.size

    def Fhat(x):
        x = np.asarray(x, dtype=np.float64)
        left = np.searchsorted(ss, x, side="left")
        right = np.searchsorted(ss, x, side="right")
        return (left + right) / (2.0 * N)

    return Fhat


def _uavm_integral(u):
    """
    Exact computation of ∫_0^1 |F_n(t) - t| dt for empirical CDF of u in [0,1].
    """
    u = np.clip(np.asarray(u, dtype=np.float64), 0.0, 1.0)
    n = u.size
    if n == 0:
        return np.nan

    us = np.sort(u)
    a = np.concatenate(([0.0], us))
    b = np.concatenate((us, [1.0]))
    k = np.arange(0, n + 1, dtype=np.float64)
    c = k / n  # constant value of F_n on each interval

    mask_left = c <= a
    mask_right = c >= b
    mask_mid = ~(mask_left | mask_right)

    I = np.zeros(n + 1, dtype=np.float64)

    if np.any(mask_left):
        aa = a[mask_left] - c[mask_left]
        bb = b[mask_left] - c[mask_left]
        I[mask_left] = (bb * bb - aa * aa) / 2.0

    if np.any(mask_right):
        aa = c[mask_right] - a[mask_right]
        bb = c[mask_right] - b[mask_right]
        I[mask_right] = (aa * aa - bb * bb) / 2.0

    if np.any(mask_mid):
        aa = c[mask_mid] - a[mask_mid]
        bb = b[mask_mid] - c[mask_mid]
        I[mask_mid] = (aa * aa + bb * bb) / 2.0

    return float(I.sum())


def compute_uavms_for_folder(folder, out_csv, pooled_sample_size=2_000_000, seed=0):
    """
    Score every CSV trajectory in `folder` against the pooled ensemble.

    For each of the four columns (theta1, w1, theta2, w2) a pooled empirical
    CDF is built from a reservoir sample over all files. Each trajectory is
    then mapped through that CDF, u = Fhat_pooled_col(x), and scored with
    UAVM = ∫_0^1 |F_i(u)-u| du.

    Writes `out_csv` with columns:
      file_name, n, uavm_theta1, uavm_w1, uavm_theta2, uavm_w2
    and returns the same table as a DataFrame. Files without numeric rows
    get n = 0 and NaN scores.

    Raises FileNotFoundError when `folder` contains no *.csv file.
    """
    pattern = os.path.join(folder, "*.csv")
    files = sorted(glob.glob(pattern))
    if len(files) == 0:
        raise FileNotFoundError(f"No CSV files found in '{folder}'.")
    total = len(files)

    # Pass 1 over the files: one pooled reference CDF per column.
    pooled_columns = _reservoir_sample_4cols(
        files, sample_size=pooled_sample_size, seed=seed
    )
    pooled_cdfs = [_ecdf_from_sorted_sample(column) for column in pooled_columns]

    # Pass 2 over the files: transform each trajectory and score it.
    records = []
    for idx, path in enumerate(files):
        name = os.path.basename(path)
        data, n_rows = _read_4cols_csv(path)
        if n_rows == 0:
            records.append([name, 0, np.nan, np.nan, np.nan, np.nan])
            continue

        scores = [
            float(_uavm_integral(pooled_cdfs[col](data[:, col])))
            for col in range(4)
        ]
        records.append([name, int(n_rows), *scores])

        # Progress line for every fifth file index that produced scores.
        if idx % 5 == 0:
            print(f"Done {idx}/{total}")

    header = ["file_name", "n", "uavm_theta1", "uavm_w1", "uavm_theta2", "uavm_w2"]
    df_out = pd.DataFrame(records, columns=header)
    df_out.to_csv(out_csv, index=False)
    return df_out


# ===== Run for your two systems =====
# One entry per ensemble: (input folder, output file name, sampling seed).
output_dir = "UAVMs"
os.makedirs(output_dir, exist_ok=True)

saved_paths = []
for system_folder, csv_name, system_seed in (
    ("paths1", "paths1_uavms.csv", 0),
    ("paths2", "paths2_uavms.csv", 1),
):
    target_csv = os.path.join(output_dir, csv_name)
    compute_uavms_for_folder(
        folder=system_folder,
        out_csv=target_csv,
        pooled_sample_size=2_000_000,
        seed=system_seed,
    )
    saved_paths.append(target_csv)

print("Saved:")
for target_csv in saved_paths:
    print(target_csv)


Done 0/1000
Done 5/1000
Done 10/1000
Done 15/1000
Done 20/1000
Done 25/1000
Done 30/1000
Done 35/1000
Done 40/1000
Done 45/1000
Done 50/1000
Done 55/1000
Done 60/1000
Done 65/1000
Done 70/1000
Done 75/1000
Done 80/1000
Done 85/1000
Done 90/1000
Done 95/1000
Done 100/1000
Done 105/1000
Done 110/1000
Done 115/1000
Done 120/1000
Done 125/1000
Done 130/1000
Done 135/1000
Done 140/1000
Done 145/1000
Done 150/1000
Done 155/1000
Done 160/1000
Done 165/1000
Done 170/1000
Done 175/1000
Done 180/1000
Done 185/1000
Done 190/1000
Done 195/1000
Done 200/1000
Done 205/1000
Done 210/1000
Done 215/1000
Done 220/1000
Done 225/1000
Done 230/1000
Done 235/1000
Done 240/1000
Done 245/1000
Done 250/1000
Done 255/1000
Done 260/1000
Done 265/1000
Done 270/1000
Done 275/1000
Done 280/1000
Done 285/1000
Done 290/1000
Done 295/1000
Done 300/1000
Done 305/1000
Done 310/1000
Done 315/1000
Done 320/1000
Done 325/1000
Done 330/1000
Done 335/1000
Done 340/1000
Done 345/1000
Done 350/1000
Done 355/1000
Done 360/1000
