In [None]:
print("yay")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
import pymc as pm
import arviz as az
from scipy.stats import gaussian_kde, norm
import ipywidgets as widgets
from IPython.display import display, clear_output
import scipy.stats as st
from scipy.stats import gaussian_kde
from typing import Sequence, Tuple, Dict, Optional, Callable

print("Running with PyMC version:", pm.__version__)


In [1]:
# ---------------------------------------------------------------------------
# Input files
# ---------------------------------------------------------------------------
# One evaluation CSV per (architecture, imagery source) combination. All nine
# paths share the same root and differ only in their tail.
_RESULTS_ROOT = r"N:\isipd\projects\p_planetdw\data\methods_test\results"

# U-Net, Swin and Terramind trained on aerial imagery (AE)
UNET_AE_METRICS = _RESULTS_ROOT + r"\UNET\AE\evaluation_unet.csv"
SWIN_AE_METRICS = _RESULTS_ROOT + r"\SWIN\AE\evaluation_swin.csv"
TERRAMIND_AE_METRICS = _RESULTS_ROOT + r"\Terramind\AE\evaluation_tm.csv"

# the same three architectures on PlanetScope (PS)
UNET_PS_METRICS = _RESULTS_ROOT + r"\UNET\PS\evaluation_unet.csv"
SWIN_PS_METRICS = _RESULTS_ROOT + r"\SWIN\PS\evaluation_swin.csv"
TERRAMIND_PS_METRICS = _RESULTS_ROOT + r"\Terramind\PS\evaluation_tm.csv"

# ... and on Sentinel-2 (S2)
UNET_S2_METRICS = _RESULTS_ROOT + r"\UNET\S2\evaluation_unet.csv"
SWIN_S2_METRICS = _RESULTS_ROOT + r"\SWIN\S2\evaluation_swin.csv"
TERRAMIND_S2_METRICS = _RESULTS_ROOT + r"\Terramind\S2\evaluation_tm.csv"

# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------
# CSV columns that are analysed
metrics = [
    'IoU',
    'dice_coef',
    'normalized_surface_distance',
    'mean_epistemic_uncertainty',
    'mean_aleatoric_uncertainty',
]

# metrics for which a larger value is the better one; every other metric is
# ranked as "lower is better"
maximize_metrics = {'IoU', 'dice_coef'}

# metrics treated as bounded in (0, 1) and therefore modelled on the logit scale
bounded_01_metrics = {
    'IoU',
    'dice_coef',
    'normalized_surface_distance',
    'mean_epistemic_uncertainty',
    'mean_aleatoric_uncertainty',
}

# ---------------------------------------------------------------------------
# Priors for the factorial effects (on the transformed scale)
# ---------------------------------------------------------------------------
# - logit scale: Normal(0, 0.5) is a reasonable weakly-informative prior
# - log scale  : Normal(0, 0.2) corresponds to ~ +/- 22% multiplicative change (1 SD)
prior_sd_logit = 0.5
prior_sd_log = 0.2

# When True, the data-preparation cell relabels the last three rows as "S2".
TEST = True

# ---------------------------------------------------------------------------
# Sampler settings
# ---------------------------------------------------------------------------
DRAWS = 2000
TUNE = 2000
CHAINS = 4
TARGET_ACCEPT = 0.9
HDI_PROB = 0.95  # 95% is used as the credible-interval standard
RANDOM_SEED = 1701

# bandwidth handed to the KDE used for the Savage–Dickey density ratio (tunable)
SD_KDE_BW = 0.3

# Explicit utility weights for the overall multi-metric ranking.
# None means every metric gets the same weight.
metric_weights: Optional[Dict[str, float]] = None

# ---------------------------------------------------------------------------
# Output folders (optional; keep yours)
# ---------------------------------------------------------------------------
_FIGS_ROOT = r"N:\isipd\projects\p_planetdw\git\DriftwoodMappingBenchmark\figs\stats"

UNET_AE_OUTPUT_DIR = _FIGS_ROOT + r"\UNET\AE"
SWIN_AE_OUTPUT_DIR = _FIGS_ROOT + r"\SWIN\AE"
TERRAMIND_AE_OUTPUT_DIR = _FIGS_ROOT + r"\TERRAMIND\AE"

UNET_PS_OUTPUT_DIR = _FIGS_ROOT + r"\UNET\PS"
SWIN_PS_OUTPUT_DIR = _FIGS_ROOT + r"\SWIN\PS"
TERRAMIND_PS_OUTPUT_DIR = _FIGS_ROOT + r"\TERRAMIND\PS"

UNET_S2_OUTPUT_DIR = _FIGS_ROOT + r"\UNET\S2"
SWIN_S2_OUTPUT_DIR = _FIGS_ROOT + r"\SWIN\S2"
TERRAMIND_S2_OUTPUT_DIR = _FIGS_ROOT + r"\TERRAMIND\S2"

# tail probabilities on either side of the central HDI_PROB interval
hdi_lower = (1.0 - HDI_PROB) / 2.0
hdi_upper = 1.0 - hdi_lower

In [2]:
# Attach labels to every CSV: (path, group label, dataset, architecture).
# Paths are listed per dataset in the fixed order U-Net, Swin, Terramind.
_CSV_BY_DATASET = {
    "AE": (UNET_AE_METRICS, SWIN_AE_METRICS, TERRAMIND_AE_METRICS),
    "PS": (UNET_PS_METRICS, SWIN_PS_METRICS, TERRAMIND_PS_METRICS),
    "S2": (UNET_S2_METRICS, SWIN_S2_METRICS, TERRAMIND_S2_METRICS),
}

# (name used in the group label, name stored in the "arch" column)
_ARCH_NAMES = (
    ("U-Net", "U-Net"),
    ("Swin U-Net", "Swin"),
    ("Terramind", "Terramind"),
)

CSV_SPECS = [
    (csv_path, f"{label_name} | {dataset}", dataset, arch)
    for dataset, csv_paths in _CSV_BY_DATASET.items()
    for csv_path, (label_name, arch) in zip(csv_paths, _ARCH_NAMES)
]

def load_checkpoint_metrics(csv_specs, metrics):
    """Collect the requested metric columns from every available CSV.

    Parameters:
      csv_specs: iterable of ``(path, group, dataset, arch)`` tuples.
      metrics:   metric column names like "IoU", "dice_coef", etc.
                 (no val_ prefixes). Columns a CSV does not contain are
                 skipped for that file and reported.

    Returns:
      One DataFrame with the metric columns found plus the label columns
      "group", "dataset" and "arch", rows concatenated in ``csv_specs``
      order with a fresh integer index. If no CSV could be read, an empty
      DataFrame with the requested metric columns and the label columns is
      returned instead of letting ``pd.concat`` fail on an empty list.
    """
    label_cols = ["group", "dataset", "arch"]
    frames = []
    for path, group, dataset, arch in csv_specs:
        if not os.path.exists(path):
            print(f"Warning: path {path} does not exist, skipping.")
            continue
        df = pd.read_csv(path)
        needed = [m for m in metrics if m in df.columns]
        absent = [m for m in metrics if m not in df.columns]
        if absent:
            # make silently dropped columns visible to the reader
            print(f"Warning: {path} has no column(s) {absent}, skipping them for this file.")
        subset = df[needed].copy()
        subset["group"] = group
        subset["dataset"] = dataset
        subset["arch"] = arch
        frames.append(subset)

    if not frames:
        print("Warning: no metrics CSV could be loaded, returning an empty table.")
        return pd.DataFrame(columns=[*metrics, *label_cols])
    return pd.concat(frames, ignore_index=True)

# prepare
# Group order used for the ordered categorical; taken from CSV_SPECS so the
# labels cannot drift apart from the ones attached while loading.
group_order = [group for _, group, _, _ in CSV_SPECS]
all_data = load_checkpoint_metrics(CSV_SPECS, metrics)
all_data["group"] = pd.Categorical(all_data["group"], categories=group_order, ordered=True)

# TEST mode: for the last 3 entries in all_data, overwrite dataset with "S2",
# regardless of the entry. Selecting through the existing index (instead of
# computed integer labels) means a table with fewer than 3 rows is relabelled
# as far as it goes and never grows new rows.
if TEST:
    all_data.loc[all_data.index[-3:], "dataset"] = "S2"

display(all_data)





NameError: name 'os' is not defined

In [None]:
# We model bounded metrics (IoU, F1, sensitivity, ...) on logit scale.
# We model positive metrics (loss, Hausdorff, ...) on log scale.
# This avoids Normal likelihood pathologies at boundaries and ensures positivity.

def base_metric_name(metric_col: str) -> str:
    """Return ``metric_col`` without a leading ``val_`` prefix, if it has one."""
    prefix = "val_"
    if metric_col.startswith(prefix):
        return metric_col[len(prefix):]
    return metric_col

def is_bounded_01(metric_col: str) -> bool:
    """Tell whether the metric (ignoring a ``val_`` prefix) is listed in ``bounded_01_metrics``."""
    canonical = base_metric_name(metric_col)
    return canonical in bounded_01_metrics

def higher_is_better(metric_col: str) -> bool:
    """Tell whether the metric (ignoring a ``val_`` prefix) is listed in ``maximize_metrics``."""
    return base_metric_name(metric_col) in maximize_metrics

def transform_y(y: np.ndarray, metric_col: str, eps: float = 1e-6) -> Tuple[np.ndarray, Callable[[np.ndarray], np.ndarray], str]:
    """
    Map a metric onto an unbounded scale.

    Metrics flagged by ``is_bounded_01`` are clipped to [eps, 1 - eps] and
    logit-transformed; every other metric is floored at eps and
    log-transformed.

    Returns:
      y_transformed,
      inverse_transform (maps transformed values back to the original scale),
      transform_name ("logit" or "log")
    """
    def _inverse_logit(z):
        return 1 / (1 + np.exp(-z))

    def _inverse_log(z):
        return np.exp(z)

    if not is_bounded_01(metric_col):
        floored = np.clip(y, eps, None)
        return np.log(floored), _inverse_log, "log"

    interior = np.clip(y, eps, 1 - eps)
    log_odds = np.log(interior / (1 - interior))
    return log_odds, _inverse_logit, "logit"


In [None]:
def rank_probabilities_from_draws(draws: np.ndarray, group_names, higher_better: bool) -> pd.DataFrame:
    """
    Turn posterior draws into per-group rank probabilities.

    draws: (S, G) posterior draws of each group's performance (ORIGINAL scale preferred)
    group_names: G labels, in the column order of ``draws``
    higher_better: True if larger = better, else smaller = better

    Returns a table sorted by expected rank with one row per group and the
    columns "group", "Pr(rank=1)" ... "Pr(rank=G)", "E[rank]" and "Pr(best)".
    """
    _, n_groups = draws.shape
    utility = draws if higher_better else -draws

    # best_first[s] lists the group indices of draw s from best to worst;
    # the inverse of that permutation (argsort of it) is each group's
    # 0-based position, hence its rank after adding 1.
    best_first = np.argsort(-utility, axis=1)
    ranks = np.argsort(best_first, axis=1) + 1

    table = {"group": list(group_names)}
    for k in range(1, n_groups + 1):
        table[f"Pr(rank={k})"] = list((ranks == k).mean(axis=0))
    table["E[rank]"] = list(ranks.mean(axis=0))
    table["Pr(best)"] = table["Pr(rank=1)"]

    ranked = pd.DataFrame(table).sort_values("E[rank]")
    return ranked.reset_index(drop=True)


In [None]:
# BF10 = prior_density(Δ=0) / posterior_density(Δ=0)
# Requires idata.prior draws for the effect parameter.

def _density_at_zero(draws_1d: np.ndarray, bw=SD_KDE_BW) -> float:
    """Gaussian-KDE estimate of the density of the flattened draws at 0.

    ``bw`` is handed to ``gaussian_kde`` as its ``bw_method``.
    """
    samples = np.ravel(np.asarray(draws_1d))
    estimator = gaussian_kde(samples, bw_method=bw)
    density = estimator.evaluate(0.0)
    return float(density[0])

def savage_dickey_bf10(
    idata: az.InferenceData,
    var_name: str,
    bw=SD_KDE_BW,
    coords: Optional[Dict[str, object]] = None,
) -> float:
    """
    Savage–Dickey BF10 for H1: var != 0 vs H0: var = 0
    BF10 = p(var=0 | prior) / p(var=0 | posterior)

    Both densities are KDE estimates (see ``_density_at_zero``) computed from
    the draws stored in ``idata.prior`` and ``idata.posterior``.

    Parameters:
      idata:    inference data holding both a posterior and a prior group.
      var_name: name of the effect parameter.
      bw:       KDE bandwidth passed on to ``_density_at_zero``.
      coords:   optional ``{dim: label}`` selection applied to the draws of
                both groups, e.g. ``{"arch_effect": "Swin"}``. Use it for
                vector-valued parameters; without it the draws of ALL
                components are pooled into one density, which is only
                meaningful for a scalar parameter.

    Returns:
      BF10 as a float; ``inf`` when the posterior density at 0 is exactly 0
      while the prior density is positive, ``nan`` when both are 0.

    Raises:
      ValueError: if ``idata`` has no prior group.
    """
    if not hasattr(idata, "prior"):
        raise ValueError(
            "idata has no prior group. Ensure you ran pm.sample_prior_predictive(var_names=[...]) "
            "and extended idata with it."
        )

    def _flat_draws(group: str) -> np.ndarray:
        # stacked (chain, draw) samples of the variable, optionally one component
        samples = az.extract(idata, group=group, var_names=[var_name])
        if coords:
            samples = samples.sel(coords)
        return samples.to_numpy().ravel()

    prior0 = _density_at_zero(_flat_draws("prior"), bw=bw)
    post0 = _density_at_zero(_flat_draws("posterior"), bw=bw)

    # a posterior far from 0 can underflow to density 0.0; report that as
    # inf instead of raising ZeroDivisionError
    if post0 == 0.0:
        return float("inf") if prior0 > 0.0 else float("nan")
    return prior0 / post0


In [None]:
def _effects_coding_matrix(codes: np.ndarray, n_levels: int) -> np.ndarray:
    """Build a sum-to-zero (effects-coded) design matrix from integer level codes.

    Levels 0 .. n_levels-2 each get an indicator column; the last level is
    the reference and is coded -1 in every column. Result shape is
    (len(codes), n_levels - 1).
    """
    codes = np.asarray(codes)
    mat = np.zeros((codes.shape[0], n_levels - 1))
    is_reference = codes >= n_levels - 1
    rows = np.flatnonzero(~is_reference)
    mat[rows, codes[rows]] = 1
    mat[is_reference, :] = -1
    return mat


def fit_factorial_model_with_bf(
    df: pd.DataFrame,
    dataset_col: str,
    arch_col: str,
    metric_col: str,
    draws=DRAWS,
    tune=TUNE,
    chains=CHAINS,
    target_accept=TARGET_ACCEPT,
    seed=RANDOM_SEED,
    bw=SD_KDE_BW,
    hdi_prob=HDI_PROB
):
    """Fit an effects-coded dataset × architecture model and report its effects.

    The metric is transformed with ``transform_y`` (logit or log) and modelled
    with a Student-t likelihood whose mean is
    intercept + dataset effects + architecture effects + interactions, all
    under sum-to-zero coding (the last category of each factor is the
    reference level).

    Parameters:
      df:            table holding the two factor columns and the metric.
      dataset_col:   name of the dataset factor column.
      arch_col:      name of the architecture factor column.
      metric_col:    name of the metric column; rows with NaN are dropped.
      draws, tune, chains, target_accept, seed: forwarded to ``pm.sample``
                     (``seed`` also seeds the prior draws).
      bw:            KDE bandwidth for the Savage–Dickey density ratio.
      hdi_prob:      probability mass of the reported HDI.

    Returns:
      (idata, report). ``idata`` contains the posterior plus prior draws of
      the effect parameters. ``report`` has one row per effect with the
      columns param, mean, sd, the two HDI bounds, Pr(>0) (posterior
      probability the effect is positive), BF10 (Savage–Dickey ratio of the
      prior to the posterior density at 0), scale and prior.
    """
    data = df[[dataset_col, arch_col, metric_col]].dropna().copy()
    dcat = pd.Categorical(data[dataset_col]) # PS, S2 etc
    acat = pd.Categorical(data[arch_col]) # Unet, Swin, Terramind

    len_datasets = len(dcat.categories)
    len_archs = len(acat.categories)

    print(f"Fitting a {len_datasets}×{len_archs} factorial model for '{metric_col}'")
    print(f"Datasets: {list(dcat.categories)}")
    print(f"Architectures: {list(acat.categories)}")

    # Effects-coded design matrices (sum-to-zero constraint): K levels -> K-1 contrasts
    D_mat = _effects_coding_matrix(dcat.codes, len_datasets)
    A_mat = _effects_coding_matrix(acat.codes, len_archs)
    n_obs = D_mat.shape[0]

    # Interaction columns are the pairwise products of the main-effect columns,
    # ordered dataset-major / architecture-minor (same order as the coords below).
    DA_mat = (D_mat[:, :, None] * A_mat[:, None, :]).reshape(
        n_obs, (len_datasets - 1) * (len_archs - 1)
    )

    y_raw = data[metric_col].to_numpy()
    # transform to logit or log scale so the likelihood lives on an unbounded scale
    y_t, _, tname = transform_y(y_raw, metric_col)
    effect_sd = prior_sd_logit if tname == "logit" else prior_sd_log

    coords = {
        "dataset_effect": list(dcat.categories)[:-1],
        "arch_effect": list(acat.categories)[:-1],
        "interaction_effect": [
            f"{dcat.categories[d]}×{acat.categories[a]}"
            for d in range(len_datasets - 1)
            for a in range(len_archs - 1)
        ]
    }

    with pm.Model(coords=coords):
        # Data containers
        D = pm.Data("D", D_mat) # dataset factor
        A = pm.Data("A", A_mat) # architecture factor
        DA = pm.Data("DA", DA_mat) # interaction factor

        # Priors: every effect is centred on "no effect", sd depends on the scale
        intercept = pm.Normal("intercept", 0.0, 1.5)
        beta_dataset = pm.Normal("beta_dataset", 0.0, effect_sd,
                                dims="dataset_effect")
        beta_arch = pm.Normal("beta_arch", 0.0, effect_sd,
                            dims="arch_effect")
        beta_interaction = pm.Normal("beta_interaction", 0.0, effect_sd,
                                    dims="interaction_effect")

        # Linear predictor
        mu = (intercept +
              pm.math.dot(D, beta_dataset) +
              pm.math.dot(A, beta_arch) +
              pm.math.dot(DA, beta_interaction))

        # Student-t likelihood, to be robust against outlying runs
        sigma = pm.HalfNormal("sigma", 1.0)
        nu = pm.Exponential("nu", 1/30) + 1
        pm.StudentT("y", nu=nu, mu=mu, sigma=sigma, observed=y_t)

        idata = pm.sample(
            draws=draws,
            tune=tune,
            chains=chains,
            target_accept=target_accept,
            random_seed=seed,
            return_inferencedata=True
        )

        # prior draws of the effects are needed for the Savage–Dickey ratio
        prior = pm.sample_prior_predictive(
            var_names=["beta_dataset", "beta_arch", "beta_interaction"],
            random_seed=seed
        )
        idata.extend(prior)

    effect_vars = ["beta_dataset", "beta_arch", "beta_interaction"]

    summary = az.summary(
        idata,
        var_names=effect_vars,
        hdi_prob=hdi_prob
    ).reset_index()
    summary = summary.rename(columns={"index": "param"})

    def _bf10(prior_draws: np.ndarray, post_draws: np.ndarray) -> float:
        # BF10 = prior density at 0 / posterior density at 0
        prior0 = _density_at_zero(prior_draws, bw=bw)
        post0 = _density_at_zero(post_draws, bw=bw)
        if post0 == 0.0:
            return float("inf") if prior0 > 0.0 else float("nan")
        return prior0 / post0

    # Directional probability Pr(param > 0 | data) and BF10, one value per
    # coefficient (each level of a vector parameter is handled on its own).
    summary["Pr(>0)"] = np.nan
    summary["BF10"] = np.nan
    for var_name in effect_vars:
        post_da = az.extract(idata, group="posterior", var_names=[var_name])
        prior_da = az.extract(idata, group="prior", var_names=[var_name])
        level_dims = [dim for dim in post_da.dims if dim != "sample"]

        if level_dims:
            level_dim = level_dims[0]
            selections = [
                (f"{var_name}[{level}]", {level_dim: level})
                for level in post_da.coords[level_dim].values
            ]
        else:
            selections = [(var_name, {})]

        for param_str, sel in selections:
            post_draws = post_da.sel(sel).values.ravel()
            prior_draws = prior_da.sel(sel).values.ravel()
            mask = summary["param"] == param_str
            summary.loc[mask, "Pr(>0)"] = (post_draws > 0).mean()
            summary.loc[mask, "BF10"] = _bf10(prior_draws, post_draws)

    # ArviZ labels the bounds e.g. "hdi_2.5%" / "hdi_97.5%"; pick them up by
    # prefix instead of rebuilding the label from hdi_prob.
    hdi_cols = [col for col in summary.columns if str(col).startswith("hdi_")]

    report = summary.loc[:, ["param", "mean", "sd", *hdi_cols, "Pr(>0)", "BF10"]].copy()
    report["scale"] = tname
    report["prior"] = f"Normal(0, {effect_sd}) on {tname} scale"

    return idata, report

In [None]:
def fit_group_model_with_ranking(
    df: pd.DataFrame,
    group_col: str,
    metric_col: str,
    draws=DRAWS,
    tune=TUNE,
    chains=CHAINS,
    target_accept=TARGET_ACCEPT,
    seed=RANDOM_SEED,
    hdi_prob=HDI_PROB
):
    """Fit a hierarchical per-group model and rank the groups.

    The metric is transformed with ``transform_y`` (logit or log). Each group
    gets its own mean (drawn from a shared Normal(mu0, tau)) and its own
    scale, with a Student-t likelihood. Group means are mapped back to the
    original scale as ``mu_orig`` and ranked per posterior draw.

    Parameters:
      df:         table holding the group column and the metric.
      group_col:  name of the group column; unused categories are dropped.
      metric_col: name of the metric column; rows with NaN are dropped.
      draws, tune, chains, target_accept, seed: forwarded to ``pm.sample``.
      hdi_prob:   probability mass of the HDI reported in ``perf_table``.

    Returns:
      (idata, perf_table, rank_table, mu_draws, tname) where
      ``perf_table`` merges the ``mu_orig`` summary (mean, sd, HDI bounds)
      with the rank probabilities, ``rank_table`` is the output of
      ``rank_probabilities_from_draws``, ``mu_draws`` is the
      (samples, groups) array of ``mu_orig`` draws and ``tname`` is the
      transform name.
    """
    data = df[[group_col, metric_col]].dropna().copy()

    # drop categories that lost all their rows, so they do not become empty groups
    if isinstance(data[group_col].dtype, pd.CategoricalDtype):
        data[group_col] = data[group_col].cat.remove_unused_categories()

    gcat = pd.Categorical(data[group_col])
    g_idx = gcat.codes
    groups = list(gcat.categories)
    y_raw = data[metric_col].to_numpy()
    y_t, inv, tname = transform_y(y_raw, metric_col)
    coords = {"group": groups}

    with pm.Model(coords=coords):
        g = pm.Data("g", g_idx)
        y_obs = pm.Data("y_obs", y_t)
        # hierarchical priors (transformed scale)
        mu0 = pm.Normal("mu0", 0.0, 1.5)
        tau = pm.HalfNormal("tau", 1.0)
        mu = pm.Normal("mu", mu0, tau, dims="group")
        sigma = pm.HalfNormal("sigma", 1.0, dims="group")
        nu = pm.Exponential("nu", 1/30) + 1
        pm.StudentT("y", nu=nu, mu=mu[g], sigma=sigma[g], observed=y_obs)
        # deterministic group means on original scale
        pm.Deterministic("mu_orig", inv(mu), dims="group")

        idata = pm.sample(
            draws=draws,
            tune=tune,
            chains=chains,
            target_accept=target_accept,
            random_seed=seed,
            return_inferencedata=True
        )

    # posterior draws for ranking (original scale), shape (samples, groups)
    mu_draws = (
        idata.posterior["mu_orig"]
        .stack(sample=("chain","draw"))
        .transpose("sample","group")
        .values
    )
    hib = higher_is_better(metric_col)
    rank_table = rank_probabilities_from_draws(mu_draws, groups, higher_better=hib)

    # summary table (original scale)
    summ = az.summary(idata, var_names=["mu_orig"], hdi_prob=hdi_prob).reset_index()
    summ = summ.rename(columns={"index": "param"})

    # Recover the group label from "mu_orig[<label>]"; the pattern is anchored
    # so that a "]" inside a label is left alone.
    summ["group"] = summ["param"].str.replace(r"^mu_orig\[|\]$", "", regex=True)

    # HDI column labels depend on hdi_prob (e.g. "hdi_2.5%"), so find them by prefix
    hdi_cols = [col for col in summ.columns if str(col).startswith("hdi_")]

    perf_table = (
        summ.loc[:, ["group", "mean", "sd", *hdi_cols]]
        .merge(rank_table, on="group", how="left")
        .sort_values("E[rank]")
        .reset_index(drop=True)
    )

    return idata, perf_table, rank_table, mu_draws, tname

In [None]:
# We define a composite score:
#  - for each metric, ensure "higher is better" by sign-flip if necessary
#  - standardise per metric so scales don't dominate
#  - weighted average across metrics (equal weights by default)
#
# Then compute posterior rank probabilities from composite draws.

def overall_ranking_across_metrics(
    mu_draws_by_metric: Dict[str, np.ndarray],
    higher_is_better_by_metric: Dict[str, bool],
    group_names: Sequence[str],
    weights: Optional[Dict[str, float]] = None,
):
    """Combine per-metric posterior draws into one composite score and rank it.

    For every metric the draws are sign-flipped if lower is better, divided
    by their pooled standard deviation (over all draws and groups) so that
    no metric dominates through its scale, and added up with the metric's
    weight. No centring is applied: a per-metric constant shifts every group
    equally and cannot change the ranks.

    Parameters:
      mu_draws_by_metric:         metric -> (samples, groups) array of draws.
      higher_is_better_by_metric: metric -> True if larger is better.
      group_names:                labels of the group columns, in column order.
      weights:                    metric -> weight; None means equal weights.
                                  Entries for metrics that are not being
                                  ranked are ignored.

    Returns:
      (composite, overall_rank_table, weights) where ``composite`` is the
      (samples, groups) composite score, ``overall_rank_table`` comes from
      ``rank_probabilities_from_draws`` and ``weights`` are the weights
      actually used, normalised to sum to 1.

    Raises:
      ValueError: no metrics given, a draw array whose column count differs
                  from ``len(group_names)``, or weights that sum to zero.
      KeyError:   ``weights`` is given but lacks one of the metrics.
    """
    metric_list = list(mu_draws_by_metric.keys())
    if not metric_list:
        raise ValueError("mu_draws_by_metric is empty; nothing to rank.")

    group_names = list(group_names)
    G = len(group_names)
    for m in metric_list:
        shape = np.shape(mu_draws_by_metric[m])
        if len(shape) != 2 or shape[1] != G:
            raise ValueError(
                f"Draws for metric '{m}' have shape {shape}, expected (samples, {G}) "
                f"to match group_names."
            )

    if weights is None:
        raw_weights = {m: 1.0 for m in metric_list}
    else:
        absent = [m for m in metric_list if m not in weights]
        if absent:
            raise KeyError(f"weights has no entry for metric(s): {absent}")
        raw_weights = {m: weights[m] for m in metric_list}

    # normalise over the metrics actually used, so the weights sum to 1 even
    # when the caller's dict carries entries for other metrics
    wsum = sum(raw_weights.values())
    if wsum == 0:
        raise ValueError("weights sum to zero; cannot normalise them.")
    weights = {m: raw_weights[m] / wsum for m in metric_list}

    # align draws: use the number of samples every metric can provide
    S = min(mu_draws_by_metric[m].shape[0] for m in metric_list)

    composite = np.zeros((S, G))
    for m in metric_list:
        mu = mu_draws_by_metric[m][:S, :]
        score = mu if higher_is_better_by_metric[m] else -mu

        # scale to avoid one metric dominating; the tiny offset guards
        # against division by zero for constant draws
        sd = score.std() + 1e-12
        composite += weights[m] * (score / sd)

    overall_rank_table = rank_probabilities_from_draws(composite, group_names, higher_better=True)

    return composite, overall_rank_table, weights


In [None]:

# For each metric:
#  1) select the metric together with its factors (group/dataset/arch) for
#     every architecture × imagery combination
#  2) plot the run-to-run distribution
#  3) fit:
#     - hierarchical group model (ranking)
#     - factorial model (effects)
#  4) store posterior draws for the overall multi-metric ranking
#
# Each metric writes into its own Output widget; the widgets become the tabs
# displayed at the end.

tabs = []
tab_titles = []

# store posterior group-mean draws per metric for overall ranking
mu_draws_by_metric: Dict[str, np.ndarray] = {}
hib_by_metric: Dict[str, bool] = {}

for metric in metrics:

    out = widgets.Output()

    with out:

        # Clear first, then warn: clear_output(wait=True) wipes whatever was
        # printed before it as soon as the next output arrives, so the
        # TEST-mode warning has to come afterwards to stay visible.
        clear_output(wait=True)

        if TEST:
            print("TESTMODE, LAST 3 ROWS SET TO S2 DATASET, DO NOT USE FOR REAL ANALYSIS")

        print(f'Processing: {metric}\n')

        data = all_data[['group', 'dataset', 'arch', metric]].dropna().copy()

        # visualize run-to-run distribution
        sns.kdeplot(data=data, x=metric, hue="group")
        plt.title(f"Distribution across runs: {metric}")
        plt.show()

        # 1) hierarchical group model -> rank probabilities
        idata_g, perf_table, rank_table, mu_draws, scale_name = fit_group_model_with_ranking(
            data, group_col="group", metric_col=metric
        )

        print("\n--- Hierarchical group model (group means on original scale) ---")
        print(f"Modelled on {scale_name} scale; reporting mu_orig on original scale.\n")
        display(perf_table)

        # store for overall ranking
        mu_draws_by_metric[metric] = mu_draws
        hib_by_metric[metric] = higher_is_better(metric)

        # 2) factorial model -> effect size + uncertainty
        idata_f, factorial_report = fit_factorial_model_with_bf(
            data, dataset_col="dataset", arch_col="arch", metric_col=metric
        )

        print("\n--- Factorial effects (transformed scale) + Savage–Dickey BF10 ---")
        display(factorial_report)

    tabs.append(out)
    tab_titles.append(metric)

tab_widget = widgets.Tab(children=tabs)
for i, title in enumerate(tab_titles):
    tab_widget.set_title(i, title)

display(tab_widget)


In [None]:
# This produces a single posterior ranking of all architecture × imagery
# combinations "overall".
# IMPORTANT: "overall" depends on the utility you define (weights + standardization).
# Default: equal weights.

# Rank only the groups that actually have rows in all_data (the loader skips
# CSVs that do not exist), kept in group_order. The per-metric draws have one
# column per group that was fitted, so handing over every label in
# group_order would not line up once a CSV is missing.
# NOTE(review): assumes each fitted group has values for every metric —
# confirm if some CSVs lack individual metric columns.
present_groups = set(all_data["group"].dropna().astype(str))
ranked_groups = [g for g in group_order if g in present_groups]

composite_draws, overall_rank_table, used_weights = overall_ranking_across_metrics(
    mu_draws_by_metric=mu_draws_by_metric,
    higher_is_better_by_metric=hib_by_metric,
    group_names=ranked_groups,
    weights=metric_weights
)

print("\n===============================")
print("OVERALL MULTI-METRIC RANKING")
print("===============================\n")

display(overall_rank_table)

print("\nWeights used:")
print(used_weights)

print("\nInterpretation:")
print("- Pr(rank=1) is the posterior probability a combination is best overall under this utility.")
print("- E[rank] is the expected rank (lower = better).")
print("- If Pr(rank=1) is not dominant and ranks are spread, the 'overall winner' is uncertain.")


In [None]:

def interpret_bf10(bf10: float) -> str:
    """
    Common descriptive categories (Jeffreys-style heuristics).

    Parameters:
      bf10: Bayes factor in favour of H1 over H0.

    Returns:
      A verbal label. Each label applies to values strictly below its upper
      threshold; anything from 100 upwards (including inf) is "extreme
      evidence for H1". NaN and negative values are not valid Bayes factors
      and are labelled as undefined rather than being read as evidence.
    """
    # NaN compares False against every threshold and would otherwise fall
    # through to the "extreme evidence" label at the end.
    if np.isnan(bf10) or bf10 < 0:
        return "undefined (BF10 must be a non-negative number)"

    # (exclusive upper bound, label), in ascending order
    categories = (
        (1/10, "strong evidence for H0 (no effect)"),
        (1/3, "moderate evidence for H0"),
        (1, "anecdotal evidence for H0"),
        (3, "anecdotal evidence for H1 (effect)"),
        (10, "moderate evidence for H1"),
        (30, "strong evidence for H1"),
        (100, "very strong evidence for H1"),
    )
    for upper, label in categories:
        if bf10 < upper:
            return label
    return "extreme evidence for H1"


# Example usage:
# interpret_bf10(12.5)
