In [None]:
# -*- coding: utf-8 -*-
import os, re, json, gc, sys, ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Optional (not used in plotting)
import seaborn as sns  # noqa: F401
from scipy.signal import find_peaks  # noqa: F401
from scipy.signal import savgol_filter  # noqa: F401
from scipy.ndimage import gaussian_filter1d  # noqa: F401
from scipy.linalg import svd  # noqa: F401

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers
from sklearn.model_selection import KFold

# ----------------------------
# Repro & GPU memory growth
# ----------------------------
# Seed NumPy and TensorFlow once at import time.
np.random.seed(42)
tf.random.set_seed(42)
# Enabling GPU memory growth is best-effort: a failure must not abort the
# script, but it is reported on stderr instead of being silently discarded.
try:
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except Exception as exc:  # deliberate catch-all at the top-level boundary
    print(f"[WARN] Could not enable GPU memory growth: {exc}", file=sys.stderr)

def helper_regex(text):
    """Return the first word that follows the literal ``Full`` in ``text``.

    ``text`` is converted with ``str()`` before searching. ``None`` is
    returned when no ``Full`` followed by whitespace and a word is found.
    """
    match = re.search(rf"{'Full'}\s+(\w+)", str(text))
    if match is None:
        return None
    return match.group(1)

# ----------------------------
# Grouped log-odds gradient (keep model as-is, multi-class)
# ----------------------------
@tf.function(reduce_retracing=True)
def _group_logodds_grad_for_model(x1, model, pos_ids, neg_ids, eps):
    """
    Input gradient of the grouped log-odds for one sample.

    Differentiates
        log(sum_{i in pos_ids} p_i + eps) - log(sum_{j in neg_ids} p_j + eps)
    with respect to x1, where p = model(x1, training=False).

    x1: (1, D)
    Returns the gradient with its leading axis squeezed away: (D,).

    NOTE(review): model, pos_ids, neg_ids and eps are passed as Python
    objects, so each distinct combination presumably gets its own
    tf.function trace -- confirm whether that matters with many models.
    """
    pos_index = tf.constant(pos_ids, dtype=tf.int32)
    neg_index = tf.constant(neg_ids, dtype=tf.int32)
    with tf.GradientTape() as tape:
        tape.watch(x1)
        probs = model(x1, training=False)  # (1, C)
        pos_mass = tf.reduce_sum(tf.gather(probs, pos_index, axis=1), axis=1)  # (1,)
        neg_mass = tf.reduce_sum(tf.gather(probs, neg_index, axis=1), axis=1)  # (1,)
        log_odds = tf.math.log(pos_mass + eps) - tf.math.log(neg_mass + eps)
    grad = tape.gradient(log_odds, x1)  # (1, D)
    return tf.squeeze(grad, axis=0)     # (D,)

def compute_avg_group_logodds_gradient(
    X: np.ndarray, models: list, pos_ids=(2,3), neg_ids=(0,1), eps: float = 1e-8
) -> np.ndarray:
    """
    Average the grouped log-odds input gradient across samples and models.

    For every row of X the gradient is computed with each model via
    _group_logodds_grad_for_model, averaged over the models, and the
    per-sample averages are then averaged over all rows.

    X:        (N, D) array-like, converted to a float32 tensor.
    models:   non-empty sequence of models.
    pos_ids:  class indices whose probabilities form the numerator group.
    neg_ids:  class indices whose probabilities form the denominator group.
    eps:      constant added inside both logarithms.

    Returns (D,) np.ndarray.
    Raises ValueError when `models` is empty or X is not a non-empty 2-D array.
    """
    if len(models) == 0:
        raise ValueError("compute_avg_group_logodds_gradient needs at least one model")

    # Keep the caller's array untouched; work on a separate tensor name.
    X_t = tf.convert_to_tensor(X, dtype=tf.float32)
    if X_t.shape.rank != 2 or int(X_t.shape[0]) == 0:
        raise ValueError(
            f"X must be a non-empty 2-D array of shape (N, D); got shape {tuple(X_t.shape)}"
        )

    sample_grads = []
    for i in range(int(X_t.shape[0])):
        x_i = X_t[i:i+1]  # (1, D)
        grads_over_models = [
            _group_logodds_grad_for_model(x_i, m, pos_ids, neg_ids, eps)
            for m in models
        ]
        # mean over models first, then (below) mean over samples
        sample_grads.append(tf.reduce_mean(tf.stack(grads_over_models, axis=0), axis=0))  # (D,)
    avg_grad = tf.reduce_mean(tf.stack(sample_grads, axis=0), axis=0)  # (D,)
    return avg_grad.numpy()

# ----------------------------
# Model (unchanged)
# ----------------------------
def build_model(input_dim: int, num_classes: int):
    """
    Build and compile the dense softmax classifier.

    Layers: Dense(128, relu, L1 kernel regulariser 0.01) -> Dense(32, relu)
    -> Dense(num_classes, softmax). Compiled with Adam, sparse categorical
    cross-entropy and the 'accuracy' metric.
    """
    first_hidden = Dense(128, input_dim=input_dim, activation='relu',
                         kernel_regularizer=regularizers.l1(0.01))
    second_hidden = Dense(32, activation='relu')
    class_probs = Dense(num_classes, activation='softmax')
    net = Sequential([first_hidden, second_hidden, class_probs])
    net.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
    )
    return net

# ----------------------------
# Cosine & plotting helpers
# ----------------------------
def cosine_sim(a: np.ndarray, b: np.ndarray, eps: float = 1e-12) -> float:
    """
    Cosine similarity of two vectors.

    Both inputs are flattened to float arrays and truncated to the length of
    the shorter one. `eps` is added to the product of the norms, so all-zero
    (or empty) inputs give 0.0 instead of a division by zero.
    """
    vec_a = np.asarray(a, dtype=float).ravel()
    vec_b = np.asarray(b, dtype=float).ravel()
    common = min(vec_a.size, vec_b.size)
    vec_a, vec_b = vec_a[:common], vec_b[:common]
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float(np.dot(vec_a, vec_b) / (norm_product + eps))

def mirror_plot(x, top_y, bottom_y, title, outfile):
    """
    Save a mirror plot to `outfile`.

    `top_y` is drawn as-is (labelled "Run A"); `bottom_y` is negated so it
    appears below the zero baseline (labelled "Run B (mirrored)"). The figure
    is written at 200 dpi and closed afterwards.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(x, top_y, linewidth=1.0, label="Run A")
    ax.plot(x, -bottom_y, linewidth=1.0, label="Run B (mirrored)")
    ax.axhline(0.0, linewidth=0.8)
    ax.set_xlabel("m/z (approx grid)")
    ax.set_ylabel("Gradient magnitude")
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    fig.savefig(outfile, dpi=200)
    plt.close(fig)

# ----------------------------
# Config (adjust path as needed)
# ----------------------------
# Input CSV read by main(); it must provide 'bin' and 'target' columns.
CSV_PATH    = r"F:/casts/dataset_rt.csv"  # <- adjust to the local dataset location
# Keras fit() settings used for every model in train_kfold_repeats().
EPOCHS      = 50
BATCH_SIZE  = 32
# Number of KFold splits per run.
K_SPLITS    = 5
# Models trained per fold (so K_SPLITS * N_REPEATS models per run).
N_REPEATS   = 10

# Seed bases for the two independent runs: main() uses index 0 for run A
# and index 1 for run B.
SEED_BASES  = [111, 777]

# Output root; main() creates one sub-directory per bin underneath it.
OUT_ROOT = "./group_compare_all_bins"

# ----------------------------
# Training (multi-class) for one run; return list of models
# ----------------------------
def train_kfold_repeats(X: np.ndarray, Y: np.ndarray, seed_base: int):
    """
    Train N_REPEATS models on each of K_SPLITS shuffled folds.

    The fold split itself always uses random_state=42; only the per-model
    seed (seed_base * 1000 + fold * 100 + repeat) depends on `seed_base`.
    The number of classes is taken as max(Y) + 1.

    Returns the list of all trained models, fold by fold.
    """
    splitter = KFold(n_splits=K_SPLITS, shuffle=True, random_state=42)
    trained = []
    n_classes = int(np.max(Y)) + 1
    for fold, (train_idx, val_idx) in enumerate(splitter.split(X, Y), start=1):
        train_data = (X[train_idx], Y[train_idx])
        val_data = (X[val_idx], Y[val_idx])
        for repeat in range(N_REPEATS):
            # distinct, reproducible seed for every (run, fold, repeat)
            seed = seed_base * 1000 + fold * 100 + repeat
            tf.keras.utils.set_random_seed(seed)
            np.random.seed(seed)
            net = build_model(X.shape[1], n_classes)
            net.fit(
                *train_data,
                epochs=EPOCHS,
                batch_size=BATCH_SIZE,
                validation_data=val_data,
                verbose=0,
            )
            trained.append(net)
        print(f"[Seed base {seed_base}] Fold {fold}/{K_SPLITS} trained {N_REPEATS} models (total: {len(trained)})")
    return trained

# ----------------------------
# Grouping input (ask user) + parsers
# ----------------------------
def _split_top_level_commas(text):
    """Split `text` on commas that are not enclosed in parentheses."""
    parts, depth, start = [], 0, 0
    for pos, ch in enumerate(text):
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth = max(depth - 1, 0)
        elif ch == ',' and depth == 0:
            parts.append(text[start:pos])
            start = pos + 1
    parts.append(text[start:])
    return parts


def _is_flat_int_group(item):
    """Return True for a non-empty list/tuple that contains only plain ints."""
    return (
        isinstance(item, (list, tuple))
        and len(item) > 0
        and all(isinstance(x, int) and not isinstance(x, bool) for x in item)
    )


def _parse_groupings_from_string(s: str):
    """
    Parse POS-vs-NEG class groupings from a forgiving text format. Examples:
      "((1),(0));((2),(0));((3),(0))"
      "1|0;2|0;3|0"
      "[ (1,2)|(0,3), (3)|(0,1,2) ]"
      "[((1,), (0,)), ((2,), (0,))]"
      "((2,3),(0,1))"            -> ONE pair: (2,3) vs (0,1)

    Three strategies are tried in order: Python literal, pipe syntax
    ("a|b" pairs separated by ';' or by commas outside parentheses), and
    finally a regex that extracts "(...),(...)" chunks.

    A top-level *tuple* of exactly two flat integer groups is read as a
    single pair; a top-level *list* is always read as a list of pairs.

    Returns: list of tuples: [ (pos_tuple, neg_tuple), ... ]; [] when nothing
    could be parsed.
    """
    s = s.strip()
    if not s:
        return []

    # Try Python-literal first (e.g., "[((1,), (0,)), ...]")
    try:
        obj = ast.literal_eval(s)
        # A lone "((2,3),(0,1))" is one grouping, not the two pairs 2|3 and 0|1.
        if isinstance(obj, tuple) and len(obj) == 2 and all(_is_flat_int_group(g) for g in obj):
            return [(tuple(obj[0]), tuple(obj[1]))]
        out = []
        for pair in obj:
            pos, neg = pair
            pos_t = tuple(int(x) for x in (pos if isinstance(pos, (list, tuple)) else (pos,)))
            neg_t = tuple(int(x) for x in (neg if isinstance(neg, (list, tuple)) else (neg,)))
            out.append((pos_t, neg_t))
        if out:
            return out
    except Exception:
        # Deliberate best-effort: anything that is not a usable literal
        # simply falls through to the text-based strategies below.
        pass

    # Try "a|b; c|d" style
    if '|' in s:
        out = []
        for piece in re.split(r'\s*;\s*', s):
            if '|' not in piece:
                continue
            candidates = [piece]
            if piece.count('|') > 1:
                # Several pairs in one piece: accept a comma-separated list such
                # as "(1,2)|(0,3), (3)|(0,1,2)", but only when every part is
                # itself a pair; otherwise keep the piece whole.
                parts = [c for c in _split_top_level_commas(piece) if c.strip()]
                if parts and all('|' in c for c in parts):
                    candidates = parts
            for cand in candidates:
                left, right = cand.split('|', 1)
                L = tuple(int(x) for x in re.findall(r'-?\d+', left))
                R = tuple(int(x) for x in re.findall(r'-?\d+', right))
                if L and R:
                    out.append((L, R))
        if out:
            return out

    # Try "((...),(...));((...),(...))" by extracting integers
    chunks = re.findall(r'\(([^()]*)\)\s*,\s*\(([^()]*)\)', s)
    out = []
    for lft, rgt in chunks:
        L = tuple(int(x) for x in re.findall(r'-?\d+', lft))
        R = tuple(int(x) for x in re.findall(r'-?\d+', rgt))
        if L and R:
            out.append((L, R))
    return out

def _ask_for_groupings(unique_labels):
    """
    Obtain the list of (pos_ids, neg_ids) groupings to analyse.

    Resolution order:
      1. Environment variable CLASS_GROUPINGS, if set and parseable
         (or equal to 'AUTO', case-insensitive); otherwise a warning is
         printed and the prompt is shown.
      2. Interactive input(); EOF counts as an empty answer.
      3. Empty or unparseable answers yield the built-in default set.
    'AUTO' generates the bipartitions via auto_groupings(unique_labels).
    """
    default_groupings = [((1,), (0,)), ((2,), (0,)), ((3,), (0,)), ((2,3), (0,1))]

    from_env = os.environ.get("CLASS_GROUPINGS", "").strip()
    if from_env:
        env_groups = _parse_groupings_from_string(from_env)
        if env_groups:
            return env_groups
        if from_env.upper() == "AUTO":
            return auto_groupings(unique_labels)
        print("[WARN] Failed to parse CLASS_GROUPINGS from environment; falling back to prompt.", file=sys.stderr)

    prompt = "".join((
        "\nEnter class groupings as pairs of POS vs NEG (examples):\n",
        "  1) Python-literal:  [((1,), (0,)), ((2,), (0,)), ((3,), (0,))]\n",
        "  2) Pipe/semicolon:  1|0; 2|0; 3|0\n",
        "  3) Tuple pairs:     ((2,3),(0,1)); ((1,2),(0,3))\n",
        "Type 'AUTO' for all non-empty bipartitions of observed labels.\n",
        "Press Enter for default: ((1),(0)); ((2),(0)); ((3),(0)); ((2,3),(0,1))\n",
        "Your groupings: ",
    ))
    try:
        reply = input(prompt)
    except EOFError:
        reply = ""  # non-interactive session: behave as if Enter was pressed

    reply = (reply or "").strip()
    if not reply:
        return default_groupings
    if reply.upper() == "AUTO":
        return auto_groupings(unique_labels)

    user_groups = _parse_groupings_from_string(reply)
    if not user_groups:
        print("[WARN] Could not parse input; using default groupings.", file=sys.stderr)
        return default_groupings
    return user_groups

from itertools import combinations, chain
def powerset(iterable):
    """Lazily yield every subset of `iterable` as a tuple, smallest subsets first."""
    items = list(iterable)
    subsets_by_size = (combinations(items, size) for size in range(len(items) + 1))
    return chain.from_iterable(subsets_by_size)

def auto_groupings(labels):
    """
    Produce all unique non-empty bipartitions (pos vs neg) of observed labels.

    Only the orientation with min(pos) < min(neg) is emitted, so a partition
    and its pos/neg-swapped twin never both appear. Results are ordered by
    the size of pos, then lexicographically.

    labels: iterable of hashable, sortable labels (duplicates are ignored).
            NumPy scalars are converted to native Python values so the
            returned tuples can be serialised with json.
    Returns: list of (pos_tuple, neg_tuple); empty for fewer than two labels.
    """
    from itertools import combinations  # local import keeps this block self-contained

    native = {lab.item() if hasattr(lab, "item") else lab for lab in labels}
    ordered = tuple(sorted(native))
    out = []
    # sizes 1 .. len-1 exclude the empty set and the full set, so neither side is empty
    for size in range(1, len(ordered)):
        for pos in combinations(ordered, size):
            chosen = set(pos)
            neg = tuple(lab for lab in ordered if lab not in chosen)
            # both tuples are sorted, so element 0 is the minimum
            if pos[0] < neg[0]:
                out.append((pos, neg))
    return out

# ----------------------------
# Main (all bins): ask for groupings, loop per bin
# ----------------------------
def _save_and_plot_for_grouping(X, models_A, models_B, pos_ids, neg_ids,
                                bin_value, out_dir, plot_dir):
    """Compute, save and plot run-A vs run-B gradients for one (pos, neg) grouping."""
    # compute gradients for both runs (same X, different ensembles)
    grad_A = compute_avg_group_logodds_gradient(X, models_A,
                                                pos_ids=pos_ids, neg_ids=neg_ids, eps=1e-8)
    grad_B = compute_avg_group_logodds_gradient(X, models_B,
                                                pos_ids=pos_ids, neg_ids=neg_ids, eps=1e-8)

    # tagging for filenames
    pos_tag = "_".join(map(str, pos_ids))
    neg_tag = "_".join(map(str, neg_ids))
    tag = f"pos_{pos_tag}__neg_{neg_tag}"

    grad_a_path = os.path.join(out_dir, f"grad_runA__{tag}.npy")
    grad_b_path = os.path.join(out_dir, f"grad_runB__{tag}.npy")
    csv_path = os.path.join(out_dir, f"grads_AB__{tag}.csv")
    pos_png = os.path.join(plot_dir, f"{tag}__mirror_pos.png")
    neg_png = os.path.join(plot_dir, f"{tag}__mirror_negabs.png")

    # Save raw (untruncated) gradients
    np.save(grad_a_path, grad_A)
    np.save(grad_b_path, grad_B)

    # CSV / plots use at most the first 10000 features on a 0.1-step grid from 600
    n_grid = min(10000, min(grad_A.size, grad_B.size))
    x_grid = np.arange(600, 600 + 0.1 * n_grid, 0.1)[:n_grid]
    yA = grad_A[:n_grid]
    yB = grad_B[:n_grid]
    pd.DataFrame({"m/z": x_grid,
                  "grad_runA": yA,
                  "grad_runB": yB}).to_csv(csv_path, index=False)

    # Split into positive part and absolute value of the negative part
    yA_pos = np.where(yA > 0, yA, 0.0)
    yB_pos = np.where(yB > 0, yB, 0.0)
    yA_neg = np.where(yA < 0, -yA, 0.0)
    yB_neg = np.where(yB < 0, -yB, 0.0)

    # Cosine similarities between the two runs
    cos_pos = cosine_sim(yA_pos, yB_pos)
    cos_neg = cosine_sim(yA_neg, yB_neg)

    # Plots
    pos_title = (f"Bin {bin_value} — Mirror Positive Gradients "
                 f"[{tag}] (cos={cos_pos:.4f})")
    neg_title = (f"Bin {bin_value} — Mirror Negative Gradients |abs| "
                 f"[{tag}] (cos={cos_neg:.4f})")
    mirror_plot(x_grid, yA_pos, yB_pos, title=pos_title, outfile=pos_png)
    mirror_plot(x_grid, yA_neg, yB_neg, title=neg_title, outfile=neg_png)

    # JSON summary
    summary = {
        "bin": bin_value,
        "grouping": {"pos_ids": list(pos_ids), "neg_ids": list(neg_ids)},
        "comparison": f"log(sum p[{pos_ids}]) - log(sum p[{neg_ids}])",
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "k_splits": K_SPLITS,
        "n_repeats": N_REPEATS,
        "seed_bases": SEED_BASES,
        "cosine_pos": cos_pos,
        "cosine_neg_abs": cos_neg,
        "paths": {
            "grads_csv": csv_path,
            "grad_runA_npy": grad_a_path,
            "grad_runB_npy": grad_b_path,
            "mirror_pos_png": pos_png,
            "mirror_negabs_png": neg_png,
        }
    }
    with open(os.path.join(out_dir, f"summary__{tag}.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print(f"  Saved {tag}  |  Cos(pos)={cos_pos:.6f}  Cos(neg|abs|)={cos_neg:.6f}")


def main():
    """
    Run the grouped log-odds gradient comparison for every bin in CSV_PATH.

    Steps: load the CSV, keep rows with target in 0..3, obtain the class
    groupings, then for each bin normalise the features, train two
    independent ensembles (SEED_BASES[0] and SEED_BASES[1]) and, for every
    grouping, write gradients, CSV, mirror plots and a JSON summary under
    OUT_ROOT/bin_<value>/.

    Raises ValueError when no bins are found, when the labels are not the
    contiguous range 0..C-1, or when a grouping references a class index that
    does not exist in a bin (checked before any model is trained).
    """
    # Load once
    df = pd.read_csv(CSV_PATH)

    # Discover bins
    all_bins = sorted(df["bin"].dropna().unique().tolist())
    if not all_bins:
        raise ValueError(f"No 'bin' values found in {CSV_PATH}")

    # Keep only 0..3 by default (adjust if needed)
    df = df[df["target"].astype(int).isin([0, 1, 2, 3])].copy()

    # Ask for groupings (based on actually present labels)
    unique_labels = np.sort(df["target"].astype(int).unique())
    # sanity: labels should be contiguous 0..C-1 (raise, so the check survives `python -O`)
    contiguous = (
        unique_labels.size > 0
        and unique_labels[0] == 0
        and np.array_equal(unique_labels, np.arange(unique_labels[-1] + 1))
    )
    if not contiguous:
        raise ValueError(f"Non-contiguous labels detected: {unique_labels}. Please remap to 0..C-1.")

    # Coerce ids to native ints: ids derived from NumPy labels would otherwise
    # make json.dump fail when the summary is written.
    groupings = [
        (tuple(int(i) for i in pos_ids), tuple(int(i) for i in neg_ids))
        for pos_ids, neg_ids in _ask_for_groupings(unique_labels)
    ]
    print(f"\nUsing {len(groupings)} grouping(s): {groupings}")

    # Prepare output root
    os.makedirs(OUT_ROOT, exist_ok=True)

    # Process every bin
    for BIN_VALUE in all_bins:
        print(f"\n================= BIN {BIN_VALUE} =================")
        fdf = df[df["bin"] == BIN_VALUE].copy()
        if fdf.empty:
            print(f"[WARN] No rows for bin {BIN_VALUE}; skipping.")
            continue

        # Normalize all features except ['bin','target'] by (max+1) *within this bin*
        cols_to_norm = fdf.columns.difference(['bin', 'target'])
        fdf[cols_to_norm] = fdf[cols_to_norm].apply(lambda x: x / (x.max() + 1.0))

        Y = fdf["target"].astype(int).to_numpy()
        X = np.nan_to_num(fdf.drop(columns=['bin', 'target']).to_numpy(), copy=False)

        # KFold cannot split fewer samples than folds, so such bins are skipped too.
        if X.shape[0] < max(2, K_SPLITS) or X.shape[1] < 1:
            print(f"[WARN] Insufficient data for bin {BIN_VALUE} (samples={X.shape[0]}, dim={X.shape[1]}). Skipping.")
            continue

        print(f"Bin {BIN_VALUE}: samples={X.shape[0]}, dim={X.shape[1]}  class_counts="
              f"{dict(zip(*np.unique(Y, return_counts=True)))}")

        # Fail fast: validate grouping ids BEFORE the expensive training step.
        # The models get max(Y) + 1 outputs (see train_kfold_repeats).
        num_classes = int(np.max(Y)) + 1
        out_of_range = sorted({
            idx
            for pos_ids, neg_ids in groupings
            for idx in (*pos_ids, *neg_ids)
            if not 0 <= idx < num_classes
        })
        if out_of_range:
            raise ValueError(
                f"Class index {out_of_range[0]} out of range 0..{num_classes-1} "
                f"(bin {BIN_VALUE}; offending indices: {out_of_range})"
            )

        # Output dirs per bin
        OUT_DIR  = os.path.join(OUT_ROOT, f"bin_{str(BIN_VALUE).replace('.', '_')}")
        PLOT_DIR = os.path.join(OUT_DIR, "plots")
        os.makedirs(PLOT_DIR, exist_ok=True)

        # Train models ONCE per run (A, B), then reuse across all groupings
        models_A = train_kfold_repeats(X, Y, seed_base=SEED_BASES[0])
        models_B = train_kfold_repeats(X, Y, seed_base=SEED_BASES[1])

        # run all groupings
        for pos_ids, neg_ids in groupings:
            _save_and_plot_for_grouping(X, models_A, models_B, pos_ids, neg_ids,
                                        BIN_VALUE, OUT_DIR, PLOT_DIR)

        # cleanup for this bin: drop the only local references to the models
        # before clearing the Keras session.
        # NOTE(review): traces cached by the tf.function gradient helper may
        # still hold the models -- confirm if memory keeps growing across bins.
        del models_A, models_B
        tf.keras.backend.clear_session()
        gc.collect()

    print("\nAll bins processed.\n")

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


In [1]:
# -*- coding: utf-8 -*-
import os, re, json, gc, sys, ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Optional (not used in plotting)
import seaborn as sns  # noqa: F401
from scipy.signal import find_peaks  # noqa: F401
from scipy.signal import savgol_filter  # noqa: F401
from scipy.ndimage import gaussian_filter1d  # noqa: F401
from scipy.linalg import svd  # noqa: F401

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers
from sklearn.model_selection import KFold

# ----------------------------
# Repro & GPU memory growth
# ----------------------------
# Seed NumPy and TensorFlow once at import time.
np.random.seed(42)
tf.random.set_seed(42)
# Enabling GPU memory growth is best-effort: a failure must not abort the
# script, but it is reported on stderr instead of being silently discarded.
try:
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for _gpu in gpus:
        tf.config.experimental.set_memory_growth(_gpu, True)
except Exception as exc:  # deliberate catch-all at the top-level boundary
    print(f"[WARN] Could not enable GPU memory growth: {exc}", file=sys.stderr)

def hard_free():
    """
    Best-effort memory release, meant to be called after each bin.

    Runs, in order, and ignores any exception raised by an individual step:
    - close all matplotlib figures
    - clear the Keras backend session
    - run the Python garbage collector twice
    - on Linux, call libc's malloc_trim(0)
    """
    def _close_figures():
        plt.close('all')

    def _clear_keras_session():
        tf.keras.backend.clear_session()

    def _collect_garbage():
        gc.collect()
        gc.collect()

    def _trim_libc_heap():
        # Linux-only heap trim
        import ctypes, platform
        if platform.system().lower() == "linux":
            ctypes.CDLL("libc.so.6").malloc_trim(0)

    for step in (_close_figures, _clear_keras_session, _collect_garbage, _trim_libc_heap):
        try:
            step()
        except Exception:
            pass

def helper_regex(text):
    """Return the word right after the literal ``Full`` in ``str(text)``, or ``None``."""
    found = re.search(rf"{'Full'}\s+(\w+)", str(text))
    return None if found is None else found.group(1)

# ----------------------------
# Grouped log-odds gradient (keep model as-is, multi-class)
# ----------------------------
@tf.function(reduce_retracing=True)
def _group_logodds_grad_for_model(x1, model, pos_ids, neg_ids, eps):
    """
    Input gradient of the grouped log-odds for one sample.

    Differentiates
        log(sum_{i in pos_ids} p_i + eps) - log(sum_{j in neg_ids} p_j + eps)
    with respect to x1, where p = model(x1, training=False).

    x1: (1, D)
    Returns the gradient with its leading axis squeezed away: (D,).

    NOTE(review): model, pos_ids, neg_ids and eps are passed as Python
    objects, so each distinct combination presumably gets its own
    tf.function trace -- confirm whether that matters with many models.
    """
    pos_index = tf.constant(pos_ids, dtype=tf.int32)
    neg_index = tf.constant(neg_ids, dtype=tf.int32)
    with tf.GradientTape() as tape:
        tape.watch(x1)
        probs = model(x1, training=False)  # (1, C)
        pos_mass = tf.reduce_sum(tf.gather(probs, pos_index, axis=1), axis=1)  # (1,)
        neg_mass = tf.reduce_sum(tf.gather(probs, neg_index, axis=1), axis=1)  # (1,)
        log_odds = tf.math.log(pos_mass + eps) - tf.math.log(neg_mass + eps)
    grad = tape.gradient(log_odds, x1)  # (1, D)
    return tf.squeeze(grad, axis=0)     # (D,)

def compute_avg_group_logodds_gradient(
    X: np.ndarray, models: list, pos_ids=(2,3), neg_ids=(0,1), eps: float = 1e-8
) -> np.ndarray:
    """
    Average the grouped log-odds input gradient across samples and models.

    Each row of X is evaluated with every model through
    _group_logodds_grad_for_model; the gradients are averaged over the models
    first and then over the rows.

    Returns (D,) np.ndarray.
    """
    inputs = tf.convert_to_tensor(X, dtype=tf.float32)
    n_samples = int(inputs.shape[0])
    per_sample = []
    for row in range(n_samples):
        sample = inputs[row:row + 1]  # keep the batch axis: (1, D)
        per_model = [
            _group_logodds_grad_for_model(sample, net, pos_ids, neg_ids, eps)
            for net in models
        ]
        per_sample.append(tf.reduce_mean(tf.stack(per_model, axis=0), axis=0))  # (D,)
    return tf.reduce_mean(tf.stack(per_sample, axis=0), axis=0).numpy()  # (D,)

# ----------------------------
# Model (unchanged)
# ----------------------------
def build_model(input_dim: int, num_classes: int):
    """
    Build and compile the dense softmax classifier.

    Layers: Dense(128, relu, L1 kernel regulariser 0.01) -> Dense(32, relu)
    -> Dense(num_classes, softmax). Compiled with Adam, sparse categorical
    cross-entropy and the 'accuracy' metric.
    """
    first_hidden = Dense(128, input_dim=input_dim, activation='relu',
                         kernel_regularizer=regularizers.l1(0.01))
    second_hidden = Dense(32, activation='relu')
    class_probs = Dense(num_classes, activation='softmax')
    net = Sequential([first_hidden, second_hidden, class_probs])
    net.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
    )
    return net

# ----------------------------
# Cosine & plotting helpers
# ----------------------------
def cosine_sim(a: np.ndarray, b: np.ndarray, eps: float = 1e-12) -> float:
    """
    Cosine similarity of two vectors.

    Both inputs are flattened to float arrays and truncated to the length of
    the shorter one. `eps` is added to the product of the norms, so all-zero
    (or empty) inputs give 0.0 instead of a division by zero.
    """
    vec_a = np.asarray(a, dtype=float).ravel()
    vec_b = np.asarray(b, dtype=float).ravel()
    common = min(vec_a.size, vec_b.size)
    vec_a, vec_b = vec_a[:common], vec_b[:common]
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float(np.dot(vec_a, vec_b) / (norm_product + eps))

def mirror_plot(x, top_y, bottom_y, title, outfile):
    """
    Save a mirror plot to `outfile`.

    `top_y` is drawn as-is (labelled "Run A"); `bottom_y` is negated so it
    appears below the zero baseline (labelled "Run B (mirrored)"). The figure
    is written at 200 dpi and closed afterwards.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(x, top_y, linewidth=1.0, label="Run A")
    ax.plot(x, -bottom_y, linewidth=1.0, label="Run B (mirrored)")
    ax.axhline(0.0, linewidth=0.8)
    ax.set_xlabel("m/z (approx grid)")
    ax.set_ylabel("Gradient magnitude")
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    fig.savefig(outfile, dpi=200)
    plt.close(fig)

def single_run_plot(x, y, title, outfile, label="Gradient"):
    """
    Save a single-curve line plot of `y` over `x` to `outfile`.

    The curve is labelled with `label`; the figure is written at 200 dpi and
    closed afterwards.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(x, y, linewidth=1.0, label=label)
    ax.set_xlabel("m/z (approx grid)")
    ax.set_ylabel("Gradient")
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    fig.savefig(outfile, dpi=200)
    plt.close(fig)

# ----------------------------
# Config (adjust path as needed)
# ----------------------------
# Input CSV read by main(); it must provide 'bin' and 'target' columns.
CSV_PATH    = r"F:/casts/dataset_rt.csv"  # <- adjust to the local dataset location
# Keras fit() settings used for every model in train_kfold_repeats().
EPOCHS      = 50
BATCH_SIZE  = 32
# Number of KFold splits per run.
K_SPLITS    = 5
# Models trained per fold (so K_SPLITS * N_REPEATS models per run).
N_REPEATS   = 10

# Seed bases, one per independent training run.
SEED_BASES  = [111, 777]

# Output root directory created by main() (outputs are nested per bin).
OUT_ROOT = "./group_compare_all_bins_8_groups"

# ----------------------------
# Training (multi-class) for one run; return list of models
# ----------------------------
def train_kfold_repeats(X: np.ndarray, Y: np.ndarray, seed_base: int):
    """
    Train N_REPEATS models on each of K_SPLITS shuffled folds.

    The fold split itself always uses random_state=42; only the per-model
    seed (seed_base * 1000 + fold * 100 + repeat) depends on `seed_base`.
    The number of classes is taken as max(Y) + 1.

    Returns the list of all trained models, fold by fold.
    """
    splitter = KFold(n_splits=K_SPLITS, shuffle=True, random_state=42)
    trained = []
    n_classes = int(np.max(Y)) + 1
    for fold, (train_idx, val_idx) in enumerate(splitter.split(X, Y), start=1):
        train_data = (X[train_idx], Y[train_idx])
        val_data = (X[val_idx], Y[val_idx])
        for repeat in range(N_REPEATS):
            # distinct, reproducible seed for every (run, fold, repeat)
            seed = seed_base * 1000 + fold * 100 + repeat
            tf.keras.utils.set_random_seed(seed)
            np.random.seed(seed)
            net = build_model(X.shape[1], n_classes)
            net.fit(
                *train_data,
                epochs=EPOCHS,
                batch_size=BATCH_SIZE,
                validation_data=val_data,
                verbose=0,
            )
            trained.append(net)
        print(f"[Seed base {seed_base}] Fold {fold}/{K_SPLITS} trained {N_REPEATS} models (total: {len(trained)})")
    return trained

# ----------------------------
# Grouping input (ask user) + parsers
# ----------------------------
def _split_top_level_commas(text):
    """Split `text` on commas that are not enclosed in parentheses."""
    parts, depth, start = [], 0, 0
    for pos, ch in enumerate(text):
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth = max(depth - 1, 0)
        elif ch == ',' and depth == 0:
            parts.append(text[start:pos])
            start = pos + 1
    parts.append(text[start:])
    return parts


def _is_flat_int_group(item):
    """Return True for a non-empty list/tuple that contains only plain ints."""
    return (
        isinstance(item, (list, tuple))
        and len(item) > 0
        and all(isinstance(x, int) and not isinstance(x, bool) for x in item)
    )


def _parse_groupings_from_string(s: str):
    """
    Parse POS-vs-NEG class groupings from a forgiving text format. Examples:
      "((1),(0));((2),(0));((3),(0))"
      "1|0;2|0;3|0"
      "[ (1,2)|(0,3), (3)|(0,1,2) ]"
      "[((1,), (0,)), ((2,), (0,))]"
      "((2,3),(0,1))"            -> ONE pair: (2,3) vs (0,1)

    Three strategies are tried in order: Python literal, pipe syntax
    ("a|b" pairs separated by ';' or by commas outside parentheses), and
    finally a regex that extracts "(...),(...)" chunks.

    A top-level *tuple* of exactly two flat integer groups is read as a
    single pair; a top-level *list* is always read as a list of pairs.

    Returns: list of tuples: [ (pos_tuple, neg_tuple), ... ]; [] when nothing
    could be parsed.
    """
    s = s.strip()
    if not s:
        return []

    # Try Python-literal first (e.g., "[((1,), (0,)), ...]")
    try:
        obj = ast.literal_eval(s)
        # A lone "((2,3),(0,1))" is one grouping, not the two pairs 2|3 and 0|1.
        if isinstance(obj, tuple) and len(obj) == 2 and all(_is_flat_int_group(g) for g in obj):
            return [(tuple(obj[0]), tuple(obj[1]))]
        out = []
        for pair in obj:
            pos, neg = pair
            pos_t = tuple(int(x) for x in (pos if isinstance(pos, (list, tuple)) else (pos,)))
            neg_t = tuple(int(x) for x in (neg if isinstance(neg, (list, tuple)) else (neg,)))
            out.append((pos_t, neg_t))
        if out:
            return out
    except Exception:
        # Deliberate best-effort: anything that is not a usable literal
        # simply falls through to the text-based strategies below.
        pass

    # Try "a|b; c|d" style
    if '|' in s:
        out = []
        for piece in re.split(r'\s*;\s*', s):
            if '|' not in piece:
                continue
            candidates = [piece]
            if piece.count('|') > 1:
                # Several pairs in one piece: accept a comma-separated list such
                # as "(1,2)|(0,3), (3)|(0,1,2)", but only when every part is
                # itself a pair; otherwise keep the piece whole.
                parts = [c for c in _split_top_level_commas(piece) if c.strip()]
                if parts and all('|' in c for c in parts):
                    candidates = parts
            for cand in candidates:
                left, right = cand.split('|', 1)
                L = tuple(int(x) for x in re.findall(r'-?\d+', left))
                R = tuple(int(x) for x in re.findall(r'-?\d+', right))
                if L and R:
                    out.append((L, R))
        if out:
            return out

    # Try "((...),(...));((...),(...))" by extracting integers
    chunks = re.findall(r'\(([^()]*)\)\s*,\s*\(([^()]*)\)', s)
    out = []
    for lft, rgt in chunks:
        L = tuple(int(x) for x in re.findall(r'-?\d+', lft))
        R = tuple(int(x) for x in re.findall(r'-?\d+', rgt))
        if L and R:
            out.append((L, R))
    return out

def powerset(iterable):
    """Lazily yield every subset of `iterable` as a tuple, smallest subsets first."""
    from itertools import combinations, chain
    pool = list(iterable)
    sizes = range(len(pool) + 1)
    return chain.from_iterable(combinations(pool, k) for k in sizes)

def auto_groupings(labels):
    """
    Produce all unique non-empty bipartitions (pos vs neg) of observed labels.

    Only the orientation with min(pos) < min(neg) is emitted, so a partition
    and its pos/neg-swapped twin never both appear. Results are ordered by
    the size of pos, then lexicographically.

    labels: iterable of hashable, sortable labels (duplicates are ignored).
            NumPy scalars are converted to native Python values so the
            returned tuples can be serialised with json.
    Returns: list of (pos_tuple, neg_tuple); empty for fewer than two labels.
    """
    from itertools import combinations  # local import keeps this block self-contained

    native = {lab.item() if hasattr(lab, "item") else lab for lab in labels}
    ordered = tuple(sorted(native))
    out = []
    # sizes 1 .. len-1 exclude the empty set and the full set, so neither side is empty
    for size in range(1, len(ordered)):
        for pos in combinations(ordered, size):
            chosen = set(pos)
            neg = tuple(lab for lab in ordered if lab not in chosen)
            # both tuples are sorted, so element 0 is the minimum
            if pos[0] < neg[0]:
                out.append((pos, neg))
    return out

def _ask_for_groupings(unique_labels):
    """
    Obtain the list of (pos_ids, neg_ids) groupings to analyse.

    Resolution order:
      1. Environment variable CLASS_GROUPINGS, if set and parseable
         (or equal to 'AUTO', case-insensitive); otherwise a warning is
         printed and the prompt is shown.
      2. Interactive input(); EOF counts as an empty answer.
      3. Empty or unparseable answers yield the built-in default set.
    'AUTO' generates the bipartitions via auto_groupings(unique_labels).
    """
    default_groupings = [((1,), (0,)), ((2,), (0,)), ((3,), (0,)), ((2,3), (0,1))]

    from_env = os.environ.get("CLASS_GROUPINGS", "").strip()
    if from_env:
        env_groups = _parse_groupings_from_string(from_env)
        if env_groups:
            return env_groups
        if from_env.upper() == "AUTO":
            return auto_groupings(unique_labels)
        print("[WARN] Failed to parse CLASS_GROUPINGS from environment; falling back to prompt.", file=sys.stderr)

    prompt = "".join((
        "\nEnter class groupings as pairs of POS vs NEG (examples):\n",
        "  1) Python-literal:  [((1,), (0,)), ((2,), (0,)), ((3,), (0,))]\n",
        "  2) Pipe/semicolon:  1|0; 2|0; 3|0\n",
        "  3) Tuple pairs:     ((2,3),(0,1)); ((1,2),(0,3))\n",
        "Type 'AUTO' for all non-empty bipartitions of observed labels.\n",
        "Press Enter for default: ((1),(0)); ((2),(0)); ((3),(0)); ((2,3),(0,1))\n",
        "Your groupings: ",
    ))
    try:
        reply = input(prompt)
    except EOFError:
        reply = ""  # non-interactive session: behave as if Enter was pressed

    reply = (reply or "").strip()
    if not reply:
        return default_groupings
    if reply.upper() == "AUTO":
        return auto_groupings(unique_labels)

    user_groups = _parse_groupings_from_string(reply)
    if not user_groups:
        print("[WARN] Could not parse input; using default groupings.", file=sys.stderr)
        return default_groupings
    return user_groups

# ----------------------------
# Main (all bins): ask for groupings, loop per bin
# ----------------------------
def _gradient_grid(n_points):
    """Return ``(n_grid, x_grid)`` used as the x-axis for CSVs and plots.

    ``n_grid`` is ``n_points`` capped at 10000. ``x_grid`` holds ``n_grid``
    values starting at 600 with a step of 0.1.
    """
    n_grid = min(10000, n_points)
    # Slice to exactly n_grid entries: a float-step arange is not guaranteed
    # to stop at the intended element count.
    x_grid = np.arange(600, 600 + 0.1 * n_grid, 0.1)[:n_grid]
    return n_grid, x_grid


def _save_single_run(run_label, run_dir, seed_base, bin_value, tag,
                     pos_ids, neg_ids, grad, n_grid, x_grid):
    """Write the .npy, .csv, plot and JSON summary for one run ('A' or 'B').

    The full gradient goes to the .npy file; the CSV and the plot only use the
    first ``n_grid`` entries so they line up with ``x_grid``.
    """
    npy_path = os.path.join(run_dir, "npy", f"grad__{tag}.npy")
    csv_path = os.path.join(run_dir, "csv", f"grad__{tag}.csv")
    plot_path = os.path.join(run_dir, "plots", f"{tag}__run{run_label}.png")

    np.save(npy_path, grad)
    pd.DataFrame({"m/z": x_grid, "grad": grad[:n_grid]}).to_csv(csv_path, index=False)
    single_run_plot(
        x_grid, grad[:n_grid],
        title=f"Bin {bin_value} — Run {run_label} gradient [{tag}]",
        outfile=plot_path,
        label=f"Run {run_label} grad"
    )
    with open(os.path.join(run_dir, f"summary__{tag}.json"), "w", encoding="utf-8") as fh:
        json.dump({
            "bin": bin_value,
            "run": run_label,
            "grouping": {"pos_ids": list(pos_ids), "neg_ids": list(neg_ids)},
            "epochs": EPOCHS, "batch_size": BATCH_SIZE,
            "k_splits": K_SPLITS, "n_repeats": N_REPEATS,
            "seed_base": seed_base,
            "paths": {
                "grad_npy": npy_path,
                "grad_csv": csv_path,
                "grad_plot": plot_path,
            }
        }, fh, indent=2)


def _save_ab_comparison(comp_dir, bin_value, tag, pos_ids, neg_ids,
                        grad_A, grad_B, n_grid, x_grid):
    """Write the A-vs-B comparison artefacts and return ``(cos_pos, cos_neg)``.

    The gradients are split into their positive part and the absolute value of
    their negative part; each part is compared between runs with cosine_sim
    and drawn as a mirror plot.
    """
    yA = grad_A[:n_grid]
    yB = grad_B[:n_grid]
    yA_pos = np.where(yA > 0, yA, 0.0)
    yB_pos = np.where(yB > 0, yB, 0.0)
    yA_neg = np.where(yA < 0, -yA, 0.0)  # absolute value of the negative part
    yB_neg = np.where(yB < 0, -yB, 0.0)

    cos_pos = cosine_sim(yA_pos, yB_pos)
    cos_neg = cosine_sim(yA_neg, yB_neg)

    # Combined CSV used for mirror inspection
    comb_csv = os.path.join(comp_dir, "csv", f"grads_AB__{tag}.csv")
    pd.DataFrame({
        "m/z": x_grid,
        "grad_runA": yA,
        "grad_runB": yB,
        "pos_runA": yA_pos,
        "pos_runB": yB_pos,
        "negabs_runA": yA_neg,
        "negabs_runB": yB_neg,
    }).to_csv(comb_csv, index=False)

    mirror_pos_png = os.path.join(comp_dir, "plots", f"{tag}__mirror_pos.png")
    mirror_neg_png = os.path.join(comp_dir, "plots", f"{tag}__mirror_negabs.png")
    pos_title = (f"Bin {bin_value} — Mirror Positive Gradients "
                 f"[{tag}] (cos={cos_pos:.4f})")
    neg_title = (f"Bin {bin_value} — Mirror Negative Gradients |abs| "
                 f"[{tag}] (cos={cos_neg:.4f})")
    mirror_plot(x_grid, yA_pos, yB_pos, title=pos_title, outfile=mirror_pos_png)
    mirror_plot(x_grid, yA_neg, yB_neg, title=neg_title, outfile=mirror_neg_png)

    with open(os.path.join(comp_dir, f"summary__{tag}.json"), "w", encoding="utf-8") as fh:
        json.dump({
            "bin": bin_value,
            "grouping": {"pos_ids": list(pos_ids), "neg_ids": list(neg_ids)},
            "comparison": "Run A vs Run B",
            "cosine_pos": cos_pos,
            "cosine_neg_abs": cos_neg,
            "paths": {
                "combined_csv": comb_csv,
                "mirror_pos_png": mirror_pos_png,
                "mirror_negabs_png": mirror_neg_png,
            }
        }, fh, indent=2)

    return cos_pos, cos_neg


def _save_and_plot_for_grouping(bin_value, X, models_A, models_B, pos_ids, neg_ids,
                                run_a_dir, run_b_dir, comp_dir):
    """Compute both runs' gradients for one grouping and write every output."""
    # Same X, different model ensembles.
    grad_A = compute_avg_group_logodds_gradient(
        X, models_A, pos_ids=pos_ids, neg_ids=neg_ids, eps=1e-8
    )
    grad_B = compute_avg_group_logodds_gradient(
        X, models_B, pos_ids=pos_ids, neg_ids=neg_ids, eps=1e-8
    )

    # Tag used in every file name of this grouping
    pos_tag = "_".join(map(str, pos_ids))
    neg_tag = "_".join(map(str, neg_ids))
    tag = f"pos_{pos_tag}__neg_{neg_tag}"

    # One grid shared by both runs
    n_grid, x_grid = _gradient_grid(min(grad_A.size, grad_B.size))

    _save_single_run("A", run_a_dir, SEED_BASES[0], bin_value, tag,
                     pos_ids, neg_ids, grad_A, n_grid, x_grid)
    _save_single_run("B", run_b_dir, SEED_BASES[1], bin_value, tag,
                     pos_ids, neg_ids, grad_B, n_grid, x_grid)
    cos_pos, cos_neg = _save_ab_comparison(comp_dir, bin_value, tag, pos_ids, neg_ids,
                                           grad_A, grad_B, n_grid, x_grid)

    print(f"  Saved {tag}  |  Cos(pos)={cos_pos:.6f}  Cos(neg|abs|)={cos_neg:.6f}")


# ----------------------------
# Main (all bins): ask for groupings, loop per bin
# ----------------------------
def main():
    """Run the per-bin gradient pipeline for every configured bin.

    Steps:
      1. Load CSV_PATH once and keep rows whose target is in {0, 1, 2, 3}.
      2. Check the remaining labels are contiguous 0..C-1 and obtain the
         class groupings via _ask_for_groupings.
      3. For each bin: normalise features within the bin, train two model
         ensembles (SEED_BASES[0] and SEED_BASES[1]) once, then save
         gradients, plots and A-vs-B comparisons for every grouping under
         OUT_ROOT/bin_<value>/.

    Raises:
        ValueError: if the CSV has no 'bin' values, no usable target labels,
            non-contiguous labels, or a grouping references a class index
            outside 0..C-1 for a bin.
    """
    # Load once
    df = pd.read_csv(CSV_PATH)

    # The bin list is pinned below, so verify separately that the file
    # actually carries bin values.
    if df["bin"].dropna().empty:
        raise ValueError(f"No 'bin' values found in {CSV_PATH}")
    # NOTE(review): bins are hard-coded rather than taken from the data; a
    # listed bin with no rows is skipped with a warning in the loop below.
    all_bins = [35, 45, 55, 65, 75]

    # Keep only 0..3 by default (adjust if needed)
    df = df[df["target"].astype(int).isin([0,1,2,3])].copy()

    # Ask for groupings (based on actually present labels)
    unique_labels = np.sort(df["target"].astype(int).unique())
    if unique_labels.size == 0:
        raise ValueError(f"No rows with target in [0, 1, 2, 3] found in {CSV_PATH}")
    # Explicit raise instead of assert so the check survives `python -O`.
    if unique_labels[0] != 0 or not np.array_equal(unique_labels, np.arange(unique_labels[-1] + 1)):
        raise ValueError(f"Non-contiguous labels detected: {unique_labels}. Please remap to 0..C-1.")

    groupings = _ask_for_groupings(unique_labels)
    print(f"\nUsing {len(groupings)} grouping(s): {groupings}")

    # Prepare output root
    os.makedirs(OUT_ROOT, exist_ok=True)

    # Process every bin
    for BIN_VALUE in all_bins:
        print(f"\n================= BIN {BIN_VALUE} =================")

        # Declared up front so the finally block can always release them
        models_A = models_B = None
        X = Y = fdf = None

        try:
            fdf = df[df["bin"] == BIN_VALUE].copy()
            if fdf.empty:
                print(f"[WARN] No rows for bin {BIN_VALUE}; skipping.")
                continue

            # Normalize all features except ['bin','target'] by (max+1) *within this bin*
            cols_to_norm = fdf.columns.difference(['bin', 'target'])
            fdf[cols_to_norm] = fdf[cols_to_norm].apply(lambda x: x / (x.max() + 1.0))

            Y = fdf["target"].astype(int).to_numpy()
            X = np.nan_to_num(fdf.drop(columns=['bin', 'target']).to_numpy(), copy=False).astype(np.float32)

            if X.shape[0] < 2 or X.shape[1] < 1:
                print(f"[WARN] Insufficient data for bin {BIN_VALUE} (samples={X.shape[0]}, dim={X.shape[1]}). Skipping.")
                continue

            print(f"Bin {BIN_VALUE}: samples={X.shape[0]}, dim={X.shape[1]}  class_counts="
                  f"{dict(zip(*np.unique(Y, return_counts=True)))}")

            # Output dirs per bin
            OUT_DIR   = os.path.join(OUT_ROOT, f"bin_{str(BIN_VALUE).replace('.', '_')}")
            RUN_A_DIR = os.path.join(OUT_DIR, "run_A")
            RUN_B_DIR = os.path.join(OUT_DIR, "run_B")
            COMP_DIR  = os.path.join(OUT_DIR, "compare_AB")
            for d in [OUT_DIR, RUN_A_DIR, RUN_B_DIR, COMP_DIR]:
                os.makedirs(d, exist_ok=True)
                for sub in ("plots", "npy", "csv"):
                    os.makedirs(os.path.join(d, sub), exist_ok=True)

            # Train models ONCE per run (A, B), then reuse across all groupings
            models_A = train_kfold_repeats(X, Y, seed_base=SEED_BASES[0])
            models_B = train_kfold_repeats(X, Y, seed_base=SEED_BASES[1])

            # Run all groupings
            num_classes = int(np.max(Y)) + 1
            for (pos_ids, neg_ids) in groupings:
                for idx in (*pos_ids, *neg_ids):
                    if not 0 <= idx < num_classes:
                        raise ValueError(f"Class index {idx} out of range 0..{num_classes-1}")
                _save_and_plot_for_grouping(
                    BIN_VALUE, X, models_A, models_B, pos_ids, neg_ids,
                    RUN_A_DIR, RUN_B_DIR, COMP_DIR
                )

        finally:
            # Drop the large per-bin references before the next bin; rebinding
            # to None releases them without needing guarded `del` statements.
            models_A = models_B = None
            X = Y = fdf = None

            # Close any stray figs and free TF/CPU/GPU memory back to OS
            hard_free()

    print("\nAll bins processed.\n")

if __name__ == "__main__":
    # Script entry point. Nothing is configured here: the settings main()
    # relies on (CSV_PATH, OUT_ROOT, EPOCHS, etc.) are defined above and
    # must be edited there.
    main()



Using 8 grouping(s): [((2,), (1,)), ((2,), (0,)), ((1,), (0,)), ((3,), (2,)), ((3,), (0, 1, 2)), ((2, 1), (0,)), ((3,), (1,)), ((3,), (0,))]

Bin 35: samples=118, dim=13690  class_counts={0: 33, 1: 30, 2: 26, 3: 29}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[Seed base 111] Fold 1/5 trained 10 models (total: 10)
[Seed base 111] Fold 2/5 trained 10 models (total: 20)
[Seed base 111] Fold 3/5 trained 10 models (total: 30)
[Seed base 111] Fold 4/5 trained 10 models (total: 40)
[Seed base 111] Fold 5/5 trained 10 models (total: 50)
[Seed base 777] Fold 1/5 trained 10 models (total: 10)
[Seed base 777] Fold 2/5 trained 10 models (total: 20)
[Seed base 777] Fold 3/5 trained 10 models (total: 30)
[Seed base 777] Fold 4/5 trained 10 models (total: 40)
[Seed base 777] Fold 5/5 trained 10 models (total: 50)
  Saved pos_2__neg_1  |  Cos(pos)=0.902391  Cos(neg|abs|)=0.935754
  Saved pos_2__neg_0  |  Cos(pos)=0.974754  Cos(neg|abs|)=0.862151
  Saved pos_1__neg_0  |  Cos(pos)=0.967700  Cos(neg|abs|)=0.911179
  Saved pos_3__neg_2  |  Cos(pos)=0.917789  Cos(neg|abs|)=0.988955
  Saved pos_3__neg_0_1_2  |  Cos(pos)=0.924846  Cos(neg|abs|)=0.988356
  Saved pos_2_1__neg_0  |  Cos(pos)=0.976204  Cos(neg|abs|)=0.907891
  Saved pos_3__neg_1  |  Cos(pos)=0.860500

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[Seed base 111] Fold 1/5 trained 10 models (total: 10)
[Seed base 111] Fold 2/5 trained 10 models (total: 20)
[Seed base 111] Fold 3/5 trained 10 models (total: 30)
[Seed base 111] Fold 4/5 trained 10 models (total: 40)
[Seed base 111] Fold 5/5 trained 10 models (total: 50)
[Seed base 777] Fold 1/5 trained 10 models (total: 10)
[Seed base 777] Fold 2/5 trained 10 models (total: 20)
[Seed base 777] Fold 3/5 trained 10 models (total: 30)
[Seed base 777] Fold 4/5 trained 10 models (total: 40)
[Seed base 777] Fold 5/5 trained 10 models (total: 50)
  Saved pos_2__neg_1  |  Cos(pos)=0.819245  Cos(neg|abs|)=0.762299
  Saved pos_2__neg_0  |  Cos(pos)=0.980210  Cos(neg|abs|)=0.834645
  Saved pos_1__neg_0  |  Cos(pos)=0.973553  Cos(neg|abs|)=0.847506
  Saved pos_3__neg_2  |  Cos(pos)=0.713385  Cos(neg|abs|)=0.986402
  Saved pos_3__neg_0_1_2  |  Cos(pos)=0.833365  Cos(neg|abs|)=0.984717
  Saved pos_2_1__neg_0  |  Cos(pos)=0.980719  Cos(neg|abs|)=0.862161
  Saved pos_3__neg_1  |  Cos(pos)=0.453375

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[Seed base 111] Fold 1/5 trained 10 models (total: 10)
[Seed base 111] Fold 2/5 trained 10 models (total: 20)
[Seed base 111] Fold 3/5 trained 10 models (total: 30)
[Seed base 111] Fold 4/5 trained 10 models (total: 40)
[Seed base 111] Fold 5/5 trained 10 models (total: 50)
[Seed base 777] Fold 1/5 trained 10 models (total: 10)
[Seed base 777] Fold 2/5 trained 10 models (total: 20)
[Seed base 777] Fold 3/5 trained 10 models (total: 30)
[Seed base 777] Fold 4/5 trained 10 models (total: 40)
[Seed base 777] Fold 5/5 trained 10 models (total: 50)
  Saved pos_2__neg_1  |  Cos(pos)=0.593254  Cos(neg|abs|)=0.846396
  Saved pos_2__neg_0  |  Cos(pos)=0.950467  Cos(neg|abs|)=0.820848
  Saved pos_1__neg_0  |  Cos(pos)=0.965248  Cos(neg|abs|)=0.789730
  Saved pos_3__neg_2  |  Cos(pos)=0.823259  Cos(neg|abs|)=0.970186
  Saved pos_3__neg_0_1_2  |  Cos(pos)=0.828864  Cos(neg|abs|)=0.973406
  Saved pos_2_1__neg_0  |  Cos(pos)=0.966963  Cos(neg|abs|)=0.815025
  Saved pos_3__neg_1  |  Cos(pos)=0.843434

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[Seed base 111] Fold 1/5 trained 10 models (total: 10)
[Seed base 111] Fold 2/5 trained 10 models (total: 20)
[Seed base 111] Fold 3/5 trained 10 models (total: 30)
[Seed base 111] Fold 4/5 trained 10 models (total: 40)
[Seed base 111] Fold 5/5 trained 10 models (total: 50)
[Seed base 777] Fold 1/5 trained 10 models (total: 10)
[Seed base 777] Fold 2/5 trained 10 models (total: 20)
[Seed base 777] Fold 3/5 trained 10 models (total: 30)
[Seed base 777] Fold 4/5 trained 10 models (total: 40)
[Seed base 777] Fold 5/5 trained 10 models (total: 50)
  Saved pos_2__neg_1  |  Cos(pos)=0.763546  Cos(neg|abs|)=0.901769
  Saved pos_2__neg_0  |  Cos(pos)=0.915023  Cos(neg|abs|)=0.744971
  Saved pos_1__neg_0  |  Cos(pos)=0.951888  Cos(neg|abs|)=0.451910
  Saved pos_3__neg_2  |  Cos(pos)=0.710849  Cos(neg|abs|)=0.964244
  Saved pos_3__neg_0_1_2  |  Cos(pos)=0.722832  Cos(neg|abs|)=0.972398
  Saved pos_2_1__neg_0  |  Cos(pos)=0.947531  Cos(neg|abs|)=0.613226
  Saved pos_3__neg_1  |  Cos(pos)=0.706953

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[Seed base 111] Fold 1/5 trained 10 models (total: 10)
[Seed base 111] Fold 2/5 trained 10 models (total: 20)
[Seed base 111] Fold 3/5 trained 10 models (total: 30)
[Seed base 111] Fold 4/5 trained 10 models (total: 40)
[Seed base 111] Fold 5/5 trained 10 models (total: 50)
[Seed base 777] Fold 1/5 trained 10 models (total: 10)
[Seed base 777] Fold 2/5 trained 10 models (total: 20)
[Seed base 777] Fold 3/5 trained 10 models (total: 30)
[Seed base 777] Fold 4/5 trained 10 models (total: 40)
[Seed base 777] Fold 5/5 trained 10 models (total: 50)
  Saved pos_2__neg_1  |  Cos(pos)=0.941026  Cos(neg|abs|)=0.991608
  Saved pos_2__neg_0  |  Cos(pos)=0.981443  Cos(neg|abs|)=0.996444
  Saved pos_1__neg_0  |  Cos(pos)=0.966449  Cos(neg|abs|)=0.994603
  Saved pos_3__neg_2  |  Cos(pos)=0.993909  Cos(neg|abs|)=0.988312
  Saved pos_3__neg_0_1_2  |  Cos(pos)=0.972554  Cos(neg|abs|)=0.986739
  Saved pos_2_1__neg_0  |  Cos(pos)=0.980243  Cos(neg|abs|)=0.995867
  Saved pos_3__neg_1  |  Cos(pos)=0.986697

In [14]:
#!/usr/bin/env python3
"""
Process all CSV files in a folder:
- For each CSV, extract ["m/z", one of target cols]
- Save into ./result/<basename>_<col>.csv
"""

import os
import sys
import pandas as pd

TARGET_COLS = ["pos_runA", "pos_runB", "negabs_runA", "negabs_runB"]
MZ_COL = "m/z"

def split_csv(input_path: str, out_dir: str) -> list[str]:
    """Split one CSV into one two-column CSV per available target column.

    Each output file holds the MZ_COL column plus one column from TARGET_COLS
    and is named ``<input basename>_<column>.csv`` inside ``out_dir``.

    Args:
        input_path: CSV file to read.
        out_dir: Destination folder; created if it does not exist yet.

    Returns:
        Paths of the files written, in TARGET_COLS order. Empty when the input
        lacks MZ_COL or has none of the target columns (a [SKIP] line is
        printed in that case and nothing is written).
    """
    df = pd.read_csv(input_path)

    if MZ_COL not in df.columns:
        print(f"[SKIP] {input_path} (no '{MZ_COL}' column)")
        return []

    available_targets = [c for c in TARGET_COLS if c in df.columns]
    if not available_targets:
        print(f"[SKIP] {input_path} (none of {TARGET_COLS} found)")
        return []

    # Do not rely on the caller having created the destination folder.
    os.makedirs(out_dir, exist_ok=True)

    base = os.path.splitext(os.path.basename(input_path))[0]
    written = []
    for col in available_targets:
        out_path = os.path.join(out_dir, f"{base}_{col}.csv")
        # Column selection already yields a new frame; no extra copy needed.
        df[[MZ_COL, col]].to_csv(out_path, index=False)
        written.append(out_path)
    return written

def process_folder(folder_path: str):
    """Run split_csv on every ``*.csv`` directly inside ``folder_path``.

    Outputs go to the ``result`` subfolder, which is created if missing.
    Returns the combined list of written file paths and raises
    NotADirectoryError when ``folder_path`` is not an existing directory.
    """
    if not os.path.isdir(folder_path):
        raise NotADirectoryError(f"Not a folder: {folder_path}")

    out_dir = os.path.join(folder_path, "result")
    os.makedirs(out_dir, exist_ok=True)

    all_outputs = []
    for entry in os.listdir(folder_path):
        # Extension match is case-insensitive (.csv, .CSV, ...)
        if not entry.lower().endswith(".csv"):
            continue
        csv_path = os.path.join(folder_path, entry)
        print(f"Processing {csv_path} ...")
        all_outputs += split_csv(csv_path, out_dir)

    print(f"\nDone. Wrote {len(all_outputs)} files to {out_dir}")
    return all_outputs

if __name__ == "__main__":
    # Example usage: point INPUT_FOLDER at the folder holding the CSVs to
    # split; the outputs are written to its "result" subfolder.
    INPUT_FOLDER = "F:/group_compare_all_bins_8_groups/bin_75/compare_AB/csv"  # e.g. "/mnt/data/myfolder"
    process_folder(INPUT_FOLDER)


Processing F:/group_compare_all_bins_8_groups/bin_75/compare_AB/csv\75__pos_2__neg_1.csv ...
Processing F:/group_compare_all_bins_8_groups/bin_75/compare_AB/csv\75__pos_2__neg_0.csv ...
Processing F:/group_compare_all_bins_8_groups/bin_75/compare_AB/csv\75__pos_1__neg_0.csv ...
Processing F:/group_compare_all_bins_8_groups/bin_75/compare_AB/csv\75__pos_3__neg_2.csv ...
Processing F:/group_compare_all_bins_8_groups/bin_75/compare_AB/csv\75__pos_3__neg_0_1_2.csv ...
Processing F:/group_compare_all_bins_8_groups/bin_75/compare_AB/csv\75__pos_2_1__neg_0.csv ...
Processing F:/group_compare_all_bins_8_groups/bin_75/compare_AB/csv\75__pos_3__neg_1.csv ...
Processing F:/group_compare_all_bins_8_groups/bin_75/compare_AB/csv\75__pos_3__neg_0.csv ...

Done. Wrote 32 files to F:/group_compare_all_bins_8_groups/bin_75/compare_AB/csv\result


In [None]:
import os
import subprocess

def run_unidec_on_folder(folder_path):
    """Run ``python -m unidec`` once per file directly inside ``folder_path``.

    For a file ``<name>.<ext>`` the command is invoked with ``-f <file>`` and
    ``-o <folder_path>/result/<name>``; that folder is created beforehand.
    Sub-directories (including ``result`` itself) are skipped.

    A non-zero exit status does not stop the batch, but it is reported per
    file and summarised at the end instead of being silently ignored.
    """
    # Ensure result root folder exists
    result_root = os.path.join(folder_path, "result")
    os.makedirs(result_root, exist_ok=True)

    failed = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Skip directories
        if not os.path.isfile(file_path):
            continue

        # Create a unique subfolder named after the file (without extension)
        base_name = os.path.splitext(file_name)[0]
        file_result_folder = os.path.join(result_root, base_name)
        os.makedirs(file_result_folder, exist_ok=True)

        # Run UniDec for this file, send outputs to its subfolder
        print(f"Processing: {file_name} → {file_result_folder}")
        completed = subprocess.run(
            ["python", "-m", "unidec", "-f", file_path, "-o", file_result_folder],
            check=False,  # keep going; failures are collected below
        )
        if completed.returncode != 0:
            failed.append(file_name)
            print(f"[WARN] UniDec exited with code {completed.returncode} for {file_name}")

    if failed:
        print(f"[WARN] {len(failed)} file(s) failed: {failed}")
    print("✅ All files processed. Results saved in:", result_root)


if __name__ == "__main__":
    # Example usage: every file directly inside folder_path is passed to
    # UniDec; results land in folder_path/result/<file name without extension>.
    folder_path = r"F:\decon"
    run_unidec_on_folder(folder_path)


In [5]:
# mirror_plot_cosine.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import re

# ----------- Input file paths (edit these) -----------
file_a = Path(r"F:\decon\decon\5__pos_1__neg_0_negabs_runA_mass.txt")
file_b = Path(r"F:\decon\decon\5__pos_1__neg_0_negabs_runB_mass.txt")

# ----------- Load spectra -----------
def load_spectrum(path: Path) -> pd.DataFrame:
    """Read a whitespace-separated two-column spectrum file.

    Rows with missing fields are dropped, non-numeric m/z rows are discarded,
    non-numeric intensities count as 0.0, and intensities sharing the same
    m/z are summed. The result has columns ``mz`` and ``intensity`` and is
    sorted by ``mz``.
    """
    raw = pd.read_csv(
        path, sep=r"\s+", header=None, names=["mz", "intensity"], engine="python"
    ).dropna()
    raw["mz"] = pd.to_numeric(raw["mz"], errors="coerce")
    raw["intensity"] = pd.to_numeric(raw["intensity"], errors="coerce").fillna(0.0)

    finite_rows = np.isfinite(raw["mz"]) & np.isfinite(raw["intensity"])
    summed = raw[finite_rows].groupby("mz", as_index=False)["intensity"].sum()
    return summed.sort_values("mz")

spec_a = load_spectrum(file_a)
spec_b = load_spectrum(file_b)

# ----------- Align spectra on a common m/z grid -----------
aligned = pd.merge(spec_a, spec_b, on="mz", how="outer", suffixes=("_a", "_b")).fillna(0.0).sort_values("mz")
v_a = aligned["intensity_a"].to_numpy(dtype=float)
v_b = aligned["intensity_b"].to_numpy(dtype=float)

# ----------- Cosine similarity -----------
def cosine_similarity(x, y):
    """Return the cosine of the angle between x and y as a Python float.

    NaN is returned when either vector has zero length, because the
    similarity is undefined in that case.
    """
    magnitudes = [np.linalg.norm(vec) for vec in (x, y)]
    if any(mag == 0.0 for mag in magnitudes):
        return float("nan")
    return float(np.dot(x, y) / (magnitudes[0] * magnitudes[1]))

cos_sim = cosine_similarity(v_a, v_b)

# ----------- Normalize for plotting -----------
def max_norm(v):
    """Scale v so its largest absolute value is 1.

    An all-zero input is returned unchanged; an empty input is divided by 1.0.
    """
    if len(v) == 0:
        peak = 1.0
    else:
        peak = np.max(np.abs(v))
    if peak > 0:
        return v / peak
    return v

plot_a = max_norm(v_a)
plot_b = -max_norm(v_b)  # mirror below axis

# ----------- Clean filename (remove run info) -----------
def clean_name(filename: str) -> str:
    """Strip every ``_runA`` / ``_runB`` marker from a file name."""
    run_marker = re.compile(r"_run[AB]")
    return run_marker.sub("", filename)

short_name = clean_name(file_a.stem)

# ----------- Plot -----------
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(aligned["mz"].values, plot_a, color="blue")
ax.plot(aligned["mz"].values, plot_b, color="red")

ax.axhline(0, linewidth=1, color="black")
ax.set_xlabel("m/z")
ax.set_ylabel("Normalized intensity (top vs mirrored)")

title = f"{short_name} | Cosine similarity = {cos_sim:.4f}"
ax.set_title(title)

plt.tight_layout()

# ----------- Save output PNG -----------
out_path = file_a.parent / f"{short_name}.png"
plt.savefig(out_path, dpi=160)
plt.close()

print(f"Saved mirror plot to: {out_path}")


Saved mirror plot to: F:\decon\decon\5__pos_1__neg_0_negabs_mass.png


In [6]:
# mirror_plot_cosine.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import re

# ----------- Input file paths (edit these) -----------
file_a = Path(r"F:\decon\decon\5__pos_1__neg_0_negabs_runA_mass.txt")
file_b = Path(r"F:\decon\decon\5__pos_1__neg_0_negabs_runB_mass.txt")

# ----------- Output folder (edit this) -----------
output_folder = Path(r"F:\decon\plot")
output_folder.mkdir(parents=True, exist_ok=True)

# ----------- Load spectra -----------
def load_spectrum(path: Path) -> pd.DataFrame:
    """Read a whitespace-separated two-column spectrum file.

    Rows with missing fields are dropped, non-numeric m/z rows are discarded,
    non-numeric intensities count as 0.0, and intensities sharing the same
    m/z are summed. The result has columns ``mz`` and ``intensity`` and is
    sorted by ``mz``.
    """
    raw = pd.read_csv(
        path, sep=r"\s+", header=None, names=["mz", "intensity"], engine="python"
    ).dropna()
    raw["mz"] = pd.to_numeric(raw["mz"], errors="coerce")
    raw["intensity"] = pd.to_numeric(raw["intensity"], errors="coerce").fillna(0.0)

    finite_rows = np.isfinite(raw["mz"]) & np.isfinite(raw["intensity"])
    summed = raw[finite_rows].groupby("mz", as_index=False)["intensity"].sum()
    return summed.sort_values("mz")

spec_a = load_spectrum(file_a)
spec_b = load_spectrum(file_b)

# ----------- Align spectra on a common m/z grid -----------
aligned = pd.merge(spec_a, spec_b, on="mz", how="outer", suffixes=("_a", "_b")).fillna(0.0).sort_values("mz")
v_a = aligned["intensity_a"].to_numpy(dtype=float)
v_b = aligned["intensity_b"].to_numpy(dtype=float)

# ----------- Cosine similarity -----------
def cosine_similarity(x, y):
    """Return the cosine of the angle between x and y as a Python float.

    NaN is returned when either vector has zero length, because the
    similarity is undefined in that case.
    """
    magnitudes = [np.linalg.norm(vec) for vec in (x, y)]
    if any(mag == 0.0 for mag in magnitudes):
        return float("nan")
    return float(np.dot(x, y) / (magnitudes[0] * magnitudes[1]))

cos_sim = cosine_similarity(v_a, v_b)

# ----------- Normalize for plotting -----------
def max_norm(v):
    """Scale v so its largest absolute value is 1.

    An all-zero input is returned unchanged; an empty input is divided by 1.0.
    """
    if len(v) == 0:
        peak = 1.0
    else:
        peak = np.max(np.abs(v))
    if peak > 0:
        return v / peak
    return v

plot_a = max_norm(v_a)
plot_b = -max_norm(v_b)  # mirror below axis

# ----------- Clean filename (remove run info) -----------
def clean_name(filename: str) -> str:
    """Strip every ``_runA`` / ``_runB`` marker from a file name."""
    run_marker = re.compile(r"_run[AB]")
    return run_marker.sub("", filename)

short_name = clean_name(file_a.stem)

# ----------- Plot -----------
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(aligned["mz"].values, plot_a, color="blue")
ax.plot(aligned["mz"].values, plot_b, color="red")

ax.axhline(0, linewidth=1, color="black")
ax.set_xlabel("m/z")
ax.set_ylabel("Normalized intensity (top vs mirrored)")

title = f"{short_name} | Cosine similarity = {cos_sim:.4f}"
ax.set_title(title)

plt.tight_layout()

# ----------- Save output PNG -----------
out_path = output_folder / f"{short_name}.png"
plt.savefig(out_path, dpi=160)
plt.close()

print(f"Saved mirror plot to: {out_path}")


Saved mirror plot to: F:\decon\plot\5__pos_1__neg_0_negabs_mass.png


In [None]:
# batch_mirror_plots.py
import argparse
import sys
import re
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ========= Optional defaults (used if no CLI flags are given) =========
# Set these to your folder paths if you want to run without CLI args:
INPUT_DIR_DEFAULT  = r"F:/decon/decon/"  # e.g., r"F:\path\to\your\folder"
OUTPUT_DIR_DEFAULT = r"F:/decon/plot/"  # leave empty "" to save into input dir
PATTERN_DEFAULT    = "*.txt"
MZ_DECIMALS_DEFAULT = 5
# =====================================================================


def try_parse_args():
    """
    Parse the command line, or return None to request the top-of-file defaults.

    Returns:
        argparse.Namespace with input_dir, output_dir, pattern and mz_decimals
        when valid flags were given; None when no flags were given at all or
        when parsing failed (e.g. --input_dir missing, unknown flags).

    Raises:
        SystemExit: only for a clean argparse exit such as --help, so that
            asking for help prints the usage and stops instead of silently
            running the batch with the defaults.
    """
    parser = argparse.ArgumentParser(
        description="Create mirror plots for runA/runB replicate pairs in a folder."
    )
    parser.add_argument("--input_dir", type=str, required=True,
                        help="Folder containing spectra text files.")
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Folder to save PNG plots (defaults to input_dir).")
    parser.add_argument("--pattern", type=str, default=PATTERN_DEFAULT,
                        help=f"Glob pattern for spectra files (default: {PATTERN_DEFAULT}).")
    parser.add_argument("--mz_decimals", type=int, default=MZ_DECIMALS_DEFAULT,
                        help=f"Round m/z to this many decimals before alignment (default: {MZ_DECIMALS_DEFAULT}).")

    # If script called with no extra args (len==1), skip argparse error and use defaults
    if len(sys.argv) == 1:
        return None  # signal fallback

    try:
        return parser.parse_args()
    except SystemExit as exc:
        # Exit code 0/None means argparse finished normally (--help): honour it.
        if exc.code in (0, None):
            raise
        # Argparse already printed an error (e.g., missing --input_dir). Fall back.
        return None


def load_spectrum(path: Path, mz_decimals: int = 5) -> pd.DataFrame:
    """Read a whitespace-separated two-column spectrum file.

    Args:
        path: Text file with an m/z column followed by an intensity column.
        mz_decimals: Round m/z to this many decimals before summing
            duplicates; None or a negative value disables rounding.

    Returns:
        DataFrame with columns ``mz`` and ``intensity``, one row per distinct
        (rounded) m/z with intensities summed, sorted by ``mz``. Rows with a
        missing field or a non-numeric m/z are dropped; a non-numeric
        intensity counts as 0.0.
    """
    raw = pd.read_csv(
        path, sep=r"\s+", header=None, names=["mz", "intensity"], engine="python"
    ).dropna()

    # Build a fresh frame rather than assigning columns on a filtered one,
    # which avoids pandas' chained-assignment (SettingWithCopy) warning.
    spectrum = pd.DataFrame({
        "mz": pd.to_numeric(raw["mz"], errors="coerce"),
        "intensity": pd.to_numeric(raw["intensity"], errors="coerce").fillna(0.0),
    })
    spectrum = spectrum[np.isfinite(spectrum["mz"]) & np.isfinite(spectrum["intensity"])]

    if mz_decimals is not None and mz_decimals >= 0:
        spectrum = spectrum.assign(mz=spectrum["mz"].round(mz_decimals))
    return spectrum.groupby("mz", as_index=False)["intensity"].sum().sort_values("mz")


def cosine_similarity(x, y) -> float:
    """Cosine similarity of two equally long sequences, as a Python float.

    Inputs are converted to float arrays first. NaN is returned when either
    vector has zero norm.
    """
    vec_x = np.asarray(x, dtype=float)
    vec_y = np.asarray(y, dtype=float)
    len_x = np.linalg.norm(vec_x)
    len_y = np.linalg.norm(vec_y)
    if len_x != 0.0 and len_y != 0.0:
        return float(np.dot(vec_x, vec_y) / (len_x * len_y))
    return float("nan")


def max_norm(v: np.ndarray) -> np.ndarray:
    """Scale v so its largest absolute value is 1.

    Empty and all-zero arrays are returned unchanged.
    """
    if v.size == 0:
        return v
    peak = np.abs(v).max()
    if peak > 0:
        return v / peak
    return v


def split_key_and_run(stem: str):
    """
    Separate the run marker from a filename stem.

    Example: '5__pos_1__neg_0_negabs_runA_mass'
      -> ('5__pos_1__neg_0_negabs_mass', 'A')

    Returns (key_without_run, run) where run is 'A' or 'B'. A stem without a
    '_runA'/'_runB' marker yields (stem, None). When the marker occurs more
    than once, the last occurrence is the one removed.
    """
    found = re.match(r"(.*)_run([AB])(.*)", stem)
    if found is None:
        return stem, None
    prefix, run, suffix = found.groups()
    return prefix + suffix, run


def make_mirror_plot(aligned: pd.DataFrame, v_a: np.ndarray, v_b: np.ndarray,
                     title_name: str, out_path: Path):
    """Save a mirror plot of two spectra to ``out_path``.

    Both intensity vectors are max-normalised; ``v_a`` is drawn above the zero
    line and ``v_b`` is negated so it appears below it. The x values come from
    ``aligned["mz"]``. The figure is closed after saving.
    """
    upper_trace = max_norm(v_a)
    lower_trace = -max_norm(v_b)  # mirrored below the axis

    fig, ax = plt.subplots(figsize=(12, 5))
    mz_axis = aligned["mz"].values
    for trace in (upper_trace, lower_trace):
        ax.plot(mz_axis, trace)  # default colour cycle
    ax.axhline(0, linewidth=1, color="black")
    ax.set_xlabel("m/z")
    ax.set_ylabel("Normalized intensity (top vs mirrored)")
    ax.set_title(title_name)
    fig.tight_layout()
    fig.savefig(out_path, dpi=160)
    plt.close(fig)


def main():
    """Create one mirror plot per runA/runB file pair found in the input folder.

    Settings come from the command line when try_parse_args() returns a
    namespace, otherwise from the *_DEFAULT constants at the top of the
    script. Files are paired by their stem with the run marker removed;
    unpaired files are reported and skipped, and a failure on one pair is
    printed without stopping the remaining pairs.
    """
    args = try_parse_args()

    if args is not None:
        input_dir = Path(args.input_dir).expanduser().resolve()
        output_dir = Path(args.output_dir).expanduser().resolve() if args.output_dir else input_dir
        pattern, mz_decimals = args.pattern, args.mz_decimals
    else:
        # No usable CLI flags: use the top-of-file defaults
        if not INPUT_DIR_DEFAULT:
            print(
                "[ERROR] No --input_dir provided and INPUT_DIR_DEFAULT is empty.\n"
                "Either run with CLI flags, e.g.\n"
                '  python batch_mirror_plots.py --input_dir "F:\\path\\to\\folder" --output_dir "F:\\path\\to\\pngs"\n'
                "or set INPUT_DIR_DEFAULT at the top of this script."
            )
            sys.exit(2)

        input_dir = Path(INPUT_DIR_DEFAULT).expanduser().resolve()
        if OUTPUT_DIR_DEFAULT:
            output_dir = Path(OUTPUT_DIR_DEFAULT).expanduser().resolve()
        else:
            output_dir = input_dir
        pattern, mz_decimals = PATTERN_DEFAULT, MZ_DECIMALS_DEFAULT
        print(f"[INFO] Using defaults: input_dir={input_dir}, output_dir={output_dir}, "
              f"pattern={pattern}, mz_decimals={mz_decimals}")

    output_dir.mkdir(parents=True, exist_ok=True)

    files = list(input_dir.glob(pattern))
    if not files:
        print(f"[INFO] No files found in {input_dir} matching pattern {pattern}.")
        return

    # key_without_run -> {'A': Path, 'B': Path}
    pairs = {}
    for candidate in files:
        key, run = split_key_and_run(candidate.stem)
        if run in ("A", "B"):
            pairs.setdefault(key, {})[run] = candidate

    total_pairs = sum("A" in runs and "B" in runs for runs in pairs.values())
    print(f"[INFO] Found {len(pairs)} candidate keys; {total_pairs} A/B pairs will be processed.")

    for key in sorted(pairs):
        runs = pairs[key]
        absent = [label for label in ("A", "B") if label not in runs]
        if absent:
            # Each entry holds at least one run, so at most one label is absent.
            missing = absent[0]
            print(f"[WARN] Skipping '{key}': missing run{missing}.")
            continue

        path_a = runs["A"]
        path_b = runs["B"]
        try:
            spec_a = load_spectrum(path_a, mz_decimals=mz_decimals)
            spec_b = load_spectrum(path_b, mz_decimals=mz_decimals)

            # Outer join keeps peaks present in only one run (filled with 0)
            merged = pd.merge(spec_a, spec_b, on="mz", how="outer", suffixes=("_a", "_b"))
            aligned = merged.fillna(0.0).sort_values("mz")
            v_a = aligned["intensity_a"].to_numpy(dtype=float)
            v_b = aligned["intensity_b"].to_numpy(dtype=float)
            cs = cosine_similarity(v_a, v_b)

            out_path = output_dir / f"{key}.png"
            make_mirror_plot(aligned, v_a, v_b, f"{key} | Cosine similarity = {cs:.4f}", out_path)

            print(f"[OK] Saved: {out_path}  (cosine={cs:.4f})")
        except Exception as e:
            print(f"[ERROR] Failed on key '{key}' ({path_a.name} vs {path_b.name}): {e}")

    print("[DONE]")


if __name__ == "__main__":
    # Script entry point: run the batch mirror-plot job.
    main()


In [None]:
# batch_mirror_plots.py
import argparse
import sys
import re
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ========= Optional defaults if you run without CLI flags =========
# main() uses these values when try_parse_args() returns None.
# PATTERN_DEFAULT and MZ_DECIMALS_DEFAULT also serve as the argparse defaults.
INPUT_DIR_DEFAULT  = r"F:/decon/decon/"  # folder to scan, e.g. r"F:\path\to\your\folder"
OUTPUT_DIR_DEFAULT = r"F:/decon/plot/plot/"  # folder for the PNGs; leave empty "" to save into input dir
PATTERN_DEFAULT    = "*_run[AB]_mass.*"  # glob pattern matching runA/runB files
MZ_DECIMALS_DEFAULT = 5  # m/z values are rounded to this many decimals before alignment
# ==================================================================

def try_parse_args():
    """
    Parse the command-line options, or signal that the in-file defaults apply.

    Returns:
      argparse.Namespace with input_dir, output_dir, pattern and mz_decimals,
      or None when no arguments were given or when they could not be parsed
      (in which case the caller falls back to the top-of-file defaults).

    Raises:
      SystemExit: only for a successful argparse exit such as -h/--help, so
      that asking for help does not start a batch run with the defaults.
    """
    parser = argparse.ArgumentParser(
        description="Create mirror plots for runA/runB replicate pairs in a folder."
    )
    parser.add_argument("--input_dir", type=str, required=True,
                        help="Folder containing spectra files.")
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Folder to save PNG plots (defaults to input_dir).")
    parser.add_argument("--pattern", type=str, default=PATTERN_DEFAULT,
                        help=f"Glob pattern for spectra files (default: {PATTERN_DEFAULT}).")
    parser.add_argument("--mz_decimals", type=int, default=MZ_DECIMALS_DEFAULT,
                        help=f"Round m/z to this many decimals before alignment (default: {MZ_DECIMALS_DEFAULT}).")

    if len(sys.argv) == 1:
        return None  # fall back to top-of-file defaults
    try:
        return parser.parse_args()
    except SystemExit as exc:
        # argparse exits with status 0 after printing --help; let that through
        # instead of silently continuing with the defaults.
        if exc.code in (0, None):
            raise
        # Any other exit is a parse error (e.g. foreign arguments on argv):
        # keep the best-effort fallback to the defaults.
        return None

def load_spectrum(path: Path, mz_decimals: int = 5) -> pd.DataFrame:
    """
    Read a two-column, whitespace-separated spectrum file (no header row).

    Rows with missing fields are dropped, rows whose m/z is not a finite
    number are dropped, and non-numeric intensities count as 0.0. Intensities
    that share the same (optionally rounded) m/z are summed.

    Args:
      path: file to read.
      mz_decimals: number of decimals to round m/z to before summing;
        None or a negative value disables rounding.

    Returns:
      DataFrame with columns "mz" and "intensity", sorted by "mz".
    """
    raw = pd.read_csv(
        path, sep=r"\s+", header=None, names=["mz", "intensity"], engine="python"
    ).dropna()

    # Build a fresh frame instead of assigning into a filtered one: writing a
    # column into the result of a boolean selection raises
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    df = pd.DataFrame({
        "mz": pd.to_numeric(raw["mz"], errors="coerce"),
        "intensity": pd.to_numeric(raw["intensity"], errors="coerce").fillna(0.0),
    })
    df = df[np.isfinite(df["mz"]) & np.isfinite(df["intensity"])]
    if mz_decimals is not None and mz_decimals >= 0:
        df = df.assign(mz=df["mz"].round(mz_decimals))
    return df.groupby("mz", as_index=False)["intensity"].sum().sort_values("mz")

def cosine_similarity(x, y) -> float:
    """
    Cosine of the angle between two equally long vectors.

    Both inputs are converted to float arrays. If either vector has zero
    length (norm 0), the similarity is undefined and NaN is returned.
    """
    vec_x = np.asarray(x, dtype=float)
    vec_y = np.asarray(y, dtype=float)
    norm_x = np.linalg.norm(vec_x)
    norm_y = np.linalg.norm(vec_y)
    if norm_x != 0.0 and norm_y != 0.0:
        return float(np.dot(vec_x, vec_y) / (norm_x * norm_y))
    return float("nan")

def max_norm(v: np.ndarray) -> np.ndarray:
    """
    Scale an array so that its largest absolute value becomes 1.

    Empty arrays and arrays whose peak is not positive are returned as-is.
    """
    if v.size == 0:
        return v
    peak = np.max(np.abs(v))
    if peak > 0:
        return v / peak
    return v

# ==== STRICT file-name pattern: <prefix>_run(A|B)_mass with at most one extension ====
# Matching is case-insensitive; the pairing key derived from it is '<prefix>_mass'.
RUN_RE = re.compile(r"^(?P<prefix>.+)_run(?P<run>[AB])_mass(?:\.[^.]+)?$", re.IGNORECASE)

def split_key_and_run_from_name(filename: str):
    """
    Split a spectrum file name into its pairing key and run letter.

    filename is the base name (extension included). It must look like
      <prefix>_runA_mass(.ext) or <prefix>_runB_mass(.ext)
    Returns:
      ('<prefix>_mass', 'A' or 'B') for a matching name,
      (None, None) otherwise.
    """
    match = RUN_RE.match(filename)
    if match is None:
        return None, None
    parts = match.groupdict()
    # The run letter is upper-cased because the pattern also accepts 'a'/'b'.
    return parts["prefix"] + "_mass", parts["run"].upper()

def make_mirror_plot(aligned: pd.DataFrame, v_a: np.ndarray, v_b: np.ndarray,
                     title_name: str, out_path: Path):
    """
    Save a mirror plot of two aligned spectra as an image file.

    Each intensity vector is scaled by its own maximum absolute value
    (max_norm); v_a is drawn above the zero line and v_b, negated, below it.

    Args:
      aligned: frame whose "mz" column supplies the x values.
      v_a: intensities drawn on the positive side.
      v_b: intensities drawn mirrored on the negative side.
      title_name: plot title.
      out_path: destination of the saved figure.
    """
    plot_a = max_norm(v_a)
    plot_b = -max_norm(v_b)  # mirrored

    fig, ax = plt.subplots(figsize=(12, 5))
    try:
        mz = aligned["mz"].values
        ax.plot(mz, plot_a)  # default color
        ax.plot(mz, plot_b)  # default color
        ax.axhline(0, linewidth=1, color="black")
        ax.set_xlabel("m/z")
        ax.set_ylabel("Normalized intensity (top vs mirrored)")
        ax.set_title(title_name)
        fig.tight_layout()
        fig.savefig(out_path, dpi=160)
    finally:
        # Always release the figure: callers keep going after a failed pair,
        # so an unclosed figure would pile up in pyplot's registry.
        plt.close(fig)

def main():
    """
    Create one mirror plot per runA/runB pair found in the input folder.

    Settings come from the command line when try_parse_args() returns a
    namespace, otherwise from the *_DEFAULT constants of this script. Files
    are paired by the key returned from split_key_and_run_from_name(); for
    every key that has both runs, the two spectra are loaded, aligned on m/z
    (missing peaks count as 0), compared by cosine similarity and plotted to
    '<output_dir>/<key>.png'. A failure on one pair is reported and does not
    stop the remaining pairs.

    Exits with status 2 when neither --input_dir nor INPUT_DIR_DEFAULT is set.
    """
    args = try_parse_args()

    if args is None:
        if not INPUT_DIR_DEFAULT:
            print(
                "[ERROR] No --input_dir provided and INPUT_DIR_DEFAULT is empty.\n"
                "Either run with CLI flags, e.g.\n"
                '  python batch_mirror_plots.py --input_dir "F:\\path\\to\\folder" --output_dir "F:\\path\\to\\pngs"\n'
                "or set INPUT_DIR_DEFAULT at the top of this script."
            )
            sys.exit(2)
        input_dir = Path(INPUT_DIR_DEFAULT).expanduser().resolve()
        output_dir = (Path(OUTPUT_DIR_DEFAULT).expanduser().resolve()
                      if OUTPUT_DIR_DEFAULT else input_dir)
        pattern = PATTERN_DEFAULT
        mz_decimals = MZ_DECIMALS_DEFAULT
        print(f"[INFO] Using defaults: input_dir={input_dir}, output_dir={output_dir}, "
              f"pattern={pattern}, mz_decimals={mz_decimals}")
    else:
        input_dir = Path(args.input_dir).expanduser().resolve()
        output_dir = Path(args.output_dir).expanduser().resolve() if args.output_dir else input_dir
        pattern = args.pattern
        mz_decimals = args.mz_decimals

    output_dir.mkdir(parents=True, exist_ok=True)

    # Sort the matches so the run is reproducible: with an unsorted listing,
    # which of two files sharing a key and run gets plotted would depend on
    # directory order.
    files = sorted(input_dir.glob(pattern))
    if not files:
        print(f"[INFO] No files found in {input_dir} matching pattern {pattern}.")
        return

    # Build pairs strictly by the <prefix>_runX_mass pattern
    pairs = {}
    for p in files:
        key, run = split_key_and_run_from_name(p.name)
        if key is None or run not in ("A", "B"):
            # Not a strict runA/runB _mass file -> skip
            continue
        entry = pairs.setdefault(key, {})
        if run in entry:
            # Same key and run seen twice (e.g. two different extensions):
            # the later file still wins, but no longer silently.
            print(f"[WARN] Multiple run{run} files for '{key}': "
                  f"using {p.name}, ignoring {entry[run].name}.")
        entry[run] = p

    total_pairs = sum(1 for v in pairs.values() if "A" in v and "B" in v)
    print(f"[INFO] Found {len(pairs)} candidate keys; {total_pairs} A/B pairs will be processed.")

    for key, d in sorted(pairs.items()):
        if "A" not in d or "B" not in d:
            missing = "A" if "A" not in d else "B"
            print(f"[WARN] Skipping '{key}': missing run{missing}.")
            continue

        path_a, path_b = d["A"], d["B"]
        try:
            spec_a = load_spectrum(path_a, mz_decimals=mz_decimals)
            spec_b = load_spectrum(path_b, mz_decimals=mz_decimals)

            # Outer join keeps peaks present in only one run; their missing
            # counterpart is treated as zero intensity.
            aligned = (
                pd.merge(spec_a, spec_b, on="mz", how="outer", suffixes=("_a", "_b"))
                .fillna(0.0)
                .sort_values("mz")
            )
            v_a = aligned["intensity_a"].to_numpy(dtype=float)
            v_b = aligned["intensity_b"].to_numpy(dtype=float)
            cs = cosine_similarity(v_a, v_b)

            title = f"{key} | Cosine similarity = {cs:.4f}"
            out_path = output_dir / f"{key}.png"   # filename WITHOUT cosine values
            make_mirror_plot(aligned, v_a, v_b, title, out_path)

            print(f"[OK] Saved: {out_path}  (cosine={cs:.4f})")
        except Exception as e:
            # Per-pair boundary: report the failure and continue with the next key.
            print(f"[ERROR] Failed on key '{key}' ({path_a.name} vs {path_b.name}): {e}")

    print("[DONE]")

# Entry point: run main() only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()
