Generating informative features from the training dataset

In [1]:
# -*- coding: utf-8 -*-
import os, re, json, gc, sys, ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Optional (not used in plotting)
import seaborn as sns  # noqa: F401
from scipy.signal import find_peaks  # noqa: F401
from scipy.signal import savgol_filter  # noqa: F401
from scipy.ndimage import gaussian_filter1d  # noqa: F401
from scipy.linalg import svd  # noqa: F401

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers
from sklearn.model_selection import KFold

# ----------------------------
# Repro & GPU memory growth
# ----------------------------
# Seed NumPy's and TensorFlow's global generators once at import time.
np.random.seed(42)
tf.random.set_seed(42)
try:
    # Ask TensorFlow to grow GPU memory on demand for every visible GPU.
    gpus = tf.config.list_physical_devices('GPU')
    for _gpu in gpus:
        tf.config.experimental.set_memory_growth(_gpu, True)
except Exception as _gpu_err:
    # Best effort: keep going with TensorFlow's defaults, but say so instead
    # of hiding the failure.
    print(f"[WARN] Could not enable GPU memory growth: {_gpu_err}")

def hard_free():
    """Aggressively release memory after each bin.

    Runs four independent, best-effort steps: close all matplotlib figures,
    clear the Keras backend session, run the garbage collector twice, and (on
    Linux only) call glibc's ``malloc_trim(0)``.

    A failure in one step is ignored so the remaining steps still run. Only
    ``Exception`` subclasses are suppressed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so a long run can still be interrupted.
    """
    try:
        plt.close('all')
    except Exception:
        pass
    try:
        tf.keras.backend.clear_session()
    except Exception:
        pass
    try:
        gc.collect()
        gc.collect()
    except Exception:
        pass
    try:
        import ctypes, platform
        if platform.system().lower() == "linux":
            ctypes.CDLL("libc.so.6").malloc_trim(0)
    except Exception:
        pass

# ----------------------------
# Grouped log-odds gradient
# ----------------------------
@tf.function(reduce_retracing=True)
def _group_logodds_grad_for_model(x1, model, pos_ids, neg_ids, eps):
    """Gradient of a grouped log-odds score with respect to the input ``x1``.

    The model output is indexed along axis 1 by ``pos_ids`` and ``neg_ids``;
    the selected columns are summed per group and the score is
    ``log(sum_pos + eps) - log(sum_neg + eps)``.

    Args:
        x1: Input tensor. The result is squeezed on axis 0, so the leading
            dimension must be 1 (the caller in this file passes one row).
        model: Callable invoked as ``model(x1, training=False)``.
        pos_ids: Column indices forming the numerator group.
        neg_ids: Column indices forming the denominator group.
        eps: Constant added inside both logarithms to avoid ``log(0)``.

    Returns:
        The gradient of the score w.r.t. ``x1`` with axis 0 removed.

    NOTE(review): ``model``, the id groups and ``eps`` arrive as Python
    objects, so ``tf.function`` presumably traces a separate graph per
    distinct model -- confirm whether that retracing cost matters here.
    """
    pos_index = tf.constant(pos_ids, dtype=tf.int32)
    neg_index = tf.constant(neg_ids, dtype=tf.int32)
    with tf.GradientTape() as tape:
        # x1 is a plain tensor, not a variable, so it must be watched explicitly.
        tape.watch(x1)
        probs = model(x1, training=False)
        pos_mass = tf.reduce_sum(tf.gather(probs, pos_index, axis=1), axis=1)
        neg_mass = tf.reduce_sum(tf.gather(probs, neg_index, axis=1), axis=1)
        log_odds = tf.math.log(pos_mass + eps) - tf.math.log(neg_mass + eps)
    return tf.squeeze(tape.gradient(log_odds, x1), axis=0)

def compute_avg_group_logodds_gradient(X: np.ndarray, models: list, pos_ids=(2,3), neg_ids=(0,1), eps: float = 1e-8) -> np.ndarray:
    """Average the grouped log-odds input gradient over models and samples.

    For every row of ``X`` the gradient is computed once per model, averaged
    across models, and those per-sample averages are then averaged across
    samples.

    Args:
        X: 2-D array of samples (rows) by features (columns); converted to
            float32 before use.
        models: Non-empty sequence of models to evaluate.
        pos_ids: Class indices whose probabilities form the numerator group.
        neg_ids: Class indices whose probabilities form the denominator group.
        eps: Constant added inside the logarithms for numerical safety.

    Returns:
        1-D NumPy array with one averaged gradient value per feature.

    Raises:
        ValueError: If ``models`` is empty or ``X`` has no rows. Without this
            check the call fails later inside ``tf.stack`` on an empty list.
    """
    models = list(models)
    if not models:
        raise ValueError("compute_avg_group_logodds_gradient needs at least one model")
    if len(X) == 0:
        raise ValueError("compute_avg_group_logodds_gradient needs at least one sample in X")

    # Plain tuples of Python ints, whatever sequence type the caller passed.
    pos_ids = tuple(int(i) for i in pos_ids)
    neg_ids = tuple(int(i) for i in neg_ids)

    X_t = tf.convert_to_tensor(X, dtype=tf.float32)
    n_samples = int(X_t.shape[0])
    per_sample = []
    for i in range(n_samples):
        x_i = X_t[i:i + 1]  # keep the leading axis: the gradient helper squeezes it
        model_grads = [
            _group_logodds_grad_for_model(x_i, m, pos_ids, neg_ids, eps)
            for m in models
        ]
        per_sample.append(tf.reduce_mean(tf.stack(model_grads, axis=0), axis=0))
    avg_grad = tf.reduce_mean(tf.stack(per_sample, axis=0), axis=0)
    return avg_grad.numpy()

# ----------------------------
# Model
# ----------------------------
def build_model(input_dim: int, num_classes: int):
    """Build and compile the dense softmax classifier used for every run.

    Architecture: Dense(128, relu, L1 kernel penalty 0.01) -> Dense(32, relu)
    -> Dense(num_classes, softmax). Compiled with Adam, sparse categorical
    cross-entropy (integer labels) and an accuracy metric.

    The input width is declared with an explicit ``Input`` layer instead of
    ``Dense(..., input_dim=...)``; the latter makes recent Keras versions emit
    a warning each time a model is built.

    Args:
        input_dim: Number of input features.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        tf.keras.Input(shape=(input_dim,)),
        Dense(128, activation='relu', kernel_regularizer=regularizers.l1(0.01)),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['accuracy'])
    return model

# ----------------------------
# Helpers (cosine + plotting)
# ----------------------------
def cosine_sim(a: np.ndarray, b: np.ndarray, eps: float = 1e-12) -> float:
    """Cosine similarity between two array-likes, flattened to 1-D.

    If the inputs differ in length, both are truncated to the shorter one.
    ``eps`` is added to the product of the norms, so all-zero (or empty)
    inputs yield 0.0 instead of a division by zero.
    """
    vec_a = np.ravel(np.asarray(a, dtype=float))
    vec_b = np.ravel(np.asarray(b, dtype=float))
    common = min(vec_a.size, vec_b.size)
    vec_a, vec_b = vec_a[:common], vec_b[:common]
    scale = (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)) + eps
    return float(np.dot(vec_a, vec_b) / scale)

def mirror_plot(x, top_y, bottom_y, title, outfile):
    """Save a mirror plot: ``top_y`` above the zero line, ``bottom_y`` below.

    Args:
        x: Shared x-axis values.
        top_y: Values drawn as-is and labelled "Run A".
        bottom_y: Values drawn negated and labelled "Run B (mirrored)". Any
            array-like is accepted; it is converted with ``np.asarray`` so
            plain lists can be negated too.
        title: Figure title.
        outfile: Destination image path, written at 200 dpi.

    The figure is always closed, even if plotting or saving raises, so a
    failed save cannot leave an open figure behind.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.plot(x, top_y, linewidth=1.0, label="Run A")
        ax.plot(x, -np.asarray(bottom_y), linewidth=1.0, label="Run B (mirrored)")
        ax.axhline(0.0, linewidth=0.8)
        ax.set_xlabel("m/z (approx grid)")
        ax.set_ylabel("Gradient magnitude")
        ax.set_title(title)
        ax.legend()
        fig.tight_layout()
        fig.savefig(outfile, dpi=200)
    finally:
        plt.close(fig)

# ----------------------------
# Splitter (4 CSVs per combined CSV) with bin prefix
# ----------------------------
# Columns of a combined CSV that each get their own split file.
TARGET_COLS = ["pos_runA", "pos_runB", "negabs_runA", "negabs_runB"]
# Name of the x-axis column copied into every split file.
MZ_COL = "m/z"

def split_csv(input_path: str, out_dir: str, bin_value: int) -> list[str]:
    """Split one combined CSV into one CSV per available target column.

    Each output holds two columns, ``MZ_COL`` and one entry of ``TARGET_COLS``,
    and is named ``bin<bin_value>_<input basename>_<column>.csv``.

    Args:
        input_path: Path of the combined CSV to read.
        out_dir: Directory for the split files; created if it does not exist.
        bin_value: Bin number used as the filename prefix.

    Returns:
        Paths of the files written. Empty if the input lacks ``MZ_COL`` or has
        none of ``TARGET_COLS``; a "[SKIP]" message is printed in that case.
    """
    df = pd.read_csv(input_path)
    if MZ_COL not in df.columns:
        print(f"[SKIP] {input_path} (no '{MZ_COL}' column)")
        return []
    available_targets = [c for c in TARGET_COLS if c in df.columns]
    if not available_targets:
        print(f"[SKIP] {input_path} (none of {TARGET_COLS} found)")
        return []

    # Do not rely on the caller having created the destination beforehand.
    os.makedirs(out_dir, exist_ok=True)

    base = os.path.splitext(os.path.basename(input_path))[0]
    written = []
    for col in available_targets:
        out_path = os.path.join(out_dir, f"bin{bin_value}_{base}_{col}.csv")
        df[[MZ_COL, col]].to_csv(out_path, index=False)
        written.append(out_path)
    return written

def process_folder(folder_path: str, bin_value: int) -> list[str]:
    """Split every CSV file directly inside a folder into ``<folder>/result/``.

    Args:
        folder_path: Folder whose top-level ``*.csv`` files (case-insensitive
            extension) are passed to ``split_csv``.
        bin_value: Bin number forwarded to ``split_csv`` as filename prefix.

    Returns:
        Paths of all split files written under ``result/``. Empty (with a
        "[WARN]" message) if ``folder_path`` is not a directory.

    Entries are visited in sorted name order so the log and the returned list
    are deterministic, and anything that is not a regular file (for example a
    directory whose name ends in ``.csv``) is skipped.
    """
    if not os.path.isdir(folder_path):
        print(f"[WARN] Not a folder: {folder_path}")
        return []
    out_dir = os.path.join(folder_path, "result")
    os.makedirs(out_dir, exist_ok=True)

    all_outputs = []
    for fname in sorted(os.listdir(folder_path)):
        if not fname.lower().endswith(".csv"):
            continue
        fpath = os.path.join(folder_path, fname)
        if not os.path.isfile(fpath):
            continue
        print(f"Splitting {fpath} ...")
        all_outputs.extend(split_csv(fpath, out_dir, bin_value))

    print(f"Split done. Wrote {len(all_outputs)} files to {out_dir}")
    return all_outputs

# ----------------------------
# Config (edit these)
# ----------------------------
# Input dataset read by main(); it uses the 'bin' and 'target' columns and
# treats every other column as a feature.
CSV_PATH    = r"F:/binary/neuro_training.csv"   # input dataset with 'bin' and 'target'
# Keras fit() settings applied to every model.
EPOCHS      = 50
BATCH_SIZE  = 32
# Each run trains K_SPLITS folds x N_REPEATS models per fold.
K_SPLITS    = 5
N_REPEATS   = 10
# One ensemble per seed base; main() uses index 0 as "Run A" and index 1 as "Run B".
SEED_BASES  = [111, 777]                   # two independent runs
# Root output directory; main() creates one bin_<N>/ subfolder per bin here.
OUT_ROOT    = r"F:/binary/"      # everything goes directly under bin_<N>/
# Optional list of bin values to process; None processes every bin found.
BIN_WHITELIST = None  # e.g. [35, 75]

# Fixed grouping list (no prompt, no AUTO)
# Each entry is (pos_ids, neg_ids): the class indices for the numerator and
# denominator of the grouped log-odds. The commented line below is a larger
# alternative set kept for reference.
# GROUPINGS = [((2,),(1,)), ((2,),(0,)), ((1,),(0,)), ((3,),(2,)), ((3,),(0,1,2)), ((2,1),(0,)), ((3,),(1,)),  ((3,),(0,))]
GROUPINGS = [((1,),(0,))]

# ----------------------------
# KFold+Repeats trainer
# ----------------------------
def train_kfold_repeats(X: np.ndarray, Y: np.ndarray, seed_base: int):
    """Train ``K_SPLITS * N_REPEATS`` models and return them as one list.

    The data is split with a shuffled ``KFold`` whose ``random_state`` is
    fixed at 42, so the folds are the same for every ``seed_base``. Within
    each fold, ``N_REPEATS`` models are trained on the fold's training part,
    each after reseeding with ``seed_base * 1000 + fold * 100 + repeat``
    (folds are numbered from 1). The held-out part is passed to ``fit`` as
    validation data only.

    Note: the seed formula yields distinct seeds per (fold, repeat) only while
    ``N_REPEATS`` is at most 100.

    Args:
        X: Feature matrix, samples by features.
        Y: Integer class labels; the class count is taken as ``max(Y) + 1``.
        seed_base: Base value distinguishing one run's seeds from another's.

    Returns:
        List of trained models, ordered by fold and then by repeat.
    """
    splitter = KFold(n_splits=K_SPLITS, shuffle=True, random_state=42)
    n_classes = int(np.max(Y)) + 1
    n_features = X.shape[1]
    ensemble = []
    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(X, Y), start=1):
        X_train, y_train = X[train_idx], Y[train_idx]
        held_out = (X[val_idx], Y[val_idx])
        for repeat in range(N_REPEATS):
            seed = seed_base * 1000 + fold_no * 100 + repeat
            tf.keras.utils.set_random_seed(seed)
            np.random.seed(seed)
            net = build_model(n_features, n_classes)
            net.fit(
                X_train, y_train,
                epochs=EPOCHS,
                batch_size=BATCH_SIZE,
                validation_data=held_out,
                verbose=0,
            )
            ensemble.append(net)
        print(f"[Seed base {seed_base}] Fold {fold_no}/{K_SPLITS} trained {N_REPEATS} models (total: {len(ensemble)})")
    return ensemble

# ----------------------------
# MAIN (bin-root outputs + keep only split CSVs)
# ----------------------------
def main():
    """Run the full per-bin pipeline and write all outputs under ``OUT_ROOT``.

    Steps:
      1. Read ``CSV_PATH``, pick the bins to process (optionally filtered by
         ``BIN_WHITELIST``) and keep only rows whose target is in 0..3.
      2. For each bin: scale the features, train two ensembles (seed bases
         ``SEED_BASES[0]`` = "Run A" and ``SEED_BASES[1]`` = "Run B"), and for
         every grouping in ``GROUPINGS`` compute both averaged gradients,
         their positive/negative-part cosine similarities, a combined CSV,
         two mirror plots and a summary JSON.
      3. Split each combined CSV written in this run into per-column CSVs,
         move them into ``bin_<N>/csv/``, delete the combined CSVs and write
         ``split_manifest.json``.

    Raises:
        ValueError: If no bin remains to process.
        AssertionError: If the labels are empty or not contiguous from 0, or
            a grouping references a class index outside the bin's classes.
    """
    df = pd.read_csv(CSV_PATH)
    # Discover bins
    bins_found = sorted(df["bin"].dropna().unique().tolist())
    if BIN_WHITELIST is not None:
        allowed_bins = set(BIN_WHITELIST)
        bins_to_process = [b for b in bins_found if b in allowed_bins]
    else:
        bins_to_process = bins_found
    if len(bins_to_process) == 0:
        raise ValueError(f"No 'bin' values found in {CSV_PATH}")

    # Keep only classes 0..3 by default
    df = df[df["target"].astype(int).isin([0,1,2,3])].copy()
    unique_labels = np.sort(df["target"].astype(int).unique())
    # The size check makes an empty label set fail with the message below
    # rather than with an IndexError on unique_labels[0].
    assert unique_labels.size > 0 and unique_labels[0] == 0 \
        and np.array_equal(unique_labels, np.arange(unique_labels[-1] + 1)), \
        f"Non-contiguous labels detected: {unique_labels}. Please remap to 0..C-1."
    groupings = GROUPINGS
    print(f"\nUsing fixed grouping(s): {groupings}")

    os.makedirs(OUT_ROOT, exist_ok=True)

    for BIN_VALUE in bins_to_process:
        print(f"\n================= BIN {BIN_VALUE} =================")
        models_A = models_B = None
        X = Y = fdf = None

        try:
            fdf = df[df["bin"] == BIN_VALUE].copy()
            if fdf.empty:
                print(f"[WARN] No rows for bin {BIN_VALUE}; skipping.")
                continue

            # Normalize all features except ['bin','target'] within this bin
            cols_to_norm = fdf.columns.difference(['bin', 'target'])
            fdf[cols_to_norm] = fdf[cols_to_norm].apply(lambda x: x / (x.max() + 1.0))

            Y = fdf["target"].astype(int).to_numpy()
            X = np.nan_to_num(fdf.drop(columns=['bin', 'target']).to_numpy(), copy=False).astype(np.float32)

            # KFold needs at least K_SPLITS samples; skip smaller bins here
            # instead of letting the splitter abort the whole run.
            if X.shape[0] < max(2, K_SPLITS) or X.shape[1] < 1:
                print(f"[WARN] Insufficient data for bin {BIN_VALUE} (samples={X.shape[0]}, dim={X.shape[1]}). Skipping.")
                continue

            print(f"Bin {BIN_VALUE}: samples={X.shape[0]}, dim={X.shape[1]}  class_counts="
                  f"{dict(zip(*np.unique(Y, return_counts=True)))}")

            # Output dirs per bin (everything under bin folder)
            OUT_DIR   = os.path.join(OUT_ROOT, f"bin_{str(BIN_VALUE).replace('.', '_')}")
            CSV_DIR   = os.path.join(OUT_DIR, "csv")
            PLOTS_DIR = os.path.join(OUT_DIR, "plots")
            os.makedirs(OUT_DIR, exist_ok=True)
            os.makedirs(CSV_DIR, exist_ok=True)
            os.makedirs(PLOTS_DIR, exist_ok=True)

            # Train ensembles ONCE per run (A, B)
            models_A = train_kfold_repeats(X, Y, seed_base=SEED_BASES[0])
            models_B = train_kfold_repeats(X, Y, seed_base=SEED_BASES[1])

            # grid helper
            def _make_grid(n):
                """Return (n_grid, x_grid): a 0.1-step axis starting at 600, capped at 10000 points."""
                # NOTE(review): gradients beyond the first 10000 features are
                # dropped from the CSVs, plots and cosine scores -- confirm
                # this cap is intended when the feature count is larger.
                n_grid = min(10000, n)
                # Float arange can overshoot by one element, hence the slice.
                x_grid = np.arange(600, 600 + 0.1 * n_grid, 0.1)[:n_grid]
                return n_grid, x_grid

            def save_compare_only(pos_ids, neg_ids):
                """Compare Run A vs Run B for one grouping; return the combined CSV path."""
                grad_A = compute_avg_group_logodds_gradient(X, models_A, pos_ids=pos_ids, neg_ids=neg_ids, eps=1e-8)
                grad_B = compute_avg_group_logodds_gradient(X, models_B, pos_ids=pos_ids, neg_ids=neg_ids, eps=1e-8)

                pos_tag = "_".join(map(str, pos_ids))
                neg_tag = "_".join(map(str, neg_ids))
                tag = f"pos_{pos_tag}__neg_{neg_tag}"

                n_grid, x_grid = _make_grid(min(grad_A.size, grad_B.size))
                yA = grad_A[:n_grid]; yB = grad_B[:n_grid]
                # Positive parts, and magnitudes of the negative parts.
                yA_pos = np.where(yA > 0, yA, 0.0)
                yB_pos = np.where(yB > 0, yB, 0.0)
                yA_neg = np.where(yA < 0, -yA, 0.0)  # abs
                yB_neg = np.where(yB < 0, -yB, 0.0)

                cos_pos = cosine_sim(yA_pos, yB_pos)
                cos_neg = cosine_sim(yA_neg, yB_neg)

                # --- write combined CSV under bin/<csv> ---
                comb_csv = os.path.join(CSV_DIR, f"grads_AB__{tag}.csv")
                pd.DataFrame({
                    "m/z": x_grid,
                    "grad_runA": yA,
                    "grad_runB": yB,
                    "pos_runA": yA_pos,
                    "pos_runB": yB_pos,
                    "negabs_runA": yA_neg,
                    "negabs_runB": yB_neg,
                }).to_csv(comb_csv, index=False)

                # plots under bin/<plots>
                pos_title = (f"Bin {BIN_VALUE} — Mirror Positive Gradients "
                             f"[{tag}] (cos={cos_pos:.4f})")
                neg_title = (f"Bin {BIN_VALUE} — Mirror Negative Gradients |abs| "
                             f"[{tag}] (cos={cos_neg:.4f})")

                mirror_plot(
                    x_grid, yA_pos, yB_pos,
                    title=pos_title,
                    outfile=os.path.join(PLOTS_DIR, f"{tag}__mirror_pos.png")
                )
                mirror_plot(
                    x_grid, yA_neg, yB_neg,
                    title=neg_title,
                    outfile=os.path.join(PLOTS_DIR, f"{tag}__mirror_negabs.png")
                )

                # summary JSON directly under bin/
                with open(os.path.join(OUT_DIR, f"summary__{tag}.json"), "w") as fC:
                    json.dump({
                        "bin": BIN_VALUE,
                        "grouping": {"pos_ids": list(pos_ids), "neg_ids": list(neg_ids)},
                        "comparison": "Run A vs Run B",
                        "cosine_pos": cos_pos,
                        "cosine_neg_abs": cos_neg,
                        "paths": {
                            "combined_csv": comb_csv,
                            "plots_dir": PLOTS_DIR,
                        }
                    }, fC, indent=2)

                print(f"  [COMPARE] {tag}  |  Cos(pos)={cos_pos:.6f}  Cos(neg|abs|)={cos_neg:.6f}")
                return comb_csv

            # run all groupings -> write combined CSVs/plots/JSONs
            combined_csvs = []
            num_classes = int(np.max(Y)) + 1
            for (pos_ids, neg_ids) in groupings:
                for idx in (*pos_ids, *neg_ids):
                    assert 0 <= idx < num_classes, f"Class index {idx} out of range 0..{num_classes-1}"
                combined_csvs.append(save_compare_only(pos_ids, neg_ids))

            # --- Split step: create split CSVs into csv/result/ ---
            # Only the combined CSVs written in this run are split. Scanning
            # the whole csv/ folder would also re-split the per-column files
            # left there by an earlier run (they contain "m/z" plus a target
            # column) and pile up bin<N>_bin<N>_..._<col>_<col>.csv files.
            result_dir = os.path.join(CSV_DIR, "result")
            os.makedirs(result_dir, exist_ok=True)
            split_outputs = []
            for comb_csv in combined_csvs:
                print(f"Splitting {comb_csv} ...")
                split_outputs.extend(split_csv(comb_csv, result_dir, int(BIN_VALUE)))
            print(f"Split done. Wrote {len(split_outputs)} files to {result_dir}")

            # Move split files from csv/result/ -> csv/ and delete originals
            moved = []
            if os.path.isdir(result_dir):
                for fname in sorted(os.listdir(result_dir)):
                    src = os.path.join(result_dir, fname)
                    dst = os.path.join(CSV_DIR, fname)
                    os.replace(src, dst)
                    moved.append(dst)
                # try to remove now-empty result dir
                try:
                    os.rmdir(result_dir)
                except OSError:
                    pass

            # Delete original combined CSVs (keep only split)
            for fname in os.listdir(CSV_DIR):
                if fname.lower().endswith(".csv") and fname.startswith("grads_AB__"):
                    try:
                        os.remove(os.path.join(CSV_DIR, fname))
                    except Exception as e:
                        print(f"[WARN] Could not remove {fname}: {e}")

            # Save manifest of final CSVs
            with open(os.path.join(OUT_DIR, "split_manifest.json"), "w") as fman:
                json.dump({
                    "bin": BIN_VALUE,
                    "final_csvs": sorted([os.path.basename(p) for p in moved]),
                }, fman, indent=2)

        finally:
            # Drop this bin's large objects before the next iteration. They
            # are locals of main() (shared with the nested helpers above), so
            # rebinding them here is what releases them; deleting entries from
            # globals() would miss them and could remove unrelated notebook
            # variables that happen to be named X, Y or fdf.
            models_A = models_B = None
            X = Y = fdf = None
            hard_free()

    print("\nAll bins processed. (bin-root outputs; csv keeps only split files)\n")

# ---- run ----
# Run the whole pipeline only when this code is executed directly (as a
# notebook cell or script), not when it is imported as a module.
if __name__ == "__main__":
    main()



Using fixed grouping(s): [((1,), (0,))]

Bin 5: samples=30, dim=13690  class_counts={0: 14, 1: 16}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[Seed base 111] Fold 1/5 trained 10 models (total: 10)
[Seed base 111] Fold 2/5 trained 10 models (total: 20)
[Seed base 111] Fold 3/5 trained 10 models (total: 30)
[Seed base 111] Fold 4/5 trained 10 models (total: 40)
[Seed base 111] Fold 5/5 trained 10 models (total: 50)
[Seed base 777] Fold 1/5 trained 10 models (total: 10)
[Seed base 777] Fold 2/5 trained 10 models (total: 20)
[Seed base 777] Fold 3/5 trained 10 models (total: 30)
[Seed base 777] Fold 4/5 trained 10 models (total: 40)
[Seed base 777] Fold 5/5 trained 10 models (total: 50)
  [COMPARE] pos_1__neg_0  |  Cos(pos)=0.988323  Cos(neg|abs|)=0.974356
Splitting F:/binary/bin_5\csv\grads_AB__pos_1__neg_0.csv ...
Split done. Wrote 4 files to F:/binary/bin_5\csv\result


Bin 15: samples=32, dim=13690  class_counts={0: 16, 1: 16}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[Seed base 111] Fold 1/5 trained 10 models (total: 10)
[Seed base 111] Fold 2/5 trained 10 models (total: 20)
[Seed base 111] Fold 3/5 trained 10 models (total: 30)
[Seed base 111] Fold 4/5 trained 10 models (total: 40)
[Seed base 111] Fold 5/5 trained 10 models (total: 50)
[Seed base 777] Fold 1/5 trained 10 models (total: 10)
[Seed base 777] Fold 2/5 trained 10 models (total: 20)
[Seed base 777] Fold 3/5 trained 10 models (total: 30)
[Seed base 777] Fold 4/5 trained 10 models (total: 40)
[Seed base 777] Fold 5/5 trained 10 models (total: 50)
  [COMPARE] pos_1__neg_0  |  Cos(pos)=0.961726  Cos(neg|abs|)=0.956743
Splitting F:/binary/bin_15\csv\grads_AB__pos_1__neg_0.csv ...
Split done. Wrote 4 files to F:/binary/bin_15\csv\result

All bins processed. (bin-root outputs; csv keeps only split files)

