Generating deconvoluted spectra from the informative features

In [27]:
# -*- coding: utf-8 -*-
import os
import subprocess
import shutil
import tempfile

def _unique_dst_path(dst_dir, fname):
    """Return a unique path in dst_dir for fname, adding a numeric suffix if needed."""
    base, ext = os.path.splitext(fname)
    candidate = os.path.join(dst_dir, fname)
    i = 1
    while os.path.exists(candidate):
        candidate = os.path.join(dst_dir, f"{base}__{i}{ext}")
        i += 1
    return candidate

def _prefixed_name(src_path, result_root):
    """
    Build a safer filename using the immediate parent folder under result/ as a prefix
    to reduce collisions: e.g., result/sampleA/sampleA_mass.txt -> sampleA__sampleA_mass.txt
    """
    # src_path like .../result/<parent>/<file>
    parent = os.path.basename(os.path.dirname(src_path))
    fname = os.path.basename(src_path)
    return f"{parent}__{fname}" if parent and parent != "result" else fname

def run_unidec_on_folder(folder_path):
    """Run UniDec on every top-level file of ``folder_path`` and keep only the ``*_mass.txt`` outputs.

    Workflow:
      1. For each regular file directly inside ``folder_path`` (sub-folders are
         skipped) run ``<python> -m unidec -f <file> -o <folder_path>/result/<stem>``.
      2. Collect every ``*_mass.txt`` found anywhere under ``result/``.
      3. Copy them to a temporary directory, empty ``result/`` and move the
         staged files back, so ``result/`` ends up flat.

    Safety rules:
      * UniDec is launched with the interpreter running this code
        (``sys.executable``), so the same environment is used as the kernel.
      * A non-zero UniDec exit code is reported instead of being ignored.
      * If any ``*_mass.txt`` cannot be staged, ``result/`` is left untouched
        (nothing is deleted).
      * If any staged file cannot be moved back, the temporary directory is
        kept and its location is printed.

    Parameters
    ----------
    folder_path : str
        Folder whose top-level files are passed to UniDec.

    Returns
    -------
    None
    """
    import sys  # local import: only needed to locate the running interpreter

    # Ensure result root folder exists
    result_root = os.path.join(folder_path, "result")
    os.makedirs(result_root, exist_ok=True)

    interpreter = sys.executable or "python"
    failed_runs = []

    # Loop through files in the folder (top-level only)
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Skip directories (this also skips result/ itself)
        if not os.path.isfile(file_path):
            continue

        # One output sub-folder per input file, named after the file stem
        base_name = os.path.splitext(file_name)[0]
        file_result_folder = os.path.join(result_root, base_name)
        os.makedirs(file_result_folder, exist_ok=True)

        print(f"Processing: {file_name} → {file_result_folder}")
        completed = subprocess.run(
            [interpreter, "-m", "unidec", "-f", file_path, "-o", file_result_folder]
        )
        if completed.returncode != 0:
            failed_runs.append(file_name)
            print(f"⚠️ UniDec exited with code {completed.returncode} for {file_name}")

    print("✅ All files processed. Results saved in:", result_root)
    if failed_runs:
        print(f"⚠️ UniDec failed for {len(failed_runs)} file(s): {failed_runs}")

    # 1) Collect *_mass.txt paths from result_root (including subfolders)
    collected = []
    for root, _, files in os.walk(result_root):
        for f in files:
            if f.endswith("_mass.txt"):
                collected.append(os.path.join(root, f))

    if not collected:
        print("⚠️ No *_mass.txt files found under:", result_root)
        return

    # 2) Copy them to a temp folder FIRST (so deleting result/ content won't break src paths)
    temp_dir = tempfile.mkdtemp(prefix="mass_collect_")
    copied = []
    staging_failed = False
    for src in collected:
        try:
            # Prefix with subfolder name to avoid collisions, then ensure uniqueness
            safe_name = _prefixed_name(src, result_root)
            dst = _unique_dst_path(temp_dir, safe_name)
            print(f"Staging: {src} → {dst}")
            shutil.copy2(src, dst)
            copied.append(dst)
        except Exception as e:
            staging_failed = True
            print(f"⚠️ Skip (copy error): {src} — {e}")

    if staging_failed:
        # Cleaning now would permanently delete the files that were not staged.
        print("⚠️ Staging incomplete; result folder left untouched:", result_root)
        shutil.rmtree(temp_dir, ignore_errors=True)
        return

    # 3) Clean the result_root completely
    for item in os.listdir(result_root):
        item_path = os.path.join(result_root, item)
        try:
            if os.path.isfile(item_path) or os.path.islink(item_path):
                os.remove(item_path)
            else:
                shutil.rmtree(item_path)
        except Exception as e:
            print(f"⚠️ Could not remove {item_path}: {e}")

    # 4) Move staged files back into a clean result_root
    move_failed = False
    for staged in copied:
        try:
            final_dst = _unique_dst_path(result_root, os.path.basename(staged))
            print(f"Finalizing: {staged} → {final_dst}")
            shutil.move(staged, final_dst)
        except Exception as e:
            move_failed = True
            print(f"⚠️ Move error for {staged}: {e}")

    # 5) Remove temp dir, unless it still holds the only copy of some file
    if move_failed:
        print("⚠️ Some staged files could not be moved back; they remain in:", temp_dir)
    else:
        shutil.rmtree(temp_dir, ignore_errors=True)

    print("📂 Clean result folder ready with only *_mass.txt files:", result_root)


# Entry point: process one folder when this cell/script is executed directly.
if __name__ == "__main__":
    # Example usage
    # NOTE(review): machine-specific absolute Windows path; every regular file
    # directly inside this folder is handed to UniDec.
    folder_path = r"F:\binary\final\raw"  # <-- replace with your folder
    run_unidec_on_folder(folder_path)


Processing: frac_pellet_grads_AB__pos_1__neg_0_negabs_runA.csv → F:\binary\final\raw\result\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA
Processing: frac_pellet_grads_AB__pos_1__neg_0_negabs_runB.csv → F:\binary\final\raw\result\frac_pellet_grads_AB__pos_1__neg_0_negabs_runB
Processing: frac_pellet_grads_AB__pos_1__neg_0_pos_runA.csv → F:\binary\final\raw\result\frac_pellet_grads_AB__pos_1__neg_0_pos_runA
Processing: frac_pellet_grads_AB__pos_1__neg_0_pos_runB.csv → F:\binary\final\raw\result\frac_pellet_grads_AB__pos_1__neg_0_pos_runB
Processing: frac_soluble_grads_AB__pos_1__neg_0_negabs_runA.csv → F:\binary\final\raw\result\frac_soluble_grads_AB__pos_1__neg_0_negabs_runA
Processing: frac_soluble_grads_AB__pos_1__neg_0_negabs_runB.csv → F:\binary\final\raw\result\frac_soluble_grads_AB__pos_1__neg_0_negabs_runB
Processing: frac_soluble_grads_AB__pos_1__neg_0_pos_runA.csv → F:\binary\final\raw\result\frac_soluble_grads_AB__pos_1__neg_0_pos_runA
Processing: frac_soluble_grads_AB__pos_

Visualize the deconvolution graphs

In [None]:
# Mirror Plots (Run A vs Run B) — strong-color version with up/downregulated labels

import re, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from IPython.display import display, Image

# ==== CONFIG ====
# NOTE(review): FOLDER is a machine-specific absolute path.
FOLDER   = Path(r"F:/binary/final/decon")  # folder scanned for *.txt inputs — change this to your folder
OUT_DIR  = None                            # None → the run section uses FOLDER / "mirror_plots"; or set to Path(r"F:/binary/mirror_plots")
MASS_MIN, MASS_MAX = 10000, 20000          # inclusive mass range (Da) kept on read and used as x-axis limits
BAR_WIDTH = 5                              # width passed to plt.bar, in x-axis units
# =================

# Global matplotlib defaults applied to every figure created after this point.
plt.rcParams.update({
    "figure.figsize": (10, 5),
    "axes.titlesize": 16,
    "axes.labelsize": 14,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "axes.titleweight": "bold",
    "axes.labelweight": "bold",
    "axes.edgecolor": "black",
    "axes.linewidth": 1.0
})

# File-name pattern for the inputs. Named groups:
#   fraction — alphanumeric token after "frac_"
#   sign     — "negabs" or "pos"
#   run      — "A" or "B"
# Only names ending in "_mass.txt" with the fixed "_grads_AB__pos_1__neg_0_" middle part match.
PATTERN = re.compile(
    r'^frac_(?P<fraction>[A-Za-z0-9]+)_grads_AB__pos_1__neg_0_(?P<sign>negabs|pos)_run(?P<run>[AB])_mass\.txt$'
)

def read_two_col_txt(path: Path) -> pd.DataFrame:
    """Load a headerless, whitespace-separated two-column file as (mass, value).

    Rows with a non-finite mass or value are dropped, then only rows with
    ``MASS_MIN <= mass <= MASS_MAX`` are kept. The result is sorted by mass
    with a fresh 0..n-1 index.
    """
    table = pd.read_csv(path, sep=r"\s+", header=None, names=["mass", "value"], engine="python")
    finite_rows = np.isfinite(table["mass"]) & np.isfinite(table["value"])
    table = table[finite_rows]
    inside_range = (table["mass"] >= MASS_MIN) & (table["mass"] <= MASS_MAX)
    ordered = table[inside_range].sort_values("mass")
    return ordered.reset_index(drop=True)

def discover_pairs(folder: Path):
    """Find complete Run A / Run B file pairs among the ``*.txt`` files of ``folder``.

    File names are parsed with ``PATTERN``; non-matching files are ignored.

    Returns a dict mapping ``(fraction, sign)`` to ``{"A": path, "B": path}``.
    Conditions that lack either run are left out.
    """
    runs_by_condition = {}
    for txt_path in folder.glob("*.txt"):
        parsed = PATTERN.match(txt_path.name)
        if parsed is None:
            continue
        condition = (parsed["fraction"], parsed["sign"])
        runs_by_condition.setdefault(condition, {})[parsed["run"]] = txt_path

    complete_pairs = {}
    for condition, runs in runs_by_condition.items():
        if "A" in runs and "B" in runs:
            complete_pairs[condition] = runs
    return complete_pairs

def align_on_union_mass(a: pd.DataFrame, b: pd.DataFrame, tolerance=None):
    """Put two (mass, value) spectra onto one shared mass axis.

    Behaviour:
      * If either frame is empty, copies of both inputs are returned unchanged.
      * If both frames have the same length and numerically close masses,
        the inputs themselves are returned (no resampling).
      * Otherwise the sorted union of both mass columns is built and each
        frame is mapped onto it with ``pd.merge_asof(direction="nearest")``.

    Parameters
    ----------
    a, b : pd.DataFrame
        Frames with ``mass`` and ``value`` columns.
    tolerance : number, optional
        Maximum mass distance for a nearest match (forwarded to
        ``pd.merge_asof``; it must be compatible with the dtype of the mass
        column). With the default ``None`` every union mass takes the value of
        the nearest point of each run, however far away, so a value can be
        repeated on masses that run never contained. With a tolerance, union
        masses without a close-enough point get the value 0.0.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The two aligned frames.
    """
    if len(a) == 0 or len(b) == 0:
        return a.copy(), b.copy()
    if len(a) == len(b) and np.allclose(a["mass"], b["mass"]):
        return a, b

    # np.unique returns sorted values, so the union is already ordered as merge_asof requires.
    union = pd.DataFrame({"mass": np.unique(np.r_[a["mass"].values, b["mass"].values])})
    a2 = pd.merge_asof(union, a.sort_values("mass"), on="mass",
                       direction="nearest", tolerance=tolerance)
    b2 = pd.merge_asof(union, b.sort_values("mass"), on="mass",
                       direction="nearest", tolerance=tolerance)
    # Only has an effect when a tolerance leaves some union masses unmatched.
    a2["value"] = a2["value"].fillna(0.0)
    b2["value"] = b2["value"].fillna(0.0)
    return a2, b2

def cosine_similarity(a_vals: np.ndarray, b_vals: np.ndarray) -> float:
    """Cosine of the angle between two equally long vectors.

    Returns 0.0 when the product of the two Euclidean norms is zero, e.g.
    when either vector is all zeros or empty.
    """
    norm_product = np.linalg.norm(a_vals) * np.linalg.norm(b_vals)
    if norm_product == 0:
        return 0.0
    return float(np.dot(a_vals, b_vals) / norm_product)

def mirror_plot_bars(a: pd.DataFrame, b: pd.DataFrame, out: Path, title: str, cos_sim: float) -> None:
    """Save a mirrored bar plot of two spectra (Run A upwards, Run B downwards).

    Parameters
    ----------
    a, b : pd.DataFrame
        Spectra with ``mass`` and ``value`` columns for Run A and Run B.
    out : Path
        Image file to write; its parent folder is created if missing.
    title : str
        Plot title; the cosine similarity is appended to it.
    cos_sim : float
        Precomputed cosine similarity, shown in the title with 3 decimals.

    Nothing is drawn or saved when either spectrum is empty; a "[skip]"
    message is printed instead.
    """
    if len(a) == 0 or len(b) == 0:
        print(f"[skip] Empty after filtering {MASS_MIN}-{MASS_MAX} Da → {out.name}")
        return

    # Align both runs so one x array can be used for both bar series.
    A, B = align_on_union_mass(a, b)
    x = A["mass"].values
    yA = A["value"].values
    yB = -B["value"].values  # negated so Run B is drawn below the zero line

    # Strong, saturated colors
    color_A = "#000000"  # black
    color_B = "#CC0000"  # strong red

    plt.figure(facecolor="white")
    plt.bar(x, yA, width=BAR_WIDTH, color=color_A, alpha=1.0, linewidth=0, label="Run A", zorder=3)
    plt.bar(x, yB, width=BAR_WIDTH, color=color_B, alpha=1.0, linewidth=0, label="Run B (mirrored)", zorder=3)
    # Zero line drawn underneath the bars (lower zorder).
    plt.axhline(0, color="black", linewidth=1.4, zorder=1)
    plt.title(f"{title}  |  cos(A,B) = {cos_sim:.3f}")
    plt.xlabel("Mass (Da)")
    plt.ylabel("Gradient / Intensity")
    plt.xlim(MASS_MIN, MASS_MAX)
    plt.legend(frameon=False)
    plt.tight_layout()

    out.parent.mkdir(exist_ok=True, parents=True)
    plt.savefig(out, dpi=220)
    # Close the figure so repeated calls in a loop do not accumulate open figures.
    plt.close()

# ==== RUN ====
# Discover complete A/B pairs, then write one mirror plot per (fraction, sign).
pairs = discover_pairs(FOLDER)
# Fall back to a "mirror_plots" sub-folder of FOLDER when OUT_DIR is not set.
out_dir = OUT_DIR or (FOLDER / "mirror_plots")
out_dir.mkdir(exist_ok=True, parents=True)
print(f"Found {len(pairs)} A/B pairs in {FOLDER}. Saving to {out_dir}\n")

for (frac, sign), runs in sorted(pairs.items()):
    dfA = read_two_col_txt(runs["A"])
    dfB = read_two_col_txt(runs["B"])
    # The aligned frames are used only for the cosine similarity;
    # mirror_plot_bars aligns the unaligned frames again internally.
    A_aligned, B_aligned = align_on_union_mass(dfA, dfB)
    cos = cosine_similarity(A_aligned["value"].to_numpy(), B_aligned["value"].to_numpy())

    # Replace "negabs" → "downregulated", "pos" → "upregulated"
    sign_label = "downregulated" if sign == "negabs" else "upregulated"

    name = f"mirror_{frac}_{sign}_A_vs_B_bars_strong.png"
    title = f"{frac.capitalize()} — {sign_label} (Run A vs Run B)"
    mirror_plot_bars(dfA, dfB, out_dir / name, title, cos)
    # NOTE(review): this line is printed even when mirror_plot_bars skipped an empty spectrum.
    print(f"  ✓ {frac}/{sign_label}: cos(A,B) = {cos:.4f}  →  {name}")

# ==== PREVIEW ====
# Shows every *_strong.png currently in out_dir, which may include files from earlier runs.
print("\nPreview:")
for p in sorted(out_dir.glob("*_strong.png")):
    print(" ", p.name)
    display(Image(filename=str(p)))


Features below 25 kDa

In [11]:
# -*- coding: utf-8 -*-
"""
(… header unchanged …)
"""

from __future__ import annotations
import os
import re
import json
import glob
from dataclasses import dataclass
from pathlib import Path
from bisect import bisect_left

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import find_peaks

# ============================================================
# --------------------  PEAK DETECTION  ----------------------
# ============================================================
@dataclass
class PeakFindingParams:
    """Thresholds read by ``detect_signals`` when picking peaks."""
    # Minimum peak prominence; None lets detect_signals derive a threshold from the data.
    min_prominence: float | None = None
    # Minimum peak height; None lets detect_signals derive a threshold from the data.
    min_height: float | None = None
    # Minimum spacing between peaks, in data points (used as find_peaks' `distance`, floored at 1).
    min_distance_pts: int = 10
    # Moving-average window length; _smooth leaves the data unchanged for values < 3 or even values.
    smooth_window: int = 0
    # Peaks with snr below this are dropped; the filter is applied only when > 0.
    min_snr: float = 0.0

def _mad_sigma(y: np.ndarray) -> float:
    if y.size == 0:
        return 0.0
    med = np.median(y)
    mad = np.median(np.abs(y - med))
    return 1.4826 * mad

def _smooth(y: np.ndarray, window: int) -> np.ndarray:
    if window < 3 or window % 2 == 0:
        return y
    kernel = np.ones(window, dtype=float) / window
    return np.convolve(y, kernel, mode="same")

def _extract_id_list(name: str, key: str) -> list[int] | None:
    m = re.search(rf"(?:^|[_-]){key}((?:[_-]\d+)+)(?=[_-]|$)", name, flags=re.I)
    if not m:
        m1 = re.search(rf"(?:^|[_-]){key}[_-]?(\d+)(?=[_-]|$)", name, flags=re.I)
        if m1:
            return [int(m1.group(1))]
        return None
    parts = re.findall(r"\d+", m.group(1))
    return [int(x) for x in parts] if parts else None

def parse_metadata_from_filename(path: str | Path) -> dict:
    """Parse experiment metadata from a file name (extension ignored).

    Returned keys (each None when not found, except ``source_file``):

    * ``bin`` - number after a ``bin`` token, else a leading number followed
      by ``_``/``-``.
    * ``experiments_ids`` / ``controls_ids`` - comma-joined IDs found after
      the ``pos`` / ``neg`` tokens (see ``_extract_id_list``).
    * ``experiments_n`` / ``experiments`` and ``controls_n`` / ``controls`` -
      how many IDs were found.
    * ``regulation`` - ``"upregulated"`` or ``"downregulated"``, taken from the
      LAST ``negabs``/``posabs``/``neg``/``pos`` token in the name.
    * ``replicate`` - upper-cased letter following ``run``.
    * ``source_file`` - the file name including its extension.
    """
    file_path = Path(path)
    stem = file_path.stem
    info = {
        "bin": None, "experiments": None, "controls": None,
        "experiments_ids": None, "controls_ids": None,
        "experiments_n": None, "controls_n": None,
        "regulation": None, "replicate": None, "source_file": file_path.name,
    }

    # Bin number: explicit "bin" token first, otherwise a leading number.
    bin_match = re.search(r"(?:^|[_-])bin[_-]?(\d+)(?=[_-]|$)", stem, flags=re.I)
    if bin_match is None:
        bin_match = re.match(r"^(\d+)(?=[_-])", stem)
    if bin_match is not None:
        info["bin"] = int(bin_match.group(1))

    # ID lists: "pos" marks experiments, "neg" marks controls.
    for field, token in (("experiments", "pos"), ("controls", "neg")):
        ids = _extract_id_list(stem, token)
        if ids is None:
            continue
        info[f"{field}_ids"] = ",".join(str(x) for x in ids)
        info[f"{field}_n"] = len(ids)
        info[field] = len(ids)

    # Regulation: the last direction token in the name decides.
    direction_tokens = re.findall(
        r"(?:^|[_-])(negabs|posabs|neg|pos)(?=[_-]|$)", stem, flags=re.I
    )
    if direction_tokens:
        regulation_by_token = {"negabs": "downregulated", "neg": "downregulated",
                               "posabs": "upregulated", "pos": "upregulated"}
        info["regulation"] = regulation_by_token.get(direction_tokens[-1].lower())

    run_match = re.search(r"(?:^|[_-])run([A-Za-z])(?=[_-]|$)", stem)
    if run_match is not None:
        info["replicate"] = run_match.group(1).upper()
    return info

def load_space_separated(path: str | Path) -> pd.DataFrame:
    """Load a deconvoluted spectrum as a DataFrame with ``mass`` and ``intensity`` columns.

    The file is first read as headerless, whitespace-separated text in which
    ``#`` starts a comment. Only the first two columns are used; without
    ``usecols`` pandas would turn the leading column(s) of a wider file into
    the index and shift mass/intensity by one column.

    If that read fails (for example because the file has fewer than two
    whitespace-separated columns), the file is re-read with pandas' default
    comma separator and its first two columns are used.

    Raises
    ------
    ValueError
        If the fallback read also yields fewer than two columns.
    """
    path = Path(path)
    try:
        df = pd.read_csv(path, sep=r"\s+", engine="python", header=None,
                         names=["mass", "intensity"], usecols=[0, 1], comment="#")
    except Exception:
        df = pd.read_csv(path, header=None)
        if df.shape[1] < 2:
            raise ValueError("Deconvoluted file must have at least two columns: mass intensity")
        df = df.iloc[:, :2]
        df.columns = ["mass", "intensity"]
    return df

def detect_signals(df: pd.DataFrame, params: PeakFindingParams = PeakFindingParams()) -> pd.DataFrame:
    """Detect peaks in a (mass, intensity) spectrum with ``scipy.signal.find_peaks``.

    Parameters
    ----------
    df : pd.DataFrame
        Spectrum with ``mass`` and ``intensity`` columns. If those names are
        missing but there are at least two columns, the first two are taken
        as mass and intensity.
    params : PeakFindingParams
        Detection thresholds. ``min_prominence`` / ``min_height`` set to None
        are replaced by data-driven defaults (``max(6*sigma, 0.001*ymax)`` and
        ``max(4*sigma, 0.0005*ymax)``, sigma being the scaled MAD of the
        processed intensities). An explicit value, including 0.0, is used
        as given.

    Returns
    -------
    pd.DataFrame
        One row per peak with ``mass``, ``intensity`` (taken from the
        unsmoothed data), ``prominence``, ``left_base_idx``,
        ``right_base_idx`` and ``snr``, sorted by descending intensity.

    Raises
    ------
    ValueError
        If ``df`` has fewer than two columns and lacks the expected names.
    """
    if not {"mass", "intensity"}.issubset(df.columns):
        if df.shape[1] >= 2:
            df = df.copy()
            df.columns = ["mass", "intensity"] + [f"col{i}" for i in range(2, df.shape[1])]
        else:
            raise ValueError("Input DataFrame must have columns ['mass','intensity'].")
    df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=["mass", "intensity"])
    df = df.sort_values("mass").reset_index(drop=True)
    x = df["mass"].to_numpy(float)
    y = df["intensity"].to_numpy(float)
    y_proc = _smooth(y, params.smooth_window)
    sigma = _mad_sigma(y_proc)
    ymax = float(np.max(y_proc)) if y_proc.size else 0.0

    # Use `is None` rather than truthiness so an explicit 0.0 threshold is honoured.
    if params.min_prominence is None:
        min_prom = max(6.0 * sigma, 0.001 * ymax)
    else:
        min_prom = params.min_prominence
    if params.min_height is None:
        min_h = max(4.0 * sigma, 0.0005 * ymax)
    else:
        min_h = params.min_height

    peaks, props = find_peaks(y_proc, prominence=min_prom, height=min_h,
                              distance=max(1, int(params.min_distance_pts)))
    out = pd.DataFrame({
        "mass": x[peaks],
        "intensity": y[peaks],
        "prominence": props.get("prominences", np.full(peaks.shape, np.nan)),
        "left_base_idx": props.get("left_bases", np.full(peaks.shape, -1)),
        "right_base_idx": props.get("right_bases", np.full(peaks.shape, -1)),
    })
    # Noise estimate for SNR: MAD sigma, else standard deviation, else 1.0 to avoid dividing by zero.
    snr_den = sigma if sigma > 0 else (np.std(y_proc) if y_proc.size else 1.0)
    snr_den = snr_den if snr_den > 0 else 1.0
    out["snr"] = out["intensity"] / snr_den
    if params.min_snr > 0:
        out = out[out["snr"] >= params.min_snr].reset_index(drop=True)
    return out.sort_values("intensity", ascending=False).reset_index(drop=True)

def plot_spectrum_with_peaks(df: pd.DataFrame, peaks_df: pd.DataFrame,
                             out_png: str | Path | None = None,
                             title: str = "Detected Neutral-Mass Signals") -> None:
    """Plot a spectrum as a line and mark detected peaks with scatter points.

    Parameters
    ----------
    df : pd.DataFrame
        Spectrum with ``mass`` and ``intensity`` columns (drawn as a line).
    peaks_df : pd.DataFrame
        Detected peaks with ``mass`` and ``intensity`` columns; skipped when
        None or empty.
    out_png : str | Path | None
        Where to save the figure. With a falsy value nothing is saved.
    title : str
        Figure title.

    The figure is always closed at the end, so it is not shown inline.
    """
    plt.figure(figsize=(10, 4))
    plt.plot(df["mass"].to_numpy(), df["intensity"].to_numpy(), linewidth=1)
    if peaks_df is not None and not peaks_df.empty:
        plt.scatter(peaks_df["mass"].to_numpy(),
                    peaks_df["intensity"].to_numpy(), s=18)
    plt.xlabel("Neutral mass (Da)")
    plt.ylabel("Intensity (arb.)")
    plt.title(title)
    plt.tight_layout()
    if out_png:
        plt.savefig(out_png, dpi=150)
    plt.close()

# ============================================================
# -----------------  CHARGE-SERIES MATCHING  -----------------
# ============================================================
# Mass added per charge when converting a neutral mass to m/z in _generate_charge_series.
PROTON_MASS = 1.007276466812  # Da
# Inclusive range of charge states generated for every neutral mass.
Z_MIN, Z_MAX = 5, 50
# Matching tolerance: _ppm_window uses the LARGER of the ppm-based width and ABS_DA_TOL.
PPM_TOL = 1000.0
ABS_DA_TOL = 1.0
# A neutral mass is accepted only if at least this many charge states find a raw peak.
MIN_MATCHED_CHARGE_STATES = 4

def _read_raw_ms1(path: str) -> pd.DataFrame:
    try:
        df = pd.read_csv(path)
    except Exception:
        df = pd.read_csv(path, header=None)
    if df.shape[1] == 2:
        df.columns = ["mz", "intensity"]
    else:
        cols_lower = [str(c).lower() for c in df.columns]
        mz_candidates = [i for i, c in enumerate(cols_lower)
                         if ("mz" in c) or ("m/z" in c) or ("mass/charge" in c) or (c.strip() == "m z")]
        int_candidates = [i for i, c in enumerate(cols_lower)
                          if ("int" in c) or ("abund" in c) or ("height" in c) or ("signal" in c)]
        if not mz_candidates: mz_candidates = [0]
        if not int_candidates: int_candidates = [1 if df.shape[1] > 1 else 0]
        df = df.iloc[:, [mz_candidates[0], int_candidates[0]]].copy()
        df.columns = ["mz", "intensity"]
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    df = df[df["intensity"] > 0].copy()
    df["mz"] = pd.to_numeric(df["mz"], errors="coerce")
    df["intensity"] = pd.to_numeric(df["intensity"], errors="coerce")
    df = df.dropna().sort_values("mz").reset_index(drop=True)
    return df

def _ppm_window(target_mz: float, ppm: float, abs_da: float) -> tuple[float, float]:
    da = target_mz * ppm * 1e-6
    tol = max(da, abs_da)
    return target_mz - tol, target_mz + tol

def _match_targets(sorted_mz: np.ndarray, targets: np.ndarray,
                   ppm: float, abs_da: float, available_mask: np.ndarray) -> dict[int, int | None]:
    """Match each target m/z to the closest still-available raw peak near it.

    For every target, the insertion position in ``sorted_mz`` is located and
    only the peaks at offsets 0, -1, +1, -2, +2, -3, +3 from it are examined.
    A peak qualifies when ``available_mask`` is true for it and its m/z lies
    inside the window given by ``_ppm_window``. Among qualifying peaks the
    one with the smallest absolute m/z difference wins; on a tie the offset
    examined first is kept.

    Returns a dict mapping the target's position in ``targets`` to the index
    of the matched peak in ``sorted_mz``, or to None when nothing qualified.
    ``available_mask`` is not modified.
    """
    probe_offsets = (0, -1, 1, -2, 2, -3, 3)
    n_peaks = len(sorted_mz)
    matches: dict[int, int | None] = {}
    for target_pos, target in enumerate(targets):
        lower, upper = _ppm_window(target, ppm, abs_da)
        insert_at = bisect_left(sorted_mz, target)
        winner = None
        winner_gap = float("inf")
        for offset in probe_offsets:
            cand = insert_at + offset
            if cand < 0 or cand >= n_peaks:
                continue
            if not available_mask[cand]:
                continue
            cand_mz = sorted_mz[cand]
            if not (lower <= cand_mz <= upper):
                continue
            gap = abs(cand_mz - target)
            if gap < winner_gap:
                winner, winner_gap = cand, gap
        matches[target_pos] = winner
    return matches

def _generate_charge_series(neutral_mass: float, z_min: int, z_max: int) -> pd.DataFrame:
    """Tabulate the expected m/z of ``neutral_mass`` for charges ``z_min..z_max`` (inclusive).

    Each m/z is ``(neutral_mass + z * PROTON_MASS) / z``. The returned frame
    has the columns ``z`` and ``target_mz``.
    """
    charges = np.arange(z_min, z_max + 1, dtype=int)
    expected_mz = (neutral_mass + charges * PROTON_MASS) / charges
    table = {"z": charges, "target_mz": expected_mz}
    return pd.DataFrame(table)

def assign_ms1_peaks(raw_df: pd.DataFrame, deconv_peaks_df: pd.DataFrame,
                     meta: dict | None = None) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Assign raw MS1 peaks to the charge series of detected neutral masses.

    Neutral masses are processed in the row order of ``deconv_peaks_df``.
    For each mass the expected m/z of charges ``Z_MIN..Z_MAX`` is matched
    against the raw peaks that are still unassigned. The mass is accepted
    when at least ``MIN_MATCHED_CHARGE_STATES`` charge states find a peak;
    its matched peaks are then marked as assigned and are no longer
    available to later masses.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Raw peaks with ``mz`` and ``intensity`` columns.
    deconv_peaks_df : pd.DataFrame
        Detected neutral masses with ``mass`` and ``intensity`` columns and,
        optionally, ``snr``.
    meta : dict | None
        Optional file metadata copied into every summary row.

    Returns
    -------
    (assigned_raw, assignments_summary)
        ``assigned_raw`` is ``raw_df`` sorted by m/z with ``assigned_mass``,
        ``assigned_charge`` and ``is_assigned`` added (plus ``bin`` when
        ``meta`` has that key). ``assignments_summary`` has one row per
        accepted mass, sorted by descending ``deconv_intensity``. When no
        mass is accepted it is an empty frame that still carries the summary
        columns, instead of failing on the sort.
    """
    raw_df = raw_df.sort_values("mz").reset_index(drop=True)
    mz_arr = raw_df["mz"].to_numpy()
    inten_arr = raw_df["intensity"].to_numpy()
    available = np.ones(len(raw_df), dtype=bool)
    # Loop-invariant: total raw intensity, used for the captured-fraction column.
    total_intensity = float(np.sum(inten_arr))

    assigned_mass = np.full(len(raw_df), np.nan)
    assigned_z    = np.full(len(raw_df), np.nan)

    summary_columns = [
        "neutral_mass", "deconv_intensity", "snr", "n_matches",
        "matched_z_list", "matched_mz_list", "ppm_tol", "abs_da_tol",
        "fraction_total_intensity_captured",
    ]
    if meta:
        summary_columns += ["bin", "experiments_ids", "controls_ids",
                            "regulation", "replicate", "source_file"]

    summary_rows = []

    for r in deconv_peaks_df.itertuples(index=False):
        mass = float(r.mass)
        mass_intensity = float(r.intensity)
        mass_snr = float(getattr(r, "snr", np.nan))

        series = _generate_charge_series(mass, Z_MIN, Z_MAX)
        charges = series["z"].to_numpy()
        targets = series["target_mz"].to_numpy()
        matches = _match_targets(mz_arr, targets, PPM_TOL, ABS_DA_TOL, available_mask=available)

        matched_indices = []
        matched_z_list  = []
        matched_mz_list = []

        for ti, k in matches.items():
            if k is not None:
                matched_indices.append(k)
                matched_z_list.append(int(charges[ti]))
                matched_mz_list.append(mz_arr[k])

        # Too few charge states: reject this mass and leave its peaks available.
        if len(matched_indices) < MIN_MATCHED_CHARGE_STATES:
            continue

        for idx, z_val in zip(matched_indices, matched_z_list):
            if available[idx]:
                available[idx] = False
                assigned_mass[idx] = mass
                assigned_z[idx] = z_val

        frac_intensity_removed = (
            float(np.sum(inten_arr[matched_indices])) / total_intensity
            if total_intensity > 0 else 0.0
        )

        row = {
            "neutral_mass": mass,
            "deconv_intensity": mass_intensity,
            "snr": mass_snr,
            "n_matches": len(matched_indices),
            "matched_z_list": json.dumps(matched_z_list),
            "matched_mz_list": json.dumps([round(float(x), 1) for x in matched_mz_list]),
            "ppm_tol": PPM_TOL,
            "abs_da_tol": ABS_DA_TOL,
            "fraction_total_intensity_captured": frac_intensity_removed
        }
        if meta:
            row.update({
                "bin": meta.get("bin"),
                "experiments_ids": meta.get("experiments_ids"),
                "controls_ids": meta.get("controls_ids"),
                "regulation": meta.get("regulation"),
                "replicate": meta.get("replicate"),
                "source_file": meta.get("source_file"),
            })
        summary_rows.append(row)

    assigned_raw = raw_df.copy()
    assigned_raw["assigned_mass"] = assigned_mass
    assigned_raw["assigned_charge"] = assigned_z
    assigned_raw["is_assigned"] = ~np.isnan(assigned_mass)

    if summary_rows:
        assignments_summary = pd.DataFrame(summary_rows).sort_values(
            "deconv_intensity", ascending=False).reset_index(drop=True)
    else:
        # A frame built from an empty list has no columns, so sorting by
        # "deconv_intensity" would raise KeyError; return an empty frame with headers.
        assignments_summary = pd.DataFrame(columns=summary_columns)

    if meta and "bin" in meta:
        assigned_raw["bin"] = meta["bin"]

    return assigned_raw, assignments_summary

# ============================================================
# ----------------------  PLOTTING  --------------------------
# ============================================================
def plot_neutral_mass_spectrum(deconv_peaks_df: pd.DataFrame, out_dir: str,
                               filename: str = "neutral_mass_spectrum.png"):
    """Save a stick plot (vertical lines) of the detected neutral masses.

    Parameters
    ----------
    deconv_peaks_df : pd.DataFrame
        Detected peaks with ``mass`` and ``intensity`` columns.
    out_dir : str
        Existing folder to write into (it is not created here).
    filename : str
        Name of the PNG file inside ``out_dir``.

    Prints a message and returns without plotting when the frame is empty.
    """
    if deconv_peaks_df.empty:
        print("No neutral masses to plot."); return
    masses = deconv_peaks_df["mass"].to_numpy()
    intens = deconv_peaks_df["intensity"].to_numpy()
    plt.figure(figsize=(9, 4.5))
    # One vertical line from 0 up to the intensity of each detected mass.
    plt.vlines(masses, 0, intens, linewidth=1)
    plt.xlabel("Neutral mass (Da)"); plt.ylabel("Intensity (arb.)")
    plt.title("Neutral Mass Spectrum (detected peaks)")
    plt.tight_layout(); out_path = os.path.join(out_dir, filename)
    plt.savefig(out_path, dpi=200); plt.close()
    print(f"Saved plot: {out_path}")

def plot_mirror_assigned_vs_total(assigned_raw: pd.DataFrame, out_dir: str,
                                  filename: str = "mirror_assigned_vs_total.png"):
    """Save a mirror plot: all raw peaks upwards, assigned peaks downwards.

    Parameters
    ----------
    assigned_raw : pd.DataFrame
        Raw peaks with ``mz``, ``intensity`` and boolean ``is_assigned`` columns.
    out_dir : str
        Existing folder to write into (it is not created here).
    filename : str
        Name of the PNG file inside ``out_dir``.

    Prints a message and returns without plotting when the frame is empty.
    """
    if assigned_raw.empty:
        print("No assigned/raw data to plot."); return
    mz = assigned_raw["mz"].to_numpy()
    total_int = assigned_raw["intensity"].to_numpy()
    assigned_mask = assigned_raw["is_assigned"].to_numpy(dtype=bool)
    # Intensity where assigned, 0 elsewhere.
    assigned_int = np.where(assigned_mask, total_int, 0.0)
    plt.figure(figsize=(10, 5.2))
    plt.vlines(mz, 0, total_int, linewidth=0.6)
    # Assigned peaks are drawn with negated intensity, i.e. below the axis.
    plt.vlines(mz[assigned_mask], 0, -assigned_int[assigned_mask], linewidth=0.8)
    ymax = total_int.max() if len(total_int) else 1.0
    # With nothing assigned, reserve 10% of the top range below the axis.
    ymin = -assigned_int.max() if assigned_int.any() else -0.1 * ymax
    plt.ylim(ymin * 1.05, ymax * 1.05)
    plt.xlabel("m/z"); plt.ylabel("Intensity (arb.)")
    plt.title("Mirror Plot: Total (top) vs Assigned (bottom)")
    plt.tight_layout(); out_path = os.path.join(out_dir, filename)
    plt.savefig(out_path, dpi=200); plt.close()
    print(f"Saved plot: {out_path}")

def plot_mirror_unassigned_vs_total(assigned_raw: pd.DataFrame, out_dir: str,
                                    filename: str = "mirror_unassigned_vs_total.png"):
    """Save a mirror plot: all raw peaks upwards, NON-assigned peaks downwards.

    Parameters
    ----------
    assigned_raw : pd.DataFrame
        Raw peaks with ``mz``, ``intensity`` and boolean ``is_assigned`` columns.
    out_dir : str
        Existing folder to write into (it is not created here).
    filename : str
        Name of the PNG file inside ``out_dir``.

    Prints a message and returns without plotting when the frame is empty.
    """
    if assigned_raw.empty:
        print("No assigned/raw data to plot."); return
    mz = assigned_raw["mz"].to_numpy()
    total_int = assigned_raw["intensity"].to_numpy()
    # Complement of the assignment flag selects the peaks no mass claimed.
    unassigned_mask = ~assigned_raw["is_assigned"].to_numpy(dtype=bool)
    unassigned_int = np.where(unassigned_mask, total_int, 0.0)
    plt.figure(figsize=(10, 5.2))
    plt.vlines(mz, 0, total_int, linewidth=0.6)
    # Non-assigned peaks are drawn with negated intensity, i.e. below the axis.
    plt.vlines(mz[unassigned_mask], 0, -unassigned_int[unassigned_mask], linewidth=0.8)
    ymax = total_int.max() if len(total_int) else 1.0
    # With everything assigned, reserve 10% of the top range below the axis.
    ymin = -unassigned_int.max() if unassigned_int.any() else -0.1 * ymax
    plt.ylim(ymin * 1.05, ymax * 1.05)
    plt.xlabel("m/z"); plt.ylabel("Intensity (arb.)")
    plt.title("Mirror Plot: Total (top) vs Non-assigned (bottom)")
    plt.tight_layout(); out_path = os.path.join(out_dir, filename)
    plt.savefig(out_path, dpi=200); plt.close()
    print(f"Saved plot: {out_path}")

def plot_mirror_assigned_by_protein_vs_total(assigned_raw: pd.DataFrame, out_dir: str,
    filename: str = "mirror_assigned_by_protein_vs_total.png", max_legend_items: int = 20):
    """Save a mirror plot with assigned peaks coloured per neutral mass.

    All raw peaks are drawn upwards; assigned peaks are drawn downwards, one
    colour per distinct ``assigned_mass`` (colours repeat once the colour
    cycle is exhausted).

    Parameters
    ----------
    assigned_raw : pd.DataFrame
        Raw peaks with ``mz``, ``intensity``, ``is_assigned`` and
        ``assigned_mass`` columns.
    out_dir : str
        Existing folder to write into (it is not created here).
    filename : str
        Name of the PNG file inside ``out_dir``.
    max_legend_items : int
        At most this many masses get a legend entry; the masses with the most
        assigned peaks come first.

    Prints a message and returns without plotting when the frame is empty.
    When nothing is assigned, only the upward total spectrum is saved.
    """
    if assigned_raw.empty:
        print("No assigned/raw data to plot."); return
    mz = assigned_raw["mz"].to_numpy()
    total_int = assigned_raw["intensity"].to_numpy()
    plt.figure(figsize=(11, 5.6))
    plt.vlines(mz, 0, total_int, linewidth=0.5)
    df_assigned_only = assigned_raw[assigned_raw["is_assigned"]].copy()
    if df_assigned_only.empty:
        # Nothing assigned: finish and save the plot with the top half only.
        plt.xlabel("m/z"); plt.ylabel("Intensity (arb.)")
        plt.title("Mirror Plot: Total (top) vs Assigned by Protein (bottom)")
        plt.tight_layout(); out_path = os.path.join(out_dir, filename)
        plt.savefig(out_path, dpi=200); plt.close(); print(f"Saved plot: {out_path}"); return
    # Number of assigned raw peaks per neutral mass, most peaks first.
    counts = (df_assigned_only.groupby("assigned_mass", dropna=True)["is_assigned"]
              .count().sort_values(ascending=False))
    proteins_in_order = counts.index.tolist()
    color_cycle = plt.rcParams['axes.prop_cycle'].by_key().get(
        'color', ['C0','C1','C2','C3','C4','C5','C6','C7','C8','C9'])
    legend_entries = 0
    for i, mass in enumerate(proteins_in_order):
        mask = (assigned_raw["assigned_mass"] == mass)
        mz_i = assigned_raw.loc[mask, "mz"].to_numpy()
        inten_i = assigned_raw.loc[mask, "intensity"].to_numpy()
        # label=None keeps a series out of the legend once the cap is reached.
        label = None
        if legend_entries < max_legend_items:
            label = f"{mass/1000:.2f} kDa (n={len(mz_i)})"; legend_entries += 1
        plt.vlines(mz_i, 0, -inten_i, linewidth=0.8,
                   color=color_cycle[i % len(color_cycle)], label=label)
    ymax = total_int.max() if len(total_int) else 1.0
    ymin = -df_assigned_only["intensity"].max() if len(df_assigned_only) else -0.1 * ymax
    plt.ylim(ymin * 1.05, ymax * 1.05)
    if legend_entries:
        plt.legend(title="Assigned proteins", loc="upper right", fontsize=8, ncol=1)
    plt.xlabel("m/z"); plt.ylabel("Intensity (arb.)")
    plt.title("Mirror Plot: Total (top) vs Assigned by Protein (bottom)")
    plt.tight_layout(); out_path = os.path.join(out_dir, filename)
    plt.savefig(out_path, dpi=200); plt.close(); print(f"Saved plot: {out_path}")

# ============================================================
# ----------------------  BATCH MAIN  ------------------------
# ============================================================

# ---------------------- User-configurable ----------------------
# NOTE(review): machine-specific absolute Windows paths — adjust before running elsewhere.
RAW_DIR    = r"F:\binary\final\raw"        # folder searched for raw MS1 CSV files
DECONV_DIR = r"F:\binary\final\decon"      # folder searched for deconvoluted *_mass.txt files
OUT_DIR    = r"F:\binary\final\firstpass"  # one sub-folder per deconvoluted file is written here

# Glob patterns applied inside RAW_DIR and DECONV_DIR respectively.
RAW_GLOB    = "*.csv"
DECONV_GLOB = "*_mass.txt"

# Peak-detection settings used for every deconvoluted spectrum in the batch.
DECONV_DETECT_PARAMS = PeakFindingParams(
    min_distance_pts=20,
    min_snr=10,
    smooth_window=0,
)

# >>> Set max neutral mass here (kDa). Use None to disable the filter.
MAX_PROTEIN_MASS_KDA: float | None = 25.0  # points above this mass are dropped before peak detection; None = no limit

def _base_key_from_deconv(path: Path) -> str:
    stem = path.stem
    return stem[:-5] if stem.endswith("_mass") else stem

def _base_key_from_raw(path: Path) -> str:
    return path.stem

def _ensure_dir(p: str | Path) -> None:
    Path(p).mkdir(parents=True, exist_ok=True)

def process_one_pair(deconv_path: Path, raw_path: Path | None) -> None:
    """Process one deconvoluted spectrum and, when available, its raw MS1 CSV.

    Steps:
      1. Load the deconvoluted file and parse metadata from its name.
      2. Drop points above ``MAX_PROTEIN_MASS_KDA`` (when set), then detect peaks.
      3. Save the detected peaks (CSV) and a detection plot (PNG).
      4. If a raw file exists, assign raw peaks to charge series, save the
         assignment tables and the four summary plots.

    All outputs go to ``OUT_DIR/<base>/`` where ``<base>`` is the
    deconvoluted file's stem without the ``_mass`` suffix.

    Parameters
    ----------
    deconv_path : Path
        Deconvoluted ``*_mass.txt`` file.
    raw_path : Path | None
        Matching raw CSV; None or a missing file skips the assignment stage.
    """
    base = _base_key_from_deconv(deconv_path)
    out_root = Path(OUT_DIR) / base
    _ensure_dir(out_root)

    meta = parse_metadata_from_filename(deconv_path)
    deconv_raw = load_space_separated(deconv_path)

    # --- apply max mass in kDa (converted to Da) BEFORE detection ---
    if MAX_PROTEIN_MASS_KDA is not None and np.isfinite(MAX_PROTEIN_MASS_KDA):
        cutoff_da = float(MAX_PROTEIN_MASS_KDA) * 1000.0
        before = len(deconv_raw)
        deconv_raw = deconv_raw[deconv_raw["mass"] <= cutoff_da].reset_index(drop=True)
        after = len(deconv_raw)
        print(f"[{base}] Applied max mass filter ≤ {MAX_PROTEIN_MASS_KDA:.3f} kDa ({cutoff_da:.1f} Da): kept {after}/{before} points.")
    else:
        # cutoff_da doubles as the "was a filter applied?" flag further down.
        cutoff_da = None

    # detect peaks
    deconv_peaks = detect_signals(deconv_raw, params=DECONV_DETECT_PARAMS)

    # attach metadata + filter provenance
    add = {}
    if cutoff_da is not None:
        add = {"max_mass_filter_kDa": float(MAX_PROTEIN_MASS_KDA),
               "max_mass_filter_Da": float(cutoff_da)}
    # Each metadata value is broadcast to every detected-peak row.
    deconv_peaks = deconv_peaks.assign(
        bin=meta.get("bin"),
        experiments_ids=meta.get("experiments_ids"),
        controls_ids=meta.get("controls_ids"),
        regulation=meta.get("regulation"),
        replicate=meta.get("replicate"),
        source_file=meta.get("source_file"),
        **add
    )

    # save + plot detections
    out_detect_csv = out_root / f"{base}_detected_signals.csv"
    out_detect_png = out_root / f"{base}_detected_signals.png"
    deconv_peaks.to_csv(out_detect_csv, index=False)
    plot_spectrum_with_peaks(deconv_raw, deconv_peaks, out_png=str(out_detect_png))
    print(f"[{base}] Neutral-mass detection: {len(deconv_peaks)} peaks → {out_detect_csv}")
    print(f"[{base}] Detection plot saved → {out_detect_png}")
    print(f"[{base}] Parsed filename metadata: {meta}")

    # assignment (if raw exists)
    if raw_path is None or not raw_path.exists():
        print(f"[{base}] ⚠ No matching RAW CSV found. Skipping assignment.")
        return

    raw_df = _read_raw_ms1(raw_path)
    assigned_raw, summary = assign_ms1_peaks(raw_df, deconv_peaks, meta=meta)
    # Record the applied mass filter in the summary (skipped when there are no rows).
    if cutoff_da is not None and not summary.empty:
        summary["max_mass_filter_kDa"] = float(MAX_PROTEIN_MASS_KDA)
        summary["max_mass_filter_Da"]  = float(cutoff_da)

    out_assigned = out_root / f"{base}_assigned_ms1_with_peaks.csv"
    out_summary  = out_root / f"{base}_assignments_summary.csv"
    assigned_raw.to_csv(out_assigned, index=False)
    summary.to_csv(out_summary, index=False)

    # plots
    plot_neutral_mass_spectrum(deconv_peaks, str(out_root), filename=f"{base}_neutral_mass_spectrum.png")
    plot_mirror_assigned_vs_total(assigned_raw, str(out_root), filename=f"{base}_mirror_assigned_vs_total.png")
    plot_mirror_unassigned_vs_total(assigned_raw, str(out_root), filename=f"{base}_mirror_unassigned_vs_total.png")
    plot_mirror_assigned_by_protein_vs_total(
        assigned_raw, str(out_root), filename=f"{base}_mirror_assigned_by_protein_vs_total.png", max_legend_items=20
    )

    # Console recap of the counts and output locations for this file.
    print(f"\n=== [{base}] Summary ===")
    print(f"Raw MS1 peaks (rows): {len(raw_df):,}")
    print(f"Detected neutral-mass peaks: {len(deconv_peaks):,}")
    print(f"Assigned raw peaks: {int(assigned_raw['is_assigned'].sum()):,}")
    print(f"Non-assigned raw peaks: {int((~assigned_raw['is_assigned']).sum()):,}")
    if cutoff_da is not None:
        print(f"Max mass filter: ≤ {MAX_PROTEIN_MASS_KDA:.3f} kDa ({cutoff_da:.1f} Da)")
    print(f"Saved: {out_assigned}")
    print(f"Saved: {out_summary}")
    print("------------------------------------------------------------")

def main():
    """Pair every deconvoluted file with its raw MS1 CSV and process each pair.

    Raw and deconvoluted files are discovered with the module-level glob
    settings and paired through their base keys. A deconvoluted file without
    a matching raw CSV is still passed on (with ``raw_path=None``).

    One failing pair does not stop the batch: the error is printed, the base
    key is remembered, and the closing line reports how many pairs failed
    instead of unconditionally announcing success.
    """
    _ensure_dir(OUT_DIR)

    # Sorted so that, when two raw files share a base key, the file that ends
    # up in the index is deterministic rather than depending on glob order.
    raw_files = sorted(Path(p) for p in glob.glob(str(Path(RAW_DIR) / RAW_GLOB)))
    raw_index = {}
    for raw_file in raw_files:
        key = _base_key_from_raw(raw_file)
        if key in raw_index:
            print(f"⚠ Duplicate raw base key '{key}': {raw_index[key]} replaced by {raw_file}")
        raw_index[key] = raw_file

    deconv_files = [Path(p) for p in glob.glob(str(Path(DECONV_DIR) / DECONV_GLOB))]
    if not deconv_files:
        print(f"⚠ No deconvoluted files found in: {DECONV_DIR} (pattern: {DECONV_GLOB})")
        return

    print(f"Found {len(deconv_files)} decon file(s) in {DECONV_DIR}")
    print(f"Found {len(raw_files)} raw CSV file(s) in {RAW_DIR}")
    if MAX_PROTEIN_MASS_KDA is not None:
        print(f"Applying maximum protein mass filter: ≤ {MAX_PROTEIN_MASS_KDA:.3f} kDa")

    failed = []
    for deconv_path in sorted(deconv_files):
        base = _base_key_from_deconv(deconv_path)
        raw_path = raw_index.get(base, None)
        try:
            process_one_pair(deconv_path, raw_path)
        except Exception as e:
            # Best-effort batch: keep going, but remember the failure so the
            # final status line cannot claim a clean run.
            failed.append(str(base))
            print(f"[{base}] ❌ Error: {type(e).__name__}: {e}")

    if failed:
        print(f"⚠ Batch processing finished with {len(failed)} error(s): {', '.join(failed)}")
    else:
        print("✅ Batch processing complete.")

# Entry point: start the batch when this code runs as the top-level module.
if __name__ == "__main__":
    main()


Found 8 decon file(s) in F:\binary\final\decon
Found 8 raw CSV file(s) in F:\binary\final\raw
Applying maximum protein mass filter: ≤ 25.000 kDa
[frac_pellet_grads_AB__pos_1__neg_0_negabs_runA] Applied max mass filter ≤ 25.000 kDa (25000.0 Da): kept 2005/7371 points.
[frac_pellet_grads_AB__pos_1__neg_0_negabs_runA] Neutral-mass detection: 18 peaks → F:\binary\final\firstpass\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA_detected_signals.csv
[frac_pellet_grads_AB__pos_1__neg_0_negabs_runA] Detection plot saved → F:\binary\final\firstpass\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA_detected_signals.png
[frac_pellet_grads_AB__pos_1__neg_0_negabs_runA] Parsed filename metadata: {'bin': None, 'experiments': 1, 'controls': 1, 'experiments_ids': '1', 'controls_ids': '0', 'experiments_n': 1, 'controls_n': 1, 'regulation': 'downregulated', 'replicate': 'A', 'source_file': 'frac_pellet_grads_AB__pos

Combining the reports

In [24]:
# -*- coding: utf-8 -*-
"""
Collect all *_assignments_summary.csv files from subfolders into one folder,
then concatenate them into a single report.csv.

Steps:
1. Search recursively for *_assignments_summary.csv in BATCH_OUT_DIR.
2. Copy all to SUMMARY_OUT (renaming duplicates).
3. Concatenate all collected CSVs → report.csv.
4. Add 'fractions' column, decided once per file (case-insensitive) from the
   file name and, failing that, from the 'source_file' column:
      - "Pellet"  if '_pellet_' is found
      - "Soluble" if '_soluble_' is found
      - "Unknown" otherwise
"""

import os
import glob
import shutil
from pathlib import Path
import pandas as pd

# ---------------------- USER SETTINGS ----------------------
# Root folder that is searched recursively for per-sample summary files.
BATCH_OUT_DIR = r"F:\binary\final\firstpass"     # where all subfolders were created
# Folder that receives the copied summaries; main() also passes it to concat_csvs().
SUMMARY_OUT   = r"F:\binary\final\report"        # folder to collect summaries
PATTERN       = "*_assignments_summary.csv"      # filename pattern (glob) of the files to collect
# Combined report; note that it is written inside SUMMARY_OUT itself.
OUTPUT_FILE   = Path(SUMMARY_OUT) / "report.csv"
# ------------------------------------------------------------

def collect_summary_files():
    """Copy every file matching PATTERN under BATCH_OUT_DIR into SUMMARY_OUT.

    Returns:
        list[Path]: destination paths of the files copied in this call
        (empty when nothing matched).

    Naming rules:
      * A file keeps its own name unless another source copied *in this call*
        already claimed that name; only then is it prefixed with its parent
        folder name (plus a counter if that is taken as well).
      * A file left over from an earlier run is overwritten, not treated as a
        clash. Otherwise every re-run would add a parent-prefixed duplicate of
        each summary, and those duplicates would be concatenated into the
        report as extra rows.
      * Files that already live in SUMMARY_OUT are never used as sources, so
        pointing SUMMARY_OUT inside BATCH_OUT_DIR cannot copy a file onto itself.
    """
    os.makedirs(SUMMARY_OUT, exist_ok=True)
    out_dir = Path(SUMMARY_OUT)
    out_dir_resolved = out_dir.resolve()

    # Sorted so the choice of which source keeps the unprefixed name is stable.
    matches = sorted(glob.glob(str(Path(BATCH_OUT_DIR) / "**" / PATTERN), recursive=True))
    summary_files = [
        Path(m) for m in matches
        if Path(m).resolve().parent != out_dir_resolved
    ]

    if not summary_files:
        print("⚠ No summary files found.")
        return []

    print(f"Found {len(summary_files)} summary file(s). Copying to {SUMMARY_OUT}...")
    claimed = set()   # destination names used during this call (case-normalised)
    copied = []
    for src in summary_files:
        dst_name = src.name
        if os.path.normcase(dst_name) in claimed:
            dst_name = f"{src.parent.name}_{src.name}"
            counter = 1
            while os.path.normcase(dst_name) in claimed:
                dst_name = f"{src.parent.name}_{counter}_{src.name}"
                counter += 1
        claimed.add(os.path.normcase(dst_name))

        dst = out_dir / dst_name
        shutil.copy2(src, dst)
        copied.append(dst)
        print(f"Copied: {src} → {dst}")

    print("✅ Copy complete.")
    return copied


def concat_csvs(folder_path: str, output_file: str):
    """Concatenate the CSV files in folder_path into output_file.

    A 'fractions' column is added to every input before concatenation. Its
    value is decided once per file, case-insensitively: first from the file
    name ('_pellet_' -> "Pellet", '_soluble_' -> "Soluble"), then, if the name
    is inconclusive, from the 'source_file' column; otherwise "Unknown".

    Args:
        folder_path: folder whose top-level ``*.csv`` files are the inputs.
        output_file: path (str or Path) of the combined CSV to write.

    Raises:
        FileNotFoundError: if folder_path holds no input CSV files.

    output_file itself is never read as an input. It normally lives in
    folder_path, so without this exclusion a second run would read the
    previous report back in and write every row twice.
    """
    output_key = os.path.normcase(os.path.abspath(os.fspath(output_file)))
    # Sorted for a reproducible row order in the combined report.
    csv_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(".csv")
        and os.path.normcase(os.path.abspath(os.path.join(folder_path, name))) != output_key
    )
    if not csv_files:
        raise FileNotFoundError("No CSV files found in the folder!")

    df_list = []
    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        print(f"Reading {file_path}")
        df = pd.read_csv(file_path)

        # Determine fraction based on file name or source_file column
        fraction = None
        lowered = file.lower()
        if '_pellet_' in lowered:
            fraction = "Pellet"
        elif '_soluble_' in lowered:
            fraction = "Soluble"
        elif 'source_file' in df.columns:
            # File name was inconclusive: look inside the data instead.
            if df['source_file'].str.contains('_pellet_', case=False, na=False).any():
                fraction = "Pellet"
            elif df['source_file'].str.contains('_soluble_', case=False, na=False).any():
                fraction = "Soluble"

        # Default to "Unknown" if not detected
        df['fractions'] = fraction if fraction else "Unknown"
        df_list.append(df)

    combined_df = pd.concat(df_list, ignore_index=True)
    combined_df.to_csv(output_file, index=False)
    print(f"✅ Combined {len(csv_files)} files → {output_file}")
    print("🧪 Added 'fractions' column with values 'Pellet' or 'Soluble'.")


def main():
    """Collect the per-sample summaries, then merge them into one report.

    The merge step is skipped when nothing was collected.
    """
    if not collect_summary_files():
        return
    concat_csvs(SUMMARY_OUT, OUTPUT_FILE)


# Entry point: collect and merge the summaries when run as the top-level module.
if __name__ == "__main__":
    main()


Found 8 summary file(s). Copying to F:\binary\final\report...
Copied: F:\binary\final\firstpass\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA_assignments_summary.csv → F:\binary\final\report\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA_assignments_summary.csv
Copied: F:\binary\final\firstpass\frac_pellet_grads_AB__pos_1__neg_0_negabs_runB\frac_pellet_grads_AB__pos_1__neg_0_negabs_runB_assignments_summary.csv → F:\binary\final\report\frac_pellet_grads_AB__pos_1__neg_0_negabs_runB_assignments_summary.csv
Copied: F:\binary\final\firstpass\frac_pellet_grads_AB__pos_1__neg_0_pos_runA\frac_pellet_grads_AB__pos_1__neg_0_pos_runA_assignments_summary.csv → F:\binary\final\report\frac_pellet_grads_AB__pos_1__neg_0_pos_runA_assignments_summary.csv
Copied: F:\binary\final\firstpass\frac_pellet_grads_AB__pos_1__neg_0_pos_runB\frac_pellet_grads_AB__pos_1__neg_0_pos_runB_assignments_summary.csv → F:\binary\final\report\frac_pellet_grads_AB__pos_1__neg

Quantification of all proteoforms

In [25]:
import os
import ast
import pandas as pd

# --------------------
# Config (edit paths)
# --------------------
DATASET_RT_PATH = r"F:\binary\final\ms1_per_sample.csv"          # wide matrix with cast_* columns
ASSIGNMENTS_PATH = r"F:\binary\final\report\report.csv"          # has 'fractions' and 'matched_mz_list'
# Output is written next to ASSIGNMENTS_PATH (current directory if that path has no folder part).
# NOTE(review): this is the same folder the "Combining the reports" cell hands to
# concat_csvs(), which reads every *.csv it finds there — re-running that cell after
# this one would fold this output into report.csv. Confirm, or write it elsewhere.
OUT_PATH = os.path.join(
    os.path.dirname(ASSIGNMENTS_PATH) or ".",
    "assignments_with_quant_sums_aaa.csv"
)

# --------------------
# Helpers
# --------------------
def to_cast_col(n: float) -> str:
    """Map an m/z to its cast_* column name: int((mz-600)*10), zero-padded to 5 digits.

    Args:
        n: m/z value (anything ``float()`` accepts).

    Returns:
        Column name such as ``"cast_02649"``.

    The scaled value is rounded to 6 decimals before truncation. m/z values
    given to one decimal sit exactly on a bin edge, and binary floating point
    can leave the product a hair below the integer, e.g.
    ``(600.3 - 600.0) * 10.0 == 2.9999999999995453``. Plain ``int()`` would
    then select the previous column. Values that are not within 1e-6 of an
    edge are unaffected.
    """
    scaled = round((float(n) - 600.0) * 10.0, 6)
    col_num = int(scaled)
    return "cast_" + str(col_num).zfill(5)

def parse_mz_list(val):
    """Turn a matched_mz_list cell such as '[864.9, 865.2, ...]' into a list of floats.

    Anything that cannot be read as a list/tuple of numbers yields an empty list.
    """
    try:
        parsed = ast.literal_eval(str(val))
    except Exception:
        # Not a Python literal at all (e.g. 'nan', free text).
        return []

    if not isinstance(parsed, (list, tuple)):
        return []

    try:
        return [float(item) for item in parsed]
    except Exception:
        # At least one element is not numeric.
        return []

# --------------------
# Load data
# --------------------
df_rt = pd.read_csv(DATASET_RT_PATH)
df_asn = pd.read_csv(ASSIGNMENTS_PATH)

# Basic checks
for col in ["fractions", "target"]:
    if col not in df_rt.columns:
        raise KeyError(f"'{col}' column is required in {DATASET_RT_PATH}")

if "fractions" not in df_asn.columns or "matched_mz_list" not in df_asn.columns:
    raise KeyError("assignments CSV must contain 'fractions' and 'matched_mz_list' columns")

# NEW columns to be added to assignments
new_cols = ["group_0_sum", "group_1_sum", "group_2_sum", "group_3_sum",
            "n_mz_used", "n_mz_found", "missing_cast_columns"]
# Drop stale copies of the result columns (e.g. from a previous run) so the
# concat below cannot produce duplicate column names.
df_asn = df_asn.drop(columns=[c for c in new_cols if c in df_asn.columns])

# --------------------
# Row-wise quantification
# --------------------
# For every assignment row: take the rows of df_rt belonging to the same
# fraction, and sum the intensities of the cast_* columns named by the row's
# matched m/z values, separately for each 'target' group 0..3.
rt_columns = set(df_rt.columns)   # loop-invariant membership test
frac_frames = {}                  # fraction value -> matching slice of df_rt

results = []
for idx, row in df_asn.iterrows():
    frac_value = row["fractions"]  # keep as-is (can be string like 'soluble_fraction')
    mz_list = parse_mz_list(row["matched_mz_list"])
    cast_cols = [to_cast_col(mz) for mz in mz_list]
    # Several m/z values can fall into the same 0.1-wide bin. Each column must
    # be summed only once, otherwise its intensity is double-counted.
    unique_cols = list(dict.fromkeys(cast_cols))

    # Filter dataset_rt to this fraction (computed once per distinct fraction)
    if frac_value not in frac_frames:
        frac_frames[frac_value] = df_rt[df_rt["fractions"] == frac_value]
    df_frac = frac_frames[frac_value]

    if df_frac.empty:
        # No samples for this fraction: nothing can be quantified.
        results.append(dict(
            group_0_sum=float("nan"),
            group_1_sum=float("nan"),
            group_2_sum=float("nan"),
            group_3_sum=float("nan"),
            n_mz_used=len(cast_cols),
            n_mz_found=0,
            missing_cast_columns=", ".join(unique_cols) if unique_cols else ""
        ))
        continue

    existing = [c for c in unique_cols if c in rt_columns]
    missing = [c for c in unique_cols if c not in rt_columns]
    # Counted per m/z value (not per unique column) so it stays comparable
    # with n_mz_used.
    n_found = sum(1 for c in cast_cols if c in rt_columns)

    if not existing:
        sums = {0: float("nan"), 1: float("nan"), 2: float("nan"), 3: float("nan")}
    else:
        # Sum intensities across all selected cast_* columns per target
        grouped = df_frac.groupby("target")[existing].sum()
        total_per_target = grouped.sum(axis=1)  # sum across those cast_* columns
        sums = {t: float(total_per_target.get(t, float("nan"))) for t in [0, 1, 2, 3]}

    results.append(dict(
        group_0_sum=sums[0],
        group_1_sum=sums[1],
        group_2_sum=sums[2],
        group_3_sum=sums[3],
        n_mz_used=len(cast_cols),
        n_mz_found=n_found,
        missing_cast_columns=", ".join(missing) if missing else ""
    ))

# Attach results
df_quant = pd.DataFrame(results, index=df_asn.index)
df_asn_out = pd.concat([df_asn, df_quant], axis=1)

# --------------------
# Save updated CSV
# --------------------
df_asn_out.to_csv(OUT_PATH, index=False)
print(f"Saved: {OUT_PATH}")




Saved: F:\binary\final\report\assignments_with_quant_sums_aaa.csv


Identification of PFR by matching with the TDPortal report

In [None]:
# -*- coding: utf-8 -*-
"""
Two-pass matching with precursor-based support:

PRIMARY (Accession):
  - For each precursor m/z in matched_mz_list:
      collect Accessions where |precursor_mz - m/z| <= MZ_TOL AND |MASS - neutral_mass| <= MASS_TOL
  - Choose the mode Accession by MAX number of precursors that contain it (support).
    Ties -> break by total match frequency, then lexicographic.
  - primary_count = number of precursors supporting that mode Accession.
  - Accept primary if primary_count >= 2.

SECONDARY (Prediction, only if primary fails):
  - For each precursor m/z:
      collect predictions where |precursor_mz - m/z| <= PRED_MZ_TOL (ignore MASS)
      (normalize to "ASYN" or "NASYN"; output uses "ASYN" / "nASYN")
  - Choose the mode prediction by MAX precursor support (same tie-break rules).
  - secondary_count = number of precursors supporting that mode prediction.
  - Accept secondary if secondary_count >= 4.

Else => final_call = "unidentified".
"""

import os
import ast
from collections import Counter, defaultdict
from typing import List, Optional, Tuple, Dict

import numpy as np
import pandas as pd

# ----------------------------
# CONFIG
# ----------------------------
# Input: combined assignments report; main() reads 'matched_mz_list' and 'neutral_mass' from it.
REPORT_PATH = r"F:/binary/final/report/report.csv"
# Input: MS2 table; must contain 'precursor_mz', 'MASS', 'Accession' and 'prediction'.
MS2_PATH    = r"F:/binary/final/ms2_per_scan_with_ids_hash.csv"
# Output: the report plus the primary/secondary/final_call columns.
OUTPUT_PATH = r"F:/binary/final/ids2.csv"

# Tolerances
# All are absolute windows: a scan matches when |difference| <= tolerance.
MZ_TOL      = 2.0     # primary (m/z + mass)
MASS_TOL    = 50.0    # primary (Da)
PRED_MZ_TOL = 0.5     # secondary (m/z only)

# Thresholds applied to PRECURSOR SUPPORT (not raw matches)
# A call is accepted only when at least this many precursor m/z values support it.
PRIMARY_MIN_PRECURSORS   = 2
SECONDARY_MIN_PRECURSORS = 4

# ----------------------------
# Helpers
# ----------------------------
def _num(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s, errors="coerce")

def _safe_parse_list(val) -> List[float]:
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
            if isinstance(parsed, (list, tuple, np.ndarray)):
                return [float(x) for x in parsed]
        except Exception:
            pass
    elif isinstance(val, (list, tuple, np.ndarray)):
        try:
            return [float(x) for x in val]
        except Exception:
            pass
    return []

def _choose_mode_by_precursor_support(
    per_precursor_sets: List[set],
    total_match_counter: Counter
) -> Optional[str]:
    """
    Given:
      - per_precursor_sets: list of sets (one per precursor) containing labels present for that precursor
      - total_match_counter: counts across ALL matches (for tie-break)
    Return the label with:
      1) max precursor-support count (how many sets contain it),
      2) then max total match frequency,
      3) then lexicographic min.
    """
    if not per_precursor_sets:
        return None

    # precursor support counts
    support: Dict[str, int] = defaultdict(int)
    for s in per_precursor_sets:
        for lab in s:
            support[lab] += 1
    if not support:
        return None

    # Prepare sorted candidates by the criteria
    # (-support, -total_matches, label)
    candidates = sorted(
        support.keys(),
        key=lambda lab: (-support[lab], -total_match_counter.get(lab, 0), str(lab))
    )
    return candidates[0] if candidates else None

# ----------------------------
# Matching
# ----------------------------
def primary_accession_pass(
    df2: pd.DataFrame,
    mz_list: List[float],
    neutral_mass: float,
    mz_tol: float,
    mass_tol: float
) -> Tuple[Optional[str], int]:
    """
    Build, for each precursor m/z, the SET of matched Accessions (primary criteria).
    Choose the Accession with the highest number of precursor sets containing it.
    Return (mode_accession, precursor_support_count_for_mode).

    A row of df2 matches a precursor when
    |precursor_mz - m/z| <= mz_tol AND |MASS - neutral_mass| <= mass_tol.

    Rows whose Accession is missing are ignored. Once the column has been cast
    to str a missing value reads "nan" / "None" / "<NA>" (or is empty), and
    such a placeholder must never be counted as a candidate label, or it could
    be reported as the identified accession.

    Returns (None, 0) when mz_list is empty, neutral_mass is NaN, or nothing
    matches.
    """
    if not mz_list or pd.isna(neutral_mass):
        return None, 0

    # The mass criterion does not depend on the precursor: evaluate it once.
    mass_ok = (df2["MASS"] - neutral_mass).abs() <= mass_tol
    if not mass_ok.any():
        return None, 0

    labels = df2.loc[mass_ok, "Accession"].astype(str)
    placeholder = labels.str.strip().str.lower().isin({"", "nan", "none", "<na>"})
    labels = labels[~placeholder]
    if labels.empty:
        return None, 0
    precursor_mz = df2.loc[labels.index, "precursor_mz"]

    per_precursor_sets: List[set] = []
    total_matches = Counter()

    for mz in mz_list:
        mz_ok = (precursor_mz - mz).abs() <= mz_tol
        accs = labels[mz_ok].tolist()
        if accs:
            per_precursor_sets.append(set(accs))
            total_matches.update(accs)   # every matching row counts (tie-break)

    if not per_precursor_sets:
        return None, 0

    mode_acc = _choose_mode_by_precursor_support(per_precursor_sets, total_matches)
    if mode_acc is None:
        return None, 0

    # precursor-based count for the chosen mode
    support_count = sum(1 for s in per_precursor_sets if mode_acc in s)
    return mode_acc, support_count

def secondary_prediction_pass(
    df2: pd.DataFrame,
    mz_list: List[float],
    mz_tol: float
) -> Tuple[Optional[str], int]:
    """
    Per precursor m/z, collect the SET of predictions matched within mz_tol (ignore MASS).
    Normalize predictions to "ASYN"/"NASYN". Output uses "ASYN" / "nASYN".
    Choose the prediction with the highest precursor support.
    Return (mode_prediction_token, precursor_support_count_for_mode).

    Ties on precursor support are broken by the total number of matching rows
    per label, the same rule the primary pass uses. The counter is therefore
    fed every matching row, not just one entry per precursor; otherwise it
    would merely repeat the support count and every tie would fall to "ASYN"
    alphabetically.

    Returns (None, 0) when mz_list is empty or no ASYN/NASYN row matches.
    """
    if not mz_list:
        return None, 0

    # Normalise once, and keep only rows carrying a usable label.
    normalized = df2["prediction"].astype(str).str.upper().str.strip()
    normalized = normalized[normalized.isin(("ASYN", "NASYN"))]
    if normalized.empty:
        return None, 0
    precursor_mz = df2.loc[normalized.index, "precursor_mz"]

    per_precursor_sets: List[set] = []
    total_matches = Counter()

    for mz in mz_list:
        mz_ok = (precursor_mz - mz).abs() <= mz_tol
        preds = normalized[mz_ok].tolist()
        if preds:
            per_precursor_sets.append(set(preds))
            total_matches.update(preds)   # every matching row counts (tie-break)

    if not per_precursor_sets:
        return None, 0

    mode_pred = _choose_mode_by_precursor_support(per_precursor_sets, total_matches)
    if mode_pred is None:
        return None, 0

    support_count = sum(1 for s in per_precursor_sets if mode_pred in s)
    # canonicalize to requested output token
    mode_token = "ASYN" if mode_pred == "ASYN" else "nASYN"
    return mode_token, support_count

# ----------------------------
# Main
# ----------------------------
def main():
    """Run the two-pass identification over every report row and save the result.

    For each row the primary (Accession) pass is tried first. If it does not
    reach PRIMARY_MIN_PRECURSORS, the secondary (prediction) pass is tried
    against SECONDARY_MIN_PRECURSORS. Rows that satisfy neither are labelled
    "unidentified". Five columns are appended to the report, which is then
    written to OUTPUT_PATH.

    Raises:
        FileNotFoundError: REPORT_PATH or MS2_PATH does not exist.
        KeyError: the MS2 file lacks a required column.
    """
    # Load
    for required_path in (REPORT_PATH, MS2_PATH):
        if not os.path.exists(required_path):
            raise FileNotFoundError(f"Not found: {required_path}")

    df1 = pd.read_csv(REPORT_PATH)
    df2 = pd.read_csv(MS2_PATH)

    # Required columns (the first missing one, in this order, is reported)
    absent = [c for c in ("precursor_mz", "MASS", "Accession", "prediction")
              if c not in df2.columns]
    if absent:
        raise KeyError(f"MS2 file must contain '{absent[0]}' column.")

    # Types
    for numeric_col in ("precursor_mz", "MASS"):
        df2[numeric_col] = _num(df2[numeric_col])
    for text_col in ("Accession", "prediction"):
        df2[text_col] = df2[text_col].astype(str)

    # One tuple per report row:
    # (primary_mode, primary_count, secondary_mode, secondary_count, final_call)
    calls = []
    for _, row in df1.iterrows():
        mz_list = _safe_parse_list(row.get("matched_mz_list", []))
        neutral_mass = row.get("neutral_mass", np.nan)

        # PRIMARY (Accession, m/z+mass)
        p_mode, p_support = primary_accession_pass(df2, mz_list, neutral_mass, MZ_TOL, MASS_TOL)
        primary_ok = p_mode is not None and p_support >= PRIMARY_MIN_PRECURSORS
        if primary_ok:
            calls.append((p_mode, p_support, None, 0, p_mode))
            continue

        # SECONDARY (Prediction, m/z-only) — only reached when primary failed
        s_mode, s_support = secondary_prediction_pass(df2, mz_list, PRED_MZ_TOL)
        secondary_ok = s_mode is not None and s_support >= SECONDARY_MIN_PRECURSORS
        if secondary_ok:
            calls.append((None, 0, s_mode, s_support, s_mode))
        else:
            calls.append((None, 0, None, 0, "unidentified"))

    # Attach outputs (counts are numbers of PRECURSORS supporting the mode)
    out_columns = (
        "primary_mode_accession",
        "primary_count",
        "secondary_mode_prediction",
        "secondary_count",
        "final_call",
    )
    for position, column_name in enumerate(out_columns):
        df1[column_name] = [call[position] for call in calls]

    # Save
    os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)
    df1.to_csv(OUTPUT_PATH, index=False)
    print(f"✅ Done. Saved → {OUTPUT_PATH}")

# Entry point: run the matching when this code executes as the top-level module.
if __name__ == "__main__":
    main()


✅ Done. Saved → F:/binary/final\ids2.csv
