Generating deconvoluted spectra from the informative features

In [1]:
# -*- coding: utf-8 -*-
import os
import subprocess
import shutil
import tempfile

def _unique_dst_path(dst_dir, fname):
    """Return a unique path in dst_dir for fname, adding a numeric suffix if needed."""
    base, ext = os.path.splitext(fname)
    candidate = os.path.join(dst_dir, fname)
    i = 1
    while os.path.exists(candidate):
        candidate = os.path.join(dst_dir, f"{base}__{i}{ext}")
        i += 1
    return candidate

def _prefixed_name(src_path, result_root):
    """
    Build a safer filename using the immediate parent folder under result/ as a prefix
    to reduce collisions: e.g., result/sampleA/sampleA_mass.txt -> sampleA__sampleA_mass.txt
    """
    # src_path like .../result/<parent>/<file>
    parent = os.path.basename(os.path.dirname(src_path))
    fname = os.path.basename(src_path)
    return f"{parent}__{fname}" if parent and parent != "result" else fname

def run_unidec_on_folder(folder_path):
    # Ensure result root folder exists
    result_root = os.path.join(folder_path, "result")
    os.makedirs(result_root, exist_ok=True)

    # Loop through files in the folder (top-level only)
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Skip directories
        if not os.path.isfile(file_path):
            continue

        # Create a unique subfolder named after the file (without extension)
        base_name = os.path.splitext(file_name)[0]
        file_result_folder = os.path.join(result_root, base_name)
        os.makedirs(file_result_folder, exist_ok=True)

        # Run UniDec for this file, send outputs to its subfolder
        print(f"Processing: {file_name} → {file_result_folder}")
        subprocess.run(["python", "-m", "unidec", "-f", file_path, "-o", file_result_folder])

    print("✅ All files processed. Results saved in:", result_root)

    # 1) Collect *_mass.txt paths from result_root (including subfolders)
    collected = []
    for root, _, files in os.walk(result_root):
        for f in files:
            if f.endswith("_mass.txt"):
                collected.append(os.path.join(root, f))

    if not collected:
        print("⚠️ No *_mass.txt files found under:", result_root)
        return

    # 2) Copy them to a temp folder FIRST (so deleting result/ content won't break src paths)
    temp_dir = tempfile.mkdtemp(prefix="mass_collect_")
    copied = []
    for src in collected:
        try:
            # Prefix with subfolder name to avoid collisions
            safe_name = _prefixed_name(src, result_root)
            dst = os.path.join(temp_dir, safe_name)
            dst = _unique_dst_path(temp_dir, os.path.basename(dst))  # ensure uniqueness
            print(f"Staging: {src} → {dst}")
            shutil.copy2(src, dst)
            copied.append(dst)
        except Exception as e:
            print(f"⚠️ Skip (copy error): {src} — {e}")

    # 3) Clean the result_root completely
    for item in os.listdir(result_root):
        item_path = os.path.join(result_root, item)
        try:
            if os.path.isfile(item_path) or os.path.islink(item_path):
                os.remove(item_path)
            else:
                shutil.rmtree(item_path)
        except Exception as e:
            print(f"⚠️ Could not remove {item_path}: {e}")

    # 4) Move staged files back into a clean result_root
    for staged in copied:
        try:
            final_dst = os.path.join(result_root, os.path.basename(staged))
            final_dst = _unique_dst_path(result_root, os.path.basename(final_dst))
            print(f"Finalizing: {staged} → {final_dst}")
            shutil.move(staged, final_dst)
        except Exception as e:
            print(f"⚠️ Move error for {staged}: {e}")

    # 5) Remove temp dir (ignore errors)
    try:
        shutil.rmtree(temp_dir)
    except Exception:
        pass

    print("📂 Clean result folder ready with only *_mass.txt files:", result_root)


if __name__ == "__main__":
    # Example usage
    folder_path = r"F:\binary\raw"  # <-- replace with your folder
    run_unidec_on_folder(folder_path)


Processing: frac_pellet_grads_AB__pos_1__neg_0_negabs_runA.csv → F:\binary\raw\result\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA
Processing: frac_pellet_grads_AB__pos_1__neg_0_negabs_runB.csv → F:\binary\raw\result\frac_pellet_grads_AB__pos_1__neg_0_negabs_runB
Processing: frac_pellet_grads_AB__pos_1__neg_0_pos_runA.csv → F:\binary\raw\result\frac_pellet_grads_AB__pos_1__neg_0_pos_runA
Processing: frac_pellet_grads_AB__pos_1__neg_0_pos_runB.csv → F:\binary\raw\result\frac_pellet_grads_AB__pos_1__neg_0_pos_runB
Processing: frac_soluble_grads_AB__pos_1__neg_0_negabs_runA.csv → F:\binary\raw\result\frac_soluble_grads_AB__pos_1__neg_0_negabs_runA
Processing: frac_soluble_grads_AB__pos_1__neg_0_negabs_runB.csv → F:\binary\raw\result\frac_soluble_grads_AB__pos_1__neg_0_negabs_runB
Processing: frac_soluble_grads_AB__pos_1__neg_0_pos_runA.csv → F:\binary\raw\result\frac_soluble_grads_AB__pos_1__neg_0_pos_runA
Processing: frac_soluble_grads_AB__pos_1__neg_0_pos_runB.csv → F:\binary\raw\resu

In [2]:
# -*- coding: utf-8 -*-
"""
Pipeline:
1) Detect neutral-mass peaks from a deconvoluted spectrum (mass intensity; whitespace- or csv-delimited).
2) Use detected peaks as candidate proteins and assign raw MS1 peaks by charge-series matching.

Outputs:
- <deconv_stem>_detected_signals.csv / .png   (neutral-mass peak picks + metadata, includes SNR)
- OUT_DIR/assigned_ms1_with_peaks.csv         (annotated raw MS1 with assigned_mass/charge)
- OUT_DIR/assignments_summary.csv             (one row per accepted neutral mass; now includes SNR)
- OUT_DIR/<several_plots>.png                 (mirror plots + neutral-mass spectrum)
"""

from __future__ import annotations
import os
import re
import json
import glob  # <-- moved up to avoid duplicate import
from dataclasses import dataclass
from pathlib import Path
from bisect import bisect_left

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import find_peaks


# ============================================================
# --------------------  PEAK DETECTION  ----------------------
# ============================================================

@dataclass
class PeakFindingParams:
    min_prominence: float | None = None
    min_height: float | None = None
    min_distance_pts: int = 10
    smooth_window: int = 0
    min_snr: float = 0.0


def _mad_sigma(y: np.ndarray) -> float:
    if y.size == 0:
        return 0.0
    med = np.median(y)
    mad = np.median(np.abs(y - med))
    return 1.4826 * mad


def _smooth(y: np.ndarray, window: int) -> np.ndarray:
    if window < 3 or window % 2 == 0:
        return y
    kernel = np.ones(window, dtype=float) / window
    return np.convolve(y, kernel, mode="same")


def _extract_id_list(name: str, key: str) -> list[int] | None:
    """
    Extract an underscore- or hyphen-separated list of integers after a key.
    Examples:
      "__pos_3__"          -> [3]
      "__neg_0_1_2_"       -> [0,1,2]
      "-pos-10-11"         -> [10,11]
    """
    m = re.search(rf"(?:^|[_-]){key}((?:[_-]\d+)+)(?=[_-]|$)", name, flags=re.I)
    if not m:
        m1 = re.search(rf"(?:^|[_-]){key}[_-]?(\d+)(?=[_-]|$)", name, flags=re.I)
        if m1:
            return [int(m1.group(1))]
        return None
    parts = re.findall(r"\d+", m.group(1))
    return [int(x) for x in parts] if parts else None


def parse_metadata_from_filename(path: str | Path) -> dict:
    """
    Extract bin, experiments/controls (IDs + counts), regulation, replicate, source_file.
    Regulation token is taken as the LAST occurrence among (negabs|posabs|neg|pos).
    """
    p = Path(path)
    name = p.stem

    meta = {
        "bin": None,
        "experiments": None,
        "controls": None,
        "experiments_ids": None,
        "controls_ids": None,
        "experiments_n": None,
        "controls_n": None,
        "regulation": None,
        "replicate": None,
        "source_file": p.name,
    }

    # bin: "bin5"/"bin_5" or a leading number "75__pos_..."
    m = re.search(r"(?:^|[_-])bin[_-]?(\d+)(?=[_-]|$)", name, flags=re.I)
    if m:
        meta["bin"] = int(m.group(1))
    else:
        m2 = re.match(r"^(\d+)(?=[_-])", name)
        if m2:
            meta["bin"] = int(m2.group(1))

    exp_ids = _extract_id_list(name, "pos")
    ctl_ids = _extract_id_list(name, "neg")
    if exp_ids is not None:
        meta["experiments_ids"] = ",".join(str(x) for x in exp_ids)
        meta["experiments_n"] = len(exp_ids)
        meta["experiments"] = len(exp_ids)
    if ctl_ids is not None:
        meta["controls_ids"] = ",".join(str(x) for x in ctl_ids)
        meta["controls_n"] = len(ctl_ids)
        meta["controls"] = len(ctl_ids)

    reg_tokens = [m.group(1).lower() for m in re.finditer(
        r"(?:^|[_-])(negabs|posabs|neg|pos)(?=[_-]|$)", name, flags=re.I
    )]
    if reg_tokens:
        token = reg_tokens[-1]
        reg_map = {"negabs": "downregulated", "neg": "downregulated",
                   "posabs": "upregulated", "pos": "upregulated"}
        meta["regulation"] = reg_map.get(token)

    m = re.search(r"(?:^|[_-])run([A-Za-z])(?=[_-]|$)", name)
    if m:
        meta["replicate"] = m.group(1).upper()

    return meta


def load_space_separated(path: str | Path) -> pd.DataFrame:
    path = Path(path)
    # whitespace-delimited by default; fallback to CSV
    try:
        df = pd.read_csv(path, sep=r"\s+", engine="python", header=None,
                         names=["mass", "intensity"], comment="#")
    except Exception:
        df = pd.read_csv(path, header=None)
        if df.shape[1] >= 2:
            df = df.iloc[:, :2]
            df.columns = ["mass", "intensity"]
        else:
            raise ValueError("Deconvoluted file must have at least two columns: mass intensity")
    return df


def detect_signals(
    df: pd.DataFrame,
    params: PeakFindingParams = PeakFindingParams()
) -> pd.DataFrame:
    # normalize columns
    if not {"mass", "intensity"}.issubset(df.columns):
        if df.shape[1] >= 2:
            df = df.copy()
            df.columns = ["mass", "intensity"] + [f"col{i}" for i in range(2, df.shape[1])]
        else:
            raise ValueError("Input DataFrame must have columns ['mass','intensity'].")

    df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=["mass", "intensity"])
    df = df.sort_values("mass").reset_index(drop=True)

    x = df["mass"].to_numpy(float)
    y = df["intensity"].to_numpy(float)

    y_proc = _smooth(y, params.smooth_window)

    sigma = _mad_sigma(y_proc)
    ymax = float(np.max(y_proc)) if y_proc.size else 0.0

    min_prom = params.min_prominence or max(6.0 * sigma, 0.001 * ymax)
    min_h    = params.min_height     or max(4.0 * sigma, 0.0005 * ymax)

    peaks, props = find_peaks(
        y_proc,
        prominence=min_prom,
        height=min_h,
        distance=max(1, int(params.min_distance_pts))
    )

    out = pd.DataFrame({
        "mass": x[peaks],
        "intensity": y[peaks],
        "prominence": props.get("prominences", np.full(peaks.shape, np.nan)),
        "left_base_idx": props.get("left_bases", np.full(peaks.shape, -1)),
        "right_base_idx": props.get("right_bases", np.full(peaks.shape, -1)),
    })

    # SNR estimate and filter
    snr_den = sigma if sigma > 0 else (np.std(y_proc) if y_proc.size else 1.0)
    snr_den = snr_den if snr_den > 0 else 1.0
    out["snr"] = out["intensity"] / snr_den

    if params.min_snr > 0:
        out = out[out["snr"] >= params.min_snr].reset_index(drop=True)

    return out.sort_values("intensity", ascending=False).reset_index(drop=True)


def plot_spectrum_with_peaks(
    df: pd.DataFrame,
    peaks_df: pd.DataFrame,
    out_png: str | Path | None = None,
    title: str = "Detected Neutral-Mass Signals"
) -> None:
    plt.figure(figsize=(10, 4))
    plt.plot(df["mass"].to_numpy(), df["intensity"].to_numpy(), linewidth=1)
    if peaks_df is not None and not peaks_df.empty:
        plt.scatter(
            peaks_df["mass"].to_numpy(),
            peaks_df["intensity"].to_numpy(),  # use peaks’ intensities
            s=18
        )
    plt.xlabel("Neutral mass (Da)")
    plt.ylabel("Intensity (arb.)")
    plt.title(title)
    plt.tight_layout()
    if out_png:
        plt.savefig(out_png, dpi=150)
    plt.close()


# ============================================================
# -----------------  CHARGE-SERIES MATCHING  -----------------
# ============================================================

# Matching parameters (tweak as needed)
PROTON_MASS = 1.007276466812  # Da

Z_MIN, Z_MAX = 5, 50
PPM_TOL = 1000.0              # ppm window for m/z match
ABS_DA_TOL = 1.0              # absolute Da floor (used with ppm)
MIN_MATCHED_CHARGE_STATES = 4 # require ≥ N charge-state hits to accept a protein


def _read_raw_ms1(path: str) -> pd.DataFrame:
    """
    Robustly read raw MS1 CSV. Expected two columns (m/z, intensity), with or without headers.
    If more columns exist, pick the best 'mz' and 'intensity' columns.
    """
    try:
        df = pd.read_csv(path)
    except Exception:
        df = pd.read_csv(path, header=None)

    if df.shape[1] == 2:
        df.columns = ["mz", "intensity"]
    else:
        cols_lower = [str(c).lower() for c in df.columns]
        mz_candidates = [i for i, c in enumerate(cols_lower)
                         if ("mz" in c) or ("m/z" in c) or ("mass/charge" in c) or (c.strip() == "m z")]
        int_candidates = [i for i, c in enumerate(cols_lower)
                          if ("int" in c) or ("abund" in c) or ("height" in c) or ("signal" in c)]
        if not mz_candidates:
            mz_candidates = [0]
        if not int_candidates:
            int_candidates = [1 if df.shape[1] > 1 else 0]
        df = df.iloc[:, [mz_candidates[0], int_candidates[0]]].copy()
        df.columns = ["mz", "intensity"]

    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    df = df[df["intensity"] > 0].copy()
    df["mz"] = pd.to_numeric(df["mz"], errors="coerce")
    df["intensity"] = pd.to_numeric(df["intensity"], errors="coerce")
    df = df.dropna().sort_values("mz").reset_index(drop=True)
    return df


def _ppm_window(target_mz: float, ppm: float, abs_da: float) -> tuple[float, float]:
    da = target_mz * ppm * 1e-6
    tol = max(da, abs_da)
    return target_mz - tol, target_mz + tol


def _match_targets(sorted_mz: np.ndarray, targets: np.ndarray,
                   ppm: float, abs_da: float, available_mask: np.ndarray) -> dict[int, int | None]:
    results: dict[int, int | None] = {}
    for ti, t in enumerate(targets):
        lo, hi = _ppm_window(t, ppm, abs_da)
        j = bisect_left(sorted_mz, t)
        best_idx = None
        best_delta = float("inf")
        for k in (j, j-1, j+1, j-2, j+2, j-3, j+3):
            if 0 <= k < len(sorted_mz):
                mz_k = sorted_mz[k]
                if available_mask[k] and (lo <= mz_k <= hi):
                    delta = abs(mz_k - t)
                    if delta < best_delta:
                        best_delta = delta
                        best_idx = k
        results[ti] = best_idx
    return results


def _generate_charge_series(neutral_mass: float, z_min: int, z_max: int) -> pd.DataFrame:
    z = np.arange(z_min, z_max + 1, dtype=int)
    mz = (neutral_mass + z * PROTON_MASS) / z
    return pd.DataFrame({"z": z, "target_mz": mz})


def assign_ms1_peaks(raw_df: pd.DataFrame, deconv_peaks_df: pd.DataFrame,
                     meta: dict | None = None) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Assign raw MS1 peaks to detected neutral masses (from deconvoluted spectrum) by charge-series matching.

    Returns:
      assigned_raw: raw_df with columns [assigned_mass, assigned_charge, is_assigned]
      assignments_summary: one row per accepted neutral mass with metadata, includes SNR
    """
    raw_df = raw_df.sort_values("mz").reset_index(drop=True)
    mz_arr = raw_df["mz"].to_numpy()
    inten_arr = raw_df["intensity"].to_numpy()
    available = np.ones(len(raw_df), dtype=bool)

    assigned_mass = np.full(len(raw_df), np.nan)
    assigned_z    = np.full(len(raw_df), np.nan)

    summary_rows = []

    for r in deconv_peaks_df.itertuples(index=False):
        mass = float(r.mass)
        mass_intensity = float(r.intensity)
        mass_snr = float(getattr(r, "snr", np.nan))  # <-- carry SNR into the summary

        series = _generate_charge_series(mass, Z_MIN, Z_MAX)
        targets = series["target_mz"].to_numpy()

        matches = _match_targets(mz_arr, targets, PPM_TOL, ABS_DA_TOL, available_mask=available)

        matched_indices = []
        matched_z_list  = []
        matched_mz_list = []

        for ti, k in matches.items():
            if k is not None:
                matched_indices.append(k)
                matched_z_list.append(int(series.iloc[ti]["z"]))
                matched_mz_list.append(mz_arr[k])

        if len(matched_indices) >= MIN_MATCHED_CHARGE_STATES:
            # accept and mark assigned
            for idx, z_val in zip(matched_indices, matched_z_list):
                if available[idx]:
                    available[idx] = False
                    assigned_mass[idx] = mass
                    assigned_z[idx] = z_val

            frac_intensity_removed = (
                float(np.sum(inten_arr[matched_indices])) / float(np.sum(inten_arr))
                if inten_arr.sum() > 0 else 0.0
            )

            row = {
                "neutral_mass": mass,
                "deconv_intensity": mass_intensity,
                "snr": mass_snr,  # <-- new column in assignments_summary
                "n_matches": len(matched_indices),
                "matched_z_list": json.dumps(matched_z_list),
                "matched_mz_list": json.dumps([round(float(x), 1) for x in matched_mz_list]),
                "ppm_tol": PPM_TOL,
                "abs_da_tol": ABS_DA_TOL,
                "fraction_total_intensity_captured": frac_intensity_removed
            }
            # attach filename metadata if available
            if meta:
                row.update({
                    "bin": meta.get("bin"),
                    "experiments_ids": meta.get("experiments_ids"),
                    "controls_ids": meta.get("controls_ids"),
                    "regulation": meta.get("regulation"),
                    "replicate": meta.get("replicate"),
                    "source_file": meta.get("source_file"),
                })
            summary_rows.append(row)

    assigned_raw = raw_df.copy()
    assigned_raw["assigned_mass"] = assigned_mass
    assigned_raw["assigned_charge"] = assigned_z
    assigned_raw["is_assigned"] = ~np.isnan(assigned_mass)

    assignments_summary = pd.DataFrame(summary_rows).sort_values(
        "deconv_intensity", ascending=False).reset_index(drop=True)

    # if meta present, also add bin to assigned_raw (useful downstream)
    if meta and "bin" in meta:
        assigned_raw["bin"] = meta["bin"]

    return assigned_raw, assignments_summary


# ============================================================
# ----------------------  PLOTTING  --------------------------
# ============================================================

def plot_neutral_mass_spectrum(deconv_peaks_df: pd.DataFrame, out_dir: str,
                               filename: str = "neutral_mass_spectrum.png"):
    if deconv_peaks_df.empty:
        print("No neutral masses to plot.")
        return
    masses = deconv_peaks_df["mass"].to_numpy()
    intens = deconv_peaks_df["intensity"].to_numpy()

    plt.figure(figsize=(9, 4.5))
    plt.vlines(masses, 0, intens, linewidth=1)
    plt.xlabel("Neutral mass (Da)")
    plt.ylabel("Intensity (arb.)")
    plt.title("Neutral Mass Spectrum (detected peaks)")
    plt.tight_layout()
    out_path = os.path.join(out_dir, filename)
    plt.savefig(out_path, dpi=200)
    plt.close()
    print(f"Saved plot: {out_path}")


def plot_mirror_assigned_vs_total(assigned_raw: pd.DataFrame, out_dir: str,
                                  filename: str = "mirror_assigned_vs_total.png"):
    if assigned_raw.empty:
        print("No assigned/raw data to plot.")
        return

    mz = assigned_raw["mz"].to_numpy()
    total_int = assigned_raw["intensity"].to_numpy()
    assigned_mask = assigned_raw["is_assigned"].to_numpy(dtype=bool)
    assigned_int = np.where(assigned_mask, total_int, 0.0)

    plt.figure(figsize=(10, 5.2))
    plt.vlines(mz, 0, total_int, linewidth=0.6)                 # Top: total
    plt.vlines(mz[assigned_mask], 0, -assigned_int[assigned_mask], linewidth=0.8)  # Bottom: assigned (neg)

    ymax = total_int.max() if len(total_int) else 1.0
    ymin = -assigned_int.max() if assigned_int.any() else -0.1 * ymax
    plt.ylim(ymin * 1.05, ymax * 1.05)

    plt.xlabel("m/z")
    plt.ylabel("Intensity (arb.)")
    plt.title("Mirror Plot: Total (top) vs Assigned (bottom)")
    plt.tight_layout()
    out_path = os.path.join(out_dir, filename)
    plt.savefig(out_path, dpi=200)
    plt.close()
    print(f"Saved plot: {out_path}")


def plot_mirror_unassigned_vs_total(assigned_raw: pd.DataFrame, out_dir: str,
                                    filename: str = "mirror_unassigned_vs_total.png"):
    if assigned_raw.empty:
        print("No assigned/raw data to plot.")
        return

    mz = assigned_raw["mz"].to_numpy()
    total_int = assigned_raw["intensity"].to_numpy()
    unassigned_mask = ~assigned_raw["is_assigned"].to_numpy(dtype=bool)
    unassigned_int = np.where(unassigned_mask, total_int, 0.0)

    plt.figure(figsize=(10, 5.2))
    plt.vlines(mz, 0, total_int, linewidth=0.6)                     # Top: total
    plt.vlines(mz[unassigned_mask], 0, -unassigned_int[unassigned_mask], linewidth=0.8)  # Bottom: unassigned

    ymax = total_int.max() if len(total_int) else 1.0
    ymin = -unassigned_int.max() if unassigned_int.any() else -0.1 * ymax
    plt.ylim(ymin * 1.05, ymax * 1.05)

    plt.xlabel("m/z")
    plt.ylabel("Intensity (arb.)")
    plt.title("Mirror Plot: Total (top) vs Non-assigned (bottom)")
    plt.tight_layout()
    out_path = os.path.join(out_dir, filename)
    plt.savefig(out_path, dpi=200)
    plt.close()
    print(f"Saved plot: {out_path}")


def plot_mirror_assigned_by_protein_vs_total(
    assigned_raw: pd.DataFrame,
    out_dir: str,
    filename: str = "mirror_assigned_by_protein_vs_total.png",
    max_legend_items: int = 20
):
    if assigned_raw.empty:
        print("No assigned/raw data to plot.")
        return

    mz = assigned_raw["mz"].to_numpy()
    total_int = assigned_raw["intensity"].to_numpy()

    plt.figure(figsize=(11, 5.6))
    plt.vlines(mz, 0, total_int, linewidth=0.5)  # top: total

    df_assigned_only = assigned_raw[assigned_raw["is_assigned"]].copy()
    if df_assigned_only.empty:
        plt.xlabel("m/z")
        plt.ylabel("Intensity (arb.)")
        plt.title("Mirror Plot: Total (top) vs Assigned by Protein (bottom)")
        plt.tight_layout()
        out_path = os.path.join(out_dir, filename)
        plt.savefig(out_path, dpi=200)
        plt.close()
        print(f"Saved plot: {out_path}")
        return

    counts = (
        df_assigned_only.groupby("assigned_mass", dropna=True)["is_assigned"]
        .count()
        .sort_values(ascending=False)
    )
    proteins_in_order = counts.index.tolist()
    color_cycle = plt.rcParams['axes.prop_cycle'].by_key().get(
        'color', ['C0','C1','C2','C3','C4','C5','C6','C7','C8','C9']
    )

    legend_entries = 0
    for i, mass in enumerate(proteins_in_order):
        mask = (assigned_raw["assigned_mass"] == mass)
        mz_i = assigned_raw.loc[mask, "mz"].to_numpy()
        inten_i = assigned_raw.loc[mask, "intensity"].to_numpy()

        label = None
        if legend_entries < max_legend_items:
            label = f"{mass/1000:.2f} kDa (n={len(mz_i)})"
            legend_entries += 1

        plt.vlines(
            mz_i, 0, -inten_i,
            linewidth=0.8,
            color=color_cycle[i % len(color_cycle)],
            label=label
        )

    ymax = total_int.max() if len(total_int) else 1.0
    ymin = -df_assigned_only["intensity"].max() if len(df_assigned_only) else -0.1 * ymax
    plt.ylim(ymin * 1.05, ymax * 1.05)

    if legend_entries:
        plt.legend(title="Assigned proteins", loc="upper right", fontsize=8, ncol=1)

    plt.xlabel("m/z")
    plt.ylabel("Intensity (arb.)")
    plt.title("Mirror Plot: Total (top) vs Assigned by Protein (bottom)")
    plt.tight_layout()
    out_path = os.path.join(out_dir, filename)
    plt.savefig(out_path, dpi=200)
    plt.close()
    print(f"Saved plot: {out_path}")


# ============================================================
# ----------------------  BATCH MAIN  ------------------------
# ============================================================
# Process many files: deconvoluted spectra in one folder,
# raw MS1 spectra in another folder. Outputs go to a separate folder.
#
# Pairing rule: files are matched by a shared "base key"
#   - deconv: <base>_mass.txt
#   - raw:    <base>.csv
# Example:
#   RAW_DIR:    F:/raw_folder
#       5__pos_1__neg_0_pos_runA.csv
#   DECONV_DIR: F:/deconv_folder
#       5__pos_1__neg_0_pos_runA_mass.txt
#   -> base key = "5__pos_1__neg_0_pos_runA"
#
# Notes:
# - Neutral-mass peak detection is always run (on decon files).
# - Assignment & mirror plots are run only if the matching raw CSV exists.
# - All outputs (CSVs/PNGs) go under OUT_DIR/<base>/...
# ============================================================

# ---------------------- User-configurable ----------------------
RAW_DIR    = r"F:\binary\raw"      # folder with raw MS1 CSVs (m/z, intensity)
DECONV_DIR = r"F:\binary\decon"    # folder with *_mass.txt (mass intensity)
OUT_DIR    = r"F:\binary\firstpass"   # folder to hold all outputs

# Glob patterns (adjust if your extensions differ)
RAW_GLOB    = "*.csv"
DECONV_GLOB = "*_mass.txt"

# ---- Neutral-mass peak detection parameters ----
DECONV_DETECT_PARAMS = PeakFindingParams(
    min_distance_pts=20,  # decon masses can be coarse; 20 is a good start
    min_snr=10,           # enforce minimum SNR
    smooth_window=0,      # set to 5/7 if your decon spectrum is very noisy
    # min_prominence=None, min_height=None  # auto from MAD if None
)

def _base_key_from_deconv(path: Path) -> str:
    """
    For '.../<base>_mass.txt' → return '<base>'.
    """
    stem = path.stem
    if stem.endswith("_mass"):
        return stem[:-5]  # drop "_mass"
    return stem  # fallback


def _base_key_from_raw(path: Path) -> str:
    """
    For '.../<base>.csv' → return '<base>'.
    """
    return path.stem


def _ensure_dir(p: str | Path) -> None:
    Path(p).mkdir(parents=True, exist_ok=True)


def process_one_pair(deconv_path: Path, raw_path: Path | None) -> None:
    """
    - Detect neutral-mass peaks from deconv_path (always).
    - If raw_path exists, assign charge-series and make plots.
    - Write all outputs under OUT_DIR/<base>/...
    """
    base = _base_key_from_deconv(deconv_path)
    out_root = Path(OUT_DIR) / base
    _ensure_dir(out_root)

    # --- 1) Load deconvoluted spectrum & detect neutral-mass peaks ---
    meta = parse_metadata_from_filename(deconv_path)
    deconv_raw = load_space_separated(deconv_path)
    deconv_peaks = detect_signals(deconv_raw, params=DECONV_DETECT_PARAMS)

    # Attach filename metadata to neutral-mass peaks table
    deconv_peaks = deconv_peaks.assign(
        bin=meta.get("bin"),
        experiments_ids=meta.get("experiments_ids"),
        controls_ids=meta.get("controls_ids"),
        regulation=meta.get("regulation"),
        replicate=meta.get("replicate"),
        source_file=meta.get("source_file"),
    )

    # Save + plot neutral-mass detections (to OUT_DIR/<base>/...)
    out_detect_csv = out_root / f"{base}_detected_signals.csv"
    out_detect_png = out_root / f"{base}_detected_signals.png"
    deconv_peaks.to_csv(out_detect_csv, index=False)
    plot_spectrum_with_peaks(deconv_raw, deconv_peaks, out_png=str(out_detect_png))
    print(f"[{base}] Neutral-mass detection: {len(deconv_peaks)} peaks → {out_detect_csv}")
    print(f"[{base}] Detection plot saved → {out_detect_png}")
    print(f"[{base}] Parsed filename metadata: {meta}")

    # --- 2) If we have the matching raw MS1 CSV, assign charge-series ---
    if raw_path is None or not raw_path.exists():
        print(f"[{base}] ⚠ No matching RAW CSV found. Skipping assignment.")
        return

    raw_df = _read_raw_ms1(raw_path)
    assigned_raw, summary = assign_ms1_peaks(raw_df, deconv_peaks, meta=meta)

    out_assigned = out_root / f"{base}_assigned_ms1_with_peaks.csv"
    out_summary  = out_root / f"{base}_assignments_summary.csv"
    assigned_raw.to_csv(out_assigned, index=False)
    summary.to_csv(out_summary, index=False)

    # --- 3) Plots on assignments ---
    plot_neutral_mass_spectrum(deconv_peaks, str(out_root), filename=f"{base}_neutral_mass_spectrum.png")
    plot_mirror_assigned_vs_total(assigned_raw, str(out_root), filename=f"{base}_mirror_assigned_vs_total.png")
    plot_mirror_unassigned_vs_total(assigned_raw, str(out_root), filename=f"{base}_mirror_unassigned_vs_total.png")
    plot_mirror_assigned_by_protein_vs_total(
        assigned_raw, str(out_root), filename=f"{base}_mirror_assigned_by_protein_vs_total.png", max_legend_items=20
    )

    # --- 4) Console report ---
    print(f"\n=== [{base}] Summary ===")
    print(f"Raw MS1 peaks (rows): {len(raw_df):,}")
    print(f"Detected neutral-mass peaks: {len(deconv_peaks):,}")
    print(f"Assigned raw peaks: {int(assigned_raw['is_assigned'].sum()):,}")
    print(f"Non-assigned raw peaks: {int((~assigned_raw['is_assigned']).sum()):,}")
    print(f"Saved: {out_assigned}")
    print(f"Saved: {out_summary}")
    print("------------------------------------------------------------")


def main():
    _ensure_dir(OUT_DIR)

    # Index raw files by base key
    raw_files = [Path(p) for p in glob.glob(str(Path(RAW_DIR) / RAW_GLOB))]
    raw_index = {_base_key_from_raw(p): p for p in raw_files}

    # Walk all deconvoluted files and process
    deconv_files = [Path(p) for p in glob.glob(str(Path(DECONV_DIR) / DECONV_GLOB))]
    if not deconv_files:
        print(f"⚠ No deconvoluted files found in: {DECONV_DIR} (pattern: {DECONV_GLOB})")
        return

    print(f"Found {len(deconv_files)} decon file(s) in {DECONV_DIR}")
    print(f"Found {len(raw_files)} raw CSV file(s) in {RAW_DIR}")

    for deconv_path in sorted(deconv_files):
        base = _base_key_from_deconv(deconv_path)
        raw_path = raw_index.get(base, None)
        try:
            process_one_pair(deconv_path, raw_path)
        except Exception as e:
            print(f"[{base}] ❌ Error: {e}")

    print("✅ Batch processing complete.")


if __name__ == "__main__":
    main()


Found 8 decon file(s) in F:\binary\decon
Found 8 raw CSV file(s) in F:\binary\raw
[frac_pellet_grads_AB__pos_1__neg_0_negabs_runA] Neutral-mass detection: 55 peaks → F:\binary\firstpass\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA_detected_signals.csv
[frac_pellet_grads_AB__pos_1__neg_0_negabs_runA] Detection plot saved → F:\binary\firstpass\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA_detected_signals.png
[frac_pellet_grads_AB__pos_1__neg_0_negabs_runA] Parsed filename metadata: {'bin': None, 'experiments': 1, 'controls': 1, 'experiments_ids': '1', 'controls_ids': '0', 'experiments_n': 1, 'controls_n': 1, 'regulation': 'downregulated', 'replicate': 'A', 'source_file': 'frac_pellet_grads_AB__pos_1__neg_0_negabs_runA_mass.txt'}
Saved plot: F:\binary\firstpass\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA_neutral_mass_spectrum.png
Saved plot: F

In [5]:
# -*- coding: utf-8 -*-
"""
Collect all *_assignments_summary.csv files from subfolders into one folder,
then concatenate them into a single report.csv.

Steps:
1. Search recursively for *_assignments_summary.csv in BATCH_OUT_DIR.
2. Copy all to SUMMARY_OUT (renaming duplicates).
3. Concatenate all collected CSVs → report.csv.
"""

import os
import glob
import shutil
from pathlib import Path
import pandas as pd

# ---------------------- USER SETTINGS ----------------------
BATCH_OUT_DIR = r"F:\binary\firstpass"          # where all subfolders were created
SUMMARY_OUT   = r"F:\binary\report1"             # folder to collect summaries
PATTERN       = "*_assignments_summary.csv"     # filename pattern
OUTPUT_FILE   = Path(SUMMARY_OUT) / "report.csv"
# ------------------------------------------------------------

def collect_summary_files():
    """Collect all *_assignments_summary.csv into SUMMARY_OUT."""
    os.makedirs(SUMMARY_OUT, exist_ok=True)
    summary_files = glob.glob(str(Path(BATCH_OUT_DIR) / "**" / PATTERN), recursive=True)

    if not summary_files:
        print("⚠ No summary files found.")
        return []

    print(f"Found {len(summary_files)} summary file(s). Copying to {SUMMARY_OUT}...")
    copied = []
    for f in summary_files:
        src = Path(f)
        dst = Path(SUMMARY_OUT) / src.name
        if dst.exists():
            dst = Path(SUMMARY_OUT) / f"{src.parent.name}_{src.name}"
        shutil.copy2(src, dst)
        copied.append(dst)
        print(f"Copied: {src} → {dst}")

    print("✅ Copy complete.")
    return copied


def concat_csvs(folder_path: str, output_file: str):
    """Concatenate all CSV files in folder_path → output_file."""
    csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]
    if not csv_files:
        raise FileNotFoundError("No CSV files found in the folder!")

    df_list = []
    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        print(f"Reading {file_path}")
        df = pd.read_csv(file_path)
        df_list.append(df)

    combined_df = pd.concat(df_list, ignore_index=True)
    combined_df.to_csv(output_file, index=False)
    print(f"✅ Combined {len(csv_files)} files → {output_file}")


def main():
    copied_files = collect_summary_files()
    if copied_files:
        concat_csvs(SUMMARY_OUT, OUTPUT_FILE)


if __name__ == "__main__":
    main()


Found 8 summary file(s). Copying to F:\binary\report1...
Copied: F:\binary\firstpass\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA_assignments_summary.csv → F:\binary\report1\frac_pellet_grads_AB__pos_1__neg_0_negabs_runA_assignments_summary.csv
Copied: F:\binary\firstpass\frac_pellet_grads_AB__pos_1__neg_0_negabs_runB\frac_pellet_grads_AB__pos_1__neg_0_negabs_runB_assignments_summary.csv → F:\binary\report1\frac_pellet_grads_AB__pos_1__neg_0_negabs_runB_assignments_summary.csv
Copied: F:\binary\firstpass\frac_pellet_grads_AB__pos_1__neg_0_pos_runA\frac_pellet_grads_AB__pos_1__neg_0_pos_runA_assignments_summary.csv → F:\binary\report1\frac_pellet_grads_AB__pos_1__neg_0_pos_runA_assignments_summary.csv
Copied: F:\binary\firstpass\frac_pellet_grads_AB__pos_1__neg_0_pos_runB\frac_pellet_grads_AB__pos_1__neg_0_pos_runB_assignments_summary.csv → F:\binary\report1\frac_pellet_grads_AB__pos_1__neg_0_pos_runB_assignments_summary.csv
Copied: F:\bi

Quantification of all proteoforms

In [9]:
import os
import ast
import pandas as pd

# --------------------
# Config (edit paths)
# --------------------
DATASET_RT_PATH = r"F:\binary\ms1_aggregate_per_sample_training.csv"          # wide matrix with cast_* columns
ASSIGNMENTS_PATH = r"F:\binary\report1\report.csv"          # has 'fractions' and 'matched_mz_list'
OUT_PATH = os.path.join(
    os.path.dirname(ASSIGNMENTS_PATH) or ".",
    "assignments_with_quant_sums_aaa.csv"
)

# --------------------
# Helpers
# --------------------
def to_cast_col(n: float) -> str:
    """Map an m/z to its cast_* column name: int((mz-600)*10), zero-padded."""
    col_num = int((float(n) - 600.0) * 10.0)
    return "cast_" + str(col_num).zfill(5)

def parse_mz_list(val):
    """Safely parse matched_mz_list cells that look like '[864.9, 865.2, ...]'."""
    try:
        out = ast.literal_eval(str(val))
        if isinstance(out, (list, tuple)):
            return [float(x) for x in out]
    except Exception:
        pass
    return []

# --------------------
# Load data
# --------------------
df_rt = pd.read_csv(DATASET_RT_PATH)
df_asn = pd.read_csv(ASSIGNMENTS_PATH)

# Basic checks
for col in ["fractions", "target"]:
    if col not in df_rt.columns:
        raise KeyError(f"'{col}' column is required in dataset_rt.csv")

if "fractions" not in df_asn.columns or "matched_mz_list" not in df_asn.columns:
    raise KeyError("assignments CSV must contain 'fractions' and 'matched_mz_list' columns")

# NEW columns to be added to assignments
new_cols = ["group_0_sum", "group_1_sum", "group_2_sum", "group_3_sum",
            "n_mz_used", "n_mz_found", "missing_cast_columns"]
for c in new_cols:
    if c in df_asn.columns:
        # avoid accidental overwrite
        df_asn.drop(columns=[c], inplace=True)

# --------------------
# Row-wise quantification
# --------------------
results = []
for idx, row in df_asn.iterrows():
    frac_value = row["fractions"]  # keep as-is (can be string like 'soluble_fraction')
    mz_list = parse_mz_list(row["matched_mz_list"])
    cast_cols = [to_cast_col(mz) for mz in mz_list]

    # Filter dataset_rt to this fraction
    df_frac = df_rt[df_rt["fractions"] == frac_value]
    if df_frac.empty:
        res = dict(
            group_0_sum=float("nan"),
            group_1_sum=float("nan"),
            group_2_sum=float("nan"),
            group_3_sum=float("nan"),
            n_mz_used=len(cast_cols),
            n_mz_found=0,
            missing_cast_columns=", ".join(cast_cols) if cast_cols else ""
        )
        results.append(res)
        continue

    # Ensure target present
    if "target" not in df_frac.columns:
        raise KeyError("Column 'target' not found in dataset_rt.csv")

    existing = [c for c in cast_cols if c in df_frac.columns]
    missing = [c for c in cast_cols if c not in df_frac.columns]

    if not existing:
        sums = {0: float("nan"), 1: float("nan"), 2: float("nan"), 3: float("nan")}
    else:
        # Sum intensities across all selected cast_* columns per target
        grouped = df_frac.groupby("target")[existing].sum()
        total_per_target = grouped.sum(axis=1)  # sum across those cast_* columns
        sums = {t: float(total_per_target.get(t, float("nan"))) for t in [0, 1, 2, 3]}

    res = dict(
        group_0_sum=sums[0],
        group_1_sum=sums[1],
        group_2_sum=sums[2],
        group_3_sum=sums[3],
        n_mz_used=len(cast_cols),
        n_mz_found=len(existing),
        missing_cast_columns=", ".join(missing) if missing else ""
    )
    results.append(res)

# Attach results
df_quant = pd.DataFrame(results, index=df_asn.index)
df_asn_out = pd.concat([df_asn, df_quant], axis=1)

# --------------------
# Save updated CSV
# --------------------
df_asn_out.to_csv(OUT_PATH, index=False)
print(f"Saved: {OUT_PATH}")




Saved: F:\binary\report1\assignments_with_quant_sums_aaa.csv


identification of PFR by matching with tdportal report

In [None]:
# -*- coding: utf-8 -*-
"""
Combine charge-assignment summary with best matches from a databank,
supporting MULTIPLE (rt_window, mz_tol, mass_tol) triplets in one run.

- Reads:
    df1: assignments_summary (must have: neutral_mass, bin (or 'bin '), matched_mz_list)
    df2: databank_with_ids (must have: rt_aligned, precursor_mz, MASS, Accession, PFR)

- For each row in df1, for each m/z in matched_mz_list:
    find the single best df2 row where ALL hold:
        |rt_aligned - bin|    <= rt_window
        |precursor_mz - m/z|  <= mz_tol
        |MASS - neutral_mass| <= mass_tol
  Then format:
      best_match_* : "[<mz>: <Accession>, <MASS_from_df2>, <PFR>] ..."  (PFR optional)
      matched_pfr_*: "[<PFR_or_null_per_mz> ...]"  (aligned with matched_mz_list)
      mode_pfr_*   : most common non-null PFR across matches (per row)
      mode_pfr_count_* : frequency (count) of that PFR (non-null only)
      mode_accession_* : most common Accession across matches (per row), shown only if mode_pfr_count_* >= MIN_MODE_PFR_COUNT

- Outputs:
    One CSV with multiple columns per tolerance triplet:
      best_match_rt<RT>_mz<MZ>_mass<MASS>
      matched_pfr_rt<RT>_mz<MZ>_mass<MASS>
      mode_pfr_rt<RT>_mz<MZ>_mass<MASS>
      mode_pfr_count_rt<RT>_mz<MZ>_mass<MASS>
      mode_accession_rt<RT>_mz<MZ>_mass<MASS>

Edit the 3 PATHS and the PARAM_SETS below before running.
"""

from __future__ import annotations
import os
import ast
from typing import Optional, Dict, List, Tuple, Any
from collections import Counter

import numpy as np
import pandas as pd


# ----------------------------
# CONFIG: edit these paths
# ----------------------------
CHARGE_FILE_PATH = r"F:/binary/report1/report1.csv"
DATABANK_PATH    = r"F:/binary/ms2_with_ids_hash_nocast.csv"
OUTPUT_PATH      = r"F:/binary/ids1.csv"

# Provide one or more (rt_window, mz_tol, mass_tol) triplets here.
PARAM_SETS: List[Tuple[float, float, float]] = [
    (55.0, 2.0, 90.0),
    # (30.0, 1.0, 50.0),
]

# Keep "null" placeholders in matched_pfr_* so positions align with mz_list.
PFR_KEEP_PLACEHOLDERS: bool = True

# Minimum frequency required to report mode PFR and mode Accession.
MIN_MODE_PFR_COUNT: int = 1


# ----------------------------
# Helpers
# ----------------------------
def _num(s: pd.Series) -> pd.Series:
    """Coerce to numeric, invalid → NaN."""
    return pd.to_numeric(s, errors="coerce")


def _to_scalar(x: Any) -> Any:
    """Flatten 0-d arrays and coerce numeric strings to float when possible."""
    if isinstance(x, np.ndarray) and x.ndim == 0:
        x = x.item()
    if isinstance(x, str):
        try:
            return float(x)
        except Exception:
            return x
    return x


def _safe_parse_list(val) -> List[float]:
    """Convert a string-repr list into a Python list of floats safely."""
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
            if isinstance(parsed, (list, tuple, np.ndarray)):
                return [float(x) for x in parsed]
            return []
        except Exception:
            return []
    if isinstance(val, (list, tuple, np.ndarray)):
        try:
            return [float(x) for x in val]
        except Exception:
            return []
    return []


def _ensure_columns(df: pd.DataFrame, required: List[str]) -> None:
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")


def _fmt_suffix(v: float) -> str:
    """
    Make a tidy string for column suffixes (avoid many decimals).
    e.g., 10 -> '10', 2.0 -> '2', 1.5 -> '1p5'
    """
    if float(v).is_integer():
        return f"{int(v)}"
    # Replace '.' with 'p' to keep it column-name friendly
    return str(v).replace('.', 'p')


def _mode_or_none(items: List[Any]) -> Optional[Any]:
    """
    Return the most common value in `items` excluding None/NaN.
    If tie, Counter.most_common returns first encountered top count.
    """
    clean = []
    for v in items:
        if v is None:
            continue
        if isinstance(v, float) and np.isnan(v):
            continue
        clean.append(v)
    if not clean:
        return None
    return Counter(clean).most_common(1)[0][0]


def _mode_and_count(items: List[Any]) -> Tuple[Optional[Any], int]:
    """
    Return (mode_value, count) over non-null items.
    If no non-null items, returns (None, 0).
    """
    clean = []
    for v in items:
        if v is None:
            continue
        if isinstance(v, float) and np.isnan(v):
            continue
        clean.append(v)
    if not clean:
        return None, 0
    val, cnt = Counter(clean).most_common(1)[0]
    return val, int(cnt)


def _mode_and_count_with_cutoff(
    pfrs: List[Any], accs: List[Any], min_count: int
) -> Tuple[Optional[Any], int, Optional[Any]]:
    """
    Return (mode_pfr, count, mode_accession) applying a frequency cutoff on PFR.
    If the mode PFR count < min_count, return (None, 0, None).
    Otherwise return (mode_pfr, count, mode_accession).
    """
    pfr_val, pfr_cnt = _mode_and_count(pfrs)
    acc_val = _mode_or_none(accs)

    if pfr_cnt < min_count:
        return None, 0, None
    return pfr_val, pfr_cnt, acc_val


# ----------------------------
# Core search
# ----------------------------
def search_best(
    df2: pd.DataFrame,
    rt_query: float,
    mz_query: float,
    mass_query: float,
    rt_window: float,
    mz_tol: float,
    mass_tol: float
) -> Optional[Dict]:
    """
    Return the single best match (row as dict) if ALL three criteria match:
      |rt - rt_query| <= rt_window
      |mz - mz_query| <= mz_tol
      |mass - mass_query| <= mass_tol
    Otherwise returns None.
    """
    d_rt   = (df2["rt_aligned"] - float(rt_query)).abs()
    d_mz   = (df2["precursor_mz"] - float(mz_query)).abs()
    d_mass = (df2["MASS"] - float(mass_query)).abs()

    mask = (d_rt <= rt_window) & (d_mz <= mz_tol) & (d_mass <= mass_tol)
    if not mask.any():
        return None

    cand = df2.loc[mask].copy()
    cand["score"] = (
        d_rt.loc[cand.index] / max(rt_window, 1e-12) +
        d_mz.loc[cand.index] / max(mz_tol, 1e-12) +
        d_mass.loc[cand.index] / max(mass_tol, 1e-12)
    )
    best_row = cand.sort_values("score", kind="mergesort").iloc[0]
    return best_row.to_dict()


# ---------- per-row collectors (single pass across m/z list) ----------
def _collect_matches_for_row(
    row: pd.Series,
    df2: pd.DataFrame,
    rt_window: float,
    mz_tol: float,
    mass_tol: float
) -> Tuple[List[str], List[Optional[Any]], List[Optional[str]], List[Optional[float]]]:
    """
    For a df1 row, iterate over matched_mz_list and collect:
      - mz_tokens for best_match string (aligned, with placeholders)
      - pfr_list  (aligned, None for missing/NaN)
      - acc_list  (aligned, None for no match)
      - mass_list (aligned, MASS from df2 if matched, else None)

    Returns (best_match_tokens, pfr_list, acc_list, mass_list).
    """
    neutral_mass   = row.get("neutral_mass", np.nan)
    retention_time = row.get("bin", row.get("bin ", np.nan))
    mz_list        = _safe_parse_list(row.get("matched_mz_list", []))

    if pd.isna(neutral_mass) or pd.isna(retention_time) or not mz_list:
        return [], [], [], []

    tokens: List[str] = []
    pfrs:   List[Optional[Any]]   = []
    accs:   List[Optional[str]]   = []
    masses: List[Optional[float]] = []

    for mz_value in mz_list:
        res = search_best(
            df2,
            rt_query=float(retention_time),
            mz_query=float(mz_value),
            mass_query=float(neutral_mass),
            rt_window=rt_window,
            mz_tol=mz_tol,
            mass_tol=mass_tol,
        )
        if res is not None:
            uniprot_id = res.get("Accession", "NA")
            mass_match = _to_scalar(res.get("MASS", neutral_mass))
            pfr_val    = _to_scalar(res.get("PFR", None))
            if pfr_val is None or (isinstance(pfr_val, float) and np.isnan(pfr_val)):
                tokens.append(f"{mz_value}: {uniprot_id}, {mass_match}")
                pfrs.append(None)
            else:
                tokens.append(f"{mz_value}: {uniprot_id}, {mass_match}, {pfr_val}")
                pfrs.append(pfr_val)
            accs.append(uniprot_id)
            masses.append(mass_match)
        else:
            tokens.append(f"{mz_value}: NA")
            pfrs.append(None)
            accs.append(None)
            masses.append(None)

    return tokens, pfrs, accs, masses


def best_match_formatter_from_tokens(tokens: List[str]) -> Optional[str]:
    if not tokens:
        return None
    return "[" + ", ".join(tokens) + "]"


def matched_pfr_from_list(pfrs: List[Optional[Any]], keep_placeholders: bool) -> Optional[str]:
    if not pfrs:
        return None
    if keep_placeholders:
        return "[" + ", ".join("null" if v is None else str(v) for v in pfrs) + "]"
    else:
        pruned = [str(v) for v in pfrs if v is not None]
        return "[" + ", ".join(pruned) + "]" if pruned else None


# ----------------------------
# Main
# ----------------------------
def main():
    # Load CSVs
    if not os.path.exists(CHARGE_FILE_PATH):
        raise FileNotFoundError(f"Not found: {CHARGE_FILE_PATH}")
    if not os.path.exists(DATABANK_PATH):
        raise FileNotFoundError(f"Not found: {DATABANK_PATH}")

    df1 = pd.read_csv(CHARGE_FILE_PATH)
    df2 = pd.read_csv(DATABANK_PATH)

    # Normalize df1 column names to handle accidental trailing spaces, capitalization, etc.
    df1.columns = [c.strip() for c in df1.columns]

    # Ensure required columns in both tables (with tolerant check for 'bin' / 'bin ')
    need_df1 = ["neutral_mass", "matched_mz_list"]
    _ensure_columns(df1, need_df1)
    if "bin" not in df1.columns and "bin " not in df1.columns:
        raise KeyError("df1 must contain 'bin' (or 'bin ').")

    # Ensure essential df2 columns (PFR required for the new output)
    _ensure_columns(df2, ["rt_aligned", "precursor_mz", "MASS", "Accession", "PFR"])

    # If df1 had 'bin ' originally, create 'bin' as an alias to simplify downstream code
    if "bin" not in df1.columns and "bin " in df1.columns:
        df1["bin"] = df1["bin "]

    # Pre-coerce df2 numerics once (for speed)
    df2 = df2.copy()
    df2["rt_aligned"]   = _num(df2["rt_aligned"])
    df2["precursor_mz"] = _num(df2["precursor_mz"])
    df2["MASS"]         = _num(df2["MASS"])
    # PFR can be numeric or categorical; try to coerce but keep strings if not
    try:
        df2["PFR"] = pd.to_numeric(df2["PFR"], errors="ignore")
    except Exception:
        pass

    # Build output columns per tolerance triplet
    for (rt_w, mz_t, mass_t) in PARAM_SETS:
        suffix = f"rt{_fmt_suffix(rt_w)}_mz{_fmt_suffix(mz_t)}_mass{_fmt_suffix(mass_t)}"

        match_col   = f"best_match_{suffix}"
        pfr_col     = f"matched_pfr_{suffix}"
        mode_pfr    = f"mode_pfr_{suffix}"
        mode_pfr_n  = f"mode_pfr_count_{suffix}"
        mode_acc    = f"mode_accession_{suffix}"

        best_tokens_series: List[List[str]] = []
        pfr_list_series:    List[List[Optional[Any]]] = []
        acc_list_series:    List[List[Optional[str]]] = []

        # Compute per-row tokens and lists in one pass
        for _, row in df1.iterrows():
            tokens, pfrs, accs, _masses = _collect_matches_for_row(
                row, df2, rt_window=rt_w, mz_tol=mz_t, mass_tol=mass_t
            )
            best_tokens_series.append(tokens)
            pfr_list_series.append(pfrs)
            acc_list_series.append(accs)

        # Populate columns
        df1[match_col] = [best_match_formatter_from_tokens(toks) for toks in best_tokens_series]
        df1[pfr_col]   = [matched_pfr_from_list(pfrs, keep_placeholders=PFR_KEEP_PLACEHOLDERS)
                          for pfrs in pfr_list_series]

        # Most common PFR + its frequency (with cutoff), and most common Accession (masked if cutoff not met)
        mode_results = [
            _mode_and_count_with_cutoff(pfrs, accs, min_count=MIN_MODE_PFR_COUNT)
            for pfrs, accs in zip(pfr_list_series, acc_list_series)
        ]
        df1[mode_pfr]   = [mr[0] for mr in mode_results]
        df1[mode_pfr_n] = [mr[1] for mr in mode_results]
        df1[mode_acc]   = [mr[2] for mr in mode_results]

    # Save single CSV containing all columns
    out_dir = os.path.dirname(OUTPUT_PATH) or "."
    os.makedirs(out_dir, exist_ok=True)
    df1.to_csv(OUTPUT_PATH, index=False)
    print(
        f"Saved with {len(PARAM_SETS)} sets of columns "
        f"(best_match / matched_pfr / mode_pfr / mode_pfr_count / mode_accession) → {OUTPUT_PATH}"
    )


if __name__ == "__main__":
    main()


  df2["PFR"] = pd.to_numeric(df2["PFR"], errors="ignore")


Saved with 1 sets of columns (best_match / matched_pfr / mode_pfr / mode_pfr_count / mode_accession) → F:/binary/ids.csv


In [14]:
# -*- coding: utf-8 -*-
"""
Match assignments (report.csv) to MS2 identification table (ms2_id.csv)
by precursor_mz and neutral_mass ONLY (no retention time).

Inputs:
  df1 (report): must have 'neutral_mass' and 'matched_mz_list'
  df2 (ms2_id): must have 'precursor_mz', 'MASS', 'Accession'

Outputs per tolerance pair:
  best_match_mz<MZ>_mass<MASS>
  matched_acc_mz<MZ>_mass<MASS>
  matched_mass_mz<MZ>_mass<MASS>
  mode_accession_mz<MZ>_mass<MASS>
  mode_mass_mz<MZ>_mass<MASS>
  mode_accession_count_mz<MZ>_mass<MASS>
"""

import os
import ast
from typing import Optional, Dict, List, Tuple, Any
from collections import Counter

import numpy as np
import pandas as pd

# ----------------------------
# CONFIG
# ----------------------------
REPORT_PATH = r"F:/binary/report1/report.csv"                 # your report
MS2_PATH    = r"F:/binary/ms2_with_ids_hash_nocast.csv"       # your ms2_id table
OUTPUT_PATH = r"F:/binary/ids1.csv"

# Provide one or more (mz_tol, mass_tol) pairs here
PARAM_SETS: List[Tuple[float, float]] = [
    (2.0, 90.0),
    # (1.0, 50.0),
]

# ----------------------------
# Helpers
# ----------------------------
def _num(s: pd.Series) -> pd.Series:
    """Coerce to numeric, invalid → NaN."""
    return pd.to_numeric(s, errors="coerce")

def _safe_parse_list(val) -> List[float]:
    """Safely parse matched_mz_list into a list of floats."""
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
            if isinstance(parsed, (list, tuple, np.ndarray)):
                return [float(x) for x in parsed]
            return []
        except Exception:
            return []
    if isinstance(val, (list, tuple, np.ndarray)):
        try:
            return [float(x) for x in val]
        except Exception:
            return []
    return []

def _fmt_suffix(v: float) -> str:
    """Make a tidy string for column suffixes (avoid many decimals)."""
    if float(v).is_integer():
        return str(int(v))
    return str(v).replace(".", "p")

def _stringify_non_null(seq) -> Optional[str]:
    """Return JSON-ish list string of non-null items, or None if empty."""
    out = []
    for x in seq:
        if x is None:
            continue
        if isinstance(x, float) and np.isnan(x):
            continue
        out.append(str(x))
    return "[" + ", ".join(out) + "]" if out else None

def _mode_and_count_with_mass(accs: List[Any], masses: List[Any]) -> Tuple[Optional[str], Optional[float], int]:
    """
    Return (mode_accession, mode_mass, count) for the most common accession.
    The mass reported is the most common mass among those matching the mode accession.
    """
    clean_accs = []
    for a in accs:
        if a is None:
            continue
        if isinstance(a, float) and np.isnan(a):
            continue
        clean_accs.append(str(a))
    if not clean_accs:
        return None, None, 0

    counter = Counter(clean_accs)
    mode_acc, mode_cnt = counter.most_common(1)[0]

    # Pick most common mass corresponding to that accession
    linked_masses = [m for a, m in zip(accs, masses) if str(a) == mode_acc]
    if linked_masses:
        mass_mode, _ = Counter(linked_masses).most_common(1)[0]
    else:
        mass_mode = None

    return mode_acc, mass_mode, int(mode_cnt)

# ----------------------------
# Matching logic (no RT)
# ----------------------------
def search_best(df2: pd.DataFrame, mz_query: float, mass_query: float,
                mz_tol: float, mass_tol: float) -> Optional[Dict]:
    d_mz   = (df2["precursor_mz"] - mz_query).abs()
    d_mass = (df2["MASS"] - mass_query).abs()
    mask = (d_mz <= mz_tol) & (d_mass <= mass_tol)
    if not mask.any():
        return None

    cand = df2.loc[mask].copy()
    cand["score"] = d_mz.loc[cand.index] / max(mz_tol, 1e-12) + d_mass.loc[cand.index] / max(mass_tol, 1e-12)
    return cand.sort_values("score", kind="mergesort").iloc[0].to_dict()

# ----------------------------
# Main
# ----------------------------
def main():
    # --- load ---
    if not os.path.exists(REPORT_PATH):
        raise FileNotFoundError(f"Not found: {REPORT_PATH}")
    if not os.path.exists(MS2_PATH):
        raise FileNotFoundError(f"Not found: {MS2_PATH}")

    df1 = pd.read_csv(REPORT_PATH)
    df2 = pd.read_csv(MS2_PATH)

    # --- ensure numeric & safe dtypes ---
    df2["precursor_mz"] = _num(df2["precursor_mz"])
    df2["MASS"]         = _num(df2["MASS"])
    if "Accession" not in df2.columns:
        raise KeyError("MS2 file must contain 'Accession'.")
    df2["Accession"] = df2["Accession"].astype(str)

    # --- process ---
    for (mz_t, mass_t) in PARAM_SETS:
        suffix   = f"mz{_fmt_suffix(mz_t)}_mass{_fmt_suffix(mass_t)}"
        best_col = f"best_match_{suffix}"
        acc_col  = f"matched_acc_{suffix}"
        mass_col = f"matched_mass_{suffix}"
        mode_acc = f"mode_accession_{suffix}"
        mode_mass = f"mode_mass_{suffix}"
        mode_cnt = f"mode_accession_count_{suffix}"

        best_list, acc_list, mass_list = [], [], []
        mode_acc_list, mode_mass_list, mode_cnt_list = [], [], []

        for _, row in df1.iterrows():
            mz_list = _safe_parse_list(row.get("matched_mz_list", []))
            neutral_mass = row.get("neutral_mass", np.nan)

            if pd.isna(neutral_mass) or not mz_list:
                best_list.append(None)
                acc_list.append(None)
                mass_list.append(None)
                mode_acc_list.append(None)
                mode_mass_list.append(None)
                mode_cnt_list.append(0)
                continue

            matches, accs, masses = [], [], []
            for mz_val in mz_list:
                res = search_best(df2, float(mz_val), float(neutral_mass), mz_t, mass_t)
                if res is not None:
                    acc = res.get("Accession", "NA")
                    mass_match = res.get("MASS", neutral_mass)
                    matches.append(f"{mz_val}: {acc}, {mass_match}")
                    accs.append(acc)
                    masses.append(mass_match)
                else:
                    matches.append(f"{mz_val}: NA")
                    accs.append(None)
                    masses.append(None)

            best_list.append("[" + ", ".join(matches) + "]")
            acc_list.append(_stringify_non_null(accs))
            mass_list.append(_stringify_non_null(masses))

            m_acc, m_mass, m_cnt = _mode_and_count_with_mass(accs, masses)
            mode_acc_list.append(m_acc)
            mode_mass_list.append(m_mass)
            mode_cnt_list.append(m_cnt)

        df1[best_col]  = best_list
        df1[acc_col]   = acc_list
        df1[mass_col]  = mass_list
        df1[mode_acc]  = mode_acc_list
        df1[mode_mass] = mode_mass_list
        df1[mode_cnt]  = mode_cnt_list

    # --- save ---
    out_dir = os.path.dirname(OUTPUT_PATH) or "."
    os.makedirs(out_dir, exist_ok=True)
    df1.to_csv(OUTPUT_PATH, index=False)
    print(f"✅ Done. Saved → {OUTPUT_PATH}")

if __name__ == "__main__":
    main()


✅ Done. Saved → F:/binary/ids1.csv


In [15]:
# -*- coding: utf-8 -*-
"""
Match assignments (report.csv) to MS2 identification table (ms2_id.csv)
by precursor_mz and neutral_mass ONLY (no retention time).

Primary match (per m/z in matched_mz_list):
  - |precursor_mz - m/z|  <= mz_tol
  - |MASS - neutral_mass| <= mass_tol

Fallback if primary fails:
  - Restrict to rows where MS2 MASS is missing (NaN)
  - Match ONLY by |precursor_mz - m/z| <= mz_tol
  - Choose the closest m/z

Outputs per tolerance pair:
  best_match_mz<MZ>_mass<MASS>        : "[<mz>: <Accession>, <MASS>] ..."
  matched_acc_mz<MZ>_mass<MASS>       : "[<Accession> ...]" (non-null only)
  matched_mass_mz<MZ>_mass<MASS>      : "[<MASS> ...]" (non-null only)
  mode_accession_mz<MZ>_mass<MASS>    : most common accession
  mode_mass_mz<MZ>_mass<MASS>         : most common mass among matches of the mode accession
  mode_accession_count_mz<MZ>_mass<MASS> : frequency of the mode accession
"""

import os
import ast
from typing import Optional, Dict, List, Tuple, Any
from collections import Counter

import numpy as np
import pandas as pd

# ----------------------------
# CONFIG
# ----------------------------
REPORT_PATH = r"F:/binary/report1/report.csv"                 # your report
MS2_PATH    = r"F:/binary/ms2_with_ids_hash_nocast.csv"       # your ms2_id table
OUTPUT_PATH = r"F:/binary/ids1.csv"

# Provide one or more (mz_tol, mass_tol) pairs here
PARAM_SETS: List[Tuple[float, float]] = [
    (2.0, 90.0),
    # (1.0, 50.0),
]

# ----------------------------
# Helpers
# ----------------------------
def _num(s: pd.Series) -> pd.Series:
    """Coerce to numeric, invalid → NaN."""
    return pd.to_numeric(s, errors="coerce")

def _safe_parse_list(val) -> List[float]:
    """Safely parse matched_mz_list into a list of floats."""
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
            if isinstance(parsed, (list, tuple, np.ndarray)):
                return [float(x) for x in parsed]
            return []
        except Exception:
            return []
    if isinstance(val, (list, tuple, np.ndarray)):
        try:
            return [float(x) for x in val]
        except Exception:
            return []
    return []

def _fmt_suffix(v: float) -> str:
    """Make a tidy string for column suffixes (avoid many decimals)."""
    if float(v).is_integer():
        return str(int(v))
    return str(v).replace(".", "p")

def _stringify_non_null(seq) -> Optional[str]:
    """Return JSON-ish list string of non-null items, or None if empty."""
    out = []
    for x in seq:
        if x is None:
            continue
        if isinstance(x, float) and np.isnan(x):
            continue
        out.append(str(x))
    return "[" + ", ".join(out) + "]" if out else None

def _mode_and_count_with_mass(accs: List[Any], masses: List[Any]) -> Tuple[Optional[str], Optional[float], int]:
    """
    Return (mode_accession, mode_mass, count) for the most common accession.
    The mass reported is the most common mass among those matching the mode accession.
    """
    clean_accs = []
    for a in accs:
        if a is None:
            continue
        if isinstance(a, float) and np.isnan(a):
            continue
        clean_accs.append(str(a))
    if not clean_accs:
        return None, None, 0

    counter = Counter(clean_accs)
    mode_acc, mode_cnt = counter.most_common(1)[0]

    # Pick most common mass corresponding to that accession
    linked_masses = [m for a, m in zip(accs, masses) if (a is not None and str(a) == mode_acc)]
    if linked_masses:
        mass_mode, _ = Counter(linked_masses).most_common(1)[0]
    else:
        mass_mode = None

    return mode_acc, mass_mode, int(mode_cnt)

def _fmt_token_val(x: Any) -> str:
    """Format values for token strings, using 'NA' for None/NaN."""
    if x is None:
        return "NA"
    if isinstance(x, float) and np.isnan(x):
        return "NA"
    return str(x)

# ----------------------------
# Matching logic
# ----------------------------
def search_best(df2: pd.DataFrame, mz_query: float, mass_query: float,
                mz_tol: float, mass_tol: float) -> Optional[Dict]:
    """
    Primary matcher: BOTH mz & mass within tolerance; choose smallest normalized deltas.
    """
    d_mz   = (df2["precursor_mz"] - mz_query).abs()
    d_mass = (df2["MASS"] - mass_query).abs()
    mask = (d_mz <= mz_tol) & (d_mass <= mass_tol)
    if not mask.any():
        return None

    cand = df2.loc[mask].copy()
    cand["score"] = d_mz.loc[cand.index] / max(mz_tol, 1e-12) + d_mass.loc[cand.index] / max(mass_tol, 1e-12)
    return cand.sort_values("score", kind="mergesort").iloc[0].to_dict()

def search_best_mz_only_missing_mass(df2: pd.DataFrame, mz_query: float, mz_tol: float) -> Optional[Dict]:
    """
    Fallback matcher: restrict to rows where MASS is NaN and match ONLY by m/z within mz_tol.
    Choose the closest m/z.
    """
    is_missing_mass = df2["MASS"].isna()
    d_mz = (df2["precursor_mz"] - mz_query).abs()
    mask = is_missing_mass & (d_mz <= mz_tol)
    if not mask.any():
        return None
    cand = df2.loc[mask].copy()
    cand["score"] = d_mz.loc[cand.index]  # smaller is better
    return cand.sort_values("score", kind="mergesort").iloc[0].to_dict()

# ----------------------------
# Main
# ----------------------------
def main():
    # --- load ---
    if not os.path.exists(REPORT_PATH):
        raise FileNotFoundError(f"Not found: {REPORT_PATH}")
    if not os.path.exists(MS2_PATH):
        raise FileNotFoundError(f"Not found: {MS2_PATH}")

    df1 = pd.read_csv(REPORT_PATH)
    df2 = pd.read_csv(MS2_PATH)

    # --- ensure numeric & safe dtypes ---
    df2["precursor_mz"] = _num(df2["precursor_mz"])
    df2["MASS"]         = _num(df2["MASS"])
    if "Accession" not in df2.columns:
        raise KeyError("MS2 file must contain 'Accession'.")
    df2["Accession"] = df2["Accession"].astype(str)

    # --- process ---
    for (mz_t, mass_t) in PARAM_SETS:
        suffix     = f"mz{_fmt_suffix(mz_t)}_mass{_fmt_suffix(mass_t)}"
        best_col   = f"best_match_{suffix}"
        acc_col    = f"matched_acc_{suffix}"
        mass_col   = f"matched_mass_{suffix}"
        mode_acc   = f"mode_accession_{suffix}"
        mode_mass  = f"mode_mass_{suffix}"
        mode_cnt   = f"mode_accession_count_{suffix}"

        best_list, acc_list, mass_list = [], [], []
        mode_acc_list, mode_mass_list, mode_cnt_list = [], [], []

        for _, row in df1.iterrows():
            mz_list = _safe_parse_list(row.get("matched_mz_list", []))
            neutral_mass = row.get("neutral_mass", np.nan)

            if pd.isna(neutral_mass) or not mz_list:
                best_list.append(None)
                acc_list.append(None)
                mass_list.append(None)
                mode_acc_list.append(None)
                mode_mass_list.append(None)
                mode_cnt_list.append(0)
                continue

            matches, accs, masses = [], [], []
            for mz_val in mz_list:
                # Primary: mz + mass
                res = search_best(df2, float(mz_val), float(neutral_mass), mz_t, mass_t)

                # Fallback: MASS missing rows, mz-only
                if res is None:
                    res = search_best_mz_only_missing_mass(df2, float(mz_val), mz_t)

                if res is not None:
                    acc = res.get("Accession", "NA")
                    mass_match = res.get("MASS", None)  # may be NaN in fallback
                    matches.append(f"{mz_val}: { _fmt_token_val(acc) }, { _fmt_token_val(mass_match) }")
                    accs.append(acc)
                    masses.append(mass_match)
                else:
                    matches.append(f"{mz_val}: NA")
                    accs.append(None)
                    masses.append(None)

            best_list.append("[" + ", ".join(matches) + "]")
            acc_list.append(_stringify_non_null(accs))
            mass_list.append(_stringify_non_null(masses))

            m_acc, m_mass, m_cnt = _mode_and_count_with_mass(accs, masses)
            mode_acc_list.append(m_acc)
            mode_mass_list.append(m_mass)
            mode_cnt_list.append(m_cnt)

        df1[best_col]  = best_list
        df1[acc_col]   = acc_list
        df1[mass_col]  = mass_list
        df1[mode_acc]  = mode_acc_list
        df1[mode_mass] = mode_mass_list
        df1[mode_cnt]  = mode_cnt_list

    # --- save ---
    out_dir = os.path.dirname(OUTPUT_PATH) or "."
    os.makedirs(out_dir, exist_ok=True)
    df1.to_csv(OUTPUT_PATH, index=False)
    print(f"✅ Done. Saved → {OUTPUT_PATH}")

if __name__ == "__main__":
    main()


✅ Done. Saved → F:/binary/ids1.csv
