In [1]:
import os
import re
import numpy as np
import pandas as pd

# ----------------------------
# Utils
# ----------------------------
def _decode_bytes_inplace(df: pd.DataFrame) -> None:
    for c in df.columns:
        dt = df[c].dtype
        if dt == object or str(dt).startswith("|S"):
            df[c] = df[c].apply(lambda x: x.decode("utf-8") if isinstance(x, (bytes, bytearray)) else x)

def _pick(df: pd.DataFrame, *cands):
    for c in cands:
        if c in df.columns:
            return c
    raise KeyError(f"None of {cands} found in {df.columns.tolist()}")

def _ensure_cols(df: pd.DataFrame) -> None:
    # sample_name
    if "sample_name" not in df.columns:
        s = _pick(df, "sample_name", "file_name", "raw_name", "run_name")
        df["sample_name"] = df[s].astype(str)
    # group_name (optional)
    if "group_name" not in df.columns:
        df["group_name"] = "Unknown"
    # retention time (keep original spelling for compatibility with your code)
    if "retntion time" not in df.columns:
        if "retention_time" in df.columns:
            df["retntion time"] = df["retention_time"].astype(float)
        elif {"rt_min","rt_max"}.issubset(df.columns):
            df["retntion time"] = (df["rt_min"].astype(float)+df["rt_max"].astype(float))/2.0
        elif "rt_min" in df.columns:
            df["retntion time"] = df["rt_min"].astype(float)
        elif "rt" in df.columns:
            df["retntion time"] = df["rt"].astype(float)
        else:
            raise KeyError("Couldn't infer 'retntion time' in metadata")

def _build_align_functions_from_drift_csv(drift_csv_path: str):
    """
    Returns dict: run_name -> f(rt) giving drift (min) at given RT.
    Expects columns: target_name, bin_center_min, avg_rt_drift
    Edge behavior: hold first/last values constant.
    """
    dt = pd.read_csv(drift_csv_path)
    # tolerate different casings
    rename_map = {}
    for need in ("target_name","bin_center_min","avg_rt_drift"):
        if need not in dt.columns:
            for c in dt.columns:
                if c.lower() == need.lower():
                    rename_map[c] = need
    if rename_map:
        dt = dt.rename(columns=rename_map)
    for need in ("target_name","bin_center_min","avg_rt_drift"):
        if need not in dt.columns:
            raise KeyError(f"Drift CSV missing '{need}'")

    fns = {}
    for run, grp in dt.groupby("target_name"):
        x = np.asarray(grp["bin_center_min"], dtype=float)
        y = np.asarray(grp["avg_rt_drift"], dtype=float)
        if x.size == 0:
            continue
        order = np.argsort(x)
        x = x[order]; y = y[order]
        if x.size == 1:
            c = float(y[0])
            fns[str(run)] = (lambda c: (lambda rt: np.full_like(np.asarray(rt, float), c)))(c)
        else:
            # bind x,y to the closure now
            def make_f(xv, yv):
                def f(rt):
                    rt = np.asarray(rt, dtype=float)
                    return np.interp(rt, xv, yv, left=yv[0], right=yv[-1])
                return f
            fns[str(run)] = make_f(x, y)
    return fns

def _safe_stem(path: str) -> str:
    stem = os.path.splitext(os.path.basename(path))[0]
    return re.sub(r'[<>:"/\\|?*]+', "_", stem)

# ----------------------------
# Main
# ----------------------------
def bin_ms1_npz_with_alignment(
    npz_path: str,
    drift_csv_path: str,
    out_csv_path: str,
    bin_width: float = 10.0,
    overlap: float = 2.5,
    num_bins: int = 8
) -> str:
    """
    Sum MS1 scans into fixed, overlapping bins of drift-corrected RT.

    Load an NPZ (e.g., TreatmentA.ms1.npz) with:
      - ms1_matrix: (N_scans, N_bins)
      - metadata arrays per column (same length N_scans)
    Each scan's RT is corrected by subtracting the per-run drift taken from
    the drift CSV (runs absent from the CSV get zero drift). Scans are then
    summed into ``num_bins`` bins of width ``bin_width`` starting at 0, each
    widened by ``overlap`` on both sides (half-open: start <= rt < end).

    Output: one CSV with one row per bin, columns:
      ['group_name','rt_start_min','rt_end_min','expanded_start_min',
       'expanded_end_min','rt_center_min','n_scans',
       'rt_aligned_min_obs','rt_aligned_max_obs','cast_00000'..]
    ``expanded_*`` report the widened window clipped to the observed
    aligned-RT range; ``rt_aligned_*_obs`` are NaN for empty bins.

    Returns ``out_csv_path``.

    Raises:
        FileNotFoundError: if ``npz_path`` or ``drift_csv_path`` is missing.
        KeyError: if the NPZ has no 'ms1_matrix' (or helpers cannot find
            required metadata / drift columns).
        ValueError: if metadata and ms1_matrix row counts differ.
    """
    if not os.path.exists(npz_path):
        raise FileNotFoundError(npz_path)
    if not os.path.exists(drift_csv_path):
        raise FileNotFoundError(drift_csv_path)

    # ---- Load NPZ ----
    # NOTE(review): allow_pickle=True can execute code embedded in the file;
    # only load archives from trusted sources.
    # The context manager closes the archive's file handle once the arrays
    # have been read into memory.
    with np.load(npz_path, allow_pickle=True) as z:
        if "ms1_matrix" not in z:
            raise KeyError("NPZ must contain 'ms1_matrix'")
        MS1 = z["ms1_matrix"]           # (N, L)
        metadata = pd.DataFrame({k: z[k] for k in z.files if k != "ms1_matrix"})
    _decode_bytes_inplace(metadata)
    _ensure_cols(metadata)

    if len(metadata) != MS1.shape[0]:
        raise ValueError(f"Row mismatch: metadata={len(metadata)} vs ms1_matrix={MS1.shape[0]}")

    N_BINS = MS1.shape[1]
    cast_cols = [f"cast_{i:05d}" for i in range(N_BINS)]

    # ---- Build alignment functions from drift table ----
    align_fns = _build_align_functions_from_drift_csv(drift_csv_path)
    zero = lambda rt: np.zeros_like(np.asarray(rt, float))

    # ---- Compute aligned RT per scan ----
    rt_raw   = metadata["retntion time"].to_numpy(dtype=float)
    samples  = metadata["sample_name"].astype(str).to_numpy()

    rt_corr = np.zeros_like(rt_raw, dtype=float)
    for run in np.unique(samples):
        f = align_fns.get(run, zero)  # if ref or unknown -> zero drift
        m = (samples == run)
        rt_corr[m] = f(rt_raw[m])
    rt_aligned = rt_raw - rt_corr

    # ---- Fixed bins: num_bins × bin_width starting at 0 ----
    # Multiplying an integer range avoids the off-by-one element a
    # float-step np.arange can produce for non-integer widths.
    starts = bin_width * np.arange(num_bins, dtype=float)
    ends   = starts + bin_width
    centers = 0.5 * (starts + ends)

    # observed bounds, used only to clip the *reported* expanded windows
    rt_min = float(np.nanmin(rt_aligned)) if rt_aligned.size else 0.0
    rt_max = float(np.nanmax(rt_aligned)) if rt_aligned.size else 0.0

    # Same for every row, so resolve it once (column guaranteed by _ensure_cols).
    group_name = str(metadata["group_name"].iloc[0]) if len(metadata) else "Unknown"

    rows = []
    for t0, t1, mid in zip(starts, ends, centers):
        lo = t0 - overlap
        hi = t1 + overlap
        # Select on the unclipped window: clipping the exclusive upper bound
        # to rt_max would drop the scan(s) sitting exactly at rt_max.
        mask = (rt_aligned >= lo) & (rt_aligned < hi)
        n_scans = int(mask.sum())

        win_start = max(lo, rt_min)
        win_end   = min(hi, rt_max)

        if n_scans > 0:
            # Sum with higher precision then cast down
            vec = MS1[mask].sum(axis=0, dtype=np.float64).astype(np.float32, copy=False)
            rt_obs_min = float(rt_aligned[mask].min())
            rt_obs_max = float(rt_aligned[mask].max())
        else:
            vec = np.zeros(N_BINS, dtype=np.float32)
            rt_obs_min, rt_obs_max = np.nan, np.nan

        meta = [group_name, t0, t1, win_start, win_end, mid, n_scans, rt_obs_min, rt_obs_max]
        rows.append(meta + vec.tolist())

    # ---- Write CSV ----
    col_meta = [
        "group_name",
        "rt_start_min", "rt_end_min",
        "expanded_start_min", "expanded_end_min",
        "rt_center_min", "n_scans",
        "rt_aligned_min_obs", "rt_aligned_max_obs",
    ]
    out_df = pd.DataFrame(rows, columns=col_meta + cast_cols)

    # os.makedirs("") raises, so only create a directory when the output
    # path actually has a directory component.
    out_dir = os.path.dirname(out_csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_df.to_csv(out_csv_path, index=False)
    return out_csv_path


In [None]:
import os
import re
import numpy as np
import pandas as pd

# ---------- helpers ----------
def _resolve_csv(path: str) -> str:
    if os.path.exists(path):
        return path
    if not os.path.splitext(path)[1] and os.path.exists(path + ".csv"):
        return path + ".csv"
    raise FileNotFoundError(f"Drift file not found: {path} (also tried {path+'.csv'})")

def _decode_bytes_inplace(df: pd.DataFrame) -> None:
    for c in df.columns:
        dt = df[c].dtype
        if dt == object or str(dt).startswith("|S"):
            df[c] = df[c].apply(lambda x: x.decode("utf-8") if isinstance(x, (bytes, bytearray)) else x)

def _pick(df: pd.DataFrame, *cands):
    for c in cands:
        if c in df.columns:
            return c
    raise KeyError(f"None of {cands} found in {df.columns.tolist()}")

def _ensure_cols(df: pd.DataFrame) -> None:
    if "sample_name" not in df.columns:
        s = _pick(df, "sample_name", "file_name", "raw_name", "run_name")
        df["sample_name"] = df[s].astype(str)
    if "group_name" not in df.columns:
        df["group_name"] = "Unknown"
    if "retntion time" not in df.columns:
        if "retention_time" in df.columns:
            df["retntion time"] = df["retention_time"].astype(float)
        elif {"rt_min","rt_max"}.issubset(df.columns):
            df["retntion time"] = (df["rt_min"].astype(float) + df["rt_max"].astype(float)) / 2.0
        elif "rt_min" in df.columns:
            df["retntion time"] = df["rt_min"].astype(float)
        elif "rt" in df.columns:
            df["retntion time"] = df["rt"].astype(float)
        else:
            raise KeyError("Couldn't infer 'retntion time' from metadata")

def _build_align_functions_from_drift(drift_path: str):
    """
    Build one drift-vs-RT function per run from a drift file.

    Supports TWO formats:

    1) Long table with columns: target_name, bin_center_min, avg_rt_drift
       (column names matched case-insensitively)
    2) Wide matrix: rows=runs (index), columns=bins (bin centers as headers)

    The wide layout is tried first; the long layout is used when the headers
    or cells are not numeric, or when no run yields a usable curve.

    Returns dict: run_name -> f(rt) => drift (minutes). ``f`` interpolates
    linearly between bin centers and holds the first/last value outside
    them; a run with one usable point yields a constant.

    Points with a NaN bin center or NaN drift are ignored in BOTH formats
    (previously only the wide path filtered them, so a missing value in a
    long table produced NaN drift).

    Raises:
        FileNotFoundError: from ``_resolve_csv`` when the file is missing.
        KeyError: if the long-format columns are absent.
    """
    p = _resolve_csv(drift_path)

    def _curve(x, y):
        """Return f(rt) through the non-NaN (x, y) points, or None if none."""
        keep = ~(np.isnan(x) | np.isnan(y))
        x, y = x[keep], y[keep]
        if x.size == 0:
            return None
        order = np.argsort(x, kind="stable")  # np.interp needs increasing x
        xv, yv = x[order], y[order]
        if xv.size == 1:
            const = float(yv[0])
            return lambda rt: np.full_like(np.asarray(rt, float), const)

        def f(rt):
            rt = np.asarray(rt, dtype=float)
            return np.interp(rt, xv, yv, left=yv[0], right=yv[-1])
        return f

    # try wide matrix first (index=run names)
    try:
        wide = pd.read_csv(p, index_col=0)
        # columns should be numeric bin centers (or strings convertible to float)
        col_vals = np.array([float(c) for c in wide.columns], dtype=float)
        fns = {}
        for run, row in wide.iterrows():
            fn = _curve(col_vals, row.to_numpy(dtype=float))
            if fn is not None:
                fns[str(run)] = fn
        if fns:
            return fns
    except (ValueError, TypeError):
        # Non-numeric headers/cells (or a parse error, which pandas raises as
        # a ValueError subclass): not a wide matrix, fall through to long.
        pass

    # fallback: long format
    required = ("target_name", "bin_center_min", "avg_rt_drift")
    long = pd.read_csv(p)
    rename = {
        c: need
        for need in required
        if need not in long.columns
        for c in long.columns
        if c.lower() == need.lower()
    }
    if rename:
        long = long.rename(columns=rename)
    for need in required:
        if need not in long.columns:
            raise KeyError(f"Drift file missing column '{need}' (after trying wide+long formats).")

    fns = {}
    for run, grp in long.groupby("target_name"):
        fn = _curve(
            np.asarray(grp["bin_center_min"], dtype=float),
            np.asarray(grp["avg_rt_drift"], dtype=float),
        )
        if fn is not None:
            fns[str(run)] = fn
    return fns

def _safe_stem(path: str) -> str:
    stem = os.path.splitext(os.path.basename(path))[0]
    return re.sub(r'[<>:"/\\|?*]+', "_", stem)

# ---------- main ----------
def bin_ms1_npz_with_alignment(
    npz_path: str,
    drift_path: str,
    out_csv_path: str,
    bin_width: float = 10.0,
    overlap: float = 2.5,
    num_bins: int = 8
) -> str:
    """
    Sum MS1 spectra within aligned RT windows (defaults: 8 bins × 10 min,
    ±2.5 min overlap, starting at 0; windows are half-open).

    - npz_path must contain: 'ms1_matrix' + metadata arrays (same length)
    - drift_path: wide runs×bins matrix OR long drift table; runs missing
      from it get zero drift
    - out_csv_path: receives one row per bin with the bin bounds, the
      expanded window clipped to the observed aligned-RT range, the scan
      count, the observed aligned-RT min/max (NaN when empty) and the summed
      'cast_XXXXX' intensity columns

    Returns ``out_csv_path``.

    Raises:
        FileNotFoundError: if ``npz_path`` (or the drift file) is missing.
        KeyError: if 'ms1_matrix' or required metadata/drift columns are absent.
        ValueError: if metadata and ms1_matrix row counts differ.
    """
    if not os.path.exists(npz_path):
        raise FileNotFoundError(npz_path)

    # NOTE(review): allow_pickle=True can execute code embedded in the file;
    # only load archives from trusted sources.
    # The context manager releases the archive's file handle after reading.
    with np.load(npz_path, allow_pickle=True) as z:
        if "ms1_matrix" not in z:
            raise KeyError("NPZ must contain 'ms1_matrix'")
        MS1 = z["ms1_matrix"]  # shape: (N, L)
        # metadata
        metadata = pd.DataFrame({k: z[k] for k in z.files if k != "ms1_matrix"})
    _decode_bytes_inplace(metadata)
    _ensure_cols(metadata)

    if len(metadata) != MS1.shape[0]:
        raise ValueError(f"Row mismatch: metadata={len(metadata)} vs ms1_matrix={MS1.shape[0]}")

    N_BINS = MS1.shape[1]
    cast_cols = [f"cast_{i:05d}" for i in range(N_BINS)]

    # alignment functions (per run)
    align_fns = _build_align_functions_from_drift(drift_path)
    zero = lambda rt: np.zeros_like(np.asarray(rt, float))

    # aligned RT per scan
    rt_raw  = metadata["retntion time"].to_numpy(dtype=float)
    runs    = metadata["sample_name"].astype(str).to_numpy()
    rt_corr = np.zeros_like(rt_raw, dtype=float)

    for run in np.unique(runs):
        f = align_fns.get(run, zero)  # ref or unknown -> zero
        m = (runs == run)
        rt_corr[m] = f(rt_raw[m])
    rt_aligned = rt_raw - rt_corr

    # fixed bins; an integer range times the width avoids the extra element
    # a float-step np.arange can produce for non-integer widths
    starts  = bin_width * np.arange(num_bins, dtype=float)
    ends    = starts + bin_width
    centers = 0.5 * (starts + ends)

    # observed bounds, used only to clip the *reported* expanded windows
    rt_min = float(np.nanmin(rt_aligned)) if rt_aligned.size else 0.0
    rt_max = float(np.nanmax(rt_aligned)) if rt_aligned.size else 0.0

    # identical for every row, so resolve once (column guaranteed by _ensure_cols)
    group_name = str(metadata["group_name"].iloc[0]) if len(metadata) else "Unknown"

    rows = []
    for t0, t1, mid in zip(starts, ends, centers):
        lo = t0 - overlap
        hi = t1 + overlap
        # Select on the unclipped window: clipping the exclusive upper bound
        # to rt_max would drop the scan(s) sitting exactly at rt_max.
        mask = (rt_aligned >= lo) & (rt_aligned < hi)
        n_scans = int(mask.sum())
        win_start = max(lo, rt_min)
        win_end   = min(hi, rt_max)

        if n_scans > 0:
            # accumulate in float64, store as float32
            vec = MS1[mask].sum(axis=0, dtype=np.float64).astype(np.float32, copy=False)
            rt_obs_min = float(rt_aligned[mask].min())
            rt_obs_max = float(rt_aligned[mask].max())
        else:
            vec = np.zeros(N_BINS, dtype=np.float32)
            rt_obs_min, rt_obs_max = np.nan, np.nan

        rows.append([group_name, t0, t1, win_start, win_end, mid, n_scans, rt_obs_min, rt_obs_max] + vec.tolist())

    out_df = pd.DataFrame(
        rows,
        columns=[
            "group_name",
            "rt_start_min","rt_end_min",
            "expanded_start_min","expanded_end_min",
            "rt_center_min","n_scans",
            "rt_aligned_min_obs","rt_aligned_max_obs"
        ] + cast_cols
    )

    # os.makedirs("") raises, so skip it for a bare file name
    out_dir = os.path.dirname(out_csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_df.to_csv(out_csv_path, index=False)
    return out_csv_path

# ---------- run with your paths ----------
# Script entry: run the binning once with fixed input/output locations.
# NOTE(review): the paths below are hard-coded absolute Windows paths; edit
# them before running elsewhere.
if __name__ == "__main__":
    npz_path   = r"F:\casts\databank\TreatmentA.ms1.npz"
    drift_path = r"F:\casts\databank\rt_drifts_matrix"   # will auto-try .csv
    out_csv    = r"F:\casts\databank\TreatmentA_aligned_bins.csv"

    # The keyword values repeat the function's defaults explicitly.
    wrote = bin_ms1_npz_with_alignment(
        npz_path=npz_path,
        drift_path=drift_path,
        out_csv_path=out_csv,
        bin_width=10.0,
        overlap=2.5,
        num_bins=8
    )
    # The function returns the path it wrote to.
    print("Saved:", wrote)


In [9]:
import os, re
import numpy as np
import pandas as pd

def _resolve_csv(path: str) -> str:
    if os.path.exists(path):
        return path
    root, ext = os.path.splitext(path)
    if not ext and os.path.exists(path + ".csv"):
        return path + ".csv"
    raise FileNotFoundError(f"Drift file not found: {path} (also tried {path+'.csv'})")

def _decode_bytes_inplace(df: pd.DataFrame) -> None:
    for c in df.columns:
        dt = df[c].dtype
        if dt == object or str(dt).startswith("|S"):
            df[c] = df[c].apply(lambda x: x.decode("utf-8") if isinstance(x, (bytes, bytearray)) else x)

def _pick(df: pd.DataFrame, *cands):
    for c in cands:
        if c in df.columns:
            return c
    raise KeyError(f"None of {cands} found in {df.columns.tolist()}")

def _ensure_cols(df: pd.DataFrame) -> None:
    if "sample_name" not in df.columns:
        s = _pick(df, "sample_name", "file_name", "raw_name", "run_name")
        df["sample_name"] = df[s].astype(str)
    if "group_name" not in df.columns:
        df["group_name"] = "Unknown"
    if "retntion time" not in df.columns:
        if "retention_time" in df.columns:
            df["retntion time"] = df["retention_time"].astype(float)
        elif {"rt_min","rt_max"}.issubset(df.columns):
            df["retntion time"] = (df["rt_min"].astype(float)+df["rt_max"].astype(float))/2.0
        elif "rt_min" in df.columns:
            df["retntion time"] = df["rt_min"].astype(float)
        elif "rt" in df.columns:
            df["retntion time"] = df["rt"].astype(float)
        else:
            raise KeyError("Couldn't infer 'retntion time' from metadata")

def _build_align_functions_from_drift(drift_path: str):
    """
    Build one drift-vs-RT function per run from a drift file.

    Two layouts are accepted:
      - wide matrix: rows=runs (index), columns=bins (numeric bin centers)
      - long table : target_name, bin_center_min, avg_rt_drift
        (column names matched case-insensitively)
    The wide layout is tried first; the long layout is used when headers or
    cells are not numeric, or when no run yields a usable curve.

    Returns dict: run_name -> f(rt). ``f`` interpolates linearly between bin
    centers and holds the first/last value outside them; a run with a single
    usable point yields a constant.

    Points with a NaN bin center or NaN drift are ignored in both layouts
    (previously only the wide path filtered them).

    Raises:
        FileNotFoundError: from ``_resolve_csv`` when the file is missing.
        KeyError: if the long-format columns are absent.
    """
    p = _resolve_csv(drift_path)

    def _curve(x, y):
        """Return f(rt) through the non-NaN (x, y) points, or None if none."""
        keep = ~(np.isnan(x) | np.isnan(y))
        x, y = x[keep], y[keep]
        if x.size == 0:
            return None
        order = np.argsort(x, kind="stable")  # np.interp needs increasing x
        xv, yv = x[order], y[order]
        if xv.size == 1:
            const = float(yv[0])
            return lambda rt: np.full_like(np.asarray(rt, float), const)

        def f(rt):
            rt = np.asarray(rt, float)
            return np.interp(rt, xv, yv, left=yv[0], right=yv[-1])
        return f

    # Try wide matrix first (rows=runs, cols=bins as numbers)
    try:
        wide = pd.read_csv(p, index_col=0)
        x_all = np.asarray([float(c) for c in wide.columns], float)
        fns = {}
        for run, row in wide.iterrows():
            fn = _curve(x_all, row.to_numpy(dtype=float))
            if fn is not None:
                fns[str(run)] = fn
        if fns:
            return fns
    except (ValueError, TypeError):
        # Non-numeric headers/cells (or a parse error, which pandas raises as
        # a ValueError subclass): not a wide matrix, fall through to long.
        pass

    # Fallback: long format
    required = ("target_name", "bin_center_min", "avg_rt_drift")
    long = pd.read_csv(p)
    rename = {
        c: need
        for need in required
        if need not in long.columns
        for c in long.columns
        if c.lower() == need.lower()
    }
    if rename:
        long = long.rename(columns=rename)
    for need in required:
        if need not in long.columns:
            raise KeyError(f"Drift file missing column '{need}' (after wide+long tries).")

    fns = {}
    for run, grp in long.groupby("target_name"):
        fn = _curve(
            np.asarray(grp["bin_center_min"], float),
            np.asarray(grp["avg_rt_drift"], float),
        )
        if fn is not None:
            fns[str(run)] = fn
    return fns

def _safe_metadata_from_npz(z: np.lib.npyio.NpzFile, n_rows: int) -> pd.DataFrame:
    """
    Assemble a per-row metadata DataFrame from the arrays stored in ``z``.

    Every entry except 'ms1_matrix' is examined (as an object array):
    - 0D values and length-1 vectors are repeated ``n_rows`` times
    - 1D vectors of length ``n_rows`` are used directly
    - anything else is left out, with a "[skip]" line printed
    The frame then goes through ``_decode_bytes_inplace`` and ``_ensure_cols``.
    """
    columns = {}
    for key in z.files:
        if key == "ms1_matrix":
            continue
        values = np.asarray(z[key], dtype=object)  # don't force numeric yet

        if values.ndim == 0:
            columns[key] = np.repeat(values.item(), n_rows)
            continue
        if values.ndim != 1:
            # 2D+ (per-scan vectors etc.) — skip for metadata
            print(f"[skip] '{key}' shape {values.shape} not 1D/scalar")
            continue

        if values.shape[0] == n_rows:
            columns[key] = values
        elif values.shape[0] == 1:
            columns[key] = np.repeat(values[0], n_rows)
        else:
            print(f"[skip] '{key}' length {values.shape[0]} != {n_rows}")

    frame = pd.DataFrame(columns)
    _decode_bytes_inplace(frame)
    _ensure_cols(frame)
    return frame

def bin_ms1_npz_with_alignment(
    npz_path: str,
    drift_path: str,
    out_csv_path: str,
    bin_width: float = 10.0,
    overlap: float = 2.5,
    num_bins: int = 8
) -> str:
    """
    Sum MS1 spectra within drift-corrected RT windows and write them to CSV.

    - npz_path: archive holding 'ms1_matrix' (2D) plus metadata arrays
    - drift_path: drift file understood by ``_build_align_functions_from_drift``;
      runs missing from it get zero drift
    - out_csv_path: receives one row per bin with the bin bounds, the
      expanded window clipped to the observed aligned-RT range, the scan
      count, the observed aligned-RT min/max (NaN when empty) and the summed
      'cast_XXXXX' intensity columns
    - bins: ``num_bins`` bins of ``bin_width`` starting at 0, each widened by
      ``overlap`` on both sides (half-open: start <= rt < end)

    Returns ``out_csv_path``.

    Raises:
        FileNotFoundError: if ``npz_path`` (or the drift file) is missing.
        KeyError: if 'ms1_matrix' or required metadata/drift columns are absent.
    """
    # Load NPZ
    if not os.path.exists(npz_path):
        raise FileNotFoundError(npz_path)
    # NOTE(review): allow_pickle=True can execute code embedded in the file;
    # only load archives from trusted sources.
    # The context manager releases the archive's file handle after reading.
    with np.load(npz_path, allow_pickle=True) as z:
        if "ms1_matrix" not in z:
            raise KeyError("NPZ must contain 'ms1_matrix'")
        MS1 = z["ms1_matrix"]  # (N, L)
        N, L = MS1.shape

        # SAFE metadata build (needs the archive still open)
        metadata = _safe_metadata_from_npz(z, n_rows=N)

    # Alignment functions
    align_fns = _build_align_functions_from_drift(drift_path)
    zero = lambda rt: np.zeros_like(np.asarray(rt, float))

    # Aligned RT per scan
    rt_raw = metadata["retntion time"].to_numpy(dtype=float)
    runs   = metadata["sample_name"].astype(str).to_numpy()
    rt_corr = np.zeros_like(rt_raw, float)
    for run in np.unique(runs):
        f = align_fns.get(run, zero)
        m = (runs == run)
        rt_corr[m] = f(rt_raw[m])
    rt_aligned = rt_raw - rt_corr

    # Fixed bins; an integer range times the width avoids the extra element
    # a float-step np.arange can produce for non-integer widths
    starts  = bin_width * np.arange(num_bins, dtype=float)
    ends    = starts + bin_width
    centers = 0.5 * (starts + ends)

    # observed bounds, used only to clip the *reported* expanded windows
    rt_min = float(np.nanmin(rt_aligned)) if rt_aligned.size else 0.0
    rt_max = float(np.nanmax(rt_aligned)) if rt_aligned.size else 0.0

    # identical for every row, so resolve once (column guaranteed by _ensure_cols)
    group_name = str(metadata["group_name"].iloc[0]) if len(metadata) else "Unknown"

    cast_cols = [f"cast_{i:05d}" for i in range(L)]
    rows = []
    for t0, t1, mid in zip(starts, ends, centers):
        lo = t0 - overlap
        hi = t1 + overlap
        # Select on the unclipped window: clipping the exclusive upper bound
        # to rt_max would drop the scan(s) sitting exactly at rt_max.
        mask = (rt_aligned >= lo) & (rt_aligned < hi)
        n_scans = int(mask.sum())
        win_start = max(lo, rt_min)
        win_end   = min(hi, rt_max)

        if n_scans > 0:
            # accumulate in float64, store as float32
            vec = MS1[mask].sum(axis=0, dtype=np.float64).astype(np.float32, copy=False)
            rt_obs_min = float(rt_aligned[mask].min())
            rt_obs_max = float(rt_aligned[mask].max())
        else:
            vec = np.zeros(L, dtype=np.float32)
            rt_obs_min, rt_obs_max = np.nan, np.nan

        rows.append([group_name, t0, t1, win_start, win_end, mid, n_scans, rt_obs_min, rt_obs_max] + vec.tolist())

    out_df = pd.DataFrame(
        rows,
        columns=[
            "group_name",
            "rt_start_min","rt_end_min",
            "expanded_start_min","expanded_end_min",
            "rt_center_min","n_scans",
            "rt_aligned_min_obs","rt_aligned_max_obs"
        ] + cast_cols
    )
    # os.makedirs("") raises, so skip it for a bare file name
    out_dir = os.path.dirname(out_csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_df.to_csv(out_csv_path, index=False)
    return out_csv_path


In [10]:
# Driver cell: run the binning once with fixed input/output locations.
# NOTE(review): hard-coded absolute Windows paths; edit before running elsewhere.
npz_path   = r"F:\casts\databank\TreatmentA.ms1.npz"
drift_path = r"F:\casts\databank\rt_drifts_matrix"  # auto-tries .csv
out_csv    = r"F:\casts\databank\TreatmentA_aligned_bins.csv"

# The keyword values repeat the function's defaults explicitly.
wrote = bin_ms1_npz_with_alignment(
    npz_path=npz_path,
    drift_path=drift_path,
    out_csv_path=out_csv,
    bin_width=10.0,
    overlap=2.5,
    num_bins=8
)
# The function returns the path it wrote to.
print("Saved:", wrote)


[skip] 'ms2_scan' length 336425 != 147057
[skip] 'ms2_rt' length 336425 != 147057
[skip] 'ms2_precursor_mz' length 336425 != 147057
[skip] 'ms2_file_id' length 336425 != 147057
[skip] 'file_names_lookup' length 33 != 147057
[skip] 'file_paths_lookup' length 33 != 147057


KeyError: "None of ('sample_name', 'file_name', 'raw_name', 'run_name') found in ['group_name', 'ms1_scan', 'ms1_rt', 'ms1_file_id']"

In [20]:
# Warning: this will wipe *everything* you defined in the current session!
# Every global whose name does not start with "_" is deleted — that includes
# imported modules (os, re, np, pd) and bin_ms1_npz_with_alignment, while the
# underscore-prefixed helpers defined above are kept.
# list(...) takes a snapshot of the names so entries can be deleted while looping.
for var in list(globals().keys()):
    if var[0] != "_":  # keep underscore-prefixed names such as __name__, __doc__, etc.
        del globals()[var]

# Ask the garbage collector to reclaim the objects that just lost their names.
import gc
gc.collect()

0

In [None]:
# Inspection cell: open the archive and list the names of the arrays it holds.
import numpy as np
z = np.load('F:/casts/databank/TreatmentA.ms1.npz')
# Bare expression; presumably shown as the notebook cell's output.
z.files

In [None]:
import os
import numpy as np
import pandas as pd
import re

# ---------------- helpers ----------------
def _resolve_csv(path: str) -> str:
    if os.path.exists(path):
        return path
    root, ext = os.path.splitext(path)
    if not ext and os.path.exists(path + ".csv"):
        return path + ".csv"
    raise FileNotFoundError(f"Drift file not found: {path} (also tried {path+'.csv'})")

def _decode_bytes_arr(a):
    if isinstance(a, np.ndarray) and (a.dtype.kind in ("S", "O")):
        out = []
        for x in a:
            if isinstance(x, (bytes, bytearray)):
                try:
                    out.append(x.decode("utf-8"))
                except Exception:
                    out.append(str(x))
            else:
                out.append(str(x))
        return np.array(out, dtype=object)
    return a

def _safe_metadata_from_npz_with_lut(z: np.lib.npyio.NpzFile, n_rows: int) -> pd.DataFrame:
    """
    Build metadata DataFrame robustly:
      - keep 1D arrays with length n_rows
      - broadcast 0D/len-1 arrays
      - skip others
      - create sample_name using file_names_lookup[ms1_file_id] if possible
      - create 'retntion time' from ms1_rt
    """
    cols = {}
    for k in z.files:
        if k == "ms1_matrix":
            continue
        arr = z[k]
        # keep raw array for special handling of lookups later
        if k in ("file_names_lookup", "file_paths_lookup"):
            cols[k] = arr  # store for later
            continue

        a = np.asarray(arr)
        if a.ndim == 0:
            cols[k] = np.repeat(a.item(), n_rows)
        elif a.ndim == 1:
            if a.shape[0] == n_rows:
                cols[k] = a
            elif a.shape[0] == 1:
                cols[k] = np.repeat(a[0], n_rows)
            else:
                # skip non-matching lengths (e.g., ms2 arrays)
                # print(f"[skip] '{k}' length {a.shape[0]} != {n_rows}")
                pass
        else:
            # skip 2D+ (e.g., per-row vectors)
            # print(f"[skip] '{k}' shape {a.shape} not 1D/scalar")
            pass

    df = pd.DataFrame({k: cols[k] for k in cols if k not in ("file_names_lookup", "file_paths_lookup")})

    # decode potential byte columns
    for c in df.columns:
        if df[c].dtype == object or str(df[c].dtype).startswith("|S"):
            df[c] = pd.Series([x.decode("utf-8") if isinstance(x, (bytes, bytearray)) else x for x in df[c]])

    # ---- sample_name via lookup if available ----
    if "ms1_file_id" in df.columns and "file_names_lookup" in cols:
        fid = pd.Series(df["ms1_file_id"]).astype(int).to_numpy()
        names_lut = _decode_bytes_arr(cols["file_names_lookup"])
        names_lut = np.asarray(names_lut, dtype=object)
        # safe mapping with fallback 'fid_<id>'
        fallback = np.array([f"fid_{i}" for i in fid], dtype=object)
        ok = (fid >= 0) & (fid < names_lut.shape[0])
        mapped = fallback.copy()
        mapped[ok] = names_lut[fid[ok]]
        df["sample_name"] = mapped.astype(str)
    else:
        # fallbacks: file_name/raw_name/run_name; else synthesize
        if "file_name" in df.columns:
            df["sample_name"] = df["file_name"].astype(str)
        elif "raw_name" in df.columns:
            df["sample_name"] = df["raw_name"].astype(str)
        elif "run_name" in df.columns:
            df["sample_name"] = df["run_name"].astype(str)
        elif "ms1_file_id" in df.columns:
            df["sample_name"] = ("fid_" + pd.Series(df["ms1_file_id"]).astype(int).astype(str)).astype(str)
        else:
            df["sample_name"] = "UnknownRun"

    # ---- group_name default ----
    if "group_name" not in df.columns:
        df["group_name"] = "Unknown"

    # ---- retention time (legacy spelling for compatibility) ----
    if "retntion time" not in df.columns:
        if "ms1_rt" in df.columns:
            df["retntion time"] = pd.Series(df["ms1_rt"]).astype(float)
        elif "retention_time" in df.columns:
            df["retntion time"] = pd.Series(df["retention_time"]).astype(float)
        elif "rt" in df.columns:
            df["retntion time"] = pd.Series(df["rt"]).astype(float)
        elif {"rt_min", "rt_max"}.issubset(df.columns):
            df["retntion time"] = (pd.Series(df["rt_min"]).astype(float) + pd.Series(df["rt_max"]).astype(float)) / 2.0
        else:
            raise KeyError("Couldn't infer 'retntion time' (looked for ms1_rt, retention_time, rt, rt_min/rt_max).")

    # Ensure string types
    df["sample_name"] = df["sample_name"].astype(str)
    df["group_name"]  = df["group_name"].astype(str)
    return df

def _build_align_functions_from_drift(drift_path: str):
    """
    Build per-run drift-vs-RT functions plus a default curve from a drift file.

    Supports:
      1) Wide matrix: index=runs, columns=bins (bin centers numeric)
      2) Long table : target_name, bin_center_min, avg_rt_drift
         (column names matched case-insensitively)
    The wide layout is tried first; the long layout is used when headers or
    cells are not numeric or the matrix has no bin columns.

    Returns (fns, default_fn)
      - fns: dict run_name -> f(rt). ``f`` interpolates linearly between bin
        centers and holds the first/last value outside them; a run with one
        usable point yields a constant.
      - default_fn: the same kind of curve built from the per-bin median
        drift across runs; a zero-drift function when no median is usable.

    Points with a NaN bin center or NaN drift are ignored everywhere
    (previously the long path and its median default passed NaN straight to
    ``np.interp``).

    Raises:
        FileNotFoundError: from ``_resolve_csv`` when the file is missing.
        KeyError: if the long-format columns are absent.
    """
    p = _resolve_csv(drift_path)

    def _zero(rt):
        return np.zeros_like(np.asarray(rt, float))

    def _curve(x, y):
        """Return f(rt) through the non-NaN (x, y) points, or None if none."""
        keep = ~(np.isnan(x) | np.isnan(y))
        x, y = x[keep], y[keep]
        if x.size == 0:
            return None
        order = np.argsort(x, kind="stable")  # np.interp needs increasing x
        xv, yv = x[order], y[order]
        if xv.size == 1:
            const = float(yv[0])
            return lambda rt: np.full_like(np.asarray(rt, float), const)

        def f(rt):
            rt = np.asarray(rt, float)
            return np.interp(rt, xv, yv, left=yv[0], right=yv[-1])
        return f

    # try wide first
    try:
        wide = pd.read_csv(p, index_col=0)
        # columns as numeric bin centers
        x_all = np.asarray([float(c) for c in wide.columns], float)
        if x_all.size > 0:
            fns = {}
            for run, row in wide.iterrows():
                fn = _curve(x_all, row.to_numpy(dtype=float))
                if fn is not None:
                    fns[str(run)] = fn
            # default = median across runs
            y_med = np.nanmedian(wide.to_numpy(dtype=float), axis=0)
            default_fn = _curve(x_all, y_med)
            if default_fn is None:
                default_fn = _zero
            return fns, default_fn
    except (ValueError, TypeError):
        # Non-numeric headers/cells (or a parse error, which pandas raises as
        # a ValueError subclass): not a wide matrix, fall through to long.
        pass

    # long format fallback
    required = ("target_name", "bin_center_min", "avg_rt_drift")
    long = pd.read_csv(p)
    rename = {
        c: need
        for need in required
        if need not in long.columns
        for c in long.columns
        if c.lower() == need.lower()
    }
    if rename:
        long = long.rename(columns=rename)
    for need in required:
        if need not in long.columns:
            raise KeyError(f"Drift file missing column '{need}'")

    fns = {}
    for run, grp in long.groupby("target_name"):
        fn = _curve(
            np.asarray(grp["bin_center_min"], float),
            np.asarray(grp["avg_rt_drift"], float),
        )
        if fn is not None:
            fns[str(run)] = fn

    # default = median across runs per bin center
    med = long.groupby("bin_center_min", as_index=False)["avg_rt_drift"].median()
    default_fn = _curve(
        med["bin_center_min"].to_numpy(float),
        med["avg_rt_drift"].to_numpy(float),
    )
    if default_fn is None:
        default_fn = _zero
    return fns, default_fn

# --------------- main ---------------
def bin_ms1_npz_with_alignment(
    npz_path: str,
    drift_path: str,
    out_csv_path: str,
    bin_width: float = 10.0,
    overlap: float = 2.5,
    num_bins: int = 8
) -> str:
    """
    Align per-scan RT with per-run drift curves, then sum the MS1 rows of all
    scans (every run pooled together) into fixed RT bins; writes one CSV row
    per bin.

    Bins are [k*bin_width, (k+1)*bin_width) for k in range(num_bins). Each bin
    is widened by `overlap` on both sides and then clipped to the observed
    aligned-RT range [rt_min, rt_max].

    Parameters
    ----------
    npz_path : NPZ archive holding a 2D 'ms1_matrix' plus per-scan metadata.
    drift_path : drift table, forwarded to _build_align_functions_from_drift.
    out_csv_path : destination CSV; its parent directory is created if needed.
    bin_width, overlap, num_bins : bin geometry on the aligned RT axis.

    Returns
    -------
    out_csv_path, once the file has been written.

    Raises
    ------
    FileNotFoundError : npz_path does not exist.
    KeyError : the archive has no 'ms1_matrix'.
    """
    if not os.path.exists(npz_path):
        raise FileNotFoundError(npz_path)

    # NOTE(review): allow_pickle=True unpickles object arrays on access, which
    # can execute arbitrary code -- only open archives from trusted sources.
    # The context manager guarantees the underlying zip handle is closed.
    with np.load(npz_path, allow_pickle=True) as z:
        if "ms1_matrix" not in z:
            raise KeyError("NPZ must contain 'ms1_matrix'")
        MS1 = z["ms1_matrix"]
        N, L = MS1.shape
        metadata = _safe_metadata_from_npz_with_lut(z, n_rows=N)

    # build per-run alignment functions + default
    align_fns, default_fn = _build_align_functions_from_drift(drift_path)

    # aligned RT per scan
    rt_raw = metadata["retntion time"].to_numpy(dtype=float)
    runs = metadata["sample_name"].astype(str).to_numpy()
    rt_corr = np.zeros_like(rt_raw, float)
    for run in np.unique(runs):
        # runs absent from the drift table use the builder's default curve
        f = align_fns.get(run, default_fn)
        m = (runs == run)
        rt_corr[m] = f(rt_raw[m])
    rt_aligned = rt_raw - rt_corr

    # fixed bins starting at 0, each `bin_width` wide
    starts = np.arange(0.0, num_bins * bin_width, bin_width, dtype=float)
    ends = starts + bin_width
    centers = 0.5 * (starts + ends)

    # observed range, used to clip the expanded windows
    rt_min = float(np.nanmin(rt_aligned)) if rt_aligned.size else 0.0
    rt_max = float(np.nanmax(rt_aligned)) if rt_aligned.size else 0.0

    # The label is the same for every output row, so resolve it once.
    if len(metadata) and "group_name" in metadata.columns:
        group_name = str(metadata["group_name"].iloc[0])
    else:
        group_name = "Unknown"

    cast_cols = [f"cast_{i:05d}" for i in range(L)]
    rows = []
    for t0, t1, mid in zip(starts, ends, centers):
        win_start = max(t0 - overlap, rt_min)
        win_end = min(t1 + overlap, rt_max)

        mask = rt_aligned >= win_start
        if (t1 + overlap) > rt_max:
            # The window was clipped to the last observed scan. A strict '<'
            # here would drop the scan(s) sitting exactly at rt_max from every
            # bin, so the clipped edge is inclusive.
            mask &= rt_aligned <= win_end
        else:
            mask &= rt_aligned < win_end
        n_scans = int(mask.sum())

        if n_scans > 0:
            # accumulate in float64, store as float32
            vec = MS1[mask].sum(axis=0, dtype=np.float64).astype(np.float32, copy=False)
            rt_obs_min = float(rt_aligned[mask].min())
            rt_obs_max = float(rt_aligned[mask].max())
        else:
            vec = np.zeros(L, dtype=np.float32)
            rt_obs_min, rt_obs_max = np.nan, np.nan

        rows.append([group_name, t0, t1, win_start, win_end, mid, n_scans, rt_obs_min, rt_obs_max] + vec.tolist())

    out_df = pd.DataFrame(
        rows,
        columns=[
            "group_name",
            "rt_start_min","rt_end_min",
            "expanded_start_min","expanded_end_min",
            "rt_center_min","n_scans",
            "rt_aligned_min_obs","rt_aligned_max_obs"
        ] + cast_cols
    )

    # os.makedirs("") raises, so only create a directory when the path has one.
    out_dir = os.path.dirname(out_csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_df.to_csv(out_csv_path, index=False)
    return out_csv_path

# --------------- run with your paths ---------------
if __name__ == "__main__":
    # Hard-coded locations for this analysis run.
    npz_path = r"F:\casts\databank\TreatmentA.ms1.npz"
    # No extension on purpose: the drift loader also tries the ".csv" variant.
    drift_path = r"F:\casts\databank\rt_drifts_matrix"
    out_csv = r"F:\casts\databank\TreatmentA_aligned_bins.csv"

    # Pooled binning across all runs; the call returns the path it wrote.
    wrote = bin_ms1_npz_with_alignment(
        npz_path, drift_path, out_csv,
        bin_width=10.0, overlap=2.5, num_bins=8,
    )
    print("Saved:", wrote)


In [21]:
# -*- coding: utf-8 -*-
import os
import re
import numpy as np
import pandas as pd

# ------------------ helpers ------------------

def _resolve_csv(path: str) -> str:
    """
    Resolve a drift-file path, tolerating a missing ".csv" extension.

    Returns `path` when it exists and is not a directory; otherwise, if `path`
    has no extension, returns `path + ".csv"` when that exists. Directories are
    never returned, so a folder that shares the stem cannot shadow the CSV.

    Raises
    ------
    FileNotFoundError : neither candidate exists; the message lists only the
        candidates that were actually checked.
    """
    def _is_readable_target(candidate: str) -> bool:
        return os.path.exists(candidate) and not os.path.isdir(candidate)

    if _is_readable_target(path):
        return path

    message = f"Drift file not found: {path}"
    if not os.path.splitext(path)[1]:
        with_ext = path + ".csv"
        if _is_readable_target(with_ext):
            return with_ext
        message += f"  (also tried {with_ext})"
    raise FileNotFoundError(message)

def _decode_bytes_arr(a):
    """
    Turn a bytes ("S") or object ("O") ndarray into an object array of str.

    Bytes-like items are decoded as UTF-8 (falling back to str(item) when the
    decode fails); every other item goes through str(). Anything that is not an
    ndarray of kind "S"/"O" is returned unchanged.
    """
    if not isinstance(a, np.ndarray) or a.dtype.kind not in ("S", "O"):
        return a

    def _as_text(item):
        if not isinstance(item, (bytes, bytearray)):
            return str(item)
        try:
            return item.decode("utf-8")
        except Exception:
            return str(item)

    return np.array([_as_text(item) for item in a], dtype=object)

def _safe_metadata_from_npz_with_lut(z: np.lib.npyio.NpzFile, n_rows: int) -> pd.DataFrame:
    """
    Build a per-scan metadata frame from the arrays stored next to 'ms1_matrix'.

    Column selection:
      - 1D arrays of length n_rows are kept as-is
      - 0D and length-1 arrays are broadcast to n_rows
      - every other shape/length is skipped (e.g. ms2_* if mismatched)
      - 'file_names_lookup' / 'file_paths_lookup' are kept aside as lookup
        tables and never become columns

    Derived columns (always present in the result, as str / float):
      - sample_name: file_names_lookup[ms1_file_id] when both are available
        (out-of-range ids become "fid_<id>"); otherwise an existing
        'sample_name' column is kept; otherwise the first of file_name,
        raw_name, run_name; otherwise "fid_<ms1_file_id>"; otherwise
        "UnknownRun"
      - group_name: existing column, else "Unknown"
      - 'retntion time' (legacy spelling): existing column, else ms1_rt,
        retention_time, rt, or the mean of rt_min/rt_max

    Raises
    ------
    KeyError : no retention-time source could be found.
    """
    lookup_keys = ("file_names_lookup", "file_paths_lookup")
    lookups = {}
    cols = {}
    for k in z.files:
        if k == "ms1_matrix":
            continue
        arr = z[k]
        if k in lookup_keys:
            lookups[k] = arr
            continue

        a = np.asarray(arr)
        if a.ndim == 0:
            cols[k] = np.repeat(a.item(), n_rows)
        elif a.ndim == 1:
            if a.shape[0] == n_rows:
                cols[k] = a
            elif a.shape[0] == 1:
                cols[k] = np.repeat(a[0], n_rows)
            # other lengths are intentionally skipped
        # 2D+ arrays are intentionally skipped

    df = pd.DataFrame(cols)

    # decode bytes in df columns
    for c in df.columns:
        if df[c].dtype == object or str(df[c].dtype).startswith("|S"):
            df[c] = pd.Series([x.decode("utf-8") if isinstance(x, (bytes, bytearray)) else x for x in df[c]])

    if "ms1_file_id" in df.columns and "file_names_lookup" in lookups:
        fid = pd.Series(df["ms1_file_id"]).astype(int).to_numpy()
        # atleast_1d: a lookup saved as a bare scalar would otherwise not be iterable
        names_lut = _decode_bytes_arr(np.atleast_1d(lookups["file_names_lookup"]))
        names_lut = np.asarray(names_lut, dtype=object)
        mapped = np.array([f"fid_{i}" for i in fid], dtype=object)
        ok = (fid >= 0) & (fid < names_lut.shape[0])
        mapped[ok] = names_lut[fid[ok]]
        df["sample_name"] = mapped.astype(str)
    elif "sample_name" not in df.columns:
        # Only synthesize a name when the archive did not ship one; the
        # fallbacks below used to overwrite a stored 'sample_name' column.
        source = next((c for c in ("file_name", "raw_name", "run_name") if c in df.columns), None)
        if source is not None:
            df["sample_name"] = df[source].astype(str)
        elif "ms1_file_id" in df.columns:
            df["sample_name"] = ("fid_" + pd.Series(df["ms1_file_id"]).astype(int).astype(str)).astype(str)
        else:
            df["sample_name"] = "UnknownRun"

    if "group_name" not in df.columns:
        df["group_name"] = "Unknown"

    # retention time (legacy spelling)
    if "retntion time" not in df.columns:
        if "ms1_rt" in df.columns:
            df["retntion time"] = pd.Series(df["ms1_rt"]).astype(float)
        elif "retention_time" in df.columns:
            df["retntion time"] = pd.Series(df["retention_time"]).astype(float)
        elif "rt" in df.columns:
            df["retntion time"] = pd.Series(df["rt"]).astype(float)
        elif {"rt_min", "rt_max"}.issubset(df.columns):
            df["retntion time"] = (pd.Series(df["rt_min"]).astype(float) + pd.Series(df["rt_max"]).astype(float)) / 2.0
        else:
            raise KeyError("Couldn't infer 'retntion time' (looked for ms1_rt, retention_time, rt, rt_min/rt_max).")

    df["sample_name"] = df["sample_name"].astype(str)
    df["group_name"] = df["group_name"].astype(str)
    return df

def _build_align_functions_from_drift(drift_path: str):
    """
    Build per-run RT drift curves from a drift CSV.

    Accepts:
      1) Wide matrix CSV: index=runs, columns=bin centers (minutes)
      2) Long table  CSV: target_name, bin_center_min, avg_rt_drift
         (header names are matched case-insensitively)
    All missing values are filled with 0.

    Each curve maps RT (scalar or array) to drift: linear interpolation between
    bin centers, with the first/last value held constant outside the grid. A
    grid with a single bin center yields a constant curve.

    Returns (fns, default_fn) where fns maps str(run) -> curve and default_fn
    is the zero-curve.

    Raises
    ------
    FileNotFoundError : the drift file cannot be resolved (from _resolve_csv).
    KeyError : the file is not a wide matrix and lacks a required long-format
        column.
    """
    p = _resolve_csv(drift_path)

    def zero_curve(rt):
        return np.zeros_like(np.asarray(rt, float))

    def make_curve(xv, yv):
        # One factory for both formats; xv must already be sorted ascending.
        if xv.size == 0:
            return zero_curve
        if xv.size == 1:
            c = float(yv[0])
            return lambda rt: np.full_like(np.asarray(rt, float), c)

        def f(rt):
            return np.interp(np.asarray(rt, float), xv, yv, left=yv[0], right=yv[-1])
        return f

    # ---- Try wide matrix first ----
    # Only the read itself is best-effort: a failure here just means "not a
    # wide matrix", and genuine I/O problems resurface from the long-format
    # read below. Errors while building curves are no longer swallowed.
    try:
        wide = pd.read_csv(p, index_col=0)
    except Exception:
        wide = None

    if wide is not None and len(wide.columns) > 0:
        try:
            x_all = np.array([float(c) for c in wide.columns], dtype=float)
        except (TypeError, ValueError):
            x_all = None  # at least one non-numeric header -> not wide format

        if x_all is not None:
            order = np.argsort(x_all)
            x_all = x_all[order]
            # positional reordering keeps columns and x_all in lock-step
            wide = wide.iloc[:, order]

            # fill all missing with 0
            wide = wide.apply(pd.to_numeric, errors="coerce").fillna(0.0)

            fns = {}
            for run, row in wide.iterrows():
                fns[str(run)] = make_curve(x_all, row.to_numpy(dtype=float))
            return fns, zero_curve

    # ---- Fallback: long table ----
    long = pd.read_csv(p)
    required = ("target_name", "bin_center_min", "avg_rt_drift")

    # normalize headers (first case-insensitive match wins)
    rename = {}
    for need in required:
        if need not in long.columns:
            for c in long.columns:
                if c.lower() == need.lower():
                    rename[c] = need
                    break
    if rename:
        long = long.rename(columns=rename)
    for need in required:
        if need not in long.columns:
            raise KeyError(f"Drift file missing column '{need}'")

    # Full grid of bin centers. NaN centers are dropped: they cannot be
    # matched to a bin and would corrupt the interpolation grid.
    all_bins = np.sort(long["bin_center_min"].astype(float).dropna().unique())
    # built once; it is the same for every run
    idx_map = {bx: i for i, bx in enumerate(all_bins)}

    fns = {}
    for run, grp in long.groupby("target_name"):
        # initialize y as zeros (missing -> 0)
        y = np.zeros_like(all_bins, dtype=float)
        x_run = grp["bin_center_min"].astype(float).to_numpy()
        y_run = grp["avg_rt_drift"].astype(float).to_numpy()
        for xr, yr in zip(x_run, y_run):
            i = idx_map.get(xr)
            if i is not None and np.isfinite(yr):
                y[i] = yr  # others remain 0
        fns[str(run)] = make_curve(all_bins, y)

    return fns, zero_curve

def _sum_rows_chunked(M, idxs, chunk_rows=1024, out_dtype=np.float32):
    """
    Memory-safe column-wise sum over selected rows of a 2D matrix.

    Rows are gathered `chunk_rows` at a time so that only one chunk-sized copy
    of the selection exists at once; the running total is kept in float64 and
    cast to `out_dtype` at the end.

    Parameters
    ----------
    M : 2D array indexable by an integer index array.
    idxs : integer row indices (ndarray or any array-like); repeats are summed
        repeatedly.
    chunk_rows : number of rows gathered per step; must be >= 1.
    out_dtype : dtype of the returned vector.

    Returns
    -------
    1D array of length M.shape[1]; all zeros when `idxs` is empty.

    Raises
    ------
    ValueError : chunk_rows < 1 (a negative value used to return zeros
        silently because the chunk loop never ran).
    """
    if chunk_rows < 1:
        raise ValueError(f"chunk_rows must be >= 1, got {chunk_rows}")

    idxs = np.asarray(idxs)  # accept lists/tuples as well as ndarrays
    if idxs.size == 0:
        return np.zeros(M.shape[1], dtype=out_dtype)

    acc = np.zeros(M.shape[1], dtype=np.float64)
    for s in range(0, idxs.size, chunk_rows):
        acc += M[idxs[s:s + chunk_rows]].sum(axis=0, dtype=np.float64)
    return acc.astype(out_dtype, copy=False)

# ------------------ main (per-sample) ------------------

def bin_ms1_npz_with_alignment_per_sample(
    npz_path: str,
    drift_path: str,
    out_csv_path: str,
    bin_width: float = 10.0,
    overlap: float = 2.5,
    num_bins: int = 8,
    chunk_rows: int = 1024
) -> str:
    """
    Align per-scan RT using per-run drift curves, then for **each sample_name**
    sum MS1 spectra into `num_bins` bins of width `bin_width`, each widened by
    `overlap` on both sides, on the aligned RT axis.

    Expanded windows are clipped to the aligned-RT range observed across ALL
    samples. Runs missing from the drift table use the default curve returned
    by _build_align_functions_from_drift. Writes ONE CSV with `num_bins` rows
    per sample.

    Parameters
    ----------
    npz_path : NPZ archive holding a 2D 'ms1_matrix' plus per-scan metadata.
    drift_path : drift table, forwarded to _build_align_functions_from_drift.
    out_csv_path : destination CSV; its parent directory is created if needed.
    bin_width, overlap, num_bins : bin geometry on the aligned RT axis.
    chunk_rows : rows gathered per step when summing spectra.

    Returns
    -------
    out_csv_path, once the file has been written.

    Raises
    ------
    FileNotFoundError : npz_path does not exist.
    KeyError : the archive has no 'ms1_matrix'.
    """
    if not os.path.exists(npz_path):
        raise FileNotFoundError(npz_path)

    # NOTE(review): allow_pickle=True unpickles object arrays on access, which
    # can execute arbitrary code -- only open archives from trusted sources.
    # The context manager guarantees the underlying zip handle is closed.
    with np.load(npz_path, allow_pickle=True) as z:
        if "ms1_matrix" not in z:
            raise KeyError("NPZ must contain 'ms1_matrix'")
        MS1 = z["ms1_matrix"]       # shape: (N, L)
        N, L = MS1.shape
        # metadata with sample_name, group_name, retntion time
        metadata = _safe_metadata_from_npz_with_lut(z, n_rows=N)

    align_fns, default_fn = _build_align_functions_from_drift(drift_path)

    rt_raw = metadata["retntion time"].to_numpy(dtype=float)
    runs = metadata["sample_name"].astype(str).to_numpy()
    groups = metadata["group_name"].astype(str).to_numpy()

    # Row indices of every run, computed once and shared by the alignment pass
    # and the binning pass (np.unique gives a sorted, deterministic run order).
    run_rows = {run: np.flatnonzero(runs == run) for run in np.unique(runs)}

    # per-scan aligned RT
    rt_corr = np.zeros_like(rt_raw, dtype=float)
    for run, idx_run in run_rows.items():
        f = align_fns.get(run, default_fn)
        rt_corr[idx_run] = f(rt_raw[idx_run])
    rt_aligned = rt_raw - rt_corr

    # fixed bins starting at 0, each `bin_width` wide
    starts = np.arange(0.0, num_bins * bin_width, bin_width, dtype=float)
    ends = starts + bin_width
    centers = 0.5 * (starts + ends)

    # observed range across all samples, used to clip the expanded windows
    rt_min = float(np.nanmin(rt_aligned)) if rt_aligned.size else 0.0
    rt_max = float(np.nanmax(rt_aligned)) if rt_aligned.size else 0.0

    # Window bounds do not depend on the run, so build them once.
    # The last field records whether the upper edge was clipped to rt_max.
    windows = []
    for t0, t1, mid in zip(starts, ends, centers):
        win_start = max(t0 - overlap, rt_min)
        win_end = min(t1 + overlap, rt_max)
        windows.append((t0, t1, mid, win_start, win_end, (t1 + overlap) > rt_max))

    cast_cols = [f"cast_{i:05d}" for i in range(L)]
    rows = []

    # ---- PER-SAMPLE LOOP ----
    for run, idx_run in run_rows.items():
        # group label for this run (assume constant within run)
        grp_vals = np.unique(groups[idx_run])
        group_label = grp_vals[0] if grp_vals.size > 0 else "Unknown"

        rt_run = rt_aligned[idx_run]

        for t0, t1, mid, win_start, win_end, clipped_hi in windows:
            mask_local = rt_run >= win_start
            if clipped_hi:
                # A strict '<' against a bound clipped to rt_max would drop the
                # scan(s) sitting exactly at rt_max from every bin, so the
                # clipped edge is inclusive.
                mask_local &= rt_run <= win_end
            else:
                mask_local &= rt_run < win_end

            # indices of this run that fall in the window
            idxs = idx_run[mask_local]
            n_scans = int(idxs.size)

            if n_scans > 0:
                vec = _sum_rows_chunked(MS1, idxs, chunk_rows=chunk_rows, out_dtype=np.float32)
                rt_obs_min = float(rt_run[mask_local].min())
                rt_obs_max = float(rt_run[mask_local].max())
            else:
                vec = np.zeros(L, dtype=np.float32)
                rt_obs_min, rt_obs_max = np.nan, np.nan

            # include sample_name so output is per-sample
            rows.append([
                run, group_label,
                t0, t1, win_start, win_end, mid,
                n_scans, rt_obs_min, rt_obs_max
            ] + vec.tolist())

    out_df = pd.DataFrame(
        rows,
        columns=[
            "sample_name", "group_name",
            "rt_start_min","rt_end_min",
            "expanded_start_min","expanded_end_min",
            "rt_center_min","n_scans",
            "rt_aligned_min_obs","rt_aligned_max_obs"
        ] + cast_cols
    )

    # os.makedirs("") raises, so only create a directory when the path has one.
    out_dir = os.path.dirname(out_csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_df.to_csv(out_csv_path, index=False)
    return out_csv_path

# ------------------ run ------------------
if __name__ == "__main__":
    # Hard-coded locations for this analysis run.
    npz_path = r"F:\casts\databank\TreatmentA.ms1.npz"
    # No extension on purpose: the drift loader also tries the ".csv" variant.
    drift_path = r"F:\casts\databank\rt_drifts_matrix"
    out_csv = r"F:\casts\databank\TreatmentA_aligned_bins_per_sample.csv"

    # Per-sample binning; reduce chunk_rows if a MemoryError still shows up.
    wrote = bin_ms1_npz_with_alignment_per_sample(
        npz_path, drift_path, out_csv,
        bin_width=10.0, overlap=2.5, num_bins=8,
        chunk_rows=1024,
    )
    print("Saved:", wrote)


Saved: F:\casts\databank\TreatmentA_aligned_bins_per_sample.csv
