How to generate the combined MS2 dataset

In [None]:
import numpy as np
import pandas as pd
import h5py

# Per-treatment .npz archives; each one holds an MS2 matrix plus row-aligned metadata arrays.
TreatmentA = "F:/casts/databank/TreatmentA.ms2.npz"
TreatmentB = "F:/casts/databank/TreatmentB.ms2.npz"
TreatmentC = "F:/casts/databank/TreatmentC.ms2.npz"
TreatmentD = "F:/casts/databank/TreatmentD.ms2.npz"


def load_treatment_npz(path):
    """Load one treatment archive.

    Parameters
    ----------
    path : str
        Path to a ``*.ms2.npz`` file containing the arrays ``ms2_matrix``,
        ``ms2_scan``, ``ms2_rt``, ``ms2_file_id``, ``file_names_lookup``,
        ``group_name`` and ``ms2_precursor_mz``.

    Returns
    -------
    (ms2_matrix, ms2_meta)
        The spectra matrix and a DataFrame with one metadata row per matrix row.

    Raises
    ------
    ValueError
        If the metadata and the matrix do not have the same number of rows.
    """
    with np.load(file=path) as z:
        ms2_matrix = z["ms2_matrix"]          # (n_rows, n_bins)
        fnames = z["file_names_lookup"]       # (n_files,)
        # Metadata aligned to the MS2 rows (same row count/order as the matrix).
        ms2_meta = pd.DataFrame({
            "scan": z["ms2_scan"],
            "rt_min": z["ms2_rt"],            # retention time in minutes
            "precursor_mz": z["ms2_precursor_mz"],
            "file_name": fnames[z["ms2_file_id"]],  # map per-row file id -> file name
            "group_name": z["group_name"],
        })
    if len(ms2_meta) != ms2_matrix.shape[0]:
        raise ValueError(
            f"{path}: metadata rows ({len(ms2_meta)}) != matrix rows ({ms2_matrix.shape[0]})"
        )
    return ms2_matrix, ms2_meta


# Load every treatment explicitly so the cell works on a fresh kernel
# (previously only TreatmentD was loaded here and A/B/C had to be left over
# from earlier manual runs of this cell).
ms2_A, ms2_meta_A = load_treatment_npz(TreatmentA)
ms2_B, ms2_meta_B = load_treatment_npz(TreatmentB)
ms2_C, ms2_meta_C = load_treatment_npz(TreatmentC)
ms2_D, ms2_meta_D = load_treatment_npz(TreatmentD)

# Stack in the same A, B, C, D order for both objects so rows stay aligned.
metadata = pd.concat([ms2_meta_A, ms2_meta_B, ms2_meta_C, ms2_meta_D], ignore_index=True)
ms2_lib = np.vstack((ms2_A, ms2_B, ms2_C, ms2_D))

# One HDF5 dataset for the spectra matrix plus one dataset per metadata column.
with h5py.File("F:/casts/databank/ms2_dataset.h5", "w") as f:
    f.create_dataset("ms2_lib", data=ms2_lib, compression="gzip")
    for col in metadata.columns:
        values = metadata[col].values
        if metadata[col].dtype == object:
            # Object (string) columns are stored as fixed-width byte strings.
            values = values.astype("S")
        f.create_dataset(col, data=values)

In [2]:
import h5py
import pandas as pd
# Read the combined dataset back: the spectra matrix plus every other
# top-level dataset, each of which becomes one metadata column.
with h5py.File("F:/casts/databank/ms2_dataset.h5", "r") as h5_file:
    ms2_lib = h5_file["ms2_lib"][:]
    metadata_columns = {}
    for key in h5_file.keys():
        if key == "ms2_lib":
            continue
        metadata_columns[key] = h5_file[key][:]
    metadata = pd.DataFrame(metadata_columns)


Clear the RAM (delete all user-defined variables)

In [1]:
# Warning: this will wipe *everything* you defined in the current session!
# Names that the interactive shell itself places in the namespace are kept;
# deleting them (e.g. get_ipython, In, Out) can break magics and history for
# the rest of the session. `%reset -f` is the built-in alternative.
_SHELL_NAMES = {"In", "Out", "get_ipython", "exit", "quit", "open"}
for _name in list(globals()):
    # Underscore-prefixed names (__name__, __doc__, _SHELL_NAMES, ...) are kept.
    if not _name.startswith("_") and _name not in _SHELL_NAMES:
        del globals()[_name]

import gc
gc.collect()



7

In [5]:
# Show the distinct values of the 'file_name' column, cast to str for display.
metadata['file_name'].unique().astype(str)

array(['20220315_chm134_Cirrhosis_FlowChip15_AA13001EM1_TreatmentA_biorep01_techrep01.raw',
       '20220315_chm134_Cirrhosis_FlowChip15_AA13001EM1_TreatmentA_biorep01_techrep02.raw',
       '20220315_chm134_Cirrhosis_FlowChip15_AA13001EM1_TreatmentA_biorep01_techrep03.raw',
       '20220317_chm134_Cirrhosis_FlowChip15_AA26021EM1_TreatmentA_biorep21_techrep01.raw',
       '20220317_chm134_Cirrhosis_FlowChip15_AA26021EM1_TreatmentA_biorep21_techrep02.raw',
       '20220317_chm134_Cirrhosis_FlowChip15_AA26021EM1_TreatmentA_biorep21_techrep03.raw',
       '20220320_chm134_Cirrhosis_FlowChip15_AA18009EM1_TreatmentA_biorep09_techrep01.raw',
       '20220320_chm134_Cirrhosis_FlowChip15_AA18009EM1_TreatmentA_biorep09_techrep02.raw',
       '20220320_chm134_Cirrhosis_FlowChip15_AA18009EM1_TreatmentA_biorep09_techrep03.raw',
       '20220322_chm134_Cirrhosis_FlowChip15_AA18011EM1_TreatmentA_biorep11_techrep01.raw',
       '20220322_chm134_Cirrhosis_FlowChip15_AA18011EM1_TreatmentA_biorep11_tech

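The following code loads the combined MS2 databank (ms2_dataset.h5), estimates the per-bin retention-time drift of each run relative to the first run, and saves the aligned results.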

In [None]:
# -*- coding: utf-8 -*-
"""
Load HDF5 (ms2_dataset.h5) -> compute per-bin RT drifts vs first run -> align RTs
Save:
  - per-scan aligned metadata CSV (drops 'cast spectra')
  - per-bin drift tables CSV
"""

import os
import h5py
import numpy as np
import pandas as pd
from typing import Tuple
from math import floor, ceil
import matplotlib.pyplot as plt

# =====================
# Config
# =====================
# Combined HDF5 file passed to align_runs_from_h5 (must contain an 'ms2_lib' dataset).
H5_PATH = r"F:/casts/databank/ms2_dataset.h5"

# A reference spectrum counts as a match only if its cosine similarity is strictly above this.
SIM_THRESHOLD  = 0.95
# Reference candidates must satisfy |m/z_ref - m/z_target| < MZ_WINDOW.
MZ_WINDOW      = 1.0
# Stop collecting drifts for a bin once this many valid ones have been found.
TARGET_N       = 50
# Width of each RT bin (same units as the RT column; presumably minutes).
BIN_WIDTH      = 10.0
# Each bin's sampling window is widened by this much on both sides,
# then clipped to the target run's observed RT range.
OVERLAP_MIN    = 2.5
# If not None, replaces the data-derived upper edge used when generating bins.
FORCE_BIN_END_MIN = 80.0
# True: draw rows one at a time with replacement (bounded number of tries);
# False: shuffle the window once and walk through it.
SAMPLE_WITH_REPLACEMENT_IF_NEEDED = False

PLOT_DRIFT_CURVES = False     # set True if you want plots
PLOT_SANITY_AFTER = False     # set True to plot the median applied correction per 2-min raw-RT bin

# CSV outputs
SAVE_ALIGNED_CSV  = True      # write the per-scan aligned metadata
CSV_OUT_PATH      = r"F:/casts/databank/aligned_metadata.csv"

SAVE_DRIFTS_CSV   = True      # write the per-bin drift table
DRIFTS_CSV_PATH   = r"F:/casts/databank/rt_drifts.csv"

# =========================================================
# Helpers
# =========================================================
def _to_1d_float_array(x):
    if isinstance(x, np.ndarray):
        arr = x
    elif isinstance(x, (list, tuple)):
        arr = np.asarray(x, dtype=float)
    else:
        try:
            arr = np.asarray(x, dtype=float).ravel()
        except Exception:
            return None
    return arr.ravel().astype(float, copy=False)

def cosine(a, b):
    """Cosine similarity between two spectra-like inputs.

    Both inputs are flattened to float vectors. If their lengths differ, both
    are truncated to the shorter length before comparison. Returns ``-np.inf``
    when either input is unusable, empty, or has zero norm.
    """
    vec_a = _to_1d_float_array(a)
    vec_b = _to_1d_float_array(b)
    if vec_a is None or vec_b is None:
        return -np.inf

    common = min(vec_a.size, vec_b.size)
    if common == 0:
        return -np.inf
    if vec_a.shape != vec_b.shape:
        vec_a = vec_a[:common]
        vec_b = vec_b[:common]

    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0:
        return -np.inf
    return float(np.dot(vec_a, vec_b) / norm_product)

def decode_bytes_inplace(df: pd.DataFrame) -> None:
    """Decode bytes/bytearray cells to UTF-8 text, modifying ``df`` in place.

    Only columns whose dtype is ``object`` or whose dtype string starts with
    ``|S`` are touched; non-bytes values in those columns are left unchanged.
    """
    def _as_text(value):
        if isinstance(value, (bytes, bytearray)):
            return value.decode("utf-8")
        return value

    for name in df.columns:
        dtype = df[name].dtype
        may_hold_bytes = dtype == object or str(dtype).startswith("|S")
        if not may_hold_bytes:
            continue
        df[name] = df[name].apply(_as_text)

def pick_col(df: pd.DataFrame, *cands):
    """Return the first name in ``cands`` that is a column of ``df``.

    Raises:
        KeyError: if none of the candidates is present.
    """
    not_found = object()
    chosen = next((name for name in cands if name in df.columns), not_found)
    if chosen is not_found:
        raise KeyError(f"None of {cands} found. Available: {df.columns.tolist()}")
    return chosen

def harmonize_columns(df: pd.DataFrame) -> None:
    """Add the canonical columns used downstream, modifying ``df`` in place.

    Ensures ``sample_name`` (str), ``m/z`` (float) and ``retntion time``
    (float; the misspelling is kept for compatibility) exist, deriving each
    from the first available alternative column. Existing canonical columns
    are left untouched.

    Raises:
        KeyError: if no source column can be found for one of them.
    """
    if "sample_name" not in df.columns:
        source = pick_col(df, "sample_name", "file_name", "raw_name", "run_name")
        df["sample_name"] = df[source].astype(str)

    if "m/z" not in df.columns:
        source = pick_col(df, "m/z", "mz", "precursor_mz")
        df["m/z"] = df[source].astype(float)

    if "retntion time" in df.columns:
        return

    present = set(df.columns)
    if "retention_time" in present:
        rt_values = df["retention_time"].astype(float)
    elif "rt_min" in present and "rt_max" in present:
        # Both bounds available: use their midpoint.
        rt_values = (df["rt_min"].astype(float) + df["rt_max"].astype(float)) / 2.0
    elif "rt_min" in present:
        rt_values = df["rt_min"].astype(float)
    elif "rt" in present:
        rt_values = df["rt"].astype(float)
    else:
        raise KeyError("Could not infer 'retntion time' column from metadata.")
    df["retntion time"] = rt_values

def load_h5_build_df(h5_path: str) -> pd.DataFrame:
    """Load the combined HDF5 file into one per-scan DataFrame.

    Every top-level dataset except ``ms2_lib`` becomes a metadata column.
    Byte strings are decoded, canonical columns are added, and each row of
    ``ms2_lib`` is attached as an array in the ``cast spectra`` column.

    Raises:
        FileNotFoundError: if ``h5_path`` does not exist.
        KeyError: if the file has no ``ms2_lib`` dataset.
        ValueError: if metadata and ``ms2_lib`` row counts differ.
    """
    if not os.path.exists(h5_path):
        raise FileNotFoundError(h5_path)

    column_arrays = {}
    with h5py.File(h5_path, "r") as h5:
        if "ms2_lib" not in h5:
            raise KeyError("HDF5 must contain 'ms2_lib' dataset.")
        spectra = h5["ms2_lib"][:]  # (N, L)
        for key in h5.keys():
            if key != "ms2_lib":
                column_arrays[key] = h5[key][:]

    frame = pd.DataFrame(column_arrays)
    decode_bytes_inplace(frame)
    harmonize_columns(frame)

    n_meta = len(frame)
    n_spectra = spectra.shape[0]
    if n_meta != n_spectra:
        raise ValueError(f"Row mismatch: metadata={n_meta} vs ms2_lib={n_spectra}")

    frame = frame.copy()
    # One spectrum (1-D array) per row, aligned on the metadata index.
    frame["cast spectra"] = pd.Series(list(spectra), index=frame.index)
    return frame

def build_bins_for_target(df_target: pd.DataFrame,
                          bin_width: float,
                          force_end_min):
    """Build consecutive RT bins covering a target run.

    Parameters
    ----------
    df_target : pd.DataFrame
        Rows of one run; must have a ``retntion time`` column.
    bin_width : float
        Width of each bin; must be > 0.
    force_end_min : float or None
        If not None, used as the upper edge instead of the data-derived one.

    Returns
    -------
    (bins, rt_min, rt_max)
        ``bins`` is a list of ``(start, end)`` tuples; ``rt_min``/``rt_max``
        are the observed RT extremes. For an empty frame: ``([], nan, nan)``.

    Raises
    ------
    ValueError
        If ``bin_width`` is not positive, or ``force_end_min`` is not above
        the first bin's start edge.
    """
    if df_target.empty:
        return [], np.nan, np.nan

    # A non-positive width would never advance past end_edge (endless loop).
    if not bin_width > 0:
        raise ValueError(f"bin_width must be > 0, got {bin_width!r}.")

    rt_min = float(df_target["retntion time"].min())
    rt_max = float(df_target["retntion time"].max())

    # Snap the edges outward to multiples of bin_width.
    start_edge = bin_width * floor(rt_min / bin_width)
    end_edge   = bin_width * ceil(rt_max / bin_width)

    if force_end_min is not None:
        end_edge = float(force_end_min)
        if end_edge <= start_edge:
            raise ValueError(f"FORCE_BIN_END_MIN ({force_end_min}) must be > start_edge ({start_edge}).")

    # Derive the bin count once and place edges by multiplication. Repeatedly
    # adding bin_width accumulates rounding error and can yield a spurious
    # extra bin (e.g. 0.1-wide bins over [0, 1] gave 11 instead of 10).
    n_bins = ceil(round((end_edge - start_edge) / bin_width, 9))
    bins = [
        (start_edge + i * bin_width, start_edge + (i + 1) * bin_width)
        for i in range(n_bins)
    ]
    return bins, rt_min, rt_max

def collect_valid_drifts(bin_df: pd.DataFrame,
                         mz_ref: np.ndarray,
                         rt_ref: np.ndarray,
                         cast_ref: np.ndarray,
                         sim_threshold: float,
                         mz_window: float,
                         target_n: int,
                         sample_with_replacement: bool) -> list:
    """Collect up to ``target_n`` RT drifts for the rows of one bin window.

    For each sampled target row, reference scans with
    ``|m/z_ref - m/z_row| < mz_window`` whose cosine similarity to the row's
    spectrum exceeds ``sim_threshold`` are considered matches; the drift is the
    row's RT minus the mean RT of those matches. Rows with no match are skipped.

    Parameters
    ----------
    bin_df : pd.DataFrame
        Target rows with ``m/z``, ``retntion time`` and ``cast spectra`` columns.
    mz_ref, rt_ref, cast_ref : np.ndarray
        Row-aligned m/z values, RTs and spectra of the reference run.
    sim_threshold : float
        Cosine similarity must be strictly greater than this.
    mz_window : float
        Half-width of the m/z matching window (exclusive).
    target_n : int
        Stop once this many drifts have been collected.
    sample_with_replacement : bool
        True: draw single rows with replacement, at most
        ``max(200, target_n * 20)`` times. False: shuffle once and iterate.

    Returns
    -------
    list of float
        Collected drifts (possibly fewer than ``target_n``, possibly empty).
    """
    if bin_df.empty:
        return []

    def drift_for_row(row):
        mz_i   = float(row["m/z"])
        rt_i   = float(row["retntion time"])
        cast_i = row["cast spectra"]

        # Cheap m/z pre-filter before the per-candidate cosine comparison.
        idxs = np.where(np.abs(mz_ref - mz_i) < mz_window)[0]
        if idxs.size == 0:
            return None

        match_count = 0
        rt_sum = 0.0
        for j in idxs:
            if cosine(cast_i, cast_ref[j]) > sim_threshold:
                match_count += 1
                rt_sum += rt_ref[j]
        if match_count == 0:
            return None
        return rt_i - (rt_sum / match_count)

    drifts = []
    if sample_with_replacement:
        # Seeded generator shared across draws, so this branch is reproducible
        # like the shuffle below. A bare int seed per draw would return the
        # same row every time.
        rng = np.random.RandomState(42)
        tries = 0
        max_tries = max(200, target_n * 20)
        while len(drifts) < target_n and tries < max_tries:
            row = bin_df.sample(n=1, replace=True, random_state=rng).iloc[0]
            tries += 1
            d = drift_for_row(row)
            if d is not None:
                drifts.append(d)
        return drifts

    bin_df_shuf = bin_df.sample(frac=1.0, replace=False, random_state=42).reset_index(drop=True)
    for _, row in bin_df_shuf.iterrows():
        if len(drifts) >= target_n:
            break
        d = drift_for_row(row)
        if d is not None:
            drifts.append(d)
    return drifts

def compute_drift_table_for_target(df_target: pd.DataFrame,
                                   mz_ref: np.ndarray,
                                   rt_ref: np.ndarray,
                                   cast_ref: np.ndarray) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Compute the per-bin average RT drift of one target run vs the reference.

    Bins come from ``build_bins_for_target`` (module-level BIN_WIDTH and
    FORCE_BIN_END_MIN). Each bin is sampled from a window widened by
    OVERLAP_MIN on both sides and clipped to the run's observed RT range.

    Returns:
        (result_df, plot_df_valid): one row per bin, and the subset of bins
        that have a non-NaN average drift with at least one valid match.
        Both are the same empty frame when no bins were produced.
    """
    bins, rt_min, rt_max = build_bins_for_target(df_target, BIN_WIDTH, FORCE_BIN_END_MIN)

    rows = []
    for bin_start, bin_end in bins:
        window_lo = max(bin_start - OVERLAP_MIN, rt_min)
        window_hi = min(bin_end + OVERLAP_MIN, rt_max)

        rt_column = df_target["retntion time"]
        in_window = (rt_column >= window_lo) & (rt_column < window_hi)
        window_df = df_target[in_window].copy()

        drifts = collect_valid_drifts(
            window_df,
            mz_ref=mz_ref,
            rt_ref=rt_ref,
            cast_ref=cast_ref,
            sim_threshold=SIM_THRESHOLD,
            mz_window=MZ_WINDOW,
            target_n=TARGET_N,
            sample_with_replacement=SAMPLE_WITH_REPLACEMENT_IF_NEEDED,
        )

        if drifts:
            mean_drift = float(np.mean(drifts))
        else:
            mean_drift = float("nan")

        rows.append({
            "bin_start_min": bin_start,
            "bin_end_min": bin_end,
            "expanded_start_min": window_lo,
            "expanded_end_min": window_hi,
            "n_in_expanded_window": len(window_df),
            "n_valid_used": len(drifts),
            "target_n": TARGET_N,
            "avg_rt_drift": mean_drift,
        })

    table = pd.DataFrame.from_records(rows)
    if table.empty:
        return table, table

    table["bin_center_min"] = (table["bin_start_min"] + table["bin_end_min"]) * 0.5
    usable = table["avg_rt_drift"].notna() & (table["n_valid_used"] > 0)
    return table, table[usable].copy()

def build_alignment_function(plot_df_valid: pd.DataFrame):
    """Turn per-bin drifts into a callable ``rt -> correction``.

    * no valid bins  -> correction is zero everywhere
    * one valid bin  -> constant correction equal to that bin's drift
    * otherwise      -> linear interpolation between bin centers, held flat
                        at the first/last drift outside the covered range
    """
    if plot_df_valid is None or plot_df_valid.empty:
        def zero_correction(x):
            return np.zeros_like(np.asarray(x, dtype=float))
        return zero_correction

    centers = plot_df_valid["bin_center_min"].to_numpy()
    drifts = plot_df_valid["avg_rt_drift"].to_numpy()
    rank = np.argsort(centers)  # np.interp needs increasing x
    centers = centers[rank]
    drifts = drifts[rank]

    if centers.size == 1:
        constant = float(drifts[0])

        def constant_correction(rt):
            return np.full_like(np.asarray(rt, dtype=float), constant)
        return constant_correction

    def interpolated_correction(rt):
        rt_values = np.asarray(rt, dtype=float)
        return np.interp(rt_values, centers, drifts, left=drifts[0], right=drifts[-1])
    return interpolated_correction

def align_runs_from_h5(h5_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Align the retention times of every run in the HDF5 file to the first run.

    The first distinct 'sample_name' (in row order) is the reference; every
    other sample is a target. For each target a per-bin drift table is
    computed against the reference, turned into a correction function, and
    subtracted from the target's raw retention times.

    Args:
      h5_path : path to the combined HDF5 file (passed to load_h5_build_df)

    Returns:
      aligned_df   : per-scan DataFrame with rt_correction and rt_aligned
      drift_table  : per-bin drift table for all targets

    Raises:
      ValueError : if fewer than two samples are present, or the reference
                   has no rows
    """
    df = load_h5_build_df(h5_path)

    # reference & targets
    sample_order = df["sample_name"].dropna().unique().tolist()
    if len(sample_order) < 2:
        raise ValueError(f"Need ≥2 samples to align; found {len(sample_order)}: {sample_order}")
    ref_name = sample_order[0]
    target_names = sample_order[1:]

    df_ref = df[df["sample_name"] == ref_name].copy()
    if df_ref.empty:
        raise ValueError(f"No reference rows found for '{ref_name}'.")
    # Row-aligned reference arrays handed to the matching routine.
    mz_ref   = df_ref["m/z"].to_numpy()
    rt_ref   = df_ref["retntion time"].to_numpy()
    cast_ref = df_ref["cast spectra"].to_numpy(object)

    # Start from "no correction" for every row; targets are overwritten below.
    df = df.copy()
    df["rt_correction"] = 0.0
    df["rt_aligned"] = df["retntion time"].astype(float)

    all_drifts = []  # collect per-target drift tables

    if PLOT_DRIFT_CURVES:
        plt.figure()
        # NOTE(review): any_series is assigned but never read in this function.
        any_series = False

    for tname in target_names:
        dft = df[df["sample_name"] == tname].copy()
        if dft.empty:
            print(f"Warning: no rows for target '{tname}', skipping.")
            continue

        res_df, plot_df_valid = compute_drift_table_for_target(dft, mz_ref, rt_ref, cast_ref)

        # add target name & collect drift table
        res_df = res_df.copy()
        res_df["target_name"] = tname
        all_drifts.append(res_df)

        # optional: weighted avg summary (bins weighted by their number of valid matches)
        if not plot_df_valid.empty:
            weights = plot_df_valid["n_valid_used"].to_numpy()
            vals    = plot_df_valid["avg_rt_drift"].to_numpy()
            wavg    = np.average(vals, weights=weights)
            print(f"{tname}: weighted overall avg drift = {wavg:.3f} min "
                  f"(kept {plot_df_valid.shape[0]} bins with ≥1 valid match; TARGET_N={TARGET_N})")
        else:
            print(f"{tname}: no bins with ≥1 valid match.")

        if PLOT_DRIFT_CURVES and not plot_df_valid.empty:
            plt.plot(plot_df_valid["bin_center_min"], plot_df_valid["avg_rt_drift"], marker="o", label=tname)

        # build & apply alignment: drift is (target RT - reference RT),
        # so the correction is subtracted from the raw RT.
        align_fn = build_alignment_function(plot_df_valid)
        rt_vals = dft["retntion time"].to_numpy(dtype=float)
        corr = align_fn(rt_vals)
        aligned = rt_vals - corr
        # dft keeps df's index labels, so .loc writes back to the matching rows.
        df.loc[dft.index, "rt_correction"] = corr
        df.loc[dft.index, "rt_aligned"] = aligned

    # reference unchanged
    df.loc[df["sample_name"] == ref_name, "rt_correction"] = 0.0
    df.loc[df["sample_name"] == ref_name, "rt_aligned"] = df.loc[df["sample_name"] == ref_name, "retntion time"].astype(float)

    if PLOT_DRIFT_CURVES:
        # Guide lines at zero drift and at +/-5.
        plt.axhline(0.0, linestyle="--", color="gray")
        plt.axhline(5.0, linestyle="--", alpha=0.6)
        plt.axhline(-5.0, linestyle="--", alpha=0.6)
        plt.xlabel("Retention time (min, bin center)")
        plt.ylabel("Average RT drift vs ref (min)")
        # Legend is disabled, so the per-target labels set above are not shown.
        # plt.legend(title="Target samples", fontsize=9)
        plt.grid(True, which="both", linestyle=":", linewidth=0.5)
        plt.tight_layout()
        plt.show()

    # combine drift tables
    drift_table = pd.concat(all_drifts, ignore_index=True) if all_drifts else pd.DataFrame()

    if PLOT_SANITY_AFTER:
        # One curve per sample: median applied correction within 2-minute raw-RT bins.
        plt.figure()
        for name in df["sample_name"].dropna().unique().tolist():
            dfx = df[df["sample_name"] == name]
            tmp = dfx[["retntion time", "rt_correction"]].copy()
            tmp["bin"] = (tmp["retntion time"] // 2.0) * 2.0  # 2-min bins
            grp = tmp.groupby("bin", as_index=False)["rt_correction"].median()
            plt.plot(grp["bin"], grp["rt_correction"], marker=".", alpha=0.85, label=name)
        plt.axhline(0.0, linestyle="--", color="gray")
        plt.xlabel("Raw RT (min, 2-min bins)")
        plt.ylabel("Median applied correction (min)")
        # plt.legend(fontsize=8)
        plt.grid(True, linestyle=":", linewidth=0.5)
        plt.tight_layout()
        plt.show()

    return df, drift_table

# =====================
# Run
# =====================
if __name__ == "__main__":
    aligned_df, drift_table = align_runs_from_h5(H5_PATH)

    # Save aligned per-scan metadata (drop huge spectra)
    if SAVE_ALIGNED_CSV:
        out_dir = os.path.dirname(CSV_OUT_PATH)
        if out_dir:  # a bare filename has no directory; makedirs("") would raise
            os.makedirs(out_dir, exist_ok=True)
        aligned_df.drop(columns=["cast spectra"], errors="ignore").to_csv(CSV_OUT_PATH, index=False)
        print(f"Saved aligned metadata to: {CSV_OUT_PATH}")

    # Save per-bin drift table
    if SAVE_DRIFTS_CSV:
        out_dir = os.path.dirname(DRIFTS_CSV_PATH)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        drift_table.to_csv(DRIFTS_CSV_PATH, index=False)
        print(f"Saved per-bin RT drifts to: {DRIFTS_CSV_PATH}")

    # Quick summary: median applied correction per run, in first-appearance order.
    # (groupby skips NaN sample names and median skips NaN corrections.)
    median_corrections = aligned_df.groupby("sample_name", sort=False)["rt_correction"].median()
    for name, med_corr in median_corrections.items():
        print(f"{name:30s} median correction: {float(med_corr): .3f} min")

    # Report the columns actually present instead of a hard-coded list, which
    # omitted the original metadata columns carried over from the HDF5 file.
    print("\nColumns in aligned_df:")
    print(aligned_df.columns.tolist())
    if not drift_table.empty:
        print("\nDrift table columns:")
        print(drift_table.columns.tolist())

# ---- Make runs × bins drift matrix and save to CSV ----
import numpy as np
import pandas as pd
import os

SAVE_DRIFT_MATRIX_CSV = True
DRIFT_MATRIX_CSV_PATH = r"F:/casts/databank/rt_drifts_matrix.csv"

if not drift_table.empty and SAVE_DRIFT_MATRIX_CSV:
    dt = drift_table.copy()

    # Use bin centers as columns (minutes). Round to 2 decimals for clean headers.
    dt["bin_center_min"] = dt["bin_center_min"].astype(float).round(2)

    # Pivot: rows=runs (target_name), cols=bins, values=avg drift
    drift_matrix = (
        dt.pivot_table(
            index="target_name",
            columns="bin_center_min",
            values="avg_rt_drift",
            aggfunc="mean"  # safe if duplicates ever appear
        )
        .sort_index(axis=1)  # sort bins left→right
    )

    # Optional: include the reference run (first sample in aligned_df) as a zero row.
    # Only the failures this lookup can plausibly hit are caught, and they are
    # reported instead of being silently swallowed.
    try:
        ref_name = aligned_df["sample_name"].dropna().unique().tolist()[0]
    except (NameError, KeyError, IndexError) as exc:
        print(f"Reference row not added to drift matrix: {exc!r}")
    else:
        if ref_name not in drift_matrix.index:
            # add zero drift across all bins for the reference
            drift_matrix.loc[ref_name] = 0.0
            drift_matrix = drift_matrix.sort_index()

    # (Optional) prettier column labels like "t00-10", else keep numeric centers:
    # dt2 = drift_table.copy()
    # dt2["bin_label"] = dt2["bin_start_min"].astype(int).astype(str) + "-" + dt2["bin_end_min"].astype(int).astype(str)
    # drift_matrix = (dt2.pivot_table(index="target_name", columns="bin_label", values="avg_rt_drift").sort_index(axis=1))

    matrix_dir = os.path.dirname(DRIFT_MATRIX_CSV_PATH)
    if matrix_dir:  # a bare filename has no directory; makedirs("") would raise
        os.makedirs(matrix_dir, exist_ok=True)
    drift_matrix.to_csv(DRIFT_MATRIX_CSV_PATH, float_format="%.5f")
    print(f"Saved drift matrix (runs × bins) to: {DRIFT_MATRIX_CSV_PATH}")



20220315_chm134_Cirrhosis_FlowChip15_AA13001EM1_TreatmentA_biorep01_techrep02.raw: weighted overall avg drift = -0.331 min (kept 8 bins with ≥1 valid match; TARGET_N=50)
20220315_chm134_Cirrhosis_FlowChip15_AA13001EM1_TreatmentA_biorep01_techrep03.raw: weighted overall avg drift = -0.961 min (kept 8 bins with ≥1 valid match; TARGET_N=50)
20220317_chm134_Cirrhosis_FlowChip15_AA26021EM1_TreatmentA_biorep21_techrep01.raw: weighted overall avg drift = 1.789 min (kept 8 bins with ≥1 valid match; TARGET_N=50)
20220317_chm134_Cirrhosis_FlowChip15_AA26021EM1_TreatmentA_biorep21_techrep02.raw: weighted overall avg drift = 1.816 min (kept 8 bins with ≥1 valid match; TARGET_N=50)
20220317_chm134_Cirrhosis_FlowChip15_AA26021EM1_TreatmentA_biorep21_techrep03.raw: weighted overall avg drift = 1.985 min (kept 8 bins with ≥1 valid match; TARGET_N=50)
20220320_chm134_Cirrhosis_FlowChip15_AA18009EM1_TreatmentA_biorep09_techrep01.raw: weighted overall avg drift = -0.387 min (kept 8 bins with ≥1 valid mat