In [2]:
import os
import ast
import numpy as np
import pandas as pd

# --------------------
# Config (edit paths)
# --------------------
# Input 1: the intensity matrix. Feature columns are addressed by string labels
# built from integer indices ("0", "1", ...); 'bin' and 'target' are required.
DATASET_RT_PATH = r"F:\binary\neuro_training.csv"   # wide matrix; columns "0","1",... plus 'bin','target'
# Input 2: one row per assignment; 'matched_mz_list' holds a list literal of m/z values.
ASSIGNMENTS_PATH = r"F:\binary\ids.csv"   # has 'bin' and 'matched_mz_list'
# Output is written next to the assignments file; falls back to the current
# directory when ASSIGNMENTS_PATH has no directory component.
OUT_PATH = os.path.join(os.path.dirname(ASSIGNMENTS_PATH) or ".", "assignments_with_quant_sums_aaa.csv")

# m/z -> column index mapping (make sure this matches how your matrix was created)
# index = (mz - MZ_BASE) / MZ_STEP, then rounded or truncated per USE_NEAREST.
MZ_BASE = 600.0     # m/z at column 0
MZ_STEP = 0.1       # bin width in m/z
USE_NEAREST = True  # True: round to nearest bin; False: truncate toward zero

# --------------------
# Helpers
# --------------------
def to_col_index(mz: float) -> int:
    """Convert an m/z value to the integer index of its feature column.

    The fractional bin position is ``(mz - MZ_BASE) / MZ_STEP``. When
    ``USE_NEAREST`` is true the position is rounded with the built-in
    ``round`` (ties go to the even integer); otherwise it is truncated
    toward zero with ``int``.
    """
    bin_position = (float(mz) - MZ_BASE) / MZ_STEP
    if USE_NEAREST:
        return int(round(bin_position))
    return int(bin_position)

def parse_mz_list(val):
    """Parse a ``matched_mz_list`` cell such as ``'[864.9, 865.2, ...]'``.

    The value is stringified and evaluated with ``ast.literal_eval`` (which
    only accepts Python literals, so no code is executed).

    Args:
        val: Cell content; typically a string holding a list/tuple literal.

    Returns:
        A list of floats when the cell is a list or tuple literal whose
        elements all convert with ``float``. An empty list otherwise
        (malformed text, a non-sequence literal, NaN/None cells, or
        elements that are not numeric).
    """
    try:
        parsed = ast.literal_eval(str(val))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises for text that is not a
        # valid/acceptable literal; anything else is a real bug and should surface.
        return []

    if not isinstance(parsed, (list, tuple)):
        return []

    try:
        return [float(item) for item in parsed]
    except (TypeError, ValueError, OverflowError):
        # Non-numeric element (None, nested list, arbitrary string) or an
        # integer too large for a float: treat the whole cell as unparseable.
        return []

def snap_bins(series: pd.Series, valid_bins: np.ndarray) -> pd.Series:
    """Replace every value of `series` with the closest entry of `valid_bins`.

    Values are first coerced to float; anything that cannot be parsed as a
    number becomes NaN and stays NaN in the result. When two valid bins are
    equally close, the one that appears first in `valid_bins` wins. The
    returned Series is float-typed and keeps the index of `series`.
    """
    numeric_values = pd.to_numeric(series, errors="coerce").astype(float).to_numpy()

    def _closest(value):
        # NaN has no nearest neighbour; propagate it unchanged.
        if np.isnan(value):
            return np.nan
        nearest_pos = int(np.argmin(np.abs(valid_bins - value)))
        return float(valid_bins[nearest_pos])

    return pd.Series(
        [_closest(value) for value in numeric_values],
        index=series.index,
        dtype=float,
    )

# --------------------
# Load data
# --------------------
df_rt = pd.read_csv(DATASET_RT_PATH)
df_asn = pd.read_csv(ASSIGNMENTS_PATH)

# Fail early, naming the first required column that the dataset lacks
_absent_rt_cols = [name for name in ("bin", "target") if name not in df_rt.columns]
if _absent_rt_cols:
    raise KeyError(f"'{_absent_rt_cols[0]}' column is required in the dataset CSV: {DATASET_RT_PATH}")

if not {"bin", "matched_mz_list"}.issubset(df_asn.columns):
    raise KeyError("Assignments CSV must contain 'bin' and 'matched_mz_list' columns")

# Numeric view of the dataset's bins; entries that are not numbers become NaN
_rt_bin_numeric = pd.to_numeric(df_rt["bin"], errors="coerce")

# Sorted distinct bins present in the dataset: the only values assignments may snap to
rt_bins_unique = np.array(sorted(_rt_bin_numeric.dropna().unique()), dtype=float)
if rt_bins_unique.size == 0:
    raise ValueError("No valid numeric bins found in the dataset.")

# The dataset side is only coerced to float; the assignment side is snapped onto
# the dataset's own bin values so the later equality comparison is exact.
df_rt["__bin_norm__"] = _rt_bin_numeric.astype(float)
df_asn["__bin_norm__"] = snap_bins(df_asn["bin"], rt_bins_unique)

# Everything that is not bookkeeping counts as a feature column (labels like "0","1",...)
RESERVED = {"bin", "target", "__bin_norm__"}
feat_cols_all = [label for label in df_rt.columns if label not in RESERVED]
feat_str_set = {str(label) for label in feat_cols_all}  # labels as strings, for selection

# --------------------
# Prepare output columns
# --------------------
new_cols = ["group_0_sum", "group_1_sum", "n_mz_used", "n_mz_found", "missing_cast_columns"]
# Remove leftovers from a previous run so the fresh results do not create duplicate columns
_stale_cols = [name for name in new_cols if name in df_asn.columns]
if _stale_cols:
    df_asn.drop(columns=_stale_cols, inplace=True)

# --------------------
# Row-wise quantification
# --------------------
# For every assignment row: map its matched m/z values to feature columns,
# take the dataset rows at the row's snapped bin, and sum the selected
# columns separately for target 0 and target 1.
results = []
_dataset_cols = set(df_rt.columns)  # loop-invariant: column membership test
_bin_slices = {}                    # snapped bin -> dataset rows at that bin, built on first use

for bin_value, raw_mz in zip(df_asn["__bin_norm__"], df_asn["matched_mz_list"]):
    mz_list = parse_mz_list(raw_mz)
    req_labels = [str(to_col_index(mz)) for mz in mz_list]  # match df_rt's string column labels

    # Slice dataset rows at this snapped bin (cached so each bin is filtered once).
    # A NaN bin never equals anything, so it yields an empty slice.
    df_bin = _bin_slices.get(bin_value)
    if df_bin is None:
        df_bin = df_rt[df_rt["__bin_norm__"] == bin_value]
        _bin_slices[bin_value] = df_bin

    if df_bin.empty:
        # Should be rare thanks to snapping; still handle.
        # No rows means nothing could be looked up, so every requested label is reported.
        results.append(dict(
            group_0_sum=float("nan"),
            group_1_sum=float("nan"),
            n_mz_used=len(req_labels),
            n_mz_found=0,
            missing_cast_columns=", ".join(req_labels),
        ))
        continue

    # Which requested features exist in this dataset?
    existing = [label for label in req_labels if label in _dataset_cols]
    missing = [label for label in req_labels if label not in _dataset_cols]

    # Several m/z values can land in the same bin/column. Selecting that column
    # more than once would add its intensity more than once, so each column is
    # summed a single time (first-seen order kept). The counts below still
    # refer to the requested m/z values, not to distinct columns.
    cols_to_sum = list(dict.fromkeys(existing))

    if not cols_to_sum:
        sums = {0: float("nan"), 1: float("nan")}
    else:
        # Per-target column sums, then collapsed across the requested columns
        total_per_target = df_bin.groupby("target")[cols_to_sum].sum().sum(axis=1)
        # A target group absent at this bin is reported as NaN
        sums = {t: float(total_per_target.get(t, float("nan"))) for t in (0, 1)}

    results.append(dict(
        group_0_sum=sums[0],
        group_1_sum=sums[1],
        n_mz_used=len(req_labels),
        n_mz_found=len(existing),
        missing_cast_columns=", ".join(missing),
    ))

# Attach results and save
# Build the result frame aligned to the assignments' index. Passing
# columns=new_cols keeps the output schema stable: with zero assignment rows
# `results` is empty and the frame would otherwise have no columns at all.
df_quant = pd.DataFrame(results, index=df_asn.index, columns=new_cols)
df_out = pd.concat([df_asn, df_quant], axis=1)
# NOTE(review): df_asn still carries the helper column '__bin_norm__', so it is
# written to the CSV as well — confirm whether downstream consumers want it.
df_out.to_csv(OUT_PATH, index=False)
print(f"Saved: {OUT_PATH}")

# --------------------
# Tips:
# - If you discover your matrix used a different BASE/STEP,
#   update MZ_BASE / MZ_STEP above.
# - If your assignments' bins are exactly {5, 15} already,
#   you could skip snapping and compare directly.
# --------------------


Saved: F:\binary\assignments_with_quant_sums_aaa.csv
