In [None]:
import os
import ast
import pandas as pd

# --------------------
# Config (edit paths)
# --------------------
# Input: wide intensity matrix with one cast_* column per m/z slot.
DATASET_RT_PATH = r"F:\casts\databank\csv_files\dataset_rt.csv"
# Input: assignments table; must provide 'bin' and 'matched_mz_list'.
ASSIGNMENTS_PATH = r"F:\test\assignments_with_best_matches.csv"
# Output: written into the directory of the assignments file, falling back to
# the current directory when that path has no directory component.
OUT_PATH = os.path.join(os.path.dirname(ASSIGNMENTS_PATH) or ".", "assignments_with_quant_sums.csv")

# --------------------
# Helpers
# --------------------
def to_cast_col(n: float) -> str:
    """Map an m/z value to the name of its ``cast_*`` column.

    The column number is ``(mz - 600) * 10`` truncated toward zero, rendered
    as a zero-padded, five-character string (e.g. 864.9 -> ``cast_02649``).

    Args:
        n: The m/z value; anything accepted by ``float()``.

    Returns:
        The column name, ``"cast_"`` followed by the padded column number.

    Raises:
        ValueError, TypeError: If ``n`` cannot be converted to ``float``.
    """
    scaled = (float(n) - 600.0) * 10.0
    # Binary floats make e.g. (600.3 - 600.0) * 10.0 evaluate to
    # 2.9999999999995..., which a bare int() would truncate to 2 instead of 3.
    # Rounding to 6 decimals first snaps such values back onto the intended
    # integer while leaving real fractional offsets alone (864.93 -> 2649).
    col_num = int(round(scaled, 6))
    return "cast_" + str(col_num).zfill(5)

def parse_mz_list(val):
    """Best-effort parse of a matched_mz_list cell such as '[864.9, 865.2, ...]'.

    The cell is stringified and evaluated as a Python literal. A list or tuple
    is returned as a list of floats; anything else (unparsable text, a
    non-sequence literal, or elements that are not float-convertible) yields
    an empty list instead of raising.
    """
    try:
        literal = ast.literal_eval(str(val))
    except Exception:
        # Deliberately lenient: malformed or missing cells count as "no m/z".
        return []

    if not isinstance(literal, (list, tuple)):
        return []

    try:
        return [float(item) for item in literal]
    except Exception:
        return []

# --------------------
# Load data
# --------------------
df_rt = pd.read_csv(DATASET_RT_PATH)
df_asn = pd.read_csv(ASSIGNMENTS_PATH)

# Fail fast if either input lacks the columns the quantification relies on.
for required in ("bin", "target"):
    if required not in df_rt.columns:
        raise KeyError(f"'{required}' column is required in dataset_rt.csv")

if not {"bin", "matched_mz_list"}.issubset(df_asn.columns):
    raise KeyError("assignments CSV must contain 'bin' and 'matched_mz_list' columns")

# Columns this script appends to the assignments table.
new_cols = [
    "group_0_sum",
    "group_1_sum",
    "group_2_sum",
    "group_3_sum",
    "n_mz_used",
    "n_mz_found",
    "missing_cast_columns",
]
# Remove leftovers from an earlier run so the fresh values do not end up
# next to stale columns of the same name.
stale_cols = [name for name in new_cols if name in df_asn.columns]
if stale_cols:
    df_asn.drop(columns=stale_cols, inplace=True)

# --------------------
# Row-wise quantification
# --------------------
# For every assignment row: translate its matched m/z values into cast_*
# column names, restrict dataset_rt to the row's bin, and sum the intensities
# of those columns separately for each target group 0..3.
target_groups = (0, 1, 2, 3)
results = []
for _, row in df_asn.iterrows():
    bin_value = float(row["bin"])
    mz_list = parse_mz_list(row["matched_mz_list"])
    # NOTE(review): two m/z values that fall into the same cast_* column show
    # up twice in cast_cols -- confirm whether counting that column twice in
    # the sums is intended.
    cast_cols = [to_cast_col(mz) for mz in mz_list]

    # Rows of dataset_rt belonging to this bin.
    df_bin = df_rt[df_rt["bin"] == bin_value]

    if df_bin.empty:
        # Unknown bin: nothing can be looked up, so every requested column is
        # reported as missing.
        existing, missing = [], cast_cols
    else:
        existing = [c for c in cast_cols if c in df_bin.columns]
        missing = [c for c in cast_cols if c not in df_bin.columns]

    # NaN marks "no data"; it is kept for target groups absent from this bin.
    sums = dict.fromkeys(target_groups, float("nan"))
    if existing:
        # Sum each selected cast_* column per target, then across the columns.
        # The 'target' column is guaranteed by the validation done right after
        # loading, so it is not re-checked for every row.
        total_per_target = df_bin.groupby("target")[existing].sum().sum(axis=1)
        sums = {
            t: float(total_per_target.get(t, float("nan")))
            for t in target_groups
        }

    # Emit the diagnostic fields for every row. Previously they were only
    # filled in when the bin was missing, leaving them NaN for all rows that
    # were actually quantified.
    results.append(
        {
            "group_0_sum": sums[0],
            "group_1_sum": sums[1],
            "group_2_sum": sums[2],
            "group_3_sum": sums[3],
            "n_mz_used": len(cast_cols),
            "n_mz_found": len(existing),
            "missing_cast_columns": ", ".join(missing),
        }
    )

# Attach results
# One result dict per assignment row, aligned on the assignments index so the
# new columns line up row-for-row.
df_quant = pd.DataFrame(results, index=df_asn.index)
df_asn_out = pd.concat([df_asn, df_quant], axis="columns")

# --------------------
# Write the enriched assignments table
# --------------------
df_asn_out.to_csv(OUT_PATH, index=False)
print(f"Saved: {OUT_PATH}")




Saved: F:\test\assignments_with_quant_sums.csv
