Reads Thermo RAW files and a TDReport table, then generates the MS1 and MS2 datasets

In [17]:
# -*- coding: utf-8 -*-
"""
End-to-end:
1) Process Thermo .raw in a folder -> MS1 and MS2 CSVs
2) Make MS2 compatible with ID_import (scan column, sample_name alignment)
3) Run ID_import to attach IDs from a tdportal table

Requires: fisher-py, numpy, pandas, tqdm
  pip install fisher-py numpy pandas tqdm
"""

import os, glob, re
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict
from tqdm import tqdm

# ---- fisher-py imports (RAW access) ----
from fisher_py.data.business import Scan
from fisher_py import RawFile


# -----------------------------
# Binning configuration
# -----------------------------
# MS1: 0.1 m/z bins covering m/z 600.0 .. 1968.9 (bin index = round(mz * 10)).
# MS1_MIN_IDX is the first bin index kept; MS1_MAX_EXC is one past the last.
MS1_MIN_IDX, MS1_LEN = 6000, 13690
MS1_MAX_EXC = MS1_MIN_IDX + MS1_LEN

# MS2: 1.0 m/z bins covering m/z 400 .. 1999 (bin index = round(mz)).
# MS2_MIN_IDX is the first bin index kept; MS2_MAX_EXC is one past the last.
MS2_MIN_IDX, MS2_LEN = 400, 1600
MS2_MAX_EXC = MS2_MIN_IDX + MS2_LEN

# When True, each binned MS2 scan is divided by its largest bin value
# (base-peak normalization), so the tallest bin of every scan becomes 1.0.
MS2_NORMALIZE = True


# -----------------------------
# Helpers
# -----------------------------
def _scan_type_label(text: str) -> str:
    """Return the lower-cased word that follows "Full" in a scan description.

    The search is case-insensitive; an empty string is returned when no
    "Full <word>" token is present.
    """
    found = re.search(r"Full\s+(\w+)", str(text), flags=re.IGNORECASE)
    if found is None:
        return ""
    return found.group(1).lower()

def _as_float_array(x):
    """Coerce ``x`` to a float ndarray.

    ``None`` and zero-size inputs both yield a fresh empty 1-D float array;
    anything else is converted with ``astype(float, copy=False)``, so an input
    that is already a float array is returned without copying.
    """
    if x is not None:
        arr = np.asarray(x)
        if arr.size:
            return arr.astype(float, copy=False)
    return np.array([], dtype=float)

def _raw_files_in(folder: str) -> List[str]:
    """Return the sorted paths of all ``.raw`` entries directly inside ``folder``.

    The extension is matched case-insensitively (``.raw``, ``.RAW``, ``.Raw``,
    ...), and the folder path is treated literally even when it contains glob
    metacharacters such as ``[`` or ``*``.

    Raises:
        FileNotFoundError: if ``folder`` is not a directory, or if it contains
            no ``.raw`` entries.
    """
    folder = os.path.abspath(folder)
    if not os.path.isdir(folder):
        raise FileNotFoundError(f'Folder not found: "{folder}"')

    # Escape the folder so names like "run[1]" are not read as glob patterns;
    # "*" keeps glob's usual behaviour of skipping dot-files.
    candidates = glob.glob(os.path.join(glob.escape(folder), "*"))
    out = sorted(p for p in candidates if p.lower().endswith(".raw"))
    if not out:
        raise FileNotFoundError(f'No .raw files in: {folder}')
    return out

def _precursor_from_scan(raw_scan) -> float:
    """Best-effort precursor m/z for an MS2 scan object.

    Lookup order:
      1. The first of the attributes ``precursor_mz``, ``master_precursor_mz``,
         ``isolation_mz`` that exists and converts to ``float``.
      2. The number immediately preceding the first ``@`` in
         ``str(raw_scan.scan_type)`` (e.g. ``800.00`` in
         ``"... Full ms2 800.00@hcd30.00 [...]"``).
      3. The second decimal number in that string (the previous fallback,
         kept for strings without an ``@``).

    Returns ``nan`` when none of these yields a value.
    """
    for attr in ("precursor_mz", "master_precursor_mz", "isolation_mz"):
        value = getattr(raw_scan, attr, None)
        if value is None:
            continue
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            continue

    text = str(getattr(raw_scan, "scan_type", ""))

    # Anchor on "<mz>@<activation>": taking the second decimal number blindly
    # returns the collision energy when nothing (such as "sid=15.00") precedes
    # the precursor in the string.
    at_hit = re.search(r'(\d+(?:\.\d+)?)@', text)
    if at_hit:
        return float(at_hit.group(1))

    nums = re.findall(r'\d+\.\d+', text)
    return float(nums[1]) if len(nums) > 1 else np.nan

def _make_cast_headers(prefix: str, length: int) -> List[str]:
    """Build ``length`` column names of the form ``<prefix>_<zero-padded index>``.

    Indices start at 0 and are padded to at least five digits (more when
    ``length`` itself has more than five digits).
    """
    pad = max(5, len(str(length)))
    headers = []
    for i in range(length):
        headers.append(prefix + "_" + str(i).zfill(pad))
    return headers

def _strip_ext_basename(x: str) -> str:
    """Return the final path component of ``x`` without a trailing ``.raw``.

    The extension check is case-insensitive; names with any other ending are
    returned unchanged.
    """
    name = os.path.basename(str(x))
    if name.lower().endswith(".raw"):
        name = name[:-4]
    return name

def _norm_key(x: str) -> str:
    """Lower-cased basename of ``x`` with any ``.raw`` extension removed.

    Used as a comparison key so that names differing only in directory,
    letter case or the ``.raw`` suffix compare equal.
    """
    stem = _strip_ext_basename(x)
    return stem.lower()


# -----------------------------
# RAW processing
# -----------------------------
def _accumulate_binned(target: np.ndarray,
                       mz: np.ndarray,
                       intensity: np.ndarray,
                       scale: float,
                       min_idx: int) -> bool:
    """Add ``intensity`` into ``target`` at bin ``round(mz * scale) - min_idx``.

    Peaks whose bin falls outside ``target`` are ignored.  Returns True when at
    least one peak was added, False otherwise (``target`` is then untouched).
    """
    idx = np.rint(mz * scale).astype(np.int32)
    mask = (idx >= min_idx) & (idx < min_idx + target.size)
    if not mask.any():
        return False
    # np.add.at accumulates correctly when several peaks share the same bin.
    np.add.at(target,
              idx[mask] - min_idx,
              intensity[mask].astype(np.float32, copy=False))
    return True


def process_raw_folder(raw_folder: str,
                       out_ms1_csv: str,
                       out_ms2_csv: str) -> Tuple[str, str]:
    """
    Bin every scan of every RAW file in ``raw_folder`` and write two CSVs.

    1) MS1 CSV: one row per sample_name; columns: sample_name, cast_.....
       All "ms" scans of a file are summed into one 0.1 m/z-binned vector.
    2) MS2 CSV: one row per MS2 scan; columns: sample_name, scan_number,
       retention_time, precursor_mz, cast_...
       Each "ms2" scan is binned at 1.0 m/z and, when MS2_NORMALIZE is set,
       divided by its largest bin.

    Files that cannot be opened are reported with a "[skip]" message and
    skipped.  Scans that cannot be read, have no peaks, or whose m/z and
    intensity arrays differ in shape are skipped.  Missing parent directories
    of the output paths are created.

    Raises FileNotFoundError (from ``_raw_files_in``) when the folder is
    missing or holds no RAW files.

    Returns (ms1_csv_path, ms2_csv_path).
    """
    raw_paths = _raw_files_in(raw_folder)

    # MS1 accumulation per sample
    ms1_acc: Dict[str, np.ndarray] = {}

    # MS2 rows: [sample_name, scan_number, retention_time, precursor_mz, bins...]
    ms2_rows: List[list] = []

    for raw_path in raw_paths:
        sample_name = os.path.splitext(os.path.basename(raw_path))[0]
        # Registered before the file is opened, so a file that fails to open
        # still gets an (all-zero) MS1 row.
        if sample_name not in ms1_acc:
            ms1_acc[sample_name] = np.zeros(MS1_LEN, dtype=np.float32)

        try:
            raw = RawFile(raw_path)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_path} ({e})')
            continue

        try:
            total_scans = int(getattr(raw, "number_of_scans", 0) or 0)

            for scan_number in range(1, total_scans + 1):
                try:
                    raw_scan = Scan.from_file(raw._raw_file_access, scan_number=scan_number)
                except Exception:
                    # Best effort: an unreadable scan must not abort the file.
                    continue

                stype = _scan_type_label(raw_scan.scan_type)
                sc_num = getattr(raw_scan.scan_statistics, "scan_number", scan_number)
                try:
                    rt = float(raw.get_retention_time_from_scan_number(sc_num))
                except Exception:
                    rt = np.nan

                mz = _as_float_array(getattr(raw_scan, "preferred_masses", None))
                it = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
                # A shape mismatch would make the boolean-mask indexing below
                # raise, so such scans are skipped like empty ones.
                if mz.size == 0 or it.size == 0 or mz.shape != it.shape:
                    continue

                if stype == "ms":
                    _accumulate_binned(ms1_acc[sample_name], mz, it, 10.0, MS1_MIN_IDX)

                elif stype == "ms2":
                    v = np.zeros(MS2_LEN, dtype=np.float32)
                    if not _accumulate_binned(v, mz, it, 1.0, MS2_MIN_IDX):
                        continue
                    if MS2_NORMALIZE:
                        vmax = float(v.max())
                        if vmax > 0:
                            v /= vmax
                    prec = _precursor_from_scan(raw_scan)
                    ms2_rows.append([sample_name, int(sc_num), float(rt), float(prec)] + v.tolist())
        finally:
            # Always release the RAW handle, even when a scan raised above.
            try:
                raw.dispose()
            except Exception:
                pass

    # Write MS1 CSV
    ms1_headers = ["sample_name"] + _make_cast_headers("cast", MS1_LEN)
    ms1_df = pd.DataFrame([[sn] + vec.tolist() for sn, vec in ms1_acc.items()], columns=ms1_headers)
    os.makedirs(os.path.dirname(os.path.abspath(out_ms1_csv)) or ".", exist_ok=True)
    ms1_df.to_csv(out_ms1_csv, index=False)

    # Write MS2 CSV
    ms2_headers = ["sample_name", "scan_number", "retention_time", "precursor_mz"] + _make_cast_headers("cast", MS2_LEN)
    ms2_df = pd.DataFrame(ms2_rows, columns=ms2_headers)
    os.makedirs(os.path.dirname(os.path.abspath(out_ms2_csv)) or ".", exist_ok=True)
    ms2_df.to_csv(out_ms2_csv, index=False)

    return out_ms1_csv, out_ms2_csv


# -----------------------------
# Make MS2 compatible with ID_import
# -----------------------------
def prepare_ms2_for_id_import(ms2_df: pd.DataFrame, tdportal_df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``ms2_df`` shaped the way ID_import expects it.

    - Renames 'scan_number' -> 'scan'
    - Coerces 'scan' to int, dropping rows whose scan is not numeric
    - Resets the index to 0..n-1 so row i can be addressed as ``df[col][i]``
    - Aligns 'sample_name' to EXACT keys present in tdportal['File Name'] (case/extension robust);
      names without a counterpart in tdportal are left unchanged

    The caller's ``ms2_df`` is never modified.

    Raises:
        KeyError: if tdportal_df has no 'File Name' column, or ms2_df has
            neither a 'scan_number' nor a 'scan' column.
    """
    if "File Name" not in tdportal_df.columns:
        raise KeyError("tdportal must contain 'File Name' column.")

    # rename() already returns a new frame; copy explicitly otherwise so the
    # assignments below never write into the caller's DataFrame.
    if "scan_number" in ms2_df.columns:
        ms2_df = ms2_df.rename(columns={"scan_number": "scan"})
    else:
        ms2_df = ms2_df.copy()
    if "scan" not in ms2_df.columns:
        raise KeyError("ms2_df must contain a 'scan_number' or 'scan' column.")

    # enforce int scans
    ms2_df["scan"] = pd.to_numeric(ms2_df["scan"], errors="coerce").astype("Int64")
    # Dropping rows leaves gaps in the index; reset it so positional and
    # label-based row access agree downstream.
    ms2_df = ms2_df.dropna(subset=["scan"]).reset_index(drop=True)
    ms2_df["scan"] = ms2_df["scan"].astype(int)

    # Resolver from normalized name -> canonical tdportal key
    # (when several tdportal names normalize alike, the last one wins).
    resolver: Dict[str, str] = {
        _norm_key(name): name
        for name in tdportal_df["File Name"].astype(str)
    }

    # Map each sample_name to its canonical key; unmatched names stay as they
    # are and are reported as missing by ID_import.
    ms2_df["sample_name"] = ms2_df["sample_name"].astype(str).map(
        lambda s: resolver.get(_norm_key(s), s)
    )

    return ms2_df


# -----------------------------
# ID_import: attach tdportal identifications to MS2 scans
# -----------------------------
def ID_import(tdportal, databank, cast_path):
    """Attach tdportal identifications to every MS2 scan and write a CSV.

    For each row of ``databank`` the pair (sample_name, scan) is looked up in
    ``tdportal``: a tdportal row matches when its 'File Name' equals the
    sample name and the scan number appears among the integers found in its
    'Fragment Scans' cell.  When several tdportal rows list the same scan for
    a sample, the first such row is used.

    Args:
        tdportal: DataFrame with columns 'File Name', 'Fragment Scans',
            'Sequence', 'Average Mass', 'Accession' and 'PFR'.
        databank: DataFrame (or dict of equal-length columns) with
            'sample_name' and 'scan'.  It is modified in place: the columns
            'sequence', 'MASS', 'Accession' and 'PFR' are added, holding
            ``None`` where no identification matched.
        cast_path: destination CSV path for the annotated table.

    Side effects:
        Prints the set of sample names that do not occur in tdportal and
        writes ``cast_path``.

    Raises:
        KeyError: if tdportal lacks one of the required columns.

    Returns:
        An empty tuple.
    """
    # output column in databank -> source column in tdportal
    id_columns = {
        'sequence': 'Sequence',
        'MASS': 'Average Mass',
        'Accession': 'Accession',
        'PFR': 'PFR',
    }
    absent = [src for src in id_columns.values() if src not in tdportal.columns]
    if absent:
        raise KeyError(f"tdportal is missing required column(s): {absent}")
    id_values = {out: tdportal[src].to_numpy() for out, src in id_columns.items()}

    # sample -> {scan number -> position of the first tdportal row listing it}
    scan_to_row = {}
    file_names = tdportal['File Name'].tolist()
    fragment_scans = tdportal['Fragment Scans'].tolist()
    for row, (file_name, scans_cell) in enumerate(zip(file_names, fragment_scans)):
        per_sample = scan_to_row.setdefault(file_name, {})
        for digits in re.findall(r'\d+', str(scans_cell)):
            # setdefault keeps the first row that mentions a scan.
            per_sample.setdefault(int(digits), row)

    # Iterate by position so a non-contiguous databank index cannot
    # misalign rows with the result lists.
    samples = list(databank['sample_name'])
    scans = list(databank['scan'])

    attached = {out: [] for out in id_columns}
    missing = []

    for sample, scan in tqdm(zip(samples, scans), total=len(scans),
                             desc="Processing scans", ncols=100):
        per_sample = scan_to_row.get(sample)
        if per_sample is None:
            missing.append(sample)
            row = None
        else:
            row = per_sample.get(scan)

        # Exactly one append per output column per scan keeps all lists
        # the same length as databank.
        for out, values in id_values.items():
            attached[out].append(None if row is None else values[row])

    print(set(missing))

    for out, values in attached.items():
        databank[out] = values

    databank = pd.DataFrame(databank)
    databank.to_csv(cast_path, index=False)
    return ()


# -----------------------------
# Orchestrator
# -----------------------------
def run_all(raw_folder: str,
            out_ms1_csv: str,
            out_ms2_csv: str,
            tdportal_csv: str,
            cast_out_csv: str):
    """Run the whole pipeline: RAW folder -> MS1/MS2 CSVs -> MS2 CSV with IDs.

    Prints the path of each file as it is produced.
    """
    # Step 1: bin the RAW files into the two CSVs.
    ms1_path, ms2_path = process_raw_folder(raw_folder, out_ms1_csv, out_ms2_csv)
    print("MS1 CSV:", ms1_path)
    print("MS2 CSV:", ms2_path)

    # Step 2: reload both tables and bring the MS2 table into ID_import's shape.
    tdportal = pd.read_csv(tdportal_csv)
    ms2_ready = prepare_ms2_for_id_import(pd.read_csv(ms2_path), tdportal)

    # Step 3: attach identifications; ID_import indexes databank as databank['col'].
    ID_import(tdportal=tdportal, databank=ms2_ready, cast_path=cast_out_csv)
    print("Final MS2+IDs CSV:", cast_out_csv)


# -----------------------------
# Example usage
# -----------------------------
if __name__ == "__main__":
    # Input locations
    RAW_FOLDER   = r"F:\old_data\usb1\samples"         # folder with .raw files
    TDPORTAL_CSV = r"F:\binary\tdreport.csv"    # must have 'File Name' and 'Fragment Scans'

    # Output locations
    OUT_MS1      = r"F:\binary\ms1_aggregate_per_sample.csv"
    OUT_MS2      = r"F:\binary\ms2_per_scan.csv"
    CAST_OUT     = r"F:\binary\ms2_with_ids.csv"

    run_all(
        raw_folder=RAW_FOLDER,
        out_ms1_csv=OUT_MS1,
        out_ms2_csv=OUT_MS2,
        tdportal_csv=TDPORTAL_CSV,
        cast_out_csv=CAST_OUT,
    )


MS1 CSV: F:\binary\ms1_aggregate_per_sample.csv
MS2 CSV: F:\binary\ms2_per_scan.csv


Processing scans: 100%|████████████████████████████████████| 19307/19307 [00:00<00:00, 22997.07it/s]


set()
Final MS2+IDs CSV: F:\binary\ms2_with_ids.csv
