In [13]:
from fisher_py.data.business import Scan
from fisher_py import RawFile
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import pickle

In [6]:
def wholeCasting(folder_path, cast_path):
    os.chdir(folder_path)

    def helper_regex(text):
        match = re.search(rf"{'Full'}\s+(\w+)", text)
        if match:
            return match.group(1)
        return None
    def find_matching_keys(sequence: str, substring_dict: dict) -> list:
        return [key for key, substrings in substring_dict.items() if any(substring in sequence for substring in substrings)]


    files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    substring_dict_sample = {"TreatmentA": ["TreatmentA"], "TreatmentB": ["TreatmentB"], "TreatmentC": ["TreatmentC"], "TreatmentD": ["TreatmentD"],}
 

    file_name = []
    sample_group = []

    scan_type = []
    scan_number = []
    retention_time = []
    cast_spectra = []

    mz_value = []

    for raw_name in files:
        raw = RawFile(raw_name)
        print(raw_name)
        for i in tqdm(range(1, raw.number_of_scans), desc="Processing scans", ncols=100):
            raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            file_name.append(raw_name)
            sample_group.append(find_matching_keys(raw_name, substring_dict_sample)[0])

            if str(helper_regex(raw_scan.scan_type)) == 'ms':
                scan_type.append('MS1')
                scan_number.append(raw_scan.scan_statistics.scan_number)
                retention_time.append(raw.get_retention_time_from_scan_number(raw_scan.scan_statistics.scan_number))
                mz_value.append('')

                data_intensities = [0]*13690
                scan_masses = raw_scan.preferred_masses
                scan_intensities = raw_scan.preferred_intensities

                for j in range(0,len(scan_masses)):
                    index = int(round(scan_masses[j], 2)*10)
                    if index > 6000 and index < 19360:
                        data_intensities[index-6000] = scan_intensities[j] + data_intensities[index-6000]

                cast_spectra.append(data_intensities)


            if str(helper_regex(raw_scan.scan_type)) == 'ms2':
                scan_type.append('MS2')
                scan_number.append(raw_scan.scan_statistics.scan_number)
                retention_time.append(raw.get_retention_time_from_scan_number(raw_scan.scan_statistics.scan_number))
                mz_value.append(float(re.findall(r'[\d]*[.][\d]+', raw_scan.scan_type)[1]))

                data_intensities = [0]*1600
                scan_masses = raw_scan.preferred_masses
                scan_intensities = raw_scan.preferred_intensities

                for j in range(0,len(scan_masses)):
                    index = round(scan_masses[j])
                    if index > 400 and index < 2000:
                        data_intensities[index-400] = scan_intensities[j] + data_intensities[index-400]
                data_intensities = np.array(data_intensities)
                max_value = np.max(data_intensities)
                data_intensities_norm = data_intensities / max_value
                data_intensities_norm = data_intensities_norm.astype(np.float16)
                data_intensities_norm.tolist()
                cast_spectra.append(data_intensities_norm)

    scan_dict = {'sample_name': file_name, 'group_name': sample_group, 'scan': scan_number,'scan_type': scan_type, 'retntion time': retention_time, 'm/z': mz_value, 'cast spectra': cast_spectra}

    with open(cast_path, "wb") as f:
        pickle.dump(scan_dict, f)

    return()

In [None]:
wholeCasting(folder_path='D:/test1/', cast_path='test')

In [3]:
import os
import re
import glob
import numpy as np
from tqdm import tqdm

# from fisher_py.raw_file import RawFile
# from fisher_py.scan import Scan

def wholeCasting_npz(folder_path: str, out_path: str) -> str:
    """
    Build dense MS1/MS2 matrices from Thermo RAW files in folder_path and save to a compressed NPZ.
    Only *.raw (case-insensitive) files are processed.
    """
    # ---- Config ----
    MS1_MIN_IDX, MS1_LEN = 6000, 13690  # 600.0 m/z * 10 .. 1935.9
    MS1_MAX_EXC = MS1_MIN_IDX + MS1_LEN
    MS2_MIN_IDX, MS2_LEN = 400, 1600    # m/z 400..1999
    MS2_MAX_EXC = MS2_MIN_IDX + MS2_LEN

    def _scan_type_label(text: str) -> str:
        m = re.search(r"Full\s+(\w+)", str(text), flags=re.IGNORECASE)
        return m.group(1).lower() if m else ""

    def _group_from_name(name: str) -> str:
        for g in ("TreatmentA", "TreatmentB", "TreatmentC", "TreatmentD"):
            if g in name:
                return g
        return "Unknown"

    # ---- Find RAW files (case-insensitive) ----
    folder_path = os.path.abspath(folder_path)
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f'Folder not found: "{folder_path}"')

    raw_files = sorted(
        set(glob.glob(os.path.join(folder_path, "*.raw"))) |
        set(glob.glob(os.path.join(folder_path, "*.RAW")))
    )

    if not raw_files:
        raise FileNotFoundError(f'No ".raw" files found in: "{folder_path}"')

    # ---- Collectors ----
    ms1_rows = []
    ms1_scan, ms1_rt, ms1_file_id, ms1_group_id = [], [], [], []

    ms2_rows = []
    ms2_scan, ms2_rt, ms2_prec_mz, ms2_file_id, ms2_group_id = [], [], [], [], []

    file_names, group_names = [], []
    file_to_id, group_to_id = {}, {}

    # ---- Process each RAW file ----
    for raw_path in raw_files:
        raw_name = os.path.basename(raw_path)

        # assign file ID
        if raw_name not in file_to_id:
            file_to_id[raw_name] = len(file_names)
            file_names.append(raw_name)

        group = _group_from_name(raw_name)
        if group not in group_to_id:
            group_to_id[group] = len(group_names)
            group_names.append(group)

        f_id = file_to_id[raw_name]
        g_id = group_to_id[group]

        # open RAW (skip gracefully if not readable)
        try:
            raw = RawFile(raw_path)
        except FileNotFoundError:
            print(f'[skip] Not a valid RAW or unreadable: {raw_path}')
            continue

        total_scans = getattr(raw, "number_of_scans", 0) or 0
        for i in tqdm(range(1, total_scans), desc=raw_name, ncols=100):
            raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            sc_num = raw_scan.scan_statistics.scan_number
            rt = float(raw.get_retention_time_from_scan_number(sc_num))
            stype = _scan_type_label(raw_scan.scan_type)  # 'ms' or 'ms2'

            masses = np.asarray(raw_scan.preferred_masses, dtype=float)
            intens = np.asarray(raw_scan.preferred_intensities, dtype=float)
            if masses.size == 0:
                continue

            if stype == "ms":
                vec = np.zeros(MS1_LEN, dtype=np.float32)
                idx = np.rint(np.round(masses, 2) * 10).astype(np.int32)
                mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
                if mask.any():
                    b = idx[mask] - MS1_MIN_IDX
                    np.add.at(vec, b, intens[mask].astype(np.float32, copy=False))
                vec = vec.astype(np.float32, copy=False)

                ms1_rows.append(vec)
                ms1_scan.append(sc_num)
                ms1_rt.append(rt)
                ms1_file_id.append(f_id)
                ms1_group_id.append(g_id)

            elif stype == "ms2":
                vec = np.zeros(MS2_LEN, dtype=np.float32)
                idx = np.rint(np.round(masses)).astype(np.int32)
                mask = (idx >= MS2_MIN_IDX) & (idx < MS2_MAX_EXC)
                if mask.any():
                    b = idx[mask] - MS2_MIN_IDX
                    np.add.at(vec, b, intens[mask].astype(np.float32, copy=False))
                vmax = float(vec.max())
                vec = (vec / vmax).astype(np.float16, copy=False) if vmax > 0 else vec.astype(np.float16, copy=False)

                m = re.findall(r'\d+\.\d+', str(raw_scan.scan_type))
                prec = float(m[1]) if len(m) > 1 else np.nan

                ms2_rows.append(vec)
                ms2_scan.append(sc_num)
                ms2_rt.append(rt)
                ms2_prec_mz.append(prec)
                ms2_file_id.append(f_id)
                ms2_group_id.append(g_id)
            else:
                # unknown scan type; skip
                continue

    # ---- Stack to dense matrices ----
    MS1 = np.vstack(ms1_rows).astype(np.float32, copy=False) if ms1_rows else np.zeros((0, MS1_LEN), dtype=np.float16)
    MS2 = np.vstack(ms2_rows).astype(np.float16, copy=False) if ms2_rows else np.zeros((0, MS2_LEN), dtype=np.float16)

    # ---- Metadata arrays ----
    ms1_scan = np.asarray(ms1_scan, dtype=np.int32)
    ms1_rt   = np.asarray(ms1_rt,   dtype=np.float32)
    ms1_file_id = np.asarray(ms1_file_id, dtype=np.int32)
    ms1_group_id = np.asarray(ms1_group_id, dtype=np.int32)

    ms2_scan = np.asarray(ms2_scan, dtype=np.int32)
    ms2_rt   = np.asarray(ms2_rt,   dtype=np.float32)
    ms2_prec_mz = np.asarray(ms2_prec_mz, dtype=np.float32)
    ms2_file_id = np.asarray(ms2_file_id, dtype=np.int32)
    ms2_group_id = np.asarray(ms2_group_id, dtype=np.int32)

    file_names_lookup  = np.asarray(file_names,  dtype=object)
    group_names_lookup = np.asarray(group_names, dtype=object)

    # ---- Save compressed NPZ (don’t put it inside the RAW folder to avoid accidental pickup) ----
    if not out_path.endswith(".npz"):
        out_path += ".npz"
    out_path = os.path.abspath(out_path)

    np.savez_compressed(
        out_path,
        ms1_matrix=MS1,
        ms2_matrix=MS2,
        ms1_scan=ms1_scan,
        ms1_rt=ms1_rt,
        ms1_file_id=ms1_file_id,
        ms1_group_id=ms1_group_id,
        ms2_scan=ms2_scan,
        ms2_rt=ms2_rt,
        ms2_precursor_mz=ms2_prec_mz,
        ms2_file_id=ms2_file_id,
        ms2_group_id=ms2_group_id,
        file_names_lookup=file_names_lookup,
        group_names_lookup=group_names_lookup,
    )
    print(f"Saved dense matrices: {out_path}")
    return out_path


In [4]:
wholeCasting_npz(folder_path=r"D:\raw2", out_path=r"D:\casts\test2")


test1.raw: 100%|██████████████████████████████████████████████| 20421/20421 [03:58<00:00, 85.79it/s]
test2.raw: 100%|██████████████████████████████████████████████| 18112/18112 [03:26<00:00, 87.84it/s]


Saved dense matrices: D:\casts\test2.npz


'D:\\casts\\test2.npz'

In [9]:
import numpy as np

# load the file (replace with your real path)
data = np.load(r"D:\casts\test.npz", allow_pickle=True)

# list of all arrays inside
print(data.files)


['ms1_matrix', 'ms2_matrix', 'ms1_scan', 'ms1_rt', 'ms1_file_id', 'ms1_group_id', 'ms2_scan', 'ms2_rt', 'ms2_precursor_mz', 'ms2_file_id', 'ms2_group_id', 'file_names_lookup', 'group_names_lookup']


In [None]:
MS1 = data["ms1_matrix"]        # shape (N_ms1, 13690), float16
MS2 = data["ms2_matrix"]        # shape (N_ms2, 1600), float16

ms1_rt   = data["ms1_rt"]       # retention times for MS1
ms2_rt   = data["ms2_rt"]       # retention times for MS2
ms2_prec = data["ms2_precursor_mz"]

files  = data["file_names_lookup"]   # lookup table for file IDs
groups = data["group_names_lookup"]  # lookup table for group IDs

# Example: map first MS1 row back to file/group
file_id  = data["ms1_file_id"][0]
group_id = data["ms1_group_id"][0]
print("First MS1 comes from:", files[file_id], "group:", groups[group_id])


In [5]:
import os
import re
import glob
import numpy as np
from tqdm import tqdm

# from fisher_py.raw_file import RawFile
# from fisher_py.scan import Scan

def wholeCasting_npz(folder_path: str, out_path: str) -> str:
    """
    Build dense MS1/MS2 matrices from Thermo RAW files in folder_path and save to a compressed NPZ.
    Only *.raw (case-insensitive) files are processed.

    MS1 normalization: column-wise (per m/z bin) by the max across all scans/runs, then cast to float16.
    MS2 normalization: per-scan vector / max(vector), stored as float16.
    """
    # ---- Config ----
    MS1_MIN_IDX, MS1_LEN = 6000, 13690   # 600.0 m/z * 10 .. 1935.9 -> indices [6000, 199?]
    MS1_MAX_EXC = MS1_MIN_IDX + MS1_LEN  # exclusive upper bound
    MS2_MIN_IDX, MS2_LEN = 400, 1600     # integer m/z 400..1999
    MS2_MAX_EXC = MS2_MIN_IDX + MS2_LEN

    # ---- Helpers ----
    def _scan_type_label(text: str) -> str:
        m = re.search(r"Full\s+(\w+)", str(text), flags=re.IGNORECASE)
        return m.group(1).lower() if m else ""

    def _group_from_name(name: str) -> str:
        for g in ("TreatmentA", "TreatmentB", "TreatmentC", "TreatmentD"):
            if g in name:
                return g
        return "Unknown"

    def _as_float_array(x):
        if x is None:
            return np.array([], dtype=float)
        a = np.asarray(x)
        return a.astype(float, copy=False) if a.size else np.array([], dtype=float)

    # ---- Find RAW files (case-insensitive) ----
    folder_path = os.path.abspath(folder_path)
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f'Folder not found: "{folder_path}"')

    raw_files = sorted(
        set(glob.glob(os.path.join(folder_path, "*.raw"))) |
        set(glob.glob(os.path.join(folder_path, "*.RAW")))
    )
    if not raw_files:
        raise FileNotFoundError(f'No ".raw" files found in: "{folder_path}"')

    # ---- Collectors ----
    ms1_rows = []  # keep as float32 until normalization step
    ms1_scan, ms1_rt, ms1_file_id, ms1_group_id = [], [], [], []

    ms2_rows = []  # we will store each as float16 after per-scan normalization
    ms2_scan, ms2_rt, ms2_prec_mz, ms2_file_id, ms2_group_id = [], [], [], [], []

    file_names, group_names = [], []
    file_to_id, group_to_id = {}, {}

    # ---- Process each RAW file ----
    for raw_path in raw_files:
        raw_name = os.path.basename(raw_path)

        # assign file & group IDs
        if raw_name not in file_to_id:
            file_to_id[raw_name] = len(file_names)
            file_names.append(raw_name)
        group = _group_from_name(raw_name)
        if group not in group_to_id:
            group_to_id[group] = len(group_names)
            group_names.append(group)
        f_id = file_to_id[raw_name]
        g_id = group_to_id[group]

        # open RAW (skip gracefully if not readable)
        try:
            raw = RawFile(raw_path)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_path} ({e})')
            continue

        try:
            total_scans = int(getattr(raw, "number_of_scans", 0) or 0)
        except Exception:
            total_scans = 0

        # Thermo scans are typically 1..N inclusive
        for i in tqdm(range(1, total_scans + 1), desc=raw_name, ncols=100):
            try:
                raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            except Exception:
                continue

            try:
                sc_num = int(raw_scan.scan_statistics.scan_number)
            except Exception:
                sc_num = i

            try:
                rt = float(raw.get_retention_time_from_scan_number(sc_num))
            except Exception:
                rt = np.nan

            stype = _scan_type_label(raw_scan.scan_type)  # 'ms' or 'ms2'

            masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
            intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
            if masses.size == 0 or intens.size == 0:
                continue

            if stype == "ms":
                # 0.1 m/z bins: index = round(mz*10)
                idx = np.rint(masses * 10.0).astype(np.int32)
                mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
                if not mask.any():
                    continue

                vec = np.zeros(MS1_LEN, dtype=np.float32)
                np.add.at(vec, idx[mask] - MS1_MIN_IDX, intens[mask].astype(np.float32, copy=False))

                ms1_rows.append(vec)
                ms1_scan.append(sc_num)
                ms1_rt.append(rt)
                ms1_file_id.append(f_id)
                ms1_group_id.append(g_id)

            elif stype == "ms2":
                # 1.0 m/z bins: index = round(mz)
                idx = np.rint(masses).astype(np.int32)
                mask = (idx >= MS2_MIN_IDX) & (idx < MS2_MAX_EXC)
                if not mask.any():
                    continue

                v32 = np.zeros(MS2_LEN, dtype=np.float32)
                np.add.at(v32, idx[mask] - MS2_MIN_IDX, intens[mask].astype(np.float32, copy=False))

                vmax = float(v32.max())
                if vmax > 0:
                    v32 /= vmax
                vec_ms2 = v32.astype(np.float16, copy=False)

                # Try safer precursor extraction; fallback to regex
                prec = np.nan
                for attr in ("precursor_mz", "master_precursor_mz", "isolation_mz"):
                    if hasattr(raw_scan, attr):
                        try:
                            prec = float(getattr(raw_scan, attr))
                            break
                        except Exception:
                            pass
                if np.isnan(prec):
                    m = re.findall(r'\d+\.\d+', str(raw_scan.scan_type))
                    prec = float(m[1]) if len(m) > 1 else np.nan

                ms2_rows.append(vec_ms2)
                ms2_scan.append(sc_num)
                ms2_rt.append(rt)
                ms2_prec_mz.append(prec)
                ms2_file_id.append(f_id)
                ms2_group_id.append(g_id)

            else:
                continue

        # Dispose/close if supported
        try:
            raw.dispose()
        except Exception:
            pass

    # ---- Stack to dense matrices ----
    # MS1: stack as float32, then column-wise normalize by max across scans, then cast to float16.
    if ms1_rows:
        MS1 = np.vstack(ms1_rows).astype(np.float32, copy=False)
        col_max = MS1.max(axis=0)
        # avoid division by zero
        col_max[col_max == 0] = 1.0
        MS1 = (MS1 / col_max).astype(np.float16, copy=False)
    else:
        MS1 = np.zeros((0, MS1_LEN), dtype=np.float16)

    # MS2: rows were already normalized and cast to float16 individually
    MS2 = (np.vstack(ms2_rows).astype(np.float16, copy=False)
           if ms2_rows else np.zeros((0, MS2_LEN), dtype=np.float16))

    # ---- Metadata arrays ----
    ms1_scan     = np.asarray(ms1_scan,     dtype=np.int32)
    ms1_rt       = np.asarray(ms1_rt,       dtype=np.float32)
    ms1_file_id  = np.asarray(ms1_file_id,  dtype=np.int32)
    ms1_group_id = np.asarray(ms1_group_id, dtype=np.int32)

    ms2_scan     = np.asarray(ms2_scan,     dtype=np.int32)
    ms2_rt       = np.asarray(ms2_rt,       dtype=np.float32)
    ms2_prec_mz  = np.asarray(ms2_prec_mz,  dtype=np.float32)
    ms2_file_id  = np.asarray(ms2_file_id,  dtype=np.int32)
    ms2_group_id = np.asarray(ms2_group_id, dtype=np.int32)

    file_names_lookup  = np.asarray(file_names,  dtype=object)
    group_names_lookup = np.asarray(group_names, dtype=object)

    # ---- Save compressed NPZ ----
    if not out_path.endswith(".npz"):
        out_path += ".npz"
    out_path = os.path.abspath(out_path)

    np.savez_compressed(
        out_path,
        ms1_matrix=MS1,
        ms2_matrix=MS2,
        ms1_scan=ms1_scan,
        ms1_rt=ms1_rt,
        ms1_file_id=ms1_file_id,
        ms1_group_id=ms1_group_id,
        ms2_scan=ms2_scan,
        ms2_rt=ms2_rt,
        ms2_precursor_mz=ms2_prec_mz,
        ms2_file_id=ms2_file_id,
        ms2_group_id=ms2_group_id,
        file_names_lookup=file_names_lookup,
        group_names_lookup=group_names_lookup,
    )
    print(f"Saved dense matrices: {out_path}")
    return out_path


In [6]:
wholeCasting_npz(folder_path=r"D:\raw2", out_path=r"D:\casts\test3")

test1.raw: 100%|██████████████████████████████████████████████| 20422/20422 [03:55<00:00, 86.55it/s]
test2.raw: 100%|██████████████████████████████████████████████| 18113/18113 [03:27<00:00, 87.13it/s]


Saved dense matrices: D:\casts\test3.npz


'D:\\casts\\test3.npz'

In [10]:
import os
import re
import glob
import numpy as np
from tqdm import tqdm

# from fisher_py.raw_file import RawFile
# from fisher_py.scan import Scan

def wholeCasting_npz(folder_path: str, out_path: str):
    """
    Build dense MS1/MS2 matrices from Thermo RAW files in folder_path.
    Save MS1 and MS2 separately, and save shared metadata separately.

    Outputs (derived from out_path):
      <base>.ms1.npz  -> {"ms1_matrix": float16 [n_ms1_scans, MS1_LEN]}
      <base>.ms2.npz  -> {"ms2_matrix": float16 [n_ms2_scans, MS2_LEN]}
      <base>.meta.npz -> {
          "ms1_scan","ms1_rt","ms1_file_id","ms1_group_id",
          "ms2_scan","ms2_rt","ms2_precursor_mz","ms2_file_id","ms2_group_id",
          "file_names_lookup","group_names_lookup"
      }

    MS1 normalization: per-bin (column-wise) max across all scans -> [0,1], then float16.
    MS2 normalization: per-scan max -> [0,1], then float16.
    """
    # ---- Config ----
    MS1_MIN_IDX, MS1_LEN = 6000, 13690   # 600.0 m/z * 10 .. 1935.9 -> indices [6000, 6000+13690)
    MS1_MAX_EXC = MS1_MIN_IDX + MS1_LEN
    MS2_MIN_IDX, MS2_LEN = 400, 1600     # integer m/z 400..1999 -> indices [400, 400+1600)
    MS2_MAX_EXC = MS2_MIN_IDX + MS2_LEN

    # ---- Helpers ----
    def _scan_type_label(text: str) -> str:
        m = re.search(r"Full\s+(\w+)", str(text), flags=re.IGNORECASE)
        return m.group(1).lower() if m else ""

    def _group_from_name(name: str) -> str:
        for g in ("TreatmentA", "TreatmentB", "TreatmentC", "TreatmentD"):
            if g in name:
                return g
        return "Unknown"

    def _as_float_array(x):
        if x is None:
            return np.array([], dtype=float)
        a = np.asarray(x)
        return a.astype(float, copy=False) if a.size else np.array([], dtype=float)

    # ---- Find RAW files ----
    folder_path = os.path.abspath(folder_path)
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f'Folder not found: "{folder_path}"')

    raw_files = sorted(
        set(glob.glob(os.path.join(folder_path, "*.raw"))) |
        set(glob.glob(os.path.join(folder_path, "*.RAW")))
    )
    if not raw_files:
        raise FileNotFoundError(f'No ".raw" files found in: "{folder_path}"')

    # ---- Collectors ----
    ms1_rows = []  # float32 until normalization
    ms1_scan, ms1_rt, ms1_file_id, ms1_group_id = [], [], [], []

    ms2_rows = []  # store each row as float16 after per-scan normalization
    ms2_scan, ms2_rt, ms2_prec_mz, ms2_file_id, ms2_group_id = [], [], [], [], []

    file_names, group_names = [], []
    file_to_id, group_to_id = {}, {}

    # ---- Process each RAW file ----
    for raw_path in raw_files:
        raw_name = os.path.basename(raw_path)

        # assign file & group IDs
        if raw_name not in file_to_id:
            file_to_id[raw_name] = len(file_names)
            file_names.append(raw_name)
        group = _group_from_name(raw_name)
        if group not in group_to_id:
            group_to_id[group] = len(group_names)
            group_names.append(group)
        f_id = file_to_id[raw_name]
        g_id = group_to_id[group]

        # open RAW (skip gracefully if not readable)
        try:
            raw = RawFile(raw_path)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_path} ({e})')
            continue

        try:
            total_scans = int(getattr(raw, "number_of_scans", 0) or 0)
        except Exception:
            total_scans = 0

        # Thermo scans are typically 1..N inclusive
        for i in tqdm(range(1, total_scans + 1), desc=raw_name, ncols=100):
            try:
                raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            except Exception:
                continue

            try:
                sc_num = int(raw_scan.scan_statistics.scan_number)
            except Exception:
                sc_num = i

            try:
                rt = float(raw.get_retention_time_from_scan_number(sc_num))
            except Exception:
                rt = np.nan

            stype = _scan_type_label(raw_scan.scan_type)  # 'ms' or 'ms2'

            masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
            intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
            if masses.size == 0 or intens.size == 0:
                continue

            if stype == "ms":
                # 0.1 m/z bins (index = round(mz*10))
                idx = np.rint(masses * 10.0).astype(np.int32)
                mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
                if not mask.any():
                    continue

                vec = np.zeros(MS1_LEN, dtype=np.float32)
                np.add.at(vec, idx[mask] - MS1_MIN_IDX, intens[mask].astype(np.float32, copy=False))

                ms1_rows.append(vec)
                ms1_scan.append(sc_num)
                ms1_rt.append(rt)
                ms1_file_id.append(f_id)
                ms1_group_id.append(g_id)

            elif stype == "ms2":
                # 1.0 m/z bins (index = round(mz))
                idx = np.rint(masses).astype(np.int32)
                mask = (idx >= MS2_MIN_IDX) & (idx < MS2_MAX_EXC)
                if not mask.any():
                    continue

                v32 = np.zeros(MS2_LEN, dtype=np.float32)
                np.add.at(v32, idx[mask] - MS2_MIN_IDX, intens[mask].astype(np.float32, copy=False))

                vmax = float(v32.max())
                if vmax > 0:
                    v32 /= vmax
                vec_ms2 = v32.astype(np.float16, copy=False)

                # Precursor extraction (prefer attribute, fallback to regex)
                prec = np.nan
                for attr in ("precursor_mz", "master_precursor_mz", "isolation_mz"):
                    if hasattr(raw_scan, attr):
                        try:
                            prec = float(getattr(raw_scan, attr))
                            break
                        except Exception:
                            pass
                if np.isnan(prec):
                    m = re.findall(r'\d+\.\d+', str(raw_scan.scan_type))
                    prec = float(m[1]) if len(m) > 1 else np.nan

                ms2_rows.append(vec_ms2)
                ms2_scan.append(sc_num)
                ms2_rt.append(rt)
                ms2_prec_mz.append(prec)
                ms2_file_id.append(f_id)
                ms2_group_id.append(g_id)

            else:
                continue

        # Dispose/close if supported
        try:
            raw.dispose()
        except Exception:
            pass

    # ---- Stack & normalize ----
    # MS1: stack as float32, column-wise normalize by max across scans, cast to float16
    if ms1_rows:
        MS1 = np.vstack(ms1_rows).astype(np.float32, copy=False)
        col_max = MS1.max(axis=0)
        col_max[col_max == 0] = 1.0
        MS1 = (MS1 / col_max).astype(np.float16, copy=False)
    else:
        MS1 = np.zeros((0, MS1_LEN), dtype=np.float16)

    # MS2: rows are already normalized & float16
    MS2 = (np.vstack(ms2_rows).astype(np.float16, copy=False)
           if ms2_rows else np.zeros((0, MS2_LEN), dtype=np.float16))

    # ---- Metadata arrays (shared) ----
    ms1_scan     = np.asarray(ms1_scan,     dtype=np.int32)
    ms1_rt       = np.asarray(ms1_rt,       dtype=np.float32)
    ms1_file_id  = np.asarray(ms1_file_id,  dtype=np.int32)
    ms1_group_id = np.asarray(ms1_group_id, dtype=np.int32)

    ms2_scan     = np.asarray(ms2_scan,     dtype=np.int32)
    ms2_rt       = np.asarray(ms2_rt,       dtype=np.float32)
    ms2_prec_mz  = np.asarray(ms2_prec_mz,  dtype=np.float32)
    ms2_file_id  = np.asarray(ms2_file_id,  dtype=np.int32)
    ms2_group_id = np.asarray(ms2_group_id, dtype=np.int32)

    file_names_lookup  = np.asarray(file_names,  dtype=object)
    group_names_lookup = np.asarray(group_names, dtype=object)

    # ---- Save outputs (separate data + shared meta) ----
    base = os.path.abspath(out_path)
    if base.lower().endswith(".npz"):
        base = base[:-4]

    ms1_path  = f"{base}.ms1.npz"
    ms2_path  = f"{base}.ms2.npz"
    meta_path = f"{base}.meta.npz"

    # data files
    np.savez_compressed(ms1_path, ms1_matrix=MS1)
    np.savez_compressed(ms2_path, ms2_matrix=MS2)

    # metadata file
    np.savez_compressed(
        meta_path,
        ms1_scan=ms1_scan,
        ms1_rt=ms1_rt,
        ms1_file_id=ms1_file_id,
        ms1_group_id=ms1_group_id,
        ms2_scan=ms2_scan,
        ms2_rt=ms2_rt,
        ms2_precursor_mz=ms2_prec_mz,
        ms2_file_id=ms2_file_id,
        ms2_group_id=ms2_group_id,
        file_names_lookup=file_names_lookup,
        group_names_lookup=group_names_lookup,
    )

    print(f"Saved MS1:  {ms1_path}  shape={MS1.shape}")
    print(f"Saved MS2:  {ms2_path}  shape={MS2.shape}")
    print(f"Saved meta: {meta_path}")
    return {"ms1": ms1_path, "ms2": ms2_path, "meta": meta_path}


In [12]:
wholeCasting_npz(folder_path=r"D:\raw2", out_path=r"D:\casts\test4")

KeyboardInterrupt: 

In [13]:
import os
import re
import glob
import numpy as np
from tqdm import tqdm
import gc

# from fisher_py.raw_file import RawFile
# from fisher_py.scan import Scan

def wholeCasting_npz(folder_path: str, out_path: str):
    """
    Build dense MS1/MS2 matrices from Thermo RAW files in folder_path.
    Save MS2 first (with shared metadata), then MS1 (with the same metadata embedded).

    Outputs (derived from out_path):
      <base>.ms2.npz  -> ms2_matrix (float16) + shared metadata
      <base>.ms1.npz  -> ms1_matrix (float16) + shared metadata

    MS1 normalization: per-bin (column-wise) max across all scans -> [0,1], then float16.
    MS2 normalization: per-scan max -> [0,1], then float16.
    """
    # ---- Config ----
    MS1_MIN_IDX, MS1_LEN = 6000, 13690   # 600.0 m/z * 10 .. 1935.9 -> indices [6000, 6000+13690)
    MS1_MAX_EXC = MS1_MIN_IDX + MS1_LEN
    MS2_MIN_IDX, MS2_LEN = 400, 1600     # integer m/z 400..1999 -> indices [400, 400+1600)
    MS2_MAX_EXC = MS2_MIN_IDX + MS2_LEN

    # ---- Helpers ----
    def _scan_type_label(text: str) -> str:
        m = re.search(r"Full\s+(\w+)", str(text), flags=re.IGNORECASE)
        return m.group(1).lower() if m else ""

    def _group_from_name(name: str) -> str:
        for g in ("TreatmentA", "TreatmentB", "TreatmentC", "TreatmentD"):
            if g in name:
                return g
        return "Unknown"

    def _as_float_array(x):
        if x is None:
            return np.array([], dtype=float)
        a = np.asarray(x)
        return a.astype(float, copy=False) if a.size else np.array([], dtype=float)

    # ---- Find RAW files ----
    folder_path = os.path.abspath(folder_path)
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f'Folder not found: "{folder_path}"')

    raw_files = sorted(
        set(glob.glob(os.path.join(folder_path, "*.raw"))) |
        set(glob.glob(os.path.join(folder_path, "*.RAW")))
    )
    if not raw_files:
        raise FileNotFoundError(f'No ".raw" files found in: "{folder_path}"')

    # ---- Collectors ----
    ms1_rows = []  # float32 until normalization
    ms1_scan, ms1_rt, ms1_file_id, ms1_group_id = [], [], [], []

    ms2_rows = []  # store rows as float16 after per-scan normalization
    ms2_scan, ms2_rt, ms2_prec_mz, ms2_file_id, ms2_group_id = [], [], [], [], []

    file_names, group_names = [], []
    file_to_id, group_to_id = {}, {}

    # ---- Process each RAW file ----
    for raw_path in raw_files:
        raw_name = os.path.basename(raw_path)

        # assign file & group IDs
        if raw_name not in file_to_id:
            file_to_id[raw_name] = len(file_names)
            file_names.append(raw_name)
        group = _group_from_name(raw_name)
        if group not in group_to_id:
            group_to_id[group] = len(group_names)
            group_names.append(group)
        f_id = file_to_id[raw_name]
        g_id = group_to_id[group]

        # open RAW (skip gracefully if not readable)
        try:
            raw = RawFile(raw_path)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_path} ({e})')
            continue

        try:
            total_scans = int(getattr(raw, "number_of_scans", 0) or 0)
        except Exception:
            total_scans = 0

        # Thermo scans are typically 1..N inclusive
        for i in tqdm(range(1, total_scans + 1), desc=raw_name, ncols=100):
            try:
                raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            except Exception:
                continue

            try:
                sc_num = int(raw_scan.scan_statistics.scan_number)
            except Exception:
                sc_num = i

            try:
                rt = float(raw.get_retention_time_from_scan_number(sc_num))
            except Exception:
                rt = np.nan

            stype = _scan_type_label(raw_scan.scan_type)  # 'ms' or 'ms2'

            masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
            intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
            if masses.size == 0 or intens.size == 0:
                continue

            if stype == "ms":
                # 0.1 m/z bins (index = round(mz*10))
                idx = np.rint(masses * 10.0).astype(np.int32)
                mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
                if not mask.any():
                    continue

                vec = np.zeros(MS1_LEN, dtype=np.float32)
                np.add.at(vec, idx[mask] - MS1_MIN_IDX, intens[mask].astype(np.float32, copy=False))

                ms1_rows.append(vec)
                ms1_scan.append(sc_num)
                ms1_rt.append(rt)
                ms1_file_id.append(f_id)
                ms1_group_id.append(g_id)

            elif stype == "ms2":
                # 1.0 m/z bins (index = round(mz))
                idx = np.rint(masses).astype(np.int32)
                mask = (idx >= MS2_MIN_IDX) & (idx < MS2_MAX_EXC)
                if not mask.any():
                    continue

                v32 = np.zeros(MS2_LEN, dtype=np.float32)
                np.add.at(v32, idx[mask] - MS2_MIN_IDX, intens[mask].astype(np.float32, copy=False))

                vmax = float(v32.max())
                if vmax > 0:
                    v32 /= vmax
                vec_ms2 = v32.astype(np.float16, copy=False)

                # Precursor extraction (prefer attribute, fallback to regex)
                prec = np.nan
                for attr in ("precursor_mz", "master_precursor_mz", "isolation_mz"):
                    if hasattr(raw_scan, attr):
                        try:
                            prec = float(getattr(raw_scan, attr))
                            break
                        except Exception:
                            pass
                if np.isnan(prec):
                    m = re.findall(r'\d+\.\d+', str(raw_scan.scan_type))
                    prec = float(m[1]) if len(m) > 1 else np.nan

                ms2_rows.append(vec_ms2)
                ms2_scan.append(sc_num)
                ms2_rt.append(rt)
                ms2_prec_mz.append(prec)
                ms2_file_id.append(f_id)
                ms2_group_id.append(g_id)

            else:
                continue

        # Dispose/close if supported
        try:
            raw.dispose()
        except Exception:
            pass

    # ---- Build shared metadata once ----
    ms1_scan_arr     = np.asarray(ms1_scan,     dtype=np.int32)
    ms1_rt_arr       = np.asarray(ms1_rt,       dtype=np.float32)
    ms1_file_id_arr  = np.asarray(ms1_file_id,  dtype=np.int32)
    ms1_group_id_arr = np.asarray(ms1_group_id, dtype=np.int32)

    ms2_scan_arr     = np.asarray(ms2_scan,     dtype=np.int32)
    ms2_rt_arr       = np.asarray(ms2_rt,       dtype=np.float32)
    ms2_prec_mz_arr  = np.asarray(ms2_prec_mz,  dtype=np.float32)
    ms2_file_id_arr  = np.asarray(ms2_file_id,  dtype=np.int32)
    ms2_group_id_arr = np.asarray(ms2_group_id, dtype=np.int32)

    file_names_lookup  = np.asarray(file_names,  dtype=object)
    group_names_lookup = np.asarray(group_names, dtype=object)

    base = os.path.abspath(out_path)
    if base.lower().endswith(".npz"):
        base = base[:-4]
    ms2_path = f"{base}.ms2.npz"
    ms1_path = f"{base}.ms1.npz"

    # ---- Save MS2 first (then free) ----
    if ms2_rows:
        MS2 = np.vstack(ms2_rows).astype(np.float16, copy=False)
    else:
        MS2 = np.zeros((0, MS2_LEN), dtype=np.float16)

    np.savez_compressed(
        ms2_path,
        ms2_matrix=MS2,
        # embed shared metadata
        ms1_scan=ms1_scan_arr,
        ms1_rt=ms1_rt_arr,
        ms1_file_id=ms1_file_id_arr,
        ms1_group_id=ms1_group_id_arr,
        ms2_scan=ms2_scan_arr,
        ms2_rt=ms2_rt_arr,
        ms2_precursor_mz=ms2_prec_mz_arr,
        ms2_file_id=ms2_file_id_arr,
        ms2_group_id=ms2_group_id_arr,
        file_names_lookup=file_names_lookup,
        group_names_lookup=group_names_lookup,
    )
    print(f"Saved MS2: {ms2_path}  shape={MS2.shape}")
    del MS2, ms2_rows  # free memory
    gc.collect()

    # ---- Now build & save MS1 (second) ----
    if ms1_rows:
        MS1 = np.vstack(ms1_rows).astype(np.float32, copy=False)
        col_max = MS1.max(axis=0)
        col_max[col_max == 0] = 1.0
        MS1 = (MS1 / col_max).astype(np.float16, copy=False)
    else:
        MS1 = np.zeros((0, MS1_LEN), dtype=np.float16)

    np.savez_compressed(
        ms1_path,
        ms1_matrix=MS1,
        # embed the same shared metadata again
        ms1_scan=ms1_scan_arr,
        ms1_rt=ms1_rt_arr,
        ms1_file_id=ms1_file_id_arr,
        ms1_group_id=ms1_group_id_arr,
        ms2_scan=ms2_scan_arr,
        ms2_rt=ms2_rt_arr,
        ms2_precursor_mz=ms2_prec_mz_arr,
        ms2_file_id=ms2_file_id_arr,
        ms2_group_id=ms2_group_id_arr,
        file_names_lookup=file_names_lookup,
        group_names_lookup=group_names_lookup,
    )
    print(f"Saved MS1: {ms1_path}  shape={MS1.shape}")

    # Optionally free MS1 too
    del MS1, ms1_rows
    gc.collect()

    return {"ms2": ms2_path, "ms1": ms1_path}


In [None]:
wholeCasting_npz(folder_path=r"D:\raw2", out_path=r"D:\casts\test5")

In [2]:
import os
import re
import glob
import numpy as np
from tqdm import tqdm
import gc

# from fisher_py.raw_file import RawFile
# from fisher_py.scan import Scan

def wholeCasting_npz(folder_paths, out_path: str):
    """
    Build dense MS1/MS2 matrices from Thermo RAW files found in one or more folders.
    Saves MS2 first (with metadata), then MS1 (with metadata), and also writes a standalone metadata file.

    folder_paths: str or Iterable[str] of directories
    Outputs (derived from out_path):
      <base>.ms2.npz   -> ms2_matrix (float16) + metadata
      <base>.ms1.npz   -> ms1_matrix (float16) + metadata
      <base>.meta.npz  -> metadata only (no matrices)

    MS1 normalization: per-bin (column-wise) max across all scans -> [0,1], then float16.
    MS2 normalization: per-scan max -> [0,1], then float16.
    """
    # ---- Config ----
    MS1_MIN_IDX, MS1_LEN = 6000, 13690   # 600.0 m/z * 10 .. 1935.9
    MS1_MAX_EXC = MS1_MIN_IDX + MS1_LEN
    MS2_MIN_IDX, MS2_LEN = 400, 1600     # m/z 400..1999
    MS2_MAX_EXC = MS2_MIN_IDX + MS2_LEN

    # ---- Helpers ----
    def _scan_type_label(text: str) -> str:
        m = re.search(r"Full\s+(\w+)", str(text), flags=re.IGNORECASE)
        return m.group(1).lower() if m else ""

    def _group_from_name(name: str) -> str:
        for g in ("TreatmentA", "TreatmentB", "TreatmentC", "TreatmentD"):
            if g in name:
                return g
        return "Unknown"

    def _as_float_array(x):
        if x is None:
            return np.array([], dtype=float)
        a = np.asarray(x)
        return a.astype(float, copy=False) if a.size else np.array([], dtype=float)

    def _ensure_folder_list(paths):
        if isinstance(paths, (list, tuple)):
            return list(paths)
        return [paths]

    # ---- Gather RAW files from all folders ----
    folder_list = _ensure_folder_list(folder_paths)
    raw_files = []
    for fp in folder_list:
        fp_abs = os.path.abspath(fp)
        if not os.path.isdir(fp_abs):
            raise FileNotFoundError(f'Folder not found: "{fp_abs}"')
        raw_files.extend(glob.glob(os.path.join(fp_abs, "*.raw")))
        raw_files.extend(glob.glob(os.path.join(fp_abs, "*.RAW")))

    raw_files = sorted(set(os.path.abspath(p) for p in raw_files))
    if not raw_files:
        raise FileNotFoundError(f'No ".raw" files found in: {", ".join(map(os.path.abspath, folder_list))}')

    # ---- Collectors ----
    ms1_rows = []
    ms1_scan, ms1_rt, ms1_file_id, ms1_group_id = [], [], [], []

    ms2_rows = []
    ms2_scan, ms2_rt, ms2_prec_mz, ms2_file_id, ms2_group_id = [], [], [], [], []

    file_basenames, file_abspaths, group_names = [], [], []
    file_to_id, group_to_id = {}, {}

    # ---- Process each RAW file ----
    for raw_path in raw_files:
        raw_abs = os.path.abspath(raw_path)
        raw_name = os.path.basename(raw_abs)

        if raw_abs not in file_to_id:
            file_to_id[raw_abs] = len(file_basenames)
            file_basenames.append(raw_name)
            file_abspaths.append(raw_abs)

        group = _group_from_name(raw_name)
        if group not in group_to_id:
            group_to_id[group] = len(group_names)
            group_names.append(group)

        f_id = file_to_id[raw_abs]
        g_id = group_to_id[group]

        # open RAW
        try:
            raw = RawFile(raw_abs)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_abs} ({e})')
            continue

        total_scans = int(getattr(raw, "number_of_scans", 0) or 0)

        for i in tqdm(range(1, total_scans + 1), desc=raw_name, ncols=100):
            try:
                raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            except Exception:
                continue

            sc_num = getattr(raw_scan.scan_statistics, "scan_number", i)
            rt = float(raw.get_retention_time_from_scan_number(sc_num))

            stype = _scan_type_label(raw_scan.scan_type)

            masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
            intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
            if masses.size == 0 or intens.size == 0:
                continue

            if stype == "ms":
                idx = np.rint(masses * 10.0).astype(np.int32)
                mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
                if not mask.any():
                    continue
                vec = np.zeros(MS1_LEN, dtype=np.float32)
                np.add.at(vec, idx[mask] - MS1_MIN_IDX, intens[mask].astype(np.float32, copy=False))

                ms1_rows.append(vec)
                ms1_scan.append(sc_num)
                ms1_rt.append(rt)
                ms1_file_id.append(f_id)
                ms1_group_id.append(g_id)

            elif stype == "ms2":
                idx = np.rint(masses).astype(np.int32)
                mask = (idx >= MS2_MIN_IDX) & (idx < MS2_MAX_EXC)
                if not mask.any():
                    continue
                v32 = np.zeros(MS2_LEN, dtype=np.float32)
                np.add.at(v32, idx[mask] - MS2_MIN_IDX, intens[mask].astype(np.float32, copy=False))
                vmax = float(v32.max())
                if vmax > 0:
                    v32 /= vmax
                vec_ms2 = v32.astype(np.float16, copy=False)

                prec = np.nan
                for attr in ("precursor_mz", "master_precursor_mz", "isolation_mz"):
                    if hasattr(raw_scan, attr):
                        try:
                            prec = float(getattr(raw_scan, attr))
                            break
                        except Exception:
                            pass
                if np.isnan(prec):
                    m = re.findall(r'\d+\.\d+', str(raw_scan.scan_type))
                    prec = float(m[1]) if len(m) > 1 else np.nan

                ms2_rows.append(vec_ms2)
                ms2_scan.append(sc_num)
                ms2_rt.append(rt)
                ms2_prec_mz.append(prec)
                ms2_file_id.append(f_id)
                ms2_group_id.append(g_id)

        try:
            raw.dispose()
        except Exception:
            pass

    # ---- Build shared metadata ----
    metadata = dict(
        ms1_scan=np.asarray(ms1_scan, dtype=np.int32),
        ms1_rt=np.asarray(ms1_rt, dtype=np.float32),
        ms1_file_id=np.asarray(ms1_file_id, dtype=np.int32),
        ms1_group_id=np.asarray(ms1_group_id, dtype=np.int32),
        ms2_scan=np.asarray(ms2_scan, dtype=np.int32),
        ms2_rt=np.asarray(ms2_rt, dtype=np.float32),
        ms2_precursor_mz=np.asarray(ms2_prec_mz, dtype=np.float32),
        ms2_file_id=np.asarray(ms2_file_id, dtype=np.int32),
        ms2_group_id=np.asarray(ms2_group_id, dtype=np.int32),
        file_names_lookup=np.asarray(file_basenames, dtype=object),
        file_paths_lookup=np.asarray(file_abspaths, dtype=object),
        group_names_lookup=np.asarray(group_names, dtype=object),
    )

    base = os.path.abspath(out_path)
    if base.lower().endswith(".npz"):
        base = base[:-4]
    ms2_path = f"{base}.ms2.npz"
    ms1_path = f"{base}.ms1.npz"
    meta_path = f"{base}.meta.npz"

    # ---- Save MS2 first ----
    if ms2_rows:
        MS2 = np.vstack(ms2_rows).astype(np.float16, copy=False)
    else:
        MS2 = np.zeros((0, MS2_LEN), dtype=np.float16)
    np.savez_compressed(ms2_path, ms2_matrix=MS2, **metadata)
    print(f"Saved MS2: {ms2_path}  shape={MS2.shape}")
    del MS2, ms2_rows
    gc.collect()

    # ---- Save MS1 second ----
    if ms1_rows:
        MS1 = np.vstack(ms1_rows).astype(np.float32, copy=False)
        col_max = MS1.max(axis=0)
        col_max[col_max == 0] = 1.0
        MS1 = (MS1 / col_max).astype(np.float16, copy=False)
    else:
        MS1 = np.zeros((0, MS1_LEN), dtype=np.float16)
    np.savez_compressed(ms1_path, ms1_matrix=MS1, **metadata)
    print(f"Saved MS1: {ms1_path}  shape={MS1.shape}")
    del MS1, ms1_rows
    gc.collect()

    # ---- Save metadata standalone ----
    np.savez_compressed(meta_path, **metadata)
    print(f"Saved META: {meta_path}")

    return {"ms2": ms2_path, "ms1": ms1_path, "meta": meta_path}


In [None]:
wholeCasting_npz([r"D:\TreatmentABC", r"D:\TreatmentD"], r"D:\casts\test6")

In [6]:
import os
import re
import glob
import numpy as np
from tqdm import tqdm
import gc
import tempfile

# If you have fisher_py installed, uncomment:
# from fisher_py.raw_file import RawFile
# from fisher_py.scan import Scan

# -----------------------------
# Shared config / helpers
# -----------------------------
MS1_MIN_IDX, MS1_LEN = 6000, 13690   # 600.0 m/z * 10 .. 1935.9
MS1_MAX_EXC = MS1_MIN_IDX + MS1_LEN
MS2_MIN_IDX, MS2_LEN = 400, 1600     # m/z 400..1999
MS2_MAX_EXC = MS2_MIN_IDX + MS2_LEN

def _scan_type_label(text: str) -> str:
    m = re.search(r"Full\s+(\w+)", str(text), flags=re.IGNORECASE)
    return m.group(1).lower() if m else ""

def _group_from_name(name: str) -> str:
    for g in ("TreatmentA", "TreatmentB", "TreatmentC", "TreatmentD"):
        if g in name:
            return g
    return "Unknown"

def _as_float_array(x):
    if x is None:
        return np.array([], dtype=float)
    a = np.asarray(x)
    return a.astype(float, copy=False) if a.size else np.array([], dtype=float)

def _ensure_folder_list(paths):
    if isinstance(paths, (list, tuple)):
        return list(paths)
    return [paths]

def _gather_raw_files(folder_paths):
    folder_list = _ensure_folder_list(folder_paths)
    raw_files = []
    for fp in folder_list:
        fp_abs = os.path.abspath(fp)
        if not os.path.isdir(fp_abs):
            raise FileNotFoundError(f'Folder not found: "{fp_abs}"')
        raw_files.extend(glob.glob(os.path.join(fp_abs, "*.raw")))
        raw_files.extend(glob.glob(os.path.join(fp_abs, "*.RAW")))
    raw_files = sorted(set(os.path.abspath(p) for p in raw_files))
    if not raw_files:
        raise FileNotFoundError(
            f'No ".raw" files found in: {", ".join(map(os.path.abspath, folder_list))}'
        )
    return raw_files

def _base_paths(out_path: str):
    base = os.path.abspath(out_path)
    if base.lower().endswith(".npz"):
        base = base[:-4]
    return (f"{base}.ms2.npz", f"{base}.ms1.npz", f"{base}.meta.npz")

# -----------------------------
# Pass 1: MS2 + Metadata
# -----------------------------
def build_ms2_and_meta(folder_paths, out_path: str):
    """
    Build MS2 (per-scan normalized) and shared metadata, saving:
      <base>.ms2.npz  -> ms2_matrix (float16) + metadata
      <base>.meta.npz -> metadata only
    """
    raw_files = _gather_raw_files(folder_paths)

    ms1_scan, ms1_rt, ms1_file_id, ms1_group_id = [], [], [], []

    ms2_rows = []
    ms2_scan, ms2_rt, ms2_prec_mz, ms2_file_id, ms2_group_id = [], [], [], [], []

    file_basenames, file_abspaths, group_names = [], [], []
    file_to_id, group_to_id = {}, {}

    for raw_abs in raw_files:
        raw_name = os.path.basename(raw_abs)

        if raw_abs not in file_to_id:
            file_to_id[raw_abs] = len(file_basenames)
            file_basenames.append(raw_name)
            file_abspaths.append(raw_abs)

        group = _group_from_name(raw_name)
        if group not in group_to_id:
            group_to_id[group] = len(group_names)
            group_names.append(group)

        f_id = file_to_id[raw_abs]
        g_id = group_to_id[group]

        # open RAW
        try:
            raw = RawFile(raw_abs)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_abs} ({e})')
            continue

        total_scans = int(getattr(raw, "number_of_scans", 0) or 0)

        for i in tqdm(range(1, total_scans + 1), desc=f"[Pass1-MS2] {raw_name}", ncols=100):
            try:
                raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            except Exception:
                continue

            sc_num = getattr(raw_scan.scan_statistics, "scan_number", i)
            try:
                rt = float(raw.get_retention_time_from_scan_number(sc_num))
            except Exception:
                rt = np.nan

            stype = _scan_type_label(raw_scan.scan_type)

            masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
            intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
            if masses.size == 0 or intens.size == 0:
                continue

            if stype == "ms":
                # Just metadata for MS1 in this pass
                idx = np.rint(masses * 10.0).astype(np.int32)
                mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
                if not mask.any():
                    continue
                ms1_scan.append(sc_num)
                ms1_rt.append(rt)
                ms1_file_id.append(f_id)
                ms1_group_id.append(g_id)

            elif stype == "ms2":
                idx = np.rint(masses).astype(np.int32)
                mask = (idx >= MS2_MIN_IDX) & (idx < MS2_MAX_EXC)
                if not mask.any():
                    continue

                v32 = np.zeros(MS2_LEN, dtype=np.float32)
                np.add.at(v32, idx[mask] - MS2_MIN_IDX, intens[mask].astype(np.float32, copy=False))
                vmax = float(v32.max())
                if vmax > 0:
                    v32 /= vmax
                vec_ms2 = v32.astype(np.float16, copy=False)

                # precursor m/z extraction
                prec = np.nan
                for attr in ("precursor_mz", "master_precursor_mz", "isolation_mz"):
                    if hasattr(raw_scan, attr):
                        try:
                            prec = float(getattr(raw_scan, attr))
                            break
                        except Exception:
                            pass
                if np.isnan(prec):
                    m = re.findall(r'\d+\.\d+', str(raw_scan.scan_type))
                    prec = float(m[1]) if len(m) > 1 else np.nan

                ms2_rows.append(vec_ms2)
                ms2_scan.append(sc_num)
                ms2_rt.append(rt)
                ms2_prec_mz.append(prec)
                ms2_file_id.append(f_id)
                ms2_group_id.append(g_id)

        try:
            raw.dispose()
        except Exception:
            pass

    # Build metadata (shared by both MS1 and MS2 output files)
    metadata = dict(
        ms1_scan=np.asarray(ms1_scan, dtype=np.int32),
        ms1_rt=np.asarray(ms1_rt, dtype=np.float32),
        ms1_file_id=np.asarray(ms1_file_id, dtype=np.int32),
        ms1_group_id=np.asarray(ms1_group_id, dtype=np.int32),

        ms2_scan=np.asarray(ms2_scan, dtype=np.int32),
        ms2_rt=np.asarray(ms2_rt, dtype=np.float32),
        ms2_precursor_mz=np.asarray(ms2_prec_mz, dtype=np.float32),
        ms2_file_id=np.asarray(ms2_file_id, dtype=np.int32),
        ms2_group_id=np.asarray(ms2_group_id, dtype=np.int32),

        file_names_lookup=np.asarray(file_basenames, dtype=object),
        file_paths_lookup=np.asarray(file_abspaths, dtype=object),
        group_names_lookup=np.asarray(group_names, dtype=object),
    )

    ms2_path, _, meta_path = _base_paths(out_path)

    # Save MS2
    if ms2_rows:
        MS2 = np.vstack(ms2_rows).astype(np.float16, copy=False)
    else:
        MS2 = np.zeros((0, MS2_LEN), dtype=np.float16)
    np.savez_compressed(ms2_path, ms2_matrix=MS2, **metadata)
    print(f"Saved MS2: {ms2_path}  shape={MS2.shape}")
    del MS2, ms2_rows
    gc.collect()

    # Save metadata standalone
    np.savez_compressed(meta_path, **metadata)
    print(f"Saved META: {meta_path}")

    return {"ms2": ms2_path, "meta": meta_path}


# -----------------------------
# Pass 2: MS1 only (streamed, low-RAM)
# -----------------------------
def build_ms1_only(folder_paths, out_path: str, meta_path: str = None):
    """
    Build MS1 matrix in a streamed way (low RAM) and save:
      <base>.ms1.npz -> ms1_matrix (float16) + metadata

    MS1 normalization: per-column (bin-wise) max across *all* MS1 scans.

    Two sweeps:
      1) Count MS1 rows + compute col_max (no storage)
      2) Allocate memmap [n_rows, MS1_LEN], fill normalized rows, save compressed NPZ.

    If meta_path is provided, that metadata is embedded into the MS1 NPZ; otherwise it is
    rebuilt on the fly (slower) and saved alongside.
    """
    raw_files = _gather_raw_files(folder_paths)
    ms1_count = 0
    col_max = np.zeros(MS1_LEN, dtype=np.float32)

    # If metadata was saved in Pass 1, load it to re-use
    metadata = None
    if meta_path is not None and os.path.exists(meta_path):
        with np.load(meta_path, allow_pickle=True) as meta_npz:
            metadata = {k: meta_npz[k] for k in meta_npz.files}

    # If no metadata given, we’ll rebuild only MS1-related metadata
    rebuild_metadata = metadata is None

    # For (re)building MS1 metadata if needed
    if rebuild_metadata:
        file_basenames, file_abspaths, group_names = [], [], []
        file_to_id, group_to_id = {}, {}
        ms1_scan, ms1_rt, ms1_file_id, ms1_group_id = [], [], [], []

    # -------- Pass 2a: count & col_max --------
    for raw_abs in raw_files:
        raw_name = os.path.basename(raw_abs)

        if rebuild_metadata:
            if raw_abs not in file_to_id:
                file_to_id[raw_abs] = len(file_basenames)
                file_basenames.append(raw_name)
                file_abspaths.append(raw_abs)

            group = _group_from_name(raw_name)
            if group not in group_to_id:
                group_to_id[group] = len(group_names)
                group_names.append(group)

            f_id = file_to_id[raw_abs]
            g_id = group_to_id[group]
        else:
            f_id = g_id = None  # unused

        # open RAW
        try:
            raw = RawFile(raw_abs)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_abs} ({e})')
            continue

        total_scans = int(getattr(raw, "number_of_scans", 0) or 0)

        for i in tqdm(range(1, total_scans + 1), desc=f"[Pass2a-MS1 count] {raw_name}", ncols=100):
            try:
                raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            except Exception:
                continue

            stype = _scan_type_label(raw_scan.scan_type)
            if stype != "ms":
                continue

            masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
            intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
            if masses.size == 0 or intens.size == 0:
                continue

            idx = np.rint(masses * 10.0).astype(np.int32)
            mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
            if not mask.any():
                continue

            # Update col_max without storing the full row
            vec = np.zeros(MS1_LEN, dtype=np.float32)
            np.add.at(vec, idx[mask] - MS1_MIN_IDX, intens[mask].astype(np.float32, copy=False))
            np.maximum(col_max, vec, out=col_max)
            ms1_count += 1

            # (Re)build minimal metadata for MS1 if needed
            if rebuild_metadata:
                sc_num = getattr(raw_scan.scan_statistics, "scan_number", i)
                try:
                    rt = float(raw.get_retention_time_from_scan_number(sc_num))
                except Exception:
                    rt = np.nan
                ms1_scan.append(sc_num)
                ms1_rt.append(rt)
                ms1_file_id.append(f_id)
                ms1_group_id.append(g_id)

        try:
            raw.dispose()
        except Exception:
            pass

    # Guard: no MS1 scans
    if ms1_count == 0:
        # Build/save empty file with metadata
        ms1_path = _base_paths(out_path)[1]
        if metadata is None and rebuild_metadata:
            metadata = dict(
                ms1_scan=np.asarray(ms1_scan, dtype=np.int32),
                ms1_rt=np.asarray(ms1_rt, dtype=np.float32),
                ms1_file_id=np.asarray(ms1_file_id, dtype=np.int32),
                ms1_group_id=np.asarray(ms1_group_id, dtype=np.int32),
                file_names_lookup=np.asarray(file_basenames, dtype=object),
                file_paths_lookup=np.asarray(file_abspaths, dtype=object),
                group_names_lookup=np.asarray(group_names, dtype=object),
            )
        elif metadata is None:
            metadata = {}
        empty = np.zeros((0, MS1_LEN), dtype=np.float16)
        np.savez_compressed(ms1_path, ms1_matrix=empty, **metadata)
        print(f"Saved MS1 (empty): {ms1_path}")
        return {"ms1": ms1_path}

    # Avoid divide-by-zero
    col_max[col_max == 0] = 1.0

    # If we had to rebuild metadata, finalize it now. Otherwise we already have it.
    if rebuild_metadata:
        metadata = dict(
            ms1_scan=np.asarray(ms1_scan, dtype=np.int32),
            ms1_rt=np.asarray(ms1_rt, dtype=np.float32),
            ms1_file_id=np.asarray(ms1_file_id, dtype=np.int32),
            ms1_group_id=np.asarray(ms1_group_id, dtype=np.int32),
            # Note: MS2-related fields will be missing (that’s OK if you only need MS1 here)
            file_names_lookup=np.asarray(file_basenames, dtype=object),
            file_paths_lookup=np.asarray(file_abspaths, dtype=object),
            group_names_lookup=np.asarray(group_names, dtype=object),
        )

    # -------- Pass 2b: fill normalized rows into a memmap --------
    ms1_path = _base_paths(out_path)[1]

    # Create a temp directory for the memmap file
    with tempfile.TemporaryDirectory() as tmpdir:
        memmap_path = os.path.join(tmpdir, "ms1_memmap.npy")
        MS1_map = np.memmap(memmap_path, dtype=np.float16, mode="w+", shape=(ms1_count, MS1_LEN))

        row = 0
        for raw_abs in raw_files:
            raw_name = os.path.basename(raw_abs)

            try:
                raw = RawFile(raw_abs)
            except Exception as e:
                print(f'[skip] Cannot open RAW: {raw_abs} ({e})')
                continue

            total_scans = int(getattr(raw, "number_of_scans", 0) or 0)

            for i in tqdm(range(1, total_scans + 1), desc=f"[Pass2b-MS1 write] {raw_name}", ncols=100):
                if row >= ms1_count:
                    break  # safety

                try:
                    raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
                except Exception:
                    continue

                stype = _scan_type_label(raw_scan.scan_type)
                if stype != "ms":
                    continue

                masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
                intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
                if masses.size == 0 or intens.size == 0:
                    continue

                idx = np.rint(masses * 10.0).astype(np.int32)
                mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
                if not mask.any():
                    continue

                v32 = np.zeros(MS1_LEN, dtype=np.float32)
                np.add.at(v32, idx[mask] - MS1_MIN_IDX, intens[mask].astype(np.float32, copy=False))

                v32 /= col_max  # per-column normalization from Pass 2a
                MS1_map[row, :] = v32.astype(np.float16, copy=False)
                row += 1

            try:
                raw.dispose()
            except Exception:
                pass

        # Flush memmap to disk
        del MS1_map

        # Load the memmap file and write compressed NPZ with metadata
        MS1_final = np.load(memmap_path, mmap_mode="r")
        np.savez_compressed(ms1_path, ms1_matrix=MS1_final, **metadata)
        print(f"Saved MS1: {ms1_path}  shape={MS1_final.shape}")

    gc.collect()
    return {"ms1": ms1_path}


# -----------------------------
# Optional: convenience wrapper
# -----------------------------
def wholeCasting_npz_split(folder_paths, out_path: str):
    """
    Convenience: run Pass 1 then Pass 2.
    Returns dict with paths.
    """
    paths1 = build_ms2_and_meta(folder_paths, out_path)
    paths2 = build_ms1_only(folder_paths, out_path, meta_path=paths1["meta"])
    return {"ms2": paths1["ms2"], "ms1": paths2["ms1"], "meta": paths1["meta"]}


In [8]:
import os
import re
import glob
import numpy as np
from tqdm import tqdm
import gc
import tempfile

# If you have fisher_py installed, uncomment:
# from fisher_py.raw_file import RawFile
# from fisher_py.scan import Scan

# -----------------------------
# Shared config / helpers
# -----------------------------
MS1_MIN_IDX, MS1_LEN = 6000, 13690   # 600.0 m/z * 10 .. 1935.9
MS1_MAX_EXC = MS1_MIN_IDX + MS1_LEN
MS2_MIN_IDX, MS2_LEN = 400, 1600     # m/z 400..1999
MS2_MAX_EXC = MS2_MIN_IDX + MS2_LEN

def _scan_type_label(text: str) -> str:
    m = re.search(r"Full\s+(\w+)", str(text), flags=re.IGNORECASE)
    return m.group(1).lower() if m else ""

def _group_from_name(name: str) -> str:
    for g in ("TreatmentA", "TreatmentB", "TreatmentC", "TreatmentD"):
        if g in name:
            return g
    return "Unknown"

def _as_float_array(x):
    if x is None:
        return np.array([], dtype=float)
    a = np.asarray(x)
    return a.astype(float, copy=False) if a.size else np.array([], dtype=float)

def _ensure_folder_list(paths):
    if isinstance(paths, (list, tuple)):
        return list(paths)
    return [paths]

def _gather_raw_files(folder_paths):
    folder_list = _ensure_folder_list(folder_paths)
    raw_files = []
    for fp in folder_list:
        fp_abs = os.path.abspath(fp)
        if not os.path.isdir(fp_abs):
            raise FileNotFoundError(f'Folder not found: "{fp_abs}"')
        raw_files.extend(glob.glob(os.path.join(fp_abs, "*.raw")))
        raw_files.extend(glob.glob(os.path.join(fp_abs, "*.RAW")))
    raw_files = sorted(set(os.path.abspath(p) for p in raw_files))
    if not raw_files:
        raise FileNotFoundError(
            f'No ".raw" files found in: {", ".join(map(os.path.abspath, folder_list))}'
        )
    return raw_files

def _base_paths(out_path: str):
    base = os.path.abspath(out_path)
    if base.lower().endswith(".npz"):
        base = base[:-4]
    return (f"{base}.ms2.npz", f"{base}.ms1.npz", f"{base}.meta.npz")

def _sanitize_metadata_dict(md: dict) -> dict:
    """
    Ensure metadata contains only numeric arrays or Unicode string arrays.
    This prevents NumPy from needing pickle to save/load.
    """
    safe = {}
    for k, v in md.items():
        if isinstance(v, (int, float, np.number, np.bool_)):
            safe[k] = np.array(v)
            continue
        # Arrays/lists/tuples
        if isinstance(v, (list, tuple, np.ndarray)):
            arr = np.asarray(v)
            # If object dtype, try numeric then fall back to Unicode
            if arr.dtype == object:
                try:
                    arr = arr.astype(np.float32)
                except Exception:
                    arr = arr.astype("U")
            # Force strings to be Unicode, not object
            if np.issubdtype(arr.dtype, np.character):
                arr = arr.astype("U")
            safe[k] = arr
            continue
        # Strings -> Unicode array (scalar ok)
        if isinstance(v, str):
            safe[k] = np.array(v, dtype="U")
            continue
        # Fallback: stringify scalars/objects to Unicode (rare)
        safe[k] = np.array(str(v), dtype="U")
    return safe

# -----------------------------
# Pass 1: MS2 + Metadata
# -----------------------------
def build_ms2_and_meta(folder_paths, out_path: str):
    """
    Build MS2 (per-scan normalized) and shared metadata, saving:
      <base>.ms2.npz  -> ms2_matrix (float16) + metadata
      <base>.meta.npz -> metadata only
    """
    # Guard: require fisher_py (uncomment import above if installed)
    try:
        RawFile, Scan  # type: ignore # noqa
    except NameError as _:
        raise ImportError("fisher_py is required for RAW access. Uncomment the imports at the top.")

    raw_files = _gather_raw_files(folder_paths)

    ms1_scan, ms1_rt, ms1_file_id, ms1_group_id = [], [], [], []

    ms2_rows = []
    ms2_scan, ms2_rt, ms2_prec_mz, ms2_file_id, ms2_group_id = [], [], [], [], []

    file_basenames, file_abspaths, group_names = [], [], []
    file_to_id, group_to_id = {}, {}

    for raw_abs in raw_files:
        raw_name = os.path.basename(raw_abs)

        if raw_abs not in file_to_id:
            file_to_id[raw_abs] = len(file_basenames)
            file_basenames.append(raw_name)
            file_abspaths.append(raw_abs)

        group = _group_from_name(raw_name)
        if group not in group_to_id:
            group_to_id[group] = len(group_names)
            group_names.append(group)

        f_id = file_to_id[raw_abs]
        g_id = group_to_id[group]

        # open RAW
        try:
            raw = RawFile(raw_abs)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_abs} ({e})')
            continue

        total_scans = int(getattr(raw, "number_of_scans", 0) or 0)

        for i in tqdm(range(1, total_scans + 1), desc=f"[Pass1-MS2] {raw_name}", ncols=100):
            try:
                raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            except Exception:
                continue

            sc_num = getattr(raw_scan.scan_statistics, "scan_number", i)
            try:
                rt = float(raw.get_retention_time_from_scan_number(sc_num))
            except Exception:
                rt = np.nan

            stype = _scan_type_label(raw_scan.scan_type)

            masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
            intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
            if masses.size == 0 or intens.size == 0:
                continue

            if stype == "ms":
                # Just metadata for MS1 in this pass
                idx = np.rint(masses * 10.0).astype(np.int32)
                mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
                if not mask.any():
                    continue
                ms1_scan.append(sc_num)
                ms1_rt.append(rt)
                ms1_file_id.append(f_id)
                ms1_group_id.append(g_id)

            elif stype == "ms2":
                idx = np.rint(masses).astype(np.int32)
                mask = (idx >= MS2_MIN_IDX) & (idx < MS2_MAX_EXC)
                if not mask.any():
                    continue

                v32 = np.zeros(MS2_LEN, dtype=np.float32)
                np.add.at(v32, idx[mask] - MS2_MIN_IDX, intens[mask].astype(np.float32, copy=False))
                vmax = float(v32.max())
                if vmax > 0:
                    v32 /= vmax
                vec_ms2 = v32.astype(np.float16, copy=False)

                # precursor m/z extraction
                prec = np.nan
                for attr in ("precursor_mz", "master_precursor_mz", "isolation_mz"):
                    if hasattr(raw_scan, attr):
                        try:
                            prec = float(getattr(raw_scan, attr))
                            break
                        except Exception:
                            pass
                if np.isnan(prec):
                    m = re.findall(r'\d+\.\d+', str(raw_scan.scan_type))
                    prec = float(m[1]) if len(m) > 1 else np.nan

                ms2_rows.append(vec_ms2)
                ms2_scan.append(sc_num)
                ms2_rt.append(rt)
                ms2_prec_mz.append(prec)
                ms2_file_id.append(f_id)
                ms2_group_id.append(g_id)

        try:
            raw.dispose()
        except Exception:
            pass

    # Build metadata (shared by both MS1 and MS2 output files)
    metadata_raw = dict(
        ms1_scan=np.asarray(ms1_scan, dtype=np.int32),
        ms1_rt=np.asarray(ms1_rt, dtype=np.float32),
        ms1_file_id=np.asarray(ms1_file_id, dtype=np.int32),
        ms1_group_id=np.asarray(ms1_group_id, dtype=np.int32),

        ms2_scan=np.asarray(ms2_scan, dtype=np.int32),
        ms2_rt=np.asarray(ms2_rt, dtype=np.float32),
        ms2_precursor_mz=np.asarray(ms2_prec_mz, dtype=np.float32),
        ms2_file_id=np.asarray(ms2_file_id, dtype=np.int32),
        ms2_group_id=np.asarray(ms2_group_id, dtype=np.int32),

        # ensure Unicode (not object)
        file_names_lookup=np.asarray(file_basenames, dtype="U"),
        file_paths_lookup=np.asarray(file_abspaths, dtype="U"),
        group_names_lookup=np.asarray(group_names, dtype="U"),
    )
    metadata = _sanitize_metadata_dict(metadata_raw)

    ms2_path, _, meta_path = _base_paths(out_path)

    # Save MS2
    if ms2_rows:
        MS2 = np.vstack(ms2_rows).astype(np.float16, copy=False)
    else:
        MS2 = np.zeros((0, MS2_LEN), dtype=np.float16)
    np.savez_compressed(ms2_path, ms2_matrix=MS2, **metadata)
    print(f"Saved MS2: {ms2_path}  shape={MS2.shape}")
    del MS2, ms2_rows
    gc.collect()

    # Save metadata standalone
    np.savez_compressed(meta_path, **metadata)
    print(f"Saved META: {meta_path}")

    return {"ms2": ms2_path, "meta": meta_path}


# -----------------------------
# Pass 2: MS1 only (streamed, low-RAM)
# -----------------------------
def build_ms1_only(folder_paths, out_path: str, meta_path: str = None):
    """
    Build MS1 matrix in a streamed way (low RAM) and save:
      <base>.ms1.npz -> ms1_matrix (float16) + metadata

    MS1 normalization: per-column (bin-wise) max across *all* MS1 scans.

    Two sweeps:
      1) Count MS1 rows + compute col_max (no storage)
      2) Allocate headered .npy memmap [n_rows, MS1_LEN], fill normalized rows, save compressed NPZ.

    If meta_path is provided, that metadata is embedded into the MS1 NPZ; otherwise it is
    rebuilt on the fly (slower) and saved alongside.
    """
    # Guard: require fisher_py (uncomment import above if installed)
    try:
        RawFile, Scan  # type: ignore # noqa
    except NameError as _:
        raise ImportError("fisher_py is required for RAW access. Uncomment the imports at the top.")

    from numpy.lib.format import open_memmap  # headered .npy memmap

    raw_files = _gather_raw_files(folder_paths)
    ms1_count = 0
    col_max = np.zeros(MS1_LEN, dtype=np.float32)

    # Load metadata from Pass 1 if available
    metadata = None
    if meta_path is not None and os.path.exists(meta_path):
        with np.load(meta_path, allow_pickle=False) as meta_npz:
            metadata = {k: meta_npz[k] for k in meta_npz.files}
        # Just in case an older meta had object dtypes:
        metadata = _sanitize_metadata_dict(metadata)

    rebuild_metadata = metadata is None

    if rebuild_metadata:
        file_basenames, file_abspaths, group_names = [], [], []
        file_to_id, group_to_id = {}, {}
        ms1_scan, ms1_rt, ms1_file_id, ms1_group_id = [], [], [], []

    # -------- Pass 2a: count & col_max --------
    for raw_abs in raw_files:
        raw_name = os.path.basename(raw_abs)

        if rebuild_metadata:
            if raw_abs not in file_to_id:
                file_to_id[raw_abs] = len(file_basenames)
                file_basenames.append(raw_name)
                file_abspaths.append(raw_abs)

            group = _group_from_name(raw_name)
            if group not in group_to_id:
                group_to_id[group] = len(group_names)
                group_names.append(group)

            f_id = file_to_id[raw_abs]
            g_id = group_to_id[group]
        else:
            f_id = g_id = None  # unused

        # open RAW
        try:
            raw = RawFile(raw_abs)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_abs} ({e})')
            continue

        total_scans = int(getattr(raw, "number_of_scans", 0) or 0)

        for i in tqdm(range(1, total_scans + 1), desc=f"[Pass2a-MS1 count] {raw_name}", ncols=100):
            try:
                raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            except Exception:
                continue

            stype = _scan_type_label(raw_scan.scan_type)
            if stype != "ms":
                continue

            masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
            intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
            if masses.size == 0 or intens.size == 0:
                continue

            idx = np.rint(masses * 10.0).astype(np.int32)
            mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
            if not mask.any():
                continue

            # Update col_max without storing the full row
            vec = np.zeros(MS1_LEN, dtype=np.float32)
            np.add.at(vec, idx[mask] - MS1_MIN_IDX, intens[mask].astype(np.float32, copy=False))
            np.maximum(col_max, vec, out=col_max)
            ms1_count += 1

            # (Re)build minimal metadata for MS1 if needed
            if rebuild_metadata:
                sc_num = getattr(raw_scan.scan_statistics, "scan_number", i)
                try:
                    rt = float(raw.get_retention_time_from_scan_number(sc_num))
                except Exception:
                    rt = np.nan
                ms1_scan.append(sc_num)
                ms1_rt.append(rt)
                ms1_file_id.append(f_id)
                ms1_group_id.append(g_id)

        try:
            raw.dispose()
        except Exception:
            pass

    # Guard: no MS1 scans
    ms1_path = _base_paths(out_path)[1]
    if ms1_count == 0:
        if metadata is None and rebuild_metadata:
            metadata = dict(
                ms1_scan=np.asarray(ms1_scan, dtype=np.int32),
                ms1_rt=np.asarray(ms1_rt, dtype=np.float32),
                ms1_file_id=np.asarray(ms1_file_id, dtype=np.int32),
                ms1_group_id=np.asarray(ms1_group_id, dtype=np.int32),
                file_names_lookup=np.asarray(file_basenames, dtype="U"),
                file_paths_lookup=np.asarray(file_abspaths, dtype="U"),
                group_names_lookup=np.asarray(group_names, dtype="U"),
            )
        elif metadata is None:
            metadata = {}
        metadata = _sanitize_metadata_dict(metadata)
        empty = np.zeros((0, MS1_LEN), dtype=np.float16)
        np.savez_compressed(ms1_path, ms1_matrix=empty, **metadata)
        print(f"Saved MS1 (empty): {ms1_path}")
        return {"ms1": ms1_path}

    # Avoid divide-by-zero
    col_max[col_max == 0] = 1.0

    # If we had to rebuild metadata, finalize it now. Otherwise we already have it.
    if rebuild_metadata:
        metadata_raw = dict(
            ms1_scan=np.asarray(ms1_scan, dtype=np.int32),
            ms1_rt=np.asarray(ms1_rt, dtype=np.float32),
            ms1_file_id=np.asarray(ms1_file_id, dtype=np.int32),
            ms1_group_id=np.asarray(ms1_group_id, dtype=np.int32),
            file_names_lookup=np.asarray(file_basenames, dtype="U"),
            file_paths_lookup=np.asarray(file_abspaths, dtype="U"),
            group_names_lookup=np.asarray(group_names, dtype="U"),
        )
        metadata = _sanitize_metadata_dict(metadata_raw)

    # -------- Pass 2b: fill normalized rows into a headered .npy memmap --------
    with tempfile.TemporaryDirectory() as tmpdir:
        memmap_path = os.path.join(tmpdir, "ms1_memmap.npy")

        # Use headered .npy memmap to allow np.load(..., allow_pickle=False)
        from numpy.lib.format import open_memmap
        MS1_map = open_memmap(memmap_path, mode="w+", dtype=np.float16, shape=(ms1_count, MS1_LEN))

        row = 0
        for raw_abs in raw_files:
            raw_name = os.path.basename(raw_abs)

            try:
                raw = RawFile(raw_abs)
            except Exception as e:
                print(f'[skip] Cannot open RAW: {raw_abs} ({e})')
                continue

            total_scans = int(getattr(raw, "number_of_scans", 0) or 0)

            for i in tqdm(range(1, total_scans + 1), desc=f"[Pass2b-MS1 write] {raw_name}", ncols=100):
                if row >= ms1_count:
                    break  # safety

                try:
                    raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
                except Exception:
                    continue

                stype = _scan_type_label(raw_scan.scan_type)
                if stype != "ms":
                    continue

                masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
                intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
                if masses.size == 0 or intens.size == 0:
                    continue

                idx = np.rint(masses * 10.0).astype(np.int32)
                mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
                if not mask.any():
                    continue

                v32 = np.zeros(MS1_LEN, dtype=np.float32)
                np.add.at(v32, idx[mask] - MS1_MIN_IDX, intens[mask].astype(np.float32, copy=False))

                v32 /= col_max  # per-column normalization from Pass 2a
                MS1_map[row, :] = v32.astype(np.float16, copy=False)
                row += 1

            try:
                raw.dispose()
            except Exception:
                pass

        # Flush memmap to disk by dropping reference
        del MS1_map

        # Load the headered .npy without pickle and write compressed NPZ with metadata
        MS1_final = np.load(memmap_path, mmap_mode="r", allow_pickle=False)
        np.savez_compressed(ms1_path, ms1_matrix=MS1_final, **metadata)
        print(f"Saved MS1: {ms1_path}  shape={MS1_final.shape}")

    gc.collect()
    return {"ms1": ms1_path}


# -----------------------------
# Optional: convenience wrapper
# -----------------------------
def wholeCasting_npz_split(folder_paths, out_path: str):
    """
    Convenience: run Pass 1 then Pass 2.
    Returns dict with paths.
    """
    paths1 = build_ms2_and_meta(folder_paths, out_path)
    paths2 = build_ms1_only(folder_paths, out_path, meta_path=paths1["meta"])
    return {"ms2": paths1["ms2"], "ms1": paths2["ms1"], "meta": paths1["meta"]}


In [10]:
import os
import re
import glob
import numpy as np
from tqdm import tqdm
import gc

# If you have fisher_py installed, uncomment:
# from fisher_py.raw_file import RawFile
# from fisher_py.scan import Scan

# -----------------------------
# Shared config / helpers
# -----------------------------
MS1_MIN_IDX, MS1_LEN = 6000, 13690   # 600.0 m/z * 10 .. 1935.9
MS1_MAX_EXC = MS1_MIN_IDX + MS1_LEN
MS2_MIN_IDX, MS2_LEN = 400, 1600     # m/z 400..1999
MS2_MAX_EXC = MS2_MIN_IDX + MS2_LEN

def _scan_type_label(text: str) -> str:
    m = re.search(r"Full\s+(\w+)", str(text), flags=re.IGNORECASE)
    return m.group(1).lower() if m else ""

def _group_from_name(name: str) -> str:
    for g in ("TreatmentA", "TreatmentB", "TreatmentC", "TreatmentD"):
        if g in name:
            return g
    return "Unknown"

def _as_float_array(x):
    if x is None:
        return np.array([], dtype=float)
    a = np.asarray(x)
    return a.astype(float, copy=False) if a.size else np.array([], dtype=float)

def _ensure_folder_list(paths):
    if isinstance(paths, (list, tuple)):
        return list(paths)
    return [paths]

def _gather_raw_files(folder_paths):
    folder_list = _ensure_folder_list(folder_paths)
    raw_files = []
    for fp in folder_list:
        fp_abs = os.path.abspath(fp)
        if not os.path.isdir(fp_abs):
            raise FileNotFoundError(f'Folder not found: "{fp_abs}"')
        raw_files.extend(glob.glob(os.path.join(fp_abs, "*.raw")))
        raw_files.extend(glob.glob(os.path.join(fp_abs, "*.RAW")))
    raw_files = sorted(set(os.path.abspath(p) for p in raw_files))
    if not raw_files:
        raise FileNotFoundError(
            f'No ".raw" files found in: {", ".join(map(os.path.abspath, folder_list))}'
        )
    return raw_files

def _base_paths(out_path: str):
    base = os.path.abspath(out_path)
    if base.lower().endswith(".npz"):
        base = base[:-4]
    return (f"{base}.ms2.npz", f"{base}.ms1.npz", f"{base}.meta.npz")

def _sanitize_metadata_dict(md: dict) -> dict:
    """
    Keep only numeric arrays or Unicode string arrays (no object dtype),
    so NumPy never needs pickle to save/load.
    """
    safe = {}
    for k, v in md.items():
        if isinstance(v, (int, float, np.number, np.bool_)):
            safe[k] = np.array(v)
            continue
        if isinstance(v, (list, tuple, np.ndarray)):
            arr = np.asarray(v)
            if arr.dtype == object:
                try:
                    arr = arr.astype(np.float32)
                except Exception:
                    arr = arr.astype("U")
            if np.issubdtype(arr.dtype, np.character):
                arr = arr.astype("U")
            safe[k] = arr
            continue
        if isinstance(v, str):
            safe[k] = np.array(v, dtype="U")
            continue
        safe[k] = np.array(str(v), dtype="U")
    return safe

# -----------------------------
# Pass 1: MS2 + Metadata
# -----------------------------
def build_ms2_and_meta(folder_paths, out_path: str):
    """
    Build MS2 (per-scan normalized) and shared metadata, saving:
      <base>.ms2.npz  -> ms2_matrix (float16) + metadata
      <base>.meta.npz -> metadata only
    """
    # Guard: require fisher_py (uncomment import above if installed)
    try:
        RawFile, Scan  # type: ignore # noqa
    except NameError as _:
        raise ImportError("fisher_py is required for RAW access. Uncomment the imports at the top.")

    raw_files = _gather_raw_files(folder_paths)

    ms1_scan, ms1_rt, ms1_file_id, ms1_group_id = [], [], [], []

    ms2_rows = []
    ms2_scan, ms2_rt, ms2_prec_mz, ms2_file_id, ms2_group_id = [], [], [], [], []

    file_basenames, file_abspaths, group_names = [], [], []
    file_to_id, group_to_id = {}, {}

    for raw_abs in raw_files:
        raw_name = os.path.basename(raw_abs)

        if raw_abs not in file_to_id:
            file_to_id[raw_abs] = len(file_basenames)
            file_basenames.append(raw_name)
            file_abspaths.append(raw_abs)

        group = _group_from_name(raw_name)
        if group not in group_to_id:
            group_to_id[group] = len(group_names)
            group_names.append(group)

        f_id = file_to_id[raw_abs]
        g_id = group_to_id[group]

        # open RAW
        try:
            raw = RawFile(raw_abs)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_abs} ({e})')
            continue

        total_scans = int(getattr(raw, "number_of_scans", 0) or 0)

        for i in tqdm(range(1, total_scans + 1), desc=f"[Pass1-MS2] {raw_name}", ncols=100):
            try:
                raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            except Exception:
                continue

            sc_num = getattr(raw_scan.scan_statistics, "scan_number", i)
            try:
                rt = float(raw.get_retention_time_from_scan_number(sc_num))
            except Exception:
                rt = np.nan

            stype = _scan_type_label(raw_scan.scan_type)

            masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
            intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
            if masses.size == 0 or intens.size == 0:
                continue

            if stype == "ms":
                # MS1 metadata only in this pass
                idx = np.rint(masses * 10.0).astype(np.int32)
                mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
                if not mask.any():
                    continue
                ms1_scan.append(sc_num)
                ms1_rt.append(rt)
                ms1_file_id.append(f_id)
                ms1_group_id.append(g_id)

            elif stype == "ms2":
                idx = np.rint(masses).astype(np.int32)
                mask = (idx >= MS2_MIN_IDX) & (idx < MS2_MAX_EXC)
                if not mask.any():
                    continue

                v32 = np.zeros(MS2_LEN, dtype=np.float32)
                np.add.at(v32, idx[mask] - MS2_MIN_IDX, intens[mask].astype(np.float32, copy=False))
                vmax = float(v32.max())
                if vmax > 0:
                    v32 /= vmax
                vec_ms2 = v32.astype(np.float16, copy=False)

                # precursor m/z extraction
                prec = np.nan
                for attr in ("precursor_mz", "master_precursor_mz", "isolation_mz"):
                    if hasattr(raw_scan, attr):
                        try:
                            prec = float(getattr(raw_scan, attr))
                            break
                        except Exception:
                            pass
                if np.isnan(prec):
                    m = re.findall(r'\d+\.\d+', str(raw_scan.scan_type))
                    prec = float(m[1]) if len(m) > 1 else np.nan

                ms2_rows.append(vec_ms2)
                ms2_scan.append(sc_num)
                ms2_rt.append(rt)
                ms2_prec_mz.append(prec)
                ms2_file_id.append(f_id)
                ms2_group_id.append(g_id)

        try:
            raw.dispose()
        except Exception:
            pass

    # Build metadata (shared by both MS1 and MS2 output files)
    metadata_raw = dict(
        ms1_scan=np.asarray(ms1_scan, dtype=np.int32),
        ms1_rt=np.asarray(ms1_rt, dtype=np.float32),
        ms1_file_id=np.asarray(ms1_file_id, dtype=np.int32),
        ms1_group_id=np.asarray(ms1_group_id, dtype=np.int32),

        ms2_scan=np.asarray(ms2_scan, dtype=np.int32),
        ms2_rt=np.asarray(ms2_rt, dtype=np.float32),
        ms2_precursor_mz=np.asarray(ms2_prec_mz, dtype=np.float32),
        ms2_file_id=np.asarray(ms2_file_id, dtype=np.int32),
        ms2_group_id=np.asarray(ms2_group_id, dtype=np.int32),

        file_names_lookup=np.asarray(file_basenames, dtype="U"),
        file_paths_lookup=np.asarray(file_abspaths, dtype="U"),
        group_names_lookup=np.asarray(group_names, dtype="U"),
    )
    metadata = _sanitize_metadata_dict(metadata_raw)

    ms2_path, _, meta_path = _base_paths(out_path)

    # Save MS2
    if ms2_rows:
        MS2 = np.vstack(ms2_rows).astype(np.float16, copy=False)
    else:
        MS2 = np.zeros((0, MS2_LEN), dtype=np.float16)
    np.savez_compressed(ms2_path, ms2_matrix=MS2, **metadata)
    print(f"Saved MS2: {ms2_path}  shape={MS2.shape}")
    del MS2, ms2_rows
    gc.collect()

    # Save metadata standalone
    np.savez_compressed(meta_path, **metadata)
    print(f"Saved META: {meta_path}")

    return {"ms2": ms2_path, "meta": meta_path}


# -----------------------------
# Pass 2: MS1 only (streamed, low-RAM)
# -----------------------------
def build_ms1_only(folder_paths, out_path: str, meta_path: str = None):
    """
    Build MS1 matrix in a streamed way (low RAM) and save:
      <base>.ms1.npz -> ms1_matrix (float16) + metadata

    MS1 normalization: per-column (bin-wise) max across *all* MS1 scans.

    Two sweeps:
      1) Count MS1 rows + compute col_max (no storage)
      2) Allocate headered .npy memmap [n_rows, MS1_LEN], fill normalized rows, save compressed NPZ.

    If meta_path is provided, that metadata is embedded into the MS1 NPZ; otherwise it is
    rebuilt on the fly (slower) and saved alongside.
    """
    # Guard: require fisher_py (uncomment import above if installed)
    try:
        RawFile, Scan  # type: ignore # noqa
    except NameError as _:
        raise ImportError("fisher_py is required for RAW access. Uncomment the imports at the top.")

    from numpy.lib.format import open_memmap  # headered .npy memmap

    raw_files = _gather_raw_files(folder_paths)
    ms1_count = 0
    col_max = np.zeros(MS1_LEN, dtype=np.float32)

    # Load metadata from Pass 1 if available
    metadata = None
    if meta_path is not None and os.path.exists(meta_path):
        with np.load(meta_path, allow_pickle=False) as meta_npz:
            metadata = {k: meta_npz[k] for k in meta_npz.files}
        metadata = _sanitize_metadata_dict(metadata)

    rebuild_metadata = metadata is None

    if rebuild_metadata:
        file_basenames, file_abspaths, group_names = [], [], []
        file_to_id, group_to_id = {}, {}
        ms1_scan, ms1_rt, ms1_file_id, ms1_group_id = [], [], [], []

    # -------- Pass 2a: count & col_max --------
    for raw_abs in raw_files:
        raw_name = os.path.basename(raw_abs)

        if rebuild_metadata:
            if raw_abs not in file_to_id:
                file_to_id[raw_abs] = len(file_basenames)
                file_basenames.append(raw_name)
                file_abspaths.append(raw_abs)

            group = _group_from_name(raw_name)
            if group not in group_to_id:
                group_to_id[group] = len(group_names)
                group_names.append(group)

            f_id = file_to_id[raw_abs]
            g_id = group_to_id[group]
        else:
            f_id = g_id = None  # unused

        # open RAW
        try:
            raw = RawFile(raw_abs)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_abs} ({e})')
            continue

        total_scans = int(getattr(raw, "number_of_scans", 0) or 0)

        for i in tqdm(range(1, total_scans + 1), desc=f"[Pass2a-MS1 count] {raw_name}", ncols=100):
            try:
                raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            except Exception:
                continue

            stype = _scan_type_label(raw_scan.scan_type)
            if stype != "ms":
                continue

            masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
            intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
            if masses.size == 0 or intens.size == 0:
                continue

            idx = np.rint(masses * 10.0).astype(np.int32)
            mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
            if not mask.any():
                continue

            vec = np.zeros(MS1_LEN, dtype=np.float32)
            np.add.at(vec, idx[mask] - MS1_MIN_IDX, intens[mask].astype(np.float32, copy=False))
            np.maximum(col_max, vec, out=col_max)
            ms1_count += 1

            if rebuild_metadata:
                sc_num = getattr(raw_scan.scan_statistics, "scan_number", i)
                try:
                    rt = float(raw.get_retention_time_from_scan_number(sc_num))
                except Exception:
                    rt = np.nan
                ms1_scan.append(sc_num)
                ms1_rt.append(rt)
                ms1_file_id.append(f_id)
                ms1_group_id.append(g_id)

        try:
            raw.dispose()
        except Exception:
            pass

    # Guard: no MS1 scans
    ms1_path = _base_paths(out_path)[1]
    if ms1_count == 0:
        if metadata is None and rebuild_metadata:
            metadata = dict(
                ms1_scan=np.asarray(ms1_scan, dtype=np.int32),
                ms1_rt=np.asarray(ms1_rt, dtype=np.float32),
                ms1_file_id=np.asarray(ms1_file_id, dtype=np.int32),
                ms1_group_id=np.asarray(ms1_group_id, dtype=np.int32),
                file_names_lookup=np.asarray(file_basenames, dtype="U"),
                file_paths_lookup=np.asarray(file_abspaths, dtype="U"),
                group_names_lookup=np.asarray(group_names, dtype="U"),
            )
        elif metadata is None:
            metadata = {}
        metadata = _sanitize_metadata_dict(metadata)
        empty = np.zeros((0, MS1_LEN), dtype=np.float16)
        np.savez_compressed(ms1_path, ms1_matrix=empty, **metadata)
        print(f"Saved MS1 (empty): {ms1_path}")
        return {"ms1": ms1_path}

    # Avoid divide-by-zero
    col_max[col_max == 0] = 1.0

    # Finalize metadata if rebuilt
    if rebuild_metadata:
        metadata_raw = dict(
            ms1_scan=np.asarray(ms1_scan, dtype=np.int32),
            ms1_rt=np.asarray(ms1_rt, dtype=np.float32),
            ms1_file_id=np.asarray(ms1_file_id, dtype=np.int32),
            ms1_group_id=np.asarray(ms1_group_id, dtype=np.int32),
            file_names_lookup=np.asarray(file_basenames, dtype="U"),
            file_paths_lookup=np.asarray(file_abspaths, dtype="U"),
            group_names_lookup=np.asarray(group_names, dtype="U"),
        )
        metadata = _sanitize_metadata_dict(metadata_raw)

    # -------- Pass 2b: fill normalized rows into a headered .npy memmap --------
    # Write the temp memmap next to the final outputs to avoid Windows handle issues.
    base_dir = os.path.dirname(os.path.abspath(ms1_path)) or "."
    os.makedirs(base_dir, exist_ok=True)
    tmp_memmap_path = os.path.join(base_dir, os.path.basename(ms1_path).replace(".ms1.npz", ".ms1.tmp.npy"))

    from numpy.lib.format import open_memmap
    MS1_map = open_memmap(tmp_memmap_path, mode="w+", dtype=np.float16, shape=(ms1_count, MS1_LEN))

    row = 0
    for raw_abs in raw_files:
        raw_name = os.path.basename(raw_abs)

        try:
            raw = RawFile(raw_abs)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_abs} ({e})')
            continue

        total_scans = int(getattr(raw, "number_of_scans", 0) or 0)

        for i in tqdm(range(1, total_scans + 1), desc=f"[Pass2b-MS1 write] {raw_name}", ncols=100):
            if row >= ms1_count:
                break  # safety

            try:
                raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            except Exception:
                continue

            stype = _scan_type_label(raw_scan.scan_type)
            if stype != "ms":
                continue

            masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
            intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
            if masses.size == 0 or intens.size == 0:
                continue

            idx = np.rint(masses * 10.0).astype(np.int32)
            mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
            if not mask.any():
                continue

            v32 = np.zeros(MS1_LEN, dtype=np.float32)
            np.add.at(v32, idx[mask] - MS1_MIN_IDX, intens[mask].astype(np.float32, copy=False))
            v32 /= col_max  # per-column normalization
            MS1_map[row, :] = v32.astype(np.float16, copy=False)
            row += 1

        try:
            raw.dispose()
        except Exception:
            pass

    # --- CRITICAL ON WINDOWS ---
    # Flush and drop all references to the memmap BEFORE we read/delete it.
    try:
        MS1_map.flush()
    except Exception:
        pass
    del MS1_map
    gc.collect()

    # Load into RAM (no mmap) so no handle remains on the file; then save compressed NPZ.
    MS1_final = np.load(tmp_memmap_path, mmap_mode=None, allow_pickle=False)
    np.savez_compressed(ms1_path, ms1_matrix=MS1_final, **metadata)
    print(f"Saved MS1: {ms1_path}  shape={MS1_final.shape}")

    # Now it is safe to remove the temp .npy (all references gone).
    del MS1_final
    gc.collect()
    try:
        os.remove(tmp_memmap_path)
    except Exception as e:
        print(f"[warn] Could not remove temp memmap ({tmp_memmap_path}): {e}")

    gc.collect()
    return {"ms1": ms1_path}


# -----------------------------
# Optional: convenience wrapper
# -----------------------------
def wholeCasting_npz_split(folder_paths, out_path: str):
    """
    Convenience: run Pass 1 then Pass 2.
    Returns dict with paths.
    """
    paths1 = build_ms2_and_meta(folder_paths, out_path)
    paths2 = build_ms1_only(folder_paths, out_path, meta_path=paths1["meta"])
    return {"ms2": paths1["ms2"], "ms1": paths2["ms1"], "meta": paths1["meta"]}


In [None]:
wholeCasting_npz_split(["D:/TreatmentABC", "D:/TreatmentD" ], out_path="D:/casts/databank")

In [14]:
import os
import re
import glob
import numpy as np
from tqdm import tqdm
import gc

# If you have fisher_py installed, UNCOMMENT these:
# from fisher_py.raw_file import RawFile
# from fisher_py.scan import Scan

# -----------------------------
# Config / binning
# -----------------------------
MS1_MIN_IDX, MS1_LEN = 6000, 13690   # 600.0 m/z * 10 .. 1935.9 (10 pts per m/z)
MS1_MAX_EXC = MS1_MIN_IDX + MS1_LEN
MS2_MIN_IDX, MS2_LEN = 400, 1600     # m/z 400..1999 (1 pt per m/z)
MS2_MAX_EXC = MS2_MIN_IDX + MS2_LEN

GROUPS = ("TreatmentA", "TreatmentB", "TreatmentC", "TreatmentD")

# -----------------------------
# Helpers
# -----------------------------
def _scan_type_label(text: str) -> str:
    m = re.search(r"Full\s+(\w+)", str(text), flags=re.IGNORECASE)
    return m.group(1).lower() if m else ""

def _group_from_name(name: str) -> str:
    for g in GROUPS:
        if g in name:
            return g
    return "Unknown"

def _as_float_array(x):
    if x is None:
        return np.array([], dtype=float)
    a = np.asarray(x)
    return a.astype(float, copy=False) if a.size else np.array([], dtype=float)

def _ensure_folder_list(paths):
    if isinstance(paths, (list, tuple)):
        return list(paths)
    return [paths]

def _gather_raw_files(folder_paths):
    folder_list = _ensure_folder_list(folder_paths)
    raw_files = []
    for fp in folder_list:
        fp_abs = os.path.abspath(fp)
        if not os.path.isdir(fp_abs):
            raise FileNotFoundError(f'Folder not found: "{fp_abs}"')
        raw_files.extend(glob.glob(os.path.join(fp_abs, "*.raw")))
        raw_files.extend(glob.glob(os.path.join(fp_abs, "*.RAW")))
    raw_files = sorted(set(os.path.abspath(p) for p in raw_files))
    if not raw_files:
        raise FileNotFoundError(
            f'No ".raw" files found in: {", ".join(map(os.path.abspath, folder_list))}'
        )
    return raw_files

def _sanitize_metadata_dict(md: dict) -> dict:
    """Ensure arrays are numeric or Unicode (never object dtype)."""
    safe = {}
    for k, v in md.items():
        if isinstance(v, (int, float, np.number, np.bool_)):
            safe[k] = np.array(v)
            continue
        if isinstance(v, (list, tuple, np.ndarray)):
            arr = np.asarray(v)
            if arr.dtype == object:
                try:
                    arr = arr.astype(np.float32)
                except Exception:
                    arr = arr.astype("U")
            if np.issubdtype(arr.dtype, np.character):
                arr = arr.astype("U")
            safe[k] = arr
            continue
        if isinstance(v, str):
            safe[k] = np.array(v, dtype="U")
            continue
        safe[k] = np.array(str(v), dtype="U")
    return safe

def _out_paths(out_dir: str, group: str):
    base = os.path.join(os.path.abspath(out_dir), group)
    return (f"{base}.ms1.npz", f"{base}.ms2.npz", f"{base}.meta.npz")

# -----------------------------
# Core: process one treatment group at a time
# -----------------------------
def _process_group(group: str, group_files: list, out_dir: str):
    """
    Builds:
      - MS1 (float32, UNnormalized) stacked per MS1 scan for this group
      - MS2 (float16, per-scan normalized) stacked per MS2 scan for this group
      - METADATA aligned to the two matrices
    Saves three NPZ files and frees RAM.
    """
    if not group_files:
        return None

    # Guard: require fisher_py
    try:
        RawFile, Scan  # type: ignore # noqa
    except NameError:
        raise ImportError("fisher_py is required for RAW access. Uncomment the imports at the top.")

    os.makedirs(out_dir, exist_ok=True)
    ms1_path, ms2_path, meta_path = _out_paths(out_dir, group)

    # Per-group accumulators
    file_basenames, file_abspaths = [], []
    file_to_id = {}

    # MS1
    ms1_rows = []                               # list of vectors (float32)
    ms1_scan, ms1_rt, ms1_file_id = [], [], []  # aligned to ms1_rows

    # MS2
    ms2_rows = []                               # list of vectors (float16)
    ms2_scan, ms2_rt, ms2_prec_mz, ms2_file_id = [], [], [], []

    # Iterate files in this group
    for raw_abs in group_files:
        raw_name = os.path.basename(raw_abs)
        if raw_abs not in file_to_id:
            file_to_id[raw_abs] = len(file_basenames)
            file_basenames.append(raw_name)
            file_abspaths.append(raw_abs)
        f_id = file_to_id[raw_abs]

        # open RAW
        try:
            raw = RawFile(raw_abs)
        except Exception as e:
            print(f'[skip] Cannot open RAW: {raw_abs} ({e})')
            continue

        total_scans = int(getattr(raw, "number_of_scans", 0) or 0)

        for i in tqdm(range(1, total_scans + 1), desc=f"[{group}] {raw_name}", ncols=100):
            try:
                raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            except Exception:
                continue

            stype = _scan_type_label(raw_scan.scan_type)
            sc_num = getattr(raw_scan.scan_statistics, "scan_number", i)
            try:
                rt = float(raw.get_retention_time_from_scan_number(sc_num))
            except Exception:
                rt = np.nan

            masses = _as_float_array(getattr(raw_scan, "preferred_masses", None))
            intens = _as_float_array(getattr(raw_scan, "preferred_intensities", None))
            if masses.size == 0 or intens.size == 0:
                continue

            if stype == "ms":
                # Build UNnormalized float32 MS1 row
                # Bin at 0.1 m/z: index = round(m/z*10)
                idx = np.rint(masses * 10.0).astype(np.int32)
                mask = (idx >= MS1_MIN_IDX) & (idx < MS1_MAX_EXC)
                if not mask.any():
                    continue
                v32 = np.zeros(MS1_LEN, dtype=np.float32)
                np.add.at(v32, idx[mask] - MS1_MIN_IDX, intens[mask].astype(np.float32, copy=False))
                ms1_rows.append(v32)
                ms1_scan.append(sc_num)
                ms1_rt.append(rt)
                ms1_file_id.append(f_id)

            elif stype == "ms2":
                # Build per-scan normalized MS2 row (float16 for compact size)
                # Bin at 1.0 m/z: index = round(m/z)
                idx = np.rint(masses).astype(np.int32)
                mask = (idx >= MS2_MIN_IDX) & (idx < MS2_MAX_EXC)
                if not mask.any():
                    continue
                v32 = np.zeros(MS2_LEN, dtype=np.float32)
                np.add.at(v32, idx[mask] - MS2_MIN_IDX, intens[mask].astype(np.float32, copy=False))
                vmax = float(v32.max())
                if vmax > 0:
                    v32 /= vmax
                vec_ms2 = v32.astype(np.float16, copy=False)

                # Precursor m/z (fallback to parsing scan_type text)
                prec = np.nan
                for attr in ("precursor_mz", "master_precursor_mz", "isolation_mz"):
                    if hasattr(raw_scan, attr):
                        try:
                            prec = float(getattr(raw_scan, attr))
                            break
                        except Exception:
                            pass
                if np.isnan(prec):
                    m = re.findall(r'\d+\.\d+', str(raw_scan.scan_type))
                    prec = float(m[1]) if len(m) > 1 else np.nan

                ms2_rows.append(vec_ms2)
                ms2_scan.append(sc_num)
                ms2_rt.append(rt)
                ms2_prec_mz.append(prec)
                ms2_file_id.append(f_id)

        # dispose RAW handle
        try:
            raw.dispose()
        except Exception:
            pass

    # ---- Build metadata (per-group) ----
    # Note: IDs are per-group (0..n_files_in_group-1)
    metadata_raw = dict(
        group_name=np.array(group, dtype="U"),

        # MS1 row-aligned meta
        ms1_scan=np.asarray(ms1_scan, dtype=np.int32),
        ms1_rt=np.asarray(ms1_rt, dtype=np.float32),
        ms1_file_id=np.asarray(ms1_file_id, dtype=np.int32),

        # MS2 row-aligned meta
        ms2_scan=np.asarray(ms2_scan, dtype=np.int32),
        ms2_rt=np.asarray(ms2_rt, dtype=np.float32),
        ms2_precursor_mz=np.asarray(ms2_prec_mz, dtype=np.float32),
        ms2_file_id=np.asarray(ms2_file_id, dtype=np.int32),

        # Lookups
        file_names_lookup=np.asarray(file_basenames, dtype="U"),
        file_paths_lookup=np.asarray(file_abspaths, dtype="U"),
    )
    metadata = _sanitize_metadata_dict(metadata_raw)

    # ---- Stack & save (release RAM right after) ----
    # MS1 (float32, UNnormalized)
    if ms1_rows:
        MS1 = np.vstack(ms1_rows).astype(np.float32, copy=False)
    else:
        MS1 = np.zeros((0, MS1_LEN), dtype=np.float32)
    np.savez_compressed(ms1_path, ms1_matrix=MS1, **metadata)
    print(f"[{group}] Saved MS1: {ms1_path}  shape={MS1.shape}, dtype={MS1.dtype}")
    del MS1, ms1_rows
    gc.collect()

    # MS2 (float16, normalized per scan)
    if ms2_rows:
        MS2 = np.vstack(ms2_rows).astype(np.float16, copy=False)
    else:
        MS2 = np.zeros((0, MS2_LEN), dtype=np.float16)
    np.savez_compressed(ms2_path, ms2_matrix=MS2, **metadata)
    print(f"[{group}] Saved MS2: {ms2_path}  shape={MS2.shape}, dtype={MS2.dtype}")
    del MS2, ms2_rows
    gc.collect()

    # Save metadata standalone (useful if you want to load meta without matrices)
    np.savez_compressed(meta_path, **metadata)
    print(f"[{group}] Saved META: {meta_path}")

    # Final cleanup
    del metadata, metadata_raw
    gc.collect()

    return {"group": group, "ms1": ms1_path, "ms2": ms2_path, "meta": meta_path}

# -----------------------------
# Public API
# -----------------------------
def wholeCasting_per_group(folder_paths, out_dir: str):
    """
    Scans RAW files, partitions by TreatmentA/B/C/D (using filename contains),
    and for each group writes:
      <out_dir>/<Group>.ms1.npz  (float32, UNnormalized)
      <out_dir>/<Group>.ms2.npz  (float16, per-scan normalized)
      <out_dir>/<Group>.meta.npz

    RAM is freed between groups.
    Returns a dict of outputs keyed by group.
    """
    raw_files = _gather_raw_files(folder_paths)
    by_group = {g: [] for g in GROUPS}
    for p in raw_files:
        g = _group_from_name(os.path.basename(p))
        if g in by_group:
            by_group[g].append(p)

    outputs = {}
    for g in GROUPS:
        paths = _process_group(g, by_group[g], out_dir)
        outputs[g] = paths
        # safety: ensure memory is really freed between groups
        gc.collect()
    return outputs


In [15]:
results = wholeCasting_per_group(
    ["D:/TreatmentABC", "D:/TreatmentD"],
    out_dir="D:/casts/databank"
)


[TreatmentA] 20220315_chm134_Cirrhosis_FlowChip15_AA13001EM1_TreatmentA_biorep01_techrep01.raw: 100%
[TreatmentA] 20220315_chm134_Cirrhosis_FlowChip15_AA13001EM1_TreatmentA_biorep01_techrep02.raw: 100%
[TreatmentA] 20220315_chm134_Cirrhosis_FlowChip15_AA13001EM1_TreatmentA_biorep01_techrep03.raw: 100%
[TreatmentA] 20220317_chm134_Cirrhosis_FlowChip15_AA26021EM1_TreatmentA_biorep21_techrep01.raw: 100%
[TreatmentA] 20220317_chm134_Cirrhosis_FlowChip15_AA26021EM1_TreatmentA_biorep21_techrep02.raw: 100%
[TreatmentA] 20220317_chm134_Cirrhosis_FlowChip15_AA26021EM1_TreatmentA_biorep21_techrep03.raw: 100%
[TreatmentA] 20220320_chm134_Cirrhosis_FlowChip15_AA18009EM1_TreatmentA_biorep09_techrep01.raw: 100%
[TreatmentA] 20220320_chm134_Cirrhosis_FlowChip15_AA18009EM1_TreatmentA_biorep09_techrep02.raw: 100%
[TreatmentA] 20220320_chm134_Cirrhosis_FlowChip15_AA18009EM1_TreatmentA_biorep09_techrep03.raw: 100%
[TreatmentA] 20220322_chm134_Cirrhosis_FlowChip15_AA18011EM1_TreatmentA_biorep11_techrep01.

[TreatmentA] Saved MS1: D:\casts\databank\TreatmentA.ms1.npz  shape=(147057, 13690), dtype=float32
[TreatmentA] Saved MS2: D:\casts\databank\TreatmentA.ms2.npz  shape=(336425, 1600), dtype=float16
[TreatmentA] Saved META: D:\casts\databank\TreatmentA.meta.npz


[TreatmentB] 20220315_chm134_Cirrhosis_FlowChip15_AA26018EM1_TreatmentB_biorep18_techrep01.raw: 100%
[TreatmentB] 20220315_chm134_Cirrhosis_FlowChip15_AA26018EM1_TreatmentB_biorep18_techrep02.raw: 100%
[TreatmentB] 20220315_chm134_Cirrhosis_FlowChip15_AA26018EM1_TreatmentB_biorep18_techrep03.raw: 100%
[TreatmentB] 20220317_chm134_Cirrhosis_FlowChip15_AY6027EM1_TreatmentB_biorep27_techrep01.raw: 100%|
[TreatmentB] 20220317_chm134_Cirrhosis_FlowChip15_AY6027EM1_TreatmentB_biorep27_techrep02.raw: 100%|
[TreatmentB] 20220317_chm134_Cirrhosis_FlowChip15_AY6027EM1_TreatmentB_biorep27_techrep03.raw: 100%|
[TreatmentB] 20220320_chm134_Cirrhosis_FlowChip15_AY2023EM1_TreatmentB_biorep23_techrep01.raw: 100%|
[TreatmentB] 20220320_chm134_Cirrhosis_FlowChip15_AY2023EM1_TreatmentB_biorep23_techrep02.raw: 100%|
[TreatmentB] 20220320_chm134_Cirrhosis_FlowChip15_AY2023EM1_TreatmentB_biorep23_techrep03.raw: 100%|
[TreatmentB] 20220322_chm134_Cirrhosis_FlowChip15_AY6029EM1_TreatmentB_biorep29_techrep01.r

[TreatmentB] Saved MS1: D:\casts\databank\TreatmentB.ms1.npz  shape=(130974, 13690), dtype=float32
[TreatmentB] Saved MS2: D:\casts\databank\TreatmentB.ms2.npz  shape=(327286, 1600), dtype=float16
[TreatmentB] Saved META: D:\casts\databank\TreatmentB.meta.npz


[TreatmentC] 20220315_chm134_Cirrhosis_FlowChip15_AA21013EM1_TreatmentC_biorep13_techrep01.raw: 100%
[TreatmentC] 20220315_chm134_Cirrhosis_FlowChip15_AA21013EM1_TreatmentC_biorep13_techrep02.raw: 100%
[TreatmentC] 20220315_chm134_Cirrhosis_FlowChip15_AA21013EM1_TreatmentC_biorep13_techrep03.raw: 100%
[TreatmentC] 20220317_chm134_Cirrhosis_FlowChip15_AY6028EM1_TreatmentC_biorep28_techrep01.raw: 100%|
[TreatmentC] 20220317_chm134_Cirrhosis_FlowChip15_AY6028EM1_TreatmentC_biorep28_techrep02.raw: 100%|
[TreatmentC] 20220317_chm134_Cirrhosis_FlowChip15_AY6028EM1_TreatmentC_biorep28_techrep03.raw: 100%|
[TreatmentC] 20220320_chm134_Cirrhosis_FlowChip15_AA21014EM1_TreatmentC_biorep14_techrep01.raw: 100%
[TreatmentC] 20220320_chm134_Cirrhosis_FlowChip15_AA21014EM1_TreatmentC_biorep14_techrep02.raw: 100%
[TreatmentC] 20220320_chm134_Cirrhosis_FlowChip15_AA21014EM1_TreatmentC_biorep14_techrep03.raw: 100%
[TreatmentC] 20220322_chm134_Cirrhosis_FlowChip15_AY9030EM1_TreatmentC_biorep30_techrep01.r

[skip] Cannot open RAW: D:\TreatmentABC\20220329_chm134_Cirrhosis_FlowChip15_AY2025EM1_TreatmentC_biorep25_techrep02.raw (Instrument index not available for requested device
Parameter name: instrumentIndex
   at ThermoFisher.CommonCore.RawFileReader.RawFileAccessBase.SelectInstrument(Device instrumentType, Int32 instrumentIndex))


[TreatmentC] 20220329_chm134_Cirrhosis_FlowChip15_AY2025EM1_TreatmentC_biorep25_techrep03.raw: 100%|
[TreatmentC] 20220330_chm134_Cirrhosis_FlowChip15_AU3034EM1_TreatmentC_biorep34_techrep01.raw: 100%|
[TreatmentC] 20220330_chm134_Cirrhosis_FlowChip15_AU3034EM1_TreatmentC_biorep34_techrep02.raw: 100%|
[TreatmentC] 20220330_chm134_Cirrhosis_FlowChip15_AU3034EM1_TreatmentC_biorep34_techrep03.raw: 100%|
[TreatmentC] 20221103_chm134_Cirrhosis_FlowChip15_AA15006EM1_TreatmentC_biorep06_techrep01.raw: 100%
[TreatmentC] 20221103_chm134_Cirrhosis_FlowChip15_AA15006EM1_TreatmentC_biorep06_techrep02.raw: 100%
[TreatmentC] 20221103_chm134_Cirrhosis_FlowChip15_AA15006EM1_TreatmentC_biorep06_techrep03.raw: 100%
[TreatmentC] 20221108_chm134_Cirrhosis_FlowChip15_AY12032EM1_TreatmentC_biorep32_techrep01.raw: 100%
[TreatmentC] 20221108_chm134_Cirrhosis_FlowChip15_AY12032EM1_TreatmentC_biorep32_techrep02.raw: 100%
[TreatmentC] 20221108_chm134_Cirrhosis_FlowChip15_AY12032EM1_TreatmentC_biorep32_techrep03.

[TreatmentC] Saved MS1: D:\casts\databank\TreatmentC.ms1.npz  shape=(123807, 13690), dtype=float32
[TreatmentC] Saved MS2: D:\casts\databank\TreatmentC.ms2.npz  shape=(284409, 1600), dtype=float16
[TreatmentC] Saved META: D:\casts\databank\TreatmentC.meta.npz


[TreatmentD] 20220315_chm134_Cirrhosis_FlowChip15_AG2635BC1_TreatmentD_biorep35_techrep01.raw: 100%|
[TreatmentD] 20220315_chm134_Cirrhosis_FlowChip15_AG2635BC1_TreatmentD_biorep35_techrep02.raw: 100%|
[TreatmentD] 20220315_chm134_Cirrhosis_FlowChip15_AG2635BC1_TreatmentD_biorep35_techrep03.raw: 100%|
[TreatmentD] 20220317_chm134_Cirrhosis_FlowChip15_AG2639BC1_TreatmentD_biorep38_techrep03.raw: 100%|
[TreatmentD] 20220317_chm134_Cirrhosis_FlowChip15_AG2639BC1_TreatmentD_biorep39_techrep01.raw: 100%|
[TreatmentD] 20220317_chm134_Cirrhosis_FlowChip15_AG2639BC1_TreatmentD_biorep39_techrep02.raw: 100%|
[TreatmentD] 20220317_chm134_Cirrhosis_FlowChip15_AG2639BC1_TreatmentD_biorep39_techrep03.raw: 100%|
[TreatmentD] 20220320_chm134_Cirrhosis_FlowChip15_AG2638BC1_TreatmentD_biorep38_techrep01.raw: 100%|
[TreatmentD] 20220320_chm134_Cirrhosis_FlowChip15_AG2638BC1_TreatmentD_biorep38_techrep02.raw: 100%|
[TreatmentD] 20220322_chm134_Cirrhosis_FlowChip15_AG31050BC1_TreatmentD_biorep50_techrep01.

[skip] Cannot open RAW: D:\TreatmentD\20221108_chm134_Cirrhosis_FlowChip15_AG2637BC1_TreatmentD_biorep37_techrep03.raw (Instrument index not available for requested device
Parameter name: instrumentIndex
   at ThermoFisher.CommonCore.RawFileReader.RawFileAccessBase.SelectInstrument(Device instrumentType, Int32 instrumentIndex))


[TreatmentD] 20221111_chm134_Cirrhosis_FlowChip15_AG2640BC1_TreatmentD_biorep40_techrep01.raw: 100%|
[TreatmentD] 20221111_chm134_Cirrhosis_FlowChip15_AG2640BC1_TreatmentD_biorep40_techrep02.raw: 100%|
[TreatmentD] 20221111_chm134_Cirrhosis_FlowChip15_AG2640BC1_TreatmentD_biorep40_techrep03.raw: 100%|


[TreatmentD] Saved MS1: D:\casts\databank\TreatmentD.ms1.npz  shape=(130735, 13690), dtype=float32
[TreatmentD] Saved MS2: D:\casts\databank\TreatmentD.ms2.npz  shape=(271277, 1600), dtype=float16
[TreatmentD] Saved META: D:\casts\databank\TreatmentD.meta.npz
