Combine multiple tdportal reports

In [5]:
import pandas as pd
import glob
import os

# --- Folder path containing your CSV files ---
folder = r"F:\20251115\tdportal"   # ⬅️ change this to your actual folder path

# --- Find all CSV files in the folder ---
csv_files = glob.glob(os.path.join(folder, "*.csv"))

# --- Read each CSV and combine them by row ---
dfs = [pd.read_csv(f) for f in csv_files]
combined = pd.concat(dfs, ignore_index=True)

# --- Save the combined CSV ---
output_path = os.path.join(folder, "combined.csv")
combined.to_csv(output_path, index=False)

print(f"✅ Combined {len(csv_files)} files into {output_path}")

✅ Combined 6 files into F:\20251115\tdportal\combined.csv


Combine multiple h5 files

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Combine multiple HDF5 files with identical structure into a single compressed HDF5.

Expected datasets in each input file:
  - file_name      (N,)        |S59
  - group_name     (N,)        |S6
  - ms2_lib        (N, 1600)   float16
  - precursor_mz   (N,)        float32
  - rt_min         (N,)        float32
  - scan           (N,)        int32

Output:
  - One HDF5 with the same dataset names, concatenated along axis 0,
    stored with gzip compression + shuffle + chunking (npz-like behavior).
"""

import os
import glob
import h5py
import numpy as np

# -----------------------------
# User parameters
# -----------------------------
INPUT_DIR = r"F:\20251115\spectra_h5"      # <-- change this
OUTPUT_FILE = r"F:\20251115\spectra_h5/combined.h5"    # <-- change this
PATTERN = "*.h5"                         # which input files to combine

# I/O chunk size when reading/copying (None = whole file at once; use e.g. 100_000 for huge datasets)
COPY_CHUNK_ROWS = 100_000

# HDF5 dataset compression / chunking
H5_COMPRESSION = "gzip"     # "gzip" is closest to npz; you can use "lzf" for faster but less tight
H5_COMP_OPTS = 4            # 1–9 for gzip, higher = more compression, slower
H5_ROW_CHUNK = 4096         # chunk length along the row axis for HDF5 datasets


# -----------------------------
# Helpers
# -----------------------------
def get_files(input_dir, pattern="*.h5"):
    files = sorted(glob.glob(os.path.join(input_dir, pattern)))
    if not files:
        raise FileNotFoundError(f"No files found in {input_dir!r} matching {pattern!r}")
    return files


def inspect_first_file(example_file):
    """
    Read the first file and capture shapes/dtypes to define the output file.
    Also verifies that required datasets exist.
    """
    expected_keys = ["file_name", "group_name", "ms2_lib",
                     "precursor_mz", "rt_min", "scan"]
    info = {}

    with h5py.File(example_file, "r") as f:
        for k in expected_keys:
            if k not in f:
                raise KeyError(f"Dataset {k!r} not found in {example_file}")
            ds = f[k]
            info[k] = {
                "shape": ds.shape,
                "dtype": ds.dtype,
            }

        # quick consistency checks
        n = info["file_name"]["shape"][0]
        for k in expected_keys:
            if f[k].shape[0] != n:
                raise ValueError(
                    f"Dataset {k!r} in {example_file!r} has first dim "
                    f"{f[k].shape[0]} != file_name first dim {n}"
                )

    return info


def compute_total_rows(files):
    """
    Sum up rows across all files and sanity-check that
    each file has internally consistent first dimensions.
    """
    total = 0
    for fp in files:
        with h5py.File(fp, "r") as f:
            n = f["file_name"].shape[0]
            assert f["group_name"].shape[0] == n
            assert f["ms2_lib"].shape[0] == n
            assert f["precursor_mz"].shape[0] == n
            assert f["rt_min"].shape[0] == n
            assert f["scan"].shape[0] == n
        total += n
    return total


def create_output_file(output_file, first_info, total_rows,
                       compression=H5_COMPRESSION,
                       compression_opts=H5_COMP_OPTS,
                       row_chunk=H5_ROW_CHUNK):
    """
    Create the output HDF5 with compressed, chunked datasets.
    """
    if os.path.exists(output_file):
        os.remove(output_file)

    ms2_dim = first_info["ms2_lib"]["shape"][1]

    f_out = h5py.File(output_file, "w")

    # Chunk shapes
    chunk_scalar = (row_chunk,)
    chunk_ms2 = (row_chunk, ms2_dim)

    d_file_name = f_out.create_dataset(
        "file_name",
        shape=(total_rows,),
        dtype=first_info["file_name"]["dtype"],
        compression=compression,
        compression_opts=compression_opts,
        shuffle=True,
        chunks=chunk_scalar,
    )

    d_group_name = f_out.create_dataset(
        "group_name",
        shape=(total_rows,),
        dtype=first_info["group_name"]["dtype"],
        compression=compression,
        compression_opts=compression_opts,
        shuffle=True,
        chunks=chunk_scalar,
    )

    d_ms2_lib = f_out.create_dataset(
        "ms2_lib",
        shape=(total_rows, ms2_dim),
        dtype=first_info["ms2_lib"]["dtype"],
        compression=compression,
        compression_opts=compression_opts,
        shuffle=True,
        chunks=chunk_ms2,
    )

    d_precursor_mz = f_out.create_dataset(
        "precursor_mz",
        shape=(total_rows,),
        dtype=first_info["precursor_mz"]["dtype"],
        compression=compression,
        compression_opts=compression_opts,
        shuffle=True,
        chunks=chunk_scalar,
    )

    d_rt_min = f_out.create_dataset(
        "rt_min",
        shape=(total_rows,),
        dtype=first_info["rt_min"]["dtype"],
        compression=compression,
        compression_opts=compression_opts,
        shuffle=True,
        chunks=chunk_scalar,
    )

    d_scan = f_out.create_dataset(
        "scan",
        shape=(total_rows,),
        dtype=first_info["scan"]["dtype"],
        compression=compression,
        compression_opts=compression_opts,
        shuffle=True,
        chunks=chunk_scalar,
    )

    dsets_out = {
        "file_name": d_file_name,
        "group_name": d_group_name,
        "ms2_lib": d_ms2_lib,
        "precursor_mz": d_precursor_mz,
        "rt_min": d_rt_min,
        "scan": d_scan,
    }

    return f_out, dsets_out


def copy_data(files, dsets_out, chunk_rows=None):
    """
    Concatenate datasets from all input files into output datasets along axis 0.

    If chunk_rows is None:
        - copy each file in one go (simple, more RAM).
    Else:
        - copy in slices of size chunk_rows (safer for huge files).
    """
    offset = 0

    for fp in files:
        with h5py.File(fp, "r") as f_in:
            n_rows = f_in["file_name"].shape[0]
            print(f"Processing {fp} with {n_rows} rows (writing from offset {offset})")

            if chunk_rows is None:
                # Whole file at once
                sl = slice(offset, offset + n_rows)
                dsets_out["file_name"][sl] = f_in["file_name"][...]
                dsets_out["group_name"][sl] = f_in["group_name"][...]
                dsets_out["ms2_lib"][sl] = f_in["ms2_lib"][...]
                dsets_out["precursor_mz"][sl] = f_in["precursor_mz"][...]
                dsets_out["rt_min"][sl] = f_in["rt_min"][...]
                dsets_out["scan"][sl] = f_in["scan"][...]
            else:
                # Chunked copy
                for start in range(0, n_rows, chunk_rows):
                    end = min(start + chunk_rows, n_rows)
                    local_sl = slice(start, end)
                    global_sl = slice(offset + start, offset + end)

                    dsets_out["file_name"][global_sl] = f_in["file_name"][local_sl]
                    dsets_out["group_name"][global_sl] = f_in["group_name"][local_sl]
                    dsets_out["ms2_lib"][global_sl] = f_in["ms2_lib"][local_sl]
                    dsets_out["precursor_mz"][global_sl] = f_in["precursor_mz"][local_sl]
                    dsets_out["rt_min"][global_sl] = f_in["rt_min"][local_sl]
                    dsets_out["scan"][global_sl] = f_in["scan"][local_sl]

        offset += n_rows

    print(f"Done. Total rows written: {offset}")


# -----------------------------
# Main
# -----------------------------
def main():
    files = get_files(INPUT_DIR, PATTERN)
    print(f"Found {len(files)} input files:")
    for f in files:
        print("  -", f)

    first_info = inspect_first_file(files[0])
    total_rows = compute_total_rows(files)
    print(f"Total rows across all files: {total_rows}")

    f_out, dsets_out = create_output_file(OUTPUT_FILE, first_info, total_rows)
    print(f"Created output file: {OUTPUT_FILE}")

    try:
        copy_data(files, dsets_out, chunk_rows=COPY_CHUNK_ROWS)
        print("All data copied successfully.")
    finally:
        f_out.close()
        print("Output file closed.")


if __name__ == "__main__":
    main()


ID import

In [4]:
import re
import numpy as np
import pandas as pd
import h5py
from tqdm import trange


def annotate_h5_with_tdportal(tdportal_csv, in_h5_path, out_h5_path):
    """
    Read tdportal CSV and an input H5 databank, match scans to IDs, and
    write a new H5 file with added annotation datasets:
        - sequence
        - MASS
        - Accession
        - PFR
    
    ALSO PRINTS:
        - number of samples (file names)
        - matched samples vs unmatched samples
        - total scans matched vs unmatched
    """
    # ------------------------------
    # 1) Load tdportal
    # ------------------------------
    tdportal = pd.read_csv(tdportal_csv)

    def str_to_int_list(s):
        """Extract all integers from a string as a list."""
        return [int(x) for x in re.findall(r'\d+', str(s))]

    # Build mapping: (sample_name, scan) -> tdportal row index
    mapping = {}
    samples_in_tdportal = set()

    for i, row in tdportal.iterrows():
        sample = row['File Name']
        samples_in_tdportal.add(sample)
        scans = str_to_int_list(row['Fragment Scans'])
        for sc in scans:
            mapping[(sample, sc)] = i

    # ------------------------------
    # 2) Load input H5 databank
    # ------------------------------
    with h5py.File(in_h5_path, "r") as f:
        file_name = f["file_name"][()]  # (N,)
        scan      = f["scan"][()]      # (N,)

        # decode bytes to str if needed
        def maybe_decode(arr):
            if isinstance(arr[0], (bytes, np.bytes_)):
                return np.array([x.decode("utf-8") for x in arr], dtype=object)
            return arr

        file_name = maybe_decode(file_name)
        scan = scan.astype(int)

        N = len(scan)

        # Pre-allocate outputs
        sequence  = [""] * N
        MASS      = [np.nan] * N
        Accession = [""] * N
        PFR       = [np.nan] * N

        # Statistics tracking
        matched_scan_count = 0
        unmatched_scan_count = 0
        matched_samples = set()
        unmatched_samples = set()

        for i in trange(N, desc="Matching scans"):
            sname = file_name[i]
            sc    = int(scan[i])
            key   = (sname, sc)

            if key in mapping:
                matched_scan_count += 1
                matched_samples.add(sname)

                idx = mapping[key]
                row = tdportal.loc[idx]
                sequence[i]  = str(row.get('Sequence', ""))
                MASS[i]      = float(row.get('Average Mass', np.nan))
                Accession[i] = str(row.get('Accession', ""))
                PFR[i]       = float(row.get('PFR', np.nan))

            else:
                unmatched_scan_count += 1
                unmatched_samples.add(sname)

    # ------------------------------
    # 3) Print statistics
    # ------------------------------
    total_samples = len(np.unique(file_name))

    print("\n===== MATCHING SUMMARY =====")
    print(f"Total samples in H5: {total_samples}")
    print(f"Samples in tdportal: {len(samples_in_tdportal)}")
    print(f"Matched samples:     {len(matched_samples)}")
    print(f"Unmatched samples:   {len(unmatched_samples)}")

    print("----------------------------------")
    print(f"Total scans in H5:   {N}")
    print(f"Matched scans:       {matched_scan_count}")
    print(f"Unmatched scans:     {unmatched_scan_count}")
    print("==================================\n")

    # ------------------------------
    # 4) Write new H5 with annotations
    # ------------------------------
    dt_str = h5py.string_dtype(encoding='utf-8')

    with h5py.File(in_h5_path, "r") as fin, h5py.File(out_h5_path, "w") as fout:
        # Copy datasets
        for name, item in fin.items():
            fout.create_dataset(name, data=item[()], compression="gzip")

        # Add annotation datasets
        fout.create_dataset("sequence",  data=np.array(sequence,  dtype=dt_str))
        fout.create_dataset("MASS",      data=np.array(MASS,      dtype=np.float32))
        fout.create_dataset("Accession", data=np.array(Accession, dtype=dt_str))
        fout.create_dataset("PFR",       data=np.array(PFR,       dtype=np.float32))


# ------------------------------
# Example usage
# ------------------------------
if __name__ == "__main__":
    tdportal_csv = r"F:\20251115\tdportal\combined.csv"
    in_h5_path   = r"F:\20251115\spectra_h5\combined.h5"
    out_h5_path  = r"F:\20251115\spectra_h5\combined_annotated.h5"

    annotate_h5_with_tdportal(tdportal_csv, in_h5_path, out_h5_path)


  tdportal = pd.read_csv(tdportal_csv)
Matching scans: 100%|██████████| 2202567/2202567 [00:55<00:00, 39615.67it/s] 



===== MATCHING SUMMARY =====
Total samples in H5: 352
Samples in tdportal: 352
Matched samples:     350
Unmatched samples:   352
----------------------------------
Total scans in H5:   2202567
Matched scans:       398060
Unmatched scans:     1804507

