In [4]:
import h5py

# --- path to your file ---
filepath = "F:/20251115/ms2_tissue_dataset.h5"


def print_structure(name, obj):
    """Visitor for ``visititems``: print one line describing a group or dataset.

    Returns None (implicitly), so the traversal continues over every item.
    """
    if isinstance(obj, h5py.Group):
        print(f"[GROUP] {name}")
    elif isinstance(obj, h5py.Dataset):
        print(f"[DATASET] {name} - shape: {obj.shape}, dtype: {obj.dtype}")


with h5py.File(filepath, "r") as h5_file:
    # Top-level members only
    print("Keys in file:")
    for top_level_name in h5_file.keys():
        print("  ", top_level_name)

    # Every group and dataset reachable from the root
    print("\nFull structure:")
    h5_file.visititems(print_structure)


Keys in file:
   file_name
   group_name
   ms2_lib
   precursor_mz
   rt_min
   scan

Full structure:
[DATASET] file_name - shape: (983170,), dtype: |S59
[DATASET] group_name - shape: (983170,), dtype: |S6
[DATASET] ms2_lib - shape: (983170, 1600), dtype: float16
[DATASET] precursor_mz - shape: (983170,), dtype: float32
[DATASET] rt_min - shape: (983170,), dtype: float32
[DATASET] scan - shape: (983170,), dtype: int32


Combine multiple tdportal reports

In [5]:
import pandas as pd
import glob
import os

# --- Folder path containing your CSV files ---
folder = r"F:\20251115\tdportal"   # ⬅️ change this to your actual folder path

# --- Where the combined CSV is written (same folder as the inputs) ---
output_path = os.path.join(folder, "combined.csv")

# --- Find all CSV files in the folder ---
# The output file sits in the same folder and matches "*.csv". Skip it, otherwise
# re-running this cell reads the previous result back in and duplicates every row.
# Sorting makes the row order of the combined file reproducible.
_output_key = os.path.normcase(os.path.abspath(output_path))
csv_files = sorted(
    path
    for path in glob.glob(os.path.join(folder, "*.csv"))
    if os.path.normcase(os.path.abspath(path)) != _output_key
)
if not csv_files:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No input CSV files found in {folder!r}")

# --- Read each CSV and combine them by row ---
dfs = [pd.read_csv(f) for f in csv_files]
combined = pd.concat(dfs, ignore_index=True)

# --- Save the combined CSV ---
combined.to_csv(output_path, index=False)

print(f"✅ Combined {len(csv_files)} files into {output_path}")

✅ Combined 6 files into F:\20251115\tdportal\combined.csv


In [None]:
def ID_import(tdportal, databank, cast_path):
    """
    Annotate every databank scan with its tdportal identification and save as CSV.

    For each databank row ``i`` the pair (``sample_name[i]``, ``scan[i]``) is
    looked up among the fragment scans listed in ``tdportal``. On a match the
    'Sequence', 'Average Mass', 'Accession' and 'PFR' values of the FIRST
    tdportal row listing that scan for that file are used; otherwise the four
    values are None.

    Parameters
    ----------
    tdportal : pandas.DataFrame
        Needs the columns 'File Name' and 'Fragment Scans'; 'Sequence',
        'Average Mass', 'Accession' and 'PFR' are read for matched rows.
        Every run of digits in str('Fragment Scans') counts as one scan number.
        NOTE(review): assumes unique index labels so that ``.at`` returns
        scalars - confirm for frames that were not built with a fresh index.
    databank : dict-like or pandas.DataFrame
        Needs 'sample_name' and 'scan', both indexable by 0..n-1. It is
        modified in place: the keys 'sequence', 'MASS', 'Accession' and 'PFR'
        are added.
    cast_path : str or path-like
        Destination of the CSV written from the annotated databank.

    Returns
    -------
    tuple
        Always the empty tuple ``()``; results are delivered through the
        in-place update of ``databank`` and the CSV file.

    Side effects
    ------------
    Prints the set of databank sample names that do not occur in
    tdportal['File Name'], and (only if any occurred) the number of rows that
    could not be annotated because of an unexpected error.
    """
    import re  # local import: the defining cell does not import it itself

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; fall back to plain iteration without it.
        def tqdm(iterable, **_kwargs):
            return iterable

    # sample file name -> {scan number -> label of the first tdportal row listing it}.
    # A dict replaces the former "scan in list" / list.index() pair, which
    # scanned the whole per-sample list for every databank row.
    scan_to_row = {}
    for row_label, file_name, fragment_scans in zip(
        tdportal.index, tdportal['File Name'], tdportal['Fragment Scans']
    ):
        per_sample = scan_to_row.setdefault(file_name, {})
        for digits in re.findall(r'\d+', str(fragment_scans)):
            # setdefault keeps the first row, matching list.index() semantics.
            per_sample.setdefault(int(digits), row_label)

    sample_names = databank['sample_name']
    scans = databank['scan']

    no_match = (None, None, None, None)
    sequence, MASS, Accession, PFR = [], [], [], []
    missing = set()
    failed = 0

    for i in tqdm(range(len(scans)), desc="Processing scans", ncols=100):
        values = no_match
        try:
            sample = sample_names[i]
            per_sample = scan_to_row.get(sample)
            if per_sample is None:
                missing.add(sample)
            else:
                scan = scans[i]
                if scan in per_sample:
                    row_label = per_sample[scan]
                    # Read all four values BEFORE appending anything, so a
                    # failure halfway cannot leave the result lists misaligned.
                    values = (
                        tdportal.at[row_label, 'Sequence'],
                        tdportal.at[row_label, 'Average Mass'],
                        tdportal.at[row_label, 'Accession'],
                        tdportal.at[row_label, 'PFR'],
                    )
        except Exception:
            # Best effort, as before: an unexpected problem (missing column,
            # unhashable key, ...) leaves this row unannotated instead of
            # aborting the run, but it is counted and reported below.
            failed += 1
            values = no_match

        sequence.append(values[0])
        MASS.append(values[1])
        Accession.append(values[2])
        PFR.append(values[3])

    print(missing)
    if failed:
        print(f"{failed} scan(s) could not be annotated because of unexpected errors")

    databank['sequence'] = sequence
    databank['MASS'] = MASS
    databank['Accession'] = Accession
    databank['PFR'] = PFR

    databank = pd.DataFrame(databank)

    databank.to_csv(cast_path, index=False)

    return ()

In [7]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Combine multiple HDF5 files with identical structure into a single compressed HDF5.

Expected datasets in each input file:
  - file_name      (N,)        |S59
  - group_name     (N,)        |S6
  - ms2_lib        (N, 1600)   float16
  - precursor_mz   (N,)        float32
  - rt_min         (N,)        float32
  - scan           (N,)        int32

Output:
  - One HDF5 with the same dataset names, concatenated along axis 0 in
    sorted input-file order. The output dtypes are taken from the first
    input file. Datasets are stored with gzip compression + shuffle +
    chunking (npz-like behavior).
"""

import os
import glob
import h5py
import numpy as np

# -----------------------------
# User parameters
# -----------------------------
# Folder holding the input files, and the glob that selects which of them to combine.
INPUT_DIR = r"F:\20251115\spectra_h5"      # <-- change this
PATTERN = "*.h5"

# Destination of the merged file.
# NOTE: this path is inside INPUT_DIR and its name matches PATTERN.
OUTPUT_FILE = r"F:\20251115\spectra_h5/combined.h5"    # <-- change this

# Rows moved per read/write step while copying.
# None = copy each input file in one go; an integer (e.g. 100_000) bounds memory use on huge datasets.
COPY_CHUNK_ROWS = 100_000

# Storage settings of the output datasets.
H5_ROW_CHUNK = 4096         # HDF5 chunk length along the row axis
H5_COMPRESSION = "gzip"     # "gzip" is closest to npz; "lzf" is faster but compresses less tightly
H5_COMP_OPTS = 4            # gzip level 1–9: higher = smaller file, slower


# -----------------------------
# Helpers
# -----------------------------
def get_files(input_dir, pattern="*.h5"):
    """
    Return the sorted paths in ``input_dir`` whose names match ``pattern``.

    Raises FileNotFoundError when nothing matches.
    """
    matches = glob.glob(os.path.join(input_dir, pattern))
    if len(matches) == 0:
        raise FileNotFoundError(f"No files found in {input_dir!r} matching {pattern!r}")
    matches.sort()
    return matches


def inspect_first_file(example_file):
    """
    Collect shape and dtype of every required dataset in ``example_file``.

    Returns a dict ``{dataset_name: {"shape": ..., "dtype": ...}}``.

    Raises
    ------
    KeyError
        If a required dataset is absent from the file.
    ValueError
        If a dataset's first dimension differs from that of ``file_name``.
    """
    required = ("file_name", "group_name", "ms2_lib",
                "precursor_mz", "rt_min", "scan")

    with h5py.File(example_file, "r") as h5_file:
        # Pass 1: presence check + metadata capture
        info = {}
        for name in required:
            if name not in h5_file:
                raise KeyError(f"Dataset {name!r} not found in {example_file}")
            dataset = h5_file[name]
            info[name] = dict(shape=dataset.shape, dtype=dataset.dtype)

        # Pass 2: all datasets must have as many rows as file_name
        expected_rows = info["file_name"]["shape"][0]
        for name in required:
            rows = h5_file[name].shape[0]
            if rows == expected_rows:
                continue
            raise ValueError(
                f"Dataset {name!r} in {example_file!r} has first dim "
                f"{rows} != file_name first dim {expected_rows}"
            )

    return info


def compute_total_rows(files):
    """
    Sum the row counts of all input files, validating them on the way.

    Checks performed for every file:
      - all six datasets have the same first dimension as ``file_name``;
      - the trailing dimensions of each dataset (e.g. the width of
        ``ms2_lib``) equal those of the first file, so the files can be
        concatenated along axis 0.

    Parameters
    ----------
    files : iterable of str
        Paths of the HDF5 files to inspect.

    Returns
    -------
    int
        Total number of rows across all files (0 for an empty ``files``).

    Raises
    ------
    ValueError
        If a file is internally inconsistent or incompatible with the first
        file. Explicit exceptions replace bare ``assert`` statements, which
        carry no message and are removed when Python runs with ``-O``.
    KeyError
        If a required dataset is missing (raised by the file lookup).
    """
    names = ("file_name", "group_name", "ms2_lib",
             "precursor_mz", "rt_min", "scan")

    total = 0
    reference_tails = None   # trailing dims of the first file's datasets
    reference_file = None

    for fp in files:
        with h5py.File(fp, "r") as f:
            shapes = {name: f[name].shape for name in names}

        n = shapes["file_name"][0]
        for name, shape in shapes.items():
            if shape[0] != n:
                raise ValueError(
                    f"Dataset {name!r} in {fp!r} has first dim "
                    f"{shape[0]} != file_name first dim {n}"
                )

        tails = {name: tuple(shape[1:]) for name, shape in shapes.items()}
        if reference_tails is None:
            reference_tails, reference_file = tails, fp
        else:
            for name in names:
                if tails[name] != reference_tails[name]:
                    raise ValueError(
                        f"Dataset {name!r} in {fp!r} has trailing dims "
                        f"{tails[name]} != {reference_tails[name]} in {reference_file!r}"
                    )

        total += n
    return total


def create_output_file(output_file, first_info, total_rows,
                       compression=H5_COMPRESSION,
                       compression_opts=H5_COMP_OPTS,
                       row_chunk=H5_ROW_CHUNK):
    """
    Create the output HDF5 with compressed, chunked datasets.

    Parameters
    ----------
    output_file : str
        Path of the file to create; an existing file at that path is removed.
    first_info : dict
        ``{dataset_name: {"shape": ..., "dtype": ...}}`` for the six datasets,
        as returned by ``inspect_first_file``. Output dtypes and the second
        dimension of ``ms2_lib`` are taken from it.
    total_rows : int
        First dimension of every output dataset.
    compression, compression_opts : optional
        Passed to ``create_dataset``. ``compression_opts`` is dropped for
        ``"lzf"``, which accepts no options.
    row_chunk : int
        Requested chunk length along the row axis; clamped to ``total_rows``.

    Returns
    -------
    (h5py.File, dict)
        The open output file (the caller must close it) and a dict mapping
        dataset name -> created dataset, in creation order.
    """
    if os.path.exists(output_file):
        os.remove(output_file)

    ms2_dim = first_info["ms2_lib"]["shape"][1]

    # h5py rejects a chunk shape larger than the dataset itself, so a fixed
    # row_chunk would make small merges (total_rows < row_chunk) fail.
    rows_per_chunk = max(1, min(row_chunk, total_rows))

    # LZF takes no options; passing the gzip level along would raise.
    if compression == "lzf":
        compression_opts = None

    # Dimensions following the row axis, per dataset (creation order matters
    # only for readability of the resulting file listing).
    trailing_dims = {
        "file_name": (),
        "group_name": (),
        "ms2_lib": (ms2_dim,),
        "precursor_mz": (),
        "rt_min": (),
        "scan": (),
    }

    f_out = h5py.File(output_file, "w")
    try:
        dsets_out = {}
        for name, tail in trailing_dims.items():
            dsets_out[name] = f_out.create_dataset(
                name,
                shape=(total_rows,) + tail,
                dtype=first_info[name]["dtype"],
                compression=compression,
                compression_opts=compression_opts,
                shuffle=True,
                chunks=(rows_per_chunk,) + tail,
            )
    except BaseException:
        # Do not leak the open handle when dataset creation fails.
        f_out.close()
        raise

    return f_out, dsets_out


def copy_data(files, dsets_out, chunk_rows=None):
    """
    Concatenate datasets from all input files into output datasets along axis 0.

    Every dataset named in ``dsets_out`` is read from each input file and
    written at the running row offset. The row count of an input file is
    taken from its ``file_name`` dataset.

    Parameters
    ----------
    files : iterable of str
        Input HDF5 paths, copied in the given order.
    dsets_out : dict
        Maps dataset name -> writable output dataset that supports slice
        assignment along axis 0.
    chunk_rows : int or None
        None copies each file in one go (simple, more RAM); a positive
        integer copies in slices of that many rows (safer for huge files).

    Raises
    ------
    ValueError
        If ``chunk_rows`` is neither None nor a positive integer. A negative
        value used to produce an empty range and silently copy nothing.
    """
    if chunk_rows is not None and chunk_rows < 1:
        raise ValueError(
            f"chunk_rows must be a positive integer or None, got {chunk_rows!r}"
        )

    offset = 0

    for fp in files:
        with h5py.File(fp, "r") as f_in:
            n_rows = f_in["file_name"].shape[0]
            print(f"Processing {fp} with {n_rows} rows (writing from offset {offset})")

            # "Whole file at once" is the chunked copy with a single chunk.
            step = n_rows if chunk_rows is None else chunk_rows

            for start in range(0, n_rows, max(step, 1)):
                end = min(start + step, n_rows)
                for name, dset in dsets_out.items():
                    dset[offset + start:offset + end] = f_in[name][start:end]

        offset += n_rows

    print(f"Done. Total rows written: {offset}")


# -----------------------------
# Main
# -----------------------------
def main():
    """
    Merge every HDF5 file matching PATTERN in INPUT_DIR into OUTPUT_FILE.

    Steps: list the inputs, read dataset shapes/dtypes from the first one,
    count the total rows, create the output file, copy the data, and always
    close the output file afterwards.

    Raises FileNotFoundError when there is nothing to merge.
    """
    # OUTPUT_FILE may sit inside INPUT_DIR and match PATTERN. Without this
    # filter a re-run would count the previous output as an input and then
    # delete it while creating the new output, before it could be copied.
    output_key = os.path.normcase(os.path.abspath(OUTPUT_FILE))
    files = [
        fp for fp in get_files(INPUT_DIR, PATTERN)
        if os.path.normcase(os.path.abspath(fp)) != output_key
    ]
    if not files:
        raise FileNotFoundError(
            f"No input files in {INPUT_DIR!r} matching {PATTERN!r} "
            f"other than the output file {OUTPUT_FILE!r}"
        )

    print(f"Found {len(files)} input files:")
    for f in files:
        print("  -", f)

    first_info = inspect_first_file(files[0])
    total_rows = compute_total_rows(files)
    print(f"Total rows across all files: {total_rows}")

    f_out, dsets_out = create_output_file(OUTPUT_FILE, first_info, total_rows)
    print(f"Created output file: {OUTPUT_FILE}")

    try:
        copy_data(files, dsets_out, chunk_rows=COPY_CHUNK_ROWS)
        print("All data copied successfully.")
    finally:
        # Close even when the copy fails so the file handle is not leaked.
        f_out.close()
        print("Output file closed.")


if __name__ == "__main__":
    main()


Found 2 input files:
  - F:\20251115\spectra_h5\ms2_plasma_dataset.h5
  - F:\20251115\spectra_h5\ms2_tissue_dataset.h5
Total rows across all files: 2202567
Created output file: F:\20251115\spectra_h5/combined.h5
Processing F:\20251115\spectra_h5\ms2_plasma_dataset.h5 with 1219397 rows (writing from offset 0)
Processing F:\20251115\spectra_h5\ms2_tissue_dataset.h5 with 983170 rows (writing from offset 1219397)
Done. Total rows written: 2202567
All data copied successfully.
Output file closed.


In [9]:
import h5py

path = r"F:\20251115\spectra_h5\combined.h5"   # <-- change this to your file name

# Quick look at the merged file: top-level names, then shape and dtype of each.
with h5py.File(path, "r") as h5_file:
    names = list(h5_file.keys())
    print("Keys:", names)

    for name in names:
        dataset = h5_file[name]
        print(f"[{name}] shape={dataset.shape}, dtype={dataset.dtype}")


Keys: ['file_name', 'group_name', 'ms2_lib', 'precursor_mz', 'rt_min', 'scan']
[file_name] shape=(2202567,), dtype=object
[group_name] shape=(2202567,), dtype=object
[ms2_lib] shape=(2202567, 1600), dtype=float16
[precursor_mz] shape=(2202567,), dtype=float32
[rt_min] shape=(2202567,), dtype=float32
[scan] shape=(2202567,), dtype=int64
