## convert h5 to pt files

### get the full file name for h5

In [2]:
import os
import pandas as pd

# Load your clam CSV
clam_df = pd.read_csv("./clam_data/TCGA_COAD.csv")

# List of all h5 filenames
h5_dir = "./UNI2-h_features/TCGA-COAD"
all_h5_files = os.listdir(h5_dir)

# Create a mapping: slide_id -> matched full h5 filename
slide_to_h5 = {}
for slide_id in clam_df["slide_id"]:
    matches = [f for f in all_h5_files if f.startswith(slide_id)]
    if matches:
        slide_to_h5[slide_id] = matches[0]
    else:
        print(f"No match found for {slide_id}")

# Optional: add to dataframe for verification
clam_df["matched_h5"] = clam_df["slide_id"].map(slide_to_h5)

# See how many matched
print(f"Matched: {clam_df['matched_h5'].notnull().sum()} / {len(clam_df)}")

Matched: 439 / 439


In [4]:
import shutil

output_dir = "./clam_data/h5_files"
os.makedirs(output_dir, exist_ok=True)

for fname in clam_df["matched_h5"].dropna():
    src = os.path.join(h5_dir, fname)
    dst = os.path.join(output_dir, fname)
    shutil.copyfile(src, dst)


### create pt files

In [5]:
import os
import h5py
import torch
from tqdm import tqdm

# Define paths
h5_dir = "./clam_data/h5_files"
pt_dir = "./clam_data/pt_files"
os.makedirs(pt_dir, exist_ok=True)

# Loop through h5 files and extract features
for fname in tqdm(os.listdir(h5_dir)):
    if not fname.endswith(".h5"):
        continue
    slide_id = fname.replace(".h5", "")
    h5_path = os.path.join(h5_dir, fname)
    pt_path = os.path.join(pt_dir, slide_id + ".pt")

    with h5py.File(h5_path, "r") as f:
        if "features" not in f:
            print(f"Missing 'features' in {fname}, skipping...")
            continue
        feats = f["features"][:]
        tensor = torch.from_numpy(feats).float()
        torch.save({"features": tensor}, pt_path)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 431/431 [00:19<00:00, 22.68it/s]


### sanity cross-check

In [11]:
import os
import h5py
import torch
import pandas as pd

# Paths
csv_path = "./clam_data/TCGA_COAD.csv"
h5_dir = "./clam_data/h5_files"
pt_dir = "./clam_data/pt_files"

# Load CSV
df = pd.read_csv(csv_path)
slide_ids = set(df["slide_id"])

# Collect file basenames (without extensions)
h5_files = {f.replace(".h5", "") for f in os.listdir(h5_dir) if f.endswith(".h5")}
pt_files = {f.replace(".pt", "") for f in os.listdir(pt_dir) if f.endswith(".pt")}

# Check that all CSV slide_ids have both .h5 and .pt
csv_missing_h5 = slide_ids - h5_files
csv_missing_pt = slide_ids - pt_files

# Check that all .pt files have a .h5
pt_missing_h5 = pt_files - h5_files

# Check feature shape consistency
bad_shape = []
for sid in sorted(slide_ids & h5_files & pt_files):
    h5_path = os.path.join(h5_dir, sid + ".h5")
    pt_path = os.path.join(pt_dir, sid + ".pt")

    with h5py.File(h5_path, "r") as f:
        h5_feats = f["features"][:]
    pt_feats = torch.load(pt_path)["features"]

    if h5_feats.shape != pt_feats.shape:
        bad_shape.append((sid, h5_feats.shape, pt_feats.shape))

# Results
print(f"\nSanity Check Summary")
print(f"CSV entries        : {len(slide_ids)}")
print(f"H5 files available : {len(h5_files)}")
print(f"PT files available : {len(pt_files)}")
print(f"Matched triplets : {len(slide_ids & h5_files & pt_files)}")


print(f"\nMismatched feature shapes: {len(bad_shape)}")
if bad_shape:
    for sid, h5_shape, pt_shape in bad_shape[:5]:  # print only first 5
        print(f"  {sid}: h5={h5_shape}, pt={pt_shape}")
else:
    print("All patch features match between h5 and pt.")



Sanity Check Summary
CSV entries        : 431
H5 files available : 431
PT files available : 431
Matched triplets : 0

Mismatched feature shapes: 0
All patch features match between h5 and pt.


In [15]:
import os
import pandas as pd

csv_path = "CLAM/clam_data/task_1_tumor_vs_normal.csv"
pt_dir = "CLAM/clam_data/task_1_tumor_vs_normal/pt_files"

# Load CSV
df = pd.read_csv(csv_path)

# Rename files
renamed = 0
for slide_id in df["slide_id"]:
    for fname in os.listdir(pt_dir):
        if fname.startswith(slide_id) and fname.endswith(".pt"):
            old_path = os.path.join(pt_dir, fname)
            new_path = os.path.join(pt_dir, f"{slide_id}.pt")
            os.rename(old_path, new_path)
            print(f"✅ Renamed: {fname} → {slide_id}.pt")
            renamed += 1
            break
    else:
        print(f"⚠️ No match found for {slide_id}")

print(f"\n🎉 Renamed {renamed} .pt files total")


✅ Renamed: TCGA-3L-AA1B-01Z-00-DX1.8923A151-A690-40B7-9E5A-FCBEDFC2394F.pt → TCGA-3L-AA1B.pt
✅ Renamed: TCGA-3L-AA1B.pt → TCGA-3L-AA1B.pt
✅ Renamed: TCGA-4N-A93T-01Z-00-DX2.875E7F95-A6D4-4BEB-A331-F9D8080898C2.pt → TCGA-4N-A93T.pt
✅ Renamed: TCGA-4N-A93T.pt → TCGA-4N-A93T.pt
✅ Renamed: TCGA-5M-AAT4-01Z-00-DX1.725C46CA-9354-43AC-AA81-3E5A66354D6B.pt → TCGA-5M-AAT4.pt
✅ Renamed: TCGA-5M-AAT6-01Z-00-DX1.8834C952-14E3-4491-8156-52FC917BB014.pt → TCGA-5M-AAT6.pt
✅ Renamed: TCGA-5M-AATE-01Z-00-DX1.483FFD2F-61A1-477E-8F94-157383803FC7.pt → TCGA-5M-AATE.pt
✅ Renamed: TCGA-A6-2671-01Z-00-DX1.13d1a0d9-78cd-4cfc-b670-34a79ebe52ee.pt → TCGA-A6-2671.pt
✅ Renamed: TCGA-A6-2672-01Z-00-DX1.e2a845c8-6d77-4120-9f43-abec84a66c1c.pt → TCGA-A6-2672.pt
✅ Renamed: TCGA-A6-2674-01Z-00-DX1.d301f1f5-6f4a-49e6-9c93-f4e8b7f616b8.pt → TCGA-A6-2674.pt
✅ Renamed: TCGA-A6-2675-01Z-00-DX1.d37847d6-c17f-44b9-b90a-84cd1946c8ab.pt → TCGA-A6-2675.pt
✅ Renamed: TCGA-A6-2676-01Z-00-DX1.c465f6e0-b47c-48e9-bdb1-67077bb16c67.p