In [2]:
import os
import h5py
import numpy as np

input_dir = "/orcd/data/edboyden/002/ezh/uni/virchow_features_no_l2"
output_dir = "/orcd/data/edboyden/002/ezh/uni/virchow_features"
os.makedirs(output_dir, exist_ok=True)

for fname in os.listdir(input_dir):
    if not fname.endswith(".h5"):
        continue

    input_path = os.path.join(input_dir, fname)
    output_path = os.path.join(output_dir, fname)

    with h5py.File(input_path, "r") as f:
        features = f["features"][:]
        coords = f["coords"][:] if "coords" in f else None
        label = f["mmr_status"][()] if "mmr_status" in f else None

    # L2 normalization
    norms = np.linalg.norm(features, axis=1, keepdims=True)
    normalized_features = features / (norms + 1e-8)

    with h5py.File(output_path, "w") as f_out:
        f_out.create_dataset("features", data=normalized_features.astype(np.float32))
        if coords is not None:
            f_out.create_dataset("coords", data=coords)
        if label is not None:
            f_out.create_dataset("mmr_status", data=label)

    print(f"✅ Normalized {fname}")


✅ Normalized TCGA-DM-A28G.h5
✅ Normalized TCGA-CK-5913.h5
✅ Normalized TCGA-D5-6530.h5
✅ Normalized TCGA-F4-6461.h5
✅ Normalized TCGA-DM-A1D7.h5
✅ Normalized TCGA-DC-4745.h5
✅ Normalized TCGA-AA-3850.h5
✅ Normalized TCGA-AA-A00F.h5
✅ Normalized TCGA-CM-6676.h5
✅ Normalized TCGA-AA-3530.h5
✅ Normalized TCGA-G4-6625.h5
✅ Normalized TCGA-AA-3696.h5
✅ Normalized TCGA-AF-A56K.h5
✅ Normalized TCGA-D5-6932.h5
✅ Normalized TCGA-G4-6321.h5
✅ Normalized TCGA-A6-2678.h5
✅ Normalized TCGA-AZ-4313.h5
✅ Normalized TCGA-D5-6540.h5
✅ Normalized TCGA-A6-5667.h5
✅ Normalized TCGA-AA-3955.h5
✅ Normalized TCGA-AA-3681.h5
✅ Normalized TCGA-AA-3837.h5
✅ Normalized TCGA-AA-3527.h5
✅ Normalized TCGA-DC-5337.h5
✅ Normalized TCGA-F4-6459.h5
✅ Normalized TCGA-A6-6142.h5
✅ Normalized TCGA-AY-6197.h5
✅ Normalized TCGA-AY-A71X.h5
✅ Normalized TCGA-DY-A1DC.h5
✅ Normalized TCGA-AD-6901.h5
✅ Normalized TCGA-F5-6464.h5
✅ Normalized TCGA-AA-3818.h5
✅ Normalized TCGA-AY-6196.h5
✅ Normalized TCGA-AG-3731.h5
✅ Normalized T