## Look at the h5 files

In [1]:
# Report the kernel's working directory, move up one directory, then report it again.
# NOTE(review): the cells visible in this notebook use absolute paths, so this
# directory change does not appear to be required by them — confirm before relying on it.
%pwd
%cd ..
%pwd

/orcd/data/edboyden/002/ezh/uni


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


'/orcd/data/edboyden/002/ezh/uni'

In [2]:
import h5py

# Peek inside a single feature file: list its top-level datasets, show the
# feature-matrix shape, and print the stored KRAS label when one is present.
sample_h5_path = "/orcd/data/edboyden/002/ezh/uni/virchow_features/TCGA-3L-AA1B.h5"
with h5py.File(sample_h5_path, "r") as h5_file:
    dataset_names = list(h5_file.keys())
    print(dataset_names)
    # Shape is (N_patches, feature_dim); the recorded run shows (8988, 2560).
    print(h5_file["features"].shape)
    if "KRAS_binary" in h5_file:
        # [()] reads the whole (scalar) dataset into memory.
        print(h5_file["KRAS_binary"][()])

['HER2_Amp', 'KRAS_binary', 'Lymphovascular invasion indicator', 'coords', 'features', 'mmr_status']
(8988, 2560)
0


## Match each h5 file to its label

In [2]:
import pandas as pd

# Read the per-sample KRAS label table and preview its first ten rows.
labels_csv_path = '/orcd/data/edboyden/002/ezh/uni/datasets/kras_labels.csv'
mutation_df = pd.read_csv(labels_csv_path)

display(mutation_df.head(10))

Unnamed: 0,index,Sample_ID,KRAS,KRAS_binary
0,0,TCGA-3L-AA1B,WT,0.0
1,1,TCGA-4N-A93T,G12D,1.0
2,2,TCGA-4T-AA8H,G12V,1.0
3,3,TCGA-5M-AAT4,G12D,1.0
4,4,TCGA-5M-AAT5,G12D,1.0
5,5,TCGA-5M-AAT6,G12D,1.0
6,6,TCGA-5M-AATA,WT,0.0
7,7,TCGA-5M-AATE,G13D,1.0
8,8,TCGA-A6-2670,WT,0.0
9,9,TCGA-A6-2671,WT,0.0


In [3]:
import os
import pandas as pd
from glob import glob

In [4]:
# Name of the label column in the CSV; also used as the dataset name inside each .h5 file.
mutation = "KRAS_binary"

# Directory holding one feature file per slide.
h5_folder = "/orcd/data/edboyden/002/ezh/uni/virchow_features"

# Collect every .h5 file in the folder, in sorted order.
h5_paths = glob(os.path.join(h5_folder, "*.h5"))
h5_paths.sort()

print(f"\nFound {len(h5_paths)} .h5 files")


Found 493 .h5 files


In [5]:
# Extract TCGA ID from each .h5 filename
def extract_tcga_id(path):
    """Return the first three dash-separated fields (``TCGA-XX-YYYY``) of a file name.

    The directory part and the final file extension are removed before
    splitting, so both ``TCGA-3L-AA1B.h5`` and a longer name such as
    ``TCGA-3L-AA1B-01Z-00-DX1.h5`` yield ``TCGA-3L-AA1B``.

    Args:
        path: Path (or bare file name) of a feature file.

    Returns:
        The ID string built from the first three ``-``-separated fields.
    """
    # Strip the extension first: otherwise a plain "<barcode>.h5" name keeps
    # ".h5" glued to the third field and the result is not a clean ID.
    stem, _ext = os.path.splitext(os.path.basename(path))
    return "-".join(stem.split("-")[:3])  # TCGA-XX-YYYY

# One row per feature file: its path and the TCGA ID parsed from the file name.
h5_df = pd.DataFrame({
    "h5_path": h5_paths,
    "Sample_ID": [extract_tcga_id(p) for p in h5_paths],
})

# Drop any ".h5" left on the ID. regex=False makes the match literal: on
# pandas < 2.0 the default was regex=True, where "." matches any character and
# the pattern could remove part of a real ID (e.g. "Ah5").
h5_df['Sample_ID'] = h5_df['Sample_ID'].str.replace('.h5', '', regex=False)

print("\nFirst 5 extracted TCGA IDs:")
print(h5_df.head())


First 5 extracted TCGA IDs:
                                             h5_path     Sample_ID
0  /orcd/data/edboyden/002/ezh/uni/virchow_featur...  TCGA-3L-AA1B
1  /orcd/data/edboyden/002/ezh/uni/virchow_featur...  TCGA-4N-A93T
2  /orcd/data/edboyden/002/ezh/uni/virchow_featur...  TCGA-4T-AA8H
3  /orcd/data/edboyden/002/ezh/uni/virchow_featur...  TCGA-5M-AAT4
4  /orcd/data/edboyden/002/ezh/uni/virchow_featur...  TCGA-5M-AAT5


In [6]:
# Print the first few IDs from each table so their formats can be compared
# by eye before merging on Sample_ID.
for frame_name, frame in (("mutation_df", mutation_df), ("h5_df", h5_df)):
    print(f"\nSample_ID format in {frame_name}:")
    print(frame['Sample_ID'].head())


Sample_ID format in mutation_df:
0    TCGA-3L-AA1B
1    TCGA-4N-A93T
2    TCGA-4T-AA8H
3    TCGA-5M-AAT4
4    TCGA-5M-AAT5
Name: Sample_ID, dtype: object

Sample_ID format in h5_df:
0    TCGA-3L-AA1B
1    TCGA-4N-A93T
2    TCGA-4T-AA8H
3    TCGA-5M-AAT4
4    TCGA-5M-AAT5
Name: Sample_ID, dtype: object


In [7]:
# Attach a label row to every feature file. how="left" keeps files that have
# no entry in the label table; their label columns are NaN.
merged_df = h5_df.merge(mutation_df, on="Sample_ID", how="left")

# A left merge repeats an h5 row once per matching label row, so duplicate
# Sample_IDs in mutation_df would silently inflate the table. Fail loudly instead.
assert len(merged_df) == len(h5_df), (
    f"merge produced {len(merged_df)} rows for {len(h5_df)} h5 files; "
    "check mutation_df for duplicate Sample_ID values"
)

display(merged_df.head(10))

# Split into files that received a label and files that did not.
has_label = merged_df[mutation].notna()
matched = merged_df.dropna(subset=[mutation])
unmatched = merged_df[~has_label]

# Print results
print(f"\nMatched: {len(matched)}")
print(f"Unmatched: {len(unmatched)}")
print("\nMatched sample preview:")
print(matched[["Sample_ID", mutation]].head(5))

Unnamed: 0,h5_path,Sample_ID,index,KRAS,KRAS_binary
0,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-3L-AA1B,0,WT,0.0
1,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-4N-A93T,1,G12D,1.0
2,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-4T-AA8H,2,G12V,1.0
3,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-5M-AAT4,3,G12D,1.0
4,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-5M-AAT5,4,G12D,1.0
5,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-5M-AAT6,5,G12D,1.0
6,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-5M-AATE,7,G13D,1.0
7,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-A6-2671,9,WT,0.0
8,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-A6-2672,10,WT,0.0
9,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-A6-2674,11,WT,0.0



Matched: 490
Unmatched: 3

Matched sample preview:
      Sample_ID  KRAS_binary
0  TCGA-3L-AA1B          0.0
1  TCGA-4N-A93T          1.0
2  TCGA-4T-AA8H          1.0
3  TCGA-5M-AAT4          1.0
4  TCGA-5M-AAT5          1.0


In [8]:
# Class balance among the matched files (in the label table, WT rows carry 0.0
# and mutated rows carry 1.0).
matched_label_counts = matched[mutation].value_counts()
print(matched_label_counts)

KRAS_binary
0.0    295
1.0    195
Name: count, dtype: int64


### Save the mutation status as a dataset in each h5 file

In [9]:
import h5py

# Write each matched file's label into the .h5 file itself, as a scalar dataset
# named after `mutation`. An existing dataset of that name is replaced.
# Writing is best-effort: a failure on one file is reported and recorded in
# `failed_writes`, and the loop continues with the next file.
failed_writes = []  # (h5_path, error message) for every file that could not be updated

for h5_path, raw_label in zip(matched["h5_path"], matched[mutation]):
    label = int(raw_label)  # labels are stored as floats (0.0 / 1.0); write a plain int
    h5_name = os.path.basename(h5_path)

    try:
        with h5py.File(h5_path, "a") as f:  # 'a' = read/write, keeps existing contents
            if mutation in f:
                print(f"Overwriting existing {mutation} in {h5_name}")
                del f[mutation]
            else:
                print(f"Writing {mutation} to {h5_name}")

            f.create_dataset(mutation, data=label)

    except Exception as e:
        print(f"Error writing to {h5_path}: {e}")
        failed_writes.append((h5_path, str(e)))

# Surface failures at the end so they are not lost among the per-file lines.
if failed_writes:
    print(f"\n{len(failed_writes)} of {len(matched)} files could not be updated:")
    for failed_path, reason in failed_writes:
        print(f"  {failed_path}: {reason}")

Writing KRAS_binary to TCGA-3L-AA1B.h5
Writing KRAS_binary to TCGA-4N-A93T.h5
Writing KRAS_binary to TCGA-4T-AA8H.h5
Writing KRAS_binary to TCGA-5M-AAT4.h5
Writing KRAS_binary to TCGA-5M-AAT5.h5
Writing KRAS_binary to TCGA-5M-AAT6.h5
Writing KRAS_binary to TCGA-5M-AATE.h5
Writing KRAS_binary to TCGA-A6-2671.h5
Writing KRAS_binary to TCGA-A6-2672.h5
Writing KRAS_binary to TCGA-A6-2674.h5
Writing KRAS_binary to TCGA-A6-2675.h5
Writing KRAS_binary to TCGA-A6-2676.h5
Writing KRAS_binary to TCGA-A6-2677.h5
Writing KRAS_binary to TCGA-A6-2678.h5
Writing KRAS_binary to TCGA-A6-2679.h5
Writing KRAS_binary to TCGA-A6-2680.h5
Writing KRAS_binary to TCGA-A6-2681.h5
Writing KRAS_binary to TCGA-A6-2682.h5
Writing KRAS_binary to TCGA-A6-2683.h5
Writing KRAS_binary to TCGA-A6-2684.h5
Writing KRAS_binary to TCGA-A6-2685.h5
Writing KRAS_binary to TCGA-A6-2686.h5
Writing KRAS_binary to TCGA-A6-3807.h5
Writing KRAS_binary to TCGA-A6-3808.h5
Writing KRAS_binary to TCGA-A6-3809.h5
Writing KRAS_binary to TC

In [10]:
# List the columns of the matched table, then render the table itself.
matched_columns = matched.columns
print(matched_columns)
display(matched)

Index(['h5_path', 'Sample_ID', 'index', 'KRAS', 'KRAS_binary'], dtype='object')


Unnamed: 0,h5_path,Sample_ID,index,KRAS,KRAS_binary
0,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-3L-AA1B,0,WT,0.0
1,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-4N-A93T,1,G12D,1.0
2,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-4T-AA8H,2,G12V,1.0
3,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-5M-AAT4,3,G12D,1.0
4,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-5M-AAT5,4,G12D,1.0
...,...,...,...,...,...
488,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-QG-A5Z2,634,WT,0.0
489,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-QL-A97D,635,G12D,1.0
490,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-SS-A7HO,637,G12A,1.0
491,/orcd/data/edboyden/002/ezh/uni/virchow_featur...,TCGA-T9-A92H,638,WT,0.0


## Do some sanity checking

Print how many H5 files exist vs. how many have the label

In [11]:
import h5py
import os

# Re-open every .h5 file in the feature directory and tally how many carry the
# label dataset, what values it takes, and which files lack it.
dir_path = "/orcd/data/edboyden/002/ezh/uni/virchow_features"
label_name = "KRAS_binary"

total_files = 0
files_with_label = 0
label_counts = {}          # label value -> number of files storing it
missing_label_files = []   # readable files without the label dataset
unreadable_files = []      # files that raised while being opened or read

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# "first 5" lists below differ from run to run.
for fname in sorted(os.listdir(dir_path)):
    if not fname.endswith(".h5"):
        continue
    total_files += 1
    fpath = os.path.join(dir_path, fname)
    try:
        with h5py.File(fpath, "r") as f:
            if label_name in f:
                label = int(f[label_name][()])  # [()] reads the scalar dataset
                files_with_label += 1
                label_counts[label] = label_counts.get(label, 0) + 1
            else:
                missing_label_files.append(fname)
    except Exception as e:
        # Keep scanning, but remember the file so the totals can be reconciled.
        print(f"Error reading {fname}: {e}")
        unreadable_files.append(fname)

print("Total .h5 files:", total_files)
print(f"Files with label '{label_name}':", files_with_label)
print("Label distribution:", label_counts)
print("Files missing the label (first 5):", missing_label_files[:5])
if unreadable_files:
    print("Unreadable files (first 5):", unreadable_files[:5])


Total .h5 files: 493
Files with label 'KRAS_binary': 490
Label distribution: {0: 295, 1: 195}
Files missing the label (first 5): ['TCGA-AA-3970.h5', 'TCGA-A6-6140.h5', 'TCGA-A6-6141.h5']
