## Look at the data
- in the TCGA-COAD folder there are 442 h5 files that each correspond to a WSI. Each h5 folder has thousands of numpy array features for each patch of the WSI.

In [2]:
import os

folder_path = "UNI2-h_features/TCGA-COAD"

# Count number of .h5 files
num_h5_files = len([f for f in os.listdir(folder_path) if f.endswith(".h5")])
print(f"Number of .h5 files in {folder_path}: {num_h5_files}")

# Preview first 5 filenames
print("First 5 files:", sorted(os.listdir(folder_path))[:5])


Number of .h5 files in UNI2-h_features/TCGA-COAD: 442
First 5 files: ['.ipynb_checkpoints', 'TCGA-3L-AA1B-01Z-00-DX1.8923A151-A690-40B7-9E5A-FCBEDFC2394F.h5', 'TCGA-3L-AA1B-01Z-00-DX2.17CE3683-F4B1-4978-A281-8F620C4D77B4.h5', 'TCGA-4N-A93T-01Z-00-DX1.82E240B1-22C3-46E3-891F-0DCE35C43F8B.h5', 'TCGA-4N-A93T-01Z-00-DX2.875E7F95-A6D4-4BEB-A331-F9D8080898C2.h5']


In [11]:
import h5py
import numpy as np

h5_path = "UNI2-h_features/TCGA-COAD/TCGA-3L-AA1B-01Z-00-DX1.8923A151-A690-40B7-9E5A-FCBEDFC2394F.h5"

with h5py.File(h5_path, "r") as f:
    print("Keys:", list(f.keys()))
    print("features shape:", f['features'].shape)
    print("coords shape:", f['coords'].shape)

    features = f['features'][:].squeeze(0)  # shape (N, 1536)
    coords = f['coords'][:].squeeze(0)      # shape (N, 2)
    
    print("First feature vector:", features[0])
    print("Second feature vector:", features[1])
    print("First patch coordinates:", coords[0])

    print("number of features: ", len(features))
    print(type(features[0]))
    print(np.shape(features[0]))


Keys: ['annots', 'coords', 'coords_patching', 'features']
features shape: (1, 14853, 1536)
coords shape: (1, 14853, 2)
First feature vector: [-0.24655764  0.21954072 -0.11068739 ...  0.09347688  0.39546552
  0.32493556]
Second feature vector: [-0.20793374  0.61069363  0.02949323 ...  0.12355845  0.43776366
  0.24911475]
First patch coordinates: [ 3072 51200]
number of features:  14853
<class 'numpy.ndarray'>
(1536,)


## Aggregate the features per WSI