In [2]:
import h5py
import os
import timm
import torch
from PIL import Image
from tqdm import tqdm
from torchvision import transforms
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from timm.layers import SwiGLUPacked
from torch.utils.data import DataLoader, Dataset

In [3]:
model = timm.create_model(
    "hf-hub:paige-ai/Virchow2",
    pretrained=True,
    mlp_layer=SwiGLUPacked,
    act_layer=torch.nn.SiLU
)
model.eval().cuda()

config = resolve_data_config(model.pretrained_cfg, model=model)
transform = create_transform(**config)

In [4]:
class TileFolderDataset(Dataset):
    def __init__(self, folder):
        self.paths = sorted([
            os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".png")
        ])
        self.transform = transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        img = Image.open(self.paths[idx]).convert("RGB")
        return self.transform(img), self.paths[idx]

In [5]:
# ───────────── Embedding + Saving ─────────────
def extract_and_save(tile_folder, h5_output_path, batch_size=96):
    dataset = TileFolderDataset(tile_folder)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=20, pin_memory=True, prefetch_factor=2, persistent_workers=True)

    all_embeddings = []
    all_coords = []

    for batch_imgs, batch_paths in tqdm(dataloader, desc=os.path.basename(tile_folder)):
        batch_imgs = batch_imgs.cuda()
        with torch.no_grad():
            out = model(batch_imgs)  # shape: (B, 261, 1280)

        cls = out[:, 0]
        patch_tokens = out[:, 5:]  # skip register tokens
        mean = patch_tokens.mean(dim=1)
        embedding = torch.cat([cls, mean], dim=-1)  # (B, 2560)
        all_embeddings.append(embedding.cpu())

        # Extract x, y from filename (e.g., TCGA-XX_L1_1232_2048.png)
        for path in batch_paths:
            base = os.path.splitext(os.path.basename(path))[0]
            try:
                x, y = map(int, base.split("_")[-2:])
            except:
                x, y = 0, 0
            all_coords.append((x, y))

    all_embeddings = torch.cat(all_embeddings, dim=0)     # (N, 2560)
    all_coords = torch.tensor(all_coords)                 # (N, 2)

    with h5py.File(h5_output_path, "w") as f:
        f.create_dataset("features", data=all_embeddings.numpy())
        f.create_dataset("coords", data=all_coords.numpy())

    print(f"✅ Saved {all_embeddings.shape[0]} embeddings to {h5_output_path}")

In [6]:
%pwd

'/orcd/data/edboyden/002/ezh/uni'

In [None]:
tile_root_dir = "slow_virchow_tiles"               # root directory containing subfolders for each WSI
output_dir = "virchow_features"                   # where to save .h5 files
os.makedirs(output_dir, exist_ok=True)

for slide_folder in sorted(os.listdir(tile_root_dir)):
    slide_path = os.path.join(tile_root_dir, slide_folder)
    if not os.path.isdir(slide_path):
        continue  # skip files

    h5_output_path = os.path.join(output_dir, f"{slide_folder}.h5")

    if os.path.exists(h5_output_path):
        print(f"✅ Skipping {slide_folder}, already exists.")
        continue

    try:
        extract_and_save(slide_path, h5_output_path, batch_size=96)
    except Exception as e:
        print(f"❌ Failed to process {slide_folder}: {e}")

.ipynb_checkpoints: 0it [00:00, ?it/s]


❌ Failed to process .ipynb_checkpoints: torch.cat(): expected a non-empty list of Tensors
✅ Skipping TCGA-5M-AAT4, already exists.
✅ Skipping TCGA-5M-AAT5, already exists.
✅ Skipping TCGA-5M-AAT6, already exists.
✅ Skipping TCGA-5M-AATE, already exists.
✅ Skipping TCGA-A6-2671, already exists.
✅ Skipping TCGA-A6-2672, already exists.
✅ Skipping TCGA-A6-2674, already exists.
✅ Skipping TCGA-A6-2675, already exists.
✅ Skipping TCGA-A6-2676, already exists.
✅ Skipping TCGA-A6-2677, already exists.
✅ Skipping TCGA-A6-2678, already exists.
✅ Skipping TCGA-A6-2679, already exists.
✅ Skipping TCGA-A6-2680, already exists.
✅ Skipping TCGA-A6-2681, already exists.
✅ Skipping TCGA-A6-2682, already exists.
✅ Skipping TCGA-A6-2683, already exists.
✅ Skipping TCGA-A6-2684, already exists.
✅ Skipping TCGA-A6-2685, already exists.
✅ Skipping TCGA-A6-2686, already exists.
✅ Skipping TCGA-A6-3807, already exists.
✅ Skipping TCGA-A6-3808, already exists.
✅ Skipping TCGA-A6-3809, already exists.
✅ Skippi

TCGA-G4-6317: 100%|███████████████████████████| 121/121 [08:38<00:00,  4.28s/it]


✅ Saved 11570 embeddings to virchow_features/TCGA-G4-6317.h5


TCGA-G4-6320: 100%|█████████████████████████████| 98/98 [06:59<00:00,  4.28s/it]


✅ Saved 9398 embeddings to virchow_features/TCGA-G4-6320.h5


TCGA-G4-6321: 100%|███████████████████████████| 125/125 [08:53<00:00,  4.27s/it]


✅ Saved 11970 embeddings to virchow_features/TCGA-G4-6321.h5


TCGA-G4-6322: 100%|███████████████████████████| 112/112 [07:58<00:00,  4.27s/it]


✅ Saved 10735 embeddings to virchow_features/TCGA-G4-6322.h5


TCGA-G4-6323: 100%|█████████████████████████████| 80/80 [05:39<00:00,  4.24s/it]


✅ Saved 7616 embeddings to virchow_features/TCGA-G4-6323.h5


TCGA-G4-6586: 100%|███████████████████████████| 100/100 [07:08<00:00,  4.29s/it]


✅ Saved 9600 embeddings to virchow_features/TCGA-G4-6586.h5


TCGA-G4-6588: 100%|███████████████████████████| 119/119 [08:25<00:00,  4.24s/it]


✅ Saved 11340 embeddings to virchow_features/TCGA-G4-6588.h5


TCGA-G4-6625: 100%|█████████████████████████████| 48/48 [03:27<00:00,  4.31s/it]


✅ Saved 4606 embeddings to virchow_features/TCGA-G4-6625.h5


TCGA-G4-6626: 100%|█████████████████████████████| 84/84 [05:58<00:00,  4.26s/it]


✅ Saved 8030 embeddings to virchow_features/TCGA-G4-6626.h5


TCGA-G4-6627: 100%|███████████████████████████| 135/135 [09:34<00:00,  4.26s/it]


✅ Saved 12920 embeddings to virchow_features/TCGA-G4-6627.h5


TCGA-G4-6628: 100%|███████████████████████████| 124/124 [08:50<00:00,  4.28s/it]


✅ Saved 11904 embeddings to virchow_features/TCGA-G4-6628.h5


TCGA-G5-6233: 100%|███████████████████████████| 105/105 [07:30<00:00,  4.29s/it]


✅ Saved 10080 embeddings to virchow_features/TCGA-G5-6233.h5


TCGA-G5-6235: 100%|███████████████████████████| 107/107 [07:36<00:00,  4.27s/it]


✅ Saved 10260 embeddings to virchow_features/TCGA-G5-6235.h5


TCGA-G5-6572: 100%|███████████████████████████| 118/118 [08:24<00:00,  4.27s/it]


✅ Saved 11328 embeddings to virchow_features/TCGA-G5-6572.h5


TCGA-G5-6641: 100%|█████████████████████████████| 75/75 [05:21<00:00,  4.29s/it]


✅ Saved 7200 embeddings to virchow_features/TCGA-G5-6641.h5


TCGA-NH-A50T: 100%|███████████████████████████| 124/124 [08:48<00:00,  4.26s/it]


✅ Saved 11900 embeddings to virchow_features/TCGA-NH-A50T.h5


TCGA-NH-A50U:  32%|█████████▍                   | 24/74 [01:44<03:32,  4.24s/it]