In [1]:
import h5py
import os
import timm
import torch
from PIL import Image
from tqdm import tqdm
from torchvision import transforms
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from timm.layers import SwiGLUPacked
from torch.utils.data import DataLoader, Dataset

In [2]:
# for multi-gpu
print(os.environ["CUDA_VISIBLE_DEVICES"])

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

0,1


In [3]:
model = timm.create_model(
    "hf-hub:paige-ai/Virchow2",
    pretrained=True,
    mlp_layer=SwiGLUPacked,
    act_layer=torch.nn.SiLU
)
model.eval().cuda()

config = resolve_data_config(model.pretrained_cfg, model=model)
transform = create_transform(**config)

In [4]:
class TileFolderDataset(Dataset):
    def __init__(self, folder):
        self.paths = sorted([
            os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".png")
        ])
        self.transform = transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        img = Image.open(self.paths[idx]).convert("RGB")
        return self.transform(img), self.paths[idx]

In [5]:
# ───────────── Embedding + Saving ─────────────
def extract_and_save(tile_folder, h5_output_path, batch_size=96):
    dataset = TileFolderDataset(tile_folder)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=20, pin_memory=True, prefetch_factor=2, persistent_workers=True)

    all_embeddings = []
    all_coords = []

    for batch_imgs, batch_paths in tqdm(dataloader, desc=os.path.basename(tile_folder)):
        batch_imgs = batch_imgs.cuda()
        with torch.no_grad():
            out = model(batch_imgs)  # shape: (B, 261, 1280)

        cls = out[:, 0]
        patch_tokens = out[:, 5:]  # skip register tokens
        mean = patch_tokens.mean(dim=1)
        embedding = torch.cat([cls, mean], dim=-1)  # (B, 2560)
        all_embeddings.append(embedding.cpu())

        # Extract x, y from filename (e.g., TCGA-XX_L1_1232_2048.png)
        for path in batch_paths:
            base = os.path.splitext(os.path.basename(path))[0]
            try:
                x, y = map(int, base.split("_")[-2:])
            except:
                x, y = 0, 0
            all_coords.append((x, y))

    all_embeddings = torch.cat(all_embeddings, dim=0)     # (N, 2560)
    all_coords = torch.tensor(all_coords)                 # (N, 2)

    with h5py.File(h5_output_path, "w") as f:
        f.create_dataset("features", data=all_embeddings.numpy())
        f.create_dataset("coords", data=all_coords.numpy())

    print(f"✅ Saved {all_embeddings.shape[0]} embeddings to {h5_output_path}")

In [6]:
%pwd

'/orcd/data/edboyden/002/ezh/uni'

In [7]:
print("Using GPU:", torch.cuda.current_device(), torch.cuda.get_device_name())

Using GPU: 0 NVIDIA A100 80GB PCIe


In [None]:
tile_root_dir = "virchow_tiles_gpu1"               # root directory containing subfolders for each WSI
output_dir = "virchow_features"                   # where to save .h5 files
os.makedirs(output_dir, exist_ok=True)

for slide_folder in sorted(os.listdir(tile_root_dir)):
    slide_path = os.path.join(tile_root_dir, slide_folder)
    if not os.path.isdir(slide_path):
        continue  # skip files

    h5_output_path = os.path.join(output_dir, f"{slide_folder}.h5")

    if os.path.exists(h5_output_path):
        print(f"✅ Skipping {slide_folder}, already exists.")
        continue

    try:
        extract_and_save(slide_path, h5_output_path, batch_size=196)
    except Exception as e:
        print(f"❌ Failed to process {slide_folder}: {e}")

.ipynb_checkpoints: 0it [00:00, ?it/s]


❌ Failed to process .ipynb_checkpoints: torch.cat(): expected a non-empty list of Tensors
✅ Skipping TCGA-AA-3815, already exists.
✅ Skipping TCGA-AA-3818, already exists.
✅ Skipping TCGA-AA-3819, already exists.
✅ Skipping TCGA-AA-3821, already exists.
✅ Skipping TCGA-AA-3831, already exists.
✅ Skipping TCGA-AA-3833, already exists.
✅ Skipping TCGA-AA-3837, already exists.
✅ Skipping TCGA-AA-3841, already exists.
✅ Skipping TCGA-AA-3842, already exists.
✅ Skipping TCGA-AA-3844, already exists.
✅ Skipping TCGA-AA-3845, already exists.
✅ Skipping TCGA-AA-3846, already exists.
✅ Skipping TCGA-AA-3848, already exists.
✅ Skipping TCGA-AA-3850, already exists.
✅ Skipping TCGA-AA-3851, already exists.
✅ Skipping TCGA-AA-3852, already exists.
✅ Skipping TCGA-AA-3854, already exists.
✅ Skipping TCGA-AA-3855, already exists.
✅ Skipping TCGA-AA-3856, already exists.
✅ Skipping TCGA-AA-3858, already exists.
✅ Skipping TCGA-AA-3860, already exists.
✅ Skipping TCGA-AA-3861, already exists.
✅ Skippi

TCGA-AY-6196: 100%|█████████████████████████████████████████████████████| 42/42 [02:49<00:00,  4.03s/it]


✅ Saved 8118 embeddings to virchow_features/TCGA-AY-6196.h5


TCGA-AY-6197: 100%|█████████████████████████████████████████████████████| 57/57 [03:51<00:00,  4.06s/it]


✅ Saved 11039 embeddings to virchow_features/TCGA-AY-6197.h5


TCGA-AY-6386: 100%|█████████████████████████████████████████████████████| 32/32 [02:08<00:00,  4.01s/it]


✅ Saved 6090 embeddings to virchow_features/TCGA-AY-6386.h5


TCGA-AY-A54L: 100%|█████████████████████████████████████████████████████| 46/46 [03:06<00:00,  4.05s/it]


✅ Saved 8909 embeddings to virchow_features/TCGA-AY-A54L.h5


TCGA-AY-A69D: 100%|█████████████████████████████████████████████████████| 33/33 [02:13<00:00,  4.05s/it]


✅ Saved 6300 embeddings to virchow_features/TCGA-AY-A69D.h5


TCGA-AY-A71X: 100%|███████████████████████████████████████████████████| 102/102 [06:50<00:00,  4.03s/it]


✅ Saved 19800 embeddings to virchow_features/TCGA-AY-A71X.h5


TCGA-AY-A8YK: 100%|█████████████████████████████████████████████████████| 32/32 [02:07<00:00,  3.98s/it]


✅ Saved 6104 embeddings to virchow_features/TCGA-AY-A8YK.h5


TCGA-AZ-4308: 100%|█████████████████████████████████████████████████████| 42/42 [02:48<00:00,  4.02s/it]


✅ Saved 8064 embeddings to virchow_features/TCGA-AZ-4308.h5


TCGA-AZ-4313: 100%|█████████████████████████████████████████████████████| 47/47 [03:08<00:00,  4.00s/it]


✅ Saved 9030 embeddings to virchow_features/TCGA-AZ-4313.h5


TCGA-AZ-4315: 100%|█████████████████████████████████████████████████████| 54/54 [03:39<00:00,  4.06s/it]


✅ Saved 10500 embeddings to virchow_features/TCGA-AZ-4315.h5


TCGA-AZ-4614:  25%|█████████████▌                                       | 13/51 [00:55<02:33,  4.04s/it]