In [1]:
import os
os.getcwd()

'd:\\Aamir Gulzar\\KSA_project2\\Cancer-detection-classifier\\feature_extraction'

## Feature extraction using Resnet-18

In [None]:
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import torch
import numpy as np
from typing import Union, List
import os

class ResNetFeatureExtractor:
    def __init__(self, model_name: str = "microsoft/resnet-18"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.processor = AutoImageProcessor.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.model.eval()

    def extract_features(self, 
                         images: Union[str, Image.Image, List[str], List[Image.Image]], 
                         batch_size: int = 32) -> np.ndarray:
        if isinstance(images, (str, Image.Image)):
            images = [images]

        # Load and preprocess images
        all_features = []
        image_batch = []
        
        for img in images:
            if isinstance(img, str):
                pil_img = Image.open(img).convert("RGB")
            elif isinstance(img, Image.Image):
                pil_img = img.convert("RGB")
            else:
                raise ValueError(f"Unsupported image type: {type(img)}")

            image_batch.append(pil_img)

        # Process in batches
        for i in range(0, len(image_batch), batch_size):
            batch_imgs = image_batch[i:i + batch_size]
            inputs = self.processor(batch_imgs, return_tensors="pt").to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)
                feats = outputs.last_hidden_state  # (B, C, 1, 1) → (B, C)
                # # Use the pooled output (e.g., CLS token)
                # if hasattr(outputs, 'pooler_output'):  # For models like ViT
                #     feats = outputs.pooler_output
                # else:  # For resnet18, use last_hidden_state average
                #     feats = outputs.last_hidden_state.squeeze(-1).squeeze(-1)  # (B, C, 1, 1) → (B, C)

            all_features.append(feats.cpu().numpy())

        return np.concatenate(all_features, axis=0)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


ConvNextImageProcessor {
  "crop_pct": 0.875,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "ConvNextImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}



In [None]:
import os
import glob
import pandas as pd
import torch
import numpy as np
from h5py import File as H5File
from transformers import AutoModel
from tqdm import tqdm

def run_full_pipeline(
    patch_data_dir: str,
    extractor,
    slide_meta_csv: str,
    non_white_csv: str,
    output_root: str,
    batch_size: int = 64
):
    # Load slide metadata and non-white patch names
    slide_meta_df = pd.read_csv(slide_meta_csv)
    non_white_df = pd.read_csv(non_white_csv)
    valid_patch_names = set(non_white_df['patch_name'].astype(str))

    # Iterate through all slide folders in patch_data_dir
    slide_dirs = glob.glob(os.path.join(patch_data_dir, "*"))

    for slide_dir in slide_dirs:
        slide_name = os.path.basename(slide_dir)
        print(f"\nProcessing slide: {slide_name}")

        # Match objective power
        meta_row = slide_meta_df[slide_meta_df["renamed"].str.lower() == slide_name.lower()]
        if meta_row.empty:
            print(f"  ❌ Skipped: Slide metadata not found for {slide_name}")
            continue

        try:
            obj_power = int(meta_row.iloc[0]["Objective Power"])
            patch_size_level0 = 1024 if obj_power == 40 else 512 if obj_power == 20 else None
            if patch_size_level0 is None:
                print(f"  ❌ Unsupported objective power: {obj_power}")
                continue
        except Exception as e:
            print(f"  ❌ Error extracting objective power: {e}")
            continue

        # Filter valid patches for this slide
        all_patches = glob.glob(os.path.join(slide_dir, "*.png"))
        patch_paths = [p for p in all_patches if os.path.basename(p) in valid_patch_names]

        if not patch_paths:
            print(f"  ❌ No valid non-white patches found for {slide_name}")
            continue

        # Sort and extract features
        patch_paths = sort_by_coords(patch_paths)
        output_dir = os.path.join(output_root, slide_name)
        os.makedirs(output_dir, exist_ok=True)

        features_list = []
        for i in range(0, len(patch_paths), batch_size):
            batch_paths = patch_paths[i:i + batch_size]
            try:
                feats = extractor.extract_features(batch_paths, batch_size=batch_size)  # RESNET
                features_list.append(feats)
            except Exception as e:
                print(f"  ⚠️ Skipping batch due to error: {e}")
                continue

        if not features_list:
            print(f"  ❌ No features extracted for {slide_name}")
            continue

        features_np = np.concatenate(features_list, axis=0)
        h5_path = create_h5_from_features(
            features_np,
            patch_paths,
            output_dir=os.path.join(output_dir, "patch_features"),
            slide_name=slide_name,
            patch_size_level0=patch_size_level0
        )

        # Save a .pt version of the h5 contents
        with H5File(h5_path, 'r') as file:
            patch_features = torch.from_numpy(file['features'][:])
            patch_coords = torch.from_numpy(file['coords'][:])
            patch_size_lv0 = file['coords'].attrs['patch_size_level0']
        
        pt_data = {
            'features': patch_features,
            'coords': patch_coords,
            'patch_size_level0': patch_size_lv0
        }
        
        pt_path = h5_path.replace('.h5', '.pt')
        torch.save(pt_data, pt_path)
        
        print(f"  💾 Saved patch data to .pt at {pt_path}")
        
        # # Extract slide embedding
        # with H5File(h5_path, 'r') as file:
        #     feats = torch.from_numpy(file['features'][:])
        #     coords = torch.from_numpy(file['coords'][:])
        #     patch_size_lv0 = file['coords'].attrs['patch_size_level0']

        # with torch.autocast('cuda', torch.float16), torch.inference_mode():
        #     slide_emb = titan.encode_slide_from_patch_features(feats, coords, patch_size_lv0)

        # slide_emb_path = os.path.join(output_dir, "slide_features", f"{slide_name}_slide_embedding.pt")
        # os.makedirs(os.path.dirname(slide_emb_path), exist_ok=True)
        # torch.save(slide_emb, slide_emb_path)

        # print(f"  ✅ Done: Saved H5 and slide embedding for {slide_name}")

In [None]:
extractor = ResNetFeatureExtractor()

run_full_pipeline(
    patch_data_dir=r"D:\Aamir Gulzar\KSA_project2\dataset\Test_data",
    extractor=extractor,
    slide_meta_csv=r"D:\Aamir Gulzar\KSA_project2\Cancer-detection-classifier\feature_extraction\slide_metadata_filtered.csv",
    non_white_csv=r"D:\Aamir Gulzar\KSA_project2\Cancer-detection-classifier\feature_extraction\final_merged_without_white.csv",
    output_root=r"D:\Aamir Gulzar\KSA_project2\dataset\Features\Conch_v1_5_Missing",
    batch_size=64
)

In [9]:
import os
import glob
import pandas as pd
import torch
import numpy as np
from h5py import File as H5File
from tqdm import tqdm

def run_full_pipeline_test(
    patch_data_dir: str,
    extractor,
    slide_meta_csv: str,
    non_white_csv: str,
    output_root: str,
    batch_size: int = 64
):
    # Load slide metadata and non-white patch names
    slide_meta_df = pd.read_csv(slide_meta_csv)
    non_white_df = pd.read_csv(non_white_csv)
    valid_patch_names = set(non_white_df['patch_name'].astype(str))

    # Iterate through all slide folders in patch_data_dir
    slide_dirs = glob.glob(os.path.join(patch_data_dir, "*"))

    for slide_dir in slide_dirs:
        slide_name = os.path.basename(slide_dir)
        print(f"\nProcessing slide: {slide_name}")

        # Match objective power
        meta_row = slide_meta_df[slide_meta_df["renamed"].str.lower() == slide_name.lower()]
        if meta_row.empty:
            print(f"  ❌ Skipped: Slide metadata not found for {slide_name}")
            continue

        try:
            obj_power = int(meta_row.iloc[0]["Objective Power"])
            patch_size_level0 = 1024 if obj_power == 40 else 512 if obj_power == 20 else None
            if patch_size_level0 is None:
                print(f"  ❌ Unsupported objective power: {obj_power}")
                continue
        except Exception as e:
            print(f"  ❌ Error extracting objective power: {e}")
            continue

        # Filter valid patches for this slide
        all_patches = glob.glob(os.path.join(slide_dir, "*.png"))
        patch_paths = [p for p in all_patches if os.path.basename(p) in valid_patch_names]

        if not patch_paths:
            print(f"  ❌ No valid non-white patches found for {slide_name}")
            continue

        # Sort and extract features
        patch_paths = sort_by_coords(patch_paths)
        output_dir = os.path.join(output_root, slide_name)
        os.makedirs(output_dir, exist_ok=True)

        features_list = []
        print("  🔍 Extracting features:")
        for i in tqdm(range(0, len(patch_paths), batch_size)):
            batch_paths = patch_paths[i:i + batch_size]
            try:
                feats = extractor.extract_features(batch_paths, batch_size=batch_size)  # RESNET
                features_list.append(feats)
                print(f"    ✅ Batch {i//batch_size}: {feats.shape}")
            except Exception as e:
                print(f"    ⚠️ Skipping batch {i//batch_size} due to error: {e}")
                continue

        if not features_list:
            print(f"  ❌ No features extracted for {slide_name}")
            continue

        features_np = np.concatenate(features_list, axis=0)
        print(f"  📐 Feature shape for {slide_name}: {features_np.shape}")

        h5_path = create_h5_from_features(
            features_np,
            patch_paths,
            output_dir=os.path.join(output_dir, "patch_features"),
            slide_name=slide_name,
            patch_size_level0=patch_size_level0
        )
        print(f"  💾 Saved H5 file at: {h5_path}")

        with H5File(h5_path, 'r') as file:
            patch_features = torch.from_numpy(file['features'][:])
            patch_coords = torch.from_numpy(file['coords'][:])
            patch_size_lv0 = file['coords'].attrs['patch_size_level0']

        print(f"  🔍 Loaded back features shape: {patch_features.shape}")
        print(f"  📍 Coords shape: {patch_coords.shape}, Patch size: {patch_size_lv0}")

        pt_data = {
            'features': patch_features,
            'coords': patch_coords,
            'patch_size_level0': patch_size_lv0
        }

        pt_path = h5_path.replace('.h5', '.pt')
        torch.save(pt_data, pt_path)
        print(f"  💾 Saved .pt at: {pt_path}")

In [10]:
extractor = ResNetFeatureExtractor()

# For Laptop testing
run_full_pipeline_test(
    patch_data_dir=r"D:\Titan_Project\Test_data_smallest",
    extractor=extractor,
    slide_meta_csv=r"D:\Titan_Project\slide_metadata_filtered.csv",
    non_white_csv=r"D:\Titan_Project\final_merged_without_white.csv",
    output_root=r"D:\DataInsight\CRC100k\output_dir",
    batch_size=64
)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.



Processing slide: TCGA-3L-AA1B_nonMSIH
  🔍 Extracting features:


100%|██████████| 1/1 [00:00<00:00,  3.70it/s]


    ✅ Batch 0: (2, 512, 7, 7)
  📐 Feature shape for TCGA-3L-AA1B_nonMSIH: (2, 512, 7, 7)
H5 file created successfully: D:\DataInsight\CRC100k\output_dir\TCGA-3L-AA1B_nonMSIH\patch_features\TCGA-3L-AA1B_nonMSIH_patch_features.h5
Features shape: (2, 512, 7, 7)
Coords shape: (1, 2, 2)
  💾 Saved H5 file at: D:\DataInsight\CRC100k\output_dir\TCGA-3L-AA1B_nonMSIH\patch_features\TCGA-3L-AA1B_nonMSIH_patch_features.h5
  🔍 Loaded back features shape: torch.Size([2, 512, 7, 7])
  📍 Coords shape: torch.Size([1, 2, 2]), Patch size: 1024
  💾 Saved .pt at: D:\DataInsight\CRC100k\output_dir\TCGA-3L-AA1B_nonMSIH\patch_features\TCGA-3L-AA1B_nonMSIH_patch_features.pt

Processing slide: TCGA-A6-5661_MSIH
  🔍 Extracting features:


100%|██████████| 1/1 [00:00<00:00,  3.38it/s]


    ✅ Batch 0: (2, 512, 7, 7)
  📐 Feature shape for TCGA-A6-5661_MSIH: (2, 512, 7, 7)
H5 file created successfully: D:\DataInsight\CRC100k\output_dir\TCGA-A6-5661_MSIH\patch_features\TCGA-A6-5661_MSIH_patch_features.h5
Features shape: (2, 512, 7, 7)
Coords shape: (1, 2, 2)
  💾 Saved H5 file at: D:\DataInsight\CRC100k\output_dir\TCGA-A6-5661_MSIH\patch_features\TCGA-A6-5661_MSIH_patch_features.h5
  🔍 Loaded back features shape: torch.Size([2, 512, 7, 7])
  📍 Coords shape: torch.Size([1, 2, 2]), Patch size: 1024
  💾 Saved .pt at: D:\DataInsight\CRC100k\output_dir\TCGA-A6-5661_MSIH\patch_features\TCGA-A6-5661_MSIH_patch_features.pt

Processing slide: TCGA-AA-3846_nonMSIH
  🔍 Extracting features:


100%|██████████| 1/1 [00:00<00:00,  3.44it/s]

    ✅ Batch 0: (1, 512, 7, 7)
  📐 Feature shape for TCGA-AA-3846_nonMSIH: (1, 512, 7, 7)
H5 file created successfully: D:\DataInsight\CRC100k\output_dir\TCGA-AA-3846_nonMSIH\patch_features\TCGA-AA-3846_nonMSIH_patch_features.h5
Features shape: (1, 512, 7, 7)
Coords shape: (1, 1, 2)
  💾 Saved H5 file at: D:\DataInsight\CRC100k\output_dir\TCGA-AA-3846_nonMSIH\patch_features\TCGA-AA-3846_nonMSIH_patch_features.h5
  🔍 Loaded back features shape: torch.Size([1, 512, 7, 7])
  📍 Coords shape: torch.Size([1, 1, 2]), Patch size: 512
  💾 Saved .pt at: D:\DataInsight\CRC100k\output_dir\TCGA-AA-3846_nonMSIH\patch_features\TCGA-AA-3846_nonMSIH_patch_features.pt





In [7]:
def inspect_h5_file(h5_path):
    """Inspect the contents of an H5 file"""
    print(f"Inspecting H5 file: {h5_path}")
    print("=" * 50)
    
    with h5py.File(h5_path, 'r') as file:
        def print_structure(name, obj):
            if isinstance(obj, h5py.Dataset):
                print(f"Dataset: {name}")
                print(f"  Shape: {obj.shape}")
                print(f"  Dtype: {obj.dtype}")
                if hasattr(obj, 'attrs') and len(obj.attrs) > 0:
                    print(f"  Attributes: {dict(obj.attrs)}")
                print()
            elif isinstance(obj, h5py.Group):
                print(f"Group: {name}")
                if hasattr(obj, 'attrs') and len(obj.attrs) > 0:
                    print(f"  Attributes: {dict(obj.attrs)}")
                print()
        
        file.visititems(print_structure)

# First, inspect your H5 file
# demo_h5_path = "TCGA-PC-A5DK-01Z-00-DX1.C2D3BC09-411F-46CF-811B-FDBA7C2A295B.h5"
# h5_path = r"D:\Titan_Project\Features_Tests\3732_ResNet\features\h5_files\TCGA-AG-3732-01Z-00-DX1.5EC57511-2B19-4005-BCA0-333C387C66E6.h5"
h5_path = r"D:\DataInsight\CRC100k\output_dir\TCGA-3L-AA1B_nonMSIH\patch_features\TCGA-3L-AA1B_nonMSIH_patch_features.h5"
inspect_h5_file(h5_path)
# inspect_h5_file(demo_h5_path)

Inspecting H5 file: D:\DataInsight\CRC100k\output_dir\TCGA-3L-AA1B_nonMSIH\patch_features\TCGA-3L-AA1B_nonMSIH_patch_features.h5
Dataset: coords
  Shape: (1, 2, 2)
  Dtype: int64
  Attributes: {'contour_fn': 'four_pt', 'custom_downsample': 2.0, 'downsample': array([1., 1.]), 'name': 'TCGA-3L-AA1B_nonMSIH', 'patch_level': 0, 'patch_size': 1024, 'patch_size_level0': 1024, 'step_size': 1024, 'use_padding': True}

Dataset: features
  Shape: (2, 512, 7, 7)
  Dtype: float32



In [None]:
import os
import glob
import pandas as pd
import torch
import numpy as np
from h5py import File as H5File
from tqdm import tqdm
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import torch
import numpy as np
from typing import Union, List
import re

class ResNetFeatureExtractor:
    def __init__(self, model_name: str = "microsoft/resnet-18"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.processor = AutoImageProcessor.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.model.eval()

    def extract_features(self, 
                         images: Union[str, Image.Image, List[str], List[Image.Image]], 
                         batch_size: int = 32) -> np.ndarray:
        if isinstance(images, (str, Image.Image)):
            images = [images]

        # Load and preprocess images
        all_features = []
        image_batch = []
        
        for img in images:
            if isinstance(img, str):
                pil_img = Image.open(img).convert("RGB")
            elif isinstance(img, Image.Image):
                pil_img = img.convert("RGB")
            else:
                raise ValueError(f"Unsupported image type: {type(img)}")

            image_batch.append(pil_img)

        # Process in batches
        for i in range(0, len(image_batch), batch_size):
            batch_imgs = image_batch[i:i + batch_size]
            inputs = self.processor(batch_imgs, return_tensors="pt").to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)
                # Use the pooled output (e.g., CLS token)
                if hasattr(outputs, 'pooler_output'):  # For models like ViT
                    feats = outputs.pooler_output
                else:  # For resnet18, use last_hidden_state average
                    feats = outputs.last_hidden_state
                    # Properly handle the spatial dimensions
                    if len(feats.shape) == 4:  # (B, C, H, W)
                        feats = feats.mean(dim=[2, 3])  # Global average pooling -> (B, C)
                    elif len(feats.shape) == 3:  # (B, C, 1) - already mostly flattened
                        feats = feats.squeeze(-1)  # (B, C)
                    # If it's already 2D (B, C), leave as is

            all_features.append(feats.cpu().numpy())

        return np.concatenate(all_features, axis=0)

def sort_by_coords(patch_paths):
    """Sort patch paths by their coordinates extracted from filename"""
    def extract_coords(path):
        filename = os.path.basename(path)
        # Extract x, y coordinates from filename (adjust regex based on your naming convention)
        # Common patterns: patch_x_y.png, slide_x_y.png, etc.
        match = re.search(r'(\d+)_(\d+)', filename)
        if match:
            return int(match.group(1)), int(match.group(2))
        return 0, 0
    
    return sorted(patch_paths, key=extract_coords)

def create_h5_from_features(
    features_np,
    patch_paths,
    output_dir,
    slide_name,
    patch_size_level0
):
    """Create H5 file from features and patch paths"""
    os.makedirs(output_dir, exist_ok=True)
    h5_path = os.path.join(output_dir, f"{slide_name}.h5")
    
    # Extract coordinates from patch paths
    coords = []
    for path in patch_paths:
        filename = os.path.basename(path)
        # Extract x, y coordinates from filename
        match = re.search(r'(\d+)_(\d+)', filename)
        if match:
            x, y = int(match.group(1)), int(match.group(2))
            coords.append([x, y])
        else:
            coords.append([0, 0])  # Default if no coords found
    
    coords_np = np.array(coords)  # Shape: (N, 2)
    
    # Save to H5 file
    with H5File(h5_path, 'w') as file:
        file.create_dataset('features', data=features_np)
        coords_dataset = file.create_dataset('coords', data=coords_np)
        coords_dataset.attrs['patch_size_level0'] = patch_size_level0
    
    return h5_path

def run_full_pipeline_test(
    patch_data_dir: str,
    extractor,
    slide_meta_csv: str,
    non_white_csv: str,
    output_root: str,
    batch_size: int = 64
):
    # Load slide metadata and non-white patch names
    slide_meta_df = pd.read_csv(slide_meta_csv)
    non_white_df = pd.read_csv(non_white_csv)
    valid_patch_names = set(non_white_df['patch_name'].astype(str))

    # Iterate through all slide folders in patch_data_dir
    slide_dirs = glob.glob(os.path.join(patch_data_dir, "*"))

    for slide_dir in slide_dirs:
        slide_name = os.path.basename(slide_dir)
        print(f"\nProcessing slide: {slide_name}")

        # Match objective power
        meta_row = slide_meta_df[slide_meta_df["renamed"].str.lower() == slide_name.lower()]
        if meta_row.empty:
            print(f"  ❌ Skipped: Slide metadata not found for {slide_name}")
            continue

        try:
            obj_power = int(meta_row.iloc[0]["Objective Power"])
            patch_size_level0 = 1024 if obj_power == 40 else 512 if obj_power == 20 else None
            if patch_size_level0 is None:
                print(f"  ❌ Unsupported objective power: {obj_power}")
                continue
        except Exception as e:
            print(f"  ❌ Error extracting objective power: {e}")
            continue

        # Filter valid patches for this slide
        all_patches = glob.glob(os.path.join(slide_dir, "*.png"))
        patch_paths = [p for p in all_patches if os.path.basename(p) in valid_patch_names]

        if not patch_paths:
            print(f"  ❌ No valid non-white patches found for {slide_name}")
            continue

        # Sort and extract features
        patch_paths = sort_by_coords(patch_paths)
        output_dir = os.path.join(output_root, slide_name)
        os.makedirs(output_dir, exist_ok=True)

        features_list = []
        print("  🔍 Extracting features:")
        for i in tqdm(range(0, len(patch_paths), batch_size)):
            batch_paths = patch_paths[i:i + batch_size]
            try:
                feats = extractor.extract_features(batch_paths, batch_size=batch_size)
                features_list.append(feats)
                print(f"    ✅ Batch {i//batch_size}: {feats.shape}")
            except Exception as e:
                print(f"    ⚠️ Skipping batch {i//batch_size} due to error: {e}")
                continue

        if not features_list:
            print(f"  ❌ No features extracted for {slide_name}")
            continue

        features_np = np.concatenate(features_list, axis=0)
        print(f"  📐 Feature shape for {slide_name}: {features_np.shape}")

        h5_path = create_h5_from_features(
            features_np,
            patch_paths,
            output_dir=os.path.join(output_dir, "patch_features"),
            slide_name=slide_name,
            patch_size_level0=patch_size_level0
        )
        print(f"  💾 Saved H5 file at: {h5_path}")

        # Load back and verify
        with H5File(h5_path, 'r') as file:
            patch_features = torch.from_numpy(file['features'][:])
            patch_coords = torch.from_numpy(file['coords'][:])
            patch_size_lv0 = file['coords'].attrs['patch_size_level0']

        print(f"  🔍 Loaded back features shape: {patch_features.shape}")
        print(f"  📍 Coords shape: {patch_coords.shape}, Patch size: {patch_size_lv0}")

        pt_data = {
            'features': patch_features,
            'coords': patch_coords,
            'patch_size_level0': patch_size_lv0
        }

        pt_path = h5_path.replace('.h5', '.pt')
        torch.save(pt_data, pt_path)
        print(f"  💾 Saved .pt at: {pt_path}")

# Usage example:
if __name__ == "__main__":
    # Initialize extractor
    extractor = ResNetFeatureExtractor()
    
    # For Laptop testing
    run_full_pipeline_test(
        patch_data_dir=r"D:\Titan_Project\Test_data_smallest",
        extractor=extractor,
        slide_meta_csv=r"D:\Titan_Project\slide_metadata_filtered.csv",
        non_white_csv=r"D:\Titan_Project\final_merged_without_white.csv",
        output_root=r"D:\DataInsight\CRC100k\output_dir",
        batch_size=64
    )

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.



Processing slide: TCGA-3L-AA1B_nonMSIH
  🔍 Extracting features:


100%|██████████| 1/1 [00:00<00:00,  2.45it/s]


BBBBBBBBBBBBBBBBBBBBBBBBBBBB
    ✅ Batch 0: (2, 512, 1, 1)
  📐 Feature shape for TCGA-3L-AA1B_nonMSIH: (2, 512, 1, 1)
  💾 Saved H5 file at: D:\DataInsight\CRC100k\output_dir\TCGA-3L-AA1B_nonMSIH\patch_features\TCGA-3L-AA1B_nonMSIH.h5
  🔍 Loaded back features shape: torch.Size([2, 512, 1, 1])
  📍 Coords shape: torch.Size([2, 2]), Patch size: 1024
  💾 Saved .pt at: D:\DataInsight\CRC100k\output_dir\TCGA-3L-AA1B_nonMSIH\patch_features\TCGA-3L-AA1B_nonMSIH.pt

Processing slide: TCGA-A6-5661_MSIH
  🔍 Extracting features:


100%|██████████| 1/1 [00:00<00:00,  1.67it/s]


BBBBBBBBBBBBBBBBBBBBBBBBBBBB
    ✅ Batch 0: (2, 512, 1, 1)
  📐 Feature shape for TCGA-A6-5661_MSIH: (2, 512, 1, 1)
  💾 Saved H5 file at: D:\DataInsight\CRC100k\output_dir\TCGA-A6-5661_MSIH\patch_features\TCGA-A6-5661_MSIH.h5
  🔍 Loaded back features shape: torch.Size([2, 512, 1, 1])
  📍 Coords shape: torch.Size([2, 2]), Patch size: 1024
  💾 Saved .pt at: D:\DataInsight\CRC100k\output_dir\TCGA-A6-5661_MSIH\patch_features\TCGA-A6-5661_MSIH.pt

Processing slide: TCGA-AA-3846_nonMSIH
  🔍 Extracting features:


100%|██████████| 1/1 [00:00<00:00,  2.77it/s]

BBBBBBBBBBBBBBBBBBBBBBBBBBBB
    ✅ Batch 0: (1, 512, 1, 1)
  📐 Feature shape for TCGA-AA-3846_nonMSIH: (1, 512, 1, 1)
  💾 Saved H5 file at: D:\DataInsight\CRC100k\output_dir\TCGA-AA-3846_nonMSIH\patch_features\TCGA-AA-3846_nonMSIH.h5
  🔍 Loaded back features shape: torch.Size([1, 512, 1, 1])
  📍 Coords shape: torch.Size([1, 2]), Patch size: 512
  💾 Saved .pt at: D:\DataInsight\CRC100k\output_dir\TCGA-AA-3846_nonMSIH\patch_features\TCGA-AA-3846_nonMSIH.pt





In [18]:
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import torch

# Load image
image_path = r"D:\DataInsight\CRC100k\Test_data_smallest\TCGA-3L-AA1B_nonMSIH\TCGA-3L-AA1B_nonMSIH_x2608_y14048_patch00000.png"
image = Image.open(image_path).convert("RGB")  # Make sure it's in RGB

# Load processor and model (for feature extraction)
image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-18")
model = AutoModel.from_pretrained("microsoft/resnet-18")

# Preprocess the image
inputs = image_processor(image, return_tensors="pt")

# Get features (skip classification head)
with torch.no_grad():
    outputs = model(**inputs)
    features = outputs.last_hidden_state  # Should be [1, 512] for ResNet-18

# If needed as a numpy array
pooled_features = torch.nn.functional.adaptive_avg_pool2d(features, (1, 1)).squeeze()  # Shape: [512]
print("Pooled feature shape:", pooled_features.shape)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


Pooled feature shape: torch.Size([512])


In [None]:
import torch
from torchvision import models, transforms
from PIL import Image

# Load pre-trained ResNet18 from torchvision
resnet18 = models.resnet18(pretrained=True)
resnet18.eval()

# Remove final classification layer (fc)
backbone = torch.nn.Sequential(*(list(resnet18.children())[:-1]))  # Excludes the last FC layer

# Image preprocessing (same as torchvision defaults)
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

# Load and preprocess image
image_path = r"D:\DataInsight\CRC100k\Test_data_smallest\TCGA-3L-AA1B_nonMSIH\TCGA-3L-AA1B_nonMSIH_x2608_y14048_patch00000.png"
image = Image.open(image_path).convert("RGB")
input_tensor = transform(image).unsqueeze(0)  # Shape: [1, 3, 224, 224]

# Get pooled features directly
with torch.no_grad():
    features = backbone(input_tensor)  # Shape: [1, 512, 1, 1]
    features = features.view(features.size(0), -1)  # Shape: [1, 512]

print("Pooled feature shape:", features.shape)




torch.Size([1, 3, 224, 224])
torch.Size([1, 512, 1, 1])
torch.Size([1, 512])
Pooled feature shape: torch.Size([1, 512])


In [None]:
# Check torchvision resnet preprocessing
from torchvision.models import resnet18, ResNet18_Weights

# Load weights and their associated transforms
weights = ResNet18_Weights.DEFAULT
preprocess = weights.transforms()

# View the actual transform pipeline
print(preprocess)

ImageClassification(
    crop_size=[224]
    resize_size=[256]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)


In [2]:
import h5py
import numpy as np
import re
import os
from typing import List, Tuple, Dict, Any
from pathlib import Path
import glob
import time
from concurrent.futures import ThreadPoolExecutor
import torch
from PIL import Image

def extract_coords_from_filename(filename: str) -> Tuple[int, int]:
    """
    Extract x, y coordinates from patch filename.
    
    Expected format: TCGA-AA-3846_nonMSIH_x3152_y22080_patch00015.png
    
    Args:
        filename: Full filename or just the basename
        
    Returns:
        Tuple of (x, y) coordinates
        
    Raises:
        ValueError: If coordinates cannot be extracted from filename
    """
    # Remove path and extension to get just the basename
    basename = os.path.splitext(os.path.basename(filename))[0]
    
    # Pattern to match x and y coordinates
    pattern = r'_x(\d+)_y(\d+)'
    
    match = re.search(pattern, basename)
    if not match:
        raise ValueError(f"Could not extract coordinates from filename: {filename}")
    
    x_coord = int(match.group(1))
    y_coord = int(match.group(2))
    
    return x_coord, y_coord

def sort_by_coords(image_paths: List[str]) -> List[str]:
    def get_xy(path):
        try:
            return extract_coords_from_filename(path)
        except ValueError:
            return (float('inf'), float('inf'))  # Push invalid files to end
    return sorted(image_paths, key=get_xy)

def load_image_batch(image_paths: List[str], start_idx: int, batch_size: int, transform) -> torch.Tensor:
    """
    Load and transform a batch of images efficiently using threading.
    """
    end_idx = min(start_idx + batch_size, len(image_paths))
    batch_paths = image_paths[start_idx:end_idx]
    
    def load_single_image(path):
        try:
            img = Image.open(path).convert("RGB")
            return transform(img)
        except Exception as e:
            print(f"Warning: Failed to load {path}: {e}")
            return None
    
    # Use threading for I/O bound image loading
    with ThreadPoolExecutor(max_workers=min(8, len(batch_paths))) as executor:
        tensors = list(executor.map(load_single_image, batch_paths))
    
    # Filter out None values and stack
    valid_tensors = [t for t in tensors if t is not None]
    if not valid_tensors:
        return torch.empty(0)
    
    return torch.stack(valid_tensors)

def create_h5_from_features(
    features: np.ndarray,
    image_paths: List[str],
    output_dir: str,
    slide_name: str = None,
    patch_size_level0: int = 1024,
    additional_attributes: Dict[str, Any] = None
) -> str:
    """
    Create H5 file with features and coordinates extracted from image filenames.
    
    Args:
        features: Feature array of shape (n_patches, feature_dim)
        image_paths: List of image file paths corresponding to features
        output_dir: Directory where to save the H5 file
        slide_name: Name of the slide (extracted from first filename if None)
        patch_size_level0: Patch size at level 0
        additional_attributes: Additional attributes to add to coords dataset
        
    Returns:
        Path to created H5 file
    """
    if len(features) != len(image_paths):
        raise ValueError(f"Features length ({len(features)}) must match image_paths length ({len(image_paths)})")
    
    # Extract coordinates from filenames using threading for speed
    def extract_coords_worker(img_path):
        try:
            x, y = extract_coords_from_filename(img_path)
            return [x, y]
        except ValueError as e:
            print(f"Warning: {e}")
            return None
    
    # Parallel coordinate extraction
    with ThreadPoolExecutor(max_workers=8) as executor:
        coords_list = list(executor.map(extract_coords_worker, image_paths))
    
    # Filter out None values
    coords = np.array([c for c in coords_list if c is not None], dtype=np.int64)
    
    # Extract slide name and label from first filename for H5 filename
    if slide_name is None:
        first_filename = os.path.basename(image_paths[0])
        # Extract first 12 characters + label: TCGA-AA-3846_nonMSIH or TCGA-AA-3846_MSIH
        parts = first_filename.split('_')
        if len(parts) >= 2:
            slide_name = '_'.join(parts[:2])  # e.g., "TCGA-AA-3846_nonMSIH"
        else:
            slide_name = parts[0]
    
    # Create H5 filename from slide name
    h5_filename = f"{slide_name}_patch_features.h5"
    output_path = os.path.join(output_dir, h5_filename)
    
    # Ensure features have the right shape (add batch dimension if needed)
    if features.ndim == 2:
        features = features[np.newaxis, :]  # Add batch dimension: (1, n_patches, feature_dim)
    
    # Ensure coords have the right shape
    if coords.ndim == 2:
        coords = coords[np.newaxis, :]  # Add batch dimension: (1, n_patches, 2)
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Create H5 file with optimized settings
    with h5py.File(output_path, 'w') as h5f:
        # Create features dataset with compression
        features_dataset = h5f.create_dataset(
            'features', 
            data=features.astype(np.float32),
            dtype=np.float32,
            compression='gzip',
            compression_opts=1,  # Light compression for speed
            chunks=True
        )
        
        # Create coordinates dataset
        coords_dataset = h5f.create_dataset(
            'coords',
            data=coords.astype(np.int64),
            dtype=np.int64,
            compression='gzip',
            compression_opts=1,
            chunks=True
        )
        
        # Add attributes to coords dataset
        default_attributes = {
            'contour_fn': 'four_pt',
            'custom_downsample': 2.0,
            'downsample': np.array([1., 1.]),
            'name': slide_name,
            'patch_level': 0,
            'patch_size': patch_size_level0,
            'patch_size_level0': patch_size_level0,
            'step_size': patch_size_level0,
            'use_padding': True
        }
        
        # Add additional attributes if provided
        if additional_attributes:
            default_attributes.update(additional_attributes)
        
        # Set attributes
        for key, value in default_attributes.items():
            coords_dataset.attrs[key] = value
    
    print(f"H5 file created successfully: {output_path}")
    print(f"Features shape: {features.shape}")
    print(f"Coords shape: {coords.shape}")
    
    return output_path

def process_patch_directory_optimized(
    patch_dir: str,
    extractor,
    output_dir: str,
    csv_path: str,
    batch_size: int = 64,  # Increased default batch size
    file_pattern: str = "*.png",
    patch_size_level0: int = 1024,
    max_workers: int = 8,
    progress_interval: int = 10
) -> str:
    """
    Process all patches in a directory and create H5 file with optimizations.
    
    Args:
        patch_dir: Directory containing patch images
        extractor: ConchFeatureExtractor instance
        output_dir: Directory for output H5 file
        batch_size: Batch size for feature extraction (larger = faster)
        file_pattern: File pattern to match (e.g., "*.png", "*.jpg")
        patch_size_level0: Patch size at level 0
        max_workers: Number of threads for image loading
        progress_interval: Print progress every N batches
        
    Returns:
        Path to created H5 file
    """
    # Extract slide name from directory name
    slide_name = os.path.basename(patch_dir.rstrip('/\\'))
    
    # Load Excel and find matching patch size
    df_meta = pd.read_csv(csv_path)
    
    # Find row where "renamed" matches slide_name (case-insensitive match)
    row = df_meta[df_meta['renamed'].str.lower() == slide_name.lower()]
    
    if row.empty:
        raise ValueError(f"Slide name '{slide_name}' not found in csv file.")
    
    objective_power = int(row.iloc[0]['Objective Power'])
    if objective_power == 40:
        patch_size_level0 = 1024
    elif objective_power == 20:
        patch_size_level0 = 512
    else:
        raise ValueError(f"Unsupported objective power: {objective_power} for slide {slide_name}")
    
    print(f"Detected patch size: {patch_size_level0} for slide '{slide_name}' with objective power {objective_power}")

    start_time = time.time()
    
    # Get all image files
    print("Scanning for images...")
    image_paths = glob.glob(os.path.join(patch_dir, file_pattern))
    
    if not image_paths:
        raise ValueError(f"No images found in {patch_dir} with pattern {file_pattern}")
    
    # Sort paths for consistent ordering
    image_paths = sort_by_coords(image_paths)
    
    print(f"Found {len(image_paths)} images to process")
    print(f"Using batch size: {batch_size}")
    print(f"Estimated batches: {len(image_paths) // batch_size + 1}")
    
    # Pre-allocate result arrays for better memory efficiency
    n_images = len(image_paths)
    feature_dim = None
    all_features = []
    
    # Process in batches with optimized loading
    print("Extracting features...")
    batch_count = 0
    
    for i in range(0, n_images, batch_size):
        batch_start = time.time()
        
        # Load batch of images with threading
        batch_tensor = load_image_batch(image_paths, i, batch_size, extractor.eval_transform)
        
        if batch_tensor.numel() == 0:
            print(f"Warning: Empty batch at index {i}")
            continue
        
        # Move to GPU and extract features
        batch_tensor = batch_tensor.to(extractor.device)
        
        with torch.no_grad():
            features = extractor.conch(batch_tensor)
            features_np = features.cpu().numpy()
            all_features.append(features_np)
        
        batch_count += 1
        
        # Progress reporting
        if batch_count % progress_interval == 0:
            batch_time = time.time() - batch_start
            processed = min(i + batch_size, n_images)
            elapsed = time.time() - start_time
            rate = processed / elapsed
            eta = (n_images - processed) / rate if rate > 0 else 0
            
            print(f"Batch {batch_count}: {processed}/{n_images} images "
                  f"({processed/n_images*100:.1f}%) - "
                  f"Rate: {rate:.1f} imgs/sec - "
                  f"ETA: {eta:.1f}s - "
                  f"Batch time: {batch_time:.2f}s")
    
    # Concatenate all features
    if not all_features:
        raise ValueError("No features extracted - check your images")
    
    features = np.concatenate(all_features, axis=0)
    
    extraction_time = time.time() - start_time
    print(f"Feature extraction completed in {extraction_time:.2f}s")
    print(f"Average rate: {len(image_paths)/extraction_time:.1f} images/sec")
    
    # Create H5 file
    print("Creating H5 file...")
    h5_start = time.time()
    
    h5_path = create_h5_from_features(
        features=features,
        image_paths=image_paths,
        output_dir=output_dir,
        patch_size_level0=patch_size_level0
    )
    
    h5_time = time.time() - h5_start
    total_time = time.time() - start_time
    
    print(f"H5 creation completed in {h5_time:.2f}s")
    print(f"Total processing time: {total_time:.2f}s")
    print(f"Overall rate: {len(image_paths)/total_time:.1f} images/sec")
    
    return h5_path

# Performance benchmarking function
def benchmark_processing(
    patch_dir: str,
    extractor,
    output_dir: str,
    batch_sizes: List[int] = [32, 64, 128, 256]
) -> Dict[int, float]:
    """
    Benchmark different batch sizes to find optimal performance.
    
    Args:
        patch_dir: Directory containing patch images
        extractor: ConchFeatureExtractor instance
        output_dir: Directory for output H5 file
        batch_sizes: List of batch sizes to test
        
    Returns:
        Dictionary mapping batch_size to processing time
    """
    results = {}
    
    # Get image count
    image_paths = glob.glob(os.path.join(patch_dir, "*.png"))
    n_images = len(image_paths)
    
    print(f"Benchmarking with {n_images} images")
    
    for batch_size in batch_sizes:
        print(f"\nTesting batch size: {batch_size}")
        
        try:
            start_time = time.time()
            
            # Just test feature extraction (skip H5 creation for speed)
            for i in range(0, min(n_images, 500), batch_size):  # Test first 500 images
                batch_tensor = load_image_batch(image_paths, i, batch_size, extractor.eval_transform)
                
                if batch_tensor.numel() == 0:
                    continue
                
                batch_tensor = batch_tensor.to(extractor.device)
                
                with torch.no_grad():
                    features = extractor.conch(batch_tensor)
                    _ = features.cpu().numpy()
            
            elapsed = time.time() - start_time
            rate = min(500, n_images) / elapsed
            
            results[batch_size] = rate
            print(f"Batch size {batch_size}: {rate:.1f} images/sec")
            
        except Exception as e:
            print(f"Batch size {batch_size} failed: {e}")
            results[batch_size] = 0
    
    return results

In [3]:
import torch
from torchvision import models, transforms
from PIL import Image
import numpy as np
from typing import Union, List

class ResNetFeatureExtractor:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load pretrained torchvision ResNet-18 and remove final fc layer
        resnet18 = models.resnet18(pretrained=True)
        self.model = torch.nn.Sequential(*(list(resnet18.children())[:-1]))  # Exclude final FC
        self.model = self.model.to(self.device)
        self.model.eval()

        # Use standard ImageNet preprocessing
        self.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])

    def extract_features(self, 
                         images: Union[str, Image.Image, List[str], List[Image.Image]], 
                         batch_size: int = 32) -> np.ndarray:
        if isinstance(images, (str, Image.Image)):
            images = [images]

        all_features = []
        image_batch = []

        # Load and preprocess images
        for img in images:
            if isinstance(img, str):
                pil_img = Image.open(img).convert("RGB")
            elif isinstance(img, Image.Image):
                pil_img = img.convert("RGB")
            else:
                raise ValueError(f"Unsupported image type: {type(img)}")

            tensor = self.transform(pil_img)
            image_batch.append(tensor)

        # Batch processing
        for i in range(0, len(image_batch), batch_size):
            batch_tensor = torch.stack(image_batch[i:i + batch_size]).to(self.device)  # Shape: (B, 3, 224, 224)

            with torch.no_grad():
                features = self.model(batch_tensor)  # Shape: (B, 512, 1, 1)
                features = features.view(features.size(0), -1)  # Shape: (B, 512)

            all_features.append(features.cpu().numpy())

        return np.concatenate(all_features, axis=0)

In [4]:
import os
import glob
import pandas as pd
import torch
import numpy as np
from h5py import File as H5File
from transformers import AutoModel
from tqdm import tqdm

def run_full_pipeline(
    patch_data_dir: str,
    extractor,
    slide_meta_csv: str,
    non_white_csv: str,
    output_root: str,
    batch_size: int = 64
):
    # Load slide metadata and non-white patch names
    slide_meta_df = pd.read_csv(slide_meta_csv)
    non_white_df = pd.read_csv(non_white_csv)
    valid_patch_names = set(non_white_df['patch_name'].astype(str))

    # Iterate through all slide folders in patch_data_dir
    slide_dirs = glob.glob(os.path.join(patch_data_dir, "*"))

    for slide_dir in slide_dirs:
        slide_name = os.path.basename(slide_dir)

        output_dir = os.path.join(output_root, slide_name)
        if os.path.exists(output_dir):
            print(f"  ⏭️ Skipped: Output already exists for {slide_name}")
            continue
        os.makedirs(output_dir, exist_ok=True)

        print(f"\nProcessing slide: {slide_name}")

        # Match objective power
        meta_row = slide_meta_df[slide_meta_df["renamed"].str.lower() == slide_name.lower()]
        if meta_row.empty:
            print(f"  ❌ Skipped: Slide metadata not found for {slide_name}")
            continue

        try:
            obj_power = int(meta_row.iloc[0]["Objective Power"])
            patch_size_level0 = 1024 if obj_power == 40 else 512 if obj_power == 20 else None
            if patch_size_level0 is None:
                print(f"  ❌ Unsupported objective power: {obj_power}")
                continue
        except Exception as e:
            print(f"  ❌ Error extracting objective power: {e}")
            continue

        # Filter valid patches for this slide
        all_patches = glob.glob(os.path.join(slide_dir, "*.png"))
        patch_paths = [p for p in all_patches if os.path.basename(p) in valid_patch_names]

        if not patch_paths:
            print(f"  ❌ No valid non-white patches found for {slide_name}")
            continue

        # Sort and extract features
        patch_paths = sort_by_coords(patch_paths)

        features_list = []
        for i in range(0, len(patch_paths), batch_size):
            batch_paths = patch_paths[i:i + batch_size]
            try:
                feats = extractor.extract_features(batch_paths, batch_size=batch_size)  # RESNET
                features_list.append(feats)
            except Exception as e:
                print(f"  ⚠️ Skipping batch due to error: {e}")
                continue

        if not features_list:
            print(f"  ❌ No features extracted for {slide_name}")
            continue

        features_np = np.concatenate(features_list, axis=0)
        h5_path = create_h5_from_features(
            features_np,
            patch_paths,
            output_dir=output_dir,
            slide_name=slide_name,
            patch_size_level0=patch_size_level0
        )

        # Save a .pt version of the h5 contents
        with H5File(h5_path, 'r') as file:
            patch_features = torch.from_numpy(file['features'][:])
            patch_coords = torch.from_numpy(file['coords'][:])
            patch_size_lv0 = file['coords'].attrs['patch_size_level0']
        
        pt_data = {
            'features': patch_features,
            'coords': patch_coords,
            'patch_size_level0': patch_size_lv0
        }
        
        pt_path = h5_path.replace('.h5', '.pt')
        torch.save(pt_data, pt_path)
        
        print(f"  💾 Saved patch data to .pt at {pt_path}")
        
        # # Extract slide embedding
        # with H5File(h5_path, 'r') as file:
        #     feats = torch.from_numpy(file['features'][:])
        #     coords = torch.from_numpy(file['coords'][:])
        #     patch_size_lv0 = file['coords'].attrs['patch_size_level0']

        # with torch.autocast('cuda', torch.float16), torch.inference_mode():
        #     slide_emb = titan.encode_slide_from_patch_features(feats, coords, patch_size_lv0)

        # slide_emb_path = os.path.join(output_dir, "slide_features", f"{slide_name}_slide_embedding.pt")
        # os.makedirs(os.path.dirname(slide_emb_path), exist_ok=True)
        # torch.save(slide_emb, slide_emb_path)

        # print(f"  ✅ Done: Saved H5 and slide embedding for {slide_name}")

In [5]:
extractor = ResNetFeatureExtractor()

run_full_pipeline(
    patch_data_dir=r"D:\Aamir Gulzar\KSA_project2\dataset\patch_data",
    extractor=extractor,
    slide_meta_csv=r"D:\Aamir Gulzar\KSA_project2\Cancer-detection-classifier\feature_extraction\slide_metadata_filtered.csv",
    non_white_csv=r"D:\Aamir Gulzar\KSA_project2\Cancer-detection-classifier\feature_extraction\final_merged_without_white.csv",
    output_root=r"D:\Aamir Gulzar\KSA_project2\dataset\Features\ResNet_18",
    batch_size=64
)



  ⏭️ Skipped: Output already exists for TCGA-3L-AA1B_nonMSIH
  ⏭️ Skipped: Output already exists for TCGA-4N-A93T_nonMSIH
  ⏭️ Skipped: Output already exists for TCGA-5M-AAT4_nonMSIH
  ⏭️ Skipped: Output already exists for TCGA-5M-AAT6_MSIH
  ⏭️ Skipped: Output already exists for TCGA-5M-AATE_nonMSIH
  ⏭️ Skipped: Output already exists for TCGA-A6-2671_nonMSIH
  ⏭️ Skipped: Output already exists for TCGA-A6-2681_nonMSIH
  ⏭️ Skipped: Output already exists for TCGA-A6-2685_nonMSIH
  ⏭️ Skipped: Output already exists for TCGA-A6-2686_MSIH
  ⏭️ Skipped: Output already exists for TCGA-A6-3807_nonMSIH
  ⏭️ Skipped: Output already exists for TCGA-A6-4105_nonMSIH
  ⏭️ Skipped: Output already exists for TCGA-A6-4107_nonMSIH
  ⏭️ Skipped: Output already exists for TCGA-A6-5657_nonMSIH
  ⏭️ Skipped: Output already exists for TCGA-A6-5660_nonMSIH
  ⏭️ Skipped: Output already exists for TCGA-A6-5661_MSIH
  ⏭️ Skipped: Output already exists for TCGA-A6-5662_nonMSIH
  ⏭️ Skipped: Output already exis