# CellViT Classification Dataset — Patch and Label Generation

**Purpose:**  
This notebook prepares **256x256 classification patches** specifically for  
the *CellViT classification head*. Each patch contains cropped H&E tissue  
and a corresponding label file listing nucleus centers and classes.

---

### What It Does
1. Loads processed maps (`inst_map`, `type_map`) from `whole_slide/`.
2. Locates the corresponding RGB `.svs` or `.tif` image.
3. Tiles the slide into non-overlapping 256x256 patches.
4. For each patch:
   - Saves an RGB patch (`.png`)
   - Extracts nucleus centroids and class IDs → saves as `.csv`
5. Produces a clean folder structure compatible with CellViT’s training pipeline.
6. Generates patch-level `inst_map` and `type_map` crops for the 256x256 patches.

---

### Output Folder Structure
```
ProcessedDataset/
└── v1_40x_area20/
    └── patches_cellvit_p256/
        ├── images/   ← 256x256 cropped RGB patches
        ├── labels/   ← matching .csv files with x,y,class
        ├── debug/    ← visualization or QA exports
        └── _meta/    ← configuration and logs
```


---

### Table of Contents
- [Step 1 — Setup and Folder Creation](#step1)
- [Step 2 — Patch Extraction and Label Generation](#step2)
- [Step 3 — Visual Inspection of Generated Patches](#step3)
- [Step 4 — Dataset Summary & Integrity Checks](#step4)
- [Step 5 — Dataset Integrity + Overview](#step5)
- [Step 6 — Crop and save 256x256 patch-level inst/type maps](#step6)



<a id="step1"></a>
# Step 1 — Setup and Folder Creation


In [2]:
from pathlib import Path
import json
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# --- Config ---
# Root of the project checkout and of the processed dataset version used here.
PROJECT_ROOT = Path("/projectnb/ec500kb/projects/Fall_2025_Projects/Project_2/AI-guided-whole-slide-imaging-analysis")
PROC_ROOT = PROJECT_ROOT / "ProcessedDataset/v1_40x_area20"
PATCH_SIZE = 256
TASK_NAME = f"patches_cellvit_p{PATCH_SIZE}"

# --- Input directories ---
RAW_IMG_ROOT = PROJECT_ROOT / "Datasets/BU_course_data/TrainingData/TrainingImages_and_Annotations"
INST_DIR = PROC_ROOT / "whole_slide/inst_maps"
TYPE_DIR = PROC_ROOT / "whole_slide/type_maps"

# --- Output directories ---
# One sub-folder per artefact kind; creation is idempotent so the cell can be re-run.
PATCH_ROOT = PROC_ROOT / TASK_NAME
for _subdir in ("images", "labels", "debug", "_meta"):
    (PATCH_ROOT / _subdir).mkdir(parents=True, exist_ok=True)

# --- Metadata loading ---
# class_map.json lives in the dataset-level _meta folder (not the task folder).
class_map = json.loads((PROC_ROOT / "_meta/class_map.json").read_text())

# --- Record run parameters ---
# Persist the settings of this run next to the generated patches.
params = {
    "task": "cellvit_classification_head",
    "patch_size": PATCH_SIZE,
    "input_root": str(PROC_ROOT),
    "output_root": str(PATCH_ROOT),
    "num_classes": len(class_map),
    "class_map": class_map,
}
(PATCH_ROOT / "_meta/params.json").write_text(json.dumps(params, indent=2))

# --- Confirmation ---
print(f"‚úÖ Initialized output structure for CellViT classification patches:")
print(f"  ‚Üí {PATCH_ROOT}")
print(f"  Classes: {list(class_map.values())}")


‚úÖ Initialized output structure for CellViT classification patches:
  ‚Üí /projectnb/ec500kb/projects/Fall_2025_Projects/Project_2/AI-guided-whole-slide-imaging-analysis/ProcessedDataset/v1_40x_area20/patches_cellvit_p256
  Classes: ['background', 'epithelial', 'lymphocyte', 'macrophage', 'neutrophil']


<a id="step2"></a>
# Step 2 — Patch Extraction and Label Generation

In [3]:
from skimage.measure import regionprops
import cv2
from openslide import OpenSlide

def find_image_path(slide_name: str) -> Path | None:
    """
    Search every sub-directory of RAW_IMG_ROOT for the slide's image file.

    Directories are visited in sorted order; inside each one a
    ``<slide_name>.svs`` file is tried before ``<slide_name>.tif``.
    Returns the first existing path, or None when no directory has either.
    """
    patient_dirs = (entry for entry in sorted(RAW_IMG_ROOT.iterdir()) if entry.is_dir())
    for folder in patient_dirs:
        for suffix in (".svs", ".tif"):
            path = folder / f"{slide_name}{suffix}"
            if path.exists():
                return path
    return None


def load_rgb_image(img_path: Path) -> np.ndarray:
    """
    Read a whole image into memory as an RGB array.

    Files whose suffix is ``.svs`` (case-insensitive) are read with OpenSlide
    at level 0 over the full level-0 extent; every other file is read with
    ``cv2.imread`` and converted from BGR to RGB.

    Raises:
        RuntimeError: if ``cv2.imread`` returns None for a non-.svs file.
    """
    if img_path.suffix.lower() != ".svs":
        bgr = cv2.imread(str(img_path))
        if bgr is None:
            raise RuntimeError(f"Failed to load {img_path}")
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    with OpenSlide(str(img_path)) as slide:
        width, height = slide.level_dimensions[0]
        region = slide.read_region((0, 0), 0, (width, height))
        # Keep only the first three channels of the region returned by OpenSlide.
        pixels = np.array(region)[:, :, :3]
    return pixels


def extract_patch_labels(inst_map: np.ndarray, type_map: np.ndarray):
    """
    Return (x, y, class_id) for each nucleus centroid.

    Args:
        inst_map: 2-D label image handed to ``skimage.measure.regionprops``;
            every non-zero label is treated as one nucleus instance.
        type_map: array indexed with the same (row, col) coordinates as
            ``inst_map``, holding a class id per pixel.

    Returns:
        A list of ``(x, y, class_id)`` tuples of Python ints. ``x``/``y`` are
        the region centroid truncated to integers. ``class_id`` is the most
        frequent class among the instance's pixels (ties go to the lowest id).
        Instances whose majority class is 0 (background) are omitted.
    """
    rows = []
    for region in regionprops(inst_map):
        # region.coords holds the (row, col) of every pixel of this instance,
        # so the class values can be gathered directly instead of building a
        # full-patch boolean mask (inst_map == label) once per nucleus.
        pixel_rows = region.coords[:, 0]
        pixel_cols = region.coords[:, 1]
        # np.bincount only accepts integer input; cast so that type maps stored
        # with a float or unsigned dtype do not raise.
        cls_vals = np.asarray(type_map[pixel_rows, pixel_cols]).astype(np.int64, copy=False)
        cls = int(np.bincount(cls_vals).argmax())  # majority vote
        if cls == 0:
            continue  # skip background
        y, x = region.centroid
        rows.append((int(x), int(y), cls))
    return rows


# --- Main loop ---
# For every slide with an instance map: load maps + RGB image, tile the common
# extent into non-overlapping PATCH_SIZE squares, and write one PNG plus one
# header-less CSV (x, y, class) per tile that contains at least one labelled
# nucleus.
#
# removesuffix strips only the trailing "_inst"; str.replace would also rewrite
# an "_inst" occurring earlier in a slide name and break the path lookup below.
slides = sorted(p.stem.removesuffix("_inst") for p in INST_DIR.glob("*_inst.npy"))
print(f"Found {len(slides)} processed slides.")

for slide_name in tqdm(slides, desc="Generating patches"):
    inst_path = INST_DIR / f"{slide_name}_inst.npy"
    type_path = TYPE_DIR / f"{slide_name}_type.npy"

    # tqdm.write is used for per-slide messages so they do not get interleaved
    # with the progress bar.
    if not inst_path.exists() or not type_path.exists():
        tqdm.write(f"‚ö†Ô∏è Missing maps for {slide_name}")
        continue

    # Best-effort batch run: a failure in one slide is reported and the loop
    # moves on to the next slide.
    try:
        inst_map = np.load(inst_path)
        type_map = np.load(type_path)
        img_path = find_image_path(slide_name)
        if img_path is None:
            tqdm.write(f"‚ö†Ô∏è No image found for {slide_name}")
            continue
        rgb = load_rgb_image(img_path)

        h, w = rgb.shape[:2]
        # The three arrays are sliced with the same coordinates, so they must
        # cover the same pixel grid. On a mismatch, report it and restrict the
        # tiling to the region all three arrays actually cover.
        if inst_map.shape[:2] != (h, w) or type_map.shape[:2] != (h, w):
            tqdm.write(
                f"‚ö†Ô∏è Shape mismatch for {slide_name}: image {(h, w)}, "
                f"inst_map {inst_map.shape}, type_map {type_map.shape}"
            )
            h = min(h, inst_map.shape[0], type_map.shape[0])
            w = min(w, inst_map.shape[1], type_map.shape[1])

        saved_count = 0

        # The range bounds yield complete tiles only; partial edge tiles are skipped.
        for y in range(0, h - PATCH_SIZE + 1, PATCH_SIZE):
            for x in range(0, w - PATCH_SIZE + 1, PATCH_SIZE):
                patch_rgb = rgb[y:y+PATCH_SIZE, x:x+PATCH_SIZE]
                patch_inst = inst_map[y:y+PATCH_SIZE, x:x+PATCH_SIZE]
                patch_type = type_map[y:y+PATCH_SIZE, x:x+PATCH_SIZE]

                # skip empty patches
                if np.count_nonzero(patch_inst) == 0:
                    continue

                # extract centers (patch-local coordinates)
                centers = extract_patch_labels(patch_inst, patch_type)
                if not centers:
                    continue

                # save patch; file names encode the tile's top-left corner
                img_name = f"{slide_name}_{x}_{y}.png"
                csv_name = f"{slide_name}_{x}_{y}.csv"

                # cv2.imwrite signals failure through its return value rather
                # than an exception; check it so a label file is never written
                # without its image.
                written = cv2.imwrite(
                    str(PATCH_ROOT / "images" / img_name),
                    cv2.cvtColor(patch_rgb, cv2.COLOR_RGB2BGR),
                )
                if not written:
                    raise RuntimeError(f"Failed to write {img_name}")
                pd.DataFrame(centers, columns=["x", "y", "class"]).to_csv(
                    PATCH_ROOT / "labels" / csv_name, index=False, header=False
                )
                saved_count += 1

        tqdm.write(f"Saved {saved_count} patches from {slide_name}")

    except Exception as e:
        tqdm.write(f"‚ö†Ô∏è Error in {slide_name}: {e}")

print("‚úÖ Patch generation complete.")


Found 209 processed slides.


Generating patches:   0%|          | 1/209 [00:00<02:46,  1.25it/s]

Saved 16 patches from TCGA-55-1594-01Z-00-DX1_001


Generating patches:   1%|‚ñè         | 3/209 [00:01<01:08,  2.99it/s]

Saved 4 patches from TCGA-55-1594-01Z-00-DX1_002
Saved 4 patches from TCGA-55-1594-01Z-00-DX1_003


Generating patches:   2%|‚ñè         | 5/209 [00:01<00:49,  4.16it/s]

Saved 4 patches from TCGA-55-1594-01Z-00-DX1_004
Saved 4 patches from TCGA-55-1594-01Z-00-DX1_005


Generating patches:   3%|‚ñé         | 7/209 [00:02<00:49,  4.07it/s]

Saved 20 patches from TCGA-5P-A9K0-01Z-00-DX1_1
Saved 4 patches from TCGA-5P-A9K0-01Z-00-DX1_2


Generating patches:   4%|‚ñç         | 9/209 [00:02<00:36,  5.48it/s]

Saved 0 patches from TCGA-5P-A9K0-01Z-00-DX1_3
Saved 3 patches from TCGA-69-7760-01Z-00-DX1_001


Generating patches:   5%|‚ñå         | 11/209 [00:02<00:34,  5.73it/s]

Saved 4 patches from TCGA-69-7760-01Z-00-DX1_002
Saved 4 patches from TCGA-69-7760-01Z-00-DX1_003


Generating patches:   7%|‚ñã         | 14/209 [00:03<00:35,  5.51it/s]

Saved 16 patches from TCGA-69-7760-01Z-00-DX1_004
Saved 0 patches from TCGA-69-A59K-01Z-00-DX1_001
Saved 1 patches from TCGA-69-A59K-01Z-00-DX1_002


Generating patches:   7%|‚ñã         | 15/209 [00:04<01:39,  1.95it/s]

Saved 40 patches from TCGA-69-A59K-01Z-00-DX1_003


Generating patches:   8%|‚ñä         | 16/209 [00:05<01:27,  2.20it/s]

Saved 4 patches from TCGA-69-A59K-01Z-00-DX1_004


Generating patches:   8%|‚ñä         | 17/209 [00:05<01:16,  2.52it/s]

Saved 5 patches from TCGA-73-4668-01Z-00-DX1_001


Generating patches:   9%|‚ñä         | 18/209 [00:05<01:21,  2.33it/s]

Saved 16 patches from TCGA-73-4668-01Z-00-DX1_002
Saved 0 patches from TCGA-73-4668-01Z-00-DX1_003


Generating patches:  10%|‚ñà         | 21/209 [00:06<00:50,  3.75it/s]

Saved 3 patches from TCGA-73-4668-01Z-00-DX1_004
Saved 3 patches from TCGA-78-7220-01Z-00-DX1_001


Generating patches:  11%|‚ñà         | 22/209 [00:06<00:43,  4.25it/s]

Saved 2 patches from TCGA-78-7220-01Z-00-DX1_002


Generating patches:  11%|‚ñà‚ñè        | 24/209 [00:07<00:49,  3.73it/s]

Saved 16 patches from TCGA-78-7220-01Z-00-DX1_003
Saved 4 patches from TCGA-78-7220-01Z-00-DX1_004


Generating patches:  12%|‚ñà‚ñè        | 25/209 [00:07<00:44,  4.16it/s]

Saved 2 patches from TCGA-86-7713-01Z-00-DX1_001


Generating patches:  12%|‚ñà‚ñè        | 26/209 [00:07<00:46,  3.97it/s]

Saved 4 patches from TCGA-86-7713-01Z-00-DX1_002


Generating patches:  13%|‚ñà‚ñé        | 27/209 [00:07<00:47,  3.81it/s]

Saved 6 patches from TCGA-86-7713-01Z-00-DX1_003


Generating patches:  13%|‚ñà‚ñé        | 28/209 [00:08<01:04,  2.81it/s]

Saved 12 patches from TCGA-86-7713-01Z-00-DX1_004


Generating patches:  14%|‚ñà‚ñç        | 29/209 [00:08<00:59,  3.04it/s]

Saved 4 patches from TCGA-86-8672-01Z-00-DX1_1


Generating patches:  14%|‚ñà‚ñç        | 30/209 [00:09<01:20,  2.24it/s]

Saved 18 patches from TCGA-86-8672-01Z-00-DX1_2


Generating patches:  15%|‚ñà‚ñç        | 31/209 [00:10<01:42,  1.74it/s]

Saved 25 patches from TCGA-86-8672-01Z-00-DX1_3


Generating patches:  15%|‚ñà‚ñå        | 32/209 [00:10<01:22,  2.14it/s]

Saved 4 patches from TCGA-A2-A0CV-01Z-00-DX1_1


Generating patches:  16%|‚ñà‚ñå        | 33/209 [00:10<01:09,  2.55it/s]

Saved 4 patches from TCGA-A2-A0CV-01Z-00-DX1_2
Saved 0 patches from TCGA-A2-A0CV-01Z-00-DX1_3


Generating patches:  18%|‚ñà‚ñä        | 37/209 [00:11<00:36,  4.72it/s]

Saved 6 patches from TCGA-A2-A0CV-01Z-00-DX1_4
Saved 0 patches from TCGA-A2-A0CV-01Z-00-DX1_5
Saved 1 patches from TCGA-A2-A0ES-01Z-00-DX1_1


Generating patches:  18%|‚ñà‚ñä        | 38/209 [00:11<00:43,  3.94it/s]

Saved 8 patches from TCGA-A2-A0ES-01Z-00-DX1_2
Saved 0 patches from TCGA-A2-A0ES-01Z-00-DX1_3


Generating patches:  20%|‚ñà‚ñâ        | 41/209 [00:12<00:36,  4.62it/s]

Saved 9 patches from TCGA-A2-A0ES-01Z-00-DX1_4
Saved 1 patches from TCGA-A2-A0ES-01Z-00-DX1_5


Generating patches:  21%|‚ñà‚ñà        | 44/209 [00:13<00:45,  3.63it/s]

Saved 28 patches from TCGA-B6-A0WZ-01Z-00-DX1_1
Saved 1 patches from TCGA-B6-A0WZ-01Z-00-DX1_2
Saved 1 patches from TCGA-B6-A0WZ-01Z-00-DX1_3


Generating patches:  22%|‚ñà‚ñà‚ñè       | 45/209 [00:13<00:39,  4.20it/s]

Saved 1 patches from TCGA-B6-A0WZ-01Z-00-DX1_4


Generating patches:  22%|‚ñà‚ñà‚ñè       | 46/209 [00:13<00:38,  4.26it/s]

Saved 4 patches from TCGA-B6-A0WZ-01Z-00-DX1_5
Saved 0 patches from TCGA-B6-A0WZ-01Z-00-DX1_6


Generating patches:  23%|‚ñà‚ñà‚ñé       | 48/209 [00:14<00:31,  5.13it/s]

Saved 2 patches from TCGA-B9-A44B-01Z-00-DX1_1


Generating patches:  23%|‚ñà‚ñà‚ñé       | 49/209 [00:14<00:38,  4.12it/s]

Saved 20 patches from TCGA-B9-A44B-01Z-00-DX1_2
Saved 0 patches from TCGA-B9-A44B-01Z-00-DX1_3


Generating patches:  24%|‚ñà‚ñà‚ñç       | 51/209 [00:14<00:37,  4.20it/s]

Saved 25 patches from TCGA-B9-A8YI-01Z-00-DX1_1


Generating patches:  25%|‚ñà‚ñà‚ñç       | 52/209 [00:15<00:37,  4.24it/s]

Saved 12 patches from TCGA-B9-A8YI-01Z-00-DX1_2


Generating patches:  26%|‚ñà‚ñà‚ñã       | 55/209 [00:15<00:32,  4.79it/s]

Saved 30 patches from TCGA-B9-A8YI-01Z-00-DX1_3
Saved 0 patches from TCGA-B9-A8YI-01Z-00-DX1_4
Saved 0 patches from TCGA-B9-A8YI-01Z-00-DX1_5


Generating patches:  27%|‚ñà‚ñà‚ñã       | 56/209 [00:16<00:45,  3.35it/s]

Saved 15 patches from TCGA-BH-A18T-01Z-00-DX1_1


Generating patches:  28%|‚ñà‚ñà‚ñä       | 58/209 [00:16<00:35,  4.22it/s]

Saved 4 patches from TCGA-BH-A18T-01Z-00-DX1_2
Saved 1 patches from TCGA-BH-A18T-01Z-00-DX1_3


Generating patches:  28%|‚ñà‚ñà‚ñä       | 59/209 [00:16<00:31,  4.78it/s]

Saved 1 patches from TCGA-BH-A18T-01Z-00-DX1_4


Generating patches:  29%|‚ñà‚ñà‚ñä       | 60/209 [00:17<00:37,  4.00it/s]

Saved 9 patches from TCGA-BH-A18T-01Z-00-DX1_5


Generating patches:  29%|‚ñà‚ñà‚ñâ       | 61/209 [00:17<00:50,  2.93it/s]

Saved 12 patches from TCGA-BH-A18T-01Z-00-DX1_6
Saved 0 patches from TCGA-D8-A1X5-01Z-00-DX2_1


Generating patches:  30%|‚ñà‚ñà‚ñà       | 63/209 [00:18<00:40,  3.60it/s]

Saved 16 patches from TCGA-D8-A1X5-01Z-00-DX2_2


Generating patches:  31%|‚ñà‚ñà‚ñà       | 65/209 [00:18<00:42,  3.42it/s]

Saved 28 patches from TCGA-D8-A1X5-01Z-00-DX2_3
Saved 6 patches from TCGA-D8-A1X5-01Z-00-DX2_4


Generating patches:  32%|‚ñà‚ñà‚ñà‚ñè      | 66/209 [00:19<00:40,  3.57it/s]

Saved 9 patches from TCGA-DW-7841-01Z-00-DX1_1


Generating patches:  33%|‚ñà‚ñà‚ñà‚ñé      | 69/209 [00:19<00:26,  5.37it/s]

Saved 9 patches from TCGA-DW-7841-01Z-00-DX1_2
Saved 1 patches from TCGA-DW-7841-01Z-00-DX1_3
Saved 1 patches from TCGA-E2-A154-01Z-00-DX1_1


Generating patches:  34%|‚ñà‚ñà‚ñà‚ñç      | 71/209 [00:19<00:21,  6.45it/s]

Saved 0 patches from TCGA-E2-A154-01Z-00-DX1_2
Saved 4 patches from TCGA-E2-A154-01Z-00-DX1_3


Generating patches:  35%|‚ñà‚ñà‚ñà‚ñç      | 73/209 [00:20<00:21,  6.21it/s]

Saved 4 patches from TCGA-E2-A154-01Z-00-DX1_4
Saved 2 patches from TCGA-E2-A154-01Z-00-DX1_5
Saved 0 patches from TCGA-E2-A154-01Z-00-DX1_6


Generating patches:  36%|‚ñà‚ñà‚ñà‚ñå      | 75/209 [00:20<00:15,  8.38it/s]

Saved 0 patches from TCGA-E2-A154-01Z-00-DX1_7
Saved 0 patches from TCGA-E2-A154-01Z-00-DX1_8


Generating patches:  37%|‚ñà‚ñà‚ñà‚ñã      | 77/209 [00:21<00:34,  3.78it/s]

Saved 32 patches from TCGA-E9-A22B-01Z-00-DX1_1


Generating patches:  37%|‚ñà‚ñà‚ñà‚ñã      | 78/209 [00:21<00:34,  3.76it/s]

Saved 6 patches from TCGA-E9-A22B-01Z-00-DX1_2


Generating patches:  38%|‚ñà‚ñà‚ñà‚ñä      | 80/209 [00:21<00:29,  4.32it/s]

Saved 6 patches from TCGA-E9-A22B-01Z-00-DX1_3
Saved 0 patches from TCGA-E9-A22B-01Z-00-DX1_4
Saved 0 patches from TCGA-E9-A22B-01Z-00-DX1_5


Generating patches:  39%|‚ñà‚ñà‚ñà‚ñâ      | 82/209 [00:22<00:21,  5.95it/s]

Saved 1 patches from TCGA-E9-A22B-01Z-00-DX1_6
Saved 0 patches from TCGA-E9-A22B-01Z-00-DX1_7


Generating patches:  40%|‚ñà‚ñà‚ñà‚ñà      | 84/209 [00:22<00:27,  4.51it/s]

Saved 15 patches from TCGA-E9-A22B-01Z-00-DX1_8


Generating patches:  41%|‚ñà‚ñà‚ñà‚ñà      | 85/209 [00:22<00:27,  4.44it/s]

Saved 4 patches from TCGA-E9-A22G-01Z-00-DX1_1


Generating patches:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 87/209 [00:23<00:24,  4.99it/s]

Saved 4 patches from TCGA-E9-A22G-01Z-00-DX1_2
Saved 1 patches from TCGA-E9-A22G-01Z-00-DX1_3
Saved 1 patches from TCGA-E9-A22G-01Z-00-DX1_4


Generating patches:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 89/209 [00:23<00:23,  5.15it/s]

Saved 6 patches from TCGA-E9-A22G-01Z-00-DX1_5


Generating patches:  44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 91/209 [00:24<00:27,  4.29it/s]

Saved 30 patches from TCGA-EJ-5495-01Z-00-DX1-1
Saved 2 patches from TCGA-EJ-5495-01Z-00-DX1-2


Generating patches:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 92/209 [00:24<00:23,  4.98it/s]

Saved 2 patches from TCGA-EJ-5495-01Z-00-DX1-3
Saved 0 patches from TCGA-EJ-5495-01Z-00-DX1-4


Generating patches:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 96/209 [00:24<00:16,  6.81it/s]

Saved 12 patches from TCGA-EJ-5505-01Z-00-DX1-1
Saved 1 patches from TCGA-EJ-5505-01Z-00-DX1-2
Saved 0 patches from TCGA-EJ-5505-01Z-00-DX1-3


Generating patches:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 98/209 [00:25<00:14,  7.78it/s]

Saved 1 patches from TCGA-EJ-5505-01Z-00-DX1-4
Saved 0 patches from TCGA-EJ-5505-01Z-00-DX1-5
Saved 0 patches from TCGA-EJ-5505-01Z-00-DX1-6


Generating patches:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 100/209 [00:25<00:12,  8.78it/s]

Saved 2 patches from TCGA-EJ-5517-01Z-00-DX1-1
Saved 0 patches from TCGA-EJ-5517-01Z-00-DX1-2


Generating patches:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 102/209 [00:25<00:11,  9.00it/s]

Saved 5 patches from TCGA-EJ-5517-01Z-00-DX1-3


Generating patches:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 103/209 [00:26<00:22,  4.74it/s]

Saved 36 patches from TCGA-EJ-5517-01Z-00-DX1-4


Generating patches:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 105/209 [00:26<00:21,  4.92it/s]

Saved 12 patches from TCGA-EV-5903-01Z-00-DX1_1
Saved 3 patches from TCGA-EV-5903-01Z-00-DX1_2
Saved 0 patches from TCGA-EV-5903-01Z-00-DX1_3


Generating patches:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 109/209 [00:26<00:12,  8.32it/s]

Saved 1 patches from TCGA-EV-5903-01Z-00-DX1_4
Saved 0 patches from TCGA-EV-5903-01Z-00-DX1_5
Saved 0 patches from TCGA-EW-A6SD-01Z-00-DX1_1
Saved 0 patches from TCGA-EW-A6SD-01Z-00-DX1_2


Generating patches:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 111/209 [00:27<00:15,  6.24it/s]

Saved 8 patches from TCGA-EW-A6SD-01Z-00-DX1_3


Generating patches:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 113/209 [00:27<00:19,  4.99it/s]

Saved 9 patches from TCGA-EW-A6SD-01Z-00-DX1_4
Saved 2 patches from TCGA-EW-A6SD-01Z-00-DX1_5


Generating patches:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 114/209 [00:28<00:20,  4.68it/s]

Saved 12 patches from TCGA-F9-A97G-01Z-00-DX1_1
Saved 0 patches from TCGA-F9-A97G-01Z-00-DX1_2


Generating patches:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 116/209 [00:28<00:15,  5.92it/s]

Saved 4 patches from TCGA-F9-A97G-01Z-00-DX1_3
Saved 1 patches from TCGA-F9-A97G-01Z-00-DX1_4


Generating patches:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 120/209 [00:28<00:10,  8.50it/s]

Saved 3 patches from TCGA-G7-A8LD-01Z-00-DX1_1
Saved 0 patches from TCGA-G7-A8LD-01Z-00-DX1_10
Saved 0 patches from TCGA-G7-A8LD-01Z-00-DX1_2
Saved 0 patches from TCGA-G7-A8LD-01Z-00-DX1_3


Generating patches:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 122/209 [00:28<00:08, 10.16it/s]

Saved 0 patches from TCGA-G7-A8LD-01Z-00-DX1_4
Saved 3 patches from TCGA-G7-A8LD-01Z-00-DX1_5


Generating patches:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 126/209 [00:29<00:08, 10.04it/s]

Saved 0 patches from TCGA-G7-A8LD-01Z-00-DX1_6
Saved 1 patches from TCGA-G7-A8LD-01Z-00-DX1_7
Saved 2 patches from TCGA-G7-A8LD-01Z-00-DX1_8


Generating patches:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 128/209 [00:29<00:07, 11.06it/s]

Saved 0 patches from TCGA-G7-A8LD-01Z-00-DX1_9
Saved 0 patches from TCGA-G9-6342-01Z-00-DX1-1
Saved 1 patches from TCGA-G9-6342-01Z-00-DX1-2


Generating patches:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 130/209 [00:29<00:08,  9.32it/s]

Saved 8 patches from TCGA-G9-6499-01Z-00-DX1-1
Saved 2 patches from TCGA-G9-6499-01Z-00-DX1-2


Generating patches:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 132/209 [00:29<00:08,  8.90it/s]

Saved 2 patches from TCGA-G9-6499-01Z-00-DX1-3


Generating patches:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 133/209 [00:30<00:11,  6.34it/s]

Saved 20 patches from TCGA-G9-6499-01Z-00-DX1-4
Saved 1 patches from TCGA-G9-6499-01Z-00-DX1-5


Generating patches:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 137/209 [00:30<00:09,  7.53it/s]

Saved 9 patches from TCGA-J4-A67Q-01Z-00-DX1-1
Saved 0 patches from TCGA-J4-A67Q-01Z-00-DX1-2
Saved 2 patches from TCGA-J4-A67Q-01Z-00-DX1-3


Generating patches:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 140/209 [00:30<00:07,  8.65it/s]

Saved 1 patches from TCGA-J4-A67Q-01Z-00-DX1-4
Saved 0 patches from TCGA-J4-A67Q-01Z-00-DX1-5
Saved 2 patches from TCGA-J4-A67T-01Z-00-DX1-1


Generating patches:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 143/209 [00:31<00:07,  9.32it/s]

Saved 12 patches from TCGA-J4-A67T-01Z-00-DX1-2
Saved 0 patches from TCGA-J4-A67T-01Z-00-DX1-3
Saved 0 patches from TCGA-J4-A67T-01Z-00-DX1-4
Saved 1 patches from TCGA-J4-A67T-01Z-00-DX1-5


Generating patches:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 145/209 [00:31<00:06,  9.31it/s]

Saved 4 patches from TCGA-J4-A67T-01Z-00-DX1-6
Saved 4 patches from TCGA-KK-A59X-01Z-00-DX1-1


Generating patches:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 147/209 [00:31<00:07,  8.58it/s]

Saved 2 patches from TCGA-KK-A59X-01Z-00-DX1-2
Saved 0 patches from TCGA-KK-A59X-01Z-00-DX1-3


Generating patches:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 149/209 [00:32<00:07,  8.36it/s]

Saved 6 patches from TCGA-KK-A6E0-01Z-00-DX1-1


Generating patches:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 152/209 [00:32<00:06,  8.44it/s]

Saved 12 patches from TCGA-KK-A6E0-01Z-00-DX1-2
Saved 1 patches from TCGA-KK-A6E0-01Z-00-DX1-3
Saved 0 patches from TCGA-KK-A6E0-01Z-00-DX1-4


Generating patches:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 154/209 [00:32<00:07,  7.51it/s]

Saved 2 patches from TCGA-KK-A6E0-01Z-00-DX1-5
Saved 4 patches from TCGA-KK-A7AW-01Z-00-DX1-1


Generating patches:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 156/209 [00:33<00:08,  5.96it/s]

Saved 6 patches from TCGA-L4-A4E5-01Z-00-DX1_1
Saved 2 patches from TCGA-L4-A4E5-01Z-00-DX1_2


Generating patches:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 158/209 [00:33<00:09,  5.26it/s]

Saved 9 patches from TCGA-L4-A4E5-01Z-00-DX1_3
Saved 1 patches from TCGA-MH-A560-01Z-00-DX2_1


Generating patches:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 161/209 [00:34<00:09,  5.28it/s]

Saved 35 patches from TCGA-MH-A560-01Z-00-DX2_2
Saved 1 patches from TCGA-MH-A560-01Z-00-DX2_3
Saved 0 patches from TCGA-MH-A560-01Z-00-DX2_4


Generating patches:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 162/209 [00:34<00:08,  5.69it/s]

Saved 1 patches from TCGA-MP-A4SY-01Z-00-DX1_1


Generating patches:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 163/209 [00:35<00:12,  3.61it/s]

Saved 16 patches from TCGA-MP-A4SY-01Z-00-DX1_2


Generating patches:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 165/209 [00:35<00:10,  4.15it/s]

Saved 4 patches from TCGA-MP-A4SY-01Z-00-DX1_3
Saved 2 patches from TCGA-MP-A4SY-01Z-00-DX1_4


Generating patches:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 167/209 [00:35<00:07,  5.90it/s]

Saved 0 patches from TCGA-MP-A4T7-01Z-00-DX1_1
Saved 1 patches from TCGA-MP-A4T7-01Z-00-DX1_2


Generating patches:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 168/209 [00:36<00:09,  4.44it/s]

Saved 9 patches from TCGA-MP-A4T7-01Z-00-DX1_3


Generating patches:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 170/209 [00:36<00:07,  4.96it/s]

Saved 3 patches from TCGA-MP-A4T7-01Z-00-DX1_4
Saved 2 patches from TCGA-P4-AAVK-01Z-00-DX1_1


Generating patches:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 171/209 [00:36<00:06,  5.73it/s]

Saved 1 patches from TCGA-P4-AAVK-01Z-00-DX1_2


Generating patches:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 174/209 [00:37<00:06,  5.33it/s]

Saved 30 patches from TCGA-P4-AAVK-01Z-00-DX1_3
Saved 0 patches from TCGA-P4-AAVK-01Z-00-DX1_4
Saved 0 patches from TCGA-P4-AAVK-01Z-00-DX1_5


Generating patches:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 176/209 [00:37<00:04,  6.89it/s]

Saved 0 patches from TCGA-P4-AAVK-01Z-00-DX1_6
Saved 0 patches from TCGA-P4-AAVK-01Z-00-DX1_7


Generating patches:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 177/209 [00:38<00:08,  3.73it/s]

Saved 19 patches from TCGA-S3-AA11-01Z-00-DX1_1


Generating patches:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 178/209 [00:38<00:12,  2.42it/s]

Saved 24 patches from TCGA-S3-AA11-01Z-00-DX1_2


Generating patches:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 179/209 [00:39<00:11,  2.65it/s]

Saved 4 patches from TCGA-S3-AA11-01Z-00-DX1_3


Generating patches:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 181/209 [00:39<00:08,  3.27it/s]

Saved 6 patches from TCGA-S3-AA11-01Z-00-DX1_4
Saved 2 patches from TCGA-S3-AA11-01Z-00-DX1_5


Generating patches:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 183/209 [00:39<00:05,  4.58it/s]

Saved 0 patches from TCGA-S3-AA11-01Z-00-DX1_6
Saved 3 patches from TCGA-SX-A7SR-01Z-00-DX1_1


Generating patches:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 185/209 [00:40<00:04,  5.51it/s]

Saved 3 patches from TCGA-SX-A7SR-01Z-00-DX1_2
Saved 2 patches from TCGA-UZ-A9PO-01Z-00-DX1_1
Saved 0 patches from TCGA-UZ-A9PO-01Z-00-DX1_2


Generating patches:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 188/209 [00:40<00:03,  6.33it/s]

Saved 9 patches from TCGA-UZ-A9PO-01Z-00-DX1_3
Saved 3 patches from TCGA-UZ-A9PO-01Z-00-DX1_4
Saved 0 patches from TCGA-UZ-A9PO-01Z-00-DX1_5


Generating patches:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 192/209 [00:40<00:01,  9.65it/s]

Saved 0 patches from TCGA-UZ-A9PO-01Z-00-DX1_6
Saved 0 patches from TCGA-UZ-A9PO-01Z-00-DX1_7
Saved 1 patches from TCGA-UZ-A9PU-01Z-00-DX1_1


Generating patches:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 194/209 [00:41<00:01,  9.16it/s]

Saved 1 patches from TCGA-UZ-A9PU-01Z-00-DX1_2
Saved 4 patches from TCGA-V1-A8WL-01Z-00-DX1-1


Generating patches:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 196/209 [00:41<00:01,  9.45it/s]

Saved 3 patches from TCGA-V1-A8WL-01Z-00-DX1-2
Saved 0 patches from TCGA-V1-A8WL-01Z-00-DX1-3


Generating patches:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 198/209 [00:41<00:01,  8.14it/s]

Saved 4 patches from TCGA-V1-A9O9-01Z-00-DX1-1
Saved 4 patches from TCGA-V1-A9O9-01Z-00-DX1-2


Generating patches:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 201/209 [00:42<00:00,  8.13it/s]

Saved 9 patches from TCGA-V1-A9O9-01Z-00-DX1-3
Saved 1 patches from TCGA-V1-A9O9-01Z-00-DX1-4
Saved 1 patches from TCGA-V1-A9O9-01Z-00-DX1-5


Generating patches:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 204/209 [00:42<00:00,  9.67it/s]

Saved 1 patches from TCGA-X4-A8KQ-01Z-00-DX8-1
Saved 1 patches from TCGA-X4-A8KQ-01Z-00-DX8-2
Saved 0 patches from TCGA-X4-A8KQ-01Z-00-DX8-3
Saved 16 patches from TCGA-YL-A9WY-01Z-00-DX1-1


Generating patches:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 207/209 [00:42<00:00,  6.73it/s]

Saved 9 patches from TCGA-YL-A9WY-01Z-00-DX1-2
Saved 1 patches from TCGA-YL-A9WY-01Z-00-DX1-3


Generating patches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 209/209 [00:43<00:00,  4.85it/s]

Saved 1 patches from TCGA-YL-A9WY-01Z-00-DX1-4
Saved 0 patches from TCGA-YL-A9WY-01Z-00-DX1-5
‚úÖ Patch generation complete.





<a id="step3"></a>
# Step 3 — Visual Inspection of Generated Patches


In [None]:
!pip install ipywidgets
import ipywidgets as widgets
from IPython.display import display, clear_output
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import cv2
from matplotlib.patches import Patch
import matplotlib

# Colour lookup table: one RGB triple (floats in [0, 1]) per class id, sampled
# at evenly spaced positions along matplotlib's "Set2" colormap.
get_cmap = matplotlib.colormaps.get_cmap
num_classes = len(class_map)
cmap = get_cmap("Set2")
_lut_positions = np.linspace(0, 1, num_classes)
LUT = cmap(_lut_positions)[:, :3]  # keep R, G, B; drop the alpha column

# Every entry of the images folder, in sorted order, feeds the dropdown below.
patches = sorted(os.listdir(PATCH_ROOT / "images"))
print(f"Available patches: {len(patches)}")

def visualize_patch_with_gt(patch_name: str):
    """
    Show one saved patch next to its ground truth.

    Draws three panels (RGB patch, colour-coded type-map crop, RGB patch with
    the CSV centroids drawn on top) followed by a legend of the classes that
    occur in the patch's label file.

    Args:
        patch_name: file name of a patch in ``PATCH_ROOT / "images"``. Its stem
            is parsed as ``<slide_name>_<x>_<y>``, where the last two
            underscore-separated fields are the tile's offsets inside the
            slide-level type map. A falsy value makes the function return
            without drawing anything.
    """
    if not patch_name:
        return
    clear_output(wait=True)
    print(f"Visualizing patch: {patch_name}")

    base_name = Path(patch_name).stem
    img_path  = PATCH_ROOT / "images" / f"{base_name}.png"
    csv_path  = PATCH_ROOT / "labels" / f"{base_name}.csv"

    # Recover slide name and tile offsets from "<slide>_<x>_<y>".
    parts = base_name.split("_")
    slide_name = "_".join(parts[:-2])
    x_off, y_off = map(int, parts[-2:])
    type_path = TYPE_DIR / f"{slide_name}_type.npy"

    if not type_path.exists() or not img_path.exists() or not csv_path.exists():
        print(f"Missing one of: {type_path.name}, {img_path.name}, {csv_path.name}")
        return

    type_map = np.load(type_path)
    type_patch = type_map[y_off:y_off+PATCH_SIZE, x_off:x_off+PATCH_SIZE]
    bgr = cv2.imread(str(img_path))
    if bgr is None:
        # cv2.imread returns None (no exception) for unreadable/corrupt files.
        print(f"Could not read image: {img_path.name}")
        return
    img = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    df = pd.read_csv(csv_path, header=None, names=["x", "y", "class"])

    # Overlay: a black disc with a smaller class-coloured disc per centroid.
    overlay = img.copy()
    for cls_id, grp in df.groupby("class"):
        color = (np.array(LUT[int(cls_id) % num_classes]) * 255).astype(int)
        for _, row in grp.iterrows():
            center = (int(row.x), int(row.y))
            cv2.circle(overlay, center, 4, (0, 0, 0), -1, lineType=cv2.LINE_AA)
            cv2.circle(overlay, center, 2, color.tolist(), -1, lineType=cv2.LINE_AA)

    # Clip ids into the LUT range before colour lookup.
    tm_clipped = np.clip(type_patch, 0, num_classes - 1).astype(int)
    tm_color = (LUT[tm_clipped] * 255).astype(np.uint8)

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    axes[0].imshow(img); axes[0].set_title("H&E Patch"); axes[0].axis("off")
    axes[1].imshow(tm_color); axes[1].set_title("Ground Truth Type Map"); axes[1].axis("off")
    axes[2].imshow(overlay); axes[2].set_title("Overlay (Labels on RGB)"); axes[2].axis("off")
    plt.tight_layout(); plt.show()

    # Legend colours use the same modulo lookup as the overlay dots, and class
    # names fall back to "unknown", so an unexpected class id in a CSV cannot
    # raise IndexError/KeyError here.
    present = sorted(df["class"].unique())
    legend_patches = [
        Patch(
            color=LUT[int(c) % num_classes],
            label=f"{int(c)}: {class_map.get(str(int(c)), 'unknown')}",
        )
        for c in present
    ]
    plt.figure(figsize=(6, 0.6 + 0.3 * len(present)))
    plt.legend(handles=legend_patches, loc="center left",
               bbox_to_anchor=(0, 0.5), frameon=False, title="Cell Types")
    plt.axis("off")
    plt.show()

# --- Interactive viewer ---
# The dropdown's current value is passed to visualize_patch_with_gt as
# `patch_name`; the rendered figures land in the `out` output widget.
dropdown = widgets.Dropdown(description="Select Patch:", options=patches)
out = widgets.interactive_output(visualize_patch_with_gt, {"patch_name": dropdown})

display(dropdown, out)


Collecting ipywidgets
  Downloading ipywidgets-8.1.8-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.15-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.16-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.8-py3-none-any.whl (139 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m139.8/139.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jupyterlab_widgets-3.0.16-py3-none-any.whl (914 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m914.9/914.9 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading widgetsnbextension-4.0.15-py3-none-any.whl (2.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

Matplotlib is building the font cache; this may take a moment.


Available patches: 1106


<a id="step4"></a>
# Step 4 — Dataset Summary & Integrity Checks

In [None]:
from collections import Counter
# Imported here as well so this cell runs even when the Step 3 cell (the only
# other place that imports pyplot) was skipped.
import matplotlib.pyplot as plt
import seaborn as sns

# Summarise every image/label pair: cells per patch, classes present, and
# overall per-class nucleus counts. Results are written to the dataset-level
# _meta folder and plotted.
patch_dir = PATCH_ROOT / "images"
label_dir = PATCH_ROOT / "labels"
meta_dir  = PROC_ROOT / "_meta"
meta_dir.mkdir(exist_ok=True)

records = []
class_counter = Counter()

for csv_file in sorted(label_dir.glob("*.csv")):
    base = csv_file.stem
    img_path = patch_dir / f"{base}.png"
    if not img_path.exists():
        continue  # label without a matching image: not part of the summary

    try:
        df = pd.read_csv(csv_file, header=None, names=["x", "y", "class"])
    except pd.errors.EmptyDataError:
        # A 0-byte label file is recorded as an empty patch instead of
        # aborting the whole summary.
        df = pd.DataFrame(columns=["x", "y", "class"])
    num_cells = len(df)
    class_counts = df["class"].value_counts().to_dict()

    for cls, n in class_counts.items():
        class_counter[int(cls)] += n

    records.append({
        "patch": base,
        "num_cells": num_cells,
        "unique_classes": list(class_counts.keys()),
        "empty": num_cells == 0
    })

# Explicit columns keep the frame (and the CSV header) well-formed even when
# no image/label pair was found.
summary_df = pd.DataFrame(records, columns=["patch", "num_cells", "unique_classes", "empty"])
summary_df.to_csv(meta_dir / "dataset_summary.csv", index=False)
print(f"Saved summary: {meta_dir / 'dataset_summary.csv'}")

# --- Overview stats ---
total_patches = len(summary_df)
empty_patches = int(summary_df["empty"].sum())
print(f"Total patches: {total_patches}")
if total_patches:
    print(f"Empty patches: {empty_patches} ({empty_patches/total_patches:.1%})")
else:
    # Avoid a division by zero when nothing has been generated yet.
    print("Empty patches: 0 (no image/label pairs found)")

# --- Class distribution ---
class_df = pd.DataFrame.from_dict(class_counter, orient="index", columns=["count"])
class_df["label"] = class_df.index.map(lambda i: class_map.get(str(i), "unknown"))
class_df = class_df.sort_index()

if total_patches:
    plt.figure(figsize=(7,4))
    sns.barplot(x="label", y="count", data=class_df, palette="tab10")
    plt.title("Cell Counts per Class")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # --- Cells per patch distribution ---
    plt.figure(figsize=(6,4))
    sns.histplot(summary_df["num_cells"], bins=40, kde=False)
    plt.title("Distribution of Cell Counts per Patch")
    plt.xlabel("Number of cells per patch")
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.show()


<a id="step5"></a>
# Step 5 — Dataset Integrity + Overview

In [None]:
from collections import Counter
import pandas as pd
import numpy as np

# Step 5 cell: verify image/label pairing, scan every label CSV for class
# ids, report patch counts per TCGA id, and write class totals to
# _meta/final_summary.csv. Relies on PATCH_ROOT, PROC_ROOT and class_map
# from earlier cells.

print("üîç Checking dataset integrity...")

images_dir = PATCH_ROOT / "images"
labels_dir = PATCH_ROOT / "labels"

image_files = sorted(f.stem for f in images_dir.glob("*.png"))
label_files = sorted(f.stem for f in labels_dir.glob("*.csv"))

# 1) File pairing
unmatched_images = set(image_files) - set(label_files)
unmatched_labels = set(label_files) - set(image_files)

print(f"Total images: {len(image_files)}")
print(f"Total labels: {len(label_files)}")
if unmatched_images:
    print(f"‚ö†Ô∏è {len(unmatched_images)} images without labels.")
if unmatched_labels:
    print(f"‚ö†Ô∏è {len(unmatched_labels)} labels without images.")

# 2) Validate CSV content (every label file is read)
empty_labels = []
invalid_labels = []
class_counter = Counter()
for csv_path in sorted(labels_dir.glob("*.csv")):
    try:
        # ndmin=2 keeps a single-row file as shape (1, n_cols); without it
        # loadtxt returns a 1-D array and the one nucleus would be dropped
        # and the file misreported as empty.
        data = np.loadtxt(csv_path, delimiter=",", ndmin=2)
        if data.size == 0:
            empty_labels.append(csv_path.name)
            continue
        # 3rd column = class id
        cls = data[:, 2].astype(int)
    except (OSError, ValueError, IndexError) as err:
        # Unreadable, non-numeric, or fewer than three columns: report it
        # separately so real corruption is not hidden among empty patches.
        invalid_labels.append(csv_path.name)
        print(f"Could not parse {csv_path.name}: {err}")
        continue
    class_counter.update(cls.tolist())

print(f"Empty label files: {len(empty_labels)}")
if empty_labels:
    print("Example empty files:", empty_labels[:3])
if invalid_labels:
    print(f"Invalid label files: {len(invalid_labels)}")
    print("Example invalid files:", invalid_labels[:3])

# 3) Patch count per patient
# Ensure file stems are strings
file_series = pd.Series([str(f) for f in image_files], dtype="object")

patient_counts = (
    file_series
    .str.extract(r"(TCGA-[0-9A-Z-]+)")
    .dropna()[0]
    .value_counts()
)

print("\nPatch count per patient (top 5):")
print(patient_counts.head())


# 4) Class totals
# Counts are nuclei (one CSV row per nucleus centroid), not pixels.
print("\nClass cell count summary:")
for cid, count in sorted(class_counter.items()):
    name = class_map.get(str(cid), f"id_{cid}")
    print(f"  {cid} ({name}): {count}")

# 5) Save summary CSV
summary_path = PROC_ROOT / "_meta" / "final_summary.csv"
# Create the folder so this cell also works when Step 4 was not run.
summary_path.parent.mkdir(parents=True, exist_ok=True)
(
    pd.DataFrame.from_dict(class_counter, orient="index", columns=["count"])
    .sort_index()
    .to_csv(summary_path)
)
print(f"\n‚úÖ Final summary saved to: {summary_path}")


<a id="step6"></a>
# Step 6: Generate 256x256 patch-level instance and type maps

In [None]:
from pathlib import Path
import json
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

# Step 6 cell: tile each whole-slide instance/type map into non-overlapping
# PATCH_SIZE x PATCH_SIZE crops and save one .npy per non-background tile.
# The config is repeated here so the cell can run without the earlier cells.

print("\nCropping 256x256 instance and type maps...")

# --- Config ---
PROJECT_ROOT = Path("/projectnb/ec500kb/projects/Fall_2025_Projects/Project_2/AI-guided-whole-slide-imaging-analysis")
PROC_ROOT = PROJECT_ROOT / "ProcessedDataset/v1_40x_area20"
PATCH_SIZE = 256
TASK_NAME = f"patches_cellvit_p{PATCH_SIZE}"

# --- Input directories ---
RAW_IMG_ROOT = PROJECT_ROOT / "Datasets/BU_course_data/TrainingData/TrainingImages_and_Annotations"
INST_DIR = PROC_ROOT / "whole_slide/inst_maps"
TYPE_DIR = PROC_ROOT / "whole_slide/type_maps"

# --- Output directories ---
PATCH_ROOT = PROC_ROOT / TASK_NAME

patch_map_dir = PATCH_ROOT / "patch_label_maps"
# parents=True so the cell also works when PATCH_ROOT has not been created yet.
patch_map_dir.mkdir(parents=True, exist_ok=True)

INST_SUFFIX = "_inst"
slide_summaries = []

# Sorted so slides are processed (and summarized) in a reproducible order.
for inst_path in sorted(INST_DIR.glob(f"*{INST_SUFFIX}.npy")):
    # Strip only the trailing "_inst"; the glob guarantees the stem ends with it.
    slide_id = inst_path.stem[: -len(INST_SUFFIX)]
    type_path = TYPE_DIR / f"{slide_id}_type.npy"
    if not type_path.exists():
        print(f"‚ö†Ô∏è Missing type map for {slide_id}")
        continue

    inst_map = np.load(inst_path)
    type_map = np.load(type_path)
    if inst_map.shape != type_map.shape:
        # Mismatched maps would yield type patches that are misaligned or
        # smaller than PATCH_SIZE, so skip the slide rather than save them.
        print(
            f"Shape mismatch for {slide_id}: "
            f"inst {inst_map.shape} vs type {type_map.shape}; skipping"
        )
        continue

    h, w = inst_map.shape
    saved = 0

    # Stop at the last full tile; partial tiles on the right/bottom edges
    # are dropped.
    for y in range(0, h - PATCH_SIZE + 1, PATCH_SIZE):
        for x in range(0, w - PATCH_SIZE + 1, PATCH_SIZE):
            patch_type = type_map[y:y+PATCH_SIZE, x:x+PATCH_SIZE]
            # Skip tiles whose type map is entirely 0.
            if not patch_type.any():
                continue
            patch_inst = inst_map[y:y+PATCH_SIZE, x:x+PATCH_SIZE]
            # The dict is stored as a pickled object array, so readers must
            # use np.load(..., allow_pickle=True).item().
            np.save(
                patch_map_dir / f"{slide_id}_{x}_{y}.npy",
                {"inst_map": patch_inst, "type_map": patch_type}
            )
            saved += 1

    slide_summaries.append({"slide": slide_id, "patch_maps": saved})
    print(f"{slide_id}: saved {saved} patch label maps")


In [None]:
# --- Summary ---
# Writes the per-slide patch-map counts collected by the cropping cell
# (slide_summaries) and shows one random patch as a visual sanity check.
import random

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

summary_csv = PROC_ROOT / "_meta" / "patch_label_summary.csv"
# Create the folder so this cell does not depend on Step 4 having run.
summary_csv.parent.mkdir(parents=True, exist_ok=True)
summary_df = pd.DataFrame(slide_summaries)
summary_df.to_csv(summary_csv, index=False)
print(f"\n‚úÖ Patch-level maps saved to: {patch_map_dir}")
print("Summary written to: _meta/patch_label_summary.csv")

# --- Quick visual sanity check ---
# Sorted so a seeded `random` picks the same sample across runs.
candidates = sorted(patch_map_dir.glob("*.npy"))

if not candidates:
    # random.choice raises IndexError on an empty sequence.
    print(f"No patch label maps found in {patch_map_dir}; skipping visual check.")
else:
    # pick a random patch label map
    sample = random.choice(candidates)
    # allow_pickle is required because each file stores a pickled dict;
    # only load files produced by this notebook.
    data = np.load(sample, allow_pickle=True).item()
    inst, typ = data["inst_map"], data["type_map"]

    # infer corresponding RGB image file
    # e.g. slide_id_x_y.npy -> slide_id_x_y.png
    img_name = sample.stem + ".png"
    img_path = PATCH_ROOT / "images" / img_name

    rgb = None
    if img_path.exists():
        bgr = cv2.imread(str(img_path))
        if bgr is None:
            # cv2.imread returns None instead of raising when decoding fails.
            print(f"Could not decode RGB image {img_name}")
        else:
            rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    else:
        print(f"‚ö†Ô∏è No matching RGB image found for {img_name}")

    plt.figure(figsize=(15,5))
    # The first panel is left blank when no RGB patch is available.
    if rgb is not None:
        plt.subplot(1,3,1); plt.imshow(rgb); plt.title("Original RGB"); plt.axis("off")

    plt.subplot(1,3,2); plt.imshow(inst, cmap="nipy_spectral"); plt.title("Instance Map"); plt.axis("off")
    plt.subplot(1,3,3); plt.imshow(typ, cmap="tab10"); plt.title("Type Map"); plt.axis("off")
    plt.suptitle(sample.stem)
    plt.tight_layout()
    plt.show()
