# Dataset Visualization — Explore Processed Whole-Slide Data

**Purpose:**  
This notebook visualizes the processed outputs generated in  
`02_generate_processed_dataset.ipynb`.  
It allows interactive exploration of whole-slide images, cell-type maps,  
and overlays for validation and quality control.

---

### What you can do here
1. Load instance (`inst_map`) and type (`type_map`) masks.
2. Visualize them alongside the original H&E images.
3. Inspect color-coded overlays to confirm annotation accuracy.
4. Interactively switch between slides using a dropdown.

---

### Folder Dependencies
| Folder | Description |
|:--|:--|
| `/ProcessedDataset/v1_40x_area20/whole_slide/inst_maps` | Instance label maps |
| `/ProcessedDataset/v1_40x_area20/whole_slide/type_maps` | Cell-type maps |
| `/Datasets/BU_course_data/TrainingData/TrainingImages_and_Annotations` | Original images (.svs or .tif), one subfolder per patient |
| `/ProcessedDataset/v1_40x_area20/_meta/class_map.json` | Cell-type ID → name mapping |

---

### Table of Contents
- [Step 1 — Setup and Imports](#step1)
- [Step 2 — Load Class Map and Discover Available Slides](#step2)
- [Step 3 — Interactive Visualization](#step3)

---

<a id="step1"></a>
## Step 1 — Setup and Imports

In [1]:
# Paths, imports, and basic checks

from pathlib import Path
import json
import numpy as np
import matplotlib.pyplot as plt

# Optional dependencies (handled gracefully if missing)
try:
    from openslide import OpenSlide
except Exception as e:
    OpenSlide = None
    print("OpenSlide not available:", e)

# --- Project paths ---
# Everything below is derived from PROJECT_ROOT; edit this one line to relocate the notebook.
PROJECT_ROOT = Path("/projectnb/ec500kb/projects/Fall_2025_Projects/Project_2/AI-guided-whole-slide-imaging-analysis")

RAW_IMG_ROOT = PROJECT_ROOT / "Datasets/BU_course_data/TrainingData/TrainingImages_and_Annotations"
PROC_ROOT    = PROJECT_ROOT / "ProcessedDataset/v1_40x_area20"
TYPE_DIR     = PROC_ROOT / "whole_slide/type_maps"
INST_DIR     = PROC_ROOT / "whole_slide/inst_maps"
META_DIR     = PROC_ROOT / "_meta"

# --- Sanity checks ---
# `!s` turns the bool into "True"/"False" before padding. A bare `:5` format
# spec falls back to int formatting for bools and prints 1/0 instead.
for p in [RAW_IMG_ROOT, PROC_ROOT, TYPE_DIR, INST_DIR, META_DIR]:
    print(f"Exists: {p.exists()!s:5}  →  {p}")

# Matplotlib defaults: figure DPI 120, axes grid switched off
plt.rcParams.update({"figure.dpi": 120, "axes.grid": False})


Exists:     1  →  /projectnb/ec500kb/projects/Fall_2025_Projects/Project_2/AI-guided-whole-slide-imaging-analysis/Datasets/BU_course_data/TrainingData/TrainingImages_and_Annotations
Exists:     1  →  /projectnb/ec500kb/projects/Fall_2025_Projects/Project_2/AI-guided-whole-slide-imaging-analysis/ProcessedDataset/v1_40x_area20
Exists:     1  →  /projectnb/ec500kb/projects/Fall_2025_Projects/Project_2/AI-guided-whole-slide-imaging-analysis/ProcessedDataset/v1_40x_area20/whole_slide/type_maps
Exists:     1  →  /projectnb/ec500kb/projects/Fall_2025_Projects/Project_2/AI-guided-whole-slide-imaging-analysis/ProcessedDataset/v1_40x_area20/whole_slide/inst_maps
Exists:     1  →  /projectnb/ec500kb/projects/Fall_2025_Projects/Project_2/AI-guided-whole-slide-imaging-analysis/ProcessedDataset/v1_40x_area20/_meta


<a id="step2"></a>

## Step 2 — Load Class Map and Discover Available Slides


In [2]:
# Load class map and list processed slides

# Load class_map.json produced in 02_generate_processed_dataset
with open(META_DIR / "class_map.json", "r", encoding="utf-8") as f:
    class_map = json.load(f)

# JSON object keys are always strings, so build an int-keyed view plus the
# reverse (name -> id) lookup.
id_to_name = {int(k): v for k, v in class_map.items()}
name_to_id = {v: k for k, v in id_to_name.items()}

print("Class map (id → name):", id_to_name)

# Collect slide basenames from type_maps.
# removesuffix strips only the trailing "_type"; str.replace would also delete
# any "_type" that happens to occur earlier in a file name.
slide_names = sorted({p.stem.removesuffix("_type") for p in TYPE_DIR.glob("*_type.npy")})
print(f"Slides available for visualization: {len(slide_names)}")
if slide_names:
    print("Sample:", slide_names[:5])

# Simple utility: build a color look-up table for categorical masks
def categorical_colormap(n_classes: int):
    """
    Returns an (n_classes+1, 3) float array in [0,1] for indexing by class id.
    Index 0 is background. Uses tab10 when n_classes <= 10, tab20 otherwise,
    resampled to n_classes + 1 entries.
    """
    # plt.cm.get_cmap is deprecated (it emits the warning captured in this
    # cell's output) and is removed in newer matplotlib releases;
    # plt.get_cmap accepts the same (name, lut) arguments.
    palette = "tab10" if n_classes <= 10 else "tab20"
    cmap = plt.get_cmap(palette, n_classes + 1)
    lut = cmap(np.arange(n_classes + 1))[:, :3]  # drop alpha
    return lut

# Size the LUT by the largest class id declared in class_map
max_cls_id = max(id_to_name)
LUT = categorical_colormap(max_cls_id)

print(f"Prepared color LUT for {max_cls_id} classes (plus background).")


Class map (id → name): {0: 'background', 1: 'epithelial', 2: 'lymphocyte', 3: 'macrophage', 4: 'neutrophil'}
Slides available for visualization: 209
Sample: ['TCGA-55-1594-01Z-00-DX1_001', 'TCGA-55-1594-01Z-00-DX1_002', 'TCGA-55-1594-01Z-00-DX1_003', 'TCGA-55-1594-01Z-00-DX1_004', 'TCGA-55-1594-01Z-00-DX1_005']
Prepared color LUT for 4 classes (plus background).


  cmap = plt.cm.get_cmap("tab10", n_classes + 1)


<a id="step3"></a>

## Step 3 — Interactive Visualization (RGB • type_map • overlay)


In [3]:
from IPython.display import display
import ipywidgets as widgets
import numpy as np
import matplotlib.pyplot as plt
import cv2
from matplotlib.patches import Patch

# --- LUT sampled from a qualitative matplotlib palette ---
# NOTE: this redefinition replaces the tab10/tab20 helper of the same name
# defined in Step 2.
def categorical_colormap(n_classes: int):
    """
    Build the color look-up table that is indexed by class id.

    Samples n_classes + 1 evenly spaced points in [0, 1] from matplotlib's
    'Set2' palette (when n_classes <= 8) or 'tab20' (otherwise) and returns
    them as an (n_classes+1, 3) float RGB array in [0,1], alpha dropped.
    """
    import matplotlib

    palette_name = "tab20" if n_classes > 8 else "Set2"
    palette = matplotlib.colormaps.get_cmap(palette_name)
    sample_points = np.linspace(0, 1, n_classes + 1)
    rgba = palette(sample_points)
    return rgba[:, :3]

# Rebuild LUT with the categorical_colormap defined in this cell
max_cls_id = max(id_to_name)
LUT = categorical_colormap(max_cls_id)

def find_image_path(slide_name: str) -> Path | None:
    """
    Locate the corresponding .svs (preferred) or .tif for a slide basename
    by searching patient subfolders under RAW_IMG_ROOT.
    """
    for patient_dir in sorted(RAW_IMG_ROOT.iterdir()):
        if not patient_dir.is_dir():
            continue
        svs_candidate = patient_dir / f"{slide_name}.svs"
        if svs_candidate.exists():
            return svs_candidate
        tif_candidate = patient_dir / f"{slide_name}.tif"
        if tif_candidate.exists():
            return tif_candidate
    return None

def load_rgb_image(img_path: Path) -> np.ndarray:
    """
    Read `img_path` into an RGB array.

    Files with a `.svs` suffix (case-insensitive) are read with OpenSlide: the
    full level-0 region is fetched and only the first three channels are kept.
    Any other suffix is read with cv2 and converted from BGR to RGB.

    Raises RuntimeError when a .svs is requested but OpenSlide is unavailable,
    or when cv2 fails to read the file.
    """
    is_svs = img_path.suffix.lower() == ".svs"

    if not is_svs:
        # cv2 path: .tif, and anything else that is not .svs
        bgr = cv2.imread(str(img_path), cv2.IMREAD_COLOR)
        if bgr is None:
            raise RuntimeError(f"Failed to read image: {img_path}")
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    if OpenSlide is None:
        raise RuntimeError("OpenSlide is required to read .svs files.")
    with OpenSlide(str(img_path)) as slide:
        width, height = slide.level_dimensions[0]
        region = slide.read_region((0, 0), 0, (width, height))
        rgba = np.array(region)
    # drop the 4th channel of the region array
    return rgba[:, :, :3]

def visualize_slide(slide_name: str, alpha: float = 0.45):
    """
    Show the H&E image, the type map, and a blended overlay for one slide,
    followed by a legend of the classes present in the type map.

    Parameters
    ----------
    slide_name : str
        Basename shared by `<slide_name>_type.npy` in TYPE_DIR,
        `<slide_name>_inst.npy` in INST_DIR and the raw image file.
    alpha : float
        Blend weight of the class color on annotated pixels:
        overlay = (1 - alpha) * image + alpha * color.

    Prints a message and returns early when either map file or the raw image
    is missing, or when the type map and the image have different sizes.
    """
    from matplotlib.colors import ListedColormap

    type_path = TYPE_DIR / f"{slide_name}_type.npy"
    inst_path = INST_DIR / f"{slide_name}_inst.npy"

    if not type_path.exists() or not inst_path.exists():
        print(f"Missing maps for {slide_name}")
        return

    # Only the type map is drawn. The instance map must exist (checked above)
    # but is not read, since nothing below uses it.
    type_map = np.load(type_path)

    # Load original image
    img_path = find_image_path(slide_name)
    if img_path is None:
        print(f"No .svs/.tif found for {slide_name}")
        return
    rgb = load_rgb_image(img_path)

    # The boolean mask below indexes the image directly, so both must share H x W.
    if type_map.shape != rgb.shape[:2]:
        print(
            f"Shape mismatch for {slide_name}: "
            f"type_map {type_map.shape} vs image {rgb.shape[:2]}"
        )
        return

    n_colors = LUT.shape[0]

    # Annotated (non-background) pixels
    mask = type_map > 0

    # Build overlay from LUT; ids outside the LUT are clipped to its range
    tm_clipped = np.clip(type_map, 0, n_colors - 1).astype(int)
    color_mask = LUT[tm_clipped]  # fancy indexing returns a copy, LUT is untouched
    color_mask[~mask] = 0.0

    # Strengthen color visibility (gamma < 1 brightens values in [0, 1])
    color_mask = np.power(color_mask, 0.7)
    overlay = rgb / 255.0
    overlay[mask] = (1 - alpha) * overlay[mask] + alpha * color_mask[mask]

    # --- Plot ---
    fig, axes = plt.subplots(1, 3, figsize=(16, 6))
    axes[0].imshow(rgb)
    axes[0].set_title("H&E (RGB)")
    axes[0].axis("off")

    # Color the panel straight from LUT so it always matches the legend, whichever
    # palette LUT was sampled from. Half-integer limits centre each id in its
    # color bin, and nearest-neighbour rendering avoids blending neighbouring
    # class ids into ids that are not in the data.
    axes[1].imshow(
        type_map,
        cmap=ListedColormap(LUT),
        vmin=-0.5,
        vmax=n_colors - 0.5,
        interpolation="nearest",
    )
    axes[1].set_title("Type Map (IDs)")
    axes[1].axis("off")

    axes[2].imshow(overlay)
    axes[2].set_title("Overlay (No Background)")
    axes[2].axis("off")

    plt.tight_layout()
    plt.show()

    # --- Legend (only classes present) ---
    # np.unique returns sorted values; cast to plain ints so they can index LUT
    # and key id_to_name regardless of the array dtype.
    present = [int(i) for i in np.unique(type_map[mask])]
    legend_patches = [
        Patch(
            color=LUT[min(i, n_colors - 1)],  # same clipping as the overlay
            label=f"{i}: {id_to_name.get(i, 'unknown')}",
        )
        for i in present
    ]
    plt.figure(figsize=(6, 0.6 + 0.3 * len(present)))
    plt.legend(
        handles=legend_patches,
        title="Cell Types",
        loc="center left",
        bbox_to_anchor=(0, 0.5),
        frameon=False,
    )
    plt.axis("off")
    plt.show()

    # .get keeps this in line with the legend: ids missing from class_map are
    # reported as 'unknown' instead of raising KeyError.
    print("Classes present:", ", ".join(id_to_name.get(i, "unknown") for i in present))

# UI: a slide picker wired to visualize_slide; its output is captured in `out`
dropdown = widgets.Dropdown(
    options=slide_names,
    description="Slide:",
    layout=widgets.Layout(width="70%"),
)
out = widgets.interactive_output(visualize_slide, dict(slide_name=dropdown))
ui = widgets.VBox(children=[dropdown])
display(ui, out)


VBox(children=(Dropdown(description='Slide:', layout=Layout(width='70%'), options=('TCGA-55-1594-01Z-00-DX1_00…

Output()