# Create and save object embeddings

### Create and save an embedding for each building detected in urban-area images from satellite and drone

In [1]:
import json
from typing import List

import cv2
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from ultralytics import YOLO

Creating new Ultralytics Settings v0.0.6 file  
View Ultralytics Settings with 'yolo settings' or at 'C:\Users\radiu\AppData\Roaming\Ultralytics\settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [2]:
# import sys
# import types
#
# # Patch for missing C3k2 class
# import ultralytics.nn.modules.block
#
# if not hasattr(ultralytics.nn.modules.block, 'C3k2'):
#     class C3k2:
#         pass  # Dummy class, replace with actual code if available
#     ultralytics.nn.modules.block.C3k2 = C3k2

In [3]:
class FeatureHook:
    """
    Forward-hook wrapper that remembers the latest output of a single layer.

    After every forward pass of ``module`` the layer output is copied
    (detached from the autograd graph) into ``self.feature_map``.
    Call ``remove()`` to unregister the hook.
    """
    def __init__(self, module: nn.Module):
        # Nothing captured until the wrapped module runs at least once.
        self.feature_map = None
        self.module = module
        self.hook = module.register_forward_hook(self.hook_fn)

    def hook_fn(self, module, input, output):
        """Store a detached copy of the layer output (typically (B, C, Hf, Wf))."""
        self.feature_map = output.detach().clone()

    def remove(self):
        """Unregister the forward hook from the wrapped module."""
        self.hook.remove()

def setup_yolo_with_hooks(
    model_path: str,
    layer_indices: List[int]
):
    """
    Load a YOLO model from ``model_path`` and attach one FeatureHook per
    requested layer index.

    The indices address the direct children of ``yolo_model.model.model``.

    Returns:
      (yolo_model, hooks) where hooks[i] watches the layer at layer_indices[i].

    Raises:
      ValueError: if an index is negative or beyond the last child module.
    """
    yolo_model = YOLO(model_path)

    # Materialise the child modules so they can be addressed by position.
    layers = list(yolo_model.model.model.children())
    n_layers = len(layers)

    hooks = []
    for idx in layer_indices:
        if not 0 <= idx < n_layers:
            raise ValueError(f"layer_index {idx} out of range [0..{n_layers-1}]")
        hooks.append(FeatureHook(layers[idx]))

    return yolo_model, hooks

###########################################
# 2. Aggregation & Region Mask
###########################################

def aggregate_feature_region(
    feature_map: torch.Tensor,
    region_mask: torch.Tensor,
    aggregation_type: str = "avg"
) -> torch.Tensor:
    """
    Pools feature-map values where region_mask is non-zero.

    feature_map: shape (C, Hf, Wf)
    region_mask: shape (Hf, Wf) => binary
    aggregation_type: 'avg', 'sum', 'max', 'avg+max' or 'flatten'

    Returns:
      1D tensor of length C ('avg', 'sum', 'max'), 2*C ('avg+max') or
      C*Npixels ('flatten').
      If the region selects no pixels, a zero vector is returned with the
      same length the chosen aggregation would normally produce (C for
      'flatten'), using the dtype and device of feature_map so embeddings
      of different instances can still be stacked.

    Raises:
      ValueError: if aggregation_type is not one of the supported names
      (checked up-front, so it is reported even for an empty region).
    """
    if aggregation_type not in ("avg", "sum", "max", "avg+max", "flatten"):
        raise ValueError(f"Unknown aggregation_type: {aggregation_type}")

    selected_values = feature_map[:, region_mask.bool()]  # shape: (C, Npixels)

    if selected_values.numel() == 0:
        # No pixels => zero vector whose length matches the non-empty case
        channels = feature_map.shape[0]
        out_dim = 2 * channels if aggregation_type == "avg+max" else channels
        return feature_map.new_zeros(out_dim)

    if aggregation_type == "avg":
        return selected_values.mean(dim=1)  # shape (C,)
    if aggregation_type == "sum":
        return selected_values.sum(dim=1)  # shape (C,)
    if aggregation_type == "max":
        return selected_values.max(dim=1)[0]  # shape (C,)
    if aggregation_type == "avg+max":
        avg_vals = selected_values.mean(dim=1)
        max_vals = selected_values.max(dim=1)[0]
        return torch.cat([avg_vals, max_vals], dim=0)  # shape (2*C,)
    # "flatten": length depends on the region size, potentially large
    return selected_values.reshape(-1)

def get_feature_map_region_mask(
    feature_map_size: tuple,
    box_or_mask: np.ndarray,
    downsample_ratio: float,
    method: str = "bbox"
) -> torch.Tensor:
    """
    Build a binary (Hf, Wf) mask in feature-map coordinates.

    method='bbox': box_or_mask is [xmin, ymin, xmax, ymax]; each coordinate is
                   divided by downsample_ratio, rounded outwards (floor for the
                   minimum corner, ceil for the maximum corner) and clamped to
                   the feature map. Both end indices are included.
    method='mask': box_or_mask is a 2D mask; it is thresholded at 0.5 and
                   resized to (Hf, Wf) with nearest-neighbour interpolation.
                   downsample_ratio is not used in this mode.

    Raises ValueError for any other method.
    """
    feat_h, feat_w = feature_map_size
    mask_out = torch.zeros((feat_h, feat_w), dtype=torch.uint8)

    if method == "mask":
        # Binarise first so the resize works on a clean 0/1 image.
        binary = (box_or_mask > 0.5).astype(np.uint8)
        shrunk = cv2.resize(binary, (feat_w, feat_h), interpolation=cv2.INTER_NEAREST)
        return torch.from_numpy(shrunk)

    if method != "bbox":
        raise ValueError("method must be 'bbox' or 'mask'")

    left, top, right, bottom = box_or_mask

    # Scale to feature-map cells and clamp to the valid index range.
    col_start = max(int(np.floor(left / downsample_ratio)), 0)
    row_start = max(int(np.floor(top / downsample_ratio)), 0)
    col_end = min(int(np.ceil(right / downsample_ratio)), feat_w - 1)
    row_end = min(int(np.ceil(bottom / downsample_ratio)), feat_h - 1)

    # A box lying entirely outside the map leaves the mask empty.
    if col_start <= col_end and row_start <= row_end:
        mask_out[row_start:row_end + 1, col_start:col_end + 1] = 1

    return mask_out

###########################################
# 3. Extract Multi-Layer Embeddings
###########################################

def extract_multi_layer_embeddings(
    model: YOLO,
    hooks: List[FeatureHook],
    image_path: str,
    conf_threshold: float = 0.25,
    building_class_name: str = 'building',
    layer_downsample_ratios: List[float] = None,
    method: str = 'bbox',
    aggregation_type: str = 'max'
):
    """
    Compute one multi-layer embedding per detected building in an image.

    Steps:
      1. Read the image and run model.predict on the same path.
      2. For every detection whose class name equals building_class_name
         (case-insensitive), take the feature map captured by each hook
         (first batch element), build a region mask for it and aggregate.
      3. Concatenate the per-layer vectors (moved to CPU) into one embedding.

    When method == 'mask' but the result has no masks, the bounding box is
    used to build the region instead.

    Returns:
      embeddings: list of 1D torch.Tensor, one per kept detection
      boxes_out:  list of [xmin, ymin, xmax, ymax]
      masks_out:  list of 2D numpy masks (None when the box was used)
      confs_out:  list of detection confidences
      orig_img:   image array as returned by cv2.imread

    Raises:
      FileNotFoundError: if the image cannot be read.
      ValueError: if fewer downsample ratios than hooks are supplied.
      RuntimeError: if a hook holds no feature map when it is needed.
    """
    orig_img = cv2.imread(image_path)
    if orig_img is None:
        raise FileNotFoundError(f"Could not load image: {image_path}")

    result = model.predict(source=image_path, conf=conf_threshold)[0]

    # Without explicit ratios every hooked layer is assumed to have stride 8.
    if layer_downsample_ratios is None:
        layer_downsample_ratios = [8.0]*len(hooks)
    elif len(layer_downsample_ratios) < len(hooks):
        raise ValueError("layer_downsample_ratios must have >= len(hooks) elements.")

    names = model.names
    embeddings, boxes_out, masks_out, confs_out = [], [], [], []

    detections = result.boxes
    if detections is None or len(detections) == 0:
        return embeddings, boxes_out, masks_out, confs_out, orig_img

    wanted_class = building_class_name.lower()
    # Instance masks are only usable if requested and actually produced.
    use_mask = (method == 'mask') and (result.masks is not None)

    for det_idx in range(len(detections)):
        box = detections.xyxy[det_idx].cpu().numpy().tolist()  # [xmin, ymin, xmax, ymax]
        cls_id = int(detections.cls[det_idx].item())
        conf = detections.conf[det_idx].item()

        if names[cls_id].lower() != wanted_class:
            continue

        if use_mask:
            instance_mask = result.masks.data[det_idx].cpu().numpy()
            region_source, region_method = instance_mask, 'mask'
        else:
            instance_mask = None
            region_source, region_method = box, 'bbox'

        per_layer = []
        for hook, ds_ratio in zip(hooks, layer_downsample_ratios):
            captured = hook.feature_map  # shape: (B, C, Hf, Wf)
            if captured is None:
                raise RuntimeError("Hook did not capture feature map.")

            feat_map = captured[0]  # (C, Hf, Wf)
            region_mask = get_feature_map_region_mask(
                feat_map.shape[-2:], region_source,
                ds_ratio, method=region_method
            )
            layer_vec = aggregate_feature_region(
                feature_map=feat_map,
                region_mask=region_mask,
                aggregation_type=aggregation_type
            )
            per_layer.append(layer_vec.cpu())

        # One flat vector per building: all layer vectors back to back.
        embeddings.append(torch.cat(per_layer, dim=0))
        boxes_out.append(box)
        masks_out.append(instance_mask)
        confs_out.append(conf)

    return embeddings, boxes_out, masks_out, confs_out, orig_img

In [4]:
# Run configuration; it is stored next to the embeddings when they are saved.
config = {
    "yolo_model": "default_100_epochs",
    "label": "urban",
    "conf_threshold": 0.3,
    "method": "mask",  # 'mask' or 'bbox'
    "aggregation_type": "sum"  # 'avg', 'sum', 'max', 'avg+max', 'flatten'
}

yolo_model_path = "model/best.pt"

# all possible from backbone
layer_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# constants for yolov11n-seg we used here
layer_strides = [2.0, 4.0, 4.0, 8.0, 8.0, 16.0, 16.0, 32.0, 32.0, 32.0, 32.0]

model, hooks = setup_yolo_with_hooks(yolo_model_path, layer_indices)

images_folder = "data/vpair/reference_views"
# images_folder = "data/vpair/queries"

all_embeddings = []
info = []

labels = pd.read_csv("data/vpair/poses_with_labels.csv")

for image_name, label in zip(labels["filename"], labels["label"]):
    # Filter by the label recorded in config so the saved config.json
    # always describes the data that was actually processed.
    if label != config["label"]:
        continue

    image_path = f"{images_folder}/{image_name}"

    embeddings, boxes, masks, confs, _ = extract_multi_layer_embeddings(
        model=model,
        hooks=hooks,
        image_path=image_path,
        conf_threshold=config["conf_threshold"],
        building_class_name='building',
        layer_downsample_ratios=layer_strides,
        method=config["method"],
        aggregation_type=config["aggregation_type"]
    )

    # Embeddings are already on the CPU when they are returned.
    all_embeddings += embeddings

    # One metadata row per embedding, in the same order.
    for box, conf in zip(boxes, confs):
        box_0, box_1, box_2, box_3 = box
        info.append((image_name, box_0, box_1, box_2, box_3, conf))

if not all_embeddings:
    raise RuntimeError(
        f"No building embeddings were extracted for label {config['label']!r}."
    )

# NOTE: stacking requires equally long vectors, so 'flatten' (whose length
# depends on the region size) cannot be stacked this way.
all_embeddings = torch.stack(all_embeddings, dim=0).numpy()
info = pd.DataFrame(info, columns=["image", "box_0", "box_1", "box_2", "box_3", "conf"])


image 1/1 C:\Courses\uav-landmarks-embeddings\data\vpair\reference_views\00001.png: 480x640 34 Buildings, 170.3ms
Speed: 5.0ms preprocess, 170.3ms inference, 39.0ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 C:\Courses\uav-landmarks-embeddings\data\vpair\reference_views\00002.png: 480x640 17 Buildings, 96.0ms
Speed: 3.0ms preprocess, 96.0ms inference, 12.0ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 C:\Courses\uav-landmarks-embeddings\data\vpair\reference_views\00003.png: 480x640 6 Buildings, 98.0ms
Speed: 2.0ms preprocess, 98.0ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 C:\Courses\uav-landmarks-embeddings\data\vpair\reference_views\00004.png: 480x640 5 Buildings, 105.0ms
Speed: 1.0ms preprocess, 105.0ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 C:\Courses\uav-landmarks-embeddings\data\vpair\reference_views\00005.png: 480x640 12 Buildings, 104.0ms
Speed: 2.0ms preprocess, 104.0ms infe

In [5]:
# Shape is (number of building embeddings, concatenated embedding width).
# NOTE(review): the trailing "34898" presumably records the row count of an
# earlier run -- confirm or remove.
all_embeddings.shape # 34898

(37576, 1584)

In [6]:
# Show the per-detection metadata table: image name, box corners, confidence.
info

Unnamed: 0,image,box_0,box_1,box_2,box_3,conf
0,00001.png,589.633484,386.412323,662.674072,440.570709,0.745103
1,00001.png,422.776611,351.447144,456.424286,382.864746,0.724470
2,00001.png,273.926453,564.957031,331.723938,599.664673,0.695141
3,00001.png,223.289062,527.000000,275.310120,582.890259,0.650830
4,00001.png,295.007172,523.585632,332.749054,570.764709,0.644671
...,...,...,...,...,...,...
37571,02631.png,0.014535,435.243530,15.283167,468.564911,0.424236
37572,02631.png,61.241413,286.977386,99.191132,327.133087,0.419084
37573,02631.png,46.578140,390.407471,85.338745,429.637756,0.387998
37574,02631.png,105.920616,171.687119,186.769272,243.115799,0.328832


In [13]:
import os

# Pick the output folder for the data source that was processed above.
# save_folder = f"data/landmarks_drone_{config['aggregation_type']}_urban"
save_folder = f"data/landmarks_satellite_{config['aggregation_type']}_urban"

# Create the folder first; otherwise the writes below fail on a fresh checkout.
os.makedirs(save_folder, exist_ok=True)

# Run configuration, so the embeddings can be traced back to their settings.
with open(f"{save_folder}/config.json", "w") as file:
    json.dump(config, file)

# Embedding matrix: one row per detected building.
np.save(f"{save_folder}/embeddings.npy", all_embeddings)

# Metadata rows, in the same order as the embedding rows.
info.to_csv(f"{save_folder}/info.csv", index=False)

### Test loading saved embeddings

In [14]:
import numpy as np


# Number of embedding columns contributed by each layer index (sums to 1584).
LAYER_TO_DIM = {
    0: 16, 1: 32,
    2: 64, 3: 64,
    4: 128, 5: 128, 6: 128,
    7: 256, 8: 256, 9: 256, 10: 256,
}


def get_embeddings_for_layers(embeddings: np.ndarray, layers: list, layer_to_dim: dict = LAYER_TO_DIM) -> np.ndarray:
    """
    Slice out the columns that belong to the requested layers.

    The concatenated embedding is assumed to hold the layers in ascending
    layer-number order, each occupying ``layer_to_dim[layer]`` columns.

    Parameters:
    - embeddings (np.ndarray): Array of shape (num_samples, total_dim).
    - layers (list): Layer numbers to keep; the output follows this order.
    - layer_to_dim (dict): Maps layer number -> number of columns.

    Returns:
    - np.ndarray: Array holding only the columns of the requested layers.

    Raises:
    - ValueError: If a requested layer is missing from layer_to_dim.
    """
    unknown = [lyr for lyr in layers if lyr not in layer_to_dim]
    if unknown:
        raise ValueError(f"Layers {unknown} are not present in the layer_to_dim mapping.")

    # First column of every layer inside the concatenated vector.
    first_col = {}
    offset = 0
    for lyr in sorted(layer_to_dim):
        first_col[lyr] = offset
        offset += layer_to_dim[lyr]

    # Flat list of column indices, grouped in the caller's layer order.
    column_ids = [
        col
        for lyr in layers
        for col in range(first_col[lyr], first_col[lyr] + layer_to_dim[lyr])
    ]

    return embeddings[:, column_ids]

In [15]:
# Load a previously saved embedding matrix (drone images, 'max' aggregation folder).
embeddings = np.load("data/landmarks_drone_max_urban/embeddings.npy")

# Keep only the columns of layers 4, 6, 8 and 10
# (128 + 128 + 256 + 256 = 768 columns with the default LAYER_TO_DIM).
test = get_embeddings_for_layers(embeddings, layers=[4, 6, 8, 10])

# Make PCA and t-SNE plots

In [19]:
"""
Utility script to:
1) Iterate over three embedding variants (sum / avg / max) and 11 single-layer
   slices (layer 0 … layer 10);
2) For every (embedding variant, layer) pair, run Isolation Forest outlier
   detection (500 trees, 1 % contamination), save cluster labels + metadata +
   PCA / t-SNE coordinates in exactly the same directory structure used by
   landmark_ui_reserve.py;
3) Provide an interactive Plotly visualiser that can draw an arbitrary set of
   those runs together in a single dashboard, with user-controlled layout
   (row/column), figure size, embedding variants to include, and plot types
   (PCA, t-SNE, or both).

"""

# ----------------------------------------------------------------------
# ORIGINAL HELPERS (verbatim copy -- do NOT modify anything here)
# ----------------------------------------------------------------------
from typing import List, Optional, Union
import os, json, logging, numpy as np, pandas as pd
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from typing import List, Tuple

# Configure root logging: INFO level, timestamped "[LEVEL] message" lines.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Number of embedding columns per layer index; used to locate each layer's
# slice inside the concatenated embedding vector (values sum to 1584).
LAYER_TO_DIM = {
    0: 16, 1: 32, 2: 64, 3: 64, 4: 128,
    5: 128, 6: 128, 7: 256, 8: 256, 9: 256, 10: 256
}

def ensure_folder(path):
    """
    Create the directory ``path`` (including missing parents) if needed.

    Uses ``exist_ok=True`` instead of a separate existence check, so there is
    no window in which another process can create the folder between the
    check and the creation. Does nothing when the directory already exists.
    """
    os.makedirs(path, exist_ok=True)

def get_embeddings_for_layers(embeddings: np.ndarray, layers: List[int],
                              layer_to_dim: dict = LAYER_TO_DIM) -> np.ndarray:
    """
    Return only the columns of ``embeddings`` that belong to ``layers``.

    Layers are laid out in ascending layer-number order inside the
    concatenated vector, each taking ``layer_to_dim[layer]`` columns.
    The output keeps the order given in ``layers``.
    A layer missing from ``layer_to_dim`` raises KeyError.
    """
    # First column of every layer inside the concatenated vector.
    first_col = {}
    running = 0
    for layer_id in sorted(layer_to_dim):
        first_col[layer_id] = running
        running += layer_to_dim[layer_id]

    picked = [
        col
        for layer_id in layers
        for col in range(first_col[layer_id], first_col[layer_id] + layer_to_dim[layer_id])
    ]
    return embeddings[:, picked]

def generate_experiment_folder(conf_threshold, layer_list, algo_name, algo_params):
    """
    Build the folder name that identifies one experiment run.

    Format: conf-<threshold>_layers-<l1_l2_...|all>_<algo>_<k1-v1_k2-v2_...>
    with the algorithm parameters sorted by key.
    """
    layer_str = "all"
    if layer_list:
        layer_str = "_".join(str(layer) for layer in layer_list)

    param_parts = [f"{k}-{v}" for k, v in sorted(algo_params.items())]
    params_str = "_".join(param_parts)

    return f"conf-{conf_threshold}_layers-{layer_str}_{algo_name}_{params_str}"

def generate_embedding_config_folder(conf_threshold, layer_list):
    """
    Build the cache-folder name for one (confidence threshold, layers) pair.

    Format: conf-<threshold>_layers-<l1_l2_...>, or "all" when no layers
    are given.
    """
    if layer_list:
        layer_str = "_".join(str(layer) for layer in layer_list)
    else:
        layer_str = "all"
    return f"conf-{conf_threshold}_layers-{layer_str}"

# ----------------------------------------------------------------------
# NEW CONSTANTS for this automation run
# ----------------------------------------------------------------------
# Detections with a confidence below this value are dropped before analysis.
CONF_THRESHOLD           = 0.50          # same default as UI slider
# Root folder for experiment results, keyed by aggregation variant.
# Note that the "max" variant uses the un-suffixed folder name.
RESULTS_ROOTS            = {              # match previous folder names
    "sum": "data/landmarks_sum_ui_results",
    "avg": "data/landmarks_avg_ui_results",
    "max": "data/landmarks_ui_results"
}
# (embeddings .npy path, metadata .csv path) per aggregation variant.
EMB_VARIANT_PATHS        = {              # location of .npy & .csv for each type
    "sum": ("data/landmarks_satellite_sum_urban/embeddings.npy",
            "data/landmarks_satellite_sum_urban/info.csv"),
    "avg": ("data/landmarks_satellite_avg_urban/embeddings.npy",
            "data/landmarks_satellite_avg_urban/info.csv"),
    "max": ("data/landmarks_satellite_max_urban/embeddings.npy",
            "data/landmarks_satellite_max_urban/info.csv"),
}
# Root folder for cached PCA / t-SNE coordinates, keyed by aggregation variant.
EMBEDDING_COORDS_ROOTS   = {              # cache folders per variant
    "sum": "data/embedding_sum_coords_cache",
    "avg": "data/embedding_avg_coords_cache",
    "max": "data/embedding_coords_cache"
}

# ----------------------------------------------------------------------
# STEP 1 & 2 – batch processing
# ----------------------------------------------------------------------
from sklearn.ensemble import IsolationForest

def batch_process_single_layer_outliers():
    """
    Run Isolation Forest outlier detection on every single-layer slice of
    every embedding variant and store the results on disk.

    For each variant in EMB_VARIANT_PATHS the embeddings and metadata are
    filtered to rows with conf >= CONF_THRESHOLD. Then, for layers 0..10:
      * labels (0 = inlier, -1 = outlier), the filtered metadata and a
        params.json are written to the run folder under RESULTS_ROOTS;
      * 2D PCA and t-SNE coordinates are written to the cache folder under
        EMBEDDING_COORDS_ROOTS unless the file already exists.
    """
    algo_name = "IsolationForest"
    algo_params = {"n_estimators":500,"contamination":0.01}

    for variant, (npy_path, csv_path) in EMB_VARIANT_PATHS.items():
        all_vectors = np.load(npy_path)
        all_meta = pd.read_csv(csv_path)

        # Keep only sufficiently confident detections (same rows in both).
        keep = all_meta["conf"] >= CONF_THRESHOLD
        embeddings = all_vectors[keep.values]
        df_meta = all_meta[keep].reset_index(drop=True)

        for layer in range(11):
            X = get_embeddings_for_layers(embeddings, [layer])

            forest = IsolationForest(random_state=42, **algo_params)
            # fit_predict gives +1 for inliers; remap to 0 = inlier, -1 = outlier.
            labels = np.where(forest.fit_predict(X) == 1, 0, -1)

            run_name = generate_experiment_folder(
                CONF_THRESHOLD, [layer], algo_name, algo_params
            )
            run_root = os.path.join(RESULTS_ROOTS[variant], run_name)
            ensure_folder(run_root)

            np.save(os.path.join(run_root, "cluster_labels.npy"), labels)
            df_meta.to_csv(os.path.join(run_root, "filtered_metadata.csv"), index=False)
            run_info = {
                "algo_name": algo_name,
                "params": algo_params,
                "confidence_threshold": CONF_THRESHOLD,
                "layers": [layer],
            }
            with open(os.path.join(run_root, "params.json"), "w") as fp:
                json.dump(run_info, fp, indent=2)

            # -------------- PCA / t-SNE (cached) ----------------------
            coords_path = os.path.join(
                EMBEDDING_COORDS_ROOTS[variant],
                generate_embedding_config_folder(CONF_THRESHOLD, [layer]),
            )
            ensure_folder(coords_path)

            for coords_name, reducer_cls in (("pca_coords.npy", PCA), ("tsne_coords.npy", TSNE)):
                target = os.path.join(coords_path, coords_name)
                if os.path.exists(target):
                    continue  # already cached by an earlier run
                reducer = reducer_cls(n_components=2, random_state=42)
                np.save(target, reducer.fit_transform(X))

            logging.info(f"[{variant} | layer {layer}] saved in {run_root}")

# ----------------------------------------------------------------------
# STEP 3 – multi-run visualiser
# ----------------------------------------------------------------------
from itertools import product
import plotly.subplots as psub
import plotly.express as px

def load_coords(variant: str, layer: int, plot_type: str) -> np.ndarray:
    """
    Load the cached 2D coordinates of one single-layer slice.

    Parameters
    ----------
    variant   : key into EMBEDDING_COORDS_ROOTS ("sum", "avg" or "max")
    layer     : layer number of the single-layer slice
    plot_type : "pca" loads pca_coords.npy; any other value loads
                tsne_coords.npy

    Returns
    -------
    The array stored in the selected .npy file (a single array, not a tuple).
    """
    emb_cfg_folder = generate_embedding_config_folder(CONF_THRESHOLD, [layer])
    coords_root    = EMBEDDING_COORDS_ROOTS[variant]
    file_name      = "pca_coords.npy" if plot_type == "pca" else "tsne_coords.npy"
    return np.load(os.path.join(coords_root, emb_cfg_folder, file_name))

def visualise_runs_column(
    embedding_type:   str,
    plot_type:        str,
    experiment_names: List[str],
    row_titles:       Optional[List[str]] = None,
    sampling_value:   Union[str, float]   = "all",
    vertical_spacing: float             = 0.01,
    horizontal_spacing: float           = 0.0,
    subplot_width:    Optional[int]       = None,
    subplot_height:   Optional[int]       = None,
    title_font_size: int = 24,
    row_title_font_size: int = 14,
    title: Optional[str] = None
) -> go.Figure:
    """
    Draw vertically–stacked PCA or t-SNE plots with:
      • adjustable subplot spacing
      • custom sampling of non-outliers
      • custom subplot width & height

    Parameters
    ----------
    embedding_type   : {"sum","avg","max"}
    plot_type        : {"pca","tsne"}
    experiment_names : list of run-folder names under RESULTS_ROOTS[embedding_type]
    row_titles       : optional list of subplot titles (same length)
    sampling_value   : "all" or fraction (0<frac<1) to subsample inliers
    vertical_spacing : fraction of vertical gap between rows
    horizontal_spacing: fraction of horizontal gap (unused for one column)
    subplot_width    : figure width in px (default: 1000 for PCA, 800 for t-SNE)
    subplot_height   : height in px of each row; the figure height is this
                       value times the number of runs
                       (default: 700 for PCA, 600 for t-SNE)
    title_font_size  : font size of the figure title
    row_title_font_size : font size of the per-row subplot titles
    title            : optional text appended to the figure title; nothing
                       is appended when it is None or empty

    Returns
    -------
    The assembled plotly Figure. Runs whose label / metadata / params files
    or cached coordinates are missing are logged and left as empty rows.
    """
    assert embedding_type in RESULTS_ROOTS, "embedding_type must be 'sum','avg','max'"
    assert plot_type in ("pca", "tsne"),     "plot_type must be 'pca' or 'tsne'"

    # choose defaults if not provided
    if subplot_width  is None:
        subplot_width  = 1000 if plot_type == "pca" else 800
    if subplot_height is None:
        subplot_height = 700  if plot_type == "pca" else 600

    root_results = RESULTS_ROOTS[embedding_type]
    coords_root  = EMBEDDING_COORDS_ROOTS[embedding_type]
    coord_file   = "pca_coords.npy" if plot_type == "pca" else "tsne_coords.npy"

    titles = row_titles if row_titles is not None else experiment_names
    n_rows = len(experiment_names)

    fig = psub.make_subplots(
        rows=n_rows, cols=1,
        shared_xaxes=False,
        shared_yaxes=False,
        vertical_spacing=vertical_spacing,
        horizontal_spacing=horizontal_spacing,
        subplot_titles=titles
    )

    palette = px.colors.qualitative.Plotly

    for r, run_name in enumerate(experiment_names, start=1):
        run_path = os.path.join(root_results, run_name)
        lbls_pth = os.path.join(run_path, "cluster_labels.npy")
        meta_pth = os.path.join(run_path, "filtered_metadata.csv")
        prm_pth  = os.path.join(run_path, "params.json")

        # A run is only drawn when all three result files are present.
        if not (os.path.exists(lbls_pth) and os.path.exists(meta_pth) and os.path.exists(prm_pth)):
            logging.warning("Skipping run %s: result files are missing in %s", run_name, run_path)
            continue

        labels = np.load(lbls_pth)

        # The cached coordinates are located via the run's own parameters.
        with open(prm_pth, "r") as fp:
            cfg = json.load(fp)
        conf_thr, layers = cfg["confidence_threshold"], cfg["layers"]
        emb_cfg_folder = generate_embedding_config_folder(conf_thr, layers)
        coords_path    = os.path.join(coords_root, emb_cfg_folder, coord_file)
        if not os.path.exists(coords_path):
            logging.warning("Skipping run %s: no cached coordinates at %s", run_name, coords_path)
            continue
        coords = np.load(coords_path)

        # assemble DataFrame for sampling
        df_plot = pd.DataFrame({
            "label":     labels,
            "x":         coords[:, 0],
            "y":         coords[:, 1],
            "orig_index": np.arange(len(labels))
        })

        # sampling of non-outliers (outliers, label -1, are always kept)
        if sampling_value != "all":
            frac = float(sampling_value)
            is_out = df_plot["label"] == -1
            df_out = df_plot[is_out]
            df_in  = df_plot[~is_out]
            if frac < 1.0:
                df_in = df_in.sample(frac=frac, random_state=42)
            df_plot = pd.concat([df_in, df_out], axis=0).reset_index(drop=True)

        # determine label order & colors; outliers go last so they are drawn on top
        unique_labels = df_plot["label"].unique().tolist()
        unique_labels = [lbl for lbl in unique_labels if lbl != -1] + [-1]
        color_map = {cid: palette[i % len(palette)] for i, cid in enumerate(unique_labels) if cid != -1}
        color_map[-1] = "orange"

        # plot per cluster
        for cid in unique_labels:
            subset = df_plot[df_plot["label"] == cid]
            trace_kwargs = dict(
                x=subset["x"],
                y=subset["y"],
                mode="markers",
                text=subset["orig_index"].astype(str),
                showlegend=False
            )
            # Outliers get larger markers than regular points.
            trace_kwargs["marker"] = dict(
                size=10 if cid == -1 else 6,
                color=color_map[cid],
                line=dict(width=1, color="black")
            )

            fig.add_trace(go.Scatter(**trace_kwargs), row=r, col=1)

        # hide axes
        fig.update_xaxes(visible=False, row=r, col=1)
        fig.update_yaxes(visible=False, row=r, col=1)

    # Append the optional title text only when given, so the default
    # (None) no longer shows up as the literal word "None".
    title_suffix = f" {title}" if title else ""

    # final layout
    fig.update_layout(
        height = subplot_height * n_rows,
        width  = subplot_width,
        title  = {
            "text": f"{plot_type.upper()} | Aggregation {embedding_type.upper()}{title_suffix}",
            "x": 0.5,
            "xanchor": "center",
            "font": {"size": title_font_size}
        },
        hovermode="closest",
    )
    for anno in fig.layout.annotations:
        # subplot_titles from make_subplots are all annotations
        anno.font.size = row_title_font_size

    return fig


In [42]:
"""
Utility script to:
1) Iterate over three embedding variants (sum / avg / max) and 11 single-layer
   slices (layer 0 … layer 10);
2) For every (embedding variant, layer) pair, run Isolation Forest outlier
   detection (500 trees, 1 % contamination), save cluster labels + metadata +
   PCA / t-SNE coordinates in exactly the same directory structure used by
   landmark_ui_reserve.py;
3) Provide an interactive Plotly visualiser that can draw an arbitrary set of
   those runs together in a single dashboard, with user-controlled layout
   (row/column), figure size, embedding variants to include, and plot types
   (PCA, t-SNE, or both).

"""

# ----------------------------------------------------------------------
# ORIGINAL HELPERS (verbatim copy -- do NOT modify anything here)
# ----------------------------------------------------------------------
from typing import List, Optional, Union
import os, json, logging, numpy as np, pandas as pd
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from typing import List, Tuple

# Configure root logging: INFO level, timestamped "[LEVEL] message" lines.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Number of embedding columns per layer index; used to locate each layer's
# slice inside the concatenated embedding vector (values sum to 1584).
LAYER_TO_DIM = {
    0: 16, 1: 32, 2: 64, 3: 64, 4: 128,
    5: 128, 6: 128, 7: 256, 8: 256, 9: 256, 10: 256
}

def ensure_folder(path):
    """
    Create the directory ``path`` (including missing parents) if needed.

    Uses ``exist_ok=True`` instead of a separate existence check, so there is
    no window in which another process can create the folder between the
    check and the creation. Does nothing when the directory already exists.
    """
    os.makedirs(path, exist_ok=True)

def get_embeddings_for_layers(embeddings: np.ndarray, layers: List[int],
                              layer_to_dim: dict = LAYER_TO_DIM) -> np.ndarray:
    """
    Return only the columns of ``embeddings`` that belong to ``layers``.

    Layers are laid out in ascending layer-number order inside the
    concatenated vector, each taking ``layer_to_dim[layer]`` columns.
    The output keeps the order given in ``layers``.
    A layer missing from ``layer_to_dim`` raises KeyError.
    """
    # First column of every layer inside the concatenated vector.
    first_col = {}
    running = 0
    for layer_id in sorted(layer_to_dim):
        first_col[layer_id] = running
        running += layer_to_dim[layer_id]

    picked = [
        col
        for layer_id in layers
        for col in range(first_col[layer_id], first_col[layer_id] + layer_to_dim[layer_id])
    ]
    return embeddings[:, picked]

def generate_experiment_folder(conf_threshold, layer_list, algo_name, algo_params):
    """
    Build the folder name that identifies one experiment run.

    Format: conf-<threshold>_layers-<l1_l2_...|all>_<algo>_<k1-v1_k2-v2_...>
    with the algorithm parameters sorted by key.
    """
    layer_str = "all"
    if layer_list:
        layer_str = "_".join(str(layer) for layer in layer_list)

    param_parts = [f"{k}-{v}" for k, v in sorted(algo_params.items())]
    params_str = "_".join(param_parts)

    return f"conf-{conf_threshold}_layers-{layer_str}_{algo_name}_{params_str}"

def generate_embedding_config_folder(conf_threshold, layer_list):
    """
    Build the cache-folder name for one (confidence threshold, layers) pair.

    Format: conf-<threshold>_layers-<l1_l2_...>, or "all" when no layers
    are given.
    """
    if layer_list:
        layer_str = "_".join(str(layer) for layer in layer_list)
    else:
        layer_str = "all"
    return f"conf-{conf_threshold}_layers-{layer_str}"

# ----------------------------------------------------------------------
# NEW CONSTANTS for this automation run
# ----------------------------------------------------------------------
# Detections with a confidence below this value are dropped before analysis.
CONF_THRESHOLD           = 0.50          # same default as UI slider
# Root folder for experiment results, keyed by aggregation variant.
# Note that the "max" variant uses the un-suffixed folder name.
RESULTS_ROOTS            = {              # match previous folder names
    "sum": "data/landmarks_sum_ui_results",
    "avg": "data/landmarks_avg_ui_results",
    "max": "data/landmarks_ui_results"
}
# (embeddings .npy path, metadata .csv path) per aggregation variant.
EMB_VARIANT_PATHS        = {              # location of .npy & .csv for each type
    "sum": ("data/landmarks_satellite_sum_urban/embeddings.npy",
            "data/landmarks_satellite_sum_urban/info.csv"),
    "avg": ("data/landmarks_satellite_avg_urban/embeddings.npy",
            "data/landmarks_satellite_avg_urban/info.csv"),
    "max": ("data/landmarks_satellite_max_urban/embeddings.npy",
            "data/landmarks_satellite_max_urban/info.csv"),
}
# Root folder for cached PCA / t-SNE coordinates, keyed by aggregation variant.
EMBEDDING_COORDS_ROOTS   = {              # cache folders per variant
    "sum": "data/embedding_sum_coords_cache",
    "avg": "data/embedding_avg_coords_cache",
    "max": "data/embedding_coords_cache"
}

# ----------------------------------------------------------------------
# STEP 1 & 2 – batch processing
# ----------------------------------------------------------------------
from sklearn.ensemble import IsolationForest

def batch_process_single_layer_outliers():
    """
    Run Isolation Forest outlier detection on every single-layer slice of
    every embedding variant and store the results on disk.

    For each variant in EMB_VARIANT_PATHS the embeddings and metadata are
    filtered to rows with conf >= CONF_THRESHOLD. Then, for layers 0..10:
      * labels (0 = inlier, -1 = outlier), the filtered metadata and a
        params.json are written to the run folder under RESULTS_ROOTS;
      * 2D PCA and t-SNE coordinates are written to the cache folder under
        EMBEDDING_COORDS_ROOTS unless the file already exists.
    """
    algo_name = "IsolationForest"
    algo_params = {"n_estimators":500,"contamination":0.01}

    for variant, (npy_path, csv_path) in EMB_VARIANT_PATHS.items():
        all_vectors = np.load(npy_path)
        all_meta = pd.read_csv(csv_path)

        # Keep only sufficiently confident detections (same rows in both).
        keep = all_meta["conf"] >= CONF_THRESHOLD
        embeddings = all_vectors[keep.values]
        df_meta = all_meta[keep].reset_index(drop=True)

        for layer in range(11):
            X = get_embeddings_for_layers(embeddings, [layer])

            forest = IsolationForest(random_state=42, **algo_params)
            # fit_predict gives +1 for inliers; remap to 0 = inlier, -1 = outlier.
            labels = np.where(forest.fit_predict(X) == 1, 0, -1)

            run_name = generate_experiment_folder(
                CONF_THRESHOLD, [layer], algo_name, algo_params
            )
            run_root = os.path.join(RESULTS_ROOTS[variant], run_name)
            ensure_folder(run_root)

            np.save(os.path.join(run_root, "cluster_labels.npy"), labels)
            df_meta.to_csv(os.path.join(run_root, "filtered_metadata.csv"), index=False)
            run_info = {
                "algo_name": algo_name,
                "params": algo_params,
                "confidence_threshold": CONF_THRESHOLD,
                "layers": [layer],
            }
            with open(os.path.join(run_root, "params.json"), "w") as fp:
                json.dump(run_info, fp, indent=2)

            # -------------- PCA / t-SNE (cached) ----------------------
            coords_path = os.path.join(
                EMBEDDING_COORDS_ROOTS[variant],
                generate_embedding_config_folder(CONF_THRESHOLD, [layer]),
            )
            ensure_folder(coords_path)

            for coords_name, reducer_cls in (("pca_coords.npy", PCA), ("tsne_coords.npy", TSNE)):
                target = os.path.join(coords_path, coords_name)
                if os.path.exists(target):
                    continue  # already cached by an earlier run
                reducer = reducer_cls(n_components=2, random_state=42)
                np.save(target, reducer.fit_transform(X))

            logging.info(f"[{variant} | layer {layer}] saved in {run_root}")

# ----------------------------------------------------------------------
# STEP 3 – multi-run visualiser
# ----------------------------------------------------------------------
from itertools import product
import plotly.subplots as psub
import plotly.express as px

def load_coords(variant: str, layer: int, plot_type: str) -> np.ndarray:
    """
    Load the cached projection coordinates for one embedding variant and layer.

    The file is read from
    ``EMBEDDING_COORDS_ROOTS[variant] / <embedding config folder> / <file>``,
    where the folder name comes from ``generate_embedding_config_folder``
    called with ``CONF_THRESHOLD`` and the single-element list ``[layer]``.

    Parameters
    ----------
    variant   : key into EMBEDDING_COORDS_ROOTS
    layer     : layer index used to build the embedding-config folder name
    plot_type : "pca" selects ``pca_coords.npy``; any other value selects
                ``tsne_coords.npy``

    Returns
    -------
    The single array stored in the selected ``.npy`` file. (The previous
    annotation advertised a 2-tuple of arrays, which never matched what
    ``np.load`` returns here.)

    Raises
    ------
    Whatever ``np.load`` raises when the file is absent or unreadable; this
    function does not check for existence itself.
    """
    emb_cfg_folder = generate_embedding_config_folder(CONF_THRESHOLD, [layer])
    coords_root    = EMBEDDING_COORDS_ROOTS[variant]
    # Anything that is not exactly "pca" falls through to the t-SNE file.
    file_name      = "pca_coords.npy" if plot_type == "pca" else "tsne_coords.npy"
    return np.load(os.path.join(coords_root, emb_cfg_folder, file_name))

# ======================================================================
# Column visualiser: stacked PCA / t-SNE subplots, one row per run
# ======================================================================
def visualise_runs_column(
    embedding_type:   str,
    plot_type:        str,
    experiment_names: List[str],
    row_titles:       Optional[List[str]] = None,
    sampling_value:   Union[str, float]   = "all",
    vertical_spacing: float             = 0.01,
    horizontal_spacing: float           = 0.0,
    subplot_width:    Optional[int]       = None,
    subplot_height:   Optional[int]       = None,
    title_font_size: int = 24,
    row_title_font_size: int = 14,
    title: Optional[str] = None,
    save_png: bool = False,
    png_name: Optional[str] = None
) -> go.Figure:
    """
    Draw vertically stacked PCA or t-SNE scatter plots, one row per run.

    For every run folder the saved labels and ``params.json`` are loaded, the
    cached 2-D coordinates matching the run's confidence threshold and layers
    are looked up, and the points are drawn coloured by label. Points labelled
    ``-1`` (outliers) are drawn larger and in orange.

    A run is skipped, with a logged warning, when any of its three artefacts
    (``cluster_labels.npy``, ``filtered_metadata.csv``, ``params.json``) or
    its cached coordinate file is missing. The corresponding subplot row is
    then left empty.

    Parameters
    ----------
    embedding_type   : key of RESULTS_ROOTS / EMBEDDING_COORDS_ROOTS
                       ({"sum","avg","max"})
    plot_type        : {"pca","tsne"}
    experiment_names : list of run-folder names under RESULTS_ROOTS[embedding_type]
    row_titles       : optional list of subplot titles (same length);
                       defaults to ``experiment_names``
    sampling_value   : "all" to plot every point, otherwise a value converted
                       with ``float``; fractions below 1.0 subsample the
                       non-outlier points (seeded, random_state=42) while all
                       outliers are kept; values >= 1.0 keep every point
    vertical_spacing : fraction of vertical gap between rows
    horizontal_spacing: fraction of horizontal gap (unused for one column)
    subplot_width    : width in px of the figure (default: 1000 for PCA, 800 for t-SNE)
    subplot_height   : height in px for each subplot row (default: 700 for PCA, 600 for t-SNE)
    title_font_size  : font size of the main figure title
    row_title_font_size : font size applied to every figure annotation
                       (the subplot titles)
    title            : optional text appended to the main title after " | "
    save_png         : If True, save the figure to a PNG file.
    png_name         : Path to save the PNG file. If None, a default name is
                       generated from embedding_type, plot_type and the
                       alphanumeric characters of ``title``.

    Returns
    -------
    The assembled figure (also returned when it was written to disk).

    Raises
    ------
    AssertionError if ``embedding_type`` is not a key of RESULTS_ROOTS or
    ``plot_type`` is not "pca"/"tsne".
    """
    assert embedding_type in RESULTS_ROOTS, "embedding_type must be 'sum','avg','max'"
    assert plot_type in ("pca", "tsne"),     "plot_type must be 'pca' or 'tsne'"

    is_pca = plot_type == "pca"

    # choose defaults if not provided
    if subplot_width is None:
        subplot_width = 1000 if is_pca else 800
    if subplot_height is None:
        subplot_height = 700 if is_pca else 600

    root_results = RESULTS_ROOTS[embedding_type]
    coords_root  = EMBEDDING_COORDS_ROOTS[embedding_type]
    coord_file   = "pca_coords.npy" if is_pca else "tsne_coords.npy"

    titles = row_titles if row_titles is not None else experiment_names
    n_rows = len(experiment_names)

    fig = psub.make_subplots(
        rows=n_rows, cols=1,
        shared_xaxes=False,
        shared_yaxes=False,
        vertical_spacing=vertical_spacing,
        horizontal_spacing=horizontal_spacing,
        subplot_titles=titles
    )

    palette = px.colors.qualitative.Plotly

    for r, run_name in enumerate(experiment_names, start=1):
        run_path = os.path.join(root_results, run_name)
        lbls_pth = os.path.join(run_path, "cluster_labels.npy")
        meta_pth = os.path.join(run_path, "filtered_metadata.csv")
        prm_pth  = os.path.join(run_path, "params.json")

        # The metadata CSV is not read here, but a run without it is still
        # treated as incomplete so the set of plotted runs is unchanged.
        missing = [p for p in (lbls_pth, meta_pth, prm_pth) if not os.path.exists(p)]
        if missing:
            logging.warning(f"[{run_name}] skipped: missing run file(s) {missing}")
            continue

        labels = np.load(lbls_pth)

        # The coordinate cache is keyed by the run's own threshold and layers,
        # not by the run folder name.
        with open(prm_pth, "r") as fp:
            cfg = json.load(fp)
        conf_thr, layers = cfg["confidence_threshold"], cfg["layers"]
        emb_cfg_folder = generate_embedding_config_folder(conf_thr, layers)
        coords_path    = os.path.join(coords_root, emb_cfg_folder, coord_file)
        if not os.path.exists(coords_path):
            logging.warning(f"[{run_name}] skipped: missing coordinates file {coords_path}")
            continue
        coords = np.load(coords_path)

        # Best effort on a length mismatch: truncate both arrays to the
        # common prefix so the row can still be drawn.
        if len(labels) != coords.shape[0]:
            logging.warning(
                f"[{run_name}] Mismatch in array lengths. "
                f"Labels: {len(labels)}, Coords: {coords.shape[0]}. "
                f"Slicing to the smaller size to proceed with visualization."
            )
            min_len = min(len(labels), coords.shape[0])
            labels = labels[:min_len]
            coords = coords[:min_len, :]

        df_plot = pd.DataFrame({
            "label":     labels,
            "x":         coords[:, 0],
            "y":         coords[:, 1],
            # position in the (possibly truncated) arrays, assigned before
            # sampling so hover text refers to the unsampled row
            "orig_index": np.arange(len(labels))
        })

        # sampling of non-outliers; outliers are always kept
        if sampling_value != "all":
            frac = float(sampling_value)
            is_out = df_plot["label"] == -1
            df_out = df_plot[is_out]
            df_in  = df_plot[~is_out]
            if frac < 1.0:
                df_in = df_in.sample(frac=frac, random_state=42)
            df_plot = pd.concat([df_in, df_out], axis=0).reset_index(drop=True)

        # Non-outlier labels take palette colours in order of appearance;
        # the outlier label is always drawn last, in orange.
        cluster_ids = [lbl for lbl in df_plot["label"].unique().tolist() if lbl != -1]
        color_map = {cid: palette[i % len(palette)] for i, cid in enumerate(cluster_ids)}
        color_map[-1] = "orange"

        # one trace per label (an outlier trace is added even when empty)
        for cid in cluster_ids + [-1]:
            subset = df_plot[df_plot["label"] == cid]
            fig.add_trace(
                go.Scatter(
                    x=subset["x"],
                    y=subset["y"],
                    mode="markers",
                    text=subset["orig_index"].astype(str),
                    showlegend=False,
                    marker=dict(
                        size=10 if cid == -1 else 6,
                        color=color_map[cid],
                        line=dict(width=1, color="black")
                    ),
                ),
                row=r, col=1
            )

        # hide axes
        fig.update_xaxes(visible=False, row=r, col=1)
        fig.update_yaxes(visible=False, row=r, col=1)

    # final layout
    main_title_text = f"{plot_type.upper()} | Aggregation {embedding_type.upper()}"
    if title:
        main_title_text += f" | {title}"

    fig.update_layout(
        margin=dict(l=20, r=20, t=80, b=20),  # Ensures tight layout
        height = subplot_height * n_rows,
        width  = subplot_width,
        title  = {
            "text": main_title_text,
            "x": 0.5,
            "xanchor": "center",
            "font": {"size": title_font_size}
        },
        hovermode="closest",
    )
    for anno in fig.layout.annotations:
        # subplot_titles from make_subplots are all annotations
        anno.font.size = row_title_font_size

    # Save figure as high-resolution PNG if requested
    if save_png:
        if png_name is None:
            # Generate a default filename if not provided
            safe_title = "".join(c for c in (title if title else "") if c.isalnum())
            safe_title_part = f"_{safe_title}" if safe_title else ""
            png_name = f"{embedding_type}_{plot_type}{safe_title_part}.png"

        # To achieve a DPI of 600, a scale factor is used. Kaleido's default is 1.0 for 96 DPI.
        # Scale = Target DPI / Default DPI = 600 / 96 = 6.25.
        scale_factor = 6.25
        fig.write_image(png_name, scale=scale_factor)
        logging.info(f"Figure saved to {png_name} with high resolution (DPI-equivalent 600).")

    return fig

In [43]:
# 1) Run the batch processing to compute outlier labels and PCA/t-SNE coords.
#    This must run before the visualiser cells, which only read the run folders
#    and cached coordinate files from disk.
batch_process_single_layer_outliers()


2025-06-24 17:55:44,259 [INFO] [sum | layer 0] saved in data/landmarks_sum_ui_results\conf-0.5_layers-0_IsolationForest_contamination-0.01_n_estimators-500
2025-06-24 17:55:45,772 [INFO] [sum | layer 1] saved in data/landmarks_sum_ui_results\conf-0.5_layers-1_IsolationForest_contamination-0.01_n_estimators-500
2025-06-24 17:55:47,251 [INFO] [sum | layer 2] saved in data/landmarks_sum_ui_results\conf-0.5_layers-2_IsolationForest_contamination-0.01_n_estimators-500
2025-06-24 17:55:48,695 [INFO] [sum | layer 3] saved in data/landmarks_sum_ui_results\conf-0.5_layers-3_IsolationForest_contamination-0.01_n_estimators-500
2025-06-24 17:55:50,143 [INFO] [sum | layer 4] saved in data/landmarks_sum_ui_results\conf-0.5_layers-4_IsolationForest_contamination-0.01_n_estimators-500
2025-06-24 17:55:51,635 [INFO] [sum | layer 5] saved in data/landmarks_sum_ui_results\conf-0.5_layers-5_IsolationForest_contamination-0.01_n_estimators-500
2025-06-24 17:55:53,145 [INFO] [sum | layer 6] saved in data/lan

In [44]:
# 2) Render the PCA view for the "sum" aggregation ("max" and "avg" are the
#    other available aggregations), one subplot row per run folder.
pca_run_folders = [
    "conf-0.5_layers-0_IsolationForest_contamination-0.01_n_estimators-500",
    "conf-0.5_layers-2_IsolationForest_contamination-0.01_n_estimators-500",
    "conf-0.5_layers-4_IsolationForest_contamination-0.01_n_estimators-500",
    "conf-0.5_layers-6_IsolationForest_contamination-0.01_n_estimators-500",
    "conf-0.5_layers-8_IsolationForest_contamination-0.01_n_estimators-500",
    "conf-0.5_layers-10_IsolationForest_contamination-0.01_n_estimators-500",
]

# NOTE(review): captions count layers from 1 while the folder names use
# 0-based layer indices — presumably intentional; confirm.
pca_row_captions = [
    "layer=1",
    "layer=3",
    "layer=5",
    "layer=7",
    "layer=9",
    "layer=11",
]

fig = visualise_runs_column(
    embedding_type="sum",
    plot_type="pca",
    experiment_names=pca_run_folders,
    row_titles=pca_row_captions,
    sampling_value=0.25,          # fraction of non-outlier points to draw
    subplot_width=550,
    subplot_height=350,
    vertical_spacing=0.02,
    horizontal_spacing=0,
    title="",                     # empty: nothing is appended to the main title
    title_font_size=32,
    row_title_font_size=24,
)
fig.show()



In [45]:
# Render the t-SNE view for the "sum" aggregation ("max" and "avg" are the
# other available aggregations), one subplot row per run folder.
tsne_run_folders = [
    "conf-0.5_layers-0_IsolationForest_contamination-0.01_n_estimators-500",
    "conf-0.5_layers-2_IsolationForest_contamination-0.01_n_estimators-500",
    "conf-0.5_layers-4_IsolationForest_contamination-0.01_n_estimators-500",
    "conf-0.5_layers-6_IsolationForest_contamination-0.01_n_estimators-500",
    "conf-0.5_layers-8_IsolationForest_contamination-0.01_n_estimators-500",
    "conf-0.5_layers-10_IsolationForest_contamination-0.01_n_estimators-500",
]

# NOTE(review): captions count layers from 1 while the folder names use
# 0-based layer indices — presumably intentional; confirm.
tsne_row_captions = [
    "layer=1",
    "layer=3",
    "layer=5",
    "layer=7",
    "layer=9",
    "layer=11",
]

fig = visualise_runs_column(
    embedding_type="sum",
    plot_type="tsne",
    experiment_names=tsne_run_folders,
    row_titles=tsne_row_captions,
    sampling_value=0.25,          # fraction of non-outlier points to draw
    subplot_width=550,
    subplot_height=350,
    vertical_spacing=0.02,
    horizontal_spacing=0,
    title="",                     # empty: nothing is appended to the main title
    title_font_size=32,
    row_title_font_size=24,
)
fig.show()

