# UMAP Projection Calculation and Saving

### Goal of this Notebook:
- Save projections into the `data/projections` dir, with unique file names and the following format (image names without file extension):
```json
[
  {
    "image": "image_name1",
    "UMAP1": 1.7074365615844727,
    "UMAP2": 0.3955068588256836
  },
  {
    "image": "image_name2",
    "UMAP1": -2.22145938873291,
    "UMAP2": -0.10457038879394531
  }
]
```
- For every projection, add an entry to `data/projections/projection_manifest.json` in the following format (each element is the filename of a projection file saved above):
```json
[
  "umap_image_projection.json",
  "umap_combined_projection.json"
]
```

The frontend will parse the manifest to show what projections are available and will fetch a specific one when requested.

In [5]:
import os
import numpy as np
import pandas as pd
import umap
import json
from pathlib import Path

# --- Resolve repository paths ---
# REPO_ROOT is taken to be the parent of the current working directory,
# so every path below is relative to wherever the kernel was started.
NOTEBOOK_DIR = Path.cwd()
REPO_ROOT = NOTEBOOK_DIR.parent
DATASET_PATH = REPO_ROOT.joinpath("data")
FEATURE_DIR = DATASET_PATH.joinpath("features")
PROJECTIONS_DIR = DATASET_PATH.joinpath("projections")
# Create the output directory (and any missing parents); no-op if it already exists.
PROJECTIONS_DIR.mkdir(parents=True, exist_ok=True)

# Load Features
Load image features (assumed saved in NPZ format with keys "image_names" and "features")

In [6]:
# --- Load image features (NPZ archive with keys "image_names" and "features") ---
img_feat_file = FEATURE_DIR / "image_features.npz"
print(f"Loading image features from: {img_feat_file}")
# NOTE(security): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code. Only load feature files produced by a trusted pipeline.
# The context manager closes the underlying archive once both arrays are read,
# instead of leaving the file handle open for the lifetime of the kernel.
with np.load(img_feat_file, allow_pickle=True) as img_data:
    image_names = img_data["image_names"]
    image_features = img_data["features"]

# --- Load metadata features (CSV whose first column is used as the index) ---
meta_feat_file = FEATURE_DIR / "metadata_features.csv"
print(f"Loading metadata features from: {meta_feat_file}")
df_meta = pd.read_csv(meta_feat_file, index_col=0)

# Reorder metadata rows to follow image_names. A name missing from the index
# makes .loc raise KeyError.
metadata_features = df_meta.loc[image_names].values

# Duplicate labels in the metadata index make .loc return extra rows, which
# would only surface later as a shape mismatch when the features are stacked.
if metadata_features.shape[0] != len(image_names):
    raise ValueError(
        f"Metadata lookup returned {metadata_features.shape[0]} rows for "
        f"{len(image_names)} image names; check {meta_feat_file} for duplicate "
        "index entries."
    )

Loading image features from: /app/data/features/image_features.npz
Loading metadata features from: /app/data/features/metadata_features.csv


# Compute UMAP Projections

In [18]:
# Helper: fit UMAP and return a mean-centered embedding
def compute_normalized_umap(
    features, n_neighbors=15, min_dist=0.1, n_components=2, random_state=42
):
    """Fit UMAP on ``features`` and return the embedding shifted to zero mean.

    Despite the name, the only normalization applied is centering: the
    per-component mean is subtracted and the coordinates are not rescaled.

    Args:
        features: Input passed unchanged to ``umap.UMAP.fit_transform``.
        n_neighbors: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        n_components: Forwarded to ``umap.UMAP``.
        random_state: Forwarded to ``umap.UMAP``.

    Returns:
        The fitted embedding with the mean over axis 0 subtracted.
    """
    umap_kwargs = {
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
        "n_components": n_components,
        "random_state": random_state,
    }
    coords = umap.UMAP(**umap_kwargs).fit_transform(features)
    # Subtract the per-component mean so each output axis is centered on 0.
    centroid = np.mean(coords, axis=0)
    return coords - centroid


def _as_projection_frame(embedding):
    """Wrap a 2-column embedding in a DataFrame and attach the image names."""
    frame = pd.DataFrame(embedding, columns=["UMAP1", "UMAP2"])
    return frame.assign(image=image_names)


# --- Projection 1: image features only ---
print("Computing UMAP projection for image features only...")
umap_img = compute_normalized_umap(image_features)
umap_img_df = _as_projection_frame(umap_img)

# --- Projection 2: image features + metadata features ---
print("Computing UMAP projection for combined image and metadata features...")
# NaN entries in the metadata are replaced with 0 before stacking.
metadata_features_clean = np.nan_to_num(metadata_features, nan=0.0)
# Stack horizontally so each row holds the image features followed by the metadata features.
combined_features = np.hstack([image_features, metadata_features_clean])
umap_combined = compute_normalized_umap(combined_features)
umap_combined_df = _as_projection_frame(umap_combined)

Computing UMAP projection for image features only...


  warn(


Computing UMAP projection for combined image and metadata features...


  warn(


# Prepare and Save Projections
Convert each projection DataFrame into a list of records (one per image), write it to its own JSON file, and then write the manifest listing both files.

In [8]:
def _write_json(path, payload):
    """Serialize ``payload`` to ``path`` as indented JSON, overwriting any existing file."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


# Column order of every record written below.
_RECORD_COLUMNS = ["image", "UMAP1", "UMAP2"]

# --- Prepare and Save Image-Only Projection ---
# The projection frames already carry an "image" column, so select the columns
# explicitly. Calling reset_index() and renaming "index" to "image" would add a
# second "image" column and make to_dict() warn about non-unique columns.
umap_img_df_reset = umap_img_df[_RECORD_COLUMNS]
# Convert to a list of dictionaries, e.g.,
# [ { "image": "pikachu", "UMAP1": 0.123, "UMAP2": -0.456 }, ... ]
image_projection = umap_img_df_reset.to_dict(orient="records")
json_path_image = PROJECTIONS_DIR / "umap_image_projection.json"
_write_json(json_path_image, image_projection)
print(f"Saved image-only UMAP projection to: {json_path_image}")

# --- Prepare and Save Combined Projection ---
umap_combined_df_reset = umap_combined_df[_RECORD_COLUMNS]
combined_projection = umap_combined_df_reset.to_dict(orient="records")
json_path_combined = PROJECTIONS_DIR / "umap_combined_projection.json"
_write_json(json_path_combined, combined_projection)
print(f"Saved combined UMAP projection to: {json_path_combined}")

# --- Update Projection Manifest ---
# The manifest is rewritten from scratch and lists only the two files saved by
# this cell; entries from a previously written manifest are not preserved.
manifest = [
    json_path_image.name,
    json_path_combined.name,
]
manifest_path = PROJECTIONS_DIR / "projection_manifest.json"
_write_json(manifest_path, manifest)
print(f"Updated projection manifest at: {manifest_path}")

Saved image-only UMAP projection to: /app/data/projections/umap_image_projection.json
Saved combined UMAP projection to: /app/data/projections/umap_combined_projection.json
Updated projection manifest at: /app/data/projections/projection_manifest.json


  image_projection = umap_img_df_reset.to_dict(orient="records")
  combined_projection = umap_combined_df_reset.to_dict(orient="records")
