# Lab 2: Genre Target Vector Space

This notebook finalizes Lab 2 as a reproducible calibration pipeline on top of the frozen Lab 1 encoder.

What this notebook does:
1. Materialize and/or reuse genre samples.
2. Harvest (or reload) embeddings from the frozen Lab 1 model.
3. Build weighted target vectors and optionally apply a supervised LDA warp.
4. Build purified centroids (top-k inlier fraction).
5. Run the full Lab 2 exit audit and export all artifacts.


## Save/Resume Contract

This notebook supports the same practical run behavior as your scripts:
- `RUN_MODE='fresh'`: create a new run folder under `saves/lab2_calibration`.
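- `RUN_MODE='resume'`: continue in the existing run folder given by `RESUME_ARTIFACTS_DIR`, reusing its saved `genre_samples.csv` and embeddings instead of rebuilding them. Target composition, the optional LDA projection, and Stage 3 are recomputed on every run.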
- Stage outputs are persisted after each stage.
- `run_state.json` tracks progress, config, and final pass/fail status.

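You can also bootstrap from a previous run by setting `REUSE_ARTIFACTS_DIR`: when the current run folder has no embeddings yet, the embeddings saved in that folder are loaded instead of harvesting new ones.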


In [None]:
from pathlib import Path
from datetime import datetime
import json
import numpy as np
import pandas as pd

from src.lab2_data import load_manifests, assign_genres, materialize_genre_samples, genre_count_table
from src.lab2_encoder_bridge import FrozenLab1Encoder
from src.lab2_pipeline import (
    harvest_embeddings,
    compose_target_space,
    apply_lda_projection,
    compute_centroids,
    compute_centroid_distances,
    validate_target_space,
    write_artifacts,
    run_exit_audit,
)


In [None]:
# -----------------------------
# Run Controls
# -----------------------------
# Every tunable for the run lives in this cell; the later cells only read these names.
REPO_ROOT = Path.cwd().parent  # assumes the kernel's working directory sits one level below the repo root
OUTPUT_ROOT = REPO_ROOT / 'saves' / 'lab2_calibration'

RUN_MODE = 'fresh'          # 'fresh' or 'resume'
RUN_TAG = ''                # optional custom folder name for fresh mode
RESUME_ARTIFACTS_DIR = None # required only when RUN_MODE='resume', e.g. REPO_ROOT / 'saves/lab2_calibration/lab2_...'

# Optional bootstrap from another run's embeddings. This only replaces the Stage 2
# harvest; Stage 1 still builds genre_samples.csv unless OUT_DIR already has one.
REUSE_ARTIFACTS_DIR = None  # e.g. REPO_ROOT / 'saves/lab2_calibration/lab2_20260211_015118'

# Stage toggles
RUN_STAGE1 = True
RUN_STAGE2 = True
RUN_STAGE3 = True

# Data + model
# NOTE(review): MANIFESTS_ROOT is a machine-specific absolute path; adjust per workstation.
MANIFESTS_ROOT = Path('Z:/DataSets/_lab1_manifests')
CHECKPOINT = REPO_ROOT / 'saves/lab1_run_combo_af_gate_exit_v2/latest.pt'
PER_GENRE_SAMPLES = 1200
SEED = 328
DEVICE = 'auto'

# Calibration knobs
PROJECTION = 'lda'          # 'raw' or 'lda'
ZSTYLE_WEIGHT = 2.0
DESCRIPTOR_WEIGHT = 1.0
CENTROID_INLIER_FRACTION = 0.5
AUDIT_INLIER_FRACTION = 0.5

# Exit thresholds
SILHOUETTE_THRESHOLD = 0.45
SIGMA_MULTIPLIER = 3.0
NEIGHBOR_TOP_K = 5
NEIGHBOR_MIN_MEAN_HITS = 4.0
STABILITY_SAMPLE_FRACTION = 0.2
STABILITY_TRIALS = 10
TSNE_MAX_POINTS = 5000

# Resolve the run folder for the selected mode.
if RUN_MODE == 'fresh':
    ts = datetime.now().strftime('%Y%m%d_%H%M%S')
    folder = RUN_TAG if RUN_TAG else f'lab2_nb_{ts}'
    OUT_DIR = OUTPUT_ROOT / folder
elif RUN_MODE == 'resume':
    if RESUME_ARTIFACTS_DIR is None:
        raise ValueError("RUN_MODE='resume' requires RESUME_ARTIFACTS_DIR")
    OUT_DIR = Path(RESUME_ARTIFACTS_DIR)
    # A resume target must already exist. Without this check the mkdir below would
    # silently create a mistyped folder and the run would start from scratch.
    if not OUT_DIR.is_dir():
        raise FileNotFoundError(f'RESUME_ARTIFACTS_DIR does not exist or is not a directory: {OUT_DIR}')
else:
    raise ValueError("RUN_MODE must be 'fresh' or 'resume'")

OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR


In [None]:
# -----------------------------
# Run-State Helpers
# -----------------------------
# run_state.json lives inside the run folder; the cells below store stage flags,
# the config snapshot, and final results in it.
state_path = OUT_DIR / 'run_state.json'


def _save_json(obj, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open('w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2)


def _load_json(path: Path, default=None):
    if path.exists():
        with path.open('r', encoding='utf-8') as f:
            return json.load(f)
    return {} if default is None else default


def _save_embeddings(out_dir: Path, index_df: pd.DataFrame, arrays: dict):
    index_df.to_csv(out_dir / 'embeddings_index.csv', index=False)
    np.savez_compressed(
        out_dir / 'embeddings.npz',
        z_content=arrays['z_content'],
        z_style=arrays['z_style'],
        descriptor32=arrays['descriptor32'],
        target160=arrays['target160'],
        music_prob=arrays['music_prob'],
    )


def _load_embeddings(out_dir: Path):
    idx_path = out_dir / 'embeddings_index.csv'
    npz_path = out_dir / 'embeddings.npz'
    if not idx_path.exists() or not npz_path.exists():
        return None, None
    index_df = pd.read_csv(idx_path)
    z = np.load(npz_path)
    arrays = {
        'z_content': z['z_content'].astype(np.float32),
        'z_style': z['z_style'].astype(np.float32),
        'descriptor32': z['descriptor32'].astype(np.float32),
        'target160': z['target160'].astype(np.float32),
        'music_prob': z['music_prob'].astype(np.float32),
    }
    return index_df, arrays


# Load the persisted run state; a run folder without run_state.json starts from these defaults.
state = _load_json(state_path, default={
    'stage1_done': False,
    'stage2_done': False,
    'stage3_done': False,
    'created_at': datetime.now().isoformat(timespec='seconds'),
})

# The config snapshot is rewritten on every execution so run_state.json reflects the
# knobs used by the current session (also when resuming an older folder).
state['config'] = {
    'run_mode': RUN_MODE,
    'out_dir': str(OUT_DIR),
    'reuse_artifacts_dir': str(REUSE_ARTIFACTS_DIR) if REUSE_ARTIFACTS_DIR else '',
    'checkpoint': str(CHECKPOINT),
    'projection': PROJECTION,
    'zstyle_weight': float(ZSTYLE_WEIGHT),
    'descriptor_weight': float(DESCRIPTOR_WEIGHT),
    'centroid_inlier_fraction': float(CENTROID_INLIER_FRACTION),
    'audit_inlier_fraction': float(AUDIT_INLIER_FRACTION),
    'silhouette_threshold': float(SILHOUETTE_THRESHOLD),
    'sigma_multiplier': float(SIGMA_MULTIPLIER),
    'neighbor_top_k': int(NEIGHBOR_TOP_K),
    'neighbor_min_mean_hits': float(NEIGHBOR_MIN_MEAN_HITS),
    # Sampling, seed, and audit knobs: without them a run cannot be reproduced
    # from run_state.json alone.
    'manifests_root': str(MANIFESTS_ROOT),
    'per_genre_samples': int(PER_GENRE_SAMPLES),
    'seed': int(SEED),
    'device': str(DEVICE),
    'stability_sample_fraction': float(STABILITY_SAMPLE_FRACTION),
    'stability_trials': int(STABILITY_TRIALS),
    'tsne_max_points': int(TSNE_MAX_POINTS),
}

_save_json(state, state_path)
state


## Stage 1: Genre Materialization

Goal: create a balanced, explicit sample manifest for this run.

Resume behavior:
- If `genre_samples.csv` already exists in `OUT_DIR`, it is loaded.
- Otherwise it is generated from manifests and saved.


In [None]:
# Stage 1: reuse the sample manifest saved in this run folder, or build and save a new one.
genre_samples_path = OUT_DIR / 'genre_samples.csv'

if not genre_samples_path.exists():
    # Nothing cached for this run, so the manifest has to be generated now.
    if not RUN_STAGE1:
        raise RuntimeError('Stage 1 is disabled but no saved genre_samples.csv was found.')

    raw_df = load_manifests(MANIFESTS_ROOT)
    assigned_df = assign_genres(raw_df)
    assigned_counts = genre_count_table(assigned_df)

    samples_df = materialize_genre_samples(assigned_df, per_genre_samples=PER_GENRE_SAMPLES, seed=SEED)
    sampled_counts = genre_count_table(samples_df)
    samples_df.to_csv(genre_samples_path, index=False)
    print('[stage1] created and saved genre_samples.csv')
else:
    # Cached manifest: the assigned counts are only available from the saved run state.
    samples_df = pd.read_csv(genre_samples_path)
    assigned_counts = state.get('assigned_genre_counts', {})
    sampled_counts = genre_count_table(samples_df)
    print('[stage1] loaded existing genre_samples.csv')

# Record the outcome so a later resume can see that Stage 1 finished.
state.update(
    assigned_genre_counts=assigned_counts,
    sampled_genre_counts=sampled_counts,
    stage1_done=True,
)
_save_json(state, state_path)

print('Assigned counts:', assigned_counts)
print('Sampled counts:', sampled_counts)
samples_df.head()


## Stage 2: Embedding Harvest (or Reload)

Goal: produce stable embedding artifacts (`embeddings_index.csv`, `embeddings.npz`).

Resume behavior:
- If embeddings already exist in `OUT_DIR`, they are loaded.
- Otherwise, if `REUSE_ARTIFACTS_DIR` is set, embeddings are loaded from there.
- Otherwise embeddings are harvested from the frozen Lab 1 encoder (requires `RUN_STAGE2 = True`).

After load/harvest, weighted target composition and optional LDA warp are applied,
and the resulting vectors are saved back into this run folder.


In [None]:
# Fail fast on a mistyped projection, before any expensive harvest. Without this check
# an unknown value silently skips the LDA step and the run is recorded as 'raw'.
if PROJECTION not in ('raw', 'lda'):
    raise ValueError("PROJECTION must be 'raw' or 'lda'")

# Source priority: this run folder, then REUSE_ARTIFACTS_DIR, then a fresh harvest.
index_df, arrays = _load_embeddings(OUT_DIR)

if index_df is not None and arrays is not None:
    print('[stage2] loaded embeddings from current OUT_DIR')
else:
    if REUSE_ARTIFACTS_DIR is not None:
        reuse_dir = Path(REUSE_ARTIFACTS_DIR)
        index_df, arrays = _load_embeddings(reuse_dir)
        if index_df is None:
            raise FileNotFoundError(f'No embeddings found in REUSE_ARTIFACTS_DIR: {reuse_dir}')
        print(f'[stage2] loaded embeddings from REUSE_ARTIFACTS_DIR: {reuse_dir}')
    else:
        if not RUN_STAGE2:
            raise RuntimeError('Stage 2 is disabled and no cached embeddings were found.')
        encoder = FrozenLab1Encoder(CHECKPOINT, device=DEVICE)
        index_df, arrays = harvest_embeddings(samples_df, encoder, progress_every=100)
        print('[stage2] harvested embeddings using frozen Lab 1 checkpoint')

# Compose weighted target space from z_style + descriptor block.
# This overwrites any 'target160' that came from a cached file, so the current
# weights always apply.
arrays['target160'] = compose_target_space(
    arrays=arrays,
    zstyle_weight=ZSTYLE_WEIGHT,
    descriptor_weight=DESCRIPTOR_WEIGHT,
    normalize_rows=True,
)

# Metadata for the unprojected case; replaced below when LDA is applied.
projection_meta = {
    'projection': 'raw',
    'input_dim': int(arrays['target160'].shape[1]),
    'output_dim': int(arrays['target160'].shape[1]),
}

if PROJECTION == 'lda':
    X_proj, projection_meta = apply_lda_projection(index_df=index_df, arrays=arrays, seed=SEED)
    arrays['target160'] = X_proj
    print('[stage2] applied supervised LDA projection')

# Persist the final vectors and projection metadata into this run folder.
_save_embeddings(OUT_DIR, index_df, arrays)
_save_json(projection_meta, OUT_DIR / 'projection_meta.json')

state['stage2_done'] = True
state['n_samples'] = int(len(index_df))
state['target_dim'] = int(arrays['target160'].shape[1])
state['projection_meta'] = projection_meta
_save_json(state, state_path)

print('n_samples:', len(index_df), 'target_dim:', arrays['target160'].shape[1])
index_df.head()


## Stage 3: Centroids, Exit Audit, and Final Artifacts

Goal: compute centroids and run the official Lab 2 exit checklist.

This stage writes:
- `validation_summary.json`
- `target_centroids.json`
- `lab2_exit_checklist.json`
- `neighbor_audit.csv`, `inter_centroid_separation.csv`, `global_genre_map_tsne.png`, etc.


In [None]:
# Stage 3: centroids, validation summary, artifact export, and the exit audit.
if not RUN_STAGE3:
    raise RuntimeError('Stage 3 is disabled for this run.')

centroids_df = compute_centroids(
    index_df=index_df,
    target160=arrays['target160'],
    inlier_fraction=CENTROID_INLIER_FRACTION,
    inlier_metric='cosine',
)
centroid_distances_df = compute_centroid_distances(centroids_df)
summary = validate_target_space(index_df, arrays, seed=SEED)

# Attach the run configuration and Stage 1/2 bookkeeping to the validation summary.
run_config = {
    'checkpoint': str(CHECKPOINT),
    'manifests_root': str(MANIFESTS_ROOT),
    'per_genre_samples': int(PER_GENRE_SAMPLES),
    'seed': int(SEED),
    'device': str(DEVICE),
    'projection': str(PROJECTION),
    'zstyle_weight': float(ZSTYLE_WEIGHT),
    'descriptor_weight': float(DESCRIPTOR_WEIGHT),
    'centroid_inlier_fraction': float(CENTROID_INLIER_FRACTION),
    'audit_inlier_fraction': float(AUDIT_INLIER_FRACTION),
    'neighbor_min_mean_hits': float(NEIGHBOR_MIN_MEAN_HITS),
}
summary['config'] = run_config
summary['assigned_genre_counts'] = assigned_counts
summary['sampled_genre_counts'] = sampled_counts
summary['projection_meta'] = projection_meta

# Both the artifact writer and the exit audit take the same six inputs.
stage3_inputs = dict(
    output_dir=OUT_DIR,
    index_df=index_df,
    arrays=arrays,
    centroids_df=centroids_df,
    centroid_distances_df=centroid_distances_df,
    validation_summary=summary,
)
write_artifacts(**stage3_inputs)

audit_settings = dict(
    silhouette_threshold=SILHOUETTE_THRESHOLD,
    sigma_multiplier=SIGMA_MULTIPLIER,
    neighbor_top_k=NEIGHBOR_TOP_K,
    neighbor_min_mean_hits=NEIGHBOR_MIN_MEAN_HITS,
    audit_inlier_fraction=AUDIT_INLIER_FRACTION,
    stability_sample_fraction=STABILITY_SAMPLE_FRACTION,
    stability_trials=STABILITY_TRIALS,
    tsne_max_points=TSNE_MAX_POINTS,
    seed=SEED,
)
exit_checklist = run_exit_audit(**stage3_inputs, **audit_settings)

# Record the final status and metrics in the run state.
state.update({
    'stage3_done': True,
    'lab2_done': bool(exit_checklist.get('lab2_done', False)),
    'finished_at': datetime.now().isoformat(timespec='seconds'),
    'key_metrics': summary.get('metrics', {}),
    'exit_summary': exit_checklist,
})
_save_json(state, state_path)

exit_checklist


## Results Snapshot

Use this cell for quick reporting and sanity checks.


In [None]:
# Compact report assembled from the persisted run state.
quick = {
    'out_dir': str(OUT_DIR),
    'lab2_done': bool(state.get('lab2_done', False)),
}
# Each remaining report field mirrors a run-state entry, defaulting to an empty dict.
quick.update({
    report_key: state.get(state_key, {})
    for report_key, state_key in (
        ('metrics', 'key_metrics'),
        ('assigned_genre_counts', 'assigned_genre_counts'),
        ('sampled_genre_counts', 'sampled_genre_counts'),
    )
})
quick
