## Setup & imports

In [None]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
# Set seaborn's 'whitegrid' style for the plots drawn in the cells below.
sns.set(style='whitegrid')
# Put the parent of the current working directory at the front of sys.path
# (only once) so that the `src.*` imports resolve.
ROOT = os.path.abspath('..')
root_on_path = ROOT in sys.path
if not root_on_path:
    sys.path.insert(0, ROOT)
from src.vae import VAE
from src.eval_metrics import compute_metrics

## Load latents and cluster assignments

In [None]:
# Input paths, relative to the notebook's working directory.
import pandas as pd

latents_path = '../results/demo_vae/latents.npy'
assign_path = '../results/demo_analysis/assignments.csv'
X = np.load(latents_path)
print('Latents shape:', X.shape)
# try to load assignments if present; `labels` stays None whenever usable
# assignments cannot be obtained, so later cells can branch on it
labels = None
if os.path.exists(assign_path):
    df_assign = pd.read_csv(assign_path)
    if 'cluster' not in df_assign.columns:
        # Report the problem instead of failing with a bare KeyError below.
        print('Assignments file has no "cluster" column; ignoring', assign_path)
    elif len(df_assign) != X.shape[0]:
        # The plotting cells pass `labels` as hue, which needs exactly one
        # label per latent row.
        print(f'Assignments length {len(df_assign)} does not match '
              f'{X.shape[0]} latents; ignoring', assign_path)
    else:
        labels = df_assign['cluster'].values
        print('Loaded assignments, n=', len(labels))
else:
    print('No assignments found; will compute new clustering if needed')

## Visualizations (PCA, t-SNE, UMAP)

In [None]:
# PCA 2D
# PCA cannot return more components than min(n_samples, n_features), so bound
# the request by both dimensions of X rather than only by the latent width.
n_pca = min(2, X.shape[0], X.shape[1])
pca = PCA(n_components=n_pca)
Xpca = pca.fit_transform(X)
# With a single component there is no second column to plot; draw the points
# along y=0 instead of indexing a column that does not exist.
pca_x = Xpca[:, 0]
pca_y = Xpca[:, 1] if Xpca.shape[1] > 1 else np.zeros_like(pca_x)
plt.figure(figsize=(6,6))
if labels is None:
    plt.scatter(pca_x, pca_y, s=60)
else:
    sns.scatterplot(x=pca_x, y=pca_y, hue=labels, palette='tab10', s=60)
plt.title('PCA of latents')
plt.tight_layout()
plt.show()

# t-SNE (safe perplexity)
n_samples = X.shape[0]
if n_samples < 2:
    # No valid perplexity exists below two samples, so skip the plot rather
    # than let TSNE raise.
    print('t-SNE skipped: need at least 2 samples, got', n_samples)
else:
    # scikit-learn requires perplexity < n_samples. The extra `n_samples - 1`
    # clamp only takes effect for n_samples == 2; for larger inputs the value
    # is the same as min(30, max(2, (n_samples - 1) // 3)).
    perp = min(30, max(2, (n_samples - 1) // 3), n_samples - 1)
    tsne = TSNE(n_components=2, random_state=123, perplexity=perp)
    Xtsne = tsne.fit_transform(X)
    plt.figure(figsize=(6,6))
    if labels is None:
        plt.scatter(Xtsne[:,0], Xtsne[:,1], s=60)
    else:
        sns.scatterplot(x=Xtsne[:,0], y=Xtsne[:,1], hue=labels, palette='tab10', s=60)
    plt.title(f't-SNE (perplexity={perp})')
    plt.tight_layout()
    plt.show()

# UMAP (optional)
# Best-effort section: if the `umap` package cannot be imported, or anything
# fails while fitting or plotting, the error is printed instead of raised.
try:
    import umap

    reducer = umap.UMAP(n_components=2, random_state=123)
    Xumap = reducer.fit_transform(X)
    umap_x, umap_y = Xumap[:, 0], Xumap[:, 1]
    plt.figure(figsize=(6, 6))
    if labels is not None:
        # Colour the points by their cluster assignment.
        sns.scatterplot(x=umap_x, y=umap_y, hue=labels, palette='tab10', s=60)
    else:
        plt.scatter(umap_x, umap_y, s=60)
    plt.title('UMAP of latents')
    plt.tight_layout()
    plt.show()
except Exception as err:
    print('UMAP not available or failed:', err)

## Reconstructions from the VAE

In [None]:
# Load model checkpoint and reconstruct a few examples from saved latents
import torch
model_path = '../results/demo_vae/model_epoch10.pt'
feat_dir = '../data/features/multimodal'
# Collect the feature files up front so that a missing or empty directory is
# reported with a message instead of crashing in os.listdir / sample_files[0].
if os.path.isdir(feat_dir):
    # NOTE(review): os.listdir returns entries in arbitrary order, so pairing
    # sample_files[i] with X[i] below assumes the latents were written in this
    # same order — confirm against the script that produced latents.npy.
    sample_files = [f for f in os.listdir(feat_dir) if f.endswith('.npy')]
else:
    sample_files = []
if not os.path.exists(model_path):
    print('Model checkpoint not found at', model_path)
elif not sample_files:
    print('No .npy feature files found in', feat_dir)
else:
    # attempt to infer input_dim from a feature file
    sample = np.load(os.path.join(feat_dir, sample_files[0]))
    input_dim = sample.shape[0]
    from src.vae import VAE
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = VAE(input_dim=input_dim, hidden_dim=256, latent_dim=X.shape[1]).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    # reconstruct first 3 examples, or fewer when there are not enough latents
    # or feature files to pair up
    n_examples = min(3, X.shape[0], len(sample_files))
    for i in range(n_examples):
        # Decode one latent row as a batch of size 1, then drop the batch axis.
        z = torch.from_numpy(X[i].astype('float32')).unsqueeze(0).to(device)
        with torch.no_grad():
            recon = model.decode(z).cpu().numpy().squeeze(0)
        orig = np.load(os.path.join(feat_dir, sample_files[i]))
        mse = ((recon - orig)**2).mean()
        print(f'Example {i}: reconstruction MSE = {mse:.4f}')

## Conclusions & next steps
- The demo shows that latent clustering yields reasonable silhouette and Davies–Bouldin (DB) scores for the synthetic data.
- Next: run the same pipeline on the real Jamendo dataset (place files in `data/raw`), then produce a NeurIPS-style results section and figures.