In [None]:
import os
import numpy as np
from vendi_score import vendi
from pprint import pprint   
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# NOTE: Set this to either images or videos.
# The choice selects both the embedding model name and the per-dataset
# directories that the embeddings are read from.
images_or_videos = 'images'  # 'images' or 'videos'

if images_or_videos == 'videos':
    embedding_model = 'facebook_vjepa2-vitl-fpc64-256'
    # Dataset display name -> directory of video embeddings for `embedding_model`
    embedding_dir_dict = {
        'BabyView': f'/ccn2a/dataset/babyview/2025.2/outputs/video_embeddings/babyview/{embedding_model}',
        'SAYCam': f'/ccn2a/dataset/babyview/2025.2/outputs/video_embeddings/SAYCam/{embedding_model}',
        'Ego4D': f'/ccn2a/dataset/babyview/2025.2/outputs/video_embeddings/ego4D/{embedding_model}',
        'Kinetics400': f'/ccn2a/dataset/babyview/2025.2/outputs/video_embeddings/kinetics_train//{embedding_model}',
        'SSv2': f'/ccn2a/dataset/babyview/2025.2/outputs/video_embeddings/ssv2/{embedding_model}',
        'MomentsInTime': f'/ccn2a/dataset/babyview/2025.2/outputs/video_embeddings/Moments_in_Time_Raw_training/{embedding_model}',
        'Physion': f'/ccn2a/dataset/babyview/2025.2/outputs/video_embeddings/physion/{embedding_model}',
    }

elif images_or_videos == 'images':
    embedding_model = 'facebook_dinov3-vitb16-pretrain-lvd1689m'
    # Alternative image model: 'facebook_dinov2-base'
    # Dataset display name -> directory of image embeddings for `embedding_model`
    embedding_dir_dict = {
        'BabyView': f'/ccn2a/dataset/babyview/2025.2/outputs/image_embeddings/babyview/{embedding_model}',
        'SAYCam': f'/ccn2a/dataset/babyview/2025.2/outputs/image_embeddings/SAYCam/{embedding_model}',
        'Ego4D': f'/ccn2a/dataset/babyview/2025.2/outputs/image_embeddings/ego4D/{embedding_model}',
        'Kinetics400': f'/ccn2a/dataset/babyview/2025.2/outputs/image_embeddings/kinetics400_train//{embedding_model}',
        'SSv2': f'/ccn2a/dataset/babyview/2025.2/outputs/image_embeddings/ssv2/{embedding_model}',
        'MomentsInTime': f'/ccn2a/dataset/babyview/2025.2/outputs/image_embeddings/Moments_in_Time_Raw_training/{embedding_model}',
        'Physion': f'/ccn2a/dataset/babyview/2025.2/outputs/image_embeddings/physion/{embedding_model}',
        'ImageNet': f'/ccn2a/dataset/babyview/2025.2/outputs/image_embeddings/imagenet_test/{embedding_model}',
    }

else:
    # Fail here rather than later with a NameError (or stale values left in
    # the kernel) when the switch above is mistyped.
    raise ValueError(
        f"images_or_videos must be 'images' or 'videos', got {images_or_videos!r}"
    )

In [None]:
def get_num_files(embedding_dir):
    """Return how many entries in ``embedding_dir`` have a name ending in ``.npy``.

    Only the directory's immediate entries (as listed by ``os.listdir``) are
    considered; subdirectories are not traversed.
    """
    return sum(1 for entry in os.listdir(embedding_dir) if entry.endswith(".npy"))

def load_embeddings(embedding_dir, num_samples=1000, seed=None):
    """Load a random subset of the ``.npy`` embeddings stored in a directory.

    Parameters
    ----------
    embedding_dir : str
        Directory whose ``.npy`` files are the candidates for loading.
    num_samples : int, default 1000
        Upper bound on the number of files loaded. When the directory holds
        fewer files, all of them are loaded.
    seed : int or None, default None
        When given, the subset is drawn with ``np.random.default_rng(seed)``
        so the same files are picked on every call. When ``None``, NumPy's
        global random state is used (the previous behaviour).

    Returns
    -------
    numpy.ndarray
        The loaded arrays stacked along a new leading axis; the first
        dimension is ``min(num_samples, number_of_npy_files)``.

    Raises
    ------
    ValueError
        If the directory contains no ``.npy`` files, or if ``np.stack``
        rejects the loaded arrays (e.g. nothing selected, differing shapes).
    """
    # Sort so the candidate list (and therefore a seeded draw) does not depend
    # on the arbitrary order in which os.listdir returns entries.
    files = sorted(f for f in os.listdir(embedding_dir) if f.endswith(".npy"))
    if not files:
        # np.stack would fail anyway; say which directory is the problem.
        raise ValueError(f"No .npy embedding files found in {embedding_dir!r}")

    # Both the legacy module and a Generator expose a compatible `choice`.
    sampler = np.random if seed is None else np.random.default_rng(seed)
    chosen = sampler.choice(files, min(num_samples, len(files)), replace=False)
    return np.stack([np.load(os.path.join(embedding_dir, f)) for f in chosen])

In [None]:
# Seed NumPy's global RNG before sampling: load_embeddings draws its random
# file subset from that global state when called without an explicit seed,
# so without this every run would analyse a different sample.
SAMPLING_SEED = 42
np.random.seed(SAMPLING_SEED)

# results[dataset_name] -> {'num_files': total .npy count, 'embeddings': sampled array}
results = {}
embedding_dim = None  # second dimension of the most recently loaded array
for dataset_name, embedding_dir in embedding_dir_dict.items():
    X = load_embeddings(embedding_dir)          # (n, d); d depends on embedding_model
    embedding_dim = X.shape[1]
    num_files = get_num_files(embedding_dir)    # all files, not just the sampled ones

    results[dataset_name] = {
        'num_files': num_files,
        'embeddings': X,
    }
    print(f'{dataset_name}: {num_files} files, {X.shape[0]} samples, {X.shape[1]} dim')

## Plotting

In [None]:
# --- t-SNE across datasets (uses `results` built in the loading cell above) ---
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import pandas as pd

# Config: cap how many points per dataset for speed (set to None for all)
max_per_dataset = 1000
random_state = 42

# 1) Collect a balanced sample across datasets
# A single generator is created once so that successive datasets receive
# successive draws; re-creating it per dataset with the same seed would hand
# every equally sized dataset the identical index subset.
sample_rng = np.random.default_rng(random_state)

Xs = []
labels = []
row_ids = []  # index within each dataset's embedding array
for ds_name, info in results.items():
    X = info["embeddings"]  # (n, d)
    n = X.shape[0]
    if n == 0:
        continue
    if (max_per_dataset is None) or (n <= max_per_dataset):
        idx = np.arange(n)
    else:
        idx = sample_rng.choice(n, size=max_per_dataset, replace=False)
    Xs.append(X[idx])
    labels.extend([ds_name] * len(idx))
    row_ids.extend(idx.tolist())

if not Xs:
    # np.vstack([]) would raise an opaque error; report the actual cause.
    raise ValueError("[t-SNE] No embeddings found in `results`; nothing to project.")

X_all = np.vstack(Xs)                      # (N, d)
labels = np.array(labels)                  # (N,)
row_ids = np.array(row_ids)                # (N,)

N, D = X_all.shape
print(f"[t-SNE] Using {N} points across {len(np.unique(labels))} datasets; dim={D}")

# 2) Optional PCA to 50D (common speedup + denoising before t-SNE)
#    n_components cannot exceed either the feature count or the sample count.
pca_dim = min(50, D, N)
X_pca = PCA(n_components=pca_dim, random_state=random_state).fit_transform(X_all)

# 3) Pick a valid perplexity based on N
#    Keep roughly 3 * perplexity neighbours available among the N points and
#    never reach N itself (scikit-learn rejects perplexity >= n_samples).
#    For N >= 16 this is the same value as a floor-of-5 rule would give.
max_perp = max(1, (N - 1) // 3)
perplexity = min(50, max_perp)  # cap at 50 by default
print(f"[t-SNE] Using perplexity={perplexity}")

tsne = TSNE(
    n_components=2,
    perplexity=perplexity,
    learning_rate=200,   # use numeric for compatibility
    init="pca",
    random_state=random_state,
    verbose=1,
)
X_2d = tsne.fit_transform(X_pca)


# 4) Plot — honor embedding_dir_dict order for colors & legend
fig, ax = plt.subplots(figsize=(8, 7), dpi=120)

# Datasets in the exact insertion order of embedding_dir_dict,
# but keep only those present in the current labels
ordered_datasets_all = list(embedding_dir_dict.keys())
present = set(labels.tolist())
ordered_datasets_present = [ds for ds in ordered_datasets_all if ds in present]

# Stable color map in that exact order
num_classes = len(ordered_datasets_present)
cmap = plt.get_cmap("tab20" if num_classes > 10 else "tab10")
color_map = {ds: cmap(i % cmap.N) for i, ds in enumerate(ordered_datasets_present)}

# Reproducible RNG for plotting subsample
rng = np.random.default_rng(random_state)

handles = []
for ds in ordered_datasets_present:
    ds_idx = np.flatnonzero(labels == ds)
    # Plot at most 200 points for this dataset to keep the figure readable
    if ds_idx.size > 200:
        plot_idx = rng.choice(ds_idx, size=200, replace=False)
    else:
        plot_idx = ds_idx

    sc = ax.scatter(
        X_2d[plot_idx, 0],
        X_2d[plot_idx, 1],
        s=10,
        alpha=0.75,
        c=[color_map[ds]],
        label=f"{ds} (n={plot_idx.size})",
        edgecolors="none",
    )
    handles.append(sc)

ax.set_title(f"t-SNE of {images_or_videos} using {embedding_model}")
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
# Legend order matches embedding_dir_dict because we pass handles in order
ax.legend(handles=handles, loc="best", frameon=True, fontsize=8, markerscale=2.0)
fig.tight_layout()

# 5) Save outputs
out_png = f"figures/tsne_{images_or_videos}.png"
os.makedirs(os.path.dirname(out_png), exist_ok=True)
fig.savefig(out_png)
print(f"[t-SNE] Figure saved to: {out_png}")

# 2-D coordinates for every projected point (not only the plotted subsample).
# NOTE(review): this frame is only kept in memory here; it is not written to
# disk in this cell — confirm whether a later cell is expected to save it.
df_out = pd.DataFrame({
    "x": X_2d[:, 0],
    "y": X_2d[:, 1],
    "dataset": labels,
    "row_id_within_dataset": row_ids,
})
# If running in a notebook, also show it inline:
plt.show()