### Visualize stimulus embeddings from multiple datasets to examine their diversity/clustering and test/train differences

In [1]:
from dotenv import load_dotenv
load_dotenv()
import numpy as np
import os
import glob
import pandas as pd
import umap.umap_ as umap
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [8]:
# housekeeping: shared configuration and the accumulator the loading cells fill in
# Root folder of the datasets; use the default if the DATASETS_ROOT env variable is not set.
dataset_root = os.getenv("DATASETS_ROOT", "/default/path/to/datasets")
image_embedding_model = "dreamsim"
image_set = 'coco'  # either 'imagenet' or 'coco'
subject = "sub-10"  # only matters for imagenet. coco has the same split for all (1-9) subjects

# One independent list per column; rows are appended column-by-column while loading.
cols = ['embeddings', 'label', 'stimulus_id', 'dataset_id']
all_data = {}
for column_name in cols:
    all_data[column_name] = []

In [10]:
### load NOD data
# Load the train/test split for the selected image set, then collect the
# embedding of every stimulus that appears in that split into `all_data`.
# The split pickle is read as a mapping of group name -> stimulus names, where
# 'group_01' is labelled 'training' and 'group_02' is labelled 'testing'.
# NOTE(review): pickle.load can execute arbitrary code; only load split files
# that come from a trusted dataset directory.
# NOTE(review): this cell appends to `all_data`, so re-running it without
# re-running the housekeeping cell adds every stimulus a second time.
stimuli_metadata_dir = os.path.join(dataset_root, "NaturalObjectDataset", "derivatives", "stimuli_metadata")

if image_set == 'imagenet':
    split_filename = f"{subject}_imagenet_groupings_rdm.pkl"
elif image_set == 'coco':
    split_filename = "coco_groupings_rdm.pkl"
else:
    # Fail here with a clear message instead of a NameError on `splits` below.
    raise ValueError(f"image_set must be 'imagenet' or 'coco', got {image_set!r}")

with open(os.path.join(stimuli_metadata_dir, "testtrain_split", split_filename), 'rb') as f:
    splits = pickle.load(f)
all_stim = [v for values in splits.values() for v in values]

# Stimulus name -> label, built once so each file needs a single O(1) lookup.
# Stimuli that belong to any other group still get a label ('unassigned') so
# all columns of `all_data` keep the same length. 'training' is written last
# so it wins for a stimulus listed in both groups (same precedence as if/elif).
label_by_stimulus = {name: 'unassigned' for name in all_stim}
label_by_stimulus.update((name, 'testing') for name in splits.get('group_02', ()))
label_by_stimulus.update((name, 'training') for name in splits.get('group_01', ()))

# glob returns paths in arbitrary order; sort them so the row order (and hence
# the downstream projection input) is the same on every machine.
embedding_paths = sorted(glob.glob(os.path.join(stimuli_metadata_dir, f"{image_embedding_model}_embeddings", "*.npy")))
model_suffix = f"_model-{image_embedding_model}"
for stim_embedding in embedding_paths:
    filename = Path(stim_embedding).stem
    # The stimulus name is everything before the "_model-<model>" suffix.
    stimulus_name = filename.split(model_suffix)[0]
    label = label_by_stimulus.get(stimulus_name)
    if label is None:
        continue  # embedding of a stimulus that is not part of the selected split

    # Append to every column together so the columns can never get out of step.
    all_data['embeddings'].append(np.load(stim_embedding))
    all_data['label'].append(label)
    all_data['stimulus_id'].append(stimulus_name)
    all_data['dataset_id'].append(f"NOD-{image_set}")



In [None]:
# Turn the accumulated columns into a table with one row per stimulus.
df = pd.DataFrame(all_data)
# Stack the per-stimulus embedding arrays row-wise into a single matrix.
embedding_rows = df['embeddings'].tolist()
X = np.vstack(embedding_rows)
print(X.shape)

In [12]:
# Standardize each embedding dimension (zero mean, unit variance) before the projection.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Project the standardized embeddings down to 2-D with UMAP.
# random_state is fixed so repeated runs use the same seed.
umap_settings = {
    'n_neighbors': 15,
    'min_dist': 0.1,
    'n_components': 2,
    'random_state': 42,
}
reducer = umap.UMAP(**umap_settings)
embedding = reducer.fit_transform(X_scaled)

In [None]:
# Store the two UMAP coordinates next to each stimulus' metadata.
df['UMAP1'], df['UMAP2'] = embedding[:, 0], embedding[:, 1]

# Scatter plot of the projection, coloured by train/test label.
# (To also separate datasets, use hue='dataset_id' together with style='label'.)
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df,
    x='UMAP1',
    y='UMAP2',
    hue='label',
    palette='tab10',
    s=10,
    alpha=0.6,
)

# The imagenet split is per subject; the coco split is shared by all subjects.
# Any other image_set leaves the figure untitled.
titles = {
    'imagenet': f'UMAP Projection for {image_set} {subject}',
    'coco': f'UMAP Projection for {image_set} all subjects',
}
if image_set in titles:
    plt.title(titles[image_set])
plt.show()