### Visualize stimulus embeddings from multiple datasets to examine their diversity/clustering and test/train differences
Outdated: load stimuli from the shared/compiled stimulus folder and use the test/train split defined in either the compiled train/test JSONs or each fMRI dataset's stiminfo.tsv file.

In [None]:
from dotenv import load_dotenv
load_dotenv()
import os
import numpy as np
import sys
sys.path.append(os.getenv('PYTHONPATH')) 
import glob
import pandas as pd
import umap.umap_ as umap
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [None]:
#housekeeping
# Root directory under which every dataset folder is looked up. It comes from the
# DATASETS_ROOT environment variable (e.g. provided by the .env file read by load_dotenv()).
dataset_root = os.getenv('DATASETS_ROOT')
if dataset_root is None:
    # Fail here with a clear message rather than with an opaque TypeError later on,
    # when the missing value is first passed to os.path.join.
    raise EnvironmentError("DATASETS_ROOT environment variable is not set")

# Model names as they appear in the embedding folder names and file names.
video_embedding_model = "microsoft_xclip-large-patch14"
image_embedding_model = "openai_clip-vit-large-patch14"

# Column-wise accumulator shared by all loading cells: one list per column, and each
# loaded stimulus appends exactly one entry to every list.
# Re-running this cell resets the accumulator.
cols = ['embeddings','label','stimulus_id', 'dataset_id']
all_data = {col: [] for col in cols}

In [15]:
### load BMD data
# Collect the video-model embeddings of the BOLDMomentsDataset stimuli (one .npy per stimulus).
embedding_dir = os.path.join(dataset_root, "BOLDMomentsDataset","derivatives","stimuli_metadata",f"{video_embedding_model}_embeddings")
# glob returns files in a filesystem-dependent order; sort so the row order (and with it
# the seeded UMAP fit further down) is the same on every run and machine.
embedding_paths = sorted(glob.glob(os.path.join(embedding_dir, "*.npy")))
for stim_embedding in embedding_paths:
    filename = Path(stim_embedding).stem
    # the part of the file name before "_model-<model name>" is the stimulus id
    stimulus_name = filename.split(f"_model-{video_embedding_model}")[0]
    # numeric ids above 1000 are labelled as testing stimuli, all others as training
    label = 'testing' if int(stimulus_name) > 1000 else 'training'
    # keep only the first entry along the leading axis
    # (presumably the file stores a (1, dim) array -- TODO confirm)
    embedding_vector = np.load(stim_embedding)[0]
    # Append only after every step that can raise has succeeded, so a bad file can
    # never leave the columns of all_data with different lengths.
    all_data['stimulus_id'].append(stimulus_name)
    all_data['label'].append(label)
    all_data['embeddings'].append(embedding_vector)
    all_data['dataset_id'].append("BMD")

In [16]:
### load HAD data
# Collect the video-model embeddings of the HumanActionsDataset stimuli (one .npy per stimulus).
embedding_dir = os.path.join(dataset_root, "HumanActionsDataset","derivatives","stimuli_metadata",f"{video_embedding_model}_embeddings")
# glob returns files in a filesystem-dependent order; sort so the row order (and with it
# the seeded UMAP fit further down) is the same on every run and machine.
embedding_paths = sorted(glob.glob(os.path.join(embedding_dir, "*.npy")))
for stim_embedding in embedding_paths:
    filename = Path(stim_embedding).stem
    # the part of the file name before "_model-<model name>" is the stimulus id
    stimulus_name = filename.split(f"_model-{video_embedding_model}")[0]
    # keep only the first entry along the leading axis
    # (presumably the file stores a (1, dim) array -- TODO confirm)
    embedding_vector = np.load(stim_embedding)[0]
    # Append only after loading succeeded, so a bad file can never leave the columns
    # of all_data with different lengths.
    all_data['stimulus_id'].append(stimulus_name)
    all_data['label'].append('training') #all HAD were used for training
    all_data['embeddings'].append(embedding_vector)
    all_data['dataset_id'].append("HAD")

In [22]:
### load CC2017 data
# Collect the video-model embeddings of the CC2017 2-second clips (one .npy per clip).
embedding_dir = os.path.join(dataset_root, "CC2017","video_fmri_dataset","stimuli_metadata","clipped_2s", f"{video_embedding_model}_embeddings")
# glob returns files in a filesystem-dependent order; sort so the row order (and with it
# the seeded UMAP fit further down) is the same on every run and machine.
embedding_paths = sorted(glob.glob(os.path.join(embedding_dir, "*.npy")))
for stim_embedding in embedding_paths:
    filename = Path(stim_embedding).stem
    # the part of the file name before "_model-<model name>" is the stimulus id
    stimulus_name = filename.split(f"_model-{video_embedding_model}")[0]
    # the split is encoded in the stimulus name: "test" -> testing, "seg" -> training
    if 'test' in stimulus_name:
        label = 'testing'
    elif 'seg' in stimulus_name:
        label = 'training'
    else:
        raise ValueError("Invalid stimulus name")
    # keep only the first entry along the leading axis
    # (presumably the file stores a (1, dim) array -- TODO confirm)
    embedding_vector = np.load(stim_embedding)[0]
    # Append only after validation and loading succeeded; previously the stimulus id was
    # appended before the ValueError could fire, leaving all_data with ragged columns.
    all_data['stimulus_id'].append(stimulus_name)
    all_data['label'].append(label)
    all_data['embeddings'].append(embedding_vector)
    all_data['dataset_id'].append("CC2017")

In [32]:
### load NSD data
# Collect the image-model embeddings of the NaturalScenesDataset stimuli (one .npy per stimulus).
embedding_dir = os.path.join(dataset_root, "NaturalScenesDataset","derivatives","stimuli_metadata", f"{image_embedding_model}_embeddings")
# glob returns files in a filesystem-dependent order; sort so the row order (and with it
# the seeded UMAP fit further down) is the same on every run and machine.
embedding_paths = sorted(glob.glob(os.path.join(embedding_dir, "*.npy")))
for stim_embedding in embedding_paths:
    filename = Path(stim_embedding).stem
    # the part of the file name before "_model-<model name>" is the stimulus id
    stimulus_name = filename.split(f"_model-{image_embedding_model}")[0]
    # keep only the first entry along the leading axis
    # (presumably the file stores a (1, dim) array -- TODO confirm)
    embedding_vector = np.load(stim_embedding)[0]
    # Append only after loading succeeded, so a bad file can never leave the columns
    # of all_data with different lengths.
    all_data['stimulus_id'].append(stimulus_name)
    all_data['label'].append('training') # TODO: separate training and testing images
    all_data['embeddings'].append(embedding_vector)
    all_data['dataset_id'].append("NSD")

In [3]:
### load GOD data
# Collect the image-model embeddings of the GenericObjectDecoding stimuli (one .npy per stimulus).
embedding_dir = os.path.join(dataset_root, "GenericObjectDecoding","derivatives","stimuli_metadata", f"{image_embedding_model}_embeddings")
# glob returns files in a filesystem-dependent order; sort so the row order (and with it
# the seeded UMAP fit further down) is the same on every run and machine.
embedding_paths = sorted(glob.glob(os.path.join(embedding_dir, "*.npy")))
for stim_embedding in embedding_paths:
    filename = Path(stim_embedding).stem
    # the part of the file name before "_model-<model name>" is the stimulus id
    stimulus_name = filename.split(f"_model-{image_embedding_model}")[0]
    # keep only the first entry along the leading axis
    # (presumably the file stores a (1, dim) array -- TODO confirm)
    embedding_vector = np.load(stim_embedding)[0]
    # Append only after loading succeeded, so a bad file can never leave the columns
    # of all_data with different lengths.
    all_data['stimulus_id'].append(stimulus_name)
    all_data['label'].append('training') # TODO: separate training and testing images
    all_data['embeddings'].append(embedding_vector)
    all_data['dataset_id'].append("GOD")

In [9]:
### load deeprecon data
# Collect the image-model embeddings of the deeprecon stimuli (one .npy per stimulus) and
# tag each one with a sub-dataset id derived from its name.
embedding_dir = os.path.join(dataset_root, "deeprecon","derivatives","stimuli_metadata", f"{image_embedding_model}_embeddings")
# glob returns files in a filesystem-dependent order; sort so the row order (and with it
# the seeded UMAP fit further down) is the same on every run and machine.
embedding_paths = sorted(glob.glob(os.path.join(embedding_dir, "*.npy")))
for stim_embedding in embedding_paths:
    filename = Path(stim_embedding).stem
    # the part of the file name before "_model-<model name>" is the stimulus id
    stimulus_name = filename.split(f"_model-{image_embedding_model}")[0]
    # sub-dataset is chosen by substring: "colorExpStim" -> shape, "letter_" -> letter,
    # anything else -> natural images
    if 'colorExpStim' in stimulus_name:
        dataset_id = "deeprecon-shape"
    elif 'letter_' in stimulus_name:
        dataset_id = "deeprecon-letter"
    else:
        dataset_id = 'deeprecon-naturalimages'
    # keep only the first entry along the leading axis
    # (presumably the file stores a (1, dim) array -- TODO confirm)
    embedding_vector = np.load(stim_embedding)[0]
    # Append only after loading succeeded, so a bad file can never leave the columns
    # of all_data with different lengths.
    all_data['stimulus_id'].append(stimulus_name)
    all_data['label'].append('training') # TODO: separate training and testing images
    all_data['embeddings'].append(embedding_vector)
    all_data['dataset_id'].append(dataset_id)

In [15]:
### load BOLD5000 data
# Collect the image-model embeddings of the BOLD5000 stimuli (one .npy per stimulus).
embedding_dir = os.path.join(dataset_root, "BOLD5000","derivatives","stimuli_metadata", f"{image_embedding_model}_embeddings")
# glob returns files in a filesystem-dependent order; sort so the row order (and with it
# the seeded UMAP fit further down) is the same on every run and machine.
embedding_paths = sorted(glob.glob(os.path.join(embedding_dir, "*.npy")))
for stim_embedding in embedding_paths:
    filename = Path(stim_embedding).stem
    # the part of the file name before "_model-<model name>" is the stimulus id
    stimulus_name = filename.split(f"_model-{image_embedding_model}")[0]
    # keep only the first entry along the leading axis
    # (presumably the file stores a (1, dim) array -- TODO confirm)
    embedding_vector = np.load(stim_embedding)[0]
    # Append only after loading succeeded, so a bad file can never leave the columns
    # of all_data with different lengths.
    all_data['stimulus_id'].append(stimulus_name)
    all_data['label'].append('training') # TODO: separate training and testing images
    all_data['embeddings'].append(embedding_vector)
    all_data['dataset_id'].append('BOLD5000')

In [27]:
### load NOD data
# Collect the image-model embeddings of the NaturalObjectDataset stimuli (one .npy per
# stimulus) and tag each one with a sub-dataset id derived from its name.
embedding_dir = os.path.join(dataset_root, "NaturalObjectDataset","derivatives","stimuli_metadata", f"{image_embedding_model}_embeddings")
# glob returns files in a filesystem-dependent order; sort so the row order (and with it
# the seeded UMAP fit further down) is the same on every run and machine.
embedding_paths = sorted(glob.glob(os.path.join(embedding_dir, "*.npy")))
for stim_embedding in embedding_paths:
    filename = Path(stim_embedding).stem
    # the part of the file name before "_model-<model name>" is the stimulus id
    stimulus_name = filename.split(f"_model-{image_embedding_model}")[0]
    # names starting with "n" are tagged as the imagenet subset, everything else as coco
    dataset_id = "NOD-imagenet" if stimulus_name.startswith('n') else "NOD-coco"
    # keep only the first entry along the leading axis
    # (presumably the file stores a (1, dim) array -- TODO confirm)
    embedding_vector = np.load(stim_embedding)[0]
    # Append only after loading succeeded, so a bad file can never leave the columns
    # of all_data with different lengths.
    all_data['stimulus_id'].append(stimulus_name)
    all_data['label'].append('training') # TODO: separate training and testing images
    all_data['embeddings'].append(embedding_vector)
    all_data['dataset_id'].append(dataset_id)


In [21]:
### load THINGS_fmri data
# Collect the image-model embeddings of the THINGS_fmri stimuli (one .npy per stimulus)
# and tag each one with a sub-dataset id derived from the last letter of its name.
things_subsets = {
    'n': "THINGS-imagenet",
    'b': "THINGS-behavioral",
    's': "THINGS-googleimages",
}
embedding_dir = os.path.join(dataset_root, "THINGS_fmri","derivatives","stimuli_metadata", f"{image_embedding_model}_embeddings")
# glob returns files in a filesystem-dependent order; sort so the row order (and with it
# the seeded UMAP fit further down) is the same on every run and machine.
embedding_paths = sorted(glob.glob(os.path.join(embedding_dir, "*.npy")))
for stim_embedding in embedding_paths:
    filename = Path(stim_embedding).stem
    # the part of the file name before "_model-<model name>" is the stimulus id
    stimulus_name = filename.split(f"_model-{image_embedding_model}")[0]
    # Last character of the text following any ".jpg" in the name. The [-1:] slice yields
    # '' (instead of raising IndexError) when that text is empty, so the case is reported
    # through the ValueError below.
    dataset_letter = stimulus_name.split('.jpg')[-1][-1:]
    if dataset_letter not in things_subsets:
        raise ValueError(f"dataset_letter {dataset_letter} not recognized")
    # keep only the first entry along the leading axis
    # (presumably the file stores a (1, dim) array -- TODO confirm)
    embedding_vector = np.load(stim_embedding)[0]
    # Append only after validation and loading succeeded; previously three columns were
    # appended before the ValueError could fire, leaving all_data with ragged columns.
    all_data['stimulus_id'].append(stimulus_name)
    all_data['label'].append('training') # TODO: separate training and testing images
    all_data['embeddings'].append(embedding_vector)
    all_data['dataset_id'].append(things_subsets[dataset_letter])


In [None]:
# Turn the column-wise accumulator into a table, then stack the per-row embedding
# arrays vertically into a single matrix for scaling and projection.
df = pd.DataFrame(data=all_data)
X = np.vstack(list(df['embeddings']))
print(X.shape)

In [34]:
# Standardize each embedding dimension (fit the scaler on X, then apply it to X).
X_scaled = StandardScaler().fit(X).transform(X)

In [None]:
# Reduce the standardized embeddings to two dimensions with UMAP (seed fixed at 42).
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,
)
embedding = reducer.fit_transform(X_scaled)

In [None]:
# Add UMAP projections to DataFrame
df['UMAP1'] = embedding[:, 0]
df['UMAP2'] = embedding[:, 1]

# Plotting: colour encodes the dataset, marker style encodes the training/testing label.
# 'tab10' only provides 10 distinct colours; with more dataset ids than that the palette
# is cycled and several datasets would share a colour, so switch to the 20-colour
# 'tab20' palette in that case. With 10 or fewer ids the original palette is kept.
n_datasets = df['dataset_id'].nunique()
palette_name = 'tab10' if n_datasets <= 10 else 'tab20'

plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x='UMAP1', y='UMAP2', hue='dataset_id', style='label', palette=palette_name, s=10, alpha=0.6)

plt.title('UMAP Projection')
plt.show()