### Visualize stimulus embeddings from multiple datasets to examine their diversity/clustering and test/train differences

In [None]:
from dotenv import load_dotenv
load_dotenv()
import numpy as np
import os
import sys
sys.path.append(os.getenv('PYTHONPATH')) 
import json
import pandas as pd
from PIL import Image
import umap.umap_ as umap
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

from torchvision.transforms import v2

#local
from src.utils.dataset import FMRIDataset
from src.utils.transforms import SelectROIs
from src.utils.helpers import FilterDataset

In [None]:
# Housekeeping: column schema for the gathered responses, data/project roots,
# ROI names, and the dataset filter configuration.
cols = ['fmri','stimulus_filename', 'subject_id', 'dataset_id']
all_data = {name: [] for name in cols}

# DATASETS_ROOT falls back to a placeholder path when the variable is unset.
datasets_root = os.getenv("DATASETS_ROOT", "/default/path/to/datasets")
root = os.path.join(datasets_root, "MOSAIC")
# NOTE: PROJECT_ROOT has no fallback; os.path.join(None) raises TypeError when
# the variable is not set.
project_root = os.path.join(os.getenv("PROJECT_ROOT"))

# ROI names GlasserGroup_1 .. GlasserGroup_5
rois = [f"GlasserGroup_{group}" for group in range(1, 6)]

fmri_config = {
    "dataset_include": ['GOD','deeprecon'],
    "subject_include": None,
    "use_noiseceiling": True,
}
config = {"fmri": fmri_config}

In [None]:
# Load the train/val and test split JSONs, keep only the entries selected by
# `config`, and shuffle each split reproducibly.
with open(os.path.join(root, 'train_naturalistic.json'), 'r', encoding='utf-8') as f:
    train_val_all = json.load(f)
with open(os.path.join(root, 'test_naturalistic.json'), 'r', encoding='utf-8') as f:
    test_all = json.load(f)

dataset_preprocessing = FilterDataset(config)
train_val = dataset_preprocessing.filter_splits(train_val_all)
test = dataset_preprocessing.filter_splits(test_all)
subjectID_mapping = dataset_preprocessing.subjectID_map()
# Inverse of subjectID_mapping: mapped value -> original key.
idx_to_subjectID = {v: k for k, v in subjectID_mapping.items()}

# Use a seeded generator so the shuffled order (and any debugging subset taken
# from it) is identical on every run; an unseeded shuffle would change the row
# order fed to the downstream embedding each time the notebook is executed.
shuffle_rng = np.random.default_rng(42)

shuffled_indices_train_val = shuffle_rng.permutation(len(train_val))
train_val = [train_val[i] for i in shuffled_indices_train_val]

shuffled_indices_test = shuffle_rng.permutation(len(test))
test = [test[i] for i in shuffled_indices_test]
n = len(test)
# For quick debugging, uncomment to work on a small subset of each split:
#n=200
#train_val = train_val[:n]
#test = test[:n]

In [None]:
# Wrap the ROI selection in a transform pipeline and build the dataset from
# the (shuffled) test split.
roi_selection = SelectROIs(selected_rois=rois)
fmri_tsfm = v2.Compose([roi_selection])
dataset = FMRIDataset(
    test,
    config['fmri']['use_noiseceiling'],
    'average',  # NOTE(review): meaning of this mode is defined by FMRIDataset — confirm there
    subjectID_mapping=subjectID_mapping,
    fmri_transforms=fmri_tsfm,
)

In [None]:
# Flatten the dataset into one record per individual response: every stimulus
# contributes one row for each response returned by get_all_responses().
cols = ['fmri','stimulus_filename', 'subject_id', 'dataset_id']
all_data = {name: [] for name in cols}

# Read the flag once; when set, each response is multiplied by its noise ceiling.
weight_by_noiseceiling = config['fmri']['use_noiseceiling']

for idx in tqdm(range(len(dataset)), total=len(dataset), desc="Gathering responses from dataset"):
    sample = dataset.get_all_responses(idx)
    stimulus_filename = sample['stimulus_filename']
    n_responses = len(sample['fmri'])

    for s in range(n_responses):
        response = sample['fmri'][s]
        # 'noiseceiling' is only read when the flag is on.
        weighted = response * sample['noiseceiling'][s] if weight_by_noiseceiling else response

        # Map the integer subject index back to its string label; the dataset
        # id is taken as the last '_'-separated token of that label.
        subject_label = idx_to_subjectID[sample['subjectID'][s]]

        all_data['fmri'].append(weighted)
        all_data['stimulus_filename'].append(stimulus_filename)
        all_data['subject_id'].append(subject_label)
        all_data['dataset_id'].append(subject_label.split('_')[-1])

In [None]:
# One DataFrame row per response; stack the per-response fMRI vectors
# vertically into a single matrix X for the embedding step.
df = pd.DataFrame(all_data)
fmri_rows = df['fmri'].to_numpy()
X = np.vstack(fmri_rows)
print(X.shape)

In [None]:
# Standardize each feature column of X, then project to 2-D with UMAP.
print("Scaling...")
standardizer = StandardScaler()
X_scaled = standardizer.fit_transform(X)

print("UMAP reduction...")
# random_state is fixed so the projection is repeatable for a given input.
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,
)
X_umap = reducer.fit_transform(X_scaled)

In [None]:
# Store the two UMAP coordinates on the DataFrame so they can be plotted and
# exported alongside the metadata.
df['X'], df['Y'] = X_umap[:, 0], X_umap[:, 1]

# Scatter plot of all responses, coloured by the dataset they come from.
plt.figure(figsize=(10, 8))
scatter_ax = sns.scatterplot(
    data=df,
    x='X',
    y='Y',
    hue='dataset_id',
    palette='tab10',
    s=10,
    alpha=0.6,
)
scatter_ax.set_title('UMAP Projection')
plt.show()

In [None]:
# Draw every response at its UMAP position as a thumbnail of its stimulus
# image, outlined in a colour that identifies the subject, then save the
# figure to disk.
# Requires: df with 'stimulus_filename', 'subject_id', 'X', 'Y' columns, and
# X_umap holding the 2-D UMAP coordinates.
unique_subjects = df['subject_id'].unique()
palette = sns.color_palette('tab10', len(unique_subjects))
color_map = dict(zip(unique_subjects, palette))

# Set up the figure
fig, ax = plt.subplots(figsize=(10, 8))

# Midpoint of the UMAP coordinates along each axis
x_center = (np.max(X_umap[:, 0]) + np.min(X_umap[:, 0])) / 2
y_center = (np.max(X_umap[:, 1]) + np.min(X_umap[:, 1])) / 2
# Factor that spreads the UMAP coordinates out so thumbnails overlap less.
# NOTE(review): the denominator is |max| + |min| over both axes, which equals
# the coordinate range only when min <= 0 <= max — confirm this is intended.
stretch = np.floor(700 / (np.abs(np.max(X_umap.ravel())) + np.abs(np.min(X_umap.ravel()))))
print("stretch:", stretch)
scaler = 0.02 #adjust according to the resolution of the image and number of images you are plotting

stimuli_dir = os.path.join(root, "stimuli", "stimuli_compressed_quality-95_size-224")

# Track minimum and maximum x and y for setting axis limits later
min_x, max_x = np.inf, -np.inf
min_y, max_y = np.inf, -np.inf
print("looping over dataframe rows...")
# Iterate only over the columns that are needed; iterrows() would build a full
# Series (including the large 'fmri' array) for every row.
for stimulus_filename, label, x, y in zip(df['stimulus_filename'], df['subject_id'], df['X'], df['Y']):
    img_path = os.path.join(stimuli_dir, stimulus_filename)
    color = color_map[label]  # Border colour for this row's subject_id

    x_new = x_center + x*stretch
    y_new = y_center + y*stretch

    # Load the stimulus image as floats in [0, 1]; the context manager closes
    # the file as soon as the pixel data has been copied out.
    with Image.open(img_path) as stimulus:
        img = np.array(stimulus).astype(np.float64) /255

    # Thumbnail size in plot units
    width = scaler * img.shape[1]
    height = scaler * img.shape[0]

    # Update min/max coordinates to accommodate the image extent
    min_x = min(min_x, x_new)
    max_x = max(max_x, x_new + width)
    min_y = min(min_y, y_new)
    max_y = max(max_y, y_new + height)

    # Add a border using plt.Rectangle with the subject_id's color
    rect = plt.Rectangle((x_new, y_new), width, height,
                         linewidth=1, edgecolor=color, facecolor='none', zorder=1)
    ax.add_patch(rect)

    ax.imshow(img, extent=[x_new, x_new + width, y_new, y_new + height], zorder=2)

# imshow() resets the axis limits on every call, so set the final limits once
# from the tracked bounds after all thumbnails have been drawn.
padding = scaler * 80  # Adjust based on image size
ax.set_xlim(min_x - padding, max_x + padding)
ax.set_ylim(min_y - padding, max_y + padding)
ax.set_aspect('equal')
ax.set_axis_off()

# Create legend patches for each subject_id
legend_patches = [Patch(color=color_map[subject], label=subject) for subject in unique_subjects]
# Add the legend outside the plot
plt.legend(handles=legend_patches, title="Subject ID", bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# Adjust layout to make space for the legend
plt.subplots_adjust(right=0.8)

print("saving plot...")
plot_fname = f"ROIs-{('-').join(rois)}_subjects-{('-').join(dataset_preprocessing.subjects_to_include)}_usenoiseceiling-{config['fmri']['use_noiseceiling']}_n-{n}_umap.png"
save_root = os.path.join(project_root, "src", "fmriDatasetPreparation", "visualizations", "fmri_embedding_tsne")
# exist_ok avoids the check-then-create race and is a no-op if the directory exists.
os.makedirs(save_root, exist_ok=True)
plt.savefig(os.path.join(save_root, plot_fname), dpi=300)
plt.show()
plt.clf()

In [None]:
# Save the per-response metadata and UMAP coordinates as JSON for viewing in
# a webpage.
# The raw fMRI vectors are dropped so the export only holds filenames, ids and
# coordinates. errors='ignore' keeps this cell re-runnable: without it a
# second run raises KeyError because 'fmri' has already been removed in place.
df.drop('fmri', axis=1, inplace=True, errors='ignore')
json_filename = f"ROIs-{('-').join(rois)}_n-{n}_umap.json"

# Make sure the project assets directory exists before writing into it.
assets_dir = os.path.join(project_root, "assets")
os.makedirs(assets_dir, exist_ok=True)
df.to_json(os.path.join(assets_dir, json_filename), orient='records', lines=False)

# NOTE(review): hard-coded, machine-specific output location; this write fails
# on any machine where the directory does not exist. Consider deriving it from
# an environment variable like the other roots.
df.to_json(os.path.join("/data/vision/oliva/blahner/projects/BrainEmbedder/data/umap", json_filename), orient='records', lines=False)

In [None]:
#save as json for viewing in webpage
