In [None]:
from dotenv import load_dotenv
load_dotenv()
import os
import numpy as np
import sys
sys.path.append(os.getenv('PYTHONPATH')) 
import matplotlib.pyplot as plt
import json
import seaborn as sns

from torchvision.transforms import v2
from pathlib import Path

#local
from src.utils.helpers import FilterDataset
from src.utils.dataset import FMRIDataset
from src.utils.transforms import SelectROIs

In [None]:
# ---------------------------------------------------------------------------
# Configuration: paths, ROI selection, split definitions and subject list.
# ---------------------------------------------------------------------------
save_flag = False
trialselection = 'all'  # alternative: 'average'

# Dataset location is taken from DATASETS_ROOT (placeholder fallback if unset).
root = os.path.join(os.getenv("DATASETS_ROOT", "/default/path/to/datasets"), "MOSAIC")

# PROJECT_ROOT has no fallback: fail immediately with a readable message instead
# of the opaque TypeError that os.path.join(None) would raise.
project_root = os.getenv("PROJECT_ROOT")
if project_root is None:
    raise RuntimeError(
        "PROJECT_ROOT environment variable is not set; "
        "define it in the environment or in the .env file read by load_dotenv()."
    )
print(f"root: {root}")
print(f"project root: {project_root}")

# ROIs passed to SelectROIs. Alternatives used previously: ["LO1", "LO2"] or ["V1"].
rois = ["GlasserGroup_1", "GlasserGroup_2", "GlasserGroup_3","GlasserGroup_4", "GlasserGroup_5"]
ROI_selection = SelectROIs(selected_rois = rois)
fmri_tsfm = None  # alternative: v2.Compose([ToTensorfMRI(dtype='float32')])

# Only consulted when the noise ceiling is enabled: together they form the
# "<subject>_phase-<phase>_n-<n>" key used to look up the noise ceiling.
phase = 'test'
n = 'avg'


def _load_split(split_name):
    """Parse and return the contents of ``<root>/<split_name>.json``."""
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(os.path.join(root, f"{split_name}.json"), 'r', encoding='utf-8') as split_file:
        return json.load(split_file)


train_all = _load_split('train_naturalistic')
test_all = _load_split('test_naturalistic')
artificial_all = _load_split('test_artificial')

# Number of subjects per dataset; subject IDs are built as "sub-XX_<dataset>",
# numbered from 1 with two-digit zero padding.
numsubs = {"BOLD5000": 4, "NSD": 8, "BMD": 10, "THINGS": 3, "NOD": 30, "HAD": 30, "GOD": 5, "deeprecon": 3}
all_subjects = [
    f"sub-{sub_idx:02}_{dset_name}"
    for dset_name, n_subjects in numsubs.items()
    for sub_idx in range(1, n_subjects + 1)
]


In [None]:
# ---------------------------------------------------------------------------
# Per-subject sanity check of the naturalistic train/test splits:
# gather each subject's fMRI responses per split, print summary statistics
# and plot a histogram of all values for train and test side by side.
# ---------------------------------------------------------------------------
cols = ['fmri','stimulus_filename', 'subject_id', 'dataset_id']
eval_sets = ['train_naturalistic', 'test_naturalistic']

for subjectID in all_subjects:
    # Fresh accumulators for this subject: one list per column per split.
    all_data = {eval_set: {col: [] for col in cols} for eval_set in eval_sets}
    config = {"fmri": {"dataset_include": None,
                    "subject_include": [subjectID],
                    "use_noiseceiling": False}}
    dataset_preprocessing = FilterDataset(config['fmri']['subject_include'],
                                        config['fmri']['dataset_include'],
                                        config['fmri']['use_noiseceiling'])

    # Restrict both splits to the current subject.
    train, subjectID_mapping_train = dataset_preprocessing.filter_splits(train_all)
    test, subjectID_mapping_test = dataset_preprocessing.filter_splits(test_all)
    all_subjects_dict = {**subjectID_mapping_train, **subjectID_mapping_test}
    all_subjects_list_loop = list(all_subjects_dict.keys())
    # The filter must leave exactly the requested subject.
    assert(len(all_subjects_list_loop) == 1 and all_subjects_list_loop[0] == subjectID)

    # Each split entry is a mapping whose first key is a stimulus path; its stem
    # is the identifier compared against "<subject>_stimulus-<name>" below.
    stimuli_list = {'train_naturalistic': [Path(list(stim.keys())[0]).stem for stim in train],
                    'test_naturalistic': [Path(list(stim.keys())[0]).stem for stim in test]}
    # Set views give constant-time membership tests; scanning the lists for every
    # stimulus would make the loop below quadratic.
    stimuli_lookup = {eval_set: set(stems) for eval_set, stems in stimuli_list.items()}

    dataset = FMRIDataset(test, ROI_selection, config['fmri']['use_noiseceiling'], trialselection, fmri_transforms=fmri_tsfm)

    sample = dataset.load_responses_block_hdf5(subjectID, verbose=True)
    stimuli = sample['stimulus_filename']
    for idx, stim in enumerate(stimuli):
        fmri = sample['fmri'][idx,:]
        subject_stim = f"{subjectID}_stimulus-{stim}"
        for eval_set in eval_sets:
            if subject_stim in stimuli_lookup[eval_set]:
                if config['fmri']['use_noiseceiling']:
                    noiseceiling = sample['noiseceiling'][f"{subjectID}_phase-{phase}_n-{n}"]
                    all_data[eval_set]['fmri'].append(fmri*noiseceiling)
                else:
                    all_data[eval_set]['fmri'].append(fmri)
                all_data[eval_set]['stimulus_filename'].append(stim)
                all_data[eval_set]['subject_id'].append(subjectID)
                all_data[eval_set]['dataset_id'].append(subjectID.split('_')[-1])
                # No need to check other eval sets. Note that some stimuli got
                # removed by the filtering, so they are not part of any eval set.
                break

    # np.vstack raises an uninformative ValueError on an empty list; raise the
    # same exception type with the subject and split named instead.
    for eval_set in eval_sets:
        if not all_data[eval_set]['fmri']:
            raise ValueError(f"{subjectID}: no fMRI responses matched the '{eval_set}' split")

    # One row per matched stimulus.
    train_data = np.vstack(all_data['train_naturalistic']['fmri'])
    test_data = np.vstack(all_data['test_naturalistic']['fmri'])

    print(f"{'*' * 10} {subjectID} stats {'*' * 10}")
    print(f"Train set np.mean(np.mean(train_data, axis=1)): {np.mean(np.mean(train_data, axis=1))}")
    print(f"Train set np.mean(np.std(train_data, axis=1)): {np.mean(np.std(train_data, axis=1))}")
    print(f"Train set min ({np.min(train_data)}) and max ({np.max(train_data)})")
    print(f"Test set np.mean(np.mean(test_data, axis=1)): {np.mean(np.mean(test_data, axis=1))}")
    print(f"Test set np.mean(np.std(test_data, axis=1)): {np.mean(np.std(test_data, axis=1))}")
    print(f"Test set min ({np.min(test_data)}) and max ({np.max(test_data)})")

    # Global (all-values) statistics shown in the subplot titles.
    train_mean = np.mean(train_data)
    train_std = np.std(train_data)
    test_mean = np.mean(test_data)
    test_std = np.std(test_data)

    # Create a figure with two subplots side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Plot histogram for train data
    ax1.hist(train_data.ravel(), bins=50, alpha=0.7, color='blue')
    ax1.set_title(f'Training Data Distribution\nMean: {train_mean:.4f}, Std: {train_std:.4f}')
    ax1.set_xlabel('Value')
    ax1.set_ylabel('Frequency')

    # Plot histogram for test data
    ax2.hist(test_data.ravel(), bins=50, alpha=0.7, color='orange')
    ax2.set_title(f'Testing Data Distribution\nMean: {test_mean:.4f}, Std: {test_std:.4f}')
    ax2.set_xlabel('Value')
    ax2.set_ylabel('Frequency')

    # Add a main title
    fig.suptitle(f'{subjectID} Distribution of fMRI Values', fontsize=16)

    # Adjust layout, leaving room at the top for the suptitle
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)

    # Show the plot, then release the figure so memory does not grow across subjects
    plt.show()
    plt.close(fig)


In [None]:
# ---------------------------------------------------------------------------
# Per-subject sanity check of the artificial test split (NSD and deeprecon
# subjects only): gather the fMRI responses, print summary statistics and
# plot a histogram of all values.
# ---------------------------------------------------------------------------
cols = ['fmri','stimulus_filename', 'subject_id', 'dataset_id']
eval_sets = ['test_artificial']

for subjectID in all_subjects:
    # The dataset name is the suffix after the last underscore of the subject ID.
    dset = subjectID.split('_')[-1]
    if dset not in ['NSD', 'deeprecon']:
        continue
    # Fresh accumulators for this subject: one list per column per split.
    all_data = {eval_set: {col: [] for col in cols} for eval_set in eval_sets}
    config = {"fmri": {"dataset_include": None,
                    "subject_include": [subjectID],
                    "use_noiseceiling": False}}
    dataset_preprocessing = FilterDataset(config['fmri']['subject_include'],
                                        config['fmri']['dataset_include'],
                                        config['fmri']['use_noiseceiling'])

    # Restrict the artificial split to the current subject.
    test_art, subjectID_mapping_test_art = dataset_preprocessing.filter_splits(artificial_all)
    all_subjects_dict = {**subjectID_mapping_test_art}
    all_subjects_list_loop = list(all_subjects_dict.keys())
    # The filter must leave exactly the requested subject.
    assert(len(all_subjects_list_loop) == 1 and all_subjects_list_loop[0] == subjectID)

    # Each split entry is a mapping whose first key is a stimulus path; its stem
    # is the identifier compared against "<subject>_stimulus-<name>" below.
    stimuli_list = {'test_artificial': [Path(list(stim.keys())[0]).stem for stim in test_art]}
    # Set views give constant-time membership tests; scanning the list for every
    # stimulus would make the loop below quadratic.
    stimuli_lookup = {eval_set: set(stems) for eval_set, stems in stimuli_list.items()}

    dataset = FMRIDataset(test_art, ROI_selection, config['fmri']['use_noiseceiling'], trialselection, fmri_transforms=fmri_tsfm)

    sample = dataset.load_responses_block_hdf5(subjectID, verbose=True)
    stimuli = sample['stimulus_filename']
    for idx, stim in enumerate(stimuli):
        fmri = sample['fmri'][idx,:]
        subject_stim = f"{subjectID}_stimulus-{stim}"
        for eval_set in eval_sets:
            if subject_stim in stimuli_lookup[eval_set]:
                if config['fmri']['use_noiseceiling']:
                    noiseceiling = sample['noiseceiling'][f"{subjectID}_phase-{phase}_n-{n}"]
                    all_data[eval_set]['fmri'].append(fmri*noiseceiling)
                else:
                    all_data[eval_set]['fmri'].append(fmri)
                all_data[eval_set]['stimulus_filename'].append(stim)
                all_data[eval_set]['subject_id'].append(subjectID)
                all_data[eval_set]['dataset_id'].append(subjectID.split('_')[-1])
                # No need to check other eval sets. Note that some stimuli got
                # removed by the filtering, so they are not part of any eval set.
                break

    # np.vstack raises an uninformative ValueError on an empty list; raise the
    # same exception type with the subject and split named instead.
    if not all_data['test_artificial']['fmri']:
        raise ValueError(f"{subjectID}: no fMRI responses matched the 'test_artificial' split")

    # One row per matched stimulus.
    test_art_data = np.vstack(all_data['test_artificial']['fmri'])

    print(f"{'*' * 10} {subjectID} stats {'*' * 10}")
    print(f"Test art set np.mean(np.mean(test_art_data, axis=1)): {np.mean(np.mean(test_art_data, axis=1))}")
    print(f"Test art set np.mean(np.std(test_art_data, axis=1)): {np.mean(np.std(test_art_data, axis=1))}")
    print(f"Test art set min ({np.min(test_art_data)}) and max ({np.max(test_art_data)})")

    # Global (all-values) statistics shown in the plot title.
    test_art_mean = np.mean(test_art_data)
    test_art_std = np.std(test_art_data)

    # Create a figure with a single axes
    fig, ax1 = plt.subplots(1, 1, figsize=(14, 5))

    # Plot histogram for the artificial test data
    ax1.hist(test_art_data.ravel(), bins=50, alpha=0.7, color='orange')
    ax1.set_title(f'{subjectID} Artificial Testing Data Distribution\nMean: {test_art_mean:.4f}, Std: {test_art_std:.4f}')
    ax1.set_xlabel('Value')
    ax1.set_ylabel('Frequency')

    # Adjust layout
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)

    # Show the plot, then release the figure so memory does not grow across subjects
    plt.show()
    plt.close(fig)
