In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
import sys
sys.path.append(os.getenv('PYTHONPATH'))
import numpy as np
import matplotlib.pyplot as plt
import torch

import pickle
import hcp_utils as hcp
from nilearn import plotting
import json
import pandas as pd
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict
from torchvision.transforms import v2

#local imports
from src.utils.transforms import SelectROIs, ToTensorfMRI
from src.utils.helpers import FilterDataset, vectorized_correlation
from src.utils.dataset import FMRIDataset

pixdim[1,2,3] should be non-zero; setting 0 dims to 1


In [8]:
# --- Configuration -----------------------------------------------------------
# Root locations are read from the environment (load_dotenv() is called in the
# import cell). The literal fallbacks are placeholders used only when the
# corresponding variable is unset.
project_root = os.getenv("PROJECT_ROOT", "/default/path/to/datasets")  # fallback if PROJECT_ROOT env variable is not set
dataset_root = os.path.join(os.getenv("DATASETS_ROOT", "/default/path/to/datasets"), "MOSAIC")  # fallback if DATASETS_ROOT env variable is not set

# ROI names GlasserGroup_1 ... GlasserGroup_5 (range upper bound is exclusive).
rois = [f"GlasserGroup_{x}" for x in range(1, 6)]
ROI_selection = SelectROIs(selected_rois=rois)
#NSDsubjects = [f"sub-{x:02}_NSD" for x in range(1,9)]
subjectID = 'sub-01_NSD'

# Image pipeline: ToImage + ToDtype(scale=True) is the torchvision-documented
# replacement for the deprecated v2.ToTensor(), which was previously listed
# between these two transforms.
img_tsfm = v2.Compose([v2.ToImage(),
                       v2.ToDtype(torch.float32, scale=True)
                       ])
fmri_tsfm = v2.Compose([ToTensorfMRI(dtype='float32')])
#dataset = FMRIDataset(None, ROI_selection, use_noiseceiling=True, trial_selection='average', fmri_transforms=fmri_tsfm, img_transforms=img_tsfm)

# Output directory for the exported per-subject file. exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race.
save_root = os.path.join(project_root, "src", "utils", "output", "data_tmp")
os.makedirs(save_root, exist_ok=True)




In [9]:
#sample = dataset.load_responses_block_hdf5(subjectID, verbose=True, load_stimulus=True)

In [15]:
def _load_split(filename):
    """Read one split-definition JSON file located directly under ``dataset_root``."""
    with open(os.path.join(dataset_root, filename), 'r') as f:
        return json.load(f)


# Unfiltered split definitions as stored on disk.
train_val_naturalistic_all = _load_split('train_naturalistic.json')
test_naturalistic_all = _load_split('test_naturalistic.json')
test_artificial_all = _load_split('test_artificial.json')

# One FilterDataset per split, all configured identically for the single
# subject selected in the configuration cell.
dataset_preprocessing_train_val_naturalistic = FilterDataset(subject_include=[subjectID],
                                                dataset_include=None,
                                                use_noiseceiling=True)

dataset_preprocessing_test_naturalistic = FilterDataset(subject_include=[subjectID],
                                                dataset_include=None,
                                                use_noiseceiling=True)

dataset_preprocessing_test_artificial = FilterDataset(subject_include=[subjectID],
                                                dataset_include=None,
                                                use_noiseceiling=True)

# filter_splits returns a pair: the filtered split and a subject-ID mapping.
train_val_naturalistic, subjectID_mapping_train_val_naturalistic = dataset_preprocessing_train_val_naturalistic.filter_splits(train_val_naturalistic_all)
test_naturalistic, subjectID_mapping_test_naturalistic = dataset_preprocessing_test_naturalistic.filter_splits(test_naturalistic_all)
test_artificial, subjectID_mapping_test_artificial = dataset_preprocessing_test_artificial.filter_splits(test_artificial_all)

# The naturalistic train and test splits must cover exactly the same subjects.
# The comparison is symmetric so that a subject present in only one of the two
# splits is reported, whichever side it is missing from.
train_subject_ids = set(subjectID_mapping_train_val_naturalistic.keys())
test_subject_ids = set(subjectID_mapping_test_naturalistic.keys())
assert train_subject_ids == test_subject_ids, (
    "Training and testing subject filters should return the same set of subjects; "
    f"only in train: {sorted(train_subject_ids - test_subject_ids)}, "
    f"only in test: {sorted(test_subject_ids - train_subject_ids)}"
)

# Order subjects by the text after the first underscore, then by the integer
# after the last '-' in the part before it, e.g. 'sub-01_NSD' -> ('NSD', 1).
training_subjects = sorted(list(subjectID_mapping_train_val_naturalistic.keys()),key=lambda x: (x.split('_')[1], int(x.split('_')[0].split('-')[-1])))

# Keyword arguments shared by the three FMRIDataset instances below.
fmri_dataset_kwargs = dict(use_noiseceiling=False,
                           trial_selection='average',
                           img_transforms=img_tsfm,
                           fmri_transforms=fmri_tsfm)

dataset_train_val_naturalistic = FMRIDataset(train_val_naturalistic, ROI_selection, **fmri_dataset_kwargs)
dataset_test_naturalistic = FMRIDataset(test_naturalistic, ROI_selection, **fmri_dataset_kwargs)
dataset_test_artificial = FMRIDataset(test_artificial, ROI_selection, **fmri_dataset_kwargs)


In [16]:
# Accumulator for everything exported for this subject: one list of fMRI
# entries and one list of stimulus entries per split, the subject's
# noise ceilings, and a list of subject IDs.
subDict = {
    'fmri_train_naturalistic': [],
    'fmri_test_naturalistic': [],
    'fmri_test_artificial': [],
    'stimulus_train_naturalistic': [],
    'stimulus_test_naturalistic': [],
    'stimulus_test_artificial': [],
    # Read from the train/val dataset object; the original note says any of
    # the dataset objects could be used here.
    'noiseceilings': dataset_train_val_naturalistic.subject_noiseceilings[subjectID],
    'subjectID': [],
}

# (fMRI key, stimulus key, source dataset) for each split, in processing order.
split_sources = (
    ('fmri_train_naturalistic', 'stimulus_train_naturalistic', dataset_train_val_naturalistic),
    ('fmri_test_naturalistic', 'stimulus_test_naturalistic', dataset_test_naturalistic),
    ('fmri_test_artificial', 'stimulus_test_artificial', dataset_test_artificial),
)

# NOTE(review): 'subjectID' receives one entry per sample from *every* split,
# so its length is the sum of the three split sizes rather than matching any
# single fmri_*/stimulus_* list. Confirm this is what downstream code expects.
for fmri_key, stimulus_key, split_dataset in split_sources:
    for idx in tqdm(range(len(split_dataset)), total=len(split_dataset)):
        sample = split_dataset[idx]
        subDict[fmri_key].append(sample['fmri'])
        subDict[stimulus_key].append(sample['stimulus'])
        subDict['subjectID'].append(subjectID)


100%|██████████| 8954/8954 [11:29<00:00, 12.99it/s]
100%|██████████| 1000/1000 [01:20<00:00, 12.49it/s]
100%|██████████| 284/284 [00:15<00:00, 18.13it/s]


In [17]:
# Write the collected per-subject dictionary to disk with torch.save.
# The saved object is `subDict` as built in the previous cell: per-split
# 'fmri_*' and 'stimulus_*' lists plus 'noiseceilings' and 'subjectID'.

#subDict = {'image': sample['stimulus'], 'fmri': sample['fmri'], 'subjectID': sample['subjectID']}
output_path = os.path.join(save_root, f"{subjectID}_rois-GG1_5.pth")
torch.save(subDict, output_path)
