In [None]:
from dotenv import load_dotenv
load_dotenv()
import os
import pandas as pd
import glob

### Validate that each ImageNet session shows exactly one unique image from each of the 1000 categories.

In [None]:
# Resolve the dataset and project locations from the environment; the
# placeholder defaults are used when the variables are not set.
dataset_root = os.path.join(
    os.getenv("DATASETS_ROOT", "/default/path/to/datasets"),
    "NaturalObjectDataset",
)
project_root = os.getenv("PROJECT_ROOT", "/default/path/to/project")
fmri_path = os.path.join(dataset_root, "derivatives", "GLM")
task = 'imagenet'
print(f"dataset_root: {dataset_root}")
print(f"project_root: {project_root}")

# Define the stimulus order first; the betas are later placed into a matrix
# that follows this pre-defined order.
# Each line of the synset file starts with an 'n*' code followed by a space
# and the human-readable labels; only the code (text before the first space)
# is kept. The order of the codes does not matter.
synset_path = os.path.join(
    dataset_root,
    "derivatives",
    "stimuli_metadata",
    "testtrain_split",
    "synset_words_edited.txt",
)
with open(synset_path, 'r') as synset_file:
    imagenet_names = [entry.strip().split(' ', 1)[0] for entry in synset_file]

# The synset file must list exactly 1000 entries.
assert len(imagenet_names) == 1000

# Big dictionary that will hold all the beta estimates from the subjects.
subject_betas = {}

nvertices = 91282  # presumably the number of vertices per beta map -- TODO confirm
session_group = "sessiongroup-01"

In [None]:
# Validate the ImageNet stimuli listed in the events files of every subject:
#   * each imagenet session must contain 1000 unique images and 1000 unique categories,
#   * subjects 1-9 must total 4000 unique images across their sessions, all others 1000,
#   * all subjects together must total 57000 unique images over 1000 categories.
# Relies on `dataset_root` being defined by an earlier cell.
allsubject_images = set() #keep track of all images shown across subjects and sessions
allsubject_categories = set() #keep track of all categories shown across subjects and sessions

for sub in range(1,31):
    subject = f"sub-{sub:02}"
    print('*'*20)
    print(f"starting subject {subject}")

    #search over all sessions to find the ones that include runs of the specified task
    subject_dir = os.path.join(dataset_root, "derivatives", "fmriprep", subject)
    sessions_tmp = sorted(glob.glob(os.path.join(subject_dir, "*imagenet*"))) + sorted(glob.glob(os.path.join(subject_dir, "*coco*")))  #compile all the session numbers
    assert len(sessions_tmp) > 0, f"no imagenet/coco session folders found in {subject_dir}"

    #keep only the folder names and drop any imagenet05 session
    sessions = []
    for session_dir in sessions_tmp:
        sname = os.path.basename(session_dir)
        if "imagenet05" not in sname:
            sessions.append(sname)
    assert len(sessions) > 0, f"no usable sessions left for {subject} after excluding imagenet05"
    print(f"Found {len(sessions)} sessions")
    print(f"{sessions}")

    allsessions_images = set() #keep track of all images shown across sessions
    allsessions_categories = set() #keep track of all categories shown across sessions

    for session in sessions:
        allruns_images = set()  #keep track of all images shown across the runs in a session for one subject
        allruns_categories = set() #keep track of all categories shown across the runs in a session for one subject
        if 'coco' in session:
            continue #only imagenet sessions are validated here
        elif 'imagenet' in session:
            task='imagenet'
            events_stim_field = 'stim_file'
        else:
            raise ValueError("Invalid task name. Must be either coco or imagenet session.")

        #the number of runs is taken from the number of confounds files; nothing special about the confounds file choice
        numruns = len(glob.glob(os.path.join(dataset_root, "derivatives", "fmriprep", subject, session, "func", f"{subject}_{session}_task-{task}_run-*_desc-confounds_timeseries.tsv")))
        assert numruns > 0, f"no runs found for {subject} {session}"
        print(f"Found {numruns} runs for subject {subject} session {session}")

        ##Load the events of each run (runs are assumed to be numbered 1..numruns)
        for run in range(1,numruns+1):
            events = pd.read_table(os.path.join(dataset_root, "Nifti", subject, session, "func", f"{subject}_{session}_task-{task}_run-{run:02}_events.tsv"))
            for stim in events.loc[:,events_stim_field]:
                if pd.isna(stim): #blank trials may be listed as n/a rows with their own onsets; they carry no stimulus
                    continue
                stim = str(stim)
                if 'imagenet' not in stim:
                    #Only imagenet stimuli are counted. Previously a non-imagenet entry re-added the
                    #filename/category left over from an earlier trial (or raised NameError if none existed).
                    continue
                #stimulus path is assumed to look like <dataset>/<category>/<filename> -- TODO confirm
                stim_parts = stim.split('/')
                imagenet_category = stim_parts[1]
                imagenet_filename = stim_parts[2]
                allruns_images.add(imagenet_filename)
                allruns_categories.add(imagenet_category)
        print(f"Number of unique imagenet images in session {session}: {len(allruns_images)}")
        print(f"Number of unique imagenet categories in session {session}: {len(allruns_categories)}")
        assert len(allruns_images) == 1000, f"{subject} {session}: expected 1000 unique images, found {len(allruns_images)}"
        assert len(allruns_categories) == 1000, f"{subject} {session}: expected 1000 unique categories, found {len(allruns_categories)}"

        allsessions_images.update(allruns_images)
        allsessions_categories.update(allruns_categories)

    #subjects 1-9 are expected to total 4000 unique images across their sessions, the remaining subjects 1000
    expected_images = 4000 if sub < 10 else 1000
    assert len(allsessions_images) == expected_images, f"{subject}: expected {expected_images} unique images, found {len(allsessions_images)}"
    assert len(allsessions_categories) == 1000, f"{subject}: expected 1000 unique categories, found {len(allsessions_categories)}"

    allsubject_images.update(allsessions_images)
    allsubject_categories.update(allsessions_categories)

print(f"Number of unique imagenet images across all sessions: {len(allsubject_images)}")
print(f"Number of unique imagenet categories across all sessions: {len(allsubject_categories)}")
assert len(allsubject_images) == 57000, f"expected 57000 unique images across subjects, found {len(allsubject_images)}"
assert len(allsubject_categories) == 1000, f"expected 1000 unique categories across subjects, found {len(allsubject_categories)}"