# 03 - NSD Packaging Preparation

Build Brain-Score-ready assemblies with per-rep test data for InternalConsistency ceiling.

**Inputs:**
- Notebook 02 per-region averaged assemblies (`Allen2022.{V1,V2,V4,IT}.nc`)
- Raw NSD HDF5 betas (for per-rep test extraction)
- `nsd_stimuli.hdf5` (for stimulus images)

**Outputs** (to `/Volumes/Hagibis/nsd/brainscore/`):
- `Allen2022_fmri_train.nc` -- train assembly: (412, ~57K, 1)
- `Allen2022_fmri_test.nc` -- test assembly: (309, ~57K, 1) with repetition coord
- `stimuli_train/` and `stimuli_test/` -- extracted stimulus images
- `stimulus_metadata_{train,test}.csv`

**Key decision:** Test assembly keeps 3 reps separate (103 images x 3 = 309 presentations)
so that `InternalConsistency` can compute split-half ceiling, matching the Hebart2023 pattern.

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import h5py
import nibabel as nib
import scipy.io
from pathlib import Path
from PIL import Image
from tqdm.auto import tqdm
import time
import gc

# --- Paths ---
NSD_ROOT = Path('/Volumes/Hagibis/nsd')
ASSEMBLIES_DIR = NSD_ROOT / 'assemblies'  # where notebook 02 wrote its outputs
OUTPUT_DIR = NSD_ROOT / 'brainscore'
OUTPUT_DIR.mkdir(exist_ok=True)

# --- Subject configuration (keep in sync with NB02) ---
SUBJECT_LIST = [1, 2, 3, 4, 5, 6, 7, 8]
# Alternative subset: SUBJECT_LIST = [1, 2, 5, 7]

N_SUBJECTS = len(SUBJECT_LIST)
# Number of scan sessions per subject id
ALL_SESSIONS = {1: 40, 2: 40, 3: 32, 4: 30, 5: 40, 6: 32, 7: 40, 8: 30}
SESSIONS_PER_SUBJECT = {subj_id: ALL_SESSIONS[subj_id] for subj_id in SUBJECT_LIST}

TRIALS_PER_SESSION = 750
N_REPS = 3
REGIONS = ['V1', 'V2', 'V4', 'IT']

# --- ROI definitions (identical to notebook 02) ---
REGION_TO_PRF_LABELS = {'V1': [1, 2], 'V2': [3, 4], 'V4': [7]}
STREAMS_VENTRAL_LABEL = 5

# --- Train/test split written by notebook 02 ---
split_df = pd.read_csv(ASSEMBLIES_DIR / 'train_test_split.csv')
_is_train = split_df['split'] == 'train'
_is_test = split_df['split'] == 'test'

# Stimulus ids are kept as sets (membership tests); NSD ids as ascending lists.
train_ids = set(split_df.loc[_is_train, 'stimulus_id'].values)
test_ids = set(split_df.loc[_is_test, 'stimulus_id'].values)
test_nsd_ids = sorted(split_df.loc[_is_test, 'nsd_id'].values)
all_nsd_ids = sorted(split_df['nsd_id'].values)

print(f'Train: {len(train_ids)}, Test: {len(test_ids)}')
print(f'Output: {OUTPUT_DIR}')

## 1. Load Notebook 02 Averaged Assemblies

These contain per-session z-scored, rep-averaged betas for all 515 images.
We use them for: (a) train data, (b) global z-score statistics.

In [2]:
# Load per-region assemblies from notebook 02.
# xr.load_dataarray reads the full array into memory and closes the file,
# so no netCDF handle stays open for the rest of the notebook
# (open_dataarray + .load() would keep the handle alive).
nb02_assemblies = {}
for region in REGIONS:
    path = ASSEMBLIES_DIR / f'Allen2022.{region}.nc'
    da = xr.load_dataarray(str(path))
    nb02_assemblies[region] = da
    print(f'{region}: {da.shape}')

# All regions are expected to share the presentation axis; V1 is used as the reference.
N_USABLE_IMAGES = nb02_assemblies['V1'].sizes['presentation']
print(f'\nTotal images: {N_USABLE_IMAGES}')

V1: (515, 9039)
V2: (515, 8792)
V4: (515, 3982)
IT: (515, 35429)

Total images: 515


## 2. Extract Per-Rep Test Betas

Re-extract betas for the 103 test images, keeping the 3 repetitions separate.
Each session is z-scored per voxel across all of its trials, the same per-session z-scoring used in notebook 02.

In [None]:
# --- Data loading utilities (replicated from notebook 02) ---

def load_roi(subj: int, roi_name: str) -> np.ndarray:
    """Read one ROI label volume for a subject and return it with reversed axes.

    The file is ``subjXX/rois/<roi_name>.nii.gz`` under ``NSD_ROOT``. The
    volume is transposed, e.g. (81, 104, 83) -> (83, 104, 81), which is the
    spatial axis order noted for the beta arrays in ``load_session_betas``.
    """
    roi_file = NSD_ROOT / f'subj{subj:02d}' / 'rois' / f'{roi_name}.nii.gz'
    volume = nib.load(str(roi_file)).get_fdata()
    return volume.T


def load_session_betas(subj: int, session: int) -> np.ndarray:
    """Read every trial's beta volume for one session as float32.

    Loads the ``betas`` dataset from
    ``subjXX/betas/betas_sessionYY.hdf5`` under ``NSD_ROOT`` (stored as
    int16, shape (750, 83, 104, 81)) and divides by 300 after casting.
    """
    session_file = (
        NSD_ROOT / f'subj{subj:02d}' / 'betas' / f'betas_session{session:02d}.hdf5'
    )
    with h5py.File(str(session_file), 'r') as h5:
        raw = h5['betas'][:]  # whole dataset is read before the file closes
    as_float = raw.astype(np.float32)
    return as_float / 300.0


def get_roi_masks(subj: int) -> dict:
    """Build one boolean voxel mask per region for a subject.

    V1/V2/V4 are the voxels whose ``prf-visualrois`` label is listed in
    ``REGION_TO_PRF_LABELS``. IT is the union over both hemispheres of the
    voxels labelled ``STREAMS_VENTRAL_LABEL`` in the ``streams`` ROI files.
    Keys are inserted in the order V1, V2, V4, IT.
    """
    prf_labels = load_roi(subj, 'prf-visualrois')
    left_ventral = load_roi(subj, 'lh.streams') == STREAMS_VENTRAL_LABEL
    right_ventral = load_roi(subj, 'rh.streams') == STREAMS_VENTRAL_LABEL

    masks = {
        region: np.isin(prf_labels, label_values)
        for region, label_values in REGION_TO_PRF_LABELS.items()
    }
    masks['IT'] = left_ventral | right_ventral
    return masks


# --- Trial mapping (replicated from notebook 02) ---
# Arrays from the NSD experiment-design file, as consumed by get_shared_trial_info:
#   subjectim      -- indexed by 0-based subject; row of NSD ids for that subject
#   masterordering -- indexed by global trial number; 1-based image index per trial
#   sharedix       -- NSD ids of the images shared across subjects

_expdesign_path = NSD_ROOT / 'metadata' / 'nsd_expdesign.mat'
expdesign = scipy.io.loadmat(_expdesign_path)
subjectim = expdesign['subjectim']
masterordering = expdesign['masterordering'].flatten()
sharedix = expdesign['sharedix'].flatten()


def get_shared_trial_info(subj_idx: int, target_nsd_ids: set) -> pd.DataFrame:
    """Locate the trials on which a subject saw the requested shared images.

    Walks the subject's trials in presentation order (only the sessions listed
    in ``SESSIONS_PER_SUBJECT``) and records every presentation of a shared
    image whose 0-indexed NSD id is in ``target_nsd_ids``.

    Args:
        subj_idx: 0-based subject index (subject number minus one).
        target_nsd_ids: 0-indexed NSD ids to keep.

    Returns:
        DataFrame with one row per kept trial and the columns
        ``nsd_id`` (0-indexed), ``rep`` (0-based count of earlier
        presentations of that image to this subject), ``session`` (1-based)
        and ``trial_in_session`` (0-based). The columns are present even when
        no trial matches, so callers can always select them.
    """
    columns = ['nsd_id', 'rep', 'session', 'trial_in_session']

    n_sessions = SESSIONS_PER_SUBJECT[subj_idx + 1]
    n_total_trials = n_sessions * TRIALS_PER_SESSION
    subj_nsdids = subjectim[subj_idx]

    # masterordering refers to images by 1-based position in the subject's list.
    nsdid_to_imgidx = {
        int(nsd_id): position
        for position, nsd_id in enumerate(subj_nsdids, start=1)
    }
    shared_imgidxs = {
        nsdid_to_imgidx[int(nsd_id)]
        for nsd_id in sharedix
        if int(nsd_id) in nsdid_to_imgidx
    }

    records = []
    rep_counter = {}
    for trial_idx in range(n_total_trials):
        img_idx = int(masterordering[trial_idx])
        if img_idx not in shared_imgidxs:
            continue

        # Count every presentation of a shared image, kept or not, so that
        # `rep` always reflects the true presentation order.
        rep = rep_counter.get(img_idx, 0)
        rep_counter[img_idx] = rep + 1

        nsd_id = int(subj_nsdids[img_idx - 1]) - 1  # 0-indexed
        if nsd_id not in target_nsd_ids:
            continue

        session_zero_based, trial_in_session = divmod(trial_idx, TRIALS_PER_SESSION)
        records.append({
            'nsd_id': nsd_id,
            'rep': rep,
            'session': session_zero_based + 1,
            'trial_in_session': trial_in_session,
        })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(records, columns=columns)


# Status marker: reaching this line means the helper definitions and the
# experiment-design arrays earlier in this cell were evaluated without error.
print('Utilities loaded.')

In [None]:
# Extract per-rep test betas for all subjects, all regions
# Same per-session z-scoring as notebook 02
#
# For each subject, only the sessions that contain a test-image trial are
# loaded. Within a session every ROI voxel is z-scored using the mean/std over
# all trials of that session, and the test trials are written into
# subj_per_rep[region][image_row, rep].

test_nsd_id_set = set(test_nsd_ids)
n_test = len(test_nsd_ids)
# Row index of each test image = its rank among the ascending NSD ids
test_nsd_id_to_idx = {nsd_id: idx for idx, nsd_id in enumerate(sorted(test_nsd_ids))}

test_per_rep = {}  # {subj: {region: (n_test, 3, n_voxels)}}
all_masks = {}     # {subj: {region: mask}}

total_t0 = time.time()

for subj in SUBJECT_LIST:
    print(f'\nProcessing subj{subj:02d} ({SESSIONS_PER_SUBJECT[subj]} sessions)...')

    masks = get_roi_masks(subj)
    all_masks[subj] = masks

    # Get trial info for test images (helper takes a 0-based subject index)
    trial_info = get_shared_trial_info(subj - 1, test_nsd_id_set)
    assert len(trial_info) == n_test * N_REPS, (
        f'subj{subj:02d}: expected {n_test * N_REPS} test trials, got {len(trial_info)}')

    # Find which sessions we need
    sessions_needed = sorted(trial_info['session'].unique())
    print(f'  Need {len(sessions_needed)} sessions for {n_test} test images')

    # Pre-allocate per-rep storage
    subj_per_rep = {
        region: np.zeros((n_test, N_REPS, mask.sum()), dtype=np.float32)
        for region, mask in masks.items()
    }

    t0 = time.time()
    for session in sessions_needed:
        # Never empty: sessions_needed is derived from trial_info itself.
        session_trials = trial_info[trial_info['session'] == session]

        # Destination (image row, rep) and source (trial) indices are the same
        # for every region, so compute them once per session.
        img_rows = np.array(
            [test_nsd_id_to_idx[nsd_id] for nsd_id in session_trials['nsd_id'].tolist()],
            dtype=np.intp,
        )
        rep_cols = session_trials['rep'].to_numpy()
        trial_rows = session_trials['trial_in_session'].to_numpy()

        session_betas = load_session_betas(subj, session)  # (750, 83, 104, 81)

        for region, mask in masks.items():
            roi_betas = session_betas[:, mask]  # (750, n_voxels)

            # Session z-score (same as NB02 -- see comment there).
            # Statistics use all trials of the session; they are applied only
            # to the test trials, which gives the same values as normalising
            # the full matrix and then selecting rows.
            mean = roi_betas.mean(axis=0, keepdims=True)
            std = roi_betas.std(axis=0, keepdims=True)
            std[std == 0] = 1.0  # constant voxels: avoid division by zero

            # Paired fancy indexing: element k goes to [img_rows[k], rep_cols[k]].
            subj_per_rep[region][img_rows, rep_cols] = (roi_betas[trial_rows] - mean) / std

        del session_betas

    test_per_rep[subj] = subj_per_rep
    elapsed = time.time() - t0
    print(f'  Done in {elapsed:.0f}s')

    for region in REGIONS:
        print(f'    {region}: {subj_per_rep[region].shape}')

    gc.collect()

total_elapsed = time.time() - total_t0
print(f'\nTotal extraction time: {total_elapsed/60:.1f} minutes')

In [None]:
# Validate: mean of 3 per-rep test betas should match notebook 02 averaged values
print('Validation: per-rep mean vs notebook 02 averaged values')
print('=' * 60)

# Get test image indices in the notebook 02 assembly
nb02_stimulus_ids = nb02_assemblies['V1'].coords['stimulus_id'].values
test_stimulus_ids_sorted = [f'nsd_{nsd_id:05d}' for nsd_id in sorted(test_nsd_ids)]
test_mask_nb02 = np.array([sid in test_ids for sid in nb02_stimulus_ids])

# The per-rep arrays are ordered by ascending nsd_id (see test_nsd_id_to_idx).
# Put the NB02 test rows into that same order explicitly instead of relying on
# the NB02 presentation axis already being sorted by nsd_id.
stimulus_to_nsd_id = dict(zip(split_df['stimulus_id'], split_df['nsd_id']))
test_rows_nb02 = np.flatnonzero(test_mask_nb02)
row_order = np.argsort(
    [stimulus_to_nsd_id[sid] for sid in nb02_stimulus_ids[test_rows_nb02]],
    kind='stable',
)
test_rows_nb02 = test_rows_nb02[row_order]

all_pass = True
for region in REGIONS:
    nb02_test = nb02_assemblies[region].values[test_rows_nb02]  # (n_test, n_voxels)

    # Stack per-rep means across subjects in SUBJECT_LIST order.
    # NOTE(review): assumes the NB02 neuroid axis is grouped by subject in this
    # same order, with voxels in ROI-mask order -- a mismatch shows up as FAIL.
    per_rep_means = [
        test_per_rep[subj][region].mean(axis=1)  # (n_test, n_voxels)
        for subj in SUBJECT_LIST
    ]
    per_rep_mean_all = np.concatenate(per_rep_means, axis=1)  # (n_test, total_voxels)

    abs_diff = np.abs(nb02_test - per_rep_mean_all)
    max_diff = np.max(abs_diff)
    mean_diff = np.mean(abs_diff)
    status = 'PASS' if max_diff < 1e-4 else 'FAIL'
    if status == 'FAIL':
        all_pass = False
    print(f'{region}: max_diff={max_diff:.2e}, mean_diff={mean_diff:.2e} [{status}]')

print(f'\nOverall: {"ALL PASS" if all_pass else "SOME FAILED"}')

## 3. Build Combined Assemblies

Stack V1, V2, V4, and IT along the neuroid dimension. Apply a global per-voxel z-score, computed separately
for each subject and region, using statistics from all 515 rep-averaged images.

In [None]:
# Stage 2 of 2: Global z-score (per subject, per region).
# Session z-scoring (NB02) and rep-averaging still leave subjects with
# different signal distributions, due to individual differences in BOLD
# responsiveness. Normalising each voxel here puts subjects on a comparable
# scale for pooled cross-subject ridge regression. Allen et al. 2022 do not
# prescribe this step (their analyses are per-subject), but it is standard
# for cross-subject encoding.

zscore_stats = {}  # {subj: {region: (mean, std)}} -- per voxel

for subj in SUBJECT_LIST:
    subj_label = f'subj{subj:02d}'
    stats_by_region = {}
    for region in REGIONS:
        region_assembly = nb02_assemblies[region]
        # Columns of this region's assembly that belong to the current subject
        is_subj_voxel = region_assembly.coords['subject'].values == subj_label
        voxel_block = region_assembly.values[:, is_subj_voxel]  # (n_images, n_voxels_subj)

        voxel_mean = voxel_block.mean(axis=0)  # (n_voxels_subj,)
        voxel_std = voxel_block.std(axis=0)
        voxel_std[voxel_std == 0] = 1.0  # keep constant voxels finite after division
        stats_by_region[region] = (voxel_mean, voxel_std)
    zscore_stats[subj] = stats_by_region

print('Global z-score statistics computed from all averaged images.')

In [None]:
# Build train assembly: (n_train, total_neuroids, 1)
# Data from notebook 02, split to train only, z-scored globally
#
# Neuroids are laid out subject-major, region-minor (SUBJECT_LIST x REGIONS);
# the metadata lists below are extended in that same order so they stay
# aligned with the columns of train_data.

train_mask_nb02 = np.array([sid in train_ids for sid in nb02_stimulus_ids])
train_stimulus_ids_sorted = list(nb02_stimulus_ids[train_mask_nb02])
train_nsd_ids_sorted = list(nb02_assemblies['V1'].coords['nsd_id'].values[train_mask_nb02])

# Select the train rows once per region rather than once per (subject, region):
# the boolean-mask selection copies the data and does not depend on the subject.
train_rows_by_region = {
    region: nb02_assemblies[region].values[train_mask_nb02]
    for region in REGIONS
}

train_data_blocks = []   # per-subject per-region data blocks
neuroid_ids = []
subjects = []
regions_coord = []
nc_values = []
voxel_xs = []
voxel_ys = []
voxel_zs = []

for subj in SUBJECT_LIST:
    subj_label = f'subj{subj:02d}'
    for region in REGIONS:
        assembly = nb02_assemblies[region]
        region_coords = assembly.coords
        subj_mask = region_coords['subject'].values == subj_label

        # Get train data for this subject's voxels
        data = train_rows_by_region[region][:, subj_mask]  # (n_train, n_voxels_subj)

        # Apply global z-score
        mean, std = zscore_stats[subj][region]
        data = (data - mean) / std

        train_data_blocks.append(data)
        n_voxels = data.shape[1]

        # Collect neuroid metadata for exactly the columns appended above
        neuroid_ids.extend(region_coords['neuroid_id'].values[subj_mask].tolist())
        subjects.extend([subj_label] * n_voxels)
        regions_coord.extend([region] * n_voxels)
        nc_values.extend(region_coords['nc_testset'].values[subj_mask].tolist())
        voxel_xs.extend(region_coords['voxel_x'].values[subj_mask].tolist())
        voxel_ys.extend(region_coords['voxel_y'].values[subj_mask].tolist())
        voxel_zs.extend(region_coords['voxel_z'].values[subj_mask].tolist())

train_data = np.concatenate(train_data_blocks, axis=1)  # (n_train, total_neuroids)
n_train = train_data.shape[0]
n_neuroids = train_data.shape[1]
print(f'Train data: {train_data.shape}')
print(f'Total neuroids: {n_neuroids}')

In [None]:
# Build test assembly: (n_test_presentations, total_neuroids, 1)
# n_test images x 3 reps, with repetition coord

test_data_blocks = []  # per-subject per-region blocks, reps interleaved

for subj in SUBJECT_LIST:
    for region in REGIONS:
        # Same global z-score statistics as the train data
        mean, std = zscore_stats[subj][region]
        normalized = (test_per_rep[subj][region] - mean) / std  # (n_test, 3, n_voxels_subj)

        # Flatten (image, rep) into a single presentation axis. The row order
        # becomes img0-rep0, img0-rep1, img0-rep2, img1-rep0, ...
        images, reps, voxels = normalized.shape
        test_data_blocks.append(normalized.reshape(images * reps, voxels))

test_data = np.concatenate(test_data_blocks, axis=1)  # (n_test_presentations, total_neuroids)
assert test_data.shape[1] == n_neuroids, f'Neuroid count mismatch: {test_data.shape[1]} vs {n_neuroids}'

# Presentation coords, laid out to match the flattened row order above
n_test_images = len(test_nsd_ids)
ordered_test_nsd_ids = sorted(test_nsd_ids)
test_rep_coord = np.tile(np.arange(N_REPS), n_test_images)  # [0,1,2, 0,1,2, ...]
test_stimulus_id_coord = np.repeat(
    [f'nsd_{nsd_id:05d}' for nsd_id in ordered_test_nsd_ids], N_REPS
)  # every stimulus_id appears N_REPS times in a row
test_nsd_id_coord = np.repeat(ordered_test_nsd_ids, N_REPS)

print(f'Test data: {test_data.shape}')
print(f'Repetitions per image: {N_REPS}')
print(f'Test presentations: {n_test_images} images x {N_REPS} reps = {test_data.shape[0]}')

In [9]:
# Save as plain xr.DataArray (NeuroidAssembly wrapping happens at packaging stage)
# This preserves all coords cleanly without MultiIndex issues.

assembly_dims = ['presentation', 'neuroid', 'time_bin']

# Neuroid coords are shared by the train and test assemblies
neuroid_coords = {
    'neuroid_id': ('neuroid', neuroid_ids),
    'subject': ('neuroid', subjects),
    'region': ('neuroid', regions_coord),
    'nc_testset': ('neuroid', nc_values),
    'voxel_x': ('neuroid', voxel_xs),
    'voxel_y': ('neuroid', voxel_ys),
    'voxel_z': ('neuroid', voxel_zs),
}

# Single time bin, identical for both assemblies
time_bin_coords = {
    'time_bin_start': ('time_bin', [70]),
    'time_bin_end': ('time_bin', [170]),
}

# Train assembly: one presentation per image
train_presentation_coords = {
    'stimulus_id': ('presentation', train_stimulus_ids_sorted),
    'nsd_id': ('presentation', train_nsd_ids_sorted),
}
train_assembly = xr.DataArray(
    np.expand_dims(train_data, axis=-1),  # add the length-1 time_bin axis
    dims=assembly_dims,
    coords={**train_presentation_coords, **neuroid_coords, **time_bin_coords},
)

# Test assembly: one presentation per (image, repetition)
test_presentation_coords = {
    'stimulus_id': ('presentation', list(test_stimulus_id_coord)),
    'nsd_id': ('presentation', list(test_nsd_id_coord)),
    'repetition': ('presentation', list(test_rep_coord)),
}
test_assembly = xr.DataArray(
    np.expand_dims(test_data, axis=-1),
    dims=assembly_dims,
    coords={**test_presentation_coords, **neuroid_coords, **time_bin_coords},
)

print(f'Train assembly: {train_assembly.shape}, dims={train_assembly.dims}')
print(f'Test assembly:  {test_assembly.shape}, dims={test_assembly.dims}')
print(f'\nTrain coords: {list(train_assembly.coords.keys())}')
print(f'Test coords:  {list(test_assembly.coords.keys())}')

Train assembly: (412, 57242, 1), dims=('presentation', 'neuroid', 'time_bin')
Test assembly:  (309, 57242, 1), dims=('presentation', 'neuroid', 'time_bin')

Train coords: ['stimulus_id', 'nsd_id', 'neuroid_id', 'subject', 'region', 'nc_testset', 'voxel_x', 'voxel_y', 'voxel_z', 'time_bin_start', 'time_bin_end']
Test coords:  ['stimulus_id', 'nsd_id', 'repetition', 'neuroid_id', 'subject', 'region', 'nc_testset', 'voxel_x', 'voxel_y', 'voxel_z', 'time_bin_start', 'time_bin_end']


## 4. Extract Stimulus Images

In [10]:
# Extract 515 stimulus images from HDF5 and save as JPEGs
# (one folder per split, files named nsd_<5-digit nsd_id>.jpg)
STIMULI_HDF5 = NSD_ROOT / 'stimuli' / 'nsd_stimuli.hdf5'

for split_name in ('train', 'test'):
    # NSD ids of this split in ascending order; used directly as imgBrick row indices
    nsd_id_list = sorted(split_df.loc[split_df['split'] == split_name, 'nsd_id'])

    out_dir = OUTPUT_DIR / f'stimuli_{split_name}'
    out_dir.mkdir(exist_ok=True)

    with h5py.File(STIMULI_HDF5, 'r') as stimuli_file:
        img_brick = stimuli_file['imgBrick']
        progress = tqdm(nsd_id_list, desc=f'Extracting {split_name} images')
        for nsd_id in progress:
            pixels = img_brick[nsd_id]
            jpeg_path = out_dir / f'nsd_{nsd_id:05d}.jpg'
            Image.fromarray(pixels, 'RGB').save(jpeg_path, quality=95)

    print(f'{split_name}: {len(nsd_id_list)} images saved to {out_dir}')

Extracting train images:   0%|          | 0/412 [00:00<?, ?it/s]

train: 412 images saved to /Volumes/Hagibis/nsd/brainscore/stimuli_train


Extracting test images:   0%|          | 0/103 [00:00<?, ?it/s]

test: 103 images saved to /Volumes/Hagibis/nsd/brainscore/stimuli_test


In [11]:
# Create stimulus metadata CSVs
# One CSV per split with columns stimulus_id, nsd_id, image_file_name; the file
# name uses the same nsd_<5-digit id>.jpg pattern as the extracted images.
metadata_columns = ['stimulus_id', 'nsd_id', 'image_file_name']

for split_name in ['train', 'test']:
    in_split = split_df['split'] == split_name
    split_rows = split_df.loc[in_split].copy()  # copy so split_df itself is not modified
    split_rows['image_file_name'] = [
        f'nsd_{nsd_id:05d}.jpg' for nsd_id in split_rows['nsd_id']
    ]

    meta_path = OUTPUT_DIR / f'stimulus_metadata_{split_name}.csv'
    split_rows[metadata_columns].to_csv(meta_path, index=False)
    print(f'{split_name}: metadata saved to {meta_path}')

train: metadata saved to /Volumes/Hagibis/nsd/brainscore/stimulus_metadata_train.csv
test: metadata saved to /Volumes/Hagibis/nsd/brainscore/stimulus_metadata_test.csv


## 5. Validate and Save

In [None]:
# Validate assemblies
# Structural checks on the in-memory train/test assemblies before they are
# written to disk: dims, presentation counts, repetition labels, NaNs,
# neuroid alignment, train/test disjointness and subject coverage.
print('Assembly Validation')
print('=' * 60)

expected_dims = ('presentation', 'neuroid', 'time_bin')
n_train_expected = len(train_ids)
n_test_expected = len(test_ids) * N_REPS

# Train
assert train_assembly.dims == expected_dims, f'Unexpected train dims: {train_assembly.dims}'
assert train_assembly.sizes['presentation'] == n_train_expected, (
    f"Train presentations: {train_assembly.sizes['presentation']} vs {n_train_expected}")
assert train_assembly.sizes['time_bin'] == 1
assert not np.isnan(train_assembly.values).any(), 'NaNs in train assembly'
print(f'Train: {train_assembly.shape} -- OK')

# Test
assert test_assembly.dims == expected_dims, f'Unexpected test dims: {test_assembly.dims}'
assert test_assembly.sizes['presentation'] == n_test_expected, (
    f"Test presentations: {test_assembly.sizes['presentation']} vs {n_test_expected}")
assert test_assembly.sizes['time_bin'] == 1
assert 'repetition' in test_assembly.coords
# Repetition labels must be exactly 0..N_REPS-1 (derived from N_REPS, not hard-coded)
observed_reps = sorted(set(test_assembly.coords['repetition'].values.tolist()))
assert observed_reps == list(range(N_REPS)), f'Unexpected repetition labels: {observed_reps}'
assert not np.isnan(test_assembly.values).any(), 'NaNs in test assembly'
print(f'Test:  {test_assembly.shape} -- OK')

# Neuroid consistency: same count and same ids in the same order
assert train_assembly.sizes['neuroid'] == test_assembly.sizes['neuroid']
assert np.array_equal(
    train_assembly.coords['neuroid_id'].values,
    test_assembly.coords['neuroid_id'].values
)
print(f'Neuroids consistent: {n_neuroids}')

# Region counts
region_labels = np.array(regions_coord)
for region in REGIONS:
    n = (region_labels == region).sum()
    print(f'  {region}: {n} neuroids')

# Stimulus ID consistency: the two splits must not share any stimulus
shared_stimuli = (set(train_assembly.coords['stimulus_id'].values)
                  & set(test_assembly.coords['stimulus_id'].values))
assert not shared_stimuli, 'Train/test stimulus_id overlap!'
print('\nNo train/test overlap in stimulus_ids.')

# Check subjects
subjects_set = set(train_assembly.coords['subject'].values)
expected_subjs = {f'subj{i:02d}' for i in SUBJECT_LIST}
assert subjects_set == expected_subjs, (
    f'Subject mismatch: {sorted(subjects_set)} vs {sorted(expected_subjs)}')
print(f'All {N_SUBJECTS} subjects present.')

In [13]:
# Save assemblies
# Both arrays are written as netCDF files under OUTPUT_DIR, then their
# on-disk sizes are reported in MB (bytes / 1e6).
train_path = OUTPUT_DIR / 'Allen2022_fmri_train.nc'
test_path = OUTPUT_DIR / 'Allen2022_fmri_test.nc'

for assembly_to_write, destination in (
    (train_assembly, train_path),
    (test_assembly, test_path),
):
    assembly_to_write.to_netcdf(str(destination))

train_size_mb = train_path.stat().st_size / 1e6
test_size_mb = test_path.stat().st_size / 1e6
print(f'Train: {train_path} ({train_size_mb:.1f} MB)')
print(f'Test:  {test_path} ({test_size_mb:.1f} MB)')

Train: /Volumes/Hagibis/nsd/brainscore/Allen2022_fmri_train.nc (106.3 MB)
Test:  /Volumes/Hagibis/nsd/brainscore/Allen2022_fmri_test.nc (82.7 MB)


In [14]:
# Reload and verify
# Round-trip check: the arrays read back from disk must match the in-memory
# assemblies in shape and (within 1e-6) in value. The context managers close
# both files even when one of the assertions fails.
with xr.open_dataarray(str(train_path)) as train_reload, \
        xr.open_dataarray(str(test_path)) as test_reload:
    assert train_reload.shape == train_assembly.shape
    assert test_reload.shape == test_assembly.shape
    assert np.allclose(train_reload.values, train_assembly.values, atol=1e-6)
    assert np.allclose(test_reload.values, test_assembly.values, atol=1e-6)

print('Reload verification passed.')

Reload verification passed.


In [15]:
# Summary
# Recap of what this notebook produced: assembly shapes and file sizes,
# number of extracted stimulus images, and a listing of OUTPUT_DIR.
print('=' * 60)
print('NSD Packaging Preparation Summary')
print('=' * 60)
print()
print('ASSEMBLIES:')
print(f'  Train: {train_assembly.shape} ({train_path.stat().st_size/1e6:.1f} MB)')
print(f'  Test:  {test_assembly.shape} ({test_path.stat().st_size/1e6:.1f} MB)')
# Subject and region counts follow the configuration instead of being hard-coded,
# so the line stays correct when SUBJECT_LIST is a subset.
print(f'  Total neuroids: {n_neuroids} across {N_SUBJECTS} subjects x {len(REGIONS)} regions')
print()
print('STIMULI:')
n_train_imgs = sum(1 for _ in (OUTPUT_DIR / 'stimuli_train').glob('*.jpg'))
n_test_imgs = sum(1 for _ in (OUTPUT_DIR / 'stimuli_test').glob('*.jpg'))
print(f'  Train images: {n_train_imgs}')
print(f'  Test images:  {n_test_imgs}')
print()
print('FILES:')
for f in sorted(OUTPUT_DIR.iterdir()):
    if f.is_file():
        print(f'  {f.name}: {f.stat().st_size/1e6:.1f} MB')
    elif f.is_dir():
        n_files = sum(1 for _ in f.iterdir())
        print(f'  {f.name}/: {n_files} files')

NSD Packaging Preparation Summary

ASSEMBLIES:
  Train: (412, 57242, 1) (106.3 MB)
  Test:  (309, 57242, 1) (82.7 MB)
  Total neuroids: 57242 across 8 subjects x 4 regions

STIMULI:
  Train images: 412
  Test images:  103

FILES:
  Allen2022_fmri_test.nc: 82.7 MB
  Allen2022_fmri_train.nc: 106.3 MB
  stimuli_test/: 103 files
  stimuli_train/: 412 files
  stimulus_metadata_test.csv: 0.0 MB
  stimulus_metadata_train.csv: 0.0 MB
