# Surface Assembly Packaging

Build Brain-Score-ready assemblies from NB02 surface data:
1. Filter to 515 complete images (min_reps >= 3)
2. Apply global z-score (per subject, per region)
3. Train/test split: 412/103 (same seed=42 as volumetric)
4. Re-extract per-rep test betas for ceiling computation
5. Save final assemblies with StimulusSet metadata

**Output:**
- `Allen2022_fmri_surface_train.nc`: 412 images, rep-averaged
- `Allen2022_fmri_surface_test.nc`: 309 presentations (103 x 3 reps)

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import nibabel as nib
import scipy.io
import h5py
import time
from pathlib import Path
from PIL import Image
from tqdm.auto import tqdm

# --- Filesystem layout: everything lives under the NSD root directory ---
NSD_ROOT = Path('/Volumes/Hagibis/nsd')
FSAVG_LABELS = NSD_ROOT.joinpath('fsaverage_labels')   # ROI label volumes (*.mgz)
ASSEMBLY_DIR = NSD_ROOT.joinpath('assemblies')         # per-region assemblies read below
OUTPUT_DIR = NSD_ROOT.joinpath('brainscore_surface')   # everything this notebook writes
# Only the leaf directory is created; NSD_ROOT itself must already exist.
OUTPUT_DIR.mkdir(exist_ok=True)

# --- Subject Configuration (must match NB02) ---
SUBJECT_LIST = [1, 2, 3, 4, 5, 6, 7, 8]
# Alternative subset, kept for quick switching:
# SUBJECT_LIST = [1, 2, 5, 7]
N_SUBJECTS = len(SUBJECT_LIST)

# Session count per subject, restricted to the subjects actually in use.
ALL_SESSIONS = {
    1: 40, 2: 40, 3: 32, 4: 30,
    5: 40, 6: 32, 7: 40, 8: 30,
}
SESSIONS_PER_SUBJECT = {
    subject: ALL_SESSIONS[subject] for subject in SUBJECT_LIST
}

# --- Dataset and split constants ---
TRIALS_PER_SESSION = 750
REGIONS = ['V1', 'V2', 'V4', 'IT']
TRAIN_TEST_SEED = 42   # seed for the RandomState that draws the split
TRAIN_FRAC = 0.8       # fraction of complete images assigned to train

## 1. Load NB02 Assemblies and Filter to 515 Complete Images

In [None]:
# Load all 4 region assemblies from NB02.
# xr.load_dataarray reads the whole array into memory and closes the file,
# so no NetCDF handle stays open for the remainder of the notebook.
region_assemblies = {}
for region in REGIONS:
    path = ASSEMBLY_DIR / f'Allen2022_surface.{region}.nc'
    da = xr.load_dataarray(str(path))
    region_assemblies[region] = da
    print(f'{region}: {da.shape}')

# Filter to complete images (3 reps across all subjects in SUBJECT_LIST).
# The mask is derived from the V1 assembly and applied to every region.
# NOTE(review): this assumes all region assemblies share the same
# presentation order -- confirm against NB02.
min_reps = region_assemblies['V1'].coords['min_reps_across_subjects'].values
complete_mask = min_reps >= 3
n_complete = int(complete_mask.sum())  # plain int: reused below as an array size
print(f'\nComplete images (3 reps x {N_SUBJECTS} subjects): {n_complete}')

for region in REGIONS:
    region_assemblies[region] = region_assemblies[region].isel(presentation=complete_mask)
    print(f'{region} filtered: {region_assemblies[region].shape}')

## 2. Train/Test Split (Same as Volumetric)

In [None]:
# Split sizes: TRAIN_FRAC of the complete images go to train, the rest to test.
n_images = n_complete
n_train = int(n_images * TRAIN_FRAC)
n_test = n_images - n_train

# Seeded permutation of positional indices into the filtered assemblies.
# Each half is sorted so the original presentation order is kept.
rng = np.random.RandomState(TRAIN_TEST_SEED)
indices = rng.permutation(n_images)
train_idx = np.sort(indices[:n_train])
test_idx = np.sort(indices[n_train:])

print(f'Train: {len(train_idx)} images')
print(f'Test: {len(test_idx)} images')
# Sanity checks: the two halves cover every image exactly once.
assert len(train_idx) + len(test_idx) == n_images
assert np.intersect1d(train_idx, test_idx).size == 0

# Presentation coordinates are taken from V1 (looked up once and reused).
v1_coords = region_assemblies['V1'].coords
all_nsd_ids = v1_coords['nsd_id'].values
all_stimulus_ids = v1_coords['stimulus_id'].values

# Check nsd_ids for the split
train_nsd_ids = all_nsd_ids[train_idx]
test_nsd_ids = all_nsd_ids[test_idx]
print(f'\nTrain nsd_ids: {train_nsd_ids[:5]}...{train_nsd_ids[-5:]}')
print(f'Test nsd_ids: {test_nsd_ids[:5]}...{test_nsd_ids[-5:]}')

# Save split for reference. A boolean membership mask labels every image in
# one pass instead of rebuilding a set of train indices for each image.
is_train = np.zeros(n_images, dtype=bool)
is_train[train_idx] = True
split_df = pd.DataFrame({
    'nsd_id': all_nsd_ids,
    'stimulus_id': all_stimulus_ids,
    'split': np.where(is_train, 'train', 'test'),
})
split_df.to_csv(OUTPUT_DIR / 'train_test_split.csv', index=False)
print(f'\nSplit saved: {OUTPUT_DIR / "train_test_split.csv"}')

## 3. Global Z-Score

Compute the mean and standard deviation over all 515 rep-averaged images
(train and test combined), separately for each subject and region, then
apply the same transform to both the train and the test data.

In [None]:
# Stage 2 of 2: Global z-score (per subject, per region).
# NB02 already applied session z-scoring and rep-averaging; subjects still
# differ in signal distribution because of individual differences in BOLD
# responsiveness. Normalizing each subject's columns here puts subjects on a
# comparable scale for pooled cross-subject ridge regression. Allen et al.
# 2022 does not prescribe this (their analyses are per-subject), but it is
# standard for cross-subject encoding.

train_data_regions = {}     # region -> (n_train, n_verts) global-z-scored
test_avg_data_regions = {}  # region -> (n_test, n_verts) global-z-scored, rep-averaged

for region in REGIONS:
    region_da = region_assemblies[region]
    raw_values = region_da.values  # (n_complete, n_verts)
    neuroid_subjects = region_da.coords['subject'].values

    # Work on a copy so the loaded assembly itself is left untouched.
    data_zscored = raw_values.copy()
    for subj in SUBJECT_LIST:
        # Columns (neuroids) that belong to this subject.
        subj_cols = neuroid_subjects == f'subj{subj:02d}'
        subj_block = raw_values[:, subj_cols]
        # NaN-aware statistics over images, one value per neuroid.
        mu = np.nanmean(subj_block, axis=0, keepdims=True)
        sigma = np.nanstd(subj_block, axis=0, keepdims=True)
        sigma[sigma == 0] = 1.0  # constant neuroids: avoid dividing by zero
        data_zscored[:, subj_cols] = (subj_block - mu) / sigma

    # Count the NaNs, then replace them with 0.0 (neutral value post z-score).
    n_nan_before = np.isnan(data_zscored).sum()
    data_zscored = np.nan_to_num(data_zscored, nan=0.0)

    train_data_regions[region] = data_zscored[train_idx]
    test_avg_data_regions[region] = data_zscored[test_idx]

    print(f'{region}: train {train_data_regions[region].shape}, '
          f'test {test_avg_data_regions[region].shape}, '
          f'NaN filled: {n_nan_before}')

## 4. Re-Extract Per-Rep Test Betas

For the 103 test images, load raw surface betas and keep 3 reps separate.
This is needed for `internal_consistency` ceiling computation.

In [None]:
# Experiment design: trial ordering and per-subject image lists used to map
# trials back to images.
expdesign = scipy.io.loadmat(str(NSD_ROOT / 'metadata' / 'nsd_expdesign.mat'))
masterordering = expdesign['masterordering'].flatten()
subjectim = expdesign['subjectim']
sharedix = expdesign['sharedix'].flatten()

# Label volumes used for the ROI masks (must match NB02 definitions).
# Each file is loaded and flattened to one label value per vertex.
lh_kastner, rh_kastner, lh_streams, rh_streams = (
    nib.load(str(FSAVG_LABELS / label_file)).get_fdata().flatten()
    for label_file in (
        'lh.Kastner2015.mgz',
        'rh.Kastner2015.mgz',
        'lh.streams.mgz',
        'rh.streams.mgz',
    )
)

# V1/V2/V4 are unions of Kastner2015 label values; IT is a single label
# value in the "streams" volume.
REGION_TO_KASTNER = {'V1': [1, 2], 'V2': [3, 4], 'V4': [7]}
STREAMS_VENTRAL_LABEL = 5

# region -> hemisphere -> boolean vertex mask
roi_masks = {
    region: {
        'lh': np.isin(lh_kastner, labels),
        'rh': np.isin(rh_kastner, labels),
    }
    for region, labels in REGION_TO_KASTNER.items()
}
roi_masks['IT'] = {
    'lh': lh_streams == STREAMS_VENTRAL_LABEL,
    'rh': rh_streams == STREAMS_VENTRAL_LABEL,
}

# nsd_ids of the test images, as stored in the assemblies (0-indexed),
# plus a 1-indexed copy (intended for matching with sharedix).
test_nsd_ids_0 = test_nsd_ids  # from the split cell
test_nsd_ids_1 = test_nsd_ids_0 + 1
test_set = set(test_nsd_ids_0)
print(f'Test images: {len(test_set)}')

In [None]:
# Build trial mapping for test images only
def get_test_trial_info(subj_idx: int, test_nsd_ids: set) -> pd.DataFrame:
    """Return one row per trial in which a subject saw a test image.

    Reads the module-level ``SESSIONS_PER_SUBJECT``, ``TRIALS_PER_SESSION``,
    ``subjectim`` and ``masterordering``.

    Args:
        subj_idx: 0-based subject index (row of ``subjectim``); the subject
            number used for ``SESSIONS_PER_SUBJECT`` is ``subj_idx + 1``.
        test_nsd_ids: 0-indexed nsd ids of the test images.

    Returns:
        DataFrame with columns ``nsd_id`` (0-indexed, as passed in), ``rep``
        (0-based count of earlier presentations of that image),
        ``session`` (1-based) and ``trial_in_session`` (0-based). The
        columns are present even when no trial matches, so callers can
        always index them.
    """
    columns = ['nsd_id', 'rep', 'session', 'trial_in_session']

    subj = subj_idx + 1
    n_total_trials = SESSIONS_PER_SUBJECT[subj] * TRIALS_PER_SESSION

    # subjectim holds 1-indexed nsd ids; masterordering refers to images by
    # their 1-based position within the subject's own image list.
    subj_nsdids = subjectim[subj_idx]
    nsdid_to_imgidx = {
        int(nsd_id): position + 1
        for position, nsd_id in enumerate(subj_nsdids)
    }

    # Subject-specific image index -> 0-indexed nsd id, for test images only.
    test_imgidxs = {}
    for nsd_id_0 in test_nsd_ids:
        img_idx = nsdid_to_imgidx.get(int(nsd_id_0) + 1)
        if img_idx is not None:
            test_imgidxs[img_idx] = nsd_id_0

    records = []
    rep_counter = {}
    for trial_idx in range(n_total_trials):
        img_idx = int(masterordering[trial_idx])
        if img_idx not in test_imgidxs:
            continue
        # Repetition number = how many times this image has appeared so far.
        rep = rep_counter.get(img_idx, 0)
        rep_counter[img_idx] = rep + 1
        session_0, trial_in_session = divmod(trial_idx, TRIALS_PER_SESSION)
        records.append({
            'nsd_id': test_imgidxs[img_idx],
            'rep': rep,
            'session': session_0 + 1,
            'trial_in_session': trial_in_session,
        })

    # Explicit columns keep the schema stable when ``records`` is empty.
    return pd.DataFrame(records, columns=columns)


# Collect the test-trial table of every subject in SUBJECT_LIST,
# keyed by the 1-based subject number.
test_trial_info = {}
for subj in SUBJECT_LIST:
    # get_test_trial_info expects a 0-based subject index.
    subj_trials = get_test_trial_info(subj - 1, test_set)
    test_trial_info[subj] = subj_trials
    n_trials = len(subj_trials)
    n_distinct_images = subj_trials["nsd_id"].nunique()
    print(f'subj{subj:02d}: {n_trials} test trials ({n_distinct_images} images)')

In [None]:
# Global z-score parameters, computed from ALL complete images (the same
# statistics the global z-score cell applies to the rep-averaged data).
# They are stored here so the identical transform can be applied to the
# per-rep test data further down.
global_zs_params = {}  # (region, subj_label) -> (mean, std), one value per neuroid
for region in REGIONS:
    region_da = region_assemblies[region]
    region_values = region_da.values  # (n_complete, n_verts)
    neuroid_subjects = region_da.coords['subject'].values
    for subj in SUBJECT_LIST:
        subj_label = f'subj{subj:02d}'
        subj_block = region_values[:, neuroid_subjects == subj_label]
        mu = np.nanmean(subj_block, axis=0)
        sigma = np.nanstd(subj_block, axis=0)
        sigma[sigma == 0] = 1.0  # constant neuroids: avoid dividing by zero
        global_zs_params[region, subj_label] = (mu, sigma)

print('Global z-score parameters computed.')

In [None]:
# Extract per-rep test betas
# For each subject, load sessions that contain test trials,
# extract ROI vertices, session z-score, then global z-score

test_per_rep = {region: {} for region in REGIONS}  # region -> subj -> (n_test, 3, n_verts)
sorted_test_nsd_ids = np.sort(list(test_set))
# nsd_id -> row in the per-rep arrays (rows follow ascending nsd_id)
nsd_to_test_idx = {nid: i for i, nid in enumerate(sorted_test_nsd_ids)}

n_test_images = len(sorted_test_nsd_ids)
t0 = time.time()

for subj in SUBJECT_LIST:
    subj_label = f'subj{subj:02d}'
    trial_info = test_trial_info[subj]
    sessions_needed = sorted(trial_info['session'].unique())

    # Initialize per-region storage with NaN so (image, rep) slots that are
    # never written remain detectable.
    for region in REGIONS:
        n_verts = roi_masks[region]['lh'].sum() + roi_masks[region]['rh'].sum()
        test_per_rep[region][subj] = np.full((n_test_images, 3, n_verts), np.nan, dtype=np.float32)

    for session in tqdm(sessions_needed, desc=subj_label, leave=False):
        session_trials = trial_info[trial_info['session'] == session]

        # Resolve destination (image row, rep) and source (trial row) indices
        # once per session; they are identical for every region.
        img_rows, rep_cols, trial_rows = [], [], []
        for nsd_id, rep, trial in zip(session_trials['nsd_id'],
                                      session_trials['rep'],
                                      session_trials['trial_in_session']):
            test_img_idx = nsd_to_test_idx.get(nsd_id)
            if test_img_idx is not None:
                img_rows.append(test_img_idx)
                rep_cols.append(rep)
                trial_rows.append(trial)
        if not trial_rows:
            # Nothing to extract from this session; skip loading its betas.
            continue
        img_rows = np.asarray(img_rows, dtype=np.intp)
        rep_cols = np.asarray(rep_cols, dtype=np.intp)
        trial_rows = np.asarray(trial_rows, dtype=np.intp)

        lh_betas = nib.load(str(
            NSD_ROOT / subj_label / 'betas' / f'lh.betas_session{session:02d}.mgh'
        )).get_fdata().squeeze()
        rh_betas = nib.load(str(
            NSD_ROOT / subj_label / 'betas' / f'rh.betas_session{session:02d}.mgh'
        )).get_fdata().squeeze()

        for region in REGIONS:
            lh_mask = roi_masks[region]['lh']
            rh_mask = roi_masks[region]['rh']

            # Left-hemisphere ROI vertices first, then right-hemisphere.
            roi_betas = np.concatenate([
                lh_betas[lh_mask].T,
                rh_betas[rh_mask].T,
            ], axis=1)  # (750, n_verts)

            # Session z-score (same as NB02 -- see comment there)
            mean = roi_betas.mean(axis=0, keepdims=True)
            std = roi_betas.std(axis=0, keepdims=True)
            std[std == 0] = 1.0
            roi_betas = (roi_betas - mean) / std

            # Global z-score with the stored per-subject statistics, written
            # for all test trials of this session in one vectorized assignment.
            g_mean, g_std = global_zs_params[(region, subj_label)]
            test_per_rep[region][subj][img_rows, rep_cols] = (
                (roi_betas[trial_rows] - g_mean) / g_std
            )

        # Release the session's beta arrays before loading the next one.
        del lh_betas, rh_betas

    elapsed = time.time() - t0
    print(f'subj{subj:02d} done. Elapsed: {elapsed/60:.1f} min')

print(f'\nTotal test extraction time: {(time.time() - t0)/60:.1f} min')

## 5. Build Combined Train and Test Assemblies

In [9]:
# Neuroid coordinate lists (shared between train and test).
# For every coordinate, the per-region values are joined in REGIONS order,
# i.e. the 4 regions are stacked along the neuroid dimension.
_neuroid_coord_lists = {
    coord_name: [
        value
        for region in REGIONS
        for value in region_assemblies[region].coords[coord_name].values.tolist()
    ]
    for coord_name in (
        'neuroid_id',
        'subject',
        'hemisphere',
        'vertex_index',
        'region',
        'nc_testset',
    )
}

all_neuroid_ids = _neuroid_coord_lists['neuroid_id']
all_subjects = _neuroid_coord_lists['subject']
all_hemispheres = _neuroid_coord_lists['hemisphere']
all_vertex_indices = _neuroid_coord_lists['vertex_index']
all_regions = _neuroid_coord_lists['region']
all_nc = _neuroid_coord_lists['nc_testset']

total_neuroids = len(all_neuroid_ids)
print(f'Total neuroids across all regions: {total_neuroids:,}')

Total neuroids across all regions: 157,992


In [10]:
# --- TRAIN ASSEMBLY ---
# Regions are joined along the neuroid axis in REGIONS order, then a trailing
# singleton time_bin axis is appended: (n_train, total_neuroids, 1).
train_data = np.concatenate(
    [train_data_regions[region] for region in REGIONS],
    axis=1,
)[:, :, np.newaxis]

# Presentation coordinates come from the V1 assembly, at the train rows.
_v1_coords = region_assemblies['V1'].coords
train_stim_ids = _v1_coords['stimulus_id'].values[train_idx]
train_nsd = _v1_coords['nsd_id'].values[train_idx]

_train_presentation_coords = {
    'stimulus_id': ('presentation', train_stim_ids),
    'nsd_id': ('presentation', train_nsd),
}
_train_neuroid_coords = {
    'neuroid_id': ('neuroid', all_neuroid_ids),
    'subject': ('neuroid', all_subjects),
    'hemisphere': ('neuroid', all_hemispheres),
    'vertex_index': ('neuroid', all_vertex_indices),
    'region': ('neuroid', all_regions),
    'nc_testset': ('neuroid', all_nc),
}
# Single time bin with fixed start/end values.
# NOTE(review): 70/170 look like placeholder values for a time_bin axis that
# is required downstream -- confirm.
_train_time_bin_coords = {
    'time_bin_start': ('time_bin', [70]),
    'time_bin_end': ('time_bin', [170]),
}

train_assembly = xr.DataArray(
    train_data.astype(np.float32),
    dims=['presentation', 'neuroid', 'time_bin'],
    coords={
        **_train_presentation_coords,
        **_train_neuroid_coords,
        **_train_time_bin_coords,
    },
)

print(f'Train assembly shape: {train_assembly.shape}')
print(f'NaN count: {np.isnan(train_assembly.values).sum()}')

Train assembly shape: (412, 157992, 1)
NaN count: 0


In [None]:
# --- TEST ASSEMBLY ---
# Build (n_test_presentations, total_neuroids, 1) with 3 reps per image
# Order: image_0_rep0, image_0_rep1, image_0_rep2, image_1_rep0, ...

n_reps = 3
n_test_presentations = n_test_images * n_reps

# Presentation coordinates: each image repeated n_reps times, reps cycling 0..n_reps-1.
test_stim_ids = [
    f'nsd_{int(nsd_id):05d}'
    for nsd_id in sorted_test_nsd_ids
    for _ in range(n_reps)
]
test_nsd_coord = [
    int(nsd_id)
    for nsd_id in sorted_test_nsd_ids
    for _ in range(n_reps)
]
test_rep_coord = [rep for _ in range(n_test_images) for rep in range(n_reps)]

test_data = np.zeros((n_test_presentations, total_neuroids), dtype=np.float32)

# Stack regions along neuroid dim, subjects within each region. Every
# (region, subject) block is written in one assignment; the vertex count is
# read from the stored array instead of re-summing the ROI masks per row.
neuroid_offset = 0
for region in REGIONS:
    for subj in SUBJECT_LIST:
        subj_label = f'subj{subj:02d}'
        per_rep = test_per_rep[region][subj]  # (n_test_images, n_reps, n_verts)
        n_verts = per_rep.shape[2]
        block_end = neuroid_offset + n_verts

        # This layout assumes each subject's neuroids form one contiguous run
        # in the neuroid coordinates; fail loudly instead of writing
        # misaligned data.
        if block_end > total_neuroids:
            raise ValueError(
                f'{region}/{subj_label}: neuroid block [{neuroid_offset}, {block_end}) '
                f'exceeds total_neuroids={total_neuroids}'
            )
        if n_verts and set(all_subjects[neuroid_offset:block_end]) != {subj_label}:
            raise ValueError(
                f'{region}/{subj_label}: neuroid coordinates in '
                f'[{neuroid_offset}, {block_end}) do not all belong to this subject'
            )

        # Row-major reshape gives row img_i * n_reps + rep, matching the
        # presentation order above.
        # Fill NaN with 0.0 (consistent with train assembly treatment)
        test_data[:, neuroid_offset:block_end] = np.nan_to_num(
            per_rep.reshape(n_test_presentations, n_verts), nan=0.0
        )
        neuroid_offset = block_end

if neuroid_offset != total_neuroids:
    raise ValueError(
        f'Filled {neuroid_offset} neuroid columns but the assembly has {total_neuroids}'
    )

# Add time_bin dimension
test_data = test_data[:, :, np.newaxis]

test_assembly = xr.DataArray(
    test_data,
    dims=['presentation', 'neuroid', 'time_bin'],
    coords={
        'stimulus_id': ('presentation', test_stim_ids),
        'nsd_id': ('presentation', test_nsd_coord),
        'repetition': ('presentation', test_rep_coord),
        'neuroid_id': ('neuroid', all_neuroid_ids),
        'subject': ('neuroid', all_subjects),
        'hemisphere': ('neuroid', all_hemispheres),
        'vertex_index': ('neuroid', all_vertex_indices),
        'region': ('neuroid', all_regions),
        'nc_testset': ('neuroid', all_nc),
        'time_bin_start': ('time_bin', [70]),
        'time_bin_end': ('time_bin', [170]),
    },
)

print(f'Test assembly shape: {test_assembly.shape}')
print(f'NaN count: {np.isnan(test_assembly.values).sum()}')
print(f'Unique test images: {len(set(test_stim_ids))}')
print(f'Repetitions: {sorted(set(test_rep_coord))}')

## 6. Extract Stimulus Images

Reuse stimulus images from the volumetric pipeline if they exist,
otherwise extract from `nsd_stimuli.hdf5`.

In [12]:
# Stimulus directories: the volumetric pipeline's (possibly pre-existing)
# folders, and this notebook's own output locations.
vol_stim_train = NSD_ROOT / 'brainscore' / 'stimuli_train'
vol_stim_test = NSD_ROOT / 'brainscore' / 'stimuli_test'
stim_train_dir = OUTPUT_DIR / 'stimuli_train'
stim_test_dir = OUTPUT_DIR / 'stimuli_test'

if vol_stim_train.exists() and vol_stim_test.exists():
    # Reuse the existing images through symlinks to avoid duplicating them.
    print('Reusing stimulus images from volumetric pipeline.')
    for link_path, link_target in (
        (stim_train_dir, vol_stim_train),
        (stim_test_dir, vol_stim_test),
    ):
        if not link_path.exists():
            link_path.symlink_to(link_target)

    print(f'Train images: {len(list(stim_train_dir.glob("*.jpg")))}')
    print(f'Test images: {len(list(stim_test_dir.glob("*.jpg")))}')
else:
    print('Volumetric stimulus images not found. Extracting from HDF5...')
    stim_train_dir.mkdir(exist_ok=True)
    stim_test_dir.mkdir(exist_ok=True)

    stim_path = NSD_ROOT / 'nsd_stimuli.hdf5'
    v1_nsd_ids = region_assemblies['V1'].coords['nsd_id'].values
    with h5py.File(str(stim_path), 'r') as f:
        imgBrick = f['imgBrick']

        # Same export for both splits: imgBrick is indexed by nsd_id and each
        # image is written as a JPEG.
        # NOTE(review): files are written as 'stimulus_<stim_id>.jpg', while
        # the stimulus metadata cell records '<stim_id>.jpg' -- confirm which
        # name the downstream loader expects.
        for split_indices, out_dir, progress_label in (
            (train_idx, stim_train_dir, 'Train stimuli'),
            (test_idx, stim_test_dir, 'Test stimuli'),
        ):
            for idx in tqdm(split_indices, desc=progress_label):
                nsd_id = int(v1_nsd_ids[idx])
                stim_id = f'nsd_{nsd_id:05d}'
                img = Image.fromarray(imgBrick[nsd_id])
                img.save(out_dir / f'stimulus_{stim_id}.jpg', quality=95)

    print(f'Extracted {len(train_idx)} train + {len(test_idx)} test images.')

Reusing stimulus images from volumetric pipeline.
Train images: 412
Test images: 103


In [None]:
# Write one stimulus metadata CSV per split (stimulus_id, nsd_id, file name).
# NOTE(review): image_file_name is '<stim_id>.jpg', whereas the HDF5
# extraction cell saves 'stimulus_<stim_id>.jpg' -- confirm which naming the
# reused volumetric images actually follow.
_v1_nsd_ids = region_assemblies['V1'].coords['nsd_id'].values

for split, idx_array, stim_dir in [
    ('train', train_idx, stim_train_dir),
    ('test', test_idx, stim_test_dir),
]:
    # stim_dir is carried in the tuple but not needed to build the rows.
    split_nsd_ids = (int(_v1_nsd_ids[idx]) for idx in idx_array)
    rows = [
        {
            'stimulus_id': f'nsd_{nsd_id:05d}',
            'nsd_id': nsd_id,
            'image_file_name': f'nsd_{nsd_id:05d}.jpg',
        }
        for nsd_id in split_nsd_ids
    ]

    meta_path = OUTPUT_DIR / f'stimulus_metadata_{split}.csv'
    meta_df = pd.DataFrame(rows)
    meta_df.to_csv(meta_path, index=False)
    print(f'{split}: {len(rows)} images, saved to {meta_path.name}')

## 7. Save Final Assemblies

In [14]:
# Write both assemblies to NetCDF and report shape and on-disk size.
train_path = OUTPUT_DIR / 'Allen2022_fmri_surface_train.nc'
test_path = OUTPUT_DIR / 'Allen2022_fmri_surface_test.nc'

for split_label, assembly, out_path in (
    ('Train', train_assembly, train_path),
    ('Test', test_assembly, test_path),
):
    assembly.to_netcdf(str(out_path))
    size_mb = out_path.stat().st_size / 1e6
    print(f'{split_label}: {assembly.shape} -> {out_path.name} ({size_mb:.1f} MB)')

Train: (412, 157992, 1) -> Allen2022_fmri_surface_train.nc (300.9 MB)


Test: (309, 157992, 1) -> Allen2022_fmri_surface_test.nc (235.8 MB)


In [15]:
# Validation: reload the files just written and check them.
# xr.load_dataarray reads each array fully into memory and closes the file,
# so the freshly written NetCDF files are not left open.
train_reload = xr.load_dataarray(str(train_path))
test_reload = xr.load_dataarray(str(test_path))

print('Reload validation:')
print(f'  Train: {train_reload.shape}, NaN={np.isnan(train_reload.values).sum()}')
print(f'  Test: {test_reload.shape}, NaN={np.isnan(test_reload.values).sum()}')
print(f'  Regions: {np.unique(train_reload.coords["region"].values)}')
print(f'  Subjects: {np.unique(train_reload.coords["subject"].values)}')
print(f'  Hemispheres: {np.unique(train_reload.coords["hemisphere"].values)}')
print(f'  Test reps: {np.unique(test_reload.coords["repetition"].values)}')

# Verify no train/test overlap: no stimulus_id may occur in both files.
train_stims = set(train_reload.coords['stimulus_id'].values)
test_stims = set(test_reload.coords['stimulus_id'].values)
overlap = train_stims & test_stims
print(f'  Train/test overlap: {len(overlap)} (should be 0)')
assert len(overlap) == 0

print('\nAll checks passed.')

Reload validation:
  Train: (412, 157992, 1), NaN=0
  Test: (309, 157992, 1), NaN=0
  Regions: ['IT' 'V1' 'V2' 'V4']
  Subjects: ['subj01' 'subj02' 'subj03' 'subj04' 'subj05' 'subj06' 'subj07' 'subj08']
  Hemispheres: ['lh' 'rh']
  Test reps: [0 1 2]
  Train/test overlap: 0 (should be 0)

All checks passed.


In [16]:
# Final summary: list the regular files in the output directory with sizes.
print('Surface assembly packaging complete.')
print(f'\nOutput directory: {OUTPUT_DIR}')
print('\nFiles:')
# Directories (including the stimulus folders) are skipped; names are sorted.
output_files = [entry for entry in sorted(OUTPUT_DIR.iterdir()) if entry.is_file()]
for entry in output_files:
    size_mb = entry.stat().st_size / 1e6
    print(f'  {entry.name}: {size_mb:.1f} MB')

Surface assembly packaging complete.

Output directory: /Volumes/Hagibis/nsd/brainscore_surface

Files:
  Allen2022_fmri_surface_test.nc: 235.8 MB
  Allen2022_fmri_surface_train.nc: 300.9 MB
  stimulus_metadata_test.csv: 0.0 MB
  stimulus_metadata_train.csv: 0.0 MB
  train_test_split.csv: 0.0 MB
