# NSD Data Preparation

Extract shared-image betas for all 4 regions (V1, V2, V4, IT) across all 8 subjects,
compute per-voxel noise ceilings, and construct a NeuroidAssembly following the
Hebart2023 pattern.

**Inputs:** Raw NSD data on `/Volumes/Hagibis/nsd`

**Outputs:** Per-region NeuroidAssembly `.nc` files ready for Brain-Score benchmark packaging.

**Key design decisions:**
- Volumetric HDF5 betas (int16 / 300 for % signal change)
- Z-score within session (750 trials, per voxel)
- Average across 3 repetitions per shared image
- Only images with 3 complete reps across ALL 8 subjects (subjects 3/4/6/8 incomplete)
- ncsnr-based noise ceiling per voxel (percentage, 0-100)
- Single pass per session: extract all 4 regions simultaneously
- Train/test split: 80/20, seed=42

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import h5py
import nibabel as nib
import scipy.io
from pathlib import Path
from collections import defaultdict
import time
import gc

# --- Filesystem layout ---
# Raw NSD data lives under NSD_ROOT; assemblies are written next to it.
NSD_ROOT = Path('/Volumes/Hagibis/nsd')
OUTPUT_DIR = NSD_ROOT / 'assemblies'
OUTPUT_DIR.mkdir(exist_ok=True)

# --- Subject Configuration ---
# Default: all 8 subjects (515 shared images with 3 reps).
# Alternative: only the 4 complete subjects (40 sessions each, ~1000 images):
#   SUBJECT_LIST = [1, 2, 5, 7]
SUBJECT_LIST = [1, 2, 3, 4, 5, 6, 7, 8]
N_SUBJECTS = len(SUBJECT_LIST)

# Number of sessions per subject, restricted below to the selected subjects.
ALL_SESSIONS = {1: 40, 2: 40, 3: 32, 4: 30, 5: 40, 6: 32, 7: 40, 8: 30}
SESSIONS_PER_SUBJECT = {subj_id: ALL_SESSIONS[subj_id] for subj_id in SUBJECT_LIST}

# --- Experiment constants ---
TRIALS_PER_SESSION = 750
N_SHARED_IMAGES = 1000
N_REPS = 3

# --- Region definitions ---
# V1, V2, V4: labels from prf-visualrois (Kastner2015.mgz.ctab).
# IT: NSD "streams" ventral parcellation (label 5), consistent with Algonauts 2023.
REGION_TO_PRF_LABELS = {
    'V1': [1, 2],   # V1v, V1d
    'V2': [3, 4],   # V2v, V2d
    'V4': [7],      # hV4
}
STREAMS_VENTRAL_LABEL = 5
REGIONS = ['V1', 'V2', 'V4', 'IT']

print(f'Output directory: {OUTPUT_DIR}')
print(f'Subjects: {SUBJECT_LIST}')
print(f'Total sessions to process: {sum(SESSIONS_PER_SUBJECT.values())}')

## 1. Data Loading Utilities

Reuse validated functions from notebook 01.

In [None]:
def load_roi(subj: int, roi_name: str) -> np.ndarray:
    """Read one volumetric ROI label image for a subject.

    Args:
        subj: 1-indexed subject number (used to build the ``subjXX`` folder name).
        roi_name: ROI file stem, e.g. ``'prf-visualrois'``.

    Returns:
        The label volume with its axes reversed so that it lines up with the
        HDF5 beta volumes: (81, 104, 83) -> (83, 104, 81).
    """
    roi_file = NSD_ROOT / f'subj{subj:02d}' / 'rois' / f'{roi_name}.nii.gz'
    volume = nib.load(str(roi_file)).get_fdata()
    return np.transpose(volume)


def load_session_betas(subj: int, session: int) -> np.ndarray:
    """Load volumetric betas for one session. Returns float32 in % signal change.

    Args:
        subj: 1-indexed subject number.
        session: 1-indexed session number.

    Returns:
        float32 array of the stored int16 betas divided by 300.
    """
    path = NSD_ROOT / f'subj{subj:02d}' / 'betas' / f'betas_session{session:02d}.hdf5'
    with h5py.File(str(path), 'r') as f:
        # (750, 83, 104, 81) int16 on disk -> one float32 copy in memory
        betas = f['betas'][:].astype(np.float32)
    # Scale in place: `betas / 300.0` would allocate a second full-size
    # float32 array (~2 GB per session) just to hold the result.
    betas /= 300.0
    return betas


def load_ncsnr(subj: int) -> np.ndarray:
    """Read a subject's volumetric ncsnr map.

    The axes are reversed on return so the volume is indexed the same way as
    the HDF5 beta volumes (and therefore the masks built by ``load_roi``).
    """
    ncsnr_file = NSD_ROOT / f'subj{subj:02d}' / 'betas' / 'ncsnr.nii.gz'
    volume = nib.load(str(ncsnr_file)).get_fdata()
    return np.transpose(volume)


def ncsnr_to_nc(ncsnr: np.ndarray, k: int = 3) -> np.ndarray:
    """Convert ncsnr to a noise-ceiling percentage.

    NC = 100 * ncsnr^2 / (ncsnr^2 + 1/k), where ``k`` is the number of trials
    averaged per image. NaN inputs propagate to NaN outputs.
    """
    signal_power = ncsnr**2
    noise_power = 1.0 / k
    return 100.0 * signal_power / (signal_power + noise_power)


def get_roi_masks(subj: int) -> dict[str, np.ndarray]:
    """Return one boolean voxel mask per region (V1, V2, V4, IT) for a subject.

    V1/V2/V4 are selected by label from the ``prf-visualrois`` volume using
    ``REGION_TO_PRF_LABELS``. IT is the union of the left- and right-hemisphere
    ``streams`` volumes where the label equals ``STREAMS_VENTRAL_LABEL``
    (NSD "streams" ventral parcellation, consistent with Algonauts 2023).
    """
    prf_labels = load_roi(subj, 'prf-visualrois')
    masks = {
        region: np.isin(prf_labels, label_ids)
        for region, label_ids in REGION_TO_PRF_LABELS.items()
    }

    # Same evaluation order as before: left hemisphere first, then right.
    ventral_lh, ventral_rh = (
        load_roi(subj, roi_name) == STREAMS_VENTRAL_LABEL
        for roi_name in ('lh.streams', 'rh.streams')
    )
    masks['IT'] = ventral_lh | ventral_rh

    return masks


# Smoke test on subject 1: every region mask loads and reports its voxel count
test_masks = get_roi_masks(1)
for region in test_masks:
    mask = test_masks[region]
    print(f'{region}: {mask.sum():>5,} voxels')

## 2. Trial Mapping

Identify which trials correspond to the 1,000 shared images.

In [None]:
# Experiment-design metadata shipped with NSD
_metadata_dir = NSD_ROOT / 'metadata'
stim_info = pd.read_csv(_metadata_dir / 'nsd_stim_info_merged.csv', index_col=0)
expdesign = scipy.io.loadmat(_metadata_dir / 'nsd_expdesign.mat')

# masterordering: per-trial image index, 1-indexed into a subject's image list; (30000,)
masterordering = expdesign['masterordering'].flatten()
# subjectim: per-subject list of 1-indexed nsdIds; (8, 10000)
subjectim = expdesign['subjectim']
# sharedix: 1-indexed nsdIds of the shared images; (1000,)
sharedix = expdesign['sharedix'].flatten()


def get_shared_trial_info(subj: int) -> pd.DataFrame:
    """Find trial indices for shared images for a given subject.

    Walks the subject's trials in presentation order (``masterordering``) and
    records every trial whose image is one of the shared images (``sharedix``).

    Args:
        subj: 1-indexed subject number

    Returns:
        DataFrame with columns: nsd_id, rep, session, trial_in_session, global_trial.
        ``nsd_id`` is 0-indexed, ``rep`` counts earlier presentations of the same
        image (0, 1, 2, ...), ``session`` is 1-indexed, ``trial_in_session`` and
        ``global_trial`` are 0-indexed. The columns are present even when no
        trial matches, so downstream ``groupby('nsd_id')`` never hits a
        column-less frame.
    """
    columns = ['nsd_id', 'rep', 'session', 'trial_in_session', 'global_trial']

    n_total_trials = SESSIONS_PER_SUBJECT[subj] * TRIALS_PER_SESSION
    subj_nsdids = subjectim[subj - 1]  # (10000,) 1-indexed nsdIds; row is 0-indexed

    # Map each nsdId to its 1-indexed position in this subject's image list,
    # which is the numbering masterordering uses.
    nsdid_to_imgidx = {int(nsd_id): pos + 1 for pos, nsd_id in enumerate(subj_nsdids)}
    shared_imgidxs = {
        nsdid_to_imgidx[int(nsd_id)]
        for nsd_id in sharedix
        if int(nsd_id) in nsdid_to_imgidx
    }

    records = []
    rep_counter = defaultdict(int)  # image index -> presentations seen so far
    for trial_idx in range(n_total_trials):
        # Plain int avoids fixed-width integer arithmetic on whatever dtype
        # the .mat file was stored with.
        img_idx = int(masterordering[trial_idx])
        if img_idx not in shared_imgidxs:
            continue
        session_idx, trial_in_session = divmod(trial_idx, TRIALS_PER_SESSION)
        records.append({
            'nsd_id': int(subj_nsdids[img_idx - 1]) - 1,  # 0-indexed
            'rep': rep_counter[img_idx],
            'session': session_idx + 1,
            'trial_in_session': trial_in_session,
            'global_trial': trial_idx,
        })
        rep_counter[img_idx] += 1

    return pd.DataFrame(records, columns=columns)


# Per-subject shared-image trial tables, plus the images each subject saw 3 times
all_trial_info = {}
images_with_3reps = {}  # {subj: set of nsd_ids with 3 reps}

for subj in SUBJECT_LIST:
    trial_info = get_shared_trial_info(subj)
    all_trial_info[subj] = trial_info

    # Number of recorded presentations per image
    reps_per_image = trial_info.groupby('nsd_id')['rep'].count()
    has_3 = reps_per_image == 3
    n_with_3 = has_3.sum()
    n_with_2 = (reps_per_image == 2).sum()
    n_with_1 = (reps_per_image == 1).sum()
    images_with_3reps[subj] = set(reps_per_image[has_3].index)

    print(f'subj{subj:02d}: {len(trial_info)} trials, '
          f'{trial_info["nsd_id"].nunique()} unique images, '
          f'3-rep={n_with_3}, 2-rep={n_with_2}, 1-rep={n_with_1}')

# Keep only images that have 3 reps in EVERY subject of SUBJECT_LIST
common_3rep_images = sorted(
    set.intersection(*(images_with_3reps[s] for s in SUBJECT_LIST))
)
print(f'\nImages with 3 reps in all {N_SUBJECTS} subjects: {len(common_3rep_images)}')

# Restrict every subject's trial table to that common image set
for subj in SUBJECT_LIST:
    subj_trials = all_trial_info[subj]
    in_common = subj_trials['nsd_id'].isin(common_3rep_images)
    all_trial_info[subj] = subj_trials[in_common].reset_index(drop=True)
    assert len(all_trial_info[subj]) == len(common_3rep_images) * 3

N_USABLE_IMAGES = len(common_3rep_images)
print(f'Using {N_USABLE_IMAGES} images with complete data across all subjects')

## 3. Extract Betas: All Regions, All Subjects

Single pass per session: load betas once, extract all 4 regions simultaneously.
Z-score within session per voxel. Store per-rep betas, then average.

**Expected output per subject per region:** `(N_USABLE_IMAGES, n_voxels)` averaged betas (515 images when all 8 subjects are used).

In [None]:
def extract_all_regions_for_subject(
    subj: int,
    masks: dict[str, np.ndarray],
    trial_info: pd.DataFrame,
    n_images: int,
) -> dict[str, np.ndarray]:
    """Extract z-scored, rep-averaged betas for all regions in a single pass.

    Each session file is loaded once; every region is then cut out of it,
    z-scored per voxel over that session's trials, and the shared-image trials
    are stored by (image, repetition) before averaging over repetitions.

    Args:
        subj: 1-indexed subject number
        masks: dict of region -> boolean mask (83, 104, 81)
        trial_info: DataFrame from get_shared_trial_info (filtered to usable images)
        n_images: number of usable images

    Returns:
        dict of region -> averaged_betas (n_images, n_voxels), float32. Rows are
        ordered by ascending ``nsd_id``.

    Raises:
        ValueError: if ``n_images`` differs from the number of distinct
            ``nsd_id`` values in ``trial_info``.
    """
    n_sessions = SESSIONS_PER_SUBJECT[subj]
    nsd_ids_sorted = sorted(trial_info['nsd_id'].unique())
    if len(nsd_ids_sorted) != n_images:
        # Too few rows would overflow the buffer; too many would silently
        # leave all-zero images in the output.
        raise ValueError(
            f'subj{subj:02d}: n_images={n_images} but trial_info contains '
            f'{len(nsd_ids_sorted)} distinct images'
        )
    nsd_id_to_idx = {nsd_id: idx for idx, nsd_id in enumerate(nsd_ids_sorted)}

    # Pre-allocate per-rep storage for each region
    per_rep = {
        region: np.zeros((n_images, N_REPS, int(mask.sum())), dtype=np.float32)
        for region, mask in masks.items()
    }

    t0 = time.time()
    for session in range(1, n_sessions + 1):
        session_trials = trial_info[trial_info['session'] == session]
        if len(session_trials) == 0:
            continue

        # Target/source indices do not depend on the region: resolve them once
        # per session instead of iterating the DataFrame once per region.
        img_rows = np.array(
            [nsd_id_to_idx[nsd_id] for nsd_id in session_trials['nsd_id']],
            dtype=np.intp,
        )
        rep_cols = session_trials['rep'].to_numpy()
        trial_rows = session_trials['trial_in_session'].to_numpy()

        # Load session betas ONCE
        session_betas = load_session_betas(subj, session)  # (750, 83, 104, 81)

        for region, mask in masks.items():
            roi_betas = session_betas[:, mask]  # (750, n_voxels)

            # Stage 1 of 2: Session z-score (Allen et al. 2022, Extended Data Fig. 8).
            # Normalize each voxel to mean=0, std=1 within each 750-trial session.
            # Removes within-session non-stationarities and equalizes units across
            # voxels. This is the only normalization the NSD paper prescribes.
            # Stage 2 (global z-score per subject) is applied later in NB03.
            mean = roi_betas.mean(axis=0, keepdims=True)
            std = roi_betas.std(axis=0, keepdims=True)
            std[std == 0] = 1.0  # constant voxels: avoid division by zero

            # Statistics use all trials of the session, but only the
            # shared-image rows need to be normalized and stored.
            per_rep[region][img_rows, rep_cols] = (roi_betas[trial_rows] - mean) / std

        del session_betas

        if session % 10 == 0:
            elapsed = time.time() - t0
            print(f'  subj{subj:02d} session {session}/{n_sessions} ({elapsed:.0f}s)', flush=True)

    # Average across repetitions
    averaged = {
        region: per_rep[region].mean(axis=1)  # (n_images, n_voxels)
        for region in masks
    }

    elapsed = time.time() - t0
    print(f'  subj{subj:02d} done in {elapsed:.0f}s')

    return averaged


# Report the configured subject count rather than a hard-coded 8
print(f'Ready to extract. Will process all {N_SUBJECTS} subjects.')

In [None]:
# Heavy step: run the extraction for every subject and region
# (~284 sessions in total, ~2 GB loaded per session)

all_betas = {}  # {subj: {region: (N_USABLE_IMAGES, n_voxels)}}
all_masks = {}  # {subj: {region: boolean_mask}}

total_t0 = time.time()

for subj in SUBJECT_LIST:
    print(f'\nProcessing subj{subj:02d} ({SESSIONS_PER_SUBJECT[subj]} sessions)...')

    masks = all_masks[subj] = get_roi_masks(subj)
    betas = all_betas[subj] = extract_all_regions_for_subject(
        subj, masks, all_trial_info[subj], N_USABLE_IMAGES
    )

    for region in REGIONS:
        print(f'    {region}: {betas[region].shape}')

    # Free the per-session temporaries before moving to the next subject
    gc.collect()

total_elapsed = time.time() - total_t0
print(f'\nTotal extraction time: {total_elapsed/60:.1f} minutes')

## 4. Compute Per-Voxel Noise Ceilings

Use ncsnr-based noise ceiling (validated to match Allen et al. 2022 at 36.20%).
Store as percentage (0-100) following the Hebart2023 convention.

In [None]:
# Per-voxel noise ceiling for every subject and region
all_nc = {}  # {subj: {region: (n_voxels,) NC percentage}}

for subj in SUBJECT_LIST:
    ncsnr = load_ncsnr(subj)
    masks = all_masks[subj]

    nc_by_region = all_nc[subj] = {}
    for region in REGIONS:
        ncsnr_roi = ncsnr[masks[region]]
        # Voxels with undefined ncsnr yield NaN; store them as 0% instead
        nc_pct = np.nan_to_num(ncsnr_to_nc(ncsnr_roi, k=3), nan=0.0)
        nc_by_region[region] = nc_pct

        n_above_30 = (nc_pct > 30).sum()
        print(f'subj{subj:02d} {region}: {len(nc_pct):>5} voxels, '
              f'median NC={np.median(nc_pct):.1f}%, '
              f'voxels > 30%: {n_above_30}')

## 5. Construct NeuroidAssembly

Following the Hebart2023 pattern:
- Dims: `(presentation, neuroid)`
- Presentation coords: `stimulus_id`, `nsd_id` (repetitions are averaged, so there is no `repetition` coord)
- Neuroid coords: `neuroid_id`, `subject`, `region`, `nc_testset`, `voxel_x`, `voxel_y`, `voxel_z`
- NC stored as percentage (0-100)
- One assembly per region, all subjects concatenated along neuroid dim

In [35]:
def get_voxel_coordinates(subj: int, mask: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Return the (i, j, k) array indices of every True voxel in ``mask``.

    Indices are listed in row-major order, i.e. the same order in which
    boolean indexing with ``mask`` extracts voxels. ``subj`` is accepted for
    interface symmetry and is not used.

    The indices are in the func1pt8mm space. These are consistent across
    subjects within NSD (same grid).
    """
    i_idx, j_idx, k_idx = np.nonzero(mask)
    return i_idx, j_idx, k_idx


# Stimuli are the images with 3 reps in every subject (list is already sorted)
shared_nsd_ids = common_3rep_images
stimulus_ids = []
for nsd_id in shared_nsd_ids:
    stimulus_ids.append(f'nsd_{nsd_id:05d}')

print(f'Stimulus IDs: {stimulus_ids[:5]}...{stimulus_ids[-3:]}')
print(f'Total stimuli: {len(stimulus_ids)}')

Stimulus IDs: ['nsd_03049', 'nsd_03077', 'nsd_03157', 'nsd_03164', 'nsd_03171']...['nsd_72605', 'nsd_72719', 'nsd_72948']
Total stimuli: 515


In [None]:
def build_region_assembly(region: str) -> xr.DataArray:
    """Build a NeuroidAssembly for one region, concatenating all subjects.

    Reads the module-level ``all_betas``, ``all_nc`` and ``all_masks`` for each
    subject in ``SUBJECT_LIST`` and stacks the subjects along the neuroid axis.

    Args:
        region: region name, used as key into the per-subject dicts.

    Returns:
        DataArray with dims (presentation, neuroid). Presentation coords:
        ``stimulus_id``, ``nsd_id``. Neuroid coords: ``neuroid_id``, ``subject``,
        ``region``, ``nc_testset``, ``voxel_x``, ``voxel_y``, ``voxel_z``.

    Raises:
        ValueError: if a subject's betas, noise ceilings and mask disagree on
            the number of voxels.
    """
    data_blocks = []       # list of (n_images, n_voxels_subj) arrays
    neuroid_ids = []       # unique neuroid identifiers
    subjects = []          # subject labels
    nc_values = []         # per-voxel NC percentage
    voxel_xs = []          # voxel i coordinate
    voxel_ys = []          # voxel j coordinate
    voxel_zs = []          # voxel k coordinate

    for subj in SUBJECT_LIST:
        betas = all_betas[subj][region]  # (n_images, n_voxels)
        nc = all_nc[subj][region]        # (n_voxels,)
        mask = all_masks[subj][region]   # (83, 104, 81)

        n_voxels = betas.shape[1]
        vx, vy, vz = get_voxel_coordinates(subj, mask)

        # All per-voxel metadata must describe the same voxels as the betas;
        # a mismatch here would otherwise misalign coords across subjects.
        if not (len(nc) == len(vx) == n_voxels):
            raise ValueError(
                f'subj{subj:02d} {region}: voxel count mismatch '
                f'(betas={n_voxels}, nc={len(nc)}, mask={len(vx)})'
            )

        data_blocks.append(betas)

        subj_label = f'subj{subj:02d}'
        neuroid_ids.extend(
            f'{subj_label}_{region}_v{v_idx:04d}' for v_idx in range(n_voxels)
        )
        subjects.extend([subj_label] * n_voxels)

        nc_values.extend(nc.tolist())
        voxel_xs.extend(vx.tolist())
        voxel_ys.extend(vy.tolist())
        voxel_zs.extend(vz.tolist())

    # Concatenate across subjects: (n_images, total_voxels)
    data = np.concatenate(data_blocks, axis=1)
    regions_coord = [region] * data.shape[1]  # constant per assembly

    # Build xarray DataArray
    assembly = xr.DataArray(
        data,
        dims=['presentation', 'neuroid'],
        coords={
            # Presentation coords
            'stimulus_id': ('presentation', stimulus_ids),
            'nsd_id': ('presentation', shared_nsd_ids),
            # Neuroid coords
            'neuroid_id': ('neuroid', neuroid_ids),
            'subject': ('neuroid', subjects),
            'region': ('neuroid', regions_coord),
            'nc_testset': ('neuroid', nc_values),
            'voxel_x': ('neuroid', voxel_xs),
            'voxel_y': ('neuroid', voxel_ys),
            'voxel_z': ('neuroid', voxel_zs),
        },
    )

    return assembly


# One assembly per region, with a one-line summary of each
assemblies = {}
for region in REGIONS:
    assembly = assemblies[region] = build_region_assembly(region)

    nc_all = assembly.nc_testset.values
    n_subjects = len(set(assembly.subject.values))
    n_voxels = assembly.sizes['neuroid']
    median_nc = float(np.median(nc_all))
    reliable = (nc_all > 30).sum()

    print(f'{region}: shape={assembly.shape}, '
          f'subjects={n_subjects}, voxels={n_voxels}, '
          f'median NC={median_nc:.1f}%, reliable(>30%)={reliable}')

## 6. Train/Test Split

80% train / 20% test images (412 / 103 of the 515 usable images), fixed seed=42. The split is on stimulus_id, consistent
across regions and subjects.

In [37]:
# Seeded shuffle of image positions; the first 80% become train, the rest test
rng = np.random.RandomState(42)
all_indices = np.arange(N_USABLE_IMAGES)
rng.shuffle(all_indices)

n_train = int(N_USABLE_IMAGES * 0.8)
n_test = N_USABLE_IMAGES - n_train

# Sort each part so that ids come out in ascending nsd_id order
shuffled_train, shuffled_test = np.split(all_indices, [n_train])
train_indices = np.sort(shuffled_train)
test_indices = np.sort(shuffled_test)

train_stimulus_ids = [stimulus_ids[i] for i in train_indices]
test_stimulus_ids = [stimulus_ids[i] for i in test_indices]

print(f'Train: {len(train_indices)} images')
print(f'Test:  {len(test_indices)} images')
print(f'No overlap: {set(train_indices).isdisjoint(set(test_indices))}')
print(f'\nTrain stimulus_ids: {train_stimulus_ids[:5]}...')
print(f'Test stimulus_ids:  {test_stimulus_ids[:5]}...')

Train: 412 images
Test:  103 images
No overlap: True

Train stimulus_ids: ['nsd_03049', 'nsd_03157', 'nsd_03164', 'nsd_03171', 'nsd_03434']...
Test stimulus_ids:  ['nsd_03077', 'nsd_03847', 'nsd_04690', 'nsd_04786', 'nsd_06444']...


## 7. Validation

Cross-check against notebook 01 values before saving.

In [None]:
# Check per-ROI median NC against reference values.
# V1, V2, V4 validated against notebook 01 (which uses correct prf-visualrois labels)
# IT: no prior validated reference (now using streams ventral parcellation)

expected_nc = {'V1': 37.0, 'V2': 31.1, 'V4': 26.4}

print('Noise Ceiling Validation')
print('=' * 50)
all_pass = True

for region in REGIONS:
    assembly = assemblies[region]
    nc_all = assembly.nc_testset.values
    subject_of_voxel = assembly.subject.values

    # Mean-of-medians: median NC within each subject, then mean across subjects
    per_subj_medians = [
        np.median(nc_all[subject_of_voxel == f'subj{s:02d}'])
        for s in SUBJECT_LIST
    ]
    mean_of_medians = np.mean(per_subj_medians)

    if region not in expected_nc:
        print(f'{region}: mean-of-medians NC = {mean_of_medians:.1f}% '
              f'(no prior reference -- streams ventral parcellation)')
        continue

    expected = expected_nc[region]
    diff = abs(mean_of_medians - expected)
    if diff < 0.5:
        status = 'PASS'
    else:
        status = 'FAIL'
        all_pass = False
    print(f'{region}: mean-of-medians NC = {mean_of_medians:.1f}% '
          f'(expected {expected:.1f}%, diff={diff:.2f}%) [{status}]')

print(f'\nOverall: {"ALL PASS" if all_pass else "SOME FAILED"} (for regions with reference values)')

In [39]:
# Structural and numerical sanity checks on each assembly
print('Assembly Shape & Quality Validation')
print('=' * 60)

for region in REGIONS:
    a = assemblies[region]
    values = a.values

    # Dimension names and presentation count must match the pipeline settings
    assert a.dims == ('presentation', 'neuroid'), f'{region}: unexpected dims {a.dims}'
    assert a.sizes['presentation'] == N_USABLE_IMAGES, f'{region}: expected {N_USABLE_IMAGES} presentations'

    # Reported only (not asserted): NaN count, subjects present, value statistics.
    # Data are z-scored and averaged across 3 reps, so values should be moderate.
    n_nan = np.isnan(values).sum()
    unique_subjs = sorted(set(a.subject.values))
    data_mean = float(np.mean(values))
    data_std = float(np.std(values))
    nc_low = float(a.nc_testset.min())
    nc_high = float(a.nc_testset.max())

    print(f'{region}:')
    print(f'  Shape: {a.shape}')
    print(f'  Subjects: {unique_subjs}')
    print(f'  NaN count: {n_nan}')
    print(f'  Data mean={data_mean:.4f}, std={data_std:.4f}')
    print(f'  NC range: [{nc_low:.1f}, {nc_high:.1f}]%')

Assembly Shape & Quality Validation
V1:
  Shape: (515, 9039)
  Subjects: ['subj01', 'subj02', 'subj03', 'subj04', 'subj05', 'subj06', 'subj07', 'subj08']
  NaN count: 0
  Data mean=-0.0212, std=0.6764
  NC range: [0.0, 83.0]%
V2:
  Shape: (515, 8792)
  Subjects: ['subj01', 'subj02', 'subj03', 'subj04', 'subj05', 'subj06', 'subj07', 'subj08']
  NaN count: 0
  Data mean=-0.0237, std=0.6654
  NC range: [0.0, 81.6]%
V4:
  Shape: (515, 3982)
  Subjects: ['subj01', 'subj02', 'subj03', 'subj04', 'subj05', 'subj06', 'subj07', 'subj08']
  NaN count: 0
  Data mean=-0.0262, std=0.6479
  NC range: [0.0, 77.3]%
IT:
  Shape: (515, 35429)
  Subjects: ['subj01', 'subj02', 'subj03', 'subj04', 'subj05', 'subj06', 'subj07', 'subj08']
  NaN count: 0
  Data mean=-0.0251, std=0.6147
  NC range: [0.0, 84.6]%


## 7b. Assembly Visualizations

Standard representational analysis figures:
1. **RDMs (Representational Dissimilarity Matrices):** Pairwise image dissimilarity in each region's neural space, revealing representational geometry
2. **Inter-subject RDM consistency:** Validates that stimulus-driven signal dominates noise across subjects

In [None]:
# Representational Dissimilarity Matrices (RDMs) per region
# Average per-subject RDMs to reveal stimulus-driven representational structure.
# Images reordered via hierarchical clustering to expose category/similarity structure.

import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import squareform
from matplotlib.gridspec import GridSpec


def _correlation_rdm(data: np.ndarray) -> np.ndarray:
    """Return the (n_images, n_images) matrix of 1 - Pearson r between rows of ``data``."""
    centered = data - data.mean(axis=1, keepdims=True)
    norms = np.linalg.norm(centered, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # constant rows: avoid division by zero
    centered = centered / norms
    return 1.0 - centered @ centered.T


# Pass 1: subject-averaged RDM and clustering order for every region
region_rdms = {}
region_orders = {}
triu = np.triu_indices(N_USABLE_IMAGES, k=1)

for region in REGIONS:
    assembly = assemblies[region]
    subject_of_voxel = assembly.subject.values

    subject_rdms = [
        _correlation_rdm(assembly.values[:, subject_of_voxel == f'subj{s:02d}'])
        for s in SUBJECT_LIST
    ]
    avg_rdm = np.mean(subject_rdms, axis=0)
    region_rdms[region] = avg_rdm

    # Hierarchical clustering to reorder images
    condensed = squareform(avg_rdm, checks=False)
    region_orders[region] = leaves_list(linkage(condensed, method='average'))

# One colour scale for all panels. The figure has a single shared colorbar, so
# per-region vmin/vmax would make it correct for the last region only.
pooled_vals = np.concatenate([region_rdms[region][triu] for region in REGIONS])
vmin, vmax = np.percentile(pooled_vals, [2, 98])

# Pass 2: draw the panels
fig = plt.figure(figsize=(22, 9))
gs = GridSpec(2, 5, width_ratios=[1, 1, 1, 1, 0.05], wspace=0.25, hspace=0.3)

for idx, region in enumerate(REGIONS):
    avg_rdm = region_rdms[region]
    order = region_orders[region]
    sorted_rdm = avg_rdm[np.ix_(order, order)]
    triu_vals = avg_rdm[triu]

    # Top row: original ordering
    ax = fig.add_subplot(gs[0, idx])
    ax.imshow(avg_rdm, cmap='viridis', vmin=vmin, vmax=vmax, aspect='equal')
    ax.set_title(f'{region} (original order)', fontsize=11)
    ax.set_xlabel('Image')
    if idx == 0:
        ax.set_ylabel('Image')

    # Bottom row: clustered ordering
    ax = fig.add_subplot(gs[1, idx])
    im = ax.imshow(sorted_rdm, cmap='viridis', vmin=vmin, vmax=vmax, aspect='equal')
    ax.set_title(f'{region} (clustered)\n'
                 f'mean={triu_vals.mean():.2f}, std={triu_vals.std():.3f}',
                 fontsize=11)
    ax.set_xlabel('Image (reordered)')
    if idx == 0:
        ax.set_ylabel('Image (reordered)')

# Shared colorbar in dedicated column (valid for every panel: common vmin/vmax)
cax = fig.add_subplot(gs[:, 4])
fig.colorbar(im, cax=cax, label='1 - Pearson r')

fig.suptitle('Subject-Averaged RDMs: Original vs. Hierarchically Clustered',
             fontsize=14, y=1.01)
plt.show()

# Cross-region RDM correlation
tril = np.tril_indices(N_USABLE_IMAGES, k=-1)
print('Cross-region RDM correlation (lower triangle of subject-averaged RDMs):')
for i, r1 in enumerate(REGIONS):
    for r2 in REGIONS[i+1:]:
        r = np.corrcoef(region_rdms[r1][tril], region_rdms[r2][tril])[0, 1]
        print(f'  {r1} vs {r2}: r = {r:.3f}')

### RDM Interpretation

Each matrix shows the pairwise dissimilarity (1 - Pearson r) between all 515 stimulus images
in neural response space, averaged across 8 subjects. The **top row** uses the arbitrary nsd_id
ordering; the **bottom row** reorders images via hierarchical clustering (average linkage) to
group neurally similar images together.

**Key observations:**

- **V1/V2 (std ~0.09):** Many small, fragmented clusters along the diagonal. These reflect
  low-level feature similarity -- images sharing similar spatial frequency, orientation, or contrast
  statistics evoke correlated V1/V2 response patterns regardless of semantic content.
- **V4 (std ~0.08):** Intermediate structure with broader clusters than V1/V2, consistent with
  mid-level feature selectivity (texture, curvature, shape fragments).
- **IT (std ~0.085):** The most prominent block-diagonal structure with a few large, distinct
  clusters. The dominant dark block likely corresponds to scenes/places (the largest semantic
  category in COCO-derived NSD stimuli), which evoke highly correlated patterns in
  parahippocampal and fusiform regions. The clear off-diagonal bands (yellow) show that
  IT representations sharply distinguish between semantic categories.

**Cross-region RDM correlations** decrease monotonically along the ventral stream
(V1-V2: 0.82, V2-V4: 0.66, V4-IT: 0.43, V1-IT: 0.16), confirming a representational
transformation from low-level retinotopic to high-level categorical encoding. This gradient
is a strong sanity check that our ROI definitions and data extraction are correct.

In [None]:
# Inter-subject representational consistency
# For each region, compute per-subject RDMs and correlate their lower triangles.
# High inter-subject RDM correlation validates that stimulus-driven signal dominates noise.


def _subject_rdm_vectors(assembly) -> np.ndarray:
    """Return an (n_subjects, n_image_pairs) array of per-subject RDM lower triangles.

    Rows follow SUBJECT_LIST order; each RDM is 1 - Pearson r between the
    images' voxel patterns of that subject.
    """
    tril = np.tril_indices(N_USABLE_IMAGES, k=-1)
    subject_of_voxel = assembly.subject.values
    vectors = []
    for s in SUBJECT_LIST:
        data = assembly.values[:, subject_of_voxel == f'subj{s:02d}']  # (n_images, n_voxels_subj)
        data_c = data - data.mean(axis=1, keepdims=True)
        norms = np.linalg.norm(data_c, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # constant rows: avoid division by zero
        data_c = data_c / norms
        rdm = 1.0 - data_c @ data_c.T  # (n_images, n_images)
        vectors.append(rdm[tril])
    return np.stack(vectors)


fig, axes = plt.subplots(1, len(REGIONS), figsize=(20, 4.5))
axes = np.atleast_1d(axes)

subj_tick_labels = [f'S{s}' for s in SUBJECT_LIST]
upper = np.triu_indices(N_SUBJECTS, k=1)
mean_rdm_r = {}  # region -> mean off-diagonal correlation, reused for the summary

for ax, region in zip(axes, REGIONS):
    # Pairwise RDM correlation between all subjects in one call
    corr_mat = np.atleast_2d(np.corrcoef(_subject_rdm_vectors(assemblies[region])))

    im = ax.imshow(corr_mat, cmap='RdYlBu_r', vmin=0, vmax=1, aspect='equal')
    ax.set_xticks(range(N_SUBJECTS))
    ax.set_yticks(range(N_SUBJECTS))
    ax.set_xticklabels(subj_tick_labels, fontsize=8)
    ax.set_yticklabels(subj_tick_labels, fontsize=8)

    for i in range(N_SUBJECTS):
        for j in range(N_SUBJECTS):
            color = 'white' if corr_mat[i, j] < 0.5 else 'black'
            ax.text(j, i, f'{corr_mat[i, j]:.2f}', ha='center', va='center',
                    fontsize=7, color=color)

    off_diag = corr_mat[upper]
    mean_rdm_r[region] = off_diag.mean()
    ax.set_title(f'{region}\n(mean r = {mean_rdm_r[region]:.3f})', fontsize=12)

fig.colorbar(im, ax=axes[-1], shrink=0.8, label='RDM Pearson r')
fig.suptitle('Inter-Subject Representational Consistency (RDM Correlation)',
             fontsize=13)
plt.tight_layout()
plt.show()

# Print summary (values already computed above; no need to rebuild the RDMs)
for region in REGIONS:
    print(f'{region}: mean inter-subject RDM r = {mean_rdm_r[region]:.3f}')

### Inter-Subject Consistency Interpretation

Each heatmap shows the Pearson correlation between every pair of subjects' RDM lower triangles
for a given region. High off-diagonal values indicate that two subjects organize the same
stimulus set similarly in neural space -- i.e., the representational geometry is driven by
stimulus content rather than idiosyncratic noise.

**Key observations:**

- **IT has the highest consistency (mean r = 0.484).** This is the classic RSA finding:
  higher-level categorical/semantic representations in inferotemporal cortex are remarkably
  consistent across individuals, because category membership is a shared organizational
  principle. All subject pairs exceed r = 0.15, with most above 0.5.
- **V1 (0.376) and V2 (0.361) show moderate consistency.** Early visual areas encode
  retinotopic features that are inherently stimulus-driven, but the exact voxel placement
  and receptive field coverage vary across subjects (different cortical folding), which
  lowers RDM agreement despite the underlying functional similarity.
- **V4 has the lowest consistency (0.248).** This is the smallest ROI (~500 voxels/subject),
  making per-subject RDMs noisier. V4's selectivity for intermediate features (textures,
  curvature) is also less categorically organized than IT, yielding weaker cross-subject agreement.
- **Subject 8 is consistently the weakest** across all regions (lowest row/column values),
  aligning with having the fewest sessions (30) and lowest noise ceiling (NC median 5-21%
  depending on region).

## 8. Save Assemblies

Save per-region assemblies as netCDF files. Also save the train/test split indices.

In [42]:
# Save assemblies: one netCDF file per region
for region in REGIONS:
    path = OUTPUT_DIR / f'Allen2022.{region}.nc'
    assemblies[region].to_netcdf(str(path))
    size_mb = path.stat().st_size / 1e6
    print(f'Saved {path.name}: {size_mb:.1f} MB')

# Save train/test split
# Build the membership set once; constructing it inside the comprehension
# would rebuild it for every image.
train_index_set = set(train_indices.tolist())
split_df = pd.DataFrame({
    'stimulus_id': stimulus_ids,
    'nsd_id': shared_nsd_ids,
    'split': ['train' if i in train_index_set else 'test' for i in range(N_USABLE_IMAGES)],
})
split_path = OUTPUT_DIR / 'train_test_split.csv'
split_df.to_csv(str(split_path), index=False)
print(f'\nSaved {split_path.name}')
print(f'  Train: {(split_df["split"] == "train").sum()}')
print(f'  Test:  {(split_df["split"] == "test").sum()}')

Saved Allen2022.V1.nc: 20.5 MB
Saved Allen2022.V2.nc: 20.0 MB
Saved Allen2022.V4.nc: 9.1 MB
Saved Allen2022.IT.nc: 80.4 MB

Saved train_test_split.csv
  Train: 412
  Test:  103


In [43]:
# Verify saved files can be loaded back
print('Reload verification')
print('=' * 50)

for region in REGIONS:
    path = OUTPUT_DIR / f'Allen2022.{region}.nc'
    orig = assemblies[region]

    # Context manager closes the file even when one of the checks fails
    with xr.open_dataarray(str(path)) as loaded:
        # Check round-trip fidelity
        assert loaded.shape == orig.shape, f'{region}: shape mismatch'
        assert np.allclose(loaded.values, orig.values, atol=1e-6), f'{region}: data mismatch'
        assert list(loaded.coords) == list(orig.coords), f'{region}: coord mismatch'

        print(f'{region}: reload OK, shape={loaded.shape}')

print('\nAll assemblies verified.')

Reload verification
V1: reload OK, shape=(515, 9039)
V2: reload OK, shape=(515, 8792)
V4: reload OK, shape=(515, 3982)
IT: reload OK, shape=(515, 35429)

All assemblies verified.


## 9. Summary

In [45]:
# Final report: assemblies, split sizes, files on disk, and follow-up work
banner = '=' * 60
print(banner)
print('NSD Data Preparation Summary')
print(banner)
print()

print('ASSEMBLIES CREATED:')
for region in REGIONS:
    a = assemblies[region]
    n_stimuli, n_neuroids = a.shape
    nc_all = a.nc_testset.values
    nc_med = np.median(nc_all)
    reliable = (nc_all > 30).sum()
    print(f'  Allen2022.{region}: {n_stimuli} stimuli x {n_neuroids} neuroids, '
          f'median NC={nc_med:.1f}%, reliable(>30%)={reliable}')
print()

print('SPLIT:')
print(f'  Train: {len(train_indices)} stimuli')
print(f'  Test:  {len(test_indices)} stimuli')
print(f'  Seed: 42')
print()

print('FILES:')
for entry in sorted(OUTPUT_DIR.iterdir()):
    size_mb = entry.stat().st_size / 1e6
    print(f'  {entry.name}: {size_mb:.1f} MB')
print()

print('NEXT STEPS:')
for step in (
    '  1. Package StimulusSet from nsd_stimuli.hdf5',
    '  2. Write Brain-Score benchmark plugin (Allen2022.V1-ridge, etc.)',
    '  3. Upload assemblies to S3 via Brain-Score data packaging',
):
    print(step)

NSD Data Preparation Summary

ASSEMBLIES CREATED:
  Allen2022.V1: 515 stimuli x 9039 neuroids, median NC=37.4%, reliable(>30%)=5402
  Allen2022.V2: 515 stimuli x 8792 neuroids, median NC=31.2%, reliable(>30%)=4584
  Allen2022.V4: 515 stimuli x 3982 neuroids, median NC=25.3%, reliable(>30%)=1660
  Allen2022.IT: 515 stimuli x 35429 neuroids, median NC=9.7%, reliable(>30%)=6805

SPLIT:
  Train: 412 stimuli
  Test:  103 stimuli
  Seed: 42

FILES:
  Allen2022.IT.nc: 80.4 MB
  Allen2022.V1.nc: 20.5 MB
  Allen2022.V2.nc: 20.0 MB
  Allen2022.V4.nc: 9.1 MB
  train_test_split.csv: 0.0 MB

NEXT STEPS:
  1. Package StimulusSet from nsd_stimuli.hdf5
  2. Write Brain-Score benchmark plugin (Allen2022.V1-ridge, etc.)
  3. Upload assemblies to S3 via Brain-Score data packaging
