# Developmental Trajectory RDM Analysis

This notebook creates multiple Representational Dissimilarity Matrices (RDMs) for each individual subject, binned by age in months (age_mo).
This allows tracking how object representations change developmentally within each subject.

## Overview

This analysis:
1. Loads grouped embeddings (averaged by category, subject, and age_mo)
2. Bins embeddings by age_mo for each subject
3. Computes RDM for each subject at each age_mo bin
4. Handles data density differences (some subjects/ages have more data)
5. Visualizes developmental trajectories
6. Compares RDMs across age bins within subjects

## Key Features

- **Age binning**: Groups embeddings by age_mo to track developmental changes
- **Data density handling**: Minimum category threshold per age bin
- **Trajectory analysis**: Compare RDMs across age bins to see developmental changes
- **Missing data handling**: Only includes age bins with sufficient data


## Setup and Imports


In [6]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr, pearsonr
from scipy.cluster.hierarchy import linkage, dendrogram, optimal_leaf_ordering
from scipy.spatial.distance import squareform
from collections import defaultdict
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set matplotlib backend
import matplotlib
matplotlib.use('Agg')

print("All imports successful!")


All imports successful!


## Configuration


In [7]:
# Paths
embeddings_dir = Path("/data2/dataset/babyview/868_hours/outputs/yoloe_cdi_embeddings/clip_embeddings_grouped_by_age-mo")
output_dir = Path("developmental_trajectory_rdms")
output_dir.mkdir(exist_ok=True, parents=True)

# Categories file (optional - to filter to specific categories)
categories_file = Path("../../data/things_bv_overlap_categories_exclude_zero_precisions.txt")

# CDI words CSV file (required for category type organization)
cdi_path = Path("../../data/cdi_words.csv")

# Hierarchical clustering options
use_clustering = True  # Enable hierarchical clustering within category groups
save_dendrograms = True  # Save dendrogram plots for each category group

# Minimum categories required per age_mo bin to compute RDM
min_categories_per_age_bin = 8

# Minimum number of age bins required per subject to include in analysis
min_age_bins_per_subject = 2

# Age binning strategy: 'exact' (use exact age_mo) or 'binned' (group into bins)
age_binning_strategy = 'exact'  # or 'binned'
age_bin_size = 3  # if using 'binned', group ages into bins of this size (e.g., 3 months)

print(f"Embeddings directory: {embeddings_dir}")
print(f"Output directory: {output_dir}")
print(f"CDI path: {cdi_path}")
print(f"Use clustering: {use_clustering}")
print(f"Min categories per age bin: {min_categories_per_age_bin}")
print(f"Min age bins per subject: {min_age_bins_per_subject}")
print(f"Age binning strategy: {age_binning_strategy}")


Embeddings directory: /data2/dataset/babyview/868_hours/outputs/yoloe_cdi_embeddings/clip_embeddings_grouped_by_age-mo
Output directory: developmental_trajectory_rdms
Min categories per age bin: 8
Min age bins per subject: 2
Age binning strategy: exact


## Helper Functions


In [None]:
def load_category_types(cdi_path):
    """Load category type information from CDI words CSV"""
    print(f"Loading category types from {cdi_path}...")
    cdi_df = pd.read_csv(cdi_path)
    
    category_types = {}
    for _, row in cdi_df.iterrows():
        category_types[row['uni_lemma']] = {
            'is_animate': bool(row.get('is_animate', 0)),
            'is_bodypart': bool(row.get('is_bodypart', 0)),
            'is_small': bool(row.get('is_small', 0)),
            'is_big': bool(row.get('is_big', 0))
        }
    
    print(f"Loaded type information for {len(category_types)} categories")
    return category_types

def cluster_categories_within_group(group_categories, cat_to_embedding, save_dendrogram=False, output_dir=None, group_name=None):
    """
    Perform hierarchical clustering within a group of categories.
    
    Args:
        group_categories: List of category names in the group
        cat_to_embedding: Dictionary mapping category names to embeddings
        save_dendrogram: Whether to save dendrogram plot (default: False)
        output_dir: Output directory for saving dendrogram (required if save_dendrogram=True)
        group_name: Name of the group for saving dendrogram (required if save_dendrogram=True)
    
    Returns:
        List of category names reordered according to clustering dendrogram
    """
    if len(group_categories) <= 1:
        return group_categories, None
    
    # Get embeddings for this group
    group_embeddings = np.array([cat_to_embedding[cat].flatten() for cat in group_categories])
    
    # Normalize embeddings (z-score normalization per embedding)
    normalized_embeddings = (group_embeddings - group_embeddings.mean(axis=0)) / (group_embeddings.std(axis=0) + 1e-10)
    
    # Compute distance matrix (1 - cosine similarity)
    similarity_matrix = cosine_similarity(normalized_embeddings)
    distance_matrix = 1 - similarity_matrix
    np.fill_diagonal(distance_matrix, 0)
    
    # Convert to condensed form for linkage
    condensed_distances = squareform(distance_matrix)
    
    # Perform hierarchical clustering
    linkage_matrix = linkage(condensed_distances, method='ward')
    
    # Get optimal leaf ordering for better visualization
    try:
        linkage_matrix = optimal_leaf_ordering(linkage_matrix, condensed_distances)
    except:
        # If optimal leaf ordering fails, use original linkage
        pass
    
    # Extract the order from the dendrogram
    dendro_dict = dendrogram(linkage_matrix, no_plot=True)
    leaf_order = dendro_dict['leaves']
    
    # Reorder categories according to clustering
    clustered_categories = [group_categories[i] for i in leaf_order]
    
    # Save dendrogram if requested
    if save_dendrogram and output_dir is not None and group_name is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        
        plt.figure(figsize=(12, 8))
        dendrogram(linkage_matrix, 
                  labels=group_categories,
                  leaf_rotation=90,
                  leaf_font_size=10)
        plt.title(f'Hierarchical Clustering Dendrogram: {group_name.upper()}\n({len(group_categories)} categories)',
                 fontsize=16, pad=20)
        plt.xlabel('Category', fontsize=12)
        plt.ylabel('Distance', fontsize=12)
        plt.tight_layout()
        
        # Save as PNG
        output_path_png = output_dir / f'dendrogram_{group_name}.png'
        plt.savefig(output_path_png, dpi=300, bbox_inches='tight', pad_inches=0.2)
        print(f"    Saved dendrogram to {output_path_png}")
        
        # Save as PDF
        output_path_pdf = output_dir / f'dendrogram_{group_name}.pdf'
        plt.savefig(output_path_pdf, bbox_inches='tight', pad_inches=0.2)
        print(f"    Saved dendrogram to {output_path_pdf}")
        
        plt.close()
    
    return clustered_categories, linkage_matrix

print("Helper functions loaded!")

In [8]:
# Load allowed categories if file exists
allowed_categories = None
if categories_file.exists():
    print(f"Loading categories from {categories_file}...")
    with open(categories_file, 'r') as f:
        allowed_categories = set(line.strip() for line in f if line.strip())
    print(f"Loaded {len(allowed_categories)} categories")
else:
    print(f"Categories file not found, using all categories")


Loading categories from ../../data/things_bv_overlap_categories_exclude_zero_precisions.txt...
Loaded 163 categories


## Load Embeddings by Age


In [9]:
def load_embeddings_by_age(embeddings_dir, allowed_categories=None, age_binning_strategy='exact', age_bin_size=3):
    """
    Load embeddings organized by subject, age_mo, and category.
    
    Returns:
        subject_age_embeddings: dict[subject_id][age_mo_bin][category] = embedding array
    """
    subject_age_embeddings = defaultdict(lambda: defaultdict(dict))
    
    # Get all category folders
    category_folders = [f for f in embeddings_dir.iterdir() if f.is_dir()]
    
    if allowed_categories:
        category_folders = [f for f in category_folders if f.name in allowed_categories]
    
    print(f"Loading embeddings from {len(category_folders)} categories...")
    
    for category_folder in tqdm(category_folders, desc="Loading categories"):
        category = category_folder.name
        
        # Get all embedding files in this category
        embedding_files = list(category_folder.glob("*.npy"))
        
        for emb_file in embedding_files:
            # Parse filename: {subject_id}_{age_mo}_month_level_avg.npy
            filename = emb_file.stem  # without .npy
            parts = filename.split('_')
            
            if len(parts) < 2:
                continue
            
            # Extract subject_id and age_mo
            subject_id = parts[0]
            age_mo = int(parts[1]) if parts[1].isdigit() else None
            
            if age_mo is None:
                continue
            
            # Apply age binning strategy
            if age_binning_strategy == 'binned':
                age_mo_bin = (age_mo // age_bin_size) * age_bin_size  # Round down to bin
            else:
                age_mo_bin = age_mo  # Use exact age
            
            try:
                embedding = np.load(emb_file)
                subject_age_embeddings[subject_id][age_mo_bin][category] = embedding
            except Exception as e:
                print(f"Error loading {emb_file}: {e}")
                continue
    
    return subject_age_embeddings

# Load embeddings
subject_age_embeddings = load_embeddings_by_age(
    embeddings_dir, 
    allowed_categories, 
    age_binning_strategy=age_binning_strategy,
    age_bin_size=age_bin_size
)

print(f"\nLoaded embeddings for {len(subject_age_embeddings)} subjects")

# Show age bin distribution
all_age_bins = set()
for subject_id, age_data in subject_age_embeddings.items():
    all_age_bins.update(age_data.keys())

print(f"Age bins found: {sorted(all_age_bins)}")
print(f"Age range: {min(all_age_bins)} to {max(all_age_bins)} months")


Loading embeddings from 163 categories...


Loading categories: 100%|██████████| 163/163 [00:01<00:00, 119.78it/s]



Loaded embeddings for 32 subjects
Age bins found: [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37]
Age range: 6 to 37 months


## Normalize Embeddings

Before computing RDMs, we normalize embeddings using z-score normalization (mean=0, std=1) for each age bin to ensure fair comparisons.


In [10]:
# Normalize embeddings within each subject across all age bins (z-score normalization)
# This normalizes all embeddings for a subject together, allowing comparison across age bins
print("Normalizing embeddings within each subject across all age bins...")
subject_age_embeddings_normalized = {}

for subject_id, age_data in tqdm(subject_age_embeddings.items(), desc="Normalizing"):
    # Collect all embeddings for this subject across all age bins
    all_embeddings_list = []
    embedding_to_age_cat = []  # Track which (age, category) each embedding belongs to
    
    for age_mo, categories in age_data.items():
        for cat, embedding in categories.items():
            all_embeddings_list.append(embedding.flatten())
            embedding_to_age_cat.append((age_mo, cat))
    
    if len(all_embeddings_list) == 0:
        continue
    
    # Stack all embeddings for this subject
    all_embeddings_matrix = np.array(all_embeddings_list)
    
    # Compute global mean and std across all embeddings for this subject
    subject_mean = all_embeddings_matrix.mean(axis=0)
    subject_std = all_embeddings_matrix.std(axis=0) + 1e-10  # Add small epsilon to avoid division by zero
    
    # Normalize all embeddings using subject-level statistics
    normalized_embeddings_matrix = (all_embeddings_matrix - subject_mean) / subject_std
    
    # Store normalized embeddings back by age and category
    subject_age_embeddings_normalized[subject_id] = {}
    for i, (age_mo, cat) in enumerate(embedding_to_age_cat):
        if age_mo not in subject_age_embeddings_normalized[subject_id]:
            subject_age_embeddings_normalized[subject_id][age_mo] = {}
        subject_age_embeddings_normalized[subject_id][age_mo][cat] = normalized_embeddings_matrix[i]

print(f"Normalized embeddings for {len(subject_age_embeddings_normalized)} subjects")
print("  Note: Normalized within each subject across all age bins, allowing comparison across development")
def compute_rdm_for_age_bin(age_embeddings_dict, categories_list):
    """
    Compute RDM for a single age bin.
    
    Args:
        age_embeddings_dict: dict[category] = embedding array (should be normalized)
        categories_list: list of categories to include (in order)
    
    Returns:
        rdm: numpy array of shape (n_categories, n_categories) or None
        available_categories: list of categories actually present
    """
    # Filter to categories that exist for this age bin
    available_categories = [cat for cat in categories_list if cat in age_embeddings_dict]
    
    if len(available_categories) < min_categories_per_age_bin:
        return None, available_categories
    
    # Build embedding matrix (already normalized)
    # Flatten each embedding to ensure 1D (in case they have shape (1, 512) instead of (512,))
    embedding_matrix = np.array([age_embeddings_dict[cat].flatten() for cat in available_categories])
    
    # Ensure 2D shape: (n_categories, embedding_dim)
    if embedding_matrix.ndim != 2:
        raise ValueError(f"Expected 2D embedding matrix, got shape {embedding_matrix.shape}")
    
    # Compute cosine similarity
    similarity_matrix = cosine_similarity(embedding_matrix)
    
    # Convert to distance (RDM)
    distance_matrix = 1 - similarity_matrix
    np.fill_diagonal(distance_matrix, 0)  # Ensure diagonal is 0
    
    # Make symmetric (in case of numerical errors)
    distance_matrix = (distance_matrix + distance_matrix.T) / 2
    
    return distance_matrix, available_categories

# Get all unique categories across all subjects and ages
all_categories = set()
for subject_id, age_data in subject_age_embeddings_normalized.items():
    for age_mo, categories in age_data.items():
        all_categories.update(categories.keys())

all_categories = sorted(list(all_categories))
print(f"Total unique categories across all subjects and ages: {len(all_categories)}")

# Compute RDMs for each subject at each age bin using normalized embeddings
subject_age_rdms = {}
subject_age_rdm_categories = {}

for subject_id, age_data in tqdm(subject_age_embeddings_normalized.items(), desc="Computing RDMs"):
    subject_age_rdms[subject_id] = {}
    subject_age_rdm_categories[subject_id] = {}
    
    for age_mo, categories in age_data.items():
        rdm, available_cats = compute_rdm_for_age_bin(categories, all_categories)
        
        if rdm is not None:
            subject_age_rdms[subject_id][age_mo] = rdm
            subject_age_rdm_categories[subject_id][age_mo] = available_cats
    
    # Filter out subjects with too few age bins
    if len(subject_age_rdms[subject_id]) < min_age_bins_per_subject:
        del subject_age_rdms[subject_id]
        del subject_age_rdm_categories[subject_id]

print(f"\nComputed RDMs for {len(subject_age_rdms)} subjects")
print(f"  (Excluded subjects with < {min_age_bins_per_subject} age bins with sufficient data)")

# Show distribution of age bins per subject
age_bin_counts = [len(age_rdms) for age_rdms in subject_age_rdms.values()]
print(f"\nAge bins per subject:")
print(f"  Min: {min(age_bin_counts) if age_bin_counts else 0}")
print(f"  Max: {max(age_bin_counts) if age_bin_counts else 0}")
print(f"  Mean: {np.mean(age_bin_counts):.1f}" if age_bin_counts else "  Mean: 0")
print(f"  Median: {np.median(age_bin_counts):.1f}" if age_bin_counts else "  Median: 0")


Normalizing embeddings per subject and age bin...


Normalizing: 100%|██████████| 32/32 [00:00<00:00, 414.53it/s]


Normalized embeddings for 32 subjects
  Note: Each age bin is normalized independently, focusing on relative structure within that age bin
Total unique categories across all subjects and ages: 163


Computing RDMs: 100%|██████████| 32/32 [00:00<00:00, 242.70it/s]


Computed RDMs for 26 subjects
  (Excluded subjects with < 2 age bins with sufficient data)

Age bins per subject:
  Min: 2
  Max: 20
  Mean: 10.0
  Median: 11.0





In [None]:
## Organize Categories and Apply Hierarchical Clustering

# Load category types for organization
if cdi_path.exists():
    category_types = load_category_types(cdi_path)
else:
    print(f"Warning: CDI path {cdi_path} not found. Skipping category organization.")
    category_types = {}

# Get all unique categories across all subjects and ages (needed for organization)
all_categories = set()
for subject_id, age_data in subject_age_embeddings_normalized.items():
    for age_mo, categories in age_data.items():
        all_categories.update(categories.keys())

all_categories = sorted(list(all_categories))
print(f"Total unique categories across all subjects and ages: {len(all_categories)}")

# Organize categories by broad types and apply hierarchical clustering
print("\nOrganizing categories by type and applying hierarchical clustering...")

# Get a representative set of embeddings for clustering (average across all subjects and ages)
representative_embeddings = {}
for cat in all_categories:
    cat_embeddings = []
    for subject_id, age_data in subject_age_embeddings_normalized.items():
        for age_mo, categories in age_data.items():
            if cat in categories:
                cat_embeddings.append(categories[cat])
    if len(cat_embeddings) > 0:
        # Average across all subjects and ages for this category
        representative_embeddings[cat] = np.mean(cat_embeddings, axis=0)

# Organize by type
organized = {
    'animals': [],
    'bodyparts': [],
    'big_objects': [],
    'small_objects': [],
    'others': []
}

for cat in all_categories:
    if cat not in category_types:
        organized['others'].append(cat)
        continue
    
    types = category_types[cat]
    if types['is_animate']:
        organized['animals'].append(cat)
    elif types['is_bodypart']:
        organized['bodyparts'].append(cat)
    elif types['is_big']:
        organized['big_objects'].append(cat)
    elif types['is_small']:
        organized['small_objects'].append(cat)
    else:
        organized['others'].append(cat)

print(f"  Organized into: {len(organized['animals'])} animals, {len(organized['bodyparts'])} bodyparts, "
      f"{len(organized['big_objects'])} big objects, {len(organized['small_objects'])} small objects, "
      f"{len(organized['others'])} others")

# Apply hierarchical clustering within each group
if use_clustering:
    print("\nApplying hierarchical clustering within groups...")
    for key in organized:
        if len(organized[key]) > 1:
            # Filter to categories that have representative embeddings
            group_cats = [cat for cat in organized[key] if cat in representative_embeddings]
            if len(group_cats) > 1:
                print(f"  Clustering {key} ({len(group_cats)} categories)...")
                organized[key], _ = cluster_categories_within_group(
                    group_cats,
                    representative_embeddings,
                    save_dendrogram=save_dendrograms,
                    output_dir=output_dir,
                    group_name=key
                )
            else:
                organized[key] = group_cats
        else:
            organized[key] = [cat for cat in organized[key] if cat in representative_embeddings]
else:
    for key in organized:
        organized[key] = sorted([cat for cat in organized[key] if cat in representative_embeddings])

# Create ordered list of categories
ordered_categories = (
    organized['animals'] +
    organized['bodyparts'] +
    organized['big_objects'] +
    organized['small_objects'] +
    organized['others']
)

print(f"\nFinal ordered category list: {len(ordered_categories)} categories")

In [None]:
# Age binning for visualization# Group ages into bins to reduce number of RDMs per subjectdef create_age_bins(ages, bin_size=3):    """    Create age bins of specified size.        Args:        ages: List of age values in months        bin_size: Size of each bin in months (default: 3)        Returns:        Dictionary mapping age_bin -> list of ages in that bin    """"    age_bins = {}    for age in sorted(ages):        # Round down to nearest bin        age_bin = (age // bin_size) * bin_size        if age_bin not in age_bins:            age_bins[age_bin] = []        age_bins[age_bin].append(age)    return age_binsdef aggregate_rdm_for_age_bin(rdms_dict, ages_in_bin, categories_dict):    """    Aggregate RDMs for multiple ages in a bin by averaging.        Args:        rdms_dict: Dictionary mapping age -> RDM matrix        ages_in_bin: List of ages to aggregate        categories_dict: Dictionary mapping age -> list of categories        Returns:        Aggregated RDM and common categories    """    if len(ages_in_bin) == 0:        return None, []        # Find common categories across all ages in bin    common_cats = set(categories_dict[ages_in_bin[0]])    for age in ages_in_bin[1:]:        common_cats = common_cats & set(categories_dict[age])        common_cats = sorted(list(common_cats))        if len(common_cats) < min_categories_per_age_bin:        return None, common_cats        # Get indices for common categories in each RDM    rdms_to_aggregate = []    for age in ages_in_bin:        rdm = rdms_dict[age]        cats = categories_dict[age]        indices = [cats.index(cat) for cat in common_cats]        rdm_subset = rdm[np.ix_(indices, indices)]        rdms_to_aggregate.append(rdm_subset)        # Average the RDMs    aggregated_rdm = np.mean(rdms_to_aggregate, axis=0)        return aggregated_rdm, common_cats# Age binning configurationage_bin_size = 3  # Group ages into 3-month bins (e.g., 6-8, 9-11, 12-14, etc.)use_age_binning = True  # Set to False to use exact agesprint(f"Age binning: {'Enabled' if use_age_binning else 'Disabled'}")if use_age_binning:    print(f"  Bin size: {age_bin_size} months")

In [None]:

# Reorganize each subject's RDM at each age bin according to the new ordering
print("\nReorganizing individual subject RDMs according to new category ordering...")
subject_age_rdms_reorganized = {}
subject_age_rdm_categories_reorganized = {}
subject_age_group_boundaries = {}  # Store group boundaries for visual separators

for subject_id in tqdm(subject_age_rdms.keys(), desc="Reorganizing RDMs"):
    subject_age_rdms_reorganized[subject_id] = {}    subject_age_rdm_categories_reorganized[subject_id] = {}    subject_age_group_boundaries[subject_id] = {}        # Get original age RDMs for this subject    original_age_rdms = subject_age_rdms[subject_id]    original_age_categories = subject_age_rdm_categories[subject_id]        # Apply age binning if enabled    if use_age_binning:        # Get all ages for this subject        subject_ages = sorted(list(original_age_rdms.keys()))        age_bins = create_age_bins(subject_ages, age_bin_size)                # Aggregate RDMs within each bin        binned_rdms = {}        binned_categories = {}        for age_bin, ages_in_bin in age_bins.items():            rdms_for_bin = {age: original_age_rdms[age] for age in ages_in_bin}            cats_for_bin = {age: original_age_categories[age] for age in ages_in_bin}            agg_rdm, agg_cats = aggregate_rdm_for_age_bin(rdms_for_bin, ages_in_bin, cats_for_bin)            if agg_rdm is not None:                # Use bin label (e.g., "6-8" for ages 6,7,8)                bin_label = f"{min(ages_in_bin)}-{max(ages_in_bin)}"                binned_rdms[bin_label] = agg_rdm                binned_categories[bin_label] = agg_cats                # Use binned version for reorganization        age_rdms_to_process = binned_rdms        age_categories_to_process = binned_categories    else:        age_rdms_to_process = original_age_rdms        age_categories_to_process = original_age_categories        # Reorganize each age bin's RDM    for age_mo, rdm in age_rdms_to_process.items():        available_cats = age_categories_to_process[age_mo]        
        # Get the ordered list of categories for this age bin (subset of ordered_categories)
        subject_age_ordered_cats = [cat for cat in ordered_categories if cat in available_cats]
        
        # Create new indices for reorganized RDM
        new_indices = [available_cats.index(cat) for cat in subject_age_ordered_cats]
        
        # Reorganize the RDM
        rdm_reorganized = rdm[np.ix_(new_indices, new_indices)]
        
        # Compute group boundaries for this age bin
        group_boundaries = []
        current_idx = 0
        for group_name in ['animals', 'bodyparts', 'big_objects', 'small_objects', 'others']:
            group_cats = [cat for cat in organized[group_name] if cat in subject_age_ordered_cats]
            if len(group_cats) > 0:
                group_start = current_idx
                group_end = current_idx + len(group_cats)
                group_boundaries.append({
                    'name': group_name,
                    'start': group_start,
                    'end': group_end,
                    'categories': group_cats
                })
                current_idx = group_end
        
        subject_age_rdms_reorganized[subject_id][age_mo] = rdm_reorganized
        subject_age_rdm_categories_reorganized[subject_id][age_mo] = subject_age_ordered_cats
        subject_age_group_boundaries[subject_id][age_mo] = group_boundaries

# Update the main dictionaries
subject_age_rdms = subject_age_rdms_reorganized
subject_age_rdm_categories = subject_age_rdm_categories_reorganized

print(f"Reorganized RDMs for {len(subject_age_rdms)} subjects")

## Compute RDMs for Each Subject at Each Age Bin


In [11]:
# Save RDMs for each subject-age combination
print("Saving developmental trajectory RDMs...")

for subject_id, age_rdms in tqdm(subject_age_rdms.items(), desc="Saving RDMs"):
    subject_output_dir = output_dir / subject_id
    subject_output_dir.mkdir(exist_ok=True, parents=True)
    
    for age_mo, rdm in age_rdms.items():
        categories = subject_age_rdm_categories[subject_id][age_mo]
        
        # Save as numpy array
        np.save(subject_output_dir / f"rdm_age_{age_mo}.npy", rdm)
        
        # Save as CSV with category labels
        rdm_df = pd.DataFrame(rdm, index=categories, columns=categories)
        rdm_df.to_csv(subject_output_dir / f"rdm_age_{age_mo}.csv")
        
        # Save metadata
        metadata = {
            'subject_id': subject_id,
            'age_mo': age_mo,
            'n_categories': len(categories),
            'categories': categories,
            'mean_distance': float(rdm.mean()),
            'std_distance': float(rdm.std())
        }
        
        metadata_df = pd.DataFrame([metadata])
        metadata_df.to_csv(subject_output_dir / f"metadata_age_{age_mo}.csv", index=False)

        # Create and save individual dendrogram for this age bin
        if len(categories) > 1:
            # Get embeddings for this age bin's categories
            age_embeddings = {cat: subject_age_embeddings_normalized[subject_id][age_mo][cat] 
                             for cat in categories if cat in subject_age_embeddings_normalized[subject_id][age_mo]}
            
            if len(age_embeddings) > 1:
                # Build embedding matrix
                embedding_matrix = np.array([age_embeddings[cat].flatten() for cat in categories])
                
                # Normalize embeddings
                normalized_embeddings = (embedding_matrix - embedding_matrix.mean(axis=0)) / (embedding_matrix.std(axis=0) + 1e-10)
                
                # Compute distance matrix
                similarity_matrix = cosine_similarity(normalized_embeddings)
                distance_matrix = 1 - similarity_matrix
                np.fill_diagonal(distance_matrix, 0)
                
                # Convert to condensed form for linkage
                condensed_distances = squareform(distance_matrix)
                
                # Perform hierarchical clustering
                linkage_matrix = linkage(condensed_distances, method='ward')
                
                # Get optimal leaf ordering
                try:
                    linkage_matrix = optimal_leaf_ordering(linkage_matrix, condensed_distances)
                except:
                    pass
                
                # Create dendrogram
                plt.figure(figsize=(max(16, len(categories) * 0.5), 10))
                dendrogram(linkage_matrix, 
                          labels=categories,
                          leaf_rotation=90,
                          leaf_font_size=max(8, min(14, 200 // len(categories))))
                plt.title(f'Dendrogram: {subject_id} Age {age_mo} months\n({len(categories)} categories)',
                         fontsize=16, pad=20)
                plt.xlabel('Category', fontsize=14)
                plt.ylabel('Distance', fontsize=14)
                plt.tight_layout()
                
                # Save dendrogram
                dendrogram_dir = subject_output_dir / "dendrograms"
                dendrogram_dir.mkdir(exist_ok=True, parents=True)
                dendrogram_path = dendrogram_dir / f"dendrogram_age_{age_mo}.png"
                plt.savefig(dendrogram_path, dpi=300, bbox_inches='tight', pad_inches=0.2)
                plt.close()

print(f"\nSaved RDMs to {output_dir}")


Saving developmental trajectory RDMs...


Saving RDMs: 100%|██████████| 26/26 [00:04<00:00,  6.17it/s]


Saved RDMs to developmental_trajectory_rdms





## Analyze Developmental Trajectories


In [12]:
def compute_rdm_correlation(rdm1, rdm2, categories1, categories2):
    """
    Compute correlation between two RDMs.
    Only uses categories present in both RDMs.
    """
    # Find common categories
    common_categories = sorted(list(set(categories1) & set(categories2)))
    
    if len(common_categories) < 2:
        return np.nan, len(common_categories)
    
    # Get indices for common categories
    idx1 = [categories1.index(cat) for cat in common_categories]
    idx2 = [categories2.index(cat) for cat in common_categories]
    
    # Extract upper triangle (excluding diagonal) for both RDMs
    rdm1_subset = rdm1[np.ix_(idx1, idx1)]
    rdm2_subset = rdm2[np.ix_(idx2, idx2)]
    
    # Get upper triangle
    mask = np.triu(np.ones_like(rdm1_subset, dtype=bool), k=1)
    rdm1_flat = rdm1_subset[mask]
    rdm2_flat = rdm2_subset[mask]
    
    # Compute Spearman correlation (more robust to outliers)
    if len(rdm1_flat) > 0:
        corr, _ = spearmanr(rdm1_flat, rdm2_flat)
        return corr, len(common_categories)
    else:
        return np.nan, len(common_categories)

# Compute RDM correlations across age bins for each subject
trajectory_data = []

for subject_id, age_rdms in tqdm(subject_age_rdms.items(), desc="Analyzing trajectories"):
    ages = sorted(age_rdms.keys())
    
    if len(ages) < 2:
        continue
    
    # Compute pairwise correlations between consecutive age bins
    for i in range(len(ages) - 1):
        age1 = ages[i]
        age2 = ages[i + 1]
        
        rdm1 = age_rdms[age1]
        rdm2 = age_rdms[age2]
        cats1 = subject_age_rdm_categories[subject_id][age1]
        cats2 = subject_age_rdm_categories[subject_id][age2]
        
        corr, n_common = compute_rdm_correlation(rdm1, rdm2, cats1, cats2)
        
        trajectory_data.append({
            'subject_id': subject_id,
            'age1': age1,
            'age2': age2,
            'age_diff': age2 - age1,
            'rdm_correlation': corr,
            'n_common_categories': n_common,
            'n_categories_age1': len(cats1),
            'n_categories_age2': len(cats2)
        })

trajectory_df = pd.DataFrame(trajectory_data)
trajectory_df.to_csv(output_dir / "trajectory_correlations.csv", index=False)

print(f"\nTrajectory analysis:")
print(f"  Total age transitions analyzed: {len(trajectory_df)}")
print(f"  Mean RDM correlation: {trajectory_df['rdm_correlation'].mean():.3f}")
print(f"  Std RDM correlation: {trajectory_df['rdm_correlation'].std():.3f}")
print(f"\nSaved trajectory correlations to {output_dir / 'trajectory_correlations.csv'}")


Analyzing trajectories: 100%|██████████| 26/26 [00:00<00:00, 66.93it/s]


Trajectory analysis:
  Total age transitions analyzed: 235
  Mean RDM correlation: 0.627
  Std RDM correlation: 0.083

Saved trajectory correlations to developmental_trajectory_rdms/trajectory_correlations.csv





## Visualize Developmental Trajectories


In [13]:
# Create grid layout RDM trajectory overview for each subjectprint("Creating grid layout RDM trajectory visualizations for all subjects...")for subject_id in tqdm(subject_age_rdms.keys(), desc="Creating trajectory grids"):    age_rdms = subject_age_rdms[subject_id]    ages = sorted(age_rdms.keys())        if len(ages) == 0:        continue        # Determine grid layout (aim for roughly square grid)    n_ages = len(ages)    n_cols = int(np.ceil(np.sqrt(n_ages)))    n_rows = int(np.ceil(n_ages / n_cols))        # Create figure with appropriate size    fig_size_per_plot = 4    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * fig_size_per_plot, n_rows * fig_size_per_plot))        # Flatten axes for easier indexing    if n_ages == 1:        axes = [axes]    elif n_rows == 1:        axes = axes if isinstance(axes, list) else [axes]    else:        axes = axes.flatten()        # Find global min/max for consistent color scale    all_rdm_values = []    for rdm in age_rdms.values():        all_rdm_values.extend(rdm.flatten())    vmin = np.percentile(all_rdm_values, 1)    vmax = np.percentile(all_rdm_values, 99)        for idx, age_mo in enumerate(ages):        rdm = age_rdms[age_mo]        categories = subject_age_rdm_categories[subject_id][age_mo]        group_boundaries = subject_age_group_boundaries[subject_id][age_mo]                ax = axes[idx]                # Determine font sizes based on number of categories        n_cats = len(categories)        if n_cats <= 50:            label_fontsize = 10            tick_fontsize = 14        elif n_cats <= 100:            label_fontsize = 8            tick_fontsize = 12        else:            label_fontsize = 6            tick_fontsize = 10                im = ax.imshow(rdm, cmap="viridis", aspect="auto", vmin=vmin, vmax=vmax)                # Add visual separators between category groups        for boundary in group_boundaries:            # Draw vertical line            if boundary["start"] > 0:                ax.axvline(x=boundary["start"] - 0.5, color="white", linewidth=1.5, linestyle="--", alpha=0.7)            # Draw horizontal line            if boundary["start"] > 0:                ax.axhline(y=boundary["start"] - 0.5, color="white", linewidth=1.5, linestyle="--", alpha=0.7)                # Set category names as axis labels (show every Nth label to avoid crowding)        n_cats = len(categories)        if n_cats > 50:            # Show every 5th label            tick_step = max(1, n_cats // 20)            tick_positions = list(range(0, n_cats, tick_step))            tick_labels = [categories[i] if i < len(categories) else '' for i in tick_positions]        else:            tick_positions = range(len(categories))            tick_labels = categories                ax.set_xticks(tick_positions)        ax.set_yticks(tick_positions)        ax.set_xticklabels(tick_labels, rotation=90, ha="right", fontsize=tick_fontsize)        ax.set_yticklabels(tick_labels, fontsize=tick_fontsize)                ax.set_title(f"Age {age_mo} months\n({n_cats} cats)", fontsize=10, pad=5)                # Add colorbar        plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)        # Hide unused subplots    for idx in range(n_ages, len(axes)):        axes[idx].axis('off')        plt.suptitle(f"Developmental Trajectory: {subject_id}\n({n_ages} age bins)", fontsize=14, y=0.995)    plt.tight_layout(rect=[0, 0, 1, 0.99])    plt.savefig(output_dir / f"trajectory_grid_{subject_id}.png", dpi=200, bbox_inches='tight')    plt.close()print(f"\nSaved grid layout trajectory visualizations for {len(subject_age_rdms)} subjects")

Saved trajectory visualizations for 3 sample subjects


## Plot RDM Stability Across Development


In [14]:
# Plot RDM correlation as a function of age difference
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# RDM correlation vs age difference
axes[0].scatter(trajectory_df['age_diff'], trajectory_df['rdm_correlation'], alpha=0.6)
axes[0].set_xlabel('Age Difference (months)')
axes[0].set_ylabel('RDM Correlation (Spearman)')
axes[0].set_title('RDM Stability vs Age Gap')
axes[0].grid(True, alpha=0.3)

# RDM correlation vs mean age
trajectory_df['mean_age'] = (trajectory_df['age1'] + trajectory_df['age2']) / 2
axes[1].scatter(trajectory_df['mean_age'], trajectory_df['rdm_correlation'], alpha=0.6)
axes[1].set_xlabel('Mean Age (months)')
axes[1].set_ylabel('RDM Correlation (Spearman)')
axes[1].set_title('RDM Stability vs Age')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(output_dir / "rdm_stability_analysis.png", dpi=150, bbox_inches='tight')
print(f"Saved RDM stability analysis to {output_dir / 'rdm_stability_analysis.png'}")
plt.close()


Saved RDM stability analysis to developmental_trajectory_rdms/rdm_stability_analysis.png


## Summary Statistics


In [15]:
# Create summary statistics
summary_data = []

for subject_id, age_rdms in subject_age_rdms.items():
    ages = sorted(age_rdms.keys())
    
    for age_mo in ages:
        rdm = age_rdms[age_mo]
        categories = subject_age_rdm_categories[subject_id][age_mo]
        
        summary_data.append({
            'subject_id': subject_id,
            'age_mo': age_mo,
            'n_categories': len(categories),
            'mean_distance': float(rdm.mean()),
            'std_distance': float(rdm.std()),
            'min_distance': float(rdm[rdm > 0].min()) if (rdm > 0).any() else np.nan,
            'max_distance': float(rdm.max())
        })

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv(output_dir / "summary_statistics.csv", index=False)

print("Summary statistics:")
print(summary_df.describe())
print(f"\nSaved summary to {output_dir / 'summary_statistics.csv'}")


Summary statistics:
           age_mo  n_categories  mean_distance  std_distance  min_distance  \
count  261.000000    261.000000     261.000000    261.000000    261.000000   
mean    16.743295    130.164751       0.996279      0.254649      0.046867   
std      5.773880     15.093544       0.000960      0.009529      0.027013   
min      6.000000     60.000000       0.993074      0.228317      0.003778   
25%     13.000000    122.000000       0.995709      0.248605      0.027431   
50%     16.000000    134.000000       0.996341      0.254671      0.041594   
75%     20.000000    142.000000       0.996917      0.260849      0.063201   
max     37.000000    157.000000       0.998314      0.278531      0.183422   

       max_distance  
count    261.000000  
mean       1.612466  
std        0.037372  
min        1.498946  
25%        1.588435  
50%        1.614097  
75%        1.636053  
max        1.696628  

Saved summary to developmental_trajectory_rdms/summary_statistics.csv
