In [None]:
from pathlib import Path
import pandas as pd
import torch
import re
import numpy as np

# Paths -- replace every bracketed placeholder with a real location before running.
obj_dir = Path('[path to mesh folder: minimal_scaled_obj_files]')
metadata_csv = Path('[path to metadata file]')
# Distinct placeholder so the volume CSV is not confused with the metadata CSV;
# the two are loaded into separate DataFrames further down.
volume_csv = Path('[path to volume file]')
# The label dictionary is written into the mesh folder itself.
output_path = obj_dir / 'labels.pt'

print(f"OBJ directory: {obj_dir}")
print(f"Metadata CSV: {metadata_csv}")
print(f"Volume CSV: {volume_csv}")
print(f"Output path: {output_path}")

In [None]:
# Read both CSV tables, then preview their shape, column names and first rows.
metadata_df = pd.read_csv(metadata_csv)
volume_df = pd.read_csv(volume_csv)

named_frames = [("Metadata", metadata_df), ("Volume data", volume_df)]

for title, frame in named_frames:
    print(f"{title} shape: {frame.shape}")

for title, frame in named_frames:
    print(f"\n{title} columns:", frame.columns.tolist())

for title, frame in named_frames:
    print(f"\n{title} sample:")
    print(frame.head())

In [None]:
# Collect the names of every *.obj file whose name contains "combined"
# (case-insensitive), in sorted order.
obj_files = sorted(
    mesh_path.name
    for mesh_path in obj_dir.glob('*.obj')
    if 'combined' in mesh_path.name.lower()
)
print(f"Found {len(obj_files)} combined hippocampus OBJ files")
print("\nSample filenames:")
# Slicing caps the preview at five names and copes with shorter lists.
for sample_name in obj_files[:5]:
    print(sample_name)

In [None]:
# Extract subject_id and image_id from filename
# Example: ADNI_002_S_0295_MR_Hippocampal_Mask_Hi_20080228111448800_S13408_I93328_combined.obj
# Pattern: ADNI_{subject_id}_MR_..._{series_id}_{image_id}_combined.obj

def parse_filename(filename):
    """
    Pull the subject id and image id out of an ADNI mesh filename.

    The name is searched for ``ADNI_<digits>_S_<digits>_..._I<digits>_combined.obj``.

    Returns:
        (subject_id, image_id) as strings, where image_id carries no leading
        ``I``; (None, None) when the name does not match the pattern.
    """
    found = re.search(r'ADNI_(\d+_S_\d+)_.*_I(\d+)_combined\.obj', filename)
    if found is None:
        return None, None

    # Group 1 is the subject id, group 2 the digits following "_I".
    subject_id, image_id = found.groups()
    return subject_id, image_id

# Smoke-test the parser on up to five discovered filenames.
print("Testing filename parsing:")
for fname in obj_files[:5]:
    subj, img = parse_filename(fname)
    print(f"{fname} -> subject: {subj}, image: {img}")

In [None]:
# Inspect which columns are available for joining meshes to the two tables.
# The label-building loop filters on 'subject_id' in both tables and, when
# present, on 'image_data_id' (metadata) and 'image_uid' (volume data).

print("Checking for matching columns...")

metadata_id_cols = [
    c for c in metadata_df.columns
    if any(token in c.lower() for token in ('image', 'id'))
]
volume_id_cols = [
    c for c in volume_df.columns
    if any(token in c.lower() for token in ('image', 'id'))
]
volume_size_cols = [
    c for c in volume_df.columns
    if any(token in c.lower() for token in ('hippo', 'volume'))
]

print(f"Metadata columns with 'image' or 'id': {metadata_id_cols}")
print(f"Volume columns with 'image' or 'id': {volume_id_cols}")
print(f"Volume columns with 'hippo' or 'volume': {volume_size_cols}")

In [None]:
# Accumulators for the label-building pass: the label dictionary itself plus
# three bookkeeping lists of filenames.
labels = dict()
missing_metadata, missing_volume, successfully_processed = [], [], []

# Diagnosis string -> binary code (CN=0, AD=1).
diagnosis_map = dict(CN=0, AD=1)

# Sex string -> numeric code (M=0, F=1).
sex_map = dict(M=0, F=1)

def normalize_image_id(value):
    """
    Reduce an image identifier to a bare string so ids from different sources compare equal.

    Args:
        value: The identifier in any form; it is converted with ``str()``.

    Returns:
        The identifier as a string with surrounding whitespace removed, one
        leading ``I`` dropped, and a trailing ``.0`` dropped when the rest is
        all digits.
    """
    s = str(value).strip()
    if s.startswith('I'):
        s = s[1:]
    # A numeric id that went through a float (e.g. 93328.0) stringifies as
    # '93328.0'; strip the artefact so it still equals the id parsed from a filename.
    if s.endswith('.0') and s[:-2].isdigit():
        s = s[:-2]
    return s

# Build one label tensor [diagnosis, age, sex, volume] per mesh file.
# Files that cannot be parsed, or whose subject is absent from the metadata
# table, are skipped; a missing volume is stored as NaN.

# Loop-invariant lookups, resolved once instead of once per file.
volume_col = [c for c in volume_df.columns if 'total' in c.lower() and 'hippo' in c.lower()]
has_image_data_id = 'image_data_id' in metadata_df.columns
has_image_uid = 'image_uid' in volume_df.columns

for fname in obj_files:
    base_name = fname.replace('.obj', '')
    subject_id, image_id = parse_filename(fname)

    if subject_id is None or image_id is None:
        print(f"Warning: Could not parse filename {fname}")
        continue

    image_id_norm = normalize_image_id(image_id)

    # Match with metadata: first by subject, then narrowed by image id when possible.
    subject_metadata = metadata_df[metadata_df['subject_id'] == subject_id]

    if len(subject_metadata) == 0:
        missing_metadata.append(fname)
        continue

    if has_image_data_id:
        image_metadata = subject_metadata[
            subject_metadata['image_data_id'].astype(str).map(normalize_image_id) == image_id_norm
        ]
        # Best effort: keep all of the subject's rows when no row matches the image id.
        if len(image_metadata) > 0:
            subject_metadata = image_metadata

    # Take the first matching row
    metadata_row = subject_metadata.iloc[0]

    # Extract diagnosis, age, sex
    diagnosis_str = metadata_row.get('diagnosis', 'MCI')  # Default to MCI if missing
    diagnosis = diagnosis_map.get(diagnosis_str, -1)  # -1 for MCI or unknown

    age = metadata_row.get('age', float('nan'))
    sex_str = metadata_row.get('gender', None)

    # Match with volume data. The image id is normalised the same way as for
    # the metadata table so an 'I' prefix or float formatting cannot break the join.
    volume_match = volume_df[volume_df['subject_id'] == subject_id]

    if has_image_uid:
        volume_match = volume_match[
            volume_match['image_uid'].astype(str).map(normalize_image_id) == image_id_norm
        ]

    # Treat None, NaN, blank and 'U' as unknown sex, and fall back to the volume table if it has a row.
    if sex_str is None or pd.isna(sex_str):
        sex_token = ''
    else:
        sex_token = str(sex_str).strip().upper()
    if sex_token in ('', 'U') and len(volume_match) > 0:
        sex_str = volume_match.iloc[0].get('subject_sex', 'U')

    sex = sex_map.get(str(sex_str).strip().upper(), -1)

    if len(volume_match) == 0:
        missing_volume.append(fname)
        volume = float('nan')
    else:
        volume_row = volume_match.iloc[0]
        # Use the first column whose name contains both 'total' and 'hippo'.
        if len(volume_col) > 0:
            volume = volume_row.get(volume_col[0], float('nan'))
        else:
            volume = float('nan')

    # Create label tensor: [diagnosis, age, sex, volume]
    label = torch.tensor([diagnosis, age, sex, volume], dtype=torch.float32)
    labels[base_name] = label
    successfully_processed.append(fname)

print("\nProcessing summary:")
print(f"Total OBJ files: {len(obj_files)}")
print(f"Successfully processed: {len(successfully_processed)}")
print(f"Missing metadata: {len(missing_metadata)}")
print(f"Missing volume data: {len(missing_volume)}")


In [None]:
# Preview at most the first ten entries of the label dictionary.
print("\nSample labels:")
for key, value in list(labels.items())[:10]:
    print(f"{key}: {value.tolist()} (diagnosis={value[0]}, age={value[1]}, sex={value[2]}, volume={value[3]})")

In [None]:
# Persist the label dictionary to disk and report how many entries were written.
torch.save(labels, output_path)
saved_count = len(labels)
print(f"\nSaved {saved_count} labels to {output_path}")

In [None]:
# Verification: read the file back and print the first few entries.
loaded_labels = torch.load(output_path)
print(f"\nVerification: Loaded {len(loaded_labels)} labels from {output_path}")

# Preview the first five keys in the order the dictionary stores them.
sample_keys = list(loaded_labels)[:5]
print("\nVerifying sample labels:")
for key in sample_keys:
    label = loaded_labels[key]
    diag_val, age_val, sex_val, vol_val = label[0], label[1], label[2], label[3]
    print(f"{key}: diagnosis={diag_val}, age={age_val}, sex={sex_val}, volume={vol_val}")

In [57]:
# Statistics over all loaded labels (columns: diagnosis, age, sex, volume).
all_labels_array = torch.stack(list(loaded_labels.values()))


def _finite_summary(column):
    """Return (min, max, mean, n_missing) over the finite entries of a 1-D tensor.

    The label-building loop stores NaN for a missing age or volume, and a
    single NaN would turn plain min/max/mean into NaN. Non-finite entries are
    therefore excluded and counted. When no finite entry exists the three
    statistics are NaN.
    """
    finite = column[torch.isfinite(column)]
    n_missing = column.numel() - finite.numel()
    if finite.numel() == 0:
        nan = float('nan')
        return nan, nan, nan, n_missing
    return finite.min().item(), finite.max().item(), finite.mean().item(), n_missing


diagnosis_col = all_labels_array[:, 0]
sex_col = all_labels_array[:, 2]
age_min, age_max, age_mean, age_missing = _finite_summary(all_labels_array[:, 1])
vol_min, vol_max, vol_mean, vol_missing = _finite_summary(all_labels_array[:, 3])

print("\nLabel statistics:")
print(f"Diagnosis distribution: CN={torch.sum(diagnosis_col == 0).item()}, AD={torch.sum(diagnosis_col == 1).item()}")
print(f"Age range: {age_min:.1f} - {age_max:.1f}")
print(f"Age mean: {age_mean:.1f}")
print(f"Sex distribution: M={torch.sum(sex_col == 0).item()}, F={torch.sum(sex_col == 1).item()}")
print(f"Volume range: {vol_min:.1f} - {vol_max:.1f}")
print(f"Volume mean: {vol_mean:.1f}")
# Only mention exclusions when there were any, so clean data prints the same lines as before.
if age_missing or vol_missing:
    print(f"Excluded non-finite entries: age={age_missing}, volume={vol_missing}")


Label statistics:
Diagnosis distribution: CN=477, AD=342
Age range: 55.0 - 91.0
Age mean: 76.1
Sex distribution: M=904, F=728
Volume range: 1439.0 - 5936.4
Volume mean: 3712.1


In [None]:
# Check that every file named in the train/test/val split JSONs has an entry
# in the loaded label dictionary.
import json

split_dir = Path('../examples/splits/splits_combined_hippocampus_ADNI_No_MCI')
train_split_file = split_dir / 'train_split_combined_hippocampus_adni_no_mci.json'
test_split_file = split_dir / 'test_split_combined_hippocampus_adni_no_mci.json'
val_split_file = split_dir / 'val_split_combined_hippocampus_adni_no_mci.json'

split_paths = (train_split_file, test_split_file, val_split_file)

if not all(split_path.exists() for split_path in split_paths):
    # All three files are required; a single missing one skips the check.
    print(f"\nSplit files not found at {split_dir}")
else:
    loaded_splits = []
    for split_path in split_paths:
        with open(split_path, 'r') as handle:
            loaded_splits.append(json.load(handle))
    train_files, test_files, val_files = loaded_splits

    all_split_files = train_files + test_files + val_files
    # Label keys are filenames with '.obj' removed, so strip it before the lookup.
    missing_in_labels = [
        split_name
        for split_name in all_split_files
        if split_name.replace('.obj', '') not in loaded_labels
    ]

    print("\nSplit file verification:")
    print(f"Total files in splits: {len(all_split_files)}")
    print(f"Files in splits missing labels: {len(missing_in_labels)}")

    if missing_in_labels:
        print("\nMissing labels for:")
        for split_name in missing_in_labels[:10]:
            print(f"  {split_name}")

In [None]:
# AD vs CN combined hippocampus volume summary and box plot
import math
import numpy as np
import matplotlib.pyplot as plt

loaded_labels = torch.load(output_path, map_location='cpu')

# Group finite volumes by diagnosis code; any other code (e.g. -1) is ignored.
vols_by_diag = {0: [], 1: []}  # 0=CN, 1=AD
for v in loaded_labels.values():
    diag = int(v[0].item())
    vol = float(v[3].item())
    if diag in vols_by_diag and math.isfinite(vol):
        vols_by_diag[diag].append(vol)

# np.mean([]) emits a RuntimeWarning; report NaN directly for an empty group.
cn_mean = np.mean(vols_by_diag[0]) if vols_by_diag[0] else float('nan')
ad_mean = np.mean(vols_by_diag[1]) if vols_by_diag[1] else float('nan')
print(f"CN mean volume: {cn_mean:.2f}")
print(f"AD mean volume: {ad_mean:.2f}")

fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot([vols_by_diag[0], vols_by_diag[1]], showmeans=True)
# Tick labels are set on the axis rather than via boxplot(labels=...), whose
# keyword is deprecated in recent matplotlib; this form works on old and new releases.
ax.set_xticklabels(['CN', 'AD'])
ax.set_title('Combined Hippocampus Volume (AD vs CN)')
ax.set_ylabel('Volume')
ax.grid(axis='y', linestyle='--', alpha=0.3)
plt.show()


In [None]:
# AD vs CN combined hippocampus volume summary and box plot
# NOTE(review): this cell duplicates the earlier AD-vs-CN box-plot cell; one copy is sufficient.
import math
import numpy as np
import matplotlib.pyplot as plt

loaded_labels = torch.load(output_path, map_location='cpu')

# Group finite volumes by diagnosis code; any other code (e.g. -1) is ignored.
vols_by_diag = {0: [], 1: []}  # 0=CN, 1=AD
for v in loaded_labels.values():
    diag = int(v[0].item())
    vol = float(v[3].item())
    if diag in vols_by_diag and math.isfinite(vol):
        vols_by_diag[diag].append(vol)

# np.mean([]) emits a RuntimeWarning; report NaN directly for an empty group.
cn_mean = np.mean(vols_by_diag[0]) if vols_by_diag[0] else float('nan')
ad_mean = np.mean(vols_by_diag[1]) if vols_by_diag[1] else float('nan')
print(f"CN mean volume: {cn_mean:.2f}")
print(f"AD mean volume: {ad_mean:.2f}")

fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot([vols_by_diag[0], vols_by_diag[1]], showmeans=True)
# Tick labels are set on the axis rather than via boxplot(labels=...), whose
# keyword is deprecated in recent matplotlib; this form works on old and new releases.
ax.set_xticklabels(['CN', 'AD'])
ax.set_title('Combined Hippocampus Volume (AD vs CN)')
ax.set_ylabel('Volume')
ax.grid(axis='y', linestyle='--', alpha=0.3)
plt.show()


In [None]:
# AD vs CN combined hippocampus volume summary and box plot
# NOTE(review): this cell duplicates the earlier AD-vs-CN box-plot cells; one copy is sufficient.
import math
import numpy as np
import matplotlib.pyplot as plt

loaded_labels = torch.load(output_path, map_location='cpu')

# Group finite volumes by diagnosis code; any other code (e.g. -1) is ignored.
vols_by_diag = {0: [], 1: []}  # 0=CN, 1=AD
for v in loaded_labels.values():
    diag = int(v[0].item())
    vol = float(v[3].item())
    if diag in vols_by_diag and math.isfinite(vol):
        vols_by_diag[diag].append(vol)

# np.mean([]) emits a RuntimeWarning; report NaN directly for an empty group.
cn_mean = np.mean(vols_by_diag[0]) if vols_by_diag[0] else float('nan')
ad_mean = np.mean(vols_by_diag[1]) if vols_by_diag[1] else float('nan')
print(f"CN mean volume: {cn_mean:.2f}")
print(f"AD mean volume: {ad_mean:.2f}")

fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot([vols_by_diag[0], vols_by_diag[1]], showmeans=True)
# Tick labels are set on the axis rather than via boxplot(labels=...), whose
# keyword is deprecated in recent matplotlib; this form works on old and new releases.
ax.set_xticklabels(['CN', 'AD'])
ax.set_title('Combined Hippocampus Volume (AD vs CN)')
ax.set_ylabel('Volume')
ax.grid(axis='y', linestyle='--', alpha=0.3)
plt.show()


In [None]:
import numpy as np

def count_outliers(vals):
    """
    Flag values lying outside the 1.5 * IQR fences of the sample.

    Args:
        vals: Sequence of numbers; converted with ``np.asarray``.

    Returns:
        (count, outliers, (lo, hi)) where ``count`` is the number of flagged
        values, ``outliers`` is the array of those values and ``lo``/``hi``
        are the lower and upper fences. For empty input the result is
        ``(0, <empty array>, (nan, nan))``.
    """
    vals = np.asarray(vals)
    # Quartiles are undefined for an empty sample; return a well-formed empty
    # result instead of passing an empty array to np.percentile.
    if vals.size == 0:
        return 0, vals, (float('nan'), float('nan'))

    q1, q3 = np.percentile(vals, [25, 75])
    iqr = q3 - q1
    lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    mask = (vals < lo) | (vals > hi)
    return mask.sum(), vals[mask], (lo, hi)

# Run the IQR outlier check on each diagnosis group (0=CN, 1=AD).
(cn_count, cn_vals, cn_bounds), (ad_count, ad_vals, ad_bounds) = (
    count_outliers(vols_by_diag[code]) for code in (0, 1)
)

for tag, n_flagged, fences in (("CN", cn_count, cn_bounds), ("AD", ad_count, ad_bounds)):
    print(f"{tag} outliers: {n_flagged} (bounds {fences})")

# The flagged values themselves, for closer inspection.
for tag, flagged in (("CN", cn_vals), ("AD", ad_vals)):
    print(f"{tag} outlier values:", flagged)
