In [None]:
### Parsing aseg.stats file and saving to CSV

import pandas as pd
from pathlib import Path

# Input stats file and the CSV that will be written from it
stats_file = Path(r"C:\Users\okkam\Desktop\aseg.stats")
csv_output = Path(r"C:\Users\okkam\Desktop\aseg_parsed.csv")

# Read the whole file into memory; header lines start with '#', table rows do not
with open(stats_file, 'r') as f:
    lines = f.readlines()

# Extract ICV (eTIV) from the first matching '# Measure' header line.
# The value is taken from the 4th comma-separated field of that line.
icv = None
for line in lines:
    if 'EstimatedTotalIntraCranialVol' in line and line.startswith('# Measure'):
        parts = line.split(',')
        if len(parts) >= 4:
            icv = float(parts[3].strip())
            break

# Find the line with column headers
header_line_idx = None
for i, line in enumerate(lines):
    if line.startswith('# ColHeaders'):
        header_line_idx = i
        break

# Fail with a clear message instead of an opaque "list indices must be integers"
# TypeError when the file has no column-header line.
if header_line_idx is None:
    raise ValueError(f"No '# ColHeaders' line found in {stats_file}")

# Extract column names from the header line
header_line = lines[header_line_idx].strip()
columns = header_line.replace('# ColHeaders ', '').split()

# Data rows are the non-comment, non-blank lines after the header line
data_start_idx = header_line_idx + 1

# Parse the data rows; rows whose token count differs from the header are skipped
data_rows = []
for line in lines[data_start_idx:]:
    if line.strip() and not line.startswith('#'):
        values = line.strip().split()
        if len(values) == len(columns):
            data_rows.append(values)

# Create DataFrame (all values are still strings at this point)
df = pd.DataFrame(data_rows, columns=columns)

# Add ICV as a constant column when it was found in the header
if icv is not None:
    df['ICV'] = icv

# Convert numeric columns to appropriate types; unparseable cells become NaN
numeric_cols = ['Index', 'SegId', 'NVoxels', 'Volume_mm3', 'normMean', 'normStdDev', 'normMin', 'normMax', 'normRange', 'ICV']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Save to CSV
df.to_csv(csv_output, index=False)

print(f"✓ Parsed aseg.stats file")
print(f"  Rows: {len(df)}")
print(f"  Columns: {', '.join(df.columns)}")
# icv stays None when the header has no eTIV line; formatting None with ':.1f'
# would raise TypeError, so report the absence instead.
if icv is not None:
    print(f"  ICV (eTIV): {icv:.1f} mm³")
else:
    print("  ICV (eTIV): not found in file header")
print(f"\n✓ Saved to: {csv_output}")
print(f"\nFirst few rows:")
print(df.head())

✓ Parsed aseg.stats file
  Rows: 45
  Columns: Index, SegId, NVoxels, Volume_mm3, StructName, normMean, normStdDev, normMin, normMax, normRange, ICV
  ICV (eTIV): 1349509.5 mm³

✓ Saved to: C:\Users\okkam\Desktop\aseg_parsed.csv

First few rows:
   Index  SegId  NVoxels  Volume_mm3                    StructName  normMean  \
0      1      4    10556     10709.2        Left-Lateral-Ventricle   21.1470   
1      2      5      302       319.3             Left-Inf-Lat-Vent   36.0099   
2      3      7    12870     13558.6  Left-Cerebellum-White-Matter   82.4620   
3      4      8    40401     39555.7        Left-Cerebellum-Cortex   54.1926   
4      5     10     6338      6206.1          Left-Thalamus-Proper   88.7248   

   normStdDev  normMin  normMax  normRange           ICV  
0     11.6668      0.0     75.0       75.0  1.349510e+06  
1     14.2427      6.0     77.0       71.0  1.349510e+06  
2      7.4717     13.0    109.0       96.0  1.349510e+06  
3     13.5459      0.0    118.0      

In [None]:
### Batch processing multiple aseg.stats files in a folder

import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Root folder that is searched recursively for aseg.stats files.
# NOTE(review): the variable is named t1_folder but the path ends in "T2" —
# confirm which timepoint is intended.
t1_folder = Path(r"D:\02-Raw_data-anat\longitudinal_freesurfer_149\longi_output\T2")

# Collect every file named aseg.stats anywhere below the root folder
aseg_files = list(t1_folder.rglob("aseg.stats"))
print(f"Found {len(aseg_files)} aseg.stats files\n")

# Function to parse aseg.stats file
def parse_aseg_stats(stats_file):
    """Read an aseg.stats file into a DataFrame.

    Args:
        stats_file: Path (or path string) of the stats file to read.

    Returns:
        A ``(df, icv)`` tuple. ``df`` holds one row per table line whose token
        count matches the '# ColHeaders' line, plus a constant 'ICV' column
        when an eTIV measure was found. ``icv`` is that value as a float, or
        None when absent. Returns ``(None, None)`` when the file has no
        '# ColHeaders' line.
    """
    with open(stats_file, 'r') as handle:
        lines = handle.readlines()

    # eTIV: first '# Measure' line naming EstimatedTotalIntraCranialVol that
    # has at least four comma-separated fields; the value is the fourth field.
    icv = None
    for raw in lines:
        if not raw.startswith('# Measure'):
            continue
        if 'EstimatedTotalIntraCranialVol' not in raw:
            continue
        fields = raw.split(',')
        if len(fields) < 4:
            continue
        icv = float(fields[3].strip())
        break

    # Position of the first column-header line, if any
    header_idx = next(
        (pos for pos, raw in enumerate(lines) if raw.startswith('# ColHeaders')),
        None,
    )
    if header_idx is None:
        return None, None

    columns = lines[header_idx].strip().replace('# ColHeaders ', '').split()
    width = len(columns)

    # Tokenise every non-blank, non-comment line after the header and keep
    # only the rows that have exactly one token per column.
    candidates = (
        raw.strip().split()
        for raw in lines[header_idx + 1:]
        if raw.strip() and not raw.startswith('#')
    )
    table = [tokens for tokens in candidates if len(tokens) == width]

    df = pd.DataFrame(table, columns=columns)
    if icv is not None:
        df['ICV'] = icv

    # Cast the known numeric columns; cells that fail to parse become NaN
    for name in ('Index', 'SegId', 'NVoxels', 'Volume_mm3', 'normMean',
                 'normStdDev', 'normMin', 'normMax', 'normRange', 'ICV'):
        if name in df.columns:
            df[name] = pd.to_numeric(df[name], errors='coerce')

    return df, icv

# Convert every discovered aseg.stats file, tallying successes and failures
processed = 0
errors = 0

for stats_file in tqdm(aseg_files, desc="Processing aseg.stats files"):
    try:
        df, icv = parse_aseg_stats(stats_file)

        # The parser signals "no column header found" by returning None
        if df is None:
            errors += 1
            print(f"✗ Failed to parse: {stats_file}")
            continue

        # Write the CSV next to the stats file it was parsed from
        csv_output = stats_file.with_name("aseg_parsed.csv")
        df.to_csv(csv_output, index=False)
        processed += 1
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest
        errors += 1
        print(f"✗ Error processing {stats_file}: {e}")

print(f"\n✓ Processing complete!")
print(f"  Successfully processed: {processed}")
print(f"  Errors: {errors}")

Found 65 aseg.stats files



Processing aseg.stats files: 100%|██████████| 65/65 [00:04<00:00, 13.75it/s]


✓ Processing complete!
  Successfully processed: 65
  Errors: 0





In [None]:
##### aparc processing

import pandas as pd
from pathlib import Path

# Left- and right-hemisphere aparc.stats files to parse; the parsed CSVs are
# later written into the same folder as each input file.
lh_file = Path(r"C:\Users\okkam\Desktop\lh.aparc.stats")
rh_file = Path(r"C:\Users\okkam\Desktop\rh.aparc.stats")

def parse_aparc_stats(stats_file):
    """Parse an aparc.stats file into a region table plus header measures.

    Args:
        stats_file: Path (or path string) of the stats file to read.

    Returns:
        A ``(df, measures)`` tuple. ``df`` has one row per table line and one
        column per name on the '# ColHeaders' line; every numeric header
        measure is also added as a constant column. ``measures`` maps the
        leading name of each '# Measure' line to its value (float when it
        parses as a number, otherwise the raw string). Returns
        ``(None, None)`` when the file has no '# ColHeaders' line.
    """
    with open(stats_file, 'r') as f:
        lines = f.readlines()

    # Extract header measures. The key is the text before the first comma and
    # the value is the 4th comma-separated field.
    # NOTE(review): lines that share the same leading name overwrite each
    # other in this dict — confirm whether the second field should be the key.
    measures = {}
    for line in lines:
        if line.startswith('# Measure'):
            parts = line.split(',')
            if len(parts) >= 4:
                measure_name = parts[0].replace('# Measure ', '').strip()
                measure_value = parts[3].strip()
                try:
                    measures[measure_name] = float(measure_value)
                except ValueError:
                    # Non-numeric measure: keep the raw text. Only ValueError
                    # is caught so unrelated errors are not swallowed.
                    measures[measure_name] = measure_value

    # Find the line with column headers
    col_header_line_idx = None
    for i, line in enumerate(lines):
        if line.startswith('# ColHeaders'):
            col_header_line_idx = i
            break

    if col_header_line_idx is None:
        return None, None

    # Extract column names
    header_line = lines[col_header_line_idx].strip()
    columns = header_line.replace('# ColHeaders ', '').split()
    n_cols = len(columns)

    # Data rows are the non-blank, non-comment lines after the header
    data_start_idx = col_header_line_idx + 1

    data_rows = []
    for line in lines[data_start_idx:]:
        if line.strip() and not line.startswith('#'):
            values = line.strip().split()
            if len(values) >= n_cols:
                # A structure name containing spaces produces extra leading
                # tokens. Join them back into the first column so the
                # remaining values stay aligned with their headers.
                n_extra = len(values) - n_cols
                struct_name = ' '.join(values[:n_extra + 1])
                data_rows.append([struct_name] + values[n_extra + 1:])

    # Create DataFrame (all values are still strings at this point)
    df = pd.DataFrame(data_rows, columns=columns)

    # Convert numeric columns; unparseable cells become NaN
    numeric_cols = ['NumVert', 'SurfArea', 'GrayVol', 'ThickAvg', 'ThickStd', 'MeanCurv', 'GausCurv', 'FoldInd', 'CurvInd']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Add numeric header measures as constant columns
    for measure_name, measure_value in measures.items():
        if isinstance(measure_value, (int, float)):
            df[measure_name] = measure_value

    return df, measures

# Parse the left hemisphere first, then the right
print("Parsing lh.aparc.stats...")
lh_df, lh_measures = parse_aparc_stats(lh_file)

print("Parsing rh.aparc.stats...")
rh_df, rh_measures = parse_aparc_stats(rh_file)

# Write each hemisphere that parsed successfully next to its source file and
# report the row count plus the first ten column names.
if lh_df is not None:
    lh_csv = lh_file.with_name("lh.aparc_parsed.csv")
    lh_df.to_csv(lh_csv, index=False)
    lh_preview = ', '.join(list(lh_df.columns)[:10])
    print(f"✓ Saved lh: {lh_csv}")
    print(f"  Regions: {len(lh_df)}")
    print(f"  Columns: {lh_preview}...")

if rh_df is not None:
    rh_csv = rh_file.with_name("rh.aparc_parsed.csv")
    rh_df.to_csv(rh_csv, index=False)
    rh_preview = ', '.join(list(rh_df.columns)[:10])
    print(f"✓ Saved rh: {rh_csv}")
    print(f"  Regions: {len(rh_df)}")
    print(f"  Columns: {rh_preview}...")

print("\n✓ Done! Both aparc files parsed and saved.")

Parsing lh.aparc.stats...
Parsing rh.aparc.stats...
✓ Saved lh: C:\Users\okkam\Desktop\lh.aparc_parsed.csv
  Regions: 34
  Columns: StructName, NumVert, SurfArea, GrayVol, ThickAvg, ThickStd, MeanCurv, GausCurv, FoldInd, CurvInd...
✓ Saved rh: C:\Users\okkam\Desktop\rh.aparc_parsed.csv
  Regions: 34
  Columns: StructName, NumVert, SurfArea, GrayVol, ThickAvg, ThickStd, MeanCurv, GausCurv, FoldInd, CurvInd...

✓ Done! Both aparc files parsed and saved.


In [5]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Root folder that is searched recursively for aparc.stats files.
# NOTE(review): the variable is named t1_folder but the path ends in "T2" —
# confirm which timepoint is intended.
t1_folder = Path(r"D:\02-Raw_data-anat\longitudinal_freesurfer_149\longi_output\T2")

# Collect the left- and right-hemisphere stats files separately
lh_aparc_files = list(t1_folder.rglob("lh.aparc.stats"))
rh_aparc_files = list(t1_folder.rglob("rh.aparc.stats"))

# Report how many files of each hemisphere were found
print(f"Found {len(lh_aparc_files)} lh.aparc.stats files")
print(f"Found {len(rh_aparc_files)} rh.aparc.stats files\n")

def parse_aparc_stats(stats_file):
    """Parse an aparc.stats file into a region table plus header measures.

    Args:
        stats_file: Path (or path string) of the stats file to read.

    Returns:
        A ``(df, measures)`` tuple. ``df`` has one row per table line and one
        column per name on the '# ColHeaders' line; every numeric header
        measure is also added as a constant column. ``measures`` maps the
        leading name of each '# Measure' line to its value (float when it
        parses as a number, otherwise the raw string). Returns
        ``(None, None)`` when the file has no '# ColHeaders' line.
    """
    with open(stats_file, 'r') as f:
        lines = f.readlines()

    # Extract header measures. The key is the text before the first comma and
    # the value is the 4th comma-separated field.
    # NOTE(review): lines that share the same leading name overwrite each
    # other in this dict — confirm whether the second field should be the key.
    measures = {}
    for line in lines:
        if line.startswith('# Measure'):
            parts = line.split(',')
            if len(parts) >= 4:
                measure_name = parts[0].replace('# Measure ', '').strip()
                measure_value = parts[3].strip()
                try:
                    measures[measure_name] = float(measure_value)
                except ValueError:
                    # Non-numeric measure: keep the raw text. Only ValueError
                    # is caught so unrelated errors are not swallowed.
                    measures[measure_name] = measure_value

    # Find the line with column headers
    col_header_line_idx = None
    for i, line in enumerate(lines):
        if line.startswith('# ColHeaders'):
            col_header_line_idx = i
            break

    if col_header_line_idx is None:
        return None, None

    # Extract column names
    header_line = lines[col_header_line_idx].strip()
    columns = header_line.replace('# ColHeaders ', '').split()
    n_cols = len(columns)

    # Data rows are the non-blank, non-comment lines after the header
    data_start_idx = col_header_line_idx + 1

    data_rows = []
    for line in lines[data_start_idx:]:
        if line.strip() and not line.startswith('#'):
            values = line.strip().split()
            if len(values) >= n_cols:
                # Extra leading tokens are treated as part of a structure name
                # that contains spaces, so they are joined back into the first
                # column and the remaining values stay aligned with headers.
                n_extra = len(values) - n_cols
                struct_name = ' '.join(values[:n_extra + 1])
                data_rows.append([struct_name] + values[n_extra + 1:])

    # Create DataFrame (all values are still strings at this point)
    df = pd.DataFrame(data_rows, columns=columns)

    # Convert numeric columns; unparseable cells become NaN
    numeric_cols = ['NumVert', 'SurfArea', 'GrayVol', 'ThickAvg', 'ThickStd', 'MeanCurv', 'GausCurv', 'FoldInd', 'CurvInd']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Add numeric header measures as constant columns
    for measure_name, measure_value in measures.items():
        if isinstance(measure_value, (int, float)):
            df[measure_name] = measure_value

    return df, measures

# Convert every discovered aparc.stats file (left hemispheres first, then
# right), tallying successes and failures
processed = 0
errors = 0
all_aparc_files = lh_aparc_files + rh_aparc_files

for aparc_file in tqdm(all_aparc_files, desc="Processing aparc.stats files"):
    try:
        df, measures = parse_aparc_stats(aparc_file)

        # The parser signals "no column header found" by returning None
        if df is None:
            errors += 1
            print(f"✗ Failed to parse: {aparc_file}")
            continue

        # Output name follows the hemisphere prefix of the input file name;
        # anything that is not an lh file is written as rh.
        hemi = "lh" if 'lh.aparc' in aparc_file.name else "rh"
        csv_output = aparc_file.parent / f"{hemi}.aparc_parsed.csv"

        df.to_csv(csv_output, index=False)
        processed += 1
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest
        errors += 1
        print(f"✗ Error processing {aparc_file}: {e}")

print(f"\n✓ Processing complete!")
print(f"  Successfully processed: {processed}")
print(f"  Errors: {errors}")

Found 65 lh.aparc.stats files
Found 65 rh.aparc.stats files



Processing aparc.stats files: 100%|██████████| 130/130 [00:08<00:00, 14.74it/s]


✓ Processing complete!
  Successfully processed: 130
  Errors: 0





In [None]:
### Combine all volume (mm3) data into one CSV
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import re

# Root folder searched for parsed stats CSVs.
# NOTE(review): named t1_folder, but the path ends in "T2" — confirm.
t1_folder = Path(r"D:\02-Raw_data-anat\longitudinal_freesurfer_149\longi_output\T2")

# Every path named "stats" below the root is a candidate stats folder
participant_stats_dirs = list(t1_folder.rglob("stats"))

# A participant folder (the parent of "stats") is kept only when all three
# parsed CSVs are present; existence is checked in aseg, lh, rh order.
required_csvs = ("aseg_parsed.csv", "lh.aparc_parsed.csv", "rh.aparc_parsed.csv")
participant_dirs = [
    stats_dir.parent
    for stats_dir in participant_stats_dirs
    if all((stats_dir / csv_name).exists() for csv_name in required_csvs)
]

print(f"Found {len(participant_dirs)} participants with complete data\n")

# Function to extract participant ID and visit from folder name
def extract_participant_info(folder_name):
    """Pull the participant ID and visit label out of a folder name.

    Example folder names:
        3002498_irm_t04-3192767.long.Base-3192838-1
        8193847_irm_t00-3192713.long.8193847_base-3194677-1

    Args:
        folder_name: Name of the participant folder.

    Returns:
        ``(participant_id, visit)`` as strings; either part is "unknown"
        when its pattern is not found.
    """
    # ID: the run of digits at the very start, followed by an underscore
    id_hit = re.search(r'^(\d+)_', folder_name)
    # Visit: the "t<digits>" token that follows "_irm_" (t00, t02, t04, ...)
    visit_hit = re.search(r'_irm_(t\d+)', folder_name)

    if id_hit:
        participant_id = id_hit.group(1)
    else:
        participant_id = "unknown"

    if visit_hit:
        visit = visit_hit.group(1)
    else:
        visit = "unknown"

    return participant_id, visit

# Function to flatten aseg data - keep only Volume_mm3 and ICV
def flatten_aseg(aseg_df):
    """Flatten an aseg table into a single dict of per-structure volumes.

    Args:
        aseg_df: DataFrame with a 'StructName' column and, optionally,
            'Volume_mm3' and 'ICV' columns.

    Returns:
        Dict with one ``aseg_<StructName>_Volume_mm3`` entry per row (dashes
        and spaces in the name replaced by underscores) when 'Volume_mm3' is
        present, plus a single ``aseg_ICV`` entry taken from the first row
        when 'ICV' is present.
    """
    flattened = {}
    has_volume = 'Volume_mm3' in aseg_df.columns
    has_icv = 'ICV' in aseg_df.columns

    # Track the row *position* rather than the index label. Comparing the
    # label to 0 silently dropped ICV for frames whose index does not contain
    # 0, such as a filtered or re-indexed frame.
    for position, (_, row) in enumerate(aseg_df.iterrows()):
        struct_name = row['StructName'].replace('-', '_').replace(' ', '_')

        # Keep only Volume_mm3
        if has_volume:
            flattened[f"aseg_{struct_name}_Volume_mm3"] = row['Volume_mm3']

        # Add ICV once, right after the first structure, so the key order
        # matches what the default 0-based index produced before.
        if position == 0 and has_icv:
            flattened['aseg_ICV'] = row['ICV']

    return flattened

# Function to flatten aparc data - keep only GrayVol
def flatten_aparc(aparc_df, hemisphere):
    """Flatten an aparc table into a dict of per-region GrayVol values.

    Args:
        aparc_df: DataFrame with a 'StructName' column and, optionally, a
            'GrayVol' column.
        hemisphere: Prefix used in the generated keys (e.g. 'lh' or 'rh').

    Returns:
        Dict with one ``<hemisphere>_<StructName>_GrayVol`` entry per row
        (dashes and spaces in the name replaced by underscores); empty when
        the frame has no 'GrayVol' column.
    """
    keep_gray_vol = 'GrayVol' in aparc_df.columns
    volumes = {}

    for _, record in aparc_df.iterrows():
        # Normalise the region name so it is safe to use inside a column name
        safe_name = record['StructName'].replace('-', '_').replace(' ', '_')
        if keep_gray_vol:
            volumes[f"{hemisphere}_{safe_name}_GrayVol"] = record['GrayVol']

    return volumes

# Build one wide record (dict) per participant folder
all_data = []
errors = 0

for participant_dir in tqdm(participant_dirs, desc="Processing participants"):
    try:
        participant_id, visit = extract_participant_info(participant_dir.name)
        stats_folder = participant_dir / "stats"

        # Load the three parsed tables for this participant
        aseg_df = pd.read_csv(stats_folder / "aseg_parsed.csv")
        lh_aparc_df = pd.read_csv(stats_folder / "lh.aparc_parsed.csv")
        rh_aparc_df = pd.read_csv(stats_folder / "rh.aparc_parsed.csv")

        # Identification columns come first, then aseg volumes (+ ICV),
        # then left- and right-hemisphere GrayVol values, in that order.
        row_data = {'participant_id': participant_id, 'visit': visit}
        for flat_block in (
            flatten_aseg(aseg_df),
            flatten_aparc(lh_aparc_df, 'lh'),
            flatten_aparc(rh_aparc_df, 'rh'),
        ):
            row_data.update(flat_block)

        all_data.append(row_data)

    except Exception as e:
        # Best effort: report the failing folder and carry on with the rest
        errors += 1
        print(f"✗ Error processing {participant_dir.name}: {e}")

# Assemble the records into one table, write it, and print a summary
if not all_data:
    print("✗ No data to combine")
else:
    combined_df = pd.DataFrame(all_data)

    # The combined CSV is written into the root folder itself
    output_path = t1_folder / "all_participants_freesurfer_data.csv"
    combined_df.to_csv(output_path, index=False)

    print(f"\n✓ Combined data saved!")
    print(f"  Participants: {len(combined_df)}")
    print(f"  Total Variables: {len(combined_df.columns)}")
    print(f"  Output: {output_path}")
    # The per-group counts below are fixed text, not computed from the data
    print(f"\nVariable breakdown:")
    print(f"  - Base columns (participant_id, visit): 2")
    print(f"  - ASEG volumes (45 structures × Volume_mm3 + ICV): 46")
    print(f"  - LH APARC volumes (34 regions × GrayVol): 34")
    print(f"  - RH APARC volumes (34 regions × GrayVol): 34")
    print(f"  - TOTAL: {len(combined_df.columns)}")
    print(f"\nSample columns:")
    print(combined_df.columns.tolist()[:20])
    print(f"\nFirst few rows:")
    print(combined_df.iloc[:3, :6])
    print(f"\nErrors: {errors}")

Found 65 participants with complete data



Processing participants: 100%|██████████| 65/65 [00:02<00:00, 22.43it/s]


✓ Combined data saved!
  Participants: 65
  Total Variables: 116
  Output: D:\02-Raw_data-anat\longitudinal_freesurfer_149\longi_output\T2\all_participants_freesurfer_data.csv

Variable breakdown:
  - Base columns (participant_id, visit): 2
  - ASEG volumes (45 structures × Volume_mm3 + ICV): 46
  - LH APARC volumes (34 regions × GrayVol): 34
  - RH APARC volumes (34 regions × GrayVol): 34
  - TOTAL: 116

Sample columns:
['participant_id', 'visit', 'aseg_Left_Lateral_Ventricle_Volume_mm3', 'aseg_ICV', 'aseg_Left_Inf_Lat_Vent_Volume_mm3', 'aseg_Left_Cerebellum_White_Matter_Volume_mm3', 'aseg_Left_Cerebellum_Cortex_Volume_mm3', 'aseg_Left_Thalamus_Proper_Volume_mm3', 'aseg_Left_Caudate_Volume_mm3', 'aseg_Left_Putamen_Volume_mm3', 'aseg_Left_Pallidum_Volume_mm3', 'aseg_3rd_Ventricle_Volume_mm3', 'aseg_4th_Ventricle_Volume_mm3', 'aseg_Brain_Stem_Volume_mm3', 'aseg_Left_Hippocampus_Volume_mm3', 'aseg_Left_Amygdala_Volume_mm3', 'aseg_CSF_Volume_mm3', 'aseg_Left_Accumbens_area_Volume_mm3', '


