In [None]:
import pandas as pd

# Read the sample file
file_path = r"D:\01-Raw_data-spectro\Precuneus\3100205_255881_V17S\first_run_data\fit_tissue_adjusted\summary.csv"

# Load and display the data
print("=" * 80)
print("SAMPLE FILE STRUCTURE")
print("=" * 80)
print(f"\nFile: {file_path}\n")

# Read the CSV
df = pd.read_csv(file_path)

# Display basic info
print(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns\n")

print("Column names:")
print(df.columns.tolist())
print("\n")

print("First few rows:")
print(df.head(10))
print("\n")

print("Data types:")
print(df.dtypes)
print("\n")

print("Full data:")
print(df)



In [None]:
"""
Parse glutamate spectroscopy data from Precuneus folder
Creates one output file with all participants and their metabolite measurements
"""

import pandas as pd
import os
from pathlib import Path

# Configuration
PRECUNEUS_FOLDER = r"D:\01-Raw_data-spectro\Precuneus"
OUTPUT_FILE = r"c:\Users\okkam\Documents\GitHub\glutamate_ad_longitudinal_sourcecode\precuneus_metabolite_data.csv"

# Metabolites to extract (in order)
METABOLITES = ['Glu', 'GABA', 'Gly', 'NAA', 'Cr', 'Cr+PCr', 'GPC', 'PCh', 'mI']

# Columns to extract for each metabolite (in order)
COLUMNS = ['mM', '/Cr+PCr', '%CRLB', 'SNR', 'FWHM']


def extract_participant_data(summary_file_path, participant_id):
    """
    Extract metabolite data from a single participant's summary.csv file
    
    Args:
        summary_file_path: Path to the summary.csv file
        participant_id: Participant folder name (e.g., 3100205_255881_V17S)
    
    Returns:
        Dictionary with participant data
    """
    # Read the summary file
    df = pd.read_csv(summary_file_path)
    
    # Initialize result dictionary with participant ID
    result = {'Participant_ID': participant_id}
    
    # Extract data for each metabolite
    for metabolite in METABOLITES:
        # Find the row for this metabolite
        metabolite_row = df[df['Metab'] == metabolite]
        
        if len(metabolite_row) == 0:
            # Metabolite not found - fill with NaN
            for column in COLUMNS:
                result[f'{metabolite}_{column}'] = None
        else:
            # Extract each column value
            for column in COLUMNS:
                result[f'{metabolite}_{column}'] = metabolite_row[column].values[0]
    
    return result


def parse_all_participants():
    """
    Parse all participants in the Precuneus folder
    
    Returns:
        DataFrame with all participants' data
    """
    all_data = []
    
    # Get all participant folders in Precuneus
    precuneus_path = Path(PRECUNEUS_FOLDER)
    
    if not precuneus_path.exists():
        print(f"ERROR: Precuneus folder not found at {PRECUNEUS_FOLDER}")
        return None
    
    # Iterate through all participant folders
    participant_folders = [f for f in precuneus_path.iterdir() if f.is_dir()]
    
    print(f"Found {len(participant_folders)} participant folders")
    print("Processing participants...\n")
    
    for participant_folder in participant_folders:
        participant_id = participant_folder.name
        
        # Construct path to summary.csv
        summary_path = participant_folder / "first_run_data" / "fit_tissue_adjusted" / "summary.csv"
        
        if summary_path.exists():
            try:
                # Extract data for this participant
                participant_data = extract_participant_data(summary_path, participant_id)
                all_data.append(participant_data)
                print(f"✓ Processed: {participant_id}")
            except Exception as e:
                print(f"✗ Error processing {participant_id}: {e}")
        else:
            print(f"✗ Summary file not found for {participant_id}")
    
    # Convert to DataFrame
    if len(all_data) > 0:
        df = pd.DataFrame(all_data)
        print(f"\n{'='*80}")
        print(f"Successfully processed {len(all_data)} participants")
        print(f"{'='*80}")
        return df
    else:
        print("No data was extracted!")
        return None


def main():
    """Main function to run the parsing"""
    print("="*80)
    print("PRECUNEUS METABOLITE DATA PARSER")
    print("="*80)
    print(f"\nSource folder: {PRECUNEUS_FOLDER}")
    print(f"Output file: {OUTPUT_FILE}")
    print(f"\nMetabolites: {', '.join(METABOLITES)}")
    print(f"Columns per metabolite: {', '.join(COLUMNS)}")
    print("="*80)
    print()
    
    # Parse all participants
    df = parse_all_participants()
    
    if df is not None:
        # Save to CSV
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"\n✓ Data saved to: {OUTPUT_FILE}")
        print(f"\nOutput shape: {df.shape[0]} rows × {df.shape[1]} columns")
        print("\nFirst few rows:")
        print(df.head())
    else:
        print("\n✗ No output file created")


if __name__ == "__main__":
    main()


In [2]:
"""
Parse glutamate spectroscopy data from ACC folder
Creates one output file with all participants and their metabolite measurements
"""

import pandas as pd
import os
from pathlib import Path

# Configuration
PRECUNEUS_FOLDER = r"D:\01-Raw_data-spectro\ACC"
OUTPUT_FILE = r"c:\Users\okkam\Documents\GitHub\glutamate_ad_longitudinal_sourcecode\acc_metabolite_data.csv"

# Metabolites to extract (in order)
METABOLITES = ['Glu', 'GABA', 'Gly', 'NAA', 'Cr', 'Cr+PCr', 'GPC', 'PCh', 'mI']

# Columns to extract for each metabolite (in order)
COLUMNS = ['mM', '/Cr+PCr', '%CRLB', 'SNR', 'FWHM']


def extract_participant_data(summary_file_path, participant_id):
    """
    Extract metabolite data from a single participant's summary.csv file
    
    Args:
        summary_file_path: Path to the summary.csv file
        participant_id: Participant folder name (e.g., 3100205_255881_V17S)
    
    Returns:
        Dictionary with participant data
    """
    # Read the summary file
    df = pd.read_csv(summary_file_path)
    
    # Initialize result dictionary with participant ID
    result = {'Participant_ID': participant_id}
    
    # Extract data for each metabolite
    for metabolite in METABOLITES:
        # Find the row for this metabolite
        metabolite_row = df[df['Metab'] == metabolite]
        
        if len(metabolite_row) == 0:
            # Metabolite not found - fill with NaN
            for column in COLUMNS:
                result[f'{metabolite}_{column}'] = None
        else:
            # Extract each column value
            for column in COLUMNS:
                result[f'{metabolite}_{column}'] = metabolite_row[column].values[0]
    
    return result


def parse_all_participants():
    """
    Parse all participants in the ACC folder
    
    Returns:
        DataFrame with all participants' data
    """
    all_data = []
    
    # Get all participant folders in ACC
    precuneus_path = Path(PRECUNEUS_FOLDER)
    
    if not precuneus_path.exists():
        print(f"ERROR: ACC folder not found at {PRECUNEUS_FOLDER}")
        return None
    
    # Iterate through all participant folders
    participant_folders = [f for f in precuneus_path.iterdir() if f.is_dir()]
    
    print(f"Found {len(participant_folders)} participant folders")
    print("Processing participants...\n")
    
    for participant_folder in participant_folders:
        participant_id = participant_folder.name
        
        # Construct path to summary.csv
        summary_path = participant_folder / "first_run_data" / "fit_tissue_adjusted" / "summary.csv"
        
        if summary_path.exists():
            try:
                # Extract data for this participant
                participant_data = extract_participant_data(summary_path, participant_id)
                all_data.append(participant_data)
                print(f"✓ Processed: {participant_id}")
            except Exception as e:
                print(f"✗ Error processing {participant_id}: {e}")
        else:
            print(f"✗ Summary file not found for {participant_id}")
    
    # Convert to DataFrame
    if len(all_data) > 0:
        df = pd.DataFrame(all_data)
        print(f"\n{'='*80}")
        print(f"Successfully processed {len(all_data)} participants")
        print(f"{'='*80}")
        return df
    else:
        print("No data was extracted!")
        return None


def main():
    """Main function to run the parsing"""
    print("="*80)
    print("ACC METABOLITE DATA PARSER")
    print("="*80)
    print(f"\nSource folder: {PRECUNEUS_FOLDER}")
    print(f"Output file: {OUTPUT_FILE}")
    print(f"\nMetabolites: {', '.join(METABOLITES)}")
    print(f"Columns per metabolite: {', '.join(COLUMNS)}")
    print("="*80)
    print()
    
    # Parse all participants
    df = parse_all_participants()
    
    if df is not None:
        # Save to CSV
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"\n✓ Data saved to: {OUTPUT_FILE}")
        print(f"\nOutput shape: {df.shape[0]} rows × {df.shape[1]} columns")
        print("\nFirst few rows:")
        print(df.head())
    else:
        print("\n✗ No output file created")


if __name__ == "__main__":
    main()


ACC METABOLITE DATA PARSER

Source folder: D:\01-Raw_data-spectro\ACC
Output file: c:\Users\okkam\Documents\GitHub\glutamate_ad_longitudinal_sourcecode\acc_metabolite_data.csv

Metabolites: Glu, GABA, Gly, NAA, Cr, Cr+PCr, GPC, PCh, mI
Columns per metabolite: mM, /Cr+PCr, %CRLB, SNR, FWHM

Found 148 participant folders
Processing participants...

✓ Processed: 3420680_878356_V31S
✓ Processed: 3025432_658178_V31S
✓ Processed: 7459501_502616_V31S
✓ Processed: 5237572_187625_V24S
✓ Processed: 5381177_938001_V24S
✓ Processed: 8327387_549614_V03S
✓ Processed: 4088305_183039_V03S
✓ Processed: 9270946_891932_V03S
✓ Processed: 3123186_920577_V10S
✓ Processed: 9889496_847696_V03S
✓ Processed: 7573547_561120_V14
✓ Processed: 6477990_167431_V10S
✓ Processed: 9230354_537666_V03S
✓ Processed: 4694616_147863_V17S
✓ Processed: 4971119_675481_V14
✓ Processed: 8874623_974246_V24S
✓ Processed: 8060583_785217_V24S
✓ Processed: 3634853_127216_V03S
✓ Processed: 8210381_921495_V14
✓ Processed: 5199730_151599

In [3]:
"""
Match ACC and Precuneus metabolite data to reference participant list
Handles longitudinal data with different visit codes (e.g., _V17S, _V24S)
"""

import pandas as pd
import re

# File paths
REFERENCE_FILE = r"C:\Users\okkam\Desktop\labo\article 2\Glu\QC\participants reference (Samira).xlsx"
ACC_FILE = r"C:\Users\okkam\Desktop\labo\article 2\Glu\QC\acc_metabolite_data_2026-01-18.csv"
PRECUNEUS_FILE = r"C:\Users\okkam\Desktop\labo\article 2\Glu\QC\precuneus_metabolite_data_2026-01-18.csv"

# Output files
ACC_OUTPUT = r"c:\Users\okkam\Documents\GitHub\glutamate_ad_longitudinal_sourcecode\acc_metabolite_data_matched.csv"
PRECUNEUS_OUTPUT = r"c:\Users\okkam\Documents\GitHub\glutamate_ad_longitudinal_sourcecode\precuneus_metabolite_data_matched.csv"

print("="*80)
print("MATCHING METABOLITE DATA TO REFERENCE PARTICIPANTS")
print("="*80)

# Read reference participants
print(f"\nReading reference file: {REFERENCE_FILE}")
ref_df = pd.read_excel(REFERENCE_FILE)
print(f"Reference file shape: {ref_df.shape}")
print(f"Columns: {ref_df.columns.tolist()}\n")

# Display first few rows to see the structure
print("First few rows of reference file:")
print(ref_df.head())
print()

# Read ACC data
print(f"Reading ACC data: {ACC_FILE}")
acc_df = pd.read_csv(ACC_FILE)
print(f"ACC data shape: {acc_df.shape}")
print(f"Number of participants: {len(acc_df)}\n")

# Read Precuneus data
print(f"Reading Precuneus data: {PRECUNEUS_FILE}")
precuneus_df = pd.read_csv(PRECUNEUS_FILE)
print(f"Precuneus data shape: {precuneus_df.shape}")
print(f"Number of participants: {len(precuneus_df)}\n")

print("="*80)
print("Sample Participant IDs from metabolite data:")
print("ACC:", acc_df['Participant_ID'].head(3).tolist())
print("Precuneus:", precuneus_df['Participant_ID'].head(3).tolist())
print("="*80)

MATCHING METABOLITE DATA TO REFERENCE PARTICIPANTS

Reading reference file: C:\Users\okkam\Desktop\labo\article 2\Glu\QC\participants reference (Samira).xlsx
Reference file shape: (74, 2)
Columns: ['PSCID_CandID', 'PSCID']

First few rows of reference file:
     PSCID_CandID    PSCID
0  3002498_327986  3002498
1  3025432_658178  3025432
2  3100205_255881  3100205
3  3123186_920577  3123186
4  3149469_790489  3149469

Reading ACC data: C:\Users\okkam\Desktop\labo\article 2\Glu\QC\acc_metabolite_data_2026-01-18.csv
ACC data shape: (147, 46)
Number of participants: 147

Reading Precuneus data: C:\Users\okkam\Desktop\labo\article 2\Glu\QC\precuneus_metabolite_data_2026-01-18.csv
Precuneus data shape: (148, 46)
Number of participants: 148

Sample Participant IDs from metabolite data:
ACC: ['3420680_878356_V31S', '3025432_658178_V31S', '7459501_502616_V31S']
Precuneus: ['4088305_183039_V03S', '9270946_891932_V03S', '7725018_862524_V14']


In [4]:
"""
Extract base participant ID (PSCID_CandID) from full ID with visit code
"""

def extract_base_id(full_id):
    """
    Extract base participant ID from full ID with visit code
    Example: '3002498_327986_V31S' -> '3002498_327986'
    """
    # Split by underscore and take first two parts (PSCID_CandID)
    parts = full_id.split('_')
    if len(parts) >= 2:
        return f"{parts[0]}_{parts[1]}"
    return full_id

# Add base ID column to both datasets
print("\nExtracting base participant IDs...")
acc_df['Base_ID'] = acc_df['Participant_ID'].apply(extract_base_id)
precuneus_df['Base_ID'] = precuneus_df['Participant_ID'].apply(extract_base_id)

# Get reference participant IDs
reference_ids = set(ref_df['PSCID_CandID'].tolist())
print(f"Number of reference participants: {len(reference_ids)}")

# Filter ACC data to only reference participants
acc_matched = acc_df[acc_df['Base_ID'].isin(reference_ids)].copy()
print(f"\nACC - Matched participants: {len(acc_matched)}")
print(f"ACC - Unique base IDs: {acc_matched['Base_ID'].nunique()}")

# Filter Precuneus data to only reference participants
precuneus_matched = precuneus_df[precuneus_df['Base_ID'].isin(reference_ids)].copy()
print(f"\nPrecuneus - Matched participants: {len(precuneus_matched)}")
print(f"Precuneus - Unique base IDs: {precuneus_matched['Base_ID'].nunique()}")

# Save matched data
acc_matched.to_csv(ACC_OUTPUT, index=False)
precuneus_matched.to_csv(PRECUNEUS_OUTPUT, index=False)

print(f"\n{'='*80}")
print("RESULTS SAVED")
print(f"{'='*80}")
print(f"ACC matched data: {ACC_OUTPUT}")
print(f"Precuneus matched data: {PRECUNEUS_OUTPUT}")

# Show some statistics
print(f"\n{'='*80}")
print("VISIT DISTRIBUTION")
print(f"{'='*80}")

# Extract visit codes from Participant_ID
acc_matched['Visit'] = acc_matched['Participant_ID'].str.extract(r'_(V\d+\w*)$')[0]
precuneus_matched['Visit'] = precuneus_matched['Participant_ID'].str.extract(r'_(V\d+\w*)$')[0]

print("\nACC visits:")
print(acc_matched['Visit'].value_counts().sort_index())

print("\nPrecuneus visits:")
print(precuneus_matched['Visit'].value_counts().sort_index())

# Show matched participants preview
print(f"\n{'='*80}")
print("SAMPLE OF MATCHED ACC DATA")
print(f"{'='*80}")
print(acc_matched[['Participant_ID', 'Base_ID', 'Visit']].head(10))

print(f"\n{'='*80}")
print("SAMPLE OF MATCHED PRECUNEUS DATA")
print(f"{'='*80}")
print(precuneus_matched[['Participant_ID', 'Base_ID', 'Visit']].head(10))


Extracting base participant IDs...
Number of reference participants: 74

ACC - Matched participants: 110
ACC - Unique base IDs: 73

Precuneus - Matched participants: 109
Precuneus - Unique base IDs: 72

RESULTS SAVED
ACC matched data: c:\Users\okkam\Documents\GitHub\glutamate_ad_longitudinal_sourcecode\acc_metabolite_data_matched.csv
Precuneus matched data: c:\Users\okkam\Documents\GitHub\glutamate_ad_longitudinal_sourcecode\precuneus_metabolite_data_matched.csv

VISIT DISTRIBUTION

ACC visits:
Visit
V03S    13
V10S    24
V14     13
V17S    27
V24S    25
V31S     8
Name: count, dtype: int64

Precuneus visits:
Visit
V03S    13
V10S    23
V14     13
V17S    27
V24S    25
V31S     8
Name: count, dtype: int64

SAMPLE OF MATCHED ACC DATA
         Participant_ID         Base_ID Visit
1   3025432_658178_V31S  3025432_658178  V31S
2   7459501_502616_V31S  7459501_502616  V31S
3   5237572_187625_V24S  5237572_187625  V24S
4   5381177_938001_V24S  5381177_938001  V24S
8   3123186_920577_V10S  3

In [5]:
"""
Check which reference participants are missing from ACC and Precuneus data
"""

# Get sets of matched participant IDs
acc_matched_ids = set(acc_matched['Base_ID'].unique())
precuneus_matched_ids = set(precuneus_matched['Base_ID'].unique())

# Find missing participants
missing_from_acc = reference_ids - acc_matched_ids
missing_from_precuneus = reference_ids - precuneus_matched_ids
missing_from_both = missing_from_acc & missing_from_precuneus

print("="*80)
print("MISSING PARTICIPANTS ANALYSIS")
print("="*80)

print(f"\nTotal reference participants: {len(reference_ids)}")
print(f"ACC matched: {len(acc_matched_ids)}")
print(f"Precuneus matched: {len(precuneus_matched_ids)}")

print(f"\n{'='*80}")
print(f"Missing from ACC ({len(missing_from_acc)} participants):")
print(f"{'='*80}")
if missing_from_acc:
    for participant in sorted(missing_from_acc):
        print(f"  - {participant}")
else:
    print("  None - all reference participants found in ACC data!")

print(f"\n{'='*80}")
print(f"Missing from Precuneus ({len(missing_from_precuneus)} participants):")
print(f"{'='*80}")
if missing_from_precuneus:
    for participant in sorted(missing_from_precuneus):
        print(f"  - {participant}")
else:
    print("  None - all reference participants found in Precuneus data!")

print(f"\n{'='*80}")
print(f"Missing from BOTH datasets ({len(missing_from_both)} participants):")
print(f"{'='*80}")
if missing_from_both:
    for participant in sorted(missing_from_both):
        print(f"  - {participant}")
else:
    print("  None - every participant appears in at least one dataset!")

# Show which participants are in only one dataset
only_in_acc = acc_matched_ids - precuneus_matched_ids
only_in_precuneus = precuneus_matched_ids - acc_matched_ids

print(f"\n{'='*80}")
print(f"Only in ACC, not in Precuneus ({len(only_in_acc)} participants):")
print(f"{'='*80}")
if only_in_acc:
    for participant in sorted(only_in_acc):
        print(f"  - {participant}")
else:
    print("  None")

print(f"\n{'='*80}")
print(f"Only in Precuneus, not in ACC ({len(only_in_precuneus)} participants):")
print(f"{'='*80}")
if only_in_precuneus:
    for participant in sorted(only_in_precuneus):
        print(f"  - {participant}")
else:
    print("  None")

# Participants in both datasets
in_both = acc_matched_ids & precuneus_matched_ids
print(f"\n{'='*80}")
print(f"In BOTH ACC and Precuneus ({len(in_both)} participants):")
print(f"{'='*80}")

MISSING PARTICIPANTS ANALYSIS

Total reference participants: 74
ACC matched: 73
Precuneus matched: 72

Missing from ACC (1 participants):
  - 6417837_396250

Missing from Precuneus (2 participants):
  - 6371164_500741
  - 6417837_396250

Missing from BOTH datasets (1 participants):
  - 6417837_396250

Only in ACC, not in Precuneus (1 participants):
  - 6371164_500741

Only in Precuneus, not in ACC (0 participants):
  None

In BOTH ACC and Precuneus (72 participants):


In [6]:
"""
Match Arsenii's 107 participant dataset with T1_Glu_38 dataset
"""

import pandas as pd

# File paths
ARSENII_FILE = r"C:\Users\okkam\Desktop\labo\article 2\Glu\QC\databases_previous\import_data_arsenii_20250609_hippR_article1.xlsx"
T1_GLU_FILE = r"C:\Users\okkam\Desktop\labo\article 2\Glu\QC\T1_Glu_38.xlsx"

print("="*80)
print("MATCHING ARSENII DATASET WITH T1_GLU_38")
print("="*80)

# Read Arsenii's dataset
print(f"\nReading Arsenii dataset: {ARSENII_FILE}")
arsenii_df = pd.read_excel(ARSENII_FILE)
print(f"Shape: {arsenii_df.shape}")
print(f"Columns: {arsenii_df.columns.tolist()}\n")

print("First few rows:")
print(arsenii_df.head())
print()

# Read T1_Glu_38 dataset
print(f"\nReading T1_Glu_38 dataset: {T1_GLU_FILE}")
t1_glu_df = pd.read_excel(T1_GLU_FILE)
print(f"Shape: {t1_glu_df.shape}")
print(f"Columns: {t1_glu_df.columns.tolist()}\n")

print("First few rows:")
print(t1_glu_df.head())
print()

print("="*80)

MATCHING ARSENII DATASET WITH T1_GLU_38

Reading Arsenii dataset: C:\Users\okkam\Desktop\labo\article 2\Glu\QC\databases_previous\import_data_arsenii_20250609_hippR_article1.xlsx
Shape: (107, 20)
Columns: ['PSCID_CandID', 'diagnostic_Nick', 'sex', 'Education', 'Age_spectro', 'mM (Precuneus)', 'mM (ACC)', 'MOCA_corr_spectro', 'memoria_libre_correcte', 'face_name_rappel_differe_spectro', 'Hip_L_norICV', 'cortical thickness (AdsignatureDickson)', 'dprime(Hit-FA)', 'Associative memory performance', 'Associative memory performance-sylvie', 'activation_Hippocampus_L', 'activation_Hippocampus_R', 'activation_Parietal_Sup_L', 'activation_Temporal_Inf_R', 'RighHipVol']

First few rows:
        PSCID_CandID diagnostic_Nick sex  Education  Age_spectro  \
0     3002498_327986              HC   F         16         72.6   
1  3025432_658178_T4              HC   F         12         75.5   
2     3100205_255881             MCI   H         10         73.6   
3     3123186_920577            SCD+   F  

In [7]:
"""
Extract base IDs and match the datasets
"""

# Extract base ID from T1_Glu_38 participant IDs
t1_glu_df['Base_ID'] = t1_glu_df['Participant_ID'].apply(extract_base_id)

print("Sample T1_Glu_38 IDs:")
print(t1_glu_df[['Participant_ID', 'Base_ID']].head())
print()

# Get set of base IDs from T1_Glu_38
t1_glu_base_ids = set(t1_glu_df['Base_ID'].tolist())
print(f"Number of unique participants in T1_Glu_38: {len(t1_glu_base_ids)}")

# Check if Arsenii dataset has PSCID_CandID column
print(f"\nArsenii dataset PSCID_CandID column sample:")
print(arsenii_df['PSCID_CandID'].head(10))

# Clean up Arsenii PSCID_CandID (remove _T4 suffix if present)
def clean_pscid_candid(pscid):
    """Remove _T4 or other visit suffixes from PSCID_CandID"""
    pscid = str(pscid)
    # Remove _T4 or _T followed by number
    if '_T' in pscid:
        return '_'.join(pscid.split('_')[:2])
    return pscid

arsenii_df['Base_ID'] = arsenii_df['PSCID_CandID'].apply(clean_pscid_candid)

print(f"\nArsenii cleaned Base_IDs:")
print(arsenii_df[['PSCID_CandID', 'Base_ID']].head(10))

# Get set of base IDs from Arsenii dataset
arsenii_base_ids = set(arsenii_df['Base_ID'].tolist())
print(f"\nNumber of unique participants in Arsenii dataset: {len(arsenii_base_ids)}")

# Find matches
matched_ids = t1_glu_base_ids & arsenii_base_ids
only_in_t1 = t1_glu_base_ids - arsenii_base_ids
only_in_arsenii = arsenii_base_ids - t1_glu_base_ids

print(f"\n{'='*80}")
print("MATCHING RESULTS")
print(f"{'='*80}")
print(f"Participants in both datasets: {len(matched_ids)}")
print(f"Only in T1_Glu_38: {len(only_in_t1)}")
print(f"Only in Arsenii dataset: {len(only_in_arsenii)}")

if only_in_t1:
    print(f"\n{'='*80}")
    print(f"Participants ONLY in T1_Glu_38 ({len(only_in_t1)}):")
    print(f"{'='*80}")
    for pid in sorted(only_in_t1):
        print(f"  - {pid}")

if only_in_arsenii:
    print(f"\n{'='*80}")
    print(f"Participants ONLY in Arsenii dataset (first 20):")
    print(f"{'='*80}")
    for pid in sorted(only_in_arsenii)[:20]:
        print(f"  - {pid}")

Sample T1_Glu_38 IDs:
        Participant_ID         Base_ID
0  3002498_327986_V17S  3002498_327986
1  3100205_255881_V17S  3100205_255881
2  3123186_920577_V10S  3123186_920577
3  3291977_748676_V24S  3291977_748676
4  3388201_333084_V03S  3388201_333084

Number of unique participants in T1_Glu_38: 38

Arsenii dataset PSCID_CandID column sample:
0       3002498_327986
1    3025432_658178_T4
2       3100205_255881
3       3123186_920577
4       3149469_790489
5       3291977_748676
6       3309393_500607
7       3388201_333084
8       3420680_878354
9       3572536_582808
Name: PSCID_CandID, dtype: object

Arsenii cleaned Base_IDs:
        PSCID_CandID         Base_ID
0     3002498_327986  3002498_327986
1  3025432_658178_T4  3025432_658178
2     3100205_255881  3100205_255881
3     3123186_920577  3123186_920577
4     3149469_790489  3149469_790489
5     3291977_748676  3291977_748676
6     3309393_500607  3309393_500607
7     3388201_333084  3388201_333084
8     3420680_878354  34206

In [8]:
"""
Merge T1_Glu_38 with Arsenii dataset
"""

# Merge the datasets on Base_ID
merged_df = pd.merge(
    t1_glu_df,
    arsenii_df,
    on='Base_ID',
    how='inner',  # Only keep participants in both datasets
    suffixes=('_T1Glu', '_Arsenii')
)

print(f"Merged dataset shape: {merged_df.shape}")
print(f"Number of participants: {len(merged_df)}")

# Show some key columns
print(f"\n{'='*80}")
print("SAMPLE OF MERGED DATA")
print(f"{'='*80}")

key_columns = [
    'Participant_ID',
    'Base_ID',
    'diagnostic_Nick',
    'Age_spectro',
    'Glu_mM',  # Precuneus Glu from T1_Glu
    'Glu_mM.1',  # ACC Glu from T1_Glu
    'mM (Precuneus)',  # Precuneus Glu from Arsenii
    'mM (ACC)',  # ACC Glu from Arsenii
]

print(merged_df[key_columns].head(10))

# Save the merged dataset
OUTPUT_MERGED = r"c:\Users\okkam\Documents\GitHub\glutamate_ad_longitudinal_sourcecode\merged_T1Glu_Arsenii.csv"
merged_df.to_csv(OUTPUT_MERGED, index=False)

print(f"\n{'='*80}")
print(f"Merged dataset saved to: {OUTPUT_MERGED}")
print(f"{'='*80}")

# Check if the Glu values match between the two datasets
print(f"\n{'='*80}")
print("CHECKING DATA CONSISTENCY")
print(f"{'='*80}")

# Compare Precuneus Glu values
merged_df['Precuneus_Glu_diff'] = abs(merged_df['Glu_mM'] - merged_df['mM (Precuneus)'])
merged_df['ACC_Glu_diff'] = abs(merged_df['Glu_mM.1'] - merged_df['mM (ACC)'])

print(f"\nPrecuneus Glu difference statistics:")
print(f"Mean difference: {merged_df['Precuneus_Glu_diff'].mean():.6f}")
print(f"Max difference: {merged_df['Precuneus_Glu_diff'].max():.6f}")
print(f"Matches exactly: {(merged_df['Precuneus_Glu_diff'] < 0.001).sum()} out of {len(merged_df)}")

print(f"\nACC Glu difference statistics:")
print(f"Mean difference: {merged_df['ACC_Glu_diff'].mean():.6f}")
print(f"Max difference: {merged_df['ACC_Glu_diff'].max():.6f}")
print(f"Matches exactly: {(merged_df['ACC_Glu_diff'] < 0.001).sum()} out of {len(merged_df)}")

Merged dataset shape: (36, 116)
Number of participants: 36

SAMPLE OF MERGED DATA
        Participant_ID         Base_ID diagnostic_Nick  Age_spectro  \
0  3002498_327986_V17S  3002498_327986              HC         72.6   
1  3100205_255881_V17S  3100205_255881             MCI         73.6   
2  3123186_920577_V10S  3123186_920577            SCD+         71.7   
3  3291977_748676_V24S  3291977_748676             MCI         87.5   
4  3388201_333084_V03S  3388201_333084             SCD         68.8   
5  3634853_127216_V03S  3634853_127216             SCD         68.1   
6  3886505_130777_V17S  3886505_130777            SCD+         71.8   
7  4206804_287981_V17S  4206804_287981             SCD         73.1   
8  4220920_636684_V03S  4220920_636684             MCI         68.2   
9  4273359_571380_V24S  4273359_571380            SCD+         75.2   

      Glu_mM   Glu_mM.1  mM (Precuneus)   mM (ACC)  
0  14.622442  14.187601       14.622442  14.187601  
1  13.118256  12.931811       