In [6]:
# We will need the RBCPath type from the rbclib package to load data from the RBC.
from rbclib import RBCPath

# We'll also want to load some data directly from the filesystem.
from pathlib import Path

# We'll want to load/process some of the data using pandas and numpy.
import pandas as pd
import numpy as np

In [7]:
# This path refers to the repo github.com:ReproBrainChart/PNC_FreeSurfer;
# Subject 1000393599's directory is used as an example.
subject_id = 1000393599
# To browse the repo, use this link:
# https://github.com/ReproBrainChart/PNC_FreeSurfer/tree/main
sub_path = RBCPath(f'rbc://PNC_FreeSurfer/freesurfer/sub-{subject_id}')

# This path refers to a directory:
assert sub_path.is_dir()

# Print each file in the directory:
for file in sub_path.iterdir():
    print(repr(file))

RBCPath('rbc://PNC_FreeSurfer//home/jovyan/shared/data/RBC/PNC_FreeSurfer/freesurfer/sub-1000393599/sub-1000393599_brainmeasures.json')
RBCPath('rbc://PNC_FreeSurfer//home/jovyan/shared/data/RBC/PNC_FreeSurfer/freesurfer/sub-1000393599/sub-1000393599_brainmeasures.tsv')
RBCPath('rbc://PNC_FreeSurfer//home/jovyan/shared/data/RBC/PNC_FreeSurfer/freesurfer/sub-1000393599/sub-1000393599_freesurfer.tar.xz')
RBCPath('rbc://PNC_FreeSurfer//home/jovyan/shared/data/RBC/PNC_FreeSurfer/freesurfer/sub-1000393599/sub-1000393599_fsLR_den-164k.tar.xz')
RBCPath('rbc://PNC_FreeSurfer//home/jovyan/shared/data/RBC/PNC_FreeSurfer/freesurfer/sub-1000393599/sub-1000393599_fsaverage.tar.xz')
RBCPath('rbc://PNC_FreeSurfer//home/jovyan/shared/data/RBC/PNC_FreeSurfer/freesurfer/sub-1000393599/sub-1000393599_regionsurfacestats.tsv')


In [8]:
# We can construct new paths by using the `/` operator. This is identical to
# how paths are constructed in the `pathlib` module.
stats_filepath = sub_path / f'sub-{subject_id}_regionsurfacestats.tsv'

# Use pandas to read in the TSV file then display it:

print(f"Loading {stats_filepath} ...")
with stats_filepath.open('r') as f:
    data = pd.read_csv(f, sep='\t')

data

Loading rbc://PNC_FreeSurfer/freesurfer/sub-1000393599/sub-1000393599_regionsurfacestats.tsv ...


Unnamed: 0,subject_id,session_id,atlas,hemisphere,StructName,NumVert,SurfArea,GrayVol,ThickAvg,ThickStd,...,StdDev_wgpct,Min_wgpct,Max_wgpct,Range_wgpct,SNR_wgpct,Mean_piallgi,StdDev_piallgi,Min_piallgi,Max_piallgi,Range_piallgi
0,sub-1000393599,,aparc.DKTatlas,lh,caudalanteriorcingulate,1668,1121,3493,2.870,0.588,...,5.8371,-1.8413,42.8855,44.7269,4.4281,1.9877,0.0777,1.8054,2.1455,0.3402
1,sub-1000393599,,aparc.DKTatlas,lh,caudalmiddlefrontal,3308,2236,7030,2.882,0.537,...,4.6666,7.1531,40.4774,33.3243,5.0341,3.3898,0.2448,2.7003,3.8032,1.1029
2,sub-1000393599,,aparc.DKTatlas,lh,cuneus,4102,2619,5753,2.019,0.490,...,5.2623,-13.1617,33.8137,46.9754,3.0343,3.2453,0.3093,2.4099,3.5491,1.1392
3,sub-1000393599,,aparc.DKTatlas,lh,entorhinal,737,549,2714,3.655,0.585,...,6.0438,2.5989,37.5099,34.9110,3.4560,2.6710,0.1285,2.4654,2.9647,0.4993
4,sub-1000393599,,aparc.DKTatlas,lh,fusiform,4115,2822,8180,2.738,0.526,...,5.2854,-5.9378,39.6908,45.6286,3.9405,2.8272,0.1093,2.3304,3.1105,0.7800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13735,sub-1000393599,,Yeo2011_7Networks_N1000,rh,7Networks_3,14937,9936,27688,2.611,0.492,...,5.0774,-10.8846,39.2314,50.1161,4.1769,3.1173,0.3747,2.4544,4.7044,2.2500
13736,sub-1000393599,,Yeo2011_7Networks_N1000,rh,7Networks_4,13382,9146,29555,2.909,0.582,...,5.8317,-41.1954,52.2013,93.3967,3.8157,3.5262,0.9928,1.8828,5.1531,3.2703
13737,sub-1000393599,,Yeo2011_7Networks_N1000,rh,7Networks_5,10558,7677,31072,3.196,0.792,...,7.1063,-22.2837,88.8118,111.0955,3.3020,2.5300,0.3971,2.0215,4.7753,2.7538
13738,sub-1000393599,,Yeo2011_7Networks_N1000,rh,7Networks_6,20144,13602,41999,2.696,0.641,...,6.0781,-11.6287,43.5814,55.2101,3.6592,3.0563,0.5547,1.8599,4.9149,3.0550


In [9]:
# Participant meta-data is generally located in the BIDS repository for each
# study:
rbcdata_path = Path('/home/jovyan/shared/data/RBC')
train_filepath = rbcdata_path / 'train_participants.tsv'
test_filepath = rbcdata_path / 'test_participants.tsv'

# Load the PNC participants TSV files...
with train_filepath.open('r') as f:
    train_data = pd.read_csv(f, sep='\t')
with test_filepath.open('r') as f:
    test_data = pd.read_csv(f, sep='\t')

# We can also concatenate the two datasets into a single dataset of all
# study participants:
all_data = pd.concat([train_data, test_data])

# Display the full dataframe:
all_data

Unnamed: 0,participant_id,study,study_site,session_id,wave,age,sex,race,ethnicity,bmi,handedness,participant_education,parent_1_education,parent_2_education,p_factor
0,1000393599,PNC,PNC1,PNC1,1,15.583333,Male,Black,not Hispanic or Latino,22.15,Right,9th Grade,Complete primary,Complete secondary,0.589907
1,1001970838,PNC,PNC1,PNC1,1,17.833333,Male,Other,Hispanic or Latino,23.98,Right,11th Grade,Complete tertiary,Complete tertiary,-0.659061
2,1007995238,PNC,PNC1,PNC1,1,13.750000,Female,Other,not Hispanic or Latino,23.77,Right,6th Grade,Complete tertiary,Complete primary,-1.608375
3,1011497669,PNC,PNC1,PNC1,1,16.666667,Male,White,not Hispanic or Latino,29.68,Right,9th Grade,Complete tertiary,Complete tertiary,-1.233807
4,1017092387,PNC,PNC1,PNC1,1,18.666667,Female,Black,not Hispanic or Latino,23.24,Right,11th Grade,Complete primary,Complete primary,-0.923100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,969649154,PNC,PNC1,PNC1,1,12.333333,Male,White,not Hispanic or Latino,17.38,Right,5th Grade,Complete tertiary,Complete secondary,
530,970890500,PNC,PNC1,PNC1,1,18.166667,Female,White,not Hispanic or Latino,30.89,Right,11th Grade,Complete secondary,Complete secondary,
531,975856179,PNC,PNC1,PNC1,1,11.000000,Male,White,not Hispanic or Latino,15.67,Right,4th Grade,Complete primary,Complete secondary,
532,984757368,PNC,PNC1,PNC1,1,13.416667,Male,Black,not Hispanic or Latino,16.66,Right,5th Grade,Complete primary,,


In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler


In [11]:
def eda_pipeline(data):
    # Drop unnecessary columns - all of these only have 1 value
    data = data.drop(columns = ['study','study_site','session_id','wave'])
    # Drop NA values for parent_1_education
    data = data.dropna(subset=['parent_1_education'])
    # Fill NA values for bmi and parent_2_education
    # Fill bmi with median - slight right skew to original distribution
    data['bmi'] = data['bmi'].fillna(data['bmi'].median())
    # Fill parent_2_education with median - education is ordinal

    ordinal_map_p_ed_2 = {
    'No/incomplete primary': 0,
    'Complete primary': 1,
    'Complete secondary': 2,
    'Complete tertiary': 3
    }
    data['ordinal_p_ed_2'] = data['parent_2_education'].map(ordinal_map_p_ed_2)
    median_code = int(data['ordinal_p_ed_2'].median())
    # Reverse mapping
    rev_map = {v: k for k, v in ordinal_map_p_ed_2.items()}
    median_category = rev_map[median_code]
    data['parent_2_education'] = data['parent_2_education'].fillna(median_category)
    data = data.drop(columns = ['ordinal_p_ed_2'])
    # Encode categorical binary variables (Sex, Handedness, Ethnicity)
    # Example placeholders: 'sex' → Male=1, Female=0; 'handedness' → Right=1, Left=0; 'ethnicity' → Majority=1, Minority=0
    
    binary_encodings = {
        'sex': {'Male': 1, 'Female': 0},
        'ethnicity': {'not Hispanic or Latino': 1, 'Hispanic or Latino': 0}
    }
    
    for col, mapping in binary_encodings.items():
        if col in data.columns:
            data[col] = data[col].map(mapping)
    
    # Standardize continuous variables (Age & BMI)
    scaler = StandardScaler()
    data[['age_std', 'bmi_std']] = scaler.fit_transform(data[['age', 'bmi']])
    
    # One-Hot Encode Multicategory Features: parent_2_education, participant_education, race
    categorical_cols = ['parent_1_education','parent_2_education', 'participant_education', 'race', 'handedness']
        
    # Apply one-hot encoding with pandas
    data = pd.get_dummies(
        data,
        columns=categorical_cols,
        prefix=categorical_cols,
        # keep all categories - might be good for more complex models
        drop_first=False  # keep all categories
    )
    # Convert all resulting dummy columns to int (0/1)
    dummy_cols = [col for col in data.columns if any(col.startswith(c + '_') for c in categorical_cols)]
    data[dummy_cols] = data[dummy_cols].astype(int)
    
    return data

Example: using train_data

In [12]:
td3 = eda_pipeline(train_data)

In [13]:
print(td3.shape)
print(td3.columns)
print(td3.isna().sum())

(1051, 37)
Index(['participant_id', 'age', 'sex', 'ethnicity', 'bmi', 'p_factor',
       'age_std', 'bmi_std', 'parent_1_education_Complete primary',
       'parent_1_education_Complete secondary',
       'parent_1_education_Complete tertiary',
       'parent_1_education_No/incomplete primary',
       'parent_2_education_Complete primary',
       'parent_2_education_Complete secondary',
       'parent_2_education_Complete tertiary',
       'parent_2_education_No/incomplete primary',
       'participant_education_10th Grade', 'participant_education_11th Grade',
       'participant_education_12th Grade', 'participant_education_1st Grade',
       'participant_education_2nd Grade', 'participant_education_3rd Grade',
       'participant_education_4th Grade', 'participant_education_5th Grade',
       'participant_education_6th Grade', 'participant_education_7th Grade',
       'participant_education_8th Grade', 'participant_education_9th Grade',
       'participant_education_Bachelor's Degree

In [14]:
td3.head()

Unnamed: 0,participant_id,age,sex,ethnicity,bmi,p_factor,age_std,bmi_std,parent_1_education_Complete primary,parent_1_education_Complete secondary,...,participant_education_9th Grade,participant_education_Bachelor's Degree,participant_education_Some College,race_Asian,race_Black,race_Other,race_White,handedness_Ambidextrous,handedness_Left,handedness_Right
0,1000393599,15.583333,1,1,22.15,0.589907,0.185011,-0.064539,1,0,...,1,0,0,0,1,0,0,0,0,1
1,1001970838,17.833333,1,0,23.98,-0.659061,0.787288,0.324304,0,0,...,0,0,0,0,0,1,0,0,0,1
2,1007995238,13.75,0,1,23.77,-1.608375,-0.305734,0.279682,0,0,...,0,0,0,0,0,1,0,0,0,1
3,1011497669,16.666667,1,1,29.68,-1.233807,0.474996,1.535453,0,0,...,1,0,0,0,0,0,1,0,0,1
4,1017092387,18.666667,0,1,23.24,-0.9231,1.010353,0.167067,1,0,...,0,0,0,0,1,0,0,0,0,1


In [15]:
import pandas as pd

In [16]:
mda = pd.read_csv("merged_data_aggregated.csv")

In [17]:
mda.head()

Unnamed: 0,participant_id,study,study_site,session_id,wave,age,sex,race,ethnicity,bmi,...,LimbicB:StdDev_wgpct,LimbicB:Min_wgpct,LimbicB:Max_wgpct,LimbicB:Range_wgpct,LimbicB:SNR_wgpct,LimbicB:Mean_piallgi,LimbicB:StdDev_piallgi,LimbicB:Min_piallgi,LimbicB:Max_piallgi,LimbicB:Range_piallgi
0,1000393599,PNC,PNC1,PNC1,1,15.583333,Male,Black,not Hispanic or Latino,22.15,...,7.74245,-22.323,66.6207,88.9437,2.93995,2.377,0.215,2.01445,3.24485,1.23045
1,1001970838,PNC,PNC1,PNC1,1,17.833333,Male,Other,Hispanic or Latino,23.98,...,8.06155,-24.73265,45.9391,70.6717,2.83385,2.2424,0.14335,1.99275,2.7745,0.78175
2,1007995238,PNC,PNC1,PNC1,1,13.75,Female,Other,not Hispanic or Latino,23.77,...,7.9687,-20.56425,43.4519,64.01615,2.63275,2.36035,0.2221,2.1049,3.47415,1.3693
3,1011497669,PNC,PNC1,PNC1,1,16.666667,Male,White,not Hispanic or Latino,29.68,...,7.98535,-28.9526,63.10885,92.06145,2.84495,2.3061,0.1352,2.10565,2.82025,0.71455
4,1017092387,PNC,PNC1,PNC1,1,18.666667,Female,Black,not Hispanic or Latino,23.24,...,7.3091,-21.0214,51.1549,72.1763,2.82865,2.2166,0.14775,2.0075,2.80105,0.79355


In [21]:
posteda = eda_pipeline(mda)
posteda.head()


Unnamed: 0.1,participant_id,age,sex,ethnicity,bmi,p_factor_x,Unnamed: 0,p_factor_y,SalVentAttnA:NumVert,SalVentAttnA:SurfArea,...,participant_education_9th Grade,participant_education_Bachelor's Degree,participant_education_Some College,race_Asian,race_Black,race_Other,race_White,handedness_Ambidextrous,handedness_Left,handedness_Right
0,1000393599,15.583333,1,1,22.15,0.589907,0,0.589907,2041.111111,1386.111111,...,1,0,0,0,1,0,0,0,0,1
1,1001970838,17.833333,1,0,23.98,-0.659061,1,-0.659061,2084.555556,1400.222222,...,0,0,0,0,0,1,0,0,0,1
2,1007995238,13.75,0,1,23.77,-1.608375,2,-1.608375,1796.444444,1215.222222,...,0,0,0,0,0,1,0,0,0,1
3,1011497669,16.666667,1,1,29.68,-1.233807,3,-1.233807,1875.111111,1279.888889,...,1,0,0,0,0,0,1,0,0,1
4,1017092387,18.666667,0,1,23.24,-0.9231,4,-0.9231,1877.777778,1286.666667,...,0,0,0,0,1,0,0,0,0,1


In [22]:
posteda = posteda.rename(columns={'p_factor_x': 'p_factor'}).drop(columns=['p_factor_y'])

In [23]:
posteda.to_csv("posteda.csv")