In [None]:
# We will need the RBCPath type from the rbclib package to load data from the RBC.
from rbclib import RBCPath

# We'll also want to load some data directly from the filesystem.
from pathlib import Path

# We'll want to load/process some of the data using pandas and numpy.
import pandas as pd
import numpy as np

In [None]:
# Example subject: 1000393599, taken from the repo
# github.com:ReproBrainChart/PNC_FreeSurfer, which can be browsed at
# https://github.com/ReproBrainChart/PNC_FreeSurfer/tree/main
subject_id = 1000393599
sub_path = RBCPath(f'rbc://PNC_FreeSurfer/freesurfer/sub-{subject_id}')

# The subject path is expected to point at a directory.
assert sub_path.is_dir()

# List the directory's entries, one repr per line.
for entry in sub_path.iterdir():
    print(repr(entry))

In [None]:
# RBC paths compose with the `/` operator, exactly like `pathlib` paths.
stats_filepath = sub_path / f'sub-{subject_id}_regionsurfacestats.tsv'

# Read the tab-separated stats file with pandas, then display the table.
print(f"Loading {stats_filepath} ...")
with stats_filepath.open('r') as stats_file:
    # read_table defaults to a tab separator.
    data = pd.read_table(stats_file)

data

In [None]:
# Participant metadata for the train/test split is read from tab-separated
# files in the shared RBC data directory:
rbcdata_path = Path('/home/jovyan/shared/data/RBC')
train_filepath = rbcdata_path / 'train_participants.tsv'
test_filepath = rbcdata_path / 'test_participants.tsv'

# Load the PNC participants TSV files. pandas accepts pathlib paths directly,
# so no explicit file handle is needed.
train_data = pd.read_csv(train_filepath, sep='\t')
test_data = pd.read_csv(test_filepath, sep='\t')

# Stack the two tables into a single dataset of all study participants.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of both inputs are kept, so the same label appears twice and
# label-based lookups such as all_data.loc[0] would match two rows.
all_data = pd.concat([train_data, test_data], ignore_index=True)

# Display the combined dataframe:
all_data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler


In [None]:
def eda_pipeline(data):
    """Clean, impute and encode a participants table.

    The frame passed in is left untouched; a new, processed frame is returned.

    Steps, in order:

    1. Drop rows whose ``parent_1_education`` is missing.
    2. Fill missing ``bmi`` with the median ``bmi`` of the remaining rows.
    3. Fill missing ``parent_2_education`` with the median education level,
       where levels are ranked with the ordinal map defined below. If no row
       holds a recognised level there is no median, and the column is left
       as it is.
    4. Map ``sex`` and ``ethnicity`` to 0/1 codes (skipped for a column that
       is absent). Values that are not keys of the mapping become NaN.
    5. Add ``age_std`` and ``bmi_std``, the result of
       ``StandardScaler().fit_transform`` on ``age`` and ``bmi``.
    6. One-hot encode ``parent_1_education``, ``parent_2_education``,
       ``participant_education``, ``race`` and ``handedness`` into integer
       0/1 columns, keeping every category.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``parent_1_education``, ``parent_2_education``,
        ``participant_education``, ``race``, ``handedness``, ``age`` and
        ``bmi``. ``sex`` and ``ethnicity`` are optional.

    Returns
    -------
    pandas.DataFrame
        The processed frame. The five one-hot encoded source columns are
        replaced by their ``<column>_<category>`` indicator columns.

    Notes
    -----
    The scaler is fit on the frame passed in, so calling this function
    separately on two frames standardises each with its own mean and spread.
    """
    # Explicit copy: the assignments below then act on a frame this function
    # owns, which avoids pandas' SettingWithCopyWarning on the dropna() result.
    data = data.dropna(subset=['parent_1_education']).copy()

    # Median rather than mean for bmi (original author's note: the
    # distribution has a slight right skew).
    data['bmi'] = data['bmi'].fillna(data['bmi'].median())

    # Education is ordinal, so the fill value for parent_2_education is the
    # median level. The ranks are computed in a separate Series instead of a
    # temporary column on the frame.
    ordinal_map_p_ed_2 = {
        'No/incomplete primary': 0,
        'Complete primary': 1,
        'Complete secondary': 2,
        'Complete tertiary': 3,
    }
    education_rank = data['parent_2_education'].map(ordinal_map_p_ed_2)
    median_rank = education_rank.median()
    # The median is NaN when no value is recognised; int(NaN) would raise, so
    # imputation is skipped in that case.
    if pd.notna(median_rank):
        rev_map = {v: k for k, v in ordinal_map_p_ed_2.items()}
        # int() truncates a half-way median (e.g. 2.5) to the lower level.
        median_category = rev_map[int(median_rank)]
        data['parent_2_education'] = data['parent_2_education'].fillna(median_category)

    # Binary 0/1 codes for the two-category variables.
    binary_encodings = {
        'sex': {'Male': 1, 'Female': 0},
        'ethnicity': {'not Hispanic or Latino': 1, 'Hispanic or Latino': 0},
    }
    for col, mapping in binary_encodings.items():
        if col in data.columns:
            data[col] = data[col].map(mapping)

    # Standardized copies of the continuous variables; the raw columns stay.
    scaler = StandardScaler()
    data[['age_std', 'bmi_std']] = scaler.fit_transform(data[['age', 'bmi']])

    # One-hot encode the multi-category features. All categories are kept
    # (drop_first=False), and dtype=int emits 0/1 integers directly, so no
    # follow-up cast by column-name prefix is needed (such a cast would also
    # hit unrelated columns that merely share a prefix).
    categorical_cols = ['parent_1_education', 'parent_2_education',
                        'participant_education', 'race', 'handedness']
    data = pd.get_dummies(
        data,
        columns=categorical_cols,
        prefix=categorical_cols,
        drop_first=False,
        dtype=int,
    )

    return data

Example: applying `eda_pipeline` to `train_data`

In [None]:
# Run the preprocessing pipeline on the training participants; the processed
# frame is kept in `td3`.
td3 = eda_pipeline(train_data)

In [None]:
# Quick sanity check of the processed frame: its dimensions, its column
# labels, and the number of missing values left in each column.
for summary in (td3.shape, td3.columns, td3.isna().sum()):
    print(summary)

In [None]:
# Show the first rows of the processed training frame.
td3.head()