Parkinson's Progression Markers Initiative (PPMI) —
Preprocessing & Analysis Starter Notebook
Purpose:
- Ready-to-run Jupyter notebook scaffold to prepare PPMI Clinical data for downstream
modeling and analysis. Designed to work with CSV downloads from the PPMI portal.
Usage:
1. Place your downloaded PPMI CSV files into `./PPMI_raw/` (or change DATA_DIR below).
2. Run the cells in order. Cells that depend on files you don't yet have will warn but
will not crash.
3. Once cleaned tables exist in `./processed/`, re-run the loader cells to continue the ML pipeline.

 Notebook author: Ahmed Fawaz



In [None]:
# -------------------------
# 0. Environment & Settings
# -------------------------
import os
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging


# Plot styling and a wider column limit for DataFrame previews
sns.set(style="whitegrid")
pd.set_option("display.max_columns", 120)


# Log records are emitted as "[LEVEL] message" from INFO upwards
logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.INFO)


# Working directories; each one can be overridden through an environment variable
DATA_DIR = os.environ.get('PPMI_RAW_DIR', './PPMI_raw')
PROCESSED_DIR = os.environ.get('PPMI_PROCESSED_DIR', './processed')
OUTPUT_DIR = os.environ.get('PPMI_OUTPUT_DIR', './output')

# Create whichever of the three directories does not exist yet
for _folder in (DATA_DIR, PROCESSED_DIR, OUTPUT_DIR):
    os.makedirs(_folder, exist_ok=True)


# Echo the resolved locations so the active configuration is visible in the output
print(f"DATA_DIR = {DATA_DIR}")
print(f"PROCESSED_DIR = {PROCESSED_DIR}")
print(f"OUTPUT_DIR = {OUTPUT_DIR}")



Quick cell: install missing packages (run only if necessary)

If you're running in a fresh environment, uncomment & run the next cell to install packages.



# !pip install pandas matplotlib seaborn scikit-learn imbalanced-learn shap lime

In [None]:
# ## 1. Helper functions to discover and load PPMI CSVs
# The PPMI download contains many CSVs whose filenames may include a date suffix.
# These helpers find files by partial name match and read them robustly.





def find_file(partial_name, directory=None):
    """Return the CSV in ``directory`` whose name most closely matches ``partial_name``.

    Files are matched with the glob pattern ``*<partial_name>*.csv``. When
    several files match, the one with the shortest file name wins (ties are
    broken alphabetically). Plain alphabetical order is not enough because one
    table name can be a prefix of another: ``MDS-UPDRS_Part_I`` also matches
    ``MDS-UPDRS_Part_III_<date>.csv``, which sorts *before*
    ``MDS-UPDRS_Part_I_<date>.csv``.

    Args:
        partial_name: Fragment of the file name to look for.
        directory: Folder to search. ``None`` (the default) means the current
            value of ``DATA_DIR``.

    Returns:
        The path of the selected file, or ``None`` (after logging a warning)
        when nothing matches.
    """
    if directory is None:
        # Resolved at call time so that re-running the settings cell with a
        # different DATA_DIR takes effect without redefining this helper.
        directory = DATA_DIR

    pattern = os.path.join(directory, f"*{partial_name}*.csv")
    # Shortest name first: the file carrying the fewest extra characters around
    # partial_name is the closest match.
    matches = sorted(
        glob(pattern),
        key=lambda path: (len(os.path.basename(path)), path),
    )
    if not matches:
        logging.warning(f"No file matches for '{partial_name}' in {directory}")
        return None
    if len(matches) > 1:
        logging.info(
            f"Multiple matches for '{partial_name}', using closest name: {matches[0]}"
        )
    return matches[0]




def load_csv_by_partial(partial_name, directory=DATA_DIR, dtype=None, parse_dates=None):
    """Read the CSV that ``find_file`` selects for ``partial_name``.

    Args:
        partial_name: Fragment of the file name to look for.
        directory: Folder to search (defaults to ``DATA_DIR``).
        dtype: Forwarded to ``pandas.read_csv``.
        parse_dates: Forwarded to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when no file matches or reading fails
        (the failure is logged rather than raised).
    """
    csv_path = find_file(partial_name, directory)
    if csv_path is None:
        return None

    read_options = {'dtype': dtype, 'parse_dates': parse_dates, 'low_memory': False}
    try:
        table = pd.read_csv(csv_path, **read_options)
        logging.info(f"Loaded '{partial_name}' -> {os.path.basename(csv_path)} ({table.shape})")
    except Exception as e:
        # Best effort by design: a broken file is reported and treated as missing.
        logging.error(f"Failed to load {csv_path}: {e}")
        return None
    return table



In [None]:
# ## 2. Load core PPMI tables (safe to run now; will warn if files missing)
# We'll attempt to load the most commonly used clinical tables referenced in the
# PPMI Data User Guide. If you only have a subset, proceed with what you have.



# File-name fragments of the clinical tables to look for. Every entry starts as
# None and is replaced by a DataFrame when a matching CSV can be loaded.
core_tables = dict.fromkeys([
    'Participant_Status',
    'Demographics',
    'MDS-UPDRS_Part_I',
    'MDS-UPDRS_Part_I_Patient_Questionnaire',
    'MDS-UPDRS_Part_II__Patient_Questionnaire',
    'MDS-UPDRS_Part_III',
    'LEDD_Concomitant_Medication_Log',
    'Concomitant_Medication_Log',
    'UPSIT',
    'Codes',
])


# Only values are reassigned, so iterating the dict directly is safe here
for name in core_tables:
    core_tables[name] = load_csv_by_partial(name)


# Short aliases for the loaded tables (each stays None when its file is absent)
Participant_Status = core_tables['Participant_Status']
Demographics = core_tables['Demographics']
MDS_UPDRS_I = core_tables['MDS-UPDRS_Part_I']
MDS_UPDRS_I_P = core_tables['MDS-UPDRS_Part_I_Patient_Questionnaire']
MDS_UPDRS_II = core_tables['MDS-UPDRS_Part_II__Patient_Questionnaire']
MDS_UPDRS_III = core_tables['MDS-UPDRS_Part_III']
LEDD_log = core_tables['LEDD_Concomitant_Medication_Log']
Concomitant_Med = core_tables['Concomitant_Medication_Log']
Codes = core_tables['Codes']
UPSIT = core_tables['UPSIT']



In [None]:
# ## 3. Basic inspection utilities
# Small helpers to inspect tables and common issues found in PPMI data (case variations,
# date parsing needs, missing PATNOs, etc.)




def show_basic_info(df, name, n=5):
    """Print a quick overview of ``df``: shape, first rows, dtypes, missing values.

    Args:
        df: DataFrame to inspect, or ``None`` when the table was not loaded.
        name: Label used in the header line and in the warning message.
        n: Number of leading rows to show.

    Returns:
        None. When ``df`` is ``None`` a warning is logged and nothing is printed.
    """
    if df is None:
        logging.warning(f"{name} is not loaded")
        return

    # `display` is only injected by IPython/Jupyter; fall back to print so the
    # helper also works when this code runs as a plain Python script.
    try:
        show = display
    except NameError:
        show = print

    print(f"\n--- {name} ({df.shape}) ---")
    show(df.head(n))
    print('\nColumn dtypes:')
    print(df.dtypes)
    print('\nMissing values (top 10):')
    print(df.isnull().sum().sort_values(ascending=False).head(10))




# Example usage: preview two of the loaded core tables.
# Safe to run even when a table is missing: show_basic_info only logs a
# warning and returns when it is handed None.
show_basic_info(Participant_Status, 'Participant_Status')
show_basic_info(Demographics, 'Demographics')



In [None]:
# ## 4. Build Participant Master table (static participant-level info)
# Follow guide: keep participants with ENROLL_STATUS in (enrolled, withdrew, complete)





def build_participant_master(participant_status_df, demographics_df, codes_df=None):
    """Build the participant-level master table.

    Participants are kept when their ENROLL_STATUS (compared case-insensitively
    and ignoring surrounding whitespace) is one of enrolled / withdrew /
    complete. Demographics are then left-joined on PATNO; demographic columns
    whose names clash with Participant_Status get a ``_demo`` suffix.

    Args:
        participant_status_df: Participant_Status table (required).
        demographics_df: Demographics table, or ``None`` to skip the join.
        codes_df: Codes table. Currently unused; reserved for decoding coded
            fields.

    Returns:
        A new DataFrame (inputs are not modified), or ``None`` when
        ``participant_status_df`` is ``None``. In the result ENROLL_STATUS is
        lower-cased and stripped. If PATNO is missing from either table the
        filtered participants are returned without demographics.
    """
    if participant_status_df is None:
        logging.error("Participant_Status table is required to build master table")
        return None
    ps = participant_status_df.copy()

    valid_status = ['enrolled', 'withdrew', 'complete']
    if 'ENROLL_STATUS' in ps.columns:
        # Normalise case and stray whitespace so ' Withdrew ' still counts.
        # Missing values become the string 'nan' and are filtered out below.
        ps['ENROLL_STATUS'] = ps['ENROLL_STATUS'].astype(str).str.strip().str.lower()
        ps = ps[ps['ENROLL_STATUS'].isin(valid_status)].copy()
    else:
        logging.warning('ENROLL_STATUS missing; proceeding without filtering by status')

    if demographics_df is None:
        return ps

    # The join key must exist on both sides; otherwise skip the merge instead
    # of failing with a KeyError.
    lacking_key = [
        label
        for label, frame in (('Participant_Status', ps), ('Demographics', demographics_df))
        if 'PATNO' not in frame.columns
    ]
    if lacking_key:
        logging.warning(
            "PATNO missing in %s; returning participants without demographics",
            ' and '.join(lacking_key),
        )
        return ps

    merged = ps.merge(demographics_df, on='PATNO', how='left', suffixes=('', '_demo'))
    if len(merged) > len(ps):
        # A left join only grows when the right side repeats a key.
        logging.warning(
            "Demographics has repeated PATNO values: %d participants became %d rows",
            len(ps), len(merged),
        )

    # Optionally decode codes with Codes table
    # (Codes table has mappings for many coded fields; user can expand as needed)
    return merged


# Build the participant-level master table from whichever tables were loaded
# and preview it. participant_master is None when Participant_Status is
# missing; show_basic_info then only logs a warning.
participant_master = build_participant_master(Participant_Status, Demographics, Codes)
show_basic_info(participant_master, 'Participant_Master')

In [None]:
# ## 5. Cohort composition & quick stats (for data access proposal / EDA)




def cohort_summary(participant_status_df):
    """Show a COHORT_DEFINITION x ENROLL_STATUS table of participant counts.

    ENROLL_STATUS is lower-cased and stripped on a copy before counting, so
    the caller's DataFrame is left untouched.

    Args:
        participant_status_df: Participant_Status table, or ``None``.

    Returns:
        None. The table is shown with ``display`` when available, otherwise
        printed. A warning is logged (and nothing shown) when the table is
        ``None`` or lacks one of the two required columns.
    """
    if participant_status_df is None:
        logging.warning('Participant_Status not available')
        return

    required = ('COHORT_DEFINITION', 'ENROLL_STATUS')
    absent = [col for col in required if col not in participant_status_df.columns]
    if absent:
        # Without this guard the groupby below fails with a KeyError.
        logging.warning("Cannot summarise cohort; missing column(s): %s", ', '.join(absent))
        return

    ps = participant_status_df.copy()
    ps['ENROLL_STATUS'] = ps['ENROLL_STATUS'].astype(str).str.strip().str.lower()
    counts = ps.groupby(['COHORT_DEFINITION', 'ENROLL_STATUS']).size().unstack(fill_value=0)

    # `display` only exists inside IPython/Jupyter; print elsewhere.
    try:
        show = display
    except NameError:
        show = print
    show(counts)


# Show enrollment-status counts per cohort for the loaded Participant_Status table
cohort_summary(Participant_Status)

In [None]:
# ## 6. Preparing longitudinal UPDRS summary (example)
# We create a per-visit total for each participant for Part I / II / III where available.
# The user guide notes that part III may have special '101' values (unable to rate).





def compute_updrs_total(df_part, part_prefix='NP', unable_to_rate=101):
    """Add a per-row ``TOTAL`` column summing the item scores of a UPDRS table.

    Score columns are the columns whose name starts with ``part_prefix``
    (case-insensitive). Columns ending in ``TOT`` are left out because they
    look like precomputed totals and would be double counted (assumption:
    confirm against the PPMI data dictionary). If no column carries the prefix,
    all numeric columns except identifiers are used instead.

    Score columns are converted to numbers (unparseable values become NaN) and
    the ``unable_to_rate`` code is replaced by NaN so it cannot inflate the
    sum. Non-score columns such as PATNO are never modified or summed.

    Args:
        df_part: UPDRS table, or ``None``.
        part_prefix: Prefix that identifies item-score columns.
        unable_to_rate: Sentinel meaning "unable to rate" (101 per the user
            guide, noted for Part III). Pass ``None`` to keep such values.

    Returns:
        A copy of ``df_part`` with INFODT parsed to datetime (when present),
        cleaned score columns and a ``TOTAL`` column. ``TOTAL`` is NaN for rows
        without any valid score. The copy is returned without ``TOTAL`` when no
        score column can be identified; ``None`` when ``df_part`` is ``None``.
    """
    if df_part is None:
        logging.warning('UPDRS table not provided')
        return None
    df = df_part.copy()

    # normalize INFODT if present
    if 'INFODT' in df.columns:
        df['INFODT'] = pd.to_datetime(df['INFODT'], errors='coerce')

    # Columns that must never be treated as item scores.
    non_score = {'PATNO', 'EVENT_ID', 'REC_ID', 'INFODT', 'TOTAL'}

    def _is_candidate(col):
        return col not in non_score and not str(col).upper().endswith('TOT')

    prefix = str(part_prefix).upper() if part_prefix else ''
    score_cols = []
    if prefix:
        score_cols = [
            c for c in df.columns
            if str(c).upper().startswith(prefix) and _is_candidate(c)
        ]

    # Fallback: numeric columns except identifiers / dates / totals
    if not score_cols:
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        score_cols = [c for c in numeric_cols if _is_candidate(c)]

    if not score_cols:
        logging.warning('No candidate score columns found to compute TOTAL')
        return df

    scores = df[score_cols].apply(pd.to_numeric, errors='coerce')
    if unable_to_rate is not None:
        # 'unable to rate' is a code, not a score.
        scores = scores.mask(scores == unable_to_rate)
    df[score_cols] = scores

    # min_count=1 keeps TOTAL missing (instead of 0) when a row has no scores.
    df['TOTAL'] = scores.sum(axis=1, skipna=True, min_count=1)
    return df


# Compute per-row totals for UPDRS Part I and preview the result.
# updrs1_tot is None when the Part I table was not loaded.
updrs1_tot = compute_updrs_total(MDS_UPDRS_I)
show_basic_info(updrs1_tot, 'UPDRS Part I (with TOTAL)')



In [None]:
# ## 7. Visual example: UPDRS progression for a sample participant




def plot_updrs_progression(updrs_df, patno=None, title_prefix='UPDRS'):
    """Plot the TOTAL score of one participant across visits.

    Visits are ordered by INFODT (parsed to datetime first so the order is
    chronological rather than alphabetical) when that column exists; otherwise
    by EVENT_ID when present, with a running visit index on the x axis.
    TOTAL is computed with ``compute_updrs_total`` when the table lacks it.

    Args:
        updrs_df: UPDRS table containing a PATNO column, or ``None``.
        patno: Participant to plot. Defaults to the PATNO of the first row.
        title_prefix: Text placed at the start of the figure title.

    Returns:
        None. Nothing is drawn (a message is logged instead) when the table is
        ``None`` or empty, has no PATNO column, contains no rows for ``patno``,
        or when TOTAL cannot be computed.
    """
    if updrs_df is None:
        logging.warning('No UPDRS data passed')
        return
    if 'PATNO' not in updrs_df.columns:
        logging.error('PATNO not in UPDRS table')
        return
    if updrs_df.empty:
        # .iloc[0] below would raise IndexError on an empty table.
        logging.warning('UPDRS table is empty; nothing to plot')
        return
    if patno is None:
        patno = updrs_df['PATNO'].iloc[0]

    sel = updrs_df[updrs_df['PATNO'] == patno].copy()
    if sel.empty:
        logging.warning(f"No UPDRS rows found for PATNO {patno}")
        return

    if 'TOTAL' not in sel.columns:
        logging.warning('TOTAL not computed; attempting to compute')
        sel = compute_updrs_total(sel)
        if sel is None or 'TOTAL' not in sel.columns:
            logging.error('TOTAL could not be computed; skipping plot')
            return

    has_dates = 'INFODT' in sel.columns
    if has_dates:
        # Sorting raw strings such as '10/2019' vs '02/2020' is not chronological.
        sel['INFODT'] = pd.to_datetime(sel['INFODT'], errors='coerce')
        sel = sel.sort_values('INFODT')
        x = sel['INFODT']
    else:
        if 'EVENT_ID' in sel.columns:
            sel = sel.sort_values('EVENT_ID')
        x = range(len(sel))

    plt.figure(figsize=(10, 4))
    plt.plot(x, sel['TOTAL'], marker='o')
    plt.title(f"{title_prefix} progression for PATNO {patno}")
    plt.xlabel('Visit Date' if has_dates else 'Visit Index')
    plt.ylabel('Total Score')
    plt.tight_layout()
    plt.show()


# Example plot: drawn only when Part I totals exist and contain at least one row;
# the participant of the first row is used as the sample.
if not (updrs1_tot is None or updrs1_tot.empty):
    sample_pat = updrs1_tot['PATNO'].iloc[0]
    plot_updrs_progression(
        updrs1_tot,
        patno=sample_pat,
        title_prefix='UPDRS Part I',
    )



In [None]:
# ## 8. Medication (LEDD) processing example
# The guide suggests creating a table that records LEDD changes over time. We'll provide
# a helper to build a simple version from the LEDD_Concomitant_Medication_Log table.





def build_ledd_timeline(ledd_df):
    """Sum LEDD values per participant and start date.

    Note: the PPMI guide uses month-level dates; this is a conservative example.

    The LEDD value is read from the first column named LEDD, LEDDSUM or LD
    (case-insensitive), or failing that from the first column whose name
    contains "LEDD". Arbitrary numeric columns are deliberately NOT used as a
    fallback, since the first numeric column of such a log is typically an
    identifier like PATNO.

    Args:
        ledd_df: LEDD medication log with PATNO and STARTDT columns, or ``None``.

    Returns:
        A DataFrame with columns PATNO, STARTDT (datetime) and LEDD_VAL (sum of
        the numeric LEDD values that share a PATNO and STARTDT), sorted by
        PATNO then STARTDT. Rows whose LEDD value is not numeric or whose
        STARTDT cannot be parsed are left out. ``None`` when ``ledd_df`` is
        ``None`` or lacks PATNO / STARTDT.
    """
    if ledd_df is None:
        logging.warning('LEDD table not available')
        return None

    absent = [col for col in ('PATNO', 'STARTDT') if col not in ledd_df.columns]
    if absent:
        # Both columns are grouping keys below; bail out instead of raising KeyError.
        logging.warning('LEDD table is missing required column(s): %s', ', '.join(absent))
        return None

    df = ledd_df.copy()
    df['STARTDT'] = pd.to_datetime(df['STARTDT'], errors='coerce')

    # normalize LEDD column name possibilities: exact names first, then any
    # column that mentions LEDD
    exact = [c for c in df.columns if str(c).upper() in ('LEDD', 'LEDDSUM', 'LD')]
    loose = [c for c in df.columns if 'LEDD' in str(c).upper()]
    led_cols = exact or loose
    if led_cols:
        df['LEDD_VAL'] = pd.to_numeric(df[led_cols[0]], errors='coerce')
    else:
        logging.warning('No LEDD-like column found')
        df['LEDD_VAL'] = np.nan

    # keep only rows where LEDD_VAL is present
    df = df[df['LEDD_VAL'].notna()]

    # groupby would drop rows with a missing STARTDT silently; report them.
    undated = int(df['STARTDT'].isna().sum())
    if undated:
        logging.warning('Dropping %d LEDD row(s) with missing or unparseable STARTDT', undated)
        df = df[df['STARTDT'].notna()]

    out = df.groupby(['PATNO', 'STARTDT'])['LEDD_VAL'].sum().reset_index()
    out = out.sort_values(['PATNO', 'STARTDT'])
    return out


# Build the LEDD timeline from the loaded medication log and preview it.
# led_timeline is None when the LEDD table was not loaded.
led_timeline = build_ledd_timeline(LEDD_log)
show_basic_info(led_timeline, 'LEDD Timeline Sample')



In [None]:
# ## 9. Processed data preparation (all work done in this notebook)
# 1. Load raw PPMI CSV files from `./PPMI_raw/`.
# 2. Clean column names, fix date formats, harmonize EVENT_ID, and standardize missing values.
# 3. Create clinical tables, UPDRS longitudinal tables, and voice/acoustic feature tables directly in Python.
# 4. Save cleaned versions into `./processed/` so later ML cells read from them.
#
# Load any already-processed or partially cleaned files if desired:


def load_processed(name, directory=None):
    """Load an already-processed CSV whose file name contains ``name``.

    Delegates to ``load_csv_by_partial`` so processed files get the same
    treatment as raw ones: a missing or unreadable file is logged and reported
    as ``None`` instead of raising.

    Args:
        name: Fragment of the processed file's name (e.g. 'clinical_clean').
        directory: Folder to search. ``None`` (the default) means the current
            value of ``PROCESSED_DIR``.

    Returns:
        The loaded DataFrame, or ``None`` when no file matches or reading fails.
    """
    if directory is None:
        # Looked up at call time so a changed PROCESSED_DIR is honoured.
        directory = PROCESSED_DIR
    return load_csv_by_partial(name, directory=directory)


# Each of these is None until the corresponding file has been written to PROCESSED_DIR
processed_clinical = load_processed('clinical_clean')
processed_updrs = load_processed('updrs_longitudinal')
processed_voice = load_processed('voice_features')


show_basic_info(processed_clinical, 'Processed Clinical')
show_basic_info(processed_updrs, 'Processed UPDRS')
show_basic_info(processed_voice, 'Processed Voice Features')



In [None]:
# ## 10. Quick ML-ready checks & train/test split (example using processed_clinical)
# These cells assume section 9 has produced a tidy clinical table (`clinical_clean`) ready for modeling where:
# - PATNO is present
# - TARGET is a binary column (1 = PD, 0 = Control)



from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler




def prepare_ml_data(clinical_df, target_col='TARGET', id_col='PATNO', test_size=0.2, random_state=42):
    """Split the clinical table into train/test sets with imputed, scaled numeric features.

    The data is split first (stratified on the target); the median imputer and
    the standard scaler are then fitted on the training rows only and applied
    to both parts, so no test-set statistics leak into training.

    NOTE(review): rows are split independently. If the table holds several
    rows per participant, the same PATNO can land in both parts; consider a
    group-aware split in that case.

    Args:
        clinical_df: Processed clinical table, or ``None``.
        target_col: Name of the binary label column.
        id_col: Name of the participant id column (removed from the features).
        test_size: Fraction of rows assigned to the test set.
        random_state: Seed for the split.

    Returns:
        ``(X_train, X_test, y_train, y_test)``, or ``None`` when the table is
        ``None`` or lacks ``id_col`` / ``target_col``. Rows without a target
        are dropped. Numeric columns that are entirely missing in the training
        rows are dropped from both feature sets. Non-numeric columns are
        passed through unchanged.
    """
    if clinical_df is None:
        logging.error('Processed clinical dataframe required for ML prep')
        return None
    df = clinical_df.copy()
    if id_col not in df.columns or target_col not in df.columns:
        logging.error(f"Required columns '{id_col}' or '{target_col}' missing in processed clinical data")
        return None

    # astype(int) below cannot convert NaN, so unlabeled rows are removed first.
    unlabeled = int(df[target_col].isna().sum())
    if unlabeled:
        logging.warning('Dropping %d row(s) with missing %s', unlabeled, target_col)
        df = df[df[target_col].notna()]

    X = df.drop(columns=[id_col, target_col])
    y = df[target_col].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    # Work on independent copies so the column assignments below are unambiguous.
    X_train = X_train.copy()
    X_test = X_test.copy()

    num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

    # A column with no observed training value has no median to impute.
    empty_cols = [c for c in num_cols if X_train[c].isna().all()]
    if empty_cols:
        logging.warning('Dropping all-missing numeric column(s): %s', ', '.join(map(str, empty_cols)))
        X_train = X_train.drop(columns=empty_cols)
        X_test = X_test.drop(columns=empty_cols)
        num_cols = [c for c in num_cols if c not in empty_cols]

    if not num_cols:
        logging.warning('No numeric feature columns found; skipping imputation and scaling')
        return X_train, X_test, y_train, y_test

    # Fit on the training rows only, then apply the same transform to the test rows.
    imp = SimpleImputer(strategy='median')
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(imp.fit_transform(X_train[num_cols]))
    X_test[num_cols] = scaler.transform(imp.transform(X_test[num_cols]))

    return X_train, X_test, y_train, y_test


# Example split: ml_split_example stays None when processed_clinical is absent
# or when prepare_ml_data rejects the table.
ml_split_example = (
    prepare_ml_data(processed_clinical, target_col='TARGET')
    if processed_clinical is not None
    else None
)
if ml_split_example is not None:
    X_train, X_test, y_train, y_test = ml_split_example
    print('ML split shapes:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# ## 11. Model skeleton (baseline classifier) — ready to run after processed data provided
# We'll provide a simple pipeline using RandomForest as a baseline and show where to plug in
# XAI (SHAP) and robustness tests.



from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support, confusion_matrix




def baseline_train_eval(X_train, X_test, y_train, y_test):
    """Fit a 200-tree random forest and print its test-set metrics.

    Prints accuracy, ROC AUC (from the second column of ``predict_proba``),
    binary precision / recall / F1, and the confusion matrix.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=0, n_jobs=-1)
    forest.fit(X_train, y_train)
    predicted = forest.predict(X_test)

    # Probability assigned to the second class, when the model exposes probabilities
    positive_proba = None
    if hasattr(forest, 'predict_proba'):
        positive_proba = forest.predict_proba(X_test)[:, 1]

    # All metrics are computed before anything is printed
    accuracy = accuracy_score(y_test, predicted)
    roc_auc = None if positive_proba is None else roc_auc_score(y_test, positive_proba)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_test, predicted, average='binary'
    )

    print('Accuracy:', accuracy)
    if roc_auc is not None:
        print('ROC AUC:', roc_auc)
    print('Precision, Recall, F1:', (precision, recall, f1))
    print('Confusion matrix:\n', confusion_matrix(y_test, predicted))
    return forest


# Train and evaluate the baseline only when the example split was produced above
if ml_split_example is not None:
    clf = baseline_train_eval(X_train, X_test, y_train, y_test)



In [None]:
# ## 12. XAI hooks (SHAP)
# Example code to compute SHAP values for the RandomForest baseline. Run after model training.



try:
    import shap
    has_shap = True
except Exception:
    # Deliberately broad: any failure while importing shap simply disables
    # this optional analysis instead of stopping the notebook.
    has_shap = False
    logging.warning('SHAP not installed; install shap to run explainability analyses')


# globals().get(...) keeps this cell from raising NameError when the ML cells were skipped
if has_shap and 'clf' in globals() and globals().get('ml_split_example') is not None:
    explainer = shap.TreeExplainer(clf)
    shap_vals = explainer.shap_values(X_test)

    # Depending on the installed SHAP version, a classifier yields either a
    # list with one (samples, features) array per class, or a single
    # (samples, features, classes) array. Select the positive class (index 1)
    # in both layouts; a plain 2-D result is used as is.
    if isinstance(shap_vals, list):
        positive_shap = shap_vals[1]
    elif getattr(shap_vals, 'ndim', 2) == 3:
        positive_shap = shap_vals[:, :, 1]
    else:
        positive_shap = shap_vals

    shap.summary_plot(positive_shap, X_test, show=False)
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'shap_summary.png'))
    print('Saved SHAP summary plot to output folder')



In [None]:
# ## 13. Robustness testing placeholder
# Outline: to test robustness to noise, create noisy variants of audio-derived features or
# perturb numeric features with gaussian noise / domain shifts and re-evaluate model.





def evaluate_robustness_numeric(clf, X_test, y_test, noise_snr_db=20, random_state=None):
    """Re-score ``clf`` on a copy of ``X_test`` perturbed with Gaussian noise.

    Noise is generated per feature so that the ratio of signal power
    (mean of the squared values) to noise variance equals the requested SNR:
    ``noise_std = sqrt(mean(x**2) / 10**(snr_db / 10))``.
    Only numeric columns are perturbed; ``X_test`` itself is not modified.

    This is a placeholder: for audio-level noise, perturb the audio files
    before extracting features.

    Args:
        clf: Fitted model exposing ``predict``.
        X_test: Feature table (DataFrame) or array.
        y_test: True labels for ``X_test``.
        noise_snr_db: Target signal-to-noise ratio in decibels.
        random_state: Seed for the noise. ``None`` (the default) draws from
            NumPy's global random state, as before.

    Returns:
        The accuracy on the noisy features (also printed).
    """
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # Convert SNR dB to a linear power ratio
    snr_lin = 10 ** (noise_snr_db / 10.0)

    Xn = X_test.copy()
    if isinstance(Xn, pd.DataFrame):
        num_cols = Xn.select_dtypes(include=[np.number]).columns.tolist()
        signal = Xn[num_cols].to_numpy(dtype=float)
    else:
        num_cols = None
        signal = np.asarray(Xn, dtype=float)

    # SNR = P_signal / P_noise, hence noise variance = P_signal / SNR.
    signal_power = np.nanmean(signal ** 2, axis=0)
    noise_std = np.sqrt(signal_power / snr_lin)
    noisy = signal + rng.normal(0.0, noise_std, size=signal.shape)

    if num_cols is None:
        Xn = noisy
    elif num_cols:
        Xn[num_cols] = noisy

    y_pred = clf.predict(Xn)
    acc = accuracy_score(y_test, y_pred)
    print(f'Robustness test at SNR={noise_snr_db} dB: Accuracy = {acc:.4f}')
    return acc


# Run the noise test only when both the example split and a trained model exist
if 'clf' in globals() and ml_split_example is not None:
    evaluate_robustness_numeric(clf, X_test, y_test, noise_snr_db=10)



In [None]:
# ## 14. Saving outputs & reproducibility
# Save participant_master and any processed outputs you produce for traceability.



# Write the first 200 rows of the master table (without the index) for traceability
if participant_master is not None:
    out_fp = os.path.join(OUTPUT_DIR, 'participant_master_sample.csv')
    master_sample = participant_master.head(200)
    master_sample.to_csv(out_fp, index=False)
    print('Saved participant_master sample to', out_fp)

