In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import matplotlib as mpl
import mne
import pathlib
import pytorch_lightning as pl
import torch
import torcheeg
import xgboost
import wandb

### Interface with data

In [2]:
# Root of the dataset, given relative to the notebook's working directory.
# Every assert carries the resolved path so a failure says which directory is missing.
data_dir = pathlib.Path("../../data/bap/")
assert data_dir.is_dir(), f"data directory not found: {data_dir.resolve()}"

# Preprocessed recordings of the healthy control group.
data_dir_healthy = data_dir / "healthy_controls/preprocessed"
assert data_dir_healthy.is_dir(), f"healthy-controls directory not found: {data_dir_healthy.resolve()}"

# Top-level directory of the chronic pain patients.
data_dir_chronic_pain = data_dir / "chronic_pain_patients"
assert data_dir_chronic_pain.is_dir(), f"chronic-pain directory not found: {data_dir_chronic_pain.resolve()}"

AssertionError: 

In [None]:
# Collect every BrainVision header file below the data root.
# rglob yields paths in filesystem-dependent order; sorting makes the file order (and
# therefore the seeded subject shuffle performed later in this notebook) reproducible.
eeg_files_all = sorted(data_dir.rglob("*.vhdr"))

# A recording counts as preprocessed when "preprocessed" occurs anywhere in its path.
eeg_files_raw = [f for f in eeg_files_all if "preprocessed" not in str(f)]
eeg_files_preprocessed = [f for f in eeg_files_all if "preprocessed" in str(f)]

# Per-group views: healthy controls come from their dedicated directory, chronic pain
# patients are the preprocessed files whose path mentions the patient directory.
eeg_files_healthy = sorted(data_dir_healthy.rglob("*.vhdr"))
eeg_files_chronic_pain = [f for f in eeg_files_preprocessed if "chronic_pain_patients" in str(f)]


### Analyze the subject data

In [None]:
def get_subj_id(eeg_file: pathlib.Path) -> str:
    """Return the subject ID encoded in an EEG file name.

    The file stem is split on underscores and the third field from the end is
    returned unchanged.

    Raises:
        IndexError: if the stem has fewer than three underscore-separated fields.
    """
    return eeg_file.stem.split("_")[-3]


def get_age(df_subj, eeg_file: pathlib.Path):
    """Look up the age of the subject that an EEG file belongs to.

    Args:
        df_subj: table with a "Subject ID" column (compared by equality against the
            ID string taken from the file name) and an "Age(years)" column.
        eeg_file: path of the recording; only its stem is inspected.

    Returns:
        The "Age(years)" value of the first row whose "Subject ID" matches.

    Raises:
        IndexError: if no row matches. The exception type is the one the plain
            ``.values[0]`` lookup raised; the message now names subject and file.
    """
    subj_id = get_subj_id(eeg_file)
    matching_ages = df_subj.loc[df_subj["Subject ID"] == subj_id, "Age(years)"]
    if matching_ages.empty:
        raise IndexError(f"no clinical record for subject {subj_id!r} (file: {eeg_file})")
    return matching_ages.values[0]

#### Load clinical data

In [None]:
# Spreadsheet with the clinical records of all subjects (one sheet per group).
f_subj = data_dir / "clinical_data_updated_2020-08-04.ods"
assert f_subj.is_file(), f"clinical data file not found: {f_subj.resolve()}"

In [None]:
def zfill_ints(x):
    """Return a subject ID as a string, left-padded with zeros to three characters.

    Integral floats (e.g. 12.0, which is how pandas stores an integer column that
    contains blanks) are converted to int first, so they become "012" rather than
    "12.0". Every other value is passed through ``str`` unchanged before padding.
    """
    if isinstance(x, float) and x.is_integer():
        x = int(x)
    return str(x).zfill(3)


# One sheet per group. The first row of the chronic-pain sheet is skipped
# (presumably a title row above the header - TODO confirm against the file).
df_subj_chronic_pain = pd.read_excel(f_subj, sheet_name=0, engine="odf", skiprows=1)
df_subj_healthy = pd.read_excel(f_subj, sheet_name=1, engine="odf", skiprows=0)

# The age column is spelled "Age(years)" and "Age (years)"; after stacking the two
# sheets, merge both spellings into "Age(years)".
df_subj = pd.concat([df_subj_chronic_pain, df_subj_healthy])
df_subj["Age(years)"] = df_subj["Age(years)"].fillna(df_subj["Age (years)"])

print(f"# recorded subjects:      {len(df_subj)}")
print(f"# raw eeg files:          {len(eeg_files_raw)}")
print(f"# preprocessed eeg files: {len(eeg_files_preprocessed)}")

# Normalise the IDs in all three frames so they compare equal to the zero-padded
# ID strings taken from the EEG file names.
df_subj_chronic_pain["Subject ID"] = df_subj_chronic_pain["Subject ID"].map(zfill_ints)
df_subj_healthy["Subject ID"] = df_subj_healthy["Subject ID"].map(zfill_ints)
df_subj["Subject ID"] = df_subj["Subject ID"].map(zfill_ints)

df_subj

#### Check the metadata and set channel types

In [None]:
# Open the first chronic-pain recording as a reference for the channel metadata;
# the bare expression on the last line renders its summary.
example_raw = mne.io.read_raw_brainvision(vhdr_fname=eeg_files_chronic_pain[0])
example_raw

In [None]:
# Every channel of the reference recording is EEG, except the two channels
# "LE" and "RE", which are marked as miscellaneous.
channel_to_channel_type = dict.fromkeys(example_raw.ch_names, "eeg")
channel_to_channel_type["LE"] = "misc"
channel_to_channel_type["RE"] = "misc"

# Names of the channels other than "RE"/"LE", in recording order.
eeg_chs = [ch for ch in example_raw.ch_names if ch not in ("RE", "LE")]

example_raw.set_channel_types(channel_to_channel_type)


In [None]:
# Attach the electrode positions of MNE's built-in "standard_1020" montage
# to the reference recording.
montage = mne.channels.make_standard_montage(kind="standard_1020")
example_raw = example_raw.set_montage(montage=montage)

In [None]:
# Print every listed preprocessed header that is not a regular file; no output means all exist.
for f in eeg_files_preprocessed:
    if f.is_file():
        continue
    print(f)

### Extract PSD dataset

In [None]:
# Per recording: one array of log-PSDs (first axis = epochs) and a list that repeats
# the subject's age once per epoch, so both lists stay aligned entry by entry.
ages = []
psds = []

for f in eeg_files_preprocessed:

    age = get_age(df_subj, f)

    raw = mne.io.read_raw_brainvision(f, verbose=False, preload=True)
    raw = raw.set_channel_types(channel_to_channel_type, verbose=False)
    # Discard the first and the last 30 s of the recording.
    raw = raw.crop(raw.tmin + 30, raw.tmax - 30)

    # 50 Hz notch, then 1-100 Hz band-pass.
    raw = raw.notch_filter(freqs=50, notch_widths=0.5)
    raw = raw.filter(l_freq=1, h_freq=100)

    # Cut the recording into 10 s epochs and estimate one Welch PSD per epoch.
    epochs = mne.make_fixed_length_epochs(raw, duration=10, preload=True)
    spectrum = epochs.compute_psd("welch", fmin=1, fmax=100, n_fft=int(5 * 1024))

    # compute_psd returns a spectrum object, not an array: fetch the values through
    # the documented accessor instead of relying on implicit array conversion.
    # (assumes the result is laid out as epochs x channels x frequencies - TODO confirm)
    epochs_psds = np.log(spectrum.get_data())

    psds.append(epochs_psds)
    ages.append(len(epochs_psds) * [age])



In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold, cross_val_score, KFold, train_test_split
from sklearn.pipeline import Pipeline

### Baselines with train-validation split

In [None]:
# Electrode coordinates from the montage, keyed by channel name.
ch_name_to_ch_pos = montage.get_positions()["ch_pos"]

# Stack the coordinates of all channels typed "eeg" into one array (one row per
# channel), following the channel order of the reference recording.
pos = np.stack([
    ch_name_to_ch_pos[name]
    for name in example_raw.ch_names
    if channel_to_channel_type[name] == "eeg"
])

In [None]:
### small sanity check: shuffling retains relationships between labels and features
# Re-seeding with the same value before every shuffle must apply the same permutation,
# whether the shuffled elements are scalars or rows.

permutation1 = np.arange(0, 10)
permutation2 = np.arange(0, 10)
permutation3 = np.array([3*[i] for i in range(10)])  # row i is [i, i, i]

np.random.seed(42)
np.random.shuffle(permutation1)
np.random.seed(42)
np.random.shuffle(permutation2)
np.random.seed(42)
np.random.shuffle(permutation3)

# Enforce the invariant instead of leaving it to visual inspection of the output.
assert np.array_equal(permutation1, permutation2)
assert np.array_equal(permutation3, np.repeat(permutation1[:, None], 3, axis=1))

permutation1[0], permutation2[0], permutation3[0]

In [None]:
# Subject-level split: psds/ages hold one entry per recording, so shuffling and slicing
# these lists keeps all epochs of a recording in the same subset.
assert len(psds) == len(ages), "psds and ages must hold one entry per recording"

# Draw ONE seeded permutation and apply it to both lists. This gives the same ordering
# as re-seeding and shuffling each list separately, but the alignment no longer hinges
# on two independent shuffles staying in sync.
# NOTE: the lists are reordered in place, so re-running this cell shuffles again;
# re-run the extraction cell first to start from the original order.
np.random.seed(42)
subject_order = np.random.permutation(len(psds))
psds[:] = [psds[i] for i in subject_order]
ages[:] = [ages[i] for i in subject_order]

n_train = int(0.7*len(psds))
n_val = int(0.15*len(psds))
# The test set takes the remainder so that no recording is dropped by truncation.
n_test = len(psds) - n_train - n_val

# Concatenate the per-recording arrays along the epoch axis and flatten everything
# after that axis into a single feature vector per epoch.
psds_train = np.concatenate(psds[:n_train])
psds_train = psds_train.reshape(len(psds_train), -1)
ages_train = np.concatenate(ages[:n_train])

psds_val = np.concatenate(psds[n_train:n_train+n_val])
psds_val = psds_val.reshape(len(psds_val), -1)
ages_val = np.concatenate(ages[n_train:n_train+n_val])

psds_test = np.concatenate(psds[n_train+n_val:])
psds_test = psds_test.reshape(len(psds_test), -1)
ages_test = np.concatenate(ages[n_train+n_val:])

print("# subjects")
print(f"train set: {n_train}")
print(f"validation set: {n_val}")
print(f"test set: {n_test}")

In [None]:
# Standardise the features with statistics from the training set only.
scaler = StandardScaler()
X_train = scaler.fit_transform(psds_train)
X_val = scaler.transform(psds_val)

# Fit and score in the SAME feature space. The forest used to be fitted on the raw
# PSDs but scored on the standardised ones, which made both R² values meaningless.
# n_jobs=-1 uses all available cores instead of a hard-coded worker count.
rfg = RandomForestRegressor(n_jobs=-1)
rfg.fit(X_train, ages_train)
r_squared_rfg_train = rfg.score(X_train, ages_train)
r_squared_rfg_val = rfg.score(X_val, ages_val)

print("Random forest regressor")
print(f"train R²:     {r_squared_rfg_train:.3}")
print(f"val R²:     {r_squared_rfg_val:.3}")

In [None]:
# Disabled experiment: keep only the top `percent_feat` percent of features ranked by
# random-forest importance.
# percent_feat = 0.5
# feat_mask = rfg.feature_importances_ > np.percentile(rfg.feature_importances_, 100-percent_feat)
# X_train = X_train[:, feat_mask]
# X_val = X_val[:, feat_mask]
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)

# Fit and score in the SAME (standardised) feature space. The model used to be fitted
# on the raw PSDs but scored on the standardised ones, which made both R² values
# meaningless. `scaler` is the scaler already fitted on the training set.
X_train = scaler.transform(psds_train)
X_val = scaler.transform(psds_val)

gbr = GradientBoostingRegressor(n_estimators=400)
gbr.fit(X_train, ages_train)
r_squared_gbr_train = gbr.score(X_train, ages_train)
r_squared_gbr_val = gbr.score(X_val, ages_val)

print("\nTraining performance (R²) \n")
print(f"random forest regressor:     {r_squared_rfg_train:.3}")
print(f"gradient boosting regressor: {r_squared_gbr_train:.3}")
print("\nValidation performance (R²) \n")
print(f"random forest regressor:     {r_squared_rfg_val:.3}")
print(f"gradient boosting regressor: {r_squared_gbr_val:.3}")

### K-fold cross-validation

In [None]:
# X, y = psds.reshape(len(psds), -1), ages
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.33)

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)

#### Random Forest Regressor (sklearn)

In [None]:
# 10-fold cross-validated R² of a scaler + random-forest pipeline on the training epochs.
# The pipeline reuses the existing `scaler` object; cross_val_score fits a fresh clone
# of the whole pipeline for every fold.
# NOTE(review): KFold without shuffling splits the epoch rows into contiguous blocks,
# so a recording's epochs can straddle a fold boundary - consider GroupKFold with
# per-subject groups.
rfg = RandomForestRegressor(n_jobs=-1)
rfg_pipe = Pipeline([("scaler", scaler), ("rfg", rfg)])

cv = KFold(n_splits=10)

r_squared_rfg_val = cross_val_score(
    estimator=rfg_pipe,
    X=psds_train,
    y=ages_train,
    cv=cv,
    scoring="r2",
    n_jobs=-1,
)
r_squared_rfg_val_mean = r_squared_rfg_val.mean()
r_squared_rfg_val_std = r_squared_rfg_val.std()

print(f"{r_squared_rfg_val_mean:.3} +/- {r_squared_rfg_val_std:.3}")

#### Gradient Boosting Regressor (sklearn)

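Slow to train: scikit-learn's `GradientBoostingRegressor` fits its trees sequentially and has no `n_jobs` option, so a single fit cannot be parallelised; only the cross-validation folds run in parallel.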

In [None]:
# 10-fold cross-validated R² of a scaler + gradient-boosting pipeline on the training
# epochs; the folds are evaluated by up to 32 parallel workers.
gbr = GradientBoostingRegressor()
gbr_pipe = Pipeline([("scaler", scaler), ("gbr", gbr)])

cv = KFold(n_splits=10)

r_squared_gbr_val = cross_val_score(
    estimator=gbr_pipe,
    X=psds_train,
    y=ages_train,
    cv=cv,
    scoring="r2",
    n_jobs=32,
)
r_squared_gbr_val_mean = r_squared_gbr_val.mean()
r_squared_gbr_val_std = r_squared_gbr_val.std()

print(f"{r_squared_gbr_val_mean:.3} +/- {r_squared_gbr_val_std:.3}")

#### Gradient Boosting Regressor (XGBoost)

In [None]:
from xgboost import XGBRegressor

In [None]:
# 10-fold cross-validated R² of a scaler + XGBoost pipeline trained on GPU 0.
# NOTE(review): `tree_method='gpu_hist'` / `gpu_id` are deprecated from XGBoost 2.0 on
# in favour of `device="cuda"` with `tree_method="hist"` - confirm the installed version.
# NOTE(review): n_jobs=-1 runs the folds in parallel processes that all target the
# same GPU - confirm this is intended.
xgbr = XGBRegressor(
    n_estimators=160,
    max_depth=2,
    learning_rate=0.1,
    tree_method='gpu_hist',
    gpu_id=0,
)
xgbr_pipe = Pipeline([("scaler", scaler), ("xgbr", xgbr)])

cv = KFold(n_splits=10)

r_squared_xgbr_val = cross_val_score(
    estimator=xgbr_pipe,
    X=psds_train,
    y=ages_train,
    cv=cv,
    scoring="r2",
    n_jobs=-1,
)
r_squared_xgbr_val_mean = r_squared_xgbr_val.mean()
r_squared_xgbr_val_std = r_squared_xgbr_val.std()

print(f"{r_squared_xgbr_val_mean:.3} +/- {r_squared_xgbr_val_std:.3}")

### XGBoost hyperparameter tuning (on GPU)

In [None]:
# Candidate values for the random search. The "xg_gbr__" prefix routes each parameter
# to the pipeline step named "xg_gbr".
param_distributions = {
    "xg_gbr__n_estimators": [20, 80, 160, 320, 640, 1280],
    "xg_gbr__max_depth": [2, 3, 4, 5, 6],
    "xg_gbr__learning_rate": [1e-3, 1e-2, 1e-1, 0.2],
}

xgb_regr = XGBRegressor(tree_method='gpu_hist', gpu_id=0)
xgb_pipe = Pipeline([("scaler", scaler), ("xg_gbr", xgb_regr)])

# Nested cross-validation: the inner folds select hyperparameters,
# the outer folds score the whole search procedure.
inner_cv = KFold(n_splits=5)
outer_cv = KFold(n_splits=5)

xgb_regr_cv = RandomizedSearchCV(
    estimator=xgb_pipe,
    param_distributions=param_distributions,
    n_iter=100,
    cv=inner_cv,
)
search = xgb_regr_cv.fit(psds_train, ages_train)

# Despite the `_train` suffix, best_score_ is the mean inner-CV score of the best
# parameter setting, not a score on the training data.
r_squared_xgbr_train = xgb_regr_cv.best_score_
# Re-runs the complete search inside every outer fold.
r_squared_xgbr_val = cross_val_score(
    estimator=xgb_regr_cv,
    X=psds_train,
    y=ages_train,
    cv=outer_cv,
    scoring="r2",
)

print(r_squared_xgbr_train, r_squared_xgbr_val.mean(), r_squared_xgbr_val.std())