In [None]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.stats import skew, spearmanr
import shap

from sklearn.model_selection import train_test_split, KFold, LeavePGroupsOut, LeaveOneGroupOut, LeaveOneOut
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from tabpfn import TabPFNRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

# Single seed shared by the notebook: later cells pass it as `random_state` to the CV splitters.
seed = 69
# Seed NumPy's global RNG and PyTorch's default generator with the same value.
np.random.seed(seed)
torch.manual_seed(seed)


In [None]:
# Load the imputed feature table and show it together with its column names.
data = pd.read_csv('all-features-imputed-v2.csv')
display(data, data.columns.to_list())


def _columns_like(token):
    """Return the labels of every column of `data` whose name contains `token`."""
    return data.filter(like=token).columns.tolist()


# One list of column names per sensing modality, matched by substring.
acceleration = _columns_like('acceleration')
motion = _columns_like('motion')
position = _columns_like('position')
sleep = _columns_like('sleep')
step = _columns_like('step')

# Heart-rate columns, minus any whose name also contains 'sleep'
# (the negative look-ahead rejects labels with 'sleep' anywhere in them).
heartrate = (
    data
    .filter(like='heartrate')
    .filter(regex='^(?!.*sleep)')
    .columns
    .tolist()
)

# Demographic covariates are listed explicitly rather than pattern-matched.
demographics = ['sex', 'age']

# Show each modality's column list, in the same order as before.
display(acceleration, heartrate, motion, position, sleep, step)




In [None]:
# All sensor-derived feature names, concatenated in a fixed modality order.
modalities = [*acceleration, *heartrate, *motion, *position, *sleep, *step]

# Feature frame, regression target and grouping column taken from `data`.
sensor = data[modalities]
sis, participant = data['sis'], data['participant']

# NumPy copies used by the modelling cells: features, target, participant ids.
x, y, p = (np.array(frame) for frame in (sensor, sis, participant))

display(x.shape, y.shape, p.shape)

In [None]:
# SHAP-based feature ranking followed by feature selection.
#
# A CatBoost regressor is fitted in each of 5 shuffled folds over all sensor
# features; the SHAP values of every test fold are stacked and each feature is
# ranked by its mean absolute SHAP value. The global `x` is then replaced by the
# top-ranked sensor features plus the demographic columns, which is what the
# evaluation cells below train on.
#
# NOTE(review): the ranking uses every sample, including the ones the later
# cross-validation cells hold out, so their scores are computed on features that
# were selected with knowledge of the test targets. Nesting the selection inside
# each evaluation fold would remove that optimism.

Y_TRUES = np.empty([0])
Y_PREDS = np.empty([0])
SHAP = []
X_TEST = []

# Build the ranking matrix from `data` instead of reading the global `x`:
# this cell overwrites `x` at the end, so a second run used to rank the already
# reduced matrix, which no longer lines up with `modalities`.
x_all = np.array(data[modalities])

cv = KFold(n_splits=5, shuffle=True, random_state=seed)

for fold, (train_idx, test_idx) in enumerate(cv.split(x_all), start=1):

    print(f"Fold {fold}: train={len(train_idx)} test={len(test_idx)}")

    x_train, x_test = x_all[train_idx], x_all[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Both scalers are fitted on the training fold only, then applied to the test fold.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    normalizer = MinMaxScaler()
    x_train = normalizer.fit_transform(x_train)
    x_test = normalizer.transform(x_test)

    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.1,
        depth=3,
        loss_function='RMSE',
        verbose=False
    )

    # NOTE(review): the eval set is the training fold itself, so early stopping
    # and best-model selection are driven by training RMSE, not held-out error.
    model.fit(x_train, y_train, eval_set=(x_train, y_train), use_best_model=True, early_stopping_rounds=100)
    y_preds = model.predict(x_test)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(x_test)

    SHAP.append(shap_values)
    X_TEST.append(x_test)

    Y_TRUES = np.append(Y_TRUES, y_test)
    Y_PREDS = np.append(Y_PREDS, y_preds)


# Stack the per-fold SHAP matrices once; used for both the ranking and the plot.
shap_all = np.vstack(SHAP)

shap_df = pd.DataFrame({
    'feature': modalities,
    'mean_abs_shap': np.abs(shap_all).mean(axis=0).round(4)
}).sort_values(by='mean_abs_shap', ascending=False)
# display(shap_df)
shap.summary_plot(shap_all, pd.DataFrame(np.vstack(X_TEST), columns=modalities), max_display=24)


# Select the first num_features features based on SHAP importance + demographics.
# The slice used to hard-code 16 while `num_features` was set to 64 and never read;
# it is kept at 16 so the selected feature set is unchanged.
num_features = 16
x = np.array(data[
    shap_df['feature'].iloc[:num_features].to_list() + demographics
    ])

In [None]:
# 10-FOLD
# Shuffled 10-fold cross-validation over samples. Predictions from all folds are
# pooled, then scored and plotted once.

Y_TRUES = np.empty([0])
Y_PREDS = np.empty([0])

cv = KFold(n_splits=10, shuffle=True, random_state=seed)

for fold, (train_idx, test_idx) in enumerate(cv.split(x), start=1):

    print(f"Fold {fold}: train={len(train_idx)} test={len(test_idx)}")

    y_train, y_test = y[train_idx], y[test_idx]

    # Standardise, then min-max scale; both are fitted on the training fold only.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x[train_idx])
    x_test = scaler.transform(x[test_idx])

    normalizer = MinMaxScaler()
    x_train = normalizer.fit_transform(x_train)
    x_test = normalizer.transform(x_test)

    # ----- CatBoost
    model = CatBoostRegressor(
        iterations=1000, learning_rate=.1, depth=3,
        loss_function='RMSE', verbose=False,
    )

    # The eval set is the training fold itself, so early stopping and
    # best-model selection are monitored on training data here.
    model.fit(
        x_train, y_train,
        eval_set=(x_train, y_train),
        use_best_model=True,
        early_stopping_rounds=100,
    )

    y_preds = model.predict(x_test)

    Y_TRUES = np.append(Y_TRUES, y_test)
    Y_PREDS = np.append(Y_PREDS, y_preds)

# Order the pooled (truth, prediction) pairs by ground truth for the plot;
# the pairing is preserved, so the metrics below are unaffected.
indx = Y_TRUES.argsort()
Y_TRUES, Y_PREDS = Y_TRUES[indx], Y_PREDS[indx]

mae = mean_absolute_error(Y_TRUES, Y_PREDS)
mse = mean_squared_error(Y_TRUES, Y_PREDS)
r2 = r2_score(Y_TRUES, Y_PREDS)
corr, _ = spearmanr(Y_TRUES, Y_PREDS)

# One-row summary table, formatted to four decimals.
results = pd.DataFrame([{
    'mae': f"{mae:.4f}",
    'mse': f"{mse:.4f}",
    'r2_score': f"{r2:.4f}",
    'correlation': f"{corr:.4f}"
}])
display(results.style.hide(axis='index'))

sns.set(style="whitegrid", font_scale=1.2)

# Ground truth and prediction share the same x position (rank by ground truth).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=range(len(Y_TRUES)), y=Y_TRUES, label='Ground-truth', color='blue', alpha=0.4, s=60, ax=ax)
sns.scatterplot(x=range(len(Y_PREDS)), y=Y_PREDS, label='Prediction', color='red', alpha=0.4, s=60, ax=ax)

ax.set_xlabel('Sample Index')
ax.set_ylabel('Target Value')
ax.set_title(f"MAE = {mae:.4f}, MSE = {mse:.4f}, R² = {r2:.4f}, Correlation = {corr:.4f}")
ax.legend(loc='best')
fig.tight_layout()
plt.show()

In [None]:
# LOPO
# Leave-one-participant-out evaluation: each fold holds out every sample of one
# participant, trains on the rest, and the pooled predictions are scored once.
#
# Early stopping is monitored on an inner validation split of the TRAINING fold.
# It used to be monitored on the held-out participant itself, which let
# `use_best_model` pick the tree count that minimised the very test error being
# reported (test-set leakage, optimistic scores).

Y_TRUES = np.empty([0])
Y_PREDS = np.empty([0])

cv = LeaveOneGroupOut()

for fold, (train_idx, test_idx) in enumerate(cv.split(x, y, groups=p), start=1):

    # Deliberately not named `participant`: that would overwrite the participant
    # column of the same name defined earlier in the notebook.
    held_out = np.unique(p[test_idx])[0]

    print(f"Participant {held_out}: train={len(train_idx)} test={len(test_idx)}")

    # Inner split by participant, so validation samples come from people the
    # model is not fitted on -- the same situation as the outer test fold.
    train_groups = np.unique(p[train_idx])
    if len(train_groups) >= 2:
        fit_groups, val_groups = train_test_split(train_groups, test_size=0.2, random_state=seed)
        is_val = np.isin(p[train_idx], val_groups)
        fit_idx, val_idx = train_idx[~is_val], train_idx[is_val]
    else:
        # A single training participant cannot be split; train without early stopping.
        fit_idx, val_idx = train_idx, train_idx[:0]

    x_train, x_test = x[fit_idx], x[test_idx]
    y_train, y_test = y[fit_idx], y[test_idx]

    # Scalers are fitted on the fitting portion only, then applied to validation and test.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    normalizer = MinMaxScaler()
    x_train = normalizer.fit_transform(x_train)
    x_test = normalizer.transform(x_test)

    # ----- CatBoost
    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=.1,
        depth=3,
        loss_function='RMSE',
        verbose=False
    )
    if len(val_idx) > 0:
        x_val = normalizer.transform(scaler.transform(x[val_idx]))
        model.fit(x_train, y_train,
                  eval_set=(x_val, y[val_idx]),
                  use_best_model=True, early_stopping_rounds=100)
    else:
        model.fit(x_train, y_train)
    y_preds = model.predict(x_test)

    Y_TRUES = np.append(Y_TRUES, y_test)
    Y_PREDS = np.append(Y_PREDS, y_preds)

# Order the pooled (truth, prediction) pairs by ground truth for the plot;
# the pairing is preserved, so the metrics below are unaffected.
indx = Y_TRUES.argsort()
Y_TRUES = Y_TRUES[indx]
Y_PREDS = Y_PREDS[indx]

mae = mean_absolute_error(Y_TRUES, Y_PREDS)
mse = mean_squared_error(Y_TRUES, Y_PREDS)
r2 = r2_score(Y_TRUES, Y_PREDS)
# The p-value gets a real name: assigning to `_` interferes with IPython's
# last-output variable.
corr, _pvalue = spearmanr(Y_TRUES, Y_PREDS)

# One-row summary table, formatted to four decimals.
results = {
    'mae': f"{mae:.4f}",
    'mse': f"{mse:.4f}",
    'r2_score': f"{r2:.4f}",
    'correlation': f"{corr:.4f}"
}

results = pd.DataFrame([results])
display(results.style.hide(axis='index'))

sns.set(style="whitegrid", font_scale=1.2)

# Ground truth and prediction share the same x position (rank by ground truth).
plt.figure(figsize=(10, 6))
sns.scatterplot(x=range(len(Y_TRUES)), y=Y_TRUES, label='Ground-truth', color='blue', alpha=0.4, s=60)
sns.scatterplot(x=range(len(Y_PREDS)), y=Y_PREDS, label='Prediction', color='red', alpha=0.4, s=60)

plt.xlabel('Sample Index')
plt.ylabel('Target Value')
plt.title(f"MAE = {mae:.4f}, MSE = {mse:.4f}, R² = {r2:.4f}, Correlation = {corr:.4f}")
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
# LOSO
# Leave-one-sample-out evaluation: each fold holds out a single row, trains on
# all the others, and the pooled predictions are scored once.
#
# Early stopping is monitored on an inner validation split of the TRAINING fold.
# It used to be monitored on the single held-out sample, which let
# `use_best_model` pick the tree count that best fitted the one target value
# being predicted (test-set leakage, strongly optimistic scores).

Y_TRUES = np.empty([0])
Y_PREDS = np.empty([0])

cv = LeaveOneOut()

for fold, (train_idx, test_idx) in enumerate(cv.split(x), start=1):

    print(f"Sample {test_idx[0]}: train={len(train_idx)} test={len(test_idx)}")

    # Carve the early-stopping validation rows out of the training fold only.
    fit_idx, val_idx = train_test_split(train_idx, test_size=0.2, random_state=seed)

    x_train, x_val, x_test = x[fit_idx], x[val_idx], x[test_idx]
    y_train, y_val, y_test = y[fit_idx], y[val_idx], y[test_idx]

    # Scalers are fitted on the fitting portion only, then applied to validation and test.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_val = scaler.transform(x_val)
    x_test = scaler.transform(x_test)

    normalizer = MinMaxScaler()
    x_train = normalizer.fit_transform(x_train)
    x_val = normalizer.transform(x_val)
    x_test = normalizer.transform(x_test)

    # ----- CatBoost
    # NOTE(review): depth=4 here, while the other evaluation cells use depth=3 --
    # confirm the difference is intentional before comparing their scores.
    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=.1,
        depth=4,
        loss_function='RMSE',
        verbose=False
    )
    model.fit(x_train, y_train,
              eval_set=(x_val, y_val),
              use_best_model=True, early_stopping_rounds=100)
    y_preds = model.predict(x_test)

    Y_TRUES = np.append(Y_TRUES, y_test)
    Y_PREDS = np.append(Y_PREDS, y_preds)

# Order the pooled (truth, prediction) pairs by ground truth for the plot;
# the pairing is preserved, so the metrics below are unaffected.
indx = Y_TRUES.argsort()
Y_TRUES = Y_TRUES[indx]
Y_PREDS = Y_PREDS[indx]

mae = mean_absolute_error(Y_TRUES, Y_PREDS)
mse = mean_squared_error(Y_TRUES, Y_PREDS)
r2 = r2_score(Y_TRUES, Y_PREDS)
# The p-value gets a real name: assigning to `_` interferes with IPython's
# last-output variable.
corr, _pvalue = spearmanr(Y_TRUES, Y_PREDS)

# One-row summary table, formatted to four decimals.
results = {
    'mae': f"{mae:.4f}",
    'mse': f"{mse:.4f}",
    'r2_score': f"{r2:.4f}",
    'correlation': f"{corr:.4f}"
}

results = pd.DataFrame([results])
display(results.style.hide(axis='index'))

sns.set(style="whitegrid", font_scale=1.2)

# Ground truth and prediction share the same x position (rank by ground truth).
plt.figure(figsize=(10, 6))
sns.scatterplot(x=range(len(Y_TRUES)), y=Y_TRUES, label='Ground-truth', color='blue', alpha=0.4, s=60)
sns.scatterplot(x=range(len(Y_PREDS)), y=Y_PREDS, label='Prediction', color='red', alpha=0.4, s=60)

plt.xlabel('Sample Index')
plt.ylabel('Target Value')
plt.title(f"MAE = {mae:.4f}, MSE = {mse:.4f}, R² = {r2:.4f}, Correlation = {corr:.4f}")
plt.legend(loc='best')
plt.tight_layout()
plt.show()
