In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Modelling helpers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report,
                             confusion_matrix,
                             balanced_accuracy_score,
                             f1_score)
# Single seed passed as `random_state` to the train/test split, the CV splitter,
# ADASYN and every estimator in this notebook.
RANDOM_STATE = 42

In [2]:
# Input locations, resolved relative to the current working directory.
DATA_DIR = Path("./")
HR_FILE = DATA_DIR.joinpath("heartrate_15min.csv")      # read below with a parsed "Time" column
DX_FILE = DATA_DIR.joinpath("Diagnoses_20250404.csv")   # read below for baseline date + 3-group label

In [3]:
# Columns kept from the diagnosis table; the three-group label is already
# prepared in the file.
KEEP_COLS = ["PIDN", "DCDate.diagnosis_baseline", "Diagnosis_baseline_3groups"]

# Load both tables, parsing their date columns on read.
hr = pd.read_csv(HR_FILE, parse_dates=["Time"])
diag = pd.read_csv(DX_FILE, parse_dates=["DCDate.diagnosis_baseline"]).loc[:, KEEP_COLS]

# Keep only participants that appear in BOTH tables.
labelled_pidns = set(hr["PIDN"]).intersection(diag["PIDN"])
hr_in_both = hr["PIDN"].isin(labelled_pidns)
diag_in_both = diag["PIDN"].isin(labelled_pidns)
hr = hr.loc[hr_in_both].copy()
diag = diag.loc[diag_in_both].copy()

print(f"Labelled participants with HR data = {diag.PIDN.nunique()}")   # 235


Labelled participants with HR data = 235


In [4]:
def baseline_window(hr_grp, baseline_date):
    """
    Return the HR rows of one participant that fall on their baseline day.

    The baseline day is the earliest calendar date with HR data on or after
    ``baseline_date``.  If no data exist on/after that date, or the baseline
    date is missing, the first calendar date with HR data is used instead.

    Parameters
    ----------
    hr_grp : pd.DataFrame
        HR rows for a single participant; must have a datetime ``Time`` column.
    baseline_date : datetime.date, datetime, pd.Timestamp, str, None or NaT
        Baseline diagnosis date.  Anything ``pd.Timestamp`` can parse is
        reduced to its calendar date; ``None``/``NaT`` triggers the fallback.

    Returns
    -------
    pd.DataFrame
        The subset of ``hr_grp`` (original index and columns) whose ``Time``
        falls on the selected date; empty if ``hr_grp`` has no usable times.
    """
    # Calendar date of every sample, computed once and reused below.
    days = hr_grp["Time"].dt.date
    # Rows whose Time is missing cannot be compared with a date; they never match.
    valid_days = days.dropna()
    if valid_days.empty:
        return hr_grp.iloc[0:0]

    on_or_after = valid_days.iloc[0:0]
    if not pd.isna(baseline_date):
        # Normalise to a plain date so Timestamp/datetime/str inputs compare
        # cleanly against the object-dtype Series of dates.
        baseline_day = pd.Timestamp(baseline_date).date()
        on_or_after = valid_days[valid_days >= baseline_day]

    # Fall back to the first recorded day when nothing lies on/after baseline.
    candidates = valid_days if on_or_after.empty else on_or_after
    day0 = candidates.min()
    return hr_grp[days == day0]

# Attach each participant's baseline diagnosis date to their HR rows.
diag = diag.rename(columns={"DCDate.diagnosis_baseline": "BaselineDate"})
hr = (
    hr.drop(columns="BaselineDate", errors="ignore")   # idempotent: a re-run must not yield BaselineDate_x/_y
      .merge(
          diag[["PIDN", "BaselineDate"]],
          on="PIDN",
          how="left",
          validate="many_to_one",   # fail loudly rather than duplicate HR rows on repeated PIDNs
      )
)

# One baseline-day slice per participant; participants with no rows are skipped.
baseline_slices = []
for _pid, grp in hr.groupby("PIDN"):
    rows = baseline_window(grp, grp["BaselineDate"].iloc[0].date())
    if not rows.empty:
        baseline_slices.append(rows)

baseline_hr = pd.concat(baseline_slices, ignore_index=True)
print(baseline_hr.PIDN.nunique())    # should still be 235 (all have at least 1 day)


235


In [7]:
# ---------- 3  FEATURE ENGINEERING (robust version) -----------------
import numpy as np
import pandas as pd

def extract_features(df: pd.DataFrame) -> pd.Series:
    """Compute HR summary statistics for one participant-day.

    Parameters
    ----------
    df : pd.DataFrame
        Rows for a single participant-day with a datetime ``Time`` column and
        a numeric ``Value`` column.  Rows may arrive in any order; rows whose
        ``Value`` is missing are ignored.

    Returns
    -------
    pd.Series
        Float features ``hr_mean``, ``hr_median``, ``hr_std`` (population,
        ddof=0), ``hr_min``, ``hr_max``, ``hr_iqr``, ``hr_p10``, ``hr_p90``,
        ``tachy_prop`` (share of readings > 100), ``rmssd`` (root mean square
        of successive differences between consecutive readings), ``day_mean``
        (hours 06-21 inclusive) and ``night_mean`` (all other hours).
        A feature is NaN when there are too few readings to compute it.
    """
    feature_names = [
        "hr_mean", "hr_median", "hr_std", "hr_min", "hr_max", "hr_iqr",
        "hr_p10", "hr_p90", "tachy_prop", "rmssd", "day_mean", "night_mean",
    ]

    # Successive-difference features depend on row order, so sort by time
    # (stable sort keeps ties in arrival order).  Missing readings are dropped
    # so that a single NaN cannot blank out every statistic.
    samples = (
        df.loc[df["Value"].notna(), ["Time", "Value"]]
          .sort_values("Time", kind="mergesort")
    )
    v = samples["Value"].to_numpy(dtype=float)
    n = v.size
    if n == 0:
        return pd.Series(np.nan, index=feature_names)

    # Day (06-22) vs night split; `between` is inclusive, so hours 6..21.
    is_day = samples["Time"].dt.hour.between(6, 21).to_numpy()
    day_v = v[is_day]
    night_v = v[~is_day]

    def mean_or_nan(arr):
        """Mean of ``arr``, or NaN for an empty array (avoids numpy warnings)."""
        return arr.mean() if arr.size else np.nan

    p10, p25, p75, p90 = np.percentile(v, [10, 25, 75, 90])

    return pd.Series({
        "hr_mean"   : v.mean(),
        "hr_median" : np.median(v),
        "hr_std"    : np.std(v, ddof=0),                              # no DoF warning
        "hr_min"    : v.min(),
        "hr_max"    : v.max(),
        "hr_iqr"    : p75 - p25,
        "hr_p10"    : p10,
        "hr_p90"    : p90,
        "tachy_prop": (v > 100).mean(),                               # proportion > 100 bpm
        "rmssd"     : np.nan if n < 2 else np.sqrt(np.mean(np.diff(v) ** 2)),
        "day_mean"  : mean_or_nan(day_v),
        "night_mean": mean_or_nan(night_v),
    })

# -------------------------------------------------------------------
# Per-participant feature table: one row of summary stats per PIDN.
per_participant = baseline_hr.groupby("PIDN", group_keys=False)
feature_rows = per_participant.apply(
    extract_features,
    include_groups=False,   # silences future pandas warning
).reset_index()

# Join the 3-group diagnosis label onto the features.
labels = diag[["PIDN", "Diagnosis_baseline_3groups"]]
data = feature_rows.merge(labels, on="PIDN")

# ----- Persist the table so later sessions can reuse it -----
OUTFILE = "baseline_hr_features.csv"
data.to_csv(OUTFILE, index=False)
print(f"Saved feature table -> {OUTFILE}   (shape = {data.shape})")

# Show the first rows as the cell output.
data.head()


Saved feature table -> baseline_hr_features.csv   (shape = (235, 14))


Unnamed: 0,PIDN,hr_mean,hr_median,hr_std,hr_min,hr_max,hr_iqr,hr_p10,hr_p90,tachy_prop,rmssd,day_mean,night_mean,Diagnosis_baseline_3groups
0,1416,76.222222,71.0,12.447916,67.0,115.0,4.75,70.0,88.2,0.111111,14.397712,76.588235,70.0,Clinically Normal
1,2502,76.830189,77.0,10.720701,60.0,103.0,18.0,62.2,91.0,0.018868,3.705505,79.488889,61.875,Clinically Normal
2,2692,82.512195,82.0,7.951444,70.0,100.0,10.0,71.0,90.0,0.0,8.362117,83.153846,70.0,Clinically Normal
3,3700,67.479167,66.0,6.154232,58.0,84.0,10.0,61.0,76.0,0.0,4.387782,67.15625,68.125,Clinically Normal
4,3835,72.322917,70.0,15.271498,52.0,110.0,21.0,55.0,97.0,0.072917,8.758755,76.34375,64.28125,Clinically Normal


In [10]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# ----------------- rebuild X / y (drop PIDN) -----------------
# Target is the 3-group diagnosis; features are everything except it and the id.
label_col = "Diagnosis_baseline_3groups"
y = data[label_col]
X = data.drop(columns=["PIDN", label_col])

# Stratified 80/20 hold-out split with the shared seed.
split = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = split

# Diagnostic: number of NaNs per feature in the training part.
print("Missing values per column:\n", X_train.isna().sum())

Missing values per column:
 hr_mean        0
hr_median      0
hr_std         0
hr_min         0
hr_max         0
hr_iqr         0
hr_p10         0
hr_p90         0
tachy_prop     0
rmssd          6
day_mean       7
night_mean    23
dtype: int64


In [11]:
# ----------------- 5-a  Logistic Regression -----------------
# Median-impute -> standardise -> class-weighted logistic regression.
lr_steps = (
    SimpleImputer(strategy="median"),
    StandardScaler(),
    LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        random_state=RANDOM_STATE,
    ),
)
logreg = make_pipeline(*lr_steps)
logreg.fit(X_train, y_train)

# Score on the held-out split.
y_pred_lr = logreg.predict(X_test)
lr_bal_acc = balanced_accuracy_score(y_test, y_pred_lr).round(3)

print("\nLogistic Regression")
print("Balanced accuracy:", lr_bal_acc)
print(classification_report(y_test, y_pred_lr))



Logistic Regression
Balanced accuracy: 0.205
                   precision    recall  f1-score   support

Clinically Normal       0.57      0.43      0.49        30
     FTD syndrome       0.00      0.00      0.00         6
           MCI/AD       0.15      0.18      0.17        11

         accuracy                           0.32        47
        macro avg       0.24      0.21      0.22        47
     weighted avg       0.40      0.32      0.35        47



In [12]:
# ----------------- 5-b  Random Forest -----------------
# Median-impute -> class-weighted random forest (no scaling step).
forest = RandomForestClassifier(
    n_estimators=400,
    class_weight="balanced",
    random_state=RANDOM_STATE,
)
rf = make_pipeline(SimpleImputer(strategy="median"), forest)
rf.fit(X_train, y_train)

# Score on the held-out split.
y_pred_rf = rf.predict(X_test)
rf_bal_acc = balanced_accuracy_score(y_test, y_pred_rf).round(3)

print("\nRandom Forest")
print("Balanced accuracy:", rf_bal_acc)
print(classification_report(y_test, y_pred_rf))


Random Forest
Balanced accuracy: 0.289
                   precision    recall  f1-score   support

Clinically Normal       0.62      0.87      0.72        30
     FTD syndrome       0.00      0.00      0.00         6
           MCI/AD       0.00      0.00      0.00        11

         accuracy                           0.55        47
        macro avg       0.21      0.29      0.24        47
     weighted avg       0.40      0.55      0.46        47



In [14]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, balanced_accuracy_score

# Impute -> scale -> ADASYN oversampling -> gradient boosting, as one pipeline
# so every step is re-fitted inside each CV fold.
pipeline_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("adasyn", ADASYN(random_state=RANDOM_STATE, sampling_strategy="auto")),
    ("model", HistGradientBoostingClassifier(random_state=RANDOM_STATE)),
]
pipe = Pipeline(steps=pipeline_steps)

# 2 x 2 grid over the booster's learning rate and tree depth.
param_grid = {
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [None, 3],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
bal_acc_scorer = make_scorer(balanced_accuracy_score)

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=bal_acc_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X, y)

best_cv_score = gs.best_score_.round(3)
print("Best CV balanced-accuracy:", best_cv_score)
print("Best params:", gs.best_params_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best CV balanced-accuracy: 0.3
Best params: {'model__learning_rate': 0.05, 'model__max_depth': 3}


In [15]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the labels so the ADASYN targets can be keyed by class code.
le = LabelEncoder()
y_int = le.fit_transform(y)          # 0, 1, 2

# Impute -> scale -> ADASYN oversampling -> gradient boosting.
pipeline_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("adasyn", ADASYN(random_state=RANDOM_STATE)),
    ("model", HistGradientBoostingClassifier(random_state=RANDOM_STATE)),
]
pipe = Pipeline(steps=pipeline_steps)

# Explicit per-class resampling targets (keys are the encoded labels); the
# single-element list keeps this setting fixed while the booster is tuned.
adasyn_targets = {0:180, 1:180, 2:180}
param_grid = {
    "adasyn__sampling_strategy": [adasyn_targets],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [None, 3],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
bal_acc_scorer = make_scorer(balanced_accuracy_score)

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=bal_acc_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X, y_int)

best_cv_score = gs.best_score_.round(3)
print("Best CV balanced-accuracy:", best_cv_score)
print("Params:", gs.best_params_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best CV balanced-accuracy: 0.302
Params: {'adasyn__sampling_strategy': {0: 180, 1: 180, 2: 180}, 'model__learning_rate': 0.1, 'model__max_depth': None}
