In [None]:
# ========== 0. Imports & paths =========================================
import pandas as pd
import numpy as np
from pathlib import Path

# modelling
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, balanced_accuracy_score, classification_report
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.pipeline import Pipeline        # pip install imbalanced-learn
from imblearn.over_sampling import ADASYN

# Single seed reused by every stochastic step in the notebook.
RANDOM_STATE = 42

# Input locations; point DATA_DIR elsewhere if the CSVs live in another folder.
DATA_DIR = Path(".")
HR_FILE = DATA_DIR.joinpath("heartrate_15min.csv")
DX_FILE = DATA_DIR.joinpath("Diagnoses_20250404.csv")


In [None]:
# ========== 1. Load tables & attach BaselineDate ======================
# --- diagnoses: baseline date + 3-group label -------------------------
diag = pd.read_csv(DX_FILE, parse_dates=["DCDate.diagnosis_baseline"])
diag = diag.rename(columns={"DCDate.diagnosis_baseline": "BaselineDate"})

# rows without a baseline date cannot be windowed, so they are discarded
diag = diag.loc[
    diag["BaselineDate"].notna(),
    ["PIDN", "BaselineDate", "Diagnosis_baseline_3groups"],
]

# --- heart-rate readings ----------------------------------------------
hr = pd.read_csv(HR_FILE, parse_dates=["Time"])

# --- keep only participants present in BOTH tables --------------------
common_pidn = set(diag["PIDN"]).intersection(set(hr["PIDN"]))
diag = diag.loc[diag["PIDN"].isin(common_pidn)].copy()
hr = hr.loc[hr["PIDN"].isin(common_pidn)].copy()

# every HR row gets its participant's baseline date
hr = hr.merge(diag[["PIDN", "BaselineDate"]], how="left", on="PIDN")
assert hr["BaselineDate"].notna().all()

print("HR rows:", len(hr), "| participants:", hr["PIDN"].nunique())


In [None]:
# ========== 2. Build the 7-day HR window ==============================
def first_n_days(grp: pd.DataFrame, baseline_date, n=7):
    """Return the rows of ``grp`` inside an ``n``-calendar-day window.

    The window opens on the first calendar day that has data on or after
    ``baseline_date``.  When the group has no data on/after baseline, the
    window falls back to the earliest day present in ``grp`` (so the result
    may lie entirely before baseline).

    Parameters
    ----------
    grp : pd.DataFrame
        Rows for one participant; must have a datetime64 ``Time`` column.
    baseline_date : date-like
        ``datetime.date``, ``pd.Timestamp`` or anything ``pd.Timestamp`` accepts.
    n : int, default 7
        Window length in calendar days; the upper bound is exclusive.

    Returns
    -------
    pd.DataFrame
        Subset of ``grp`` (original index and columns).  Empty when ``grp``
        has no valid timestamps.  Rows whose ``Time`` is NaT are never selected.
    """
    # Work on midnight-normalised datetime64 values instead of object-dtype
    # `datetime.date`s: comparisons stay vectorised and NaT compares as False.
    days = grp["Time"]
    if days.dt.tz is not None:
        days = days.dt.tz_localize(None)      # keep the local wall-clock day
    days = days.dt.normalize()

    baseline_day = pd.Timestamp(baseline_date)
    if baseline_day.tzinfo is not None:
        baseline_day = baseline_day.tz_localize(None)
    baseline_day = baseline_day.normalize()

    on_or_after = days[days >= baseline_day]
    start = on_or_after.min() if not on_or_after.empty else days.min()
    if pd.isna(start):                        # empty group or all-NaT timestamps
        return grp.iloc[0:0]

    end = start + pd.Timedelta(days=n)        # exclusive upper bound
    return grp[(days >= start) & (days < end)]

# One candidate window per participant, anchored at that participant's baseline.
candidate_windows = (
    first_n_days(rows, rows["BaselineDate"].iloc[0].date(), n=7)
    for _, rows in hr.groupby("PIDN")
)
# Participants whose window came back empty contribute nothing.
hr7_slices = [window for window in candidate_windows if not window.empty]

hr7 = pd.concat(hr7_slices, ignore_index=True)
print("7-day HR rows:", len(hr7), "| participants:", hr7.PIDN.nunique())


In [None]:
# ---------------- STEP 3 : 7-day HR feature table ----------------------
# (run after Steps 1–2 so `hr7` and `diag` are defined)

import numpy as np
import pandas as pd

def hr_features(df: pd.DataFrame) -> pd.Series:
    """Summarise one participant's heart-rate window as a flat feature vector.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a datetime64 ``Time`` column and a numeric ``Value``
        column.  Row order does not matter; rows with a missing ``Value``
        are ignored.

    Returns
    -------
    pd.Series
        Twelve features, in this order: ``hr_mean``, ``hr_median``,
        ``hr_std`` (population, ddof=0), ``hr_min``, ``hr_max``, ``hr_iqr``,
        ``hr_p10``, ``hr_p90``, ``tachy_prop`` (share of readings > 100),
        ``rmssd`` (root mean square of differences between consecutive
        readings), ``day_mean`` (hours 06–21) and ``night_mean`` (all other
        hours).  A feature is NaN when there is not enough data for it.
    """
    # The successive-difference metric is only meaningful on chronologically
    # ordered readings, so sort first; missing readings would otherwise turn
    # every statistic into NaN.
    ordered = df.sort_values("Time", kind="stable").dropna(subset=["Value"])
    v = ordered["Value"].to_numpy(dtype=float)
    n = v.size

    hours = ordered["Time"].dt.hour.to_numpy()
    is_day = (hours >= 6) & (hours <= 21)      # 06:00–21:59; NaT hours count as night

    def _mean(a: np.ndarray) -> float:
        """Mean that yields NaN (without a warning) on an empty array."""
        return a.mean() if a.size else np.nan

    if n:
        p10, q25, q75, p90 = np.percentile(v, [10, 25, 75, 90])
        median, std = np.median(v), np.std(v, ddof=0)
        lowest, highest = v.min(), v.max()
        tachy = (v > 100).mean()
    else:
        p10 = q25 = q75 = p90 = np.nan
        median = std = lowest = highest = tachy = np.nan

    # needs at least two readings to form one difference
    rmssd = np.sqrt(np.mean(np.diff(v) ** 2)) if n >= 2 else np.nan

    return pd.Series({
        "hr_mean"   : _mean(v),
        "hr_median" : median,
        "hr_std"    : std,
        "hr_min"    : lowest,
        "hr_max"    : highest,
        "hr_iqr"    : q75 - q25,
        "hr_p10"    : p10,
        "hr_p90"    : p90,
        "tachy_prop": tachy,
        "rmssd"     : rmssd,
        "day_mean"  : _mean(v[is_day]),
        "night_mean": _mean(v[~is_day]),
    })

# --- one feature row per participant ----------------------------------
# Selecting just Time/Value before apply keeps the grouping key out of the
# frames handed to hr_features (and avoids the pandas apply warning).
features7 = hr7.groupby("PIDN")[["Time", "Value"]].apply(hr_features)
features7 = features7.reset_index()          # PIDN back to an ordinary column

# guard against accidentally duplicated column labels
features7 = features7.loc[:, ~features7.columns.duplicated()]

# PIDN first, then every numeric feature column
numeric_cols = features7.select_dtypes(include="number").columns.difference(["PIDN"])
features7 = features7[["PIDN", *numeric_cols]]

# add the 3-group diagnosis label
data7 = pd.merge(
    features7,
    diag[["PIDN", "Diagnosis_baseline_3groups"]],
    how="inner",
    on="PIDN",
)

# persist a copy for later sessions
data7.to_csv("hr7day_features.csv", index=False)
print("Saved hr7day_features.csv | shape =", data7.shape)

# preview
data7.head()


In [None]:
# ================================================================
# Binary sanity-check:  Clinically Normal (0)  vs  Abnormal (1)
# ================================================================

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score, classification_report

RANDOM_STATE = 42

# ----- 1.  features and binary target ------------------------------
X = data7.drop(columns=["Diagnosis_baseline_3groups", "PIDN"])
# 0 = Clinically Normal, 1 = any other diagnosis group
y = data7["Diagnosis_baseline_3groups"].ne("Clinically Normal").astype(int)

print("Class balance:", y.value_counts().to_dict())   # sanity-check

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=RANDOM_STATE,
    stratify=y,
)

# ----- 2.  impute -> scale -> gradient boosting (no oversampling) ---
pipe_bin = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("model", HistGradientBoostingClassifier(
        class_weight="balanced",
        random_state=RANDOM_STATE,
    )),
])
pipe_bin.fit(X_train, y_train)

# ----- 3.  held-out evaluation --------------------------------------
y_pred = pipe_bin.predict(X_test)
print("\nTEST balanced-accuracy:",
      round(balanced_accuracy_score(y_test, y_pred), 3))
print(classification_report(y_test, y_pred, target_names=["CN", "Abnormal"]))

In [None]:
# Minute-level step counts. Resolved against DATA_DIR like the other input
# files, so relocating the data folder only requires editing one constant.
ST_FILE = DATA_DIR / "minuteStepsNarrow.csv"

In [None]:
# ------------  S-1  read & filter large steps CSV  --------------------
# Only participants that made it into the HR feature table are needed.
use_pids = set(features7["PIDN"])

# Stream the CSV in chunks and keep just the matching rows of each chunk,
# so the full minute-level table is never held in memory at once.
chunks = [
    kept
    for kept in (
        part[part["PIDN"].isin(use_pids)]
        for part in pd.read_csv(
            ST_FILE,
            usecols=["PIDN", "ActivityMinute", "Steps"],   # other columns are not read
            parse_dates=["ActivityMinute"],
            dtype={"PIDN": "int32", "Steps": "int32"},
            chunksize=5_000_000,
            low_memory=True,
        )
    )
    if not kept.empty
]

# Same column names as the HR table so the windowing helper can be reused.
steps_raw = (
    pd.concat(chunks, ignore_index=True)
      .rename(columns={"ActivityMinute": "Time", "Steps": "Value"})
)
print("Minute-steps rows kept:", len(steps_raw))

In [None]:
# attach BaselineDate to steps rows
# A BaselineDate column left over from a previous run of this cell is dropped
# first: merging a second time would otherwise produce BaselineDate_x /
# BaselineDate_y and the g["BaselineDate"] lookup below would raise KeyError.
steps_raw = (
    steps_raw
    .drop(columns=["BaselineDate"], errors="ignore")
    .merge(diag[["PIDN", "BaselineDate"]], on="PIDN", how="left")
)
assert steps_raw["BaselineDate"].notna().all()

# same 7-day window logic as for heart rate, one slice per participant
step7 = pd.concat(
    [first_n_days(g, g["BaselineDate"].iloc[0].date(), n=7)
     for _, g in steps_raw.groupby("PIDN") if not g.empty],
    ignore_index=True
)
print("Minute-steps rows in 7-day window:", len(step7))

In [None]:
# ------------  S-3  resample 15-min & build step features  -------------
def build_step_features(grp: pd.DataFrame) -> pd.Series:
    """Summarise one participant's step counts on a 15-minute grid.

    Parameters
    ----------
    grp : pd.DataFrame
        Must contain a datetime64 ``Time`` column and a numeric ``Value``
        column (step counts per record).

    Returns
    -------
    pd.Series
        Eleven features, in this order: ``steps_mean``, ``steps_std``
        (population, ddof=0), ``steps_max``, ``steps_iqr``, ``steps_p10``,
        ``steps_p90``, ``sedentary_prop`` (share of bins with 0 steps),
        ``moderate_bouts`` (bins with >= 100 steps), ``vigorous_bouts``
        (bins with >= 250 steps), ``day_steps_mean`` (hours 06–21) and
        ``night_steps_mean`` (all other hours).  All statistics are taken
        over observed 15-minute bins only; every feature is NaN when no bin
        has data.
    """
    ts = (grp.set_index("Time")
             .sort_index()["Value"]
             .resample("15min")
             .sum(min_count=1)       # a bin with no records becomes NaN, not 0 steps
             .ffill(limit=1)         # carry forward across a single missing bin
             .dropna())              # longer gaps are unobserved: leave them out
                                     # instead of letting NaN poison every statistic

    v = ts.to_numpy(dtype=float)
    has_data = v.size > 0

    hrs = ts.index.hour
    day_mask = np.asarray((hrs >= 6) & (hrs <= 21))    # day = 06:00–21:59

    def _mean(a: np.ndarray) -> float:
        """Mean that yields NaN (without a warning) on an empty array."""
        return a.mean() if a.size else np.nan

    if has_data:
        p10, q25, q75, p90 = np.percentile(v, [10, 25, 75, 90])
    else:
        p10 = q25 = q75 = p90 = np.nan

    return pd.Series({
        "steps_mean"      : _mean(v),
        "steps_std"       : np.std(v, ddof=0) if has_data else np.nan,
        "steps_max"       : v.max() if has_data else np.nan,
        "steps_iqr"       : q75 - q25,
        "steps_p10"       : p10,
        "steps_p90"       : p90,
        "sedentary_prop"  : _mean(v == 0),
        "moderate_bouts"  : (v >= 100).sum() if has_data else np.nan,   # 100+ steps / 15 min
        "vigorous_bouts"  : (v >= 250).sum() if has_data else np.nan,   # 250+ steps / 15 min
        "day_steps_mean"  : _mean(v[day_mask]),
        "night_steps_mean": _mean(v[~day_mask]),
    })

# One step-feature row per participant; only Time/Value are passed to the summariser.
step_feat = step7.groupby("PIDN")[["Time", "Value"]].apply(build_step_features)
step_feat = step_feat.reset_index()          # PIDN back to an ordinary column

print("Step-feature table shape:", step_feat.shape)


In [None]:
# An earlier cell rebinds the bare name `Pipeline` to sklearn.pipeline.Pipeline,
# which rejects sampler steps such as ADASYN. Import imblearn's pipeline under
# an unambiguous alias so this cell works on Restart & Run All.
from imblearn.pipeline import Pipeline as ImbPipeline

# merge with existing HR feature table
full_features = (features7
                 .merge(step_feat, on="PIDN", how="left")      # may introduce NaNs
                 .merge(diag[["PIDN", "Diagnosis_baseline_3groups"]], on="PIDN"))

full_features.to_csv("hr_steps_7day_features.csv", index=False)
print("Merged HR+Steps feature table  |  shape =", full_features.shape)

# ---------- ready for modelling -----------------
X = full_features.drop(columns=["Diagnosis_baseline_3groups", "PIDN"])
y = full_features["Diagnosis_baseline_3groups"]

# impute -> scale -> ADASYN oversampling -> gradient boosting.
# The sampler is a pipeline step, so it is fitted inside each CV fold on the
# training part only.
pipe = ImbPipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale" , StandardScaler()),
    ("adasyn", ADASYN(random_state=RANDOM_STATE, sampling_strategy="auto")),
    ("model" , HistGradientBoostingClassifier(random_state=RANDOM_STATE))
])

param_grid = {
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth"    : [None, 3]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

gs = GridSearchCV(
    pipe, param_grid, cv=cv,
    scoring=make_scorer(balanced_accuracy_score),
    n_jobs=-1, verbose=1
)
gs.fit(X, y)

print("\nBest CV balanced-accuracy:", round(gs.best_score_, 3))
print("Best params:", gs.best_params_)


In [None]:
# ---------------------------------------------------------------
# Modelling cell – median-impute ➜ scale ➜ HistGB (no ADASYN)
# ---------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report, make_scorer

RANDOM_STATE = 42

# --- stratified 80/20 hold-out split of the current X / y --------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=RANDOM_STATE,
    stratify=y,
)

# --- impute -> scale -> gradient boosting; imbalance handled by weights -
pipe_no_synth = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("model", HistGradientBoostingClassifier(
        class_weight="balanced",
        random_state=RANDOM_STATE,
    )),
])

# --- small grid over learning rate and tree depth ----------------------
param_grid = {
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [None, 3],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

gs = GridSearchCV(
    estimator=pipe_no_synth,
    param_grid=param_grid,
    scoring=make_scorer(balanced_accuracy_score),
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

print("\nBest CV balanced-accuracy:", round(gs.best_score_, 3))
print("Best parameters:", gs.best_params_)

# ---- score the refitted best estimator on the held-out rows -----------
y_pred = gs.predict(X_test)
print("\nTEST balanced-accuracy:", round(balanced_accuracy_score(y_test, y_pred), 3))
print(classification_report(y_test, y_pred))
