In [1]:
# !pip install xgboost lightgbm imbalanced-learn tensorflow==2.15 --quiet
import numpy as np, pandas as pd, joblib, os, gc, warnings, random, math
from pathlib import Path
warnings.filterwarnings("ignore")

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# paths
DATA_DIR   = Path(".")
HR_FILE    = DATA_DIR / "heartrate_15min.csv"
STEPS_FILE = DATA_DIR / "minuteStepsNarrow.csv"
DX_FILE    = DATA_DIR / "Diagnoses_20250404.csv"


In [2]:
# ---------- load diagnosis ------------------------------------------------
diag = (pd.read_csv(DX_FILE, parse_dates=["DCDate.diagnosis_baseline"])
          .rename(columns={"DCDate.diagnosis_baseline":"BaselineDate"})
          .dropna(subset=["BaselineDate"])
          [["PIDN","BaselineDate","Diagnosis_baseline_3groups"]])

# ---------- helper to slice first N days ----------------------------------
def slice_days(df, base_date, n=14):
    after = df[df.Time.dt.date >= base_date]
    start = after.Time.min() if not after.empty else df.Time.min()
    end   = start + pd.Timedelta(days=n)
    return df[(df.Time >= start) & (df.Time < end)]

# ---------- HEART RATE ----------------------------------------------------
hr = pd.read_csv(HR_FILE, parse_dates=["Time"]).merge(diag, "inner", "PIDN")
hr14 = pd.concat([slice_days(g, g.BaselineDate.iloc[0].date(), 14)
                  for _, g in hr.groupby("PIDN")])

def hr_stats(df):
    v = df.Value.to_numpy()
    h = df.Time.dt.hour
    day, night = h.between(6,21), ~h.between(6,21)
    rmssd = np.sqrt(np.mean(np.diff(v)**2)) if v.size>1 else np.nan
    sdnn  = np.std(v, ddof=0)
    return pd.Series({
        "hr_mean":v.mean(), "hr_std":sdnn, "hr_min":v.min(), "hr_max":v.max(),
        "rmssd":rmssd, "lfhf":rmssd/(sdnn+1e-6),
        "day_mean":df.Value[day].mean(), "night_mean":df.Value[night].mean()
    })
feat_hr = hr14.groupby("PIDN").apply(hr_stats).reset_index()

# ---------- minute steps → daily totals -----------------------------------
steps_raw = (pd.read_csv(STEPS_FILE, parse_dates=["ActivityMinute"])
               .rename(columns={"ActivityMinute":"Time","Steps":"Value"})
               .merge(diag[["PIDN","BaselineDate"]],"inner"))
step14 = pd.concat([slice_days(g,g.BaselineDate.iloc[0].date(),14)
                    for _,g in steps_raw.groupby("PIDN")])

step_daily = (step14.assign(Date=step14.Time.dt.date)
                      .groupby(["PIDN","Date"]).Value.sum()
                      .rename("steps").reset_index())

def st_stats(df):
    v=df.steps.to_numpy()
    trend=v[-1]-v[0] if len(v)>1 else np.nan
    wknd_mask = pd.to_datetime(df.Date).dt.dayofweek>=5
    return pd.Series({
        "steps_mean":v.mean(),"steps_std":v.std(ddof=0),
        "steps_min":v.min(),"steps_max":v.max(),
        "steps_trend":trend,
        "wknd_ratio":v[wknd_mask].mean()/(v[~wknd_mask].mean()+1e-6)
    })
feat_st = step_daily.groupby("PIDN").apply(st_stats).reset_index()

# ---------- HR–steps coupling --------------------------------------------
hr_daily = (hr14.assign(Date=hr14.Time.dt.date)
                 .groupby(["PIDN","Date"]).Value.mean()
                 .rename("hr").reset_index())
coupling = (hr_daily.merge(step_daily,"inner")
                   .groupby("PIDN")
                   .apply(lambda d: pd.Series({"hr_steps_corr":
                         np.corrcoef(d.hr,d.steps)[0,1] if len(d)>2 else np.nan}))
                   .reset_index())

# ---------- merge all features & label -----------------------------------
tab = (feat_hr.merge(feat_st,"left").merge(coupling,"left")
       .merge(diag[["PIDN","Diagnosis_baseline_3groups"]],"inner"))
tab.to_csv("tabular_features.csv",index=False)
print(tab.shape, "tabular rows saved")


(192, 17) tabular rows saved


In [3]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score

X = tab.drop(columns=["Diagnosis_baseline_3groups","PIDN"])
y = (tab.Diagnosis_baseline_3groups!="Clinically Normal").astype(int).to_numpy()

kf = StratifiedKFold(10, shuffle=True, random_state=RANDOM_STATE)
p_tab = np.zeros_like(y, dtype=float)

for train_idx, val_idx in kf.split(X,y):
    dtrain = xgb.DMatrix(X.iloc[train_idx], label=y[train_idx])
    dval   = xgb.DMatrix(X.iloc[val_idx])
    bst = xgb.train(
        params=dict(max_depth=4, learning_rate=0.07,
                    subsample=0.8, colsample_bytree=0.8,
                    objective="binary:logistic", eval_metric="logloss",
                    random_state=RANDOM_STATE),
        dtrain=dtrain, num_boost_round=200, verbose_eval=False)
    p_tab[val_idx] = bst.predict(dval)

print("XGB OOF balanced-acc:",
      balanced_accuracy_score(y, (p_tab>=0.5).astype(int)).round(3))
joblib.dump(bst,"xgb_full.joblib")


XGB OOF balanced-acc: 0.534


['xgb_full.joblib']

In [6]:
# down-sample minute steps to 15-min bins, align with HR index
# reuse hr14 & step14 from CELL 1
def to_channel(df, full_index, sentinel):
    s = df.reindex(full_index)["Value"].astype(float)
    mu, sd = s.mean(), s.std(ddof=0)
    return s.sub(mu).div(sd+1e-6).fillna(sentinel).to_numpy()

sentinel = -1000.0
seqs = []
for pid in tab.PIDN:
    idx = hr14.Time[hr14.PIDN==pid].min().floor("D")
    full_idx = pd.date_range(idx, periods=96*14, freq="15min")
    ch_hr = to_channel(hr14[hr14.PIDN==pid].set_index("Time"), full_idx, sentinel)
    st_15 = (step14[step14.PIDN == pid]
               .set_index("Time").Value
               .resample("15min").sum()
               .to_frame(name="Value"))
    ch_st = to_channel(st_15, full_idx, sentinel)
    seqs.append(np.stack([ch_hr, ch_st], axis=-1))    # shape 1344×2
X_seq = np.stack(seqs)
np.save("seq.npy",X_seq); np.save("labels.npy",y)
print("Seq tensor:",X_seq.shape)


Seq tensor: (192, 1344, 2)


In [8]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight   # NEW
import tensorflow as tf
from tensorflow.keras import layers, Model
tf.keras.utils.set_random_seed(RANDOM_STATE)

def build_seq_model(input_shape, sentinel):
    inp = layers.Input(shape=input_shape)
    x = layers.Masking(mask_value=sentinel)(inp)
    x = layers.Bidirectional(layers.GRU(32, return_sequences=True,
                                        dropout=0.2, recurrent_dropout=0.2))(x)
    x = layers.MultiHeadAttention(2, 16)(x,x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.5)(x)
    out = layers.Dense(1, activation="sigmoid")(x)
    mdl = Model(inp, out)
    mdl.compile(tf.keras.optimizers.Adam(1e-3),
                loss="binary_crossentropy")
    return mdl

p_seq = np.zeros_like(y, dtype=float)
kf = StratifiedKFold(10, shuffle=True, random_state=RANDOM_STATE)

for fold,(tr,vl) in enumerate(kf.split(X_seq,y)):
    model = build_seq_model((1344,2), sentinel)
    cw = compute_class_weight("balanced",classes=np.array([0,1]),y=y[tr])
    model.fit(X_seq[tr], y[tr],
              epochs=50, batch_size=32, verbose=0,
              validation_data=(X_seq[vl], y[vl]),
              class_weight={0:cw[0],1:cw[1]},
              callbacks=[tf.keras.callbacks.EarlyStopping(
                  patience=8, restore_best_weights=True, monitor="val_loss")])
    p_seq[vl] = model.predict(X_seq[vl], verbose=0).ravel()

print("GRU OOF balanced-acc:",
      balanced_accuracy_score(y, (p_seq>=0.5).astype(int)).round(3))
model.save("gru_full.h5")






GRU OOF balanced-acc: 0.682


In [9]:
from sklearn.linear_model import LogisticRegression
meta = LogisticRegression(max_iter=1000, class_weight="balanced")
meta_X = np.column_stack([p_tab, p_seq])
meta.fit(meta_X, y)
joblib.dump(meta,"blender.joblib")

print("Blender OOF balanced-acc:",
      balanced_accuracy_score(y, meta.predict(meta_X)).round(3))


Blender OOF balanced-acc: 0.672


In [12]:
from sklearn.model_selection import train_test_split
import xgboost as xgb, numpy as np, tensorflow as tf, joblib
from sklearn.metrics import balanced_accuracy_score, recall_score, confusion_matrix

# ------------------------------------------------------------
# 0.  train/test split (same random_state as before)
# ------------------------------------------------------------
X_tab_train, X_tab_test, y_train, y_test, \
X_seq_train, X_seq_test = train_test_split(
    tab.drop(columns=["Diagnosis_baseline_3groups","PIDN"]),
    y, X_seq,
    test_size=0.20, stratify=y, random_state=RANDOM_STATE
)

# ------------------------------------------------------------
# 1.  retrain XGBoost on the *training* fold
# ------------------------------------------------------------
params_xgb = dict(
    max_depth=4,
    learning_rate=0.07,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=RANDOM_STATE
)
num_rounds = 200                      # same as Cell 2

dtrain = xgb.DMatrix(X_tab_train, label=y_train)
dtest  = xgb.DMatrix(X_tab_test)
bst_final = xgb.train(params_xgb, dtrain, num_boost_round=num_rounds)
probs_tab = bst_final.predict(dtest)

# ------------------------------------------------------------
# 2.  retrain GRU-Attention on the *training* fold
# ------------------------------------------------------------
gru_final = build_seq_model((1344, 2), sentinel)
cw = compute_class_weight("balanced", classes=np.array([0, 1]), y=y_train)
gru_final.fit(
    X_seq_train, y_train,
    epochs=50, batch_size=32, verbose=0,
    validation_split=0.15,
    class_weight={0: cw[0], 1: cw[1]},
    callbacks=[tf.keras.callbacks.EarlyStopping(
        patience=8, restore_best_weights=True, monitor="val_loss")]
)
probs_seq = gru_final.predict(X_seq_test, verbose=0).ravel()

# ------------------------------------------------------------
# 3.  blend with the logistic meta-model trained in Cell 5
# ------------------------------------------------------------
meta   = joblib.load("blender.joblib")
probs_blend = meta.predict_proba(
    np.column_stack([probs_tab, probs_seq])
)[:, 1]
y_pred = (probs_blend >= 0.5).astype(int)

print("TEST balanced-accuracy:",
      round(balanced_accuracy_score(y_test, y_pred), 3))
print("Abnormal recall:",
      round(recall_score(y_test, y_pred), 3))
print(confusion_matrix(y_test, y_pred))


TEST balanced-accuracy: 0.577
Abnormal recall: 0.714
[[11 14]
 [ 4 10]]


In [13]:
# ================================================================
# Threshold scan for blended model
# ================================================================
import numpy as np, pandas as pd
from sklearn.metrics import balanced_accuracy_score, recall_score, confusion_matrix

scan = np.arange(0.35, 0.61, 0.01)          # cut-off range to explore
rows = []
for cut in scan:
    y_pred = (probs_blend >= cut).astype(int)
    ba   = balanced_accuracy_score(y_test, y_pred)
    rec1 = recall_score(y_test, y_pred)           # abnormal recall
    fp   = ((y_pred == 1) & (y_test == 0)).sum()
    fn   = ((y_pred == 0) & (y_test == 1)).sum()
    rows.append([cut, ba, rec1, fp, fn])

thr_df = pd.DataFrame(rows,
                      columns=["cut", "bal_acc", "abn_recall", "false_pos", "false_neg"])
display(thr_df)

# --- best rows -----------------------------------------------------------
best_row   = thr_df.loc[thr_df["bal_acc"].idxmax()]
rec70_row  = thr_df[thr_df["abn_recall"] >= 0.70].head(1)   # first cut hitting ≥70 % recall

print("\nBest balanced-accuracy :", best_row.to_dict())
if not rec70_row.empty:
    print("≥70 % abnormal recall :", rec70_row.iloc[0].to_dict())
else:
    print("No threshold in scan hits 0.70 recall")


Unnamed: 0,cut,bal_acc,abn_recall,false_pos,false_neg
0,0.35,0.612857,0.785714,14,3
1,0.36,0.612857,0.785714,14,3
2,0.37,0.612857,0.785714,14,3
3,0.38,0.612857,0.785714,14,3
4,0.39,0.612857,0.785714,14,3
5,0.4,0.612857,0.785714,14,3
6,0.41,0.612857,0.785714,14,3
7,0.42,0.612857,0.785714,14,3
8,0.43,0.612857,0.785714,14,3
9,0.44,0.612857,0.785714,14,3



Best balanced-accuracy : {'cut': 0.5600000000000002, 'bal_acc': 0.6171428571428572, 'abn_recall': 0.7142857142857143, 'false_pos': 12.0, 'false_neg': 4.0}
≥70 % abnormal recall : {'cut': 0.35, 'bal_acc': 0.6128571428571429, 'abn_recall': 0.7857142857142857, 'false_pos': 14.0, 'false_neg': 3.0}
