In [1]:
# ================================================================
# CELL M-1 · Build 14-day HR sequences  (sentinel = -1000) + labels
# ================================================================
import pandas as pd, numpy as np
from pathlib import Path

RANDOM_STATE = 42
DATA_DIR  = Path(".")
HR_FILE   = DATA_DIR / "heartrate_15min.csv"
DX_FILE   = DATA_DIR / "Diagnoses_20250404.csv"

# ---- load diagnosis --------------------------------------------------
diag = (pd.read_csv(DX_FILE, parse_dates=["DCDate.diagnosis_baseline"])
          .rename(columns={"DCDate.diagnosis_baseline": "BaselineDate"})
          .dropna(subset=["BaselineDate"])
          [["PIDN", "BaselineDate", "Diagnosis_baseline_3groups"]])

# ---- load HR ---------------------------------------------------------
hr = pd.read_csv(HR_FILE, parse_dates=["Time"])
hr = hr[hr.PIDN.isin(diag.PIDN)]
hr = hr.merge(diag[["PIDN", "BaselineDate"]], on="PIDN", how="left")

# ---- helper ----------------------------------------------------------
def seq_maskable(grp, n_days=14, sentinel=-1000.0):
    bdate = grp["BaselineDate"].iloc[0].date()
    after = grp[grp.Time.dt.date >= bdate]
    start = after.Time.min() if not after.empty else grp.Time.min()
    full_index = pd.date_range(start=start.floor("D"),
                               periods=96*n_days, freq="15min")

    s = (grp.set_index("Time")
            .reindex(full_index)["Value"]
            .astype(float))

    # global z-score (use overall mean/std across participant’s 14 days)
    mu, sigma = s.mean(), s.std(ddof=0)
    s = (s - mu) / (sigma + 1e-6)

    # leave NaN where bins are missing → fill with sentinel
    return s.fillna(sentinel).to_numpy(dtype=np.float32)

seqs, labels = [], []
for pid, g in hr.groupby("PIDN"):
    seqs.append(seq_maskable(g))
    label = diag.loc[diag.PIDN == pid, "Diagnosis_baseline_3groups"].iloc[0]
    labels.append(1 if label != "Clinically Normal" else 0)

X_seq = np.stack(seqs)          # shape (n_participants, 1344)
y_bin = np.array(labels, dtype=int)

print("Sequences:", X_seq.shape, "| positives:", y_bin.sum())

# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y_bin, test_size=0.20, stratify=y_bin, random_state=RANDOM_STATE
)

# add channel dim for GRU (timesteps, features=1)
X_train = X_train[..., None]
X_test  = X_test [..., None]

# class weights
from sklearn.utils.class_weight import compute_class_weight
cw = compute_class_weight("balanced", classes=np.array([0,1]), y=y_train)
class_weights = {0: cw[0], 1: cw[1]}
print("Class weights:", class_weights)


Sequences: (192, 1344) | positives: 70
Class weights: {0: np.float64(0.788659793814433), 1: np.float64(1.3660714285714286)}


In [2]:
# ================================================================
# CELL M-2 · Masking + Bi-GRU  (early stopping on val BA)
# ================================================================
import tensorflow as tf
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Masking, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

tf.keras.utils.set_random_seed(RANDOM_STATE)

def bal_acc(y_true, y_pred):
    """Balanced accuracy as a Keras metric (threshold 0.5)."""
    y_pred_bin = tf.cast(y_pred > 0.5, tf.float32)
    tp = tf.reduce_sum(tf.cast((y_true == 1) & (y_pred_bin == 1), tf.float32))
    tn = tf.reduce_sum(tf.cast((y_true == 0) & (y_pred_bin == 0), tf.float32))
    p  = tf.reduce_sum(tf.cast(y_true == 1, tf.float32))
    n  = tf.reduce_sum(tf.cast(y_true == 0, tf.float32))
    rec_pos = tp / (p + 1e-6)
    rec_neg = tn / (n + 1e-6)
    return (rec_pos + rec_neg) / 2

model = Sequential([
    Input(shape=(1344, 1)),
    Masking(mask_value=-1000.0),
    Bidirectional(GRU(32, return_sequences=False, dropout=0.2, recurrent_dropout=0.2)),
    Dense(32, activation="relu"),
    Dropout(0.4),
    Dense(1, activation="sigmoid")
])

model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss="binary_crossentropy",
              metrics=[bal_acc])

callback = EarlyStopping(monitor="val_bal_acc",
                         mode="max",
                         patience=12,
                         restore_best_weights=True,
                         verbose=1)

history = model.fit(
    X_train, y_train,
    epochs=150,
    batch_size=32,
    validation_split=0.15,
    class_weight=class_weights,
    callbacks=[callback],
    verbose=1
)

# ---- evaluate on hold-out test set -----------------------------------
probs_gru = model.predict(X_test, verbose=0).ravel()
y_pred = (probs_gru >= 0.5).astype(int)

from sklearn.metrics import balanced_accuracy_score, recall_score, confusion_matrix
print("\nTEST balanced-accuracy:", round(balanced_accuracy_score(y_test, y_pred), 3))
print("Abnormal recall:", round(recall_score(y_test, y_pred), 3))
print(confusion_matrix(y_test, y_pred))


Epoch 1/150
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 526ms/step - bal_acc: 15.0000 - loss: 0.6870 - val_bal_acc: 11.5000 - val_loss: 0.7094
Epoch 2/150
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 405ms/step - bal_acc: 15.0000 - loss: 0.6852 - val_bal_acc: 11.5000 - val_loss: 0.7105
Epoch 3/150
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 394ms/step - bal_acc: 15.0000 - loss: 0.6875 - val_bal_acc: 11.5000 - val_loss: 0.7083
Epoch 4/150
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 409ms/step - bal_acc: 15.0000 - loss: 0.6882 - val_bal_acc: 11.5000 - val_loss: 0.7054
Epoch 5/150
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 382ms/step - bal_acc: 15.0000 - loss: 0.6840 - val_bal_acc: 11.5000 - val_loss: 0.7031
Epoch 6/150
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 430ms/step - bal_acc: 15.0000 - loss: 0.6892 - val_bal_acc: 11.5000 - val_loss: 0.7008
Epoch 7/150
[1m5/5[0m [32m━━━━━