In [1]:
import pandas as pd

# Load the dataset
file_path = "hp18_hr.csv"  # Replace with your actual file path
df = pd.read_csv(file_path)

# Convert the time column to datetime format and remove timezone information
df['time'] = pd.to_datetime(df['time']).dt.strftime('%Y-%m-%d %H:%M:%S')

# Sort the dataframe by time in increasing order
df_sorted = df.sort_values(by='time')

# Display the first few rows of the sorted dataframe
print(df_sorted.head())

                  time data_type  value
0  2024-04-28 00:00:00        hr     67
1  2024-04-28 00:01:00        hr     67
2  2024-04-28 00:02:00        hr     70
3  2024-04-28 00:03:00        hr     66
4  2024-04-28 00:04:00        hr     67


In [2]:
import pandas as pd

# Load the steps dataset
file_path_steps = "hp18_steps.csv"  # Replace with your actual file path
df_steps = pd.read_csv(file_path_steps)

# Convert the time column to datetime format and remove timezone information
df_steps['time'] = pd.to_datetime(df_steps['time']).dt.strftime('%Y-%m-%d %H:%M:%S')

# Sort the dataframe by time in increasing order
df_steps_sorted = df_steps.sort_values(by='time')

# Display the first few rows of the sorted dataframe
print(df_steps_sorted.head())

                  time data_type  value
0  2024-04-28 02:13:00     steps     33
1  2024-04-28 02:14:00     steps     19
2  2024-04-28 04:33:00     steps     18
3  2024-04-28 04:35:00     steps     17
4  2024-04-28 04:36:00     steps     41


In [3]:
# Load the blood pressure dataset
file_path_bp = "blood_pressure_readings_ID18_cleaned.csv"  # Replace with your actual file path
df_bp = pd.read_csv(file_path_bp)

# Select relevant columns
df_bp = df_bp[['datetime_local', 'systolic', 'diastolic']]

# Convert the datetime column to datetime format
df_bp['datetime_local'] = pd.to_datetime(df_bp['datetime_local'])

# Sort the dataframe by datetime in increasing order
df_bp_sorted = df_bp.sort_values(by='datetime_local')

# Add a binary classification column for BP spikes
df_bp_sorted['BP_spike'] = ((df_bp_sorted['systolic'] > 130) | (df_bp_sorted['diastolic'] > 80)).astype(int)

# Count the number of BP spikes and total records
total_records = len(df_bp_sorted)
bp_spike_count = df_bp_sorted['BP_spike'].sum()

# Print summary
print(f"Total records: {total_records}")
print(f"Number of BP spikes: {bp_spike_count}")

# Display the first few rows of the processed dataframe
print(df_bp_sorted.head())

Total records: 136
Number of BP spikes: 126
       datetime_local  systolic  diastolic  BP_spike
0 2024-04-27 14:54:05       152        101         1
1 2024-04-27 20:35:43       149         97         1
2 2024-04-28 04:54:57       140         98         1
4 2024-04-28 06:57:35       143         95         1
3 2024-04-28 06:58:50       142         90         1


In [4]:
# Load the stress data dataset
file_path_stress = "questionnaire_responses_ID18.csv"  # Replace with your actual file path
df_stress = pd.read_csv(file_path_stress)

# Select relevant columns
df_stress = df_stress[['local_created_at', 'stressLevel_value']]

# Convert the time column to datetime format
df_stress['local_created_at'] = pd.to_datetime(df_stress['local_created_at'])

# Sort the dataframe by time in increasing order
df_stress_sorted = df_stress.sort_values(by='local_created_at')

# Display the first few rows of the processed dataframe
print(df_stress_sorted.head())

     local_created_at  stressLevel_value
0 2024-04-27 15:01:00                  3
1 2024-04-28 04:56:00                  2
2 2024-04-28 07:00:00                  3
3 2024-04-28 11:52:00                  4
4 2024-04-28 15:09:00                  3


In [5]:
import pandas as pd
import numpy as np

# 📌 **Step 1: Load Datasets**
file_path_hr = "hp18_hr.csv"
file_path_steps = "hp18_steps.csv"
file_path_bp = "blood_pressure_readings_ID18_cleaned.csv"
file_path_stress = "questionnaire_responses_ID18.csv"

df_hr = pd.read_csv(file_path_hr)
df_steps = pd.read_csv(file_path_steps)
df_bp = pd.read_csv(file_path_bp)
df_stress = pd.read_csv(file_path_stress)

# 📌 **Step 2: Convert timestamps to uniform `datetime64[ns]`**
df_hr['time'] = pd.to_datetime(df_hr['time']).dt.tz_localize(None)
df_steps['time'] = pd.to_datetime(df_steps['time']).dt.tz_localize(None)
df_bp['datetime_local'] = pd.to_datetime(df_bp['datetime_local']).dt.tz_localize(None)
df_stress['local_created_at'] = pd.to_datetime(df_stress['local_created_at']).dt.tz_localize(None)

# 📌 **Step 3: Remove Data from `2024-10-23` for BP & Stress**
df_bp = df_bp[df_bp['datetime_local'].dt.date > pd.to_datetime("2024-04-27").date()]
df_stress = df_stress[df_stress['local_created_at'].dt.date > pd.to_datetime("2024-04-27").date()]

# 📌 **Step 4: Sort datasets by time**
df_hr_sorted = df_hr.sort_values(by='time')
df_steps_sorted = df_steps.sort_values(by='time')
df_bp_sorted = df_bp.sort_values(by='datetime_local')
df_stress_sorted = df_stress.sort_values(by='local_created_at')

# 📌 **Step 5: Add BP spike classification**
# Retain the threshold-based definition for reference:
df_bp_sorted['BP_spike_threshold'] = ((df_bp_sorted['systolic'] > 130) | (df_bp_sorted['diastolic'] > 80)).astype(int)

# New binary definition based on mean values:
mean_systolic = df_bp_sorted['systolic'].mean()
mean_diastolic = df_bp_sorted['diastolic'].mean()
df_bp_sorted['BP_spike_mean'] = ((df_bp_sorted['systolic'] > mean_systolic) | (df_bp_sorted['diastolic'] > mean_diastolic)).astype(int)

# Tertile classification for systolic and diastolic:
systolic_lower = df_bp_sorted['systolic'].quantile(0.333)
systolic_upper = df_bp_sorted['systolic'].quantile(0.667)
diastolic_lower = df_bp_sorted['diastolic'].quantile(0.333)
diastolic_upper = df_bp_sorted['diastolic'].quantile(0.667)

def classify_bp(value, lower, upper):
    if value < lower:
        return "low"
    elif value < upper:
        return "mid"
    else:
        return "high"

df_bp_sorted['systolic_tertile'] = df_bp_sorted['systolic'].apply(lambda x: classify_bp(x, systolic_lower, systolic_upper))
df_bp_sorted['diastolic_tertile'] = df_bp_sorted['diastolic'].apply(lambda x: classify_bp(x, diastolic_lower, diastolic_upper))

# 📌 **Step 6: Merge HR & Steps Data Based on Nearest Timestamp**
df_biosignals = pd.merge_asof(df_hr_sorted, df_steps_sorted, on='time', direction='backward', suffixes=('_hr', '_steps'))

# 📌 **Step 7: Compute Rolling Window Statistics for HR & Steps**
df_biosignals.set_index('time', inplace=True)
time_windows = [5, 10, 30, 60]  # Define time windows (minutes)

for window in time_windows:
    window_str = f"{window}min"
    # Rolling windows here include the current row; use .shift(1) later if strictly previous data is needed.
    df_biosignals[f'hr_mean_{window_str}'] = df_biosignals['value_hr'].rolling(f"{window}min").mean()
    df_biosignals[f'hr_min_{window_str}'] = df_biosignals['value_hr'].rolling(f"{window}min").min()
    df_biosignals[f'hr_max_{window_str}'] = df_biosignals['value_hr'].rolling(f"{window}min").max()
    df_biosignals[f'hr_std_{window_str}'] = df_biosignals['value_hr'].rolling(f"{window}min").std()

    df_biosignals[f'steps_total_{window_str}'] = df_biosignals['value_steps'].rolling(f"{window}min").sum()
    df_biosignals[f'steps_mean_{window_str}'] = df_biosignals['value_steps'].rolling(f"{window}min").mean()
    df_biosignals[f'steps_min_{window_str}'] = df_biosignals['value_steps'].rolling(f"{window}min").min()
    df_biosignals[f'steps_max_{window_str}'] = df_biosignals['value_steps'].rolling(f"{window}min").max()
    df_biosignals[f'steps_std_{window_str}'] = df_biosignals['value_steps'].rolling(f"{window}min").std()
    df_biosignals[f'steps_diff_{window_str}'] = df_biosignals[f'steps_max_{window_str}'] - df_biosignals[f'steps_min_{window_str}']

# Reset index after rolling computation
df_biosignals.reset_index(inplace=True)

# 📌 **Step 8: Merge BP Data with HR & Steps Features**
df_merged = pd.merge_asof(df_bp_sorted, df_biosignals, left_on='datetime_local', right_on='time', direction='backward')

# 📌 **Step 9: Incorporate Stress Data (±15 minutes window)**
def extract_stress_features(bp_time, df_stress):
    start_time = bp_time - pd.Timedelta(minutes=15)
    end_time = bp_time + pd.Timedelta(minutes=15)
    stress_values = df_stress[(df_stress['local_created_at'] >= start_time) & (df_stress['local_created_at'] <= end_time)]['stressLevel_value']
    return pd.Series({
        'stress_mean': stress_values.mean(),
        'stress_min': stress_values.min(),
        'stress_max': stress_values.max(),
        'stress_std': stress_values.std()
    })

df_stress_features = df_bp_sorted['datetime_local'].apply(lambda x: extract_stress_features(x, df_stress_sorted))
df_merged = pd.concat([df_merged, df_stress_features], axis=1)

# 📌 **Step 10: Create Additional Engineered Features**

# ✅ Lagged Features: only using previous data (based on BP_spike_mean and other past features)
lag_features = ['stress_mean', 'BP_spike_mean', 'hr_mean_5min', 'steps_total_10min']
for feature in lag_features:
    for lag in [1, 3, 5]:
        df_merged[f'{feature}_lag_{lag}'] = df_merged[feature].shift(lag)

# ✅ Feature Interactions
df_merged['hr_steps_ratio'] = df_merged['hr_mean_5min'] / (df_merged['steps_total_10min'] + 1)
df_merged['stress_weighted_hr'] = df_merged['hr_mean_5min'] * df_merged['stress_mean']
df_merged['stress_steps_ratio'] = df_merged['stress_mean'] / (df_merged['steps_total_10min'] + 1)
df_merged['steps_hr_variability_ratio'] = df_merged['steps_std_10min'] / (df_merged['hr_std_10min'] + 1e-5)

# ✅ Rolling Aggregations
df_merged['hr_mean_rolling_3'] = df_merged['hr_mean_5min'].rolling(3).mean()
df_merged['steps_total_rolling_5'] = df_merged['steps_total_10min'].rolling(5).mean()
df_merged['hr_std_rolling_3'] = df_merged['hr_std_10min'].rolling(3).std()
df_merged['cumulative_stress_30min'] = df_merged['stress_mean'].rolling(3).sum()
df_merged['cumulative_steps_30min'] = df_merged['steps_total_10min'].rolling(3).sum()

# ✅ Contextual Features
df_merged['hour_of_day'] = df_merged['datetime_local'].dt.hour
df_merged['day_of_week'] = df_merged['datetime_local'].dt.dayofweek
df_merged['is_working_hours'] = df_merged['hour_of_day'].between(9, 17).astype(int)
df_merged['is_weekend'] = (df_merged['day_of_week'] >= 5).astype(int)

# ✅ Time Since Last BP Spike (based on BP_spike_mean)
# Use the previous row's timestamp if it was a spike.
df_merged['last_spike_time'] = df_merged['datetime_local'].shift(1).where(df_merged['BP_spike_mean'].shift(1) == 1)
df_merged['last_spike_time'] = df_merged['last_spike_time'].ffill()
df_merged['time_since_last_BP_spike'] = (df_merged['datetime_local'] - df_merged['last_spike_time']).dt.total_seconds() / 60
df_merged.drop(columns=['last_spike_time'], inplace=True)

# ✅ Drop Direct Current Row Measurements to Prevent Data Leakage
drop_cols = ['systolic', 'diastolic', 'BP_spike_threshold', 'systolic_tertile', 'diastolic_tertile']
df_merged.drop(columns=drop_cols, inplace=True)

# 📌 **Step 11: Handle Missing Values**
df_merged.ffill(inplace=True)
df_merged.bfill(inplace=True)  # Fixes rolling feature NaNs at the beginning

# 📌 **Step 12: Save Processed Dataset**
df_merged.to_csv("processed_bp_prediction_data_full.csv", index=False)
print("✅ Final dataset saved as 'processed_bp_prediction_data_full.csv'.")
print(df_merged.head())


✅ Final dataset saved as 'processed_bp_prediction_data_full.csv'.
      id  user_id    reading_id             datetime      datetime_local  \
0  792.0     18.0  1.714316e+12  2024-04-28 14:54:57 2024-04-28 04:54:57   
1  794.0     18.0  1.714323e+12  2024-04-28 16:57:35 2024-04-28 06:57:35   
2  793.0     18.0  1.714324e+12  2024-04-28 16:58:50 2024-04-28 06:58:50   
3  796.0     18.0  1.714341e+12  2024-04-28 21:49:18 2024-04-28 11:49:18   
4  795.0     18.0  1.714341e+12  2024-04-28 21:50:36 2024-04-28 11:50:36   

   pulse device_type           created_at  BP_spike_mean                time  \
0   67.0      BP8000  2024-04-28 15:00:10            1.0 2024-04-28 04:54:00   
1   65.0      BP8000  2024-04-28 17:05:11            1.0 2024-04-28 06:57:00   
2   69.0      BP8000  2024-04-28 17:05:11            1.0 2024-04-28 06:58:00   
3   68.0      BP8000  2024-04-28 21:55:09            1.0 2024-04-28 11:49:00   
4   64.0      BP8000  2024-04-28 21:55:09            1.0 2024-04-28 11:50:00 

In [6]:
# =============================================
# Blood‑pressure spike prediction end‑to‑end
# (XGBoost + LSTM ensemble, no oversampling)
# =============================================
import time, random, os
start_time = time.time()

# -------------------------
# Reproducibility
# -------------------------
import numpy as np
import tensorflow as tf
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)
os.environ["PYTHONHASHSEED"] = "42"

# -------------------------
# Core libraries
# -------------------------
import pandas as pd
import xgboost as xgb
import shap
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import keras_tuner as kt            # ← updated import

# -------------------------
# Keras / TF layers
# -------------------------
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    LSTM, Dense, Dropout, BatchNormalization, Bidirectional,
    GlobalAveragePooling1D
)
from tensorflow.keras.optimizers import Adam

# =============================================
# 1. Data load & feature list
# =============================================
df = pd.read_csv("processed_bp_prediction_data_full.csv")

target = "BP_spike_mean"

features = [
    # --- rolling stats (abbreviated comment) -------------
    'hr_mean_5min', 'hr_min_5min', 'hr_max_5min', 'hr_std_5min',
    'steps_total_5min', 'steps_mean_5min', 'steps_min_5min',
    'steps_max_5min', 'steps_std_5min', 'steps_diff_5min',
    'hr_mean_10min', 'hr_min_10min', 'hr_max_10min', 'hr_std_10min',
    'steps_total_10min', 'steps_mean_10min', 'steps_min_10min',
    'steps_max_10min', 'steps_std_10min', 'steps_diff_10min',
    'hr_mean_30min', 'hr_min_30min', 'hr_max_30min', 'hr_std_30min',
    'steps_total_30min', 'steps_mean_30min', 'steps_min_30min',
    'steps_max_30min', 'steps_std_30min', 'steps_diff_30min',
    'hr_mean_60min', 'hr_min_60min', 'hr_max_60min', 'hr_std_60min',
    'steps_total_60min', 'steps_mean_60min', 'steps_min_60min',
    'steps_max_60min', 'steps_std_60min', 'steps_diff_60min',
    # --- stress & lags -----------------------------------
    'stress_mean', 'stress_min', 'stress_max', 'stress_std',
    'stress_mean_lag_1', 'stress_mean_lag_3', 'stress_mean_lag_5',
    'BP_spike_mean_lag_1', 'BP_spike_mean_lag_3', 'BP_spike_mean_lag_5',
    'hr_mean_5min_lag_1', 'hr_mean_5min_lag_3', 'hr_mean_5min_lag_5',
    'steps_total_10min_lag_1', 'steps_total_10min_lag_3',
    'steps_total_10min_lag_5',
    # --- interactions & context --------------------------
    'hr_steps_ratio', 'stress_weighted_hr', 'stress_steps_ratio',
    'steps_hr_variability_ratio', 'hr_mean_rolling_3',
    'steps_total_rolling_5', 'hr_std_rolling_3',
    'cumulative_stress_30min', 'cumulative_steps_30min',
    'hour_of_day', 'day_of_week', 'is_working_hours', 'is_weekend',
    'time_since_last_BP_spike'
]

df = df[["datetime_local"] + features + [target]]
df[features] = df[features].apply(pd.to_numeric, errors='coerce')

df["datetime_local"] = pd.to_datetime(df["datetime_local"])
train_cutoff = df["datetime_local"].min() + pd.Timedelta(days=20)
train_data = df[df["datetime_local"] < train_cutoff]
test_data  = df[df["datetime_local"] >= train_cutoff]

X_train, y_train = train_data[features], train_data[target]
X_test,  y_test  = test_data[features],  test_data[target]

print("🔹 BP spike counts (train/test): "
      f"{int(y_train.sum())}/{len(y_train)}  —  "
      f"{int(y_test.sum())}/{len(y_test)}")

# =============================================
# 2. XGBoost pipeline & grid search
# =============================================
pos, neg = y_train.sum(), len(y_train) - y_train.sum()
scale_pos_weight = neg / pos
print(f"🔹 scale_pos_weight = {scale_pos_weight:.2f}")

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("xgb", xgb.XGBClassifier(
        random_state=42,
        objective="binary:logistic",
        eval_metric="auc"
    ))
])

param_grid = {
    "xgb__max_depth":      [3, 5, 7],
    "xgb__learning_rate":  [0.01, 0.05, 0.1],
    "xgb__n_estimators":   [100, 150, 200],
    "xgb__scale_pos_weight": [scale_pos_weight]
}

grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    error_score="raise"        # debug‑friendly
)
grid.fit(X_train, y_train)

best_xgb = grid.best_estimator_
print("🔹 Best XGB params:", grid.best_params_)

# =============================================
# 3. LSTM (tuned with Keras‑Tuner)
# =============================================
scaler_lstm = StandardScaler()
X_train_s = scaler_lstm.fit_transform(X_train)
X_test_s  = scaler_lstm.transform(X_test)

X_train_lstm = X_train_s.reshape((X_train_s.shape[0], X_train_s.shape[1], 1))
X_test_lstm  = X_test_s.reshape((X_test_s.shape[0],  X_test_s.shape[1],  1))

classes = np.unique(y_train)
weights  = compute_class_weight("balanced", classes=classes, y=y_train)
class_wt = {cls: w for cls, w in zip(classes, weights)}
print("🔹 Class weights:", class_wt)

# ----- custom attention helpers -----
class AttentionLayer(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.W = self.add_weight("W", shape=(input_shape[-1], 1),
                                 initializer="normal", trainable=True)
        self.b = self.add_weight("b", shape=(input_shape[1], 1),
                                 initializer="zeros", trainable=True)
    def call(self, x):
        e = tf.math.tanh(tf.matmul(x, self.W) + self.b)
        a = tf.nn.softmax(e, axis=1)
        return tf.reduce_sum(x * a, axis=1)

class MultiHeadAttentionLayer(tf.keras.layers.Layer):
    def __init__(self, num_heads, key_dim):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=key_dim)
    def call(self, x): return tf.reduce_mean(
        self.mha(query=x, key=x, value=x), axis=1)

class SelfAttentionLayer(tf.keras.layers.Layer):
    def __init__(self): super().__init__(); self.att = tf.keras.layers.Attention()
    def call(self, x): return tf.reduce_mean(self.att([x, x]), axis=1)

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, heads, kdim, ffdim, rate=0.1):
        super().__init__()
        self.mha   = tf.keras.layers.MultiHeadAttention(num_heads=heads, key_dim=kdim)
        self.ffn   = Sequential([Dense(ffdim, activation="relu"), Dense(kdim)])
        self.ln1   = tf.keras.layers.LayerNormalization()
        self.ln2   = tf.keras.layers.LayerNormalization()
        self.do1   = Dropout(rate); self.do2 = Dropout(rate)
    def call(self, x):
        attn  = self.do1(self.mha(x, x, x))
        out1  = self.ln1(x + attn)
        ffn   = self.do2(self.ffn(out1))
        return self.ln2(out1 + ffn)

# ----- model builder for tuner -----
def build_model(hp):
    model = Sequential()
    model.add(Bidirectional(LSTM(hp.Int("l1", 64, 256, 32),
                                 return_sequences=True),
                            input_shape=(X_train_lstm.shape[1], 1)))
    model.add(BatchNormalization())
    dr = hp.Float("drop", 0.2, 0.5, 0.1)
    model.add(Dropout(dr))

    model.add(LSTM(hp.Int("l2", 32, 128, 16), return_sequences=True))
    model.add(BatchNormalization())
    model.add(Dropout(dr))

    variant = hp.Choice("attention", ["custom", "multi", "self", "trans"])
    if variant == "custom":
        model.add(AttentionLayer())
    elif variant == "multi":
        model.add(MultiHeadAttentionLayer(
            num_heads=hp.Int("heads", 1, 4, 1),
            key_dim=hp.Int("kdim", 16, 64, 16)))
    elif variant == "self":
        model.add(SelfAttentionLayer())
    else:
        model.add(TransformerBlock(
            heads=hp.Int("heads_t", 1, 4, 1),
            kdim=hp.Int("kdim_t", 16, 64, 16),
            ffdim=hp.Int("ff", 32, 128, 32),
            rate=dr))
        model.add(GlobalAveragePooling1D())

    model.add(Dense(hp.Int("dense", 16, 64, 16),
                    activation="relu",
                    kernel_regularizer=tf.keras.regularizers.l2(
                        hp.Choice("l2", [0.0, 0.001, 0.01, 0.1]))))
    model.add(Dropout(dr))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(
        optimizer=Adam(hp.Choice("lr", [1e-3, 5e-4, 1e-4])),
        loss="binary_crossentropy",
        metrics=[tf.keras.metrics.AUC(name="auc")])
    return model

tuner = kt.RandomSearch(
    build_model,
    objective=kt.Objective("val_auc", direction="max"),
    max_trials=20, directory="lstm_tuner",
    project_name="bp_spike_pred", overwrite=True
)

tuner.search(X_train_lstm, y_train,
             epochs=50, batch_size=32,
             validation_data=(X_test_lstm, y_test),
             class_weight=class_wt)

best_lstm = tuner.get_best_models(1)[0]
print("🔹 Best LSTM HP:", tuner.get_best_hyperparameters(1)[0].values)

# =============================================
# 4. Ensemble & threshold search
# =============================================
y_xgb  = best_xgb.predict_proba(X_test)[:, 1]
y_lstm = best_lstm.predict(X_test_lstm).ravel()

alphas = np.linspace(0, 1, 11)
best_auc, best_alpha = -1, None
for a in alphas:
    auc = roc_auc_score(y_test, a*y_xgb + (1-a)*y_lstm)
    if auc > best_auc: best_auc, best_alpha = auc, a
print(f"🔹 Ensemble AUROC = {best_auc:.3f}  (α={best_alpha:.2f})")
best_beta = 1 - best_alpha

best_thr, best_youden = None, -1
for thr in np.arange(0, 1.01, 0.01):
    y_bin = ((best_alpha*y_xgb + best_beta*y_lstm) >= thr).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_bin).ravel()
    sens = tp/(tp+fn) if tp+fn else 0
    spec = tn/(tn+fp) if tn+fp else 0
    ydn  = sens + spec - 1
    if ydn > best_youden: best_thr, best_youden, best_sens, best_spec = thr, ydn, sens, spec
print(f"🔹 Optimal threshold = {best_thr:.2f} (sens={best_sens:.2f}, spec={best_spec:.2f})")

# =============================================
# 5. Sensitivity‑specificity plot
# =============================================
thr = np.arange(0, 1.01, 0.01)
sens, spec = [], []
for t in thr:
    yb = ((best_alpha*y_xgb + best_beta*y_lstm) >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, yb).ravel()
    sens.append(tp/(tp+fn) if tp+fn else 0)
    spec.append(tn/(tn+fp) if tn+fp else 0)

plt.figure(figsize=(8,5))
plt.plot(thr, sens, "-o", label="Sensitivity")
plt.plot(thr, spec, "-s", label="Specificity")
plt.xlabel("Decision threshold"); plt.ylabel("Value")
plt.title(f"TPR / TNR trade‑off — AUROC {best_auc:.3f}")
plt.grid(); plt.legend(); plt.tight_layout(); plt.show()

# =============================================
# 6. SHAP for XGBoost
# =============================================
explainer  = shap.Explainer(best_xgb.named_steps["xgb"])
shap_vals  = explainer(best_xgb.named_steps["scaler"].transform(X_test))
shap.summary_plot(shap_vals, X_test, feature_names=features)

# =============================================
# 7. Time taken
# =============================================
print(f"\nTotal run time: {time.time()-start_time:.1f} s")





  from .autonotebook import tqdm as notebook_tqdm


🔹 BP spike counts (train/test): 68/108  —  14/28
🔹 scale_pos_weight = 0.59


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


🔹 Best XGB params: {'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 150, 'xgb__scale_pos_weight': 0.5882352941176471}
🔹 Class weights: {0.0: 1.35, 1.0: 0.7941176470588235}


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
256               |256               |l1
0.4               |0.4               |drop
48                |48                |l2
self              |self              |attention
32                |32                |dense
0.0005            |0.0005            |lr

Epoch 1/50

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50

KeyboardInterrupt: 

In [8]:
print("🔹 BP Spike Counts Before Resampling:")
print(f"   - Training Set: {sum(y_train)} spikes out of {len(y_train)} samples ({sum(y_train)/len(y_train)*100:.2f}%)")
print(f"   - Test Set: {sum(y_test)} spikes out of {len(y_test)} samples ({sum(y_test)/len(y_test)*100:.2f}%)")

🔹 BP Spike Counts Before Resampling:
   - Training Set: 127.0 spikes out of 177 samples (71.75%)
   - Test Set: 27.0 spikes out of 43 samples (62.79%)
