In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score


import joblib



In [2]:
df = pd.read_csv("human_vital_signs_dataset_2024.csv")
df.head()


Unnamed: 0,Patient ID,Heart Rate,Respiratory Rate,Timestamp,Body Temperature,Oxygen Saturation,Systolic Blood Pressure,Diastolic Blood Pressure,Age,Gender,Weight (kg),Height (m),Derived_HRV,Derived_Pulse_Pressure,Derived_BMI,Derived_MAP,Risk Category
0,1,60,12,2024-07-19 21:53:45.729841,36.861707,95.702046,124,86,37,Female,91.541618,1.679351,0.121033,38,32.459031,98.666667,High Risk
1,2,63,18,2024-07-19 21:52:45.729841,36.511633,96.689413,126,84,77,Male,50.704921,1.992546,0.117062,42,12.771246,98.0,High Risk
2,3,63,15,2024-07-19 21:51:45.729841,37.052049,98.508265,131,78,68,Female,90.31676,1.770228,0.0532,53,28.821069,95.666667,Low Risk
3,4,99,16,2024-07-19 21:50:45.729841,36.654748,95.011801,118,72,41,Female,96.006188,1.833629,0.064475,46,28.554611,87.333333,High Risk
4,5,69,16,2024-07-19 21:49:45.729841,36.975098,98.623792,138,76,25,Female,56.020006,1.866419,0.118484,62,16.081438,96.666667,High Risk


In [3]:
le = LabelEncoder()
df["Risk_Label"] = le.fit_transform(df["Risk Category"])
# Low Risk = 0, High Risk = 1


In [4]:
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
df = df.sort_values(["Patient ID", "Timestamp"])


In [5]:
baseline_window = 10

baseline_cols = [
    "Heart Rate",
    "Oxygen Saturation",
    "Body Temperature"
]

for col in baseline_cols:
    df[f"{col}_baseline"] = (
        df.groupby("Patient ID")[col]
        .transform(lambda x: x.rolling(baseline_window, min_periods=5).mean())
    )
    
    df[f"{col}_delta"] = df[col] - df[f"{col}_baseline"]


In [6]:
unsup_features_v2 = [
    "Heart Rate",
    "Respiratory Rate",
    "Body Temperature",
    "Oxygen Saturation",
    "Derived_HRV",
    "Derived_MAP",
    "Derived_Pulse_Pressure",
    "Heart Rate_delta",
    "Oxygen Saturation_delta",
    "Body Temperature_delta"
]


In [7]:
sup_features_v2 = [
    "anomaly_percentile",
    "Derived_HRV",
    "Derived_MAP",
    "Derived_Pulse_Pressure",
    "Heart Rate_delta",
    "Oxygen Saturation_delta"
]


In [9]:
# Fill missing baselines with current value (first readings)
for col in baseline_cols:
    df[f"{col}_baseline"] = df[f"{col}_baseline"].fillna(df[col])
    df[f"{col}_delta"] = df[f"{col}_delta"].fillna(0)


In [10]:
df[unsup_features_v2].isna().sum()


Heart Rate                 0
Respiratory Rate           0
Body Temperature           0
Oxygen Saturation          0
Derived_HRV                0
Derived_MAP                0
Derived_Pulse_Pressure     0
Heart Rate_delta           0
Oxygen Saturation_delta    0
Body Temperature_delta     0
dtype: int64

In [11]:
normal_df = df[df["Risk_Label"] == 0]

scaler_v2 = StandardScaler()
X_unsup_train = scaler_v2.fit_transform(
    normal_df[unsup_features_v2]
)

iso_model_v2 = IsolationForest(
    n_estimators=200,
    contamination=0.08,
    random_state=42
)

iso_model_v2.fit(X_unsup_train)


In [12]:
normal_df = df[df["Risk_Label"] == 0]

scaler_v2 = StandardScaler()
X_unsup_train = scaler_v2.fit_transform(
    normal_df[unsup_features_v2]
)

iso_model_v2 = IsolationForest(
    n_estimators=200,
    contamination=0.08,
    random_state=42
)

iso_model_v2.fit(X_unsup_train)


In [14]:
# Generate anomaly percentile BEFORE supervised training
X_all_scaled = scaler_v2.transform(df[unsup_features_v2])

df["anomaly_score"] = -iso_model_v2.decision_function(X_all_scaled)
df["anomaly_percentile"] = df["anomaly_score"].rank(pct=True)


In [15]:
print("anomaly_percentile" in df.columns)


True


In [16]:
sup_features_v2 = [
    "anomaly_percentile",
    "Derived_HRV",
    "Derived_MAP",
    "Derived_Pulse_Pressure",
    "Heart Rate_delta",
    "Oxygen Saturation_delta"
]


In [17]:
X = df[sup_features_v2]
y = df["Risk_Label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

rf_model_v2 = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_leaf=50,
    class_weight="balanced",
    random_state=42
)

rf_model_v2.fit(X_train, y_train)


In [18]:
y_pred = rf_model_v2.predict(X_test)
risk_prob = rf_model_v2.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, risk_prob))


              precision    recall  f1-score   support

           0       0.54      0.65      0.59     21023
           1       0.49      0.38      0.43     18981

    accuracy                           0.52     40004
   macro avg       0.52      0.51      0.51     40004
weighted avg       0.52      0.52      0.51     40004

ROC-AUC: 0.5185918311655286


In [19]:
sup_features_v2 = [
    # Anomaly intelligence
    "anomaly_percentile",

    # Absolute vitals (label-aligned)
    "Heart Rate",
    "Oxygen Saturation",

    # Personalized deviation
    "Heart Rate_delta",
    "Oxygen Saturation_delta",

    # Stable derived indicators
    "Derived_MAP",
    "Derived_Pulse_Pressure"
]


In [20]:
X = df[sup_features_v2]
y = df["Risk_Label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

rf_model_v2 = RandomForestClassifier(
    n_estimators=400,
    max_depth=14,
    min_samples_leaf=40,
    class_weight="balanced",
    random_state=42
)

rf_model_v2.fit(X_train, y_train)


In [21]:
y_pred = rf_model_v2.predict(X_test)
risk_prob = rf_model_v2.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, risk_prob))


              precision    recall  f1-score   support

           0       1.00      0.43      0.60     21023
           1       0.61      1.00      0.76     18981

    accuracy                           0.70     40004
   macro avg       0.81      0.72      0.68     40004
weighted avg       0.82      0.70      0.68     40004

ROC-AUC: 0.7152514160678151


In [22]:
def assign_risk(anomaly, prob):
    if anomaly > 0.9 and prob > 0.6:
        return "Critical"
    elif anomaly > 0.5 or prob > 0.5:
        return "Warning"
    else:
        return "Normal"

df["Predicted_Risk_Level"] = [
    assign_risk(a, p)
    for a, p in zip(
        df["anomaly_percentile"],
        rf_model_v2.predict_proba(df[sup_features_v2])[:, 1]
    )
]


In [23]:
df[
    ["Risk Category", "anomaly_percentile", "Predicted_Risk_Level"]
].sample(10)


Unnamed: 0,Risk Category,anomaly_percentile,Predicted_Risk_Level
147508,High Risk,0.494856,Normal
42347,Low Risk,0.879112,Warning
112693,Low Risk,0.542471,Warning
47995,High Risk,0.538591,Warning
179982,Low Risk,0.042516,Warning
57192,High Risk,0.196795,Warning
28778,Low Risk,0.760379,Warning
187895,High Risk,0.325507,Warning
39660,Low Risk,0.615398,Warning
9175,Low Risk,0.807084,Warning


In [24]:
df[
    ["Risk Category", "anomaly_percentile", "Predicted_Risk_Level"]
].sample(10)


Unnamed: 0,Risk Category,anomaly_percentile,Predicted_Risk_Level
124972,High Risk,0.90061,Warning
199654,High Risk,0.851585,Warning
30719,Low Risk,0.299375,Warning
159630,Low Risk,0.052555,Warning
148187,Low Risk,0.548585,Warning
78637,Low Risk,0.3473,Warning
189135,High Risk,0.310274,Normal
7370,High Risk,0.297785,Warning
59508,High Risk,0.786001,Warning
18679,Low Risk,0.268163,Warning


In [26]:
from datetime import datetime
import joblib

model_bundle_v2 = {
    "iso_model": iso_model_v2,
    "scaler": scaler_v2,
    "rf_model": rf_model_v2,

    "unsup_features": unsup_features_v2,
    "sup_features": sup_features_v2,

    "thresholds": {
        "anomaly_warning": 0.5,
        "anomaly_critical": 0.9,
        "risk_warning": 0.5,
        "risk_critical": 0.6
    },

    "version": "v2.0-final",
    "created_at": datetime.now().isoformat(),
    "notes": "Patient-adaptive hybrid early health risk model"
}

joblib.dump(model_bundle_v2, "health_risk_pipeline_v2_final.pkl")


['health_risk_pipeline_v2_final.pkl']

In [27]:
bundle = joblib.load("health_risk_pipeline_v2_final.pkl")
bundle.keys()


dict_keys(['iso_model', 'scaler', 'rf_model', 'unsup_features', 'sup_features', 'thresholds', 'version', 'created_at', 'notes'])