# XGBoost: Predicting **Heart Disease** from BRFSS 2015
Source file: `diabetes_binary_5050split_health_indicators_BRFSS2015-1.csv`

The target variable is the **HeartDiseaseorAttack** column; all remaining columns are used as features.


In [None]:

import pandas as pd, numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, RocCurveDisplay
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

# Input CSV and the name of the column the model will predict.
DATA_PATH = r"/mnt/data/diabetes_binary_5050split_health_indicators_BRFSS2015-1.csv"
TARGET = "HeartDiseaseorAttack"

df = pd.read_csv(DATA_PATH)

# Validate with an explicit raise rather than `assert`: assertions are removed
# when Python runs with -O, which would let a wrong TARGET slip through and
# fail later with a less informative error.
if TARGET not in df.columns:
    raise ValueError(f"Target '{TARGET}' not found. Available columns: {list(df.columns)}")

# Last expression of the cell, so the notebook displays the shape and first rows.
df.shape, df.head()


In [None]:

# Label vector (cast to int) and feature matrix (every column except the target).
y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET])

# Partition the feature columns by dtype in a single pass, keeping column order.
numeric_cols, categorical_cols = [], []
for _col in X.columns:
    if pd.api.types.is_numeric_dtype(X[_col]):
        numeric_cols.append(_col)
    else:
        categorical_cols.append(_col)

# A column group with no members gets the "drop" sentinel instead of a transformer.
_num_step = StandardScaler(with_mean=False) if numeric_cols else "drop"
_cat_step = OneHotEncoder(handle_unknown="ignore") if categorical_cols else "drop"
preprocess = ColumnTransformer(
    transformers=[
        ("num", _num_step, numeric_cols),
        ("cat", _cat_step, categorical_cols),
    ],
    remainder="drop",
)

# Booster hyperparameters, kept together so they are easy to scan and tweak.
_xgb_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": 4,
    "eval_metric": "logloss",
}
model = Pipeline(steps=[("prep", preprocess), ("xgb", XGBClassifier(**_xgb_params))])

# Stratified 80/20 hold-out split with a fixed seed, then fit the whole pipeline
# on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
model.fit(X_train, y_train)

# Positive-class probability for each test row; hard labels use a 0.5 cut-off.
proba = model.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

acc = accuracy_score(y_test, pred)
auc = roc_auc_score(y_test, proba)
print(f"Accuracy: {acc:.4f}", f"ROC-AUC:  {auc:.4f}", sep="\n")

# Each report is printed as a heading line followed by its body.
for _heading, _body in (
    ("\nClassification Report:", classification_report(y_test, pred, digits=4)),
    ("Confusion Matrix:", confusion_matrix(y_test, pred)),
):
    print(_heading)
    print(_body)

# ROC curve built from the held-out probabilities.
RocCurveDisplay.from_predictions(y_test, proba)
plt.title("XGBoost ROC — HeartDiseaseorAttack")
plt.show()


In [None]:

# Optional demo: serialize the fitted pipeline (preprocessing + classifier) to
# disk so it can be reloaded later in this session.
import os
import joblib

model_path = "/mnt/data/xgb_heart_disease_pipeline.joblib"
joblib.dump(model, filename=model_path)

# Last expression of the cell, so the notebook echoes where the file was written.
model_path
