# Heart Disease Prediction — Ensemble GBDT Approach
## Playground Series S6E2 | Target: AUC-ROC

**Strategy:**
- Proper feature typing (continuous vs categorical)
- Domain-informed feature engineering
- Tuned XGBoost + LightGBM + CatBoost
- Blended ensemble for robust predictions


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import optuna
from optuna.samplers import TPESampler
optuna.logging.set_verbosity(optuna.logging.WARNING)

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 50)
print("All imports loaded")


In [None]:
# Load data
# Read the competition CSVs from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/playground-series-s6e2/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s6e2/test.csv')
# Sample submission file; not referenced again in the cells visible here.
sub = pd.read_csv('/kaggle/input/playground-series-s6e2/sample_submission.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape:  {test.shape}")
# Last expression of the cell, so the notebook renders the preview.
train.head()


## Exploratory Data Analysis


In [None]:
# Target encoding: map the string labels to 0/1.
# The guard makes the cell safe to re-run: Series.map returns NaN for keys
# missing from the mapping, so mapping the already-encoded 0/1 column a second
# time would wipe the target.
if not pd.api.types.is_numeric_dtype(train['Heart Disease']):
    encoded_target = train['Heart Disease'].map({'Absence': 0, 'Presence': 1})
    if encoded_target.isna().any():
        # Fail loudly rather than carrying NaN targets into training.
        unexpected = train.loc[encoded_target.isna(), 'Heart Disease'].unique()
        raise ValueError(f"Unexpected 'Heart Disease' labels: {list(unexpected)}")
    train['Heart Disease'] = encoded_target

print("Target Distribution:")
print(train['Heart Disease'].value_counts(normalize=True))
print(f"\nMissing values in train: {train.isnull().sum().sum()}")
print(f"Missing values in test:  {test.isnull().sum().sum()}")
print(f"\nUnique values per column:")
print(train.drop(['id','Heart Disease'], axis=1).nunique().sort_values())


In [None]:
# Per-feature histograms, overlaid by target class.
features = [c for c in train.columns if c not in ['id', 'Heart Disease']]

# Size the grid from the feature count instead of a fixed 3x5, so that more
# than 15 features does not raise IndexError. 4 inches of height per row keeps
# the original 20x12 figure when there are three rows.
n_cols = 5
n_rows = max(1, -(-len(features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows))
axes = axes.flatten()

# Filter each class once rather than once per feature.
no_hd = train[train['Heart Disease'] == 0]
hd = train[train['Heart Disease'] == 1]

for ax, col in zip(axes, features):
    no_hd[col].hist(ax=ax, alpha=0.5, label='No HD', bins=30, density=True)
    hd[col].hist(ax=ax, alpha=0.5, label='HD', bins=30, density=True)
    ax.set_title(col, fontsize=10)
    ax.legend(fontsize=7)

# hide unused subplots
for ax in axes[len(features):]:
    ax.set_visible(False)

plt.suptitle('Feature Distributions by Target', fontsize=14, y=1.01)
plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations of every column except the row id (target included).
corr = train.drop(columns='id').corr()
# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(12, 8))
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    square=True,
    linewidths=0.5,
)
plt.title('Correlation Matrix', fontsize=14)
plt.tight_layout()
plt.show()


## Feature Engineering

**Continuous features:** Age, BP, Cholesterol, Max HR, ST depression  
**Categorical features:** Sex, Chest pain type, FBS over 120, EKG results, Exercise angina, Slope of ST, Number of vessels fluro, Thallium



In [None]:
def feature_engineering(df):
    """Return a copy of ``df`` with engineered features appended.

    Reads the columns 'Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression',
    'Exercise angina' and 'Number of vessels fluro'. The input frame is not
    modified.

    Added columns:
        * pairwise products (``*_x_*``), ratios (``*_ratio``) and squares
          (``*_sq``) of the continuous inputs;
        * integer-coded bins ``Age_bin``, ``BP_category``, ``Chol_category``;
        * ``risk_score``: a count of simple threshold flags plus
          'Exercise angina'.

    Raises:
        KeyError: if a required column is missing.
        ValueError: if a binned column contains NaN (the integer cast fails).
    """
    df = df.copy()

    # --- Interaction Features ---
    df['Age_x_MaxHR'] = df['Age'] * df['Max HR']
    df['Age_x_STdep'] = df['Age'] * df['ST depression']
    df['BP_x_Chol'] = df['BP'] * df['Cholesterol']
    df['MaxHR_x_STdep'] = df['Max HR'] * df['ST depression']
    df['Age_x_Vessels'] = df['Age'] * df['Number of vessels fluro']
    df['BP_x_MaxHR'] = df['BP'] * df['Max HR']

    # --- Ratio Features --- (+1 in the denominator avoids division by zero)
    df['Chol_BP_ratio'] = df['Cholesterol'] / (df['BP'] + 1)
    df['MaxHR_Age_ratio'] = df['Max HR'] / (df['Age'] + 1)
    df['STdep_MaxHR_ratio'] = df['ST depression'] / (df['Max HR'] + 1)

    # --- Polynomial Features for key predictors ---
    df['STdep_sq'] = df['ST depression'] ** 2
    df['MaxHR_sq'] = df['Max HR'] ** 2
    df['Age_sq'] = df['Age'] ** 2

    # --- Domain-informed bins ---
    # The outer edges are open-ended. With finite edges such as (0, 100], a
    # value of 0 or one above the top edge falls outside every interval,
    # pd.cut returns NaN, and the integer cast raises. Inner cut points are
    # unchanged, so in-range values get the same codes as before.
    df['Age_bin'] = pd.cut(
        df['Age'], bins=[-np.inf, 40, 50, 60, np.inf], labels=[0, 1, 2, 3]
    ).astype(int)
    df['BP_category'] = pd.cut(
        df['BP'], bins=[-np.inf, 120, 130, 140, np.inf], labels=[0, 1, 2, 3]
    ).astype(int)
    df['Chol_category'] = pd.cut(
        df['Cholesterol'], bins=[-np.inf, 200, 240, np.inf], labels=[0, 1, 2]
    ).astype(int)

    # --- Aggregated risk score (simple) ---
    # 'Exercise angina' is added as-is, not thresholded
    # (presumably a 0/1 flag; verify against the data).
    df['risk_score'] = (
        (df['Age'] > 55).astype(int) +
        (df['BP'] > 130).astype(int) +
        (df['Cholesterol'] > 240).astype(int) +
        (df['Max HR'] < 140).astype(int) +
        (df['ST depression'] > 1).astype(int) +
        df['Exercise angina'] +
        (df['Number of vessels fluro'] > 0).astype(int)
    )

    return df

# Columns to treat as categorical: the raw coded fields plus the engineered bins.
# Only the CatBoost cells visible here consume this list (via cat_indices).
CAT_COLS = [
    'Sex',
    'Chest pain type',
    'FBS over 120',
    'EKG results',
    'Exercise angina',
    'Slope of ST',
    'Number of vessels fluro',
    'Thallium',
    'Age_bin',
    'BP_category',
    'Chol_category',
]

# Engineered copies of both frames
train_fe, test_fe = feature_engineering(train), feature_engineering(test)

# Pull ids and the target out of the feature frames
test_ids = test_fe.pop('id')
y = train_fe.pop('Heart Disease')
train_fe.drop(columns='id', inplace=True)

# Training column order; test frames are re-indexed with this list at predict time
FEATURES = list(train_fe.columns)
print(f"Total features: {len(FEATURES)}")
print(f"Categorical features: {len(CAT_COLS)}")
print(f"Numerical features: {len(FEATURES) - len(CAT_COLS)}")
train_fe.head()


## Model Training — Optuna-Tuned GBDT Ensemble

Tune each model individually with Optuna, then blend their predictions.

**Models:**
1. LightGBM
2. XGBoost  
3. CatBoost


In [None]:
# Shared cross-validation / tuning configuration
SEED = 42
N_SPLITS = 10
N_OPTUNA_TRIALS = 50

# One shuffled, seeded splitter reused by every tuning and training loop,
# so all models are evaluated on the same folds.
skf = StratifiedKFold(shuffle=True, n_splits=N_SPLITS, random_state=SEED)
print(
    f"Using {N_SPLITS}-Fold Stratified CV with seed {SEED}\n"
    f"Optuna trials per model: {N_OPTUNA_TRIALS}"
)


In [None]:
def lgb_objective(trial):
    """Optuna objective: mean validation AUC of LightGBM across the ``skf`` folds.

    Each fold fits an ``LGBMClassifier`` with up to 3000 trees, early-stopped
    after 100 rounds without improvement on that fold's validation split, and
    is scored with ROC AUC on the same split.

    NOTE(review): per the LightGBM parameter docs, ``subsample`` only takes
    effect when ``subsample_freq`` is non-zero; it is not set here, so this
    search dimension may be inert. Confirm, and if it is changed, change the
    final-training parameters to match.
    """
    fixed = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': SEED,
        'n_estimators': 3000,
    }
    # The suggest_* calls stay in their original order so that a seeded sampler
    # sees the same sequence of requests.
    searched = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 1e-8, 1.0, log=True),
    }
    params = {**fixed, **searched}

    fold_aucs = []
    for trn_idx, val_idx in skf.split(train_fe, y):
        X_val = train_fe.iloc[val_idx]
        y_val = y.iloc[val_idx]

        clf = lgb.LGBMClassifier(**params)
        clf.fit(
            train_fe.iloc[trn_idx],
            y.iloc[trn_idx],
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(100, verbose=False), lgb.log_evaluation(0)],
        )
        val_proba = clf.predict_proba(X_val)[:, 1]
        fold_aucs.append(roc_auc_score(y_val, val_proba))

    return np.mean(fold_aucs)

# Run the LightGBM search: a seeded TPE sampler maximizes the mean CV AUC
# returned by lgb_objective over N_OPTUNA_TRIALS trials.
print("üîç Tuning LightGBM...")
lgb_study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=SEED))
lgb_study.optimize(lgb_objective, n_trials=N_OPTUNA_TRIALS, show_progress_bar=True)
# best_params holds only the values suggested via `trial`, not the fixed settings.
print(f"\n‚úÖ Best LightGBM AUC: {lgb_study.best_value:.6f}")
print(f"Best params: {lgb_study.best_params}")


In [None]:
def xgb_objective(trial):
    """Optuna objective: mean validation AUC of XGBoost across the ``skf`` folds.

    Each fold fits an ``XGBClassifier`` (hist tree method, up to 3000 trees)
    with early stopping after 100 rounds on that fold's validation split, and
    is scored with ROC AUC on the same split.
    """
    fixed = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',
        'random_state': SEED,
        'n_estimators': 3000,
    }
    # The suggest_* calls stay in their original order so that a seeded sampler
    # sees the same sequence of requests.
    searched = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 5.0, log=True),
    }
    params = {**fixed, **searched}

    fold_aucs = []
    for trn_idx, val_idx in skf.split(train_fe, y):
        X_val = train_fe.iloc[val_idx]
        y_val = y.iloc[val_idx]

        clf = xgb.XGBClassifier(**params, verbosity=0, early_stopping_rounds=100)
        clf.fit(
            train_fe.iloc[trn_idx],
            y.iloc[trn_idx],
            eval_set=[(X_val, y_val)],
            verbose=False,
        )
        val_proba = clf.predict_proba(X_val)[:, 1]
        fold_aucs.append(roc_auc_score(y_val, val_proba))

    return np.mean(fold_aucs)

# Run the XGBoost search: a seeded TPE sampler maximizes the mean CV AUC
# returned by xgb_objective over N_OPTUNA_TRIALS trials.
print("üîç Tuning XGBoost...")
xgb_study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=SEED))
xgb_study.optimize(xgb_objective, n_trials=N_OPTUNA_TRIALS, show_progress_bar=True)
# best_params holds only the values suggested via `trial`, not the fixed settings.
print(f"\n‚úÖ Best XGBoost AUC: {xgb_study.best_value:.6f}")
print(f"Best params: {xgb_study.best_params}")


In [None]:
# For CatBoost, identify categorical feature indices
# These are positional column indices into train_fe, listed in CAT_COLS order;
# any CAT_COLS name absent from train_fe is skipped rather than raising.
cat_indices = [train_fe.columns.get_loc(c) for c in CAT_COLS if c in train_fe.columns]

def cb_objective(trial):
    """Optuna objective: mean validation AUC of CatBoost across the ``skf`` folds.

    Each fold fits a ``CatBoostClassifier`` (up to 3000 iterations) with the
    columns in ``cat_indices`` declared categorical and early stopping after
    100 rounds on that fold's validation split, and is scored with ROC AUC on
    the same split.

    NOTE(review): the CatBoost docs describe ``min_data_in_leaf`` as applying
    to the Lossguide/Depthwise grow policies; ``grow_policy`` is not set here.
    Confirm that this search dimension has any effect.
    """
    fixed = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'random_seed': SEED,
        'iterations': 3000,
    }
    # The suggest_* calls stay in their original order so that a seeded sampler
    # sees the same sequence of requests.
    searched = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 5.0),
        'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
    }
    params = {**fixed, **searched, 'verbose': 0}

    fold_aucs = []
    for trn_idx, val_idx in skf.split(train_fe, y):
        X_val = train_fe.iloc[val_idx]
        y_val = y.iloc[val_idx]

        clf = cb.CatBoostClassifier(**params)
        clf.fit(
            train_fe.iloc[trn_idx],
            y.iloc[trn_idx],
            eval_set=(X_val, y_val),
            cat_features=cat_indices,
            early_stopping_rounds=100,
            verbose=0,
        )
        val_proba = clf.predict_proba(X_val)[:, 1]
        fold_aucs.append(roc_auc_score(y_val, val_proba))

    return np.mean(fold_aucs)

# Run the CatBoost search: a seeded TPE sampler maximizes the mean CV AUC
# returned by cb_objective over N_OPTUNA_TRIALS trials.
print("üîç Tuning CatBoost...")
cb_study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=SEED))
cb_study.optimize(cb_objective, n_trials=N_OPTUNA_TRIALS, show_progress_bar=True)
# best_params holds only the values suggested via `trial`, not the fixed settings.
print(f"\n‚úÖ Best CatBoost AUC: {cb_study.best_value:.6f}")
print(f"Best params: {cb_study.best_params}")


## Final Training with Best Params + Ensemble Blend

Train all 3 models with their best hyperparameters, collect OOF predictions, and find optimal blend weights.


In [None]:
def train_full_cv(model_type, best_params):
    """Train a model with full CV, return OOF preds and test preds.

    Args:
        model_type: One of 'lgb', 'xgb' or 'cb'.
        best_params: Tuned hyperparameters. They are unpacked after the fixed
            settings, so a tuned value wins on a key collision.

    Returns:
        Tuple ``(oof_preds, test_preds, mean_score)``: out-of-fold positive
        class probabilities for every training row, test probabilities
        averaged over the folds, and the mean of the per-fold AUCs.

    Raises:
        ValueError: If ``model_type`` is not a supported model key.
    """
    # Validate first. Without this, an unknown key skips every branch below and
    # the first use of `model` fails with a NameError (or, in a notebook,
    # silently reuses a stale global `model`).
    if model_type not in ('lgb', 'xgb', 'cb'):
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'lgb', 'xgb' or 'cb'"
        )

    # Take the fold count from the splitter that is actually iterated, so the
    # test average stays correct even if N_SPLITS and skf drift apart.
    n_folds = skf.get_n_splits()
    # Loop-invariant: align the test columns with the training order once.
    X_test = test_fe[FEATURES]

    oof_preds = np.zeros(len(train_fe))
    test_preds = np.zeros(len(test_fe))
    fold_scores = []

    for fold, (trn_idx, val_idx) in enumerate(skf.split(train_fe, y)):
        X_trn, X_val = train_fe.iloc[trn_idx], train_fe.iloc[val_idx]
        y_trn, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # Every model is early-stopped (100 rounds) on the fold's validation split.
        if model_type == 'lgb':
            params = {
                'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
                'boosting_type': 'gbdt', 'random_state': SEED, 'n_estimators': 3000,
                **best_params
            }
            model = lgb.LGBMClassifier(**params)
            model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)],
                      callbacks=[lgb.early_stopping(100, verbose=False), lgb.log_evaluation(0)])

        elif model_type == 'xgb':
            params = {
                'objective': 'binary:logistic', 'eval_metric': 'auc',
                'tree_method': 'hist', 'random_state': SEED, 'n_estimators': 3000,
                'verbosity': 0, 'early_stopping_rounds': 100, **best_params
            }
            model = xgb.XGBClassifier(**params)
            model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=False)

        elif model_type == 'cb':
            params = {
                'loss_function': 'Logloss', 'eval_metric': 'AUC',
                'random_seed': SEED, 'iterations': 3000, 'verbose': 0,
                **best_params
            }
            model = cb.CatBoostClassifier(**params)
            model.fit(X_trn, y_trn, eval_set=(X_val, y_val),
                      cat_features=cat_indices, early_stopping_rounds=100, verbose=0)

        val_pred = model.predict_proba(X_val)[:, 1]
        oof_preds[val_idx] = val_pred
        # Each fold contributes an equal share of the final test prediction.
        test_preds += model.predict_proba(X_test)[:, 1] / n_folds

        score = roc_auc_score(y_val, val_pred)
        fold_scores.append(score)
        print(f"  Fold {fold+1}: AUC = {score:.6f}")

    mean_score = np.mean(fold_scores)
    print(f"  ‚û°Ô∏è  Mean AUC: {mean_score:.6f} (¬±{np.std(fold_scores):.6f})\n")
    return oof_preds, test_preds, mean_score

# Train all models
# Each call re-runs the full CV with that model's best Optuna parameters and
# returns (OOF predictions, fold-averaged test predictions, mean fold AUC).
print("=" * 60)
print("üìà LightGBM ‚Äî Final CV Training")
print("=" * 60)
lgb_oof, lgb_test, lgb_score = train_full_cv('lgb', lgb_study.best_params)

print("=" * 60)
print("üìà XGBoost ‚Äî Final CV Training")
print("=" * 60)
xgb_oof, xgb_test, xgb_score = train_full_cv('xgb', xgb_study.best_params)

print("=" * 60)
print("üìà CatBoost ‚Äî Final CV Training")
print("=" * 60)
cb_oof, cb_test, cb_score = train_full_cv('cb', cb_study.best_params)


In [None]:
from scipy.optimize import minimize

def blend_objective(weights):
    """Return the negated OOF AUC of the weighted LGB/XGB/CatBoost blend.

    ``weights`` is a length-3 sequence ordered (LightGBM, XGBoost, CatBoost).
    The sign is flipped so that a minimizer maximizes AUC.
    """
    w_lgb, w_xgb, w_cb = weights
    blended = w_lgb * lgb_oof + w_xgb * xgb_oof + w_cb * cb_oof
    auc = roc_auc_score(y, blended)
    return -auc

# Optimize blend weights
# NOTE: passing `bounds` to Nelder-Mead needs a SciPy version that supports it
# (added in SciPy 1.7).
result = minimize(
    blend_objective,
    x0=[1/3, 1/3, 1/3],
    method='Nelder-Mead',
    bounds=[(0, 1), (0, 1), (0, 1)]
)

w1, w2, w3 = result.x
# Normalize weights so they sum to 1. AUC depends only on the ranking of the
# scores, so a positive rescaling does not change the blend's AUC.
total = w1 + w2 + w3
if total > 0:
    w1, w2, w3 = w1/total, w2/total, w3/total
else:
    # The bounds allow all three weights to be 0; fall back to an equal blend
    # instead of dividing by zero.
    w1 = w2 = w3 = 1/3

blend_oof = w1 * lgb_oof + w2 * xgb_oof + w3 * cb_oof
blend_score = roc_auc_score(y, blend_oof)

print(f"\n{'='*60}")
print(f"üèÜ RESULTS SUMMARY")
print(f"{'='*60}")
print(f"LightGBM CV:  {lgb_score:.6f}")
print(f"XGBoost CV:   {xgb_score:.6f}")
print(f"CatBoost CV:  {cb_score:.6f}")
print(f"\nOptimal Blend Weights: LGB={w1:.3f} | XGB={w2:.3f} | CB={w3:.3f}")
print(f"\nüéØ Blended CV AUC: {blend_score:.6f}")
print(f"\nüìä Baseline (Logistic Regression): ~0.9500")
# The '+' format flag prints the real sign; a hard-coded '+' prefix would show
# "+-0.12" whenever the blend is below the baseline.
print(f"üìä Our improvement: {(blend_score - 0.95)*100:+.2f}% AUC")


In [None]:
# Train a single LGB model for feature importance visualization
imp_model = lgb.LGBMClassifier(
    objective='binary', metric='auc', verbosity=-1, n_estimators=1000,
    random_state=SEED, **lgb_study.best_params
)
imp_model.fit(train_fe, y)

importance = pd.DataFrame({
    'feature': FEATURES,
    'importance': imp_model.feature_importances_
}).sort_values('importance', ascending=True).tail(20)

plt.figure(figsize=(10, 8))
plt.barh(importance['feature'], importance['importance'], color='#2ecc71')
plt.title('Top 20 Feature Importances (LightGBM)', fontsize=14)
plt.xlabel('Importance')
plt.tight_layout()
plt.show()


## Generate Submission


In [None]:
# Create blended test predictions
final_preds = w1 * lgb_test + w2 * xgb_test + w3 * cb_test

submission = pd.DataFrame({
    'id': test_ids,
    'Heart Disease': final_preds
})

submission.to_csv('submission.csv', index=False)
print(f"Submission shape: {submission.shape}")
print(f"\nPrediction stats:")
print(submission['Heart Disease'].describe())
submission.head(10)


In [None]:
# Left: OOF blend scores split by true class. Right: blended test scores.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_oof, ax_test = axes

for target_value, target_label in ((0, 'No HD'), (1, 'HD')):
    ax_oof.hist(blend_oof[y == target_value], bins=50, alpha=0.6,
                label=target_label, density=True)
ax_oof.set_title('OOF Prediction Distribution by Target')
ax_oof.legend()
ax_oof.set_xlabel('Predicted Probability')

ax_test.hist(final_preds, bins=50, alpha=0.7, color='#3498db', density=True)
ax_test.set_title('Test Prediction Distribution')
ax_test.set_xlabel('Predicted Probability')

plt.tight_layout()
plt.show()

print("Done: submission.csv is ready for upload.")
