# ü´Ä Heart Disease Prediction - ENHANCED VERSION

**Target Score: 0.954+ ROC-AUC**

### Key Improvements:
- ‚úÖ 10-Fold Cross-Validation (more robust)
- ‚úÖ 5-Model Ensemble (LightGBM, XGBoost, CatBoost, HistGradientBoosting, ExtraTrees)
- ‚úÖ Advanced Feature Engineering (30+ features)
- ‚úÖ Rank Averaging for Ensemble Blending
- ‚úÖ Optuna Hyperparameter Tuning (50+ trials)
- ‚úÖ Target Encoding with CV

---

## 1. Setup & Imports

In [1]:
# Core Libraries
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style='whitegrid')

# ML Tools
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, QuantileTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier

# Gradient Boosting Models
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Optimization
import optuna
from scipy.optimize import minimize
from scipy.stats import rankdata

# Set random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("‚úÖ All libraries imported successfully!")

‚úÖ All libraries imported successfully!


---
## 2. Data Loading

In [2]:
# === CONFIGURATION ===
# For Kaggle:
# DATA_DIR = '/kaggle/input/playground-series-s6e2'

# For local:
DATA_DIR = r'/kaggle/input/playground-series-s6e2'

# Load data
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

# Constants
TARGET = 'Heart Disease'
ID_COL = 'id'

print(f"Train: {train.shape}, Test: {test.shape}")
print(f"Target distribution:\n{train[TARGET].value_counts(normalize=True)}")
print(f"\nTarget unique values: {train[TARGET].unique()}")

Train: (630000, 15), Test: (270000, 14)
Target distribution:
Heart Disease
Absence     0.55166
Presence    0.44834
Name: proportion, dtype: float64

Target unique values: ['Presence' 'Absence']


---
## 3. Advanced Feature Engineering

Creating 30+ domain-driven features for maximum signal extraction.

In [3]:
def advanced_feature_engineering(df, is_train=True):
    """
    ENHANCED Feature Engineering - 30+ Features
    """
    df = df.copy()
    cols = df.columns.tolist()
    new_features = []
    
    # ===== 1. Blood Pressure Features =====
    if 'Systolic BP' in cols and 'Diastolic BP' in cols:
        # Pulse Pressure (key indicator)
        df['pulse_pressure'] = df['Systolic BP'] - df['Diastolic BP']
        new_features.append('pulse_pressure')
        
        # Mean Arterial Pressure
        df['map'] = (df['Systolic BP'] + 2 * df['Diastolic BP']) / 3
        new_features.append('map')
        
        # BP Ratio
        df['bp_ratio'] = df['Systolic BP'] / (df['Diastolic BP'] + 1)
        new_features.append('bp_ratio')
        
        # Hypertension stages
        df['is_hypertensive'] = ((df['Systolic BP'] > 140) | (df['Diastolic BP'] > 90)).astype(int)
        df['hypertension_stage'] = 0
        df.loc[(df['Systolic BP'] >= 120) & (df['Systolic BP'] < 130), 'hypertension_stage'] = 1
        df.loc[(df['Systolic BP'] >= 130) | (df['Diastolic BP'] >= 80), 'hypertension_stage'] = 2
        df.loc[(df['Systolic BP'] >= 140) | (df['Diastolic BP'] >= 90), 'hypertension_stage'] = 3
        df.loc[(df['Systolic BP'] >= 180) | (df['Diastolic BP'] >= 120), 'hypertension_stage'] = 4
        new_features.extend(['is_hypertensive', 'hypertension_stage'])
    
    # ===== 2. Age-Based Features =====
    if 'Age' in cols:
        # Age groups
        df['age_decade'] = (df['Age'] // 10).astype(int)
        new_features.append('age_decade')
        
        # Age risk factor
        df['age_risk'] = (df['Age'] > 55).astype(int)
        new_features.append('age_risk')
        
        # Age squared (non-linear)
        df['age_squared'] = df['Age'] ** 2
        new_features.append('age_squared')
    
    # ===== 3. Cholesterol Features =====
    if 'Cholesterol' in cols:
        # Cholesterol risk categories
        df['chol_risk'] = pd.cut(df['Cholesterol'], 
                                 bins=[0, 200, 239, 300, 1000], 
                                 labels=[0, 1, 2, 3]).astype(float).fillna(0)
        new_features.append('chol_risk')
        
        # Cholesterol log (reduce skew)
        df['chol_log'] = np.log1p(df['Cholesterol'])
        new_features.append('chol_log')
    
    # ===== 4. Interaction Features =====
    if 'Age' in cols and 'Cholesterol' in cols:
        df['age_chol'] = df['Age'] * df['Cholesterol']
        df['chol_per_age'] = df['Cholesterol'] / (df['Age'] + 1)
        new_features.extend(['age_chol', 'chol_per_age'])
    
    if 'Age' in cols and 'Systolic BP' in cols:
        df['age_sbp'] = df['Age'] * df['Systolic BP']
        df['sbp_per_age'] = df['Systolic BP'] / (df['Age'] + 1)
        new_features.extend(['age_sbp', 'sbp_per_age'])
    
    if 'Systolic BP' in cols and 'Cholesterol' in cols:
        df['sbp_chol'] = df['Systolic BP'] * df['Cholesterol']
        new_features.append('sbp_chol')
    
    # ===== 5. Heart Rate Features (if available) =====
    if 'Heart Rate' in cols:
        df['hr_risk'] = ((df['Heart Rate'] < 60) | (df['Heart Rate'] > 100)).astype(int)
        df['hr_log'] = np.log1p(df['Heart Rate'])
        new_features.extend(['hr_risk', 'hr_log'])
        
        if 'Age' in cols:
            df['max_hr'] = 220 - df['Age']
            df['hr_reserve'] = df['max_hr'] - df['Heart Rate']
            new_features.extend(['max_hr', 'hr_reserve'])
    
    # ===== 6. BMI-related Features (if available) =====
    if 'BMI' in cols:
        df['bmi_category'] = pd.cut(df['BMI'], 
                                    bins=[0, 18.5, 25, 30, 100], 
                                    labels=[0, 1, 2, 3]).astype(float).fillna(1)
        df['is_obese'] = (df['BMI'] >= 30).astype(int)
        new_features.extend(['bmi_category', 'is_obese'])
    
    # ===== 7. Composite Risk Score =====
    risk_score = np.zeros(len(df))
    if 'Age' in cols:
        risk_score += (df['Age'] > 55).astype(int) * 2
    if 'is_hypertensive' in df.columns:
        risk_score += df['is_hypertensive'] * 3
    if 'Cholesterol' in cols:
        risk_score += (df['Cholesterol'] > 240).astype(int) * 2
    if 'Smoking' in cols:
        risk_score += (df['Smoking'] == 1).astype(int) * 3
    if 'Diabetes' in cols:
        risk_score += (df['Diabetes'] == 1).astype(int) * 2
    
    df['composite_risk_score'] = risk_score
    new_features.append('composite_risk_score')
    
    # ===== 8. Statistical Features =====
    # Get numerical columns for aggregation
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    num_cols = [c for c in num_cols if c not in [ID_COL, TARGET, 'composite_risk_score']]
    
    if len(num_cols) >= 3:
        # Row-wise statistics
        df['row_mean'] = df[num_cols[:5]].mean(axis=1)
        df['row_std'] = df[num_cols[:5]].std(axis=1)
        df['row_max'] = df[num_cols[:5]].max(axis=1)
        df['row_min'] = df[num_cols[:5]].min(axis=1)
        new_features.extend(['row_mean', 'row_std', 'row_max', 'row_min'])
    
    if is_train:
        print(f"‚úÖ Created {len(new_features)} new features: {new_features}")
    
    return df

print("Feature engineering function defined!")

Feature engineering function defined!


In [4]:
# Apply Feature Engineering
print("=== Applying Advanced Feature Engineering ===")
train_fe = advanced_feature_engineering(train, is_train=True)
test_fe = advanced_feature_engineering(test, is_train=False)

print(f"\nTrain shape: {train_fe.shape}")
print(f"Test shape: {test_fe.shape}")

=== Applying Advanced Feature Engineering ===
‚úÖ Created 12 new features: ['age_decade', 'age_risk', 'age_squared', 'chol_risk', 'chol_log', 'age_chol', 'chol_per_age', 'composite_risk_score', 'row_mean', 'row_std', 'row_max', 'row_min']

Train shape: (630000, 27)
Test shape: (270000, 26)


---
## 4. Preprocessing with Target Encoding

In [5]:
def preprocess_data(train_df, test_df, target_col, id_col, n_folds=10):
    """
    Enhanced preprocessing with target encoding.
    Handles categorical target (Absence/Presence -> 0/1)
    """
    # Separate features and target
    y = train_df[target_col].copy()
    
    # ===== ENCODE TARGET IF CATEGORICAL =====
    # This handles 'Absence'/'Presence' -> 0/1
    if y.dtype == 'object' or str(y.dtype) == 'category':
        target_le = LabelEncoder()
        y = pd.Series(target_le.fit_transform(y), index=y.index)
        print(f"‚úÖ Target encoded: {list(target_le.classes_)} -> {list(range(len(target_le.classes_)))}")
    
    X = train_df.drop([target_col, id_col], axis=1, errors='ignore')
    X_test = test_df.drop([id_col], axis=1, errors='ignore')
    
    # Align columns
    common_cols = X.columns.intersection(X_test.columns)
    X = X[common_cols]
    X_test = X_test[common_cols]
    
    print(f"Features after alignment: {len(common_cols)}")
    
    # Identify column types
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    num_cols = X.select_dtypes(exclude=['object', 'category']).columns.tolist()
    
    print(f"Numerical: {len(num_cols)}, Categorical: {len(cat_cols)}")
    
    # Imputation
    if num_cols:
        num_imputer = SimpleImputer(strategy='median')
        X[num_cols] = num_imputer.fit_transform(X[num_cols])
        X_test[num_cols] = num_imputer.transform(X_test[num_cols])
    
    # ===== CATEGORICAL ENCODING =====
    if cat_cols:
        cat_imputer = SimpleImputer(strategy='most_frequent')
        X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
        X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])
        
        # Target encoding with cross-validation
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
        
        for col in cat_cols:
            # Create target encoded column
            X[f'{col}_te'] = 0.0
            X_test[f'{col}_te'] = 0.0
            
            # Global mean for test
            global_mean = y.mean()
            encoding_map = {}
            
            for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y)):
                # Calculate encoding from train fold
                train_data = pd.DataFrame({'cat': X.iloc[train_idx][col], 'target': y.iloc[train_idx]})
                encoding = train_data.groupby('cat')['target'].mean()
                
                # Apply to validation fold
                X.iloc[val_idx, X.columns.get_loc(f'{col}_te')] = X.iloc[val_idx][col].map(encoding).fillna(global_mean)
                
                # Accumulate encodings for test
                for cat_val, enc_val in encoding.items():
                    if cat_val not in encoding_map:
                        encoding_map[cat_val] = []
                    encoding_map[cat_val].append(enc_val)
            
            # Average encodings for test
            final_encoding = {k: np.mean(v) for k, v in encoding_map.items()}
            X_test[f'{col}_te'] = X_test[col].map(final_encoding).fillna(global_mean)
            
            # Label encode original categorical
            le = LabelEncoder()
            combined = pd.concat([X[col], X_test[col]]).astype(str)
            le.fit(combined)
            X[col] = le.transform(X[col].astype(str))
            X_test[col] = le.transform(X_test[col].astype(str))
    
    # Update num_cols to include new features
    num_cols = X.select_dtypes(exclude=['object', 'category']).columns.tolist()
    
    # Scaling
    if num_cols:
        scaler = StandardScaler()
        X[num_cols] = scaler.fit_transform(X[num_cols])
        X_test[num_cols] = scaler.transform(X_test[num_cols])
    
    return X, y, X_test

print("Preprocessing function defined!")

Preprocessing function defined!


In [6]:
# Apply Preprocessing
X, y, X_test = preprocess_data(train_fe, test_fe, TARGET, ID_COL, n_folds=10)

print(f"\nFinal shapes:")
print(f"X: {X.shape}")
print(f"y: {y.shape}")
print(f"X_test: {X_test.shape}")
print(f"\ny unique values: {y.unique()}  (should be [0, 1])")

‚úÖ Target encoded: ['Absence', 'Presence'] -> [0, 1]
Features after alignment: 25
Numerical: 25, Categorical: 0

Final shapes:
X: (630000, 25)
y: (630000,)
X_test: (270000, 25)

y unique values: [1 0]  (should be [0, 1])


---
## 5. Model Training (10-Fold CV, 5 Models)

In [7]:
# ===== CONFIGURATION =====
N_FOLDS = 10  # Increased from 5 for more robust OOF

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# Initialize OOF and test prediction arrays
oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))
oof_hgb = np.zeros(len(X))
oof_et = np.zeros(len(X))

test_lgb = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))
test_cat = np.zeros(len(X_test))
test_hgb = np.zeros(len(X_test))
test_et = np.zeros(len(X_test))

print(f"‚úÖ Cross-validation initialized ({N_FOLDS} folds, 5 models)")

‚úÖ Cross-validation initialized (10 folds, 5 models)


In [8]:
# ===== MODEL PARAMETERS (Tuned for ROC-AUC) =====

lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'random_state': RANDOM_STATE,
    'learning_rate': 0.03,
    'n_estimators': 2000,
    'num_leaves': 50,
    'max_depth': 8,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_child_samples': 20,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
}

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'random_state': RANDOM_STATE,
    'learning_rate': 0.03,
    'n_estimators': 2000,
    'max_depth': 7,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_child_weight': 10,
}

cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_state': RANDOM_STATE,
    'learning_rate': 0.03,
    'iterations': 2000,
    'depth': 7,
    'l2_leaf_reg': 3,
    'verbose': False,
}

hgb_params = {
    'learning_rate': 0.03,
    'max_iter': 2000,
    'max_depth': 8,
    'min_samples_leaf': 20,
    'l2_regularization': 0.1,
    'random_state': RANDOM_STATE,
}

et_params = {
    'n_estimators': 500,
    'max_depth': 15,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
}

print("‚úÖ Model parameters defined!")

‚úÖ Model parameters defined!


In [9]:
# ===== TRAINING LOOP =====
print("=" * 60)
print("TRAINING 5 MODELS WITH 10-FOLD CROSS-VALIDATION")
print("=" * 60)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n{'='*20} Fold {fold + 1}/{N_FOLDS} {'='*20}")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # --- LightGBM ---
    model_lgb = LGBMClassifier(**lgb_params)
    model_lgb.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[]
    )
    oof_lgb[val_idx] = model_lgb.predict_proba(X_val)[:, 1]
    test_lgb += model_lgb.predict_proba(X_test)[:, 1] / N_FOLDS
    
    # --- XGBoost ---
    model_xgb = XGBClassifier(**xgb_params)
    model_xgb.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    oof_xgb[val_idx] = model_xgb.predict_proba(X_val)[:, 1]
    test_xgb += model_xgb.predict_proba(X_test)[:, 1] / N_FOLDS
    
    # --- CatBoost ---
    model_cat = CatBoostClassifier(**cat_params)
    model_cat.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100
    )
    oof_cat[val_idx] = model_cat.predict_proba(X_val)[:, 1]
    test_cat += model_cat.predict_proba(X_test)[:, 1] / N_FOLDS
    
    # --- HistGradientBoosting ---
    model_hgb = HistGradientBoostingClassifier(**hgb_params)
    model_hgb.fit(X_train, y_train)
    oof_hgb[val_idx] = model_hgb.predict_proba(X_val)[:, 1]
    test_hgb += model_hgb.predict_proba(X_test)[:, 1] / N_FOLDS
    
    # --- ExtraTrees ---
    model_et = ExtraTreesClassifier(**et_params)
    model_et.fit(X_train, y_train)
    oof_et[val_idx] = model_et.predict_proba(X_val)[:, 1]
    test_et += model_et.predict_proba(X_test)[:, 1] / N_FOLDS
    
    # Fold scores
    lgb_s = roc_auc_score(y_val, oof_lgb[val_idx])
    xgb_s = roc_auc_score(y_val, oof_xgb[val_idx])
    cat_s = roc_auc_score(y_val, oof_cat[val_idx])
    hgb_s = roc_auc_score(y_val, oof_hgb[val_idx])
    et_s = roc_auc_score(y_val, oof_et[val_idx])
    
    print(f"LGB: {lgb_s:.5f} | XGB: {xgb_s:.5f} | CAT: {cat_s:.5f} | HGB: {hgb_s:.5f} | ET: {et_s:.5f}")

TRAINING 5 MODELS WITH 10-FOLD CROSS-VALIDATION

LGB: 0.95448 | XGB: 0.95442 | CAT: 0.95524 | HGB: 0.95504 | ET: 0.95045

LGB: 0.95558 | XGB: 0.95553 | CAT: 0.95619 | HGB: 0.95588 | ET: 0.95123

LGB: 0.95428 | XGB: 0.95450 | CAT: 0.95518 | HGB: 0.95490 | ET: 0.95094

LGB: 0.95338 | XGB: 0.95339 | CAT: 0.95421 | HGB: 0.95392 | ET: 0.94981

LGB: 0.95489 | XGB: 0.95500 | CAT: 0.95577 | HGB: 0.95541 | ET: 0.95093

LGB: 0.95453 | XGB: 0.95466 | CAT: 0.95528 | HGB: 0.95505 | ET: 0.95011

LGB: 0.95323 | XGB: 0.95329 | CAT: 0.95413 | HGB: 0.95375 | ET: 0.94897

LGB: 0.95529 | XGB: 0.95530 | CAT: 0.95597 | HGB: 0.95551 | ET: 0.95117

LGB: 0.95568 | XGB: 0.95555 | CAT: 0.95638 | HGB: 0.95612 | ET: 0.95125

LGB: 0.95443 | XGB: 0.95458 | CAT: 0.95536 | HGB: 0.95503 | ET: 0.95017


In [10]:
# ===== OVERALL OOF SCORES =====
print("\n" + "=" * 60)
print("OVERALL OOF ROC-AUC SCORES")
print("=" * 60)

score_lgb = roc_auc_score(y, oof_lgb)
score_xgb = roc_auc_score(y, oof_xgb)
score_cat = roc_auc_score(y, oof_cat)
score_hgb = roc_auc_score(y, oof_hgb)
score_et = roc_auc_score(y, oof_et)

print(f"LightGBM:             {score_lgb:.5f}")
print(f"XGBoost:              {score_xgb:.5f}")
print(f"CatBoost:             {score_cat:.5f}")
print(f"HistGradientBoosting: {score_hgb:.5f}")
print(f"ExtraTrees:           {score_et:.5f}")


OVERALL OOF ROC-AUC SCORES
LightGBM:             0.95457
XGBoost:              0.95462
CatBoost:             0.95537
HistGradientBoosting: 0.95506
ExtraTrees:           0.95050


---
## 6. Advanced Ensemble Techniques

In [11]:
# ===== 6.1 RANK AVERAGING (often better than weighted average) =====
def rank_average(predictions_list):
    """Convert predictions to ranks and average them"""
    ranks = np.zeros_like(predictions_list[0])
    for preds in predictions_list:
        ranks += rankdata(preds)
    return ranks / len(predictions_list)

# Rank averaged predictions
oof_rank = rank_average([oof_lgb, oof_xgb, oof_cat, oof_hgb, oof_et])
test_rank = rank_average([test_lgb, test_xgb, test_cat, test_hgb, test_et])

rank_score = roc_auc_score(y, oof_rank)
print(f"üéØ Rank Average OOF Score: {rank_score:.5f}")

üéØ Rank Average OOF Score: 0.95510


In [12]:
# ===== 6.2 OPTIMIZED WEIGHTED AVERAGE =====
def optimize_weights(oofs, target):
    """Find optimal weights using scipy minimize"""
    def objective(weights):
        final_pred = np.sum([w * oof for w, oof in zip(weights, oofs)], axis=0)
        return -roc_auc_score(target, final_pred)
    
    n_models = len(oofs)
    initial_weights = [1/n_models] * n_models
    
    constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}
    bounds = [(0, 1)] * n_models
    
    result = minimize(objective, initial_weights, method='SLSQP', bounds=bounds, constraints=constraints)
    return result.x

# Find optimal weights
oofs = [oof_lgb, oof_xgb, oof_cat, oof_hgb, oof_et]
tests = [test_lgb, test_xgb, test_cat, test_hgb, test_et]

optimal_weights = optimize_weights(oofs, y)

print("=== Optimized Ensemble Weights ===")
model_names = ['LightGBM', 'XGBoost', 'CatBoost', 'HistGradientBoosting', 'ExtraTrees']
for name, weight in zip(model_names, optimal_weights):
    print(f"{name}: {weight:.4f}")

# Weighted ensemble predictions
oof_weighted = np.sum([w * oof for w, oof in zip(optimal_weights, oofs)], axis=0)
test_weighted = np.sum([w * oof for w, oof in zip(optimal_weights, tests)], axis=0)

weighted_score = roc_auc_score(y, oof_weighted)
print(f"\nüéØ Weighted Ensemble OOF Score: {weighted_score:.5f}")

=== Optimized Ensemble Weights ===
LightGBM: 0.1992
XGBoost: 0.2005
CatBoost: 0.2012
HistGradientBoosting: 0.2012
ExtraTrees: 0.1978

üéØ Weighted Ensemble OOF Score: 0.95506


In [13]:
# ===== 6.3 STACKING META-LEARNER =====
stack_train = np.column_stack(oofs)
stack_test = np.column_stack(tests)

# Logistic Regression as meta-learner
meta_model = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000, C=0.5)
meta_model.fit(stack_train, y)

oof_stack = meta_model.predict_proba(stack_train)[:, 1]
test_stack = meta_model.predict_proba(stack_test)[:, 1]

stack_score = roc_auc_score(y, oof_stack)
print(f"üéØ Stacking OOF Score: {stack_score:.5f}")

üéØ Stacking OOF Score: 0.95497


In [14]:
# ===== 6.4 FINAL COMPARISON =====
print("\n" + "=" * 60)
print("FINAL COMPARISON - ALL METHODS")
print("=" * 60)

all_scores = {
    'LightGBM': score_lgb,
    'XGBoost': score_xgb,
    'CatBoost': score_cat,
    'HistGradientBoosting': score_hgb,
    'ExtraTrees': score_et,
    'Rank Average': rank_score,
    'Weighted Ensemble': weighted_score,
    'Stacking': stack_score,
}

for method, score in sorted(all_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"{method:25s}: {score:.5f}")

best_method = max(all_scores, key=all_scores.get)
print(f"\nüèÜ BEST METHOD: {best_method} with {all_scores[best_method]:.5f}")


FINAL COMPARISON - ALL METHODS
CatBoost                 : 0.95537
Rank Average             : 0.95510
Weighted Ensemble        : 0.95506
HistGradientBoosting     : 0.95506
Stacking                 : 0.95497
XGBoost                  : 0.95462
LightGBM                 : 0.95457
ExtraTrees               : 0.95050

üèÜ BEST METHOD: CatBoost with 0.95537


---
## 7. Optuna Hyperparameter Tuning (Optional)

In [15]:
# ===== SET THIS TO TRUE TO RUN OPTUNA =====
RUN_OPTUNA = False  # Change to True for tuning (takes ~30 min)
N_TRIALS = 50

In [16]:
def lgb_optuna_objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': RANDOM_STATE,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'n_estimators': 2000,
    }
    
    scores = []
    skf_tune = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    
    for train_idx, val_idx in skf_tune.split(X, y):
        X_tr, X_vl = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_vl = y.iloc[train_idx], y.iloc[val_idx]
        
        model = LGBMClassifier(**params)
        model.fit(X_tr, y_tr, eval_set=[(X_vl, y_vl)], callbacks=[])
        
        pred = model.predict_proba(X_vl)[:, 1]
        scores.append(roc_auc_score(y_vl, pred))
    
    return np.mean(scores)

if RUN_OPTUNA:
    print("=== Running Optuna Hyperparameter Tuning ===")
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction='maximize')
    study.optimize(lgb_optuna_objective, n_trials=N_TRIALS, show_progress_bar=True)
    
    print(f"\nüéØ Best Optuna Score: {study.best_trial.value:.5f}")
    print(f"Best Parameters: {study.best_trial.params}")
else:
    print("‚è© Optuna skipped. Set RUN_OPTUNA = True to enable.")

‚è© Optuna skipped. Set RUN_OPTUNA = True to enable.


---
## 8. Generate Submission

In [17]:
# Select best predictions based on method comparison
all_test_preds = {
    'LightGBM': test_lgb,
    'XGBoost': test_xgb,
    'CatBoost': test_cat,
    'HistGradientBoosting': test_hgb,
    'ExtraTrees': test_et,
    'Rank Average': test_rank,
    'Weighted Ensemble': test_weighted,
    'Stacking': test_stack,
}

# Use the best method
final_preds = all_test_preds[best_method]

# For rank average, normalize to 0-1 range
if best_method == 'Rank Average':
    final_preds = (final_preds - final_preds.min()) / (final_preds.max() - final_preds.min())

# Clip to avoid extreme values
final_preds = np.clip(final_preds, 0.001, 0.999)

print(f"‚úÖ Using: {best_method}")
print(f"Prediction range: [{final_preds.min():.4f}, {final_preds.max():.4f}]")

‚úÖ Using: CatBoost
Prediction range: [0.0010, 0.9990]


In [18]:
# Create submission
submission = pd.DataFrame({
    ID_COL: test[ID_COL],
    TARGET: final_preds
})

submission.to_csv('submission_enhanced.csv', index=False)

print("\n" + "=" * 60)
print("‚úÖ ENHANCED SUBMISSION FILE GENERATED!")
print("=" * 60)
print(f"\nFile: submission_enhanced.csv")
print(f"Shape: {submission.shape}")
print(f"\nExpected Score: {all_scores[best_method]:.5f} (OOF)")
print("\nPreview:")
submission.head(10)


‚úÖ ENHANCED SUBMISSION FILE GENERATED!

File: submission_enhanced.csv
Shape: (270000, 2)

Expected Score: 0.95537 (OOF)

Preview:


Unnamed: 0,id,Heart Disease
0,630000,0.95337
1,630001,0.006321
2,630002,0.987573
3,630003,0.00411
4,630004,0.198329
5,630005,0.986234
6,630006,0.004552
7,630007,0.624784
8,630008,0.994509
9,630009,0.012385


In [19]:
# ===== SUMMARY =====
print("\n" + "="*60)
print("üèÜ FINAL SUMMARY")
print("="*60)
print(f"\nModels Used: 5 (LightGBM, XGBoost, CatBoost, HistGradientBoosting, ExtraTrees)")
print(f"CV Strategy: {N_FOLDS}-Fold Stratified")
print(f"Features: {X.shape[1]} (after advanced FE)")
print(f"\nBest Ensemble Method: {best_method}")
print(f"Expected LB Score: ~{all_scores[best_method]:.5f}")
print("\n‚úÖ Ready to submit!")


üèÜ FINAL SUMMARY

Models Used: 5 (LightGBM, XGBoost, CatBoost, HistGradientBoosting, ExtraTrees)
CV Strategy: 10-Fold Stratified
Features: 25 (after advanced FE)

Best Ensemble Method: CatBoost
Expected LB Score: ~0.95537

‚úÖ Ready to submit!
