# Advanced Ensemble Model on Feature-Engineered Dataset

This notebook builds an ensemble model (LightGBM + XGBoost) on the engineered feature set in `data/train_feature_engineered.csv`.
It performs 5-fold stratified cross-validation with early stopping, blends the two models' out-of-fold predictions (0.6 LightGBM / 0.4 XGBoost), and persists the trained models plus evaluation metadata for downstream use.

In [None]:
from pathlib import Path
import json

import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    f1_score,
)

import lightgbm as lgb
from xgboost import XGBClassifier

# Input: the feature-engineered training table, relative to the working directory.
DATA_PATH = Path('data/train_feature_engineered.csv')
# Output: directory that receives the trained models, metrics and metadata.
OUTPUT_DIR = Path('output/models/advanced')
# Create the output directory (and any missing parents) up front; a no-op if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
# Load the engineered training table and report its size and label distribution.
df = pd.read_csv(DATA_PATH)
print(f'Dataset shape: {df.shape}')

print('Class balance:')
# Fraction of rows per TARGET value, labelled 'share' for display.
class_share = df['TARGET'].value_counts(normalize=True).rename('share')
print(class_share)

# Last expression of the cell: preview the first rows.
df.head()


In [None]:
# Build the feature matrix X, the label vector y and the class-imbalance weight.

# Cast boolean columns to uint8 (0/1). `len(...)` is used because the truth
# value of a pandas Index is ambiguous.
bool_cols = df.select_dtypes(include='bool').columns
if len(bool_cols):
    df[bool_cols] = df[bool_cols].astype('uint8')

# Features are every column except the label and the SK_ID_CURR key column.
X = df.drop(columns=['TARGET', 'SK_ID_CURR'])
y = df['TARGET'].astype(int)

# Downcast float64 features to float32 to reduce memory use.
float_cols = X.select_dtypes(include=['float64']).columns
if len(float_cols):
    X[float_cols] = X[float_cols].astype(np.float32)

# Negatives per positive, used as `scale_pos_weight` by both boosters.
# NOTE(review): assumes TARGET is coded 0/1 so that y.sum() counts positives.
n_pos = int(y.sum())
n_neg = len(y) - n_pos
if n_pos == 0 or n_neg == 0:
    # Fail fast: a single-class target would otherwise yield an infinite or
    # zero weight and only surface as an error after training has started.
    raise ValueError(
        f'TARGET must contain both classes (positives={n_pos}, negatives={n_neg})'
    )
scale_pos_weight = n_neg / n_pos
print(f'scale_pos_weight: {scale_pos_weight:.2f}')
print(f'Feature matrix: {X.shape[0]} rows x {X.shape[1]} columns')


In [None]:
def find_best_threshold(y_true, y_score):
    """Return the decision threshold that maximises F1, and that F1 value.

    Args:
        y_true: Ground-truth binary labels.
        y_score: Predicted scores for the positive class.

    Returns:
        A ``(threshold, f1)`` pair. If the precision-recall curve yields no
        thresholds, the threshold defaults to 0.5 and F1 is computed there.
    """
    prec, rec, cutoffs = precision_recall_curve(y_true, y_score)

    if cutoffs.size == 0:
        # Nothing to search over: score the conventional 0.5 cut-off instead.
        return 0.5, f1_score(y_true, (y_score >= 0.5).astype(int))

    # The curve arrays are sliced to drop their final entry so they line up
    # with the thresholds array.
    p, r = prec[:-1], rec[:-1]
    # The small epsilon avoids 0/0 where precision and recall are both zero.
    f1_curve = 2 * p * r / (p + r + 1e-12)

    top = int(np.argmax(f1_curve))
    return float(cutoffs[top]), float(f1_curve[top])

# LightGBM hyper-parameters shared by the per-fold models and the final model.
# `n_estimators` is an upper bound during cross-validation, where early
# stopping is applied; `scale_pos_weight` comes from the feature-prep cell.
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.03,
    'n_estimators': 2000,
    'num_leaves': 96,
    'max_depth': -1,
    'subsample': 0.85,
    'subsample_freq': 1,
    'colsample_bytree': 0.65,
    'reg_alpha': 0.15,
    'reg_lambda': 1.2,
    'min_child_samples': 40,
    'min_split_gain': 0.01,
    'random_state': 42,
    'n_jobs': -1,
    'scale_pos_weight': scale_pos_weight,
}

# XGBoost hyper-parameters shared by the per-fold models and the final model.
# `n_estimators` is an upper bound during cross-validation, where early
# stopping is applied; `scale_pos_weight` comes from the feature-prep cell.
xgb_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.02,
    'n_estimators': 2500,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'reg_alpha': 0.2,
    'reg_lambda': 1.0,
    'min_child_weight': 5,
    'gamma': 0.2,
    'eval_metric': 'auc',
    'scale_pos_weight': scale_pos_weight,
    'tree_method': 'hist',
    'random_state': 42,
    'n_jobs': -1,
}

# Shuffled, seeded 5-fold splitter that stratifies on the label.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
# Stratified cross-validation of the LightGBM + XGBoost blend.
#
# For each fold both models are trained with early stopping on the validation
# fold, their validation predictions are stored as out-of-fold (OOF) scores,
# and the 0.6 / 0.4 blend is evaluated on that same fold.
# NOTE: the validation fold both drives early stopping and is scored below,
# so the per-fold metrics may be slightly optimistic.

# One OOF buffer per base model; each row is filled exactly once, by the fold
# in which that row belongs to the validation split.
oof_preds = {
    'lgbm': np.zeros(len(X), dtype=np.float32),
    'xgb': np.zeros(len(X), dtype=np.float32),
}
fold_metrics = []
lgb_best_iterations = []
xgb_best_iterations = []
feature_importances = []

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), start=1):
    print(f'Fold {fold}')
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # --- LightGBM -----------------------------------------------------------
    lgb_model = lgb.LGBMClassifier(**lgbm_params)
    lgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(150), lgb.log_evaluation(period=0)],
    )
    # Fall back to the configured tree count when no best iteration is reported.
    best_iter = lgb_model.best_iteration_ or lgbm_params['n_estimators']
    lgb_best_iterations.append(int(best_iter))
    preds_lgb = lgb_model.predict_proba(X_valid)[:, 1]
    oof_preds['lgbm'][valid_idx] = preds_lgb

    # Gain-based importances per fold; they are averaged across folds later.
    feature_importances.append(
        pd.DataFrame({
            'feature': X.columns,
            'importance': lgb_model.booster_.feature_importance(importance_type='gain'),
            'fold': fold,
        })
    )

    # --- XGBoost ------------------------------------------------------------
    # `early_stopping_rounds` is passed to the constructor: the `fit()` keyword
    # was deprecated in XGBoost 1.6 and removed in later releases, where it
    # raises TypeError. It is added here rather than to `xgb_params` because
    # the final model is fitted without an eval_set.
    xgb_model = XGBClassifier(**xgb_params, early_stopping_rounds=200)
    xgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False,
    )
    best_iter_xgb = getattr(xgb_model, 'best_iteration', None)
    if best_iter_xgb is None:
        best_iter_xgb = getattr(xgb_model, 'best_iteration_', xgb_params['n_estimators'])
    else:
        # XGBoost reports a 0-based round index; +1 converts it to a tree count.
        best_iter_xgb = int(best_iter_xgb) + 1
    xgb_best_iterations.append(int(best_iter_xgb))
    preds_xgb = xgb_model.predict_proba(X_valid)[:, 1]
    oof_preds['xgb'][valid_idx] = preds_xgb

    # --- Blend and score this fold ------------------------------------------
    blended = 0.6 * preds_lgb + 0.4 * preds_xgb
    roc_auc = roc_auc_score(y_valid, blended)
    ap = average_precision_score(y_valid, blended)
    best_threshold, best_f1 = find_best_threshold(y_valid, blended)

    fold_metrics.append({
        'fold': fold,
        'roc_auc': roc_auc,
        'average_precision': ap,
        'best_threshold': best_threshold,
        'best_f1': best_f1,
        'lgb_best_iteration': int(best_iter),
        'xgb_best_iteration': int(best_iter_xgb),
    })

# Last expression of the cell: display the per-fold metrics table.
fold_metrics_df = pd.DataFrame(fold_metrics)
fold_metrics_df


In [None]:
# Score the blended out-of-fold predictions over the full training set.
lgbm_oof, xgb_oof = oof_preds['lgbm'], oof_preds['xgb']
blended_oof = 0.6 * lgbm_oof + 0.4 * xgb_oof

overall_roc = roc_auc_score(y, blended_oof)
overall_ap = average_precision_score(y, blended_oof)
# Threshold search on the pooled OOF scores; overwrites the per-fold values.
best_threshold, best_f1 = find_best_threshold(y, blended_oof)

print(f'OOF ROC-AUC: {overall_roc:.4f}')
print(f'OOF Average Precision: {overall_ap:.4f}')
print(f'Best global threshold: {best_threshold:.3f} -> F1 {best_f1:.4f}')

# Average each feature's LightGBM importance across folds, largest first.
stacked_importance = pd.concat(feature_importances)
mean_importance = stacked_importance.groupby('feature', as_index=False)['importance'].mean()
feature_importance_df = mean_importance.sort_values(by='importance', ascending=False)

# Last expression of the cell: show the 20 most important features.
feature_importance_df.head(20)


In [None]:
# Refit both models on the full data set and persist every artifact.

# Tree counts for the final models: the rounded mean of the per-fold best
# iterations recorded during cross-validation.
avg_lgb_iter = int(np.mean(lgb_best_iterations).round())
avg_xgb_iter = int(np.mean(xgb_best_iterations).round())

print(f'Average best iteration -> LightGBM: {avg_lgb_iter}, XGBoost: {avg_xgb_iter}')

# Same hyper-parameters as in cross-validation, with only n_estimators replaced.
final_lgbm = lgb.LGBMClassifier(**dict(lgbm_params, n_estimators=avg_lgb_iter))
final_xgb = XGBClassifier(**dict(xgb_params, n_estimators=avg_xgb_iter))

final_lgbm.fit(X, y)
final_xgb.fit(X, y, verbose=False)

# Serialise the fitted estimators.
for estimator, filename in (
    (final_lgbm, 'lgbm_advanced_model.pkl'),
    (final_xgb, 'xgb_advanced_model.pkl'),
):
    joblib.dump(estimator, OUTPUT_DIR / filename)

# Run metadata: values are cast to built-in types before JSON serialisation.
class_balance = {
    'positive': int(y.sum()),
    'negative': int((1 - y).sum()),
    'positive_rate': float(y.mean()),
}
overall_metrics = {
    'roc_auc': float(overall_roc),
    'average_precision': float(overall_ap),
    'best_threshold': float(best_threshold),
    'best_f1': float(best_f1),
}
metadata = {
    'data_path': str(DATA_PATH.resolve()),
    'n_samples': int(X.shape[0]),
    'n_features': int(X.shape[1]),
    'class_balance': class_balance,
    'scale_pos_weight': float(scale_pos_weight),
    'avg_lgb_iterations': avg_lgb_iter,
    'avg_xgb_iterations': avg_xgb_iter,
    'overall_metrics': overall_metrics,
    'cv_metrics': fold_metrics,
}

metadata_path = OUTPUT_DIR / 'advanced_metadata.json'
with metadata_path.open('w', encoding='utf-8') as fp:
    json.dump(metadata, fp, indent=2)

# Tabular artifacts: averaged importances, per-fold metrics, OOF predictions.
feature_importance_df.to_csv(OUTPUT_DIR / 'lgbm_feature_importance.csv', index=False)
fold_metrics_df.to_csv(OUTPUT_DIR / 'cv_metrics.csv', index=False)

oof_frame = pd.DataFrame({
    'SK_ID_CURR': df['SK_ID_CURR'],
    'TARGET': y,
    'BLENDED_PROB': blended_oof,
})
oof_frame.to_csv(OUTPUT_DIR / 'oof_predictions.csv', index=False)

print('Artifacts saved under', OUTPUT_DIR.resolve())


## Next Steps
- Run this notebook to train the ensemble and review the saved metrics under `output/models/advanced`.
- If GPU resources are available, speed up XGBoost training by setting `device='cuda'` (XGBoost 2.0 and later, keeping `tree_method='hist'`) or `tree_method='gpu_hist'` (older releases).
- Consider tuning the blend weights or adding calibrated models (e.g., CatBoost) for further lift.