## Ensemble
Esse script implementa um ensemble por m√©dia entre dois modelos de Gradient Boosting:
* LightGBM (rodando em GPU, com categorias nativas)
* XGBoost (rodando em CPU, com √°rvores histogram)

## 1.Bibliotecas

In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import warnings
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb

# Importa√ß√µes locais
from pathlib import Path
from setup_notebook import setup_path
setup_path()
from src.model_utils import *
from src.preprocess_utils_diab import *
from src.plot_metrica_class import *

print("\n#Processo iniciado em:", time.strftime("%H:%M:%S"))
start_inicial = time.time()


#Processo iniciado em: 11:55:32


## 2-DataLoad

In [2]:
BASE = Path.cwd().parent   

# # =====================================================
# # üìÅ 1. Leitura dos dados & Separa√ß√£o das bases
# # =====================================================

DATA_DIR = BASE / "data" / "raw"

train_df =pd.read_csv(DATA_DIR / "train.csv")
test_df =pd.read_csv(DATA_DIR / "test.csv")

TARGET = "diagnosed_diabetes"
ID_COL = "id"
X = train_df.drop(columns=[TARGET, ID_COL])
y = train_df[TARGET].values
X_test = test_df.drop(columns=[ID_COL])

binary_features = ['family_history_diabetes','hypertension_history','cardiovascular_history']
cat_features = X.select_dtypes(include='object').columns.tolist()
num_features = (X.select_dtypes(include='number').columns.difference(binary_features)).tolist()
X[cat_features].head(5)

Unnamed: 0,gender,ethnicity,education_level,income_level,smoking_status,employment_status
0,Female,Hispanic,Highschool,Lower-Middle,Current,Employed
1,Female,White,Highschool,Upper-Middle,Never,Employed
2,Male,Hispanic,Highschool,Lower-Middle,Never,Retired
3,Female,White,Highschool,Lower-Middle,Current,Employed
4,Male,White,Highschool,Upper-Middle,Never,Retired


### XGBoost Features 

In [3]:
#copiando variaveis
X_xgb = X.copy()
mem = X_xgb.memory_usage(deep=True).sum()
display(mem/1024**2)
X_test_xgb = X_test.copy()

for col in cat_features:
    le = LabelEncoder()
    X_xgb[col] = le.fit_transform(X_xgb[col])
    X_test_xgb[col] = le.transform(X_test_xgb[col])

352.2520446777344

128.1739501953125

### LightGBM Features 

In [6]:
X_lgb = X.copy()
mem = X_lgb.memory_usage(deep=True).sum()
display(mem/1024**2)
X_test_lgb = X_test.copy()
for col in cat_features:
    X_lgb[col] = X_lgb[col].astype("category")
    X_test_lgb[col] = X_test_lgb[col].astype("category")

X_lgb[cat_features].head(5)
mem = X_lgb.memory_usage(deep=True).sum()
display(mem/1024**2)

352.2520446777344

100.1382532119751

In [7]:
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# OOF predictions
oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))

# Test predictions
test_lgb = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))

# Parametros
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.02,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
    "predictor": "auto",
    "seed": 42
}

# LightGBM (GPU)
lgb_params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.02,
    "num_leaves": 128,
    "min_data_in_leaf": 50,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "lambda_l1": 0.1,
    "lambda_l2": 0.1,
    "verbosity": -1,
    "seed": 42
}

In [19]:
print("\n#Processo iniciado em:", time.strftime("%H:%M:%S"))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    start_p = time.time()

    #skf.split analisa os dados e retorna as posi√ß√µes
    print(f"\n===== Fold {fold} =====")

    # #Split data
    X_tr_lgb, X_val_lgb = X_lgb.iloc[train_idx], X_lgb.iloc[val_idx]
    X_tr_xgb, X_val_xgb = X_xgb.iloc[train_idx], X_xgb.iloc[val_idx]
    
    y_tr, y_val = y[train_idx], y[val_idx]

    # # ---------- LightGBM ----------
    lgb_train = lgb.Dataset(X_tr_lgb, y_tr, categorical_feature=cat_features)
    lgb_val   = lgb.Dataset(X_val_lgb, y_val, categorical_feature=cat_features, reference=lgb_train)

    lgb_model = lgb.train(
        params=lgb_params,
        train_set=lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_val],
        callbacks=[lgb.early_stopping(300)]
    )

    oof_lgb[val_idx] = lgb_model.predict(X_val_lgb, num_iteration=lgb_model.best_iteration)
    test_lgb += lgb_model.predict(X_test_lgb, num_iteration=lgb_model.best_iteration) / N_SPLITS

    # # ---------- XGBoost ----------
    dtrain = xgb.DMatrix(X_tr_xgb, label=y_tr)
    dval   = xgb.DMatrix(X_val_xgb, label=y_val)
    dtest  = xgb.DMatrix(X_test_xgb)

    xgb_model = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=10000,
        evals=[(dval, "val")],
        early_stopping_rounds=300,
        verbose_eval=False
    )

    oof_xgb[val_idx] = xgb_model.predict(dval, iteration_range=(0, xgb_model.best_iteration))
    test_xgb += xgb_model.predict(dtest, iteration_range=(0, xgb_model.best_iteration)) / N_SPLITS
    end_p = time.time()

    print(f"Tempo total: {end_p-start_p:.2f} segundos")

print("\n#Processo encerrado em:", time.strftime("%H:%M:%S"))



#Processo iniciado em: 13:06:04

===== Fold 1 =====
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1414]	valid_0's auc: 0.727427
Tempo total: 233.49 segundos

===== Fold 2 =====
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1500]	valid_0's auc: 0.725737
Tempo total: 238.96 segundos

===== Fold 3 =====
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1718]	valid_0's auc: 0.727113
Tempo total: 254.17 segundos

===== Fold 4 =====
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1397]	valid_0's auc: 0.727265
Tempo total: 224.68 segundos

===== Fold 5 =====
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1613]	valid_0's auc: 0.727596
Tempo total: 247.49 segundos

#Processo encerrado em: 13:26:03


## score

In [20]:
print("LightGBM CV AUC:", roc_auc_score(y, oof_lgb))
print("XGBoost CV AUC:", roc_auc_score(y, oof_xgb))

# Ensemble
oof_ensemble = 0.5 * oof_lgb + 0.5 * oof_xgb
test_ensemble = 0.5 * test_lgb + 0.5 * test_xgb
print("Ensemble CV AUC:", roc_auc_score(y, oof_ensemble))

LightGBM CV AUC: 0.7270260681816806
XGBoost CV AUC: 0.7261109408999977
Ensemble CV AUC: 0.7273126936333114


In [None]:
submission = pd.read_csv("/kaggle/input/playground-series-s5e12/sample_submission.csv")
submission[TARGET] = test_ensemble
submission.to_csv("ensemble_lgb_xgb_submission_teste.csv", index=False)

submission.head()