## Ensemble
Esse script implementa um ensemble por média entre dois modelos de Gradient Boosting:
* LightGBM (rodando em GPU, com categorias nativas)
* XGBoost (rodando em CPU, com árvores histogram)

## 1. Bibliotecas

In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import time
import warnings

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


# Local imports
from pathlib import Path
from setup_notebook import setup_path
setup_path()
from src.model_utils import *
from src.preprocess_utils_diab import *
from src.plot_metrica_class import *

# Log the wall-clock time at which the notebook run started and keep the epoch
# timestamp in `start_inicial` (presumably used later to compute elapsed time;
# it is not read anywhere in this part of the notebook).
print("\n#Processo iniciado em:", time.strftime("%H:%M:%S"))
start_inicial = time.time()


#Processo iniciado em: 16:56:01


## 2. DataLoad

In [5]:
# Project layout: every path below is resolved relative to the parent of the
# current working directory.
BASE = Path.cwd().parent
DATA_DIR = BASE / "data" / "raw"
DATA_MODELS = BASE / "models"

# -----------------------------------------------------
# 0. Preprocessors
# -----------------------------------------------------
# Each artifact is a mapping whose 'preprocessador' entry holds the fitted
# preprocessor. joblib.load unpickles its input, so only trusted artifacts
# should be loaded here.
PP_lgb = joblib.load(BASE / 'src' / 'preprocess_diabetes_v1.4.joblib')['preprocessador']
PP_xgb = joblib.load(BASE / 'src' / 'preprocess_diabetes_v1.31.joblib')['preprocessador']

# -----------------------------------------------------
# 1. Raw data: train / hold-out split and the Kaggle test set
# -----------------------------------------------------
X_train = pd.read_csv(DATA_DIR / "X_train_raw.csv").reset_index(drop=True)
y_train = pd.read_csv(DATA_DIR / "y_train_raw.csv").to_numpy().ravel()

# The hold-out split is read from the "*_test_raw.csv" files.
# Unlike y_train, y_val is kept as a DataFrame (it is not flattened).
X_val = pd.read_csv(DATA_DIR / "X_test_raw.csv")
y_val = pd.read_csv(DATA_DIR / "y_test_raw.csv")

# Kaggle test set: keep the ids for the submission file and drop them from
# the feature matrix.
base = pd.read_csv(DATA_DIR / "test.csv")
id_test = base["id"]
x_test = base.drop(columns=['id'])

# -----------------------------------------------------
# 2. Previously tuned model pipelines
# -----------------------------------------------------
pipe_XGB1 = joblib.load(DATA_MODELS / 'modelo_XGB_final_randsearch.roc_auc_v1.2.joblib')
pipe_LGBM1 = joblib.load(DATA_MODELS / 'modelo_LGBM_final_randsearch.roc_auc_v1.4.joblib')


In [11]:
print("\n#Processo iniciado em:", time.strftime("%H:%M:%S"))

# Stratified K-fold with a fixed seed so the fold assignment is reproducible.
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Out-of-fold (OOF) positive-class probabilities, one slot per training row.
oof_lgbm = np.zeros(len(X_train))
oof_xgb = np.zeros_like(oof_lgbm)

for train_idx, val_idx in skf.split(X_train, y_train):
    X_fit, y_fit = X_train.iloc[train_idx], y_train[train_idx]
    X_holdout = X_train.iloc[val_idx]

    # Fit both pipelines on the training part of the fold first ...
    for pipeline in (pipe_LGBM1, pipe_XGB1):
        pipeline.fit(X_fit, y_fit)

    # ... then score the held-out rows; the arrays are filled in place.
    for pipeline, oof_slot in ((pipe_LGBM1, oof_lgbm), (pipe_XGB1, oof_xgb)):
        oof_slot[val_idx] = pipeline.predict_proba(X_holdout)[:, 1]

# Equal-weight average of the two models' OOF probabilities.
oof_ensemble = 0.5 * oof_lgbm + 0.5 * oof_xgb

# Last expression of the cell: the notebook displays the OOF ROC AUC.
roc_auc_score(y_train, oof_ensemble)


#Processo iniciado em: 17:02:50


0.7280966997093421

In [31]:
# Grid search over the LightGBM share of the blend: 41 evenly spaced weights
# in [0, 1]; XGBoost receives the complement (1 - w).
weights = np.linspace(0.0, 1.0, num=41)

best_auc, best_w = 0, None

for w in weights:
    oof_mix = w * oof_lgbm + (1 - w) * oof_xgb
    auc = roc_auc_score(y_train, oof_mix)

    # Only a strictly better AUC replaces the incumbent, so ties keep the
    # first (smallest) weight.
    if not auc > best_auc:
        continue
    best_auc, best_w = auc, w

print(best_w)

# OOF blend at the selected weight. The weight is chosen and scored on the
# same OOF predictions, so this AUC is an optimistic estimate.
oof_ensemble2 = best_w * oof_lgbm + (1 - best_w) * oof_xgb
roc_auc_score(y_train, oof_ensemble2)

0.7250000000000001


0.7282051057756662

In [32]:
import numpy as np
from sklearn.metrics import f1_score, roc_curve

# Two alternative decision thresholds are derived from the OOF probabilities of
# the equal-weight blend (`oof_ensemble`):
#   1. the cut-off that maximises F1 over a fixed grid, and
#   2. the cut-off that maximises Youden's J (TPR - FPR) along the ROC curve.
# Each result gets its own name so that neither analysis overwrites the other.
#
# NOTE(review): both thresholds are tuned on the 50/50 blend, while the
# submission cell blends with 0.72 / 0.28 -- confirm whether `oof_ensemble2`
# should be used here instead.

# --- 1. F1-optimal threshold -------------------------------------------------
f1_thresholds = np.linspace(0.01, 0.99, 99)
f1_scores = [
    f1_score(y_train, (oof_ensemble >= t).astype(int))
    for t in f1_thresholds
]

best_idx_f1 = np.argmax(f1_scores)
best_threshold_f1 = f1_thresholds[best_idx_f1]
best_f1 = f1_scores[best_idx_f1]

print(f"üéØ Melhor threshold: {best_threshold_f1:.3f}")
print(f"‚≠ê F1 OOF m√°ximo: {best_f1:.4f}")

# --- 2. Youden's J threshold -------------------------------------------------
fpr, tpr, roc_thresholds = roc_curve(y_train, oof_ensemble)
youden = tpr - fpr

best_idx_youden = np.argmax(youden)
best_threshold_youden = roc_thresholds[best_idx_youden]

print(f"üéØ Threshold (Youden): {best_threshold_youden:.3f}")

# Backward-compatible aliases: earlier versions of this cell left the Youden
# results in these generic names, so they keep the same final values.
thresholds = roc_thresholds
best_idx = best_idx_youden
best_threshold = best_threshold_youden




üéØ Melhor threshold: 0.390
‚≠ê F1 OOF m√°ximo: 0.7808
üéØ Threshold (Youden): 0.615


In [35]:
print("\n# Processo de submiss√£o iniciado em:", time.strftime("%H:%M:%S"))

# =====================================================
# Kaggle submission -- LGBM + XGB ensemble
# =====================================================

# Refit both pipelines on the full training set before scoring the test set.
pipe_LGBM1.fit(X_train, y_train)
pipe_XGB1.fit(X_train, y_train)

# Positive-class probabilities on the Kaggle test set.
p_lgbm = pipe_LGBM1.predict_proba(x_test)[:, 1]
p_xgb = pipe_XGB1.predict_proba(x_test)[:, 1]

# Weighted blend.
# NOTE(review): the weights are hard-coded; the OOF grid search printed 0.725
# as the best LightGBM weight -- confirm that rounding to 0.72 is intended.
p_ensemble = 0.72 * p_lgbm + 0.28 * p_xgb

# Sanity check on the range of the blended probabilities.
print(p_ensemble.min(), p_ensemble.max())

# Hard 0/1 labels at the Youden threshold printed by the OOF analysis.
# NOTE(review): if the competition is scored on ROC AUC, submit `p_ensemble`
# directly instead of thresholded labels -- confirm the metric.
threshold = 0.615
y_pred = (p_ensemble >= threshold).astype(int)

# Submission frame.
# NOTE(review): "Survived" looks like a leftover from another competition;
# confirm the target column name against sample_submission.csv.
submission = pd.DataFrame({
    "id": id_test,
    "Survived": y_pred,
})

# Write next to the other project data (BASE/data/process) instead of a
# machine-specific absolute path, creating the directory if it is missing.
submission_path = (
    BASE / "data" / "process" / "submission_ENSEMBLE_LGBM_XGB_v1.2_th61.csv"
)
submission_path.parent.mkdir(parents=True, exist_ok=True)

submission.to_csv(submission_path, index=False)

print("‚úÖ Submiss√£o ensemble salva com sucesso!")
print("\n# Processo finalizado em:", time.strftime("%H:%M:%S"))



# Processo de submiss√£o iniciado em: 17:53:13
0.0313432095331086 0.9968922926761271
‚úÖ Submiss√£o ensemble salva com sucesso!

# Processo finalizado em: 17:54:10


In [38]:
## Stacking model
# A logistic-regression meta-learner is trained on the two base models'
# out-of-fold probabilities (one column per base model).
from sklearn.linear_model import LogisticRegression

X_meta = np.column_stack((oof_lgbm, oof_xgb))

meta = LogisticRegression(penalty="l2", C=1.0, solver="lbfgs")

# The meta-learner is fitted and scored on the same rows, so the AUC shown
# below is an in-sample figure for the meta-learner.
p_meta_oof = meta.fit(X_meta, y_train).predict_proba(X_meta)[:, 1]
roc_auc_score(y_train, p_meta_oof)


0.7281015803809952