In [None]:
# ============================================================
# 04 - Comparação ML: XGBoost + Análise SHAP
# Fonte: SUSEP AUTOSEG (2019-2021)
# Autor: Arthur Pontes Motta
# ============================================================

import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
warnings.filterwarnings('ignore')

sys.path.append('../src')
from modeling import gini_coefficient, double_lift_chart, evaluate_model

# Plot styling shared by every figure produced in this notebook
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 5)

# Input / output locations, relative to the notebook's working directory
PROCESSED_PATH = '../data/processed'
FIGURES_PATH = '../reports/figures'

# plt.savefig does not create missing directories, so make sure the output
# folder exists before any later cell tries to write a figure into it.
os.makedirs(FIGURES_PATH, exist_ok=True)

# ============================================================
# 1. LOAD DATA
# ============================================================

df = pd.read_parquet(f'{PROCESSED_PATH}/autoseg_model.parquet')

# Model inputs: four base covariates plus every column whose name starts
# with 'regiao_'.
features = (
    ['sexo_bin', 'faixa_etaria', 'idade_veiculo', 'log_is_media'] +
    [c for c in df.columns if c.startswith('regiao_')]
)

# Temporal split: train on years before 2021, test on 2021
df_train = df[df['ano'] < 2021].copy()
df_test  = df[df['ano'] == 2021].copy()

# Fail early with a clear message instead of an obscure downstream error
assert len(df_train) > 0 and len(df_test) > 0, (
    "Split temporal vazio: verifique os valores da coluna 'ano'"
)

# Impute missing 'idade_veiculo' in both splits with the TRAINING median.
# The statistic is computed once, before either frame is modified, so it is
# derived only from the original training data and is not recomputed per split.
idade_veiculo_median = df_train['idade_veiculo'].median()
for split in (df_train, df_test):
    split['idade_veiculo'] = split['idade_veiculo'].fillna(idade_veiculo_median)

X_train = df_train[features].astype(float)
X_test  = df_test[features].astype(float)

y_train = df_train['freq_colisao_rel']
y_test  = df_test['freq_colisao_rel']

# Quick sanity report: matrix shapes and the share of exact zeros in the
# training target
print(f"Treino: {X_train.shape}")
print(f"Teste:  {X_test.shape}")
print(f"Zeros no target: {(y_train == 0).mean():.1%}")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ============================================================
# 2. XGBOOST — COLLISION FREQUENCY
# ============================================================

# Upper bound on the number of training rows used to fit the model
SAMPLE_SIZE = 500_000

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at what is available.
n_train_sample = min(SAMPLE_SIZE, len(df_train))
df_train_sample = df_train.sample(n=n_train_sample, random_state=42)

X_train_s = df_train_sample[features].astype(float)
y_train_s = df_train_sample['freq_colisao_rel']

print(f"Amostra treino: {len(df_train_sample):,} linhas")
print("Treinando XGBoost...")

# Gradient-boosted regressor with a Tweedie objective (variance power 1.5);
# MAE is the metric reported on the evaluation set during fitting.
xgb_freq = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:tweedie',
    tweedie_variance_power=1.5,
    random_state=42,
    n_jobs=-1,
    eval_metric='mae'
)

# NOTE(review): the 2021 test split is used as eval_set only to log MAE every
# 50 rounds. No early stopping is configured in this cell, so the logged
# metric is not used for model selection. If early stopping or tuning is ever
# added, switch this to a validation set carved out of the training years.
xgb_freq.fit(
    X_train_s, y_train_s,
    eval_set=[(X_test, y_test)],
    verbose=50
)

print("✓ XGBoost treinado!")

In [None]:
# ============================================================
# 3. METRIC COMPARISON: GLM vs XGBoost
# ============================================================

# Predict on the test split and floor the predictions at zero
raw_pred_xgb = xgb_freq.predict(X_test)
pred_xgb = np.clip(raw_pred_xgb, 0, None)

# Point-accuracy metrics for XGBoost on the test split
mae_xgb  = mean_absolute_error(y_test, pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, pred_xgb))
corr_xgb = np.corrcoef(y_test, pred_xgb)[0, 1]

# One row per metric: (label, GLM value, XGBoost value).
# The GLM figures are hard-coded text, not computed in this notebook.
# NOTE(review): presumably copied from the GLM notebook — keep them in sync.
comparison_rows = [
    ('MAE', '0.0571', mae_xgb),
    ('RMSE', '0.1387', rmse_xgb),
    ('Correlação', '0.0408', corr_xgb),
]

banner = "=" * 50
print(banner)
print("COMPARAÇÃO: GLM vs XGBoost")
print(banner)
print(f"{'Métrica':<15} {'GLM':>12} {'XGBoost':>12}")
print("-" * 40)
for metric_name, glm_value, xgb_value in comparison_rows:
    print(f"{metric_name:<15} {glm_value:>12} {xgb_value:>12.4f}")

In [None]:
# ============================================================
# 4. ACTUARIAL METRICS — GINI AND LIFT CHART
# ============================================================

# Null benchmark: predict the training-set mean target for every test row
pred_null = np.full(len(y_test), y_train.mean())

# Gini (project helper from the modeling module) for the model and the benchmark
gini_xgb  = gini_coefficient(y_test.values, pred_xgb)
gini_null = gini_coefficient(y_test.values, pred_null)

mae_null = mean_absolute_error(y_test, pred_null)

# Side-by-side table. The null model's correlation is printed as the literal
# text '0.0000'; it is not computed from pred_null.
rule = "=" * 55
print(rule)
print("MÉTRICAS ATUARIAIS")
print(rule)
print(f"{'Métrica':<20} {'Nulo (média)':>15} {'XGBoost':>15}")
print("-" * 55)
print(f"{'MAE':<20} {mae_null:>15.4f} {mae_xgb:>15.4f}")
print(f"{'Correlação':<20} {'0.0000':>15} {corr_xgb:>15.4f}")
print(f"{'Gini':<20} {gini_null:>15.4f} {gini_xgb:>15.4f}")
print(rule)

# Decile lift table built by the project helper imported from the modeling module.
# All three series are given a fresh 0..n-1 index before the call.
# The constant null prediction is what gets passed in the `pred_glm` slot.
lift_inputs = {
    'y_true': y_test.reset_index(drop=True),
    'pred_glm': pd.Series(pred_null),
    'pred_ml': pd.Series(pred_xgb),
    'n_bins': 10,
}
lift = double_lift_chart(**lift_inputs)

# Columns shown in the printed table
lift_columns = ['decil', 'obs_mean', 'ml_mean', 'lift', 'n']
print("\nLift Chart (XGBoost por decil de risco):")
print(lift[lift_columns].to_string(index=False))

# Figure: lift per decile (left panel) and cumulative-target curve (right panel)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# --- Left panel: lift per decile, with a reference line at lift = 1
ax1.bar(lift['decil'].astype(str), lift['lift'], color='#2196F3', edgecolor='white')
ax1.axhline(1.0, color='red', linestyle='--', linewidth=1.5, label='Baseline (lift=1)')
ax1.set(
    xlabel='Decil de Risco (1=menor, 10=maior)',
    ylabel='Lift (observado / média geral)',
    title='Lift Chart — XGBoost por Decil',
)
ax1.legend()

# --- Right panel: rows ordered from highest to lowest prediction, then the
# cumulative share of the observed target against the cumulative share of rows
df_gini = (
    pd.DataFrame({'true': y_test.values, 'pred': pred_xgb})
    .sort_values('pred', ascending=False)
    .reset_index(drop=True)
)
total_true = df_gini['true'].sum()
cum_true = df_gini['true'].cumsum() / total_true
cum_pop  = (df_gini.index + 1) / len(df_gini)

ax2.plot(cum_pop, cum_true, color='#2196F3', linewidth=2,
         label=f'XGBoost (Gini={gini_xgb:.3f})')
# Diagonal = no discrimination
ax2.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Aleatório (Gini=0)')
# Shade the area between the model curve and the diagonal
ax2.fill_between(cum_pop, cum_true, cum_pop, alpha=0.1, color='#2196F3')
ax2.set(
    xlabel='Proporção de Apólices',
    ylabel='Proporção de Sinistros',
    title='Curva de Lorenz — Poder Discriminante',
)
ax2.legend()

plt.tight_layout()
plt.savefig(f'{FIGURES_PATH}/04_gini_lift.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"\nGini XGBoost: {gini_xgb:.4f} | Gini Nulo: {gini_null:.4f}")
print("Figura salva: 04_gini_lift.png")

In [None]:
# ============================================================
# 5. SHAP ANALYSIS — XGBOOST EXPLAINABILITY
# ============================================================

print("Calculando SHAP values (amostra de 10k)...")

# Upper bound on the number of test rows explained with SHAP
SHAP_SAMPLE_SIZE = 10_000
rng = np.random.default_rng(42)

# Generator.choice(..., replace=False) raises ValueError when asked for more
# items than the population holds, so cap the request at the test-set size.
n_shap = min(SHAP_SAMPLE_SIZE, len(X_test))
idx_shap = rng.choice(len(X_test), n_shap, replace=False)
X_shap = X_test.iloc[idx_shap]

# Tree-specific explainer for the fitted XGBoost model
explainer   = shap.TreeExplainer(xgb_freq)
shap_values = explainer.shap_values(X_shap)

# SHAP summary plot over the sampled test rows.
# summary_plot resizes the current figure by default (plot_size="auto"),
# which silently discards the figsize requested below. Passing plot_size=None
# tells it to leave the current figure's size unchanged.
plt.figure(figsize=(10, 7))
shap.summary_plot(shap_values, X_shap, plot_size=None, show=False)
plt.title('SHAP — Importância das Features (XGBoost)', fontsize=14)
plt.tight_layout()
plt.savefig(f'{FIGURES_PATH}/04_shap_summary.png', dpi=150, bbox_inches='tight')
plt.show()
print("Figura salva: 04_shap_summary.png")

# SHAP dependence plots, one panel per (feature, interaction feature) pair
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (feature on the x-axis, feature passed as interaction_index, panel title)
dependence_panels = [
    ('idade_veiculo', 'log_is_media', 'SHAP: Idade do Veículo × IS Média'),
    ('faixa_etaria', 'sexo_bin', 'SHAP: Faixa Etária × Sexo'),
]

for panel_ax, (main_feature, color_feature, panel_title) in zip(axes, dependence_panels):
    # show=False keeps the figure open so both panels end up in the same file
    shap.dependence_plot(main_feature, shap_values, X_shap,
                         interaction_index=color_feature, ax=panel_ax, show=False)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.savefig(f'{FIGURES_PATH}/04_shap_dependence.png', dpi=150, bbox_inches='tight')
plt.show()
print("Figura salva: 04_shap_dependence.png")