In [1]:
# ============================================================
# 04 - Comparação ML: XGBoost + Análise SHAP
# Fonte: SUSEP AUTOSEG (2019-2021)
# Autor: Arthur Pontes Motta
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import shap
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Plot defaults: apply the seaborn "whitegrid" theme first, then override
# the default figure size (the theme call must come first, since it resets
# matplotlib's rcParams).
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 5)})

# Relative locations of the processed data and of the figures directory
FIGURES_PATH = '../reports/figures'
PROCESSED_PATH = '../data/processed'

# ============================================================
# 1. LOAD DATA
# ============================================================

df = pd.read_parquet(f'{PROCESSED_PATH}/autoseg_model.parquet')

# Model inputs: four fixed columns plus every column whose name starts
# with 'regiao_' (presumably one-hot region indicators -- confirm against
# the preprocessing notebook).
features = (
    ['sexo_bin', 'faixa_etaria', 'idade_veiculo', 'log_is_media'] +
    [c for c in df.columns if c.startswith('regiao_')]
)

# Temporal split: rows with ano < 2021 are used for training,
# rows with ano == 2021 for testing.
df_train = df[df['ano'] < 2021].copy()
df_test  = df[df['ano'] == 2021].copy()

# Impute missing 'idade_veiculo' with the median of the TRAINING split only,
# so no statistic from the test year leaks into the features.
# The median is computed once, before any fill: it is loop-invariant, and
# evaluating it inside the loop costs a full pass over the training column
# per iteration (the second pass running on already-imputed data).
idade_veiculo_median = df_train['idade_veiculo'].median()
for split in [df_train, df_test]:
    split['idade_veiculo'] = split['idade_veiculo'].fillna(idade_veiculo_median)

X_train = df_train[features].astype(float)
X_test  = df_test[features].astype(float)

# Target column used for both splits
y_train = df_train['freq_colisao_rel']
y_test  = df_test['freq_colisao_rel']

print(f"Treino: {X_train.shape}")
print(f"Teste:  {X_test.shape}")
# Share of training rows whose target is exactly zero
print(f"Zeros no target: {(y_train == 0).mean():.1%}")

  from .autonotebook import tqdm as notebook_tqdm


Treino: (6520676, 44)
Teste:  (2347730, 44)
Zeros no target: 90.2%


In [None]:
# ============================================================
# 2. XGBOOST — COLLISION FREQUENCY
# ============================================================

# Training subsample (per the original note: same size as the GLM sample,
# for a fair comparison). The requested size is clamped to the number of
# available rows, because sampling without replacement raises ValueError
# when n exceeds the frame length.
SAMPLE_SIZE = 500_000
df_train_sample = df_train.sample(
    n=min(SAMPLE_SIZE, len(df_train)),
    random_state=42,
)

X_train_s = df_train_sample[features].astype(float)
y_train_s = df_train_sample['freq_colisao_rel']

print(f"Amostra treino: {len(df_train_sample):,} linhas")
print("Treinando XGBoost...")

xgb_freq = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:tweedie',   # Tweedie objective, chosen because the target is mostly zeros
    tweedie_variance_power=1.5,
    random_state=42,
    n_jobs=-1,
    eval_metric='mae'
)

# The 2021 hold-out is passed as eval_set purely to log MAE every 50 rounds.
# No early stopping is configured here, so the test data does not influence
# the fitted model; keep it that way, or switch to a separate validation
# split before enabling early stopping.
xgb_freq.fit(
    X_train_s, y_train_s,
    eval_set=[(X_test, y_test)],
    verbose=50
)

print("✓ XGBoost treinado!")