# Previsão de Inadimplência - XGBoost (Base UCI)

## 1. Configuração Inicial

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix, 
                            ConfusionMatrixDisplay, RocCurveDisplay)
from scipy.stats import randint, uniform
import joblib
import json
import os
from urllib.request import urlretrieve

# Configurações
plt.style.use('ggplot')
sns.set_palette("viridis")

## 2. Carga de Dados


In [2]:
file_path = "default_of_credit_card_clients.xls"

if not os.path.exists(file_path):
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
    urlretrieve(url, file_path)

# Carregar dados
df = pd.read_excel(file_path, header=1)
df = df.rename(columns={'default payment next month': 'DEFAULT'})

# Preparar features e target
features = [col for col in df.columns if col not in ['ID', 'DEFAULT']]
X = df[features]
y = df['DEFAULT']

# Dividir dados
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Calcular proporção para balanceamento de classes
scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)


## 3. Modelagem - XGBoost

In [3]:
# Criar modelo base
xgb = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

# Espaço de parâmetros para busca aleatória
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 0.5),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1),
    'scale_pos_weight': [1, scale_pos_weight]
}

# Busca aleatória
random_search = RandomizedSearchCV(
    xgb, param_dist, n_iter=50, cv=5, scoring='roc_auc',
    n_jobs=-1, random_state=42, verbose=1)
random_search.fit(X_train, y_train)

# Melhor modelo
best_model = random_search.best_estimator_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## 4. Avaliação do Modelo

In [None]:
# Previsões
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Métricas
metrics = {
    'model': 'xgboost',
    'best_params': random_search.best_params_,
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1_score': f1_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, y_proba),
    'features': features
}

# Exibir resultados
print("Melhores parâmetros:", random_search.best_params_)
print("\nMétricas de avaliação:")
for k, v in metrics.items():
    if k not in ['model', 'best_params', 'features']:
        print(f"{k}: {v:.4f}")

# Visualizações
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Matriz de confusão
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(ax=ax1)
ax1.set_title('Matriz de Confusão')

# Curva ROC
RocCurveDisplay.from_estimator(best_model, X_test, y_test, ax=ax2)
ax2.set_title('Curva ROC')

plt.tight_layout()
plt.show()

## 5. Importância das Features com SHAP

In [None]:
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Plotar importância global
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values, X_test, plot_type="bar", max_display=15)
plt.title('Importância das Features - SHAP Values')
plt.tight_layout()
plt.show()

## 6. Salvamento do Modelo e Métricas

In [None]:
joblib.dump(best_model, '../models/xgboost_model.pkl')

# Carregar métricas existentes e adicionar novas
try:
    with open('../models/model_metrics.json', 'r') as f:
        all_metrics = json.load(f)
except FileNotFoundError:
    all_metrics = []

all_metrics.append(metrics)

with open('../models/model_metrics.json', 'w') as f:
    json.dump(all_metrics, f)

print("Modelo e métricas atualizados com sucesso!")