# Previsão de Inadimplência - Random Forest (Base UCI)

## 1. Configuração Inicial

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix, 
                            ConfusionMatrixDisplay, RocCurveDisplay)
from scipy.stats import randint, uniform
import joblib
import json
import os
from urllib.request import urlretrieve

# Global plotting defaults: ggplot theme for matplotlib, viridis palette for seaborn
plt.style.use("ggplot")
sns.set_palette(palette="viridis")

## 2. Carga de Dados

In [2]:
file_path = "default_of_credit_card_clients.xls"

# Fetch the spreadsheet only when there is no local copy yet.
if not os.path.exists(file_path):
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
    # Download to a temporary name and move it into place only once complete:
    # an interrupted download must not leave a truncated file at `file_path`,
    # because the existence check above would then skip the download forever.
    partial_path = file_path + ".part"
    try:
        urlretrieve(url, partial_path)
        os.replace(partial_path, file_path)
    finally:
        # After a successful move this is a no-op; after a failure it cleans up.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Load the sheet; header=1 takes the column names from the second row
df = pd.read_excel(file_path, header=1)
df = df.rename(columns={'default payment next month': 'DEFAULT'})

# Features are every column except the row identifier and the target
features = [col for col in df.columns if col not in ['ID', 'DEFAULT']]
X = df[features]
y = df['DEFAULT']

# 70/30 split, stratified on the target, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

## 3. Modelagem - Random Forest

In [3]:
# Base estimator; the fixed seed keeps the fitted forests reproducible
rf = RandomForestClassifier(random_state=42)

# Search space for the randomized hyper-parameter search.
# NOTE: max_depth is listed with plain Python ints (same values and order as
# np.arange(5, 30, 5)). NumPy integers here would end up in `best_params_`
# as np.int64, which the json module cannot serialize when the metrics are
# saved later on.
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, *range(5, 30, 5)],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

# Randomized search: 50 sampled candidates, 5-fold CV, ranked by ROC AUC,
# using all available cores
random_search = RandomizedSearchCV(
    rf, param_dist, n_iter=50, cv=5, scoring='roc_auc',
    n_jobs=-1, random_state=42, verbose=1)
random_search.fit(X_train, y_train)

# Best estimator found by the search
best_model = random_search.best_estimator_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


## 4. Avaliação do Modelo

In [None]:
# Positive-class probabilities and hard class predictions on the test set
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Threshold-based scores all share the (y_true, y_pred) signature, so they
# are computed from a name -> scorer table; insertion order is preserved.
score_functions = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1_score': f1_score,
}

# Evaluation record: model identification, tuned parameters, scores, features
metrics = {
    'model': 'random_forest',
    'best_params': random_search.best_params_,
}
metrics.update(
    {name: scorer(y_test, y_pred) for name, scorer in score_functions.items()}
)
# ROC AUC is computed from probabilities rather than hard predictions
metrics['roc_auc'] = roc_auc_score(y_test, y_proba)
metrics['features'] = features

# Report the tuned parameters followed by every numeric score
print("Melhores parâmetros:", random_search.best_params_)
print("\nMétricas de avaliação:")
non_numeric_keys = {'model', 'best_params', 'features'}
for key, value in metrics.items():
    if key in non_numeric_keys:
        continue
    print(f"{key}: {value:.4f}")

# Side-by-side figure: confusion matrix on the left, ROC curve on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax1, ax2 = axes

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(ax=ax1)
ax1.set_title('Matriz de Confusão')

# ROC curve
RocCurveDisplay.from_estimator(best_model, X_test, y_test, ax=ax2)
ax2.set_title('Curva ROC')

plt.tight_layout()
plt.show()

## 5. Importância das Features

In [None]:
# Pair each feature name with the importance reported by the fitted forest,
# then rank from most to least important
feature_importance = pd.DataFrame(
    {'Feature': features, 'Importance': best_model.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features
top_features = feature_importance.head(15)
plt.figure(figsize=(10, 6))
sns.barplot(data=top_features, x='Importance', y='Feature')
plt.title('Top 15 Features Mais Importantes')
plt.tight_layout()
plt.show()

## 6. Salvamento do Modelo e Métricas

In [None]:
def _to_jsonable(value):
    """`json` default hook: convert NumPy scalars/arrays to native Python types.

    Raises TypeError for anything else, mirroring the json module's own
    behaviour for unsupported objects.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


models_dir = '../models'
model_path = '../models/random_forest_model.pkl'
metrics_path = '../models/model_metrics.json'

# The output directory may not exist on a fresh checkout
os.makedirs(models_dir, exist_ok=True)

# Persist the fitted model
joblib.dump(best_model, model_path)

# Load the metrics history if there is one, otherwise start a new list
try:
    with open(metrics_path, 'r', encoding='utf-8') as f:
        all_metrics = json.load(f)
except FileNotFoundError:
    all_metrics = []

all_metrics.append(metrics)

# Serialize BEFORE opening the file for writing: opening with 'w' truncates
# the file, so a serialization error raised afterwards would destroy the
# existing history. The default hook handles NumPy values (e.g. an np.int64
# hyper-parameter inside `best_params`) that json cannot encode natively.
serialized_metrics = json.dumps(all_metrics, default=_to_jsonable)

with open(metrics_path, 'w', encoding='utf-8') as f:
    f.write(serialized_metrics)

print("Modelo e métricas atualizados com sucesso!")