In [None]:
from dataclasses import dataclass
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
import joblib

import warnings
# Silence every warning category for the rest of the session so cell output
# stays readable. NOTE(review): this is a blanket filter, so deprecation and
# convergence warnings raised by the libraries below are hidden as well.
warnings.simplefilter("ignore")
# Use seaborn's "whitegrid" style for every plot in the notebook.
sns.set(style="whitegrid")


In [None]:
# Download the UCI "default of credit card clients" spreadsheet.
# header=1 makes pandas take the column labels from the second row of the sheet.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
df = pd.read_excel(url, header=1).rename(
    # Shorten the target column name so later cells can refer to it as "default".
    columns={"default payment next month": "default"}
)
df.head()


# Análise exploratória (EDA)

In [None]:
# Class balance of the target column
sns.countplot(data=df, x='default')
plt.title('Distribuição de Inadimplentes')
plt.show()

# Correlation of every numeric column with the target, largest values first.
# The 'default' column itself is part of the result.
correlations = (
    df.corr(numeric_only=True)
    .loc[:, 'default']
    .sort_values(ascending=False)
)
print("Top correlações com 'default':", correlations.head(10), sep="\n")


In [None]:
# Target vector, and feature matrix made of every column except 'ID' and the target
y = df['default']
X = df.drop(columns=['ID', 'default'])

# Hold out 30% of the rows for testing; stratify on y so both parts keep the
# same class proportions. Fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


In [None]:
# Pipeline: scaling -> SMOTE oversampling -> XGBoost.
# The scaler is placed before SMOTE because SMOTE builds synthetic samples by
# interpolating between nearest neighbours; on unscaled data those distances
# are dominated by whichever features have the largest numeric range.
# The imblearn pipeline applies the sampler only while fitting, so prediction
# goes through scaler -> classifier.
pipeline = ImbPipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(
        # NOTE(review): `use_label_encoder` is believed to be deprecated in
        # recent XGBoost releases; kept for compatibility with older ones -
        # confirm against the installed version.
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        tree_method='hist'
    ))
])

# Hyperparameter search space.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
param_dist = {
    'classifier__n_estimators': randint(100, 500),        # 100..499
    'classifier__max_depth': randint(4, 10),              # 4..9
    'classifier__learning_rate': uniform(0.05, 0.15),     # 0.05..0.20
    'classifier__subsample': uniform(0.7, 0.3),           # 0.7..1.0
    'classifier__colsample_bytree': uniform(0.7, 0.3),    # 0.7..1.0
    'classifier__gamma': uniform(0, 0.3),                 # 0..0.3
    'classifier__min_child_weight': randint(1, 5)         # 1..4
}

# 5-fold stratified cross-validation with a fixed shuffle seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    # Select on ROC AUC rather than accuracy: accuracy on an imbalanced target
    # rewards predicting the majority class, which works against the purpose
    # of the SMOTE step. ROC AUC is also the metric reported on the test set.
    scoring='roc_auc',
    cv=cv,
    n_iter=50,  # number of sampled parameter combinations
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# Fit the search on the training split only; the test split stays untouched
random_search.fit(X_train, y_train)

# Report the winning configuration
print("Melhores parâmetros encontrados:")
print(random_search.best_params_)



In [None]:
# Score the held-out test split with the refit best estimator from the search
y_proba = random_search.predict_proba(X_test)[:, 1]  # second predict_proba column
y_pred = random_search.predict(X_test)

# Per-class precision/recall/F1, followed by the ROC AUC of the scores
report = classification_report(y_test, y_pred)
print(report)
test_auc = roc_auc_score(y_test, y_proba)
print(f"ROC AUC: {test_auc:.4f}")

# Confusion matrix drawn as an annotated heatmap with integer cell labels
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Matriz de Confusão')
plt.show()

In [None]:
import os

# Destination of the serialized pipeline, relative to the current working directory
model_dir = os.path.join('..', 'models')
model_path = os.path.join(model_dir, 'modelo_001.pkl')

# Create the target directory if it does not exist yet
os.makedirs(model_dir, exist_ok=True)

# Persist the best pipeline found by the randomized search
joblib.dump(random_search.best_estimator_, model_path)
# Print the path that was actually written instead of a hard-coded one,
# so the message cannot drift from the real location.
print(f"Modelo salvo com sucesso em '{model_path}'")