In [1]:
import os
import json
import time
import warnings
from typing import Dict, Any, Tuple
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import (roc_auc_score, average_precision_score,
                             f1_score, accuracy_score, precision_score,
                             recall_score, make_scorer)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.base import clone

from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline

from scipy.stats import loguniform, randint
from joblib import Memory, dump

warnings.filterwarnings('ignore')
memory = Memory(location='./.cache', verbose=0)
RANDOM_STATE = 42

class CreditRiskModel:
    def __init__(self, model_type: str, metric: str = 'roc_auc') -> None:
        self.model_type = model_type
        self.metric = metric
        self.best_model = None
        self.metrics_history = []
        self.scorer = self._get_scorer()
        self._setup_model_config()

    def _get_scorer(self):
        scoring = {
            'roc_auc': make_scorer(roc_auc_score, needs_proba=True),
            'precision': make_scorer(precision_score),
            'recall': make_scorer(recall_score),
            'f1': make_scorer(f1_score),
            'accuracy': make_scorer(accuracy_score),
            'average_precision': make_scorer(average_precision_score, needs_proba=True)
        }
        return scoring.get(self.metric, scoring['roc_auc'])

    def _setup_model_config(self) -> None:
        self.model_config = {
            'logistic': {
                'model': LogisticRegression(max_iter=2000, solver='lbfgs', class_weight='balanced'),
                'params': {
                    'classifier__C': loguniform(1e-3, 1e2)
                }
            },
            'random_forest': {
                'model': RandomForestClassifier(class_weight='balanced', n_jobs=-1),
                'params': {
                    'classifier__n_estimators': randint(200, 500),
                    'classifier__max_depth': randint(10, 50),
                    'classifier__min_samples_split': randint(2, 20),
                    'classifier__min_samples_leaf': randint(1, 10)
                }
            },
            'xgboost': {
                'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1),
                'params': {
                    'classifier__n_estimators': randint(150, 400),
                    'classifier__learning_rate': loguniform(0.01, 0.2),
                    'classifier__max_depth': randint(3, 10),
                    'classifier__subsample': loguniform(0.5, 1.0),
                    'classifier__colsample_bytree': loguniform(0.5, 1.0)
                }
            },
            'lightgbm': {
                'model': LGBMClassifier(class_weight='balanced', n_jobs=-1),
                'params': {
                    'classifier__n_estimators': randint(150, 400),
                    'classifier__learning_rate': loguniform(0.01, 0.2),
                    'classifier__max_depth': randint(3, 10),
                    'classifier__num_leaves': randint(15, 100)
                }
            }
        }

    def load_data(self, url: str) -> pd.DataFrame:
        column_names = [
            "Status", "Duration", "CreditHistory", "Purpose", "CreditAmount", "Savings",
            "Employment", "InstallmentRate", "PersonalStatusSex", "OtherDebtors", "ResidenceSince",
            "Property", "Age", "OtherInstallmentPlans", "Housing", "ExistingCredits",
            "Job", "NumPeopleLiable", "Telephone", "ForeignWorker", "Target"
        ]
        df = pd.read_csv(url, sep=' ', header=None, names=column_names)
        df['Target'] = df['Target'].map({1: 0, 2: 1})
        return df

    def prepare_data(self, df: pd.DataFrame) -> Tuple[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series], ColumnTransformer, list]:
        X = df.drop('Target', axis=1)
        y = df['Target']
        cat_cols = X.select_dtypes(include='object').columns.tolist()
        num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

        numeric_transformer = Pipeline([('scaler', StandardScaler())])
        categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

        preprocessor = ColumnTransformer(transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)
        ])

        return train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE), preprocessor, cat_cols

    def build_pipeline(self, preprocessor: ColumnTransformer, cat_cols: list) -> ImbPipeline:
        return ImbPipeline([
            ('preprocessor', preprocessor),
            ('smote', SMOTENC(categorical_features=[i for i, col in enumerate(preprocessor.transformers[1][2])], random_state=RANDOM_STATE)),
            ('classifier', clone(self.model_config[self.model_type]['model']))
        ])

    def optimize_model(self, pipeline: ImbPipeline, X_train: pd.DataFrame, y_train: pd.Series):
        search = RandomizedSearchCV(
            pipeline,
            self.model_config[self.model_type]['params'],
            n_iter=25,
            cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE),
            scoring=self.scorer,
            n_jobs=-1,
            random_state=RANDOM_STATE,
            verbose=0
        )
        search.fit(X_train, y_train)
        return search.best_estimator_, search.best_params_

    def evaluate_model(self, model, X_test, y_test) -> Dict[str, float | str]:
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]
        return {
            'roc_auc': roc_auc_score(y_test, y_proba),
            'average_precision': average_precision_score(y_test, y_proba),
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'optimization_metric': self.metric
        }

    def train_and_evaluate(self, url: str) -> Dict[str, Any]:
        df = self.load_data(url)
        (X_train, X_test, y_train, y_test), preprocessor, cat_cols = self.prepare_data(df)

        start = time.time()
        pipeline = self.build_pipeline(preprocessor, cat_cols)
        best_model, best_params = self.optimize_model(pipeline, X_train, y_train)
        train_time = time.time() - start

        metrics = self.evaluate_model(best_model, X_test, y_test)
        results = {
            'model_type': self.model_type,
            'best_params': best_params,
            'metrics': metrics,
            'training_time': train_time
        }

        self.best_model = best_model
        self.metrics_history.append(results)
        return results

    def save_results(self, results: Dict[str, Any], path: str = '../models') -> None:
        os.makedirs(path, exist_ok=True)
        dump(self.best_model, f'{path}/{self.model_type}_model.pkl')
        with open(f'{path}/{self.model_type}_metrics.json', 'w') as f:
            json.dump(results, f, indent=2)

In [2]:
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
MODELOS = ['logistic', 'random_forest', 'xgboost', 'lightgbm']
METRICAS = ['roc_auc', 'f1', 'average_precision']
all_results: Dict[str, Any] = {}

def execute(modelo_metrica: Tuple[str, str]) -> Tuple[str, Dict[str, Any]]:
    modelo, metrica = modelo_metrica
    print(f"[INFO] Iniciando treino para {modelo} com métrica {metrica}...")
    cr = CreditRiskModel(modelo, metrica)
    res = cr.train_and_evaluate(URL)
    cr.save_results(res)
    print(f"[INFO] Finalizado {modelo} - {metrica} | AUC: {res['metrics']['roc_auc']:.4f} | F1: {res['metrics']['f1']:.4f} | Tempo: {res['training_time']:.2f}s")
    return (f"{modelo}_{metrica}", res)

combos = [(m, metrica) for m in MODELOS for metrica in METRICAS]
with ThreadPoolExecutor() as executor:
    for chave, resultado in executor.map(execute, combos):
        all_results[chave] = resultado

with open('../models/all_results.json', 'w') as f:
    json.dump(all_results, f, indent=2)

[INFO] Iniciando treino para logistic com métrica roc_auc...
[INFO] Iniciando treino para logistic com métrica f1...
[INFO] Iniciando treino para logistic com métrica average_precision...
[INFO] Iniciando treino para random_forest com métrica roc_auc...
[INFO] Iniciando treino para random_forest com métrica f1...
[INFO] Iniciando treino para random_forest com métrica average_precision...
[INFO] Iniciando treino para xgboost com métrica roc_auc...
[INFO] Iniciando treino para xgboost com métrica f1...
[INFO] Iniciando treino para xgboost com métrica average_precision...
[INFO] Iniciando treino para lightgbm com métrica roc_auc...
[INFO] Iniciando treino para lightgbm com métrica f1...
[INFO] Iniciando treino para lightgbm com métrica average_precision...
[INFO] Finalizado logistic - roc_auc | AUC: 0.8054 | F1: 0.6667 | Tempo: 1750.36s
[INFO] Finalizado random_forest - f1 | AUC: 0.7874 | F1: 0.5902 | Tempo: 1766.69s
[INFO] Finalizado random_forest - roc_auc | AUC: 0.7879 | F1: 0.5833 | T