# Carregamento da Base

In [37]:
pip install ucimlrepo

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [38]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
statlog_german_credit_data = fetch_ucirepo(id=144) 
  
# data (as pandas dataframes) 
X = statlog_german_credit_data.data.features 
y = statlog_german_credit_data.data.targets

# Imports

In [39]:
import numpy as np
import pandas as pd

# Visualização
import matplotlib.pyplot as plt
import seaborn as sns

# Pré-processamento
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Modelos
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Reamostragem (classes desbalanceadas)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Validação e busca de hiperparâmetros
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
)

# Métricas
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    cohen_kappa_score,
    classification_report,
    confusion_matrix,
    make_scorer,
)

# Otimizadores
from skopt import BayesSearchCV
from skopt.space import Real as SKOReal, Integer as SKOInteger, Categorical as SKOCategorical
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous as GAContinuous, Integer as GAInteger, Categorical as GACategorical

# Utilidades
import warnings
warnings.filterwarnings('ignore')
RANDOM_STATE = 42

# Pré-Processamento

## Juntar as features e o target

In [40]:
data = pd.concat([X, y], axis=1)
data.columns = list(X.columns) + ['target']

In [41]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Attribute1   1000 non-null   object
 1   Attribute2   1000 non-null   int64 
 2   Attribute3   1000 non-null   object
 3   Attribute4   1000 non-null   object
 4   Attribute5   1000 non-null   int64 
 5   Attribute6   1000 non-null   object
 6   Attribute7   1000 non-null   object
 7   Attribute8   1000 non-null   int64 
 8   Attribute9   1000 non-null   object
 9   Attribute10  1000 non-null   object
 10  Attribute11  1000 non-null   int64 
 11  Attribute12  1000 non-null   object
 12  Attribute13  1000 non-null   int64 
 13  Attribute14  1000 non-null   object
 14  Attribute15  1000 non-null   object
 15  Attribute16  1000 non-null   int64 
 16  Attribute17  1000 non-null   object
 17  Attribute18  1000 non-null   int64 
 18  Attribute19  1000 non-null   object
 19  Attribute20  1000 non-null  

## Remoção de Outliers

In [42]:
numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [c for c in numeric_cols if c != 'target']

# IQR capping  
def iqr_cap(series, factor=1.5):
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    return series.clip(lower, upper)

# Aplicar capping aos numéricos
for col in numeric_cols:
    data[col] = iqr_cap(data[col])

data[numeric_cols].describe()


Unnamed: 0,Attribute2,Attribute5,Attribute8,Attribute11,Attribute13,Attribute16,Attribute18
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.307,3051.101,2.973,2.845,35.4535,1.404,1.0
std,10.615151,2187.140403,1.118715,1.103718,11.106324,0.565335,0.0
min,4.0,250.0,1.0,1.0,19.0,1.0,1.0
25%,12.0,1365.5,2.0,2.0,27.0,1.0,1.0
50%,18.0,2319.5,3.0,3.0,33.0,1.0,1.0
75%,24.0,3972.25,4.0,4.0,42.0,2.0,1.0
max,42.0,7882.375,4.0,4.0,64.5,3.5,1.0


## Separação Entre Features e Target

In [43]:
X = data.drop('target', axis=1)
y = data['target']

## Separar Features Numericas e Categóricas

In [44]:
numerical_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(exclude=np.number).columns

## Pipeline de Features

In [45]:
# numéricas: imputar valores ausentes com a mediana e escalar
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# categóricas: imputar valores ausentes com o valor mais frequente e aplicar One-Hot Encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

## Processador

In [46]:
# Criar um pré-processador usando ColumnTransformer para aplicar transformações diferentes a diferentes colunas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough' # Manter outras colunas não especificadas, se houver
)

### Aplicando as Features

In [47]:
X_processed = preprocessor.fit_transform(X)

## Convertendo Rótulo

In [48]:
# Original usa 1 = Good, 2 = Bad. Converter 2 para 0 para ter classes 0 e 1.
y_processed = y.values.ravel()
y_processed = np.where(y_processed == 2, 0, y_processed) # 1 (Good credit), 0 (Bad credit)

## Split Treino e Teste

In [49]:
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_processed, test_size=0.2, random_state=RANDOM_STATE, stratify=y_processed
)

## Dados Sinteticos (SMOTE)

In [50]:
smote = SMOTE(random_state=RANDOM_STATE)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [51]:
## Cenários de Classificação (Normal vs PCA 3 componentes)
from scipy import sparse

# Garantir densidade para algoritmos que exigem matriz densa
def to_dense(X):
    return X.toarray() if sparse.issparse(X) else X

X_train_resampled = X_resampled  # resultado do SMOTE
X_test_original = X_test         # teste sem oversampling

X_train_resampled_dense = to_dense(X_train_resampled)
X_test_dense = to_dense(X_test_original)

# Cenário PCA (3 componentes)
pca = PCA(n_components=3, random_state=RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_resampled_dense)
X_test_pca = pca.transform(X_test_dense)

scenarios = {
    'normal': (X_train_resampled_dense, X_test_dense),
    'pca_3': (X_train_pca, X_test_pca)
}
print({k: v[0].shape for k, v in scenarios.items()})

{'normal': (1120, 61), 'pca_3': (1120, 3)}


In [52]:
## Espaços de Hiperparâmetros (Grid, Random, Bayes, Genética)
from scipy.stats import randint, uniform

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Estimadores base
estimators = {
    'rf': RandomForestClassifier(random_state=RANDOM_STATE),
    'knn': KNeighborsClassifier(),
    'svm': SVC(probability=True, random_state=RANDOM_STATE),
    'mlp': MLPClassifier(max_iter=300, random_state=RANDOM_STATE)
}

# Grid (pequeno para demonstração)
param_grid = {
    'rf': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10],
        'min_samples_split': [2, 5]
    },
    'knn': {
        'n_neighbors': [5, 11, 17],
        'weights': ['uniform', 'distance']
    },
    'svm': {
        'C': [0.5, 1, 5],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    'mlp': {
        'hidden_layer_sizes': [(50,), (100,)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001]
    }
}

# Random distributions
param_dist = {
    'rf': {
        'n_estimators': randint(50, 300),
        'max_depth': [None] + list(range(5, 16, 5)),
        'min_samples_split': randint(2, 10)
    },
    'knn': {
        'n_neighbors': randint(3, 25),
        'weights': ['uniform', 'distance'],
        'p': randint(1, 3)
    },
    'svm': {
        'C': uniform(0.1, 10),
        'gamma': ['scale', 'auto'],
        'kernel': ['rbf', 'linear']
    },
    'mlp': {
        'hidden_layer_sizes': [(randint.rvs(40, 120),)],
        'alpha': uniform(1e-5, 1e-2),
        'learning_rate_init': uniform(1e-4, 1e-2)
    }
}

# Bayes search spaces
bayes_spaces = {
    'rf': {
        'n_estimators': SKOInteger(50, 300),
        'max_depth': SKOCategorical([None, 5, 10, 15]),
        'min_samples_split': SKOInteger(2, 10)
    },
    'knn': {
        'n_neighbors': SKOInteger(3, 25),
        'weights': SKOCategorical(['uniform', 'distance']),
        'p': SKOInteger(1, 2)
    },
    'svm': {
        'C': SKOReal(0.1, 10.0, prior='log-uniform'),
        'gamma': SKOCategorical(['scale', 'auto']),
        'kernel': SKOCategorical(['rbf', 'linear'])
    },
    'mlp': {
        'hidden_layer_sizes': SKOCategorical([(50,), (100,), (150,)]),
        'alpha': SKOReal(1e-5, 1e-2, prior='log-uniform'),
        'learning_rate_init': SKOReal(1e-4, 1e-2, prior='log-uniform')
    }
}

# GA search spaces
ga_spaces = {
    'rf': {
        'n_estimators': GAInteger(50, 300),
        'max_depth': GACategorical([None, 5, 10, 15]),
        'min_samples_split': GAInteger(2, 10)
    },
    'knn': {
        'n_neighbors': GAInteger(3, 25),
        'weights': GACategorical(['uniform', 'distance']),
        'p': GAInteger(1, 2)
    },
    'svm': {
        'C': GAContinuous(0.1, 10.0),
        'gamma': GACategorical(['scale', 'auto']),
        'kernel': GACategorical(['rbf', 'linear'])
    },
    'mlp': {
        'hidden_layer_sizes': GACategorical([(50,), (100,), (150,)]),
        'alpha': GAContinuous(1e-5, 1e-2),
        'learning_rate_init': GAContinuous(1e-4, 1e-2)
    }
}
print('Espaços definidos: OK')

Espaços definidos: OK


In [None]:
## Funções de Otimização e Métricas
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc',
    'kappa': make_scorer(cohen_kappa_score)
}

import time

def fit_search(search_obj, X, y):
    start = time.time()
    search_obj.fit(X, y)
    return search_obj, time.time() - start

def build_search(alg_key, search_type):
    est = estimators[alg_key]
    if search_type == 'grid':
        return GridSearchCV(est, param_grid[alg_key], cv=cv, scoring='f1', n_jobs=-1, refit=True)
    if search_type == 'random':
        return RandomizedSearchCV(est, param_distributions=param_dist[alg_key], cv=cv, scoring='f1', n_iter=15, random_state=RANDOM_STATE, n_jobs=-1, refit=True)
    if search_type == 'bayes':
        return BayesSearchCV(est, bayes_spaces[alg_key], cv=cv, scoring='f1', n_iter=25, random_state=RANDOM_STATE, n_jobs=-1, refit=True)
    if search_type == 'genetic':
        return GASearchCV(estimator=est, cv=cv, scoring='f1', population_size=15, generations=10, tournament_size=3,
                          n_jobs=-1, verbose=False, param_grid=ga_spaces[alg_key])
    raise ValueError('Tipo de busca inválido')

from collections import OrderedDict

def evaluate_best(best_estimator, X_train, y_train, X_test, y_test):
    # CV detalhado nas métricas
    cv_results = cross_validate(best_estimator, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False)
    summary_cv = {f'{m}_cv_mean': np.mean(cv_results[f'test_{m}']) for m in ['accuracy','precision','recall','f1','roc_auc','kappa']}
    summary_cv.update({f'{m}_cv_std': np.std(cv_results[f'test_{m}']) for m in ['accuracy','precision','recall','f1','roc_auc','kappa']})

    # Test set
    y_pred = best_estimator.predict(X_test)
    if hasattr(best_estimator, 'predict_proba'):
        y_proba = best_estimator.predict_proba(X_test)[:,1]
    else:
        # fallback para decision_function se existir
        if hasattr(best_estimator, 'decision_function'):
            from sklearn.preprocessing import MinMaxScaler
            dec = best_estimator.decision_function(X_test).reshape(-1,1)
            y_proba = MinMaxScaler().fit_transform(dec).ravel()
        else:
            y_proba = y_pred  # aproximação

    test_metrics = {
        'accuracy_test': accuracy_score(y_test, y_pred),
        'precision_test': precision_score(y_test, y_pred),
        'recall_test': recall_score(y_test, y_pred),
        'f1_test': f1_score(y_test, y_pred),
        'roc_auc_test': roc_auc_score(y_test, y_proba),
        'kappa_test': cohen_kappa_score(y_test, y_pred)
    }
    return {**summary_cv, **test_metrics}

print('Funções definidas: OK')

Funções definidas: OK


## Loop de Experimentos (Cenários x Modelos x Otimizadores)

Este bloco executa as buscas de hiperparâmetros (Grid, Random, Bayes, Genética) para RF, KNN, SVM e MLP em dois cenários: normal e PCA(3). Coleta métricas (médias e desvios no CV e desempenho em teste) e tempo de execução.

In [54]:
# Execução dos Experimentos

results = []
best_by_scenario = {}

optimizers = ['grid', 'random', 'bayes', 'genetic']

for scen_name, (Xtr, Xte) in scenarios.items():
    print(f"\n=== Cenário: {scen_name} ===")
    # y para treino (após SMOTE) e teste (original)
    ytr = y_resampled
    yte = y_test

    scen_records = []

    for alg_key in estimators.keys():
        for opt in optimizers:
            print(f" - {alg_key.upper()} via {opt}")
            try:
                search = build_search(alg_key, opt)
                fitted, elapsed = fit_search(search, Xtr, ytr)
                best_est = fitted.best_estimator_
                metrics = evaluate_best(best_est, Xtr, ytr, Xte, yte)

                record = {
                    'scenario': scen_name,
                    'algorithm': alg_key,
                    'optimizer': opt,
                    'time_sec': elapsed,
                    'best_params': fitted.best_params_,
                }
                record.update(metrics)
                results.append(record)
                scen_records.append(record)
            except Exception as e:
                print(f"   ! Falhou: {e}")
                continue

    # Selecionar melhor por F1 em teste neste cenário
    if scen_records:
        scen_df = pd.DataFrame(scen_records)
        top = scen_df.sort_values('f1_test', ascending=False).iloc[0]
        best_by_scenario[scen_name] = {
            'algorithm': top['algorithm'],
            'optimizer': top['optimizer'],
            'f1_test': top['f1_test'],
            'roc_auc_test': top['roc_auc_test'],
            'best_params': top['best_params'],
        }

# Consolidar resultados
results_df = pd.DataFrame(results)
print("\nResumo (top 10 por F1 em teste):")
if not results_df.empty:
    display(results_df.sort_values(['scenario', 'f1_test'], ascending=[True, False]).head(10))

print('\nMelhores por cenário:')
for scen, info in best_by_scenario.items():
    print(scen, '=>', info)

# Opcional: salvar em CSV
results_df.to_csv('resultados_experimentos.csv', index=False)


=== Cenário: normal ===
 - RF via grid
 - RF via random
 - RF via random
 - RF via bayes
 - RF via bayes
 - RF via genetic
 - RF via genetic
 - KNN via grid
 - KNN via random
 - KNN via grid
 - KNN via random
 - KNN via bayes
 - KNN via bayes
 - KNN via genetic
 - KNN via genetic
 - SVM via grid
 - SVM via grid
 - SVM via random
 - SVM via random
 - SVM via bayes
 - SVM via bayes
 - SVM via genetic
 - SVM via genetic
 - MLP via grid
 - MLP via grid
 - MLP via random
 - MLP via random
 - MLP via bayes
 - MLP via bayes
   ! Falhou: Not all points are within the bounds of the space.
 - MLP via genetic
   ! Falhou: Not all points are within the bounds of the space.
 - MLP via genetic

=== Cenário: pca_3 ===
 - RF via grid

=== Cenário: pca_3 ===
 - RF via grid
 - RF via random
 - RF via random
 - RF via bayes
 - RF via bayes
 - RF via genetic
 - RF via genetic
 - KNN via grid
 - KNN via random
 - KNN via grid
 - KNN via random
 - KNN via bayes
 - KNN via bayes
 - KNN via genetic
 - KNN vi

Unnamed: 0,scenario,algorithm,optimizer,time_sec,best_params,accuracy_cv_mean,precision_cv_mean,recall_cv_mean,f1_cv_mean,roc_auc_cv_mean,...,recall_cv_std,f1_cv_std,roc_auc_cv_std,kappa_cv_std,accuracy_test,precision_test,recall_test,f1_test,roc_auc_test,kappa_test
1,normal,rf,random,8.857357,"{'max_depth': 15, 'min_samples_split': 5, 'n_e...",0.849107,0.834394,0.871429,0.852226,0.925446,...,0.031237,0.019523,0.009301,0.037201,0.745,0.798658,0.85,0.823529,0.794286,0.365672
0,normal,rf,grid,5.263286,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",0.846429,0.83701,0.860714,0.848255,0.922816,...,0.036857,0.018545,0.009784,0.032242,0.74,0.823529,0.8,0.811594,0.787262,0.392523
3,normal,rf,genetic,154.132376,"{'n_estimators': 282, 'max_depth': None, 'min_...",0.857143,0.839074,0.883929,0.86073,0.9283,...,0.025877,0.015051,0.007904,0.028235,0.725,0.785235,0.835714,0.809689,0.785476,0.31592
2,normal,rf,bayes,54.984208,"{'max_depth': 15, 'min_samples_split': 3, 'n_e...",0.851786,0.834242,0.878571,0.855536,0.927583,...,0.026245,0.013489,0.008891,0.025505,0.725,0.793103,0.821429,0.807018,0.785119,0.329268
10,normal,svm,bayes,20.956353,"{'C': 10.0, 'gamma': 'scale', 'kernel': 'rbf'}",0.858929,0.896727,0.8125,0.852024,0.941486,...,0.024614,0.01412,0.014539,0.026845,0.715,0.79021,0.807143,0.798587,0.722857,0.311594
11,normal,svm,genetic,75.777912,"{'C': 9.542163751882395, 'gamma': 'scale', 'ke...",0.858929,0.895206,0.814286,0.852323,0.941231,...,0.023555,0.013597,0.014598,0.026245,0.715,0.79021,0.807143,0.798587,0.724524,0.311594
14,normal,mlp,genetic,196.13704,"{'hidden_layer_sizes': (50,), 'alpha': 0.00473...",0.859821,0.893856,0.817857,0.853468,0.921971,...,0.034626,0.019787,0.017263,0.035084,0.715,0.80292,0.785714,0.794224,0.724762,0.330986
12,normal,mlp,grid,16.05883,"{'activation': 'relu', 'alpha': 0.001, 'hidden...",0.851786,0.884363,0.810714,0.845104,0.912691,...,0.039286,0.021643,0.021536,0.038049,0.705,0.791367,0.785714,0.78853,0.704286,0.300948
8,normal,svm,grid,5.072419,"{'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}",0.85625,0.886675,0.817857,0.850428,0.935029,...,0.028571,0.019669,0.013822,0.03677,0.7,0.789855,0.778571,0.784173,0.74131,0.292453
9,normal,svm,random,7.492666,"{'C': 4.419450186421157, 'gamma': 'scale', 'ke...",0.855357,0.881994,0.821429,0.850153,0.93353,...,0.029342,0.018125,0.013851,0.033216,0.695,0.780142,0.785714,0.782918,0.744405,0.270335



Melhores por cenário:
normal => {'algorithm': 'rf', 'optimizer': 'random', 'f1_test': np.float64(0.8235294117647058), 'roc_auc_test': np.float64(0.7942857142857143), 'best_params': {'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 237}}
pca_3 => {'algorithm': 'rf', 'optimizer': 'grid', 'f1_test': np.float64(0.7730496453900709), 'roc_auc_test': np.float64(0.6672619047619048), 'best_params': {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}}
