In [1]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from IPython.display import Markdown
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from scipy.spatial.distance import cdist
    
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.pipeline import (
    Pipeline,
    TransformerMixin
)

from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder
)

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)

from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    StratifiedKFold,
    ShuffleSplit
)

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV

In [4]:
data_path = Path('./data/raw/data.xls')
dict_path = Path('./data/external/dicionario.csv')

In [5]:
df = (
    pd
    .read_excel(data_path, header=1)
    .rename(columns={
        'default payment next month': 'DEFAULT_PAY'
    })
).sample(1000)
display(Markdown('### Conjunto de dados'))
display(df.head())


df_dict = (
    pd
    .read_csv(dict_path)
    .replace({
        'default payment next month': 'DEFAULT_PAY'
    })
)
display(Markdown('### Dicionário de dados'))
display(df_dict.head())

### Conjunto de dados

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT_PAY
4551,4552,80000,2,2,2,29,0,0,0,0,...,69990,36711,38103,4000,4000,2000,4000,2000,2000,0
23228,23229,350000,2,1,2,32,-2,-2,-2,-2,...,0,944,534,717,409,0,944,534,2281,0
6727,6728,220000,1,1,2,29,0,0,0,0,...,157898,164860,168294,10110,12000,5852,10000,6610,5836,0
11184,11185,180000,1,2,2,29,1,2,0,0,...,133372,92101,60389,0,7100,5500,3500,3000,30000,0
27599,27600,80000,2,1,2,26,2,0,0,2,...,41577,41595,43264,2000,3000,2000,1000,2500,1000,1


### Dicionário de dados

Unnamed: 0,variavel,significado,tipo
0,ID,Identificador unico,continuo
1,LIMIT_BAL,Valor do crédito concedido (Novo Dolár Taiwanês ),continuo
2,SEX,Genero (1= masculino; 2= feminino),nominal
3,EDUCATION,Educação Nivel de Escolaridade (1 = pós-gradua...,nominal
4,MARRIAGE,Estado civil (1 = casado; 2 = solteiro; 3 = ou...,nominal


In [6]:
df.DEFAULT_PAY.value_counts() / len(df)

0    0.776
1    0.224
Name: DEFAULT_PAY, dtype: float64

In [7]:
target_column = 'DEFAULT_PAY'
useless_columns = ['ID', target_column]
continuous_columns = (
    df_dict
    .query('tipo == "continuo" and variavel not in @useless_columns')
    .variavel
    .to_list()
)
nominal_columns = (
    df_dict
    .query('tipo == "nominal" and variavel not in @useless_columns')
    .variavel
    .to_list()
)

In [8]:
class OutlierExtractor(TransformerMixin):
    def __init__(self, columns, threshold=3, **kwargs):
        self.threshold = threshold
        self.columns = columns
        self.kwargs = kwargs

    def transform(self, X, y=None):
        X_ = np.asarray(X[self.columns])
        outliers_zscore = np.abs(cdist(self.avg[np.newaxis], X_) / self.std)[0]
        if y is not None:
            y = np.asarray(y)
            return (
                X[outliers_zscore <= self.threshold],
                y[outliers_zscore <= self.threshold]
            )
        
        return X[outliers_zscore <= self.threshold]
    
    def fit(self, X, y=None):
        X = np.asarray(X[self.columns])
        self.std = X.std()
        self.avg = X.mean(axis=0)
        return self

In [18]:
continuous_preprocessor = Pipeline(steps=[
    #('outlier_extractor', OutlierExtractor(columns = continuous_columns)), # Tratamento de dados discrepantes
    ('missing_data', SimpleImputer(strategy='mean')), # Tratamento de dados faltantes
    ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=10)), # Seleção de variáveis
    ('normalization', MinMaxScaler()) # Normalização
])

nominal_preprocessor = Pipeline(steps=[
    # Tratamento de dados discrepantes
    ('missing_data', SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ('encoder', OneHotEncoder(sparse=False)), # Codificação de variáveis
    # não aplicar seleção apenas 3 variaveis nominais SEX, EDUCATION e MARRIAGE # Seleção de variáveis
    #('normalization', MinMaxScaler())# Normalização 
])

preprocessor = ColumnTransformer(transformers=[
    ('continuous', continuous_preprocessor, continuous_columns),
    ('nominal', nominal_preprocessor, nominal_columns)
])

In [36]:
models = [
    ("logistic regression", LogisticRegression(), {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],'penalty': ['none', 'l1', 'l2'], 'max_iter':[5000, 8000, 10000]}),
    ("support vector machine ", SVC(), {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid'], 'max_iter':[5000, 8000, 10000]}),
    ("k nearest neighbors", KNeighborsClassifier(), {'n_neighbors': [1, 3, 5, 9, 11], 'weights': ['uniform', 'distance']}),
    ("decision tree", DecisionTreeClassifier(), {'criterion':['gini', 'entropy', 'log_loss'], 'max_depth': [10, 20, 30, 40, 50], 'min_samples_split': [2, 10, 20], 'max_features': ['sqrt', None]}),
    ("random forest", RandomForestClassifier(), {'criterion':['gini', 'entropy', 'log_loss'], 'max_depth': [10, 20, 30, 40, 50], 'min_samples_split': [2, 10, 20], 'n_estimators': [10, 20, 30, 50]}),
    ("mlp", MLPClassifier(), {'activation':['identity', 'logistic', 'tanh', 'relu'], 'solver': ['sgd', 'adam'], 'max_iter':[5000, 8000, 10000]}), 
    ("gaussian naive bayes", GaussianNB(), {})
]

In [37]:
X = df.drop(columns=useless_columns, axis=1)
y = df[target_column]

In [38]:
def train_model(X, y, n_splits=10, final_results = {}):
    for model_name, model_obj, model_params in models:
        model_gs = GridSearchCV(
            model_obj,
            model_params,
            scoring='f1',
            cv=5
        )
        
        approach = Pipeline([
            ("preprocessor", preprocessor),
            ("model", model_gs)
        ])
        results = cross_validate(
            approach,
            X=X,
            y=y,
            scoring=['accuracy', 'f1', 'recall', 'precision'],
            cv=ShuffleSplit(n_splits=n_splits, test_size=.2, random_state=42),
            n_jobs=-1
        )
        results["name"] = [model_name] * n_splits
        
        
        if final_results:
            for key, value in results.items():
                final_results[key] = np.append(final_results[key], value)
        else:
            final_results = results
    return final_results
    

In [39]:
final_results = train_model(X, y)

60 fits failed out of a total of 225.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/home/gme/.cache/pypoetry/virtualenvs/src-yN7TRKYd-py3.8/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/gme/.cache/pypoetry/virtualenvs/src-yN7TRKYd-py3.8/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/gme/.cache/pypoetry/virtualenvs/src-yN7TRKYd-py3.8/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 78, in _check_solver
    ra

In [40]:
def highlight_max(s, props=''):
    values = [float(value.split()[0]) for value in s.values[1:]]
    result = [''] * len(s.values)
    if s.values[0].endswith('time'):
        result[np.argmin(values)+1] = props
    else:
        result[np.argmax(values)+1] = props
    return result

def get_winner(s):
    metric = s.values[0]
    values = [float(value.split()[0]) for value in s.values[1:]]
    models = results.columns[1:]
    
    if s.values[0].endswith('time'):
        return models[np.argmin(values)]
    else:
        return models[np.argmax(values)]

results = (
    pd
    .DataFrame(final_results)
    .groupby(['name'])
    .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])#
    .transpose()
    .reset_index()
    .rename(columns={"level_0": "score"})
    .drop(columns="level_1")
    # .set_index('score')
)
time_scores = ['fit_time', 'score_time']
winner = results.query('score not in @time_scores').apply(get_winner, axis=1).value_counts().index[0]
results.columns.name = ''
results = (
    results
    .style
    .hide(axis='index')
    .apply(highlight_max, props='color:white;background-color:gray', axis=1)
)
display(results)
display(Markdown(f'O melhor modelo é o : **{winner}**'))

score,decision tree,gaussian naive bayes,k nearest neighbors,logistic regression,mlp,random forest,support vector machine
fit_time,12.904 ± 1.191,0.635 ± 0.099,2.238 ± 0.497,20.824 ± 3.424,425.897 ± 70.734,244.964 ± 100.930,64.618 ± 9.424
score_time,0.089 ± 0.051,0.072 ± 0.036,0.081 ± 0.034,0.027 ± 0.006,0.037 ± 0.011,0.056 ± 0.041,0.059 ± 0.038
test_accuracy,0.759 ± 0.041,0.274 ± 0.024,0.727 ± 0.028,0.804 ± 0.025,0.782 ± 0.021,0.792 ± 0.015,0.764 ± 0.041
test_f1,0.400 ± 0.060,0.380 ± 0.034,0.356 ± 0.064,0.368 ± 0.065,0.383 ± 0.061,0.378 ± 0.058,0.373 ± 0.041
test_recall,0.362 ± 0.079,0.994 ± 0.016,0.339 ± 0.073,0.258 ± 0.060,0.306 ± 0.062,0.286 ± 0.059,0.315 ± 0.058
test_precision,0.472 ± 0.109,0.235 ± 0.026,0.387 ± 0.083,0.670 ± 0.099,0.526 ± 0.087,0.579 ± 0.103,0.485 ± 0.084


O melhor modelo é o : **logistic regression**