In [16]:
import time
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from IPython.display import Markdown
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from scipy.spatial.distance import cdist
    
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.pipeline import (
    Pipeline,
    TransformerMixin
)

from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder
)

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)

from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    StratifiedKFold,
    ShuffleSplit
)

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV

In [2]:
data_path = Path('../data/raw/data.xls')
dict_path = Path('../data/external/dicionario.csv')

In [3]:
df = (
    pd
    .read_excel(data_path, header=1)
    .rename(columns={
        'default payment next month': 'DEFAULT_PAY'
    })
).sample(1000)
display(Markdown('### Conjunto de dados'))
display(df.head())


df_dict = (
    pd
    .read_csv(dict_path)
    .replace({
        'default payment next month': 'DEFAULT_PAY'
    })
)
display(Markdown('### Dicionário de dados'))
display(df_dict.head())

### Conjunto de dados

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT_PAY
24480,24481,360000,1,1,1,33,-1,-1,-1,-1,...,746,2153,307,897,1234,749,2163,308,298,0
921,922,50000,2,1,2,24,0,0,0,2,...,31090,31837,32555,1800,4400,0,1400,1400,1400,0
16183,16184,150000,2,1,2,30,1,-2,-2,-2,...,0,1883,1883,0,0,0,1883,0,0,1
4653,4654,10000,2,1,1,27,2,0,0,0,...,7088,8108,8264,1256,1264,407,1291,435,0,0
17137,17138,20000,2,1,2,25,-1,0,0,2,...,5040,5304,5777,1200,2500,0,500,558,0,1


### Dicionário de dados

Unnamed: 0,variavel,significado,tipo
0,ID,Identificador unico,continuo
1,LIMIT_BAL,Valor do crédito concedido (Novo Dolár Taiwanês ),continuo
2,SEX,Genero (1= masculino; 2= feminino),nominal
3,EDUCATION,Educação Nivel de Escolaridade (1 = pós-gradua...,nominal
4,MARRIAGE,Estado civil (1 = casado; 2 = solteiro; 3 = ou...,nominal


In [4]:
df.DEFAULT_PAY.value_counts() / len(df)

0    0.786
1    0.214
Name: DEFAULT_PAY, dtype: float64

In [5]:
target_column = 'DEFAULT_PAY'
useless_columns = ['ID', target_column]
continuous_columns = (
    df_dict
    .query('tipo == "continuo" and variavel not in @useless_columns')
    .variavel
    .to_list()
)
nominal_columns = (
    df_dict
    .query('tipo == "nominal" and variavel not in @useless_columns')
    .variavel
    .to_list()
)

In [6]:
class OutlierExtractor(TransformerMixin):
    def __init__(self, columns, threshold=3, **kwargs):
        self.threshold = threshold
        self.columns = columns
        self.kwargs = kwargs

    def transform(self, X, y=None):
        X_ = np.asarray(X[self.columns])
        outliers_zscore = np.abs(cdist(self.avg[np.newaxis], X_) / self.std)[0]
        if y is not None:
            y = np.asarray(y)
            return (
                X[outliers_zscore <= self.threshold],
                y[outliers_zscore <= self.threshold]
            )
        
        return X[outliers_zscore <= self.threshold]
    
    def fit(self, X, y=None):
        X = np.asarray(X[self.columns])
        self.std = X.std()
        self.avg = X.mean(axis=0)
        return self

In [7]:
continuous_preprocessor = Pipeline(steps=[
    #('outlier_extractor', OutlierExtractor(columns = continuous_columns)), # Tratamento de dados discrepantes
    ('missing_data', SimpleImputer(strategy='mean')), # Tratamento de dados faltantes
    ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=10)), # Seleção de variáveis
    ('normalization', MinMaxScaler()) # Normalização
])

nominal_preprocessor = Pipeline(steps=[
    # Tratamento de dados discrepantes
    ('missing_data', SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore')), # Codificação de variáveis
    # não aplicar seleção apenas 3 variaveis nominais SEX, EDUCATION e MARRIAGE # Seleção de variáveis
    #('normalization', MinMaxScaler())# Normalização 
])

preprocessor = ColumnTransformer(transformers=[
    ('continuous', continuous_preprocessor, continuous_columns),
    ('nominal', nominal_preprocessor, nominal_columns)
])

In [8]:
models = [
    ("logistic regression", LogisticRegression(solver='saga', max_iter=10000), {'penalty': ['none', 'l1', 'l2']}),
    ("support vector machine ", SVC(max_iter=10000), {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}),
    ("k nearest neighbors", KNeighborsClassifier(), {'n_neighbors': [1, 3, 5, 9, 11], 'weights': ['uniform', 'distance']}),
    ("decision tree", DecisionTreeClassifier(), {'criterion':['gini', 'entropy', 'log_loss'], 'max_depth': [10, 20, 30, 40, 50], 'min_samples_split': [2, 10, 20], 'max_features': ['sqrt', None]}),
    ("random forest", RandomForestClassifier(), {'criterion':['gini', 'entropy', 'log_loss'], 'max_depth': [10, 20, 30, 40, 50], 'min_samples_split': [2, 10, 20], 'n_estimators': [10, 20, 30, 50]}),
    ("mlp", MLPClassifier(solver='adam', max_iter=10000), {'activation':['identity', 'logistic', 'tanh', 'relu']}),
]

In [9]:
X = df.drop(columns=useless_columns, axis=1)
y = df[target_column]

In [10]:
def train_model(X, y, n_splits=10, final_results = {}):
    for model_name, model_obj, model_params in models:
        print(f'{model_name} run...')
        model_gs = GridSearchCV(
            model_obj,
            model_params,
            scoring='f1',
            cv=5
        )
        
        approach = Pipeline([
            ("preprocessor", preprocessor),
            ("model", model_gs)
        ])
        results = cross_validate(
            approach,
            X=X,
            y=y,
            scoring=['accuracy', 'f1', 'recall', 'precision'],
            cv=ShuffleSplit(n_splits=n_splits, test_size=.2, random_state=42),
            n_jobs=-1
        )
        results["name"] = [model_name] * n_splits
        
        
        if final_results:
            for key, value in results.items():
                final_results[key] = np.append(final_results[key], value)
        else:
            final_results = results
    return final_results
    

In [11]:
final_results = train_model(X, y)

logistic regression run...
support vector machine  run...




k nearest neighbors run...
decision tree run...
random forest run...
mlp run...


In [12]:
def highlight_max(s, props=''):
    values = [float(value.split()[0]) for value in s.values[1:]]
    result = [''] * len(s.values)
    if s.values[0].endswith('time'):
        result[np.argmin(values)+1] = props
    else:
        result[np.argmax(values)+1] = props
    return result

def get_winner(s):
    metric = s.values[0]
    values = [float(value.split()[0]) for value in s.values[1:]]
    models = results.columns[1:]
    
    if s.values[0].endswith('time'):
        return models[np.argmin(values)]
    else:
        return models[np.argmax(values)]

results = (
    pd
    .DataFrame(final_results)
    .groupby(['name'])
    .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])#
    .transpose()
    .reset_index()
    .rename(columns={"level_0": "score"})
    .drop(columns="level_1")
    # .set_index('score')
)
time_scores = ['fit_time', 'score_time']
winner = results.query('score not in @time_scores').apply(get_winner, axis=1).value_counts().index[0]
results.columns.name = ''
results = (
    results
    .style
    .hide(axis='index')
    .apply(highlight_max, props='color:white;background-color:gray', axis=1)
)
display(results)
display(Markdown(f'O melhor modelo é o : **{winner}**'))

score,decision tree,k nearest neighbors,logistic regression,mlp,random forest,support vector machine
fit_time,5.507 ± 1.306,0.994 ± 0.238,3.336 ± 0.603,91.401 ± 21.381,147.062 ± 32.881,9.657 ± 1.978
score_time,0.016 ± 0.004,0.030 ± 0.011,0.014 ± 0.002,0.041 ± 0.032,0.030 ± 0.008,0.023 ± 0.007
test_accuracy,0.753 ± 0.023,0.737 ± 0.037,0.802 ± 0.022,0.785 ± 0.019,0.791 ± 0.028,0.778 ± 0.024
test_f1,0.382 ± 0.063,0.313 ± 0.044,0.315 ± 0.073,0.385 ± 0.030,0.361 ± 0.078,0.368 ± 0.039
test_recall,0.361 ± 0.094,0.280 ± 0.063,0.212 ± 0.061,0.310 ± 0.032,0.273 ± 0.062,0.298 ± 0.039
test_precision,0.426 ± 0.068,0.373 ± 0.075,0.655 ± 0.129,0.516 ± 0.060,0.552 ± 0.144,0.490 ± 0.074


O melhor modelo é o : **logistic regression**

In [14]:
#Obtem o modelo e os parametros ganhadores
model_name, model, model_params  = [foo for foo in models if foo[0] == winner][0] 

model_gs = GridSearchCV(model, model_params, scoring='accuracy')
approach = Pipeline([
    ("preprocessing", preprocessor),
    ("model", model_gs)
])
approach.fit(X, y) #Seleciona o approach

In [15]:
joblib.dump(approach, '../models/model.joblib') # Salva o modelo em disco

NameError: name 'joblib' is not defined