In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import Markdown
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold

In [2]:
# Input files, expressed relative to the notebook's working directory:
# the raw dataset (Excel) and the data dictionary describing its columns (CSV).
data_path = Path('..', 'data', 'raw', 'data.xls')
dict_path = Path('..', 'data', 'external', 'dicionario.csv')

In [3]:
# Short alias for the verbose label name; the same mapping is applied to the
# dataset's column labels and to the matching entry in the data dictionary.
target_alias = {'default payment next month': 'DEFAULT_PAY'}

# header=1: column names are taken from the second row of the sheet.
df = pd.read_excel(data_path, header=1).rename(columns=target_alias)
display(Markdown('### Conjunto de dados'))
display(df.head())

# replace() works on cell values, so the dictionary row that names the label
# ends up using the alias as well.
df_dict = pd.read_csv(dict_path).replace(target_alias)
display(Markdown('### Dicionário de dados'))
display(df_dict.head())

### Conjunto de dados

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT_PAY
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


### Dicionário de dados

Unnamed: 0,variavel,significado,tipo
0,ID,Identificador unico,continuo
1,LIMIT_BAL,Valor do crédito concedido (Novo Dolár Taiwanês ),continuo
2,SEX,Genero (1= masculino; 2= feminino),nominal
3,EDUCATION,Educação Nivel de Escolaridade (1 = pós-gradua...,nominal
4,MARRIAGE,Estado civil (1 = casado; 2 = solteiro; 3 = ou...,nominal


In [4]:
target_column = 'DEFAULT_PAY'
# Columns that are never used as model inputs: the row identifier and the label.
useless_columns = ['ID', target_column]

# Dictionary rows that describe usable feature columns.
is_feature = ~df_dict['variavel'].isin(useless_columns)

# Split the feature names by the type declared in the data dictionary.
continuous_columns = df_dict.loc[
    (df_dict['tipo'] == 'continuo') & is_feature, 'variavel'
].to_list()
nominal_columns = df_dict.loc[
    (df_dict['tipo'] == 'nominal') & is_feature, 'variavel'
].to_list()

In [5]:
# Continuous features: fill missing values with the column mean, then rescale
# each column to the [0, 1] range.
# TODO: outlier treatment and feature selection are planned steps that are not
# implemented yet.
continuous_preprocessor = Pipeline(steps=[
    ('missing_data', SimpleImputer(strategy='mean')),  # missing-data handling
    ('normalization', MinMaxScaler()),  # normalization
])

# Nominal features: fill missing values with the most frequent category, then
# one-hot encode. No scaling step is needed because the encoded columns only
# contain 0 and 1.
# TODO: outlier treatment and feature selection are planned steps that are not
# implemented yet.
nominal_preprocessor = Pipeline(steps=[
    ('missing_data', SimpleImputer(strategy='most_frequent')),  # missing-data handling
    # The encoder keyword that requests dense output was renamed from `sparse`
    # to `sparse_output`, and the old spelling was removed in scikit-learn 1.4.
    # To run on either side of that change the encoder keeps its default
    # (sparse) output and density is enforced on the ColumnTransformer below.
    # handle_unknown='ignore' encodes a category that was not present when the
    # encoder was fitted as all zeros instead of raising, so a rare category
    # missing from one training fold cannot make that fold fail.
    ('encoder', OneHotEncoder(handle_unknown='ignore')),  # categorical encoding
])

# Route each group of columns through its own pipeline. Columns not listed in
# either group are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ('continuous', continuous_preprocessor, continuous_columns),
        ('nominal', nominal_preprocessor, nominal_columns),
    ],
    sparse_threshold=0,  # always return a dense array, as the original setup did
)

In [6]:
# Candidate classifiers as (display name, unfitted estimator) pairs. The display
# name becomes the grouping key of the results table, so it must not carry
# stray whitespace.
models = [
    ("logistic regression", LogisticRegression()),
    # NOTE(review): max_iter=1000 caps the number of solver iterations. In the
    # stored results the SVM has the lowest accuracy (0.5875) and the largest
    # standard deviation, which is what an unconverged fit would look like.
    # Check for ConvergenceWarning and raise or remove the cap if it appears.
    ("support vector machine", SVC(max_iter=1000)),
    ("k nearest neighbors", KNeighborsClassifier()),
    # Seeded so that repeated runs of the notebook build the same tree.
    ("decision tree", DecisionTreeClassifier(random_state=42)),
]

In [7]:
# Feature matrix: every column except the identifier and the label.
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
X = df.drop(columns=useless_columns)
# Label as a 1-D Series. Selecting with a list (df[[...]]) would produce a
# single-column DataFrame, while scikit-learn estimators expect a 1-D target.
y = df[target_column]

In [8]:
n_splits = 10
# One seeded splitter shared by every model: with a fixed random_state each
# call to split() yields the same folds, so all models are scored on identical
# partitions.
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# X and y are built in the previous cell and are reused here as-is.
per_model_results = []
for model_name, model_obj in models:
    # Preprocessing sits inside the pipeline, so it is re-fitted on the
    # training part of every fold.
    approach = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model_obj)
    ])
    results = cross_validate(
        approach,
        X=X,
        # Flatten the target so a 1-D array is passed whether y is a Series or
        # a single-column DataFrame.
        y=np.ravel(y),
        scoring=['accuracy', 'f1', 'recall', 'precision'],
        cv=cv,
        n_jobs=-1
    )
    # Tag every fold-level row with the model it belongs to.
    results["name"] = [model_name] * n_splits
    per_model_results.append(results)

# Stack the per-model results column by column in a single pass. Keys (and
# therefore column order) follow the dict returned by cross_validate, with
# "name" last.
final_results = {
    key: np.concatenate([fold_scores[key] for fold_scores in per_model_results])
    for key in (per_model_results[0] if per_model_results else {})
}

In [9]:
# Mean and sample standard deviation of every fold-level measurement, per model.
# The aggregations are given by name: passing the NumPy callables (np.mean,
# np.std) to groupby().agg is deprecated since pandas 2.1, and the strings
# select the same pandas implementations those callables were mapped to.
pd.DataFrame(final_results).groupby('name').agg(['mean', 'std'])

Unnamed: 0_level_0,fit_time,fit_time,score_time,score_time,test_accuracy,test_accuracy,test_f1,test_f1,test_recall,test_recall,test_precision,test_precision
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
decision tree,1.930105,0.510347,0.022015,0.004512,0.723867,0.007931,0.398855,0.016174,0.414255,0.020339,0.384792,0.01557
k nearest neighbors,0.215015,0.086575,1.399151,0.30049,0.794067,0.004154,0.424895,0.01874,0.344332,0.021701,0.555752,0.015346
logistic regression,1.877162,0.624051,0.049284,0.020228,0.810267,0.005289,0.35308,0.029763,0.234632,0.024737,0.717113,0.026156
support vector machine,14.903103,4.201472,1.018914,0.335358,0.587533,0.085628,0.305647,0.046401,0.423265,0.159555,0.250333,0.028042
