In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import pandas as pd
import numpy as np

In [2]:
df = (
    pd
    .read_csv(
        "../data/interim/data_pre_processados.csv",
    )
    # .drop(columns=["ID"], axis=1)
)

In [3]:
df.head()

Unnamed: 0,limite,sexo,educacao,estado_civil,idade,status_0509,status_0508,status_0507,status_0506,status_0505,...,saldo_0506,saldo_0505,saldo_0504,pago_0509,pago_0508,pago_0507,pago_0506,pago_0505,pago_0504,mau
0,20000,feminino,universidade,casado,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,sim
1,120000,feminino,universidade,solteiro,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,sim
2,90000,feminino,universidade,solteiro,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,nao
3,50000,feminino,universidade,casado,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,nao
4,50000,masculino,universidade,casado,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,nao


In [4]:
X = df.drop(columns=['mau'], axis=1)
y = df[['mau']]

In [5]:
nominal_columns = ['sexo', 'educacao', 'estado_civil']
numerical_columns = list(set(df.columns) - set(nominal_columns) - {'mau'}) 
# numerical_columns = [
#     coluna for coluna in df.columns if coluna not in nominal_columns and coluna != 'mau'
# ]

In [6]:
preprocessor = ColumnTransformer([
    ("one-hot-encoding", OneHotEncoder(), nominal_columns),
    ("scaler", StandardScaler(), numerical_columns)
])

In [7]:
models = {
    'logistic-regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'gradient-boosting': GradientBoostingClassifier()
}

In [8]:
results = {}
n_splits = 30
for model_name, model in models.items():
    scores = cross_validate(
        estimator = Pipeline(steps=[
            ("preprocessamento", preprocessor),
            ("modelo", model)
        ]),
        X = X,
        y = y.replace({'sim': 1, 'nao': 0}),
        cv = ShuffleSplit(n_splits=n_splits, test_size=.2),
        n_jobs = 2,
        scoring = ['accuracy', 'f1']
    )
    scores['model'] = [model_name] * n_splits
    if not(results):
        results = scores
    else:
        for key in results:
            results[key] = list(results[key])
            results[key].extend(scores[key])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_

In [9]:
pd.DataFrame(results).groupby('model').agg([np.mean, np.std]).transpose()

Unnamed: 0,model,KNN,SVM,gradient-boosting,logistic-regression
fit_time,mean,0.069133,43.300327,10.017252,0.510432
fit_time,std,0.02411,3.523539,0.065603,0.017412
score_time,mean,3.878011,4.77352,0.022062,0.026657
score_time,std,0.434461,0.362957,0.000911,0.002296
test_accuracy,mean,0.792544,0.820383,0.8205,0.811239
test_accuracy,std,0.003163,0.004064,0.004573,0.005688
test_f1,mean,0.428168,0.45558,0.471047,0.363469
test_f1,std,0.010449,0.012285,0.009977,0.011782
