In [1]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from IPython.display import Markdown
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from scipy.spatial.distance import cdist
    
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.pipeline import (
    Pipeline,
    TransformerMixin
)

from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder
)

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)

from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    StratifiedKFold,
    ShuffleSplit
)

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV

In [2]:
data_path = Path('../data/raw/data.xls')
dict_path = Path('../data/external/dicionario.csv')

In [3]:
df = (
    pd
    .read_excel(data_path, header=1)
    .rename(columns={
        'default payment next month': 'DEFAULT_PAY'
    })
).sample(1000)
display(Markdown('### Conjunto de dados'))
display(df.head())


df_dict = (
    pd
    .read_csv(dict_path)
    .replace({
        'default payment next month': 'DEFAULT_PAY'
    })
)
display(Markdown('### Dicionário de dados'))
display(df_dict.head())

### Conjunto de dados

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT_PAY
14405,14406,100000,2,1,2,26,0,0,0,-1,...,881,57540,55038,3000,1016,2007,57500,2000,0,0
25194,25195,30000,2,2,2,23,0,0,0,0,...,29497,29290,28693,10000,5000,3000,2000,3150,0,0
27264,27265,310000,1,2,2,41,0,0,0,0,...,168389,151755,73083,10250,9278,7000,7000,5000,188000,0
27996,27997,180000,2,1,2,31,-2,-2,-2,-2,...,0,4775,0,1771,1571,0,4775,0,0,0
28590,28591,80000,2,3,1,34,0,0,2,2,...,49423,46132,47233,5000,2000,2000,0,2000,2000,1


### Dicionário de dados

Unnamed: 0,variavel,significado,tipo
0,ID,Identificador unico,continuo
1,LIMIT_BAL,Valor do crédito concedido (Novo Dolár Taiwanês ),continuo
2,SEX,Genero (1= masculino; 2= feminino),nominal
3,EDUCATION,Educação Nivel de Escolaridade (1 = pós-gradua...,nominal
4,MARRIAGE,Estado civil (1 = casado; 2 = solteiro; 3 = ou...,nominal


In [4]:
df.DEFAULT_PAY.value_counts() / len(df)

0    0.79
1    0.21
Name: DEFAULT_PAY, dtype: float64

In [5]:
target_column = 'DEFAULT_PAY'
useless_columns = ['ID', target_column]
continuous_columns = (
    df_dict
    .query('tipo == "continuo" and variavel not in @useless_columns')
    .variavel
    .to_list()
)
nominal_columns = (
    df_dict
    .query('tipo == "nominal" and variavel not in @useless_columns')
    .variavel
    .to_list()
)

In [6]:
class OutlierExtractor(TransformerMixin):
    def __init__(self, columns, threshold=3, **kwargs):
        self.threshold = threshold
        self.columns = columns
        self.kwargs = kwargs

    def transform(self, X, y=None):
        X_ = np.asarray(X[self.columns])
        outliers_zscore = np.abs(cdist(self.avg[np.newaxis], X_) / self.std)[0]
        if y is not None:
            y = np.asarray(y)
            return (
                X[outliers_zscore <= self.threshold],
                y[outliers_zscore <= self.threshold]
            )
        
        return X[outliers_zscore <= self.threshold]
    
    def fit(self, X, y=None):
        X = np.asarray(X[self.columns])
        self.std = X.std()
        self.avg = X.mean(axis=0)
        return self

In [7]:
continuous_preprocessor = Pipeline(steps=[
    # ('outlier_extractor', OutlierExtractor(columns = continuous_columns)), # Tratamento de dados discrepantes
    ('missing_data', SimpleImputer(strategy='mean')), # Tratamento de dados faltantes
    ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=10)), # Seleção de variáveis
    ('normalization', MinMaxScaler()) # Normalização
])

nominal_preprocessor = Pipeline(steps=[
    # Tratamento de dados discrepantes
    ('missing_data', SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ('encoder', OneHotEncoder(sparse=False)), # Codificação de variáveis
    # não aplicar seleção apenas 3 variaveis nominais SEX, EDUCATION e MARRIAGE # Seleção de variáveis
    #('normalization', MinMaxScaler())# Normalização 
])

preprocessor = ColumnTransformer(transformers=[
    ('continuous', continuous_preprocessor, continuous_columns),
    ('nominal', nominal_preprocessor, nominal_columns)
])

In [8]:
models = [
    ("logistic regression", LogisticRegression(max_iter=1000, solver='saga'), {'penalty': ['none', 'l1', 'l2']}),
    # ("support vector machine ", SVC(max_iter=5000), {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}),
    ("k nearest neighbors", KNeighborsClassifier(), {'n_neighbors': [1, 3, 5, 9, 11], 'weights': ['uniform', 'distance']}),
    # ("decision tree", DecisionTreeClassifier(), {}),
    # ("random forest", RandomForestClassifier(), {}),
    # ("mlp", MLPClassifier(max_iter=5000), {}), 
    # ("gaussian naive bayes", GaussianNB(), {})
]

In [9]:
X = df.drop(columns=useless_columns, axis=1)
y = df[target_column]
# X = pd.DataFrame(preprocessor.fit_transform(X, y), columns = preprocessor.get_feature_names_out())


# X_train, X_both, y_train, y_both = train_test_split(X, y, test_size = 0.5)
# X_test, X_val, y_test, y_val = train_test_split(X_both, y_both, test_size = 0.5)


In [10]:
def undersampling(x, y):
    x = pd.DataFrame(x)
    y = pd.DataFrame(y)
    x['label'] = y
    x_0 = x[x['label'] == 0]
    x_1 = x[x['label'] == 1]
    x_0 = x_0.sample(n = x_1.shape[0])
    x = pd.concat([x_0, x_1], ignore_index=True, axis=0).sample(frac=1)
    return x.drop('label', axis=1), x['label']

In [11]:
X_train2, y_train2 = undersampling(X_train, y_train)

NameError: name 'X_train' is not defined

In [None]:
y_train2.value_counts()

In [12]:
def conf_matrix(y_test, predicted):
    cm = confusion_matrix(y_test, predicted)
    disp = ConfusionMatrixDisplay(
        confusion_matrix=cm
    )
    disp.plot()
    plt.show()

In [13]:
def train_model(X, y, n_splits=10, final_results = {}):
    for model_name, model_obj, model_params in models:
        model_gs = GridSearchCV(
            model_obj,
            model_params,
            scoring='accuracy',
            cv=5
        )
        approach = Pipeline([
            ("preprocessor", preprocessor),
            ("model", model_gs)
        ])
        results = cross_validate(
            approach,
            X=X,
            y=y,
            scoring=['accuracy', 'f1', 'recall', 'precision'],
            cv=ShuffleSplit(n_splits=n_splits, test_size=.2, random_state=42),
            n_jobs=-1
        )
        results["name"] = [model_name] * n_splits

        # clf = model_obj.fit(x, y)
        # y_pred = model_obj.predict(X_val)
        
        # print("=======================================")
        # print(model_name)
        # # conf_matrix(y_val, y_pred)
        
        if final_results:
            for key, value in results.items():
                final_results[key] = np.append(final_results[key], value)
        else:
            final_results = results
    return final_results
    

In [14]:
final_results = train_model(X, y)

In [18]:
def highlight_max(s, props=''):
    values = [float(value.split()[0]) for value in s.values[1:]]
    result = [''] * len(s.values)
    if s.values[0].endswith('time'):
        result[np.argmin(values)+1] = props
    else:
        result[np.argmax(values)+1] = props
    return result

def get_winner(s):
    metric = s.values[0]
    values = [float(value.split()[0]) for value in s.values[1:]]
    models = results.columns[1:]
    
    if s.values[0].endswith('time'):
        return models[np.argmin(values)]
    else:
        return models[np.argmax(values)]

results = (
    pd
    .DataFrame(final_results)
    .groupby(['name'])
    .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])#
    .transpose()
    .reset_index()
    .rename(columns={"level_0": "score"})
    .drop(columns="level_1")
    # .set_index('score')
)
time_scores = ['fit_time', 'score_time']
winner = results.query('score not in @time_scores').apply(get_winner, axis=1).value_counts().index[0]
results.columns.name = ''
results = (
    results
    .style
    .hide(axis='index')
    .apply(highlight_max, props='color:white;background-color:gray', axis=1)
)
display(results)
display(Markdown(f'O melhor modelo é o : **{winner}**'))

score,k nearest neighbors,logistic regression
fit_time,0.860 ± 0.184,3.016 ± 0.657
score_time,0.041 ± 0.020,0.014 ± 0.003
test_accuracy,0.798 ± 0.014,0.796 ± 0.023
test_f1,0.302 ± 0.078,0.297 ± 0.064
test_recall,0.205 ± 0.069,0.200 ± 0.057
test_precision,0.638 ± 0.102,0.628 ± 0.148


O melhor modelo é o : **k nearest neighbors**

Traceback (most recent call last):
  File "/home/madson/Workspace/atlantico/projects/risk-prediction/.venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 74, in _cached_call
    return cache[method]
KeyError: 'predict'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/madson/Workspace/atlantico/projects/risk-prediction/.venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/madson/Workspace/atlantico/projects/risk-prediction/.venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/madson/Workspace/atlantico/projects/risk-prediction/.venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/madson/Workspace/atl

In [None]:
final_results = train_model(X_train.drop('label', axis=1),y_train,X_val,y_val) ## Dados desbalanceados para a classe 0

In [None]:
final_results2 = train_model(X_train2,y_train2,X_val,y_val) ## Dados balanceados

In [None]:
pd.DataFrame(final_results).groupby('name').agg([np.mean, np.std])

In [None]:
pd.DataFrame(final_results2).groupby('name').agg([np.mean, np.std])