In [92]:
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt
dataset = fetch_openml(data_id='1499')
X, y = dataset.data, dataset.target.astype(int)
X.shape, y.shape, set(y)



((210, 7), (210,), {1, 2, 3})

# Modelos de Avaliação

In [254]:
# classificadores
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor

# divisao do dataset
from sklearn.model_selection import RepeatedKFold, GridSearchCV

# preprocessamento
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, MaxAbsScaler

# métricas 
from sklearn.metrics import mean_squared_log_error, mean_squared_error

# extras
import numpy as np 

# ---------------------------------------------------
# sem realizar nenhum pré processamento
params_model_1_knn = {'n_neighbors':[3, 5 , 7]}
params_model_1_logistic = {'fit_intercept':[True, False]}
model_1_knn = GridSearchCV(KNeighborsRegressor(), params_model_1_knn)
model_1_logistic = GridSearchCV(LogisticRegression(), params_model_1_logistic)

# ---------------------------------------------------
# realizando um pré processamento Normalizer
pipeline_2_knn = Pipeline([
    ('normalize', Normalizer()),
    ('knn', KNeighborsRegressor())
])
pipeline_2_logistic = Pipeline([
    ('normalize', Normalizer()),
    ('logistic', LogisticRegression())
])

model_2_knn = GridSearchCV(pipeline_2_knn, 
{'knn__n_neighbors':[3, 5 , 7]}, scoring='explained_variance')

model_2_logistic = GridSearchCV(pipeline_2_logistic, {'logistic__fit_intercept':[True, False]}, scoring='explained_variance')

# ---------------------------------------------------
# realizando um pré processamento StandardScaler
pipeline_3_knn = Pipeline([
    ('normalize', StandardScaler()),
    ('knn', KNeighborsRegressor())
])
pipeline_3_logistic = Pipeline([
    ('normalize', StandardScaler()),
    ('logistic', LogisticRegression())
])

model_3_knn = GridSearchCV(pipeline_3_knn,{'knn__n_neighbors':[3, 5 , 7]}, scoring='explained_variance')
model_3_logistic = GridSearchCV(pipeline_3_logistic,{'logistic__fit_intercept':[True, False]}, scoring='explained_variance')

# ---------------------------------------------------
# realizando um pré processamento MaxAbsScaler
pipeline_4_knn = Pipeline([
    ('normalize', MaxAbsScaler()),
    ('knn', KNeighborsRegressor())
])
pipeline_4_logistic = Pipeline([
    ('normalize', MaxAbsScaler()),
    ('logistic', LogisticRegression())
])

model_4_knn = GridSearchCV(pipeline_4_knn,{'knn__n_neighbors':[3, 5 , 7]}, scoring='explained_variance')
model_4_logistic = GridSearchCV(pipeline_4_logistic,{'logistic__fit_intercept':[True, False]}, scoring='explained_variance')

# ---------------------------------------------------
# prepara a lista de avaliações
evaluates = {
    1 : {'knn':model_1_knn, 'logistic':model_1_logistic, 'data':[], 'title':'Sem Pré-processamento'},
    2 : {'knn':model_2_knn, 'logistic':model_2_logistic, 'data':[], 'title':'Com Normalizer'},
    3 : {'knn':model_3_knn, 'logistic':model_3_logistic, 'data':[], 'title':'Com StandardScaler'},
    4 : {'knn':model_4_knn, 'logistic':model_4_logistic, 'data':[], 'title':'Com MaxAbsScaler'},
}


# Avaliações

In [255]:
from sklearn.model_selection import RepeatedKFold

import warnings
warnings.filterwarnings('ignore')

# realização 10 repetições do kfold, para k=5 (5 folds)
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=150)

def evaluate(model, X_train, y_train, X_test, y_test):
    """ função para treinar o modelo,  realizar a predição e calcular as métricas 
    """
    # treina e realiza a predição
    model.fit(X_train, y_train)
    ypred = model.predict(X_test)

    # calcula as métricas
    hits  = ypred == y_test
    hits_percent = sum(hits)/len(hits)   # hits 
    neg_msle = mean_squared_log_error(y_test, ypred)   
    root_mse = mean_squared_error(y_test, ypred, squared=False) 
    return [hits_percent, neg_msle, root_mse]

# para cada repetição de kfold
for train_index, test_index in rkf.split(X):
    # obtém os dados da repetição i
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # para cada classificador 
    for classifier in classifiers:        
        # calcula os resultados do knn
        result_knn = evaluate(classifier['knn'], X_train, y_train, X_test, y_test)
        # calcula os resultados do logistic
        result_logistic = evaluate(classifier['logistic'], X_train, y_train, X_test, y_test)

        # adiciona na lista da respectiva avaliação
        evaluates[classifier['id']]['data'].append([result_knn, result_logistic])

# Apresentação dos resultados

In [256]:
# para cada avalição realizada
for e in evaluates:
    # dados da avalição
    data = np.array(evaluates[e]['data'])
    title = evaluates[e]['title']    

    # métricas
    ## hits 
    knn_mean_hits = np.mean(data[:, 0, 0])*100
    logistic_mean_hits = np.mean(data[:, 1, 0])*100

    ## mean squared log_error
    knn_mean_mslr = np.mean(data[:, 0, 1])*100
    logistic_mean_mslr = np.mean(data[:, 1, 1])*100

    ## mean squared error
    knn_mean_rmse = np.mean(data[:, 0, 2])*100
    logistic_mean_rmse = np.mean(data[:, 1, 2])*100
    
    print('---'*30)
    print(f'Avaliação {e} - {title}')
    print(f'\tKNN: {"":5}hits: {knn_mean_hits:.2f}%, {"":5} mean_squared_log_error: {knn_mean_mslr:.2f}%, {"":5} root_mse: {knn_mean_rmse:.2f}%')
    print(f'\tLogistic: hits: {logistic_mean_hits:.2f}%, {"":5} mean_squared_log_error: {logistic_mean_mslr:.2f}%, {"":5} root_mse: {logistic_mean_rmse:.2f}%')
    print('')

------------------------------------------------------------------------------------------
Avaliação 1 - Sem Pré-processamento
	KNN:      hits: 74.14%,       mean_squared_log_error: 2.51%,       root_mse: 42.28%
	Logistic: hits: 91.76%,       mean_squared_log_error: 2.72%,       root_mse: 44.02%

------------------------------------------------------------------------------------------
Avaliação 2 - Com Normalizer
	KNN:      hits: 71.57%,       mean_squared_log_error: 2.35%,       root_mse: 41.23%
	Logistic: hits: 79.81%,       mean_squared_log_error: 5.89%,       root_mse: 66.22%

------------------------------------------------------------------------------------------
Avaliação 3 - Com StandardScaler
	KNN:      hits: 72.95%,       mean_squared_log_error: 2.11%,       root_mse: 38.59%
	Logistic: hits: 92.57%,       mean_squared_log_error: 2.67%,       root_mse: 44.10%

------------------------------------------------------------------------------------------
Avaliação 4 - Com MaxAbsS

In [None]:
{}