In [84]:
import numpy as np
from math import sqrt
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris, load_diabetes
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

dataset = load_diabetes()
print(dataset.keys())
X, y = dataset.data, dataset.target
X.shape, y.shape

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])


((442, 10), (442,))

In [85]:
#Usando apenas Cross Validate

In [86]:
classificador_knn = KNeighborsRegressor()

scores_cv = cross_validate(classificador_knn, X, y, scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False), cv=10)
print(scores_cv['test_score'])
sempad = np.mean(scores_cv['test_score'])
print(f"Sem padronização: {sempad}")

[-60.5026207  -54.72698299 -67.61620973 -58.78417381 -61.34256271
 -60.96334219 -66.81650585 -53.16344438 -72.33537422 -54.4206094 ]
Sem padronização: -61.06718259764112


In [87]:
classificador_lr = LogisticRegression()

scores_cv = cross_validate(classificador_lr, X, y, scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False), cv=10)
print(scores_cv['test_score'])
sempad = np.mean(scores_cv['test_score'])
print(f"Sem padronização: {sempad}")

ValueError: n_splits=10 cannot be greater than the number of members in each class.

In [71]:
#Usando apenas Grid Search

In [72]:
hiper_parametros = {'n_neighbors': [3,5,7,9]}

grid_search_knn = GridSearchCV(classificador_knn , hiper_parametros, scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False))
grid_search_knn.fit(X, y)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [3, 5, 7, 9]},
             scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False))

In [73]:
grid_search_knn.cv_results_

{'mean_fit_time': array([0.00160279, 0.00139942, 0.00059991, 0.00059981]),
 'std_fit_time': array([0.0008042 , 0.00049182, 0.00080004, 0.00048975]),
 'mean_score_time': array([0.00219378, 0.00259829, 0.00159769, 0.00159869]),
 'std_score_time': array([0.00074917, 0.00049067, 0.00079795, 0.00048928]),
 'param_n_neighbors': masked_array(data=[3, 5, 7, 9],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 3},
  {'n_neighbors': 5},
  {'n_neighbors': 7},
  {'n_neighbors': 9}],
 'split0_test_score': array([-0., -0., -0., -0.]),
 'split1_test_score': array([-0.06085806, -0.03651484, -0.07824608, -0.1014301 ]),
 'split2_test_score': array([-0.36514837, -0.38122609, -0.34797764, -0.35311664]),
 'split3_test_score': array([-0.27216553, -0.22211108, -0.18257419, -0.19668027]),
 'split4_test_score': array([-0.43033148, -0.40331956, -0.39469268, -0.38596787]),
 'mean_test_score': array([-0.22570069, -0.20863431, -0.20069

In [74]:
grid_search_knn.best_estimator_

KNeighborsRegressor(n_neighbors=7)

In [75]:
#Usando Grid Search + Cross Validate

In [76]:
scores_cv_gs = cross_validate(grid_search_knn, X, y, scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False), cv=10, return_estimator=True)
print(scores_cv_gs['test_score'])
sempad = np.mean(scores_cv_gs['test_score'])
print(f"Sem padronização: {sempad}")

[-0.         -0.         -0.         -0.11065667 -0.37515429 -0.31031645
 -0.03688556 -0.28571429 -0.32249031 -0.2       ]
Sem padronização: -0.1641217553023404


In [77]:
scores_cv_gs

{'fit_time': array([0.10594249, 0.09594178, 0.06295967, 0.0829463 , 0.06295967,
        0.03997517, 0.04097557, 0.04097509, 0.04297543, 0.03997183]),
 'score_time': array([0.00498939, 0.00099993, 0.00099945, 0.0009985 , 0.00099945,
        0.00099969, 0.00099826, 0.00099969, 0.0009973 , 0.00100017]),
 'estimator': (GridSearchCV(estimator=KNeighborsRegressor(),
               param_grid={'n_neighbors': [3, 5, 7, 9]},
               scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False)),
  GridSearchCV(estimator=KNeighborsRegressor(),
               param_grid={'n_neighbors': [3, 5, 7, 9]},
               scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False)),
  GridSearchCV(estimator=KNeighborsRegressor(),
               param_grid={'n_neighbors': [3, 5, 7, 9]},
               scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False)),
  GridSearchCV(estimator=KNeighborsRegressor(),
               param_grid={'n_n

In [78]:
for estimator in scores_cv_gs['estimator']:
    print(estimator.best_estimator_)

KNeighborsRegressor()
KNeighborsRegressor()
KNeighborsRegressor()
KNeighborsRegressor(n_neighbors=7)
KNeighborsRegressor(n_neighbors=3)
KNeighborsRegressor(n_neighbors=3)
KNeighborsRegressor(n_neighbors=7)
KNeighborsRegressor(n_neighbors=7)
KNeighborsRegressor()
KNeighborsRegressor()


# Pipeline + GridSearch

In [82]:
#Com o Grid Search dentro do Pipeline
parametros = {'n_neighbors': [3,5,7]}

GridSearch_KNN = GridSearchCV(KNeighborsRegressor(), parametros, scoring='neg_root_mean_squared_error')

pipeline = Pipeline([
    ("padronização", StandardScaler()),
    ("gsknn", GridSearch_KNN)
])
scores_cv_pipe = cross_validate(pipeline, X, y, scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False),  cv=10)
print(scores_cv_pipe['test_score'])
compad = np.mean(scores_cv_pipe['test_score'])
print(f"Com padronização: {compad}")

[-0.         -0.         -0.17213259 -0.17689691 -0.36514837 -0.32203059
 -0.05163978 -0.37594326 -0.40368671 -0.27216553]
Com padronização: -0.2139643746805823


In [83]:
#Com o Pipeline dentro do Grid Search

pipeline = Pipeline([
    ("padronização", StandardScaler()),
    ("knn", KNeighborsRegressor())
])

parametros = {'knn__n_neighbors': [3,5,7]}

grid_search_pipeline = GridSearchCV(pipeline, parametros, scoring='neg_root_mean_squared_error')

scores_cv_gs_pipe = cross_validate(grid_search_pipeline, X, y, scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False), cv=10)
print(scores_cv_gs_pipe['test_score'])
compad = np.mean(scores_cv_gs_pipe['test_score'])
print(f"Com padronização: {compad}")

[-0.         -0.         -0.17213259 -0.17689691 -0.36514837 -0.32203059
 -0.05163978 -0.37594326 -0.40368671 -0.27216553]
Com padronização: -0.2139643746805823
