In [36]:
import pandas as pd

# Load the raw penguins table from the datasets folder.
penguins = pd.read_csv("../datasets/penguins.csv")

# Numerical features used for classification, and the column holding the
# class label.
columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Restrict the table to the columns of interest, then discard every row that
# has a missing value in any of them.
penguins_non_missing = penguins.loc[:, [*columns, target_name]].dropna()

# Split the cleaned table into the feature matrix and the target vector.
data = penguins_non_missing.loc[:, columns]
target = penguins_non_missing.loc[:, target_name]

In [37]:
# Preview the first rows of the feature table.
data.head()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
0,3750.0,181.0,39.1
1,3800.0,186.0,39.5
2,3250.0,195.0,40.3
4,3450.0,193.0,36.7
5,3650.0,190.0,39.3


In [38]:
# Preview the first rows of the target (species label) column.
target.head()

0    Adelie Penguin (Pygoscelis adeliae)
1    Adelie Penguin (Pygoscelis adeliae)
2    Adelie Penguin (Pygoscelis adeliae)
4    Adelie Penguin (Pygoscelis adeliae)
5    Adelie Penguin (Pygoscelis adeliae)
Name: Species, dtype: object

In [39]:
# Distinct class labels present in the target.
target.unique()

array(['Adelie Penguin (Pygoscelis adeliae)',
       'Gentoo penguin (Pygoscelis papua)',
       'Chinstrap penguin (Pygoscelis antarctica)'], dtype=object)

In [40]:
# Summary of the target column: number of rows, number of distinct labels,
# most frequent label and how often it occurs.
target.describe()

count                                     342
unique                                      3
top       Adelie Penguin (Pygoscelis adeliae)
freq                                      151
Name: Species, dtype: object

In [41]:
# Number of samples per species, to see how evenly the classes are represented.
target.value_counts()

Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64

In [42]:
# Summary statistics (count, mean, std, min, quartiles, max) of each feature.
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [55]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Two-step pipeline: standardize the features, then classify with k-nearest
# neighbors (k=5). The step names "preprocessor" and "classifier" are the
# prefixes used later when setting or searching over parameters.
model = Pipeline([
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

In [44]:
# List the parameter names exposed by the pipeline. Parameters of a step are
# addressed as "<step name>__<parameter>", e.g. "classifier__n_neighbors".
model.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'classifier', 'preprocessor__copy', 'preprocessor__with_mean', 'preprocessor__with_std', 'classifier__algorithm', 'classifier__leaf_size', 'classifier__metric', 'classifier__metric_params', 'classifier__n_jobs', 'classifier__n_neighbors', 'classifier__p', 'classifier__weights'])

In [45]:
from sklearn.model_selection import cross_validate

In [56]:
# 10-fold cross-validation of the pipeline, scored with balanced accuracy.
cv_results = cross_validate(
    estimator=model,
    X=data,
    y=target,
    scoring="balanced_accuracy",
    cv=10,
)

In [57]:
# Show the raw cross-validation results: fit times, score times and the
# test score of each fold.
cv_results

{'fit_time': array([0.00425935, 0.00314999, 0.0029583 , 0.00293803, 0.00291705,
        0.00291276, 0.00290108, 0.00289202, 0.00290632, 0.00288653]),
 'score_time': array([0.00276279, 0.00250697, 0.00248575, 0.00246811, 0.00246048,
        0.00244761, 0.00245857, 0.00243878, 0.00245667, 0.00244093]),
 'test_score': array([1.        , 1.        , 1.        , 0.91880342, 0.88253968,
        0.95238095, 0.97777778, 0.93015873, 0.90793651, 0.95238095])}

In [58]:
# Average of the per-fold test scores.
cv_results["test_score"].mean()

0.9521978021978021

In [59]:
# Repeat the evaluation with a much larger neighborhood (k=51).
# Note: `set_params` changes `model` itself, so later cells see k=51.
model.set_params(**{"classifier__n_neighbors": 51})

cv_results = cross_validate(
    estimator=model,
    X=data,
    y=target,
    scoring="balanced_accuracy",
    cv=10,
)
cv_results["test_score"].mean()

0.9418803418803419

In [62]:
# Repeat the evaluation with an even larger neighborhood (k=101).
# Note: `set_params` changes `model` itself, so later cells see k=101.
model.set_params(**{"classifier__n_neighbors": 101})

cv_results = cross_validate(
    estimator=model,
    X=data,
    y=target,
    scoring="balanced_accuracy",
    cv=10,
)
cv_results["test_score"].mean()

0.8766422466422465

In [51]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


# Candidate values for the pipeline's "preprocessor" step.
all_preprocessors = [
    # No preprocessing: the classifier sees the raw features.
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    # NOTE(review): box-cox only accepts strictly positive inputs; the three
    # features used here are physical measurements, so this should hold.
    PowerTransformer(method="box-cox"),
]

In [53]:
from sklearn.model_selection import GridSearchCV

In [54]:
# Hyperparameters to search over: which preprocessing step to use and how
# many neighbors the classifier considers.
param_grid = {
    "preprocessor": all_preprocessors,
    "classifier__n_neighbors": [5, 51, 101],
}

# Nested cross-validation: the inner grid search picks the best parameters
# on each training split, and the outer 10-fold loop evaluates the tuned
# model. The inner search is scored with the same metric as the outer loop,
# so parameters are selected for the metric that is actually reported
# (the default scorer would have been plain accuracy).
model_grid_search = GridSearchCV(
    model, param_grid=param_grid, scoring="balanced_accuracy")
cv_results = cross_validate(
    model_grid_search, data, target, cv=10, scoring="balanced_accuracy",
    return_estimator=True)

scores = cv_results["test_score"]
print(f"Balanced accuracy score by cross-validation combined with "
      f"hyperparameters search:\n{scores.mean():.3f} +/- {scores.std():.3f}")
# `return_estimator=True` keeps the fitted search of every outer fold, which
# lets us inspect how stable the selected parameters are across folds.
for fold_idx, estimator in enumerate(cv_results["estimator"], start=1):
    print(f"Best parameter found on fold #{fold_idx}")
    print(f"{estimator.best_params_}")


Accuracy score by cross-validation combined with hyperparameters search:
0.947 +/- 0.036
Best parameter found on fold #1
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best parameter found on fold #2
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best parameter found on fold #3
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best parameter found on fold #4
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
Best parameter found on fold #5
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best parameter found on fold #6
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best parameter found on fold #7
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best parameter found on fold #8
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best parameter found on fold #9
{'cla

In [61]:
# List the pipeline's parameter names again (same call as earlier in the
# notebook); step parameters appear as "<step name>__<parameter>".
model.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'classifier', 'preprocessor__copy', 'preprocessor__with_mean', 'preprocessor__with_std', 'classifier__algorithm', 'classifier__leaf_size', 'classifier__metric', 'classifier__metric_params', 'classifier__n_jobs', 'classifier__n_neighbors', 'classifier__p', 'classifier__weights'])