# [Wrap-up quiz 4](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_wrap_up_quiz.html)

In [1]:
import pandas as pd

# Load the raw penguins table from the local datasets folder.
penguins = pd.read_csv('./datasets/penguins.csv')

target_name = 'Species'
columns = ['Body Mass (g)', 'Flipper Length (mm)', 'Culmen Length (mm)']

# Restrict the table to the features and the target, then discard every row
# in which any of those columns is missing so that features and labels stay
# aligned row by row.
penguins_non_missing = penguins[[*columns, target_name]].dropna(
    axis=0, how='any'
)

# Split the cleaned frame into the label vector and the feature matrix.
target = penguins_non_missing[target_name]
data = penguins_non_missing[columns]

In [4]:
# Inspect the target Series: number of entries, non-null count and dtype.
target.info()

<class 'pandas.core.series.Series'>
Int64Index: 342 entries, 0 to 343
Series name: Species
Non-Null Count  Dtype 
--------------  ----- 
342 non-null    object
dtypes: object(1)
memory usage: 5.3+ KB


In [5]:
# Summarise the labels: row count, number of distinct values, and the most
# frequent value together with its frequency.
target.describe()

count                                     342
unique                                      3
top       Adelie Penguin (Pygoscelis adeliae)
freq                                      151
Name: Species, dtype: object

In [10]:
# Number of distinct classes in the target.
target.nunique()

3

The problem to be solved is a multiclass classification problem (more than 2 possible classes)

In [3]:
# Per-feature summary statistics (count, mean, std, min, quartiles, max);
# useful for comparing the value ranges of the features.
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [2]:
# Column dtypes and non-null counts of the feature frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 342 entries, 0 to 343
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Body Mass (g)        342 non-null    float64
 1   Flipper Length (mm)  342 non-null    float64
 2   Culmen Length (mm)   342 non-null    float64
dtypes: float64(3)
memory usage: 10.7 KB


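The class counts are NOT balanced: the most frequent class (Adelie) accounts for 151 of the 342 rows (about 44%), so the two other classes share the remaining 191 rows. `target.value_counts()` gives the exact count for each class.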

The input features do NOT have similar scales (ranges of values): body mass ranges from 2700 to 6300 g, whereas culmen length ranges from 32.1 to 59.6 mm.

In [48]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Two-step pipeline: standardise the features, then classify with a
# 5-nearest-neighbours model. The step names ('preprocessor', 'classifier')
# are the prefixes used to address nested parameters later on.
model = Pipeline([
    ('preprocessor', StandardScaler()),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),
])

In [18]:
from sklearn.model_selection import cross_validate

# Evaluate the pipeline with 10-fold cross-validation, scoring every fold
# with balanced accuracy.
cv_scores = cross_validate(
    estimator=model,
    X=data,
    y=target,
    scoring='balanced_accuracy',
    cv=10,
)

# Report the mean of the per-fold test scores with three decimals.
print(
    "Average cross-validates test balanced accuracy score: "
    + format(cv_scores['test_score'].mean(), ".3f")
)

Average cross-validates test balanced accuracy score: 0.952


In [20]:
# List every parameter of the pipeline; parameters of a nested step are
# exposed as "<step name>__<parameter name>".
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [49]:
from sklearn.model_selection import validation_curve

# Hyper-parameter to vary (addressed through the pipeline step name) and the
# candidate values to try.
param_name = 'classifier__n_neighbors'
param_range = [5, 51, 101]

# For each candidate value, run a 10-fold cross-validation scored with
# balanced accuracy and collect the train and test scores.
train_scores, test_scores = validation_curve(
    estimator=model,
    X=data,
    y=target,
    param_name=param_name,
    param_range=param_range,
    scoring='balanced_accuracy',
    cv=10,
)

In [30]:
import numpy as np

In [50]:
# Count the CV folds in which the first candidate of param_range
# (n_neighbors=5) scores strictly higher than the second (n_neighbors=51).
np.count_nonzero(test_scores[0] > test_scores[1])

4

Looking at the individual cross-validation scores, using a model with `n_neighbors=5` is NOT substantially better than a model with `n_neighbors=51`: it scores higher on only 4 of the 10 folds.


In [51]:
# Count the CV folds in which the first candidate of param_range
# (n_neighbors=5) scores strictly higher than the third (n_neighbors=101).
np.count_nonzero(test_scores[0] > test_scores[2])

10

Looking at the individual cross-validation scores, using a model with `n_neighbors=5` is substantially better than a model with `n_neighbors=101`: it scores higher on all 10 folds.

In [52]:
# 5-nearest-neighbours classifier used on its own, i.e. without any
# preprocessing step in front of it.
model_unscaled = KNeighborsClassifier(n_neighbors=5)

In [53]:
# Same 10-fold, balanced-accuracy evaluation as before, this time for the
# classifier fitted on the raw features.
cv_scores_unscaled = cross_validate(
    estimator=model_unscaled,
    X=data,
    y=target,
    scoring='balanced_accuracy',
    cv=10,
)

In [61]:
# Count the CV folds in which the scaled pipeline with n_neighbors=5
# (first row of test_scores) scores strictly higher than the unscaled model.
np.count_nonzero(test_scores[0] > cv_scores_unscaled['test_score'])

10

In [59]:
# Per-fold test scores of the unscaled model.
cv_scores_unscaled['test_score']

array([0.66468254, 0.73601954, 0.74102564, 0.7042735 , 0.58412698,
       0.66984127, 0.83492063, 0.74285714, 0.88253968, 0.83809524])

In [60]:
# Per-fold test scores of the scaled pipeline for the first candidate of
# param_range (n_neighbors=5).
test_scores[0]

array([1.        , 1.        , 1.        , 0.91880342, 0.88253968,
       0.95238095, 0.97777778, 0.93015873, 0.90793651, 0.95238095])

Looking at the individual cross-validation scores, a 5-nearest-neighbors model using a `StandardScaler` is substantially better than a 5-nearest-neighbors model using the raw features (without scaling): it scores higher on all 10 folds, so at least 7 of the cross-validation scores are better.

In [65]:
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, PowerTransformer

# Candidate preprocessing steps to compare. The None entry stands for
# "no preprocessing" (scikit-learn skips a pipeline step set to None).
all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method='box-cox'),
]

# Search space: every preprocessor combined with every neighbourhood size.
# The key order (preprocessor first) is kept because it determines the
# column order when the parameter columns are selected from the results.
param_grid = dict(
    preprocessor=all_preprocessors,
    classifier__n_neighbors=[5, 51, 101],
)

In [63]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [69]:
# Rebuild the two-step pipeline (this rebinds `model`). The preprocessor and
# the number of neighbours set here are only starting values: both are
# overridden by the entries of `param_grid` during the search.
model = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])

# Exhaustive search over `param_grid`, evaluating each combination with
# 10-fold cross-validation scored by balanced accuracy, using 4 parallel jobs.
gridsearch = GridSearchCV(
    model, param_grid=param_grid,
    cv=10, scoring='balanced_accuracy', n_jobs=4
)

TypeError: GridSearchCV.__init__() got multiple values for argument 'param_grid'

In [70]:
# Run the grid search on the features and labels.
gridsearch.fit(data, target)

In [88]:
# Put the per-candidate grid-search results in a DataFrame and order the
# candidates from the best to the worst mean test score.
cv_results = (
    pd.DataFrame(gridsearch.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,0.006578,0.000898,0.00816,0.000783,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,0.918803,0.88254,0.952381,0.977778,0.930159,0.907937,0.952381,0.952198,0.039902,1
2,0.005309,0.000653,0.006965,0.000604,5,MinMaxScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.952381,1.0,0.944444,0.88254,0.930159,0.955556,0.952381,0.907937,0.952381,0.947778,0.034268,2
3,0.006465,0.000899,0.006506,0.000495,5,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.952381,0.92674,1.0,0.918803,0.904762,1.0,0.977778,0.930159,0.907937,0.952381,0.947094,0.033797,3
4,0.008953,0.002144,0.00612,0.000553,5,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.930159,0.907937,1.0,0.94696,0.047387,4
6,0.004229,0.000566,0.006067,0.000453,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.952381,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.952381,0.930159,0.952381,0.94188,0.038905,5


Let us focus on the most interesting columns and shorten the parameter
names to remove the `"param_classifier__"` prefix for readability:

In [89]:
# Keep one "param_<name>" column per tuned hyper-parameter, followed by the
# columns that summarise the test score.
column_results = [
    *(f"param_{name}" for name in param_grid),
    "mean_test_score",
    "std_test_score",
    "rank_test_score",
]
cv_results = cv_results[column_results]

In [90]:
def shorten_param(param_name):
    """Return the part of ``param_name`` that follows its last ``"__"``.

    Names that contain no double underscore are returned unchanged, e.g.
    ``"param_classifier__n_neighbors"`` becomes ``"n_neighbors"`` while
    ``"param_preprocessor"`` stays as it is.
    """
    # rpartition yields ("", "", param_name) when the separator is absent,
    # so its last element is the wanted result in both cases.
    return param_name.rpartition("__")[2]


# Apply shorten_param to every column label (axis=1), then display the
# resulting table.
cv_results = cv_results.rename(shorten_param, axis=1)
cv_results

Unnamed: 0,param_preprocessor,n_neighbors,mean_test_score,std_test_score,rank_test_score
1,StandardScaler(),5,0.952198,0.039902,1
2,MinMaxScaler(),5,0.947778,0.034268,2
3,QuantileTransformer(n_quantiles=100),5,0.947094,0.033797,3
4,PowerTransformer(method='box-cox'),5,0.94696,0.047387,4
6,StandardScaler(),51,0.94188,0.038905,5
8,QuantileTransformer(n_quantiles=100),51,0.927277,0.043759,6
9,PowerTransformer(method='box-cox'),51,0.922833,0.047883,7
7,MinMaxScaler(),51,0.920293,0.045516,8
11,StandardScaler(),101,0.876642,0.041618,9
12,MinMaxScaler(),101,0.862357,0.046244,10
