# [Wrap-up quiz 4](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_wrap_up_quiz.html)

In [1]:
import pandas as pd

penguins = pd.read_csv('./datasets/penguins.csv')

columns = ['Body Mass (g)', 'Flipper Length (mm)', 'Culmen Length (mm)']
target_name = 'Species'

# Remove lines with missing values for the columns of interest
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

In [2]:
target.info()

<class 'pandas.core.series.Series'>
Int64Index: 342 entries, 0 to 343
Series name: Species
Non-Null Count  Dtype 
--------------  ----- 
342 non-null    object
dtypes: object(1)
memory usage: 5.3+ KB


In [3]:
target.describe()

count                                     342
unique                                      3
top       Adelie Penguin (Pygoscelis adeliae)
freq                                      151
Name: Species, dtype: object

In [4]:
target.nunique()

3

The problem to be solved is a multiclass classification problem (more than 2 possible classes)

In [5]:
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 342 entries, 0 to 343
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Body Mass (g)        342 non-null    float64
 1   Flipper Length (mm)  342 non-null    float64
 2   Culmen Length (mm)   342 non-null    float64
dtypes: float64(3)
memory usage: 10.7 KB


The proportion of the class counts are balanced: There are approximately the same number of rows for each class.

The input feature DO NOT have similar scales (range of values)

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

model = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('classifier', KNeighborsClassifier(n_neighbors=5))
])

In [8]:
from sklearn.model_selection import cross_validate

cv_scores = cross_validate(
    model, data, target, cv=10, scoring='balanced_accuracy'
)

print(f"Average cross-validates test balanced accuracy score: "
      f"{cv_scores['test_score'].mean():.3f}")

Average cross-validates test balanced accuracy score: 0.952


In [9]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [10]:
from sklearn.model_selection import validation_curve

param_range = [5, 51, 101]
param_name = 'classifier__n_neighbors'

train_scores, test_scores = validation_curve(
    model, data, target, cv=10, scoring='balanced_accuracy',
    param_name=param_name, param_range=param_range,
)

In [11]:
import numpy as np

In [12]:
np.count_nonzero(test_scores[0] > test_scores[1])

4

Looking at the individual cross-validation scores, using a model with `n_neighbors=5` is NOT  substantially better than a model with `n_neighbors=51`.


In [13]:
np.count_nonzero(test_scores[0] > test_scores[2])

10

Looking at the individual cross-validation scores, using a model with `n_neighbors=5` is substantially better than a model with `n_neighbors=101`

In [14]:
model_unscaled = KNeighborsClassifier(n_neighbors=5)

In [15]:
cv_scores_unscaled = cross_validate(
    model_unscaled, data, target, cv=10, scoring='balanced_accuracy'
)

In [16]:
np.count_nonzero(test_scores[0] > cv_scores_unscaled['test_score'])

10

In [17]:
cv_scores_unscaled['test_score']

array([0.66468254, 0.73601954, 0.74102564, 0.7042735 , 0.58412698,
       0.66984127, 0.83492063, 0.74285714, 0.88253968, 0.83809524])

In [18]:
test_scores[0]

array([1.        , 1.        , 1.        , 0.91880342, 0.88253968,
       0.95238095, 0.97777778, 0.93015873, 0.90793651, 0.95238095])

Looking at the individual cross-validation scores, a 5 neares neighbors using a `StandardScaler` is substantially better (at least 7 of the cross-validations scores are better) than a 5 nearest neighbors using the raw features (without scaling).

In [19]:
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, PowerTransformer

all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method='box-cox')
]

param_grid = {
    'preprocessor': all_preprocessors,
    'classifier__n_neighbors': [5, 51, 101]
}

In [20]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [21]:
model = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])

gridsearch = GridSearchCV(
    model, param_grid=param_grid,
    cv=10, scoring='balanced_accuracy', n_jobs=4
)

In [22]:
gridsearch.fit(data, target)

In [31]:
cv_results = pd.DataFrame(gridsearch.cv_results_).sort_values(
    "mean_test_score", ascending=False)

In [32]:
cv_results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_classifier__n_neighbors', 'param_preprocessor', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'split5_test_score',
       'split6_test_score', 'split7_test_score', 'split8_test_score',
       'split9_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score'],
      dtype='object')

In [42]:
# get the parameter names
column_results = [f"param_{name}" for name in param_grid.keys()]
column_results += [
    #"mean_test_score", "std_test_score", "rank_test_score"] + [\
    f"split{n}_test_score" for n in range(10)]
cv_results_selected = cv_results[column_results]

In [89]:
np.count_nonzero(cv_results_selected[:1][[f"split{n}_test_score" for n in range(10)]].to_numpy() > cv_results_selected[8:9][[f"split{n}_test_score" for n in range(10)]].to_numpy())

10

Looking at the individual cross-validation scores, the best ranked model using a `StandardScaler` is substantially better only after the 9th ranked model

In [88]:
cv_results_selected

Unnamed: 0,param_preprocessor,param_classifier__n_neighbors,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score
1,StandardScaler(),5,1.0,1.0,1.0,0.918803,0.88254,0.952381,0.977778,0.930159,0.907937,0.952381
2,MinMaxScaler(),5,1.0,0.952381,1.0,0.944444,0.88254,0.930159,0.955556,0.952381,0.907937,0.952381
3,QuantileTransformer(n_quantiles=100),5,0.952381,0.92674,1.0,0.918803,0.904762,1.0,0.977778,0.930159,0.907937,0.952381
4,PowerTransformer(method='box-cox'),5,1.0,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.930159,0.907937,1.0
6,StandardScaler(),51,0.952381,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.952381,0.930159,0.952381
8,QuantileTransformer(n_quantiles=100),51,0.857143,0.952381,1.0,0.863248,0.904762,0.904762,0.977778,0.930159,0.930159,0.952381
9,PowerTransformer(method='box-cox'),51,0.904762,0.977778,1.0,0.863248,0.834921,0.952381,0.907937,0.952381,0.930159,0.904762
7,MinMaxScaler(),51,0.904762,0.952381,1.0,0.863248,0.834921,0.952381,0.907937,0.952381,0.930159,0.904762
11,StandardScaler(),101,0.857143,0.952381,0.944444,0.863248,0.834921,0.857143,0.834921,0.88254,0.834921,0.904762
12,MinMaxScaler(),101,0.857143,0.857143,0.944444,0.863248,0.834921,0.857143,0.765079,0.904762,0.834921,0.904762


Using any of the preprocessos has always a better ranking than using no preprocessor, irrespective of the valye of `n_neighbors`, since `None` are ranked as the three sorst methods

In [91]:
# count how many times the model on the left hand side of the inequality is better than the other.
split_test_score = [f"split{n}_test_score" for n in range(10)]

np.count_nonzero(
    cv_results_selected[:1][split_test_score].to_numpy() > cv_results_selected[5:6][split_test_score].to_numpy()
)

4

Looking at the individual cross-validation scores, the model with `n_neighbors=5` and `StandardScaler` is NOT substaintially better than the model with `n_neighbors=51` and `StandardScaler`

In [92]:
# count how many times the model on the left hand side of the inequality is better than the other.
split_test_score = [f"split{n}_test_score" for n in range(10)]

np.count_nonzero(
    cv_results_selected[5:6][split_test_score].to_numpy() > cv_results_selected[10:11][split_test_score].to_numpy()
)

8

Looking at the individual cross-validation scores, the model with `n_neighbors=51` and `StandardScaler` is substantially better (8 of the cross-validation scores ar better) than the model with `n_neighbors=101` and `StandardScaler`

# Nested cross-validation

In [106]:
from sklearn.model_selection import KFold, cross_val_score

In [117]:
all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method='box-cox')
]

param_grid = {
    'preprocessor': all_preprocessors,
    'classifier__n_neighbors': [5, 51, 101]
}

In [118]:
model = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])

In [119]:
NUM_TRIALS = 30

non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

for i in range(NUM_TRIALS):
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=0)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=0)

    # Non-nested CV
    clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=outer_cv, scoring='balanced_accuracy')
    clf.fit(data, target)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization
    clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv, scoring='balanced_accuracy')
    nested_scores = cross_val_score(clf, data, target, cv=outer_cv)
    nested_scores[i] = nested_scores.mean()

Traceback (most recent call last):
  File "/home/ageo/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/ageo/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/ageo/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/ageo/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/home/ageo/.local/lib/python3.10/site-packages/sklearn/utils/_response.py", line 85, in _get_response_values
    y_pred = prediction_method(X)
  File "/home/ageo/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 508, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)


IndexError: index 4 is out of bounds for axis 0 with size 4

In [120]:
non_nested_scores

array([0.95470915, 0.95470915, 0.95470915, 0.95470915, 0.95470915,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [121]:
nested_scores

array([0.98007856, 0.90572391, 0.99166667, 0.9388422 ])