# Q1

In [2]:
import pandas as pd

penguins = pd.read_csv("../datasets/penguins.csv")

columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Remove lines with missing values for the columns of interest
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

In [5]:
target.value_counts() # target.nunique()

Species
Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: count, dtype: int64

# Q2

In [11]:
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [15]:
pd.DataFrame(target.value_counts())

Unnamed: 0_level_0,count
Species,Unnamed: 1_level_1
Adelie Penguin (Pygoscelis adeliae),151
Gentoo penguin (Pygoscelis papua),123
Chinstrap penguin (Pygoscelis antarctica),68


# Q3

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

In [17]:
from sklearn.model_selection import cross_val_score

cv_results = cross_val_score(
    model, data, target, cv=10, scoring="balanced_accuracy"
)

In [21]:
print(f"The cross-validation score is {cv_results.mean():.3f} ± {cv_results.std():.3f}")

The cross-validation score is 0.952 ± 0.040


# Q4

In [22]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'transform_input': None,
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [29]:
for i in (5, 51, 101):
    model.set_params(classifier__n_neighbors=i)
    cv_results = cross_val_score(model, data, target, cv=10, scoring="balanced_accuracy")
    print(f"The cross-validation score for n_neighbors={i} is:\n{pd.DataFrame(cv_results)}")

The cross-validation score for n_neighbors=5 is:
          0
0  1.000000
1  1.000000
2  1.000000
3  0.918803
4  0.882540
5  0.952381
6  0.977778
7  0.930159
8  0.907937
9  0.952381
The cross-validation score for n_neighbors=51 is:
          0
0  0.952381
1  0.977778
2  1.000000
3  0.863248
4  0.882540
5  0.952381
6  0.955556
7  0.952381
8  0.930159
9  0.952381
The cross-validation score for n_neighbors=101 is:
          0
0  0.857143
1  0.952381
2  0.944444
3  0.863248
4  0.834921
5  0.857143
6  0.834921
7  0.882540
8  0.834921
9  0.904762


In [60]:
model_raw = KNeighborsClassifier(n_neighbors=5)
cv_results_raw = cross_val_score(
    model_raw, data, target, cv=10
)

In [62]:
pd.DataFrame(cv_results_raw)
print(f"The cross-validation score when raw is:\n{pd.DataFrame(cv_results_raw)}")

The cross-validation score when raw is:
          0
0  0.742857
1  0.800000
2  0.794118
3  0.794118
4  0.647059
5  0.764706
6  0.882353
7  0.794118
8  0.911765
9  0.852941


# Q5

In [63]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

In [64]:
from sklearn.model_selection import GridSearchCV

In [66]:
param_grid = {
    "preprocessor": all_preprocessors,
    "classifier__n_neighbors": [5, 51, 101]
}

In [68]:
model_grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=10
)

In [69]:
model_grid_search.fit(data, target)

0,1,2
,estimator,Pipeline(step...ghbors=101))])
,param_grid,"{'classifier__n_neighbors': [5, 51, ...], 'preprocessor': [None, StandardScaler(), ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [70]:
model_grid_search.best_params_

{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}

In [72]:
pd.DataFrame(model_grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002029,0.000342,0.002472,0.000293,5,,"{'classifier__n_neighbors': 5, 'preprocessor':...",0.742857,0.8,0.794118,0.794118,0.647059,0.764706,0.882353,0.794118,0.911765,0.852941,0.798403,0.070751,13
1,0.002962,0.000347,0.00256,0.000297,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,0.941176,0.911765,0.970588,0.970588,0.941176,0.911765,0.970588,0.961765,0.032353,1
2,0.002742,0.000212,0.002495,0.00026,5,MinMaxScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.971429,1.0,0.970588,0.911765,0.941176,0.941176,0.970588,0.911765,0.970588,0.958908,0.030028,3
3,0.003868,0.000632,0.003106,0.001576,5,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.971429,0.942857,1.0,0.941176,0.941176,1.0,0.970588,0.941176,0.911765,0.970588,0.959076,0.026888,2
4,0.049755,0.007,0.002839,0.000365,5,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.971429,1.0,0.911765,0.911765,0.970588,0.941176,0.941176,0.911765,1.0,0.955966,0.035452,5
5,0.001693,0.000173,0.002325,0.0003,51,,"{'classifier__n_neighbors': 51, 'preprocessor'...",0.742857,0.685714,0.735294,0.705882,0.647059,0.764706,0.735294,0.735294,0.764706,0.764706,0.728151,0.036402,15
6,0.002666,0.000257,0.002503,0.000309,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.971429,0.971429,1.0,0.911765,0.911765,0.970588,0.941176,0.970588,0.941176,0.970588,0.95605,0.027209,4
7,0.002701,0.000403,0.002492,0.000238,51,MinMaxScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.942857,0.971429,1.0,0.911765,0.882353,0.970588,0.911765,0.970588,0.941176,0.941176,0.94437,0.033461,7
8,0.003252,0.000206,0.00262,0.00029,51,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.914286,0.971429,1.0,0.911765,0.941176,0.941176,0.970588,0.941176,0.941176,0.970588,0.950336,0.026181,6
9,0.045536,0.006625,0.002696,6.9e-05,51,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.942857,0.971429,1.0,0.911765,0.882353,0.970588,0.911765,0.970588,0.941176,0.941176,0.94437,0.033461,7


In [76]:
cv_model_grid_search = pd.DataFrame(model_grid_search.cv_results_)

In [81]:
cv_model_grid_search.loc[cv_model_grid_search["param_preprocessor"] == all_preprocessors[1]]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,0.002962,0.000347,0.00256,0.000297,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,0.941176,0.911765,0.970588,0.970588,0.941176,0.911765,0.970588,0.961765,0.032353,1
6,0.002666,0.000257,0.002503,0.000309,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.971429,0.971429,1.0,0.911765,0.911765,0.970588,0.941176,0.970588,0.941176,0.970588,0.95605,0.027209,4
11,0.002521,7e-05,0.00253,0.000115,101,StandardScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.914286,0.971429,0.970588,0.911765,0.882353,0.911765,0.882353,0.911765,0.882353,0.941176,0.917983,0.03178,9


#### > Using a StandardScaler, the best ranked model is NOT substantially better than any other

#### > Using any preprocessor indeed HAS better ranking than not using 

In [91]:
cv_model_grid_search.loc[
    (
        cv_model_grid_search["param_preprocessor"] == all_preprocessors[1] # StdSclr
    )
        &
    (
        (
        cv_model_grid_search["param_classifier__n_neighbors"] == 5
        )
        |
        (
        cv_model_grid_search["param_classifier__n_neighbors"] == 51
        )
    )
]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,0.002962,0.000347,0.00256,0.000297,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,0.941176,0.911765,0.970588,0.970588,0.941176,0.911765,0.970588,0.961765,0.032353,1
6,0.002666,0.000257,0.002503,0.000309,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.971429,0.971429,1.0,0.911765,0.911765,0.970588,0.941176,0.970588,0.941176,0.970588,0.95605,0.027209,4


#### > 5 + StdScaler is NOT subs. better than 51 + StdScaler

In [92]:
cv_model_grid_search.loc[
    (
        cv_model_grid_search["param_preprocessor"] == all_preprocessors[1] # StdSclr
    )
        &
    (
        (
        cv_model_grid_search["param_classifier__n_neighbors"] == 51
        )
        |
        (
        cv_model_grid_search["param_classifier__n_neighbors"] == 101
        )
    )
]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
6,0.002666,0.000257,0.002503,0.000309,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.971429,0.971429,1.0,0.911765,0.911765,0.970588,0.941176,0.970588,0.941176,0.970588,0.95605,0.027209,4
11,0.002521,7e-05,0.00253,0.000115,101,StandardScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.914286,0.971429,0.970588,0.911765,0.882353,0.911765,0.882353,0.911765,0.882353,0.941176,0.917983,0.03178,9


#### > 51 + StdScaler indeed IS subs. better than 101 + StdScaler

# Q6

In [93]:
from sklearn.model_selection import cross_validate

cv_results_nested = cross_validate(
    model_grid_search,
    data,
    target,
    cv=10,
    return_estimator=True,
    scoring="balanced_accuracy"
)

In [94]:
cv_results_nested['test_score'].mean()

np.float64(0.9370940170940172)

# Q7

In [104]:
estimator = cv_results_nested['estimator']

In [110]:
for cv_fold, estimator_in_fold in enumerate(estimator):
    print(
        f"Best hyperparameters for fold #{cv_fold + 1}:\n"
        f"{estimator_in_fold.best_params_}"
    )

Best hyperparameters for fold #1:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #2:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #3:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #4:
{'classifier__n_neighbors': 51, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #5:
{'classifier__n_neighbors': 51, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #6:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #7:
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
Best hyperparameters for fold #8:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #9:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #10:
{'classifier__