In [1]:
import pandas as pd

# Feature columns and label column used by the rest of the notebook.
target_name = "Species"
columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]

penguins = pd.read_csv("../datasets/penguins.csv")

# Restrict the frame to the columns of interest, then discard every row
# that has a missing value in any of them.
columns_of_interest = [*columns, target_name]
penguins_non_missing = penguins.loc[:, columns_of_interest].dropna()

# Split the cleaned frame into the feature matrix and the label vector.
data = penguins_non_missing.loc[:, columns]
target = penguins_non_missing.loc[:, target_name]

In [7]:
# Show the label column ("Species") for the rows kept after dropping missing values.
target

0            Adelie Penguin (Pygoscelis adeliae)
1            Adelie Penguin (Pygoscelis adeliae)
2            Adelie Penguin (Pygoscelis adeliae)
4            Adelie Penguin (Pygoscelis adeliae)
5            Adelie Penguin (Pygoscelis adeliae)
                         ...                    
339    Chinstrap penguin (Pygoscelis antarctica)
340    Chinstrap penguin (Pygoscelis antarctica)
341    Chinstrap penguin (Pygoscelis antarctica)
342    Chinstrap penguin (Pygoscelis antarctica)
343    Chinstrap penguin (Pygoscelis antarctica)
Name: Species, Length: 342, dtype: object

In [13]:
# Count the samples per species to see how balanced the classes are.
target.value_counts()

Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Standardize the features, then classify with k-nearest neighbours.
model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

# Parameters of a pipeline step are addressed as "<step name>__<parameter>".
# Using a real parameter here (instead of the `param_name=param_value`
# placeholder, which references an undefined name and is not a valid
# pipeline parameter) keeps the cell runnable on a fresh kernel.
model.set_params(classifier__n_neighbors=5)

# Cross-validate with cv=10 and balanced accuracy, then average the
# per-fold test scores into a single number.
cv_results = cross_validate(model, data, target, cv=10, scoring="balanced_accuracy")
cv_results["test_score"].mean()

0.9521978021978021

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Scaler followed by a 5-nearest-neighbours classifier, assembled from
# explicitly named steps.
scaling_step = ("preprocessor", StandardScaler())
knn_step = ("classifier", KNeighborsClassifier(n_neighbors=5))
model = Pipeline(steps=[scaling_step, knn_step])

# Same evaluation protocol as before, but display the individual fold
# scores rather than their mean.
cv_results = cross_validate(
    estimator=model,
    X=data,
    y=target,
    cv=10,
    scoring="balanced_accuracy",
)
cv_results["test_score"]

array([1.        , 1.        , 1.        , 0.91880342, 0.88253968,
       0.95238095, 0.97777778, 0.93015873, 0.90793651, 0.95238095])

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Pipeline with a single step: a 101-nearest-neighbours classifier and no
# preprocessing in front of it.
knn_101 = KNeighborsClassifier(n_neighbors=101)
model = Pipeline(steps=[("classifier", knn_101)])

# Per-fold balanced accuracy with cv=10.
cv_results = cross_validate(
    model, X=data, y=target, scoring="balanced_accuracy", cv=10
)
cv_results["test_score"]

array([0.61805556, 0.59316239, 0.57435897, 0.56410256, 0.58888889,
       0.64444444, 0.62222222, 0.62222222, 0.64444444, 0.66666667])

In [59]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV

# Preprocessing strategies to compare; None means the classifier receives
# the raw features.
all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    # NOTE(review): box-cox only accepts strictly positive inputs - confirm
    # this holds for every feature column before reusing on other data.
    PowerTransformer(method="box-cox")
]

# One record per (preprocessor, n_neighbors, fold) combination.
scores_mega = []
cols = ["Preprocessor", "n_neighbors", "cv_fold_no", "score"]

param_grid = {"classifier__n_neighbors": [5, 51, 101]}

for preprocessor in all_preprocessors:
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", KNeighborsClassifier()),
    ])
    model = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=10,
        scoring="balanced_accuracy").fit(data, target)

    print(preprocessor,"\n",model.cv_results_['params'],"\n",'mean_test_score',model.cv_results_['mean_test_score'])

    # Candidate order in cv_results_ follows cv_results_['params'], so the
    # n_neighbors label of each score is read from there instead of being
    # hard-coded; the number of folds likewise comes from the fitted search.
    # Changing param_grid or cv therefore cannot mislabel or drop rows.
    candidates = model.cv_results_['params']
    for i in range(model.n_splits_):
        scores = "split"+str(i)+"_test_score"
        fold_scores = model.cv_results_[scores]
        # The trailing value is the mean over the candidates for this fold.
        print(scores,fold_scores,fold_scores.mean())

        for candidate, fold_score in zip(candidates, fold_scores):
            scores_mega.append([
                str(preprocessor),
                candidate["classifier__n_neighbors"],
                i,
                fold_score,
            ])

    print("\n\n\n")

# Long-format table of every individual fold score, built once from the records.
df1 = pd.DataFrame(scores_mega, columns=cols)

None 
 [{'classifier__n_neighbors': 5}, {'classifier__n_neighbors': 51}, {'classifier__n_neighbors': 101}] 
 mean_test_score [0.73983822 0.60518162 0.61385684]
split0_test_score [0.66468254 0.61805556 0.61805556] 0.6335978835978836
split1_test_score [0.73601954 0.56752137 0.59316239] 0.6322344322344323
split2_test_score [0.74102564 0.5965812  0.57435897] 0.6373219373219374
split3_test_score [0.7042735  0.56410256 0.56410256] 0.6108262108262108
split4_test_score [0.58412698 0.53333333 0.58888889] 0.5687830687830688
split5_test_score [0.66984127 0.64444444 0.64444444] 0.652910052910053
split6_test_score [0.83492063 0.62222222 0.62222222] 0.6931216931216931
split7_test_score [0.74285714 0.62222222 0.62222222] 0.6624338624338625
split8_test_score [0.88253968 0.64444444 0.64444444] 0.7238095238095238
split9_test_score [0.83809524 0.63888889 0.66666667] 0.7145502645502645




StandardScaler() 
 [{'classifier__n_neighbors': 5}, {'classifier__n_neighbors': 51}, {'classifier__n_neighbors': 101}

In [81]:
# Inspect the last 20 rows of the per-fold score table.
df1.tail(20)

Unnamed: 0,Preprocessor,n_neighbors,cv_fold_no,score
130,PowerTransformer(method='box-cox'),51,3,0.863248
131,PowerTransformer(method='box-cox'),101,3,0.888889
132,PowerTransformer(method='box-cox'),5,4,0.88254
133,PowerTransformer(method='box-cox'),51,4,0.834921
134,PowerTransformer(method='box-cox'),101,4,0.787302
135,PowerTransformer(method='box-cox'),5,5,0.952381
136,PowerTransformer(method='box-cox'),51,5,0.952381
137,PowerTransformer(method='box-cox'),101,5,0.809524
138,PowerTransformer(method='box-cox'),5,6,0.955556
139,PowerTransformer(method='box-cox'),51,6,0.907937


In [78]:
# Summary statistics of the scores within each CV fold, restricted to the
# rows obtained with n_neighbors = 5.
is_k5 = df1["n_neighbors"].eq(5)
df1.loc[is_k5].groupby("cv_fold_no")["score"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
cv_fold_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,5.0,0.923413,0.146097,0.664683,0.952381,1.0,1.0,1.0
1,5.0,0.918584,0.105678,0.73602,0.92674,0.952381,0.977778,1.0
2,5.0,0.948205,0.115817,0.741026,1.0,1.0,1.0,1.0
3,5.0,0.869915,0.097232,0.704274,0.863248,0.918803,0.918803,0.944444
4,5.0,0.827302,0.136279,0.584127,0.88254,0.88254,0.88254,0.904762
5,5.0,0.900952,0.131687,0.669841,0.930159,0.952381,0.952381,1.0
6,5.0,0.940317,0.059957,0.834921,0.955556,0.955556,0.977778,0.977778
7,5.0,0.897143,0.086783,0.742857,0.930159,0.930159,0.930159,0.952381
8,5.0,0.902857,0.011358,0.88254,0.907937,0.907937,0.907937,0.907937
9,5.0,0.939048,0.060083,0.838095,0.952381,0.952381,0.952381,1.0


In [79]:
# Summary statistics of the scores within each CV fold, restricted to the
# rows obtained with n_neighbors = 51.
rows_k51 = df1.loc[df1["n_neighbors"] == 51]
rows_k51.groupby("cv_fold_no")["score"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
cv_fold_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,5.0,0.847421,0.132567,0.618056,0.857143,0.904762,0.904762,0.952381
1,5.0,0.885568,0.178246,0.567521,0.952381,0.952381,0.977778,0.977778
2,5.0,0.919316,0.180414,0.596581,1.0,1.0,1.0,1.0
3,5.0,0.803419,0.133782,0.564103,0.863248,0.863248,0.863248,0.863248
4,5.0,0.798095,0.151096,0.533333,0.834921,0.834921,0.88254,0.904762
5,5.0,0.88127,0.133986,0.644444,0.904762,0.952381,0.952381,0.952381
6,5.0,0.874286,0.144149,0.622222,0.907937,0.907937,0.955556,0.977778
7,5.0,0.881905,0.145486,0.622222,0.930159,0.952381,0.952381,0.952381
8,5.0,0.873016,0.127775,0.644444,0.930159,0.930159,0.930159,0.930159
9,5.0,0.870635,0.13172,0.638889,0.904762,0.904762,0.952381,0.952381


In [80]:
# Summary statistics of the scores within each CV fold, restricted to the
# rows obtained with n_neighbors = 101.
(
    df1
    .loc[lambda frame: frame["n_neighbors"] == 101]
    .groupby("cv_fold_no")["score"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
cv_fold_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,5.0,0.818849,0.114125,0.618056,0.857143,0.857143,0.857143,0.904762
1,5.0,0.832918,0.139722,0.593162,0.857143,0.857143,0.904762,0.952381
2,5.0,0.825983,0.156254,0.574359,0.777778,0.888889,0.944444,0.944444
3,5.0,0.808547,0.137099,0.564103,0.863248,0.863248,0.863248,0.888889
4,5.0,0.762222,0.101553,0.588889,0.765079,0.787302,0.834921,0.834921
5,5.0,0.79619,0.087903,0.644444,0.809524,0.812698,0.857143,0.857143
6,5.0,0.76,0.082802,0.622222,0.765079,0.765079,0.812698,0.834921
7,5.0,0.825397,0.116405,0.622222,0.834921,0.88254,0.88254,0.904762
8,5.0,0.777778,0.085184,0.644444,0.739683,0.834921,0.834921,0.834921
9,5.0,0.838095,0.098745,0.666667,0.857143,0.857143,0.904762,0.904762


In [None]:
# "Preprocessor" holds string labels, so comparing it with the integer 101
# never matches and the summary would be empty. 101 is an n_neighbors value:
# filter on that column and group by preprocessor to compare the
# preprocessing strategies at that setting.
df1[df1['n_neighbors'] == 101].groupby('Preprocessor')['score'].describe()