Apply the $k$ nearest neighbors classifier to the "iris data."
Choose $k$ by cross-validation.

The data may be found at
https://archive.ics.uci.edu/dataset/53/iris

In [82]:
import pandas as pd

from sklearn.metrics import zero_one_loss
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

# Download the iris data set

In [88]:
# Load the iris table from disk into a pandas data frame
df = pd.read_csv('../data/iris.dat')

# Covariates X: the sepal/petal length and width columns, as a numpy array
X = df.loc[:, ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].to_numpy()
# Response Y: the 'class' column, as a numpy array
Y = df.loc[:, 'class'].to_numpy()

# Conduct the hyperparameter search

In [96]:
# The k-nearest-neighbors classifier whose number of neighbors is tuned below.
# It is created in this cell so that the search does not depend on a variable
# left over from an earlier kernel session (Restart & Run All safe).
model = KNeighborsClassifier()

# Candidate values of k: 1, 2, ..., 29
param_grid = [{
    'n_neighbors': range(1, 30)
}]
# The data is ordered by class, so it seems important to shuffle the data
# when performing the cross validation. The shuffle is seeded so that the
# folds, and therefore the selected k, are the same on every run.
cv_folds = StratifiedKFold(n_splits=20, shuffle=True, random_state=0)
grid_search = GridSearchCV(model, param_grid, cv=cv_folds)
grid_search.fit(X, Y)

# One row per candidate k; mean_test_score is the score averaged over the
# folds (one minus the error rate, as the message below says).
cv_results = pd.DataFrame(grid_search.cv_results_)
print("Here are the cross-validation estimates of one minus the true error rate for each parameter:")
print(
    cv_results.sort_values(by="rank_test_score")
    [["param_n_neighbors","mean_test_score", "std_test_score"]])
print(
    "\nThe best parameter among these is "
    f"k = {grid_search.best_params_['n_neighbors']}\n"
    "Its cross-validation estimate of the true error rate is "
    f"{1 - grid_search.best_score_:.3}"
)
# Estimator refit on the full data with the best k (GridSearchCV's default refit)
best_model = grid_search.best_estimator_

Here are the cross-validation estimates of one minus the true error rate for each parameter:
    param_n_neighbors  mean_test_score  std_test_score
14                 15         0.979464        0.048993
20                 21         0.979464        0.048993
18                 19         0.979464        0.048993
19                 20         0.979464        0.048993
17                 18         0.973214        0.053720
10                 11         0.973214        0.053720
9                  10         0.973214        0.053720
12                 13         0.973214        0.053720
23                 24         0.973214        0.053720
24                 25         0.973214        0.053720
16                 17         0.973214        0.053720
6                   7         0.972321        0.055465
15                 16         0.966964        0.057386
3                   4         0.966964        0.057386
11                 12         0.966964        0.057386
13                 14      

# Bonus: working on the spam data

In [98]:
# Load the spam table: space-separated values with no header row
df = pd.read_csv('../data/spam.dat', sep=' ', header=None)

# The covariates are every column except the last (all 57 of them),
# gathered into a numpy array X
X = df.iloc[:, :-1].to_numpy()

# The response variable Y is the last column, also as a numpy array
Y = df.iloc[:, -1].to_numpy()

In [99]:
# The k-nearest-neighbors classifier whose number of neighbors is tuned below.
# It is created in this cell so that the search does not depend on a variable
# left over from an earlier kernel session (Restart & Run All safe).
model = KNeighborsClassifier()

# Candidate values of k: 1, 2, ..., 29
param_grid = [{
    'n_neighbors': range(1, 30)
}]
# Shuffle the rows before splitting into stratified folds.
# NOTE(review): the original comment here said the data is ordered by class;
# that was carried over from the iris cell -- confirm it also holds for spam.dat.
# The shuffle is seeded so that the folds, and therefore the selected k,
# are the same on every run.
cv_folds = StratifiedKFold(n_splits=20, shuffle=True, random_state=0)
grid_search = GridSearchCV(model, param_grid, cv=cv_folds)
grid_search.fit(X, Y)

# One row per candidate k; mean_test_score is the score averaged over the
# folds (one minus the error rate, as the message below says).
cv_results = pd.DataFrame(grid_search.cv_results_)
print("Here are the cross-validation estimates of one minus the true error rate for each parameter:")
print(
    cv_results.sort_values(by="rank_test_score")
    [["param_n_neighbors","mean_test_score", "std_test_score"]])
print(
    "\nThe best parameter among these is "
    f"k = {grid_search.best_params_['n_neighbors']}\n"
    "Its cross-validation estimate of the true error rate is "
    f"{1 - grid_search.best_score_:.3}"
)
# Estimator refit on the full data with the best k (GridSearchCV's default refit)
best_model = grid_search.best_estimator_

Here are the cross-validation estimates of one minus the true error rate for each parameter:
    param_n_neighbors  mean_test_score  std_test_score
0                   1         0.829596        0.020874
2                   3         0.813722        0.027466
4                   5         0.810901        0.022634
3                   4         0.806121        0.019432
1                   2         0.805256        0.019066
6                   7         0.802646        0.019412
5                   6         0.797431        0.021444
7                   8         0.796126        0.023607
9                  10         0.795907        0.024390
10                 11         0.793518        0.024538
8                   9         0.793086        0.022714
11                 12         0.791347        0.022365
12                 13         0.791126        0.026186
14                 15         0.789827        0.027311
13                 14         0.787646        0.025531
15                 16      