In [78]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from itertools import product
from functools import partial

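First, let's prepare the data: load the iris dataset, encode the species names as integer class labels, and extract the feature matrix `X` and the label vector `Y`.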

In [79]:
# Integer class labels for the three iris species.
SPECIES_CODES = {'setosa': 1, 'versicolor': 2, 'virginica': 3}
FEATURE_COLUMNS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

ds = pd.read_csv('iris.csv')

# Encode only the species column (a frame-wide replace would also touch
# the feature columns). Series.map yields NaN for any label that is not
# in SPECIES_CODES, so fail loudly instead of carrying it forward.
species_codes = ds['species'].map(SPECIES_CODES)
unknown = ds.loc[species_codes.isna(), 'species'].unique()
if len(unknown) > 0:
    raise ValueError(f"Unexpected species labels in iris.csv: {list(unknown)}")
ds['species'] = species_codes.astype('int64')

# Feature matrix (one row per flower) and the matching label vector
X = ds[FEATURE_COLUMNS].to_numpy()
Y = ds['species'].to_numpy()

In [80]:
# Seed NumPy's global random generator so that the random permutation
# used to build the cross-validation chunks and the random tie-breaking
# in mode_rand_ties give the same results on every fresh run.
np.random.seed(42)

In [81]:
def mode_rand_ties(a):
    """
        Return the most frequent value of the array a.
        When several values share the highest count, one of
        them is drawn at random (NumPy's global generator).
    """
    # Distinct values of a, each paired with how often it occurs
    values, counts = np.unique(a, return_counts=True)

    # Every value whose count reaches the maximum is a candidate
    candidates = values[counts == counts.max()]

    # Always draw through the generator, even for a single candidate
    return np.random.choice(candidates)

In [82]:
def k_nearest_neighbors(X, Y, k):
    """
        Return the k-nearest neighbors classifier
        for X and Y.
        X: numpy array of input features, one row per training sample
        Y: numpy array of class labels, aligned with the rows of X
        k: the number of neighbors (a positive integer); a tuple is
            also accepted, in which case its first element is used
        Returns:
            f: a function mapping a single feature vector x to the
                predicted class label
        Raises:
            ValueError: if k is less than 1
    """
    if isinstance(k, tuple):
        k = k[0]

    # Reject non-positive k up front: k = 0 would leave no neighbors to
    # vote, and a negative k would make the slice below silently select
    # all but the |k| farthest rows instead of the k nearest ones.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # The classifier
    def f(x):

        # Compute the squared distance between x
        # and every row of X (x is broadcast across the rows)
        sq_dists = ((x-X)**2).sum(axis=-1)
        indices = np.argsort(sq_dists, axis=-1)

        # The rows with the k smallest distances to x
        # (if k exceeds the number of rows, every row is used)
        k_smallest = indices[:k]
        Y_vals = Y[k_smallest]

        # The classifier returns the mode of class labels
        # of the nearest neighbors, with ties broken at random
        return mode_rand_ties(Y_vals)

    return f

Below we perform cross-validation to choose $k$.

In [83]:
def create_random_splits(X, Y, num_ch):
    """
        Split the dataset (X, Y) into num_ch random chunks
        whose sizes differ by at most one row (when the number
        of rows is not divisible by num_ch, the first chunks
        each receive one extra row).
        (X, Y): the dataset; X and Y must have the same length
        num_ch: the number of chunks, between 1 and len(X)
        Returns:
            Xs, Ys: lists of num_ch arrays; Xs[i] and Ys[i] hold
                the features and labels of the same rows
        Raises:
            ValueError: if X and Y differ in length, or if num_ch
                is not between 1 and the number of rows
    """
    n = len(X)
    if len(Y) != n:
        raise ValueError(
            f"X and Y must have the same length, got {n} and {len(Y)}")

    # More chunks than rows would produce empty chunks, and an error
    # averaged over an empty chunk is undefined.
    if not 1 <= num_ch <= n:
        raise ValueError(
            f"num_ch must be between 1 and the number of rows ({n}), got {num_ch!r}")

    # Randomly permute (X, Y), using the same permutation for both so
    # that features and labels stay aligned
    indices = np.random.permutation(n)

    # array_split spreads the remainder over the leading chunks instead
    # of piling it all onto the last one
    Xs = np.array_split(X[indices], num_ch)
    Ys = np.array_split(Y[indices], num_ch)

    return Xs, Ys

In [84]:
def cross_validation(X, Y, num_ch, model_func):
    """
        Run num_ch-fold cross-validation on the dataset.
        (X, Y): the dataset
        num_ch: the number of folds
        model_func: a function which takes X, Y as input
            and outputs a classifier function
        Returns:
            a list with the misclassification rate of each fold
    """
    X_folds, Y_folds = create_random_splits(X, Y, num_ch)
    fold_errors = []

    for held_out in range(num_ch):
        # Train on every fold except the one being held out
        X_train = np.concatenate(
            [fold for j, fold in enumerate(X_folds) if j != held_out], axis=0)
        Y_train = np.concatenate(
            [fold for j, fold in enumerate(Y_folds) if j != held_out], axis=0)
        classifier = model_func(X_train, Y_train)

        # Classify the held-out rows one at a time, in order
        X_test = X_folds[held_out]
        Y_test = Y_folds[held_out]
        predictions = np.array([classifier(row) for row in X_test])

        # Fraction of held-out rows whose prediction is wrong
        misclassified = predictions != Y_test
        fold_errors.append(misclassified.mean())

    return fold_errors

In [85]:
def param_search(X, Y, num_ch, model_func, param_grid):
    """
        Cross-validate a model for every combination of parameter values.
        X: numpy array of input features
        Y: numpy array of class labels
        num_ch: number of chunks the dataset is broken into for cross-validation
        model_func: a function that returns a classifier given input features
            X, class labels Y, and possibly other (keyword) parameters
        param_grid: a dictionary mapping each parameter name to the values to test
        returns:
            a list of dictionaries, one per parameter combination, with keys
                'params' (the combination), 'errors' (the per-fold
                cross-validation errors) and 'mean_error' (their mean)
    """
    names = list(param_grid.keys())
    value_lists = list(param_grid.values())

    results = []

    # Walk the Cartesian product of all the value lists
    for combo in product(*value_lists):
        params = dict(zip(names, combo))

        # Fix the parameters so only (X, Y) remain to be supplied
        candidate = partial(model_func, **params)

        fold_errors = cross_validation(X, Y, num_ch, candidate)
        results.append({
            'params': params,
            'errors': fold_errors,
            'mean_error': np.mean(fold_errors),
        })

    return results

Here are the results of the parameter search.

In [86]:
# 5-fold cross-validation of the k-nearest neighbors classifier
# for every k from 1 to 50
out = param_search(
    X, Y,
    num_ch=5,
    model_func=k_nearest_neighbors,
    param_grid={'k': range(1, 51)},
)

In [87]:
# Report the mean cross-validation error for each k
for d in out:
    print(f"k: {d['params']['k']}")
    print(f"Mean error: {d['mean_error']:.5f}\n")

# Select the winner programmatically rather than by eye. min() keeps the
# first minimum it meets, so ties go to the earliest entry in out.
best = min(out, key=lambda d: d['mean_error'])
print(f"Best k: {best['params']['k']} (mean error {best['mean_error']:.5f})")

k: 1
Mean error: 0.04000

k: 2
Mean error: 0.03333

k: 3
Mean error: 0.03333

k: 4
Mean error: 0.02667

k: 5
Mean error: 0.04000

k: 6
Mean error: 0.03333

k: 7
Mean error: 0.02000

k: 8
Mean error: 0.02667

k: 9
Mean error: 0.04667

k: 10
Mean error: 0.03333

k: 11
Mean error: 0.02667

k: 12
Mean error: 0.03333

k: 13
Mean error: 0.04000

k: 14
Mean error: 0.02667

k: 15
Mean error: 0.02667

k: 16
Mean error: 0.04667

k: 17
Mean error: 0.03333

k: 18
Mean error: 0.04667

k: 19
Mean error: 0.05333

k: 20
Mean error: 0.02667

k: 21
Mean error: 0.04667

k: 22
Mean error: 0.06000

k: 23
Mean error: 0.04667

k: 24
Mean error: 0.07333

k: 25
Mean error: 0.06000

k: 26
Mean error: 0.05333

k: 27
Mean error: 0.05333

k: 28
Mean error: 0.05333

k: 29
Mean error: 0.04667

k: 30
Mean error: 0.04667

k: 31
Mean error: 0.06000

k: 32
Mean error: 0.06000

k: 33
Mean error: 0.04667

k: 34
Mean error: 0.06000

k: 35
Mean error: 0.04667

k: 36
Mean error: 0.06667

k: 37
Mean error: 0.07333

k: 38
Mean

Among the results shown above, $k=7$ gives the lowest mean cross-validation error, $0.02000$. Several other values, including $k=11$, follow at $0.02667$.