In [1]:
import itertools

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate, StratifiedKFold

from tools.CarlosJimenez.distance import ManhattanDistance, EuclideanDistance
from tools.CarlosJimenez.voting import (
    MajorityClassVote,
    InverseDistanceWeightedVote,
    ShepardsWorkVote,
)
from tools.CarlosJimenez.knn import KNNClassifier

## Create a synthetic dataset

This synthetic dataset lets us sanity-check that the KNN classifier works, without having to preprocess any real data first.

In [2]:
# Synthetic binary classification problem: 100 samples and 6 features
# (2 informative, 3 redundant, 0 repeated), with a fixed seed for reproducibility.
dataset_options = dict(
    n_samples=100,
    n_features=6,
    n_informative=2,
    n_redundant=3,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=29,
)
X, y = make_classification(**dataset_options)

# Ten stratified folds; the same splitter object is passed to cross_validate below.
cv = StratifiedKFold(n_splits=10)

# Baseline configuration: k=3, Manhattan distance, Shepard's vote, and a
# weight of 1.0 for each of the X.shape[1] features.
uniform_weights = np.ones(X.shape[1])
knn = KNNClassifier(
    k=3,
    distance_func=ManhattanDistance(),
    voting_func=ShepardsWorkVote(),
    weights=uniform_weights,
)

# Per-fold accuracy is returned under the "test_score" key.
results = cross_validate(knn, X, y, cv=cv, scoring="accuracy")
scores = results["test_score"]

# Report the mean and spread of accuracy across the folds.
print(f"Mean: {scores.mean()}")
print(f"Std: {scores.std()}")

Mean: 0.8400000000000001
Std: 0.11135528725660045


## Test Harness

This test harness allows us to test the KNN classifier with varying values for all hyperparameters with a simple grid search.

The `itertools.product` function is used to generate all combinations of the hyperparameters. These hyperparameters and the results are stored in a pandas DataFrame for easy viewing and analysis.

In [3]:
# Hyperparameter grid. Every combination of the four lists below is evaluated.
k_values = [1, 3, 5, 7]
distance_funcs = [ManhattanDistance(), EuclideanDistance()]
voting_funcs = [MajorityClassVote(), InverseDistanceWeightedVote(), ShepardsWorkVote()]
# Two weight vectors of length n_features: all ones, and a ramp from 0 to 1.
# NOTE(review): presumably these are per-feature weights consumed by the
# distance computation inside KNNClassifier -- confirm against its implementation.
weights_lists = [np.ones(X.shape[1]), np.linspace(0, 1, X.shape[1])]

result_columns = [
    "k",
    "distance_func",
    "voting_func",
    "weights",
    "accuracy_mean",
    "accuracy_std",
]

# Collect one record per configuration and build the DataFrame once at the end,
# instead of enlarging an empty frame row by row with `.loc` (which reallocates
# on every iteration and gives no control over column dtypes).
records = []
for k, distance_func, voting_func, weights in itertools.product(
    k_values, distance_funcs, voting_funcs, weights_lists
):
    knn = KNNClassifier(
        k=k,
        distance_func=distance_func,
        voting_func=voting_func,
        weights=weights,
    )
    # Same folds (`cv`) for every configuration so the scores are comparable.
    cv_results = cross_validate(knn, X, y, cv=cv, scoring="accuracy")
    scores = cv_results["test_score"]
    records.append(
        {
            "k": k,
            # Store the class names rather than the instances for readable output.
            "distance_func": type(distance_func).__name__,
            "voting_func": type(voting_func).__name__,
            "weights": weights,
            "accuracy_mean": scores.mean(),
            "accuracy_std": scores.std(),
        }
    )

# `columns=` fixes the column order (and keeps the schema even with no records).
results = pd.DataFrame(records, columns=result_columns)

# Best configurations first; as the last expression this is the cell's displayed output.
results.sort_values(by="accuracy_mean", ascending=False)

Unnamed: 0,k,distance_func,voting_func,weights,accuracy_mean,accuracy_std
0,1,ManhattanDistance,MajorityClassVote,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.88,0.087178
30,5,EuclideanDistance,MajorityClassVote,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.88,0.09798
2,1,ManhattanDistance,InverseDistanceWeightedVote,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.88,0.087178
4,1,ManhattanDistance,ShepardsWorkVote,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.88,0.087178
42,7,EuclideanDistance,MajorityClassVote,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.87,0.100499
36,7,ManhattanDistance,MajorityClassVote,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.87,0.09
24,5,ManhattanDistance,MajorityClassVote,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.86,0.10198
12,3,ManhattanDistance,MajorityClassVote,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.86,0.10198
6,1,EuclideanDistance,MajorityClassVote,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.85,0.111803
10,1,EuclideanDistance,ShepardsWorkVote,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.85,0.111803
