In [74]:
import numpy as np
from sklearn import datasets, utils, neighbors

# Seed NumPy's global random generator so the shuffled split below is reproducible.
np.random.seed(seed=42)

In [75]:
# Load the Iris dataset bundled with scikit-learn.
iris: utils.Bunch = datasets.load_iris()

# Feature matrix and the integer class label of each sample.
x, y = iris.data, iris.target

# Label names (indexed by the integer targets) and the dataset's description text.
class_names, description = iris.target_names, iris.DESCR

In [76]:
# Preview the first `example_size` samples: raw features, integer labels and
# the label names they map to, followed by the dataset's description text.
example_size = 24
print(f"x: {x[:example_size]}")
print(f"y: {y[:example_size]}")
print(f"classes: {class_names[y[:example_size]]}")
print(f"description: {description}")

x: [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]]
y: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
classes: ['setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa']
descripion: .. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Infor

In [77]:
# Report the dimensions of the feature matrix and the label vector, one per line.
print(f"x shape: {x.shape}", f"y shape: {y.shape}", sep="\n")

x shape: (150, 4)
y shape: (150,)


### Dataset Split

In [78]:
# Dataset dimensions: rows of `x` are samples, columns are features.
num_samples = x.shape[0]
num_feats = x.shape[1]
# Count the distinct labels; `y.shape[0]` would be the number of samples, not classes.
num_classes = np.unique(y).size

In [79]:
# Hold out one third of the samples for testing and keep only the first two feature columns.
test_size = num_samples // 3
filtered_data = x[:, :2]

# Shuffle the sample indices once; the last `test_size` of them form the test split.
random_data_points = np.random.permutation(num_samples)
train_idx = random_data_points[:-test_size]
test_idx = random_data_points[-test_size:]

training_data_points, training_classes = filtered_data[train_idx], y[train_idx]
test_data_points, test_classes = filtered_data[test_idx], y[test_idx]

print(f"training data shape: {training_data_points.shape}")
print(f"training classes shape: {training_classes.shape}")
print(f"test data shape: {test_data_points.shape}")
print(f"test classes shape: {test_classes.shape}")

training data shape: (100, 2)
training classes shape: (100,)
test data shape: (50, 2)
test classes shape: (50,)


### KNN Model

In [80]:
from sklearn import neighbors

In [81]:
# Baseline k-NN model: 4 neighbours, distance-based weights, KD-tree neighbour search.
classifier = neighbors.KNeighborsClassifier(
    n_neighbors=4,
    weights="distance",
    algorithm="kd_tree",
    leaf_size=32,
)

In [82]:
# Fit the baseline classifier on the training split.
classifier.fit(X=training_data_points, y=training_classes)

In [83]:
# Predicted labels for the held-out samples.
test_pred = classifier.predict(X=test_data_points)
print(f"test pred: {test_pred}")

# Score of the baseline classifier on the test split, printed as a percentage.
accuracy = classifier.score(X=test_data_points, y=test_classes)
print(f"accuracy: {accuracy*100:.4}%")

test pred: [0 0 0 2 1 0 1 1 0 1 2 1 2 1 0 2 2 2 2 1 2 1 1 1 0 1 2 2 0 1 2 2 0 1 0 2 2
 2 1 2 1 2 1 2 0 1 1 0 1 2]
accuracy: 66.0%


### Try Different Hyperparameters

In [84]:
# Hyperparameter grid: k = 1..10 crossed with both weighting modes.
n_neighbors = list(range(1, 11))
weight_modes = ["uniform", "distance"]
for n in n_neighbors:
    for w_m in weight_modes:
        # Fit a fresh estimator for every combination instead of mutating the
        # attributes of the already-fitted `classifier`: that leaves `classifier`
        # exactly as it was configured, so the cells that use it still reproduce
        # their output when re-run after this sweep.
        candidate = neighbors.KNeighborsClassifier(
            n_neighbors=n,
            weights=w_m,
            algorithm="kd_tree",
            leaf_size=32,
        )
        candidate.fit(training_data_points, training_classes)
        # NOTE(review): candidates are compared on the test split itself, so the
        # best score is an optimistic estimate; a separate validation split or
        # cross-validation would give an unbiased selection.
        accuracy = candidate.score(test_data_points, test_classes)
        print(f"accuracy({n}) {w_m}: {accuracy*100.0:.4}%")

accuracy(1) uniform: 70.0%
accuracy(1) distance: 70.0%
accuracy(2) uniform: 58.0%
accuracy(2) distance: 68.0%
accuracy(3) uniform: 60.0%
accuracy(3) distance: 58.0%
accuracy(4) uniform: 64.0%
accuracy(4) distance: 66.0%
accuracy(5) uniform: 68.0%
accuracy(5) distance: 62.0%
accuracy(6) uniform: 68.0%
accuracy(6) distance: 62.0%
accuracy(7) uniform: 72.0%
accuracy(7) distance: 66.0%
accuracy(8) uniform: 74.0%
accuracy(8) distance: 68.0%
accuracy(9) uniform: 74.0%
accuracy(9) distance: 72.0%
accuracy(10) uniform: 64.0%
accuracy(10) distance: 70.0%


### Best KNN Model

In [85]:
# Train the configuration picked as best: 8 neighbours with uniform weights.
clf = neighbors.KNeighborsClassifier(weights="uniform", n_neighbors=8).fit(
    training_data_points, training_classes
)

# Score on the test split, printed as a percentage.
accuracy = clf.score(test_data_points, test_classes)
print(f"accuracy: {accuracy*100:.4}%")

# Predicted label for each test sample.
classes = clf.predict(test_data_points)
print(f"predicted:\n{classes}")

# Per-class probability estimates for each test sample (one row per sample).
classes_proba = clf.predict_proba(test_data_points)
print(f"probability:\n{classes_proba}")


accuracy: 74.0%
predicted:
[0 0 0 2 1 0 1 2 0 1 2 2 2 1 0 2 1 2 1 1 1 0 1 1 0 1 2 2 0 1 2 2 0 1 0 2 2
 2 1 2 2 2 1 1 0 1 0 0 1 2]
probability:
[[1.    0.    0.   ]
 [1.    0.    0.   ]
 [1.    0.    0.   ]
 [0.    0.125 0.875]
 [0.    0.5   0.5  ]
 [1.    0.    0.   ]
 [0.    0.625 0.375]
 [0.    0.375 0.625]
 [1.    0.    0.   ]
 [0.    0.5   0.5  ]
 [0.    0.125 0.875]
 [0.    0.375 0.625]
 [0.    0.375 0.625]
 [0.    0.5   0.5  ]
 [1.    0.    0.   ]
 [0.    0.375 0.625]
 [0.    0.625 0.375]
 [0.    0.375 0.625]
 [0.    0.625 0.375]
 [0.125 0.875 0.   ]
 [0.    0.625 0.375]
 [0.625 0.375 0.   ]
 [0.    0.5   0.5  ]
 [0.    0.875 0.125]
 [1.    0.    0.   ]
 [0.    0.875 0.125]
 [0.    0.125 0.875]
 [0.    0.375 0.625]
 [1.    0.    0.   ]
 [0.125 0.875 0.   ]
 [0.    0.375 0.625]
 [0.    0.125 0.875]
 [1.    0.    0.   ]
 [0.    0.5   0.5  ]
 [1.    0.    0.   ]
 [0.    0.375 0.625]
 [0.    0.125 0.875]
 [0.    0.25  0.75 ]
 [0.    0.625 0.375]
 [0.    0.375 0.625]
 [0.    0.25  0.7