In [7]:
import numpy as np
from random import shuffle
from sklearn.datasets import load_iris

In [8]:
def accuracy(preds, targets):
    """Return the fraction of predictions that equal their targets.

    Args:
        preds: Array-like of predicted labels.
        targets: Array-like of reference labels with the same shape as ``preds``.

    Returns:
        The share of matching positions as a Python ``float`` in ``[0, 1]``
        (``nan`` when both inputs are empty).

    Raises:
        ValueError: If ``preds`` and ``targets`` have different shapes.
    """
    # Coerce first: comparing two plain lists yields a single bool, not an
    # element-wise mask, so `.mean()` would not exist.
    preds = np.asarray(preds)
    targets = np.asarray(targets)
    # Refuse mismatched shapes: broadcasting (n, 1) against (n,) would compare
    # every prediction with every target and silently return a wrong score.
    if preds.shape != targets.shape:
        raise ValueError(
            f"shape mismatch: preds {preds.shape} vs targets {targets.shape}"
        )
    return (preds == targets).mean().item()

In [9]:
class KMeansClassifier:
    """Nearest-centroid classifier built on top of k-means clustering.

    ``fit`` clusters the samples with Lloyd's algorithm (seeded k-means++
    style from NumPy's global RNG) and labels each cluster with the most
    frequent target among its members. ``predict`` returns the label of the
    centroid closest to a sample.

    Args:
        k: Number of clusters; must be at least 1.
        threshold: Training stops once the mean squared distance to the
            nearest centroid improves by less than this between iterations.
        max_iter: Upper bound on the number of centroid updates.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """

    def __init__(self, k: int, threshold: float = 1e-5, max_iter: int = 300):
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k
        self.threshold = threshold
        self.max_iter = max_iter

    @staticmethod
    def _squared_distances(X: np.ndarray, centroids: np.ndarray) -> np.ndarray:
        """Return squared Euclidean distances, shape (n_centroids, n_samples)."""
        return ((X[None, :, :] - centroids[:, None, :]) ** 2).sum(axis=-1)

    def _init_centroids(self, X: np.ndarray) -> np.ndarray:
        """Pick ``k`` starting centroids from the rows of ``X``.

        The first is uniform at random; each further one is drawn with
        probability proportional to its squared distance from the nearest
        centroid chosen so far.
        """
        centroids = np.empty((self.k, X.shape[1]))
        centroids[0] = X[np.random.randint(len(X))]
        for i in range(1, self.k):
            nearest = self._squared_distances(X, centroids[:i]).min(axis=0)
            total = nearest.sum()
            if total > 0:
                pick = np.random.choice(len(X), p=nearest / total)
            else:
                # Every sample coincides with a chosen centroid, so the
                # weights would be 0/0; fall back to a uniform draw.
                pick = np.random.choice(len(X))
            centroids[i] = X[pick]
        return centroids

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Cluster ``X`` and assign a class label to every cluster.

        Sets ``self.centroids`` (array of shape ``(k, n_features)``) and
        ``self.classes`` (list of ``k`` labels, one per centroid).

        Args:
            X: 2-D array of samples, one row per sample.
            y: 1-D array of non-negative integer labels, one per row of ``X``
                (required by ``np.bincount``).

        Raises:
            ValueError: If ``X`` is not a non-empty 2-D array or ``y`` has a
                different number of entries than ``X`` has rows.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2 or len(X) == 0:
            raise ValueError("X must be a non-empty 2-D array")
        if len(y) != len(X):
            raise ValueError("X and y must contain the same number of samples")

        self.centroids = self._init_centroids(X)

        distance = self._squared_distances(X, self.centroids)
        clusters = distance.argmin(axis=0)
        prev_error = distance.min(axis=0).mean()

        for _ in range(self.max_iter):
            for i in range(self.k):
                members = X[clusters == i]
                # An empty cluster keeps its previous centroid; averaging an
                # empty slice would turn it into NaN.
                if len(members):
                    self.centroids[i] = members.mean(axis=0)

            distance = self._squared_distances(X, self.centroids)
            clusters = distance.argmin(axis=0)
            error = distance.min(axis=0).mean()

            # The error never increases, so convergence is measured as the
            # (non-negative) improvement over the previous iteration.
            improvement = prev_error - error
            prev_error = error
            if improvement < self.threshold:
                break

        # Clusters that ended up empty fall back to the overall majority label.
        fallback = np.bincount(y).argmax()
        self.classes = []
        for i in range(self.k):
            member_labels = y[clusters == i]
            if len(member_labels):
                self.classes.append(np.bincount(member_labels).argmax())
            else:
                self.classes.append(fallback)

    def predict(self, x: np.ndarray) -> int:
        """Return the class label of the centroid nearest to one sample ``x``."""
        sample = np.asarray(x).reshape(1, -1)
        nearest = ((self.centroids - sample) ** 2).sum(axis=1).argmin()
        return self.classes[nearest]

In [10]:
# Load the iris features and targets as NumPy arrays.
data = load_iris(as_frame=True)
X = data["data"].values
y = data["target"].values

# Shuffle with a fixed seed so the train/validation split (and every metric
# computed from it) is identical after Restart & Run All.
SPLIT_SEED = 42
idx = np.random.default_rng(SPLIT_SEED).permutation(len(X))
X = X[idx]
y = y[idx]

# Hold out the first 10% of the shuffled rows for validation.
val_size = int(len(X)*0.1)
train_X, val_X = X[val_size:], X[:val_size]
train_y, val_y = y[val_size:], y[:val_size]

In [11]:
# KMeansClassifier.fit draws its initial centroids from NumPy's global RNG;
# seed it so the fitted model and the scores below are reproducible.
np.random.seed(0)

classifier = KMeansClassifier(3)
classifier.fit(train_X, train_y)

# Accuracy on the samples the model was fitted on.
preds = np.array([classifier.predict(x) for x in train_X])
accuracy(preds, train_y)

0.8740740740740741

In [12]:
# Accuracy on the held-out validation samples, predicted one row at a time.
preds = np.array(list(map(classifier.predict, val_X)))
accuracy(preds, val_y)

0.9333333333333333