In [1]:
import numpy as np
import pandas as pd

In [2]:
# Relative path of the iris data file that is read with pd.read_csv below.
data_path = 'data/iris.data'

### Dataset Description

- Number of features for each data-point: 4
- Number of classes: 3 (although we still need to cluster the data points to identify what works best)

### Read data from file into X, y

In [3]:
# Column names are supplied explicitly, so the first line of the file is
# parsed as a data row rather than as a header.
iris = pd.read_csv(
    data_path,
    sep=',',
    names=[
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
        'type',
    ],
)

In [4]:
# Display the loaded frame to eyeball the parsed columns and rows.
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [5]:
# Encode the species names as integer class ids (0, 1, 2) and store them in a
# real integer column. Writing ints into the string column through .loc keeps
# the column as a generic object column (and is rejected outright by stricter
# string dtypes), which would make `y` an object array downstream.
# Values that are not species names (e.g. already-encoded ids when this cell is
# re-run) pass through unchanged, so the cell stays idempotent; anything that
# is neither a known name nor an integer makes astype(int) fail loudly.
species_to_code = {
    name: code
    for code, name in enumerate(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
}
iris['type'] = (
    iris['type']
    .map(lambda label: species_to_code.get(label, label))
    .astype(int)
)

In [13]:
# Feature matrix: the four measurement columns as a NumPy array.
X = iris.loc[
    :, ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
].to_numpy()

In [15]:
# Sanity check: (number of rows, number of feature columns).
X.shape

(150, 4)

In [16]:
# Labels selected with a one-element column list, so the result stays 2-D
# (one column) rather than a flat vector.
y = iris.loc[:, ['type']].to_numpy()

In [17]:
# Sanity check: (number of rows, 1) because the labels were selected as a one-column frame.
y.shape

(150, 1)

### k-Means Clustering

In [24]:
# Scratch: draw a 1x4 array of uniform random floats in [0, 1).
np.random.rand(1, 4)

array([[0.52812718, 0.97123369, 0.20407977, 0.32092227]])

In [40]:
# Scratch: two random 4-element integer vectors standing in for a centroid
# before and after an update, and the mean of their signed element-wise
# differences. Every "previous" value (1..8) is below every "new" value
# (10..18), so the mean is always negative here.
prev_centroids = np.random.randint(1, 9, size=4)
new_centroids = np.random.randint(10, 19, size=4)
(prev_centroids - new_centroids).mean()

-7.5

In [34]:
class KMeans:
    """Plain k-means clustering over the rows of a 2-D array.

    Parameters
    ----------
    k : int, default 3
        Number of clusters to form.
    tolerance : float, default 0.001
        Convergence threshold. Fitting stops once no centroid moves farther
        than this (Euclidean distance) between two consecutive iterations.

    Attributes
    ----------
    centroids : dict[int, np.ndarray]
        Set by ``fit``; maps each cluster id (0..k-1) to its centroid.
    clusters : dict[int, list[np.ndarray]]
        Set by ``fit``; maps each cluster id to the data points assigned to it
        during the last iteration.
    """

    def __init__(self, k=3, tolerance=0.001):
        self.k = k
        self.tolerance = tolerance

    def fit(self, X, max_iters=300):
        """Cluster the rows of ``X`` and store the result on the instance.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data points, one per row. Converted to a float array.
        max_iters : int, default 300
            Upper bound on assignment/update rounds, so fitting always
            terminates even if the centroids never settle within ``tolerance``.

        Returns
        -------
        KMeans
            The fitted instance (``centroids`` and ``clusters`` are set).
        """
        X = np.asarray(X, dtype=float)
        centroids = self.init_centroids(X)
        clusters = {i: [] for i in range(self.k)}

        for _ in range(max_iters):
            # Start every round from empty clusters; otherwise points from
            # earlier rounds pile up and skew the new centroids.
            clusters = {i: [] for i in range(self.k)}

            # Assign each point to the cluster whose centroid is nearest.
            for data_point in X:
                distances = [
                    np.linalg.norm(data_point - centroids[i])
                    for i in range(self.k)
                ]
                clusters[int(np.argmin(distances))].append(data_point)

            # Move the centroids to the mean of their clusters and stop once
            # they have (almost) stopped moving.
            prev_centroids = centroids
            centroids = self.get_centroids(clusters, prev_centroids)
            if self.is_trained(prev_centroids, centroids):
                break

        self.centroids = centroids
        self.clusters = clusters
        return self

    def init_centroids(self, X):
        """Pick ``k`` distinct rows of ``X`` at random as the starting centroids.

        Returns a dict mapping cluster id (0..k-1) to the chosen row.
        ``np.random.choice`` raises ``ValueError`` when ``k`` exceeds the
        number of rows, because rows are drawn without replacement.
        """
        centroid_ids = np.random.choice(X.shape[0], self.k, replace=False)
        # enumerate supplies the cluster id; the drawn value is the row index.
        return {i: X[row] for i, row in enumerate(centroid_ids)}

    def get_centroids(self, clusters, prev_centroids=None):
        """Return the mean point of every cluster.

        Parameters
        ----------
        clusters : dict[int, list[np.ndarray]]
            Points assigned to each cluster id.
        prev_centroids : dict[int, np.ndarray], optional
            Centroids from the previous round. A cluster that received no
            points keeps its previous centroid instead of becoming NaN.

        Raises
        ------
        ValueError
            If a cluster is empty and ``prev_centroids`` was not given.
        """
        centroids = dict()
        for i, points in clusters.items():
            if len(points) > 0:
                centroids[i] = np.mean(points, axis=0)
            elif prev_centroids is not None:
                centroids[i] = prev_centroids[i]
            else:
                raise ValueError(
                    f"cluster {i} is empty and no previous centroid was given"
                )
        return centroids

    def is_trained(self, prev_centroids, new_centroids):
        """Return True when no centroid moved farther than ``tolerance``.

        Movement is the Euclidean distance between the old and new centroid.
        A mean of signed coordinate differences is not used, because moves in
        opposite directions would cancel and report convergence too early.
        """
        shifts = [
            np.linalg.norm(np.subtract(new_centroids[i], prev_centroids[i]))
            for i in prev_centroids
        ]
        return bool(max(shifts) <= self.tolerance)