## Lloyd's algorithm implemantaion 

In [56]:

import matplotlib.pyplot as plt
import numpy as np
import time
import pandas as pd 
from sklearn.metrics import normalized_mutual_info_score




def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

def distance_norm(x1,x2,axis = 0):
    return np.linalg.norm(x1 - x2, axis=axis)


class KMeans:
    def __init__(self, K=153, max_iterations=5):
        self.K = K
        self.max_iterations = max_iterations

        # initialize a list of clusters
        self.clusters = [[] for _ in range(self.K)]
        # initialize an empty list of centroids
        self.centroids = []

    def predict(self, X):
        self.X = X
        self.n_samples, self.n_features = X.shape

        # initialize random centroids 
        random_indexs = np.random.choice(self.n_samples, self.K, replace=False)
        self.centroids = [self.X[idx] for idx in random_indexs]

        # updating the clusters
        count = 0
        start_time = time.time()
        for i in range(self.max_iterations):
            # assign the points to the closest cluster 
            self.clusters = self.initialize_clusters(self.centroids)
            # saving sa copy of the centroids in old centroids to compare it for convergence
            old_centroids = self.centroids
            # updating the centroids with the mean 
            self.centroids = self.reevaluate_centers(self.clusters)
            
            # kepping count of the iterations for convergence
            count+=1
            # checking for convergence 
            if self.check_convergence(old_centroids, self.centroids):
                print("number of iteration to converge " +str(count))
                print("--- %s seconds to convergence---" % (time.time() - start_time))
                break

        print("number of iteration: " +str(count))
        print("--- %s seconds over 5 iterations---" % (time.time() - start_time))

        # returning the clusters and the clusters labels 
        return  self.clusters , self.clusters_labels(self.clusters) , count


    ################## helper functions ######################

    def clusters_labels(self, clusters):
        # assigning labels of the clusters to the samples assign to it 
        labels = np.empty(self.n_samples)

        for cluster_idx, cluster in enumerate(clusters):
            for sample_index in cluster:
                labels[sample_index] = cluster_idx
        return labels

    def initialize_clusters(self, centroids):
        # assigning the points to the closest centroids 
        clusters = [[] for _ in range(self.K)]
        for idx, sample in enumerate(self.X):
            centroid_idx = self.closest_centroid(sample, centroids)
            clusters[centroid_idx].append(idx)
        return clusters

    def closest_centroid(self, sample, centroids):
        # calculting the distance of the current point to each cluster centroid
        distances = [euclidean_distance(sample, point) for point in centroids]
        closest_index = np.argmin(distances)
        return closest_index

    def reevaluate_centers(self, clusters):
        # updating the centroids of clusters with mean value 
        centroids = np.zeros((self.K, self.n_features))
        for cluster_idx, cluster in enumerate(clusters):
            cluster_mean = np.mean(self.X[cluster], axis=0)
            centroids[cluster_idx] = cluster_mean
        return centroids

    def check_convergence(self, old_centroids, centroids):
        # calculating the distance between old centroids and new centroids 
        distances = [euclidean_distance(old_centroids[i], centroids[i]) for i in range(self.K)]
        return sum(distances) == 0

    

In [57]:
# loading the data set 
data = pd.read_csv('bio_train.csv', header=None)
# accoriding to the dataset documentation Columns 4-77 are the input features  
X = data.values[:,3:]
# loading the block id column to evalutate the clustering results 
Block_ID = data.values[:,0]
# number of clusters 
K = 153
# number of max iterations 
max_iterations = 5

########### resutls #############
kmeans = KMeans(K=K, max_iterations=max_iterations)
clusters , predicted_labels , count = kmeans.predict(X)
accuracy = normalized_mutual_info_score(Block_ID, predicted_labels ,average_method="arithmetic") / count
print(predicted_labels)
print()
print("NMI over " +str(count)+" iterations:"+str(accuracy))


number of iteration: 5
--- 1057.2904615402222 seconds over 5 iterations---
[ 71. 115. 149. ... 143.  88.  34.]

NMI over 5 iterations:0.031049369364492797


In [58]:
accuracy = normalized_mutual_info_score(Block_ID, predicted_labels ,average_method="arithmetic") 
print(accuracy)


0.15524684682246398


## sklearn.cluster KMeans implementation 

In [60]:
from sklearn.cluster import KMeans
import numpy as np

data = pd.read_csv('bio_train.csv', header=None)
# accoriding to the dataset documentation Columns 4-77 are the input features  
X = data.values[:,1:]
# loading the block id column to evalutate the clustering results 
Block_ID = data.values[:,0]
# number of clusters 
K = 153
# number of max iterations 
max_iterations = 5
kmeans = KMeans(n_clusters=K, random_state=0).fit(X)
accuracy = normalized_mutual_info_score(Block_ID, kmeans.labels_ ,average_method="arithmetic") 
print(accuracy)


0.8517726266820533
