In [None]:
import numpy as np
import pandas as pd
import random as rd

In [None]:
##################### KMeans with initialization KMeans++   ###########################
def L2_dist(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The element-wise difference is squared, summed over all elements and
    square-rooted, so the operands only need to support subtraction with
    each other (NumPy broadcasting applies).
    """
    delta = x1 - x2
    sum_of_squares = np.sum(delta ** 2)
    return np.sqrt(sum_of_squares)

class KMeans:
    """K-means clustering seeded with the k-means++ scheme.

    Parameters
    ----------
    K : int
        Number of clusters.
    max_iters : int
        Upper bound on the number of assignment/update rounds ``predict``
        runs.

    Attributes
    ----------
    clusters : list of list of int
        For each cluster, the row indices of the samples assigned to it.
    centroids : numpy.ndarray of shape (K, n_features)
        Current cluster centers (an empty list before ``predict`` is called).
    X, n_samples, n_features
        The data passed to ``predict`` (as a float array) and its shape.
    """

    def __init__(self, K=3, max_iters=100):
        self.K = K
        self.max_iters = max_iters

        # list of sample indices for each cluster
        self.clusters = [[] for _ in range(self.K)]

        # the centers (mean vector) for each cluster
        self.centroids = []

    def predict(self, X):
        """Cluster the rows of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Anything ``numpy.asarray`` can turn into a 2-D float array
            (ndarray, nested list, numeric DataFrame, ...).

        Returns
        -------
        centroids : numpy.ndarray of shape (K, n_features)
            Centers after the last update step.
        labels : numpy.ndarray of shape (n_samples,)
            Float array holding the cluster index assigned to each sample in
            the last assignment step.

        Raises
        ------
        ValueError
            If ``X`` is not 2-dimensional, or ``K`` is not in
            ``1..n_samples``.
        """
        # Normalise the input once so that row iteration, fancy indexing and
        # ``.shape`` behave the same for every supported container type.
        self.X = np.asarray(X, dtype=float)
        if self.X.ndim != 2:
            raise ValueError(
                "X must be 2-dimensional (n_samples, n_features); "
                f"got {self.X.ndim} dimension(s)"
            )
        self.n_samples, self.n_features = self.X.shape

        # initialize
        self.centroids = self.initial_kmeans_centroids(self.X, self.K)

        # With no optimisation rounds the loop below never assigns samples,
        # so do it once here; otherwise the labels would be meaningless.
        if self.max_iters <= 0:
            self.clusters = self._create_clusters(self.centroids)

        # optimize clusters
        for _ in range(self.max_iters):
            # assign samples to closest centroids (create clusters)
            self.clusters = self._create_clusters(self.centroids)

            # calculate new centroids from the clusters; self.centroids must
            # still hold the old centers while _get_centroids runs, because
            # it falls back to them for empty clusters
            centroids_old = self.centroids
            self.centroids = self._get_centroids(self.clusters)

            if self._is_converged(centroids_old, self.centroids):
                break

        # classify samples as the index of their clusters
        return self.centroids, self._get_cluster_labels(self.clusters)

    def initial_kmeans_centroids(self, X, K):
        """Pick ``K`` rows of ``X`` as starting centers using k-means++.

        The first center is drawn uniformly; every further center is drawn
        with probability proportional to the squared distance from each
        sample to its nearest already-chosen center.  Sampling goes through
        the ``random`` module, so ``random.seed`` makes it reproducible.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        K : int
            Number of centers to pick; must satisfy ``1 <= K <= n_samples``.

        Returns
        -------
        numpy.ndarray of shape (K, n_features)

        Raises
        ------
        ValueError
            If ``X`` is not 2-dimensional or ``K`` is out of range.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-dimensional (n_samples, n_features); "
                f"got {X.ndim} dimension(s)"
            )
        n_samples = X.shape[0]
        if not 1 <= K <= n_samples:
            raise ValueError(
                f"K must be between 1 and the number of samples "
                f"({n_samples}); got {K}"
            )

        candidates = range(n_samples)
        chosen = [rd.choices(candidates, k=1)[0]]

        # Squared distance from every sample to its nearest chosen center,
        # updated incrementally instead of being recomputed from scratch.
        closest_sq = np.sum((X - X[chosen[0]]) ** 2, axis=1)

        for _ in range(1, K):
            total = closest_sq.sum()
            if np.isfinite(total) and total > 0:
                weights = (closest_sq / total).tolist()
                new_idx = rd.choices(candidates, weights=weights, k=1)[0]
            else:
                # Every sample coincides with a chosen center (duplicate
                # rows): the weights would be 0/0, so draw uniformly.
                new_idx = rd.choices(candidates, k=1)[0]
            chosen.append(new_idx)
            closest_sq = np.minimum(
                closest_sq, np.sum((X - X[new_idx]) ** 2, axis=1)
            )

        return X[chosen]

    @staticmethod
    def _distances_to_centroids(points, centroids):
        """Return the (n_points, n_centroids) Euclidean distance matrix."""
        points = np.asarray(points, dtype=float)
        # reshape also accepts a list of (1, n_features) arrays
        centers = np.asarray(centroids, dtype=float).reshape(-1, points.shape[1])
        return np.stack(
            [np.sqrt(np.sum((points - center) ** 2, axis=1)) for center in centers],
            axis=1,
        )

    def _create_clusters(self, centroids):
        """Assign every sample of ``self.X`` to its nearest centroid.

        Returns a list with ``self.K`` entries; entry ``k`` lists the row
        indices of the samples whose closest centroid is ``k`` (ties go to
        the lowest centroid index).
        """
        nearest = np.argmin(self._distances_to_centroids(self.X, centroids), axis=1)
        clusters = [[] for _ in range(self.K)]
        for idx, centroid_idx in enumerate(nearest):
            clusters[centroid_idx].append(idx)
        return clusters

    def _get_cluster_labels(self, clusters):
        """Turn per-cluster index lists into one label per sample.

        Returns a float array of length ``self.n_samples``; samples that do
        not appear in any cluster keep the sentinel ``-1``.
        """
        labels = np.full(self.n_samples, -1.0)
        for cluster_idx, cluster in enumerate(clusters):
            for sample_idx in cluster:
                labels[sample_idx] = cluster_idx

        return labels

    def _closest_centroid(self, sample, centroids):
        """Return the index of the centroid nearest to a single ``sample``."""
        row = np.asarray(sample, dtype=float).reshape(1, -1)
        distances = self._distances_to_centroids(row, centroids)[0]
        return np.argmin(distances)

    def _get_centroids(self, clusters):
        """Return the mean of each cluster as a (K, n_features) array.

        An empty cluster has no mean; it keeps the center currently stored
        in ``self.centroids`` (or zeros when none is available) so that a
        NaN center never enters the iteration.
        """
        centroids = np.zeros((self.K, self.n_features))

        previous = None
        if len(self.centroids):
            previous = np.asarray(self.centroids, dtype=float).reshape(
                -1, self.n_features
            )

        for cluster_idx, cluster in enumerate(clusters):
            if len(cluster):
                centroids[cluster_idx] = np.mean(self.X[cluster], axis=0)
            elif previous is not None and cluster_idx < len(previous):
                centroids[cluster_idx] = previous[cluster_idx]
        return centroids

    def _is_converged(self, centroids_old, centroids):
        """Return True when no centroid moved between two iterations.

        The comparison is exact: once the assignments stop changing, the
        means are recomputed from identical inputs and match bit-for-bit.
        """
        new = np.asarray(centroids, dtype=float)
        old = np.asarray(centroids_old, dtype=float).reshape(new.shape)
        return bool(np.array_equal(old, new))