In [131]:
import numpy as np
from scipy.optimize import minimize


def minkowski_distance(a, b, p):
    """Return the order-``p`` Minkowski distance between ``a`` and ``b``.

    ``a`` and ``b`` are subtracted with NumPy broadcasting. The ``p``-th
    powers of the absolute differences are then summed over *every* element
    of the broadcast result before the ``1/p`` root is taken, so the return
    value is always a single scalar -- including when one argument is a 2-D
    stack of points and the other a single point.
    """
    elementwise = np.power(np.abs(np.subtract(a, b)), p)
    return np.power(np.sum(elementwise), 1 / p)


# def minkowski_distance(x, y, p):
#     if len(x) != len(y):
#         raise ValueError("Vectors x and y must have the same length")
#     return np.power(np.sum(np.power(np.abs(x - y), p)), 1/p)

# def minkowski_distance(x, y, p):
#     """Calculate the Minkowski distance between two points."""
#     return np.sum(np.abs(x - y) ** p) ** (1 / p)


class KMeans:
    """K-means-style clustering with a Minkowski metric and a pluggable centroid update.

    Parameters
    ----------
    n_clusters : int
        Number of clusters. That many distinct rows of the data are drawn
        (without replacement) as the starting centroids.
    max_iter : int, default 100
        Upper bound on the number of assignment/update rounds.
    p : float, default 2
        Order handed to ``minkowski_distance``.
    optimizer : str, default 'mean'
        Centroid update rule: ``'mean'``, ``'median'``, or one of the
        ``scipy.optimize.minimize`` method names in ``_SCIPY_METHODS``.
    tol : float, default 0.0
        ``fit`` stops early once no centroid coordinate moved by more than
        ``tol`` during a round. With the default ``0.0`` it stops only when
        every centroid is exactly unchanged, i.e. when the next round would
        start from identical inputs.
    random_state : int or None, default None
        Seed for the initial centroid draw. ``None`` draws from NumPy's
        global generator, so ``np.random.seed`` is still honoured.

    Attributes
    ----------
    centroids : list
        Current centroid of each cluster (empty before ``fit``).
    labels : list
        Cluster index of each row, from the latest assignment pass
        (empty before ``fit``).
    """

    # Method names forwarded unchanged to scipy.optimize.minimize(method=...).
    _SCIPY_METHODS = ('Powell', 'CG', 'BFGS', 'L-BFGS-B', 'TNC', 'COBYLA', 'SLSQP', 'trust-constr')

    def __init__(self, n_clusters, max_iter=100, p=2, optimizer='mean', tol=0.0, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.p = p
        self.optimizer = optimizer
        self.tol = tol
        self.random_state = random_state
        self.centroids = []
        self.labels = []

    def _update_centroid(self, members, current):
        """Return the new centroid for one non-empty cluster.

        ``members`` is the array of points assigned to the cluster and
        ``current`` its present centroid, used as the starting point of the
        numerical optimizers.

        Raises ``ValueError`` when ``self.optimizer`` is not recognised.
        """
        if self.optimizer == 'mean':
            return members.mean(axis=0)
        if self.optimizer == 'median':
            return np.median(members, axis=0)
        if self.optimizer in self._SCIPY_METHODS:
            # The objective broadcasts the candidate against all members, so it
            # is the Minkowski aggregate over every member and coordinate.
            result = minimize(
                lambda candidate: minkowski_distance(candidate, members, self.p),
                current,
                method=self.optimizer,
            )
            return result.x.copy()
        raise ValueError(f"Unsupported optimizer: {self.optimizer}")

    def fit(self, X):
        """Cluster the rows of ``X`` and return ``(centroids, labels)``.

        Each round assigns every row to its nearest centroid and then
        recomputes the centroid of every non-empty cluster; an empty cluster
        keeps its previous centroid. Rounds stop after ``max_iter`` or as
        soon as the centroids moved by at most ``tol``.

        Raises ``ValueError`` for an unsupported ``optimizer``.
        """
        X = np.asarray(X)

        # initialize centroids randomly from distinct rows of X
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        start_rows = rng.choice(X.shape[0], self.n_clusters, replace=False)
        self.centroids = [X[i] for i in start_rows]

        for _ in range(self.max_iter):
            clusters = [[] for _ in range(self.n_clusters)]
            self.labels = []

            # assign each data point to the closest centroid
            for x in X:
                distances = [minkowski_distance(x, centroid, self.p) for centroid in self.centroids]
                nearest = np.argmin(distances)
                clusters[nearest].append(x)
                self.labels.append(nearest)

            # update centroids using the specified optimizer
            previous = list(self.centroids)
            for j, members in enumerate(clusters):
                if not members:
                    continue
                self.centroids[j] = self._update_centroid(np.array(members), self.centroids[j])

            # Converged: another round would repeat the same assignment and
            # update. A NaN coordinate compares False, so the loop continues.
            if all(np.all(np.abs(new - old) <= self.tol)
                   for new, old in zip(self.centroids, previous)):
                break

        return self.centroids, self.labels

In [132]:
import numpy as np
import matplotlib.pyplot as plt

def generate_clusters(k, means, stds, n_points_per_cluster, X, distance_factor=1):
    """Sample ``k`` isotropic Gaussian blobs and their integer labels.

    Cluster ``i`` is drawn from a multivariate normal centred at
    ``means[i] * distance_factor`` with covariance ``stds[i] ** 2`` times the
    ``X``-by-``X`` identity; note that ``X`` here is the dimensionality of
    the points, not a data matrix.

    Returns a pair ``(points, labels)``: the samples of all clusters stacked
    in cluster order, and an array holding ``i`` for every point drawn from
    cluster ``i``.
    """
    samples = [
        np.random.multivariate_normal(
            means[cluster_id] * distance_factor,
            np.identity(X) * stds[cluster_id] ** 2,
            n_points_per_cluster,
        )
        for cluster_id in range(k)
    ]
    tags = [np.full((n_points_per_cluster,), cluster_id) for cluster_id in range(k)]
    return np.concatenate(samples), np.concatenate(tags)

In [133]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def make_plots(axs, index, color_map, data, labels, opt_name=''):
    """Draw 2-D t-SNE and PCA scatter plots of ``data`` on row ``index`` of ``axs``.

    ``axs`` is indexed as ``axs[index, column]``: column 0 receives the t-SNE
    embedding and column 1 the PCA projection. Every point is coloured with
    ``color_map[label]`` for its entry in ``labels``, and ``opt_name`` is
    prefixed to both panel titles.
    """
    point_colors = [color_map[label] for label in labels]

    # (column, reducer, title) -- t-SNE is fitted first, then PCA.
    panels = (
        (0, TSNE(n_components=2), f'{opt_name} in 2D via t-SNE'),
        (1, PCA(n_components=2), f'{opt_name} in 2D via PCA'),
    )
    for column, reducer, title in panels:
        embedded = reducer.fit_transform(data)
        panel = axs[index, column]
        panel.scatter(embedded[:, 0], embedded[:, 1], c=point_colors)
        panel.set_title(title)


In [134]:
n_clusters = 2

# Palette that maps a cluster label to the colour of its points in the plots.
color_map = {0: 'red', 1: 'green', 2: 'blue', 3: 'yellow', 4: 'purple'}

# Every label in 0..n_clusters-1 needs a colour, so the bound is taken from the
# palette itself rather than a separate hard-coded number.
assert n_clusters <= len(color_map), f"n_clusters must be less than or equal to {len(color_map)}"

In [135]:
dimension = 50
# NOTE(review): the last two entries both evaluate to the zero vector, so they
# coincide with means[0]. This only matters when n_clusters > 3 -- confirm the
# intended centres before raising n_clusters.
means = [np.zeros(dimension), np.ones(dimension), -np.ones(dimension), np.ones(dimension) * -1 + 1, np.ones(dimension) + -1]
stds = [0.5, 0.5, 0.5, 0.5, 0.5]
n_points_per_cluster = 100
distance_factor = 0.3
random_seed = 42

# generate_clusters indexes means[i] and stds[i] for i < n_clusters.
assert n_clusters <= len(means), "n_clusters exceeds the number of configured means"
assert n_clusters <= len(stds), "n_clusters exceeds the number of configured stds"

# Seed NumPy's global generator so the sampled data set is identical on every
# Restart & Run All.
np.random.seed(random_seed)
data, true_labels = generate_clusters(n_clusters, means, stds, n_points_per_cluster, dimension, distance_factor)

assert data.shape == (n_clusters * n_points_per_cluster, dimension)
assert true_labels.shape == (n_clusters * n_points_per_cluster,)

In [None]:
X = data
optimizers = ['mean', 'median', 'Powell', 'CG', 'BFGS', 'L-BFGS-B', 'TNC', 'COBYLA', 'SLSQP', 'trust-constr']
# Predicted labels per optimizer, filled in by the loop below.
data_map = dict.fromkeys(optimizers)
p = 0.2

len_optimizers = len(optimizers)
# squeeze=False keeps `axs` two-dimensional even when only one optimizer is
# listed; make_plots indexes it as axs[index, column].
fig, axs = plt.subplots(len_optimizers, 2, figsize=(15, 5*len_optimizers), squeeze=False)
fig.suptitle(f'Experiment with data dim: {dimension}, p: {p}', fontsize=16)

# One KMeans run and one row of plots (t-SNE | PCA) per centroid-update rule.
for index, optimizer in enumerate(optimizers):
    kmeans = KMeans(n_clusters=n_clusters, optimizer=optimizer, p=p)
    centroids, labels = kmeans.fit(X)
    data_map[optimizer] = labels

    make_plots(axs, index, color_map, X, labels, optimizer)
    print(f'Processed {optimizer}')
plt.show()

In [None]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import mutual_info_score
import pandas as pd

# Score each optimizer's predicted labels against the generating labels.
# `ami` holds the plain (unadjusted) mutual information score.
ari = [adjusted_rand_score(true_labels, data_map[name]) for name in optimizers]
ami = [mutual_info_score(true_labels, data_map[name]) for name in optimizers]

# Last expression of the cell: one row per optimizer, one column per metric.
pd.DataFrame(
    data={'Adjusted Rand Index': ari, 'Mutual Information Score': ami},
    index=optimizers,
)