 * Elkan, Charles.
 * "Using the triangle inequality to accelerate k-means."
 * ICML. Vol. 3. 2003.
 *
 * <p>
 * The algorithm uses the triangle inequality to speed up computation by reducing
 * the number of distance calculations. Towards the last iterations of
 * the algorithm, points which have already been assigned to some cluster are
 * unlikely to move to a new cluster; updates of cluster centers are also
 * usually relatively small.
 * The triangle inequality is thus used to determine the cases where the
 * distance computation can be skipped since a center moved only a little,
 * without affecting the partitioning of the points.
 *
 * <p>
 * For initial center seeding, we apply the algorithm described in:
 *
 * <p>
 * Arthur, David, and Sergei Vassilvitskii.
 * "k-means++: The advantages of careful seeding."
 * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms.
 * Society for Industrial and Applied Mathematics, 2007.
+ * + * @param Type of the points to cluster. + */ +public class ElkanKMeansPlusPlusClusterer + extends KMeansPlusPlusClusterer { + + /** + * @param k Clustering parameter. + */ + public ElkanKMeansPlusPlusClusterer(int k) { + super(k); + } + + /** + * @param k Clustering parameter. + * @param maxIterations Allowed number of iterations. + * @param measure Distance measure. + * @param random Random generator. + */ + public ElkanKMeansPlusPlusClusterer(int k, + int maxIterations, + DistanceMeasure measure, + UniformRandomProvider random) { + super(k, maxIterations, measure, random); + } + + /** + * @param k Clustering parameter. + * @param maxIterations Allowed number of iterations. + * @param measure Distance measure. + * @param random Random generator. + * @param emptyStrategy Strategy for handling empty clusters that + * may appear during algorithm progress. + */ + public ElkanKMeansPlusPlusClusterer(int k, + int maxIterations, + DistanceMeasure measure, + UniformRandomProvider random, + EmptyClusterStrategy emptyStrategy) { + super(k, maxIterations, measure, random, emptyStrategy); + } + + /** {@inheritDoc} */ + @Override + public List> cluster(final Collection points) { + final int k = getNumberOfClusters(); + + // Number of clusters has to be smaller or equal the number of data points. + if (points.size() < k) { + throw new NumberIsTooSmallException(points.size(), k, false); + } + + final List pointsList = new ArrayList<>(points); + final int n = points.size(); + final int dim = pointsList.get(0).getPoint().length; + + // Keep minimum intra cluster distance, e.g. for given cluster c s[c] is + // the distance to the closest cluster c' or s[c] = 1/2 * min_{c'!=c} dist(c', c) + final double[] s = new double[k]; + Arrays.fill(s, Double.MAX_VALUE); + // Store the matrix of distances between all cluster centers, e.g. dcc[c1][c2] = distance(c1, c2) + final double[][] dcc = new double[k][k]; + + // For each point keeps the upper bound distance to the cluster center. 
+ final double[] u = new double[n]; + Arrays.fill(u, Double.MAX_VALUE); + + // For each point and for each cluster keeps the lower bound for the distance between the point and cluster + final double[][] l = new double[n][k]; + + // Seed initial set of cluster centers. + final double[][] centers = seed(pointsList); + + // Points partitioning induced by cluster centers, e.g. for point xi the value of partitions[xi] indicates + // the cluster or index of the cluster center which is closest to xi. partitions[xi] = min_{c} distance(xi, c). + final int[] partitions = partitionPoints(pointsList, centers, u, l); + + final double[] deltas = new double[k]; + VectorialMean[] means = new VectorialMean[k]; + for (int it = 0, changes = 0, max = getMaxIterations(); + it < max; + it++, changes = 0) { + // Step I. + // Compute inter-cluster distances. + updateIntraCentersDistances(centers, dcc, s); + + for (int xi = 0; xi < n; xi++) { + boolean r = true; + + // Step II. + if (u[xi] <= s[partitions[xi]]) { + continue; + } + + for (int c = 0; c < k; c++) { + // Check condition III. + if (isSkipNext(partitions, u, l, dcc, xi, c)) { + continue; + } + + final double[] x = pointsList.get(xi).getPoint(); + + // III(a) + if (r) { + u[xi] = distance(x, centers[partitions[xi]]); + l[xi][partitions[xi]] = u[xi]; + r = false; + } + // III(b) + if (u[xi] > l[xi][c] || u[xi] > dcc[partitions[xi]][c]) { + l[xi][c] = distance(x, centers[c]); + if (l[xi][c] < u[xi]) { + partitions[xi] = c; + u[xi] = l[xi][c]; + ++changes; + } + } + } + } + + // Stopping criterion. + if (changes == 0 && + it != 0) { // First iteration needed (to update bounds). + break; + } + + // Step IV. 
+ Arrays.fill(means, null); + for (int i = 0; i < n; i++) { + if (means[partitions[i]] == null) { + means[partitions[i]] = new VectorialMean(dim); + } + means[partitions[i]].increment(pointsList.get(i).getPoint()); + } + + for (int i = 0; i < k; i++) { + deltas[i] = distance(centers[i], means[i].getResult()); + centers[i] = means[i].getResult(); + } + + updateBounds(partitions, u, l, deltas); + } + + return buildResults(pointsList, partitions, centers); + } + + /** + * kmeans++ seeding which provides guarantee of resulting with log(k) approximation + * for final clustering results + *

+ * Arthur, David, and Sergei Vassilvitskii. "k-means++: The advantages of careful seeding." + * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms. + * Society for Industrial and Applied Mathematics, 2007. + * + * @param points input data points + * @return an array of initial clusters centers + * + */ + private double[][] seed(final List points) { + final int k = getNumberOfClusters(); + final UniformRandomProvider random = getRandomGenerator(); + + final double[][] result = new double[k][]; + final int n = points.size(); + final int pointIndex = random.nextInt(n); + + final double[] minDistances = new double[n]; + + int idx = 0; + result[idx] = points.get(pointIndex).getPoint(); + + double sumSqDist = 0; + + for (int i = 0; i < n; i++) { + final double d = distance(result[idx], points.get(i).getPoint()); + minDistances[i] = d * d; + sumSqDist += minDistances[i]; + } + + while (++idx < k) { + final double p = sumSqDist * random.nextDouble(); + int next = 0; + for (double cdf = 0; cdf < p; next++) { + cdf += minDistances[next]; + } + + result[idx] = points.get(next - 1).getPoint(); + for (int i = 0; i < n; i++) { + final double d = distance(result[idx], points.get(i).getPoint()); + sumSqDist -= minDistances[i]; + minDistances[i] = Math.min(minDistances[i], d * d); + sumSqDist += minDistances[i]; + } + } + + return result; + } + + + /** + * Once initial centers are chosen, we can actually go through data points and assign points to the + * cluster based on the distance between initial centers and points. 
+ * + * @param pointsList data points list + * @param centers current clusters centers + * @param u points upper bounds + * @param l lower bounds for points to clusters centers + * + * @return initial assignment of points into clusters + */ + private int[] partitionPoints(List pointsList, + double[][] centers, + double[] u, + double[][] l) { + final int k = getNumberOfClusters(); + final int n = pointsList.size(); + // Points assignments vector. + final int[] assignments = new int[n]; + Arrays.fill(assignments, -1); + // Need to assign points to the clusters for the first time and intitialize the lower bound l(x, c) + for (int i = 0; i < n; i++) { + final double[] x = pointsList.get(i).getPoint(); + for (int j = 0; j < k; j++) { + l[i][j] = distance(x, centers[j]); // l(x, c) = d(x, c) + if (u[i] > l[i][j]) { + u[i] = l[i][j]; // u(x) = min_c d(x, c) + assignments[i] = j; // c(x) = argmin_c d(x, c) + } + } + } + return assignments; + } + + /** + * Updated distances between clusters centers and for each cluster + * pick the closest neighbour and keep distance to it. + * + * @param centers cluster centers + * @param dcc matrix of distance between clusters centers, e.g. + * {@code dcc[i][j] = distance(centers[i], centers[j])} + * @param s For a given cluster, {@code s[si]} holds distance value + * to the closest cluster center. + */ + private void updateIntraCentersDistances(double[][] centers, + double[][] dcc, + double[] s) { + final int k = getNumberOfClusters(); + for (int i = 0; i < k; i++) { + // Since distance(xi, xj) == distance(xj, xi), we need to update + // only upper or lower triangle of the distances matrix and mirror + // to the lower of upper triangle accordingly, trace has to be all + // zeros, since distance(xi, xi) == 0. 
+ for (int j = i + 1; j < k; j++) { + dcc[i][j] = 0.5 * distance(centers[i], centers[j]); + dcc[j][i] = dcc[i][j]; + if (dcc[i][j] < s[i]) { + s[i] = dcc[i][j]; + } + if (dcc[j][i] < s[j]) { + s[j] = dcc[j][i]; + } + } + } + } + + /** + * For given points and and cluster, check condition (3) of Elkan algorithm. + * + *
     * <ul>
     *  <li>c is not the cluster xi is assigned to</li>
     *  <li>{@code u[xi] > l[xi][c]}: upper bound for point xi is greater than
     *      the lower bound between xi and some cluster c</li>
     *  <li>{@code u[xi] > 1/2 * d(c(xi), c)}: upper bound is greater than
     *      half the distance between the center of xi's cluster and c</li>
     * </ul>
     *