From 295bdde38d930b9fa32076a23e751ec128104648 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 30 Mar 2015 20:29:12 +0900 Subject: [PATCH 01/76] [SPARK-6517][mllib] Implement the Algorithm of Hierarchical Clustering Thank you for your great cooperation, RJ Nowling(@rnowling), Jeremy Freeman(@freeman-lab), Xiangrui Meng(@mengxr) and Sean Owen(@srowen). --- .../mllib/api/python/PythonMLLibAPI.scala | 23 + .../clustering/HierarchicalClustering.scala | 609 ++++++++++++++++++ .../HierarchicalClusteringModel.scala | 97 +++ .../HierarchicalClusteringModelSuite.scala | 118 ++++ .../HierarchicalClusteringSuite.scala | 188 ++++++ 5 files changed, 1035 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 21e55938fa7aa..49a3420c26945 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -401,6 +401,29 @@ private[python] class PythonMLLibAPI extends Serializable { } } + /** + * Java stub for Python mllib HierarchicalClustering.run() + */ + def trainHierarchicalClusteringModel( + data: JavaRDD[Vector], + k: Int, + maxIterations: Int, + maxRetries: Int, + seed: java.lang.Long): HierarchicalClusteringModel = { + val algo = new HierarchicalClustering() + .setNumClusters(k) + .setMaxIterations(maxIterations) + .setMaxRetries(maxRetries) + + if (seed != null) algo.setSeed(seed) + + try { + algo.run(data) + } finally { + data.rdd.unpersist(blocking = false) + } + } + /** * Java stub for Python mllib GaussianMixtureModel.predictSoft() */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala new file mode 100644 index 0000000000000..dd6cf6d0f8b94 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -0,0 +1,609 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.clustering + +import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, norm => breezeNorm} +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.rdd.RDD +import org.apache.spark.util.random.XORShiftRandom +import org.apache.spark.{Logging, SparkException} + +import scala.collection.{Map, mutable} + + +/** + * Top-level methods for calling the hierarchical clustering algorithm + */ +object HierarchicalClustering extends Logging { + + private[clustering] val ROOT_INDEX_KEY: Long = 1 + + /** + * Trains a hierarchical clustering model with the given data + * + * @param data trained data + * @param numClusters the maximum number of clusters you want + * @return a hierarchical clustering model + */ + def train(data: RDD[Vector], numClusters: Int): HierarchicalClusteringModel = { + val algo = new HierarchicalClustering().setNumClusters(numClusters) + algo.run(data) + } + + /** + * Trains a hierarchical clustering model with the given data + * + * @param data training data + * @param numClusters the maximum number of clusters you want + * @param maxIterations the number of maximal iterations + * @param maxRetries the number of maximum retries when the clustering can't be succeeded + * @param seed the randomseed to generate the initial vectors for each bisecting + * @return a hierarchical clustering model + */ + def train(data: RDD[Vector], + numClusters: Int, + maxIterations: Int, + maxRetries: Int, + seed: Int): HierarchicalClusteringModel = { + + val algo = new HierarchicalClustering().setNumClusters(numClusters) + .setMaxIterations(maxIterations) + .setMaxRetries(maxRetries) + .setSeed(seed) + algo.run(data) + } + + /** + * Finds the closes cluster's center + * + * @param metric a distance metric + * @param centers centers of the clusters + * @param point a target point + * @return an index of the array of clusters + */ + private[mllib] + def findClosestCenter(metric: Function2[BV[Double], BV[Double], Double]) + (centers: Seq[BV[Double]])(point: BV[Double]): Int = { + val (closestCenter, closestIndex) = + centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1) + closestIndex + } +} + +/** + * This is a divisive hierarchical clustering algorithm based on bi-sect k-means algorithm. + * + * The main idea of this algorithm is based on "A comparison of document clustering techniques", + * M. Steinbach, G. Karypis and V. Kumar. Workshop on Text Mining, KDD, 2000. 
+ * http://cs.fit.edu/~pkc/classes/ml-internet/papers/steinbach00tr.pdf + * + * @param numClusters tne number of clusters you want + * @param clusterMap the pairs of cluster and its index as Map + * @param maxIterations the number of maximal iterations + * @param maxRetries the number of maximum retries + * @param seed a random seed + */ +class HierarchicalClustering( + private var numClusters: Int, + private var clusterMap: Map[Long, ClusterTree], + private var maxIterations: Int, + private var maxRetries: Int, + private var seed: Long) extends Logging { + + /** + * Constructs with the default configuration + */ + def this() = this(20, mutable.ListMap.empty[Long, ClusterTree], 20, 10, 1) + + /** + * Sets the number of clusters you want + */ + def setNumClusters(numClusters: Int): this.type = { + this.numClusters = numClusters + this + } + + /** + * Sets the number of maximal iterations in each clustering step + */ + def setMaxIterations(maxIterations: Int): this.type = { + this.maxIterations = maxIterations + this + } + + def getSubIterations(): Int = this.maxIterations + + /** + * Sets the number of maximum retries of each clustering step + */ + def setMaxRetries(maxRetries: Int): this.type = { + this.maxRetries = maxRetries + this + } + + def getMaxRetries(): Int = this.maxRetries + + /** + * Sets the random seed + */ + def setSeed(seed: Long): this.type = { + this.seed = seed + this + } + + def getSeed(): Long = this.seed + + /** + * Runs the hierarchical clustering algorithm + * @param input RDD of vectors + * @return model for the hierarchical clustering + */ + def run(input: RDD[Vector]): HierarchicalClusteringModel = { + val sc = input.sparkContext + log.info(s"${sc.appName} starts a hierarchical clustering algorithm") + + var data = initData(input).cache() + val startTime = System.currentTimeMillis() + + // `clusters` is described as binary tree structure + // `clusters(1)` means the root of a binary tree + var clusters = summarizeAsClusters(data) + var leafClusters = clusters + var step = 1 + var numDividedClusters = 0 + var noMoreDividable = false + var rddArray = Array.empty[RDD[(Long, BV[Double])]] + // the number of maximum nodes of a binary tree by given parameter + val multiplier = math.ceil(math.log10(this.numClusters) / math.log10(2.0)) + 1 + val maxAllNodesInTree = math.pow(2, multiplier).toInt + + while (clusters.size < maxAllNodesInTree && noMoreDividable == false) { + log.info(s"${sc.appName} starts step ${step}") + + // enough to be clustered if the number of divided clusters is equal to 0 + val divided = getDividedClusters(data, leafClusters) + if (divided.size == 0) { + noMoreDividable = true + } + else { + // update each index + val newData = updateClusterIndex(data, divided).cache() + rddArray = rddArray ++ Array(data) + data = newData + + // keep recent 2 cached RDDs in order to run more quickly + if (rddArray.size > 1) { + val head = rddArray.head + head.unpersist() + rddArray = rddArray.filterNot(_.hashCode() == head.hashCode()) + } + + // merge the divided clusters with the map as the cluster tree + clusters = clusters ++ divided + numDividedClusters = data.map(_._1).distinct().count().toInt + leafClusters = divided + step += 1 + + log.info(s"${sc.appName} adding ${divided.size} new clusters at step:${step}") + } + } + // unpersist kept RDDs + rddArray.foreach(_.unpersist()) + + // build a cluster tree by Map class which is expressed + log.info(s"Building the cluster tree is started in ${sc.appName}") + val root = buildTree(clusters, 
HierarchicalClustering.ROOT_INDEX_KEY, this.numClusters) + if (root == None) { + new SparkException("Failed to build a cluster tree from a Map type of clusters") + } + + // set the elapsed time for training + val finishTime = (System.currentTimeMillis() - startTime) / 1000.0 + log.info(s"Elapsed Time for Hierarchical Clustering Training: ${finishTime} [sec]") + + // make a hierarchical clustering model + val model = new HierarchicalClusteringModel(root.get) + val leavesNodes = model.getClusters() + if (leavesNodes.size < this.numClusters) { + log.warn(s"# clusters is less than you have expected: ${leavesNodes.size} / ${numClusters}. ") + } + model + } + + /** + * Assigns the initial cluster index id to all data + */ + private[clustering] + def initData(data: RDD[Vector]): RDD[(Long, BV[Double])] = { + data.map { v: Vector => (HierarchicalClustering.ROOT_INDEX_KEY, v.toBreeze)}.cache + } + + /** + * Summarizes data by each cluster as ClusterTree2 classes + */ + private[clustering] + def summarizeAsClusters(data: RDD[(Long, BV[Double])]): Map[Long, ClusterTree] = { + // summarize input data + val stats = summarize(data) + + // convert statistics to ClusterTree class + stats.map { case (i, (sum, n, sumOfSquares)) => + val center = Vectors.fromBreeze(sum :/ n) + val variances = n match { + case n if n > 1 => Vectors.fromBreeze(sumOfSquares.:*(n) - (sum :* sum) :/ (n * (n - 1.0))) + case _ => Vectors.zeros(sum.size) + } + (i, new ClusterTree(center, n.toLong, variances)) + }.toMap + } + + /** + * Summarizes data by each cluster as Map + */ + private[clustering] + def summarize(data: RDD[(Long, BV[Double])]): Map[Long, (BV[Double], Double, BV[Double])] = { + data.mapPartitions { iter => + // calculate the accumulation of the all point in a partition and count the rows + val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] + iter.foreach { case (idx: Long, point: BV[Double]) => + // get a map value or else get a sparse vector + val (sumBV, n, sumOfSquares) = map.get(idx) + .getOrElse(BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + map(idx) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) + } + map.toIterator + }.reduceByKey { case ((sum1, n1, sumOfSquares1), (sum2, n2, sumOfSquares2)) => + // sum the accumulation and the count in the all partition + (sum1 + sum2, n1 + n2, sumOfSquares1 + sumOfSquares2) + }.collect().toMap + } + + /** + * Gets the initial centers for bi-sect k-means + */ + private[clustering] + def initChildrenCenter(clusters: Map[Long, BV[Double]]): Map[Long, BV[Double]] = { + val rand = new XORShiftRandom() + rand.setSeed(this.seed) + + clusters.flatMap { case (idx, center) => + val childrenIndexes = Array(2 * idx, 2 * idx + 1) + val relativeErrorCoefficient = 0.001 + Array( + (2 * idx, center.map(elm => elm - (elm * relativeErrorCoefficient * rand.nextDouble()))), + (2 * idx + 1, center.map(elm => elm + (elm * relativeErrorCoefficient * rand.nextDouble()))) + ) + }.toMap + } + + /** + * Gets the new divided centers + */ + private[clustering] + def getDividedClusters(data: RDD[(Long, BV[Double])], + dividedClusters: Map[Long, ClusterTree]): Map[Long, ClusterTree] = { + val sc = data.sparkContext + val appName = sc.appName + + // get keys of dividable clusters + val dividableKeys = dividedClusters.filter { case (idx, cluster) => + cluster.variances.toArray.sum > 0.0 && cluster.records >= 2 + }.keySet + if (dividableKeys.size == 0) { + log.info(s"There is no dividable clusters in ${appName}.") + return Map.empty[Long, ClusterTree] + } 
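For illustration, the index bookkeeping used above can be summarized in a small standalone sketch. It is not part of the patch: the object name BisectIndexSketch and the helpers childIndexes, parentIndex and seedChildCenters are hypothetical, and a fixed eps stands in for the eps * random perturbation that initChildrenCenter actually applies. The sketch shows the implicit binary tree over cluster indices (root index 1, children 2 * i and 2 * i + 1) and how two child centers are seeded by nudging the parent center in opposite directions.

object BisectIndexSketch {
  // children of cluster i in the implicit binary tree rooted at index 1
  def childIndexes(i: Long): Array[Long] = Array(2 * i, 2 * i + 1)

  // parent of cluster i (integer division), as used when retrying failed splits
  def parentIndex(i: Long): Long = i / 2

  // seed two child centers by nudging the parent center in opposite directions
  // (simplified: a fixed eps instead of eps * random as in initChildrenCenter)
  def seedChildCenters(center: Array[Double], eps: Double): (Array[Double], Array[Double]) =
    (center.map(x => x - x * eps), center.map(x => x + x * eps))

  def main(args: Array[String]): Unit = {
    println(childIndexes(1).mkString(", "))   // 2, 3
    println(parentIndex(7))                   // 3
    val (left, right) = seedChildCenters(Array(1.0, 2.0), 0.001)
    println(left.mkString(", "))              // approximately 0.999, 1.998
    println(right.mkString(", "))             // approximately 1.001, 2.002
  }
}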
+
+    // divide input data
+    var dividableData = data.filter { case (idx, point) => dividableKeys.contains(idx)}
+    var dividableClusters = dividedClusters.filter { case (k, v) => dividableKeys.contains(k)}
+    val idealIndexes = dividableKeys.flatMap(idx => Array(2 * idx, 2 * idx + 1).toIterator)
+    var stats = divide(data, dividableClusters)
+
+    // if there is clusters which is failed to be divided,
+    // retry to divide only failed clusters again and again
+    var tryTimes = 1
+    while (stats.size < dividableKeys.size * 2 && tryTimes <= this.maxRetries) {
+      // get the indexes of clusters which is failed to be divided
+      val failedIndexes = idealIndexes.filterNot(stats.keySet.contains).map(idx => (idx / 2).toLong)
+      val failedCenters = dividedClusters.filter { case (idx, clstr) => failedIndexes.contains(idx)}
+      log.info(s"# failed clusters is ${failedCenters.size} of ${dividableKeys.size}" +
+        s"at ${tryTimes} times in ${appName}")
+
+      // divide the failed clusters again
+      sc.broadcast(failedIndexes)
+      dividableData = data.filter { case (idx, point) => failedIndexes.contains(idx)}
+      val missingStats = divide(dividableData, failedCenters)
+      stats = stats ++ missingStats
+      tryTimes += 1
+    }
+
+    // make children clusters
+    stats.filter { case (i, (sum, n, sumOfSquares)) => n > 0}
+      .map { case (i, (sum, n, sumOfSquares)) =>
+        val center = Vectors.fromBreeze(sum :/ n)
+        val variances = n match {
+          case 1 => Vectors.sparse(sum.size, Array(), Array())
+          case _ => Vectors.fromBreeze(sumOfSquares.:*(n) - (sum :* sum) :/ (n * (n - 1.0)))
+        }
+        val child = new ClusterTree(center, n.toLong, variances)
+        (i, child)
+      }.toMap
+  }
+
+  /**
+   * Builds a cluster tree from a Map of clusters
+   *
+   * @param treeMap divided clusters as a Map class
+   * @param rootIndex index you want to start
+   * @param numClusters the number of clusters you want
+   * @return
+   */
+  private[clustering]
+  def buildTree(treeMap: Map[Long, ClusterTree],
+    rootIndex: Long,
+    numClusters: Int): Option[ClusterTree] = {
+
+    // if there is no index in the Map
+    if (!treeMap.contains(rootIndex)) return None
+
+    // build a cluster tree if the queue is empty or until the number of leaves clusters is enough
+    var numLeavesClusters = 1
+    val root = treeMap(rootIndex)
+    var leavesQueue = Map(rootIndex -> root)
+    while (leavesQueue.size > 0 && numLeavesClusters < numClusters) {
+      // pick up the cluster whose variance is the maximum in the queue
+      val mostScattered = leavesQueue.maxBy(_._2.variancesNorm)
+      val mostScatteredKey = mostScattered._1
+      val mostScatteredCluster = mostScattered._2
+
+      // relate the most scattered cluster to its children clusters
+      val childrenIndexes = Array(2 * mostScatteredKey, 2 * mostScatteredKey + 1)
+      if (childrenIndexes.forall(i => treeMap.contains(i))) {
+        // insert children to the most scattered cluster
+        val children = childrenIndexes.map(i => treeMap(i))
+        mostScatteredCluster.insert(children)
+
+        // calculate the local dendrogram height
+        // TODO Supports distance metrics other Euclidean distance metric
+        val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0)
+        val localHeight = children
+          .map(child => metric(child.center.toBreeze, mostScatteredCluster.center.toBreeze)).max
+        mostScatteredCluster.setLocalHeight(localHeight)
+
+        // update the queue
+        leavesQueue = leavesQueue ++ childrenIndexes.map(i => (i -> treeMap(i))).toMap
+        numLeavesClusters += 1
+      }
+
+      // remove the cluster which is involved to the cluster tree
+      leavesQueue = leavesQueue.filterNot(_ == mostScattered)
+
+
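The loop above always expands the leaf whose variance norm is largest, attaches that leaf's two children, and stops once the requested number of leaves exists. A standalone sketch of that greedy selection order, using a simplified stand-in for ClusterTree (BuildOrderSketch and Leaf are hypothetical names, not from the patch):

object BuildOrderSketch {
  final case class Leaf(index: Long, variancesNorm: Double)

  def main(args: Array[String]): Unit = {
    // candidate children produced by earlier bisections, keyed by tree index
    val candidates = Map(
      2L -> Leaf(2L, 4.0), 3L -> Leaf(3L, 1.0),
      4L -> Leaf(4L, 0.5), 5L -> Leaf(5L, 0.2))
    var leaves = Map(1L -> Leaf(1L, 5.0))

    // expand twice: first the root (norm 5.0), then leaf 2 (norm 4.0)
    for (_ <- 1 to 2) {
      val (key, _) = leaves.maxBy(_._2.variancesNorm)
      val children = Array(2 * key, 2 * key + 1).flatMap(i => candidates.get(i).map(i -> _))
      leaves = (leaves - key) ++ children
    }
    println(leaves.keys.toSeq.sorted.mkString(", "))  // 3, 4, 5
  }
}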
log.info(s"Total Leaves Clusters: ${numLeavesClusters} / ${numClusters}. " + + s"Cluster ${childrenIndexes.mkString(",")} are merged.") + } + Some(root) + } + + /** + * Divides the input data + * + * @param data the pairs of cluster index and point which you want to divide + * @param clusters the clusters you want to divide AS a Map class + * @return divided clusters as Map + */ + private[clustering] + def divide(data: RDD[(Long, BV[Double])], + clusters: Map[Long, ClusterTree]): Map[Long, (BV[Double], Double, BV[Double])] = { + + val sc = data.sparkContext + val centers = clusters.map { case (idx, cluster) => (idx, cluster.center.toBreeze)} + var newCenters = initChildrenCenter(centers) + if (newCenters.size == 0) { + return Map.empty[Long, (BV[Double], Double, BV[Double])] + } + sc.broadcast(newCenters) + + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + sc.broadcast(metric) + + val vectorSize = newCenters(newCenters.keySet.min).size + var stats = newCenters.keys.map { idx => + (idx, (BSV.zeros[Double](vectorSize).toVector, 0.0, BSV.zeros[Double](vectorSize).toVector)) + }.toMap + + var subIter = 0 + var diffVariances = Double.MaxValue + var oldVariances = Double.MaxValue + var variances = Double.MaxValue + while (subIter < this.maxIterations && diffVariances > 10E-4) { + // calculate summary of each cluster + val eachStats = data.mapPartitions { iter => + val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] + iter.foreach { case (idx, point) => + // calculate next index number + val childrenCenters = Array(2 * idx, 2 * idx + 1).filter(newCenters.keySet.contains(_)) + .map(newCenters(_)).toArray + if (childrenCenters.size >= 1) { + val closestIndex = + HierarchicalClustering.findClosestCenter(metric)(childrenCenters)(point) + val nextIndex = 2 * idx + closestIndex + + // get a map value or else get a sparse vector + val (sumBV, n, sumOfSquares) = map.get(nextIndex) + .getOrElse(BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + map(nextIndex) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) + } + } + map.toIterator + }.reduceByKey { case ((sv1, n1, sumOfSquares1), (sv2, n2, sumOfSquares2)) => + // sum the accumulation and the count in the all partition + (sv1 + sv2, n1 + n2, sumOfSquares1 + sumOfSquares2) + }.collect().toMap + + // calculate the center of each cluster + newCenters = eachStats.map { case (idx, (sum, n, sumOfSquares)) => (idx, sum :/ n)} + + // update summary of each cluster + stats = eachStats.toMap + + variances = stats.map { case (idx, (sum, n, sumOfSquares)) => + math.pow(sumOfSquares.toArray.sum, 1.0 / sumOfSquares.size) + }.sum + diffVariances = math.abs(oldVariances - variances) / oldVariances + oldVariances = variances + subIter += 1 + } + stats + } + + /** + * Updates the indexes of clusters which is divided to its children indexes + */ + private[clustering] + def updateClusterIndex( + data: RDD[(Long, BV[Double])], + dividedClusters: Map[Long, ClusterTree]): RDD[(Long, BV[Double])] = { + // extract the centers of the clusters + val sc = data.sparkContext + var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} + sc.broadcast(centers) + + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + sc.broadcast(metric) + + // update the indexes to their children indexes + data.map { case (idx, point) => + val 
childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(centers.keySet.contains(_)) + childrenIndexes.size match { + // stay the index if the number of children is not enough + case s if s < 2 => (idx, point) + // update the indexes + case _ => { + val nextCenters = childrenIndexes.map(centers(_)).map(_.toBreeze) + val closestIndex = HierarchicalClustering.findClosestCenter(metric)(nextCenters)(point) + val nextIndex = 2 * idx + closestIndex + (nextIndex, point) + } + } + } + } +} + +/** + * A cluster as a tree node which can have its sub nodes + * + * @param center the center of the cluster + * @param records the number of rows in the cluster + * @param variances variance vectors + * @param parent the parent cluster of the cluster + * @param children the children nodes of the cluster + * @param variancesNorm the sum of squares of variances + */ +class ClusterTree( + val center: Vector, + val records: Long, + val variances: Vector, + val variancesNorm: Double, + private var localHeight: Double, + private var parent: Option[ClusterTree], + private var children: Seq[ClusterTree]) extends Serializable { + + require(!variancesNorm.isNaN) + + def this(center: Vector, rows: Long, variances: Vector) = + this(center, rows, variances, breezeNorm(variances.toBreeze, 2.0), + 0.0, None, Array.empty[ClusterTree]) + + /** + * Inserts sub nodes as its children + * + * @param children inserted sub nodes + */ + def insert(children: Array[ClusterTree]) { + this.children = this.children ++ children + children.foreach(child => child.parent = Some(this)) + } + + /** + * Inserts a sub node as its child + * + * @param child inserted sub node + */ + def insert(child: ClusterTree) { + insert(Array(child)) + } + + /** + * Converts the tree into Array class + * the sub nodes are recursively expanded + * + * @return Seq class which the cluster tree is expanded + */ + def toArray(): Array[ClusterTree] = { + val array = this.children.size match { + case 0 => Array(this) + case _ => Array(this) ++ this.children.flatMap(child => child.toArray().toIterator) + } + array.sortWith { case (a, b) => + a.getDepth() < b.getDepth() && a.variances.toArray.sum < b.variances.toArray.sum + } + } + + /** + * Gets the depth of the cluster in the tree + * + * @return the depth + */ + def getDepth(): Int = { + this.parent match { + case None => 0 + case _ => 1 + this.parent.get.getDepth() + } + } + + /** + * Gets the leaves nodes in the cluster tree + */ + def getLeavesNodes(): Array[ClusterTree] = { + this.toArray().filter(_.isLeaf()).sortBy(_.center.toArray.sum) + } + + def isLeaf(): Boolean = (this.children.size == 0) + + def getParent(): Option[ClusterTree] = this.parent + + def getChildren(): Seq[ClusterTree] = this.children + + /** + * Gets the dendrogram height of the cluster at the cluster tree + * + * @return the dendrogram height + */ + def getHeight(): Double = { + this.children.size match { + case 0 => 0.0 + case _ => this.localHeight + this.children.map(_.getHeight()).max + } + } + + private[mllib] + def setLocalHeight(height: Double) = (this.localHeight = height) +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala new file mode 100644 index 0000000000000..d61a0775f7c6b --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.clustering + +import breeze.linalg.{DenseVector => BDV, Vector => BV, norm => breezeNorm} +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.util.{Loader, Saveable} +import org.apache.spark.rdd.RDD +import org.apache.spark.{Logging, SparkContext} + +/** + * This class is used for the model of the hierarchical clustering + * + * @param tree a cluster as a tree node + */ +class HierarchicalClusteringModel(val tree: ClusterTree) + extends Serializable with Saveable with Logging { + + /** Current version of model save/load format. */ + override protected def formatVersion: String = "1.0" + + override def save(sc: SparkContext, path: String) { + val oos = new java.io.ObjectOutputStream(new java.io.FileOutputStream(path)) + try { + oos.writeObject(this) + } finally { + oos.close() + } + } + + def getClusters(): Array[ClusterTree] = this.tree.getLeavesNodes() + + def getCenters(): Array[Vector] = this.getClusters().map(_.center) + + /** + * Predicts the closest cluster by one point + */ + def predict(vector: Vector): Int = { + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + + val centers = this.getCenters().map(_.toBreeze) + HierarchicalClustering.findClosestCenter(metric)(centers)(vector.toBreeze) + } + + /** + * Predicts the closest cluster by RDD of the points + */ + def predict(data: RDD[Vector]): RDD[Int] = { + val sc = data.sparkContext + + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + sc.broadcast(metric) + val centers = this.getCenters().map(_.toBreeze) + sc.broadcast(centers) + + data.map{point => + HierarchicalClustering.findClosestCenter(metric)(centers)(point.toBreeze) + } + } + + /** + * Predicts the closest cluster by RDD of the points for Java + */ + def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = + predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] +} + + +object HierarchicalClusteringModel extends Loader[HierarchicalClusteringModel] { + + override def load(sc: SparkContext, path: String): HierarchicalClusteringModel = { + val stream = new java.io.ObjectInputStream(new java.io.FileInputStream(path)) + try { + stream.readObject().asInstanceOf[HierarchicalClusteringModel] + } finally { + stream.close() + } + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala new file mode 100644 index 0000000000000..73674184dff77 --- /dev/null +++ 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.clustering + +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.scalatest.{BeforeAndAfterEach, FunSuite} + +class HierarchicalClusteringModelSuite + extends FunSuite with MLlibTestSparkContext with BeforeAndAfterEach { + + test("clustering dense vectors") { + val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) + + val localData = (1 to 100).toSeq.map { i => + val label = i % 5 + val vector = Vectors.dense(label, label, label) + (label, vector) + } + val data = sc.parallelize(localData.map(_._2)) + val model = app.run(data) + + val clusters = model.getClusters() + assert(clusters.isInstanceOf[Array[ClusterTree]]) + assert(clusters.size === 5) + + val centers = model.getCenters().sortBy(_.toArray.sum) + assert(centers.size === 5) + assert(centers(0) === Vectors.dense(0.0, 0.0, 0.0)) + assert(centers(1) === Vectors.dense(1.0, 1.0, 1.0)) + assert(centers(2) === Vectors.dense(2.0, 2.0, 2.0)) + assert(centers(3) === Vectors.dense(3.0, 3.0, 3.0)) + assert(centers(4) === Vectors.dense(4.0, 4.0, 4.0)) + + // predict with one vector + assert(model.predict(Vectors.dense(0.0, 0.0, 0.0)) === 0) + assert(model.predict(Vectors.dense(0.5, 0.5, 0.5)) === 0) + assert(model.predict(Vectors.dense(1.0, 1.0, 1.0)) === 1) + assert(model.predict(Vectors.dense(2.0, 2.0, 2.0)) === 2) + assert(model.predict(Vectors.dense(3.0, 3.0, 3.0)) === 3) + assert(model.predict(Vectors.dense(4.0, 4.0, 4.0)) === 4) + + // predict with a RDD + val predicted = model.predict(data).collect() + assert(predicted === localData.map(_._1)) + } + + test("clustering sparse vectors") { + val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) + + val localData = (1 to 100).toSeq.map { i => + val label = i % 5 + val vector = Vectors.sparse(5, Seq((label, label.toDouble))) + (label, vector) + } + val data = sc.parallelize(localData.map(_._2)) + val model = app.run(data) + + val clusters = model.getClusters() + assert(clusters.isInstanceOf[Array[ClusterTree]]) + assert(clusters.size === 5) + + val centers = model.getCenters().sortBy(_.toArray.sum) + assert(centers.size === 5) + assert(centers(0) === Vectors.sparse(5, Array(), Array())) + assert(centers(1) === Vectors.sparse(5, Array(1), Array(1.0))) + assert(centers(2) === Vectors.sparse(5, Array(2), Array(2.0))) + assert(centers(3) === Vectors.sparse(5, Array(3), Array(3.0))) + assert(centers(4) === Vectors.sparse(5, Array(4), Array(4.0))) + + // predict with one vector + assert(model.predict(Vectors.sparse(5, Array(0), Array(0.0))) === 0) + 
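An end-to-end usage sketch of the API these suites exercise, for example from spark-shell. It is not part of the patch and assumes an existing SparkContext named sc, as the surrounding tests obtain from MLlibTestSparkContext; the expected outputs are only what the tests suggest.

import org.apache.spark.mllib.clustering.HierarchicalClustering
import org.apache.spark.mllib.linalg.Vectors

val points = sc.parallelize((1 to 100).map { i =>
  val label = (i % 5).toDouble
  Vectors.dense(label, label, label)
})
val model = new HierarchicalClustering().setNumClusters(5).setSeed(1).run(points)
println(model.getCenters().length)                    // expected: 5 leaf centers
println(model.predict(Vectors.dense(3.0, 3.0, 3.0)))  // index of the closest center
val assignments = model.predict(points)               // per-point cluster indexes as an RDD[Int]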
assert(model.predict(Vectors.sparse(5, Array(1), Array(1.0))) === 1) + assert(model.predict(Vectors.sparse(5, Array(2), Array(2.0))) === 2) + assert(model.predict(Vectors.sparse(5, Array(3), Array(3.0))) === 3) + assert(model.predict(Vectors.sparse(5, Array(4), Array(4.0))) === 4) + + // predict with a RDD + val predicted = model.predict(data).collect() + assert(predicted === localData.map(_._1)) + } + + test("save a model, and then load the model") { + val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) + + val localData = (1 to 100).toSeq.map { i => + val label = i % 5 + val vector = Vectors.dense(label, label, label) + (label, vector) + } + val data = sc.parallelize(localData.map(_._2)) + val model = app.run(data) + + val tmpFile = java.io.File.createTempFile("hierarchical-clustering", "save-load") + model.save(sc, tmpFile.getAbsolutePath) + + val sameModel = HierarchicalClusteringModel.load(sc, tmpFile.getAbsolutePath) + assert(sameModel.getClass.getSimpleName.toString === "HierarchicalClusteringModel") + localData.foreach { case (label, vector) => + assert(model.predict(vector) === sameModel.predict(vector)) + } + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala new file mode 100644 index 0000000000000..befb0bea5af90 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.clustering + +import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, norm => breezeNorm} +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ +import org.scalatest.FunSuite + + +class HierarchicalClusteringAppSuite extends FunSuite with MLlibTestSparkContext { + + test("train") { + val numClusters = 9 + val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq + val data = sc.parallelize(localSeed, 1) + val model = HierarchicalClustering.train(data, numClusters) + assert(model.getClusters().size === numClusters) + assert(model.tree.getHeight() ~== 67.1751 absTol 10E-4) + } + + test("train with full arguments") { + val numClusters = 9 + val subIterations = 20 + val maxRetries = 20 + val seed = 321 + + val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq + val data = sc.parallelize(localSeed, 1) + + val model = HierarchicalClustering.train(data, numClusters, subIterations, maxRetries, seed) + assert(model.getClusters().size === numClusters) + assert(model.tree.getHeight() ~== 67.1751 absTol 10E-4) + } +} + + +class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { + + test("run") { + val algo = new HierarchicalClustering().setNumClusters(123) + val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq + val data = sc.parallelize(localSeed, 2) + val model = algo.run(data) + assert(model.getClusters().size == 123) + assert(model.tree.getHeight() ~== 702.8641 absTol 10E-4) + + // check the relations between a parent cluster and its children + assert(model.tree.getParent() === None) + assert(model.tree.getChildren().apply(0).getParent().get === model.tree) + assert(model.tree.getChildren().apply(1).getParent().get === model.tree) + assert(model.getClusters().forall(_.getParent() != None)) + } + + test("run with too many cluster size than the records") { + val algo = new HierarchicalClustering().setNumClusters(123) + val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq + val data = sc.parallelize(localSeed, 2) + val model = algo.run(data) + assert(model.getClusters().size == 100) + assert(model.tree.getHeight() ~== 72.12489 absTol 10E-4) + } + + test("initializeData") { + val algo = new HierarchicalClustering + val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq + val seed = sc.parallelize(localSeed) + val data = algo.initData(seed) + assert(data.map(_._1).collect().distinct === Array(1)) + } + + test("get center stats") { + val algo = new HierarchicalClustering + val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq + val seed = sc.parallelize(localSeed) + val data = algo.initData(seed) + + val clusters = algo.summarizeAsClusters(data) + val center = clusters(1).center + assert(clusters.size === 1) + assert(clusters(1).center === Vectors.dense(49.5, 49.5)) + assert(clusters(1).records === 100) + + val data2 = seed.map(v => ((v.apply(0) / 25).toLong + 1L, v.toBreeze)) + val clusters2 = algo.summarizeAsClusters(data2) + assert(clusters2.size === 4) + assert(clusters2(1).center === Vectors.dense(12.0, 12.0)) + assert(clusters2(1).records === 25) + assert(clusters2(2).center === Vectors.dense(37.0, 37.0)) + assert(clusters2(2).records === 25) + assert(clusters2(3).center === 
Vectors.dense(62.0, 62.0)) + assert(clusters2(3).records === 25) + assert(clusters2(4).center === Vectors.dense(87.0, 87.0)) + assert(clusters2(4).records === 25) + } + + test("getChildrenCenter") { + val algo = new HierarchicalClustering + val centers = Map( + 2L -> Vectors.dense(1.0, 1.0).toBreeze, + 3L -> Vectors.dense(2.0, 2.0).toBreeze + ) + val initNextCenters = algo.initChildrenCenter(centers) + assert(initNextCenters.size === 4) + assert(initNextCenters.keySet === Set(4, 5, 6, 7)) + } + + test("should divide clusters") { + val algo = new HierarchicalClustering + val seed = (0 to 99).map(i => ((i / 50) + 2L, Vectors.dense(i, i).toBreeze)) + val data = sc.parallelize(seed) + val clusters = algo.summarizeAsClusters(data) + val newClusters = algo.getDividedClusters(data, clusters) + + assert(newClusters.size === 4) + assert(newClusters(4).center === Vectors.dense(12.0, 12.0)) + assert(newClusters(4).records === 25) + assert(newClusters(5).center === Vectors.dense(37.0, 37.0)) + assert(newClusters(5).records === 25) + assert(newClusters(6).center === Vectors.dense(62.0, 62.0)) + assert(newClusters(6).records === 25) + assert(newClusters(7).center === Vectors.dense(87.0, 87.0)) + assert(newClusters(7).records === 25) + } + + test("should assign each data to new clusters") { + val algo = new HierarchicalClustering + val seed = Seq( + (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), (2L, Vectors.dense(2.0, 2.0)), + (2L, Vectors.dense(3.0, 3.0)), (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 5.0)), + (3L, Vectors.dense(6.0, 6.0)), (3L, Vectors.dense(7.0, 7.0)), (3L, Vectors.dense(8.0, 8.0)), + (3L, Vectors.dense(9.0, 9.0)), (3L, Vectors.dense(10.0, 10.0)), (3L, Vectors.dense(11.0, 11.0)) + ).map { case (idx, vector) => (idx, vector.toBreeze)} + val newClusters = Map( + 4L -> new ClusterTree(Vectors.dense(1.0, 1.0), 3, Vectors.dense(1.0, 1.0)), + 5L -> new ClusterTree(Vectors.dense(4.0, 4.0), 3, Vectors.dense(1.0, 1.0)), + 6L -> new ClusterTree(Vectors.dense(7.0, 7.0), 3, Vectors.dense(1.0, 1.0)), + 7L -> new ClusterTree(Vectors.dense(10.0, 10.0), 3, Vectors.dense(1.0, 1.0)) + ) + val data = sc.parallelize(seed) + val result = algo.updateClusterIndex(data, newClusters).collect().toSeq + + val expected = Seq( + (4, Vectors.dense(0.0, 0.0)), (4, Vectors.dense(1.0, 1.0)), (4, Vectors.dense(2.0, 2.0)), + (5, Vectors.dense(3.0, 3.0)), (5, Vectors.dense(4.0, 4.0)), (5, Vectors.dense(5.0, 5.0)), + (6, Vectors.dense(6.0, 6.0)), (6, Vectors.dense(7.0, 7.0)), (6, Vectors.dense(8.0, 8.0)), + (7, Vectors.dense(9.0, 9.0)), (7, Vectors.dense(10.0, 10.0)), (7, Vectors.dense(11.0, 11.0)) + ).map { case (idx, vector) => (idx, vector.toBreeze)} + assert(result === expected) + } + + test("setSubIterations") { + val algo = new HierarchicalClustering() + assert(algo.getSubIterations() == 20) + algo.setMaxIterations(15) + assert(algo.getSubIterations() == 15) + } + + test("setNumRetries") { + val algo = new HierarchicalClustering() + assert(algo.getMaxRetries() == 10) + algo.setMaxRetries(15) + assert(algo.getMaxRetries() == 15) + } + + test("setSeed") { + val algo = new HierarchicalClustering() + assert(algo.getSeed() == 1) + algo.setSeed(987) + assert(algo.getSeed() == 987) + } +} From c51017cccf8d62ac4ffdb4d67f0f83a7526da659 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 30 Mar 2015 23:13:39 +0900 Subject: [PATCH 02/76] Fix the some comments --- .../spark/mllib/clustering/HierarchicalClustering.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index dd6cf6d0f8b94..30c7804e3d151 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -352,7 +352,7 @@ class HierarchicalClustering( * @param treeMap divided clusters as a Map class * @param rootIndex index you want to start * @param numClusters the number of clusters you want - * @return + * @return a built cluster tree */ private[clustering] def buildTree(treeMap: Map[Long, ClusterTree], @@ -513,9 +513,10 @@ class HierarchicalClustering( * @param center the center of the cluster * @param records the number of rows in the cluster * @param variances variance vectors + * @param variancesNorm the norm of variance vector + * @param localHeight the maximal distance between this node and its children * @param parent the parent cluster of the cluster * @param children the children nodes of the cluster - * @param variancesNorm the sum of squares of variances */ class ClusterTree( val center: Vector, @@ -555,7 +556,7 @@ class ClusterTree( * Converts the tree into Array class * the sub nodes are recursively expanded * - * @return Seq class which the cluster tree is expanded + * @return an Array class which the cluster tree is expanded */ def toArray(): Array[ClusterTree] = { val array = this.children.size match { @@ -570,7 +571,7 @@ class ClusterTree( /** * Gets the depth of the cluster in the tree * - * @return the depth + * @return the depth from the root */ def getDepth(): Int = { this.parent match { From a8cd7abcef0688cee9e6af8f6e2416fe7c0e266c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 2 Apr 2015 13:10:38 +0900 Subject: [PATCH 03/76] Remove parentheses for getters and add a test for HierarchicalClustering.setNumClusters --- .../clustering/HierarchicalClustering.scala | 8 +++++--- .../HierarchicalClusteringSuite.scala | 19 +++++++++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 30c7804e3d151..566475dce2345 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -118,6 +118,8 @@ class HierarchicalClustering( this } + def getNumClusters: Int = this.numClusters + /** * Sets the number of maximal iterations in each clustering step */ @@ -126,7 +128,7 @@ class HierarchicalClustering( this } - def getSubIterations(): Int = this.maxIterations + def getSubIterations: Int = this.maxIterations /** * Sets the number of maximum retries of each clustering step @@ -136,7 +138,7 @@ class HierarchicalClustering( this } - def getMaxRetries(): Int = this.maxRetries + def getMaxRetries: Int = this.maxRetries /** * Sets the random seed @@ -146,7 +148,7 @@ class HierarchicalClustering( this } - def getSeed(): Long = this.seed + def getSeed: Long = this.seed /** * Runs the hierarchical clustering algorithm diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index befb0bea5af90..57af03a4ae305 100644 --- 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -165,24 +165,31 @@ class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { assert(result === expected) } + test("setNumClusters") { + val algo = new HierarchicalClustering() + assert(algo.getNumClusters == 20) + algo.setNumClusters(1000) + assert(algo.getNumClusters == 1000) + } + test("setSubIterations") { val algo = new HierarchicalClustering() - assert(algo.getSubIterations() == 20) + assert(algo.getSubIterations == 20) algo.setMaxIterations(15) - assert(algo.getSubIterations() == 15) + assert(algo.getSubIterations == 15) } test("setNumRetries") { val algo = new HierarchicalClustering() - assert(algo.getMaxRetries() == 10) + assert(algo.getMaxRetries == 10) algo.setMaxRetries(15) - assert(algo.getMaxRetries() == 15) + assert(algo.getMaxRetries == 15) } test("setSeed") { val algo = new HierarchicalClustering() - assert(algo.getSeed() == 1) + assert(algo.getSeed == 1) algo.setSeed(987) - assert(algo.getSeed() == 987) + assert(algo.getSeed == 987) } } From 306f6037c634745f3b66d0048685156fb2725b8a Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 8 Apr 2015 11:26:57 +0900 Subject: [PATCH 04/76] Remove the static train() method from HierarchicalClustering object --- .../clustering/HierarchicalClustering.scala | 38 ------------------- .../HierarchicalClusteringSuite.scala | 23 ----------- 2 files changed, 61 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 566475dce2345..1ce993e8be14c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -26,48 +26,10 @@ import org.apache.spark.{Logging, SparkException} import scala.collection.{Map, mutable} -/** - * Top-level methods for calling the hierarchical clustering algorithm - */ object HierarchicalClustering extends Logging { private[clustering] val ROOT_INDEX_KEY: Long = 1 - /** - * Trains a hierarchical clustering model with the given data - * - * @param data trained data - * @param numClusters the maximum number of clusters you want - * @return a hierarchical clustering model - */ - def train(data: RDD[Vector], numClusters: Int): HierarchicalClusteringModel = { - val algo = new HierarchicalClustering().setNumClusters(numClusters) - algo.run(data) - } - - /** - * Trains a hierarchical clustering model with the given data - * - * @param data training data - * @param numClusters the maximum number of clusters you want - * @param maxIterations the number of maximal iterations - * @param maxRetries the number of maximum retries when the clustering can't be succeeded - * @param seed the randomseed to generate the initial vectors for each bisecting - * @return a hierarchical clustering model - */ - def train(data: RDD[Vector], - numClusters: Int, - maxIterations: Int, - maxRetries: Int, - seed: Int): HierarchicalClusteringModel = { - - val algo = new HierarchicalClustering().setNumClusters(numClusters) - .setMaxIterations(maxIterations) - .setMaxRetries(maxRetries) - .setSeed(seed) - algo.run(data) - } - /** * Finds the closes cluster's center * diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 57af03a4ae305..e0c7bd13b3bfb 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -25,29 +25,6 @@ import org.scalatest.FunSuite class HierarchicalClusteringAppSuite extends FunSuite with MLlibTestSparkContext { - - test("train") { - val numClusters = 9 - val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val data = sc.parallelize(localSeed, 1) - val model = HierarchicalClustering.train(data, numClusters) - assert(model.getClusters().size === numClusters) - assert(model.tree.getHeight() ~== 67.1751 absTol 10E-4) - } - - test("train with full arguments") { - val numClusters = 9 - val subIterations = 20 - val maxRetries = 20 - val seed = 321 - - val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val data = sc.parallelize(localSeed, 1) - - val model = HierarchicalClustering.train(data, numClusters, subIterations, maxRetries, seed) - assert(model.getClusters().size === numClusters) - assert(model.tree.getHeight() ~== 67.1751 absTol 10E-4) - } } From b2d0369947bce56f4bbce2b87a0da00a81d5545c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 8 Apr 2015 12:12:52 +0900 Subject: [PATCH 05/76] Add a test for HierarchicalClustering.findClosestCenter() --- .../HierarchicalClusteringSuite.scala | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index e0c7bd13b3bfb..afe2d0652bf9b 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -25,6 +25,25 @@ import org.scalatest.FunSuite class HierarchicalClusteringAppSuite extends FunSuite with MLlibTestSparkContext { + + test("the root index is equal to 1") { + assert(HierarchicalClustering.ROOT_INDEX_KEY === 1) + } + + test("findClosestCenter") { + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val centers = Seq( + Vectors.sparse(5, Array(0, 1, 2), Array(0.0, 1.0, 2.0)).toBreeze, + Vectors.sparse(5, Array(1, 2, 3), Array(1.0, 2.0, 3.0)).toBreeze, + Vectors.sparse(5, Array(2, 3, 4), Array(2.0, 3.0, 4.0)).toBreeze + ) + + for (i <- 0 to (centers.size - 1)) { + val point = centers(i) + val closestIndex = HierarchicalClustering.findClosestCenter(metric)(centers)(point) + assert(closestIndex === i) + } + } } From 0ddfcfb2d02c286a01227902c5a5ea859bf6a981 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 7 Apr 2015 14:11:40 +0900 Subject: [PATCH 06/76] Add a function to compute Within Set Sum of Squared Error into Scala/Java/Python --- .../HierarchicalClusteringModel.scala | 19 +++++++++++++++++++ .../HierarchicalClusteringModelSuite.scala | 6 ++++++ 2 files changed, 25 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index d61a0775f7c6b..18040de3afcea 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -81,6 +81,25 @@ class HierarchicalClusteringModel(val tree: ClusterTree) */ def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] + + /** + * Computes Within Set Sum of Squeared Error(WSSSE) + */ + def WSSSE(data: RDD[Vector]): Double = { + val bvCenters = this.getCenters().map(_.toBreeze) + data.context.broadcast(bvCenters) + val distances = data.map {point => + val bvPoint = point.toBreeze + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val idx = HierarchicalClustering.findClosestCenter(metric)(bvCenters)(bvPoint) + val closestCenter = bvCenters(idx) + val distance = metric(bvPoint, closestCenter) + distance + } + distances.sum() + } + + def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index 73674184dff77..31b7a5255cd14 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -58,6 +58,9 @@ class HierarchicalClusteringModelSuite // predict with a RDD val predicted = model.predict(data).collect() assert(predicted === localData.map(_._1)) + + // compute WSSSE + assert(model.WSSSE(data) === 0.0) } test("clustering sparse vectors") { @@ -93,6 +96,9 @@ class HierarchicalClusteringModelSuite // predict with a RDD val predicted = model.predict(data).collect() assert(predicted === localData.map(_._1)) + + // compute WSSSE + assert(model.WSSSE(data) === 0.0) } test("save a model, and then load the model") { From ecb3fd703573ea9c75df1c38f96124fa7f3c003f Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 2 Apr 2015 14:24:58 +0900 Subject: [PATCH 07/76] Change the visibility of constructer parameters of HierarchicalClustering and ClusterTree from public to private --- .../spark/mllib/clustering/HierarchicalClustering.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 1ce993e8be14c..6f862440ae3c9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -60,7 +60,7 @@ object HierarchicalClustering extends Logging { * @param maxRetries the number of maximum retries * @param seed a random seed */ -class HierarchicalClustering( +class HierarchicalClustering private ( private var numClusters: Int, private var clusterMap: Map[Long, ClusterTree], private var maxIterations: Int, @@ -482,7 +482,7 @@ class HierarchicalClustering( * @param parent the parent cluster of the cluster * @param children the children nodes of the cluster */ -class ClusterTree( +class ClusterTree private ( val center: Vector, val records: Long, val variances: Vector, From 08f01013b27500c5266f92a8be3300684a40cc9c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 27 Apr 2015 15:45:41 +0900 Subject: [PATCH 08/76] Rename getSubIterations to getMaxIterations --- .../spark/mllib/clustering/HierarchicalClustering.scala | 2 +- 
.../spark/mllib/clustering/HierarchicalClusteringSuite.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 6f862440ae3c9..dc295714ad729 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -90,7 +90,7 @@ class HierarchicalClustering private ( this } - def getSubIterations: Int = this.maxIterations + def getMaxIterations: Int = this.maxIterations /** * Sets the number of maximum retries of each clustering step diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index afe2d0652bf9b..4c3630ad2025d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -170,9 +170,9 @@ class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { test("setSubIterations") { val algo = new HierarchicalClustering() - assert(algo.getSubIterations == 20) + assert(algo.getMaxIterations == 20) algo.setMaxIterations(15) - assert(algo.getSubIterations == 15) + assert(algo.getMaxIterations == 15) } test("setNumRetries") { From 2a14900e92d8981d6313e47773499d3ad622569a Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 27 Apr 2015 16:23:52 +0900 Subject: [PATCH 09/76] Modify how to broadcast variables --- .../clustering/HierarchicalClustering.scala | 26 ++++++++++--------- .../HierarchicalClusteringModelSuite.scala | 23 ++++++++++++++++ 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index dc295714ad729..77804c3bc68ae 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -290,8 +290,8 @@ class HierarchicalClustering private ( s"at ${tryTimes} times in ${appName}") // divide the failed clusters again - sc.broadcast(failedIndexes) - dividableData = data.filter { case (idx, point) => failedIndexes.contains(idx)} + val bcFailedIndexes = sc.broadcast(failedIndexes) + dividableData = data.filter { case (idx, point) => bcFailedIndexes.value.contains(idx)} val missingStats = divide(dividableData, failedCenters) stats = stats ++ missingStats tryTimes += 1 @@ -381,11 +381,11 @@ class HierarchicalClustering private ( if (newCenters.size == 0) { return Map.empty[Long, (BV[Double], Double, BV[Double])] } - sc.broadcast(newCenters) + var bcNewCenters = sc.broadcast(newCenters) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - sc.broadcast(metric) + val bcMetric = sc.broadcast(metric) val vectorSize = newCenters(newCenters.keySet.min).size var stats = newCenters.keys.map { idx => @@ -402,11 +402,11 @@ class HierarchicalClustering private ( val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] iter.foreach { case (idx, point) => // calculate next index number - val childrenCenters 
= Array(2 * idx, 2 * idx + 1).filter(newCenters.keySet.contains(_)) - .map(newCenters(_)).toArray + val childrenCenters = Array(2 * idx, 2 * idx + 1) + .filter(bcNewCenters.value.keySet.contains(_)).map(bcNewCenters.value(_)).toArray if (childrenCenters.size >= 1) { val closestIndex = - HierarchicalClustering.findClosestCenter(metric)(childrenCenters)(point) + HierarchicalClustering.findClosestCenter(bcMetric.value)(childrenCenters)(point) val nextIndex = 2 * idx + closestIndex // get a map value or else get a sparse vector @@ -423,6 +423,7 @@ class HierarchicalClustering private ( // calculate the center of each cluster newCenters = eachStats.map { case (idx, (sum, n, sumOfSquares)) => (idx, sum :/ n)} + bcNewCenters = sc.broadcast(newCenters) // update summary of each cluster stats = eachStats.toMap @@ -447,22 +448,23 @@ class HierarchicalClustering private ( // extract the centers of the clusters val sc = data.sparkContext var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} - sc.broadcast(centers) + val bcCenters = sc.broadcast(centers) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - sc.broadcast(metric) + val bcMetric = sc.broadcast(metric) // update the indexes to their children indexes data.map { case (idx, point) => - val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(centers.keySet.contains(_)) + val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(bcCenters.value.keySet.contains(_)) childrenIndexes.size match { // stay the index if the number of children is not enough case s if s < 2 => (idx, point) // update the indexes case _ => { - val nextCenters = childrenIndexes.map(centers(_)).map(_.toBreeze) - val closestIndex = HierarchicalClustering.findClosestCenter(metric)(nextCenters)(point) + val nextCenters = childrenIndexes.map(bcCenters.value(_)).map(_.toBreeze) + val closestIndex = HierarchicalClustering + .findClosestCenter(bcMetric.value)(nextCenters)(point) val nextIndex = 2 * idx + closestIndex (nextIndex, point) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index 31b7a5255cd14..d747cec7fd558 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -101,6 +101,29 @@ class HierarchicalClusteringModelSuite assert(model.WSSSE(data) === 0.0) } + test("clustering should be done correctly") { + for (numClusters <- Array(9, 99, 999)) { + val app = new HierarchicalClustering().setNumClusters(numClusters).setSeed(1) + val localData = (1 to 1000).toSeq.map { i => + val label = i % numClusters + val sparseVector = Vectors.sparse(numClusters, Seq((label, label.toDouble))) + val denseVector = Vectors.fromBreeze(sparseVector.toBreeze.toDenseVector) + (label, denseVector, sparseVector) + } + // dense version + val denseData = sc.parallelize(localData.map(_._2), 2) + val denseModel = app.run(denseData) + assert(denseModel.getCenters().size === numClusters) + assert(denseModel.getClusters().forall(_.variancesNorm == 0.0)) + + // sparse version + val sparseData = sc.parallelize(localData.map(_._3), 2) + val sparseModel = app.run(sparseData) + assert(sparseModel.getCenters().size === numClusters) + assert(sparseModel.getClusters().forall(_.variancesNorm 
== 0.0)) + } + } + test("save a model, and then load the model") { val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) From 38f07bd779fb0e19219aa4a413ca2c9ac1044928 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 27 Apr 2015 16:34:27 +0900 Subject: [PATCH 10/76] Remove unnecessary parentheses --- .../clustering/HierarchicalClustering.scala | 22 +++++++++---------- .../HierarchicalClusteringModel.scala | 10 ++++----- .../HierarchicalClusteringModelSuite.scala | 16 +++++++------- .../HierarchicalClusteringSuite.scala | 16 +++++++------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 77804c3bc68ae..6ca3dff0e5f07 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -182,7 +182,7 @@ class HierarchicalClustering private ( // make a hierarchical clustering model val model = new HierarchicalClusteringModel(root.get) - val leavesNodes = model.getClusters() + val leavesNodes = model.getClusters if (leavesNodes.size < this.numClusters) { log.warn(s"# clusters is less than you have expected: ${leavesNodes.size} / ${numClusters}. ") } @@ -530,7 +530,7 @@ class ClusterTree private ( case _ => Array(this) ++ this.children.flatMap(child => child.toArray().toIterator) } array.sortWith { case (a, b) => - a.getDepth() < b.getDepth() && a.variances.toArray.sum < b.variances.toArray.sum + a.getDepth < b.getDepth && a.variances.toArray.sum < b.variances.toArray.sum } } @@ -539,35 +539,35 @@ class ClusterTree private ( * * @return the depth from the root */ - def getDepth(): Int = { + def getDepth: Int = { this.parent match { case None => 0 - case _ => 1 + this.parent.get.getDepth() + case _ => 1 + this.parent.get.getDepth } } /** * Gets the leaves nodes in the cluster tree */ - def getLeavesNodes(): Array[ClusterTree] = { - this.toArray().filter(_.isLeaf()).sortBy(_.center.toArray.sum) + def getLeavesNodes: Array[ClusterTree] = { + this.toArray().filter(_.isLeaf).sortBy(_.center.toArray.sum) } - def isLeaf(): Boolean = (this.children.size == 0) + def isLeaf: Boolean = (this.children.size == 0) - def getParent(): Option[ClusterTree] = this.parent + def getParent: Option[ClusterTree] = this.parent - def getChildren(): Seq[ClusterTree] = this.children + def getChildren: Seq[ClusterTree] = this.children /** * Gets the dendrogram height of the cluster at the cluster tree * * @return the dendrogram height */ - def getHeight(): Double = { + def getHeight: Double = { this.children.size match { case 0 => 0.0 - case _ => this.localHeight + this.children.map(_.getHeight()).max + case _ => this.localHeight + this.children.map(_.getHeight).max } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index 18040de3afcea..6ec44325a0f8d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -44,9 +44,9 @@ class HierarchicalClusteringModel(val tree: ClusterTree) } } - def getClusters(): Array[ClusterTree] = this.tree.getLeavesNodes() + def getClusters: Array[ClusterTree] = this.tree.getLeavesNodes - def 
getCenters(): Array[Vector] = this.getClusters().map(_.center) + def getCenters: Array[Vector] = this.getClusters.map(_.center) /** * Predicts the closest cluster by one point @@ -55,7 +55,7 @@ class HierarchicalClusteringModel(val tree: ClusterTree) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val centers = this.getCenters().map(_.toBreeze) + val centers = this.getCenters.map(_.toBreeze) HierarchicalClustering.findClosestCenter(metric)(centers)(vector.toBreeze) } @@ -68,7 +68,7 @@ class HierarchicalClusteringModel(val tree: ClusterTree) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) sc.broadcast(metric) - val centers = this.getCenters().map(_.toBreeze) + val centers = this.getCenters.map(_.toBreeze) sc.broadcast(centers) data.map{point => @@ -86,7 +86,7 @@ class HierarchicalClusteringModel(val tree: ClusterTree) * Computes Within Set Sum of Squeared Error(WSSSE) */ def WSSSE(data: RDD[Vector]): Double = { - val bvCenters = this.getCenters().map(_.toBreeze) + val bvCenters = this.getCenters.map(_.toBreeze) data.context.broadcast(bvCenters) val distances = data.map {point => val bvPoint = point.toBreeze diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index d747cec7fd558..805269d8e2f61 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -35,11 +35,11 @@ class HierarchicalClusteringModelSuite val data = sc.parallelize(localData.map(_._2)) val model = app.run(data) - val clusters = model.getClusters() + val clusters = model.getClusters assert(clusters.isInstanceOf[Array[ClusterTree]]) assert(clusters.size === 5) - val centers = model.getCenters().sortBy(_.toArray.sum) + val centers = model.getCenters.sortBy(_.toArray.sum) assert(centers.size === 5) assert(centers(0) === Vectors.dense(0.0, 0.0, 0.0)) assert(centers(1) === Vectors.dense(1.0, 1.0, 1.0)) @@ -74,11 +74,11 @@ class HierarchicalClusteringModelSuite val data = sc.parallelize(localData.map(_._2)) val model = app.run(data) - val clusters = model.getClusters() + val clusters = model.getClusters assert(clusters.isInstanceOf[Array[ClusterTree]]) assert(clusters.size === 5) - val centers = model.getCenters().sortBy(_.toArray.sum) + val centers = model.getCenters.sortBy(_.toArray.sum) assert(centers.size === 5) assert(centers(0) === Vectors.sparse(5, Array(), Array())) assert(centers(1) === Vectors.sparse(5, Array(1), Array(1.0))) @@ -113,14 +113,14 @@ class HierarchicalClusteringModelSuite // dense version val denseData = sc.parallelize(localData.map(_._2), 2) val denseModel = app.run(denseData) - assert(denseModel.getCenters().size === numClusters) - assert(denseModel.getClusters().forall(_.variancesNorm == 0.0)) + assert(denseModel.getCenters.size === numClusters) + assert(denseModel.getClusters.forall(_.variancesNorm == 0.0)) // sparse version val sparseData = sc.parallelize(localData.map(_._3), 2) val sparseModel = app.run(sparseData) - assert(sparseModel.getCenters().size === numClusters) - assert(sparseModel.getClusters().forall(_.variancesNorm == 0.0)) + assert(sparseModel.getCenters.size === numClusters) + 
assert(sparseModel.getClusters.forall(_.variancesNorm == 0.0)) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 4c3630ad2025d..9f0b18e6dfa58 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -54,14 +54,14 @@ class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) - assert(model.getClusters().size == 123) - assert(model.tree.getHeight() ~== 702.8641 absTol 10E-4) + assert(model.getClusters.size == 123) + assert(model.tree.getHeight ~== 702.8641 absTol 10E-4) // check the relations between a parent cluster and its children - assert(model.tree.getParent() === None) - assert(model.tree.getChildren().apply(0).getParent().get === model.tree) - assert(model.tree.getChildren().apply(1).getParent().get === model.tree) - assert(model.getClusters().forall(_.getParent() != None)) + assert(model.tree.getParent === None) + assert(model.tree.getChildren.apply(0).getParent.get === model.tree) + assert(model.tree.getChildren.apply(1).getParent.get === model.tree) + assert(model.getClusters.forall(_.getParent != None)) } test("run with too many cluster size than the records") { @@ -69,8 +69,8 @@ class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) - assert(model.getClusters().size == 100) - assert(model.tree.getHeight() ~== 72.12489 absTol 10E-4) + assert(model.getClusters.size == 100) + assert(model.tree.getHeight ~== 72.12489 absTol 10E-4) } test("initializeData") { From 99e703b1a2a4c3f3e7827ffcfd61dde22ae9e7cf Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 28 Apr 2015 15:42:24 +0900 Subject: [PATCH 11/76] Add toLinkageMatrix() and toAdjacencyList() --- .../clustering/HierarchicalClustering.scala | 52 +++++++++++++++++++ .../HierarchicalClusteringModel.scala | 31 +++++++++++ .../HierarchicalClusteringModelSuite.scala | 44 ++++++++++++++++ 3 files changed, 127 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 6ca3dff0e5f07..dc06f1891e64b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -571,6 +571,58 @@ class ClusterTree private ( } } + /** + * Converts to a adjacency list + * + * @return List[(fromNodeId, toNodeId, distance)] + */ + def toAdjacencyList(): Array[(Int, Int, Double)] = { + val nodes = toArray() + + var adjacencyList = Array.empty[(Int, Int, Double)] + nodes.foreach { parent => + if (parent.children.size > 1) { + val parentIndex = nodes.indexOf(parent) + parent.children.foreach { child => + val childIndex = nodes.indexOf(child) + adjacencyList = adjacencyList :+(parentIndex, childIndex, parent.localHeight) + } + } + } + adjacencyList + } + + /** + * Converts to a linkage matrix + * Returned data format is fit for scipy's dendrogram 
function + * SEE ALSO: scipy.cluster.hierarchy.dendrogram + * + * @return List[(node1, node2, distance, tree size)] + */ + def toLinkageMatrix(): Array[(Int, Int, Double, Int)] = { + val nodes = toArray().sortWith { case (a, b) => a.getHeight < b.getHeight} + val leaves = nodes.filter(_.isLeaf) + val notLeaves = nodes.filterNot(_.isLeaf).filter(_.getChildren.size > 1) + val clusters = leaves ++ notLeaves + val treeMap = clusters.zipWithIndex.map { case (tree, idx) => (tree -> idx)}.toMap + + // If a node only has one-child, the child is regarded as the cluster of the child. + // Cluster A has cluster B and Cluster B. B is a leaf. C only has cluster D. + // ==> A merge list is (B, D), not (B, C). + def getIndex(map: Map[ClusterTree, Int], tree: ClusterTree): Int = { + tree.children.size match { + case 1 => getIndex(map, tree.children(0)) + case _ => map(tree) + } + } + clusters.filterNot(_.isLeaf).map { tree => + (getIndex(treeMap, tree.children(0)), + getIndex(treeMap, tree.children(1)), + tree.getHeight, + tree.toArray().filter(_.isLeaf).size) + } + } + private[mllib] def setLocalHeight(height: Double) = (this.localHeight = height) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index 6ec44325a0f8d..9dcf84b5df381 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -100,6 +100,37 @@ class HierarchicalClusteringModel(val tree: ClusterTree) } def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) + + def toAdjacencyList(): Array[(Int, Int, Double)] = this.tree.toAdjacencyList() + + /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ + def toJavaAdjacencyList(): java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { + var javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]](); + this.tree.toAdjacencyList().foreach { x => + val edge = new java.util.ArrayList[java.lang.Double]() + edge.add(x._1) + edge.add(x._2) + edge.add(x._3) + javaList.add(edge) + } + javaList + } + + def toLinkageMatrix(): Array[(Int, Int, Double, Int)] = this.tree.toLinkageMatrix() + + /** Since Java doesn't support tuple, we must support the data structure for java and py4j. 
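For example, an adjacency-list entry such as the tuple (0, 1, 2.5981) is exposed to Java and Python callers as an ArrayList of java.lang.Double values.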
*/ + def toJavaLinkageMatrix(): java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { + val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() + this.tree.toLinkageMatrix().foreach {x => + val row = new java.util.ArrayList[java.lang.Double]() + row.add(x._1) + row.add(x._2) + row.add(x._3) + row.add(x._4) + javaList.add(row) + } + javaList + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index 805269d8e2f61..d374ec956562a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -61,6 +61,28 @@ class HierarchicalClusteringModelSuite // compute WSSSE assert(model.WSSSE(data) === 0.0) + + // adjacency list + val adjacencyList = model.toAdjacencyList() + .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) + assert(adjacencyList.size === 8) + assert(adjacencyList(0) === (0, 1, 2.5981)) + assert(adjacencyList(1) === (0, 6, 2.5981)) + assert(adjacencyList(2) === (1, 2, 1.7321)) + assert(adjacencyList(3) === (1, 5, 1.7321)) + assert(adjacencyList(4) === (2, 3, 0.866)) + assert(adjacencyList(5) === (2, 4, 0.866)) + assert(adjacencyList(6) === (6, 7, 0.866)) + assert(adjacencyList(7) === (6, 8, 0.866)) + + // linkage matrix + val linkageMatrix = model.toLinkageMatrix() + .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3, x._4)) + assert(linkageMatrix.size === 4) + assert(linkageMatrix(0) === (0, 1, 0.866, 2)) + assert(linkageMatrix(1) === (3, 4, 0.866, 2)) + assert(linkageMatrix(2) === (5, 2, 2.5981, 3)) + assert(linkageMatrix(3) === (7, 6, 5.1962, 5)) } test("clustering sparse vectors") { @@ -99,6 +121,28 @@ class HierarchicalClusteringModelSuite // compute WSSSE assert(model.WSSSE(data) === 0.0) + + // adjacency list + val adjacencyList = model.toAdjacencyList() + .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) + assert(adjacencyList.size === 8) + assert(adjacencyList(0) === (0, 1, 1.5652)) + assert(adjacencyList(1) === (0, 6, 1.5652)) + assert(adjacencyList(2) === (1, 2, 1.3744)) + assert(adjacencyList(3) === (1, 5, 1.3744)) + assert(adjacencyList(4) === (2, 3, 0.5)) + assert(adjacencyList(5) === (2, 4, 0.5)) + assert(adjacencyList(6) === (6, 7, 2.5)) + assert(adjacencyList(7) === (6, 8, 2.5)) + + // linkage matrix + val linkageMatrix = model.toLinkageMatrix() + .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3, x._4)) + assert(linkageMatrix.size === 4) + assert(linkageMatrix(0) === (0, 1, 0.5, 2)) + assert(linkageMatrix(1) === (5, 2, 1.8744, 3)) + assert(linkageMatrix(2) === (3, 4, 2.5, 2)) + assert(linkageMatrix(3) === (6, 7, 4.0652, 5)) } test("clustering should be done correctly") { From 344d14eaaf315bc4fd2d9fa951ee0a2026a01d1d Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 20 May 2015 16:15:05 +0900 Subject: [PATCH 12/76] Add a java test file for HierarchicalClustering --- .../JavaHierarchicalClusteringSuite.java | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java new file mode 100644 index 
0000000000000..84ae01d6dde0a --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java @@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.clustering;
+
+import com.google.common.collect.Lists;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+public class JavaHierarchicalClusteringSuite implements Serializable {
+ private transient JavaSparkContext sc;
+
+ @Before
+ public void setUp() {
+ sc = new JavaSparkContext("local", "JavaHierarchicalClustering");
+ }
+
+ @After
+ public void tearDown() {
+ sc.stop();
+ sc = null;
+ }
+
+ @Test
+ public void runWithSmallData() {
+ List<Vector> points = Lists.newArrayList(
+ Vectors.dense(1.0, 2.0, 6.0),
+ Vectors.dense(1.0, 3.0, 0.0),
+ Vectors.dense(1.0, 4.0, 6.0)
+ );
+
+ Vector expectedCenter = Vectors.dense(1.0, 3.0, 4.0);
+
+ JavaRDD<Vector> data = sc.parallelize(points, 2);
+ HierarchicalClustering algo = new HierarchicalClustering().setNumClusters(1);
+ HierarchicalClusteringModel model = algo.run(data.rdd());
+ assertEquals(1, model.getCenters().length);
+ assertEquals(expectedCenter, model.getCenters()[0]);
+ }
+
+ @Test
+ public void runWithDenseVectors() {
+ int numClusters = 5;
+ List<Vector> points = Lists.newArrayList();
+ for (int i = 0; i < 99; i++) {
+ Double elm = new Double(i % numClusters);
+ Vector point = Vectors.dense(elm, elm);
+ points.add(point);
+ }
+ JavaRDD<Vector> data = sc.parallelize(points, 2);
+ HierarchicalClustering algo = new HierarchicalClustering().setNumClusters(numClusters);
+ HierarchicalClusteringModel model = algo.run(data.rdd());
+ Vector[] centers = model.getCenters();
+ assertEquals(numClusters, centers.length);
+ assertEquals(Vectors.dense(0.0, 0.0), centers[0]);
+ assertEquals(Vectors.dense(1.0, 1.0), centers[1]);
+ assertEquals(Vectors.dense(2.0, 2.0), centers[2]);
+ assertEquals(Vectors.dense(3.0, 3.0), centers[3]);
+ assertEquals(Vectors.dense(4.0, 4.0), centers[4]);
+
+ // adjacency list
+ ArrayList<ArrayList<Double>> edges = model.toJavaAdjacencyList();
+ assertEquals(8, edges.size());
+ // linkage matrix
+ ArrayList<ArrayList<Double>> matrix = model.toJavaLinkageMatrix();
+ assertEquals(4, matrix.size());
+ }
+
+ @Test
+ public void runWithSparseVectors() {
+ int numClusters = 5;
+ List<Vector> points = Lists.newArrayList();
+ for (int i = 0; i < 99; i++) {
+ int elm = i % numClusters;
+ int indexes[] = {elm};
+ double values[] = {elm};
+ Vector point = Vectors.sparse(numClusters, indexes, values);
+ points.add(point);
+ }
+ JavaRDD<Vector> data = sc.parallelize(points, 2);
+ HierarchicalClustering algo = new HierarchicalClustering().setNumClusters(numClusters);
+ HierarchicalClusteringModel model = algo.run(data.rdd());
+ Vector[] centers = model.getCenters();
+ assertEquals(numClusters, centers.length);
+ assertEquals(points.get(0), centers[0]);
+ assertEquals(points.get(1), centers[1]);
+ assertEquals(points.get(2), centers[2]);
+ assertEquals(points.get(3), centers[3]);
+ assertEquals(points.get(4), centers[4]);
+
+ // adjacency list
+ ArrayList<ArrayList<Double>> edges = model.toJavaAdjacencyList();
+ assertEquals(8, edges.size());
+ // linkage matrix
+ ArrayList<ArrayList<Double>> matrix = model.toJavaLinkageMatrix();
+ assertEquals(4, matrix.size());
+ }
+}
From e7256f540324efe36d8fa4774f9d11ca12aaac3e Mon Sep 17 00:00:00 2001
From: Yu ISHIKAWA
Date: Wed, 20 May 2015 16:36:38 +0900
Subject: [PATCH 13/76] Support save and load functions in Java
---
 .../HierarchicalClusteringModel.scala | 35 ++++++++++++++++---
 .../JavaHierarchicalClusteringSuite.java | 20 +++++++++++
 2 files changed, 51 insertions(+), 4 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala
index 9dcf84b5df381..75d967c868947 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala
@@ -17,8 +17,11 @@ package org.apache.spark.mllib.clustering
+import java.io.File
+
 import breeze.linalg.{DenseVector => BDV, Vector => BV, norm => breezeNorm}
-import org.apache.spark.api.java.JavaRDD
+import org.apache.commons.io.FilenameUtils
+import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd.RDD
@@ -35,8 +38,20 @@ class HierarchicalClusteringModel(val tree: ClusterTree)
 /** Current version of model save/load format. */
 override protected def formatVersion: String = "1.0"
- override def save(sc: SparkContext, path: String) {
- val oos = new java.io.ObjectOutputStream(new java.io.FileOutputStream(path))
+ override def save(sc: SparkContext, path: String): Unit = this.save(path)
+
+ def save(sc: JavaSparkContext, path: String): Unit = this.save(path)
+
+ private def save(path: String): Unit = {
+ val pathObj = new File(HierarchicalClusteringModel.getModelFilePath(path)).getParentFile
+ if (pathObj.exists()) {
+ throw new IllegalArgumentException("You should save your model in another directory.
" + + "the directory already exists: " + path) + } + + pathObj.mkdir(); + val modelFilePath = HierarchicalClusteringModel.getModelFilePath(path) + val oos = new java.io.ObjectOutputStream(new java.io.FileOutputStream(modelFilePath)) try { oos.writeObject(this) } finally { @@ -137,11 +152,23 @@ class HierarchicalClusteringModel(val tree: ClusterTree) object HierarchicalClusteringModel extends Loader[HierarchicalClusteringModel] { override def load(sc: SparkContext, path: String): HierarchicalClusteringModel = { - val stream = new java.io.ObjectInputStream(new java.io.FileInputStream(path)) + this.load(path) + } + + def load(sc: JavaSparkContext, path: String): HierarchicalClusteringModel = { + this.load(path) + } + + def load(path: String): HierarchicalClusteringModel = { + val modelFilePath = getModelFilePath(path) + val stream = new java.io.ObjectInputStream(new java.io.FileInputStream(modelFilePath)) try { stream.readObject().asInstanceOf[HierarchicalClusteringModel] } finally { stream.close() } } + + private[clustering] + def getModelFilePath(path: String): String = FilenameUtils.concat(path, "model") } diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java index 84ae01d6dde0a..54f532e9e5eec 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java @@ -18,6 +18,7 @@ package org.apache.spark.mllib.clustering; import com.google.common.collect.Lists; +import jodd.io.FileUtil; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Vector; @@ -26,7 +27,11 @@ import org.junit.Before; import org.junit.Test; +import java.io.File; +import java.io.IOException; import java.io.Serializable; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; @@ -61,6 +66,21 @@ public void runWithSmallData() { HierarchicalClusteringModel model = algo.run(data.rdd()); assertEquals(1, model.getCenters().length); assertEquals(expectedCenter, model.getCenters()[0]); + + // save & load + try { + String tempDir = System.getProperty("java.io.tmpdir"); + Path pathObj = Paths.get(tempDir, this.getClass().getSimpleName()); + String path = pathObj.toAbsolutePath().toString(); + + model.save(sc, pathObj.toAbsolutePath().toString()); + HierarchicalClusteringModel savedModel = HierarchicalClusteringModel.load(sc, path); + assertEquals(1, savedModel.getCenters().length); + assertEquals(expectedCenter, savedModel.getCenters()[0]); + FileUtil.delete(new File(path)); + } catch (IOException e) { + e.printStackTrace(); + } } @Test From e2947959d7a4b724c9c4a9e7c17ef4be4f258f25 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 21 May 2015 13:20:07 +0900 Subject: [PATCH 14/76] Change the specification of HierarchicalClusteringModel.save() --- .../clustering/HierarchicalClusteringModel.scala | 7 ++----- .../JavaHierarchicalClusteringSuite.java | 7 +++++-- .../HierarchicalClusteringModelSuite.scala | 16 +++++++++++++--- .../clustering/HierarchicalClusteringSuite.scala | 6 +----- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index 
75d967c868947..eb7271df4c2dc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -44,12 +44,9 @@ class HierarchicalClusteringModel(val tree: ClusterTree) private def save(path: String): Unit = { val pathObj = new File(HierarchicalClusteringModel.getModelFilePath(path)).getParentFile - if (pathObj.exists()) { - throw new IllegalArgumentException("You should save your model in another directory. " + - "the directory already exists: " + path) + if (! pathObj.exists()) { + pathObj.mkdirs(); } - - pathObj.mkdir(); val modelFilePath = HierarchicalClusteringModel.getModelFilePath(path) val oos = new java.io.ObjectOutputStream(new java.io.FileOutputStream(modelFilePath)) try { diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java index 54f532e9e5eec..16d77570ce188 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java @@ -69,14 +69,17 @@ public void runWithSmallData() { // save & load try { + // create a temporary directory String tempDir = System.getProperty("java.io.tmpdir"); - Path pathObj = Paths.get(tempDir, this.getClass().getSimpleName()); + Path pathObj = Paths.get(tempDir, String.valueOf(this.hashCode())); String path = pathObj.toAbsolutePath().toString(); - model.save(sc, pathObj.toAbsolutePath().toString()); + model.save(sc, path); HierarchicalClusteringModel savedModel = HierarchicalClusteringModel.load(sc, path); assertEquals(1, savedModel.getCenters().length); assertEquals(expectedCenter, savedModel.getCenters()[0]); + + // delete the temporary directory FileUtil.delete(new File(path)); } catch (IOException e) { e.printStackTrace(); diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index d374ec956562a..e8341e4ba58f8 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -17,10 +17,13 @@ package org.apache.spark.mllib.clustering +import org.apache.commons.io.FilenameUtils import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.scalatest.{BeforeAndAfterEach, FunSuite} +import scala.reflect.io.Path + class HierarchicalClusteringModelSuite extends FunSuite with MLlibTestSparkContext with BeforeAndAfterEach { @@ -179,13 +182,20 @@ class HierarchicalClusteringModelSuite val data = sc.parallelize(localData.map(_._2)) val model = app.run(data) - val tmpFile = java.io.File.createTempFile("hierarchical-clustering", "save-load") - model.save(sc, tmpFile.getAbsolutePath) + // create a temporary directory for the test + val tmpBaseDir = System.getProperty("java.io.tmpdir") + val tmpDir = this.getClass.getSimpleName + this.hashCode().toString + val tmpPath = FilenameUtils.concat(tmpBaseDir, tmpDir) - val sameModel = HierarchicalClusteringModel.load(sc, tmpFile.getAbsolutePath) + model.save(sc, tmpPath) + val sameModel = HierarchicalClusteringModel.load(sc, tmpPath) 
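+ // the reloaded model should belong to the same class and yield the same predictions as the original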
assert(sameModel.getClass.getSimpleName.toString === "HierarchicalClusteringModel") localData.foreach { case (label, vector) => assert(model.predict(vector) === sameModel.predict(vector)) } + + // delete the temporary directory + val path = Path(tmpPath) + path.deleteRecursively() } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 9f0b18e6dfa58..606752c6d4201 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.mllib.util.TestingUtils._ import org.scalatest.FunSuite -class HierarchicalClusteringAppSuite extends FunSuite with MLlibTestSparkContext { +class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { test("the root index is equal to 1") { assert(HierarchicalClustering.ROOT_INDEX_KEY === 1) @@ -44,10 +44,6 @@ class HierarchicalClusteringAppSuite extends FunSuite with MLlibTestSparkContext assert(closestIndex === i) } } -} - - -class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { test("run") { val algo = new HierarchicalClustering().setNumClusters(123) From 1c66e09558d5d78af27a718a55ff54c8822de8bf Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 27 Apr 2015 16:24:41 +0900 Subject: [PATCH 15/76] Format code and modify the comments --- .../clustering/HierarchicalClustering.scala | 175 +++++++++--------- 1 file changed, 89 insertions(+), 86 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index dc06f1891e64b..36d36956443c2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -198,7 +198,7 @@ class HierarchicalClustering private ( } /** - * Summarizes data by each cluster as ClusterTree2 classes + * Summarizes data by each cluster as ClusterTree classes */ private[clustering] def summarizeAsClusters(data: RDD[(Long, BV[Double])]): Map[Long, ClusterTree] = { @@ -237,24 +237,6 @@ class HierarchicalClustering private ( }.collect().toMap } - /** - * Gets the initial centers for bi-sect k-means - */ - private[clustering] - def initChildrenCenter(clusters: Map[Long, BV[Double]]): Map[Long, BV[Double]] = { - val rand = new XORShiftRandom() - rand.setSeed(this.seed) - - clusters.flatMap { case (idx, center) => - val childrenIndexes = Array(2 * idx, 2 * idx + 1) - val relativeErrorCoefficient = 0.001 - Array( - (2 * idx, center.map(elm => elm - (elm * relativeErrorCoefficient * rand.nextDouble()))), - (2 * idx + 1, center.map(elm => elm + (elm * relativeErrorCoefficient * rand.nextDouble()))) - ) - }.toMap - } - /** * Gets the new divided centers */ @@ -310,60 +292,6 @@ class HierarchicalClustering private ( }.toMap } - /** - * Builds a cluster tree from a Map of clusters - * - * @param treeMap divided clusters as a Map class - * @param rootIndex index you want to start - * @param numClusters the number of clusters you want - * @return a built cluster tree - */ - private[clustering] - def buildTree(treeMap: Map[Long, ClusterTree], - rootIndex: Long, - numClusters: Int): Option[ClusterTree] = { - - // if there is no index in the Map - 
if (!treeMap.contains(rootIndex)) return None - - // build a cluster tree if the queue is empty or until the number of leaves clusters is enough - var numLeavesClusters = 1 - val root = treeMap(rootIndex) - var leavesQueue = Map(rootIndex -> root) - while (leavesQueue.size > 0 && numLeavesClusters < numClusters) { - // pick up the cluster whose variance is the maximum in the queue - val mostScattered = leavesQueue.maxBy(_._2.variancesNorm) - val mostScatteredKey = mostScattered._1 - val mostScatteredCluster = mostScattered._2 - - // relate the most scattered cluster to its children clusters - val childrenIndexes = Array(2 * mostScatteredKey, 2 * mostScatteredKey + 1) - if (childrenIndexes.forall(i => treeMap.contains(i))) { - // insert children to the most scattered cluster - val children = childrenIndexes.map(i => treeMap(i)) - mostScatteredCluster.insert(children) - - // calculate the local dendrogram height - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val localHeight = children - .map(child => metric(child.center.toBreeze, mostScatteredCluster.center.toBreeze)).max - mostScatteredCluster.setLocalHeight(localHeight) - - // update the queue - leavesQueue = leavesQueue ++ childrenIndexes.map(i => (i -> treeMap(i))).toMap - numLeavesClusters += 1 - } - - // remove the cluster which is involved to the cluster tree - leavesQueue = leavesQueue.filterNot(_ == mostScattered) - - log.info(s"Total Leaves Clusters: ${numLeavesClusters} / ${numClusters}. " + - s"Cluster ${childrenIndexes.mkString(",")} are merged.") - } - Some(root) - } - /** * Divides the input data * @@ -438,6 +366,78 @@ class HierarchicalClustering private ( stats } + /** + * Gets the initial centers for bi-sect k-means + */ + private[clustering] + def initChildrenCenter(clusters: Map[Long, BV[Double]]): Map[Long, BV[Double]] = { + val rand = new XORShiftRandom() + rand.setSeed(this.seed) + + clusters.flatMap { case (idx, center) => + val childrenIndexes = Array(2 * idx, 2 * idx + 1) + val relativeErrorCoefficient = 0.001 + Array( + (2 * idx, center.map(elm => elm - (elm * relativeErrorCoefficient * rand.nextDouble()))), + (2 * idx + 1, center.map(elm => elm + (elm * relativeErrorCoefficient * rand.nextDouble()))) + ) + }.toMap + } + + /** + * Builds a cluster tree from a Map of clusters + * + * @param treeMap divided clusters as a Map class + * @param rootIndex index you want to start + * @param numClusters the number of clusters you want + * @return a built cluster tree + */ + private[clustering] + def buildTree(treeMap: Map[Long, ClusterTree], + rootIndex: Long, + numClusters: Int): Option[ClusterTree] = { + + // if there is no index in the Map + if (!treeMap.contains(rootIndex)) return None + + // build a cluster tree if the queue is empty or until the number of leaves clusters is enough + var numLeavesClusters = 1 + val root = treeMap(rootIndex) + var leavesQueue = Map(rootIndex -> root) + while (leavesQueue.size > 0 && numLeavesClusters < numClusters) { + // pick up the cluster whose variance is the maximum in the queue + val mostScattered = leavesQueue.maxBy(_._2.variancesNorm) + val mostScatteredKey = mostScattered._1 + val mostScatteredCluster = mostScattered._2 + + // relate the most scattered cluster to its children clusters + val childrenIndexes = Array(2 * mostScatteredKey, 2 * mostScatteredKey + 1) + if (childrenIndexes.forall(i => treeMap.contains(i))) { + // insert children to the most scattered cluster + val 
children = childrenIndexes.map(i => treeMap(i)) + mostScatteredCluster.insert(children) + + // calculate the local dendrogram height + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val localHeight = children + .map(child => metric(child.center.toBreeze, mostScatteredCluster.center.toBreeze)).max + mostScatteredCluster.setLocalHeight(localHeight) + + // update the queue + leavesQueue = leavesQueue ++ childrenIndexes.map(i => (i -> treeMap(i))).toMap + numLeavesClusters += 1 + } + + // remove the cluster which is involved to the cluster tree + leavesQueue = leavesQueue.filterNot(_ == mostScattered) + + log.info(s"Total Leaves Clusters: ${numLeavesClusters} / ${numClusters}. " + + s"Cluster ${childrenIndexes.mkString(",")} are merged.") + } + Some(root) + } + /** * Updates the indexes of clusters which is divided to its children indexes */ @@ -500,22 +500,22 @@ class ClusterTree private ( 0.0, None, Array.empty[ClusterTree]) /** - * Inserts sub nodes as its children + * Inserts a sub node as its child * - * @param children inserted sub nodes + * @param child inserted sub node */ - def insert(children: Array[ClusterTree]) { - this.children = this.children ++ children - children.foreach(child => child.parent = Some(this)) + def insert(child: ClusterTree) { + insert(Array(child)) } /** - * Inserts a sub node as its child + * Inserts sub nodes as its children * - * @param child inserted sub node + * @param children inserted sub nodes */ - def insert(child: ClusterTree) { - insert(Array(child)) + def insert(children: Array[ClusterTree]) { + this.children = this.children ++ children + children.foreach(child => child.parent = Some(this)) } /** @@ -560,7 +560,10 @@ class ClusterTree private ( def getChildren: Seq[ClusterTree] = this.children /** - * Gets the dendrogram height of the cluster at the cluster tree + * Gets the dendrogram height of the cluster at the cluster tree. + * A dendrogram height is different from a local height. + * A dendrogram height means a total height of a node in a tree. + * A local height means a maximum distance between a node and its children. 
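+ * For example, a leaf's dendrogram height is 0.0; if a node's local height is 2.0 and its tallest child subtree has a dendrogram height of 1.5, the node's dendrogram height is 2.0 + 1.5 = 3.5.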
* * @return the dendrogram height */ @@ -571,6 +574,9 @@ class ClusterTree private ( } } + private[mllib] + def setLocalHeight(height: Double) = (this.localHeight = height) + /** * Converts to a adjacency list * @@ -622,7 +628,4 @@ class ClusterTree private ( tree.toArray().filter(_.isLeaf).size) } } - - private[mllib] - def setLocalHeight(height: Double) = (this.localHeight = height) } From 59480d3f498f4a223acaa795c138e750236e4a51 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 21 May 2015 14:10:03 +0900 Subject: [PATCH 16/76] Format the code because there is a long line --- .../mllib/clustering/HierarchicalClusteringSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 606752c6d4201..82ac672747367 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -134,10 +134,10 @@ class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { test("should assign each data to new clusters") { val algo = new HierarchicalClustering val seed = Seq( - (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), (2L, Vectors.dense(2.0, 2.0)), - (2L, Vectors.dense(3.0, 3.0)), (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 5.0)), - (3L, Vectors.dense(6.0, 6.0)), (3L, Vectors.dense(7.0, 7.0)), (3L, Vectors.dense(8.0, 8.0)), - (3L, Vectors.dense(9.0, 9.0)), (3L, Vectors.dense(10.0, 10.0)), (3L, Vectors.dense(11.0, 11.0)) + (2L, Vectors.dense(0.0, 0.0)),(2L, Vectors.dense(1.0, 1.0)),(2L, Vectors.dense(2.0, 2.0)), + (2L, Vectors.dense(3.0, 3.0)),(2L, Vectors.dense(4.0, 4.0)),(2L, Vectors.dense(5.0, 5.0)), + (3L, Vectors.dense(6.0, 6.0)),(3L, Vectors.dense(7.0, 7.0)),(3L, Vectors.dense(8.0, 8.0)), + (3L, Vectors.dense(9.0, 9.0)),(3L, Vectors.dense(10.0, 10.0)),(3L, Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze)} val newClusters = Map( 4L -> new ClusterTree(Vectors.dense(1.0, 1.0), 3, Vectors.dense(1.0, 1.0)), From ec9f85f0791e325ac25d7ca403afe8f444f53a76 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Sat, 13 Jun 2015 07:32:59 +0900 Subject: [PATCH 17/76] Fix some comments for HierarchicalClustering in Scala --- .../mllib/clustering/HierarchicalClustering.scala | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 36d36956443c2..8942424cb7e24 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -48,7 +48,7 @@ object HierarchicalClustering extends Logging { } /** - * This is a divisive hierarchical clustering algorithm based on bi-sect k-means algorithm. + * This is a divisive hierarchical clustering algorithm based on bisect k-means algorithm. * * The main idea of this algorithm is based on "A comparison of document clustering techniques", * M. Steinbach, G. Karypis and V. Kumar. Workshop on Text Mining, KDD, 2000. 
@@ -139,7 +139,7 @@ class HierarchicalClustering private ( while (clusters.size < maxAllNodesInTree && noMoreDividable == false) { log.info(s"${sc.appName} starts step ${step}") - // enough to be clustered if the number of divided clusters is equal to 0 + // can be clustered if the number of divided clusters is equal to 0 val divided = getDividedClusters(data, leafClusters) if (divided.size == 0) { noMoreDividable = true @@ -367,7 +367,7 @@ class HierarchicalClustering private ( } /** - * Gets the initial centers for bi-sect k-means + * Gets the initial centers for bisect k-means */ private[clustering] def initChildrenCenter(clusters: Map[Long, BV[Double]]): Map[Long, BV[Double]] = { @@ -400,7 +400,7 @@ class HierarchicalClustering private ( // if there is no index in the Map if (!treeMap.contains(rootIndex)) return None - // build a cluster tree if the queue is empty or until the number of leaves clusters is enough + // build a cluster tree if the queue is empty or until the number of leaf clusters is enough var numLeavesClusters = 1 val root = treeMap(rootIndex) var leavesQueue = Map(rootIndex -> root) @@ -578,7 +578,7 @@ class ClusterTree private ( def setLocalHeight(height: Double) = (this.localHeight = height) /** - * Converts to a adjacency list + * Converts to an adjacency list * * @return List[(fromNodeId, toNodeId, distance)] */ @@ -601,7 +601,6 @@ class ClusterTree private ( /** * Converts to a linkage matrix * Returned data format is fit for scipy's dendrogram function - * SEE ALSO: scipy.cluster.hierarchy.dendrogram * * @return List[(node1, node2, distance, tree size)] */ From 58999db483b5d2a34af2a8172a541198eadd080e Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Sat, 13 Jun 2015 08:54:42 +0900 Subject: [PATCH 18/76] Sort `import` statements in HierarchicalClustering.scala and HierarchicalClusteringModel.scala --- .../spark/mllib/clustering/HierarchicalClustering.scala | 7 ++++--- .../mllib/clustering/HierarchicalClusteringModel.scala | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 8942424cb7e24..350080e850476 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -17,14 +17,15 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, norm => breezeNorm} +import scala.collection.{Map, mutable} + +import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm} + import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.util.random.XORShiftRandom import org.apache.spark.{Logging, SparkException} -import scala.collection.{Map, mutable} - object HierarchicalClustering extends Logging { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index eb7271df4c2dc..73a745c8d02cb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.clustering import java.io.File -import breeze.linalg.{DenseVector => BDV, 
Vector => BV, norm => breezeNorm} +import breeze.linalg.{Vector => BV, norm => breezeNorm} import org.apache.commons.io.FilenameUtils import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.mllib.linalg.Vector From a077e99b700f8aa52e82f8960bcbda5524262041 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Sat, 13 Jun 2015 10:49:32 +0900 Subject: [PATCH 19/76] Format HierarchicalClusteringSuite and HierarchicalClusteringModelSuite --- .../HierarchicalClusteringModelSuite.scala | 12 +++++++----- .../clustering/HierarchicalClusteringSuite.scala | 16 +++++++++------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index e8341e4ba58f8..be5efca958754 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -17,15 +17,17 @@ package org.apache.spark.mllib.clustering +import scala.reflect.io.Path +import org.scalatest.BeforeAndAfterEach + import org.apache.commons.io.FilenameUtils + +import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.scalatest.{BeforeAndAfterEach, FunSuite} - -import scala.reflect.io.Path class HierarchicalClusteringModelSuite - extends FunSuite with MLlibTestSparkContext with BeforeAndAfterEach { + extends SparkFunSuite with MLlibTestSparkContext with BeforeAndAfterEach { test("clustering dense vectors") { val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) @@ -191,7 +193,7 @@ class HierarchicalClusteringModelSuite val sameModel = HierarchicalClusteringModel.load(sc, tmpPath) assert(sameModel.getClass.getSimpleName.toString === "HierarchicalClusteringModel") localData.foreach { case (label, vector) => - assert(model.predict(vector) === sameModel.predict(vector)) + assert(model.predict(vector) === sameModel.predict(vector)) } // delete the temporary directory diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 82ac672747367..306d5896c297e 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -17,14 +17,15 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, norm => breezeNorm} +import breeze.linalg.{Vector => BV, norm => breezeNorm} + +import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ -import org.scalatest.FunSuite -class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { +class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkContext { test("the root index is equal to 1") { assert(HierarchicalClustering.ROOT_INDEX_KEY === 1) @@ -134,10 +135,11 @@ class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { test("should assign each data to new clusters") { val algo = new HierarchicalClustering val seed = Seq( - (2L, 
Vectors.dense(0.0, 0.0)),(2L, Vectors.dense(1.0, 1.0)),(2L, Vectors.dense(2.0, 2.0)), - (2L, Vectors.dense(3.0, 3.0)),(2L, Vectors.dense(4.0, 4.0)),(2L, Vectors.dense(5.0, 5.0)), - (3L, Vectors.dense(6.0, 6.0)),(3L, Vectors.dense(7.0, 7.0)),(3L, Vectors.dense(8.0, 8.0)), - (3L, Vectors.dense(9.0, 9.0)),(3L, Vectors.dense(10.0, 10.0)),(3L, Vectors.dense(11.0, 11.0)) + (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), (2L, Vectors.dense(2.0, 2.0)), + (2L, Vectors.dense(3.0, 3.0)), (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 5.0)), + (3L, Vectors.dense(6.0, 6.0)), (3L, Vectors.dense(7.0, 7.0)), (3L, Vectors.dense(8.0, 8.0)), + (3L, Vectors.dense(9.0, 9.0)), (3L, Vectors.dense(10.0, 10.0)), + (3L, Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze)} val newClusters = Map( 4L -> new ClusterTree(Vectors.dense(1.0, 1.0), 3, Vectors.dense(1.0, 1.0)), From fa74f20c3a802387dbaded48eaff33fb4c477f12 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Fri, 19 Jun 2015 07:29:04 +0900 Subject: [PATCH 20/76] Rename ClusterTree to ClusterNode --- .../clustering/HierarchicalClustering.scala | 62 +++++++++---------- .../HierarchicalClusteringModel.scala | 28 ++++----- .../HierarchicalClusteringModelSuite.scala | 4 +- .../HierarchicalClusteringSuite.scala | 18 +++--- 4 files changed, 56 insertions(+), 56 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 350080e850476..2b2e15295cdb0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -63,7 +63,7 @@ object HierarchicalClustering extends Logging { */ class HierarchicalClustering private ( private var numClusters: Int, - private var clusterMap: Map[Long, ClusterTree], + private var clusterMap: Map[Long, ClusterNode], private var maxIterations: Int, private var maxRetries: Int, private var seed: Long) extends Logging { @@ -71,7 +71,7 @@ class HierarchicalClustering private ( /** * Constructs with the default configuration */ - def this() = this(20, mutable.ListMap.empty[Long, ClusterTree], 20, 10, 1) + def this() = this(20, mutable.ListMap.empty[Long, ClusterNode], 20, 10, 1) /** * Sets the number of clusters you want @@ -202,7 +202,7 @@ class HierarchicalClustering private ( * Summarizes data by each cluster as ClusterTree classes */ private[clustering] - def summarizeAsClusters(data: RDD[(Long, BV[Double])]): Map[Long, ClusterTree] = { + def summarizeAsClusters(data: RDD[(Long, BV[Double])]): Map[Long, ClusterNode] = { // summarize input data val stats = summarize(data) @@ -213,7 +213,7 @@ class HierarchicalClustering private ( case n if n > 1 => Vectors.fromBreeze(sumOfSquares.:*(n) - (sum :* sum) :/ (n * (n - 1.0))) case _ => Vectors.zeros(sum.size) } - (i, new ClusterTree(center, n.toLong, variances)) + (i, new ClusterNode(center, n.toLong, variances)) }.toMap } @@ -243,7 +243,7 @@ class HierarchicalClustering private ( */ private[clustering] def getDividedClusters(data: RDD[(Long, BV[Double])], - dividedClusters: Map[Long, ClusterTree]): Map[Long, ClusterTree] = { + dividedClusters: Map[Long, ClusterNode]): Map[Long, ClusterNode] = { val sc = data.sparkContext val appName = sc.appName @@ -253,7 +253,7 @@ class HierarchicalClustering private ( }.keySet if (dividableKeys.size == 0) { log.info(s"There is no dividable clusters in 
${appName}.") - return Map.empty[Long, ClusterTree] + return Map.empty[Long, ClusterNode] } // divide input data @@ -288,7 +288,7 @@ class HierarchicalClustering private ( case 1 => Vectors.sparse(sum.size, Array(), Array()) case _ => Vectors.fromBreeze(sumOfSquares.:*(n) - (sum :* sum) :/ (n * (n - 1.0))) } - val child = new ClusterTree(center, n.toLong, variances) + val child = new ClusterNode(center, n.toLong, variances) (i, child) }.toMap } @@ -302,7 +302,7 @@ class HierarchicalClustering private ( */ private[clustering] def divide(data: RDD[(Long, BV[Double])], - clusters: Map[Long, ClusterTree]): Map[Long, (BV[Double], Double, BV[Double])] = { + clusters: Map[Long, ClusterNode]): Map[Long, (BV[Double], Double, BV[Double])] = { val sc = data.sparkContext val centers = clusters.map { case (idx, cluster) => (idx, cluster.center.toBreeze)} @@ -394,9 +394,9 @@ class HierarchicalClustering private ( * @return a built cluster tree */ private[clustering] - def buildTree(treeMap: Map[Long, ClusterTree], + def buildTree(treeMap: Map[Long, ClusterNode], rootIndex: Long, - numClusters: Int): Option[ClusterTree] = { + numClusters: Int): Option[ClusterNode] = { // if there is no index in the Map if (!treeMap.contains(rootIndex)) return None @@ -445,7 +445,7 @@ class HierarchicalClustering private ( private[clustering] def updateClusterIndex( data: RDD[(Long, BV[Double])], - dividedClusters: Map[Long, ClusterTree]): RDD[(Long, BV[Double])] = { + dividedClusters: Map[Long, ClusterNode]): RDD[(Long, BV[Double])] = { // extract the centers of the clusters val sc = data.sparkContext var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} @@ -485,27 +485,27 @@ class HierarchicalClustering private ( * @param parent the parent cluster of the cluster * @param children the children nodes of the cluster */ -class ClusterTree private ( +class ClusterNode private ( val center: Vector, val records: Long, val variances: Vector, val variancesNorm: Double, private var localHeight: Double, - private var parent: Option[ClusterTree], - private var children: Seq[ClusterTree]) extends Serializable { + private var parent: Option[ClusterNode], + private var children: Seq[ClusterNode]) extends Serializable { require(!variancesNorm.isNaN) def this(center: Vector, rows: Long, variances: Vector) = this(center, rows, variances, breezeNorm(variances.toBreeze, 2.0), - 0.0, None, Array.empty[ClusterTree]) + 0.0, None, Array.empty[ClusterNode]) /** * Inserts a sub node as its child * * @param child inserted sub node */ - def insert(child: ClusterTree) { + def insert(child: ClusterNode) { insert(Array(child)) } @@ -514,7 +514,7 @@ class ClusterTree private ( * * @param children inserted sub nodes */ - def insert(children: Array[ClusterTree]) { + def insert(children: Array[ClusterNode]) { this.children = this.children ++ children children.foreach(child => child.parent = Some(this)) } @@ -525,7 +525,7 @@ class ClusterTree private ( * * @return an Array class which the cluster tree is expanded */ - def toArray(): Array[ClusterTree] = { + def toArray(): Array[ClusterNode] = { val array = this.children.size match { case 0 => Array(this) case _ => Array(this) ++ this.children.flatMap(child => child.toArray().toIterator) @@ -550,15 +550,15 @@ class ClusterTree private ( /** * Gets the leaves nodes in the cluster tree */ - def getLeavesNodes: Array[ClusterTree] = { + def getLeavesNodes: Array[ClusterNode] = { this.toArray().filter(_.isLeaf).sortBy(_.center.toArray.sum) } def isLeaf: Boolean = (this.children.size 
== 0) - def getParent: Option[ClusterTree] = this.parent + def getParent: Option[ClusterNode] = this.parent - def getChildren: Seq[ClusterTree] = this.children + def getChildren: Seq[ClusterNode] = this.children /** * Gets the dendrogram height of the cluster at the cluster tree. @@ -610,22 +610,22 @@ class ClusterTree private ( val leaves = nodes.filter(_.isLeaf) val notLeaves = nodes.filterNot(_.isLeaf).filter(_.getChildren.size > 1) val clusters = leaves ++ notLeaves - val treeMap = clusters.zipWithIndex.map { case (tree, idx) => (tree -> idx)}.toMap + val treeMap = clusters.zipWithIndex.map { case (node, idx) => (node -> idx)}.toMap // If a node only has one-child, the child is regarded as the cluster of the child. // Cluster A has cluster B and Cluster B. B is a leaf. C only has cluster D. // ==> A merge list is (B, D), not (B, C). - def getIndex(map: Map[ClusterTree, Int], tree: ClusterTree): Int = { - tree.children.size match { - case 1 => getIndex(map, tree.children(0)) - case _ => map(tree) + def getIndex(map: Map[ClusterNode, Int], node: ClusterNode): Int = { + node.children.size match { + case 1 => getIndex(map, node.children(0)) + case _ => map(node) } } - clusters.filterNot(_.isLeaf).map { tree => - (getIndex(treeMap, tree.children(0)), - getIndex(treeMap, tree.children(1)), - tree.getHeight, - tree.toArray().filter(_.isLeaf).size) + clusters.filterNot(_.isLeaf).map { node => + (getIndex(treeMap, node.children(0)), + getIndex(treeMap, node.children(1)), + node.getHeight, + node.toArray().filter(_.isLeaf).size) } } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index 73a745c8d02cb..634d82a23f940 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -30,9 +30,9 @@ import org.apache.spark.{Logging, SparkContext} /** * This class is used for the model of the hierarchical clustering * - * @param tree a cluster as a tree node + * @param node a cluster as a tree node */ -class HierarchicalClusteringModel(val tree: ClusterTree) +class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable with Saveable with Logging { /** Current version of model save/load format. */ @@ -56,7 +56,7 @@ class HierarchicalClusteringModel(val tree: ClusterTree) } } - def getClusters: Array[ClusterTree] = this.tree.getLeavesNodes + def getClusters: Array[ClusterNode] = this.node.getLeavesNodes def getCenters: Array[Vector] = this.getClusters.map(_.center) @@ -113,32 +113,32 @@ class HierarchicalClusteringModel(val tree: ClusterTree) def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) - def toAdjacencyList(): Array[(Int, Int, Double)] = this.tree.toAdjacencyList() + def toAdjacencyList(): Array[(Int, Int, Double)] = this.node.toAdjacencyList() /** Since Java doesn't support tuple, we must support the data structure for java and py4j. 
*/ def toJavaAdjacencyList(): java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { var javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]](); - this.tree.toAdjacencyList().foreach { x => + this.node.toAdjacencyList().foreach { x => val edge = new java.util.ArrayList[java.lang.Double]() - edge.add(x._1) - edge.add(x._2) - edge.add(x._3) + edge.add(x._1.toDouble) + edge.add(x._2.toDouble) + edge.add(x._3.toDouble) javaList.add(edge) } javaList } - def toLinkageMatrix(): Array[(Int, Int, Double, Int)] = this.tree.toLinkageMatrix() + def toLinkageMatrix(): Array[(Int, Int, Double, Int)] = this.node.toLinkageMatrix() /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ def toJavaLinkageMatrix(): java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() - this.tree.toLinkageMatrix().foreach {x => + this.node.toLinkageMatrix().foreach {x => val row = new java.util.ArrayList[java.lang.Double]() - row.add(x._1) - row.add(x._2) - row.add(x._3) - row.add(x._4) + row.add(x._1.toDouble) + row.add(x._2.toDouble) + row.add(x._3.toDouble) + row.add(x._4.toDouble) javaList.add(row) } javaList diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index be5efca958754..9c83f80a6a41c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -41,7 +41,7 @@ class HierarchicalClusteringModelSuite val model = app.run(data) val clusters = model.getClusters - assert(clusters.isInstanceOf[Array[ClusterTree]]) + assert(clusters.isInstanceOf[Array[ClusterNode]]) assert(clusters.size === 5) val centers = model.getCenters.sortBy(_.toArray.sum) @@ -102,7 +102,7 @@ class HierarchicalClusteringModelSuite val model = app.run(data) val clusters = model.getClusters - assert(clusters.isInstanceOf[Array[ClusterTree]]) + assert(clusters.isInstanceOf[Array[ClusterNode]]) assert(clusters.size === 5) val centers = model.getCenters.sortBy(_.toArray.sum) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 306d5896c297e..29d39ed87f42d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -52,12 +52,12 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte val data = sc.parallelize(localSeed, 2) val model = algo.run(data) assert(model.getClusters.size == 123) - assert(model.tree.getHeight ~== 702.8641 absTol 10E-4) + assert(model.node.getHeight ~== 702.8641 absTol 10E-4) // check the relations between a parent cluster and its children - assert(model.tree.getParent === None) - assert(model.tree.getChildren.apply(0).getParent.get === model.tree) - assert(model.tree.getChildren.apply(1).getParent.get === model.tree) + assert(model.node.getParent === None) + assert(model.node.getChildren.apply(0).getParent.get === model.node) + assert(model.node.getChildren.apply(1).getParent.get === model.node) assert(model.getClusters.forall(_.getParent != None)) } @@ -67,7 +67,7 @@ class 
HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte val data = sc.parallelize(localSeed, 2) val model = algo.run(data) assert(model.getClusters.size == 100) - assert(model.tree.getHeight ~== 72.12489 absTol 10E-4) + assert(model.node.getHeight ~== 72.12489 absTol 10E-4) } test("initializeData") { @@ -142,10 +142,10 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte (3L, Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze)} val newClusters = Map( - 4L -> new ClusterTree(Vectors.dense(1.0, 1.0), 3, Vectors.dense(1.0, 1.0)), - 5L -> new ClusterTree(Vectors.dense(4.0, 4.0), 3, Vectors.dense(1.0, 1.0)), - 6L -> new ClusterTree(Vectors.dense(7.0, 7.0), 3, Vectors.dense(1.0, 1.0)), - 7L -> new ClusterTree(Vectors.dense(10.0, 10.0), 3, Vectors.dense(1.0, 1.0)) + 4L -> new ClusterNode(Vectors.dense(1.0, 1.0), 3, Vectors.dense(1.0, 1.0)), + 5L -> new ClusterNode(Vectors.dense(4.0, 4.0), 3, Vectors.dense(1.0, 1.0)), + 6L -> new ClusterNode(Vectors.dense(7.0, 7.0), 3, Vectors.dense(1.0, 1.0)), + 7L -> new ClusterNode(Vectors.dense(10.0, 10.0), 3, Vectors.dense(1.0, 1.0)) ) val data = sc.parallelize(seed) val result = algo.updateClusterIndex(data, newClusters).collect().toSeq From 11439202ff555a8ed303b453872de9ccefbcf792 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Fri, 19 Jun 2015 07:43:40 +0900 Subject: [PATCH 21/76] Remove save/load from HierarchicalClusteringModel --- .../HierarchicalClusteringModel.scala | 56 +------------------ .../JavaHierarchicalClusteringSuite.java | 23 -------- .../HierarchicalClusteringModelSuite.scala | 34 +---------- 3 files changed, 4 insertions(+), 109 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index 634d82a23f940..bbd13855e2835 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -17,44 +17,18 @@ package org.apache.spark.mllib.clustering -import java.io.File - import breeze.linalg.{Vector => BV, norm => breezeNorm} -import org.apache.commons.io.FilenameUtils -import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} +import org.apache.spark.Logging +import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD -import org.apache.spark.{Logging, SparkContext} /** * This class is used for the model of the hierarchical clustering * * @param node a cluster as a tree node */ -class HierarchicalClusteringModel(val node: ClusterNode) - extends Serializable with Saveable with Logging { - - /** Current version of model save/load format. */ - override protected def formatVersion: String = "1.0" - - override def save(sc: SparkContext, path: String): Unit = this.save(path) - - def save(sc: JavaSparkContext, path: String): Unit = this.save(path) - - private def save(path: String): Unit = { - val pathObj = new File(HierarchicalClusteringModel.getModelFilePath(path)).getParentFile - if (! 
pathObj.exists()) { - pathObj.mkdirs(); - } - val modelFilePath = HierarchicalClusteringModel.getModelFilePath(path) - val oos = new java.io.ObjectOutputStream(new java.io.FileOutputStream(modelFilePath)) - try { - oos.writeObject(this) - } finally { - oos.close() - } - } +class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable with Logging { def getClusters: Array[ClusterNode] = this.node.getLeavesNodes @@ -145,27 +119,3 @@ class HierarchicalClusteringModel(val node: ClusterNode) } } - -object HierarchicalClusteringModel extends Loader[HierarchicalClusteringModel] { - - override def load(sc: SparkContext, path: String): HierarchicalClusteringModel = { - this.load(path) - } - - def load(sc: JavaSparkContext, path: String): HierarchicalClusteringModel = { - this.load(path) - } - - def load(path: String): HierarchicalClusteringModel = { - val modelFilePath = getModelFilePath(path) - val stream = new java.io.ObjectInputStream(new java.io.FileInputStream(modelFilePath)) - try { - stream.readObject().asInstanceOf[HierarchicalClusteringModel] - } finally { - stream.close() - } - } - - private[clustering] - def getModelFilePath(path: String): String = FilenameUtils.concat(path, "model") -} diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java index 16d77570ce188..84ae01d6dde0a 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java @@ -18,7 +18,6 @@ package org.apache.spark.mllib.clustering; import com.google.common.collect.Lists; -import jodd.io.FileUtil; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Vector; @@ -27,11 +26,7 @@ import org.junit.Before; import org.junit.Test; -import java.io.File; -import java.io.IOException; import java.io.Serializable; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; @@ -66,24 +61,6 @@ public void runWithSmallData() { HierarchicalClusteringModel model = algo.run(data.rdd()); assertEquals(1, model.getCenters().length); assertEquals(expectedCenter, model.getCenters()[0]); - - // save & load - try { - // create a temporary directory - String tempDir = System.getProperty("java.io.tmpdir"); - Path pathObj = Paths.get(tempDir, String.valueOf(this.hashCode())); - String path = pathObj.toAbsolutePath().toString(); - - model.save(sc, path); - HierarchicalClusteringModel savedModel = HierarchicalClusteringModel.load(sc, path); - assertEquals(1, savedModel.getCenters().length); - assertEquals(expectedCenter, savedModel.getCenters()[0]); - - // delete the temporary directory - FileUtil.delete(new File(path)); - } catch (IOException e) { - e.printStackTrace(); - } } @Test diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index 9c83f80a6a41c..1335a0b8c6e3d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -17,14 +17,10 @@ package org.apache.spark.mllib.clustering -import scala.reflect.io.Path -import 
org.scalatest.BeforeAndAfterEach - -import org.apache.commons.io.FilenameUtils - import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.scalatest.BeforeAndAfterEach class HierarchicalClusteringModelSuite extends SparkFunSuite with MLlibTestSparkContext with BeforeAndAfterEach { @@ -172,32 +168,4 @@ class HierarchicalClusteringModelSuite assert(sparseModel.getClusters.forall(_.variancesNorm == 0.0)) } } - - test("save a model, and then load the model") { - val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) - - val localData = (1 to 100).toSeq.map { i => - val label = i % 5 - val vector = Vectors.dense(label, label, label) - (label, vector) - } - val data = sc.parallelize(localData.map(_._2)) - val model = app.run(data) - - // create a temporary directory for the test - val tmpBaseDir = System.getProperty("java.io.tmpdir") - val tmpDir = this.getClass.getSimpleName + this.hashCode().toString - val tmpPath = FilenameUtils.concat(tmpBaseDir, tmpDir) - - model.save(sc, tmpPath) - val sameModel = HierarchicalClusteringModel.load(sc, tmpPath) - assert(sameModel.getClass.getSimpleName.toString === "HierarchicalClusteringModel") - localData.foreach { case (label, vector) => - assert(model.predict(vector) === sameModel.predict(vector)) - } - - // delete the temporary directory - val path = Path(tmpPath) - path.deleteRecursively() - } } From 16cc823690a0b7d084785e56bdc76d9bb73474bc Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Fri, 19 Jun 2015 08:22:38 +0900 Subject: [PATCH 22/76] Fix some mislenious code pointed out by IntelliJ --- .../clustering/HierarchicalClustering.scala | 58 ++++++++++--------- .../HierarchicalClusteringModel.scala | 16 ++--- .../JavaHierarchicalClusteringSuite.java | 2 +- .../HierarchicalClusteringModelSuite.scala | 28 ++++----- .../HierarchicalClusteringSuite.scala | 7 +-- 5 files changed, 56 insertions(+), 55 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 2b2e15295cdb0..f7249272b0c3d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -40,7 +40,7 @@ object HierarchicalClustering extends Logging { * @return an index of the array of clusters */ private[mllib] - def findClosestCenter(metric: Function2[BV[Double], BV[Double], Double]) + def findClosestCenter(metric: (BV[Double], BV[Double]) => Double) (centers: Seq[BV[Double]])(point: BV[Double]): Int = { val (closestCenter, closestIndex) = centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1) @@ -152,7 +152,7 @@ class HierarchicalClustering private ( data = newData // keep recent 2 cached RDDs in order to run more quickly - if (rddArray.size > 1) { + if (rddArray.length > 1) { val head = rddArray.head head.unpersist() rddArray = rddArray.filterNot(_.hashCode() == head.hashCode()) @@ -184,8 +184,8 @@ class HierarchicalClustering private ( // make a hierarchical clustering model val model = new HierarchicalClusteringModel(root.get) val leavesNodes = model.getClusters - if (leavesNodes.size < this.numClusters) { - log.warn(s"# clusters is less than you have expected: ${leavesNodes.size} / ${numClusters}. 
") + if (leavesNodes.length < this.numClusters) { + log.warn(s"# clusters is less than you want: ${leavesNodes.length} / ${numClusters}") } model } @@ -195,7 +195,7 @@ class HierarchicalClustering private ( */ private[clustering] def initData(data: RDD[Vector]): RDD[(Long, BV[Double])] = { - data.map { v: Vector => (HierarchicalClustering.ROOT_INDEX_KEY, v.toBreeze)}.cache + data.map { v: Vector => (HierarchicalClustering.ROOT_INDEX_KEY, v.toBreeze)} } /** @@ -227,8 +227,8 @@ class HierarchicalClustering private ( val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] iter.foreach { case (idx: Long, point: BV[Double]) => // get a map value or else get a sparse vector - val (sumBV, n, sumOfSquares) = map.get(idx) - .getOrElse(BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + val (sumBV, n, sumOfSquares) = map + .getOrElse(idx, (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size))) map(idx) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) } map.toIterator @@ -258,16 +258,15 @@ class HierarchicalClustering private ( // divide input data var dividableData = data.filter { case (idx, point) => dividableKeys.contains(idx)} - var dividableClusters = dividedClusters.filter { case (k, v) => dividableKeys.contains(k)} + val dividableClusters = dividedClusters.filter { case (k, v) => dividableKeys.contains(k)} val idealIndexes = dividableKeys.flatMap(idx => Array(2 * idx, 2 * idx + 1).toIterator) var stats = divide(data, dividableClusters) - // if there is clusters which is failed to be divided, - // retry to divide only failed clusters again and again + // if there are clusters which failed to be divided, retry to split the failed clusters var tryTimes = 1 while (stats.size < dividableKeys.size * 2 && tryTimes <= this.maxRetries) { // get the indexes of clusters which is failed to be divided - val failedIndexes = idealIndexes.filterNot(stats.keySet.contains).map(idx => (idx / 2).toLong) + val failedIndexes = idealIndexes.filterNot(stats.keySet.contains).map(idx => idx / 2) val failedCenters = dividedClusters.filter { case (idx, clstr) => failedIndexes.contains(idx)} log.info(s"# failed clusters is ${failedCenters.size} of ${dividableKeys.size}" + s"at ${tryTimes} times in ${appName}") @@ -332,15 +331,18 @@ class HierarchicalClustering private ( iter.foreach { case (idx, point) => // calculate next index number val childrenCenters = Array(2 * idx, 2 * idx + 1) - .filter(bcNewCenters.value.keySet.contains(_)).map(bcNewCenters.value(_)).toArray - if (childrenCenters.size >= 1) { + .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) + if (childrenCenters.length >= 1) { val closestIndex = HierarchicalClustering.findClosestCenter(bcMetric.value)(childrenCenters)(point) val nextIndex = 2 * idx + closestIndex // get a map value or else get a sparse vector - val (sumBV, n, sumOfSquares) = map.get(nextIndex) - .getOrElse(BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + val (sumBV, n, sumOfSquares) = map + .getOrElse( + nextIndex, + (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + ) map(nextIndex) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) } } @@ -426,7 +428,7 @@ class HierarchicalClustering private ( mostScatteredCluster.setLocalHeight(localHeight) // update the queue - leavesQueue = leavesQueue ++ childrenIndexes.map(i => (i -> treeMap(i))).toMap + leavesQueue = leavesQueue ++ childrenIndexes.map(i => i -> treeMap(i)).toMap numLeavesClusters += 1 } @@ -457,8 +459,8 @@ class 
HierarchicalClustering private ( // update the indexes to their children indexes data.map { case (idx, point) => - val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(bcCenters.value.keySet.contains(_)) - childrenIndexes.size match { + val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(c => bcCenters.value.contains(c)) + childrenIndexes.length match { // stay the index if the number of children is not enough case s if s < 2 => (idx, point) // update the indexes @@ -525,10 +527,10 @@ class ClusterNode private ( * * @return an Array class which the cluster tree is expanded */ - def toArray(): Array[ClusterNode] = { + def toArray: Array[ClusterNode] = { val array = this.children.size match { case 0 => Array(this) - case _ => Array(this) ++ this.children.flatMap(child => child.toArray().toIterator) + case _ => Array(this) ++ this.children.flatMap(child => child.toArray.toIterator) } array.sortWith { case (a, b) => a.getDepth < b.getDepth && a.variances.toArray.sum < b.variances.toArray.sum @@ -551,7 +553,7 @@ class ClusterNode private ( * Gets the leaves nodes in the cluster tree */ def getLeavesNodes: Array[ClusterNode] = { - this.toArray().filter(_.isLeaf).sortBy(_.center.toArray.sum) + this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) } def isLeaf: Boolean = (this.children.size == 0) @@ -583,8 +585,8 @@ class ClusterNode private ( * * @return List[(fromNodeId, toNodeId, distance)] */ - def toAdjacencyList(): Array[(Int, Int, Double)] = { - val nodes = toArray() + def toAdjacencyList: Array[(Int, Int, Double)] = { + val nodes = toArray var adjacencyList = Array.empty[(Int, Int, Double)] nodes.foreach { parent => @@ -605,8 +607,8 @@ class ClusterNode private ( * * @return List[(node1, node2, distance, tree size)] */ - def toLinkageMatrix(): Array[(Int, Int, Double, Int)] = { - val nodes = toArray().sortWith { case (a, b) => a.getHeight < b.getHeight} + def toLinkageMatrix: Array[(Int, Int, Double, Int)] = { + val nodes = toArray.sortWith { case (a, b) => a.getHeight < b.getHeight} val leaves = nodes.filter(_.isLeaf) val notLeaves = nodes.filterNot(_.isLeaf).filter(_.getChildren.size > 1) val clusters = leaves ++ notLeaves @@ -617,15 +619,15 @@ class ClusterNode private ( // ==> A merge list is (B, D), not (B, C). 
def getIndex(map: Map[ClusterNode, Int], node: ClusterNode): Int = { node.children.size match { - case 1 => getIndex(map, node.children(0)) + case 1 => getIndex(map, node.children.head) case _ => map(node) } } clusters.filterNot(_.isLeaf).map { node => - (getIndex(treeMap, node.children(0)), + (getIndex(treeMap, node.children.head), getIndex(treeMap, node.children(1)), node.getHeight, - node.toArray().filter(_.isLeaf).size) + node.toArray.filter(_.isLeaf).length) } } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index bbd13855e2835..7a38a7994fb0c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -69,7 +69,7 @@ class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable wi predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] /** - * Computes Within Set Sum of Squeared Error(WSSSE) + * Computes Within Set Sum of Squared Error(WSSSE) */ def WSSSE(data: RDD[Vector]): Double = { val bvCenters = this.getCenters.map(_.toBreeze) @@ -87,12 +87,12 @@ class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable wi def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) - def toAdjacencyList(): Array[(Int, Int, Double)] = this.node.toAdjacencyList() + def toAdjacencyList: Array[(Int, Int, Double)] = this.node.toAdjacencyList /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ - def toJavaAdjacencyList(): java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { - var javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]](); - this.node.toAdjacencyList().foreach { x => + def toJavaAdjacencyList: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { + val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() + this.node.toAdjacencyList.foreach { x => val edge = new java.util.ArrayList[java.lang.Double]() edge.add(x._1.toDouble) edge.add(x._2.toDouble) @@ -102,12 +102,12 @@ class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable wi javaList } - def toLinkageMatrix(): Array[(Int, Int, Double, Int)] = this.node.toLinkageMatrix() + def toLinkageMatrix: Array[(Int, Int, Double, Int)] = this.node.toLinkageMatrix /** Since Java doesn't support tuple, we must support the data structure for java and py4j. 
*/ - def toJavaLinkageMatrix(): java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { + def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() - this.node.toLinkageMatrix().foreach {x => + this.node.toLinkageMatrix.foreach {x => val row = new java.util.ArrayList[java.lang.Double]() row.add(x._1.toDouble) row.add(x._2.toDouble) diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java index 84ae01d6dde0a..7132658f5a3e7 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java @@ -68,7 +68,7 @@ public void runWithDenseVectors() { int numClusters = 5; List points = Lists.newArrayList(); for (int i = 0; i < 99; i++) { - Double elm = new Double(i % numClusters); + Double elm = (double)(i % numClusters); Vector point = Vectors.dense(elm, elm); points.add(point); } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index 1335a0b8c6e3d..8d8715c77db92 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -38,10 +38,10 @@ class HierarchicalClusteringModelSuite val clusters = model.getClusters assert(clusters.isInstanceOf[Array[ClusterNode]]) - assert(clusters.size === 5) + assert(clusters.length === 5) val centers = model.getCenters.sortBy(_.toArray.sum) - assert(centers.size === 5) + assert(centers.length === 5) assert(centers(0) === Vectors.dense(0.0, 0.0, 0.0)) assert(centers(1) === Vectors.dense(1.0, 1.0, 1.0)) assert(centers(2) === Vectors.dense(2.0, 2.0, 2.0)) @@ -64,9 +64,9 @@ class HierarchicalClusteringModelSuite assert(model.WSSSE(data) === 0.0) // adjacency list - val adjacencyList = model.toAdjacencyList() + val adjacencyList = model.toAdjacencyList .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) - assert(adjacencyList.size === 8) + assert(adjacencyList.length === 8) assert(adjacencyList(0) === (0, 1, 2.5981)) assert(adjacencyList(1) === (0, 6, 2.5981)) assert(adjacencyList(2) === (1, 2, 1.7321)) @@ -77,9 +77,9 @@ class HierarchicalClusteringModelSuite assert(adjacencyList(7) === (6, 8, 0.866)) // linkage matrix - val linkageMatrix = model.toLinkageMatrix() + val linkageMatrix = model.toLinkageMatrix .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3, x._4)) - assert(linkageMatrix.size === 4) + assert(linkageMatrix.length === 4) assert(linkageMatrix(0) === (0, 1, 0.866, 2)) assert(linkageMatrix(1) === (3, 4, 0.866, 2)) assert(linkageMatrix(2) === (5, 2, 2.5981, 3)) @@ -99,10 +99,10 @@ class HierarchicalClusteringModelSuite val clusters = model.getClusters assert(clusters.isInstanceOf[Array[ClusterNode]]) - assert(clusters.size === 5) + assert(clusters.length === 5) val centers = model.getCenters.sortBy(_.toArray.sum) - assert(centers.size === 5) + assert(centers.length === 5) assert(centers(0) === Vectors.sparse(5, Array(), Array())) assert(centers(1) === Vectors.sparse(5, Array(1), Array(1.0))) assert(centers(2) === Vectors.sparse(5, Array(2), Array(2.0))) @@ -124,9 +124,9 @@ 
class HierarchicalClusteringModelSuite assert(model.WSSSE(data) === 0.0) // adjacency list - val adjacencyList = model.toAdjacencyList() + val adjacencyList = model.toAdjacencyList .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) - assert(adjacencyList.size === 8) + assert(adjacencyList.length === 8) assert(adjacencyList(0) === (0, 1, 1.5652)) assert(adjacencyList(1) === (0, 6, 1.5652)) assert(adjacencyList(2) === (1, 2, 1.3744)) @@ -137,9 +137,9 @@ class HierarchicalClusteringModelSuite assert(adjacencyList(7) === (6, 8, 2.5)) // linkage matrix - val linkageMatrix = model.toLinkageMatrix() + val linkageMatrix = model.toLinkageMatrix .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3, x._4)) - assert(linkageMatrix.size === 4) + assert(linkageMatrix.length === 4) assert(linkageMatrix(0) === (0, 1, 0.5, 2)) assert(linkageMatrix(1) === (5, 2, 1.8744, 3)) assert(linkageMatrix(2) === (3, 4, 2.5, 2)) @@ -158,13 +158,13 @@ class HierarchicalClusteringModelSuite // dense version val denseData = sc.parallelize(localData.map(_._2), 2) val denseModel = app.run(denseData) - assert(denseModel.getCenters.size === numClusters) + assert(denseModel.getCenters.length === numClusters) assert(denseModel.getClusters.forall(_.variancesNorm == 0.0)) // sparse version val sparseData = sc.parallelize(localData.map(_._3), 2) val sparseModel = app.run(sparseData) - assert(sparseModel.getCenters.size === numClusters) + assert(sparseModel.getCenters.length === numClusters) assert(sparseModel.getClusters.forall(_.variancesNorm == 0.0)) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 29d39ed87f42d..bd4c94c05b1bc 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -51,12 +51,12 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) - assert(model.getClusters.size == 123) + assert(model.getClusters.length == 123) assert(model.node.getHeight ~== 702.8641 absTol 10E-4) // check the relations between a parent cluster and its children assert(model.node.getParent === None) - assert(model.node.getChildren.apply(0).getParent.get === model.node) + assert(model.node.getChildren.head.getParent.get === model.node) assert(model.node.getChildren.apply(1).getParent.get === model.node) assert(model.getClusters.forall(_.getParent != None)) } @@ -66,7 +66,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) - assert(model.getClusters.size == 100) + assert(model.getClusters.length == 100) assert(model.node.getHeight ~== 72.12489 absTol 10E-4) } @@ -85,7 +85,6 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte val data = algo.initData(seed) val clusters = algo.summarizeAsClusters(data) - val center = clusters(1).center assert(clusters.size === 1) assert(clusters(1).center === Vectors.dense(49.5, 49.5)) assert(clusters(1).records === 100) From c02134e9cc26337c077cefa421a76d72ac9690ba Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA 
Date: Fri, 3 Jul 2015 11:56:02 +0900 Subject: [PATCH 23/76] Remove python API. We will implement it at another issue. --- .../mllib/api/python/PythonMLLibAPI.scala | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 49a3420c26945..21e55938fa7aa 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -401,29 +401,6 @@ private[python] class PythonMLLibAPI extends Serializable { } } - /** - * Java stub for Python mllib HierarchicalClustering.run() - */ - def trainHierarchicalClusteringModel( - data: JavaRDD[Vector], - k: Int, - maxIterations: Int, - maxRetries: Int, - seed: java.lang.Long): HierarchicalClusteringModel = { - val algo = new HierarchicalClustering() - .setNumClusters(k) - .setMaxIterations(maxIterations) - .setMaxRetries(maxRetries) - - if (seed != null) algo.setSeed(seed) - - try { - algo.run(data) - } finally { - data.rdd.unpersist(blocking = false) - } - } - /** * Java stub for Python mllib GaussianMixtureModel.predictSoft() */ From def81e202125de52b191e05e3df60c80c647d89e Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Fri, 3 Jul 2015 16:21:39 +0900 Subject: [PATCH 24/76] Rename HierarchicalClustering to BisectingKMeans --- ...Clustering.scala => BisectingKMeans.scala} | 28 +++++++++---------- ...Model.scala => BisectingKMeansModel.scala} | 10 +++---- ...ite.java => JavaBisectingKMeansSuite.java} | 16 +++++------ ....scala => BisectingKMeansModelSuite.scala} | 8 +++--- ...Suite.scala => BisectingKMeansSuite.scala} | 28 +++++++++---------- 5 files changed, 45 insertions(+), 45 deletions(-) rename mllib/src/main/scala/org/apache/spark/mllib/clustering/{HierarchicalClustering.scala => BisectingKMeans.scala} (96%) rename mllib/src/main/scala/org/apache/spark/mllib/clustering/{HierarchicalClusteringModel.scala => BisectingKMeansModel.scala} (90%) rename mllib/src/test/java/org/apache/spark/mllib/clustering/{JavaHierarchicalClusteringSuite.java => JavaBisectingKMeansSuite.java} (86%) rename mllib/src/test/scala/org/apache/spark/mllib/clustering/{HierarchicalClusteringModelSuite.scala => BisectingKMeansModelSuite.scala} (96%) rename mllib/src/test/scala/org/apache/spark/mllib/clustering/{HierarchicalClusteringSuite.scala => BisectingKMeansSuite.scala} (90%) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala similarity index 96% rename from mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala rename to mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index f7249272b0c3d..20f87c12cac8e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -27,7 +27,7 @@ import org.apache.spark.util.random.XORShiftRandom import org.apache.spark.{Logging, SparkException} -object HierarchicalClustering extends Logging { +object BisectingKMeans extends Logging { private[clustering] val ROOT_INDEX_KEY: Long = 1 @@ -49,7 +49,7 @@ object HierarchicalClustering extends Logging { } /** - * This is a divisive hierarchical clustering algorithm based on bisect k-means algorithm. 
+ * This is a divisive hierarchical clustering algorithm based on bisecting k-means algorithm. * * The main idea of this algorithm is based on "A comparison of document clustering techniques", * M. Steinbach, G. Karypis and V. Kumar. Workshop on Text Mining, KDD, 2000. @@ -61,7 +61,7 @@ object HierarchicalClustering extends Logging { * @param maxRetries the number of maximum retries * @param seed a random seed */ -class HierarchicalClustering private ( +class BisectingKMeans private ( private var numClusters: Int, private var clusterMap: Map[Long, ClusterNode], private var maxIterations: Int, @@ -114,13 +114,13 @@ class HierarchicalClustering private ( def getSeed: Long = this.seed /** - * Runs the hierarchical clustering algorithm + * Runs the bisecting kmeans algorithm * @param input RDD of vectors - * @return model for the hierarchical clustering + * @return model for the bisecting kmeans */ - def run(input: RDD[Vector]): HierarchicalClusteringModel = { + def run(input: RDD[Vector]): BisectingKMeansModel = { val sc = input.sparkContext - log.info(s"${sc.appName} starts a hierarchical clustering algorithm") + log.info(s"${sc.appName} starts a bisecting kmeans algorithm") var data = initData(input).cache() val startTime = System.currentTimeMillis() @@ -172,17 +172,17 @@ class HierarchicalClustering private ( // build a cluster tree by Map class which is expressed log.info(s"Building the cluster tree is started in ${sc.appName}") - val root = buildTree(clusters, HierarchicalClustering.ROOT_INDEX_KEY, this.numClusters) + val root = buildTree(clusters, BisectingKMeans.ROOT_INDEX_KEY, this.numClusters) if (root == None) { new SparkException("Failed to build a cluster tree from a Map type of clusters") } // set the elapsed time for training val finishTime = (System.currentTimeMillis() - startTime) / 1000.0 - log.info(s"Elapsed Time for Hierarchical Clustering Training: ${finishTime} [sec]") + log.info(s"Elapsed Time for ${this.getClass.getSimpleName} Training: ${finishTime} [sec]") - // make a hierarchical clustering model - val model = new HierarchicalClusteringModel(root.get) + // make a bisecting kmeans model + val model = new BisectingKMeansModel(root.get) val leavesNodes = model.getClusters if (leavesNodes.length < this.numClusters) { log.warn(s"# clusters is less than you want: ${leavesNodes.length} / ${numClusters}") @@ -195,7 +195,7 @@ class HierarchicalClustering private ( */ private[clustering] def initData(data: RDD[Vector]): RDD[(Long, BV[Double])] = { - data.map { v: Vector => (HierarchicalClustering.ROOT_INDEX_KEY, v.toBreeze)} + data.map { v: Vector => (BisectingKMeans.ROOT_INDEX_KEY, v.toBreeze)} } /** @@ -334,7 +334,7 @@ class HierarchicalClustering private ( .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) if (childrenCenters.length >= 1) { val closestIndex = - HierarchicalClustering.findClosestCenter(bcMetric.value)(childrenCenters)(point) + BisectingKMeans.findClosestCenter(bcMetric.value)(childrenCenters)(point) val nextIndex = 2 * idx + closestIndex // get a map value or else get a sparse vector @@ -466,7 +466,7 @@ class HierarchicalClustering private ( // update the indexes case _ => { val nextCenters = childrenIndexes.map(bcCenters.value(_)).map(_.toBreeze) - val closestIndex = HierarchicalClustering + val closestIndex = BisectingKMeans .findClosestCenter(bcMetric.value)(nextCenters)(point) val nextIndex = 2 * idx + closestIndex (nextIndex, point) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala similarity index 90% rename from mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala rename to mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 7a38a7994fb0c..2c257caced02e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -24,11 +24,11 @@ import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD /** - * This class is used for the model of the hierarchical clustering + * This class is used for the model of the bisecting kmeans * * @param node a cluster as a tree node */ -class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable with Logging { +class BisectingKMeansModel(val node: ClusterNode) extends Serializable with Logging { def getClusters: Array[ClusterNode] = this.node.getLeavesNodes @@ -42,7 +42,7 @@ class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable wi val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val centers = this.getCenters.map(_.toBreeze) - HierarchicalClustering.findClosestCenter(metric)(centers)(vector.toBreeze) + BisectingKMeans.findClosestCenter(metric)(centers)(vector.toBreeze) } /** @@ -58,7 +58,7 @@ class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable wi sc.broadcast(centers) data.map{point => - HierarchicalClustering.findClosestCenter(metric)(centers)(point.toBreeze) + BisectingKMeans.findClosestCenter(metric)(centers)(point.toBreeze) } } @@ -77,7 +77,7 @@ class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable wi val distances = data.map {point => val bvPoint = point.toBreeze val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val idx = HierarchicalClustering.findClosestCenter(metric)(bvCenters)(bvPoint) + val idx = BisectingKMeans.findClosestCenter(metric)(bvCenters)(bvPoint) val closestCenter = bvCenters(idx) val distance = metric(bvPoint, closestCenter) distance diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java similarity index 86% rename from mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java rename to mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java index 7132658f5a3e7..75daf4c26f93b 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java @@ -32,12 +32,12 @@ import static org.junit.Assert.assertEquals; -public class JavaHierarchicalClusteringSuite implements Serializable { +public class JavaBisectingKMeansSuite implements Serializable { private transient JavaSparkContext sc; @Before public void setUp() { - sc = new JavaSparkContext("local", "JavaHierarchicalClustering"); + sc = new JavaSparkContext("local", this.getClass().getSimpleName()); } @After @@ -57,8 +57,8 @@ public void runWithSmallData() { Vector expectedCenter = Vectors.dense(1.0, 3.0, 4.0); JavaRDD data = sc.parallelize(points, 2); - HierarchicalClustering algo = new HierarchicalClustering().setNumClusters(1); - HierarchicalClusteringModel model = algo.run(data.rdd()); + BisectingKMeans 
algo = new BisectingKMeans().setNumClusters(1); + BisectingKMeansModel model = algo.run(data.rdd()); assertEquals(1, model.getCenters().length); assertEquals(expectedCenter, model.getCenters()[0]); } @@ -73,8 +73,8 @@ public void runWithDenseVectors() { points.add(point); } JavaRDD data = sc.parallelize(points, 2); - HierarchicalClustering algo = new HierarchicalClustering().setNumClusters(numClusters); - HierarchicalClusteringModel model = algo.run(data.rdd()); + BisectingKMeans algo = new BisectingKMeans().setNumClusters(numClusters); + BisectingKMeansModel model = algo.run(data.rdd()); Vector[] centers = model.getCenters(); assertEquals(numClusters, centers.length); assertEquals(Vectors.dense(0.0, 0.0), centers[0]); @@ -103,8 +103,8 @@ public void runWithSparseVectors() { points.add(point); } JavaRDD data = sc.parallelize(points, 2); - HierarchicalClustering algo = new HierarchicalClustering().setNumClusters(numClusters); - HierarchicalClusteringModel model = algo.run(data.rdd()); + BisectingKMeans algo = new BisectingKMeans().setNumClusters(numClusters); + BisectingKMeansModel model = algo.run(data.rdd()); Vector[] centers = model.getCenters(); assertEquals(numClusters, centers.length); assertEquals(points.get(0), centers[0]); diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala similarity index 96% rename from mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala rename to mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index 8d8715c77db92..6df074e34d23d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -22,11 +22,11 @@ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.scalatest.BeforeAndAfterEach -class HierarchicalClusteringModelSuite +class BisectingKMeansModelSuite extends SparkFunSuite with MLlibTestSparkContext with BeforeAndAfterEach { test("clustering dense vectors") { - val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) + val app = new BisectingKMeans().setNumClusters(5).setSeed(1) val localData = (1 to 100).toSeq.map { i => val label = i % 5 @@ -87,7 +87,7 @@ class HierarchicalClusteringModelSuite } test("clustering sparse vectors") { - val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) + val app = new BisectingKMeans().setNumClusters(5).setSeed(1) val localData = (1 to 100).toSeq.map { i => val label = i % 5 @@ -148,7 +148,7 @@ class HierarchicalClusteringModelSuite test("clustering should be done correctly") { for (numClusters <- Array(9, 99, 999)) { - val app = new HierarchicalClustering().setNumClusters(numClusters).setSeed(1) + val app = new BisectingKMeans().setNumClusters(numClusters).setSeed(1) val localData = (1 to 1000).toSeq.map { i => val label = i % numClusters val sparseVector = Vectors.sparse(numClusters, Seq((label, label.toDouble))) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala similarity index 90% rename from mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala rename to 
mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index bd4c94c05b1bc..8e5d95dfb2846 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -25,10 +25,10 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ -class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkContext { +class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("the root index is equal to 1") { - assert(HierarchicalClustering.ROOT_INDEX_KEY === 1) + assert(BisectingKMeans.ROOT_INDEX_KEY === 1) } test("findClosestCenter") { @@ -41,13 +41,13 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte for (i <- 0 to (centers.size - 1)) { val point = centers(i) - val closestIndex = HierarchicalClustering.findClosestCenter(metric)(centers)(point) + val closestIndex = BisectingKMeans.findClosestCenter(metric)(centers)(point) assert(closestIndex === i) } } test("run") { - val algo = new HierarchicalClustering().setNumClusters(123) + val algo = new BisectingKMeans().setNumClusters(123) val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) @@ -62,7 +62,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("run with too many cluster size than the records") { - val algo = new HierarchicalClustering().setNumClusters(123) + val algo = new BisectingKMeans().setNumClusters(123) val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) @@ -71,7 +71,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("initializeData") { - val algo = new HierarchicalClustering + val algo = new BisectingKMeans val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val seed = sc.parallelize(localSeed) val data = algo.initData(seed) @@ -79,7 +79,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("get center stats") { - val algo = new HierarchicalClustering + val algo = new BisectingKMeans val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val seed = sc.parallelize(localSeed) val data = algo.initData(seed) @@ -103,7 +103,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("getChildrenCenter") { - val algo = new HierarchicalClustering + val algo = new BisectingKMeans val centers = Map( 2L -> Vectors.dense(1.0, 1.0).toBreeze, 3L -> Vectors.dense(2.0, 2.0).toBreeze @@ -114,7 +114,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("should divide clusters") { - val algo = new HierarchicalClustering + val algo = new BisectingKMeans val seed = (0 to 99).map(i => ((i / 50) + 2L, Vectors.dense(i, i).toBreeze)) val data = sc.parallelize(seed) val clusters = algo.summarizeAsClusters(data) @@ -132,7 +132,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("should assign each data to new clusters") { - val algo = new HierarchicalClustering + val algo = new BisectingKMeans val seed = Seq( (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), (2L, 
Vectors.dense(2.0, 2.0)), (2L, Vectors.dense(3.0, 3.0)), (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 5.0)), @@ -159,28 +159,28 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("setNumClusters") { - val algo = new HierarchicalClustering() + val algo = new BisectingKMeans() assert(algo.getNumClusters == 20) algo.setNumClusters(1000) assert(algo.getNumClusters == 1000) } test("setSubIterations") { - val algo = new HierarchicalClustering() + val algo = new BisectingKMeans() assert(algo.getMaxIterations == 20) algo.setMaxIterations(15) assert(algo.getMaxIterations == 15) } test("setNumRetries") { - val algo = new HierarchicalClustering() + val algo = new BisectingKMeans() assert(algo.getMaxRetries == 10) algo.setMaxRetries(15) assert(algo.getMaxRetries == 15) } test("setSeed") { - val algo = new HierarchicalClustering() + val algo = new BisectingKMeans() assert(algo.getSeed == 1) algo.setSeed(987) assert(algo.getSeed == 987) From 707609a040c7b5c8a67b266f8f3422ec7aec069d Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Fri, 3 Jul 2015 16:54:05 +0900 Subject: [PATCH 25/76] Remove the unnecessary parentheses --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 20f87c12cac8e..a1805bd603c29 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -556,7 +556,7 @@ class ClusterNode private ( this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) } - def isLeaf: Boolean = (this.children.size == 0) + def isLeaf: Boolean = this.children.size == 0 def getParent: Option[ClusterNode] = this.parent @@ -578,7 +578,7 @@ class ClusterNode private ( } private[mllib] - def setLocalHeight(height: Double) = (this.localHeight = height) + def setLocalHeight(height: Double) = this.localHeight = height /** * Converts to an adjacency list @@ -612,7 +612,7 @@ class ClusterNode private ( val leaves = nodes.filter(_.isLeaf) val notLeaves = nodes.filterNot(_.isLeaf).filter(_.getChildren.size > 1) val clusters = leaves ++ notLeaves - val treeMap = clusters.zipWithIndex.map { case (node, idx) => (node -> idx)}.toMap + val treeMap = clusters.zipWithIndex.map { case (node, idx) => node -> idx}.toMap // If a node only has one-child, the child is regarded as the cluster of the child. // Cluster A has cluster B and Cluster B. B is a leaf. C only has cluster D. 
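
At this point in the series, after the rename in PATCH 24 and the cleanups in PATCH 25, the public surface is the BisectingKMeans estimator plus the BisectingKMeansModel it returns. The following is a minimal usage sketch, not part of any patch: the SparkContext `sc` and the toy data are assumptions for illustration, and only calls that appear in the diffs above are used. The internal rework in the next patch (26) does not change these calls.

import org.apache.spark.mllib.clustering.BisectingKMeans
import org.apache.spark.mllib.linalg.Vectors

// Sketch only: assumes a live SparkContext `sc`; data and parameter values are illustrative.
// Toy data: 100 points spread over 5 obvious groups.
val data = sc.parallelize((0 until 100).map { i =>
  val label = (i % 5).toDouble
  Vectors.dense(label, label)
})

// Train with the builder-style setters shown in the diffs above.
val model = new BisectingKMeans()
  .setNumClusters(5)      // maximum number of leaf clusters
  .setMaxIterations(20)   // k-means iterations per bisecting step
  .setSeed(1)             // reproducible splits
  .run(data)

// Leaf centers, per-point prediction, and the training error.
val centers = model.getCenters                       // Array[Vector], one per leaf cluster
val cluster = model.predict(Vectors.dense(3.0, 3.0)) // index into `centers`
val error   = model.WSSSE(data)                      // Within Set Sum of Squared Error

// Dendrogram-oriented views of the cluster tree.
val edges   = model.toAdjacencyList   // Array[(fromNodeId, toNodeId, distance)]
val linkage = model.toLinkageMatrix   // Array[(node1, node2, distance, tree size)]

The adjacency list and linkage matrix expose the dendrogram structure of the cluster tree; the Java variants (toJavaAdjacencyList, toJavaLinkageMatrix) return the same rows as java.util.ArrayList of java.lang.Double values for Java and py4j interoperability, as noted in the model's comments.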
From 4e1653d1a76a2b4c8efb8ca30ec5511316b68530 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 7 Jul 2015 08:22:35 +0900 Subject: [PATCH 26/76] Change the way how to initialize the children centers --- .../mllib/clustering/BisectingKMeans.scala | 283 ++++++++---------- .../BisectingKMeansModelSuite.scala | 29 +- .../clustering/BisectingKMeansSuite.scala | 103 +++---- 3 files changed, 200 insertions(+), 215 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index a1805bd603c29..091884e603c6d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -17,19 +17,17 @@ package org.apache.spark.mllib.clustering -import scala.collection.{Map, mutable} - -import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm} +import scala.collection.{SortedSet, mutable, Map} +import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, any => breezeAny} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD -import org.apache.spark.util.random.XORShiftRandom import org.apache.spark.{Logging, SparkException} object BisectingKMeans extends Logging { - private[clustering] val ROOT_INDEX_KEY: Long = 1 + private[clustering] val ROOT_INDEX_KEY: BigInt = 1 /** * Finds the closes cluster's center @@ -62,16 +60,15 @@ object BisectingKMeans extends Logging { * @param seed a random seed */ class BisectingKMeans private ( - private var numClusters: Int, - private var clusterMap: Map[Long, ClusterNode], - private var maxIterations: Int, - private var maxRetries: Int, - private var seed: Long) extends Logging { + private var numClusters: Int, + private var clusterMap: Map[BigInt, ClusterNode], + private var maxIterations: Int, + private var seed: Long) extends Logging { /** * Constructs with the default configuration */ - def this() = this(20, mutable.ListMap.empty[Long, ClusterNode], 20, 10, 1) + def this() = this(20, mutable.ListMap.empty[BigInt, ClusterNode], 20, 1) /** * Sets the number of clusters you want @@ -93,16 +90,6 @@ class BisectingKMeans private ( def getMaxIterations: Int = this.maxIterations - /** - * Sets the number of maximum retries of each clustering step - */ - def setMaxRetries(maxRetries: Int): this.type = { - this.maxRetries = maxRetries - this - } - - def getMaxRetries: Int = this.maxRetries - /** * Sets the random seed */ @@ -125,27 +112,30 @@ class BisectingKMeans private ( var data = initData(input).cache() val startTime = System.currentTimeMillis() - // `clusters` is described as binary tree structure - // `clusters(1)` means the root of a binary tree - var clusters = summarizeAsClusters(data) - var leafClusters = clusters + // `clusterStats` is described as binary tree structure + // `clusterStats(1)` means the root of a binary tree + var clusterStats = mutable.Map.empty[BigInt, ClusterNodeStat] var step = 1 var numDividedClusters = 0 var noMoreDividable = false - var rddArray = Array.empty[RDD[(Long, BV[Double])]] + var rddArray = Array.empty[RDD[(BigInt, BV[Double])]] // the number of maximum nodes of a binary tree by given parameter val multiplier = math.ceil(math.log10(this.numClusters) / math.log10(2.0)) + 1 val maxAllNodesInTree = math.pow(2, multiplier).toInt - while (clusters.size < maxAllNodesInTree && noMoreDividable == false) { + while (clusterStats.size < 
maxAllNodesInTree && noMoreDividable == false) { log.info(s"${sc.appName} starts step ${step}") + val leafClusters = summarize(data) + val dividableLeafClusters = leafClusters.filter(_._2.isDividable) + clusterStats = clusterStats ++ leafClusters - // can be clustered if the number of divided clusters is equal to 0 - val divided = getDividedClusters(data, leafClusters) - if (divided.size == 0) { + if (dividableLeafClusters.isEmpty) { noMoreDividable = true } else { + // can be clustered if the number of divided clusterStats is equal to 0 + val divided = getDividedClusters(data, dividableLeafClusters) + // update each index val newData = updateClusterIndex(data, divided).cache() rddArray = rddArray ++ Array(data) @@ -157,24 +147,20 @@ class BisectingKMeans private ( head.unpersist() rddArray = rddArray.filterNot(_.hashCode() == head.hashCode()) } - - // merge the divided clusters with the map as the cluster tree - clusters = clusters ++ divided - numDividedClusters = data.map(_._1).distinct().count().toInt - leafClusters = divided step += 1 - - log.info(s"${sc.appName} adding ${divided.size} new clusters at step:${step}") + log.info(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") } } // unpersist kept RDDs rddArray.foreach(_.unpersist()) + val nodes = summarizeAsClusters(data, clusterStats) + // build a cluster tree by Map class which is expressed log.info(s"Building the cluster tree is started in ${sc.appName}") - val root = buildTree(clusters, BisectingKMeans.ROOT_INDEX_KEY, this.numClusters) - if (root == None) { - new SparkException("Failed to build a cluster tree from a Map type of clusters") + val root = buildTree(nodes, BisectingKMeans.ROOT_INDEX_KEY, this.numClusters) + if (root.isEmpty) { + new SparkException("Failed to build a cluster tree from a Map type of clusterStats") } // set the elapsed time for training @@ -185,7 +171,7 @@ class BisectingKMeans private ( val model = new BisectingKMeansModel(root.get) val leavesNodes = model.getClusters if (leavesNodes.length < this.numClusters) { - log.warn(s"# clusters is less than you want: ${leavesNodes.length} / ${numClusters}") + log.warn(s"# clusterStats is less than you want: ${leavesNodes.length} / ${numClusters}") } model } @@ -194,7 +180,7 @@ class BisectingKMeans private ( * Assigns the initial cluster index id to all data */ private[clustering] - def initData(data: RDD[Vector]): RDD[(Long, BV[Double])] = { + def initData(data: RDD[Vector]): RDD[(BigInt, BV[Double])] = { data.map { v: Vector => (BisectingKMeans.ROOT_INDEX_KEY, v.toBreeze)} } @@ -202,30 +188,24 @@ class BisectingKMeans private ( * Summarizes data by each cluster as ClusterTree classes */ private[clustering] - def summarizeAsClusters(data: RDD[(Long, BV[Double])]): Map[Long, ClusterNode] = { - // summarize input data - val stats = summarize(data) - - // convert statistics to ClusterTree class - stats.map { case (i, (sum, n, sumOfSquares)) => - val center = Vectors.fromBreeze(sum :/ n) - val variances = n match { - case n if n > 1 => Vectors.fromBreeze(sumOfSquares.:*(n) - (sum :* sum) :/ (n * (n - 1.0))) - case _ => Vectors.zeros(sum.size) - } - (i, new ClusterNode(center, n.toLong, variances)) - }.toMap + def summarizeAsClusters( + data: RDD[(BigInt, BV[Double])], + stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { + + stats.map { case (i, stat) => + i -> new ClusterNode(Vectors.fromBreeze(stat.center), stat.rows, breezeNorm(stat.variances, 2.0)) + } } /** * Summarizes data by each cluster as Map */ private[clustering] - 
def summarize(data: RDD[(Long, BV[Double])]): Map[Long, (BV[Double], Double, BV[Double])] = { - data.mapPartitions { iter => + def summarize(data: RDD[(BigInt, BV[Double])]): Map[BigInt, ClusterNodeStat] = { + val stats = data.mapPartitions { iter => // calculate the accumulation of the all point in a partition and count the rows - val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] - iter.foreach { case (idx: Long, point: BV[Double]) => + val map = mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] + iter.foreach { case (idx: BigInt, point: BV[Double]) => // get a map value or else get a sparse vector val (sumBV, n, sumOfSquares) = map .getOrElse(idx, (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size))) @@ -236,79 +216,45 @@ class BisectingKMeans private ( // sum the accumulation and the count in the all partition (sum1 + sum2, n1 + n2, sumOfSquares1 + sumOfSquares2) }.collect().toMap + + stats.map {case (i, stat) => i -> new ClusterNodeStat(stat._2.toLong, stat._1, stat._3)} } /** * Gets the new divided centers */ private[clustering] - def getDividedClusters(data: RDD[(Long, BV[Double])], - dividedClusters: Map[Long, ClusterNode]): Map[Long, ClusterNode] = { + def getDividedClusters(data: RDD[(BigInt, BV[Double])], + leafClusters: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNodeStat] = { val sc = data.sparkContext val appName = sc.appName // get keys of dividable clusters - val dividableKeys = dividedClusters.filter { case (idx, cluster) => - cluster.variances.toArray.sum > 0.0 && cluster.records >= 2 - }.keySet - if (dividableKeys.size == 0) { + val dividableClusters = leafClusters.filter { case (idx, cluster) => cluster.isDividable } + if (dividableClusters.isEmpty) { log.info(s"There is no dividable clusters in ${appName}.") - return Map.empty[Long, ClusterNode] + return Map.empty[BigInt, ClusterNodeStat] } // divide input data - var dividableData = data.filter { case (idx, point) => dividableKeys.contains(idx)} - val dividableClusters = dividedClusters.filter { case (k, v) => dividableKeys.contains(k)} - val idealIndexes = dividableKeys.flatMap(idx => Array(2 * idx, 2 * idx + 1).toIterator) - var stats = divide(data, dividableClusters) - - // if there are clusters which failed to be divided, retry to split the failed clusters - var tryTimes = 1 - while (stats.size < dividableKeys.size * 2 && tryTimes <= this.maxRetries) { - // get the indexes of clusters which is failed to be divided - val failedIndexes = idealIndexes.filterNot(stats.keySet.contains).map(idx => idx / 2) - val failedCenters = dividedClusters.filter { case (idx, clstr) => failedIndexes.contains(idx)} - log.info(s"# failed clusters is ${failedCenters.size} of ${dividableKeys.size}" + - s"at ${tryTimes} times in ${appName}") - - // divide the failed clusters again - val bcFailedIndexes = sc.broadcast(failedIndexes) - dividableData = data.filter { case (idx, point) => bcFailedIndexes.value.contains(idx)} - val missingStats = divide(dividableData, failedCenters) - stats = stats ++ missingStats - tryTimes += 1 - } - - // make children clusters - stats.filter { case (i, (sum, n, sumOfSquares)) => n > 0} - .map { case (i, (sum, n, sumOfSquares)) => - val center = Vectors.fromBreeze(sum :/ n) - val variances = n match { - case 1 => Vectors.sparse(sum.size, Array(), Array()) - case _ => Vectors.fromBreeze(sumOfSquares.:*(n) - (sum :* sum) :/ (n * (n - 1.0))) - } - val child = new ClusterNode(center, n.toLong, variances) - (i, child) - }.toMap + val dividableData = data.filter { 
case (idx, point) => dividableClusters.contains(idx)} + divide(dividableData, dividableClusters) } /** * Divides the input data * * @param data the pairs of cluster index and point which you want to divide - * @param clusters the clusters you want to divide AS a Map class + * @param currentStats the cluster stats you want to divide AS a Map class * @return divided clusters as Map */ private[clustering] - def divide(data: RDD[(Long, BV[Double])], - clusters: Map[Long, ClusterNode]): Map[Long, (BV[Double], Double, BV[Double])] = { + def divide( + data: RDD[(BigInt, BV[Double])], + currentStats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNodeStat] = { val sc = data.sparkContext - val centers = clusters.map { case (idx, cluster) => (idx, cluster.center.toBreeze)} - var newCenters = initChildrenCenter(centers) - if (newCenters.size == 0) { - return Map.empty[Long, (BV[Double], Double, BV[Double])] - } + var newCenters = initChildCenters(data, currentStats) var bcNewCenters = sc.broadcast(newCenters) // TODO Supports distance metrics other Euclidean distance metric @@ -316,9 +262,7 @@ class BisectingKMeans private ( val bcMetric = sc.broadcast(metric) val vectorSize = newCenters(newCenters.keySet.min).size - var stats = newCenters.keys.map { idx => - (idx, (BSV.zeros[Double](vectorSize).toVector, 0.0, BSV.zeros[Double](vectorSize).toVector)) - }.toMap + var stats = Map.empty[BigInt, (BV[Double], Double, BV[Double])] var subIter = 0 var diffVariances = Double.MaxValue @@ -327,7 +271,7 @@ class BisectingKMeans private ( while (subIter < this.maxIterations && diffVariances > 10E-4) { // calculate summary of each cluster val eachStats = data.mapPartitions { iter => - val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] + val map = mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] iter.foreach { case (idx, point) => // calculate next index number val childrenCenters = Array(2 * idx, 2 * idx + 1) @@ -366,24 +310,48 @@ class BisectingKMeans private ( oldVariances = variances subIter += 1 } - stats + + stats.map { case (i, stat) => i -> new ClusterNodeStat(stat._2.toLong, stat._1, stat._3) } } /** * Gets the initial centers for bisect k-means */ private[clustering] - def initChildrenCenter(clusters: Map[Long, BV[Double]]): Map[Long, BV[Double]] = { - val rand = new XORShiftRandom() - rand.setSeed(this.seed) - - clusters.flatMap { case (idx, center) => - val childrenIndexes = Array(2 * idx, 2 * idx + 1) - val relativeErrorCoefficient = 0.001 - Array( - (2 * idx, center.map(elm => elm - (elm * relativeErrorCoefficient * rand.nextDouble()))), - (2 * idx + 1, center.map(elm => elm + (elm * relativeErrorCoefficient * rand.nextDouble()))) - ) + def initChildCenters( + data: RDD[(BigInt, BV[Double])], + stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, BV[Double]] = { + + // Since the combination sampleByKey and groupByKey is more expensive, + // this as follows would be better. + val bcIndeces = data.sparkContext.broadcast(stats.keySet) + val samples = data.mapPartitions { iter => + val map = mutable.Map.empty[BigInt, mutable.ArrayBuffer[BV[Double]]] + + bcIndeces.value.foreach {i => map(i) = mutable.ArrayBuffer.empty[BV[Double]]} + val LOCAL_SAMPLE_SIZE = 20 + iter.foreach { case (i, point) => + map(i).append(point) + // to avoid to increase the memory usage on each map thread, + // the number of elements is cut off at the right time. 
+ if (map(i).size > LOCAL_SAMPLE_SIZE) { + val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 1.0) < breezeNorm(b, 1.0)) + map(i) = mutable.ArrayBuffer(elements.head, elements.last) + } + } + + // in order to reduce the shuffle size, take only two elements + map.filterNot(_._2.isEmpty).map { case (i, points) => + val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 1.0) < breezeNorm(b, 1.0)) + i -> mutable.ArrayBuffer(elements.head, elements.last) + }.toIterator + }.reduceByKey { case (points1, points2) => + points1.union(points2) + }.collect() + + samples.flatMap { case (i, points) => + val elements = points.toSeq.sortWith((a, b) => breezeNorm(a, 1.0) < breezeNorm(b, 1.0)) + Array((2 * i, elements.head), (2 * i + 1, elements.last)) }.toMap } @@ -396,9 +364,10 @@ class BisectingKMeans private ( * @return a built cluster tree */ private[clustering] - def buildTree(treeMap: Map[Long, ClusterNode], - rootIndex: Long, - numClusters: Int): Option[ClusterNode] = { + def buildTree( + treeMap: Map[BigInt, ClusterNode], + rootIndex: BigInt, + numClusters: Int): Option[ClusterNode] = { // if there is no index in the Map if (!treeMap.contains(rootIndex)) return None @@ -407,9 +376,9 @@ class BisectingKMeans private ( var numLeavesClusters = 1 val root = treeMap(rootIndex) var leavesQueue = Map(rootIndex -> root) - while (leavesQueue.size > 0 && numLeavesClusters < numClusters) { + while (leavesQueue.nonEmpty && numLeavesClusters < numClusters) { // pick up the cluster whose variance is the maximum in the queue - val mostScattered = leavesQueue.maxBy(_._2.variancesNorm) + val mostScattered = leavesQueue.maxBy(_._2.criterion) val mostScatteredKey = mostScattered._1 val mostScatteredCluster = mostScattered._2 @@ -446,8 +415,9 @@ class BisectingKMeans private ( */ private[clustering] def updateClusterIndex( - data: RDD[(Long, BV[Double])], - dividedClusters: Map[Long, ClusterNode]): RDD[(Long, BV[Double])] = { + data: RDD[(BigInt, BV[Double])], + dividedClusters: Map[BigInt, ClusterNodeStat]): RDD[(BigInt, BV[Double])] = { + // extract the centers of the clusters val sc = data.sparkContext var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} @@ -465,7 +435,7 @@ class BisectingKMeans private ( case s if s < 2 => (idx, point) // update the indexes case _ => { - val nextCenters = childrenIndexes.map(bcCenters.value(_)).map(_.toBreeze) + val nextCenters = childrenIndexes.map(bcCenters.value(_)) val closestIndex = BisectingKMeans .findClosestCenter(bcMetric.value)(nextCenters)(point) val nextIndex = 2 * idx + closestIndex @@ -476,31 +446,44 @@ class BisectingKMeans private ( } } +private[this] +case class ClusterNodeStat ( + rows: Long, + sums: BV[Double], + sumOfSquares: BV[Double]) extends Serializable { + + // initialization + val center: BV[Double] = sums :/ rows.toDouble + val variances: BV[Double] = rows match { + case n if n > 1 => sumOfSquares.:/(n.toDouble) - (sums :* sums).:/(n.toDouble * n.toDouble) + case _ => BV.zeros[Double](sums.size) + } + + def isDividable: Boolean = breezeAny(variances) && rows >= 2 +} + /** * A cluster as a tree node which can have its sub nodes * * @param center the center of the cluster - * @param records the number of rows in the cluster - * @param variances variance vectors - * @param variancesNorm the norm of variance vector + * @param rows the number of rows in the cluster + * @param criterion the norm of variance vector * @param localHeight the maximal distance between this node and its children * @param parent the 
parent cluster of the cluster * @param children the children nodes of the cluster */ class ClusterNode private ( - val center: Vector, - val records: Long, - val variances: Vector, - val variancesNorm: Double, - private var localHeight: Double, - private var parent: Option[ClusterNode], - private var children: Seq[ClusterNode]) extends Serializable { + val center: Vector, + val rows: Long, + val criterion: Double, + private var localHeight: Double, + private var parent: Option[ClusterNode], + private var children: Seq[ClusterNode]) extends Serializable { - require(!variancesNorm.isNaN) + require(!criterion.isNaN) - def this(center: Vector, rows: Long, variances: Vector) = - this(center, rows, variances, breezeNorm(variances.toBreeze, 2.0), - 0.0, None, Array.empty[ClusterNode]) + def this(center: Vector, rows: Long, criterion: Double) = + this(center, rows, criterion, 0.0, None, Array.empty[ClusterNode]) /** * Inserts a sub node as its child @@ -533,7 +516,7 @@ class ClusterNode private ( case _ => Array(this) ++ this.children.flatMap(child => child.toArray.toIterator) } array.sortWith { case (a, b) => - a.getDepth < b.getDepth && a.variances.toArray.sum < b.variances.toArray.sum + a.getDepth < b.getDepth && a.criterion < b.criterion } } @@ -552,11 +535,9 @@ class ClusterNode private ( /** * Gets the leaves nodes in the cluster tree */ - def getLeavesNodes: Array[ClusterNode] = { - this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) - } + def getLeavesNodes: Array[ClusterNode] = this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) - def isLeaf: Boolean = this.children.size == 0 + def isLeaf: Boolean = this.children.isEmpty def getParent: Option[ClusterNode] = this.parent @@ -627,7 +608,7 @@ class ClusterNode private ( (getIndex(treeMap, node.children.head), getIndex(treeMap, node.children(1)), node.getHeight, - node.toArray.filter(_.isLeaf).length) + node.toArray.filter(_.isLeaf).size) } } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index 6df074e34d23d..f4c4fe1cbfc4f 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -127,14 +127,14 @@ class BisectingKMeansModelSuite val adjacencyList = model.toAdjacencyList .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) assert(adjacencyList.length === 8) - assert(adjacencyList(0) === (0, 1, 1.5652)) - assert(adjacencyList(1) === (0, 6, 1.5652)) - assert(adjacencyList(2) === (1, 2, 1.3744)) - assert(adjacencyList(3) === (1, 5, 1.3744)) - assert(adjacencyList(4) === (2, 3, 0.5)) - assert(adjacencyList(5) === (2, 4, 0.5)) - assert(adjacencyList(6) === (6, 7, 2.5)) - assert(adjacencyList(7) === (6, 8, 2.5)) + assert(adjacencyList(0) === (0, 1, 3.2863)) + assert(adjacencyList(1) === (0, 8, 3.2863)) + assert(adjacencyList(2) === (1, 2, 2.3184)) + assert(adjacencyList(3) === (1, 7, 2.3184)) + assert(adjacencyList(4) === (2, 3, 1.3744)) + assert(adjacencyList(5) === (2, 6, 1.3744)) + assert(adjacencyList(6) === (3, 4, 0.5)) + assert(adjacencyList(7) === (3, 5, 0.5)) // linkage matrix val linkageMatrix = model.toLinkageMatrix @@ -142,30 +142,31 @@ class BisectingKMeansModelSuite assert(linkageMatrix.length === 4) assert(linkageMatrix(0) === (0, 1, 0.5, 2)) assert(linkageMatrix(1) === (5, 2, 1.8744, 3)) - assert(linkageMatrix(2) === (3, 4, 2.5, 2)) - 
assert(linkageMatrix(3) === (6, 7, 4.0652, 5)) + assert(linkageMatrix(2) === (6, 3, 4.1928, 4)) + assert(linkageMatrix(3) === (7, 4, 7.4791, 5)) } test("clustering should be done correctly") { - for (numClusters <- Array(9, 99, 999)) { + for (numClusters <- Array(9, 19)) { val app = new BisectingKMeans().setNumClusters(numClusters).setSeed(1) - val localData = (1 to 1000).toSeq.map { i => + val localData = (1 to 19).toSeq.map { i => val label = i % numClusters val sparseVector = Vectors.sparse(numClusters, Seq((label, label.toDouble))) val denseVector = Vectors.fromBreeze(sparseVector.toBreeze.toDenseVector) (label, denseVector, sparseVector) } + // dense version val denseData = sc.parallelize(localData.map(_._2), 2) val denseModel = app.run(denseData) assert(denseModel.getCenters.length === numClusters) - assert(denseModel.getClusters.forall(_.variancesNorm == 0.0)) + assert(denseModel.getClusters.forall(_.criterion == 0.0)) // sparse version val sparseData = sc.parallelize(localData.map(_._3), 2) val sparseModel = app.run(sparseData) assert(sparseModel.getCenters.length === numClusters) - assert(sparseModel.getClusters.forall(_.variancesNorm == 0.0)) + assert(sparseModel.getClusters.forall(_.criterion == 0.0)) } } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 8e5d95dfb2846..b93a320890b22 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -47,12 +47,12 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { } test("run") { - val algo = new BisectingKMeans().setNumClusters(123) + val algo = new BisectingKMeans().setNumClusters(123).setSeed(1) val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) assert(model.getClusters.length == 123) - assert(model.node.getHeight ~== 702.8641 absTol 10E-4) + assert(model.node.getHeight ~== 702.86414 absTol 10E-4) // check the relations between a parent cluster and its children assert(model.node.getParent === None) @@ -62,9 +62,9 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { } test("run with too many cluster size than the records") { - val algo = new BisectingKMeans().setNumClusters(123) + val algo = new BisectingKMeans().setNumClusters(123).setSeed(1) val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val data = sc.parallelize(localSeed, 2) + val data = sc.parallelize(localSeed) val model = algo.run(data) assert(model.getClusters.length == 100) assert(model.node.getHeight ~== 72.12489 absTol 10E-4) @@ -84,67 +84,77 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val seed = sc.parallelize(localSeed) val data = algo.initData(seed) - val clusters = algo.summarizeAsClusters(data) + val clusters = algo.summarize(data) assert(clusters.size === 1) - assert(clusters(1).center === Vectors.dense(49.5, 49.5)) - assert(clusters(1).records === 100) + assert(clusters(1).center === Vectors.dense(49.5, 49.5).toBreeze) + assert(clusters(1).rows === 100) - val data2 = seed.map(v => ((v.apply(0) / 25).toLong + 1L, v.toBreeze)) - val clusters2 = algo.summarizeAsClusters(data2) + val data2 = seed.map(v => (BigInt((v.apply(0) / 25).toInt + 1), v.toBreeze)) + val 
clusters2 = algo.summarize(data2) assert(clusters2.size === 4) - assert(clusters2(1).center === Vectors.dense(12.0, 12.0)) - assert(clusters2(1).records === 25) - assert(clusters2(2).center === Vectors.dense(37.0, 37.0)) - assert(clusters2(2).records === 25) - assert(clusters2(3).center === Vectors.dense(62.0, 62.0)) - assert(clusters2(3).records === 25) - assert(clusters2(4).center === Vectors.dense(87.0, 87.0)) - assert(clusters2(4).records === 25) + assert(clusters2(1).center === Vectors.dense(12.0, 12.0).toBreeze) + assert(clusters2(1).rows === 25) + assert(clusters2(2).center === Vectors.dense(37.0, 37.0).toBreeze) + assert(clusters2(2).rows === 25) + assert(clusters2(3).center === Vectors.dense(62.0, 62.0).toBreeze) + assert(clusters2(3).rows === 25) + assert(clusters2(4).center === Vectors.dense(87.0, 87.0).toBreeze) + assert(clusters2(4).rows === 25) } test("getChildrenCenter") { val algo = new BisectingKMeans - val centers = Map( - 2L -> Vectors.dense(1.0, 1.0).toBreeze, - 3L -> Vectors.dense(2.0, 2.0).toBreeze + val local = Seq( + (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), + (BigInt(3), BV[Double](1.9, 1.9)), (BigInt(3), BV[Double](2.1, 2.1)) + ) + val data = sc.parallelize(local) + val stats = Map[BigInt, ClusterNodeStat]( + BigInt(2) -> new ClusterNodeStat(2, BV[Double](1.0, 1.0) * 2.0, BV.zeros[Double](2)), + BigInt(3) -> new ClusterNodeStat(2, BV[Double](2.0, 2.0) * 2.0, BV.zeros[Double](2)) ) - val initNextCenters = algo.initChildrenCenter(centers) + val initNextCenters = algo.initChildCenters(data, stats) assert(initNextCenters.size === 4) assert(initNextCenters.keySet === Set(4, 5, 6, 7)) } test("should divide clusters") { - val algo = new BisectingKMeans - val seed = (0 to 99).map(i => ((i / 50) + 2L, Vectors.dense(i, i).toBreeze)) - val data = sc.parallelize(seed) - val clusters = algo.summarizeAsClusters(data) - val newClusters = algo.getDividedClusters(data, clusters) + val algo = new BisectingKMeans().setSeed(5) + val local = Seq( + (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), + (BigInt(2), BV[Double](9.9, 9.9)), (BigInt(2), BV[Double](10.1, 10.1)), + (BigInt(3), BV[Double](99.9, 99.9)), (BigInt(3), BV[Double](100.1, 100.1)), + (BigInt(3), BV[Double](109.9, 109.9)), (BigInt(3), BV[Double](110.1, 110.1)) + ) + val data = sc.parallelize(local) + val stats = algo.summarize(data) + val newClusters = algo.getDividedClusters(data, stats) assert(newClusters.size === 4) - assert(newClusters(4).center === Vectors.dense(12.0, 12.0)) - assert(newClusters(4).records === 25) - assert(newClusters(5).center === Vectors.dense(37.0, 37.0)) - assert(newClusters(5).records === 25) - assert(newClusters(6).center === Vectors.dense(62.0, 62.0)) - assert(newClusters(6).records === 25) - assert(newClusters(7).center === Vectors.dense(87.0, 87.0)) - assert(newClusters(7).records === 25) + assert(newClusters(4).center === BV[Double](1.0, 1.0)) + assert(newClusters(4).rows === 2) + assert(newClusters(5).center === BV[Double](10.0, 10.0)) + assert(newClusters(5).rows === 2) + assert(newClusters(6).center === BV[Double](100.0, 100.0)) + assert(newClusters(6).rows === 2) + assert(newClusters(7).center === BV[Double](110.0, 110.0)) + assert(newClusters(7).rows === 2) } test("should assign each data to new clusters") { val algo = new BisectingKMeans val seed = Seq( - (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), (2L, Vectors.dense(2.0, 2.0)), - (2L, Vectors.dense(3.0, 3.0)), (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 
5.0)), - (3L, Vectors.dense(6.0, 6.0)), (3L, Vectors.dense(7.0, 7.0)), (3L, Vectors.dense(8.0, 8.0)), - (3L, Vectors.dense(9.0, 9.0)), (3L, Vectors.dense(10.0, 10.0)), - (3L, Vectors.dense(11.0, 11.0)) + (BigInt(2), Vectors.dense(0.0, 0.0)), (BigInt(2), Vectors.dense(1.0, 1.0)), (BigInt(2), Vectors.dense(2.0, 2.0)), + (BigInt(2), Vectors.dense(3.0, 3.0)), (BigInt(2), Vectors.dense(4.0, 4.0)), (BigInt(2), Vectors.dense(5.0, 5.0)), + (BigInt(3), Vectors.dense(6.0, 6.0)), (BigInt(3), Vectors.dense(7.0, 7.0)), (BigInt(3), Vectors.dense(8.0, 8.0)), + (BigInt(3), Vectors.dense(9.0, 9.0)), (BigInt(3), Vectors.dense(10.0, 10.0)), + (BigInt(3), Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze)} val newClusters = Map( - 4L -> new ClusterNode(Vectors.dense(1.0, 1.0), 3, Vectors.dense(1.0, 1.0)), - 5L -> new ClusterNode(Vectors.dense(4.0, 4.0), 3, Vectors.dense(1.0, 1.0)), - 6L -> new ClusterNode(Vectors.dense(7.0, 7.0), 3, Vectors.dense(1.0, 1.0)), - 7L -> new ClusterNode(Vectors.dense(10.0, 10.0), 3, Vectors.dense(1.0, 1.0)) + BigInt(4) -> new ClusterNodeStat(3L, BV[Double](1.0, 1.0) :* 3.0, BV[Double](1.0, 1.0)), + BigInt(5) -> new ClusterNodeStat(3L, BV[Double](4.0, 4.0) :* 3.0, BV[Double](1.0, 1.0)), + BigInt(6) -> new ClusterNodeStat(3L, BV[Double](7.0, 7.0) :* 3.0, BV[Double](1.0, 1.0)), + BigInt(7) -> new ClusterNodeStat(3L, BV[Double](10.0, 10.0) :* 3.0, BV[Double](1.0, 1.0)) ) val data = sc.parallelize(seed) val result = algo.updateClusterIndex(data, newClusters).collect().toSeq @@ -172,13 +182,6 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { assert(algo.getMaxIterations == 15) } - test("setNumRetries") { - val algo = new BisectingKMeans() - assert(algo.getMaxRetries == 10) - algo.setMaxRetries(15) - assert(algo.getMaxRetries == 15) - } - test("setSeed") { val algo = new BisectingKMeans() assert(algo.getSeed == 1) From 6a51b129260abed13b2c869d77f53852020fe7c5 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 14 Jul 2015 10:11:48 +0900 Subject: [PATCH 27/76] Change the criterion for building a cluster tree from variance vectors to avg. costs. 
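Note on the new criterion: instead of the 2-norm of the variance vector, each cluster is now scored by the average distance between its points and its center. The following is a minimal, standalone sketch of that per-cluster average cost on plain Scala collections; it is only an illustration, not the patch's RDD-based implementation, and the names (`AverageCostSketch`, `averageCosts`) are hypothetical.

// Standalone sketch of the "average cost" criterion: for each cluster,
// the mean Euclidean distance between its points and its center.
object AverageCostSketch {
  type Point = Array[Double]

  private def euclidean(a: Point, b: Point): Double =
    math.sqrt(a.zip(b).map { case (x, y) => (x - y) * (x - y) }.sum)

  // Computes the per-cluster average distance to the cluster center.
  def averageCosts(assignments: Map[BigInt, Seq[Point]]): Map[BigInt, Double] =
    assignments.map { case (idx, points) =>
      val dim = points.head.length
      // center = coordinate-wise mean of the cluster's points
      val center = Array.tabulate(dim)(d => points.map(_(d)).sum / points.size)
      val avg = points.map(p => euclidean(p, center)).sum / points.size
      idx -> avg
    }

  def main(args: Array[String]): Unit = {
    val clusters: Map[BigInt, Seq[Point]] = Map(
      BigInt(2) -> Seq(Array(0.0, 0.0), Array(2.0, 0.0)),    // avg cost 1.0
      BigInt(3) -> Seq(Array(10.0, 10.0), Array(10.0, 10.0)) // avg cost 0.0
    )
    averageCosts(clusters).foreach { case (i, c) => println(s"cluster $i: $c") }
  }
}

The larger the average cost, the more scattered the cluster, so the tree-building step can keep picking the leaf with the maximum criterion to split next.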
--- .../mllib/clustering/BisectingKMeans.scala | 33 +++++++++++++++++-- .../clustering/BisectingKMeansSuite.scala | 2 +- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 091884e603c6d..c5d76874fb73c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -147,6 +147,7 @@ class BisectingKMeans private ( head.unpersist() rddArray = rddArray.filterNot(_.hashCode() == head.hashCode()) } + clusterStats = clusterStats ++ divided step += 1 log.info(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") } @@ -154,7 +155,7 @@ class BisectingKMeans private ( // unpersist kept RDDs rddArray.foreach(_.unpersist()) - val nodes = summarizeAsClusters(data, clusterStats) + val nodes = calcCriterions(data, clusterStats) // build a cluster tree by Map class which is expressed log.info(s"Building the cluster tree is started in ${sc.appName}") @@ -188,12 +189,38 @@ class BisectingKMeans private ( * Summarizes data by each cluster as ClusterTree classes */ private[clustering] - def summarizeAsClusters( + def calcCriterions( data: RDD[(BigInt, BV[Double])], stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { + // TODO: support other criteria, such as entropy + calcAvgConsts(data, stats) + } + + private[clustering] + def calcAvgConsts( + data: RDD[(BigInt, BV[Double])], + stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { + + val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.center }) + val costs = data.mapPartitions { iter => + val counters = mutable.Map.empty[BigInt, (Long, Double)] + bcCenters.value.foreach {case (i, center) => counters(i) = (0L, 0.0)} + iter.foreach { case (i, point) => + val cost = breezeNorm(bcCenters.value.apply(i) - point, 2.0) + counters(i) = (counters(i)._1 + 1, counters(i)._2 + cost) + } + counters.toIterator + }.reduceByKey { case((n1, cost1), (n2, cost2)) => + (n1 + n2, cost1 + cost2) + }.collectAsMap() + stats.map { case (i, stat) => - i -> new ClusterNode(Vectors.fromBreeze(stat.center), stat.rows, breezeNorm(stat.variances, 2.0)) + val avgCost = costs(i)._1 match { + case x if x == 0.0 => 0.0 + case _ => costs(i)._2 / costs(i)._1 + } + i -> new ClusterNode(Vectors.fromBreeze(stat.center), stat.rows, avgCost) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index b93a320890b22..f99e1823f94dc 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -52,7 +52,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val data = sc.parallelize(localSeed, 2) val model = algo.run(data) assert(model.getClusters.length == 123) - assert(model.node.getHeight ~== 702.86414 absTol 10E-4) + assert(model.node.getHeight ~== 705.6925 absTol 10E-4) // check the relations between a parent cluster and its children assert(model.node.getParent === None) From c8a2a1932f08cc356537ca8b96bb3b733a3eea0e Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 14 Jul 2015 15:21:04 +0900 Subject: [PATCH 28/76] Change `toArray` to avoid the TimSort error 
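The TimSort error referred to here is the JDK's "Comparison method violates its general contract!" exception, which java.util.TimSort can raise when a `sortWith` predicate is not a strict (weak) ordering: with an AND-combined test such as `a.getDepth < b.getDepth && a.criterion < b.criterion`, two distinct nodes can compare as "equal" in both directions without that equivalence being transitive. The snippet below only illustrates the difference between such a predicate and a field-by-field (lexicographic) ordering; the `Node` case class and its fields are stand-ins for illustration, not the patch's `ClusterNode`.

// Illustration only: an AND-combined sortWith predicate is not a strict
// ordering, which TimSort may reject on larger inputs; a lexicographic
// ordering compares field by field and is always consistent.
object SortContractSketch {
  final case class Node(depth: Int, criterion: Double, rows: Long)

  def main(args: Array[String]): Unit = {
    val nodes = Seq(Node(1, 1.0, 10L), Node(2, 2.0, 5L), Node(2, 0.5, 20L))

    // Risky: lt(a, b) and lt(b, a) can both be false for differing nodes,
    // so the implied "equal" relation is not transitive.
    val risky = nodes.sortWith((a, b) => a.depth < b.depth && a.criterion < b.criterion)

    // Safe: compare depth first, then criterion, then rows on ties.
    val safe = nodes.sortBy(n => (n.depth, n.criterion, n.rows))

    println(risky)
    println(safe)
  }
}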
--- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index c5d76874fb73c..9538952dbb17c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -543,7 +543,7 @@ class ClusterNode private ( case _ => Array(this) ++ this.children.flatMap(child => child.toArray.toIterator) } array.sortWith { case (a, b) => - a.getDepth < b.getDepth && a.criterion < b.criterion + a.getDepth < b.getDepth && a.criterion < b.criterion && a.rows < b.rows } } From 5f899b3e93b4d98d87d72b6b2b291cea97ed5918 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 14 Jul 2015 16:04:07 +0900 Subject: [PATCH 29/76] Format the code, since there are some validation problems --- .../spark/mllib/clustering/BisectingKMeans.scala | 6 ++++-- .../spark/mllib/clustering/BisectingKMeansSuite.scala | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 9538952dbb17c..7780fec90ad42 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -352,7 +352,7 @@ class BisectingKMeans private ( // Since the combination sampleByKey and groupByKey is more expensive, // this as follows would be better. val bcIndeces = data.sparkContext.broadcast(stats.keySet) - val samples = data.mapPartitions { iter => + val samples = data.mapPartitions { iter => val map = mutable.Map.empty[BigInt, mutable.ArrayBuffer[BV[Double]]] bcIndeces.value.foreach {i => map(i) = mutable.ArrayBuffer.empty[BV[Double]]} @@ -562,7 +562,9 @@ class ClusterNode private ( /** * Gets the leaves nodes in the cluster tree */ - def getLeavesNodes: Array[ClusterNode] = this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) + def getLeavesNodes: Array[ClusterNode] = { + this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) + } def isLeaf: Boolean = this.children.isEmpty diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index f99e1823f94dc..95b4f59772481 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -144,11 +144,12 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("should assign each data to new clusters") { val algo = new BisectingKMeans val seed = Seq( - (BigInt(2), Vectors.dense(0.0, 0.0)), (BigInt(2), Vectors.dense(1.0, 1.0)), (BigInt(2), Vectors.dense(2.0, 2.0)), - (BigInt(2), Vectors.dense(3.0, 3.0)), (BigInt(2), Vectors.dense(4.0, 4.0)), (BigInt(2), Vectors.dense(5.0, 5.0)), - (BigInt(3), Vectors.dense(6.0, 6.0)), (BigInt(3), Vectors.dense(7.0, 7.0)), (BigInt(3), Vectors.dense(8.0, 8.0)), - (BigInt(3), Vectors.dense(9.0, 9.0)), (BigInt(3), Vectors.dense(10.0, 10.0)), - (BigInt(3), Vectors.dense(11.0, 11.0)) + (BigInt(2), Vectors.dense(0.0, 0.0)), (BigInt(2), Vectors.dense(1.0, 1.0)), + (BigInt(2), Vectors.dense(2.0, 2.0)), 
(BigInt(2), Vectors.dense(3.0, 3.0)), + (BigInt(2), Vectors.dense(4.0, 4.0)), (BigInt(2), Vectors.dense(5.0, 5.0)), + (BigInt(3), Vectors.dense(6.0, 6.0)), (BigInt(3), Vectors.dense(7.0, 7.0)), + (BigInt(3), Vectors.dense(8.0, 8.0)), (BigInt(3), Vectors.dense(9.0, 9.0)), + (BigInt(3), Vectors.dense(10.0, 10.0)), (BigInt(3), Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze)} val newClusters = Map( BigInt(4) -> new ClusterNodeStat(3L, BV[Double](1.0, 1.0) :* 3.0, BV[Double](1.0, 1.0)), From 313e87f013d92b178b0c231f5e905eb19f45d34c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 14 Jul 2015 16:55:59 +0900 Subject: [PATCH 30/76] Remove unnecesary comment and import --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 7780fec90ad42..8fd14b2ffa0af 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.clustering -import scala.collection.{SortedSet, mutable, Map} +import scala.collection.{mutable, Map} import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, any => breezeAny} import org.apache.spark.mllib.linalg.{Vector, Vectors} @@ -56,7 +56,6 @@ object BisectingKMeans extends Logging { * @param numClusters tne number of clusters you want * @param clusterMap the pairs of cluster and its index as Map * @param maxIterations the number of maximal iterations - * @param maxRetries the number of maximum retries * @param seed a random seed */ class BisectingKMeans private ( From 3f6b14a7b2b901e1c479ec0c7fd64f3482809338 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 15 Jul 2015 15:30:46 +0900 Subject: [PATCH 31/76] Fix a typo and a few comments --- .../apache/spark/mllib/clustering/BisectingKMeans.scala | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 8fd14b2ffa0af..977cb14f7a3a0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -185,7 +185,7 @@ class BisectingKMeans private ( } /** - * Summarizes data by each cluster as ClusterTree classes + * Calculates criterions for building cluster tree */ private[clustering] def calcCriterions( @@ -193,11 +193,14 @@ class BisectingKMeans private ( stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { // TODO: support other criteria, such as entropy - calcAvgConsts(data, stats) + calcAvgCosts(data, stats) } + /** + * Calculates the average costs of each cluster + */ private[clustering] - def calcAvgConsts( + def calcAvgCosts( data: RDD[(BigInt, BV[Double])], stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { From 52b47049a12e819496d41ed5a96b396472f4ac18 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 29 Jul 2015 14:42:09 +0900 Subject: [PATCH 32/76] Add a new line above spark project classes --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 1 + 1 file changed, 1 insertion(+) diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 977cb14f7a3a0..fbdbbd9ef3c53 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -20,6 +20,7 @@ package org.apache.spark.mllib.clustering import scala.collection.{mutable, Map} import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, any => breezeAny} + import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.{Logging, SparkException} From fe87715d86c0f54cfb2422670b4a70c759473277 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 29 Jul 2015 14:43:47 +0900 Subject: [PATCH 33/76] Arrange the order of import statements --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- .../spark/mllib/clustering/BisectingKMeansModelSuite.scala | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index fbdbbd9ef3c53..49a237be037ca 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -21,9 +21,9 @@ import scala.collection.{mutable, Map} import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, any => breezeAny} +import org.apache.spark.{Logging, SparkException} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD -import org.apache.spark.{Logging, SparkException} object BisectingKMeans extends Logging { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index f4c4fe1cbfc4f..a7fafb67a76e4 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -17,10 +17,11 @@ package org.apache.spark.mllib.clustering +import org.scalatest.BeforeAndAfterEach + import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.scalatest.BeforeAndAfterEach class BisectingKMeansModelSuite extends SparkFunSuite with MLlibTestSparkContext with BeforeAndAfterEach { From eeef1e7b1ab09818e3b0389be24dc53af3ead32d Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 29 Jul 2015 15:15:58 +0900 Subject: [PATCH 34/76] Use `isDefined`, instead of `!= None` --- .../apache/spark/mllib/clustering/BisectingKMeansSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 95b4f59772481..04640a0fc97cc 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -58,7 +58,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model.node.getParent === None) assert(model.node.getChildren.head.getParent.get === model.node) 
assert(model.node.getChildren.apply(1).getParent.get === model.node) - assert(model.getClusters.forall(_.getParent != None)) + assert(model.getClusters.forall(_.getParent.isDefined)) } test("run with too many cluster size than the records") { From 3156dd7e0c4fc4f5d1b1116bb689592eee76ec25 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 21 Oct 2015 15:38:46 -0700 Subject: [PATCH 35/76] Update the bisecting k-means - Add the description about the algorithm and the criterion for build a cluster tree - Extract utility methods to `BisectingKMeans` object - Make `BisectingKMeans` `private[clustering]` - Add `@Since` tags to public methods, parameters and variables --- .../mllib/clustering/BisectingKMeans.scala | 551 ++++++++++-------- .../clustering/BisectingKMeansModel.scala | 20 +- .../clustering/JavaBisectingKMeansSuite.java | 6 +- .../BisectingKMeansModelSuite.scala | 10 +- .../clustering/BisectingKMeansSuite.scala | 192 +++--- 5 files changed, 416 insertions(+), 363 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 49a237be037ca..a44cbd32ac5fe 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -17,36 +17,16 @@ package org.apache.spark.mllib.clustering -import scala.collection.{mutable, Map} +import scala.collection.{Map, mutable} -import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, any => breezeAny} +import breeze.linalg.{SparseVector => BSV, Vector => BV, any => breezeAny, norm => breezeNorm} import org.apache.spark.{Logging, SparkException} +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD -object BisectingKMeans extends Logging { - - private[clustering] val ROOT_INDEX_KEY: BigInt = 1 - - /** - * Finds the closes cluster's center - * - * @param metric a distance metric - * @param centers centers of the clusters - * @param point a target point - * @return an index of the array of clusters - */ - private[mllib] - def findClosestCenter(metric: (BV[Double], BV[Double]) => Double) - (centers: Seq[BV[Double]])(point: BV[Double]): Int = { - val (closestCenter, closestIndex) = - centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1) - closestIndex - } -} - /** * This is a divisive hierarchical clustering algorithm based on bisecting k-means algorithm. * @@ -54,78 +34,109 @@ object BisectingKMeans extends Logging { * M. Steinbach, G. Karypis and V. Kumar. Workshop on Text Mining, KDD, 2000. * http://cs.fit.edu/~pkc/classes/ml-internet/papers/steinbach00tr.pdf * - * @param numClusters tne number of clusters you want + * However, we modified it to fit for Spark. This algorithm consists of the two main parts. + * + * 1. Split clusters until the number of clusters will be enough to build a cluster tree + * 2. Build a cluster tree as a binary tree by the splitted clusters + * + * First, it splits clusters to their children clusters step by step, not considering a cluster + * will be included in the final cluster tree or not. That's because it makes the algorithm more + * efficient on Spark and splitting a cluster one by one is very slow. It will keep splitting until + * the number of clusters will be enough to build a cluster tree. 
Otherwise, it will stop splitting + * when there are no dividable clusters before the number of clusters will be sufficient. And + * it calculates the criterions, such as average cost, entropy and so on, for building a cluster + * tree in the first part. The criterion means how large the cluster is. That is, the cluster + * whose criterion is maximum of all the clusters is the largest cluster. + * + * Second, it builds a cluster tree as a binary tree by the result of the first part. + * First of all, the cluster tree starts with only the root cluster which includes all points. + * So, there are two candidates which can be merged to the cluster tree. Those are the children of + * the root. Then, it picks up the larger child of the two and merge it to the cluster tree. + * After that, there are tree candidates to merge. Those are the smaller child of the root and + * the two children of the larger cluster of the root. It picks up the largest cluster of the tree + * and merge it to the * cluster tree. Like this, it continues to pick up the largest one of the + * candidates and merge it to the cluster tree until the desired number of clusters is reached. + * + * @param k tne desired number of clusters * @param clusterMap the pairs of cluster and its index as Map - * @param maxIterations the number of maximal iterations + * @param maxIterations the number of maximal iterations to split clusters * @param seed a random seed */ +@Since("1.6.0") class BisectingKMeans private ( - private var numClusters: Int, - private var clusterMap: Map[BigInt, ClusterNode], + private var k: Int, + private var clusterMap: Map[BigInt, BisectingClusterNode], private var maxIterations: Int, private var seed: Long) extends Logging { /** * Constructs with the default configuration */ - def this() = this(20, mutable.ListMap.empty[BigInt, ClusterNode], 20, 1) + @Since("1.6.0") + def this() = this(20, mutable.ListMap.empty[BigInt, BisectingClusterNode], 20, 1) /** * Sets the number of clusters you want */ - def setNumClusters(numClusters: Int): this.type = { - this.numClusters = numClusters + @Since("1.6.0") + def setK(k: Int): this.type = { + this.k = k this } - def getNumClusters: Int = this.numClusters + @Since("1.6.0") + def getK: Int = this.k /** * Sets the number of maximal iterations in each clustering step */ + @Since("1.6.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this } + @Since("1.6.0") def getMaxIterations: Int = this.maxIterations /** * Sets the random seed */ + @Since("1.6.0") def setSeed(seed: Long): this.type = { this.seed = seed this } + @Since("1.6.0") def getSeed: Long = this.seed /** - * Runs the bisecting kmeans algorithm + * Runs the bisecting k-means algorithm * @param input RDD of vectors * @return model for the bisecting kmeans */ + @Since("1.6.0") def run(input: RDD[Vector]): BisectingKMeansModel = { val sc = input.sparkContext - log.info(s"${sc.appName} starts a bisecting kmeans algorithm") - - var data = initData(input).cache() - val startTime = System.currentTimeMillis() // `clusterStats` is described as binary tree structure // `clusterStats(1)` means the root of a binary tree - var clusterStats = mutable.Map.empty[BigInt, ClusterNodeStat] + var clusterStats = mutable.Map.empty[BigInt, BisectingClusterStat] var step = 1 - var numDividedClusters = 0 var noMoreDividable = false var rddArray = Array.empty[RDD[(BigInt, BV[Double])]] // the number of maximum nodes of a binary tree by given parameter - val multiplier = 
math.ceil(math.log10(this.numClusters) / math.log10(2.0)) + 1 + val multiplier = math.ceil(math.log10(this.k) / math.log10(2.0)) + 1 val maxAllNodesInTree = math.pow(2, multiplier).toInt + // divide clusters until the number of clusters reachs the condition + // or there is no dividable cluster + val startTime = System.currentTimeMillis() + var data = BisectingKMeans.initData(input).cache() while (clusterStats.size < maxAllNodesInTree && noMoreDividable == false) { - log.info(s"${sc.appName} starts step ${step}") - val leafClusters = summarize(data) + logInfo(s"${sc.appName} starts step ${step}") + val leafClusters = BisectingKMeans.summarizeClusters(data) val dividableLeafClusters = leafClusters.filter(_._2.isDividable) clusterStats = clusterStats ++ leafClusters @@ -134,13 +145,12 @@ class BisectingKMeans private ( } else { // can be clustered if the number of divided clusterStats is equal to 0 - val divided = getDividedClusters(data, dividableLeafClusters) - + val divided = + BisectingKMeans.divideClusters(data, dividableLeafClusters, maxIterations) // update each index - val newData = updateClusterIndex(data, divided).cache() + val newData = BisectingKMeans.updateClusterIndex(data, divided).cache() rddArray = rddArray ++ Array(data) data = newData - // keep recent 2 cached RDDs in order to run more quickly if (rddArray.length > 1) { val head = rddArray.head @@ -149,96 +159,69 @@ class BisectingKMeans private ( } clusterStats = clusterStats ++ divided step += 1 - log.info(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") + logInfo(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") } } // unpersist kept RDDs rddArray.foreach(_.unpersist()) - - val nodes = calcCriterions(data, clusterStats) + // create a map of cluster node with their criterions + val nodes = BisectingKMeans.createClusterNodes(data, clusterStats) // build a cluster tree by Map class which is expressed - log.info(s"Building the cluster tree is started in ${sc.appName}") - val root = buildTree(nodes, BisectingKMeans.ROOT_INDEX_KEY, this.numClusters) + logInfo(s"Building the cluster tree is started in ${sc.appName}") + val root = BisectingKMeans.buildTree(nodes, BisectingKMeans.ROOT_INDEX_KEY, this.k) if (root.isEmpty) { new SparkException("Failed to build a cluster tree from a Map type of clusterStats") } // set the elapsed time for training val finishTime = (System.currentTimeMillis() - startTime) / 1000.0 - log.info(s"Elapsed Time for ${this.getClass.getSimpleName} Training: ${finishTime} [sec]") + logInfo(s"Elapsed Time for ${this.getClass.getSimpleName} Training: ${finishTime} [sec]") // make a bisecting kmeans model val model = new BisectingKMeansModel(root.get) val leavesNodes = model.getClusters - if (leavesNodes.length < this.numClusters) { - log.warn(s"# clusterStats is less than you want: ${leavesNodes.length} / ${numClusters}") + if (leavesNodes.length < this.k) { + logWarning(s"# clusters is less than you want: ${leavesNodes.length} / ${k}") } model } +} - /** - * Assigns the initial cluster index id to all data - */ - private[clustering] - def initData(data: RDD[Vector]): RDD[(BigInt, BV[Double])] = { - data.map { v: Vector => (BisectingKMeans.ROOT_INDEX_KEY, v.toBreeze)} - } - /** - * Calculates criterions for building cluster tree - */ - private[clustering] - def calcCriterions( - data: RDD[(BigInt, BV[Double])], - stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { +private[clustering] object BisectingKMeans { - // TODO: support other criteria, 
such as entropy - calcAvgCosts(data, stats) - } + val ROOT_INDEX_KEY: BigInt = 1 /** - * Calculates the average costs of each cluster + * Finds the closes cluster's center + * + * @param metric a distance metric + * @param centers centers of the clusters + * @param point a target point + * @return an index of the array of clusters */ - private[clustering] - def calcAvgCosts( - data: RDD[(BigInt, BV[Double])], - stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { - - val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.center }) - val costs = data.mapPartitions { iter => - val counters = mutable.Map.empty[BigInt, (Long, Double)] - bcCenters.value.foreach {case (i, center) => counters(i) = (0L, 0.0)} - iter.foreach { case (i, point) => - val cost = breezeNorm(bcCenters.value.apply(i) - point, 2.0) - counters(i) = (counters(i)._1 + 1, counters(i)._2 + cost) - } - counters.toIterator - }.reduceByKey { case((n1, cost1), (n2, cost2)) => - (n1 + n2, cost1 + cost2) - }.collectAsMap() - - stats.map { case (i, stat) => - val avgCost = costs(i)._1 match { - case x if x == 0.0 => 0.0 - case _ => costs(i)._2 / costs(i)._1 - } - i -> new ClusterNode(Vectors.fromBreeze(stat.center), stat.rows, avgCost) - } + def findClosestCenter(metric: (BV[Double], BV[Double]) => Double) + (centers: Seq[BV[Double]])(point: BV[Double]): Int = { + val (closestCenter, closestIndex) = + centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1) + closestIndex } /** * Summarizes data by each cluster as Map + * + * @param data pairs of point and its cluster index */ - private[clustering] - def summarize(data: RDD[(BigInt, BV[Double])]): Map[BigInt, ClusterNodeStat] = { + def summarizeClusters(data: RDD[(BigInt, BV[Double])]): Map[BigInt, BisectingClusterStat] = { + val stats = data.mapPartitions { iter => // calculate the accumulation of the all point in a partition and count the rows val map = mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] iter.foreach { case (idx: BigInt, point: BV[Double]) => // get a map value or else get a sparse vector val (sumBV, n, sumOfSquares) = map - .getOrElse(idx, (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size))) + .getOrElse(idx, (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size))) map(idx) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) } map.toIterator @@ -247,76 +230,152 @@ class BisectingKMeans private ( (sum1 + sum2, n1 + n2, sumOfSquares1 + sumOfSquares2) }.collect().toMap - stats.map {case (i, stat) => i -> new ClusterNodeStat(stat._2.toLong, stat._1, stat._3)} + stats.map {case (i, stat) => i -> new BisectingClusterStat(stat._2.toLong, stat._1, stat._3)} } /** - * Gets the new divided centers + * Assigns the initial cluster index id to all data */ - private[clustering] - def getDividedClusters(data: RDD[(BigInt, BV[Double])], - leafClusters: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNodeStat] = { - val sc = data.sparkContext - val appName = sc.appName + def initData(data: RDD[Vector]): RDD[(BigInt, BV[Double])] = { + data.map { v: Vector => (BisectingKMeans.ROOT_INDEX_KEY, v.toBreeze)} + } - // get keys of dividable clusters - val dividableClusters = leafClusters.filter { case (idx, cluster) => cluster.isDividable } - if (dividableClusters.isEmpty) { - log.info(s"There is no dividable clusters in ${appName}.") - return Map.empty[BigInt, ClusterNodeStat] - } + /** + * Gets the initial centers for bisect k-means + * + * @param data pairs of 
point and its cluster index + * @param stats pairs of cluster index and cluster statistics + */ + def initNextCenters( + data: RDD[(BigInt, BV[Double])], + stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BV[Double]] = { - // divide input data - val dividableData = data.filter { case (idx, point) => dividableClusters.contains(idx)} - divide(dividableData, dividableClusters) + // Since the combination sampleByKey and groupByKey is more expensive, + // this as follows would be better. + val bcIndeces = data.sparkContext.broadcast(stats.keySet) + val samples = data.mapPartitions { iter => + val map = mutable.Map.empty[BigInt, mutable.ArrayBuffer[BV[Double]]] + + bcIndeces.value.foreach {i => map(i) = mutable.ArrayBuffer.empty[BV[Double]]} + val LOCAL_SAMPLE_SIZE = 100 + iter.foreach { case (i, point) => + map(i).append(point) + // to avoid to increase the memory usage on each map thread, + // the number of elements is cut off at the right time. + if (map(i).size > LOCAL_SAMPLE_SIZE) { + val elements = map(i).sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) + map(i) = mutable.ArrayBuffer(elements.head, elements.last) + } + } + + // in order to reduce the shuffle size, take only two elements + map.filterNot(_._2.isEmpty).map { case (i, points) => + val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) + i -> mutable.ArrayBuffer(elements.head, elements.last) + }.toIterator + }.reduceByKey { case (points1, points2) => + points1.union(points2) + }.collect() + + val nextCenters = samples.flatMap { case (i, points) => + val elements = points.toSeq.sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) + Array((2 * i, elements.head), (2 * i + 1, elements.last)) + }.toMap + if (!stats.keySet.flatMap(idx => Array(2 * idx, 2 * idx + 1)).forall(nextCenters.contains(_))) { + throw new SparkException("Failed to initialize centers for next step") + } + nextCenters } /** - * Divides the input data + * Updates the indexes of clusters which is divided to its children indexes * - * @param data the pairs of cluster index and point which you want to divide - * @param currentStats the cluster stats you want to divide AS a Map class - * @return divided clusters as Map + * @param data pairs of point and its cluster index + * @param dividedClusters pairs of cluster index and cluster statistics */ - private[clustering] - def divide( + def updateClusterIndex( data: RDD[(BigInt, BV[Double])], - currentStats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNodeStat] = { + dividedClusters: Map[BigInt, BisectingClusterStat]): RDD[(BigInt, BV[Double])] = { + // extract the centers of the clusters val sc = data.sparkContext - var newCenters = initChildCenters(data, currentStats) - var bcNewCenters = sc.broadcast(newCenters) + var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} + val bcCenters = sc.broadcast(centers) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val bcMetric = sc.broadcast(metric) - val vectorSize = newCenters(newCenters.keySet.min).size + // update the indexes to their children indexes + data.map { case (idx, point) => + val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(c => bcCenters.value.contains(c)) + childrenIndexes.length match { + // stay the index if the number of children is not enough + case s if s < 2 => (idx, point) + // update the indexes + case _ => { + val nextCenters = 
childrenIndexes.map(bcCenters.value(_)) + val closestIndex = BisectingKMeans + .findClosestCenter(bcMetric.value)(nextCenters)(point) + val nextIndex = 2 * idx + closestIndex + (nextIndex, point) + } + } + } + } + + /** + * Divides clusters according to their statistics + * + * @param data pairs of point and its cluster index + * @param targetClusters target clusters to divide + * @param maxIterations the maximum iterations to calculate clusters statistics + */ + def divideClusters( + data: RDD[(BigInt, BV[Double])], + targetClusters: Map[BigInt, BisectingClusterStat], + maxIterations: Int): Map[BigInt, BisectingClusterStat] = { + val sc = data.sparkContext + val appName = sc.appName + + // get keys of dividable clusters + val dividableClusters = targetClusters.filter { case (idx, cluster) => cluster.isDividable } + if (dividableClusters.isEmpty) { + return Map.empty[BigInt, BisectingClusterStat] + } + // extract dividable input data + val dividableData = data.filter { case (idx, point) => dividableClusters.contains(idx)} + + var newCenters = BisectingKMeans.initNextCenters(dividableData, dividableClusters) + var bcNewCenters = sc.broadcast(newCenters) + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val bcMetric = sc.broadcast(metric) var stats = Map.empty[BigInt, (BV[Double], Double, BV[Double])] var subIter = 0 - var diffVariances = Double.MaxValue - var oldVariances = Double.MaxValue - var variances = Double.MaxValue - while (subIter < this.maxIterations && diffVariances > 10E-4) { + var totalStd = Double.MaxValue + var oldTotalStd = Double.MaxValue + var relativeError = Double.MaxValue + while (subIter < maxIterations && relativeError > 10E-4) { // calculate summary of each cluster - val eachStats = data.mapPartitions { iter => + val eachStats = dividableData.mapPartitions { iter => val map = mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] iter.foreach { case (idx, point) => // calculate next index number val childrenCenters = Array(2 * idx, 2 * idx + 1) - .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) - if (childrenCenters.length >= 1) { + .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) + if (childrenCenters.length == 2) { val closestIndex = BisectingKMeans.findClosestCenter(bcMetric.value)(childrenCenters)(point) val nextIndex = 2 * idx + closestIndex // get a map value or else get a sparse vector val (sumBV, n, sumOfSquares) = map - .getOrElse( - nextIndex, - (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) - ) + .getOrElse( + nextIndex, + (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + ) map(nextIndex) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) } } @@ -333,56 +392,60 @@ class BisectingKMeans private ( // update summary of each cluster stats = eachStats.toMap - variances = stats.map { case (idx, (sum, n, sumOfSquares)) => - math.pow(sumOfSquares.toArray.sum, 1.0 / sumOfSquares.size) + totalStd = stats.map { case (idx, (sum, n, sumOfSquares)) => + sum.toArray.zip(sumOfSquares.toArray).map { case (s, ss) => + math.pow(ss / n - math.pow(s / n, 2), 2.0) + }.sum }.sum - diffVariances = math.abs(oldVariances - variances) / oldVariances - oldVariances = variances + relativeError = math.abs(oldTotalStd - totalStd) / totalStd + oldTotalStd = totalStd subIter += 1 } - - stats.map { case (i, stat) => i -> new ClusterNodeStat(stat._2.toLong, stat._1, stat._3) } + stats.map { 
case (i, stat) => i -> new BisectingClusterStat(stat._2.toLong, stat._1, stat._3) } } /** - * Gets the initial centers for bisect k-means + * Creates the map of cluster stats to the map of cluster nodes with their criterions + * + * @param data input data + * @param stats map of cluster stats which is described as a binary tree */ - private[clustering] - def initChildCenters( + def createClusterNodes( data: RDD[(BigInt, BV[Double])], - stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, BV[Double]] = { + stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BisectingClusterNode] = { - // Since the combination sampleByKey and groupByKey is more expensive, - // this as follows would be better. - val bcIndeces = data.sparkContext.broadcast(stats.keySet) - val samples = data.mapPartitions { iter => - val map = mutable.Map.empty[BigInt, mutable.ArrayBuffer[BV[Double]]] + // TODO: support other criterion, such as entropy + createClusterNodesWithAverageCost(data, stats) + } - bcIndeces.value.foreach {i => map(i) = mutable.ArrayBuffer.empty[BV[Double]]} - val LOCAL_SAMPLE_SIZE = 20 + /** + * Creates the map of cluster stats to the map of cluster nodes with their average costs + */ + private def createClusterNodesWithAverageCost( + data: RDD[(BigInt, BV[Double])], + stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BisectingClusterNode] = { + + // calculate average costs of all clusters + val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.center }) + val costs = data.mapPartitions { iter => + val counters = mutable.Map.empty[BigInt, (Long, Double)] + bcCenters.value.foreach {case (i, center) => counters(i) = (0L, 0.0)} iter.foreach { case (i, point) => - map(i).append(point) - // to avoid to increase the memory usage on each map thread, - // the number of elements is cut off at the right time. 
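// Purely illustrative side note (not part of this patch): the same per-cluster average
// cost could also be written with map + reduceByKey instead of a hand-rolled mutable map
// per partition. `data` and `centers` are assumed to have the shapes used above
// (an RDD[(BigInt, BV[Double])] of (cluster index, point) pairs and a map of cluster
// centers); the helper name `averageCosts` is invented for this sketch.
import breeze.linalg.{Vector => BV, norm => breezeNorm}
import org.apache.spark.rdd.RDD

def averageCosts(
    data: RDD[(BigInt, BV[Double])],
    centers: Map[BigInt, BV[Double]]): scala.collection.Map[BigInt, Double] = {
  val bcCenters = data.sparkContext.broadcast(centers)
  data
    // one (count, distance to own center) record per point
    .map { case (i, point) => (i, (1L, breezeNorm(bcCenters.value(i) - point, 2.0))) }
    .reduceByKey { case ((n1, d1), (n2, d2)) => (n1 + n2, d1 + d2) }
    .mapValues { case (n, total) => if (n == 0L) 0.0 else total / n }
    .collectAsMap()
}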
- if (map(i).size > LOCAL_SAMPLE_SIZE) { - val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 1.0) < breezeNorm(b, 1.0)) - map(i) = mutable.ArrayBuffer(elements.head, elements.last) - } + val cost = breezeNorm(bcCenters.value.apply(i) - point, 2.0) + counters(i) = (counters(i)._1 + 1, counters(i)._2 + cost) } + counters.toIterator + }.reduceByKey { case((n1, cost1), (n2, cost2)) => + (n1 + n2, cost1 + cost2) + }.collectAsMap() - // in order to reduce the shuffle size, take only two elements - map.filterNot(_._2.isEmpty).map { case (i, points) => - val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 1.0) < breezeNorm(b, 1.0)) - i -> mutable.ArrayBuffer(elements.head, elements.last) - }.toIterator - }.reduceByKey { case (points1, points2) => - points1.union(points2) - }.collect() - - samples.flatMap { case (i, points) => - val elements = points.toSeq.sortWith((a, b) => breezeNorm(a, 1.0) < breezeNorm(b, 1.0)) - Array((2 * i, elements.head), (2 * i + 1, elements.last)) - }.toMap + stats.map { case (i, stat) => + val avgCost = costs(i)._1 match { + case x if x == 0.0 => 0.0 + case _ => costs(i)._2 / costs(i)._1 + } + i -> new BisectingClusterNode(Vectors.fromBreeze(stat.center), stat.rows, avgCost) + } } /** @@ -393,11 +456,10 @@ class BisectingKMeans private ( * @param numClusters the number of clusters you want * @return a built cluster tree */ - private[clustering] - def buildTree( - treeMap: Map[BigInt, ClusterNode], + private def buildTree( + treeMap: Map[BigInt, BisectingClusterNode], rootIndex: BigInt, - numClusters: Int): Option[ClusterNode] = { + numClusters: Int): Option[BisectingClusterNode] = { // if there is no index in the Map if (!treeMap.contains(rootIndex)) return None @@ -407,7 +469,7 @@ class BisectingKMeans private ( val root = treeMap(rootIndex) var leavesQueue = Map(rootIndex -> root) while (leavesQueue.nonEmpty && numLeavesClusters < numClusters) { - // pick up the cluster whose variance is the maximum in the queue + // pick up the largest cluster by the maximum criterion of all the clusters val mostScattered = leavesQueue.maxBy(_._2.criterion) val mostScatteredKey = mostScattered._1 val mostScatteredCluster = mostScattered._2 @@ -423,7 +485,7 @@ class BisectingKMeans private ( // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val localHeight = children - .map(child => metric(child.center.toBreeze, mostScatteredCluster.center.toBreeze)).max + .map(child => metric(child.center.toBreeze, mostScatteredCluster.center.toBreeze)).max mostScatteredCluster.setLocalHeight(localHeight) // update the queue @@ -433,63 +495,9 @@ class BisectingKMeans private ( // remove the cluster which is involved to the cluster tree leavesQueue = leavesQueue.filterNot(_ == mostScattered) - - log.info(s"Total Leaves Clusters: ${numLeavesClusters} / ${numClusters}. 
" + - s"Cluster ${childrenIndexes.mkString(",")} are merged.") } Some(root) } - - /** - * Updates the indexes of clusters which is divided to its children indexes - */ - private[clustering] - def updateClusterIndex( - data: RDD[(BigInt, BV[Double])], - dividedClusters: Map[BigInt, ClusterNodeStat]): RDD[(BigInt, BV[Double])] = { - - // extract the centers of the clusters - val sc = data.sparkContext - var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} - val bcCenters = sc.broadcast(centers) - - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val bcMetric = sc.broadcast(metric) - - // update the indexes to their children indexes - data.map { case (idx, point) => - val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(c => bcCenters.value.contains(c)) - childrenIndexes.length match { - // stay the index if the number of children is not enough - case s if s < 2 => (idx, point) - // update the indexes - case _ => { - val nextCenters = childrenIndexes.map(bcCenters.value(_)) - val closestIndex = BisectingKMeans - .findClosestCenter(bcMetric.value)(nextCenters)(point) - val nextIndex = 2 * idx + closestIndex - (nextIndex, point) - } - } - } - } -} - -private[this] -case class ClusterNodeStat ( - rows: Long, - sums: BV[Double], - sumOfSquares: BV[Double]) extends Serializable { - - // initialization - val center: BV[Double] = sums :/ rows.toDouble - val variances: BV[Double] = rows match { - case n if n > 1 => sumOfSquares.:/(n.toDouble) - (sums :* sums).:/(n.toDouble * n.toDouble) - case _ => BV.zeros[Double](sums.size) - } - - def isDividable: Boolean = breezeAny(variances) && rows >= 2 } /** @@ -497,30 +505,33 @@ case class ClusterNodeStat ( * * @param center the center of the cluster * @param rows the number of rows in the cluster - * @param criterion the norm of variance vector + * @param criterion how large a cluster is * @param localHeight the maximal distance between this node and its children * @param parent the parent cluster of the cluster * @param children the children nodes of the cluster */ -class ClusterNode private ( - val center: Vector, - val rows: Long, - val criterion: Double, +@Since("1.6.0") +class BisectingClusterNode private ( + @Since("1.6.0") val center: Vector, + @Since("1.6.0") val rows: Long, + @Since("1.6.0") val criterion: Double, private var localHeight: Double, - private var parent: Option[ClusterNode], - private var children: Seq[ClusterNode]) extends Serializable { + private var parent: Option[BisectingClusterNode], + private var children: Seq[BisectingClusterNode]) extends Serializable { require(!criterion.isNaN) + @Since("1.6.0") def this(center: Vector, rows: Long, criterion: Double) = - this(center, rows, criterion, 0.0, None, Array.empty[ClusterNode]) + this(center, rows, criterion, 0.0, None, Array.empty[BisectingClusterNode]) /** * Inserts a sub node as its child * * @param child inserted sub node */ - def insert(child: ClusterNode) { + @Since("1.6.0") + def insert(child: BisectingClusterNode) { insert(Array(child)) } @@ -529,7 +540,8 @@ class ClusterNode private ( * * @param children inserted sub nodes */ - def insert(children: Array[ClusterNode]) { + @Since("1.6.0") + def insert(children: Array[BisectingClusterNode]) { this.children = this.children ++ children children.foreach(child => child.parent = Some(this)) } @@ -540,7 +552,8 @@ class ClusterNode private ( * * @return an Array class which the cluster tree is expanded 
*/ - def toArray: Array[ClusterNode] = { + @Since("1.6.0") + def toArray: Array[BisectingClusterNode] = { val array = this.children.size match { case 0 => Array(this) case _ => Array(this) ++ this.children.flatMap(child => child.toArray.toIterator) @@ -555,6 +568,7 @@ class ClusterNode private ( * * @return the depth from the root */ + @Since("1.6.0") def getDepth: Int = { this.parent match { case None => 0 @@ -565,15 +579,19 @@ class ClusterNode private ( /** * Gets the leaves nodes in the cluster tree */ - def getLeavesNodes: Array[ClusterNode] = { + @Since("1.6.0") + def getLeavesNodes: Array[BisectingClusterNode] = { this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) } + @Since("1.6.0") def isLeaf: Boolean = this.children.isEmpty - def getParent: Option[ClusterNode] = this.parent + @Since("1.6.0") + def getParent: Option[BisectingClusterNode] = this.parent - def getChildren: Seq[ClusterNode] = this.children + @Since("1.6.0") + def getChildren: Seq[BisectingClusterNode] = this.children /** * Gets the dendrogram height of the cluster at the cluster tree. @@ -583,6 +601,7 @@ class ClusterNode private ( * * @return the dendrogram height */ + @Since("1.6.0") def getHeight: Double = { this.children.size match { case 0 => 0.0 @@ -590,14 +609,15 @@ class ClusterNode private ( } } - private[mllib] - def setLocalHeight(height: Double) = this.localHeight = height + @Since("1.6.0") + def setLocalHeight(height: Double): Unit = this.localHeight = height /** * Converts to an adjacency list * * @return List[(fromNodeId, toNodeId, distance)] */ + @Since("1.6.0") def toAdjacencyList: Array[(Int, Int, Double)] = { val nodes = toArray @@ -620,6 +640,7 @@ class ClusterNode private ( * * @return List[(node1, node2, distance, tree size)] */ + @Since("1.6.0") def toLinkageMatrix: Array[(Int, Int, Double, Int)] = { val nodes = toArray.sortWith { case (a, b) => a.getHeight < b.getHeight} val leaves = nodes.filter(_.isLeaf) @@ -630,7 +651,7 @@ class ClusterNode private ( // If a node only has one-child, the child is regarded as the cluster of the child. // Cluster A has cluster B and Cluster B. B is a leaf. C only has cluster D. // ==> A merge list is (B, D), not (B, C). 
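// A tiny self-contained illustration of the single-child collapsing rule described in the
// comment above, using a throwaway Node type rather than the BisectingClusterNode API
// (the real getIndex is a local helper inside toLinkageMatrix and is not public):
case class Node(id: String, children: Seq[Node])

def collapse(leafIndex: Map[Node, Int], node: Node): Int = node.children.size match {
  case 1 => collapse(leafIndex, node.children.head) // fall through single-child chains
  case _ => leafIndex(node)                         // anything else is looked up directly
}

val b = Node("B", Nil)
val d = Node("D", Nil)
val c = Node("C", Seq(d))                     // C only has one child, D
assert(collapse(Map(b -> 0, d -> 1), c) == 1) // a merge with C is recorded against D's index
assert(collapse(Map(b -> 0, d -> 1), b) == 0) // a leaf maps to itself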
- def getIndex(map: Map[ClusterNode, Int], node: ClusterNode): Int = { + def getIndex(map: Map[BisectingClusterNode, Int], node: BisectingClusterNode): Int = { node.children.size match { case 1 => getIndex(map, node.children.head) case _ => map(node) @@ -638,9 +659,33 @@ class ClusterNode private ( } clusters.filterNot(_.isLeaf).map { node => (getIndex(treeMap, node.children.head), - getIndex(treeMap, node.children(1)), - node.getHeight, - node.toArray.filter(_.isLeaf).size) + getIndex(treeMap, node.children(1)), + node.getHeight, + node.toArray.filter(_.isLeaf).length) } } } + + +/** + * This class is used for maneging a cluster statistics + * + * @param rows the number of points + * @param sums the sum of points + * @param sumOfSquares the sum of squares of points + */ +private[clustering] case class BisectingClusterStat ( + rows: Long, + sums: BV[Double], + sumOfSquares: BV[Double]) extends Serializable { + + // initialization + val center: BV[Double] = sums :/ rows.toDouble + val variances: BV[Double] = rows match { + case n if n > 1 => sumOfSquares.:/(n.toDouble) - (sums :* sums).:/(n.toDouble * n.toDouble) + case _ => BV.zeros[Double](sums.size) + } + + def isDividable: Boolean = breezeAny(variances) && rows >= 2 +} + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 2c257caced02e..b524ae6b370e3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -18,7 +18,9 @@ package org.apache.spark.mllib.clustering import breeze.linalg.{Vector => BV, norm => breezeNorm} + import org.apache.spark.Logging +import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD @@ -28,15 +30,21 @@ import org.apache.spark.rdd.RDD * * @param node a cluster as a tree node */ -class BisectingKMeansModel(val node: ClusterNode) extends Serializable with Logging { +@Since("1.6.0") +class BisectingKMeansModel @Since("1.6.0") ( + @Since("1.6.0") val node: BisectingClusterNode + ) extends Serializable with Logging { - def getClusters: Array[ClusterNode] = this.node.getLeavesNodes + @Since("1.6.0") + def getClusters: Array[BisectingClusterNode] = this.node.getLeavesNodes + @Since("1.6.0") def getCenters: Array[Vector] = this.getClusters.map(_.center) /** * Predicts the closest cluster by one point */ + @Since("1.6.0") def predict(vector: Vector): Int = { // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) @@ -48,6 +56,7 @@ class BisectingKMeansModel(val node: ClusterNode) extends Serializable with Logg /** * Predicts the closest cluster by RDD of the points */ + @Since("1.6.0") def predict(data: RDD[Vector]): RDD[Int] = { val sc = data.sparkContext @@ -65,12 +74,14 @@ class BisectingKMeansModel(val node: ClusterNode) extends Serializable with Logg /** * Predicts the closest cluster by RDD of the points for Java */ + @Since("1.6.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] /** * Computes Within Set Sum of Squared Error(WSSSE) */ + @Since("1.6.0") def WSSSE(data: RDD[Vector]): Double = { val bvCenters = this.getCenters.map(_.toBreeze) data.context.broadcast(bvCenters) @@ 
-85,11 +96,14 @@ class BisectingKMeansModel(val node: ClusterNode) extends Serializable with Logg distances.sum() } + @Since("1.6.0") def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) + @Since("1.6.0") def toAdjacencyList: Array[(Int, Int, Double)] = this.node.toAdjacencyList /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ + @Since("1.6.0") def toJavaAdjacencyList: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() this.node.toAdjacencyList.foreach { x => @@ -102,9 +116,11 @@ class BisectingKMeansModel(val node: ClusterNode) extends Serializable with Logg javaList } + @Since("1.6.0") def toLinkageMatrix: Array[(Int, Int, Double, Int)] = this.node.toLinkageMatrix /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ + @Since("1.6.0") def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() this.node.toLinkageMatrix.foreach {x => diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java index 75daf4c26f93b..913ca9ac6169a 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java @@ -57,7 +57,7 @@ public void runWithSmallData() { Vector expectedCenter = Vectors.dense(1.0, 3.0, 4.0); JavaRDD data = sc.parallelize(points, 2); - BisectingKMeans algo = new BisectingKMeans().setNumClusters(1); + BisectingKMeans algo = new BisectingKMeans().setK(1); BisectingKMeansModel model = algo.run(data.rdd()); assertEquals(1, model.getCenters().length); assertEquals(expectedCenter, model.getCenters()[0]); @@ -73,7 +73,7 @@ public void runWithDenseVectors() { points.add(point); } JavaRDD data = sc.parallelize(points, 2); - BisectingKMeans algo = new BisectingKMeans().setNumClusters(numClusters); + BisectingKMeans algo = new BisectingKMeans().setK(numClusters); BisectingKMeansModel model = algo.run(data.rdd()); Vector[] centers = model.getCenters(); assertEquals(numClusters, centers.length); @@ -103,7 +103,7 @@ public void runWithSparseVectors() { points.add(point); } JavaRDD data = sc.parallelize(points, 2); - BisectingKMeans algo = new BisectingKMeans().setNumClusters(numClusters); + BisectingKMeans algo = new BisectingKMeans().setK(numClusters); BisectingKMeansModel model = algo.run(data.rdd()); Vector[] centers = model.getCenters(); assertEquals(numClusters, centers.length); diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index a7fafb67a76e4..c8e077ecd1413 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -27,7 +27,7 @@ class BisectingKMeansModelSuite extends SparkFunSuite with MLlibTestSparkContext with BeforeAndAfterEach { test("clustering dense vectors") { - val app = new BisectingKMeans().setNumClusters(5).setSeed(1) + val app = new BisectingKMeans().setK(5).setSeed(1) val localData = (1 to 100).toSeq.map { i => val label = i % 5 @@ -38,7 +38,7 @@ class 
BisectingKMeansModelSuite val model = app.run(data) val clusters = model.getClusters - assert(clusters.isInstanceOf[Array[ClusterNode]]) + assert(clusters.isInstanceOf[Array[BisectingClusterNode]]) assert(clusters.length === 5) val centers = model.getCenters.sortBy(_.toArray.sum) @@ -88,7 +88,7 @@ class BisectingKMeansModelSuite } test("clustering sparse vectors") { - val app = new BisectingKMeans().setNumClusters(5).setSeed(1) + val app = new BisectingKMeans().setK(5).setSeed(1) val localData = (1 to 100).toSeq.map { i => val label = i % 5 @@ -99,7 +99,7 @@ class BisectingKMeansModelSuite val model = app.run(data) val clusters = model.getClusters - assert(clusters.isInstanceOf[Array[ClusterNode]]) + assert(clusters.isInstanceOf[Array[BisectingClusterNode]]) assert(clusters.length === 5) val centers = model.getCenters.sortBy(_.toArray.sum) @@ -149,7 +149,7 @@ class BisectingKMeansModelSuite test("clustering should be done correctly") { for (numClusters <- Array(9, 19)) { - val app = new BisectingKMeans().setNumClusters(numClusters).setSeed(1) + val app = new BisectingKMeans().setK(numClusters).setSeed(1) val localData = (1 to 19).toSeq.map { i => val label = i % numClusters val sparseVector = Vectors.sparse(numClusters, Seq((label, label.toDouble))) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 04640a0fc97cc..43d5f5a2d3c63 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -27,27 +27,8 @@ import org.apache.spark.mllib.util.TestingUtils._ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { - test("the root index is equal to 1") { - assert(BisectingKMeans.ROOT_INDEX_KEY === 1) - } - - test("findClosestCenter") { - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val centers = Seq( - Vectors.sparse(5, Array(0, 1, 2), Array(0.0, 1.0, 2.0)).toBreeze, - Vectors.sparse(5, Array(1, 2, 3), Array(1.0, 2.0, 3.0)).toBreeze, - Vectors.sparse(5, Array(2, 3, 4), Array(2.0, 3.0, 4.0)).toBreeze - ) - - for (i <- 0 to (centers.size - 1)) { - val point = centers(i) - val closestIndex = BisectingKMeans.findClosestCenter(metric)(centers)(point) - assert(closestIndex === i) - } - } - test("run") { - val algo = new BisectingKMeans().setNumClusters(123).setSeed(1) + val algo = new BisectingKMeans().setK(123).setSeed(1) val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) @@ -62,7 +43,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { } test("run with too many cluster size than the records") { - val algo = new BisectingKMeans().setNumClusters(123).setSeed(1) + val algo = new BisectingKMeans().setK(123).setSeed(1) val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed) val model = algo.run(data) @@ -70,79 +51,73 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model.node.getHeight ~== 72.12489 absTol 10E-4) } - test("initializeData") { - val algo = new BisectingKMeans - val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val seed = sc.parallelize(localSeed) - val data = algo.initData(seed) - 
assert(data.map(_._1).collect().distinct === Array(1)) + test("setNumClusters") { + val algo = new BisectingKMeans() + assert(algo.getK == 20) + algo.setK(1000) + assert(algo.getK == 1000) } - test("get center stats") { - val algo = new BisectingKMeans - val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val seed = sc.parallelize(localSeed) - val data = algo.initData(seed) - - val clusters = algo.summarize(data) - assert(clusters.size === 1) - assert(clusters(1).center === Vectors.dense(49.5, 49.5).toBreeze) - assert(clusters(1).rows === 100) - - val data2 = seed.map(v => (BigInt((v.apply(0) / 25).toInt + 1), v.toBreeze)) - val clusters2 = algo.summarize(data2) - assert(clusters2.size === 4) - assert(clusters2(1).center === Vectors.dense(12.0, 12.0).toBreeze) - assert(clusters2(1).rows === 25) - assert(clusters2(2).center === Vectors.dense(37.0, 37.0).toBreeze) - assert(clusters2(2).rows === 25) - assert(clusters2(3).center === Vectors.dense(62.0, 62.0).toBreeze) - assert(clusters2(3).rows === 25) - assert(clusters2(4).center === Vectors.dense(87.0, 87.0).toBreeze) - assert(clusters2(4).rows === 25) + test("setSubIterations") { + val algo = new BisectingKMeans() + assert(algo.getMaxIterations == 20) + algo.setMaxIterations(15) + assert(algo.getMaxIterations == 15) } - test("getChildrenCenter") { + test("setSeed") { + val algo = new BisectingKMeans() + assert(algo.getSeed == 1) + algo.setSeed(987) + assert(algo.getSeed == 987) + } + + test("summarize center stats") { val algo = new BisectingKMeans val local = Seq( - (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), - (BigInt(3), BV[Double](1.9, 1.9)), (BigInt(3), BV[Double](2.1, 2.1)) + (BigInt(4), Vectors.dense(1.5, 1.5).toBreeze), + (BigInt(4), Vectors.dense(2.5, 2.5).toBreeze), + (BigInt(5), Vectors.dense(11.5, 11.5).toBreeze), + (BigInt(5), Vectors.dense(12.5, 12.5).toBreeze), + (BigInt(6), Vectors.dense(21.5, 21.5).toBreeze), + (BigInt(6), Vectors.dense(22.5, 22.5).toBreeze), + (BigInt(7), Vectors.dense(31.5, 31.5).toBreeze), + (BigInt(7), Vectors.dense(32.5, 32.5).toBreeze) ) val data = sc.parallelize(local) - val stats = Map[BigInt, ClusterNodeStat]( - BigInt(2) -> new ClusterNodeStat(2, BV[Double](1.0, 1.0) * 2.0, BV.zeros[Double](2)), - BigInt(3) -> new ClusterNodeStat(2, BV[Double](2.0, 2.0) * 2.0, BV.zeros[Double](2)) - ) - val initNextCenters = algo.initChildCenters(data, stats) - assert(initNextCenters.size === 4) - assert(initNextCenters.keySet === Set(4, 5, 6, 7)) + + val clusters = BisectingKMeans.summarizeClusters(data) + assert(clusters.size === 4) + assert(clusters(4).center === Vectors.dense(2.0, 2.0).toBreeze) + assert(clusters(4).variances === Vectors.dense(0.25, 0.25).toBreeze) + assert(clusters(4).rows === 2) + assert(clusters(5).center === Vectors.dense(12.0, 12.0).toBreeze) + assert(clusters(5).variances === Vectors.dense(0.25, 0.25).toBreeze) + assert(clusters(5).rows === 2) + assert(clusters(6).center === Vectors.dense(22.0, 22.0).toBreeze) + assert(clusters(6).variances === Vectors.dense(0.25, 0.25).toBreeze) + assert(clusters(6).rows === 2) + assert(clusters(7).center === Vectors.dense(32.0, 32.0).toBreeze) + assert(clusters(7).variances === Vectors.dense(0.25, 0.25).toBreeze) + assert(clusters(7).rows === 2) } - test("should divide clusters") { - val algo = new BisectingKMeans().setSeed(5) + test("initialize centers at next step") { val local = Seq( (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), - (BigInt(2), BV[Double](9.9, 
9.9)), (BigInt(2), BV[Double](10.1, 10.1)), - (BigInt(3), BV[Double](99.9, 99.9)), (BigInt(3), BV[Double](100.1, 100.1)), - (BigInt(3), BV[Double](109.9, 109.9)), (BigInt(3), BV[Double](110.1, 110.1)) + (BigInt(3), BV[Double](1.9, 1.9)), (BigInt(3), BV[Double](2.1, 2.1)) ) val data = sc.parallelize(local) - val stats = algo.summarize(data) - val newClusters = algo.getDividedClusters(data, stats) - - assert(newClusters.size === 4) - assert(newClusters(4).center === BV[Double](1.0, 1.0)) - assert(newClusters(4).rows === 2) - assert(newClusters(5).center === BV[Double](10.0, 10.0)) - assert(newClusters(5).rows === 2) - assert(newClusters(6).center === BV[Double](100.0, 100.0)) - assert(newClusters(6).rows === 2) - assert(newClusters(7).center === BV[Double](110.0, 110.0)) - assert(newClusters(7).rows === 2) + val stats = Map[BigInt, BisectingClusterStat]( + BigInt(2) -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, BV.zeros[Double](2)), + BigInt(3) -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, BV.zeros[Double](2)) + ) + val initNextCenters = BisectingKMeans.initNextCenters(data, stats) + assert(initNextCenters.size === 4) + assert(initNextCenters.keySet === Set(4, 5, 6, 7)) } test("should assign each data to new clusters") { - val algo = new BisectingKMeans val seed = Seq( (BigInt(2), Vectors.dense(0.0, 0.0)), (BigInt(2), Vectors.dense(1.0, 1.0)), (BigInt(2), Vectors.dense(2.0, 2.0)), (BigInt(2), Vectors.dense(3.0, 3.0)), @@ -150,43 +125,60 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { (BigInt(3), Vectors.dense(6.0, 6.0)), (BigInt(3), Vectors.dense(7.0, 7.0)), (BigInt(3), Vectors.dense(8.0, 8.0)), (BigInt(3), Vectors.dense(9.0, 9.0)), (BigInt(3), Vectors.dense(10.0, 10.0)), (BigInt(3), Vectors.dense(11.0, 11.0)) - ).map { case (idx, vector) => (idx, vector.toBreeze)} + ).map { case (idx, vector) => (idx, vector.toBreeze) } val newClusters = Map( - BigInt(4) -> new ClusterNodeStat(3L, BV[Double](1.0, 1.0) :* 3.0, BV[Double](1.0, 1.0)), - BigInt(5) -> new ClusterNodeStat(3L, BV[Double](4.0, 4.0) :* 3.0, BV[Double](1.0, 1.0)), - BigInt(6) -> new ClusterNodeStat(3L, BV[Double](7.0, 7.0) :* 3.0, BV[Double](1.0, 1.0)), - BigInt(7) -> new ClusterNodeStat(3L, BV[Double](10.0, 10.0) :* 3.0, BV[Double](1.0, 1.0)) + BigInt(4) -> new BisectingClusterStat(3L, BV[Double](1.0, 1.0) :* 3.0, BV[Double](1.0, 1.0)), + BigInt(5) -> new BisectingClusterStat(3L, BV[Double](4.0, 4.0) :* 3.0, BV[Double](1.0, 1.0)), + BigInt(6) -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, BV[Double](1.0, 1.0)), + BigInt(7) -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, BV[Double](1.0, 1.0)) ) val data = sc.parallelize(seed) - val result = algo.updateClusterIndex(data, newClusters).collect().toSeq + val result = BisectingKMeans.updateClusterIndex(data, newClusters).collect().toSeq val expected = Seq( (4, Vectors.dense(0.0, 0.0)), (4, Vectors.dense(1.0, 1.0)), (4, Vectors.dense(2.0, 2.0)), (5, Vectors.dense(3.0, 3.0)), (5, Vectors.dense(4.0, 4.0)), (5, Vectors.dense(5.0, 5.0)), (6, Vectors.dense(6.0, 6.0)), (6, Vectors.dense(7.0, 7.0)), (6, Vectors.dense(8.0, 8.0)), (7, Vectors.dense(9.0, 9.0)), (7, Vectors.dense(10.0, 10.0)), (7, Vectors.dense(11.0, 11.0)) - ).map { case (idx, vector) => (idx, vector.toBreeze)} + ).map { case (idx, vector) => (idx, vector.toBreeze) } assert(result === expected) } - test("setNumClusters") { - val algo = new BisectingKMeans() - assert(algo.getNumClusters == 20) - algo.setNumClusters(1000) - 
assert(algo.getNumClusters == 1000) - } + test("findClosestCenter") { + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val centers = Seq( + Vectors.sparse(5, Array(0, 1, 2), Array(0.0, 1.0, 2.0)).toBreeze, + Vectors.sparse(5, Array(1, 2, 3), Array(1.0, 2.0, 3.0)).toBreeze, + Vectors.sparse(5, Array(2, 3, 4), Array(2.0, 3.0, 4.0)).toBreeze + ) - test("setSubIterations") { - val algo = new BisectingKMeans() - assert(algo.getMaxIterations == 20) - algo.setMaxIterations(15) - assert(algo.getMaxIterations == 15) + for (i <- 0 to (centers.size - 1)) { + val point = centers(i) + val closestIndex = BisectingKMeans.findClosestCenter(metric)(centers)(point) + assert(closestIndex === i) + } } - test("setSeed") { - val algo = new BisectingKMeans() - assert(algo.getSeed == 1) - algo.setSeed(987) - assert(algo.getSeed == 987) + test("should divide clusters correctly") { + val local = Seq( + (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), + (BigInt(2), BV[Double](9.9, 9.9)), (BigInt(2), BV[Double](10.1, 10.1)), + (BigInt(3), BV[Double](99.9, 99.9)), (BigInt(3), BV[Double](100.1, 100.1)), + (BigInt(3), BV[Double](109.9, 109.9)), (BigInt(3), BV[Double](110.1, 110.1)) + ) + val data = sc.parallelize(local) + val stats = BisectingKMeans.summarizeClusters(data) + val newClusters = BisectingKMeans.divideClusters(data, stats, 20) + + assert(newClusters.size === 4) + assert(newClusters(4).center === BV[Double](1.0, 1.0)) + assert(newClusters(4).rows === 2) + assert(newClusters(5).center === BV[Double](10.0, 10.0)) + assert(newClusters(5).rows === 2) + assert(newClusters(6).center === BV[Double](100.0, 100.0)) + assert(newClusters(6).rows === 2) + assert(newClusters(7).center === BV[Double](110.0, 110.0)) + assert(newClusters(7).rows === 2) } + } From 31623ead10d4664c7c5d98b5c72731bcc804bcba Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 22 Oct 2015 08:51:23 -0700 Subject: [PATCH 36/76] Improve a performance --- .../apache/spark/mllib/clustering/BisectingKMeans.scala | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index a44cbd32ac5fe..c7b94607206ab 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -19,7 +19,8 @@ package org.apache.spark.mllib.clustering import scala.collection.{Map, mutable} -import breeze.linalg.{SparseVector => BSV, Vector => BV, any => breezeAny, norm => breezeNorm} +import breeze.linalg + .{SparseVector => BSV, Vector => BV, any => breezeAny, norm => breezeNorm, sum => breezeSum} import org.apache.spark.{Logging, SparkException} import org.apache.spark.annotation.Since @@ -393,9 +394,7 @@ private[clustering] object BisectingKMeans { stats = eachStats.toMap totalStd = stats.map { case (idx, (sum, n, sumOfSquares)) => - sum.toArray.zip(sumOfSquares.toArray).map { case (s, ss) => - math.pow(ss / n - math.pow(s / n, 2), 2.0) - }.sum + breezeSum((sumOfSquares :/ n) :- breezeNorm(sum :/ n, 2.0)) }.sum relativeError = math.abs(oldTotalStd - totalStd) / totalStd oldTotalStd = totalStd From 052c9d6ac42961cc6e8cf7533746e6829b20d236 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 12:45:13 -0700 Subject: [PATCH 37/76] Remove `toAdjacencyList` and `toLinkageList` These will be merge at following PR. 
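For context on the `totalStd` change in PATCH 36/76 above: the (count, sum, sum of squares) triple kept per cluster is enough to measure within-cluster spread in a single pass, via the identity sum_i ||x_i - mu||^2 = sum_i ||x_i||^2 - n * ||mu||^2. The sketch below shows that bookkeeping with a hypothetical Stat class in plain Breeze; it illustrates the idea only and is not the exact expression the patch computes.

  import breeze.linalg.{DenseVector => BDV, sum => breezeSum}

  // sufficient statistics for one cluster, accumulated in a single pass over its points
  case class Stat(n: Double, sums: BDV[Double], sumOfSquares: BDV[Double]) {
    def center: BDV[Double] = sums :/ n
    // total squared deviation around the center, recovered from the running sums alone
    def sse: Double = breezeSum(sumOfSquares) - n * breezeSum(center :* center)
  }

  val points = Seq(BDV(1.0, 1.0), BDV(3.0, 3.0))
  val stat = Stat(points.size.toDouble, points.reduce(_ + _), points.map(p => p :* p).reduce(_ + _))
  // stat.center == DenseVector(2.0, 2.0); stat.sse == 4.0 (each point lies sqrt(2) away from the center)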
--- .../clustering/BisectingKMeansModel.scala | 34 -------------- .../clustering/JavaBisectingKMeansSuite.java | 15 ------- .../BisectingKMeansModelSuite.scala | 44 ------------------- 3 files changed, 93 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index b524ae6b370e3..3c7eb0d50fb6e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -99,39 +99,5 @@ class BisectingKMeansModel @Since("1.6.0") ( @Since("1.6.0") def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) - @Since("1.6.0") - def toAdjacencyList: Array[(Int, Int, Double)] = this.node.toAdjacencyList - - /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ - @Since("1.6.0") - def toJavaAdjacencyList: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { - val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() - this.node.toAdjacencyList.foreach { x => - val edge = new java.util.ArrayList[java.lang.Double]() - edge.add(x._1.toDouble) - edge.add(x._2.toDouble) - edge.add(x._3.toDouble) - javaList.add(edge) - } - javaList - } - - @Since("1.6.0") - def toLinkageMatrix: Array[(Int, Int, Double, Int)] = this.node.toLinkageMatrix - - /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ - @Since("1.6.0") - def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { - val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() - this.node.toLinkageMatrix.foreach {x => - val row = new java.util.ArrayList[java.lang.Double]() - row.add(x._1.toDouble) - row.add(x._2.toDouble) - row.add(x._3.toDouble) - row.add(x._4.toDouble) - javaList.add(row) - } - javaList - } } diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java index 913ca9ac6169a..fb729cfcfa1a2 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java @@ -27,7 +27,6 @@ import org.junit.Test; import java.io.Serializable; -import java.util.ArrayList; import java.util.List; import static org.junit.Assert.assertEquals; @@ -82,13 +81,6 @@ public void runWithDenseVectors() { assertEquals(Vectors.dense(2.0, 2.0), centers[2]); assertEquals(Vectors.dense(3.0, 3.0), centers[3]); assertEquals(Vectors.dense(4.0, 4.0), centers[4]); - - // adjacency list - ArrayList> edges = model.toJavaAdjacencyList(); - assertEquals(8, edges.size()); - // linkage matrix - ArrayList> matrix = model.toJavaLinkageMatrix(); - assertEquals(4, matrix.size()); } @Test @@ -112,12 +104,5 @@ public void runWithSparseVectors() { assertEquals(points.get(2), centers[2]); assertEquals(points.get(3), centers[3]); assertEquals(points.get(4), centers[4]); - - // adjacency list - ArrayList> edges = model.toJavaAdjacencyList(); - assertEquals(8, edges.size()); - // linkage matrix - ArrayList> matrix = model.toJavaLinkageMatrix(); - assertEquals(4, matrix.size()); } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index c8e077ecd1413..bda0c9cc999e2 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -63,28 +63,6 @@ class BisectingKMeansModelSuite // compute WSSSE assert(model.WSSSE(data) === 0.0) - - // adjacency list - val adjacencyList = model.toAdjacencyList - .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) - assert(adjacencyList.length === 8) - assert(adjacencyList(0) === (0, 1, 2.5981)) - assert(adjacencyList(1) === (0, 6, 2.5981)) - assert(adjacencyList(2) === (1, 2, 1.7321)) - assert(adjacencyList(3) === (1, 5, 1.7321)) - assert(adjacencyList(4) === (2, 3, 0.866)) - assert(adjacencyList(5) === (2, 4, 0.866)) - assert(adjacencyList(6) === (6, 7, 0.866)) - assert(adjacencyList(7) === (6, 8, 0.866)) - - // linkage matrix - val linkageMatrix = model.toLinkageMatrix - .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3, x._4)) - assert(linkageMatrix.length === 4) - assert(linkageMatrix(0) === (0, 1, 0.866, 2)) - assert(linkageMatrix(1) === (3, 4, 0.866, 2)) - assert(linkageMatrix(2) === (5, 2, 2.5981, 3)) - assert(linkageMatrix(3) === (7, 6, 5.1962, 5)) } test("clustering sparse vectors") { @@ -123,28 +101,6 @@ class BisectingKMeansModelSuite // compute WSSSE assert(model.WSSSE(data) === 0.0) - - // adjacency list - val adjacencyList = model.toAdjacencyList - .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) - assert(adjacencyList.length === 8) - assert(adjacencyList(0) === (0, 1, 3.2863)) - assert(adjacencyList(1) === (0, 8, 3.2863)) - assert(adjacencyList(2) === (1, 2, 2.3184)) - assert(adjacencyList(3) === (1, 7, 2.3184)) - assert(adjacencyList(4) === (2, 3, 1.3744)) - assert(adjacencyList(5) === (2, 6, 1.3744)) - assert(adjacencyList(6) === (3, 4, 0.5)) - assert(adjacencyList(7) === (3, 5, 0.5)) - - // linkage matrix - val linkageMatrix = model.toLinkageMatrix - .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3, x._4)) - assert(linkageMatrix.length === 4) - assert(linkageMatrix(0) === (0, 1, 0.5, 2)) - assert(linkageMatrix(1) === (5, 2, 1.8744, 3)) - assert(linkageMatrix(2) === (6, 3, 4.1928, 4)) - assert(linkageMatrix(3) === (7, 4, 7.4791, 5)) } test("clustering should be done correctly") { From e13e47fc268472d3edd8df51a2c3d15f6224dd0e Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 12:54:39 -0700 Subject: [PATCH 38/76] Remove an unnecessary constructor arg: `clusterMap` --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index c7b94607206ab..abb696c14a969 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -59,14 +59,12 @@ import org.apache.spark.rdd.RDD * candidates and merge it to the cluster tree until the desired number of clusters is reached. 
* * @param k tne desired number of clusters - * @param clusterMap the pairs of cluster and its index as Map * @param maxIterations the number of maximal iterations to split clusters * @param seed a random seed */ @Since("1.6.0") class BisectingKMeans private ( private var k: Int, - private var clusterMap: Map[BigInt, BisectingClusterNode], private var maxIterations: Int, private var seed: Long) extends Logging { @@ -74,7 +72,7 @@ class BisectingKMeans private ( * Constructs with the default configuration */ @Since("1.6.0") - def this() = this(20, mutable.ListMap.empty[BigInt, BisectingClusterNode], 20, 1) + def this() = this(20, 20, 1) /** * Sets the number of clusters you want From ad6b9e2f7b13c8b2ae11198ffe7f903fdfb76f4a Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 13:00:46 -0700 Subject: [PATCH 39/76] Add `import BisectingKMeans._` inside of `BisectingKMeans` class --- .../mllib/clustering/BisectingKMeans.scala | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index abb696c14a969..3388b28ec4d7a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -68,6 +68,8 @@ class BisectingKMeans private ( private var maxIterations: Int, private var seed: Long) extends Logging { + import BisectingKMeans._ + /** * Constructs with the default configuration */ @@ -132,10 +134,10 @@ class BisectingKMeans private ( // divide clusters until the number of clusters reachs the condition // or there is no dividable cluster val startTime = System.currentTimeMillis() - var data = BisectingKMeans.initData(input).cache() + var data = initData(input).cache() while (clusterStats.size < maxAllNodesInTree && noMoreDividable == false) { logInfo(s"${sc.appName} starts step ${step}") - val leafClusters = BisectingKMeans.summarizeClusters(data) + val leafClusters = summarizeClusters(data) val dividableLeafClusters = leafClusters.filter(_._2.isDividable) clusterStats = clusterStats ++ leafClusters @@ -144,10 +146,9 @@ class BisectingKMeans private ( } else { // can be clustered if the number of divided clusterStats is equal to 0 - val divided = - BisectingKMeans.divideClusters(data, dividableLeafClusters, maxIterations) + val divided = divideClusters(data, dividableLeafClusters, maxIterations) // update each index - val newData = BisectingKMeans.updateClusterIndex(data, divided).cache() + val newData = updateClusterIndex(data, divided).cache() rddArray = rddArray ++ Array(data) data = newData // keep recent 2 cached RDDs in order to run more quickly @@ -164,11 +165,11 @@ class BisectingKMeans private ( // unpersist kept RDDs rddArray.foreach(_.unpersist()) // create a map of cluster node with their criterions - val nodes = BisectingKMeans.createClusterNodes(data, clusterStats) + val nodes = createClusterNodes(data, clusterStats) // build a cluster tree by Map class which is expressed logInfo(s"Building the cluster tree is started in ${sc.appName}") - val root = BisectingKMeans.buildTree(nodes, BisectingKMeans.ROOT_INDEX_KEY, this.k) + val root = buildTree(nodes, ROOT_INDEX_KEY, this.k) if (root.isEmpty) { new SparkException("Failed to build a cluster tree from a Map type of clusterStats") } @@ -236,7 +237,7 @@ private[clustering] object BisectingKMeans { * Assigns the initial cluster index 
id to all data */ def initData(data: RDD[Vector]): RDD[(BigInt, BV[Double])] = { - data.map { v: Vector => (BisectingKMeans.ROOT_INDEX_KEY, v.toBreeze)} + data.map { v: Vector => (ROOT_INDEX_KEY, v.toBreeze)} } /** @@ -345,7 +346,7 @@ private[clustering] object BisectingKMeans { // extract dividable input data val dividableData = data.filter { case (idx, point) => dividableClusters.contains(idx)} - var newCenters = BisectingKMeans.initNextCenters(dividableData, dividableClusters) + var newCenters = initNextCenters(dividableData, dividableClusters) var bcNewCenters = sc.broadcast(newCenters) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) @@ -365,8 +366,7 @@ private[clustering] object BisectingKMeans { val childrenCenters = Array(2 * idx, 2 * idx + 1) .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) if (childrenCenters.length == 2) { - val closestIndex = - BisectingKMeans.findClosestCenter(bcMetric.value)(childrenCenters)(point) + val closestIndex = findClosestCenter(bcMetric.value)(childrenCenters)(point) val nextIndex = 2 * idx + closestIndex // get a map value or else get a sparse vector From 043f5f31037911388f277beb067ffce9da85b30c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 13:03:05 -0700 Subject: [PATCH 40/76] Rename `rddArray` to `updatedDataHistory` in order to make the name more descraptive --- .../spark/mllib/clustering/BisectingKMeans.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 3388b28ec4d7a..1ccaafc5a205e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -126,7 +126,7 @@ class BisectingKMeans private ( var clusterStats = mutable.Map.empty[BigInt, BisectingClusterStat] var step = 1 var noMoreDividable = false - var rddArray = Array.empty[RDD[(BigInt, BV[Double])]] + var updatedDataHistory = Array.empty[RDD[(BigInt, BV[Double])]] // the number of maximum nodes of a binary tree by given parameter val multiplier = math.ceil(math.log10(this.k) / math.log10(2.0)) + 1 val maxAllNodesInTree = math.pow(2, multiplier).toInt @@ -149,13 +149,13 @@ class BisectingKMeans private ( val divided = divideClusters(data, dividableLeafClusters, maxIterations) // update each index val newData = updateClusterIndex(data, divided).cache() - rddArray = rddArray ++ Array(data) + updatedDataHistory = updatedDataHistory ++ Array(data) data = newData // keep recent 2 cached RDDs in order to run more quickly - if (rddArray.length > 1) { - val head = rddArray.head + if (updatedDataHistory.length > 1) { + val head = updatedDataHistory.head head.unpersist() - rddArray = rddArray.filterNot(_.hashCode() == head.hashCode()) + updatedDataHistory = updatedDataHistory.filterNot(_.hashCode() == head.hashCode()) } clusterStats = clusterStats ++ divided step += 1 @@ -163,7 +163,7 @@ class BisectingKMeans private ( } } // unpersist kept RDDs - rddArray.foreach(_.unpersist()) + updatedDataHistory.foreach(_.unpersist()) // create a map of cluster node with their criterions val nodes = createClusterNodes(data, clusterStats) From 31b05ecfda798a8d2b848d27d738841b0a863c87 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 13:08:43 -0700 Subject: [PATCH 
41/76] Modify `math.log10` to `math.log` --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 1ccaafc5a205e..157db50928720 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -127,8 +127,8 @@ class BisectingKMeans private ( var step = 1 var noMoreDividable = false var updatedDataHistory = Array.empty[RDD[(BigInt, BV[Double])]] - // the number of maximum nodes of a binary tree by given parameter - val multiplier = math.ceil(math.log10(this.k) / math.log10(2.0)) + 1 + // the minimum number of nodes of a binary tree by given parameter + val multiplier = math.ceil(math.log(this.k) / math.log(2.0)) + 1 val maxAllNodesInTree = math.pow(2, multiplier).toInt // divide clusters until the number of clusters reachs the condition From 12a60cf4134764a9c03fa98eefc2536feb508d37 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 13:41:04 -0700 Subject: [PATCH 42/76] Add `getMinimumNumNodeInTree` to calculate the minimum number of node in a binary tree --- .../mllib/clustering/BisectingKMeans.scala | 18 +++++++++++++++--- .../clustering/BisectingKMeansSuite.scala | 10 ++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 157db50928720..9751fee219059 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -128,14 +128,13 @@ class BisectingKMeans private ( var noMoreDividable = false var updatedDataHistory = Array.empty[RDD[(BigInt, BV[Double])]] // the minimum number of nodes of a binary tree by given parameter - val multiplier = math.ceil(math.log(this.k) / math.log(2.0)) + 1 - val maxAllNodesInTree = math.pow(2, multiplier).toInt + val numNodeLimit = getMinimumNumNodesInTree(this.k) // divide clusters until the number of clusters reachs the condition // or there is no dividable cluster val startTime = System.currentTimeMillis() var data = initData(input).cache() - while (clusterStats.size < maxAllNodesInTree && noMoreDividable == false) { + while (clusterStats.size < numNodeLimit && noMoreDividable == false) { logInfo(s"${sc.appName} starts step ${step}") val leafClusters = summarizeClusters(data) val dividableLeafClusters = leafClusters.filter(_._2.isDividable) @@ -208,6 +207,19 @@ private[clustering] object BisectingKMeans { closestIndex } + /** + * Gets the minimum number of nodes in a tree by the number of leaves + * + * @param k: the number of leaf nodes + */ + def getMinimumNumNodesInTree(k: Int): Int = { + val multiplier = math.ceil(math.log(k) / math.log(2.0)) + // the calculation is same as `math.pow(2, multiplier)` + var numNodes = 2 + (1 to multiplier.toInt).foreach (i => numNodes = numNodes << 1) + numNodes + } + /** * Summarizes data by each cluster as Map * diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 43d5f5a2d3c63..c6674b00ebb56 100644 --- 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -159,6 +159,16 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { } } + test("should be equal to math.pow") { + (1 to 1000).foreach { k => + // the minimum number of nodes of a binary tree by given parameter + val multiplier = math.ceil(math.log(k) / math.log(2.0)) + 1 + val expected = math.pow(2, multiplier).toInt + val result = BisectingKMeans.getMinimumNumNodesInTree(k) + assert(result === expected) + } + } + test("should divide clusters correctly") { val local = Seq( (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), From a13a4048d9d1f370da404993ea336596a6c5cb23 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 13:43:47 -0700 Subject: [PATCH 43/76] Rename `leafClusters` to `leafClusterStats` --- .../apache/spark/mllib/clustering/BisectingKMeans.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 9751fee219059..fe1ad0bebd3f3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -136,9 +136,10 @@ class BisectingKMeans private ( var data = initData(input).cache() while (clusterStats.size < numNodeLimit && noMoreDividable == false) { logInfo(s"${sc.appName} starts step ${step}") - val leafClusters = summarizeClusters(data) - val dividableLeafClusters = leafClusters.filter(_._2.isDividable) - clusterStats = clusterStats ++ leafClusters + // TODO Remove non-leaf cluster stats from `leafClusterStats` + val leafClusterStats = summarizeClusters(data) + val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) + clusterStats = clusterStats ++ leafClusterStats if (dividableLeafClusters.isEmpty) { noMoreDividable = true From 75564b57dc4c956b54b35baaba24327af98b66a4 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 13:58:11 -0700 Subject: [PATCH 44/76] Modify `BisectingKMeans.updateClusterIndex` --- .../spark/mllib/clustering/BisectingKMeans.scala | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index fe1ad0bebd3f3..32a7aeca48a74 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -310,6 +310,11 @@ private[clustering] object BisectingKMeans { data: RDD[(BigInt, BV[Double])], dividedClusters: Map[BigInt, BisectingClusterStat]): RDD[(BigInt, BV[Double])] = { + // If there is no divided clusters, return the original + if (dividedClusters.size == 0) { + return data + } + // extract the centers of the clusters val sc = data.sparkContext var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} @@ -323,16 +328,16 @@ private[clustering] object BisectingKMeans { data.map { case (idx, point) => val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(c => bcCenters.value.contains(c)) childrenIndexes.length match { - // stay the index if the number of children is not enough - case s if s < 2 => (idx, point) // 
update the indexes - case _ => { + case s if s == 2 => { val nextCenters = childrenIndexes.map(bcCenters.value(_)) val closestIndex = BisectingKMeans .findClosestCenter(bcMetric.value)(nextCenters)(point) val nextIndex = 2 * idx + closestIndex (nextIndex, point) } + // stay the index if the number of children is not enough + case _ => (idx, point) } } } From 084b9928ff6eb4b95632a9a52c6cc095f3fb5e64 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 14:06:14 -0700 Subject: [PATCH 45/76] Move checking whether there are dividable clusters or not to below --- .../mllib/clustering/BisectingKMeans.scala | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 32a7aeca48a74..96c57d598e8c1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -141,26 +141,25 @@ class BisectingKMeans private ( val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) clusterStats = clusterStats ++ leafClusterStats + // can be clustered if the number of divided clusterStats is equal to 0 + val divided = divideClusters(data, dividableLeafClusters, maxIterations) + // update each index + val newData = updateClusterIndex(data, divided).cache() + updatedDataHistory = updatedDataHistory ++ Array(data) + data = newData + // keep recent 2 cached RDDs in order to run more quickly + if (updatedDataHistory.length > 1) { + val head = updatedDataHistory.head + updatedDataHistory = updatedDataHistory.tail + head.unpersist() + } + clusterStats = clusterStats ++ divided + step += 1 + logInfo(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") + if (dividableLeafClusters.isEmpty) { noMoreDividable = true } - else { - // can be clustered if the number of divided clusterStats is equal to 0 - val divided = divideClusters(data, dividableLeafClusters, maxIterations) - // update each index - val newData = updateClusterIndex(data, divided).cache() - updatedDataHistory = updatedDataHistory ++ Array(data) - data = newData - // keep recent 2 cached RDDs in order to run more quickly - if (updatedDataHistory.length > 1) { - val head = updatedDataHistory.head - head.unpersist() - updatedDataHistory = updatedDataHistory.filterNot(_.hashCode() == head.hashCode()) - } - clusterStats = clusterStats ++ divided - step += 1 - logInfo(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") - } } // unpersist kept RDDs updatedDataHistory.foreach(_.unpersist()) From b6a952df91129cd9ce53ab0f47a824b4859f5682 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 14:14:51 -0700 Subject: [PATCH 46/76] Make sure the input data keeps the storage level and unpersist unnecessary RDDs --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 96c57d598e8c1..65584564a32a0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -161,10 +161,11 @@ class BisectingKMeans private ( noMoreDividable = true } } - // unpersist kept RDDs 
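// Side note (plain Scala, illustrative only, not part of this patch): the loop above relies
// on the implicit binary-tree numbering in which cluster i is split into children 2*i and
// 2*i + 1, with the root at index 1 (ROOT_INDEX_KEY). Two tiny helpers make the invariant
// explicit; the index of a node also encodes its depth.
def children(i: BigInt): (BigInt, BigInt) = (2 * i, 2 * i + 1)
def depth(i: BigInt): Int = if (i == BigInt(1)) 0 else 1 + depth(i / 2)

assert(children(BigInt(1)) == (BigInt(2), BigInt(3))) // the root splits into indexes 2 and 3
assert(depth(BigInt(7)) == 2)                         // 7 -> 3 -> 1, two levels below the root
// This numbering is also why the driver loop stops once clusterStats reaches the limit from
// getMinimumNumNodesInTree (about 2^(ceil(log2(k)) + 1) entries) rather than exactly k.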
- updatedDataHistory.foreach(_.unpersist()) // create a map of cluster node with their criterions val nodes = createClusterNodes(data, clusterStats) + // unpersist RDDs + data.unpersist() + updatedDataHistory.foreach(_.unpersist()) // build a cluster tree by Map class which is expressed logInfo(s"Building the cluster tree is started in ${sc.appName}") From 1ba4e4599b003f4de32526398e196e09c6cec3f3 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 14:18:41 -0700 Subject: [PATCH 47/76] Remove `closestCenter` from `findClosestCenter` because it was a unnecessary variable --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 65584564a32a0..15411824095d0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -203,9 +203,8 @@ private[clustering] object BisectingKMeans { */ def findClosestCenter(metric: (BV[Double], BV[Double]) => Double) (centers: Seq[BV[Double]])(point: BV[Double]): Int = { - val (closestCenter, closestIndex) = - centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1) - closestIndex + // get the closest index + centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1)._2 } /** From cb4fbfe03a8a0f86a31b0735c67d7b51721714ce Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 14:35:04 -0700 Subject: [PATCH 48/76] Modify `summarizeClusters` --- .../apache/spark/mllib/clustering/BisectingKMeans.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 15411824095d0..91b3e55c31ea4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -227,7 +227,7 @@ private[clustering] object BisectingKMeans { */ def summarizeClusters(data: RDD[(BigInt, BV[Double])]): Map[BigInt, BisectingClusterStat] = { - val stats = data.mapPartitions { iter => + data.mapPartitions { iter => // calculate the accumulation of the all point in a partition and count the rows val map = mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] iter.foreach { case (idx: BigInt, point: BV[Double]) => @@ -240,9 +240,9 @@ private[clustering] object BisectingKMeans { }.reduceByKey { case ((sum1, n1, sumOfSquares1), (sum2, n2, sumOfSquares2)) => // sum the accumulation and the count in the all partition (sum1 + sum2, n1 + n2, sumOfSquares1 + sumOfSquares2) - }.collect().toMap - - stats.map {case (i, stat) => i -> new BisectingClusterStat(stat._2.toLong, stat._1, stat._3)} + }.map { case (i, (sum, n, sumOfSquares)) => + (i, new BisectingClusterStat(n.toLong, sum, sumOfSquares)) + }.collectAsMap() } /** From fbcb9ea9bb7e6b60e19fa631be7691a9a2ff2431 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 14:43:47 -0700 Subject: [PATCH 49/76] Modify a type --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 91b3e55c31ea4..e97c3ad8a6d88 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -253,7 +253,7 @@ private[clustering] object BisectingKMeans { } /** - * Gets the initial centers for bisect k-means + * Gets the initial centers for bisecting k-means * * @param data pairs of point and its cluster index * @param stats pairs of cluster index and cluster statistics From ffbe399b88376d41861f808461b94d06b7387bf4 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 14:48:26 -0700 Subject: [PATCH 50/76] Replace `criterion` with `cost` --- .../mllib/clustering/BisectingKMeans.scala | 28 +++++++++---------- .../BisectingKMeansModelSuite.scala | 4 +-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index e97c3ad8a6d88..f037e7979bcc0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -45,9 +45,9 @@ import org.apache.spark.rdd.RDD * efficient on Spark and splitting a cluster one by one is very slow. It will keep splitting until * the number of clusters will be enough to build a cluster tree. Otherwise, it will stop splitting * when there are no dividable clusters before the number of clusters will be sufficient. And - * it calculates the criterions, such as average cost, entropy and so on, for building a cluster - * tree in the first part. The criterion means how large the cluster is. That is, the cluster - * whose criterion is maximum of all the clusters is the largest cluster. + * it calculates the costs, such as average cost, entropy and so on, for building a cluster + * tree in the first part. The costs means how large the cluster is. That is, the cluster + * whose cost is maximum of all the clusters is the largest cluster. * * Second, it builds a cluster tree as a binary tree by the result of the first part. * First of all, the cluster tree starts with only the root cluster which includes all points. 
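
To make the cost-based selection described in the doc comment above concrete, here is a minimal, self-contained Scala sketch — not part of the patch — of how the leaf with the largest average cost could be picked as the next cluster to split. `Leaf` and the cost values are hypothetical stand-ins for `BisectingClusterNode` and its average cost.

// Hypothetical illustration: pick the most scattered leaf, i.e. the one with
// the largest average cost, as the next cluster to expand in the tree.
case class Leaf(index: Long, cost: Double)

object CostSelectionExample {
  def mostScattered(leaves: Map[Long, Leaf]): (Long, Leaf) =
    leaves.maxBy { case (_, leaf) => leaf.cost }

  def main(args: Array[String]): Unit = {
    val leaves = Map(
      2L -> Leaf(2L, cost = 0.8),
      3L -> Leaf(3L, cost = 2.5), // largest average cost, so it is split next
      4L -> Leaf(4L, cost = 1.1))
    println(mostScattered(leaves)) // (3,Leaf(3,2.5))
  }
}

The patched `buildTree` does the equivalent over its leaf queue with `leavesQueue.maxBy(_._2.cost)`.
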
@@ -161,7 +161,7 @@ class BisectingKMeans private ( noMoreDividable = true } } - // create a map of cluster node with their criterions + // create a map of cluster node with their costs val nodes = createClusterNodes(data, clusterStats) // unpersist RDDs data.unpersist() @@ -419,7 +419,7 @@ private[clustering] object BisectingKMeans { } /** - * Creates the map of cluster stats to the map of cluster nodes with their criterions + * Creates the map of cluster stats to the map of cluster nodes with their costs * * @param data input data * @param stats map of cluster stats which is described as a binary tree @@ -428,7 +428,7 @@ private[clustering] object BisectingKMeans { data: RDD[(BigInt, BV[Double])], stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BisectingClusterNode] = { - // TODO: support other criterion, such as entropy + // TODO: support other cost, such as entropy createClusterNodesWithAverageCost(data, stats) } @@ -483,8 +483,8 @@ private[clustering] object BisectingKMeans { val root = treeMap(rootIndex) var leavesQueue = Map(rootIndex -> root) while (leavesQueue.nonEmpty && numLeavesClusters < numClusters) { - // pick up the largest cluster by the maximum criterion of all the clusters - val mostScattered = leavesQueue.maxBy(_._2.criterion) + // pick up the largest cluster by the maximum cost of all the clusters + val mostScattered = leavesQueue.maxBy(_._2.cost) val mostScatteredKey = mostScattered._1 val mostScatteredCluster = mostScattered._2 @@ -519,7 +519,7 @@ private[clustering] object BisectingKMeans { * * @param center the center of the cluster * @param rows the number of rows in the cluster - * @param criterion how large a cluster is + * @param cost how large a cluster is * @param localHeight the maximal distance between this node and its children * @param parent the parent cluster of the cluster * @param children the children nodes of the cluster @@ -528,16 +528,16 @@ private[clustering] object BisectingKMeans { class BisectingClusterNode private ( @Since("1.6.0") val center: Vector, @Since("1.6.0") val rows: Long, - @Since("1.6.0") val criterion: Double, + @Since("1.6.0") val cost: Double, private var localHeight: Double, private var parent: Option[BisectingClusterNode], private var children: Seq[BisectingClusterNode]) extends Serializable { - require(!criterion.isNaN) + require(!cost.isNaN) @Since("1.6.0") - def this(center: Vector, rows: Long, criterion: Double) = - this(center, rows, criterion, 0.0, None, Array.empty[BisectingClusterNode]) + def this(center: Vector, rows: Long, cost: Double) = + this(center, rows, cost, 0.0, None, Array.empty[BisectingClusterNode]) /** * Inserts a sub node as its child @@ -573,7 +573,7 @@ class BisectingClusterNode private ( case _ => Array(this) ++ this.children.flatMap(child => child.toArray.toIterator) } array.sortWith { case (a, b) => - a.getDepth < b.getDepth && a.criterion < b.criterion && a.rows < b.rows + a.getDepth < b.getDepth && a.cost < b.cost && a.rows < b.rows } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index bda0c9cc999e2..667c96eb72d70 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -117,13 +117,13 @@ class BisectingKMeansModelSuite val denseData = sc.parallelize(localData.map(_._2), 2) val denseModel = app.run(denseData) 
assert(denseModel.getCenters.length === numClusters) - assert(denseModel.getClusters.forall(_.criterion == 0.0)) + assert(denseModel.getClusters.forall(_.cost == 0.0)) // sparse version val sparseData = sc.parallelize(localData.map(_._3), 2) val sparseModel = app.run(sparseData) assert(sparseModel.getCenters.length === numClusters) - assert(sparseModel.getClusters.forall(_.criterion == 0.0)) + assert(sparseModel.getClusters.forall(_.cost == 0.0)) } } } From 6f37028db93d08a24af21545b58156cbeddb9e3e Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 15:48:05 -0700 Subject: [PATCH 51/76] Change the constructor args of `BisectingClusterStats` --- .../mllib/clustering/BisectingKMeans.scala | 63 +++++++++++------ .../clustering/BisectingKMeansSuite.scala | 70 ++++++++++--------- 2 files changed, 77 insertions(+), 56 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index f037e7979bcc0..93807746454b6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -191,6 +191,8 @@ class BisectingKMeans private ( private[clustering] object BisectingKMeans { + import BisectingClusterStat._ + val ROOT_INDEX_KEY: BigInt = 1 /** @@ -241,7 +243,9 @@ private[clustering] object BisectingKMeans { // sum the accumulation and the count in the all partition (sum1 + sum2, n1 + n2, sumOfSquares1 + sumOfSquares2) }.map { case (i, (sum, n, sumOfSquares)) => - (i, new BisectingClusterStat(n.toLong, sum, sumOfSquares)) + val mean = calcMean(n.toLong, sum) + val variance = getVariance(n.toLong, sum, sumOfSquares) + (i, new BisectingClusterStat(n.toLong, mean, variance)) }.collectAsMap() } @@ -316,7 +320,7 @@ private[clustering] object BisectingKMeans { // extract the centers of the clusters val sc = data.sparkContext - var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} + var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.mean)} val bcCenters = sc.broadcast(centers) // TODO Supports distance metrics other Euclidean distance metric @@ -345,29 +349,30 @@ private[clustering] object BisectingKMeans { * Divides clusters according to their statistics * * @param data pairs of point and its cluster index - * @param targetClusters target clusters to divide + * @param clusterStats target clusters to divide * @param maxIterations the maximum iterations to calculate clusters statistics */ def divideClusters( data: RDD[(BigInt, BV[Double])], - targetClusters: Map[BigInt, BisectingClusterStat], + clusterStats: Map[BigInt, BisectingClusterStat], maxIterations: Int): Map[BigInt, BisectingClusterStat] = { val sc = data.sparkContext val appName = sc.appName // get keys of dividable clusters - val dividableClusters = targetClusters.filter { case (idx, cluster) => cluster.isDividable } - if (dividableClusters.isEmpty) { + val dividableClusterStats = clusterStats.filter { case (idx, cluster) => cluster.isDividable } + if (dividableClusterStats.isEmpty) { return Map.empty[BigInt, BisectingClusterStat] } // extract dividable input data - val dividableData = data.filter { case (idx, point) => dividableClusters.contains(idx)} + val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} - var newCenters = initNextCenters(dividableData, dividableClusters) + var newCenters = initNextCenters(dividableData, 
dividableClusterStats) var bcNewCenters = sc.broadcast(newCenters) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val bcMetric = sc.broadcast(metric) + // pairs of cluster index and (sums, #points, sumOfSquares) var stats = Map.empty[BigInt, (BV[Double], Double, BV[Double])] var subIter = 0 @@ -415,7 +420,11 @@ private[clustering] object BisectingKMeans { oldTotalStd = totalStd subIter += 1 } - stats.map { case (i, stat) => i -> new BisectingClusterStat(stat._2.toLong, stat._1, stat._3) } + stats.map { case (i, (sums, rows, sumOfSquares)) => + val mean = calcMean(rows.toLong, sums) + val variance = getVariance(rows.toLong, sums, sumOfSquares) + i -> new BisectingClusterStat(rows.toLong, mean, variance) + } } /** @@ -440,7 +449,7 @@ private[clustering] object BisectingKMeans { stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BisectingClusterNode] = { // calculate average costs of all clusters - val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.center }) + val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.mean }) val costs = data.mapPartitions { iter => val counters = mutable.Map.empty[BigInt, (Long, Double)] bcCenters.value.foreach {case (i, center) => counters(i) = (0L, 0.0)} @@ -458,7 +467,7 @@ private[clustering] object BisectingKMeans { case x if x == 0.0 => 0.0 case _ => costs(i)._2 / costs(i)._1 } - i -> new BisectingClusterNode(Vectors.fromBreeze(stat.center), stat.rows, avgCost) + i -> new BisectingClusterNode(Vectors.fromBreeze(stat.mean), stat.rows, avgCost) } } @@ -685,21 +694,29 @@ class BisectingClusterNode private ( * This class is used for maneging a cluster statistics * * @param rows the number of points - * @param sums the sum of points - * @param sumOfSquares the sum of squares of points + * @param mean the sum of points + * @param variance the sum of squares of points */ private[clustering] case class BisectingClusterStat ( rows: Long, - sums: BV[Double], - sumOfSquares: BV[Double]) extends Serializable { - - // initialization - val center: BV[Double] = sums :/ rows.toDouble - val variances: BV[Double] = rows match { - case n if n > 1 => sumOfSquares.:/(n.toDouble) - (sums :* sums).:/(n.toDouble * n.toDouble) - case _ => BV.zeros[Double](sums.size) - } + mean: BV[Double], + variance: Double) extends Serializable { - def isDividable: Boolean = breezeAny(variances) && rows >= 2 + def isDividable: Boolean = variance > 0 && rows >= 2 } +private[clustering] object BisectingClusterStat { + // calculate a mean vector + def calcMean(rows: Long, sums: BV[Double]): BV[Double] = sums :/ rows.toDouble + + // calculate a variance + def getVariance(rows: Long, sums: BV[Double], sumOfSquares: BV[Double]): Double = { + val variances: BV[Double] = rows match { + case n if n > 1 => sumOfSquares.:/(n.toDouble) - (sums :* sums).:/(n.toDouble * n.toDouble) + case _ => BV.zeros[Double](sums.size) + } + breezeNorm(variances, 2.0) + } +} + + diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index c6674b00ebb56..875d2c4f694cd 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -86,20 +86,20 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { 
) val data = sc.parallelize(local) - val clusters = BisectingKMeans.summarizeClusters(data) - assert(clusters.size === 4) - assert(clusters(4).center === Vectors.dense(2.0, 2.0).toBreeze) - assert(clusters(4).variances === Vectors.dense(0.25, 0.25).toBreeze) - assert(clusters(4).rows === 2) - assert(clusters(5).center === Vectors.dense(12.0, 12.0).toBreeze) - assert(clusters(5).variances === Vectors.dense(0.25, 0.25).toBreeze) - assert(clusters(5).rows === 2) - assert(clusters(6).center === Vectors.dense(22.0, 22.0).toBreeze) - assert(clusters(6).variances === Vectors.dense(0.25, 0.25).toBreeze) - assert(clusters(6).rows === 2) - assert(clusters(7).center === Vectors.dense(32.0, 32.0).toBreeze) - assert(clusters(7).variances === Vectors.dense(0.25, 0.25).toBreeze) - assert(clusters(7).rows === 2) + val clusterStats = BisectingKMeans.summarizeClusters(data) + assert(clusterStats.size === 4) + assert(clusterStats(4).mean === Vectors.dense(2.0, 2.0).toBreeze) + assert(clusterStats(4).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(4).rows === 2) + assert(clusterStats(5).mean === Vectors.dense(12.0, 12.0).toBreeze) + assert(clusterStats(5).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(5).rows === 2) + assert(clusterStats(6).mean === Vectors.dense(22.0, 22.0).toBreeze) + assert(clusterStats(6).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(6).rows === 2) + assert(clusterStats(7).mean === Vectors.dense(32.0, 32.0).toBreeze) + assert(clusterStats(7).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(7).rows === 2) } test("initialize centers at next step") { @@ -109,8 +109,8 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { ) val data = sc.parallelize(local) val stats = Map[BigInt, BisectingClusterStat]( - BigInt(2) -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, BV.zeros[Double](2)), - BigInt(3) -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, BV.zeros[Double](2)) + BigInt(2) -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, 0.0), + BigInt(3) -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, 0.0) ) val initNextCenters = BisectingKMeans.initNextCenters(data, stats) assert(initNextCenters.size === 4) @@ -126,14 +126,18 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { (BigInt(3), Vectors.dense(8.0, 8.0)), (BigInt(3), Vectors.dense(9.0, 9.0)), (BigInt(3), Vectors.dense(10.0, 10.0)), (BigInt(3), Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze) } - val newClusters = Map( - BigInt(4) -> new BisectingClusterStat(3L, BV[Double](1.0, 1.0) :* 3.0, BV[Double](1.0, 1.0)), - BigInt(5) -> new BisectingClusterStat(3L, BV[Double](4.0, 4.0) :* 3.0, BV[Double](1.0, 1.0)), - BigInt(6) -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, BV[Double](1.0, 1.0)), - BigInt(7) -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, BV[Double](1.0, 1.0)) + val variance = breezeNorm(Vectors.dense(1.0, 1.0).toBreeze, 2.0) + val newClusterStats = Map( + BigInt(4) -> new BisectingClusterStat(3L, BV[Double](1.0, 1.0) :* 3.0, variance), + BigInt(5) -> new BisectingClusterStat(3L, BV[Double](4.0, 4.0) :* 3.0, variance), + BigInt(6) -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, variance), + BigInt(7) -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, variance) ) val data = sc.parallelize(seed) - val result = BisectingKMeans.updateClusterIndex(data, newClusters).collect().toSeq + val leafClusterStats = 
BisectingKMeans.summarizeClusters(data) + val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) + val divided = BisectingKMeans.divideClusters(data, dividableLeafClusters, 20) + val result = BisectingKMeans.updateClusterIndex(data, divided).collect().toSeq val expected = Seq( (4, Vectors.dense(0.0, 0.0)), (4, Vectors.dense(1.0, 1.0)), (4, Vectors.dense(2.0, 2.0)), @@ -178,17 +182,17 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { ) val data = sc.parallelize(local) val stats = BisectingKMeans.summarizeClusters(data) - val newClusters = BisectingKMeans.divideClusters(data, stats, 20) - - assert(newClusters.size === 4) - assert(newClusters(4).center === BV[Double](1.0, 1.0)) - assert(newClusters(4).rows === 2) - assert(newClusters(5).center === BV[Double](10.0, 10.0)) - assert(newClusters(5).rows === 2) - assert(newClusters(6).center === BV[Double](100.0, 100.0)) - assert(newClusters(6).rows === 2) - assert(newClusters(7).center === BV[Double](110.0, 110.0)) - assert(newClusters(7).rows === 2) + val newClusterStats = BisectingKMeans.divideClusters(data, stats, 20) + + assert(newClusterStats.size === 4) + assert(newClusterStats(4).mean === BV[Double](1.0, 1.0)) + assert(newClusterStats(4).rows === 2) + assert(newClusterStats(5).mean === BV[Double](10.0, 10.0)) + assert(newClusterStats(5).rows === 2) + assert(newClusterStats(6).mean === BV[Double](100.0, 100.0)) + assert(newClusterStats(6).rows === 2) + assert(newClusterStats(7).mean === BV[Double](110.0, 110.0)) + assert(newClusterStats(7).rows === 2) } } From 010fd2ca4eadc22ef38805b02bf35820f2117c9d Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 15:54:40 -0700 Subject: [PATCH 52/76] Convert `Long` to `BigInt` --- .../mllib/clustering/BisectingKMeans.scala | 50 ++++++++--------- .../clustering/BisectingKMeansSuite.scala | 54 +++++++++---------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 93807746454b6..ed385d9454c76 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -123,10 +123,10 @@ class BisectingKMeans private ( // `clusterStats` is described as binary tree structure // `clusterStats(1)` means the root of a binary tree - var clusterStats = mutable.Map.empty[BigInt, BisectingClusterStat] + var clusterStats = mutable.Map.empty[Long, BisectingClusterStat] var step = 1 var noMoreDividable = false - var updatedDataHistory = Array.empty[RDD[(BigInt, BV[Double])]] + var updatedDataHistory = Array.empty[RDD[(Long, BV[Double])]] // the minimum number of nodes of a binary tree by given parameter val numNodeLimit = getMinimumNumNodesInTree(this.k) @@ -193,7 +193,7 @@ private[clustering] object BisectingKMeans { import BisectingClusterStat._ - val ROOT_INDEX_KEY: BigInt = 1 + val ROOT_INDEX_KEY: Long = 1 /** * Finds the closes cluster's center @@ -227,12 +227,12 @@ private[clustering] object BisectingKMeans { * * @param data pairs of point and its cluster index */ - def summarizeClusters(data: RDD[(BigInt, BV[Double])]): Map[BigInt, BisectingClusterStat] = { + def summarizeClusters(data: RDD[(Long, BV[Double])]): Map[Long, BisectingClusterStat] = { data.mapPartitions { iter => // calculate the accumulation of the all point in a partition and count the rows - val map = 
mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] - iter.foreach { case (idx: BigInt, point: BV[Double]) => + val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] + iter.foreach { case (idx: Long, point: BV[Double]) => // get a map value or else get a sparse vector val (sumBV, n, sumOfSquares) = map .getOrElse(idx, (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size))) @@ -252,7 +252,7 @@ private[clustering] object BisectingKMeans { /** * Assigns the initial cluster index id to all data */ - def initData(data: RDD[Vector]): RDD[(BigInt, BV[Double])] = { + def initData(data: RDD[Vector]): RDD[(Long, BV[Double])] = { data.map { v: Vector => (ROOT_INDEX_KEY, v.toBreeze)} } @@ -263,14 +263,14 @@ private[clustering] object BisectingKMeans { * @param stats pairs of cluster index and cluster statistics */ def initNextCenters( - data: RDD[(BigInt, BV[Double])], - stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BV[Double]] = { + data: RDD[(Long, BV[Double])], + stats: Map[Long, BisectingClusterStat]): Map[Long, BV[Double]] = { // Since the combination sampleByKey and groupByKey is more expensive, // this as follows would be better. val bcIndeces = data.sparkContext.broadcast(stats.keySet) val samples = data.mapPartitions { iter => - val map = mutable.Map.empty[BigInt, mutable.ArrayBuffer[BV[Double]]] + val map = mutable.Map.empty[Long, mutable.ArrayBuffer[BV[Double]]] bcIndeces.value.foreach {i => map(i) = mutable.ArrayBuffer.empty[BV[Double]]} val LOCAL_SAMPLE_SIZE = 100 @@ -310,8 +310,8 @@ private[clustering] object BisectingKMeans { * @param dividedClusters pairs of cluster index and cluster statistics */ def updateClusterIndex( - data: RDD[(BigInt, BV[Double])], - dividedClusters: Map[BigInt, BisectingClusterStat]): RDD[(BigInt, BV[Double])] = { + data: RDD[(Long, BV[Double])], + dividedClusters: Map[Long, BisectingClusterStat]): RDD[(Long, BV[Double])] = { // If there is no divided clusters, return the original if (dividedClusters.size == 0) { @@ -353,16 +353,16 @@ private[clustering] object BisectingKMeans { * @param maxIterations the maximum iterations to calculate clusters statistics */ def divideClusters( - data: RDD[(BigInt, BV[Double])], - clusterStats: Map[BigInt, BisectingClusterStat], - maxIterations: Int): Map[BigInt, BisectingClusterStat] = { + data: RDD[(Long, BV[Double])], + clusterStats: Map[Long, BisectingClusterStat], + maxIterations: Int): Map[Long, BisectingClusterStat] = { val sc = data.sparkContext val appName = sc.appName // get keys of dividable clusters val dividableClusterStats = clusterStats.filter { case (idx, cluster) => cluster.isDividable } if (dividableClusterStats.isEmpty) { - return Map.empty[BigInt, BisectingClusterStat] + return Map.empty[Long, BisectingClusterStat] } // extract dividable input data val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} @@ -373,7 +373,7 @@ private[clustering] object BisectingKMeans { val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val bcMetric = sc.broadcast(metric) // pairs of cluster index and (sums, #points, sumOfSquares) - var stats = Map.empty[BigInt, (BV[Double], Double, BV[Double])] + var stats = Map.empty[Long, (BV[Double], Double, BV[Double])] var subIter = 0 var totalStd = Double.MaxValue @@ -382,7 +382,7 @@ private[clustering] object BisectingKMeans { while (subIter < maxIterations && relativeError > 10E-4) { // calculate summary of each cluster val eachStats = dividableData.mapPartitions { iter => - val 
map = mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] + val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] iter.foreach { case (idx, point) => // calculate next index number val childrenCenters = Array(2 * idx, 2 * idx + 1) @@ -434,8 +434,8 @@ private[clustering] object BisectingKMeans { * @param stats map of cluster stats which is described as a binary tree */ def createClusterNodes( - data: RDD[(BigInt, BV[Double])], - stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BisectingClusterNode] = { + data: RDD[(Long, BV[Double])], + stats: Map[Long, BisectingClusterStat]): Map[Long, BisectingClusterNode] = { // TODO: support other cost, such as entropy createClusterNodesWithAverageCost(data, stats) @@ -445,13 +445,13 @@ private[clustering] object BisectingKMeans { * Creates the map of cluster stats to the map of cluster nodes with their average costs */ private def createClusterNodesWithAverageCost( - data: RDD[(BigInt, BV[Double])], - stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BisectingClusterNode] = { + data: RDD[(Long, BV[Double])], + stats: Map[Long, BisectingClusterStat]): Map[Long, BisectingClusterNode] = { // calculate average costs of all clusters val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.mean }) val costs = data.mapPartitions { iter => - val counters = mutable.Map.empty[BigInt, (Long, Double)] + val counters = mutable.Map.empty[Long, (Long, Double)] bcCenters.value.foreach {case (i, center) => counters(i) = (0L, 0.0)} iter.foreach { case (i, point) => val cost = breezeNorm(bcCenters.value.apply(i) - point, 2.0) @@ -480,8 +480,8 @@ private[clustering] object BisectingKMeans { * @return a built cluster tree */ private def buildTree( - treeMap: Map[BigInt, BisectingClusterNode], - rootIndex: BigInt, + treeMap: Map[Long, BisectingClusterNode], + rootIndex: Long, numClusters: Int): Option[BisectingClusterNode] = { // if there is no index in the Map diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 875d2c4f694cd..0e49c6c7afe5c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -75,14 +75,14 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("summarize center stats") { val algo = new BisectingKMeans val local = Seq( - (BigInt(4), Vectors.dense(1.5, 1.5).toBreeze), - (BigInt(4), Vectors.dense(2.5, 2.5).toBreeze), - (BigInt(5), Vectors.dense(11.5, 11.5).toBreeze), - (BigInt(5), Vectors.dense(12.5, 12.5).toBreeze), - (BigInt(6), Vectors.dense(21.5, 21.5).toBreeze), - (BigInt(6), Vectors.dense(22.5, 22.5).toBreeze), - (BigInt(7), Vectors.dense(31.5, 31.5).toBreeze), - (BigInt(7), Vectors.dense(32.5, 32.5).toBreeze) + (4L, Vectors.dense(1.5, 1.5).toBreeze), + (4L, Vectors.dense(2.5, 2.5).toBreeze), + (5L, Vectors.dense(11.5, 11.5).toBreeze), + (5L, Vectors.dense(12.5, 12.5).toBreeze), + (6L, Vectors.dense(21.5, 21.5).toBreeze), + (6L, Vectors.dense(22.5, 22.5).toBreeze), + (7L, Vectors.dense(31.5, 31.5).toBreeze), + (7L, Vectors.dense(32.5, 32.5).toBreeze) ) val data = sc.parallelize(local) @@ -104,13 +104,13 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("initialize centers at next step") { val local = Seq( - (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 
1.1)), - (BigInt(3), BV[Double](1.9, 1.9)), (BigInt(3), BV[Double](2.1, 2.1)) + (2L, BV[Double](0.9, 0.9)), (2L, BV[Double](1.1, 1.1)), + (3L, BV[Double](1.9, 1.9)), (2L, BV[Double](2.1, 2.1)) ) val data = sc.parallelize(local) - val stats = Map[BigInt, BisectingClusterStat]( - BigInt(2) -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, 0.0), - BigInt(3) -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, 0.0) + val stats = Map[Long, BisectingClusterStat]( + 2L -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, 0.0), + 3L -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, 0.0) ) val initNextCenters = BisectingKMeans.initNextCenters(data, stats) assert(initNextCenters.size === 4) @@ -119,19 +119,19 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("should assign each data to new clusters") { val seed = Seq( - (BigInt(2), Vectors.dense(0.0, 0.0)), (BigInt(2), Vectors.dense(1.0, 1.0)), - (BigInt(2), Vectors.dense(2.0, 2.0)), (BigInt(2), Vectors.dense(3.0, 3.0)), - (BigInt(2), Vectors.dense(4.0, 4.0)), (BigInt(2), Vectors.dense(5.0, 5.0)), - (BigInt(3), Vectors.dense(6.0, 6.0)), (BigInt(3), Vectors.dense(7.0, 7.0)), - (BigInt(3), Vectors.dense(8.0, 8.0)), (BigInt(3), Vectors.dense(9.0, 9.0)), - (BigInt(3), Vectors.dense(10.0, 10.0)), (BigInt(3), Vectors.dense(11.0, 11.0)) + (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), + (2L, Vectors.dense(2.0, 2.0)), (2L, Vectors.dense(3.0, 3.0)), + (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 5.0)), + (3L, Vectors.dense(6.0, 6.0)), (3L, Vectors.dense(7.0, 7.0)), + (3L, Vectors.dense(8.0, 8.0)), (3L, Vectors.dense(9.0, 9.0)), + (3L, Vectors.dense(10.0, 10.0)), (3L, Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze) } val variance = breezeNorm(Vectors.dense(1.0, 1.0).toBreeze, 2.0) val newClusterStats = Map( - BigInt(4) -> new BisectingClusterStat(3L, BV[Double](1.0, 1.0) :* 3.0, variance), - BigInt(5) -> new BisectingClusterStat(3L, BV[Double](4.0, 4.0) :* 3.0, variance), - BigInt(6) -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, variance), - BigInt(7) -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, variance) + 4L -> new BisectingClusterStat(3L, BV[Double](1.0, 1.0) :* 3.0, variance), + 5L -> new BisectingClusterStat(3L, BV[Double](4.0, 4.0) :* 3.0, variance), + 6L -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, variance), + 7L -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, variance) ) val data = sc.parallelize(seed) val leafClusterStats = BisectingKMeans.summarizeClusters(data) @@ -175,10 +175,10 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("should divide clusters correctly") { val local = Seq( - (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), - (BigInt(2), BV[Double](9.9, 9.9)), (BigInt(2), BV[Double](10.1, 10.1)), - (BigInt(3), BV[Double](99.9, 99.9)), (BigInt(3), BV[Double](100.1, 100.1)), - (BigInt(3), BV[Double](109.9, 109.9)), (BigInt(3), BV[Double](110.1, 110.1)) + (2L, BV[Double](0.9, 0.9)), (2L, BV[Double](1.1, 1.1)), + (2L, BV[Double](9.9, 9.9)), (2L, BV[Double](10.1, 10.1)), + (3L, BV[Double](99.9, 99.9)), (3L, BV[Double](100.1, 100.1)), + (3L, BV[Double](109.9, 109.9)), (3L, BV[Double](110.1, 110.1)) ) val data = sc.parallelize(local) val stats = BisectingKMeans.summarizeClusters(data) From e39f69ac70bedbce20d1a028d4fc32a79d88ef3f Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 16:12:59 
-0700 Subject: [PATCH 53/76] Modify `BisectingKMeansModel.predict` --- .../mllib/clustering/BisectingKMeans.scala | 21 +++++++++++++++++++ .../clustering/BisectingKMeansModel.scala | 15 ++++--------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index ed385d9454c76..afd35d74dc18a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -599,6 +599,27 @@ class BisectingClusterNode private ( } } + /** + * Finds a leaf which is the closest under the node + * + * @param point target point + */ + @Since("1.6.0") + def findClosestLeaf( + point: Vector, + metric: (BV[Double], BV[Double]) => Double + ): BisectingClusterNode = { + this.children.size match { + case 0 => this + case _ => { + val bv = point.toBreeze + val centers = this.children.map(_.center).map(_.toBreeze) + val closestIndex = BisectingKMeans.findClosestCenter(metric)(centers)(bv) + this.children(closestIndex).findClosestLeaf(point, metric) + } + } + } + /** * Gets the leaves nodes in the cluster tree */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 3c7eb0d50fb6e..8177cd360d28e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -48,9 +48,11 @@ class BisectingKMeansModel @Since("1.6.0") ( def predict(vector: Vector): Int = { // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val closestLeafNode = this.node.findClosestLeaf(vector, metric) + val closestCenter = closestLeafNode.center val centers = this.getCenters.map(_.toBreeze) - BisectingKMeans.findClosestCenter(metric)(centers)(vector.toBreeze) + BisectingKMeans.findClosestCenter(metric)(centers)(closestCenter.toBreeze) } /** @@ -59,16 +61,7 @@ class BisectingKMeansModel @Since("1.6.0") ( @Since("1.6.0") def predict(data: RDD[Vector]): RDD[Int] = { val sc = data.sparkContext - - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - sc.broadcast(metric) - val centers = this.getCenters.map(_.toBreeze) - sc.broadcast(centers) - - data.map{point => - BisectingKMeans.findClosestCenter(metric)(centers)(point.toBreeze) - } + data.map { p => predict(p) } } /** From 2e8226d509ead4c14940335bc44c110d4c02c52f Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 16:15:43 -0700 Subject: [PATCH 54/76] Organize import statements --- .../spark/mllib/clustering/BisectingKMeans.scala | 3 +-- .../clustering/JavaBisectingKMeansSuite.java | 16 ++++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index afd35d74dc18a..ff55221402e5d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -19,8 +19,7 @@ package org.apache.spark.mllib.clustering import 
scala.collection.{Map, mutable} -import breeze.linalg - .{SparseVector => BSV, Vector => BV, any => breezeAny, norm => breezeNorm, sum => breezeSum} +import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, sum => breezeSum} import org.apache.spark.{Logging, SparkException} import org.apache.spark.annotation.Since diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java index fb729cfcfa1a2..926bd54e54424 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java @@ -17,19 +17,19 @@ package org.apache.spark.mllib.clustering; +import java.io.Serializable; +import java.util.List; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import static org.junit.Assert.assertEquals; import com.google.common.collect.Lists; + import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.Vectors; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - -import java.io.Serializable; -import java.util.List; - -import static org.junit.Assert.assertEquals; public class JavaBisectingKMeansSuite implements Serializable { private transient JavaSparkContext sc; From 622499e95e2717625eecd11f5c43e0c9eddc820e Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 16:24:10 -0700 Subject: [PATCH 55/76] Modify the comment inside of `updateClusterIndex` --- .../spark/mllib/clustering/BisectingKMeans.scala | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index ff55221402e5d..47453fb46c169 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -319,7 +319,7 @@ private[clustering] object BisectingKMeans { // extract the centers of the clusters val sc = data.sparkContext - var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.mean)} + val centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.mean)} val bcCenters = sc.broadcast(centers) // TODO Supports distance metrics other Euclidean distance metric @@ -328,17 +328,19 @@ private[clustering] object BisectingKMeans { // update the indexes to their children indexes data.map { case (idx, point) => - val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(c => bcCenters.value.contains(c)) - childrenIndexes.length match { + // TODO improve how to extract child indexes + val childIndexes = Array(2 * idx, 2 * idx + 1) + val extractedChildIndexes = childIndexes.filter(c => bcCenters.value.contains(c)) + extractedChildIndexes.length match { // update the indexes case s if s == 2 => { - val nextCenters = childrenIndexes.map(bcCenters.value(_)) + val nextCenters = extractedChildIndexes.map(bcCenters.value(_)) val closestIndex = BisectingKMeans .findClosestCenter(bcMetric.value)(nextCenters)(point) val nextIndex = 2 * idx + closestIndex (nextIndex, point) } - // stay the index if the number of children is not enough + // stay the index if a cluster which a point belongs wasn't divided case _ => (idx, point) } } 
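
The comment change above also makes the index bookkeeping easier to follow: a cluster with index i is bisected into children 2i and 2i+1, and each point is reassigned to whichever child center is closer, keeping its current index when its cluster was not divided. Below is a minimal, self-contained sketch of that rule, independent of the patch itself — the names are hypothetical, plain arrays stand in for Breeze vectors, and Euclidean distance is assumed.

// Hypothetical sketch of the index-update rule used during bisection:
// index i -> 2*i (left child) or 2*i + 1 (right child), chosen by the
// closer child center; the index is unchanged if no children exist.
object IndexUpdateExample {
  def euclidean(a: Array[Double], b: Array[Double]): Double =
    math.sqrt(a.zip(b).map { case (x, y) => (x - y) * (x - y) }.sum)

  def nextIndex(
      idx: Long,
      point: Array[Double],
      centers: Map[Long, Array[Double]]): Long = {
    val children = Seq(2 * idx, 2 * idx + 1).filter(centers.contains)
    if (children.size == 2) {
      // offset is 0 for the left child, 1 for the right child
      val offset = children.map(centers).zipWithIndex
        .minBy { case (center, _) => euclidean(center, point) }._2
      2 * idx + offset
    } else {
      idx // the cluster this point belongs to was not divided
    }
  }

  def main(args: Array[String]): Unit = {
    val centers = Map(2L -> Array(0.0, 0.0), 3L -> Array(10.0, 10.0))
    println(nextIndex(1L, Array(1.0, 1.0), centers)) // 2
    println(nextIndex(1L, Array(9.0, 9.0), centers)) // 3
    println(nextIndex(5L, Array(9.0, 9.0), centers)) // 5 (cluster 5 has no children)
  }
}
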
From 267908aa577d0732e322a2bde420571fea34ed3a Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 17:21:02 -0700 Subject: [PATCH 56/76] Change the default value of `k` to 2 --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 47453fb46c169..7dd3ec5439ae1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -73,7 +73,7 @@ class BisectingKMeans private ( * Constructs with the default configuration */ @Since("1.6.0") - def this() = this(20, 20, 1) + def this() = this(2, 20, 1) /** * Sets the number of clusters you want From a664f6d639d8fd2afb8df40cc7eb38bb5aa79923 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 17:30:31 -0700 Subject: [PATCH 57/76] Change `10E-4` to `10e-4` --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 7dd3ec5439ae1..9a62c1746405d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -380,7 +380,7 @@ private[clustering] object BisectingKMeans { var totalStd = Double.MaxValue var oldTotalStd = Double.MaxValue var relativeError = Double.MaxValue - while (subIter < maxIterations && relativeError > 10E-4) { + while (subIter < maxIterations && relativeError > 10e-4) { // calculate summary of each cluster val eachStats = dividableData.mapPartitions { iter => val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] From 165e191755bb641f10c604f1ec214248f5a104e7 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 17:35:52 -0700 Subject: [PATCH 58/76] Remove `toLinkageMatrix` and `toAdjacencyList` from `BisectingKMeans` --- .../mllib/clustering/BisectingKMeans.scala | 53 ------------------- 1 file changed, 53 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 9a62c1746405d..ae06b0151641b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -656,59 +656,6 @@ class BisectingClusterNode private ( @Since("1.6.0") def setLocalHeight(height: Double): Unit = this.localHeight = height - - /** - * Converts to an adjacency list - * - * @return List[(fromNodeId, toNodeId, distance)] - */ - @Since("1.6.0") - def toAdjacencyList: Array[(Int, Int, Double)] = { - val nodes = toArray - - var adjacencyList = Array.empty[(Int, Int, Double)] - nodes.foreach { parent => - if (parent.children.size > 1) { - val parentIndex = nodes.indexOf(parent) - parent.children.foreach { child => - val childIndex = nodes.indexOf(child) - adjacencyList = adjacencyList :+(parentIndex, childIndex, parent.localHeight) - } - } - } - adjacencyList - } - - /** - * Converts to a linkage matrix - * Returned data format is fit for scipy's dendrogram function - * - * @return List[(node1, node2, distance, 
tree size)] - */ - @Since("1.6.0") - def toLinkageMatrix: Array[(Int, Int, Double, Int)] = { - val nodes = toArray.sortWith { case (a, b) => a.getHeight < b.getHeight} - val leaves = nodes.filter(_.isLeaf) - val notLeaves = nodes.filterNot(_.isLeaf).filter(_.getChildren.size > 1) - val clusters = leaves ++ notLeaves - val treeMap = clusters.zipWithIndex.map { case (node, idx) => node -> idx}.toMap - - // If a node only has one-child, the child is regarded as the cluster of the child. - // Cluster A has cluster B and Cluster B. B is a leaf. C only has cluster D. - // ==> A merge list is (B, D), not (B, C). - def getIndex(map: Map[BisectingClusterNode, Int], node: BisectingClusterNode): Int = { - node.children.size match { - case 1 => getIndex(map, node.children.head) - case _ => map(node) - } - } - clusters.filterNot(_.isLeaf).map { node => - (getIndex(treeMap, node.children.head), - getIndex(treeMap, node.children(1)), - node.getHeight, - node.toArray.filter(_.isLeaf).length) - } - } } From c417dd10e74892f3d6b594438d7e47aba6335e65 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 28 Oct 2015 14:41:14 -0700 Subject: [PATCH 59/76] Change 10e-4 to 1e-4 --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index ae06b0151641b..b46e2a092f23d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -380,7 +380,7 @@ private[clustering] object BisectingKMeans { var totalStd = Double.MaxValue var oldTotalStd = Double.MaxValue var relativeError = Double.MaxValue - while (subIter < maxIterations && relativeError > 10e-4) { + while (subIter < maxIterations && relativeError > 1e-4) { // calculate summary of each cluster val eachStats = dividableData.mapPartitions { iter => val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] From e2f696671fa073ab64110d56ba508ed6154f7231 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 28 Oct 2015 22:56:04 -0700 Subject: [PATCH 60/76] Fix a test for the default value of k --- .../apache/spark/mllib/clustering/BisectingKMeansSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 0e49c6c7afe5c..71c8fa66a232a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -53,7 +53,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("setNumClusters") { val algo = new BisectingKMeans() - assert(algo.getK == 20) + assert(algo.getK == 2) algo.setK(1000) assert(algo.getK == 1000) } From 69e09103de8af2b1804eaa33d810813aaa35138f Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 28 Oct 2015 15:55:14 -0700 Subject: [PATCH 61/76] Replace `mapPartition(...).reduceByKey(...)` with `aggregateByKey` --- .../mllib/clustering/BisectingKMeans.scala | 37 ++++++++++--------- .../clustering/BisectingKMeansSuite.scala | 8 ++-- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index b46e2a092f23d..d04d45462fe62 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -227,24 +227,27 @@ private[clustering] object BisectingKMeans { * @param data pairs of point and its cluster index */ def summarizeClusters(data: RDD[(Long, BV[Double])]): Map[Long, BisectingClusterStat] = { - - data.mapPartitions { iter => - // calculate the accumulation of the all point in a partition and count the rows - val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] - iter.foreach { case (idx: Long, point: BV[Double]) => - // get a map value or else get a sparse vector - val (sumBV, n, sumOfSquares) = map - .getOrElse(idx, (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size))) - map(idx) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) + val dimension = data.first()._2.size + // zeroValue: (#rows, sum of vectors, sum of squares) + val zeroValue = (0L, BV.zeros[Double](dimension), 0.0) + val seqOp = (acc: (Long, BV[Double], Double), point: BV[Double]) => { + val n = acc._1 + 1L + val sums = acc._2 + point + val sumOfSquares = acc._3 + math.pow(breezeSum(point), 2.0) + (n, sums, sumOfSquares) + } + val comOp = + (acc1: (Long, BV[Double], Double), acc2: (Long, BV[Double], Double)) => + (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc2._3) + + val stats = data.aggregateByKey(zeroValue)(seqOp, comOp) + stats.map { case (i, (n, sums, sumOfSquare)) => + val meanPoint = calcMean(n, sums) + val variance = n match { + case n if n < 2 => 0.0 + case _ => (sumOfSquare / n) - math.pow(breezeSum(sums) / n, 2.0) } - map.toIterator - }.reduceByKey { case ((sum1, n1, sumOfSquares1), (sum2, n2, sumOfSquares2)) => - // sum the accumulation and the count in the all partition - (sum1 + sum2, n1 + n2, sumOfSquares1 + sumOfSquares2) - }.map { case (i, (sum, n, sumOfSquares)) => - val mean = calcMean(n.toLong, sum) - val variance = getVariance(n.toLong, sum, sumOfSquares) - (i, new BisectingClusterStat(n.toLong, mean, variance)) + (i, new BisectingClusterStat(n, meanPoint, variance)) }.collectAsMap() } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 71c8fa66a232a..6eaa40bb26378 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -89,16 +89,16 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val clusterStats = BisectingKMeans.summarizeClusters(data) assert(clusterStats.size === 4) assert(clusterStats(4).mean === Vectors.dense(2.0, 2.0).toBreeze) - assert(clusterStats(4).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(4).variance ~== 1.0 absTol 10e-4) assert(clusterStats(4).rows === 2) assert(clusterStats(5).mean === Vectors.dense(12.0, 12.0).toBreeze) - assert(clusterStats(5).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(5).variance ~== 1.0 absTol 10e-4) assert(clusterStats(5).rows === 2) assert(clusterStats(6).mean === Vectors.dense(22.0, 22.0).toBreeze) - assert(clusterStats(6).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(6).variance ~== 1.0 absTol 10e-4) assert(clusterStats(6).rows === 2) assert(clusterStats(7).mean === Vectors.dense(32.0, 
32.0).toBreeze) - assert(clusterStats(7).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(7).variance ~== 1.0 absTol 10e-4) assert(clusterStats(7).rows === 2) } From 675bafb23296f71c0007b73db6e003214cd0dcf9 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 28 Oct 2015 23:15:32 -0700 Subject: [PATCH 62/76] Change `sumOfSquares` vector to scholar at `divideClusters` --- .../mllib/clustering/BisectingKMeans.scala | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index d04d45462fe62..2a19679f94d1a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -377,7 +377,7 @@ private[clustering] object BisectingKMeans { val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val bcMetric = sc.broadcast(metric) // pairs of cluster index and (sums, #points, sumOfSquares) - var stats = Map.empty[Long, (BV[Double], Double, BV[Double])] + var stats = Map.empty[Long, (BV[Double], Double, Double)] var subIter = 0 var totalStd = Double.MaxValue @@ -386,7 +386,7 @@ private[clustering] object BisectingKMeans { while (subIter < maxIterations && relativeError > 1e-4) { // calculate summary of each cluster val eachStats = dividableData.mapPartitions { iter => - val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] + val map = mutable.Map.empty[Long, (BV[Double], Double, Double)] iter.foreach { case (idx, point) => // calculate next index number val childrenCenters = Array(2 * idx, 2 * idx + 1) @@ -399,9 +399,9 @@ private[clustering] object BisectingKMeans { val (sumBV, n, sumOfSquares) = map .getOrElse( nextIndex, - (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + (BSV.zeros[Double](point.size), 0.0, 0.0) ) - map(nextIndex) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) + map(nextIndex) = (sumBV + point, n + 1.0, sumOfSquares + math.pow(breezeSum(point), 2.0)) } } map.toIterator @@ -417,17 +417,17 @@ private[clustering] object BisectingKMeans { // update summary of each cluster stats = eachStats.toMap - totalStd = stats.map { case (idx, (sum, n, sumOfSquares)) => - breezeSum((sumOfSquares :/ n) :- breezeNorm(sum :/ n, 2.0)) + totalStd = stats.map { case (idx, (sums, n, sumOfSquares)) => + (sumOfSquares / n) - math.pow(breezeSum(sums), 2.0) }.sum relativeError = math.abs(oldTotalStd - totalStd) / totalStd oldTotalStd = totalStd subIter += 1 } - stats.map { case (i, (sums, rows, sumOfSquares)) => - val mean = calcMean(rows.toLong, sums) - val variance = getVariance(rows.toLong, sums, sumOfSquares) - i -> new BisectingClusterStat(rows.toLong, mean, variance) + stats.map { case (i, (sums, n, sumOfSquares)) => + val mean = calcMean(n.toLong, sums) + val variance = (sumOfSquares / n) - math.pow(breezeSum(sums), 2.0) + i -> new BisectingClusterStat(n.toLong, mean, variance) } } From 704e145ec73b9d519b02a8c16ceecfe61056bdd5 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 28 Oct 2015 23:51:14 -0700 Subject: [PATCH 63/76] Replace a chain of `mapPartition` and `reduceByKey` with `aggregateByKey` at `divideClusters` --- .../mllib/clustering/BisectingKMeans.scala | 72 +++++++++---------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 2a19679f94d1a..77d3b9ea4aa2a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -372,56 +372,54 @@ private[clustering] object BisectingKMeans { val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} var newCenters = initNextCenters(dividableData, dividableClusterStats) - var bcNewCenters = sc.broadcast(newCenters) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val bcMetric = sc.broadcast(metric) // pairs of cluster index and (sums, #points, sumOfSquares) - var stats = Map.empty[Long, (BV[Double], Double, Double)] + var stats = Map.empty[Long, (BV[Double], Long, Double)] var subIter = 0 - var totalStd = Double.MaxValue - var oldTotalStd = Double.MaxValue + var totalSumOfSquares = Double.MaxValue + var oldTotalSumOfSquares = Double.MaxValue var relativeError = Double.MaxValue + val dimension = dividableData.first()._2.size + // TODO add a set method for the threshold, instead of 1e-4 while (subIter < maxIterations && relativeError > 1e-4) { - // calculate summary of each cluster - val eachStats = dividableData.mapPartitions { iter => - val map = mutable.Map.empty[Long, (BV[Double], Double, Double)] - iter.foreach { case (idx, point) => - // calculate next index number - val childrenCenters = Array(2 * idx, 2 * idx + 1) - .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) - if (childrenCenters.length == 2) { - val closestIndex = findClosestCenter(bcMetric.value)(childrenCenters)(point) - val nextIndex = 2 * idx + closestIndex - - // get a map value or else get a sparse vector - val (sumBV, n, sumOfSquares) = map - .getOrElse( - nextIndex, - (BSV.zeros[Double](point.size), 0.0, 0.0) - ) - map(nextIndex) = (sumBV + point, n + 1.0, sumOfSquares + math.pow(breezeSum(point), 2.0)) - } + // convert each index into the closest child index + val bcNewCenters = sc.broadcast(newCenters) + val nextData = dividableData.map { case (idx, point) => + // calculate next index number + val childIndexes = Array(2 * idx, 2 * idx + 1) + val childrenCenters = childIndexes + .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) + if (childrenCenters.length != 2) { + new SparkException(s"A node whose index is ${idx} doesn't have two children") } - map.toIterator - }.reduceByKey { case ((sv1, n1, sumOfSquares1), (sv2, n2, sumOfSquares2)) => - // sum the accumulation and the count in the all partition - (sv1 + sv2, n1 + n2, sumOfSquares1 + sumOfSquares2) - }.collect().toMap + val closestIndex = findClosestCenter(bcMetric.value)(childrenCenters)(point) + val nextIndex = 2 * idx + closestIndex + (nextIndex, point) + } - // calculate the center of each cluster - newCenters = eachStats.map { case (idx, (sum, n, sumOfSquares)) => (idx, sum :/ n)} - bcNewCenters = sc.broadcast(newCenters) + // summarize each cluster + val zeroValue = (BV.zeros[Double](dimension), 0L, 0.0) + val seqOp = (acc: (BV[Double], Long, Double), point: BV[Double]) => { + val sums = acc._1 + point + val n = acc._2 + 1L + val sumOfSquares = acc._3 + (point dot point) + (sums, n, sumOfSquares) + } + val comOp = (acc1: (BV[Double], Long, Double), acc2: (BV[Double], Long, Double)) => + (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc2._3) + val tempStats = 
nextData.aggregateByKey(zeroValue)(seqOp, comOp).collectAsMap() + // calculate the center of each cluster + newCenters = tempStats.map {case (idx, (sums, n, sumOfSquares)) => (idx, sums :/ n.toDouble)} // update summary of each cluster - stats = eachStats.toMap + stats = tempStats.toMap - totalStd = stats.map { case (idx, (sums, n, sumOfSquares)) => - (sumOfSquares / n) - math.pow(breezeSum(sums), 2.0) - }.sum - relativeError = math.abs(oldTotalStd - totalStd) / totalStd - oldTotalStd = totalStd + totalSumOfSquares = stats.map{case (idx, (sums, n, sumOfSquares)) => sumOfSquares}.sum + relativeError = math.abs(totalSumOfSquares - oldTotalSumOfSquares) / totalSumOfSquares + oldTotalSumOfSquares = totalSumOfSquares subIter += 1 } stats.map { case (i, (sums, n, sumOfSquares)) => From ee3ea621a5839fd04ae174553d809a2ad57d2127 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 00:35:54 -0700 Subject: [PATCH 64/76] Modify `getMinimumNumNodesInTree` with `1 << multiplier` --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 77d3b9ea4aa2a..1166cf108060b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -214,11 +214,9 @@ private[clustering] object BisectingKMeans { * @param k: the number of leaf nodes */ def getMinimumNumNodesInTree(k: Int): Int = { - val multiplier = math.ceil(math.log(k) / math.log(2.0)) // the calculation is same as `math.pow(2, multiplier)` - var numNodes = 2 - (1 to multiplier.toInt).foreach (i => numNodes = numNodes << 1) - numNodes + val multiplier = math.ceil(math.log(k) / math.log(2.0)) + 1 + 1 << multiplier.toInt } /** From 73e2c7afd21b29443f6dfe106b87af0cc68944eb Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 00:37:10 -0700 Subject: [PATCH 65/76] Rename `numClusters` parameter to `k` --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 1166cf108060b..ae2d6016648e0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -476,13 +476,13 @@ private[clustering] object BisectingKMeans { * * @param treeMap divided clusters as a Map class * @param rootIndex index you want to start - * @param numClusters the number of clusters you want + * @param k the number of clusters you want * @return a built cluster tree */ private def buildTree( treeMap: Map[Long, BisectingClusterNode], rootIndex: Long, - numClusters: Int): Option[BisectingClusterNode] = { + k: Int): Option[BisectingClusterNode] = { // if there is no index in the Map if (!treeMap.contains(rootIndex)) return None @@ -491,7 +491,7 @@ private[clustering] object BisectingKMeans { var numLeavesClusters = 1 val root = treeMap(rootIndex) var leavesQueue = Map(rootIndex -> root) - while (leavesQueue.nonEmpty && numLeavesClusters < numClusters) { + while (leavesQueue.nonEmpty && numLeavesClusters < k) { // pick up the largest cluster by the maximum cost of all the clusters val 
mostScattered = leavesQueue.maxBy(_._2.cost) val mostScatteredKey = mostScattered._1 From a876ba233e197bb5228a05fe2946523af1b40ee6 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 00:38:29 -0700 Subject: [PATCH 66/76] Rename a variable in `BisectingKMeansModelSuite` --- .../mllib/clustering/BisectingKMeansModelSuite.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index 667c96eb72d70..3c0892c225d5c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -104,11 +104,11 @@ class BisectingKMeansModelSuite } test("clustering should be done correctly") { - for (numClusters <- Array(9, 19)) { - val app = new BisectingKMeans().setK(numClusters).setSeed(1) + for (k <- Array(9, 19)) { + val app = new BisectingKMeans().setK(k).setSeed(1) val localData = (1 to 19).toSeq.map { i => - val label = i % numClusters - val sparseVector = Vectors.sparse(numClusters, Seq((label, label.toDouble))) + val label = i % k + val sparseVector = Vectors.sparse(k, Seq((label, label.toDouble))) val denseVector = Vectors.fromBreeze(sparseVector.toBreeze.toDenseVector) (label, denseVector, sparseVector) } @@ -116,13 +116,13 @@ class BisectingKMeansModelSuite // dense version val denseData = sc.parallelize(localData.map(_._2), 2) val denseModel = app.run(denseData) - assert(denseModel.getCenters.length === numClusters) + assert(denseModel.getCenters.length === k) assert(denseModel.getClusters.forall(_.cost == 0.0)) // sparse version val sparseData = sc.parallelize(localData.map(_._3), 2) val sparseModel = app.run(sparseData) - assert(sparseModel.getCenters.length === numClusters) + assert(sparseModel.getCenters.length === k) assert(sparseModel.getClusters.forall(_.cost == 0.0)) } } From 1985feafa6254167de6b2d650312456c7bacba57 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 12:06:18 -0700 Subject: [PATCH 67/76] tmp --- .../mllib/clustering/BisectingKMeans.scala | 43 ++++++++----------- .../clustering/BisectingKMeansSuite.scala | 8 ++-- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index ae2d6016648e0..84c3f18d9debb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -225,28 +225,23 @@ private[clustering] object BisectingKMeans { * @param data pairs of point and its cluster index */ def summarizeClusters(data: RDD[(Long, BV[Double])]): Map[Long, BisectingClusterStat] = { - val dimension = data.first()._2.size - // zeroValue: (#rows, sum of vectors, sum of squares) - val zeroValue = (0L, BV.zeros[Double](dimension), 0.0) - val seqOp = (acc: (Long, BV[Double], Double), point: BV[Double]) => { - val n = acc._1 + 1L - val sums = acc._2 + point - val sumOfSquares = acc._3 + math.pow(breezeSum(point), 2.0) - (n, sums, sumOfSquares) + // sum the number of node and points of each cluster + val stats = data.map {case (idx, p) => + (idx, (p, 1L)) + }.reduceByKey {case ((p1, n1), (p2, n2)) => (p1 + p2, n1 + n2) }.collectAsMap() + + // calculate within-cluster 
sum of squares of each cluster + val bcStats = data.sparkContext.broadcast(stats) + val sumOfSquaresMap = data.map { case (idx, point) => + val meanPoint = bcStats.value.apply(idx)._1 :/ bcStats.value.apply(idx)._2.toDouble + (idx, (point - meanPoint) dot (point - meanPoint)) + }.reduceByKey(_ + _).collectAsMap() + + stats.map { case (idx, (sumPoint, n)) => + val meanPoint = sumPoint :/ n.toDouble + val sumOfSquares = sumOfSquaresMap(idx) + (idx, new BisectingClusterStat(n, meanPoint, sumOfSquares)) } - val comOp = - (acc1: (Long, BV[Double], Double), acc2: (Long, BV[Double], Double)) => - (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc2._3) - - val stats = data.aggregateByKey(zeroValue)(seqOp, comOp) - stats.map { case (i, (n, sums, sumOfSquare)) => - val meanPoint = calcMean(n, sums) - val variance = n match { - case n if n < 2 => 0.0 - case _ => (sumOfSquare / n) - math.pow(breezeSum(sums) / n, 2.0) - } - (i, new BisectingClusterStat(n, meanPoint, variance)) - }.collectAsMap() } /** @@ -663,14 +658,14 @@ class BisectingClusterNode private ( * * @param rows the number of points * @param mean the sum of points - * @param variance the sum of squares of points + * @param sumOfSquares the sum of squares of points */ private[clustering] case class BisectingClusterStat ( rows: Long, mean: BV[Double], - variance: Double) extends Serializable { + sumOfSquares: Double) extends Serializable { - def isDividable: Boolean = variance > 0 && rows >= 2 + def isDividable: Boolean = sumOfSquares > 0 && rows >= 2 } private[clustering] object BisectingClusterStat { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 6eaa40bb26378..1ef02676294b3 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -89,16 +89,16 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val clusterStats = BisectingKMeans.summarizeClusters(data) assert(clusterStats.size === 4) assert(clusterStats(4).mean === Vectors.dense(2.0, 2.0).toBreeze) - assert(clusterStats(4).variance ~== 1.0 absTol 10e-4) + assert(clusterStats(4).sumOfSquares ~== 1.0 absTol 10e-4) assert(clusterStats(4).rows === 2) assert(clusterStats(5).mean === Vectors.dense(12.0, 12.0).toBreeze) - assert(clusterStats(5).variance ~== 1.0 absTol 10e-4) + assert(clusterStats(5).sumOfSquares ~== 1.0 absTol 10e-4) assert(clusterStats(5).rows === 2) assert(clusterStats(6).mean === Vectors.dense(22.0, 22.0).toBreeze) - assert(clusterStats(6).variance ~== 1.0 absTol 10e-4) + assert(clusterStats(6).sumOfSquares ~== 1.0 absTol 10e-4) assert(clusterStats(6).rows === 2) assert(clusterStats(7).mean === Vectors.dense(32.0, 32.0).toBreeze) - assert(clusterStats(7).variance ~== 1.0 absTol 10e-4) + assert(clusterStats(7).sumOfSquares ~== 1.0 absTol 10e-4) assert(clusterStats(7).rows === 2) } From 629f8970ebcfc78086a98911a9bfbf9a2b0b0db0 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 14:14:51 -0700 Subject: [PATCH 68/76] Make this implementation more simple --- .../mllib/clustering/BisectingKMeans.scala | 95 +++++++------------ .../clustering/BisectingKMeansSuite.scala | 30 +++--- 2 files changed, 48 insertions(+), 77 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 84c3f18d9debb..8f9c5668683a2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -119,46 +119,42 @@ class BisectingKMeans private ( @Since("1.6.0") def run(input: RDD[Vector]): BisectingKMeansModel = { val sc = input.sparkContext + val startTime = System.currentTimeMillis() + var data = initData(input).cache() + // this is used for managing calculated cached RDDs + var updatedDataHistory = Array.empty[RDD[(Long, BV[Double])]] // `clusterStats` is described as binary tree structure // `clusterStats(1)` means the root of a binary tree - var clusterStats = mutable.Map.empty[Long, BisectingClusterStat] - var step = 1 - var noMoreDividable = false - var updatedDataHistory = Array.empty[RDD[(Long, BV[Double])]] + var leafClusterStats = summarizeClusters(data) + var dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) + var clusterStats = leafClusterStats + // the minimum number of nodes of a binary tree by given parameter + var step = 1 val numNodeLimit = getMinimumNumNodesInTree(this.k) - // divide clusters until the number of clusters reachs the condition // or there is no dividable cluster - val startTime = System.currentTimeMillis() - var data = initData(input).cache() - while (clusterStats.size < numNodeLimit && noMoreDividable == false) { + while (clusterStats.size < numNodeLimit && dividableLeafClusters.nonEmpty) { logInfo(s"${sc.appName} starts step ${step}") + + // can be clustered if the number of divided clusterStats is equal to 0 // TODO Remove non-leaf cluster stats from `leafClusterStats` - val leafClusterStats = summarizeClusters(data) - val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) + val dividedData = divideClusters(data, dividableLeafClusters, maxIterations).cache() + leafClusterStats = summarizeClusters(dividedData) + dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) clusterStats = clusterStats ++ leafClusterStats - // can be clustered if the number of divided clusterStats is equal to 0 - val divided = divideClusters(data, dividableLeafClusters, maxIterations) // update each index - val newData = updateClusterIndex(data, divided).cache() - updatedDataHistory = updatedDataHistory ++ Array(data) - data = newData + updatedDataHistory = updatedDataHistory ++ Array(dividedData) + data = dividedData // keep recent 2 cached RDDs in order to run more quickly if (updatedDataHistory.length > 1) { val head = updatedDataHistory.head updatedDataHistory = updatedDataHistory.tail head.unpersist() } - clusterStats = clusterStats ++ divided step += 1 - logInfo(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") - - if (dividableLeafClusters.isEmpty) { - noMoreDividable = true - } } // create a map of cluster node with their costs val nodes = createClusterNodes(data, clusterStats) @@ -190,8 +186,6 @@ class BisectingKMeans private ( private[clustering] object BisectingKMeans { - import BisectingClusterStat._ - val ROOT_INDEX_KEY: Long = 1 /** @@ -239,7 +233,7 @@ private[clustering] object BisectingKMeans { stats.map { case (idx, (sumPoint, n)) => val meanPoint = sumPoint :/ n.toDouble - val sumOfSquares = sumOfSquaresMap(idx) + val sumOfSquares = math.abs(sumOfSquaresMap(idx)) (idx, new BisectingClusterStat(n, meanPoint, sumOfSquares)) } } @@ -309,7 +303,7 @@ private[clustering] object BisectingKMeans { 
dividedClusters: Map[Long, BisectingClusterStat]): RDD[(Long, BV[Double])] = { // If there is no divided clusters, return the original - if (dividedClusters.size == 0) { + if (dividedClusters.isEmpty) { return data } @@ -352,14 +346,14 @@ private[clustering] object BisectingKMeans { def divideClusters( data: RDD[(Long, BV[Double])], clusterStats: Map[Long, BisectingClusterStat], - maxIterations: Int): Map[Long, BisectingClusterStat] = { + maxIterations: Int): RDD[(Long, BV[Double])] = { val sc = data.sparkContext val appName = sc.appName // get keys of dividable clusters val dividableClusterStats = clusterStats.filter { case (idx, cluster) => cluster.isDividable } if (dividableClusterStats.isEmpty) { - return Map.empty[Long, BisectingClusterStat] + return data } // extract dividable input data val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} @@ -371,6 +365,7 @@ private[clustering] object BisectingKMeans { // pairs of cluster index and (sums, #points, sumOfSquares) var stats = Map.empty[Long, (BV[Double], Long, Double)] + var nextData = data var subIter = 0 var totalSumOfSquares = Double.MaxValue var oldTotalSumOfSquares = Double.MaxValue @@ -380,7 +375,7 @@ private[clustering] object BisectingKMeans { while (subIter < maxIterations && relativeError > 1e-4) { // convert each index into the closest child index val bcNewCenters = sc.broadcast(newCenters) - val nextData = dividableData.map { case (idx, point) => + nextData = dividableData.map { case (idx, point) => // calculate next index number val childIndexes = Array(2 * idx, 2 * idx + 1) val childrenCenters = childIndexes @@ -398,28 +393,22 @@ private[clustering] object BisectingKMeans { val seqOp = (acc: (BV[Double], Long, Double), point: BV[Double]) => { val sums = acc._1 + point val n = acc._2 + 1L - val sumOfSquares = acc._3 + (point dot point) - (sums, n, sumOfSquares) + val sumOfNorm = acc._3 + (point dot point) + (sums, n, sumOfNorm) } val comOp = (acc1: (BV[Double], Long, Double), acc2: (BV[Double], Long, Double)) => - (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc2._3) + (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc1._3) val tempStats = nextData.aggregateByKey(zeroValue)(seqOp, comOp).collectAsMap() // calculate the center of each cluster - newCenters = tempStats.map {case (idx, (sums, n, sumOfSquares)) => (idx, sums :/ n.toDouble)} - // update summary of each cluster - stats = tempStats.toMap + newCenters = tempStats.map {case (idx, (sums, n, sumOfNorm)) => (idx, sums :/ n.toDouble)} - totalSumOfSquares = stats.map{case (idx, (sums, n, sumOfSquares)) => sumOfSquares}.sum + totalSumOfSquares = stats.map{case (idx, (sums, n, sumOfNorm)) => sumOfNorm}.sum relativeError = math.abs(totalSumOfSquares - oldTotalSumOfSquares) / totalSumOfSquares oldTotalSumOfSquares = totalSumOfSquares subIter += 1 } - stats.map { case (i, (sums, n, sumOfSquares)) => - val mean = calcMean(n.toLong, sums) - val variance = (sumOfSquares / n) - math.pow(breezeSum(sums), 2.0) - i -> new BisectingClusterStat(n.toLong, mean, variance) - } + nextData } /** @@ -442,27 +431,11 @@ private[clustering] object BisectingKMeans { private def createClusterNodesWithAverageCost( data: RDD[(Long, BV[Double])], stats: Map[Long, BisectingClusterStat]): Map[Long, BisectingClusterNode] = { - - // calculate average costs of all clusters - val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.mean }) - val costs = data.mapPartitions { iter => - val counters = mutable.Map.empty[Long, (Long, 
Double)] - bcCenters.value.foreach {case (i, center) => counters(i) = (0L, 0.0)} - iter.foreach { case (i, point) => - val cost = breezeNorm(bcCenters.value.apply(i) - point, 2.0) - counters(i) = (counters(i)._1 + 1, counters(i)._2 + cost) - } - counters.toIterator - }.reduceByKey { case((n1, cost1), (n2, cost2)) => - (n1 + n2, cost1 + cost2) - }.collectAsMap() - - stats.map { case (i, stat) => - val avgCost = costs(i)._1 match { - case x if x == 0.0 => 0.0 - case _ => costs(i)._2 / costs(i)._1 - } - i -> new BisectingClusterNode(Vectors.fromBreeze(stat.mean), stat.rows, avgCost) + stats.map { case (idx, clusterStats) => + val rows = clusterStats.rows + val center = clusterStats.mean + val cost = math.sqrt(clusterStats.sumOfSquares) / rows + idx -> new BisectingClusterNode(Vectors.fromBreeze(center), rows, cost) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 1ef02676294b3..1177e3b293de2 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -33,7 +33,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val data = sc.parallelize(localSeed, 2) val model = algo.run(data) assert(model.getClusters.length == 123) - assert(model.node.getHeight ~== 705.6925 absTol 10E-4) + assert(model.node.getHeight ~== 702.8641 absTol 10E-4) // check the relations between a parent cluster and its children assert(model.node.getParent === None) @@ -133,11 +133,10 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { 6L -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, variance), 7L -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, variance) ) - val data = sc.parallelize(seed) + val data = sc.parallelize(seed, 1) val leafClusterStats = BisectingKMeans.summarizeClusters(data) val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) - val divided = BisectingKMeans.divideClusters(data, dividableLeafClusters, 20) - val result = BisectingKMeans.updateClusterIndex(data, divided).collect().toSeq + val result = BisectingKMeans.divideClusters(data, dividableLeafClusters, 20).collect() val expected = Seq( (4, Vectors.dense(0.0, 0.0)), (4, Vectors.dense(1.0, 1.0)), (4, Vectors.dense(2.0, 2.0)), @@ -180,19 +179,18 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { (3L, BV[Double](99.9, 99.9)), (3L, BV[Double](100.1, 100.1)), (3L, BV[Double](109.9, 109.9)), (3L, BV[Double](110.1, 110.1)) ) - val data = sc.parallelize(local) + val data = sc.parallelize(local, 1) val stats = BisectingKMeans.summarizeClusters(data) - val newClusterStats = BisectingKMeans.divideClusters(data, stats, 20) - - assert(newClusterStats.size === 4) - assert(newClusterStats(4).mean === BV[Double](1.0, 1.0)) - assert(newClusterStats(4).rows === 2) - assert(newClusterStats(5).mean === BV[Double](10.0, 10.0)) - assert(newClusterStats(5).rows === 2) - assert(newClusterStats(6).mean === BV[Double](100.0, 100.0)) - assert(newClusterStats(6).rows === 2) - assert(newClusterStats(7).mean === BV[Double](110.0, 110.0)) - assert(newClusterStats(7).rows === 2) + val dividedData = BisectingKMeans.divideClusters(data, stats, 20).collect() + + assert(dividedData(0) == (4L, BV[Double](0.9, 0.9))) + assert(dividedData(1) == (4L, BV[Double](1.1, 1.1))) + assert(dividedData(2) == (5L, 
BV[Double](9.9, 9.9))) + assert(dividedData(3) == (5L, BV[Double](10.1, 10.1))) + assert(dividedData(4) == (6L, BV[Double](99.9, 99.9))) + assert(dividedData(5) == (6L, BV[Double](100.1, 100.1))) + assert(dividedData(6) == (7L, BV[Double](109.9, 109.9))) + assert(dividedData(7) == (7L, BV[Double](110.1, 110.1))) } } From 1f84ded143748ea9dae3f13eb737de394ba0f6e9 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 14:28:26 -0700 Subject: [PATCH 69/76] Reorganize import statements and adjust parameters and return values --- .../mllib/clustering/BisectingKMeans.scala | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 8f9c5668683a2..493849fa477a6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -17,9 +17,7 @@ package org.apache.spark.mllib.clustering -import scala.collection.{Map, mutable} - -import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, sum => breezeSum} +import breeze.linalg.{Vector => BV, norm => breezeNorm} import org.apache.spark.{Logging, SparkException} import org.apache.spark.annotation.Since @@ -218,7 +216,10 @@ private[clustering] object BisectingKMeans { * * @param data pairs of point and its cluster index */ - def summarizeClusters(data: RDD[(Long, BV[Double])]): Map[Long, BisectingClusterStat] = { + def summarizeClusters( + data: RDD[(Long, BV[Double])] + ): collection.Map[Long, BisectingClusterStat] = { + // sum the number of node and points of each cluster val stats = data.map {case (idx, p) => (idx, (p, 1L)) @@ -253,15 +254,15 @@ private[clustering] object BisectingKMeans { */ def initNextCenters( data: RDD[(Long, BV[Double])], - stats: Map[Long, BisectingClusterStat]): Map[Long, BV[Double]] = { + stats: collection.Map[Long, BisectingClusterStat]): collection.Map[Long, BV[Double]] = { // Since the combination sampleByKey and groupByKey is more expensive, // this as follows would be better. val bcIndeces = data.sparkContext.broadcast(stats.keySet) val samples = data.mapPartitions { iter => - val map = mutable.Map.empty[Long, mutable.ArrayBuffer[BV[Double]]] + val map = collection.mutable.Map.empty[Long, collection.mutable.ArrayBuffer[BV[Double]]] - bcIndeces.value.foreach {i => map(i) = mutable.ArrayBuffer.empty[BV[Double]]} + bcIndeces.value.foreach {i => map(i) = collection.mutable.ArrayBuffer.empty[BV[Double]]} val LOCAL_SAMPLE_SIZE = 100 iter.foreach { case (i, point) => map(i).append(point) @@ -269,14 +270,14 @@ private[clustering] object BisectingKMeans { // the number of elements is cut off at the right time. 
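// Concretely, once a cluster's local buffer exceeds LOCAL_SAMPLE_SIZE it is
// collapsed to just the point with the smallest L2 norm and the point with the
// largest L2 norm; those two extremes are all that is needed later, because each
// dividable cluster i is seeded with exactly two child centers (2 * i and 2 * i + 1).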
if (map(i).size > LOCAL_SAMPLE_SIZE) { val elements = map(i).sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) - map(i) = mutable.ArrayBuffer(elements.head, elements.last) + map(i) = collection.mutable.ArrayBuffer(elements.head, elements.last) } } // in order to reduce the shuffle size, take only two elements map.filterNot(_._2.isEmpty).map { case (i, points) => val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) - i -> mutable.ArrayBuffer(elements.head, elements.last) + i -> collection.mutable.ArrayBuffer(elements.head, elements.last) }.toIterator }.reduceByKey { case (points1, points2) => points1.union(points2) @@ -345,7 +346,7 @@ private[clustering] object BisectingKMeans { */ def divideClusters( data: RDD[(Long, BV[Double])], - clusterStats: Map[Long, BisectingClusterStat], + clusterStats: collection.Map[Long, BisectingClusterStat], maxIterations: Int): RDD[(Long, BV[Double])] = { val sc = data.sparkContext val appName = sc.appName @@ -419,8 +420,8 @@ private[clustering] object BisectingKMeans { */ def createClusterNodes( data: RDD[(Long, BV[Double])], - stats: Map[Long, BisectingClusterStat]): Map[Long, BisectingClusterNode] = { - + stats: collection.Map[Long, BisectingClusterStat] + ): collection.Map[Long, BisectingClusterNode] = { // TODO: support other cost, such as entropy createClusterNodesWithAverageCost(data, stats) } @@ -430,7 +431,9 @@ private[clustering] object BisectingKMeans { */ private def createClusterNodesWithAverageCost( data: RDD[(Long, BV[Double])], - stats: Map[Long, BisectingClusterStat]): Map[Long, BisectingClusterNode] = { + stats: collection.Map[Long, BisectingClusterStat] + ): collection.Map[Long, BisectingClusterNode] = { + stats.map { case (idx, clusterStats) => val rows = clusterStats.rows val center = clusterStats.mean @@ -448,7 +451,7 @@ private[clustering] object BisectingKMeans { * @return a built cluster tree */ private def buildTree( - treeMap: Map[Long, BisectingClusterNode], + treeMap: collection.Map[Long, BisectingClusterNode], rootIndex: Long, k: Int): Option[BisectingClusterNode] = { From 12b322392682dce1be666ad20e5cb72eb2f49197 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 14:30:34 -0700 Subject: [PATCH 70/76] Rename `WSSSE` to `computeCost` --- .../apache/spark/mllib/clustering/BisectingKMeansModel.scala | 4 ++-- .../spark/mllib/clustering/BisectingKMeansModelSuite.scala | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 8177cd360d28e..38f4695eb0d26 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -75,7 +75,7 @@ class BisectingKMeansModel @Since("1.6.0") ( * Computes Within Set Sum of Squared Error(WSSSE) */ @Since("1.6.0") - def WSSSE(data: RDD[Vector]): Double = { + def computeCost(data: RDD[Vector]): Double = { val bvCenters = this.getCenters.map(_.toBreeze) data.context.broadcast(bvCenters) val distances = data.map {point => @@ -90,7 +90,7 @@ class BisectingKMeansModel @Since("1.6.0") ( } @Since("1.6.0") - def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) + def computeCost(data: JavaRDD[Vector]): Double = this.computeCost(data.rdd) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index 3c0892c225d5c..ceac039efc8d0 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -62,7 +62,7 @@ class BisectingKMeansModelSuite assert(predicted === localData.map(_._1)) // compute WSSSE - assert(model.WSSSE(data) === 0.0) + assert(model.computeCost(data) === 0.0) } test("clustering sparse vectors") { @@ -100,7 +100,7 @@ class BisectingKMeansModelSuite assert(predicted === localData.map(_._1)) // compute WSSSE - assert(model.WSSSE(data) === 0.0) + assert(model.computeCost(data) === 0.0) } test("clustering should be done correctly") { From ef4a3e86ff1289a4508beb600f147699752b7987 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 14:35:02 -0700 Subject: [PATCH 71/76] Remove `updateClusterIndex` --- .../mllib/clustering/BisectingKMeans.scala | 44 ------------------- 1 file changed, 44 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 493849fa477a6..72a56b9635f16 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -293,50 +293,6 @@ private[clustering] object BisectingKMeans { nextCenters } - /** - * Updates the indexes of clusters which is divided to its children indexes - * - * @param data pairs of point and its cluster index - * @param dividedClusters pairs of cluster index and cluster statistics - */ - def updateClusterIndex( - data: RDD[(Long, BV[Double])], - dividedClusters: Map[Long, BisectingClusterStat]): RDD[(Long, BV[Double])] = { - - // If there is no divided clusters, return the original - if (dividedClusters.isEmpty) { - return data - } - - // extract the centers of the clusters - val sc = data.sparkContext - val centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.mean)} - val bcCenters = sc.broadcast(centers) - - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val bcMetric = sc.broadcast(metric) - - // update the indexes to their children indexes - data.map { case (idx, point) => - // TODO improve how to extract child indexes - val childIndexes = Array(2 * idx, 2 * idx + 1) - val extractedChildIndexes = childIndexes.filter(c => bcCenters.value.contains(c)) - extractedChildIndexes.length match { - // update the indexes - case s if s == 2 => { - val nextCenters = extractedChildIndexes.map(bcCenters.value(_)) - val closestIndex = BisectingKMeans - .findClosestCenter(bcMetric.value)(nextCenters)(point) - val nextIndex = 2 * idx + closestIndex - (nextIndex, point) - } - // stay the index if a cluster which a point belongs wasn't divided - case _ => (idx, point) - } - } - } - /** * Divides clusters according to their statistics * From 57b06bab01fc9b1404f87ec4c4a8319963a6894c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 14:35:24 -0700 Subject: [PATCH 72/76] Remove BisectingClusterStat object --- .../spark/mllib/clustering/BisectingKMeans.scala | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 72a56b9635f16..00fbcb4519d7e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -599,19 +599,3 @@ private[clustering] case class BisectingClusterStat ( def isDividable: Boolean = sumOfSquares > 0 && rows >= 2 } - -private[clustering] object BisectingClusterStat { - // calculate a mean vector - def calcMean(rows: Long, sums: BV[Double]): BV[Double] = sums :/ rows.toDouble - - // calculate a variance - def getVariance(rows: Long, sums: BV[Double], sumOfSquares: BV[Double]): Double = { - val variances: BV[Double] = rows match { - case n if n > 1 => sumOfSquares.:/(n.toDouble) - (sums :* sums).:/(n.toDouble * n.toDouble) - case _ => BV.zeros[Double](sums.size) - } - breezeNorm(variances, 2.0) - } -} - - From 5da05d3fd41d5b730d19f595ca9bf622aaf0a14d Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 14:42:49 -0700 Subject: [PATCH 73/76] Fix minors --- .../mllib/clustering/BisectingKMeans.scala | 28 +++++++++---------- .../clustering/BisectingKMeansSuite.scala | 6 +++- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 00fbcb4519d7e..9354a53884b33 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -119,11 +119,11 @@ class BisectingKMeans private ( val sc = input.sparkContext val startTime = System.currentTimeMillis() var data = initData(input).cache() - // this is used for managing calculated cached RDDs var updatedDataHistory = Array.empty[RDD[(Long, BV[Double])]] - // `clusterStats` is described as binary tree structure + // `clusterStats` is described as binary tree structure as Map // `clusterStats(1)` means the root of a binary tree + // `clusterStats(2n)` and `clusterStats(2n+1)` are the children of `clusterStats(n)` var leafClusterStats = summarizeClusters(data) var dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) var clusterStats = leafClusterStats @@ -143,16 +143,15 @@ class BisectingKMeans private ( dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) clusterStats = clusterStats ++ leafClusterStats - // update each index + // keep recent 2 cached RDDs in order to run more quickly updatedDataHistory = updatedDataHistory ++ Array(dividedData) data = dividedData - // keep recent 2 cached RDDs in order to run more quickly + step += 1 if (updatedDataHistory.length > 1) { val head = updatedDataHistory.head updatedDataHistory = updatedDataHistory.tail head.unpersist() } - step += 1 } // create a map of cluster node with their costs val nodes = createClusterNodes(data, clusterStats) @@ -312,24 +311,25 @@ private[clustering] object BisectingKMeans { if (dividableClusterStats.isEmpty) { return data } + // extract dividable input data val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} - + // get next initial centers var newCenters = initNextCenters(dividableData, dividableClusterStats) - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val bcMetric = sc.broadcast(metric) - // pairs of cluster index and (sums, 
#points, sumOfSquares) - var stats = Map.empty[Long, (BV[Double], Long, Double)] - var nextData = data var subIter = 0 var totalSumOfSquares = Double.MaxValue var oldTotalSumOfSquares = Double.MaxValue var relativeError = Double.MaxValue val dimension = dividableData.first()._2.size - // TODO add a set method for the threshold, instead of 1e-4 + + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val bcMetric = sc.broadcast(metric) + while (subIter < maxIterations && relativeError > 1e-4) { + // TODO add a set method for the threshold, instead of 1e-4 + // convert each index into the closest child index val bcNewCenters = sc.broadcast(newCenters) nextData = dividableData.map { case (idx, point) => @@ -360,7 +360,7 @@ private[clustering] object BisectingKMeans { // calculate the center of each cluster newCenters = tempStats.map {case (idx, (sums, n, sumOfNorm)) => (idx, sums :/ n.toDouble)} - totalSumOfSquares = stats.map{case (idx, (sums, n, sumOfNorm)) => sumOfNorm}.sum + totalSumOfSquares = tempStats.map{case (idx, (sums, n, sumOfNorm)) => sumOfNorm}.sum relativeError = math.abs(totalSumOfSquares - oldTotalSumOfSquares) / totalSumOfSquares oldTotalSumOfSquares = totalSumOfSquares subIter += 1 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 1177e3b293de2..f0947e336cf7c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -28,7 +28,8 @@ import org.apache.spark.mllib.util.TestingUtils._ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("run") { - val algo = new BisectingKMeans().setK(123).setSeed(1) + val k = 123 + val algo = new BisectingKMeans().setK(k).setSeed(1) val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) @@ -40,6 +41,9 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model.node.getChildren.head.getParent.get === model.node) assert(model.node.getChildren.apply(1).getParent.get === model.node) assert(model.getClusters.forall(_.getParent.isDefined)) + + val predicted = model.predict(data) + assert(predicted.distinct.count() === k) } test("run with too many cluster size than the records") { From a50689a144bf5801c831d0b2f33eb6435e87f929 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 16:13:55 -0700 Subject: [PATCH 74/76] Improve `initNextCenters` --- .../mllib/clustering/BisectingKMeans.scala | 63 +++++++------------ .../clustering/BisectingKMeansSuite.scala | 6 +- 2 files changed, 26 insertions(+), 43 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 9354a53884b33..1601d6c84e217 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -17,8 +17,9 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{Vector => BV, norm => breezeNorm} +import breeze.linalg.{Vector => BV, SparseVector => BSV, norm => breezeNorm} +import org.apache.spark.util.random.XORShiftRandom 
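// The XORShiftRandom import added above supports the new seeding strategy in
// initNextCenters below: rather than sampling two extreme points per cluster,
// the parent center is perturbed in both directions to obtain the two child
// seeds. A rough sketch of that idea, with illustrative names (`stat`, `noise`)
// that are not part of this patch:
//
//   val stdev = math.sqrt(stat.sumOfSquares) / stat.rows   // cluster spread
//   // noise: a vector of uniform draws in [0, stdev) over the center's indices
//   val (leftSeed, rightSeed) = (stat.mean - noise, stat.mean + noise)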
import org.apache.spark.{Logging, SparkException} import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{Vector, Vectors} @@ -138,7 +139,7 @@ class BisectingKMeans private ( // can be clustered if the number of divided clusterStats is equal to 0 // TODO Remove non-leaf cluster stats from `leafClusterStats` - val dividedData = divideClusters(data, dividableLeafClusters, maxIterations).cache() + val dividedData = divideClusters(data, dividableLeafClusters, maxIterations, seed).cache() leafClusterStats = summarizeClusters(dividedData) dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) clusterStats = clusterStats ++ leafClusterStats @@ -248,47 +249,24 @@ private[clustering] object BisectingKMeans { /** * Gets the initial centers for bisecting k-means * - * @param data pairs of point and its cluster index * @param stats pairs of cluster index and cluster statistics + * @param seed random seed */ def initNextCenters( - data: RDD[(Long, BV[Double])], - stats: collection.Map[Long, BisectingClusterStat]): collection.Map[Long, BV[Double]] = { - - // Since the combination sampleByKey and groupByKey is more expensive, - // this as follows would be better. - val bcIndeces = data.sparkContext.broadcast(stats.keySet) - val samples = data.mapPartitions { iter => - val map = collection.mutable.Map.empty[Long, collection.mutable.ArrayBuffer[BV[Double]]] - - bcIndeces.value.foreach {i => map(i) = collection.mutable.ArrayBuffer.empty[BV[Double]]} - val LOCAL_SAMPLE_SIZE = 100 - iter.foreach { case (i, point) => - map(i).append(point) - // to avoid to increase the memory usage on each map thread, - // the number of elements is cut off at the right time. - if (map(i).size > LOCAL_SAMPLE_SIZE) { - val elements = map(i).sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) - map(i) = collection.mutable.ArrayBuffer(elements.head, elements.last) - } - } + stats: collection.Map[Long, BisectingClusterStat], + seed: Long + ): collection.Map[Long, BV[Double]] = { - // in order to reduce the shuffle size, take only two elements - map.filterNot(_._2.isEmpty).map { case (i, points) => - val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) - i -> collection.mutable.ArrayBuffer(elements.head, elements.last) - }.toIterator - }.reduceByKey { case (points1, points2) => - points1.union(points2) - }.collect() - - val nextCenters = samples.flatMap { case (i, points) => - val elements = points.toSeq.sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) - Array((2 * i, elements.head), (2 * i + 1, elements.last)) + val random = new XORShiftRandom() + random.setSeed(seed) + val nextCenters = stats.flatMap { case (idx, clusterStats) => + val center = clusterStats.mean + val stdev = math.sqrt(clusterStats.sumOfSquares) / clusterStats.rows + val activeKeys = clusterStats.mean.activeKeysIterator.toArray + val activeValues = activeKeys.map(i => random.nextDouble() * stdev) + val perturbation = new BSV[Double](activeKeys, activeValues, clusterStats.mean.size) + Array((2 * idx, center - perturbation), (2 * idx + 1, center + perturbation)) }.toMap - if (!stats.keySet.flatMap(idx => Array(2 * idx, 2 * idx + 1)).forall(nextCenters.contains(_))) { - throw new SparkException("Failed to initialize centers for next step") - } nextCenters } @@ -298,11 +276,15 @@ private[clustering] object BisectingKMeans { * @param data pairs of point and its cluster index * @param clusterStats target clusters to divide * @param maxIterations the maximum iterations to calculate 
clusters statistics + * @param seed random seed */ def divideClusters( data: RDD[(Long, BV[Double])], clusterStats: collection.Map[Long, BisectingClusterStat], - maxIterations: Int): RDD[(Long, BV[Double])] = { + maxIterations: Int, + seed: Long + ): RDD[(Long, BV[Double])] = { + val sc = data.sparkContext val appName = sc.appName @@ -315,7 +297,7 @@ private[clustering] object BisectingKMeans { // extract dividable input data val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} // get next initial centers - var newCenters = initNextCenters(dividableData, dividableClusterStats) + var newCenters = initNextCenters(dividableClusterStats, seed) var nextData = data var subIter = 0 var totalSumOfSquares = Double.MaxValue @@ -596,6 +578,7 @@ private[clustering] case class BisectingClusterStat ( rows: Long, mean: BV[Double], sumOfSquares: Double) extends Serializable { + require(sumOfSquares >= 0.0) def isDividable: Boolean = sumOfSquares > 0 && rows >= 2 } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index f0947e336cf7c..74e12d00c2022 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -116,7 +116,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { 2L -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, 0.0), 3L -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, 0.0) ) - val initNextCenters = BisectingKMeans.initNextCenters(data, stats) + val initNextCenters = BisectingKMeans.initNextCenters(stats, 1) assert(initNextCenters.size === 4) assert(initNextCenters.keySet === Set(4, 5, 6, 7)) } @@ -140,7 +140,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val data = sc.parallelize(seed, 1) val leafClusterStats = BisectingKMeans.summarizeClusters(data) val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) - val result = BisectingKMeans.divideClusters(data, dividableLeafClusters, 20).collect() + val result = BisectingKMeans.divideClusters(data, dividableLeafClusters, 20, 1).collect() val expected = Seq( (4, Vectors.dense(0.0, 0.0)), (4, Vectors.dense(1.0, 1.0)), (4, Vectors.dense(2.0, 2.0)), @@ -185,7 +185,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { ) val data = sc.parallelize(local, 1) val stats = BisectingKMeans.summarizeClusters(data) - val dividedData = BisectingKMeans.divideClusters(data, stats, 20).collect() + val dividedData = BisectingKMeans.divideClusters(data, stats, 20, 1).collect() assert(dividedData(0) == (4L, BV[Double](0.9, 0.9))) assert(dividedData(1) == (4L, BV[Double](1.1, 1.1))) From d422be759b967ab5c4bef0f6a34aa53f1a3a4c77 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Mon, 9 Nov 2015 00:13:21 -0800 Subject: [PATCH 75/76] refactor --- .../mllib/clustering/BisectingKMeans.scala | 771 ++++++++---------- .../clustering/BisectingKMeansModel.scala | 73 +- .../clustering/JavaBisectingKMeansSuite.java | 83 +- .../BisectingKMeansModelSuite.scala | 129 --- .../clustering/BisectingKMeansSuite.scala | 296 ++++--- 5 files changed, 537 insertions(+), 815 deletions(-) delete mode 100644 mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 1601d6c84e217..9a7916f75dcc7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -17,53 +17,45 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{Vector => BV, SparseVector => BSV, norm => breezeNorm} +import java.util.Random -import org.apache.spark.util.random.XORShiftRandom -import org.apache.spark.{Logging, SparkException} -import org.apache.spark.annotation.Since -import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.rdd.RDD +import scala.collection.mutable +import org.apache.spark.Logging +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.RDD +import org.apache.spark.storage.StorageLevel /** - * This is a divisive hierarchical clustering algorithm based on bisecting k-means algorithm. - * - * The main idea of this algorithm is based on "A comparison of document clustering techniques", - * M. Steinbach, G. Karypis and V. Kumar. Workshop on Text Mining, KDD, 2000. - * http://cs.fit.edu/~pkc/classes/ml-internet/papers/steinbach00tr.pdf - * - * However, we modified it to fit for Spark. This algorithm consists of the two main parts. + * A bisecting k-means algorithm based on the paper "A comparison of document clustering techniques" + * by Steinbach, Karypis, and Kumar, with modification to fit Spark. + * The algorithm starts from a single cluster that contains all points. + * Iteratively it finds divisible clusters on the bottom level and bisects each of them using + * k-means, until there are `k` leaf clusters in total or no leaf clusters are divisible. + * The bisecting steps of clusters on the same level are grouped together to increase parallelism. + * If bisecting all divisible clusters on the bottom level would result more than `k` leaf clusters, + * larger clusters get higher priority. * - * 1. Split clusters until the number of clusters will be enough to build a cluster tree - * 2. Build a cluster tree as a binary tree by the splitted clusters + * @param k the desired number of leaf clusters (default: 4). The actual number could be smaller if + * there are no divisible leaf clusters. + * @param maxIterations the max number of k-means iterations to split clusters (default: 20) + * @param minDivisibleClusterSize the minimum number of points (if >= 1.0) or the minimum proportion + * of points (if < 1.0) of a divisible cluster (default: 1) + * @param seed a random seed (default: hash value of the class name) * - * First, it splits clusters to their children clusters step by step, not considering a cluster - * will be included in the final cluster tree or not. That's because it makes the algorithm more - * efficient on Spark and splitting a cluster one by one is very slow. It will keep splitting until - * the number of clusters will be enough to build a cluster tree. Otherwise, it will stop splitting - * when there are no dividable clusters before the number of clusters will be sufficient. And - * it calculates the costs, such as average cost, entropy and so on, for building a cluster - * tree in the first part. The costs means how large the cluster is. That is, the cluster - * whose cost is maximum of all the clusters is the largest cluster. 
- * - * Second, it builds a cluster tree as a binary tree by the result of the first part. - * First of all, the cluster tree starts with only the root cluster which includes all points. - * So, there are two candidates which can be merged to the cluster tree. Those are the children of - * the root. Then, it picks up the larger child of the two and merge it to the cluster tree. - * After that, there are tree candidates to merge. Those are the smaller child of the root and - * the two children of the larger cluster of the root. It picks up the largest cluster of the tree - * and merge it to the * cluster tree. Like this, it continues to pick up the largest one of the - * candidates and merge it to the cluster tree until the desired number of clusters is reached. - * - * @param k tne desired number of clusters - * @param maxIterations the number of maximal iterations to split clusters - * @param seed a random seed + * @see [[http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf + * Steinbach, Karypis, and Kumar, A comparison of document clustering techniques, + * KDD Workshop on Text Mining, 2000.]] */ @Since("1.6.0") +@Experimental class BisectingKMeans private ( private var k: Int, private var maxIterations: Int, + private var minDivisibleClusterSize: Double, private var seed: Long) extends Logging { import BisectingKMeans._ @@ -72,34 +64,62 @@ class BisectingKMeans private ( * Constructs with the default configuration */ @Since("1.6.0") - def this() = this(2, 20, 1) + def this() = this(4, 20, 1.0, classOf[BisectingKMeans].getName.##) /** - * Sets the number of clusters you want + * Sets the desired number of leaf clusters (default: 4). + * The actual number could be smaller if there are no divisible leaf clusters. */ @Since("1.6.0") def setK(k: Int): this.type = { + require(k > 0, s"k must be positive but got $k.") this.k = k this } + /** + * Gets the desired number of leaf clusters. + */ @Since("1.6.0") def getK: Int = this.k /** - * Sets the number of maximal iterations in each clustering step + * Sets the max number of k-means iterations to split clusters (default: 20). */ @Since("1.6.0") def setMaxIterations(maxIterations: Int): this.type = { + require(maxIterations > 0, s"maxIterations must be positive but got $maxIterations.") this.maxIterations = maxIterations this } + /** + * Gets the max number of k-means iterations to split clusters. + */ @Since("1.6.0") def getMaxIterations: Int = this.maxIterations /** - * Sets the random seed + * Sets the minimum number of points (if >= `1.0`) or the minimum proportion of points + * (if < `1.0`) of a divisible cluster (default: 1). + */ + @Since("1.6.0") + def setMinDivisibleClusterSize(minDivisibleClusterSize: Double): this.type = { + require(minDivisibleClusterSize > 0.0, + s"minDivisibleClusterSize must be positive but got $minDivisibleClusterSize.") + this.minDivisibleClusterSize = minDivisibleClusterSize + this + } + + /** + * Gets the minimum number of points (if >= `1.0`) or the minimum proportion of points + * (if < `1.0`) of a divisible cluster. + */ + @Since("1.6.0") + def getMinDivisibleClusterSize: Double = minDivisibleClusterSize + + /** + * Sets the random seed (default: hash value of the class name). */ @Since("1.6.0") def setSeed(seed: Long): this.type = { @@ -107,478 +127,363 @@ class BisectingKMeans private ( this } + /** + * Gets the random seed. + */ @Since("1.6.0") def getSeed: Long = this.seed /** - * Runs the bisecting k-means algorithm + * Runs the bisecting k-means algorithm. 
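   * An illustrative call sequence (the data and parameter values below are
   * hypothetical examples, not taken from this patch):
   * {{{
   *   val points = sc.parallelize(Seq(
   *     Vectors.dense(0.0, 0.0), Vectors.dense(1.0, 1.0),
   *     Vectors.dense(9.0, 9.0), Vectors.dense(10.0, 10.0)))
   *   val model = new BisectingKMeans().setK(2).setMaxIterations(20).setSeed(1L).run(points)
   *   val wssse = model.computeCost(points)
   * }}}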
* @param input RDD of vectors * @return model for the bisecting kmeans */ @Since("1.6.0") def run(input: RDD[Vector]): BisectingKMeansModel = { - val sc = input.sparkContext - val startTime = System.currentTimeMillis() - var data = initData(input).cache() - var updatedDataHistory = Array.empty[RDD[(Long, BV[Double])]] - - // `clusterStats` is described as binary tree structure as Map - // `clusterStats(1)` means the root of a binary tree - // `clusterStats(2n)` and `clusterStats(2n+1)` are the children of `clusterStats(n)` - var leafClusterStats = summarizeClusters(data) - var dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) - var clusterStats = leafClusterStats - - // the minimum number of nodes of a binary tree by given parameter - var step = 1 - val numNodeLimit = getMinimumNumNodesInTree(this.k) - // divide clusters until the number of clusters reachs the condition - // or there is no dividable cluster - while (clusterStats.size < numNodeLimit && dividableLeafClusters.nonEmpty) { - logInfo(s"${sc.appName} starts step ${step}") - - // can be clustered if the number of divided clusterStats is equal to 0 - // TODO Remove non-leaf cluster stats from `leafClusterStats` - val dividedData = divideClusters(data, dividableLeafClusters, maxIterations, seed).cache() - leafClusterStats = summarizeClusters(dividedData) - dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) - clusterStats = clusterStats ++ leafClusterStats - - // keep recent 2 cached RDDs in order to run more quickly - updatedDataHistory = updatedDataHistory ++ Array(dividedData) - data = dividedData - step += 1 - if (updatedDataHistory.length > 1) { - val head = updatedDataHistory.head - updatedDataHistory = updatedDataHistory.tail - head.unpersist() - } + if (input.getStorageLevel == StorageLevel.NONE) { + logWarning(s"The input RDD ${input.id} is not directly cached, which may hurt performance if" + + " its parent RDDs are also not cached.") } - // create a map of cluster node with their costs - val nodes = createClusterNodes(data, clusterStats) - // unpersist RDDs - data.unpersist() - updatedDataHistory.foreach(_.unpersist()) - - // build a cluster tree by Map class which is expressed - logInfo(s"Building the cluster tree is started in ${sc.appName}") - val root = buildTree(nodes, ROOT_INDEX_KEY, this.k) - if (root.isEmpty) { - new SparkException("Failed to build a cluster tree from a Map type of clusterStats") + val d = input.map(_.size).first() + logInfo(s"Feature dimension: $d.") + // Compute and cache vector norms for fast distance computation. 
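    // Together with the center norms, the cached point norms let the assignment
    // steps evaluate squared Euclidean distances as
    //   ||x - c||^2 = ||x||^2 + ||c||^2 - 2 * (x dot c)
    // so each point's norm is computed once here rather than once per candidate
    // center, and the same cached norm also feeds the cost aggregation below.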
+ val norms = input.map(v => Vectors.norm(v, 2.0)).persist(StorageLevel.MEMORY_AND_DISK) + val vectors = input.zip(norms).map { case (x, norm) => new VectorWithNorm(x, norm) } + var assignments = vectors.map(v => (ROOT_INDEX, v)) + var activeClusters = summarize(d, assignments) + val rootSummary = activeClusters(ROOT_INDEX) + val n = rootSummary.size + logInfo(s"Number of points: $n.") + logInfo(s"Initial cost: ${rootSummary.cost}.") + val minSize = if (minDivisibleClusterSize >= 1.0) { + math.ceil(minDivisibleClusterSize).toLong + } else { + math.ceil(minDivisibleClusterSize * n).toLong } - - // set the elapsed time for training - val finishTime = (System.currentTimeMillis() - startTime) / 1000.0 - logInfo(s"Elapsed Time for ${this.getClass.getSimpleName} Training: ${finishTime} [sec]") - - // make a bisecting kmeans model - val model = new BisectingKMeansModel(root.get) - val leavesNodes = model.getClusters - if (leavesNodes.length < this.k) { - logWarning(s"# clusters is less than you want: ${leavesNodes.length} / ${k}") + logInfo(s"The minimum number of points of a divisible cluster is $minSize.") + var inactiveClusters = mutable.Seq.empty[(Long, ClusterSummary)] + val random = new Random(seed) + var numLeafClustersNeeded = k - 1 + var level = 1 + while (activeClusters.nonEmpty && numLeafClustersNeeded > 0 && level < 63) { + // Divisible clusters are sufficiently large and have non-trivial cost. + var divisibleClusters = activeClusters.filter { case (_, summary) => + (summary.size >= minSize) && (summary.cost > MLUtils.EPSILON * summary.size) + } + // If we don't need all divisible clusters, take the larger ones. + if (divisibleClusters.size > numLeafClustersNeeded) { + divisibleClusters = divisibleClusters.toSeq.sortBy { case (_, summary) => + -summary.size + }.take(numLeafClustersNeeded) + .toMap + } + if (divisibleClusters.nonEmpty) { + val divisibleIndices = divisibleClusters.keys.toSet + logInfo(s"Dividing ${divisibleIndices.size} clusters on level $level.") + var newClusterCenters = divisibleClusters.flatMap { case (index, summary) => + val (left, right) = splitCenter(summary.center, random) + Iterator((leftChildIndex(index), left), (rightChildIndex(index), right)) + }.map(identity) // workaround for a Scala bug (SI-7005) that produces a not serializable map + var newClusters: Map[Long, ClusterSummary] = null + var newAssignments: RDD[(Long, VectorWithNorm)] = null + for (iter <- 0 until maxIterations) { + newAssignments = updateAssignments(assignments, divisibleIndices, newClusterCenters) + .filter { case (index, _) => + divisibleIndices.contains(parentIndex(index)) + } + newClusters = summarize(d, newAssignments) + newClusterCenters = newClusters.mapValues(_.center).map(identity) + } + // TODO: Unpersist old indices. + val indices = updateAssignments(assignments, divisibleIndices, newClusterCenters).keys + .persist(StorageLevel.MEMORY_AND_DISK) + assignments = indices.zip(vectors) + inactiveClusters ++= activeClusters + activeClusters = newClusters + numLeafClustersNeeded -= divisibleClusters.size + } else { + logInfo(s"None active and divisible clusters left on level $level. 
Stop iterations.") + inactiveClusters ++= activeClusters + activeClusters = Map.empty + } + level += 1 } - model + val clusters = activeClusters ++ inactiveClusters + val root = buildTree(clusters) + new BisectingKMeansModel(root) } + + /** + * Java-friendly version of [[run(RDD[Vector])*]] + */ + def run(data: JavaRDD[Vector]): BisectingKMeansModel = run(data.rdd) } +private object BisectingKMeans extends Serializable { -private[clustering] object BisectingKMeans { + /** The index of the root node of a tree. */ + private val ROOT_INDEX: Long = 1 - val ROOT_INDEX_KEY: Long = 1 + private val MAX_DIVISIBLE_CLUSTER_INDEX: Long = Long.MaxValue / 2 - /** - * Finds the closes cluster's center - * - * @param metric a distance metric - * @param centers centers of the clusters - * @param point a target point - * @return an index of the array of clusters - */ - def findClosestCenter(metric: (BV[Double], BV[Double]) => Double) - (centers: Seq[BV[Double]])(point: BV[Double]): Int = { - // get the closest index - centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1)._2 - } - - /** - * Gets the minimum number of nodes in a tree by the number of leaves - * - * @param k: the number of leaf nodes - */ - def getMinimumNumNodesInTree(k: Int): Int = { - // the calculation is same as `math.pow(2, multiplier)` - val multiplier = math.ceil(math.log(k) / math.log(2.0)) + 1 - 1 << multiplier.toInt + /** Returns the left child index of the given node index. */ + private def leftChildIndex(index: Long): Long = { + require(index <= MAX_DIVISIBLE_CLUSTER_INDEX, s"Child index out of bound: 2 * $index.") + 2 * index } - /** - * Summarizes data by each cluster as Map - * - * @param data pairs of point and its cluster index - */ - def summarizeClusters( - data: RDD[(Long, BV[Double])] - ): collection.Map[Long, BisectingClusterStat] = { - - // sum the number of node and points of each cluster - val stats = data.map {case (idx, p) => - (idx, (p, 1L)) - }.reduceByKey {case ((p1, n1), (p2, n2)) => (p1 + p2, n1 + n2) }.collectAsMap() - - // calculate within-cluster sum of squares of each cluster - val bcStats = data.sparkContext.broadcast(stats) - val sumOfSquaresMap = data.map { case (idx, point) => - val meanPoint = bcStats.value.apply(idx)._1 :/ bcStats.value.apply(idx)._2.toDouble - (idx, (point - meanPoint) dot (point - meanPoint)) - }.reduceByKey(_ + _).collectAsMap() - - stats.map { case (idx, (sumPoint, n)) => - val meanPoint = sumPoint :/ n.toDouble - val sumOfSquares = math.abs(sumOfSquaresMap(idx)) - (idx, new BisectingClusterStat(n, meanPoint, sumOfSquares)) - } + /** Returns the right child index of the given node index. */ + private def rightChildIndex(index: Long): Long = { + require(index <= MAX_DIVISIBLE_CLUSTER_INDEX, s"Child index out of bound: 2 * $index + 1.") + 2 * index + 1 } - /** - * Assigns the initial cluster index id to all data - */ - def initData(data: RDD[Vector]): RDD[(Long, BV[Double])] = { - data.map { v: Vector => (ROOT_INDEX_KEY, v.toBreeze)} + /** Returns the parent index of the given node index, or 0 if the input is 1 (root). */ + private def parentIndex(index: Long): Long = { + index / 2 } /** - * Gets the initial centers for bisecting k-means - * - * @param stats pairs of cluster index and cluster statistics - * @param seed random seed + * Summarizes data by each cluster as Map. 
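   * For example, a hypothetical assignment of the two points (0.0, 0.0) and
   * (2.0, 2.0) to cluster index 1 yields a map with a single entry whose summary
   * has size 2, center (1.0, 1.0) and cost 4.0, since each point lies at squared
   * distance 2.0 from the center. The points and numbers are illustrative only.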
+ * @param d feature dimension + * @param assignments pairs of point and its cluster index + * @return a map from cluster indices to corresponding cluster summaries */ - def initNextCenters( - stats: collection.Map[Long, BisectingClusterStat], - seed: Long - ): collection.Map[Long, BV[Double]] = { - - val random = new XORShiftRandom() - random.setSeed(seed) - val nextCenters = stats.flatMap { case (idx, clusterStats) => - val center = clusterStats.mean - val stdev = math.sqrt(clusterStats.sumOfSquares) / clusterStats.rows - val activeKeys = clusterStats.mean.activeKeysIterator.toArray - val activeValues = activeKeys.map(i => random.nextDouble() * stdev) - val perturbation = new BSV[Double](activeKeys, activeValues, clusterStats.mean.size) - Array((2 * idx, center - perturbation), (2 * idx + 1, center + perturbation)) - }.toMap - nextCenters + private def summarize( + d: Int, + assignments: RDD[(Long, VectorWithNorm)]): Map[Long, ClusterSummary] = { + assignments.aggregateByKey(new ClusterSummaryAggregator(d))( + seqOp = (agg, v) => agg.add(v), + combOp = (agg1, agg2) => agg1.merge(agg2) + ).mapValues(_.summary) + .collect().toMap } /** - * Divides clusters according to their statistics - * - * @param data pairs of point and its cluster index - * @param clusterStats target clusters to divide - * @param maxIterations the maximum iterations to calculate clusters statistics - * @param seed random seed + * Cluster summary aggregator. + * @param d feature dimension */ - def divideClusters( - data: RDD[(Long, BV[Double])], - clusterStats: collection.Map[Long, BisectingClusterStat], - maxIterations: Int, - seed: Long - ): RDD[(Long, BV[Double])] = { - - val sc = data.sparkContext - val appName = sc.appName - - // get keys of dividable clusters - val dividableClusterStats = clusterStats.filter { case (idx, cluster) => cluster.isDividable } - if (dividableClusterStats.isEmpty) { - return data + private class ClusterSummaryAggregator(val d: Int) extends Serializable { + private var n: Long = 0L + private val sum: Vector = Vectors.zeros(d) + private var sumSq: Double = 0.0 + + /** Adds a point. 
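+     * Only the running count, the coordinate-wise sum and the sum of squared norms are
+     * updated; the cost reported by `summary` then follows from the identity
+     * sum(|x - mu|^2) = sum(|x|^2) - n * |mu|^2, where mu = sum(x) / n is the center.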
*/ + def add(v: VectorWithNorm): this.type = { + n += 1L + // TODO: use a numerically stable approach to estimate cost + sumSq += v.norm * v.norm + BLAS.axpy(1.0, v.vector, sum) + this } - // extract dividable input data - val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} - // get next initial centers - var newCenters = initNextCenters(dividableClusterStats, seed) - var nextData = data - var subIter = 0 - var totalSumOfSquares = Double.MaxValue - var oldTotalSumOfSquares = Double.MaxValue - var relativeError = Double.MaxValue - val dimension = dividableData.first()._2.size - - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val bcMetric = sc.broadcast(metric) - - while (subIter < maxIterations && relativeError > 1e-4) { - // TODO add a set method for the threshold, instead of 1e-4 - - // convert each index into the closest child index - val bcNewCenters = sc.broadcast(newCenters) - nextData = dividableData.map { case (idx, point) => - // calculate next index number - val childIndexes = Array(2 * idx, 2 * idx + 1) - val childrenCenters = childIndexes - .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) - if (childrenCenters.length != 2) { - new SparkException(s"A node whose index is ${idx} doesn't have two children") - } - val closestIndex = findClosestCenter(bcMetric.value)(childrenCenters)(point) - val nextIndex = 2 * idx + closestIndex - (nextIndex, point) - } + /** Merges another aggregator. */ + def merge(other: ClusterSummaryAggregator): this.type = { + n += other.n + sumSq += other.sumSq + BLAS.axpy(1.0, other.sum, sum) + this + } - // summarize each cluster - val zeroValue = (BV.zeros[Double](dimension), 0L, 0.0) - val seqOp = (acc: (BV[Double], Long, Double), point: BV[Double]) => { - val sums = acc._1 + point - val n = acc._2 + 1L - val sumOfNorm = acc._3 + (point dot point) - (sums, n, sumOfNorm) + /** Returns the summary. */ + def summary: ClusterSummary = { + val mean = sum.copy + if (n > 0L) { + BLAS.scal(1.0 / n, mean) } - val comOp = (acc1: (BV[Double], Long, Double), acc2: (BV[Double], Long, Double)) => - (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc1._3) - val tempStats = nextData.aggregateByKey(zeroValue)(seqOp, comOp).collectAsMap() - - // calculate the center of each cluster - newCenters = tempStats.map {case (idx, (sums, n, sumOfNorm)) => (idx, sums :/ n.toDouble)} - - totalSumOfSquares = tempStats.map{case (idx, (sums, n, sumOfNorm)) => sumOfNorm}.sum - relativeError = math.abs(totalSumOfSquares - oldTotalSumOfSquares) / totalSumOfSquares - oldTotalSumOfSquares = totalSumOfSquares - subIter += 1 + val center = new VectorWithNorm(mean) + val cost = math.max(sumSq - n * center.norm * center.norm, 0.0) + new ClusterSummary(n, center, cost) } - nextData } /** - * Creates the map of cluster stats to the map of cluster nodes with their costs + * Bisects a cluster center. 
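+   * The two seed centers are the parent center shifted by a small perturbation in opposite
+   * directions. A sketch with a hypothetical center and noise vector:
+   * {{{
+   *   // center = [2.0, 0.0], norm = 2.0, level = 1e-4 * 2.0
+   *   // noise  = [0.5, 0.5]            (entries drawn uniformly from [0, 1))
+   *   // left   = [2.0 - 1e-4, -1e-4]
+   *   // right  = [2.0 + 1e-4, +1e-4]
+   * }}}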
* - * @param data input data - * @param stats map of cluster stats which is described as a binary tree + * @param center current cluster center + * @param random a random number generator + * @return initial centers */ - def createClusterNodes( - data: RDD[(Long, BV[Double])], - stats: collection.Map[Long, BisectingClusterStat] - ): collection.Map[Long, BisectingClusterNode] = { - // TODO: support other cost, such as entropy - createClusterNodesWithAverageCost(data, stats) + private def splitCenter( + center: VectorWithNorm, + random: Random): (VectorWithNorm, VectorWithNorm) = { + val d = center.vector.size + val norm = center.norm + val level = 1e-4 * norm + val noise = Vectors.dense(Array.fill(d)(random.nextDouble())) + val left = center.vector.copy + BLAS.axpy(-level, noise, left) + val right = center.vector.copy + BLAS.axpy(level, noise, right) + (new VectorWithNorm(left), new VectorWithNorm(right)) } /** - * Creates the map of cluster stats to the map of cluster nodes with their average costs + * Updates assignments. + * @param assignments current assignments + * @param divisibleIndices divisible cluster indices + * @param newClusterCenters new cluster centers + * @return new assignments */ - private def createClusterNodesWithAverageCost( - data: RDD[(Long, BV[Double])], - stats: collection.Map[Long, BisectingClusterStat] - ): collection.Map[Long, BisectingClusterNode] = { - - stats.map { case (idx, clusterStats) => - val rows = clusterStats.rows - val center = clusterStats.mean - val cost = math.sqrt(clusterStats.sumOfSquares) / rows - idx -> new BisectingClusterNode(Vectors.fromBreeze(center), rows, cost) + private def updateAssignments( + assignments: RDD[(Long, VectorWithNorm)], + divisibleIndices: Set[Long], + newClusterCenters: Map[Long, VectorWithNorm]): RDD[(Long, VectorWithNorm)] = { + assignments.map { case (index, v) => + if (divisibleIndices.contains(index)) { + val children = Seq(leftChildIndex(index), rightChildIndex(index)) + val selected = children.minBy { child => + KMeans.fastSquaredDistance(newClusterCenters(child), v) + } + (selected, v) + } else { + (index, v) + } } } /** - * Builds a cluster tree from a Map of clusters - * - * @param treeMap divided clusters as a Map class - * @param rootIndex index you want to start - * @param k the number of clusters you want - * @return a built cluster tree + * Builds a clustering tree by re-indexing internal and leaf clusters. 
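+   * The raw divisible-cluster indices (root = 1, children of node i at 2i and 2i + 1) are
+   * replaced while building: leaf nodes are numbered 0, 1, 2, ... from left to right, so
+   * predictions fall in [0, number of leaf clusters), and internal nodes are numbered
+   * -1, -2, ... in depth-first order.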
+ * @param clusters a map from cluster indices to corresponding cluster summaries + * @return the root node of the clustering tree */ - private def buildTree( - treeMap: collection.Map[Long, BisectingClusterNode], - rootIndex: Long, - k: Int): Option[BisectingClusterNode] = { - - // if there is no index in the Map - if (!treeMap.contains(rootIndex)) return None - - // build a cluster tree if the queue is empty or until the number of leaf clusters is enough - var numLeavesClusters = 1 - val root = treeMap(rootIndex) - var leavesQueue = Map(rootIndex -> root) - while (leavesQueue.nonEmpty && numLeavesClusters < k) { - // pick up the largest cluster by the maximum cost of all the clusters - val mostScattered = leavesQueue.maxBy(_._2.cost) - val mostScatteredKey = mostScattered._1 - val mostScatteredCluster = mostScattered._2 - - // relate the most scattered cluster to its children clusters - val childrenIndexes = Array(2 * mostScatteredKey, 2 * mostScatteredKey + 1) - if (childrenIndexes.forall(i => treeMap.contains(i))) { - // insert children to the most scattered cluster - val children = childrenIndexes.map(i => treeMap(i)) - mostScatteredCluster.insert(children) - - // calculate the local dendrogram height - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val localHeight = children - .map(child => metric(child.center.toBreeze, mostScatteredCluster.center.toBreeze)).max - mostScatteredCluster.setLocalHeight(localHeight) - - // update the queue - leavesQueue = leavesQueue ++ childrenIndexes.map(i => i -> treeMap(i)).toMap - numLeavesClusters += 1 + private def buildTree(clusters: Map[Long, ClusterSummary]): ClusteringTreeNode = { + var leafIndex = 0 + var internalIndex = -1 + + /** + * Builds a subtree from this given node index. + */ + def buildSubTree(rawIndex: Long): ClusteringTreeNode = { + val cluster = clusters(rawIndex) + val size = cluster.size + val center = cluster.center + val cost = cluster.cost + val isInternal = clusters.contains(leftChildIndex(rawIndex)) + if (isInternal) { + val index = internalIndex + internalIndex -= 1 + val leftIndex = leftChildIndex(rawIndex) + val rightIndex = rightChildIndex(rawIndex) + val height = math.sqrt(Seq(leftIndex, rightIndex).map { childIndex => + KMeans.fastSquaredDistance(center, clusters(childIndex).center) + }.max) + val left = buildSubTree(leftIndex) + val right = buildSubTree(rightIndex) + new ClusteringTreeNode(index, size, center, cost, height, Array(left, right)) + } else { + val index = leafIndex + leafIndex += 1 + val height = 0.0 + new ClusteringTreeNode(index, size, center, cost, height, Array.empty) } - - // remove the cluster which is involved to the cluster tree - leavesQueue = leavesQueue.filterNot(_ == mostScattered) } - Some(root) + + buildSubTree(ROOT_INDEX) } + + /** + * Summary of a cluster. + * + * @param size the number of points within this cluster + * @param center the center of the points within this cluster + * @param cost the sum of squared distances to the center + */ + private case class ClusterSummary(size: Long, center: VectorWithNorm, cost: Double) } /** - * A cluster as a tree node which can have its sub nodes + * Represents a node in a clustering tree. 
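+ * For example, one possible tree with three leaf clusters and the node indices assigned by
+ * [[BisectingKMeans]]:
+ * {{{
+ *        -1 (root)
+ *       /         \
+ *     0 (leaf)     -2
+ *                 /   \
+ *           1 (leaf)  2 (leaf)
+ * }}}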
* - * @param center the center of the cluster - * @param rows the number of rows in the cluster - * @param cost how large a cluster is - * @param localHeight the maximal distance between this node and its children - * @param parent the parent cluster of the cluster - * @param children the children nodes of the cluster + * @param index node index, negative for internal nodes and non-negative for leaf nodes + * @param size size of the cluster + * @param centerWithNorm cluster center with norm + * @param cost cost of the cluster, i.e., the sum of squared distances to the center + * @param height height of the node in the dendrogram. Currently this is defined as the max distance + * from the center to the centers of the children's, but subject to change. + * @param children children nodes */ @Since("1.6.0") -class BisectingClusterNode private ( - @Since("1.6.0") val center: Vector, - @Since("1.6.0") val rows: Long, - @Since("1.6.0") val cost: Double, - private var localHeight: Double, - private var parent: Option[BisectingClusterNode], - private var children: Seq[BisectingClusterNode]) extends Serializable { - - require(!cost.isNaN) - - @Since("1.6.0") - def this(center: Vector, rows: Long, cost: Double) = - this(center, rows, cost, 0.0, None, Array.empty[BisectingClusterNode]) - - /** - * Inserts a sub node as its child - * - * @param child inserted sub node - */ - @Since("1.6.0") - def insert(child: BisectingClusterNode) { - insert(Array(child)) +@Experimental +class ClusteringTreeNode private[clustering] ( + val index: Int, + val size: Long, + private val centerWithNorm: VectorWithNorm, + val cost: Double, + val height: Double, + val children: Array[ClusteringTreeNode]) extends Serializable { + + /** Whether this is a leaf node. */ + val isLeaf: Boolean = children.isEmpty + + require((isLeaf && index >= 0) || (!isLeaf && index < 0)) + + /** Cluster center. */ + def center: Vector = centerWithNorm.vector + + /** Predicts the leaf cluster node index that the input point belongs to. */ + def predict(point: Vector): Int = { + val (index, _) = predict(new VectorWithNorm(point)) + index } - /** - * Inserts sub nodes as its children - * - * @param children inserted sub nodes - */ - @Since("1.6.0") - def insert(children: Array[BisectingClusterNode]) { - this.children = this.children ++ children - children.foreach(child => child.parent = Some(this)) + /** Returns the full prediction path from root to leaf. */ + def predictPath(point: Vector): Array[ClusteringTreeNode] = { + predictPath(new VectorWithNorm(point)).toArray } - /** - * Converts the tree into Array class - * the sub nodes are recursively expanded - * - * @return an Array class which the cluster tree is expanded - */ - @Since("1.6.0") - def toArray: Array[BisectingClusterNode] = { - val array = this.children.size match { - case 0 => Array(this) - case _ => Array(this) ++ this.children.flatMap(child => child.toArray.toIterator) - } - array.sortWith { case (a, b) => - a.getDepth < b.getDepth && a.cost < b.cost && a.rows < b.rows + /** Returns the full prediction path from root to leaf. 
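+   * Note that when called on an internal node the returned path begins at the closest
+   * child, so the node itself is not included; a leaf returns just itself.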
*/ + private def predictPath(pointWithNorm: VectorWithNorm): List[ClusteringTreeNode] = { + if (isLeaf) { + this :: Nil + } else { + val selected = children.minBy { child => + KMeans.fastSquaredDistance(child.centerWithNorm, pointWithNorm) + } + selected :: selected.predictPath(pointWithNorm) } } /** - * Gets the depth of the cluster in the tree - * - * @return the depth from the root + * Computes the cost (squared distance to the predicted leaf cluster center) of the input point. */ - @Since("1.6.0") - def getDepth: Int = { - this.parent match { - case None => 0 - case _ => 1 + this.parent.get.getDepth - } + def computeCost(point: Vector): Double = { + val (_, cost) = predict(new VectorWithNorm(point)) + cost } /** - * Finds a leaf which is the closest under the node - * - * @param point target point + * Predicts the cluster index and the cost of the input point. */ - @Since("1.6.0") - def findClosestLeaf( - point: Vector, - metric: (BV[Double], BV[Double]) => Double - ): BisectingClusterNode = { - this.children.size match { - case 0 => this - case _ => { - val bv = point.toBreeze - val centers = this.children.map(_.center).map(_.toBreeze) - val closestIndex = BisectingKMeans.findClosestCenter(metric)(centers)(bv) - this.children(closestIndex).findClosestLeaf(point, metric) - } - } + private def predict(pointWithNorm: VectorWithNorm): (Int, Double) = { + predict(pointWithNorm, KMeans.fastSquaredDistance(centerWithNorm, pointWithNorm)) } /** - * Gets the leaves nodes in the cluster tree + * Predicts the cluster index and the cost of the input point. + * @param pointWithNorm input point + * @param cost the cost to the current center + * @return (predicted leaf cluster index, cost) */ - @Since("1.6.0") - def getLeavesNodes: Array[BisectingClusterNode] = { - this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) + private def predict(pointWithNorm: VectorWithNorm, cost: Double): (Int, Double) = { + if (isLeaf) { + (index, cost) + } else { + val (selectedChild, minCost) = children.map { child => + (child, KMeans.fastSquaredDistance(child.centerWithNorm, pointWithNorm)) + }.minBy(_._2) + selectedChild.predict(pointWithNorm, minCost) + } } - @Since("1.6.0") - def isLeaf: Boolean = this.children.isEmpty - - @Since("1.6.0") - def getParent: Option[BisectingClusterNode] = this.parent - - @Since("1.6.0") - def getChildren: Seq[BisectingClusterNode] = this.children - /** - * Gets the dendrogram height of the cluster at the cluster tree. - * A dendrogram height is different from a local height. - * A dendrogram height means a total height of a node in a tree. - * A local height means a maximum distance between a node and its children. - * - * @return the dendrogram height + * Returns all leaf nodes from this node. 
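+   * Leaves are returned in left-to-right order, which matches how leaf indices are
+   * assigned when the tree is built, so from the root the i-th returned leaf is the node
+   * whose index (and prediction label) is i.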
*/ - @Since("1.6.0") - def getHeight: Double = { - this.children.size match { - case 0 => 0.0 - case _ => this.localHeight + this.children.map(_.getHeight).max + def leafNodes: Array[ClusteringTreeNode] = { + if (isLeaf) { + Array(this) + } else { + children.flatMap(_.leafNodes) } } - - @Since("1.6.0") - def setLocalHeight(height: Double): Unit = this.localHeight = height -} - - -/** - * This class is used for maneging a cluster statistics - * - * @param rows the number of points - * @param mean the sum of points - * @param sumOfSquares the sum of squares of points - */ -private[clustering] case class BisectingClusterStat ( - rows: Long, - mean: BV[Double], - sumOfSquares: Double) extends Serializable { - require(sumOfSquares >= 0.0) - - def isDividable: Boolean = sumOfSquares > 0 && rows >= 2 } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 38f4695eb0d26..5015f1540d920 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -17,80 +17,79 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{Vector => BV, norm => breezeNorm} - import org.apache.spark.Logging -import org.apache.spark.annotation.Since +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD /** - * This class is used for the model of the bisecting kmeans + * Clustering model produced by [[BisectingKMeans]]. + * The prediction is done level-by-level from the root node to a leaf node, and at each node among + * its children the closest to the input point is selected. * - * @param node a cluster as a tree node + * @param root the root node of the clustering tree */ @Since("1.6.0") +@Experimental class BisectingKMeansModel @Since("1.6.0") ( - @Since("1.6.0") val node: BisectingClusterNode + @Since("1.6.0") val root: ClusteringTreeNode ) extends Serializable with Logging { + /** + * Leaf cluster centers. + */ @Since("1.6.0") - def getClusters: Array[BisectingClusterNode] = this.node.getLeavesNodes + def clusterCenters: Array[Vector] = root.leafNodes.map(_.center) - @Since("1.6.0") - def getCenters: Array[Vector] = this.getClusters.map(_.center) + /** + * Number of leaf clusters. + */ + lazy val k: Int = clusterCenters.length /** - * Predicts the closest cluster by one point + * Predicts the index of the cluster that the input point belongs to. */ @Since("1.6.0") - def predict(vector: Vector): Int = { - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val closestLeafNode = this.node.findClosestLeaf(vector, metric) - - val closestCenter = closestLeafNode.center - val centers = this.getCenters.map(_.toBreeze) - BisectingKMeans.findClosestCenter(metric)(centers)(closestCenter.toBreeze) + def predict(point: Vector): Int = { + root.predict(point) } /** - * Predicts the closest cluster by RDD of the points + * Predicts the indices of the clusters that the input points belong to. 
*/ @Since("1.6.0") - def predict(data: RDD[Vector]): RDD[Int] = { - val sc = data.sparkContext - data.map { p => predict(p) } + def predict(points: RDD[Vector]): RDD[Int] = { + points.map { p => root.predict(p) } } /** - * Predicts the closest cluster by RDD of the points for Java + * Java-friendly version of [[predict(RDD[Vector])*]] */ @Since("1.6.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] /** - * Computes Within Set Sum of Squared Error(WSSSE) + * Computes the squared distance between the input point and the cluster center it belongs to. + */ + @Since("1.6.0") + def computeCost(point: Vector): Double = { + root.computeCost(point) + } + + /** + * Computes the sum of squared distances between the input points and their corresponding cluster + * centers. */ @Since("1.6.0") def computeCost(data: RDD[Vector]): Double = { - val bvCenters = this.getCenters.map(_.toBreeze) - data.context.broadcast(bvCenters) - val distances = data.map {point => - val bvPoint = point.toBreeze - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val idx = BisectingKMeans.findClosestCenter(metric)(bvCenters)(bvPoint) - val closestCenter = bvCenters(idx) - val distance = metric(bvPoint, closestCenter) - distance - } - distances.sum() + data.map(root.computeCost).sum() } + /** + * Java-friendly version of [[computeCost(RDD[Vector])*]]. + */ @Since("1.6.0") def computeCost(data: JavaRDD[Vector]): Double = this.computeCost(data.rdd) - } - diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java index 926bd54e54424..a714620ff7e4b 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java @@ -18,13 +18,12 @@ package org.apache.spark.mllib.clustering; import java.io.Serializable; -import java.util.List; +import com.google.common.collect.Lists; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; -import static org.junit.Assert.assertEquals; -import com.google.common.collect.Lists; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -46,63 +45,29 @@ public void tearDown() { } @Test - public void runWithSmallData() { - List points = Lists.newArrayList( - Vectors.dense(1.0, 2.0, 6.0), - Vectors.dense(1.0, 3.0, 0.0), - Vectors.dense(1.0, 4.0, 6.0) - ); - - Vector expectedCenter = Vectors.dense(1.0, 3.0, 4.0); - - JavaRDD data = sc.parallelize(points, 2); - BisectingKMeans algo = new BisectingKMeans().setK(1); - BisectingKMeansModel model = algo.run(data.rdd()); - assertEquals(1, model.getCenters().length); - assertEquals(expectedCenter, model.getCenters()[0]); - } - - @Test - public void runWithDenseVectors() { - int numClusters = 5; - List points = Lists.newArrayList(); - for (int i = 0; i < 99; i++) { - Double elm = (double)(i % numClusters); - Vector point = Vectors.dense(elm, elm); - points.add(point); - } - JavaRDD data = sc.parallelize(points, 2); - BisectingKMeans algo = new BisectingKMeans().setK(numClusters); - BisectingKMeansModel model = algo.run(data.rdd()); - Vector[] centers = model.getCenters(); - assertEquals(numClusters, centers.length); - assertEquals(Vectors.dense(0.0, 0.0), centers[0]); - assertEquals(Vectors.dense(1.0, 1.0), centers[1]); - 
assertEquals(Vectors.dense(2.0, 2.0), centers[2]); - assertEquals(Vectors.dense(3.0, 3.0), centers[3]); - assertEquals(Vectors.dense(4.0, 4.0), centers[4]); - } + public void twoDimensionalData() { + JavaRDD points = sc.parallelize(Lists.newArrayList( + Vectors.dense(4, -1), + Vectors.dense(4, 1), + Vectors.sparse(2, new int[] {0}, new double[] {1.0}) + ), 2); - @Test - public void runWithSparseVectors() { - int numClusters = 5; - List points = Lists.newArrayList(); - for (int i = 0; i < 99; i++) { - int elm = i % numClusters; - int indexes[] = {elm}; - double values[] = {elm}; - Vector point = Vectors.sparse(numClusters, indexes, values); - points.add(point); + BisectingKMeans bkm = new BisectingKMeans() + .setK(4) + .setMaxIterations(2) + .setSeed(1L); + BisectingKMeansModel model = bkm.run(points); + Assert.assertEquals(3, model.k()); + Assert.assertArrayEquals(new double[] {3.0, 0.0}, model.root().center().toArray(), 1e-12); + for (ClusteringTreeNode child: model.root().children()) { + double[] center = child.center().toArray(); + if (center[0] > 2) { + Assert.assertEquals(2, child.size()); + Assert.assertArrayEquals(new double[] {4.0, 0.0}, center, 1e-12); + } else { + Assert.assertEquals(1, child.size()); + Assert.assertArrayEquals(new double[] {1.0, 0.0}, center, 1e-12); + } } - JavaRDD data = sc.parallelize(points, 2); - BisectingKMeans algo = new BisectingKMeans().setK(numClusters); - BisectingKMeansModel model = algo.run(data.rdd()); - Vector[] centers = model.getCenters(); - assertEquals(numClusters, centers.length); - assertEquals(points.get(0), centers[0]); - assertEquals(points.get(1), centers[1]); - assertEquals(points.get(2), centers[2]); - assertEquals(points.get(3), centers[3]); - assertEquals(points.get(4), centers[4]); } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala deleted file mode 100644 index ceac039efc8d0..0000000000000 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.mllib.clustering - -import org.scalatest.BeforeAndAfterEach - -import org.apache.spark.SparkFunSuite -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.util.MLlibTestSparkContext - -class BisectingKMeansModelSuite - extends SparkFunSuite with MLlibTestSparkContext with BeforeAndAfterEach { - - test("clustering dense vectors") { - val app = new BisectingKMeans().setK(5).setSeed(1) - - val localData = (1 to 100).toSeq.map { i => - val label = i % 5 - val vector = Vectors.dense(label, label, label) - (label, vector) - } - val data = sc.parallelize(localData.map(_._2)) - val model = app.run(data) - - val clusters = model.getClusters - assert(clusters.isInstanceOf[Array[BisectingClusterNode]]) - assert(clusters.length === 5) - - val centers = model.getCenters.sortBy(_.toArray.sum) - assert(centers.length === 5) - assert(centers(0) === Vectors.dense(0.0, 0.0, 0.0)) - assert(centers(1) === Vectors.dense(1.0, 1.0, 1.0)) - assert(centers(2) === Vectors.dense(2.0, 2.0, 2.0)) - assert(centers(3) === Vectors.dense(3.0, 3.0, 3.0)) - assert(centers(4) === Vectors.dense(4.0, 4.0, 4.0)) - - // predict with one vector - assert(model.predict(Vectors.dense(0.0, 0.0, 0.0)) === 0) - assert(model.predict(Vectors.dense(0.5, 0.5, 0.5)) === 0) - assert(model.predict(Vectors.dense(1.0, 1.0, 1.0)) === 1) - assert(model.predict(Vectors.dense(2.0, 2.0, 2.0)) === 2) - assert(model.predict(Vectors.dense(3.0, 3.0, 3.0)) === 3) - assert(model.predict(Vectors.dense(4.0, 4.0, 4.0)) === 4) - - // predict with a RDD - val predicted = model.predict(data).collect() - assert(predicted === localData.map(_._1)) - - // compute WSSSE - assert(model.computeCost(data) === 0.0) - } - - test("clustering sparse vectors") { - val app = new BisectingKMeans().setK(5).setSeed(1) - - val localData = (1 to 100).toSeq.map { i => - val label = i % 5 - val vector = Vectors.sparse(5, Seq((label, label.toDouble))) - (label, vector) - } - val data = sc.parallelize(localData.map(_._2)) - val model = app.run(data) - - val clusters = model.getClusters - assert(clusters.isInstanceOf[Array[BisectingClusterNode]]) - assert(clusters.length === 5) - - val centers = model.getCenters.sortBy(_.toArray.sum) - assert(centers.length === 5) - assert(centers(0) === Vectors.sparse(5, Array(), Array())) - assert(centers(1) === Vectors.sparse(5, Array(1), Array(1.0))) - assert(centers(2) === Vectors.sparse(5, Array(2), Array(2.0))) - assert(centers(3) === Vectors.sparse(5, Array(3), Array(3.0))) - assert(centers(4) === Vectors.sparse(5, Array(4), Array(4.0))) - - // predict with one vector - assert(model.predict(Vectors.sparse(5, Array(0), Array(0.0))) === 0) - assert(model.predict(Vectors.sparse(5, Array(1), Array(1.0))) === 1) - assert(model.predict(Vectors.sparse(5, Array(2), Array(2.0))) === 2) - assert(model.predict(Vectors.sparse(5, Array(3), Array(3.0))) === 3) - assert(model.predict(Vectors.sparse(5, Array(4), Array(4.0))) === 4) - - // predict with a RDD - val predicted = model.predict(data).collect() - assert(predicted === localData.map(_._1)) - - // compute WSSSE - assert(model.computeCost(data) === 0.0) - } - - test("clustering should be done correctly") { - for (k <- Array(9, 19)) { - val app = new BisectingKMeans().setK(k).setSeed(1) - val localData = (1 to 19).toSeq.map { i => - val label = i % k - val sparseVector = Vectors.sparse(k, Seq((label, label.toDouble))) - val denseVector = Vectors.fromBreeze(sparseVector.toBreeze.toDenseVector) - (label, denseVector, sparseVector) - } - - // dense 
version - val denseData = sc.parallelize(localData.map(_._2), 2) - val denseModel = app.run(denseData) - assert(denseModel.getCenters.length === k) - assert(denseModel.getClusters.forall(_.cost == 0.0)) - - // sparse version - val sparseData = sc.parallelize(localData.map(_._3), 2) - val sparseModel = app.run(sparseData) - assert(sparseModel.getCenters.length === k) - assert(sparseModel.getClusters.forall(_.cost == 0.0)) - } - } -} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 74e12d00c2022..41b9d5c0d93bb 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -17,184 +17,166 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{Vector => BV, norm => breezeNorm} - import org.apache.spark.SparkFunSuite -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ - class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { - test("run") { - val k = 123 - val algo = new BisectingKMeans().setK(k).setSeed(1) - val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val data = sc.parallelize(localSeed, 2) - val model = algo.run(data) - assert(model.getClusters.length == 123) - assert(model.node.getHeight ~== 702.8641 absTol 10E-4) - - // check the relations between a parent cluster and its children - assert(model.node.getParent === None) - assert(model.node.getChildren.head.getParent.get === model.node) - assert(model.node.getChildren.apply(1).getParent.get === model.node) - assert(model.getClusters.forall(_.getParent.isDefined)) - - val predicted = model.predict(data) - assert(predicted.distinct.count() === k) + test("default values") { + val bkm0 = new BisectingKMeans() + assert(bkm0.getK === 4) + assert(bkm0.getMaxIterations === 20) + assert(bkm0.getMinDivisibleClusterSize === 1.0) + val bkm1 = new BisectingKMeans() + assert(bkm0.getSeed === bkm1.getSeed, "The default seed should be constant.") } - test("run with too many cluster size than the records") { - val algo = new BisectingKMeans().setK(123).setSeed(1) - val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val data = sc.parallelize(localSeed) - val model = algo.run(data) - assert(model.getClusters.length == 100) - assert(model.node.getHeight ~== 72.12489 absTol 10E-4) + test("setter/getter") { + val bkm = new BisectingKMeans() + + val k = 10 + assert(bkm.getK !== k) + assert(bkm.setK(k).getK === k) + val maxIter = 100 + assert(bkm.getMaxIterations !== maxIter) + assert(bkm.setMaxIterations(maxIter).getMaxIterations === maxIter) + val minSize = 2.0 + assert(bkm.getMinDivisibleClusterSize !== minSize) + assert(bkm.setMinDivisibleClusterSize(minSize).getMinDivisibleClusterSize === minSize) + val seed = 10L + assert(bkm.getSeed !== seed) + assert(bkm.setSeed(seed).getSeed === seed) + + intercept[IllegalArgumentException] { + bkm.setK(0) + } + intercept[IllegalArgumentException] { + bkm.setMaxIterations(0) + } + intercept[IllegalArgumentException] { + bkm.setMinDivisibleClusterSize(0.0) + } } - test("setNumClusters") { - val algo = new BisectingKMeans() - assert(algo.getK == 2) - algo.setK(1000) - assert(algo.getK == 
1000) + test("1D data") { + val points = Vectors.sparse(1, Array.empty, Array.empty) +: + (1 until 8).map(i => Vectors.dense(i)) + val data = sc.parallelize(points, 2) + val bkm = new BisectingKMeans() + .setK(4) + .setMaxIterations(1) + .setSeed(1L) + // The clusters should be + // (0, 1, 2, 3, 4, 5, 6, 7) + // - (0, 1, 2, 3) + // - (0, 1) + // - (2, 3) + // - (4, 5, 6, 7) + // - (4, 5) + // - (6, 7) + val model = bkm.run(data) + assert(model.k === 4) + // The total cost should be 8 * 0.5 * 0.5 = 2.0. + assert(model.computeCost(data) ~== 2.0 relTol 1e-12) + val predictions = data.map(v => (v(0), model.predict(v))).collectAsMap() + Range(0, 8, 2).foreach { i => + assert(predictions(i) === predictions(i + 1), + s"$i and ${i + 1} should belong to the same cluster.") + } + val root = model.root + assert(root.center(0) ~== 3.5 relTol 1e-12) + assert(root.height ~== 2.0 relTol 1e-12) + assert(root.children.length === 2) + assert(root.children(0).height ~== 1.0 relTol 1e-12) + assert(root.children(1).height ~== 1.0 relTol 1e-12) } - test("setSubIterations") { - val algo = new BisectingKMeans() - assert(algo.getMaxIterations == 20) - algo.setMaxIterations(15) - assert(algo.getMaxIterations == 15) + test("points are the same") { + val data = sc.parallelize(Seq.fill(8)(Vectors.dense(1.0, 1.0)), 2) + val bkm = new BisectingKMeans() + .setK(2) + .setMaxIterations(1) + .setSeed(1L) + val model = bkm.run(data) + assert(model.k === 1) } - test("setSeed") { - val algo = new BisectingKMeans() - assert(algo.getSeed == 1) - algo.setSeed(987) - assert(algo.getSeed == 987) + test("more desired clusters than points") { + val data = sc.parallelize(Seq.tabulate(4)(i => Vectors.dense(i)), 2) + val bkm = new BisectingKMeans() + .setK(8) + .setMaxIterations(2) + .setSeed(1L) + val model = bkm.run(data) + assert(model.k === 4) } - test("summarize center stats") { - val algo = new BisectingKMeans - val local = Seq( - (4L, Vectors.dense(1.5, 1.5).toBreeze), - (4L, Vectors.dense(2.5, 2.5).toBreeze), - (5L, Vectors.dense(11.5, 11.5).toBreeze), - (5L, Vectors.dense(12.5, 12.5).toBreeze), - (6L, Vectors.dense(21.5, 21.5).toBreeze), - (6L, Vectors.dense(22.5, 22.5).toBreeze), - (7L, Vectors.dense(31.5, 31.5).toBreeze), - (7L, Vectors.dense(32.5, 32.5).toBreeze) - ) - val data = sc.parallelize(local) - - val clusterStats = BisectingKMeans.summarizeClusters(data) - assert(clusterStats.size === 4) - assert(clusterStats(4).mean === Vectors.dense(2.0, 2.0).toBreeze) - assert(clusterStats(4).sumOfSquares ~== 1.0 absTol 10e-4) - assert(clusterStats(4).rows === 2) - assert(clusterStats(5).mean === Vectors.dense(12.0, 12.0).toBreeze) - assert(clusterStats(5).sumOfSquares ~== 1.0 absTol 10e-4) - assert(clusterStats(5).rows === 2) - assert(clusterStats(6).mean === Vectors.dense(22.0, 22.0).toBreeze) - assert(clusterStats(6).sumOfSquares ~== 1.0 absTol 10e-4) - assert(clusterStats(6).rows === 2) - assert(clusterStats(7).mean === Vectors.dense(32.0, 32.0).toBreeze) - assert(clusterStats(7).sumOfSquares ~== 1.0 absTol 10e-4) - assert(clusterStats(7).rows === 2) + test("min divisible cluster") { + val data = sc.parallelize( + Seq.tabulate(16)(i => Vectors.dense(i)) ++ Seq.tabulate(4)(i => Vectors.dense(-100.0 - i)), + 2) + val bkm = new BisectingKMeans() + .setK(4) + .setMinDivisibleClusterSize(10) + .setMaxIterations(1) + .setSeed(1L) + val model = bkm.run(data) + assert(model.k === 3) + assert(model.predict(Vectors.dense(-100)) === model.predict(Vectors.dense(-97))) + assert(model.predict(Vectors.dense(7)) !== 
model.predict(Vectors.dense(8))) + + bkm.setMinDivisibleClusterSize(0.5) + val sameModel = bkm.run(data) + assert(sameModel.k === 3) } - test("initialize centers at next step") { - val local = Seq( - (2L, BV[Double](0.9, 0.9)), (2L, BV[Double](1.1, 1.1)), - (3L, BV[Double](1.9, 1.9)), (2L, BV[Double](2.1, 2.1)) - ) - val data = sc.parallelize(local) - val stats = Map[Long, BisectingClusterStat]( - 2L -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, 0.0), - 3L -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, 0.0) - ) - val initNextCenters = BisectingKMeans.initNextCenters(stats, 1) - assert(initNextCenters.size === 4) - assert(initNextCenters.keySet === Set(4, 5, 6, 7)) + test("larger clusters get selected first") { + val data = sc.parallelize( + Seq.tabulate(16)(i => Vectors.dense(i)) ++ Seq.tabulate(4)(i => Vectors.dense(-100.0 - i)), + 2) + val bkm = new BisectingKMeans() + .setK(3) + .setMaxIterations(1) + .setSeed(1L) + val model = bkm.run(data) + assert(model.k === 3) + assert(model.predict(Vectors.dense(-100)) === model.predict(Vectors.dense(-97))) + assert(model.predict(Vectors.dense(7)) !== model.predict(Vectors.dense(8))) } - test("should assign each data to new clusters") { - val seed = Seq( - (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), - (2L, Vectors.dense(2.0, 2.0)), (2L, Vectors.dense(3.0, 3.0)), - (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 5.0)), - (3L, Vectors.dense(6.0, 6.0)), (3L, Vectors.dense(7.0, 7.0)), - (3L, Vectors.dense(8.0, 8.0)), (3L, Vectors.dense(9.0, 9.0)), - (3L, Vectors.dense(10.0, 10.0)), (3L, Vectors.dense(11.0, 11.0)) - ).map { case (idx, vector) => (idx, vector.toBreeze) } - val variance = breezeNorm(Vectors.dense(1.0, 1.0).toBreeze, 2.0) - val newClusterStats = Map( - 4L -> new BisectingClusterStat(3L, BV[Double](1.0, 1.0) :* 3.0, variance), - 5L -> new BisectingClusterStat(3L, BV[Double](4.0, 4.0) :* 3.0, variance), - 6L -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, variance), - 7L -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, variance) - ) - val data = sc.parallelize(seed, 1) - val leafClusterStats = BisectingKMeans.summarizeClusters(data) - val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) - val result = BisectingKMeans.divideClusters(data, dividableLeafClusters, 20, 1).collect() - - val expected = Seq( - (4, Vectors.dense(0.0, 0.0)), (4, Vectors.dense(1.0, 1.0)), (4, Vectors.dense(2.0, 2.0)), - (5, Vectors.dense(3.0, 3.0)), (5, Vectors.dense(4.0, 4.0)), (5, Vectors.dense(5.0, 5.0)), - (6, Vectors.dense(6.0, 6.0)), (6, Vectors.dense(7.0, 7.0)), (6, Vectors.dense(8.0, 8.0)), - (7, Vectors.dense(9.0, 9.0)), (7, Vectors.dense(10.0, 10.0)), (7, Vectors.dense(11.0, 11.0)) - ).map { case (idx, vector) => (idx, vector.toBreeze) } - assert(result === expected) - } - - test("findClosestCenter") { - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val centers = Seq( - Vectors.sparse(5, Array(0, 1, 2), Array(0.0, 1.0, 2.0)).toBreeze, - Vectors.sparse(5, Array(1, 2, 3), Array(1.0, 2.0, 3.0)).toBreeze, - Vectors.sparse(5, Array(2, 3, 4), Array(2.0, 3.0, 4.0)).toBreeze - ) - - for (i <- 0 to (centers.size - 1)) { - val point = centers(i) - val closestIndex = BisectingKMeans.findClosestCenter(metric)(centers)(point) - assert(closestIndex === i) + test("2D data") { + val points = Seq( + (11, 10), (9, 10), (10, 9), (10, 11), + (11, -10), (9, -10), (10, -9), (10, -11), + (0, 1), (0, -1) + ).map { case (x, y) => + if (x == 0) { + 
Vectors.sparse(2, Array(1), Array(y)) + } else { + Vectors.dense(x, y) + } } - } - - test("should be equal to math.pow") { - (1 to 1000).foreach { k => - // the minimum number of nodes of a binary tree by given parameter - val multiplier = math.ceil(math.log(k) / math.log(2.0)) + 1 - val expected = math.pow(2, multiplier).toInt - val result = BisectingKMeans.getMinimumNumNodesInTree(k) - assert(result === expected) + val data = sc.parallelize(points, 2) + val bkm = new BisectingKMeans() + .setK(3) + .setMaxIterations(4) + .setSeed(1L) + val model = bkm.run(data) + assert(model.k === 3) + assert(model.root.center ~== Vectors.dense(8, 0) relTol 1e-12) + model.root.leafNodes.foreach { node => + if (node.center(0) < 5) { + assert(node.size === 2) + assert(node.center ~== Vectors.dense(0, 0) relTol 1e-12) + } else if (node.center(1) > 0) { + assert(node.size === 4) + assert(node.center ~== Vectors.dense(10, 10) relTol 1e-12) + } else { + assert(node.size === 4) + assert(node.center ~== Vectors.dense(10, -10) relTol 1e-12) + } } } - - test("should divide clusters correctly") { - val local = Seq( - (2L, BV[Double](0.9, 0.9)), (2L, BV[Double](1.1, 1.1)), - (2L, BV[Double](9.9, 9.9)), (2L, BV[Double](10.1, 10.1)), - (3L, BV[Double](99.9, 99.9)), (3L, BV[Double](100.1, 100.1)), - (3L, BV[Double](109.9, 109.9)), (3L, BV[Double](110.1, 110.1)) - ) - val data = sc.parallelize(local, 1) - val stats = BisectingKMeans.summarizeClusters(data) - val dividedData = BisectingKMeans.divideClusters(data, stats, 20, 1).collect() - - assert(dividedData(0) == (4L, BV[Double](0.9, 0.9))) - assert(dividedData(1) == (4L, BV[Double](1.1, 1.1))) - assert(dividedData(2) == (5L, BV[Double](9.9, 9.9))) - assert(dividedData(3) == (5L, BV[Double](10.1, 10.1))) - assert(dividedData(4) == (6L, BV[Double](99.9, 99.9))) - assert(dividedData(5) == (6L, BV[Double](100.1, 100.1))) - assert(dividedData(6) == (7L, BV[Double](109.9, 109.9))) - assert(dividedData(7) == (7L, BV[Double](110.1, 110.1))) - } - } From 29ccdf9eaa987530435782d2051acbeda3d3ac36 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 9 Nov 2015 11:37:19 -0800 Subject: [PATCH 76/76] Remove a magic number 63 for level limitation --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 9a7916f75dcc7..29a7aa0bb63f2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -165,7 +165,7 @@ class BisectingKMeans private ( val random = new Random(seed) var numLeafClustersNeeded = k - 1 var level = 1 - while (activeClusters.nonEmpty && numLeafClustersNeeded > 0 && level < 63) { + while (activeClusters.nonEmpty && numLeafClustersNeeded > 0 && level < LEVEL_LIMIT) { // Divisible clusters are sufficiently large and have non-trivial cost. var divisibleClusters = activeClusters.filter { case (_, summary) => (summary.size >= minSize) && (summary.cost > MLUtils.EPSILON * summary.size) @@ -226,6 +226,8 @@ private object BisectingKMeans extends Serializable { private val MAX_DIVISIBLE_CLUSTER_INDEX: Long = Long.MaxValue / 2 + private val LEVEL_LIMIT = math.log10(Long.MaxValue) / math.log10(2) + /** Returns the left child index of the given node index. 
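+   * A node at level L has an index below 2^L, so capping the bisecting loop at
+   * LEVEL_LIMIT = log10(Long.MaxValue) / log10(2) = log2(Long.MaxValue), roughly 63,
+   * keeps 2 * index and 2 * index + 1 representable as a Long; MAX_DIVISIBLE_CLUSTER_INDEX
+   * (Long.MaxValue / 2) enforces the same bound for each individual call.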
*/ private def leftChildIndex(index: Long): Long = { require(index <= MAX_DIVISIBLE_CLUSTER_INDEX, s"Child index out of bound: 2 * $index.")