From 295bdde38d930b9fa32076a23e751ec128104648 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 30 Mar 2015 20:29:12 +0900 Subject: [PATCH 01/76] [SPARK-6517][mllib] Implement the Algorithm of Hierarchical Clustering Thank you for your great cooperation, RJ Nowling(@rnowling), Jeremy Freeman(@freeman-lab), Xiangrui Meng(@mengxr) and Sean Owen(@srowen). --- .../mllib/api/python/PythonMLLibAPI.scala | 23 + .../clustering/HierarchicalClustering.scala | 609 ++++++++++++++++++ .../HierarchicalClusteringModel.scala | 97 +++ .../HierarchicalClusteringModelSuite.scala | 118 ++++ .../HierarchicalClusteringSuite.scala | 188 ++++++ 5 files changed, 1035 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 21e55938fa7aa..49a3420c26945 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -401,6 +401,29 @@ private[python] class PythonMLLibAPI extends Serializable { } } + /** + * Java stub for Python mllib HierarchicalClustering.run() + */ + def trainHierarchicalClusteringModel( + data: JavaRDD[Vector], + k: Int, + maxIterations: Int, + maxRetries: Int, + seed: java.lang.Long): HierarchicalClusteringModel = { + val algo = new HierarchicalClustering() + .setNumClusters(k) + .setMaxIterations(maxIterations) + .setMaxRetries(maxRetries) + + if (seed != null) algo.setSeed(seed) + + try { + algo.run(data) + } finally { + data.rdd.unpersist(blocking = false) + } + } + /** * Java stub for Python mllib GaussianMixtureModel.predictSoft() */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala new file mode 100644 index 0000000000000..dd6cf6d0f8b94 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -0,0 +1,609 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.clustering + +import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, norm => breezeNorm} +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.rdd.RDD +import org.apache.spark.util.random.XORShiftRandom +import org.apache.spark.{Logging, SparkException} + +import scala.collection.{Map, mutable} + + +/** + * Top-level methods for calling the hierarchical clustering algorithm + */ +object HierarchicalClustering extends Logging { + + private[clustering] val ROOT_INDEX_KEY: Long = 1 + + /** + * Trains a hierarchical clustering model with the given data + * + * @param data trained data + * @param numClusters the maximum number of clusters you want + * @return a hierarchical clustering model + */ + def train(data: RDD[Vector], numClusters: Int): HierarchicalClusteringModel = { + val algo = new HierarchicalClustering().setNumClusters(numClusters) + algo.run(data) + } + + /** + * Trains a hierarchical clustering model with the given data + * + * @param data training data + * @param numClusters the maximum number of clusters you want + * @param maxIterations the number of maximal iterations + * @param maxRetries the number of maximum retries when the clustering can't be succeeded + * @param seed the randomseed to generate the initial vectors for each bisecting + * @return a hierarchical clustering model + */ + def train(data: RDD[Vector], + numClusters: Int, + maxIterations: Int, + maxRetries: Int, + seed: Int): HierarchicalClusteringModel = { + + val algo = new HierarchicalClustering().setNumClusters(numClusters) + .setMaxIterations(maxIterations) + .setMaxRetries(maxRetries) + .setSeed(seed) + algo.run(data) + } + + /** + * Finds the closes cluster's center + * + * @param metric a distance metric + * @param centers centers of the clusters + * @param point a target point + * @return an index of the array of clusters + */ + private[mllib] + def findClosestCenter(metric: Function2[BV[Double], BV[Double], Double]) + (centers: Seq[BV[Double]])(point: BV[Double]): Int = { + val (closestCenter, closestIndex) = + centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1) + closestIndex + } +} + +/** + * This is a divisive hierarchical clustering algorithm based on bi-sect k-means algorithm. + * + * The main idea of this algorithm is based on "A comparison of document clustering techniques", + * M. Steinbach, G. Karypis and V. Kumar. Workshop on Text Mining, KDD, 2000. 
+ * http://cs.fit.edu/~pkc/classes/ml-internet/papers/steinbach00tr.pdf + * + * @param numClusters tne number of clusters you want + * @param clusterMap the pairs of cluster and its index as Map + * @param maxIterations the number of maximal iterations + * @param maxRetries the number of maximum retries + * @param seed a random seed + */ +class HierarchicalClustering( + private var numClusters: Int, + private var clusterMap: Map[Long, ClusterTree], + private var maxIterations: Int, + private var maxRetries: Int, + private var seed: Long) extends Logging { + + /** + * Constructs with the default configuration + */ + def this() = this(20, mutable.ListMap.empty[Long, ClusterTree], 20, 10, 1) + + /** + * Sets the number of clusters you want + */ + def setNumClusters(numClusters: Int): this.type = { + this.numClusters = numClusters + this + } + + /** + * Sets the number of maximal iterations in each clustering step + */ + def setMaxIterations(maxIterations: Int): this.type = { + this.maxIterations = maxIterations + this + } + + def getSubIterations(): Int = this.maxIterations + + /** + * Sets the number of maximum retries of each clustering step + */ + def setMaxRetries(maxRetries: Int): this.type = { + this.maxRetries = maxRetries + this + } + + def getMaxRetries(): Int = this.maxRetries + + /** + * Sets the random seed + */ + def setSeed(seed: Long): this.type = { + this.seed = seed + this + } + + def getSeed(): Long = this.seed + + /** + * Runs the hierarchical clustering algorithm + * @param input RDD of vectors + * @return model for the hierarchical clustering + */ + def run(input: RDD[Vector]): HierarchicalClusteringModel = { + val sc = input.sparkContext + log.info(s"${sc.appName} starts a hierarchical clustering algorithm") + + var data = initData(input).cache() + val startTime = System.currentTimeMillis() + + // `clusters` is described as binary tree structure + // `clusters(1)` means the root of a binary tree + var clusters = summarizeAsClusters(data) + var leafClusters = clusters + var step = 1 + var numDividedClusters = 0 + var noMoreDividable = false + var rddArray = Array.empty[RDD[(Long, BV[Double])]] + // the number of maximum nodes of a binary tree by given parameter + val multiplier = math.ceil(math.log10(this.numClusters) / math.log10(2.0)) + 1 + val maxAllNodesInTree = math.pow(2, multiplier).toInt + + while (clusters.size < maxAllNodesInTree && noMoreDividable == false) { + log.info(s"${sc.appName} starts step ${step}") + + // enough to be clustered if the number of divided clusters is equal to 0 + val divided = getDividedClusters(data, leafClusters) + if (divided.size == 0) { + noMoreDividable = true + } + else { + // update each index + val newData = updateClusterIndex(data, divided).cache() + rddArray = rddArray ++ Array(data) + data = newData + + // keep recent 2 cached RDDs in order to run more quickly + if (rddArray.size > 1) { + val head = rddArray.head + head.unpersist() + rddArray = rddArray.filterNot(_.hashCode() == head.hashCode()) + } + + // merge the divided clusters with the map as the cluster tree + clusters = clusters ++ divided + numDividedClusters = data.map(_._1).distinct().count().toInt + leafClusters = divided + step += 1 + + log.info(s"${sc.appName} adding ${divided.size} new clusters at step:${step}") + } + } + // unpersist kept RDDs + rddArray.foreach(_.unpersist()) + + // build a cluster tree by Map class which is expressed + log.info(s"Building the cluster tree is started in ${sc.appName}") + val root = buildTree(clusters, 
HierarchicalClustering.ROOT_INDEX_KEY, this.numClusters) + if (root == None) { + new SparkException("Failed to build a cluster tree from a Map type of clusters") + } + + // set the elapsed time for training + val finishTime = (System.currentTimeMillis() - startTime) / 1000.0 + log.info(s"Elapsed Time for Hierarchical Clustering Training: ${finishTime} [sec]") + + // make a hierarchical clustering model + val model = new HierarchicalClusteringModel(root.get) + val leavesNodes = model.getClusters() + if (leavesNodes.size < this.numClusters) { + log.warn(s"# clusters is less than you have expected: ${leavesNodes.size} / ${numClusters}. ") + } + model + } + + /** + * Assigns the initial cluster index id to all data + */ + private[clustering] + def initData(data: RDD[Vector]): RDD[(Long, BV[Double])] = { + data.map { v: Vector => (HierarchicalClustering.ROOT_INDEX_KEY, v.toBreeze)}.cache + } + + /** + * Summarizes data by each cluster as ClusterTree2 classes + */ + private[clustering] + def summarizeAsClusters(data: RDD[(Long, BV[Double])]): Map[Long, ClusterTree] = { + // summarize input data + val stats = summarize(data) + + // convert statistics to ClusterTree class + stats.map { case (i, (sum, n, sumOfSquares)) => + val center = Vectors.fromBreeze(sum :/ n) + val variances = n match { + case n if n > 1 => Vectors.fromBreeze(sumOfSquares.:*(n) - (sum :* sum) :/ (n * (n - 1.0))) + case _ => Vectors.zeros(sum.size) + } + (i, new ClusterTree(center, n.toLong, variances)) + }.toMap + } + + /** + * Summarizes data by each cluster as Map + */ + private[clustering] + def summarize(data: RDD[(Long, BV[Double])]): Map[Long, (BV[Double], Double, BV[Double])] = { + data.mapPartitions { iter => + // calculate the accumulation of the all point in a partition and count the rows + val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] + iter.foreach { case (idx: Long, point: BV[Double]) => + // get a map value or else get a sparse vector + val (sumBV, n, sumOfSquares) = map.get(idx) + .getOrElse(BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + map(idx) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) + } + map.toIterator + }.reduceByKey { case ((sum1, n1, sumOfSquares1), (sum2, n2, sumOfSquares2)) => + // sum the accumulation and the count in the all partition + (sum1 + sum2, n1 + n2, sumOfSquares1 + sumOfSquares2) + }.collect().toMap + } + + /** + * Gets the initial centers for bi-sect k-means + */ + private[clustering] + def initChildrenCenter(clusters: Map[Long, BV[Double]]): Map[Long, BV[Double]] = { + val rand = new XORShiftRandom() + rand.setSeed(this.seed) + + clusters.flatMap { case (idx, center) => + val childrenIndexes = Array(2 * idx, 2 * idx + 1) + val relativeErrorCoefficient = 0.001 + Array( + (2 * idx, center.map(elm => elm - (elm * relativeErrorCoefficient * rand.nextDouble()))), + (2 * idx + 1, center.map(elm => elm + (elm * relativeErrorCoefficient * rand.nextDouble()))) + ) + }.toMap + } + + /** + * Gets the new divided centers + */ + private[clustering] + def getDividedClusters(data: RDD[(Long, BV[Double])], + dividedClusters: Map[Long, ClusterTree]): Map[Long, ClusterTree] = { + val sc = data.sparkContext + val appName = sc.appName + + // get keys of dividable clusters + val dividableKeys = dividedClusters.filter { case (idx, cluster) => + cluster.variances.toArray.sum > 0.0 && cluster.records >= 2 + }.keySet + if (dividableKeys.size == 0) { + log.info(s"There is no dividable clusters in ${appName}.") + return Map.empty[Long, ClusterTree] + } 
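For illustration, the index bookkeeping used above can be summarized in a small standalone sketch. It is not part of the patch: the object name BisectIndexSketch and the helpers childIndexes, parentIndex and seedChildCenters are hypothetical, and a fixed eps stands in for the eps * random perturbation that initChildrenCenter actually applies. The sketch shows the implicit binary tree over cluster indices (root index 1, children 2 * i and 2 * i + 1) and how two child centers are seeded by nudging the parent center in opposite directions.

object BisectIndexSketch {
  // children of cluster i in the implicit binary tree rooted at index 1
  def childIndexes(i: Long): Array[Long] = Array(2 * i, 2 * i + 1)

  // parent of cluster i (integer division), as used when retrying failed splits
  def parentIndex(i: Long): Long = i / 2

  // seed two child centers by nudging the parent center in opposite directions
  // (simplified: a fixed eps instead of eps * random as in initChildrenCenter)
  def seedChildCenters(center: Array[Double], eps: Double): (Array[Double], Array[Double]) =
    (center.map(x => x - x * eps), center.map(x => x + x * eps))

  def main(args: Array[String]): Unit = {
    println(childIndexes(1).mkString(", "))   // 2, 3
    println(parentIndex(7))                   // 3
    val (left, right) = seedChildCenters(Array(1.0, 2.0), 0.001)
    println(left.mkString(", "))              // approximately 0.999, 1.998
    println(right.mkString(", "))             // approximately 1.001, 2.002
  }
}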
+
+    // divide input data
+    var dividableData = data.filter { case (idx, point) => dividableKeys.contains(idx)}
+    var dividableClusters = dividedClusters.filter { case (k, v) => dividableKeys.contains(k)}
+    val idealIndexes = dividableKeys.flatMap(idx => Array(2 * idx, 2 * idx + 1).toIterator)
+    var stats = divide(data, dividableClusters)
+
+    // if there is clusters which is failed to be divided,
+    // retry to divide only failed clusters again and again
+    var tryTimes = 1
+    while (stats.size < dividableKeys.size * 2 && tryTimes <= this.maxRetries) {
+      // get the indexes of clusters which is failed to be divided
+      val failedIndexes = idealIndexes.filterNot(stats.keySet.contains).map(idx => (idx / 2).toLong)
+      val failedCenters = dividedClusters.filter { case (idx, clstr) => failedIndexes.contains(idx)}
+      log.info(s"# failed clusters is ${failedCenters.size} of ${dividableKeys.size}" +
+        s"at ${tryTimes} times in ${appName}")
+
+      // divide the failed clusters again
+      sc.broadcast(failedIndexes)
+      dividableData = data.filter { case (idx, point) => failedIndexes.contains(idx)}
+      val missingStats = divide(dividableData, failedCenters)
+      stats = stats ++ missingStats
+      tryTimes += 1
+    }
+
+    // make children clusters
+    stats.filter { case (i, (sum, n, sumOfSquares)) => n > 0}
+      .map { case (i, (sum, n, sumOfSquares)) =>
+        val center = Vectors.fromBreeze(sum :/ n)
+        val variances = n match {
+          case 1 => Vectors.sparse(sum.size, Array(), Array())
+          case _ => Vectors.fromBreeze(sumOfSquares.:*(n) - (sum :* sum) :/ (n * (n - 1.0)))
+        }
+        val child = new ClusterTree(center, n.toLong, variances)
+        (i, child)
+      }.toMap
+  }
+
+  /**
+   * Builds a cluster tree from a Map of clusters
+   *
+   * @param treeMap divided clusters as a Map class
+   * @param rootIndex index you want to start
+   * @param numClusters the number of clusters you want
+   * @return
+   */
+  private[clustering]
+  def buildTree(treeMap: Map[Long, ClusterTree],
+    rootIndex: Long,
+    numClusters: Int): Option[ClusterTree] = {
+
+    // if there is no index in the Map
+    if (!treeMap.contains(rootIndex)) return None
+
+    // build a cluster tree if the queue is empty or until the number of leaves clusters is enough
+    var numLeavesClusters = 1
+    val root = treeMap(rootIndex)
+    var leavesQueue = Map(rootIndex -> root)
+    while (leavesQueue.size > 0 && numLeavesClusters < numClusters) {
+      // pick up the cluster whose variance is the maximum in the queue
+      val mostScattered = leavesQueue.maxBy(_._2.variancesNorm)
+      val mostScatteredKey = mostScattered._1
+      val mostScatteredCluster = mostScattered._2
+
+      // relate the most scattered cluster to its children clusters
+      val childrenIndexes = Array(2 * mostScatteredKey, 2 * mostScatteredKey + 1)
+      if (childrenIndexes.forall(i => treeMap.contains(i))) {
+        // insert children to the most scattered cluster
+        val children = childrenIndexes.map(i => treeMap(i))
+        mostScatteredCluster.insert(children)
+
+        // calculate the local dendrogram height
+        // TODO Supports distance metrics other Euclidean distance metric
+        val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0)
+        val localHeight = children
+          .map(child => metric(child.center.toBreeze, mostScatteredCluster.center.toBreeze)).max
+        mostScatteredCluster.setLocalHeight(localHeight)
+
+        // update the queue
+        leavesQueue = leavesQueue ++ childrenIndexes.map(i => (i -> treeMap(i))).toMap
+        numLeavesClusters += 1
+      }
+
+      // remove the cluster which is involved to the cluster tree
+      leavesQueue = leavesQueue.filterNot(_ == mostScattered)
+
+
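The loop above always expands the leaf whose variance norm is largest, attaches that leaf's two children, and stops once the requested number of leaves exists. A standalone sketch of that greedy selection order, using a simplified stand-in for ClusterTree (BuildOrderSketch and Leaf are hypothetical names, not from the patch):

object BuildOrderSketch {
  final case class Leaf(index: Long, variancesNorm: Double)

  def main(args: Array[String]): Unit = {
    // candidate children produced by earlier bisections, keyed by tree index
    val candidates = Map(
      2L -> Leaf(2L, 4.0), 3L -> Leaf(3L, 1.0),
      4L -> Leaf(4L, 0.5), 5L -> Leaf(5L, 0.2))
    var leaves = Map(1L -> Leaf(1L, 5.0))

    // expand twice: first the root (norm 5.0), then leaf 2 (norm 4.0)
    for (_ <- 1 to 2) {
      val (key, _) = leaves.maxBy(_._2.variancesNorm)
      val children = Array(2 * key, 2 * key + 1).flatMap(i => candidates.get(i).map(i -> _))
      leaves = (leaves - key) ++ children
    }
    println(leaves.keys.toSeq.sorted.mkString(", "))  // 3, 4, 5
  }
}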
log.info(s"Total Leaves Clusters: ${numLeavesClusters} / ${numClusters}. " + + s"Cluster ${childrenIndexes.mkString(",")} are merged.") + } + Some(root) + } + + /** + * Divides the input data + * + * @param data the pairs of cluster index and point which you want to divide + * @param clusters the clusters you want to divide AS a Map class + * @return divided clusters as Map + */ + private[clustering] + def divide(data: RDD[(Long, BV[Double])], + clusters: Map[Long, ClusterTree]): Map[Long, (BV[Double], Double, BV[Double])] = { + + val sc = data.sparkContext + val centers = clusters.map { case (idx, cluster) => (idx, cluster.center.toBreeze)} + var newCenters = initChildrenCenter(centers) + if (newCenters.size == 0) { + return Map.empty[Long, (BV[Double], Double, BV[Double])] + } + sc.broadcast(newCenters) + + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + sc.broadcast(metric) + + val vectorSize = newCenters(newCenters.keySet.min).size + var stats = newCenters.keys.map { idx => + (idx, (BSV.zeros[Double](vectorSize).toVector, 0.0, BSV.zeros[Double](vectorSize).toVector)) + }.toMap + + var subIter = 0 + var diffVariances = Double.MaxValue + var oldVariances = Double.MaxValue + var variances = Double.MaxValue + while (subIter < this.maxIterations && diffVariances > 10E-4) { + // calculate summary of each cluster + val eachStats = data.mapPartitions { iter => + val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] + iter.foreach { case (idx, point) => + // calculate next index number + val childrenCenters = Array(2 * idx, 2 * idx + 1).filter(newCenters.keySet.contains(_)) + .map(newCenters(_)).toArray + if (childrenCenters.size >= 1) { + val closestIndex = + HierarchicalClustering.findClosestCenter(metric)(childrenCenters)(point) + val nextIndex = 2 * idx + closestIndex + + // get a map value or else get a sparse vector + val (sumBV, n, sumOfSquares) = map.get(nextIndex) + .getOrElse(BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + map(nextIndex) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) + } + } + map.toIterator + }.reduceByKey { case ((sv1, n1, sumOfSquares1), (sv2, n2, sumOfSquares2)) => + // sum the accumulation and the count in the all partition + (sv1 + sv2, n1 + n2, sumOfSquares1 + sumOfSquares2) + }.collect().toMap + + // calculate the center of each cluster + newCenters = eachStats.map { case (idx, (sum, n, sumOfSquares)) => (idx, sum :/ n)} + + // update summary of each cluster + stats = eachStats.toMap + + variances = stats.map { case (idx, (sum, n, sumOfSquares)) => + math.pow(sumOfSquares.toArray.sum, 1.0 / sumOfSquares.size) + }.sum + diffVariances = math.abs(oldVariances - variances) / oldVariances + oldVariances = variances + subIter += 1 + } + stats + } + + /** + * Updates the indexes of clusters which is divided to its children indexes + */ + private[clustering] + def updateClusterIndex( + data: RDD[(Long, BV[Double])], + dividedClusters: Map[Long, ClusterTree]): RDD[(Long, BV[Double])] = { + // extract the centers of the clusters + val sc = data.sparkContext + var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} + sc.broadcast(centers) + + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + sc.broadcast(metric) + + // update the indexes to their children indexes + data.map { case (idx, point) => + val 
childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(centers.keySet.contains(_)) + childrenIndexes.size match { + // stay the index if the number of children is not enough + case s if s < 2 => (idx, point) + // update the indexes + case _ => { + val nextCenters = childrenIndexes.map(centers(_)).map(_.toBreeze) + val closestIndex = HierarchicalClustering.findClosestCenter(metric)(nextCenters)(point) + val nextIndex = 2 * idx + closestIndex + (nextIndex, point) + } + } + } + } +} + +/** + * A cluster as a tree node which can have its sub nodes + * + * @param center the center of the cluster + * @param records the number of rows in the cluster + * @param variances variance vectors + * @param parent the parent cluster of the cluster + * @param children the children nodes of the cluster + * @param variancesNorm the sum of squares of variances + */ +class ClusterTree( + val center: Vector, + val records: Long, + val variances: Vector, + val variancesNorm: Double, + private var localHeight: Double, + private var parent: Option[ClusterTree], + private var children: Seq[ClusterTree]) extends Serializable { + + require(!variancesNorm.isNaN) + + def this(center: Vector, rows: Long, variances: Vector) = + this(center, rows, variances, breezeNorm(variances.toBreeze, 2.0), + 0.0, None, Array.empty[ClusterTree]) + + /** + * Inserts sub nodes as its children + * + * @param children inserted sub nodes + */ + def insert(children: Array[ClusterTree]) { + this.children = this.children ++ children + children.foreach(child => child.parent = Some(this)) + } + + /** + * Inserts a sub node as its child + * + * @param child inserted sub node + */ + def insert(child: ClusterTree) { + insert(Array(child)) + } + + /** + * Converts the tree into Array class + * the sub nodes are recursively expanded + * + * @return Seq class which the cluster tree is expanded + */ + def toArray(): Array[ClusterTree] = { + val array = this.children.size match { + case 0 => Array(this) + case _ => Array(this) ++ this.children.flatMap(child => child.toArray().toIterator) + } + array.sortWith { case (a, b) => + a.getDepth() < b.getDepth() && a.variances.toArray.sum < b.variances.toArray.sum + } + } + + /** + * Gets the depth of the cluster in the tree + * + * @return the depth + */ + def getDepth(): Int = { + this.parent match { + case None => 0 + case _ => 1 + this.parent.get.getDepth() + } + } + + /** + * Gets the leaves nodes in the cluster tree + */ + def getLeavesNodes(): Array[ClusterTree] = { + this.toArray().filter(_.isLeaf()).sortBy(_.center.toArray.sum) + } + + def isLeaf(): Boolean = (this.children.size == 0) + + def getParent(): Option[ClusterTree] = this.parent + + def getChildren(): Seq[ClusterTree] = this.children + + /** + * Gets the dendrogram height of the cluster at the cluster tree + * + * @return the dendrogram height + */ + def getHeight(): Double = { + this.children.size match { + case 0 => 0.0 + case _ => this.localHeight + this.children.map(_.getHeight()).max + } + } + + private[mllib] + def setLocalHeight(height: Double) = (this.localHeight = height) +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala new file mode 100644 index 0000000000000..d61a0775f7c6b --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.clustering + +import breeze.linalg.{DenseVector => BDV, Vector => BV, norm => breezeNorm} +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.util.{Loader, Saveable} +import org.apache.spark.rdd.RDD +import org.apache.spark.{Logging, SparkContext} + +/** + * This class is used for the model of the hierarchical clustering + * + * @param tree a cluster as a tree node + */ +class HierarchicalClusteringModel(val tree: ClusterTree) + extends Serializable with Saveable with Logging { + + /** Current version of model save/load format. */ + override protected def formatVersion: String = "1.0" + + override def save(sc: SparkContext, path: String) { + val oos = new java.io.ObjectOutputStream(new java.io.FileOutputStream(path)) + try { + oos.writeObject(this) + } finally { + oos.close() + } + } + + def getClusters(): Array[ClusterTree] = this.tree.getLeavesNodes() + + def getCenters(): Array[Vector] = this.getClusters().map(_.center) + + /** + * Predicts the closest cluster by one point + */ + def predict(vector: Vector): Int = { + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + + val centers = this.getCenters().map(_.toBreeze) + HierarchicalClustering.findClosestCenter(metric)(centers)(vector.toBreeze) + } + + /** + * Predicts the closest cluster by RDD of the points + */ + def predict(data: RDD[Vector]): RDD[Int] = { + val sc = data.sparkContext + + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + sc.broadcast(metric) + val centers = this.getCenters().map(_.toBreeze) + sc.broadcast(centers) + + data.map{point => + HierarchicalClustering.findClosestCenter(metric)(centers)(point.toBreeze) + } + } + + /** + * Predicts the closest cluster by RDD of the points for Java + */ + def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = + predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] +} + + +object HierarchicalClusteringModel extends Loader[HierarchicalClusteringModel] { + + override def load(sc: SparkContext, path: String): HierarchicalClusteringModel = { + val stream = new java.io.ObjectInputStream(new java.io.FileInputStream(path)) + try { + stream.readObject().asInstanceOf[HierarchicalClusteringModel] + } finally { + stream.close() + } + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala new file mode 100644 index 0000000000000..73674184dff77 --- /dev/null +++ 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.clustering + +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.scalatest.{BeforeAndAfterEach, FunSuite} + +class HierarchicalClusteringModelSuite + extends FunSuite with MLlibTestSparkContext with BeforeAndAfterEach { + + test("clustering dense vectors") { + val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) + + val localData = (1 to 100).toSeq.map { i => + val label = i % 5 + val vector = Vectors.dense(label, label, label) + (label, vector) + } + val data = sc.parallelize(localData.map(_._2)) + val model = app.run(data) + + val clusters = model.getClusters() + assert(clusters.isInstanceOf[Array[ClusterTree]]) + assert(clusters.size === 5) + + val centers = model.getCenters().sortBy(_.toArray.sum) + assert(centers.size === 5) + assert(centers(0) === Vectors.dense(0.0, 0.0, 0.0)) + assert(centers(1) === Vectors.dense(1.0, 1.0, 1.0)) + assert(centers(2) === Vectors.dense(2.0, 2.0, 2.0)) + assert(centers(3) === Vectors.dense(3.0, 3.0, 3.0)) + assert(centers(4) === Vectors.dense(4.0, 4.0, 4.0)) + + // predict with one vector + assert(model.predict(Vectors.dense(0.0, 0.0, 0.0)) === 0) + assert(model.predict(Vectors.dense(0.5, 0.5, 0.5)) === 0) + assert(model.predict(Vectors.dense(1.0, 1.0, 1.0)) === 1) + assert(model.predict(Vectors.dense(2.0, 2.0, 2.0)) === 2) + assert(model.predict(Vectors.dense(3.0, 3.0, 3.0)) === 3) + assert(model.predict(Vectors.dense(4.0, 4.0, 4.0)) === 4) + + // predict with a RDD + val predicted = model.predict(data).collect() + assert(predicted === localData.map(_._1)) + } + + test("clustering sparse vectors") { + val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) + + val localData = (1 to 100).toSeq.map { i => + val label = i % 5 + val vector = Vectors.sparse(5, Seq((label, label.toDouble))) + (label, vector) + } + val data = sc.parallelize(localData.map(_._2)) + val model = app.run(data) + + val clusters = model.getClusters() + assert(clusters.isInstanceOf[Array[ClusterTree]]) + assert(clusters.size === 5) + + val centers = model.getCenters().sortBy(_.toArray.sum) + assert(centers.size === 5) + assert(centers(0) === Vectors.sparse(5, Array(), Array())) + assert(centers(1) === Vectors.sparse(5, Array(1), Array(1.0))) + assert(centers(2) === Vectors.sparse(5, Array(2), Array(2.0))) + assert(centers(3) === Vectors.sparse(5, Array(3), Array(3.0))) + assert(centers(4) === Vectors.sparse(5, Array(4), Array(4.0))) + + // predict with one vector + assert(model.predict(Vectors.sparse(5, Array(0), Array(0.0))) === 0) + 
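An end-to-end usage sketch of the API these suites exercise, for example from spark-shell. It is not part of the patch and assumes an existing SparkContext named sc, as the surrounding tests obtain from MLlibTestSparkContext; the expected outputs are only what the tests suggest.

import org.apache.spark.mllib.clustering.HierarchicalClustering
import org.apache.spark.mllib.linalg.Vectors

val points = sc.parallelize((1 to 100).map { i =>
  val label = (i % 5).toDouble
  Vectors.dense(label, label, label)
})
val model = new HierarchicalClustering().setNumClusters(5).setSeed(1).run(points)
println(model.getCenters().length)                    // expected: 5 leaf centers
println(model.predict(Vectors.dense(3.0, 3.0, 3.0)))  // index of the closest center
val assignments = model.predict(points)               // per-point cluster indexes as an RDD[Int]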
assert(model.predict(Vectors.sparse(5, Array(1), Array(1.0))) === 1) + assert(model.predict(Vectors.sparse(5, Array(2), Array(2.0))) === 2) + assert(model.predict(Vectors.sparse(5, Array(3), Array(3.0))) === 3) + assert(model.predict(Vectors.sparse(5, Array(4), Array(4.0))) === 4) + + // predict with a RDD + val predicted = model.predict(data).collect() + assert(predicted === localData.map(_._1)) + } + + test("save a model, and then load the model") { + val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) + + val localData = (1 to 100).toSeq.map { i => + val label = i % 5 + val vector = Vectors.dense(label, label, label) + (label, vector) + } + val data = sc.parallelize(localData.map(_._2)) + val model = app.run(data) + + val tmpFile = java.io.File.createTempFile("hierarchical-clustering", "save-load") + model.save(sc, tmpFile.getAbsolutePath) + + val sameModel = HierarchicalClusteringModel.load(sc, tmpFile.getAbsolutePath) + assert(sameModel.getClass.getSimpleName.toString === "HierarchicalClusteringModel") + localData.foreach { case (label, vector) => + assert(model.predict(vector) === sameModel.predict(vector)) + } + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala new file mode 100644 index 0000000000000..befb0bea5af90 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.clustering + +import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, norm => breezeNorm} +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ +import org.scalatest.FunSuite + + +class HierarchicalClusteringAppSuite extends FunSuite with MLlibTestSparkContext { + + test("train") { + val numClusters = 9 + val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq + val data = sc.parallelize(localSeed, 1) + val model = HierarchicalClustering.train(data, numClusters) + assert(model.getClusters().size === numClusters) + assert(model.tree.getHeight() ~== 67.1751 absTol 10E-4) + } + + test("train with full arguments") { + val numClusters = 9 + val subIterations = 20 + val maxRetries = 20 + val seed = 321 + + val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq + val data = sc.parallelize(localSeed, 1) + + val model = HierarchicalClustering.train(data, numClusters, subIterations, maxRetries, seed) + assert(model.getClusters().size === numClusters) + assert(model.tree.getHeight() ~== 67.1751 absTol 10E-4) + } +} + + +class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { + + test("run") { + val algo = new HierarchicalClustering().setNumClusters(123) + val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq + val data = sc.parallelize(localSeed, 2) + val model = algo.run(data) + assert(model.getClusters().size == 123) + assert(model.tree.getHeight() ~== 702.8641 absTol 10E-4) + + // check the relations between a parent cluster and its children + assert(model.tree.getParent() === None) + assert(model.tree.getChildren().apply(0).getParent().get === model.tree) + assert(model.tree.getChildren().apply(1).getParent().get === model.tree) + assert(model.getClusters().forall(_.getParent() != None)) + } + + test("run with too many cluster size than the records") { + val algo = new HierarchicalClustering().setNumClusters(123) + val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq + val data = sc.parallelize(localSeed, 2) + val model = algo.run(data) + assert(model.getClusters().size == 100) + assert(model.tree.getHeight() ~== 72.12489 absTol 10E-4) + } + + test("initializeData") { + val algo = new HierarchicalClustering + val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq + val seed = sc.parallelize(localSeed) + val data = algo.initData(seed) + assert(data.map(_._1).collect().distinct === Array(1)) + } + + test("get center stats") { + val algo = new HierarchicalClustering + val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq + val seed = sc.parallelize(localSeed) + val data = algo.initData(seed) + + val clusters = algo.summarizeAsClusters(data) + val center = clusters(1).center + assert(clusters.size === 1) + assert(clusters(1).center === Vectors.dense(49.5, 49.5)) + assert(clusters(1).records === 100) + + val data2 = seed.map(v => ((v.apply(0) / 25).toLong + 1L, v.toBreeze)) + val clusters2 = algo.summarizeAsClusters(data2) + assert(clusters2.size === 4) + assert(clusters2(1).center === Vectors.dense(12.0, 12.0)) + assert(clusters2(1).records === 25) + assert(clusters2(2).center === Vectors.dense(37.0, 37.0)) + assert(clusters2(2).records === 25) + assert(clusters2(3).center === 
Vectors.dense(62.0, 62.0)) + assert(clusters2(3).records === 25) + assert(clusters2(4).center === Vectors.dense(87.0, 87.0)) + assert(clusters2(4).records === 25) + } + + test("getChildrenCenter") { + val algo = new HierarchicalClustering + val centers = Map( + 2L -> Vectors.dense(1.0, 1.0).toBreeze, + 3L -> Vectors.dense(2.0, 2.0).toBreeze + ) + val initNextCenters = algo.initChildrenCenter(centers) + assert(initNextCenters.size === 4) + assert(initNextCenters.keySet === Set(4, 5, 6, 7)) + } + + test("should divide clusters") { + val algo = new HierarchicalClustering + val seed = (0 to 99).map(i => ((i / 50) + 2L, Vectors.dense(i, i).toBreeze)) + val data = sc.parallelize(seed) + val clusters = algo.summarizeAsClusters(data) + val newClusters = algo.getDividedClusters(data, clusters) + + assert(newClusters.size === 4) + assert(newClusters(4).center === Vectors.dense(12.0, 12.0)) + assert(newClusters(4).records === 25) + assert(newClusters(5).center === Vectors.dense(37.0, 37.0)) + assert(newClusters(5).records === 25) + assert(newClusters(6).center === Vectors.dense(62.0, 62.0)) + assert(newClusters(6).records === 25) + assert(newClusters(7).center === Vectors.dense(87.0, 87.0)) + assert(newClusters(7).records === 25) + } + + test("should assign each data to new clusters") { + val algo = new HierarchicalClustering + val seed = Seq( + (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), (2L, Vectors.dense(2.0, 2.0)), + (2L, Vectors.dense(3.0, 3.0)), (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 5.0)), + (3L, Vectors.dense(6.0, 6.0)), (3L, Vectors.dense(7.0, 7.0)), (3L, Vectors.dense(8.0, 8.0)), + (3L, Vectors.dense(9.0, 9.0)), (3L, Vectors.dense(10.0, 10.0)), (3L, Vectors.dense(11.0, 11.0)) + ).map { case (idx, vector) => (idx, vector.toBreeze)} + val newClusters = Map( + 4L -> new ClusterTree(Vectors.dense(1.0, 1.0), 3, Vectors.dense(1.0, 1.0)), + 5L -> new ClusterTree(Vectors.dense(4.0, 4.0), 3, Vectors.dense(1.0, 1.0)), + 6L -> new ClusterTree(Vectors.dense(7.0, 7.0), 3, Vectors.dense(1.0, 1.0)), + 7L -> new ClusterTree(Vectors.dense(10.0, 10.0), 3, Vectors.dense(1.0, 1.0)) + ) + val data = sc.parallelize(seed) + val result = algo.updateClusterIndex(data, newClusters).collect().toSeq + + val expected = Seq( + (4, Vectors.dense(0.0, 0.0)), (4, Vectors.dense(1.0, 1.0)), (4, Vectors.dense(2.0, 2.0)), + (5, Vectors.dense(3.0, 3.0)), (5, Vectors.dense(4.0, 4.0)), (5, Vectors.dense(5.0, 5.0)), + (6, Vectors.dense(6.0, 6.0)), (6, Vectors.dense(7.0, 7.0)), (6, Vectors.dense(8.0, 8.0)), + (7, Vectors.dense(9.0, 9.0)), (7, Vectors.dense(10.0, 10.0)), (7, Vectors.dense(11.0, 11.0)) + ).map { case (idx, vector) => (idx, vector.toBreeze)} + assert(result === expected) + } + + test("setSubIterations") { + val algo = new HierarchicalClustering() + assert(algo.getSubIterations() == 20) + algo.setMaxIterations(15) + assert(algo.getSubIterations() == 15) + } + + test("setNumRetries") { + val algo = new HierarchicalClustering() + assert(algo.getMaxRetries() == 10) + algo.setMaxRetries(15) + assert(algo.getMaxRetries() == 15) + } + + test("setSeed") { + val algo = new HierarchicalClustering() + assert(algo.getSeed() == 1) + algo.setSeed(987) + assert(algo.getSeed() == 987) + } +} From c51017cccf8d62ac4ffdb4d67f0f83a7526da659 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 30 Mar 2015 23:13:39 +0900 Subject: [PATCH 02/76] Fix the some comments --- .../spark/mllib/clustering/HierarchicalClustering.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index dd6cf6d0f8b94..30c7804e3d151 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -352,7 +352,7 @@ class HierarchicalClustering( * @param treeMap divided clusters as a Map class * @param rootIndex index you want to start * @param numClusters the number of clusters you want - * @return + * @return a built cluster tree */ private[clustering] def buildTree(treeMap: Map[Long, ClusterTree], @@ -513,9 +513,10 @@ class HierarchicalClustering( * @param center the center of the cluster * @param records the number of rows in the cluster * @param variances variance vectors + * @param variancesNorm the norm of variance vector + * @param localHeight the maximal distance between this node and its children * @param parent the parent cluster of the cluster * @param children the children nodes of the cluster - * @param variancesNorm the sum of squares of variances */ class ClusterTree( val center: Vector, @@ -555,7 +556,7 @@ class ClusterTree( * Converts the tree into Array class * the sub nodes are recursively expanded * - * @return Seq class which the cluster tree is expanded + * @return an Array class which the cluster tree is expanded */ def toArray(): Array[ClusterTree] = { val array = this.children.size match { @@ -570,7 +571,7 @@ class ClusterTree( /** * Gets the depth of the cluster in the tree * - * @return the depth + * @return the depth from the root */ def getDepth(): Int = { this.parent match { From a8cd7abcef0688cee9e6af8f6e2416fe7c0e266c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 2 Apr 2015 13:10:38 +0900 Subject: [PATCH 03/76] Remove parentheses for getters and add a test for HierarchicalClustering.setNumClusters --- .../clustering/HierarchicalClustering.scala | 8 +++++--- .../HierarchicalClusteringSuite.scala | 19 +++++++++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 30c7804e3d151..566475dce2345 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -118,6 +118,8 @@ class HierarchicalClustering( this } + def getNumClusters: Int = this.numClusters + /** * Sets the number of maximal iterations in each clustering step */ @@ -126,7 +128,7 @@ class HierarchicalClustering( this } - def getSubIterations(): Int = this.maxIterations + def getSubIterations: Int = this.maxIterations /** * Sets the number of maximum retries of each clustering step @@ -136,7 +138,7 @@ class HierarchicalClustering( this } - def getMaxRetries(): Int = this.maxRetries + def getMaxRetries: Int = this.maxRetries /** * Sets the random seed @@ -146,7 +148,7 @@ class HierarchicalClustering( this } - def getSeed(): Long = this.seed + def getSeed: Long = this.seed /** * Runs the hierarchical clustering algorithm diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index befb0bea5af90..57af03a4ae305 100644 --- 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -165,24 +165,31 @@ class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { assert(result === expected) } + test("setNumClusters") { + val algo = new HierarchicalClustering() + assert(algo.getNumClusters == 20) + algo.setNumClusters(1000) + assert(algo.getNumClusters == 1000) + } + test("setSubIterations") { val algo = new HierarchicalClustering() - assert(algo.getSubIterations() == 20) + assert(algo.getSubIterations == 20) algo.setMaxIterations(15) - assert(algo.getSubIterations() == 15) + assert(algo.getSubIterations == 15) } test("setNumRetries") { val algo = new HierarchicalClustering() - assert(algo.getMaxRetries() == 10) + assert(algo.getMaxRetries == 10) algo.setMaxRetries(15) - assert(algo.getMaxRetries() == 15) + assert(algo.getMaxRetries == 15) } test("setSeed") { val algo = new HierarchicalClustering() - assert(algo.getSeed() == 1) + assert(algo.getSeed == 1) algo.setSeed(987) - assert(algo.getSeed() == 987) + assert(algo.getSeed == 987) } } From 306f6037c634745f3b66d0048685156fb2725b8a Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 8 Apr 2015 11:26:57 +0900 Subject: [PATCH 04/76] Remove the static train() method from HierarchicalClustering object --- .../clustering/HierarchicalClustering.scala | 38 ------------------- .../HierarchicalClusteringSuite.scala | 23 ----------- 2 files changed, 61 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 566475dce2345..1ce993e8be14c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -26,48 +26,10 @@ import org.apache.spark.{Logging, SparkException} import scala.collection.{Map, mutable} -/** - * Top-level methods for calling the hierarchical clustering algorithm - */ object HierarchicalClustering extends Logging { private[clustering] val ROOT_INDEX_KEY: Long = 1 - /** - * Trains a hierarchical clustering model with the given data - * - * @param data trained data - * @param numClusters the maximum number of clusters you want - * @return a hierarchical clustering model - */ - def train(data: RDD[Vector], numClusters: Int): HierarchicalClusteringModel = { - val algo = new HierarchicalClustering().setNumClusters(numClusters) - algo.run(data) - } - - /** - * Trains a hierarchical clustering model with the given data - * - * @param data training data - * @param numClusters the maximum number of clusters you want - * @param maxIterations the number of maximal iterations - * @param maxRetries the number of maximum retries when the clustering can't be succeeded - * @param seed the randomseed to generate the initial vectors for each bisecting - * @return a hierarchical clustering model - */ - def train(data: RDD[Vector], - numClusters: Int, - maxIterations: Int, - maxRetries: Int, - seed: Int): HierarchicalClusteringModel = { - - val algo = new HierarchicalClustering().setNumClusters(numClusters) - .setMaxIterations(maxIterations) - .setMaxRetries(maxRetries) - .setSeed(seed) - algo.run(data) - } - /** * Finds the closes cluster's center * diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 57af03a4ae305..e0c7bd13b3bfb 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -25,29 +25,6 @@ import org.scalatest.FunSuite class HierarchicalClusteringAppSuite extends FunSuite with MLlibTestSparkContext { - - test("train") { - val numClusters = 9 - val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val data = sc.parallelize(localSeed, 1) - val model = HierarchicalClustering.train(data, numClusters) - assert(model.getClusters().size === numClusters) - assert(model.tree.getHeight() ~== 67.1751 absTol 10E-4) - } - - test("train with full arguments") { - val numClusters = 9 - val subIterations = 20 - val maxRetries = 20 - val seed = 321 - - val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val data = sc.parallelize(localSeed, 1) - - val model = HierarchicalClustering.train(data, numClusters, subIterations, maxRetries, seed) - assert(model.getClusters().size === numClusters) - assert(model.tree.getHeight() ~== 67.1751 absTol 10E-4) - } } From b2d0369947bce56f4bbce2b87a0da00a81d5545c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 8 Apr 2015 12:12:52 +0900 Subject: [PATCH 05/76] Add a test for HierarchicalClustering.findClosestCenter() --- .../HierarchicalClusteringSuite.scala | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index e0c7bd13b3bfb..afe2d0652bf9b 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -25,6 +25,25 @@ import org.scalatest.FunSuite class HierarchicalClusteringAppSuite extends FunSuite with MLlibTestSparkContext { + + test("the root index is equal to 1") { + assert(HierarchicalClustering.ROOT_INDEX_KEY === 1) + } + + test("findClosestCenter") { + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val centers = Seq( + Vectors.sparse(5, Array(0, 1, 2), Array(0.0, 1.0, 2.0)).toBreeze, + Vectors.sparse(5, Array(1, 2, 3), Array(1.0, 2.0, 3.0)).toBreeze, + Vectors.sparse(5, Array(2, 3, 4), Array(2.0, 3.0, 4.0)).toBreeze + ) + + for (i <- 0 to (centers.size - 1)) { + val point = centers(i) + val closestIndex = HierarchicalClustering.findClosestCenter(metric)(centers)(point) + assert(closestIndex === i) + } + } } From 0ddfcfb2d02c286a01227902c5a5ea859bf6a981 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 7 Apr 2015 14:11:40 +0900 Subject: [PATCH 06/76] Add a function to compute Within Set Sum of Squared Error into Scala/Java/Python --- .../HierarchicalClusteringModel.scala | 19 +++++++++++++++++++ .../HierarchicalClusteringModelSuite.scala | 6 ++++++ 2 files changed, 25 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index d61a0775f7c6b..18040de3afcea 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -81,6 +81,25 @@ class HierarchicalClusteringModel(val tree: ClusterTree) */ def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] + + /** + * Computes Within Set Sum of Squeared Error(WSSSE) + */ + def WSSSE(data: RDD[Vector]): Double = { + val bvCenters = this.getCenters().map(_.toBreeze) + data.context.broadcast(bvCenters) + val distances = data.map {point => + val bvPoint = point.toBreeze + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val idx = HierarchicalClustering.findClosestCenter(metric)(bvCenters)(bvPoint) + val closestCenter = bvCenters(idx) + val distance = metric(bvPoint, closestCenter) + distance + } + distances.sum() + } + + def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index 73674184dff77..31b7a5255cd14 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -58,6 +58,9 @@ class HierarchicalClusteringModelSuite // predict with a RDD val predicted = model.predict(data).collect() assert(predicted === localData.map(_._1)) + + // compute WSSSE + assert(model.WSSSE(data) === 0.0) } test("clustering sparse vectors") { @@ -93,6 +96,9 @@ class HierarchicalClusteringModelSuite // predict with a RDD val predicted = model.predict(data).collect() assert(predicted === localData.map(_._1)) + + // compute WSSSE + assert(model.WSSSE(data) === 0.0) } test("save a model, and then load the model") { From ecb3fd703573ea9c75df1c38f96124fa7f3c003f Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 2 Apr 2015 14:24:58 +0900 Subject: [PATCH 07/76] Change the visibility of constructer parameters of HierarchicalClustering and ClusterTree from public to private --- .../spark/mllib/clustering/HierarchicalClustering.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 1ce993e8be14c..6f862440ae3c9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -60,7 +60,7 @@ object HierarchicalClustering extends Logging { * @param maxRetries the number of maximum retries * @param seed a random seed */ -class HierarchicalClustering( +class HierarchicalClustering private ( private var numClusters: Int, private var clusterMap: Map[Long, ClusterTree], private var maxIterations: Int, @@ -482,7 +482,7 @@ class HierarchicalClustering( * @param parent the parent cluster of the cluster * @param children the children nodes of the cluster */ -class ClusterTree( +class ClusterTree private ( val center: Vector, val records: Long, val variances: Vector, From 08f01013b27500c5266f92a8be3300684a40cc9c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 27 Apr 2015 15:45:41 +0900 Subject: [PATCH 08/76] Rename getSubIterations to getMaxIterations --- .../spark/mllib/clustering/HierarchicalClustering.scala | 2 +- 
.../spark/mllib/clustering/HierarchicalClusteringSuite.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 6f862440ae3c9..dc295714ad729 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -90,7 +90,7 @@ class HierarchicalClustering private ( this } - def getSubIterations: Int = this.maxIterations + def getMaxIterations: Int = this.maxIterations /** * Sets the number of maximum retries of each clustering step diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index afe2d0652bf9b..4c3630ad2025d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -170,9 +170,9 @@ class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { test("setSubIterations") { val algo = new HierarchicalClustering() - assert(algo.getSubIterations == 20) + assert(algo.getMaxIterations == 20) algo.setMaxIterations(15) - assert(algo.getSubIterations == 15) + assert(algo.getMaxIterations == 15) } test("setNumRetries") { From 2a14900e92d8981d6313e47773499d3ad622569a Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 27 Apr 2015 16:23:52 +0900 Subject: [PATCH 09/76] Modify how to broadcast variables --- .../clustering/HierarchicalClustering.scala | 26 ++++++++++--------- .../HierarchicalClusteringModelSuite.scala | 23 ++++++++++++++++ 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index dc295714ad729..77804c3bc68ae 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -290,8 +290,8 @@ class HierarchicalClustering private ( s"at ${tryTimes} times in ${appName}") // divide the failed clusters again - sc.broadcast(failedIndexes) - dividableData = data.filter { case (idx, point) => failedIndexes.contains(idx)} + val bcFailedIndexes = sc.broadcast(failedIndexes) + dividableData = data.filter { case (idx, point) => bcFailedIndexes.value.contains(idx)} val missingStats = divide(dividableData, failedCenters) stats = stats ++ missingStats tryTimes += 1 @@ -381,11 +381,11 @@ class HierarchicalClustering private ( if (newCenters.size == 0) { return Map.empty[Long, (BV[Double], Double, BV[Double])] } - sc.broadcast(newCenters) + var bcNewCenters = sc.broadcast(newCenters) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - sc.broadcast(metric) + val bcMetric = sc.broadcast(metric) val vectorSize = newCenters(newCenters.keySet.min).size var stats = newCenters.keys.map { idx => @@ -402,11 +402,11 @@ class HierarchicalClustering private ( val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] iter.foreach { case (idx, point) => // calculate next index number - val childrenCenters 
= Array(2 * idx, 2 * idx + 1).filter(newCenters.keySet.contains(_)) - .map(newCenters(_)).toArray + val childrenCenters = Array(2 * idx, 2 * idx + 1) + .filter(bcNewCenters.value.keySet.contains(_)).map(bcNewCenters.value(_)).toArray if (childrenCenters.size >= 1) { val closestIndex = - HierarchicalClustering.findClosestCenter(metric)(childrenCenters)(point) + HierarchicalClustering.findClosestCenter(bcMetric.value)(childrenCenters)(point) val nextIndex = 2 * idx + closestIndex // get a map value or else get a sparse vector @@ -423,6 +423,7 @@ class HierarchicalClustering private ( // calculate the center of each cluster newCenters = eachStats.map { case (idx, (sum, n, sumOfSquares)) => (idx, sum :/ n)} + bcNewCenters = sc.broadcast(newCenters) // update summary of each cluster stats = eachStats.toMap @@ -447,22 +448,23 @@ class HierarchicalClustering private ( // extract the centers of the clusters val sc = data.sparkContext var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} - sc.broadcast(centers) + val bcCenters = sc.broadcast(centers) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - sc.broadcast(metric) + val bcMetric = sc.broadcast(metric) // update the indexes to their children indexes data.map { case (idx, point) => - val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(centers.keySet.contains(_)) + val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(bcCenters.value.keySet.contains(_)) childrenIndexes.size match { // stay the index if the number of children is not enough case s if s < 2 => (idx, point) // update the indexes case _ => { - val nextCenters = childrenIndexes.map(centers(_)).map(_.toBreeze) - val closestIndex = HierarchicalClustering.findClosestCenter(metric)(nextCenters)(point) + val nextCenters = childrenIndexes.map(bcCenters.value(_)).map(_.toBreeze) + val closestIndex = HierarchicalClustering + .findClosestCenter(bcMetric.value)(nextCenters)(point) val nextIndex = 2 * idx + closestIndex (nextIndex, point) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index 31b7a5255cd14..d747cec7fd558 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -101,6 +101,29 @@ class HierarchicalClusteringModelSuite assert(model.WSSSE(data) === 0.0) } + test("clustering should be done correctly") { + for (numClusters <- Array(9, 99, 999)) { + val app = new HierarchicalClustering().setNumClusters(numClusters).setSeed(1) + val localData = (1 to 1000).toSeq.map { i => + val label = i % numClusters + val sparseVector = Vectors.sparse(numClusters, Seq((label, label.toDouble))) + val denseVector = Vectors.fromBreeze(sparseVector.toBreeze.toDenseVector) + (label, denseVector, sparseVector) + } + // dense version + val denseData = sc.parallelize(localData.map(_._2), 2) + val denseModel = app.run(denseData) + assert(denseModel.getCenters().size === numClusters) + assert(denseModel.getClusters().forall(_.variancesNorm == 0.0)) + + // sparse version + val sparseData = sc.parallelize(localData.map(_._3), 2) + val sparseModel = app.run(sparseData) + assert(sparseModel.getCenters().size === numClusters) + assert(sparseModel.getClusters().forall(_.variancesNorm 
== 0.0)) + } + } + test("save a model, and then load the model") { val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) From 38f07bd779fb0e19219aa4a413ca2c9ac1044928 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 27 Apr 2015 16:34:27 +0900 Subject: [PATCH 10/76] Remove unnecessary parentheses --- .../clustering/HierarchicalClustering.scala | 22 +++++++++---------- .../HierarchicalClusteringModel.scala | 10 ++++----- .../HierarchicalClusteringModelSuite.scala | 16 +++++++------- .../HierarchicalClusteringSuite.scala | 16 +++++++------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 77804c3bc68ae..6ca3dff0e5f07 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -182,7 +182,7 @@ class HierarchicalClustering private ( // make a hierarchical clustering model val model = new HierarchicalClusteringModel(root.get) - val leavesNodes = model.getClusters() + val leavesNodes = model.getClusters if (leavesNodes.size < this.numClusters) { log.warn(s"# clusters is less than you have expected: ${leavesNodes.size} / ${numClusters}. ") } @@ -530,7 +530,7 @@ class ClusterTree private ( case _ => Array(this) ++ this.children.flatMap(child => child.toArray().toIterator) } array.sortWith { case (a, b) => - a.getDepth() < b.getDepth() && a.variances.toArray.sum < b.variances.toArray.sum + a.getDepth < b.getDepth && a.variances.toArray.sum < b.variances.toArray.sum } } @@ -539,35 +539,35 @@ class ClusterTree private ( * * @return the depth from the root */ - def getDepth(): Int = { + def getDepth: Int = { this.parent match { case None => 0 - case _ => 1 + this.parent.get.getDepth() + case _ => 1 + this.parent.get.getDepth } } /** * Gets the leaves nodes in the cluster tree */ - def getLeavesNodes(): Array[ClusterTree] = { - this.toArray().filter(_.isLeaf()).sortBy(_.center.toArray.sum) + def getLeavesNodes: Array[ClusterTree] = { + this.toArray().filter(_.isLeaf).sortBy(_.center.toArray.sum) } - def isLeaf(): Boolean = (this.children.size == 0) + def isLeaf: Boolean = (this.children.size == 0) - def getParent(): Option[ClusterTree] = this.parent + def getParent: Option[ClusterTree] = this.parent - def getChildren(): Seq[ClusterTree] = this.children + def getChildren: Seq[ClusterTree] = this.children /** * Gets the dendrogram height of the cluster at the cluster tree * * @return the dendrogram height */ - def getHeight(): Double = { + def getHeight: Double = { this.children.size match { case 0 => 0.0 - case _ => this.localHeight + this.children.map(_.getHeight()).max + case _ => this.localHeight + this.children.map(_.getHeight).max } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index 18040de3afcea..6ec44325a0f8d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -44,9 +44,9 @@ class HierarchicalClusteringModel(val tree: ClusterTree) } } - def getClusters(): Array[ClusterTree] = this.tree.getLeavesNodes() + def getClusters: Array[ClusterTree] = this.tree.getLeavesNodes - def 
getCenters(): Array[Vector] = this.getClusters().map(_.center) + def getCenters: Array[Vector] = this.getClusters.map(_.center) /** * Predicts the closest cluster by one point @@ -55,7 +55,7 @@ class HierarchicalClusteringModel(val tree: ClusterTree) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val centers = this.getCenters().map(_.toBreeze) + val centers = this.getCenters.map(_.toBreeze) HierarchicalClustering.findClosestCenter(metric)(centers)(vector.toBreeze) } @@ -68,7 +68,7 @@ class HierarchicalClusteringModel(val tree: ClusterTree) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) sc.broadcast(metric) - val centers = this.getCenters().map(_.toBreeze) + val centers = this.getCenters.map(_.toBreeze) sc.broadcast(centers) data.map{point => @@ -86,7 +86,7 @@ class HierarchicalClusteringModel(val tree: ClusterTree) * Computes Within Set Sum of Squeared Error(WSSSE) */ def WSSSE(data: RDD[Vector]): Double = { - val bvCenters = this.getCenters().map(_.toBreeze) + val bvCenters = this.getCenters.map(_.toBreeze) data.context.broadcast(bvCenters) val distances = data.map {point => val bvPoint = point.toBreeze diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index d747cec7fd558..805269d8e2f61 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -35,11 +35,11 @@ class HierarchicalClusteringModelSuite val data = sc.parallelize(localData.map(_._2)) val model = app.run(data) - val clusters = model.getClusters() + val clusters = model.getClusters assert(clusters.isInstanceOf[Array[ClusterTree]]) assert(clusters.size === 5) - val centers = model.getCenters().sortBy(_.toArray.sum) + val centers = model.getCenters.sortBy(_.toArray.sum) assert(centers.size === 5) assert(centers(0) === Vectors.dense(0.0, 0.0, 0.0)) assert(centers(1) === Vectors.dense(1.0, 1.0, 1.0)) @@ -74,11 +74,11 @@ class HierarchicalClusteringModelSuite val data = sc.parallelize(localData.map(_._2)) val model = app.run(data) - val clusters = model.getClusters() + val clusters = model.getClusters assert(clusters.isInstanceOf[Array[ClusterTree]]) assert(clusters.size === 5) - val centers = model.getCenters().sortBy(_.toArray.sum) + val centers = model.getCenters.sortBy(_.toArray.sum) assert(centers.size === 5) assert(centers(0) === Vectors.sparse(5, Array(), Array())) assert(centers(1) === Vectors.sparse(5, Array(1), Array(1.0))) @@ -113,14 +113,14 @@ class HierarchicalClusteringModelSuite // dense version val denseData = sc.parallelize(localData.map(_._2), 2) val denseModel = app.run(denseData) - assert(denseModel.getCenters().size === numClusters) - assert(denseModel.getClusters().forall(_.variancesNorm == 0.0)) + assert(denseModel.getCenters.size === numClusters) + assert(denseModel.getClusters.forall(_.variancesNorm == 0.0)) // sparse version val sparseData = sc.parallelize(localData.map(_._3), 2) val sparseModel = app.run(sparseData) - assert(sparseModel.getCenters().size === numClusters) - assert(sparseModel.getClusters().forall(_.variancesNorm == 0.0)) + assert(sparseModel.getCenters.size === numClusters) + 
assert(sparseModel.getClusters.forall(_.variancesNorm == 0.0)) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 4c3630ad2025d..9f0b18e6dfa58 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -54,14 +54,14 @@ class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) - assert(model.getClusters().size == 123) - assert(model.tree.getHeight() ~== 702.8641 absTol 10E-4) + assert(model.getClusters.size == 123) + assert(model.tree.getHeight ~== 702.8641 absTol 10E-4) // check the relations between a parent cluster and its children - assert(model.tree.getParent() === None) - assert(model.tree.getChildren().apply(0).getParent().get === model.tree) - assert(model.tree.getChildren().apply(1).getParent().get === model.tree) - assert(model.getClusters().forall(_.getParent() != None)) + assert(model.tree.getParent === None) + assert(model.tree.getChildren.apply(0).getParent.get === model.tree) + assert(model.tree.getChildren.apply(1).getParent.get === model.tree) + assert(model.getClusters.forall(_.getParent != None)) } test("run with too many cluster size than the records") { @@ -69,8 +69,8 @@ class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) - assert(model.getClusters().size == 100) - assert(model.tree.getHeight() ~== 72.12489 absTol 10E-4) + assert(model.getClusters.size == 100) + assert(model.tree.getHeight ~== 72.12489 absTol 10E-4) } test("initializeData") { From 99e703b1a2a4c3f3e7827ffcfd61dde22ae9e7cf Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 28 Apr 2015 15:42:24 +0900 Subject: [PATCH 11/76] Add toLinkageMatrix() and toAdjacencyList() --- .../clustering/HierarchicalClustering.scala | 52 +++++++++++++++++++ .../HierarchicalClusteringModel.scala | 31 +++++++++++ .../HierarchicalClusteringModelSuite.scala | 44 ++++++++++++++++ 3 files changed, 127 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 6ca3dff0e5f07..dc06f1891e64b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -571,6 +571,58 @@ class ClusterTree private ( } } + /** + * Converts to a adjacency list + * + * @return List[(fromNodeId, toNodeId, distance)] + */ + def toAdjacencyList(): Array[(Int, Int, Double)] = { + val nodes = toArray() + + var adjacencyList = Array.empty[(Int, Int, Double)] + nodes.foreach { parent => + if (parent.children.size > 1) { + val parentIndex = nodes.indexOf(parent) + parent.children.foreach { child => + val childIndex = nodes.indexOf(child) + adjacencyList = adjacencyList :+(parentIndex, childIndex, parent.localHeight) + } + } + } + adjacencyList + } + + /** + * Converts to a linkage matrix + * Returned data format is fit for scipy's dendrogram 
function + * SEE ALSO: scipy.cluster.hierarchy.dendrogram + * + * @return List[(node1, node2, distance, tree size)] + */ + def toLinkageMatrix(): Array[(Int, Int, Double, Int)] = { + val nodes = toArray().sortWith { case (a, b) => a.getHeight < b.getHeight} + val leaves = nodes.filter(_.isLeaf) + val notLeaves = nodes.filterNot(_.isLeaf).filter(_.getChildren.size > 1) + val clusters = leaves ++ notLeaves + val treeMap = clusters.zipWithIndex.map { case (tree, idx) => (tree -> idx)}.toMap + + // If a node only has one-child, the child is regarded as the cluster of the child. + // Cluster A has cluster B and Cluster B. B is a leaf. C only has cluster D. + // ==> A merge list is (B, D), not (B, C). + def getIndex(map: Map[ClusterTree, Int], tree: ClusterTree): Int = { + tree.children.size match { + case 1 => getIndex(map, tree.children(0)) + case _ => map(tree) + } + } + clusters.filterNot(_.isLeaf).map { tree => + (getIndex(treeMap, tree.children(0)), + getIndex(treeMap, tree.children(1)), + tree.getHeight, + tree.toArray().filter(_.isLeaf).size) + } + } + private[mllib] def setLocalHeight(height: Double) = (this.localHeight = height) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index 6ec44325a0f8d..9dcf84b5df381 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -100,6 +100,37 @@ class HierarchicalClusteringModel(val tree: ClusterTree) } def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) + + def toAdjacencyList(): Array[(Int, Int, Double)] = this.tree.toAdjacencyList() + + /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ + def toJavaAdjacencyList(): java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { + var javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]](); + this.tree.toAdjacencyList().foreach { x => + val edge = new java.util.ArrayList[java.lang.Double]() + edge.add(x._1) + edge.add(x._2) + edge.add(x._3) + javaList.add(edge) + } + javaList + } + + def toLinkageMatrix(): Array[(Int, Int, Double, Int)] = this.tree.toLinkageMatrix() + + /** Since Java doesn't support tuple, we must support the data structure for java and py4j. 
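For example, an adjacency-list entry such as the tuple (0, 1, 2.5981) is exposed to Java and Python callers as an ArrayList of java.lang.Double values.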
*/ + def toJavaLinkageMatrix(): java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { + val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() + this.tree.toLinkageMatrix().foreach {x => + val row = new java.util.ArrayList[java.lang.Double]() + row.add(x._1) + row.add(x._2) + row.add(x._3) + row.add(x._4) + javaList.add(row) + } + javaList + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index 805269d8e2f61..d374ec956562a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -61,6 +61,28 @@ class HierarchicalClusteringModelSuite // compute WSSSE assert(model.WSSSE(data) === 0.0) + + // adjacency list + val adjacencyList = model.toAdjacencyList() + .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) + assert(adjacencyList.size === 8) + assert(adjacencyList(0) === (0, 1, 2.5981)) + assert(adjacencyList(1) === (0, 6, 2.5981)) + assert(adjacencyList(2) === (1, 2, 1.7321)) + assert(adjacencyList(3) === (1, 5, 1.7321)) + assert(adjacencyList(4) === (2, 3, 0.866)) + assert(adjacencyList(5) === (2, 4, 0.866)) + assert(adjacencyList(6) === (6, 7, 0.866)) + assert(adjacencyList(7) === (6, 8, 0.866)) + + // linkage matrix + val linkageMatrix = model.toLinkageMatrix() + .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3, x._4)) + assert(linkageMatrix.size === 4) + assert(linkageMatrix(0) === (0, 1, 0.866, 2)) + assert(linkageMatrix(1) === (3, 4, 0.866, 2)) + assert(linkageMatrix(2) === (5, 2, 2.5981, 3)) + assert(linkageMatrix(3) === (7, 6, 5.1962, 5)) } test("clustering sparse vectors") { @@ -99,6 +121,28 @@ class HierarchicalClusteringModelSuite // compute WSSSE assert(model.WSSSE(data) === 0.0) + + // adjacency list + val adjacencyList = model.toAdjacencyList() + .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) + assert(adjacencyList.size === 8) + assert(adjacencyList(0) === (0, 1, 1.5652)) + assert(adjacencyList(1) === (0, 6, 1.5652)) + assert(adjacencyList(2) === (1, 2, 1.3744)) + assert(adjacencyList(3) === (1, 5, 1.3744)) + assert(adjacencyList(4) === (2, 3, 0.5)) + assert(adjacencyList(5) === (2, 4, 0.5)) + assert(adjacencyList(6) === (6, 7, 2.5)) + assert(adjacencyList(7) === (6, 8, 2.5)) + + // linkage matrix + val linkageMatrix = model.toLinkageMatrix() + .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3, x._4)) + assert(linkageMatrix.size === 4) + assert(linkageMatrix(0) === (0, 1, 0.5, 2)) + assert(linkageMatrix(1) === (5, 2, 1.8744, 3)) + assert(linkageMatrix(2) === (3, 4, 2.5, 2)) + assert(linkageMatrix(3) === (6, 7, 4.0652, 5)) } test("clustering should be done correctly") { From 344d14eaaf315bc4fd2d9fa951ee0a2026a01d1d Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 20 May 2015 16:15:05 +0900 Subject: [PATCH 12/76] Add a java test file for HierarchicalClustering --- .../JavaHierarchicalClusteringSuite.java | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java new file mode 100644 index 
0000000000000..84ae01d6dde0a --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java @@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.clustering;
+
+import com.google.common.collect.Lists;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+public class JavaHierarchicalClusteringSuite implements Serializable {
+ private transient JavaSparkContext sc;
+
+ @Before
+ public void setUp() {
+ sc = new JavaSparkContext("local", "JavaHierarchicalClustering");
+ }
+
+ @After
+ public void tearDown() {
+ sc.stop();
+ sc = null;
+ }
+
+ @Test
+ public void runWithSmallData() {
+ List<Vector> points = Lists.newArrayList(
+ Vectors.dense(1.0, 2.0, 6.0),
+ Vectors.dense(1.0, 3.0, 0.0),
+ Vectors.dense(1.0, 4.0, 6.0)
+ );
+
+ Vector expectedCenter = Vectors.dense(1.0, 3.0, 4.0);
+
+ JavaRDD<Vector> data = sc.parallelize(points, 2);
+ HierarchicalClustering algo = new HierarchicalClustering().setNumClusters(1);
+ HierarchicalClusteringModel model = algo.run(data.rdd());
+ assertEquals(1, model.getCenters().length);
+ assertEquals(expectedCenter, model.getCenters()[0]);
+ }
+
+ @Test
+ public void runWithDenseVectors() {
+ int numClusters = 5;
+ List<Vector> points = Lists.newArrayList();
+ for (int i = 0; i < 99; i++) {
+ Double elm = new Double(i % numClusters);
+ Vector point = Vectors.dense(elm, elm);
+ points.add(point);
+ }
+ JavaRDD<Vector> data = sc.parallelize(points, 2);
+ HierarchicalClustering algo = new HierarchicalClustering().setNumClusters(numClusters);
+ HierarchicalClusteringModel model = algo.run(data.rdd());
+ Vector[] centers = model.getCenters();
+ assertEquals(numClusters, centers.length);
+ assertEquals(Vectors.dense(0.0, 0.0), centers[0]);
+ assertEquals(Vectors.dense(1.0, 1.0), centers[1]);
+ assertEquals(Vectors.dense(2.0, 2.0), centers[2]);
+ assertEquals(Vectors.dense(3.0, 3.0), centers[3]);
+ assertEquals(Vectors.dense(4.0, 4.0), centers[4]);
+
+ // adjacency list
+ ArrayList<ArrayList<Double>> edges = model.toJavaAdjacencyList();
+ assertEquals(8, edges.size());
+ // linkage matrix
+ ArrayList<ArrayList<Double>> matrix = model.toJavaLinkageMatrix();
+ assertEquals(4, matrix.size());
+ }
+
+ @Test
+ public void runWithSparseVectors() {
+ int numClusters = 5;
+ List<Vector> points = Lists.newArrayList();
+ for (int i = 0; i < 99; i++) {
+ int elm = i % numClusters;
+ int indexes[] = {elm};
+ double values[] = {elm};
+ Vector point = Vectors.sparse(numClusters, indexes, values);
+ points.add(point);
+ }
+ JavaRDD<Vector> data = sc.parallelize(points, 2);
+ HierarchicalClustering algo = new HierarchicalClustering().setNumClusters(numClusters);
+ HierarchicalClusteringModel model = algo.run(data.rdd());
+ Vector[] centers = model.getCenters();
+ assertEquals(numClusters, centers.length);
+ assertEquals(points.get(0), centers[0]);
+ assertEquals(points.get(1), centers[1]);
+ assertEquals(points.get(2), centers[2]);
+ assertEquals(points.get(3), centers[3]);
+ assertEquals(points.get(4), centers[4]);
+
+ // adjacency list
+ ArrayList<ArrayList<Double>> edges = model.toJavaAdjacencyList();
+ assertEquals(8, edges.size());
+ // linkage matrix
+ ArrayList<ArrayList<Double>> matrix = model.toJavaLinkageMatrix();
+ assertEquals(4, matrix.size());
+ }
+}
From e7256f540324efe36d8fa4774f9d11ca12aaac3e Mon Sep 17 00:00:00 2001
From: Yu ISHIKAWA
Date: Wed, 20 May 2015 16:36:38 +0900
Subject: [PATCH 13/76] Support save and load functions in Java
---
 .../HierarchicalClusteringModel.scala | 35 ++++++++++++++++---
 .../JavaHierarchicalClusteringSuite.java | 20 +++++++++++
 2 files changed, 51 insertions(+), 4 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala
index 9dcf84b5df381..75d967c868947 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala
@@ -17,8 +17,11 @@ package org.apache.spark.mllib.clustering
+import java.io.File
+
 import breeze.linalg.{DenseVector => BDV, Vector => BV, norm => breezeNorm}
-import org.apache.spark.api.java.JavaRDD
+import org.apache.commons.io.FilenameUtils
+import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd.RDD
@@ -35,8 +38,20 @@ class HierarchicalClusteringModel(val tree: ClusterTree)
 /** Current version of model save/load format. */
 override protected def formatVersion: String = "1.0"
- override def save(sc: SparkContext, path: String) {
- val oos = new java.io.ObjectOutputStream(new java.io.FileOutputStream(path))
+ override def save(sc: SparkContext, path: String): Unit = this.save(path)
+
+ def save(sc: JavaSparkContext, path: String): Unit = this.save(path)
+
+ private def save(path: String): Unit = {
+ val pathObj = new File(HierarchicalClusteringModel.getModelFilePath(path)).getParentFile
+ if (pathObj.exists()) {
+ throw new IllegalArgumentException("You should save your model in another directory.
" + + "the directory already exists: " + path) + } + + pathObj.mkdir(); + val modelFilePath = HierarchicalClusteringModel.getModelFilePath(path) + val oos = new java.io.ObjectOutputStream(new java.io.FileOutputStream(modelFilePath)) try { oos.writeObject(this) } finally { @@ -137,11 +152,23 @@ class HierarchicalClusteringModel(val tree: ClusterTree) object HierarchicalClusteringModel extends Loader[HierarchicalClusteringModel] { override def load(sc: SparkContext, path: String): HierarchicalClusteringModel = { - val stream = new java.io.ObjectInputStream(new java.io.FileInputStream(path)) + this.load(path) + } + + def load(sc: JavaSparkContext, path: String): HierarchicalClusteringModel = { + this.load(path) + } + + def load(path: String): HierarchicalClusteringModel = { + val modelFilePath = getModelFilePath(path) + val stream = new java.io.ObjectInputStream(new java.io.FileInputStream(modelFilePath)) try { stream.readObject().asInstanceOf[HierarchicalClusteringModel] } finally { stream.close() } } + + private[clustering] + def getModelFilePath(path: String): String = FilenameUtils.concat(path, "model") } diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java index 84ae01d6dde0a..54f532e9e5eec 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java @@ -18,6 +18,7 @@ package org.apache.spark.mllib.clustering; import com.google.common.collect.Lists; +import jodd.io.FileUtil; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Vector; @@ -26,7 +27,11 @@ import org.junit.Before; import org.junit.Test; +import java.io.File; +import java.io.IOException; import java.io.Serializable; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; @@ -61,6 +66,21 @@ public void runWithSmallData() { HierarchicalClusteringModel model = algo.run(data.rdd()); assertEquals(1, model.getCenters().length); assertEquals(expectedCenter, model.getCenters()[0]); + + // save & load + try { + String tempDir = System.getProperty("java.io.tmpdir"); + Path pathObj = Paths.get(tempDir, this.getClass().getSimpleName()); + String path = pathObj.toAbsolutePath().toString(); + + model.save(sc, pathObj.toAbsolutePath().toString()); + HierarchicalClusteringModel savedModel = HierarchicalClusteringModel.load(sc, path); + assertEquals(1, savedModel.getCenters().length); + assertEquals(expectedCenter, savedModel.getCenters()[0]); + FileUtil.delete(new File(path)); + } catch (IOException e) { + e.printStackTrace(); + } } @Test From e2947959d7a4b724c9c4a9e7c17ef4be4f258f25 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 21 May 2015 13:20:07 +0900 Subject: [PATCH 14/76] Change the specification of HierarchicalClusteringModel.save() --- .../clustering/HierarchicalClusteringModel.scala | 7 ++----- .../JavaHierarchicalClusteringSuite.java | 7 +++++-- .../HierarchicalClusteringModelSuite.scala | 16 +++++++++++++--- .../clustering/HierarchicalClusteringSuite.scala | 6 +----- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index 
75d967c868947..eb7271df4c2dc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -44,12 +44,9 @@ class HierarchicalClusteringModel(val tree: ClusterTree) private def save(path: String): Unit = { val pathObj = new File(HierarchicalClusteringModel.getModelFilePath(path)).getParentFile - if (pathObj.exists()) { - throw new IllegalArgumentException("You should save your model in another directory. " + - "the directory already exists: " + path) + if (! pathObj.exists()) { + pathObj.mkdirs(); } - - pathObj.mkdir(); val modelFilePath = HierarchicalClusteringModel.getModelFilePath(path) val oos = new java.io.ObjectOutputStream(new java.io.FileOutputStream(modelFilePath)) try { diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java index 54f532e9e5eec..16d77570ce188 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java @@ -69,14 +69,17 @@ public void runWithSmallData() { // save & load try { + // create a temporary directory String tempDir = System.getProperty("java.io.tmpdir"); - Path pathObj = Paths.get(tempDir, this.getClass().getSimpleName()); + Path pathObj = Paths.get(tempDir, String.valueOf(this.hashCode())); String path = pathObj.toAbsolutePath().toString(); - model.save(sc, pathObj.toAbsolutePath().toString()); + model.save(sc, path); HierarchicalClusteringModel savedModel = HierarchicalClusteringModel.load(sc, path); assertEquals(1, savedModel.getCenters().length); assertEquals(expectedCenter, savedModel.getCenters()[0]); + + // delete the temporary directory FileUtil.delete(new File(path)); } catch (IOException e) { e.printStackTrace(); diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index d374ec956562a..e8341e4ba58f8 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -17,10 +17,13 @@ package org.apache.spark.mllib.clustering +import org.apache.commons.io.FilenameUtils import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.scalatest.{BeforeAndAfterEach, FunSuite} +import scala.reflect.io.Path + class HierarchicalClusteringModelSuite extends FunSuite with MLlibTestSparkContext with BeforeAndAfterEach { @@ -179,13 +182,20 @@ class HierarchicalClusteringModelSuite val data = sc.parallelize(localData.map(_._2)) val model = app.run(data) - val tmpFile = java.io.File.createTempFile("hierarchical-clustering", "save-load") - model.save(sc, tmpFile.getAbsolutePath) + // create a temporary directory for the test + val tmpBaseDir = System.getProperty("java.io.tmpdir") + val tmpDir = this.getClass.getSimpleName + this.hashCode().toString + val tmpPath = FilenameUtils.concat(tmpBaseDir, tmpDir) - val sameModel = HierarchicalClusteringModel.load(sc, tmpFile.getAbsolutePath) + model.save(sc, tmpPath) + val sameModel = HierarchicalClusteringModel.load(sc, tmpPath) 
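+ // the reloaded model should belong to the same class and yield the same predictions as the original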
assert(sameModel.getClass.getSimpleName.toString === "HierarchicalClusteringModel") localData.foreach { case (label, vector) => assert(model.predict(vector) === sameModel.predict(vector)) } + + // delete the temporary directory + val path = Path(tmpPath) + path.deleteRecursively() } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 9f0b18e6dfa58..606752c6d4201 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.mllib.util.TestingUtils._ import org.scalatest.FunSuite -class HierarchicalClusteringAppSuite extends FunSuite with MLlibTestSparkContext { +class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { test("the root index is equal to 1") { assert(HierarchicalClustering.ROOT_INDEX_KEY === 1) @@ -44,10 +44,6 @@ class HierarchicalClusteringAppSuite extends FunSuite with MLlibTestSparkContext assert(closestIndex === i) } } -} - - -class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { test("run") { val algo = new HierarchicalClustering().setNumClusters(123) From 1c66e09558d5d78af27a718a55ff54c8822de8bf Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 27 Apr 2015 16:24:41 +0900 Subject: [PATCH 15/76] Format code and modify the comments --- .../clustering/HierarchicalClustering.scala | 175 +++++++++--------- 1 file changed, 89 insertions(+), 86 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index dc06f1891e64b..36d36956443c2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -198,7 +198,7 @@ class HierarchicalClustering private ( } /** - * Summarizes data by each cluster as ClusterTree2 classes + * Summarizes data by each cluster as ClusterTree classes */ private[clustering] def summarizeAsClusters(data: RDD[(Long, BV[Double])]): Map[Long, ClusterTree] = { @@ -237,24 +237,6 @@ class HierarchicalClustering private ( }.collect().toMap } - /** - * Gets the initial centers for bi-sect k-means - */ - private[clustering] - def initChildrenCenter(clusters: Map[Long, BV[Double]]): Map[Long, BV[Double]] = { - val rand = new XORShiftRandom() - rand.setSeed(this.seed) - - clusters.flatMap { case (idx, center) => - val childrenIndexes = Array(2 * idx, 2 * idx + 1) - val relativeErrorCoefficient = 0.001 - Array( - (2 * idx, center.map(elm => elm - (elm * relativeErrorCoefficient * rand.nextDouble()))), - (2 * idx + 1, center.map(elm => elm + (elm * relativeErrorCoefficient * rand.nextDouble()))) - ) - }.toMap - } - /** * Gets the new divided centers */ @@ -310,60 +292,6 @@ class HierarchicalClustering private ( }.toMap } - /** - * Builds a cluster tree from a Map of clusters - * - * @param treeMap divided clusters as a Map class - * @param rootIndex index you want to start - * @param numClusters the number of clusters you want - * @return a built cluster tree - */ - private[clustering] - def buildTree(treeMap: Map[Long, ClusterTree], - rootIndex: Long, - numClusters: Int): Option[ClusterTree] = { - - // if there is no index in the Map - 
if (!treeMap.contains(rootIndex)) return None - - // build a cluster tree if the queue is empty or until the number of leaves clusters is enough - var numLeavesClusters = 1 - val root = treeMap(rootIndex) - var leavesQueue = Map(rootIndex -> root) - while (leavesQueue.size > 0 && numLeavesClusters < numClusters) { - // pick up the cluster whose variance is the maximum in the queue - val mostScattered = leavesQueue.maxBy(_._2.variancesNorm) - val mostScatteredKey = mostScattered._1 - val mostScatteredCluster = mostScattered._2 - - // relate the most scattered cluster to its children clusters - val childrenIndexes = Array(2 * mostScatteredKey, 2 * mostScatteredKey + 1) - if (childrenIndexes.forall(i => treeMap.contains(i))) { - // insert children to the most scattered cluster - val children = childrenIndexes.map(i => treeMap(i)) - mostScatteredCluster.insert(children) - - // calculate the local dendrogram height - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val localHeight = children - .map(child => metric(child.center.toBreeze, mostScatteredCluster.center.toBreeze)).max - mostScatteredCluster.setLocalHeight(localHeight) - - // update the queue - leavesQueue = leavesQueue ++ childrenIndexes.map(i => (i -> treeMap(i))).toMap - numLeavesClusters += 1 - } - - // remove the cluster which is involved to the cluster tree - leavesQueue = leavesQueue.filterNot(_ == mostScattered) - - log.info(s"Total Leaves Clusters: ${numLeavesClusters} / ${numClusters}. " + - s"Cluster ${childrenIndexes.mkString(",")} are merged.") - } - Some(root) - } - /** * Divides the input data * @@ -438,6 +366,78 @@ class HierarchicalClustering private ( stats } + /** + * Gets the initial centers for bi-sect k-means + */ + private[clustering] + def initChildrenCenter(clusters: Map[Long, BV[Double]]): Map[Long, BV[Double]] = { + val rand = new XORShiftRandom() + rand.setSeed(this.seed) + + clusters.flatMap { case (idx, center) => + val childrenIndexes = Array(2 * idx, 2 * idx + 1) + val relativeErrorCoefficient = 0.001 + Array( + (2 * idx, center.map(elm => elm - (elm * relativeErrorCoefficient * rand.nextDouble()))), + (2 * idx + 1, center.map(elm => elm + (elm * relativeErrorCoefficient * rand.nextDouble()))) + ) + }.toMap + } + + /** + * Builds a cluster tree from a Map of clusters + * + * @param treeMap divided clusters as a Map class + * @param rootIndex index you want to start + * @param numClusters the number of clusters you want + * @return a built cluster tree + */ + private[clustering] + def buildTree(treeMap: Map[Long, ClusterTree], + rootIndex: Long, + numClusters: Int): Option[ClusterTree] = { + + // if there is no index in the Map + if (!treeMap.contains(rootIndex)) return None + + // build a cluster tree if the queue is empty or until the number of leaves clusters is enough + var numLeavesClusters = 1 + val root = treeMap(rootIndex) + var leavesQueue = Map(rootIndex -> root) + while (leavesQueue.size > 0 && numLeavesClusters < numClusters) { + // pick up the cluster whose variance is the maximum in the queue + val mostScattered = leavesQueue.maxBy(_._2.variancesNorm) + val mostScatteredKey = mostScattered._1 + val mostScatteredCluster = mostScattered._2 + + // relate the most scattered cluster to its children clusters + val childrenIndexes = Array(2 * mostScatteredKey, 2 * mostScatteredKey + 1) + if (childrenIndexes.forall(i => treeMap.contains(i))) { + // insert children to the most scattered cluster + val 
children = childrenIndexes.map(i => treeMap(i)) + mostScatteredCluster.insert(children) + + // calculate the local dendrogram height + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val localHeight = children + .map(child => metric(child.center.toBreeze, mostScatteredCluster.center.toBreeze)).max + mostScatteredCluster.setLocalHeight(localHeight) + + // update the queue + leavesQueue = leavesQueue ++ childrenIndexes.map(i => (i -> treeMap(i))).toMap + numLeavesClusters += 1 + } + + // remove the cluster which is involved to the cluster tree + leavesQueue = leavesQueue.filterNot(_ == mostScattered) + + log.info(s"Total Leaves Clusters: ${numLeavesClusters} / ${numClusters}. " + + s"Cluster ${childrenIndexes.mkString(",")} are merged.") + } + Some(root) + } + /** * Updates the indexes of clusters which is divided to its children indexes */ @@ -500,22 +500,22 @@ class ClusterTree private ( 0.0, None, Array.empty[ClusterTree]) /** - * Inserts sub nodes as its children + * Inserts a sub node as its child * - * @param children inserted sub nodes + * @param child inserted sub node */ - def insert(children: Array[ClusterTree]) { - this.children = this.children ++ children - children.foreach(child => child.parent = Some(this)) + def insert(child: ClusterTree) { + insert(Array(child)) } /** - * Inserts a sub node as its child + * Inserts sub nodes as its children * - * @param child inserted sub node + * @param children inserted sub nodes */ - def insert(child: ClusterTree) { - insert(Array(child)) + def insert(children: Array[ClusterTree]) { + this.children = this.children ++ children + children.foreach(child => child.parent = Some(this)) } /** @@ -560,7 +560,10 @@ class ClusterTree private ( def getChildren: Seq[ClusterTree] = this.children /** - * Gets the dendrogram height of the cluster at the cluster tree + * Gets the dendrogram height of the cluster at the cluster tree. + * A dendrogram height is different from a local height. + * A dendrogram height means a total height of a node in a tree. + * A local height means a maximum distance between a node and its children. 
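+ * For example, a leaf's dendrogram height is 0.0; if a node's local height is 2.0 and its tallest child subtree has a dendrogram height of 1.5, the node's dendrogram height is 2.0 + 1.5 = 3.5.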
* * @return the dendrogram height */ @@ -571,6 +574,9 @@ class ClusterTree private ( } } + private[mllib] + def setLocalHeight(height: Double) = (this.localHeight = height) + /** * Converts to a adjacency list * @@ -622,7 +628,4 @@ class ClusterTree private ( tree.toArray().filter(_.isLeaf).size) } } - - private[mllib] - def setLocalHeight(height: Double) = (this.localHeight = height) } From 59480d3f498f4a223acaa795c138e750236e4a51 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 21 May 2015 14:10:03 +0900 Subject: [PATCH 16/76] Format the code because there is a long line --- .../mllib/clustering/HierarchicalClusteringSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 606752c6d4201..82ac672747367 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -134,10 +134,10 @@ class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { test("should assign each data to new clusters") { val algo = new HierarchicalClustering val seed = Seq( - (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), (2L, Vectors.dense(2.0, 2.0)), - (2L, Vectors.dense(3.0, 3.0)), (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 5.0)), - (3L, Vectors.dense(6.0, 6.0)), (3L, Vectors.dense(7.0, 7.0)), (3L, Vectors.dense(8.0, 8.0)), - (3L, Vectors.dense(9.0, 9.0)), (3L, Vectors.dense(10.0, 10.0)), (3L, Vectors.dense(11.0, 11.0)) + (2L, Vectors.dense(0.0, 0.0)),(2L, Vectors.dense(1.0, 1.0)),(2L, Vectors.dense(2.0, 2.0)), + (2L, Vectors.dense(3.0, 3.0)),(2L, Vectors.dense(4.0, 4.0)),(2L, Vectors.dense(5.0, 5.0)), + (3L, Vectors.dense(6.0, 6.0)),(3L, Vectors.dense(7.0, 7.0)),(3L, Vectors.dense(8.0, 8.0)), + (3L, Vectors.dense(9.0, 9.0)),(3L, Vectors.dense(10.0, 10.0)),(3L, Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze)} val newClusters = Map( 4L -> new ClusterTree(Vectors.dense(1.0, 1.0), 3, Vectors.dense(1.0, 1.0)), From ec9f85f0791e325ac25d7ca403afe8f444f53a76 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Sat, 13 Jun 2015 07:32:59 +0900 Subject: [PATCH 17/76] Fix some comments for HierarchicalClustering in Scala --- .../mllib/clustering/HierarchicalClustering.scala | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 36d36956443c2..8942424cb7e24 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -48,7 +48,7 @@ object HierarchicalClustering extends Logging { } /** - * This is a divisive hierarchical clustering algorithm based on bi-sect k-means algorithm. + * This is a divisive hierarchical clustering algorithm based on bisect k-means algorithm. * * The main idea of this algorithm is based on "A comparison of document clustering techniques", * M. Steinbach, G. Karypis and V. Kumar. Workshop on Text Mining, KDD, 2000. 
@@ -139,7 +139,7 @@ class HierarchicalClustering private ( while (clusters.size < maxAllNodesInTree && noMoreDividable == false) { log.info(s"${sc.appName} starts step ${step}") - // enough to be clustered if the number of divided clusters is equal to 0 + // can be clustered if the number of divided clusters is equal to 0 val divided = getDividedClusters(data, leafClusters) if (divided.size == 0) { noMoreDividable = true @@ -367,7 +367,7 @@ class HierarchicalClustering private ( } /** - * Gets the initial centers for bi-sect k-means + * Gets the initial centers for bisect k-means */ private[clustering] def initChildrenCenter(clusters: Map[Long, BV[Double]]): Map[Long, BV[Double]] = { @@ -400,7 +400,7 @@ class HierarchicalClustering private ( // if there is no index in the Map if (!treeMap.contains(rootIndex)) return None - // build a cluster tree if the queue is empty or until the number of leaves clusters is enough + // build a cluster tree if the queue is empty or until the number of leaf clusters is enough var numLeavesClusters = 1 val root = treeMap(rootIndex) var leavesQueue = Map(rootIndex -> root) @@ -578,7 +578,7 @@ class ClusterTree private ( def setLocalHeight(height: Double) = (this.localHeight = height) /** - * Converts to a adjacency list + * Converts to an adjacency list * * @return List[(fromNodeId, toNodeId, distance)] */ @@ -601,7 +601,6 @@ class ClusterTree private ( /** * Converts to a linkage matrix * Returned data format is fit for scipy's dendrogram function - * SEE ALSO: scipy.cluster.hierarchy.dendrogram * * @return List[(node1, node2, distance, tree size)] */ From 58999db483b5d2a34af2a8172a541198eadd080e Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Sat, 13 Jun 2015 08:54:42 +0900 Subject: [PATCH 18/76] Sort `import` statements in HierarchicalClustering.scala and HierarchicalClusteringModel.scala --- .../spark/mllib/clustering/HierarchicalClustering.scala | 7 ++++--- .../mllib/clustering/HierarchicalClusteringModel.scala | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 8942424cb7e24..350080e850476 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -17,14 +17,15 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, norm => breezeNorm} +import scala.collection.{Map, mutable} + +import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm} + import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.util.random.XORShiftRandom import org.apache.spark.{Logging, SparkException} -import scala.collection.{Map, mutable} - object HierarchicalClustering extends Logging { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index eb7271df4c2dc..73a745c8d02cb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.clustering import java.io.File -import breeze.linalg.{DenseVector => BDV, 
Vector => BV, norm => breezeNorm} +import breeze.linalg.{Vector => BV, norm => breezeNorm} import org.apache.commons.io.FilenameUtils import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.mllib.linalg.Vector From a077e99b700f8aa52e82f8960bcbda5524262041 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Sat, 13 Jun 2015 10:49:32 +0900 Subject: [PATCH 19/76] Format HierarchicalClusteringSuite and HierarchicalClusteringModelSuite --- .../HierarchicalClusteringModelSuite.scala | 12 +++++++----- .../clustering/HierarchicalClusteringSuite.scala | 16 +++++++++------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index e8341e4ba58f8..be5efca958754 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -17,15 +17,17 @@ package org.apache.spark.mllib.clustering +import scala.reflect.io.Path +import org.scalatest.BeforeAndAfterEach + import org.apache.commons.io.FilenameUtils + +import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.scalatest.{BeforeAndAfterEach, FunSuite} - -import scala.reflect.io.Path class HierarchicalClusteringModelSuite - extends FunSuite with MLlibTestSparkContext with BeforeAndAfterEach { + extends SparkFunSuite with MLlibTestSparkContext with BeforeAndAfterEach { test("clustering dense vectors") { val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) @@ -191,7 +193,7 @@ class HierarchicalClusteringModelSuite val sameModel = HierarchicalClusteringModel.load(sc, tmpPath) assert(sameModel.getClass.getSimpleName.toString === "HierarchicalClusteringModel") localData.foreach { case (label, vector) => - assert(model.predict(vector) === sameModel.predict(vector)) + assert(model.predict(vector) === sameModel.predict(vector)) } // delete the temporary directory diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 82ac672747367..306d5896c297e 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -17,14 +17,15 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, norm => breezeNorm} +import breeze.linalg.{Vector => BV, norm => breezeNorm} + +import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ -import org.scalatest.FunSuite -class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { +class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkContext { test("the root index is equal to 1") { assert(HierarchicalClustering.ROOT_INDEX_KEY === 1) @@ -134,10 +135,11 @@ class HierarchicalClusteringSuite extends FunSuite with MLlibTestSparkContext { test("should assign each data to new clusters") { val algo = new HierarchicalClustering val seed = Seq( - (2L, 
Vectors.dense(0.0, 0.0)),(2L, Vectors.dense(1.0, 1.0)),(2L, Vectors.dense(2.0, 2.0)), - (2L, Vectors.dense(3.0, 3.0)),(2L, Vectors.dense(4.0, 4.0)),(2L, Vectors.dense(5.0, 5.0)), - (3L, Vectors.dense(6.0, 6.0)),(3L, Vectors.dense(7.0, 7.0)),(3L, Vectors.dense(8.0, 8.0)), - (3L, Vectors.dense(9.0, 9.0)),(3L, Vectors.dense(10.0, 10.0)),(3L, Vectors.dense(11.0, 11.0)) + (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), (2L, Vectors.dense(2.0, 2.0)), + (2L, Vectors.dense(3.0, 3.0)), (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 5.0)), + (3L, Vectors.dense(6.0, 6.0)), (3L, Vectors.dense(7.0, 7.0)), (3L, Vectors.dense(8.0, 8.0)), + (3L, Vectors.dense(9.0, 9.0)), (3L, Vectors.dense(10.0, 10.0)), + (3L, Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze)} val newClusters = Map( 4L -> new ClusterTree(Vectors.dense(1.0, 1.0), 3, Vectors.dense(1.0, 1.0)), From fa74f20c3a802387dbaded48eaff33fb4c477f12 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Fri, 19 Jun 2015 07:29:04 +0900 Subject: [PATCH 20/76] Rename ClusterTree to ClusterNode --- .../clustering/HierarchicalClustering.scala | 62 +++++++++---------- .../HierarchicalClusteringModel.scala | 28 ++++----- .../HierarchicalClusteringModelSuite.scala | 4 +- .../HierarchicalClusteringSuite.scala | 18 +++--- 4 files changed, 56 insertions(+), 56 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 350080e850476..2b2e15295cdb0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -63,7 +63,7 @@ object HierarchicalClustering extends Logging { */ class HierarchicalClustering private ( private var numClusters: Int, - private var clusterMap: Map[Long, ClusterTree], + private var clusterMap: Map[Long, ClusterNode], private var maxIterations: Int, private var maxRetries: Int, private var seed: Long) extends Logging { @@ -71,7 +71,7 @@ class HierarchicalClustering private ( /** * Constructs with the default configuration */ - def this() = this(20, mutable.ListMap.empty[Long, ClusterTree], 20, 10, 1) + def this() = this(20, mutable.ListMap.empty[Long, ClusterNode], 20, 10, 1) /** * Sets the number of clusters you want @@ -202,7 +202,7 @@ class HierarchicalClustering private ( * Summarizes data by each cluster as ClusterTree classes */ private[clustering] - def summarizeAsClusters(data: RDD[(Long, BV[Double])]): Map[Long, ClusterTree] = { + def summarizeAsClusters(data: RDD[(Long, BV[Double])]): Map[Long, ClusterNode] = { // summarize input data val stats = summarize(data) @@ -213,7 +213,7 @@ class HierarchicalClustering private ( case n if n > 1 => Vectors.fromBreeze(sumOfSquares.:*(n) - (sum :* sum) :/ (n * (n - 1.0))) case _ => Vectors.zeros(sum.size) } - (i, new ClusterTree(center, n.toLong, variances)) + (i, new ClusterNode(center, n.toLong, variances)) }.toMap } @@ -243,7 +243,7 @@ class HierarchicalClustering private ( */ private[clustering] def getDividedClusters(data: RDD[(Long, BV[Double])], - dividedClusters: Map[Long, ClusterTree]): Map[Long, ClusterTree] = { + dividedClusters: Map[Long, ClusterNode]): Map[Long, ClusterNode] = { val sc = data.sparkContext val appName = sc.appName @@ -253,7 +253,7 @@ class HierarchicalClustering private ( }.keySet if (dividableKeys.size == 0) { log.info(s"There is no dividable clusters in 
${appName}.") - return Map.empty[Long, ClusterTree] + return Map.empty[Long, ClusterNode] } // divide input data @@ -288,7 +288,7 @@ class HierarchicalClustering private ( case 1 => Vectors.sparse(sum.size, Array(), Array()) case _ => Vectors.fromBreeze(sumOfSquares.:*(n) - (sum :* sum) :/ (n * (n - 1.0))) } - val child = new ClusterTree(center, n.toLong, variances) + val child = new ClusterNode(center, n.toLong, variances) (i, child) }.toMap } @@ -302,7 +302,7 @@ class HierarchicalClustering private ( */ private[clustering] def divide(data: RDD[(Long, BV[Double])], - clusters: Map[Long, ClusterTree]): Map[Long, (BV[Double], Double, BV[Double])] = { + clusters: Map[Long, ClusterNode]): Map[Long, (BV[Double], Double, BV[Double])] = { val sc = data.sparkContext val centers = clusters.map { case (idx, cluster) => (idx, cluster.center.toBreeze)} @@ -394,9 +394,9 @@ class HierarchicalClustering private ( * @return a built cluster tree */ private[clustering] - def buildTree(treeMap: Map[Long, ClusterTree], + def buildTree(treeMap: Map[Long, ClusterNode], rootIndex: Long, - numClusters: Int): Option[ClusterTree] = { + numClusters: Int): Option[ClusterNode] = { // if there is no index in the Map if (!treeMap.contains(rootIndex)) return None @@ -445,7 +445,7 @@ class HierarchicalClustering private ( private[clustering] def updateClusterIndex( data: RDD[(Long, BV[Double])], - dividedClusters: Map[Long, ClusterTree]): RDD[(Long, BV[Double])] = { + dividedClusters: Map[Long, ClusterNode]): RDD[(Long, BV[Double])] = { // extract the centers of the clusters val sc = data.sparkContext var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} @@ -485,27 +485,27 @@ class HierarchicalClustering private ( * @param parent the parent cluster of the cluster * @param children the children nodes of the cluster */ -class ClusterTree private ( +class ClusterNode private ( val center: Vector, val records: Long, val variances: Vector, val variancesNorm: Double, private var localHeight: Double, - private var parent: Option[ClusterTree], - private var children: Seq[ClusterTree]) extends Serializable { + private var parent: Option[ClusterNode], + private var children: Seq[ClusterNode]) extends Serializable { require(!variancesNorm.isNaN) def this(center: Vector, rows: Long, variances: Vector) = this(center, rows, variances, breezeNorm(variances.toBreeze, 2.0), - 0.0, None, Array.empty[ClusterTree]) + 0.0, None, Array.empty[ClusterNode]) /** * Inserts a sub node as its child * * @param child inserted sub node */ - def insert(child: ClusterTree) { + def insert(child: ClusterNode) { insert(Array(child)) } @@ -514,7 +514,7 @@ class ClusterTree private ( * * @param children inserted sub nodes */ - def insert(children: Array[ClusterTree]) { + def insert(children: Array[ClusterNode]) { this.children = this.children ++ children children.foreach(child => child.parent = Some(this)) } @@ -525,7 +525,7 @@ class ClusterTree private ( * * @return an Array class which the cluster tree is expanded */ - def toArray(): Array[ClusterTree] = { + def toArray(): Array[ClusterNode] = { val array = this.children.size match { case 0 => Array(this) case _ => Array(this) ++ this.children.flatMap(child => child.toArray().toIterator) @@ -550,15 +550,15 @@ class ClusterTree private ( /** * Gets the leaves nodes in the cluster tree */ - def getLeavesNodes: Array[ClusterTree] = { + def getLeavesNodes: Array[ClusterNode] = { this.toArray().filter(_.isLeaf).sortBy(_.center.toArray.sum) } def isLeaf: Boolean = (this.children.size 
== 0) - def getParent: Option[ClusterTree] = this.parent + def getParent: Option[ClusterNode] = this.parent - def getChildren: Seq[ClusterTree] = this.children + def getChildren: Seq[ClusterNode] = this.children /** * Gets the dendrogram height of the cluster at the cluster tree. @@ -610,22 +610,22 @@ class ClusterTree private ( val leaves = nodes.filter(_.isLeaf) val notLeaves = nodes.filterNot(_.isLeaf).filter(_.getChildren.size > 1) val clusters = leaves ++ notLeaves - val treeMap = clusters.zipWithIndex.map { case (tree, idx) => (tree -> idx)}.toMap + val treeMap = clusters.zipWithIndex.map { case (node, idx) => (node -> idx)}.toMap // If a node only has one-child, the child is regarded as the cluster of the child. // Cluster A has cluster B and Cluster B. B is a leaf. C only has cluster D. // ==> A merge list is (B, D), not (B, C). - def getIndex(map: Map[ClusterTree, Int], tree: ClusterTree): Int = { - tree.children.size match { - case 1 => getIndex(map, tree.children(0)) - case _ => map(tree) + def getIndex(map: Map[ClusterNode, Int], node: ClusterNode): Int = { + node.children.size match { + case 1 => getIndex(map, node.children(0)) + case _ => map(node) } } - clusters.filterNot(_.isLeaf).map { tree => - (getIndex(treeMap, tree.children(0)), - getIndex(treeMap, tree.children(1)), - tree.getHeight, - tree.toArray().filter(_.isLeaf).size) + clusters.filterNot(_.isLeaf).map { node => + (getIndex(treeMap, node.children(0)), + getIndex(treeMap, node.children(1)), + node.getHeight, + node.toArray().filter(_.isLeaf).size) } } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index 73a745c8d02cb..634d82a23f940 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -30,9 +30,9 @@ import org.apache.spark.{Logging, SparkContext} /** * This class is used for the model of the hierarchical clustering * - * @param tree a cluster as a tree node + * @param node a cluster as a tree node */ -class HierarchicalClusteringModel(val tree: ClusterTree) +class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable with Saveable with Logging { /** Current version of model save/load format. */ @@ -56,7 +56,7 @@ class HierarchicalClusteringModel(val tree: ClusterTree) } } - def getClusters: Array[ClusterTree] = this.tree.getLeavesNodes + def getClusters: Array[ClusterNode] = this.node.getLeavesNodes def getCenters: Array[Vector] = this.getClusters.map(_.center) @@ -113,32 +113,32 @@ class HierarchicalClusteringModel(val tree: ClusterTree) def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) - def toAdjacencyList(): Array[(Int, Int, Double)] = this.tree.toAdjacencyList() + def toAdjacencyList(): Array[(Int, Int, Double)] = this.node.toAdjacencyList() /** Since Java doesn't support tuple, we must support the data structure for java and py4j. 
*/ def toJavaAdjacencyList(): java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { var javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]](); - this.tree.toAdjacencyList().foreach { x => + this.node.toAdjacencyList().foreach { x => val edge = new java.util.ArrayList[java.lang.Double]() - edge.add(x._1) - edge.add(x._2) - edge.add(x._3) + edge.add(x._1.toDouble) + edge.add(x._2.toDouble) + edge.add(x._3.toDouble) javaList.add(edge) } javaList } - def toLinkageMatrix(): Array[(Int, Int, Double, Int)] = this.tree.toLinkageMatrix() + def toLinkageMatrix(): Array[(Int, Int, Double, Int)] = this.node.toLinkageMatrix() /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ def toJavaLinkageMatrix(): java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() - this.tree.toLinkageMatrix().foreach {x => + this.node.toLinkageMatrix().foreach {x => val row = new java.util.ArrayList[java.lang.Double]() - row.add(x._1) - row.add(x._2) - row.add(x._3) - row.add(x._4) + row.add(x._1.toDouble) + row.add(x._2.toDouble) + row.add(x._3.toDouble) + row.add(x._4.toDouble) javaList.add(row) } javaList diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index be5efca958754..9c83f80a6a41c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -41,7 +41,7 @@ class HierarchicalClusteringModelSuite val model = app.run(data) val clusters = model.getClusters - assert(clusters.isInstanceOf[Array[ClusterTree]]) + assert(clusters.isInstanceOf[Array[ClusterNode]]) assert(clusters.size === 5) val centers = model.getCenters.sortBy(_.toArray.sum) @@ -102,7 +102,7 @@ class HierarchicalClusteringModelSuite val model = app.run(data) val clusters = model.getClusters - assert(clusters.isInstanceOf[Array[ClusterTree]]) + assert(clusters.isInstanceOf[Array[ClusterNode]]) assert(clusters.size === 5) val centers = model.getCenters.sortBy(_.toArray.sum) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 306d5896c297e..29d39ed87f42d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -52,12 +52,12 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte val data = sc.parallelize(localSeed, 2) val model = algo.run(data) assert(model.getClusters.size == 123) - assert(model.tree.getHeight ~== 702.8641 absTol 10E-4) + assert(model.node.getHeight ~== 702.8641 absTol 10E-4) // check the relations between a parent cluster and its children - assert(model.tree.getParent === None) - assert(model.tree.getChildren.apply(0).getParent.get === model.tree) - assert(model.tree.getChildren.apply(1).getParent.get === model.tree) + assert(model.node.getParent === None) + assert(model.node.getChildren.apply(0).getParent.get === model.node) + assert(model.node.getChildren.apply(1).getParent.get === model.node) assert(model.getClusters.forall(_.getParent != None)) } @@ -67,7 +67,7 @@ class 
HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte val data = sc.parallelize(localSeed, 2) val model = algo.run(data) assert(model.getClusters.size == 100) - assert(model.tree.getHeight ~== 72.12489 absTol 10E-4) + assert(model.node.getHeight ~== 72.12489 absTol 10E-4) } test("initializeData") { @@ -142,10 +142,10 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte (3L, Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze)} val newClusters = Map( - 4L -> new ClusterTree(Vectors.dense(1.0, 1.0), 3, Vectors.dense(1.0, 1.0)), - 5L -> new ClusterTree(Vectors.dense(4.0, 4.0), 3, Vectors.dense(1.0, 1.0)), - 6L -> new ClusterTree(Vectors.dense(7.0, 7.0), 3, Vectors.dense(1.0, 1.0)), - 7L -> new ClusterTree(Vectors.dense(10.0, 10.0), 3, Vectors.dense(1.0, 1.0)) + 4L -> new ClusterNode(Vectors.dense(1.0, 1.0), 3, Vectors.dense(1.0, 1.0)), + 5L -> new ClusterNode(Vectors.dense(4.0, 4.0), 3, Vectors.dense(1.0, 1.0)), + 6L -> new ClusterNode(Vectors.dense(7.0, 7.0), 3, Vectors.dense(1.0, 1.0)), + 7L -> new ClusterNode(Vectors.dense(10.0, 10.0), 3, Vectors.dense(1.0, 1.0)) ) val data = sc.parallelize(seed) val result = algo.updateClusterIndex(data, newClusters).collect().toSeq From 11439202ff555a8ed303b453872de9ccefbcf792 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Fri, 19 Jun 2015 07:43:40 +0900 Subject: [PATCH 21/76] Remove save/load from HierarchicalClusteringModel --- .../HierarchicalClusteringModel.scala | 56 +------------------ .../JavaHierarchicalClusteringSuite.java | 23 -------- .../HierarchicalClusteringModelSuite.scala | 34 +---------- 3 files changed, 4 insertions(+), 109 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index 634d82a23f940..bbd13855e2835 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -17,44 +17,18 @@ package org.apache.spark.mllib.clustering -import java.io.File - import breeze.linalg.{Vector => BV, norm => breezeNorm} -import org.apache.commons.io.FilenameUtils -import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} +import org.apache.spark.Logging +import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD -import org.apache.spark.{Logging, SparkContext} /** * This class is used for the model of the hierarchical clustering * * @param node a cluster as a tree node */ -class HierarchicalClusteringModel(val node: ClusterNode) - extends Serializable with Saveable with Logging { - - /** Current version of model save/load format. */ - override protected def formatVersion: String = "1.0" - - override def save(sc: SparkContext, path: String): Unit = this.save(path) - - def save(sc: JavaSparkContext, path: String): Unit = this.save(path) - - private def save(path: String): Unit = { - val pathObj = new File(HierarchicalClusteringModel.getModelFilePath(path)).getParentFile - if (! 
pathObj.exists()) { - pathObj.mkdirs(); - } - val modelFilePath = HierarchicalClusteringModel.getModelFilePath(path) - val oos = new java.io.ObjectOutputStream(new java.io.FileOutputStream(modelFilePath)) - try { - oos.writeObject(this) - } finally { - oos.close() - } - } +class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable with Logging { def getClusters: Array[ClusterNode] = this.node.getLeavesNodes @@ -145,27 +119,3 @@ class HierarchicalClusteringModel(val node: ClusterNode) } } - -object HierarchicalClusteringModel extends Loader[HierarchicalClusteringModel] { - - override def load(sc: SparkContext, path: String): HierarchicalClusteringModel = { - this.load(path) - } - - def load(sc: JavaSparkContext, path: String): HierarchicalClusteringModel = { - this.load(path) - } - - def load(path: String): HierarchicalClusteringModel = { - val modelFilePath = getModelFilePath(path) - val stream = new java.io.ObjectInputStream(new java.io.FileInputStream(modelFilePath)) - try { - stream.readObject().asInstanceOf[HierarchicalClusteringModel] - } finally { - stream.close() - } - } - - private[clustering] - def getModelFilePath(path: String): String = FilenameUtils.concat(path, "model") -} diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java index 16d77570ce188..84ae01d6dde0a 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java @@ -18,7 +18,6 @@ package org.apache.spark.mllib.clustering; import com.google.common.collect.Lists; -import jodd.io.FileUtil; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Vector; @@ -27,11 +26,7 @@ import org.junit.Before; import org.junit.Test; -import java.io.File; -import java.io.IOException; import java.io.Serializable; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; @@ -66,24 +61,6 @@ public void runWithSmallData() { HierarchicalClusteringModel model = algo.run(data.rdd()); assertEquals(1, model.getCenters().length); assertEquals(expectedCenter, model.getCenters()[0]); - - // save & load - try { - // create a temporary directory - String tempDir = System.getProperty("java.io.tmpdir"); - Path pathObj = Paths.get(tempDir, String.valueOf(this.hashCode())); - String path = pathObj.toAbsolutePath().toString(); - - model.save(sc, path); - HierarchicalClusteringModel savedModel = HierarchicalClusteringModel.load(sc, path); - assertEquals(1, savedModel.getCenters().length); - assertEquals(expectedCenter, savedModel.getCenters()[0]); - - // delete the temporary directory - FileUtil.delete(new File(path)); - } catch (IOException e) { - e.printStackTrace(); - } } @Test diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index 9c83f80a6a41c..1335a0b8c6e3d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -17,14 +17,10 @@ package org.apache.spark.mllib.clustering -import scala.reflect.io.Path -import 
org.scalatest.BeforeAndAfterEach - -import org.apache.commons.io.FilenameUtils - import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.scalatest.BeforeAndAfterEach class HierarchicalClusteringModelSuite extends SparkFunSuite with MLlibTestSparkContext with BeforeAndAfterEach { @@ -172,32 +168,4 @@ class HierarchicalClusteringModelSuite assert(sparseModel.getClusters.forall(_.variancesNorm == 0.0)) } } - - test("save a model, and then load the model") { - val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) - - val localData = (1 to 100).toSeq.map { i => - val label = i % 5 - val vector = Vectors.dense(label, label, label) - (label, vector) - } - val data = sc.parallelize(localData.map(_._2)) - val model = app.run(data) - - // create a temporary directory for the test - val tmpBaseDir = System.getProperty("java.io.tmpdir") - val tmpDir = this.getClass.getSimpleName + this.hashCode().toString - val tmpPath = FilenameUtils.concat(tmpBaseDir, tmpDir) - - model.save(sc, tmpPath) - val sameModel = HierarchicalClusteringModel.load(sc, tmpPath) - assert(sameModel.getClass.getSimpleName.toString === "HierarchicalClusteringModel") - localData.foreach { case (label, vector) => - assert(model.predict(vector) === sameModel.predict(vector)) - } - - // delete the temporary directory - val path = Path(tmpPath) - path.deleteRecursively() - } } From 16cc823690a0b7d084785e56bdc76d9bb73474bc Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Fri, 19 Jun 2015 08:22:38 +0900 Subject: [PATCH 22/76] Fix some mislenious code pointed out by IntelliJ --- .../clustering/HierarchicalClustering.scala | 58 ++++++++++--------- .../HierarchicalClusteringModel.scala | 16 ++--- .../JavaHierarchicalClusteringSuite.java | 2 +- .../HierarchicalClusteringModelSuite.scala | 28 ++++----- .../HierarchicalClusteringSuite.scala | 7 +-- 5 files changed, 56 insertions(+), 55 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala index 2b2e15295cdb0..f7249272b0c3d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala @@ -40,7 +40,7 @@ object HierarchicalClustering extends Logging { * @return an index of the array of clusters */ private[mllib] - def findClosestCenter(metric: Function2[BV[Double], BV[Double], Double]) + def findClosestCenter(metric: (BV[Double], BV[Double]) => Double) (centers: Seq[BV[Double]])(point: BV[Double]): Int = { val (closestCenter, closestIndex) = centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1) @@ -152,7 +152,7 @@ class HierarchicalClustering private ( data = newData // keep recent 2 cached RDDs in order to run more quickly - if (rddArray.size > 1) { + if (rddArray.length > 1) { val head = rddArray.head head.unpersist() rddArray = rddArray.filterNot(_.hashCode() == head.hashCode()) @@ -184,8 +184,8 @@ class HierarchicalClustering private ( // make a hierarchical clustering model val model = new HierarchicalClusteringModel(root.get) val leavesNodes = model.getClusters - if (leavesNodes.size < this.numClusters) { - log.warn(s"# clusters is less than you have expected: ${leavesNodes.size} / ${numClusters}. 
") + if (leavesNodes.length < this.numClusters) { + log.warn(s"# clusters is less than you want: ${leavesNodes.length} / ${numClusters}") } model } @@ -195,7 +195,7 @@ class HierarchicalClustering private ( */ private[clustering] def initData(data: RDD[Vector]): RDD[(Long, BV[Double])] = { - data.map { v: Vector => (HierarchicalClustering.ROOT_INDEX_KEY, v.toBreeze)}.cache + data.map { v: Vector => (HierarchicalClustering.ROOT_INDEX_KEY, v.toBreeze)} } /** @@ -227,8 +227,8 @@ class HierarchicalClustering private ( val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] iter.foreach { case (idx: Long, point: BV[Double]) => // get a map value or else get a sparse vector - val (sumBV, n, sumOfSquares) = map.get(idx) - .getOrElse(BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + val (sumBV, n, sumOfSquares) = map + .getOrElse(idx, (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size))) map(idx) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) } map.toIterator @@ -258,16 +258,15 @@ class HierarchicalClustering private ( // divide input data var dividableData = data.filter { case (idx, point) => dividableKeys.contains(idx)} - var dividableClusters = dividedClusters.filter { case (k, v) => dividableKeys.contains(k)} + val dividableClusters = dividedClusters.filter { case (k, v) => dividableKeys.contains(k)} val idealIndexes = dividableKeys.flatMap(idx => Array(2 * idx, 2 * idx + 1).toIterator) var stats = divide(data, dividableClusters) - // if there is clusters which is failed to be divided, - // retry to divide only failed clusters again and again + // if there are clusters which failed to be divided, retry to split the failed clusters var tryTimes = 1 while (stats.size < dividableKeys.size * 2 && tryTimes <= this.maxRetries) { // get the indexes of clusters which is failed to be divided - val failedIndexes = idealIndexes.filterNot(stats.keySet.contains).map(idx => (idx / 2).toLong) + val failedIndexes = idealIndexes.filterNot(stats.keySet.contains).map(idx => idx / 2) val failedCenters = dividedClusters.filter { case (idx, clstr) => failedIndexes.contains(idx)} log.info(s"# failed clusters is ${failedCenters.size} of ${dividableKeys.size}" + s"at ${tryTimes} times in ${appName}") @@ -332,15 +331,18 @@ class HierarchicalClustering private ( iter.foreach { case (idx, point) => // calculate next index number val childrenCenters = Array(2 * idx, 2 * idx + 1) - .filter(bcNewCenters.value.keySet.contains(_)).map(bcNewCenters.value(_)).toArray - if (childrenCenters.size >= 1) { + .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) + if (childrenCenters.length >= 1) { val closestIndex = HierarchicalClustering.findClosestCenter(bcMetric.value)(childrenCenters)(point) val nextIndex = 2 * idx + closestIndex // get a map value or else get a sparse vector - val (sumBV, n, sumOfSquares) = map.get(nextIndex) - .getOrElse(BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + val (sumBV, n, sumOfSquares) = map + .getOrElse( + nextIndex, + (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + ) map(nextIndex) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) } } @@ -426,7 +428,7 @@ class HierarchicalClustering private ( mostScatteredCluster.setLocalHeight(localHeight) // update the queue - leavesQueue = leavesQueue ++ childrenIndexes.map(i => (i -> treeMap(i))).toMap + leavesQueue = leavesQueue ++ childrenIndexes.map(i => i -> treeMap(i)).toMap numLeavesClusters += 1 } @@ -457,8 +459,8 @@ class 
HierarchicalClustering private ( // update the indexes to their children indexes data.map { case (idx, point) => - val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(bcCenters.value.keySet.contains(_)) - childrenIndexes.size match { + val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(c => bcCenters.value.contains(c)) + childrenIndexes.length match { // stay the index if the number of children is not enough case s if s < 2 => (idx, point) // update the indexes @@ -525,10 +527,10 @@ class ClusterNode private ( * * @return an Array class which the cluster tree is expanded */ - def toArray(): Array[ClusterNode] = { + def toArray: Array[ClusterNode] = { val array = this.children.size match { case 0 => Array(this) - case _ => Array(this) ++ this.children.flatMap(child => child.toArray().toIterator) + case _ => Array(this) ++ this.children.flatMap(child => child.toArray.toIterator) } array.sortWith { case (a, b) => a.getDepth < b.getDepth && a.variances.toArray.sum < b.variances.toArray.sum @@ -551,7 +553,7 @@ class ClusterNode private ( * Gets the leaves nodes in the cluster tree */ def getLeavesNodes: Array[ClusterNode] = { - this.toArray().filter(_.isLeaf).sortBy(_.center.toArray.sum) + this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) } def isLeaf: Boolean = (this.children.size == 0) @@ -583,8 +585,8 @@ class ClusterNode private ( * * @return List[(fromNodeId, toNodeId, distance)] */ - def toAdjacencyList(): Array[(Int, Int, Double)] = { - val nodes = toArray() + def toAdjacencyList: Array[(Int, Int, Double)] = { + val nodes = toArray var adjacencyList = Array.empty[(Int, Int, Double)] nodes.foreach { parent => @@ -605,8 +607,8 @@ class ClusterNode private ( * * @return List[(node1, node2, distance, tree size)] */ - def toLinkageMatrix(): Array[(Int, Int, Double, Int)] = { - val nodes = toArray().sortWith { case (a, b) => a.getHeight < b.getHeight} + def toLinkageMatrix: Array[(Int, Int, Double, Int)] = { + val nodes = toArray.sortWith { case (a, b) => a.getHeight < b.getHeight} val leaves = nodes.filter(_.isLeaf) val notLeaves = nodes.filterNot(_.isLeaf).filter(_.getChildren.size > 1) val clusters = leaves ++ notLeaves @@ -617,15 +619,15 @@ class ClusterNode private ( // ==> A merge list is (B, D), not (B, C). 
def getIndex(map: Map[ClusterNode, Int], node: ClusterNode): Int = { node.children.size match { - case 1 => getIndex(map, node.children(0)) + case 1 => getIndex(map, node.children.head) case _ => map(node) } } clusters.filterNot(_.isLeaf).map { node => - (getIndex(treeMap, node.children(0)), + (getIndex(treeMap, node.children.head), getIndex(treeMap, node.children(1)), node.getHeight, - node.toArray().filter(_.isLeaf).size) + node.toArray.filter(_.isLeaf).length) } } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala index bbd13855e2835..7a38a7994fb0c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala @@ -69,7 +69,7 @@ class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable wi predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] /** - * Computes Within Set Sum of Squeared Error(WSSSE) + * Computes Within Set Sum of Squared Error(WSSSE) */ def WSSSE(data: RDD[Vector]): Double = { val bvCenters = this.getCenters.map(_.toBreeze) @@ -87,12 +87,12 @@ class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable wi def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) - def toAdjacencyList(): Array[(Int, Int, Double)] = this.node.toAdjacencyList() + def toAdjacencyList: Array[(Int, Int, Double)] = this.node.toAdjacencyList /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ - def toJavaAdjacencyList(): java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { - var javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]](); - this.node.toAdjacencyList().foreach { x => + def toJavaAdjacencyList: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { + val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() + this.node.toAdjacencyList.foreach { x => val edge = new java.util.ArrayList[java.lang.Double]() edge.add(x._1.toDouble) edge.add(x._2.toDouble) @@ -102,12 +102,12 @@ class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable wi javaList } - def toLinkageMatrix(): Array[(Int, Int, Double, Int)] = this.node.toLinkageMatrix() + def toLinkageMatrix: Array[(Int, Int, Double, Int)] = this.node.toLinkageMatrix /** Since Java doesn't support tuple, we must support the data structure for java and py4j. 
*/ - def toJavaLinkageMatrix(): java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { + def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() - this.node.toLinkageMatrix().foreach {x => + this.node.toLinkageMatrix.foreach {x => val row = new java.util.ArrayList[java.lang.Double]() row.add(x._1.toDouble) row.add(x._2.toDouble) diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java index 84ae01d6dde0a..7132658f5a3e7 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java @@ -68,7 +68,7 @@ public void runWithDenseVectors() { int numClusters = 5; List points = Lists.newArrayList(); for (int i = 0; i < 99; i++) { - Double elm = new Double(i % numClusters); + Double elm = (double)(i % numClusters); Vector point = Vectors.dense(elm, elm); points.add(point); } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala index 1335a0b8c6e3d..8d8715c77db92 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala @@ -38,10 +38,10 @@ class HierarchicalClusteringModelSuite val clusters = model.getClusters assert(clusters.isInstanceOf[Array[ClusterNode]]) - assert(clusters.size === 5) + assert(clusters.length === 5) val centers = model.getCenters.sortBy(_.toArray.sum) - assert(centers.size === 5) + assert(centers.length === 5) assert(centers(0) === Vectors.dense(0.0, 0.0, 0.0)) assert(centers(1) === Vectors.dense(1.0, 1.0, 1.0)) assert(centers(2) === Vectors.dense(2.0, 2.0, 2.0)) @@ -64,9 +64,9 @@ class HierarchicalClusteringModelSuite assert(model.WSSSE(data) === 0.0) // adjacency list - val adjacencyList = model.toAdjacencyList() + val adjacencyList = model.toAdjacencyList .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) - assert(adjacencyList.size === 8) + assert(adjacencyList.length === 8) assert(adjacencyList(0) === (0, 1, 2.5981)) assert(adjacencyList(1) === (0, 6, 2.5981)) assert(adjacencyList(2) === (1, 2, 1.7321)) @@ -77,9 +77,9 @@ class HierarchicalClusteringModelSuite assert(adjacencyList(7) === (6, 8, 0.866)) // linkage matrix - val linkageMatrix = model.toLinkageMatrix() + val linkageMatrix = model.toLinkageMatrix .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3, x._4)) - assert(linkageMatrix.size === 4) + assert(linkageMatrix.length === 4) assert(linkageMatrix(0) === (0, 1, 0.866, 2)) assert(linkageMatrix(1) === (3, 4, 0.866, 2)) assert(linkageMatrix(2) === (5, 2, 2.5981, 3)) @@ -99,10 +99,10 @@ class HierarchicalClusteringModelSuite val clusters = model.getClusters assert(clusters.isInstanceOf[Array[ClusterNode]]) - assert(clusters.size === 5) + assert(clusters.length === 5) val centers = model.getCenters.sortBy(_.toArray.sum) - assert(centers.size === 5) + assert(centers.length === 5) assert(centers(0) === Vectors.sparse(5, Array(), Array())) assert(centers(1) === Vectors.sparse(5, Array(1), Array(1.0))) assert(centers(2) === Vectors.sparse(5, Array(2), Array(2.0))) @@ -124,9 +124,9 @@ 
class HierarchicalClusteringModelSuite assert(model.WSSSE(data) === 0.0) // adjacency list - val adjacencyList = model.toAdjacencyList() + val adjacencyList = model.toAdjacencyList .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) - assert(adjacencyList.size === 8) + assert(adjacencyList.length === 8) assert(adjacencyList(0) === (0, 1, 1.5652)) assert(adjacencyList(1) === (0, 6, 1.5652)) assert(adjacencyList(2) === (1, 2, 1.3744)) @@ -137,9 +137,9 @@ class HierarchicalClusteringModelSuite assert(adjacencyList(7) === (6, 8, 2.5)) // linkage matrix - val linkageMatrix = model.toLinkageMatrix() + val linkageMatrix = model.toLinkageMatrix .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3, x._4)) - assert(linkageMatrix.size === 4) + assert(linkageMatrix.length === 4) assert(linkageMatrix(0) === (0, 1, 0.5, 2)) assert(linkageMatrix(1) === (5, 2, 1.8744, 3)) assert(linkageMatrix(2) === (3, 4, 2.5, 2)) @@ -158,13 +158,13 @@ class HierarchicalClusteringModelSuite // dense version val denseData = sc.parallelize(localData.map(_._2), 2) val denseModel = app.run(denseData) - assert(denseModel.getCenters.size === numClusters) + assert(denseModel.getCenters.length === numClusters) assert(denseModel.getClusters.forall(_.variancesNorm == 0.0)) // sparse version val sparseData = sc.parallelize(localData.map(_._3), 2) val sparseModel = app.run(sparseData) - assert(sparseModel.getCenters.size === numClusters) + assert(sparseModel.getCenters.length === numClusters) assert(sparseModel.getClusters.forall(_.variancesNorm == 0.0)) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala index 29d39ed87f42d..bd4c94c05b1bc 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala @@ -51,12 +51,12 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) - assert(model.getClusters.size == 123) + assert(model.getClusters.length == 123) assert(model.node.getHeight ~== 702.8641 absTol 10E-4) // check the relations between a parent cluster and its children assert(model.node.getParent === None) - assert(model.node.getChildren.apply(0).getParent.get === model.node) + assert(model.node.getChildren.head.getParent.get === model.node) assert(model.node.getChildren.apply(1).getParent.get === model.node) assert(model.getClusters.forall(_.getParent != None)) } @@ -66,7 +66,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) - assert(model.getClusters.size == 100) + assert(model.getClusters.length == 100) assert(model.node.getHeight ~== 72.12489 absTol 10E-4) } @@ -85,7 +85,6 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte val data = algo.initData(seed) val clusters = algo.summarizeAsClusters(data) - val center = clusters(1).center assert(clusters.size === 1) assert(clusters(1).center === Vectors.dense(49.5, 49.5)) assert(clusters(1).records === 100) From c02134e9cc26337c077cefa421a76d72ac9690ba Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA 
Date: Fri, 3 Jul 2015 11:56:02 +0900 Subject: [PATCH 23/76] Remove python API. We will implement it at another issue. --- .../mllib/api/python/PythonMLLibAPI.scala | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 49a3420c26945..21e55938fa7aa 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -401,29 +401,6 @@ private[python] class PythonMLLibAPI extends Serializable { } } - /** - * Java stub for Python mllib HierarchicalClustering.run() - */ - def trainHierarchicalClusteringModel( - data: JavaRDD[Vector], - k: Int, - maxIterations: Int, - maxRetries: Int, - seed: java.lang.Long): HierarchicalClusteringModel = { - val algo = new HierarchicalClustering() - .setNumClusters(k) - .setMaxIterations(maxIterations) - .setMaxRetries(maxRetries) - - if (seed != null) algo.setSeed(seed) - - try { - algo.run(data) - } finally { - data.rdd.unpersist(blocking = false) - } - } - /** * Java stub for Python mllib GaussianMixtureModel.predictSoft() */ From def81e202125de52b191e05e3df60c80c647d89e Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Fri, 3 Jul 2015 16:21:39 +0900 Subject: [PATCH 24/76] Rename HierarchicalClustering to BisectingKMeans --- ...Clustering.scala => BisectingKMeans.scala} | 28 +++++++++---------- ...Model.scala => BisectingKMeansModel.scala} | 10 +++---- ...ite.java => JavaBisectingKMeansSuite.java} | 16 +++++------ ....scala => BisectingKMeansModelSuite.scala} | 8 +++--- ...Suite.scala => BisectingKMeansSuite.scala} | 28 +++++++++---------- 5 files changed, 45 insertions(+), 45 deletions(-) rename mllib/src/main/scala/org/apache/spark/mllib/clustering/{HierarchicalClustering.scala => BisectingKMeans.scala} (96%) rename mllib/src/main/scala/org/apache/spark/mllib/clustering/{HierarchicalClusteringModel.scala => BisectingKMeansModel.scala} (90%) rename mllib/src/test/java/org/apache/spark/mllib/clustering/{JavaHierarchicalClusteringSuite.java => JavaBisectingKMeansSuite.java} (86%) rename mllib/src/test/scala/org/apache/spark/mllib/clustering/{HierarchicalClusteringModelSuite.scala => BisectingKMeansModelSuite.scala} (96%) rename mllib/src/test/scala/org/apache/spark/mllib/clustering/{HierarchicalClusteringSuite.scala => BisectingKMeansSuite.scala} (90%) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala similarity index 96% rename from mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala rename to mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index f7249272b0c3d..20f87c12cac8e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -27,7 +27,7 @@ import org.apache.spark.util.random.XORShiftRandom import org.apache.spark.{Logging, SparkException} -object HierarchicalClustering extends Logging { +object BisectingKMeans extends Logging { private[clustering] val ROOT_INDEX_KEY: Long = 1 @@ -49,7 +49,7 @@ object HierarchicalClustering extends Logging { } /** - * This is a divisive hierarchical clustering algorithm based on bisect k-means algorithm. 
+ * This is a divisive hierarchical clustering algorithm based on bisecting k-means algorithm. * * The main idea of this algorithm is based on "A comparison of document clustering techniques", * M. Steinbach, G. Karypis and V. Kumar. Workshop on Text Mining, KDD, 2000. @@ -61,7 +61,7 @@ object HierarchicalClustering extends Logging { * @param maxRetries the number of maximum retries * @param seed a random seed */ -class HierarchicalClustering private ( +class BisectingKMeans private ( private var numClusters: Int, private var clusterMap: Map[Long, ClusterNode], private var maxIterations: Int, @@ -114,13 +114,13 @@ class HierarchicalClustering private ( def getSeed: Long = this.seed /** - * Runs the hierarchical clustering algorithm + * Runs the bisecting kmeans algorithm * @param input RDD of vectors - * @return model for the hierarchical clustering + * @return model for the bisecting kmeans */ - def run(input: RDD[Vector]): HierarchicalClusteringModel = { + def run(input: RDD[Vector]): BisectingKMeansModel = { val sc = input.sparkContext - log.info(s"${sc.appName} starts a hierarchical clustering algorithm") + log.info(s"${sc.appName} starts a bisecting kmeans algorithm") var data = initData(input).cache() val startTime = System.currentTimeMillis() @@ -172,17 +172,17 @@ class HierarchicalClustering private ( // build a cluster tree by Map class which is expressed log.info(s"Building the cluster tree is started in ${sc.appName}") - val root = buildTree(clusters, HierarchicalClustering.ROOT_INDEX_KEY, this.numClusters) + val root = buildTree(clusters, BisectingKMeans.ROOT_INDEX_KEY, this.numClusters) if (root == None) { new SparkException("Failed to build a cluster tree from a Map type of clusters") } // set the elapsed time for training val finishTime = (System.currentTimeMillis() - startTime) / 1000.0 - log.info(s"Elapsed Time for Hierarchical Clustering Training: ${finishTime} [sec]") + log.info(s"Elapsed Time for ${this.getClass.getSimpleName} Training: ${finishTime} [sec]") - // make a hierarchical clustering model - val model = new HierarchicalClusteringModel(root.get) + // make a bisecting kmeans model + val model = new BisectingKMeansModel(root.get) val leavesNodes = model.getClusters if (leavesNodes.length < this.numClusters) { log.warn(s"# clusters is less than you want: ${leavesNodes.length} / ${numClusters}") @@ -195,7 +195,7 @@ class HierarchicalClustering private ( */ private[clustering] def initData(data: RDD[Vector]): RDD[(Long, BV[Double])] = { - data.map { v: Vector => (HierarchicalClustering.ROOT_INDEX_KEY, v.toBreeze)} + data.map { v: Vector => (BisectingKMeans.ROOT_INDEX_KEY, v.toBreeze)} } /** @@ -334,7 +334,7 @@ class HierarchicalClustering private ( .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) if (childrenCenters.length >= 1) { val closestIndex = - HierarchicalClustering.findClosestCenter(bcMetric.value)(childrenCenters)(point) + BisectingKMeans.findClosestCenter(bcMetric.value)(childrenCenters)(point) val nextIndex = 2 * idx + closestIndex // get a map value or else get a sparse vector @@ -466,7 +466,7 @@ class HierarchicalClustering private ( // update the indexes case _ => { val nextCenters = childrenIndexes.map(bcCenters.value(_)).map(_.toBreeze) - val closestIndex = HierarchicalClustering + val closestIndex = BisectingKMeans .findClosestCenter(bcMetric.value)(nextCenters)(point) val nextIndex = 2 * idx + closestIndex (nextIndex, point) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala similarity index 90% rename from mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala rename to mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 7a38a7994fb0c..2c257caced02e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -24,11 +24,11 @@ import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD /** - * This class is used for the model of the hierarchical clustering + * This class is used for the model of the bisecting kmeans * * @param node a cluster as a tree node */ -class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable with Logging { +class BisectingKMeansModel(val node: ClusterNode) extends Serializable with Logging { def getClusters: Array[ClusterNode] = this.node.getLeavesNodes @@ -42,7 +42,7 @@ class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable wi val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val centers = this.getCenters.map(_.toBreeze) - HierarchicalClustering.findClosestCenter(metric)(centers)(vector.toBreeze) + BisectingKMeans.findClosestCenter(metric)(centers)(vector.toBreeze) } /** @@ -58,7 +58,7 @@ class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable wi sc.broadcast(centers) data.map{point => - HierarchicalClustering.findClosestCenter(metric)(centers)(point.toBreeze) + BisectingKMeans.findClosestCenter(metric)(centers)(point.toBreeze) } } @@ -77,7 +77,7 @@ class HierarchicalClusteringModel(val node: ClusterNode) extends Serializable wi val distances = data.map {point => val bvPoint = point.toBreeze val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val idx = HierarchicalClustering.findClosestCenter(metric)(bvCenters)(bvPoint) + val idx = BisectingKMeans.findClosestCenter(metric)(bvCenters)(bvPoint) val closestCenter = bvCenters(idx) val distance = metric(bvPoint, closestCenter) distance diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java similarity index 86% rename from mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java rename to mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java index 7132658f5a3e7..75daf4c26f93b 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaHierarchicalClusteringSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java @@ -32,12 +32,12 @@ import static org.junit.Assert.assertEquals; -public class JavaHierarchicalClusteringSuite implements Serializable { +public class JavaBisectingKMeansSuite implements Serializable { private transient JavaSparkContext sc; @Before public void setUp() { - sc = new JavaSparkContext("local", "JavaHierarchicalClustering"); + sc = new JavaSparkContext("local", this.getClass().getSimpleName()); } @After @@ -57,8 +57,8 @@ public void runWithSmallData() { Vector expectedCenter = Vectors.dense(1.0, 3.0, 4.0); JavaRDD data = sc.parallelize(points, 2); - HierarchicalClustering algo = new HierarchicalClustering().setNumClusters(1); - HierarchicalClusteringModel model = algo.run(data.rdd()); + BisectingKMeans 
algo = new BisectingKMeans().setNumClusters(1); + BisectingKMeansModel model = algo.run(data.rdd()); assertEquals(1, model.getCenters().length); assertEquals(expectedCenter, model.getCenters()[0]); } @@ -73,8 +73,8 @@ public void runWithDenseVectors() { points.add(point); } JavaRDD data = sc.parallelize(points, 2); - HierarchicalClustering algo = new HierarchicalClustering().setNumClusters(numClusters); - HierarchicalClusteringModel model = algo.run(data.rdd()); + BisectingKMeans algo = new BisectingKMeans().setNumClusters(numClusters); + BisectingKMeansModel model = algo.run(data.rdd()); Vector[] centers = model.getCenters(); assertEquals(numClusters, centers.length); assertEquals(Vectors.dense(0.0, 0.0), centers[0]); @@ -103,8 +103,8 @@ public void runWithSparseVectors() { points.add(point); } JavaRDD data = sc.parallelize(points, 2); - HierarchicalClustering algo = new HierarchicalClustering().setNumClusters(numClusters); - HierarchicalClusteringModel model = algo.run(data.rdd()); + BisectingKMeans algo = new BisectingKMeans().setNumClusters(numClusters); + BisectingKMeansModel model = algo.run(data.rdd()); Vector[] centers = model.getCenters(); assertEquals(numClusters, centers.length); assertEquals(points.get(0), centers[0]); diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala similarity index 96% rename from mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala rename to mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index 8d8715c77db92..6df074e34d23d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -22,11 +22,11 @@ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.scalatest.BeforeAndAfterEach -class HierarchicalClusteringModelSuite +class BisectingKMeansModelSuite extends SparkFunSuite with MLlibTestSparkContext with BeforeAndAfterEach { test("clustering dense vectors") { - val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) + val app = new BisectingKMeans().setNumClusters(5).setSeed(1) val localData = (1 to 100).toSeq.map { i => val label = i % 5 @@ -87,7 +87,7 @@ class HierarchicalClusteringModelSuite } test("clustering sparse vectors") { - val app = new HierarchicalClustering().setNumClusters(5).setSeed(1) + val app = new BisectingKMeans().setNumClusters(5).setSeed(1) val localData = (1 to 100).toSeq.map { i => val label = i % 5 @@ -148,7 +148,7 @@ class HierarchicalClusteringModelSuite test("clustering should be done correctly") { for (numClusters <- Array(9, 99, 999)) { - val app = new HierarchicalClustering().setNumClusters(numClusters).setSeed(1) + val app = new BisectingKMeans().setNumClusters(numClusters).setSeed(1) val localData = (1 to 1000).toSeq.map { i => val label = i % numClusters val sparseVector = Vectors.sparse(numClusters, Seq((label, label.toDouble))) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala similarity index 90% rename from mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala rename to 
mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index bd4c94c05b1bc..8e5d95dfb2846 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/HierarchicalClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -25,10 +25,10 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ -class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkContext { +class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("the root index is equal to 1") { - assert(HierarchicalClustering.ROOT_INDEX_KEY === 1) + assert(BisectingKMeans.ROOT_INDEX_KEY === 1) } test("findClosestCenter") { @@ -41,13 +41,13 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte for (i <- 0 to (centers.size - 1)) { val point = centers(i) - val closestIndex = HierarchicalClustering.findClosestCenter(metric)(centers)(point) + val closestIndex = BisectingKMeans.findClosestCenter(metric)(centers)(point) assert(closestIndex === i) } } test("run") { - val algo = new HierarchicalClustering().setNumClusters(123) + val algo = new BisectingKMeans().setNumClusters(123) val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) @@ -62,7 +62,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("run with too many cluster size than the records") { - val algo = new HierarchicalClustering().setNumClusters(123) + val algo = new BisectingKMeans().setNumClusters(123) val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) @@ -71,7 +71,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("initializeData") { - val algo = new HierarchicalClustering + val algo = new BisectingKMeans val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val seed = sc.parallelize(localSeed) val data = algo.initData(seed) @@ -79,7 +79,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("get center stats") { - val algo = new HierarchicalClustering + val algo = new BisectingKMeans val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val seed = sc.parallelize(localSeed) val data = algo.initData(seed) @@ -103,7 +103,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("getChildrenCenter") { - val algo = new HierarchicalClustering + val algo = new BisectingKMeans val centers = Map( 2L -> Vectors.dense(1.0, 1.0).toBreeze, 3L -> Vectors.dense(2.0, 2.0).toBreeze @@ -114,7 +114,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("should divide clusters") { - val algo = new HierarchicalClustering + val algo = new BisectingKMeans val seed = (0 to 99).map(i => ((i / 50) + 2L, Vectors.dense(i, i).toBreeze)) val data = sc.parallelize(seed) val clusters = algo.summarizeAsClusters(data) @@ -132,7 +132,7 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("should assign each data to new clusters") { - val algo = new HierarchicalClustering + val algo = new BisectingKMeans val seed = Seq( (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), (2L, 
Vectors.dense(2.0, 2.0)), (2L, Vectors.dense(3.0, 3.0)), (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 5.0)), @@ -159,28 +159,28 @@ class HierarchicalClusteringSuite extends SparkFunSuite with MLlibTestSparkConte } test("setNumClusters") { - val algo = new HierarchicalClustering() + val algo = new BisectingKMeans() assert(algo.getNumClusters == 20) algo.setNumClusters(1000) assert(algo.getNumClusters == 1000) } test("setSubIterations") { - val algo = new HierarchicalClustering() + val algo = new BisectingKMeans() assert(algo.getMaxIterations == 20) algo.setMaxIterations(15) assert(algo.getMaxIterations == 15) } test("setNumRetries") { - val algo = new HierarchicalClustering() + val algo = new BisectingKMeans() assert(algo.getMaxRetries == 10) algo.setMaxRetries(15) assert(algo.getMaxRetries == 15) } test("setSeed") { - val algo = new HierarchicalClustering() + val algo = new BisectingKMeans() assert(algo.getSeed == 1) algo.setSeed(987) assert(algo.getSeed == 987) From 707609a040c7b5c8a67b266f8f3422ec7aec069d Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Fri, 3 Jul 2015 16:54:05 +0900 Subject: [PATCH 25/76] Remove the unnecessary parentheses --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 20f87c12cac8e..a1805bd603c29 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -556,7 +556,7 @@ class ClusterNode private ( this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) } - def isLeaf: Boolean = (this.children.size == 0) + def isLeaf: Boolean = this.children.size == 0 def getParent: Option[ClusterNode] = this.parent @@ -578,7 +578,7 @@ class ClusterNode private ( } private[mllib] - def setLocalHeight(height: Double) = (this.localHeight = height) + def setLocalHeight(height: Double) = this.localHeight = height /** * Converts to an adjacency list @@ -612,7 +612,7 @@ class ClusterNode private ( val leaves = nodes.filter(_.isLeaf) val notLeaves = nodes.filterNot(_.isLeaf).filter(_.getChildren.size > 1) val clusters = leaves ++ notLeaves - val treeMap = clusters.zipWithIndex.map { case (node, idx) => (node -> idx)}.toMap + val treeMap = clusters.zipWithIndex.map { case (node, idx) => node -> idx}.toMap // If a node only has one-child, the child is regarded as the cluster of the child. // Cluster A has cluster B and Cluster B. B is a leaf. C only has cluster D. 
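
At this point in the series, after the rename in PATCH 24 and the cleanups in PATCH 25, the public surface is the BisectingKMeans estimator plus the BisectingKMeansModel it returns. The following is a minimal usage sketch, not part of any patch: the SparkContext `sc` and the toy data are assumptions for illustration, and only calls that appear in the diffs above are used. The internal rework in the next patch (26) does not change these calls.

import org.apache.spark.mllib.clustering.BisectingKMeans
import org.apache.spark.mllib.linalg.Vectors

// Sketch only: assumes a live SparkContext `sc`; data and parameter values are illustrative.
// Toy data: 100 points spread over 5 obvious groups.
val data = sc.parallelize((0 until 100).map { i =>
  val label = (i % 5).toDouble
  Vectors.dense(label, label)
})

// Train with the builder-style setters shown in the diffs above.
val model = new BisectingKMeans()
  .setNumClusters(5)      // maximum number of leaf clusters
  .setMaxIterations(20)   // k-means iterations per bisecting step
  .setSeed(1)             // reproducible splits
  .run(data)

// Leaf centers, per-point prediction, and the training error.
val centers = model.getCenters                       // Array[Vector], one per leaf cluster
val cluster = model.predict(Vectors.dense(3.0, 3.0)) // index into `centers`
val error   = model.WSSSE(data)                      // Within Set Sum of Squared Error

// Dendrogram-oriented views of the cluster tree.
val edges   = model.toAdjacencyList   // Array[(fromNodeId, toNodeId, distance)]
val linkage = model.toLinkageMatrix   // Array[(node1, node2, distance, tree size)]

The adjacency list and linkage matrix expose the dendrogram structure of the cluster tree; the Java variants (toJavaAdjacencyList, toJavaLinkageMatrix) return the same rows as java.util.ArrayList of java.lang.Double values for Java and py4j interoperability, as noted in the model's comments.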
From 4e1653d1a76a2b4c8efb8ca30ec5511316b68530 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 7 Jul 2015 08:22:35 +0900 Subject: [PATCH 26/76] Change the way how to initialize the children centers --- .../mllib/clustering/BisectingKMeans.scala | 283 ++++++++---------- .../BisectingKMeansModelSuite.scala | 29 +- .../clustering/BisectingKMeansSuite.scala | 103 +++---- 3 files changed, 200 insertions(+), 215 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index a1805bd603c29..091884e603c6d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -17,19 +17,17 @@ package org.apache.spark.mllib.clustering -import scala.collection.{Map, mutable} - -import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm} +import scala.collection.{SortedSet, mutable, Map} +import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, any => breezeAny} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD -import org.apache.spark.util.random.XORShiftRandom import org.apache.spark.{Logging, SparkException} object BisectingKMeans extends Logging { - private[clustering] val ROOT_INDEX_KEY: Long = 1 + private[clustering] val ROOT_INDEX_KEY: BigInt = 1 /** * Finds the closes cluster's center @@ -62,16 +60,15 @@ object BisectingKMeans extends Logging { * @param seed a random seed */ class BisectingKMeans private ( - private var numClusters: Int, - private var clusterMap: Map[Long, ClusterNode], - private var maxIterations: Int, - private var maxRetries: Int, - private var seed: Long) extends Logging { + private var numClusters: Int, + private var clusterMap: Map[BigInt, ClusterNode], + private var maxIterations: Int, + private var seed: Long) extends Logging { /** * Constructs with the default configuration */ - def this() = this(20, mutable.ListMap.empty[Long, ClusterNode], 20, 10, 1) + def this() = this(20, mutable.ListMap.empty[BigInt, ClusterNode], 20, 1) /** * Sets the number of clusters you want @@ -93,16 +90,6 @@ class BisectingKMeans private ( def getMaxIterations: Int = this.maxIterations - /** - * Sets the number of maximum retries of each clustering step - */ - def setMaxRetries(maxRetries: Int): this.type = { - this.maxRetries = maxRetries - this - } - - def getMaxRetries: Int = this.maxRetries - /** * Sets the random seed */ @@ -125,27 +112,30 @@ class BisectingKMeans private ( var data = initData(input).cache() val startTime = System.currentTimeMillis() - // `clusters` is described as binary tree structure - // `clusters(1)` means the root of a binary tree - var clusters = summarizeAsClusters(data) - var leafClusters = clusters + // `clusterStats` is described as binary tree structure + // `clusterStats(1)` means the root of a binary tree + var clusterStats = mutable.Map.empty[BigInt, ClusterNodeStat] var step = 1 var numDividedClusters = 0 var noMoreDividable = false - var rddArray = Array.empty[RDD[(Long, BV[Double])]] + var rddArray = Array.empty[RDD[(BigInt, BV[Double])]] // the number of maximum nodes of a binary tree by given parameter val multiplier = math.ceil(math.log10(this.numClusters) / math.log10(2.0)) + 1 val maxAllNodesInTree = math.pow(2, multiplier).toInt - while (clusters.size < maxAllNodesInTree && noMoreDividable == false) { + while (clusterStats.size < 
maxAllNodesInTree && noMoreDividable == false) { log.info(s"${sc.appName} starts step ${step}") + val leafClusters = summarize(data) + val dividableLeafClusters = leafClusters.filter(_._2.isDividable) + clusterStats = clusterStats ++ leafClusters - // can be clustered if the number of divided clusters is equal to 0 - val divided = getDividedClusters(data, leafClusters) - if (divided.size == 0) { + if (dividableLeafClusters.isEmpty) { noMoreDividable = true } else { + // can be clustered if the number of divided clusterStats is equal to 0 + val divided = getDividedClusters(data, dividableLeafClusters) + // update each index val newData = updateClusterIndex(data, divided).cache() rddArray = rddArray ++ Array(data) @@ -157,24 +147,20 @@ class BisectingKMeans private ( head.unpersist() rddArray = rddArray.filterNot(_.hashCode() == head.hashCode()) } - - // merge the divided clusters with the map as the cluster tree - clusters = clusters ++ divided - numDividedClusters = data.map(_._1).distinct().count().toInt - leafClusters = divided step += 1 - - log.info(s"${sc.appName} adding ${divided.size} new clusters at step:${step}") + log.info(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") } } // unpersist kept RDDs rddArray.foreach(_.unpersist()) + val nodes = summarizeAsClusters(data, clusterStats) + // build a cluster tree by Map class which is expressed log.info(s"Building the cluster tree is started in ${sc.appName}") - val root = buildTree(clusters, BisectingKMeans.ROOT_INDEX_KEY, this.numClusters) - if (root == None) { - new SparkException("Failed to build a cluster tree from a Map type of clusters") + val root = buildTree(nodes, BisectingKMeans.ROOT_INDEX_KEY, this.numClusters) + if (root.isEmpty) { + new SparkException("Failed to build a cluster tree from a Map type of clusterStats") } // set the elapsed time for training @@ -185,7 +171,7 @@ class BisectingKMeans private ( val model = new BisectingKMeansModel(root.get) val leavesNodes = model.getClusters if (leavesNodes.length < this.numClusters) { - log.warn(s"# clusters is less than you want: ${leavesNodes.length} / ${numClusters}") + log.warn(s"# clusterStats is less than you want: ${leavesNodes.length} / ${numClusters}") } model } @@ -194,7 +180,7 @@ class BisectingKMeans private ( * Assigns the initial cluster index id to all data */ private[clustering] - def initData(data: RDD[Vector]): RDD[(Long, BV[Double])] = { + def initData(data: RDD[Vector]): RDD[(BigInt, BV[Double])] = { data.map { v: Vector => (BisectingKMeans.ROOT_INDEX_KEY, v.toBreeze)} } @@ -202,30 +188,24 @@ class BisectingKMeans private ( * Summarizes data by each cluster as ClusterTree classes */ private[clustering] - def summarizeAsClusters(data: RDD[(Long, BV[Double])]): Map[Long, ClusterNode] = { - // summarize input data - val stats = summarize(data) - - // convert statistics to ClusterTree class - stats.map { case (i, (sum, n, sumOfSquares)) => - val center = Vectors.fromBreeze(sum :/ n) - val variances = n match { - case n if n > 1 => Vectors.fromBreeze(sumOfSquares.:*(n) - (sum :* sum) :/ (n * (n - 1.0))) - case _ => Vectors.zeros(sum.size) - } - (i, new ClusterNode(center, n.toLong, variances)) - }.toMap + def summarizeAsClusters( + data: RDD[(BigInt, BV[Double])], + stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { + + stats.map { case (i, stat) => + i -> new ClusterNode(Vectors.fromBreeze(stat.center), stat.rows, breezeNorm(stat.variances, 2.0)) + } } /** * Summarizes data by each cluster as Map */ private[clustering] - 
def summarize(data: RDD[(Long, BV[Double])]): Map[Long, (BV[Double], Double, BV[Double])] = { - data.mapPartitions { iter => + def summarize(data: RDD[(BigInt, BV[Double])]): Map[BigInt, ClusterNodeStat] = { + val stats = data.mapPartitions { iter => // calculate the accumulation of the all point in a partition and count the rows - val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] - iter.foreach { case (idx: Long, point: BV[Double]) => + val map = mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] + iter.foreach { case (idx: BigInt, point: BV[Double]) => // get a map value or else get a sparse vector val (sumBV, n, sumOfSquares) = map .getOrElse(idx, (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size))) @@ -236,79 +216,45 @@ class BisectingKMeans private ( // sum the accumulation and the count in the all partition (sum1 + sum2, n1 + n2, sumOfSquares1 + sumOfSquares2) }.collect().toMap + + stats.map {case (i, stat) => i -> new ClusterNodeStat(stat._2.toLong, stat._1, stat._3)} } /** * Gets the new divided centers */ private[clustering] - def getDividedClusters(data: RDD[(Long, BV[Double])], - dividedClusters: Map[Long, ClusterNode]): Map[Long, ClusterNode] = { + def getDividedClusters(data: RDD[(BigInt, BV[Double])], + leafClusters: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNodeStat] = { val sc = data.sparkContext val appName = sc.appName // get keys of dividable clusters - val dividableKeys = dividedClusters.filter { case (idx, cluster) => - cluster.variances.toArray.sum > 0.0 && cluster.records >= 2 - }.keySet - if (dividableKeys.size == 0) { + val dividableClusters = leafClusters.filter { case (idx, cluster) => cluster.isDividable } + if (dividableClusters.isEmpty) { log.info(s"There is no dividable clusters in ${appName}.") - return Map.empty[Long, ClusterNode] + return Map.empty[BigInt, ClusterNodeStat] } // divide input data - var dividableData = data.filter { case (idx, point) => dividableKeys.contains(idx)} - val dividableClusters = dividedClusters.filter { case (k, v) => dividableKeys.contains(k)} - val idealIndexes = dividableKeys.flatMap(idx => Array(2 * idx, 2 * idx + 1).toIterator) - var stats = divide(data, dividableClusters) - - // if there are clusters which failed to be divided, retry to split the failed clusters - var tryTimes = 1 - while (stats.size < dividableKeys.size * 2 && tryTimes <= this.maxRetries) { - // get the indexes of clusters which is failed to be divided - val failedIndexes = idealIndexes.filterNot(stats.keySet.contains).map(idx => idx / 2) - val failedCenters = dividedClusters.filter { case (idx, clstr) => failedIndexes.contains(idx)} - log.info(s"# failed clusters is ${failedCenters.size} of ${dividableKeys.size}" + - s"at ${tryTimes} times in ${appName}") - - // divide the failed clusters again - val bcFailedIndexes = sc.broadcast(failedIndexes) - dividableData = data.filter { case (idx, point) => bcFailedIndexes.value.contains(idx)} - val missingStats = divide(dividableData, failedCenters) - stats = stats ++ missingStats - tryTimes += 1 - } - - // make children clusters - stats.filter { case (i, (sum, n, sumOfSquares)) => n > 0} - .map { case (i, (sum, n, sumOfSquares)) => - val center = Vectors.fromBreeze(sum :/ n) - val variances = n match { - case 1 => Vectors.sparse(sum.size, Array(), Array()) - case _ => Vectors.fromBreeze(sumOfSquares.:*(n) - (sum :* sum) :/ (n * (n - 1.0))) - } - val child = new ClusterNode(center, n.toLong, variances) - (i, child) - }.toMap + val dividableData = data.filter { 
case (idx, point) => dividableClusters.contains(idx)} + divide(dividableData, dividableClusters) } /** * Divides the input data * * @param data the pairs of cluster index and point which you want to divide - * @param clusters the clusters you want to divide AS a Map class + * @param currentStats the cluster stats you want to divide AS a Map class * @return divided clusters as Map */ private[clustering] - def divide(data: RDD[(Long, BV[Double])], - clusters: Map[Long, ClusterNode]): Map[Long, (BV[Double], Double, BV[Double])] = { + def divide( + data: RDD[(BigInt, BV[Double])], + currentStats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNodeStat] = { val sc = data.sparkContext - val centers = clusters.map { case (idx, cluster) => (idx, cluster.center.toBreeze)} - var newCenters = initChildrenCenter(centers) - if (newCenters.size == 0) { - return Map.empty[Long, (BV[Double], Double, BV[Double])] - } + var newCenters = initChildCenters(data, currentStats) var bcNewCenters = sc.broadcast(newCenters) // TODO Supports distance metrics other Euclidean distance metric @@ -316,9 +262,7 @@ class BisectingKMeans private ( val bcMetric = sc.broadcast(metric) val vectorSize = newCenters(newCenters.keySet.min).size - var stats = newCenters.keys.map { idx => - (idx, (BSV.zeros[Double](vectorSize).toVector, 0.0, BSV.zeros[Double](vectorSize).toVector)) - }.toMap + var stats = Map.empty[BigInt, (BV[Double], Double, BV[Double])] var subIter = 0 var diffVariances = Double.MaxValue @@ -327,7 +271,7 @@ class BisectingKMeans private ( while (subIter < this.maxIterations && diffVariances > 10E-4) { // calculate summary of each cluster val eachStats = data.mapPartitions { iter => - val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] + val map = mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] iter.foreach { case (idx, point) => // calculate next index number val childrenCenters = Array(2 * idx, 2 * idx + 1) @@ -366,24 +310,48 @@ class BisectingKMeans private ( oldVariances = variances subIter += 1 } - stats + + stats.map { case (i, stat) => i -> new ClusterNodeStat(stat._2.toLong, stat._1, stat._3) } } /** * Gets the initial centers for bisect k-means */ private[clustering] - def initChildrenCenter(clusters: Map[Long, BV[Double]]): Map[Long, BV[Double]] = { - val rand = new XORShiftRandom() - rand.setSeed(this.seed) - - clusters.flatMap { case (idx, center) => - val childrenIndexes = Array(2 * idx, 2 * idx + 1) - val relativeErrorCoefficient = 0.001 - Array( - (2 * idx, center.map(elm => elm - (elm * relativeErrorCoefficient * rand.nextDouble()))), - (2 * idx + 1, center.map(elm => elm + (elm * relativeErrorCoefficient * rand.nextDouble()))) - ) + def initChildCenters( + data: RDD[(BigInt, BV[Double])], + stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, BV[Double]] = { + + // Since the combination sampleByKey and groupByKey is more expensive, + // this as follows would be better. + val bcIndeces = data.sparkContext.broadcast(stats.keySet) + val samples = data.mapPartitions { iter => + val map = mutable.Map.empty[BigInt, mutable.ArrayBuffer[BV[Double]]] + + bcIndeces.value.foreach {i => map(i) = mutable.ArrayBuffer.empty[BV[Double]]} + val LOCAL_SAMPLE_SIZE = 20 + iter.foreach { case (i, point) => + map(i).append(point) + // to avoid to increase the memory usage on each map thread, + // the number of elements is cut off at the right time. 
+ if (map(i).size > LOCAL_SAMPLE_SIZE) { + val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 1.0) < breezeNorm(b, 1.0)) + map(i) = mutable.ArrayBuffer(elements.head, elements.last) + } + } + + // in order to reduce the shuffle size, take only two elements + map.filterNot(_._2.isEmpty).map { case (i, points) => + val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 1.0) < breezeNorm(b, 1.0)) + i -> mutable.ArrayBuffer(elements.head, elements.last) + }.toIterator + }.reduceByKey { case (points1, points2) => + points1.union(points2) + }.collect() + + samples.flatMap { case (i, points) => + val elements = points.toSeq.sortWith((a, b) => breezeNorm(a, 1.0) < breezeNorm(b, 1.0)) + Array((2 * i, elements.head), (2 * i + 1, elements.last)) }.toMap } @@ -396,9 +364,10 @@ class BisectingKMeans private ( * @return a built cluster tree */ private[clustering] - def buildTree(treeMap: Map[Long, ClusterNode], - rootIndex: Long, - numClusters: Int): Option[ClusterNode] = { + def buildTree( + treeMap: Map[BigInt, ClusterNode], + rootIndex: BigInt, + numClusters: Int): Option[ClusterNode] = { // if there is no index in the Map if (!treeMap.contains(rootIndex)) return None @@ -407,9 +376,9 @@ class BisectingKMeans private ( var numLeavesClusters = 1 val root = treeMap(rootIndex) var leavesQueue = Map(rootIndex -> root) - while (leavesQueue.size > 0 && numLeavesClusters < numClusters) { + while (leavesQueue.nonEmpty && numLeavesClusters < numClusters) { // pick up the cluster whose variance is the maximum in the queue - val mostScattered = leavesQueue.maxBy(_._2.variancesNorm) + val mostScattered = leavesQueue.maxBy(_._2.criterion) val mostScatteredKey = mostScattered._1 val mostScatteredCluster = mostScattered._2 @@ -446,8 +415,9 @@ class BisectingKMeans private ( */ private[clustering] def updateClusterIndex( - data: RDD[(Long, BV[Double])], - dividedClusters: Map[Long, ClusterNode]): RDD[(Long, BV[Double])] = { + data: RDD[(BigInt, BV[Double])], + dividedClusters: Map[BigInt, ClusterNodeStat]): RDD[(BigInt, BV[Double])] = { + // extract the centers of the clusters val sc = data.sparkContext var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} @@ -465,7 +435,7 @@ class BisectingKMeans private ( case s if s < 2 => (idx, point) // update the indexes case _ => { - val nextCenters = childrenIndexes.map(bcCenters.value(_)).map(_.toBreeze) + val nextCenters = childrenIndexes.map(bcCenters.value(_)) val closestIndex = BisectingKMeans .findClosestCenter(bcMetric.value)(nextCenters)(point) val nextIndex = 2 * idx + closestIndex @@ -476,31 +446,44 @@ class BisectingKMeans private ( } } +private[this] +case class ClusterNodeStat ( + rows: Long, + sums: BV[Double], + sumOfSquares: BV[Double]) extends Serializable { + + // initialization + val center: BV[Double] = sums :/ rows.toDouble + val variances: BV[Double] = rows match { + case n if n > 1 => sumOfSquares.:/(n.toDouble) - (sums :* sums).:/(n.toDouble * n.toDouble) + case _ => BV.zeros[Double](sums.size) + } + + def isDividable: Boolean = breezeAny(variances) && rows >= 2 +} + /** * A cluster as a tree node which can have its sub nodes * * @param center the center of the cluster - * @param records the number of rows in the cluster - * @param variances variance vectors - * @param variancesNorm the norm of variance vector + * @param rows the number of rows in the cluster + * @param criterion the norm of variance vector * @param localHeight the maximal distance between this node and its children * @param parent the 
parent cluster of the cluster * @param children the children nodes of the cluster */ class ClusterNode private ( - val center: Vector, - val records: Long, - val variances: Vector, - val variancesNorm: Double, - private var localHeight: Double, - private var parent: Option[ClusterNode], - private var children: Seq[ClusterNode]) extends Serializable { + val center: Vector, + val rows: Long, + val criterion: Double, + private var localHeight: Double, + private var parent: Option[ClusterNode], + private var children: Seq[ClusterNode]) extends Serializable { - require(!variancesNorm.isNaN) + require(!criterion.isNaN) - def this(center: Vector, rows: Long, variances: Vector) = - this(center, rows, variances, breezeNorm(variances.toBreeze, 2.0), - 0.0, None, Array.empty[ClusterNode]) + def this(center: Vector, rows: Long, criterion: Double) = + this(center, rows, criterion, 0.0, None, Array.empty[ClusterNode]) /** * Inserts a sub node as its child @@ -533,7 +516,7 @@ class ClusterNode private ( case _ => Array(this) ++ this.children.flatMap(child => child.toArray.toIterator) } array.sortWith { case (a, b) => - a.getDepth < b.getDepth && a.variances.toArray.sum < b.variances.toArray.sum + a.getDepth < b.getDepth && a.criterion < b.criterion } } @@ -552,11 +535,9 @@ class ClusterNode private ( /** * Gets the leaves nodes in the cluster tree */ - def getLeavesNodes: Array[ClusterNode] = { - this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) - } + def getLeavesNodes: Array[ClusterNode] = this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) - def isLeaf: Boolean = this.children.size == 0 + def isLeaf: Boolean = this.children.isEmpty def getParent: Option[ClusterNode] = this.parent @@ -627,7 +608,7 @@ class ClusterNode private ( (getIndex(treeMap, node.children.head), getIndex(treeMap, node.children(1)), node.getHeight, - node.toArray.filter(_.isLeaf).length) + node.toArray.filter(_.isLeaf).size) } } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index 6df074e34d23d..f4c4fe1cbfc4f 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -127,14 +127,14 @@ class BisectingKMeansModelSuite val adjacencyList = model.toAdjacencyList .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) assert(adjacencyList.length === 8) - assert(adjacencyList(0) === (0, 1, 1.5652)) - assert(adjacencyList(1) === (0, 6, 1.5652)) - assert(adjacencyList(2) === (1, 2, 1.3744)) - assert(adjacencyList(3) === (1, 5, 1.3744)) - assert(adjacencyList(4) === (2, 3, 0.5)) - assert(adjacencyList(5) === (2, 4, 0.5)) - assert(adjacencyList(6) === (6, 7, 2.5)) - assert(adjacencyList(7) === (6, 8, 2.5)) + assert(adjacencyList(0) === (0, 1, 3.2863)) + assert(adjacencyList(1) === (0, 8, 3.2863)) + assert(adjacencyList(2) === (1, 2, 2.3184)) + assert(adjacencyList(3) === (1, 7, 2.3184)) + assert(adjacencyList(4) === (2, 3, 1.3744)) + assert(adjacencyList(5) === (2, 6, 1.3744)) + assert(adjacencyList(6) === (3, 4, 0.5)) + assert(adjacencyList(7) === (3, 5, 0.5)) // linkage matrix val linkageMatrix = model.toLinkageMatrix @@ -142,30 +142,31 @@ class BisectingKMeansModelSuite assert(linkageMatrix.length === 4) assert(linkageMatrix(0) === (0, 1, 0.5, 2)) assert(linkageMatrix(1) === (5, 2, 1.8744, 3)) - assert(linkageMatrix(2) === (3, 4, 2.5, 2)) - 
assert(linkageMatrix(3) === (6, 7, 4.0652, 5)) + assert(linkageMatrix(2) === (6, 3, 4.1928, 4)) + assert(linkageMatrix(3) === (7, 4, 7.4791, 5)) } test("clustering should be done correctly") { - for (numClusters <- Array(9, 99, 999)) { + for (numClusters <- Array(9, 19)) { val app = new BisectingKMeans().setNumClusters(numClusters).setSeed(1) - val localData = (1 to 1000).toSeq.map { i => + val localData = (1 to 19).toSeq.map { i => val label = i % numClusters val sparseVector = Vectors.sparse(numClusters, Seq((label, label.toDouble))) val denseVector = Vectors.fromBreeze(sparseVector.toBreeze.toDenseVector) (label, denseVector, sparseVector) } + // dense version val denseData = sc.parallelize(localData.map(_._2), 2) val denseModel = app.run(denseData) assert(denseModel.getCenters.length === numClusters) - assert(denseModel.getClusters.forall(_.variancesNorm == 0.0)) + assert(denseModel.getClusters.forall(_.criterion == 0.0)) // sparse version val sparseData = sc.parallelize(localData.map(_._3), 2) val sparseModel = app.run(sparseData) assert(sparseModel.getCenters.length === numClusters) - assert(sparseModel.getClusters.forall(_.variancesNorm == 0.0)) + assert(sparseModel.getClusters.forall(_.criterion == 0.0)) } } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 8e5d95dfb2846..b93a320890b22 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -47,12 +47,12 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { } test("run") { - val algo = new BisectingKMeans().setNumClusters(123) + val algo = new BisectingKMeans().setNumClusters(123).setSeed(1) val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) assert(model.getClusters.length == 123) - assert(model.node.getHeight ~== 702.8641 absTol 10E-4) + assert(model.node.getHeight ~== 702.86414 absTol 10E-4) // check the relations between a parent cluster and its children assert(model.node.getParent === None) @@ -62,9 +62,9 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { } test("run with too many cluster size than the records") { - val algo = new BisectingKMeans().setNumClusters(123) + val algo = new BisectingKMeans().setNumClusters(123).setSeed(1) val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val data = sc.parallelize(localSeed, 2) + val data = sc.parallelize(localSeed) val model = algo.run(data) assert(model.getClusters.length == 100) assert(model.node.getHeight ~== 72.12489 absTol 10E-4) @@ -84,67 +84,77 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val seed = sc.parallelize(localSeed) val data = algo.initData(seed) - val clusters = algo.summarizeAsClusters(data) + val clusters = algo.summarize(data) assert(clusters.size === 1) - assert(clusters(1).center === Vectors.dense(49.5, 49.5)) - assert(clusters(1).records === 100) + assert(clusters(1).center === Vectors.dense(49.5, 49.5).toBreeze) + assert(clusters(1).rows === 100) - val data2 = seed.map(v => ((v.apply(0) / 25).toLong + 1L, v.toBreeze)) - val clusters2 = algo.summarizeAsClusters(data2) + val data2 = seed.map(v => (BigInt((v.apply(0) / 25).toInt + 1), v.toBreeze)) + val 
clusters2 = algo.summarize(data2) assert(clusters2.size === 4) - assert(clusters2(1).center === Vectors.dense(12.0, 12.0)) - assert(clusters2(1).records === 25) - assert(clusters2(2).center === Vectors.dense(37.0, 37.0)) - assert(clusters2(2).records === 25) - assert(clusters2(3).center === Vectors.dense(62.0, 62.0)) - assert(clusters2(3).records === 25) - assert(clusters2(4).center === Vectors.dense(87.0, 87.0)) - assert(clusters2(4).records === 25) + assert(clusters2(1).center === Vectors.dense(12.0, 12.0).toBreeze) + assert(clusters2(1).rows === 25) + assert(clusters2(2).center === Vectors.dense(37.0, 37.0).toBreeze) + assert(clusters2(2).rows === 25) + assert(clusters2(3).center === Vectors.dense(62.0, 62.0).toBreeze) + assert(clusters2(3).rows === 25) + assert(clusters2(4).center === Vectors.dense(87.0, 87.0).toBreeze) + assert(clusters2(4).rows === 25) } test("getChildrenCenter") { val algo = new BisectingKMeans - val centers = Map( - 2L -> Vectors.dense(1.0, 1.0).toBreeze, - 3L -> Vectors.dense(2.0, 2.0).toBreeze + val local = Seq( + (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), + (BigInt(3), BV[Double](1.9, 1.9)), (BigInt(3), BV[Double](2.1, 2.1)) + ) + val data = sc.parallelize(local) + val stats = Map[BigInt, ClusterNodeStat]( + BigInt(2) -> new ClusterNodeStat(2, BV[Double](1.0, 1.0) * 2.0, BV.zeros[Double](2)), + BigInt(3) -> new ClusterNodeStat(2, BV[Double](2.0, 2.0) * 2.0, BV.zeros[Double](2)) ) - val initNextCenters = algo.initChildrenCenter(centers) + val initNextCenters = algo.initChildCenters(data, stats) assert(initNextCenters.size === 4) assert(initNextCenters.keySet === Set(4, 5, 6, 7)) } test("should divide clusters") { - val algo = new BisectingKMeans - val seed = (0 to 99).map(i => ((i / 50) + 2L, Vectors.dense(i, i).toBreeze)) - val data = sc.parallelize(seed) - val clusters = algo.summarizeAsClusters(data) - val newClusters = algo.getDividedClusters(data, clusters) + val algo = new BisectingKMeans().setSeed(5) + val local = Seq( + (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), + (BigInt(2), BV[Double](9.9, 9.9)), (BigInt(2), BV[Double](10.1, 10.1)), + (BigInt(3), BV[Double](99.9, 99.9)), (BigInt(3), BV[Double](100.1, 100.1)), + (BigInt(3), BV[Double](109.9, 109.9)), (BigInt(3), BV[Double](110.1, 110.1)) + ) + val data = sc.parallelize(local) + val stats = algo.summarize(data) + val newClusters = algo.getDividedClusters(data, stats) assert(newClusters.size === 4) - assert(newClusters(4).center === Vectors.dense(12.0, 12.0)) - assert(newClusters(4).records === 25) - assert(newClusters(5).center === Vectors.dense(37.0, 37.0)) - assert(newClusters(5).records === 25) - assert(newClusters(6).center === Vectors.dense(62.0, 62.0)) - assert(newClusters(6).records === 25) - assert(newClusters(7).center === Vectors.dense(87.0, 87.0)) - assert(newClusters(7).records === 25) + assert(newClusters(4).center === BV[Double](1.0, 1.0)) + assert(newClusters(4).rows === 2) + assert(newClusters(5).center === BV[Double](10.0, 10.0)) + assert(newClusters(5).rows === 2) + assert(newClusters(6).center === BV[Double](100.0, 100.0)) + assert(newClusters(6).rows === 2) + assert(newClusters(7).center === BV[Double](110.0, 110.0)) + assert(newClusters(7).rows === 2) } test("should assign each data to new clusters") { val algo = new BisectingKMeans val seed = Seq( - (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), (2L, Vectors.dense(2.0, 2.0)), - (2L, Vectors.dense(3.0, 3.0)), (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 
5.0)), - (3L, Vectors.dense(6.0, 6.0)), (3L, Vectors.dense(7.0, 7.0)), (3L, Vectors.dense(8.0, 8.0)), - (3L, Vectors.dense(9.0, 9.0)), (3L, Vectors.dense(10.0, 10.0)), - (3L, Vectors.dense(11.0, 11.0)) + (BigInt(2), Vectors.dense(0.0, 0.0)), (BigInt(2), Vectors.dense(1.0, 1.0)), (BigInt(2), Vectors.dense(2.0, 2.0)), + (BigInt(2), Vectors.dense(3.0, 3.0)), (BigInt(2), Vectors.dense(4.0, 4.0)), (BigInt(2), Vectors.dense(5.0, 5.0)), + (BigInt(3), Vectors.dense(6.0, 6.0)), (BigInt(3), Vectors.dense(7.0, 7.0)), (BigInt(3), Vectors.dense(8.0, 8.0)), + (BigInt(3), Vectors.dense(9.0, 9.0)), (BigInt(3), Vectors.dense(10.0, 10.0)), + (BigInt(3), Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze)} val newClusters = Map( - 4L -> new ClusterNode(Vectors.dense(1.0, 1.0), 3, Vectors.dense(1.0, 1.0)), - 5L -> new ClusterNode(Vectors.dense(4.0, 4.0), 3, Vectors.dense(1.0, 1.0)), - 6L -> new ClusterNode(Vectors.dense(7.0, 7.0), 3, Vectors.dense(1.0, 1.0)), - 7L -> new ClusterNode(Vectors.dense(10.0, 10.0), 3, Vectors.dense(1.0, 1.0)) + BigInt(4) -> new ClusterNodeStat(3L, BV[Double](1.0, 1.0) :* 3.0, BV[Double](1.0, 1.0)), + BigInt(5) -> new ClusterNodeStat(3L, BV[Double](4.0, 4.0) :* 3.0, BV[Double](1.0, 1.0)), + BigInt(6) -> new ClusterNodeStat(3L, BV[Double](7.0, 7.0) :* 3.0, BV[Double](1.0, 1.0)), + BigInt(7) -> new ClusterNodeStat(3L, BV[Double](10.0, 10.0) :* 3.0, BV[Double](1.0, 1.0)) ) val data = sc.parallelize(seed) val result = algo.updateClusterIndex(data, newClusters).collect().toSeq @@ -172,13 +182,6 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { assert(algo.getMaxIterations == 15) } - test("setNumRetries") { - val algo = new BisectingKMeans() - assert(algo.getMaxRetries == 10) - algo.setMaxRetries(15) - assert(algo.getMaxRetries == 15) - } - test("setSeed") { val algo = new BisectingKMeans() assert(algo.getSeed == 1) From 6a51b129260abed13b2c869d77f53852020fe7c5 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 14 Jul 2015 10:11:48 +0900 Subject: [PATCH 27/76] Change the criterion for building a cluster tree from variance vectors to avg. costs. 
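Note on the new criterion: instead of the 2-norm of the variance vector, each cluster is now scored by the average distance between its points and its center. The following is a minimal, standalone sketch of that per-cluster average cost on plain Scala collections; it is only an illustration, not the patch's RDD-based implementation, and the names (`AverageCostSketch`, `averageCosts`) are hypothetical.

// Standalone sketch of the "average cost" criterion: for each cluster,
// the mean Euclidean distance between its points and its center.
object AverageCostSketch {
  type Point = Array[Double]

  private def euclidean(a: Point, b: Point): Double =
    math.sqrt(a.zip(b).map { case (x, y) => (x - y) * (x - y) }.sum)

  // Computes the per-cluster average distance to the cluster center.
  def averageCosts(assignments: Map[BigInt, Seq[Point]]): Map[BigInt, Double] =
    assignments.map { case (idx, points) =>
      val dim = points.head.length
      // center = coordinate-wise mean of the cluster's points
      val center = Array.tabulate(dim)(d => points.map(_(d)).sum / points.size)
      val avg = points.map(p => euclidean(p, center)).sum / points.size
      idx -> avg
    }

  def main(args: Array[String]): Unit = {
    val clusters: Map[BigInt, Seq[Point]] = Map(
      BigInt(2) -> Seq(Array(0.0, 0.0), Array(2.0, 0.0)),    // avg cost 1.0
      BigInt(3) -> Seq(Array(10.0, 10.0), Array(10.0, 10.0)) // avg cost 0.0
    )
    averageCosts(clusters).foreach { case (i, c) => println(s"cluster $i: $c") }
  }
}

The larger the average cost, the more scattered the cluster, so the tree-building step can keep picking the leaf with the maximum criterion to split next.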
--- .../mllib/clustering/BisectingKMeans.scala | 33 +++++++++++++++++-- .../clustering/BisectingKMeansSuite.scala | 2 +- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 091884e603c6d..c5d76874fb73c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -147,6 +147,7 @@ class BisectingKMeans private ( head.unpersist() rddArray = rddArray.filterNot(_.hashCode() == head.hashCode()) } + clusterStats = clusterStats ++ divided step += 1 log.info(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") } @@ -154,7 +155,7 @@ class BisectingKMeans private ( // unpersist kept RDDs rddArray.foreach(_.unpersist()) - val nodes = summarizeAsClusters(data, clusterStats) + val nodes = calcCriterions(data, clusterStats) // build a cluster tree by Map class which is expressed log.info(s"Building the cluster tree is started in ${sc.appName}") @@ -188,12 +189,38 @@ class BisectingKMeans private ( * Summarizes data by each cluster as ClusterTree classes */ private[clustering] - def summarizeAsClusters( + def calcCriterions( data: RDD[(BigInt, BV[Double])], stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { + // TODO: support other criteria, such as entropy + calcAvgConsts(data, stats) + } + + private[clustering] + def calcAvgConsts( + data: RDD[(BigInt, BV[Double])], + stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { + + val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.center }) + val costs = data.mapPartitions { iter => + val counters = mutable.Map.empty[BigInt, (Long, Double)] + bcCenters.value.foreach {case (i, center) => counters(i) = (0L, 0.0)} + iter.foreach { case (i, point) => + val cost = breezeNorm(bcCenters.value.apply(i) - point, 2.0) + counters(i) = (counters(i)._1 + 1, counters(i)._2 + cost) + } + counters.toIterator + }.reduceByKey { case((n1, cost1), (n2, cost2)) => + (n1 + n2, cost1 + cost2) + }.collectAsMap() + stats.map { case (i, stat) => - i -> new ClusterNode(Vectors.fromBreeze(stat.center), stat.rows, breezeNorm(stat.variances, 2.0)) + val avgCost = costs(i)._1 match { + case x if x == 0.0 => 0.0 + case _ => costs(i)._2 / costs(i)._1 + } + i -> new ClusterNode(Vectors.fromBreeze(stat.center), stat.rows, avgCost) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index b93a320890b22..f99e1823f94dc 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -52,7 +52,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val data = sc.parallelize(localSeed, 2) val model = algo.run(data) assert(model.getClusters.length == 123) - assert(model.node.getHeight ~== 702.86414 absTol 10E-4) + assert(model.node.getHeight ~== 705.6925 absTol 10E-4) // check the relations between a parent cluster and its children assert(model.node.getParent === None) From c8a2a1932f08cc356537ca8b96bb3b733a3eea0e Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 14 Jul 2015 15:21:04 +0900 Subject: [PATCH 28/76] Change `toArray` to avoid the TimSort error 
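The TimSort error referred to here is the JDK's "Comparison method violates its general contract!" exception, which java.util.TimSort can raise when a `sortWith` predicate is not a strict (weak) ordering: with an AND-combined test such as `a.getDepth < b.getDepth && a.criterion < b.criterion`, two distinct nodes can compare as "equal" in both directions without that equivalence being transitive. The snippet below only illustrates the difference between such a predicate and a field-by-field (lexicographic) ordering; the `Node` case class and its fields are stand-ins for illustration, not the patch's `ClusterNode`.

// Illustration only: an AND-combined sortWith predicate is not a strict
// ordering, which TimSort may reject on larger inputs; a lexicographic
// ordering compares field by field and is always consistent.
object SortContractSketch {
  final case class Node(depth: Int, criterion: Double, rows: Long)

  def main(args: Array[String]): Unit = {
    val nodes = Seq(Node(1, 1.0, 10L), Node(2, 2.0, 5L), Node(2, 0.5, 20L))

    // Risky: lt(a, b) and lt(b, a) can both be false for differing nodes,
    // so the implied "equal" relation is not transitive.
    val risky = nodes.sortWith((a, b) => a.depth < b.depth && a.criterion < b.criterion)

    // Safe: compare depth first, then criterion, then rows on ties.
    val safe = nodes.sortBy(n => (n.depth, n.criterion, n.rows))

    println(risky)
    println(safe)
  }
}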
--- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index c5d76874fb73c..9538952dbb17c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -543,7 +543,7 @@ class ClusterNode private ( case _ => Array(this) ++ this.children.flatMap(child => child.toArray.toIterator) } array.sortWith { case (a, b) => - a.getDepth < b.getDepth && a.criterion < b.criterion + a.getDepth < b.getDepth && a.criterion < b.criterion && a.rows < b.rows } } From 5f899b3e93b4d98d87d72b6b2b291cea97ed5918 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 14 Jul 2015 16:04:07 +0900 Subject: [PATCH 29/76] Format the code, since there are some validation problems --- .../spark/mllib/clustering/BisectingKMeans.scala | 6 ++++-- .../spark/mllib/clustering/BisectingKMeansSuite.scala | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 9538952dbb17c..7780fec90ad42 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -352,7 +352,7 @@ class BisectingKMeans private ( // Since the combination sampleByKey and groupByKey is more expensive, // this as follows would be better. val bcIndeces = data.sparkContext.broadcast(stats.keySet) - val samples = data.mapPartitions { iter => + val samples = data.mapPartitions { iter => val map = mutable.Map.empty[BigInt, mutable.ArrayBuffer[BV[Double]]] bcIndeces.value.foreach {i => map(i) = mutable.ArrayBuffer.empty[BV[Double]]} @@ -562,7 +562,9 @@ class ClusterNode private ( /** * Gets the leaves nodes in the cluster tree */ - def getLeavesNodes: Array[ClusterNode] = this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) + def getLeavesNodes: Array[ClusterNode] = { + this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) + } def isLeaf: Boolean = this.children.isEmpty diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index f99e1823f94dc..95b4f59772481 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -144,11 +144,12 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("should assign each data to new clusters") { val algo = new BisectingKMeans val seed = Seq( - (BigInt(2), Vectors.dense(0.0, 0.0)), (BigInt(2), Vectors.dense(1.0, 1.0)), (BigInt(2), Vectors.dense(2.0, 2.0)), - (BigInt(2), Vectors.dense(3.0, 3.0)), (BigInt(2), Vectors.dense(4.0, 4.0)), (BigInt(2), Vectors.dense(5.0, 5.0)), - (BigInt(3), Vectors.dense(6.0, 6.0)), (BigInt(3), Vectors.dense(7.0, 7.0)), (BigInt(3), Vectors.dense(8.0, 8.0)), - (BigInt(3), Vectors.dense(9.0, 9.0)), (BigInt(3), Vectors.dense(10.0, 10.0)), - (BigInt(3), Vectors.dense(11.0, 11.0)) + (BigInt(2), Vectors.dense(0.0, 0.0)), (BigInt(2), Vectors.dense(1.0, 1.0)), + (BigInt(2), Vectors.dense(2.0, 2.0)), 
(BigInt(2), Vectors.dense(3.0, 3.0)), + (BigInt(2), Vectors.dense(4.0, 4.0)), (BigInt(2), Vectors.dense(5.0, 5.0)), + (BigInt(3), Vectors.dense(6.0, 6.0)), (BigInt(3), Vectors.dense(7.0, 7.0)), + (BigInt(3), Vectors.dense(8.0, 8.0)), (BigInt(3), Vectors.dense(9.0, 9.0)), + (BigInt(3), Vectors.dense(10.0, 10.0)), (BigInt(3), Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze)} val newClusters = Map( BigInt(4) -> new ClusterNodeStat(3L, BV[Double](1.0, 1.0) :* 3.0, BV[Double](1.0, 1.0)), From 313e87f013d92b178b0c231f5e905eb19f45d34c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 14 Jul 2015 16:55:59 +0900 Subject: [PATCH 30/76] Remove unnecesary comment and import --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 7780fec90ad42..8fd14b2ffa0af 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.clustering -import scala.collection.{SortedSet, mutable, Map} +import scala.collection.{mutable, Map} import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, any => breezeAny} import org.apache.spark.mllib.linalg.{Vector, Vectors} @@ -56,7 +56,6 @@ object BisectingKMeans extends Logging { * @param numClusters tne number of clusters you want * @param clusterMap the pairs of cluster and its index as Map * @param maxIterations the number of maximal iterations - * @param maxRetries the number of maximum retries * @param seed a random seed */ class BisectingKMeans private ( From 3f6b14a7b2b901e1c479ec0c7fd64f3482809338 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 15 Jul 2015 15:30:46 +0900 Subject: [PATCH 31/76] Fix a typo and a few comments --- .../apache/spark/mllib/clustering/BisectingKMeans.scala | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 8fd14b2ffa0af..977cb14f7a3a0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -185,7 +185,7 @@ class BisectingKMeans private ( } /** - * Summarizes data by each cluster as ClusterTree classes + * Calculates criterions for building cluster tree */ private[clustering] def calcCriterions( @@ -193,11 +193,14 @@ class BisectingKMeans private ( stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { // TODO: support other criteria, such as entropy - calcAvgConsts(data, stats) + calcAvgCosts(data, stats) } + /** + * Calculates the average costs of each cluster + */ private[clustering] - def calcAvgConsts( + def calcAvgCosts( data: RDD[(BigInt, BV[Double])], stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { From 52b47049a12e819496d41ed5a96b396472f4ac18 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 29 Jul 2015 14:42:09 +0900 Subject: [PATCH 32/76] Add a new line above spark project classes --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 1 + 1 file changed, 1 insertion(+) diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 977cb14f7a3a0..fbdbbd9ef3c53 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -20,6 +20,7 @@ package org.apache.spark.mllib.clustering import scala.collection.{mutable, Map} import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, any => breezeAny} + import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.{Logging, SparkException} From fe87715d86c0f54cfb2422670b4a70c759473277 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 29 Jul 2015 14:43:47 +0900 Subject: [PATCH 33/76] Arrange the order of import statements --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- .../spark/mllib/clustering/BisectingKMeansModelSuite.scala | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index fbdbbd9ef3c53..49a237be037ca 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -21,9 +21,9 @@ import scala.collection.{mutable, Map} import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, any => breezeAny} +import org.apache.spark.{Logging, SparkException} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD -import org.apache.spark.{Logging, SparkException} object BisectingKMeans extends Logging { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index f4c4fe1cbfc4f..a7fafb67a76e4 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -17,10 +17,11 @@ package org.apache.spark.mllib.clustering +import org.scalatest.BeforeAndAfterEach + import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.scalatest.BeforeAndAfterEach class BisectingKMeansModelSuite extends SparkFunSuite with MLlibTestSparkContext with BeforeAndAfterEach { From eeef1e7b1ab09818e3b0389be24dc53af3ead32d Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 29 Jul 2015 15:15:58 +0900 Subject: [PATCH 34/76] Use `isDefined`, instead of `!= None` --- .../apache/spark/mllib/clustering/BisectingKMeansSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 95b4f59772481..04640a0fc97cc 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -58,7 +58,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model.node.getParent === None) assert(model.node.getChildren.head.getParent.get === model.node) 
assert(model.node.getChildren.apply(1).getParent.get === model.node) - assert(model.getClusters.forall(_.getParent != None)) + assert(model.getClusters.forall(_.getParent.isDefined)) } test("run with too many cluster size than the records") { From 3156dd7e0c4fc4f5d1b1116bb689592eee76ec25 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 21 Oct 2015 15:38:46 -0700 Subject: [PATCH 35/76] Update the bisecting k-means - Add the description about the algorithm and the criterion for build a cluster tree - Extract utility methods to `BisectingKMeans` object - Make `BisectingKMeans` `private[clustering]` - Add `@Since` tags to public methods, parameters and variables --- .../mllib/clustering/BisectingKMeans.scala | 551 ++++++++++-------- .../clustering/BisectingKMeansModel.scala | 20 +- .../clustering/JavaBisectingKMeansSuite.java | 6 +- .../BisectingKMeansModelSuite.scala | 10 +- .../clustering/BisectingKMeansSuite.scala | 192 +++--- 5 files changed, 416 insertions(+), 363 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 49a237be037ca..a44cbd32ac5fe 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -17,36 +17,16 @@ package org.apache.spark.mllib.clustering -import scala.collection.{mutable, Map} +import scala.collection.{Map, mutable} -import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, any => breezeAny} +import breeze.linalg.{SparseVector => BSV, Vector => BV, any => breezeAny, norm => breezeNorm} import org.apache.spark.{Logging, SparkException} +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD -object BisectingKMeans extends Logging { - - private[clustering] val ROOT_INDEX_KEY: BigInt = 1 - - /** - * Finds the closes cluster's center - * - * @param metric a distance metric - * @param centers centers of the clusters - * @param point a target point - * @return an index of the array of clusters - */ - private[mllib] - def findClosestCenter(metric: (BV[Double], BV[Double]) => Double) - (centers: Seq[BV[Double]])(point: BV[Double]): Int = { - val (closestCenter, closestIndex) = - centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1) - closestIndex - } -} - /** * This is a divisive hierarchical clustering algorithm based on bisecting k-means algorithm. * @@ -54,78 +34,109 @@ object BisectingKMeans extends Logging { * M. Steinbach, G. Karypis and V. Kumar. Workshop on Text Mining, KDD, 2000. * http://cs.fit.edu/~pkc/classes/ml-internet/papers/steinbach00tr.pdf * - * @param numClusters tne number of clusters you want + * However, we modified it to fit for Spark. This algorithm consists of the two main parts. + * + * 1. Split clusters until the number of clusters will be enough to build a cluster tree + * 2. Build a cluster tree as a binary tree by the splitted clusters + * + * First, it splits clusters to their children clusters step by step, not considering a cluster + * will be included in the final cluster tree or not. That's because it makes the algorithm more + * efficient on Spark and splitting a cluster one by one is very slow. It will keep splitting until + * the number of clusters will be enough to build a cluster tree. 
Otherwise, it will stop splitting + * when there are no dividable clusters before the number of clusters will be sufficient. And + * it calculates the criterions, such as average cost, entropy and so on, for building a cluster + * tree in the first part. The criterion means how large the cluster is. That is, the cluster + * whose criterion is maximum of all the clusters is the largest cluster. + * + * Second, it builds a cluster tree as a binary tree by the result of the first part. + * First of all, the cluster tree starts with only the root cluster which includes all points. + * So, there are two candidates which can be merged to the cluster tree. Those are the children of + * the root. Then, it picks up the larger child of the two and merge it to the cluster tree. + * After that, there are tree candidates to merge. Those are the smaller child of the root and + * the two children of the larger cluster of the root. It picks up the largest cluster of the tree + * and merge it to the * cluster tree. Like this, it continues to pick up the largest one of the + * candidates and merge it to the cluster tree until the desired number of clusters is reached. + * + * @param k tne desired number of clusters * @param clusterMap the pairs of cluster and its index as Map - * @param maxIterations the number of maximal iterations + * @param maxIterations the number of maximal iterations to split clusters * @param seed a random seed */ +@Since("1.6.0") class BisectingKMeans private ( - private var numClusters: Int, - private var clusterMap: Map[BigInt, ClusterNode], + private var k: Int, + private var clusterMap: Map[BigInt, BisectingClusterNode], private var maxIterations: Int, private var seed: Long) extends Logging { /** * Constructs with the default configuration */ - def this() = this(20, mutable.ListMap.empty[BigInt, ClusterNode], 20, 1) + @Since("1.6.0") + def this() = this(20, mutable.ListMap.empty[BigInt, BisectingClusterNode], 20, 1) /** * Sets the number of clusters you want */ - def setNumClusters(numClusters: Int): this.type = { - this.numClusters = numClusters + @Since("1.6.0") + def setK(k: Int): this.type = { + this.k = k this } - def getNumClusters: Int = this.numClusters + @Since("1.6.0") + def getK: Int = this.k /** * Sets the number of maximal iterations in each clustering step */ + @Since("1.6.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this } + @Since("1.6.0") def getMaxIterations: Int = this.maxIterations /** * Sets the random seed */ + @Since("1.6.0") def setSeed(seed: Long): this.type = { this.seed = seed this } + @Since("1.6.0") def getSeed: Long = this.seed /** - * Runs the bisecting kmeans algorithm + * Runs the bisecting k-means algorithm * @param input RDD of vectors * @return model for the bisecting kmeans */ + @Since("1.6.0") def run(input: RDD[Vector]): BisectingKMeansModel = { val sc = input.sparkContext - log.info(s"${sc.appName} starts a bisecting kmeans algorithm") - - var data = initData(input).cache() - val startTime = System.currentTimeMillis() // `clusterStats` is described as binary tree structure // `clusterStats(1)` means the root of a binary tree - var clusterStats = mutable.Map.empty[BigInt, ClusterNodeStat] + var clusterStats = mutable.Map.empty[BigInt, BisectingClusterStat] var step = 1 - var numDividedClusters = 0 var noMoreDividable = false var rddArray = Array.empty[RDD[(BigInt, BV[Double])]] // the number of maximum nodes of a binary tree by given parameter - val multiplier = 
math.ceil(math.log10(this.numClusters) / math.log10(2.0)) + 1 + val multiplier = math.ceil(math.log10(this.k) / math.log10(2.0)) + 1 val maxAllNodesInTree = math.pow(2, multiplier).toInt + // divide clusters until the number of clusters reachs the condition + // or there is no dividable cluster + val startTime = System.currentTimeMillis() + var data = BisectingKMeans.initData(input).cache() while (clusterStats.size < maxAllNodesInTree && noMoreDividable == false) { - log.info(s"${sc.appName} starts step ${step}") - val leafClusters = summarize(data) + logInfo(s"${sc.appName} starts step ${step}") + val leafClusters = BisectingKMeans.summarizeClusters(data) val dividableLeafClusters = leafClusters.filter(_._2.isDividable) clusterStats = clusterStats ++ leafClusters @@ -134,13 +145,12 @@ class BisectingKMeans private ( } else { // can be clustered if the number of divided clusterStats is equal to 0 - val divided = getDividedClusters(data, dividableLeafClusters) - + val divided = + BisectingKMeans.divideClusters(data, dividableLeafClusters, maxIterations) // update each index - val newData = updateClusterIndex(data, divided).cache() + val newData = BisectingKMeans.updateClusterIndex(data, divided).cache() rddArray = rddArray ++ Array(data) data = newData - // keep recent 2 cached RDDs in order to run more quickly if (rddArray.length > 1) { val head = rddArray.head @@ -149,96 +159,69 @@ class BisectingKMeans private ( } clusterStats = clusterStats ++ divided step += 1 - log.info(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") + logInfo(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") } } // unpersist kept RDDs rddArray.foreach(_.unpersist()) - - val nodes = calcCriterions(data, clusterStats) + // create a map of cluster node with their criterions + val nodes = BisectingKMeans.createClusterNodes(data, clusterStats) // build a cluster tree by Map class which is expressed - log.info(s"Building the cluster tree is started in ${sc.appName}") - val root = buildTree(nodes, BisectingKMeans.ROOT_INDEX_KEY, this.numClusters) + logInfo(s"Building the cluster tree is started in ${sc.appName}") + val root = BisectingKMeans.buildTree(nodes, BisectingKMeans.ROOT_INDEX_KEY, this.k) if (root.isEmpty) { new SparkException("Failed to build a cluster tree from a Map type of clusterStats") } // set the elapsed time for training val finishTime = (System.currentTimeMillis() - startTime) / 1000.0 - log.info(s"Elapsed Time for ${this.getClass.getSimpleName} Training: ${finishTime} [sec]") + logInfo(s"Elapsed Time for ${this.getClass.getSimpleName} Training: ${finishTime} [sec]") // make a bisecting kmeans model val model = new BisectingKMeansModel(root.get) val leavesNodes = model.getClusters - if (leavesNodes.length < this.numClusters) { - log.warn(s"# clusterStats is less than you want: ${leavesNodes.length} / ${numClusters}") + if (leavesNodes.length < this.k) { + logWarning(s"# clusters is less than you want: ${leavesNodes.length} / ${k}") } model } +} - /** - * Assigns the initial cluster index id to all data - */ - private[clustering] - def initData(data: RDD[Vector]): RDD[(BigInt, BV[Double])] = { - data.map { v: Vector => (BisectingKMeans.ROOT_INDEX_KEY, v.toBreeze)} - } - /** - * Calculates criterions for building cluster tree - */ - private[clustering] - def calcCriterions( - data: RDD[(BigInt, BV[Double])], - stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { +private[clustering] object BisectingKMeans { - // TODO: support other criteria, 
such as entropy - calcAvgCosts(data, stats) - } + val ROOT_INDEX_KEY: BigInt = 1 /** - * Calculates the average costs of each cluster + * Finds the closes cluster's center + * + * @param metric a distance metric + * @param centers centers of the clusters + * @param point a target point + * @return an index of the array of clusters */ - private[clustering] - def calcAvgCosts( - data: RDD[(BigInt, BV[Double])], - stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNode] = { - - val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.center }) - val costs = data.mapPartitions { iter => - val counters = mutable.Map.empty[BigInt, (Long, Double)] - bcCenters.value.foreach {case (i, center) => counters(i) = (0L, 0.0)} - iter.foreach { case (i, point) => - val cost = breezeNorm(bcCenters.value.apply(i) - point, 2.0) - counters(i) = (counters(i)._1 + 1, counters(i)._2 + cost) - } - counters.toIterator - }.reduceByKey { case((n1, cost1), (n2, cost2)) => - (n1 + n2, cost1 + cost2) - }.collectAsMap() - - stats.map { case (i, stat) => - val avgCost = costs(i)._1 match { - case x if x == 0.0 => 0.0 - case _ => costs(i)._2 / costs(i)._1 - } - i -> new ClusterNode(Vectors.fromBreeze(stat.center), stat.rows, avgCost) - } + def findClosestCenter(metric: (BV[Double], BV[Double]) => Double) + (centers: Seq[BV[Double]])(point: BV[Double]): Int = { + val (closestCenter, closestIndex) = + centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1) + closestIndex } /** * Summarizes data by each cluster as Map + * + * @param data pairs of point and its cluster index */ - private[clustering] - def summarize(data: RDD[(BigInt, BV[Double])]): Map[BigInt, ClusterNodeStat] = { + def summarizeClusters(data: RDD[(BigInt, BV[Double])]): Map[BigInt, BisectingClusterStat] = { + val stats = data.mapPartitions { iter => // calculate the accumulation of the all point in a partition and count the rows val map = mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] iter.foreach { case (idx: BigInt, point: BV[Double]) => // get a map value or else get a sparse vector val (sumBV, n, sumOfSquares) = map - .getOrElse(idx, (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size))) + .getOrElse(idx, (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size))) map(idx) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) } map.toIterator @@ -247,76 +230,152 @@ class BisectingKMeans private ( (sum1 + sum2, n1 + n2, sumOfSquares1 + sumOfSquares2) }.collect().toMap - stats.map {case (i, stat) => i -> new ClusterNodeStat(stat._2.toLong, stat._1, stat._3)} + stats.map {case (i, stat) => i -> new BisectingClusterStat(stat._2.toLong, stat._1, stat._3)} } /** - * Gets the new divided centers + * Assigns the initial cluster index id to all data */ - private[clustering] - def getDividedClusters(data: RDD[(BigInt, BV[Double])], - leafClusters: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNodeStat] = { - val sc = data.sparkContext - val appName = sc.appName + def initData(data: RDD[Vector]): RDD[(BigInt, BV[Double])] = { + data.map { v: Vector => (BisectingKMeans.ROOT_INDEX_KEY, v.toBreeze)} + } - // get keys of dividable clusters - val dividableClusters = leafClusters.filter { case (idx, cluster) => cluster.isDividable } - if (dividableClusters.isEmpty) { - log.info(s"There is no dividable clusters in ${appName}.") - return Map.empty[BigInt, ClusterNodeStat] - } + /** + * Gets the initial centers for bisect k-means + * + * @param data pairs of 
point and its cluster index + * @param stats pairs of cluster index and cluster statistics + */ + def initNextCenters( + data: RDD[(BigInt, BV[Double])], + stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BV[Double]] = { - // divide input data - val dividableData = data.filter { case (idx, point) => dividableClusters.contains(idx)} - divide(dividableData, dividableClusters) + // Since the combination sampleByKey and groupByKey is more expensive, + // this as follows would be better. + val bcIndeces = data.sparkContext.broadcast(stats.keySet) + val samples = data.mapPartitions { iter => + val map = mutable.Map.empty[BigInt, mutable.ArrayBuffer[BV[Double]]] + + bcIndeces.value.foreach {i => map(i) = mutable.ArrayBuffer.empty[BV[Double]]} + val LOCAL_SAMPLE_SIZE = 100 + iter.foreach { case (i, point) => + map(i).append(point) + // to avoid to increase the memory usage on each map thread, + // the number of elements is cut off at the right time. + if (map(i).size > LOCAL_SAMPLE_SIZE) { + val elements = map(i).sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) + map(i) = mutable.ArrayBuffer(elements.head, elements.last) + } + } + + // in order to reduce the shuffle size, take only two elements + map.filterNot(_._2.isEmpty).map { case (i, points) => + val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) + i -> mutable.ArrayBuffer(elements.head, elements.last) + }.toIterator + }.reduceByKey { case (points1, points2) => + points1.union(points2) + }.collect() + + val nextCenters = samples.flatMap { case (i, points) => + val elements = points.toSeq.sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) + Array((2 * i, elements.head), (2 * i + 1, elements.last)) + }.toMap + if (!stats.keySet.flatMap(idx => Array(2 * idx, 2 * idx + 1)).forall(nextCenters.contains(_))) { + throw new SparkException("Failed to initialize centers for next step") + } + nextCenters } /** - * Divides the input data + * Updates the indexes of clusters which is divided to its children indexes * - * @param data the pairs of cluster index and point which you want to divide - * @param currentStats the cluster stats you want to divide AS a Map class - * @return divided clusters as Map + * @param data pairs of point and its cluster index + * @param dividedClusters pairs of cluster index and cluster statistics */ - private[clustering] - def divide( + def updateClusterIndex( data: RDD[(BigInt, BV[Double])], - currentStats: Map[BigInt, ClusterNodeStat]): Map[BigInt, ClusterNodeStat] = { + dividedClusters: Map[BigInt, BisectingClusterStat]): RDD[(BigInt, BV[Double])] = { + // extract the centers of the clusters val sc = data.sparkContext - var newCenters = initChildCenters(data, currentStats) - var bcNewCenters = sc.broadcast(newCenters) + var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} + val bcCenters = sc.broadcast(centers) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val bcMetric = sc.broadcast(metric) - val vectorSize = newCenters(newCenters.keySet.min).size + // update the indexes to their children indexes + data.map { case (idx, point) => + val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(c => bcCenters.value.contains(c)) + childrenIndexes.length match { + // stay the index if the number of children is not enough + case s if s < 2 => (idx, point) + // update the indexes + case _ => { + val nextCenters = 
childrenIndexes.map(bcCenters.value(_)) + val closestIndex = BisectingKMeans + .findClosestCenter(bcMetric.value)(nextCenters)(point) + val nextIndex = 2 * idx + closestIndex + (nextIndex, point) + } + } + } + } + + /** + * Divides clusters according to their statistics + * + * @param data pairs of point and its cluster index + * @param targetClusters target clusters to divide + * @param maxIterations the maximum iterations to calculate clusters statistics + */ + def divideClusters( + data: RDD[(BigInt, BV[Double])], + targetClusters: Map[BigInt, BisectingClusterStat], + maxIterations: Int): Map[BigInt, BisectingClusterStat] = { + val sc = data.sparkContext + val appName = sc.appName + + // get keys of dividable clusters + val dividableClusters = targetClusters.filter { case (idx, cluster) => cluster.isDividable } + if (dividableClusters.isEmpty) { + return Map.empty[BigInt, BisectingClusterStat] + } + // extract dividable input data + val dividableData = data.filter { case (idx, point) => dividableClusters.contains(idx)} + + var newCenters = BisectingKMeans.initNextCenters(dividableData, dividableClusters) + var bcNewCenters = sc.broadcast(newCenters) + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val bcMetric = sc.broadcast(metric) var stats = Map.empty[BigInt, (BV[Double], Double, BV[Double])] var subIter = 0 - var diffVariances = Double.MaxValue - var oldVariances = Double.MaxValue - var variances = Double.MaxValue - while (subIter < this.maxIterations && diffVariances > 10E-4) { + var totalStd = Double.MaxValue + var oldTotalStd = Double.MaxValue + var relativeError = Double.MaxValue + while (subIter < maxIterations && relativeError > 10E-4) { // calculate summary of each cluster - val eachStats = data.mapPartitions { iter => + val eachStats = dividableData.mapPartitions { iter => val map = mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] iter.foreach { case (idx, point) => // calculate next index number val childrenCenters = Array(2 * idx, 2 * idx + 1) - .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) - if (childrenCenters.length >= 1) { + .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) + if (childrenCenters.length == 2) { val closestIndex = BisectingKMeans.findClosestCenter(bcMetric.value)(childrenCenters)(point) val nextIndex = 2 * idx + closestIndex // get a map value or else get a sparse vector val (sumBV, n, sumOfSquares) = map - .getOrElse( - nextIndex, - (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) - ) + .getOrElse( + nextIndex, + (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + ) map(nextIndex) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) } } @@ -333,56 +392,60 @@ class BisectingKMeans private ( // update summary of each cluster stats = eachStats.toMap - variances = stats.map { case (idx, (sum, n, sumOfSquares)) => - math.pow(sumOfSquares.toArray.sum, 1.0 / sumOfSquares.size) + totalStd = stats.map { case (idx, (sum, n, sumOfSquares)) => + sum.toArray.zip(sumOfSquares.toArray).map { case (s, ss) => + math.pow(ss / n - math.pow(s / n, 2), 2.0) + }.sum }.sum - diffVariances = math.abs(oldVariances - variances) / oldVariances - oldVariances = variances + relativeError = math.abs(oldTotalStd - totalStd) / totalStd + oldTotalStd = totalStd subIter += 1 } - - stats.map { case (i, stat) => i -> new ClusterNodeStat(stat._2.toLong, stat._1, stat._3) } + stats.map { 
case (i, stat) => i -> new BisectingClusterStat(stat._2.toLong, stat._1, stat._3) } } /** - * Gets the initial centers for bisect k-means + * Creates the map of cluster stats to the map of cluster nodes with their criterions + * + * @param data input data + * @param stats map of cluster stats which is described as a binary tree */ - private[clustering] - def initChildCenters( + def createClusterNodes( data: RDD[(BigInt, BV[Double])], - stats: Map[BigInt, ClusterNodeStat]): Map[BigInt, BV[Double]] = { + stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BisectingClusterNode] = { - // Since the combination sampleByKey and groupByKey is more expensive, - // this as follows would be better. - val bcIndeces = data.sparkContext.broadcast(stats.keySet) - val samples = data.mapPartitions { iter => - val map = mutable.Map.empty[BigInt, mutable.ArrayBuffer[BV[Double]]] + // TODO: support other criterion, such as entropy + createClusterNodesWithAverageCost(data, stats) + } - bcIndeces.value.foreach {i => map(i) = mutable.ArrayBuffer.empty[BV[Double]]} - val LOCAL_SAMPLE_SIZE = 20 + /** + * Creates the map of cluster stats to the map of cluster nodes with their average costs + */ + private def createClusterNodesWithAverageCost( + data: RDD[(BigInt, BV[Double])], + stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BisectingClusterNode] = { + + // calculate average costs of all clusters + val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.center }) + val costs = data.mapPartitions { iter => + val counters = mutable.Map.empty[BigInt, (Long, Double)] + bcCenters.value.foreach {case (i, center) => counters(i) = (0L, 0.0)} iter.foreach { case (i, point) => - map(i).append(point) - // to avoid to increase the memory usage on each map thread, - // the number of elements is cut off at the right time. 
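// Purely illustrative side note (not part of this patch): the same per-cluster average
// cost could also be written with map + reduceByKey instead of a hand-rolled mutable map
// per partition. `data` and `centers` are assumed to have the shapes used above
// (an RDD[(BigInt, BV[Double])] of (cluster index, point) pairs and a map of cluster
// centers); the helper name `averageCosts` is invented for this sketch.
import breeze.linalg.{Vector => BV, norm => breezeNorm}
import org.apache.spark.rdd.RDD

def averageCosts(
    data: RDD[(BigInt, BV[Double])],
    centers: Map[BigInt, BV[Double]]): scala.collection.Map[BigInt, Double] = {
  val bcCenters = data.sparkContext.broadcast(centers)
  data
    // one (count, distance to own center) record per point
    .map { case (i, point) => (i, (1L, breezeNorm(bcCenters.value(i) - point, 2.0))) }
    .reduceByKey { case ((n1, d1), (n2, d2)) => (n1 + n2, d1 + d2) }
    .mapValues { case (n, total) => if (n == 0L) 0.0 else total / n }
    .collectAsMap()
}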
- if (map(i).size > LOCAL_SAMPLE_SIZE) { - val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 1.0) < breezeNorm(b, 1.0)) - map(i) = mutable.ArrayBuffer(elements.head, elements.last) - } + val cost = breezeNorm(bcCenters.value.apply(i) - point, 2.0) + counters(i) = (counters(i)._1 + 1, counters(i)._2 + cost) } + counters.toIterator + }.reduceByKey { case((n1, cost1), (n2, cost2)) => + (n1 + n2, cost1 + cost2) + }.collectAsMap() - // in order to reduce the shuffle size, take only two elements - map.filterNot(_._2.isEmpty).map { case (i, points) => - val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 1.0) < breezeNorm(b, 1.0)) - i -> mutable.ArrayBuffer(elements.head, elements.last) - }.toIterator - }.reduceByKey { case (points1, points2) => - points1.union(points2) - }.collect() - - samples.flatMap { case (i, points) => - val elements = points.toSeq.sortWith((a, b) => breezeNorm(a, 1.0) < breezeNorm(b, 1.0)) - Array((2 * i, elements.head), (2 * i + 1, elements.last)) - }.toMap + stats.map { case (i, stat) => + val avgCost = costs(i)._1 match { + case x if x == 0.0 => 0.0 + case _ => costs(i)._2 / costs(i)._1 + } + i -> new BisectingClusterNode(Vectors.fromBreeze(stat.center), stat.rows, avgCost) + } } /** @@ -393,11 +456,10 @@ class BisectingKMeans private ( * @param numClusters the number of clusters you want * @return a built cluster tree */ - private[clustering] - def buildTree( - treeMap: Map[BigInt, ClusterNode], + private def buildTree( + treeMap: Map[BigInt, BisectingClusterNode], rootIndex: BigInt, - numClusters: Int): Option[ClusterNode] = { + numClusters: Int): Option[BisectingClusterNode] = { // if there is no index in the Map if (!treeMap.contains(rootIndex)) return None @@ -407,7 +469,7 @@ class BisectingKMeans private ( val root = treeMap(rootIndex) var leavesQueue = Map(rootIndex -> root) while (leavesQueue.nonEmpty && numLeavesClusters < numClusters) { - // pick up the cluster whose variance is the maximum in the queue + // pick up the largest cluster by the maximum criterion of all the clusters val mostScattered = leavesQueue.maxBy(_._2.criterion) val mostScatteredKey = mostScattered._1 val mostScatteredCluster = mostScattered._2 @@ -423,7 +485,7 @@ class BisectingKMeans private ( // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val localHeight = children - .map(child => metric(child.center.toBreeze, mostScatteredCluster.center.toBreeze)).max + .map(child => metric(child.center.toBreeze, mostScatteredCluster.center.toBreeze)).max mostScatteredCluster.setLocalHeight(localHeight) // update the queue @@ -433,63 +495,9 @@ class BisectingKMeans private ( // remove the cluster which is involved to the cluster tree leavesQueue = leavesQueue.filterNot(_ == mostScattered) - - log.info(s"Total Leaves Clusters: ${numLeavesClusters} / ${numClusters}. 
" + - s"Cluster ${childrenIndexes.mkString(",")} are merged.") } Some(root) } - - /** - * Updates the indexes of clusters which is divided to its children indexes - */ - private[clustering] - def updateClusterIndex( - data: RDD[(BigInt, BV[Double])], - dividedClusters: Map[BigInt, ClusterNodeStat]): RDD[(BigInt, BV[Double])] = { - - // extract the centers of the clusters - val sc = data.sparkContext - var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} - val bcCenters = sc.broadcast(centers) - - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val bcMetric = sc.broadcast(metric) - - // update the indexes to their children indexes - data.map { case (idx, point) => - val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(c => bcCenters.value.contains(c)) - childrenIndexes.length match { - // stay the index if the number of children is not enough - case s if s < 2 => (idx, point) - // update the indexes - case _ => { - val nextCenters = childrenIndexes.map(bcCenters.value(_)) - val closestIndex = BisectingKMeans - .findClosestCenter(bcMetric.value)(nextCenters)(point) - val nextIndex = 2 * idx + closestIndex - (nextIndex, point) - } - } - } - } -} - -private[this] -case class ClusterNodeStat ( - rows: Long, - sums: BV[Double], - sumOfSquares: BV[Double]) extends Serializable { - - // initialization - val center: BV[Double] = sums :/ rows.toDouble - val variances: BV[Double] = rows match { - case n if n > 1 => sumOfSquares.:/(n.toDouble) - (sums :* sums).:/(n.toDouble * n.toDouble) - case _ => BV.zeros[Double](sums.size) - } - - def isDividable: Boolean = breezeAny(variances) && rows >= 2 } /** @@ -497,30 +505,33 @@ case class ClusterNodeStat ( * * @param center the center of the cluster * @param rows the number of rows in the cluster - * @param criterion the norm of variance vector + * @param criterion how large a cluster is * @param localHeight the maximal distance between this node and its children * @param parent the parent cluster of the cluster * @param children the children nodes of the cluster */ -class ClusterNode private ( - val center: Vector, - val rows: Long, - val criterion: Double, +@Since("1.6.0") +class BisectingClusterNode private ( + @Since("1.6.0") val center: Vector, + @Since("1.6.0") val rows: Long, + @Since("1.6.0") val criterion: Double, private var localHeight: Double, - private var parent: Option[ClusterNode], - private var children: Seq[ClusterNode]) extends Serializable { + private var parent: Option[BisectingClusterNode], + private var children: Seq[BisectingClusterNode]) extends Serializable { require(!criterion.isNaN) + @Since("1.6.0") def this(center: Vector, rows: Long, criterion: Double) = - this(center, rows, criterion, 0.0, None, Array.empty[ClusterNode]) + this(center, rows, criterion, 0.0, None, Array.empty[BisectingClusterNode]) /** * Inserts a sub node as its child * * @param child inserted sub node */ - def insert(child: ClusterNode) { + @Since("1.6.0") + def insert(child: BisectingClusterNode) { insert(Array(child)) } @@ -529,7 +540,8 @@ class ClusterNode private ( * * @param children inserted sub nodes */ - def insert(children: Array[ClusterNode]) { + @Since("1.6.0") + def insert(children: Array[BisectingClusterNode]) { this.children = this.children ++ children children.foreach(child => child.parent = Some(this)) } @@ -540,7 +552,8 @@ class ClusterNode private ( * * @return an Array class which the cluster tree is expanded 
*/ - def toArray: Array[ClusterNode] = { + @Since("1.6.0") + def toArray: Array[BisectingClusterNode] = { val array = this.children.size match { case 0 => Array(this) case _ => Array(this) ++ this.children.flatMap(child => child.toArray.toIterator) @@ -555,6 +568,7 @@ class ClusterNode private ( * * @return the depth from the root */ + @Since("1.6.0") def getDepth: Int = { this.parent match { case None => 0 @@ -565,15 +579,19 @@ class ClusterNode private ( /** * Gets the leaves nodes in the cluster tree */ - def getLeavesNodes: Array[ClusterNode] = { + @Since("1.6.0") + def getLeavesNodes: Array[BisectingClusterNode] = { this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) } + @Since("1.6.0") def isLeaf: Boolean = this.children.isEmpty - def getParent: Option[ClusterNode] = this.parent + @Since("1.6.0") + def getParent: Option[BisectingClusterNode] = this.parent - def getChildren: Seq[ClusterNode] = this.children + @Since("1.6.0") + def getChildren: Seq[BisectingClusterNode] = this.children /** * Gets the dendrogram height of the cluster at the cluster tree. @@ -583,6 +601,7 @@ class ClusterNode private ( * * @return the dendrogram height */ + @Since("1.6.0") def getHeight: Double = { this.children.size match { case 0 => 0.0 @@ -590,14 +609,15 @@ class ClusterNode private ( } } - private[mllib] - def setLocalHeight(height: Double) = this.localHeight = height + @Since("1.6.0") + def setLocalHeight(height: Double): Unit = this.localHeight = height /** * Converts to an adjacency list * * @return List[(fromNodeId, toNodeId, distance)] */ + @Since("1.6.0") def toAdjacencyList: Array[(Int, Int, Double)] = { val nodes = toArray @@ -620,6 +640,7 @@ class ClusterNode private ( * * @return List[(node1, node2, distance, tree size)] */ + @Since("1.6.0") def toLinkageMatrix: Array[(Int, Int, Double, Int)] = { val nodes = toArray.sortWith { case (a, b) => a.getHeight < b.getHeight} val leaves = nodes.filter(_.isLeaf) @@ -630,7 +651,7 @@ class ClusterNode private ( // If a node only has one-child, the child is regarded as the cluster of the child. // Cluster A has cluster B and Cluster B. B is a leaf. C only has cluster D. // ==> A merge list is (B, D), not (B, C). 
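// A tiny self-contained illustration of the single-child collapsing rule described in the
// comment above, using a throwaway Node type rather than the BisectingClusterNode API
// (the real getIndex is a local helper inside toLinkageMatrix and is not public):
case class Node(id: String, children: Seq[Node])

def collapse(leafIndex: Map[Node, Int], node: Node): Int = node.children.size match {
  case 1 => collapse(leafIndex, node.children.head) // fall through single-child chains
  case _ => leafIndex(node)                         // anything else is looked up directly
}

val b = Node("B", Nil)
val d = Node("D", Nil)
val c = Node("C", Seq(d))                     // C only has one child, D
assert(collapse(Map(b -> 0, d -> 1), c) == 1) // a merge with C is recorded against D's index
assert(collapse(Map(b -> 0, d -> 1), b) == 0) // a leaf maps to itself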
- def getIndex(map: Map[ClusterNode, Int], node: ClusterNode): Int = { + def getIndex(map: Map[BisectingClusterNode, Int], node: BisectingClusterNode): Int = { node.children.size match { case 1 => getIndex(map, node.children.head) case _ => map(node) @@ -638,9 +659,33 @@ class ClusterNode private ( } clusters.filterNot(_.isLeaf).map { node => (getIndex(treeMap, node.children.head), - getIndex(treeMap, node.children(1)), - node.getHeight, - node.toArray.filter(_.isLeaf).size) + getIndex(treeMap, node.children(1)), + node.getHeight, + node.toArray.filter(_.isLeaf).length) } } } + + +/** + * This class is used for maneging a cluster statistics + * + * @param rows the number of points + * @param sums the sum of points + * @param sumOfSquares the sum of squares of points + */ +private[clustering] case class BisectingClusterStat ( + rows: Long, + sums: BV[Double], + sumOfSquares: BV[Double]) extends Serializable { + + // initialization + val center: BV[Double] = sums :/ rows.toDouble + val variances: BV[Double] = rows match { + case n if n > 1 => sumOfSquares.:/(n.toDouble) - (sums :* sums).:/(n.toDouble * n.toDouble) + case _ => BV.zeros[Double](sums.size) + } + + def isDividable: Boolean = breezeAny(variances) && rows >= 2 +} + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 2c257caced02e..b524ae6b370e3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -18,7 +18,9 @@ package org.apache.spark.mllib.clustering import breeze.linalg.{Vector => BV, norm => breezeNorm} + import org.apache.spark.Logging +import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD @@ -28,15 +30,21 @@ import org.apache.spark.rdd.RDD * * @param node a cluster as a tree node */ -class BisectingKMeansModel(val node: ClusterNode) extends Serializable with Logging { +@Since("1.6.0") +class BisectingKMeansModel @Since("1.6.0") ( + @Since("1.6.0") val node: BisectingClusterNode + ) extends Serializable with Logging { - def getClusters: Array[ClusterNode] = this.node.getLeavesNodes + @Since("1.6.0") + def getClusters: Array[BisectingClusterNode] = this.node.getLeavesNodes + @Since("1.6.0") def getCenters: Array[Vector] = this.getClusters.map(_.center) /** * Predicts the closest cluster by one point */ + @Since("1.6.0") def predict(vector: Vector): Int = { // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) @@ -48,6 +56,7 @@ class BisectingKMeansModel(val node: ClusterNode) extends Serializable with Logg /** * Predicts the closest cluster by RDD of the points */ + @Since("1.6.0") def predict(data: RDD[Vector]): RDD[Int] = { val sc = data.sparkContext @@ -65,12 +74,14 @@ class BisectingKMeansModel(val node: ClusterNode) extends Serializable with Logg /** * Predicts the closest cluster by RDD of the points for Java */ + @Since("1.6.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] /** * Computes Within Set Sum of Squared Error(WSSSE) */ + @Since("1.6.0") def WSSSE(data: RDD[Vector]): Double = { val bvCenters = this.getCenters.map(_.toBreeze) data.context.broadcast(bvCenters) @@ 
-85,11 +96,14 @@ class BisectingKMeansModel(val node: ClusterNode) extends Serializable with Logg distances.sum() } + @Since("1.6.0") def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) + @Since("1.6.0") def toAdjacencyList: Array[(Int, Int, Double)] = this.node.toAdjacencyList /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ + @Since("1.6.0") def toJavaAdjacencyList: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() this.node.toAdjacencyList.foreach { x => @@ -102,9 +116,11 @@ class BisectingKMeansModel(val node: ClusterNode) extends Serializable with Logg javaList } + @Since("1.6.0") def toLinkageMatrix: Array[(Int, Int, Double, Int)] = this.node.toLinkageMatrix /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ + @Since("1.6.0") def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() this.node.toLinkageMatrix.foreach {x => diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java index 75daf4c26f93b..913ca9ac6169a 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java @@ -57,7 +57,7 @@ public void runWithSmallData() { Vector expectedCenter = Vectors.dense(1.0, 3.0, 4.0); JavaRDD data = sc.parallelize(points, 2); - BisectingKMeans algo = new BisectingKMeans().setNumClusters(1); + BisectingKMeans algo = new BisectingKMeans().setK(1); BisectingKMeansModel model = algo.run(data.rdd()); assertEquals(1, model.getCenters().length); assertEquals(expectedCenter, model.getCenters()[0]); @@ -73,7 +73,7 @@ public void runWithDenseVectors() { points.add(point); } JavaRDD data = sc.parallelize(points, 2); - BisectingKMeans algo = new BisectingKMeans().setNumClusters(numClusters); + BisectingKMeans algo = new BisectingKMeans().setK(numClusters); BisectingKMeansModel model = algo.run(data.rdd()); Vector[] centers = model.getCenters(); assertEquals(numClusters, centers.length); @@ -103,7 +103,7 @@ public void runWithSparseVectors() { points.add(point); } JavaRDD data = sc.parallelize(points, 2); - BisectingKMeans algo = new BisectingKMeans().setNumClusters(numClusters); + BisectingKMeans algo = new BisectingKMeans().setK(numClusters); BisectingKMeansModel model = algo.run(data.rdd()); Vector[] centers = model.getCenters(); assertEquals(numClusters, centers.length); diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index a7fafb67a76e4..c8e077ecd1413 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -27,7 +27,7 @@ class BisectingKMeansModelSuite extends SparkFunSuite with MLlibTestSparkContext with BeforeAndAfterEach { test("clustering dense vectors") { - val app = new BisectingKMeans().setNumClusters(5).setSeed(1) + val app = new BisectingKMeans().setK(5).setSeed(1) val localData = (1 to 100).toSeq.map { i => val label = i % 5 @@ -38,7 +38,7 @@ class 
BisectingKMeansModelSuite val model = app.run(data) val clusters = model.getClusters - assert(clusters.isInstanceOf[Array[ClusterNode]]) + assert(clusters.isInstanceOf[Array[BisectingClusterNode]]) assert(clusters.length === 5) val centers = model.getCenters.sortBy(_.toArray.sum) @@ -88,7 +88,7 @@ class BisectingKMeansModelSuite } test("clustering sparse vectors") { - val app = new BisectingKMeans().setNumClusters(5).setSeed(1) + val app = new BisectingKMeans().setK(5).setSeed(1) val localData = (1 to 100).toSeq.map { i => val label = i % 5 @@ -99,7 +99,7 @@ class BisectingKMeansModelSuite val model = app.run(data) val clusters = model.getClusters - assert(clusters.isInstanceOf[Array[ClusterNode]]) + assert(clusters.isInstanceOf[Array[BisectingClusterNode]]) assert(clusters.length === 5) val centers = model.getCenters.sortBy(_.toArray.sum) @@ -149,7 +149,7 @@ class BisectingKMeansModelSuite test("clustering should be done correctly") { for (numClusters <- Array(9, 19)) { - val app = new BisectingKMeans().setNumClusters(numClusters).setSeed(1) + val app = new BisectingKMeans().setK(numClusters).setSeed(1) val localData = (1 to 19).toSeq.map { i => val label = i % numClusters val sparseVector = Vectors.sparse(numClusters, Seq((label, label.toDouble))) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 04640a0fc97cc..43d5f5a2d3c63 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -27,27 +27,8 @@ import org.apache.spark.mllib.util.TestingUtils._ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { - test("the root index is equal to 1") { - assert(BisectingKMeans.ROOT_INDEX_KEY === 1) - } - - test("findClosestCenter") { - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val centers = Seq( - Vectors.sparse(5, Array(0, 1, 2), Array(0.0, 1.0, 2.0)).toBreeze, - Vectors.sparse(5, Array(1, 2, 3), Array(1.0, 2.0, 3.0)).toBreeze, - Vectors.sparse(5, Array(2, 3, 4), Array(2.0, 3.0, 4.0)).toBreeze - ) - - for (i <- 0 to (centers.size - 1)) { - val point = centers(i) - val closestIndex = BisectingKMeans.findClosestCenter(metric)(centers)(point) - assert(closestIndex === i) - } - } - test("run") { - val algo = new BisectingKMeans().setNumClusters(123).setSeed(1) + val algo = new BisectingKMeans().setK(123).setSeed(1) val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) @@ -62,7 +43,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { } test("run with too many cluster size than the records") { - val algo = new BisectingKMeans().setNumClusters(123).setSeed(1) + val algo = new BisectingKMeans().setK(123).setSeed(1) val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed) val model = algo.run(data) @@ -70,79 +51,73 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model.node.getHeight ~== 72.12489 absTol 10E-4) } - test("initializeData") { - val algo = new BisectingKMeans - val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val seed = sc.parallelize(localSeed) - val data = algo.initData(seed) - 
assert(data.map(_._1).collect().distinct === Array(1)) + test("setNumClusters") { + val algo = new BisectingKMeans() + assert(algo.getK == 20) + algo.setK(1000) + assert(algo.getK == 1000) } - test("get center stats") { - val algo = new BisectingKMeans - val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val seed = sc.parallelize(localSeed) - val data = algo.initData(seed) - - val clusters = algo.summarize(data) - assert(clusters.size === 1) - assert(clusters(1).center === Vectors.dense(49.5, 49.5).toBreeze) - assert(clusters(1).rows === 100) - - val data2 = seed.map(v => (BigInt((v.apply(0) / 25).toInt + 1), v.toBreeze)) - val clusters2 = algo.summarize(data2) - assert(clusters2.size === 4) - assert(clusters2(1).center === Vectors.dense(12.0, 12.0).toBreeze) - assert(clusters2(1).rows === 25) - assert(clusters2(2).center === Vectors.dense(37.0, 37.0).toBreeze) - assert(clusters2(2).rows === 25) - assert(clusters2(3).center === Vectors.dense(62.0, 62.0).toBreeze) - assert(clusters2(3).rows === 25) - assert(clusters2(4).center === Vectors.dense(87.0, 87.0).toBreeze) - assert(clusters2(4).rows === 25) + test("setSubIterations") { + val algo = new BisectingKMeans() + assert(algo.getMaxIterations == 20) + algo.setMaxIterations(15) + assert(algo.getMaxIterations == 15) } - test("getChildrenCenter") { + test("setSeed") { + val algo = new BisectingKMeans() + assert(algo.getSeed == 1) + algo.setSeed(987) + assert(algo.getSeed == 987) + } + + test("summarize center stats") { val algo = new BisectingKMeans val local = Seq( - (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), - (BigInt(3), BV[Double](1.9, 1.9)), (BigInt(3), BV[Double](2.1, 2.1)) + (BigInt(4), Vectors.dense(1.5, 1.5).toBreeze), + (BigInt(4), Vectors.dense(2.5, 2.5).toBreeze), + (BigInt(5), Vectors.dense(11.5, 11.5).toBreeze), + (BigInt(5), Vectors.dense(12.5, 12.5).toBreeze), + (BigInt(6), Vectors.dense(21.5, 21.5).toBreeze), + (BigInt(6), Vectors.dense(22.5, 22.5).toBreeze), + (BigInt(7), Vectors.dense(31.5, 31.5).toBreeze), + (BigInt(7), Vectors.dense(32.5, 32.5).toBreeze) ) val data = sc.parallelize(local) - val stats = Map[BigInt, ClusterNodeStat]( - BigInt(2) -> new ClusterNodeStat(2, BV[Double](1.0, 1.0) * 2.0, BV.zeros[Double](2)), - BigInt(3) -> new ClusterNodeStat(2, BV[Double](2.0, 2.0) * 2.0, BV.zeros[Double](2)) - ) - val initNextCenters = algo.initChildCenters(data, stats) - assert(initNextCenters.size === 4) - assert(initNextCenters.keySet === Set(4, 5, 6, 7)) + + val clusters = BisectingKMeans.summarizeClusters(data) + assert(clusters.size === 4) + assert(clusters(4).center === Vectors.dense(2.0, 2.0).toBreeze) + assert(clusters(4).variances === Vectors.dense(0.25, 0.25).toBreeze) + assert(clusters(4).rows === 2) + assert(clusters(5).center === Vectors.dense(12.0, 12.0).toBreeze) + assert(clusters(5).variances === Vectors.dense(0.25, 0.25).toBreeze) + assert(clusters(5).rows === 2) + assert(clusters(6).center === Vectors.dense(22.0, 22.0).toBreeze) + assert(clusters(6).variances === Vectors.dense(0.25, 0.25).toBreeze) + assert(clusters(6).rows === 2) + assert(clusters(7).center === Vectors.dense(32.0, 32.0).toBreeze) + assert(clusters(7).variances === Vectors.dense(0.25, 0.25).toBreeze) + assert(clusters(7).rows === 2) } - test("should divide clusters") { - val algo = new BisectingKMeans().setSeed(5) + test("initialize centers at next step") { val local = Seq( (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), - (BigInt(2), BV[Double](9.9, 
9.9)), (BigInt(2), BV[Double](10.1, 10.1)), - (BigInt(3), BV[Double](99.9, 99.9)), (BigInt(3), BV[Double](100.1, 100.1)), - (BigInt(3), BV[Double](109.9, 109.9)), (BigInt(3), BV[Double](110.1, 110.1)) + (BigInt(3), BV[Double](1.9, 1.9)), (BigInt(3), BV[Double](2.1, 2.1)) ) val data = sc.parallelize(local) - val stats = algo.summarize(data) - val newClusters = algo.getDividedClusters(data, stats) - - assert(newClusters.size === 4) - assert(newClusters(4).center === BV[Double](1.0, 1.0)) - assert(newClusters(4).rows === 2) - assert(newClusters(5).center === BV[Double](10.0, 10.0)) - assert(newClusters(5).rows === 2) - assert(newClusters(6).center === BV[Double](100.0, 100.0)) - assert(newClusters(6).rows === 2) - assert(newClusters(7).center === BV[Double](110.0, 110.0)) - assert(newClusters(7).rows === 2) + val stats = Map[BigInt, BisectingClusterStat]( + BigInt(2) -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, BV.zeros[Double](2)), + BigInt(3) -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, BV.zeros[Double](2)) + ) + val initNextCenters = BisectingKMeans.initNextCenters(data, stats) + assert(initNextCenters.size === 4) + assert(initNextCenters.keySet === Set(4, 5, 6, 7)) } test("should assign each data to new clusters") { - val algo = new BisectingKMeans val seed = Seq( (BigInt(2), Vectors.dense(0.0, 0.0)), (BigInt(2), Vectors.dense(1.0, 1.0)), (BigInt(2), Vectors.dense(2.0, 2.0)), (BigInt(2), Vectors.dense(3.0, 3.0)), @@ -150,43 +125,60 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { (BigInt(3), Vectors.dense(6.0, 6.0)), (BigInt(3), Vectors.dense(7.0, 7.0)), (BigInt(3), Vectors.dense(8.0, 8.0)), (BigInt(3), Vectors.dense(9.0, 9.0)), (BigInt(3), Vectors.dense(10.0, 10.0)), (BigInt(3), Vectors.dense(11.0, 11.0)) - ).map { case (idx, vector) => (idx, vector.toBreeze)} + ).map { case (idx, vector) => (idx, vector.toBreeze) } val newClusters = Map( - BigInt(4) -> new ClusterNodeStat(3L, BV[Double](1.0, 1.0) :* 3.0, BV[Double](1.0, 1.0)), - BigInt(5) -> new ClusterNodeStat(3L, BV[Double](4.0, 4.0) :* 3.0, BV[Double](1.0, 1.0)), - BigInt(6) -> new ClusterNodeStat(3L, BV[Double](7.0, 7.0) :* 3.0, BV[Double](1.0, 1.0)), - BigInt(7) -> new ClusterNodeStat(3L, BV[Double](10.0, 10.0) :* 3.0, BV[Double](1.0, 1.0)) + BigInt(4) -> new BisectingClusterStat(3L, BV[Double](1.0, 1.0) :* 3.0, BV[Double](1.0, 1.0)), + BigInt(5) -> new BisectingClusterStat(3L, BV[Double](4.0, 4.0) :* 3.0, BV[Double](1.0, 1.0)), + BigInt(6) -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, BV[Double](1.0, 1.0)), + BigInt(7) -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, BV[Double](1.0, 1.0)) ) val data = sc.parallelize(seed) - val result = algo.updateClusterIndex(data, newClusters).collect().toSeq + val result = BisectingKMeans.updateClusterIndex(data, newClusters).collect().toSeq val expected = Seq( (4, Vectors.dense(0.0, 0.0)), (4, Vectors.dense(1.0, 1.0)), (4, Vectors.dense(2.0, 2.0)), (5, Vectors.dense(3.0, 3.0)), (5, Vectors.dense(4.0, 4.0)), (5, Vectors.dense(5.0, 5.0)), (6, Vectors.dense(6.0, 6.0)), (6, Vectors.dense(7.0, 7.0)), (6, Vectors.dense(8.0, 8.0)), (7, Vectors.dense(9.0, 9.0)), (7, Vectors.dense(10.0, 10.0)), (7, Vectors.dense(11.0, 11.0)) - ).map { case (idx, vector) => (idx, vector.toBreeze)} + ).map { case (idx, vector) => (idx, vector.toBreeze) } assert(result === expected) } - test("setNumClusters") { - val algo = new BisectingKMeans() - assert(algo.getNumClusters == 20) - algo.setNumClusters(1000) - 
assert(algo.getNumClusters == 1000) - } + test("findClosestCenter") { + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val centers = Seq( + Vectors.sparse(5, Array(0, 1, 2), Array(0.0, 1.0, 2.0)).toBreeze, + Vectors.sparse(5, Array(1, 2, 3), Array(1.0, 2.0, 3.0)).toBreeze, + Vectors.sparse(5, Array(2, 3, 4), Array(2.0, 3.0, 4.0)).toBreeze + ) - test("setSubIterations") { - val algo = new BisectingKMeans() - assert(algo.getMaxIterations == 20) - algo.setMaxIterations(15) - assert(algo.getMaxIterations == 15) + for (i <- 0 to (centers.size - 1)) { + val point = centers(i) + val closestIndex = BisectingKMeans.findClosestCenter(metric)(centers)(point) + assert(closestIndex === i) + } } - test("setSeed") { - val algo = new BisectingKMeans() - assert(algo.getSeed == 1) - algo.setSeed(987) - assert(algo.getSeed == 987) + test("should divide clusters correctly") { + val local = Seq( + (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), + (BigInt(2), BV[Double](9.9, 9.9)), (BigInt(2), BV[Double](10.1, 10.1)), + (BigInt(3), BV[Double](99.9, 99.9)), (BigInt(3), BV[Double](100.1, 100.1)), + (BigInt(3), BV[Double](109.9, 109.9)), (BigInt(3), BV[Double](110.1, 110.1)) + ) + val data = sc.parallelize(local) + val stats = BisectingKMeans.summarizeClusters(data) + val newClusters = BisectingKMeans.divideClusters(data, stats, 20) + + assert(newClusters.size === 4) + assert(newClusters(4).center === BV[Double](1.0, 1.0)) + assert(newClusters(4).rows === 2) + assert(newClusters(5).center === BV[Double](10.0, 10.0)) + assert(newClusters(5).rows === 2) + assert(newClusters(6).center === BV[Double](100.0, 100.0)) + assert(newClusters(6).rows === 2) + assert(newClusters(7).center === BV[Double](110.0, 110.0)) + assert(newClusters(7).rows === 2) } + } From 31623ead10d4664c7c5d98b5c72731bcc804bcba Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 22 Oct 2015 08:51:23 -0700 Subject: [PATCH 36/76] Improve a performance --- .../apache/spark/mllib/clustering/BisectingKMeans.scala | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index a44cbd32ac5fe..c7b94607206ab 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -19,7 +19,8 @@ package org.apache.spark.mllib.clustering import scala.collection.{Map, mutable} -import breeze.linalg.{SparseVector => BSV, Vector => BV, any => breezeAny, norm => breezeNorm} +import breeze.linalg + .{SparseVector => BSV, Vector => BV, any => breezeAny, norm => breezeNorm, sum => breezeSum} import org.apache.spark.{Logging, SparkException} import org.apache.spark.annotation.Since @@ -393,9 +394,7 @@ private[clustering] object BisectingKMeans { stats = eachStats.toMap totalStd = stats.map { case (idx, (sum, n, sumOfSquares)) => - sum.toArray.zip(sumOfSquares.toArray).map { case (s, ss) => - math.pow(ss / n - math.pow(s / n, 2), 2.0) - }.sum + breezeSum((sumOfSquares :/ n) :- breezeNorm(sum :/ n, 2.0)) }.sum relativeError = math.abs(oldTotalStd - totalStd) / totalStd oldTotalStd = totalStd From 052c9d6ac42961cc6e8cf7533746e6829b20d236 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 12:45:13 -0700 Subject: [PATCH 37/76] Remove `toAdjacencyList` and `toLinkageList` These will be merge at following PR. 
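For context on the `totalStd` change in PATCH 36/76 above: the (count, sum, sum of squares) triple kept per cluster is enough to measure within-cluster spread in a single pass, via the identity sum_i ||x_i - mu||^2 = sum_i ||x_i||^2 - n * ||mu||^2. The sketch below shows that bookkeeping with a hypothetical Stat class in plain Breeze; it illustrates the idea only and is not the exact expression the patch computes.

  import breeze.linalg.{DenseVector => BDV, sum => breezeSum}

  // sufficient statistics for one cluster, accumulated in a single pass over its points
  case class Stat(n: Double, sums: BDV[Double], sumOfSquares: BDV[Double]) {
    def center: BDV[Double] = sums :/ n
    // total squared deviation around the center, recovered from the running sums alone
    def sse: Double = breezeSum(sumOfSquares) - n * breezeSum(center :* center)
  }

  val points = Seq(BDV(1.0, 1.0), BDV(3.0, 3.0))
  val stat = Stat(points.size.toDouble, points.reduce(_ + _), points.map(p => p :* p).reduce(_ + _))
  // stat.center == DenseVector(2.0, 2.0); stat.sse == 4.0 (each point lies sqrt(2) away from the center)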
--- .../clustering/BisectingKMeansModel.scala | 34 -------------- .../clustering/JavaBisectingKMeansSuite.java | 15 ------- .../BisectingKMeansModelSuite.scala | 44 ------------------- 3 files changed, 93 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index b524ae6b370e3..3c7eb0d50fb6e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -99,39 +99,5 @@ class BisectingKMeansModel @Since("1.6.0") ( @Since("1.6.0") def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) - @Since("1.6.0") - def toAdjacencyList: Array[(Int, Int, Double)] = this.node.toAdjacencyList - - /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ - @Since("1.6.0") - def toJavaAdjacencyList: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { - val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() - this.node.toAdjacencyList.foreach { x => - val edge = new java.util.ArrayList[java.lang.Double]() - edge.add(x._1.toDouble) - edge.add(x._2.toDouble) - edge.add(x._3.toDouble) - javaList.add(edge) - } - javaList - } - - @Since("1.6.0") - def toLinkageMatrix: Array[(Int, Int, Double, Int)] = this.node.toLinkageMatrix - - /** Since Java doesn't support tuple, we must support the data structure for java and py4j. */ - @Since("1.6.0") - def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { - val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() - this.node.toLinkageMatrix.foreach {x => - val row = new java.util.ArrayList[java.lang.Double]() - row.add(x._1.toDouble) - row.add(x._2.toDouble) - row.add(x._3.toDouble) - row.add(x._4.toDouble) - javaList.add(row) - } - javaList - } } diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java index 913ca9ac6169a..fb729cfcfa1a2 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java @@ -27,7 +27,6 @@ import org.junit.Test; import java.io.Serializable; -import java.util.ArrayList; import java.util.List; import static org.junit.Assert.assertEquals; @@ -82,13 +81,6 @@ public void runWithDenseVectors() { assertEquals(Vectors.dense(2.0, 2.0), centers[2]); assertEquals(Vectors.dense(3.0, 3.0), centers[3]); assertEquals(Vectors.dense(4.0, 4.0), centers[4]); - - // adjacency list - ArrayList> edges = model.toJavaAdjacencyList(); - assertEquals(8, edges.size()); - // linkage matrix - ArrayList> matrix = model.toJavaLinkageMatrix(); - assertEquals(4, matrix.size()); } @Test @@ -112,12 +104,5 @@ public void runWithSparseVectors() { assertEquals(points.get(2), centers[2]); assertEquals(points.get(3), centers[3]); assertEquals(points.get(4), centers[4]); - - // adjacency list - ArrayList> edges = model.toJavaAdjacencyList(); - assertEquals(8, edges.size()); - // linkage matrix - ArrayList> matrix = model.toJavaLinkageMatrix(); - assertEquals(4, matrix.size()); } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index c8e077ecd1413..bda0c9cc999e2 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -63,28 +63,6 @@ class BisectingKMeansModelSuite // compute WSSSE assert(model.WSSSE(data) === 0.0) - - // adjacency list - val adjacencyList = model.toAdjacencyList - .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) - assert(adjacencyList.length === 8) - assert(adjacencyList(0) === (0, 1, 2.5981)) - assert(adjacencyList(1) === (0, 6, 2.5981)) - assert(adjacencyList(2) === (1, 2, 1.7321)) - assert(adjacencyList(3) === (1, 5, 1.7321)) - assert(adjacencyList(4) === (2, 3, 0.866)) - assert(adjacencyList(5) === (2, 4, 0.866)) - assert(adjacencyList(6) === (6, 7, 0.866)) - assert(adjacencyList(7) === (6, 8, 0.866)) - - // linkage matrix - val linkageMatrix = model.toLinkageMatrix - .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3, x._4)) - assert(linkageMatrix.length === 4) - assert(linkageMatrix(0) === (0, 1, 0.866, 2)) - assert(linkageMatrix(1) === (3, 4, 0.866, 2)) - assert(linkageMatrix(2) === (5, 2, 2.5981, 3)) - assert(linkageMatrix(3) === (7, 6, 5.1962, 5)) } test("clustering sparse vectors") { @@ -123,28 +101,6 @@ class BisectingKMeansModelSuite // compute WSSSE assert(model.WSSSE(data) === 0.0) - - // adjacency list - val adjacencyList = model.toAdjacencyList - .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3)) - assert(adjacencyList.length === 8) - assert(adjacencyList(0) === (0, 1, 3.2863)) - assert(adjacencyList(1) === (0, 8, 3.2863)) - assert(adjacencyList(2) === (1, 2, 2.3184)) - assert(adjacencyList(3) === (1, 7, 2.3184)) - assert(adjacencyList(4) === (2, 3, 1.3744)) - assert(adjacencyList(5) === (2, 6, 1.3744)) - assert(adjacencyList(6) === (3, 4, 0.5)) - assert(adjacencyList(7) === (3, 5, 0.5)) - - // linkage matrix - val linkageMatrix = model.toLinkageMatrix - .map(x => (x._1, x._2, math.round(10E3 * x._3) / 10E3, x._4)) - assert(linkageMatrix.length === 4) - assert(linkageMatrix(0) === (0, 1, 0.5, 2)) - assert(linkageMatrix(1) === (5, 2, 1.8744, 3)) - assert(linkageMatrix(2) === (6, 3, 4.1928, 4)) - assert(linkageMatrix(3) === (7, 4, 7.4791, 5)) } test("clustering should be done correctly") { From e13e47fc268472d3edd8df51a2c3d15f6224dd0e Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 12:54:39 -0700 Subject: [PATCH 38/76] Remove an unnecessary constructor arg: `clusterMap` --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index c7b94607206ab..abb696c14a969 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -59,14 +59,12 @@ import org.apache.spark.rdd.RDD * candidates and merge it to the cluster tree until the desired number of clusters is reached. 
* * @param k tne desired number of clusters - * @param clusterMap the pairs of cluster and its index as Map * @param maxIterations the number of maximal iterations to split clusters * @param seed a random seed */ @Since("1.6.0") class BisectingKMeans private ( private var k: Int, - private var clusterMap: Map[BigInt, BisectingClusterNode], private var maxIterations: Int, private var seed: Long) extends Logging { @@ -74,7 +72,7 @@ class BisectingKMeans private ( * Constructs with the default configuration */ @Since("1.6.0") - def this() = this(20, mutable.ListMap.empty[BigInt, BisectingClusterNode], 20, 1) + def this() = this(20, 20, 1) /** * Sets the number of clusters you want From ad6b9e2f7b13c8b2ae11198ffe7f903fdfb76f4a Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 13:00:46 -0700 Subject: [PATCH 39/76] Add `import BisectingKMeans._` inside of `BisectingKMeans` class --- .../mllib/clustering/BisectingKMeans.scala | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index abb696c14a969..3388b28ec4d7a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -68,6 +68,8 @@ class BisectingKMeans private ( private var maxIterations: Int, private var seed: Long) extends Logging { + import BisectingKMeans._ + /** * Constructs with the default configuration */ @@ -132,10 +134,10 @@ class BisectingKMeans private ( // divide clusters until the number of clusters reachs the condition // or there is no dividable cluster val startTime = System.currentTimeMillis() - var data = BisectingKMeans.initData(input).cache() + var data = initData(input).cache() while (clusterStats.size < maxAllNodesInTree && noMoreDividable == false) { logInfo(s"${sc.appName} starts step ${step}") - val leafClusters = BisectingKMeans.summarizeClusters(data) + val leafClusters = summarizeClusters(data) val dividableLeafClusters = leafClusters.filter(_._2.isDividable) clusterStats = clusterStats ++ leafClusters @@ -144,10 +146,9 @@ class BisectingKMeans private ( } else { // can be clustered if the number of divided clusterStats is equal to 0 - val divided = - BisectingKMeans.divideClusters(data, dividableLeafClusters, maxIterations) + val divided = divideClusters(data, dividableLeafClusters, maxIterations) // update each index - val newData = BisectingKMeans.updateClusterIndex(data, divided).cache() + val newData = updateClusterIndex(data, divided).cache() rddArray = rddArray ++ Array(data) data = newData // keep recent 2 cached RDDs in order to run more quickly @@ -164,11 +165,11 @@ class BisectingKMeans private ( // unpersist kept RDDs rddArray.foreach(_.unpersist()) // create a map of cluster node with their criterions - val nodes = BisectingKMeans.createClusterNodes(data, clusterStats) + val nodes = createClusterNodes(data, clusterStats) // build a cluster tree by Map class which is expressed logInfo(s"Building the cluster tree is started in ${sc.appName}") - val root = BisectingKMeans.buildTree(nodes, BisectingKMeans.ROOT_INDEX_KEY, this.k) + val root = buildTree(nodes, ROOT_INDEX_KEY, this.k) if (root.isEmpty) { new SparkException("Failed to build a cluster tree from a Map type of clusterStats") } @@ -236,7 +237,7 @@ private[clustering] object BisectingKMeans { * Assigns the initial cluster index 
id to all data */ def initData(data: RDD[Vector]): RDD[(BigInt, BV[Double])] = { - data.map { v: Vector => (BisectingKMeans.ROOT_INDEX_KEY, v.toBreeze)} + data.map { v: Vector => (ROOT_INDEX_KEY, v.toBreeze)} } /** @@ -345,7 +346,7 @@ private[clustering] object BisectingKMeans { // extract dividable input data val dividableData = data.filter { case (idx, point) => dividableClusters.contains(idx)} - var newCenters = BisectingKMeans.initNextCenters(dividableData, dividableClusters) + var newCenters = initNextCenters(dividableData, dividableClusters) var bcNewCenters = sc.broadcast(newCenters) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) @@ -365,8 +366,7 @@ private[clustering] object BisectingKMeans { val childrenCenters = Array(2 * idx, 2 * idx + 1) .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) if (childrenCenters.length == 2) { - val closestIndex = - BisectingKMeans.findClosestCenter(bcMetric.value)(childrenCenters)(point) + val closestIndex = findClosestCenter(bcMetric.value)(childrenCenters)(point) val nextIndex = 2 * idx + closestIndex // get a map value or else get a sparse vector From 043f5f31037911388f277beb067ffce9da85b30c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 13:03:05 -0700 Subject: [PATCH 40/76] Rename `rddArray` to `updatedDataHistory` in order to make the name more descraptive --- .../spark/mllib/clustering/BisectingKMeans.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 3388b28ec4d7a..1ccaafc5a205e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -126,7 +126,7 @@ class BisectingKMeans private ( var clusterStats = mutable.Map.empty[BigInt, BisectingClusterStat] var step = 1 var noMoreDividable = false - var rddArray = Array.empty[RDD[(BigInt, BV[Double])]] + var updatedDataHistory = Array.empty[RDD[(BigInt, BV[Double])]] // the number of maximum nodes of a binary tree by given parameter val multiplier = math.ceil(math.log10(this.k) / math.log10(2.0)) + 1 val maxAllNodesInTree = math.pow(2, multiplier).toInt @@ -149,13 +149,13 @@ class BisectingKMeans private ( val divided = divideClusters(data, dividableLeafClusters, maxIterations) // update each index val newData = updateClusterIndex(data, divided).cache() - rddArray = rddArray ++ Array(data) + updatedDataHistory = updatedDataHistory ++ Array(data) data = newData // keep recent 2 cached RDDs in order to run more quickly - if (rddArray.length > 1) { - val head = rddArray.head + if (updatedDataHistory.length > 1) { + val head = updatedDataHistory.head head.unpersist() - rddArray = rddArray.filterNot(_.hashCode() == head.hashCode()) + updatedDataHistory = updatedDataHistory.filterNot(_.hashCode() == head.hashCode()) } clusterStats = clusterStats ++ divided step += 1 @@ -163,7 +163,7 @@ class BisectingKMeans private ( } } // unpersist kept RDDs - rddArray.foreach(_.unpersist()) + updatedDataHistory.foreach(_.unpersist()) // create a map of cluster node with their criterions val nodes = createClusterNodes(data, clusterStats) From 31b05ecfda798a8d2b848d27d738841b0a863c87 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 13:08:43 -0700 Subject: [PATCH 
41/76] Modify `math.log10` to `math.log` --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 1ccaafc5a205e..157db50928720 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -127,8 +127,8 @@ class BisectingKMeans private ( var step = 1 var noMoreDividable = false var updatedDataHistory = Array.empty[RDD[(BigInt, BV[Double])]] - // the number of maximum nodes of a binary tree by given parameter - val multiplier = math.ceil(math.log10(this.k) / math.log10(2.0)) + 1 + // the minimum number of nodes of a binary tree by given parameter + val multiplier = math.ceil(math.log(this.k) / math.log(2.0)) + 1 val maxAllNodesInTree = math.pow(2, multiplier).toInt // divide clusters until the number of clusters reachs the condition From 12a60cf4134764a9c03fa98eefc2536feb508d37 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 13:41:04 -0700 Subject: [PATCH 42/76] Add `getMinimumNumNodeInTree` to calculate the minimum number of node in a binary tree --- .../mllib/clustering/BisectingKMeans.scala | 18 +++++++++++++++--- .../clustering/BisectingKMeansSuite.scala | 10 ++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 157db50928720..9751fee219059 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -128,14 +128,13 @@ class BisectingKMeans private ( var noMoreDividable = false var updatedDataHistory = Array.empty[RDD[(BigInt, BV[Double])]] // the minimum number of nodes of a binary tree by given parameter - val multiplier = math.ceil(math.log(this.k) / math.log(2.0)) + 1 - val maxAllNodesInTree = math.pow(2, multiplier).toInt + val numNodeLimit = getMinimumNumNodesInTree(this.k) // divide clusters until the number of clusters reachs the condition // or there is no dividable cluster val startTime = System.currentTimeMillis() var data = initData(input).cache() - while (clusterStats.size < maxAllNodesInTree && noMoreDividable == false) { + while (clusterStats.size < numNodeLimit && noMoreDividable == false) { logInfo(s"${sc.appName} starts step ${step}") val leafClusters = summarizeClusters(data) val dividableLeafClusters = leafClusters.filter(_._2.isDividable) @@ -208,6 +207,19 @@ private[clustering] object BisectingKMeans { closestIndex } + /** + * Gets the minimum number of nodes in a tree by the number of leaves + * + * @param k: the number of leaf nodes + */ + def getMinimumNumNodesInTree(k: Int): Int = { + val multiplier = math.ceil(math.log(k) / math.log(2.0)) + // the calculation is same as `math.pow(2, multiplier)` + var numNodes = 2 + (1 to multiplier.toInt).foreach (i => numNodes = numNodes << 1) + numNodes + } + /** * Summarizes data by each cluster as Map * diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 43d5f5a2d3c63..c6674b00ebb56 100644 --- 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -159,6 +159,16 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { } } + test("should be equal to math.pow") { + (1 to 1000).foreach { k => + // the minimum number of nodes of a binary tree by given parameter + val multiplier = math.ceil(math.log(k) / math.log(2.0)) + 1 + val expected = math.pow(2, multiplier).toInt + val result = BisectingKMeans.getMinimumNumNodesInTree(k) + assert(result === expected) + } + } + test("should divide clusters correctly") { val local = Seq( (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), From a13a4048d9d1f370da404993ea336596a6c5cb23 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 13:43:47 -0700 Subject: [PATCH 43/76] Rename `leafClusters` to `leafClusterStats` --- .../apache/spark/mllib/clustering/BisectingKMeans.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 9751fee219059..fe1ad0bebd3f3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -136,9 +136,10 @@ class BisectingKMeans private ( var data = initData(input).cache() while (clusterStats.size < numNodeLimit && noMoreDividable == false) { logInfo(s"${sc.appName} starts step ${step}") - val leafClusters = summarizeClusters(data) - val dividableLeafClusters = leafClusters.filter(_._2.isDividable) - clusterStats = clusterStats ++ leafClusters + // TODO Remove non-leaf cluster stats from `leafClusterStats` + val leafClusterStats = summarizeClusters(data) + val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) + clusterStats = clusterStats ++ leafClusterStats if (dividableLeafClusters.isEmpty) { noMoreDividable = true From 75564b57dc4c956b54b35baaba24327af98b66a4 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 13:58:11 -0700 Subject: [PATCH 44/76] Modify `BisectingKMeans.updateClusterIndex` --- .../spark/mllib/clustering/BisectingKMeans.scala | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index fe1ad0bebd3f3..32a7aeca48a74 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -310,6 +310,11 @@ private[clustering] object BisectingKMeans { data: RDD[(BigInt, BV[Double])], dividedClusters: Map[BigInt, BisectingClusterStat]): RDD[(BigInt, BV[Double])] = { + // If there is no divided clusters, return the original + if (dividedClusters.size == 0) { + return data + } + // extract the centers of the clusters val sc = data.sparkContext var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} @@ -323,16 +328,16 @@ private[clustering] object BisectingKMeans { data.map { case (idx, point) => val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(c => bcCenters.value.contains(c)) childrenIndexes.length match { - // stay the index if the number of children is not enough - case s if s < 2 => (idx, point) // 
update the indexes - case _ => { + case s if s == 2 => { val nextCenters = childrenIndexes.map(bcCenters.value(_)) val closestIndex = BisectingKMeans .findClosestCenter(bcMetric.value)(nextCenters)(point) val nextIndex = 2 * idx + closestIndex (nextIndex, point) } + // stay the index if the number of children is not enough + case _ => (idx, point) } } } From 084b9928ff6eb4b95632a9a52c6cc095f3fb5e64 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 14:06:14 -0700 Subject: [PATCH 45/76] Move checking whether there are dividable clusters or not to below --- .../mllib/clustering/BisectingKMeans.scala | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 32a7aeca48a74..96c57d598e8c1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -141,26 +141,25 @@ class BisectingKMeans private ( val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) clusterStats = clusterStats ++ leafClusterStats + // can be clustered if the number of divided clusterStats is equal to 0 + val divided = divideClusters(data, dividableLeafClusters, maxIterations) + // update each index + val newData = updateClusterIndex(data, divided).cache() + updatedDataHistory = updatedDataHistory ++ Array(data) + data = newData + // keep recent 2 cached RDDs in order to run more quickly + if (updatedDataHistory.length > 1) { + val head = updatedDataHistory.head + updatedDataHistory = updatedDataHistory.tail + head.unpersist() + } + clusterStats = clusterStats ++ divided + step += 1 + logInfo(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") + if (dividableLeafClusters.isEmpty) { noMoreDividable = true } - else { - // can be clustered if the number of divided clusterStats is equal to 0 - val divided = divideClusters(data, dividableLeafClusters, maxIterations) - // update each index - val newData = updateClusterIndex(data, divided).cache() - updatedDataHistory = updatedDataHistory ++ Array(data) - data = newData - // keep recent 2 cached RDDs in order to run more quickly - if (updatedDataHistory.length > 1) { - val head = updatedDataHistory.head - head.unpersist() - updatedDataHistory = updatedDataHistory.filterNot(_.hashCode() == head.hashCode()) - } - clusterStats = clusterStats ++ divided - step += 1 - logInfo(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") - } } // unpersist kept RDDs updatedDataHistory.foreach(_.unpersist()) From b6a952df91129cd9ce53ab0f47a824b4859f5682 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 14:14:51 -0700 Subject: [PATCH 46/76] Make sure the input data keeps the storage level and unpersist unnecessary RDDs --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 96c57d598e8c1..65584564a32a0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -161,10 +161,11 @@ class BisectingKMeans private ( noMoreDividable = true } } - // unpersist kept RDDs 
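// Side note (plain Scala, illustrative only, not part of this patch): the loop above relies
// on the implicit binary-tree numbering in which cluster i is split into children 2*i and
// 2*i + 1, with the root at index 1 (ROOT_INDEX_KEY). Two tiny helpers make the invariant
// explicit; the index of a node also encodes its depth.
def children(i: BigInt): (BigInt, BigInt) = (2 * i, 2 * i + 1)
def depth(i: BigInt): Int = if (i == BigInt(1)) 0 else 1 + depth(i / 2)

assert(children(BigInt(1)) == (BigInt(2), BigInt(3))) // the root splits into indexes 2 and 3
assert(depth(BigInt(7)) == 2)                         // 7 -> 3 -> 1, two levels below the root
// This numbering is also why the driver loop stops once clusterStats reaches the limit from
// getMinimumNumNodesInTree (about 2^(ceil(log2(k)) + 1) entries) rather than exactly k.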
- updatedDataHistory.foreach(_.unpersist()) // create a map of cluster node with their criterions val nodes = createClusterNodes(data, clusterStats) + // unpersist RDDs + data.unpersist() + updatedDataHistory.foreach(_.unpersist()) // build a cluster tree by Map class which is expressed logInfo(s"Building the cluster tree is started in ${sc.appName}") From 1ba4e4599b003f4de32526398e196e09c6cec3f3 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 14:18:41 -0700 Subject: [PATCH 47/76] Remove `closestCenter` from `findClosestCenter` because it was a unnecessary variable --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 65584564a32a0..15411824095d0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -203,9 +203,8 @@ private[clustering] object BisectingKMeans { */ def findClosestCenter(metric: (BV[Double], BV[Double]) => Double) (centers: Seq[BV[Double]])(point: BV[Double]): Int = { - val (closestCenter, closestIndex) = - centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1) - closestIndex + // get the closest index + centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1)._2 } /** From cb4fbfe03a8a0f86a31b0735c67d7b51721714ce Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 14:35:04 -0700 Subject: [PATCH 48/76] Modify `summarizeClusters` --- .../apache/spark/mllib/clustering/BisectingKMeans.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 15411824095d0..91b3e55c31ea4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -227,7 +227,7 @@ private[clustering] object BisectingKMeans { */ def summarizeClusters(data: RDD[(BigInt, BV[Double])]): Map[BigInt, BisectingClusterStat] = { - val stats = data.mapPartitions { iter => + data.mapPartitions { iter => // calculate the accumulation of the all point in a partition and count the rows val map = mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] iter.foreach { case (idx: BigInt, point: BV[Double]) => @@ -240,9 +240,9 @@ private[clustering] object BisectingKMeans { }.reduceByKey { case ((sum1, n1, sumOfSquares1), (sum2, n2, sumOfSquares2)) => // sum the accumulation and the count in the all partition (sum1 + sum2, n1 + n2, sumOfSquares1 + sumOfSquares2) - }.collect().toMap - - stats.map {case (i, stat) => i -> new BisectingClusterStat(stat._2.toLong, stat._1, stat._3)} + }.map { case (i, (sum, n, sumOfSquares)) => + (i, new BisectingClusterStat(n.toLong, sum, sumOfSquares)) + }.collectAsMap() } /** From fbcb9ea9bb7e6b60e19fa631be7691a9a2ff2431 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 14:43:47 -0700 Subject: [PATCH 49/76] Modify a type --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 91b3e55c31ea4..e97c3ad8a6d88 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -253,7 +253,7 @@ private[clustering] object BisectingKMeans { } /** - * Gets the initial centers for bisect k-means + * Gets the initial centers for bisecting k-means * * @param data pairs of point and its cluster index * @param stats pairs of cluster index and cluster statistics From ffbe399b88376d41861f808461b94d06b7387bf4 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 14:48:26 -0700 Subject: [PATCH 50/76] Replace `criterion` with `cost` --- .../mllib/clustering/BisectingKMeans.scala | 28 +++++++++---------- .../BisectingKMeansModelSuite.scala | 4 +-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index e97c3ad8a6d88..f037e7979bcc0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -45,9 +45,9 @@ import org.apache.spark.rdd.RDD * efficient on Spark and splitting a cluster one by one is very slow. It will keep splitting until * the number of clusters will be enough to build a cluster tree. Otherwise, it will stop splitting * when there are no dividable clusters before the number of clusters will be sufficient. And - * it calculates the criterions, such as average cost, entropy and so on, for building a cluster - * tree in the first part. The criterion means how large the cluster is. That is, the cluster - * whose criterion is maximum of all the clusters is the largest cluster. + * it calculates the costs, such as average cost, entropy and so on, for building a cluster + * tree in the first part. The costs means how large the cluster is. That is, the cluster + * whose cost is maximum of all the clusters is the largest cluster. * * Second, it builds a cluster tree as a binary tree by the result of the first part. * First of all, the cluster tree starts with only the root cluster which includes all points. 
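
To make the cost-based selection described in the doc comment above concrete, here is a minimal, self-contained Scala sketch — not part of the patch — of how the leaf with the largest average cost could be picked as the next cluster to split. `Leaf` and the cost values are hypothetical stand-ins for `BisectingClusterNode` and its average cost.

// Hypothetical illustration: pick the most scattered leaf, i.e. the one with
// the largest average cost, as the next cluster to expand in the tree.
case class Leaf(index: Long, cost: Double)

object CostSelectionExample {
  def mostScattered(leaves: Map[Long, Leaf]): (Long, Leaf) =
    leaves.maxBy { case (_, leaf) => leaf.cost }

  def main(args: Array[String]): Unit = {
    val leaves = Map(
      2L -> Leaf(2L, cost = 0.8),
      3L -> Leaf(3L, cost = 2.5), // largest average cost, so it is split next
      4L -> Leaf(4L, cost = 1.1))
    println(mostScattered(leaves)) // (3,Leaf(3,2.5))
  }
}

The patched `buildTree` does the equivalent over its leaf queue with `leavesQueue.maxBy(_._2.cost)`.
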
@@ -161,7 +161,7 @@ class BisectingKMeans private ( noMoreDividable = true } } - // create a map of cluster node with their criterions + // create a map of cluster node with their costs val nodes = createClusterNodes(data, clusterStats) // unpersist RDDs data.unpersist() @@ -419,7 +419,7 @@ private[clustering] object BisectingKMeans { } /** - * Creates the map of cluster stats to the map of cluster nodes with their criterions + * Creates the map of cluster stats to the map of cluster nodes with their costs * * @param data input data * @param stats map of cluster stats which is described as a binary tree @@ -428,7 +428,7 @@ private[clustering] object BisectingKMeans { data: RDD[(BigInt, BV[Double])], stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BisectingClusterNode] = { - // TODO: support other criterion, such as entropy + // TODO: support other cost, such as entropy createClusterNodesWithAverageCost(data, stats) } @@ -483,8 +483,8 @@ private[clustering] object BisectingKMeans { val root = treeMap(rootIndex) var leavesQueue = Map(rootIndex -> root) while (leavesQueue.nonEmpty && numLeavesClusters < numClusters) { - // pick up the largest cluster by the maximum criterion of all the clusters - val mostScattered = leavesQueue.maxBy(_._2.criterion) + // pick up the largest cluster by the maximum cost of all the clusters + val mostScattered = leavesQueue.maxBy(_._2.cost) val mostScatteredKey = mostScattered._1 val mostScatteredCluster = mostScattered._2 @@ -519,7 +519,7 @@ private[clustering] object BisectingKMeans { * * @param center the center of the cluster * @param rows the number of rows in the cluster - * @param criterion how large a cluster is + * @param cost how large a cluster is * @param localHeight the maximal distance between this node and its children * @param parent the parent cluster of the cluster * @param children the children nodes of the cluster @@ -528,16 +528,16 @@ private[clustering] object BisectingKMeans { class BisectingClusterNode private ( @Since("1.6.0") val center: Vector, @Since("1.6.0") val rows: Long, - @Since("1.6.0") val criterion: Double, + @Since("1.6.0") val cost: Double, private var localHeight: Double, private var parent: Option[BisectingClusterNode], private var children: Seq[BisectingClusterNode]) extends Serializable { - require(!criterion.isNaN) + require(!cost.isNaN) @Since("1.6.0") - def this(center: Vector, rows: Long, criterion: Double) = - this(center, rows, criterion, 0.0, None, Array.empty[BisectingClusterNode]) + def this(center: Vector, rows: Long, cost: Double) = + this(center, rows, cost, 0.0, None, Array.empty[BisectingClusterNode]) /** * Inserts a sub node as its child @@ -573,7 +573,7 @@ class BisectingClusterNode private ( case _ => Array(this) ++ this.children.flatMap(child => child.toArray.toIterator) } array.sortWith { case (a, b) => - a.getDepth < b.getDepth && a.criterion < b.criterion && a.rows < b.rows + a.getDepth < b.getDepth && a.cost < b.cost && a.rows < b.rows } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index bda0c9cc999e2..667c96eb72d70 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -117,13 +117,13 @@ class BisectingKMeansModelSuite val denseData = sc.parallelize(localData.map(_._2), 2) val denseModel = app.run(denseData) 
assert(denseModel.getCenters.length === numClusters) - assert(denseModel.getClusters.forall(_.criterion == 0.0)) + assert(denseModel.getClusters.forall(_.cost == 0.0)) // sparse version val sparseData = sc.parallelize(localData.map(_._3), 2) val sparseModel = app.run(sparseData) assert(sparseModel.getCenters.length === numClusters) - assert(sparseModel.getClusters.forall(_.criterion == 0.0)) + assert(sparseModel.getClusters.forall(_.cost == 0.0)) } } } From 6f37028db93d08a24af21545b58156cbeddb9e3e Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 15:48:05 -0700 Subject: [PATCH 51/76] Change the constructor args of `BisectingClusterStats` --- .../mllib/clustering/BisectingKMeans.scala | 63 +++++++++++------ .../clustering/BisectingKMeansSuite.scala | 70 ++++++++++--------- 2 files changed, 77 insertions(+), 56 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index f037e7979bcc0..93807746454b6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -191,6 +191,8 @@ class BisectingKMeans private ( private[clustering] object BisectingKMeans { + import BisectingClusterStat._ + val ROOT_INDEX_KEY: BigInt = 1 /** @@ -241,7 +243,9 @@ private[clustering] object BisectingKMeans { // sum the accumulation and the count in the all partition (sum1 + sum2, n1 + n2, sumOfSquares1 + sumOfSquares2) }.map { case (i, (sum, n, sumOfSquares)) => - (i, new BisectingClusterStat(n.toLong, sum, sumOfSquares)) + val mean = calcMean(n.toLong, sum) + val variance = getVariance(n.toLong, sum, sumOfSquares) + (i, new BisectingClusterStat(n.toLong, mean, variance)) }.collectAsMap() } @@ -316,7 +320,7 @@ private[clustering] object BisectingKMeans { // extract the centers of the clusters val sc = data.sparkContext - var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.center)} + var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.mean)} val bcCenters = sc.broadcast(centers) // TODO Supports distance metrics other Euclidean distance metric @@ -345,29 +349,30 @@ private[clustering] object BisectingKMeans { * Divides clusters according to their statistics * * @param data pairs of point and its cluster index - * @param targetClusters target clusters to divide + * @param clusterStats target clusters to divide * @param maxIterations the maximum iterations to calculate clusters statistics */ def divideClusters( data: RDD[(BigInt, BV[Double])], - targetClusters: Map[BigInt, BisectingClusterStat], + clusterStats: Map[BigInt, BisectingClusterStat], maxIterations: Int): Map[BigInt, BisectingClusterStat] = { val sc = data.sparkContext val appName = sc.appName // get keys of dividable clusters - val dividableClusters = targetClusters.filter { case (idx, cluster) => cluster.isDividable } - if (dividableClusters.isEmpty) { + val dividableClusterStats = clusterStats.filter { case (idx, cluster) => cluster.isDividable } + if (dividableClusterStats.isEmpty) { return Map.empty[BigInt, BisectingClusterStat] } // extract dividable input data - val dividableData = data.filter { case (idx, point) => dividableClusters.contains(idx)} + val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} - var newCenters = initNextCenters(dividableData, dividableClusters) + var newCenters = initNextCenters(dividableData, 
dividableClusterStats) var bcNewCenters = sc.broadcast(newCenters) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val bcMetric = sc.broadcast(metric) + // pairs of cluster index and (sums, #points, sumOfSquares) var stats = Map.empty[BigInt, (BV[Double], Double, BV[Double])] var subIter = 0 @@ -415,7 +420,11 @@ private[clustering] object BisectingKMeans { oldTotalStd = totalStd subIter += 1 } - stats.map { case (i, stat) => i -> new BisectingClusterStat(stat._2.toLong, stat._1, stat._3) } + stats.map { case (i, (sums, rows, sumOfSquares)) => + val mean = calcMean(rows.toLong, sums) + val variance = getVariance(rows.toLong, sums, sumOfSquares) + i -> new BisectingClusterStat(rows.toLong, mean, variance) + } } /** @@ -440,7 +449,7 @@ private[clustering] object BisectingKMeans { stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BisectingClusterNode] = { // calculate average costs of all clusters - val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.center }) + val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.mean }) val costs = data.mapPartitions { iter => val counters = mutable.Map.empty[BigInt, (Long, Double)] bcCenters.value.foreach {case (i, center) => counters(i) = (0L, 0.0)} @@ -458,7 +467,7 @@ private[clustering] object BisectingKMeans { case x if x == 0.0 => 0.0 case _ => costs(i)._2 / costs(i)._1 } - i -> new BisectingClusterNode(Vectors.fromBreeze(stat.center), stat.rows, avgCost) + i -> new BisectingClusterNode(Vectors.fromBreeze(stat.mean), stat.rows, avgCost) } } @@ -685,21 +694,29 @@ class BisectingClusterNode private ( * This class is used for maneging a cluster statistics * * @param rows the number of points - * @param sums the sum of points - * @param sumOfSquares the sum of squares of points + * @param mean the sum of points + * @param variance the sum of squares of points */ private[clustering] case class BisectingClusterStat ( rows: Long, - sums: BV[Double], - sumOfSquares: BV[Double]) extends Serializable { - - // initialization - val center: BV[Double] = sums :/ rows.toDouble - val variances: BV[Double] = rows match { - case n if n > 1 => sumOfSquares.:/(n.toDouble) - (sums :* sums).:/(n.toDouble * n.toDouble) - case _ => BV.zeros[Double](sums.size) - } + mean: BV[Double], + variance: Double) extends Serializable { - def isDividable: Boolean = breezeAny(variances) && rows >= 2 + def isDividable: Boolean = variance > 0 && rows >= 2 } +private[clustering] object BisectingClusterStat { + // calculate a mean vector + def calcMean(rows: Long, sums: BV[Double]): BV[Double] = sums :/ rows.toDouble + + // calculate a variance + def getVariance(rows: Long, sums: BV[Double], sumOfSquares: BV[Double]): Double = { + val variances: BV[Double] = rows match { + case n if n > 1 => sumOfSquares.:/(n.toDouble) - (sums :* sums).:/(n.toDouble * n.toDouble) + case _ => BV.zeros[Double](sums.size) + } + breezeNorm(variances, 2.0) + } +} + + diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index c6674b00ebb56..875d2c4f694cd 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -86,20 +86,20 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { 
) val data = sc.parallelize(local) - val clusters = BisectingKMeans.summarizeClusters(data) - assert(clusters.size === 4) - assert(clusters(4).center === Vectors.dense(2.0, 2.0).toBreeze) - assert(clusters(4).variances === Vectors.dense(0.25, 0.25).toBreeze) - assert(clusters(4).rows === 2) - assert(clusters(5).center === Vectors.dense(12.0, 12.0).toBreeze) - assert(clusters(5).variances === Vectors.dense(0.25, 0.25).toBreeze) - assert(clusters(5).rows === 2) - assert(clusters(6).center === Vectors.dense(22.0, 22.0).toBreeze) - assert(clusters(6).variances === Vectors.dense(0.25, 0.25).toBreeze) - assert(clusters(6).rows === 2) - assert(clusters(7).center === Vectors.dense(32.0, 32.0).toBreeze) - assert(clusters(7).variances === Vectors.dense(0.25, 0.25).toBreeze) - assert(clusters(7).rows === 2) + val clusterStats = BisectingKMeans.summarizeClusters(data) + assert(clusterStats.size === 4) + assert(clusterStats(4).mean === Vectors.dense(2.0, 2.0).toBreeze) + assert(clusterStats(4).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(4).rows === 2) + assert(clusterStats(5).mean === Vectors.dense(12.0, 12.0).toBreeze) + assert(clusterStats(5).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(5).rows === 2) + assert(clusterStats(6).mean === Vectors.dense(22.0, 22.0).toBreeze) + assert(clusterStats(6).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(6).rows === 2) + assert(clusterStats(7).mean === Vectors.dense(32.0, 32.0).toBreeze) + assert(clusterStats(7).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(7).rows === 2) } test("initialize centers at next step") { @@ -109,8 +109,8 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { ) val data = sc.parallelize(local) val stats = Map[BigInt, BisectingClusterStat]( - BigInt(2) -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, BV.zeros[Double](2)), - BigInt(3) -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, BV.zeros[Double](2)) + BigInt(2) -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, 0.0), + BigInt(3) -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, 0.0) ) val initNextCenters = BisectingKMeans.initNextCenters(data, stats) assert(initNextCenters.size === 4) @@ -126,14 +126,18 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { (BigInt(3), Vectors.dense(8.0, 8.0)), (BigInt(3), Vectors.dense(9.0, 9.0)), (BigInt(3), Vectors.dense(10.0, 10.0)), (BigInt(3), Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze) } - val newClusters = Map( - BigInt(4) -> new BisectingClusterStat(3L, BV[Double](1.0, 1.0) :* 3.0, BV[Double](1.0, 1.0)), - BigInt(5) -> new BisectingClusterStat(3L, BV[Double](4.0, 4.0) :* 3.0, BV[Double](1.0, 1.0)), - BigInt(6) -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, BV[Double](1.0, 1.0)), - BigInt(7) -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, BV[Double](1.0, 1.0)) + val variance = breezeNorm(Vectors.dense(1.0, 1.0).toBreeze, 2.0) + val newClusterStats = Map( + BigInt(4) -> new BisectingClusterStat(3L, BV[Double](1.0, 1.0) :* 3.0, variance), + BigInt(5) -> new BisectingClusterStat(3L, BV[Double](4.0, 4.0) :* 3.0, variance), + BigInt(6) -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, variance), + BigInt(7) -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, variance) ) val data = sc.parallelize(seed) - val result = BisectingKMeans.updateClusterIndex(data, newClusters).collect().toSeq + val leafClusterStats = 
BisectingKMeans.summarizeClusters(data) + val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) + val divided = BisectingKMeans.divideClusters(data, dividableLeafClusters, 20) + val result = BisectingKMeans.updateClusterIndex(data, divided).collect().toSeq val expected = Seq( (4, Vectors.dense(0.0, 0.0)), (4, Vectors.dense(1.0, 1.0)), (4, Vectors.dense(2.0, 2.0)), @@ -178,17 +182,17 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { ) val data = sc.parallelize(local) val stats = BisectingKMeans.summarizeClusters(data) - val newClusters = BisectingKMeans.divideClusters(data, stats, 20) - - assert(newClusters.size === 4) - assert(newClusters(4).center === BV[Double](1.0, 1.0)) - assert(newClusters(4).rows === 2) - assert(newClusters(5).center === BV[Double](10.0, 10.0)) - assert(newClusters(5).rows === 2) - assert(newClusters(6).center === BV[Double](100.0, 100.0)) - assert(newClusters(6).rows === 2) - assert(newClusters(7).center === BV[Double](110.0, 110.0)) - assert(newClusters(7).rows === 2) + val newClusterStats = BisectingKMeans.divideClusters(data, stats, 20) + + assert(newClusterStats.size === 4) + assert(newClusterStats(4).mean === BV[Double](1.0, 1.0)) + assert(newClusterStats(4).rows === 2) + assert(newClusterStats(5).mean === BV[Double](10.0, 10.0)) + assert(newClusterStats(5).rows === 2) + assert(newClusterStats(6).mean === BV[Double](100.0, 100.0)) + assert(newClusterStats(6).rows === 2) + assert(newClusterStats(7).mean === BV[Double](110.0, 110.0)) + assert(newClusterStats(7).rows === 2) } } From 010fd2ca4eadc22ef38805b02bf35820f2117c9d Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 15:54:40 -0700 Subject: [PATCH 52/76] Convert `Long` to `BigInt` --- .../mllib/clustering/BisectingKMeans.scala | 50 ++++++++--------- .../clustering/BisectingKMeansSuite.scala | 54 +++++++++---------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 93807746454b6..ed385d9454c76 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -123,10 +123,10 @@ class BisectingKMeans private ( // `clusterStats` is described as binary tree structure // `clusterStats(1)` means the root of a binary tree - var clusterStats = mutable.Map.empty[BigInt, BisectingClusterStat] + var clusterStats = mutable.Map.empty[Long, BisectingClusterStat] var step = 1 var noMoreDividable = false - var updatedDataHistory = Array.empty[RDD[(BigInt, BV[Double])]] + var updatedDataHistory = Array.empty[RDD[(Long, BV[Double])]] // the minimum number of nodes of a binary tree by given parameter val numNodeLimit = getMinimumNumNodesInTree(this.k) @@ -193,7 +193,7 @@ private[clustering] object BisectingKMeans { import BisectingClusterStat._ - val ROOT_INDEX_KEY: BigInt = 1 + val ROOT_INDEX_KEY: Long = 1 /** * Finds the closes cluster's center @@ -227,12 +227,12 @@ private[clustering] object BisectingKMeans { * * @param data pairs of point and its cluster index */ - def summarizeClusters(data: RDD[(BigInt, BV[Double])]): Map[BigInt, BisectingClusterStat] = { + def summarizeClusters(data: RDD[(Long, BV[Double])]): Map[Long, BisectingClusterStat] = { data.mapPartitions { iter => // calculate the accumulation of the all point in a partition and count the rows - val map = 
mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] - iter.foreach { case (idx: BigInt, point: BV[Double]) => + val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] + iter.foreach { case (idx: Long, point: BV[Double]) => // get a map value or else get a sparse vector val (sumBV, n, sumOfSquares) = map .getOrElse(idx, (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size))) @@ -252,7 +252,7 @@ private[clustering] object BisectingKMeans { /** * Assigns the initial cluster index id to all data */ - def initData(data: RDD[Vector]): RDD[(BigInt, BV[Double])] = { + def initData(data: RDD[Vector]): RDD[(Long, BV[Double])] = { data.map { v: Vector => (ROOT_INDEX_KEY, v.toBreeze)} } @@ -263,14 +263,14 @@ private[clustering] object BisectingKMeans { * @param stats pairs of cluster index and cluster statistics */ def initNextCenters( - data: RDD[(BigInt, BV[Double])], - stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BV[Double]] = { + data: RDD[(Long, BV[Double])], + stats: Map[Long, BisectingClusterStat]): Map[Long, BV[Double]] = { // Since the combination sampleByKey and groupByKey is more expensive, // this as follows would be better. val bcIndeces = data.sparkContext.broadcast(stats.keySet) val samples = data.mapPartitions { iter => - val map = mutable.Map.empty[BigInt, mutable.ArrayBuffer[BV[Double]]] + val map = mutable.Map.empty[Long, mutable.ArrayBuffer[BV[Double]]] bcIndeces.value.foreach {i => map(i) = mutable.ArrayBuffer.empty[BV[Double]]} val LOCAL_SAMPLE_SIZE = 100 @@ -310,8 +310,8 @@ private[clustering] object BisectingKMeans { * @param dividedClusters pairs of cluster index and cluster statistics */ def updateClusterIndex( - data: RDD[(BigInt, BV[Double])], - dividedClusters: Map[BigInt, BisectingClusterStat]): RDD[(BigInt, BV[Double])] = { + data: RDD[(Long, BV[Double])], + dividedClusters: Map[Long, BisectingClusterStat]): RDD[(Long, BV[Double])] = { // If there is no divided clusters, return the original if (dividedClusters.size == 0) { @@ -353,16 +353,16 @@ private[clustering] object BisectingKMeans { * @param maxIterations the maximum iterations to calculate clusters statistics */ def divideClusters( - data: RDD[(BigInt, BV[Double])], - clusterStats: Map[BigInt, BisectingClusterStat], - maxIterations: Int): Map[BigInt, BisectingClusterStat] = { + data: RDD[(Long, BV[Double])], + clusterStats: Map[Long, BisectingClusterStat], + maxIterations: Int): Map[Long, BisectingClusterStat] = { val sc = data.sparkContext val appName = sc.appName // get keys of dividable clusters val dividableClusterStats = clusterStats.filter { case (idx, cluster) => cluster.isDividable } if (dividableClusterStats.isEmpty) { - return Map.empty[BigInt, BisectingClusterStat] + return Map.empty[Long, BisectingClusterStat] } // extract dividable input data val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} @@ -373,7 +373,7 @@ private[clustering] object BisectingKMeans { val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val bcMetric = sc.broadcast(metric) // pairs of cluster index and (sums, #points, sumOfSquares) - var stats = Map.empty[BigInt, (BV[Double], Double, BV[Double])] + var stats = Map.empty[Long, (BV[Double], Double, BV[Double])] var subIter = 0 var totalStd = Double.MaxValue @@ -382,7 +382,7 @@ private[clustering] object BisectingKMeans { while (subIter < maxIterations && relativeError > 10E-4) { // calculate summary of each cluster val eachStats = dividableData.mapPartitions { iter => - val 
map = mutable.Map.empty[BigInt, (BV[Double], Double, BV[Double])] + val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] iter.foreach { case (idx, point) => // calculate next index number val childrenCenters = Array(2 * idx, 2 * idx + 1) @@ -434,8 +434,8 @@ private[clustering] object BisectingKMeans { * @param stats map of cluster stats which is described as a binary tree */ def createClusterNodes( - data: RDD[(BigInt, BV[Double])], - stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BisectingClusterNode] = { + data: RDD[(Long, BV[Double])], + stats: Map[Long, BisectingClusterStat]): Map[Long, BisectingClusterNode] = { // TODO: support other cost, such as entropy createClusterNodesWithAverageCost(data, stats) @@ -445,13 +445,13 @@ private[clustering] object BisectingKMeans { * Creates the map of cluster stats to the map of cluster nodes with their average costs */ private def createClusterNodesWithAverageCost( - data: RDD[(BigInt, BV[Double])], - stats: Map[BigInt, BisectingClusterStat]): Map[BigInt, BisectingClusterNode] = { + data: RDD[(Long, BV[Double])], + stats: Map[Long, BisectingClusterStat]): Map[Long, BisectingClusterNode] = { // calculate average costs of all clusters val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.mean }) val costs = data.mapPartitions { iter => - val counters = mutable.Map.empty[BigInt, (Long, Double)] + val counters = mutable.Map.empty[Long, (Long, Double)] bcCenters.value.foreach {case (i, center) => counters(i) = (0L, 0.0)} iter.foreach { case (i, point) => val cost = breezeNorm(bcCenters.value.apply(i) - point, 2.0) @@ -480,8 +480,8 @@ private[clustering] object BisectingKMeans { * @return a built cluster tree */ private def buildTree( - treeMap: Map[BigInt, BisectingClusterNode], - rootIndex: BigInt, + treeMap: Map[Long, BisectingClusterNode], + rootIndex: Long, numClusters: Int): Option[BisectingClusterNode] = { // if there is no index in the Map diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 875d2c4f694cd..0e49c6c7afe5c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -75,14 +75,14 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("summarize center stats") { val algo = new BisectingKMeans val local = Seq( - (BigInt(4), Vectors.dense(1.5, 1.5).toBreeze), - (BigInt(4), Vectors.dense(2.5, 2.5).toBreeze), - (BigInt(5), Vectors.dense(11.5, 11.5).toBreeze), - (BigInt(5), Vectors.dense(12.5, 12.5).toBreeze), - (BigInt(6), Vectors.dense(21.5, 21.5).toBreeze), - (BigInt(6), Vectors.dense(22.5, 22.5).toBreeze), - (BigInt(7), Vectors.dense(31.5, 31.5).toBreeze), - (BigInt(7), Vectors.dense(32.5, 32.5).toBreeze) + (4L, Vectors.dense(1.5, 1.5).toBreeze), + (4L, Vectors.dense(2.5, 2.5).toBreeze), + (5L, Vectors.dense(11.5, 11.5).toBreeze), + (5L, Vectors.dense(12.5, 12.5).toBreeze), + (6L, Vectors.dense(21.5, 21.5).toBreeze), + (6L, Vectors.dense(22.5, 22.5).toBreeze), + (7L, Vectors.dense(31.5, 31.5).toBreeze), + (7L, Vectors.dense(32.5, 32.5).toBreeze) ) val data = sc.parallelize(local) @@ -104,13 +104,13 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("initialize centers at next step") { val local = Seq( - (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 
1.1)), - (BigInt(3), BV[Double](1.9, 1.9)), (BigInt(3), BV[Double](2.1, 2.1)) + (2L, BV[Double](0.9, 0.9)), (2L, BV[Double](1.1, 1.1)), + (3L, BV[Double](1.9, 1.9)), (2L, BV[Double](2.1, 2.1)) ) val data = sc.parallelize(local) - val stats = Map[BigInt, BisectingClusterStat]( - BigInt(2) -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, 0.0), - BigInt(3) -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, 0.0) + val stats = Map[Long, BisectingClusterStat]( + 2L -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, 0.0), + 3L -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, 0.0) ) val initNextCenters = BisectingKMeans.initNextCenters(data, stats) assert(initNextCenters.size === 4) @@ -119,19 +119,19 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("should assign each data to new clusters") { val seed = Seq( - (BigInt(2), Vectors.dense(0.0, 0.0)), (BigInt(2), Vectors.dense(1.0, 1.0)), - (BigInt(2), Vectors.dense(2.0, 2.0)), (BigInt(2), Vectors.dense(3.0, 3.0)), - (BigInt(2), Vectors.dense(4.0, 4.0)), (BigInt(2), Vectors.dense(5.0, 5.0)), - (BigInt(3), Vectors.dense(6.0, 6.0)), (BigInt(3), Vectors.dense(7.0, 7.0)), - (BigInt(3), Vectors.dense(8.0, 8.0)), (BigInt(3), Vectors.dense(9.0, 9.0)), - (BigInt(3), Vectors.dense(10.0, 10.0)), (BigInt(3), Vectors.dense(11.0, 11.0)) + (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), + (2L, Vectors.dense(2.0, 2.0)), (2L, Vectors.dense(3.0, 3.0)), + (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 5.0)), + (3L, Vectors.dense(6.0, 6.0)), (3L, Vectors.dense(7.0, 7.0)), + (3L, Vectors.dense(8.0, 8.0)), (3L, Vectors.dense(9.0, 9.0)), + (3L, Vectors.dense(10.0, 10.0)), (3L, Vectors.dense(11.0, 11.0)) ).map { case (idx, vector) => (idx, vector.toBreeze) } val variance = breezeNorm(Vectors.dense(1.0, 1.0).toBreeze, 2.0) val newClusterStats = Map( - BigInt(4) -> new BisectingClusterStat(3L, BV[Double](1.0, 1.0) :* 3.0, variance), - BigInt(5) -> new BisectingClusterStat(3L, BV[Double](4.0, 4.0) :* 3.0, variance), - BigInt(6) -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, variance), - BigInt(7) -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, variance) + 4L -> new BisectingClusterStat(3L, BV[Double](1.0, 1.0) :* 3.0, variance), + 5L -> new BisectingClusterStat(3L, BV[Double](4.0, 4.0) :* 3.0, variance), + 6L -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, variance), + 7L -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, variance) ) val data = sc.parallelize(seed) val leafClusterStats = BisectingKMeans.summarizeClusters(data) @@ -175,10 +175,10 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("should divide clusters correctly") { val local = Seq( - (BigInt(2), BV[Double](0.9, 0.9)), (BigInt(2), BV[Double](1.1, 1.1)), - (BigInt(2), BV[Double](9.9, 9.9)), (BigInt(2), BV[Double](10.1, 10.1)), - (BigInt(3), BV[Double](99.9, 99.9)), (BigInt(3), BV[Double](100.1, 100.1)), - (BigInt(3), BV[Double](109.9, 109.9)), (BigInt(3), BV[Double](110.1, 110.1)) + (2L, BV[Double](0.9, 0.9)), (2L, BV[Double](1.1, 1.1)), + (2L, BV[Double](9.9, 9.9)), (2L, BV[Double](10.1, 10.1)), + (3L, BV[Double](99.9, 99.9)), (3L, BV[Double](100.1, 100.1)), + (3L, BV[Double](109.9, 109.9)), (3L, BV[Double](110.1, 110.1)) ) val data = sc.parallelize(local) val stats = BisectingKMeans.summarizeClusters(data) From e39f69ac70bedbce20d1a028d4fc32a79d88ef3f Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 16:12:59 
-0700 Subject: [PATCH 53/76] Modify `BisectingKMeansModel.predict` --- .../mllib/clustering/BisectingKMeans.scala | 21 +++++++++++++++++++ .../clustering/BisectingKMeansModel.scala | 15 ++++--------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index ed385d9454c76..afd35d74dc18a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -599,6 +599,27 @@ class BisectingClusterNode private ( } } + /** + * Finds a leaf which is the closest under the node + * + * @param point target point + */ + @Since("1.6.0") + def findClosestLeaf( + point: Vector, + metric: (BV[Double], BV[Double]) => Double + ): BisectingClusterNode = { + this.children.size match { + case 0 => this + case _ => { + val bv = point.toBreeze + val centers = this.children.map(_.center).map(_.toBreeze) + val closestIndex = BisectingKMeans.findClosestCenter(metric)(centers)(bv) + this.children(closestIndex).findClosestLeaf(point, metric) + } + } + } + /** * Gets the leaves nodes in the cluster tree */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 3c7eb0d50fb6e..8177cd360d28e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -48,9 +48,11 @@ class BisectingKMeansModel @Since("1.6.0") ( def predict(vector: Vector): Int = { // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val closestLeafNode = this.node.findClosestLeaf(vector, metric) + val closestCenter = closestLeafNode.center val centers = this.getCenters.map(_.toBreeze) - BisectingKMeans.findClosestCenter(metric)(centers)(vector.toBreeze) + BisectingKMeans.findClosestCenter(metric)(centers)(closestCenter.toBreeze) } /** @@ -59,16 +61,7 @@ class BisectingKMeansModel @Since("1.6.0") ( @Since("1.6.0") def predict(data: RDD[Vector]): RDD[Int] = { val sc = data.sparkContext - - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - sc.broadcast(metric) - val centers = this.getCenters.map(_.toBreeze) - sc.broadcast(centers) - - data.map{point => - BisectingKMeans.findClosestCenter(metric)(centers)(point.toBreeze) - } + data.map { p => predict(p) } } /** From 2e8226d509ead4c14940335bc44c110d4c02c52f Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 16:15:43 -0700 Subject: [PATCH 54/76] Organize import statements --- .../spark/mllib/clustering/BisectingKMeans.scala | 3 +-- .../clustering/JavaBisectingKMeansSuite.java | 16 ++++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index afd35d74dc18a..ff55221402e5d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -19,8 +19,7 @@ package org.apache.spark.mllib.clustering import 
scala.collection.{Map, mutable} -import breeze.linalg - .{SparseVector => BSV, Vector => BV, any => breezeAny, norm => breezeNorm, sum => breezeSum} +import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, sum => breezeSum} import org.apache.spark.{Logging, SparkException} import org.apache.spark.annotation.Since diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java index fb729cfcfa1a2..926bd54e54424 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java @@ -17,19 +17,19 @@ package org.apache.spark.mllib.clustering; +import java.io.Serializable; +import java.util.List; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import static org.junit.Assert.assertEquals; import com.google.common.collect.Lists; + import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.Vectors; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - -import java.io.Serializable; -import java.util.List; - -import static org.junit.Assert.assertEquals; public class JavaBisectingKMeansSuite implements Serializable { private transient JavaSparkContext sc; From 622499e95e2717625eecd11f5c43e0c9eddc820e Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 16:24:10 -0700 Subject: [PATCH 55/76] Modify the comment inside of `updateClusterIndex` --- .../spark/mllib/clustering/BisectingKMeans.scala | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index ff55221402e5d..47453fb46c169 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -319,7 +319,7 @@ private[clustering] object BisectingKMeans { // extract the centers of the clusters val sc = data.sparkContext - var centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.mean)} + val centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.mean)} val bcCenters = sc.broadcast(centers) // TODO Supports distance metrics other Euclidean distance metric @@ -328,17 +328,19 @@ private[clustering] object BisectingKMeans { // update the indexes to their children indexes data.map { case (idx, point) => - val childrenIndexes = Array(2 * idx, 2 * idx + 1).filter(c => bcCenters.value.contains(c)) - childrenIndexes.length match { + // TODO improve how to extract child indexes + val childIndexes = Array(2 * idx, 2 * idx + 1) + val extractedChildIndexes = childIndexes.filter(c => bcCenters.value.contains(c)) + extractedChildIndexes.length match { // update the indexes case s if s == 2 => { - val nextCenters = childrenIndexes.map(bcCenters.value(_)) + val nextCenters = extractedChildIndexes.map(bcCenters.value(_)) val closestIndex = BisectingKMeans .findClosestCenter(bcMetric.value)(nextCenters)(point) val nextIndex = 2 * idx + closestIndex (nextIndex, point) } - // stay the index if the number of children is not enough + // stay the index if a cluster which a point belongs wasn't divided case _ => (idx, point) } } 
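
The comment change above also makes the index bookkeeping easier to follow: a cluster with index i is bisected into children 2i and 2i+1, and each point is reassigned to whichever child center is closer, keeping its current index when its cluster was not divided. Below is a minimal, self-contained sketch of that rule, independent of the patch itself — the names are hypothetical, plain arrays stand in for Breeze vectors, and Euclidean distance is assumed.

// Hypothetical sketch of the index-update rule used during bisection:
// index i -> 2*i (left child) or 2*i + 1 (right child), chosen by the
// closer child center; the index is unchanged if no children exist.
object IndexUpdateExample {
  def euclidean(a: Array[Double], b: Array[Double]): Double =
    math.sqrt(a.zip(b).map { case (x, y) => (x - y) * (x - y) }.sum)

  def nextIndex(
      idx: Long,
      point: Array[Double],
      centers: Map[Long, Array[Double]]): Long = {
    val children = Seq(2 * idx, 2 * idx + 1).filter(centers.contains)
    if (children.size == 2) {
      // offset is 0 for the left child, 1 for the right child
      val offset = children.map(centers).zipWithIndex
        .minBy { case (center, _) => euclidean(center, point) }._2
      2 * idx + offset
    } else {
      idx // the cluster this point belongs to was not divided
    }
  }

  def main(args: Array[String]): Unit = {
    val centers = Map(2L -> Array(0.0, 0.0), 3L -> Array(10.0, 10.0))
    println(nextIndex(1L, Array(1.0, 1.0), centers)) // 2
    println(nextIndex(1L, Array(9.0, 9.0), centers)) // 3
    println(nextIndex(5L, Array(9.0, 9.0), centers)) // 5 (cluster 5 has no children)
  }
}
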
From 267908aa577d0732e322a2bde420571fea34ed3a Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 17:21:02 -0700 Subject: [PATCH 56/76] Change the default value of `k` to 2 --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 47453fb46c169..7dd3ec5439ae1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -73,7 +73,7 @@ class BisectingKMeans private ( * Constructs with the default configuration */ @Since("1.6.0") - def this() = this(20, 20, 1) + def this() = this(2, 20, 1) /** * Sets the number of clusters you want From a664f6d639d8fd2afb8df40cc7eb38bb5aa79923 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 17:30:31 -0700 Subject: [PATCH 57/76] Change `10E-4` to `10e-4` --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 7dd3ec5439ae1..9a62c1746405d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -380,7 +380,7 @@ private[clustering] object BisectingKMeans { var totalStd = Double.MaxValue var oldTotalStd = Double.MaxValue var relativeError = Double.MaxValue - while (subIter < maxIterations && relativeError > 10E-4) { + while (subIter < maxIterations && relativeError > 10e-4) { // calculate summary of each cluster val eachStats = dividableData.mapPartitions { iter => val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] From 165e191755bb641f10c604f1ec214248f5a104e7 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 26 Oct 2015 17:35:52 -0700 Subject: [PATCH 58/76] Remove `toLinkageMatrix` and `toAdjacencyList` from `BisectingKMeans` --- .../mllib/clustering/BisectingKMeans.scala | 53 ------------------- 1 file changed, 53 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 9a62c1746405d..ae06b0151641b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -656,59 +656,6 @@ class BisectingClusterNode private ( @Since("1.6.0") def setLocalHeight(height: Double): Unit = this.localHeight = height - - /** - * Converts to an adjacency list - * - * @return List[(fromNodeId, toNodeId, distance)] - */ - @Since("1.6.0") - def toAdjacencyList: Array[(Int, Int, Double)] = { - val nodes = toArray - - var adjacencyList = Array.empty[(Int, Int, Double)] - nodes.foreach { parent => - if (parent.children.size > 1) { - val parentIndex = nodes.indexOf(parent) - parent.children.foreach { child => - val childIndex = nodes.indexOf(child) - adjacencyList = adjacencyList :+(parentIndex, childIndex, parent.localHeight) - } - } - } - adjacencyList - } - - /** - * Converts to a linkage matrix - * Returned data format is fit for scipy's dendrogram function - * - * @return List[(node1, node2, distance, 
tree size)] - */ - @Since("1.6.0") - def toLinkageMatrix: Array[(Int, Int, Double, Int)] = { - val nodes = toArray.sortWith { case (a, b) => a.getHeight < b.getHeight} - val leaves = nodes.filter(_.isLeaf) - val notLeaves = nodes.filterNot(_.isLeaf).filter(_.getChildren.size > 1) - val clusters = leaves ++ notLeaves - val treeMap = clusters.zipWithIndex.map { case (node, idx) => node -> idx}.toMap - - // If a node only has one-child, the child is regarded as the cluster of the child. - // Cluster A has cluster B and Cluster B. B is a leaf. C only has cluster D. - // ==> A merge list is (B, D), not (B, C). - def getIndex(map: Map[BisectingClusterNode, Int], node: BisectingClusterNode): Int = { - node.children.size match { - case 1 => getIndex(map, node.children.head) - case _ => map(node) - } - } - clusters.filterNot(_.isLeaf).map { node => - (getIndex(treeMap, node.children.head), - getIndex(treeMap, node.children(1)), - node.getHeight, - node.toArray.filter(_.isLeaf).length) - } - } } From c417dd10e74892f3d6b594438d7e47aba6335e65 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 28 Oct 2015 14:41:14 -0700 Subject: [PATCH 59/76] Change 10e-4 to 1e-4 --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index ae06b0151641b..b46e2a092f23d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -380,7 +380,7 @@ private[clustering] object BisectingKMeans { var totalStd = Double.MaxValue var oldTotalStd = Double.MaxValue var relativeError = Double.MaxValue - while (subIter < maxIterations && relativeError > 10e-4) { + while (subIter < maxIterations && relativeError > 1e-4) { // calculate summary of each cluster val eachStats = dividableData.mapPartitions { iter => val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] From e2f696671fa073ab64110d56ba508ed6154f7231 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 28 Oct 2015 22:56:04 -0700 Subject: [PATCH 60/76] Fix a test for the default value of k --- .../apache/spark/mllib/clustering/BisectingKMeansSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 0e49c6c7afe5c..71c8fa66a232a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -53,7 +53,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("setNumClusters") { val algo = new BisectingKMeans() - assert(algo.getK == 20) + assert(algo.getK == 2) algo.setK(1000) assert(algo.getK == 1000) } From 69e09103de8af2b1804eaa33d810813aaa35138f Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 28 Oct 2015 15:55:14 -0700 Subject: [PATCH 61/76] Replace `mapPartition(...).reduceByKey(...)` with `aggregateByKey` --- .../mllib/clustering/BisectingKMeans.scala | 37 ++++++++++--------- .../clustering/BisectingKMeansSuite.scala | 8 ++-- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index b46e2a092f23d..d04d45462fe62 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -227,24 +227,27 @@ private[clustering] object BisectingKMeans { * @param data pairs of point and its cluster index */ def summarizeClusters(data: RDD[(Long, BV[Double])]): Map[Long, BisectingClusterStat] = { - - data.mapPartitions { iter => - // calculate the accumulation of the all point in a partition and count the rows - val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] - iter.foreach { case (idx: Long, point: BV[Double]) => - // get a map value or else get a sparse vector - val (sumBV, n, sumOfSquares) = map - .getOrElse(idx, (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size))) - map(idx) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) + val dimension = data.first()._2.size + // zeroValue: (#rows, sum of vectors, sum of squares) + val zeroValue = (0L, BV.zeros[Double](dimension), 0.0) + val seqOp = (acc: (Long, BV[Double], Double), point: BV[Double]) => { + val n = acc._1 + 1L + val sums = acc._2 + point + val sumOfSquares = acc._3 + math.pow(breezeSum(point), 2.0) + (n, sums, sumOfSquares) + } + val comOp = + (acc1: (Long, BV[Double], Double), acc2: (Long, BV[Double], Double)) => + (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc2._3) + + val stats = data.aggregateByKey(zeroValue)(seqOp, comOp) + stats.map { case (i, (n, sums, sumOfSquare)) => + val meanPoint = calcMean(n, sums) + val variance = n match { + case n if n < 2 => 0.0 + case _ => (sumOfSquare / n) - math.pow(breezeSum(sums) / n, 2.0) } - map.toIterator - }.reduceByKey { case ((sum1, n1, sumOfSquares1), (sum2, n2, sumOfSquares2)) => - // sum the accumulation and the count in the all partition - (sum1 + sum2, n1 + n2, sumOfSquares1 + sumOfSquares2) - }.map { case (i, (sum, n, sumOfSquares)) => - val mean = calcMean(n.toLong, sum) - val variance = getVariance(n.toLong, sum, sumOfSquares) - (i, new BisectingClusterStat(n.toLong, mean, variance)) + (i, new BisectingClusterStat(n, meanPoint, variance)) }.collectAsMap() } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 71c8fa66a232a..6eaa40bb26378 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -89,16 +89,16 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val clusterStats = BisectingKMeans.summarizeClusters(data) assert(clusterStats.size === 4) assert(clusterStats(4).mean === Vectors.dense(2.0, 2.0).toBreeze) - assert(clusterStats(4).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(4).variance ~== 1.0 absTol 10e-4) assert(clusterStats(4).rows === 2) assert(clusterStats(5).mean === Vectors.dense(12.0, 12.0).toBreeze) - assert(clusterStats(5).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(5).variance ~== 1.0 absTol 10e-4) assert(clusterStats(5).rows === 2) assert(clusterStats(6).mean === Vectors.dense(22.0, 22.0).toBreeze) - assert(clusterStats(6).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(6).variance ~== 1.0 absTol 10e-4) assert(clusterStats(6).rows === 2) assert(clusterStats(7).mean === Vectors.dense(32.0, 
32.0).toBreeze) - assert(clusterStats(7).variance ~== 0.3535 absTol 10e-4) + assert(clusterStats(7).variance ~== 1.0 absTol 10e-4) assert(clusterStats(7).rows === 2) } From 675bafb23296f71c0007b73db6e003214cd0dcf9 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 28 Oct 2015 23:15:32 -0700 Subject: [PATCH 62/76] Change `sumOfSquares` vector to scholar at `divideClusters` --- .../mllib/clustering/BisectingKMeans.scala | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index d04d45462fe62..2a19679f94d1a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -377,7 +377,7 @@ private[clustering] object BisectingKMeans { val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val bcMetric = sc.broadcast(metric) // pairs of cluster index and (sums, #points, sumOfSquares) - var stats = Map.empty[Long, (BV[Double], Double, BV[Double])] + var stats = Map.empty[Long, (BV[Double], Double, Double)] var subIter = 0 var totalStd = Double.MaxValue @@ -386,7 +386,7 @@ private[clustering] object BisectingKMeans { while (subIter < maxIterations && relativeError > 1e-4) { // calculate summary of each cluster val eachStats = dividableData.mapPartitions { iter => - val map = mutable.Map.empty[Long, (BV[Double], Double, BV[Double])] + val map = mutable.Map.empty[Long, (BV[Double], Double, Double)] iter.foreach { case (idx, point) => // calculate next index number val childrenCenters = Array(2 * idx, 2 * idx + 1) @@ -399,9 +399,9 @@ private[clustering] object BisectingKMeans { val (sumBV, n, sumOfSquares) = map .getOrElse( nextIndex, - (BSV.zeros[Double](point.size), 0.0, BSV.zeros[Double](point.size)) + (BSV.zeros[Double](point.size), 0.0, 0.0) ) - map(nextIndex) = (sumBV + point, n + 1.0, sumOfSquares + (point :* point)) + map(nextIndex) = (sumBV + point, n + 1.0, sumOfSquares + math.pow(breezeSum(point), 2.0)) } } map.toIterator @@ -417,17 +417,17 @@ private[clustering] object BisectingKMeans { // update summary of each cluster stats = eachStats.toMap - totalStd = stats.map { case (idx, (sum, n, sumOfSquares)) => - breezeSum((sumOfSquares :/ n) :- breezeNorm(sum :/ n, 2.0)) + totalStd = stats.map { case (idx, (sums, n, sumOfSquares)) => + (sumOfSquares / n) - math.pow(breezeSum(sums), 2.0) }.sum relativeError = math.abs(oldTotalStd - totalStd) / totalStd oldTotalStd = totalStd subIter += 1 } - stats.map { case (i, (sums, rows, sumOfSquares)) => - val mean = calcMean(rows.toLong, sums) - val variance = getVariance(rows.toLong, sums, sumOfSquares) - i -> new BisectingClusterStat(rows.toLong, mean, variance) + stats.map { case (i, (sums, n, sumOfSquares)) => + val mean = calcMean(n.toLong, sums) + val variance = (sumOfSquares / n) - math.pow(breezeSum(sums), 2.0) + i -> new BisectingClusterStat(n.toLong, mean, variance) } } From 704e145ec73b9d519b02a8c16ceecfe61056bdd5 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 28 Oct 2015 23:51:14 -0700 Subject: [PATCH 63/76] Replace a chain of `mapPartition` and `reduceByKey` with `aggregateByKey` at `divideClusters` --- .../mllib/clustering/BisectingKMeans.scala | 72 +++++++++---------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 2a19679f94d1a..77d3b9ea4aa2a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -372,56 +372,54 @@ private[clustering] object BisectingKMeans { val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} var newCenters = initNextCenters(dividableData, dividableClusterStats) - var bcNewCenters = sc.broadcast(newCenters) // TODO Supports distance metrics other Euclidean distance metric val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) val bcMetric = sc.broadcast(metric) // pairs of cluster index and (sums, #points, sumOfSquares) - var stats = Map.empty[Long, (BV[Double], Double, Double)] + var stats = Map.empty[Long, (BV[Double], Long, Double)] var subIter = 0 - var totalStd = Double.MaxValue - var oldTotalStd = Double.MaxValue + var totalSumOfSquares = Double.MaxValue + var oldTotalSumOfSquares = Double.MaxValue var relativeError = Double.MaxValue + val dimension = dividableData.first()._2.size + // TODO add a set method for the threshold, instead of 1e-4 while (subIter < maxIterations && relativeError > 1e-4) { - // calculate summary of each cluster - val eachStats = dividableData.mapPartitions { iter => - val map = mutable.Map.empty[Long, (BV[Double], Double, Double)] - iter.foreach { case (idx, point) => - // calculate next index number - val childrenCenters = Array(2 * idx, 2 * idx + 1) - .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) - if (childrenCenters.length == 2) { - val closestIndex = findClosestCenter(bcMetric.value)(childrenCenters)(point) - val nextIndex = 2 * idx + closestIndex - - // get a map value or else get a sparse vector - val (sumBV, n, sumOfSquares) = map - .getOrElse( - nextIndex, - (BSV.zeros[Double](point.size), 0.0, 0.0) - ) - map(nextIndex) = (sumBV + point, n + 1.0, sumOfSquares + math.pow(breezeSum(point), 2.0)) - } + // convert each index into the closest child index + val bcNewCenters = sc.broadcast(newCenters) + val nextData = dividableData.map { case (idx, point) => + // calculate next index number + val childIndexes = Array(2 * idx, 2 * idx + 1) + val childrenCenters = childIndexes + .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) + if (childrenCenters.length != 2) { + new SparkException(s"A node whose index is ${idx} doesn't have two children") } - map.toIterator - }.reduceByKey { case ((sv1, n1, sumOfSquares1), (sv2, n2, sumOfSquares2)) => - // sum the accumulation and the count in the all partition - (sv1 + sv2, n1 + n2, sumOfSquares1 + sumOfSquares2) - }.collect().toMap + val closestIndex = findClosestCenter(bcMetric.value)(childrenCenters)(point) + val nextIndex = 2 * idx + closestIndex + (nextIndex, point) + } - // calculate the center of each cluster - newCenters = eachStats.map { case (idx, (sum, n, sumOfSquares)) => (idx, sum :/ n)} - bcNewCenters = sc.broadcast(newCenters) + // summarize each cluster + val zeroValue = (BV.zeros[Double](dimension), 0L, 0.0) + val seqOp = (acc: (BV[Double], Long, Double), point: BV[Double]) => { + val sums = acc._1 + point + val n = acc._2 + 1L + val sumOfSquares = acc._3 + (point dot point) + (sums, n, sumOfSquares) + } + val comOp = (acc1: (BV[Double], Long, Double), acc2: (BV[Double], Long, Double)) => + (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc2._3) + val tempStats = 
nextData.aggregateByKey(zeroValue)(seqOp, comOp).collectAsMap() + // calculate the center of each cluster + newCenters = tempStats.map {case (idx, (sums, n, sumOfSquares)) => (idx, sums :/ n.toDouble)} // update summary of each cluster - stats = eachStats.toMap + stats = tempStats.toMap - totalStd = stats.map { case (idx, (sums, n, sumOfSquares)) => - (sumOfSquares / n) - math.pow(breezeSum(sums), 2.0) - }.sum - relativeError = math.abs(oldTotalStd - totalStd) / totalStd - oldTotalStd = totalStd + totalSumOfSquares = stats.map{case (idx, (sums, n, sumOfSquares)) => sumOfSquares}.sum + relativeError = math.abs(totalSumOfSquares - oldTotalSumOfSquares) / totalSumOfSquares + oldTotalSumOfSquares = totalSumOfSquares subIter += 1 } stats.map { case (i, (sums, n, sumOfSquares)) => From ee3ea621a5839fd04ae174553d809a2ad57d2127 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 00:35:54 -0700 Subject: [PATCH 64/76] Modify `getMinimumNumNodesInTree` with `1 << multiplier` --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 77d3b9ea4aa2a..1166cf108060b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -214,11 +214,9 @@ private[clustering] object BisectingKMeans { * @param k: the number of leaf nodes */ def getMinimumNumNodesInTree(k: Int): Int = { - val multiplier = math.ceil(math.log(k) / math.log(2.0)) // the calculation is same as `math.pow(2, multiplier)` - var numNodes = 2 - (1 to multiplier.toInt).foreach (i => numNodes = numNodes << 1) - numNodes + val multiplier = math.ceil(math.log(k) / math.log(2.0)) + 1 + 1 << multiplier.toInt } /** From 73e2c7afd21b29443f6dfe106b87af0cc68944eb Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 00:37:10 -0700 Subject: [PATCH 65/76] Rename `numClusters` parameter to `k` --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 1166cf108060b..ae2d6016648e0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -476,13 +476,13 @@ private[clustering] object BisectingKMeans { * * @param treeMap divided clusters as a Map class * @param rootIndex index you want to start - * @param numClusters the number of clusters you want + * @param k the number of clusters you want * @return a built cluster tree */ private def buildTree( treeMap: Map[Long, BisectingClusterNode], rootIndex: Long, - numClusters: Int): Option[BisectingClusterNode] = { + k: Int): Option[BisectingClusterNode] = { // if there is no index in the Map if (!treeMap.contains(rootIndex)) return None @@ -491,7 +491,7 @@ private[clustering] object BisectingKMeans { var numLeavesClusters = 1 val root = treeMap(rootIndex) var leavesQueue = Map(rootIndex -> root) - while (leavesQueue.nonEmpty && numLeavesClusters < numClusters) { + while (leavesQueue.nonEmpty && numLeavesClusters < k) { // pick up the largest cluster by the maximum cost of all the clusters val 
mostScattered = leavesQueue.maxBy(_._2.cost) val mostScatteredKey = mostScattered._1 From a876ba233e197bb5228a05fe2946523af1b40ee6 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 00:38:29 -0700 Subject: [PATCH 66/76] Rename a variable in `BisectingKMeansModelSuite` --- .../mllib/clustering/BisectingKMeansModelSuite.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index 667c96eb72d70..3c0892c225d5c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -104,11 +104,11 @@ class BisectingKMeansModelSuite } test("clustering should be done correctly") { - for (numClusters <- Array(9, 19)) { - val app = new BisectingKMeans().setK(numClusters).setSeed(1) + for (k <- Array(9, 19)) { + val app = new BisectingKMeans().setK(k).setSeed(1) val localData = (1 to 19).toSeq.map { i => - val label = i % numClusters - val sparseVector = Vectors.sparse(numClusters, Seq((label, label.toDouble))) + val label = i % k + val sparseVector = Vectors.sparse(k, Seq((label, label.toDouble))) val denseVector = Vectors.fromBreeze(sparseVector.toBreeze.toDenseVector) (label, denseVector, sparseVector) } @@ -116,13 +116,13 @@ class BisectingKMeansModelSuite // dense version val denseData = sc.parallelize(localData.map(_._2), 2) val denseModel = app.run(denseData) - assert(denseModel.getCenters.length === numClusters) + assert(denseModel.getCenters.length === k) assert(denseModel.getClusters.forall(_.cost == 0.0)) // sparse version val sparseData = sc.parallelize(localData.map(_._3), 2) val sparseModel = app.run(sparseData) - assert(sparseModel.getCenters.length === numClusters) + assert(sparseModel.getCenters.length === k) assert(sparseModel.getClusters.forall(_.cost == 0.0)) } } From 1985feafa6254167de6b2d650312456c7bacba57 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 12:06:18 -0700 Subject: [PATCH 67/76] tmp --- .../mllib/clustering/BisectingKMeans.scala | 43 ++++++++----------- .../clustering/BisectingKMeansSuite.scala | 8 ++-- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index ae2d6016648e0..84c3f18d9debb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -225,28 +225,23 @@ private[clustering] object BisectingKMeans { * @param data pairs of point and its cluster index */ def summarizeClusters(data: RDD[(Long, BV[Double])]): Map[Long, BisectingClusterStat] = { - val dimension = data.first()._2.size - // zeroValue: (#rows, sum of vectors, sum of squares) - val zeroValue = (0L, BV.zeros[Double](dimension), 0.0) - val seqOp = (acc: (Long, BV[Double], Double), point: BV[Double]) => { - val n = acc._1 + 1L - val sums = acc._2 + point - val sumOfSquares = acc._3 + math.pow(breezeSum(point), 2.0) - (n, sums, sumOfSquares) + // sum the number of node and points of each cluster + val stats = data.map {case (idx, p) => + (idx, (p, 1L)) + }.reduceByKey {case ((p1, n1), (p2, n2)) => (p1 + p2, n1 + n2) }.collectAsMap() + + // calculate within-cluster 
sum of squares of each cluster + val bcStats = data.sparkContext.broadcast(stats) + val sumOfSquaresMap = data.map { case (idx, point) => + val meanPoint = bcStats.value.apply(idx)._1 :/ bcStats.value.apply(idx)._2.toDouble + (idx, (point - meanPoint) dot (point - meanPoint)) + }.reduceByKey(_ + _).collectAsMap() + + stats.map { case (idx, (sumPoint, n)) => + val meanPoint = sumPoint :/ n.toDouble + val sumOfSquares = sumOfSquaresMap(idx) + (idx, new BisectingClusterStat(n, meanPoint, sumOfSquares)) } - val comOp = - (acc1: (Long, BV[Double], Double), acc2: (Long, BV[Double], Double)) => - (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc2._3) - - val stats = data.aggregateByKey(zeroValue)(seqOp, comOp) - stats.map { case (i, (n, sums, sumOfSquare)) => - val meanPoint = calcMean(n, sums) - val variance = n match { - case n if n < 2 => 0.0 - case _ => (sumOfSquare / n) - math.pow(breezeSum(sums) / n, 2.0) - } - (i, new BisectingClusterStat(n, meanPoint, variance)) - }.collectAsMap() } /** @@ -663,14 +658,14 @@ class BisectingClusterNode private ( * * @param rows the number of points * @param mean the sum of points - * @param variance the sum of squares of points + * @param sumOfSquares the sum of squares of points */ private[clustering] case class BisectingClusterStat ( rows: Long, mean: BV[Double], - variance: Double) extends Serializable { + sumOfSquares: Double) extends Serializable { - def isDividable: Boolean = variance > 0 && rows >= 2 + def isDividable: Boolean = sumOfSquares > 0 && rows >= 2 } private[clustering] object BisectingClusterStat { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 6eaa40bb26378..1ef02676294b3 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -89,16 +89,16 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val clusterStats = BisectingKMeans.summarizeClusters(data) assert(clusterStats.size === 4) assert(clusterStats(4).mean === Vectors.dense(2.0, 2.0).toBreeze) - assert(clusterStats(4).variance ~== 1.0 absTol 10e-4) + assert(clusterStats(4).sumOfSquares ~== 1.0 absTol 10e-4) assert(clusterStats(4).rows === 2) assert(clusterStats(5).mean === Vectors.dense(12.0, 12.0).toBreeze) - assert(clusterStats(5).variance ~== 1.0 absTol 10e-4) + assert(clusterStats(5).sumOfSquares ~== 1.0 absTol 10e-4) assert(clusterStats(5).rows === 2) assert(clusterStats(6).mean === Vectors.dense(22.0, 22.0).toBreeze) - assert(clusterStats(6).variance ~== 1.0 absTol 10e-4) + assert(clusterStats(6).sumOfSquares ~== 1.0 absTol 10e-4) assert(clusterStats(6).rows === 2) assert(clusterStats(7).mean === Vectors.dense(32.0, 32.0).toBreeze) - assert(clusterStats(7).variance ~== 1.0 absTol 10e-4) + assert(clusterStats(7).sumOfSquares ~== 1.0 absTol 10e-4) assert(clusterStats(7).rows === 2) } From 629f8970ebcfc78086a98911a9bfbf9a2b0b0db0 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 14:14:51 -0700 Subject: [PATCH 68/76] Make this implementation more simple --- .../mllib/clustering/BisectingKMeans.scala | 95 +++++++------------ .../clustering/BisectingKMeansSuite.scala | 30 +++--- 2 files changed, 48 insertions(+), 77 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 84c3f18d9debb..8f9c5668683a2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -119,46 +119,42 @@ class BisectingKMeans private ( @Since("1.6.0") def run(input: RDD[Vector]): BisectingKMeansModel = { val sc = input.sparkContext + val startTime = System.currentTimeMillis() + var data = initData(input).cache() + // this is used for managing calculated cached RDDs + var updatedDataHistory = Array.empty[RDD[(Long, BV[Double])]] // `clusterStats` is described as binary tree structure // `clusterStats(1)` means the root of a binary tree - var clusterStats = mutable.Map.empty[Long, BisectingClusterStat] - var step = 1 - var noMoreDividable = false - var updatedDataHistory = Array.empty[RDD[(Long, BV[Double])]] + var leafClusterStats = summarizeClusters(data) + var dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) + var clusterStats = leafClusterStats + // the minimum number of nodes of a binary tree by given parameter + var step = 1 val numNodeLimit = getMinimumNumNodesInTree(this.k) - // divide clusters until the number of clusters reachs the condition // or there is no dividable cluster - val startTime = System.currentTimeMillis() - var data = initData(input).cache() - while (clusterStats.size < numNodeLimit && noMoreDividable == false) { + while (clusterStats.size < numNodeLimit && dividableLeafClusters.nonEmpty) { logInfo(s"${sc.appName} starts step ${step}") + + // can be clustered if the number of divided clusterStats is equal to 0 // TODO Remove non-leaf cluster stats from `leafClusterStats` - val leafClusterStats = summarizeClusters(data) - val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) + val dividedData = divideClusters(data, dividableLeafClusters, maxIterations).cache() + leafClusterStats = summarizeClusters(dividedData) + dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) clusterStats = clusterStats ++ leafClusterStats - // can be clustered if the number of divided clusterStats is equal to 0 - val divided = divideClusters(data, dividableLeafClusters, maxIterations) // update each index - val newData = updateClusterIndex(data, divided).cache() - updatedDataHistory = updatedDataHistory ++ Array(data) - data = newData + updatedDataHistory = updatedDataHistory ++ Array(dividedData) + data = dividedData // keep recent 2 cached RDDs in order to run more quickly if (updatedDataHistory.length > 1) { val head = updatedDataHistory.head updatedDataHistory = updatedDataHistory.tail head.unpersist() } - clusterStats = clusterStats ++ divided step += 1 - logInfo(s"${sc.appName} adding ${divided.size} new clusterStats at step:${step}") - - if (dividableLeafClusters.isEmpty) { - noMoreDividable = true - } } // create a map of cluster node with their costs val nodes = createClusterNodes(data, clusterStats) @@ -190,8 +186,6 @@ class BisectingKMeans private ( private[clustering] object BisectingKMeans { - import BisectingClusterStat._ - val ROOT_INDEX_KEY: Long = 1 /** @@ -239,7 +233,7 @@ private[clustering] object BisectingKMeans { stats.map { case (idx, (sumPoint, n)) => val meanPoint = sumPoint :/ n.toDouble - val sumOfSquares = sumOfSquaresMap(idx) + val sumOfSquares = math.abs(sumOfSquaresMap(idx)) (idx, new BisectingClusterStat(n, meanPoint, sumOfSquares)) } } @@ -309,7 +303,7 @@ private[clustering] object BisectingKMeans { 
dividedClusters: Map[Long, BisectingClusterStat]): RDD[(Long, BV[Double])] = { // If there is no divided clusters, return the original - if (dividedClusters.size == 0) { + if (dividedClusters.isEmpty) { return data } @@ -352,14 +346,14 @@ private[clustering] object BisectingKMeans { def divideClusters( data: RDD[(Long, BV[Double])], clusterStats: Map[Long, BisectingClusterStat], - maxIterations: Int): Map[Long, BisectingClusterStat] = { + maxIterations: Int): RDD[(Long, BV[Double])] = { val sc = data.sparkContext val appName = sc.appName // get keys of dividable clusters val dividableClusterStats = clusterStats.filter { case (idx, cluster) => cluster.isDividable } if (dividableClusterStats.isEmpty) { - return Map.empty[Long, BisectingClusterStat] + return data } // extract dividable input data val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} @@ -371,6 +365,7 @@ private[clustering] object BisectingKMeans { // pairs of cluster index and (sums, #points, sumOfSquares) var stats = Map.empty[Long, (BV[Double], Long, Double)] + var nextData = data var subIter = 0 var totalSumOfSquares = Double.MaxValue var oldTotalSumOfSquares = Double.MaxValue @@ -380,7 +375,7 @@ private[clustering] object BisectingKMeans { while (subIter < maxIterations && relativeError > 1e-4) { // convert each index into the closest child index val bcNewCenters = sc.broadcast(newCenters) - val nextData = dividableData.map { case (idx, point) => + nextData = dividableData.map { case (idx, point) => // calculate next index number val childIndexes = Array(2 * idx, 2 * idx + 1) val childrenCenters = childIndexes @@ -398,28 +393,22 @@ private[clustering] object BisectingKMeans { val seqOp = (acc: (BV[Double], Long, Double), point: BV[Double]) => { val sums = acc._1 + point val n = acc._2 + 1L - val sumOfSquares = acc._3 + (point dot point) - (sums, n, sumOfSquares) + val sumOfNorm = acc._3 + (point dot point) + (sums, n, sumOfNorm) } val comOp = (acc1: (BV[Double], Long, Double), acc2: (BV[Double], Long, Double)) => - (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc2._3) + (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc1._3) val tempStats = nextData.aggregateByKey(zeroValue)(seqOp, comOp).collectAsMap() // calculate the center of each cluster - newCenters = tempStats.map {case (idx, (sums, n, sumOfSquares)) => (idx, sums :/ n.toDouble)} - // update summary of each cluster - stats = tempStats.toMap + newCenters = tempStats.map {case (idx, (sums, n, sumOfNorm)) => (idx, sums :/ n.toDouble)} - totalSumOfSquares = stats.map{case (idx, (sums, n, sumOfSquares)) => sumOfSquares}.sum + totalSumOfSquares = stats.map{case (idx, (sums, n, sumOfNorm)) => sumOfNorm}.sum relativeError = math.abs(totalSumOfSquares - oldTotalSumOfSquares) / totalSumOfSquares oldTotalSumOfSquares = totalSumOfSquares subIter += 1 } - stats.map { case (i, (sums, n, sumOfSquares)) => - val mean = calcMean(n.toLong, sums) - val variance = (sumOfSquares / n) - math.pow(breezeSum(sums), 2.0) - i -> new BisectingClusterStat(n.toLong, mean, variance) - } + nextData } /** @@ -442,27 +431,11 @@ private[clustering] object BisectingKMeans { private def createClusterNodesWithAverageCost( data: RDD[(Long, BV[Double])], stats: Map[Long, BisectingClusterStat]): Map[Long, BisectingClusterNode] = { - - // calculate average costs of all clusters - val bcCenters = data.sparkContext.broadcast(stats.map { case (i, stat) => i -> stat.mean }) - val costs = data.mapPartitions { iter => - val counters = mutable.Map.empty[Long, (Long, 
Double)] - bcCenters.value.foreach {case (i, center) => counters(i) = (0L, 0.0)} - iter.foreach { case (i, point) => - val cost = breezeNorm(bcCenters.value.apply(i) - point, 2.0) - counters(i) = (counters(i)._1 + 1, counters(i)._2 + cost) - } - counters.toIterator - }.reduceByKey { case((n1, cost1), (n2, cost2)) => - (n1 + n2, cost1 + cost2) - }.collectAsMap() - - stats.map { case (i, stat) => - val avgCost = costs(i)._1 match { - case x if x == 0.0 => 0.0 - case _ => costs(i)._2 / costs(i)._1 - } - i -> new BisectingClusterNode(Vectors.fromBreeze(stat.mean), stat.rows, avgCost) + stats.map { case (idx, clusterStats) => + val rows = clusterStats.rows + val center = clusterStats.mean + val cost = math.sqrt(clusterStats.sumOfSquares) / rows + idx -> new BisectingClusterNode(Vectors.fromBreeze(center), rows, cost) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 1ef02676294b3..1177e3b293de2 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -33,7 +33,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val data = sc.parallelize(localSeed, 2) val model = algo.run(data) assert(model.getClusters.length == 123) - assert(model.node.getHeight ~== 705.6925 absTol 10E-4) + assert(model.node.getHeight ~== 702.8641 absTol 10E-4) // check the relations between a parent cluster and its children assert(model.node.getParent === None) @@ -133,11 +133,10 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { 6L -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, variance), 7L -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, variance) ) - val data = sc.parallelize(seed) + val data = sc.parallelize(seed, 1) val leafClusterStats = BisectingKMeans.summarizeClusters(data) val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) - val divided = BisectingKMeans.divideClusters(data, dividableLeafClusters, 20) - val result = BisectingKMeans.updateClusterIndex(data, divided).collect().toSeq + val result = BisectingKMeans.divideClusters(data, dividableLeafClusters, 20).collect() val expected = Seq( (4, Vectors.dense(0.0, 0.0)), (4, Vectors.dense(1.0, 1.0)), (4, Vectors.dense(2.0, 2.0)), @@ -180,19 +179,18 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { (3L, BV[Double](99.9, 99.9)), (3L, BV[Double](100.1, 100.1)), (3L, BV[Double](109.9, 109.9)), (3L, BV[Double](110.1, 110.1)) ) - val data = sc.parallelize(local) + val data = sc.parallelize(local, 1) val stats = BisectingKMeans.summarizeClusters(data) - val newClusterStats = BisectingKMeans.divideClusters(data, stats, 20) - - assert(newClusterStats.size === 4) - assert(newClusterStats(4).mean === BV[Double](1.0, 1.0)) - assert(newClusterStats(4).rows === 2) - assert(newClusterStats(5).mean === BV[Double](10.0, 10.0)) - assert(newClusterStats(5).rows === 2) - assert(newClusterStats(6).mean === BV[Double](100.0, 100.0)) - assert(newClusterStats(6).rows === 2) - assert(newClusterStats(7).mean === BV[Double](110.0, 110.0)) - assert(newClusterStats(7).rows === 2) + val dividedData = BisectingKMeans.divideClusters(data, stats, 20).collect() + + assert(dividedData(0) == (4L, BV[Double](0.9, 0.9))) + assert(dividedData(1) == (4L, BV[Double](1.1, 1.1))) + assert(dividedData(2) == (5L, 
BV[Double](9.9, 9.9))) + assert(dividedData(3) == (5L, BV[Double](10.1, 10.1))) + assert(dividedData(4) == (6L, BV[Double](99.9, 99.9))) + assert(dividedData(5) == (6L, BV[Double](100.1, 100.1))) + assert(dividedData(6) == (7L, BV[Double](109.9, 109.9))) + assert(dividedData(7) == (7L, BV[Double](110.1, 110.1))) } } From 1f84ded143748ea9dae3f13eb737de394ba0f6e9 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 14:28:26 -0700 Subject: [PATCH 69/76] Reorganize import statements and adjust parameters and return values --- .../mllib/clustering/BisectingKMeans.scala | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 8f9c5668683a2..493849fa477a6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -17,9 +17,7 @@ package org.apache.spark.mllib.clustering -import scala.collection.{Map, mutable} - -import breeze.linalg.{SparseVector => BSV, Vector => BV, norm => breezeNorm, sum => breezeSum} +import breeze.linalg.{Vector => BV, norm => breezeNorm} import org.apache.spark.{Logging, SparkException} import org.apache.spark.annotation.Since @@ -218,7 +216,10 @@ private[clustering] object BisectingKMeans { * * @param data pairs of point and its cluster index */ - def summarizeClusters(data: RDD[(Long, BV[Double])]): Map[Long, BisectingClusterStat] = { + def summarizeClusters( + data: RDD[(Long, BV[Double])] + ): collection.Map[Long, BisectingClusterStat] = { + // sum the number of node and points of each cluster val stats = data.map {case (idx, p) => (idx, (p, 1L)) @@ -253,15 +254,15 @@ private[clustering] object BisectingKMeans { */ def initNextCenters( data: RDD[(Long, BV[Double])], - stats: Map[Long, BisectingClusterStat]): Map[Long, BV[Double]] = { + stats: collection.Map[Long, BisectingClusterStat]): collection.Map[Long, BV[Double]] = { // Since the combination sampleByKey and groupByKey is more expensive, // this as follows would be better. val bcIndeces = data.sparkContext.broadcast(stats.keySet) val samples = data.mapPartitions { iter => - val map = mutable.Map.empty[Long, mutable.ArrayBuffer[BV[Double]]] + val map = collection.mutable.Map.empty[Long, collection.mutable.ArrayBuffer[BV[Double]]] - bcIndeces.value.foreach {i => map(i) = mutable.ArrayBuffer.empty[BV[Double]]} + bcIndeces.value.foreach {i => map(i) = collection.mutable.ArrayBuffer.empty[BV[Double]]} val LOCAL_SAMPLE_SIZE = 100 iter.foreach { case (i, point) => map(i).append(point) @@ -269,14 +270,14 @@ private[clustering] object BisectingKMeans { // the number of elements is cut off at the right time. 
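// Concretely, once a cluster's local buffer exceeds LOCAL_SAMPLE_SIZE it is
// collapsed to just the point with the smallest L2 norm and the point with the
// largest L2 norm; those two extremes are all that is needed later, because each
// dividable cluster i is seeded with exactly two child centers (2 * i and 2 * i + 1).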
if (map(i).size > LOCAL_SAMPLE_SIZE) { val elements = map(i).sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) - map(i) = mutable.ArrayBuffer(elements.head, elements.last) + map(i) = collection.mutable.ArrayBuffer(elements.head, elements.last) } } // in order to reduce the shuffle size, take only two elements map.filterNot(_._2.isEmpty).map { case (i, points) => val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) - i -> mutable.ArrayBuffer(elements.head, elements.last) + i -> collection.mutable.ArrayBuffer(elements.head, elements.last) }.toIterator }.reduceByKey { case (points1, points2) => points1.union(points2) @@ -345,7 +346,7 @@ private[clustering] object BisectingKMeans { */ def divideClusters( data: RDD[(Long, BV[Double])], - clusterStats: Map[Long, BisectingClusterStat], + clusterStats: collection.Map[Long, BisectingClusterStat], maxIterations: Int): RDD[(Long, BV[Double])] = { val sc = data.sparkContext val appName = sc.appName @@ -419,8 +420,8 @@ private[clustering] object BisectingKMeans { */ def createClusterNodes( data: RDD[(Long, BV[Double])], - stats: Map[Long, BisectingClusterStat]): Map[Long, BisectingClusterNode] = { - + stats: collection.Map[Long, BisectingClusterStat] + ): collection.Map[Long, BisectingClusterNode] = { // TODO: support other cost, such as entropy createClusterNodesWithAverageCost(data, stats) } @@ -430,7 +431,9 @@ private[clustering] object BisectingKMeans { */ private def createClusterNodesWithAverageCost( data: RDD[(Long, BV[Double])], - stats: Map[Long, BisectingClusterStat]): Map[Long, BisectingClusterNode] = { + stats: collection.Map[Long, BisectingClusterStat] + ): collection.Map[Long, BisectingClusterNode] = { + stats.map { case (idx, clusterStats) => val rows = clusterStats.rows val center = clusterStats.mean @@ -448,7 +451,7 @@ private[clustering] object BisectingKMeans { * @return a built cluster tree */ private def buildTree( - treeMap: Map[Long, BisectingClusterNode], + treeMap: collection.Map[Long, BisectingClusterNode], rootIndex: Long, k: Int): Option[BisectingClusterNode] = { From 12b322392682dce1be666ad20e5cb72eb2f49197 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 14:30:34 -0700 Subject: [PATCH 70/76] Rename `WSSSE` to `computeCost` --- .../apache/spark/mllib/clustering/BisectingKMeansModel.scala | 4 ++-- .../spark/mllib/clustering/BisectingKMeansModelSuite.scala | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 8177cd360d28e..38f4695eb0d26 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -75,7 +75,7 @@ class BisectingKMeansModel @Since("1.6.0") ( * Computes Within Set Sum of Squared Error(WSSSE) */ @Since("1.6.0") - def WSSSE(data: RDD[Vector]): Double = { + def computeCost(data: RDD[Vector]): Double = { val bvCenters = this.getCenters.map(_.toBreeze) data.context.broadcast(bvCenters) val distances = data.map {point => @@ -90,7 +90,7 @@ class BisectingKMeansModel @Since("1.6.0") ( } @Since("1.6.0") - def WSSSE(data: JavaRDD[Vector]): Double = this.WSSSE(data.rdd) + def computeCost(data: JavaRDD[Vector]): Double = this.computeCost(data.rdd) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala index 3c0892c225d5c..ceac039efc8d0 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala @@ -62,7 +62,7 @@ class BisectingKMeansModelSuite assert(predicted === localData.map(_._1)) // compute WSSSE - assert(model.WSSSE(data) === 0.0) + assert(model.computeCost(data) === 0.0) } test("clustering sparse vectors") { @@ -100,7 +100,7 @@ class BisectingKMeansModelSuite assert(predicted === localData.map(_._1)) // compute WSSSE - assert(model.WSSSE(data) === 0.0) + assert(model.computeCost(data) === 0.0) } test("clustering should be done correctly") { From ef4a3e86ff1289a4508beb600f147699752b7987 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 14:35:02 -0700 Subject: [PATCH 71/76] Remove `updateClusterIndex` --- .../mllib/clustering/BisectingKMeans.scala | 44 ------------------- 1 file changed, 44 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 493849fa477a6..72a56b9635f16 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -293,50 +293,6 @@ private[clustering] object BisectingKMeans { nextCenters } - /** - * Updates the indexes of clusters which is divided to its children indexes - * - * @param data pairs of point and its cluster index - * @param dividedClusters pairs of cluster index and cluster statistics - */ - def updateClusterIndex( - data: RDD[(Long, BV[Double])], - dividedClusters: Map[Long, BisectingClusterStat]): RDD[(Long, BV[Double])] = { - - // If there is no divided clusters, return the original - if (dividedClusters.isEmpty) { - return data - } - - // extract the centers of the clusters - val sc = data.sparkContext - val centers = dividedClusters.map { case (idx, cluster) => (idx, cluster.mean)} - val bcCenters = sc.broadcast(centers) - - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val bcMetric = sc.broadcast(metric) - - // update the indexes to their children indexes - data.map { case (idx, point) => - // TODO improve how to extract child indexes - val childIndexes = Array(2 * idx, 2 * idx + 1) - val extractedChildIndexes = childIndexes.filter(c => bcCenters.value.contains(c)) - extractedChildIndexes.length match { - // update the indexes - case s if s == 2 => { - val nextCenters = extractedChildIndexes.map(bcCenters.value(_)) - val closestIndex = BisectingKMeans - .findClosestCenter(bcMetric.value)(nextCenters)(point) - val nextIndex = 2 * idx + closestIndex - (nextIndex, point) - } - // stay the index if a cluster which a point belongs wasn't divided - case _ => (idx, point) - } - } - } - /** * Divides clusters according to their statistics * From 57b06bab01fc9b1404f87ec4c4a8319963a6894c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 14:35:24 -0700 Subject: [PATCH 72/76] Remove BisectingClusterStat object --- .../spark/mllib/clustering/BisectingKMeans.scala | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 72a56b9635f16..00fbcb4519d7e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -599,19 +599,3 @@ private[clustering] case class BisectingClusterStat ( def isDividable: Boolean = sumOfSquares > 0 && rows >= 2 } - -private[clustering] object BisectingClusterStat { - // calculate a mean vector - def calcMean(rows: Long, sums: BV[Double]): BV[Double] = sums :/ rows.toDouble - - // calculate a variance - def getVariance(rows: Long, sums: BV[Double], sumOfSquares: BV[Double]): Double = { - val variances: BV[Double] = rows match { - case n if n > 1 => sumOfSquares.:/(n.toDouble) - (sums :* sums).:/(n.toDouble * n.toDouble) - case _ => BV.zeros[Double](sums.size) - } - breezeNorm(variances, 2.0) - } -} - - From 5da05d3fd41d5b730d19f595ca9bf622aaf0a14d Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 14:42:49 -0700 Subject: [PATCH 73/76] Fix minors --- .../mllib/clustering/BisectingKMeans.scala | 28 +++++++++---------- .../clustering/BisectingKMeansSuite.scala | 6 +++- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 00fbcb4519d7e..9354a53884b33 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -119,11 +119,11 @@ class BisectingKMeans private ( val sc = input.sparkContext val startTime = System.currentTimeMillis() var data = initData(input).cache() - // this is used for managing calculated cached RDDs var updatedDataHistory = Array.empty[RDD[(Long, BV[Double])]] - // `clusterStats` is described as binary tree structure + // `clusterStats` is described as binary tree structure as Map // `clusterStats(1)` means the root of a binary tree + // `clusterStats(2n)` and `clusterStats(2n+1)` are the children of `clusterStats(n)` var leafClusterStats = summarizeClusters(data) var dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) var clusterStats = leafClusterStats @@ -143,16 +143,15 @@ class BisectingKMeans private ( dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) clusterStats = clusterStats ++ leafClusterStats - // update each index + // keep recent 2 cached RDDs in order to run more quickly updatedDataHistory = updatedDataHistory ++ Array(dividedData) data = dividedData - // keep recent 2 cached RDDs in order to run more quickly + step += 1 if (updatedDataHistory.length > 1) { val head = updatedDataHistory.head updatedDataHistory = updatedDataHistory.tail head.unpersist() } - step += 1 } // create a map of cluster node with their costs val nodes = createClusterNodes(data, clusterStats) @@ -312,24 +311,25 @@ private[clustering] object BisectingKMeans { if (dividableClusterStats.isEmpty) { return data } + // extract dividable input data val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} - + // get next initial centers var newCenters = initNextCenters(dividableData, dividableClusterStats) - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val bcMetric = sc.broadcast(metric) - // pairs of cluster index and (sums, 
#points, sumOfSquares) - var stats = Map.empty[Long, (BV[Double], Long, Double)] - var nextData = data var subIter = 0 var totalSumOfSquares = Double.MaxValue var oldTotalSumOfSquares = Double.MaxValue var relativeError = Double.MaxValue val dimension = dividableData.first()._2.size - // TODO add a set method for the threshold, instead of 1e-4 + + // TODO Supports distance metrics other Euclidean distance metric + val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) + val bcMetric = sc.broadcast(metric) + while (subIter < maxIterations && relativeError > 1e-4) { + // TODO add a set method for the threshold, instead of 1e-4 + // convert each index into the closest child index val bcNewCenters = sc.broadcast(newCenters) nextData = dividableData.map { case (idx, point) => @@ -360,7 +360,7 @@ private[clustering] object BisectingKMeans { // calculate the center of each cluster newCenters = tempStats.map {case (idx, (sums, n, sumOfNorm)) => (idx, sums :/ n.toDouble)} - totalSumOfSquares = stats.map{case (idx, (sums, n, sumOfNorm)) => sumOfNorm}.sum + totalSumOfSquares = tempStats.map{case (idx, (sums, n, sumOfNorm)) => sumOfNorm}.sum relativeError = math.abs(totalSumOfSquares - oldTotalSumOfSquares) / totalSumOfSquares oldTotalSumOfSquares = totalSumOfSquares subIter += 1 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 1177e3b293de2..f0947e336cf7c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -28,7 +28,8 @@ import org.apache.spark.mllib.util.TestingUtils._ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { test("run") { - val algo = new BisectingKMeans().setK(123).setSeed(1) + val k = 123 + val algo = new BisectingKMeans().setK(k).setSeed(1) val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq val data = sc.parallelize(localSeed, 2) val model = algo.run(data) @@ -40,6 +41,9 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model.node.getChildren.head.getParent.get === model.node) assert(model.node.getChildren.apply(1).getParent.get === model.node) assert(model.getClusters.forall(_.getParent.isDefined)) + + val predicted = model.predict(data) + assert(predicted.distinct.count() === k) } test("run with too many cluster size than the records") { From a50689a144bf5801c831d0b2f33eb6435e87f929 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 29 Oct 2015 16:13:55 -0700 Subject: [PATCH 74/76] Improve `initNextCenters` --- .../mllib/clustering/BisectingKMeans.scala | 63 +++++++------------ .../clustering/BisectingKMeansSuite.scala | 6 +- 2 files changed, 26 insertions(+), 43 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 9354a53884b33..1601d6c84e217 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -17,8 +17,9 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{Vector => BV, norm => breezeNorm} +import breeze.linalg.{Vector => BV, SparseVector => BSV, norm => breezeNorm} +import org.apache.spark.util.random.XORShiftRandom 
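// The XORShiftRandom import added above supports the new seeding strategy in
// initNextCenters below: rather than sampling two extreme points per cluster,
// the parent center is perturbed in both directions to obtain the two child
// seeds. A rough sketch of that idea, with illustrative names (`stat`, `noise`)
// that are not part of this patch:
//
//   val stdev = math.sqrt(stat.sumOfSquares) / stat.rows   // cluster spread
//   // noise: a vector of uniform draws in [0, stdev) over the center's indices
//   val (leftSeed, rightSeed) = (stat.mean - noise, stat.mean + noise)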
import org.apache.spark.{Logging, SparkException} import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{Vector, Vectors} @@ -138,7 +139,7 @@ class BisectingKMeans private ( // can be clustered if the number of divided clusterStats is equal to 0 // TODO Remove non-leaf cluster stats from `leafClusterStats` - val dividedData = divideClusters(data, dividableLeafClusters, maxIterations).cache() + val dividedData = divideClusters(data, dividableLeafClusters, maxIterations, seed).cache() leafClusterStats = summarizeClusters(dividedData) dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) clusterStats = clusterStats ++ leafClusterStats @@ -248,47 +249,24 @@ private[clustering] object BisectingKMeans { /** * Gets the initial centers for bisecting k-means * - * @param data pairs of point and its cluster index * @param stats pairs of cluster index and cluster statistics + * @param seed random seed */ def initNextCenters( - data: RDD[(Long, BV[Double])], - stats: collection.Map[Long, BisectingClusterStat]): collection.Map[Long, BV[Double]] = { - - // Since the combination sampleByKey and groupByKey is more expensive, - // this as follows would be better. - val bcIndeces = data.sparkContext.broadcast(stats.keySet) - val samples = data.mapPartitions { iter => - val map = collection.mutable.Map.empty[Long, collection.mutable.ArrayBuffer[BV[Double]]] - - bcIndeces.value.foreach {i => map(i) = collection.mutable.ArrayBuffer.empty[BV[Double]]} - val LOCAL_SAMPLE_SIZE = 100 - iter.foreach { case (i, point) => - map(i).append(point) - // to avoid to increase the memory usage on each map thread, - // the number of elements is cut off at the right time. - if (map(i).size > LOCAL_SAMPLE_SIZE) { - val elements = map(i).sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) - map(i) = collection.mutable.ArrayBuffer(elements.head, elements.last) - } - } + stats: collection.Map[Long, BisectingClusterStat], + seed: Long + ): collection.Map[Long, BV[Double]] = { - // in order to reduce the shuffle size, take only two elements - map.filterNot(_._2.isEmpty).map { case (i, points) => - val elements = map(i).toSeq.sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) - i -> collection.mutable.ArrayBuffer(elements.head, elements.last) - }.toIterator - }.reduceByKey { case (points1, points2) => - points1.union(points2) - }.collect() - - val nextCenters = samples.flatMap { case (i, points) => - val elements = points.toSeq.sortWith((a, b) => breezeNorm(a, 2.0) < breezeNorm(b, 2.0)) - Array((2 * i, elements.head), (2 * i + 1, elements.last)) + val random = new XORShiftRandom() + random.setSeed(seed) + val nextCenters = stats.flatMap { case (idx, clusterStats) => + val center = clusterStats.mean + val stdev = math.sqrt(clusterStats.sumOfSquares) / clusterStats.rows + val activeKeys = clusterStats.mean.activeKeysIterator.toArray + val activeValues = activeKeys.map(i => random.nextDouble() * stdev) + val perturbation = new BSV[Double](activeKeys, activeValues, clusterStats.mean.size) + Array((2 * idx, center - perturbation), (2 * idx + 1, center + perturbation)) }.toMap - if (!stats.keySet.flatMap(idx => Array(2 * idx, 2 * idx + 1)).forall(nextCenters.contains(_))) { - throw new SparkException("Failed to initialize centers for next step") - } nextCenters } @@ -298,11 +276,15 @@ private[clustering] object BisectingKMeans { * @param data pairs of point and its cluster index * @param clusterStats target clusters to divide * @param maxIterations the maximum iterations to calculate 
clusters statistics + * @param seed random seed */ def divideClusters( data: RDD[(Long, BV[Double])], clusterStats: collection.Map[Long, BisectingClusterStat], - maxIterations: Int): RDD[(Long, BV[Double])] = { + maxIterations: Int, + seed: Long + ): RDD[(Long, BV[Double])] = { + val sc = data.sparkContext val appName = sc.appName @@ -315,7 +297,7 @@ private[clustering] object BisectingKMeans { // extract dividable input data val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} // get next initial centers - var newCenters = initNextCenters(dividableData, dividableClusterStats) + var newCenters = initNextCenters(dividableClusterStats, seed) var nextData = data var subIter = 0 var totalSumOfSquares = Double.MaxValue @@ -596,6 +578,7 @@ private[clustering] case class BisectingClusterStat ( rows: Long, mean: BV[Double], sumOfSquares: Double) extends Serializable { + require(sumOfSquares >= 0.0) def isDividable: Boolean = sumOfSquares > 0 && rows >= 2 } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index f0947e336cf7c..74e12d00c2022 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -116,7 +116,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { 2L -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, 0.0), 3L -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, 0.0) ) - val initNextCenters = BisectingKMeans.initNextCenters(data, stats) + val initNextCenters = BisectingKMeans.initNextCenters(stats, 1) assert(initNextCenters.size === 4) assert(initNextCenters.keySet === Set(4, 5, 6, 7)) } @@ -140,7 +140,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val data = sc.parallelize(seed, 1) val leafClusterStats = BisectingKMeans.summarizeClusters(data) val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) - val result = BisectingKMeans.divideClusters(data, dividableLeafClusters, 20).collect() + val result = BisectingKMeans.divideClusters(data, dividableLeafClusters, 20, 1).collect() val expected = Seq( (4, Vectors.dense(0.0, 0.0)), (4, Vectors.dense(1.0, 1.0)), (4, Vectors.dense(2.0, 2.0)), @@ -185,7 +185,7 @@ class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { ) val data = sc.parallelize(local, 1) val stats = BisectingKMeans.summarizeClusters(data) - val dividedData = BisectingKMeans.divideClusters(data, stats, 20).collect() + val dividedData = BisectingKMeans.divideClusters(data, stats, 20, 1).collect() assert(dividedData(0) == (4L, BV[Double](0.9, 0.9))) assert(dividedData(1) == (4L, BV[Double](1.1, 1.1))) From d422be759b967ab5c4bef0f6a34aa53f1a3a4c77 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Mon, 9 Nov 2015 00:13:21 -0800 Subject: [PATCH 75/76] refactor --- .../mllib/clustering/BisectingKMeans.scala | 771 ++++++++---------- .../clustering/BisectingKMeansModel.scala | 73 +- .../clustering/JavaBisectingKMeansSuite.java | 83 +- .../BisectingKMeansModelSuite.scala | 129 --- .../clustering/BisectingKMeansSuite.scala | 296 ++++--- 5 files changed, 537 insertions(+), 815 deletions(-) delete mode 100644 mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 1601d6c84e217..9a7916f75dcc7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -17,53 +17,45 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{Vector => BV, SparseVector => BSV, norm => breezeNorm} +import java.util.Random -import org.apache.spark.util.random.XORShiftRandom -import org.apache.spark.{Logging, SparkException} -import org.apache.spark.annotation.Since -import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.rdd.RDD +import scala.collection.mutable +import org.apache.spark.Logging +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.RDD +import org.apache.spark.storage.StorageLevel /** - * This is a divisive hierarchical clustering algorithm based on bisecting k-means algorithm. - * - * The main idea of this algorithm is based on "A comparison of document clustering techniques", - * M. Steinbach, G. Karypis and V. Kumar. Workshop on Text Mining, KDD, 2000. - * http://cs.fit.edu/~pkc/classes/ml-internet/papers/steinbach00tr.pdf - * - * However, we modified it to fit for Spark. This algorithm consists of the two main parts. + * A bisecting k-means algorithm based on the paper "A comparison of document clustering techniques" + * by Steinbach, Karypis, and Kumar, with modification to fit Spark. + * The algorithm starts from a single cluster that contains all points. + * Iteratively it finds divisible clusters on the bottom level and bisects each of them using + * k-means, until there are `k` leaf clusters in total or no leaf clusters are divisible. + * The bisecting steps of clusters on the same level are grouped together to increase parallelism. + * If bisecting all divisible clusters on the bottom level would result more than `k` leaf clusters, + * larger clusters get higher priority. * - * 1. Split clusters until the number of clusters will be enough to build a cluster tree - * 2. Build a cluster tree as a binary tree by the splitted clusters + * @param k the desired number of leaf clusters (default: 4). The actual number could be smaller if + * there are no divisible leaf clusters. + * @param maxIterations the max number of k-means iterations to split clusters (default: 20) + * @param minDivisibleClusterSize the minimum number of points (if >= 1.0) or the minimum proportion + * of points (if < 1.0) of a divisible cluster (default: 1) + * @param seed a random seed (default: hash value of the class name) * - * First, it splits clusters to their children clusters step by step, not considering a cluster - * will be included in the final cluster tree or not. That's because it makes the algorithm more - * efficient on Spark and splitting a cluster one by one is very slow. It will keep splitting until - * the number of clusters will be enough to build a cluster tree. Otherwise, it will stop splitting - * when there are no dividable clusters before the number of clusters will be sufficient. And - * it calculates the costs, such as average cost, entropy and so on, for building a cluster - * tree in the first part. The costs means how large the cluster is. That is, the cluster - * whose cost is maximum of all the clusters is the largest cluster. 
- * - * Second, it builds a cluster tree as a binary tree by the result of the first part. - * First of all, the cluster tree starts with only the root cluster which includes all points. - * So, there are two candidates which can be merged to the cluster tree. Those are the children of - * the root. Then, it picks up the larger child of the two and merge it to the cluster tree. - * After that, there are tree candidates to merge. Those are the smaller child of the root and - * the two children of the larger cluster of the root. It picks up the largest cluster of the tree - * and merge it to the * cluster tree. Like this, it continues to pick up the largest one of the - * candidates and merge it to the cluster tree until the desired number of clusters is reached. - * - * @param k tne desired number of clusters - * @param maxIterations the number of maximal iterations to split clusters - * @param seed a random seed + * @see [[http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf + * Steinbach, Karypis, and Kumar, A comparison of document clustering techniques, + * KDD Workshop on Text Mining, 2000.]] */ @Since("1.6.0") +@Experimental class BisectingKMeans private ( private var k: Int, private var maxIterations: Int, + private var minDivisibleClusterSize: Double, private var seed: Long) extends Logging { import BisectingKMeans._ @@ -72,34 +64,62 @@ class BisectingKMeans private ( * Constructs with the default configuration */ @Since("1.6.0") - def this() = this(2, 20, 1) + def this() = this(4, 20, 1.0, classOf[BisectingKMeans].getName.##) /** - * Sets the number of clusters you want + * Sets the desired number of leaf clusters (default: 4). + * The actual number could be smaller if there are no divisible leaf clusters. */ @Since("1.6.0") def setK(k: Int): this.type = { + require(k > 0, s"k must be positive but got $k.") this.k = k this } + /** + * Gets the desired number of leaf clusters. + */ @Since("1.6.0") def getK: Int = this.k /** - * Sets the number of maximal iterations in each clustering step + * Sets the max number of k-means iterations to split clusters (default: 20). */ @Since("1.6.0") def setMaxIterations(maxIterations: Int): this.type = { + require(maxIterations > 0, s"maxIterations must be positive but got $maxIterations.") this.maxIterations = maxIterations this } + /** + * Gets the max number of k-means iterations to split clusters. + */ @Since("1.6.0") def getMaxIterations: Int = this.maxIterations /** - * Sets the random seed + * Sets the minimum number of points (if >= `1.0`) or the minimum proportion of points + * (if < `1.0`) of a divisible cluster (default: 1). + */ + @Since("1.6.0") + def setMinDivisibleClusterSize(minDivisibleClusterSize: Double): this.type = { + require(minDivisibleClusterSize > 0.0, + s"minDivisibleClusterSize must be positive but got $minDivisibleClusterSize.") + this.minDivisibleClusterSize = minDivisibleClusterSize + this + } + + /** + * Gets the minimum number of points (if >= `1.0`) or the minimum proportion of points + * (if < `1.0`) of a divisible cluster. + */ + @Since("1.6.0") + def getMinDivisibleClusterSize: Double = minDivisibleClusterSize + + /** + * Sets the random seed (default: hash value of the class name). */ @Since("1.6.0") def setSeed(seed: Long): this.type = { @@ -107,478 +127,363 @@ class BisectingKMeans private ( this } + /** + * Gets the random seed. + */ @Since("1.6.0") def getSeed: Long = this.seed /** - * Runs the bisecting k-means algorithm + * Runs the bisecting k-means algorithm. 
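   * An illustrative call sequence (the data and parameter values below are
   * hypothetical examples, not taken from this patch):
   * {{{
   *   val points = sc.parallelize(Seq(
   *     Vectors.dense(0.0, 0.0), Vectors.dense(1.0, 1.0),
   *     Vectors.dense(9.0, 9.0), Vectors.dense(10.0, 10.0)))
   *   val model = new BisectingKMeans().setK(2).setMaxIterations(20).setSeed(1L).run(points)
   *   val wssse = model.computeCost(points)
   * }}}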
* @param input RDD of vectors * @return model for the bisecting kmeans */ @Since("1.6.0") def run(input: RDD[Vector]): BisectingKMeansModel = { - val sc = input.sparkContext - val startTime = System.currentTimeMillis() - var data = initData(input).cache() - var updatedDataHistory = Array.empty[RDD[(Long, BV[Double])]] - - // `clusterStats` is described as binary tree structure as Map - // `clusterStats(1)` means the root of a binary tree - // `clusterStats(2n)` and `clusterStats(2n+1)` are the children of `clusterStats(n)` - var leafClusterStats = summarizeClusters(data) - var dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) - var clusterStats = leafClusterStats - - // the minimum number of nodes of a binary tree by given parameter - var step = 1 - val numNodeLimit = getMinimumNumNodesInTree(this.k) - // divide clusters until the number of clusters reachs the condition - // or there is no dividable cluster - while (clusterStats.size < numNodeLimit && dividableLeafClusters.nonEmpty) { - logInfo(s"${sc.appName} starts step ${step}") - - // can be clustered if the number of divided clusterStats is equal to 0 - // TODO Remove non-leaf cluster stats from `leafClusterStats` - val dividedData = divideClusters(data, dividableLeafClusters, maxIterations, seed).cache() - leafClusterStats = summarizeClusters(dividedData) - dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) - clusterStats = clusterStats ++ leafClusterStats - - // keep recent 2 cached RDDs in order to run more quickly - updatedDataHistory = updatedDataHistory ++ Array(dividedData) - data = dividedData - step += 1 - if (updatedDataHistory.length > 1) { - val head = updatedDataHistory.head - updatedDataHistory = updatedDataHistory.tail - head.unpersist() - } + if (input.getStorageLevel == StorageLevel.NONE) { + logWarning(s"The input RDD ${input.id} is not directly cached, which may hurt performance if" + + " its parent RDDs are also not cached.") } - // create a map of cluster node with their costs - val nodes = createClusterNodes(data, clusterStats) - // unpersist RDDs - data.unpersist() - updatedDataHistory.foreach(_.unpersist()) - - // build a cluster tree by Map class which is expressed - logInfo(s"Building the cluster tree is started in ${sc.appName}") - val root = buildTree(nodes, ROOT_INDEX_KEY, this.k) - if (root.isEmpty) { - new SparkException("Failed to build a cluster tree from a Map type of clusterStats") + val d = input.map(_.size).first() + logInfo(s"Feature dimension: $d.") + // Compute and cache vector norms for fast distance computation. 
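    // Together with the center norms, the cached point norms let the assignment
    // steps evaluate squared Euclidean distances as
    //   ||x - c||^2 = ||x||^2 + ||c||^2 - 2 * (x dot c)
    // so each point's norm is computed once here rather than once per candidate
    // center, and the same cached norm also feeds the cost aggregation below.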
+ val norms = input.map(v => Vectors.norm(v, 2.0)).persist(StorageLevel.MEMORY_AND_DISK) + val vectors = input.zip(norms).map { case (x, norm) => new VectorWithNorm(x, norm) } + var assignments = vectors.map(v => (ROOT_INDEX, v)) + var activeClusters = summarize(d, assignments) + val rootSummary = activeClusters(ROOT_INDEX) + val n = rootSummary.size + logInfo(s"Number of points: $n.") + logInfo(s"Initial cost: ${rootSummary.cost}.") + val minSize = if (minDivisibleClusterSize >= 1.0) { + math.ceil(minDivisibleClusterSize).toLong + } else { + math.ceil(minDivisibleClusterSize * n).toLong } - - // set the elapsed time for training - val finishTime = (System.currentTimeMillis() - startTime) / 1000.0 - logInfo(s"Elapsed Time for ${this.getClass.getSimpleName} Training: ${finishTime} [sec]") - - // make a bisecting kmeans model - val model = new BisectingKMeansModel(root.get) - val leavesNodes = model.getClusters - if (leavesNodes.length < this.k) { - logWarning(s"# clusters is less than you want: ${leavesNodes.length} / ${k}") + logInfo(s"The minimum number of points of a divisible cluster is $minSize.") + var inactiveClusters = mutable.Seq.empty[(Long, ClusterSummary)] + val random = new Random(seed) + var numLeafClustersNeeded = k - 1 + var level = 1 + while (activeClusters.nonEmpty && numLeafClustersNeeded > 0 && level < 63) { + // Divisible clusters are sufficiently large and have non-trivial cost. + var divisibleClusters = activeClusters.filter { case (_, summary) => + (summary.size >= minSize) && (summary.cost > MLUtils.EPSILON * summary.size) + } + // If we don't need all divisible clusters, take the larger ones. + if (divisibleClusters.size > numLeafClustersNeeded) { + divisibleClusters = divisibleClusters.toSeq.sortBy { case (_, summary) => + -summary.size + }.take(numLeafClustersNeeded) + .toMap + } + if (divisibleClusters.nonEmpty) { + val divisibleIndices = divisibleClusters.keys.toSet + logInfo(s"Dividing ${divisibleIndices.size} clusters on level $level.") + var newClusterCenters = divisibleClusters.flatMap { case (index, summary) => + val (left, right) = splitCenter(summary.center, random) + Iterator((leftChildIndex(index), left), (rightChildIndex(index), right)) + }.map(identity) // workaround for a Scala bug (SI-7005) that produces a not serializable map + var newClusters: Map[Long, ClusterSummary] = null + var newAssignments: RDD[(Long, VectorWithNorm)] = null + for (iter <- 0 until maxIterations) { + newAssignments = updateAssignments(assignments, divisibleIndices, newClusterCenters) + .filter { case (index, _) => + divisibleIndices.contains(parentIndex(index)) + } + newClusters = summarize(d, newAssignments) + newClusterCenters = newClusters.mapValues(_.center).map(identity) + } + // TODO: Unpersist old indices. + val indices = updateAssignments(assignments, divisibleIndices, newClusterCenters).keys + .persist(StorageLevel.MEMORY_AND_DISK) + assignments = indices.zip(vectors) + inactiveClusters ++= activeClusters + activeClusters = newClusters + numLeafClustersNeeded -= divisibleClusters.size + } else { + logInfo(s"None active and divisible clusters left on level $level. 
Stop iterations.") + inactiveClusters ++= activeClusters + activeClusters = Map.empty + } + level += 1 } - model + val clusters = activeClusters ++ inactiveClusters + val root = buildTree(clusters) + new BisectingKMeansModel(root) } + + /** + * Java-friendly version of [[run(RDD[Vector])*]] + */ + def run(data: JavaRDD[Vector]): BisectingKMeansModel = run(data.rdd) } +private object BisectingKMeans extends Serializable { -private[clustering] object BisectingKMeans { + /** The index of the root node of a tree. */ + private val ROOT_INDEX: Long = 1 - val ROOT_INDEX_KEY: Long = 1 + private val MAX_DIVISIBLE_CLUSTER_INDEX: Long = Long.MaxValue / 2 - /** - * Finds the closes cluster's center - * - * @param metric a distance metric - * @param centers centers of the clusters - * @param point a target point - * @return an index of the array of clusters - */ - def findClosestCenter(metric: (BV[Double], BV[Double]) => Double) - (centers: Seq[BV[Double]])(point: BV[Double]): Int = { - // get the closest index - centers.zipWithIndex.map { case (center, idx) => (metric(center, point), idx)}.minBy(_._1)._2 - } - - /** - * Gets the minimum number of nodes in a tree by the number of leaves - * - * @param k: the number of leaf nodes - */ - def getMinimumNumNodesInTree(k: Int): Int = { - // the calculation is same as `math.pow(2, multiplier)` - val multiplier = math.ceil(math.log(k) / math.log(2.0)) + 1 - 1 << multiplier.toInt + /** Returns the left child index of the given node index. */ + private def leftChildIndex(index: Long): Long = { + require(index <= MAX_DIVISIBLE_CLUSTER_INDEX, s"Child index out of bound: 2 * $index.") + 2 * index } - /** - * Summarizes data by each cluster as Map - * - * @param data pairs of point and its cluster index - */ - def summarizeClusters( - data: RDD[(Long, BV[Double])] - ): collection.Map[Long, BisectingClusterStat] = { - - // sum the number of node and points of each cluster - val stats = data.map {case (idx, p) => - (idx, (p, 1L)) - }.reduceByKey {case ((p1, n1), (p2, n2)) => (p1 + p2, n1 + n2) }.collectAsMap() - - // calculate within-cluster sum of squares of each cluster - val bcStats = data.sparkContext.broadcast(stats) - val sumOfSquaresMap = data.map { case (idx, point) => - val meanPoint = bcStats.value.apply(idx)._1 :/ bcStats.value.apply(idx)._2.toDouble - (idx, (point - meanPoint) dot (point - meanPoint)) - }.reduceByKey(_ + _).collectAsMap() - - stats.map { case (idx, (sumPoint, n)) => - val meanPoint = sumPoint :/ n.toDouble - val sumOfSquares = math.abs(sumOfSquaresMap(idx)) - (idx, new BisectingClusterStat(n, meanPoint, sumOfSquares)) - } + /** Returns the right child index of the given node index. */ + private def rightChildIndex(index: Long): Long = { + require(index <= MAX_DIVISIBLE_CLUSTER_INDEX, s"Child index out of bound: 2 * $index + 1.") + 2 * index + 1 } - /** - * Assigns the initial cluster index id to all data - */ - def initData(data: RDD[Vector]): RDD[(Long, BV[Double])] = { - data.map { v: Vector => (ROOT_INDEX_KEY, v.toBreeze)} + /** Returns the parent index of the given node index, or 0 if the input is 1 (root). */ + private def parentIndex(index: Long): Long = { + index / 2 } /** - * Gets the initial centers for bisecting k-means - * - * @param stats pairs of cluster index and cluster statistics - * @param seed random seed + * Summarizes data by each cluster as Map. 
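   * For example, a hypothetical assignment of the two points (0.0, 0.0) and
   * (2.0, 2.0) to cluster index 1 yields a map with a single entry whose summary
   * has size 2, center (1.0, 1.0) and cost 4.0, since each point lies at squared
   * distance 2.0 from the center. The points and numbers are illustrative only.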
+ * @param d feature dimension + * @param assignments pairs of point and its cluster index + * @return a map from cluster indices to corresponding cluster summaries */ - def initNextCenters( - stats: collection.Map[Long, BisectingClusterStat], - seed: Long - ): collection.Map[Long, BV[Double]] = { - - val random = new XORShiftRandom() - random.setSeed(seed) - val nextCenters = stats.flatMap { case (idx, clusterStats) => - val center = clusterStats.mean - val stdev = math.sqrt(clusterStats.sumOfSquares) / clusterStats.rows - val activeKeys = clusterStats.mean.activeKeysIterator.toArray - val activeValues = activeKeys.map(i => random.nextDouble() * stdev) - val perturbation = new BSV[Double](activeKeys, activeValues, clusterStats.mean.size) - Array((2 * idx, center - perturbation), (2 * idx + 1, center + perturbation)) - }.toMap - nextCenters + private def summarize( + d: Int, + assignments: RDD[(Long, VectorWithNorm)]): Map[Long, ClusterSummary] = { + assignments.aggregateByKey(new ClusterSummaryAggregator(d))( + seqOp = (agg, v) => agg.add(v), + combOp = (agg1, agg2) => agg1.merge(agg2) + ).mapValues(_.summary) + .collect().toMap } /** - * Divides clusters according to their statistics - * - * @param data pairs of point and its cluster index - * @param clusterStats target clusters to divide - * @param maxIterations the maximum iterations to calculate clusters statistics - * @param seed random seed + * Cluster summary aggregator. + * @param d feature dimension */ - def divideClusters( - data: RDD[(Long, BV[Double])], - clusterStats: collection.Map[Long, BisectingClusterStat], - maxIterations: Int, - seed: Long - ): RDD[(Long, BV[Double])] = { - - val sc = data.sparkContext - val appName = sc.appName - - // get keys of dividable clusters - val dividableClusterStats = clusterStats.filter { case (idx, cluster) => cluster.isDividable } - if (dividableClusterStats.isEmpty) { - return data + private class ClusterSummaryAggregator(val d: Int) extends Serializable { + private var n: Long = 0L + private val sum: Vector = Vectors.zeros(d) + private var sumSq: Double = 0.0 + + /** Adds a point. 
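+     * Only the running count, the coordinate-wise sum and the sum of squared norms are
+     * updated; the cost reported by `summary` then follows from the identity
+     * sum(|x - mu|^2) = sum(|x|^2) - n * |mu|^2, where mu = sum(x) / n is the center.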
*/ + def add(v: VectorWithNorm): this.type = { + n += 1L + // TODO: use a numerically stable approach to estimate cost + sumSq += v.norm * v.norm + BLAS.axpy(1.0, v.vector, sum) + this } - // extract dividable input data - val dividableData = data.filter { case (idx, point) => dividableClusterStats.contains(idx)} - // get next initial centers - var newCenters = initNextCenters(dividableClusterStats, seed) - var nextData = data - var subIter = 0 - var totalSumOfSquares = Double.MaxValue - var oldTotalSumOfSquares = Double.MaxValue - var relativeError = Double.MaxValue - val dimension = dividableData.first()._2.size - - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val bcMetric = sc.broadcast(metric) - - while (subIter < maxIterations && relativeError > 1e-4) { - // TODO add a set method for the threshold, instead of 1e-4 - - // convert each index into the closest child index - val bcNewCenters = sc.broadcast(newCenters) - nextData = dividableData.map { case (idx, point) => - // calculate next index number - val childIndexes = Array(2 * idx, 2 * idx + 1) - val childrenCenters = childIndexes - .filter(x => bcNewCenters.value.contains(x)).map(bcNewCenters.value(_)) - if (childrenCenters.length != 2) { - new SparkException(s"A node whose index is ${idx} doesn't have two children") - } - val closestIndex = findClosestCenter(bcMetric.value)(childrenCenters)(point) - val nextIndex = 2 * idx + closestIndex - (nextIndex, point) - } + /** Merges another aggregator. */ + def merge(other: ClusterSummaryAggregator): this.type = { + n += other.n + sumSq += other.sumSq + BLAS.axpy(1.0, other.sum, sum) + this + } - // summarize each cluster - val zeroValue = (BV.zeros[Double](dimension), 0L, 0.0) - val seqOp = (acc: (BV[Double], Long, Double), point: BV[Double]) => { - val sums = acc._1 + point - val n = acc._2 + 1L - val sumOfNorm = acc._3 + (point dot point) - (sums, n, sumOfNorm) + /** Returns the summary. */ + def summary: ClusterSummary = { + val mean = sum.copy + if (n > 0L) { + BLAS.scal(1.0 / n, mean) } - val comOp = (acc1: (BV[Double], Long, Double), acc2: (BV[Double], Long, Double)) => - (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc1._3) - val tempStats = nextData.aggregateByKey(zeroValue)(seqOp, comOp).collectAsMap() - - // calculate the center of each cluster - newCenters = tempStats.map {case (idx, (sums, n, sumOfNorm)) => (idx, sums :/ n.toDouble)} - - totalSumOfSquares = tempStats.map{case (idx, (sums, n, sumOfNorm)) => sumOfNorm}.sum - relativeError = math.abs(totalSumOfSquares - oldTotalSumOfSquares) / totalSumOfSquares - oldTotalSumOfSquares = totalSumOfSquares - subIter += 1 + val center = new VectorWithNorm(mean) + val cost = math.max(sumSq - n * center.norm * center.norm, 0.0) + new ClusterSummary(n, center, cost) } - nextData } /** - * Creates the map of cluster stats to the map of cluster nodes with their costs + * Bisects a cluster center. 
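+   * The two seed centers are the parent center shifted by a small perturbation in opposite
+   * directions. A sketch with a hypothetical center and noise vector:
+   * {{{
+   *   // center = [2.0, 0.0], norm = 2.0, level = 1e-4 * 2.0
+   *   // noise  = [0.5, 0.5]            (entries drawn uniformly from [0, 1))
+   *   // left   = [2.0 - 1e-4, -1e-4]
+   *   // right  = [2.0 + 1e-4, +1e-4]
+   * }}}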
* - * @param data input data - * @param stats map of cluster stats which is described as a binary tree + * @param center current cluster center + * @param random a random number generator + * @return initial centers */ - def createClusterNodes( - data: RDD[(Long, BV[Double])], - stats: collection.Map[Long, BisectingClusterStat] - ): collection.Map[Long, BisectingClusterNode] = { - // TODO: support other cost, such as entropy - createClusterNodesWithAverageCost(data, stats) + private def splitCenter( + center: VectorWithNorm, + random: Random): (VectorWithNorm, VectorWithNorm) = { + val d = center.vector.size + val norm = center.norm + val level = 1e-4 * norm + val noise = Vectors.dense(Array.fill(d)(random.nextDouble())) + val left = center.vector.copy + BLAS.axpy(-level, noise, left) + val right = center.vector.copy + BLAS.axpy(level, noise, right) + (new VectorWithNorm(left), new VectorWithNorm(right)) } /** - * Creates the map of cluster stats to the map of cluster nodes with their average costs + * Updates assignments. + * @param assignments current assignments + * @param divisibleIndices divisible cluster indices + * @param newClusterCenters new cluster centers + * @return new assignments */ - private def createClusterNodesWithAverageCost( - data: RDD[(Long, BV[Double])], - stats: collection.Map[Long, BisectingClusterStat] - ): collection.Map[Long, BisectingClusterNode] = { - - stats.map { case (idx, clusterStats) => - val rows = clusterStats.rows - val center = clusterStats.mean - val cost = math.sqrt(clusterStats.sumOfSquares) / rows - idx -> new BisectingClusterNode(Vectors.fromBreeze(center), rows, cost) + private def updateAssignments( + assignments: RDD[(Long, VectorWithNorm)], + divisibleIndices: Set[Long], + newClusterCenters: Map[Long, VectorWithNorm]): RDD[(Long, VectorWithNorm)] = { + assignments.map { case (index, v) => + if (divisibleIndices.contains(index)) { + val children = Seq(leftChildIndex(index), rightChildIndex(index)) + val selected = children.minBy { child => + KMeans.fastSquaredDistance(newClusterCenters(child), v) + } + (selected, v) + } else { + (index, v) + } } } /** - * Builds a cluster tree from a Map of clusters - * - * @param treeMap divided clusters as a Map class - * @param rootIndex index you want to start - * @param k the number of clusters you want - * @return a built cluster tree + * Builds a clustering tree by re-indexing internal and leaf clusters. 
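+   * The raw divisible-cluster indices (root = 1, children of node i at 2i and 2i + 1) are
+   * replaced while building: leaf nodes are numbered 0, 1, 2, ... from left to right, so
+   * predictions fall in [0, number of leaf clusters), and internal nodes are numbered
+   * -1, -2, ... in depth-first order.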
+ * @param clusters a map from cluster indices to corresponding cluster summaries + * @return the root node of the clustering tree */ - private def buildTree( - treeMap: collection.Map[Long, BisectingClusterNode], - rootIndex: Long, - k: Int): Option[BisectingClusterNode] = { - - // if there is no index in the Map - if (!treeMap.contains(rootIndex)) return None - - // build a cluster tree if the queue is empty or until the number of leaf clusters is enough - var numLeavesClusters = 1 - val root = treeMap(rootIndex) - var leavesQueue = Map(rootIndex -> root) - while (leavesQueue.nonEmpty && numLeavesClusters < k) { - // pick up the largest cluster by the maximum cost of all the clusters - val mostScattered = leavesQueue.maxBy(_._2.cost) - val mostScatteredKey = mostScattered._1 - val mostScatteredCluster = mostScattered._2 - - // relate the most scattered cluster to its children clusters - val childrenIndexes = Array(2 * mostScatteredKey, 2 * mostScatteredKey + 1) - if (childrenIndexes.forall(i => treeMap.contains(i))) { - // insert children to the most scattered cluster - val children = childrenIndexes.map(i => treeMap(i)) - mostScatteredCluster.insert(children) - - // calculate the local dendrogram height - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val localHeight = children - .map(child => metric(child.center.toBreeze, mostScatteredCluster.center.toBreeze)).max - mostScatteredCluster.setLocalHeight(localHeight) - - // update the queue - leavesQueue = leavesQueue ++ childrenIndexes.map(i => i -> treeMap(i)).toMap - numLeavesClusters += 1 + private def buildTree(clusters: Map[Long, ClusterSummary]): ClusteringTreeNode = { + var leafIndex = 0 + var internalIndex = -1 + + /** + * Builds a subtree from this given node index. + */ + def buildSubTree(rawIndex: Long): ClusteringTreeNode = { + val cluster = clusters(rawIndex) + val size = cluster.size + val center = cluster.center + val cost = cluster.cost + val isInternal = clusters.contains(leftChildIndex(rawIndex)) + if (isInternal) { + val index = internalIndex + internalIndex -= 1 + val leftIndex = leftChildIndex(rawIndex) + val rightIndex = rightChildIndex(rawIndex) + val height = math.sqrt(Seq(leftIndex, rightIndex).map { childIndex => + KMeans.fastSquaredDistance(center, clusters(childIndex).center) + }.max) + val left = buildSubTree(leftIndex) + val right = buildSubTree(rightIndex) + new ClusteringTreeNode(index, size, center, cost, height, Array(left, right)) + } else { + val index = leafIndex + leafIndex += 1 + val height = 0.0 + new ClusteringTreeNode(index, size, center, cost, height, Array.empty) } - - // remove the cluster which is involved to the cluster tree - leavesQueue = leavesQueue.filterNot(_ == mostScattered) } - Some(root) + + buildSubTree(ROOT_INDEX) } + + /** + * Summary of a cluster. + * + * @param size the number of points within this cluster + * @param center the center of the points within this cluster + * @param cost the sum of squared distances to the center + */ + private case class ClusterSummary(size: Long, center: VectorWithNorm, cost: Double) } /** - * A cluster as a tree node which can have its sub nodes + * Represents a node in a clustering tree. 
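+ * For example, one possible tree with three leaf clusters and the node indices assigned by
+ * [[BisectingKMeans]]:
+ * {{{
+ *        -1 (root)
+ *       /         \
+ *     0 (leaf)     -2
+ *                 /   \
+ *           1 (leaf)  2 (leaf)
+ * }}}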
* - * @param center the center of the cluster - * @param rows the number of rows in the cluster - * @param cost how large a cluster is - * @param localHeight the maximal distance between this node and its children - * @param parent the parent cluster of the cluster - * @param children the children nodes of the cluster + * @param index node index, negative for internal nodes and non-negative for leaf nodes + * @param size size of the cluster + * @param centerWithNorm cluster center with norm + * @param cost cost of the cluster, i.e., the sum of squared distances to the center + * @param height height of the node in the dendrogram. Currently this is defined as the max distance + * from the center to the centers of the children's, but subject to change. + * @param children children nodes */ @Since("1.6.0") -class BisectingClusterNode private ( - @Since("1.6.0") val center: Vector, - @Since("1.6.0") val rows: Long, - @Since("1.6.0") val cost: Double, - private var localHeight: Double, - private var parent: Option[BisectingClusterNode], - private var children: Seq[BisectingClusterNode]) extends Serializable { - - require(!cost.isNaN) - - @Since("1.6.0") - def this(center: Vector, rows: Long, cost: Double) = - this(center, rows, cost, 0.0, None, Array.empty[BisectingClusterNode]) - - /** - * Inserts a sub node as its child - * - * @param child inserted sub node - */ - @Since("1.6.0") - def insert(child: BisectingClusterNode) { - insert(Array(child)) +@Experimental +class ClusteringTreeNode private[clustering] ( + val index: Int, + val size: Long, + private val centerWithNorm: VectorWithNorm, + val cost: Double, + val height: Double, + val children: Array[ClusteringTreeNode]) extends Serializable { + + /** Whether this is a leaf node. */ + val isLeaf: Boolean = children.isEmpty + + require((isLeaf && index >= 0) || (!isLeaf && index < 0)) + + /** Cluster center. */ + def center: Vector = centerWithNorm.vector + + /** Predicts the leaf cluster node index that the input point belongs to. */ + def predict(point: Vector): Int = { + val (index, _) = predict(new VectorWithNorm(point)) + index } - /** - * Inserts sub nodes as its children - * - * @param children inserted sub nodes - */ - @Since("1.6.0") - def insert(children: Array[BisectingClusterNode]) { - this.children = this.children ++ children - children.foreach(child => child.parent = Some(this)) + /** Returns the full prediction path from root to leaf. */ + def predictPath(point: Vector): Array[ClusteringTreeNode] = { + predictPath(new VectorWithNorm(point)).toArray } - /** - * Converts the tree into Array class - * the sub nodes are recursively expanded - * - * @return an Array class which the cluster tree is expanded - */ - @Since("1.6.0") - def toArray: Array[BisectingClusterNode] = { - val array = this.children.size match { - case 0 => Array(this) - case _ => Array(this) ++ this.children.flatMap(child => child.toArray.toIterator) - } - array.sortWith { case (a, b) => - a.getDepth < b.getDepth && a.cost < b.cost && a.rows < b.rows + /** Returns the full prediction path from root to leaf. 
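+   * Note that when called on an internal node the returned path begins at the closest
+   * child, so the node itself is not included; a leaf returns just itself.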
*/ + private def predictPath(pointWithNorm: VectorWithNorm): List[ClusteringTreeNode] = { + if (isLeaf) { + this :: Nil + } else { + val selected = children.minBy { child => + KMeans.fastSquaredDistance(child.centerWithNorm, pointWithNorm) + } + selected :: selected.predictPath(pointWithNorm) } } /** - * Gets the depth of the cluster in the tree - * - * @return the depth from the root + * Computes the cost (squared distance to the predicted leaf cluster center) of the input point. */ - @Since("1.6.0") - def getDepth: Int = { - this.parent match { - case None => 0 - case _ => 1 + this.parent.get.getDepth - } + def computeCost(point: Vector): Double = { + val (_, cost) = predict(new VectorWithNorm(point)) + cost } /** - * Finds a leaf which is the closest under the node - * - * @param point target point + * Predicts the cluster index and the cost of the input point. */ - @Since("1.6.0") - def findClosestLeaf( - point: Vector, - metric: (BV[Double], BV[Double]) => Double - ): BisectingClusterNode = { - this.children.size match { - case 0 => this - case _ => { - val bv = point.toBreeze - val centers = this.children.map(_.center).map(_.toBreeze) - val closestIndex = BisectingKMeans.findClosestCenter(metric)(centers)(bv) - this.children(closestIndex).findClosestLeaf(point, metric) - } - } + private def predict(pointWithNorm: VectorWithNorm): (Int, Double) = { + predict(pointWithNorm, KMeans.fastSquaredDistance(centerWithNorm, pointWithNorm)) } /** - * Gets the leaves nodes in the cluster tree + * Predicts the cluster index and the cost of the input point. + * @param pointWithNorm input point + * @param cost the cost to the current center + * @return (predicted leaf cluster index, cost) */ - @Since("1.6.0") - def getLeavesNodes: Array[BisectingClusterNode] = { - this.toArray.filter(_.isLeaf).sortBy(_.center.toArray.sum) + private def predict(pointWithNorm: VectorWithNorm, cost: Double): (Int, Double) = { + if (isLeaf) { + (index, cost) + } else { + val (selectedChild, minCost) = children.map { child => + (child, KMeans.fastSquaredDistance(child.centerWithNorm, pointWithNorm)) + }.minBy(_._2) + selectedChild.predict(pointWithNorm, minCost) + } } - @Since("1.6.0") - def isLeaf: Boolean = this.children.isEmpty - - @Since("1.6.0") - def getParent: Option[BisectingClusterNode] = this.parent - - @Since("1.6.0") - def getChildren: Seq[BisectingClusterNode] = this.children - /** - * Gets the dendrogram height of the cluster at the cluster tree. - * A dendrogram height is different from a local height. - * A dendrogram height means a total height of a node in a tree. - * A local height means a maximum distance between a node and its children. - * - * @return the dendrogram height + * Returns all leaf nodes from this node. 
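+   * Leaves are returned in left-to-right order, which matches how leaf indices are
+   * assigned when the tree is built, so from the root the i-th returned leaf is the node
+   * whose index (and prediction label) is i.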
*/ - @Since("1.6.0") - def getHeight: Double = { - this.children.size match { - case 0 => 0.0 - case _ => this.localHeight + this.children.map(_.getHeight).max + def leafNodes: Array[ClusteringTreeNode] = { + if (isLeaf) { + Array(this) + } else { + children.flatMap(_.leafNodes) } } - - @Since("1.6.0") - def setLocalHeight(height: Double): Unit = this.localHeight = height -} - - -/** - * This class is used for maneging a cluster statistics - * - * @param rows the number of points - * @param mean the sum of points - * @param sumOfSquares the sum of squares of points - */ -private[clustering] case class BisectingClusterStat ( - rows: Long, - mean: BV[Double], - sumOfSquares: Double) extends Serializable { - require(sumOfSquares >= 0.0) - - def isDividable: Boolean = sumOfSquares > 0 && rows >= 2 } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 38f4695eb0d26..5015f1540d920 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -17,80 +17,79 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{Vector => BV, norm => breezeNorm} - import org.apache.spark.Logging -import org.apache.spark.annotation.Since +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD /** - * This class is used for the model of the bisecting kmeans + * Clustering model produced by [[BisectingKMeans]]. + * The prediction is done level-by-level from the root node to a leaf node, and at each node among + * its children the closest to the input point is selected. * - * @param node a cluster as a tree node + * @param root the root node of the clustering tree */ @Since("1.6.0") +@Experimental class BisectingKMeansModel @Since("1.6.0") ( - @Since("1.6.0") val node: BisectingClusterNode + @Since("1.6.0") val root: ClusteringTreeNode ) extends Serializable with Logging { + /** + * Leaf cluster centers. + */ @Since("1.6.0") - def getClusters: Array[BisectingClusterNode] = this.node.getLeavesNodes + def clusterCenters: Array[Vector] = root.leafNodes.map(_.center) - @Since("1.6.0") - def getCenters: Array[Vector] = this.getClusters.map(_.center) + /** + * Number of leaf clusters. + */ + lazy val k: Int = clusterCenters.length /** - * Predicts the closest cluster by one point + * Predicts the index of the cluster that the input point belongs to. */ @Since("1.6.0") - def predict(vector: Vector): Int = { - // TODO Supports distance metrics other Euclidean distance metric - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val closestLeafNode = this.node.findClosestLeaf(vector, metric) - - val closestCenter = closestLeafNode.center - val centers = this.getCenters.map(_.toBreeze) - BisectingKMeans.findClosestCenter(metric)(centers)(closestCenter.toBreeze) + def predict(point: Vector): Int = { + root.predict(point) } /** - * Predicts the closest cluster by RDD of the points + * Predicts the indices of the clusters that the input points belong to. 
*/ @Since("1.6.0") - def predict(data: RDD[Vector]): RDD[Int] = { - val sc = data.sparkContext - data.map { p => predict(p) } + def predict(points: RDD[Vector]): RDD[Int] = { + points.map { p => root.predict(p) } } /** - * Predicts the closest cluster by RDD of the points for Java + * Java-friendly version of [[predict(RDD[Vector])*]] */ @Since("1.6.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] /** - * Computes Within Set Sum of Squared Error(WSSSE) + * Computes the squared distance between the input point and the cluster center it belongs to. + */ + @Since("1.6.0") + def computeCost(point: Vector): Double = { + root.computeCost(point) + } + + /** + * Computes the sum of squared distances between the input points and their corresponding cluster + * centers. */ @Since("1.6.0") def computeCost(data: RDD[Vector]): Double = { - val bvCenters = this.getCenters.map(_.toBreeze) - data.context.broadcast(bvCenters) - val distances = data.map {point => - val bvPoint = point.toBreeze - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val idx = BisectingKMeans.findClosestCenter(metric)(bvCenters)(bvPoint) - val closestCenter = bvCenters(idx) - val distance = metric(bvPoint, closestCenter) - distance - } - distances.sum() + data.map(root.computeCost).sum() } + /** + * Java-friendly version of [[computeCost(RDD[Vector])*]]. + */ @Since("1.6.0") def computeCost(data: JavaRDD[Vector]): Double = this.computeCost(data.rdd) - } - diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java index 926bd54e54424..a714620ff7e4b 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java @@ -18,13 +18,12 @@ package org.apache.spark.mllib.clustering; import java.io.Serializable; -import java.util.List; +import com.google.common.collect.Lists; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; -import static org.junit.Assert.assertEquals; -import com.google.common.collect.Lists; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -46,63 +45,29 @@ public void tearDown() { } @Test - public void runWithSmallData() { - List points = Lists.newArrayList( - Vectors.dense(1.0, 2.0, 6.0), - Vectors.dense(1.0, 3.0, 0.0), - Vectors.dense(1.0, 4.0, 6.0) - ); - - Vector expectedCenter = Vectors.dense(1.0, 3.0, 4.0); - - JavaRDD data = sc.parallelize(points, 2); - BisectingKMeans algo = new BisectingKMeans().setK(1); - BisectingKMeansModel model = algo.run(data.rdd()); - assertEquals(1, model.getCenters().length); - assertEquals(expectedCenter, model.getCenters()[0]); - } - - @Test - public void runWithDenseVectors() { - int numClusters = 5; - List points = Lists.newArrayList(); - for (int i = 0; i < 99; i++) { - Double elm = (double)(i % numClusters); - Vector point = Vectors.dense(elm, elm); - points.add(point); - } - JavaRDD data = sc.parallelize(points, 2); - BisectingKMeans algo = new BisectingKMeans().setK(numClusters); - BisectingKMeansModel model = algo.run(data.rdd()); - Vector[] centers = model.getCenters(); - assertEquals(numClusters, centers.length); - assertEquals(Vectors.dense(0.0, 0.0), centers[0]); - assertEquals(Vectors.dense(1.0, 1.0), centers[1]); - 
assertEquals(Vectors.dense(2.0, 2.0), centers[2]); - assertEquals(Vectors.dense(3.0, 3.0), centers[3]); - assertEquals(Vectors.dense(4.0, 4.0), centers[4]); - } + public void twoDimensionalData() { + JavaRDD points = sc.parallelize(Lists.newArrayList( + Vectors.dense(4, -1), + Vectors.dense(4, 1), + Vectors.sparse(2, new int[] {0}, new double[] {1.0}) + ), 2); - @Test - public void runWithSparseVectors() { - int numClusters = 5; - List points = Lists.newArrayList(); - for (int i = 0; i < 99; i++) { - int elm = i % numClusters; - int indexes[] = {elm}; - double values[] = {elm}; - Vector point = Vectors.sparse(numClusters, indexes, values); - points.add(point); + BisectingKMeans bkm = new BisectingKMeans() + .setK(4) + .setMaxIterations(2) + .setSeed(1L); + BisectingKMeansModel model = bkm.run(points); + Assert.assertEquals(3, model.k()); + Assert.assertArrayEquals(new double[] {3.0, 0.0}, model.root().center().toArray(), 1e-12); + for (ClusteringTreeNode child: model.root().children()) { + double[] center = child.center().toArray(); + if (center[0] > 2) { + Assert.assertEquals(2, child.size()); + Assert.assertArrayEquals(new double[] {4.0, 0.0}, center, 1e-12); + } else { + Assert.assertEquals(1, child.size()); + Assert.assertArrayEquals(new double[] {1.0, 0.0}, center, 1e-12); + } } - JavaRDD data = sc.parallelize(points, 2); - BisectingKMeans algo = new BisectingKMeans().setK(numClusters); - BisectingKMeansModel model = algo.run(data.rdd()); - Vector[] centers = model.getCenters(); - assertEquals(numClusters, centers.length); - assertEquals(points.get(0), centers[0]); - assertEquals(points.get(1), centers[1]); - assertEquals(points.get(2), centers[2]); - assertEquals(points.get(3), centers[3]); - assertEquals(points.get(4), centers[4]); } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala deleted file mode 100644 index ceac039efc8d0..0000000000000 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansModelSuite.scala +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.mllib.clustering - -import org.scalatest.BeforeAndAfterEach - -import org.apache.spark.SparkFunSuite -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.util.MLlibTestSparkContext - -class BisectingKMeansModelSuite - extends SparkFunSuite with MLlibTestSparkContext with BeforeAndAfterEach { - - test("clustering dense vectors") { - val app = new BisectingKMeans().setK(5).setSeed(1) - - val localData = (1 to 100).toSeq.map { i => - val label = i % 5 - val vector = Vectors.dense(label, label, label) - (label, vector) - } - val data = sc.parallelize(localData.map(_._2)) - val model = app.run(data) - - val clusters = model.getClusters - assert(clusters.isInstanceOf[Array[BisectingClusterNode]]) - assert(clusters.length === 5) - - val centers = model.getCenters.sortBy(_.toArray.sum) - assert(centers.length === 5) - assert(centers(0) === Vectors.dense(0.0, 0.0, 0.0)) - assert(centers(1) === Vectors.dense(1.0, 1.0, 1.0)) - assert(centers(2) === Vectors.dense(2.0, 2.0, 2.0)) - assert(centers(3) === Vectors.dense(3.0, 3.0, 3.0)) - assert(centers(4) === Vectors.dense(4.0, 4.0, 4.0)) - - // predict with one vector - assert(model.predict(Vectors.dense(0.0, 0.0, 0.0)) === 0) - assert(model.predict(Vectors.dense(0.5, 0.5, 0.5)) === 0) - assert(model.predict(Vectors.dense(1.0, 1.0, 1.0)) === 1) - assert(model.predict(Vectors.dense(2.0, 2.0, 2.0)) === 2) - assert(model.predict(Vectors.dense(3.0, 3.0, 3.0)) === 3) - assert(model.predict(Vectors.dense(4.0, 4.0, 4.0)) === 4) - - // predict with a RDD - val predicted = model.predict(data).collect() - assert(predicted === localData.map(_._1)) - - // compute WSSSE - assert(model.computeCost(data) === 0.0) - } - - test("clustering sparse vectors") { - val app = new BisectingKMeans().setK(5).setSeed(1) - - val localData = (1 to 100).toSeq.map { i => - val label = i % 5 - val vector = Vectors.sparse(5, Seq((label, label.toDouble))) - (label, vector) - } - val data = sc.parallelize(localData.map(_._2)) - val model = app.run(data) - - val clusters = model.getClusters - assert(clusters.isInstanceOf[Array[BisectingClusterNode]]) - assert(clusters.length === 5) - - val centers = model.getCenters.sortBy(_.toArray.sum) - assert(centers.length === 5) - assert(centers(0) === Vectors.sparse(5, Array(), Array())) - assert(centers(1) === Vectors.sparse(5, Array(1), Array(1.0))) - assert(centers(2) === Vectors.sparse(5, Array(2), Array(2.0))) - assert(centers(3) === Vectors.sparse(5, Array(3), Array(3.0))) - assert(centers(4) === Vectors.sparse(5, Array(4), Array(4.0))) - - // predict with one vector - assert(model.predict(Vectors.sparse(5, Array(0), Array(0.0))) === 0) - assert(model.predict(Vectors.sparse(5, Array(1), Array(1.0))) === 1) - assert(model.predict(Vectors.sparse(5, Array(2), Array(2.0))) === 2) - assert(model.predict(Vectors.sparse(5, Array(3), Array(3.0))) === 3) - assert(model.predict(Vectors.sparse(5, Array(4), Array(4.0))) === 4) - - // predict with a RDD - val predicted = model.predict(data).collect() - assert(predicted === localData.map(_._1)) - - // compute WSSSE - assert(model.computeCost(data) === 0.0) - } - - test("clustering should be done correctly") { - for (k <- Array(9, 19)) { - val app = new BisectingKMeans().setK(k).setSeed(1) - val localData = (1 to 19).toSeq.map { i => - val label = i % k - val sparseVector = Vectors.sparse(k, Seq((label, label.toDouble))) - val denseVector = Vectors.fromBreeze(sparseVector.toBreeze.toDenseVector) - (label, denseVector, sparseVector) - } - - // dense 
version - val denseData = sc.parallelize(localData.map(_._2), 2) - val denseModel = app.run(denseData) - assert(denseModel.getCenters.length === k) - assert(denseModel.getClusters.forall(_.cost == 0.0)) - - // sparse version - val sparseData = sc.parallelize(localData.map(_._3), 2) - val sparseModel = app.run(sparseData) - assert(sparseModel.getCenters.length === k) - assert(sparseModel.getClusters.forall(_.cost == 0.0)) - } - } -} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala index 74e12d00c2022..41b9d5c0d93bb 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/BisectingKMeansSuite.scala @@ -17,184 +17,166 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{Vector => BV, norm => breezeNorm} - import org.apache.spark.SparkFunSuite -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ - class BisectingKMeansSuite extends SparkFunSuite with MLlibTestSparkContext { - test("run") { - val k = 123 - val algo = new BisectingKMeans().setK(k).setSeed(1) - val localSeed: Seq[Vector] = (0 to 999).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val data = sc.parallelize(localSeed, 2) - val model = algo.run(data) - assert(model.getClusters.length == 123) - assert(model.node.getHeight ~== 702.8641 absTol 10E-4) - - // check the relations between a parent cluster and its children - assert(model.node.getParent === None) - assert(model.node.getChildren.head.getParent.get === model.node) - assert(model.node.getChildren.apply(1).getParent.get === model.node) - assert(model.getClusters.forall(_.getParent.isDefined)) - - val predicted = model.predict(data) - assert(predicted.distinct.count() === k) + test("default values") { + val bkm0 = new BisectingKMeans() + assert(bkm0.getK === 4) + assert(bkm0.getMaxIterations === 20) + assert(bkm0.getMinDivisibleClusterSize === 1.0) + val bkm1 = new BisectingKMeans() + assert(bkm0.getSeed === bkm1.getSeed, "The default seed should be constant.") } - test("run with too many cluster size than the records") { - val algo = new BisectingKMeans().setK(123).setSeed(1) - val localSeed: Seq[Vector] = (0 to 99).map(i => Vectors.dense(i.toDouble, i.toDouble)).toSeq - val data = sc.parallelize(localSeed) - val model = algo.run(data) - assert(model.getClusters.length == 100) - assert(model.node.getHeight ~== 72.12489 absTol 10E-4) + test("setter/getter") { + val bkm = new BisectingKMeans() + + val k = 10 + assert(bkm.getK !== k) + assert(bkm.setK(k).getK === k) + val maxIter = 100 + assert(bkm.getMaxIterations !== maxIter) + assert(bkm.setMaxIterations(maxIter).getMaxIterations === maxIter) + val minSize = 2.0 + assert(bkm.getMinDivisibleClusterSize !== minSize) + assert(bkm.setMinDivisibleClusterSize(minSize).getMinDivisibleClusterSize === minSize) + val seed = 10L + assert(bkm.getSeed !== seed) + assert(bkm.setSeed(seed).getSeed === seed) + + intercept[IllegalArgumentException] { + bkm.setK(0) + } + intercept[IllegalArgumentException] { + bkm.setMaxIterations(0) + } + intercept[IllegalArgumentException] { + bkm.setMinDivisibleClusterSize(0.0) + } } - test("setNumClusters") { - val algo = new BisectingKMeans() - assert(algo.getK == 2) - algo.setK(1000) - assert(algo.getK == 
1000) + test("1D data") { + val points = Vectors.sparse(1, Array.empty, Array.empty) +: + (1 until 8).map(i => Vectors.dense(i)) + val data = sc.parallelize(points, 2) + val bkm = new BisectingKMeans() + .setK(4) + .setMaxIterations(1) + .setSeed(1L) + // The clusters should be + // (0, 1, 2, 3, 4, 5, 6, 7) + // - (0, 1, 2, 3) + // - (0, 1) + // - (2, 3) + // - (4, 5, 6, 7) + // - (4, 5) + // - (6, 7) + val model = bkm.run(data) + assert(model.k === 4) + // The total cost should be 8 * 0.5 * 0.5 = 2.0. + assert(model.computeCost(data) ~== 2.0 relTol 1e-12) + val predictions = data.map(v => (v(0), model.predict(v))).collectAsMap() + Range(0, 8, 2).foreach { i => + assert(predictions(i) === predictions(i + 1), + s"$i and ${i + 1} should belong to the same cluster.") + } + val root = model.root + assert(root.center(0) ~== 3.5 relTol 1e-12) + assert(root.height ~== 2.0 relTol 1e-12) + assert(root.children.length === 2) + assert(root.children(0).height ~== 1.0 relTol 1e-12) + assert(root.children(1).height ~== 1.0 relTol 1e-12) } - test("setSubIterations") { - val algo = new BisectingKMeans() - assert(algo.getMaxIterations == 20) - algo.setMaxIterations(15) - assert(algo.getMaxIterations == 15) + test("points are the same") { + val data = sc.parallelize(Seq.fill(8)(Vectors.dense(1.0, 1.0)), 2) + val bkm = new BisectingKMeans() + .setK(2) + .setMaxIterations(1) + .setSeed(1L) + val model = bkm.run(data) + assert(model.k === 1) } - test("setSeed") { - val algo = new BisectingKMeans() - assert(algo.getSeed == 1) - algo.setSeed(987) - assert(algo.getSeed == 987) + test("more desired clusters than points") { + val data = sc.parallelize(Seq.tabulate(4)(i => Vectors.dense(i)), 2) + val bkm = new BisectingKMeans() + .setK(8) + .setMaxIterations(2) + .setSeed(1L) + val model = bkm.run(data) + assert(model.k === 4) } - test("summarize center stats") { - val algo = new BisectingKMeans - val local = Seq( - (4L, Vectors.dense(1.5, 1.5).toBreeze), - (4L, Vectors.dense(2.5, 2.5).toBreeze), - (5L, Vectors.dense(11.5, 11.5).toBreeze), - (5L, Vectors.dense(12.5, 12.5).toBreeze), - (6L, Vectors.dense(21.5, 21.5).toBreeze), - (6L, Vectors.dense(22.5, 22.5).toBreeze), - (7L, Vectors.dense(31.5, 31.5).toBreeze), - (7L, Vectors.dense(32.5, 32.5).toBreeze) - ) - val data = sc.parallelize(local) - - val clusterStats = BisectingKMeans.summarizeClusters(data) - assert(clusterStats.size === 4) - assert(clusterStats(4).mean === Vectors.dense(2.0, 2.0).toBreeze) - assert(clusterStats(4).sumOfSquares ~== 1.0 absTol 10e-4) - assert(clusterStats(4).rows === 2) - assert(clusterStats(5).mean === Vectors.dense(12.0, 12.0).toBreeze) - assert(clusterStats(5).sumOfSquares ~== 1.0 absTol 10e-4) - assert(clusterStats(5).rows === 2) - assert(clusterStats(6).mean === Vectors.dense(22.0, 22.0).toBreeze) - assert(clusterStats(6).sumOfSquares ~== 1.0 absTol 10e-4) - assert(clusterStats(6).rows === 2) - assert(clusterStats(7).mean === Vectors.dense(32.0, 32.0).toBreeze) - assert(clusterStats(7).sumOfSquares ~== 1.0 absTol 10e-4) - assert(clusterStats(7).rows === 2) + test("min divisible cluster") { + val data = sc.parallelize( + Seq.tabulate(16)(i => Vectors.dense(i)) ++ Seq.tabulate(4)(i => Vectors.dense(-100.0 - i)), + 2) + val bkm = new BisectingKMeans() + .setK(4) + .setMinDivisibleClusterSize(10) + .setMaxIterations(1) + .setSeed(1L) + val model = bkm.run(data) + assert(model.k === 3) + assert(model.predict(Vectors.dense(-100)) === model.predict(Vectors.dense(-97))) + assert(model.predict(Vectors.dense(7)) !== 
model.predict(Vectors.dense(8))) + + bkm.setMinDivisibleClusterSize(0.5) + val sameModel = bkm.run(data) + assert(sameModel.k === 3) } - test("initialize centers at next step") { - val local = Seq( - (2L, BV[Double](0.9, 0.9)), (2L, BV[Double](1.1, 1.1)), - (3L, BV[Double](1.9, 1.9)), (2L, BV[Double](2.1, 2.1)) - ) - val data = sc.parallelize(local) - val stats = Map[Long, BisectingClusterStat]( - 2L -> new BisectingClusterStat(2, BV[Double](1.0, 1.0) * 2.0, 0.0), - 3L -> new BisectingClusterStat(2, BV[Double](2.0, 2.0) * 2.0, 0.0) - ) - val initNextCenters = BisectingKMeans.initNextCenters(stats, 1) - assert(initNextCenters.size === 4) - assert(initNextCenters.keySet === Set(4, 5, 6, 7)) + test("larger clusters get selected first") { + val data = sc.parallelize( + Seq.tabulate(16)(i => Vectors.dense(i)) ++ Seq.tabulate(4)(i => Vectors.dense(-100.0 - i)), + 2) + val bkm = new BisectingKMeans() + .setK(3) + .setMaxIterations(1) + .setSeed(1L) + val model = bkm.run(data) + assert(model.k === 3) + assert(model.predict(Vectors.dense(-100)) === model.predict(Vectors.dense(-97))) + assert(model.predict(Vectors.dense(7)) !== model.predict(Vectors.dense(8))) } - test("should assign each data to new clusters") { - val seed = Seq( - (2L, Vectors.dense(0.0, 0.0)), (2L, Vectors.dense(1.0, 1.0)), - (2L, Vectors.dense(2.0, 2.0)), (2L, Vectors.dense(3.0, 3.0)), - (2L, Vectors.dense(4.0, 4.0)), (2L, Vectors.dense(5.0, 5.0)), - (3L, Vectors.dense(6.0, 6.0)), (3L, Vectors.dense(7.0, 7.0)), - (3L, Vectors.dense(8.0, 8.0)), (3L, Vectors.dense(9.0, 9.0)), - (3L, Vectors.dense(10.0, 10.0)), (3L, Vectors.dense(11.0, 11.0)) - ).map { case (idx, vector) => (idx, vector.toBreeze) } - val variance = breezeNorm(Vectors.dense(1.0, 1.0).toBreeze, 2.0) - val newClusterStats = Map( - 4L -> new BisectingClusterStat(3L, BV[Double](1.0, 1.0) :* 3.0, variance), - 5L -> new BisectingClusterStat(3L, BV[Double](4.0, 4.0) :* 3.0, variance), - 6L -> new BisectingClusterStat(3L, BV[Double](7.0, 7.0) :* 3.0, variance), - 7L -> new BisectingClusterStat(3L, BV[Double](10.0, 10.0) :* 3.0, variance) - ) - val data = sc.parallelize(seed, 1) - val leafClusterStats = BisectingKMeans.summarizeClusters(data) - val dividableLeafClusters = leafClusterStats.filter(_._2.isDividable) - val result = BisectingKMeans.divideClusters(data, dividableLeafClusters, 20, 1).collect() - - val expected = Seq( - (4, Vectors.dense(0.0, 0.0)), (4, Vectors.dense(1.0, 1.0)), (4, Vectors.dense(2.0, 2.0)), - (5, Vectors.dense(3.0, 3.0)), (5, Vectors.dense(4.0, 4.0)), (5, Vectors.dense(5.0, 5.0)), - (6, Vectors.dense(6.0, 6.0)), (6, Vectors.dense(7.0, 7.0)), (6, Vectors.dense(8.0, 8.0)), - (7, Vectors.dense(9.0, 9.0)), (7, Vectors.dense(10.0, 10.0)), (7, Vectors.dense(11.0, 11.0)) - ).map { case (idx, vector) => (idx, vector.toBreeze) } - assert(result === expected) - } - - test("findClosestCenter") { - val metric = (bv1: BV[Double], bv2: BV[Double]) => breezeNorm(bv1 - bv2, 2.0) - val centers = Seq( - Vectors.sparse(5, Array(0, 1, 2), Array(0.0, 1.0, 2.0)).toBreeze, - Vectors.sparse(5, Array(1, 2, 3), Array(1.0, 2.0, 3.0)).toBreeze, - Vectors.sparse(5, Array(2, 3, 4), Array(2.0, 3.0, 4.0)).toBreeze - ) - - for (i <- 0 to (centers.size - 1)) { - val point = centers(i) - val closestIndex = BisectingKMeans.findClosestCenter(metric)(centers)(point) - assert(closestIndex === i) + test("2D data") { + val points = Seq( + (11, 10), (9, 10), (10, 9), (10, 11), + (11, -10), (9, -10), (10, -9), (10, -11), + (0, 1), (0, -1) + ).map { case (x, y) => + if (x == 0) { + 
Vectors.sparse(2, Array(1), Array(y)) + } else { + Vectors.dense(x, y) + } } - } - - test("should be equal to math.pow") { - (1 to 1000).foreach { k => - // the minimum number of nodes of a binary tree by given parameter - val multiplier = math.ceil(math.log(k) / math.log(2.0)) + 1 - val expected = math.pow(2, multiplier).toInt - val result = BisectingKMeans.getMinimumNumNodesInTree(k) - assert(result === expected) + val data = sc.parallelize(points, 2) + val bkm = new BisectingKMeans() + .setK(3) + .setMaxIterations(4) + .setSeed(1L) + val model = bkm.run(data) + assert(model.k === 3) + assert(model.root.center ~== Vectors.dense(8, 0) relTol 1e-12) + model.root.leafNodes.foreach { node => + if (node.center(0) < 5) { + assert(node.size === 2) + assert(node.center ~== Vectors.dense(0, 0) relTol 1e-12) + } else if (node.center(1) > 0) { + assert(node.size === 4) + assert(node.center ~== Vectors.dense(10, 10) relTol 1e-12) + } else { + assert(node.size === 4) + assert(node.center ~== Vectors.dense(10, -10) relTol 1e-12) + } } } - - test("should divide clusters correctly") { - val local = Seq( - (2L, BV[Double](0.9, 0.9)), (2L, BV[Double](1.1, 1.1)), - (2L, BV[Double](9.9, 9.9)), (2L, BV[Double](10.1, 10.1)), - (3L, BV[Double](99.9, 99.9)), (3L, BV[Double](100.1, 100.1)), - (3L, BV[Double](109.9, 109.9)), (3L, BV[Double](110.1, 110.1)) - ) - val data = sc.parallelize(local, 1) - val stats = BisectingKMeans.summarizeClusters(data) - val dividedData = BisectingKMeans.divideClusters(data, stats, 20, 1).collect() - - assert(dividedData(0) == (4L, BV[Double](0.9, 0.9))) - assert(dividedData(1) == (4L, BV[Double](1.1, 1.1))) - assert(dividedData(2) == (5L, BV[Double](9.9, 9.9))) - assert(dividedData(3) == (5L, BV[Double](10.1, 10.1))) - assert(dividedData(4) == (6L, BV[Double](99.9, 99.9))) - assert(dividedData(5) == (6L, BV[Double](100.1, 100.1))) - assert(dividedData(6) == (7L, BV[Double](109.9, 109.9))) - assert(dividedData(7) == (7L, BV[Double](110.1, 110.1))) - } - } From 29ccdf9eaa987530435782d2051acbeda3d3ac36 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 9 Nov 2015 11:37:19 -0800 Subject: [PATCH 76/76] Remove a magic number 63 for level limitation --- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 9a7916f75dcc7..29a7aa0bb63f2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -165,7 +165,7 @@ class BisectingKMeans private ( val random = new Random(seed) var numLeafClustersNeeded = k - 1 var level = 1 - while (activeClusters.nonEmpty && numLeafClustersNeeded > 0 && level < 63) { + while (activeClusters.nonEmpty && numLeafClustersNeeded > 0 && level < LEVEL_LIMIT) { // Divisible clusters are sufficiently large and have non-trivial cost. var divisibleClusters = activeClusters.filter { case (_, summary) => (summary.size >= minSize) && (summary.cost > MLUtils.EPSILON * summary.size) @@ -226,6 +226,8 @@ private object BisectingKMeans extends Serializable { private val MAX_DIVISIBLE_CLUSTER_INDEX: Long = Long.MaxValue / 2 + private val LEVEL_LIMIT = math.log10(Long.MaxValue) / math.log10(2) + /** Returns the left child index of the given node index. 
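+   * A node at level L has an index below 2^L, so capping the bisecting loop at
+   * LEVEL_LIMIT = log10(Long.MaxValue) / log10(2) = log2(Long.MaxValue), roughly 63,
+   * keeps 2 * index and 2 * index + 1 representable as a Long; MAX_DIVISIBLE_CLUSTER_INDEX
+   * (Long.MaxValue / 2) enforces the same bound for each individual call.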
*/ private def leftChildIndex(index: Long): Long = { require(index <= MAX_DIVISIBLE_CLUSTER_INDEX, s"Child index out of bound: 2 * $index.")