apache · manishamde · Apr 20, 2014 · Apr 22, 2014 · Apr 22, 2014 · Apr 22, 2014
diff --git a/docs/mllib-classification-regression.md b/docs/mllib-classification-regression.md
@@ -294,12 +294,9 @@ The recursive tree construction is stopped at a node when one of the two conditi
 1. The node depth is equal to the `maxDepth` training paramemter
 2. No split candidate leads to an information gain at the node.
 
-### Practical Limitations
-
-The tree implementation stores an Array[Double] of size *O(#features \* #splits \* 2^maxDepth)* in memory for aggregating histograms over partitions. The current implementation might not scale to very deep trees since the memory requirement grows exponentially with tree depth. 
-
-Please drop us a line if you encounter any issues. We are planning to solve this problem in the near future and real-world examples will be great.
+### Implementation Details
 
+The tree implementation stores an `Array[Double]` of size *O(#features \* #splits \* 2^maxDepth)* in memory for aggregating histograms over partitions. To keep this array within the `maxMemory` training parameter (128 MB by default), the nodes at a deep tree level are divided into smaller groups that are processed one after another, which avoids out-of-memory errors during computation.
 
 ## Implementation in MLlib
 

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -31,6 +31,7 @@ import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity, Variance}
 import org.apache.spark.mllib.tree.model._
 import org.apache.spark.rdd.RDD
 import org.apache.spark.util.random.XORShiftRandom
+import org.apache.spark.util.Utils.memoryStringToMb
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 
 /**
@@ -58,7 +59,8 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo
     // Find the splits and the corresponding bins (interval between the splits) using a sample
     // of the input data.
     val (splits, bins) = DecisionTree.findSplitsBins(input, strategy)
-    logDebug("numSplits = " + bins(0).length)
+    val numBins = bins(0).length
+    logDebug("numBins = " + numBins)
 
     // depth of the decision tree
     val maxDepth = strategy.maxDepth
@@ -72,7 +74,28 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo
     val parentImpurities = new Array[Double](maxNumNodes)
     // dummy value for top node (updated during first split calculation)
     val nodes = new Array[Node](maxNumNodes)
+    // num features
+    val numFeatures = input.take(1)(0).features.size
 
+    // Calculate level for single group construction
+
+    // Max memory usage for aggregates
+    val maxMemoryUsage = strategy.maxMemory * 1024 * 1024
+    logDebug("max memory usage for aggregates = " + maxMemoryUsage)
+    val numElementsPerNode = {
+      strategy.algo match {
+        case Classification => 2 * numBins *  numFeatures
+        case Regression => 3 * numBins * numFeatures
+      }
+    }
+    logDebug("numElementsPerNode = " + numElementsPerNode)
+    val arraySizePerNode = 8 * numElementsPerNode // approx. memory usage for bin aggregate array
+    val maxNumberOfNodesPerGroup = scala.math.max(maxMemoryUsage / arraySizePerNode, 1)
+    logDebug("maxNumberOfNodesPerGroup = " + maxNumberOfNodesPerGroup)
+    // A zero-indexed level holds 2^level nodes; the -1 below keeps one level of headroom.
+    val maxLevelForSingleGroup = scala.math.max(
+      (scala.math.log(maxNumberOfNodesPerGroup) / scala.math.log(2)).floor.toInt - 1, 0)
+    logDebug("max level for single group = " + maxLevelForSingleGroup)
 
     /*
      * The main idea here is to perform level-wise training of the decision tree nodes thus
@@ -92,7 +115,7 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo
 
         // Find best split for all nodes at a level.
         val splitsStatsForLevel = DecisionTree.findBestSplits(input, parentImpurities, strategy,
-          level, filters, splits, bins)
+          level, filters, splits, bins, maxLevelForSingleGroup)
 
         for ((nodeSplitStats, index) <- splitsStatsForLevel.view.zipWithIndex) {
           // Extract info for nodes at the current level.
@@ -110,6 +133,10 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo
       }
     }
 
+    logDebug("#####################################")
+    logDebug("Extracting tree model")
+    logDebug("#####################################")
+
     // Initialize the top or root node of the tree.
     val topNode = nodes(0)
     // Build the full tree using the node info calculated in the level-wise best split calculations.
@@ -260,6 +287,7 @@ object DecisionTree extends Serializable with Logging {
    * @param filters Filters for all nodes at a given level
    * @param splits possible splits for all features
    * @param bins possible bins for all features
+   * @param maxLevelForSingleGroup the deepest level for single-group level-wise computation.
    * @return array of splits with best splits for all nodes at a given level.
    */
   protected[tree] def findBestSplits(
@@ -269,7 +297,50 @@ object DecisionTree extends Serializable with Logging {
       level: Int,
       filters: Array[List[Filter]],
       splits: Array[Array[Split]],
-      bins: Array[Array[Bin]]): Array[(Split, InformationGainStats)] = {
+      bins: Array[Array[Bin]],
+      maxLevelForSingleGroup: Int): Array[(Split, InformationGainStats)] = {
+    // split into groups to avoid memory overflow during aggregation
+    if (level >  maxLevelForSingleGroup) {
+      val numGroups = scala.math.pow(2, (level - maxLevelForSingleGroup)).toInt
+      logDebug("numGroups = " + numGroups)
+      var groupIndex = 0
+      var bestSplits = new Array[(Split, InformationGainStats)](0)
+      while (groupIndex < numGroups) {
+        val bestSplitsForGroup = findBestSplitsPerGroup(input, parentImpurities, strategy, level,
+          filters, splits, bins, numGroups, groupIndex)
+        bestSplits = Array.concat(bestSplits, bestSplitsForGroup)
+        groupIndex += 1
+      }
+      bestSplits
+    } else {
+      findBestSplitsPerGroup(input, parentImpurities, strategy, level, filters, splits, bins)
+    }
+  }
+
+  /**
+   * Returns an array of optimal splits for a group of nodes at a given level.
+   *
+   * @param input training data, an RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
+   * @param parentImpurities Impurities for all parent nodes for the current level
+   * @param strategy [[org.apache.spark.mllib.tree.configuration.Strategy]] for tree construction
+   * @param level Level of the tree
+   * @param filters Filters for all nodes at a given level
+   * @param splits possible splits for all features
+   * @param bins possible bins for all features
+   * @param numGroups number of groups the nodes at this level are divided into (default 1)
+   * @param groupIndex zero-based index of the group to process (default 0)
+   * @return array of best splits for the nodes of the given group at the given level.
+   */
+  private def findBestSplitsPerGroup(
+      input: RDD[LabeledPoint],
+      parentImpurities: Array[Double],
+      strategy: Strategy,
+      level: Int,
+      filters: Array[List[Filter]],
+      splits: Array[Array[Split]],
+      bins: Array[Array[Bin]],
+      numGroups: Int = 1,
+      groupIndex: Int = 0): Array[(Split, InformationGainStats)] = {
 
     /*
      * The high-level description for the best split optimizations are noted here.
@@ -296,20 +367,23 @@ object DecisionTree extends Serializable with Logging {
      */
 
     // common calculations for multiple nested methods
-    val numNodes = scala.math.pow(2, level).toInt
+    val numNodes = scala.math.pow(2, level).toInt / numGroups
     logDebug("numNodes = " + numNodes)
     // Find the number of features by looking at the first sample.
     val numFeatures = input.first().features.size
     logDebug("numFeatures = " + numFeatures)
     val numBins = bins(0).length
     logDebug("numBins = " + numBins)
 
+    // index offset of this group's first node within the level; zero for a single group
+    val groupShift = numNodes * groupIndex
+
     /** Find the filters used before reaching the current code. */
     def findParentFilters(nodeIndex: Int): List[Filter] = {
       if (level == 0) {
         List[Filter]()
       } else {
-        val nodeFilterIndex = scala.math.pow(2, level).toInt - 1 + nodeIndex
+        val nodeFilterIndex = scala.math.pow(2, level).toInt - 1 + nodeIndex + groupShift
         filters(nodeFilterIndex)
       }
     }
@@ -878,7 +952,7 @@ object DecisionTree extends Serializable with Logging {
     // Iterating over all nodes at this level
     var node = 0
     while (node < numNodes) {
-      val nodeImpurityIndex = scala.math.pow(2, level).toInt - 1 + node
+      val nodeImpurityIndex = scala.math.pow(2, level).toInt - 1 + node + groupShift
       val binsForNode: Array[Double] = getBinDataForNode(node)
       logDebug("nodeImpurityIndex = " + nodeImpurityIndex)
       val parentNodeImpurity = parentImpurities(nodeImpurityIndex)
@@ -1085,10 +1159,13 @@ object DecisionTree extends Serializable with Logging {
 
     val maxDepth = options.getOrElse('maxDepth, "1").toString.toInt
     val maxBins = options.getOrElse('maxBins, "100").toString.toInt
+    val maxMemUsage = memoryStringToMb(options.getOrElse('maxMemory, "128m").toString)
 
-    val strategy = new Strategy(algo, impurity, maxDepth, maxBins)
+    val strategy = new Strategy(algo, impurity, maxDepth, maxBins, maxMemory=maxMemUsage)
     val model = DecisionTree.train(trainData, strategy)
 
+
+
     // Load test data.
     val testData = loadLabeledData(sc, options.get('testDataDir).get.toString)
 

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
@@ -35,6 +35,9 @@ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._
  *                                k) implies the feature n is categorical with k categories 0,
  *                                1, 2, ... , k-1. It's important to note that features are
  *                                zero-indexed.
+ * @param maxMemory maximum memory in MB allocated to histogram aggregation. Default value is
+ *                  128 MB.
+ *
  */
 @Experimental
 class Strategy (
@@ -43,4 +46,5 @@ class Strategy (
     val maxDepth: Int,
     val maxBins: Int = 100,
     val quantileCalculationStrategy: QuantileStrategy = Sort,
-    val categoricalFeaturesInfo: Map[Int, Int] = Map[Int, Int]()) extends Serializable
+    val categoricalFeaturesInfo: Map[Int, Int] = Map[Int, Int](),
+    val maxMemory: Int = 128) extends Serializable
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala
@@ -254,7 +254,7 @@ class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll {
       categoricalFeaturesInfo = Map(0 -> 3, 1-> 3))
     val (splits, bins) = DecisionTree.findSplitsBins(rdd, strategy)
     val bestSplits = DecisionTree.findBestSplits(rdd, new Array(7), strategy, 0,
-      Array[List[Filter]](), splits, bins)
+      Array[List[Filter]](), splits, bins, 10)
 
     val split = bestSplits(0)._1
     assert(split.categories.length === 1)
@@ -281,7 +281,7 @@ class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll {
       categoricalFeaturesInfo = Map(0 -> 3, 1-> 3))
     val (splits, bins) = DecisionTree.findSplitsBins(rdd,strategy)
     val bestSplits = DecisionTree.findBestSplits(rdd, new Array(7), strategy, 0,
-      Array[List[Filter]](), splits, bins)
+      Array[List[Filter]](), splits, bins, 10)
 
     val split = bestSplits(0)._1
     assert(split.categories.length === 1)
@@ -310,7 +310,7 @@ class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll {
     assert(bins(0).length === 100)
 
     val bestSplits = DecisionTree.findBestSplits(rdd, new Array(7), strategy, 0,
-      Array[List[Filter]](), splits, bins)
+      Array[List[Filter]](), splits, bins, 10)
     assert(bestSplits.length === 1)
     assert(bestSplits(0)._1.feature === 0)
     assert(bestSplits(0)._1.threshold === 10)
@@ -333,7 +333,7 @@ class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll {
     assert(bins(0).length === 100)
 
     val bestSplits = DecisionTree.findBestSplits(rdd, Array(0.0), strategy, 0,
-      Array[List[Filter]](), splits, bins)
+      Array[List[Filter]](), splits, bins, 10)
     assert(bestSplits.length === 1)
     assert(bestSplits(0)._1.feature === 0)
     assert(bestSplits(0)._1.threshold === 10)
@@ -357,7 +357,7 @@ class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll {
     assert(bins(0).length === 100)
 
     val bestSplits = DecisionTree.findBestSplits(rdd, Array(0.0), strategy, 0,
-      Array[List[Filter]](), splits, bins)
+      Array[List[Filter]](), splits, bins, 10)
     assert(bestSplits.length === 1)
     assert(bestSplits(0)._1.feature === 0)
     assert(bestSplits(0)._1.threshold === 10)
@@ -381,7 +381,7 @@ class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll {
     assert(bins(0).length === 100)
 
     val bestSplits = DecisionTree.findBestSplits(rdd, Array(0.0), strategy, 0,
-      Array[List[Filter]](), splits, bins)
+      Array[List[Filter]](), splits, bins, 10)
     assert(bestSplits.length === 1)
     assert(bestSplits(0)._1.feature === 0)
     assert(bestSplits(0)._1.threshold === 10)