From 6398ae67f2999bf9344ce0fbf8c412dad8f369d1 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 27 Mar 2016 22:13:17 +0000 Subject: [PATCH 01/11] [SPARK-3724][MLLIB] RandomForest: More options for feature subset size. This PR tries to support more options for feature subset size in the RandomForest implementation. Previously, RandomForest only supported "auto", "all", "sqrt", "log2", "onethird". This PR tries to support any given value to allow model search. In this PR, `featureSubsetStrategy` could be passed with: a) a real number in the range of `(0.0-1.0]` that represents the fraction of the number of features in each subset, b) an integer number (`>0`) that represents the number of features in each subset. --- .../ml/tree/impl/DecisionTreeMetadata.scala | 29 +++++++++++++++++++ .../org/apache/spark/ml/tree/treeParams.scala | 6 +++- .../spark/mllib/tree/RandomForest.scala | 11 +++++-- .../JavaRandomForestClassifierSuite.java | 6 ++++ .../JavaRandomForestRegressorSuite.java | 6 ++++ 5 files changed, 55 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala index df8eb5d1f9278..7efd0fc073ba3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala @@ -183,11 +183,40 @@ private[spark] object DecisionTreeMetadata extends Logging { } case _ => featureSubsetStrategy } + + object featureSubsetNumber { + def unapply(strategy: String): Option[Int] = try { + val number = strategy.toInt + if (0 < number) { + Some(number) + } else { + None + } + } catch { + case _ : java.lang.NumberFormatException => None + } + } + + object featureSubsetFraction { + def unapply(strategy: String): Option[Double] = try { + val fraction = strategy.toDouble + if (0.0 < fraction && fraction <= 1.0) { + Some(fraction) + } else { + None + } + 
} catch { + case _ : java.lang.NumberFormatException => None + } + } + val numFeaturesPerNode: Int = _featureSubsetStrategy match { case "all" => numFeatures case "sqrt" => math.sqrt(numFeatures).ceil.toInt case "log2" => math.max(1, (math.log(numFeatures) / math.log(2)).ceil.toInt) case "onethird" => (numFeatures / 3.0).ceil.toInt + case featureSubsetNumber(number) => if (number > numFeatures) numFeatures else number + case featureSubsetFraction(fraction) => (fraction * numFeatures).ceil.toInt } new DecisionTreeMetadata(numFeatures, numExamples, numClasses, numBins.max, diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index 78e6d3bfacb53..dbda20ab2d3c9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -329,6 +329,8 @@ private[ml] trait HasFeatureSubsetStrategy extends Params { * - "onethird": use 1/3 of the features * - "sqrt": use sqrt(number of features) * - "log2": use log2(number of features) + * - "(0.0-1.0]": use the specified fraction of features + * - "[1-n]": use the specified number of features * (default = "auto") * * These various settings are based on the following references: @@ -346,7 +348,9 @@ private[ml] trait HasFeatureSubsetStrategy extends Params { "The number of features to consider for splits at each tree node." 
+ s" Supported options: ${RandomForestParams.supportedFeatureSubsetStrategies.mkString(", ")}", (value: String) => - RandomForestParams.supportedFeatureSubsetStrategies.contains(value.toLowerCase)) + RandomForestParams.supportedFeatureSubsetStrategies.contains(value.toLowerCase) + || (try { value.toInt > 0 } catch { case _ : Throwable => false }) + || (try { value.toDouble > 0.0 || value.toDouble <= 1.0} catch { case _ : Throwable => false })) setDefault(featureSubsetStrategy -> "auto") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index 1841fa4a95c98..b5be983951300 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -55,10 +55,15 @@ import org.apache.spark.util.Utils * @param numTrees If 1, then no bootstrapping is used. If > 1, then bootstrapping is done. * @param featureSubsetStrategy Number of features to consider for splits at each node. * Supported values: "auto", "all", "sqrt", "log2", "onethird". + * Supported numerical values: "(0.0-1.0]", "[1-n]". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; * if numTrees > 1 (forest) set to "sqrt" for classification and * to "onethird" for regression. + * If a real value "(0.0-1.0]" is set, this parameter specifies + * the fraction of features in each subset. + * If an integer value "[1-n]" is set, this parameter specifies + * the number of features in each subset. * @param seed Random seed for bootstrapping and choosing feature subsets. 
*/ private class RandomForest ( @@ -70,9 +75,11 @@ private class RandomForest ( strategy.assertValid() require(numTrees > 0, s"RandomForest requires numTrees > 0, but was given numTrees = $numTrees.") - require(RandomForest.supportedFeatureSubsetStrategies.contains(featureSubsetStrategy), + require(RandomForest.supportedFeatureSubsetStrategies.contains(featureSubsetStrategy) + || (try { featureSubsetStrategy.toInt > 0 } catch { case _ : Throwable => false }) + || (try { featureSubsetStrategy.toDouble > 0.0 || featureSubsetStrategy.toDouble <= 1.0} catch { case _ : Throwable => false }), s"RandomForest given invalid featureSubsetStrategy: $featureSubsetStrategy." + - s" Supported values: ${RandomForest.supportedFeatureSubsetStrategies.mkString(", ")}.") + s" Supported values: ${RandomForest.supportedFeatureSubsetStrategies.mkString(", ")}, [0.0-1.0], [1-n].") /** * Method to train a decision tree model over an RDD diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java index 75061464e5462..315cb185427d7 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java @@ -80,6 +80,12 @@ public void runDT() { for (String featureSubsetStrategy: RandomForestClassifier.supportedFeatureSubsetStrategies()) { rf.setFeatureSubsetStrategy(featureSubsetStrategy); } + for (double featureSubsetFraction = 0.1; featureSubsetFraction <= 1.0; featureSubsetFraction++) { + rf.setFeatureSubsetStrategy(Double.toString(featureSubsetFraction)); + } + for (int featureSubsetNumber = 1; featureSubsetNumber <= 100; featureSubsetNumber++) { + rf.setFeatureSubsetStrategy(Integer.toString(featureSubsetNumber)); + } RandomForestClassificationModel model = rf.fit(dataFrame); model.transform(dataFrame); diff --git 
a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java index b6f793f6de89f..9984cead9c7e9 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java @@ -80,6 +80,12 @@ public void runDT() { for (String featureSubsetStrategy: RandomForestRegressor.supportedFeatureSubsetStrategies()) { rf.setFeatureSubsetStrategy(featureSubsetStrategy); } + for (double featureSubsetFraction = 0.1; featureSubsetFraction <= 1.0; featureSubsetFraction++) { + rf.setFeatureSubsetStrategy(Double.toString(featureSubsetFraction)); + } + for (int featureSubsetNumber = 1; featureSubsetNumber <= 100; featureSubsetNumber++) { + rf.setFeatureSubsetStrategy(Integer.toString(featureSubsetNumber)); + } RandomForestRegressionModel model = rf.fit(dataFrame); model.transform(dataFrame); From 326f5a0f10ba89d5efd3bb27aa90841ce8c6dfa3 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 27 Mar 2016 23:20:28 +0000 Subject: [PATCH 02/11] [SPARK-3724][MLLIB] RandomForest: More options for feature subset size. Add one additional test in org.apache.spark.mllib.tree.RandomForestSuite to cover the changes in options for feature subset size. 
--- .../spark/mllib/tree/RandomForestSuite.scala | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala index bec61ba6a003c..abae788e74eb6 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala @@ -135,6 +135,23 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { assert(rf1.toDebugString != rf2.toDebugString) } + test("options for feature subset size in RandomForest - SPARK-3724") { + val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 50, 1000) + val rdd = sc.parallelize(arr) + val numTrees = 1 + + val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 2, + numClasses = 2, categoricalFeaturesInfo = Map.empty[Int, Int], + useNodeIdCache = true) + + // Both options should be the same as 17 == 50 * 0.34 + val rf1 = RandomForest.trainClassifier(rdd, strategy, numTrees = numTrees, + featureSubsetStrategy = "17", seed = 123) + val rf2 = RandomForest.trainClassifier(rdd, strategy, numTrees = numTrees, + featureSubsetStrategy = "0.34", seed = 123) + assert(rf1.toDebugString == rf2.toDebugString) + } + test("model save/load") { val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString From e1543543e5dc35a8fc691705017709d852382e9c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 27 Mar 2016 23:53:30 +0000 Subject: [PATCH 03/11] [SPARK-3724][MLLIB] RandomForest: More options for feature subset size. Fix a couple of issues in JavaRandomForestRegressorSuite and JavaRandomForestClassifierSuite. Fix a typo in comment. 
--- .../main/scala/org/apache/spark/mllib/tree/RandomForest.scala | 2 +- .../ml/classification/JavaRandomForestClassifierSuite.java | 2 +- .../spark/ml/regression/JavaRandomForestRegressorSuite.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index b5be983951300..8f0081e3c5b9c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -79,7 +79,7 @@ private class RandomForest ( || (try { featureSubsetStrategy.toInt > 0 } catch { case _ : Throwable => false }) || (try { featureSubsetStrategy.toDouble > 0.0 || featureSubsetStrategy.toDouble <= 1.0} catch { case _ : Throwable => false }), s"RandomForest given invalid featureSubsetStrategy: $featureSubsetStrategy." + - s" Supported values: ${RandomForest.supportedFeatureSubsetStrategies.mkString(", ")}, [0.0-1.0], [1-n].") + s" Supported values: ${RandomForest.supportedFeatureSubsetStrategies.mkString(", ")}, (0.0-1.0], [1-n].") /** * Method to train a decision tree model over an RDD diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java index 315cb185427d7..6b616f174a388 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java @@ -80,7 +80,7 @@ public void runDT() { for (String featureSubsetStrategy: RandomForestClassifier.supportedFeatureSubsetStrategies()) { rf.setFeatureSubsetStrategy(featureSubsetStrategy); } - for (double featureSubsetFraction = 0.1; featureSubsetFraction <= 1.0; featureSubsetFraction++) { + for (double featureSubsetFraction = 0.1; featureSubsetFraction 
<= 1.0; featureSubsetFraction += 0.1) { rf.setFeatureSubsetStrategy(Double.toString(featureSubsetFraction)); } for (int featureSubsetNumber = 1; featureSubsetNumber <= 100; featureSubsetNumber++) { diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java index 9984cead9c7e9..1c7b386c5cbac 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java @@ -80,7 +80,7 @@ public void runDT() { for (String featureSubsetStrategy: RandomForestRegressor.supportedFeatureSubsetStrategies()) { rf.setFeatureSubsetStrategy(featureSubsetStrategy); } - for (double featureSubsetFraction = 0.1; featureSubsetFraction <= 1.0; featureSubsetFraction++) { + for (double featureSubsetFraction = 0.1; featureSubsetFraction <= 1.0; featureSubsetFraction += 0.1) { rf.setFeatureSubsetStrategy(Double.toString(featureSubsetFraction)); } for (int featureSubsetNumber = 1; featureSubsetNumber <= 100; featureSubsetNumber++) { From de3d7ac0f1eda0a9c71a208c29552d8c04cb70ab Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 1 Apr 2016 01:44:30 +0000 Subject: [PATCH 04/11] [SPARK-3724][MLLIB] RandomForest: More options for feature subset size. Move tests from mllib to ml. Replace extractors with regex. 
--- .../ml/tree/impl/DecisionTreeMetadata.scala | 32 +++---------------- .../org/apache/spark/ml/tree/treeParams.scala | 3 +- .../spark/mllib/tree/RandomForest.scala | 6 ++-- .../ml/tree/impl/RandomForestSuite.scala | 20 ++++++++++++ .../spark/mllib/tree/RandomForestSuite.scala | 17 ---------- 5 files changed, 28 insertions(+), 50 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala index 7efd0fc073ba3..cc1adc8bf2eed 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala @@ -184,39 +184,15 @@ private[spark] object DecisionTreeMetadata extends Logging { case _ => featureSubsetStrategy } - object featureSubsetNumber { - def unapply(strategy: String): Option[Int] = try { - val number = strategy.toInt - if (0 < number) { - Some(number) - } else { - None - } - } catch { - case _ : java.lang.NumberFormatException => None - } - } - - object featureSubsetFraction { - def unapply(strategy: String): Option[Double] = try { - val fraction = strategy.toDouble - if (0.0 < fraction && fraction <= 1.0) { - Some(fraction) - } else { - None - } - } catch { - case _ : java.lang.NumberFormatException => None - } - } - + val isIntRegex = "^([1-9]\\d*)$".r + val isFractionRegex = "^(0\\.\\d*[1-9]\\d*\\d*|1\\.0+)$".r val numFeaturesPerNode: Int = _featureSubsetStrategy match { case "all" => numFeatures case "sqrt" => math.sqrt(numFeatures).ceil.toInt case "log2" => math.max(1, (math.log(numFeatures) / math.log(2)).ceil.toInt) case "onethird" => (numFeatures / 3.0).ceil.toInt - case featureSubsetNumber(number) => if (number > numFeatures) numFeatures else number - case featureSubsetFraction(fraction) => (fraction * numFeatures).ceil.toInt + case isIntRegex(number) => if (number.toInt > numFeatures) numFeatures else number.toInt + case 
isFractionRegex(fraction) => (fraction.toDouble * numFeatures).ceil.toInt } new DecisionTreeMetadata(numFeatures, numExamples, numClasses, numBins.max, diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index dbda20ab2d3c9..5e072ca3a61cc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -349,8 +349,7 @@ private[ml] trait HasFeatureSubsetStrategy extends Params { s" Supported options: ${RandomForestParams.supportedFeatureSubsetStrategies.mkString(", ")}", (value: String) => RandomForestParams.supportedFeatureSubsetStrategies.contains(value.toLowerCase) - || (try { value.toInt > 0 } catch { case _ : Throwable => false }) - || (try { value.toDouble > 0.0 || value.toDouble <= 1.0} catch { case _ : Throwable => false })) + || value.matches("^(?:[1-9]\\d*|0\\.\\d*[1-9]\\d*\\d*|1\\.0+)$")) setDefault(featureSubsetStrategy -> "auto") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index 8f0081e3c5b9c..b3650d49414e2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -76,10 +76,10 @@ private class RandomForest ( strategy.assertValid() require(numTrees > 0, s"RandomForest requires numTrees > 0, but was given numTrees = $numTrees.") require(RandomForest.supportedFeatureSubsetStrategies.contains(featureSubsetStrategy) - || (try { featureSubsetStrategy.toInt > 0 } catch { case _ : Throwable => false }) - || (try { featureSubsetStrategy.toDouble > 0.0 || featureSubsetStrategy.toDouble <= 1.0} catch { case _ : Throwable => false }), + || featureSubsetStrategy.matches("^(?:[1-9]\\d*|0\\.\\d*[1-9]\\d*\\d*|1\\.0+)$"), s"RandomForest given invalid featureSubsetStrategy: $featureSubsetStrategy." 
+ - s" Supported values: ${RandomForest.supportedFeatureSubsetStrategies.mkString(", ")}, (0.0-1.0], [1-n].") + s" Supported values: ${RandomForest.supportedFeatureSubsetStrategies.mkString(", ")}," + + s" (0.0-1.0], [1-n].") /** * Method to train a decision tree model over an RDD diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala index e64551f03c92f..747bb2ddeec59 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{DecisionTreeSuite => OldDTSuite, EnsembleTestHelper} import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, QuantileStrategy, Strategy => OldStrategy} import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, GiniCalculator} +import org.apache.spark.mllib.tree.model.RandomForestModel import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.util.collection.OpenHashMap @@ -508,6 +509,25 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01) } + test("options for feature subset size in RandomForest - SPARK-3724") { + val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 50, 1000) + val rdd = sc.parallelize(arr) + val numTrees = 1 + + val strategy = new OldStrategy(algo = OldAlgo.Classification, impurity = Gini, maxDepth = 2, + numClasses = 2, categoricalFeaturesInfo = Map.empty[Int, Int], + useNodeIdCache = true) + + // Both options should be the same as 17 == 50 * 0.34 + val rf1 = RandomForest.run(rdd, strategy, numTrees = numTrees, + featureSubsetStrategy = "17", seed = 123) + val rf2 = RandomForest.run(rdd, strategy, numTrees = 
numTrees, + featureSubsetStrategy = "0.34", seed = 123) + val model1 = new RandomForestModel(strategy.algo, rf1.map(_.toOld)) + val model2 = new RandomForestModel(strategy.algo, rf2.map(_.toOld)) + assert(model1.toDebugString == model2.toDebugString) + } + } private object RandomForestSuite { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala index abae788e74eb6..bec61ba6a003c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala @@ -135,23 +135,6 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { assert(rf1.toDebugString != rf2.toDebugString) } - test("options for feature subset size in RandomForest - SPARK-3724") { - val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 50, 1000) - val rdd = sc.parallelize(arr) - val numTrees = 1 - - val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 2, - numClasses = 2, categoricalFeaturesInfo = Map.empty[Int, Int], - useNodeIdCache = true) - - // Both options should be the same as 17 == 50 * 0.34 - val rf1 = RandomForest.trainClassifier(rdd, strategy, numTrees = numTrees, - featureSubsetStrategy = "17", seed = 123) - val rf2 = RandomForest.trainClassifier(rdd, strategy, numTrees = numTrees, - featureSubsetStrategy = "0.34", seed = 123) - assert(rf1.toDebugString == rf2.toDebugString) - } - test("model save/load") { val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString From 704a8f0156787b59724ad9f091422da4ad35ddff Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 2 Apr 2016 02:27:03 +0000 Subject: [PATCH 05/11] [SPARK-3724][MLLIB] RandomForest: More options for feature subset size. Update pull request based on @sethah's feedback. 
--- .../ml/tree/impl/DecisionTreeMetadata.scala | 2 +- .../JavaRandomForestClassifierSuite.java | 6 ++-- .../JavaRandomForestRegressorSuite.java | 6 ++-- .../ml/tree/impl/RandomForestSuite.scala | 33 ++++++++----------- 4 files changed, 21 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala index cc1adc8bf2eed..4b676c77ef34e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala @@ -191,7 +191,7 @@ private[spark] object DecisionTreeMetadata extends Logging { case "sqrt" => math.sqrt(numFeatures).ceil.toInt case "log2" => math.max(1, (math.log(numFeatures) / math.log(2)).ceil.toInt) case "onethird" => (numFeatures / 3.0).ceil.toInt - case isIntRegex(number) => if (number.toInt > numFeatures) numFeatures else number.toInt + case isIntRegex(number) => if (BigInt(number) > numFeatures) numFeatures else number.toInt case isFractionRegex(fraction) => (fraction.toDouble * numFeatures).ceil.toInt } diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java index 6b616f174a388..531283e6d4abc 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java @@ -83,9 +83,9 @@ public void runDT() { for (double featureSubsetFraction = 0.1; featureSubsetFraction <= 1.0; featureSubsetFraction += 0.1) { rf.setFeatureSubsetStrategy(Double.toString(featureSubsetFraction)); } - for (int featureSubsetNumber = 1; featureSubsetNumber <= 100; featureSubsetNumber++) { - rf.setFeatureSubsetStrategy(Integer.toString(featureSubsetNumber)); - } + 
rf.setFeatureSubsetStrategy("1"); + rf.setFeatureSubsetStrategy("100"); + rf.setFeatureSubsetStrategy("1000"); RandomForestClassificationModel model = rf.fit(dataFrame); model.transform(dataFrame); diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java index 1c7b386c5cbac..ec67c02e4396d 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java @@ -83,9 +83,9 @@ public void runDT() { for (double featureSubsetFraction = 0.1; featureSubsetFraction <= 1.0; featureSubsetFraction += 0.1) { rf.setFeatureSubsetStrategy(Double.toString(featureSubsetFraction)); } - for (int featureSubsetNumber = 1; featureSubsetNumber <= 100; featureSubsetNumber++) { - rf.setFeatureSubsetStrategy(Integer.toString(featureSubsetNumber)); - } + rf.setFeatureSubsetStrategy("1"); + rf.setFeatureSubsetStrategy("100"); + rf.setFeatureSubsetStrategy("1000"); RandomForestRegressionModel model = rf.fit(dataFrame); model.transform(dataFrame); diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala index 747bb2ddeec59..216db1f8c8f8b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala @@ -423,6 +423,13 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { checkFeatureSubsetStrategy(numTrees = 1, "log2", (math.log(numFeatures) / math.log(2)).ceil.toInt) checkFeatureSubsetStrategy(numTrees = 1, "onethird", (numFeatures / 3.0).ceil.toInt) + checkFeatureSubsetStrategy(numTrees = 1, "0.1", (0.1 * numFeatures).ceil.toInt) + checkFeatureSubsetStrategy(numTrees = 1, "0.5", (0.5 * 
numFeatures).ceil.toInt) + checkFeatureSubsetStrategy(numTrees = 1, "1.0", (1.0 * numFeatures).ceil.toInt) + checkFeatureSubsetStrategy(numTrees = 1, "1", 1) + checkFeatureSubsetStrategy(numTrees = 1, "2", 2) + checkFeatureSubsetStrategy(numTrees = 1, numFeatures.toString, numFeatures) + checkFeatureSubsetStrategy(numTrees = 1, (numFeatures * 2).toString, numFeatures) checkFeatureSubsetStrategy(numTrees = 2, "all", numFeatures) checkFeatureSubsetStrategy(numTrees = 2, "auto", math.sqrt(numFeatures).ceil.toInt) @@ -430,6 +437,13 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { checkFeatureSubsetStrategy(numTrees = 2, "log2", (math.log(numFeatures) / math.log(2)).ceil.toInt) checkFeatureSubsetStrategy(numTrees = 2, "onethird", (numFeatures / 3.0).ceil.toInt) + checkFeatureSubsetStrategy(numTrees = 2, "0.1", (0.1 * numFeatures).ceil.toInt) + checkFeatureSubsetStrategy(numTrees = 2, "0.5", (0.5 * numFeatures).ceil.toInt) + checkFeatureSubsetStrategy(numTrees = 2, "1.0", (1.0 * numFeatures).ceil.toInt) + checkFeatureSubsetStrategy(numTrees = 2, "1", 1) + checkFeatureSubsetStrategy(numTrees = 2, "2", 2) + checkFeatureSubsetStrategy(numTrees = 2, numFeatures.toString, numFeatures) + checkFeatureSubsetStrategy(numTrees = 2, (numFeatures * 2).toString, numFeatures) } test("Binary classification with continuous features: subsampling features") { @@ -509,25 +523,6 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01) } - test("options for feature subset size in RandomForest - SPARK-3724") { - val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 50, 1000) - val rdd = sc.parallelize(arr) - val numTrees = 1 - - val strategy = new OldStrategy(algo = OldAlgo.Classification, impurity = Gini, maxDepth = 2, - numClasses = 2, categoricalFeaturesInfo = Map.empty[Int, Int], - useNodeIdCache = true) - - // Both options should be the same as 17 == 50 * 
0.34 - val rf1 = RandomForest.run(rdd, strategy, numTrees = numTrees, - featureSubsetStrategy = "17", seed = 123) - val rf2 = RandomForest.run(rdd, strategy, numTrees = numTrees, - featureSubsetStrategy = "0.34", seed = 123) - val model1 = new RandomForestModel(strategy.algo, rf1.map(_.toOld)) - val model2 = new RandomForestModel(strategy.algo, rf2.map(_.toOld)) - assert(model1.toDebugString == model2.toDebugString) - } - } private object RandomForestSuite { From f02604b6412150609988306ad3f820e801684bdb Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 2 Apr 2016 02:57:37 +0000 Subject: [PATCH 06/11] [SPARK-3724][MLLIB] RandomForest: More options for feature subset size. Reduce unneeded tests based on feedback from @sethah. --- .../ml/classification/JavaRandomForestClassifierSuite.java | 6 +++--- .../spark/ml/regression/JavaRandomForestRegressorSuite.java | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java index 531283e6d4abc..a2f0c1b73685e 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java @@ -80,9 +80,9 @@ public void runDT() { for (String featureSubsetStrategy: RandomForestClassifier.supportedFeatureSubsetStrategies()) { rf.setFeatureSubsetStrategy(featureSubsetStrategy); } - for (double featureSubsetFraction = 0.1; featureSubsetFraction <= 1.0; featureSubsetFraction += 0.1) { - rf.setFeatureSubsetStrategy(Double.toString(featureSubsetFraction)); - } + rf.setFeatureSubsetStrategy("0.1"); + rf.setFeatureSubsetStrategy("0.9"); + rf.setFeatureSubsetStrategy("1.0"); rf.setFeatureSubsetStrategy("1"); rf.setFeatureSubsetStrategy("100"); rf.setFeatureSubsetStrategy("1000"); diff --git 
a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java index ec67c02e4396d..261dbe6b3d45d 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java @@ -80,9 +80,9 @@ public void runDT() { for (String featureSubsetStrategy: RandomForestRegressor.supportedFeatureSubsetStrategies()) { rf.setFeatureSubsetStrategy(featureSubsetStrategy); } - for (double featureSubsetFraction = 0.1; featureSubsetFraction <= 1.0; featureSubsetFraction += 0.1) { - rf.setFeatureSubsetStrategy(Double.toString(featureSubsetFraction)); - } + rf.setFeatureSubsetStrategy("0.1"); + rf.setFeatureSubsetStrategy("0.9"); + rf.setFeatureSubsetStrategy("1.0"); rf.setFeatureSubsetStrategy("1"); rf.setFeatureSubsetStrategy("100"); rf.setFeatureSubsetStrategy("1000"); From c2b662b486d70b41ff47570a6f19de17d1bfe637 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 5 Apr 2016 03:34:41 +0000 Subject: [PATCH 07/11] [SPARK-3724][MLLIB] RandomForest: More options for feature subset size. Move repeated regex to a constant (@MLnick). Remove redundant `\\d*` from the end of the regex (@sethah). Reword the documentation for a better explanation (@sethah). 
--- .../apache/spark/ml/tree/impl/DecisionTreeMetadata.scala | 2 +- .../main/scala/org/apache/spark/ml/tree/treeParams.scala | 7 +++++-- .../scala/org/apache/spark/mllib/tree/RandomForest.scala | 9 +++++---- .../classification/JavaRandomForestClassifierSuite.java | 3 +++ .../ml/regression/JavaRandomForestRegressorSuite.java | 3 +++ 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala index 4b676c77ef34e..c7cde1563fc79 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala @@ -185,7 +185,7 @@ private[spark] object DecisionTreeMetadata extends Logging { } val isIntRegex = "^([1-9]\\d*)$".r - val isFractionRegex = "^(0\\.\\d*[1-9]\\d*\\d*|1\\.0+)$".r + val isFractionRegex = "^(0?\\.\\d*[1-9]\\d*|1\\.0+)$".r val numFeaturesPerNode: Int = _featureSubsetStrategy match { case "all" => numFeatures case "sqrt" => math.sqrt(numFeatures).ceil.toInt diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index 5e072ca3a61cc..ee77389428f75 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -330,7 +330,7 @@ private[ml] trait HasFeatureSubsetStrategy extends Params { * - "sqrt": use sqrt(number of features) * - "log2": use log2(number of features) * - "(0.0-1.0]": use the specified fraction of features - * - "[1-n]": use the specified number of features + * - "n": use n features, for integer 0 < n <= (number of features) * (default = "auto") * * These various settings are based on the following references: @@ -349,7 +349,7 @@ private[ml] trait HasFeatureSubsetStrategy extends Params { s" Supported options: 
${RandomForestParams.supportedFeatureSubsetStrategies.mkString(", ")}", (value: String) => RandomForestParams.supportedFeatureSubsetStrategies.contains(value.toLowerCase) - || value.matches("^(?:[1-9]\\d*|0\\.\\d*[1-9]\\d*\\d*|1\\.0+)$")) + || value.matches(RandomForestParams.supportedFeatureSubsetStrategiesRegex)) setDefault(featureSubsetStrategy -> "auto") @@ -396,6 +396,9 @@ private[spark] object RandomForestParams { // These options should be lowercase. final val supportedFeatureSubsetStrategies: Array[String] = Array("auto", "all", "onethird", "sqrt", "log2").map(_.toLowerCase) + + // The regex to capture "(0.0-1.0]", and "n" for integer 0 < n <= (number of features) + final val supportedFeatureSubsetStrategiesRegex = "^(?:[1-9]\\d*|0?\\.\\d*[1-9]\\d*|1\\.0+)$" } private[ml] trait RandomForestClassifierParams diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index b3650d49414e2..7493a9ca7815e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -61,9 +61,10 @@ import org.apache.spark.util.Utils * if numTrees > 1 (forest) set to "sqrt" for classification and * to "onethird" for regression. * If a real value "(0.0-1.0]" is set, this parameter specifies - * the fraction of features in each subset. - * If an integer value "[1-n]" is set, this parameter specifies - * the number of features in each subset. + * the fraction of features in each subset. + * If an integer value "n" is set, this parameter specifies + * the number of features used in each subset, + * for integer 0 < n <= (number of features). * @param seed Random seed for bootstrapping and choosing feature subsets. 
*/ private class RandomForest ( @@ -76,7 +77,7 @@ private class RandomForest ( strategy.assertValid() require(numTrees > 0, s"RandomForest requires numTrees > 0, but was given numTrees = $numTrees.") require(RandomForest.supportedFeatureSubsetStrategies.contains(featureSubsetStrategy) - || featureSubsetStrategy.matches("^(?:[1-9]\\d*|0\\.\\d*[1-9]\\d*\\d*|1\\.0+)$"), + || featureSubsetStrategy.matches(NewRFParams.supportedFeatureSubsetStrategiesRegex), s"RandomForest given invalid featureSubsetStrategy: $featureSubsetStrategy." + s" Supported values: ${RandomForest.supportedFeatureSubsetStrategies.mkString(", ")}," + s" (0.0-1.0], [1-n].") diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java index a2f0c1b73685e..f9dddb91d8d6c 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java @@ -80,6 +80,9 @@ public void runDT() { for (String featureSubsetStrategy: RandomForestClassifier.supportedFeatureSubsetStrategies()) { rf.setFeatureSubsetStrategy(featureSubsetStrategy); } + rf.setFeatureSubsetStrategy(".1"); + rf.setFeatureSubsetStrategy(".10"); + rf.setFeatureSubsetStrategy("0.10"); rf.setFeatureSubsetStrategy("0.1"); rf.setFeatureSubsetStrategy("0.9"); rf.setFeatureSubsetStrategy("1.0"); diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java index 261dbe6b3d45d..bfec9a2ead7ec 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java @@ -80,6 +80,9 @@ public void runDT() { for (String featureSubsetStrategy: 
RandomForestRegressor.supportedFeatureSubsetStrategies()) { rf.setFeatureSubsetStrategy(featureSubsetStrategy); } + rf.setFeatureSubsetStrategy(".1"); + rf.setFeatureSubsetStrategy(".10"); + rf.setFeatureSubsetStrategy("0.10"); rf.setFeatureSubsetStrategy("0.1"); rf.setFeatureSubsetStrategy("0.9"); rf.setFeatureSubsetStrategy("1.0"); From bebd544bf411717ac22899f79627b0811b1da8c5 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 5 Apr 2016 16:12:47 +0000 Subject: [PATCH 08/11] [SPARK-3724][MLLIB] RandomForest: More options for feature subset size. Remove unneeded import. --- .../scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala index 216db1f8c8f8b..6921953ffeae7 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala @@ -27,7 +27,6 @@ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{DecisionTreeSuite => OldDTSuite, EnsembleTestHelper} import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, QuantileStrategy, Strategy => OldStrategy} import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, GiniCalculator} -import org.apache.spark.mllib.tree.model.RandomForestModel import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.util.collection.OpenHashMap From 13edc07ec3f5aebdea4e6e4a635656393aebfd8e Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 6 Apr 2016 01:53:33 +0000 Subject: [PATCH 09/11] [SPARK-3724][MLLIB] RandomForest: More options for feature subset size. Reorganize the wording in the comment (@MLnick). 
--- .../main/scala/org/apache/spark/ml/tree/treeParams.scala | 4 ++-- .../scala/org/apache/spark/mllib/tree/RandomForest.scala | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index ee77389428f75..0767dc17e5562 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -329,8 +329,8 @@ private[ml] trait HasFeatureSubsetStrategy extends Params { * - "onethird": use 1/3 of the features * - "sqrt": use sqrt(number of features) * - "log2": use log2(number of features) - * - "(0.0-1.0]": use the specified fraction of features - * - "n": use n features, for integer 0 < n <= (number of features) + * - "n": when n is in the range (0, 1.0], use n * number of features. When n + * is an integer in the range [1, number of features], use n features. * (default = "auto") * * These various settings are based on the following references: diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index 7493a9ca7815e..26755849ad1a2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -60,11 +60,10 @@ import org.apache.spark.util.Utils * if numTrees == 1, set to "all"; * if numTrees > 1 (forest) set to "sqrt" for classification and * to "onethird" for regression. - * If a real value "(0.0-1.0]" is set, this parameter specifies - * the fraction of features in each subset. - * If an integer value "n" is set, this parameter specifies - * the number of features used in each subset, - * for integer 0 < n <= (number of features). + * If a real value "n" in the range (0, 1.0] is set, + * use n * number of features.
+ * If an integer value "n" in the range [1, num features] is set, + * use n features. * @param seed Random seed for bootstrapping and choosing feature subsets. */ private class RandomForest ( From 08feaaa716ecd2d6c229124dd2de6cf7d36e1ba4 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 6 Apr 2016 15:23:53 +0000 Subject: [PATCH 10/11] [SPARK-3724][MLLIB] RandomForest: More options for feature subset size. Consolidate test cases so that both Java and Scala are properly covered. --- .../JavaRandomForestClassifierSuite.java | 17 +++++---- .../JavaRandomForestRegressorSuite.java | 17 +++++---- .../ml/tree/impl/RandomForestSuite.scala | 36 +++++++++++-------- 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java index f9dddb91d8d6c..cc33ebae5d1c2 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java @@ -80,15 +80,14 @@ public void runDT() { for (String featureSubsetStrategy: RandomForestClassifier.supportedFeatureSubsetStrategies()) { rf.setFeatureSubsetStrategy(featureSubsetStrategy); } - rf.setFeatureSubsetStrategy(".1"); - rf.setFeatureSubsetStrategy(".10"); - rf.setFeatureSubsetStrategy("0.10"); - rf.setFeatureSubsetStrategy("0.1"); - rf.setFeatureSubsetStrategy("0.9"); - rf.setFeatureSubsetStrategy("1.0"); - rf.setFeatureSubsetStrategy("1"); - rf.setFeatureSubsetStrategy("100"); - rf.setFeatureSubsetStrategy("1000"); + String realStrategies[] = {".1", ".10", "0.10", "0.1", "0.9", "1.0"}; + for (String strategy: realStrategies) { + rf.setFeatureSubsetStrategy(strategy); + } + String integerStrategies[] = {"1", "10", "100", "1000", "10000"}; + for (String strategy: integerStrategies) { + rf.setFeatureSubsetStrategy(strategy);
+ } RandomForestClassificationModel model = rf.fit(dataFrame); model.transform(dataFrame); diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java index bfec9a2ead7ec..f3e02c2666a94 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java @@ -80,15 +80,14 @@ public void runDT() { for (String featureSubsetStrategy: RandomForestRegressor.supportedFeatureSubsetStrategies()) { rf.setFeatureSubsetStrategy(featureSubsetStrategy); } - rf.setFeatureSubsetStrategy(".1"); - rf.setFeatureSubsetStrategy(".10"); - rf.setFeatureSubsetStrategy("0.10"); - rf.setFeatureSubsetStrategy("0.1"); - rf.setFeatureSubsetStrategy("0.9"); - rf.setFeatureSubsetStrategy("1.0"); - rf.setFeatureSubsetStrategy("1"); - rf.setFeatureSubsetStrategy("100"); - rf.setFeatureSubsetStrategy("1000"); + String realStrategies[] = {".1", ".10", "0.10", "0.1", "0.9", "1.0"}; + for (String strategy: realStrategies) { + rf.setFeatureSubsetStrategy(strategy); + } + String integerStrategies[] = {"1", "10", "100", "1000", "10000"}; + for (String strategy: integerStrategies) { + rf.setFeatureSubsetStrategy(strategy); + } RandomForestRegressionModel model = rf.fit(dataFrame); model.transform(dataFrame); diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala index 6921953ffeae7..14a3daae21e04 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala @@ -422,13 +422,18 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { checkFeatureSubsetStrategy(numTrees = 1, "log2", (math.log(numFeatures) / 
math.log(2)).ceil.toInt) checkFeatureSubsetStrategy(numTrees = 1, "onethird", (numFeatures / 3.0).ceil.toInt) - checkFeatureSubsetStrategy(numTrees = 1, "0.1", (0.1 * numFeatures).ceil.toInt) - checkFeatureSubsetStrategy(numTrees = 1, "0.5", (0.5 * numFeatures).ceil.toInt) - checkFeatureSubsetStrategy(numTrees = 1, "1.0", (1.0 * numFeatures).ceil.toInt) - checkFeatureSubsetStrategy(numTrees = 1, "1", 1) - checkFeatureSubsetStrategy(numTrees = 1, "2", 2) - checkFeatureSubsetStrategy(numTrees = 1, numFeatures.toString, numFeatures) - checkFeatureSubsetStrategy(numTrees = 1, (numFeatures * 2).toString, numFeatures) + + val realStrategies = Array(".1", ".10", "0.10", "0.1", "0.9", "1.0") + for (strategy <- realStrategies) { + val expected = (strategy.toDouble * numFeatures).ceil.toInt + checkFeatureSubsetStrategy(numTrees = 1, strategy, expected) + } + + val integerStrategies = Array("1", "10", "100", "1000", "10000") + for (strategy <- integerStrategies) { + val expected = if (strategy.toInt < numFeatures) strategy.toInt else numFeatures + checkFeatureSubsetStrategy(numTrees = 1, strategy, expected) + } checkFeatureSubsetStrategy(numTrees = 2, "all", numFeatures) checkFeatureSubsetStrategy(numTrees = 2, "auto", math.sqrt(numFeatures).ceil.toInt) @@ -436,13 +441,16 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { checkFeatureSubsetStrategy(numTrees = 2, "log2", (math.log(numFeatures) / math.log(2)).ceil.toInt) checkFeatureSubsetStrategy(numTrees = 2, "onethird", (numFeatures / 3.0).ceil.toInt) - checkFeatureSubsetStrategy(numTrees = 2, "0.1", (0.1 * numFeatures).ceil.toInt) - checkFeatureSubsetStrategy(numTrees = 2, "0.5", (0.5 * numFeatures).ceil.toInt) - checkFeatureSubsetStrategy(numTrees = 2, "1.0", (1.0 * numFeatures).ceil.toInt) - checkFeatureSubsetStrategy(numTrees = 2, "1", 1) - checkFeatureSubsetStrategy(numTrees = 2, "2", 2) - checkFeatureSubsetStrategy(numTrees = 2, numFeatures.toString, numFeatures) - 
checkFeatureSubsetStrategy(numTrees = 2, (numFeatures * 2).toString, numFeatures) + + for (strategy <- realStrategies) { + val expected = (strategy.toDouble * numFeatures).ceil.toInt + checkFeatureSubsetStrategy(numTrees = 2, strategy, expected) + } + + for (strategy <- integerStrategies) { + val expected = if (strategy.toInt < numFeatures) strategy.toInt else numFeatures + checkFeatureSubsetStrategy(numTrees = 2, strategy, expected) + } } test("Binary classification with continuous features: subsampling features") { From 8a4c29804848ccbcf472ec330be04536013deb8b Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 8 Apr 2016 15:22:27 +0000 Subject: [PATCH 11/11] [SPARK-3724][MLLIB] RandomForest: More options for feature subset size. Add test cases to cover invalid strategies. --- .../JavaRandomForestClassifierSuite.java | 11 +++++++++++ .../regression/JavaRandomForestRegressorSuite.java | 11 +++++++++++ .../spark/ml/tree/impl/RandomForestSuite.scala | 14 ++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java index cc33ebae5d1c2..5aec52ac72b18 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java @@ -22,6 +22,7 @@ import java.util.Map; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -88,6 +89,16 @@ public void runDT() { for (String strategy: integerStrategies) { rf.setFeatureSubsetStrategy(strategy); } + String invalidStrategies[] = {"-.1", "-.10", "-0.10", ".0", "0.0", "1.1", "0"}; + for (String strategy: invalidStrategies) { + try { + rf.setFeatureSubsetStrategy(strategy); + Assert.fail("Expected exception to be thrown for invalid strategies"); + } catch (Exception e) { + 
Assert.assertTrue(e instanceof IllegalArgumentException); + } + } + RandomForestClassificationModel model = rf.fit(dataFrame); model.transform(dataFrame); diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java index f3e02c2666a94..a8736669f72e7 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java @@ -22,6 +22,7 @@ import java.util.Map; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -88,6 +89,16 @@ public void runDT() { for (String strategy: integerStrategies) { rf.setFeatureSubsetStrategy(strategy); } + String invalidStrategies[] = {"-.1", "-.10", "-0.10", ".0", "0.0", "1.1", "0"}; + for (String strategy: invalidStrategies) { + try { + rf.setFeatureSubsetStrategy(strategy); + Assert.fail("Expected exception to be thrown for invalid strategies"); + } catch (Exception e) { + Assert.assertTrue(e instanceof IllegalArgumentException); + } + } + RandomForestRegressionModel model = rf.fit(dataFrame); model.transform(dataFrame); diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala index 14a3daae21e04..c0b877eafc4ca 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala @@ -435,6 +435,14 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { checkFeatureSubsetStrategy(numTrees = 1, strategy, expected) } + val invalidStrategies = Array("-.1", "-.10", "-0.10", ".0", "0.0", "1.1", "0") + for (invalidStrategy <- invalidStrategies) { + intercept[MatchError]{ + val metadata = + 
DecisionTreeMetadata.buildMetadata(rdd, strategy, numTrees = 1, invalidStrategy) + } + } + checkFeatureSubsetStrategy(numTrees = 2, "all", numFeatures) checkFeatureSubsetStrategy(numTrees = 2, "auto", math.sqrt(numFeatures).ceil.toInt) checkFeatureSubsetStrategy(numTrees = 2, "sqrt", math.sqrt(numFeatures).ceil.toInt) @@ -451,6 +459,12 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { val expected = if (strategy.toInt < numFeatures) strategy.toInt else numFeatures checkFeatureSubsetStrategy(numTrees = 2, strategy, expected) } + for (invalidStrategy <- invalidStrategies) { + intercept[MatchError]{ + val metadata = + DecisionTreeMetadata.buildMetadata(rdd, strategy, numTrees = 2, invalidStrategy) + } + } } test("Binary classification with continuous features: subsampling features") {