From 7ee0837d04b8cddf014c3f4e3f4cf831d2c1e48a Mon Sep 17 00:00:00 2001 From: MechCoder Date: Wed, 29 Jun 2016 15:06:54 -0700 Subject: [PATCH 1/6] [SPARK-16307] [ML] Add test to verify the predicted variances of a DT on toy data --- .../DecisionTreeRegressorSuite.scala | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala index 9afb742406ec8..c5f49150cfe63 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala @@ -19,9 +19,10 @@ package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.LabeledPoint -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.tree.impl.TreeTests import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree, DecisionTreeSuite => OldDecisionTreeSuite} @@ -36,10 +37,21 @@ class DecisionTreeRegressorSuite private var categoricalDataPointsRDD: RDD[LabeledPoint] = _ + private var toyData: RDD[LabeledPoint] = _ + override def beforeAll() { super.beforeAll() + categoricalDataPointsRDD = sc.parallelize(OldDecisionTreeSuite.generateCategoricalDataPoints().map(_.asML)) + toyData = sc.parallelize(Seq( + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(2.0, Vectors.dense(Array(1.0))), + LabeledPoint(3.0, Vectors.dense(Array(2.0))), + LabeledPoint(10.0, Vectors.dense(Array(3.0))), + LabeledPoint(12.0, Vectors.dense(Array(4.0))), + LabeledPoint(14.0, Vectors.dense(Array(5.0)))) + ) } 
///////////////////////////////////////////////////////////////////////////// @@ -96,6 +108,15 @@ class DecisionTreeRegressorSuite assert(variance === expectedVariance, s"Expected variance $expectedVariance but got $variance.") } + + val toyDF = TreeTests.setMetadata(toyData, Map.empty[Int, Int], 0) + dt.setMaxDepth(1) + .setMaxBins(6) + .setSeed(0) + val expectVariances = dt.fit(toyDF).transform(toyDF).select("variance").collect().map { + case Row(variance: Double) => variance } + val trueVariances = Array(0.667, 0.667, 0.667, 2.667, 2.667, 2.667) + trueVariances.zip(expectVariances).foreach(x => x._1 ~== x._2 absTol 1e-3) } test("Feature importance with toy data") { From 04bdb3a62fea614bbffd030e1fb9cdda3fd11cfa Mon Sep 17 00:00:00 2001 From: MechCoder Date: Fri, 1 Jul 2016 11:15:52 -0700 Subject: [PATCH 2/6] cosmetics --- .../ml/regression/DecisionTreeRegressorSuite.scala | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala index c5f49150cfe63..79b49281ae634 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala @@ -113,10 +113,17 @@ class DecisionTreeRegressorSuite dt.setMaxDepth(1) .setMaxBins(6) .setSeed(0) - val expectVariances = dt.fit(toyDF).transform(toyDF).select("variance").collect().map { + val calculatedVariances = dt.fit(toyDF).transform(toyDF).select("variance").collect().map { case Row(variance: Double) => variance } - val trueVariances = Array(0.667, 0.667, 0.667, 2.667, 2.667, 2.667) - trueVariances.zip(expectVariances).foreach(x => x._1 ~== x._2 absTol 1e-3) + + // Since max depth is set to 1, the best split point is that which splits the data + // into (1.0, 2.0, 3.0) and (10.0, 12.0, 14.0).
The predicted variance for each + // data point in the left node is 0.667 and for each data point in the right node + // is 2.667 + val expectedVariances = Array(0.667, 0.667, 0.667, 2.667, 2.667, 2.667) + calculatedVariances.zip(expectedVariances).foreach { case (actual, expected) => + assert(actual ~== expected absTol 1e-3) + } } test("Feature importance with toy data") { From 9eed4003dd25c3915b73384259c63678910508ac Mon Sep 17 00:00:00 2001 From: MechCoder Date: Fri, 1 Jul 2016 15:41:46 -0700 Subject: [PATCH 3/6] cosmetic --- .../spark/ml/regression/DecisionTreeRegressorSuite.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala index 79b49281ae634..bbf7306e6e9cc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala @@ -113,8 +113,10 @@ class DecisionTreeRegressorSuite dt.setMaxDepth(1) .setMaxBins(6) .setSeed(0) - val calculatedVariances = dt.fit(toyDF).transform(toyDF).select("variance").collect().map { - case Row(variance: Double) => variance } + val transformToyDF = dt.fit(toyDF).transform(toyDF) + val calculatedVariances = transformToyDF.select(dt.getVarianceCol).collect().map { + case Row(variance: Double) => variance + } // Since max depth is set to 1, the best split point is that which splits the data // into (1.0, 2.0, 3.0) and (10.0, 12.0, 14.0).
The predicted variance for each From 72b4ec40d27da48f0197885a4ea72363502449b6 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 5 Jul 2016 10:12:46 -0700 Subject: [PATCH 4/6] Move varianceData to treeTests --- .../regression/DecisionTreeRegressorSuite.scala | 16 +++++----------- .../apache/spark/ml/tree/impl/TreeTests.scala | 12 ++++++++++++ 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala index bbf7306e6e9cc..471047d8e7285 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala @@ -44,14 +44,6 @@ class DecisionTreeRegressorSuite categoricalDataPointsRDD = sc.parallelize(OldDecisionTreeSuite.generateCategoricalDataPoints().map(_.asML)) - toyData = sc.parallelize(Seq( - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(2.0, Vectors.dense(Array(1.0))), - LabeledPoint(3.0, Vectors.dense(Array(2.0))), - LabeledPoint(10.0, Vectors.dense(Array(3.0))), - LabeledPoint(12.0, Vectors.dense(Array(4.0))), - LabeledPoint(14.0, Vectors.dense(Array(5.0)))) - ) } ///////////////////////////////////////////////////////////////////////////// @@ -109,12 +101,14 @@ class DecisionTreeRegressorSuite s"Expected variance $expectedVariance but got $variance.") } - val toyDF = TreeTests.setMetadata(toyData, Map.empty[Int, Int], 0) + + val varianceData: RDD[LabeledPoint] = TreeTests.varianceData(sc) + val varianceDF = TreeTests.setMetadata(varianceData, Map.empty[Int, Int], 0) dt.setMaxDepth(1) .setMaxBins(6) .setSeed(0) - val transformToyDF = dt.fit(toyDF).transform(toyDF) - val calculatedVariances = transformToyDF.select(dt.getVarianceCol).collect().map { + val transformVarDF = dt.fit(varianceDF).transform(varianceDF) + val calculatedVariances = 
transformVarDF.select(dt.getVarianceCol).collect().map { case Row(variance: Double) => variance } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala index d2fa8d0d6335d..c90cb8ca1034c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala @@ -182,6 +182,18 @@ private[ml] object TreeTests extends SparkFunSuite { new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)) )) + /** + * Create some toy data for testing correctness of variance. + */ + def varianceData(sc: SparkContext): RDD[LabeledPoint] = sc.parallelize(Seq( + new LabeledPoint(1.0, Vectors.dense(Array(0.0))), + new LabeledPoint(2.0, Vectors.dense(Array(1.0))), + new LabeledPoint(3.0, Vectors.dense(Array(2.0))), + new LabeledPoint(10.0, Vectors.dense(Array(3.0))), + new LabeledPoint(12.0, Vectors.dense(Array(4.0))), + new LabeledPoint(14.0, Vectors.dense(Array(5.0))) + )) + /** * Mapping from all Params to valid settings which differ from the defaults. * This is useful for tests which need to exercise all Params, such as save/load. 
From 4ca2d37168b94d16fb328ceb4319be9a0bb323d5 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 5 Jul 2016 10:17:14 -0700 Subject: [PATCH 5/6] remove unused import --- .../apache/spark/ml/regression/DecisionTreeRegressorSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala index 471047d8e7285..1a3eed1895802 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.LabeledPoint -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.tree.impl.TreeTests import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ From 856e3e8a81995a0c4a733d9fbebdc2f5d9993180 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 5 Jul 2016 10:20:38 -0700 Subject: [PATCH 6/6] cosmit --- .../spark/ml/regression/DecisionTreeRegressorSuite.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala index 1a3eed1895802..15fa26e8b5272 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala @@ -37,11 +37,8 @@ class DecisionTreeRegressorSuite private var categoricalDataPointsRDD: RDD[LabeledPoint] = _ - private var toyData: RDD[LabeledPoint] = _ - override def beforeAll() { super.beforeAll() - categoricalDataPointsRDD = 
sc.parallelize(OldDecisionTreeSuite.generateCategoricalDataPoints().map(_.asML)) } @@ -101,7 +98,6 @@ class DecisionTreeRegressorSuite s"Expected variance $expectedVariance but got $variance.") } - val varianceData: RDD[LabeledPoint] = TreeTests.varianceData(sc) val varianceDF = TreeTests.setMetadata(varianceData, Map.empty[Int, Int], 0) dt.setMaxDepth(1)