From 86dde8ffa1f8e0824b4832047f3d86690c0edfb9 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Fri, 25 Aug 2017 17:08:48 +0800 Subject: [PATCH 1/4] init pr --- .../MultilayerPerceptronClassifierSuite.scala | 2 + .../ProbabilisticClassifierSuite.scala | 55 ++++++++++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala index c294e4ad54bf7..58695dd212581 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala @@ -104,6 +104,8 @@ class MultilayerPerceptronClassifierSuite case Row(p: Vector, e: Vector) => assert(p ~== e absTol 1e-3) } + ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + Vector, MultilayerPerceptronClassificationModel](model, strongDataset) } test("test model probability") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala index 172c64aab9d3d..40d10d5f19bbd 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala @@ -18,7 +18,10 @@ package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.sql.{DataFrame, Row} final class TestProbabilisticClassificationModel( override val uid: String, @@ -91,4 +94,54 @@ object ProbabilisticClassifierSuite { "thresholds" -> Array(0.4, 0.6) ) + def probabilisticClassifierGenericTest[ + FeaturesType, + M <: ProbabilisticClassificationModel[FeaturesType, M]]( + model: M, testData: DataFrame): Unit = { + + val allColModel = model.copy(ParamMap.empty) + .setRawPredictionCol("rawPredictionAll") + .setProbabilityCol("probabilityAll") + .setPredictionCol("predictionAll") + val allColResult = allColModel.transform(testData) + + for (rawPredictionCol <- Seq("", "rawPredictionSingle")) { + for (probabilityCol <- Seq("", "probabilitySingle")) { + for (predictionCol <- Seq("", "predictionSingle")) { + val newModel = model.copy(ParamMap.empty) + .setRawPredictionCol(rawPredictionCol) + .setProbabilityCol(probabilityCol) + .setPredictionCol(predictionCol) + + val result = newModel.transform(allColResult) + + import org.apache.spark.sql.functions._ + + val resultRawPredictionCol = + if (rawPredictionCol.isEmpty) col("rawPredictionAll") else col(rawPredictionCol) + val resultProbabilityCol = + if (probabilityCol.isEmpty) col("probabilityAll") else col(probabilityCol) + val resultPredictionCol = + if (predictionCol.isEmpty) col("predictionAll") else col(predictionCol) + + result.select( + resultRawPredictionCol, col("rawPredictionAll"), + resultProbabilityCol, col("probabilityAll"), + resultPredictionCol, col("predictionAll") + ).collect().foreach { + case Row( + rawPredictionSingle: Vector, rawPredictionAll: Vector, + probabilitySingle: Vector, probabilityAll: Vector, + predictionSingle: Double, predictionAll: Double + ) => { + assert(rawPredictionSingle.asInstanceOf[Vector] ~== rawPredictionAll relTol 1E-3) + assert(probabilitySingle.asInstanceOf[Vector] ~== probabilityAll relTol 1E-3) + assert(predictionSingle.asInstanceOf[Double] ~== predictionAll relTol 1E-3) + } + } + } + } + } + } + } From fca5257df1c0cdf763bb5737a16046afdfd7277e Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Mon, 28 Aug 2017 14:25:33 +0800 Subject: [PATCH 2/4] update tests --- .../ml/classification/DecisionTreeClassifierSuite.scala | 3 +++ .../apache/spark/ml/classification/GBTClassifierSuite.scala | 3 +++ .../spark/ml/classification/LogisticRegressionSuite.scala | 6 ++++++ .../apache/spark/ml/classification/NaiveBayesSuite.scala | 6 ++++++ .../ml/classification/ProbabilisticClassifierSuite.scala | 4 ++-- .../ml/classification/RandomForestClassifierSuite.scala | 2 ++ 6 files changed, 22 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala index 918ab27e2730b..cb5ea7082c1ac 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala @@ -262,6 +262,9 @@ class DecisionTreeClassifierSuite assert(Vectors.dense(rawPred.toArray.map(_ / sum)) === probPred, "probability prediction mismatch") } + + ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + Vector, DecisionTreeClassificationModel](newTree, newData) } test("training with 1-category categorical feature") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala index 1f79e0d4e6228..88b1566e27891 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -219,6 +219,9 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext resultsUsingPredict.zip(results.select(predictionCol).as[Double].collect()).foreach { case (pred1, pred2) => assert(pred1 === pred2) } + + ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + Vector, GBTClassificationModel](gbtModel, validationDataset) } test("GBT parameter stepSize should be in interval (0, 1]") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 6649fa402527d..b7aa5768d5167 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -496,6 +496,9 @@ class LogisticRegressionSuite resultsUsingPredict.zip(results.select("prediction").as[Double].collect()).foreach { case (pred1, pred2) => assert(pred1 === pred2) } + + ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + Vector, LogisticRegressionModel](model, smallMultinomialDataset) } test("binary logistic regression: Predictor, Classifier methods") { @@ -550,6 +553,9 @@ class LogisticRegressionSuite resultsUsingPredict.zip(results.select("prediction").as[Double].collect()).foreach { case (pred1, pred2) => assert(pred1 === pred2) } + + ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + Vector, LogisticRegressionModel](model, smallBinaryDataset) } test("coefficients and intercept methods") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala index 3a2be236f1257..efd064c4e1958 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala @@ -160,6 +160,9 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa val featureAndProbabilities = model.transform(validationDataset) .select("features", "probability") validateProbabilities(featureAndProbabilities, model, "multinomial") + + ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + Vector, NaiveBayesModel](model, testDataset) } test("Naive Bayes with weighted samples") { @@ -213,6 +216,9 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa val featureAndProbabilities = model.transform(validationDataset) .select("features", "probability") validateProbabilities(featureAndProbabilities, model, "bernoulli") + + ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + Vector, NaiveBayesModel](model, testDataset) } test("detect negative values") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala index 40d10d5f19bbd..74c7024bfe865 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.{DataFrame, Dataset, Row} final class TestProbabilisticClassificationModel( override val uid: String, @@ -97,7 +97,7 @@ object ProbabilisticClassifierSuite { def probabilisticClassifierGenericTest[ FeaturesType, M <: ProbabilisticClassificationModel[FeaturesType, M]]( - model: M, testData: DataFrame): Unit = { + model: M, testData: Dataset[_]): Unit = { val allColModel = model.copy(ParamMap.empty) .setRawPredictionCol("rawPredictionAll") diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala index ca2954d2f32c4..9e9db68095c65 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala @@ -155,6 +155,8 @@ class RandomForestClassifierSuite "probability prediction mismatch") assert(probPred.toArray.sum ~== 1.0 relTol 1E-5) } + ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + Vector, RandomForestClassificationModel](model, df) } test("Fitting without numClasses in metadata") { From d7bc994596693cec45b73cc49b6586da2a8cfffc Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Tue, 29 Aug 2017 21:02:53 +0800 Subject: [PATCH 3/4] update --- .../DecisionTreeClassifierSuite.scala | 2 +- .../ml/classification/GBTClassifierSuite.scala | 2 +- .../LogisticRegressionSuite.scala | 4 ++-- .../MultilayerPerceptronClassifierSuite.scala | 2 +- .../ml/classification/NaiveBayesSuite.scala | 4 ++-- .../ProbabilisticClassifierSuite.scala | 18 ++++++++++++------ .../RandomForestClassifierSuite.scala | 2 +- 7 files changed, 20 insertions(+), 14 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala index cb5ea7082c1ac..98c879ece62d6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala @@ -263,7 +263,7 @@ class DecisionTreeClassifierSuite "probability prediction mismatch") } - ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + ProbabilisticClassifierSuite.testPredictMethods[ Vector, DecisionTreeClassificationModel](newTree, newData) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala index 88b1566e27891..8000143d4d142 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -220,7 +220,7 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext case (pred1, pred2) => assert(pred1 === pred2) } - ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + ProbabilisticClassifierSuite.testPredictMethods[ Vector, GBTClassificationModel](gbtModel, validationDataset) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index b7aa5768d5167..46bf1b55ae40c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -497,7 +497,7 @@ class LogisticRegressionSuite case (pred1, pred2) => assert(pred1 === pred2) } - ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + ProbabilisticClassifierSuite.testPredictMethods[ Vector, LogisticRegressionModel](model, smallMultinomialDataset) } @@ -554,7 +554,7 @@ class LogisticRegressionSuite case (pred1, pred2) => assert(pred1 === pred2) } - ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + ProbabilisticClassifierSuite.testPredictMethods[ Vector, LogisticRegressionModel](model, smallBinaryDataset) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala index 58695dd212581..d3141ec708560 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala @@ -104,7 +104,7 @@ class MultilayerPerceptronClassifierSuite case Row(p: Vector, e: Vector) => assert(p ~== e absTol 1e-3) } - ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + ProbabilisticClassifierSuite.testPredictMethods[ Vector, MultilayerPerceptronClassificationModel](model, strongDataset) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala index efd064c4e1958..9730dd68a3b27 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala @@ -161,7 +161,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .select("features", "probability") validateProbabilities(featureAndProbabilities, model, "multinomial") - ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + ProbabilisticClassifierSuite.testPredictMethods[ Vector, NaiveBayesModel](model, testDataset) } @@ -217,7 +217,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .select("features", "probability") validateProbabilities(featureAndProbabilities, model, "bernoulli") - ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + ProbabilisticClassifierSuite.testPredictMethods[ Vector, NaiveBayesModel](model, testDataset) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala index 74c7024bfe865..f4a6b827a69ea 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala @@ -18,10 +18,10 @@ package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{Dataset, Row} final class TestProbabilisticClassificationModel( override val uid: String, @@ -94,7 +94,13 @@ object ProbabilisticClassifierSuite { "thresholds" -> Array(0.4, 0.6) ) - def probabilisticClassifierGenericTest[ + /** + * Add test for prediction using the model with all combinations of + * output columns (rawPrediction/probability/prediction) turned on/off. + * Make sure the output column values match, presumably by comparing vs. + * the case with all 3 output columns turned on. + */ + def testPredictMethods[ FeaturesType, M <: ProbabilisticClassificationModel[FeaturesType, M]]( model: M, testData: Dataset[_]): Unit = { @@ -134,9 +140,9 @@ object ProbabilisticClassifierSuite { probabilitySingle: Vector, probabilityAll: Vector, predictionSingle: Double, predictionAll: Double ) => { - assert(rawPredictionSingle.asInstanceOf[Vector] ~== rawPredictionAll relTol 1E-3) - assert(probabilitySingle.asInstanceOf[Vector] ~== probabilityAll relTol 1E-3) - assert(predictionSingle.asInstanceOf[Double] ~== predictionAll relTol 1E-3) + assert(rawPredictionSingle ~== rawPredictionAll relTol 1E-3) + assert(probabilitySingle ~== probabilityAll relTol 1E-3) + assert(predictionSingle ~== predictionAll relTol 1E-3) } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala index 9e9db68095c65..2cca2e6c04698 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala @@ -155,7 +155,7 @@ class RandomForestClassifierSuite "probability prediction mismatch") assert(probPred.toArray.sum ~== 1.0 relTol 1E-5) } - ProbabilisticClassifierSuite.probabilisticClassifierGenericTest[ + ProbabilisticClassifierSuite.testPredictMethods[ Vector, RandomForestClassificationModel](model, df) } From f13cd73926e80173228637da2015c7d6e7a0e848 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Fri, 1 Sep 2017 00:07:12 +0800 Subject: [PATCH 4/4] update comments --- .../ml/classification/ProbabilisticClassifierSuite.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala index f4a6b827a69ea..4ecd5a05365eb 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala @@ -95,10 +95,11 @@ object ProbabilisticClassifierSuite { ) /** - * Add test for prediction using the model with all combinations of - * output columns (rawPrediction/probability/prediction) turned on/off. - * Make sure the output column values match, presumably by comparing vs. - * the case with all 3 output columns turned on. + * Helper for testing that a ProbabilisticClassificationModel computes + * the same predictions across all combinations of output columns + * (rawPrediction/probability/prediction) turned on/off. Makes sure the + * output column values match by comparing vs. the case with all 3 output + * columns turned on. */ def testPredictMethods[ FeaturesType,