From 158ceedaaae83ccd224e1de8a182688bf1a7b111 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 30 Sep 2016 21:19:05 +0800 Subject: [PATCH 1/2] create pr --- .../ml/classification/NaiveBayesSuite.scala | 67 ++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala index 597428d036c7a..3157a75909521 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala @@ -22,7 +22,7 @@ import scala.util.Random import breeze.linalg.{DenseVector => BDV, Vector => BV} import breeze.stats.distributions.{Multinomial => BrzMultinomial} -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.classification.NaiveBayes.{Bernoulli, Multinomial} import org.apache.spark.ml.classification.NaiveBayesSuite._ import org.apache.spark.ml.feature.{Instance, LabeledPoint} @@ -106,6 +106,11 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa } } + test("model types") { + assert(Multinomial === "multinomial") + assert(Bernoulli === "bernoulli") + } + test("params") { ParamsSuite.checkParams(new NaiveBayes) val model = new NaiveBayesModel("nb", pi = Vectors.dense(Array(0.2, 0.8)), @@ -228,6 +233,66 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa validateProbabilities(featureAndProbabilities, model, "bernoulli") } + test("detect negative values") { + val dense = spark.createDataFrame(Seq( + LabeledPoint(1.0, Vectors.dense(1.0)), + LabeledPoint(0.0, Vectors.dense(-1.0)), + LabeledPoint(1.0, Vectors.dense(1.0)), + LabeledPoint(1.0, Vectors.dense(0.0)))) + intercept[SparkException] { + new NaiveBayes().fit(dense) + } + val sparse = spark.createDataFrame(Seq( + LabeledPoint(1.0, Vectors.sparse(1, Array(0), Array(1.0))), + LabeledPoint(0.0, Vectors.sparse(1, Array(0), Array(-1.0))), + LabeledPoint(1.0, Vectors.sparse(1, Array(0), Array(1.0))), + LabeledPoint(1.0, Vectors.sparse(1, Array.empty, Array.empty)))) + intercept[SparkException] { + new NaiveBayes().fit(sparse) + } + val nan = spark.createDataFrame(Seq( + LabeledPoint(1.0, Vectors.sparse(1, Array(0), Array(1.0))), + LabeledPoint(0.0, Vectors.sparse(1, Array(0), Array(Double.NaN))), + LabeledPoint(1.0, Vectors.sparse(1, Array(0), Array(1.0))), + LabeledPoint(1.0, Vectors.sparse(1, Array.empty, Array.empty)))) + intercept[SparkException] { + new NaiveBayes().fit(nan) + } + } + + test("detect non zero or one values in Bernoulli") { + val badTrain = spark.createDataFrame(Seq( + LabeledPoint(1.0, Vectors.dense(1.0)), + LabeledPoint(0.0, Vectors.dense(2.0)), + LabeledPoint(1.0, Vectors.dense(1.0)), + LabeledPoint(1.0, Vectors.dense(0.0)))) + + intercept[SparkException] { + new NaiveBayes().setModelType(Bernoulli).setSmoothing(1.0).fit(badTrain) + } + + val okTrain = spark.createDataFrame(Seq( + LabeledPoint(1.0, Vectors.dense(1.0)), + LabeledPoint(0.0, Vectors.dense(0.0)), + LabeledPoint(1.0, Vectors.dense(1.0)), + LabeledPoint(1.0, Vectors.dense(1.0)), + LabeledPoint(0.0, Vectors.dense(0.0)), + LabeledPoint(1.0, Vectors.dense(1.0)), + LabeledPoint(1.0, Vectors.dense(1.0)))) + + val model = new NaiveBayes().setModelType(Bernoulli).setSmoothing(1.0).fit(okTrain) + + val badPredict = spark.createDataFrame(Seq( + LabeledPoint(1.0, Vectors.dense(1.0)), + LabeledPoint(1.0, Vectors.dense(2.0)), + LabeledPoint(1.0, Vectors.dense(1.0)), + LabeledPoint(1.0, Vectors.dense(0.0)))) + + intercept[SparkException] { + model.transform(badPredict).collect() + } + } + test("read/write") { def checkModelData(model: NaiveBayesModel, model2: NaiveBayesModel): Unit = { assert(model.pi === model2.pi) From 618def9a953c0c3e240c716549f86ce42877c1da Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 30 Sep 2016 21:33:07 +0800 Subject: [PATCH 2/2] remove some unused imports --- .../scala/org/apache/spark/ml/feature/LabeledPoint.scala | 2 +- .../org/apache/spark/ml/feature/QuantileDiscretizer.scala | 2 +- .../src/main/scala/org/apache/spark/ml/python/MLSerDe.scala | 5 ----- .../scala/org/apache/spark/ml/regression/GBTRegressor.scala | 2 +- .../org/apache/spark/ml/regression/LinearRegression.scala | 1 - .../org/apache/spark/ml/classification/NaiveBayesSuite.scala | 2 +- python/pyspark/ml/classification.py | 1 - 7 files changed, 4 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LabeledPoint.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LabeledPoint.scala index 6cefa7086c881..7d8e4adcc2259 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LabeledPoint.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LabeledPoint.scala @@ -25,7 +25,7 @@ import org.apache.spark.ml.linalg.Vector /** * :: Experimental :: * - * Class that represents the features and labels of a data point. + * Class that represents the features and label of a data point. * * @param label Label for this data point. * @param features List of features for this data point. diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index 1e59d71a70955..05e034d90f6a3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -25,7 +25,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql.Dataset -import org.apache.spark.sql.types.{DoubleType, StructType} +import org.apache.spark.sql.types.StructType /** * Params for [[QuantileDiscretizer]]. diff --git a/mllib/src/main/scala/org/apache/spark/ml/python/MLSerDe.scala b/mllib/src/main/scala/org/apache/spark/ml/python/MLSerDe.scala index 4b805e145482a..da62f8518e363 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/python/MLSerDe.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/python/MLSerDe.scala @@ -19,17 +19,12 @@ package org.apache.spark.ml.python import java.io.OutputStream import java.nio.{ByteBuffer, ByteOrder} -import java.util.{ArrayList => JArrayList} - -import scala.collection.JavaConverters._ import net.razorvine.pickle._ -import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.python.SerDeUtil import org.apache.spark.ml.linalg._ import org.apache.spark.mllib.api.python.SerDeBase -import org.apache.spark.rdd.RDD /** * SerDe utility functions for pyspark.ml. diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index ce355938ec1c7..bb01f9d5a364c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -21,7 +21,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.json4s.{DefaultFormats, JObject} import org.json4s.JsonDSL._ -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.feature.LabeledPoint diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 7fddfd9b10f84..536c58f998080 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -37,7 +37,6 @@ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.evaluation.RegressionMetrics -import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala index 3157a75909521..e934e5ea42b16 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala @@ -25,7 +25,7 @@ import breeze.stats.distributions.{Multinomial => BrzMultinomial} import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.classification.NaiveBayes.{Bernoulli, Multinomial} import org.apache.spark.ml.classification.NaiveBayesSuite._ -import org.apache.spark.ml.feature.{Instance, LabeledPoint} +import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 505e7bffd1763..ea60fab029582 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -16,7 +16,6 @@ # import operator -import warnings from pyspark import since, keyword_only from pyspark.ml import Estimator, Model