From eecb4d73e1404aa5c2bb744d6a80e65cb85d2f6c Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Mon, 14 Nov 2016 18:13:58 +0800
Subject: [PATCH 1/4] create pr

---
 .../org/apache/spark/mllib/feature/ChiSqSelector.scala      | 6 ++++--
 .../scala/org/apache/spark/mllib/feature/HashingTF.scala    | 2 ++
 .../src/main/scala/org/apache/spark/mllib/feature/IDF.scala | 1 +
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index f9156b642785f..501401e0fe5ee 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -192,25 +192,27 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
   @Since("1.3.0")
   def this(numTopFeatures: Int) {
     this()
+    require(numTopFeatures > 0, s"numTopFeatures must be positive but got $numTopFeatures")
     this.numTopFeatures = numTopFeatures
   }
 
   @Since("1.6.0")
   def setNumTopFeatures(value: Int): this.type = {
+    require(value > 0, s"numTopFeatures must be positive but got $value")
     numTopFeatures = value
     this
   }
 
   @Since("2.1.0")
   def setPercentile(value: Double): this.type = {
-    require(0.0 <= value && value <= 1.0, "Percentile must be in [0,1]")
+    require(0.0 <= value && value <= 1.0, s"Percentile must be in [0,1] but got $value")
     percentile = value
     this
   }
 
   @Since("2.1.0")
   def setFpr(value: Double): this.type = {
-    require(0.0 <= value && value <= 1.0, "FPR must be in [0,1]")
+    require(0.0 <= value && value <= 1.0, s"FPR must be in [0,1] but got $value")
     fpr = value
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
index bc26655104a9b..3c69fb41c834b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
@@ -38,6 +38,7 @@ import org.apache.spark.util.Utils
  */
 @Since("1.1.0")
 class HashingTF(val numFeatures: Int) extends Serializable {
+  require(numFeatures > 0, s"numFeatures must be positive but got $numFeatures")
 
   import HashingTF._
 
@@ -65,6 +66,7 @@ class HashingTF(val numFeatures: Int) extends Serializable {
    */
   @Since("2.0.0")
   def setHashAlgorithm(value: String): this.type = {
+    require(Array(Murmur3, Native).contains(value), s"hashAlgorithm: $value was not supported.")
     hashAlgorithm = value
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index bb4b37ef21a84..15da4cc741e24 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -38,6 +38,7 @@ import org.apache.spark.rdd.RDD
  */
 @Since("1.1.0")
 class IDF @Since("1.2.0") (@Since("1.2.0") val minDocFreq: Int) {
+  require(minDocFreq > 0, s"minDocFreq must be positive but got $minDocFreq")
 
   @Since("1.1.0")
   def this() = this(0)
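Reviewer sketch, not part of the patch: how the eager require() checks added in PATCH 1/4 surface to callers. Scala's require throws java.lang.IllegalArgumentException with a "requirement failed: " prefix, so the bad value is reported at construction time:

import org.apache.spark.mllib.feature.HashingTF

val tf = new HashingTF(1 << 20)   // fine: numFeatures is positive
try {
  new HashingTF(0)                // the new require(numFeatures > 0, ...) fires here
} catch {
  case e: IllegalArgumentException =>
    println(e.getMessage)         // "requirement failed: numFeatures must be positive but got 0"
}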
From 5ff61e6e53d403937354ca1f5029538b2db65603 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Mon, 14 Nov 2016 19:24:36 +0800
Subject: [PATCH 2/4] create pr

---
 .../scala/org/apache/spark/ml/feature/IDF.scala      |  3 ++-
 .../scala/org/apache/spark/ml/feature/PCA.scala      |  3 ++-
 .../org/apache/spark/ml/feature/Word2Vec.scala       | 13 ++++++++-----
 .../spark/ml/regression/IsotonicRegression.scala     |  3 ++-
 .../spark/ml/regression/LinearRegression.scala       |  5 ++++-
 .../scala/org/apache/spark/ml/tree/treeParams.scala  |  4 +++-
 6 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
index 6386dd8a10801..46a0730f5ddb8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
@@ -44,7 +44,8 @@ private[feature] trait IDFBase extends Params with HasInputCol with HasOutputCol
    * @group param
    */
   final val minDocFreq = new IntParam(
-    this, "minDocFreq", "minimum number of documents in which a term should appear for filtering")
+    this, "minDocFreq", "minimum number of documents in which a term should appear for filtering" +
+    " (>= 0)", ParamValidators.gtEq(0))
 
   setDefault(minDocFreq -> 0)
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
index 6b913480fdc28..444006fe1edb6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
@@ -44,7 +44,8 @@ private[feature] trait PCAParams extends Params with HasInputCol with HasOutputC
    * The number of principal components.
    * @group param
    */
-  final val k: IntParam = new IntParam(this, "k", "the number of principal components")
+  final val k: IntParam = new IntParam(this, "k", "the number of principal components (> 0)",
+    ParamValidators.gt(0))
 
   /** @group getParam */
   def getK: Int = $(k)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index d53f3df514dff..3ed08c983d561 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -43,7 +43,8 @@ private[feature] trait Word2VecBase extends Params
    * @group param
    */
   final val vectorSize = new IntParam(
-    this, "vectorSize", "the dimension of codes after transforming from words")
+    this, "vectorSize", "the dimension of codes after transforming from words (> 0)",
+    ParamValidators.gt(0))
   setDefault(vectorSize -> 100)
 
   /** @group getParam */
@@ -55,7 +56,8 @@ private[feature] trait Word2VecBase extends Params
    * @group expertParam
    */
   final val windowSize = new IntParam(
-    this, "windowSize", "the window size (context words from [-window, window])")
+    this, "windowSize", "the window size (context words from [-window, window]) (> 0)",
+    ParamValidators.gt(0))
   setDefault(windowSize -> 5)
 
   /** @group expertGetParam */
@@ -67,7 +69,8 @@ private[feature] trait Word2VecBase extends Params
    * @group param
    */
   final val numPartitions = new IntParam(
-    this, "numPartitions", "number of partitions for sentences of words")
+    this, "numPartitions", "number of partitions for sentences of words (> 0)",
+    ParamValidators.gt(0))
   setDefault(numPartitions -> 1)
 
   /** @group getParam */
@@ -80,7 +83,7 @@ private[feature] trait Word2VecBase extends Params
    * @group param
    */
   final val minCount = new IntParam(this, "minCount", "the minimum number of times a token must " +
-    "appear to be included in the word2vec model's vocabulary")
+    "appear to be included in the word2vec model's vocabulary (>= 0)", ParamValidators.gtEq(0))
   setDefault(minCount -> 5)
 
   /** @group getParam */
@@ -95,7 +98,7 @@ private[feature] trait Word2VecBase extends Params
    */
   final val maxSentenceLength = new IntParam(this, "maxSentenceLength", "Maximum length " +
     "(in words) of each sentence in the input data. Any sentence longer than this threshold will " +
-    "be divided into chunks up to the size.")
+    "be divided into chunks up to the size (> 0)", ParamValidators.gt(0))
   setDefault(maxSentenceLength -> 1000)
 
   /** @group getParam */
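Reviewer sketch, not part of the patch: the ml-side params above attach ParamValidators instead of ad-hoc require() calls, so a bad value is rejected as soon as it is set rather than failing later. A self-contained illustration using only the public org.apache.spark.ml.param API; the Demo class is hypothetical:

import org.apache.spark.ml.param.{IntParam, ParamMap, Params, ParamValidators}
import org.apache.spark.ml.util.Identifiable

// Hypothetical params holder, only to show a validator firing on set().
class Demo(override val uid: String) extends Params {
  def this() = this(Identifiable.randomUID("demo"))
  final val k = new IntParam(this, "k", "number of components (> 0)", ParamValidators.gt(0))
  def setK(value: Int): this.type = set(k, value)
  override def copy(extra: ParamMap): Demo = defaultCopy(extra)
}

new Demo().setK(3)   // ok
new Demo().setK(0)   // IllegalArgumentException: ... parameter k given invalid value 0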
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
index cd7b4f2a9c56e..4d274f3a5bbf1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
@@ -61,7 +61,8 @@ private[regression] trait IsotonicRegressionBase extends Params with HasFeatures
    * @group param
    */
   final val featureIndex: IntParam = new IntParam(this, "featureIndex",
-    "The index of the feature if featuresCol is a vector column, no effect otherwise.")
+    "The index of the feature if featuresCol is a vector column, no effect otherwise (>= 0)",
+    ParamValidators.gtEq(0))
 
   /** @group getParam */
   final def getFeatureIndex: Int = $(featureIndex)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 9639b07496c13..65954e3ea2f9b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -171,7 +171,10 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
    * @group setParam
    */
   @Since("1.6.0")
-  def setSolver(value: String): this.type = set(solver, value)
+  def setSolver(value: String): this.type = {
+    require(Array("auto", "l-bfgs", "normal").contains(value), s"Solver $value was not supported.")
+    set(solver, value)
+  }
   setDefault(solver -> "auto")
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 57c7e44e97607..5a551533be9ca 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -73,11 +73,13 @@ private[ml] trait DecisionTreeParams extends PredictorParams
 
   /**
    * Minimum information gain for a split to be considered at a tree node.
+   * Should be >= 0.0.
    * (default = 0.0)
    * @group param
    */
   final val minInfoGain: DoubleParam = new DoubleParam(this, "minInfoGain",
-    "Minimum information gain for a split to be considered at a tree node.")
+    "Minimum information gain for a split to be considered at a tree node.",
+    ParamValidators.gtEq(0.0))
 
   /**
    * Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be
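Reviewer sketch, not part of the patch: the caller-visible effect of the setSolver change above. Construction needs no SparkSession, so the check fires immediately:

import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
lr.setSolver("l-bfgs")   // ok: one of auto, l-bfgs, normal
lr.setSolver("lbfgs")    // IllegalArgumentException: requirement failed: Solver lbfgs was not supported.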
From 89c7f1aaee562aad5983556dac27c4507b78697b Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Mon, 14 Nov 2016 19:34:54 +0800
Subject: [PATCH 3/4] update

---
 .../org/apache/spark/mllib/feature/ChiSqSelector.scala      | 6 ++----
 .../scala/org/apache/spark/mllib/feature/HashingTF.scala    | 2 --
 .../src/main/scala/org/apache/spark/mllib/feature/IDF.scala | 1 -
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 501401e0fe5ee..f9156b642785f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -192,27 +192,25 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
   @Since("1.3.0")
   def this(numTopFeatures: Int) {
     this()
-    require(numTopFeatures > 0, s"numTopFeatures must be positive but got $numTopFeatures")
     this.numTopFeatures = numTopFeatures
   }
 
   @Since("1.6.0")
   def setNumTopFeatures(value: Int): this.type = {
-    require(value > 0, s"numTopFeatures must be positive but got $value")
     numTopFeatures = value
     this
   }
 
   @Since("2.1.0")
   def setPercentile(value: Double): this.type = {
-    require(0.0 <= value && value <= 1.0, s"Percentile must be in [0,1] but got $value")
+    require(0.0 <= value && value <= 1.0, "Percentile must be in [0,1]")
     percentile = value
     this
   }
 
   @Since("2.1.0")
   def setFpr(value: Double): this.type = {
-    require(0.0 <= value && value <= 1.0, s"FPR must be in [0,1] but got $value")
+    require(0.0 <= value && value <= 1.0, "FPR must be in [0,1]")
     fpr = value
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
index 3c69fb41c834b..bc26655104a9b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
@@ -38,7 +38,6 @@ import org.apache.spark.util.Utils
  */
 @Since("1.1.0")
 class HashingTF(val numFeatures: Int) extends Serializable {
-  require(numFeatures > 0, s"numFeatures must be positive but got $numFeatures")
 
   import HashingTF._
 
@@ -66,7 +65,6 @@ class HashingTF(val numFeatures: Int) extends Serializable {
    */
   @Since("2.0.0")
   def setHashAlgorithm(value: String): this.type = {
-    require(Array(Murmur3, Native).contains(value), s"hashAlgorithm: $value was not supported.")
     hashAlgorithm = value
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 15da4cc741e24..bb4b37ef21a84 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -38,7 +38,6 @@ import org.apache.spark.rdd.RDD
  */
 @Since("1.1.0")
 class IDF @Since("1.2.0") (@Since("1.2.0") val minDocFreq: Int) {
-  require(minDocFreq > 0, s"minDocFreq must be positive but got $minDocFreq")
 
   @Since("1.1.0")
   def this() = this(0)
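Reviewer sketch, not part of the patch: one concrete conflict the revert above resolves. Under the PATCH 1/4 checks, IDF's own zero-arg constructor could no longer run, since it delegates to this(0) while the new check demanded minDocFreq > 0:

import org.apache.spark.mllib.feature.IDF

new IDF()    // delegates to this(0); under PATCH 1/4 the require(minDocFreq > 0, ...) would throw
new IDF(1)   // fine both before and after the revert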
From 587fb9c8d233ec8d83750d4e1d39996da20b34e4 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Tue, 15 Nov 2016 10:37:16 +0800
Subject: [PATCH 4/4] array->set; show supported options

---
 .../org/apache/spark/ml/regression/LinearRegression.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 65954e3ea2f9b..71c542adf6f6f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -172,7 +172,8 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
    */
   @Since("1.6.0")
   def setSolver(value: String): this.type = {
-    require(Array("auto", "l-bfgs", "normal").contains(value), s"Solver $value was not supported.")
+    require(Set("auto", "l-bfgs", "normal").contains(value),
+      s"Solver $value was not supported. Supported options: auto, l-bfgs, normal")
     set(solver, value)
  }
   setDefault(solver -> "auto")
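Reviewer sketch, not part of the patch: the final behavior after PATCH 4/4, whose rejection message now lists the supported options:

import org.apache.spark.ml.regression.LinearRegression

new LinearRegression().setSolver("normal")   // ok
new LinearRegression().setSolver("sgd")
// java.lang.IllegalArgumentException: requirement failed:
//   Solver sgd was not supported. Supported options: auto, l-bfgs, normal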