From 65dcb0434518c1b4d18ecbc9fe47587320a798fa Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Tue, 5 Apr 2016 16:39:53 -0700 Subject: [PATCH 01/11] move binary param out of CountVectorizerModel without test --- .../spark/ml/feature/CountVectorizer.scala | 56 +++++++++++-------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index 5694b3890fba4..47f48ae377283 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -42,7 +42,8 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit * vocabSize terms ordered by term frequency across the corpus. * * Default: 2^18^ - * @group param + * + * @group param */ val vocabSize: IntParam = new IntParam(this, "vocabSize", "max size of the vocabulary", ParamValidators.gt(0)) @@ -57,7 +58,8 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit * if this is a double in [0,1), then this specifies the fraction of documents. * * Default: 1 - * @group param + * + * @group param */ val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the minimum number of" + " different documents a term must appear in to be included in the vocabulary." + @@ -87,7 +89,8 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit * affect fitting. * * Default: 1 - * @group param + * + * @group param */ val minTF: DoubleParam = new DoubleParam(this, "minTF", "Filter to ignore rare words in" + " a document. 
For each document, terms with frequency/count less than the given threshold are" + @@ -100,6 +103,24 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit /** @group getParam */ def getMinTF: Double = $(minTF) + + /** + * Binary toggle to control the output vector values. + * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for + * discrete probabilistic models that model binary events rather than integer counts. + * Default: false + * + * @group param + */ + val binary: BooleanParam = + new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " + + "This is useful for discrete probabilistic models that model binary events rather " + + "than integer counts") + + /** @group getParam */ + def getBinary: Boolean = $(binary) + + setDefault(binary -> false) } /** @@ -127,6 +148,9 @@ class CountVectorizer(override val uid: String) /** @group setParam */ def setMinTF(value: Double): this.type = set(minTF, value) + /** @group setParam */ + def setBinary(value: Boolean): this.type = set(binary, value) + setDefault(vocabSize -> (1 << 18), minDF -> 1) override def fit(dataset: DataFrame): CountVectorizerModel = { @@ -149,7 +173,11 @@ class CountVectorizer(override val uid: String) }.filter { case (word, (wc, df)) => df >= minDf }.map { case (word, (count, dfCount)) => - (word, count) + if ($(binary)) { + (word, 1L) + } else { + (word, count) + } }.cache() val fullVocabSize = wordCounts.count() val vocab: Array[String] = { @@ -184,7 +212,8 @@ object CountVectorizer extends DefaultParamsReadable[CountVectorizer] { /** * :: Experimental :: * Converts a text document to a sparse vector of token counts. - * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted. + * + * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted. 
*/ @Experimental class CountVectorizerModel(override val uid: String, val vocabulary: Array[String]) @@ -206,26 +235,9 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin /** @group setParam */ def setMinTF(value: Double): this.type = set(minTF, value) - /** - * Binary toggle to control the output vector values. - * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for - * discrete probabilistic models that model binary events rather than integer counts. - * Default: false - * @group param - */ - val binary: BooleanParam = - new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " + - "This is useful for discrete probabilistic models that model binary events rather " + - "than integer counts") - - /** @group getParam */ - def getBinary: Boolean = $(binary) - /** @group setParam */ def setBinary(value: Boolean): this.type = set(binary, value) - setDefault(binary -> false) - /** Dictionary created from [[vocabulary]] and its indices, broadcast once for [[transform()]] */ private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None From dcad7996c762c25db8b37d8a07bd9dce1a807418 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Tue, 5 Apr 2016 23:16:53 -0700 Subject: [PATCH 02/11] Add test case and revert one change --- .../spark/ml/feature/CountVectorizer.scala | 6 +----- .../ml/feature/CountVectorizerSuite.scala | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index 47f48ae377283..e1300defa3e30 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -173,11 +173,7 @@ class CountVectorizer(override val uid: String) }.filter { case (word, (wc, df)) => df >= minDf }.map { case 
(word, (count, dfCount)) => - if ($(binary)) { - (word, 1L) - } else { - (word, count) - } + (word, count) }.cache() val fullVocabSize = wordCounts.count() val vocab: Array[String] = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala index 04f165c5f1e74..a18ca8faae2d0 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -115,6 +115,27 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext } } + test("CountVectorizer with binary") { + val df = sqlContext.createDataFrame(Seq( + (0, split("a b c d e a b"), + Vectors.sparse(5, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0)))), + (1, split("a a a a a a"), Vectors.sparse(5, Seq((0, 1.0)))), + (2, split("c c"), Vectors.sparse(5, Seq((2, 1.0)))), + (3, split("b b b b b"), Vectors.sparse(5, Seq((1, 1.0))))) + ).toDF("id", "words", "expected") + val cv = new CountVectorizer() + .setInputCol("words") + .setOutputCol("features") + .setBinary(true) + .fit(df) + assert(cv.vocabulary === Array("a", "b", "c", "d", "e")) + + cv.transform(df).select("features", "expected").collect().foreach { + case Row(features: Vector, expected: Vector) => + assert(features ~== expected absTol 1e-14) + } + } + test("CountVectorizer throws exception when vocab is empty") { intercept[IllegalArgumentException] { val df = sqlContext.createDataFrame(Seq( From b625caef23b2a607229976beb90ee682987b6210 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Wed, 6 Apr 2016 09:50:18 -0700 Subject: [PATCH 03/11] merge tests, remove extra leading space, address review comments.
--- .../spark/ml/feature/CountVectorizer.scala | 12 +++-- .../ml/feature/CountVectorizerSuite.scala | 45 +++++++++---------- 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index e1300defa3e30..f45e028f1cbfe 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -42,8 +42,8 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit * vocabSize terms ordered by term frequency across the corpus. * * Default: 2^18^ - * - * @group param + * + * @group param */ val vocabSize: IntParam = new IntParam(this, "vocabSize", "max size of the vocabulary", ParamValidators.gt(0)) @@ -113,9 +113,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit * @group param */ val binary: BooleanParam = - new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " + - "This is useful for discrete probabilistic models that model binary events rather " + - "than integer counts") + new BooleanParam(this, "binary", "If True, all non zero counts are set to 1.") /** @group getParam */ def getBinary: Boolean = $(binary) @@ -208,8 +206,8 @@ object CountVectorizer extends DefaultParamsReadable[CountVectorizer] { /** * :: Experimental :: * Converts a text document to a sparse vector of token counts. - * - * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted. + * + * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted. 
*/ @Experimental class CountVectorizerModel(override val uid: String, val vocabulary: Array[String]) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala index a18ca8faae2d0..b431cb6fc0f7f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -114,28 +114,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext assert(features ~== expected absTol 1e-14) } } - - test("CountVectorizer with binary") { - val df = sqlContext.createDataFrame(Seq( - (0, split("a b c d e a b"), - Vectors.sparse(5, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0)))), - (1, split("a a a a a a"), Vectors.sparse(5, Seq((0, 1.0)))), - (2, split("c c"), Vectors.sparse(5, Seq((2, 1.0)))), - (3, split("b b b b b"), Vectors.sparse(5, Seq((1, 1.0))))) - ).toDF("id", "words", "expected") - val cv = new CountVectorizer() - .setInputCol("words") - .setOutputCol("features") - .setBinary(true) - .fit(df) - assert(cv.vocabulary === Array("a", "b", "c", "d", "e")) - - cv.transform(df).select("features", "expected").collect().foreach { - case Row(features: Vector, expected: Vector) => - assert(features ~== expected absTol 1e-14) - } - } - + test("CountVectorizer throws exception when vocab is empty") { intercept[IllegalArgumentException] { val df = sqlContext.createDataFrame(Seq( @@ -189,7 +168,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext } } - test("CountVectorizerModel with binary") { + test("CountVectorizerModel and CountVectorizer with binary") { val df = sqlContext.createDataFrame(Seq( (0, split("a a a b b c"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0)))), (1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))), @@ -204,6 +183,26 @@ class CountVectorizerSuite extends SparkFunSuite with 
MLlibTestSparkContext case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) } + + // CountVectorizer test + val df2 = sqlContext.createDataFrame(Seq( + (0, split("a b c d e a b"), + Vectors.sparse(5, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0)))), + (1, split("a a a a a a"), Vectors.sparse(5, Seq((0, 1.0)))), + (2, split("c c"), Vectors.sparse(5, Seq((2, 1.0)))), + (3, split("b b b b b"), Vectors.sparse(5, Seq((1, 1.0))))) + ).toDF("id", "words", "expected") + val cv2 = new CountVectorizer() + .setInputCol("words") + .setOutputCol("features") + .setBinary(true) + .fit(df2) + assert(cv2.vocabulary === Array("a", "b", "c", "d", "e")) + + cv2.transform(df2).select("features", "expected").collect().foreach { + case Row(features: Vector, expected: Vector) => + assert(features ~== expected absTol 1e-14) + } } test("CountVectorizer read/write") { From 7e6daa85b6e9b947184783063cd8b428d56edfe1 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Wed, 6 Apr 2016 10:07:17 -0700 Subject: [PATCH 04/11] formatting, remove unnecessary spaces and lines added by my editor --- .../spark/ml/feature/CountVectorizer.scala | 20 ++++++++----------- .../ml/feature/CountVectorizerSuite.scala | 2 +- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index f45e028f1cbfe..4810b7c070e71 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -42,7 +42,6 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit * vocabSize terms ordered by term frequency across the corpus. 
* * Default: 2^18^ - * * @group param */ val vocabSize: IntParam = @@ -58,8 +57,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit * if this is a double in [0,1), then this specifies the fraction of documents. * * Default: 1 - * - * @group param + * @group param */ val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the minimum number of" + " different documents a term must appear in to be included in the vocabulary." + @@ -89,8 +87,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit * affect fitting. * * Default: 1 - * - * @group param + * @group param */ val minTF: DoubleParam = new DoubleParam(this, "minTF", "Filter to ignore rare words in" + " a document. For each document, terms with frequency/count less than the given threshold are" + @@ -105,13 +102,12 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit def getMinTF: Double = $(minTF) /** - * Binary toggle to control the output vector values. - * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for - * discrete probabilistic models that model binary events rather than integer counts. - * Default: false - * - * @group param - */ + * Binary toggle to control the output vector values. + * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for + * discrete probabilistic models that model binary events rather than integer counts. 
+ * Default: false + * @group param + */ val binary: BooleanParam = new BooleanParam(this, "binary", "If True, all non zero counts are set to 1.") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala index b431cb6fc0f7f..071671e8573aa 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -114,7 +114,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext assert(features ~== expected absTol 1e-14) } } - + test("CountVectorizer throws exception when vocab is empty") { intercept[IllegalArgumentException] { val df = sqlContext.createDataFrame(Seq( From 1f1e36dc8f727893c13badf75db36401d9f84742 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Wed, 6 Apr 2016 10:10:17 -0700 Subject: [PATCH 05/11] remove additional line added by editor --- .../main/scala/org/apache/spark/ml/feature/CountVectorizer.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index 4810b7c070e71..f1be971a6ae94 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -202,7 +202,6 @@ object CountVectorizer extends DefaultParamsReadable[CountVectorizer] { /** * :: Experimental :: * Converts a text document to a sparse vector of token counts. - * * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted. */ @Experimental From 7c89370f4458e1c656db4180df3668928e148755 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Wed, 6 Apr 2016 10:37:20 -0700 Subject: [PATCH 06/11] revise the df and use the same df for two tests. 
--- .../spark/ml/feature/CountVectorizerSuite.scala | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala index 071671e8573aa..101534ce6187a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -170,7 +170,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext test("CountVectorizerModel and CountVectorizer with binary") { val df = sqlContext.createDataFrame(Seq( - (0, split("a a a b b c"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0)))), + (0, split("a a a a b b b b c d"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))), (1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))), (2, split("a"), Vectors.sparse(4, Seq((0, 1.0)))) )).toDF("id", "words", "expected") @@ -185,21 +185,14 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext } // CountVectorizer test - val df2 = sqlContext.createDataFrame(Seq( - (0, split("a b c d e a b"), - Vectors.sparse(5, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0)))), - (1, split("a a a a a a"), Vectors.sparse(5, Seq((0, 1.0)))), - (2, split("c c"), Vectors.sparse(5, Seq((2, 1.0)))), - (3, split("b b b b b"), Vectors.sparse(5, Seq((1, 1.0))))) - ).toDF("id", "words", "expected") val cv2 = new CountVectorizer() .setInputCol("words") .setOutputCol("features") .setBinary(true) - .fit(df2) - assert(cv2.vocabulary === Array("a", "b", "c", "d", "e")) + .fit(df) + assert(cv2.vocabulary === Array("a", "b", "c", "d")) - cv2.transform(df2).select("features", "expected").collect().foreach { + cv2.transform(df).select("features", "expected").collect().foreach { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) } From 
5b35fb9f3cba3dd68e3bbdac7d927e312726bb37 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Wed, 6 Apr 2016 11:16:44 -0700 Subject: [PATCH 07/11] remove extra space and assert in the test --- .../org/apache/spark/ml/feature/CountVectorizerSuite.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala index 101534ce6187a..e07edc10ac747 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -190,8 +190,6 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext .setOutputCol("features") .setBinary(true) .fit(df) - assert(cv2.vocabulary === Array("a", "b", "c", "d")) - cv2.transform(df).select("features", "expected").collect().foreach { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) From de81c35faaf6848d21562a8c131aff723928014d Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Wed, 6 Apr 2016 11:28:23 -0700 Subject: [PATCH 08/11] split long line into 2 lines. remove the annoying extra space added by editor again. 
--- .../org/apache/spark/ml/feature/CountVectorizerSuite.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala index e07edc10ac747..e8cd7e48f2646 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -114,7 +114,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext assert(features ~== expected absTol 1e-14) } } - + test("CountVectorizer throws exception when vocab is empty") { intercept[IllegalArgumentException] { val df = sqlContext.createDataFrame(Seq( @@ -170,7 +170,8 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext test("CountVectorizerModel and CountVectorizer with binary") { val df = sqlContext.createDataFrame(Seq( - (0, split("a a a a b b b b c d"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))), + (0, split("a a a a b b b b c d"), + Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))), (1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))), (2, split("a"), Vectors.sparse(4, Seq((0, 1.0)))) )).toDF("id", "words", "expected") From 42fdfee0dd583d73f533b760d3a689c68c562d9f Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Wed, 6 Apr 2016 11:35:12 -0700 Subject: [PATCH 09/11] remove space at the end of line --- .../org/apache/spark/ml/feature/CountVectorizerSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala index e8cd7e48f2646..97dabe38c51fa 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -170,7 +170,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext test("CountVectorizerModel and CountVectorizer with binary") { val df = sqlContext.createDataFrame(Seq( - (0, split("a a a a b b b b c d"), + (0, split("a a a a b b b b c d"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))), (1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))), (2, split("a"), Vectors.sparse(4, Seq((0, 1.0)))) From e1bffd7609e7cb7d5a36066f799f03bd10b53c94 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Thu, 7 Apr 2016 13:13:47 -0700 Subject: [PATCH 10/11] move setDefault into class; change the order of tests --- .../org/apache/spark/ml/feature/CountVectorizer.scala | 6 ++++-- .../apache/spark/ml/feature/CountVectorizerSuite.scala | 9 +++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index f1be971a6ae94..1858581702315 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -113,8 +113,6 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit /** @group getParam */ def getBinary: Boolean = $(binary) - - setDefault(binary -> false) } /** @@ -147,6 +145,8 @@ class CountVectorizer(override val uid: String) setDefault(vocabSize -> (1 << 18), minDF -> 1) + setDefault(binary -> false) + override def fit(dataset: DataFrame): CountVectorizerModel = { transformSchema(dataset.schema, logging = true) val vocSize = $(vocabSize) @@ -227,6 +227,8 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin /** @group setParam */ def setBinary(value: Boolean): this.type = set(binary, value) + setDefault(binary -> false) + /** Dictionary created from 
[[vocabulary]] and its indices, broadcast once for [[transform()]] */ private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala index 97dabe38c51fa..ff0de06e27d01 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -176,21 +176,22 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext (2, split("a"), Vectors.sparse(4, Seq((0, 1.0)))) )).toDF("id", "words", "expected") - val cv = new CountVectorizerModel(Array("a", "b", "c", "d")) + // CountVectorizer test + val cv = new CountVectorizer() .setInputCol("words") .setOutputCol("features") .setBinary(true) + .fit(df) cv.transform(df).select("features", "expected").collect().foreach { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) } - // CountVectorizer test - val cv2 = new CountVectorizer() + // CountVectorizerModel test + val cv2 = new CountVectorizerModel(cv.vocabulary) .setInputCol("words") .setOutputCol("features") .setBinary(true) - .fit(df) cv2.transform(df).select("features", "expected").collect().foreach { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) From e693c60e8059976ec179c298453017a486792712 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Fri, 8 Apr 2016 09:38:46 -0700 Subject: [PATCH 11/11] move setDefault back to trait --- .../scala/org/apache/spark/ml/feature/CountVectorizer.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index 1858581702315..f1be971a6ae94 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -113,6 +113,8 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit /** @group getParam */ def getBinary: Boolean = $(binary) + + setDefault(binary -> false) } /** @@ -145,8 +147,6 @@ class CountVectorizer(override val uid: String) setDefault(vocabSize -> (1 << 18), minDF -> 1) - setDefault(binary -> false) - override def fit(dataset: DataFrame): CountVectorizerModel = { transformSchema(dataset.schema, logging = true) val vocSize = $(vocabSize) @@ -227,8 +227,6 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin /** @group setParam */ def setBinary(value: Boolean): this.type = set(binary, value) - setDefault(binary -> false) - /** Dictionary created from [[vocabulary]] and its indices, broadcast once for [[transform()]] */ private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None