From 51695a09cb3c78a67f12e234d19cf6e1142a7b69 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Sat, 5 Mar 2016 01:19:35 -0800 Subject: [PATCH 1/3] add toggle for countvectorizer --- .../spark/ml/feature/CountVectorizer.scala | 29 ++++++++++++++++++- .../ml/feature/CountVectorizerSuite.scala | 18 ++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index cf151458f0917..e96152bfaf012 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -207,6 +207,27 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin /** @group setParam */ def setMinTF(value: Double): this.type = set(minTF, value) + /** + * Binary toggle to control the output vector values. + * If True, all non zero counts are set to 1. This is useful for discrete probabilistic + * models that model binary events rather than integer counts + * + * Default: false + * @group param + */ + val binary: BooleanParam = + new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " + + "This is useful for discrete probabilistic models that model binary events rather " + + "than integer counts") + + /** @group getParam */ + def getBinary: Boolean = $(binary) + + /** @group setParam */ + def setBinary(value: Boolean): this.type = set(binary, value) + + setDefault(binary -> false) + /** Dictionary created from [[vocabulary]] and its indices, broadcast once for [[transform()]] */ private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None @@ -233,7 +254,13 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin } else { tokenCount * minTf } - Vectors.sparse(dictBr.value.size, termCounts.filter(_._2 >= effectiveMinTF).toSeq) + val effectiveCounts = if ($(binary)) { + termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq + } + else { + termCounts.filter(_._2 >= effectiveMinTF).toSeq + } + Vectors.sparse(dictBr.value.size, effectiveCounts) } dataset.withColumn($(outputCol), vectorizer(col($(inputCol)))) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala index 9c9999017317d..8f4428bb8f1e7 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -168,6 +168,24 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext } } + test("CountVectorizerModel with binary") { + val df = sqlContext.createDataFrame(Seq( + (0, split("a a a b b c"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0)))), + (1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))), + (2, split("a"), Vectors.sparse(4, Seq((0, 1.0)))) + )).toDF("id", "words", "expected") + + // minTF: count + val cv = new CountVectorizerModel(Array("a", "b", "c", "d")) + .setInputCol("words") + .setOutputCol("features") + .setBinary(true) + cv.transform(df).select("features", "expected").collect().foreach { + case Row(features: Vector, expected: Vector) => + assert(features ~== expected absTol 1e-14) + } + } + test("CountVectorizer read/write") { val t = new CountVectorizer() .setInputCol("myInputCol") From 1b70bf00efbb4b61f3ba7a2dcc35ccd2267d18cc Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Sat, 5 Mar 2016 10:27:51 -0800 Subject: [PATCH 2/3] fix comment --- .../org/apache/spark/ml/feature/CountVectorizerSuite.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala index 8f4428bb8f1e7..04f165c5f1e74 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -157,7 +157,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext (3, split("e e e e e"), Vectors.sparse(4, Seq()))) ).toDF("id", "words", "expected") - // minTF: count + // minTF: set frequency val cv = new CountVectorizerModel(Array("a", "b", "c", "d")) .setInputCol("words") .setOutputCol("features") @@ -175,7 +175,6 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext (2, split("a"), Vectors.sparse(4, Seq((0, 1.0)))) )).toDF("id", "words", "expected") - // minTF: count val cv = new CountVectorizerModel(Array("a", "b", "c", "d")) .setInputCol("words") .setOutputCol("features") From 4172226708457e037a4300bd65aa0567c4f30f68 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Fri, 18 Mar 2016 13:50:36 -0700 Subject: [PATCH 3/3] refine comment --- .../spark/ml/feature/CountVectorizer.scala | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index a3845d39777a4..5694b3890fba4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -207,13 +207,12 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin def setMinTF(value: Double): this.type = set(minTF, value) /** - * Binary toggle to control the output vector values. - * If True, all non zero counts are set to 1. This is useful for discrete probabilistic - * models that model binary events rather than integer counts - * - * Default: false - * @group param - */ + * Binary toggle to control the output vector values. + * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for + * discrete probabilistic models that model binary events rather than integer counts. + * Default: false + * @group param + */ val binary: BooleanParam = new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " + "This is useful for discrete probabilistic models that model binary events rather " + @@ -248,17 +247,13 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin } tokenCount += 1 } - val effectiveMinTF = if (minTf >= 1.0) { - minTf - } else { - tokenCount * minTf - } + val effectiveMinTF = if (minTf >= 1.0) minTf else tokenCount * minTf val effectiveCounts = if ($(binary)) { termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq - } - else { + } else { termCounts.filter(_._2 >= effectiveMinTF).toSeq } + Vectors.sparse(dictBr.value.size, effectiveCounts) } dataset.withColumn($(outputCol), vectorizer(col($(inputCol))))