From 51695a09cb3c78a67f12e234d19cf6e1142a7b69 Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Sat, 5 Mar 2016 01:19:35 -0800
Subject: [PATCH 1/3] add toggle for countvectorizer

---
 .../spark/ml/feature/CountVectorizer.scala    | 29 ++++++++++++++++++-
 .../ml/feature/CountVectorizerSuite.scala     | 18 ++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index cf151458f0917..e96152bfaf012 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -207,6 +207,27 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
   /** @group setParam */
   def setMinTF(value: Double): this.type = set(minTF, value)
 
+  /**
+    * Binary toggle to control the output vector values.
+    * If True, all non zero counts are set to 1. This is useful for discrete probabilistic
+    * models that model binary events rather than integer counts
+    *
+    * Default: false
+    * @group param
+    */
+  val binary: BooleanParam =
+    new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " +
+      "This is useful for discrete probabilistic models that model binary events rather " +
+      "than integer counts")
+
+  /** @group getParam */
+  def getBinary: Boolean = $(binary)
+
+  /** @group setParam */
+  def setBinary(value: Boolean): this.type = set(binary, value)
+
+  setDefault(binary -> false)
+
   /** Dictionary created from [[vocabulary]] and its indices, broadcast once for [[transform()]] */
   private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None
 
@@ -233,7 +254,13 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
       } else {
         tokenCount * minTf
       }
-      Vectors.sparse(dictBr.value.size, termCounts.filter(_._2 >= effectiveMinTF).toSeq)
+      val effectiveCounts = if ($(binary)) {
+        termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq
+      }
+      else {
+        termCounts.filter(_._2 >= effectiveMinTF).toSeq
+      }
+      Vectors.sparse(dictBr.value.size, effectiveCounts)
     }
     dataset.withColumn($(outputCol), vectorizer(col($(inputCol))))
   }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index 9c9999017317d..8f4428bb8f1e7 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -168,6 +168,24 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
     }
   }
 
+  test("CountVectorizerModel with binary") {
+    val df = sqlContext.createDataFrame(Seq(
+      (0, split("a a a b b c"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0)))),
+      (1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))),
+      (2, split("a"), Vectors.sparse(4, Seq((0, 1.0))))
+    )).toDF("id", "words", "expected")
+
+    // minTF: count
+    val cv = new CountVectorizerModel(Array("a", "b", "c", "d"))
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setBinary(true)
+    cv.transform(df).select("features", "expected").collect().foreach {
+      case Row(features: Vector, expected: Vector) =>
+        assert(features ~== expected absTol 1e-14)
+    }
+  }
+
   test("CountVectorizer read/write") {
     val t = new CountVectorizer()
       .setInputCol("myInputCol")

From 1b70bf00efbb4b61f3ba7a2dcc35ccd2267d18cc Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Sat, 5 Mar 2016 10:27:51 -0800
Subject: [PATCH 2/3] fix comment

---
 .../org/apache/spark/ml/feature/CountVectorizerSuite.scala     | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index 8f4428bb8f1e7..04f165c5f1e74 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -157,7 +157,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
       (3, split("e e e e e"), Vectors.sparse(4, Seq())))
     ).toDF("id", "words", "expected")
 
-    // minTF: count
+    // minTF: set frequency
     val cv = new CountVectorizerModel(Array("a", "b", "c", "d"))
       .setInputCol("words")
       .setOutputCol("features")
@@ -175,7 +175,6 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
       (2, split("a"), Vectors.sparse(4, Seq((0, 1.0))))
     )).toDF("id", "words", "expected")
 
-    // minTF: count
     val cv = new CountVectorizerModel(Array("a", "b", "c", "d"))
       .setInputCol("words")
       .setOutputCol("features")

From 4172226708457e037a4300bd65aa0567c4f30f68 Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Fri, 18 Mar 2016 13:50:36 -0700
Subject: [PATCH 3/3] refine comment

---
 .../spark/ml/feature/CountVectorizer.scala    | 23 ++++++++-----------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index a3845d39777a4..5694b3890fba4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -207,13 +207,12 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
   def setMinTF(value: Double): this.type = set(minTF, value)
 
   /**
-    * Binary toggle to control the output vector values.
-    * If True, all non zero counts are set to 1. This is useful for discrete probabilistic
-    * models that model binary events rather than integer counts
-    *
-    * Default: false
-    * @group param
-    */
+   * Binary toggle to control the output vector values.
+   * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
+   * discrete probabilistic models that model binary events rather than integer counts.
+   * Default: false
+   * @group param
+   */
   val binary: BooleanParam =
     new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " +
       "This is useful for discrete probabilistic models that model binary events rather " +
@@ -248,17 +247,13 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
         }
         tokenCount += 1
       }
-      val effectiveMinTF = if (minTf >= 1.0) {
-        minTf
-      } else {
-        tokenCount * minTf
-      }
+      val effectiveMinTF = if (minTf >= 1.0) minTf else tokenCount * minTf
       val effectiveCounts = if ($(binary)) {
         termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq
-      }
-      else {
+      } else {
         termCounts.filter(_._2 >= effectiveMinTF).toSeq
       }
+
       Vectors.sparse(dictBr.value.size, effectiveCounts)
     }
     dataset.withColumn($(outputCol), vectorizer(col($(inputCol))))