From 65dcb0434518c1b4d18ecbc9fe47587320a798fa Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Tue, 5 Apr 2016 16:39:53 -0700
Subject: [PATCH 01/11] move binary param out of CountVectorizerModel without
 test

---
 .../spark/ml/feature/CountVectorizer.scala    | 56 +++++++++++--------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index 5694b3890fba4..47f48ae377283 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -42,7 +42,8 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
    * vocabSize terms ordered by term frequency across the corpus.
    *
    * Default: 2^18^
-   * @group param
+    *
+    * @group param
    */
   val vocabSize: IntParam =
     new IntParam(this, "vocabSize", "max size of the vocabulary", ParamValidators.gt(0))
@@ -57,7 +58,8 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
    * if this is a double in [0,1), then this specifies the fraction of documents.
    *
    * Default: 1
-   * @group param
+    *
+    * @group param
    */
   val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the minimum number of" +
     " different documents a term must appear in to be included in the vocabulary." +
@@ -87,7 +89,8 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
    * affect fitting.
    *
    * Default: 1
-   * @group param
+    *
+    * @group param
    */
   val minTF: DoubleParam = new DoubleParam(this, "minTF", "Filter to ignore rare words in" +
     " a document. For each document, terms with frequency/count less than the given threshold are" +
@@ -100,6 +103,24 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
 
   /** @group getParam */
   def getMinTF: Double = $(minTF)
+
+  /**
+    * Binary toggle to control the output vector values.
+    * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
+    * discrete probabilistic models that model binary events rather than integer counts.
+    * Default: false
+    *
+    * @group param
+    */
+  val binary: BooleanParam =
+    new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " +
+      "This is useful for discrete probabilistic models that model binary events rather " +
+      "than integer counts")
+
+  /** @group getParam */
+  def getBinary: Boolean = $(binary)
+
+  setDefault(binary -> false)
 }
 
 /**
@@ -127,6 +148,9 @@ class CountVectorizer(override val uid: String)
   /** @group setParam */
   def setMinTF(value: Double): this.type = set(minTF, value)
 
+  /** @group setParam */
+  def setBinary(value: Boolean): this.type = set(binary, value)
+
   setDefault(vocabSize -> (1 << 18), minDF -> 1)
 
   override def fit(dataset: DataFrame): CountVectorizerModel = {
@@ -149,7 +173,11 @@ class CountVectorizer(override val uid: String)
     }.filter { case (word, (wc, df)) =>
       df >= minDf
     }.map { case (word, (count, dfCount)) =>
-      (word, count)
+      if ($(binary)) {
+        (word, 1L)
+      } else {
+        (word, count)
+      }
     }.cache()
     val fullVocabSize = wordCounts.count()
     val vocab: Array[String] = {
@@ -184,7 +212,8 @@ object CountVectorizer extends DefaultParamsReadable[CountVectorizer] {
 /**
  * :: Experimental ::
  * Converts a text document to a sparse vector of token counts.
- * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted.
+  *
+  * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted.
  */
 @Experimental
 class CountVectorizerModel(override val uid: String, val vocabulary: Array[String])
@@ -206,26 +235,9 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
   /** @group setParam */
   def setMinTF(value: Double): this.type = set(minTF, value)
 
-  /**
-   * Binary toggle to control the output vector values.
-   * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
-   * discrete probabilistic models that model binary events rather than integer counts.
-   * Default: false
-   * @group param
-   */
-  val binary: BooleanParam =
-    new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " +
-      "This is useful for discrete probabilistic models that model binary events rather " +
-      "than integer counts")
-
-  /** @group getParam */
-  def getBinary: Boolean = $(binary)
-
   /** @group setParam */
   def setBinary(value: Boolean): this.type = set(binary, value)
 
-  setDefault(binary -> false)
-
   /** Dictionary created from [[vocabulary]] and its indices, broadcast once for [[transform()]] */
   private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None
 

From dcad7996c762c25db8b37d8a07bd9dce1a807418 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Tue, 5 Apr 2016 23:16:53 -0700
Subject: [PATCH 02/11] Add test case and revert one change

---
 .../spark/ml/feature/CountVectorizer.scala    |  6 +-----
 .../ml/feature/CountVectorizerSuite.scala     | 21 +++++++++++++++++++
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index 47f48ae377283..e1300defa3e30 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -173,11 +173,7 @@ class CountVectorizer(override val uid: String)
     }.filter { case (word, (wc, df)) =>
       df >= minDf
     }.map { case (word, (count, dfCount)) =>
-      if ($(binary)) {
-        (word, 1L)
-      } else {
-        (word, count)
-      }
+      (word, count)
     }.cache()
     val fullVocabSize = wordCounts.count()
     val vocab: Array[String] = {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index 04f165c5f1e74..a18ca8faae2d0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -115,6 +115,27 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
     }
   }
 
+  test("CountVectorizer with binary") {
+    val df = sqlContext.createDataFrame(Seq(
+      (0, split("a b c d e a b"),
+        Vectors.sparse(5, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0)))),
+      (1, split("a a a a a a"), Vectors.sparse(5, Seq((0, 1.0)))),
+      (2, split("c c"), Vectors.sparse(5, Seq((2, 1.0)))),
+      (3, split("b b b b b"), Vectors.sparse(5, Seq((1, 1.0)))))
+    ).toDF("id", "words", "expected")
+    val cv = new CountVectorizer()
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setBinary(true)
+      .fit(df)
+    assert(cv.vocabulary === Array("a", "b", "c", "d", "e"))
+
+    cv.transform(df).select("features", "expected").collect().foreach {
+      case Row(features: Vector, expected: Vector) =>
+        assert(features ~== expected absTol 1e-14)
+    }
+  }
+
   test("CountVectorizer throws exception when vocab is empty") {
     intercept[IllegalArgumentException] {
       val df = sqlContext.createDataFrame(Seq(

From b625caef23b2a607229976beb90ee682987b6210 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Wed, 6 Apr 2016 09:50:18 -0700
Subject: [PATCH 03/11] merge tests, remove extra leading space, address
 review comments.

---
 .../spark/ml/feature/CountVectorizer.scala    | 12 +++--
 .../ml/feature/CountVectorizerSuite.scala     | 45 +++++++++----------
 2 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index e1300defa3e30..f45e028f1cbfe 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -42,8 +42,8 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
    * vocabSize terms ordered by term frequency across the corpus.
    *
    * Default: 2^18^
-    *
-    * @group param
+   *
+   * @group param
    */
   val vocabSize: IntParam =
     new IntParam(this, "vocabSize", "max size of the vocabulary", ParamValidators.gt(0))
@@ -113,9 +113,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
     * @group param
     */
   val binary: BooleanParam =
-    new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " +
-      "This is useful for discrete probabilistic models that model binary events rather " +
-      "than integer counts")
+    new BooleanParam(this, "binary", "If True, all non zero counts are set to 1.")
 
   /** @group getParam */
   def getBinary: Boolean = $(binary)
@@ -208,8 +206,8 @@ object CountVectorizer extends DefaultParamsReadable[CountVectorizer] {
 /**
  * :: Experimental ::
  * Converts a text document to a sparse vector of token counts.
-  *
-  * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted.
+ *
+ * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted.
  */
 @Experimental
 class CountVectorizerModel(override val uid: String, val vocabulary: Array[String])
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index a18ca8faae2d0..b431cb6fc0f7f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -114,28 +114,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
         assert(features ~== expected absTol 1e-14)
     }
   }
-
-  test("CountVectorizer with binary") {
-    val df = sqlContext.createDataFrame(Seq(
-      (0, split("a b c d e a b"),
-        Vectors.sparse(5, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0)))),
-      (1, split("a a a a a a"), Vectors.sparse(5, Seq((0, 1.0)))),
-      (2, split("c c"), Vectors.sparse(5, Seq((2, 1.0)))),
-      (3, split("b b b b b"), Vectors.sparse(5, Seq((1, 1.0)))))
-    ).toDF("id", "words", "expected")
-    val cv = new CountVectorizer()
-      .setInputCol("words")
-      .setOutputCol("features")
-      .setBinary(true)
-      .fit(df)
-    assert(cv.vocabulary === Array("a", "b", "c", "d", "e"))
-
-    cv.transform(df).select("features", "expected").collect().foreach {
-      case Row(features: Vector, expected: Vector) =>
-        assert(features ~== expected absTol 1e-14)
-    }
-  }
-
+  
   test("CountVectorizer throws exception when vocab is empty") {
     intercept[IllegalArgumentException] {
       val df = sqlContext.createDataFrame(Seq(
@@ -189,7 +168,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
     }
   }
 
-  test("CountVectorizerModel with binary") {
+  test("CountVectorizerModel and CountVectorizer with binary") {
     val df = sqlContext.createDataFrame(Seq(
       (0, split("a a a b b c"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0)))),
       (1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))),
@@ -204,6 +183,26 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
       case Row(features: Vector, expected: Vector) =>
         assert(features ~== expected absTol 1e-14)
     }
+
+    // CountVectorizer test
+    val df2 = sqlContext.createDataFrame(Seq(
+      (0, split("a b c d e a b"),
+        Vectors.sparse(5, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0)))),
+      (1, split("a a a a a a"), Vectors.sparse(5, Seq((0, 1.0)))),
+      (2, split("c c"), Vectors.sparse(5, Seq((2, 1.0)))),
+      (3, split("b b b b b"), Vectors.sparse(5, Seq((1, 1.0)))))
+    ).toDF("id", "words", "expected")
+    val cv2 = new CountVectorizer()
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setBinary(true)
+      .fit(df2)
+    assert(cv2.vocabulary === Array("a", "b", "c", "d", "e"))
+
+    cv2.transform(df2).select("features", "expected").collect().foreach {
+      case Row(features: Vector, expected: Vector) =>
+        assert(features ~== expected absTol 1e-14)
+    }
   }
 
   test("CountVectorizer read/write") {

From 7e6daa85b6e9b947184783063cd8b428d56edfe1 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Wed, 6 Apr 2016 10:07:17 -0700
Subject: [PATCH 04/11] formatting, remove unnecessary spaces and lines added
 by my editor

---
 .../spark/ml/feature/CountVectorizer.scala    | 20 ++++++++-----------
 .../ml/feature/CountVectorizerSuite.scala     |  2 +-
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index f45e028f1cbfe..4810b7c070e71 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -42,7 +42,6 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
    * vocabSize terms ordered by term frequency across the corpus.
    *
    * Default: 2^18^
-   *
    * @group param
    */
   val vocabSize: IntParam =
@@ -58,8 +57,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
    * if this is a double in [0,1), then this specifies the fraction of documents.
    *
    * Default: 1
-    *
-    * @group param
+   * @group param
    */
   val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the minimum number of" +
     " different documents a term must appear in to be included in the vocabulary." +
@@ -89,8 +87,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
    * affect fitting.
    *
    * Default: 1
-    *
-    * @group param
+   * @group param
    */
   val minTF: DoubleParam = new DoubleParam(this, "minTF", "Filter to ignore rare words in" +
     " a document. For each document, terms with frequency/count less than the given threshold are" +
@@ -105,13 +102,12 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
   def getMinTF: Double = $(minTF)
 
   /**
-    * Binary toggle to control the output vector values.
-    * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
-    * discrete probabilistic models that model binary events rather than integer counts.
-    * Default: false
-    *
-    * @group param
-    */
+   * Binary toggle to control the output vector values.
+   * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
+   * discrete probabilistic models that model binary events rather than integer counts.
+   * Default: false
+   * @group param
+   */
   val binary: BooleanParam =
     new BooleanParam(this, "binary", "If True, all non zero counts are set to 1.")
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index b431cb6fc0f7f..071671e8573aa 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -114,7 +114,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
         assert(features ~== expected absTol 1e-14)
     }
   }
-  
+ 
   test("CountVectorizer throws exception when vocab is empty") {
     intercept[IllegalArgumentException] {
       val df = sqlContext.createDataFrame(Seq(

From 1f1e36dc8f727893c13badf75db36401d9f84742 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Wed, 6 Apr 2016 10:10:17 -0700
Subject: [PATCH 05/11] remove additional line added by editor

---
 .../main/scala/org/apache/spark/ml/feature/CountVectorizer.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index 4810b7c070e71..f1be971a6ae94 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -202,7 +202,6 @@ object CountVectorizer extends DefaultParamsReadable[CountVectorizer] {
 /**
  * :: Experimental ::
  * Converts a text document to a sparse vector of token counts.
- *
  * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted.
  */
 @Experimental

From 7c89370f4458e1c656db4180df3668928e148755 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Wed, 6 Apr 2016 10:37:20 -0700
Subject: [PATCH 06/11] revise the df and use the same df for two tests.

---
 .../spark/ml/feature/CountVectorizerSuite.scala   | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index 071671e8573aa..101534ce6187a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -170,7 +170,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
 
   test("CountVectorizerModel and CountVectorizer with binary") {
     val df = sqlContext.createDataFrame(Seq(
-      (0, split("a a a b b c"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0)))),
+      (0, split("a a a a b b b b c d"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))),
       (1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))),
       (2, split("a"), Vectors.sparse(4, Seq((0, 1.0))))
     )).toDF("id", "words", "expected")
@@ -185,21 +185,14 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
     }
 
     // CountVectorizer test
-    val df2 = sqlContext.createDataFrame(Seq(
-      (0, split("a b c d e a b"),
-        Vectors.sparse(5, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0)))),
-      (1, split("a a a a a a"), Vectors.sparse(5, Seq((0, 1.0)))),
-      (2, split("c c"), Vectors.sparse(5, Seq((2, 1.0)))),
-      (3, split("b b b b b"), Vectors.sparse(5, Seq((1, 1.0)))))
-    ).toDF("id", "words", "expected")
     val cv2 = new CountVectorizer()
       .setInputCol("words")
       .setOutputCol("features")
       .setBinary(true)
-      .fit(df2)
-    assert(cv2.vocabulary === Array("a", "b", "c", "d", "e"))
+      .fit(df)
+    assert(cv2.vocabulary === Array("a", "b", "c", "d"))
 
-    cv2.transform(df2).select("features", "expected").collect().foreach {
+    cv2.transform(df).select("features", "expected").collect().foreach {
       case Row(features: Vector, expected: Vector) =>
         assert(features ~== expected absTol 1e-14)
     }

From 5b35fb9f3cba3dd68e3bbdac7d927e312726bb37 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Wed, 6 Apr 2016 11:16:44 -0700
Subject: [PATCH 07/11] remove extra space and assert in the test

---
 .../org/apache/spark/ml/feature/CountVectorizerSuite.scala      | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index 101534ce6187a..e07edc10ac747 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -190,8 +190,6 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
       .setOutputCol("features")
       .setBinary(true)
       .fit(df)
-    assert(cv2.vocabulary === Array("a", "b", "c", "d"))
-
     cv2.transform(df).select("features", "expected").collect().foreach {
       case Row(features: Vector, expected: Vector) =>
         assert(features ~== expected absTol 1e-14)

From de81c35faaf6848d21562a8c131aff723928014d Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Wed, 6 Apr 2016 11:28:23 -0700
Subject: [PATCH 08/11] split long line into 2 lines. remove the annoying extra
 space added by editor again.

---
 .../org/apache/spark/ml/feature/CountVectorizerSuite.scala   | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index e07edc10ac747..e8cd7e48f2646 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -114,7 +114,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
         assert(features ~== expected absTol 1e-14)
     }
   }
- 
+
   test("CountVectorizer throws exception when vocab is empty") {
     intercept[IllegalArgumentException] {
       val df = sqlContext.createDataFrame(Seq(
@@ -170,7 +170,8 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
 
   test("CountVectorizerModel and CountVectorizer with binary") {
     val df = sqlContext.createDataFrame(Seq(
-      (0, split("a a a a b b b b c d"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))),
+      (0, split("a a a a b b b b c d"), 
+      Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))),
       (1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))),
       (2, split("a"), Vectors.sparse(4, Seq((0, 1.0))))
     )).toDF("id", "words", "expected")

From 42fdfee0dd583d73f533b760d3a689c68c562d9f Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Wed, 6 Apr 2016 11:35:12 -0700
Subject: [PATCH 09/11] remove space at the end of line

---
 .../org/apache/spark/ml/feature/CountVectorizerSuite.scala      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index e8cd7e48f2646..97dabe38c51fa 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -170,7 +170,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
 
   test("CountVectorizerModel and CountVectorizer with binary") {
     val df = sqlContext.createDataFrame(Seq(
-      (0, split("a a a a b b b b c d"), 
+      (0, split("a a a a b b b b c d"),
       Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))),
       (1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))),
       (2, split("a"), Vectors.sparse(4, Seq((0, 1.0))))

From e1bffd7609e7cb7d5a36066f799f03bd10b53c94 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Thu, 7 Apr 2016 13:13:47 -0700
Subject: [PATCH 10/11] move setDefault into class; change the order of tests

---
 .../org/apache/spark/ml/feature/CountVectorizer.scala    | 6 ++++--
 .../apache/spark/ml/feature/CountVectorizerSuite.scala   | 9 +++++----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index f1be971a6ae94..1858581702315 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -113,8 +113,6 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
 
   /** @group getParam */
   def getBinary: Boolean = $(binary)
-
-  setDefault(binary -> false)
 }
 
 /**
@@ -147,6 +145,8 @@ class CountVectorizer(override val uid: String)
 
   setDefault(vocabSize -> (1 << 18), minDF -> 1)
 
+  setDefault(binary -> false)
+
   override def fit(dataset: DataFrame): CountVectorizerModel = {
     transformSchema(dataset.schema, logging = true)
     val vocSize = $(vocabSize)
@@ -227,6 +227,8 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
   /** @group setParam */
   def setBinary(value: Boolean): this.type = set(binary, value)
 
+  setDefault(binary -> false)
+
   /** Dictionary created from [[vocabulary]] and its indices, broadcast once for [[transform()]] */
   private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index 97dabe38c51fa..ff0de06e27d01 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -176,21 +176,22 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
       (2, split("a"), Vectors.sparse(4, Seq((0, 1.0))))
     )).toDF("id", "words", "expected")
 
-    val cv = new CountVectorizerModel(Array("a", "b", "c", "d"))
+    // CountVectorizer test
+    val cv = new CountVectorizer()
       .setInputCol("words")
       .setOutputCol("features")
       .setBinary(true)
+      .fit(df)
     cv.transform(df).select("features", "expected").collect().foreach {
       case Row(features: Vector, expected: Vector) =>
         assert(features ~== expected absTol 1e-14)
     }
 
-    // CountVectorizer test
-    val cv2 = new CountVectorizer()
+    // CountVectorizerModel test
+    val cv2 = new CountVectorizerModel(cv.vocabulary)
       .setInputCol("words")
       .setOutputCol("features")
       .setBinary(true)
-      .fit(df)
     cv2.transform(df).select("features", "expected").collect().foreach {
       case Row(features: Vector, expected: Vector) =>
         assert(features ~== expected absTol 1e-14)

From e693c60e8059976ec179c298453017a486792712 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Fri, 8 Apr 2016 09:38:46 -0700
Subject: [PATCH 11/11] move setDefault back to trait

---
 .../scala/org/apache/spark/ml/feature/CountVectorizer.scala | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index 1858581702315..f1be971a6ae94 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -113,6 +113,8 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
 
   /** @group getParam */
   def getBinary: Boolean = $(binary)
+
+  setDefault(binary -> false)
 }
 
 /**
@@ -145,8 +147,6 @@ class CountVectorizer(override val uid: String)
 
   setDefault(vocabSize -> (1 << 18), minDF -> 1)
 
-  setDefault(binary -> false)
-
   override def fit(dataset: DataFrame): CountVectorizerModel = {
     transformSchema(dataset.schema, logging = true)
     val vocSize = $(vocabSize)
@@ -227,8 +227,6 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
   /** @group setParam */
   def setBinary(value: Boolean): this.type = set(binary, value)
 
-  setDefault(binary -> false)
-
   /** Dictionary created from [[vocabulary]] and its indices, broadcast once for [[transform()]] */
   private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None