[SPARK-8703] [ML] Add CountVectorizer as an ML transformer to convert documents to word count vectors #7084

Closed · wants to merge 9 commits
@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.ml.feature

import scala.collection.mutable

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.{ParamMap, ParamValidators, IntParam}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.linalg.{Vectors, VectorUDT, Vector}
import org.apache.spark.sql.types.{StringType, ArrayType, DataType}

/**
* :: Experimental ::
* Converts a text document to a sparse vector of token counts.
* @param vocabulary An Array of terms. Only terms in the vocabulary will be counted.
*/
@Experimental
class CountVectorizer (override val uid: String, vocabulary: Array[String])
Member:

Should we make `vocabulary` a `val`? That would be useful when we add an Estimator version that lets users access the dictionary.

Contributor Author:

Great point!

extends UnaryTransformer[Seq[String], Vector, CountVectorizer] {

def this(vocabulary: Array[String]) = this(Identifiable.randomUID("countVectorizer"), vocabulary)
Contributor:

This is probably fine for now, but I had some thoughts about adding an empty constructor that includes every word encountered when no vocabulary is provided. If that requires significant modification, we should open a separate JIRA for it.
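The suggestion above could be sketched as follows (plain Scala, no Spark dependency; `buildVocabulary` is a hypothetical helper for illustration, not part of this patch):

```scala
// Infer a vocabulary from a corpus when none is provided: collect every
// distinct term encountered, in a deterministic (sorted) order.
def buildVocabulary(corpus: Seq[Seq[String]]): Array[String] =
  corpus.flatten.distinct.sorted.toArray
```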


/**
* Corpus-specific stop words filter. Terms with count less than the given threshold are ignored.
* Default: 1
* @group param
*/
val minTermCounts: IntParam = new IntParam(this, "minTermCounts",
Member:

We should call this "minTermFreq". I agree that "count" is more accurate than "frequency," but when I discussed this previously with others, we decided to go with "frequency" since that is the common term in the literature and other libraries. I say "Freq" instead of "Frequency" to match IDF's param name.

Also, using "term frequency" in the doc should clarify that this threshold operates per-document.

Contributor Author:

Thanks for the careful consideration.

"lower bound of effective term counts (>= 1)", ParamValidators.gtEq(1))
Member:

I prefer long built-in documentation. Please make this built-in doc have as much detail as the Scala doc.


/** @group setParam */
def setMinTermCounts(value: Int): this.type = set(minTermCounts, value)

/** @group getParam */
def getMinTermCounts: Int = $(minTermCounts)

setDefault(minTermCounts -> 1)

override protected def createTransformFunc: Seq[String] => Vector = {
val dict = vocabulary.zipWithIndex.toMap
document =>
val termCounts = mutable.HashMap.empty[Int, Double]
document.foreach { term =>
dict.get(term) match {
case Some(index) => termCounts.put(index, termCounts.getOrElse(index, 0.0) + 1.0)
case None => // ignore terms not in the vocabulary
}
}
Vectors.sparse(dict.size, termCounts.filter(_._2 >= $(minTermCounts)).toSeq)
Contributor:

Perhaps do the filter before initializing Vectors.sparse to avoid trailing zeros if the filter result size is < dict.size?

Contributor Author:

Not sure I get this. Do you mean we should alter the size of the sparse vector dynamically? The vectors after transformation should have the same size for many algorithms to work properly.

Contributor:

Good point; the trailing zeros should be pretty cheap with a sparse Vector anyway, so this seems fine to me.

}

override protected def validateInputType(inputType: DataType): Unit = {
require(inputType.sameType(ArrayType(StringType)),
s"Input type must be Array type but got $inputType.")
Member:

Please state the precise type, ArrayType(StringType), or better yet use SchemaUtils.checkColumnType.

}

override protected def outputDataType: DataType = new VectorUDT()

override def copy(extra: ParamMap): CountVectorizer = {
val copied = new CountVectorizer(uid, vocabulary)
copyValues(copied, extra)
}
}
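The core of `createTransformFunc` above can be exercised without a Spark context. The sketch below reproduces the counting and min-count filtering with a plain `Map` standing in for `Vectors.sparse` (`countTerms` is a hypothetical helper for illustration, not part of this patch):

```scala
import scala.collection.mutable

// Count occurrences of vocabulary terms in a document, dropping terms whose
// count falls below minTermCount (mirrors the minTermCounts param above).
def countTerms(vocabulary: Array[String], document: Seq[String], minTermCount: Int): Map[Int, Double] = {
  val dict = vocabulary.zipWithIndex.toMap
  val counts = mutable.HashMap.empty[Int, Double]
  document.foreach { term =>
    dict.get(term).foreach { index => // terms not in the vocabulary are ignored
      counts.put(index, counts.getOrElse(index, 0.0) + 1.0)
    }
  }
  // Indices absent from the result are implicit zeros, just as in a sparse vector
  // of fixed size dict.size.
  counts.filter(_._2 >= minTermCount).toMap
}
```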
@@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext {

test("params") {
ParamsSuite.checkParams(new CountVectorizer(Array("empty")))
}

test("CountVectorizer common cases") {
val df = sqlContext.createDataFrame(Seq(
(0, "a b c d".split(" ").toSeq),
(1, "a b b c d a".split(" ").toSeq),
(2, "a".split(" ").toSeq),
(3, "".split(" ").toSeq), // empty string
(3, "a notInDict d".split(" ").toSeq) // with words not in vocabulary
Member:

duplicate id

)).toDF("id", "words")
val cv = new CountVectorizer(Array("a", "b", "c", "d"))
.setInputCol("words")
.setOutputCol("features")
val output = cv.transform(df)
val features = output.select("features").collect()

val expected = Seq(
Member:

Can this be matched with the doc ID to make sure the order isn't switched around? An easier way would be to include the expected results in the original DataFrame.

same for other test

Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0))),
Vectors.sparse(4, Seq((0, 2.0), (1, 2.0), (2, 1.0), (3, 1.0))),
Vectors.sparse(4, Seq((0, 1.0))),
Vectors.sparse(4, Seq()),
Vectors.sparse(4, Seq((0, 1.0), (3, 1.0))))

features.zip(expected).foreach(p =>
Member:

scala style: use braces {} instead of parentheses for multiline closures

same for other test

assert(p._1.getAs[Vector](0) ~== p._2 absTol 1e-14)
)
}

test("CountVectorizer with minTermCounts") {
val df = sqlContext.createDataFrame(Seq(
(0, "a a a b b c c c d ".split(" ").toSeq),
(1, "c c c c c c".split(" ").toSeq),
(2, "a".split(" ").toSeq),
(3, "e e e e e".split(" ").toSeq)
)).toDF("id", "words")
val cv = new CountVectorizer(Array("a", "b", "c", "d"))
.setInputCol("words")
.setOutputCol("features")
.setMinTermCounts(3)
val output = cv.transform(df)
val features = output.select("features").collect()

val expected = Seq(
Vectors.sparse(4, Seq((0, 3.0), (2, 3.0))),
Vectors.sparse(4, Seq((2, 6.0))),
Vectors.sparse(4, Seq()),
Vectors.sparse(4, Seq()))

features.zip(expected).foreach(p =>
assert(p._1.getAs[Vector](0) ~== p._2 absTol 1e-14)
)
}
}
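The review suggestion above, keeping the expected result in the same row as the input words so assertions can never pair rows out of order, could look like this (plain Scala stand-in for the DataFrame-based test; `Doc` and `count` are hypothetical names, with a `Map` of counts in place of mllib `Vectors`):

```scala
// Carry the expected counts alongside each document's words.
case class Doc(id: Int, words: Seq[String], expected: Map[Int, Double])

// Minimal count-by-vocabulary-index, mirroring the transformer's logic.
def count(vocab: Array[String], words: Seq[String]): Map[Int, Double] = {
  val dict = vocab.zipWithIndex.toMap
  words.flatMap(dict.get).groupBy(identity).map { case (i, xs) => i -> xs.size.toDouble }
}

val vocab = Array("a", "b")
val docs = Seq(
  Doc(0, Seq("a", "b", "b"), Map(0 -> 1.0, 1 -> 2.0)),
  Doc(1, Seq("a", "x"), Map(0 -> 1.0)) // "x" is not in the vocabulary
)
```

Each assertion can then report which document failed, e.g. `assert(count(vocab, d.words) == d.expected, s"doc ${d.id}")`.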