Commit
improve according to comments
hhbyyh committed Aug 28, 2015
1 parent 4e37227 commit 007c369
Showing 1 changed file with 58 additions and 30 deletions: docs/ml-features.md

## CountVectorizer

`CountVectorizer` and `CountVectorizerModel` aim to help convert a collection of text documents
to vectors of token counts. When an a-priori dictionary is not available, `CountVectorizer` can
be used as an `Estimator` to extract the vocabulary and generate a `CountVectorizerModel`. The
model produces sparse representations for the documents over the vocabulary, which can then be
passed to other algorithms like LDA.

During the fitting process, `CountVectorizer` will select the top `vocabSize` words ordered by
term frequency across the corpus. An optional parameter `minDF` also affects the fitting process
by specifying the minimum number (or fraction if < 1.0) of documents a term must appear in to be
included in the vocabulary.
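As a small illustrative sketch (not part of the example below), `minDF` can be set either as an
absolute document count or, when the value is below 1.0, as a fraction of the corpus:

{% highlight scala %}
import org.apache.spark.ml.feature.CountVectorizer

// a term must appear in at least 2 documents
val cvByCount = new CountVectorizer().setMinDF(2)
// a term must appear in at least half of the documents
val cvByFraction = new CountVectorizer().setMinDF(0.5)
{% endhighlight %}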

**Examples**

Assume that we have the following DataFrame with columns `id` and `texts`:

~~~~
id | texts
----|----------
0 | Array("a", "b", "c")
1 | Array("a", "b", "b", "c", "a")
~~~~

Each row in `texts` is a document of type `Array[String]`.
Calling fit on `CountVectorizer` produces a `CountVectorizerModel` with vocabulary (a, b, c);
after transformation, the output column "vector" contains:

~~~~
id | texts | vector
----|---------------------------------|---------------
0 | Array("a", "b", "c") | (3,[0,1,2],[1.0,1.0,1.0])
1 | Array("a", "b", "b", "c", "a") | (3,[0,1,2],[2.0,2.0,1.0])
~~~~

Each vector represents the token counts of the document over the vocabulary.
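For reference, an entry such as `(3,[0,1,2],[1.0,1.0,1.0])` is the compact form of a sparse
vector: its size, the indices with non-zero values, and the corresponding counts. As a small
illustrative sketch (not part of the original example), the vector for document 0 could be
built directly as:

{% highlight scala %}
import org.apache.spark.mllib.linalg.Vectors

// size 3, counts 1.0 at indices 0, 1 and 2
val counts = Vectors.sparse(3, Array(0, 1, 2), Array(1.0, 1.0, 1.0))
{% endhighlight %}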

<div class="codetabs">
<div data-lang="scala" markdown="1">
{% highlight scala %}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}

val df = sqlContext.createDataFrame(Seq(
  (0, Array("a", "b", "c")),
  (1, Array("a", "b", "b", "c", "a"))
)).toDF("id", "words")

// fit a CountVectorizerModel from the corpus
val cvModel: CountVectorizerModel = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setVocabSize(3)
  .setMinDF(2) // a term must appear in at least 2 documents to be included in the vocabulary
  .fit(df)

// alternatively, define CountVectorizerModel with a-priori vocabulary
val cvm = new CountVectorizerModel(Array("a", "b", "c"))
  .setInputCol("words")
  .setOutputCol("features")

cvModel.transform(df).select("features").show()
{% endhighlight %}
</div>

<div data-lang="java" markdown="1">
{% highlight java %}
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.CountVectorizer;
import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.*;

// Input data: Each row is a bag of words from a sentence or document.
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
  RowFactory.create(Arrays.asList("a", "b", "c")),
  RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
));
StructType schema = new StructType(new StructField[]{
  new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
});
DataFrame df = sqlContext.createDataFrame(jrdd, schema);

// fit a CountVectorizerModel from the corpus
CountVectorizerModel cvModel = new CountVectorizer()
  .setInputCol("text")
  .setOutputCol("feature")
  .setVocabSize(3)
  .setMinDF(2) // a term must appear in at least 2 documents to be included in the vocabulary
  .fit(df);

// alternatively, define CountVectorizerModel with a-priori vocabulary
CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
  .setInputCol("text")
  .setOutputCol("feature");

cvModel.transform(df).show();
{% endhighlight %}
</div>
</div>
