From 0f1fa34198459e32cb5099a0720e8d4bf053b33e Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Tue, 8 Sep 2015 10:07:57 +0800
Subject: [PATCH 1/3] add python for countVectorizer

---
 python/pyspark/ml/feature.py | 133 +++++++++++++++++++++++++++++++++--
 1 file changed, 127 insertions(+), 6 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index d955307e27efd..302048dc1eaa8 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -26,12 +26,12 @@
 from pyspark.mllib.common import inherit_doc
 from pyspark.mllib.linalg import _convert_to_vector
 
-__all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
-           'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer',
-           'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer',
-           'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec',
-           'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel', 'StopWordsRemover']
-
+__all__ = ['Binarizer', 'Bucketizer', 'CountVectorizer', 'CountVectorizerModel', 'DCT',
+           'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer',
+           'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'SQLTransformer',
+           'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel',
+           'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel',
+           'PCA', 'PCAModel', 'RFormula', 'RFormulaModel', 'StopWordsRemover']
 
 @inherit_doc
 class Binarizer(JavaTransformer, HasInputCol, HasOutputCol):
@@ -166,6 +166,127 @@ def getSplits(self):
         return self.getOrDefault(self.splits)
 
 
+@inherit_doc
+class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol):
+    """
+    Extracts a vocabulary from document collections and generates a
+    :py:class:`CountVectorizerModel`, which converts text documents to sparse token-count vectors.
+
+    >>> df = sqlContext.createDataFrame([(0, ["a", "b", "c"]),
+    ... (1, ["a", "b", "b", "c", "a"])], ["label", "raw"])
+    >>> cv = CountVectorizer(inputCol="raw", outputCol="vectors")
+    >>> model = cv.fit(df)
+    >>> model.transform(df).show(truncate=False)
+    +-----+---------------+-------------------------+
+    |label|raw            |vectors                  |
+    +-----+---------------+-------------------------+
+    |0    |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
+    |1    |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+    +-----+---------------+-------------------------+
+    ...
+    """
+
+    # a placeholder to make it appear in the generated doc
+    vocabSize = Param(Params._dummy(), "vocabSize", "max size of the vocabulary")
+    minDF = Param(Params._dummy(), "minDF", "Specifies the minimum number of different documents"
+                  " a term must appear in to be included in the vocabulary. If this is an integer"
+                  " >= 1, this specifies the number of documents the term must appear in; if this"
+                  " is a double in [0,1), then this specifies the fraction of documents.")
+
+    minTF = Param(Params._dummy(), "minTF", "Filter to ignore rare words in a document. For each"
+                  " document, terms with frequency/count less than the given threshold are ignored"
+                  ". If this is an integer >= 1, then this specifies a count (of times the term "
+                  "must appear in the document); if this is a double in [0,1), then this specifies"
+                  " a fraction (out of the document's token count). Note that the parameter is "
+                  "only used in transform of CountVectorizerModel and does not affect fitting.")
+
+    @keyword_only
+    def __init__(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None):
+        """
+        __init__(self, minDF=1.0, minTF=1.0, vocabSize=1<<18, inputCol=None, outputCol=None)
+        """
+        super(CountVectorizer, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.feature.CountVectorizer", self.uid)
+        self.minDF = Param(self, "minDF", "Specifies the minimum number of different documents"
+                  " a term must appear in to be included in the vocabulary. If this is an integer"
+                  " >= 1, this specifies the number of documents the term must appear in; if this"
+                  " is a double in [0,1), then this specifies the fraction of documents.")
+        self.minTF = Param(self, "minTF", "Filter to ignore rare words in a document. For each"
+                  " document, terms with frequency/count less than the given threshold are ignored"
+                  ". If this is an integer >= 1, then this specifies a count (of times the term "
+                  "must appear in the document); if this is a double in [0,1), then this specifies"
+                  " a fraction (out of the document's token count). Note that the parameter is "
+                  "only used in transform of CountVectorizerModel and does not affect fitting.")
+        self.vocabSize = Param(self, "vocabSize", "max size of the vocabulary")
+        self._setDefault(minDF=1.0, minTF=1.0, vocabSize=1 << 18)
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, minDF=1, minTF=1, vocabSize=1<<18, inputCol=None, outputCol=None):
+        """
+        setParams(self, minDF=1, minTF=1, vocabSize=1<<18, inputCol=None, outputCol=None)
+        Sets params for this CountVectorizer.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setMinDF(self, value):
+        """
+        Sets the value of :py:attr:`minDF`.
+        """
+        self._paramMap[self.minDF] = value
+        return self
+
+    def getMinDF(self):
+        """
+        Gets the value of minDF or its default value.
+        """
+        return self.getOrDefault(self.minDF)
+
+    def setMinTF(self, value):
+        """
+        Sets the value of :py:attr:`minTF`.
+        """
+        self._paramMap[self.minTF] = value
+        return self
+
+    def getMinTF(self):
+        """
+        Gets the value of minTF or its default value.
+        """
+        return self.getOrDefault(self.minTF)
+
+    def setVocabSize(self, value):
+        """
+        Sets the value of :py:attr:`vocabSize`.
+        """
+        self._paramMap[self.vocabSize] = value
+        return self
+
+    def getVocabSize(self):
+        """
+        Gets the value of vocabSize or its default value.
+        """
+        return self.getOrDefault(self.vocabSize)
+
+    def _create_model(self, java_model):
+        return CountVectorizerModel(java_model)
+
+
+class CountVectorizerModel(JavaModel):
+    """
+    Model fitted by CountVectorizer. Converts a text document to a sparse vector of token counts.
+    """
+
+    @property
+    def vocabulary(self):
+        """
+        An array of terms. Only the terms in the vocabulary will be counted.
+        """
+        return self._call_java("vocabulary")
+
 @inherit_doc
 class DCT(JavaTransformer, HasInputCol, HasOutputCol):
     """

From d22ba5a997aef2f0a21c97bcb2ab2ed7226f770b Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Tue, 8 Sep 2015 15:05:22 +0800
Subject: [PATCH 2/3] style fix

---
 python/pyspark/ml/feature.py | 49 ++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 302048dc1eaa8..056055baa4505 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -188,16 +188,18 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol):
 
     # a placeholder to make it appear in the generated doc
     vocabSize = Param(Params._dummy(), "vocabSize", "max size of the vocabulary")
-    minDF = Param(Params._dummy(), "minDF", "Specifies the minimum number of different documents"
-                  " a term must appear in to be included in the vocabulary. If this is an integer"
-                  " >= 1, this specifies the number of documents the term must appear in; if this"
-                  " is a double in [0,1), then this specifies the fraction of documents.")
-
-    minTF = Param(Params._dummy(), "minTF", "Filter to ignore rare words in a document. For each"
-                  " document, terms with frequency/count less than the given threshold are ignored"
-                  ". If this is an integer >= 1, then this specifies a count (of times the term "
-                  "must appear in the document); if this is a double in [0,1), then this specifies"
-                  " a fraction (out of the document's token count). Note that the parameter is "
+    minDF = Param(Params._dummy(), "minDF",
+                  "Specifies the minimum number of different documents a term must appear in " +
+                  "to be included in the vocabulary. If this is an integer >= 1, this specifies " +
+                  "the number of documents the term must appear in; if this is a double in " +
+                  "[0,1), then this specifies the fraction of documents.")
+
+    minTF = Param(Params._dummy(), "minTF",
+                  "Filter to ignore rare words in a document. For each document, terms with " +
+                  "frequency/count less than the given threshold are ignored. If this is an " +
+                  "integer >= 1, then this specifies a count (of times the term must appear in" +
+                  " the document); if this is a double in [0,1), then this specifies a " +
+                  "fraction (out of the document's token count). Note that the parameter is " +
                   "only used in transform of CountVectorizerModel and does not affect fitting.")
 
     @keyword_only
@@ -208,15 +210,19 @@ def __init__(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outpu
         super(CountVectorizer, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.feature.CountVectorizer", self.uid)
-        self.minDF = Param(self, "minDF", "Specifies the minimum number of different documents"
-                  " a term must appear in to be included in the vocabulary. If this is an integer"
-                  " >= 1, this specifies the number of documents the term must appear in; if this"
-                  " is a double in [0,1), then this specifies the fraction of documents.")
-        self.minTF = Param(self, "minTF", "Filter to ignore rare words in a document. For each"
-                  " document, terms with frequency/count less than the given threshold are ignored"
-                  ". If this is an integer >= 1, then this specifies a count (of times the term "
-                  "must appear in the document); if this is a double in [0,1), then this specifies"
-                  " a fraction (out of the document's token count). Note that the parameter is "
+        self.minDF = \
+            Param(self, "minDF",
+                  "Specifies the minimum number of different documents a term must appear in " +
+                  "to be included in the vocabulary. If this is an integer >= 1, this specifies " +
+                  "the number of documents the term must appear in; if this is a double in " +
+                  "[0,1), then this specifies the fraction of documents.")
+        self.minTF = \
+            Param(self, "minTF",
+                  "Filter to ignore rare words in a document. For each document, terms with " +
+                  "frequency/count less than the given threshold are ignored. If this is an " +
+                  "integer >= 1, then this specifies a count (of times the term must appear in" +
+                  " the document); if this is a double in [0,1), then this specifies a " +
+                  "fraction (out of the document's token count). Note that the parameter is " +
                   "only used in transform of CountVectorizerModel and does not affect fitting.")
         self.vocabSize = Param(self, "vocabSize", "max size of the vocabulary")
         self._setDefault(minDF=1.0, minTF=1.0, vocabSize=1 << 18)
@@ -224,9 +230,9 @@ def __init__(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outpu
         self.setParams(**kwargs)
 
     @keyword_only
-    def setParams(self, minDF=1, minTF=1, vocabSize=1<<18, inputCol=None, outputCol=None):
+    def setParams(self, minDF=1, minTF=1, vocabSize=1 << 18, inputCol=None, outputCol=None):
         """
-        setParams(self, minDF=1, minTF=1, vocabSize=1<<18, inputCol=None, outputCol=None)
+        setParams(self, minDF=1, minTF=1, vocabSize=1 << 18, inputCol=None, outputCol=None)
         Sets params for this CountVectorizer.
         """
         kwargs = self.setParams._input_kwargs
@@ -287,6 +293,7 @@ def vocabulary(self):
         """
         return self._call_java("vocabulary")
 
+
 @inherit_doc
 class DCT(JavaTransformer, HasInputCol, HasOutputCol):
     """

From dd0e933269832645f35c42a59d4d41ec4ef7f3fb Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Tue, 8 Sep 2015 15:41:17 +0800
Subject: [PATCH 3/3] add an extra line

---
 python/pyspark/ml/feature.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 056055baa4505..ab05f84e935ae 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -33,6 +33,7 @@
            'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel',
            'PCA', 'PCAModel', 'RFormula', 'RFormulaModel', 'StopWordsRemover']
 
+
 @inherit_doc
 class Binarizer(JavaTransformer, HasInputCol, HasOutputCol):
     """