From 0f1fa34198459e32cb5099a0720e8d4bf053b33e Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 8 Sep 2015 10:07:57 +0800 Subject: [PATCH 1/3] add python for countVectorizer --- python/pyspark/ml/feature.py | 133 +++++++++++++++++++++++++++++++++-- 1 file changed, 127 insertions(+), 6 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index d955307e27efd..302048dc1eaa8 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -26,12 +26,12 @@ from pyspark.mllib.common import inherit_doc from pyspark.mllib.linalg import _convert_to_vector -__all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', - 'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', - 'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer', - 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', - 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel', 'StopWordsRemover'] - +__all__ = ['Binarizer', 'Bucketizer', 'CountVectorizer', 'CountVectorizerModel', 'DCT', + 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', + 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'SQLTransformer', + 'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', + 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', + 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel', 'StopWordsRemover'] @inherit_doc class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): @@ -166,6 +166,127 @@ def getSplits(self): return self.getOrDefault(self.splits) +@inherit_doc +class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol): + """ + Extracts a vocabulary from document collections and generates a [[CountVectorizerModel]], + which converts text documents to sparse vectors of token counts. + + >>> df = sentenceData = sqlContext.createDataFrame([(0, ["a", "b", "c"]), + ... (1, ["a", "b", "b", "c", "a"])], ["label", "raw"]) + >>> cv = CountVectorizer(inputCol="raw", outputCol="vectors") + >>> model = cv.fit(df) + >>> model.transform(sentenceData).show(truncate=False) + +-----+---------------+-------------------------+ + |label|raw |vectors | + +-----+---------------+-------------------------+ + |0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])| + |1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])| + +-----+---------------+-------------------------+ + ... + """ + + # a placeholder to make it appear in the generated doc + vocabSize = Param(Params._dummy(), "vocabSize", "max size of the vocabulary") + minDF = Param(Params._dummy(), "minDF", "Specifies the minimum number of different documents" + " a term must appear in to be included in the vocabulary. If this is an integer" + " >= 1, this specifies the number of documents the term must appear in; if this" + " is a double in [0,1), then this specifies the fraction of documents.") + + minTF = Param(Params._dummy(), "minTF", "Filter to ignore rare words in a document. For each" + " document, terms with frequency/count less than the given threshold are ignored" + ". If this is an integer >= 1, then this specifies a count (of times the term " + "must appear in the document); if this is a double in [0,1), then this specifies" + " a fraction (out of the document's token count). Note that the parameter is " + "only used in transform of CountVectorizerModel and does not affect fitting.") + + @keyword_only + def __init__(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None): + """ + __init__(self, minDF=1.0, minTF=1.0, vocabSize=1<<18, inputCol=None, outputCol=None) + """ + super(CountVectorizer, self).__init__() + self._java_obj = self._new_java_obj( + "org.apache.spark.ml.feature.CountVectorizer", self.uid) + self.minDF = Param(self, "minDF", "Specifies the minimum number of different documents" + " a term must appear in to be included in the vocabulary. If this is an integer" + " >= 1, this specifies the number of documents the term must appear in; if this" + " is a double in [0,1), then this specifies the fraction of documents.") + self.minTF = Param(self, "minTF", "Filter to ignore rare words in a document. For each" + " document, terms with frequency/count less than the given threshold are ignored" + ". If this is an integer >= 1, then this specifies a count (of times the term " + "must appear in the document); if this is a double in [0,1), then this specifies" + " a fraction (out of the document's token count). Note that the parameter is " + "only used in transform of CountVectorizerModel and does not affect fitting.") + self.vocabSize = Param(self, "vocabSize", "max size of the vocabulary") + self._setDefault(minDF=1.0, minTF=1.0, vocabSize=1 << 18) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, minDF=1, minTF=1, vocabSize=1<<18, inputCol=None, outputCol=None): + """ + setParams(self, minDF=1, minTF=1, vocabSize=1<<18, inputCol=None, outputCol=None) + Sets params for this CountVectorizer. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setMinDF(self, value): + """ + Sets the value of :py:attr:`minDF`. + """ + self._paramMap[self.minDF] = value + return self + + def getMinDF(self): + """ + Gets the value of minDF or its default value. + """ + return self.getOrDefault(self.minDF) + + def setMinTF(self, value): + """ + Sets the value of :py:attr:`minTF`. + """ + self._paramMap[self.minTF] = value + return self + + def getMinTF(self): + """ + Gets the value of minTF or its default value. + """ + return self.getOrDefault(self.minTF) + + def setVocabSize(self, value): + """ + Sets the value of :py:attr:`vocabSize`. + """ + self._paramMap[self.vocabSize] = value + return self + + def getVocabSize(self): + """ + Gets the value of vocabSize or its default value. + """ + return self.getOrDefault(self.vocabSize) + + def _create_model(self, java_model): + return CountVectorizerModel(java_model) + + +class CountVectorizerModel(JavaModel): + """ + Model fitted by CountVectorizer. Converts a text document to a sparse vector of token counts. + """ + + @property + def vocabulary(self): + """ + An Array over terms. Only the terms in the vocabulary will be counted. + """ + return self._call_java("vocabulary") + @inherit_doc class DCT(JavaTransformer, HasInputCol, HasOutputCol): """ From d22ba5a997aef2f0a21c97bcb2ab2ed7226f770b Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 8 Sep 2015 15:05:22 +0800 Subject: [PATCH 2/3] style fix --- python/pyspark/ml/feature.py | 49 ++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 302048dc1eaa8..056055baa4505 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -188,16 +188,18 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol): # a placeholder to make it appear in the generated doc vocabSize = Param(Params._dummy(), "vocabSize", "max size of the vocabulary") - minDF = Param(Params._dummy(), "minDF", "Specifies the minimum number of different documents" - " a term must appear in to be included in the vocabulary. If this is an integer" - " >= 1, this specifies the number of documents the term must appear in; if this" - " is a double in [0,1), then this specifies the fraction of documents.") - - minTF = Param(Params._dummy(), "minTF", "Filter to ignore rare words in a document. For each" - " document, terms with frequency/count less than the given threshold are ignored" - ". If this is an integer >= 1, then this specifies a count (of times the term " - "must appear in the document); if this is a double in [0,1), then this specifies" - " a fraction (out of the document's token count). Note that the parameter is " + minDF = Param(Params._dummy(), "minDF", + "Specifies the minimum number of different documents a term must appear in " + + "to be included in the vocabulary. If this is an integer >= 1, this specifies " + + "the number of documents the term must appear in; if this is a double in " + + "[0,1), then this specifies the fraction of documents.") + + minTF = Param(Params._dummy(), "minTF", + "Filter to ignore rare words in a document. For each document, terms with " + + "frequency/count less than the given threshold are ignored. If this is an " + + "integer >= 1, then this specifies a count (of times the term must appear in" + + " the document); if this is a double in [0,1), then this specifies a " + + "fraction (out of the document's token count). Note that the parameter is " + "only used in transform of CountVectorizerModel and does not affect fitting.") @keyword_only @@ -208,15 +210,19 @@ def __init__(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outpu super(CountVectorizer, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.feature.CountVectorizer", self.uid) - self.minDF = Param(self, "minDF", "Specifies the minimum number of different documents" - " a term must appear in to be included in the vocabulary. If this is an integer" - " >= 1, this specifies the number of documents the term must appear in; if this" - " is a double in [0,1), then this specifies the fraction of documents.") - self.minTF = Param(self, "minTF", "Filter to ignore rare words in a document. For each" - " document, terms with frequency/count less than the given threshold are ignored" - ". If this is an integer >= 1, then this specifies a count (of times the term " - "must appear in the document); if this is a double in [0,1), then this specifies" - " a fraction (out of the document's token count). Note that the parameter is " + self.minDF = \ + Param(self, "minDF", + "Specifies the minimum number of different documents a term must appear in " + + "to be included in the vocabulary. If this is an integer >= 1, this specifies " + + "the number of documents the term must appear in; if this is a double in " + + "[0,1), then this specifies the fraction of documents.") + self.minTF = \ + Param(self, "minTF", + "Filter to ignore rare words in a document. For each document, terms with " + + "frequency/count less than the given threshold are ignored. If this is an " + + "integer >= 1, then this specifies a count (of times the term must appear in" + + " the document); if this is a double in [0,1), then this specifies a " + + "fraction (out of the document's token count). Note that the parameter is " + "only used in transform of CountVectorizerModel and does not affect fitting.") self.vocabSize = Param(self, "vocabSize", "max size of the vocabulary") self._setDefault(minDF=1.0, minTF=1.0, vocabSize=1 << 18) @@ -224,9 +230,9 @@ def __init__(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outpu self.setParams(**kwargs) @keyword_only - def setParams(self, minDF=1, minTF=1, vocabSize=1<<18, inputCol=None, outputCol=None): + def setParams(self, minDF=1, minTF=1, vocabSize=1 << 18, inputCol=None, outputCol=None): """ - setParams(self, minDF=1, minTF=1, vocabSize=1<<18, inputCol=None, outputCol=None) + setParams(self, minDF=1, minTF=1, vocabSize=1 << 18, inputCol=None, outputCol=None) Sets params for this CountVectorizer. """ kwargs = self.setParams._input_kwargs @@ -287,6 +293,7 @@ def vocabulary(self): """ return self._call_java("vocabulary") + @inherit_doc class DCT(JavaTransformer, HasInputCol, HasOutputCol): """ From dd0e933269832645f35c42a59d4d41ec4ef7f3fb Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 8 Sep 2015 15:41:17 +0800 Subject: [PATCH 3/3] add an extra line --- python/pyspark/ml/feature.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 056055baa4505..ab05f84e935ae 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -33,6 +33,7 @@ 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel', 'StopWordsRemover'] + @inherit_doc class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): """