-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-10482] [ML] Add Python interface for ml.CountVectorizer #8650
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,11 +26,12 @@ | |
from pyspark.mllib.common import inherit_doc | ||
from pyspark.mllib.linalg import _convert_to_vector | ||
|
||
__all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', | ||
'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', | ||
'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer', | ||
'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', | ||
'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel', 'StopWordsRemover'] | ||
__all__ = ['Binarizer', 'Bucketizer', 'CountVectorizer', 'CountVectorizerModel', 'DCT', | ||
'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', | ||
'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'SQLTransformer', | ||
'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', | ||
'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', | ||
'PCA', 'PCAModel', 'RFormula', 'RFormulaModel', 'StopWordsRemover'] | ||
|
||
|
||
@inherit_doc | ||
|
@@ -166,6 +167,134 @@ def getSplits(self): | |
return self.getOrDefault(self.splits) | ||
|
||
|
||
@inherit_doc
class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol):
    """
    Extracts a vocabulary from document collections and generates a
    :py:class:`CountVectorizerModel`, which converts text documents to sparse vectors of
    token counts.

    >>> df = sqlContext.createDataFrame(
    ...     [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])],
    ...     ["label", "raw"])
    >>> cv = CountVectorizer(inputCol="raw", outputCol="vectors")
    >>> model = cv.fit(df)
    >>> model.transform(df).show(truncate=False)
    +-----+---------------+-------------------------+
    |label|raw            |vectors                  |
    +-----+---------------+-------------------------+
    |0    |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
    |1    |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
    +-----+---------------+-------------------------+
    ...
    """

    # Param descriptions are written once and shared by the class-level placeholders and
    # the instance-level params created in __init__, so the two copies cannot drift apart.
    # The names are private (leading underscore) so they are not mistaken for params.
    _minDF_doc = (
        "Specifies the minimum number of different documents a term must appear in "
        "to be included in the vocabulary. If this is an integer >= 1, this specifies "
        "the number of documents the term must appear in; if this is a double in "
        "[0,1), then this specifies the fraction of documents.")
    _minTF_doc = (
        "Filter to ignore rare words in a document. For each document, terms with "
        "frequency/count less than the given threshold are ignored. If this is an "
        "integer >= 1, then this specifies a count (of times the term must appear in"
        " the document); if this is a double in [0,1), then this specifies a "
        "fraction (out of the document's token count). Note that the parameter is "
        "only used in transform of CountVectorizerModel and does not affect fitting.")
    _vocabSize_doc = "max size of the vocabulary"

    # a placeholder to make it appear in the generated doc
    vocabSize = Param(Params._dummy(), "vocabSize", _vocabSize_doc)
    minDF = Param(Params._dummy(), "minDF", _minDF_doc)
    minTF = Param(Params._dummy(), "minTF", _minTF_doc)

    @keyword_only
    def __init__(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None):
        """
        __init__(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None)
        """
        super(CountVectorizer, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.feature.CountVectorizer", self.uid)
        # Re-create the params with this instance as parent; the class-level ones are only
        # placeholders for the generated doc.
        self.minDF = Param(self, "minDF", self._minDF_doc)
        self.minTF = Param(self, "minTF", self._minTF_doc)
        self.vocabSize = Param(self, "vocabSize", self._vocabSize_doc)
        self._setDefault(minDF=1.0, minTF=1.0, vocabSize=1 << 18)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None):
        """
        setParams(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None)
        Sets params for this CountVectorizer.
        """
        # Defaults in the signature mirror __init__ and _setDefault (floats, not ints).
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setMinDF(self, value):
        """
        Sets the value of :py:attr:`minDF`.
        """
        self._paramMap[self.minDF] = value
        return self

    def getMinDF(self):
        """
        Gets the value of minDF or its default value.
        """
        return self.getOrDefault(self.minDF)

    def setMinTF(self, value):
        """
        Sets the value of :py:attr:`minTF`.
        """
        self._paramMap[self.minTF] = value
        return self

    def getMinTF(self):
        """
        Gets the value of minTF or its default value.
        """
        return self.getOrDefault(self.minTF)

    def setVocabSize(self, value):
        """
        Sets the value of :py:attr:`vocabSize`.
        """
        self._paramMap[self.vocabSize] = value
        return self

    def getVocabSize(self):
        """
        Gets the value of vocabSize or its default value.
        """
        return self.getOrDefault(self.vocabSize)

    def _create_model(self, java_model):
        # Wrap the fitted Java model in its Python counterpart.
        return CountVectorizerModel(java_model)
|
||
|
||
class CountVectorizerModel(JavaModel):
    """
    Model fitted by :py:class:`CountVectorizer`. Converts a text document to a sparse
    vector of token counts.
    """

    @property
    def vocabulary(self):
        """
        The terms of the vocabulary, fetched from the underlying Java model.
        Only the terms in the vocabulary will be counted.
        """
        terms = self._call_java("vocabulary")
        return terms
|
||
|
||
@inherit_doc | ||
class DCT(JavaTransformer, HasInputCol, HasOutputCol): | ||
""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove `sentenceData =` from the doctest; `df` alone is sufficient.