Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 134 additions & 5 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@
from pyspark.mllib.common import inherit_doc
from pyspark.mllib.linalg import _convert_to_vector

# Public names exported by this module. This is the single effective definition:
# the earlier assignment (without the CountVectorizer entries) was immediately
# overwritten by this one and has been dropped.
__all__ = ['Binarizer', 'Bucketizer', 'CountVectorizer', 'CountVectorizerModel', 'DCT',
           'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer',
           'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'SQLTransformer',
           'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel',
           'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel',
           'PCA', 'PCAModel', 'RFormula', 'RFormulaModel', 'StopWordsRemover']


@inherit_doc
Expand Down Expand Up @@ -166,6 +167,134 @@ def getSplits(self):
return self.getOrDefault(self.splits)


@inherit_doc
class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol):
    """
    Extracts a vocabulary from document collections and generates a
    :py:class:`CountVectorizerModel`, which converts text documents to sparse vectors of
    token counts.

    >>> df = sqlContext.createDataFrame(
    ...     [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])],
    ...     ["label", "raw"])
    >>> cv = CountVectorizer(inputCol="raw", outputCol="vectors")
    >>> model = cv.fit(df)
    >>> model.transform(df).show(truncate=False)
    +-----+---------------+-------------------------+
    |label|raw            |vectors                  |
    +-----+---------------+-------------------------+
    |0    |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
    |1    |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
    +-----+---------------+-------------------------+
    ...
    """

    # Class-level placeholders so that the params appear in the generated doc.
    # The instance-bound Params that are actually used are created in __init__.
    vocabSize = Param(Params._dummy(), "vocabSize", "max size of the vocabulary")
    minDF = Param(Params._dummy(), "minDF",
                  "Specifies the minimum number of different documents a term must appear in " +
                  "to be included in the vocabulary. If this is an integer >= 1, this specifies " +
                  "the number of documents the term must appear in; if this is a double in " +
                  "[0,1), then this specifies the fraction of documents.")

    minTF = Param(Params._dummy(), "minTF",
                  "Filter to ignore rare words in a document. For each document, terms with " +
                  "frequency/count less than the given threshold are ignored. If this is an " +
                  "integer >= 1, then this specifies a count (of times the term must appear in" +
                  " the document); if this is a double in [0,1), then this specifies a " +
                  "fraction (out of the document's token count). Note that the parameter is " +
                  "only used in transform of CountVectorizerModel and does not affect fitting.")

    @keyword_only
    def __init__(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None):
        """
        __init__(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None)
        """
        super(CountVectorizer, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.feature.CountVectorizer", self.uid)
        # Re-declare each Param with this instance as its parent; the class-level
        # attributes of the same names are documentation placeholders only.
        self.minDF = \
            Param(self, "minDF",
                  "Specifies the minimum number of different documents a term must appear in " +
                  "to be included in the vocabulary. If this is an integer >= 1, this specifies " +
                  "the number of documents the term must appear in; if this is a double in " +
                  "[0,1), then this specifies the fraction of documents.")
        self.minTF = \
            Param(self, "minTF",
                  "Filter to ignore rare words in a document. For each document, terms with " +
                  "frequency/count less than the given threshold are ignored. If this is an " +
                  "integer >= 1, then this specifies a count (of times the term must appear in" +
                  " the document); if this is a double in [0,1), then this specifies a " +
                  "fraction (out of the document's token count). Note that the parameter is " +
                  "only used in transform of CountVectorizerModel and does not affect fitting.")
        self.vocabSize = Param(self, "vocabSize", "max size of the vocabulary")
        self._setDefault(minDF=1.0, minTF=1.0, vocabSize=1 << 18)
        # Only the keyword arguments the caller actually supplied are applied.
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None):
        """
        setParams(self, minDF=1.0, minTF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None)
        Sets params for this CountVectorizer.
        """
        # The defaults in the signature mirror __init__ / _setDefault; the values that
        # are set come from the captured keyword arguments.
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setMinDF(self, value):
        """
        Sets the value of :py:attr:`minDF`.
        """
        self._paramMap[self.minDF] = value
        return self

    def getMinDF(self):
        """
        Gets the value of minDF or its default value.
        """
        return self.getOrDefault(self.minDF)

    def setMinTF(self, value):
        """
        Sets the value of :py:attr:`minTF`.
        """
        self._paramMap[self.minTF] = value
        return self

    def getMinTF(self):
        """
        Gets the value of minTF or its default value.
        """
        return self.getOrDefault(self.minTF)

    def setVocabSize(self, value):
        """
        Sets the value of :py:attr:`vocabSize`.
        """
        self._paramMap[self.vocabSize] = value
        return self

    def getVocabSize(self):
        """
        Gets the value of vocabSize or its default value.
        """
        return self.getOrDefault(self.vocabSize)

    def _create_model(self, java_model):
        """
        Wraps the given Java model object in a :py:class:`CountVectorizerModel`.
        """
        return CountVectorizerModel(java_model)


class CountVectorizerModel(JavaModel):
    """
    Model fitted by CountVectorizer. Converts a text document to a sparse vector of token counts.
    """

    @property
    def vocabulary(self):
        """
        An array of terms. Only the terms in the vocabulary will be counted.
        """
        # Read from the wrapped Java model on every access.
        return self._call_java("vocabulary")


@inherit_doc
class DCT(JavaTransformer, HasInputCol, HasOutputCol):
"""
Expand Down