Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-9772] [PySpark] [ML] Add Python API for ml.feature.VectorSlicer #8102

Closed
wants to merge 4 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 90 additions & 5 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@
from pyspark.mllib.linalg import _convert_to_vector

__all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
'IndexToString', 'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion',
'RegexTokenizer', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel',
'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer',
'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel',
'StopWordsRemover']
'IndexToString', 'NGram', 'Normalizer', 'OneHotEncoder', 'PCA', 'PCAModel',
'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer',
'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer',
'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer',
'Word2Vec', 'Word2VecModel']


@inherit_doc
Expand Down Expand Up @@ -1298,6 +1298,91 @@ class VectorIndexerModel(JavaModel):
"""


@inherit_doc
class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
"""
.. note:: Experimental

This class takes a feature vector and outputs a new feature vector with a subarray
of the original features.

The subset of features can be specified with either indices (`setIndices()`)
or names (`setNames()`). At least one feature must be selected. Duplicate features
are not allowed, so there can be no overlap between selected indices and names.

The output vector will order features with the selected indices first (in the order given),
followed by the selected names (in the order given).

>>> from pyspark.mllib.linalg import Vectors
>>> df = sqlContext.createDataFrame([
... (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
... (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
... (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"])
>>> vs = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])
>>> vs.transform(df).head().sliced
DenseVector([2.3, 1.0])
"""

# a placeholder to make it appear in the generated doc
indices = Param(Params._dummy(), "indices", "An array of indices to select features from " +
"a vector column. There can be no overlap with names.")
names = Param(Params._dummy(), "names", "An array of feature names to select features from " +
"a vector column. These names must be specified by ML " +
"org.apache.spark.ml.attribute.Attribute. There can be no overlap with " +
"indices.")

@keyword_only
def __init__(self, inputCol=None, outputCol=None, indices=None, names=None):
"""
__init__(self, inputCol=None, outputCol=None, indices=None, names=None)
"""
super(VectorSlicer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid)
self.indices = Param(self, "indices", "An array of indices to select features from " +
"a vector column. There can be no overlap with names.")
self.names = Param(self, "names", "An array of feature names to select features from " +
"a vector column. These names must be specified by ML " +
"org.apache.spark.ml.attribute.Attribute. There can be no overlap " +
"with indices.")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
def setParams(self, inputCol=None, outputCol=None, indices=None, names=None):
"""
setParams(self, inputCol=None, outputCol=None, indices=None, names=None):
Sets params for this VectorSlicer.
"""
kwargs = self.setParams._input_kwargs
return self._set(**kwargs)

def setIndices(self, value):
"""
Sets the value of :py:attr:`indices`.
"""
self._paramMap[self.indices] = value
return self

def getIndices(self):
"""
Gets the value of indices or its default value.
"""
return self.getOrDefault(self.indices)

def setNames(self, value):
"""
Sets the value of :py:attr:`names`.
"""
self._paramMap[self.names] = value
return self

def getNames(self):
"""
Gets the value of names or its default value.
"""
return self.getOrDefault(self.names)


@inherit_doc
@ignore_unicode_prefix
class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol):
Expand Down