diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 8c26cfbd5a47d..1c423486be8d9 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -27,11 +27,11 @@
 from pyspark.mllib.linalg import _convert_to_vector
 
 __all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
-           'IndexToString', 'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion',
-           'RegexTokenizer', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel',
-           'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer',
-           'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel',
-           'StopWordsRemover']
+           'IndexToString', 'NGram', 'Normalizer', 'OneHotEncoder', 'PCA', 'PCAModel',
+           'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer',
+           'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer',
+           'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer',
+           'Word2Vec', 'Word2VecModel']
 
 
 @inherit_doc
@@ -1298,6 +1298,91 @@ class VectorIndexerModel(JavaModel):
     """
 
 
+@inherit_doc
+class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
+    """
+    .. note:: Experimental
+
+    This class takes a feature vector and outputs a new feature vector with a subarray
+    of the original features.
+
+    The subset of features can be specified with either indices (`setIndices()`)
+    or names (`setNames()`). At least one feature must be selected. Duplicate features
+    are not allowed, so there can be no overlap between selected indices and names.
+
+    The output vector will order features with the selected indices first (in the order given),
+    followed by the selected names (in the order given).
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> df = sqlContext.createDataFrame([
+    ... (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
+    ... (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
+    ... (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"])
+    >>> vs = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])
+    >>> vs.transform(df).head().sliced
+    DenseVector([2.3, 1.0])
+    """
+
+    # a placeholder to make it appear in the generated doc
+    indices = Param(Params._dummy(), "indices", "An array of indices to select features from " +
+                    "a vector column. There can be no overlap with names.")
+    names = Param(Params._dummy(), "names", "An array of feature names to select features from " +
+                  "a vector column. These names must be specified by ML " +
+                  "org.apache.spark.ml.attribute.Attribute. There can be no overlap with " +
+                  "indices.")
+
+    @keyword_only
+    def __init__(self, inputCol=None, outputCol=None, indices=None, names=None):
+        """
+        __init__(self, inputCol=None, outputCol=None, indices=None, names=None)
+        """
+        super(VectorSlicer, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid)
+        self.indices = Param(self, "indices", "An array of indices to select features from " +
+                             "a vector column. There can be no overlap with names.")
+        self.names = Param(self, "names", "An array of feature names to select features from " +
+                           "a vector column. These names must be specified by ML " +
+                           "org.apache.spark.ml.attribute.Attribute. There can be no overlap " +
+                           "with indices.")
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, inputCol=None, outputCol=None, indices=None, names=None):
+        """
+        setParams(self, inputCol=None, outputCol=None, indices=None, names=None):
+        Sets params for this VectorSlicer.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setIndices(self, value):
+        """
+        Sets the value of :py:attr:`indices`.
+        """
+        self._paramMap[self.indices] = value
+        return self
+
+    def getIndices(self):
+        """
+        Gets the value of indices or its default value.
+        """
+        return self.getOrDefault(self.indices)
+
+    def setNames(self, value):
+        """
+        Sets the value of :py:attr:`names`.
+        """
+        self._paramMap[self.names] = value
+        return self
+
+    def getNames(self):
+        """
+        Gets the value of names or its default value.
+        """
+        return self.getOrDefault(self.names)
+
+
 @inherit_doc
 @ignore_unicode_prefix
 class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol):