From 54bffab42600aa184eb01134410f266199d19a97 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Tue, 11 Aug 2015 18:53:47 +0800
Subject: [PATCH 1/2] Add Python API for ml.feature.VectorSlicer

Add a VectorSlicer class to python/pyspark/ml/feature.py that wraps
org.apache.spark.ml.feature.VectorSlicer. The class declares the
`indices` and `names` params with their setters and getters, and carries
a doctest that slices indices [1, 4] out of a dense vector.
---
 python/pyspark/ml/feature.py | 86 ++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index cb4dfa21298ce..da33cc33f0005 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -949,6 +949,92 @@ class VectorIndexerModel(JavaModel):
     """
 
 
+@inherit_doc
+class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
+    """
+    .. note:: Experimental
+
+    This class takes a feature vector and outputs a new feature vector with a subarray
+    of the original features.
+
+    The subset of features can be specified with either indices (`setIndices()`)
+    or names (`setNames()`). At least one feature must be selected. Duplicate features
+    are not allowed, so there can be no overlap between selected indices and names.
+
+    The output vector will order features with the selected indices first (in the order given),
+    followed by the selected names (in the order given).
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> df = sqlContext.createDataFrame([
+    ...     (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
+    ...     (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
+    ...     (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"])
+    >>> vs = VectorSlicer(inputCol="features", outputCol="expected", indices=[1, 4])
+    >>> vs.transform(df).head().expected
+    DenseVector([2.3, 1.0])
+    """
+
+    # a placeholder to make it appear in the generated doc
+    indices = Param(Params._dummy(), "indices", "An array of indices to select features from " +
+                    "a vector column. There can be no overlap with `names`.")
+    names = Param(Params._dummy(), "names", "An array of feature names to select features from " +
+                  "a vector column. These names must be specified by ML " +
+                  "`org.apache.spark.ml.attribute.Attribute`s. There can be no overlap with " +
+                  "`indices`.")
+
+    @keyword_only
+    def __init__(self, inputCol=None, outputCol=None, indices=[], names=[]):
+        """
+        __init__(self, inputCol=None, outputCol=None, indices=[], names=[])
+        """
+        super(VectorSlicer, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid)
+        self.indices = Param(self, "indices", "An array of indices to select features from " +
+                             "a vector column. There can be no overlap with `names`.")
+        self.names = Param(self, "names", "An array of feature names to select features from " +
+                           "a vector column. These names must be specified by ML " +
+                           "`org.apache.spark.ml.attribute.Attribute`s. There can be no overlap " +
+                           "with `indices`.")
+        self._setDefault(indices=[], names=[])
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, inputCol=None, outputCol=None, indices=[], names=[]):
+        """
+        setParams(self, inputCol=None, outputCol=None, indices=[], names=[]):
+        Sets params for this VectorSlicer.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setIndices(self, value):
+        """
+        Sets the value of :py:attr:`indices`.
+        """
+        self._paramMap[self.indices] = value
+        return self
+
+    def getIndices(self):
+        """
+        Gets the value of indices or its default value.
+        """
+        return self.getOrDefault(self.indices)
+
+    def setNames(self, value):
+        """
+        Sets the value of :py:attr:`names`.
+        """
+        self._paramMap[self.names] = value
+        return self
+
+    def getNames(self):
+        """
+        Gets the value of names or its default value.
+        """
+        return self.getOrDefault(self.names)
+
+
 @inherit_doc
 @ignore_unicode_prefix
 class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol):

From 5d2f75bdd4ab0592a630631299542771917c5f79 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Wed, 19 Aug 2015 16:52:05 +0800
Subject: [PATCH 2/2] address comments

Export VectorSlicer through __all__, rename the doctest output column
from "expected" to "sliced", drop the backticks from the indices/names
param descriptions, replace the mutable `[]` defaults of __init__ and
setParams with None, and remove the _setDefault(indices=[], names=[])
call from __init__.
---
NOTE(review): this revision removes `self._setDefault(indices=[], names=[])`
while getIndices()/getNames() are still documented as returning the value
"or its default value". Please confirm how getOrDefault() behaves when
neither a value nor a default has been set for these params.

 python/pyspark/ml/feature.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index da33cc33f0005..e05bd7abb0436 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -29,7 +29,8 @@
 __all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder',
            'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel',
            'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer',
-           'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']
+           'VectorSlicer', 'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula',
+           'RFormulaModel']
 
 
 @inherit_doc
@@ -969,40 +970,39 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
     ...     (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
     ...     (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
     ...     (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"])
-    >>> vs = VectorSlicer(inputCol="features", outputCol="expected", indices=[1, 4])
-    >>> vs.transform(df).head().expected
+    >>> vs = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])
+    >>> vs.transform(df).head().sliced
     DenseVector([2.3, 1.0])
     """
 
     # a placeholder to make it appear in the generated doc
     indices = Param(Params._dummy(), "indices", "An array of indices to select features from " +
-                    "a vector column. There can be no overlap with `names`.")
+                    "a vector column. There can be no overlap with names.")
     names = Param(Params._dummy(), "names", "An array of feature names to select features from " +
                   "a vector column. These names must be specified by ML " +
-                  "`org.apache.spark.ml.attribute.Attribute`s. There can be no overlap with " +
-                  "`indices`.")
+                  "org.apache.spark.ml.attribute.Attribute. There can be no overlap with " +
+                  "indices.")
 
     @keyword_only
-    def __init__(self, inputCol=None, outputCol=None, indices=[], names=[]):
+    def __init__(self, inputCol=None, outputCol=None, indices=None, names=None):
         """
-        __init__(self, inputCol=None, outputCol=None, indices=[], names=[])
+        __init__(self, inputCol=None, outputCol=None, indices=None, names=None)
         """
         super(VectorSlicer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid)
         self.indices = Param(self, "indices", "An array of indices to select features from " +
-                             "a vector column. There can be no overlap with `names`.")
+                             "a vector column. There can be no overlap with names.")
         self.names = Param(self, "names", "An array of feature names to select features from " +
                            "a vector column. These names must be specified by ML " +
-                           "`org.apache.spark.ml.attribute.Attribute`s. There can be no overlap " +
-                           "with `indices`.")
-        self._setDefault(indices=[], names=[])
+                           "org.apache.spark.ml.attribute.Attribute. There can be no overlap " +
+                           "with indices.")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
-    def setParams(self, inputCol=None, outputCol=None, indices=[], names=[]):
+    def setParams(self, inputCol=None, outputCol=None, indices=None, names=None):
         """
-        setParams(self, inputCol=None, outputCol=None, indices=[], names=[]):
+        setParams(self, inputCol=None, outputCol=None, indices=None, names=None):
         Sets params for this VectorSlicer.
         """
         kwargs = self.setParams._input_kwargs