From 54bffab42600aa184eb01134410f266199d19a97 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Tue, 11 Aug 2015 18:53:47 +0800
Subject: [PATCH 1/2] Add Python API for ml.feature.VectorSlicer

---
 python/pyspark/ml/feature.py | 86 ++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index cb4dfa21298ce..da33cc33f0005 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -949,6 +949,92 @@ class VectorIndexerModel(JavaModel):
     """
 
 
+@inherit_doc
+class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
+    """
+    .. note:: Experimental
+
+    This class takes a feature vector and outputs a new feature vector with a subarray
+    of the original features.
+
+    The subset of features can be specified with either indices (`setIndices()`)
+    or names (`setNames()`).  At least one feature must be selected. Duplicate features
+    are not allowed, so there can be no overlap between selected indices and names.
+
+    The output vector will order features with the selected indices first (in the order given),
+    followed by the selected names (in the order given).
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> df = sqlContext.createDataFrame([
+    ...     (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
+    ...     (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
+    ...     (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"])
+    >>> vs = VectorSlicer(inputCol="features", outputCol="expected", indices=[1, 4])
+    >>> vs.transform(df).head().expected
+    DenseVector([2.3, 1.0])
+    """
+
+    # a placeholder to make it appear in the generated doc
+    indices = Param(Params._dummy(), "indices", "An array of indices to select features from " +
+                    "a vector column. There can be no overlap with `names`.")
+    names = Param(Params._dummy(), "names", "An array of feature names to select features from " +
+                  "a vector column. These names must be specified by ML " +
+                  "`org.apache.spark.ml.attribute.Attribute`s. There can be no overlap with " +
+                  "`indices`.")
+
+    @keyword_only
+    def __init__(self, inputCol=None, outputCol=None, indices=[], names=[]):
+        """
+        __init__(self, inputCol=None, outputCol=None, indices=[], names=[])
+        """
+        super(VectorSlicer, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid)
+        self.indices = Param(self, "indices", "An array of indices to select features from " +
+                             "a vector column. There can be no overlap with `names`.")
+        self.names = Param(self, "names", "An array of feature names to select features from " +
+                           "a vector column. These names must be specified by ML " +
+                           "`org.apache.spark.ml.attribute.Attribute`s. There can be no overlap " +
+                           "with `indices`.")
+        self._setDefault(indices=[], names=[])
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, inputCol=None, outputCol=None, indices=[], names=[]):
+        """
+        setParams(self, inputCol=None, outputCol=None, indices=[], names=[]):
+        Sets params for this VectorSlicer.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setIndices(self, value):
+        """
+        Sets the value of :py:attr:`indices`.
+        """
+        self._paramMap[self.indices] = value
+        return self
+
+    def getIndices(self):
+        """
+        Gets the value of indices or its default value.
+        """
+        return self.getOrDefault(self.indices)
+
+    def setNames(self, value):
+        """
+        Sets the value of :py:attr:`names`.
+        """
+        self._paramMap[self.names] = value
+        return self
+
+    def getNames(self):
+        """
+        Gets the value of names or its default value.
+        """
+        return self.getOrDefault(self.names)
+
+
 @inherit_doc
 @ignore_unicode_prefix
 class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol):

From 5d2f75bdd4ab0592a630631299542771917c5f79 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 19 Aug 2015 16:52:05 +0800
Subject: [PATCH 2/2] address comments

---
 python/pyspark/ml/feature.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index da33cc33f0005..e05bd7abb0436 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -29,7 +29,8 @@
 __all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder',
            'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel',
            'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer',
-           'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']
+           'VectorSlicer', 'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula',
+           'RFormulaModel']
 
 
 @inherit_doc
@@ -969,40 +970,39 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
     ...     (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
     ...     (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
     ...     (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"])
-    >>> vs = VectorSlicer(inputCol="features", outputCol="expected", indices=[1, 4])
-    >>> vs.transform(df).head().expected
+    >>> vs = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])
+    >>> vs.transform(df).head().sliced
     DenseVector([2.3, 1.0])
     """
 
     # a placeholder to make it appear in the generated doc
     indices = Param(Params._dummy(), "indices", "An array of indices to select features from " +
-                    "a vector column. There can be no overlap with `names`.")
+                    "a vector column. There can be no overlap with names.")
     names = Param(Params._dummy(), "names", "An array of feature names to select features from " +
                   "a vector column. These names must be specified by ML " +
-                  "`org.apache.spark.ml.attribute.Attribute`s. There can be no overlap with " +
-                  "`indices`.")
+                  "org.apache.spark.ml.attribute.Attribute. There can be no overlap with " +
+                  "indices.")
 
     @keyword_only
-    def __init__(self, inputCol=None, outputCol=None, indices=[], names=[]):
+    def __init__(self, inputCol=None, outputCol=None, indices=None, names=None):
         """
-        __init__(self, inputCol=None, outputCol=None, indices=[], names=[])
+        __init__(self, inputCol=None, outputCol=None, indices=None, names=None)
         """
         super(VectorSlicer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid)
         self.indices = Param(self, "indices", "An array of indices to select features from " +
-                             "a vector column. There can be no overlap with `names`.")
+                             "a vector column. There can be no overlap with names.")
         self.names = Param(self, "names", "An array of feature names to select features from " +
                            "a vector column. These names must be specified by ML " +
-                           "`org.apache.spark.ml.attribute.Attribute`s. There can be no overlap " +
-                           "with `indices`.")
-        self._setDefault(indices=[], names=[])
+                           "org.apache.spark.ml.attribute.Attribute. There can be no overlap " +
+                           "with indices.")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
-    def setParams(self, inputCol=None, outputCol=None, indices=[], names=[]):
+    def setParams(self, inputCol=None, outputCol=None, indices=None, names=None):
         """
-        setParams(self, inputCol=None, outputCol=None, indices=[], names=[]):
+        setParams(self, inputCol=None, outputCol=None, indices=None, names=None):
         Sets params for this VectorSlicer.
         """
         kwargs = self.setParams._input_kwargs