From 82683a0f5d9989110762528edc1b174c004235c7 Mon Sep 17 00:00:00 2001
From: lewuathe
Date: Wed, 1 Apr 2015 22:43:02 +0900
Subject: [PATCH 1/4] [SPARK-6643] Implement StandardScalerModel missing
 methods

---
 .../mllib/api/python/PythonMLLibAPI.scala     | 38 +++++++++++++++++--
 python/pyspark/mllib/feature.py               | 16 ++++++++
 python/pyspark/mllib/tests.py                 | 24 ++++++++++++
 3 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 662ec5fbed453..bda4f9f87e819 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -434,8 +434,39 @@ private[python] class PythonMLLibAPI extends Serializable {
     new Normalizer(p).transform(rdd)
   }
 
+  private[python] class StandardScalerModelWrapper(model: StandardScalerModel)
+    extends VectorTransformer {
+    /**
+     * Wrapper of StandardScalerModel transform method
+     * @param vector
+     * @return
+     */
+    def transform(vector: Vector): Vector = model.transform(vector)
+
+    /**
+     * Setter of the boolean which decides
+     * whether it uses mean or not
+     * @param withMean
+     * @return
+     */
+    def setWithMean(withMean: Boolean): this.type = {
+      model.setWithMean(withMean)
+      this
+    }
+
+    /**
+     * Setter of the boolean which decides
+     * whether it uses mean or not
+     * @param withStd
+     * @return
+     */
+    def setWithStd(withStd: Boolean): this.type = {
+      model.setWithStd(withStd)
+      this
+    }
+  }
   /**
-   * Java stub for IDF.fit(). This stub returns a
+   * Java stub for StandardScaler.fit(). This stub returns a
    * handle to the Java object instead of the content of the Java object.
    * Extra care needs to be taken in the Python code to ensure it gets freed on
    * exit; see the Py4J documentation.
@@ -443,8 +474,9 @@ private[python] class PythonMLLibAPI extends Serializable {
   def fitStandardScaler(
       withMean: Boolean,
       withStd: Boolean,
-      data: JavaRDD[Vector]): StandardScalerModel = {
-    new StandardScaler(withMean, withStd).fit(data.rdd)
+      data: JavaRDD[Vector]): StandardScalerModelWrapper = {
+    val model = new StandardScaler(withMean, withStd).fit(data.rdd)
+    new StandardScalerModelWrapper(model)
   }
 
   /**
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 4bfe3014ef748..03ca2c9006356 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -132,6 +132,22 @@ def transform(self, vector):
         """
         return JavaVectorTransformer.transform(self, vector)
 
+    def setWithMean(self, withMean):
+        """
+        Setter of the boolean which decides
+        whether it uses mean or not
+        """
+        self.call("setWithMean", withMean)
+        return self
+
+    def setWithStd(self, withStd):
+        """
+        Setter of the boolean which decides
+        whether it uses mean or not
+        """
+        self.call("setWithStd", withStd)
+        return self
+
 
 class StandardScaler(object):
     """
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 3bb0f0ca68128..42ca7084b1f28 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -42,6 +42,7 @@
 from pyspark.mllib.random import RandomRDDs
 from pyspark.mllib.stat import Statistics
 from pyspark.mllib.feature import IDF
+from pyspark.mllib.feature import StandardScaler
 from pyspark.serializers import PickleSerializer
 from pyspark.sql import SQLContext
 from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase
@@ -634,6 +635,29 @@ def test_idf_model(self):
         idf = model.idf()
         self.assertEqual(len(idf), 11)
 
+
+class StandardScalerTests(PySparkTestCase):
+    def test_model_setters(self):
+        data = [
+            [1.0, 2.0, 3.0],
+            [2.0, 3.0, 4.0],
+            [3.0, 4.0, 5.0]
+        ]
+        model = StandardScaler().fit(self.sc.parallelize(data))
+        self.assertIsNotNone(model.setWithMean(True))
+        self.assertIsNotNone(model.setWithStd(True))
+        self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([-1.0, -1.0, -1.0]))
+
+    def test_model_transform(self):
+        data = [
+            [1.0, 2.0, 3.0],
+            [2.0, 3.0, 4.0],
+            [3.0, 4.0, 5.0]
+        ]
+        model = StandardScaler().fit(self.sc.parallelize(data))
+        self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
+
+
 if __name__ == "__main__":
     if not _have_scipy:
         print "NOTE: Skipping SciPy tests as it does not seem to be installed"

From 66bb2ab04bf23913970d561cab80c70f6ce986ac Mon Sep 17 00:00:00 2001
From: lewuathe
Date: Fri, 3 Apr 2015 22:47:54 +0900
Subject: [PATCH 2/4] Fix typos

---
 .../org/apache/spark/mllib/api/python/PythonMLLibAPI.scala | 2 +-
 python/pyspark/mllib/feature.py                            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index bda4f9f87e819..6000083e126bb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -456,7 +456,7 @@ private[python] class PythonMLLibAPI extends Serializable {
 
     /**
      * Setter of the boolean which decides
-     * whether it uses mean or not
+     * whether it uses std or not
      * @param withStd
      * @return
      */
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 03ca2c9006356..44d4bcf488326 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -143,7 +143,7 @@ def setWithMean(self, withMean):
     def setWithStd(self, withStd):
         """
         Setter of the boolean which decides
-        whether it uses mean or not
+        whether it uses std or not
         """
         self.call("setWithStd", withStd)
         return self

From 578f5ee430ee1a7ec7b9fe28ef51a9d487d0a3fb Mon Sep 17 00:00:00 2001
From: lewuathe
Date: Sat, 4 Apr 2015 11:10:04 +0900
Subject: [PATCH 3/4] Remove unnecessary class

---
 .../mllib/api/python/PythonMLLibAPI.scala | 38 ++-----------------
 1 file changed, 3 insertions(+), 35 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 870f68126459b..280a1bffc1537 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -433,38 +433,7 @@ private[python] class PythonMLLibAPI extends Serializable {
   def normalizeVector(p: Double, rdd: JavaRDD[Vector]): JavaRDD[Vector] = {
     new Normalizer(p).transform(rdd)
   }
-
-  private[python] class StandardScalerModelWrapper(model: StandardScalerModel)
-    extends VectorTransformer {
-    /**
-     * Wrapper of StandardScalerModel transform method
-     * @param vector
-     * @return
-     */
-    def transform(vector: Vector): Vector = model.transform(vector)
-
-    /**
-     * Setter of the boolean which decides
-     * whether it uses mean or not
-     * @param withMean
-     * @return
-     */
-    def setWithMean(withMean: Boolean): this.type = {
-      model.setWithMean(withMean)
-      this
-    }
-
-    /**
-     * Setter of the boolean which decides
-     * whether it uses std or not
-     * @param withStd
-     * @return
-     */
-    def setWithStd(withStd: Boolean): this.type = {
-      model.setWithStd(withStd)
-      this
-    }
-  }
+
   /**
    * Java stub for StandardScaler.fit(). This stub returns a
    * handle to the Java object instead of the content of the Java object.
@@ -474,9 +443,8 @@ private[python] class PythonMLLibAPI extends Serializable {
   def fitStandardScaler(
       withMean: Boolean,
       withStd: Boolean,
-      data: JavaRDD[Vector]): StandardScalerModelWrapper = {
-    val model = new StandardScaler(withMean, withStd).fit(data.rdd)
-    new StandardScalerModelWrapper(model)
+      data: JavaRDD[Vector]): StandardScalerModel = {
+    new StandardScaler(withMean, withStd).fit(data.rdd)
   }
 
   /**

From fafd6900d2f2e8c4a511208e68ed801aac27f08a Mon Sep 17 00:00:00 2001
From: lewuathe
Date: Sat, 11 Apr 2015 20:41:22 +0900
Subject: [PATCH 4/4] Fix for lint-python

---
 python/pyspark/mllib/tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 25c31f374db6f..ddf48de11dce9 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -685,6 +685,7 @@ def test_word2vec_get_vectors(self):
         model = Word2Vec().fit(self.sc.parallelize(data))
         self.assertEquals(len(model.getVectors()), 3)
 
+
 class StandardScalerTests(PySparkTestCase):
     def test_model_setters(self):
         data = [
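
A minimal sketch of how the PySpark API added by this series would be used,
mirroring the behavior asserted in the new tests. The SparkContext setup and
sample data below are illustrative only; the data matches the tests, where
every column has unit sample standard deviation, so scaling by the standard
deviation alone leaves vectors unchanged.

    from pyspark import SparkContext
    from pyspark.mllib.feature import StandardScaler

    sc = SparkContext("local", "standard-scaler-demo")  # illustrative setup
    data = sc.parallelize([
        [1.0, 2.0, 3.0],
        [2.0, 3.0, 4.0],
        [3.0, 4.0, 5.0],
    ])

    # StandardScaler defaults to withMean=False, withStd=True; each column of
    # the sample data has unit sample standard deviation, so transform()
    # initially returns its input unchanged (cf. test_model_transform).
    model = StandardScaler().fit(data)
    print(model.transform([1.0, 2.0, 3.0]))  # unchanged: [1.0,2.0,3.0]

    # The setters added here mutate the fitted model and return self, so they
    # can be chained; enabling withMean then centers each column by its mean
    # (cf. test_model_setters).
    model.setWithMean(True).setWithStd(True)
    print(model.transform([1.0, 2.0, 3.0]))  # centered: [-1.0,-1.0,-1.0]

    sc.stop()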