From 82683a0f5d9989110762528edc1b174c004235c7 Mon Sep 17 00:00:00 2001
From: lewuathe
Date: Wed, 1 Apr 2015 22:43:02 +0900
Subject: [PATCH 1/4] [SPARK-6643] Implement StandardScalerModel missing
 methods

---
 .../mllib/api/python/PythonMLLibAPI.scala     | 38 +++++++++++++++++--
 python/pyspark/mllib/feature.py               | 16 ++++++++
 python/pyspark/mllib/tests.py                 | 24 ++++++++++++
 3 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 662ec5fbed453..bda4f9f87e819 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -434,8 +434,39 @@ private[python] class PythonMLLibAPI extends Serializable {
     new Normalizer(p).transform(rdd)
   }
 
+  private[python] class StandardScalerModelWrapper(model: StandardScalerModel)
+    extends VectorTransformer {
+    /**
+     * Wrapper of StandardScalerModel transform method
+     * @param vector
+     * @return
+     */
+    def transform(vector: Vector): Vector = model.transform(vector)
+
+    /**
+     * Setter of the boolean which decides
+     * whether it uses mean or not
+     * @param withMean
+     * @return
+     */
+    def setWithMean(withMean: Boolean): this.type = {
+      model.setWithMean(withMean)
+      this
+    }
+
+    /**
+     * Setter of the boolean which decides
+     * whether it uses mean or not
+     * @param withStd
+     * @return
+     */
+    def setWithStd(withStd: Boolean): this.type = {
+      model.setWithStd(withStd)
+      this
+    }
+  }
   /**
-   * Java stub for IDF.fit(). This stub returns a
+   * Java stub for StandardScaler.fit(). This stub returns a
    * handle to the Java object instead of the content of the Java object.
    * Extra care needs to be taken in the Python code to ensure it gets freed on
    * exit; see the Py4J documentation.
@@ -443,8 +474,9 @@ private[python] class PythonMLLibAPI extends Serializable {
   def fitStandardScaler(
       withMean: Boolean,
       withStd: Boolean,
-      data: JavaRDD[Vector]): StandardScalerModel = {
-    new StandardScaler(withMean, withStd).fit(data.rdd)
+      data: JavaRDD[Vector]): StandardScalerModelWrapper = {
+    val model = new StandardScaler(withMean, withStd).fit(data.rdd)
+    new StandardScalerModelWrapper(model)
   }
 
   /**
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 4bfe3014ef748..03ca2c9006356 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -132,6 +132,22 @@ def transform(self, vector):
         """
         return JavaVectorTransformer.transform(self, vector)
 
+    def setWithMean(self, withMean):
+        """
+        Setter of the boolean which decides
+        whether it uses mean or not
+        """
+        self.call("setWithMean", withMean)
+        return self
+
+    def setWithStd(self, withStd):
+        """
+        Setter of the boolean which decides
+        whether it uses mean or not
+        """
+        self.call("setWithStd", withStd)
+        return self
+
 
 class StandardScaler(object):
     """
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 3bb0f0ca68128..42ca7084b1f28 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -42,6 +42,7 @@
 from pyspark.mllib.random import RandomRDDs
 from pyspark.mllib.stat import Statistics
 from pyspark.mllib.feature import IDF
+from pyspark.mllib.feature import StandardScaler
 from pyspark.serializers import PickleSerializer
 from pyspark.sql import SQLContext
 from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase
@@ -634,6 +635,29 @@ def test_idf_model(self):
         idf = model.idf()
         self.assertEqual(len(idf), 11)
 
+
+class StandardScalerTests(PySparkTestCase):
+    def test_model_setters(self):
+        data = [
+            [1.0, 2.0, 3.0],
+            [2.0, 3.0, 4.0],
+            [3.0, 4.0, 5.0]
+        ]
+        model = StandardScaler().fit(self.sc.parallelize(data))
+        self.assertIsNotNone(model.setWithMean(True))
+        self.assertIsNotNone(model.setWithStd(True))
+        self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([-1.0, -1.0, -1.0]))
+
+    def test_model_transform(self):
+        data = [
+            [1.0, 2.0, 3.0],
+            [2.0, 3.0, 4.0],
+            [3.0, 4.0, 5.0]
+        ]
+        model = StandardScaler().fit(self.sc.parallelize(data))
+        self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
+
+
 if __name__ == "__main__":
     if not _have_scipy:
         print "NOTE: Skipping SciPy tests as it does not seem to be installed"

From 66bb2ab04bf23913970d561cab80c70f6ce986ac Mon Sep 17 00:00:00 2001
From: lewuathe
Date: Fri, 3 Apr 2015 22:47:54 +0900
Subject: [PATCH 2/4] Fix typos

---
 .../org/apache/spark/mllib/api/python/PythonMLLibAPI.scala | 2 +-
 python/pyspark/mllib/feature.py                            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index bda4f9f87e819..6000083e126bb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -456,7 +456,7 @@ private[python] class PythonMLLibAPI extends Serializable {
 
     /**
      * Setter of the boolean which decides
-     * whether it uses mean or not
+     * whether it uses std or not
      * @param withStd
      * @return
      */
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 03ca2c9006356..44d4bcf488326 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -143,7 +143,7 @@ def setWithMean(self, withMean):
     def setWithStd(self, withStd):
         """
         Setter of the boolean which decides
-        whether it uses mean or not
+        whether it uses std or not
         """
         self.call("setWithStd", withStd)
         return self

From 578f5ee430ee1a7ec7b9fe28ef51a9d487d0a3fb Mon Sep 17 00:00:00 2001
From: lewuathe
Date: Sat, 4 Apr 2015 11:10:04 +0900
Subject: [PATCH 3/4] Remove unnecessary class

---
 .../mllib/api/python/PythonMLLibAPI.scala | 38 ++-----------------
 1 file changed, 3 insertions(+), 35 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 870f68126459b..280a1bffc1537 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -433,38 +433,7 @@ private[python] class PythonMLLibAPI extends Serializable {
   def normalizeVector(p: Double, rdd: JavaRDD[Vector]): JavaRDD[Vector] = {
     new Normalizer(p).transform(rdd)
   }
-
-  private[python] class StandardScalerModelWrapper(model: StandardScalerModel)
-    extends VectorTransformer {
-    /**
-     * Wrapper of StandardScalerModel transform method
-     * @param vector
-     * @return
-     */
-    def transform(vector: Vector): Vector = model.transform(vector)
-
-    /**
-     * Setter of the boolean which decides
-     * whether it uses mean or not
-     * @param withMean
-     * @return
-     */
-    def setWithMean(withMean: Boolean): this.type = {
-      model.setWithMean(withMean)
-      this
-    }
-
-    /**
-     * Setter of the boolean which decides
-     * whether it uses std or not
-     * @param withStd
-     * @return
-     */
-    def setWithStd(withStd: Boolean): this.type = {
-      model.setWithStd(withStd)
-      this
-    }
-  }
+
   /**
    * Java stub for StandardScaler.fit(). This stub returns a
    * handle to the Java object instead of the content of the Java object.
@@ -474,9 +443,8 @@ private[python] class PythonMLLibAPI extends Serializable {
   def fitStandardScaler(
       withMean: Boolean,
       withStd: Boolean,
-      data: JavaRDD[Vector]): StandardScalerModelWrapper = {
-    val model = new StandardScaler(withMean, withStd).fit(data.rdd)
-    new StandardScalerModelWrapper(model)
+      data: JavaRDD[Vector]): StandardScalerModel = {
+    new StandardScaler(withMean, withStd).fit(data.rdd)
   }
 
   /**

From fafd6900d2f2e8c4a511208e68ed801aac27f08a Mon Sep 17 00:00:00 2001
From: lewuathe
Date: Sat, 11 Apr 2015 20:41:22 +0900
Subject: [PATCH 4/4] Fix for lint-python

---
 python/pyspark/mllib/tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 25c31f374db6f..ddf48de11dce9 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -685,6 +685,7 @@ def test_word2vec_get_vectors(self):
         model = Word2Vec().fit(self.sc.parallelize(data))
         self.assertEquals(len(model.getVectors()), 3)
 
+
 class StandardScalerTests(PySparkTestCase):
     def test_model_setters(self):
         data = [
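
A minimal sketch of how the PySpark API added by this series would be used,
mirroring the behavior asserted in the new tests. The SparkContext setup and
sample data below are illustrative only; the data matches the tests, where
every column has unit sample standard deviation, so scaling by the standard
deviation alone leaves vectors unchanged.

    from pyspark import SparkContext
    from pyspark.mllib.feature import StandardScaler

    sc = SparkContext("local", "standard-scaler-demo")  # illustrative setup
    data = sc.parallelize([
        [1.0, 2.0, 3.0],
        [2.0, 3.0, 4.0],
        [3.0, 4.0, 5.0],
    ])

    # StandardScaler defaults to withMean=False, withStd=True; each column of
    # the sample data has unit sample standard deviation, so transform()
    # initially returns its input unchanged (cf. test_model_transform).
    model = StandardScaler().fit(data)
    print(model.transform([1.0, 2.0, 3.0]))  # unchanged: [1.0,2.0,3.0]

    # The setters added here mutate the fitted model and return self, so they
    # can be chained; enabling withMean then centers each column by its mean
    # (cf. test_model_setters).
    model.setWithMean(True).setWithStd(True)
    print(model.transform([1.0, 2.0, 3.0]))  # centered: [-1.0,-1.0,-1.0]

    sc.stop()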