From 5f81d81ff785b6078fb9bcf03f591e0e31819b09 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Fri, 22 May 2015 11:58:30 +0530 Subject: [PATCH 1/2] [SPARK-7605] [MLlib] Python API for ElementwiseProduct --- docs/mllib-feature-extraction.md | 22 ++++++++++ .../mllib/api/python/PythonMLLibAPI.scala | 8 ++++ python/pyspark/mllib/feature.py | 40 ++++++++++++++++++- python/pyspark/mllib/tests.py | 13 ++++++ 4 files changed, 81 insertions(+), 2 deletions(-) diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index f723cd6b9dfab..4e935cfd7cc17 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -525,6 +525,28 @@ val transformer = new ElementwiseProduct(transformingVector) val transformedData = transformer.transform(parsedData) val transformedData2 = parsedData.map(x => transformer.transform(x)) +{% endhighlight %} + + +
+{% highlight python %} +from pyspark import SparkContext +from pyspark.mllib.linalg import Vectors +from pyspark.mllib.feature import ElementwiseProduct + +# Load and parse the data +sc = SparkContext() +data = sc.textFile("data/mllib/kmeans_data.txt") +parsedData = data.map(lambda x: [float(t) for t in x.split(" ")]) + +# Create weight vector. +transformingVector = Vectors.dense([0.0, 1.0, 2.0]) +transformer = ElementwiseProduct(transformingVector) + +# Batch transform. +transformedData = transformer.transform(parsedData) +transformedData2 = transformer.transform(parsedData.first()) + {% endhighlight %}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 2fa54df6fc2b2..de2919717e119 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -701,6 +701,14 @@ private[python] class PythonMLLibAPI extends Serializable { } } + def elementwiseProductVector(weight: Vector, vector: Vector): Vector = { + new ElementwiseProduct(weight).transform(vector) + } + + def elementwiseProductVector(weight: Vector, vector: JavaRDD[Vector]): JavaRDD[Vector] = { + new ElementwiseProduct(weight).transform(vector) + } + /** * Java stub for mllib Statistics.colStats(X: RDD[Vector]). * TODO figure out return type. diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index aac305db6c19a..684e275387d33 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -33,12 +33,13 @@ from pyspark import SparkContext from pyspark.rdd import RDD, ignore_unicode_prefix from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper -from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector, _convert_to_vector +from pyspark.mllib.linalg import ( + Vector, Vectors, DenseVector, SparseVector, _convert_to_vector) from pyspark.mllib.regression import LabeledPoint __all__ = ['Normalizer', 'StandardScalerModel', 'StandardScaler', 'HashingTF', 'IDFModel', 'IDF', 'Word2Vec', 'Word2VecModel', - 'ChiSqSelector', 'ChiSqSelectorModel'] + 'ChiSqSelector', 'ChiSqSelectorModel', 'ElementwiseProduct'] class VectorTransformer(object): @@ -525,6 +526,41 @@ def fit(self, data): return Word2VecModel(jmodel) +class ElementwiseProduct(VectorTransformer): + """ + .. note:: Experimental + + Scales each column of the vector, with the supplied weight vector. + i.e the elementwise product. + + >>> weight = Vectors.dense([1.0, 2.0, 3.0]) + >>> eprod = ElementwiseProduct(weight) + >>> a = Vectors.dense([2.0, 1.0, 3.0]) + >>> eprod.transform(a) + DenseVector([2.0, 2.0, 9.0]) + >>> b = Vectors.dense([9.0, 3.0, 4.0]) + >>> rdd = sc.parallelize([a, b]) + >>> eprod.transform(rdd).collect() + [DenseVector([2.0, 2.0, 9.0]), DenseVector([9.0, 6.0, 12.0])] + """ + def __init__(self, vector): + if not isinstance(vector, Vector): + raise ValueError( + "vector should be a Vector got %s." % type(vector)) + self.vector = vector + + def transform(self, vector): + """ + Computes the Hadamard product of the vector. + """ + if isinstance(vector, RDD): + vector = vector.map(_convert_to_vector) + + else: + vector = _convert_to_vector(vector) + return callMLlibFunc("elementwiseProductVector", self.vector, vector) + + def _test(): import doctest from pyspark import SparkContext diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 36a4c7a5408c6..d14c3d59c8229 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -46,6 +46,7 @@ from pyspark.mllib.feature import Word2Vec from pyspark.mllib.feature import IDF from pyspark.mllib.feature import StandardScaler +from pyspark.mllib.feature import ElementwiseProduct from pyspark.serializers import PickleSerializer from pyspark.sql import SQLContext @@ -818,6 +819,18 @@ def test_model_transform(self): self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0])) +class ElementwiseProductTests(MLlibTestCase): + def test_model_transform(self): + weight = Vectors.dense([3, 2, 1]) + + densevec = Vectors.dense([4, 5, 6]) + sparsevec = Vectors.sparse(3, [0], [1]) + eprod = ElementwiseProduct(weight) + self.assertEqual(eprod.transform(densevec), DenseVector([12, 10, 6])) + self.assertEqual( + eprod.transform(sparsevec), SparseVector(3, [0], [3])) + + if __name__ == "__main__": if not _have_scipy: print("NOTE: Skipping SciPy tests as it does not seem to be installed") From 79d1ef57b19a1580c8c360633e37048a466dc6f6 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Thu, 18 Jun 2015 02:00:02 +0530 Subject: [PATCH 2/2] Consistent and support list / array types --- .../apache/spark/mllib/api/python/PythonMLLibAPI.scala | 8 ++++---- python/pyspark/mllib/feature.py | 9 +++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index de2919717e119..64227261712af 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -701,12 +701,12 @@ private[python] class PythonMLLibAPI extends Serializable { } } - def elementwiseProductVector(weight: Vector, vector: Vector): Vector = { - new ElementwiseProduct(weight).transform(vector) + def elementwiseProductVector(scalingVector: Vector, vector: Vector): Vector = { + new ElementwiseProduct(scalingVector).transform(vector) } - def elementwiseProductVector(weight: Vector, vector: JavaRDD[Vector]): JavaRDD[Vector] = { - new ElementwiseProduct(weight).transform(vector) + def elementwiseProductVector(scalingVector: Vector, vector: JavaRDD[Vector]): JavaRDD[Vector] = { + new ElementwiseProduct(scalingVector).transform(vector) } /** diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 684e275387d33..416a5e12edcab 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -543,11 +543,8 @@ class ElementwiseProduct(VectorTransformer): >>> eprod.transform(rdd).collect() [DenseVector([2.0, 2.0, 9.0]), DenseVector([9.0, 6.0, 12.0])] """ - def __init__(self, vector): - if not isinstance(vector, Vector): - raise ValueError( - "vector should be a Vector got %s." % type(vector)) - self.vector = vector + def __init__(self, scalingVector): + self.scalingVector = _convert_to_vector(scalingVector) def transform(self, vector): """ @@ -558,7 +555,7 @@ def transform(self, vector): else: vector = _convert_to_vector(vector) - return callMLlibFunc("elementwiseProductVector", self.vector, vector) + return callMLlibFunc("elementwiseProductVector", self.scalingVector, vector) def _test():