From 83df25f1989ed86a4e8670f3449a9e2ac90e8323 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 10 Jun 2014 00:41:37 -0700 Subject: [PATCH 1/4] Support numpy 1.4 --- python/pyspark/mllib/_common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/_common.py b/python/pyspark/mllib/_common.py index a411a5d5914e0..ca1620dabec22 100644 --- a/python/pyspark/mllib/_common.py +++ b/python/pyspark/mllib/_common.py @@ -454,7 +454,7 @@ def _squared_distance(v1, v2): v2 = _convert_vector(v2) if type(v1) == ndarray and type(v2) == ndarray: diff = v1 - v2 - return diff.dot(diff) + return numpy.dot(diff, diff) elif type(v1) == ndarray: return v2.squared_distance(v1) else: @@ -469,7 +469,9 @@ def _dot(vec, target): calling numpy.dot of the two vectors, but for SciPy ones, we have to transpose them because they're column vectors. """ - if type(vec) == ndarray or type(vec) == SparseVector: + if type(vec) == ndarray: + return numpy.dot(vec, target) + elif type(vec) == SparseVector: return vec.dot(target) elif type(vec) == list: return _convert_vector(vec).dot(target) From db9bfbdca1fa75d84611688a4491138113ac7ec6 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 10 Jun 2014 00:45:13 -0700 Subject: [PATCH 2/4] Remove use of skipIf --- python/pyspark/mllib/tests.py | 15 ++++++++++++++- python/pyspark/tests.py | 3 ++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 37ccf1d590743..54e7a0a357cd8 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -158,7 +158,6 @@ def test_regression(self): self.assertTrue(rr_model.predict(features[3]) > 0) -@unittest.skipIf(not _have_scipy, "SciPy not installed") class SciPyTests(PySparkTestCase): """ Test both vector operations and MLlib algorithms with SciPy sparse matrices, @@ -166,6 +165,8 @@ class SciPyTests(PySparkTestCase): """ def test_serialize(self): + if not _have_scipy: + return from scipy.sparse import lil_matrix lil = lil_matrix((4, 1)) lil[1, 0] = 1 @@ -182,6 +183,8 @@ def test_serialize(self): self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil.todok()))) def test_dot(self): + if not _have_scipy: + return from scipy.sparse import lil_matrix lil = lil_matrix((4, 1)) lil[1, 0] = 1 @@ -196,6 +199,8 @@ def test_dot(self): self.assertTrue(array_equal(array([3., 6., 9., 12.]), _dot(lil, mat))) def test_squared_distance(self): + if not _have_scipy: + return from scipy.sparse import lil_matrix lil = lil_matrix((4, 1)) lil[1, 0] = 3 @@ -208,6 +213,8 @@ def test_squared_distance(self): self.assertEquals(15.0, _squared_distance(sv, lil)) def scipy_matrix(self, size, values): + if not _have_scipy: + return """Create a column SciPy matrix from a dictionary of values""" from scipy.sparse import lil_matrix lil = lil_matrix((size, 1)) @@ -216,6 +223,8 @@ def scipy_matrix(self, size, values): return lil def test_clustering(self): + if not _have_scipy: + return from pyspark.mllib.clustering import KMeans data = [ self.scipy_matrix(3, {1: 1.0}), @@ -228,6 +237,8 @@ def test_clustering(self): self.assertEquals(clusters.predict(data[2]), clusters.predict(data[3])) def test_classification(self): + if not _have_scipy: + return from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes data = [ LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})), @@ -257,6 +268,8 @@ def test_classification(self): self.assertTrue(nb_model.predict(features[3]) > 0) def test_regression(self): + if not _have_scipy: + return from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \ RidgeRegressionWithSGD data = [ diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 1f2a6ea941cf2..ff1b9f2087e05 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -363,11 +363,12 @@ def test_single_script_on_cluster(self): self.assertIn("[2, 4, 6]", out) -@unittest.skipIf(not _have_scipy, "SciPy not installed") class SciPyTests(PySparkTestCase): """General PySpark tests that depend on scipy """ def test_serialize(self): + if _not_have_scipy: + return from scipy.special import gammaln x = range(1, 5) expected = map(gammaln, x) From 1b835dd8d4251aa06c6119846fafdf3f4f53d752 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 10 Jun 2014 00:45:36 -0700 Subject: [PATCH 3/4] TEMPORARY CHANGE --- dev/run-tests | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/dev/run-tests b/dev/run-tests index 93d6692f83ca8..7452c29fbb5ab 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -46,30 +46,6 @@ fi set -e set -o pipefail -echo "=========================================================================" -echo "Running Apache RAT checks" -echo "=========================================================================" -dev/check-license - -echo "=========================================================================" -echo "Running Scala style checks" -echo "=========================================================================" -dev/scalastyle - -echo "=========================================================================" -echo "Running Spark unit tests" -echo "=========================================================================" -# echo "q" is needed because sbt on encountering a build file with failure -# (either resolution or compilation) prompts the user for input either q, r, -# etc to quit or retry. This echo is there to make it not block. -if [ -n "$_RUN_SQL_TESTS" ]; then - echo -e "q\n" | SPARK_HIVE=true sbt/sbt clean assembly test | \ - grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" -else - echo -e "q\n" | sbt/sbt clean assembly test | \ - grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" -fi - echo "=========================================================================" echo "Running PySpark tests" echo "=========================================================================" From 4623de4cfc3cb82043f24d95edd7259dcc7718b0 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 10 Jun 2014 00:52:42 -0700 Subject: [PATCH 4/4] small fix --- python/pyspark/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index ff1b9f2087e05..45eaf0f239822 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -367,7 +367,7 @@ class SciPyTests(PySparkTestCase): """General PySpark tests that depend on scipy """ def test_serialize(self): - if _not_have_scipy: + if not _have_scipy: return from scipy.special import gammaln x = range(1, 5)