From 83df25f1989ed86a4e8670f3449a9e2ac90e8323 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <pwendell@gmail.com>
Date: Tue, 10 Jun 2014 00:41:37 -0700
Subject: [PATCH 1/4] Support numpy 1.4

---
 python/pyspark/mllib/_common.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/mllib/_common.py b/python/pyspark/mllib/_common.py
index a411a5d5914e0..ca1620dabec22 100644
--- a/python/pyspark/mllib/_common.py
+++ b/python/pyspark/mllib/_common.py
@@ -454,7 +454,7 @@ def _squared_distance(v1, v2):
     v2 = _convert_vector(v2)
     if type(v1) == ndarray and type(v2) == ndarray:
         diff = v1 - v2
-        return diff.dot(diff)
+        return numpy.dot(diff, diff)
     elif type(v1) == ndarray:
         return v2.squared_distance(v1)
     else:
@@ -469,7 +469,9 @@ def _dot(vec, target):
     calling numpy.dot of the two vectors, but for SciPy ones, we
     have to transpose them because they're column vectors.
     """
-    if type(vec) == ndarray or type(vec) == SparseVector:
+    if type(vec) == ndarray:
+        return numpy.dot(vec, target)
+    elif type(vec) == SparseVector:
         return vec.dot(target)
     elif type(vec) == list:
         return _convert_vector(vec).dot(target)

From db9bfbdca1fa75d84611688a4491138113ac7ec6 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <pwendell@gmail.com>
Date: Tue, 10 Jun 2014 00:45:13 -0700
Subject: [PATCH 2/4] Remove use of skipIf

---
 python/pyspark/mllib/tests.py | 15 ++++++++++++++-
 python/pyspark/tests.py       |  3 ++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 37ccf1d590743..54e7a0a357cd8 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -158,7 +158,6 @@ def test_regression(self):
         self.assertTrue(rr_model.predict(features[3]) > 0)
 
 
-@unittest.skipIf(not _have_scipy, "SciPy not installed")
 class SciPyTests(PySparkTestCase):
     """
     Test both vector operations and MLlib algorithms with SciPy sparse matrices,
@@ -166,6 +165,8 @@ class SciPyTests(PySparkTestCase):
     """
 
     def test_serialize(self):
+        if not _have_scipy:
+            return
         from scipy.sparse import lil_matrix
         lil = lil_matrix((4, 1))
         lil[1, 0] = 1
@@ -182,6 +183,8 @@ def test_serialize(self):
         self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil.todok())))
 
     def test_dot(self):
+        if not _have_scipy:
+            return
         from scipy.sparse import lil_matrix
         lil = lil_matrix((4, 1))
         lil[1, 0] = 1
@@ -196,6 +199,8 @@ def test_dot(self):
         self.assertTrue(array_equal(array([3., 6., 9., 12.]), _dot(lil, mat)))
 
     def test_squared_distance(self):
+        if not _have_scipy:
+            return
         from scipy.sparse import lil_matrix
         lil = lil_matrix((4, 1))
         lil[1, 0] = 3
@@ -208,6 +213,8 @@ def test_squared_distance(self):
         self.assertEquals(15.0, _squared_distance(sv, lil))
 
     def scipy_matrix(self, size, values):
+        if not _have_scipy:
+            return
         """Create a column SciPy matrix from a dictionary of values"""
         from scipy.sparse import lil_matrix
         lil = lil_matrix((size, 1))
@@ -216,6 +223,8 @@ def scipy_matrix(self, size, values):
         return lil
 
     def test_clustering(self):
+        if not _have_scipy:
+            return
         from pyspark.mllib.clustering import KMeans
         data = [
             self.scipy_matrix(3, {1: 1.0}),
@@ -228,6 +237,8 @@ def test_clustering(self):
         self.assertEquals(clusters.predict(data[2]), clusters.predict(data[3]))
 
     def test_classification(self):
+        if not _have_scipy:
+            return
         from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
         data = [
             LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
@@ -257,6 +268,8 @@ def test_classification(self):
         self.assertTrue(nb_model.predict(features[3]) > 0)
 
     def test_regression(self):
+        if not _have_scipy:
+            return
         from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
             RidgeRegressionWithSGD
         data = [
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 1f2a6ea941cf2..ff1b9f2087e05 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -363,11 +363,12 @@ def test_single_script_on_cluster(self):
         self.assertIn("[2, 4, 6]", out)
 
 
-@unittest.skipIf(not _have_scipy, "SciPy not installed")
 class SciPyTests(PySparkTestCase):
     """General PySpark tests that depend on scipy """
 
     def test_serialize(self):
+        if _not_have_scipy: 
+            return
         from scipy.special import gammaln
         x = range(1, 5)
         expected = map(gammaln, x)

From 1b835dd8d4251aa06c6119846fafdf3f4f53d752 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <pwendell@gmail.com>
Date: Tue, 10 Jun 2014 00:45:36 -0700
Subject: [PATCH 3/4] TEMPORARY CHANGE

---
 dev/run-tests | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/dev/run-tests b/dev/run-tests
index 93d6692f83ca8..7452c29fbb5ab 100755
--- a/dev/run-tests
+++ b/dev/run-tests
@@ -46,30 +46,6 @@ fi
 set -e
 set -o pipefail
 
-echo "========================================================================="
-echo "Running Apache RAT checks"
-echo "========================================================================="
-dev/check-license
-
-echo "========================================================================="
-echo "Running Scala style checks"
-echo "========================================================================="
-dev/scalastyle
-
-echo "========================================================================="
-echo "Running Spark unit tests"
-echo "========================================================================="
-# echo "q" is needed because sbt on encountering a build file with failure 
-# (either resolution or compilation) prompts the user for input either q, r, 
-# etc to quit or retry. This echo is there to make it not block.
-if [ -n "$_RUN_SQL_TESTS" ]; then
-  echo -e "q\n" | SPARK_HIVE=true sbt/sbt clean assembly test | \
-    grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"
-else
-  echo -e "q\n" | sbt/sbt clean assembly test | \
-    grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"
-fi
-
 echo "========================================================================="
 echo "Running PySpark tests"
 echo "========================================================================="

From 4623de4cfc3cb82043f24d95edd7259dcc7718b0 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <pwendell@gmail.com>
Date: Tue, 10 Jun 2014 00:52:42 -0700
Subject: [PATCH 4/4] small fix

---
 python/pyspark/tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index ff1b9f2087e05..45eaf0f239822 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -367,7 +367,7 @@ class SciPyTests(PySparkTestCase):
     """General PySpark tests that depend on scipy """
 
     def test_serialize(self):
-        if _not_have_scipy: 
+        if not _have_scipy: 
             return
         from scipy.special import gammaln
         x = range(1, 5)