Merge branch 'master' of git://git.apache.org/spark into SPARK-3802
sarutak committed Oct 7, 2014
2 parents 7090e17 + 70e824f commit 8b64bb7
Showing 20 changed files with 457 additions and 279 deletions.
32 changes: 30 additions & 2 deletions dev/run-tests
@@ -24,6 +24,16 @@ cd "$FWDIR"
# Remove work directory
rm -rf ./work

source "$FWDIR/dev/run-tests-codes.sh"

CURRENT_BLOCK=$BLOCK_GENERAL

function handle_error () {
echo "[error] Got a return code of $? on line $1 of the run-tests script."
exit $CURRENT_BLOCK
}


# Build against the right version of Hadoop.
{
if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then
@@ -91,33 +101,43 @@ if [ -n "$AMPLAB_JENKINS" ]; then
fi
fi

# Fail fast
set -e
set -o pipefail
trap 'handle_error $LINENO' ERR

echo ""
echo "========================================================================="
echo "Running Apache RAT checks"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_RAT

./dev/check-license

echo ""
echo "========================================================================="
echo "Running Scala style checks"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_SCALA_STYLE

./dev/lint-scala

echo ""
echo "========================================================================="
echo "Running Python style checks"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_PYTHON_STYLE

./dev/lint-python

echo ""
echo "========================================================================="
echo "Building Spark"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_BUILD

{
# We always build with Hive because the PySpark Spark SQL tests need it.
BUILD_MVN_PROFILE_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive"
@@ -141,6 +161,8 @@ echo "========================================================================="
echo "Running Spark unit tests"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS

{
# If the Spark SQL tests are enabled, run the tests with the Hive profiles enabled.
# This must be a single argument, as it is.
@@ -175,10 +197,16 @@ echo ""
echo "========================================================================="
echo "Running PySpark tests"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_PYSPARK_UNIT_TESTS

./python/run-tests

echo ""
echo "========================================================================="
echo "Detecting binary incompatibilites with MiMa"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_MIMA

./dev/mima
27 changes: 27 additions & 0 deletions dev/run-tests-codes.sh
@@ -0,0 +1,27 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

readonly BLOCK_GENERAL=10
readonly BLOCK_RAT=11
readonly BLOCK_SCALA_STYLE=12
readonly BLOCK_PYTHON_STYLE=13
readonly BLOCK_BUILD=14
readonly BLOCK_SPARK_UNIT_TESTS=15
readonly BLOCK_PYSPARK_UNIT_TESTS=16
readonly BLOCK_MIMA=17
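
These constants are the other half of the block-level error handling added to dev/run-tests above: CURRENT_BLOCK is updated as each phase starts, and the ERR trap exits with that value, so the script's exit status identifies the failing phase. A minimal sketch, not part of this commit, of how an external caller could decode that status; the helper and its block-name strings are hypothetical, and it assumes it runs from a Spark checkout:

import subprocess

# Hypothetical mapping mirroring run-tests-codes.sh above.
BLOCK_NAMES = {
    10: "general checks",        # BLOCK_GENERAL
    11: "Apache RAT checks",     # BLOCK_RAT
    12: "Scala style checks",    # BLOCK_SCALA_STYLE
    13: "Python style checks",   # BLOCK_PYTHON_STYLE
    14: "the build",             # BLOCK_BUILD
    15: "Spark unit tests",      # BLOCK_SPARK_UNIT_TESTS
    16: "PySpark unit tests",    # BLOCK_PYSPARK_UNIT_TESTS
    17: "MiMa checks",           # BLOCK_MIMA
}

result = subprocess.run(["./dev/run-tests"])
if result.returncode == 0:
    print("all checks passed")
else:
    name = BLOCK_NAMES.get(result.returncode, "an unknown block")
    print("run-tests failed during %s (exit code %d)"
          % (name, result.returncode))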
102 changes: 69 additions & 33 deletions dev/run-tests-jenkins
@@ -26,9 +26,23 @@
FWDIR="$(cd `dirname $0`/..; pwd)"
cd "$FWDIR"

source "$FWDIR/dev/run-tests-codes.sh"

COMMENTS_URL="https://api.github.com/repos/apache/spark/issues/$ghprbPullId/comments"
PULL_REQUEST_URL="https://github.com/apache/spark/pull/$ghprbPullId"

# Important Environment Variables
# ---
# $ghprbActualCommit
#+ This is the hash of the most recent commit in the PR.
#+ The merge-base of this and master is the commit from which the PR was branched.
# $sha1
#+ If the patch merges cleanly, this is a reference to the merge commit hash
#+ (e.g. "origin/pr/2606/merge").
#+ If the patch does not merge cleanly, it is equal to $ghprbActualCommit.
#+ The merge-base of this and master in the case of a clean merge is the most recent commit
#+ against master.

COMMIT_URL="https://github.com/apache/spark/commit/${ghprbActualCommit}"
# GitHub doesn't auto-link short hashes when submitted via the API, unfortunately. :(
SHORT_COMMIT_HASH="${ghprbActualCommit:0:7}"
@@ -84,42 +98,46 @@ function post_message () {
fi
}


# We diff master...$ghprbActualCommit because that gets us changes introduced in the PR
#+ and not anything else added to master since the PR was branched.

# check PR merge-ability and check for new public classes
{
if [ "$sha1" == "$ghprbActualCommit" ]; then
merge_note=" * This patch **does not** merge cleanly!"
merge_note=" * This patch **does not merge cleanly**."
else
merge_note=" * This patch merges cleanly."
fi

source_files=$(
git diff master...$ghprbActualCommit --name-only `# diff patch against master from branch point` \
| grep -v -e "\/test" `# ignore files in test directories` \
| grep -e "\.py$" -e "\.java$" -e "\.scala$" `# include only code files` \
| tr "\n" " "
)
new_public_classes=$(
git diff master...$ghprbActualCommit ${source_files} `# diff patch against master from branch point` \
| grep "^\+" `# filter in only added lines` \
| sed -r -e "s/^\+//g" `# remove the leading +` \
| grep -e "trait " -e "class " `# filter in lines with these key words` \
| grep -e "{" -e "(" `# filter in lines with these key words, too` \
| grep -v -e "\@\@" -e "private" `# exclude lines with these words` \
| grep -v -e "^// " -e "^/\*" -e "^ \* " `# exclude comment lines` \
| sed -r -e "s/\{.*//g" `# remove from the { onwards` \
| sed -r -e "s/\}//g" `# just in case, remove }; they mess the JSON` \
| sed -r -e "s/\"/\\\\\"/g" `# escape double quotes; they mess the JSON` \
| sed -r -e "s/^(.*)$/\`\1\`/g" `# surround with backticks for style` \
| sed -r -e "s/^/ \* /g" `# prepend ' *' to start of line` \
| sed -r -e "s/$/\\\n/g" `# append newline to end of line` \
| tr -d "\n" `# remove actual LF characters`
)

source_files=$(
git diff master... --name-only `# diff patch against master from branch point` \
| grep -v -e "\/test" `# ignore files in test directories` \
| grep -e "\.py$" -e "\.java$" -e "\.scala$" `# include only code files` \
| tr "\n" " "
)
new_public_classes=$(
git diff master... ${source_files} `# diff patch against master from branch point` \
| grep "^\+" `# filter in only added lines` \
| sed -r -e "s/^\+//g" `# remove the leading +` \
| grep -e "trait " -e "class " `# filter in lines with these key words` \
| grep -e "{" -e "(" `# filter in lines with these key words, too` \
| grep -v -e "\@\@" -e "private" `# exclude lines with these words` \
| grep -v -e "^// " -e "^/\*" -e "^ \* " `# exclude comment lines` \
| sed -r -e "s/\{.*//g" `# remove from the { onwards` \
| sed -r -e "s/\}//g" `# just in case, remove }; they mess the JSON` \
| sed -r -e "s/\"/\\\\\"/g" `# escape double quotes; they mess the JSON` \
| sed -r -e "s/^(.*)$/\`\1\`/g" `# surround with backticks for style` \
| sed -r -e "s/^/ \* /g" `# prepend ' *' to start of line` \
| sed -r -e "s/$/\\\n/g" `# append newline to end of line` \
| tr -d "\n" `# remove actual LF characters`
)

if [ "$new_public_classes" == "" ]; then
public_classes_note=" * This patch adds no public classes."
else
public_classes_note=" * This patch adds the following public classes _(experimental)_:"
public_classes_note="${public_classes_note}\n${new_public_classes}"
fi
if [ -z "$new_public_classes" ]; then
public_classes_note=" * This patch adds no public classes."
else
public_classes_note=" * This patch adds the following public classes _(experimental)_:"
public_classes_note="${public_classes_note}\n${new_public_classes}"
fi
}
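The three-dot syntax in the block above is worth a note: git diff master...<commit> diffs <commit> against the merge-base of master and <commit>, so only the PR's own changes appear, exactly as the comment before the block says. A minimal illustrative sketch of that equivalence, not part of this commit; the branch name is made up, and it assumes it runs inside a git checkout:

import subprocess

def pr_changed_files(base, commit):
    # `git diff base...commit` is shorthand for diffing commit against
    # the merge-base, i.e. the point where the PR branched off.
    merge_base = subprocess.check_output(
        ["git", "merge-base", base, commit]).decode().strip()
    out = subprocess.check_output(
        ["git", "diff", "--name-only", merge_base, commit]).decode()
    return out.splitlines()

# Same file list as `git diff --name-only master...my-pr-branch`;
# "my-pr-branch" is a hypothetical branch name.
print(pr_changed_files("master", "my-pr-branch"))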

@@ -147,12 +165,30 @@ function post_message () {

post_message "$fail_message"
exit $test_result
elif [ "$test_result" -eq "0" ]; then
test_result_note=" * This patch **passes all tests**."
else
if [ "$test_result" -eq "0" ]; then
test_result_note=" * This patch **passes** unit tests."
if [ "$test_result" -eq "$BLOCK_GENERAL" ]; then
failing_test="some tests"
elif [ "$test_result" -eq "$BLOCK_RAT" ]; then
failing_test="RAT tests"
elif [ "$test_result" -eq "$BLOCK_SCALA_STYLE" ]; then
failing_test="Scala style tests"
elif [ "$test_result" -eq "$BLOCK_PYTHON_STYLE" ]; then
failing_test="Python style tests"
elif [ "$test_result" -eq "$BLOCK_BUILD" ]; then
failing_test="to build"
elif [ "$test_result" -eq "$BLOCK_SPARK_UNIT_TESTS" ]; then
failing_test="Spark unit tests"
elif [ "$test_result" -eq "$BLOCK_PYSPARK_UNIT_TESTS" ]; then
failing_test="PySpark unit tests"
elif [ "$test_result" -eq "$BLOCK_MIMA" ]; then
failing_test="MiMa tests"
else
test_result_note=" * This patch **fails** unit tests."
failing_test="some tests"
fi

test_result_note=" * This patch **fails $failing_test**."
fi
}

@@ -75,6 +75,8 @@ abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double
def predict(testData: Vector): Double = {
predictPoint(testData, weights, intercept)
}

override def toString() = "(weights=%s, intercept=%s)".format(weights, intercept)
}

/**
7 changes: 0 additions & 7 deletions python/docs/modules.rst

This file was deleted.

1 change: 1 addition & 0 deletions python/pyspark/context.py
@@ -410,6 +410,7 @@ def sequenceFile(self, path, keyClass=None, valueClass=None, keyConverter=None,
Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS,
a local file system (available on all nodes), or any Hadoop-supported file system URI.
The mechanism is as follows:
1. A Java RDD is created from the SequenceFile or other InputFormat, and the key
and value Writable classes
2. Serialization is attempted via Pyrolite pickling
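For context, a minimal PySpark sketch of reading a SequenceFile through the mechanism this docstring describes; the HDFS path is hypothetical and the Writable classes are left to be inferred:

from pyspark import SparkContext

sc = SparkContext(appName="sequenceFileRead")
# Keys and values arrive as Python objects after the Java-side read and
# the Pyrolite pickling described above; "hdfs:///tmp/pairs" is made up.
pairs = sc.sequenceFile("hdfs:///tmp/pairs")
print(pairs.take(5))
sc.stop()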
26 changes: 16 additions & 10 deletions python/pyspark/mllib/classification.py
@@ -89,11 +89,14 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
@param regParam: The regularizer parameter (default: 1.0).
@param regType: The type of regularizer used for training
our model.
Allowed values: "l1" for using L1Updater,
"l2" for using
SquaredL2Updater,
"none" for no regularizer.
(default: "none")
:Allowed values:
- "l1" for using L1Updater
- "l2" for using SquaredL2Updater
- "none" for no regularizer
(default: "none")
@param intercept: Boolean parameter which indicates the use
or not of the augmented representation for
training data (i.e. whether bias features
@@ -158,11 +161,14 @@ def train(cls, data, iterations=100, step=1.0, regParam=1.0,
@param initialWeights: The initial weights (default: None).
@param regType: The type of regularizer used for training
our model.
Allowed values: "l1" for using L1Updater,
"l2" for using
SquaredL2Updater,
"none" for no regularizer.
(default: "none")
:Allowed values:
- "l1" for using L1Updater
- "l2" for using SquaredL2Updater,
- "none" for no regularizer.
(default: "none")
@param intercept: Boolean parameter which indicates the use
or not of the augmented representation for
training data (i.e. whether bias features
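A minimal usage sketch for the regType parameter documented above, not part of this commit; the two-point training set is a toy made up for illustration:

from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext(appName="regTypeExample")
data = sc.parallelize([
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(1.0, [1.0, 0.0]),
])
# "l2" selects SquaredL2Updater; "l1" and "none" are the other choices.
model = LogisticRegressionWithSGD.train(data, iterations=10, regType="l2")
print(model.predict([1.0, 0.0]))
sc.stop()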
18 changes: 12 additions & 6 deletions python/pyspark/mllib/regression.py
@@ -22,7 +22,7 @@
from pyspark.mllib.linalg import SparseVector, _convert_to_vector
from pyspark.serializers import PickleSerializer, AutoBatchedSerializer

__all__ = ['LabeledPoint', 'LinearModel', 'LinearRegressionModel', 'RidgeRegressionModel'
__all__ = ['LabeledPoint', 'LinearModel', 'LinearRegressionModel', 'RidgeRegressionModel',
'LinearRegressionWithSGD', 'LassoWithSGD', 'RidgeRegressionWithSGD']


@@ -66,6 +66,9 @@ def weights(self):
def intercept(self):
return self._intercept

def __repr__(self):
return "(weights=%s, intercept=%s)" % (self._coeff, self._intercept)


class LinearRegressionModelBase(LinearModel):

@@ -152,11 +155,14 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
@param regParam: The regularizer parameter (default: 1.0).
@param regType: The type of regularizer used for training
our model.
Allowed values: "l1" for using L1Updater,
"l2" for using
SquaredL2Updater,
"none" for no regularizer.
(default: "none")
:Allowed values:
- "l1" for using L1Updater,
- "l2" for using SquaredL2Updater,
- "none" for no regularizer.
(default: "none")
@param intercept: Boolean parameter which indicates the use
or not of the augmented representation for
training data (i.e. whether bias features
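The __repr__ added to LinearModel above makes trained models print readably. A minimal sketch with toy data, not part of this commit:

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

sc = SparkContext(appName="reprExample")
data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0])])
model = LinearRegressionWithSGD.train(data, iterations=10)
print(model)  # via the new __repr__: "(weights=..., intercept=...)"
sc.stop()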
2 changes: 1 addition & 1 deletion python/pyspark/mllib/tests.py
@@ -32,7 +32,7 @@
from pyspark.serializers import PickleSerializer
from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
from pyspark.tests import PySparkTestCase
from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase


_have_scipy = False
1 change: 1 addition & 0 deletions python/pyspark/mllib/tree.py
@@ -48,6 +48,7 @@ def __del__(self):
def predict(self, x):
"""
Predict the label of one or more examples.
:param x: Data point (feature vector),
or an RDD of data points (feature vectors).
"""
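A minimal sketch of the dual-mode predict() documented above, called on a single vector and on an RDD; the toy data and trainClassifier arguments are assumptions for illustration, not part of this commit:

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

sc = SparkContext(appName="treePredict")
data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0])])
model = DecisionTree.trainClassifier(data, numClasses=2,
                                     categoricalFeaturesInfo={})
print(model.predict([1.0]))                                     # one point
print(model.predict(sc.parallelize([[0.0], [1.0]])).collect())  # an RDD
sc.stop()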
1 change: 1 addition & 0 deletions python/pyspark/rdd.py
@@ -1208,6 +1208,7 @@ def saveAsSequenceFile(self, path, compressionCodecClass=None):
Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file
system, using the L{org.apache.hadoop.io.Writable} types that we convert from the
RDD's key and value types. The mechanism is as follows:
1. Pyrolite is used to convert pickled Python RDD into RDD of Java objects.
2. Keys and values of this Java RDD are converted to Writables and written out.
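A minimal round-trip sketch for the mechanism this docstring describes; the output path is hypothetical and must not already exist:

from pyspark import SparkContext

sc = SparkContext(appName="seqFileRoundTrip")
pairs = sc.parallelize([(1, "a"), (2, "b")])
# Keys and values are converted to Writables on the Java side, as the
# docstring above describes; "/tmp/seq-demo" is a made-up output path.
pairs.saveAsSequenceFile("/tmp/seq-demo")
print(sc.sequenceFile("/tmp/seq-demo").collect())
sc.stop()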