Merge branch 'apache/master' into yh/pod-template
yifeih committed Aug 22, 2018
2 parents 1da79a8 + a998e9d commit 8ef756e
Showing 181 changed files with 3,359 additions and 779 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -77,6 +77,7 @@ target/
unit-tests.log
work/
docs/.jekyll-metadata
+*.crc

# For Hive
TempStatsStore/
2 changes: 0 additions & 2 deletions LICENSE-binary
@@ -228,7 +228,6 @@ org.apache.xbean:xbean-asm5-shaded
com.squareup.okhttp3:logging-interceptor
com.squareup.okhttp3:okhttp
com.squareup.okio:okio
-net.java.dev.jets3t:jets3t
org.apache.spark:spark-catalyst_2.11
org.apache.spark:spark-kvstore_2.11
org.apache.spark:spark-launcher_2.11
@@ -447,7 +446,6 @@ org.slf4j:jul-to-slf4j
org.slf4j:slf4j-api
org.slf4j:slf4j-log4j12
com.github.scopt:scopt_2.11
-org.bouncycastle:bcprov-jdk15on

core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js
core/src/main/resources/org/apache/spark/ui/static/*dataTables*
2 changes: 0 additions & 2 deletions NOTICE
@@ -26,5 +26,3 @@ The following provides more details on the included cryptographic software:
This software uses Apache Commons Crypto (https://commons.apache.org/proper/commons-crypto/) to
support authentication, and encryption and decryption of data sent across the network between
services.
-
-This software includes Bouncy Castle (http://bouncycastle.org/) to support the jets3t library.
21 changes: 0 additions & 21 deletions NOTICE-binary
@@ -27,8 +27,6 @@ This software uses Apache Commons Crypto (https://commons.apache.org/proper/comm
support authentication, and encryption and decryption of data sent across the network between
services.

-This software includes Bouncy Castle (http://bouncycastle.org/) to support the jets3t library.
-

// ------------------------------------------------------------------
// NOTICE file corresponding to the section 4d of The Apache License,
@@ -1162,25 +1160,6 @@ NonlinearMinimizer class in package breeze.optimize.proximal is distributed with
2015, Debasish Das (Verizon), all rights reserved.


-=========================================================================
-== NOTICE file corresponding to section 4(d) of the Apache License, ==
-== Version 2.0, in this case for the distribution of jets3t. ==
-=========================================================================
-
-This product includes software developed by:
-
-The Apache Software Foundation (http://www.apache.org/).
-
-The ExoLab Project (http://www.exolab.org/)
-
-Sun Microsystems (http://www.sun.com/)
-
-Codehaus (http://castor.codehaus.org)
-
-Tatu Saloranta (http://wiki.fasterxml.com/TatuSaloranta)
-
-
-
stream-lib
Copyright 2016 AddThis

2 changes: 2 additions & 0 deletions R/pkg/NAMESPACE
@@ -117,6 +117,7 @@ exportMethods("arrange",
"dropna",
"dtypes",
"except",
"exceptAll",
"explain",
"fillna",
"filter",
@@ -131,6 +132,7 @@ exportMethods("arrange",
"hint",
"insertInto",
"intersect",
"intersectAll",
"isLocal",
"isStreaming",
"join",
59 changes: 58 additions & 1 deletion R/pkg/R/DataFrame.R
@@ -2848,6 +2848,35 @@ setMethod("intersect",
dataFrame(intersected)
})

+#' intersectAll
+#'
+#' Return a new SparkDataFrame containing rows in both this SparkDataFrame
+#' and another SparkDataFrame while preserving the duplicates.
+#' This is equivalent to \code{INTERSECT ALL} in SQL. Also as standard in
+#' SQL, this function resolves columns by position (not by name).
+#'
+#' @param x a SparkDataFrame.
+#' @param y a SparkDataFrame.
+#' @return A SparkDataFrame containing the result of the intersect all operation.
+#' @family SparkDataFrame functions
+#' @aliases intersectAll,SparkDataFrame,SparkDataFrame-method
+#' @rdname intersectAll
+#' @name intersectAll
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' df1 <- read.json(path)
+#' df2 <- read.json(path2)
+#' intersectAllDF <- intersectAll(df1, df2)
+#' }
+#' @note intersectAll since 2.4.0
+setMethod("intersectAll",
+          signature(x = "SparkDataFrame", y = "SparkDataFrame"),
+          function(x, y) {
+            intersected <- callJMethod(x@sdf, "intersectAll", y@sdf)
+            dataFrame(intersected)
+          })
+
#' except
#'
#' Return a new SparkDataFrame containing rows in this SparkDataFrame
@@ -2867,7 +2896,6 @@ setMethod("intersect",
#' df2 <- read.json(path2)
#' exceptDF <- except(df, df2)
#' }
-#' @rdname except
#' @note except since 1.4.0
setMethod("except",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
@@ -2876,6 +2904,35 @@ setMethod("except",
dataFrame(excepted)
})

+#' exceptAll
+#'
+#' Return a new SparkDataFrame containing rows in this SparkDataFrame
+#' but not in another SparkDataFrame while preserving the duplicates.
+#' This is equivalent to \code{EXCEPT ALL} in SQL. Also as standard in
+#' SQL, this function resolves columns by position (not by name).
+#'
+#' @param x a SparkDataFrame.
+#' @param y a SparkDataFrame.
+#' @return A SparkDataFrame containing the result of the except all operation.
+#' @family SparkDataFrame functions
+#' @aliases exceptAll,SparkDataFrame,SparkDataFrame-method
+#' @rdname exceptAll
+#' @name exceptAll
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' df1 <- read.json(path)
+#' df2 <- read.json(path2)
+#' exceptAllDF <- exceptAll(df1, df2)
+#' }
+#' @note exceptAll since 2.4.0
+setMethod("exceptAll",
+          signature(x = "SparkDataFrame", y = "SparkDataFrame"),
+          function(x, y) {
+            excepted <- callJMethod(x@sdf, "exceptAll", y@sdf)
+            dataFrame(excepted)
+          })
+
#' Save the contents of SparkDataFrame to a data source.
#'
#' The data source is specified by the \code{source} and a set of options (...).
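To make the new multiset semantics concrete, here is a short illustrative SparkR snippet (assuming an active sparkR.session(); the data mirrors the new test case further down this commit). Unlike intersect() and except(), duplicate rows survive in the result.

```r
df1 <- createDataFrame(list(list("a", 1), list("a", 1), list("a", 1),
                            list("a", 1), list("b", 3), list("c", 4)),
                       schema = c("a", "b"))
df2 <- createDataFrame(list(list("a", 1), list("a", 1), list("b", 3)),
                       schema = c("a", "b"))
collect(intersectAll(df1, df2))  # ("a", 1) twice and ("b", 3) once
collect(exceptAll(df1, df2))     # ("a", 1) twice and ("c", 4) once
```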
6 changes: 6 additions & 0 deletions R/pkg/R/generics.R
@@ -471,6 +471,9 @@ setGeneric("explain", function(x, ...) { standardGeneric("explain") })
#' @rdname except
setGeneric("except", function(x, y) { standardGeneric("except") })

+#' @rdname exceptAll
+setGeneric("exceptAll", function(x, y) { standardGeneric("exceptAll") })
+
#' @rdname nafunctions
setGeneric("fillna", function(x, value, cols = NULL) { standardGeneric("fillna") })

@@ -495,6 +498,9 @@ setGeneric("insertInto", function(x, tableName, ...) { standardGeneric("insertIn
#' @rdname intersect
setGeneric("intersect", function(x, y) { standardGeneric("intersect") })

+#' @rdname intersectAll
+setGeneric("intersectAll", function(x, y) { standardGeneric("intersectAll") })
+
#' @rdname isLocal
setGeneric("isLocal", function(x) { standardGeneric("isLocal") })

4 changes: 2 additions & 2 deletions R/pkg/tests/fulltests/test_mllib_classification.R
@@ -382,10 +382,10 @@ test_that("spark.mlp", {
trainidxs <- base::sample(nrow(data), nrow(data) * 0.7)
traindf <- as.DataFrame(data[trainidxs, ])
testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
-model <- spark.mlp(traindf, clicked ~ ., layers = c(1, 3))
+model <- spark.mlp(traindf, clicked ~ ., layers = c(1, 2))
predictions <- predict(model, testdf)
expect_error(collect(predictions))
-model <- spark.mlp(traindf, clicked ~ ., layers = c(1, 3), handleInvalid = "skip")
+model <- spark.mlp(traindf, clicked ~ ., layers = c(1, 2), handleInvalid = "skip")
predictions <- predict(model, testdf)
expect_equal(class(collect(predictions)$clicked[1]), "list")

19 changes: 19 additions & 0 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -2482,6 +2482,25 @@ test_that("union(), unionByName(), rbind(), except(), and intersect() on a DataF
unlink(jsonPath2)
})

test_that("intersectAll() and exceptAll()", {
df1 <- createDataFrame(list(list("a", 1), list("a", 1), list("a", 1),
list("a", 1), list("b", 3), list("c", 4)),
schema = c("a", "b"))
df2 <- createDataFrame(list(list("a", 1), list("a", 1), list("b", 3)), schema = c("a", "b"))
intersectAllExpected <- data.frame("a" = c("a", "a", "b"), "b" = c(1, 1, 3),
stringsAsFactors = FALSE)
exceptAllExpected <- data.frame("a" = c("a", "a", "c"), "b" = c(1, 1, 4),
stringsAsFactors = FALSE)
intersectAllDf <- arrange(intersectAll(df1, df2), df1$a)
expect_is(intersectAllDf, "SparkDataFrame")
exceptAllDf <- arrange(exceptAll(df1, df2), df1$a)
expect_is(exceptAllDf, "SparkDataFrame")
intersectAllActual <- collect(intersectAllDf)
expect_identical(intersectAllActual, intersectAllExpected)
exceptAllActual <- collect(exceptAllDf)
expect_identical(exceptAllActual, exceptAllExpected)
})

test_that("withColumn() and withColumnRenamed()", {
df <- read.json(jsonPath)
newDF <- withColumn(df, "newAge", df$age + 2)
2 changes: 1 addition & 1 deletion R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -654,7 +654,7 @@ We use Titanic data set to show how to use `spark.mlp` in classification.
t <- as.data.frame(Titanic)
training <- createDataFrame(t)
# fit a Multilayer Perceptron Classification Model
-model <- spark.mlp(training, Survived ~ Age + Sex, blockSize = 128, layers = c(2, 3), solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, initialWeights = c( 0, 0, 0, 5, 5, 5, 9, 9, 9))
+model <- spark.mlp(training, Survived ~ Age + Sex, blockSize = 128, layers = c(2, 2), solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, initialWeights = c( 0, 0, 5, 5, 9, 9))
```

To avoid lengthy display, we only present partial results of the model summary. You can check the full result from your sparkR shell.
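A note on the arithmetic behind this hunk: the change appears to track the two-level Survived label. With layers = c(2, 2) the network takes 2 * 2 weights plus 2 biases = 6 initial values, which is why initialWeights shrinks from the 9 values (2 * 3 + 3) that the old c(2, 3) topology required.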
23 changes: 16 additions & 7 deletions bin/docker-image-tool.sh
@@ -71,6 +71,7 @@ function build {
)
local BASEDOCKERFILE=${BASEDOCKERFILE:-"$IMG_PATH/spark/Dockerfile"}
local PYDOCKERFILE=${PYDOCKERFILE:-"$IMG_PATH/spark/bindings/python/Dockerfile"}
+local RDOCKERFILE=${RDOCKERFILE:-"$IMG_PATH/spark/bindings/R/Dockerfile"}

docker build $NOCACHEARG "${BUILD_ARGS[@]}" \
-t $(image_ref spark) \
@@ -79,11 +80,16 @@ function build {
docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
-t $(image_ref spark-py) \
-f "$PYDOCKERFILE" .
+
+docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
+  -t $(image_ref spark-r) \
+  -f "$RDOCKERFILE" .
}

function push {
docker push "$(image_ref spark)"
docker push "$(image_ref spark-py)"
docker push "$(image_ref spark-r)"
}

function usage {
@@ -97,12 +103,13 @@ Commands:
push Push a pre-built image to a registry. Requires a repository address to be provided.
Options:
-  -f file  Dockerfile to build for JVM based Jobs. By default builds the Dockerfile shipped with Spark.
-  -p file  Dockerfile with Python baked in. By default builds the Dockerfile shipped with Spark.
-  -r repo  Repository address.
-  -t tag   Tag to apply to the built image, or to identify the image to be pushed.
-  -m       Use minikube's Docker daemon.
-  -n       Build docker image with --no-cache
+  -f file  Dockerfile to build for JVM based Jobs. By default builds the Dockerfile shipped with Spark.
+  -p file  Dockerfile to build for PySpark Jobs. Builds Python dependencies and ships with Spark.
+  -R file  Dockerfile to build for SparkR Jobs. Builds R dependencies and ships with Spark.
+  -r repo  Repository address.
+  -t tag   Tag to apply to the built image, or to identify the image to be pushed.
+  -m       Use minikube's Docker daemon.
+  -n       Build docker image with --no-cache
-b arg Build arg to build or push the image. For multiple build args, this option needs to
be used separately for each build arg.
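For orientation, a hypothetical pair of invocations exercising the new -R option (the repository, tag, and Dockerfile path below are invented for illustration, not taken from this commit):

```sh
# Build the JVM, PySpark, and SparkR images, overriding the R Dockerfile,
# then push all three under the same tag.
./bin/docker-image-tool.sh -r docker.io/myrepo -t v2.4.0 -R custom/Dockerfile.r build
./bin/docker-image-tool.sh -r docker.io/myrepo -t v2.4.0 push
```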
@@ -133,14 +140,16 @@ REPO=
TAG=
BASEDOCKERFILE=
PYDOCKERFILE=
+RDOCKERFILE=
NOCACHEARG=
BUILD_PARAMS=
-while getopts f:p:mr:t:n:b: option
+while getopts f:p:R:mr:t:n:b: option
do
case "${option}"
in
f) BASEDOCKERFILE=${OPTARG};;
p) PYDOCKERFILE=${OPTARG};;
+R) RDOCKERFILE=${OPTARG};;
r) REPO=${OPTARG};;
t) TAG=${OPTARG};;
n) NOCACHEARG="--no-cache";;
@@ -234,7 +234,7 @@ public void onComplete(String streamId) throws IOException {
callback.onSuccess(ByteBuffer.allocate(0));
} catch (Exception ex) {
IOException ioExc = new IOException("Failure post-processing complete stream;" +
" failing this rpc and leaving channel active");
" failing this rpc and leaving channel active", ex);
callback.onFailure(ioExc);
streamHandler.onFailure(streamId, ioExc);
}
@@ -42,7 +42,7 @@ public abstract class BlockTransferMessage implements Encodable {
/** Preceding every serialized message is its type, which allows us to deserialize it. */
public enum Type {
OPEN_BLOCKS(0), UPLOAD_BLOCK(1), REGISTER_EXECUTOR(2), STREAM_HANDLE(3), REGISTER_DRIVER(4),
-HEARTBEAT(5);
+HEARTBEAT(5), UPLOAD_BLOCK_STREAM(6);

private final byte id;

@@ -67,6 +67,7 @@ public static BlockTransferMessage fromByteBuffer(ByteBuffer msg) {
case 3: return StreamHandle.decode(buf);
case 4: return RegisterDriver.decode(buf);
case 5: return ShuffleServiceHeartbeat.decode(buf);
+case 6: return UploadBlockStream.decode(buf);
default: throw new IllegalArgumentException("Unknown message type: " + type);
}
}
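The two hunks above extend a simple tagged-union wire format: one leading byte names the message type, and fromByteBuffer dispatches on it to the matching decoder. A minimal standalone sketch of that pattern follows; the class and method names are illustrative, not Spark APIs.

```java
import java.nio.ByteBuffer;

// Sketch of the type-byte framing used by BlockTransferMessage: the
// encoder writes a one-byte tag ahead of the payload, and the decoder
// reads the tag back to pick the right deserializer.
final class TaggedFrame {
  static ByteBuffer encode(byte typeId, byte[] body) {
    ByteBuffer buf = ByteBuffer.allocate(1 + body.length);
    buf.put(typeId);   // tag first, payload second
    buf.put(body);
    buf.flip();        // rewind so readers start at the tag
    return buf;
  }

  static String decode(ByteBuffer msg) {
    byte type = msg.get();  // consume the tag, as fromByteBuffer does
    switch (type) {
      case 5: return "HEARTBEAT payload of " + msg.remaining() + " bytes";
      case 6: return "UPLOAD_BLOCK_STREAM payload of " + msg.remaining() + " bytes";
      default: throw new IllegalArgumentException("Unknown message type: " + type);
    }
  }
}
```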