merging master #9

Merged
145 commits merged on Dec 8, 2020
Commits
95b6dab
[SPARK-33287][SS][UI] Expose state custom metrics information on SS UI
gaborgsomogyi Nov 24, 2020
665817b
[SPARK-33457][PYTHON] Adjust mypy configuration
zero323 Nov 25, 2020
01321bc
[SPARK-33252][PYTHON][DOCS] Migration to NumPy documentation style in…
zero323 Nov 25, 2020
d1b4f06
[SPARK-33494][SQL][AQE] Do not use local shuffle reader for repartition
cloud-fan Nov 25, 2020
b7f034d
[SPARK-33543][SQL] Migrate SHOW COLUMNS command to use UnresolvedTabl…
imback82 Nov 25, 2020
edab094
[SPARK-33224][SS][WEBUI] Add watermark gap information into SS UI page
HeartSaVioR Nov 25, 2020
c3ce970
[SPARK-33533][SQL] Fix the regression bug that ConnectionProviders do…
sarutak Nov 25, 2020
781e19c
[SPARK-33477][SQL] Hive Metastore support filter by date type
wangyum Nov 25, 2020
19f3b89
[SPARK-33549][SQL] Remove configuration spark.sql.legacy.allowCastNum…
gengliangwang Nov 25, 2020
2c5cc36
[SPARK-33509][SQL] List partition by names from a V2 table which supp…
MaxGekk Nov 25, 2020
7c59aee
[SPARK-27194][SPARK-29302][SQL] Fix commit collision in dynamic parti…
WinkerDu Nov 25, 2020
6f68ccf
[SPARK-31257][SPARK-33561][SQL] Unify create table syntax
rdblue Nov 25, 2020
d691d85
[SPARK-33496][SQL] Improve error message of ANSI explicit cast
gengliangwang Nov 25, 2020
9643eab
[SPARK-33540][SQL] Subexpression elimination for interpreted predicate
viirya Nov 25, 2020
7cf6a6f
[SPARK-31257][SPARK-33561][SQL][FOLLOWUP] Fix Scala 2.13 compilation
dongjoon-hyun Nov 25, 2020
1de3fc4
[SPARK-33525][SQL] Update hive-service-rpc to 3.1.2
wangyum Nov 25, 2020
c529426
[SPARK-33565][BUILD][PYTHON] remove python3.8 and fix breakage
shaneknapp Nov 25, 2020
fb7b870
[SPARK-33523][SQL][TEST][FOLLOWUP] Fix benchmark case name in SubExpr…
viirya Nov 25, 2020
919ea45
[SPARK-33562][UI] Improve the style of the checkbox in executor page
gengliangwang Nov 26, 2020
ed9e6fc
[SPARK-33565][INFRA][FOLLOW-UP] Keep the test coverage with Python 3.…
HyukjinKwon Nov 26, 2020
dfa3978
[SPARK-33551][SQL] Do not use custom shuffle reader for repartition
maryannxue Nov 26, 2020
d082ad0
[SPARK-33563][PYTHON][R][SQL] Expose inverse hyperbolic trig function…
zero323 Nov 27, 2020
433ae90
[SPARK-33566][CORE][SQL][SS][PYTHON] Make unescapedQuoteHandling opti…
LuciferYang Nov 27, 2020
8792280
[SPARK-33575][SQL] Fix misleading exception for "ANALYZE TABLE ... FO…
imback82 Nov 27, 2020
2c41d9d
[SPARK-33522][SQL] Improve exception messages while handling Unresolv…
imback82 Nov 27, 2020
e432550
[SPARK-28645][SQL] ParseException is thrown when the window is redefined
beliefer Nov 27, 2020
b9f2f78
[SPARK-33498][SQL] Datetime parsing should fail if the input string c…
leanken-zz Nov 27, 2020
35ded12
[SPARK-33141][SQL] Capture SQL configs when creating permanent views
luluorta Nov 27, 2020
13fd272
Spelling r common dev mlib external project streaming resource manage…
jsoref Nov 27, 2020
cf98a76
[SPARK-33570][SQL][TESTS] Set the proper version of gssapi plugin aut…
sarutak Nov 28, 2020
3650a6b
[SPARK-33580][CORE] resolveDependencyPaths should use classifier attr…
viirya Nov 28, 2020
bfe9380
[MINOR][SQL] Remove `getTables()` from `r.SQLUtils`
MaxGekk Nov 29, 2020
ba178f8
[SPARK-33581][SQL][TEST] Refactor HivePartitionFilteringSuite
wangyum Nov 29, 2020
b94ff1e
[SPARK-33590][DOCS][SQL] Add missing sub-bullets in Spark SQL Guide
kiszk Nov 29, 2020
c8286ec
[SPARK-33587][CORE] Kill the executor on nested fatal errors
zsxwing Nov 29, 2020
0054fc9
[SPARK-33588][SQL] Respect the `spark.sql.caseSensitive` config while…
MaxGekk Nov 29, 2020
a088a80
[SPARK-33585][SQL][DOCS] Fix the comment for `SQLContext.tables()` an…
MaxGekk Nov 29, 2020
3d54774
[SPARK-33517][SQL][DOCS] Fix the correct menu items and page links in…
liucht-inspur Nov 30, 2020
f93d439
[SPARK-33589][SQL] Close opened session if the initialization fails
wangyum Nov 30, 2020
a5e13ac
[SPARK-33582][SQL] Hive Metastore support filter by not-equals
wangyum Nov 30, 2020
feda729
[SPARK-33567][SQL] DSv2: Use callback instead of passing Spark sessio…
sunchao Nov 30, 2020
4851453
[MINOR] Spelling bin core docs external mllib repl
jsoref Nov 30, 2020
2da7259
[SPARK-32976][SQL] Support column list in INSERT statement
yaooqinn Nov 30, 2020
0fd9f57
[SPARK-33448][SQL] Support CACHE/UNCACHE TABLE commands for v2 tables
imback82 Nov 30, 2020
225c2e2
[SPARK-33498][SQL][FOLLOW-UP] Deduplicate the unittest by using check…
leanken-zz Nov 30, 2020
b665d58
[SPARK-28646][SQL] Fix bug of Count so as consistent with mainstream …
beliefer Nov 30, 2020
5cfbddd
[SPARK-33480][SQL] Support char/varchar type
cloud-fan Nov 30, 2020
6e5446e
[SPARK-33579][UI] Fix executor blank page behind proxy
Nov 30, 2020
0a612b6
[SPARK-33452][SQL] Support v2 SHOW PARTITIONS
MaxGekk Nov 30, 2020
6fd148f
[SPARK-33569][SQL] Remove getting partitions by an identifier prefix
MaxGekk Nov 30, 2020
030b313
[SPARK-33569][SPARK-33452][SQL][FOLLOWUP] Fix a build error in `ShowP…
MaxGekk Nov 30, 2020
f3c2583
[SPARK-33185][YARN][FOLLOW-ON] Leverage RM's RPC API instead of REST …
xkrogen Nov 30, 2020
c699435
[SPARK-33545][CORE] Support Fallback Storage during Worker decommission
dongjoon-hyun Nov 30, 2020
f5d2165
[SPARK-33440][CORE] Use current timestamp with warning log in HadoopF…
HeartSaVioR Nov 30, 2020
596fbc1
[SPARK-33556][ML] Add array_to_vector function for dataframe column
WeichenXu123 Dec 1, 2020
aeb3649
[SPARK-33613][PYTHON][TESTS] Replace deprecated APIs in pyspark tests
BryanCutler Dec 1, 2020
8016123
[SPARK-33592] Fix: Pyspark ML Validator params in estimatorParamMaps …
WeichenXu123 Dec 1, 2020
c50fcac
[SPARK-33607][SS][WEBUI] Input Rate timeline/histogram aren't rendere…
sarutak Dec 1, 2020
2af2da5
[SPARK-30900][SS] FileStreamSource: Avoid reading compact metadata lo…
HeartSaVioR Dec 1, 2020
1a042cc
[SPARK-33530][CORE] Support --archives and spark.archives option nati…
HyukjinKwon Dec 1, 2020
52e5cc4
[SPARK-27188][SS] FileStreamSink: provide a new option to have retent…
HeartSaVioR Dec 1, 2020
1034815
[SPARK-33572][SQL] Datetime building should fail if the year, month, …
waitinfuture Dec 1, 2020
e5bb293
[SPARK-32032][SS] Avoid infinite wait in driver because of KafkaConsu…
gaborgsomogyi Dec 1, 2020
d38883c
[SPARK-32405][SQL][FOLLOWUP] Throw Exception if provider is specified…
huaxingao Dec 1, 2020
9273d42
[SPARK-33045][SQL][FOLLOWUP] Support built-in function like_any and f…
beliefer Dec 1, 2020
cf4ad21
[SPARK-33503][SQL] Refactor SortOrder class to allow multiple childrens
prakharjain09 Dec 1, 2020
478fb7f
[SPARK-33608][SQL] Handle DELETE/UPDATE/MERGE in PullupCorrelatedPred…
aokolnychyi Dec 1, 2020
c24f2b2
[SPARK-33612][SQL] Add dataSourceRewriteRules batch to Optimizer
aokolnychyi Dec 1, 2020
5d0045e
[SPARK-33611][UI] Avoid encoding twice on the query parameter of rewr…
gengliangwang Dec 1, 2020
5a1c5ac
[SPARK-33622][R][ML] Add array_to_vector to SparkR
zero323 Dec 1, 2020
f71f345
[SPARK-33544][SQL] Optimize size of CreateArray/CreateMap to be the s…
tgravescs Dec 2, 2020
51ebcd9
[SPARK-32863][SS] Full outer stream-stream join
c21 Dec 2, 2020
a4788ee
[MINOR][SS] Rename auxiliary protected methods in StreamingJoinSuite
c21 Dec 2, 2020
290aa02
[SPARK-33618][CORE] Use hadoop-client instead of hadoop-client-api to…
dongjoon-hyun Dec 2, 2020
084d38b
[SPARK-33557][CORE][MESOS][TEST] Ensure the relationship between STOR…
LuciferYang Dec 2, 2020
28dad1b
[SPARK-33504][CORE] The application log in the Spark history server c…
echohlne Dec 2, 2020
df8d3f1
[SPARK-33544][SQL][FOLLOW-UP] Rename NoSideEffect to NoThrow and clar…
HyukjinKwon Dec 2, 2020
58583f7
[SPARK-33619][SQL] Fix GetMapValueUtil code generation error
leanken-zz Dec 2, 2020
91182d6
[SPARK-33626][K8S][TEST] Allow k8s integration tests to assert both d…
ScrapCodes Dec 2, 2020
a082f46
[SPARK-33071][SPARK-33536][SQL] Avoid changing dataset_id of LogicalP…
Ngone51 Dec 2, 2020
b76c6b7
[SPARK-33627][SQL] Add new function UNIX_SECONDS, UNIX_MILLIS and UNI…
gengliangwang Dec 2, 2020
92bfbcb
[SPARK-33631][DOCS][TEST] Clean up spark.core.connection.ack.wait.tim…
LuciferYang Dec 2, 2020
f94cb53
[MINOR][INFRA] Use the latest image for GitHub Action jobs
dongjoon-hyun Dec 3, 2020
4f96670
[SPARK-31953][SS] Add Spark Structured Streaming History Server Support
uncleGen Dec 3, 2020
90d4d7d
[SPARK-33610][ML] Imputer transform skip duplicate head() job
zhengruifeng Dec 3, 2020
878cc0e
[SPARK-32896][SS][FOLLOW-UP] Rename the API to `toTable`
xuanyuanking Dec 3, 2020
0880989
[SPARK-22798][PYTHON][ML][FOLLOWUP] Add labelsArray to PySpark String…
viirya Dec 3, 2020
3b2ff16
[SPARK-33636][PYTHON][ML][FOLLOWUP] Update since tag of labelsArray i…
viirya Dec 3, 2020
ff13f57
[SPARK-20044][SQL] Add new function DATE_FROM_UNIX_DATE and UNIX_DATE
gengliangwang Dec 3, 2020
512fb32
[SPARK-26218][SQL][FOLLOW UP] Fix the corner case of codegen when cas…
luluorta Dec 3, 2020
0706e64
[SPARK-30098][SQL] Add a configuration to use default datasource as p…
cloud-fan Dec 3, 2020
bd71186
[SPARK-33629][PYTHON] Make spark.buffer.size configuration visible on…
gaborgsomogyi Dec 3, 2020
aa13e20
[SPARK-33623][SQL] Add canDeleteWhere to SupportsDelete
aokolnychyi Dec 3, 2020
63f9d47
[SPARK-33634][SQL][TESTS] Use Analyzer in PlanResolutionSuite
cloud-fan Dec 3, 2020
7e759b2
[SPARK-33520][ML][PYSPARK] make CrossValidator/TrainValidateSplit/One…
WeichenXu123 Dec 4, 2020
8594958
[SPARK-33650][SQL] Fix the error from ALTER TABLE .. ADD/DROP PARTITI…
MaxGekk Dec 4, 2020
29e415d
[SPARK-33649][SQL][DOC] Improve the doc of spark.sql.ansi.enabled
gengliangwang Dec 4, 2020
e22ddb6
[SPARK-32405][SQL][FOLLOWUP] Remove USING _ in CREATE TABLE in JDBCTa…
huaxingao Dec 4, 2020
e02324f
[SPARK-33142][SPARK-33647][SQL] Store SQL text for SQL temp view
linhongliu-db Dec 4, 2020
15579ba
[SPARK-33430][SQL] Support namespaces in JDBC v2 Table Catalog
huaxingao Dec 4, 2020
e838066
[SPARK-33658][SQL] Suggest using Datetime conversion functions for in…
gengliangwang Dec 4, 2020
94c144b
[SPARK-33571][SQL][DOCS] Add a ref to INT96 config from the doc for `…
MaxGekk Dec 4, 2020
325abf7
[SPARK-33577][SS] Add support for V1Table in stream writer table API …
xuanyuanking Dec 4, 2020
91baab7
[SPARK-33656][TESTS] Add option to keep container after tests finish …
sarutak Dec 4, 2020
976e897
[SPARK-33640][TESTS] Extend connection timeout to DB server for DB2In…
sarutak Dec 4, 2020
233a849
[SPARK-27237][SS] Introduce State schema validation among query restart
HeartSaVioR Dec 4, 2020
990bee9
[SPARK-33615][K8S] Make 'spark.archives' working in Kubernates
HyukjinKwon Dec 4, 2020
acc211d
[SPARK-33141][SQL][FOLLOW-UP] Store the max nested view depth in Anal…
cloud-fan Dec 4, 2020
d671e05
[SPARK-33660][DOCS][SS] Fix Kafka Headers Documentation
Gschiavon Dec 4, 2020
de9818f
[SPARK-33662][BUILD] Setting version to 3.2.0-SNAPSHOT
dongjoon-hyun Dec 4, 2020
b6b45bc
[SPARK-33141][SQL][FOLLOW-UP] Fix Scala 2.13 compilation
dongjoon-hyun Dec 4, 2020
960d6af
[SPARK-33472][SQL][FOLLOW-UP] Update RemoveRedundantSorts comment
allisonwang-db Dec 4, 2020
1b4e35d
[SPARK-33651][SQL] Allow CREATE EXTERNAL TABLE with LOCATION for data…
cloud-fan Dec 5, 2020
154f604
[MINOR] Fix string interpolation in CommandUtils.scala and KafkaDataC…
imback82 Dec 6, 2020
6317ba2
[SPARK-33668][K8S][TEST] Fix flaky test "Verify logging configuration…
ScrapCodes Dec 6, 2020
e857e06
[SPARK-33652][SQL] DSv2: DeleteFrom should refresh cache
sunchao Dec 6, 2020
5250841
[SPARK-33256][PYTHON][DOCS] Clarify PySpark follows NumPy documentati…
HyukjinKwon Dec 6, 2020
4829781
[SPARK-33667][SQL] Respect the `spark.sql.caseSensitive` config while…
MaxGekk Dec 6, 2020
b94ecf0
[SPARK-33674][TEST] Show Slowpoke notifications in SBT tests
gatorsmile Dec 6, 2020
119539f
[SPARK-33663][SQL] Uncaching should not be called on non-existing tem…
imback82 Dec 7, 2020
e32de29
[SPARK-33675][INFRA] Add GitHub Action job to publish snapshot
dongjoon-hyun Dec 7, 2020
29096a8
[SPARK-33670][SQL] Verify the partition provider is Hive in v1 SHOW T…
MaxGekk Dec 7, 2020
e88f0d4
[SPARK-33683][INFRA] Remove -Djava.version=11 from Scala 2.13 build i…
sarutak Dec 7, 2020
73412ff
[SPARK-33680][SQL][TESTS] Fix PrunePartitionSuiteBase/BucketedReadWit…
dongjoon-hyun Dec 7, 2020
d48ef34
[SPARK-33684][BUILD] Upgrade httpclient from 4.5.6 to 4.5.13
sarutak Dec 7, 2020
87c0560
[SPARK-33671][SQL] Remove VIEW checks from V1 table commands
MaxGekk Dec 7, 2020
26c0493
[SPARK-33676][SQL] Require exact matching of partition spec to the sc…
MaxGekk Dec 7, 2020
1e0c006
[SPARK-33617][SQL] Add default parallelism configuration for Spark SQ…
wangyum Dec 7, 2020
d730b6b
[SPARK-32680][SQL] Don't Preprocess V2 CTAS with Unresolved Query
linhongliu-db Dec 7, 2020
da72b87
[SPARK-33641][SQL] Invalidate new char/varchar types in public APIs t…
yaooqinn Dec 7, 2020
c62b84a
[MINOR] Spelling sql not core
jsoref Dec 7, 2020
6aff215
[SPARK-33693][SQL] deprecate spark.sql.hive.convertCTAS
cloud-fan Dec 7, 2020
c0874ba
[SPARK-33480][SQL][FOLLOWUP] do not expose user data in error message
cloud-fan Dec 7, 2020
02508b6
[SPARK-33621][SQL] Add a way to inject data source rewrite rules
aokolnychyi Dec 7, 2020
e4d1c10
[SPARK-32320][PYSPARK] Remove mutable default arguments
Fokko Dec 8, 2020
b2a7930
[SPARK-33680][SQL][TESTS][FOLLOWUP] Fix more test suites to have expl…
dongjoon-hyun Dec 8, 2020
ebd8b93
[SPARK-33609][ML] word2vec reduce broadcast size
zhengruifeng Dec 8, 2020
8bcebfa
[SPARK-33698][BUILD][TESTS] Fix the build error of OracleIntegrationS…
sarutak Dec 8, 2020
5aefc49
[SPARK-33664][SQL] Migrate ALTER TABLE ... RENAME TO to use Unresolve…
imback82 Dec 8, 2020
3a6546d
[MINOR][INFRA] Add -Pdocker-integration-tests to GitHub Action Scala …
dongjoon-hyun Dec 8, 2020
031c5ef
[SPARK-33679][SQL] Enable spark.sql.adaptive.enabled by default
dongjoon-hyun Dec 8, 2020
99613cd
[SPARK-33677][SQL] Skip LikeSimplification rule if pattern contains a…
luluorta Dec 8, 2020
2b30dde
[SPARK-33688][SQL] Migrate SHOW TABLE EXTENDED to new resolution fram…
MaxGekk Dec 8, 2020
c05ee06
[SPARK-33685][SQL] Migrate DROP VIEW command to use UnresolvedView to…
imback82 Dec 8, 2020
a093d6f
[MINOR] Spelling sql/core
jsoref Dec 8, 2020
4 changes: 2 additions & 2 deletions .github/workflows/build_and_test.yml
@@ -153,7 +153,7 @@ jobs:
name: "Build modules: ${{ matrix.modules }}"
runs-on: ubuntu-20.04
container:
image: dongjoon/apache-spark-github-action-image:20201015
image: dongjoon/apache-spark-github-action-image:20201025
strategy:
fail-fast: false
matrix:
@@ -414,7 +414,7 @@ jobs:
- name: Build with SBT
run: |
./dev/change-scala-version.sh 2.13
./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Djava.version=11 -Pscala-2.13 compile test:compile
./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pscala-2.13 compile test:compile

hadoop-2:
name: Hadoop 2 build with SBT
30 changes: 30 additions & 0 deletions .github/workflows/publish_snapshot.yml
@@ -0,0 +1,30 @@
name: Publish Snapshot

on:
schedule:
- cron: '0 0 * * *'

jobs:
publish-snapshot:
runs-on: ubuntu-latest
steps:
- name: Checkout Spark repository
uses: actions/checkout@master
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: snapshot-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
snapshot-maven-
- name: Install Java 8
uses: actions/setup-java@v1
with:
java-version: 8
- name: Publish snapshot
env:
ASF_USERNAME: ${{ secrets.NEXUS_USER }}
ASF_PASSWORD: ${{ secrets.NEXUS_PW }}
GPG_KEY: "not_used"
GPG_PASSPHRASE: "not_used"
run: ./dev/create-release/release-build.sh publish-snapshot
2 changes: 1 addition & 1 deletion R/CRAN_RELEASE.md
@@ -25,7 +25,7 @@ To release SparkR as a package to CRAN, we would use the `devtools` package. Ple

First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control.

Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. Also note that for CRAN checks for pdf vignettes to success, `qpdf` tool must be there (to install it, eg. `yum -q -y install qpdf`).
Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. Also note that for CRAN checks for pdf vignettes to success, `qpdf` tool must be there (to install it, e.g. `yum -q -y install qpdf`).

To upload a release, we would need to update the `cran-comments.md`. This should generally contain the results from running the `check-cran.sh` script along with comments on status of all `WARNING` (should not be any) or `NOTE`. As a part of `check-cran.sh` and the release process, the vignettes is build - make sure `SPARK_HOME` is set and Spark jars are accessible.
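As an aside (not part of the diff above): the fuller check recommended here can also be driven from the `devtools` package this document already mentions. The lines below are a hedged sketch only — the package path and helper calls are assumptions about a typical Spark checkout, not something this PR adds.

# Sketch: build the SparkR source package manually, then run the full
# R CMD check (including the manual/vignette checks, which need qpdf) on it.
library(devtools)
pkg_tarball <- build("R/pkg")     # builds the source package, returns the .tar.gz path
check_built(pkg_tarball)          # equivalent to running R CMD check on that tarball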

2 changes: 1 addition & 1 deletion R/install-dev.bat
@@ -26,7 +26,7 @@ MKDIR %SPARK_HOME%\R\lib

rem When you pass the package path directly as an argument to R CMD INSTALL,
rem it takes the path as 'C:\projects\spark\R\..\R\pkg"' as an example at
rem R 4.0. To work around this, directly go to the directoy and install it.
rem R 4.0. To work around this, directly go to the directory and install it.
rem See also SPARK-32074
pushd %SPARK_HOME%\R\pkg\
R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib" .
2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
Package: SparkR
Type: Package
Version: 3.1.0
Version: 3.2.0
Title: R Front End for 'Apache Spark'
Description: Provides an R Front end for 'Apache Spark' <https://spark.apache.org>.
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
4 changes: 4 additions & 0 deletions R/pkg/NAMESPACE
@@ -202,6 +202,7 @@ exportMethods("%<=>%",
"%in%",
"abs",
"acos",
"acosh",
"add_months",
"alias",
"approx_count_distinct",
@@ -222,6 +223,7 @@ exportMethods("%<=>%",
"array_remove",
"array_repeat",
"array_sort",
"array_to_vector",
"array_transform",
"arrays_overlap",
"array_union",
@@ -232,8 +234,10 @@ exportMethods("%<=>%",
"asc_nulls_last",
"ascii",
"asin",
"asinh",
"assert_true",
"atan",
"atanh",
"atan2",
"avg",
"base64",
6 changes: 3 additions & 3 deletions R/pkg/R/DataFrame.R
@@ -2772,7 +2772,7 @@ setMethod("merge",
#' Creates a list of columns by replacing the intersected ones with aliases
#'
#' Creates a list of columns by replacing the intersected ones with aliases.
#' The name of the alias column is formed by concatanating the original column name and a suffix.
#' The name of the alias column is formed by concatenating the original column name and a suffix.
#'
#' @param x a SparkDataFrame
#' @param intersectedColNames a list of intersected column names of the SparkDataFrame
@@ -3231,7 +3231,7 @@ setMethod("describe",
#' \item stddev
#' \item min
#' \item max
#' \item arbitrary approximate percentiles specified as a percentage (eg, "75\%")
#' \item arbitrary approximate percentiles specified as a percentage (e.g., "75\%")
#' }
#' If no statistics are given, this function computes count, mean, stddev, min,
#' approximate quartiles (percentiles at 25\%, 50\%, and 75\%), and max.
@@ -3743,7 +3743,7 @@ setMethod("histogram",
#'
#' @param x a SparkDataFrame.
#' @param url JDBC database url of the form \code{jdbc:subprotocol:subname}.
#' @param tableName yhe name of the table in the external database.
#' @param tableName the name of the table in the external database.
#' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore'
#' save mode (it is 'error' by default)
#' @param ... additional JDBC database connection properties.
4 changes: 2 additions & 2 deletions R/pkg/R/RDD.R
@@ -970,7 +970,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical",
MAXINT)))))
# If the first sample didn't turn out large enough, keep trying to
# take samples; this shouldn't happen often because we use a big
# multiplier for thei initial size
# multiplier for the initial size
while (length(samples) < total)
samples <- collectRDD(sampleRDD(x, withReplacement, fraction,
as.integer(ceiling(stats::runif(1,
@@ -1512,7 +1512,7 @@ setMethod("glom",
#'
#' @param x An RDD.
#' @param y An RDD.
#' @return a new RDD created by performing the simple union (witout removing
#' @return a new RDD created by performing the simple union (without removing
#' duplicates) of two input RDDs.
#' @examples
#'\dontrun{
2 changes: 1 addition & 1 deletion R/pkg/R/SQLContext.R
@@ -203,7 +203,7 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) {
})
}

# SPAKR-SQL does not support '.' in column name, so replace it with '_'
# SPARK-SQL does not support '.' in column name, so replace it with '_'
# TODO(davies): remove this once SPARK-2775 is fixed
names <- lapply(names, function(n) {
nn <- gsub(".", "_", n, fixed = TRUE)
4 changes: 2 additions & 2 deletions R/pkg/R/WindowSpec.R
@@ -54,7 +54,7 @@ setMethod("show", "WindowSpec",
#' Defines the partitioning columns in a WindowSpec.
#'
#' @param x a WindowSpec.
#' @param col a column to partition on (desribed by the name or Column).
#' @param col a column to partition on (described by the name or Column).
#' @param ... additional column(s) to partition on.
#' @return A WindowSpec.
#' @rdname partitionBy
@@ -231,7 +231,7 @@ setMethod("rangeBetween",
#' @rdname over
#' @name over
#' @aliases over,Column,WindowSpec-method
#' @family colum_func
#' @family column_func
#' @examples
#' \dontrun{
#' df <- createDataFrame(mtcars)
16 changes: 8 additions & 8 deletions R/pkg/R/column.R
@@ -135,7 +135,7 @@ createMethods()
#' @rdname alias
#' @name alias
#' @aliases alias,Column-method
#' @family colum_func
#' @family column_func
#' @examples
#' \dontrun{
#' df <- createDataFrame(iris)
@@ -161,7 +161,7 @@ setMethod("alias",
#'
#' @rdname substr
#' @name substr
#' @family colum_func
#' @family column_func
#' @aliases substr,Column-method
#'
#' @param x a Column.
@@ -187,7 +187,7 @@ setMethod("substr", signature(x = "Column"),
#'
#' @rdname startsWith
#' @name startsWith
#' @family colum_func
#' @family column_func
#' @aliases startsWith,Column-method
#'
#' @param x vector of character string whose "starts" are considered
@@ -206,7 +206,7 @@ setMethod("startsWith", signature(x = "Column"),
#'
#' @rdname endsWith
#' @name endsWith
#' @family colum_func
#' @family column_func
#' @aliases endsWith,Column-method
#'
#' @param x vector of character string whose "ends" are considered
@@ -224,7 +224,7 @@ setMethod("endsWith", signature(x = "Column"),
#'
#' @rdname between
#' @name between
#' @family colum_func
#' @family column_func
#' @aliases between,Column-method
#'
#' @param x a Column
@@ -251,7 +251,7 @@ setMethod("between", signature(x = "Column"),
# nolint end
#' @rdname cast
#' @name cast
#' @family colum_func
#' @family column_func
#' @aliases cast,Column-method
#'
#' @examples
@@ -300,7 +300,7 @@ setMethod("%in%",
#' Can be a single value or a Column.
#' @rdname otherwise
#' @name otherwise
#' @family colum_func
#' @family column_func
#' @aliases otherwise,Column-method
#' @note otherwise since 1.5.0
setMethod("otherwise",
@@ -440,7 +440,7 @@ setMethod("withField",
#' )
#'
#' # However, if you are going to add/replace multiple nested fields,
#' # it is preffered to extract out the nested struct before
#' # it is preferred to extract out the nested struct before
#' # adding/replacing multiple fields e.g.
#' head(
#' withColumn(
4 changes: 2 additions & 2 deletions R/pkg/R/context.R
@@ -86,7 +86,7 @@ makeSplits <- function(numSerializedSlices, length) {
# For instance, for numSerializedSlices of 22, length of 50
# [1] 0 0 2 2 4 4 6 6 6 9 9 11 11 13 13 15 15 15 18 18 20 20 22 22 22
# [26] 25 25 27 27 29 29 31 31 31 34 34 36 36 38 38 40 40 40 43 43 45 45 47 47 47
# Notice the slice group with 3 slices (ie. 6, 15, 22) are roughly evenly spaced.
# Notice the slice group with 3 slices (i.e. 6, 15, 22) are roughly evenly spaced.
# We are trying to reimplement the calculation in the positions method in ParallelCollectionRDD
if (numSerializedSlices > 0) {
unlist(lapply(0: (numSerializedSlices - 1), function(x) {
@@ -116,7 +116,7 @@ makeSplits <- function(numSerializedSlices, length) {
#' This change affects both createDataFrame and spark.lapply.
#' In the specific one case that it is used to convert R native object into SparkDataFrame, it has
#' always been kept at the default of 1. In the case the object is large, we are explicitly setting
#' the parallism to numSlices (which is still 1).
#' the parallelism to numSlices (which is still 1).
#'
#' Specifically, we are changing to split positions to match the calculation in positions() of
#' ParallelCollectionRDD in Spark.
2 changes: 1 addition & 1 deletion R/pkg/R/deserialize.R
@@ -250,7 +250,7 @@ readDeserializeWithKeysInArrow <- function(inputCon) {

keys <- readMultipleObjects(inputCon)

# Read keys to map with each groupped batch later.
# Read keys to map with each grouped batch later.
list(keys = keys, data = data)
}

69 changes: 66 additions & 3 deletions R/pkg/R/functions.R
@@ -144,7 +144,7 @@ NULL
#' @param y Column to compute on.
#' @param pos In \itemize{
#' \item \code{locate}: a start position of search.
#' \item \code{overlay}: a start postiton for replacement.
#' \item \code{overlay}: a start position for replacement.
#' }
#' @param len In \itemize{
#' \item \code{lpad} the maximum length of each output result.
@@ -357,7 +357,13 @@ NULL
#' @examples
#' \dontrun{
#' df <- read.df("data/mllib/sample_libsvm_data.txt", source = "libsvm")
#' head(select(df, vector_to_array(df$features)))
#' head(
#' withColumn(
#' withColumn(df, "array", vector_to_array(df$features)),
#' "vector",
#' array_to_vector(column("array"))
#' )
#' )
#' }
NULL

@@ -455,6 +461,19 @@ setMethod("acos",
column(jc)
})

#' @details
#' \code{acosh}: Computes inverse hyperbolic cosine of the input column.
#'
#' @rdname column_math_functions
#' @aliases acosh acosh,Column-method
#' @note acosh since 3.1.0
setMethod("acosh",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "acosh", x@jc)
column(jc)
})

#' @details
#' \code{approx_count_distinct}: Returns the approximate number of distinct items in a group.
#'
@@ -522,6 +541,19 @@ setMethod("asin",
column(jc)
})

#' @details
#' \code{asinh}: Computes inverse hyperbolic sine of the input column.
#'
#' @rdname column_math_functions
#' @aliases asinh asinh,Column-method
#' @note asinh since 3.1.0
setMethod("asinh",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "asinh", x@jc)
column(jc)
})

#' @details
#' \code{atan}: Returns the inverse tangent of the given value,
#' as if computed by \code{java.lang.Math.atan()}
@@ -536,6 +568,19 @@ setMethod("atan",
column(jc)
})

#' @details
#' \code{atanh}: Computes inverse hyperbolic tangent of the input column.
#'
#' @rdname column_math_functions
#' @aliases atanh atanh,Column-method
#' @note atanh since 3.1.0
setMethod("atanh",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "atanh", x@jc)
column(jc)
})

#' avg
#'
#' Aggregate function: returns the average of the values in a group.
@@ -2879,7 +2924,7 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"),
})

#' @details
#' \code{shiftRightUnsigned}: (Unigned) shifts the given value numBits right. If the given value is
#' \code{shiftRightUnsigned}: (Unsigned) shifts the given value numBits right. If the given value is
#' a long value, it will return a long value else it will return an integer value.
#'
#' @rdname column_math_functions
@@ -4570,6 +4615,24 @@ setMethod("timestamp_seconds",
column(jc)
})

#' @details
#' \code{array_to_vector} Converts a column of array of numeric type into
#' a column of dense vectors in MLlib
#'
#' @rdname column_ml_functions
#' @aliases array_to_vector array_to_vector,Column-method
#' @note array_to_vector since 3.1.0
setMethod("array_to_vector",
signature(x = "Column"),
function(x) {
jc <- callJStatic(
"org.apache.spark.ml.functions",
"array_to_vector",
x@jc
)
column(jc)
})

#' @details
#' \code{vector_to_array} Converts a column of MLlib sparse/dense vectors into
#' a column of dense arrays.
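For orientation only — the lines below are not part of the diff, just a minimal SparkR sketch of how the new functions introduced by this pull request (acosh, asinh, atanh from SPARK-33563, and array_to_vector from SPARK-33556/SPARK-33622) might be used. The data frame and column names are made up for illustration.

# Hedged usage sketch; assumes SparkR is attached and a local session is running.
library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(x = c(0.1, 0.5, 0.9)))

# New inverse hyperbolic functions
head(select(df, asinh(df$x), atanh(df$x), acosh(lit(2))))

# New ML helper: turn an array-of-numeric column into an MLlib dense vector column
df2 <- withColumn(df, "arr", create_array(df$x, df$x))
head(withColumn(df2, "vec", array_to_vector(column("arr"))))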