Skip to content

Commit

Permalink
[SPARK-44125][R] Support Java 21 in SparkR
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR aims to support Java 21 in SparkR. The Arrow-related issues will be fixed when we upgrade the Arrow library. Also, the following JIRA is created to re-enable them even in Java 21.
- SPARK-44127 Reenable `test_sparkSQL_arrow.R` in Java 21

### Why are the changes needed?

To be ready for Java 21.

### Does this PR introduce _any_ user-facing change?

No, this is additional support.

### How was this patch tested?

Pass the CIs and do manual tests.

```
$ java -version
openjdk version "21-ea" 2023-09-19
OpenJDK Runtime Environment (build 21-ea+27-2343)
OpenJDK 64-Bit Server VM (build 21-ea+27-2343, mixed mode, sharing)

$ build/sbt test:package -Psparkr -Phive

$ R/install-dev.sh; R/run-tests.sh
...
══ Skipped ═════════════════════════════════════════════════════════════════════
1. createDataFrame/collect Arrow optimization ('test_sparkSQL_arrow.R:29:3') - Reason: sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21") is TRUE

2. createDataFrame/collect Arrow optimization - many partitions (partition order test) ('test_sparkSQL_arrow.R:47:3') - Reason: sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21") is TRUE

3. createDataFrame/collect Arrow optimization - type specification ('test_sparkSQL_arrow.R:54:3') - Reason: sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21") is TRUE

4. dapply() Arrow optimization ('test_sparkSQL_arrow.R:79:3') - Reason: sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21") is TRUE

5. dapply() Arrow optimization - type specification ('test_sparkSQL_arrow.R:114:3') - Reason: sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21") is TRUE

6. dapply() Arrow optimization - type specification (date and timestamp) ('test_sparkSQL_arrow.R:144:3') - Reason: sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21") is TRUE

7. gapply() Arrow optimization ('test_sparkSQL_arrow.R:154:3') - Reason: sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21") is TRUE

8. gapply() Arrow optimization - type specification ('test_sparkSQL_arrow.R:198:3') - Reason: sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21") is TRUE

9. gapply() Arrow optimization - type specification (date and timestamp) ('test_sparkSQL_arrow.R:231:3') - Reason: sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21") is TRUE

10. Arrow optimization - unsupported types ('test_sparkSQL_arrow.R:243:3') - Reason: sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21") is TRUE

11. SPARK-32478: gapply() Arrow optimization - error message for schema mismatch ('test_sparkSQL_arrow.R:255:3') - Reason: sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21") is TRUE

12. SPARK-43789: Automatically pick the number of partitions based on Arrow batch size ('test_sparkSQL_arrow.R:265:3') - Reason: sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21") is TRUE

13. sparkJars tag in SparkContext ('test_Windows.R:22:5') - Reason: This test is only for Windows, skipped

══ DONE ════════════════════════════════════════════════════════════════════════
...
* DONE

Status: 2 NOTEs
See
  ‘/Users/dongjoon/APACHE/spark-merge/R/SparkR.Rcheck/00check.log’
for details.

+ popd
Tests passed.
```

Closes #41680 from dongjoon-hyun/SPARK-44125.

Authored-by: Dongjoon Hyun <dongjoon@apache.org>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
  • Loading branch information
dongjoon-hyun authored and pull[bot] committed Apr 30, 2024
1 parent 37b67ae commit 7556aa1
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 2 deletions.
6 changes: 4 additions & 2 deletions R/pkg/R/client.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,10 @@ checkJavaVersion <- function() {
}, javaVersionOut)

javaVersionStr <- strsplit(javaVersionFilter[[1]], '"', fixed = TRUE)[[1L]][2]
# javaVersionStr is of the form 1.8.0_92/9.0.x/11.0.x.
# We are using 8, 9, 10, 11 for sparkJavaVersion.
# javaVersionStr is of the form 1.8.0_92/11.0.x/17.0.x/21-ea/21
# We are using 8, 11, 17, and 21 for sparkJavaVersion.
javaVersionStr <- strsplit(javaVersionStr, "-ea", fixed = TRUE)[[1L]]

versions <- strsplit(javaVersionStr, ".", fixed = TRUE)[[1L]]
if ("1" == versions[1]) {
javaVersionNum <- as.integer(versions[2])
Expand Down
24 changes: 24 additions & 0 deletions R/pkg/tests/fulltests/test_sparkSQL_arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ sparkSession <- sparkR.session(

test_that("createDataFrame/collect Arrow optimization", {
skip_if_not_installed("arrow")
# TODO(SPARK-44127) Reenable test_sparkSQL_arrow.R in Java 21
skip_if(sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21"))

conf <- callJMethod(sparkSession, "conf")
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
Expand All @@ -43,12 +45,16 @@ test_that("createDataFrame/collect Arrow optimization", {

test_that("createDataFrame/collect Arrow optimization - many partitions (partition order test)", {
skip_if_not_installed("arrow")
# TODO(SPARK-44127) Reenable test_sparkSQL_arrow.R in Java 21
skip_if(sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21"))
expect_equal(collect(createDataFrame(mtcars, numPartitions = 32)),
collect(createDataFrame(mtcars, numPartitions = 1)))
})

test_that("createDataFrame/collect Arrow optimization - type specification", {
skip_if_not_installed("arrow")
# TODO(SPARK-44127) Reenable test_sparkSQL_arrow.R in Java 21
skip_if(sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21"))
rdf <- data.frame(list(list(a = 1,
b = "a",
c = TRUE,
Expand All @@ -73,6 +79,8 @@ test_that("createDataFrame/collect Arrow optimization - type specification", {

test_that("dapply() Arrow optimization", {
skip_if_not_installed("arrow")
# TODO(SPARK-44127) Reenable test_sparkSQL_arrow.R in Java 21
skip_if(sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21"))
df <- createDataFrame(mtcars)

conf <- callJMethod(sparkSession, "conf")
Expand Down Expand Up @@ -107,6 +115,8 @@ test_that("dapply() Arrow optimization", {

test_that("dapply() Arrow optimization - type specification", {
skip_if_not_installed("arrow")
# TODO(SPARK-44127) Reenable test_sparkSQL_arrow.R in Java 21
skip_if(sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21"))
# Note that regular dapply() seems not supporting date and timestamps
# whereas Arrow-optimized dapply() does.
rdf <- data.frame(list(list(a = 1,
Expand Down Expand Up @@ -136,6 +146,8 @@ test_that("dapply() Arrow optimization - type specification", {

test_that("dapply() Arrow optimization - type specification (date and timestamp)", {
skip_if_not_installed("arrow")
# TODO(SPARK-44127) Reenable test_sparkSQL_arrow.R in Java 21
skip_if(sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21"))
rdf <- data.frame(list(list(a = as.Date("1990-02-24"),
b = as.POSIXct("1990-02-24 12:34:56"))))
df <- createDataFrame(rdf)
Expand All @@ -145,6 +157,8 @@ test_that("dapply() Arrow optimization - type specification (date and timestamp)

test_that("gapply() Arrow optimization", {
skip_if_not_installed("arrow")
# TODO(SPARK-44127) Reenable test_sparkSQL_arrow.R in Java 21
skip_if(sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21"))
df <- createDataFrame(mtcars)

conf <- callJMethod(sparkSession, "conf")
Expand Down Expand Up @@ -188,6 +202,8 @@ test_that("gapply() Arrow optimization", {

test_that("gapply() Arrow optimization - type specification", {
skip_if_not_installed("arrow")
# TODO(SPARK-44127) Reenable test_sparkSQL_arrow.R in Java 21
skip_if(sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21"))
# Note that regular gapply() seems not supporting date and timestamps
# whereas Arrow-optimized gapply() does.
rdf <- data.frame(list(list(a = 1,
Expand Down Expand Up @@ -220,6 +236,8 @@ test_that("gapply() Arrow optimization - type specification", {

test_that("gapply() Arrow optimization - type specification (date and timestamp)", {
skip_if_not_installed("arrow")
# TODO(SPARK-44127) Reenable test_sparkSQL_arrow.R in Java 21
skip_if(sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21"))
rdf <- data.frame(list(list(a = as.Date("1990-02-24"),
b = as.POSIXct("1990-02-24 12:34:56"))))
df <- createDataFrame(rdf)
Expand All @@ -231,6 +249,8 @@ test_that("gapply() Arrow optimization - type specification (date and timestamp)

test_that("Arrow optimization - unsupported types", {
skip_if_not_installed("arrow")
# TODO(SPARK-44127) Reenable test_sparkSQL_arrow.R in Java 21
skip_if(sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21"))

expect_error(checkSchemaInArrow(structType("a FLOAT")), "not support float type")
expect_error(checkSchemaInArrow(structType("a BINARY")), "not support binary type")
Expand All @@ -242,6 +262,8 @@ test_that("Arrow optimization - unsupported types", {

test_that("SPARK-32478: gapply() Arrow optimization - error message for schema mismatch", {
skip_if_not_installed("arrow")
# TODO(SPARK-44127) Reenable test_sparkSQL_arrow.R in Java 21
skip_if(sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21"))
df <- createDataFrame(list(list(a = 1L, b = "a")))

expect_error(
Expand All @@ -251,6 +273,8 @@ test_that("SPARK-32478: gapply() Arrow optimization - error message for schema m

test_that("SPARK-43789: Automatically pick the number of partitions based on Arrow batch size", {
skip_if_not_installed("arrow")
# TODO(SPARK-44127) Reenable test_sparkSQL_arrow.R in Java 21
skip_if(sparkR.callJStatic("org.apache.spark.util.Utils", "isJavaVersionAtLeast21"))

conf <- callJMethod(sparkSession, "conf")
maxRecordsPerBatch <- sparkR.conf("spark.sql.execution.arrow.maxRecordsPerBatch")[[1]]
Expand Down

0 comments on commit 7556aa1

Please sign in to comment.