From c63e37ee64581abe5d3c639508627233acb2fd70 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Fri, 3 Jun 2022 10:25:40 -0700 Subject: [PATCH] [SPARK-39372][R] Support R 4.2.0 ### What changes were proposed in this pull request? This PR proposes: - Updates AppVeyor to use the latest R version 4.2.0. - Uses the correct way of checking if an object is a matrix: `is.matrix`. After R 4.2.0, `class(upperBoundsOnCoefficients) != "matrix")` fails: ``` -- 1. Error (test_mllib_classification.R:245:3): spark.logit ------------------- Error in `if (class(upperBoundsOnCoefficients) != "matrix") { stop("upperBoundsOnCoefficients must be a matrix.") }`: the condition has length > 1 ``` This fixes `spark.logit` when `lowerBoundsOnCoefficients` or `upperBoundsOnCoefficients` is specified. - Explicitly use the first element in `is.na` comparison. From R 4.2.0, it throws an exception as below: ``` Error in if (is.na(c(1, 2))) print("abc") : the condition has length > 1 ``` Previously it was a warning. This fixes `createDataFrame` or `as.DataFrame` when the data type is a nested complex type. ### Why are the changes needed? To support/test the latest R. R community tends to use the latest versions aggressively. ### Does this PR introduce _any_ user-facing change? Yes, after this PR, we officially support R 4.2.0 in SparkR. ### How was this patch tested? CI in this PR should test it out. Closes #36758 from HyukjinKwon/upgrade-r-appveyor. 
Lead-authored-by: Hyukjin Kwon Co-authored-by: Hyukjin Kwon Signed-off-by: Dongjoon Hyun --- R/pkg/R/mllib_classification.R | 4 ++-- R/pkg/R/serialize.R | 7 ++++++- dev/appveyor-install-dependencies.ps1 | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 093467ecf7d28..7204f8bb7dff4 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -322,7 +322,7 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") } if (!is.null(lowerBoundsOnCoefficients)) { - if (class(lowerBoundsOnCoefficients) != "matrix") { + if (!is.matrix(lowerBoundsOnCoefficients)) { stop("lowerBoundsOnCoefficients must be a matrix.") } row <- nrow(lowerBoundsOnCoefficients) @@ -331,7 +331,7 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") } if (!is.null(upperBoundsOnCoefficients)) { - if (class(upperBoundsOnCoefficients) != "matrix") { + if (!is.matrix(upperBoundsOnCoefficients)) { stop("upperBoundsOnCoefficients must be a matrix.") } diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R index 7760d9be16f0b..85c318f30c338 100644 --- a/R/pkg/R/serialize.R +++ b/R/pkg/R/serialize.R @@ -58,7 +58,12 @@ writeObject <- function(con, object, writeType = TRUE) { # Checking types is needed here, since 'is.na' only handles atomic vectors, # lists and pairlists if (type %in% c("integer", "character", "logical", "double", "numeric")) { - if (is.na(object)) { + if (is.na(object[[1]])) { # Uses the first element for now to keep the behavior same as R before # 4.2.0. This is wrong because we should differentiate c(NA) from a # single NA as the former means array(null) and the latter means null # in Spark SQL. However, it requires non-trivial comparison to distinguish # both in R. We should ideally fix this. 
object <- NULL type <- "NULL" } diff --git a/dev/appveyor-install-dependencies.ps1 b/dev/appveyor-install-dependencies.ps1 index d469c98fdb3a2..19b49b90b3859 100644 --- a/dev/appveyor-install-dependencies.ps1 +++ b/dev/appveyor-install-dependencies.ps1 @@ -129,7 +129,7 @@ $env:PATH = "$env:HADOOP_HOME\bin;" + $env:PATH Pop-Location # ========================== R -$rVer = "4.0.2" +$rVer = "4.2.0" $rToolsVer = "4.0.2" InstallR