diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5213c5277b23f..7fa5bdd729740 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -100,7 +100,7 @@ jobs: \"build\": \"$build\", \"pyspark\": \"$pyspark\", \"pyspark-pandas\": \"$pandas\", - \"sparkr\": \"false\", + \"sparkr\": \"$sparkr\", \"tpcds-1g\": \"$tpcds\", \"docker-integration-tests\": \"$docker\", \"scala-213\": \"$build\", @@ -712,6 +712,7 @@ jobs: apt-get update -y apt-get install -y ruby ruby-dev Rscript -e "install.packages(c('remotes', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')" + Rscript -e "remotes::install_version('ragg', version='1.2.5', repos='https://cloud.r-project.org')" Rscript -e "remotes::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" Rscript -e "remotes::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" gem install bundler -v 2.4.22 diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 2fe8817fdb388..69d44cec8ab76 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -546,6 +546,11 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) { error = function(e) { FALSE })) { obj <- get(nodeChar, envir = func.env, inherits = FALSE) if (is.function(obj)) { + if (is.primitive(obj)) { + # Primitive functions have no closure to clean. + assign(nodeChar, obj, envir = newEnv) + break + } # If the node is a function call. funcList <- mget(nodeChar, envir = checkedFuncs, inherits = F, ifnotfound = list(list(NULL)))[[1]] diff --git a/R/run-tests.sh b/R/run-tests.sh index 90a60eda03871..20442ca89117d 100755 --- a/R/run-tests.sh +++ b/R/run-tests.sh @@ -58,10 +58,11 @@ if [[ $FAILED != 0 || $NUM_TEST_WARNING != 0 ]]; then echo -en "\033[0m" # No color exit -1 else - # We have 2 NOTEs: for RoxygenNote and one in Jenkins only "No repository set" + # We have 3 NOTEs: for RoxygenNote, one in Jenkins only "No repository set", + # and "Lost braces" in Rd files due to R 4.4+ stricter checkRd # For non-latest version branches, one WARNING for package version - if [[ ($NUM_CRAN_WARNING != 0 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 2) && - ($HAS_PACKAGE_VERSION_WARN != 1 || $NUM_CRAN_WARNING != 1 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 1) ]]; then + if [[ ($NUM_CRAN_WARNING != 0 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 3) && + ($HAS_PACKAGE_VERSION_WARN != 1 || $NUM_CRAN_WARNING != 1 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 2) ]]; then cat $CRAN_CHECK_LOG_FILE echo -en "\033[31m" # Red echo "Had CRAN check errors; see logs." diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 42637942fa091..306b6f73fd160 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -19,10 +19,9 @@ # See also in https://hub.docker.com/_/ubuntu FROM ubuntu:jammy +ENV FULL_REFRESH_DATE 20260514 SHELL ["/bin/bash", "-o", "pipefail", "-c"] -ENV FULL_REFRESH_DATE 20260420 - ENV DEBIAN_FRONTEND noninteractive ENV DEBCONF_NONINTERACTIVE_SEEN true @@ -106,15 +105,12 @@ ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library RUN python3.8 -m pip install setuptools virtualenv RUN python3.9 -m pip install setuptools virtualenv -RUN python3.8 -m pip install --only-binary=pandas numpy pandas 'scipy<1.9' coverage 'matplotlib==3.7.2' 'mypy==0.982' -RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' 'scipy<=1.10' unittest-xml-reporting 'plotly>=4.8' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'blinker==1.4' 'mypy==0.982' +RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting 'plotly<6.0' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'Flask==1.1.2' 'Werkzeug==2.1.2' +RUN python3.8 -m pip install 'numpy' 'pyarrow==12.0.1' 'pandas<=2.0.3' 'scipy<=1.10' unittest-xml-reporting 'plotly>=4.8' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'blinker==1.4' 'mypy==0.982' 'beniget==0.4.1' 'pyproject-metadata==0.8.1' # Add Python deps for Spark Connect. RUN python3.9 -m pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' +RUN python3.8 -m pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' # Add torch as a testing dependency for TorchDistributor RUN python3.9 -m pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval - -# pyarrow -RUN python3.9 -m pip install 'pyarrow<13.0.0' -RUN python3.8 -m pip install 'pyarrow<13.0.0' diff --git a/python/mypy.ini b/python/mypy.ini index ef0ee36ef8543..43ba5c5b744d1 100644 --- a/python/mypy.ini +++ b/python/mypy.ini @@ -82,7 +82,7 @@ disallow_untyped_defs = False ; Allow untyped def and disable certain error codes in examples -[mypy-python.sql.udtf] +[mypy-sql.udtf] disallow_untyped_defs = False disable_error_code = attr-defined,arg-type,call-arg,union-attr @@ -166,6 +166,12 @@ ignore_missing_imports = True [mypy-grpc.*] ignore_missing_imports = True +[mypy-tornado.*] +ignore_missing_imports = True + +[mypy-xmlrunner.*] +ignore_missing_imports = True + ; pydantic is pulled in transitively (e.g. via mlflow). mypy has issues ; serializing pydantic v2's recursive JsonValue type, so skip following it. [mypy-pydantic.*] diff --git a/python/pyspark/ml/connect/classification.py b/python/pyspark/ml/connect/classification.py index f8b525db8edd6..33a7d09e9b824 100644 --- a/python/pyspark/ml/connect/classification.py +++ b/python/pyspark/ml/connect/classification.py @@ -43,8 +43,12 @@ from pyspark.ml.connect.io_utils import ParamsReadWrite, CoreModelReadWrite from pyspark.sql.functions import lit, count, countDistinct -import torch -import torch.nn as torch_nn +try: + import torch + import torch.nn as torch_nn +except ImportError: + torch = None # type: ignore[assignment] + torch_nn = None # type: ignore[assignment] class _LogisticRegressionParams( diff --git a/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py b/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py index 84d5829122af1..5601d6bfffbfd 100644 --- a/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +++ b/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py @@ -218,6 +218,7 @@ def test_save_load(self): loaded_model.transform(eval_df1.toPandas()) +@unittest.skipIf(not have_torch, "torch is required") class ClassificationTests(ClassificationTestsMixin, unittest.TestCase): def setUp(self) -> None: self.spark = SparkSession.builder.master("local[2]").getOrCreate() diff --git a/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py b/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py index 5fd4f6f16cfaf..ff5165e1cdc73 100644 --- a/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +++ b/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py @@ -164,6 +164,7 @@ def test_pipeline_copy(): assert lorv2.getOrDefault(lorv2.maxIter) == 200 +@unittest.skipIf(not have_torch, "torch is required") class PipelineTests(PipelineTestsMixin, unittest.TestCase): def setUp(self) -> None: self.spark = SparkSession.builder.master("local[2]").getOrCreate() diff --git a/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py b/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py index 0ade227540c7e..302deb5562121 100644 --- a/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +++ b/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py @@ -272,6 +272,7 @@ def test_crossvalidator_with_fold_col(self): cv.fit(train_dataset) +@unittest.skipIf(not have_torch, "torch is required") class CrossValidatorTests(CrossValidatorTestsMixin, unittest.TestCase): def setUp(self) -> None: self.spark = SparkSession.builder.master("local[2]").getOrCreate() diff --git a/python/pyspark/ml/torch/data.py b/python/pyspark/ml/torch/data.py index 0a5597fbd241e..cb7e7f1b68ac9 100644 --- a/python/pyspark/ml/torch/data.py +++ b/python/pyspark/ml/torch/data.py @@ -15,7 +15,10 @@ # limitations under the License. # -import torch +try: + import torch +except ImportError: + torch = None # type: ignore[assignment] import numpy as np from typing import Any, Callable, Iterator from pyspark.sql.types import StructType diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py index 43af8bb427a5a..b25b1be864959 100644 --- a/python/pyspark/sql/connect/plan.py +++ b/python/pyspark/sql/connect/plan.py @@ -1613,8 +1613,8 @@ def __init__(self, child: "LogicalPlan", table_name: str) -> None: self.table_name: Optional[str] = table_name self.provider: Optional[str] = None self.partitioning_columns: List["ColumnOrName"] = [] - self.options: dict[str, Optional[str]] = {} - self.table_properties: dict[str, Optional[str]] = {} + self.options: Dict[str, Optional[str]] = {} + self.table_properties: Dict[str, Optional[str]] = {} self.mode: Optional[str] = None self.overwrite_condition: Optional["ColumnOrName"] = None diff --git a/python/run-tests.py b/python/run-tests.py index ca8ddb5ff8635..6e4a1da18a38f 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -207,7 +207,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python, keep_test_ def get_default_python_executables(): - python_execs = [x for x in ["python3.9", "pypy3"] if which(x)] + python_execs = [x for x in ["python3.9", "python3.8", "pypy3"] if which(x)] if "python3.9" not in python_execs: p = which("python3")