Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
9659620
[SPARK-56525][INFRA] Run apt-get update before installing R dependencies
sarutak May 7, 2026
b70f85e
[INFRA] Fix docker build for dev/infra by pinning scipy<1.10 and pyth…
sarutak May 9, 2026
be69e7f
Pin beniget==0.4.1 and limit pyproject-metadata<0.9.0
sarutak May 9, 2026
2f1dc51
Update FULL_REFRESH_DATE
sarutak May 9, 2026
976e0a2
Pin pyproject-metadata==0.8.1
sarutak May 10, 2026
542b7ea
Skip mypy for pydantic and sqlalchemy
sarutak May 10, 2026
c84bfa8
fix(sparkr): Skip primitive functions in cleanClosure
sarutak May 10, 2026
ae042e4
Fix R/run-tests.sh for "Lost braces" in Rd files due to R 4.4+ strict…
sarutak May 13, 2026
67cae39
Fix to pass dev/lint-python
sarutak May 13, 2026
458389d
fix(infra): Pin Werkzeug and ragg for CI compatibility
sarutak May 13, 2026
c5f5e21
Merge remote-tracking branch 'apache-github/branch-3.5' into SPARK-56…
sfc-gh-hkarau May 14, 2026
0f8d055
Re-enable R
sfc-gh-hkarau May 14, 2026
bb3d769
Re-enable Python 3.8 in test matrix and fix runtime compatibility issues
sfc-gh-hkarau May 13, 2026
4c6c228
Remove duplicate install line
sfc-gh-hkarau May 14, 2026
14d0559
Remove duplicated pydantic/sqlalchemy configs
sfc-gh-hkarau May 15, 2026
f94a32c
Do we need that constraints file idk
sfc-gh-hkarau May 15, 2026
5dbd5b7
Ignore tornado xmlrunner.
sfc-gh-hkarau May 15, 2026
9b420e7
Unpin numpy can't solve for 1.25.1
sfc-gh-hkarau May 15, 2026
988ebfa
Skip torch when missing
sfc-gh-hkarau May 16, 2026
748dacc
Fix sql.udtf mypy
sfc-gh-hkarau May 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ jobs:
\"build\": \"$build\",
\"pyspark\": \"$pyspark\",
\"pyspark-pandas\": \"$pandas\",
\"sparkr\": \"false\",
\"sparkr\": \"$sparkr\",
\"tpcds-1g\": \"$tpcds\",
\"docker-integration-tests\": \"$docker\",
\"scala-213\": \"$build\",
Expand Down Expand Up @@ -712,6 +712,7 @@ jobs:
apt-get update -y
apt-get install -y ruby ruby-dev
Rscript -e "install.packages(c('remotes', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')"
Rscript -e "remotes::install_version('ragg', version='1.2.5', repos='https://cloud.r-project.org')"
Rscript -e "remotes::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')"
Rscript -e "remotes::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"
gem install bundler -v 2.4.22
Expand Down
5 changes: 5 additions & 0 deletions R/pkg/R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,11 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) {
error = function(e) { FALSE })) {
obj <- get(nodeChar, envir = func.env, inherits = FALSE)
if (is.function(obj)) {
if (is.primitive(obj)) {
# Primitive functions have no closure to clean.
assign(nodeChar, obj, envir = newEnv)
break
}
# If the node is a function call.
funcList <- mget(nodeChar, envir = checkedFuncs, inherits = F,
ifnotfound = list(list(NULL)))[[1]]
Expand Down
7 changes: 4 additions & 3 deletions R/run-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,11 @@ if [[ $FAILED != 0 || $NUM_TEST_WARNING != 0 ]]; then
echo -en "\033[0m" # No color
exit -1
else
# We have 2 NOTEs: for RoxygenNote and one in Jenkins only "No repository set"
# We have 3 NOTEs: for RoxygenNote, one in Jenkins only "No repository set",
# and "Lost braces" in Rd files due to R 4.4+ stricter checkRd
# For non-latest version branches, one WARNING for package version
if [[ ($NUM_CRAN_WARNING != 0 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 2) &&
($HAS_PACKAGE_VERSION_WARN != 1 || $NUM_CRAN_WARNING != 1 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 1) ]]; then
if [[ ($NUM_CRAN_WARNING != 0 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 3) &&
($HAS_PACKAGE_VERSION_WARN != 1 || $NUM_CRAN_WARNING != 1 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 2) ]]; then
cat $CRAN_CHECK_LOG_FILE
echo -en "\033[31m" # Red
echo "Had CRAN check errors; see logs."
Expand Down
12 changes: 4 additions & 8 deletions dev/infra/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,9 @@
# See also in https://hub.docker.com/_/ubuntu
FROM ubuntu:jammy

ENV FULL_REFRESH_DATE 20260514
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

ENV FULL_REFRESH_DATE 20260420

ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true

Expand Down Expand Up @@ -106,15 +105,12 @@ ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library
RUN python3.8 -m pip install setuptools virtualenv
RUN python3.9 -m pip install setuptools virtualenv

RUN python3.8 -m pip install --only-binary=pandas numpy pandas 'scipy<1.9' coverage 'matplotlib==3.7.2' 'mypy==0.982'
RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' 'scipy<=1.10' unittest-xml-reporting 'plotly>=4.8' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'blinker==1.4' 'mypy==0.982'
RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting 'plotly<6.0' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'Flask==1.1.2' 'Werkzeug==2.1.2'
RUN python3.8 -m pip install 'numpy' 'pyarrow==12.0.1' 'pandas<=2.0.3' 'scipy<=1.10' unittest-xml-reporting 'plotly>=4.8' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'blinker==1.4' 'mypy==0.982' 'beniget==0.4.1' 'pyproject-metadata==0.8.1'

# Add Python deps for Spark Connect.
RUN python3.9 -m pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4'
RUN python3.8 -m pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4'

# Add torch as a testing dependency for TorchDistributor
RUN python3.9 -m pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval

# pyarrow
RUN python3.9 -m pip install 'pyarrow<13.0.0'
RUN python3.8 -m pip install 'pyarrow<13.0.0'
8 changes: 7 additions & 1 deletion python/mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ disallow_untyped_defs = False

; Allow untyped def and disable certain error codes in examples

[mypy-python.sql.udtf]
[mypy-sql.udtf]
disallow_untyped_defs = False
disable_error_code = attr-defined,arg-type,call-arg,union-attr

Expand Down Expand Up @@ -166,6 +166,12 @@ ignore_missing_imports = True
[mypy-grpc.*]
ignore_missing_imports = True

[mypy-tornado.*]
ignore_missing_imports = True

[mypy-xmlrunner.*]
ignore_missing_imports = True

; pydantic is pulled in transitively (e.g. via mlflow). mypy has issues
; serializing pydantic v2's recursive JsonValue type, so skip following it.
[mypy-pydantic.*]
Expand Down
8 changes: 6 additions & 2 deletions python/pyspark/ml/connect/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,12 @@
from pyspark.ml.connect.io_utils import ParamsReadWrite, CoreModelReadWrite
from pyspark.sql.functions import lit, count, countDistinct

import torch
import torch.nn as torch_nn
try:
import torch
import torch.nn as torch_nn
except ImportError:
torch = None # type: ignore[assignment]
torch_nn = None # type: ignore[assignment]


class _LogisticRegressionParams(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ def test_save_load(self):
loaded_model.transform(eval_df1.toPandas())


@unittest.skipIf(not have_torch, "torch is required")
class ClassificationTests(ClassificationTestsMixin, unittest.TestCase):
def setUp(self) -> None:
self.spark = SparkSession.builder.master("local[2]").getOrCreate()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ def test_pipeline_copy():
assert lorv2.getOrDefault(lorv2.maxIter) == 200


@unittest.skipIf(not have_torch, "torch is required")
class PipelineTests(PipelineTestsMixin, unittest.TestCase):
def setUp(self) -> None:
self.spark = SparkSession.builder.master("local[2]").getOrCreate()
Expand Down
1 change: 1 addition & 0 deletions python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ def test_crossvalidator_with_fold_col(self):
cv.fit(train_dataset)


@unittest.skipIf(not have_torch, "torch is required")
class CrossValidatorTests(CrossValidatorTestsMixin, unittest.TestCase):
def setUp(self) -> None:
self.spark = SparkSession.builder.master("local[2]").getOrCreate()
Expand Down
5 changes: 4 additions & 1 deletion python/pyspark/ml/torch/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
# limitations under the License.
#

import torch
try:
import torch
except ImportError:
torch = None # type: ignore[assignment]
import numpy as np
from typing import Any, Callable, Iterator
from pyspark.sql.types import StructType
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/sql/connect/plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -1613,8 +1613,8 @@ def __init__(self, child: "LogicalPlan", table_name: str) -> None:
self.table_name: Optional[str] = table_name
self.provider: Optional[str] = None
self.partitioning_columns: List["ColumnOrName"] = []
self.options: dict[str, Optional[str]] = {}
self.table_properties: dict[str, Optional[str]] = {}
self.options: Dict[str, Optional[str]] = {}
self.table_properties: Dict[str, Optional[str]] = {}
self.mode: Optional[str] = None
self.overwrite_condition: Optional["ColumnOrName"] = None

Expand Down
2 changes: 1 addition & 1 deletion python/run-tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python, keep_test_


def get_default_python_executables():
python_execs = [x for x in ["python3.9", "pypy3"] if which(x)]
python_execs = [x for x in ["python3.9", "python3.8", "pypy3"] if which(x)]

if "python3.9" not in python_execs:
p = which("python3")
Expand Down