Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
9659620
[SPARK-56525][INFRA] Run apt-get update before installing R dependencies
sarutak May 7, 2026
b70f85e
[INFRA] Fix docker build for dev/infra by pinning scipy<1.10 and pyth…
sarutak May 9, 2026
be69e7f
Pin beniget==0.4.1 and limit pyproject-metadata<0.9.0
sarutak May 9, 2026
2f1dc51
Update FULL_REFRESH_DATE
sarutak May 9, 2026
976e0a2
Pin pyproject-metadata==0.8.1
sarutak May 10, 2026
542b7ea
Skip mypy for pydantic and sqlalchemy
sarutak May 10, 2026
c84bfa8
fix(sparkr): Skip primitive functions in cleanClosure
sarutak May 10, 2026
ae042e4
Fix R/run-tests.sh for "Lost braces" in Rd files due to R 4.4+ strict…
sarutak May 13, 2026
67cae39
Fix to pass dev/lint-python
sarutak May 13, 2026
458389d
fix(infra): Pin Werkzeug and ragg for CI compatibility
sarutak May 13, 2026
c5f5e21
Merge remote-tracking branch 'apache-github/branch-3.5' into SPARK-56…
sfc-gh-hkarau May 14, 2026
0f8d055
Re-enable R
sfc-gh-hkarau May 14, 2026
bb3d769
Re-enable Python 3.8 in test matrix and fix runtime compatibility issues
sfc-gh-hkarau May 13, 2026
4c6c228
Remove duplicate install line
sfc-gh-hkarau May 14, 2026
14d0559
Remove duplicated pydantic/sqlalchemy configs
sfc-gh-hkarau May 15, 2026
f94a32c
Do we need that constraints file idk
sfc-gh-hkarau May 15, 2026
5dbd5b7
Ignore tornado xmlrunner.
sfc-gh-hkarau May 15, 2026
9b420e7
Unpin numpy can't solve for 1.25.1
sfc-gh-hkarau May 15, 2026
988ebfa
Skip torch when missing
sfc-gh-hkarau May 16, 2026
748dacc
Fix sql.udtf mypy
sfc-gh-hkarau May 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ jobs:
\"build\": \"$build\",
\"pyspark\": \"$pyspark\",
\"pyspark-pandas\": \"$pandas\",
\"sparkr\": \"false\",
\"sparkr\": \"$sparkr\",
\"tpcds-1g\": \"$tpcds\",
\"docker-integration-tests\": \"$docker\",
\"scala-213\": \"$build\",
Expand Down Expand Up @@ -712,6 +712,7 @@ jobs:
apt-get update -y
apt-get install -y ruby ruby-dev
Rscript -e "install.packages(c('remotes', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')"
Rscript -e "remotes::install_version('ragg', version='1.2.5', repos='https://cloud.r-project.org')"
Rscript -e "remotes::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')"
Rscript -e "remotes::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"
gem install bundler -v 2.4.22
Expand Down
5 changes: 5 additions & 0 deletions R/pkg/R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,11 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) {
error = function(e) { FALSE })) {
obj <- get(nodeChar, envir = func.env, inherits = FALSE)
if (is.function(obj)) {
if (is.primitive(obj)) {
# Primitive functions have no closure to clean.
assign(nodeChar, obj, envir = newEnv)
break
}
# If the node is a function call.
funcList <- mget(nodeChar, envir = checkedFuncs, inherits = F,
ifnotfound = list(list(NULL)))[[1]]
Expand Down
7 changes: 4 additions & 3 deletions R/run-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,11 @@ if [[ $FAILED != 0 || $NUM_TEST_WARNING != 0 ]]; then
echo -en "\033[0m" # No color
exit -1
else
# We have 2 NOTEs: for RoxygenNote and one in Jenkins only "No repository set"
# We have 3 NOTEs: for RoxygenNote, one in Jenkins only "No repository set",
# and "Lost braces" in Rd files due to R 4.4+ stricter checkRd
# For non-latest version branches, one WARNING for package version
if [[ ($NUM_CRAN_WARNING != 0 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 2) &&
($HAS_PACKAGE_VERSION_WARN != 1 || $NUM_CRAN_WARNING != 1 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 1) ]]; then
if [[ ($NUM_CRAN_WARNING != 0 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 3) &&
($HAS_PACKAGE_VERSION_WARN != 1 || $NUM_CRAN_WARNING != 1 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 2) ]]; then
cat $CRAN_CHECK_LOG_FILE
echo -en "\033[31m" # Red
echo "Had CRAN check errors; see logs."
Expand Down
12 changes: 4 additions & 8 deletions dev/infra/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,9 @@
# See also in https://hub.docker.com/_/ubuntu
FROM ubuntu:jammy

ENV FULL_REFRESH_DATE 20260514
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

ENV FULL_REFRESH_DATE 20260420

ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true

Expand Down Expand Up @@ -106,15 +105,12 @@ ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library
RUN python3.8 -m pip install setuptools virtualenv
RUN python3.9 -m pip install setuptools virtualenv

RUN python3.8 -m pip install --only-binary=pandas numpy pandas 'scipy<1.9' coverage 'matplotlib==3.7.2' 'mypy==0.982'
RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' 'scipy<=1.10' unittest-xml-reporting 'plotly>=4.8' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'blinker==1.4' 'mypy==0.982'
RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting 'plotly<6.0' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'Flask==1.1.2' 'Werkzeug==2.1.2'
RUN python3.8 -m pip install 'numpy' 'pyarrow==12.0.1' 'pandas<=2.0.3' 'scipy<=1.10' unittest-xml-reporting 'plotly>=4.8' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'blinker==1.4' 'mypy==0.982' 'beniget==0.4.1' 'pyproject-metadata==0.8.1'

# Add Python deps for Spark Connect.
RUN python3.9 -m pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4'
RUN python3.8 -m pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4'

# Add torch as a testing dependency for TorchDistributor
RUN python3.9 -m pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval

# pyarrow
RUN python3.9 -m pip install 'pyarrow<13.0.0'
RUN python3.8 -m pip install 'pyarrow<13.0.0'
8 changes: 7 additions & 1 deletion python/mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ disallow_untyped_defs = False

; Allow untyped def and disable certain error codes in examples

[mypy-python.sql.udtf]
[mypy-sql.udtf]
disallow_untyped_defs = False
disable_error_code = attr-defined,arg-type,call-arg,union-attr

Expand Down Expand Up @@ -166,6 +166,12 @@ ignore_missing_imports = True
[mypy-grpc.*]
ignore_missing_imports = True

[mypy-tornado.*]
ignore_missing_imports = True

[mypy-xmlrunner.*]
ignore_missing_imports = True

; pydantic is pulled in transitively (e.g. via mlflow). mypy has issues
; serializing pydantic v2's recursive JsonValue type, so skip following it.
[mypy-pydantic.*]
Expand Down
8 changes: 6 additions & 2 deletions python/pyspark/ml/connect/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,12 @@
from pyspark.ml.connect.io_utils import ParamsReadWrite, CoreModelReadWrite
from pyspark.sql.functions import lit, count, countDistinct

import torch
import torch.nn as torch_nn
try:
import torch
import torch.nn as torch_nn
except ImportError:
torch = None # type: ignore[assignment]
torch_nn = None # type: ignore[assignment]


class _LogisticRegressionParams(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ def test_save_load(self):
loaded_model.transform(eval_df1.toPandas())


@unittest.skipIf(not have_torch, "torch is required")
class ClassificationTests(ClassificationTestsMixin, unittest.TestCase):
def setUp(self) -> None:
self.spark = SparkSession.builder.master("local[2]").getOrCreate()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ def test_pipeline_copy():
assert lorv2.getOrDefault(lorv2.maxIter) == 200


@unittest.skipIf(not have_torch, "torch is required")
class PipelineTests(PipelineTestsMixin, unittest.TestCase):
def setUp(self) -> None:
self.spark = SparkSession.builder.master("local[2]").getOrCreate()
Expand Down
1 change: 1 addition & 0 deletions python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ def test_crossvalidator_with_fold_col(self):
cv.fit(train_dataset)


@unittest.skipIf(not have_torch, "torch is required")
class CrossValidatorTests(CrossValidatorTestsMixin, unittest.TestCase):
def setUp(self) -> None:
self.spark = SparkSession.builder.master("local[2]").getOrCreate()
Expand Down
5 changes: 4 additions & 1 deletion python/pyspark/ml/torch/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
# limitations under the License.
#

import torch
try:
import torch
except ImportError:
torch = None # type: ignore[assignment]
import numpy as np
from typing import Any, Callable, Iterator
from pyspark.sql.types import StructType
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/sql/connect/plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -1613,8 +1613,8 @@ def __init__(self, child: "LogicalPlan", table_name: str) -> None:
self.table_name: Optional[str] = table_name
self.provider: Optional[str] = None
self.partitioning_columns: List["ColumnOrName"] = []
self.options: dict[str, Optional[str]] = {}
self.table_properties: dict[str, Optional[str]] = {}
self.options: Dict[str, Optional[str]] = {}
self.table_properties: Dict[str, Optional[str]] = {}
self.mode: Optional[str] = None
self.overwrite_condition: Optional["ColumnOrName"] = None

Expand Down
2 changes: 1 addition & 1 deletion python/run-tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python, keep_test_


def get_default_python_executables():
python_execs = [x for x in ["python3.9", "pypy3"] if which(x)]
python_execs = [x for x in ["python3.9", "python3.8", "pypy3"] if which(x)]

if "python3.9" not in python_execs:
p = which("python3")
Expand Down