From dc603dbeeecd6d4a5c5cec6f2d8f7c9e7fc23ec1 Mon Sep 17 00:00:00 2001 From: Venkatesh Bingi Date: Fri, 26 Sep 2025 00:41:06 +0000 Subject: [PATCH 1/2] Upgrade sklearn to 1.4.2 and numpy to 2.1.0 with modern Python packaging - Upgrade scikit-learn from 1.2.1 to 1.4.2 - Upgrade numpy to 2.1.0 - Upgrade PyArrow from 14.0.1 to 17.0.0 with proper Arrow C++ integration - Replace Miniconda with system Python 3.10 and uv package manager - Add pyproject.toml for modern Python packaging standards - Update MLIO build to support Arrow 17.0.0 compatibility - Modernize dependency management and remove Node.js dependencies - Update test fixtures and version checks across all components - Add cleanup fixture for multi-model endpoint tests --- docker/{1.2-1-1 => 1.4-2}/base/Dockerfile.cpu | 147 ++++++++---------- .../extension/Dockerfile.cpu | 8 +- docker/{1.2-1-1 => 1.4-2}/extension/README.md | 0 .../{1.2-1-1 => 1.4-2}/final/Dockerfile.cpu | 18 +-- .../resources/libffi7_3.3-6_arm64.deb | Bin .../resources/mms/ExecutionParameters.java | 0 .../resources/mms/config.properties.tmp | 0 .../resources/mms/endpoints-1.0.jar | Bin pyproject.toml | 34 ++++ requirements.txt | 29 ++-- setup.py | 2 +- test-requirements.txt | 13 +- test/conftest.py | 2 +- .../test_multiple_model_endpoint.py | 21 +++ .../models/pickled-model-1/sklearn-model | Bin 4252 -> 4354 bytes .../models/pickled-model-2/sklearn-model | Bin 3658 -> 4354 bytes test/unit/test_modules.py | 4 +- tox.ini | 2 +- 18 files changed, 163 insertions(+), 117 deletions(-) rename docker/{1.2-1-1 => 1.4-2}/base/Dockerfile.cpu (51%) rename docker/{1.2-1-1 => 1.4-2}/extension/Dockerfile.cpu (63%) rename docker/{1.2-1-1 => 1.4-2}/extension/README.md (100%) rename docker/{1.2-1-1 => 1.4-2}/final/Dockerfile.cpu (66%) rename docker/{1.2-1-1 => 1.4-2}/resources/libffi7_3.3-6_arm64.deb (100%) rename docker/{1.2-1-1 => 1.4-2}/resources/mms/ExecutionParameters.java (100%) rename docker/{1.2-1-1 => 1.4-2}/resources/mms/config.properties.tmp (100%) 
rename docker/{1.2-1-1 => 1.4-2}/resources/mms/endpoints-1.0.jar (100%) create mode 100644 pyproject.toml diff --git a/docker/1.2-1-1/base/Dockerfile.cpu b/docker/1.4-2/base/Dockerfile.cpu similarity index 51% rename from docker/1.2-1-1/base/Dockerfile.cpu rename to docker/1.4-2/base/Dockerfile.cpu index 16278337..a6ad871c 100644 --- a/docker/1.2-1-1/base/Dockerfile.cpu +++ b/docker/1.4-2/base/Dockerfile.cpu @@ -24,18 +24,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Main image FROM ubuntu:${UBUNTU_VERSION}@sha256:${UBUNTU_IMAGE_DIGEST} -ARG MINICONDA_VERSION=24.7.1 -ARG CONDA_CHECKSUM=684cda724bc37e3bbbb342e440fc4cac515c92e91a489eb4359feca35382894b -ARG CONDA_PY_VERSION=310 -ARG CONDA_PKG_VERSION=24.7.1 ARG PYTHON_VERSION=3.10 -ARG PYARROW_VERSION=14.0.1 +ARG PYARROW_VERSION=17.0.0 ARG MLIO_VERSION=v0.9.0 ENV DEBIAN_FRONTEND=noninteractive # Install python and other scikit-learn runtime dependencies -# Dependency list from http://scikit-learn.org/stable/developers/advanced_installation.html#installing-build-dependencies RUN apt-get update && \ apt-get -y upgrade && \ apt-get -y install --no-install-recommends \ @@ -55,17 +50,30 @@ RUN apt-get update && \ linux-libc-dev \ libxml2 \ libsqlite3-0 \ + software-properties-common \ + ca-certificates \ + lsb-release \ + && \ + # Add Apache Arrow repository + wget https://packages.apache.org/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb && \ + apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb && \ + apt-get update && \ + apt-get install -y -V libarrow-dev=17.0.0-1 libarrow-dataset-dev=17.0.0-1 libparquet-dev=17.0.0-1 libarrow-acero-dev=17.0.0-1 && \ + # Add deadsnakes PPA for Python 3.10 + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get -y install --no-install-recommends \ + python3.10 \ + python3.10-dev \ + python3.10-distutils \ && \ # MLIO 
build dependencies - # Official Ubuntu APT repositories do not contain an up-to-date version of CMake required to build MLIO. - # Kitware contains the latest version of CMake. wget http://es.archive.ubuntu.com/ubuntu/pool/main/libf/libffi/libffi7_3.3-4_amd64.deb && \ dpkg -i libffi7_3.3-4_amd64.deb && \ apt-get -y install --no-install-recommends \ apt-transport-https \ ca-certificates \ gnupg \ - software-properties-common \ && \ wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | \ gpg --dearmor - | \ @@ -76,7 +84,6 @@ RUN apt-get update && \ apt-get install -y --no-install-recommends \ autoconf \ automake \ - build-essential \ cmake \ cmake-data \ doxygen \ @@ -85,83 +92,68 @@ RUN apt-get update && \ libssl-dev \ libtool \ ninja-build \ - python3-dev \ - python3-distutils \ - python3-pip \ zlib1g-dev \ && \ - python3 -m pip install --upgrade pip && \ - python3 -m pip install --upgrade certifi && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \ + curl -sS https://bootstrap.pypa.io/get-pip.py | python3 && \ apt-get clean && \ - # Node.js setup - mkdir -p /etc/apt/keyrings && \ - curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | \ - gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \ - echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | \ - tee /etc/apt/sources.list.d/nodesource.list && \ - apt-get update && \ - apt-get install -y nodejs && \ - npm install -g npm@latest && \ rm -rf /var/lib/apt/lists/* RUN ln -fs /usr/share/zoneinfo/UTC /etc/localtime && \ dpkg-reconfigure --frontend noninteractive tzdata -RUN cd /tmp && \ - curl -L --output /tmp/Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-py${CONDA_PY_VERSION}_${MINICONDA_VERSION}-0-Linux-x86_64.sh && \ - echo "${CONDA_CHECKSUM} /tmp/Miniconda3.sh" | sha256sum -c - && \ - bash /tmp/Miniconda3.sh -bfp /miniconda3 && \ - rm /tmp/Miniconda3.sh && \ - # 
Remove this when we move to Miniconda version with conda package version 4.13.0+ - rm -rf /miniconda3/pkgs/conda-4.12.0-py38h06a4308_0/info/test/* - -ENV PATH=/miniconda3/bin:${PATH} +# Install uv for fast Python package management +RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \ + mv /root/.local/bin/uv /usr/local/bin/uv + +ENV PATH=/usr/local/bin:${PATH} ENV PIP_ROOT_USER_ACTION=ignore # Install MLIO with Apache Arrow integration -# We could install mlio-py from conda, but it comes with extra support such as image reader that increases image size -# which increases training time. We build from source to minimize the image size. -RUN echo "conda ${CONDA_PKG_VERSION}" >> /miniconda3/conda-meta/pinned && \ - # Conda configuration see https://conda.io/projects/conda/en/latest/configuration.html - conda config --system --set auto_update_conda false && \ - conda config --system --set show_channel_urls true && \ - echo "python ${PYTHON_VERSION}.*" >> /miniconda3/conda-meta/pinned && \ - conda install -c conda-forge python=${PYTHON_VERSION} --solver classic && \ - conda install conda=${CONDA_PKG_VERSION} --solver classic && \ - conda update -y conda && \ - conda install -c conda-forge pyarrow=${PYARROW_VERSION} --solver classic && \ - cd /miniconda3/pkgs/libgrpc-*/info/test/examples/node && \ - npm install minimist@latest protobufjs@latest && \ - # Remove Node.js, npm, and their dependencies - apt-get purge -y nodejs npm && \ - apt-get autoremove -y && \ - # Final cleanup - rm -rf /etc/apt/sources.list.d/nodesource.list \ - /etc/apt/keyrings/nodesource.gpg \ - /etc/apt/sources.list.d/kitware.list && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - # Continue with the rest of the build process - conda install pip --force-reinstall && \ - python3 -m pip install --upgrade pip && \ - python3 -m pip install wheel && \ - cd /tmp && \ - git clone --branch ${MLIO_VERSION} https://github.com/awslabs/ml-io.git mlio && \ - cd mlio && \ - 
build-tools/build-dependency build/third-party all && \ +# First install Arrow C++ libraries (needed for MLIO compilation) +RUN uv pip install --system pyarrow==${PYARROW_VERSION} + +# Clone MLIO repository +RUN cd /tmp && \ + git clone --branch ${MLIO_VERSION} https://github.com/awslabs/ml-io.git mlio + +# Patch MLIO for Arrow 17.0.0 +RUN cd /tmp/mlio && \ + sed -i 's/find_package(Arrow 14.0.1 REQUIRED/find_package(Arrow 17.0.0 REQUIRED/g' CMakeLists.txt && \ + sed -i 's/pyarrow==14.0.1/pyarrow==17.0.0/g' src/mlio-py/setup.py + +# Build MLIO third-party dependencies (includes Arrow C++) +RUN cd /tmp/mlio && \ + build-tools/build-dependency build/third-party all + +# Configure MLIO build +RUN cd /tmp/mlio && \ mkdir -p build/release && \ cd build/release && \ - cmake -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="$(pwd)/../third-party" ../.. && \ + cmake -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="$(pwd)/../third-party" ../.. + +# Build MLIO core +RUN cd /tmp/mlio/build/release && \ cmake --build . && \ - cmake --build . --target install && \ - cmake -DMLIO_INCLUDE_PYTHON_EXTENSION=ON -DPYTHON_EXECUTABLE="/miniconda3/bin/python3" \ - -DMLIO_INCLUDE_ARROW_INTEGRATION=ON ../.. && \ + cmake --build . --target install + +# Configure MLIO Python extension +RUN cd /tmp/mlio/build/release && \ + cmake -DMLIO_INCLUDE_PYTHON_EXTENSION=ON -DPYTHON_EXECUTABLE="/usr/bin/python3" \ + -DMLIO_INCLUDE_ARROW_INTEGRATION=ON -DCMAKE_PREFIX_PATH="$(pwd)/../third-party" ../.. + +# Build MLIO Python extension +RUN cd /tmp/mlio/build/release && \ cmake --build . --target mlio-py && \ - cmake --build . --target mlio-arrow && \ - cd ../../src/mlio-py && \ + cmake --build . 
--target mlio-arrow + +# Build and install MLIO Python wheel +RUN cd /tmp/mlio/src/mlio-py && \ python3 setup.py bdist_wheel && \ - python3 -m pip install dist/*.whl && \ - cp -r /tmp/mlio/build/third-party/lib/libtbb* /usr/local/lib/ && \ + uv pip install --system dist/*.whl + +# Copy TBB libraries and cleanup +RUN cp -r /tmp/mlio/build/third-party/lib/libtbb* /usr/local/lib/ && \ ldconfig && \ rm -rf /tmp/mlio @@ -180,19 +172,18 @@ ENV PATH="/usr/local/bin:${PATH}" # This command will check the version and print it to the build logs RUN sqlite3 --version -RUN apt list --installed - # Install awscli RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \ unzip awscliv2.zip && \ ./aws/install && \ rm -r aws awscliv2.zip -# Python won’t try to write .pyc or .pyo files on the import of source modules +# Python won't try to write .pyc or .pyo files on the import of source modules # Force stdin, stdout and stderr to be totally unbuffered. Good for logging ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 -# Install Scikit-Learn -# Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4. -# Scikit-learn now requires Python 3.6 or newer. 
-RUN python3 -m pip install --no-cache -I scikit-learn==1.2.1 \ No newline at end of file +# Install core scientific packages with exact versions +RUN uv pip install --system --no-cache \ + numpy==2.1.0 \ + scikit-learn==1.4.2 \ + pyarrow==17.0.0 diff --git a/docker/1.2-1-1/extension/Dockerfile.cpu b/docker/1.4-2/extension/Dockerfile.cpu similarity index 63% rename from docker/1.2-1-1/extension/Dockerfile.cpu rename to docker/1.4-2/extension/Dockerfile.cpu index 1317c8ec..8550725e 100644 --- a/docker/1.2-1-1/extension/Dockerfile.cpu +++ b/docker/1.4-2/extension/Dockerfile.cpu @@ -1,9 +1,9 @@ -FROM preprod-sklearn:1.2-1 +FROM preprod-sklearn:1.4-2 -RUN pip freeze | grep -q 'scikit-learn==1.2.1'; \ +RUN pip freeze | grep -q 'scikit-learn==1.4.2'; \ if [ $? -eq 0 ]; \ - then echo 'scikit-learn version 1.2.1 requirement met'; \ - else echo 'ERROR: Expected scikit-learn version is 1.2.1, check base images for scikit-learn version' && \ + then echo 'scikit-learn version 1.4.2 requirement met'; \ + else echo 'ERROR: Expected scikit-learn version is 1.4.2, check base images for scikit-learn version' && \ exit 1; fi RUN pip install --upgrade --no-cache --no-deps sagemaker-scikit-learn-extension==2.5.0 diff --git a/docker/1.2-1-1/extension/README.md b/docker/1.4-2/extension/README.md similarity index 100% rename from docker/1.2-1-1/extension/README.md rename to docker/1.4-2/extension/README.md diff --git a/docker/1.2-1-1/final/Dockerfile.cpu b/docker/1.4-2/final/Dockerfile.cpu similarity index 66% rename from docker/1.2-1-1/final/Dockerfile.cpu rename to docker/1.4-2/final/Dockerfile.cpu index d6bb03e9..bd7148c9 100644 --- a/docker/1.2-1-1/final/Dockerfile.cpu +++ b/docker/1.4-2/final/Dockerfile.cpu @@ -1,25 +1,19 @@ -FROM sklearn-base:1.2-1-1 -ENV SAGEMAKER_SKLEARN_VERSION 1.2-1-1 +FROM sklearn-base:1.4-2 +ENV SAGEMAKER_SKLEARN_VERSION 1.4-2 ENV PIP_ROOT_USER_ACTION=ignore LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true +# Install remaining packages via 
pip COPY requirements.txt /requirements.txt -RUN python -m pip install -r /requirements.txt && \ +RUN uv pip install --system -r /requirements.txt && \ rm /requirements.txt # Fix Python 3.10 compatibility for sagemaker-containers -RUN python3 -c "import sys; sys.path.insert(0, '/miniconda3/lib/python3.10/site-packages'); \ - import sagemaker_containers._mapping as m; \ - import collections.abc; \ - setattr(collections, 'Mapping', collections.abc.Mapping); \ - exec(open('/miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py').read().replace('collections.Mapping', 'collections.abc.Mapping'))" || \ - sed -i 's/collections\.Mapping/collections.abc.Mapping/g' /miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py +RUN python3 -c "import sys; import os; site_packages = '/usr/local/lib/python3.10/dist-packages'; mapping_file = os.path.join(site_packages, 'sagemaker_containers/_mapping.py'); exec('if os.path.exists(mapping_file):\\n with open(mapping_file, \"r\") as f:\\n content = f.read()\\n content = content.replace(\"collections.Mapping\", \"collections.abc.Mapping\")\\n with open(mapping_file, \"w\") as f:\\n f.write(content)')" COPY dist/sagemaker_sklearn_container-2.0-py3-none-any.whl /sagemaker_sklearn_container-2.0-py3-none-any.whl -RUN rm /miniconda3/lib/python3.10/site-packages/**/REQUESTED && \ - rm /miniconda3/lib/python3.10/site-packages/**/direct_url.json -RUN python3 -m pip install --no-cache /sagemaker_sklearn_container-2.0-py3-none-any.whl && \ +RUN uv pip install --system --no-cache /sagemaker_sklearn_container-2.0-py3-none-any.whl && \ rm /sagemaker_sklearn_container-2.0-py3-none-any.whl ENV SAGEMAKER_TRAINING_MODULE sagemaker_sklearn_container.training:main diff --git a/docker/1.2-1-1/resources/libffi7_3.3-6_arm64.deb b/docker/1.4-2/resources/libffi7_3.3-6_arm64.deb similarity index 100% rename from docker/1.2-1-1/resources/libffi7_3.3-6_arm64.deb rename to docker/1.4-2/resources/libffi7_3.3-6_arm64.deb diff --git 
a/docker/1.2-1-1/resources/mms/ExecutionParameters.java b/docker/1.4-2/resources/mms/ExecutionParameters.java similarity index 100% rename from docker/1.2-1-1/resources/mms/ExecutionParameters.java rename to docker/1.4-2/resources/mms/ExecutionParameters.java diff --git a/docker/1.2-1-1/resources/mms/config.properties.tmp b/docker/1.4-2/resources/mms/config.properties.tmp similarity index 100% rename from docker/1.2-1-1/resources/mms/config.properties.tmp rename to docker/1.4-2/resources/mms/config.properties.tmp diff --git a/docker/1.2-1-1/resources/mms/endpoints-1.0.jar b/docker/1.4-2/resources/mms/endpoints-1.0.jar similarity index 100% rename from docker/1.2-1-1/resources/mms/endpoints-1.0.jar rename to docker/1.4-2/resources/mms/endpoints-1.0.jar diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..af835272 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,34 @@ +[project] +name = "sagemaker-sklearn-container" +version = "2.0" +description = "SageMaker Scikit-learn Container" +requires-python = "==3.10.*" +license = "Apache-2.0" +authors = [{name = "Amazon Web Services"}] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Natural Language :: English", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", +] +dynamic = ["readme", "dependencies", "optional-dependencies"] + +[project.scripts] +serve = "sagemaker_sklearn_container.serving:serving_entrypoint" + +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] +readme = {file = ["README.rst"]} +dependencies = {file = ["requirements.txt"]} +optional-dependencies.test = {file = ["test-requirements.txt"]} + +[tool.setuptools.packages.find] +where = ["src"] +exclude = ["test*"] + +[tool.setuptools.package-dir] +"" = "src" diff --git a/requirements.txt b/requirements.txt index 34de7e78..bcc8bc22 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -1,29 +1,30 @@ +numpy==2.1.0 +scikit-learn==1.4.2 +scipy>=1.9.0 +pandas>=2.0.0 +pyarrow==17.0.0 boto3==1.28.57 botocore>=1.31.57,<1.32.0 +certifi cryptography Flask==1.1.1 -itsdangerous==2.0.1 +gevent==23.9.1 gunicorn==23.0.0 +itsdangerous==2.0.1 +jinja2<3.0 +MarkupSafe<2.0 model-archiver==1.0.3 multi-model-server==1.1.1 -pandas==1.5.3 protobuf==3.20.2 psutil==5.7.2 -python-dateutil==2.8.1 +python-dateutil +PyYAML==6.0.1 retrying==1.3.3 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.2.0 sagemaker-training==4.8.0 -scikit-learn==1.2.1 -scipy==1.9.3 -urllib3==1.26.17 +setuptools six==1.15.0 -jinja2==3.0.3 -MarkupSafe==2.1.1 -numpy==1.24.1 -gevent==23.9.1 +urllib3==1.26.17 Werkzeug==2.0.3 -setuptools -wheel -certifi -PyYAML==6.0.1 \ No newline at end of file +wheel==0.45.1 diff --git a/setup.py b/setup.py index 3f4e84cc..5be88421 100644 --- a/setup.py +++ b/setup.py @@ -47,5 +47,5 @@ def read(fname): 'console_scripts': 'serve=sagemaker_sklearn_container.serving:serving_entrypoint' }, - python_requires='>=3.6', + python_requires='>=3.10', ) diff --git a/test-requirements.txt b/test-requirements.txt index 8e94ffe5..c79be276 100755 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -1,14 +1,19 @@ -Flask -PyYAML boto3>=1.24.17 coverage flake8 +Flask mock +numpy==2.1.0 +pandas +pyarrow==17.0.0 +pyOpenSSL==23.1.0 pytest pytest-cov pytest-xdist -python-dateutil==2.8.1 +python-dateutil>=2.8.2 PyYAML requests>=2.23.0 sagemaker>=1.3.0,<2 -tox \ No newline at end of file +scikit-learn==1.4.2 +scipy>=1.9.0 +tox diff --git a/test/conftest.py b/test/conftest.py index 4b46dde4..2c7708d4 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -43,7 +43,7 @@ def pytest_addoption(parser): parser.addoption('--install-container-support', '-C', action='store_true') parser.addoption('--docker-base-name', default='sk-learn') parser.addoption('--region', default='us-west-2') - parser.addoption('--framework-version', default='1.2.1') + 
parser.addoption('--framework-version', default='1.4.2') parser.addoption('--py-version', choices=['2', '3'], default=str(sys.version_info.major)) parser.addoption('--processor', choices=['cpu'], default='cpu') # If not specified, will default to {framework-version}-{processor}-py{py-version} diff --git a/test/integration/test_multiple_model_endpoint.py b/test/integration/test_multiple_model_endpoint.py index f6acf79f..ba6b417d 100644 --- a/test/integration/test_multiple_model_endpoint.py +++ b/test/integration/test_multiple_model_endpoint.py @@ -121,6 +121,27 @@ def make_unload_model_request(model_name): return response.status_code, response.content.decode(encodings.utf_8.getregentry().name) +@pytest.fixture(autouse=True) +def cleanup_models(): + """Cleanup fixture to unload all models between tests""" + yield # Run the test + # Cleanup after test + try: + code, res = make_list_model_request() + if code == 200: + models_data = json.loads(res) if isinstance(res, str) else res + models = models_data.get('models', []) + for model in models: + model_name = model.get('modelName') + if model_name: + try: + make_unload_model_request(model_name) + except Exception: + pass # Ignore individual unload errors + except Exception: + pass # Ignore cleanup errors + + def test_ping(): res = requests.get(PING_URL) assert res.status_code == 200 diff --git a/test/resources/models/pickled-model-1/sklearn-model b/test/resources/models/pickled-model-1/sklearn-model index b5bf17704506f5eb168990352fc1fd398357c97c..d775e11915f33eb68f6b2495c841a6db41f70a82 100644 GIT binary patch literal 4354 zcmd6pO>7%Q6vw@BoY?tvQaf=#LI?>pAY^DMRRmNv1QC>#ED=ii*05fW{kC{N*qLof z1*ja-HqzXfN|X}<&b@Hqz=1n(LvY~|i34AUNPMXh0^ZE}D_}JdF_P!t@yxt=Gw=7_ ze|ImAUj03r&`lKBD@|&JUXgl?y45BvlDZ#K#-+$_SYFL{AJ@YT+Q3=nhw^QC*IV)p zxhyhh<+v8dIgxL>;UPsAcGF^vGMs;m+75Gk?=;T$^I?Y$ry&|hG0k3$tdQ0S4MXKK z&4Q-GaaD@Zs=iUiwO%7N8gP&mG3PoSVU`;-DI?lcnj<3HJByLkrD=$QnmkXP25?dj zEt}(cL@%zz7!nCFrF_?^#UqRp-Os48vLT*d(=$_{=0O;@I7N%R^6I!jgduF=wd{~u 
zjLIID8bj+~+>o*DfLP4{tRACj*Y|wxdyY+Pq+StO(qc)@(NLk2B6X~IqIg_JRWap} z4s|BY36bp#NnnMR%j7al!YpKA4(8zqobk{KFADEDH-S&`dU;?<-u9+s6#MW;U@s{eaKsliJz*jcuwz8xu4}F1l#fimO%lN4H=gPe6Fy?p-((noQQNMML;ix!H zLuoGuSp7EzGjU16)9|vH#{LZUS?r%tJL^3J`|d2D<+IQ*Z^!;Q+_{YX^Ekc_`xlgl z=vWsDJk{syl}j=*%b32b&PQ2kvA6x%8nn{%>4(9oiLXD{EseIHVY<(i+e4K{F?P;s zM)>&ZlPpyzmnAeX_4n(REUPV16>foB@%O)J<)c$Smiy9I-9xrYjNnAUC?zldMIY^UQ^8&gf`FqVm~zW%-+%Q-Z+#tTe2Pmv$Zd;tqnc1 z@3b~|4#n(?&sF^{;l14n4CsTkn9wd~3w-&9On$*=FI6aOjymv7_FcM)@$z;&0lY_tY7UKW&SD E1K!G=PXGV_ literal 4252 zcmdUxU29xL7{_;$Y?Ix*ZPISCtwq|qL}an4ZF5oBNX1AG9wAmQ6dC56oy~Lfyk*Xu z)Zhy$wk9};(h&r`5J3>V6BK$Q)tk}}Q1HHbFMa^^nK_%yCT`o1?8Q28b~10zdFD6& z|Kz+o^x3E7oGG%JyzVo~!x{?{7C1huk!HkMB9yGG(6A8&&zR;4TgBca;_8N)e^0%w z=426Pc>xvJCo9`w1ZNn+^=Xo@1pA+6u9tXGcm?~r#c;VN7{>`pjwfp=<*Y#%=Q`#{ z68oOOQ6+~QlTk6CZ;%Fy1t?oi1zt!J8pJ+Jh?%Oa0a@km6uiBd6jvqAV`x_ zSK#}I8C=dVzYf?%U&eRraXH@FQv5>SOC(0_xa=S@l%4v|O zIT(dWD8m#?!%;W}Gf;slp5BCm$?U9FvV`C`b20aZl0_$qL?Sqi)ut>)kwX#}&slBB zBO!S$auRjSNS)E3Oza#T}l9fGnkl`)vKp==Gnlr;#` zZ5D8Jx&CLy8jjg8vola_|0$|s%7U4wQNqo-Xb6znVlM`WEFeX{6AWI>MQW2*jAzFB)jst+A z9BrQ-4r62($*g3!jl;yg*i|n7@&1n!od{=b*r5nZRso*EcaL`p8nY?Lf;J~Io!)I$ z-HR5}4n-=sdLACv=%BU@E{~0~a=_b#>*5|rQtUYW$ab2a>B8s(aa4u>Wc9y?ejTYrwoA(w!zPC8DG_-w@nYy~9@yjFY)KBs5 z>cSY`fQJJqp4j)_=`wAWqz)X16Zm^DWkwf8?V-wqWz^WSVBJq?D-Ti;ylmy+l`b1X ziK6@-$_q~3UORdFm-=8AUhqzrxc>q5S6#C`B&ct{Sn&HBlrW&?mkt8zMN_7Fp{QT) z{Qg{jgL*^@YVA~42kuW%rCxyg;j3H2{SC@+`ttNr;Q-dl6jkg6D7${4-rt}y>x~}5 zdOZOpzxk=pt#`%9;>3Xzm5Hz?|MhOJ|MtI7)ae7=QJM9Q_riKVfBn&He_QWCpHut0 f%zL7!&p){PW`BeF#C&%?dI&}B{ybpWcW3Qi3bUD^ diff --git a/test/resources/models/pickled-model-2/sklearn-model b/test/resources/models/pickled-model-2/sklearn-model index bcea8bbec2dfed563f154fce4c0d00088e8cfbf5..d775e11915f33eb68f6b2495c841a6db41f70a82 100644 GIT binary patch literal 4354 zcmd6pO>7%Q6vw@BoY?tvQaf=#LI?>pAY^DMRRmNv1QC>#ED=ii*05fW{kC{N*qLof z1*ja-HqzXfN|X}<&b@Hqz=1n(LvY~|i34AUNPMXh0^ZE}D_}JdF_P!t@yxt=Gw=7_ 
ze|ImAUj03r&`lKBD@|&JUXgl?y45BvlDZ#K#-+$_SYFL{AJ@YT+Q3=nhw^QC*IV)p zxhyhh<+v8dIgxL>;UPsAcGF^vGMs;m+75Gk?=;T$^I?Y$ry&|hG0k3$tdQ0S4MXKK z&4Q-GaaD@Zs=iUiwO%7N8gP&mG3PoSVU`;-DI?lcnj<3HJByLkrD=$QnmkXP25?dj zEt}(cL@%zz7!nCFrF_?^#UqRp-Os48vLT*d(=$_{=0O;@I7N%R^6I!jgduF=wd{~u zjLIID8bj+~+>o*DfLP4{tRACj*Y|wxdyY+Pq+StO(qc)@(NLk2B6X~IqIg_JRWap} z4s|BY36bp#NnnMR%j7al!YpKA4(8zqobk{KFADEDH-S&`dU;?<-u9+s6#MW;U@s{eaKsliJz*jcuwz8xu4}F1l#fimO%lN4H=gPe6Fy?p-((noQQNMML;ix!H zLuoGuSp7EzGjU16)9|vH#{LZUS?r%tJL^3J`|d2D<+IQ*Z^!;Q+_{YX^Ekc_`xlgl z=vWsDJk{syl}j=*%b32b&PQ2kvA6x%8nn{%>4(9oiLXD{EseIHVY<(i+e4K{F?P;s zM)>&ZlPpyzmnAeX_4n(REUPV16>foB@%O)J<)c$Smiy9I-9xrYjNnAUC?zldMIY^UQ^8&gf`FqVm~zW%-+%Q-Z+#tTe2Pmv$Zd;tqnc1 z@3b~|4#n(?&sF^{;l14n4CsTkn9wd~3w-&9On$*=FI6aOjymv7_FcM)@$z;&0lY_tY7UKW&SD E1K!G=PXGV_ literal 3658 zcmd6p&u<$=6vyo(PHe|X>@-eHRS*Y$xTvydN`nuG7!?uJMT-;~kU(fOyB_B)cz+hvq{4d#W+wYsc9IzDHiP_nvBy|(YZ6c4xAF3yHNSGU#KkJSgN zE(>VoxK!YrtoGap&d`P3rD4cIoPUwoPU!gF6`UXD!{&})91WCQ2=^k&S(`A4~TixviRv|Gs{g5%7_*xO1LEZ-NQ;51MNvf%p`32i)Qbtq>& z@{VtX>hj;%plZrFjLY&dF(V_3*C=;204~%ateJ9NQ%Vm(nHWk5mP{xcQ&2IcVY$x& zPHk=ep0I`$6IPEFYW=UWlHw;Gb@5D*E(*wt++e?<3J5nR>OU4iA#mX_>vQOw)|h zjz5e|p{&EVQRvxqC9@asc?O@0_-rVKYjV-+3nmWM$jPEd{73|m2#FS6ao_-;Am{t1 zhs)?0@<}Ph+!kQ*f!I|q-|^n}2NABBa6}PWMh;%UcP|VI8l%a{oGwmex_rD?^(>Z{ zu27_cyU)TojSd<;a9JE&kWOYGz_Vsu7Q3t?hz(8`6+F-5a!yBcsdP};0+EEMI=I`s zb+`G*&wi4<#$+q%H!Y}JwUHE+fLa>| ysBbsEe`2yhr9v7BRBx~Tq5Pk}dJli{`74tRD!I&0w?-+w1k~~|MX}^xLgOEpEz=VK diff --git a/test/unit/test_modules.py b/test/unit/test_modules.py index 41acffa8..6d08de41 100644 --- a/test/unit/test_modules.py +++ b/test/unit/test_modules.py @@ -4,9 +4,9 @@ def test_pandas_version(): import pandas as pd major, minor, patch = pd.__version__.split('.') - assert major == '1' + assert major == '2' def test_pyarrow_to_parquet_conversion_regression_issue_106(): df = pd.DataFrame({'x': [1, 2]}) - df.to_parquet('test.parquet', engine='pyarrow') + df.to_parquet('test.parquet', engine='pyarrow') \ No newline at end of file diff 
--git a/tox.ini b/tox.ini index b268e6d2..1613ed13 100644 --- a/tox.ini +++ b/tox.ini @@ -6,7 +6,7 @@ max-line-length = 120 [testenv] deps = - sklearn1.2: scikit-learn==1.2.1 + sklearn1.4.2: scikit-learn==1.4.2 -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt conda_deps= From 6b6db263584e2018424f59a32586b30c4082a2dd Mon Sep 17 00:00:00 2001 From: Venkatesh Bingi Date: Fri, 26 Sep 2025 20:31:39 +0000 Subject: [PATCH 2/2] Upgrading base image and moving mlio build to a separate stage --- docker/1.4-2/base/Dockerfile.cpu | 133 ++++++++++++++++--------------- test/unit/test_modules.py | 2 +- 2 files changed, 68 insertions(+), 67 deletions(-) diff --git a/docker/1.4-2/base/Dockerfile.cpu b/docker/1.4-2/base/Dockerfile.cpu index a6ad871c..9ac17115 100644 --- a/docker/1.4-2/base/Dockerfile.cpu +++ b/docker/1.4-2/base/Dockerfile.cpu @@ -1,13 +1,10 @@ -ARG UBUNTU_VERSION=20.04 -ARG UBUNTU_IMAGE_DIGEST=874aca52f79ae5f8258faff03e10ce99ae836f6e7d2df6ecd3da5c1cad3a912b +ARG UBUNTU_VERSION=24.04 +ARG UBUNTU_IMAGE_DIGEST=b359f1067efa76f37863778f7b6d0e8d911e3ee8efa807ad01fbf5dc1ef9006b # Build stage for SQLite compilation FROM ubuntu:${UBUNTU_VERSION}@sha256:${UBUNTU_IMAGE_DIGEST} as sqlite-builder RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - wget \ - ca-certificates \ - && \ + build-essential wget ca-certificates && \ cd /tmp && \ wget https://www.sqlite.org/2025/sqlite-autoconf-3500200.tar.gz && \ tar xzf sqlite-autoconf-3500200.tar.gz && \ @@ -21,39 +18,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# Main image -FROM ubuntu:${UBUNTU_VERSION}@sha256:${UBUNTU_IMAGE_DIGEST} +# MLIO builder stage with Ubuntu 20.04 +FROM ubuntu:20.04@sha256:874aca52f79ae5f8258faff03e10ce99ae836f6e7d2df6ecd3da5c1cad3a912b as mlio-builder ARG PYTHON_VERSION=3.10 -ARG PYARROW_VERSION=17.0.0 ARG MLIO_VERSION=v0.9.0 +ARG PYARROW_VERSION=17.0.0 ENV
DEBIAN_FRONTEND=noninteractive # Install python and other scikit-learn runtime dependencies RUN apt-get update && \ - apt-get -y upgrade && \ apt-get -y install --no-install-recommends \ - build-essential \ - curl \ - git \ - jq \ - libatlas-base-dev \ - nginx \ - openjdk-8-jdk-headless \ - unzip \ - wget \ - expat \ - tzdata \ - apparmor\ - libgstreamer1.0-0 \ - linux-libc-dev \ - libxml2 \ - libsqlite3-0 \ - software-properties-common \ - ca-certificates \ - lsb-release \ - && \ + build-essential curl git wget ca-certificates lsb-release software-properties-common && \ # Add Apache Arrow repository wget https://packages.apache.org/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb && \ apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb && \ @@ -63,18 +40,12 @@ RUN apt-get update && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ apt-get -y install --no-install-recommends \ - python3.10 \ - python3.10-dev \ - python3.10-distutils \ - && \ + python3.10 python3.10-dev python3.10-distutils && \ # MLIO build dependencies wget http://es.archive.ubuntu.com/ubuntu/pool/main/libf/libffi/libffi7_3.3-4_amd64.deb && \ dpkg -i libffi7_3.3-4_amd64.deb && \ apt-get -y install --no-install-recommends \ - apt-transport-https \ - ca-certificates \ - gnupg \ - && \ + apt-transport-https gnupg && \ wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | \ gpg --dearmor - | \ tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && \ @@ -82,18 +53,7 @@ RUN apt-get update && \ apt-get update && \ rm /usr/share/keyrings/kitware-archive-keyring.gpg && \ apt-get install -y --no-install-recommends \ - autoconf \ - automake \ - cmake \ - cmake-data \ - doxygen \ - kitware-archive-keyring \ - libcurl4-openssl-dev \ - libssl-dev \ - libtool \ - ninja-build \ - zlib1g-dev \ - && \ + autoconf automake cmake cmake-data doxygen 
kitware-archive-keyring libcurl4-openssl-dev libssl-dev libtool ninja-build zlib1g-dev && \ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \ curl -sS https://bootstrap.pypa.io/get-pip.py | python3 && \ apt-get clean && \ @@ -102,17 +62,9 @@ RUN apt-get update && \ RUN ln -fs /usr/share/zoneinfo/UTC /etc/localtime && \ dpkg-reconfigure --frontend noninteractive tzdata -# Install uv for fast Python package management -RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \ - mv /root/.local/bin/uv /usr/local/bin/uv - -ENV PATH=/usr/local/bin:${PATH} ENV PIP_ROOT_USER_ACTION=ignore -# Install MLIO with Apache Arrow integration -# First install Arrow C++ libraries (needed for MLIO compilation) -RUN uv pip install --system pyarrow==${PYARROW_VERSION} - +# Build MLIO from scratch # Clone MLIO repository RUN cd /tmp && \ git clone --branch ${MLIO_VERSION} https://github.com/awslabs/ml-io.git mlio @@ -147,15 +99,64 @@ RUN cd /tmp/mlio/build/release && \ cmake --build . --target mlio-py && \ cmake --build . 
--target mlio-arrow -# Build and install MLIO Python wheel +# Build MLIO Python wheel RUN cd /tmp/mlio/src/mlio-py && \ - python3 setup.py bdist_wheel && \ - uv pip install --system dist/*.whl + python3 setup.py bdist_wheel -# Copy TBB libraries and cleanup -RUN cp -r /tmp/mlio/build/third-party/lib/libtbb* /usr/local/lib/ && \ - ldconfig && \ - rm -rf /tmp/mlio +# Copy TBB libraries and MLIO shared libraries to a location we can copy from +RUN mkdir -p /mlio-artifacts && \ + cp -r /tmp/mlio/build/third-party/lib/libtbb* /mlio-artifacts/ && \ + cp /usr/local/lib/libmlio* /mlio-artifacts/ 2>/dev/null || true && \ + cp /tmp/mlio/src/mlio-py/dist/*.whl /mlio-artifacts/ + +# Main image +FROM ubuntu:${UBUNTU_VERSION}@sha256:${UBUNTU_IMAGE_DIGEST} + +ARG PYTHON_VERSION=3.10 +ARG PYARROW_VERSION=17.0.0 + +ENV DEBIAN_FRONTEND=noninteractive + +# Install runtime dependencies only +RUN apt-get update && \ + apt-get -y upgrade && \ + apt-get -y install --no-install-recommends \ + curl git jq libatlas-base-dev nginx openjdk-8-jdk-headless unzip wget expat tzdata apparmor \ + libgstreamer1.0-0 libxml2 libsqlite3-0 software-properties-common ca-certificates lsb-release \ + build-essential linux-libc-dev && \ + # Add Apache Arrow repository for runtime libraries only + wget https://packages.apache.org/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb && \ + apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb && \ + apt-get update && \ + apt-get install -y -V libarrow-dev=17.0.0-1 libarrow-dataset-dev=17.0.0-1 libparquet-dev=17.0.0-1 libarrow-acero-dev=17.0.0-1 && \ + # Add deadsnakes PPA for Python 3.10 + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get -y install --no-install-recommends \ + python3.10 python3.10-distutils python3.10-dev && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \ + curl -sS 
https://bootstrap.pypa.io/get-pip.py | python3 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -fs /usr/share/zoneinfo/UTC /etc/localtime && \ + dpkg-reconfigure --frontend noninteractive tzdata + +# Install uv for fast Python package management +RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \ + mv /root/.local/bin/uv /usr/local/bin/uv + +ENV PATH=/usr/local/bin:${PATH} +ENV PIP_ROOT_USER_ACTION=ignore + +# Copy MLIO wheel, TBB libraries, and MLIO shared libraries from builder stage +COPY --from=mlio-builder /mlio-artifacts/*.whl /tmp/ +COPY --from=mlio-builder /mlio-artifacts/libtbb* /usr/local/lib/ +COPY --from=mlio-builder /mlio-artifacts/libmlio* /usr/local/lib/ + +# Install MLIO wheel +RUN uv pip install --system /tmp/*.whl && \ + rm /tmp/*.whl # Copy compiled SQLite from builder stage COPY --from=sqlite-builder /usr/local/bin/sqlite3 /usr/local/bin/sqlite3 diff --git a/test/unit/test_modules.py b/test/unit/test_modules.py index 6d08de41..f56053f9 100644 --- a/test/unit/test_modules.py +++ b/test/unit/test_modules.py @@ -9,4 +9,4 @@ def test_pandas_version(): def test_pyarrow_to_parquet_conversion_regression_issue_106(): df = pd.DataFrame({'x': [1, 2]}) - df.to_parquet('test.parquet', engine='pyarrow') \ No newline at end of file + df.to_parquet('test.parquet', engine='pyarrow')