From 7b6ab127e1b1bc383656c0423b8ae67a756c9990 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 19:39:54 -0800 Subject: [PATCH 01/12] build ec2 image --- dlc_developer_config.toml | 6 +++--- pytorch/inference/buildspec-arm64.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 2ddfe8ccb932..806702116b86 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -15,7 +15,7 @@ neuronx_mode = false graviton_mode = false # Please only set it to true if you are preparing a ARM64 related PR # Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR) -arm64_mode = false +arm64_mode = true # Please only set it to True if you are preparing a HABANA related PR # Do remember to revert it back to False before merging any PR (including HABANA dedicated PR) habana_mode = false @@ -37,11 +37,11 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. -build_training = true +build_training = false build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR diff --git a/pytorch/inference/buildspec-arm64.yml b/pytorch/inference/buildspec-arm64.yml index 40dc6c936417..d7936b4a4390 100644 --- a/pytorch/inference/buildspec-arm64.yml +++ b/pytorch/inference/buildspec-arm64.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-arm64-2-6-ec2.yml +buildspec_pointer: buildspec-arm64-2-6-ec2.yml From 843b6185b9e59309f76cedf7d8488d222cd52a0b Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 20:27:11 -0800 Subject: [PATCH 02/12] try building with ubuntu 24.04 --- pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu b/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu index 3281081505e6..c4c614d0edc8 100644 --- a/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu +++ b/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu @@ -26,7 +26,7 @@ ARG SM_TOOLKIT_VERSION # |_| \_\___|\___|_| .__/ \___| # |_| ######################################################## -FROM arm64v8/ubuntu:22.04 as ec2 +FROM arm64v8/ubuntu:24.04 as ec2 LABEL maintainer="Amazon AI" LABEL dlc_major_version="1" From ba202e0c951ef661bebb36003ab2f664be809d54 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 21:06:07 -0800 Subject: [PATCH 03/12] revert to ubuntu 22.04 --- pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu b/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu index c4c614d0edc8..3281081505e6 100644 --- a/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu +++ b/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu @@ -26,7 +26,7 @@ ARG SM_TOOLKIT_VERSION # |_| \_\___|\___|_| .__/ \___| # |_| ######################################################## -FROM arm64v8/ubuntu:24.04 as ec2 +FROM arm64v8/ubuntu:22.04 as ec2 LABEL maintainer="Amazon AI" LABEL dlc_major_version="1" From a3d45c374eacf52d4cce2ff383184daf707eadc5 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 22:21:51 -0800 Subject: [PATCH 04/12] try to pin Flask version --- pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu b/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu index 3281081505e6..7982554d36a8 100644 --- a/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu +++ b/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu @@ -173,6 +173,8 @@ RUN pip install --no-cache-dir -U -r https://raw.githubusercontent.com/pytorch/s # Patches # py-vuln: 71064 RUN pip install --no-cache-dir -U "requests>=2.32.3" +# CVE-2025-47278 +RUN pip install --no-cache-dir -U "Flask>=3.1.1" # add necessary certificate for aws sdk cpp download RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt From cb190011988d09cd26c1b581003c91b023c208f1 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 19 Nov 2025 14:35:23 -0800 Subject: [PATCH 05/12] unpin Flask version and rebuild ec2 image --- pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu b/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu index 7982554d36a8..3281081505e6 100644 --- a/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu +++ b/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu @@ -173,8 +173,6 @@ RUN pip install --no-cache-dir -U -r https://raw.githubusercontent.com/pytorch/s # Patches # py-vuln: 71064 RUN pip install --no-cache-dir -U "requests>=2.32.3" -# CVE-2025-47278 -RUN pip install --no-cache-dir -U "Flask>=3.1.1" # add necessary certificate for aws sdk cpp download RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt From b7217055e805991d7d943ed0ab9e51aa81af29f2 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 19 Nov 2025 18:10:40 -0800 Subject: [PATCH 06/12] remove sitecustomize.py copy command in both dockerfiles --- pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu | 1 - pytorch/inference/docker/2.6/py3/cu124/Dockerfile.arm64.gpu | 1 - 2 files changed, 2 deletions(-) diff --git a/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu b/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu index 3281081505e6..3af1e8b34153 100644 --- a/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu +++ b/pytorch/inference/docker/2.6/py3/Dockerfile.arm64.cpu @@ -190,7 +190,6 @@ RUN chmod +x /usr/local/bin/dockerd-entrypoint.py # add telemetry COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py RUN chmod +x /usr/local/bin/deep_learning_container.py -# COPY sitecustomize.py /usr/local/lib/${PYTHON_SHORT_VERSION}/sitecustomize.py RUN HOME_DIR=/root \ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ diff --git a/pytorch/inference/docker/2.6/py3/cu124/Dockerfile.arm64.gpu b/pytorch/inference/docker/2.6/py3/cu124/Dockerfile.arm64.gpu index 83af40c0335f..761448b89387 100644 --- a/pytorch/inference/docker/2.6/py3/cu124/Dockerfile.arm64.gpu +++ b/pytorch/inference/docker/2.6/py3/cu124/Dockerfile.arm64.gpu @@ -218,7 +218,6 @@ RUN chmod +x /usr/local/bin/dockerd-entrypoint.py # add telemetry COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py -COPY sitecustomize.py /usr/local/lib/${PYTHON_SHORT_VERSION}/sitecustomize.py RUN chmod +x /usr/local/bin/deep_learning_container.py RUN HOME_DIR=/root \ From 5d03caef31289194de9c14837296e740d5873a1d Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 19 Nov 2025 18:12:21 -0800 Subject: [PATCH 07/12] rebuild ec2 --- dlc_developer_config.toml | 4 ++-- pytorch/inference/buildspec-arm64.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 806702116b86..c00787ee8b6d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,11 +42,11 @@ build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = false -build_inference = true +build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = true [notify] ### Notify on test failures diff --git a/pytorch/inference/buildspec-arm64.yml b/pytorch/inference/buildspec-arm64.yml index d7936b4a4390..40dc6c936417 100644 --- a/pytorch/inference/buildspec-arm64.yml +++ b/pytorch/inference/buildspec-arm64.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-arm64-2-6-ec2.yml +buildspec_pointer: buildspec-arm64-2-6-ec2.yml From ca2ca1b121810e5a31c4e9df40c7bc0489e5b718 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 19 Nov 2025 18:17:38 -0800 Subject: [PATCH 08/12] try building sm image --- pytorch/inference/buildspec-arm64.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/inference/buildspec-arm64.yml b/pytorch/inference/buildspec-arm64.yml index 40dc6c936417..0a70f13b973c 100644 --- a/pytorch/inference/buildspec-arm64.yml +++ b/pytorch/inference/buildspec-arm64.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-arm64-2-6-ec2.yml +buildspec_pointer: buildspec-arm64-2-6-sm.yml From 4186462ae01de671d0dd29c679275b431a30e5ee Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 19 Nov 2025 20:40:57 -0800 Subject: [PATCH 09/12] pin arrow to 1.3.0 and rebuild ec2 image --- pytorch/inference/buildspec-arm64.yml | 2 +- pytorch/inference/docker/2.6/py3/cu124/Dockerfile.arm64.gpu | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch/inference/buildspec-arm64.yml b/pytorch/inference/buildspec-arm64.yml index 0a70f13b973c..40dc6c936417 100644 --- a/pytorch/inference/buildspec-arm64.yml +++ b/pytorch/inference/buildspec-arm64.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-arm64-2-6-sm.yml +buildspec_pointer: buildspec-arm64-2-6-ec2.yml diff --git a/pytorch/inference/docker/2.6/py3/cu124/Dockerfile.arm64.gpu b/pytorch/inference/docker/2.6/py3/cu124/Dockerfile.arm64.gpu index 761448b89387..4e350258dd11 100644 --- a/pytorch/inference/docker/2.6/py3/cu124/Dockerfile.arm64.gpu +++ b/pytorch/inference/docker/2.6/py3/cu124/Dockerfile.arm64.gpu @@ -173,6 +173,7 @@ RUN pip install --no-cache-dir \ boto3 \ scipy \ opencv-python \ + arrow==1.3.0 \ nvgpu \ numpy \ pyopenssl \ From d7621bdfbb21bae167c6817f6147912225149309 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Thu, 20 Nov 2025 10:31:11 -0800 Subject: [PATCH 10/12] add py_scan_allowlist for ec2 images and rebuild --- pytorch/inference/buildspec-arm64.yml | 2 +- .../2.6/py3/Dockerfile.ec2.arm64.cpu.py_scan_allowlist.json | 3 +++ .../py3/cu124/Dockerfile.ec2.arm64.gpu.py_scan_allowlist.json | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 pytorch/inference/docker/2.6/py3/Dockerfile.ec2.arm64.cpu.py_scan_allowlist.json create mode 100644 pytorch/inference/docker/2.6/py3/cu124/Dockerfile.ec2.arm64.gpu.py_scan_allowlist.json diff --git a/pytorch/inference/buildspec-arm64.yml b/pytorch/inference/buildspec-arm64.yml index 40dc6c936417..d7936b4a4390 100644 --- a/pytorch/inference/buildspec-arm64.yml +++ b/pytorch/inference/buildspec-arm64.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-arm64-2-6-ec2.yml +buildspec_pointer: buildspec-arm64-2-6-ec2.yml diff --git a/pytorch/inference/docker/2.6/py3/Dockerfile.ec2.arm64.cpu.py_scan_allowlist.json b/pytorch/inference/docker/2.6/py3/Dockerfile.ec2.arm64.cpu.py_scan_allowlist.json new file mode 100644 index 000000000000..4882e42c6ceb --- /dev/null +++ b/pytorch/inference/docker/2.6/py3/Dockerfile.ec2.arm64.cpu.py_scan_allowlist.json @@ -0,0 +1,3 @@ +{ + "78828": "Affected versions of the PyTorch package are vulnerable to Denial of Service (DoS) due to improper handling in the MKLDNN pooling implementation. The torch.mkldnn_max_pool2d function fails to properly validate input parameters, allowing crafted inputs to trigger resource exhaustion or crashes in the underlying MKLDNN library. An attacker with local access can exploit this vulnerability by passing specially crafted tensor dimensions or parameters to the max pooling function, causing the application to become unresponsive or crash." +} diff --git a/pytorch/inference/docker/2.6/py3/cu124/Dockerfile.ec2.arm64.gpu.py_scan_allowlist.json b/pytorch/inference/docker/2.6/py3/cu124/Dockerfile.ec2.arm64.gpu.py_scan_allowlist.json new file mode 100644 index 000000000000..4882e42c6ceb --- /dev/null +++ b/pytorch/inference/docker/2.6/py3/cu124/Dockerfile.ec2.arm64.gpu.py_scan_allowlist.json @@ -0,0 +1,3 @@ +{ + "78828": "Affected versions of the PyTorch package are vulnerable to Denial of Service (DoS) due to improper handling in the MKLDNN pooling implementation. The torch.mkldnn_max_pool2d function fails to properly validate input parameters, allowing crafted inputs to trigger resource exhaustion or crashes in the underlying MKLDNN library. An attacker with local access can exploit this vulnerability by passing specially crafted tensor dimensions or parameters to the max pooling function, causing the application to become unresponsive or crash." +} From 75968cb2f9e418540206ad85ea91f1ffc81fe3cd Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Thu, 20 Nov 2025 12:11:44 -0800 Subject: [PATCH 11/12] Revert toml changes --- dlc_developer_config.toml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index c00787ee8b6d..2ddfe8ccb932 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -15,7 +15,7 @@ neuronx_mode = false graviton_mode = false # Please only set it to true if you are preparing a ARM64 related PR # Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR) -arm64_mode = true +arm64_mode = false # Please only set it to True if you are preparing a HABANA related PR # Do remember to revert it back to False before merging any PR (including HABANA dedicated PR) habana_mode = false @@ -37,16 +37,16 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["pytorch"] +build_frameworks = [] # By default we build both training and inference containers. Set true/false values to determine which to build. -build_training = false -build_inference = true +build_training = true +build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = true [notify] ### Notify on test failures From f5c05af4d388ac1d4a78c83001d1e2e28ca250ed Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Thu, 20 Nov 2025 12:13:01 -0800 Subject: [PATCH 12/12] Revert buildspec change --- pytorch/inference/buildspec-arm64.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/inference/buildspec-arm64.yml b/pytorch/inference/buildspec-arm64.yml index d7936b4a4390..82cfaee1feb0 100644 --- a/pytorch/inference/buildspec-arm64.yml +++ b/pytorch/inference/buildspec-arm64.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-arm64-2-6-ec2.yml +buildspec_pointer: buildspec-arm64-2-6-ec2.yml \ No newline at end of file