From e98def2fd66aedcd3e44bd29f1bc11d88efc4b88 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 22 Jul 2025 15:51:38 -0400 Subject: [PATCH 01/38] [NVIDIA_IMEX] Add resource to install Nvidia-imex --- .../nvidia_imex/nvidia_imex_alinux2023.rb | 19 ++ .../nvidia_imex/nvidia_imex_amazon2.rb | 17 ++ .../nvidia_imex/nvidia_imex_redhat8.rb | 19 ++ .../nvidia_imex/nvidia_imex_rocky8.rb | 19 ++ .../nvidia_imex/nvidia_imex_ubuntu22+.rb | 19 ++ .../partial/_nvidia_imex_common.rb | 64 ++++++ .../nvidia-imex/nvidia-imex-config.erb | 217 ++++++++++++++++++ .../nvidia-imex/nvidia-imex-nodes.erb | 3 + .../nvidia-imex/nvidia-imex.service.erb | 26 +++ 9 files changed, 403 insertions(+) create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb create mode 100644 cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb create mode 100644 cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-nodes.erb create mode 100644 cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb new file mode 100644 index 0000000000..bb39282744 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb @@ -0,0 +1,19 @@ +# 
frozen_string_literal: true + +# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +provides :nvidia_imex, platform: 'amazon' do |node| + node['platform_version'].to_i == 2023 +end + +use 'partial/_nvidia_imex_common.rb' diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb new file mode 100644 index 0000000000..b863b9ea00 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb @@ -0,0 +1,17 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +provides :nvidia_imex, platform: 'amazon', platform_version: '2' + +use 'partial/_nvidia_imex_common.rb' diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb new file mode 100644 index 0000000000..aeff2dffdd --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb @@ -0,0 +1,19 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +provides :nvidia_imex, platform: 'redhat' do |node| + node['platform_version'].to_i >= 8 +end + +use 'partial/_nvidia_imex_common.rb' diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb new file mode 100644 index 0000000000..2f91dd302a --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb @@ -0,0 +1,19 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. 
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +provides :nvidia_imex, platform: 'rocky' do |node| + node['platform_version'].to_i >= 8 +end + +use 'partial/_nvidia_imex_common.rb' diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb new file mode 100644 index 0000000000..b3c26d0a94 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb @@ -0,0 +1,19 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +provides :nvidia_imex, platform: 'ubuntu' do |node| + node['platform_version'].to_i >= 22 +end + +use 'partial/_nvidia_imex_common.rb' diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb new file mode 100644 index 0000000000..023d318b7e --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -0,0 +1,64 @@ +# frozen_string_literal: true +# +# Copyright:: 2013-2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +unified_mode true +default_action :install + +action :install do + return unless nvidia_enabled_or_installed? + return if on_docker? || imex_installed + + # Add NVIDIA repo for nvidia-imex + nvidia_repo 'add nvidia repository' do + action :add + end + + directory "#{node['cluster']['shared_dir']}/nvidia-imex" + + template "#{node['cluster']['shared_dir']}/nvidia-imex/config.cfg" do + source 'nvidia-imex/nvidia-imex-config.erb' + owner 'root' + group 'root' + mode '0755' + end + + template "#{node['cluster']['shared_dir']}/nvidia-imex/nodes_config.cfg" do + source 'nvidia-imex/nvidia-imex-nodes.erb' + owner 'root' + group 'root' + mode '0755' + end + + template "/etc/systemd/system/nvidia-imex.service" do + source 'nvidia-imex/nvidia-imex.service.erb' + owner 'root' + group 'root' + mode '0644' + action :create + end + + package 'nvidia-imex' do + retries 3 + retry_delay 5 + version node['cluster']['nvidia']['driver_version'] + end +end + +def imex_installed + ::File.exist?('/usr/bin/nvidia-imex') || ::File.exist?('/usr/bin/nvidia-imex-ctl') +end + +def nvidia_enabled_or_installed? + nvidia_enabled? || nvidia_installed? 
+end diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb new file mode 100644 index 0000000000..a6c5403883 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb @@ -0,0 +1,217 @@ +# NVIDIA IMEX configuration file. +# Note: This configuration file is read during IMEX startup. So, IMEX +# service restart is required for new settings to take effect. + +# Description: IMEX logging levels +# Possible Values: +# 0 - All the logging is disabled +# 1 - Set log level to CRITICAL and above +# 2 - Set log level to ERROR and above +# 3 - Set log level to WARNING and above +# 4 - Set log level to INFO and above +# Default Value: 4 +LOG_LEVEL=4 + +# Description: Filename for IMEX logs +# Possible Values: +# Full path/filename string (max length of 256). Logs will be redirected +# to console(stderr). If the specified log file can't be opened or the +# path is empty. +# Default Value: /var/log/nvidia-imex.log +LOG_FILE_NAME=/var/log/nvidia-imex.log + +# Description: Filename for IMEX stats logging +# Possible Values: +# Full path/filename string (max length of 256). Stats will be redirected +# to console(stderr), if the specified stats file can't be opened or the +# path is empty. +# Default Value: /var/log/nvidia-imex-stats.log +# Note: If STATS_FILE_NAME is configured same as LOG_FILE_NAME, then stats will +# be redirected to the path/filename specified by LOG_FILE_NAME. 
+STATS_FILE_NAME=/var/log/nvidia-imex-stats.log + +# Description: Append to an existing log file or overwrite the logs +# Possible Values: +# 0 - No (Log file will be overwritten) +# 1 - Yes (Append to existing log) +# Default Value: 1 +LOG_APPEND_TO_LOG=1 + +# Description: Max size of log file (in MB) +# Possible Values: +# Any Integer values +# Default Value: 1024 +LOG_FILE_MAX_SIZE=1024 + +# Description: Number of times the IMEX log is rotated once it reaches LOG_FILE_MAX_SIZE +# Possible Values: +# 0 - Log is not rotated. Logging is stopped once the IMEX log file reaches +# the size specified in LOG_FILE_MAX_SIZE +# Non-zero Integer - Log is rotated up to the number of times specified in LOG_MAX_ROTATE_COUNT, +# after the size of the log file reaches the size specified in LOG_FILE_MAX_SIZE. +# Combined IMEX log size is LOG_FILE_MAX_SIZE multiplied by LOG_MAX_ROTATE_COUNT+1 +# Once this threshold is reached, the oldest log file is purged and reused. +LOG_MAX_ROTATE_COUNT=3 + +# Description: Redirect all the logs to syslog instead of logging to file +# Possible Values: +# 0 - No +# 1 - Yes +# Default Value: 0 +LOG_USE_SYSLOG=0 + +# Description: daemonize IMEX on start-up +# Possible Values: +# 0 - No (Do not daemonize and run IMEX as a normal process) +# 1 - Yes (Run IMEX process as Unix daemon) +# Default Value: 1 +DAEMONIZE=1 + +# Description: Network interface to listen for IMEX peer communication. +# OPTIONAL - empty value will determine the bind IP from the node config file. +# Possible Values: +# A valid IPv4 address +# A valid IPv6 address +# No value - Determine bind IP from the node configuration file. +# Default Value: +BIND_INTERFACE_IP= + +# Description: Starting TCP port number for IMEX peer communication +# Possible Values: +# Any value between 0 and 65535 +# Default Value: 50000 +SERVER_PORT=50000 + +# Description: Name of file containing IP addresses of nodes +# Possible Values: +# Full path/filename string (max length of 256).
+# Default Value: /etc/nvidia-imex/nodes_config.cfg +IMEX_NODE_CONFIG_FILE=<%= node['cluster']['shared_dir'] %>/nvidia-imex/nodes_config.cfg + +# Description: Name of the network interface used for communication. +# OPTIONAL - If empty, network interface will be determined by matching bind IP to +# node configuration file. Only necessary to configure if the bind IP +# is IPv6 link-local and on multiple network interfaces. +# Possible Values: +# Interface names like eth0, ens32 .. etc +# Default Value: +NETWORK_INTERFACE= + +# Description: Controls whether IMEX should complete initialization without establishing quorum +# Possible values: +# NONE: Do not wait for any quorum with other nodes. +# RECOVERY: In case of unsafe IMEX termination, wait until all nodes that had previously imported +# have connected, allowing them time to safely clean up any potentially hanging references +# Default value: RECOVERY +IMEX_WAIT_FOR_QUORUM=RECOVERY + +# Description: Enable authentication and encryption between nodes. +# Possible Values: +# 0: Disable encryption and authentication +# 1: Enable encryption and authentication +# Default value: 0 +IMEX_ENABLE_AUTH_ENCRYPTION=0 + +# Description: Controls the security mechanism used by IMEX for authentication and encryption between nodes. +# If IMEX_ENABLE_AUTH_ENCRYPTION is enabled (1), then IMEX_AUTH_ENCRYPTION_MODE must be configured +# as one of the supported values. An empty or unexpected value will prevent initialization. +# Possible Values: +# SSL_TLS: Default - Use SSL/mTLS for authentication and encryption. +# GSS_AUTH_ENCRYPT: Use GSSAPI for authentication, integrity and encryption. +# GSS_AUTH_ONLY: Use GSSAPI for authentication and integrity only, encryption will be disabled. +IMEX_AUTH_ENCRYPTION_MODE=SSL_TLS + +### This is the beginning of configuration if IMEX_AUTH_ENCRYPTION_MODE=SSL_TLS mode.
### + +# Description: This determines how IMEX will try to retrieve the keys, certificates, and certificate +# authority for authentication and encryption. +# If IMEX_AUTH_ENCRYPTION_MODE is SSL_TLS, then IMEX_AUTH_SOURCE must be configured +# as one of the supported values. An empty or unexpected value will prevent initialization. +# Possible Values: +# FILE: The provided values are paths to files on the file system. +# ENV_PATH: The provided values are environment variable names to retrieve, and the values in the +# environment variables are treated as paths to files on the file system. +# ENV_VAL: The provided values are environment variable names to retrieve, and the values in the +# environment variables are treated as the actual values for the key/cert/cert auth. +IMEX_AUTH_SOURCE= + +# Description: These fields are interpreted based on how IMEX_AUTH_SOURCE is configured +IMEX_SERVER_KEY= +IMEX_SERVER_CERT= +IMEX_SERVER_CERT_AUTH= +IMEX_CLIENT_KEY= +IMEX_CLIENT_CERT= +IMEX_CLIENT_CERT_AUTH= + +# Description: Override the target hostname for authentication of the certificates and keys. This allows +# certificates with common names that do not match the IP addresses provided for the nodes. +# Example: +# If the certificate has the subject: +# "/C=US/ST=CA/L=Santa Clara/O=NVIDIA/OU=Test/CN=localhost" +# The certificate validation will expect the connection hostname to be "localhost". By +# setting IMEX_SECURITY_TARGET_OVERRIDE=localhost you can override the connection +# hostname for security purposes to be "localhost", allowing the connection to succeed. +IMEX_SECURITY_TARGET_OVERRIDE= + +### This is the end of IMEX SSL_TLS mode config parameters. ### + +### This is the beginning of configuration if IMEX_AUTH_ENCRYPTION_MODE=GSS_AUTH_ENCRYPT/GSS_AUTH_ONLY mode. ### + +# Description: Service Principal Name to use when IMEX_AUTH_ENCRYPTION_MODE is GSS_AUTH_ENCRYPT or GSS_AUTH_ONLY.
+# Default Value: host +IMEX_GSS_SERVICE_NAME=host + +# Description: GSSAPI timeout (in sec) when IMEX_AUTH_ENCRYPTION_MODE is GSS_AUTH_ENCRYPT or GSS_AUTH_ONLY. +# Possible Values: +# -1 : Default - Retry indefinitely +# >= 0: Number of seconds to wait before triggering clean up +IMEX_GSS_TIMEOUT_SEC=-1 + +# Description: GSSAPI retry interval (in sec) when IMEX_AUTH_ENCRYPTION_MODE is GSS_AUTH_ENCRYPT or GSS_AUTH_ONLY. +# Possible Values: +# 5 : Default - Retry every 5 seconds +# >= 0: Number of seconds to wait before retrying +IMEX_GSS_RETRY_INTERVAL_SEC=5 + +# Description: GSSAPI security context lifetime (in sec) when IMEX_AUTH_ENCRYPTION_MODE is GSS_AUTH_ENCRYPT +# or GSS_AUTH_ONLY. +# Possible Values: +# -1 : Default - Indefinite lifetime (limited by the credential lifetime) +# >= 0: Security context lifetime in seconds +IMEX_GSS_SEC_CONTEXT_LIFETIME_SEC=-1 + +# Description: Determines IMEX behavior during fatal GSSAPI failures or timeouts, when IMEX_AUTH_ENCRYPTION_MODE +# is GSS_AUTH_ENCRYPT or GSS_AUTH_ONLY. +# Possible Values: +# 1 : Default - Shutdown IMEX daemon +# 0 : Terminate connection to the failing peer node +IMEX_GSS_SHUTDOWN_ON_FAILURE=1 + +### This is the end of IMEX GSS_AUTH_ENCRYPT/GSS_AUTH_ONLY mode config parameters. ### + +# Description: Enables the command/control service to allow for querying information from the IMEX daemon. +# Must be used with IMEX_CMD_PORT (optionally IMEX_CMD_BIND_INTERFACE_IP) and/or +# IMEX_CMD_UNIX_DOMAIN_PATH +IMEX_CMD_ENABLED=1 + +# Description: IP address to use to bind the command/control service. Ignored if IMEX_CMD_ENABLED=0 +# If empty (but IMEX_CMD_PORT is specified), it will bind to all available interfaces. +IMEX_CMD_BIND_INTERFACE_IP= + +# Description: Port to bind to (in conjunction with IMEX_CMD_BIND_INTERFACE_IP) for the command/control service. +# Ignored if IMEX_CMD_ENABLED=0 +IMEX_CMD_PORT=50005 + +# Description: Unix domain socket path to attach to for the command/control service.
Ignored if IMEX_CMD_ENABLED=0 +IMEX_CMD_UNIX_DOMAIN_PATH= + +# Description: Determines how long to wait after detecting that the IMEX daemon has lost connection to another +# node before triggering clean up imports and exports from that node. If a connection is reestablished +# before the grace period expires, and IMEX is able to identify that it is the same instance previously +# connected, then no clean up is required. If a connection is established and IMEX detects that it is +# a new instance (i.e. someone restarted the IMEX daemon), then clean up will be immediately triggered +# regardless of grace period. +# -1: Default - Wait indefinitely +# 0: Immediately trigger clean up +# >0: Number of seconds to wait before triggering clean up +IMEX_NODE_DISCONNECTED_GRACE_TIME=-1 \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-nodes.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-nodes.erb new file mode 100644 index 0000000000..22a1737bcf --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-nodes.erb @@ -0,0 +1,3 @@ +## Please replace below fake IP's +172.31.51.93 +172.31.48.43 \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb new file mode 100644 index 0000000000..94d2687a21 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb @@ -0,0 +1,26 @@ +[Unit] +Description=NVIDIA IMEX service +After=network-online.target +Requires=network-online.target + +[Service] +Environment="KRB5_CLIENT_KTNAME=/etc/krb5.keytab" +User=root +PrivateTmp=false +Type=forking +TimeoutStartSec=infinity + +ExecStart=/usr/bin/nvidia-imex -c <%= node['cluster']['shared_dir'] %>/nvidia-imex/config.cfg + +LimitCORE=infinity + +Restart=on-failure 
+RestartSec=1s + +[Install] +WantedBy=multi-user.target + + + + + From 5f03bbb9d5b1727189d4e221bf7a43c1adf25c6e Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 22 Jul 2025 18:40:09 -0400 Subject: [PATCH 02/38] [NVIDIA_IMEX] Adding Unit test for IMEX installation --- .../spec/unit/resources/nvidia_imex_spec.rb | 199 ++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb new file mode 100644 index 0000000000..2a5ba87f33 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -0,0 +1,199 @@ +require 'spec_helper' + +shared_dir = "SHARED_DIR" +nvidia_version = "NVIDIA_VERSION" +nvidia_imex_shared_dir = "#{shared_dir}/nvidia-imex" + +class ConvergeNvidiaImex + def self.install(chef_run) + chef_run.converge_dsl('aws-parallelcluster-platform') do + nvidia_imex 'install' do + action :install + end + end + end +end + +describe 'nvidia_imex:nvidia_enabled_or_installed?' 
do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + cached(:chef_run) do + runner(platform: platform, version: version, step_into: ['nvidia_imex']) + end + cached(:resource) do + ConvergeNvidiaImex.install(chef_run) + chef_run.find_resource('nvidia_imex', 'install') + end + + context "when nvidia not enabled and not installed" do + before do + allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false) + allow_any_instance_of(Object).to receive(:nvidia_installed?).and_return(false) + end + + it 'is false' do + expect(resource.nvidia_enabled_or_installed?).to eq(false) + end + end + + context "when nvidia not enabled but its already installed" do + before do + allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false) + allow_any_instance_of(Object).to receive(:nvidia_installed?).and_return(true) + end + + it 'is true' do + expect(resource.nvidia_enabled_or_installed?).to eq(true) + end + end + + context "when nvidia is enabled but its not installed" do + before do + allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(true) + allow_any_instance_of(Object).to receive(:nvidia_installed?).and_return(false) + end + + it 'is true' do + expect(resource.nvidia_enabled_or_installed?).to eq(true) + end + end + end + end +end + +describe 'nvidia_imex:imex_installed' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + imex_binary = '/usr/bin/nvidia-imex' + imex_ctl_binary = '/usr/bin/nvidia-imex-ctl' + cached(:chef_run) do + runner(platform: platform, version: version, step_into: ['nvidia_imex']) + end + cached(:resource) do + ConvergeNvidiaImex.install(chef_run) + chef_run.find_resource('nvidia_imex', 'install') + end + + context "when #{imex_binary} and #{imex_ctl_binary} does not exist" do + before do + allow(File).to receive(:exist?).with(imex_ctl_binary).and_return(false) + allow(File).to receive(:exist?).with(imex_binary).and_return(false) + end + + it 'is 
false' do + expect(resource.imex_installed).to eq(false) + end + end + + context "when #{imex_binary} and #{imex_ctl_binary} exists" do + before do + allow(File).to receive(:exist?).with(imex_ctl_binary).and_return(true) + allow(File).to receive(:exist?).with(imex_binary).and_return(true) + end + + it 'is true' do + expect(resource.imex_installed).to eq(true) + end + end + + context "when #{imex_binary} exists and #{imex_ctl_binary} does not exists" do + before do + allow(File).to receive(:exist?).with(imex_ctl_binary).and_return(false) + allow(File).to receive(:exist?).with(imex_binary).and_return(true) + end + + it 'is true' do + expect(resource.imex_installed).to eq(true) + end + end + + context "when #{imex_binary} does not exists and #{imex_ctl_binary} exists" do + before do + allow(File).to receive(:exist?).with(imex_ctl_binary).and_return(true) + allow(File).to receive(:exist?).with(imex_binary).and_return(false) + end + + it 'is true' do + expect(resource.imex_installed).to eq(true) + end + end + end + end +end + +describe 'nvidia_imex:install' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + context 'when nvidia not enabled' do + cached(:chef_run) do + stubs_for_resource('nvidia_imex') do |res| + allow(res).to receive(:nvidia_enabled_or_installed?).and_return(false) + end + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.install(runner) + end + cached(:node) { chef_run.node } + + it 'does not install nvidia-imex' do + is_expected.not_to install_package('nvidia-imex') + end + end + + context 'when nvidia-imex binary already exists' do + cached(:chef_run) do + stubs_for_resource('nvidia_imex') do |res| + allow(res).to receive(:imex_installed).and_return(true) + end + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.install(runner) + end + cached(:node) { chef_run.node } + + it 'does not install nvidia-imex' do + 
is_expected.not_to install_package('nvidia-imex') + end + end + + context 'when nvidia is enabled' do + cached(:chef_run) do + stubs_for_resource('nvidia_imex') do |res| + allow(res).to receive(:nvidia_enabled_or_installed?).and_return(true) + allow(res).to receive(:imex_installed).and_return(false) + end + runner(platform: platform, version: version, step_into: ['nvidia_imex']) + end + + before do + chef_run.node.override['cluster']['shared_dir'] = shared_dir + chef_run.node.override['cluster']['nvidia']['driver_version'] = nvidia_version + ConvergeNvidiaImex.install(chef_run) + end + + it 'installs nvidia-imex' do + is_expected.to add_nvidia_repo('add nvidia repository') + is_expected.to create_directory(nvidia_imex_shared_dir) + + is_expected.to create_template("#{nvidia_imex_shared_dir}/config.cfg") + .with(source: 'nvidia-imex/nvidia-imex-config.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.to create_template("#{nvidia_imex_shared_dir}/nodes_config.cfg") + .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.to create_template("/etc/systemd/system/nvidia-imex.service") + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0644') + is_expected.to install_package('nvidia-imex') + .with(retries: 3) + .with(retry_delay: 5) + .with(version: nvidia_version) + end + end + end + end +end From 024ba4e45d8a6f8985ef4531ef3077fd2d7d5f9e Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 22 Jul 2025 18:49:50 -0400 Subject: [PATCH 03/38] [NVIDIA_IMEX] Not Install Nvidia-imex for Isolated regions --- .../resources/nvidia_imex/partial/_nvidia_imex_common.rb | 2 +- .../spec/unit/resources/nvidia_imex_spec.rb | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb 
b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index 023d318b7e..335b13c58a 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -17,7 +17,7 @@ action :install do return unless nvidia_enabled_or_installed? - return if on_docker? || imex_installed + return if on_docker? || imex_installed || aws_region.start_with?("us-iso") # Add NVIDIA repo for nvidia-imex nvidia_repo 'add nvidia repository' do diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index 2a5ba87f33..10e36c75fb 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -165,6 +165,7 @@ def self.install(chef_run) before do chef_run.node.override['cluster']['shared_dir'] = shared_dir + chef_run.node.override['cluster']['region'] = 'aws_region' chef_run.node.override['cluster']['nvidia']['driver_version'] = nvidia_version ConvergeNvidiaImex.install(chef_run) end From d53bf01095ade3c4fc0dc088ebcea60bf132ee3f Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 22 Jul 2025 18:50:15 -0400 Subject: [PATCH 04/38] [NVIDIA_IMEX] Install Nvdia-Imex as part of Build Image --- .../recipes/install/nvidia_install.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb index 7b13be7549..c3a8cea6e9 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb @@ -24,3 +24,5 @@ fabric_manager 'Install Nvidia Fabric Manager' 
nvidia_dcgm 'install Nvidia datacenter-gpu-manager' + +nvidia_imex 'Install Nvidia-imex' \ No newline at end of file From d925ce256422c08962c571ca8c1fcf5d4da3fd0f Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 23 Jul 2025 15:43:01 -0400 Subject: [PATCH 05/38] [Nvidia-imex] Never Install NVIDIA Imex for AL2 --- .../resources/nvidia_imex/nvidia_imex_amazon2.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb index b863b9ea00..e43bd0c4a4 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb @@ -15,3 +15,8 @@ provides :nvidia_imex, platform: 'amazon', platform_version: '2' use 'partial/_nvidia_imex_common.rb' + +def imex_installed + # We do not install NVIDIA-Imex for Alinux2 due to restriction on NVIDIA driver + return true +end \ No newline at end of file From a46954b89891640b2cb8a2edd42ffc9f44d5f874 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 23 Jul 2025 16:33:51 -0400 Subject: [PATCH 06/38] [Nvidia-imex] Add unit tests for NVidia Imex --- .../spec/unit/resources/nvidia_imex_spec.rb | 89 +++++++++++++------ 1 file changed, 61 insertions(+), 28 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index 10e36c75fb..ec67db5ab6 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -3,6 +3,8 @@ shared_dir = "SHARED_DIR" nvidia_version = "NVIDIA_VERSION" nvidia_imex_shared_dir = "#{shared_dir}/nvidia-imex" +imex_binary = '/usr/bin/nvidia-imex' +imex_ctl_binary = 
'/usr/bin/nvidia-imex-ctl' class ConvergeNvidiaImex def self.install(chef_run) @@ -64,8 +66,6 @@ def self.install(chef_run) describe 'nvidia_imex:imex_installed' do for_all_oses do |platform, version| context "on #{platform}#{version}" do - imex_binary = '/usr/bin/nvidia-imex' - imex_ctl_binary = '/usr/bin/nvidia-imex-ctl' cached(:chef_run) do runner(platform: platform, version: version, step_into: ['nvidia_imex']) end @@ -80,8 +80,14 @@ def self.install(chef_run) allow(File).to receive(:exist?).with(imex_binary).and_return(false) end - it 'is false' do - expect(resource.imex_installed).to eq(false) + if platform == 'amazon' && version =='2' + it 'is true' do + expect(resource.imex_installed).to eq(true) + end + else + it 'is false' do + expect(resource.imex_installed).to eq(false) + end end end @@ -158,7 +164,8 @@ def self.install(chef_run) cached(:chef_run) do stubs_for_resource('nvidia_imex') do |res| allow(res).to receive(:nvidia_enabled_or_installed?).and_return(true) - allow(res).to receive(:imex_installed).and_return(false) + allow(File).to receive(:exist?).with(imex_ctl_binary).and_return(false) + allow(File).to receive(:exist?).with(imex_binary).and_return(false) end runner(platform: platform, version: version, step_into: ['nvidia_imex']) end @@ -170,29 +177,55 @@ def self.install(chef_run) ConvergeNvidiaImex.install(chef_run) end - it 'installs nvidia-imex' do - is_expected.to add_nvidia_repo('add nvidia repository') - is_expected.to create_directory(nvidia_imex_shared_dir) - - is_expected.to create_template("#{nvidia_imex_shared_dir}/config.cfg") - .with(source: 'nvidia-imex/nvidia-imex-config.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - is_expected.to create_template("#{nvidia_imex_shared_dir}/nodes_config.cfg") - .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - is_expected.to create_template("/etc/systemd/system/nvidia-imex.service") - .with(source: 
'nvidia-imex/nvidia-imex.service.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0644') - is_expected.to install_package('nvidia-imex') - .with(retries: 3) - .with(retry_delay: 5) - .with(version: nvidia_version) + if platform == 'amazon' && version == '2' + it 'does not install nvidia-imex' do + is_expected.not_to add_nvidia_repo('add nvidia repository') + is_expected.not_to create_directory(nvidia_imex_shared_dir) + is_expected.not_to create_template("#{nvidia_imex_shared_dir}/config.cfg") + .with(source: 'nvidia-imex/nvidia-imex-config.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.not_to create_template("#{nvidia_imex_shared_dir}/nodes_config.cfg") + .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service") + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0644') + is_expected.not_to install_package('nvidia-imex') + .with(retries: 3) + .with(retry_delay: 5) + .with(version: nvidia_version) + end + else + it 'installs nvidia-imex' do + is_expected.to add_nvidia_repo('add nvidia repository') + is_expected.to create_directory(nvidia_imex_shared_dir) + + is_expected.to create_template("#{nvidia_imex_shared_dir}/config.cfg") + .with(source: 'nvidia-imex/nvidia-imex-config.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.to create_template("#{nvidia_imex_shared_dir}/nodes_config.cfg") + .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.to create_template("/etc/systemd/system/nvidia-imex.service") + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0644') + is_expected.to install_package('nvidia-imex') + .with(retries: 3) + 
.with(retry_delay: 5) + .with(version: nvidia_version) + end end end end From ff89ace4a787bee9135aefee7d4d4d8896e79478 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 23 Jul 2025 16:36:23 -0400 Subject: [PATCH 07/38] [Nvidia-imex] Cookstyle changes --- .../recipes/install/nvidia_install.rb | 2 +- .../resources/nvidia_imex/nvidia_imex_amazon2.rb | 4 ++-- .../spec/unit/resources/nvidia_imex_spec.rb | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb index c3a8cea6e9..b7ab7c4d52 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb @@ -25,4 +25,4 @@ nvidia_dcgm 'install Nvidia datacenter-gpu-manager' -nvidia_imex 'Install Nvidia-imex' \ No newline at end of file +nvidia_imex 'Install Nvidia-imex' diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb index e43bd0c4a4..543c953137 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb @@ -18,5 +18,5 @@ def imex_installed # We do not install NVIDIA-Imex for Alinux2 due to restriction on NVIDIA driver - return true -end \ No newline at end of file + true +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index ec67db5ab6..123ed57c8f 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -80,7 +80,7 @@ def self.install(chef_run) 
allow(File).to receive(:exist?).with(imex_binary).and_return(false) end - if platform == 'amazon' && version =='2' + if platform == 'amazon' && version == '2' it 'is true' do expect(resource.imex_installed).to eq(true) end @@ -192,9 +192,9 @@ def self.install(chef_run) .with(group: 'root') .with(mode: '0755') is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service") - .with(source: 'nvidia-imex/nvidia-imex.service.erb') - .with(user: 'root') - .with(group: 'root') + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') .with(mode: '0644') is_expected.not_to install_package('nvidia-imex') .with(retries: 3) From cdce37fc33e24a875ef2cec88de96cab59635c0b Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 23 Jul 2025 17:24:28 -0400 Subject: [PATCH 08/38] [NVIDIA_IMEX] Adding Kitchen test for Installation and Configuration --- .../test/controls/nvidia_imex_spec.rb | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb new file mode 100644 index 0000000000..f80aa62f86 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb @@ -0,0 +1,48 @@ +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +control 'tag:install_expected_versions_of_nvidia_imex_installed' do + only_if { ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } + + describe package('nvidia-imex') do + it { should be_installed } + its('version') { should match /#{node['cluster']['nvidia']['driver_version']}/ } + end + + %w(/usr/bin/nvidia-imex /usr/bin/nvidia-imex-ctl).each do |path| + describe file(path) do + it { should exist } + its('owner') { should eq 'root' } + its('group') { should eq 'root' } + its('mode') { should cmp '0755' } + end + end + + nvidia_imex_dir = "#{node['cluster']['shared_dir']}/nvidia-imex" + + %w("#{nvidia_imex_dir}/config.cfg" "#{nvidia_imex_dir}/nodes_config.cfg").each do |conf_files| + describe file(conf_files) do + it { should exist } + its('owner') { should eq 'root' } + its('group') { should eq 'root' } + its('mode') { should cmp '0755' } + end + end +end + +control 'tag:config_nvidia_fabric_manager_enabled' do + only_if { instance.nvs_switch_enabled? } + + describe service('nvidia-imex') do + it { should be_enabled } + it { should be_running } + end +end From 6726f3ebe7eaaf8b58d6412628255d2cc60ffac7 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 23 Jul 2025 18:02:09 -0400 Subject: [PATCH 09/38] [FABRIC MANAGER] Using common library for getting NVSwitch count --- .../libraries/nvidia.rb | 18 ++++++++++++++++++ .../partial/_fabric_manager_common.rb | 10 ++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb b/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb index a5952b0313..3e9e59873e 100644 --- a/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb +++ b/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb @@ -18,3 +18,21 @@ def is_process_running(process_name) !ps.stdout.strip.empty? 
end + +# +# Get count of NVSwitch devices matching the given PCI device id +# +def get_nvswitch_count(device_id) + shell_out("lspci -d #{device_id} | wc -l").stdout.strip.to_i +end + +def get_device_ids + # A100 (P4), H100(P5), B200(P6) and GB200 (P6e) systems have NVSwitches + # NVSwitch device id is 10de:1af1 for P4 instance + # NVSwitch device id is 10de:22a3 for P5 instance + # NVSwitch device id is 10de:2901 for P6 instance + # NVSwitch device id is 10de:2941 for P6e instance + # We sum the count for all these deviceIds as output of lspci command will be >0 + # for only one device ID based on the instance type + { 'a100' => '10de:1af1', 'h100' => '10de:22a3', 'b200' => '10de:2901', 'gb200' => '10de:2941' } +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb index 057ac80c17..4b6c74d0ea 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb @@ -54,12 +54,6 @@ def _nvidia_driver_version # Get number of nv switches def get_nvswitches - # A100 (P4), H100(P5) and B200(P6) systems have NVSwitches - # NVSwitch device id is 10de:1af1 for P4 instance - # NVSwitch device id is 10de:22a3 for P5 instance - # NVSwitch device id is 10de:2901 for P6 instance - # We sum the count for all these deviceIds as output of lscpi command will be >0 - # for only one device ID based on the instance type - nvswitch_device_ids = ['10de:1af1', '10de:22a3', '10de:2901'] - nvswitch_device_ids.sum { |id| shell_out("lspci -d #{id} | wc -l").stdout.strip.to_i } + nvswitch_device_ids = get_device_ids.values + nvswitch_device_ids.sum { |id| get_nvswitch_count(id) } end From 368fbeed800e267fcad853b2b12c36a3e9f74d49 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 23 Jul 2025 18:10:11 -0400
Subject: [PATCH 10/38] [NVIDIA-IMEX] Configure Nvidia-imex only if we use Gb200 instance --- .../nvidia_imex/partial/_nvidia_imex_common.rb | 11 +++++++++++ .../test/controls/nvidia_imex_spec.rb | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index 335b13c58a..9a4d2eef1c 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -55,6 +55,17 @@ end end +action :configure do + return unless imex_installed + # Start nvidia-imex on p6e-gb200 + if get_nvswitch_count(get_device_ids['gb200']) > 1 + service 'nvidia-imex' do + action %i(start enable) + supports status: true + end unless on_docker? + end +end + def imex_installed ::File.exist?('/usr/bin/nvidia-imex') || ::File.exist?('/usr/bin/nvidia-imex-ctl') end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb index f80aa62f86..ac03d1cef7 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb @@ -28,7 +28,7 @@ nvidia_imex_dir = "#{node['cluster']['shared_dir']}/nvidia-imex" - %w("#{nvidia_imex_dir}/config.cfg" "#{nvidia_imex_dir}/nodes_config.cfg").each do |conf_files| + ["#{nvidia_imex_dir}/config.cfg", "#{nvidia_imex_dir}/nodes_config.cfg"].each do |conf_files| describe file(conf_files) do it { should exist } its('owner') { should eq 'root' } @@ -39,7 +39,7 @@ end control 'tag:config_nvidia_fabric_manager_enabled' do - only_if { instance.nvs_switch_enabled? } + only_if { instance.nvs_switch_enabled? 
&& node['cluster']['node_type'] == "ComputeFleet" } describe service('nvidia-imex') do it { should be_enabled } From 9f09451a94dc519c27911e250de498347b4e17eb Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 24 Jul 2025 17:50:12 -0400 Subject: [PATCH 11/38] [NVIDIA-IMEX] USe specific Version naming for nvidia-imex installation --- .../resources/nvidia_imex/nvidia_imex_alinux2023.rb | 5 +++++ .../resources/nvidia_imex/nvidia_imex_redhat8.rb | 5 +++++ .../resources/nvidia_imex/nvidia_imex_rocky8.rb | 5 +++++ .../resources/nvidia_imex/nvidia_imex_ubuntu22+.rb | 5 +++++ .../nvidia_imex/partial/_nvidia_imex_common.rb | 2 +- .../spec/unit/resources/nvidia_imex_spec.rb | 13 ++++++++++--- 6 files changed, 31 insertions(+), 4 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb index bb39282744..5b2cdd945f 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb @@ -17,3 +17,8 @@ end use 'partial/_nvidia_imex_common.rb' + +def _nvidia_imex_version + nvidia_major_version = node['cluster']['nvidia']['driver_version'].split('.')[0] + "#{nvidia_major_version}-#{node['cluster']['nvidia']['driver_version']}-1" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb index aeff2dffdd..ba1b145faa 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb @@ -17,3 +17,8 @@ end use 'partial/_nvidia_imex_common.rb' + +def _nvidia_imex_version + nvidia_major_version = node['cluster']['nvidia']['driver_version'].split('.')[0] + 
"#{nvidia_major_version}-#{node['cluster']['nvidia']['driver_version']}-1" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb index 2f91dd302a..0216613628 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb @@ -17,3 +17,8 @@ end use 'partial/_nvidia_imex_common.rb' + +def _nvidia_imex_version + nvidia_major_version = node['cluster']['nvidia']['driver_version'].split('.')[0] + "#{nvidia_major_version}-#{node['cluster']['nvidia']['driver_version']}-1" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb index b3c26d0a94..191e1e9fd3 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb @@ -17,3 +17,8 @@ end use 'partial/_nvidia_imex_common.rb' + +def _nvidia_imex_version + nvidia_major_version = node['cluster']['nvidia']['driver_version'].split('.')[0] + "#{nvidia_major_version}_#{node['cluster']['nvidia']['driver_version']}-1" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index 9a4d2eef1c..08383255c8 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -51,7 +51,7 @@ package 'nvidia-imex' do retries 3 retry_delay 5 - version node['cluster']['nvidia']['driver_version'] + version _nvidia_imex_version end end diff --git 
a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index 123ed57c8f..dbb711b763 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -1,7 +1,7 @@ require 'spec_helper' shared_dir = "SHARED_DIR" -nvidia_version = "NVIDIA_VERSION" +nvidia_version = "1.2.3" nvidia_imex_shared_dir = "#{shared_dir}/nvidia-imex" imex_binary = '/usr/bin/nvidia-imex' imex_ctl_binary = '/usr/bin/nvidia-imex-ctl' @@ -169,6 +169,13 @@ def self.install(chef_run) end runner(platform: platform, version: version, step_into: ['nvidia_imex']) end + cached(:nvidia_imex_version) do + if %(redhat rocky).include?(platform) || platform == 'amazon' && version == '2023' + "1-1.2.3-1" + else + "1_1.2.3-1" + end + end before do chef_run.node.override['cluster']['shared_dir'] = shared_dir @@ -199,7 +206,7 @@ def self.install(chef_run) is_expected.not_to install_package('nvidia-imex') .with(retries: 3) .with(retry_delay: 5) - .with(version: nvidia_version) + .with(version: nvidia_imex_version) end else it 'installs nvidia-imex' do @@ -224,7 +231,7 @@ def self.install(chef_run) is_expected.to install_package('nvidia-imex') .with(retries: 3) .with(retry_delay: 5) - .with(version: nvidia_version) + .with(version: nvidia_imex_version) end end end From f743ef83470a622e98cc63981bcfe5046fdb8674 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Sun, 27 Jul 2025 21:01:00 -0400 Subject: [PATCH 12/38] [NVIDIA-IMEX] Install Nvidia-imex and flush cache before it --- .../resources/nvidia_imex/partial/_nvidia_imex_common.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index 
08383255c8..867601a0da 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -51,6 +51,7 @@ package 'nvidia-imex' do retries 3 retry_delay 5 + flush_cache({ before: true }) version _nvidia_imex_version end end From 750563566e664daae50185f2abd2b321607204f1 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Sun, 27 Jul 2025 21:05:03 -0400 Subject: [PATCH 13/38] [NVIDIA-IMEX] Redirect nvidia-imex to system logs which are pushed in CW --- .../templates/nvidia-imex/nvidia-imex-config.erb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb index a6c5403883..1059e94af2 100644 --- a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb +++ b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb @@ -18,7 +18,7 @@ LOG_LEVEL=4 # to console(stderr). If the specified log file can't be opened or the # path is empty. # Default Value: /var/log/nvidia-imex.log -LOG_FILE_NAME=/var/log/nvidia-imex.log +# LOG_FILE_NAME=/var/log/nvidia-imex.log # Description: Filename for IMEX stats logging # Possible Values: @@ -28,7 +28,7 @@ LOG_FILE_NAME=/var/log/nvidia-imex.log # Default Value: /var/log/nvidia-imex-stats.log # Note: If STATS_FILE_NAME is configured same as LOG_FILE_NAME, then stats will # be redirected to the path/filename specified by LOG_FILE_NAME. 
-STATS_FILE_NAME=/var/log/nvidia-imex-stats.log +# STATS_FILE_NAME=/var/log/nvidia-imex-stats.log # Description: Append to an existing log file or overwrite the logs # Possible Values: @@ -41,7 +41,7 @@ LOG_APPEND_TO_LOG=1 # Possible Values: # Any Integer values # Default Value: 1024 -LOG_FILE_MAX_SIZE=1024 +# LOG_FILE_MAX_SIZE=1024 # Description: Number of times the IMEX log is rotated once it reaches LOG_FILE_MAX_SIZE # Possible Values: @@ -58,7 +58,7 @@ LOG_MAX_ROTATE_COUNT=3 # 0 - No # 1 - Yes # Default Value: 0 -LOG_USE_SYSLOG=0 +LOG_USE_SYSLOG=1 # Description: daemonize IMEX on start-up # Possible Values: From 4bfca5bae7566f95d159e9f2f6dd371b573fa578 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Sun, 27 Jul 2025 21:24:36 -0400 Subject: [PATCH 14/38] [NVIDIA-IMEX] Install With specific version in name --- .../resources/nvidia_imex/partial/_nvidia_imex_common.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index 867601a0da..003f186155 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -48,11 +48,11 @@ action :create end - package 'nvidia-imex' do + package "nvidia-imex-#{_nvidia_imex_version}" do retries 3 retry_delay 5 flush_cache({ before: true }) - version _nvidia_imex_version + # version _nvidia_imex_version end end From dc8e30a50a583473eb2cdd54fcfdee4569a3ffe8 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 28 Jul 2025 09:47:26 -0400 Subject: [PATCH 15/38] [NVIDIA-IMEX] Removing flush cache as it does not exist for package resource --- .../resources/nvidia_imex/partial/_nvidia_imex_common.rb | 1 - 1 file changed, 1 deletion(-) diff --git 
a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index 003f186155..844683747b 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -51,7 +51,6 @@ package "nvidia-imex-#{_nvidia_imex_version}" do retries 3 retry_delay 5 - flush_cache({ before: true }) # version _nvidia_imex_version end end From 2190669db3b4df02ca53ab47b563fd7a07b0ce46 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 28 Jul 2025 12:16:32 -0400 Subject: [PATCH 16/38] [NVIDIA-IMEX] Installing NVIDIA IMEx using install_packages resource --- .../partial/_nvidia_imex_common.rb | 10 ++++---- .../spec/unit/resources/nvidia_imex_spec.rb | 23 ++++++++++++------- .../test/controls/nvidia_imex_spec.rb | 2 +- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index 844683747b..af5e6a120a 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -48,11 +48,13 @@ action :create end - package "nvidia-imex-#{_nvidia_imex_version}" do - retries 3 - retry_delay 5 - # version _nvidia_imex_version + install_packages 'Install nvidia-imex' do + packages "nvidia-imex-#{_nvidia_imex_version}" + action :install end + # Save Imex version in Node Attributes for InSpec Tests + node.default['cluster']['nvidia']['imex']['version'] = _nvidia_imex_version + node_attributes 'dump node attributes' end action :configure do diff --git 
a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index dbb711b763..90e9a6c8f4 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -176,6 +176,7 @@ def self.install(chef_run) "1_1.2.3-1" end end + cached(:node) { chef_run.node } before do chef_run.node.override['cluster']['shared_dir'] = shared_dir @@ -203,10 +204,13 @@ def self.install(chef_run) .with(user: 'root') .with(group: 'root') .with(mode: '0644') - is_expected.not_to install_package('nvidia-imex') - .with(retries: 3) - .with(retry_delay: 5) - .with(version: nvidia_imex_version) + is_expected.not_to install_install_packages('Install nvidia-imex') + .with(packages: "nvidia-imex-#{nvidia_imex_version}") + .with(action: %i(install)) + end + it 'does not set nvidia-imex version' do + expect(node.default['cluster']['nvidia']['imex']['version']).not_to eq(nvidia_imex_version) + is_expected.not_to write_node_attributes('dump node attributes') end else it 'installs nvidia-imex' do @@ -228,10 +232,13 @@ def self.install(chef_run) .with(user: 'root') .with(group: 'root') .with(mode: '0644') - is_expected.to install_package('nvidia-imex') - .with(retries: 3) - .with(retry_delay: 5) - .with(version: nvidia_imex_version) + is_expected.to install_install_packages('Install nvidia-imex') + .with(packages: "nvidia-imex-#{nvidia_imex_version}") + .with(action: %i(install)) + end + it 'sets nvidia-imex version' do + expect(node.default['cluster']['nvidia']['imex']['version']).to eq(nvidia_imex_version) + is_expected.to write_node_attributes('dump node attributes') end end end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb index ac03d1cef7..3a4f212962 100644 --- 
a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb @@ -14,7 +14,7 @@ describe package('nvidia-imex') do it { should be_installed } - its('version') { should match /#{node['cluster']['nvidia']['driver_version']}/ } + its('version') { should match /#{node['cluster']['nvidia']['imex']['version']}/ } end %w(/usr/bin/nvidia-imex /usr/bin/nvidia-imex-ctl).each do |path| From 4c7291d8bd23660c3a751a3a8946e625ca5092a6 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 28 Jul 2025 15:12:08 -0400 Subject: [PATCH 17/38] [NVIDIA-IMEX] Adding Unit tests for Configuration of nvidia-imex --- .../nvidia_imex/nvidia_imex_amazon2.rb | 4 + .../partial/_nvidia_imex_common.rb | 20 +++-- .../spec/unit/resources/nvidia_imex_spec.rb | 77 +++++++++++++++++++ .../test/controls/nvidia_imex_spec.rb | 8 +- 4 files changed, 98 insertions(+), 11 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb index 543c953137..f639dd50c4 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb @@ -20,3 +20,7 @@ def imex_installed # We do not install NVIDIA-Imex for Alinux2 due to restriction on NVIDIA driver true end + +action :configure do + # Do nothing +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index af5e6a120a..9d908cf191 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -24,23 +24,23 @@ action :add end - 
directory "#{node['cluster']['shared_dir']}/nvidia-imex" + directory "#{node['cluster']['shared_dir']}/#{nvidia_imex_service}" - template "#{node['cluster']['shared_dir']}/nvidia-imex/config.cfg" do + template "#{node['cluster']['shared_dir']}/#{nvidia_imex_service}/config.cfg" do source 'nvidia-imex/nvidia-imex-config.erb' owner 'root' group 'root' mode '0755' end - template "#{node['cluster']['shared_dir']}/nvidia-imex/nodes_config.cfg" do + template "#{node['cluster']['shared_dir']}/#{nvidia_imex_service}/nodes_config.cfg" do source 'nvidia-imex/nvidia-imex-nodes.erb' owner 'root' group 'root' mode '0755' end - template "/etc/systemd/system/nvidia-imex.service" do + template "/etc/systemd/system/#{nvidia_imex_service}.service" do source 'nvidia-imex/nvidia-imex.service.erb' owner 'root' group 'root' @@ -49,7 +49,7 @@ end install_packages 'Install nvidia-imex' do - packages "nvidia-imex-#{_nvidia_imex_version}" + packages "#{nvidia_imex_service}-#{_nvidia_imex_version}" action :install end # Save Imex version in Node Attributes for InSpec Tests @@ -61,15 +61,19 @@ return unless imex_installed # Start nvidia-imex on p6e-gb200 if get_nvswitch_count(get_device_ids['gb200']) > 1 - service 'nvidia-imex' do + service nvidia_imex_service do action %i(start enable) supports status: true - end unless on_docker? + end end end +def nvidia_imex_service + 'nvidia-imex' +end + def imex_installed - ::File.exist?('/usr/bin/nvidia-imex') || ::File.exist?('/usr/bin/nvidia-imex-ctl') + ::File.exist?("/usr/bin/#{nvidia_imex_service}") || ::File.exist?("/usr/bin/#{nvidia_imex_service}-ctl") end def nvidia_enabled_or_installed? 
diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index 90e9a6c8f4..d211d3901d 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -14,6 +14,14 @@ def self.install(chef_run) end end end + + def self.configure(chef_run) + chef_run.converge_dsl('aws-parallelcluster-platform') do + nvidia_imex 'configure' do + action :configure + end + end + end end describe 'nvidia_imex:nvidia_enabled_or_installed?' do @@ -245,3 +253,72 @@ def self.install(chef_run) end end end + +describe 'nvidia_imex:configure' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + context 'when nvidia-imex binary is not installed' do + cached(:chef_run) do + stubs_for_resource('nvidia_imex') do |res| + allow(res).to receive(:imex_installed).and_return(false) + end + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.configure(runner) + end + cached(:node) { chef_run.node } + + it 'does not configure nvidia-imex' do + is_expected.not_to configure_nvidia_imex('nvidia-imex') + end + end + + context 'when get_nvswitch_count > 1' do + cached(:chef_run) do + stubs_for_provider('nvidia_imex[configure]') do |pro| + allow(pro).to receive(:imex_installed).and_return(true) + allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) + allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4) + end + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.configure(runner) + end + cached(:node) { chef_run.node } + + before do + chef_run.node.override['cluster']['region'] = 'aws_region' + end + + if platform == 'amazon' && version == '2' + it 'does not configure nvidia-imex' do + is_expected.not_to 
start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true }) + end + else + it 'starts nvidia-imex service' do + is_expected.to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true }) + end + end + end + + context 'when get_nvswitch_count <= 1' do + cached(:chef_run) do + stubs_for_provider('nvidia_imex[configure]') do |pro| + allow(pro).to receive(:imex_installed).and_return(true) + allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) + allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1) + end + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.configure(runner) + end + cached(:node) { chef_run.node } + + before do + chef_run.node.override['cluster']['region'] = 'aws_region' + end + + it 'does not configure nvidia-imex' do + is_expected.not_to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true }) + end + end + end + end +end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb index 3a4f212962..da4383eed2 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb @@ -12,12 +12,14 @@ control 'tag:install_expected_versions_of_nvidia_imex_installed' do only_if { ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } - describe package('nvidia-imex') do + nvidia_imex_service = 'nvidia-imex' + + describe package(nvidia_imex_service) do it { should be_installed } its('version') { should match /#{node['cluster']['nvidia']['imex']['version']}/ } end - %w(/usr/bin/nvidia-imex /usr/bin/nvidia-imex-ctl).each do |path| + ["/usr/bin/#{nvidia_imex_service}", "/usr/bin/#{nvidia_imex_service}-ctl"].each do |path| describe file(path) do it { should exist } 
its('owner') { should eq 'root' } @@ -26,7 +28,7 @@ end end - nvidia_imex_dir = "#{node['cluster']['shared_dir']}/nvidia-imex" + nvidia_imex_dir = "#{node['cluster']['shared_dir']}/#{nvidia_imex_service}" ["#{nvidia_imex_dir}/config.cfg", "#{nvidia_imex_dir}/nodes_config.cfg"].each do |conf_files| describe file(conf_files) do From c336054bccfb8d530afdbdbc296163035ab52824 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 28 Jul 2025 16:03:30 -0400 Subject: [PATCH 18/38] [NVIDIA-IMEX] Configuring nvidia-imex only for gb200 and ComputeFleet node --- .../recipes/config/nvidia_config.rb | 4 ++ .../partial/_nvidia_imex_common.rb | 4 +- .../spec/unit/resources/nvidia_imex_spec.rb | 47 ++++++++++--------- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/recipes/config/nvidia_config.rb b/cookbooks/aws-parallelcluster-platform/recipes/config/nvidia_config.rb index 06fc108637..f9f4ea2fc6 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/config/nvidia_config.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/config/nvidia_config.rb @@ -24,3 +24,7 @@ end include_recipe "aws-parallelcluster-platform::nvidia_uvm" + +nvidia_imex 'Configure nvidia-imex' do + action :configure +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index 9d908cf191..c4d2243093 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -58,8 +58,8 @@ end action :configure do - return unless imex_installed - # Start nvidia-imex on p6e-gb200 + return unless imex_installed && node['cluster']['node_type'] == "ComputeFleet" + # Start nvidia-imex on p6e-gb200 and only on ComputeFleet if get_nvswitch_count(get_device_ids['gb200']) > 1 
service nvidia_imex_service do action %i(start enable) diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index d211d3901d..af9c50a9b7 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -257,7 +257,7 @@ def self.configure(chef_run) describe 'nvidia_imex:configure' do for_all_oses do |platform, version| context "on #{platform}#{version}" do - context 'when nvidia-imex binary is not installed' do + context "when nvidia-imex binary is not installed" do cached(:chef_run) do stubs_for_resource('nvidia_imex') do |res| allow(res).to receive(:imex_installed).and_return(false) @@ -272,34 +272,37 @@ def self.configure(chef_run) end end - context 'when get_nvswitch_count > 1' do - cached(:chef_run) do - stubs_for_provider('nvidia_imex[configure]') do |pro| - allow(pro).to receive(:imex_installed).and_return(true) - allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) - allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4) + %w(HeadNode LoginNode ComputeFleet).each do |node_type| + context "when get_nvswitch_count > 1 on #{node_type} node" do + cached(:chef_run) do + stubs_for_provider('nvidia_imex[configure]') do |pro| + allow(pro).to receive(:imex_installed).and_return(true) + allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) + allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4) + end + runner(platform: platform, version: version, step_into: ['nvidia_imex']) end - runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) - ConvergeNvidiaImex.configure(runner) - end - cached(:node) { chef_run.node } + cached(:node) { chef_run.node } - before do - chef_run.node.override['cluster']['region'] = 'aws_region' - end - - if platform == 
'amazon' && version == '2' - it 'does not configure nvidia-imex' do - is_expected.not_to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true }) + before do + chef_run.node.override['cluster']['region'] = 'aws_region' + chef_run.node.override['cluster']['node_type'] = node_type + ConvergeNvidiaImex.configure(chef_run) end - else - it 'starts nvidia-imex service' do - is_expected.to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true }) + + if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type) + it 'does not configure nvidia-imex' do + is_expected.not_to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true }) + end + else + it 'it starts nvidia-imex service' do + is_expected.to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true }) + end end end end - context 'when get_nvswitch_count <= 1' do + context "when get_nvswitch_count <= 1" do cached(:chef_run) do stubs_for_provider('nvidia_imex[configure]') do |pro| allow(pro).to receive(:imex_installed).and_return(true) From dd7e0ef44b56c70acc39c53eaee61a0d7e717bfc Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 28 Jul 2025 16:08:15 -0400 Subject: [PATCH 19/38] [NVIDIA-IMEX] Not check installation of nvidia-imex for Alinux2 --- .../test/controls/nvidia_imex_spec.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb index da4383eed2..008b509e1d 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and limitations under the License. 
control 'tag:install_expected_versions_of_nvidia_imex_installed' do - only_if { ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } + only_if { ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !os_properties.alinux2? } nvidia_imex_service = 'nvidia-imex' @@ -41,7 +41,7 @@ end control 'tag:config_nvidia_fabric_manager_enabled' do - only_if { instance.nvs_switch_enabled? && node['cluster']['node_type'] == "ComputeFleet" } + only_if { instance.nvs_switch_enabled? && node['cluster']['node_type'] == "ComputeFleet" && !os_properties.alinux2? } describe service('nvidia-imex') do it { should be_enabled } From f9c324feb34afc372ff00c0297cf0d2f54de03fd Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 28 Jul 2025 16:31:28 -0400 Subject: [PATCH 20/38] [NVIDIA-IMEX] Inspec Test --- .../resources/nvidia_imex/partial/_nvidia_imex_common.rb | 1 + .../test/controls/nvidia_imex_spec.rb | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index c4d2243093..3b0fef81fc 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -54,6 +54,7 @@ end # Save Imex version in Node Attributes for InSpec Tests node.default['cluster']['nvidia']['imex']['version'] = _nvidia_imex_version + node.default['cluster']['nvidia']['imex']['package'] = "#{nvidia_imex_service}-#{node['cluster']['nvidia']['driver_version'].split('.')[0]}" node_attributes 'dump node attributes' end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb index 008b509e1d..6a82a4b6e8 100644 --- 
a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb @@ -14,7 +14,7 @@ nvidia_imex_service = 'nvidia-imex' - describe package(nvidia_imex_service) do + describe package("#{node['cluster']['nvidia']['imex']['package']}") do it { should be_installed } its('version') { should match /#{node['cluster']['nvidia']['imex']['version']}/ } end From ee601b77cd0fa77be68acb2fd6a4a85ef73373f0 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 28 Jul 2025 17:34:09 -0400 Subject: [PATCH 21/38] [NVIDIA-IMEX] Setting Nvidia-imex node attributes which should show the version and package name as seen on the repositories --- .../resources/nvidia_imex/nvidia_imex_alinux2023.rb | 3 +-- .../resources/nvidia_imex/nvidia_imex_redhat8.rb | 3 +-- .../resources/nvidia_imex/nvidia_imex_rocky8.rb | 3 +-- .../resources/nvidia_imex/nvidia_imex_ubuntu22+.rb | 3 +-- .../nvidia_imex/partial/_nvidia_imex_common.rb | 12 ++++++++++-- .../spec/unit/resources/nvidia_imex_spec.rb | 11 ++++++----- 6 files changed, 20 insertions(+), 15 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb index 5b2cdd945f..be838b6460 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb @@ -19,6 +19,5 @@ use 'partial/_nvidia_imex_common.rb' def _nvidia_imex_version - nvidia_major_version = node['cluster']['nvidia']['driver_version'].split('.')[0] - "#{nvidia_major_version}-#{node['cluster']['nvidia']['driver_version']}-1" + "#{nvidia_driver_major_version}-#{nvidia_imex_full_version}" end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb 
b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb index ba1b145faa..f0fb13c7e2 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb @@ -19,6 +19,5 @@ use 'partial/_nvidia_imex_common.rb' def _nvidia_imex_version - nvidia_major_version = node['cluster']['nvidia']['driver_version'].split('.')[0] - "#{nvidia_major_version}-#{node['cluster']['nvidia']['driver_version']}-1" + "#{nvidia_driver_major_version}-#{nvidia_imex_full_version}" end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb index 0216613628..8c60db55e2 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb @@ -19,6 +19,5 @@ use 'partial/_nvidia_imex_common.rb' def _nvidia_imex_version - nvidia_major_version = node['cluster']['nvidia']['driver_version'].split('.')[0] - "#{nvidia_major_version}-#{node['cluster']['nvidia']['driver_version']}-1" + "#{nvidia_driver_major_version}-#{nvidia_imex_full_version}" end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb index 191e1e9fd3..b77d13a734 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb @@ -19,6 +19,5 @@ use 'partial/_nvidia_imex_common.rb' def _nvidia_imex_version - nvidia_major_version = node['cluster']['nvidia']['driver_version'].split('.')[0] - "#{nvidia_major_version}_#{node['cluster']['nvidia']['driver_version']}-1" + 
"#{nvidia_driver_major_version}_#{nvidia_imex_full_version}" end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index 3b0fef81fc..316fca8be7 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -53,8 +53,8 @@ action :install end # Save Imex version in Node Attributes for InSpec Tests - node.default['cluster']['nvidia']['imex']['version'] = _nvidia_imex_version - node.default['cluster']['nvidia']['imex']['package'] = "#{nvidia_imex_service}-#{node['cluster']['nvidia']['driver_version'].split('.')[0]}" + node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version + node.default['cluster']['nvidia']['imex']['package'] = "#{nvidia_imex_service}-#{nvidia_driver_major_version}" node_attributes 'dump node attributes' end @@ -69,10 +69,18 @@ end end +def nvidia_driver_major_version + node['cluster']['nvidia']['driver_version'].split('.')[0] +end + def nvidia_imex_service 'nvidia-imex' end +def nvidia_imex_full_version + "#{node['cluster']['nvidia']['driver_version']}-1" +end + def imex_installed ::File.exist?("/usr/bin/#{nvidia_imex_service}") || ::File.exist?("/usr/bin/#{nvidia_imex_service}-ctl") end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index af9c50a9b7..3cea658347 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -177,11 +177,12 @@ def self.configure(chef_run) end runner(platform: platform, version: version, step_into: ['nvidia_imex']) end - cached(:nvidia_imex_version) do + 
cached(:nvidia_imex_version) { "1.2.3-1" } + cached(:nvidia_imex_name) do if %(redhat rocky).include?(platform) || platform == 'amazon' && version == '2023' - "1-1.2.3-1" + "nvidia-imex-1-#{nvidia_imex_version}" else - "1_1.2.3-1" + "nvidia-imex-1_#{nvidia_imex_version}" end end cached(:node) { chef_run.node } @@ -213,7 +214,7 @@ def self.configure(chef_run) .with(group: 'root') .with(mode: '0644') is_expected.not_to install_install_packages('Install nvidia-imex') - .with(packages: "nvidia-imex-#{nvidia_imex_version}") + .with(packages: "#{nvidia_imex_name}") .with(action: %i(install)) end it 'does not set nvidia-imex version' do @@ -241,7 +242,7 @@ def self.configure(chef_run) .with(group: 'root') .with(mode: '0644') is_expected.to install_install_packages('Install nvidia-imex') - .with(packages: "nvidia-imex-#{nvidia_imex_version}") + .with(packages: "#{nvidia_imex_name}") .with(action: %i(install)) end it 'sets nvidia-imex version' do From f40e93674a6abb01a19c254cef8e3135413fb12b Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 29 Jul 2025 09:53:11 -0400 Subject: [PATCH 22/38] [NVIDIA-IMEX] Test epoch version --- .../resources/nvidia_imex/nvidia_imex_ubuntu22+.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb index b77d13a734..4e99b8b1d5 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb @@ -19,5 +19,8 @@ use 'partial/_nvidia_imex_common.rb' def _nvidia_imex_version - "#{nvidia_driver_major_version}_#{nvidia_imex_full_version}" + "1:#{nvidia_driver_major_version}_#{nvidia_imex_full_version}" + # The single digit "1" is epoch version. Without the "1", package install fails because version does not exist. 
+ # See details here: + # https://askubuntu.com/questions/441879/why-do-some-packages-have-extra-numbers-before-a-colon-on-the-front-of-their-ver end From 876fccbb8b63bafd7eebf3a58fd8b6f7b459a425 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 29 Jul 2025 12:59:07 -0400 Subject: [PATCH 23/38] [NVIDIA-IMEX] Add Version and package name for debian installation --- .../nvidia_imex/nvidia_imex_alinux2023.rb | 1 + .../nvidia_imex/nvidia_imex_amazon2.rb | 1 + .../nvidia_imex/nvidia_imex_redhat8.rb | 1 + .../nvidia_imex/nvidia_imex_rocky8.rb | 1 + .../nvidia_imex/nvidia_imex_ubuntu22+.rb | 6 ++--- .../partial/_nvidia_imex_common.rb | 11 +++++---- .../partial/_nvidia_imex_debian.rb | 22 ++++++++++++++++++ .../nvidia_imex/partial/_nvidia_imex_rhel.rb | 20 ++++++++++++++++ .../spec/unit/resources/nvidia_imex_spec.rb | 23 +++++++++++++++---- 9 files changed, 72 insertions(+), 14 deletions(-) create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb index be838b6460..4b22671903 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb @@ -17,6 +17,7 @@ end use 'partial/_nvidia_imex_common.rb' +use 'partial/_nvidia_imex_rhel.rb' def _nvidia_imex_version "#{nvidia_driver_major_version}-#{nvidia_imex_full_version}" diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb index f639dd50c4..c807366837 100644 --- 
a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb @@ -15,6 +15,7 @@ provides :nvidia_imex, platform: 'amazon', platform_version: '2' use 'partial/_nvidia_imex_common.rb' +use 'partial/_nvidia_imex_rhel.rb' def imex_installed # We do not install NVIDIA-Imex for Alinux2 due to restriction on NVIDIA driver diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb index f0fb13c7e2..0832631fe5 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb @@ -17,6 +17,7 @@ end use 'partial/_nvidia_imex_common.rb' +use 'partial/_nvidia_imex_rhel.rb' def _nvidia_imex_version "#{nvidia_driver_major_version}-#{nvidia_imex_full_version}" diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb index 8c60db55e2..287135268a 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb @@ -17,6 +17,7 @@ end use 'partial/_nvidia_imex_common.rb' +use 'partial/_nvidia_imex_rhel.rb' def _nvidia_imex_version "#{nvidia_driver_major_version}-#{nvidia_imex_full_version}" diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb index 4e99b8b1d5..b391d776c1 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb @@ 
-17,10 +17,8 @@ end use 'partial/_nvidia_imex_common.rb' +use 'partial/_nvidia_imex_debian.rb' def _nvidia_imex_version - "1:#{nvidia_driver_major_version}_#{nvidia_imex_full_version}" - # The single digit "1" is epoch version. Without the "1", package install fails because version does not exist. - # See details here: - # https://askubuntu.com/questions/441879/why-do-some-packages-have-extra-numbers-before-a-colon-on-the-front-of-their-ver + "#{nvidia_driver_major_version}_#{nvidia_imex_full_version}" end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index 316fca8be7..02c3120c78 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true # -# Copyright:: 2013-2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). # You may not use this file except in compliance with the License. 
@@ -48,14 +48,15 @@ action :create end - install_packages 'Install nvidia-imex' do - packages "#{nvidia_imex_service}-#{_nvidia_imex_version}" - action :install - end + action_install_imex # Save Imex version in Node Attributes for InSpec Tests node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version node.default['cluster']['nvidia']['imex']['package'] = "#{nvidia_imex_service}-#{nvidia_driver_major_version}" node_attributes 'dump node attributes' + + nvidia_repo 'remove nvidia repository' do + action :remove + end end action :configure do diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb new file mode 100644 index 0000000000..1deebf2e8d --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb @@ -0,0 +1,22 @@ +# frozen_string_literal: true +# +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +action :install_imex do + apt_package "Install nvidia-imex" do + package_name "#{nvidia_imex_service}-#{nvidia_driver_major_version}" + version nvidia_imex_full_version + retries 10 + retry_delay 5 + end +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb new file mode 100644 index 0000000000..8ff025a315 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true +# +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +action :install_imex do + install_packages 'Install nvidia-imex' do + packages "#{nvidia_imex_service}-#{_nvidia_imex_version}" + action :install + end +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index 3cea658347..ca6bfe229a 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -178,11 +178,12 @@ def self.configure(chef_run) runner(platform: platform, version: version, step_into: ['nvidia_imex']) end cached(:nvidia_imex_version) { "1.2.3-1" } + cached(:nvidia_imex_package) { "nvidia-imex-1" } cached(:nvidia_imex_name) do if %(redhat rocky).include?(platform) || platform == 'amazon' && version == '2023' - "nvidia-imex-1-#{nvidia_imex_version}" + "#{nvidia_imex_package}-#{nvidia_imex_version}" else - "nvidia-imex-1_#{nvidia_imex_version}" + "#{nvidia_imex_package}_#{nvidia_imex_version}" end end cached(:node) { chef_run.node } @@ -219,7 +220,9 @@ def self.configure(chef_run) end it 'does not set nvidia-imex version' do expect(node.default['cluster']['nvidia']['imex']['version']).not_to eq(nvidia_imex_version) + expect(node.default['cluster']['nvidia']['imex']['package']).not_to eq(nvidia_imex_package) is_expected.not_to write_node_attributes('dump node attributes') + is_expected.not_to remove_nvidia_repo('remove nvidia repository') end else it 'installs nvidia-imex' do @@ -241,13 +244,23 @@ def self.configure(chef_run) .with(user: 'root') .with(group: 'root') .with(mode: '0644') - is_expected.to install_install_packages('Install nvidia-imex') - .with(packages: "#{nvidia_imex_name}") - .with(action: %i(install)) + if platform == 'ubuntu' + is_expected.to install_apt_package('Install nvidia-imex') + .with(package_name: nvidia_imex_package) + .with(version: nvidia_imex_version) + .with(retries: 10) + 
.with(retry_delay: 5) + else + is_expected.to install_install_packages('Install nvidia-imex') + .with(packages: nvidia_imex_name) + .with(action: %i(install)) + end end it 'sets nvidia-imex version' do expect(node.default['cluster']['nvidia']['imex']['version']).to eq(nvidia_imex_version) + expect(node.default['cluster']['nvidia']['imex']['package']).to eq(nvidia_imex_package) is_expected.to write_node_attributes('dump node attributes') + is_expected.to remove_nvidia_repo('remove nvidia repository') end end end From caa3c91e97695a01f9e5292f8e64c96eda29373f Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 29 Jul 2025 15:30:23 -0400 Subject: [PATCH 24/38] [NVIDIA-IMEX] Add changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a6f66dc3b..eef76b57c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Addressed cluster id mismatch known issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting. - Upgrade DCV to version 2024.0-19030. - Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management. +- Install nvidia-imex for all OSs except AL2. **BUG FIXES** - Fix a race condition in CloudWatch Agent startup that could cause nodes bootstrap failures. 
From c78a8d4cc5953e428f998fafa59ca6d43a84eb73 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 29 Jul 2025 18:10:15 -0400 Subject: [PATCH 25/38] Add unit test for checking configuration of nvidia-imex --- .../spec/unit/recipes/nvidia_spec.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/nvidia_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/nvidia_spec.rb index d27afb1df4..718e5d2f5d 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/nvidia_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/nvidia_spec.rb @@ -15,6 +15,10 @@ is_expected.to configure_gdrcopy('Configure gdrcopy') end + it 'configures nvidia-imex' do + is_expected.to configure_nvidia_imex('Configure nvidia-imex') + end + it 'loads nvidia-uvm kernel module' do is_expected.to load_kernel_module('nvidia-uvm') end From 9bed1c0705506d4b65ce6a82ad994fcaec22a6c9 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 29 Jul 2025 22:24:31 -0400 Subject: [PATCH 26/38] [Nvidia-Imex] Use nvidia-imex shared directory for Inspec and configuration files --- .../attributes/platform.rb | 3 ++ .../partial/_nvidia_imex_common.rb | 6 +-- .../spec/unit/resources/nvidia_imex_spec.rb | 5 +-- .../nvidia-imex/nvidia-imex-config.erb | 2 +- .../nvidia-imex/nvidia-imex.service.erb | 2 +- .../test/controls/nvidia_imex_spec.rb | 41 ++++++++++++------- 6 files changed, 37 insertions(+), 22 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index 9b6e3e06be..a673f05059 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -22,6 +22,9 @@ default['cluster']['nvidia']['driver_version'] = '550.127.08' end +# Nvidia-imex +default['cluster']['nvidia']['imex']['shared_dir'] = 
"#{node['cluster']['shared_dir']}/nvidia-imex" + # DCV default['cluster']['dcv']['authenticator']['user'] = "dcvextauth" default['cluster']['dcv']['authenticator']['user_id'] = node['cluster']['reserved_base_uid'] + 3 diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index 02c3120c78..bf25cb513e 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -24,16 +24,16 @@ action :add end - directory "#{node['cluster']['shared_dir']}/#{nvidia_imex_service}" + directory node['cluster']['nvidia']['imex']['shared_dir'] - template "#{node['cluster']['shared_dir']}/#{nvidia_imex_service}/config.cfg" do + template "#{node['cluster']['nvidia']['imex']['shared_dir']}/config.cfg" do source 'nvidia-imex/nvidia-imex-config.erb' owner 'root' group 'root' mode '0755' end - template "#{node['cluster']['shared_dir']}/#{nvidia_imex_service}/nodes_config.cfg" do + template "#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config.cfg" do source 'nvidia-imex/nvidia-imex-nodes.erb' owner 'root' group 'root' diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index ca6bfe229a..2ae180a8f6 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -1,8 +1,7 @@ require 'spec_helper' -shared_dir = "SHARED_DIR" nvidia_version = "1.2.3" -nvidia_imex_shared_dir = "#{shared_dir}/nvidia-imex" +nvidia_imex_shared_dir = "SHARED_DIR/nvidia-imex" imex_binary = '/usr/bin/nvidia-imex' imex_ctl_binary = '/usr/bin/nvidia-imex-ctl' @@ -189,7 +188,7 @@ def 
self.configure(chef_run) cached(:node) { chef_run.node } before do - chef_run.node.override['cluster']['shared_dir'] = shared_dir + chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir chef_run.node.override['cluster']['region'] = 'aws_region' chef_run.node.override['cluster']['nvidia']['driver_version'] = nvidia_version ConvergeNvidiaImex.install(chef_run) diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb index 1059e94af2..4a144ad8f6 100644 --- a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb +++ b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb @@ -86,7 +86,7 @@ SERVER_PORT=50000 # Possible Values: # Full path/filename string (max length of 256). # Default Value: /etc/nvidia-imex/nodes_config.cfg -IMEX_NODE_CONFIG_FILE=<%= node['cluster']['shared_dir'] %>/nodes_config.cfg +IMEX_NODE_CONFIG_FILE=<%= node['cluster']['nvidia']['imex']['shared_dir'] %>/nodes_config.cfg # Description: Name of the network interface used for communication. 
# OPTIONAL - If empty, network interface will be determined by matching bind IP to diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb index 94d2687a21..f0f731e48d 100644 --- a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb +++ b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb @@ -10,7 +10,7 @@ PrivateTmp=false Type=forking TimeoutStartSec=infinity -ExecStart=/usr/bin/nvidia-imex -c <%= node['cluster']['shared_dir'] %>/nvidia-imex/config.cfg +ExecStart=/usr/bin/nvidia-imex -c <%= node['cluster']['nvidia']['imex']['shared_dir'] %>/config.cfg LimitCORE=infinity diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb index 6a82a4b6e8..b075baeba6 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb @@ -13,12 +13,6 @@ only_if { ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !os_properties.alinux2? 
} nvidia_imex_service = 'nvidia-imex' - - describe package("#{node['cluster']['nvidia']['imex']['package']}") do - it { should be_installed } - its('version') { should match /#{node['cluster']['nvidia']['imex']['version']}/ } - end - ["/usr/bin/#{nvidia_imex_service}", "/usr/bin/#{nvidia_imex_service}-ctl"].each do |path| describe file(path) do it { should exist } @@ -28,15 +22,34 @@ end end - nvidia_imex_dir = "#{node['cluster']['shared_dir']}/#{nvidia_imex_service}" + nvidia_imex_dir = "#{node['cluster']['nvidia']['imex']['shared_dir']}" - ["#{nvidia_imex_dir}/config.cfg", "#{nvidia_imex_dir}/nodes_config.cfg"].each do |conf_files| - describe file(conf_files) do - it { should exist } - its('owner') { should eq 'root' } - its('group') { should eq 'root' } - its('mode') { should cmp '0755' } - end + describe file("#{nvidia_imex_dir}/config.cfg") do + it { should exist } + its('owner') { should eq 'root' } + its('group') { should eq 'root' } + its('mode') { should cmp '0755' } + its('content') { should match %r{/IMEX_NODE_CONFIG_FILE=#{nvidia_imex_dir}/nodes_config.cfg/} } + end + + describe file("#{nvidia_imex_dir}/nodes_config.cfg") do + it { should exist } + its('owner') { should eq 'root' } + its('group') { should eq 'root' } + its('mode') { should cmp '0755' } + end + + describe file("/etc/systemd/system/#{nvidia_imex_service}.service") do + it { should exist } + its('owner') { should eq 'root' } + its('group') { should eq 'root' } + its('mode') { should cmp '0644' } + its('content') { should match %r{ExecStart=/usr/bin/nvidia-imex -c #{nvidia_imex_dir}/config.cfg} } + end + + describe package("#{node['cluster']['nvidia']['imex']['package']}") do + it { should be_installed } + its('version') { should match /#{node['cluster']['nvidia']['imex']['version']}/ } end end From 8efff28c50fed0af4eebf2be374d09d390dbc63d Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 30 Jul 2025 12:47:02 -0400 Subject: [PATCH 27/38] [Nvidia-Imex] Update copyright year --- 
.../resources/nvidia_imex/nvidia_imex_alinux2023.rb | 2 +- .../resources/nvidia_imex/nvidia_imex_amazon2.rb | 2 +- .../resources/nvidia_imex/nvidia_imex_redhat8.rb | 2 +- .../resources/nvidia_imex/nvidia_imex_rocky8.rb | 2 +- .../resources/nvidia_imex/nvidia_imex_ubuntu22+.rb | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb index 4b22671903..da5ee50752 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). # You may not use this file except in compliance with the License. diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb index c807366837..bffd2415a6 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). # You may not use this file except in compliance with the License. 
diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb index 0832631fe5..6fb8ad1f68 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). # You may not use this file except in compliance with the License. diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb index 287135268a..019eafd2dc 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). # You may not use this file except in compliance with the License. diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb index b391d776c1..d80f923c44 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). # You may not use this file except in compliance with the License. From 2efe7bdb95e3c9f7072c97dbd9cf88ab8cea2813 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 30 Jul 2025 12:54:17 -0400 Subject: [PATCH 28/38] [Nvidia-Imex] Adding correct comments --- cookbooks/aws-parallelcluster-platform/attributes/platform.rb | 2 +- cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb | 2 -- .../recipes/install/nvidia_install.rb | 2 +- .../resources/fabric_manager/partial/_fabric_manager_common.rb | 2 ++ .../templates/nvidia-imex/nvidia-imex-nodes.erb | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index a673f05059..1fc2c2e12a 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -22,7 +22,7 @@ default['cluster']['nvidia']['driver_version'] = '550.127.08' end -# Nvidia-imex +# nvidia-imex default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex" # DCV diff --git a/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb b/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb index 3e9e59873e..62d52e6eaa 100644 --- a/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb +++ b/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb @@ -32,7 +32,5 @@ def get_device_ids # NVSwitch device id is 10de:22a3 for P5 instance # NVSwitch device id is 10de:2901 for P6 instance # NVSwitch device id is 10de:2941 for P6e instance - # We sum the count for all these deviceIds as output of lscpi command will be >0 - # for only one device ID based on the instance type { 'a100' => '10de:1af1', 'h100' => '10de:22a3', 'b200' => '10de:2901', 
'gb200' => '10de:2941' } end diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb index b7ab7c4d52..04823b28f2 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb @@ -25,4 +25,4 @@ nvidia_dcgm 'install Nvidia datacenter-gpu-manager' -nvidia_imex 'Install Nvidia-imex' +nvidia_imex 'Install nvidia-imex' diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb index 4b6c74d0ea..f078ec9d6d 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb @@ -54,6 +54,8 @@ def _nvidia_driver_version # Get number of nv switches def get_nvswitches + # We sum the count for all these deviceIds as output of lscpi command will be >0 + # for only one device ID based on the instance type nvswitch_device_ids = get_device_ids.values nvswitch_device_ids.sum { |id| get_nvswitch_count(id) } end diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-nodes.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-nodes.erb index 22a1737bcf..d48070b80f 100644 --- a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-nodes.erb +++ b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-nodes.erb @@ -1,3 +1,3 @@ -## Please replace below fake IP's +## Please replace below fake IP's with correct IP address of launched instances in Gb200 Capacity Block 172.31.51.93 172.31.48.43 \ No newline at end of file From 95e12280f86ccf2cc49dd6c2e79141ff378a1fa3 Mon Sep 17 00:00:00 2001 From: 
Himani Anil Deshpande Date: Wed, 30 Jul 2025 13:01:36 -0400 Subject: [PATCH 29/38] [Nvidia-Imex] Updating function names --- .../nvidia_imex/nvidia_imex_amazon2.rb | 2 +- .../partial/_nvidia_imex_common.rb | 6 +++--- .../spec/unit/resources/nvidia_imex_spec.rb | 20 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb index bffd2415a6..5f0c765bb7 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb @@ -17,7 +17,7 @@ use 'partial/_nvidia_imex_common.rb' use 'partial/_nvidia_imex_rhel.rb' -def imex_installed +def imex_installed? # We do not install NVIDIA-Imex for Alinux2 due to restriction on NVIDIA driver true end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index bf25cb513e..d1fe19efb2 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -17,7 +17,7 @@ action :install do return unless nvidia_enabled_or_installed? - return if on_docker? || imex_installed || aws_region.start_with?("us-iso") + return if on_docker? || imex_installed? || aws_region.start_with?("us-iso") # Add NVIDIA repo for nvidia-imex nvidia_repo 'add nvidia repository' do @@ -60,7 +60,7 @@ end action :configure do - return unless imex_installed && node['cluster']['node_type'] == "ComputeFleet" + return unless imex_installed? 
&& node['cluster']['node_type'] == "ComputeFleet" # Start nvidia-imex on p6e-gb200 and only on ComputeFleet if get_nvswitch_count(get_device_ids['gb200']) > 1 service nvidia_imex_service do @@ -82,7 +82,7 @@ def nvidia_imex_full_version "#{node['cluster']['nvidia']['driver_version']}-1" end -def imex_installed +def imex_installed? ::File.exist?("/usr/bin/#{nvidia_imex_service}") || ::File.exist?("/usr/bin/#{nvidia_imex_service}-ctl") end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index 2ae180a8f6..e091cec519 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -70,7 +70,7 @@ def self.configure(chef_run) end end -describe 'nvidia_imex:imex_installed' do +describe 'nvidia_imex:imex_installed?' do for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:chef_run) do @@ -89,11 +89,11 @@ def self.configure(chef_run) if platform == 'amazon' && version == '2' it 'is true' do - expect(resource.imex_installed).to eq(true) + expect(resource.imex_installed?).to eq(true) end else it 'is false' do - expect(resource.imex_installed).to eq(false) + expect(resource.imex_installed?).to eq(false) end end end @@ -105,7 +105,7 @@ def self.configure(chef_run) end it 'is true' do - expect(resource.imex_installed).to eq(true) + expect(resource.imex_installed?).to eq(true) end end @@ -116,7 +116,7 @@ def self.configure(chef_run) end it 'is true' do - expect(resource.imex_installed).to eq(true) + expect(resource.imex_installed?).to eq(true) end end @@ -127,7 +127,7 @@ def self.configure(chef_run) end it 'is true' do - expect(resource.imex_installed).to eq(true) + expect(resource.imex_installed?).to eq(true) end end end @@ -155,7 +155,7 @@ def self.configure(chef_run) context 'when nvidia-imex binary already 
exists' do cached(:chef_run) do stubs_for_resource('nvidia_imex') do |res| - allow(res).to receive(:imex_installed).and_return(true) + allow(res).to receive(:imex_installed?).and_return(true) end runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) ConvergeNvidiaImex.install(runner) @@ -273,7 +273,7 @@ def self.configure(chef_run) context "when nvidia-imex binary is not installed" do cached(:chef_run) do stubs_for_resource('nvidia_imex') do |res| - allow(res).to receive(:imex_installed).and_return(false) + allow(res).to receive(:imex_installed?).and_return(false) end runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) ConvergeNvidiaImex.configure(runner) @@ -289,7 +289,7 @@ def self.configure(chef_run) context "when get_nvswitch_count > 1 on #{node_type} node" do cached(:chef_run) do stubs_for_provider('nvidia_imex[configure]') do |pro| - allow(pro).to receive(:imex_installed).and_return(true) + allow(pro).to receive(:imex_installed?).and_return(true) allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4) end @@ -318,7 +318,7 @@ def self.configure(chef_run) context "when get_nvswitch_count <= 1" do cached(:chef_run) do stubs_for_provider('nvidia_imex[configure]') do |pro| - allow(pro).to receive(:imex_installed).and_return(true) + allow(pro).to receive(:imex_installed?).and_return(true) allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1) end From 329166a8dceb988a3ae8386be19198914528953e Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 30 Jul 2025 13:09:26 -0400 Subject: [PATCH 30/38] [Nvidia-Imex] Remove _nvidia_imex_version as it is not needed --- .../resources/nvidia_imex/nvidia_imex_alinux2023.rb | 4 ---- .../resources/nvidia_imex/nvidia_imex_redhat8.rb | 4 ---- .../resources/nvidia_imex/nvidia_imex_rocky8.rb | 4 
---- .../resources/nvidia_imex/nvidia_imex_ubuntu22+.rb | 4 ---- .../resources/nvidia_imex/partial/_nvidia_imex_rhel.rb | 2 +- 5 files changed, 1 insertion(+), 17 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb index da5ee50752..4d5f803363 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb @@ -18,7 +18,3 @@ use 'partial/_nvidia_imex_common.rb' use 'partial/_nvidia_imex_rhel.rb' - -def _nvidia_imex_version - "#{nvidia_driver_major_version}-#{nvidia_imex_full_version}" -end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb index 6fb8ad1f68..704fa7768f 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb @@ -18,7 +18,3 @@ use 'partial/_nvidia_imex_common.rb' use 'partial/_nvidia_imex_rhel.rb' - -def _nvidia_imex_version - "#{nvidia_driver_major_version}-#{nvidia_imex_full_version}" -end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb index 019eafd2dc..e28231d121 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb @@ -18,7 +18,3 @@ use 'partial/_nvidia_imex_common.rb' use 'partial/_nvidia_imex_rhel.rb' - -def _nvidia_imex_version - "#{nvidia_driver_major_version}-#{nvidia_imex_full_version}" -end diff --git 
a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb index d80f923c44..7a9a455d82 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb @@ -18,7 +18,3 @@ use 'partial/_nvidia_imex_common.rb' use 'partial/_nvidia_imex_debian.rb' - -def _nvidia_imex_version - "#{nvidia_driver_major_version}_#{nvidia_imex_full_version}" -end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb index 8ff025a315..3cebe94ad8 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb @@ -14,7 +14,7 @@ action :install_imex do install_packages 'Install nvidia-imex' do - packages "#{nvidia_imex_service}-#{_nvidia_imex_version}" + packages "#{nvidia_imex_service}-#{nvidia_driver_major_version}-#{nvidia_imex_full_version}" action :install end end From abc3f1f3d8a5bb78f0f61afc143bb8f9d76f844b Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 30 Jul 2025 13:16:53 -0400 Subject: [PATCH 31/38] [Nvidia-Imex] Update action sequence for service --- .../resources/nvidia_imex/partial/_nvidia_imex_common.rb | 2 +- .../spec/unit/resources/nvidia_imex_spec.rb | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index d1fe19efb2..af6fabadda 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ 
b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -64,7 +64,7 @@ # Start nvidia-imex on p6e-gb200 and only on ComputeFleet if get_nvswitch_count(get_device_ids['gb200']) > 1 service nvidia_imex_service do - action %i(start enable) + action %i(enable start) supports status: true end end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index e091cec519..97724b05e5 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -305,11 +305,11 @@ def self.configure(chef_run) if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type) it 'does not configure nvidia-imex' do - is_expected.not_to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true }) + is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) end else it 'it starts nvidia-imex service' do - is_expected.to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true }) + is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) end end end @@ -332,7 +332,7 @@ def self.configure(chef_run) end it 'does not configure nvidia-imex' do - is_expected.not_to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true }) + is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) end end end From 9f711b82b1efb76ebd2475210f1c74e8663639bb Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 30 Jul 2025 20:17:21 -0400 Subject: [PATCH 32/38] [NVIDIA-IMEX] Comment the official docs for nvidia-imex service file --- 
.../templates/nvidia-imex/nvidia-imex.service.erb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb index f0f731e48d..3d3a577a3a 100644 --- a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb +++ b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb @@ -1,10 +1,12 @@ +# This file is created by ParallelCluster by following default settings +# as given by official NVIDIA docs https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/gettingstarted.html#on-linux-based-systems + [Unit] Description=NVIDIA IMEX service After=network-online.target Requires=network-online.target [Service] -Environment="KRB5_CLIENT_KTNAME=/etc/krb5.keytab" User=root PrivateTmp=false Type=forking From f68542ecbf19189697632b090fb882686910b8fd Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 30 Jul 2025 20:32:15 -0400 Subject: [PATCH 33/38] [NVIDIA-IMEX] Using common naming convention for package name --- .../resources/nvidia_imex/partial/_nvidia_imex_common.rb | 6 +++++- .../resources/nvidia_imex/partial/_nvidia_imex_debian.rb | 2 +- .../resources/nvidia_imex/partial/_nvidia_imex_rhel.rb | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index af6fabadda..00e7cbb807 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -51,7 +51,7 @@ action_install_imex # Save Imex version in Node Attributes for InSpec Tests node.default['cluster']['nvidia']['imex']['version'] = 
nvidia_imex_full_version - node.default['cluster']['nvidia']['imex']['package'] = "#{nvidia_imex_service}-#{nvidia_driver_major_version}" + node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package node_attributes 'dump node attributes' nvidia_repo 'remove nvidia repository' do @@ -70,6 +70,10 @@ end end +def nvidia_imex_package + "#{nvidia_imex_service}-#{nvidia_driver_major_version}" +end + def nvidia_driver_major_version node['cluster']['nvidia']['driver_version'].split('.')[0] end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb index 1deebf2e8d..1d07af83d8 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb @@ -14,7 +14,7 @@ action :install_imex do apt_package "Install nvidia-imex" do - package_name "#{nvidia_imex_service}-#{nvidia_driver_major_version}" + package_name nvidia_imex_package version nvidia_imex_full_version retries 10 retry_delay 5 diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb index 3cebe94ad8..662120bcca 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb @@ -14,7 +14,7 @@ action :install_imex do install_packages 'Install nvidia-imex' do - packages "#{nvidia_imex_service}-#{nvidia_driver_major_version}-#{nvidia_imex_full_version}" + packages "#{nvidia_imex_package}-#{nvidia_imex_full_version}" action :install end end From f9a9aed4f80695db87ebe2853032df58c4076a6c Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 30 Jul 2025 
20:40:21 -0400 Subject: [PATCH 34/38] [NVIDIA-IMEX] Correcting kitchen test --- .../test/controls/nvidia_imex_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb index b075baeba6..97754cfcc5 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb @@ -29,7 +29,7 @@ its('owner') { should eq 'root' } its('group') { should eq 'root' } its('mode') { should cmp '0755' } - its('content') { should match %r{/IMEX_NODE_CONFIG_FILE=#{nvidia_imex_dir}/nodes_config.cfg/} } + its('content') { should match %r{IMEX_NODE_CONFIG_FILE=#{nvidia_imex_dir}/nodes_config.cfg} } end describe file("#{nvidia_imex_dir}/nodes_config.cfg") do From d43d64084ddbf6edd18ebe7a4e762d687561d7e6 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 1 Aug 2025 16:36:03 -0400 Subject: [PATCH 35/38] [NVIDIA_IMEX] Install nvidia-imex from s3 --- .../partial/_nvidia_imex_debian.rb | 28 ++++++++++++++--- .../nvidia_imex/partial/_nvidia_imex_rhel.rb | 30 +++++++++++++++++-- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb index 1d07af83d8..7f163e704e 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb @@ -13,10 +13,30 @@ # See the License for the specific language governing permissions and limitations under the License. 
action :install_imex do - apt_package "Install nvidia-imex" do - package_name nvidia_imex_package - version nvidia_imex_full_version - retries 10 + remote_file "#{node['cluster']['sources_dir']}/#{nvidia_imex_package}-#{nvidia_imex_full_version}.deb" do + source "#{nvidia_imex_url}" + mode '0644' + retries 3 retry_delay 5 + action :create_if_missing end + + bash "Install nvidia-imex" do + user 'root' + cwd node['cluster']['sources_dir'] + code <<-NVIDIA_IMEX + set -e + dpkg -i #{nvidia_imex_package}-#{nvidia_imex_full_version}.deb && apt-mark hold #{nvidia_imex_package} + NVIDIA_IMEX + retries 3 + retry_delay 5 + end +end + +def nvidia_imex_url + "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_imex/#{platform}/#{nvidia_imex_package}_#{nvidia_imex_full_version}_#{arch_suffix}.deb" +end + +def arch_suffix + arm_instance? ? 'arm64' : 'amd64' end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb index 662120bcca..aacd951cd5 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb @@ -13,8 +13,32 @@ # See the License for the specific language governing permissions and limitations under the License. 
action :install_imex do - install_packages 'Install nvidia-imex' do - packages "#{nvidia_imex_package}-#{nvidia_imex_full_version}" - action :install + remote_file "#{node['cluster']['sources_dir']}/#{nvidia_imex_package}-#{nvidia_imex_full_version}.rpm" do + source "#{nvidia_imex_url}" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing end + + package 'yum-plugin-versionlock' + bash "Install nvidia-imex" do + user 'root' + cwd node['cluster']['sources_dir'] + code <<-NVIDIA_IMEX + set -e + yum install -y #{nvidia_imex_package}-#{nvidia_imex_full_version}.rpm + yum versionlock #{nvidia_imex_package} + NVIDIA_IMEX + retries 3 + retry_delay 5 + end +end + +def arch_suffix + arm_instance? ? 'aarch64' : 'x86_64' +end + +def nvidia_imex_url + "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_fabric/#{platform}/#{nvidia_imex_package}-#{nvidia_imex_full_version}.#{arch_suffix}.rpm" end From b6544a2fd1a71ed7cc2fd68402181544bd8ef99e Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 1 Aug 2025 17:37:58 -0400 Subject: [PATCH 36/38] [NVIDIA_IMEX] Install nvidia-imex from s3 --- .../nvidia_imex/nvidia_imex_alinux2023.rb | 4 + .../nvidia_imex/nvidia_imex_redhat8.rb | 4 + .../nvidia_imex/nvidia_imex_rocky8.rb | 4 + .../nvidia_imex/nvidia_imex_ubuntu22+.rb | 4 + .../partial/_nvidia_imex_common.rb | 73 +++++++++++-------- .../nvidia_imex/partial/_nvidia_imex_rhel.rb | 2 +- .../nvidia-imex/nvidia-imex-config.erb | 2 +- .../nvidia-imex/nvidia-imex.service.erb | 2 +- 8 files changed, 61 insertions(+), 34 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb index 4d5f803363..0e3b1bb0de 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb @@ -18,3 +18,7 @@ use 
'partial/_nvidia_imex_common.rb' use 'partial/_nvidia_imex_rhel.rb' + +def platform + "amzn#{node['platform_version'].to_i}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb index 704fa7768f..2cca43251f 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb @@ -18,3 +18,7 @@ use 'partial/_nvidia_imex_common.rb' use 'partial/_nvidia_imex_rhel.rb' + +def platform + "rhel#{node['platform_version'].to_i}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb index e28231d121..8957e080c6 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb @@ -18,3 +18,7 @@ use 'partial/_nvidia_imex_common.rb' use 'partial/_nvidia_imex_rhel.rb' + +def platform + "rhel#{node['platform_version'].to_i}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb index 7a9a455d82..5472947de1 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb @@ -18,3 +18,7 @@ use 'partial/_nvidia_imex_common.rb' use 'partial/_nvidia_imex_debian.rb' + +def platform + "ubuntu#{node['platform_version'].delete('.')}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb 
index 00e7cbb807..e74b83c3b9 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -19,50 +19,49 @@ return unless nvidia_enabled_or_installed? return if on_docker? || imex_installed? || aws_region.start_with?("us-iso") - # Add NVIDIA repo for nvidia-imex - nvidia_repo 'add nvidia repository' do - action :add - end - directory node['cluster']['nvidia']['imex']['shared_dir'] - template "#{node['cluster']['nvidia']['imex']['shared_dir']}/config.cfg" do - source 'nvidia-imex/nvidia-imex-config.erb' - owner 'root' - group 'root' - mode '0755' - end - - template "#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config.cfg" do - source 'nvidia-imex/nvidia-imex-nodes.erb' - owner 'root' - group 'root' - mode '0755' - end - - template "/etc/systemd/system/#{nvidia_imex_service}.service" do - source 'nvidia-imex/nvidia-imex.service.erb' - owner 'root' - group 'root' - mode '0644' - action :create - end - action_install_imex # Save Imex version in Node Attributes for InSpec Tests node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package node_attributes 'dump node attributes' - - nvidia_repo 'remove nvidia repository' do - action :remove - end end action :configure do return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet" # Start nvidia-imex on p6e-gb200 and only on ComputeFleet if get_nvswitch_count(get_device_ids['gb200']) > 1 + # For each Compute Resource, we generate a unique NVIDIA IMEX configuration file, + # if one doesn't already exist in a common, shared location. 
+ template nvidia_imex_nodes_conf_file do + source 'nvidia-imex/nvidia-imex-nodes.erb' + owner 'root' + group 'root' + mode '0755' + action :create + not_if { file_exists_and_cluster_update?(nvidia_imex_nodes_conf_file) } + end + + template nvidia_imex_main_conf_file do + source 'nvidia-imex/nvidia-imex-config.erb' + owner 'root' + group 'root' + mode '0755' + action :create + not_if { file_exists_and_cluster_update?(nvidia_imex_main_conf_file) } + variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file) + end + + template "/etc/systemd/system/#{nvidia_imex_service}.service" do + source 'nvidia-imex/nvidia-imex.service.erb' + owner 'root' + group 'root' + mode '0644' + action :create + variables(imex_main_config_file_path: nvidia_imex_main_conf_file) + end + service nvidia_imex_service do action %i(enable start) supports status: true @@ -93,3 +92,15 @@ def imex_installed? def nvidia_enabled_or_installed? nvidia_enabled? || nvidia_installed? end + +def file_exists_and_cluster_update?(file_path) + ::File.exist?(file_path) && !are_queues_updated? 
+end + +def nvidia_imex_main_conf_file + "#{node['cluster']['nvidia']['imex']['shared_dir']}/config_#{node['cluster']['launch_template_id']}.cfg" +end + +def nvidia_imex_nodes_conf_file + "#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['launch_template_id']}.cfg" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb index aacd951cd5..d48be7aad7 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb @@ -40,5 +40,5 @@ def arch_suffix end def nvidia_imex_url - "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_fabric/#{platform}/#{nvidia_imex_package}-#{nvidia_imex_full_version}.#{arch_suffix}.rpm" + "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_imex/#{platform}/#{nvidia_imex_package}-#{nvidia_imex_full_version}.#{arch_suffix}.rpm" end diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb index 4a144ad8f6..bc34b08091 100644 --- a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb +++ b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb @@ -86,7 +86,7 @@ SERVER_PORT=50000 # Possible Values: # Full path/filename string (max length of 256). # Default Value: /etc/nvidia-imex/nodes_config.cfg -IMEX_NODE_CONFIG_FILE=<%= node['cluster']['nvidia']['imex']['shared_dir'] %>/nodes_config.cfg +IMEX_NODE_CONFIG_FILE=<%= @imex_nodes_config_file_path %> # Description: Name of the network interface used for communication. 
# OPTIONAL - If empty, network interface will be determined by matching bind IP to diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb index 3d3a577a3a..fbead02aa1 100644 --- a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb +++ b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb @@ -12,7 +12,7 @@ PrivateTmp=false Type=forking TimeoutStartSec=infinity -ExecStart=/usr/bin/nvidia-imex -c <%= node['cluster']['nvidia']['imex']['shared_dir'] %>/config.cfg +ExecStart=/usr/bin/nvidia-imex -c <%= @imex_main_config_file_path %> LimitCORE=infinity From 1251c232c246eea209dcc26353ce6120c579e413 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 1 Aug 2025 23:09:33 -0400 Subject: [PATCH 37/38] [NVIDIA_IMEX] Update unit tests --- .../spec/unit/resources/nvidia_imex_spec.rb | 216 +++++++++++------- .../test/controls/nvidia_imex_spec.rb | 33 +-- 2 files changed, 137 insertions(+), 112 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index 97724b05e5..8608b88a9e 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -1,9 +1,12 @@ require 'spec_helper' nvidia_version = "1.2.3" +SOURCE_DIR = 'SOURCE_DIR'.freeze nvidia_imex_shared_dir = "SHARED_DIR/nvidia-imex" imex_binary = '/usr/bin/nvidia-imex' imex_ctl_binary = '/usr/bin/nvidia-imex-ctl' +launch_template_id = 'lt-123456789012' +cluster_artifacts_s3_url = 'https://aws_region-aws-parallelcluster.s3.aws_region.AWS_DOMAIN' class ConvergeNvidiaImex def self.install(chef_run) @@ -146,7 +149,6 @@ def self.configure(chef_run) ConvergeNvidiaImex.install(runner) end 
cached(:node) { chef_run.node } - it 'does not install nvidia-imex' do is_expected.not_to install_package('nvidia-imex') end @@ -167,99 +169,103 @@ def self.configure(chef_run) end end - context 'when nvidia is enabled' do - cached(:chef_run) do - stubs_for_resource('nvidia_imex') do |res| - allow(res).to receive(:nvidia_enabled_or_installed?).and_return(true) - allow(File).to receive(:exist?).with(imex_ctl_binary).and_return(false) - allow(File).to receive(:exist?).with(imex_binary).and_return(false) + %w(aarch64 x86_64).each do |arm_or_x86| + context "when nvidia is enabled on #{arm_or_x86}" do + cached(:nvidia_imex_version) { "1.2.3-1" } + cached(:nvidia_imex_package) { "nvidia-imex-1" } + cached(:nvidia_imex_name) do + if %(redhat rocky).include?(platform) || platform == 'amazon' && version == '2023' + "#{nvidia_imex_package}-#{nvidia_imex_version}" + else + "#{nvidia_imex_package}_#{nvidia_imex_version}" + end end - runner(platform: platform, version: version, step_into: ['nvidia_imex']) - end - cached(:nvidia_imex_version) { "1.2.3-1" } - cached(:nvidia_imex_package) { "nvidia-imex-1" } - cached(:nvidia_imex_name) do - if %(redhat rocky).include?(platform) || platform == 'amazon' && version == '2023' - "#{nvidia_imex_package}-#{nvidia_imex_version}" - else - "#{nvidia_imex_package}_#{nvidia_imex_version}" + cached(:url_arch) do + if %(redhat rocky amazon).include?(platform) + arm_or_x86 + elsif platform == 'ubuntu' + arm_or_x86 == 'x86_64' ? 'amd64' : 'arm64' + else + arm_or_x86 == 'x86_64' ? 
'x86_64' : 'aarch64' + end + end + cached(:url_suffix) do + if %(redhat rocky).include?(platform) + "rhel#{version}/#{nvidia_imex_name}.#{url_arch}" + elsif platform == 'amazon' && version == '2023' + "amzn2023/#{nvidia_imex_name}.#{url_arch}" + else + "#{platform}#{version.delete('.')}/#{nvidia_imex_name}_#{url_arch}" + end end - end - cached(:node) { chef_run.node } - - before do - chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir - chef_run.node.override['cluster']['region'] = 'aws_region' - chef_run.node.override['cluster']['nvidia']['driver_version'] = nvidia_version - ConvergeNvidiaImex.install(chef_run) - end - if platform == 'amazon' && version == '2' - it 'does not install nvidia-imex' do - is_expected.not_to add_nvidia_repo('add nvidia repository') - is_expected.not_to create_directory(nvidia_imex_shared_dir) - is_expected.not_to create_template("#{nvidia_imex_shared_dir}/config.cfg") - .with(source: 'nvidia-imex/nvidia-imex-config.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - is_expected.not_to create_template("#{nvidia_imex_shared_dir}/nodes_config.cfg") - .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service") - .with(source: 'nvidia-imex/nvidia-imex.service.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0644') - is_expected.not_to install_install_packages('Install nvidia-imex') - .with(packages: "#{nvidia_imex_name}") - .with(action: %i(install)) + cached(:chef_run) do + stubs_for_resource('nvidia_imex') do |res| + allow(res).to receive(:nvidia_enabled_or_installed?).and_return(true) + allow(File).to receive(:exist?).with(imex_ctl_binary).and_return(false) + allow(File).to receive(:exist?).with(imex_binary).and_return(false) + end + runner(platform: platform, version: version, step_into: ['nvidia_imex']) end - it 'does not 
set nvidia-imex version' do - expect(node.default['cluster']['nvidia']['imex']['version']).not_to eq(nvidia_imex_version) - expect(node.default['cluster']['nvidia']['imex']['package']).not_to eq(nvidia_imex_package) - is_expected.not_to write_node_attributes('dump node attributes') - is_expected.not_to remove_nvidia_repo('remove nvidia repository') + cached(:node) { chef_run.node } + + before do + chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir + chef_run.node.override['cluster']['artifacts_s3_url'] = cluster_artifacts_s3_url + chef_run.node.override['cluster']['region'] = 'aws_region' + chef_run.node.override['cluster']['sources_dir'] = SOURCE_DIR + chef_run.node.automatic['kernel']['machine'] = arm_or_x86 + chef_run.node.override['cluster']['nvidia']['driver_version'] = nvidia_version + ConvergeNvidiaImex.install(chef_run) end - else - it 'installs nvidia-imex' do - is_expected.to add_nvidia_repo('add nvidia repository') - is_expected.to create_directory(nvidia_imex_shared_dir) - - is_expected.to create_template("#{nvidia_imex_shared_dir}/config.cfg") - .with(source: 'nvidia-imex/nvidia-imex-config.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - is_expected.to create_template("#{nvidia_imex_shared_dir}/nodes_config.cfg") - .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - is_expected.to create_template("/etc/systemd/system/nvidia-imex.service") - .with(source: 'nvidia-imex/nvidia-imex.service.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0644') - if platform == 'ubuntu' - is_expected.to install_apt_package('Install nvidia-imex') - .with(package_name: nvidia_imex_package) - .with(version: nvidia_imex_version) - .with(retries: 10) - .with(retry_delay: 5) - else - is_expected.to install_install_packages('Install nvidia-imex') - .with(packages: nvidia_imex_name) + if platform == 'amazon' && version == '2' + it 
'does not install nvidia-imex' do + is_expected.not_to create_directory(nvidia_imex_shared_dir) + is_expected.not_to install_install_packages('Install nvidia-imex') + .with(packages: "#{nvidia_imex_name}") .with(action: %i(install)) end - end - it 'sets nvidia-imex version' do - expect(node.default['cluster']['nvidia']['imex']['version']).to eq(nvidia_imex_version) - expect(node.default['cluster']['nvidia']['imex']['package']).to eq(nvidia_imex_package) - is_expected.to write_node_attributes('dump node attributes') - is_expected.to remove_nvidia_repo('remove nvidia repository') + it 'does not set nvidia-imex version' do + expect(node.default['cluster']['nvidia']['imex']['version']).not_to eq(nvidia_imex_version) + expect(node.default['cluster']['nvidia']['imex']['package']).not_to eq(nvidia_imex_package) + is_expected.not_to write_node_attributes('dump node attributes') + end + else + + it 'installs nvidia-imex' do + is_expected.to create_directory(nvidia_imex_shared_dir) + if platform == 'ubuntu' + is_expected.to create_if_missing_remote_file("#{SOURCE_DIR}/#{nvidia_imex_package}-#{nvidia_imex_version}.deb").with( + source: "#{cluster_artifacts_s3_url}/dependencies/nvidia_imex/#{url_suffix}.deb", + mode: '0644', + retries: 3, + retry_delay: 5 + ) + is_expected.to run_bash('Install nvidia-imex') + .with(user: 'root') + .with_retries(3) + .with_retry_delay(5) + .with_code(/ set -e\n dpkg -i #{nvidia_imex_package}-#{nvidia_imex_version}.deb && apt-mark hold #{nvidia_imex_package}/) + else + is_expected.to create_if_missing_remote_file("#{SOURCE_DIR}/#{nvidia_imex_package}-#{nvidia_imex_version}.rpm").with( + source: "#{cluster_artifacts_s3_url}/dependencies/nvidia_imex/#{url_suffix}.rpm", + mode: '0644', + retries: 3, + retry_delay: 5 + ) + is_expected.to install_package('yum-plugin-versionlock') + is_expected.to run_bash("Install nvidia-imex") + .with(user: 'root') + .with_retries(3) + .with_retry_delay(5) + .with_code(/yum install -y #{nvidia_imex_name}.rpm/) + end 
+ end + it 'sets nvidia-imex version' do + expect(node.default['cluster']['nvidia']['imex']['version']).to eq(nvidia_imex_version) + expect(node.default['cluster']['nvidia']['imex']['package']).to eq(nvidia_imex_package) + is_expected.to write_node_attributes('dump node attributes') + end end end end @@ -299,16 +305,52 @@ def self.configure(chef_run) before do chef_run.node.override['cluster']['region'] = 'aws_region' + chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir chef_run.node.override['cluster']['node_type'] = node_type + chef_run.node.override['cluster']['launch_template_id'] = launch_template_id ConvergeNvidiaImex.configure(chef_run) end if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type) it 'does not configure nvidia-imex' do + is_expected.not_to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.not_to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-config.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" }) + is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service") + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0644') + .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" }) is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) end else it 'it starts nvidia-imex service' do + is_expected.to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg") + .with(source: 
'nvidia-imex/nvidia-imex-nodes.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-config.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" }) + is_expected.to create_template("/etc/systemd/system/nvidia-imex.service") + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0644') + .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" }) is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) end end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb index 97754cfcc5..b3524db81d 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb @@ -22,31 +22,6 @@ end end - nvidia_imex_dir = "#{node['cluster']['nvidia']['imex']['shared_dir']}" - - describe file("#{nvidia_imex_dir}/config.cfg") do - it { should exist } - its('owner') { should eq 'root' } - its('group') { should eq 'root' } - its('mode') { should cmp '0755' } - its('content') { should match %r{IMEX_NODE_CONFIG_FILE=#{nvidia_imex_dir}/nodes_config.cfg} } - end - - describe file("#{nvidia_imex_dir}/nodes_config.cfg") do - it { should exist } - its('owner') { should eq 'root' } - its('group') { should eq 'root' } - its('mode') { should cmp '0755' } - end - - describe file("/etc/systemd/system/#{nvidia_imex_service}.service") do - it { should exist } - its('owner') { should eq 'root' } - its('group') { should eq 'root' } - its('mode') { should cmp 
'0644' } - its('content') { should match %r{ExecStart=/usr/bin/nvidia-imex -c #{nvidia_imex_dir}/config.cfg} } - end - describe package("#{node['cluster']['nvidia']['imex']['package']}") do it { should be_installed } its('version') { should match /#{node['cluster']['nvidia']['imex']['version']}/ } @@ -56,6 +31,14 @@ control 'tag:config_nvidia_fabric_manager_enabled' do only_if { instance.nvs_switch_enabled? && node['cluster']['node_type'] == "ComputeFleet" && !os_properties.alinux2? } + describe file("/etc/systemd/system/nvidia-imex.service") do + it { should exist } + its('owner') { should eq 'root' } + its('group') { should eq 'root' } + its('mode') { should cmp '0644' } + its('content') { should match %r{ExecStart=/usr/bin/nvidia-imex -c #{node['cluster']['nvidia']['imex']['shared_dir']}} } + end + describe service('nvidia-imex') do it { should be_enabled } it { should be_running } From c9c81cd6bf062ebe1545b66d813dd7579bccb933 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 1 Aug 2025 23:15:53 -0400 Subject: [PATCH 38/38] [NVIDIA Driver] Upgrade NVIDIA driver to 570.172.08 for all except AL2 Removing test URL --- CHANGELOG.md | 1 + cookbooks/aws-parallelcluster-platform/attributes/platform.rb | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eef76b57c1..384e2e4789 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Libfabric-aws: libfabric-aws-2.1.0-1 - Rdma-core: rdma-core-57.0-1 - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.6 +- Upgrade NVIDIA driver to version 570.172.08 (from 570.86.15) for all OSs except AL2. **BUG FIXES** - Fix a bug in the installation of ARM Performance Library that was causing the build image fail in isolated environments. 
diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index 1fc2c2e12a..8201c2c04e 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -16,7 +16,7 @@ # NVidia default['cluster']['nvidia']['enabled'] = 'no' -default['cluster']['nvidia']['driver_version'] = '570.86.15' +default['cluster']['nvidia']['driver_version'] = '570.172.08' default['cluster']['nvidia']['dcgm_version'] = '3.3.6' if platform?('amazon') && node['platform_version'] == "2" default['cluster']['nvidia']['driver_version'] = '550.127.08'