diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a6f66dc3b..384e2e4789 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Addressed cluster id mismatch known issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting. - Upgrade DCV to version 2024.0-19030. - Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management. +- Install nvidia-imex for all OSs except AL2. **BUG FIXES** - Fix a race condition in CloudWatch Agent startup that could cause nodes bootstrap failures. @@ -38,6 +39,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Libfabric-aws: libfabric-aws-2.1.0-1 - Rdma-core: rdma-core-57.0-1 - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.6 +- Upgrade NVIDIA driver to version 570.172.08 (from 570.86.15) for all OSs except AL2. **BUG FIXES** - Fix a bug in the installation of ARM Performance Library that was causing the build image fail in isolated environments. 
diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index 9b6e3e06be..8201c2c04e 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -16,12 +16,15 @@ # NVidia default['cluster']['nvidia']['enabled'] = 'no' -default['cluster']['nvidia']['driver_version'] = '570.86.15' +default['cluster']['nvidia']['driver_version'] = '570.172.08' default['cluster']['nvidia']['dcgm_version'] = '3.3.6' if platform?('amazon') && node['platform_version'] == "2" default['cluster']['nvidia']['driver_version'] = '550.127.08' end +# nvidia-imex +default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex" + # DCV default['cluster']['dcv']['authenticator']['user'] = "dcvextauth" default['cluster']['dcv']['authenticator']['user_id'] = node['cluster']['reserved_base_uid'] + 3 diff --git a/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb b/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb index a5952b0313..62d52e6eaa 100644 --- a/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb +++ b/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb @@ -18,3 +18,19 @@ def is_process_running(process_name) !ps.stdout.strip.empty? 
end + +# +# Get the number of PCI devices matching the given device id (used to count NVSwitches) +# +def get_nvswitch_count(device_id) + shell_out("lspci -d #{device_id} | wc -l").stdout.strip.to_i +end + +def get_device_ids + # A100 (P4), H100 (P5), B200 (P6) and GB200 (P6e) systems have NVSwitches + # NVSwitch device id is 10de:1af1 for P4 instance + # NVSwitch device id is 10de:22a3 for P5 instance + # NVSwitch device id is 10de:2901 for P6 instance + # NVSwitch device id is 10de:2941 for P6e instance + { 'a100' => '10de:1af1', 'h100' => '10de:22a3', 'b200' => '10de:2901', 'gb200' => '10de:2941' } +end diff --git a/cookbooks/aws-parallelcluster-platform/recipes/config/nvidia_config.rb b/cookbooks/aws-parallelcluster-platform/recipes/config/nvidia_config.rb index 06fc108637..f9f4ea2fc6 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/config/nvidia_config.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/config/nvidia_config.rb @@ -24,3 +24,7 @@ end include_recipe "aws-parallelcluster-platform::nvidia_uvm" + +nvidia_imex 'Configure nvidia-imex' do + action :configure +end diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb index 7b13be7549..04823b28f2 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb @@ -24,3 +24,5 @@ fabric_manager 'Install Nvidia Fabric Manager' nvidia_dcgm 'install Nvidia datacenter-gpu-manager' + +nvidia_imex 'Install nvidia-imex' diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb index 057ac80c17..f078ec9d6d 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb +++ 
b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb @@ -54,12 +54,8 @@ def _nvidia_driver_version # Get number of nv switches def get_nvswitches - # A100 (P4), H100(P5) and B200(P6) systems have NVSwitches - # NVSwitch device id is 10de:1af1 for P4 instance - # NVSwitch device id is 10de:22a3 for P5 instance - # NVSwitch device id is 10de:2901 for P6 instance # We sum the count for all these deviceIds as output of lscpi command will be >0 # for only one device ID based on the instance type - nvswitch_device_ids = ['10de:1af1', '10de:22a3', '10de:2901'] - nvswitch_device_ids.sum { |id| shell_out("lspci -d #{id} | wc -l").stdout.strip.to_i } + nvswitch_device_ids = get_device_ids.values + nvswitch_device_ids.sum { |id| get_nvswitch_count(id) } end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb new file mode 100644 index 0000000000..0e3b1bb0de --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb @@ -0,0 +1,24 @@ +# frozen_string_literal: true + +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +provides :nvidia_imex, platform: 'amazon' do |node| + node['platform_version'].to_i == 2023 +end + +use 'partial/_nvidia_imex_common.rb' +use 'partial/_nvidia_imex_rhel.rb' + +def platform + "amzn#{node['platform_version'].to_i}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb new file mode 100644 index 0000000000..5f0c765bb7 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: true + +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +provides :nvidia_imex, platform: 'amazon', platform_version: '2' + +use 'partial/_nvidia_imex_common.rb' +use 'partial/_nvidia_imex_rhel.rb' + +def imex_installed? + # We do not install NVIDIA-Imex for Alinux2 due to restriction on NVIDIA driver + true +end + +action :configure do + # Do nothing +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb new file mode 100644 index 0000000000..2cca43251f --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb @@ -0,0 +1,24 @@ +# frozen_string_literal: true + +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +provides :nvidia_imex, platform: 'redhat' do |node| + node['platform_version'].to_i >= 8 +end + +use 'partial/_nvidia_imex_common.rb' +use 'partial/_nvidia_imex_rhel.rb' + +def platform + "rhel#{node['platform_version'].to_i}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb new file mode 100644 index 0000000000..8957e080c6 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb @@ -0,0 +1,24 @@ +# frozen_string_literal: true + +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +provides :nvidia_imex, platform: 'rocky' do |node| + node['platform_version'].to_i >= 8 +end + +use 'partial/_nvidia_imex_common.rb' +use 'partial/_nvidia_imex_rhel.rb' + +def platform + "rhel#{node['platform_version'].to_i}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb new file mode 100644 index 0000000000..5472947de1 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb @@ -0,0 +1,24 @@ +# frozen_string_literal: true + +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +provides :nvidia_imex, platform: 'ubuntu' do |node| + node['platform_version'].to_i >= 22 +end + +use 'partial/_nvidia_imex_common.rb' +use 'partial/_nvidia_imex_debian.rb' + +def platform + "ubuntu#{node['platform_version'].delete('.')}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb new file mode 100644 index 0000000000..e74b83c3b9 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -0,0 +1,106 @@ +# frozen_string_literal: true +# +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +unified_mode true +default_action :install + +action :install do + return unless nvidia_enabled_or_installed? + return if on_docker? || imex_installed? || aws_region.start_with?("us-iso") + + directory node['cluster']['nvidia']['imex']['shared_dir'] + + action_install_imex + # Save Imex version in Node Attributes for InSpec Tests + node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version + node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package + node_attributes 'dump node attributes' +end + +action :configure do + return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet" + # Start nvidia-imex on p6e-gb200 and only on ComputeFleet + if get_nvswitch_count(get_device_ids['gb200']) > 1 + # For each Compute Resource, we generate a unique NVIDIA IMEX configuration file, + # if one doesn't already exist in a common, shared location. 
+ template nvidia_imex_nodes_conf_file do + source 'nvidia-imex/nvidia-imex-nodes.erb' + owner 'root' + group 'root' + mode '0755' + action :create + not_if { file_exists_and_cluster_update?(nvidia_imex_nodes_conf_file) } + end + + template nvidia_imex_main_conf_file do + source 'nvidia-imex/nvidia-imex-config.erb' + owner 'root' + group 'root' + mode '0755' + action :create + not_if { file_exists_and_cluster_update?(nvidia_imex_main_conf_file) } + variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file) + end + + template "/etc/systemd/system/#{nvidia_imex_service}.service" do + source 'nvidia-imex/nvidia-imex.service.erb' + owner 'root' + group 'root' + mode '0644' + action :create + variables(imex_main_config_file_path: nvidia_imex_main_conf_file) + end + + service nvidia_imex_service do + action %i(enable start) + supports status: true + end + end +end + +def nvidia_imex_package + "#{nvidia_imex_service}-#{nvidia_driver_major_version}" +end + +def nvidia_driver_major_version + node['cluster']['nvidia']['driver_version'].split('.')[0] +end + +def nvidia_imex_service + 'nvidia-imex' +end + +def nvidia_imex_full_version + "#{node['cluster']['nvidia']['driver_version']}-1" +end + +def imex_installed? + ::File.exist?("/usr/bin/#{nvidia_imex_service}") || ::File.exist?("/usr/bin/#{nvidia_imex_service}-ctl") +end + +def nvidia_enabled_or_installed? + nvidia_enabled? || nvidia_installed? +end + +def file_exists_and_cluster_update?(file_path) + ::File.exist?(file_path) && !are_queues_updated? 
+end + +def nvidia_imex_main_conf_file + "#{node['cluster']['nvidia']['imex']['shared_dir']}/config_#{node['cluster']['launch_template_id']}.cfg" +end + +def nvidia_imex_nodes_conf_file + "#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['launch_template_id']}.cfg" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb new file mode 100644 index 0000000000..7f163e704e --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_debian.rb @@ -0,0 +1,42 @@ +# frozen_string_literal: true +# +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +action :install_imex do + remote_file "#{node['cluster']['sources_dir']}/#{nvidia_imex_package}-#{nvidia_imex_full_version}.deb" do + source "#{nvidia_imex_url}" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end + + bash "Install nvidia-imex" do + user 'root' + cwd node['cluster']['sources_dir'] + code <<-NVIDIA_IMEX + set -e + dpkg -i #{nvidia_imex_package}-#{nvidia_imex_full_version}.deb && apt-mark hold #{nvidia_imex_package} + NVIDIA_IMEX + retries 3 + retry_delay 5 + end +end + +def nvidia_imex_url + "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_imex/#{platform}/#{nvidia_imex_package}_#{nvidia_imex_full_version}_#{arch_suffix}.deb" +end + +def arch_suffix + arm_instance? ? 'arm64' : 'amd64' +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb new file mode 100644 index 0000000000..d48be7aad7 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true +# +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +action :install_imex do + remote_file "#{node['cluster']['sources_dir']}/#{nvidia_imex_package}-#{nvidia_imex_full_version}.rpm" do + source "#{nvidia_imex_url}" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end + + package 'yum-plugin-versionlock' + bash "Install nvidia-imex" do + user 'root' + cwd node['cluster']['sources_dir'] + code <<-NVIDIA_IMEX + set -e + yum install -y #{nvidia_imex_package}-#{nvidia_imex_full_version}.rpm + yum versionlock #{nvidia_imex_package} + NVIDIA_IMEX + retries 3 + retry_delay 5 + end +end + +def arch_suffix + arm_instance? ? 'aarch64' : 'x86_64' +end + +def nvidia_imex_url + "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_imex/#{platform}/#{nvidia_imex_package}-#{nvidia_imex_full_version}.#{arch_suffix}.rpm" +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/nvidia_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/nvidia_spec.rb index d27afb1df4..718e5d2f5d 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/nvidia_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/nvidia_spec.rb @@ -15,6 +15,10 @@ is_expected.to configure_gdrcopy('Configure gdrcopy') end + it 'configures nvidia-imex' do + is_expected.to configure_nvidia_imex('Configure nvidia-imex') + end + it 'loads nvidia-uvm kernel module' do is_expected.to load_kernel_module('nvidia-uvm') end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb new file mode 100644 index 0000000000..8608b88a9e --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -0,0 +1,382 @@ +require 'spec_helper' + +nvidia_version = "1.2.3" +SOURCE_DIR = 'SOURCE_DIR'.freeze +nvidia_imex_shared_dir = "SHARED_DIR/nvidia-imex" +imex_binary = '/usr/bin/nvidia-imex' +imex_ctl_binary = '/usr/bin/nvidia-imex-ctl' 
+launch_template_id = 'lt-123456789012' +cluster_artifacts_s3_url = 'https://aws_region-aws-parallelcluster.s3.aws_region.AWS_DOMAIN' + +class ConvergeNvidiaImex + def self.install(chef_run) + chef_run.converge_dsl('aws-parallelcluster-platform') do + nvidia_imex 'install' do + action :install + end + end + end + + def self.configure(chef_run) + chef_run.converge_dsl('aws-parallelcluster-platform') do + nvidia_imex 'configure' do + action :configure + end + end + end +end + +describe 'nvidia_imex:nvidia_enabled_or_installed?' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + cached(:chef_run) do + runner(platform: platform, version: version, step_into: ['nvidia_imex']) + end + cached(:resource) do + ConvergeNvidiaImex.install(chef_run) + chef_run.find_resource('nvidia_imex', 'install') + end + + context "when nvidia not enabled and not installed" do + before do + allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false) + allow_any_instance_of(Object).to receive(:nvidia_installed?).and_return(false) + end + + it 'is false' do + expect(resource.nvidia_enabled_or_installed?).to eq(false) + end + end + + context "when nvidia not enabled but its already installed" do + before do + allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false) + allow_any_instance_of(Object).to receive(:nvidia_installed?).and_return(true) + end + + it 'is true' do + expect(resource.nvidia_enabled_or_installed?).to eq(true) + end + end + + context "when nvidia is enabled but its not installed" do + before do + allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(true) + allow_any_instance_of(Object).to receive(:nvidia_installed?).and_return(false) + end + + it 'is true' do + expect(resource.nvidia_enabled_or_installed?).to eq(true) + end + end + end + end +end + +describe 'nvidia_imex:imex_installed?' 
do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + cached(:chef_run) do + runner(platform: platform, version: version, step_into: ['nvidia_imex']) + end + cached(:resource) do + ConvergeNvidiaImex.install(chef_run) + chef_run.find_resource('nvidia_imex', 'install') + end + + context "when #{imex_binary} and #{imex_ctl_binary} does not exist" do + before do + allow(File).to receive(:exist?).with(imex_ctl_binary).and_return(false) + allow(File).to receive(:exist?).with(imex_binary).and_return(false) + end + + if platform == 'amazon' && version == '2' + it 'is true' do + expect(resource.imex_installed?).to eq(true) + end + else + it 'is false' do + expect(resource.imex_installed?).to eq(false) + end + end + end + + context "when #{imex_binary} and #{imex_ctl_binary} exists" do + before do + allow(File).to receive(:exist?).with(imex_ctl_binary).and_return(true) + allow(File).to receive(:exist?).with(imex_binary).and_return(true) + end + + it 'is true' do + expect(resource.imex_installed?).to eq(true) + end + end + + context "when #{imex_binary} exists and #{imex_ctl_binary} does not exists" do + before do + allow(File).to receive(:exist?).with(imex_ctl_binary).and_return(false) + allow(File).to receive(:exist?).with(imex_binary).and_return(true) + end + + it 'is true' do + expect(resource.imex_installed?).to eq(true) + end + end + + context "when #{imex_binary} does not exists and #{imex_ctl_binary} exists" do + before do + allow(File).to receive(:exist?).with(imex_ctl_binary).and_return(true) + allow(File).to receive(:exist?).with(imex_binary).and_return(false) + end + + it 'is true' do + expect(resource.imex_installed?).to eq(true) + end + end + end + end +end + +describe 'nvidia_imex:install' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + context 'when nvidia not enabled' do + cached(:chef_run) do + stubs_for_resource('nvidia_imex') do |res| + allow(res).to 
receive(:nvidia_enabled_or_installed?).and_return(false) + end + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.install(runner) + end + cached(:node) { chef_run.node } + it 'does not install nvidia-imex' do + is_expected.not_to install_package('nvidia-imex') + end + end + + context 'when nvidia-imex binary already exists' do + cached(:chef_run) do + stubs_for_resource('nvidia_imex') do |res| + allow(res).to receive(:imex_installed?).and_return(true) + end + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.install(runner) + end + cached(:node) { chef_run.node } + + it 'does not install nvidia-imex' do + is_expected.not_to install_package('nvidia-imex') + end + end + + %w(aarch64 x86_64).each do |arm_or_x86| + context "when nvidia is enabled on #{arm_or_x86}" do + cached(:nvidia_imex_version) { "1.2.3-1" } + cached(:nvidia_imex_package) { "nvidia-imex-1" } + cached(:nvidia_imex_name) do + if %(redhat rocky).include?(platform) || platform == 'amazon' && version == '2023' + "#{nvidia_imex_package}-#{nvidia_imex_version}" + else + "#{nvidia_imex_package}_#{nvidia_imex_version}" + end + end + cached(:url_arch) do + if %(redhat rocky amazon).include?(platform) + arm_or_x86 + elsif platform == 'ubuntu' + arm_or_x86 == 'x86_64' ? 'amd64' : 'arm64' + else + arm_or_x86 == 'x86_64' ? 
'x86_64' : 'aarch64' + end + end + cached(:url_suffix) do + if %(redhat rocky).include?(platform) + "rhel#{version}/#{nvidia_imex_name}.#{url_arch}" + elsif platform == 'amazon' && version == '2023' + "amzn2023/#{nvidia_imex_name}.#{url_arch}" + else + "#{platform}#{version.delete('.')}/#{nvidia_imex_name}_#{url_arch}" + end + end + + cached(:chef_run) do + stubs_for_resource('nvidia_imex') do |res| + allow(res).to receive(:nvidia_enabled_or_installed?).and_return(true) + allow(File).to receive(:exist?).with(imex_ctl_binary).and_return(false) + allow(File).to receive(:exist?).with(imex_binary).and_return(false) + end + runner(platform: platform, version: version, step_into: ['nvidia_imex']) + end + cached(:node) { chef_run.node } + + before do + chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir + chef_run.node.override['cluster']['artifacts_s3_url'] = cluster_artifacts_s3_url + chef_run.node.override['cluster']['region'] = 'aws_region' + chef_run.node.override['cluster']['sources_dir'] = SOURCE_DIR + chef_run.node.automatic['kernel']['machine'] = arm_or_x86 + chef_run.node.override['cluster']['nvidia']['driver_version'] = nvidia_version + ConvergeNvidiaImex.install(chef_run) + end + if platform == 'amazon' && version == '2' + it 'does not install nvidia-imex' do + is_expected.not_to create_directory(nvidia_imex_shared_dir) + is_expected.not_to install_install_packages('Install nvidia-imex') + .with(packages: "#{nvidia_imex_name}") + .with(action: %i(install)) + end + it 'does not set nvidia-imex version' do + expect(node.default['cluster']['nvidia']['imex']['version']).not_to eq(nvidia_imex_version) + expect(node.default['cluster']['nvidia']['imex']['package']).not_to eq(nvidia_imex_package) + is_expected.not_to write_node_attributes('dump node attributes') + end + else + + it 'installs nvidia-imex' do + is_expected.to create_directory(nvidia_imex_shared_dir) + if platform == 'ubuntu' + is_expected.to 
create_if_missing_remote_file("#{SOURCE_DIR}/#{nvidia_imex_package}-#{nvidia_imex_version}.deb").with( + source: "#{cluster_artifacts_s3_url}/dependencies/nvidia_imex/#{url_suffix}.deb", + mode: '0644', + retries: 3, + retry_delay: 5 + ) + is_expected.to run_bash('Install nvidia-imex') + .with(user: 'root') + .with_retries(3) + .with_retry_delay(5) + .with_code(/ set -e\n dpkg -i #{nvidia_imex_package}-#{nvidia_imex_version}.deb && apt-mark hold #{nvidia_imex_package}/) + else + is_expected.to create_if_missing_remote_file("#{SOURCE_DIR}/#{nvidia_imex_package}-#{nvidia_imex_version}.rpm").with( + source: "#{cluster_artifacts_s3_url}/dependencies/nvidia_imex/#{url_suffix}.rpm", + mode: '0644', + retries: 3, + retry_delay: 5 + ) + is_expected.to install_package('yum-plugin-versionlock') + is_expected.to run_bash("Install nvidia-imex") + .with(user: 'root') + .with_retries(3) + .with_retry_delay(5) + .with_code(/yum install -y #{nvidia_imex_name}.rpm/) + end + end + it 'sets nvidia-imex version' do + expect(node.default['cluster']['nvidia']['imex']['version']).to eq(nvidia_imex_version) + expect(node.default['cluster']['nvidia']['imex']['package']).to eq(nvidia_imex_package) + is_expected.to write_node_attributes('dump node attributes') + end + end + end + end + end + end +end + +describe 'nvidia_imex:configure' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + context "when nvidia-imex binary is not installed" do + cached(:chef_run) do + stubs_for_resource('nvidia_imex') do |res| + allow(res).to receive(:imex_installed?).and_return(false) + end + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.configure(runner) + end + cached(:node) { chef_run.node } + + it 'does not configure nvidia-imex' do + is_expected.not_to configure_nvidia_imex('nvidia-imex') + end + end + + %w(HeadNode LoginNode ComputeFleet).each do |node_type| + context "when get_nvswitch_count > 1 on #{node_type} node" 
do + cached(:chef_run) do + stubs_for_provider('nvidia_imex[configure]') do |pro| + allow(pro).to receive(:imex_installed?).and_return(true) + allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) + allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4) + end + runner(platform: platform, version: version, step_into: ['nvidia_imex']) + end + cached(:node) { chef_run.node } + + before do + chef_run.node.override['cluster']['region'] = 'aws_region' + chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir + chef_run.node.override['cluster']['node_type'] = node_type + chef_run.node.override['cluster']['launch_template_id'] = launch_template_id + ConvergeNvidiaImex.configure(chef_run) + end + + if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type) + it 'does not configure nvidia-imex' do + is_expected.not_to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.not_to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-config.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" }) + is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service") + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0644') + .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" }) + is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) + end + else + it 'it starts nvidia-imex service' do + is_expected.to 
create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-config.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" }) + is_expected.to create_template("/etc/systemd/system/nvidia-imex.service") + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0644') + .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" }) + is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) + end + end + end + end + + context "when get_nvswitch_count <= 1" do + cached(:chef_run) do + stubs_for_provider('nvidia_imex[configure]') do |pro| + allow(pro).to receive(:imex_installed?).and_return(true) + allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) + allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1) + end + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.configure(runner) + end + cached(:node) { chef_run.node } + + before do + chef_run.node.override['cluster']['region'] = 'aws_region' + end + + it 'does not configure nvidia-imex' do + is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) + end + end + end + end +end diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb new file mode 100644 index 0000000000..bc34b08091 --- /dev/null +++ 
b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb @@ -0,0 +1,217 @@ +# NVIDIA IMEX configuration file. +# Note: This configuration file is read during IMEX startup. So, IMEX +# service restart is required for new settings to take effect. + +# Description: IMEX logging levels +# Possible Values: +# 0 - All the logging is disabled +# 1 - Set log level to CRITICAL and above +# 2 - Set log level to ERROR and above +# 3 - Set log level to WARNING and above +# 4 - Set log level to INFO and above +# Default Value: 4 +LOG_LEVEL=4 + +# Description: Filename for IMEX logs +# Possible Values: +# Full path/filename string (max length of 256). Logs will be redirected +# to console(stderr), if the specified log file can't be opened or the +# path is empty. +# Default Value: /var/log/nvidia-imex.log +# LOG_FILE_NAME=/var/log/nvidia-imex.log + +# Description: Filename for IMEX stats logging +# Possible Values: +# Full path/filename string (max length of 256). Stats will be redirected +# to console(stderr), if the specified stats file can't be opened or the +# path is empty. +# Default Value: /var/log/nvidia-imex-stats.log +# Note: If STATS_FILE_NAME is configured same as LOG_FILE_NAME, then stats will +# be redirected to the path/filename specified by LOG_FILE_NAME. +# STATS_FILE_NAME=/var/log/nvidia-imex-stats.log + +# Description: Append to an existing log file or overwrite the logs +# Possible Values: +# 0 - No (Log file will be overwritten) +# 1 - Yes (Append to existing log) +# Default Value: 1 +LOG_APPEND_TO_LOG=1 + +# Description: Max size of log file (in MB) +# Possible Values: +# Any Integer values +# Default Value: 1024 +# LOG_FILE_MAX_SIZE=1024 + +# Description: Number of times the IMEX log is rotated once it reaches LOG_FILE_MAX_SIZE +# Possible Values: +# 0 - Log is not rotated. 
Logging is stopped once the IMEX log file reaches +# the size specified in LOG_FILE_MAX_SIZE +# Non-zero Integer - Log is rotated up to the number of times specified in LOG_MAX_ROTATE_COUNT, +# after the size of the log file reaches the size specified in LOG_FILE_MAX_SIZE. +# Combined IMEX log size is LOG_FILE_MAX_SIZE multiplied by LOG_MAX_ROTATE_COUNT+1. +# Once this threshold is reached, the oldest log file is purged and reused. +LOG_MAX_ROTATE_COUNT=3 + +# Description: Redirect all the logs to syslog instead of logging to file +# Possible Values: +# 0 - No +# 1 - Yes +# Default Value: 0 +LOG_USE_SYSLOG=1 + +# Description: daemonize IMEX on start-up +# Possible Values: +# 0 - No (Do not daemonize and run IMEX as a normal process) +# 1 - Yes (Run IMEX process as Unix daemon) +# Default Value: 1 +DAEMONIZE=1 + +# Description: Network interface to listen for IMEX peer communication. +# OPTIONAL - empty value will determine the bind IP from the node config file. +# Possible Values: +# A valid IPv4 address +# A valid IPv6 address +# No value - Determine bind IP from the node configuration file. +# Default Value: +BIND_INTERFACE_IP= + +# Description: Starting TCP port number for IMEX peer communication +# Possible Values: +# Any value between 0 and 65535 +# Default Value: 50000 +SERVER_PORT=50000 + +# Description: Name of file containing IP addresses of nodes +# Possible Values: +# Full path/filename string (max length of 256). +# Default Value: /etc/nvidia-imex/nodes_config.cfg +IMEX_NODE_CONFIG_FILE=<%= @imex_nodes_config_file_path %> + +# Description: Name of the network interface used for communication. +# OPTIONAL - If empty, network interface will be determined by matching bind IP to +# node configuration file. Only necessary to configure if the bind IP +# is IPv6 link-local and on multiple network interfaces. +# Possible Values: +# Interface names like eth0, ens32 .. 
etc +# Default Value: +NETWORK_INTERFACE= + +# Description: Controls whether IMEX should complete initialization without establishing quorum +# Possible values: +# NONE: Do not wait for any quorum with other nodes. +# RECOVERY: In case of unsafe IMEX termination, wait until all nodes that had previously imported +# have connected, allowing them time to safely clean up any potentially hanging references +# Default value: RECOVERY +IMEX_WAIT_FOR_QUORUM=RECOVERY + +# Description: Enable authentication and encryption between nodes. +# Possible Values: +# 0: Disable encryption and authentication +# 1: Enable encryption and authentication +# Default value: 0 +IMEX_ENABLE_AUTH_ENCRYPTION=0 + +# Description: Controls the security mechanism used by IMEX for authentication and encryption between nodes. +# If IMEX_ENABLE_AUTH_ENCRYPTION is enabled (1), then IMEX_AUTH_ENCRYPTION_MODE must be configured +# as one of the supported values. An empty or unexpected value will prevent initialization. +# Possible Values: +# SSL_TLS: Default - Use SSL/mTLS for authentication and encryption. +# GSS_AUTH_ENCRYPT: Use GSSAPI for authentication, integrity and encryption. +# GSS_AUTH_ONLY: Use GSSAPI for authentication and integrity only, encryption will be disabled. +IMEX_AUTH_ENCRYPTION_MODE=SSL_TLS + +### This is the beginning of configuration if IMEX_AUTH_ENCRYPTION_MODE=SSL_TLS mode. ### + +# Description: This determines how IMEX will try to retrieve the keys, certificates, and certificate +# authority for authentication and encryption. +# If IMEX_AUTH_ENCRYPTION_MODE is SSL_TLS, then IMEX_AUTH_SOURCE must be configured +# as one of the supported values. An empty or unexpected value will prevent initialization. +# Possible Values: +# FILE: The provided values are paths to files on the file system. +# ENV_PATH: The provided values are environment variable names to retrieve, and the values in the +# environment variables are treated as paths to files on the file system. 
+# ENV_VAL: The provided values are environment variable names to retrieve, and the values in the +# environment variables are treated as the actual values for the key/cert/cert auth. +IMEX_AUTH_SOURCE= + +# Description: These fields are interpreted based on how IMEX_AUTH_SOURCE is configured +IMEX_SERVER_KEY= +IMEX_SERVER_CERT= +IMEX_SERVER_CERT_AUTH= +IMEX_CLIENT_KEY= +IMEX_CLIENT_CERT= +IMEX_CLIENT_CERT_AUTH= + +# Description: Override the target hostname for authentication of the certificates and keys. This allows +# certificates with common names that do not match the IP addresses provided for the nodes. +# Example: +# If the certificate has the subject: +# "/C=US/ST=CA/L=Santa Clara/O=NVIDIA/OU=Test/CN=localhost" +# The certificate validation will expect the connection hostname to be "localhost". By +# setting IMEX_SECURITY_TARGET_OVERRIDE=localhost you can override the connection +# hostname for security purposes to be "localhost", allowing the connection to succeed. +IMEX_SECURITY_TARGET_OVERRIDE= + +### This is the end of IMEX SSL_TLS mode config parameters. ### + +### This is the beginning of configuration if IMEX_AUTH_ENCRYPTION_MODE=GSS_AUTH_ENCRYPT/GSS_AUTH_ONLY mode. ### + +# Description: Service Principal Name to use when IMEX_AUTH_ENCRYPTION_MODE is GSS_AUTH_ENCRYPT or GSS_AUTH_ONLY. +# Default Value: host +IMEX_GSS_SERVICE_NAME=host + +# Description: GSSAPI timeout (in sec) when IMEX_AUTH_ENCRYPTION_MODE is GSS_AUTH_ENCRYPT or GSS_AUTH_ONLY. +# Possible Values: +# -1 : Default - Retry indefinitely +# >= 0: Number of seconds to wait before triggering clean up +IMEX_GSS_TIMEOUT_SEC=-1 + +# Description: GSSAPI retry interval (in sec) when IMEX_AUTH_ENCRYPTION_MODE is GSS_AUTH_ENCRYPT or GSS_AUTH_ONLY. 
+# Possible Values: +# 5 : Default - Retry every 5 seconds +# >= 0: Number of seconds to wait before retrying +IMEX_GSS_RETRY_INTERVAL_SEC=5 + +# Description: GSSAPI security context lifetime (in sec) when IMEX_AUTH_ENCRYPTION_MODE is GSS_AUTH_ENCRYPT +# or GSS_AUTH_ONLY. +# Possible Values: +# -1 : Default - Indefinite lifetime (limited by the credential lifetime) +# >= 0: Security context lifetime in seconds +IMEX_GSS_SEC_CONTEXT_LIFETIME_SEC=-1 + +# Description: Determines IMEX behavior during fatal GSSAPI failures or timeouts, when IMEX_AUTH_ENCRYPTION_MODE +# is GSS_AUTH_ENCRYPT or GSS_AUTH_ONLY. +# Possible Values: +# 1 : Default - Shutdown IMEX daemon +# 0 : Terminate connection to the failing peer node +IMEX_GSS_SHUTDOWN_ON_FAILURE=1 + +### This is the end of IMEX GSS_AUTH_ENCRYPT/GSS_AUTH_ONLY mode config parameters. ### + +# Description: Enable the command/control service to allow for querying information from the IMEX daemon. +# Must be used with IMEX_CMD_PORT (optionally IMEX_CMD_BIND_INTERFACE_IP) and/or +# IMEX_CMD_UNIX_DOMAIN_PATH +IMEX_CMD_ENABLED=1 + +# Description: IP address to use to bind the command/control service. Ignored if IMEX_CMD_ENABLED=0 +# If empty (but IMEX_CMD_PORT is specified), it will bind to all available interfaces. +IMEX_CMD_BIND_INTERFACE_IP= + +# Description: Port to bind to (in conjunction with IMEX_CMD_BIND_INTERFACE_IP) for the command/control service. +# Ignored if IMEX_CMD_ENABLED=0 +IMEX_CMD_PORT=50005 + +# Description: Unix domain socket path to attach to for the command/control service. Ignored if IMEX_CMD_ENABLED=0 +IMEX_CMD_UNIX_DOMAIN_PATH= + +# Description: Determines how long to wait after detecting that the IMEX daemon has lost connection to another +# node before triggering clean up of imports and exports from that node. If a connection is reestablished +# before the grace period expires, and IMEX is able to identify that it is the same instance previously +# connected, then no clean up is required. 
If a connection is established and IMEX detects that it is +# a new instance (i.e. someone restarted the IMEX daemon), then clean up will be immediately triggered +# regardless of grace period. +# -1: Default - Wait indefinitely +# 0: Immediately trigger clean up +# >0: Number of seconds to wait before triggering clean up +IMEX_NODE_DISCONNECTED_GRACE_TIME=-1 \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-nodes.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-nodes.erb new file mode 100644 index 0000000000..d48070b80f --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-nodes.erb @@ -0,0 +1,3 @@ +## Please replace the placeholder IPs below with the correct IP addresses of the instances launched in the GB200 Capacity Block +172.31.51.93 +172.31.48.43 \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb new file mode 100644 index 0000000000..fbead02aa1 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb @@ -0,0 +1,28 @@ +# This file is created by ParallelCluster following the default settings +# given by the official NVIDIA docs: https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/gettingstarted.html#on-linux-based-systems + +[Unit] +Description=NVIDIA IMEX service +After=network-online.target +Requires=network-online.target + +[Service] +User=root +PrivateTmp=false +Type=forking +TimeoutStartSec=infinity + +ExecStart=/usr/bin/nvidia-imex -c <%= @imex_main_config_file_path %> + +LimitCORE=infinity + +Restart=on-failure +RestartSec=1s + +[Install] +WantedBy=multi-user.target + + + + + diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb 
new file mode 100644 index 0000000000..b3524db81d --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb @@ -0,0 +1,46 @@ +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +control 'tag:install_expected_versions_of_nvidia_imex_installed' do + only_if { ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !os_properties.alinux2? } + + nvidia_imex_service = 'nvidia-imex' + ["/usr/bin/#{nvidia_imex_service}", "/usr/bin/#{nvidia_imex_service}-ctl"].each do |path| + describe file(path) do + it { should exist } + its('owner') { should eq 'root' } + its('group') { should eq 'root' } + its('mode') { should cmp '0755' } + end + end + + describe package("#{node['cluster']['nvidia']['imex']['package']}") do + it { should be_installed } + its('version') { should match(/#{Regexp.escape(node['cluster']['nvidia']['imex']['version'].to_s)}/) } # escaped so '.' in the version matches literally, not any character + end +end + +control 'tag:config_nvidia_fabric_manager_enabled' do # NOTE(review): name mentions fabric manager but the checks below target nvidia-imex; confirm whether the control id should be renamed + only_if { instance.nvs_switch_enabled? && node['cluster']['node_type'] == "ComputeFleet" && !os_properties.alinux2? 
} + + describe file("/etc/systemd/system/nvidia-imex.service") do + it { should exist } + its('owner') { should eq 'root' } + its('group') { should eq 'root' } + its('mode') { should cmp '0644' } + its('content') { should match(%r{ExecStart=/usr/bin/nvidia-imex -c #{Regexp.escape(node['cluster']['nvidia']['imex']['shared_dir'].to_s)}}) } # path is matched literally, not as a regex pattern + end + + describe service('nvidia-imex') do + it { should be_enabled } + it { should be_running } + end +end