From a8085b87b552b1689fc0a68162b99e1caf7ecff1 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Fri, 14 Mar 2025 08:05:27 -0700 Subject: [PATCH 01/12] Upgrade Slurm to version 24.05.7 (from 24.05.6) Signed-off-by: Hanwen --- CHANGELOG.md | 2 +- cookbooks/aws-parallelcluster-slurm/attributes/versions.rb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47c9de2f7f..7dcf3043e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Disable unused services like cups and wpa_supplicant from Official ParallelCluster AMIs to improve security. **CHANGES** -- Upgrade Slurm to version 24.05.6. +- Upgrade Slurm to version 24.05.7. - Upgrade NVIDIA driver to version 570.86.15 (from 550.127.08) for all OSs except AL2. - Upgrade CUDA Toolkit to version 12.8.0 (from 12.4.1) for all OSs except AL2. - Upgrade Python to 3.12.8 for all OSs except AL2 (from 3.9.20). diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb index 7b62fe188a..c4c457b90c 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb @@ -1,8 +1,8 @@ # Slurm -default['cluster']['slurm']['version'] = '24-05-6-1' +default['cluster']['slurm']['version'] = '24-05-7-1' default['cluster']['slurm']['commit'] = '' default['cluster']['slurm']['branch'] = '' -default['cluster']['slurm']['sha256'] = '0ba810649ebc1c3b1c1d7102dbd5365e53fd7ce7c25ab2108bd0196b6988ddb2' +default['cluster']['slurm']['sha256'] = '297e85853314a0a4d227ca66bb44179c099f0de5d86e83ffe21cb464b9ad3709' default['cluster']['slurm']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/slurm" # Munge default['cluster']['munge']['munge_version'] = '0.5.16' From 2efb587842eccca894bd634d312088ff2ac6cec6 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Fri, 14 Mar 
2025 11:14:18 -0700 Subject: [PATCH 02/12] Use separate commands when copying files and preserving permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `cp -p` fails with the following error on Ubuntu 24: ``` STDERR: cp: preserving permissions for ‘/local/home/ubuntu/.ssh/authorized_keys’: Operation not supported ``` Signed-off-by: Hanwen --- .../recipes/config/cluster_user.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/recipes/config/cluster_user.rb b/cookbooks/aws-parallelcluster-platform/recipes/config/cluster_user.rb index 58e21c855d..7cd35e402d 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/config/cluster_user.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/config/cluster_user.rb @@ -73,7 +73,9 @@ bash "copy_auth_file" do code <<-PERMS set -e - cp -p #{node['cluster']['shared_dir']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys + cp #{node['cluster']['shared_dir']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys + chmod --reference=#{node['cluster']['shared_dir']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys + chown --reference=#{node['cluster']['shared_dir']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys PERMS only_if { node['cluster']['default_user_home'] == 'local' } end @@ -90,7 +92,9 @@ bash "copy_auth_file" do code <<-PERMS set -e - cp -p #{node['cluster']['shared_dir_login_nodes']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys + cp #{node['cluster']['shared_dir_login_nodes']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys + chmod --reference=#{node['cluster']['shared_dir_login_nodes']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys + chown 
--reference=#{node['cluster']['shared_dir_login_nodes']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys PERMS only_if { node['cluster']['default_user_home'] == 'local' } end From d5415213ac8cdcd622275bffdea8f0634add1fd4 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Thu, 13 Mar 2025 12:55:02 -0700 Subject: [PATCH 03/12] Shortening RHEL/Rocky boot time by deprioritizing IPv6 and disabling Internet check This commit saves time because the OS won't retry on unsupported IPv6 and on the optional Internet connectivity check Signed-off-by: Hanwen --- CHANGELOG.md | 4 ++++ .../redhat/dns_domain/99-disable-ipv6-metadata.cfg | 3 +++ .../files/redhat/dns_domain/NetworkManager.conf | 7 +++++++ .../rocky/dns_domain/99-disable-ipv6-metadata.cfg | 3 +++ .../files/rocky/dns_domain/NetworkManager.conf | 7 +++++++ .../resources/dns_domain/dns_domain_redhat8.rb | 10 ++++++++++ .../resources/dns_domain/dns_domain_rocky8.rb | 10 ++++++++++ 7 files changed, 44 insertions(+) create mode 100644 cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/99-disable-ipv6-metadata.cfg create mode 100644 cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/99-disable-ipv6-metadata.cfg diff --git a/CHANGELOG.md b/CHANGELOG.md index 7dcf3043e5..0f510ee2bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,10 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Remove generation of DSA keys for login nodes as DSA, which became unsupported in OpenSSH 9.7+. - Set instance ID and instance type information in Slurm upon compute nodes launch. - Install NVIDIA drivers without the option 'no-cc-version-check', which is now deprecated in the NVIDIA installer. 
+- Reduce RHEL/Rocky Linux boot time by the following network customization: + - Configuring higher priority to IPv4 than IPv6 + - Disabling Internet connectivity check + - Configuring only IPv4 IMDS endpoint to cloud-init **BUG FIXES** - Remove usage of cfn-init for compute node bootstrapping to reduce node scale-up time. diff --git a/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/99-disable-ipv6-metadata.cfg b/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/99-disable-ipv6-metadata.cfg new file mode 100644 index 0000000000..71dd7f17ad --- /dev/null +++ b/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/99-disable-ipv6-metadata.cfg @@ -0,0 +1,3 @@ +datasource: + Ec2: + metadata_urls: [ 'http://169.254.169.254' ] \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf b/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf index 64540b3c48..3caa0bccac 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf +++ b/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf @@ -23,6 +23,13 @@ plugins = ifcfg-rh, dhcp = dhclient +[connection] +ipv4.route-metric=100 +ipv6.route-metric=200 + +[connectivity] +enabled=false + [logging] # When debugging NetworkManager, enabling debug logging is of great help. 
# diff --git a/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/99-disable-ipv6-metadata.cfg b/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/99-disable-ipv6-metadata.cfg new file mode 100644 index 0000000000..71dd7f17ad --- /dev/null +++ b/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/99-disable-ipv6-metadata.cfg @@ -0,0 +1,3 @@ +datasource: + Ec2: + metadata_urls: [ 'http://169.254.169.254' ] \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf b/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf index 64540b3c48..3caa0bccac 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf +++ b/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf @@ -23,6 +23,13 @@ plugins = ifcfg-rh, dhcp = dhclient +[connection] +ipv4.route-metric=100 +ipv6.route-metric=200 + +[connectivity] +enabled=false + [logging] # When debugging NetworkManager, enabling debug logging is of great help. 
# diff --git a/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_redhat8.rb b/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_redhat8.rb index 3cb10a8d8f..4b63e5eca8 100644 --- a/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_redhat8.rb +++ b/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_redhat8.rb @@ -33,6 +33,16 @@ mode '0644' end + # Disable ipv6 IMDS in cloud init to speed up + cookbook_file '99-disable-ipv6-metadata.cfg' do + path '/etc/cloud/cloud.cfg.d/99-disable-ipv6-metadata.cfg' + source 'dns_domain/99-disable-ipv6-metadata.cfg' + cookbook 'aws-parallelcluster-slurm' + user 'root' + group 'root' + mode '0644' + end + action_update_search_domain network_service 'Restart network service' end diff --git a/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_rocky8.rb b/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_rocky8.rb index 4ad36e1856..448619f0ac 100644 --- a/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_rocky8.rb +++ b/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_rocky8.rb @@ -33,6 +33,16 @@ mode '0644' end + # Disable ipv6 IMDS in cloud init to speed up + cookbook_file '99-disable-ipv6-metadata.cfg' do + path '/etc/cloud/cloud.cfg.d/99-disable-ipv6-metadata.cfg' + source 'dns_domain/99-disable-ipv6-metadata.cfg' + cookbook 'aws-parallelcluster-slurm' + user 'root' + group 'root' + mode '0644' + end + action_update_search_domain network_service 'Restart network service' end From 2c1029cab22eb663fc6319c83ad67e835ec706f3 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande <79726937+himani2411@users.noreply.github.com> Date: Tue, 18 Mar 2025 16:01:36 -0400 Subject: [PATCH 04/12] [Ubuntu24] Install Lustre client for Ubuntu24 (#2912) Co-authored-by: Himani Anil Deshpande --- CHANGELOG.md | 1 - .../resources/lustre/partial/_install_lustre_debian.rb | 1 - .../test/controls/lustre_spec.rb | 10 
+++++----- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f510ee2bc..609290892a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,6 @@ This file is used to list changes made in each version of the AWS ParallelCluste ------ **ENHANCEMENTS** - Add support for Ubuntu 24.04. - Notice that ParallelCluster official AMI for Ubuntu 24.04 does not support Lustre. - Disable unused services like cups and wpa_supplicant from Official ParallelCluster AMIs to improve security. **CHANGES** diff --git a/cookbooks/aws-parallelcluster-environment/resources/lustre/partial/_install_lustre_debian.rb b/cookbooks/aws-parallelcluster-environment/resources/lustre/partial/_install_lustre_debian.rb index 13a797c7f8..78a7b13149 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/lustre/partial/_install_lustre_debian.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/lustre/partial/_install_lustre_debian.rb @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and limitations under the License. action :setup do - return if node['platform_version'].to_i == 24 apt_repository 'fsxlustreclientrepo' do uri "https://fsx-lustre-client-repo.s3.amazonaws.com/ubuntu" components ['main'] diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/lustre_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/lustre_spec.rb index ef15c538d5..0d1f5b707d 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/lustre_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/lustre_spec.rb @@ -27,7 +27,7 @@ end end - if os_properties.redhat? && inspec.os.release.to_f >= 8.2 && !os_properties.on_docker? && !os_properties.ubuntu2404? + if os_properties.redhat? && inspec.os.release.to_f >= 8.2 && !os_properties.on_docker? 
# TODO: restore installation and check on docker when Lustre is available for RH8.9 # See: https://docs.aws.amazon.com/fsx/latest/LustreGuide/install-lustre-client.html unless inspec.os.release.to_f == 8.7 && (node['cluster']['kernel_release'].include?("4.18.0-425.3.1.el8") || node['cluster']['kernel_release'].include?("4.18.0-425.13.1.el8_7")) @@ -55,7 +55,7 @@ end end - if os_properties.debian_family? && !os_properties.ubuntu2404? + if os_properties.debian_family? describe apt('https://fsx-lustre-client-repo.s3.amazonaws.com/ubuntu') do it { should exist } it { should be_enabled } @@ -89,7 +89,7 @@ control 'tag:install_lustre_lnet_kernel_module_enabled' do title "Verify that lnet kernel module is enabled" - only_if { !os_properties.on_docker? && !os_properties.alinux? && !os_properties.ubuntu2404? } + only_if { !os_properties.on_docker? && !os_properties.alinux? } describe kernel_module("lnet") do it { should be_loaded } it { should_not be_disabled } @@ -98,7 +98,7 @@ end control 'lustre_mounted' do - only_if { !os_properties.on_docker? && !os_properties.ubuntu2404? } + only_if { !os_properties.on_docker? } describe mount('/shared_dir') do it { should be_mounted } its('type') { should eq 'lustre' } @@ -106,7 +106,7 @@ end control 'lustre_unmounted' do - only_if { !os_properties.on_docker? && !os_properties.ubuntu2404? } + only_if { !os_properties.on_docker? 
} describe mount('/shared_dir') do it { should_not be_mounted } From 11e8def9d689c555407a9383a9e3ef93f7e28def Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande <79726937+himani2411@users.noreply.github.com> Date: Wed, 19 Mar 2025 16:35:21 -0400 Subject: [PATCH 05/12] Adding Ubuntu24 for kitchen tests for storage (#2915) * Adding Ubuntu24 for kitchen tests for storage * Adding Rocky9, rhel9 and al2023 --------- Co-authored-by: Himani Anil Deshpande --- .../test/controls/cloudwatch_spec.rb | 7 +++---- .../test/controls/sticky_bits_spec.rb | 2 +- .../test/controls/mysql_client_spec.rb | 5 ++--- test/environments/kitchen.rb | 20 +++++++++++++++---- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb index 0d1f4783f1..d88b2e2c2f 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb @@ -20,14 +20,13 @@ describe 'Check the presence of the cloudwatch package gpg key' # In Ubuntu >20.04 due to environment variable the keyring is placed under home of the user ubuntu with the permission of root - ubuntu2004 = os_properties.ubuntu2004? - ubuntu2204 = os_properties.ubuntu2204? - keyring = (ubuntu2004 || ubuntu2204) && !os_properties.on_docker? ? '--keyring /home/ubuntu/.gnupg/pubring.kbx' : '' + + keyring = os_properties.ubuntu? && !os_properties.on_docker? ? '--keyring /home/ubuntu/.gnupg/pubring.kbx' : '' sudo = os_properties.redhat_on_docker? ? '' : 'sudo' describe bash("#{sudo} gpg --list-keys #{keyring}") do # Don't check exit status for Ubuntu20 because it returns 2 when executed in the validate phase of a created AMI # os_properties cannot be used in the describe block level. 
It can be used within an it{} block - its('exit_status') { should eq 0 } unless ubuntu2004 || ubuntu2204 + its('exit_status') { should eq 0 } unless os_properties.ubuntu? its('stdout') { should match /3B789C72/ } its('stdout') { should match /Amazon CloudWatch Agent/ } end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/sticky_bits_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/sticky_bits_spec.rb index c72f192973..1068082914 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/sticky_bits_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/sticky_bits_spec.rb @@ -1,7 +1,7 @@ control 'tag:config_sticky_bits_configured' do title 'Check sticky bits configuration' - if (os_properties.ubuntu2004? || os_properties.ubuntu2204?) && !os_properties.on_docker? + if os_properties.ubuntu? && !os_properties.on_docker? # This test passes on Mac but doesn't work as GitHub action. describe kernel_parameter('fs.protected_regular') do its('value') { should eq 0 } diff --git a/cookbooks/aws-parallelcluster-slurm/test/controls/mysql_client_spec.rb b/cookbooks/aws-parallelcluster-slurm/test/controls/mysql_client_spec.rb index bbe1ddf088..bd50750ec8 100644 --- a/cookbooks/aws-parallelcluster-slurm/test/controls/mysql_client_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/test/controls/mysql_client_spec.rb @@ -13,13 +13,14 @@ title "MySql client is installed" mysql_packages = [] + ubuntu = os_properties.ubuntu? if os.redhat? mysql_packages.concat %w(mysql-community-client-plugins mysql-community-common mysql-community-devel mysql-community-libs) if os_properties.alinux2? || os_properties.centos7? mysql_packages.concat %w(mysql-community-libs-compat) end - elsif os_properties.ubuntu2004? || os_properties.ubuntu2204? + elsif ubuntu mysql_packages.concat %w(libmysqlclient-dev libmysqlclient21) else describe "unsupported OS" do @@ -27,8 +28,6 @@ end end - ubuntu = os_properties.ubuntu? 
- mysql_packages.each do |pkg| describe package(pkg) do it { should be_installed } diff --git a/test/environments/kitchen.rb b/test/environments/kitchen.rb index ec880d60b6..d8702b5cdf 100644 --- a/test/environments/kitchen.rb +++ b/test/environments/kitchen.rb @@ -5,29 +5,41 @@ name 'kitchen' default_attributes 'kitchen_hooks' => { 'ebs_mount-vol_array/alinux2' => '', + 'ebs_mount-vol_array/alinux2023' => '', 'ebs_mount-vol_array/rhel8' => '', - 'ebs_mount-vol_array/centos7' => '', + 'ebs_mount-vol_array/rhel9' => '', 'ebs_mount-vol_array/ubuntu2004' => '', 'ebs_mount-vol_array/ubuntu2204' => '', + 'ebs_mount-vol_array/ubuntu2404' => '', 'ebs_mount-vol_array/rocky8' => '', + 'ebs_mount-vol_array/rocky9' => '', 'ebs_unmount-vol_array/alinux2' => '', + 'ebs_unmount-vol_array/alinux2023' => '', 'ebs_unmount-vol_array/rhel8' => '', - 'ebs_unmount-vol_array/centos7' => '', + 'ebs_unmount-vol_array/rhel9' => '', 'ebs_unmount-vol_array/ubuntu2004' => '', 'ebs_unmount-vol_array/ubuntu2204' => '', + 'ebs_unmount-vol_array/ubuntu2404' => '', 'ebs_unmount-vol_array/rocky8' => '', + 'ebs_unmount-vol_array/rocky9' => '', 'raid_mount-raid_vol_array/alinux2' => '', + 'raid_mount-raid_vol_array/alinux2023' => '', 'raid_mount-raid_vol_array/rhel8' => '', - 'raid_mount-raid_vol_array/centos7' => '', + 'raid_mount-raid_vol_array/rhel9' => '', 'raid_mount-raid_vol_array/ubuntu2004' => '', 'raid_mount-raid_vol_array/ubuntu2204' => '', + 'raid_mount-raid_vol_array/ubuntu2404' => '', 'raid_mount-raid_vol_array/rocky8' => '', + 'raid_mount-raid_vol_array/rocky9' => '', 'raid_unmount-raid_vol_array/alinux2' => '', + 'raid_unmount-raid_vol_array/alinux2023' => '', 'raid_unmount-raid_vol_array/rhel8' => '', - 'raid_unmount-raid_vol_array/centos7' => '', + 'raid_unmount-raid_vol_array/rhel9' => '', 'raid_unmount-raid_vol_array/ubuntu2004' => '', 'raid_unmount-raid_vol_array/ubuntu2204' => '', + 'raid_unmount-raid_vol_array/ubuntu2404' => '', 'raid_unmount-raid_vol_array/rocky8' => '', + 
'raid_unmount-raid_vol_array/rocky9' => '', 'lustre_mount-fsx_fs_id_array' => ["fs-0ab11b3ade43091fe"], 'lustre_mount-fsx_dns_name_array' => ["fs-0ab11b3ade43091fe.fsx.us-west-2.amazonaws.com"], 'lustre_mount-fsx_mount_name_array' => ["qz5b7bev"], From a490d2b624d8c3139de4daa63dc1044f40148705 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande <79726937+himani2411@users.noreply.github.com> Date: Wed, 19 Mar 2025 17:24:30 -0400 Subject: [PATCH 06/12] [Kitchen test] Add Ubuntu Os for C_state Install phase tests (#2916) Co-authored-by: Himani Anil Deshpande --- .../test/controls/c_states_spec.rb | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/c_states_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/c_states_spec.rb index 86b5686008..e560019afd 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/c_states_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/c_states_spec.rb @@ -3,16 +3,13 @@ title 'Check the configuration to disable c states' only_if { !os_properties.on_docker? && os_properties.x86? } - if os_properties.ubuntu2004? - describe file('/etc/default/grub') do - it { should exist } - its('content') { should match(/processor.max_cstate=1/) } - its('content') { should match(/intel_idle.max_cstate=1/) } - end - describe file('/boot/grub/grub.cfg') do - it { should exist } - its('content') { should match(/processor.max_cstate=1/) } - its('content') { should match(/intel_idle.max_cstate=1/) } + if os_properties.ubuntu? 
+ %w(/etc/default/grub /boot/grub/grub.cfg).each do |file_path| + describe file(file_path) do + it { should exist } + its('content') { should match(/processor.max_cstate=1/) } + its('content') { should match(/intel_idle.max_cstate=1/) } + end end else describe bash('cpupower idle-info') do From 04b4ca9fa209b989090d66de1dd2942883dca125 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande <79726937+himani2411@users.noreply.github.com> Date: Thu, 20 Mar 2025 10:22:59 -0400 Subject: [PATCH 07/12] [Kitchen Test] Use os_properties outside it block (#2918) Co-authored-by: Himani Anil Deshpande --- .../test/controls/cloudwatch_spec.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb index d88b2e2c2f..ee35703090 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb @@ -20,13 +20,13 @@ describe 'Check the presence of the cloudwatch package gpg key' # In Ubuntu >20.04 due to environment variable the keyring is placed under home of the user ubuntu with the permission of root - - keyring = os_properties.ubuntu? && !os_properties.on_docker? ? '--keyring /home/ubuntu/.gnupg/pubring.kbx' : '' + is_ubuntu = os_properties.ubuntu? + keyring = is_ubuntu && !os_properties.on_docker? ? '--keyring /home/ubuntu/.gnupg/pubring.kbx' : '' sudo = os_properties.redhat_on_docker? ? '' : 'sudo' describe bash("#{sudo} gpg --list-keys #{keyring}") do # Don't check exit status for Ubuntu20 because it returns 2 when executed in the validate phase of a created AMI # os_properties cannot be used in the describe block level. It can be used within an it{} block - its('exit_status') { should eq 0 } unless os_properties.ubuntu? 
+ its('exit_status') { should eq 0 } unless is_ubuntu its('stdout') { should match /3B789C72/ } its('stdout') { should match /Amazon CloudWatch Agent/ } end From 60b00ad17e1c822e95b061b139ce3c5eef7b40e7 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Thu, 20 Mar 2025 12:50:40 -0700 Subject: [PATCH 08/12] Clean resolv.conf if it's not managed by system at the end of AMI build The conditional statement avoids cleaning if the `/etc/resolv.conf` is a symbolic link. It is a symbolic link when it is managed by other systems. Cleaning the `/etc/resolv.conf` speeds up instance launch because it wouldn't try to use the name server from the AMI creation environment. The delay was shown in `/var/log/cloud-init.log`: ``` 2025-03-19 16:00:07,721 - util.py[DEBUG]: Resolving URL: http://169.254.169.254 took 40.099 seconds 2025-03-19 16:00:07,721 - util.py[DEBUG]: Resolving URL: http://[fd00:ec2::254] took 0.000 seconds 2025-03-19 16:00:17,731 - util.py[DEBUG]: Resolving URL: http://instance-data.:8773 took 10.010 seconds ``` Example content of `/etc/resolv.conf`: ``` cat /etc/resolv.conf # Generated by NetworkManager search ec2.internal nameserver 192.168.0.2 ``` Signed-off-by: Hanwen --- cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh b/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh index 2943b51f38..00f98efc16 100644 --- a/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh +++ b/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh @@ -20,5 +20,10 @@ if [ "${ID}${VERSION_ID}" == "centos7" ]; then rm -f /etc/sysconfig/network-scripts/ifcfg-eth0 fi +# Clean resolv.conf if it's not managed by system +if [ ! 
-L "/etc/resolv.conf" ]; then + echo -n > /etc/resolv.conf +fi + find /var/log -type f -exec /bin/rm -v {} \; touch /var/log/lastlog From e0c6e128758177ac15de1687dc0be4cc68e7911e Mon Sep 17 00:00:00 2001 From: Hanwen Date: Thu, 20 Mar 2025 13:25:01 -0700 Subject: [PATCH 09/12] Remove nameserver from resolv.conf at the end of AMI build The conditional statement avoids cleaning if the `/etc/resolv.conf` is a symbolic link. It is a symbolic link when it is managed by other systems. Cleaning the `/etc/resolv.conf` speeds up instance launch because it wouldn't try to use name server from the AMI creation environment. The delay was shown in `/var/log/cloud-init.log`: ``` 2025-03-19 16:00:07,721 - util.py[DEBUG]: Resolving URL: http://169.254.169.254 took 40.099 seconds 2025-03-19 16:00:07,721 - util.py[DEBUG]: Resolving URL: http://[fd00:ec2::254] took 0.000 seconds 2025-03-19 16:00:17,731 - util.py[DEBUG]: Resolving URL: http://instance-data.:8773 took 10.010 seconds ``` Example content of `/etc/resolv.conf`: ``` cat /etc/resolv.conf # Generated by NetworkManager search ec2.internal nameserver 192.168.0.2 ``` Signed-off-by: Hanwen --- cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh b/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh index 00f98efc16..bb14fa8e53 100644 --- a/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh +++ b/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh @@ -22,7 +22,7 @@ fi # Clean resolv.conf if it's not managed by system if [ ! -L "/etc/resolv.conf" ]; then - echo -n > /etc/resolv.conf + sed -i '/^nameserver/d' /etc/resolv.conf fi find /var/log -type f -exec /bin/rm -v {} \; From 035bc14509fa055e841e021dfbf4b4ca25a72d25 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Fri, 21 Mar 2025 12:58:02 -0700 Subject: [PATCH 10/12] Clean Resolv.conf in official AMI build 1. 
Only cleanup resolv conf during official AMI build. In the future, we will evaluate to apply this improvement to all AMI builds. 2. Also clean up `/run/systemd/resolve/resolv.conf `. This file exists on Ubuntu Signed-off-by: Hanwen --- .../aws-parallelcluster-platform/files/ami_cleanup.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh b/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh index bb14fa8e53..7a50cc215a 100644 --- a/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh +++ b/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh @@ -1,5 +1,7 @@ #!/bin/bash +IS_OFFICIAL_AMI_BUILD=${1:-"false"} + # clean up cloud init artifacts https://cloudinit.readthedocs.io/en/latest/topics/cli.html#clean cloud-init clean -s @@ -21,8 +23,10 @@ if [ "${ID}${VERSION_ID}" == "centos7" ]; then fi # Clean resolv.conf if it's not managed by system -if [ ! -L "/etc/resolv.conf" ]; then - sed -i '/^nameserver/d' /etc/resolv.conf +if [ "${IS_OFFICIAL_AMI_BUILD}" == "true" ]; then + echo "Clean resolv.conf for official AMIs" + echo -n > /etc/resolv.conf + rm -f /run/systemd/resolve/resolv.conf fi find /var/log -type f -exec /bin/rm -v {} \; From 98246dc4ce2e47e9b524776a9865dc78db0c81a8 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande <79726937+himani2411@users.noreply.github.com> Date: Tue, 25 Mar 2025 09:51:12 -0400 Subject: [PATCH 11/12] Remove route metric to check if it affects route tables (#2923) Co-authored-by: Himani Anil Deshpande --- .../files/redhat/dns_domain/NetworkManager.conf | 4 ---- .../files/rocky/dns_domain/NetworkManager.conf | 4 ---- 2 files changed, 8 deletions(-) diff --git a/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf b/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf index 3caa0bccac..e89b4a02fd 100644 --- 
a/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf +++ b/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf @@ -23,10 +23,6 @@ plugins = ifcfg-rh, dhcp = dhclient -[connection] -ipv4.route-metric=100 -ipv6.route-metric=200 - [connectivity] enabled=false diff --git a/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf b/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf index 3caa0bccac..e89b4a02fd 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf +++ b/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf @@ -23,10 +23,6 @@ plugins = ifcfg-rh, dhcp = dhclient -[connection] -ipv4.route-metric=100 -ipv6.route-metric=200 - [connectivity] enabled=false From fda6bf22ec22444054d85dca4a532e742d188034 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Tue, 25 Mar 2025 12:29:49 -0700 Subject: [PATCH 12/12] Consider string "true" as turning on Lustre, Nvidia installation Signed-off-by: Hanwen --- cookbooks/aws-parallelcluster-environment/libraries/fsx.rb | 2 +- cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb | 2 +- .../fabric_manager/partial/_fabric_manager_common.rb | 2 +- .../resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb | 2 +- .../test/controls/enroot_spec.rb | 6 +++--- .../test/controls/nvidia_dcgm_spec.rb | 2 +- .../test/controls/nvidia_fabric_manager_spec.rb | 2 +- .../aws-parallelcluster-slurm/test/controls/pyxis_spec.rb | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/libraries/fsx.rb b/cookbooks/aws-parallelcluster-environment/libraries/fsx.rb index 40f4871a79..70e1916084 100644 --- a/cookbooks/aws-parallelcluster-environment/libraries/fsx.rb +++ b/cookbooks/aws-parallelcluster-environment/libraries/fsx.rb @@ -10,5 +10,5 @@ def aws_domain_for_fsx(region) end def lustre_enabled? 
- ['yes', true].include?(node['cluster']['lustre']['enabled']) + ['yes', true, 'true'].include?(node['cluster']['lustre']['enabled']) end diff --git a/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb b/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb index 29f258490e..a5952b0313 100644 --- a/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb +++ b/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb @@ -1,5 +1,5 @@ def nvidia_enabled? - ['yes', true].include?(node['cluster']['nvidia']['enabled']) + ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) end # diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb index 1c5ac45aba..027766f98f 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb @@ -45,7 +45,7 @@ def _fabric_manager_enabled end def _nvidia_enabled - nvidia_enabled.nil? ? ['yes', true].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled + nvidia_enabled.nil? ? ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled end def _nvidia_driver_version diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb index 7ad1032211..bd4278f9a6 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb @@ -24,7 +24,7 @@ end def _nvidia_enabled - nvidia_enabled.nil? ? ['yes', true].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled + nvidia_enabled.nil? ? 
['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled end def package_version diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/enroot_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/enroot_spec.rb index a8d92f625e..47eac71c1d 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/enroot_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/enroot_spec.rb @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and limitations under the License. control 'tag:install_expected_version_of_enroot_installed' do - only_if { !os_properties.on_docker? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) } + only_if { !os_properties.on_docker? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } expected_enroot_version = node['cluster']['enroot']['version'] @@ -31,7 +31,7 @@ end control 'tag:config_enroot_enabled_on_graphic_instances' do - only_if { !os_properties.on_docker? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) } + only_if { !os_properties.on_docker? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } describe 'enroot service should be enabled' do subject { command("enroot version") } its('exit_status') { should cmp == 0 } @@ -39,7 +39,7 @@ end control 'tag:config_enroot_disabled_on_non_graphic_instances' do - only_if { !os_properties.on_docker? && !['yes', true].include?(node['cluster']['nvidia']['enabled']) } + only_if { !os_properties.on_docker? 
&& !['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } describe 'enroot service should be disabled' do subject { command("enroot version") } diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb index 6ee542a651..15ddf1c512 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb @@ -11,7 +11,7 @@ control 'tag:install_nvidia_dcgm_installed' do only_if do - ['yes', true].include?(node['cluster']['nvidia']['enabled']) && !instance.custom_ami? && + ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !instance.custom_ami? && (!os_properties.arm? || !(os_properties.alinux2? || os_properties.centos?)) end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb index 4b3564af59..242ce90e5f 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and limitations under the License. control 'tag:install_expected_versions_of_nvidia_fabric_manager_installed' do - only_if { !os_properties.arm? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) } + only_if { !os_properties.arm? 
&& ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } describe package(node['cluster']['nvidia']['fabricmanager']['package']) do it { should be_installed } diff --git a/cookbooks/aws-parallelcluster-slurm/test/controls/pyxis_spec.rb b/cookbooks/aws-parallelcluster-slurm/test/controls/pyxis_spec.rb index d63f77de56..0cc7e8697e 100644 --- a/cookbooks/aws-parallelcluster-slurm/test/controls/pyxis_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/test/controls/pyxis_spec.rb @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and limitations under the License. control 'tag:install_pyxis_installed' do - only_if { ['yes', true].include?(node['cluster']['nvidia']['enabled']) } + only_if { ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } title 'Checks Pyxis has been installed'