From 1cf81976c7384a53e2840f0a6e6a5432f5e9820d Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Tue, 25 Jul 2023 10:15:32 -0400 Subject: [PATCH 01/13] Bump version to 3.7.0b1 (#2384) --- cookbooks/aws-parallelcluster-shared/attributes/versions.rb | 6 +++--- kitchen.ec2.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb index 0369ce18c9..6d0318a7be 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb @@ -2,7 +2,7 @@ default['cluster']['python-version'] = '3.9.16' # ParallelCluster versions -default['cluster']['parallelcluster-version'] = '3.7.0' -default['cluster']['parallelcluster-cookbook-version'] = '3.7.0' -default['cluster']['parallelcluster-node-version'] = '3.7.0' +default['cluster']['parallelcluster-version'] = '3.7.0b1' +default['cluster']['parallelcluster-cookbook-version'] = '3.7.0b1' +default['cluster']['parallelcluster-node-version'] = '3.7.0b1' default['cluster']['parallelcluster-awsbatch-cli-version'] = '1.1.0' diff --git a/kitchen.ec2.yml b/kitchen.ec2.yml index e122487376..da76beccb6 100644 --- a/kitchen.ec2.yml +++ b/kitchen.ec2.yml @@ -1,5 +1,5 @@ <% - pcluster_version = ENV['KITCHEN_PCLUSTER_VERSION'] || '3.7.0' + pcluster_version = ENV['KITCHEN_PCLUSTER_VERSION'] || '3.7.0b1' pcluster_prefix = "aws-parallelcluster-#{pcluster_version}" %> --- From 08918dbf509cc797889277130a12cdf6a94fd551 Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Tue, 25 Jul 2023 11:42:33 -0400 Subject: [PATCH 02/13] Use the license dir variable in the EULA reference for the armpl modulefile template (#2386) --- .../templates/arm_pl/armpl_modulefile.erb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-platform/templates/arm_pl/armpl_modulefile.erb b/cookbooks/aws-parallelcluster-platform/templates/arm_pl/armpl_modulefile.erb index 75d459c99b..8fa637f048 100644 --- a/cookbooks/aws-parallelcluster-platform/templates/arm_pl/armpl_modulefile.erb +++ b/cookbooks/aws-parallelcluster-platform/templates/arm_pl/armpl_modulefile.erb @@ -44,5 +44,5 @@ puts stderr "At compile time add '-I' and at link time" # EULA if [ module-info mode load ] { - puts stderr "Use of the free of charge version of Arm Performance Libraries is subject to the terms and conditions of the Arm Performance Libraries (free version) - End User License Agreement (EULA). A copy of the EULA can be found in the '$root/arm-performance-libraries_${major_minor_version}_gcc-${gcc_version}/license_terms' folder" + puts stderr "Use of the free of charge version of Arm Performance Libraries is subject to the terms and conditions of the Arm Performance Libraries (free version) - End User License Agreement (EULA). A copy of the EULA can be found in the '<%= @armpl_license_dir %>' folder" } From 4143edd5adc347e6d203bbfc3fe9c540c0eb5809 Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Tue, 25 Jul 2023 16:00:35 -0400 Subject: [PATCH 03/13] Add retry to fix sporadic nfs service failure (#2387) This failure causes cluster creation failure. It happened about once every 200 times. Signed-off-by: Hanwen Co-authored-by: Hanwen --- .../resources/nfs/partial/_configure.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cookbooks/aws-parallelcluster-environment/resources/nfs/partial/_configure.rb b/cookbooks/aws-parallelcluster-environment/resources/nfs/partial/_configure.rb index bdc054a249..785604be1f 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/nfs/partial/_configure.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/nfs/partial/_configure.rb @@ -24,6 +24,8 @@ service node['nfs']['service']['server'] do action %i(restart enable) supports restart: true + retries 5 + retry_delay 10 end unless on_docker? else service node['nfs']['service']['server'] do From a4ce69b7110ca631d0753bddfef73f6e5e441680 Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Wed, 2 Aug 2023 12:26:05 -0400 Subject: [PATCH 04/13] Set --no-cc-version-check flag to build on ubuntu 22.04 (#2389) The gcc compiler used to build the kernel on Ubuntu 22.04 images we use is 11.3, but the gcc apt package seems to install 11.4 for use in the terminal. This flag will prevent the NVIDIA driver from failing to install due to the minor version difference in gcc. --- .../resources/nvidia_driver/partial/_nvidia_driver_common.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb index bba7b62377..63e2788901 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb @@ -61,13 +61,14 @@ end # Install driver + # TODO remove --no-cc-version-check when we can update ubuntu 22 images bash 'nvidia.run advanced' do user 'root' group 'root' cwd '/tmp' code <<-NVIDIA set -e - ./nvidia.run --silent --dkms --disable-nouveau + ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check rm -f /tmp/nvidia.run NVIDIA creates '/usr/bin/nvidia-smi' From 6881d0fbec9a71aea4b2cbef1362612ea30bc8cf Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Thu, 3 Aug 2023 12:03:46 -0400 Subject: [PATCH 05/13] Move the NFS config file override from the setup action to configure via override_server_template (#2391) The override_server_template function runs during bootstrapping to set the parameter needed for the OS specific config file. The setup action works locally since that action runs during local tests, but does not work during bootstrapping when the node attributes from the setup action are not present --- .../resources/nfs/nfs_ubuntu22+.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-environment/resources/nfs/nfs_ubuntu22+.rb b/cookbooks/aws-parallelcluster-environment/resources/nfs/nfs_ubuntu22+.rb index 53929336f3..792b0fec18 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/nfs/nfs_ubuntu22+.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/nfs/nfs_ubuntu22+.rb @@ -29,11 +29,11 @@ action_install_nfs action_install_nfs4 action_disable_start_at_boot - node.default['nfs']['config']['server_template'] = '/etc/nfs.conf.d/parallelcluster-nfs.conf' end action_class do def override_server_template + node.default['nfs']['config']['server_template'] = '/etc/nfs.conf.d/parallelcluster-nfs.conf' edit_resource(:template, node['nfs']['config']['server_template']) do source 'nfs/nfs-ubuntu22+.conf.erb' cookbook 'aws-parallelcluster-environment' From ead3ee1ec44664f00a9af3fec5a5e3838b7635dd Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Thu, 3 Aug 2023 13:44:53 -0400 Subject: [PATCH 06/13] Move ulimit check to install phase and switch user to reflect changes to the config (#2393) Until now ulimit -Sn was used in the config step to confirm the changes in the install step are reflected in the running OS. This is no longer working in Ubuntu 22 because for whatever reason, the session that is created for Ubuntu 22 doesn't have the changes reflected, even though the files are in place. After launching an instance from the AMI however, the changes are there. To improve this test, we need to move it to the install step and check it immediately by switching users and forcing the loading of the limits. --- .../kitchen.platform-install.yml | 2 +- .../test/controls/users_spec.rb | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/kitchen.platform-install.yml b/cookbooks/aws-parallelcluster-platform/kitchen.platform-install.yml index 9ebefc9dd2..8bc608fc58 100644 --- a/cookbooks/aws-parallelcluster-platform/kitchen.platform-install.yml +++ b/cookbooks/aws-parallelcluster-platform/kitchen.platform-install.yml @@ -308,4 +308,4 @@ suites: - recipe[aws-parallelcluster-platform::users] verifier: controls: - - /tag:install_users/ \ No newline at end of file + - /tag:install_users/ diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/users_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/users_spec.rb index 4742b6ca02..a51e8a1775 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/users_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/users_spec.rb @@ -18,6 +18,9 @@ describe limits_conf("/etc/security/limits.d/00_all_limits.conf") do its('*') { should include ['-', 'nofile', "10000"] } end + describe bash("sudo -u #{user} bash -c 'ulimit -Sn'") do + its('stdout') { should cmp >= '8192' } + end end control 'tag:config_admin_user_and_group_correctly_defined' do @@ -33,11 +36,3 @@ its('gid') { should eq node['cluster']['cluster_admin_group_id'] } end end - -control 'tag:config_ulimit_is_not_lower_than_8192' do - only_if { !instance.custom_ami? } - - describe bash("ulimit -Sn") do - its('stdout') { should cmp >= '8192' } - end -end From c16dd420993874c27ace0921d99729f524dc7225 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Thu, 3 Aug 2023 10:48:50 -0700 Subject: [PATCH 07/13] Upgrade EFA to 1.25.0 The hash of the installer can be retrieved here https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-verify.html Signed-off-by: Hanwen --- CHANGELOG.md | 8 ++++---- .../resources/efa/partial/_common.rb | 4 ++-- .../spec/unit/resources/efa_spec.rb | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 486e88ad53..08cccf04dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,13 +19,13 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Create a Slurm partition-nodelist mapping JSON file to be used by the node package daemons to recognize PC-managed Slurm partitions and nodelists. - Upgrade NVIDIA driver to version 470.199.02. - Increase EFS-utils watchdog poll interval to 10 seconds. Note: This change is meaningful only if [EncryptionInTransit](https://docs.aws.amazon.com/parallelcluster/latest/ug/SharedStorage-v3.html#yaml-SharedStorage-EfsSettings-EncryptionInTransit) is set to `true`, because watchdog does not run otherwise. -- Upgrade EFA installer to `1.24.0` - - Efa-driver: `efa-2.4.1-1` +- Upgrade EFA installer to `1.25.0` + - Efa-driver: `efa-2.5.0-1` - Efa-config: `efa-config-1.15-1` - Efa-profile: `efa-profile-1.5-1` - - Libfabric-aws: `libfabric-aws-1.18.0-1` + - Libfabric-aws: `libfabric-aws-1.18.1-1` - Rdma-core: `rdma-core-46.0-1` - - Open MPI: `openmpi40-aws-4.1.5-1` + - Open MPI: `openmpi40-aws-4.1.5-3` - Upgrade Slurm to version 23.02.3. - Upgrade ARM PL to version 23.04.1 for Ubuntu 22.04 only. diff --git a/cookbooks/aws-parallelcluster-environment/resources/efa/partial/_common.rb b/cookbooks/aws-parallelcluster-environment/resources/efa/partial/_common.rb index 71a304695a..4c610fb009 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efa/partial/_common.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efa/partial/_common.rb @@ -17,8 +17,8 @@ # EFA setup: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html # -property :efa_version, String, default: '1.24.0' -property :efa_checksum, String, default: '878623f819a0d9099d76ecd41cf4f569d4c3aac0c9bb7ba9536347c50b6bf88e' +property :efa_version, String, default: '1.25.0' +property :efa_checksum, String, default: '98b7b26ce031a2d6a93de2297cc71b03af647194866369ca53b60d82d45ad342' action :setup do if efa_installed? && !::File.exist?(efa_tarball) diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb index f779b09fd9..5e399b8965 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb @@ -2,8 +2,8 @@ # parallelcluster default source dir defined in attributes source_dir = '/opt/parallelcluster/sources' -efa_version = '1.24.0' -efa_checksum = '878623f819a0d9099d76ecd41cf4f569d4c3aac0c9bb7ba9536347c50b6bf88e' +efa_version = '1.25.0' +efa_checksum = '98b7b26ce031a2d6a93de2297cc71b03af647194866369ca53b60d82d45ad342' class ConvergeEfa def self.setup(chef_run) From 12ed6d5481760ed1872905dbd1f2b23c1816d4eb Mon Sep 17 00:00:00 2001 From: Jacopo De Amicis Date: Mon, 31 Jul 2023 17:22:56 +0200 Subject: [PATCH 08/13] Upgrade slurm to version 23.02.4 Signed-off-by: Jacopo De Amicis --- CHANGELOG.md | 2 +- .../aws-parallelcluster-slurm/attributes/slurm_attributes.rb | 2 +- cookbooks/aws-parallelcluster-slurm/attributes/versions.rb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 08cccf04dd..bbce1d9025 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Libfabric-aws: `libfabric-aws-1.18.1-1` - Rdma-core: `rdma-core-46.0-1` - Open MPI: `openmpi40-aws-4.1.5-3` -- Upgrade Slurm to version 23.02.3. +- Upgrade Slurm to version 23.02.4. - Upgrade ARM PL to version 23.04.1 for Ubuntu 22.04 only. **BUG FIXES** diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb index bd954f9178..b43bc88211 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb @@ -3,7 +3,7 @@ # Slurm attributes shared between install_slurm and configure_slurm_accounting default['cluster']['slurm']['commit'] = '' -default['cluster']['slurm']['sha256'] = 'c41747e4484011cf376d6d4bc73b6c4696cdc0f7db4f64174f111bb9f53fb603' +default['cluster']['slurm']['sha256'] = '7290143a71ce2797d0df3423f08396fd5c0ae4504749ff372d6860b2d6a3a1b0' default['cluster']['slurm']['install_dir'] = '/opt/slurm' default['cluster']['dns_domain'] = nil diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb index 00c7a6a743..178722504c 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb @@ -1,4 +1,4 @@ # Slurm -default['cluster']['slurm']['version'] = '23-02-3-1' +default['cluster']['slurm']['version'] = '23-02-4-1' # Munge default['cluster']['munge']['munge_version'] = '0.5.15' From 9baae48db058d20e28d61740ea196f8e84637362 Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Fri, 4 Aug 2023 09:58:07 -0400 Subject: [PATCH 09/13] [Release-3.7] Move ulimit check back to config since it fails on jenkins (#2400) --- .../test/controls/users_spec.rb | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/users_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/users_spec.rb index a51e8a1775..47374b725d 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/users_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/users_spec.rb @@ -18,9 +18,6 @@ describe limits_conf("/etc/security/limits.d/00_all_limits.conf") do its('*') { should include ['-', 'nofile', "10000"] } end - describe bash("sudo -u #{user} bash -c 'ulimit -Sn'") do - its('stdout') { should cmp >= '8192' } - end end control 'tag:config_admin_user_and_group_correctly_defined' do @@ -36,3 +33,11 @@ its('gid') { should eq node['cluster']['cluster_admin_group_id'] } end end + +control 'tag:config_ulimit_is_not_lower_than_8192' do + only_if { !instance.custom_ami? } + + describe bash("sudo -u #{user} bash -c 'ulimit -Sn'") do + its('stdout') { should cmp >= '8192' } + end +end From b87af3207b31dc29b67ef33ec36639dec2b548c9 Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Fri, 4 Aug 2023 16:33:10 -0400 Subject: [PATCH 10/13] [Release 3.7] update the volume size default to 40 to address issues with Centos 7 free space (#2403) --- kitchen.ec2.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kitchen.ec2.yml b/kitchen.ec2.yml index da76beccb6..9275330ba9 100644 --- a/kitchen.ec2.yml +++ b/kitchen.ec2.yml @@ -89,7 +89,7 @@ platforms: block_device_mappings: - device_name: /dev/xvda ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 35 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp2 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> @@ -115,7 +115,7 @@ platforms: block_device_mappings: - device_name: /dev/sda1 ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 35 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp2 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> @@ -141,7 +141,7 @@ platforms: block_device_mappings: - device_name: /dev/sda1 ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 35 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp2 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> @@ -167,7 +167,7 @@ platforms: block_device_mappings: - device_name: /dev/sda1 ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 35 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp2 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> @@ -193,7 +193,7 @@ platforms: block_device_mappings: - device_name: /dev/sda1 ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 35 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp2 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> From ae692aecc722a0e706cfe8a869065925c6273c28 Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Tue, 8 Aug 2023 16:15:24 -0400 Subject: [PATCH 11/13] [Release 3.7] Add ubuntu 22.04 to dcv connect script and mysql client install check (#2406) --- .../files/dcv/pcluster_dcv_connect.sh | 2 +- .../test/controls/mysql_client_spec.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/files/dcv/pcluster_dcv_connect.sh b/cookbooks/aws-parallelcluster-platform/files/dcv/pcluster_dcv_connect.sh index 415adcbfde..78b2a3dd20 100644 --- a/cookbooks/aws-parallelcluster-platform/files/dcv/pcluster_dcv_connect.sh +++ b/cookbooks/aws-parallelcluster-platform/files/dcv/pcluster_dcv_connect.sh @@ -112,7 +112,7 @@ main() { os=$(< /etc/chef/dna.json jq -r .cluster.base_os) _log "Input parameters: user: ${user}, OS: ${os}, shared_folder_path: ${shared_folder_path}." - if ! [[ "${os}" =~ ^(alinux2|ubuntu2004|centos[7-8]|rhel8)$ ]]; then + if ! [[ "${os}" =~ ^(alinux2|ubuntu2004|ubuntu2204|centos[7-8]|rhel8)$ ]]; then _fail "OS not supported." fi diff --git a/cookbooks/aws-parallelcluster-slurm/test/controls/mysql_client_spec.rb b/cookbooks/aws-parallelcluster-slurm/test/controls/mysql_client_spec.rb index 644aa912c3..35d403e1b8 100644 --- a/cookbooks/aws-parallelcluster-slurm/test/controls/mysql_client_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/test/controls/mysql_client_spec.rb @@ -16,7 +16,7 @@ if os.redhat? mysql_packages.concat %w(mysql-community-client-plugins mysql-community-common mysql-community-devel mysql-community-libs mysql-community-libs-compat) - elsif os_properties.ubuntu2004? + elsif os_properties.ubuntu2004? || os_properties.ubuntu2204? mysql_packages.concat %w(libmysqlclient-dev libmysqlclient21) else describe "unsupported OS" do From 3f7656bd9444efcac1aaf3decb730b05dfafccf3 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 9 Aug 2023 12:01:40 +0200 Subject: [PATCH 12/13] [Fault Tolerance] Add retry with delay to the block to copy Munge key and the blocks to start Chronyd and Munge services. Signed-off-by: Giacomo Marciani --- .../resources/chrony/partial/_chrony_common.rb | 2 ++ cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/cookbooks/aws-parallelcluster-platform/resources/chrony/partial/_chrony_common.rb b/cookbooks/aws-parallelcluster-platform/resources/chrony/partial/_chrony_common.rb index 222ad0d51b..2868bdc05b 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/chrony/partial/_chrony_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/chrony/partial/_chrony_common.rb @@ -39,6 +39,8 @@ supports restart: false reload_command chrony_reload_command action %i(enable start) + retries 5 + retry_delay 10 end unless redhat_on_docker? end diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb index 181296cd75..9dcbb9fa6e 100644 --- a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb +++ b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb @@ -60,6 +60,8 @@ def enable_munge_service service "munge" do supports restart: true action %i(enable start) + retries 5 + retry_delay 10 end end @@ -111,6 +113,8 @@ def setup_munge_compute_node # Enforce correct permission on the key chmod 0600 /etc/munge/munge.key COMPUTE_MUNGE_KEY + retries 5 + retry_delay 10 end enable_munge_service From 618b122f8ada88968dd6065a2fcc7eb29e142ddd Mon Sep 17 00:00:00 2001 From: Himani Deshpande Date: Fri, 28 Jul 2023 14:50:17 -0400 Subject: [PATCH 13/13] Upgrading NVIDIA driver,fabric manager and Cuda to v535 and v12.2 respectively --- CHANGELOG.md | 4 ++- .../attributes/platform.rb | 2 +- .../recipes/install/cuda.rb | 6 ++-- .../fabric_manager_ubuntu20+.rb | 2 +- .../nvidia_driver/nvidia_driver_amazon2.rb | 4 +++ .../partial/_nvidia_driver_common.rb | 6 +++- .../spec/unit/recipes/cuda_spec.rb | 6 ++-- .../unit/resources/fabric_manager_spec.rb | 4 +-- .../spec/unit/resources/nvidia_driver_spec.rb | 34 ++++++++++++------- 9 files changed, 44 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbce1d9025..f9fb97bff3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste **CHANGES** - Assign Slurm dynamic nodes a priority (weight) of 1000 by default. This allows Slurm to prioritize idle static nodes over idle dynamic ones. - Create a Slurm partition-nodelist mapping JSON file to be used by the node package daemons to recognize PC-managed Slurm partitions and nodelists. -- Upgrade NVIDIA driver to version 470.199.02. +- Upgrade NVIDIA driver to version 535.54.03. +- Upgrade CUDA library to version 12.2.0. +- Upgrade NVIDIA Fabric manager to `nvidia-fabricmanager-535` - Increase EFS-utils watchdog poll interval to 10 seconds. Note: This change is meaningful only if [EncryptionInTransit](https://docs.aws.amazon.com/parallelcluster/latest/ug/SharedStorage-v3.html#yaml-SharedStorage-EfsSettings-EncryptionInTransit) is set to `true`, because watchdog does not run otherwise. - Upgrade EFA installer to `1.25.0` - Efa-driver: `efa-2.5.0-1` diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index a740a36fef..bb4e6066ab 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -11,7 +11,7 @@ # NVidia default['cluster']['nvidia']['enabled'] = 'no' -default['cluster']['nvidia']['driver_version'] = '470.199.02' +default['cluster']['nvidia']['driver_version'] = '535.54.03' # DCV default['cluster']['dcv']['authenticator']['user'] = "dcvextauth" diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb index c8a25c0fd1..a1d8ffa5ad 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb @@ -19,13 +19,13 @@ # Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive # Cuda installer naming: cuda_11.8.0_520.61.05_linux -cuda_version = '11.8' +cuda_version = '12.2' cuda_patch = '0' cuda_complete_version = "#{cuda_version}.#{cuda_patch}" -cuda_version_suffix = '520.61.05' +cuda_version_suffix = '535.54.03' cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux' cuda_url = "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" -cuda_samples_version = '11.8' +cuda_samples_version = '12.2' cuda_samples_url = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz" tmp_cuda_run = '/tmp/cuda.run' tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz' diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb index e92db45cda..c01265485d 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb @@ -20,7 +20,7 @@ use 'partial/_fabric_manager_install_debian.rb' def fabric_manager_package - 'nvidia-fabricmanager-470' + 'nvidia-fabricmanager-535' end def fabric_manager_version diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_amazon2.rb index f42dff44ec..69067a8b09 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_amazon2.rb @@ -20,3 +20,7 @@ def set_compiler? # Amazon linux 2 with Kernel 5 need to set CC to /usr/bin/gcc10-gcc using dkms override node['kernel']['release'].split('.')[0].to_i == 5 end + +def compiler_version + 'CC=/usr/bin/gcc10-gcc' +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb index 63e2788901..84b7308be9 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb @@ -68,7 +68,7 @@ cwd '/tmp' code <<-NVIDIA set -e - ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check + #{compiler_version} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check rm -f /tmp/nvidia.run NVIDIA creates '/usr/bin/nvidia-smi' @@ -103,3 +103,7 @@ def rebuild_initramfs? def set_compiler? false end + +def compiler_version + "" +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb index 71bba02fde..7505e655de 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb @@ -1,10 +1,10 @@ require 'spec_helper' describe 'aws-parallelcluster-platform::cuda' do - cached(:cuda_version) { '11.8' } + cached(:cuda_version) { '12.2' } cached(:cuda_patch) { '0' } cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" } - cached(:cuda_version_suffix) { '520.61.05' } + cached(:cuda_version_suffix) { '535.54.03' } context 'when nvidia not enabled' do cached(:chef_run) do @@ -20,7 +20,7 @@ context 'when on arm' do cached(:cuda_arch) { 'linux_sbsa' } cached(:cuda_url) { "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" } - cached(:cuda_samples_version) { '11.8' } + cached(:cuda_samples_version) { '12.2' } cached(:cuda_samples_url) { "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz" } cached(:chef_run) do diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb index dc225f3b57..eb186fc76a 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb @@ -167,7 +167,7 @@ def self.configure(chef_run) for_all_oses do |platform, version| context "on #{platform}#{version}" do - cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-470' : 'nvidia-fabric-manager' } + cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' } cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version } context 'when fabric manager is to install' do @@ -218,7 +218,7 @@ def self.configure(chef_run) for_all_oses do |platform, version| context "on #{platform}#{version}" do - cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-470' : 'nvidia-fabric-manager' } + cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' } cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version } context('when nvswithes are > 1') do diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb index 3e23631c94..84d0eba0a9 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb @@ -203,22 +203,32 @@ def self.setup(chef_run, nvidia_driver_version: nil) mode: '0644' ) end + it 'installs nvidia driver' do + is_expected.to run_bash('nvidia.run advanced') + .with( + user: 'root', + group: 'root', + cwd: '/tmp', + creates: '/usr/bin/nvidia-smi' + ) + .with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check}) + .with_code(%r{rm -f /tmp/nvidia.run}) + end else it "doesn't install gcc10" do is_expected.not_to install_package('gcc10') end - end - - it 'installs nvidia driver' do - is_expected.to run_bash('nvidia.run advanced') - .with( - user: 'root', - group: 'root', - cwd: '/tmp', - creates: '/usr/bin/nvidia-smi' - ) - .with_code(%r{./nvidia.run --silent --dkms --disable-nouveau}) - .with_code(%r{rm -f /tmp/nvidia.run}) + it 'installs nvidia driver' do + is_expected.to run_bash('nvidia.run advanced') + .with( + user: 'root', + group: 'root', + cwd: '/tmp', + creates: '/usr/bin/nvidia-smi' + ) + .with_code(%r{./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check}) + .with_code(%r{rm -f /tmp/nvidia.run}) + end end if platform == 'ubuntu'