diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dc45d1d90..ecc904e311 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Disable unused services like cups and wpa_supplicant from Official ParallelCluster AMIs to improve security. **CHANGES** +- Upgrade NVIDIA driver to version 570.86.15 (from 550.127.08) for all OSs except AL2. +- Upgrade CUDA Toolkit to version 12.8.0 (from 12.4.1) for all OSs except AL2. - On Ubuntu 22.04, install the Nvidia driver with the same compiler version used to compile the kernel. - Upgrade `aws-cfn-bootstrap` to version 2.0-32. - Upgrade amazon-efs-utils to version 2.1.0. diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index 14639a219d..b0febf2560 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -16,8 +16,11 @@ # NVidia default['cluster']['nvidia']['enabled'] = 'no' -default['cluster']['nvidia']['driver_version'] = '550.127.08' +default['cluster']['nvidia']['driver_version'] = '570.86.15' default['cluster']['nvidia']['dcgm_version'] = '3.3.6' +if platform?('amazon') && node['platform_version'] == "2" + default['cluster']['nvidia']['driver_version'] = '550.127.08' +end # DCV default['cluster']['dcv']['authenticator']['user'] = "dcvextauth" diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb index 152968a057..3c7ba588bb 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb @@ -19,13 +19,20 @@ # Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive # Cuda installer naming: cuda_11.8.0_520.61.05_linux -cuda_version = '12.4' -cuda_patch = '1' +cuda_version = '12.8' +cuda_patch = '0' cuda_complete_version = "#{cuda_version}.#{cuda_patch}" -cuda_version_suffix = '550.54.15' +cuda_version_suffix = '570.86.10' +cuda_samples_version = '12.8' +if platform?('amazon') && node['platform_version'] == "2" + cuda_version = '12.4' + cuda_patch = '1' + cuda_complete_version = "#{cuda_version}.#{cuda_patch}" + cuda_version_suffix = '550.54.15' + cuda_samples_version = '12.4' +end cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux' cuda_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" -cuda_samples_version = '12.4' cuda_samples_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz" tmp_cuda_run = '/tmp/cuda.run' tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz' diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb index 393c9fb312..e47bec4ecd 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb @@ -20,7 +20,7 @@ use 'partial/_fabric_manager_install_debian.rb' def fabric_manager_package - 'nvidia-fabricmanager-550' + 'nvidia-fabricmanager-570' end def fabric_manager_version diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb index 5e241d8955..297d1ae932 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb @@ -1,10 +1,10 @@ require 'spec_helper' describe 'aws-parallelcluster-platform::cuda' do - cached(:cuda_version) { '12.4' } - cached(:cuda_patch) { '1' } + cached(:cuda_version) { '12.8' } + cached(:cuda_patch) { '0' } cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" } - cached(:cuda_version_suffix) { '550.54.15' } + cached(:cuda_version_suffix) { '570.86.10' } context 'when nvidia not enabled' do cached(:chef_run) do @@ -20,7 +20,7 @@ context 'when on arm' do cached(:cuda_arch) { 'linux_sbsa' } cached(:cuda_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" } - cached(:cuda_samples_version) { '12.4' } + cached(:cuda_samples_version) { '12.8' } cached(:cuda_samples_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz" } cached(:chef_run) do diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb index f4b409a60c..11df489b8a 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb @@ -168,7 +168,7 @@ def self.configure(chef_run) for_all_oses do |platform, version| context "on #{platform}#{version}" do - cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-550' : 'nvidia-fabric-manager' } + cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-570' : 'nvidia-fabric-manager' } cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version } context 'when fabric manager is to install' do