Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ This file is used to list changes made in each version of the AWS ParallelCluste
- Disable unused services like cups and wpa_supplicant from Official ParallelCluster AMIs to improve security.

**CHANGES**
- Upgrade NVIDIA driver to version 570.86.15 (from 550.127.08) for all OSs except AL2.
- Upgrade CUDA Toolkit to version 12.8.0 (from 12.4.1) for all OSs except AL2.
- On Ubuntu 22.04, install the NVIDIA driver with the same compiler version used to compile the kernel.
- Upgrade `aws-cfn-bootstrap` to version 2.0-32.
- Upgrade amazon-efs-utils to version 2.1.0.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@

# NVidia
# Default attributes for the NVIDIA stack: enable flag, driver version and DCGM version.
default['cluster']['nvidia']['enabled'] = 'no'
# NOTE(review): two consecutive driver_version assignments are shown here; this looks like the
# before/after pair of a rendered diff (550.127.08 -> 570.86.15) — confirm against the real file.
# Read as plain Ruby, the later assignment (570.86.15) is the one that takes effect.
default['cluster']['nvidia']['driver_version'] = '550.127.08'
default['cluster']['nvidia']['driver_version'] = '570.86.15'
default['cluster']['nvidia']['dcgm_version'] = '3.3.6'
# Amazon Linux 2 (platform 'amazon', platform_version "2") is kept on the 550.127.08 driver.
if platform?('amazon') && node['platform_version'] == "2"
default['cluster']['nvidia']['driver_version'] = '550.127.08'
end

# DCV
default['cluster']['dcv']['authenticator']['user'] = "dcvextauth"
Expand Down
15 changes: 11 additions & 4 deletions cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,20 @@

# Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive
# Cuda installer naming: cuda_11.8.0_520.61.05_linux
# i.e. cuda_<complete version>_<version suffix>_<arch>; the locals below supply those parts.
# NOTE(review): each of cuda_version, cuda_patch and cuda_version_suffix is assigned twice in a row;
# this looks like the before/after pair of a rendered diff (12.4.1 -> 12.8.0) — confirm against the real file.
# Read as plain Ruby, the later value of each pair is the one that takes effect.
cuda_version = '12.4'
cuda_patch = '1'
cuda_version = '12.8'
cuda_patch = '0'
# <major.minor>.<patch>, e.g. "12.8.0"
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
# The suffix is the version string embedded in the installer file name; it is not required to
# equal the installed driver version (default['cluster']['nvidia']['driver_version']).
cuda_version_suffix = '550.54.15'
cuda_version_suffix = '570.86.10'
Copy link
Contributor

@gmarciani gmarciani Feb 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be 570.86.15?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, because CUDA 12.8 supports any driver >= 570.26, and the CUDA version suffix (the one in the installer file name) is 570.86.10.
Likewise, our previous driver version was 550.127.08 while the CUDA 12.4 suffix was 550.54.15.

# Version of the CUDA samples archive to download.
cuda_samples_version = '12.8'
# Amazon Linux 2 (platform 'amazon', platform_version "2") is pinned to CUDA 12.4.1:
# every version-related local is overridden, and cuda_complete_version is rebuilt from the new parts.
if platform?('amazon') && node['platform_version'] == "2"
cuda_version = '12.4'
cuda_patch = '1'
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
cuda_version_suffix = '550.54.15'
cuda_samples_version = '12.4'
end
# Installer architecture tag: 'linux_sbsa' when arm_instance? is true, 'linux' otherwise.
cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux'
# Installer location under the cluster artifacts bucket: cuda_<complete version>_<suffix>_<arch>.run
cuda_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run"
# NOTE(review): this unconditional assignment comes after both the '12.8' default and the AL2 override
# above; if it is live code it forces the samples version to '12.4' on every platform. It appears to be
# the removed line of a rendered diff — confirm against the real file.
cuda_samples_version = '12.4'
# Samples archive location under the cluster artifacts bucket: samples/v<samples version>.tar.gz
cuda_samples_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz"
# Local temporary paths for the installer and the samples archive.
tmp_cuda_run = '/tmp/cuda.run'
tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
use 'partial/_fabric_manager_install_debian.rb'

# Returns the name of the fabric manager package to install.
# NOTE(review): two string literals are shown in the body; this looks like the before/after pair of a
# rendered diff (-550 -> -570) — confirm against the real file. Read as plain Ruby, only the last
# expression is returned, i.e. 'nvidia-fabricmanager-570'.
def fabric_manager_package
'nvidia-fabricmanager-550'
'nvidia-fabricmanager-570'
end

def fabric_manager_version
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
require 'spec_helper'

describe 'aws-parallelcluster-platform::cuda' do
cached(:cuda_version) { '12.4' }
cached(:cuda_patch) { '1' }
cached(:cuda_version) { '12.8' }
cached(:cuda_patch) { '0' }
cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" }
cached(:cuda_version_suffix) { '550.54.15' }
cached(:cuda_version_suffix) { '570.86.10' }

context 'when nvidia not enabled' do
cached(:chef_run) do
Expand All @@ -20,7 +20,7 @@
context 'when on arm' do
cached(:cuda_arch) { 'linux_sbsa' }
cached(:cuda_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" }
cached(:cuda_samples_version) { '12.4' }
cached(:cuda_samples_version) { '12.8' }
cached(:cuda_samples_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz" }

cached(:chef_run) do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def self.configure(chef_run)

for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-550' : 'nvidia-fabric-manager' }
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-570' : 'nvidia-fabric-manager' }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version }

context 'when fabric manager is to install' do
Expand Down
Loading