Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
**CHANGES**
- Assign Slurm dynamic nodes a priority (weight) of 1000 by default. This allows Slurm to prioritize idle static nodes over idle dynamic ones.
- Create a Slurm partition-nodelist mapping JSON file to be used by the node package daemons to recognize PC-managed Slurm partitions and nodelists.
- Upgrade NVIDIA driver to version 470.199.02.
- Upgrade NVIDIA driver to version 535.54.03.
- Upgrade CUDA library to version 12.2.0.
- Upgrade NVIDIA Fabric Manager to `nvidia-fabricmanager-535`.
- Increase EFS-utils watchdog poll interval to 10 seconds. Note: This change is meaningful only if [EncryptionInTransit](https://docs.aws.amazon.com/parallelcluster/latest/ug/SharedStorage-v3.html#yaml-SharedStorage-EfsSettings-EncryptionInTransit) is set to `true`, because watchdog does not run otherwise.
- Upgrade EFA installer to `1.25.0`
- Efa-driver: `efa-2.5.0-1`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

# NVidia
# NVIDIA support defaults to 'no'.
default['cluster']['nvidia']['enabled'] = 'no'
# NVIDIA driver version. Assigned exactly once: a second assignment to the same
# attribute key would silently override this value.
default['cluster']['nvidia']['driver_version'] = '535.54.03'

# DCV
default['cluster']['dcv']['authenticator']['user'] = "dcvextauth"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@

# Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive
# Cuda installer naming: cuda_<version>.<patch>_<version suffix>_<arch>.run
# e.g. cuda_12.2.0_535.54.03_linux.run
cuda_version = '12.2'
cuda_patch = '0'
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
# Suffix that follows the complete version in the installer file name.
cuda_version_suffix = '535.54.03'
# The installer file name carries 'linux_sbsa' when arm_instance? is true, 'linux' otherwise.
cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux'
cuda_url = "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run"
# Tag of the NVIDIA/cuda-samples GitHub archive to download.
cuda_samples_version = '12.2'
cuda_samples_url = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz"
tmp_cuda_run = '/tmp/cuda.run'
tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
use 'partial/_fabric_manager_install_debian.rb'

# Name of the NVIDIA Fabric Manager package.
#
# The package name carries the driver branch (535).
#
# @return [String] the package name
def fabric_manager_package
  'nvidia-fabricmanager-535'
end

def fabric_manager_version
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,7 @@ def set_compiler?
# Amazon Linux 2 with kernel 5 needs CC set to /usr/bin/gcc10-gcc using a dkms override
node['kernel']['release'].split('.')[0].to_i == 5
end

# Shell environment assignment that points CC at gcc10.
#
# Despite the method name, the value is a full `CC=<path>` assignment rather
# than a version number.
#
# @return [String] the `CC=...` assignment
def compiler_version
  gcc_binary = '/usr/bin/gcc10-gcc'
  "CC=#{gcc_binary}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
cwd '/tmp'
code <<-NVIDIA
set -e
./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check
#{compiler_version} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check
rm -f /tmp/nvidia.run
NVIDIA
creates '/usr/bin/nvidia-smi'
Expand Down Expand Up @@ -103,3 +103,7 @@ def rebuild_initramfs?
# Whether a compiler override has to be configured.
#
# This implementation always answers false.
# NOTE(review): presumably platform-specific resources redefine this method
# when an override is needed; confirm against the callers.
#
# @return [Boolean] always false
def set_compiler?
false
end

# Compiler prefix for the driver installer command line.
#
# This implementation returns an empty string, so it adds no compiler
# assignment wherever it is interpolated.
#
# @return [String] an empty string
def compiler_version
  ''
end
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
require 'spec_helper'

describe 'aws-parallelcluster-platform::cuda' do
cached(:cuda_version) { '11.8' }
cached(:cuda_version) { '12.2' }
cached(:cuda_patch) { '0' }
cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" }
cached(:cuda_version_suffix) { '520.61.05' }
cached(:cuda_version_suffix) { '535.54.03' }

context 'when nvidia not enabled' do
cached(:chef_run) do
Expand All @@ -20,7 +20,7 @@
context 'when on arm' do
cached(:cuda_arch) { 'linux_sbsa' }
cached(:cuda_url) { "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" }
cached(:cuda_samples_version) { '11.8' }
cached(:cuda_samples_version) { '12.2' }
cached(:cuda_samples_url) { "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz" }

cached(:chef_run) do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def self.configure(chef_run)

for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-470' : 'nvidia-fabric-manager' }
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version }

context 'when fabric manager is to install' do
Expand Down Expand Up @@ -218,7 +218,7 @@ def self.configure(chef_run)

for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-470' : 'nvidia-fabric-manager' }
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version }

context('when nvswithes are > 1') do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,22 +203,32 @@ def self.setup(chef_run, nvidia_driver_version: nil)
mode: '0644'
)
end
it 'installs nvidia driver' do
is_expected.to run_bash('nvidia.run advanced')
.with(
user: 'root',
group: 'root',
cwd: '/tmp',
creates: '/usr/bin/nvidia-smi'
)
.with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check})
.with_code(%r{rm -f /tmp/nvidia.run})
end
else
it "doesn't install gcc10" do
is_expected.not_to install_package('gcc10')
end
end

it 'installs nvidia driver' do
is_expected.to run_bash('nvidia.run advanced')
.with(
user: 'root',
group: 'root',
cwd: '/tmp',
creates: '/usr/bin/nvidia-smi'
)
.with_code(%r{./nvidia.run --silent --dkms --disable-nouveau})
.with_code(%r{rm -f /tmp/nvidia.run})
it 'installs nvidia driver' do
is_expected.to run_bash('nvidia.run advanced')
.with(
user: 'root',
group: 'root',
cwd: '/tmp',
creates: '/usr/bin/nvidia-smi'
)
.with_code(%r{./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check})
.with_code(%r{rm -f /tmp/nvidia.run})
end
end

if platform == 'ubuntu'
Expand Down