diff --git a/CHANGELOG.md b/CHANGELOG.md index 12231c2bdf..3f172094ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,14 @@ aws-parallelcluster-cookbook CHANGELOG This file is used to list changes made in each version of the AWS ParallelCluster cookbook. +3.14.1 +------ + +**CHANGES** +- Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes + and achieve better performance at scale. +- Load kernel module `drm_client_lib` before installation of NVIDIA driver, if available on the kernel. + 3.14.0 ------ diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb index 5cfa7ea906..791e7a8311 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb @@ -72,6 +72,13 @@ end end + # Load kernel modules in best effort + kernel_modules_to_load.each do |km| + execute "Load kernel module if exposed by the kernel: #{km}" do + command "if modinfo #{km}; then modprobe #{km}; fi" + end + end + # Install driver bash 'nvidia.run advanced' do user 'root' @@ -126,3 +133,7 @@ def nvidia_kernel_module "kernel-open" end end + +def kernel_modules_to_load + %w(drm_client_lib) +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb index 4d0b8b57ca..ad71e58a4b 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb @@ -140,11 +140,27 @@ def self.setup(chef_run, nvidia_driver_version: nil) end end +describe 'nvidia_driver:kernel_modules_to_load' do + cached(:chef_run) do + 
ChefSpec::SoloRunner.new(step_into: ['nvidia_driver']) + end + + cached(:resource) do + ConvergeNvidiaDriver.setup(chef_run) + chef_run.find_resource('nvidia_driver', 'setup') + end + + it 'returns expected kernel modules' do + expect(resource.kernel_modules_to_load).to eq(%w(drm_client_lib)) + end +end + describe 'nvidia_driver:setup' do for_all_oses do |platform, version| cached(:nvidia_arch) { 'nvidia_arch' } cached(:nvidia_kernel_module) { 'nvidia_kernel_module' } cached(:nvidia_driver_version) { 'nvidia_driver_version' } + cached(:kernel_modules_to_load) { %w(module1 module2) } cached(:nvidia_driver_url) { "https://us.download.nvidia.com/tesla/#{nvidia_driver_version}/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" } context "on #{platform}#{version} when nvidia_driver not enabled" do @@ -176,6 +192,7 @@ def self.setup(chef_run, nvidia_driver_version: nil) allow(res).to receive(:nvidia_arch).and_return(nvidia_arch) allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module) allow(res).to receive(:gcc_major_version_used_by_kernel).and_return(kernel_compiler_version) + allow(res).to receive(:kernel_modules_to_load).and_return(kernel_modules_to_load) end stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true) @@ -220,6 +237,14 @@ def self.setup(chef_run, nvidia_driver_version: nil) ) end + it 'loads kernel modules if they are exposed by the kernel' do + kernel_modules_to_load.each do |km| + is_expected.to run_execute("Load kernel module if exposed by the kernel: #{km}").with( + command: "if modinfo #{km}; then modprobe #{km}; fi" + ) + end + end + if platform == 'amazon' compiler_version = version == '2023' ? 'gcc' : 'gcc10' compiler_path = version == '2023' ? 'CC=/usr/bin/gcc' : 'CC=/usr/bin/gcc10-gcc'