diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index 6566160ec..973f74fb7 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -24,7 +24,6 @@ end # nvidia-imex -default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex" default['cluster']['nvidia']['imex']['force_configuration'] = false # NVIDIA NVLSM diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index fc126e43e..f791eb5f1 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -19,21 +19,51 @@ return unless nvidia_enabled_or_installed? return if on_docker? || imex_installed? || aws_region.start_with?("us-iso") - directory node['cluster']['nvidia']['imex']['shared_dir'] - action_install_imex + + # Create Imex configuration files + action_create_configuration_files # Save Imex version in Node Attributes for InSpec Tests node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package node_attributes 'dump node attributes' end +action :create_configuration_files do + # We create or update IMEX configuration files if ParallelCluster is installing IMEX + template nvidia_imex_nodes_conf_file do + source 'nvidia-imex/nvidia-imex-nodes.erb' + owner 'root' + group 'root' + mode '0755' + action :create + end + + template nvidia_imex_main_conf_file do + source 'nvidia-imex/nvidia-imex-config.erb' + owner 'root' + group 'root' + mode '0755' + action :create + variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file) + end + + # We keep nvidia-imex.service file in this location to give precedence to pcluster configured service file. + template "/etc/systemd/system/#{nvidia_imex_service}.service" do + source 'nvidia-imex/nvidia-imex.service.erb' + owner 'root' + group 'root' + mode '0644' + action :create + variables(imex_main_config_file_path: nvidia_imex_main_conf_file) + end +end + action :configure do return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet" # Start nvidia-imex on p6e-gb200 and only on ComputeFleet if is_gb200_node? || enable_force_configuration? - # For each Compute Resource, we generate a unique NVIDIA IMEX configuration file, - # if one doesn't already exist in a common, shared location. + # Create the file if this is missing otherwise Imex service will not start template nvidia_imex_nodes_conf_file do source 'nvidia-imex/nvidia-imex-nodes.erb' owner 'root' @@ -42,24 +72,6 @@ action :create_if_missing end - template nvidia_imex_main_conf_file do - source 'nvidia-imex/nvidia-imex-config.erb' - owner 'root' - group 'root' - mode '0755' - action :create_if_missing - variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file) - end - - template "/etc/systemd/system/#{nvidia_imex_service}.service" do - source 'nvidia-imex/nvidia-imex.service.erb' - owner 'root' - group 'root' - mode '0644' - action :create - variables(imex_main_config_file_path: nvidia_imex_main_conf_file) - end - service nvidia_imex_service do action %i(enable start) supports status: true @@ -92,11 +104,11 @@ def nvidia_enabled_or_installed? end def nvidia_imex_main_conf_file - "#{node['cluster']['nvidia']['imex']['shared_dir']}/config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg" + "/etc/nvidia-imex/config.cfg" end def nvidia_imex_nodes_conf_file - "#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg" + "/etc/nvidia-imex/nodes_config.cfg" end def enable_force_configuration? diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index 0985bffdb..44da50615 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -2,11 +2,12 @@ nvidia_version = "1.2.3" SOURCE_DIR = 'SOURCE_DIR'.freeze -nvidia_imex_shared_dir = "SHARED_DIR/nvidia-imex" +nvidia_imex_dir = "/etc/nvidia-imex" +imex_main_conf_file = "#{nvidia_imex_dir}/config.cfg" +imex_nodes_conf_file = "#{nvidia_imex_dir}/nodes_config.cfg" +imex_service_file = "/etc/systemd/system/nvidia-imex.service" imex_binary = '/usr/bin/nvidia-imex' imex_ctl_binary = '/usr/bin/nvidia-imex-ctl' -queue_name = 'queue-name' -compute_resource_name = 'compute-resource-name' cluster_artifacts_s3_url = 'https://aws_region-aws-parallelcluster.s3.aws_region.AWS_DOMAIN' class ConvergeNvidiaImex @@ -18,6 +19,14 @@ def self.install(chef_run) end end + def self.create_configuration_files(chef_run) + chef_run.converge_dsl('aws-parallelcluster-platform') do + nvidia_imex 'create_configuration_files' do + action :create_configuration_files + end + end + end + def self.configure(chef_run) chef_run.converge_dsl('aws-parallelcluster-platform') do nvidia_imex 'configure' do @@ -231,7 +240,6 @@ def self.configure(chef_run) cached(:node) { chef_run.node } before do - chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir chef_run.node.override['cluster']['artifacts_s3_url'] = cluster_artifacts_s3_url chef_run.node.override['cluster']['region'] = 'aws_region' chef_run.node.override['cluster']['sources_dir'] = SOURCE_DIR @@ -241,7 +249,6 @@ def self.configure(chef_run) end if platform == 'amazon' && version == '2' it 'does not install nvidia-imex' do - is_expected.not_to create_directory(nvidia_imex_shared_dir) is_expected.not_to install_install_packages('Install nvidia-imex') .with(packages: "#{nvidia_imex_name}") .with(action: %i(install)) @@ -254,7 +261,6 @@ def self.configure(chef_run) else it 'installs nvidia-imex' do - is_expected.to create_directory(nvidia_imex_shared_dir) if platform == 'ubuntu' is_expected.to create_if_missing_remote_file("#{SOURCE_DIR}/#{nvidia_imex_package}-#{nvidia_imex_version}.deb").with( source: "#{cluster_artifacts_s3_url}/dependencies/nvidia_imex/#{url_suffix}.deb", @@ -294,6 +300,38 @@ def self.configure(chef_run) end end +describe 'nvidia_imex:create_configuration_files' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + cached(:chef_run) do + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.create_configuration_files(runner) + end + cached(:node) { chef_run.node } + + it 'does create Imex configuration files' do + is_expected.to create_template("#{imex_nodes_conf_file}") + .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.to create_template("#{imex_main_conf_file}") + .with(source: 'nvidia-imex/nvidia-imex-config.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + .with(variables: { imex_nodes_config_file_path: "#{imex_nodes_conf_file}" }) + is_expected.to create_template(imex_service_file) + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0644') + .with(variables: { imex_main_config_file_path: "#{imex_main_conf_file}" }) + end + end + end +end + describe 'nvidia_imex:configure' do [%w(false), [false], %w(no), %w(true), [true], %w(yes)].each do |force_indicator| for_all_oses do |platform, version| @@ -329,54 +367,27 @@ def self.configure(chef_run) before do chef_run.node.override['cluster']['region'] = 'aws_region' chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator - chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir chef_run.node.override['cluster']['node_type'] = node_type - chef_run.node.override['cluster']['scheduler_queue_name'] = queue_name - chef_run.node.override['cluster']['scheduler_compute_resource_name'] = compute_resource_name ConvergeNvidiaImex.configure(chef_run) end if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type) it 'does not configure nvidia-imex' do - is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg") + is_expected.not_to create_if_missing_template("#{imex_nodes_conf_file}") .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') .with(user: 'root') .with(group: 'root') .with(mode: '0755') - is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg") - .with(source: 'nvidia-imex/nvidia-imex-config.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" }) - is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service") - .with(source: 'nvidia-imex/nvidia-imex.service.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0644') - .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" }) is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) end else it 'it starts nvidia-imex service' do - is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg") + is_expected.to create_if_missing_template("#{imex_nodes_conf_file}") .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') .with(user: 'root') .with(group: 'root') .with(mode: '0755') - is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg") - .with(source: 'nvidia-imex/nvidia-imex-config.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" }) - is_expected.to create_template("/etc/systemd/system/nvidia-imex.service") - .with(source: 'nvidia-imex/nvidia-imex.service.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0644') - .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" }) is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) end end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb index b3524db81..36a1c714a 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb @@ -36,7 +36,7 @@ its('owner') { should eq 'root' } its('group') { should eq 'root' } its('mode') { should cmp '0644' } - its('content') { should match %r{ExecStart=/usr/bin/nvidia-imex -c #{node['cluster']['nvidia']['imex']['shared_dir']}} } + its('content') { should match %r{ExecStart=/usr/bin/nvidia-imex -c /etc/nvidia-imex/config.cfg} } end describe service('nvidia-imex') do