Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
end

# nvidia-imex
default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex"
default['cluster']['nvidia']['imex']['force_configuration'] = false

# NVIDIA NVLSM
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,51 @@
return unless nvidia_enabled_or_installed?
return if on_docker? || imex_installed? || aws_region.start_with?("us-iso")

directory node['cluster']['nvidia']['imex']['shared_dir']

action_install_imex

# Create Imex configuration files
action_create_configuration_files
# Save Imex version in Node Attributes for InSpec Tests
node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version
node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package
node_attributes 'dump node attributes'
end

action :create_configuration_files do
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[minor] What about calling the action install_configuration_files? I think it would be good practice to name actions that are part of the install phase install_SOMETHING and actions called in the configuration phase configure_SOMETHING.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer to keep "create" in the action name, since the action of the template resource is create. Even though we use it in the install phase of our recipes, we are creating these configuration files.

# We create or update IMEX configuration files if ParallelCluster is installing IMEX
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[minor] this comment is correct, but it makes more sense to write it where create_configuration_files is called.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's "create or update" because, as per the Chef template resource documentation (https://docs.chef.io/resources/template/), the create action is:

(default) Create a file. If a file already exists (but does not match), update that file to match.

template nvidia_imex_nodes_conf_file do
source 'nvidia-imex/nvidia-imex-nodes.erb'
owner 'root'
group 'root'
mode '0755'
action :create
end

template nvidia_imex_main_conf_file do
source 'nvidia-imex/nvidia-imex-config.erb'
owner 'root'
group 'root'
mode '0755'
action :create
variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file)
end

# Keep the nvidia-imex.service file in /etc/systemd/system so that the ParallelCluster-configured service file takes precedence.
template "/etc/systemd/system/#{nvidia_imex_service}.service" do
source 'nvidia-imex/nvidia-imex.service.erb'
owner 'root'
group 'root'
mode '0644'
action :create
variables(imex_main_config_file_path: nvidia_imex_main_conf_file)
end
end

action :configure do
return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet"
# Start nvidia-imex on p6e-gb200 and only on ComputeFleet
if is_gb200_node? || enable_force_configuration?
# For each Compute Resource, we generate a unique NVIDIA IMEX configuration file,
# if one doesn't already exist in a common, shared location.
# Create the file if this is missing otherwise Imex service will not start
template nvidia_imex_nodes_conf_file do
source 'nvidia-imex/nvidia-imex-nodes.erb'
owner 'root'
Expand All @@ -42,24 +72,6 @@
action :create_if_missing
end

template nvidia_imex_main_conf_file do
source 'nvidia-imex/nvidia-imex-config.erb'
owner 'root'
group 'root'
mode '0755'
action :create_if_missing
variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file)
end

template "/etc/systemd/system/#{nvidia_imex_service}.service" do
source 'nvidia-imex/nvidia-imex.service.erb'
owner 'root'
group 'root'
mode '0644'
action :create
variables(imex_main_config_file_path: nvidia_imex_main_conf_file)
end

service nvidia_imex_service do
action %i(enable start)
supports status: true
Expand Down Expand Up @@ -92,11 +104,11 @@ def nvidia_enabled_or_installed?
end

def nvidia_imex_main_conf_file
"#{node['cluster']['nvidia']['imex']['shared_dir']}/config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg"
"/etc/nvidia-imex/config.cfg"
end

def nvidia_imex_nodes_conf_file
"#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg"
"/etc/nvidia-imex/nodes_config.cfg"
end

def enable_force_configuration?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

nvidia_version = "1.2.3"
SOURCE_DIR = 'SOURCE_DIR'.freeze
nvidia_imex_shared_dir = "SHARED_DIR/nvidia-imex"
nvidia_imex_dir = "/etc/nvidia-imex"
imex_main_conf_file = "#{nvidia_imex_dir}/config.cfg"
imex_nodes_conf_file = "#{nvidia_imex_dir}/nodes_config.cfg"
imex_service_file = "/etc/systemd/system/nvidia-imex.service"
imex_binary = '/usr/bin/nvidia-imex'
imex_ctl_binary = '/usr/bin/nvidia-imex-ctl'
queue_name = 'queue-name'
compute_resource_name = 'compute-resource-name'
cluster_artifacts_s3_url = 'https://aws_region-aws-parallelcluster.s3.aws_region.AWS_DOMAIN'

class ConvergeNvidiaImex
Expand All @@ -18,6 +19,14 @@ def self.install(chef_run)
end
end

# Converges a single nvidia_imex resource running only the
# :create_configuration_files action, so that action can be tested in isolation.
#
# @param chef_run the runner to converge (the specs pass the result of `runner(...)`)
# @return the value returned by `converge_dsl`, which the specs use as `chef_run`
def self.create_configuration_files(chef_run)
chef_run.converge_dsl('aws-parallelcluster-platform') do
nvidia_imex 'create_configuration_files' do
action :create_configuration_files
end
end
end

def self.configure(chef_run)
chef_run.converge_dsl('aws-parallelcluster-platform') do
nvidia_imex 'configure' do
Expand Down Expand Up @@ -231,7 +240,6 @@ def self.configure(chef_run)
cached(:node) { chef_run.node }

before do
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
chef_run.node.override['cluster']['artifacts_s3_url'] = cluster_artifacts_s3_url
chef_run.node.override['cluster']['region'] = 'aws_region'
chef_run.node.override['cluster']['sources_dir'] = SOURCE_DIR
Expand All @@ -241,7 +249,6 @@ def self.configure(chef_run)
end
if platform == 'amazon' && version == '2'
it 'does not install nvidia-imex' do
is_expected.not_to create_directory(nvidia_imex_shared_dir)
is_expected.not_to install_install_packages('Install nvidia-imex')
.with(packages: "#{nvidia_imex_name}")
.with(action: %i(install))
Expand All @@ -254,7 +261,6 @@ def self.configure(chef_run)
else

it 'installs nvidia-imex' do
is_expected.to create_directory(nvidia_imex_shared_dir)
if platform == 'ubuntu'
is_expected.to create_if_missing_remote_file("#{SOURCE_DIR}/#{nvidia_imex_package}-#{nvidia_imex_version}.deb").with(
source: "#{cluster_artifacts_s3_url}/dependencies/nvidia_imex/#{url_suffix}.deb",
Expand Down Expand Up @@ -294,6 +300,38 @@ def self.configure(chef_run)
end
end

# Checks that the :create_configuration_files action renders the IMEX nodes
# file, the IMEX main configuration file and the systemd unit file, for each
# platform/version pair yielded by for_all_oses.
describe 'nvidia_imex:create_configuration_files' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        ConvergeNvidiaImex.create_configuration_files(
          runner(platform: platform, version: version, step_into: ['nvidia_imex'])
        )
      end
      cached(:node) { chef_run.node }

      it 'does create Imex configuration files' do
        # Nodes file: rendered without template variables.
        is_expected.to create_template(imex_nodes_conf_file).with(
          source: 'nvidia-imex/nvidia-imex-nodes.erb',
          user: 'root',
          group: 'root',
          mode: '0755'
        )

        # Main config: must point at the nodes file rendered above.
        is_expected.to create_template(imex_main_conf_file).with(
          source: 'nvidia-imex/nvidia-imex-config.erb',
          user: 'root',
          group: 'root',
          mode: '0755',
          variables: { imex_nodes_config_file_path: imex_nodes_conf_file }
        )

        # Service unit: must point at the main config rendered above.
        is_expected.to create_template(imex_service_file).with(
          source: 'nvidia-imex/nvidia-imex.service.erb',
          user: 'root',
          group: 'root',
          mode: '0644',
          variables: { imex_main_config_file_path: imex_main_conf_file }
        )
      end
    end
  end
end

describe 'nvidia_imex:configure' do
[%w(false), [false], %w(no), %w(true), [true], %w(yes)].each do |force_indicator|
for_all_oses do |platform, version|
Expand Down Expand Up @@ -329,54 +367,27 @@ def self.configure(chef_run)
before do
chef_run.node.override['cluster']['region'] = 'aws_region'
chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
chef_run.node.override['cluster']['node_type'] = node_type
chef_run.node.override['cluster']['scheduler_queue_name'] = queue_name
chef_run.node.override['cluster']['scheduler_compute_resource_name'] = compute_resource_name

ConvergeNvidiaImex.configure(chef_run)
end

if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type)
it 'does not configure nvidia-imex' do
is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg")
is_expected.not_to create_if_missing_template("#{imex_nodes_conf_file}")
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0755')
is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg")
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0755')
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" })
is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service")
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0644')
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" })
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
end
else
it 'it starts nvidia-imex service' do
is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg")
is_expected.to create_if_missing_template("#{imex_nodes_conf_file}")
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0755')
is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg")
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0755')
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" })
is_expected.to create_template("/etc/systemd/system/nvidia-imex.service")
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0644')
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" })
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
end
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
its('owner') { should eq 'root' }
its('group') { should eq 'root' }
its('mode') { should cmp '0644' }
its('content') { should match %r{ExecStart=/usr/bin/nvidia-imex -c #{node['cluster']['nvidia']['imex']['shared_dir']}} }
its('content') { should match %r{ExecStart=/usr/bin/nvidia-imex -c /etc/nvidia-imex/config.cfg} }
end

describe service('nvidia-imex') do
Expand Down
Loading