-
Notifications
You must be signed in to change notification settings - Fork 107
[Nvidia-Imex] Installing Nvidia-imex as part of ParallelCluster Build Image #2996
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e98def2
5f03bbb
024ba4e
d53bf01
d925ce2
a46954b
ff89ace
cdce37f
6726f3e
368fbee
9f09451
f743ef8
7505635
4bfca5b
dc8e30a
2190669
4c7291d
c336054
dd7e0ef
f9c324f
ee601b7
f40e936
876fccb
caa3c91
c78a8d4
9bed1c0
8efff28
2efe7bd
95e1228
329166a
abc3f1f
9f711b8
f68542e
f9a9aed
d43d640
b6544a2
1251c23
c9c81cd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# frozen_string_literal: true | ||
|
||
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"). | ||
# You may not use this file except in compliance with the License. | ||
# A copy of the License is located at | ||
# | ||
# http://aws.amazon.com/apache2.0/ | ||
# | ||
# or in the "LICENSE.txt" file accompanying this file. | ||
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. | ||
# See the License for the specific language governing permissions and limitations under the License. | ||
|
||
# Register this resource under the :nvidia_imex name for the 'amazon' platform; | ||
# the block narrows the match to nodes whose platform_version integer part is 2023. | ||
provides :nvidia_imex, platform: 'amazon' do |node| | ||
node['platform_version'].to_i == 2023 | ||
end | ||
|
||
# Load two resource partials by relative path (common logic plus the RHEL-family variant). | ||
use 'partial/_nvidia_imex_common.rb' | ||
use 'partial/_nvidia_imex_rhel.rb' | ||
|
||
# Returns "amzn" followed by the integer part of node['platform_version']. | ||
# NOTE(review): presumably interpolated into the package download URL by the RHEL partial, which is not visible here - confirm. | ||
def platform | ||
"amzn#{node['platform_version'].to_i}" | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# frozen_string_literal: true | ||
|
||
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"). | ||
# You may not use this file except in compliance with the License. | ||
# A copy of the License is located at | ||
# | ||
# http://aws.amazon.com/apache2.0/ | ||
# | ||
# or in the "LICENSE.txt" file accompanying this file. | ||
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. | ||
# See the License for the specific language governing permissions and limitations under the License. | ||
|
||
# Register this resource under the :nvidia_imex name for the 'amazon' platform with platform_version '2'. | ||
provides :nvidia_imex, platform: 'amazon', platform_version: '2' | ||
|
||
# Load two resource partials by relative path (common logic plus the RHEL-family variant). | ||
use 'partial/_nvidia_imex_common.rb' | ||
use 'partial/_nvidia_imex_rhel.rb' | ||
|
||
# Unconditionally returns true on this platform; no filesystem check is performed. | ||
# NOTE(review): this looks like a deliberate way to make callers that bail out when | ||
# imex_installed? is true skip the installation here - confirm against the common partial. | ||
def imex_installed? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not a blocker but this function name is misleading. |
||
# We do not install NVIDIA-Imex for Alinux2 due to restriction on NVIDIA driver | ||
true | ||
end | ||
|
||
# :configure action for this platform: the body contains no resources, so running it changes nothing. | ||
action :configure do | ||
# Do nothing | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# frozen_string_literal: true | ||
|
||
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"). | ||
# You may not use this file except in compliance with the License. | ||
# A copy of the License is located at | ||
# | ||
# http://aws.amazon.com/apache2.0/ | ||
# | ||
# or in the "LICENSE.txt" file accompanying this file. | ||
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. | ||
# See the License for the specific language governing permissions and limitations under the License. | ||
|
||
# Register this resource under the :nvidia_imex name for the 'redhat' platform; | ||
# the block narrows the match to nodes whose platform_version integer part is 8 or greater. | ||
provides :nvidia_imex, platform: 'redhat' do |node| | ||
node['platform_version'].to_i >= 8 | ||
end | ||
|
||
# Load two resource partials by relative path (common logic plus the RHEL-family variant). | ||
use 'partial/_nvidia_imex_common.rb' | ||
use 'partial/_nvidia_imex_rhel.rb' | ||
|
||
# Returns "rhel" followed by the integer part of node['platform_version']. | ||
# NOTE(review): presumably interpolated into the package download URL by the RHEL partial, which is not visible here - confirm. | ||
def platform | ||
"rhel#{node['platform_version'].to_i}" | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# frozen_string_literal: true | ||
|
||
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"). | ||
# You may not use this file except in compliance with the License. | ||
# A copy of the License is located at | ||
# | ||
# http://aws.amazon.com/apache2.0/ | ||
# | ||
# or in the "LICENSE.txt" file accompanying this file. | ||
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. | ||
# See the License for the specific language governing permissions and limitations under the License. | ||
|
||
# Register this resource under the :nvidia_imex name for the 'rocky' platform; | ||
# the block narrows the match to nodes whose platform_version integer part is 8 or greater. | ||
provides :nvidia_imex, platform: 'rocky' do |node| | ||
node['platform_version'].to_i >= 8 | ||
end | ||
|
||
# Load two resource partials by relative path (common logic plus the RHEL-family variant). | ||
use 'partial/_nvidia_imex_common.rb' | ||
use 'partial/_nvidia_imex_rhel.rb' | ||
|
||
# Returns "rhel" followed by the integer part of node['platform_version']; | ||
# this 'rocky' provider uses the same "rhel" prefix as the 'redhat' one. | ||
def platform | ||
"rhel#{node['platform_version'].to_i}" | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# frozen_string_literal: true | ||
|
||
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"). | ||
# You may not use this file except in compliance with the License. | ||
# A copy of the License is located at | ||
# | ||
# http://aws.amazon.com/apache2.0/ | ||
# | ||
# or in the "LICENSE.txt" file accompanying this file. | ||
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. | ||
# See the License for the specific language governing permissions and limitations under the License. | ||
|
||
# Register this resource under the :nvidia_imex name for the 'ubuntu' platform; | ||
# the block narrows the match to nodes whose platform_version integer part is 22 or greater. | ||
provides :nvidia_imex, platform: 'ubuntu' do |node| | ||
node['platform_version'].to_i >= 22 | ||
end | ||
|
||
# Load two resource partials by relative path (common logic plus the Debian-family variant). | ||
use 'partial/_nvidia_imex_common.rb' | ||
use 'partial/_nvidia_imex_debian.rb' | ||
|
||
# Returns "ubuntu" followed by node['platform_version'] with every '.' removed | ||
# (a version string of "22.04" yields "ubuntu2204"). | ||
def platform | ||
"ubuntu#{node['platform_version'].delete('.')}" | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
# frozen_string_literal: true | ||
# | ||
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"). | ||
# You may not use this file except in compliance with the License. | ||
# A copy of the License is located at | ||
# | ||
# http://aws.amazon.com/apache2.0/ | ||
# | ||
# or in the "LICENSE.txt" file accompanying this file. | ||
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. | ||
# See the License for the specific language governing permissions and limitations under the License. | ||
|
||
# Enable Chef unified mode for this resource and make :install the action used when none is specified. | ||
unified_mode true | ||
default_action :install | ||
|
||
# :install - creates the IMEX shared directory, runs the install_imex action and records | ||
# the installed version/package in node attributes. | ||
action :install do | ||
# Skip entirely when NVIDIA is neither enabled nor installed. | ||
return unless nvidia_enabled_or_installed? | ||
# Also skip on Docker, when IMEX is already reported as installed, or in regions whose name starts with "us-iso". | ||
return if on_docker? || imex_installed? || aws_region.start_with?("us-iso") | ||
|
||
directory node['cluster']['nvidia']['imex']['shared_dir'] | ||
|
||
# NOTE(review): expected to invoke the :install_imex action supplied by the per-platform partial - confirm. | ||
action_install_imex | ||
# Save Imex version in Node Attributes for InSpec Tests | ||
node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version | ||
node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package | ||
# NOTE(review): node_attributes is a resource defined outside this file; presumably persists the attributes set above - confirm. | ||
node_attributes 'dump node attributes' | ||
end | ||
|
||
# :configure - when the guards below pass, renders the two IMEX config files and the systemd unit, | ||
# then enables and starts the service. | ||
action :configure do | ||
# Nothing to do unless IMEX is installed and this node's node_type is "ComputeFleet". | ||
return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet" | ||
# Start nvidia-imex on p6e-gb200 and only on ComputeFleet | ||
# NOTE(review): get_nvswitch_count and get_device_ids are defined outside this file; the gate requires a count greater than 1. | ||
if get_nvswitch_count(get_device_ids['gb200']) > 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In general, it is safer to:
|
||
# For each Compute Resource, we generate a unique NVIDIA IMEX configuration file, | ||
# if one doesn't already exist in a common, shared location. | ||
# The not_if guard skips rendering when the file exists and are_queues_updated? is false. | ||
template nvidia_imex_nodes_conf_file do | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The overall design assumes the imex nodes config file to be shared in the cluster. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lets have an offline discussion as there would be blockers on the Job wide deployment model There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As discussed we will keep the exiting changes and we can later make changes as per naming convention or whichever is easier for the design of Custom Actions we recommend |
||
source 'nvidia-imex/nvidia-imex-nodes.erb' | ||
owner 'root' | ||
group 'root' | ||
# NOTE(review): 0755 marks this config file executable, while the unit file below uses 0644 - confirm intent. | ||
mode '0755' | ||
action :create | ||
not_if { file_exists_and_cluster_update?(nvidia_imex_nodes_conf_file) } | ||
end | ||
|
||
# Main IMEX config; the template receives the path of the nodes config file rendered above. | ||
template nvidia_imex_main_conf_file do | ||
source 'nvidia-imex/nvidia-imex-config.erb' | ||
owner 'root' | ||
group 'root' | ||
mode '0755' | ||
action :create | ||
not_if { file_exists_and_cluster_update?(nvidia_imex_main_conf_file) } | ||
variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file) | ||
end | ||
|
||
# systemd unit for the service; unlike the two config files it has no not_if guard. | ||
template "/etc/systemd/system/#{nvidia_imex_service}.service" do | ||
source 'nvidia-imex/nvidia-imex.service.erb' | ||
owner 'root' | ||
group 'root' | ||
mode '0644' | ||
action :create | ||
variables(imex_main_config_file_path: nvidia_imex_main_conf_file) | ||
end | ||
|
||
service nvidia_imex_service do | ||
action %i(enable start) | ||
supports status: true | ||
end | ||
end | ||
end | ||
|
||
# Package name: the service name and the driver major version joined by '-'. | ||
def nvidia_imex_package | ||
"#{nvidia_imex_service}-#{nvidia_driver_major_version}" | ||
end | ||
|
||
# First dot-separated component of node['cluster']['nvidia']['driver_version']. | ||
def nvidia_driver_major_version | ||
node['cluster']['nvidia']['driver_version'].split('.')[0] | ||
end | ||
|
||
# Base name 'nvidia-imex'; in this file it is reused for the package name prefix, | ||
# the systemd unit file name, the service resource and the /usr/bin binary checks. | ||
def nvidia_imex_service | ||
'nvidia-imex' | ||
end | ||
|
||
# Returns node['cluster']['nvidia']['driver_version'] with the literal suffix "-1" appended. | ||
def nvidia_imex_full_version | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Misleading function name: the function is expected to return the imex full version, but it does contain the imex versio. suffix, as the full imex version is made of ${nvidia_driver_major_version}-${nvidia_driver_version}-1 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The full version is not necessarily the one you refer. When we do a apt/dnf list installed packages we see the |
||
"#{node['cluster']['nvidia']['driver_version']}-1" | ||
end | ||
|
||
# True when either /usr/bin/nvidia-imex or /usr/bin/nvidia-imex-ctl exists on disk. | ||
def imex_installed? | ||
::File.exist?("/usr/bin/#{nvidia_imex_service}") || ::File.exist?("/usr/bin/#{nvidia_imex_service}-ctl") | ||
end | ||
|
||
# True when either nvidia_enabled? or nvidia_installed? is truthy (both helpers are defined outside this file). | ||
def nvidia_enabled_or_installed? | ||
himani2411 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
nvidia_enabled? || nvidia_installed? | ||
end | ||
|
||
# True when file_path exists AND are_queues_updated? is false; used as the not_if guard on the config templates. | ||
# NOTE(review): the name suggests "a cluster update is in progress", but the second operand is negated - confirm the intended meaning. | ||
def file_exists_and_cluster_update?(file_path) | ||
::File.exist?(file_path) && !are_queues_updated? | ||
end | ||
|
||
# Path of the main IMEX config: <shared_dir>/config_<launch_template_id>.cfg, both read from node['cluster']. | ||
def nvidia_imex_main_conf_file | ||
"#{node['cluster']['nvidia']['imex']['shared_dir']}/config_#{node['cluster']['launch_template_id']}.cfg" | ||
end | ||
|
||
# Path of the IMEX nodes config: <shared_dir>/nodes_config_<launch_template_id>.cfg, both read from node['cluster']. | ||
def nvidia_imex_nodes_conf_file | ||
"#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['launch_template_id']}.cfg" | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# frozen_string_literal: true | ||
# | ||
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"). | ||
# You may not use this file except in compliance with the License. | ||
# A copy of the License is located at | ||
# | ||
# http://aws.amazon.com/apache2.0/ | ||
# | ||
# or in the "LICENSE.txt" file accompanying this file. | ||
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. | ||
# See the License for the specific language governing permissions and limitations under the License. | ||
|
||
# :install_imex - downloads the IMEX .deb into the sources directory, installs it with dpkg | ||
# and places the package on hold with apt-mark. | ||
action :install_imex do | ||
# Local file name joins package and version with '-'; the remote URL (nvidia_imex_url) uses '_' and an arch suffix. | ||
# The download is skipped if the local file already exists (:create_if_missing). | ||
remote_file "#{node['cluster']['sources_dir']}/#{nvidia_imex_package}-#{nvidia_imex_full_version}.deb" do | ||
source "#{nvidia_imex_url}" | ||
mode '0644' | ||
retries 3 | ||
retry_delay 5 | ||
action :create_if_missing | ||
end | ||
|
||
# Runs as root from the sources directory so the .deb can be referenced by its bare file name. | ||
bash "Install nvidia-imex" do | ||
user 'root' | ||
cwd node['cluster']['sources_dir'] | ||
code <<-NVIDIA_IMEX | ||
set -e | ||
dpkg -i #{nvidia_imex_package}-#{nvidia_imex_full_version}.deb && apt-mark hold #{nvidia_imex_package} | ||
NVIDIA_IMEX | ||
retries 3 | ||
retry_delay 5 | ||
end | ||
end | ||
|
||
# Download URL for the .deb: | ||
# <artifacts_s3_url>/dependencies/nvidia_imex/<platform>/<package>_<full_version>_<arch_suffix>.deb | ||
def nvidia_imex_url | ||
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_imex/#{platform}/#{nvidia_imex_package}_#{nvidia_imex_full_version}_#{arch_suffix}.deb" | ||
end | ||
|
||
# Architecture suffix for the .deb file name: 'arm64' when arm_instance? is truthy, otherwise 'amd64'. | ||
def arch_suffix | ||
arm_instance? ? 'arm64' : 'amd64' | ||
end |
Uh oh!
There was an error while loading. Please reload this page.