From c6c5a4a0533924ca45e67a3cca953071bdf23514 Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Thu, 13 Jun 2024 06:33:13 -0400 Subject: [PATCH 1/6] Retrieve dependencies from s3 bucket rather than from the open-internet --- .../recipes/awsbatch_virtualenv.rb | 1 - .../recipes/install.rb | 25 ++++++++++- .../install/custom_parallelcluster_node.rb | 6 +++ .../recipes/install/parallelcluster_node.rb | 25 +++++++++-- .../recipes/install/cfn_bootstrap.rb | 22 +++++++++- .../recipes/install/awscli.rb | 2 +- .../recipes/install/cookbook_virtualenv.rb | 27 +++++++++++- .../recipes/install/cuda.rb | 4 +- .../recipes/install/intel_mpi.rb | 2 +- .../arm_pl/partial/_arm_pl_common.rb | 8 ++-- .../resources/dcv/partial/_dcv_common.rb | 2 +- .../fabric_manager_alinux2023.rb | 4 ++ .../fabric_manager/fabric_manager_amazon2.rb | 4 ++ .../fabric_manager/fabric_manager_centos7.rb | 4 ++ .../fabric_manager/fabric_manager_redhat8.rb | 4 ++ .../fabric_manager/fabric_manager_rocky8.rb | 4 ++ .../fabric_manager_ubuntu20+.rb | 4 ++ .../partial/_fabric_manager_common.rb | 8 ---- .../partial/_fabric_manager_install_debian.rb | 20 ++++++++- .../partial/_fabric_manager_install_rhel.rb | 11 ++++- .../gdrcopy/partial/_gdrcopy_common.rb | 2 +- .../nvidia_dcgm/nvidia_dcgm_alinux2023.rb | 4 ++ .../nvidia_dcgm/nvidia_dcgm_amazon2.rb | 5 +++ .../nvidia_dcgm/nvidia_dcgm_centos7.rb | 5 +++ .../nvidia_dcgm/nvidia_dcgm_redhat8.rb | 5 +++ .../nvidia_dcgm/nvidia_dcgm_rocky8.rb | 5 +++ .../nvidia_dcgm/nvidia_dcgm_ubuntu20+.rb | 5 ++- .../partial/_nvidia_dcgm_alinux2_centos7.rb | 19 --------- .../partial/_nvidia_dcgm_common.rb | 14 +------ .../partial/_nvidia_dcgm_debian.rb | 42 +++++++++++++++++++ .../partial/_nvidia_dcgm_rhel.rb .rb | 42 +++++++++++++++++++ .../attributes/environment.rb | 4 +- .../attributes/versions.rb | 4 +- .../resources/activate_virtual_env.rb | 33 ++++----------- .../resources/install_pyenv.rb | 15 ++++++- .../attributes/slurm_attributes.rb | 4 +- .../attributes/versions.rb | 8 ++-- 
.../recipes/install/install_jwt.rb | 2 +- .../recipes/install/install_pmix.rb | 2 +- 39 files changed, 308 insertions(+), 99 deletions(-) delete mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_alinux2_centos7.rb create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb .rb diff --git a/cookbooks/aws-parallelcluster-awsbatch/recipes/awsbatch_virtualenv.rb b/cookbooks/aws-parallelcluster-awsbatch/recipes/awsbatch_virtualenv.rb index 936a15edd9..5a702b9782 100644 --- a/cookbooks/aws-parallelcluster-awsbatch/recipes/awsbatch_virtualenv.rb +++ b/cookbooks/aws-parallelcluster-awsbatch/recipes/awsbatch_virtualenv.rb @@ -25,7 +25,6 @@ activate_virtual_env virtualenv_name do pyenv_path virtualenv_path python_version python_version - not_if { ::File.exist?("#{virtualenv_path}/bin/activate") } end node.default['cluster']['awsbatch_virtualenv_path'] = virtualenv_path diff --git a/cookbooks/aws-parallelcluster-awsbatch/recipes/install.rb b/cookbooks/aws-parallelcluster-awsbatch/recipes/install.rb index 5c00e89d06..85e7b3a829 100644 --- a/cookbooks/aws-parallelcluster-awsbatch/recipes/install.rb +++ b/cookbooks/aws-parallelcluster-awsbatch/recipes/install.rb @@ -43,12 +43,33 @@ mkdir aws-parallelcluster-awsbatch-cli tar -xzf aws-parallelcluster.tgz --directory aws-parallelcluster-awsbatch-cli cd aws-parallelcluster-awsbatch-cli/*aws-parallelcluster-* + + aws s3 cp #{node['cluster']['artifacts_build_url']}/PyPi/#{node['kernel']['machine']}/awsbatch-dependencies.tgz awsbatch-dependencies.tgz --region #{node['cluster']['region']} + tar xzf awsbatch-dependencies.tgz + cd awsbatch + #{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install * -f ./ --no-index + cd .. 
+ #{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install awsbatch-cli/ CLI end else # Install aws-parallelcluster-awsbatch-cli package - execute "pip_install_parallelcluster_awsbatch_cli" do - command "#{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install aws-parallelcluster-awsbatch-cli==#{node['cluster']['parallelcluster-awsbatch-cli-version']}" + bash "install aws-parallelcluster-awsbatch-cli" do + cwd Chef::Config[:file_cache_path] + code <<-CLI + set -e + package_url=#{node['cluster']['artifacts_build_url']}/awsbatch/aws-parallelcluster.tgz + aws s3 cp ${package_url} aws-parallelcluster.tgz --region #{node['cluster']['region']} + mkdir aws-parallelcluster-awsbatch-cli + tar -xzf aws-parallelcluster.tgz --directory aws-parallelcluster-awsbatch-cli + aws s3 cp #{node['cluster']['artifacts_build_url']}/PyPi/#{node['kernel']['machine']}/awsbatch-dependencies.tgz awsbatch-dependencies.tgz --region #{node['cluster']['region']} + tar xzf awsbatch-dependencies.tgz + cd awsbatch + #{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install * -f ./ --no-index + cd .. 
+ cd aws-parallelcluster-awsbatch-cli/*aws-parallelcluster-* + #{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install awsbatch-cli/ + CLI end end diff --git a/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb b/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb index d137b1849e..efec81abc4 100644 --- a/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb +++ b/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb @@ -38,6 +38,12 @@ mkdir aws-parallelcluster-custom-node tar -xzf aws-parallelcluster-node.tgz --directory aws-parallelcluster-custom-node cd aws-parallelcluster-custom-node/*aws-parallelcluster-node-* + aws s3 cp #{node['cluster']['artifacts_build_url']}/PyPi/#{node['kernel']['machine']}/node-dependencies.tgz node-dependencies.tgz --region #{node['cluster']['region']} + tar xzf node-dependencies.tgz + cd node + #{node_virtualenv_path}/bin/pip install * -f ./ --no-index + cd .. + pip install . deactivate NODE diff --git a/cookbooks/aws-parallelcluster-computefleet/recipes/install/parallelcluster_node.rb b/cookbooks/aws-parallelcluster-computefleet/recipes/install/parallelcluster_node.rb index 063acb04b4..ae7b70b222 100644 --- a/cookbooks/aws-parallelcluster-computefleet/recipes/install/parallelcluster_node.rb +++ b/cookbooks/aws-parallelcluster-computefleet/recipes/install/parallelcluster_node.rb @@ -36,8 +36,27 @@ if is_custom_node? 
include_recipe 'aws-parallelcluster-computefleet::custom_parallelcluster_node' else - pyenv_pip 'aws-parallelcluster-node' do - version node['cluster']['parallelcluster-node-version'] - virtualenv virtualenv_path + bash "install official aws-parallelcluster-node" do + cwd Chef::Config[:file_cache_path] + code <<-NODE + set -e + [[ ":$PATH:" != *":/usr/local/bin:"* ]] && PATH="/usr/local/bin:${PATH}" + echo "PATH is $PATH" + source #{node_virtualenv_path}/bin/activate + pip uninstall --yes aws-parallelcluster-node + node_url=#{node['cluster']['artifacts_build_url']}/node/aws-parallelcluster-node.tgz + aws s3 cp ${node_url} aws-parallelcluster-node.tgz --region #{node['cluster']['region']} + rm -fr aws-parallelcluster-node + mkdir aws-parallelcluster-node + tar -xzf aws-parallelcluster-node.tgz --directory aws-parallelcluster-node + aws s3 cp #{node['cluster']['artifacts_build_url']}/PyPi/#{node['kernel']['machine']}/node-dependencies.tgz node-dependencies.tgz --region #{node['cluster']['region']} + tar xzf node-dependencies.tgz + cd node + #{node_virtualenv_path}/bin/pip install * -f ./ --no-index + cd .. + cd aws-parallelcluster-node/*aws-parallelcluster-node-* + pip install . 
+ deactivate + NODE end end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb b/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb index 05f0257091..dbaadab117 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb @@ -15,7 +15,7 @@ virtualenv_name = 'cfn_bootstrap_virtualenv' pyenv_root = node['cluster']['system_pyenv_root'] # FIXME: Python Version cfn_bootstrap_virtualenv due to a bug with cfn-hup -python_version = '3.9.19' +python_version = '3.9.17' virtualenv_path = "#{pyenv_root}/versions/#{python_version}/envs/#{virtualenv_name}" node.default['cluster']['cfn_bootstrap_virtualenv_path'] = virtualenv_path @@ -33,6 +33,26 @@ not_if { ::File.exist?("#{virtualenv_path}/bin/activate") } end +remote_file "#{node['cluster']['base_dir']}/cfn-dependencies.tgz" do + source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/cfn-dependencies.tgz" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing +end + +bash 'pip install' do + user 'root' + group 'root' + cwd "#{node['cluster']['base_dir']}" + code <<-REQ + set -e + tar xzf cfn-dependencies.tgz + cd cfn + #{virtualenv_path}/bin/pip install * -f ./ --no-index + REQ +end + cfnbootstrap_version = '2.0-28' cfnbootstrap_package = "aws-cfn-bootstrap-py3-#{cfnbootstrap_version}.tar.gz" diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/awscli.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/awscli.rb index 9f6fb6aacf..99adafcc46 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/awscli.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/awscli.rb @@ -35,5 +35,5 @@ end bash 'install awscli' do - code "#{cookbook_virtualenv_path}/bin/python #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws" + code 
"#{cookbook_virtualenv_path}/bin/python#{node['cluster']['python-major-minor-version']} #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws" end diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb index afca952e0b..02016eeb15 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb @@ -24,6 +24,29 @@ activate_virtual_env cookbook_virtualenv_name do pyenv_path cookbook_virtualenv_path python_version cookbook_python_version - requirements_path "cookbook_virtualenv/requirements.txt" - not_if { ::File.exist?("#{cookbook_virtualenv_path}/bin/activate") } +end + +cookbook_file "#{virtualenv_path}/requirements.txt" do + source "cookbook_virtualenv/requirements.txt" + mode '0755' +end + +remote_file "#{node['cluster']['base_dir']}/cookbook-dependencies.tgz" do + source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/cookbook-dependencies.tgz" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing +end + +bash 'pip install' do + user 'root' + group 'root' + cwd "#{node['cluster']['base_dir']}" + code <<-REQ + set -e + tar xzf cookbook-dependencies.tgz + cd dependencies + #{virtualenv_path}/bin/pip install * -f ./ --no-index + REQ end diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb index ecea9784f0..cca8a2f443 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb @@ -24,9 +24,9 @@ cuda_complete_version = "#{cuda_version}.#{cuda_patch}" cuda_version_suffix = '535.104.05' cuda_arch = arm_instance? ? 
'linux_sbsa' : 'linux' -cuda_url = "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" +cuda_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" cuda_samples_version = '12.2' -cuda_samples_url = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz" +cuda_samples_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz" tmp_cuda_run = '/tmp/cuda.run' tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz' diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb index e7d2bc8983..19068414c8 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb @@ -29,7 +29,7 @@ intelmpi_installation_path = "/opt/intel/mpi/#{intelmpi_version}" intelmpi_installer = "l_mpi_oneapi_p_#{intelmpi_full_version}_offline.sh" intelmpi_installer_path = "#{node['cluster']['sources_dir']}/#{intelmpi_installer}" -intelmpi_installer_url = "https://#{node['cluster']['region']}-aws-parallelcluster.s3.#{node['cluster']['region']}.#{aws_domain}/archives/impi/#{intelmpi_installer}" +intelmpi_installer_url = "#{node['cluster']['artifacts_s3_url']}/impi/#{intelmpi_installer}" intelmpi_qt_version = '6.4.2' # Prerequisite for module install diff --git a/cookbooks/aws-parallelcluster-platform/resources/arm_pl/partial/_arm_pl_common.rb b/cookbooks/aws-parallelcluster-platform/resources/arm_pl/partial/_arm_pl_common.rb index 284d86179f..a91416809d 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/arm_pl/partial/_arm_pl_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/arm_pl/partial/_arm_pl_common.rb @@ -55,8 +55,8 @@ 
armpl_tarball_name = "arm-performance-libraries_#{armpl_version}_#{armpl_platform}_gcc-#{gcc_major_minor_version}.tar" armpl_url = %W( - https://#{new_resource.region}-aws-parallelcluster.s3.#{new_resource.region}.#{new_resource.aws_domain} - archives/armpl/#{armpl_platform} + #{node['cluster']['artifacts_s3_url']} + armpl/#{armpl_platform} #{armpl_tarball_name} ).join('/') @@ -111,7 +111,7 @@ end gcc_version = "#{gcc_major_minor_version}.#{new_resource.gcc_patch_version}" - gcc_url = "https://ftp.gnu.org/gnu/gcc/gcc-#{gcc_version}/gcc-#{gcc_version}.tar.gz" + gcc_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/gcc/gcc-#{gcc_version}.tar.gz" gcc_tarball = "#{new_resource.sources_dir}/gcc-#{gcc_version}.tar.gz" # Get gcc tarball @@ -137,7 +137,7 @@ tar -xf #{gcc_tarball} cd gcc-#{gcc_version} # Patch the download_prerequisites script to download over https and not ftp. This works better in China regions. - sed -i "s#ftp://gcc\.gnu\.org#https://gcc.gnu.org#g" ./contrib/download_prerequisites + sed -i "s#ftp://gcc\.gnu\.org##{node['cluster']['artifacts_s3_url']}/dependencies/gcc/prerequisites#g" ./contrib/download_prerequisites ./contrib/download_prerequisites mkdir build && cd build ../configure --prefix=/opt/arm/armpl/gcc/#{gcc_version} --disable-bootstrap --enable-checking=release --enable-languages=c,c++,fortran --disable-multilib diff --git a/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_dcv_common.rb b/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_dcv_common.rb index 1ff78ac4f0..3f8172ea99 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_dcv_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_dcv_common.rb @@ -272,7 +272,7 @@ def dcv_gpu_accel_supported? 
end def dcv_url - "https://d1uj6qtbmh3dt5.cloudfront.net/#{node['cluster']['dcv']['version'].split('-')[0]}/Servers/#{dcv_package}.tgz" + "#{node['cluster']['artifacts_s3_url']}/dependencies/dcv/#{dcv_package}.tgz" end def dcv_tarball diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_alinux2023.rb index 82412454ed..d13bf55a17 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_alinux2023.rb @@ -26,3 +26,7 @@ def fabric_manager_package def fabric_manager_version _nvidia_driver_version end + +def platform + 'rhel9' +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb index 5856bebff4..d7cb12ac30 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb @@ -24,3 +24,7 @@ def fabric_manager_package def fabric_manager_version _nvidia_driver_version end + +def platform + 'rhel7' +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_centos7.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_centos7.rb index 5fcddd3761..502ccc0124 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_centos7.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_centos7.rb @@ -26,3 +26,7 @@ def fabric_manager_package def fabric_manager_version _nvidia_driver_version end + +def platform + 'rhel7' +end \ No newline at end of file diff 
--git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb index 1eb5216da0..8441a33c1d 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb @@ -26,3 +26,7 @@ def fabric_manager_package def fabric_manager_version _nvidia_driver_version end + +def platform + "rhel#{node['platform_version'].to_i}" +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb index 8d12f10331..5173135505 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb @@ -26,3 +26,7 @@ def fabric_manager_package def fabric_manager_version _nvidia_driver_version end + +def platform + "rhel#{node['platform_version'].to_i}" +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb index c01265485d..aff2f9ca73 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb @@ -26,3 +26,7 @@ def fabric_manager_package def fabric_manager_version "#{_nvidia_driver_version}*" end + +def platform + "ubuntu#{node['platform_version'].delete('.')}" +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb 
b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb index bf1c45750b..62b832e66e 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb @@ -26,16 +26,8 @@ node.default['cluster']['nvidia']['fabricmanager']['version'] = fabric_manager_version node_attributes "dump node attributes" - # Add NVIDIA repo for fabric manager and datacenter-gpu-manager - nvidia_repo 'add nvidia repository' do - action :add - end - action_install_package - nvidia_repo 'remove nvidia repository' do - action :remove - end end action :configure do diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb index 79629f4998..a45141138a 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb @@ -15,9 +15,27 @@ action :install_package do # For ubuntu, CINC17 apt-package resources need full versions for `version` execute "install_fabricmanager_for_ubuntu" do - command "apt -y install #{fabric_manager_package}=#{fabric_manager_version} "\ + bash "Install #{fabric_manager_package}" do + user 'root' + code <<-FABRIC_MANAGER + set -e + aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.deb --region #{node['cluster']['region']} + FABRIC_MANAGER + retries 3 + retry_delay 5 + end + + command "apt -y install #{fabric_manager_package}-#{fabric_manager_version}.deb "\ + "&& apt-mark hold #{fabric_manager_package}" retries 3 retry_delay 5 end end + +def arch_suffix + arm_instance? ? 
'arm64' : 'amd64' +end + +def fabric_manager_url + "#{node['cluster']['artifacts_build_url']}/nvidia_fabric/#{platform}/#{fabric_manager_package}_#{fabric_manager_version}-1_#{arch_suffix}.deb" +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb index 4339622f4b..316da4478d 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb @@ -18,10 +18,19 @@ user 'root' code <<-FABRIC_MANAGER_INSTALL set -e - yum install -y #{fabric_manager_package}-#{fabric_manager_version} + aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.rpm --region #{node['cluster']['region']} + yum install -y #{fabric_manager_package}-#{fabric_manager_version}.rpm yum versionlock #{fabric_manager_package} FABRIC_MANAGER_INSTALL retries 3 retry_delay 5 end end + +def arch_suffix + arm_instance? ? 
'aarch64' : 'x86_64' +end + +def fabric_manager_url + "#{node['cluster']['artifacts_build_url']}/nvidia_fabric/#{platform}/#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm" +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb index db58f83591..bef0644160 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb @@ -112,5 +112,5 @@ def gdrcopy_version_extended end def gdrcopy_url - "https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v#{gdrcopy_version}.tar.gz" + "#{node['cluster']['artifacts_s3_url']}/dependencies/gdr_copy/v#{gdrcopy_version}.tar.gz" end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_alinux2023.rb index 7bda32b49c..4fb4ac156d 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_alinux2023.rb @@ -21,3 +21,7 @@ def _nvidia_dcgm_enabled _nvidia_enabled end + +def platform + 'rhel9' +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb index 261128cb3b..3bde89bb6e 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb @@ -15,7 +15,12 @@ provides :nvidia_dcgm, platform: 'amazon', platform_version: '2' use 'partial/_nvidia_dcgm_common.rb' +use 'partial/_nvidia_dcgm_rhel.rb' def _nvidia_dcgm_enabled 
!arm_instance? && _nvidia_enabled end + +def platform + 'rhel7' +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_centos7.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_centos7.rb index 00d5c18ea7..8dffb4bd04 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_centos7.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_centos7.rb @@ -17,7 +17,12 @@ end use 'partial/_nvidia_dcgm_common.rb' +use 'partial/_nvidia_dcgm_rhel.rb' def _nvidia_dcgm_enabled !arm_instance? && _nvidia_enabled end + +def platform + 'rhel7' +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_redhat8.rb index 88a2e98e71..b6564420eb 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_redhat8.rb @@ -17,7 +17,12 @@ end use 'partial/_nvidia_dcgm_common.rb' +use 'partial/_nvidia_dcgm_rhel.rb' def _nvidia_dcgm_enabled _nvidia_enabled end + +def platform + "rhel#{node['platform_version'].to_i}" +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_rocky8.rb index b56aa2cf5b..e3e5a05523 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_rocky8.rb @@ -17,7 +17,12 @@ end use 'partial/_nvidia_dcgm_common.rb' +use 'partial/_nvidia_dcgm_rhel.rb' def _nvidia_dcgm_enabled _nvidia_enabled end + +def platform + "rhel#{node['platform_version'].to_i}" +end \ No newline at end of file diff --git 
a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_ubuntu20+.rb index 520c655e37..d7fc683531 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_ubuntu20+.rb @@ -17,11 +17,12 @@ end use 'partial/_nvidia_dcgm_common.rb' +use 'partial/_nvidia_dcgm_debian.rb' def _nvidia_dcgm_enabled _nvidia_enabled end -def package_version - "1:#{node['cluster']['nvidia']['dcgm_version']}" # The single digit "1" is epoch version. Without the "1", package install fails because version does not exist. See details here: https://askubuntu.com/questions/441879/why-do-some-packages-have-extra-numbers-before-a-colon-on-the-front-of-their-ver +def platform + "ubuntu#{node['platform_version'].delete('.')}" end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_alinux2_centos7.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_alinux2_centos7.rb deleted file mode 100644 index 8543561c23..0000000000 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_alinux2_centos7.rb +++ /dev/null @@ -1,19 +0,0 @@ -# frozen_string_literal: true -# -# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. -# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. -# See the License for the specific language governing permissions and limitations under the License. - -action :setup do - return if arm_instance? 
|| !_nvidia_enabled - - action_install_package -end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb index b02b5476f9..627e59b515 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb @@ -20,20 +20,8 @@ action :setup do return unless _nvidia_dcgm_enabled - # Add NVIDIA repo for fabric manager and datacenter-gpu-manager - nvidia_repo 'add nvidia repository' do - action :add - end + action_install_package - package 'datacenter-gpu-manager' do - retries 3 - retry_delay 5 - version package_version - end - - nvidia_repo 'remove nvidia repository' do - action :remove - end end def _nvidia_enabled diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb new file mode 100644 index 0000000000..30dde588eb --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb @@ -0,0 +1,42 @@ +# frozen_string_literal: true +# +# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +action :install_package do + bash "Install #{dcgm_package}" do + user 'root' + code <<-DCGM_INSTALL + set -e + aws s3 cp #{dcgm_url} #{dcgm_package}-#{package_version}.deb --region #{node['cluster']['region']} + apt -y install #{dcgm_package}-#{package_version}.deb + DCGM_INSTALL + retries 3 + retry_delay 5 + end +end + +def dcgm_url + "#{node['cluster']['artifacts_build_url']}/nvidia_dcgm/#{platform}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb" +end + +def dcgm_package + 'datacenter-gpu-manager' +end + +def arch_suffix + arm_instance? ? 'arm64' : 'amd64' +end + +def package_version + node['cluster']['nvidia']['dcgm_version'] +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb new file mode 100644 index 0000000000..4d9ca0b365 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb @@ -0,0 +1,42 @@ +# frozen_string_literal: true +# +# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +action :install_package do + bash "Install #{dcgm_package}" do + user 'root' + code <<-DCGM_INSTALL + set -e + aws s3 cp #{dcgm_url} #{dcgm_package}-#{package_version}.rpm --region #{node['cluster']['region']} + yum install -y #{dcgm_package}-#{package_version}.rpm + DCGM_INSTALL + retries 3 + retry_delay 5 + end +end + +def dcgm_url + "#{node['cluster']['artifacts_build_url']}/nvidia_dcgm/#{platform}/#{dcgm_package}-#{package_version}-1-#{arch_suffix}.rpm" +end + +def dcgm_package + 'datacenter-gpu-manager' +end + +def arch_suffix + arm_instance? ? 'aarch64' : 'x86_64' +end + +def package_version + node['cluster']['nvidia']['dcgm_version'] +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-shared/attributes/environment.rb b/cookbooks/aws-parallelcluster-shared/attributes/environment.rb index 6e11dce797..11deeebb3e 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/environment.rb @@ -5,4 +5,6 @@ # URL for ParallelCluster Artifacts stored in public S3 buckets # ['cluster']['region'] will need to be defined by image_dna.json during AMI build. 
-default['cluster']['artifacts_s3_url'] = "https://#{node['cluster']['region']}-aws-parallelcluster.s3.#{node['cluster']['region']}.#{node['cluster']['aws_domain']}/archives" +default['cluster']['base_build_url'] = "s3://aws-parallelcluster-dev-build-dependencies" +default['cluster']['artifacts_s3_url'] = "https://aws-parallelcluster-dev-commercial.s3.#{node['cluster']['aws_domain']}/archives" +default['cluster']['artifacts_build_url'] = "#{node['cluster']['base_build_url']}/archives/dependencies" \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb index c5f3e648ba..bdfe5ae39a 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb @@ -1,5 +1,7 @@ # Python Version -default['cluster']['python-version'] = '3.9.19' +default['cluster']['python-version'] = '3.9.17' +default['cluster']['python-major-minor-version'] = '3.9' + # ParallelCluster versions default['cluster']['parallelcluster-version'] = '3.10.0' diff --git a/cookbooks/aws-parallelcluster-shared/resources/activate_virtual_env.rb b/cookbooks/aws-parallelcluster-shared/resources/activate_virtual_env.rb index 22da62c1c2..32b29cdb14 100644 --- a/cookbooks/aws-parallelcluster-shared/resources/activate_virtual_env.rb +++ b/cookbooks/aws-parallelcluster-shared/resources/activate_virtual_env.rb @@ -15,29 +15,14 @@ default_action :run action :run do - pyenv_script "pyenv virtualenv #{new_resource.pyenv_name}" do - code "pyenv virtualenv #{new_resource.python_version} #{new_resource.pyenv_name}" - user new_resource.user if new_resource.user - end - - pyenv_pip "pip" do - virtualenv new_resource.pyenv_path - user new_resource.user if new_resource.user - action :upgrade - end - - unless new_resource.requirements_path.empty? 
- # Copy requirements file - cookbook_file "#{new_resource.pyenv_path}/requirements.txt" do - source new_resource.requirements_path - mode '0755' - end - - # Install given requirements in the virtual environment - pyenv_pip "#{new_resource.pyenv_path}/requirements.txt" do - virtualenv new_resource.pyenv_path - user new_resource.user if new_resource.user - requirement true - end + bash 'create venv' do + user 'root' + group 'root' + cwd "#{node['cluster']['system_pyenv_root']}" + code <<-VENV + set -e + versions/#{new_resource.python_version}/bin/python#{node['cluster']['python-major-minor-version']} -m venv #{new_resource.pyenv_path} + source #{new_resource.pyenv_path}/bin/activate + VENV end end diff --git a/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb b/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb index 1d9f86bded..0dd3c8daae 100644 --- a/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb +++ b/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb @@ -29,8 +29,19 @@ recursive true end - pyenv_install 'system' do - prefix prefix + bash "install python #{python_version}" do + user 'root' + group 'root' + cwd "#{prefix}" + code <<-VENV + set -e + aws s3 cp #{node['cluster']['artifacts_build_url']}/python/Python-#{python_version}.tgz Python-#{python_version}.tgz --region #{node['cluster']['region']} + tar -xzf Python-#{python_version}.tgz + cd Python-#{python_version} + ./configure --prefix=#{prefix}/versions/#{python_version} + make + make install + VENV end # Remove the profile.d script that the pyenv cookbook writes. 
diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb index 9c5e80350b..b9e97d9f17 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb @@ -13,8 +13,8 @@ default['cluster']['enable_nss_slurm'] = node['cluster']['directory_service']['enabled'] # PMIX Version and Checksum -default['cluster']['pmix']['version'] = '5.0.2' -default['cluster']['pmix']['sha256'] = '133e79c44d426043fa54b80649ecc97607b915ad8c5cc119575a3dd0c4104941' +default['cluster']['pmix']['version'] = '4.2.9' +default['cluster']['pmix']['sha256'] = '00ddb36fb81c31519972079a218c3cdd903510fc3910abaf4d484068fa29e884' # Slurmdbd default['cluster']['slurmdbd_service_enabled'] = "true" diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb index 2bf3947af4..b2671d3d2e 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb @@ -3,8 +3,8 @@ default['cluster']['slurm']['commit'] = '' default['cluster']['slurm']['branch'] = '' default['cluster']['slurm']['sha256'] = 'b25127efd69a47c14bd65dfa3bff2687b5350c5290eafb601f923faebe6fd238' -default['cluster']['slurm']['base_url'] = "https://github.com/SchedMD/slurm/archive" +default['cluster']['slurm']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/slurm" # Munge -default['cluster']['munge']['munge_version'] = '0.5.16' -default['cluster']['munge']['sha256'] = 'fa27205d6d29ce015b0d967df8f3421067d7058878e75d0d5ec3d91f4d32bb57' -default['cluster']['munge']['base_url'] = "https://github.com/dun/munge/archive" +default['cluster']['munge']['munge_version'] = '0.5.15' +default['cluster']['munge']['sha256'] = '51b2c81d1a7ec2ab5d486fa51b50c7e79eb1810ca6687b6ed65f3601abc55614' 
+default['cluster']['munge']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/munge" diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb index d75daf60cb..5dd551857e 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb @@ -16,7 +16,7 @@ # limitations under the License. jwt_version = '1.15.3' -jwt_url = "https://github.com/benmcollins/libjwt/archive/refs/tags/v#{jwt_version}.tar.gz" +jwt_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/jwt/v#{jwt_version}.tar.gz" jwt_tarball = "#{node['cluster']['sources_dir']}/libjwt-#{jwt_version}.tar.gz" jwt_sha256 = 'cb2fd95123689e7d209a3a8c060e02f68341c9a5ded524c0cd881a8cd20d711f' diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_pmix.rb b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_pmix.rb index ad6f2d3c73..6548b203e7 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_pmix.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_pmix.rb @@ -17,7 +17,7 @@ # PMIx software pmix_version = node['cluster']['pmix']['version'] -pmix_url = "https://github.com/openpmix/openpmix/releases/download/v#{pmix_version}/pmix-#{pmix_version}.tar.gz" +pmix_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/pmix/pmix-#{pmix_version}.tar.gz" pmix_sha256 = node['cluster']['pmix']['sha256'] pmix_tarball = "#{node['cluster']['sources_dir']}/pmix-#{pmix_version}.tar.gz" From 232a1f69af0553e00ad6ee23e742d2cb10b86ebf Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Thu, 13 Jun 2024 08:05:41 -0400 Subject: [PATCH 2/6] Modify spec test to expect dependencies to be retrieved from an s3 bucket --- .../spec/unit/recipes/node_spec.rb | 5 +- .../spec/unit/recipes/cfn_bootstrap_spec.rb | 10 ++- .../spec/unit/recipes/awscli_spec.rb | 3 +- 
.../unit/recipes/cookbook_virtualenv_spec.rb | 25 ++++---- .../spec/unit/recipes/cuda_spec.rb | 6 +- .../spec/unit/resources/arm_pl_spec.rb | 6 +- .../spec/unit/resources/dcv_spec.rb | 2 +- .../unit/resources/fabric_manager_spec.rb | 6 +- .../spec/unit/resources/gdrcopy_spec.rb | 2 +- .../spec/unit/resources/nvidia_dcgm_spec.rb | 6 +- .../spec/unit/resources/nvidia_driver_spec.rb | 2 +- .../resources/activate_virtual_env_spec.rb | 36 +++-------- .../spec/unit/resources/install_pyenv_spec.rb | 62 +++++++++++++------ 13 files changed, 90 insertions(+), 81 deletions(-) diff --git a/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/node_spec.rb b/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/node_spec.rb index fd128dfbf6..6de0f9bdda 100644 --- a/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/node_spec.rb +++ b/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/node_spec.rb @@ -36,10 +36,7 @@ end it 'installs official node package' do - is_expected.to install_pyenv_pip('aws-parallelcluster-node').with( - version: node_version, - virtualenv: virtualenv_path - ) + is_expected.to run_bash('install official aws-parallelcluster-node') end end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb index 38be3e2fb5..eda316c16c 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb @@ -5,7 +5,7 @@ context "on #{platform}#{version}" do cached(:cfnbootstrap_version) { '2.0-28' } cached(:cfnbootstrap_package) { "aws-cfn-bootstrap-py3-#{cfnbootstrap_version}.tar.gz" } - cached(:python_version) { '3.9.19' } + cached(:python_version) { '3.9.17' } cached(:system_pyenv_root) { 'system_pyenv_root' } cached(:virtualenv_path) { 
"system_pyenv_root/versions/#{python_version}/envs/cfn_bootstrap_virtualenv" } @@ -30,6 +30,14 @@ ) end + it 'installs python packages' do + is_expected.to run_bash("pip install").with( + user: 'root', + group: 'root', + cwd: "#{node['cluster']['base_dir']}" + ) + end + it 'sets virtualenv path' do expect(node.default['cluster']['cfn_bootstrap_virtualenv_path']).to eq(virtualenv_path) is_expected.to write_node_attributes('dump node attributes') diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/awscli_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/awscli_spec.rb index 4f3c1136ec..58e0bde2a7 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/awscli_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/awscli_spec.rb @@ -31,8 +31,7 @@ it 'installs awscli into cookbook virtualev path' do is_expected.to run_bash('install awscli') - .with_code "#{cookbook_virtualenv_path}/bin/python #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws" - end + .with_code "#{cookbook_virtualenv_path}/bin/python#{node['cluster']['python-major-minor-version']} #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws" end end context "when awscli is not installed" do diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb index 8a4ad068c9..88c42dadff 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb @@ -24,8 +24,7 @@ it 'activates cookbook vistualenv' do is_expected.to run_activate_virtual_env('cookbook_virtualenv').with( pyenv_path: virtualenv_path, - python_version: python_version, - requirements_path: "cookbook_virtualenv/requirements.txt" + python_version: python_version ) end @@ 
-33,20 +32,20 @@ expect(node.default['cluster']['cookbook_virtualenv_path']).to eq(virtualenv_path) is_expected.to write_node_attributes('dump node attributes') end - end - context "when cookbook virtualenv already installed" do - cached(:chef_run) do - runner = runner(platform: platform, version: version) do |node| - node.override['cluster']['system_pyenv_root'] = system_pyenv_root - node.override['cluster']['python-version'] = python_version - end - allow(File).to receive(:exist?).with("#{virtualenv_path}/bin/activate").and_return(true) - runner.converge(described_recipe) + it 'copies requirements file' do + is_expected.to create_cookbook_file("#{virtualenv_path}/requirements.txt").with( + source: "cookbook_virtualenv/requirements.txt", + mode: '0755' + ) end - it 'does not activate cookbook virtualenv' do - is_expected.not_to run_activate_virtual_env('cookbook_virtualenv') + it 'installs python packages' do + is_expected.to run_bash("pip install").with( + user: 'root', + group: 'root', + cwd: "#{node['cluster']['base_dir']}" + ).with_code(/tar xzf cookbook-dependencies.tgz/) end end end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb index add4ced887..2ce426ef84 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb @@ -19,9 +19,9 @@ context 'when on arm' do cached(:cuda_arch) { 'linux_sbsa' } - cached(:cuda_url) { "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" } + cached(:cuda_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" } cached(:cuda_samples_version) { '12.2' } - cached(:cuda_samples_url) { 
"https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz" } + cached(:cuda_samples_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz" } cached(:chef_run) do allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(true) @@ -83,7 +83,7 @@ context 'when not on arm' do cached(:cuda_arch) { 'linux' } - cached(:cuda_url) { "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" } + cached(:cuda_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" } cached(:chef_run) do allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(true) diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/arm_pl_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/arm_pl_spec.rb index 5438d19b37..f258be0a4e 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/arm_pl_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/arm_pl_spec.rb @@ -45,11 +45,11 @@ def self.setup(chef_run) cached(:modulefile_dir) { platform == 'ubuntu' ? 
'/usr/share/modules/modulefiles' : '/usr/share/Modules/modulefiles' } cached(:armpl_version) { "#{armpl_major_minor_version}" } cached(:armpl_tarball_name) { "arm-performance-libraries_#{armpl_version}_#{armpl_platform}_gcc-#{gcc_major_minor_version}.tar" } - cached(:armpl_url) { "https://#{aws_region}-aws-parallelcluster.s3.#{aws_region}.#{aws_domain}/archives/armpl/#{armpl_platform}/#{armpl_tarball_name}" } + cached(:armpl_url) { "https://bucket.s3.amazonaws.com/archives/armpl/#{armpl_platform}/#{armpl_tarball_name}" } cached(:armpl_installer) { "#{sources_dir}/#{armpl_tarball_name}" } cached(:armpl_name) { "arm-performance-libraries_#{armpl_version}_#{armpl_platform}" } cached(:gcc_version) { "#{gcc_major_minor_version}.#{gcc_patch_version}" } - cached(:gcc_url) { "https://ftp.gnu.org/gnu/gcc/gcc-#{gcc_version}/gcc-#{gcc_version}.tar.gz" } + cached(:gcc_url) { "https://bucket.s3.amazonaws.com/archives/dependencies/gcc/gcc-#{gcc_version}.tar.gz" } cached(:gcc_tarball) { "#{sources_dir}/gcc-#{gcc_version}.tar.gz" } cached(:gcc_modulefile) { "/opt/arm/armpl/#{armpl_version}/modulefiles/armpl/gcc-#{gcc_major_minor_version}" } @@ -57,6 +57,7 @@ def self.setup(chef_run) cached(:chef_run) do runner = runner(platform: platform, version: version, step_into: ['arm_pl']) do |node| node.override['conditions']['arm_pl_supported'] = false + node.override['cluster']['artifacts_s3_url'] = "https://bucket.s3.amazonaws.com/archives" end ConvergeArmPl.setup(runner) end @@ -73,6 +74,7 @@ def self.setup(chef_run) node.override['conditions']['arm_pl_supported'] = true node.override['cluster']['sources_dir'] = sources_dir node.override['cluster']['region'] = aws_region + node.override['cluster']['artifacts_s3_url'] = "https://bucket.s3.amazonaws.com/archives" end allow_any_instance_of(Object).to receive(:aws_domain).and_return(aws_domain) ConvergeArmPl.setup(runner) diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/dcv_spec.rb 
b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/dcv_spec.rb index 8ba950f3b5..3a5670354f 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/dcv_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/dcv_spec.rb @@ -239,7 +239,7 @@ def self.nothing(chef_run) end it 'returns dcv_url' do - expect(resource.dcv_url).to eq("https://d1uj6qtbmh3dt5.cloudfront.net/#{dcv_major_minor}/Servers/#{dcv_package}.tgz") + expect(resource.dcv_url).to eq("#{node['cluster']['artifacts_s3_url']}/dependencies/dcv/#{dcv_package}.tgz") end end end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb index d0ea520fb2..3bc8da77fd 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb @@ -164,6 +164,7 @@ def self.configure(chef_run) describe 'fabric_manager:setup' do cached(:nvidia_driver_version) { 'nvidia_driver_version' } + cached(:aws_region) { 'test_region' } for_all_oses do |platform, version| context "on #{platform}#{version}" do @@ -195,7 +196,7 @@ def self.configure(chef_run) is_expected.to run_execute('install_fabricmanager_for_ubuntu') .with_retries(3) .with_retry_delay(5) - .with_command("apt -y install #{fabric_manager_package}=#{fabric_manager_version} && apt-mark hold #{fabric_manager_package}") + .with_command("apt -y install #{fabric_manager_package}-#{fabric_manager_version}.deb && apt-mark hold #{fabric_manager_package}") end else it 'installs yum-plugin-versionlock' do @@ -208,7 +209,8 @@ def self.configure(chef_run) .with_retries(3) .with_retry_delay(5) .with(code: %( set -e - yum install -y #{fabric_manager_package}-#{fabric_manager_version} + aws s3 cp 
#{node['cluster']['artifacts_build_url']}/nvidia_fabric/#{platform}/#{fabric_manager_package}-#{fabric_manager_version}-1.x86_64.rpm #{fabric_manager_package}-#{fabric_manager_version}.rpm --region test_region + yum install -y #{fabric_manager_package}-#{fabric_manager_version}.rpm yum versionlock #{fabric_manager_package} )) end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb index 2f795953f4..4dd9b83d8a 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb @@ -196,7 +196,7 @@ def self.configure(chef_run) end cached(:gdrcopy_service) { platform == 'ubuntu' ? 'gdrdrv' : 'gdrcopy' } cached(:gdrcopy_tarball) { "#{sources_dir}/gdrcopy-#{gdrcopy_version}.tar.gz" } - cached(:gdrcopy_url) { "https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v#{gdrcopy_version}.tar.gz" } + cached(:gdrcopy_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/gdr_copy/v#{gdrcopy_version}.tar.gz" } cached(:gdrcopy_dependencies) do case platform when 'ubuntu' diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb index 742e4a697e..ca32faed4c 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb @@ -146,7 +146,7 @@ def self.setup(chef_run, nvidia_enabled: nil) cached(:node) { chef_run.node } it 'does not install datacenter gpu manager' do - is_expected.not_to install_package('datacenter-gpu-manager') + is_expected.not_to run_bash('Install datacenter-gpu-manager') end end @@ -166,11 +166,11 @@ def self.setup(chef_run, nvidia_enabled: nil) if %w(centos amazon).include?(platform) it 'does not install datacenter gpu manager' 
do - is_expected.not_to install_package('datacenter-gpu-manager') + is_expected.not_to run_bash('Install datacenter-gpu-manager') end else it 'installs datacenter gpu manager' do - is_expected.to install_package('datacenter-gpu-manager') + is_expected.to run_bash('Install datacenter-gpu-manager') end end end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb index b852ef496c..741fddbcc9 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb @@ -168,7 +168,7 @@ def self.setup(chef_run, nvidia_driver_version: nil) else cached(:nvidia_driver_version) { 'nvidia_driver_version' } end - cached(:nvidia_driver_url) { "https://us.download.nvidia.com/tesla/#{nvidia_driver_version}/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" } + cached(:nvidia_driver_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" } cached(:chef_run) do stubs_for_resource('nvidia_driver') do |res| allow(res).to receive(:nvidia_driver_enabled?).and_return(true) diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/activate_virtual_env_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/activate_virtual_env_spec.rb index 8ee5ff214c..2c59ceebba 100644 --- a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/activate_virtual_env_spec.rb +++ b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/activate_virtual_env_spec.rb @@ -35,34 +35,13 @@ def self.run(chef_run, pyenv_name:, pyenv_path:, python_version:, user:, require is_expected.to run_activate_virtual_env('run') end - it 'runs pyenv script' do - is_expected.to run_pyenv_script("pyenv virtualenv #{pyenv_name}").with( - code: "pyenv virtualenv #{python_version} 
#{pyenv_name}", - user: user - ) - end - - it 'upgrades pyenv pip' do - is_expected.to upgrade_pyenv_pip("pip").with( - virtualenv: pyenv_path, - user: user - ) - end - - it 'copies requirements file' do - is_expected.to create_cookbook_file("#{pyenv_path}/requirements.txt").with( - source: requirements_path, - mode: '0755' - ) - end - - it 'installs requirements in the virtual environment' do - is_expected.to install_pyenv_pip("#{pyenv_path}/requirements.txt").with( - virtualenv: pyenv_path, - user: user, - requirement: true - ) - end + it 'creates venv' do + is_expected.to run_bash("create venv").with( + user: 'root', + group: 'root', + cwd: "#{node['cluster']['system_pyenv_root']}" + ).with_code(%r{source pyenv_path/bin/activate}) + end end context "without requirements" do @@ -74,7 +53,6 @@ def self.run(chef_run, pyenv_name:, pyenv_path:, python_version:, user:, require it 'does not install requirements' do is_expected.not_to create_cookbook_file("#{pyenv_path}/requirements.txt") - is_expected.not_to install_pyenv_pip("#{pyenv_path}/requirements.txt") end end end diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/install_pyenv_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/install_pyenv_spec.rb index 339145cba7..fad6efad8f 100644 --- a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/install_pyenv_spec.rb +++ b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/install_pyenv_spec.rb @@ -37,16 +37,22 @@ def self.run(chef_run, python_version: nil, pyenv_root: nil, user_only: nil, use is_expected.to create_directory(system_pyenv_root).with_recursive(true) end - it 'installs pyenv system' do - is_expected.to install_pyenv_install('system').with_prefix(system_pyenv_root) + it 'downloads python tarball' do + is_expected.to create_if_missing_remote_file("#{node['cluster']['system_pyenv_root']}/Python-#{python_version}.tgz").with( + source: 
"#{node['cluster']['artifacts_s3_url']}/dependencies/python/Python-#{python_version}.tgz", + mode: '0644', + retries: 3, + retry_delay: 5 + ) end - it 'deletes /etc/profile.d/pyenv.sh to avoid exposing the ParallelCluster pyenv installation to customers' do || - is_expected.to delete_file('/etc/profile.d/pyenv.sh') - end - - it 'installs default python version' do - is_expected.to install_pyenv_python(python_version) + it 'installs python' do + is_expected.to run_bash("install python #{python_version}").with( + user: 'root', + group: 'root', + cwd: "#{node['cluster']['system_pyenv_root']}" + ).with_code(/tar -xzf Python-#{python_version}.tgz/) + .with_code(%r{./configure --prefix=#{node['cluster']['system_pyenv_root']}/versions/#{python_version}}) end end @@ -66,12 +72,21 @@ def self.run(chef_run, python_version: nil, pyenv_root: nil, user_only: nil, use is_expected.to create_directory(system_pyenv_root).with_recursive(true) end - it 'installs pyenv system' do - is_expected.to install_pyenv_install('system').with_prefix(system_pyenv_root) + it 'downloads python tarball' do + is_expected.to create_if_missing_remote_file("#{system_pyenv_root}/Python-#{python_version}.tgz").with( + source: "https://www.python.org/ftp/python/#{python_version}/Python-#{python_version}.tgz", + mode: '0644', + retries: 3, + retry_delay: 5 + ) end - it 'installs default python version' do - is_expected.to install_pyenv_python(python_version) + it 'installs python' do + is_expected.to run_bash("install python #{python_version}").with( + user: 'root', + group: 'root', + cwd: "#{system_pyenv_root}" + ) end end @@ -91,19 +106,28 @@ def self.run(chef_run, python_version: nil, pyenv_root: nil, user_only: nil, use cached(:pyenv_root) { "pyenv_root" } cached(:python_version) { 'python_version' } cached(:chef_run) do - runner = runner(platform: platform, version: version, step_into: ['install_pyenv']) + runner = runner(platform: platform, version: version, step_into: ['install_pyenv']) do |node| + 
node.override['cluster']['python-version'] = python_version + node.override['cluster']['artifacts_s3_url'] = "https://bucket.s3.#{aws_domain}/archives" + end ConvergeInstallPyenv.run(runner, user_only: true, user: user, python_version: python_version, pyenv_root: pyenv_root) end - it 'installs pyenv for user' do - is_expected.to install_pyenv_install('user').with( - user: user, - prefix: pyenv_root + it 'downloads python tarball' do + is_expected.to create_if_missing_remote_file("#{pyenv_root}/Python-#{python_version}.tgz").with( + source: "https://www.python.org/ftp/python/#{python_version}/Python-#{python_version}.tgz", + mode: '0644', + retries: 3, + retry_delay: 5 ) end - it 'installs pyenv_python for user' do - is_expected.to install_pyenv_python(python_version).with_user(user) + it 'installs python' do + is_expected.to run_bash("install python #{python_version}").with( + user: user, + group: 'root', + cwd: "#{pyenv_root}" + ) end it 'installs pyenv plugin virtualenv' do From ffe2f94dd34a62754c78d7d2e8f70937e4ebf713 Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Thu, 13 Jun 2024 08:13:07 -0400 Subject: [PATCH 3/6] Get python dependency using https rather than aws cli --- .../fabric_manager_alinux2023.rb | 2 +- .../fabric_manager/fabric_manager_amazon2.rb | 2 +- .../fabric_manager/fabric_manager_centos7.rb | 2 +- .../fabric_manager/fabric_manager_redhat8.rb | 2 +- .../fabric_manager/fabric_manager_rocky8.rb | 2 +- .../fabric_manager_ubuntu20+.rb | 2 +- .../partial/_fabric_manager_common.rb | 1 - .../partial/_fabric_manager_install_debian.rb | 2 +- .../partial/_fabric_manager_install_rhel.rb | 2 +- .../nvidia_dcgm/nvidia_dcgm_alinux2023.rb | 2 +- .../nvidia_dcgm/nvidia_dcgm_amazon2.rb | 2 +- .../nvidia_dcgm/nvidia_dcgm_centos7.rb | 2 +- .../nvidia_dcgm/nvidia_dcgm_redhat8.rb | 2 +- .../nvidia_dcgm/nvidia_dcgm_rocky8.rb | 2 +- .../partial/_nvidia_dcgm_common.rb | 1 - .../partial/_nvidia_dcgm_debian.rb | 2 +- ..._dcgm_rhel.rb .rb => _nvidia_dcgm_rhel.rb} | 2 
+- .../partial/_nvidia_driver_common.rb | 4 +- .../spec/unit/recipes/awscli_spec.rb | 3 +- .../unit/resources/fabric_manager_spec.rb | 6 +- .../attributes/environment.rb | 2 +- .../attributes/versions.rb | 1 - .../resources/install_pyenv.rb | 73 +++++++++---------- .../resources/activate_virtual_env_spec.rb | 2 +- .../spec/unit/resources/install_pyenv_spec.rb | 16 ++-- 25 files changed, 61 insertions(+), 78 deletions(-) rename cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/{_nvidia_dcgm_rhel.rb .rb => _nvidia_dcgm_rhel.rb} (99%) diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_alinux2023.rb index d13bf55a17..46c891ed43 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_alinux2023.rb @@ -29,4 +29,4 @@ def fabric_manager_version def platform 'rhel9' -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb index d7cb12ac30..375dcb02ce 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb @@ -27,4 +27,4 @@ def fabric_manager_version def platform 'rhel7' -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_centos7.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_centos7.rb index 502ccc0124..e66bea4c2d 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_centos7.rb +++ 
b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_centos7.rb @@ -29,4 +29,4 @@ def fabric_manager_version def platform 'rhel7' -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb index 8441a33c1d..223cabaf89 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb @@ -29,4 +29,4 @@ def fabric_manager_version def platform "rhel#{node['platform_version'].to_i}" -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb index 5173135505..c0d76676c2 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb @@ -29,4 +29,4 @@ def fabric_manager_version def platform "rhel#{node['platform_version'].to_i}" -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb index aff2f9ca73..ac6de0b145 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb @@ -29,4 +29,4 @@ def fabric_manager_version def platform "ubuntu#{node['platform_version'].delete('.')}" -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb 
b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb index 62b832e66e..1c5ac45aba 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb @@ -27,7 +27,6 @@ node_attributes "dump node attributes" action_install_package - end action :configure do diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb index a45141138a..6c5a358b7f 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb @@ -38,4 +38,4 @@ def arch_suffix def fabric_manager_url "#{node['cluster']['artifacts_build_url']}/nvidia_fabric/#{platform}/#{fabric_manager_package}_#{fabric_manager_version}-1_#{arch_suffix}.deb" -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb index 316da4478d..2bf6b73d2b 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb @@ -33,4 +33,4 @@ def arch_suffix def fabric_manager_url "#{node['cluster']['artifacts_build_url']}/nvidia_fabric/#{platform}/#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm" -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_alinux2023.rb 
b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_alinux2023.rb index 4fb4ac156d..9947442633 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_alinux2023.rb @@ -24,4 +24,4 @@ def _nvidia_dcgm_enabled def platform 'rhel9' -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb index 3bde89bb6e..293fc1bc78 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb @@ -23,4 +23,4 @@ def _nvidia_dcgm_enabled def platform 'rhel7' -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_centos7.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_centos7.rb index 8dffb4bd04..2170999305 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_centos7.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_centos7.rb @@ -25,4 +25,4 @@ def _nvidia_dcgm_enabled def platform 'rhel7' -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_redhat8.rb index b6564420eb..247ef2fcad 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_redhat8.rb @@ -25,4 +25,4 @@ def _nvidia_dcgm_enabled def platform "rhel#{node['platform_version'].to_i}" -end \ No newline at end of file +end diff --git 
a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_rocky8.rb index e3e5a05523..f8630ef2da 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_rocky8.rb @@ -25,4 +25,4 @@ def _nvidia_dcgm_enabled def platform "rhel#{node['platform_version'].to_i}" -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb index 627e59b515..7ad1032211 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb @@ -21,7 +21,6 @@ return unless _nvidia_dcgm_enabled action_install_package - end def _nvidia_enabled diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb index 30dde588eb..4bae3f6fa0 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb @@ -39,4 +39,4 @@ def arch_suffix def package_version node['cluster']['nvidia']['dcgm_version'] -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb .rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb similarity index 99% rename from cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb .rb rename to 
cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb index 4d9ca0b365..7ea74facd7 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb .rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb @@ -39,4 +39,4 @@ def arch_suffix def package_version node['cluster']['nvidia']['dcgm_version'] -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb index aae0fb8bff..fed953ad4e 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb @@ -97,7 +97,7 @@ def _nvidia_driver_version end def nvidia_driver_url - "https://us.download.nvidia.com/tesla/#{_nvidia_driver_version}/NVIDIA-Linux-#{nvidia_arch}-#{_nvidia_driver_version}.run" + "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{_nvidia_driver_version}.run" end def nvidia_driver_enabled? 
@@ -126,4 +126,4 @@ def nvidia_kernel_module else "kernel-open" end -end +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/awscli_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/awscli_spec.rb index 58e0bde2a7..2bdcab53db 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/awscli_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/awscli_spec.rb @@ -31,7 +31,8 @@ it 'installs awscli into cookbook virtualev path' do is_expected.to run_bash('install awscli') - .with_code "#{cookbook_virtualenv_path}/bin/python#{node['cluster']['python-major-minor-version']} #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws" end + .with_code "#{cookbook_virtualenv_path}/bin/python#{node['cluster']['python-major-minor-version']} #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws" + end end context "when awscli is not installed" do diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb index 3bc8da77fd..88a79da853 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb @@ -208,11 +208,7 @@ def self.configure(chef_run) .with(user: 'root') .with_retries(3) .with_retry_delay(5) - .with(code: %( set -e - aws s3 cp #{node['cluster']['artifacts_build_url']}/nvidia_fabric/#{platform}/#{fabric_manager_package}-#{fabric_manager_version}-1.x86_64.rpm #{fabric_manager_package}-#{fabric_manager_version}.rpm --region test_region - yum install -y #{fabric_manager_package}-#{fabric_manager_version}.rpm - yum versionlock #{fabric_manager_package} -)) + .with_code(/yum install -y #{fabric_manager_package}-#{fabric_manager_version}.rpm/) end end end diff --git 
a/cookbooks/aws-parallelcluster-shared/attributes/environment.rb b/cookbooks/aws-parallelcluster-shared/attributes/environment.rb index 11deeebb3e..8bf13c5fe0 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/environment.rb @@ -7,4 +7,4 @@ # ['cluster']['region'] will need to be defined by image_dna.json during AMI build. default['cluster']['base_build_url'] = "s3://aws-parallelcluster-dev-build-dependencies" default['cluster']['artifacts_s3_url'] = "https://aws-parallelcluster-dev-commercial.s3.#{node['cluster']['aws_domain']}/archives" -default['cluster']['artifacts_build_url'] = "#{node['cluster']['base_build_url']}/archives/dependencies" \ No newline at end of file +default['cluster']['artifacts_build_url'] = "#{node['cluster']['base_build_url']}/archives/dependencies" diff --git a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb index bdfe5ae39a..87ca20c00d 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb @@ -2,7 +2,6 @@ default['cluster']['python-version'] = '3.9.17' default['cluster']['python-major-minor-version'] = '3.9' - # ParallelCluster versions default['cluster']['parallelcluster-version'] = '3.10.0' default['cluster']['parallelcluster-cookbook-version'] = '3.10.0' diff --git a/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb b/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb index 0dd3c8daae..cd857e9d0f 100644 --- a/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb +++ b/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb @@ -4,60 +4,53 @@ unified_mode true # Resource:: to create a Python virtual environment for a given user - -property :python_version, String -property :prefix, String property :user_only, [true, false], default: false property :user, String 
- +property :python_version, String +property :prefix, String default_action :run action :run do python_version = new_resource.python_version || node['cluster']['python-version'] + python_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/python/Python-#{python_version}.tgz" + + if new_resource.python_version + python_url = "https://www.python.org/ftp/python/#{python_version}/Python-#{python_version}.tgz" + end if new_resource.user_only raise "user property is required for resource install_pyenv when user_only is set to true" unless new_resource.user - - pyenv_install 'user' do - user new_resource.user - prefix new_resource.prefix if new_resource.prefix - end + prefix = new_resource.prefix || "#{::File.expand_path("~#{user}")}/.pyenv" else prefix = new_resource.prefix || node['cluster']['system_pyenv_root'] + end - directory prefix do - recursive true - end - - bash "install python #{python_version}" do - user 'root' - group 'root' - cwd "#{prefix}" - code <<-VENV - set -e - aws s3 cp #{node['cluster']['artifacts_build_url']}/python/Python-#{python_version}.tgz Python-#{python_version}.tgz --region #{node['cluster']['region']} - tar -xzf Python-#{python_version}.tgz - cd Python-#{python_version} - ./configure --prefix=#{prefix}/versions/#{python_version} - make - make install - VENV - end - - # Remove the profile.d script that the pyenv cookbook writes. - # This is done in order to avoid exposing the ParallelCluster pyenv installation to customers - # on login. 
- file '/etc/profile.d/pyenv.sh' do - action :delete - end + directory prefix do + recursive true end - pyenv_python python_version do - user new_resource.user if new_resource.user_only + remote_file "#{prefix}/Python-#{python_version}.tgz" do + source "#{python_url}" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing end - pyenv_plugin 'virtualenv' do - git_url 'https://github.com/pyenv/pyenv-virtualenv' - user new_resource.user if new_resource.user_only + user = new_resource.user || 'root' + + bash "install python #{python_version}" do + user user + group 'root' + cwd "#{prefix}" + code <<-VENV + set -e + tar -xzf Python-#{python_version}.tgz + cd Python-#{python_version} + ./configure --prefix=#{prefix}/versions/#{python_version} + make + make install + VENV end -end + +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/activate_virtual_env_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/activate_virtual_env_spec.rb index 2c59ceebba..9341cd1f42 100644 --- a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/activate_virtual_env_spec.rb +++ b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/activate_virtual_env_spec.rb @@ -41,7 +41,7 @@ def self.run(chef_run, pyenv_name:, pyenv_path:, python_version:, user:, require group: 'root', cwd: "#{node['cluster']['system_pyenv_root']}" ).with_code(%r{source pyenv_path/bin/activate}) - end + end end context "without requirements" do diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/install_pyenv_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/install_pyenv_spec.rb index fad6efad8f..614f7485bd 100644 --- a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/install_pyenv_spec.rb +++ b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/install_pyenv_spec.rb @@ -24,12 +24,13 @@ def self.run(chef_run, python_version: nil, pyenv_root: nil, user_only: nil, use runner = 
runner(platform: platform, version: version, step_into: ['install_pyenv']) do |node| node.override['cluster']['system_pyenv_root'] = system_pyenv_root node.override['cluster']['python-version'] = python_version + node.override['cluster']['artifacts_s3_url'] = "https://bucket.s3.#{aws_domain}/archives" end ConvergeInstallPyenv.run(runner) end cached(:node) { chef_run.node } - it 'runs istall_pyenv' do + it 'runs install_pyenv' do is_expected.to run_install_pyenv('run') end @@ -52,7 +53,7 @@ def self.run(chef_run, python_version: nil, pyenv_root: nil, user_only: nil, use group: 'root', cwd: "#{node['cluster']['system_pyenv_root']}" ).with_code(/tar -xzf Python-#{python_version}.tgz/) - .with_code(%r{./configure --prefix=#{node['cluster']['system_pyenv_root']}/versions/#{python_version}}) + .with_code(%r{./configure --prefix=#{node['cluster']['system_pyenv_root']}/versions/#{python_version}}) end end @@ -110,6 +111,8 @@ def self.run(chef_run, python_version: nil, pyenv_root: nil, user_only: nil, use node.override['cluster']['python-version'] = python_version node.override['cluster']['artifacts_s3_url'] = "https://bucket.s3.#{aws_domain}/archives" end + # ConvergeInstallPyenv.run(runner) + # runner = runner(platform: platform, version: version, step_into: ['install_pyenv']) ConvergeInstallPyenv.run(runner, user_only: true, user: user, python_version: python_version, pyenv_root: pyenv_root) end @@ -129,15 +132,8 @@ def self.run(chef_run, python_version: nil, pyenv_root: nil, user_only: nil, use cwd: "#{pyenv_root}" ) end - - it 'installs pyenv plugin virtualenv' do - is_expected.to install_pyenv_plugin('virtualenv').with( - git_url: 'https://github.com/pyenv/pyenv-virtualenv', - user: user - ) - end end end end end -end +end \ No newline at end of file From fbc9d4bcad2fc0e0f0fee444fe25b75861f60f0f Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Thu, 13 Jun 2024 20:47:55 -0400 Subject: [PATCH 4/6] Point s3 url for dependency download to production bucket --- 
.../recipes/install/cfn_bootstrap.rb | 2 +- .../spec/unit/recipes/cfn_bootstrap_spec.rb | 2 +- .../aws-parallelcluster-shared/attributes/environment.rb | 4 ++-- cookbooks/aws-parallelcluster-shared/attributes/versions.rb | 2 +- .../aws-parallelcluster-slurm/attributes/slurm_attributes.rb | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb b/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb index dbaadab117..616b4c93a7 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb @@ -15,7 +15,7 @@ virtualenv_name = 'cfn_bootstrap_virtualenv' pyenv_root = node['cluster']['system_pyenv_root'] # FIXME: Python Version cfn_bootstrap_virtualenv due to a bug with cfn-hup -python_version = '3.9.17' +python_version = '3.9.19' virtualenv_path = "#{pyenv_root}/versions/#{python_version}/envs/#{virtualenv_name}" node.default['cluster']['cfn_bootstrap_virtualenv_path'] = virtualenv_path diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb index eda316c16c..97f6d302fe 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb @@ -5,7 +5,7 @@ context "on #{platform}#{version}" do cached(:cfnbootstrap_version) { '2.0-28' } cached(:cfnbootstrap_package) { "aws-cfn-bootstrap-py3-#{cfnbootstrap_version}.tar.gz" } - cached(:python_version) { '3.9.17' } + cached(:python_version) { '3.9.19' } cached(:system_pyenv_root) { 'system_pyenv_root' } cached(:virtualenv_path) { "system_pyenv_root/versions/#{python_version}/envs/cfn_bootstrap_virtualenv" } diff --git a/cookbooks/aws-parallelcluster-shared/attributes/environment.rb 
b/cookbooks/aws-parallelcluster-shared/attributes/environment.rb index 8bf13c5fe0..0cb3409609 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/environment.rb @@ -5,6 +5,6 @@ # URL for ParallelCluster Artifacts stored in public S3 buckets # ['cluster']['region'] will need to be defined by image_dna.json during AMI build. -default['cluster']['base_build_url'] = "s3://aws-parallelcluster-dev-build-dependencies" -default['cluster']['artifacts_s3_url'] = "https://aws-parallelcluster-dev-commercial.s3.#{node['cluster']['aws_domain']}/archives" +default['cluster']['base_build_url'] = "s3://#{node['cluster']['region']}-aws-parallelcluster" default['cluster']['artifacts_build_url'] = "#{node['cluster']['base_build_url']}/archives/dependencies" +default['cluster']['artifacts_s3_url'] = "https://#{node['cluster']['region']}-aws-parallelcluster.s3.#{node['cluster']['region']}.#{node['cluster']['aws_domain']}/archives" \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb index 87ca20c00d..08c7f1a9b2 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb @@ -1,5 +1,5 @@ # Python Version -default['cluster']['python-version'] = '3.9.17' +default['cluster']['python-version'] = '3.9.19' default['cluster']['python-major-minor-version'] = '3.9' # ParallelCluster versions diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb index b9e97d9f17..9c5e80350b 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb @@ -13,8 +13,8 @@ default['cluster']['enable_nss_slurm'] = node['cluster']['directory_service']['enabled'] # PMIX Version 
and Checksum -default['cluster']['pmix']['version'] = '4.2.9' -default['cluster']['pmix']['sha256'] = '00ddb36fb81c31519972079a218c3cdd903510fc3910abaf4d484068fa29e884' +default['cluster']['pmix']['version'] = '5.0.2' +default['cluster']['pmix']['sha256'] = '133e79c44d426043fa54b80649ecc97607b915ad8c5cc119575a3dd0c4104941' # Slurmdbd default['cluster']['slurmdbd_service_enabled'] = "true" From 4259befb09a2f780b1d65c1427d5c417a40f979d Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Mon, 17 Jun 2024 10:58:09 -0400 Subject: [PATCH 5/6] Fix fabric manager install to install from a local package --- .../recipes/awsbatch_virtualenv.rb | 21 ++++++++++++++ .../recipes/install.rb | 24 ++-------------- .../install/custom_parallelcluster_node.rb | 14 ++++++---- .../recipes/install/parallelcluster_node.rb | 24 ++-------------- .../spec/unit/recipes/node_spec.rb | 2 +- .../recipes/install/awscli.rb | 2 +- .../recipes/install/cookbook_virtualenv.rb | 6 +--- .../arm_pl/partial/_arm_pl_common.rb | 2 +- .../fabric_manager_ubuntu20+.rb | 2 +- .../partial/_fabric_manager_install_debian.rb | 28 ++++++++++--------- .../partial/_fabric_manager_install_rhel.rb | 12 ++++++-- .../nvidia_dcgm/nvidia_dcgm_alinux2023.rb | 1 + .../partial/_nvidia_dcgm_debian.rb | 14 ++++++++-- .../nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb | 12 ++++++-- .../partial/_nvidia_driver_common.rb | 2 +- .../spec/unit/recipes/awscli_spec.rb | 2 +- .../unit/recipes/cookbook_virtualenv_spec.rb | 7 ----- .../unit/resources/fabric_manager_spec.rb | 8 +++--- .../attributes/environment.rb | 5 ++-- .../resources/install_pyenv.rb | 5 ++-- .../spec/unit/resources/install_pyenv_spec.rb | 2 +- .../attributes/versions.rb | 4 +-- 22 files changed, 98 insertions(+), 101 deletions(-) diff --git a/cookbooks/aws-parallelcluster-awsbatch/recipes/awsbatch_virtualenv.rb b/cookbooks/aws-parallelcluster-awsbatch/recipes/awsbatch_virtualenv.rb index 5a702b9782..cbcd61cf00 100644 --- 
a/cookbooks/aws-parallelcluster-awsbatch/recipes/awsbatch_virtualenv.rb +++ b/cookbooks/aws-parallelcluster-awsbatch/recipes/awsbatch_virtualenv.rb @@ -25,6 +25,27 @@ activate_virtual_env virtualenv_name do pyenv_path virtualenv_path python_version python_version + not_if { ::File.exist?("#{virtualenv_path}/bin/activate") } +end + +remote_file "#{node['cluster']['base_dir']}/awsbatch-dependencies.tgz" do + source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/awsbatch-dependencies.tgz" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing +end + +bash 'pip install' do + user 'root' + group 'root' + cwd "#{node['cluster']['base_dir']}" + code <<-REQ + set -e + tar xzf awsbatch-dependencies.tgz + cd awsbatch + #{virtualenv_path}/bin/pip install * -f ./ --no-index + REQ end node.default['cluster']['awsbatch_virtualenv_path'] = virtualenv_path diff --git a/cookbooks/aws-parallelcluster-awsbatch/recipes/install.rb b/cookbooks/aws-parallelcluster-awsbatch/recipes/install.rb index 85e7b3a829..0057c6bb65 100644 --- a/cookbooks/aws-parallelcluster-awsbatch/recipes/install.rb +++ b/cookbooks/aws-parallelcluster-awsbatch/recipes/install.rb @@ -44,32 +44,12 @@ tar -xzf aws-parallelcluster.tgz --directory aws-parallelcluster-awsbatch-cli cd aws-parallelcluster-awsbatch-cli/*aws-parallelcluster-* - aws s3 cp #{node['cluster']['artifacts_build_url']}/PyPi/#{node['kernel']['machine']}/awsbatch-dependencies.tgz awsbatch-dependencies.tgz --region #{node['cluster']['region']} - tar xzf awsbatch-dependencies.tgz - cd awsbatch - #{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install * -f ./ --no-index - cd .. 
- #{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install awsbatch-cli/ CLI end else # Install aws-parallelcluster-awsbatch-cli package - bash "install aws-parallelcluster-awsbatch-cli" do - cwd Chef::Config[:file_cache_path] - code <<-CLI - set -e - package_url=#{node['cluster']['artifacts_build_url']}/awsbatch/aws-parallelcluster.tgz - aws s3 cp ${package_url} aws-parallelcluster.tgz --region #{node['cluster']['region']} - mkdir aws-parallelcluster-awsbatch-cli - tar -xzf aws-parallelcluster.tgz --directory aws-parallelcluster-awsbatch-cli - aws s3 cp #{node['cluster']['artifacts_build_url']}/PyPi/#{node['kernel']['machine']}/awsbatch-dependencies.tgz awsbatch-dependencies.tgz --region #{node['cluster']['region']} - tar xzf awsbatch-dependencies.tgz - cd awsbatch - #{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install * -f ./ --no-index - cd .. - cd aws-parallelcluster-awsbatch-cli/*aws-parallelcluster-* - #{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install awsbatch-cli/ - CLI + execute "pip_install_parallelcluster_awsbatch_cli" do + command "#{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install aws-parallelcluster-awsbatch-cli==#{node['cluster']['parallelcluster-awsbatch-cli-version']}" end end diff --git a/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb b/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb index efec81abc4..9bdee2da57 100644 --- a/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb +++ b/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb @@ -20,6 +20,14 @@ # TODO: once the pyenv Chef resource supports installing packages from a path (e.g. `pip install .`), convert the # bash block to a recipe that uses the pyenv resource. 
+remote_file "#{Chef::Config[:file_cache_path]}/node-dependencies.tgz" do + source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/node-dependencies.tgz" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing +end + bash "install custom aws-parallelcluster-node" do cwd Chef::Config[:file_cache_path] code <<-NODE @@ -38,12 +46,6 @@ mkdir aws-parallelcluster-custom-node tar -xzf aws-parallelcluster-node.tgz --directory aws-parallelcluster-custom-node cd aws-parallelcluster-custom-node/*aws-parallelcluster-node-* - aws s3 cp #{node['cluster']['artifacts_build_url']}/PyPi/#{node['kernel']['machine']}/node-dependencies.tgz node-dependencies.tgz --region #{node['cluster']['region']} - tar xzf node-dependencies.tgz - cd node - #{node_virtualenv_path}/bin/pip install * -f ./ --no-index - cd .. - pip install . deactivate NODE diff --git a/cookbooks/aws-parallelcluster-computefleet/recipes/install/parallelcluster_node.rb b/cookbooks/aws-parallelcluster-computefleet/recipes/install/parallelcluster_node.rb index ae7b70b222..8b3f0ebed5 100644 --- a/cookbooks/aws-parallelcluster-computefleet/recipes/install/parallelcluster_node.rb +++ b/cookbooks/aws-parallelcluster-computefleet/recipes/install/parallelcluster_node.rb @@ -36,27 +36,7 @@ if is_custom_node? 
include_recipe 'aws-parallelcluster-computefleet::custom_parallelcluster_node' else - bash "install official aws-parallelcluster-node" do - cwd Chef::Config[:file_cache_path] - code <<-NODE - set -e - [[ ":$PATH:" != *":/usr/local/bin:"* ]] && PATH="/usr/local/bin:${PATH}" - echo "PATH is $PATH" - source #{node_virtualenv_path}/bin/activate - pip uninstall --yes aws-parallelcluster-node - node_url=#{node['cluster']['artifacts_build_url']}/node/aws-parallelcluster-node.tgz - aws s3 cp ${node_url} aws-parallelcluster-node.tgz --region #{node['cluster']['region']} - rm -fr aws-parallelcluster-node - mkdir aws-parallelcluster-node - tar -xzf aws-parallelcluster-node.tgz --directory aws-parallelcluster-node - aws s3 cp #{node['cluster']['artifacts_build_url']}/PyPi/#{node['kernel']['machine']}/node-dependencies.tgz node-dependencies.tgz --region #{node['cluster']['region']} - tar xzf node-dependencies.tgz - cd node - #{node_virtualenv_path}/bin/pip install * -f ./ --no-index - cd .. - cd aws-parallelcluster-node/*aws-parallelcluster-node-* - pip install . 
- deactivate - NODE + execute "install official aws-parallelcluster-node" do + command "#{virtualenv_path}/bin/pip install aws-parallelcluster-node==#{node['cluster']['parallelcluster-node-version']}" end end diff --git a/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/node_spec.rb b/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/node_spec.rb index 6de0f9bdda..b4158581b7 100644 --- a/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/node_spec.rb +++ b/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/node_spec.rb @@ -36,7 +36,7 @@ end it 'installs official node package' do - is_expected.to run_bash('install official aws-parallelcluster-node') + is_expected.to run_execute('install official aws-parallelcluster-node') end end diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/awscli.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/awscli.rb index 99adafcc46..9f6fb6aacf 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/awscli.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/awscli.rb @@ -35,5 +35,5 @@ end bash 'install awscli' do - code "#{cookbook_virtualenv_path}/bin/python#{node['cluster']['python-major-minor-version']} #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws" + code "#{cookbook_virtualenv_path}/bin/python #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws" end diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb index 02016eeb15..63fe8b5ecf 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb @@ -24,11 +24,7 @@ activate_virtual_env cookbook_virtualenv_name do pyenv_path cookbook_virtualenv_path python_version cookbook_python_version -end 
- -cookbook_file "#{virtualenv_path}/requirements.txt" do - source "cookbook_virtualenv/requirements.txt" - mode '0755' + not_if { ::File.exist?("#{cookbook_virtualenv_path}/bin/activate") } end remote_file "#{node['cluster']['base_dir']}/cookbook-dependencies.tgz" do diff --git a/cookbooks/aws-parallelcluster-platform/resources/arm_pl/partial/_arm_pl_common.rb b/cookbooks/aws-parallelcluster-platform/resources/arm_pl/partial/_arm_pl_common.rb index a91416809d..c37febf4e3 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/arm_pl/partial/_arm_pl_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/arm_pl/partial/_arm_pl_common.rb @@ -137,7 +137,7 @@ tar -xf #{gcc_tarball} cd gcc-#{gcc_version} # Patch the download_prerequisites script to download over https and not ftp. This works better in China regions. - sed -i "s#ftp://gcc\.gnu\.org##{node['cluster']['artifacts_s3_url']}/dependencies/gcc/prerequisites#g" ./contrib/download_prerequisites + sed -i "s#ftp://gcc\.gnu\.org/pub/gcc/infrastructure##{node['cluster']['artifacts_s3_url']}/dependencies/gcc/prerequisites#g" ./contrib/download_prerequisites ./contrib/download_prerequisites mkdir build && cd build ../configure --prefix=/opt/arm/armpl/gcc/#{gcc_version} --disable-bootstrap --enable-checking=release --enable-languages=c,c++,fortran --disable-multilib diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb index ac6de0b145..2b700b387a 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb @@ -24,7 +24,7 @@ def fabric_manager_package end def fabric_manager_version - "#{_nvidia_driver_version}*" + "#{_nvidia_driver_version}" end def platform diff --git 
a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb index 6c5a358b7f..5dc4266f24 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb @@ -14,19 +14,21 @@ action :install_package do # For ubuntu, CINC17 apt-package resources need full versions for `version` - execute "install_fabricmanager_for_ubuntu" do - bash "Install #{fabric_manager_package}" do - user 'root' - code <<-FABRIC_MANAGER - set -e - aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.deb - FABRIC_MANAGER - retries 3 - retry_delay 5 - end + remote_file "#{node['cluster']['sources_dir']}/#{fabric_manager_package}-#{fabric_manager_version}.deb" do + source fabric_manager_url + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end - command "apt -y install #{fabric_manager_package}-#{fabric_manager_version}.deb "\ - "&& apt-mark hold #{fabric_manager_package}" + bash "install_fabricmanager_for_ubuntu" do + user 'root' + cwd node['cluster']['sources_dir'] + code <<-FABRIC_MANAGER + set -e + dpkg -i #{fabric_manager_package}-#{fabric_manager_version}.deb && apt-mark hold #{fabric_manager_package} + FABRIC_MANAGER retries 3 retry_delay 5 end @@ -37,5 +39,5 @@ def arch_suffix end def fabric_manager_url - "#{node['cluster']['artifacts_build_url']}/nvidia_fabric/#{platform}/#{fabric_manager_package}_#{fabric_manager_version}-1_#{arch_suffix}.deb" + "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_fabric/#{platform}/#{fabric_manager_package}_#{fabric_manager_version}-1_#{arch_suffix}.deb" end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb 
b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb index 2bf6b73d2b..7a7b49ea2a 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb @@ -13,12 +13,20 @@ # See the License for the specific language governing permissions and limitations under the License. action :install_package do + remote_file "#{node['cluster']['sources_dir']}/#{fabric_manager_package}-#{fabric_manager_version}.rpm" do + source fabric_manager_url + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end + package 'yum-plugin-versionlock' bash "Install #{fabric_manager_package}" do user 'root' + cwd node['cluster']['sources_dir'] code <<-FABRIC_MANAGER_INSTALL set -e - aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.rpm --region #{node['cluster']['region']} yum install -y #{fabric_manager_package}-#{fabric_manager_version}.rpm yum versionlock #{fabric_manager_package} FABRIC_MANAGER_INSTALL @@ -32,5 +40,5 @@ def arch_suffix end def fabric_manager_url - "#{node['cluster']['artifacts_build_url']}/nvidia_fabric/#{platform}/#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm" + "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_fabric/#{platform}/#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm" end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_alinux2023.rb index 9947442633..b984b2fb82 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_alinux2023.rb @@ -17,6 +17,7 @@ end use 'partial/_nvidia_dcgm_common.rb' +use 
'partial/_nvidia_dcgm_rhel.rb' def _nvidia_dcgm_enabled _nvidia_enabled diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb index 4bae3f6fa0..5ca316daad 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb @@ -13,12 +13,20 @@ # See the License for the specific language governing permissions and limitations under the License. action :install_package do + remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.deb" do + source dcgm_url + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end + bash "Install #{dcgm_package}" do user 'root' + cwd node['cluster']['sources_dir'] code <<-DCGM_INSTALL set -e - aws s3 cp #{dcgm_url} #{dcgm_package}-#{package_version}.deb --region #{node['cluster']['region']} - apt -y install #{dcgm_package}-#{package_version}.deb + dpkg -i #{dcgm_package}-#{package_version}.deb DCGM_INSTALL retries 3 retry_delay 5 @@ -26,7 +34,7 @@ end def dcgm_url - "#{node['cluster']['artifacts_build_url']}/nvidia_dcgm/#{platform}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb" + "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb" end def dcgm_package diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb index 7ea74facd7..997762acd1 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb @@ -13,11 +13,19 @@ # See the License for the specific language 
governing permissions and limitations under the License. action :install_package do + remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.rpm" do + source dcgm_url + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end + bash "Install #{dcgm_package}" do user 'root' + cwd node['cluster']['sources_dir'] code <<-DCGM_INSTALL set -e - aws s3 cp #{dcgm_url} #{dcgm_package}-#{package_version}.rpm --region #{node['cluster']['region']} yum install -y #{dcgm_package}-#{package_version}.rpm DCGM_INSTALL retries 3 @@ -26,7 +34,7 @@ end def dcgm_url - "#{node['cluster']['artifacts_build_url']}/nvidia_dcgm/#{platform}/#{dcgm_package}-#{package_version}-1-#{arch_suffix}.rpm" + "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}-#{package_version}-1-#{arch_suffix}.rpm" end def dcgm_package diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb index fed953ad4e..947e68f49f 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb @@ -126,4 +126,4 @@ def nvidia_kernel_module else "kernel-open" end -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/awscli_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/awscli_spec.rb index 2bdcab53db..4f3c1136ec 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/awscli_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/awscli_spec.rb @@ -31,7 +31,7 @@ it 'installs awscli into cookbook virtualev path' do is_expected.to run_bash('install awscli') - .with_code 
"#{cookbook_virtualenv_path}/bin/python#{node['cluster']['python-major-minor-version']} #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws" + .with_code "#{cookbook_virtualenv_path}/bin/python #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws" end end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb index 88c42dadff..65b83d509e 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb @@ -33,13 +33,6 @@ is_expected.to write_node_attributes('dump node attributes') end - it 'copies requirements file' do - is_expected.to create_cookbook_file("#{virtualenv_path}/requirements.txt").with( - source: "cookbook_virtualenv/requirements.txt", - mode: '0755' - ) - end - it 'installs python packages' do is_expected.to run_bash("pip install").with( user: 'root', diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb index 88a79da853..0614a073b1 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb @@ -169,7 +169,7 @@ def self.configure(chef_run) for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' } - cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version } + cached(:fabric_manager_version) { platform == 'ubuntu' ? 
"#{nvidia_driver_version}" : nvidia_driver_version } context 'when fabric manager is to install' do cached(:chef_run) do @@ -193,10 +193,10 @@ def self.configure(chef_run) if platform == 'ubuntu' it 'installs fabric manager for ubuntu' do - is_expected.to run_execute('install_fabricmanager_for_ubuntu') + is_expected.to run_bash('install_fabricmanager_for_ubuntu') .with_retries(3) .with_retry_delay(5) - .with_command("apt -y install #{fabric_manager_package}-#{fabric_manager_version}.deb && apt-mark hold #{fabric_manager_package}") + .with_code(/dpkg -i #{fabric_manager_package}-#{fabric_manager_version}.deb && apt-mark hold #{fabric_manager_package}/) end else it 'installs yum-plugin-versionlock' do @@ -222,7 +222,7 @@ def self.configure(chef_run) for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' } - cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version } + cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version } context('when nvswithes are > 1') do cached(:chef_run) do diff --git a/cookbooks/aws-parallelcluster-shared/attributes/environment.rb b/cookbooks/aws-parallelcluster-shared/attributes/environment.rb index 0cb3409609..e8916e7683 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/environment.rb @@ -5,6 +5,5 @@ # URL for ParallelCluster Artifacts stored in public S3 buckets # ['cluster']['region'] will need to be defined by image_dna.json during AMI build. 
-default['cluster']['base_build_url'] = "s3://#{node['cluster']['region']}-aws-parallelcluster" -default['cluster']['artifacts_build_url'] = "#{node['cluster']['base_build_url']}/archives/dependencies" -default['cluster']['artifacts_s3_url'] = "https://#{node['cluster']['region']}-aws-parallelcluster.s3.#{node['cluster']['region']}.#{node['cluster']['aws_domain']}/archives" \ No newline at end of file +default['cluster']['artifacts_build_url'] = "s3://#{node['cluster']['region']}-aws-parallelcluster/archives/dependencies" +default['cluster']['artifacts_s3_url'] = "https://#{node['cluster']['region']}-aws-parallelcluster.s3.#{node['cluster']['region']}.#{node['cluster']['aws_domain']}/archives" diff --git a/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb b/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb index cd857e9d0f..9c09b509a0 100644 --- a/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb +++ b/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb @@ -30,7 +30,7 @@ end remote_file "#{prefix}/Python-#{python_version}.tgz" do - source "#{python_url}" + source python_url mode '0644' retries 3 retry_delay 5 @@ -52,5 +52,4 @@ make install VENV end - -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/install_pyenv_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/install_pyenv_spec.rb index 614f7485bd..b60e086a3c 100644 --- a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/install_pyenv_spec.rb +++ b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/install_pyenv_spec.rb @@ -136,4 +136,4 @@ def self.run(chef_run, python_version: nil, pyenv_root: nil, user_only: nil, use end end end -end \ No newline at end of file +end diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb index b2671d3d2e..95bf5efeef 100644 --- 
a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb @@ -5,6 +5,6 @@ default['cluster']['slurm']['sha256'] = 'b25127efd69a47c14bd65dfa3bff2687b5350c5290eafb601f923faebe6fd238' default['cluster']['slurm']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/slurm" # Munge -default['cluster']['munge']['munge_version'] = '0.5.15' -default['cluster']['munge']['sha256'] = '51b2c81d1a7ec2ab5d486fa51b50c7e79eb1810ca6687b6ed65f3601abc55614' +default['cluster']['munge']['munge_version'] = '0.5.16' +default['cluster']['munge']['sha256'] = 'fa27205d6d29ce015b0d967df8f3421067d7058878e75d0d5ec3d91f4d32bb57' default['cluster']['munge']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/munge" From 48739149cba309b6e516aa442b595cea87a619e2 Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Mon, 17 Jun 2024 13:59:38 -0400 Subject: [PATCH 6/6] Update changelog to include changes to build-image --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b52cb71b73..2a87727618 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste **ENHANCEMENTS** - Add support for external Slurmdbd. +- Allow build-image to be run in an isolated network. **CHANGES** - Upgrade Cinc Client to version to 18.4.12 from 18.2.7.