From 22687cdf36aa673b4df93fe78232ab533591bc69 Mon Sep 17 00:00:00 2001 From: Jacopo De Amicis Date: Mon, 9 Jan 2023 16:11:49 +0100 Subject: [PATCH 1/8] Add steps to apply patches to Slurm tarball Filename pattern expansion is guaranteed to be sorted alphabetically: https://www.gnu.org/software/bash/manual/bash.html#Filename-Expansion Use nullglob bash option to handle case with no patches in patch folder. Signed-off-by: Jacopo De Amicis --- .../install_slurm/slurm_patches/.gitignore | 0 .../recipes/install_slurm.rb | 20 +++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 cookbooks/aws-parallelcluster-slurm/files/default/install_slurm/slurm_patches/.gitignore diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/install_slurm/slurm_patches/.gitignore b/cookbooks/aws-parallelcluster-slurm/files/default/install_slurm/slurm_patches/.gitignore new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb b/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb index 7898282647..e1580a3897 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb @@ -72,6 +72,14 @@ end end +# Copy Slurm patches +remote_directory "#{node['cluster']['sources_dir']}/slurm_patches" do + source 'install_slurm/slurm_patches' + mode '0755' + action :create + recursive true +end + # Install Slurm bash 'make install' do user 'root' @@ -85,11 +93,23 @@ tar xf #{slurm_tarball} cd slurm-slurm-#{node['cluster']['slurm']['version']} + + # Apply possible Slurm patches + shopt -s nullglob # with this an empty slurm_patches directory does not trigger the loop + for patch in #{node['cluster']['sources_dir']}/slurm_patches/*.diff; do + git apply ${patch} + done + shopt -u nullglob + + # Configure Slurm ./configure --prefix=#{node['cluster']['slurm']['install_dir']} --with-pmix=/opt/pmix --with-jwt=/opt/libjwt --enable-slurmrestd + + # Build Slurm CORES=$(grep processor /proc/cpuinfo | wc -l) make -j $CORES make install make install-contrib + deactivate SLURM # TODO: Fix, so it works for upgrade From b864e5a0f447577610a482c93d57b12ef418353b Mon Sep 17 00:00:00 2001 From: Jacopo De Amicis Date: Tue, 10 Jan 2023 10:52:56 +0100 Subject: [PATCH 2/8] Add README.md for Slurm patches folder Signed-off-by: Jacopo De Amicis --- .../default/install_slurm/slurm_patches/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 cookbooks/aws-parallelcluster-slurm/files/default/install_slurm/slurm_patches/README.md diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/install_slurm/slurm_patches/README.md b/cookbooks/aws-parallelcluster-slurm/files/default/install_slurm/slurm_patches/README.md new file mode 100644 index 0000000000..aad56694f3 --- /dev/null +++ b/cookbooks/aws-parallelcluster-slurm/files/default/install_slurm/slurm_patches/README.md @@ -0,0 +1,10 @@ +# Slurm patches configuration + +This folder is for patch files for the Slurm source code. Patches must be produced by git +(e.g. via `git diff` or by downloading the diff from a commit in GitHub). + +Patch files must be prepended with a zero-padded number in order to allow the cookbook to +apply the patches in a defined order. For example: +- 01_hashabc.diff +- 02_hashdef.diff +- 03 hash123.diff From ad0b8853bdfeb02fbbadbba52b10d7898d2b4285 Mon Sep 17 00:00:00 2001 From: Jacopo De Amicis Date: Tue, 10 Jan 2023 13:18:55 +0100 Subject: [PATCH 3/8] Add Slurm patch for scontrol update nodelist sorting bug Signed-off-by: Jacopo De Amicis --- .../slurm_patches/01_528187722c_no_NEWS.diff | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 cookbooks/aws-parallelcluster-slurm/files/default/install_slurm/slurm_patches/01_528187722c_no_NEWS.diff diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/install_slurm/slurm_patches/01_528187722c_no_NEWS.diff b/cookbooks/aws-parallelcluster-slurm/files/default/install_slurm/slurm_patches/01_528187722c_no_NEWS.diff new file mode 100644 index 0000000000..12d45e4951 --- /dev/null +++ b/cookbooks/aws-parallelcluster-slurm/files/default/install_slurm/slurm_patches/01_528187722c_no_NEWS.diff @@ -0,0 +1,126 @@ +#diff --git a/NEWS b/NEWS +#index ef51ddaee35..b847ab43ee9 100644 +#--- a/NEWS +#+++ b/NEWS +#@@ -9,6 +9,8 @@ documents those changes that are of interest to users and administrators. +# ping_nodes() calls all nodes to re-register. +# -- openapi/dbv0.0.3[6-8] - fix segfault that could arrise from Slurm database +# connnection failing. +#+ -- Fix regression introduced in 22.05.0rc1 when updating a NodeName= +#+ with NodeAddr and/or NodeHostname if the specified nodelist wasn't sorted. +# +# * Changes in Slurm 22.05.7 +# ========================== +diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 +index d20c7a768ec..8913345038d 100644 +--- a/doc/man/man1/scontrol.1 ++++ b/doc/man/man1/scontrol.1 +@@ -1,4 +1,4 @@ +-.TH scontrol "1" "Slurm Commands" "November 2022" "Slurm Commands" ++.TH scontrol "1" "Slurm Commands" "January 2023" "Slurm Commands" + + .SH "NAME" + scontrol \- view or modify Slurm configuration and state. +@@ -474,7 +474,7 @@ unique (e.g. tux2,tux1,tux2 = tux[2,1\-2]). If you wanted a sorted list use + + .TP + \fBhostlistsorted\fR +-Takes a list of host names and prints a sorted, unique hostlist ++Takes a list of host names and prints a sorted (but not unique) hostlist + expression for them. See \fBhostlist\fR. + .IP + +diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c +index ef4b111a595..a86e529c338 100644 +--- a/src/slurmctld/node_mgr.c ++++ b/src/slurmctld/node_mgr.c +@@ -1368,14 +1368,18 @@ int update_node(update_node_msg_t *update_node_msg, uid_t auth_uid) + hostlist_t host_list, hostaddr_list = NULL, hostname_list = NULL; + uint32_t base_state = 0, node_flags, state_val; + time_t now = time(NULL); ++ bool uniq = true; + + if (update_node_msg->node_names == NULL ) { + info("%s: invalid node name", __func__); + return ESLURM_INVALID_NODE_NAME; + } + ++ if (update_node_msg->node_addr || update_node_msg->node_hostname) ++ uniq = false; ++ + if (!(host_list = nodespec_to_hostlist(update_node_msg->node_names, +- NULL))) ++ uniq, NULL))) + return ESLURM_INVALID_NODE_NAME; + + if (!(node_cnt = hostlist_count(host_list))) { +@@ -4867,7 +4871,7 @@ extern int delete_nodes(char *names, char **err_msg) + + lock_slurmctld(write_lock); + +- if (!(to_delete = nodespec_to_hostlist(names, NULL))) { ++ if (!(to_delete = nodespec_to_hostlist(names, true, NULL))) { + ret_rc = ESLURM_INVALID_NODE_NAME; + goto cleanup; + } +diff --git a/src/slurmctld/partition_mgr.c b/src/slurmctld/partition_mgr.c +index d13e71d353f..b00d9f3392a 100644 +--- a/src/slurmctld/partition_mgr.c ++++ b/src/slurmctld/partition_mgr.c +@@ -192,7 +192,7 @@ extern int build_part_bitmap(part_record_t *part_ptr) + node_record_count - 1); + } + +- if (!(host_list = nodespec_to_hostlist(part_ptr->orig_nodes, ++ if (!(host_list = nodespec_to_hostlist(part_ptr->orig_nodes, true, + &part_ptr->nodesets))) { + /* Error, restore original bitmap */ + FREE_NULL_BITMAP(part_ptr->node_bitmap); +diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c +index f3e3a75deed..fb0f4e9a2f8 100644 +--- a/src/slurmctld/read_config.c ++++ b/src/slurmctld/read_config.c +@@ -307,7 +307,9 @@ static void _add_nodes_with_feature(hostlist_t hl, char *feature) + } + } + +-extern hostlist_t nodespec_to_hostlist(const char *nodes, char **nodesets) ++extern hostlist_t nodespec_to_hostlist(const char *nodes, ++ bool uniq, ++ char **nodesets) + { + int count; + slurm_conf_nodeset_t *ptr, **ptr_array; +@@ -354,7 +356,8 @@ extern hostlist_t nodespec_to_hostlist(const char *nodes, char **nodesets) + } + } + +- hostlist_uniq(hl); ++ if (uniq) ++ hostlist_uniq(hl); + return hl; + } + +diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h +index d254cdda52e..b31a5c43f99 100644 +--- a/src/slurmctld/slurmctld.h ++++ b/src/slurmctld/slurmctld.h +@@ -2941,6 +2941,7 @@ extern int update_node_avail_features(char *node_names, char *avail_features, + * Handles node range expressions, nodesets and ALL keyword. + * + * IN nodes - nodelist that can have nodesets or ALL in it. ++ * IN uniq - call hostlist_uniq() before returning the hostlist + * OUT nodesets (optional) - list of nodesets found in nodes string + * + * RET NULL on error, hostlist_t otherwise. +@@ -2948,7 +2949,9 @@ extern int update_node_avail_features(char *node_names, char *avail_features, + * NOTE: Caller must FREE_NULL_HOSTLIST() returned hostlist_t. + * NOTE: Caller should interpret a non-NULL but empty hostlist conveniently. + */ +-extern hostlist_t nodespec_to_hostlist(const char *nodes, char **nodesets); ++extern hostlist_t nodespec_to_hostlist(const char *nodes, ++ bool uniq, ++ char **nodesets); + + /* + * set_node_reboot_reason - appropriately set node reason with reboot message From 501a2f2c555af4001021d59fd12cdc7a1542ff43 Mon Sep 17 00:00:00 2001 From: Jacopo De Amicis Date: Tue, 10 Jan 2023 14:13:25 +0100 Subject: [PATCH 4/8] Add git to base packages for AMI build Signed-off-by: Jacopo De Amicis --- attributes/default.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/attributes/default.rb b/attributes/default.rb index bbf10ef638..00bbcd5294 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -464,7 +464,7 @@ libical-devel postgresql-devel postgresql-server sendmail libxml2-devel libglvnd-devel mdadm python python-pip libssh2-devel libgcrypt-devel libevent-devel glibc-static bind-utils iproute NetworkManager-config-routing-rules python3 python3-pip iptables libcurl-devel yum-plugin-versionlock - coreutils moreutils sssd sssd-tools sssd-ldap curl) + coreutils moreutils sssd sssd-tools sssd-ldap curl git) default['cluster']['rhel']['extra_repo'] = 'rhui-REGION-rhel-server-optional' if node['platform_version'].to_i == 7 && node['kernel']['machine'] == 'aarch64' @@ -501,7 +501,7 @@ r-base libblas-dev libffi-dev libxml2-dev mdadm libgcrypt20-dev libmysqlclient-dev libevent-dev iproute2 python3 python3-pip libatlas-base-dev libglvnd-dev iptables libcurl4-openssl-dev - coreutils moreutils sssd sssd-tools sssd-ldap curl) + coreutils moreutils sssd sssd-tools sssd-ldap curl git) case node['platform_version'] when '18.04' From 828b88879b5d1351581986a84a4e07b673ff68e6 Mon Sep 17 00:00:00 2001 From: Jacopo De Amicis Date: Tue, 10 Jan 2023 14:29:45 +0100 Subject: [PATCH 5/8] Add logging of applied Slurm patches Signed-off-by: Jacopo De Amicis --- cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb b/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb index e1580a3897..8f4f71031e 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb @@ -97,7 +97,9 @@ # Apply possible Slurm patches shopt -s nullglob # with this an empty slurm_patches directory does not trigger the loop for patch in #{node['cluster']['sources_dir']}/slurm_patches/*.diff; do + echo "Applying patch ${patch}..." git apply ${patch} + echo "...DONE." done shopt -u nullglob From 76415905dc1f517d4a5ab7153b4c507b133def31 Mon Sep 17 00:00:00 2001 From: Jacopo De Amicis Date: Tue, 10 Jan 2023 15:59:12 +0100 Subject: [PATCH 6/8] Add entry to the CHANGELOG.md Signed-off-by: Jacopo De Amicis --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 752209e9c0..7f2393a985 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ aws-parallelcluster-cookbook CHANGELOG This file is used to list changes made in each version of the AWS ParallelCluster cookbook. +3.4.1 +----- + +**BUG FIXES** +- Fix an issue with the Slurm scheduler that might incorrectly apply updates to its internal registry of compute nodes. This might result in EC2 instances to become inaccessible or backed by an incorrect instance type. + 3.4.0 ------ From 62357b0ed4c9fb129e1385655e73c7202733a1f5 Mon Sep 17 00:00:00 2001 From: Jacopo De Amicis Date: Tue, 10 Jan 2023 16:11:55 +0100 Subject: [PATCH 7/8] Fix code-linters failures Signed-off-by: Jacopo De Amicis --- cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb b/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb index 8f4f71031e..85730fb50c 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb @@ -93,7 +93,7 @@ tar xf #{slurm_tarball} cd slurm-slurm-#{node['cluster']['slurm']['version']} - + # Apply possible Slurm patches shopt -s nullglob # with this an empty slurm_patches directory does not trigger the loop for patch in #{node['cluster']['sources_dir']}/slurm_patches/*.diff; do From bf513df7a9cf41983ede3004978b05f26d8224b3 Mon Sep 17 00:00:00 2001 From: Jacopo De Amicis Date: Tue, 10 Jan 2023 18:02:57 +0100 Subject: [PATCH 8/8] Use patch instead of git apply to patch Slurm Remove git from the base packages for CentOS and Debian (Ubuntu). Signed-off-by: Jacopo De Amicis --- attributes/default.rb | 4 ++-- cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/attributes/default.rb b/attributes/default.rb index 00bbcd5294..bbf10ef638 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -464,7 +464,7 @@ libical-devel postgresql-devel postgresql-server sendmail libxml2-devel libglvnd-devel mdadm python python-pip libssh2-devel libgcrypt-devel libevent-devel glibc-static bind-utils iproute NetworkManager-config-routing-rules python3 python3-pip iptables libcurl-devel yum-plugin-versionlock - coreutils moreutils sssd sssd-tools sssd-ldap curl git) + coreutils moreutils sssd sssd-tools sssd-ldap curl) default['cluster']['rhel']['extra_repo'] = 'rhui-REGION-rhel-server-optional' if node['platform_version'].to_i == 7 && node['kernel']['machine'] == 'aarch64' @@ -501,7 +501,7 @@ r-base libblas-dev libffi-dev libxml2-dev mdadm libgcrypt20-dev libmysqlclient-dev libevent-dev iproute2 python3 python3-pip libatlas-base-dev libglvnd-dev iptables libcurl4-openssl-dev - coreutils moreutils sssd sssd-tools sssd-ldap curl git) + coreutils moreutils sssd sssd-tools sssd-ldap curl) case node['platform_version'] when '18.04' diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb b/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb index 85730fb50c..b7a3d1990f 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb @@ -98,7 +98,7 @@ shopt -s nullglob # with this an empty slurm_patches directory does not trigger the loop for patch in #{node['cluster']['sources_dir']}/slurm_patches/*.diff; do echo "Applying patch ${patch}..." - git apply ${patch} + patch --ignore-whitespace -p1 < ${patch} echo "...DONE." done shopt -u nullglob