From a3d67f7344061737b8565cfe9f91cf44d744ff21 Mon Sep 17 00:00:00 2001
From: Giacomo Marciani
Date: Fri, 1 Mar 2024 10:52:36 +0100
Subject: [PATCH 1/4] [DFSM] Clarify comments and resource description related to cluster config file.

Signed-off-by: Giacomo Marciani
---
 .../aws-parallelcluster-platform/resources/fetch_config.rb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb b/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb
index c98ccc7473..0f2b492ad0 100644
--- a/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb
+++ b/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb
@@ -80,7 +80,8 @@
 if new_resource.update
   # Wait for the head node to write the config version file, which is the signal that
-  # all configuration files within the shared folder are aligned with the latest config version.
+  # all configuration files (cluster config, change-set, ...) have been written to the shared folder
+  # and are aligned with the latest config version.
   # This is required only on update because on create it is guaranteed that this recipe is executed on compute node
   # only after it has completed on head node.
   wait_cluster_config_file(sync_file_login_nodes)

From fe8fabe1582fcba9bea58cd5f1568dd3ec81c023 Mon Sep 17 00:00:00 2001
From: Giacomo Marciani
Date: Fri, 1 Mar 2024 10:55:33 +0100
Subject: [PATCH 2/4] [DFSM] Make the check 'are_mount_or_unmount_required?' resilient to a missing change-set file.

In particular, we consider mount/unmount not required if the change-set
file is missing.

Signed-off-by: Giacomo Marciani
---
 .../libraries/update.rb                |  7 ++++++-
 .../spec/unit/libraries/update_spec.rb | 17 +++++++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/update.rb b/cookbooks/aws-parallelcluster-slurm/libraries/update.rb
index ffcc0d590b..d4747d502c 100644
--- a/cookbooks/aws-parallelcluster-slurm/libraries/update.rb
+++ b/cookbooks/aws-parallelcluster-slurm/libraries/update.rb
@@ -36,7 +36,12 @@ def are_bulk_custom_slurm_settings_updated?
 
 def are_mount_or_unmount_required?
   require 'json'
-  change_set = JSON.load_file("#{node['cluster']['shared_dir']}/change-set.json")
+
+  change_set_path = node["cluster"]["change_set_path"]
+
+  return false unless ::File.exist?(change_set_path)
+
+  change_set = JSON.load_file(change_set_path)
   change_set["changeSet"].each do |change|
     next unless change["updatePolicy"] == "SHARED_STORAGE_UPDATE_POLICY"

diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/libraries/update_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/libraries/update_spec.rb
index e1ef443e0d..d68ea19e6c 100644
--- a/cookbooks/aws-parallelcluster-slurm/spec/unit/libraries/update_spec.rb
+++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/libraries/update_spec.rb
@@ -1,20 +1,33 @@
 require 'spec_helper'
 
 describe "aws-parallelcluster-slurm:libraries:are_mount_unmount_required" do
+  CHANGE_SET_PATH = "/CHANGE_SET_PATH"
+
   let(:node) do
     {
-      "cluster" => { "shared_dir" => "/SHARED_DIR" },
+      "cluster" => { "change_set_path" => CHANGE_SET_PATH },
     }
   end
 
   shared_examples "the correct method" do |changeset, expected_result|
     it "returns #{expected_result}" do
-      allow(File).to receive(:read).with("/SHARED_DIR/change-set.json").and_return(JSON.dump(changeset))
+      if changeset.nil?
+        allow(File).to receive(:exist?).with(CHANGE_SET_PATH).and_return(false)
+        allow(File).to receive(:read).with(CHANGE_SET_PATH).and_call_original
+      else
+        allow(File).to receive(:exist?).with(CHANGE_SET_PATH).and_return(true)
+        allow(File).to receive(:read).with(CHANGE_SET_PATH).and_return(JSON.dump(changeset))
+      end
       result = are_mount_or_unmount_required?
       expect(result).to eq(expected_result)
     end
   end
 
+  context "when changeset does not exist" do
+    changeset = nil
+    include_examples "the correct method", changeset, false
+  end
+
   context "when changeset is empty" do
     changeset = {
       "changeSet" => [],

From 42511eb655bb5229f209c038d1fae03b1f2d1223 Mon Sep 17 00:00:00 2001
From: Giacomo Marciani
Date: Fri, 1 Mar 2024 12:37:24 +0100
Subject: [PATCH 3/4] [DFSM][Test] Fix title and var definition in spec test covering method 'are_mount_or_unmount_required'.

Signed-off-by: Giacomo Marciani
---
 .../spec/unit/libraries/update_spec.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/libraries/update_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/libraries/update_spec.rb
index d68ea19e6c..c4e9c8297b 100644
--- a/cookbooks/aws-parallelcluster-slurm/spec/unit/libraries/update_spec.rb
+++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/libraries/update_spec.rb
@@ -1,7 +1,7 @@
 require 'spec_helper'
 
-describe "aws-parallelcluster-slurm:libraries:are_mount_unmount_required" do
-  CHANGE_SET_PATH = "/CHANGE_SET_PATH"
+describe "aws-parallelcluster-slurm:libraries:are_mount_or_unmount_required" do
+  CHANGE_SET_PATH = "/CHANGE_SET_PATH".freeze
 
   let(:node) do
     {

From 23feaf0b21f77dbe2665bbbf21293d585aced9ed Mon Sep 17 00:00:00 2001
From: Giacomo Marciani
Date: Fri, 1 Mar 2024 19:26:52 +0100
Subject: [PATCH 4/4] [LoginNodes] Add synchronization point to make login nodes wait for the head node to write keys.

Signed-off-by: Giacomo Marciani
---
 CHANGELOG.md                              |  1 +
 .../recipes/config/login_nodes_keys.rb    | 15 +++++--
 .../unit/recipes/login_nodes_keys_spec.rb | 42 +++++++++++++++----
 .../libraries/helpers.rb                  | 26 ++++++++++++
 4 files changed, 72 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 42d9a154e6..15c2822705 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -34,6 +34,7 @@ This file is used to list changes made in each version of the AWS ParallelCluster cookbook.
 
 **BUG FIXES**
 - Fix issue making job fail when submitted as active directory user from login nodes. The issue was caused by an incomplete configuration of the integration with the external Active Directory on the head node.
+- Fix issue making login nodes fail to bootstrap when the head node takes longer than expected to write keys.
 
 3.8.0
 ------

diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/login_nodes_keys.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/login_nodes_keys.rb
index 395e50da7f..e30df9877c 100644
--- a/cookbooks/aws-parallelcluster-environment/recipes/config/login_nodes_keys.rb
+++ b/cookbooks/aws-parallelcluster-environment/recipes/config/login_nodes_keys.rb
@@ -17,9 +17,11 @@
 return if on_docker? || node['cluster']['scheduler'] == 'awsbatch'
 
-script_name = "keys-manager.sh"
 keys_dir = "#{node['cluster']['shared_dir_login_nodes']}"
 script_dir = "#{keys_dir}/scripts"
+script_path = "#{script_dir}/keys-manager.sh"
+
+sync_file_path = "#{keys_dir}/.login_nodes_keys_sync_file"
 
 case node['cluster']['node_type']
 when 'ComputeFleet'
@@ -32,7 +34,7 @@
     recursive true
   end
 
-  cookbook_file script_dir + '/' + script_name do
+  cookbook_file script_path do
     source 'login_nodes/keys-manager.sh'
     owner 'root'
     group 'root'
@@ -40,13 +42,18 @@
   end
 
   execute 'Initialize Login Nodes keys' do
-    command "bash #{script_dir}/#{script_name} --create --folder-path #{keys_dir}"
+    command "bash #{script_path} --create --folder-path #{keys_dir}"
     user 'root'
   end
+
+  write_sync_file(sync_file_path)
+
 when 'LoginNode'
+
+  wait_sync_file(sync_file_path)
+
   execute 'Import Login Nodes keys' do
-    command "bash #{script_dir}/#{script_name} --import --folder-path #{keys_dir}"
+    command "bash #{script_path} --import --folder-path #{keys_dir}"
     user 'root'
   end
 else

diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/login_nodes_keys_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/login_nodes_keys_spec.rb
index 2d05e15bd0..5603982d57 100644
--- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/login_nodes_keys_spec.rb
+++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/login_nodes_keys_spec.rb
@@ -1,19 +1,24 @@
 require 'spec_helper'
 
 describe 'aws-parallelcluster-environment::login_nodes_keys' do
+  SHARED_DIR_LOGIN_NODES = "/SHARED_DIR_LOGIN_NODES".freeze
+  SYNC_FILE = "#{SHARED_DIR_LOGIN_NODES}/.login_nodes_keys_sync_file".freeze
+  CLUSTER_CONFIG_VERSION = "CLUSTER_CONFIG_VERSION".freeze
+
   for_all_oses do |platform, version|
     context "on #{platform}#{version}" do
       context "when awsbatch scheduler" do
         cached(:chef_run) do
           runner = runner(platform: platform, version: version) do |node|
             node.override['cluster']['scheduler'] = 'awsbatch'
+            node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
           end
           runner.converge(described_recipe)
         end
         cached(:node) { chef_run.node }
 
         it 'does not create the script directory' do
-          is_expected.to_not create_directory("#{node['cluster']['shared_dir_login_nodes']}/scripts")
+          is_expected.to_not create_directory("#{SHARED_DIR_LOGIN_NODES}/scripts")
         end
       end
 
@@ -21,13 +26,14 @@
         cached(:chef_run) do
           runner = runner(platform: platform, version: version) do |node|
             node.override['cluster']['node_type'] = 'ComputeFleet'
+            node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
           end
           runner.converge(described_recipe)
         end
         cached(:node) { chef_run.node }
 
         it 'does not create the scripts directory' do
-          is_expected.to_not create_directory("#{node['cluster']['shared_dir_login_nodes']}/scripts")
+          is_expected.to_not create_directory("#{SHARED_DIR_LOGIN_NODES}/scripts")
         end
       end
 
@@ -36,14 +42,15 @@
           runner = runner(platform: platform, version: version) do |node|
             node.override['cluster']['node_type'] = 'HeadNode'
             node.override['cluster']['scheduler'] = 'slurm'
-            node.override['cluster']['shared_dir_login_nodes'] = '/opt/parallelcluster/shared_login_nodes'
+            node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
+            node.override['cluster']['cluster_config_version'] = CLUSTER_CONFIG_VERSION
           end
           runner.converge(described_recipe)
         end
         cached(:node) { chef_run.node }
 
         it 'creates the scripts directory' do
-          is_expected.to create_directory("#{node['cluster']['shared_dir_login_nodes']}/scripts").with(
+          is_expected.to create_directory("#{SHARED_DIR_LOGIN_NODES}/scripts").with(
             owner: 'root',
             group: 'root',
             mode: '0744'
@@ -51,7 +58,7 @@
           )
         end
 
         it 'creates keys-manager.sh script' do
-          is_expected.to create_cookbook_file("#{node['cluster']['shared_dir_login_nodes']}/scripts/keys-manager.sh").with(
+          is_expected.to create_cookbook_file("#{SHARED_DIR_LOGIN_NODES}/scripts/keys-manager.sh").with(
             source: 'login_nodes/keys-manager.sh',
             owner: "root",
             group: "root",
@@ -61,7 +68,16 @@
 
         it "creates login nodes keys" do
           is_expected.to run_execute("Initialize Login Nodes keys")
-            .with(command: "bash #{node['cluster']['shared_dir_login_nodes']}/scripts/keys-manager.sh --create --folder-path #{node['cluster']['shared_dir_login_nodes']}")
+            .with(command: "bash #{SHARED_DIR_LOGIN_NODES}/scripts/keys-manager.sh --create --folder-path #{SHARED_DIR_LOGIN_NODES}")
         end
+
+        it "writes the synchronization file for login nodes" do
+          is_expected.to create_file(SYNC_FILE).with(
+            content: CLUSTER_CONFIG_VERSION,
+            mode: '0644',
+            owner: 'root',
+            group: 'root'
+          )
+        end
       end
 
@@ -70,15 +86,25 @@
         runner = runner(platform: platform, version: version) do |node|
           node.override['cluster']['node_type'] = 'LoginNode'
           node.override['cluster']['scheduler'] = 'slurm'
-          node.override['cluster']['shared_dir_login_nodes'] = '/opt/parallelcluster/shared_login_nodes'
+          node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
+          node.override['cluster']['cluster_config_version'] = CLUSTER_CONFIG_VERSION
         end
         runner.converge(described_recipe)
       end
       cached(:node) { chef_run.node }
 
+      it "waits for cluster config version file" do
+        is_expected.to run_bash("Wait for synchronization file at #{SYNC_FILE} to be written for version #{CLUSTER_CONFIG_VERSION}").with(
+          code: "[[ \"$(cat #{SYNC_FILE})\" == \"#{CLUSTER_CONFIG_VERSION}\" ]] || exit 1",
+          retries: 30,
+          retry_delay: 10,
+          timeout: 5
+        )
+      end
+
       it "imports login nodes keys" do
         is_expected.to run_execute("Import Login Nodes keys")
-          .with(command: "bash #{node['cluster']['shared_dir_login_nodes']}/scripts/keys-manager.sh --import --folder-path #{node['cluster']['shared_dir_login_nodes']}")
+          .with(command: "bash #{SHARED_DIR_LOGIN_NODES}/scripts/keys-manager.sh --import --folder-path #{SHARED_DIR_LOGIN_NODES}")
       end
     end
   end

diff --git a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
index 39e3f005cd..993e2f8a5b 100644
--- a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
+++ b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
@@ -81,3 +81,29 @@ def is_custom_node?
   custom_node_package = node['cluster']['custom_node_package']
   !custom_node_package.nil? && !custom_node_package.empty?
 end
+
+def write_sync_file(path)
+  # Write a synchronization file containing the current cluster config version.
+  # Synchronization files are used as a synchronization point between cluster nodes
+  # to signal that a group of actions has been completed.
+  file path do
+    content node["cluster"]["cluster_config_version"]
+    mode "0644"
+    owner "root"
+    group "root"
+  end
+end
+
+def wait_sync_file(path)
+  # Wait for a synchronization file to be written for the current cluster config version.
+  # Synchronization files are used as a synchronization point between cluster nodes
+  # to signal that a group of actions has been completed.
+  cluster_config_version = node["cluster"]["cluster_config_version"]
+  # Wait for the synchronization file to contain the current cluster config version.
+ bash "Wait for synchronization file at #{path} to be written for version #{cluster_config_version}" do + code "[[ \"$(cat #{path})\" == \"#{cluster_config_version}\" ]] || exit 1" + retries 30 + retry_delay 10 + timeout 5 + end +end
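For context, the head-node/login-node handshake introduced in PATCH 4/4 amounts to a simple polling loop: the head node writes the current cluster config version into the sync file, and login nodes retry (30 attempts, 10 seconds apart) until the file carries that version. The plain-Ruby sketch below only restates that behavior outside of Chef; the method name wait_for_sync_file and the example path/version values are illustrative and not part of the patch, which implements the wait with the bash resource shown above.

# Illustrative only: a plain-Ruby equivalent of the wait_sync_file helper's polling.
# The real helper uses a Chef bash resource with retries 30 / retry_delay 10.
def wait_for_sync_file(path, expected_version, retries: 30, delay: 10)
  retries.times do
    # Ready once the file exists and carries the config version published by
    # the head node via write_sync_file.
    return true if File.exist?(path) && File.read(path).strip == expected_version
    sleep delay
  end
  false
end

# Hypothetical usage:
# wait_for_sync_file("/opt/parallelcluster/shared_login_nodes/.login_nodes_keys_sync_file", "some-config-version")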