diff --git a/CHANGELOG.md b/CHANGELOG.md
index c5ea602e7f..7a7885b079 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -36,6 +36,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
 **BUG FIXES**
 - Fix issue making job fail when submitted as active directory user from login nodes. The issue was caused by an incomplete configuration of the integration with the external Active Directory on the head node.
+- Fix issue making login nodes fail to bootstrap when the head node takes more time than expected in writing keys.
 
 3.8.0
 ------
diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/login_nodes_keys.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/login_nodes_keys.rb
index 395e50da7f..e30df9877c 100644
--- a/cookbooks/aws-parallelcluster-environment/recipes/config/login_nodes_keys.rb
+++ b/cookbooks/aws-parallelcluster-environment/recipes/config/login_nodes_keys.rb
@@ -17,9 +17,11 @@
 
 return if on_docker? || node['cluster']['scheduler'] == 'awsbatch'
 
-script_name = "keys-manager.sh"
 keys_dir = "#{node['cluster']['shared_dir_login_nodes']}"
 script_dir = "#{keys_dir}/scripts"
+script_path = "#{script_dir}/keys-manager.sh"
+
+sync_file_path = "#{keys_dir}/.login_nodes_keys_sync_file"
 
 case node['cluster']['node_type']
 when 'ComputeFleet'
@@ -32,7 +34,7 @@
     recursive true
   end
 
-  cookbook_file script_dir + '/' + script_name do
+  cookbook_file script_path do
     source 'login_nodes/keys-manager.sh'
     owner 'root'
     group 'root'
@@ -40,13 +42,18 @@
   end
 
   execute 'Initialize Login Nodes keys' do
-    command "bash #{script_dir}/#{script_name} --create --folder-path #{keys_dir}"
+    command "bash #{script_path} --create --folder-path #{keys_dir}"
     user 'root'
   end
+
+  write_sync_file(sync_file_path)
 when 'LoginNode'
+
+  wait_sync_file(sync_file_path)
+
   execute 'Import Login Nodes keys' do
-    command "bash #{script_dir}/#{script_name} --import --folder-path #{keys_dir}"
+    command "bash #{script_path} --import --folder-path #{keys_dir}"
     user 'root'
   end
 else
diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/login_nodes_keys_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/login_nodes_keys_spec.rb
index 2d05e15bd0..5603982d57 100644
--- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/login_nodes_keys_spec.rb
+++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/login_nodes_keys_spec.rb
@@ -1,19 +1,24 @@
 require 'spec_helper'
 
 describe 'aws-parallelcluster-environment::login_nodes_keys' do
+  SHARED_DIR_LOGIN_NODES = "/SHARED_DIR_LOGIN_NODES".freeze
+  SYNC_FILE = "#{SHARED_DIR_LOGIN_NODES}/.login_nodes_keys_sync_file".freeze
+  CLUSTER_CONFIG_VERSION = "CLUSTER_CONFIG_VERSION".freeze
+
   for_all_oses do |platform, version|
     context "on #{platform}#{version}" do
       context "when awsbatch scheduler" do
         cached(:chef_run) do
           runner = runner(platform: platform, version: version) do |node|
             node.override['cluster']['scheduler'] = 'awsbatch'
+            node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
           end
           runner.converge(described_recipe)
         end
         cached(:node) { chef_run.node }
 
         it 'does not create the script directory' do
-          is_expected.to_not create_directory("#{node['cluster']['shared_dir_login_nodes']}/scripts")
+          is_expected.to_not create_directory("#{SHARED_DIR_LOGIN_NODES}/scripts")
         end
       end
 
@@ -21,13 +26,14 @@
         cached(:chef_run) do
           runner = runner(platform: platform, version: version) do |node|
             node.override['cluster']['node_type'] = 'ComputeFleet'
+            node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
           end
           runner.converge(described_recipe)
         end
         cached(:node) { chef_run.node }
 
         it 'does not create the scripts directory' do
-          is_expected.to_not create_directory("#{node['cluster']['shared_dir_login_nodes']}/scripts")
+          is_expected.to_not create_directory("#{SHARED_DIR_LOGIN_NODES}/scripts")
         end
       end
 
@@ -36,14 +42,15 @@
           runner = runner(platform: platform, version: version) do |node|
             node.override['cluster']['node_type'] = 'HeadNode'
             node.override['cluster']['scheduler'] = 'slurm'
-            node.override['cluster']['shared_dir_login_nodes'] = '/opt/parallelcluster/shared_login_nodes'
+            node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
+            node.override['cluster']['cluster_config_version'] = CLUSTER_CONFIG_VERSION
           end
           runner.converge(described_recipe)
         end
         cached(:node) { chef_run.node }
 
         it 'creates the scripts directory' do
-          is_expected.to create_directory("#{node['cluster']['shared_dir_login_nodes']}/scripts").with(
+          is_expected.to create_directory("#{SHARED_DIR_LOGIN_NODES}/scripts").with(
             owner: 'root',
             group: 'root',
             mode: '0744'
@@ -51,7 +58,7 @@
         end
 
         it 'creates keys-manager.sh script' do
-          is_expected.to create_cookbook_file("#{node['cluster']['shared_dir_login_nodes']}/scripts/keys-manager.sh").with(
+          is_expected.to create_cookbook_file("#{SHARED_DIR_LOGIN_NODES}/scripts/keys-manager.sh").with(
             source: 'login_nodes/keys-manager.sh',
             owner: "root",
             group: "root",
@@ -61,7 +68,16 @@
 
         it "creates login nodes keys" do
           is_expected.to run_execute("Initialize Login Nodes keys")
-            .with(command: "bash #{node['cluster']['shared_dir_login_nodes']}/scripts/keys-manager.sh --create --folder-path #{node['cluster']['shared_dir_login_nodes']}")
+            .with(command: "bash #{SHARED_DIR_LOGIN_NODES}/scripts/keys-manager.sh --create --folder-path #{SHARED_DIR_LOGIN_NODES}")
+        end
+
+        it "writes the synchronization file for login nodes" do
+          is_expected.to create_file(SYNC_FILE).with(
+            content: CLUSTER_CONFIG_VERSION,
+            mode: '0644',
+            owner: 'root',
+            group: 'root'
+          )
         end
       end
 
@@ -70,15 +86,25 @@
           runner = runner(platform: platform, version: version) do |node|
             node.override['cluster']['node_type'] = 'LoginNode'
             node.override['cluster']['scheduler'] = 'slurm'
-            node.override['cluster']['shared_dir_login_nodes'] = '/opt/parallelcluster/shared_login_nodes'
+            node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
+            node.override['cluster']['cluster_config_version'] = CLUSTER_CONFIG_VERSION
           end
           runner.converge(described_recipe)
         end
         cached(:node) { chef_run.node }
 
+        it "waits for cluster config version file" do
+          is_expected.to run_bash("Wait for synchronization file at #{SYNC_FILE} to be written for version #{CLUSTER_CONFIG_VERSION}").with(
+            code: "[[ \"$(cat #{SYNC_FILE})\" == \"#{CLUSTER_CONFIG_VERSION}\" ]] || exit 1",
+            retries: 30,
+            retry_delay: 10,
+            timeout: 5
+          )
+        end
+
         it "imports login nodes keys" do
           is_expected.to run_execute("Import Login Nodes keys")
-            .with(command: "bash #{node['cluster']['shared_dir_login_nodes']}/scripts/keys-manager.sh --import --folder-path #{node['cluster']['shared_dir_login_nodes']}")
+            .with(command: "bash #{SHARED_DIR_LOGIN_NODES}/scripts/keys-manager.sh --import --folder-path #{SHARED_DIR_LOGIN_NODES}")
         end
       end
     end
diff --git a/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb b/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb
index c98ccc7473..0f2b492ad0 100644
--- a/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb
+++ b/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb
@@ -80,7 +80,8 @@
   if new_resource.update
     # Wait for the head node to write the config version file, which is the signal that
-    # all configuration files within the shared folder are aligned with the latest config version.
+    # all configuration files (cluster config, change-set, ...) have been written on the shared folder
+    # and are aligned with the latest config version.
     # This is required only on update because on create it is guaranteed that this recipe is executed on compute node
     # only after it has completed on head node.
     wait_cluster_config_file(sync_file_login_nodes)
diff --git a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
index 39e3f005cd..993e2f8a5b 100644
--- a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
+++ b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
@@ -81,3 +81,29 @@ def is_custom_node?
   custom_node_package = node['cluster']['custom_node_package']
   !custom_node_package.nil? && !custom_node_package.empty?
 end
+
+def write_sync_file(path)
+  # Write a synchronization file containing the current cluster config version.
+  # Synchronization files are used as a synchronization point between cluster nodes
+  # to signal that a group of actions have been completed.
+  file path do
+    content node["cluster"]["cluster_config_version"]
+    mode "0644"
+    owner "root"
+    group "root"
+  end
+end
+
+def wait_sync_file(path)
+  # Wait for a synchronization file to be written for the current cluster config version.
+  # Synchronization files are used as a synchronization point between cluster nodes
+  # to signal that a group of actions have been completed.
+  cluster_config_version = node["cluster"]["cluster_config_version"]
+  # Wait for the config version file to contain the current cluster config version.
+  bash "Wait for synchronization file at #{path} to be written for version #{cluster_config_version}" do
+    code "[[ \"$(cat #{path})\" == \"#{cluster_config_version}\" ]] || exit 1"
+    retries 30
+    retry_delay 10
+    timeout 5
+  end
+end
diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/update.rb b/cookbooks/aws-parallelcluster-slurm/libraries/update.rb
index ffcc0d590b..d4747d502c 100644
--- a/cookbooks/aws-parallelcluster-slurm/libraries/update.rb
+++ b/cookbooks/aws-parallelcluster-slurm/libraries/update.rb
@@ -36,7 +36,12 @@ def are_bulk_custom_slurm_settings_updated?
 
 def are_mount_or_unmount_required?
   require 'json'
-  change_set = JSON.load_file("#{node['cluster']['shared_dir']}/change-set.json")
+
+  change_set_path = node["cluster"]["change_set_path"]
+
+  return false unless ::File.exist?(change_set_path)
+
+  change_set = JSON.load_file(change_set_path)
   change_set["changeSet"].each do |change|
     next unless change["updatePolicy"] == "SHARED_STORAGE_UPDATE_POLICY"
diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/libraries/update_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/libraries/update_spec.rb
index e1ef443e0d..c4e9c8297b 100644
--- a/cookbooks/aws-parallelcluster-slurm/spec/unit/libraries/update_spec.rb
+++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/libraries/update_spec.rb
@@ -1,20 +1,33 @@
 require 'spec_helper'
 
-describe "aws-parallelcluster-slurm:libraries:are_mount_unmount_required" do
+describe "aws-parallelcluster-slurm:libraries:are_mount_or_unmount_required" do
+  CHANGE_SET_PATH = "/CHANGE_SET_PATH".freeze
+
   let(:node) do
     {
-      "cluster" => { "shared_dir" => "/SHARED_DIR" },
+      "cluster" => { "change_set_path" => CHANGE_SET_PATH },
     }
   end
 
   shared_examples "the correct method" do |changeset, expected_result|
     it "returns #{expected_result}" do
-      allow(File).to receive(:read).with("/SHARED_DIR/change-set.json").and_return(JSON.dump(changeset))
+      if changeset.nil?
+        allow(File).to receive(:exist?).with(CHANGE_SET_PATH).and_return(false)
+        allow(File).to receive(:read).with(CHANGE_SET_PATH).and_call_original
+      else
+        allow(File).to receive(:exist?).with(CHANGE_SET_PATH).and_return(true)
+        allow(File).to receive(:read).with(CHANGE_SET_PATH).and_return(JSON.dump(changeset))
+      end
       result = are_mount_or_unmount_required?
       expect(result).to eq(expected_result)
     end
   end
 
+  context "when changeset does not exist" do
+    changeset = nil
+    include_examples "the correct method", changeset, false
+  end
+
   context "when changeset is empty" do
     changeset = {
       "changeSet" => [],