Skip to content

Commit 6122d06

Browse files
committed
[LoginNodes] Add synchronization point to make login nodes
wait for the head node to write keys. Signed-off-by: Giacomo Marciani <mgiacomo@amazon.com>
1 parent 42511eb commit 6122d06

File tree

4 files changed

+72
-12
lines changed

4 files changed

+72
-12
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
3434
**BUG FIXES**
3535
- Fix issue making jobs fail when submitted as an Active Directory user from login nodes.
3636
The issue was caused by an incomplete configuration of the integration with the external Active Directory on the head node.
37+
- Fix issue making login nodes fail to bootstrap when the head node takes longer than expected to write keys.
3738

3839
3.8.0
3940
------

cookbooks/aws-parallelcluster-environment/recipes/config/login_nodes_keys.rb

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717

1818
return if on_docker? || node['cluster']['scheduler'] == 'awsbatch'
1919

20-
script_name = "keys-manager.sh"
2120
keys_dir = "#{node['cluster']['shared_dir_login_nodes']}"
2221
script_dir = "#{keys_dir}/scripts"
22+
script_path = "#{script_dir}/keys-manager.sh"
23+
24+
sync_file_path = "#{keys_dir}/.login_nodes_keys_sync_file"
2325

2426
case node['cluster']['node_type']
2527
when 'ComputeFleet'
@@ -32,21 +34,26 @@
3234
recursive true
3335
end
3436

35-
cookbook_file script_dir + '/' + script_name do
37+
cookbook_file script_path do
3638
source 'login_nodes/keys-manager.sh'
3739
owner 'root'
3840
group 'root'
3941
mode '0744'
4042
end
4143

4244
execute 'Initialize Login Nodes keys' do
43-
command "bash #{script_dir}/#{script_name} --create --folder-path #{keys_dir}"
45+
command "bash #{script_path} --create --folder-path #{keys_dir}"
4446
user 'root'
4547
end
4648

49+
write_sync_file(sync_file_path)
50+
4751
when 'LoginNode'
52+
53+
wait_sync_file(sync_file_path)
54+
4855
execute 'Import Login Nodes keys' do
49-
command "bash #{script_dir}/#{script_name} --import --folder-path #{keys_dir}"
56+
command "bash #{script_path} --import --folder-path #{keys_dir}"
5057
user 'root'
5158
end
5259
else

cookbooks/aws-parallelcluster-environment/spec/unit/recipes/login_nodes_keys_spec.rb

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,39 @@
11
require 'spec_helper'
22

33
describe 'aws-parallelcluster-environment::login_nodes_keys' do
4+
SHARED_DIR_LOGIN_NODES = "/SHARED_DIR_LOGIN_NODES".freeze
5+
SYNC_FILE = "#{SHARED_DIR_LOGIN_NODES}/.login_nodes_keys_sync_file".freeze
6+
CLUSTER_CONFIG_VERSION = "CLUSTER_CONFIG_VERSION".freeze
7+
48
for_all_oses do |platform, version|
59
context "on #{platform}#{version}" do
610
context "when awsbatch scheduler" do
711
cached(:chef_run) do
812
runner = runner(platform: platform, version: version) do |node|
913
node.override['cluster']['scheduler'] = 'awsbatch'
14+
node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
1015
end
1116
runner.converge(described_recipe)
1217
end
1318
cached(:node) { chef_run.node }
1419

1520
it 'does not create the script directory' do
16-
is_expected.to_not create_directory("#{node['cluster']['shared_dir_login_nodes']}/scripts")
21+
is_expected.to_not create_directory("#{SHARED_DIR_LOGIN_NODES}/scripts")
1722
end
1823
end
1924

2025
context "when compute node" do
2126
cached(:chef_run) do
2227
runner = runner(platform: platform, version: version) do |node|
2328
node.override['cluster']['node_type'] = 'ComputeFleet'
29+
node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
2430
end
2531
runner.converge(described_recipe)
2632
end
2733
cached(:node) { chef_run.node }
2834

2935
it 'does not create the scripts directory' do
30-
is_expected.to_not create_directory("#{node['cluster']['shared_dir_login_nodes']}/scripts")
36+
is_expected.to_not create_directory("#{SHARED_DIR_LOGIN_NODES}/scripts")
3137
end
3238
end
3339

@@ -36,22 +42,23 @@
3642
runner = runner(platform: platform, version: version) do |node|
3743
node.override['cluster']['node_type'] = 'HeadNode'
3844
node.override['cluster']['scheduler'] = 'slurm'
39-
node.override['cluster']['shared_dir_login_nodes'] = '/opt/parallelcluster/shared_login_nodes'
45+
node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
46+
node.override['cluster']['cluster_config_version'] = CLUSTER_CONFIG_VERSION
4047
end
4148
runner.converge(described_recipe)
4249
end
4350
cached(:node) { chef_run.node }
4451

4552
it 'creates the scripts directory' do
46-
is_expected.to create_directory("#{node['cluster']['shared_dir_login_nodes']}/scripts").with(
53+
is_expected.to create_directory("#{SHARED_DIR_LOGIN_NODES}/scripts").with(
4754
owner: 'root',
4855
group: 'root',
4956
mode: '0744'
5057
)
5158
end
5259

5360
it 'creates keys-manager.sh script' do
54-
is_expected.to create_cookbook_file("#{node['cluster']['shared_dir_login_nodes']}/scripts/keys-manager.sh").with(
61+
is_expected.to create_cookbook_file("/SHARED_DIR_LOGIN_NODES/scripts/keys-manager.sh").with(
5562
source: 'login_nodes/keys-manager.sh',
5663
owner: "root",
5764
group: "root",
@@ -61,7 +68,16 @@
6168

6269
it "creates login nodes keys" do
6370
is_expected.to run_execute("Initialize Login Nodes keys")
64-
.with(command: "bash #{node['cluster']['shared_dir_login_nodes']}/scripts/keys-manager.sh --create --folder-path #{node['cluster']['shared_dir_login_nodes']}")
71+
.with(command: "bash #{SHARED_DIR_LOGIN_NODES}/scripts/keys-manager.sh --create --folder-path #{SHARED_DIR_LOGIN_NODES}")
72+
end
73+
74+
it "writes the synchronization file for login nodes" do
75+
is_expected.to create_file(SYNC_FILE).with(
76+
content: CLUSTER_CONFIG_VERSION,
77+
mode: '0644',
78+
owner: 'root',
79+
group: 'root'
80+
)
6581
end
6682
end
6783

@@ -70,15 +86,25 @@
7086
runner = runner(platform: platform, version: version) do |node|
7187
node.override['cluster']['node_type'] = 'LoginNode'
7288
node.override['cluster']['scheduler'] = 'slurm'
73-
node.override['cluster']['shared_dir_login_nodes'] = '/opt/parallelcluster/shared_login_nodes'
89+
node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
90+
node.override['cluster']['cluster_config_version'] = CLUSTER_CONFIG_VERSION
7491
end
7592
runner.converge(described_recipe)
7693
end
7794
cached(:node) { chef_run.node }
7895

96+
it "waits for cluster config version file" do
97+
is_expected.to run_bash("Wait for synchronization file at #{SYNC_FILE} to be written for version #{CLUSTER_CONFIG_VERSION}").with(
98+
code: "[[ \"$(cat #{SYNC_FILE})\" == \"#{CLUSTER_CONFIG_VERSION}\" ]] || exit 1",
99+
retries: 30,
100+
retry_delay: 10,
101+
timeout: 5
102+
)
103+
end
104+
79105
it "imports login nodes keys" do
80106
is_expected.to run_execute("Import Login Nodes keys")
81-
.with(command: "bash #{node['cluster']['shared_dir_login_nodes']}/scripts/keys-manager.sh --import --folder-path #{node['cluster']['shared_dir_login_nodes']}")
107+
.with(command: "bash #{SHARED_DIR_LOGIN_NODES}/scripts/keys-manager.sh --import --folder-path #{SHARED_DIR_LOGIN_NODES}")
82108
end
83109
end
84110
end

cookbooks/aws-parallelcluster-shared/libraries/helpers.rb

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,29 @@ def is_custom_node?
8181
custom_node_package = node['cluster']['custom_node_package']
8282
!custom_node_package.nil? && !custom_node_package.empty?
8383
end
84+
85+
# Write a synchronization file at `path` containing the current cluster
# config version (node["cluster"]["cluster_config_version"]).
#
# Synchronization files act as a synchronization point between cluster nodes:
# their presence, with the expected version as content, signals that a group
# of actions has been completed.
#
# path - location of the synchronization file to write.
def write_sync_file(path)
  config_version = node["cluster"]["cluster_config_version"]

  file(path) do
    owner "root"
    group "root"
    mode "0644"
    content config_version
  end
end
96+
97+
# Wait until the synchronization file at `path` contains the current cluster
# config version (node["cluster"]["cluster_config_version"]).
#
# Synchronization files act as a synchronization point between cluster nodes:
# once the file holds the expected version, the group of actions it guards
# has been completed.
#
# path - location of the synchronization file to poll.
def wait_sync_file(path)
  cluster_config_version = node["cluster"]["cluster_config_version"]
  wait_label = "Wait for synchronization file at #{path} to be written for version #{cluster_config_version}"
  # Exits non-zero unless the file content equals the expected version,
  # which makes the resource fail so that it gets retried.
  version_check = %([[ "$(cat #{path})" == "#{cluster_config_version}" ]] || exit 1)

  bash(wait_label) do
    code version_check
    # Each attempt is bounded by the timeout; failed attempts are retried
    # after the configured delay, up to the configured number of retries.
    timeout 5
    retry_delay 10
    retries 30
  end
end

0 commit comments

Comments
 (0)