CHANGELOG.md (1 change: 1 addition & 0 deletions)
@@ -36,6 +36,7 @@ This file is used to list changes made in each version of the AWS ParallelCluster
 **BUG FIXES**
 - Fix issue making job fail when submitted as active directory user from login nodes.
   The issue was caused by an incomplete configuration of the integration with the external Active Directory on the head node.
+- Fix issue making login nodes fail to bootstrap when the head node takes more time than expected in writing keys.
 
 3.8.0
 ------
@@ -17,9 +17,11 @@

 return if on_docker? || node['cluster']['scheduler'] == 'awsbatch'
 
-script_name = "keys-manager.sh"
 keys_dir = "#{node['cluster']['shared_dir_login_nodes']}"
 script_dir = "#{keys_dir}/scripts"
+script_path = "#{script_dir}/keys-manager.sh"
 
+sync_file_path = "#{keys_dir}/.login_nodes_keys_sync_file"
+
 case node['cluster']['node_type']
 when 'ComputeFleet'
@@ -32,21 +34,26 @@
     recursive true
   end
 
-  cookbook_file script_dir + '/' + script_name do
+  cookbook_file script_path do
     source 'login_nodes/keys-manager.sh'
     owner 'root'
     group 'root'
     mode '0744'
   end
 
   execute 'Initialize Login Nodes keys' do
-    command "bash #{script_dir}/#{script_name} --create --folder-path #{keys_dir}"
+    command "bash #{script_path} --create --folder-path #{keys_dir}"
     user 'root'
   end
 
+  write_sync_file(sync_file_path)
+
 when 'LoginNode'
 
+  wait_sync_file(sync_file_path)
+
   execute 'Import Login Nodes keys' do
-    command "bash #{script_dir}/#{script_name} --import --folder-path #{keys_dir}"
+    command "bash #{script_path} --import --folder-path #{keys_dir}"
     user 'root'
   end
 else
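The net effect of this recipe change is a simple handshake over the shared directory: the head node writes the keys and then publishes a sync file, and each login node blocks on that file before importing. A minimal sketch of the ordering (illustrative only; the names match the diff above and the helpers defined later in this PR):

```ruby
# Illustrative summary of the handshake; the real resources are in the diff above.
sync_file_path = "#{node['cluster']['shared_dir_login_nodes']}/.login_nodes_keys_sync_file"

case node['cluster']['node_type']
when 'HeadNode'
  # 1. Create the keys under the shared folder (keys-manager.sh --create).
  # 2. Publish the sync file stamped with the current cluster config version.
  write_sync_file(sync_file_path)
when 'LoginNode'
  # 3. Block until the sync file carries the expected config version,
  #    then import the keys (keys-manager.sh --import), knowing the head
  #    node has finished writing them.
  wait_sync_file(sync_file_path)
end
```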
@@ -1,33 +1,39 @@
 require 'spec_helper'
 
 describe 'aws-parallelcluster-environment::login_nodes_keys' do
+  SHARED_DIR_LOGIN_NODES = "/SHARED_DIR_LOGIN_NODES".freeze
+  SYNC_FILE = "#{SHARED_DIR_LOGIN_NODES}/.login_nodes_keys_sync_file".freeze
+  CLUSTER_CONFIG_VERSION = "CLUSTER_CONFIG_VERSION".freeze
+
   for_all_oses do |platform, version|
     context "on #{platform}#{version}" do
       context "when awsbatch scheduler" do
         cached(:chef_run) do
           runner = runner(platform: platform, version: version) do |node|
             node.override['cluster']['scheduler'] = 'awsbatch'
+            node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
           end
           runner.converge(described_recipe)
         end
         cached(:node) { chef_run.node }
 
         it 'does not create the script directory' do
-          is_expected.to_not create_directory("#{node['cluster']['shared_dir_login_nodes']}/scripts")
+          is_expected.to_not create_directory("#{SHARED_DIR_LOGIN_NODES}/scripts")
         end
       end
 
       context "when compute node" do
         cached(:chef_run) do
           runner = runner(platform: platform, version: version) do |node|
             node.override['cluster']['node_type'] = 'ComputeFleet'
+            node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
           end
           runner.converge(described_recipe)
         end
         cached(:node) { chef_run.node }
 
         it 'does not create the scripts directory' do
-          is_expected.to_not create_directory("#{node['cluster']['shared_dir_login_nodes']}/scripts")
+          is_expected.to_not create_directory("#{SHARED_DIR_LOGIN_NODES}/scripts")
         end
       end
 
@@ -36,22 +42,23 @@
           runner = runner(platform: platform, version: version) do |node|
             node.override['cluster']['node_type'] = 'HeadNode'
             node.override['cluster']['scheduler'] = 'slurm'
-            node.override['cluster']['shared_dir_login_nodes'] = '/opt/parallelcluster/shared_login_nodes'
+            node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
+            node.override['cluster']['cluster_config_version'] = CLUSTER_CONFIG_VERSION
           end
           runner.converge(described_recipe)
         end
         cached(:node) { chef_run.node }
 
         it 'creates the scripts directory' do
-          is_expected.to create_directory("#{node['cluster']['shared_dir_login_nodes']}/scripts").with(
+          is_expected.to create_directory("#{SHARED_DIR_LOGIN_NODES}/scripts").with(
             owner: 'root',
             group: 'root',
             mode: '0744'
           )
         end
 
         it 'creates keys-manager.sh script' do
-          is_expected.to create_cookbook_file("#{node['cluster']['shared_dir_login_nodes']}/scripts/keys-manager.sh").with(
+          is_expected.to create_cookbook_file("#{SHARED_DIR_LOGIN_NODES}/scripts/keys-manager.sh").with(
             source: 'login_nodes/keys-manager.sh',
             owner: "root",
             group: "root",
@@ -61,7 +68,16 @@

it "creates login nodes keys" do
is_expected.to run_execute("Initialize Login Nodes keys")
.with(command: "bash #{node['cluster']['shared_dir_login_nodes']}/scripts/keys-manager.sh --create --folder-path #{node['cluster']['shared_dir_login_nodes']}")
.with(command: "bash #{SHARED_DIR_LOGIN_NODES}/scripts/keys-manager.sh --create --folder-path #{SHARED_DIR_LOGIN_NODES}")
end

it "writes the synchronization file for login nodes" do
is_expected.to create_file(SYNC_FILE).with(
content: CLUSTER_CONFIG_VERSION,
mode: '0644',
owner: 'root',
group: 'root'
)
end
end

@@ -70,15 +86,25 @@
           runner = runner(platform: platform, version: version) do |node|
             node.override['cluster']['node_type'] = 'LoginNode'
             node.override['cluster']['scheduler'] = 'slurm'
-            node.override['cluster']['shared_dir_login_nodes'] = '/opt/parallelcluster/shared_login_nodes'
+            node.override['cluster']['shared_dir_login_nodes'] = SHARED_DIR_LOGIN_NODES
+            node.override['cluster']['cluster_config_version'] = CLUSTER_CONFIG_VERSION
          end
           runner.converge(described_recipe)
         end
         cached(:node) { chef_run.node }
 
+        it "waits for cluster config version file" do
+          is_expected.to run_bash("Wait for synchronization file at #{SYNC_FILE} to be written for version #{CLUSTER_CONFIG_VERSION}").with(
+            code: "[[ \"$(cat #{SYNC_FILE})\" == \"#{CLUSTER_CONFIG_VERSION}\" ]] || exit 1",
+            retries: 30,
+            retry_delay: 10,
+            timeout: 5
+          )
+        end
+
         it "imports login nodes keys" do
           is_expected.to run_execute("Import Login Nodes keys")
-            .with(command: "bash #{node['cluster']['shared_dir_login_nodes']}/scripts/keys-manager.sh --import --folder-path #{node['cluster']['shared_dir_login_nodes']}")
+            .with(command: "bash #{SHARED_DIR_LOGIN_NODES}/scripts/keys-manager.sh --import --folder-path #{SHARED_DIR_LOGIN_NODES}")
         end
       end
     end
@@ -80,7 +80,8 @@

   if new_resource.update
     # Wait for the head node to write the config version file, which is the signal that
-    # all configuration files within the shared folder are aligned with the latest config version.
+    # all configuration files (cluster config, change-set, ...) have been written to the shared folder
+    # and are aligned with the latest config version.
     # This is required only on update because on create it is guaranteed that this recipe is executed on the compute node
     # only after it has completed on the head node.
     wait_cluster_config_file(sync_file_login_nodes)
cookbooks/aws-parallelcluster-shared/libraries/helpers.rb (26 changes: 26 additions & 0 deletions)
@@ -81,3 +81,29 @@ def is_custom_node?
   custom_node_package = node['cluster']['custom_node_package']
   !custom_node_package.nil? && !custom_node_package.empty?
 end
+
+def write_sync_file(path)
+  # Write a synchronization file containing the current cluster config version.
+  # Synchronization files are used as a synchronization point between cluster nodes
+  # to signal that a group of actions has been completed.
+  file path do
+    content node["cluster"]["cluster_config_version"]
+    mode "0644"
+    owner "root"
+    group "root"
+  end
+end
+
+def wait_sync_file(path)
+  # Wait for a synchronization file to be written for the current cluster config version.
+  # Synchronization files are used as a synchronization point between cluster nodes
+  # to signal that a group of actions has been completed.
+  cluster_config_version = node["cluster"]["cluster_config_version"]
+  # Retry until the file exists and contains the current cluster config version.
+  bash "Wait for synchronization file at #{path} to be written for version #{cluster_config_version}" do
+    code "[[ \"$(cat #{path})\" == \"#{cluster_config_version}\" ]] || exit 1"
+    retries 30
+    retry_delay 10
+    timeout 5
+  end
+end
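A note on the retry budget in `wait_sync_file`: with `retries 30` and `retry_delay 10`, a login node waits roughly 30 × 10 = 300 seconds (about five minutes) for the head node to publish the file, while `timeout 5` bounds each individual check. A head node that never writes the expected version therefore fails the login node converge with a clear resource error instead of hanging it indefinitely.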
cookbooks/aws-parallelcluster-slurm/libraries/update.rb (7 changes: 6 additions & 1 deletion)
@@ -36,7 +36,12 @@ def are_bulk_custom_slurm_settings_updated?

 def are_mount_or_unmount_required?
   require 'json'
-  change_set = JSON.load_file("#{node['cluster']['shared_dir']}/change-set.json")
+
+  change_set_path = node["cluster"]["change_set_path"]
+
+  return false unless ::File.exist?(change_set_path)
+
+  change_set = JSON.load_file(change_set_path)
   change_set["changeSet"].each do |change|
     next unless change["updatePolicy"] == "SHARED_STORAGE_UPDATE_POLICY"
 
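For reference, `are_mount_or_unmount_required?` only inspects the `changeSet` array and each entry's `updatePolicy` key. A hypothetical parsed change set, sketched below, shows which entries the loop considers; only those two key names come from the code above, the second policy value is made up for illustration:

```ruby
# Hypothetical change-set content after JSON.load_file; only "changeSet"
# and "updatePolicy" are key names taken from the helper above.
change_set = {
  "changeSet" => [
    { "updatePolicy" => "SHARED_STORAGE_UPDATE_POLICY" }, # examined by the loop
    { "updatePolicy" => "OTHER_UPDATE_POLICY" },          # skipped by `next unless`
  ],
}
```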
@@ -1,20 +1,33 @@
 require 'spec_helper'
 
-describe "aws-parallelcluster-slurm:libraries:are_mount_unmount_required" do
+describe "aws-parallelcluster-slurm:libraries:are_mount_or_unmount_required" do
+  CHANGE_SET_PATH = "/CHANGE_SET_PATH".freeze
 
   let(:node) do
     {
-      "cluster" => { "shared_dir" => "/SHARED_DIR" },
+      "cluster" => { "change_set_path" => CHANGE_SET_PATH },
     }
   end
 
   shared_examples "the correct method" do |changeset, expected_result|
     it "returns #{expected_result}" do
-      allow(File).to receive(:read).with("/SHARED_DIR/change-set.json").and_return(JSON.dump(changeset))
+      if changeset.nil?
+        allow(File).to receive(:exist?).with(CHANGE_SET_PATH).and_return(false)
+        allow(File).to receive(:read).with(CHANGE_SET_PATH).and_call_original
+      else
+        allow(File).to receive(:exist?).with(CHANGE_SET_PATH).and_return(true)
+        allow(File).to receive(:read).with(CHANGE_SET_PATH).and_return(JSON.dump(changeset))
+      end
       result = are_mount_or_unmount_required?
       expect(result).to eq(expected_result)
     end
   end
 
+  context "when changeset does not exist" do
+    changeset = nil
+    include_examples "the correct method", changeset, false
+  end
+
   context "when changeset is empty" do
     changeset = {
       "changeSet" => [],