diff --git a/CHANGELOG.md b/CHANGELOG.md
index affa31d6aa..5a6f66dc3b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
 **CHANGES**
 - Ubuntu 20.04 is no longer supported.
 - Upgrade Slurm to version 24.11.5.
+- Addressed cluster id mismatch known issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting.
 - Upgrade DCV to version 2024.0-19030.
 - Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.
 
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb
index 57c304b531..7c97318fe5 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb
@@ -76,6 +76,10 @@
   action action
 end unless on_docker?
 
+file "/var/spool/slurm.state/clustername" do
+  action :delete
+end
+
 if node['cluster']['slurmdbd_service_enabled'] == "true"
   # After starting slurmdbd the database may not be fully responsive yet and
   # its bootstrapping may fail. We need to wait for sacctmgr to successfully
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb
index 9aca50d359..ece9c76919 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb
@@ -23,3 +23,7 @@
   supports restart: false
   action %i(disable stop)
 end
+
+file '/var/spool/slurm.state/clustername' do
+  action :delete
+end
diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/clear_slurm_accounting_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/clear_slurm_accounting_spec.rb
new file mode 100644
index 0000000000..39dff104dd
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/clear_slurm_accounting_spec.rb
@@ -0,0 +1,28 @@
+require 'spec_helper'
+
+describe 'aws-parallelcluster-slurm::clear_slurm_accounting' do
+  for_all_oses do |platform, version|
+    context "on #{platform}#{version}" do
+      cached(:chef_run) do
+        runner = runner(platform: platform, version: version) do |node|
+          mock_file_exists("/var/spool/slurm.state/clustername", true)
+          node.override['cluster']['slurmdbd_service_enabled'] = true
+        end
+        runner.converge(described_recipe)
+      end
+      cached(:node) { chef_run.node }
+
+      it 'stops the slurm database daemon' do
+        is_expected.to disable_service("slurmdbd")
+      end
+
+      it 'deletes the Slurm database password update script' do
+        is_expected.to delete_file("#{node['cluster']['scripts_dir']}/slurm/update_slurm_database_password.sh")
+      end
+
+      it 'Removes existing cluster name state file' do
+        is_expected.to delete_file('/var/spool/slurm.state/clustername')
+      end
+    end
+  end
+end
diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurm_accounting_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurm_accounting_spec.rb
index 822d692d2b..f2025932a4 100644
--- a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurm_accounting_spec.rb
+++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurm_accounting_spec.rb
@@ -10,6 +10,7 @@
             allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(false)
             allow_any_instance_of(Object).to receive(:dig).and_return(true)
             RSpec::Mocks.configuration.allow_message_expectations_on_nil = true
+            mock_file_exists("/var/spool/slurm.state/clustername", true)
             node.override['cluster']['slurmdbd_service_enabled'] = enable_service
           end
           runner.converge(described_recipe)
@@ -70,6 +71,9 @@
           )
         end
         if enable_service == "true"
+          it 'Removes existing cluster name state file' do
+            is_expected.to delete_file('/var/spool/slurm.state/clustername')
+          end
           it 'starts the slurm database daemon' do
             is_expected.to enable_service("slurmdbd")
             is_expected.to start_service("slurmdbd")