From d6c8b449c566620181f8eaf966137be8b3367e0c Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Tue, 26 Nov 2024 11:52:59 +0100 Subject: [PATCH] Auto-restart slurmctld on failure after 1 second. Signed-off-by: Giacomo Marciani --- CHANGELOG.md | 1 + .../spec/unit/recipes/config_slurmctld_systemd_service_spec.rb | 2 ++ .../templates/default/slurm/head_node/slurmctld.service.erb | 2 ++ 3 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e04e3e2994..f28bd9aef1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - xdcv: `2024.0.631-1` - gl: `2024.0.1078-1` - web_viewer: `2024.0-18131-1` +- Auto-restart slurmctld on failure. **BUG FIXES** - Fix an issue in the way we get region when manage volumes so that it can correctly handle local zone. diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurmctld_systemd_service_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurmctld_systemd_service_spec.rb index f0135884b4..057ecc3e0a 100644 --- a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurmctld_systemd_service_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurmctld_systemd_service_spec.rb @@ -32,6 +32,8 @@ it 'creates the service definition for slurmctld with the correct settings' do is_expected.to render_file('/etc/systemd/system/slurmctld.service') .with_content("After=network-online.target munge.service remote-fs.target") + .with_content("Restart=on-failure") + .with_content("RestartSec=1s") end end end diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/slurmctld.service.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/slurmctld.service.erb index 57b28dc248..f7fd7e1ad2 100644 --- a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/slurmctld.service.erb +++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/slurmctld.service.erb @@ -12,6 +12,8 @@ ExecReload=/bin/kill -HUP $MAINPID LimitNOFILE=562930 LimitMEMLOCK=infinity LimitSTACK=infinity +Restart=on-failure +RestartSec=1s [Install] WantedBy=multi-user.target