From a41d4f806d06fbafbb5f4a45a30164bc221f2cc9 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Wed, 22 Jan 2025 10:18:16 -0800 Subject: [PATCH] Remove temporary slurm resume file at the end of resume_program Prior to this commit, the code only had `trap "rm -f ${SLURM_RESUME_FILE_TMP}" EXIT`, which worked fine for the script until the `sudo -u` happens. When using `sudo -u` to run a command, it creates a new process with a different user context. The trap that was set in the parent shell does not carry over to this new process. Therefore, when the script ends through the sudo command, the EXIT trap in the original shell never gets executed. Therefore, this commit adds another removal of the temporary file at the end of the script. Signed-off-by: Hanwen --- .../templates/default/slurm/resume_program.erb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/resume_program.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/resume_program.erb index 252a47a619..5ca3957e25 100644 --- a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/resume_program.erb +++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/resume_program.erb @@ -13,3 +13,5 @@ chgrp <%= node['cluster']['cluster_admin_slurm_share_group'] %> ${SLURM_RESUME_F chmod g+r ${SLURM_RESUME_FILE_TMP} sudo -u <%= node['cluster']['cluster_admin_user'] %> SLURM_RESUME_FILE=${SLURM_RESUME_FILE_TMP} <%= node_virtualenv_path %>/bin/slurm_resume "$@" + +rm -f ${SLURM_RESUME_FILE_TMP}