From 0a9bc5f60a06b0690305e00ac7209fe23f135557 Mon Sep 17 00:00:00 2001
From: Mike Weilgart
Date: Wed, 21 Feb 2024 17:39:55 -0800
Subject: [PATCH] Made AIX watchdog ignore stale pidfiles

Ticket: CFE-4335
Changelog: AIX watchdog now handles stale pids

Code uses /proc opportunistically only if it exists; this improves
accuracy for systems with /proc and doesn't affect other systems.
(Turns out this doesn't matter as the template is only for AIX
which does have /proc)

This leaves in place a race condition if many watchdog processes
are started in very short succession, since the pidfile is not
atomically checked and updated. For purposes of the watchdog this
is probably good enough.
(See https://stackoverflow.com/a/688365/5419599 for more on this.)

(cherry picked from commit f53436b11d12573ec7c49dc677c68fbecbcfc2d3)
---
Reviewer notes (this region, between the "---" line and the diff, is not
part of the commit message):
 * ls -1dt "$PIDFILE" "$actual_process" | head -n 1 yields whichever of
   the two paths was modified most recently (-t sorts newest first, -d
   lists the /proc/<pid> directory itself rather than its contents).
 * When the /proc entry is the newer of the two, the pidfile is treated
   as stale and overwritten with this watchdog's PID ($$); otherwise the
   script appends an "Aborting execution" line to ${LOGFILE} and exits 1.
 * Without a /proc directory, a successful "ps -p" alone is taken as
   proof that the recorded watchdog is still running.

 .../core/watchdog/templates/watchdog.mustache | 26 +++++++++++++++----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/cfe_internal/core/watchdog/templates/watchdog.mustache b/cfe_internal/core/watchdog/templates/watchdog.mustache
index 57cb8a7a27..2b52cc8979 100644
--- a/cfe_internal/core/watchdog/templates/watchdog.mustache
+++ b/cfe_internal/core/watchdog/templates/watchdog.mustache
@@ -50,16 +50,32 @@ LOGFILE="/var/cfengine/watchdog.log"
 echo "$(date) Initiating watchdog $$" >> ${LOGFILE}
 
 if [ -s $PIDFILE ]; then
-    ps -p $(cat $PIDFILE) > /dev/null 2>&1
-    _ret=$?
-    if [ "${_ret}" -eq 0 ] ; then
-        echo "$(date) Aborting execution of watchdog $$, existing watchdog process $(cat $PIDFILE) running" >> ${LOGFILE}
-        exit 1
+    # We have a pidfile
+    if ps -p $(cat $PIDFILE) > /dev/null 2>&1 ; then
+        # There is a process with the PID in the file, but is it stale?
+        if [ -d /proc ]; then
+            # We can know for sure if it's stale
+            actual_process="/proc/$(cat "$PIDFILE")"
+            newer="$(ls -1dt "$PIDFILE" "$actual_process" | head -n 1)"
+            if [ "$actual_process" = "$newer" ]; then
+                # Pidfile is stale, ignore it
+                echo $$ > $PIDFILE
+            else
+                # Pidfile is definitely correct
+                echo "$(date) Aborting execution of watchdog $$, existing watchdog process $(cat $PIDFILE) running" >> ${LOGFILE}
+                exit 1
+            fi
+        else
+            # No /proc, pidfile shows a running process, we'll assume it's valid
+            echo "$(date) Aborting execution of watchdog $$, existing watchdog process $(cat $PIDFILE) running" >> ${LOGFILE}
+            exit 1
+        fi
     else
         # No current process matching pid in file
         echo $$ > $PIDFILE
     fi
 else
+    # No pidfile at all
     echo $$ > $PIDFILE
 fi
 