Skip to content

Commit

Permalink
supervisor: kill processes before gathering logs
Browse files Browse the repository at this point in the history
When we hit the max job timeout, we need to stop the test programs
before collecting logs; otherwise we run into errors like 'file size
changed while zipping' when trying to compress the logs, and we can't
save them or stop the job.

Signed-off-by: Josh Durgin <jdurgin@redhat.com>
  • Loading branch information
jdurgin committed Feb 9, 2021
1 parent 6f751ef commit 603b864
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 6 deletions.
12 changes: 11 additions & 1 deletion teuthology/dispatcher/supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,17 +215,27 @@ def run_with_watchdog(process, job_config):
if total_seconds > teuth_config.max_job_time:
log.warning("Job ran longer than {max}s. Killing...".format(
max=teuth_config.max_job_time))
try:
# kill processes but do not unlock yet so we can save
# the logs, coredumps, etc.
kill_job(job_info['name'], job_info['job_id'],
teuth_config.archive_base, job_config['owner'],
save_logs=True)
except Exception:
log.exception('Failed to kill job')

try:
transfer_archives(job_info['name'], job_info['job_id'],
teuth_config.archive_base, job_config)
except Exception:
log.exception('Could not save logs')

try:
# this time remove everything and unlock the machines
kill_job(job_info['name'], job_info['job_id'],
teuth_config.archive_base, job_config['owner'])
except Exception:
log.exception('Failed to kill job')
log.exception('Failed to kill job and unlock machines')

# calling this without a status just updates the jobs updated time
report.try_push_job_info(job_info)
Expand Down
13 changes: 8 additions & 5 deletions teuthology/kill.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
nuke_targets(targets, owner)


def kill_job(run_name, job_id, archive_base=None, owner=None):
def kill_job(run_name, job_id, archive_base=None, owner=None, save_logs=False):
serializer = report.ResultsSerializer(archive_base)
job_info = serializer.job_info(run_name, job_id)
if not owner:
Expand All @@ -76,7 +76,7 @@ def kill_job(run_name, job_id, archive_base=None, owner=None):
# the necessary nodes ain't locked yet, we do not use job_info to get them,
# but use find_targets():
targets = find_targets(run_name, owner, job_id)
nuke_targets(targets, owner)
nuke_targets(targets, owner, save_logs)


def find_run_info(serializer, run_name):
Expand Down Expand Up @@ -214,7 +214,7 @@ def find_targets(run_name, owner, job_id=None):
return out_obj


def nuke_targets(targets_dict, owner):
def nuke_targets(targets_dict, owner, save_logs=False):
targets = targets_dict.get('targets')
if not targets:
log.info("No locked machines. Not nuking anything")
Expand All @@ -233,11 +233,14 @@ def nuke_targets(targets_dict, owner):
'teuthology-nuke',
'-t',
target_file.name,
'--unlock',
'-r',
'--owner',
owner
]
if save_logs:
nuke_args.extend(['--no-reboot', '--keep-logs'])
else:
nuke_args.extend(['--reboot-all', '--unlock'])

proc = subprocess.Popen(
nuke_args,
stdout=subprocess.PIPE,
Expand Down

0 comments on commit 603b864

Please sign in to comment.