Skip to content

Commit

Permalink
supervisor: kill processes before gathering logs
Browse files Browse the repository at this point in the history
When we hit the max job timeout, we need to stop the test programs
before collecting logs; otherwise we run into errors like 'file size
changed while zipping' when trying to compress the logs, and we can't
save them or stop the job.

Signed-off-by: Josh Durgin <jdurgin@redhat.com>
  • Loading branch information
jdurgin committed Feb 9, 2021
1 parent 6f751ef commit 603b864
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 6 deletions.
12 changes: 11 additions & 1 deletion teuthology/dispatcher/supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,17 +215,27 @@ def run_with_watchdog(process, job_config):
if total_seconds > teuth_config.max_job_time:
log.warning("Job ran longer than {max}s. Killing...".format(
max=teuth_config.max_job_time))
try:
# kill processes but do not unlock yet so we can save
# the logs, coredumps, etc.
kill_job(job_info['name'], job_info['job_id'],
teuth_config.archive_base, job_config['owner'],
save_logs=True)
except Exception:
log.exception('Failed to kill job')

try:
transfer_archives(job_info['name'], job_info['job_id'],
teuth_config.archive_base, job_config)
except Exception:
log.exception('Could not save logs')

try:
# this time remove everything and unlock the machines
kill_job(job_info['name'], job_info['job_id'],
teuth_config.archive_base, job_config['owner'])
except Exception:
log.exception('Failed to kill job')
log.exception('Failed to kill job and unlock machines')

# calling this without a status just updates the jobs updated time
report.try_push_job_info(job_info)
Expand Down
13 changes: 8 additions & 5 deletions teuthology/kill.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
nuke_targets(targets, owner)


def kill_job(run_name, job_id, archive_base=None, owner=None):
def kill_job(run_name, job_id, archive_base=None, owner=None, save_logs=False):
serializer = report.ResultsSerializer(archive_base)
job_info = serializer.job_info(run_name, job_id)
if not owner:
Expand All @@ -76,7 +76,7 @@ def kill_job(run_name, job_id, archive_base=None, owner=None):
# the necessary nodes ain't locked yet, we do not use job_info to get them,
# but use find_targets():
targets = find_targets(run_name, owner, job_id)
nuke_targets(targets, owner)
nuke_targets(targets, owner, save_logs)


def find_run_info(serializer, run_name):
Expand Down Expand Up @@ -214,7 +214,7 @@ def find_targets(run_name, owner, job_id=None):
return out_obj


def nuke_targets(targets_dict, owner):
def nuke_targets(targets_dict, owner, save_logs=False):
targets = targets_dict.get('targets')
if not targets:
log.info("No locked machines. Not nuking anything")
Expand All @@ -233,11 +233,14 @@ def nuke_targets(targets_dict, owner):
'teuthology-nuke',
'-t',
target_file.name,
'--unlock',
'-r',
'--owner',
owner
]
if save_logs:
nuke_args.extend(['--no-reboot', '--keep-logs'])
else:
nuke_args.extend(['--reboot-all', '--unlock'])

proc = subprocess.Popen(
nuke_args,
stdout=subprocess.PIPE,
Expand Down

0 comments on commit 603b864

Please sign in to comment.