From 3bfe1a7ab36233d54a9dee3b026a33bc03880823 Mon Sep 17 00:00:00 2001 From: Wilfried Goesgens Date: Thu, 7 Nov 2024 10:48:33 +0100 Subject: [PATCH 1/3] add overload process logging --- jenkins/helper/launch_handler.py | 3 ++ jenkins/helper/overload_thread.py | 51 +++++++++++++++++++++++++++++ jenkins/helper/site_config.py | 1 + jenkins/helper/tools/killall.py | 53 +++++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+) create mode 100644 jenkins/helper/overload_thread.py diff --git a/jenkins/helper/launch_handler.py b/jenkins/helper/launch_handler.py index 5a040e35c..61aa85bb9 100644 --- a/jenkins/helper/launch_handler.py +++ b/jenkins/helper/launch_handler.py @@ -7,6 +7,7 @@ from traceback import print_exc from dmesg import DmesgWatcher, dmesg_runner +from overload_thread import spawn_overload_watcher_thread, shutdown_overload_watcher_thread from site_config import SiteConfig, IS_LINUX from testing_runner import TestingRunner @@ -36,6 +37,7 @@ def launch_runner(runner, create_report): dmesg_thread = Thread(target=dmesg_runner, args=[dmesg], name="dmesg") dmesg.name = "dmesg" dmesg_thread.start() + spawn_overload_watcher_thread(runner.cfg) time.sleep(3) print(runner.scenarios) try: @@ -59,6 +61,7 @@ def launch_runner(runner, create_report): runner.create_testruns_file() if IS_LINUX: dmesg.end_run() + shutdown_overload_watcher_thread() print('joining dmesg threads') dmesg_thread.join() runner.print_and_exit_closing_stance() diff --git a/jenkins/helper/overload_thread.py b/jenkins/helper/overload_thread.py new file mode 100644 index 000000000..180df369a --- /dev/null +++ b/jenkins/helper/overload_thread.py @@ -0,0 +1,51 @@ +#!/bin/env python3 +""" check for resource shortage of the test host """ +# pylint: disable=global-statement disable=global-variable-not-assigned +from threading import Thread, Lock +import time +from datetime import datetime +import psutil +# from tools.socket_counter import get_socket_count +from tools.killall import get_all_processes_stats_json + +END_THREAD_LOCK = Lock() +END_THREAD = False +OVERLOAD_THREAD = None + + +def overload_thread(sitecfg, _): + """watcher thread to track system load""" + continue_running = True + # print("starting load monitoring thread") + with open((sitecfg.base_dir / "overloads.jsonl"), "w+", encoding="utf-8") as jsonl_file: + while continue_running: + #try: + # sock_count = get_socket_count() + # if sock_count > 8000: + # print(f"Socket count high: {sock_count}") + #except psutil.AccessDenied: + # pass + load = psutil.getloadavg() + if (load[0] > sitecfg.max_load) or (load[1] > sitecfg.max_load1) or (load[0] > sitecfg.overload): + #print(f"{str(load)} <= {sitecfg.overload} Load to high - Disk I/O: " + str(psutil.swap_memory())) + jsonl_file.write(f'["{datetime.now ()}", {get_all_processes_stats_json()}]\n') + time.sleep(1) + with END_THREAD_LOCK: + continue_running = not END_THREAD + #print("exiting load monitoring thread") + + +def spawn_overload_watcher_thread(siteconfig): + """launch the overload watcher thread""" + global OVERLOAD_THREAD + OVERLOAD_THREAD = Thread(target=overload_thread, args=(siteconfig, True)) + OVERLOAD_THREAD.start() + + +def shutdown_overload_watcher_thread(): + """terminate the overload watcher thread""" + global END_THREAD + with END_THREAD_LOCK: + END_THREAD = True + if OVERLOAD_THREAD is not None: + OVERLOAD_THREAD.join() diff --git a/jenkins/helper/site_config.py b/jenkins/helper/site_config.py index 3d447b83b..8b4fda177 100644 --- a/jenkins/helper/site_config.py +++ b/jenkins/helper/site_config.py @@ -117,6 +117,7 @@ def __init__(self, definition_file): # pylint: disable=too-many-statements disable=too-many-branches print_env() init_temp() + self.basedir = Path.cwd() self.datetime_format = "%Y-%m-%dT%H%M%SZ" self.trace = False self.portbase = 7000 diff --git a/jenkins/helper/tools/killall.py b/jenkins/helper/tools/killall.py index ca9b1ed07..a16cb4fc3 100644 --- a/jenkins/helper/tools/killall.py +++ b/jenkins/helper/tools/killall.py @@ -1,5 +1,7 @@ #!/bin/env python3 """ manipulate processes """ +import time +import json import sys import psutil @@ -41,3 +43,54 @@ def kill_all_arango_processes(): process.kill() except psutil.NoSuchProcess: # pragma: no cover pass + +def gather_process_thread_statistics(p): + """ gather the statistics of one process and all its threads """ + ret = {} + ret['process'] = [{ + 'time': time.ctime(), + 'pid': p.pid, + 'name': p.name(), + 'percent': p.cpu_percent(), + 'iocounters': p.io_counters(), + 'ctxSwitches': p.num_ctx_switches(), + 'numfds': p.num_fds(), + 'cpu_times': p.cpu_times(), + 'meminfo': p.memory_full_info(), + 'netcons': p.connections() + }] + for t in p.threads(): + ret[ t.id ] = { 'user': t.user_time, 'sys': t.system_time} + return ret + +def add_delta(p1, p2): + """ calculate and add a delta in cpu and time to all threads of a process """ + tids = list(p1.keys()) + for tid in tids: + if tid in p2 and tid != 'process': + p1[tid]['d_user'] = p2[tid]['user'] - p1[tid]['user'] + p1[tid]['d_sys'] = p2[tid]['sys'] - p1[tid]['sys'] + p1['process'].append(p2['process'][0]) + +def get_all_processes_stats_json(): + """ aggregate a structure of all processes and their threads plus delta """ + process_full_list = {} + for n in [True, False]: + processes = psutil.process_iter() + for process in processes: + name = "" + try: + name = process.name() + if process.ppid() != 2 and process.pid not in [1, 2]: + procstat = gather_process_thread_statistics(process) + if n: + process_full_list[f"p{process.pid}"] = procstat + else: + add_delta(process_full_list[f"p{process.pid}"], procstat) + except psutil.AccessDenied: + pass + except Exception as ex: + print(f"while inspecting {name}: {ex} ") + if n: + time.sleep(1) + return json.dumps(process_full_list) From d9ffc0075ee916bf3e171a31b7d278f282e5a693 Mon Sep 17 00:00:00 2001 From: Wilfried Goesgens Date: Fri, 8 Nov 2024 10:36:08 +0100 Subject: [PATCH 2/3] debug output --- jenkins/helper/overload_thread.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/jenkins/helper/overload_thread.py b/jenkins/helper/overload_thread.py index 180df369a..e278237a4 100644 --- a/jenkins/helper/overload_thread.py +++ b/jenkins/helper/overload_thread.py @@ -16,8 +16,10 @@ def overload_thread(sitecfg, _): """watcher thread to track system load""" continue_running = True - # print("starting load monitoring thread") - with open((sitecfg.base_dir / "overloads.jsonl"), "w+", encoding="utf-8") as jsonl_file: + print("starting load monitoring thread") + fn =sitecfg.base_dir / "overloads.jsonl" + print(f"report file: {str(fn)}") + with open(fn, "w+", encoding="utf-8") as jsonl_file: while continue_running: #try: # sock_count = get_socket_count() From b3aef68ee2488b282a5e9908785ef708a5dab43f Mon Sep 17 00:00:00 2001 From: Wilfried Goesgens Date: Fri, 8 Nov 2024 12:53:11 +0100 Subject: [PATCH 3/3] fix filename --- jenkins/helper/overload_thread.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jenkins/helper/overload_thread.py b/jenkins/helper/overload_thread.py index e278237a4..a698c1b09 100644 --- a/jenkins/helper/overload_thread.py +++ b/jenkins/helper/overload_thread.py @@ -17,7 +17,7 @@ def overload_thread(sitecfg, _): """watcher thread to track system load""" continue_running = True print("starting load monitoring thread") - fn =sitecfg.base_dir / "overloads.jsonl" + fn =sitecfg.basedir / "overloads.jsonl" print(f"report file: {str(fn)}") with open(fn, "w+", encoding="utf-8") as jsonl_file: while continue_running: