From bd00b881233e3f9e02b7a46dfd3944c78213cdae Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 29 Aug 2023 05:34:28 -0600 Subject: [PATCH 01/90] Add support for pipe refs --- parallel-orch/trace.py | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/parallel-orch/trace.py b/parallel-orch/trace.py index d106cce3..a356123e 100644 --- a/parallel-orch/trace.py +++ b/parallel-orch/trace.py @@ -78,6 +78,9 @@ def __hash__(self): def __str__(self): return f"Key({self.lhs_ref}@{self.env})" + + def __repr__(self) -> str: + return self.__str__() class ExpectResult(): @@ -90,6 +93,15 @@ def __str__(self, ref, result): return f"ExpectResult({self.ref}, {self.result})" +class PipeRef: + + def __init__(self, lhs_ref, env): + self.ref = PathRefKey(env, lhs_ref) + + def __str__(self): + return f"PipeRef({self.lhs_ref})" + + def log_resolved_trace_items(resolved_dict): for k, v in resolved_dict.items(): try: @@ -117,6 +129,9 @@ def is_no_command_prefix(line): def is_new_path_ref(trace_item): return "PathRef" in trace_item +def is_pipe_ref(trace_item): + return "PipeRef" in trace_item + def get_path_ref_id(trace_item): return trace_item.split("=")[0].strip() @@ -210,6 +225,8 @@ def is_expect_result(trace_item): def parse_expect_result(trace_item): return trace_item.lstrip("ExpectResult(").split(")")[0].split(", ") +def parse_pipe_ref(trace_item): + return trace_item.split("] = ")[0].lstrip("[").split(", ") def parse_launch(refs_dict, keys_order, env, line) -> None: assignment_prefix, assignments = parse_launch_command( @@ -220,7 +237,6 @@ def parse_launch(refs_dict, keys_order, env, line) -> None: refs_dict[lhs_ref] = refs_dict[rhs_ref] keys_order.append(lhs_ref) - def add_ref_to_refs_dict(refs_dict, keys_order, lhs_ref, ref): refs_dict[lhs_ref] = ref keys_order.append(lhs_ref) @@ -256,13 +272,25 @@ def parse_new_path_ref(refs_dict, keys_order, env, line): refs_dict[lhs_ref] = path_ref keys_order.append(lhs_ref) - def 
parse_expect_result_item(expect_result_dict, env, line): line = remove_command_prefix(line).strip() path_ref_id, result = parse_expect_result(line) lhs_ref = PathRefKey(env, path_ref_id) expect_result_dict[lhs_ref] = ExpectResult(lhs_ref, result) - + +def parse_pipe_ref_item(refs_dict, keys_order, env, line): + line = remove_command_prefix(line).strip() + # lhs_ref, rhs_ref = parse_pipe_ref(line) + # Warning HACK: This is a hack to get the correct lhs_ref + # we are probably ok with this because it. + rhs_ref, lhs_ref = parse_pipe_ref(line) + lhs_key = PathRefKey(env, lhs_ref) + lhs_key_rev = PathRefKey(env, rhs_ref) + pipe_ref = PipeRef(rhs_ref, env) + pipe_ref_rev = PipeRef(lhs_ref, env) + refs_dict[lhs_key] = pipe_ref.ref + refs_dict[lhs_key_rev] = pipe_ref_rev.ref + keys_order.append(lhs_key) def parse_rw_sets(trace_object) -> None: # logging.trace("".join(trace_object)) @@ -284,6 +312,9 @@ def parse_rw_sets(trace_object) -> None: # Parses PathRef(...) elif is_new_path_ref(line): parse_new_path_ref(refs_dict, keys_order, env, line) + # Parses PipeRef + elif is_pipe_ref(line): + parse_pipe_ref_item(refs_dict, keys_order, env, line) # Parses ExpectResult(...) 
elif is_expect_result(line): parse_expect_result_item(expect_result_dict, env, line) From 6cd9325f43339bf97e6cf0ca244ca0a2ad07b08d Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 29 Aug 2023 05:44:38 -0600 Subject: [PATCH 02/90] Remove assertions for command nodes due to PipeNodes --- parallel-orch/analysis.py | 4 ++++ parallel-orch/partial_program_order.py | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/parallel-orch/analysis.py b/parallel-orch/analysis.py index 76408759..157fbb54 100644 --- a/parallel-orch/analysis.py +++ b/parallel-orch/analysis.py @@ -41,6 +41,10 @@ def safe_to_execute(asts: "list[AstNode]", variables: dict) -> bool: ## There should always be a single AST per node and it must be a command assert(len(asts) == 1) ast = asts[0] + ## GL 2023-08-29: I added this in order for PipeNodes to work + ## I suppose piped commands should be generally safe to execute + if isinstance(asts[0], PipeNode): + return True assert(isinstance(ast, CommandNode)) logging.debug(f'Ast in question: {ast}') ## Expand and check whether the asts contain diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 351fe02f..e9ebae23 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -167,7 +167,8 @@ def __init__(self, id, cmd, asts, loop_context: LoopStack): ## There can only be a single AST per node, and this ## must be a command. 
assert(len(asts) == 1) - assert(isinstance(asts[0], CommandNode)) + ## GL 2023-08-29: I removed this in order for PipeNodes to work + # assert(isinstance(asts[0], CommandNode)) self.cmd_no_redir = trace.remove_command_redir(self.cmd) self.loop_context = loop_context ## Keep track of how many iterations of this loop node we have unrolled @@ -1281,8 +1282,9 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand ## We no longer add failed commands to the stopped set, ## because this leads to more repetitions than needed ## and does not allow us to properly speculate commands - + logging.critical("___________________--___________________") read_set, write_set = trace.parse_and_gather_cmd_rw_sets(trace_object) + logging.critical("___________________++___________________") rw_set = RWSet(read_set, write_set) self.update_rw_set(node_id, rw_set) From ade6fbd534fbbf105235c9b04b115873a3ef1c1b Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 29 Aug 2023 05:54:39 -0600 Subject: [PATCH 03/90] Remove redundant logs --- parallel-orch/partial_program_order.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index d364b00c..fefb96ab 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -1282,9 +1282,7 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand ## We no longer add failed commands to the stopped set, ## because this leads to more repetitions than needed ## and does not allow us to properly speculate commands - logging.critical("___________________--___________________") read_set, write_set = trace.parse_and_gather_cmd_rw_sets(trace_object) - logging.critical("___________________++___________________") rw_set = RWSet(read_set, write_set) self.update_rw_set(node_id, rw_set) From 867a88afb19409278e8ef6400439f405dc60dcd5 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 29 Aug 2023 
09:24:48 -0600 Subject: [PATCH 04/90] Check PipeNodes independently --- parallel-orch/analysis.py | 68 ++++++++++++++++---------- parallel-orch/partial_program_order.py | 6 +-- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/parallel-orch/analysis.py b/parallel-orch/analysis.py index 157fbb54..c1b033e5 100644 --- a/parallel-orch/analysis.py +++ b/parallel-orch/analysis.py @@ -29,51 +29,67 @@ def parse_shell_to_asts(input_script_path) -> "list[AstNode]": except libdash.parser.ParsingException as e: logging.error(f'Parsing error: {e}') exit(1) + - -## Returns true if the script is safe to speculate and execute outside -## of the original shell context. -## -## The script is not safe if it might contain a shell primitive. Therefore -## the analysis checks if the command in question is one of the underlying -## shell's primitives (in our case bash) and if so returns False -def safe_to_execute(asts: "list[AstNode]", variables: dict) -> bool: - ## There should always be a single AST per node and it must be a command - assert(len(asts) == 1) - ast = asts[0] - ## GL 2023-08-29: I added this in order for PipeNodes to work - ## I suppose piped commands should be generally safe to execute - if isinstance(asts[0], PipeNode): +def validate_node(ast) -> bool: + assert(isinstance(ast, (CommandNode, PipeNode))) + if isinstance(ast, CommandNode): return True - assert(isinstance(ast, CommandNode)) - logging.debug(f'Ast in question: {ast}') + else: + for cmd in ast.items: + assert isinstance(cmd, CommandNode) + + +def is_node_safe(node: CommandNode, variables: dict) -> str: ## Expand and check whether the asts contain - ## a command substitution or a primitive. + ## a command substitution or a primitive. ## If so, then we need to tell the original script to execute the command. 
## Expand the command argument - cmd_arg = ast.arguments[0] + cmd_arg = node.arguments[0] exp_state = expand.ExpansionState(variables) ## TODO: Catch exceptions around here expanded_cmd_arg = expand.expand_arg(cmd_arg, exp_state) cmd_str = string_of_arg(expanded_cmd_arg) logging.debug(f'Expanded command argument: {expanded_cmd_arg} (str: "{cmd_str}")') - - ## TODO: Determine if the ast contains a command substitution and if so - ## run it in the original script. - ## In the future, we should be able to perform stateful expansion too, - ## and properly execute and trace command substitutions. - + ## KK 2023-05-26 We need to keep in mind that whenever we execute something ## in the original shell, then we cannot speculate anything ## after it, because we cannot track read-write dependencies ## in the original shell. - if cmd_str in BASH_PRIMITIVES: return False - return True + +def is_pipe_node_safe_to_execute(node: PipeNode, variables: dict) -> bool: + for cmd in node.items: + logging.debug(f'Ast in question: {cmd}') + if not is_node_safe(cmd, variables): + return False + return True + +## Returns true if the script is safe to speculate and execute outside +## of the original shell context. +## +## The script is not safe if it might contain a shell primitive. Therefore +## the analysis checks if the command in question is one of the underlying +## shell's primitives (in our case bash) and if so returns False +def safe_to_execute(asts: "list[AstNode]", variables: dict) -> bool: + ## There should always be a single AST per node and it must be a command + assert(len(asts) == 1) + if isinstance(asts[0], PipeNode): + return is_pipe_node_safe_to_execute(asts[0], variables) + else: + assert(isinstance(ast, CommandNode)) + logging.debug(f'Ast in question: {ast}') + return is_node_safe(ast, variables) + ## TODO: Determine if the ast contains a command substitution and if so + ## run it in the original script. 
+ ## In the future, we should be able to perform stateful expansion too, + ## and properly execute and trace command substitutions. + + BASH_PRIMITIVES = ["break", "continue", "return"] diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index fefb96ab..9a82846a 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -9,7 +9,7 @@ import trace import util -from shasta.ast_node import AstNode, CommandNode +from shasta.ast_node import AstNode, CommandNode, PipeNode class CompletedNodeInfo: @@ -167,8 +167,8 @@ def __init__(self, id, cmd, asts, loop_context: LoopStack): ## There can only be a single AST per node, and this ## must be a command. assert(len(asts) == 1) - ## GL 2023-08-29: I removed this in order for PipeNodes to work - # assert(isinstance(asts[0], CommandNode)) + # Check that the node contains only CommandNode(s) + analysis.validate_node(asts[0]) self.cmd_no_redir = trace.remove_command_redir(self.cmd) self.loop_context = loop_context ## Keep track of how many iterations of this loop node we have unrolled From 52b3936e572a03e16b93d22ed78a4ea257aa78e6 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Wed, 30 Aug 2023 11:29:48 -0600 Subject: [PATCH 05/90] Add first version of report script --- report/benchmark_plots.py | 28 +++++++++++++++++ report/benchmark_report.py | 62 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 report/benchmark_plots.py create mode 100644 report/benchmark_report.py diff --git a/report/benchmark_plots.py b/report/benchmark_plots.py new file mode 100644 index 00000000..0a27bc48 --- /dev/null +++ b/report/benchmark_plots.py @@ -0,0 +1,28 @@ +import matplotlib.pyplot as plt +import scienceplots + +# plt.style.use('science') + +# Plot a comparison of execution times for Bash and Orch. 
+def plot_benchmark_results(benchmarks, bash_times, orch_times): + + + + fig, ax = plt.subplots(figsize=(10,6)) + + # Define bar width and positions + bar_width = 0.35 + index = range(len(benchmarks)) + + bar1 = ax.bar(index, bash_times, bar_width, label='bash', color='b') + bar2 = ax.bar([i+bar_width for i in index], orch_times, bar_width, label='hs', color='r') + + ax.set_xlabel('Benchmarks') + ax.set_ylabel('Execution Time (s)') + ax.set_title('Execution Time Comparison: Bash vs orch') + ax.set_xticks([i + bar_width/2 for i in index]) + ax.set_xticklabels(benchmarks) + ax.legend() + + plt.tight_layout() + plt.savefig("out.pdf") diff --git a/report/benchmark_report.py b/report/benchmark_report.py new file mode 100644 index 00000000..faa3ddd1 --- /dev/null +++ b/report/benchmark_report.py @@ -0,0 +1,62 @@ +import subprocess +import time +import json +import os +import matplotlib.pyplot as plt +from benchmark_plots import * + +# Setting and exporting environment variables (same as tests for now). +# This will change in the future. 
+os.environ['ORCH_TOP'] = os.environ.get('ORCH_TOP', subprocess.check_output(['git', 'rev-parse', '--show-toplevel', '--show-superproject-working-tree']).decode('utf-8').strip()) +os.environ['WORKING_DIR'] = os.path.join(os.environ['ORCH_TOP'], 'test') +os.environ['TEST_SCRIPT_DIR'] = os.path.join(os.environ['WORKING_DIR'], 'test_scripts') +os.environ['MISC_SCRIPT_DIR'] = os.path.join(os.environ['WORKING_DIR'], 'misc') + +BASH_COMMAND = "/bin/bash" +ORCH_COMMAND = os.path.join(os.environ['ORCH_TOP'], 'pash-spec.sh') + +def run_command(command): + print("Running command: ", command) + start_time = time.time() + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + end_time = time.time() + return (end_time - start_time, stdout.decode('utf-8'), stderr.decode('utf-8')) + +# TODO: Make this more robust - maybe even use sth like difflib +def compare_results(bash_output, orch_output): + return bash_output == orch_output + +def main(): + # Load benchmark configurations + with open('benchmark_config.json', 'r') as f: + benchmarks_config = json.load(f) + + bash_times = [] + orch_times = [] + + for benchmark in benchmarks_config: + # Run pre-execution commands + for pre_command in benchmark.get('pre_execution_script', []): + run_command(pre_command) + + # TODO: in the future, we are going to parse the orch_error and generate reports + bash_time, bash_output, _bash_error = run_command([BASH_COMMAND, os.environ.get('TEST_SCRIPT_DIR') + "/" + benchmark['bash_command']]) + orch_time, orch_output, orch_error = run_command([ORCH_COMMAND, benchmark['orch_args'], os.environ.get('TEST_SCRIPT_DIR') + "/" + benchmark['bash_command']]) + bash_times.append(bash_time) + orch_times.append(orch_time) + are_results_same = compare_results(bash_output, orch_output) + + print(f"Results for benchmark: {benchmark['name']}") + print(f"Bash Execution Time: {bash_time}s") + print(f"orch Execution Time: {orch_time}s") + 
print(f"Are outputs the same? {'Yes' if are_results_same else 'No'}") + print("-------------------------------") + + # Plot the results + benchmark_names = [benchmark['name'] for benchmark in benchmarks_config] + plot_benchmark_results(benchmark_names, bash_times, orch_times) + + +if __name__ == "__main__": + main() From 2ebcedc6a89b9416af0d717498df45c427fc4267 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 31 Aug 2023 01:37:52 -0600 Subject: [PATCH 06/90] Fix typo in ast PipeNode assertion check --- parallel-orch/analysis.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parallel-orch/analysis.py b/parallel-orch/analysis.py index c1b033e5..3620d28e 100644 --- a/parallel-orch/analysis.py +++ b/parallel-orch/analysis.py @@ -81,9 +81,9 @@ def safe_to_execute(asts: "list[AstNode]", variables: dict) -> bool: if isinstance(asts[0], PipeNode): return is_pipe_node_safe_to_execute(asts[0], variables) else: - assert(isinstance(ast, CommandNode)) - logging.debug(f'Ast in question: {ast}') - return is_node_safe(ast, variables) + assert(isinstance(asts[0], CommandNode)) + logging.debug(f'Ast in question: {asts[0]}') + return is_node_safe(asts[0], variables) ## TODO: Determine if the ast contains a command substitution and if so ## run it in the original script. 
## In the future, we should be able to perform stateful expansion too, From dac2afa47e825a9e50634819eb7c17c4bfd7679e Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 31 Aug 2023 03:06:59 -0600 Subject: [PATCH 07/90] Make plotting individual and improve format --- report/benchmark_plots.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/report/benchmark_plots.py b/report/benchmark_plots.py index 0a27bc48..e592bff0 100644 --- a/report/benchmark_plots.py +++ b/report/benchmark_plots.py @@ -1,13 +1,11 @@ +import os import matplotlib.pyplot as plt -import scienceplots +# import scienceplots # plt.style.use('science') -# Plot a comparison of execution times for Bash and Orch. -def plot_benchmark_results(benchmarks, bash_times, orch_times): - - - +# Plot a comparison of execution times for Bash and hs. +def plot_benchmark_times_combined(benchmarks, bash_times, orch_times, output_dir, filename): fig, ax = plt.subplots(figsize=(10,6)) # Define bar width and positions @@ -25,4 +23,21 @@ def plot_benchmark_results(benchmarks, bash_times, orch_times): ax.legend() plt.tight_layout() - plt.savefig("out.pdf") + plt.savefig(os.path.join(output_dir, f"{filename}.pdf")) + +def plot_benchmark_times_individual(benchmarks, bash_times, orch_times, output_dir, filename): + num_benchmarks = len(benchmarks) + fig, axes = plt.subplots(num_benchmarks, 1, figsize=(10, 6*num_benchmarks)) + # Check if only one benchmark, else wrap axes in a list + if num_benchmarks == 1: + axes = [axes] + for ax, benchmark, bash_time, pash_time in zip(axes, benchmarks, bash_times, orch_times): + bar_width = 0.2 + labels = ['Bash', 'hs'] + times = [bash_time, pash_time] + ax.bar(labels, times, width=bar_width, color=['b', 'r']) + ax.set_ylabel('Execution Time (s)') + ax.set_title(f'Execution Time Comparison for {benchmark}: Bash vs hs') + + plt.tight_layout() + plt.savefig(os.path.join(output_dir, f"{filename}.pdf")) \ No newline at end of file From 
5d4d9dae941197db3c3f0f2f56108d44885e8536 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 31 Aug 2023 04:15:27 -0600 Subject: [PATCH 08/90] Add first dgsh benchmark --- report/benchmarks/dgsh/1.sh | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100755 report/benchmarks/dgsh/1.sh diff --git a/report/benchmarks/dgsh/1.sh b/report/benchmarks/dgsh/1.sh new file mode 100755 index 00000000..db45e28e --- /dev/null +++ b/report/benchmarks/dgsh/1.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +## Initialize the necessary temporary files +file1=$(mktemp) + +cat $1 >"$file1" + +printf 'File type:\t' +file - <"$file1" + +printf 'Original size:\t' +wc -c <"$file1" + +printf 'xz:\t\t' +xz -c <"$file1" | wc -c + +printf 'bzip2:\t\t' +bzip2 -c <"$file1" | wc -c + +printf 'gzip:\t\t' +gzip -c <"$file1" | wc -c \ No newline at end of file From 2c1f699ca0e367e0a90cf33c98adc90dc03e948b Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 31 Aug 2023 04:31:26 -0600 Subject: [PATCH 09/90] Improve structure of benchmark report script --- report/benchmark_report.py | 64 +++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/report/benchmark_report.py b/report/benchmark_report.py index faa3ddd1..ce363fb3 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -4,21 +4,30 @@ import os import matplotlib.pyplot as plt from benchmark_plots import * +import logging # Setting and exporting environment variables (same as tests for now). # This will change in the future. 
os.environ['ORCH_TOP'] = os.environ.get('ORCH_TOP', subprocess.check_output(['git', 'rev-parse', '--show-toplevel', '--show-superproject-working-tree']).decode('utf-8').strip()) -os.environ['WORKING_DIR'] = os.path.join(os.environ['ORCH_TOP'], 'test') -os.environ['TEST_SCRIPT_DIR'] = os.path.join(os.environ['WORKING_DIR'], 'test_scripts') -os.environ['MISC_SCRIPT_DIR'] = os.path.join(os.environ['WORKING_DIR'], 'misc') +os.environ['WORKING_DIR'] = os.path.join(os.environ['ORCH_TOP'], 'report') +os.environ['TEST_SCRIPT_DIR'] = os.path.join(os.environ['WORKING_DIR'], 'benchmarks') +os.environ['RESOURCE_DIR'] = os.path.join(os.environ['WORKING_DIR'], 'resources') BASH_COMMAND = "/bin/bash" ORCH_COMMAND = os.path.join(os.environ['ORCH_TOP'], 'pash-spec.sh') -def run_command(command): - print("Running command: ", command) +REPORT_OUTPUT_DIR = os.path.join(os.environ['WORKING_DIR'], 'report_output') + +def run_pre_execution_command(command, working_dir=os.getcwd()): + print("Running pre-execution command: ", command) + process = subprocess.Popen(command.strip().split(" "), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=working_dir) + process.wait() + return process.returncode + +def run_command(command, working_dir=os.getcwd()): + print("Running (and timing) command: ", " ".join(command)) start_time = time.time() - process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=working_dir) stdout, stderr = process.communicate() end_time = time.time() return (end_time - start_time, stdout.decode('utf-8'), stderr.decode('utf-8')) @@ -27,36 +36,55 @@ def run_command(command): def compare_results(bash_output, orch_output): return bash_output == orch_output +def print_results(benchmark_name, bash_time, orch_time, are_results_same, diff_percentage): + if orch_time < bash_time: + comparison_result = f"hs is {diff_percentage:.2f}% faster than Bash" + elif 
orch_time > bash_time: + comparison_result = f"hs is {abs(diff_percentage):.2f}% slower than Bash" + else: + comparison_result = "hs and Bash have the same execution time" + print("-" * 40) + print(f"Results for benchmark: {benchmark_name}") + print(f"Bash Execution Time: {bash_time}s") + print(f"hs Execution Time: {orch_time}s") + print(f"Valid: {'Yes' if are_results_same else 'No'}") + print(comparison_result) + print("-" * 40) + print("-" * 40) + + def main(): # Load benchmark configurations - with open('benchmark_config.json', 'r') as f: + with open(os.path.join(os.environ.get('WORKING_DIR'), 'benchmark_config.json'), 'r') as f: benchmarks_config = json.load(f) bash_times = [] orch_times = [] for benchmark in benchmarks_config: + # Create resource dir if non-existent + os.makedirs(os.environ.get('RESOURCE_DIR'), exist_ok=True) # Run pre-execution commands for pre_command in benchmark.get('pre_execution_script', []): - run_command(pre_command) + logging.debug(f"|Pre-execution: {pre_command}") + run_pre_execution_command(pre_command, os.environ.get('RESOURCE_DIR')) # TODO: in the future, we are going to parse the orch_error and generate reports - bash_time, bash_output, _bash_error = run_command([BASH_COMMAND, os.environ.get('TEST_SCRIPT_DIR') + "/" + benchmark['bash_command']]) - orch_time, orch_output, orch_error = run_command([ORCH_COMMAND, benchmark['orch_args'], os.environ.get('TEST_SCRIPT_DIR') + "/" + benchmark['bash_command']]) + bash_time, bash_output, _bash_error = run_command([BASH_COMMAND, benchmark['command']], os.environ.get('TEST_SCRIPT_DIR')) + orch_time, orch_output, orch_error = run_command([ORCH_COMMAND, benchmark['orch_args'], benchmark['command']], os.environ.get('TEST_SCRIPT_DIR')) bash_times.append(bash_time) orch_times.append(orch_time) are_results_same = compare_results(bash_output, orch_output) + diff_percentage = ((bash_time - orch_time) / bash_time) * 100 + print_results(benchmark['name'], bash_time, orch_time, are_results_same, 
diff_percentage) - print(f"Results for benchmark: {benchmark['name']}") - print(f"Bash Execution Time: {bash_time}s") - print(f"orch Execution Time: {orch_time}s") - print(f"Are outputs the same? {'Yes' if are_results_same else 'No'}") - print("-------------------------------") - + # Create output dir for reports + os.makedirs(REPORT_OUTPUT_DIR, exist_ok=True) # Plot the results benchmark_names = [benchmark['name'] for benchmark in benchmarks_config] - plot_benchmark_results(benchmark_names, bash_times, orch_times) - + plot_benchmark_times_combined(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_combined") + plot_benchmark_times_individual(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_individual") + print(f"Execution graphs can be found in {REPORT_OUTPUT_DIR}") if __name__ == "__main__": main() From 32ea150901906f92adf119c2887c2fad34f5b747 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 31 Aug 2023 10:20:35 -0600 Subject: [PATCH 10/90] Adjust benchmark report script to work in different directories --- report/benchmark_report.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/report/benchmark_report.py b/report/benchmark_report.py index ce363fb3..28a5046d 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -2,7 +2,6 @@ import time import json import os -import matplotlib.pyplot as plt from benchmark_plots import * import logging @@ -12,15 +11,24 @@ os.environ['WORKING_DIR'] = os.path.join(os.environ['ORCH_TOP'], 'report') os.environ['TEST_SCRIPT_DIR'] = os.path.join(os.environ['WORKING_DIR'], 'benchmarks') os.environ['RESOURCE_DIR'] = os.path.join(os.environ['WORKING_DIR'], 'resources') +os.environ['PASH_TOP'] = os.path.join(os.environ['ORCH_TOP'], 'deps', 'pash') +os.environ['PASH_SPEC_TOP'] = os.path.join(os.environ['ORCH_TOP']) BASH_COMMAND = "/bin/bash" ORCH_COMMAND = os.path.join(os.environ['ORCH_TOP'], 'pash-spec.sh') 
REPORT_OUTPUT_DIR = os.path.join(os.environ['WORKING_DIR'], 'report_output') +def resolve_working_dir(path): + return path.format(RESOURCE_DIR=os.environ.get('RESOURCE_DIR')) + +def resolve_command_path(command): + return command.format(TEST_SCRIPT_DIR=os.environ.get('TEST_SCRIPT_DIR')) + + def run_pre_execution_command(command, working_dir=os.getcwd()): - print("Running pre-execution command: ", command) - process = subprocess.Popen(command.strip().split(" "), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=working_dir) + print("Running pre-execution command:", command) + process = subprocess.Popen(command.strip().split(" "), cwd=working_dir) process.wait() return process.returncode @@ -38,11 +46,9 @@ def compare_results(bash_output, orch_output): def print_results(benchmark_name, bash_time, orch_time, are_results_same, diff_percentage): if orch_time < bash_time: - comparison_result = f"hs is {diff_percentage:.2f}% faster than Bash" - elif orch_time > bash_time: - comparison_result = f"hs is {abs(diff_percentage):.2f}% slower than Bash" + comparison_result = f"hs is {round(diff_percentage/100, 1)}x ({diff_percentage:.2f}%) faster than Bash" else: - comparison_result = "hs and Bash have the same execution time" + comparison_result = f"hs is {round(diff_percentage/100, 1)}x ({diff_percentage:.2f}%) slower than Bash" print("-" * 40) print(f"Results for benchmark: {benchmark_name}") print(f"Bash Execution Time: {bash_time}s") @@ -70,12 +76,15 @@ def main(): run_pre_execution_command(pre_command, os.environ.get('RESOURCE_DIR')) # TODO: in the future, we are going to parse the orch_error and generate reports - bash_time, bash_output, _bash_error = run_command([BASH_COMMAND, benchmark['command']], os.environ.get('TEST_SCRIPT_DIR')) - orch_time, orch_output, orch_error = run_command([ORCH_COMMAND, benchmark['orch_args'], benchmark['command']], os.environ.get('TEST_SCRIPT_DIR')) + working_dir = resolve_working_dir(benchmark.get('working_dir', 
os.environ.get('TEST_SCRIPT_DIR'))) + + bash_time, bash_output, _bash_error = run_command([BASH_COMMAND, resolve_command_path(benchmark['command'])], working_dir) + orch_time, orch_output, orch_error = run_command([ORCH_COMMAND, benchmark['orch_args'], resolve_command_path(benchmark['command'])], working_dir) bash_times.append(bash_time) orch_times.append(orch_time) + # print(bash_output) are_results_same = compare_results(bash_output, orch_output) - diff_percentage = ((bash_time - orch_time) / bash_time) * 100 + diff_percentage = abs((bash_time - orch_time) / bash_time) * 100 print_results(benchmark['name'], bash_time, orch_time, are_results_same, diff_percentage) # Create output dir for reports From 1a4031ceb2a9e19f3c90084ea4d67053a2f13244 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 31 Aug 2023 10:24:01 -0600 Subject: [PATCH 11/90] Add 2nd dgsh (no func) script --- report/benchmark_config.json | 41 ++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 report/benchmark_config.json diff --git a/report/benchmark_config.json b/report/benchmark_config.json new file mode 100644 index 00000000..b199dcaa --- /dev/null +++ b/report/benchmark_config.json @@ -0,0 +1,41 @@ +[ + { + "name": "Dgsh 1.sh - 2M", + "pre_execution_script": ["wget -ncO in2M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/mondial/mondial-3.0.xml"], + "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh resources/in2M.xml", + "orch_args": "-d 2" + }, + { + "name": "Dgsh 1.sh - 120M", + "pre_execution_script": ["wget -ncO in120M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.xml"], + "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh resources/in120M.xml", + "orch_args": "-d 2" + }, + { + "name": "Dgsh 1.sh - 700M", + "pre_execution_script": ["wget -ncO in700M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/pir/psd7003.xml"], + "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh resources/in700M.xml", + 
"orch_args": "-d 2" + }, + { + "name": "Dgsh 2.sh (no function) - Τry Repo", + "pre_execution_script": ["git clone https://github.com/binpash/try.git"], + "command": "{TEST_SCRIPT_DIR}/dgsh/2_no_func.sh", + "working_dir": "{RESOURCE_DIR}/try", + "orch_args": "-d 2" + }, + { + "name": "Dgsh 2.sh (no function) - PaSh Repo", + "pre_execution_script": ["git clone https://github.com/binpash/pash.git"], + "command": "{TEST_SCRIPT_DIR}/dgsh/2_no_func.sh", + "working_dir": "{RESOURCE_DIR}/try", + "orch_args": "-d 2" + }, + { + "name": "Dgsh 2.sh (no function) - PaSh Repo", + "pre_execution_script": ["git clone https://github.com/binpash/pash.git"], + "command": "{TEST_SCRIPT_DIR}/dgsh/2_no_func.sh", + "working_dir": "{RESOURCE_DIR}/try", + "orch_args": "-d 2" + } +] From ae4fbd6feffce5e2bcef85e360abfea47f64e275 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 31 Aug 2023 10:24:36 -0600 Subject: [PATCH 12/90] Add 2_no_func.sh --- report/benchmarks/dgsh/2_no_func.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 report/benchmarks/dgsh/2_no_func.sh diff --git a/report/benchmarks/dgsh/2_no_func.sh b/report/benchmarks/dgsh/2_no_func.sh new file mode 100644 index 00000000..37e81ef1 --- /dev/null +++ b/report/benchmarks/dgsh/2_no_func.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +## Note: Needs to be run on a big git repository to make sense (maybe linux) + +## Initialize the necessary temporary files +file1=$(mktemp) +git log --format="%an:%ad" --date=default "$@" >"$file1" + +echo "Authors ordered by number of commits" +# Order by frequency +awk -F: '{print $1}' <"$file1" # | sort | uniq | sort -rn + +echo "Days ordered by number of commits" +# Order by frequency +# awk -F: '{print substr($2, 1, 3)}' <"$file1" | sort | uniq | sort -rn From d78e3f7755f25e74e1cbf26e60fb5f94a9c25fa1 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 31 Aug 2023 10:35:59 -0600 Subject: [PATCH 13/90] Use difflib to print line diffs --- report/benchmark_report.py | 22 
++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/report/benchmark_report.py b/report/benchmark_report.py index 28a5046d..fb1b1a90 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -4,6 +4,7 @@ import os from benchmark_plots import * import logging +import difflib # Setting and exporting environment variables (same as tests for now). # This will change in the future. @@ -40,11 +41,17 @@ def run_command(command, working_dir=os.getcwd()): end_time = time.time() return (end_time - start_time, stdout.decode('utf-8'), stderr.decode('utf-8')) -# TODO: Make this more robust - maybe even use sth like difflib def compare_results(bash_output, orch_output): - return bash_output == orch_output + + bash_lines = bash_output.splitlines() + orch_lines = orch_output.splitlines() -def print_results(benchmark_name, bash_time, orch_time, are_results_same, diff_percentage): + # Compare lines + d = difflib.ndiff(bash_lines, orch_lines) + return [diff for diff in d if diff.startswith('- ') or diff.startswith('+ ')] + + +def print_results(benchmark_name, bash_time, orch_time, diff_lines, diff_percentage): if orch_time < bash_time: comparison_result = f"hs is {round(diff_percentage/100, 1)}x ({diff_percentage:.2f}%) faster than Bash" else: @@ -53,8 +60,11 @@ def print_results(benchmark_name, bash_time, orch_time, are_results_same, diff_p print(f"Results for benchmark: {benchmark_name}") print(f"Bash Execution Time: {bash_time}s") print(f"hs Execution Time: {orch_time}s") - print(f"Valid: {'Yes' if are_results_same else 'No'}") + print(f"Valid: {'Yes' if len(diff_lines) == 0 else 'No - see below'}") + for line in diff_lines: + print(line) print(comparison_result) + print("-" * 40) print("-" * 40) @@ -83,9 +93,9 @@ def main(): bash_times.append(bash_time) orch_times.append(orch_time) # print(bash_output) - are_results_same = compare_results(bash_output, orch_output) + diff_lines = compare_results(bash_output, orch_output) 
diff_percentage = abs((bash_time - orch_time) / bash_time) * 100 - print_results(benchmark['name'], bash_time, orch_time, are_results_same, diff_percentage) + print_results(benchmark['name'], bash_time, orch_time, diff_lines, diff_percentage) # Create output dir for reports os.makedirs(REPORT_OUTPUT_DIR, exist_ok=True) From a3c1449a6d1c649aa492d6185660bcb7cdee95b5 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 1 Sep 2023 02:26:34 -0600 Subject: [PATCH 14/90] Add timestamp logging functions --- parallel-orch/config.py | 7 ++++++- parallel-orch/util.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/parallel-orch/config.py b/parallel-orch/config.py index 2cb9b5f9..80de8ca1 100644 --- a/parallel-orch/config.py +++ b/parallel-orch/config.py @@ -1,6 +1,7 @@ import os import subprocess import logging +import time ## TODO: Figure out how logging here plays out together with the log() in PaSh @@ -27,7 +28,7 @@ def log_root(msg, *args, **kwargs): ## Ensure that PASH_TMP_PREFIX is set by pa.sh -assert(not os.getenv('PASH_SPEC_TMP_PREFIX') is None) +# assert(not os.getenv('PASH_SPEC_TMP_PREFIX') is None) PASH_SPEC_TMP_PREFIX = os.getenv('PASH_SPEC_TMP_PREFIX') SOCKET_BUF_SIZE = 8192 @@ -41,3 +42,7 @@ def log_root(msg, *args, **kwargs): 'PPID', 'PROMPT_COMMAND', 'PS4', 'SHELL', 'SHELLOPTS', 'SHLVL', 'TERM', 'UID', 'USER', 'XDG_SESSION_ID'} SIGNIFICANT_VARS = {'foo', 'bar', 'baz'} + +START_TIME = time.time() + +named_timestamps = {} diff --git a/parallel-orch/util.py b/parallel-orch/util.py index 91c64f31..252a367a 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -135,3 +135,20 @@ def compare_env_strings(file1_content, file2_content): dict1 = parse_env_string_to_dict(file1_content) dict2 = parse_env_string_to_dict(file2_content) return compare_dicts(dict1, dict2) + +def log_time_delta_from_start(module: str, action: str, node=None): + logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time 
From start:{to_milliseconds_str(time.time() - config.START_TIME)}") + +def set_named_timestamp(action: str, node=None): + key = f"{action}{',' + str(node) if node is not None else ''}" + config.named_timestamps[key] = time.time() + +def log_time_delta_from_start_and_set_named_timestamp(module: str, action: str, node=None): + set_named_timestamp(action, node) + logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}") + +def log_time_delta_from_named_timestamp(module: str, action: str, node=None): + logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}|Step time:{to_milliseconds_str(time.time() - config.named_timestamps[action])}") + +def to_milliseconds_str(seconds: float) -> str: + return f"{seconds * 1000:.3f}ms" \ No newline at end of file From e8a051dbe88a029dee848784c9adf5bc18d1b0db Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 1 Sep 2023 05:23:15 -0600 Subject: [PATCH 15/90] Add support for more complex timestamp logging --- parallel-orch/util.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/parallel-orch/util.py b/parallel-orch/util.py index 252a367a..7f90368e 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -139,16 +139,26 @@ def compare_env_strings(file1_content, file2_content): def log_time_delta_from_start(module: str, action: str, node=None): logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time From start:{to_milliseconds_str(time.time() - config.START_TIME)}") -def set_named_timestamp(action: str, node=None): - key = f"{action}{',' + str(node) if node is not None else ''}" +def set_named_timestamp(action: str, node=None, key=None): + if key is None: + key = f"{action}{',' + str(node) if node is not None else ''}" config.named_timestamps[key] = time.time() -def 
log_time_delta_from_start_and_set_named_timestamp(module: str, action: str, node=None): - set_named_timestamp(action, node) +def invalidate_named_timestamp(action: str, node=None, key=None): + if key is None: + key = f"{action}{',' + str(node) if node is not None else ''}" + del config.named_timestamps[key] + +def log_time_delta_from_start_and_set_named_timestamp(module: str, action: str, node=None, key=None): + set_named_timestamp(action, node, key) logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}") -def log_time_delta_from_named_timestamp(module: str, action: str, node=None): - logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}|Step time:{to_milliseconds_str(time.time() - config.named_timestamps[action])}") +def log_time_delta_from_named_timestamp(module: str, action: str, node=None, key=None, invalidate=True): + if key is None: + key = f"{action}{',' + str(node) if node is not None else ''}" + logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}|Step time:{to_milliseconds_str(time.time() - config.named_timestamps[key])}") + if invalidate: + invalidate_named_timestamp(action, node, key) def to_milliseconds_str(seconds: float) -> str: return f"{seconds * 1000:.3f}ms" \ No newline at end of file From f932a38fc880ecd96c6a7027fcf8a738787f89fc Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 1 Sep 2023 05:23:54 -0600 Subject: [PATCH 16/90] Add timed logs for main scheduler workflow --- parallel-orch/partial_program_order.py | 32 +++++++++++++++++++++----- parallel-orch/scheduler_server.py | 20 ++++++++++++---- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 
9a82846a..c4c77e63 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -7,6 +7,7 @@ import config import executor import trace +from util import * import util from shasta.ast_node import AstNode, CommandNode, PipeNode @@ -654,20 +655,26 @@ def __kill_node(self, cmd_id: "NodeId"): util.kill_process(proc_to_kill.pid) def resolve_commands_that_can_be_resolved_and_push_frontier(self): + cmds_to_resolve = self.__pop_cmds_to_resolve_from_speculated() logging.debug(f"Commands to check for dependencies this round are: {sorted(cmds_to_resolve)}") logging.debug(f"Commands that cannot be resolved this round are: {sorted(self.speculated)}") - ## Resolve dependencies for the commands that can actually be resolved to_commit = self.__resolve_dependencies_continuous_and_move_frontier(cmds_to_resolve) + for cmd in to_commit: + log_time_delta_from_named_timestamp("PartialOrder", "ResolveDependenciesDone", cmd, key=f"ResolveDependencies-{cmd}") + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProcKilling") + if len(to_commit) == 0: logging.debug(" > No nodes to be committed this round") else: logging.debug(f" > Nodes to be committed this round: {to_commit}") logging.trace(f"Commit|"+",".join(str(node_id) for node_id in to_commit)) self.__kill_all_currently_executing_and_schedule_restart() + log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling") self.commit_cmd_workspaces(to_commit) # self.print_cmd_stderr(stderr) + def __pop_cmds_to_resolve_from_speculated(self): cmd_ids_to_check = sorted(list(self.speculated)) @@ -679,7 +686,6 @@ def __pop_cmds_to_resolve_from_speculated(self): if not self.cmd_can_be_resolved(cmd_id): if cmd_id not in self.speculated: logging.debug(f" > Adding node {cmd_id} to waiting list") - logging.trace(f"WaitingAdd|{cmd_id}") self.speculated.add(cmd_id) else: logging.debug(f" > Keeping node {cmd_id} to waiting list") @@ -693,6 +699,8 @@ def 
__pop_cmds_to_resolve_from_speculated(self): self.speculated.remove(cmd_id) else: logging.debug(f" > Node {cmd_id} is able to be resolved") + # The node can be resolved now + log_time_delta_from_named_timestamp("PartialOrder", "WaitingToResolveDone", cmd_id, key=f"WaitingToResolve-{cmd_id}") return sorted(cmds_to_resolve) @@ -717,7 +725,9 @@ def resolve_dependencies(self, cmds_to_resolve): ## as the input of a following command def __resolve_dependencies_continuous_and_move_frontier(self, cmds_to_resolve): self.log_partial_program_order_info() - + for cmd in cmds_to_resolve: + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ResolveDependencies", cmd, key=f"ResolveDependencies-{cmd}") + logging.debug(f"Commands to be checked for dependencies: {sorted(cmds_to_resolve)}") logging.debug(" --- Starting dependency resolution --- ") new_workset = self.resolve_dependencies(cmds_to_resolve) @@ -1183,7 +1193,6 @@ def attempt_move_stopped_to_workset(self): ## TODO: Eventually, in the future, let's add here some form of limit def schedule_work(self, limit=0): # self.log_partial_program_order_info() - logging.debug("Scheduling work...") logging.debug("Rerunning stopped commands") # attempt_move_stopped_to_workset() needs to happen before the node execution self.attempt_move_stopped_to_workset() @@ -1204,10 +1213,12 @@ def schedule_work(self, limit=0): def schedule_node(self, cmd_id): # This replaced the old frontier check if self.is_next_non_committed_node(cmd_id): + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "RunFrontierNode", cmd_id, key=f"Run-{cmd_id}") # TODO: run this and before committing kill any speculated commands still executing self.run_cmd_non_blocking(cmd_id) else: if not cmd_id in self.speculated: + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "RunNode", cmd_id, key=f"Run-{cmd_id}") self.speculate_cmd_non_blocking(cmd_id) return @@ -1258,11 +1269,13 @@ def execute_cmd_core(self, node_id: NodeId, 
speculate=False): logging.debug(f" >>>>> Command {node_id} - {proc.pid} just started executing") def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sandbox_dir: str): + log_time_delta_from_named_timestamp("PartialOrder", "CommandExecComplete", node_id, key=f"Run-{node_id}") + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolution", node_id, key=f"PostExecResolution-{node_id}") + logging.debug(f" --- Node {node_id}, just finished execution ---") self.sandbox_dirs[node_id] = sandbox_dir ## TODO: Store variable file somewhere so that we can return when wait _proc, trace_file, stdout, stderr, post_execution_env_file = self.commands_currently_executing.pop(node_id) - logging.debug(f" >>>>> Command {node_id} - {_proc.pid} just finished executing") logging.trace(f"ExecutingRemove|{node_id}") # Handle stopped by riker due to network access if int(riker_exit_code) == 159: @@ -1295,9 +1308,11 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand # do nothing and wait until a new command finishes executing logging.debug("No resolvable nodes were found in this round, nothing will change...") return - + + log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolutionECCheckDone", node_id, key=f"PostExecResolution-{node_id}", invalidate=False) # Remove from workset and add it again later if necessary self.workset.remove(node_id) + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolution_FrontendWait", node_id, key=f"PostExecResolution_FrontendWait-{node_id}") ## Here we check if the most recent env has been received. If not, we cannot resolve anything just yet. if self.get_new_env_file_for_node(node_id) is None: logging.debug(f"Node {node_id} has not received its latest env from runtime yet. 
Waiting...") @@ -1336,6 +1351,7 @@ def maybe_resolve_most_recent_envs_and_continue_resolution(self, node_id: NodeId self.resolve_most_recent_envs_and_continue_command_execution(node_id) def resolve_most_recent_envs_and_continue_command_execution(self, new_env_node: NodeId): + log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution_FrontendWaitReceived", new_env_node, key=f"PostExecResolution_FrontendWait-{new_env_node}") to_check = list(self.waiting_for_frontend) + [new_env_node] logging.debug(f"Node {new_env_node} received its latest env from runtime. Comparing env with itself and other waiting nodes.") # Node is no longer waiting to be resolved. It might have not been waiting at all. @@ -1354,6 +1370,7 @@ def resolve_most_recent_envs_and_continue_command_execution(self, new_env_node: elif node_id == new_env_node: logging.debug(f"Finding sets of commands that can be resolved after {node_id} finished executing and got its latest env.") assert(node_id not in self.stopped) + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "WaitingToResolve", node_id, key=f"WaitingToResolve-{node_id}") self.add_to_speculated(node_id) ## We can now call the general resolution method that determines which commands ## can be resolved (all their dependencies are done executing), and resolves them. @@ -1378,6 +1395,7 @@ def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node else: logging.debug(f"Finding sets of commands that can be resolved after {node_id} finished executing and got its latest env") assert(node_id not in self.stopped) + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "WaitingToResolve", node_id, key=f"WaitingToResolve-{node_id}") self.add_to_speculated(node_id) ## We can now call the general resolution method that determines which commands ## can be resolved (all their dependencies are done executing), and resolves them. 
@@ -1407,6 +1425,7 @@ def print_cmd_stderr(self, stderr): def commit_cmd_workspaces(self, to_commit_ids): for cmd_id in sorted(to_commit_ids): + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "CommitNode", cmd_id) workspace = self.sandbox_dirs[cmd_id] if workspace != "": logging.debug(f" (!) Committing workspace of cmd {cmd_id} found in {workspace}") @@ -1414,6 +1433,7 @@ def commit_cmd_workspaces(self, to_commit_ids): logging.debug(commit_workspace_out.decode()) else: logging.debug(f" (!) No need to commit workspace of cmd {cmd_id} as it was run in the main workspace") + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "CommitNode", cmd_id) def log_rw_sets(self): logging.debug("====== RW Sets " + "=" * 65) diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 2f387028..b603abdd 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -172,8 +172,6 @@ def handle_command_exec_complete(self, input_cmd: str): if trace_file in self.partial_program_order.banned_files: logging.debug(f'CommandExecComplete: {cmd_id} ignored') return - logging.debug(input_cmd) - ## Gather RWset, resolve dependencies, and progress graph self.partial_program_order.command_execution_completed(cmd_id, exit_code, sandbox_dir) @@ -185,23 +183,31 @@ def process_next_cmd(self): connection, input_cmd = socket_get_next_cmd(self.socket) if(input_cmd.startswith("Init")): + log_time_delta_from_start_and_set_named_timestamp("Scheduler", "PartialOrderInit") connection.close() self.handle_init(input_cmd) - ## TODO: Read the partial order from the given file + ## TODO: Read the partial order from the given file + log_time_delta_from_named_timestamp("Scheduler", "PartialOrderInit") elif (input_cmd.startswith("Daemon Start") or input_cmd == ""): + log_time_delta_from_start_and_set_named_timestamp("Scheduler", "DaemonStart") connection.close() ## This happens when pa.sh first connects to daemon to see if it 
is on logging.debug(f'PaSh made first contact with scheduler server.') + log_time_delta_from_named_timestamp("Scheduler", "DaemonStart") elif (input_cmd.startswith("CommandExecComplete:")): + log_time_delta_from_start_and_set_named_timestamp("Scheduler", "CommandExecComplete") ## We have received this message from an a runner (tracer +isolation) ## The runner should have already parsed RWsets and serialized them to ## a file. connection.close() self.handle_command_exec_complete(input_cmd) + log_time_delta_from_named_timestamp("Scheduler", "CommandExecComplete") elif (input_cmd.startswith("Wait")): + log_time_delta_from_start_and_set_named_timestamp("Scheduler", "Wait") self.handle_wait(input_cmd, connection) + log_time_delta_from_named_timestamp("Scheduler", "Wait") elif (input_cmd.startswith("Done")): - + log_time_delta_from_start_and_set_named_timestamp("Scheduler", "Done") logging.debug(f'Scheduler server received shutdown message.') logging.debug(f'The partial order was successfully completed.') if not self.partial_program_order.is_completed(): @@ -209,6 +215,7 @@ def process_next_cmd(self): socket_respond(connection, success_response("All finished!")) self.partial_program_order.log_executions() self.done = True + log_time_delta_from_named_timestamp("Scheduler", "Done") else: logging.error(error_response(f'Error: Unsupported command: {input_cmd}')) raise Exception(f'Error: Unsupported command: {input_cmd}') @@ -230,10 +237,12 @@ def check_unsafe_and_waiting(self): ## It should add some work (if possible), and then return immediately. 
## It is called once per loop iteration, making sure that there is always work happening def schedule_work(self): + log_time_delta_from_start_and_set_named_timestamp("Scheduler", "ScheduleWork") self.partial_program_order.schedule_work() ## Respond to any waiting nodes that have been deemed to be unsafe self.check_unsafe_and_waiting() + log_time_delta_from_named_timestamp("Scheduler", "ScheduleWork") def run(self): ## The first command should be the daemon start @@ -266,6 +275,7 @@ def terminate_pending_commands(self): def main(): + log_time_delta_from_start("Scheduler", "Scheduler Init") args = init() # Format logging @@ -281,7 +291,7 @@ def main(): if args.debug_level == 1: logging.getLogger().setLevel(logging.INFO) elif args.debug_level >= 2: - logging.getLogger().setLevel(logging.DEBUG) + logging.getLogger().setLevel(logging.INFO) # elif args.debug_level >= 3: # logging.getLogger().setLevel(logging.TRACE) From 416bfdd90af998b016113e196343bab111ac708d Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 4 Sep 2023 00:36:47 -0600 Subject: [PATCH 17/90] Improve log messages and add log to loop unrolling --- parallel-orch/partial_program_order.py | 32 +++++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index c4c77e63..3fb99088 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -662,7 +662,8 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self): ## Resolve dependencies for the commands that can actually be resolved to_commit = self.__resolve_dependencies_continuous_and_move_frontier(cmds_to_resolve) for cmd in to_commit: - log_time_delta_from_named_timestamp("PartialOrder", "ResolveDependenciesDone", cmd, key=f"ResolveDependencies-{cmd}") + log_time_delta_from_named_timestamp("PartialOrder", "ResolveDependencies", cmd) + log_time_delta_from_named_timestamp("PartialOrder", 
"PostExecResolution", cmd, key=f"PostExecResolution-{cmd}") log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProcKilling") if len(to_commit) == 0: @@ -673,7 +674,6 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self): self.__kill_all_currently_executing_and_schedule_restart() log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling") self.commit_cmd_workspaces(to_commit) - # self.print_cmd_stderr(stderr) def __pop_cmds_to_resolve_from_speculated(self): @@ -700,7 +700,7 @@ def __pop_cmds_to_resolve_from_speculated(self): else: logging.debug(f" > Node {cmd_id} is able to be resolved") # The node can be resolved now - log_time_delta_from_named_timestamp("PartialOrder", "WaitingToResolveDone", cmd_id, key=f"WaitingToResolve-{cmd_id}") + log_time_delta_from_named_timestamp("PartialOrder", "WaitingToResolve", cmd_id) return sorted(cmds_to_resolve) @@ -726,7 +726,7 @@ def resolve_dependencies(self, cmds_to_resolve): def __resolve_dependencies_continuous_and_move_frontier(self, cmds_to_resolve): self.log_partial_program_order_info() for cmd in cmds_to_resolve: - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ResolveDependencies", cmd, key=f"ResolveDependencies-{cmd}") + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ResolveDependencies", cmd) logging.debug(f"Commands to be checked for dependencies: {sorted(cmds_to_resolve)}") logging.debug(" --- Starting dependency resolution --- ") @@ -896,15 +896,19 @@ def wait_received(self, node_id: NodeId): ## node is very complex and not elegant. ## TODO: Could we swap unrolling and progressing so that we always ## check if a node can be progressed by checking edges? 
+ log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id) self.progress_po_due_to_wait(node_id) + log_time_delta_from_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id) + + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id) ## Unroll some nodes if needed. if node_id.has_iters(): ## TODO: This unrolling can also happen and be moved to speculation. ## For now we are being conservative and that is why it only happens here ## TODO: Move this to the scheduler.schedule_work() (if we have a loop node waiting for response and we are not unrolled, unroll to create work) self.maybe_unroll(node_id) - + assert(self.valid()) def find_outer_loop_sub_partial_order(self, loop_id: int, nodes_subset: "list[NodeId]") -> "list[NodeId]": @@ -1081,8 +1085,9 @@ def unroll_loop_node(self, target_concrete_node_id: NodeId): def maybe_unroll(self, node_id: NodeId) -> NodeId: ## Only unrolls this node if it doesn't already exist in the PO if not self.is_node_id(node_id): + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "Unrolling", node_id) self.unroll_loop_node(node_id) - + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "Unrolling", node_id) ## The node_id must be part of the PO after unrolling, otherwise we did something wrong assert(self.is_node_id(node_id)) @@ -1212,13 +1217,12 @@ def schedule_work(self, limit=0): # Nodes to be scheduled are always not committed and not executing def schedule_node(self, cmd_id): # This replaced the old frontier check + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "RunNode", cmd_id) if self.is_next_non_committed_node(cmd_id): - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "RunFrontierNode", cmd_id, key=f"Run-{cmd_id}") # TODO: run this and before committing kill any speculated commands still executing self.run_cmd_non_blocking(cmd_id) else: if not cmd_id in 
self.speculated: - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "RunNode", cmd_id, key=f"Run-{cmd_id}") self.speculate_cmd_non_blocking(cmd_id) return @@ -1269,7 +1273,7 @@ def execute_cmd_core(self, node_id: NodeId, speculate=False): logging.debug(f" >>>>> Command {node_id} - {proc.pid} just started executing") def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sandbox_dir: str): - log_time_delta_from_named_timestamp("PartialOrder", "CommandExecComplete", node_id, key=f"Run-{node_id}") + log_time_delta_from_named_timestamp("PartialOrder", "RunNode", node_id) log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolution", node_id, key=f"PostExecResolution-{node_id}") logging.debug(f" --- Node {node_id}, just finished execution ---") @@ -1309,10 +1313,10 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand logging.debug("No resolvable nodes were found in this round, nothing will change...") return - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolutionECCheckDone", node_id, key=f"PostExecResolution-{node_id}", invalidate=False) + log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolutionECCheck", node_id, key=f"PostExecResolution-{node_id}", invalidate=False) # Remove from workset and add it again later if necessary self.workset.remove(node_id) - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolution_FrontendWait", node_id, key=f"PostExecResolution_FrontendWait-{node_id}") + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolutionFrontendWait", node_id) ## Here we check if the most recent env has been received. If not, we cannot resolve anything just yet. if self.get_new_env_file_for_node(node_id) is None: logging.debug(f"Node {node_id} has not received its latest env from runtime yet. 
Waiting...") @@ -1351,7 +1355,7 @@ def maybe_resolve_most_recent_envs_and_continue_resolution(self, node_id: NodeId self.resolve_most_recent_envs_and_continue_command_execution(node_id) def resolve_most_recent_envs_and_continue_command_execution(self, new_env_node: NodeId): - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution_FrontendWaitReceived", new_env_node, key=f"PostExecResolution_FrontendWait-{new_env_node}") + log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution_FrontendWaitReceived", new_env_node, key=f"PostExecResolution-{new_env_node}", invalidate=False) to_check = list(self.waiting_for_frontend) + [new_env_node] logging.debug(f"Node {new_env_node} received its latest env from runtime. Comparing env with itself and other waiting nodes.") # Node is no longer waiting to be resolved. It might have not been waiting at all. @@ -1370,7 +1374,7 @@ def resolve_most_recent_envs_and_continue_command_execution(self, new_env_node: elif node_id == new_env_node: logging.debug(f"Finding sets of commands that can be resolved after {node_id} finished executing and got its latest env.") assert(node_id not in self.stopped) - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "WaitingToResolve", node_id, key=f"WaitingToResolve-{node_id}") + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "WaitingToResolve", node_id) self.add_to_speculated(node_id) ## We can now call the general resolution method that determines which commands ## can be resolved (all their dependencies are done executing), and resolves them. 
@@ -1395,7 +1399,7 @@ def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node else: logging.debug(f"Finding sets of commands that can be resolved after {node_id} finished executing and got its latest env") assert(node_id not in self.stopped) - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "WaitingToResolve", node_id, key=f"WaitingToResolve-{node_id}") + log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "WaitingToResolve", node_id) self.add_to_speculated(node_id) ## We can now call the general resolution method that determines which commands ## can be resolved (all their dependencies are done executing), and resolves them. From 738f48f5558edda21dd1bff037a7945345fa1db2 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 4 Sep 2023 00:37:36 -0600 Subject: [PATCH 18/90] Change scheduler debug level --- parallel-orch/scheduler_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index b603abdd..136ef30e 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -291,7 +291,7 @@ def main(): if args.debug_level == 1: logging.getLogger().setLevel(logging.INFO) elif args.debug_level >= 2: - logging.getLogger().setLevel(logging.INFO) + logging.getLogger().setLevel(logging.DEBUG) # elif args.debug_level >= 3: # logging.getLogger().setLevel(logging.TRACE) From 1601a5d45b927bd2c59ed62f758397c0380755ae Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 4 Sep 2023 02:01:32 -0600 Subject: [PATCH 19/90] Change config parsing to handle complex commands --- report/benchmark_config.json | 12 +++++------ report/benchmark_report.py | 42 +++++++++++++++++++++++------------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/report/benchmark_config.json b/report/benchmark_config.json index b199dcaa..e286a788 100644 --- a/report/benchmark_config.json +++ b/report/benchmark_config.json @@ 
-1,20 +1,20 @@ [ { "name": "Dgsh 1.sh - 2M", - "pre_execution_script": ["wget -ncO in2M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/mondial/mondial-3.0.xml"], - "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh resources/in2M.xml", + "pre_execution_script": ["wget -nc -O in2M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/mondial/mondial-3.0.xml"], + "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh {RESOURCE_DIR}/in2M.xml", "orch_args": "-d 2" }, { "name": "Dgsh 1.sh - 120M", - "pre_execution_script": ["wget -ncO in120M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.xml"], - "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh resources/in120M.xml", + "pre_execution_script": ["wget -nc -O in120M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.xml"], + "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh {RESOURCE_DIR}/in120M.xml", "orch_args": "-d 2" }, { "name": "Dgsh 1.sh - 700M", - "pre_execution_script": ["wget -ncO in700M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/pir/psd7003.xml"], - "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh resources/in700M.xml", + "pre_execution_script": ["wget -nc -O in700M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/pir/psd7003.xml"], + "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh {RESOURCE_DIR}/in700M.xml", "orch_args": "-d 2" }, { diff --git a/report/benchmark_report.py b/report/benchmark_report.py index fb1b1a90..10f7ba36 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -20,12 +20,16 @@ REPORT_OUTPUT_DIR = os.path.join(os.environ['WORKING_DIR'], 'report_output') -def resolve_working_dir(path): - return path.format(RESOURCE_DIR=os.environ.get('RESOURCE_DIR')) - -def resolve_command_path(command): - return command.format(TEST_SCRIPT_DIR=os.environ.get('TEST_SCRIPT_DIR')) - +def replace_with_env_var(input_string): + format_args = { + "TEST_SCRIPT_DIR": os.environ.get("TEST_SCRIPT_DIR", 
os.getcwd()), + "RESOURCE_DIR": os.environ.get("RESOURCE_DIR", os.getcwd()) + } + + # Replace placeholders with actual environment variables using `format` + replaced_string = input_string.format(**format_args) + + return replaced_string def run_pre_execution_command(command, working_dir=os.getcwd()): print("Running pre-execution command:", command) @@ -36,6 +40,7 @@ def run_pre_execution_command(command, working_dir=os.getcwd()): def run_command(command, working_dir=os.getcwd()): print("Running (and timing) command: ", " ".join(command)) start_time = time.time() + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=working_dir) stdout, stderr = process.communicate() end_time = time.time() @@ -64,11 +69,10 @@ def print_results(benchmark_name, bash_time, orch_time, diff_lines, diff_percent for line in diff_lines: print(line) print(comparison_result) - print("-" * 40) print("-" * 40) - - + + def main(): # Load benchmark configurations with open(os.path.join(os.environ.get('WORKING_DIR'), 'benchmark_config.json'), 'r') as f: @@ -86,23 +90,31 @@ def main(): run_pre_execution_command(pre_command, os.environ.get('RESOURCE_DIR')) # TODO: in the future, we are going to parse the orch_error and generate reports - working_dir = resolve_working_dir(benchmark.get('working_dir', os.environ.get('TEST_SCRIPT_DIR'))) + working_dir = replace_with_env_var(benchmark.get('working_dir', os.environ.get('TEST_SCRIPT_DIR'))) - bash_time, bash_output, _bash_error = run_command([BASH_COMMAND, resolve_command_path(benchmark['command'])], working_dir) - orch_time, orch_output, orch_error = run_command([ORCH_COMMAND, benchmark['orch_args'], resolve_command_path(benchmark['command'])], working_dir) + bash_cmd_str = [BASH_COMMAND] + replace_with_env_var(benchmark['command']).split(" ") + print(bash_cmd_str) + bash_time, bash_output, _bash_error = run_command(bash_cmd_str, working_dir) + + + orch_cmd_str = [ORCH_COMMAND, benchmark['orch_args'], "-c", 
replace_with_env_var(benchmark['command'])] + print(orch_cmd_str) + orch_time, orch_output, orch_error = run_command(orch_cmd_str, working_dir) bash_times.append(bash_time) orch_times.append(orch_time) - # print(bash_output) diff_lines = compare_results(bash_output, orch_output) diff_percentage = abs((bash_time - orch_time) / bash_time) * 100 + print_results(benchmark['name'], bash_time, orch_time, diff_lines, diff_percentage) + # print(orch_error) + # print(">", bash_output, _bash_error) # Create output dir for reports os.makedirs(REPORT_OUTPUT_DIR, exist_ok=True) # Plot the results benchmark_names = [benchmark['name'] for benchmark in benchmarks_config] - plot_benchmark_times_combined(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_combined") - plot_benchmark_times_individual(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_individual") + # plot_benchmark_times_combined(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_combined") + # plot_benchmark_times_individual(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_individual") print(f"Execution graphs can be found in {REPORT_OUTPUT_DIR}") if __name__ == "__main__": From 44a18fb6307b226c05f2fb58d2a967438d8328f6 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 4 Sep 2023 03:23:02 -0600 Subject: [PATCH 20/90] Use env for benchmarks instestead of input file args --- report/benchmark_config.json | 9 ++++++--- report/benchmarks/dgsh/1.sh | 4 +--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/report/benchmark_config.json b/report/benchmark_config.json index e286a788..1c1da3b3 100644 --- a/report/benchmark_config.json +++ b/report/benchmark_config.json @@ -1,20 +1,23 @@ [ { "name": "Dgsh 1.sh - 2M", + "env": ["INPUT_FILE={RESOURCE_DIR}/in2M.xml"], "pre_execution_script": ["wget -nc -O in2M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/mondial/mondial-3.0.xml"], - 
"command": "{TEST_SCRIPT_DIR}/dgsh/1.sh {RESOURCE_DIR}/in2M.xml", + "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh", "orch_args": "-d 2" }, { "name": "Dgsh 1.sh - 120M", + "env": ["INPUT_FILE={RESOURCE_DIR}/in120M.xml"], "pre_execution_script": ["wget -nc -O in120M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.xml"], - "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh {RESOURCE_DIR}/in120M.xml", + "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh", "orch_args": "-d 2" }, { "name": "Dgsh 1.sh - 700M", + "env": ["INPUT_FILE={RESOURCE_DIR}/in700M.xml"], "pre_execution_script": ["wget -nc -O in700M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/pir/psd7003.xml"], - "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh {RESOURCE_DIR}/in700M.xml", + "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh", "orch_args": "-d 2" }, { diff --git a/report/benchmarks/dgsh/1.sh b/report/benchmarks/dgsh/1.sh index db45e28e..8312c443 100755 --- a/report/benchmarks/dgsh/1.sh +++ b/report/benchmarks/dgsh/1.sh @@ -2,9 +2,7 @@ ## Initialize the necessary temporary files file1=$(mktemp) - -cat $1 >"$file1" - +cat $INPUT_FILE >"$file1" printf 'File type:\t' file - <"$file1" From ddb8a527531347e19d3b163817a4f314fd18232c Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 4 Sep 2023 03:24:02 -0600 Subject: [PATCH 21/90] Call the orch script correctly --- report/benchmark_report.py | 46 ++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/report/benchmark_report.py b/report/benchmark_report.py index 10f7ba36..bff9bbb7 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -17,7 +17,6 @@ BASH_COMMAND = "/bin/bash" ORCH_COMMAND = os.path.join(os.environ['ORCH_TOP'], 'pash-spec.sh') - REPORT_OUTPUT_DIR = os.path.join(os.environ['WORKING_DIR'], 'report_output') def replace_with_env_var(input_string): @@ -25,10 +24,7 @@ def replace_with_env_var(input_string): "TEST_SCRIPT_DIR": os.environ.get("TEST_SCRIPT_DIR", 
os.getcwd()), "RESOURCE_DIR": os.environ.get("RESOURCE_DIR", os.getcwd()) } - - # Replace placeholders with actual environment variables using `format` replaced_string = input_string.format(**format_args) - return replaced_string def run_pre_execution_command(command, working_dir=os.getcwd()): @@ -40,17 +36,23 @@ def run_pre_execution_command(command, working_dir=os.getcwd()): def run_command(command, working_dir=os.getcwd()): print("Running (and timing) command: ", " ".join(command)) start_time = time.time() - process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=working_dir) stdout, stderr = process.communicate() end_time = time.time() return (end_time - start_time, stdout.decode('utf-8'), stderr.decode('utf-8')) +def run_command_with_orch(command, orch_args, working_dir=os.getcwd()): + print("Running (and timing) command with orch: ", " ".join(command)) + start_time = time.time() + process = subprocess.Popen([ORCH_COMMAND, orch_args] + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=working_dir, env=os.environ) + stdout, stderr = process.communicate() + end_time = time.time() + return (end_time - start_time, stdout.decode('utf-8'), stderr.decode('utf-8')) + def compare_results(bash_output, orch_output): bash_lines = bash_output.splitlines() orch_lines = orch_output.splitlines() - # Compare lines d = difflib.ndiff(bash_lines, orch_lines) return [diff for diff in d if diff.startswith('- ') or diff.startswith('+ ')] @@ -72,7 +74,26 @@ def print_results(benchmark_name, bash_time, orch_time, diff_lines, diff_percent print("-" * 40) print("-" * 40) - +def print_sorted_logs(orch_output): + relevant_lines = [line for line in orch_output.split("\n") if line.startswith("INFO:root:>|")] + # Extract lines with step time and sort + step_time_lines = [(line, float(line.split("Step time:")[1].split("ms")[0])) for line in relevant_lines if "Step time:" in line] + sorted_step_time_lines = sorted(step_time_lines, key=lambda x: x[1], 
reverse=True) + + for entry in sorted_step_time_lines: + split_line = entry[0].split("|")[1:] + pretty_line = " | ".join(split_line) + print(f"{pretty_line}, Step Time: {entry[1]:.3f}ms") + print(orch_output) + + +def export_env_vars(env_vars): + for env_var in env_vars: + lhs, rhs = env_var.split("=") + rhs = replace_with_env_var(rhs) + os.environ[lhs] = rhs + + def main(): # Load benchmark configurations with open(os.path.join(os.environ.get('WORKING_DIR'), 'benchmark_config.json'), 'r') as f: @@ -82,6 +103,9 @@ def main(): orch_times = [] for benchmark in benchmarks_config: + + # Set up preferred environment + export_env_vars(benchmark.get('env', {})) # Create resource dir if non-existent os.makedirs(os.environ.get('RESOURCE_DIR'), exist_ok=True) # Run pre-execution commands @@ -97,15 +121,18 @@ def main(): bash_time, bash_output, _bash_error = run_command(bash_cmd_str, working_dir) - orch_cmd_str = [ORCH_COMMAND, benchmark['orch_args'], "-c", replace_with_env_var(benchmark['command'])] + orch_cmd_str = replace_with_env_var(benchmark['command']).split(" ") print(orch_cmd_str) - orch_time, orch_output, orch_error = run_command(orch_cmd_str, working_dir) + orch_time, orch_output, orch_error = run_command_with_orch(orch_cmd_str, benchmark['orch_args'], working_dir) bash_times.append(bash_time) orch_times.append(orch_time) diff_lines = compare_results(bash_output, orch_output) diff_percentage = abs((bash_time - orch_time) / bash_time) * 100 print_results(benchmark['name'], bash_time, orch_time, diff_lines, diff_percentage) + + print(bash_output) + print(orch_output) # print(orch_error) # print(">", bash_output, _bash_error) @@ -113,6 +140,7 @@ def main(): os.makedirs(REPORT_OUTPUT_DIR, exist_ok=True) # Plot the results benchmark_names = [benchmark['name'] for benchmark in benchmarks_config] + # print_sorted_logs(orch_error) # plot_benchmark_times_combined(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_combined") # 
plot_benchmark_times_individual(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_individual") print(f"Execution graphs can be found in {REPORT_OUTPUT_DIR}") From 21b477f1f8e8328207499c6900f1ede5ef9fd3ec Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 4 Sep 2023 15:37:34 -0600 Subject: [PATCH 22/90] Count how many times a command was executed along with time --- report/benchmark_report.py | 52 ++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/report/benchmark_report.py b/report/benchmark_report.py index bff9bbb7..e372d91f 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -65,14 +65,14 @@ def print_results(benchmark_name, bash_time, orch_time, diff_lines, diff_percent comparison_result = f"hs is {round(diff_percentage/100, 1)}x ({diff_percentage:.2f}%) slower than Bash" print("-" * 40) print(f"Results for benchmark: {benchmark_name}") - print(f"Bash Execution Time: {bash_time}s") - print(f"hs Execution Time: {orch_time}s") + print(f"Bash Execution Time: {round(bash_time, 3)}s") + print(f"hs Execution Time: {round(orch_time, 3)}s") print(f"Valid: {'Yes' if len(diff_lines) == 0 else 'No - see below'}") for line in diff_lines: print(line) print(comparison_result) print("-" * 40) - print("-" * 40) + print() def print_sorted_logs(orch_output): relevant_lines = [line for line in orch_output.split("\n") if line.startswith("INFO:root:>|")] @@ -84,7 +84,38 @@ def print_sorted_logs(orch_output): split_line = entry[0].split("|")[1:] pretty_line = " | ".join(split_line) print(f"{pretty_line}, Step Time: {entry[1]:.3f}ms") - print(orch_output) + + +def print_exec_time_for_cmds(orch_outpt): + # Split the log into lines and filter the relevant ones + relevant_lines = [line.replace("INFO:root:>|PartialOrder|RunNode,", "") for line in orch_outpt.split("\n") if line.startswith("INFO:root:>|PartialOrder|RunNode,") and "Step time:" in line] + # Extract lines with RunNode 
commands and their step times + node_and_times = [(int(line.split("|")[0]), float(line.split("|")[1].split(":")[1][:-2]), float(line.split("|")[2].split(":")[1][:-2])) for line in relevant_lines] + + # Total number of times a RunNode command was executed + total_run_node_commands = len(node_and_times) + # print(node_and_times) + # Total time of all RunNode commands + total_time = sum([entry[2] for entry in node_and_times]) + + # Extract and sum the total time of the step per node + node_times = {} + counts = {} + for node, _, time in node_and_times: + if node in node_times: + node_times[node] += time + counts[node] += 1 + else: + node_times[node] = time + counts[node] = 1 + + print("-" * 40) + print(f"Total number of times a RunNode command was executed: {total_run_node_commands}") + print(f"Total time of all RunNode commands: {total_time:.3f}ms") + print("\nTotal time of the step per node:") + for node, time in sorted(node_times.items(), key=lambda x: x[1], reverse=True): + print(f"{node}: {time:.3f}ms ({counts[node]} times)") + print("-" * 40) def export_env_vars(env_vars): @@ -117,12 +148,9 @@ def main(): working_dir = replace_with_env_var(benchmark.get('working_dir', os.environ.get('TEST_SCRIPT_DIR'))) bash_cmd_str = [BASH_COMMAND] + replace_with_env_var(benchmark['command']).split(" ") - print(bash_cmd_str) bash_time, bash_output, _bash_error = run_command(bash_cmd_str, working_dir) - orch_cmd_str = replace_with_env_var(benchmark['command']).split(" ") - print(orch_cmd_str) orch_time, orch_output, orch_error = run_command_with_orch(orch_cmd_str, benchmark['orch_args'], working_dir) bash_times.append(bash_time) orch_times.append(orch_time) @@ -131,19 +159,17 @@ def main(): print_results(benchmark['name'], bash_time, orch_time, diff_lines, diff_percentage) - print(bash_output) - print(orch_output) - # print(orch_error) - # print(">", bash_output, _bash_error) + print_exec_time_for_cmds(orch_error) # Create output dir for reports os.makedirs(REPORT_OUTPUT_DIR, 
exist_ok=True) # Plot the results benchmark_names = [benchmark['name'] for benchmark in benchmarks_config] # print_sorted_logs(orch_error) - # plot_benchmark_times_combined(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_combined") - # plot_benchmark_times_individual(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_individual") + plot_benchmark_times_combined(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_combined") + plot_benchmark_times_individual(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_individual") print(f"Execution graphs can be found in {REPORT_OUTPUT_DIR}") + if __name__ == "__main__": main() From 4ef1ab233efe810b719b104c901b3370a1152cb1 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 4 Sep 2023 16:16:02 -0600 Subject: [PATCH 23/90] Add a gantt chart of benchmark execution --- report/benchmark_plots.py | 28 +++++++++++++++++++++++++++- report/benchmark_report.py | 30 +++++++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/report/benchmark_plots.py b/report/benchmark_plots.py index e592bff0..632f8184 100644 --- a/report/benchmark_plots.py +++ b/report/benchmark_plots.py @@ -40,4 +40,30 @@ def plot_benchmark_times_individual(benchmarks, bash_times, orch_times, output_d ax.set_title(f'Execution Time Comparison for {benchmark}: Bash vs hs') plt.tight_layout() - plt.savefig(os.path.join(output_dir, f"{filename}.pdf")) \ No newline at end of file + plt.savefig(os.path.join(output_dir, f"{filename}.pdf")) + +def plot_gantt(activities, output_dir, filename): + fig, ax = plt.subplots(figsize=(15, 20)) # Increase figure size + + # Sort the activities by their start time + activities.sort(key=lambda x: x[1]) + + # Reduce the height of each bar and reduce the gap between bars + bar_height = 5 + gap = 1 + + # Plotting each activity + for index, activity in enumerate(activities): + action, start_time, 
duration = activity + ax.broken_barh([(start_time, duration)], (index*(bar_height + gap), bar_height), facecolors='blue', edgecolor='black') + ax.text(start_time + duration/2, index*(bar_height + gap) + bar_height/2, action, ha='center', va='center', fontsize=6, color='white') + + # Setting labels & title + ax.set_xlabel('Time (ms)') + ax.set_title(f'Gantt Chart of {filename.strip("_gantt.pdf")}') + ax.set_yticks([i*(bar_height + gap) + bar_height/2 for i in range(len(activities))]) + ax.set_yticklabels([activity[0] for activity in activities], rotation=30, fontsize=8) + ax.grid(True) + + plt.tight_layout() + plt.savefig(os.path.join(output_dir, f"{filename}.pdf")) diff --git a/report/benchmark_report.py b/report/benchmark_report.py index e372d91f..43c77b92 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -6,6 +6,7 @@ import logging import difflib + # Setting and exporting environment variables (same as tests for now). # This will change in the future. os.environ['ORCH_TOP'] = os.environ.get('ORCH_TOP', subprocess.check_output(['git', 'rev-parse', '--show-toplevel', '--show-superproject-working-tree']).decode('utf-8').strip()) @@ -19,6 +20,26 @@ ORCH_COMMAND = os.path.join(os.environ['ORCH_TOP'], 'pash-spec.sh') REPORT_OUTPUT_DIR = os.path.join(os.environ['WORKING_DIR'], 'report_output') + + +def parse_logs_into_activities(log_data): + info_lines = [line.replace("INFO:root:>|", "").split("|") for line in log_data.split("\n") if line.startswith("INFO:root:>|")] + print(info_lines) + # Define a regex pattern to extract data from the log lines + pattern = r">\|(?P[\w\-,]+)\|Time from start:(?P[\d\.]+)ms" + step_time_pattern = r"Step time:(?P[\d\.]+)ms" + + activities = [] + + for line in info_lines: + if len(line) == 4: + activity = line[1] + end_time = float(line[2].split(":")[1].rstrip("ms")) + step_time = float(line[3].split(":")[1].rstrip("ms")) + start_time = end_time - step_time + activities.append((activity, start_time, step_time)) + 
return activities + def replace_with_env_var(input_string): format_args = { "TEST_SCRIPT_DIR": os.environ.get("TEST_SCRIPT_DIR", os.getcwd()), @@ -94,7 +115,7 @@ def print_exec_time_for_cmds(orch_outpt): # Total number of times a RunNode command was executed total_run_node_commands = len(node_and_times) - # print(node_and_times) + # Total time of all RunNode commands total_time = sum([entry[2] for entry in node_and_times]) @@ -144,7 +165,6 @@ def main(): logging.debug(f"|Pre-execution: {pre_command}") run_pre_execution_command(pre_command, os.environ.get('RESOURCE_DIR')) - # TODO: in the future, we are going to parse the orch_error and generate reports working_dir = replace_with_env_var(benchmark.get('working_dir', os.environ.get('TEST_SCRIPT_DIR'))) bash_cmd_str = [BASH_COMMAND] + replace_with_env_var(benchmark['command']).split(" ") @@ -159,16 +179,20 @@ def main(): print_results(benchmark['name'], bash_time, orch_time, diff_lines, diff_percentage) + activities = parse_logs_into_activities(log_data=orch_error) + plot_gantt(activities, REPORT_OUTPUT_DIR, f"{benchmark['name']}_gantt.pdf") + print_exec_time_for_cmds(orch_error) # Create output dir for reports os.makedirs(REPORT_OUTPUT_DIR, exist_ok=True) # Plot the results benchmark_names = [benchmark['name'] for benchmark in benchmarks_config] + + print(f"Execution graphs can be found in {REPORT_OUTPUT_DIR}") # print_sorted_logs(orch_error) plot_benchmark_times_combined(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_combined") plot_benchmark_times_individual(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_individual") - print(f"Execution graphs can be found in {REPORT_OUTPUT_DIR}") if __name__ == "__main__": From e927c29de95fadd93380cff7b44b3ed383a44e1f Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 5 Sep 2023 05:49:26 -0600 Subject: [PATCH 24/90] Print time lost and other execution statistics --- report/benchmark_report.py | 22 ++++++++++++++-------- 
1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/report/benchmark_report.py b/report/benchmark_report.py index 43c77b92..0c0e41b7 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -24,7 +24,6 @@ def parse_logs_into_activities(log_data): info_lines = [line.replace("INFO:root:>|", "").split("|") for line in log_data.split("\n") if line.startswith("INFO:root:>|")] - print(info_lines) # Define a regex pattern to extract data from the log lines pattern = r">\|(?P[\w\-,]+)\|Time from start:(?P[\d\.]+)ms" step_time_pattern = r"Step time:(?P[\d\.]+)ms" @@ -121,21 +120,26 @@ def print_exec_time_for_cmds(orch_outpt): # Extract and sum the total time of the step per node node_times = {} + node_distinct_times = {} counts = {} for node, _, time in node_and_times: if node in node_times: node_times[node] += time + node_distinct_times[node].append(time) counts[node] += 1 else: node_times[node] = time + node_distinct_times[node] = [time] counts[node] = 1 + + time_lost_per_node = {node: sum(node_distinct_times[node]) - node_distinct_times[node][-1] for node in node_times} print("-" * 40) print(f"Total number of times a RunNode command was executed: {total_run_node_commands}") print(f"Total time of all RunNode commands: {total_time:.3f}ms") - print("\nTotal time of the step per node:") - for node, time in sorted(node_times.items(), key=lambda x: x[1], reverse=True): - print(f"{node}: {time:.3f}ms ({counts[node]} times)") + print("\nTotal execution time per node:") + for node, time_lost in sorted(time_lost_per_node.items(), key=lambda x: x[1], reverse=True): + print(f"{node:2d}: {node_times[node]:.3f}ms ({counts[node]} times) | Avg: {sum(node_distinct_times[node])/len(node_distinct_times[node]):.3f}ms | {node_distinct_times[node]} | Time lost: {time_lost:.3f}ms") print("-" * 40) @@ -153,6 +157,9 @@ def main(): bash_times = [] orch_times = [] + + # Create output dir for reports + os.makedirs(REPORT_OUTPUT_DIR, exist_ok=True) for benchmark in 
benchmarks_config: @@ -180,12 +187,11 @@ def main(): print_results(benchmark['name'], bash_time, orch_time, diff_lines, diff_percentage) activities = parse_logs_into_activities(log_data=orch_error) - plot_gantt(activities, REPORT_OUTPUT_DIR, f"{benchmark['name']}_gantt.pdf") + plot_gantt(activities, REPORT_OUTPUT_DIR, f"{benchmark['name']}_gantt") - print_exec_time_for_cmds(orch_error) + print_exec_time_for_cmds(orch_error) - # Create output dir for reports - os.makedirs(REPORT_OUTPUT_DIR, exist_ok=True) + # Plot the results benchmark_names = [benchmark['name'] for benchmark in benchmarks_config] From ed095a64e3c4f0f93aac7032e8197deedd8b54e6 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Wed, 6 Sep 2023 02:35:07 -0600 Subject: [PATCH 25/90] Fix awk parsing bug --- parallel-orch/trace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parallel-orch/trace.py b/parallel-orch/trace.py index a356123e..ec41ae05 100644 --- a/parallel-orch/trace.py +++ b/parallel-orch/trace.py @@ -189,9 +189,9 @@ def is_launch(line): def parse_launch_command(trace_item): - assignment_prefix = trace_item.split(", ")[0].split( + assignment_prefix = trace_item.split("], ")[0].split( "([Command ")[1].rstrip("]").strip() - assignment_suffix = ", ".join(trace_item.split(", ")[1:]).strip() + assignment_suffix = ", ".join(trace_item.split("], ")[1:]).strip() assignment_string = assignment_suffix[1:-2].split(",") assignments = [(x.split("=")) for x in assignment_string] return assignment_prefix, assignments From 0d725356ddf439fe033facdff098d6d739771677 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Wed, 6 Sep 2023 03:21:44 -0600 Subject: [PATCH 26/90] Add 3.sh benchmark --- report/benchmarks/dgsh/3.sh | 130 ++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 report/benchmarks/dgsh/3.sh diff --git a/report/benchmarks/dgsh/3.sh b/report/benchmarks/dgsh/3.sh new file mode 100644 index 00000000..93ecbf7b --- /dev/null +++ 
b/report/benchmarks/dgsh/3.sh @@ -0,0 +1,130 @@ +#!/bin/bash + +## Note: Needs to be run on a big git repository to make sense (maybe linux) + +## Initialize the necessary temporary files +file1=$(mktemp) +file2=$(mktemp) +file3=$(mktemp) +file4=$(mktemp) + +find "$@" \( -name \*.c -or -name \*.h \) -type f -print0 >"$file1" + +echo -n 'FNAMELEN: ' + +tr \\0 \\n <"$file1" | +sed 's|^.*/||' | +awk '{s += length($1); n++} END { + if (n>0) + print s / n; + else + print 0; }' + +xargs -0 /bin/cat <"$file1" >"$file2" + +sed 's/#/@/g;s/\\[\\"'\'']/@/g;s/"[^"]*"/""/g;'"s/'[^']*'/''/g" <"$file2" | + cpp -P >"$file3" + +# Structure definitions +echo -n 'NSTRUCT: ' + +egrep -c 'struct[ ]*{|struct[ ]*[a-zA-Z_][a-zA-Z0-9_]*[ ]*{' <"$file3" +#}} (match preceding openings) + +# Type definitions +echo -n 'NTYPEDEF: ' +grep -cw typedef <"$file3" + +# Use of void +echo -n 'NVOID: ' +grep -cw void <"$file3" + +# Use of gets +echo -n 'NGETS: ' +grep -cw gets <"$file3" + +# Average identifier length +echo -n 'IDLEN: ' + +tr -cs 'A-Za-z0-9_' '\n' <"$file3" | +sort -u | +awk '/^[A-Za-z]/ { len += length($1); n++ } END { + if (n>0) + print len / n; + else + print 0; }' + +echo -n 'CHLINESCHAR: ' +wc -lc <"$file2" | + awk '{OFS=":"; print $1, $2}' + +echo -n 'NCCHAR: ' +sed 's/#/@/g' <"$file2" | +cpp -traditional -P | +wc -c | +awk '{OFMT = "%.0f"; print $1/1000}' + +# Number of comments +echo -n 'NCOMMENT: ' +egrep -c '/\*|//' <"$file2" + +# Occurences of the word Copyright +echo -n 'NCOPYRIGHT: ' +grep -ci copyright <"$file2" + +# C files +find "$@" -name \*.c -type f -print0 >"$file2" + +# Convert to newline separation for counting +tr \\0 \\n <"$file2" >"$file3" + +# Number of C files +echo -n 'NCFILE: ' +wc -l <"$file3" + +# Number of directories containing C files +echo -n 'NCDIR: ' +sed 's,/[^/]*$,,;s,^.*/,,' <"$file3" | +sort -u | +wc -l + +# C code +xargs -0 /bin/cat <"$file2" >"$file3" + +# Lines and characters +echo -n 'CLINESCHAR: ' +wc -lc <"$file3" | +awk '{OFS=":"; print 
$1, $2}' + +# C code without comments and strings +sed 's/#/@/g;s/\\[\\"'\'']/@/g;s/"[^"]*"/""/g;'"s/'[^']*'/''/g" <"$file3" | +cpp -P >"$file4" + +# Number of functions +echo -n 'NFUNCTION: ' +grep -c '^{' <"$file4" + +# Number of gotos +echo -n 'NGOTO: ' +grep -cw goto <"$file4" + +# Occurrences of the register keyword +echo -n 'NREGISTER: ' +grep -cw register <"$file4" + +# Number of macro definitions +echo -n 'NMACRO: ' +grep -c '@[ ]*define[ ][ ]*[a-zA-Z_][a-zA-Z0-9_]*(' <"$file4" +# Number of include directives +echo -n 'NINCLUDE: ' +grep -c '@[ ]*include' <"$file4" + +# Number of constants +echo -n 'NCONST: ' +grep -ohw '[0-9][x0-9][0-9a-f]*' <"$file4" | wc -l + + +# Header files +echo -n 'NHFILE: ' +find "$@" -name \*.h -type f | +wc -l \ No newline at end of file From af0fcee4d35d298bad9ee283504b1c4851358856 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Wed, 6 Sep 2023 03:22:01 -0600 Subject: [PATCH 27/90] Uncomment awk commands --- report/benchmarks/dgsh/2_no_func.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/report/benchmarks/dgsh/2_no_func.sh b/report/benchmarks/dgsh/2_no_func.sh index 37e81ef1..37e28857 100644 --- a/report/benchmarks/dgsh/2_no_func.sh +++ b/report/benchmarks/dgsh/2_no_func.sh @@ -8,8 +8,8 @@ git log --format="%an:%ad" --date=default "$@" >"$file1" echo "Authors ordered by number of commits" # Order by frequency -awk -F: '{print $1}' <"$file1" # | sort | uniq | sort -rn +awk -F: '{print $1}' <"$file1" | sort | uniq | sort -rn echo "Days ordered by number of commits" # Order by frequency -# awk -F: '{print substr($2, 1, 3)}' <"$file1" | sort | uniq | sort -rn +awk -F: '{print substr($2, 1, 3)}' <"$file1" | sort | uniq | sort -rn From 98a8078b7ab37775c0d82456c697379a46d6ce42 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Wed, 6 Sep 2023 03:29:21 -0600 Subject: [PATCH 28/90] Add 4.sh benchmark --- report/benchmarks/dgsh/4.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 
report/benchmarks/dgsh/4.sh diff --git a/report/benchmarks/dgsh/4.sh b/report/benchmarks/dgsh/4.sh new file mode 100644 index 00000000..e2f0c003 --- /dev/null +++ b/report/benchmarks/dgsh/4.sh @@ -0,0 +1,19 @@ + +#!/bin/bash + +## Initialize the necessary temporary files +file1=$(mktemp) +file2=$(mktemp) + +# Create list of files +find "$@" -type f | +xargs openssl md5 | +sed 's/^MD5(//;s/)= / /' | +sort -k2 > "$file1" +awk '{print $2}' < "$file1" | uniq -d > "$file2" +join -2 2 "$file2" "$file1" | +awk ' +BEGIN {ORS=""} +$1 != prev && prev {print "\n"} +END {if (prev) print "\n"} +{if (prev) print " "; prev = $1; print $2}' From d20813270fee8fd552b25abbb0a91b3b081460d4 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Wed, 6 Sep 2023 03:36:08 -0600 Subject: [PATCH 29/90] Add 5.sh dgsh benchmark --- report/benchmarks/dgsh/5.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 report/benchmarks/dgsh/5.sh diff --git a/report/benchmarks/dgsh/5.sh b/report/benchmarks/dgsh/5.sh new file mode 100644 index 00000000..21a7fc58 --- /dev/null +++ b/report/benchmarks/dgsh/5.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +## Initialize the necessary temporary files +file1=$(mktemp) +file2=$(mktemp) +file3=$(mktemp) +file4=$(mktemp) + +export LC_ALL=C + +cat $INPUT_FILE >"$file1" + +# Find errors + +# Obtain list of words in text +cat "$file1" | +tr -cs A-Za-z \\n | +tr A-Z a-z | +sort -u > "$file2" + +# Ensure dictionary is compatibly sorted +cat "$file1" | +sort /usr/share/dict/words > "$file3" + +# List errors as a set difference +comm -23 "$file2" "$file3" > "$file4" + +fgrep -f "$file4" -i --color -w -C 2 "$file1" From 190afbda92ac3c609816a8c3d136201c5fcbc472 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Wed, 6 Sep 2023 03:40:13 -0600 Subject: [PATCH 30/90] Add benchmark config for dgsh scripts 3-5 --- report/benchmark_config.json | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/report/benchmark_config.json 
b/report/benchmark_config.json index 1c1da3b3..ed10a8d3 100644 --- a/report/benchmark_config.json +++ b/report/benchmark_config.json @@ -40,5 +40,33 @@ "command": "{TEST_SCRIPT_DIR}/dgsh/2_no_func.sh", "working_dir": "{RESOURCE_DIR}/try", "orch_args": "-d 2" + }, + { + "name": "Dgsh 3.sh - Riker Repo", + "pre_execution_script": ["git clone https://github.com/curtsinger-lab/riker.git"], + "command": "{TEST_SCRIPT_DIR}/dgsh/3.sh", + "working_dir": "{RESOURCE_DIR}/riker", + "orch_args": "-d 2" + }, + { + "name": "Dgsh 4.sh (no function) - PaSh Repo", + "pre_execution_script": ["git clone https://github.com/curtsinger-lab/riker.git"], + "command": "{TEST_SCRIPT_DIR}/dgsh/2_no_func.sh", + "working_dir": "{RESOURCE_DIR}/riker", + "orch_args": "-d 2" + }, + { + "name": "Dgsh 5.sh - 2M", + "env": ["INPUT_FILE={RESOURCE_DIR}/in2M.xml"], + "pre_execution_script": ["wget -nc -O in2M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/mondial/mondial-3.0.xml"], + "command": "{TEST_SCRIPT_DIR}/dgsh/5.sh", + "orch_args": "-d 2" + }, + { + "name": "Dgsh 5.sh - 120M", + "env": ["INPUT_FILE={RESOURCE_DIR}/in120M.xml"], + "pre_execution_script": ["wget -nc -O in120M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.xml"], + "command": "{TEST_SCRIPT_DIR}/dgsh/5.sh", + "orch_args": "-d 2" } ] From f04d9457084556bf688ac905e5686651fff85f9c Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 7 Sep 2023 02:35:25 -0600 Subject: [PATCH 31/90] Add 7.sh dgsh benchmark without function calls --- report/benchmarks/dgsh/6.sh | 33 ++++++++++++++++++ report/benchmarks/dgsh/7_no_func.sh | 52 +++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100755 report/benchmarks/dgsh/6.sh create mode 100755 report/benchmarks/dgsh/7_no_func.sh diff --git a/report/benchmarks/dgsh/6.sh b/report/benchmarks/dgsh/6.sh new file mode 100755 index 00000000..4a919523 --- /dev/null +++ b/report/benchmarks/dgsh/6.sh @@ -0,0 +1,33 @@ +#!/bin/bash 
+ +## Initialize the necessary temporary files +file1=$(mktemp) +file2=$(mktemp) +file3=$(mktemp) +file4=$(mktemp) +file5=$(mktemp) + +cat $INPUT_FILE > $file1 + +# Consistent sorting across machines +export LC_ALL=C + +# Stream input from file and split input one word per line +# Create list of unique words +tr -cs a-zA-Z '\n' < "$file1" | +sort -u > "$file2" + +# List two-letter palindromes +sed 's/.*\(.\)\(.\)\2\1.*/p: \1\2-\2\1/;t;g' "$file2" > "$file3" + +# List four consecutive consonants +sed -E 's/.*([^aeiouyAEIOUY]{4}).*/c: \1/;t;g' "$file2" > "$file4" + +# List length of words longer than 12 characters +awk '{if (length($1) > 12) print "l:", length($1); + else print ""}' "$file2" > "$file5" + +# Paste the four streams side-by-side +# List only words satisfying one or more properties +paste "$file2" "$file3" "$file4" "$file5" | +fgrep : diff --git a/report/benchmarks/dgsh/7_no_func.sh b/report/benchmarks/dgsh/7_no_func.sh new file mode 100755 index 00000000..3c22d571 --- /dev/null +++ b/report/benchmarks/dgsh/7_no_func.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# Consistent sorting across machines +export LC_ALL=C + +# Temporary files +file1=$(mktemp) +file2=$(mktemp) +file3=$(mktemp) + +cat $INPUT_FILE > "$file1" + +# Split input one word per line +tr -cs a-zA-Z '\n' < "$file1" > "$file2" + +# Digram frequency +echo "Digram frequency" +perl -ne 'for ($i = 0; $i < length($_) - 2; $i++) { + print substr($_, $i, 2), "\n"; +}' < "$file2" | +awk '{count[$1]++} END {for (i in count) print count[i], i}' | +sort -rn + +# Trigram frequency +echo "Trigram frequency" +perl -ne 'for ($i = 0; $i < length($_) - 3; $i++) { + print substr($_, $i, 3), "\n"; +}' < "$file2" | +awk '{count[$1]++} END {for (i in count) print count[i], i}' | +sort -rn + +# Word frequency +echo "Word frequency" +awk '{count[$1]++} END {for (i in count) print count[i], i}' < "$file2" | +sort -rn + +# Store number of characters to use in awk below +nchars=$(wc -c < "$file1") + +# Character frequency +# 
Print absolute +echo "Character frequency" +sed 's/./&\ +/g' < "$file1" | +awk '{count[$1]++} END {for (i in count) print count[i], i}' | +sort -rn | tee "$file3" + +# Print relative +echo "Relative character frequency" +awk -v NCHARS=$nchars 'BEGIN { + OFMT = "%.2g%%"} + {print $1, $2, $1 / NCHARS * 100}' "$file3" From eeaa938389bc14910c20743fb40880f372db63d3 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 7 Sep 2023 06:08:19 -0600 Subject: [PATCH 32/90] Comment-out 7.sh awk command --- report/benchmarks/dgsh/7_no_func.sh | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/report/benchmarks/dgsh/7_no_func.sh b/report/benchmarks/dgsh/7_no_func.sh index 3c22d571..4aab48ce 100755 --- a/report/benchmarks/dgsh/7_no_func.sh +++ b/report/benchmarks/dgsh/7_no_func.sh @@ -1,14 +1,16 @@ #!/bin/bash # Consistent sorting across machines -export LC_ALL=C +# export LC_ALL=C # Temporary files file1=$(mktemp) file2=$(mktemp) file3=$(mktemp) +file4=$(mktemp) -cat $INPUT_FILE > "$file1" +cat $INPUT_FILE > $file1 +cat $file1 # Split input one word per line tr -cs a-zA-Z '\n' < "$file1" > "$file2" @@ -35,18 +37,18 @@ awk '{count[$1]++} END {for (i in count) print count[i], i}' < "$file2" | sort -rn # Store number of characters to use in awk below + nchars=$(wc -c < "$file1") # Character frequency # Print absolute echo "Character frequency" -sed 's/./&\ -/g' < "$file1" | +sed 's/./&\n/g' < "$file1" | awk '{count[$1]++} END {for (i in count) print count[i], i}' | sort -rn | tee "$file3" # Print relative -echo "Relative character frequency" -awk -v NCHARS=$nchars 'BEGIN { - OFMT = "%.2g%%"} - {print $1, $2, $1 / NCHARS * 100}' "$file3" +# echo "Relative character frequency" +# awk -v NCHARS=$nchars 'BEGIN { +# OFMT = "%.2g%%"} +# {print $1, $2, $1 / NCHARS * 100}' "$file3" \ No newline at end of file From b39ff74a4fd0408f38efd0b4717e80cf9f14b791 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 7 Sep 2023 06:09:41 -0600 Subject: [PATCH 33/90] Ad 
latest benchmark config for 6.sh and 7.sh --- report/benchmark_config.json | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/report/benchmark_config.json b/report/benchmark_config.json index ed10a8d3..ae6862d2 100644 --- a/report/benchmark_config.json +++ b/report/benchmark_config.json @@ -1,9 +1,9 @@ [ { - "name": "Dgsh 1.sh - 2M", - "env": ["INPUT_FILE={RESOURCE_DIR}/in2M.xml"], - "pre_execution_script": ["wget -nc -O in2M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/mondial/mondial-3.0.xml"], - "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh", + "name": "Dgsh 6.sh", + "env": ["INPUT_FILE={RESOURCE_DIR}/words.txt"], + "pre_execution_script": ["wget -nc -O words.txt https://raw.githubusercontent.com/dwyl/english-words/master/words.txt"], + "command": "{TEST_SCRIPT_DIR}/dgsh/6.sh", "orch_args": "-d 2" }, { @@ -68,5 +68,19 @@ "pre_execution_script": ["wget -nc -O in120M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.xml"], "command": "{TEST_SCRIPT_DIR}/dgsh/5.sh", "orch_args": "-d 2" + }, + { + "name": "Dgsh 6.sh", + "env": ["INPUT_FILE={RESOURCE_DIR}/words.txt"], + "pre_execution_script": ["wget -nc -O words.txt https://raw.githubusercontent.com/dwyl/english-words/master/words.txt"], + "command": "{TEST_SCRIPT_DIR}/dgsh/6.sh", + "orch_args": "-d 2" + }, + { + "name": "Dgsh 7.sh (no func)", + "env": ["INPUT_FILE={RESOURCE_DIR}/words.txt"], + "pre_execution_script": ["wget -nc -O words.txt https://raw.githubusercontent.com/dwyl/english-words/master/words.txt"], + "command": "{TEST_SCRIPT_DIR}/dgsh/7_no_func.sh", + "orch_args": "-d 2" } ] From 743c8f26b3cd470b1aeb1e2dc1a2140487311e8d Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 7 Sep 2023 07:51:01 -0600 Subject: [PATCH 34/90] Rename benchmark 7 to 8 --- report/benchmark_config.json | 2 +- report/benchmarks/dgsh/{7_no_func.sh => 8_no_func.sh} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename 
report/benchmarks/dgsh/{7_no_func.sh => 8_no_func.sh} (100%) diff --git a/report/benchmark_config.json b/report/benchmark_config.json index ae6862d2..67d767f7 100644 --- a/report/benchmark_config.json +++ b/report/benchmark_config.json @@ -80,7 +80,7 @@ "name": "Dgsh 7.sh (no func)", "env": ["INPUT_FILE={RESOURCE_DIR}/words.txt"], "pre_execution_script": ["wget -nc -O words.txt https://raw.githubusercontent.com/dwyl/english-words/master/words.txt"], - "command": "{TEST_SCRIPT_DIR}/dgsh/7_no_func.sh", + "command": "{TEST_SCRIPT_DIR}/dgsh/8_no_func.sh", "orch_args": "-d 2" } ] diff --git a/report/benchmarks/dgsh/7_no_func.sh b/report/benchmarks/dgsh/8_no_func.sh similarity index 100% rename from report/benchmarks/dgsh/7_no_func.sh rename to report/benchmarks/dgsh/8_no_func.sh From 988055957fe60adf3cade5f6e855f6b6c206c316 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 7 Sep 2023 08:04:22 -0600 Subject: [PATCH 35/90] Add 9.sh --- report/benchmark_config.json | 8 ++++++++ report/benchmarks/dgsh/9.sh | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 report/benchmarks/dgsh/9.sh diff --git a/report/benchmark_config.json b/report/benchmark_config.json index 67d767f7..60454a1e 100644 --- a/report/benchmark_config.json +++ b/report/benchmark_config.json @@ -82,5 +82,13 @@ "pre_execution_script": ["wget -nc -O words.txt https://raw.githubusercontent.com/dwyl/english-words/master/words.txt"], "command": "{TEST_SCRIPT_DIR}/dgsh/8_no_func.sh", "orch_args": "-d 2" + }, + { + "name": "Dgsh 9.sh - Riker Repo", + "env": ["INPUT={RESOURCE_DIR}/../../deps/riker/"], + "pre_execution_script": ["git clone https://github.com/curtsinger-lab/riker.git"], + "command": "{TEST_SCRIPT_DIR}/dgsh/9.sh", + "working_dir": "{RESOURCE_DIR}/riker", + "orch_args": "-d 2" } ] diff --git a/report/benchmarks/dgsh/9.sh b/report/benchmarks/dgsh/9.sh new file mode 100644 index 00000000..25b20804 --- /dev/null +++ b/report/benchmarks/dgsh/9.sh @@ -0,0 +1,18 @@ +#!/bin/bash + 
+## Initialize the necessary temporary files +file1=$(mktemp) +file2=$(mktemp) +file3=$(mktemp) + +# Find object files and print defined symbols +find "$INPUT" -name "*.o" | xargs nm > "$file1" + +# List all defined (exported) symbols +awk 'NF == 3 && $2 ~ /[A-Z]/ {print $3}' "$file1" | sort > "$file2" + +# List all undefined (imported) symbols +awk '$1 == "U" {print $2}' "$file1" | sort > "$file3" + +# Print exports that are not imported +comm -23 "$file2" "$file3" From e5ca215792341f0fab73e6604c99dbb52406f9d7 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 7 Sep 2023 08:28:47 -0600 Subject: [PATCH 36/90] Add more env variables to keep track of --- parallel-orch/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parallel-orch/config.py b/parallel-orch/config.py index 80de8ca1..23707993 100644 --- a/parallel-orch/config.py +++ b/parallel-orch/config.py @@ -41,7 +41,7 @@ def log_root(msg, *args, **kwargs): 'PASH_TOP', 'PASH_TOP_LEVEL','RANDOM', 'LOGNAME', 'MACHTYPE', 'MOTD_SHOWN', 'OPTERR', 'OPTIND', 'PPID', 'PROMPT_COMMAND', 'PS4', 'SHELL', 'SHELLOPTS', 'SHLVL', 'TERM', 'UID', 'USER', 'XDG_SESSION_ID'} -SIGNIFICANT_VARS = {'foo', 'bar', 'baz'} +SIGNIFICANT_VARS = {'foo', 'bar', 'baz','file1', 'file2', 'file3', 'file4', 'file5', 'LC_ALL', 'nchars'} START_TIME = time.time() From 265909e09f4ca41f2e2b25e79a8e139083d2a137 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 7 Sep 2023 08:30:34 -0600 Subject: [PATCH 37/90] Split pre exec cmd outside of run command --- report/benchmark_report.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/report/benchmark_report.py b/report/benchmark_report.py index 0c0e41b7..cb0b2438 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -49,7 +49,7 @@ def replace_with_env_var(input_string): def run_pre_execution_command(command, working_dir=os.getcwd()): print("Running pre-execution command:", command) - process = subprocess.Popen(command.strip().split(" 
"), cwd=working_dir) + process = subprocess.Popen(command, cwd=working_dir) process.wait() return process.returncode @@ -169,14 +169,14 @@ def main(): os.makedirs(os.environ.get('RESOURCE_DIR'), exist_ok=True) # Run pre-execution commands for pre_command in benchmark.get('pre_execution_script', []): - logging.debug(f"|Pre-execution: {pre_command}") - run_pre_execution_command(pre_command, os.environ.get('RESOURCE_DIR')) + print(f"{pre_command}") + split_pre_command = replace_with_env_var(pre_command).split(" ") + run_pre_execution_command(split_pre_command, os.environ.get('RESOURCE_DIR')) working_dir = replace_with_env_var(benchmark.get('working_dir', os.environ.get('TEST_SCRIPT_DIR'))) bash_cmd_str = [BASH_COMMAND] + replace_with_env_var(benchmark['command']).split(" ") bash_time, bash_output, _bash_error = run_command(bash_cmd_str, working_dir) - orch_cmd_str = replace_with_env_var(benchmark['command']).split(" ") orch_time, orch_output, orch_error = run_command_with_orch(orch_cmd_str, benchmark['orch_args'], working_dir) bash_times.append(bash_time) From 4a41309e0c6a9f67d202e5fd99b4d92e963544f1 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 7 Sep 2023 10:16:36 -0600 Subject: [PATCH 38/90] Remove LC_ALL var export from benchmarks --- report/benchmarks/dgsh/5.sh | 2 +- report/benchmarks/dgsh/6.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) mode change 100644 => 100755 report/benchmarks/dgsh/5.sh diff --git a/report/benchmarks/dgsh/5.sh b/report/benchmarks/dgsh/5.sh old mode 100644 new mode 100755 index 21a7fc58..0c639dad --- a/report/benchmarks/dgsh/5.sh +++ b/report/benchmarks/dgsh/5.sh @@ -6,7 +6,7 @@ file2=$(mktemp) file3=$(mktemp) file4=$(mktemp) -export LC_ALL=C +# export LC_ALL=C cat $INPUT_FILE >"$file1" diff --git a/report/benchmarks/dgsh/6.sh b/report/benchmarks/dgsh/6.sh index 4a919523..ad1001d4 100755 --- a/report/benchmarks/dgsh/6.sh +++ b/report/benchmarks/dgsh/6.sh @@ -10,7 +10,7 @@ file5=$(mktemp) cat $INPUT_FILE > $file1 # 
Consistent sorting across machines -export LC_ALL=C +# export LC_ALL=C # Stream input from file and split input one word per line # Create list of unique words From c3286edcfb0b72d76d936c6019bd1286fe5d7981 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 7 Sep 2023 11:00:48 -0600 Subject: [PATCH 39/90] Adjust gantt plot dimensions dynamically --- report/benchmark_plots.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/report/benchmark_plots.py b/report/benchmark_plots.py index 632f8184..7e7996d2 100644 --- a/report/benchmark_plots.py +++ b/report/benchmark_plots.py @@ -42,23 +42,27 @@ def plot_benchmark_times_individual(benchmarks, bash_times, orch_times, output_d plt.tight_layout() plt.savefig(os.path.join(output_dir, f"{filename}.pdf")) -def plot_gantt(activities, output_dir, filename): - fig, ax = plt.subplots(figsize=(15, 20)) # Increase figure size + +def plot_gantt(activities, output_dir, filename, simple=False): + + if simple: + activities = [activity for activity in activities if activity[0].startswith("RunNode,") or activity[0] == "Wait"] + + # Set figure height based on the number of activities + fig_height = len(activities) + fig, ax = plt.subplots(figsize=(15, 0.2 * fig_height)) # Sort the activities by their start time activities.sort(key=lambda x: x[1]) - # Reduce the height of each bar and reduce the gap between bars - bar_height = 5 - gap = 1 + bar_height = 0.8 + gap = 0.2 - # Plotting each activity for index, activity in enumerate(activities): action, start_time, duration = activity ax.broken_barh([(start_time, duration)], (index*(bar_height + gap), bar_height), facecolors='blue', edgecolor='black') - ax.text(start_time + duration/2, index*(bar_height + gap) + bar_height/2, action, ha='center', va='center', fontsize=6, color='white') + ax.text(start_time + duration/2, index*(bar_height + gap) + bar_height/2, action, ha='center', va='center', fontsize=6, color='gray') - # Setting labels & title 
ax.set_xlabel('Time (ms)') ax.set_title(f'Gantt Chart of {filename.strip("_gantt.pdf")}') ax.set_yticks([i*(bar_height + gap) + bar_height/2 for i in range(len(activities))]) From 727eed64757dca5623f305b9cf2de0063d2b3c5d Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 8 Sep 2023 01:52:21 -0600 Subject: [PATCH 40/90] Add 7.sh --- report/benchmarks/dgsh/7.sh | 151 ++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 report/benchmarks/dgsh/7.sh diff --git a/report/benchmarks/dgsh/7.sh b/report/benchmarks/dgsh/7.sh new file mode 100644 index 00000000..becbd891 --- /dev/null +++ b/report/benchmarks/dgsh/7.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +# Consistent sorting +export LC_ALL=C + +# Print initial header only if DGSH_DRAW_EXIT is not set +if [ -z "${DGSH_DRAW_EXIT}" ] +then + cat < "$file_initial" + +# Number of accesses +echo -n 'Number of accesses: ' +wc -l < "$file_initial" + +# Total transferred bytes +awk '{s += $NF} END {print s}' "$file_initial" > "$file_bytes" +echo -n 'Number of Gbytes transferred: ' +awk '{print $1 / 1024 / 1024 / 1024}' "$file_bytes" + +# Process Host names +awk '{print $1}' "$file_initial" > "$file_hosts" + +# Number of accesses +echo -n 'Number of accesses: ' +wc -l < "$file_hosts" + +# Sorted hosts +sort "$file_hosts" > "$file_sorted_hosts" + +# Unique hosts +uniq "$file_sorted_hosts" > "$file_unique_hosts" +echo -n 'Number of hosts: ' +wc -l < "$file_unique_hosts" + +# Number of TLDs +awk -F. '$NF !~ /[0-9]/ {print $NF}' "$file_unique_hosts" | sort -u | wc -l +echo -n 'Number of top level domains: ' + +# Top 10 hosts +echo +echo "Top 10 Hosts" +echo "Top 10 Hosts" | sed 's/./-/g' + +uniq -c "$file_sorted_hosts" | sort -rn | head -10 +echo + +# Top 20 TLDs +echo +echo "Top 20 Level Domain Accesses" +echo "Top 20 Level Domain Accesses" | sed 's/./-/g' + +awk -F. '$NF !~ /^[0-9]/ {print $NF}' "$file_sorted_hosts" | sort | uniq -c | sort -rn | head -20 +echo + +# Domains +awk -F. 
'BEGIN {OFS = "."} $NF !~ /^[0-9]/ {$1 = ""; print}' "$file_sorted_hosts" | sort > "$file_domains" + +# Number of domains +echo -n 'Number of domains: ' +uniq "$file_domains" | wc -l + +# Top 10 domains +echo +echo "Top 10 domains" +echo "Top 10 domains" | sed 's/./-/g' +uniq -c "$file_domains" | sort -rn | head -10 < "$file_domains" + +# Hosts by volume +echo +echo "Top 10 Hosts by Transfer" +echo "Top 10 Hosts by Transfer" | sed 's/./-/g' +awk ' {bytes[$1] += $NF} +END {for (h in bytes) print bytes[h], h}' "$file_initial" | sort -rn | head -10 + +# Sorted page name requests +awk '{print $7}' "$file_initial" | sort > "$file_requests" + +# Top 20 area requests (input is already sorted) +echo +echo "Top 20 area requests" +echo "Top 20 area requests" | sed 's/./-/g' +awk -F/ '{print $2}' "$file_requests" | uniq -c | sort -rn | head -20 +# Number of different pages +echo -n 'Number of different pages: ' +uniq "$file_requests" | wc -l + +# Top 20 requests +echo +echo "Top 20 requests" +echo "Top 20 requests" | sed 's/./-/g' +uniq -c "$file_requests" | sort -rn | head -20 + +# Access time: dd/mmm/yyyy:hh:mm:ss +awk '{print substr($4, 2)}' "$file_initial" > "$file_times" + +# Just dates +awk -F: '{print $1}' "$file_times" > "$file_dates" + +# Number of days +echo -n 'Accesses per day: ' +uniq "$file_dates" | wc -l > "$file_day_count" +awk ' +BEGIN { + getline NACCESS < "'"$file_initial"'" +} +{print NACCESS / $1}' "$file_day_count" + +echo -n 'MBytes per day: ' +awk ' +BEGIN { + getline NXBYTES < "'"$file_bytes"'" +} +{print NXBYTES / $1 / 1024 / 1024}' "$file_day_count" + +echo +echo "Accesses by Date" +echo "Accesses by Date" | sed 's/./-/g' +uniq -c < "$file_dates" + +# Accesses by day of week +echo +echo "Accesses by Day of Week" +echo "Accesses by Day of Week" | sed 's/./-/g' +sed 's|/|-|g' "$file_dates" | date -f - +%a 2>/dev/null | sort | uniq -c | sort -rn + +# Accesses by Local Hour +echo +echo "Accesses by Local Hour" +echo "Accesses by Local Hour" | sed 
's/./-/g' +awk -F: '{print $2}' "$file_times" | sort | uniq -c From 9275db14c9add40d50a280707273b0e76c7dae70 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 8 Sep 2023 05:47:59 -0600 Subject: [PATCH 41/90] Correctly resolve all terminal nodes --- parallel-orch/trace.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/parallel-orch/trace.py b/parallel-orch/trace.py index ec41ae05..4fb93081 100644 --- a/parallel-orch/trace.py +++ b/parallel-orch/trace.py @@ -4,6 +4,7 @@ from typing import Tuple from enum import Enum import logging +from copy import deepcopy class Ref(Enum): @@ -44,6 +45,9 @@ def resolve_permissions(self, permissions: str): def __str__(self): return f"PathRef({self.ref}, {self.path}, {'r' if self.is_read else '-'}{'w' if self.is_write else '-'}{'x' if self.is_exec else '-'} {'no follow' if self.is_nofollow else ''})" + + def __repr__(self) -> str: + return self.__str__() def get_resolved_path(self): # Remove dupliate prefixes @@ -60,6 +64,7 @@ def get_resolved_path(self): return os.path.join(commonprefix, ref_without_prefix, path_without_prefix).replace("/./", "/") + class PathRefKey: @@ -339,19 +344,25 @@ def replace_path_ref_terminal_nodes(refs_dict: dict): refs_dict_new = {} for i, ref in refs_dict.items(): if isinstance(ref, PathRef): + # HACK: This is hard-coded stdout if ref.path == "" and ref.is_nofollow: continue else: - if ref.ref not in refs_dict: + # If ref of ref is string, it means that we reached a terminal node. 
+ if isinstance(ref.ref, str): + pass + elif ref.ref not in refs_dict: key = PathRefKey("No Command", "r1") ref.ref = refs_dict[key].value else: + if isinstance(refs_dict[ref.ref], Ref): - ref.ref = refs_dict[ref.ref].value + ref.ref = deepcopy(str(refs_dict[ref.ref].value)) else: - ref.ref = os.getcwd() - refs_dict_new[i] = ref + ref.ref = deepcopy(refs_dict[ref.ref]) + assert(i not in refs_dict_new) + refs_dict_new[i] = deepcopy(ref) return refs_dict_new From b4419718b5541b38052b21b5502e522a8b035493 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 8 Sep 2023 09:06:58 -0600 Subject: [PATCH 42/90] Add 17.sh --- report/benchmarks/dgsh/17.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 report/benchmarks/dgsh/17.sh diff --git a/report/benchmarks/dgsh/17.sh b/report/benchmarks/dgsh/17.sh new file mode 100644 index 00000000..effa236f --- /dev/null +++ b/report/benchmarks/dgsh/17.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Initialize the necessary temporary files +file1=$(mktemp) +file2=$(mktemp) +file3=$(mktemp) + +# Read the input stream and save to a temporary file +cat $INPUT_FILE > "$file1" + +# Process the input in two different ways +cut -d , -f 5-6 "$file1" > "$file2" +cut -d , -f 2-4 "$file1" > "$file3" + +# Merge the processed results +paste -d , "$file2" "$file3" From 4452cee5e7a9ff8bd80b0493f1fe10480bdd8fd1 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 8 Sep 2023 09:24:35 -0600 Subject: [PATCH 43/90] Add 16.sh --- report/benchmarks/dgsh/16.sh | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 report/benchmarks/dgsh/16.sh diff --git a/report/benchmarks/dgsh/16.sh b/report/benchmarks/dgsh/16.sh new file mode 100644 index 00000000..718935c2 --- /dev/null +++ b/report/benchmarks/dgsh/16.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Initialize the necessary temporary files +file1=$(mktemp) +file2=$(mktemp) +file3=$(mktemp) +file4=$(mktemp) + +# Save the ls output to a temporary file +ls -n > "$file1" + +# 
Reorder fields in DIR-like way +awk '!/^total/ {print $6, $7, $8, $1, sprintf("%8d", $5), $9}' "$file1" > "$file2" + +# Count number of files +wc -l "$file1" | tr -d \\n > "$file3" +echo -n ' File(s) ' >> "$file3" +awk '{s += $5} END {printf("%d bytes\n", s)}' "$file1" >> "$file3" + +# Count number of directories and print label for number of dirs and calculate free bytes +grep -c '^d' "$file1" | tr -d \\n > "$file4" +df -h . | awk '!/Use%/{print " Dir(s) " $4 " bytes free"}' >> "$file4" + +# Display the results +cat "$file2" "$file3" "$file4" From 00f9b19b620b8a167f427a4a84a0a264c7681272 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 8 Sep 2023 09:25:43 -0600 Subject: [PATCH 44/90] Rename benchmark scripts --- report/benchmarks/dgsh/16.sh | 25 ------------------------- report/benchmarks/dgsh/17.sh | 23 ++++++++++++++++------- report/benchmarks/dgsh/18.sh | 16 ++++++++++++++++ 3 files changed, 32 insertions(+), 32 deletions(-) delete mode 100644 report/benchmarks/dgsh/16.sh create mode 100644 report/benchmarks/dgsh/18.sh diff --git a/report/benchmarks/dgsh/16.sh b/report/benchmarks/dgsh/16.sh deleted file mode 100644 index 718935c2..00000000 --- a/report/benchmarks/dgsh/16.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# Initialize the necessary temporary files -file1=$(mktemp) -file2=$(mktemp) -file3=$(mktemp) -file4=$(mktemp) - -# Save the ls output to a temporary file -ls -n > "$file1" - -# Reorder fields in DIR-like way -awk '!/^total/ {print $6, $7, $8, $1, sprintf("%8d", $5), $9}' "$file1" > "$file2" - -# Count number of files -wc -l "$file1" | tr -d \\n > "$file3" -echo -n ' File(s) ' >> "$file3" -awk '{s += $5} END {printf("%d bytes\n", s)}' "$file1" >> "$file3" - -# Count number of directories and print label for number of dirs and calculate free bytes -grep -c '^d' "$file1" | tr -d \\n > "$file4" -df -h . 
| awk '!/Use%/{print " Dir(s) " $4 " bytes free"}' >> "$file4" - -# Display the results -cat "$file2" "$file3" "$file4" diff --git a/report/benchmarks/dgsh/17.sh b/report/benchmarks/dgsh/17.sh index effa236f..718935c2 100644 --- a/report/benchmarks/dgsh/17.sh +++ b/report/benchmarks/dgsh/17.sh @@ -4,13 +4,22 @@ file1=$(mktemp) file2=$(mktemp) file3=$(mktemp) +file4=$(mktemp) -# Read the input stream and save to a temporary file -cat $INPUT_FILE > "$file1" +# Save the ls output to a temporary file +ls -n > "$file1" -# Process the input in two different ways -cut -d , -f 5-6 "$file1" > "$file2" -cut -d , -f 2-4 "$file1" > "$file3" +# Reorder fields in DIR-like way +awk '!/^total/ {print $6, $7, $8, $1, sprintf("%8d", $5), $9}' "$file1" > "$file2" -# Merge the processed results -paste -d , "$file2" "$file3" +# Count number of files +wc -l "$file1" | tr -d \\n > "$file3" +echo -n ' File(s) ' >> "$file3" +awk '{s += $5} END {printf("%d bytes\n", s)}' "$file1" >> "$file3" + +# Count number of directories and print label for number of dirs and calculate free bytes +grep -c '^d' "$file1" | tr -d \\n > "$file4" +df -h . 
| awk '!/Use%/{print " Dir(s) " $4 " bytes free"}' >> "$file4" + +# Display the results +cat "$file2" "$file3" "$file4" diff --git a/report/benchmarks/dgsh/18.sh b/report/benchmarks/dgsh/18.sh new file mode 100644 index 00000000..effa236f --- /dev/null +++ b/report/benchmarks/dgsh/18.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Initialize the necessary temporary files +file1=$(mktemp) +file2=$(mktemp) +file3=$(mktemp) + +# Read the input stream and save to a temporary file +cat $INPUT_FILE > "$file1" + +# Process the input in two different ways +cut -d , -f 5-6 "$file1" > "$file2" +cut -d , -f 2-4 "$file1" > "$file3" + +# Merge the processed results +paste -d , "$file2" "$file3" From 01faf8f4eacf65ddb521290e9369a0732b184b86 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 12 Sep 2023 03:35:03 -0600 Subject: [PATCH 45/90] Update the operation of some scheduling components --- parallel-orch/partial_program_order.py | 42 +++++++++++++++++++++++--- parallel-orch/util.py | 20 +++++++----- 2 files changed, 50 insertions(+), 12 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 3fb99088..ce95e201 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -629,14 +629,18 @@ def cmd_can_be_resolved(self, node_id: int) -> bool: logging.debug(f' >> Able to resolve {node_id}') return True - def __kill_all_currently_executing_and_schedule_restart(self): + def __kill_all_currently_executing_and_schedule_restart(self, node_ids: "list[NodeId]"): nodes_to_kill = self.get_currently_executing() + latest_env = self.get_latest_env_file_for_node(max(node_ids)) for cmd_id in nodes_to_kill: self.__kill_node(cmd_id) + self.set_latest_env_file_for_node(cmd_id, latest_env) self.workset.remove(cmd_id) # Our new workset is the nodes that were killed # Previous workset got killed self.workset.extend(nodes_to_kill) + + def __kill_node(self, cmd_id: "NodeId"): logging.debug(f'Killing and restarting 
node {cmd_id} because some workspaces have to be committed') @@ -671,7 +675,7 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self): else: logging.debug(f" > Nodes to be committed this round: {to_commit}") logging.trace(f"Commit|"+",".join(str(node_id) for node_id in to_commit)) - self.__kill_all_currently_executing_and_schedule_restart() + # self.__kill_all_currently_executing_and_schedule_restart(to_commit) log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling") self.commit_cmd_workspaces(to_commit) @@ -1127,7 +1131,21 @@ def __frontier_commit_and_push(self): # If node is still being executed, we cannot progress further else: new_frontier.extend([frontier_node]) - logging.debug(f" > Not commiting node {frontier_node}, readding to frontier") + if frontier_node in self.get_currently_executing(): + logging.debug(f" > Node {frontier_node} is still being executed") + elif frontier_node in self.get_committed(): + logging.debug(f" > Node {frontier_node} is already committed") + elif frontier_node in self.stopped: + logging.debug(f" > Node {frontier_node} is stopped") + elif frontier_node in self.speculated: + logging.debug(f" > Node {frontier_node} is speculated") + elif frontier_node in self.workset: + logging.debug(f" > Node {frontier_node} is in the workset") + elif self.is_loop_node(frontier_node): + logging.debug(f" > Node {frontier_node} is a loop node") + elif frontier_node in self.waiting_for_frontend: + logging.debug(f" > Node {frontier_node} is waiting for frontend") + logging.debug(f" > Not commiting node {frontier_node}, keeping in frontier") ## Update the frontier to the new frontier self.frontier = new_frontier @@ -1324,7 +1342,7 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand ## Here we continue with the normal execution flow else: logging.debug(f"Node {node_id} has already received its latest env from runtime. 
Examining differences...") - self.resolve_most_recent_envs_and_continue_command_execution(node_id) + self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) # This needs to become more fine grained def exclude_insignificant_diffs(self, env_diff_dict): @@ -1352,7 +1370,7 @@ def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_i def maybe_resolve_most_recent_envs_and_continue_resolution(self, node_id: NodeId): if node_id in self.waiting_for_frontend: logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.") - self.resolve_most_recent_envs_and_continue_command_execution(node_id) + self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) def resolve_most_recent_envs_and_continue_command_execution(self, new_env_node: NodeId): log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution_FrontendWaitReceived", new_env_node, key=f"PostExecResolution-{new_env_node}", invalidate=False) @@ -1393,9 +1411,23 @@ def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node logging.debug(f"Assigning node {node_id} new env (Wait) as the new latest env and re-executing.") # If there are significant differences, set the new env as the latest (the one to run Riker with) self.set_latest_env_file_for_node(node_id, self.get_new_env_file_for_node(node_id)) + logging.critical(f">>>>>>>>>>>>>>>>{node_id} - {self.get_new_env_file_for_node(node_id)}") # Add the node to the workset again if node_id not in self.workset: self.workset.append(node_id) + # Kill and restart all currently executing commands + self.__kill_all_currently_executing_and_schedule_restart([node_id]) + logging.critical(f">>>>>>>>>>>>>>>>{node_id} - {self.get_new_env_file_for_node(node_id)}") + for waiting_for_frontend_node in self.waiting_for_frontend: + if waiting_for_frontend_node not in self.workset: + self.workset.append(waiting_for_frontend_node) + 
self.set_latest_env_file_for_node(waiting_for_frontend_node, self.get_new_env_file_for_node(node_id)) + assert(self.get_new_env_file_for_node(node_id) is not None) + assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None) + self.log_partial_program_order_info() + logging.debug("-") + self.waiting_for_frontend = set() + self.populate_to_be_resolved_dict() else: logging.debug(f"Finding sets of commands that can be resolved after {node_id} finished executing and got its latest env") assert(node_id not in self.stopped) diff --git a/parallel-orch/util.py b/parallel-orch/util.py index 7f90368e..8458d15e 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -150,15 +150,21 @@ def invalidate_named_timestamp(action: str, node=None, key=None): del config.named_timestamps[key] def log_time_delta_from_start_and_set_named_timestamp(module: str, action: str, node=None, key=None): - set_named_timestamp(action, node, key) - logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}") + try: + set_named_timestamp(action, node, key) + logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}") + except KeyError: + logging.error(f"Named timestamp {key} already exists") def log_time_delta_from_named_timestamp(module: str, action: str, node=None, key=None, invalidate=True): - if key is None: - key = f"{action}{',' + str(node) if node is not None else ''}" - logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}|Step time:{to_milliseconds_str(time.time() - config.named_timestamps[key])}") - if invalidate: - invalidate_named_timestamp(action, node, key) + try: + if key is None: + key = f"{action}{',' + str(node) if node is not None else ''}" + 
logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}|Step time:{to_milliseconds_str(time.time() - config.named_timestamps[key])}") + if invalidate: + invalidate_named_timestamp(action, node, key) + except KeyError: + logging.error(f"Named timestamp {key} does not exist") def to_milliseconds_str(seconds: float) -> str: return f"{seconds * 1000:.3f}ms" \ No newline at end of file From aafd376746372aebcec676f989e269e1485437f5 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 12 Sep 2023 04:03:59 -0600 Subject: [PATCH 46/90] Add scheduler optimization args --- parallel-orch/scheduler_server.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 136ef30e..724a2b85 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -1,10 +1,8 @@ import argparse -import copy import logging import signal from util import * import config -import sys from partial_program_order import parse_partial_program_order_from_file, LoopStack, NodeId, parse_node_id ## @@ -28,6 +26,15 @@ def parse_args(): type=str, default=None, help="Set logging output file. 
Default: stdout") + parser.add_argument("--sandbox-killing-on-commit", + action="store_true", + default=False, + help="Kill any running overlay instances before commiting to the lower layer") + parser.add_argument("--env-check-all-nodes-on-wait", + action="store_true", + default=None, + help="When receiving a wait check for env changes between the current node and all other waiting nodes, instead of only examining the current wait node.") + args, unknown_args = parser.parse_known_args() return args @@ -294,7 +301,10 @@ def main(): logging.getLogger().setLevel(logging.DEBUG) # elif args.debug_level >= 3: # logging.getLogger().setLevel(logging.TRACE) - + + # Set optimization options + config.sandbox_killing = args.sandbox_killing_on_commit + config.all_node_env_resolution = args.env_check_all_nodes_on_wait scheduler = Scheduler(config.SCHEDULER_SOCKET) scheduler.run() From 60dc74fa8f309426472342c3d029968abfdb1dd7 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 12 Sep 2023 05:27:18 -0600 Subject: [PATCH 47/90] Optional sandbox killing based on arg --- parallel-orch/config.py | 3 +++ parallel-orch/partial_program_order.py | 27 +++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/parallel-orch/config.py b/parallel-orch/config.py index 23707993..be4259fd 100644 --- a/parallel-orch/config.py +++ b/parallel-orch/config.py @@ -46,3 +46,6 @@ def log_root(msg, *args, **kwargs): START_TIME = time.time() named_timestamps = {} + +sandbox_killing = False +all_node_env_resolution = False diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index ce95e201..d53aaa38 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -282,7 +282,7 @@ def __init__(self, nodes, edges, initial_env_file): self.latest_envs = {} self.initial_env_file = initial_env_file self.waiting_for_frontend = set() - + def __str__(self): return f"NODES: {len(self.nodes.keys())} | ADJACENCY: 
{self.adjacency}" @@ -357,6 +357,17 @@ def set_latest_env_file_for_node(self, node_id: NodeId, latest_env_file: str): def get_latest_env_file_for_node(self, node_id: NodeId) -> str: return self.latest_envs.get(node_id) + + def get_most_recent_possible_new_env_for_node(self, node_id) -> str: + most_recent_env_node = node_id + while self.get_new_env_file_for_node(most_recent_env_node) is None: + predecessor = self.get_prev(most_recent_env_node) + logging.critical(predecessor) + if len(predecessor) == 0: + return None + else: + most_recent_env_node = predecessor[0] + return self.get_new_env_file_for_node(most_recent_env_node) ## This returns all previous nodes of a sub partial order def get_sub_po_prev_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": @@ -629,18 +640,19 @@ def cmd_can_be_resolved(self, node_id: int) -> bool: logging.debug(f' >> Able to resolve {node_id}') return True - def __kill_all_currently_executing_and_schedule_restart(self, node_ids: "list[NodeId]"): + def __kill_all_currently_executing_and_schedule_restart(self, node_ids: "list[NodeId]", start=None): nodes_to_kill = self.get_currently_executing() - latest_env = self.get_latest_env_file_for_node(max(node_ids)) + if start is not None: + nodes_to_kill = [node_id for node_id in nodes_to_kill if node_id in self.get_transitive_closure([start])] for cmd_id in nodes_to_kill: self.__kill_node(cmd_id) - self.set_latest_env_file_for_node(cmd_id, latest_env) + most_recent_new_env = self.get_most_recent_possible_new_env_for_node(cmd_id) + if most_recent_new_env is not None: + self.set_latest_env_file_for_node(cmd_id, most_recent_new_env) self.workset.remove(cmd_id) # Our new workset is the nodes that were killed # Previous workset got killed self.workset.extend(nodes_to_kill) - - def __kill_node(self, cmd_id: "NodeId"): logging.debug(f'Killing and restarting node {cmd_id} because some workspaces have to be committed') @@ -675,7 +687,8 @@ def 
resolve_commands_that_can_be_resolved_and_push_frontier(self): else: logging.debug(f" > Nodes to be committed this round: {to_commit}") logging.trace(f"Commit|"+",".join(str(node_id) for node_id in to_commit)) - # self.__kill_all_currently_executing_and_schedule_restart(to_commit) + if config.sandbox_killing: + self.__kill_all_currently_executing_and_schedule_restart(to_commit) log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling") self.commit_cmd_workspaces(to_commit) From 63b2acbf254ce836dcea96482c2a03b01b00a28a Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 12 Sep 2023 10:02:48 -0600 Subject: [PATCH 48/90] Update logs while killing --- parallel-orch/partial_program_order.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index d53aaa38..c44d8466 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -650,8 +650,10 @@ def __kill_all_currently_executing_and_schedule_restart(self, node_ids: "list[No if most_recent_new_env is not None: self.set_latest_env_file_for_node(cmd_id, most_recent_new_env) self.workset.remove(cmd_id) + log_time_delta_from_named_timestamp("PartialOrder", "RunNode", cmd_id) + log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", cmd_id, key=f"PostExecResolution-{cmd_id}") # Our new workset is the nodes that were killed - # Previous workset got killed + # Previous workset got killed self.workset.extend(nodes_to_kill) def __kill_node(self, cmd_id: "NodeId"): @@ -688,7 +690,8 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self): logging.debug(f" > Nodes to be committed this round: {to_commit}") logging.trace(f"Commit|"+",".join(str(node_id) for node_id in to_commit)) if config.sandbox_killing: - self.__kill_all_currently_executing_and_schedule_restart(to_commit) + # 
self.__kill_all_currently_executing_and_schedule_restart(to_commit) + pass log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling") self.commit_cmd_workspaces(to_commit) @@ -1355,7 +1358,7 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand ## Here we continue with the normal execution flow else: logging.debug(f"Node {node_id} has already received its latest env from runtime. Examining differences...") - self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) + self.call_resolve_most_recent_envs_and_continue_command_execution(node_id) # This needs to become more fine grained def exclude_insignificant_diffs(self, env_diff_dict): @@ -1383,7 +1386,13 @@ def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_i def maybe_resolve_most_recent_envs_and_continue_resolution(self, node_id: NodeId): if node_id in self.waiting_for_frontend: logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.") - self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) + self.call_resolve_most_recent_envs_and_continue_command_execution(node_id) + + def call_resolve_most_recent_envs_and_continue_command_execution(self, new_env_node: NodeId): + if config.all_node_env_resolution: + self.resolve_most_recent_envs_and_continue_command_execution(new_env_node) + else: + self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(new_env_node) def resolve_most_recent_envs_and_continue_command_execution(self, new_env_node: NodeId): log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution_FrontendWaitReceived", new_env_node, key=f"PostExecResolution-{new_env_node}", invalidate=False) @@ -1424,17 +1433,19 @@ def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node logging.debug(f"Assigning node {node_id} new env (Wait) as the new latest env and re-executing.") # If there are 
significant differences, set the new env as the latest (the one to run Riker with) self.set_latest_env_file_for_node(node_id, self.get_new_env_file_for_node(node_id)) - logging.critical(f">>>>>>>>>>>>>>>>{node_id} - {self.get_new_env_file_for_node(node_id)}") # Add the node to the workset again if node_id not in self.workset: self.workset.append(node_id) # Kill and restart all currently executing commands + # The envs are updated inside __kill_all_currently_executing_and_schedule_restart self.__kill_all_currently_executing_and_schedule_restart([node_id]) logging.critical(f">>>>>>>>>>>>>>>>{node_id} - {self.get_new_env_file_for_node(node_id)}") + # For all other nodes not killed, we update the latest env and restart them for waiting_for_frontend_node in self.waiting_for_frontend: if waiting_for_frontend_node not in self.workset: self.workset.append(waiting_for_frontend_node) - self.set_latest_env_file_for_node(waiting_for_frontend_node, self.get_new_env_file_for_node(node_id)) + most_recent_new_env = self.get_most_recent_possible_new_env_for_node(waiting_for_frontend_node) + self.set_latest_env_file_for_node(waiting_for_frontend_node, most_recent_new_env) assert(self.get_new_env_file_for_node(node_id) is not None) assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None) self.log_partial_program_order_info() From d4dc63c5ef449444771293bdb269fb79ac7646d5 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Wed, 13 Sep 2023 05:10:15 -0600 Subject: [PATCH 49/90] Add early rerun first (buggy) implementation --- parallel-orch/partial_program_order.py | 69 ++++++++++++++++++++++---- parallel-orch/scheduler_server.py | 4 +- 2 files changed, 61 insertions(+), 12 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index c44d8466..c1a58472 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -9,6 +9,7 @@ import trace from util import * import util +from 
collections import defaultdict from shasta.ast_node import AstNode, CommandNode, PipeNode @@ -282,6 +283,8 @@ def __init__(self, nodes, edges, initial_env_file): self.latest_envs = {} self.initial_env_file = initial_env_file self.waiting_for_frontend = set() + self.run_after = defaultdict(set) + self.pending_to_execute = set() def __str__(self): return f"NODES: {len(self.nodes.keys())} | ADJACENCY: {self.adjacency}" @@ -671,6 +674,30 @@ def __kill_node(self, cmd_id: "NodeId"): # Terminate the main process util.kill_process(proc_to_kill.pid) + + def resolve_dependencies_early(self, node_id=None): + to_check = self.waiting_for_frontend.copy() + if node_id: + to_check.add(node_id) + node_id_has_dependency = False + for second_cmd_id in to_check: + ## reverse sort breaks because it does not guarantee that the new env has arrived + for first_cmd_id in sorted(self.to_be_resolved[second_cmd_id], reverse=True): + if self.rw_sets.get(first_cmd_id) is not None: + if self.has_forward_dependency(first_cmd_id, second_cmd_id): + # if second_cmd_id not in self.workset and self.check_if_to_be_resolved_entry_would_change(second_cmd_id): + node_id_has_dependency = True + self.waiting_for_frontend.discard(second_cmd_id) + self.run_after[first_cmd_id].add(second_cmd_id) + self.pending_to_execute.add(second_cmd_id) + # self.workset.append(second_cmd_id) + logging.debug(f"Early resolution: Rerunning node {second_cmd_id} after {first_cmd_id} because of a dependency") + log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", second_cmd_id) + break + logging.critical("HERE") + # if node_id_has_dependency == True: + self.populate_to_be_resolved_dict() + return node_id_has_dependency def resolve_commands_that_can_be_resolved_and_push_frontier(self): @@ -1305,7 +1332,25 @@ def execute_cmd_core(self, node_id: NodeId, speculate=False): proc, trace_file, stdout, stderr, post_execution_env_file = execute_func(cmd, node_id, env_file_to_execute_with) 
self.commands_currently_executing[node_id] = (proc, trace_file, stdout, stderr, post_execution_env_file) logging.debug(f" >>>>> Command {node_id} - {proc.pid} just started executing") - + + # This method attempts to add to workset (rerun) + # any command that found to have a dependency through early resolution + def attempt_rerun_pending_nodes(self): + restarted_nodes = set() + for node_id, run_after_nodes in self.run_after.items(): + new_run_after_nodes = run_after_nodes.copy() + if self.get_new_env_file_for_node(node_id) is not None and node_id not in self.pending_to_execute: + for node in run_after_nodes: + if node not in self.get_currently_executing(): + logging.debug(f"Running node {node} after execution of {node_id}") + self.workset.append(node) + self.pending_to_execute.discard(node) + self.set_latest_env_file_for_node(node, self.get_new_env_file_for_node(node_id)) + restarted_nodes.add(node) + new_run_after_nodes.discard(node) + self.run_after[node_id] = new_run_after_nodes + return restarted_nodes + def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sandbox_dir: str): log_time_delta_from_named_timestamp("PartialOrder", "RunNode", node_id) log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolution", node_id, key=f"PostExecResolution-{node_id}") @@ -1347,8 +1392,16 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand logging.debug("No resolvable nodes were found in this round, nothing will change...") return + self.resolve_dependencies_early(node_id) + + restarted_cmds = self.attempt_rerun_pending_nodes() + logging.critical(f"Restarted__{(node_id, restarted_cmds, self.pending_to_execute, self.run_after)}") log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolutionECCheck", node_id, key=f"PostExecResolution-{node_id}", invalidate=False) # Remove from workset and add it again later if necessary + + # assert node_id not in restarted_cmds + # if len(restarted_cmds) > 
0: + self.workset.remove(node_id) log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolutionFrontendWait", node_id) ## Here we check if the most recent env has been received. If not, we cannot resolve anything just yet. @@ -1358,7 +1411,7 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand ## Here we continue with the normal execution flow else: logging.debug(f"Node {node_id} has already received its latest env from runtime. Examining differences...") - self.call_resolve_most_recent_envs_and_continue_command_execution(node_id) + self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) # This needs to become more fine grained def exclude_insignificant_diffs(self, env_diff_dict): @@ -1385,15 +1438,9 @@ def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_i def maybe_resolve_most_recent_envs_and_continue_resolution(self, node_id: NodeId): if node_id in self.waiting_for_frontend: - logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.") - self.call_resolve_most_recent_envs_and_continue_command_execution(node_id) - - def call_resolve_most_recent_envs_and_continue_command_execution(self, new_env_node: NodeId): - if config.all_node_env_resolution: - self.resolve_most_recent_envs_and_continue_command_execution(new_env_node) - else: - self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(new_env_node) - + logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.") + self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) + def resolve_most_recent_envs_and_continue_command_execution(self, new_env_node: NodeId): log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution_FrontendWaitReceived", new_env_node, key=f"PostExecResolution-{new_env_node}", invalidate=False) to_check = list(self.waiting_for_frontend) + 
[new_env_node] diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 724a2b85..de6054d9 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -103,8 +103,10 @@ def handle_wait(self, input_cmd: str, connection): ## Set the new env file for the node self.partial_program_order.set_new_env_file_for_node(node_id, pash_runtime_vars_file_str) + logging.critical(f"HERE - {node_id} - {self.partial_program_order.get_new_env_file_for_node(node_id)}") + ## Attempt to rerun all pending nodes + self.partial_program_order.attempt_rerun_pending_nodes() - ## Attempt to resolve environment differences on waiting partial order nodes self.partial_program_order.maybe_resolve_most_recent_envs_and_continue_resolution(node_id) From 11657b93b0b77e1208ca132d58b0dbb7de59b93d Mon Sep 17 00:00:00 2001 From: gliargovas Date: Wed, 13 Sep 2023 14:54:05 -0600 Subject: [PATCH 50/90] Fix early resolution corectedness issues --- parallel-orch/partial_program_order.py | 76 +++++++++++--------------- 1 file changed, 33 insertions(+), 43 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index c1a58472..be3012db 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -285,6 +285,7 @@ def __init__(self, nodes, edges, initial_env_file): self.waiting_for_frontend = set() self.run_after = defaultdict(set) self.pending_to_execute = set() + self.to_be_resolved_prev = {} def __str__(self): return f"NODES: {len(self.nodes.keys())} | ADJACENCY: {self.adjacency}" @@ -676,7 +677,7 @@ def __kill_node(self, cmd_id: "NodeId"): util.kill_process(proc_to_kill.pid) def resolve_dependencies_early(self, node_id=None): - to_check = self.waiting_for_frontend.copy() + to_check = {node for node in self.waiting_for_frontend if node not in self.speculated} if node_id: to_check.add(node_id) node_id_has_dependency = False @@ -694,10 +695,20 @@ def 
resolve_dependencies_early(self, node_id=None): logging.debug(f"Early resolution: Rerunning node {second_cmd_id} after {first_cmd_id} because of a dependency") log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", second_cmd_id) break - logging.critical("HERE") # if node_id_has_dependency == True: self.populate_to_be_resolved_dict() - return node_id_has_dependency + for node in self.pending_to_execute: + prev_to_be_resoved = self.to_be_resolved_prev.get(node) + if prev_to_be_resoved is None: + return + elif set(self.to_be_resolved[node]) == set(prev_to_be_resoved): + # Not caring about this dependency because env has not yet changed + logging.debug() + self.pending_to_execute.remove(node) + for k, v in self.run_after.items(): + if node in v: + self.run_after[k].remove(node) + return def resolve_commands_that_can_be_resolved_and_push_frontier(self): @@ -1292,6 +1303,7 @@ def run_cmd_non_blocking(self, node_id: NodeId): ## A command should only be run if it's in the frontier, otherwise it should be spec run logging.debug(f'Running command: {node_id} {self.get_node(node_id)}') logging.debug(f"ExecutingAdd|{node_id}") + self.to_be_resolved_prev[node_id] = self.to_be_resolved[node_id].copy() self.execute_cmd_core(node_id, speculate=False) ## Run a command and add it to the dictionary of executing ones @@ -1339,7 +1351,7 @@ def attempt_rerun_pending_nodes(self): restarted_nodes = set() for node_id, run_after_nodes in self.run_after.items(): new_run_after_nodes = run_after_nodes.copy() - if self.get_new_env_file_for_node(node_id) is not None and node_id not in self.pending_to_execute: + if self.get_new_env_file_for_node(node_id) is not None and node_id not in self.pending_to_execute and node_id not in self.get_currently_executing(): for node in run_after_nodes: if node not in self.get_currently_executing(): logging.debug(f"Running node {node} after execution of {node_id}") @@ -1392,22 +1404,22 @@ def command_execution_completed(self, node_id: NodeId, 
riker_exit_code:int, sand logging.debug("No resolvable nodes were found in this round, nothing will change...") return - self.resolve_dependencies_early(node_id) - restarted_cmds = self.attempt_rerun_pending_nodes() - logging.critical(f"Restarted__{(node_id, restarted_cmds, self.pending_to_execute, self.run_after)}") log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolutionECCheck", node_id, key=f"PostExecResolution-{node_id}", invalidate=False) # Remove from workset and add it again later if necessary - - # assert node_id not in restarted_cmds - # if len(restarted_cmds) > 0: - self.workset.remove(node_id) log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolutionFrontendWait", node_id) + ## Here we check if the most recent env has been received. If not, we cannot resolve anything just yet. if self.get_new_env_file_for_node(node_id) is None: logging.debug(f"Node {node_id} has not received its latest env from runtime yet. Waiting...") self.waiting_for_frontend.add(node_id) + + # We will however attempt to resolve dependencies early + self.resolve_dependencies_early(node_id) + restarted_cmds = self.attempt_rerun_pending_nodes() + logging.critical(f"Restarted {restarted_cmds}") + self.log_partial_program_order_info() ## Here we continue with the normal execution flow else: logging.debug(f"Node {node_id} has already received its latest env from runtime. 
Examining differences...") @@ -1440,37 +1452,8 @@ def maybe_resolve_most_recent_envs_and_continue_resolution(self, node_id: NodeId if node_id in self.waiting_for_frontend: logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.") self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) - - def resolve_most_recent_envs_and_continue_command_execution(self, new_env_node: NodeId): - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution_FrontendWaitReceived", new_env_node, key=f"PostExecResolution-{new_env_node}", invalidate=False) - to_check = list(self.waiting_for_frontend) + [new_env_node] - logging.debug(f"Node {new_env_node} received its latest env from runtime. Comparing env with itself and other waiting nodes.") - # Node is no longer waiting to be resolved. It might have not been waiting at all. - self.waiting_for_frontend.discard(new_env_node) - for node_id in to_check: - if self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(new_env_node), - self.get_latest_env_file_for_node(node_id)): - logging.debug(f"Significant differences found between new and latest env files for {node_id}.") - logging.debug(f"Assigning node {new_env_node} new env (Wait) as the new latest env of node {node_id} and re-executing.") - # If there are significant differences, set the new env as the latest (the one to run Riker with) - self.set_latest_env_file_for_node(node_id, self.get_new_env_file_for_node(new_env_node)) - # Add the node to the workset again - assert node_id not in self.workset - self.workset.append(node_id) - self.waiting_for_frontend.discard(node_id) - elif node_id == new_env_node: - logging.debug(f"Finding sets of commands that can be resolved after {node_id} finished executing and got its latest env.") - assert(node_id not in self.stopped) - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "WaitingToResolve", node_id) - 
self.add_to_speculated(node_id) - ## We can now call the general resolution method that determines which commands - ## can be resolved (all their dependencies are done executing), and resolves them. - self.resolve_commands_that_can_be_resolved_and_push_frontier() - assert(self.valid()) - else: - logging.debug(f"Node {node_id} has no significant differences with the new env, but has not yet received its wait. Nothing to do for now.") - - def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(self, node_id: NodeId): + + def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(self, node_id: NodeId, restarted_cmds=None): logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.") # Node is no longer waiting to be resolved. It might have not been waiting at all. self.waiting_for_frontend.discard(node_id) @@ -1499,13 +1482,19 @@ def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node logging.debug("-") self.waiting_for_frontend = set() self.populate_to_be_resolved_dict() - else: + else: logging.debug(f"Finding sets of commands that can be resolved after {node_id} finished executing and got its latest env") assert(node_id not in self.stopped) log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "WaitingToResolve", node_id) self.add_to_speculated(node_id) ## We can now call the general resolution method that determines which commands ## can be resolved (all their dependencies are done executing), and resolves them. 
+ + # We will however attempt to resolve dependencies early for the remaining nodes + self.resolve_dependencies_early(node_id) + restarted_cmds = self.attempt_rerun_pending_nodes() + logging.critical(f"Restarted after successfull env resolution {restarted_cmds}") + self.log_partial_program_order_info() self.resolve_commands_that_can_be_resolved_and_push_frontier() assert(self.valid()) @@ -1558,6 +1547,7 @@ def log_partial_program_order_info(self): logging.debug(f"WAITING: {sorted(list(self.speculated))}") logging.debug(f"for FRONTEND: {sorted(list(self.waiting_for_frontend))}") logging.debug(f"TO RESOLVE: {self.to_be_resolved}") + logging.debug(f"PENDING TO EXEC: {self.pending_to_execute}") self.log_rw_sets() logging.debug(f"=" * 80) From fb4c863f6ce2d357d32dd8eacb1c0e4fca0456e8 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Wed, 13 Sep 2023 15:55:21 -0600 Subject: [PATCH 51/90] Save orch logs while running benchmarks --- report/benchmark_report.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/report/benchmark_report.py b/report/benchmark_report.py index cb0b2438..7cce2b7b 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -21,6 +21,10 @@ REPORT_OUTPUT_DIR = os.path.join(os.environ['WORKING_DIR'], 'report_output') +def save_log_data(log_data, output_dir, filename): + with open(os.path.join(output_dir, filename), 'w') as f: + f.write(log_data) + def parse_logs_into_activities(log_data): info_lines = [line.replace("INFO:root:>|", "").split("|") for line in log_data.split("\n") if line.startswith("INFO:root:>|")] @@ -179,6 +183,7 @@ def main(): bash_time, bash_output, _bash_error = run_command(bash_cmd_str, working_dir) orch_cmd_str = replace_with_env_var(benchmark['command']).split(" ") orch_time, orch_output, orch_error = run_command_with_orch(orch_cmd_str, benchmark['orch_args'], working_dir) + save_log_data(orch_error, REPORT_OUTPUT_DIR, f"{benchmark['name']}_log.log") bash_times.append(bash_time) orch_times.append(orch_time) 
diff_lines = compare_results(bash_output, orch_output) From ac0720ed7712b3dfad3e7f32dd00936bba776a6d Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 14 Sep 2023 02:36:02 -0600 Subject: [PATCH 52/90] Add options to save results --- report/benchmark_report.py | 82 +++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 24 deletions(-) diff --git a/report/benchmark_report.py b/report/benchmark_report.py index 7cce2b7b..0d12ac0e 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -5,6 +5,9 @@ from benchmark_plots import * import logging import difflib +import argparse +import csv + # Setting and exporting environment variables (same as tests for now). @@ -21,7 +24,17 @@ REPORT_OUTPUT_DIR = os.path.join(os.environ['WORKING_DIR'], 'report_output') +def parse_args(): + parser = argparse.ArgumentParser(description="Benchmark and report interface for a system.") + parser.add_argument('--no-plots', action='store_true', help="Do not print plots.") + parser.add_argument('--no-logs', action='store_true', help="Do not save log files.") + parser.add_argument('--csv-output', action='store_true', help="Save results in CSV format.") + return parser.parse_args() + + def save_log_data(log_data, output_dir, filename): + if args.no_logs: + return with open(os.path.join(output_dir, filename), 'w') as f: f.write(log_data) @@ -92,25 +105,21 @@ def print_results(benchmark_name, bash_time, orch_time, diff_lines, diff_percent print(f"Bash Execution Time: {round(bash_time, 3)}s") print(f"hs Execution Time: {round(orch_time, 3)}s") print(f"Valid: {'Yes' if len(diff_lines) == 0 else 'No - see below'}") - for line in diff_lines: - print(line) + if len(diff_lines) > 0: + for line in diff_lines: + print(line) + print("-" * 40) print(comparison_result) - print("-" * 40) print() - -def print_sorted_logs(orch_output): - relevant_lines = [line for line in orch_output.split("\n") if line.startswith("INFO:root:>|")] - # Extract lines with step time and sort - 
step_time_lines = [(line, float(line.split("Step time:")[1].split("ms")[0])) for line in relevant_lines if "Step time:" in line] - sorted_step_time_lines = sorted(step_time_lines, key=lambda x: x[1], reverse=True) - - for entry in sorted_step_time_lines: - split_line = entry[0].split("|")[1:] - pretty_line = " | ".join(split_line) - print(f"{pretty_line}, Step Time: {entry[1]:.3f}ms") - + if args.csv_output: + csv_filename = os.path.join(REPORT_OUTPUT_DIR, f"results.csv") + with open(csv_filename, 'a') as csv_file: + writer = csv.writer(csv_file) + valid = 'Yes' if len(diff_lines) == 0 else 'No' + writer.writerow([benchmark_name, bash_time, orch_time, valid, comparison_result]) + -def print_exec_time_for_cmds(orch_outpt): +def print_exec_time_for_cmds(orch_outpt, benchmark_name): # Split the log into lines and filter the relevant ones relevant_lines = [line.replace("INFO:root:>|PartialOrder|RunNode,", "") for line in orch_outpt.split("\n") if line.startswith("INFO:root:>|PartialOrder|RunNode,") and "Step time:" in line] # Extract lines with RunNode commands and their step times @@ -145,6 +154,17 @@ def print_exec_time_for_cmds(orch_outpt): for node, time_lost in sorted(time_lost_per_node.items(), key=lambda x: x[1], reverse=True): print(f"{node:2d}: {node_times[node]:.3f}ms ({counts[node]} times) | Avg: {sum(node_distinct_times[node])/len(node_distinct_times[node]):.3f}ms | {node_distinct_times[node]} | Time lost: {time_lost:.3f}ms") print("-" * 40) + print(f"Total time lost: {sum(time_lost_per_node.values()):.02f}ms") + print("=" * 100) + + if args.csv_output: + csv_filename = os.path.join(REPORT_OUTPUT_DIR, f"{benchmark_name}_execution_times.csv") + with open(csv_filename, 'w') as csv_file: + writer = csv.writer(csv_file) + writer.writerow(["Node", "Time (ms)", "Execution Count", "Average Time (ms)", "Distinct Times", "Time Lost (ms)"]) + for node, time_lost in sorted(time_lost_per_node.items(), key=lambda x: x[1], reverse=True): + writer.writerow([node, 
node_times[node], counts[node], sum(node_distinct_times[node])/len(node_distinct_times[node]), node_distinct_times[node], time_lost]) + def export_env_vars(env_vars): @@ -155,6 +175,7 @@ def export_env_vars(env_vars): def main(): + # Load benchmark configurations with open(os.path.join(os.environ.get('WORKING_DIR'), 'benchmark_config.json'), 'r') as f: benchmarks_config = json.load(f) @@ -164,9 +185,15 @@ def main(): # Create output dir for reports os.makedirs(REPORT_OUTPUT_DIR, exist_ok=True) + + if args.csv_output: + csv_filename = os.path.join(REPORT_OUTPUT_DIR, f"results.csv") + with open(csv_filename, 'w') as csv_file: + writer = csv.writer(csv_file) + writer.writerow(["Benchmark", "Bash Execution Time", "hs Execution Time", "Valid", "Comparison"]) for benchmark in benchmarks_config: - + print("=" * 100) # Set up preferred environment export_env_vars(benchmark.get('env', {})) # Create resource dir if non-existent @@ -183,7 +210,6 @@ def main(): bash_time, bash_output, _bash_error = run_command(bash_cmd_str, working_dir) orch_cmd_str = replace_with_env_var(benchmark['command']).split(" ") orch_time, orch_output, orch_error = run_command_with_orch(orch_cmd_str, benchmark['orch_args'], working_dir) - save_log_data(orch_error, REPORT_OUTPUT_DIR, f"{benchmark['name']}_log.log") bash_times.append(bash_time) orch_times.append(orch_time) diff_lines = compare_results(bash_output, orch_output) @@ -192,19 +218,27 @@ def main(): print_results(benchmark['name'], bash_time, orch_time, diff_lines, diff_percentage) activities = parse_logs_into_activities(log_data=orch_error) - plot_gantt(activities, REPORT_OUTPUT_DIR, f"{benchmark['name']}_gantt") + if not args.no_plots: + plot_gantt(activities, REPORT_OUTPUT_DIR, f"{benchmark['name']}_gantt", simple=True) + + print_exec_time_for_cmds(orch_error, benchmark['name']) + + # Instead of always saving the logs, check the argument: + if not args.no_logs: + save_log_data(orch_error, REPORT_OUTPUT_DIR, f"{benchmark['name']}_log.log") 
- print_exec_time_for_cmds(orch_error) # Plot the results benchmark_names = [benchmark['name'] for benchmark in benchmarks_config] print(f"Execution graphs can be found in {REPORT_OUTPUT_DIR}") - # print_sorted_logs(orch_error) - plot_benchmark_times_combined(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_combined") - plot_benchmark_times_individual(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_individual") - + + if not args.no_plots: + plot_benchmark_times_combined(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_combined") + plot_benchmark_times_individual(benchmark_names, bash_times, orch_times, REPORT_OUTPUT_DIR, "benchmark_times_individual") + if __name__ == "__main__": + args = parse_args() main() From 2a7e3e42376955c569d4a0d377eeb547ad480488 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 14 Sep 2023 03:21:41 -0600 Subject: [PATCH 53/90] Add sandbox kiling arg again --- parallel-orch/partial_program_order.py | 3 +-- parallel-orch/scheduler_server.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index be3012db..3970922e 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -728,8 +728,7 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self): logging.debug(f" > Nodes to be committed this round: {to_commit}") logging.trace(f"Commit|"+",".join(str(node_id) for node_id in to_commit)) if config.sandbox_killing: - # self.__kill_all_currently_executing_and_schedule_restart(to_commit) - pass + self.__kill_all_currently_executing_and_schedule_restart(to_commit) log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling") self.commit_cmd_workspaces(to_commit) diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index de6054d9..0893f0e8 100644 --- 
a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -26,7 +26,7 @@ def parse_args(): type=str, default=None, help="Set logging output file. Default: stdout") - parser.add_argument("--sandbox-killing-on-commit", + parser.add_argument("--sandbox-killing", action="store_true", default=False, help="Kill any running overlay instances before commiting to the lower layer") @@ -305,7 +305,7 @@ def main(): # logging.getLogger().setLevel(logging.TRACE) # Set optimization options - config.sandbox_killing = args.sandbox_killing_on_commit + config.sandbox_killing = args.sandbox_killing config.all_node_env_resolution = args.env_check_all_nodes_on_wait scheduler = Scheduler(config.SCHEDULER_SOCKET) scheduler.run() From 00198bd76e0945fc72e3fb46e4e7bf24d2b0af25 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 14 Sep 2023 04:00:06 -0600 Subject: [PATCH 54/90] Add correct argument splitting --- report/benchmark_report.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/report/benchmark_report.py b/report/benchmark_report.py index 0d12ac0e..586c20a6 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -79,9 +79,11 @@ def run_command(command, working_dir=os.getcwd()): return (end_time - start_time, stdout.decode('utf-8'), stderr.decode('utf-8')) def run_command_with_orch(command, orch_args, working_dir=os.getcwd()): - print("Running (and timing) command with orch: ", " ".join(command)) + orch_args = orch_args.split(" ") + print("Running (and timing) command with orch: ", " ".join([ORCH_COMMAND] + orch_args + command)) + print([ORCH_COMMAND] + orch_args + command) start_time = time.time() - process = subprocess.Popen([ORCH_COMMAND, orch_args] + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=working_dir, env=os.environ) + process = subprocess.Popen([ORCH_COMMAND] + orch_args + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=working_dir, env=os.environ) stdout, stderr = 
process.communicate() end_time = time.time() return (end_time - start_time, stdout.decode('utf-8'), stderr.decode('utf-8')) From 42c5a50e6219dc3698a7f512f8cbe42d01fb33b6 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 15 Sep 2023 11:32:24 -0600 Subject: [PATCH 55/90] Rewrite proc killing commands using psutil --- parallel-orch/util.py | 79 ++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 42 deletions(-) diff --git a/parallel-orch/util.py b/parallel-orch/util.py index 8458d15e..071ad3fd 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -5,8 +5,9 @@ import subprocess import tempfile import time -import difflib import re +import psutil +import signal def ptempfile(): fd, name = tempfile.mkstemp(dir=config.PASH_SPEC_TMP_PREFIX) @@ -58,46 +59,6 @@ def socket_respond(connection: socket.socket, message: str): connection.sendall(bytes_message) connection.close() -# Check if the process with the given PID is alive. -def is_process_alive(pid) -> bool: - try: - os.kill(pid, 0) - except OSError: - return False - else: - return True - -# Get all child process PIDs of a process -def get_child_processes(parent_pid) -> int: - try: - output = subprocess.check_output(['pgrep', '-P', str(parent_pid)]) - return [int(pid) for pid in output.decode('utf-8').split()] - except subprocess.CalledProcessError: - # No child processes were found - return [] - -# Note: Check this function as it does not seem the right way to kill a proc. -# SIGKILL should be sent once and for all. -# Kills the process with the provided PID. -# Returns True if the process was successfully killed, False otherwise. 
-def kill_process(pid: int) -> bool: - kill_attempts = 0 - while is_process_alive(pid) and kill_attempts < config.MAX_KILL_ATTEMPTS: - try: - # Send SIGKILL signal for a forceful kill - subprocess.check_call(['kill', '-9', str(pid)]) - time.sleep(0.005) # Sleep for 5 milliseconds before checking again - except subprocess.CalledProcessError: - logging.debug(f"Failed to kill PID {pid}.") - kill_attempts += 1 - - if kill_attempts >= config.MAX_KILL_ATTEMPTS: - logging.warning(f"Gave up killing PID {pid} after {config.MAX_KILL_ATTEMPTS} attempts.") - return False - - return True - - def parse_env_string_to_dict(content): # Parse scalar string vars scalar_vars_string = re.findall(r'declare (?:-x|--)? (\w+)="([^"]*)"', content, re.DOTALL) @@ -167,4 +128,38 @@ def log_time_delta_from_named_timestamp(module: str, action: str, node=None, key logging.error(f"Named timestamp {key} does not exist") def to_milliseconds_str(seconds: float) -> str: - return f"{seconds * 1000:.3f}ms" \ No newline at end of file + return f"{seconds * 1000:.3f}ms" + + + +def get_all_child_processes(pid): + try: + parent = psutil.Process(pid) + except psutil.NoSuchProcess: + return [] + + children = parent.children(recursive=True) + parent_of_parent = parent.parent() + logging.critical("PARENT_PROCESS: " + str(parent_of_parent)) + logging.critical("MAIN_PROCESS: " + str(parent)) + all_processes = [parent] + children + for process in all_processes: + logging.critical("PROCESS: " + str(process)) + return all_processes + + +def kill_process_tree(pid, sig=signal.SIGTERM): + processes = get_all_child_processes(pid) + for proc in processes: + try: + os.kill(proc.pid, sig) + except (psutil.NoSuchProcess): + pass + except (PermissionError): + logging.critical("NO PERMISSION") + + # Check if processes are still alive + time.sleep(0.01) + + alive_processes = [f"{proc}-({proc.status()})" for proc in processes if proc.is_running()] + return alive_processes From 6b3e67815fb9864700528192eb92851197c79b40 Mon Sep 17 
00:00:00 2001 From: gliargovas Date: Fri, 15 Sep 2023 11:32:58 -0600 Subject: [PATCH 56/90] Use most recent process killing methods --- parallel-orch/config.py | 2 -- parallel-orch/partial_program_order.py | 19 +++++++++---------- parallel-orch/scheduler_server.py | 1 - 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/parallel-orch/config.py b/parallel-orch/config.py index be4259fd..48761c76 100644 --- a/parallel-orch/config.py +++ b/parallel-orch/config.py @@ -35,8 +35,6 @@ def log_root(msg, *args, **kwargs): SCHEDULER_SOCKET = os.getenv('PASH_SPEC_SCHEDULER_SOCKET') -MAX_KILL_ATTEMPTS = 10 # Define a maximum number of kill attempts for each process in the partial program order - INSIGNIFICANT_VARS = {'PWD', 'OLDPWD', 'SHLVL', 'PASH_SPEC_TMP_PREFIX', 'PASH_SPEC_SCHEDULER_SOCKET', 'PASH_SPEC_TOP', 'PASH_TOP', 'PASH_TOP_LEVEL','RANDOM', 'LOGNAME', 'MACHTYPE', 'MOTD_SHOWN', 'OPTERR', 'OPTIND', 'PPID', 'PROMPT_COMMAND', 'PS4', 'SHELL', 'SHELLOPTS', 'SHLVL', 'TERM', 'UID', 'USER', 'XDG_SESSION_ID'} diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 3970922e..1c71fe32 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -366,7 +366,6 @@ def get_most_recent_possible_new_env_for_node(self, node_id) -> str: most_recent_env_node = node_id while self.get_new_env_file_for_node(most_recent_env_node) is None: predecessor = self.get_prev(most_recent_env_node) - logging.critical(predecessor) if len(predecessor) == 0: return None else: @@ -666,15 +665,14 @@ def __kill_node(self, cmd_id: "NodeId"): # Add the trace file to the banned file list so we know to ignore the CommandExecComplete response self.banned_files.add(trace_file) - # Get all child processes of proc_to_kill - children = util.get_child_processes(proc_to_kill.pid) - - # Kill all child processes - for child in children: - util.kill_process(child) - - # Terminate the main process - 
util.kill_process(proc_to_kill.pid) + alive_after_kill = util.kill_process_tree(proc_to_kill.pid) + + if alive_after_kill: + logging.critical("Processes still alive after attempting to kill:") + for proc in alive_after_kill: + logging.critical(proc) + else: + logging.critical("All processes were successfully terminated.") def resolve_dependencies_early(self, node_id=None): to_check = {node for node in self.waiting_for_frontend if node not in self.speculated} @@ -728,6 +726,7 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self): logging.debug(f" > Nodes to be committed this round: {to_commit}") logging.trace(f"Commit|"+",".join(str(node_id) for node_id in to_commit)) if config.sandbox_killing: + logging.info("Sandbox killing") self.__kill_all_currently_executing_and_schedule_restart(to_commit) log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling") self.commit_cmd_workspaces(to_commit) diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 0893f0e8..4267d946 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -103,7 +103,6 @@ def handle_wait(self, input_cmd: str, connection): ## Set the new env file for the node self.partial_program_order.set_new_env_file_for_node(node_id, pash_runtime_vars_file_str) - logging.critical(f"HERE - {node_id} - {self.partial_program_order.get_new_env_file_for_node(node_id)}") ## Attempt to rerun all pending nodes self.partial_program_order.attempt_rerun_pending_nodes() From 492c3adce4157b6576b6047e1cbabe2393b46955 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 15 Sep 2023 11:45:27 -0600 Subject: [PATCH 57/90] Fix typos in 7.sh --- report/benchmarks/dgsh/7.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) mode change 100644 => 100755 report/benchmarks/dgsh/7.sh diff --git a/report/benchmarks/dgsh/7.sh b/report/benchmarks/dgsh/7.sh old mode 100644 new mode 100755 index becbd891..a29a8e8d --- a/report/benchmarks/dgsh/7.sh 
+++ b/report/benchmarks/dgsh/7.sh @@ -22,10 +22,11 @@ file_hosts=$(mktemp) file_sorted_hosts=$(mktemp) file_unique_hosts=$(mktemp) file_domains=$(mktemp) -file_sorted_pages=$(mktemp) -file_access_times=$(mktemp) -file_access_dates=$(mktemp) - +file_requests=$(mktemp) +file_times=$(mktemp) +file_bytes=$(mktemp) +file_day_count=$(mktemp) +file_dates=$(mktemp) # This file will capture a large portion of the processed data to be reused in subsequent parts cat $INPUT_FILE > "$file_initial" From a996e51bfbe6f9ca9f777c4cac37465a94cff6d3 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Sat, 16 Sep 2023 01:26:05 -0600 Subject: [PATCH 58/90] Fix minor proc killing bug --- parallel-orch/util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parallel-orch/util.py b/parallel-orch/util.py index 071ad3fd..8d5e5101 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -157,6 +157,8 @@ def kill_process_tree(pid, sig=signal.SIGTERM): pass except (PermissionError): logging.critical("NO PERMISSION") + except (ProcessLookupError): + logging.critical("PROCESS LOOKUP ERROR") # Check if processes are still alive time.sleep(0.01) From 168c713523aa1d65c35a9ea4cf9b581e302230d5 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 19 Sep 2023 02:31:37 -0600 Subject: [PATCH 59/90] Remove redundant pre-commit resolution checks --- parallel-orch/partial_program_order.py | 52 ++++++++------------------ 1 file changed, 15 insertions(+), 37 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 1c71fe32..307ef0f8 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -672,7 +672,7 @@ def __kill_node(self, cmd_id: "NodeId"): for proc in alive_after_kill: logging.critical(proc) else: - logging.critical("All processes were successfully terminated.") + logging.debug("All processes were successfully terminated.") def resolve_dependencies_early(self, node_id=None): to_check = {node for 
node in self.waiting_for_frontend if node not in self.speculated} @@ -709,8 +709,13 @@ def resolve_dependencies_early(self, node_id=None): return def resolve_commands_that_can_be_resolved_and_push_frontier(self): - - cmds_to_resolve = self.__pop_cmds_to_resolve_from_speculated() + # This may be obsolete since we only resolve one node at a time + # cmds_to_resolve = self.__pop_cmds_to_resolve_from_speculated() + # assert len(cmds_to_resolve) <= 1 + if len(self.speculated) == 0: + cmds_to_resolve = [] + else: + cmds_to_resolve = [self.speculated.pop()] logging.debug(f"Commands to check for dependencies this round are: {sorted(cmds_to_resolve)}") logging.debug(f"Commands that cannot be resolved this round are: {sorted(self.speculated)}") ## Resolve dependencies for the commands that can actually be resolved @@ -730,34 +735,6 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self): self.__kill_all_currently_executing_and_schedule_restart(to_commit) log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling") self.commit_cmd_workspaces(to_commit) - - - def __pop_cmds_to_resolve_from_speculated(self): - cmd_ids_to_check = sorted(list(self.speculated)) - logging.debug(f" > Uncommitted commands done executing to be checked: {cmd_ids_to_check}") - cmds_to_resolve = [] - for cmd_id in cmd_ids_to_check: - # We check if we can resolve any possible dependencies - # If we can't, we have to wait for another cycle - if not self.cmd_can_be_resolved(cmd_id): - if cmd_id not in self.speculated: - logging.debug(f" > Adding node {cmd_id} to waiting list") - self.speculated.add(cmd_id) - else: - logging.debug(f" > Keeping node {cmd_id} to waiting list") - # If we are in this branch it means that we can resolve the dependencies of the current command - else: - cmds_to_resolve.append(cmd_id) - # We remove the command from the waiting to be resolved set - if cmd_id in self.speculated: - logging.debug(f" > Removing node {cmd_id} from waiting list") - 
logging.trace(f"WaitingRemove|{cmd_id}") - self.speculated.remove(cmd_id) - else: - logging.debug(f" > Node {cmd_id} is able to be resolved") - # The node can be resolved now - log_time_delta_from_named_timestamp("PartialOrder", "WaitingToResolve", cmd_id) - return sorted(cmds_to_resolve) def resolve_dependencies(self, cmds_to_resolve): @@ -1166,8 +1143,8 @@ def __frontier_commit_and_push(self): and frontier_node not in self.get_committed() \ and frontier_node not in self.stopped \ and frontier_node not in self.speculated \ - and frontier_node not in self.workset\ - and not self.is_loop_node(frontier_node)\ + and frontier_node not in self.workset \ + and not self.is_loop_node(frontier_node) \ and frontier_node not in self.waiting_for_frontend: ## Commit the node self.commit_node(frontier_node) @@ -1341,7 +1318,7 @@ def execute_cmd_core(self, node_id: NodeId, speculate=False): execute_func = executor.async_run_and_trace_command_return_trace proc, trace_file, stdout, stderr, post_execution_env_file = execute_func(cmd, node_id, env_file_to_execute_with) self.commands_currently_executing[node_id] = (proc, trace_file, stdout, stderr, post_execution_env_file) - logging.debug(f" >>>>> Command {node_id} - {proc.pid} just started executing") + logging.debug(f" >>>>> Command {node_id} - {proc.pid} just started executing - {post_execution_env_file}") # This method attempts to add to workset (rerun) # any command that found to have a dependency through early resolution @@ -1416,7 +1393,6 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand # We will however attempt to resolve dependencies early self.resolve_dependencies_early(node_id) restarted_cmds = self.attempt_rerun_pending_nodes() - logging.critical(f"Restarted {restarted_cmds}") self.log_partial_program_order_info() ## Here we continue with the normal execution flow else: @@ -1467,7 +1443,6 @@ def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node # Kill and 
restart all currently executing commands # The envs are updated inside __kill_all_currently_executing_and_schedule_restart self.__kill_all_currently_executing_and_schedule_restart([node_id]) - logging.critical(f">>>>>>>>>>>>>>>>{node_id} - {self.get_new_env_file_for_node(node_id)}") # For all other nodes not killed, we update the latest env and restart them for waiting_for_frontend_node in self.waiting_for_frontend: if waiting_for_frontend_node not in self.workset: @@ -1491,7 +1466,7 @@ def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node # We will however attempt to resolve dependencies early for the remaining nodes self.resolve_dependencies_early(node_id) restarted_cmds = self.attempt_rerun_pending_nodes() - logging.critical(f"Restarted after successfull env resolution {restarted_cmds}") + logging.debug(f"Restarted after successfull env resolution {restarted_cmds}") self.log_partial_program_order_info() self.resolve_commands_that_can_be_resolved_and_push_frontier() assert(self.valid()) @@ -1546,6 +1521,9 @@ def log_partial_program_order_info(self): logging.debug(f"for FRONTEND: {sorted(list(self.waiting_for_frontend))}") logging.debug(f"TO RESOLVE: {self.to_be_resolved}") logging.debug(f"PENDING TO EXEC: {self.pending_to_execute}") + logging.debug(f"RUN AFTER: {self.run_after}") + logging.debug(f"New envs: {self.new_envs}") + logging.debug(f"Latest envs: {self.latest_envs}") self.log_rw_sets() logging.debug(f"=" * 80) From f240d69dfddf2faf44a81c7e135a22fb1198711e Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 19 Sep 2023 03:00:48 -0600 Subject: [PATCH 60/90] Use recent pash version --- deps/pash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/pash b/deps/pash index 956064e3..e84a7125 160000 --- a/deps/pash +++ b/deps/pash @@ -1 +1 @@ -Subproject commit 956064e3fd50380538e0bc3f5dde4957c0d2a12c +Subproject commit e84a7125c813098beea659af7cce566f99e3266c From 4e9879098f16b708cf9ab6d2f3da2e70a9719369 Mon Sep 17 
00:00:00 2001 From: gliargovas Date: Tue, 19 Sep 2023 03:01:31 -0600 Subject: [PATCH 61/90] Add matplotlib dependency for reporting --- requirements.txt | 1 + scripts/install_deps_ubuntu20.sh | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..958a6b27 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +matplotlib>=3.7.0 \ No newline at end of file diff --git a/scripts/install_deps_ubuntu20.sh b/scripts/install_deps_ubuntu20.sh index fd1dc5b5..90323228 100755 --- a/scripts/install_deps_ubuntu20.sh +++ b/scripts/install_deps_ubuntu20.sh @@ -8,6 +8,8 @@ sudo update-alternatives --install /usr/bin/cram cram /usr/bin/cram3 100 export PASH_SPEC_TOP=${PASH_SPEC_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)} export PASH_TOP=${PASH_TOP:-$PASH_SPEC_TOP/deps/pash} +pip3 install --user -r $PASH_TOP/requirements.txt + ## Download submodule dependencies git submodule update --init --recursive From af0d3f65f094f6edb8acd96967cf7d0b0de2e662 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 19 Sep 2023 03:19:38 -0600 Subject: [PATCH 62/90] Fix trace parsing for mkdir -p --- parallel-orch/trace.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/parallel-orch/trace.py b/parallel-orch/trace.py index 4fb93081..16df3d74 100644 --- a/parallel-orch/trace.py +++ b/parallel-orch/trace.py @@ -50,6 +50,10 @@ def __repr__(self) -> str: return self.__str__() def get_resolved_path(self): + + if isinstance(self.ref, PathRef): + self.ref = self.ref.get_resolved_path() + # Remove dupliate prefixes if not self.path.startswith("/"): modified_path = "/" + self.path From dd5365ae8d1c9947de747729c833a978a388cbaf Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 19 Sep 2023 03:20:17 -0600 Subject: [PATCH 63/90] Add the option to run the trace parser independently --- parallel-orch/trace.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git 
a/parallel-orch/trace.py b/parallel-orch/trace.py index 16df3d74..e1ec859f 100644 --- a/parallel-orch/trace.py +++ b/parallel-orch/trace.py @@ -452,3 +452,22 @@ def parse_exit_code(trace_object) -> int: for line in reversed(trace_object): if "Exit(" in line: return int(line.split("Exit(")[1].rstrip(")\n")) + +# Trace can be called as a script with the trace file to analyze as an argument +def main(): + logging.basicConfig(level=logging.DEBUG) + trace_file = sys.argv[1] + with open(trace_file, "r") as f: + trace_object = f.readlines() + read_set, write_set = parse_and_gather_cmd_rw_sets(trace_object) + print("Read set:") + for r in read_set: + print(r) + print("Write set:") + for w in write_set: + print(w) + print("Exit code:") + print(parse_exit_code(trace_object)) + +if __name__ == "__main__": + main() \ No newline at end of file From 7b6832658c4dde2e9429e6148f0628cbe9bb3133 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 19 Sep 2023 12:27:50 -0600 Subject: [PATCH 64/90] Add README.md for benchmarks --- report/README.md | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 report/README.md diff --git a/report/README.md b/report/README.md new file mode 100644 index 00000000..2b4dc1a5 --- /dev/null +++ b/report/README.md @@ -0,0 +1,74 @@ +# hs Benchmark Directory README + +Welcome to the benchmark directory of the `hs`. This directory contains the essential tools and scripts to run benchmarks, analyze logs, and generate reports. + +## Overview + +The benchmarking tool provides an interface to run different benchmarks, collect performance metrics, and visualize the results through plots. It supports a wide range of features including: +- Running benchmarks with `bash` and `hs`. +- Comparing the outputs and performance of `Bash` and `hs`. +- Generating Gantt charts for each benchmark. +- Producing detailed logs and CSV results. 
+ +## Environment Variables + +The benchmarking tool sets up and exports a few essential environment variables for the system: + +- `WORKING_DIR`: The directory for the benchmarks and reports. +- `TEST_SCRIPT_DIR`: The directory containing benchmark scripts. +- `RESOURCE_DIR`: The directory to store resources required by benchmarks. +- `PASH_TOP`: The directory of `pash`. +- `PASH_SPEC_TOP`: The top directory of `hs`. + +## Command-Line Interface + +The primary command-line interface for the benchmark runner includes: +- `--no-plots`: Do not generate plots. +- `--no-logs`: Do not save log files. +- `--csv-output`: Save the results in CSV format. + +## Benchmark Configuration + +Benchmarks are configured using the `benchmark_config.json` file. Each benchmark in the configuration has the following properties: + +- `name`: The name of the benchmark. +- `env`: A list of environment variables required by the benchmark. +- `pre_execution_script`: A list of commands to run before executing the benchmark. Useful for fetching data or setting up the environment. +- `command`: The command or script to benchmark. +- `orch_args`: Arguments to pass to the `orch` system when running the benchmark. + +Example: + +```json +[ + { + "name": "Dgsh 1.sh - 120M", + "env": ["INPUT_FILE={RESOURCE_DIR}/in120M.xml"], + "pre_execution_script": ["wget -nc -O in120M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.xml"], + "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh", + "orch_args": "-d 2 --sandbox-killing" + } +] +``` + +## Running Benchmarks + +To run benchmarks: + +1. Navigate to the directory containing the benchmark runner (`cd ./report` from the top-level directory). +2. Execute the benchmark runner with desired arguments, e.g., `python3 benchmark_report.py --csv-output`. + +After running, the results, including logs, plots, and CSV files (if selected), will be saved in the `report_output` directory. 
+ +## Results Interpretation + +The results include: +- Execution times for `bash` and `hs`. +- A comparison of the execution times. +- Validity checks for the outputs. +- Detailed execution logs. +- Gantt and bar charts visualizing the execution. + +## Contributions + +Feel free to contribute to the benchmark suite by adding new benchmarks or improving existing ones. Ensure that any new benchmarks have the necessary configuration in the `benchmark_config.json` file. From d6f7f7940cb42c4035c4b01a87650d6e63ef77ad Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 19 Sep 2023 13:22:59 -0600 Subject: [PATCH 65/90] Add improved top-level README --- README.md | 90 ++++++++++++++++++++++++++++---------------------- test/README.md | 86 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 39 deletions(-) create mode 100644 test/README.md diff --git a/README.md b/README.md index 17053c2e..20009189 100644 --- a/README.md +++ b/README.md @@ -1,66 +1,78 @@ -## Dynamic Parallelizer +## hs README -A dynamic parallelizer that optimistically/speculatively executes everything in a script in parallel and ensures that it executes correctly by tracing it and reexecuting the parts that were erroneous. +### Overview -## Installing +`hs` is an out-of-order shell designed to execute script commands in a speculative parallel manner. It achieves this by tracing the script's execution, and if an error arises due to speculative execution, the script re-executes the necessary parts to ensure correct outcomes. The project aims to boost the parallel execution of shell scripts, reducing their runtime and enhancing efficiency. -```sh -./scripts/install_deps_ubuntu20.sh -``` +### Structure -## Tests +The project's top-level directory contains the following: -To run the tests: -```sh -cd test -./test_orch.sh -``` +- `deps`: Dependencies required by `hs`. +- `docs`: Documentation and architectural diagrams. +- `model-checking`: Tools and utilities for model checking. 
+- `parallel-orch`: Main orchestration components. +- `pash-spec.sh`: Entry script to initiate the `hs` process. +- `README.md`: This documentation file. +- `report`: Generated reports related to test runs and performance metrics. +- `requirements.txt`: List of Python dependencies. +- `Rikerfile`: Configuration file for Riker. -### TODO Items +### Installation -#### Complete control flow and complex script support +Install `hs` on your Linux-based machine by following these steps: -Extend the architecture to support complete scripts and not just partial order graphs of commands. +**Note:** Currently works with `Ubuntu 20.04` or later -A potential solution is shown below: +1. Navigate to the project directory: + ```sh + cd path_to/dynamic-parallelizer + ``` -![Architecture Diagram](/docs/handdrawn_architecture.jpeg) +2. Run the installation script: + ```sh + ./scripts/install_deps_ubuntu20.sh + ``` -This solution includes a preprocessor that creates two executable artifacts: -- the preprocessed/instrumented script (similar to what the PaSh-JIT preprocessor produces) -- the partial program order graph (a graph of commands that will be speculated and executed with tracing from the orchestrator) +This script will handle all the necessary installations, including dependencies, try, Riker, and PaSh. -The graph might contain unexpanded commands, so the orchestrator should support unexpanded strings. -On these commands, the orchestrator can speculate for the value of these strings and then when they become the frontier (the preprocessed script has reached them), we actually know their values and could confirm/abort the speculation. +### Running `hs` -The two executors communicate with each other and progress through the script execution in tandem. The JIT executor (left) also needs to trace execution to inform the orchestrator about changes in the environment. +The main entry script to initiate `hs` is `pash-spec.sh`. 
This script sets up the necessary environment and invokes the orchestrator in `parallel-orch/orch.py`. It's designed to accept a variety of arguments to customize its behavior, such as setting debug levels or specifying log files. -#### Orchestator: Partial Program Order Graph +Example of running the script: -**Note:** we have moved to a continuous scheduling implementation. An example explaining its operation can be found [here](/docs/example.md). +```bash +./pash-spec.sh [arguments] script_to_speculatively_run.sh +``` -The orchestrator needs to support arbitrary partial program order graphs (instead of just sequences of instructions), to figure out the precise real program order dependencies. +**Arguments**: +- `-d, --debug-level`: Set the debugging level. Default is `0`. +- `-f, --log_file`: Define the logging output file. By default, logs are printed to stdout. +- `--sandbox-killing`: Kill any running overlay instances before committing to the lower layer. +- `--env-check-all-nodes-on-wait`: On a wait, check for environment changes between the current node and all other waiting nodes. (not fully functional yet!) -An instance of a graph is shown below: +### Testing -![Example Partial Program Order Graph](/docs/handdrawn_partial_program_order.jpeg) +To run the provided tests: -One important characteristic of the graph (and the speculative execution algorithm) is that there is a committed prefix-closed part that has already executed and cannot be affected. -The rest of the graph is uncommited and therefore might or might not have completed execution. The uncommited frontier, the part of the graph adjacent to the prefix is guaranteed to execute and complete without speculation (since we have both the environment and the variables resolved) and this is part of the argument for the termination of the algorithm. 
Every step that the orchestration takes, it can always commit the uncommited frontier, and therefore the commited prefix grows until it reaches the whole graph. +```bash +./test/test_orch.sh +``` -#### Orchestrator: Backward dependencies and Execution Isolation/Aborting/Reverting +For in-depth analysis, set the `DEBUG` environment variable to `2` for detailed logs and redirect logs to a file: -How do we resolve backward dependencies? For example: -```sh -grep foo in1 > out1 -grep bar in0 > in1 ## Its write might affect the first command exec. +```bash +DEBUG=2 ./test/test_orch.sh 2>logs.txt ``` -One solution would be to run the non-frontier (non-root) commands in an isolated environment and only at the end of their execution commit their results. This might have significant overhead, except if we can just write to temporary files and then move them? Or let them work in a temporary directory? +### Contributing and Further Development + +Contributions are always welcome! The project roadmap includes extending the architecture to support complete scripts, optimizing the scheduler for better performance, etc. -Another way would be to dynamically track writes of non-frontier commands and stop them when they try to write to something that might be a read dependency of the first, but there are timing issues here that I don't see how to resolve. +More issues to be added soon... -#### Commands that change current directory +### License -Can we actually trace that and not run these commands? Is that simply a change of an environment variable? They will run in a forked version anyway, but we want to see their results. +`hs` is licensed under the MIT License. See the `LICENSE` file for more information. diff --git a/test/README.md b/test/README.md new file mode 100644 index 00000000..233dbd95 --- /dev/null +++ b/test/README.md @@ -0,0 +1,86 @@ +## README for `hs` Test Suite + +### Overview +This directory contains the test suite for `hs`. 
The main test script is `test_orch.sh`, which automates the process of running various tests on both `hs` and `bash` to ensure consistency and correctness. + +### Directory Structure + +- **test_scripts**: Contains the individual test scripts. +- **misc**: Contains utility scripts used by the test cases. +- **output_bash**: Directory to save the output of scripts executed by `bash`. +- **output_orch**: Directory to save the output of scripts executed by `hs`. +- **results**: Stores the result status and logs for each test. +- **parse_cmd_repetitions.py**: Python script to parse command repetitions from the `hs` logs. + +### Main Test Script (`test_orch.sh`) + +The main test script `test_orch.sh` starts by setting up environment variables and directories. It then proceeds to define utility functions: + +- `cleanup()`: Removes cache and clears output directories. +- `test_repetitions()`: Validates the repetition of commands using `parse_cmd_repetitions.py`. +- `run_test()`: Executes a given test for both `bash` and `hs` and compares the outputs. +- Various test functions, e.g., `test_single_command()`, `test_local_vars_1()`, etc. + +Finally, it runs the set of defined tests, provides a summary of the results, and outputs logs for both passed and failed tests. + +### Running Tests + +To run all tests: +``` +./test_orch.sh +``` + +To run specific tests: +``` +./test_orch.sh [test_name] +``` + +Before running the tests, you can set the `DEBUG` environment variable to enable detailed logging. Assign a value of 2 to `DEBUG` to get the most detailed logs. + +```bash +export DEBUG=2 +``` + +Since the logs are printed to stderr, you can redirect them to a file to facilitate easier analysis: + +```bash +./test_orch.sh [test_name] 2>logs.txt +``` + +### Test Results + +At the end of execution, a summary is presented: + +1. List of tests that produced identical outputs on both `bash` and `hs`. +2. List of tests that produced non-identical outputs. +3.
Overall summary indicating the number of tests passed. + +The detailed logs for passed and failed tests can be found in the `results` directory. + +### Adding More Tests + +If you would like to expand the test suite by adding more tests, follow these guidelines: + +1. **Create Test Script**: Write a new Bash script that performs the desired test. For example, to test a new functionality `XXX`, create a file named `test_XXX.sh` under the `test_scripts` directory.

+Utilize utility scripts from the `misc` directory, like `sleep_and_grep.sh`, to help maintain a modular design. This way, changes made to utility functions can propagate across multiple tests. + +2. **Update Main Test Suite**: + In the main test script (`test_orch.sh`), add a new function named similarly to your test script. This function should prepare any required input files and run your test script. For example: + + ```bash + test_XXX() + { + local shell=$1 + # Setup input data here if required + $shell $TEST_SCRIPT_DIR/test_XXX.sh + } + ``` + +3. **Integrate into Test Runner**: + Add a call to `run_test` with your new function as the argument. This ensures it's part of the suite when no specific test names are given. Add this before `if [ "$#" -eq 0 ]; then`, for instance: + + ```bash + run_test test_XXX + ``` + + Optionally, you can provide expected repetition values as a second argument if required by the test. From 53c5ea56b454c855fecf1514b17bc7e30848c92b Mon Sep 17 00:00:00 2001 From: Georgios Liargkovas <56384743+gliargovas@users.noreply.github.com> Date: Wed, 20 Sep 2023 12:37:38 +0300 Subject: [PATCH 66/90] Update README.md Add link to optimization issues --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 20009189..44cce850 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ DEBUG=2 ./test/test_orch.sh 2>logs.txt Contributions are always welcome! The project roadmap includes extending the architecture to support complete scripts, optimizing the scheduler for better performance, etc. -More issues to be added soon...
+For a detailed description of possible optimizations, see the [related issues](https://github.com/binpash/dynamic-parallelizer/issues?q=is%3Aopen+is%3Aissue+label%3Aoptimization) ### License From f4880ee8463f14b2ddc379261928008b8985a928 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 21 Sep 2023 00:34:49 -0600 Subject: [PATCH 67/90] Fix typo in installation script --- scripts/install_deps_ubuntu20.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/install_deps_ubuntu20.sh b/scripts/install_deps_ubuntu20.sh index 90323228..d54171b0 100755 --- a/scripts/install_deps_ubuntu20.sh +++ b/scripts/install_deps_ubuntu20.sh @@ -8,7 +8,7 @@ sudo update-alternatives --install /usr/bin/cram cram /usr/bin/cram3 100 export PASH_SPEC_TOP=${PASH_SPEC_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)} export PASH_TOP=${PASH_TOP:-$PASH_SPEC_TOP/deps/pash} -pip3 install --user -r $PASH_TOP/requirements.txt +pip3 install --user -r $PASH_SPEC_TOP/requirements.txt ## Download submodule dependencies git submodule update --init --recursive From db75a7c0a20122b1a377b9e3f66f421911e008f7 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 21 Sep 2023 00:35:25 -0600 Subject: [PATCH 68/90] Remove redundant reporting script --- report/scheduling_report.py | 239 ------------------------------------ 1 file changed, 239 deletions(-) delete mode 100644 report/scheduling_report.py diff --git a/report/scheduling_report.py b/report/scheduling_report.py deleted file mode 100644 index c687cb13..00000000 --- a/report/scheduling_report.py +++ /dev/null @@ -1,239 +0,0 @@ -#!/bin/env python3 - -from enum import Enum -import os -import sys -from dateutil import parser -import plotly.express as px -from datetime import datetime, date - - -class CommandState(Enum): - EXECUTING = "Executing" - EXECUTING_SANDBOXED = "Executing sandboxed" - STOPPED_NETWORK = "Stopped: network" - STOPPED_ERROR = "Stopped: ec!=0" - WAITING = "Waiting" - COMMITTED = "Committed" - 
NO_STATE = "No state" - - -class PashSpecTraceObject: - - def __init__(self, timestamp: datetime, action: str, message): - self.action = action - self.message = message - self.timestamp = timestamp.time() - - def __str__(self): - return f"PashSpecTraceObject({self.timestamp}|{self.action}|{self.message})" - - -class SchedulingStateSet: - - def handle_node(self, object): - self.nodes = [node_id for node_id in object.message.split(",")] - self.nodes.reverse() - self.start_timestamp = object.timestamp - - def __init__(self): - self.cmd_states = [] - self.unresolved_states = dict() - self.nodes = [] - self.marks = [] - self.bash_timestamp = None - self.start_timestamp = None - - def plot(self): - self.cmd_states.sort(key=lambda x: x["Command_Id"]) - fig1 = px.timeline(self.cmd_states, - y='Command_Id', - x_start="Start", - x_end="Finish", - hover_data=['Command_Id', 'State'], - color="State", - category_orders={"0": 1, "1": 2, "2": 3, "3": 4, "4": 5}) - fig1.update_layout(showlegend=True, xaxis_tickformat='%M:%S,%L', - yaxis_title="Command ID", xaxis_title="Time (ms)") - fig1.update_yaxes(categoryorder='array', categoryarray=self.nodes) - fig1.update_traces(marker=dict(size=12, - line=dict(width=2, - color='DarkSlateGrey')), - selector=dict(mode='markers')) - - # Add commit markers - y = [mark["y"] for mark in self.marks if mark["event"] == "Commit"] - x = [mark["x"] for mark in self.marks if mark["event"] == "Commit"] - fig1.add_scatter(y=y, x=x, - marker_symbol="diamond", - marker=dict(color='Black', size=16), - mode="markers", - name="Commit") - - # Add execution markers - y = [mark["y"] - for mark in self.marks if mark["event"] == CommandState.EXECUTING] - x = [mark["x"] - for mark in self.marks if mark["event"] == CommandState.EXECUTING] - fig1.add_scatter(y=y, x=x, - marker_symbol="circle", - marker=dict(color='Black', size=16), - mode="markers", - name="Normal exec start") - - # Add sandbox execution markers - y = [mark["y"] for mark in self.marks if 
mark["event"] - == CommandState.EXECUTING_SANDBOXED] - x = [mark["x"] for mark in self.marks if mark["event"] - == CommandState.EXECUTING_SANDBOXED] - fig1.add_scatter(y=y, x=x, - marker_symbol="circle", - marker=dict(color='Red', size=16), - mode="markers", - name="Sandbox exec start") - - if self.bash_timestamp is not None: - fig1.update_layout(shapes=[ - dict( - type='line', - yref='paper', y0=0, y1=1, - xref='x', x0=self.bash_timestamp, x1=self.bash_timestamp - ) - ]) - fig1.show() - - def add_task(self, node_id, start, end, state): - self.cmd_states.append( - dict(Command_Id=str(node_id), Start=start, Finish=end, State=state.value)) - - def handle_executing_add(self, object): - node_id = int(object.message) - self.unresolved_states[node_id] = ( - object.timestamp, CommandState.EXECUTING) - self.marks.append(dict(y=node_id, x=object.timestamp, - event=CommandState.EXECUTING)) - - def handle_executing_sandbox_add(self, object): - node_id = int(object.message) - self.unresolved_states[node_id] = ( - object.timestamp, CommandState.EXECUTING_SANDBOXED) - self.marks.append(dict(y=node_id, x=object.timestamp, - event=CommandState.EXECUTING_SANDBOXED)) - - def handle_executing_remove(self, object): - node_id = int(object.message) - assert node_id in self.unresolved_states - start, state = self.unresolved_states.pop(node_id) - end = object.timestamp - self.add_task(node_id, start, end, state) - self.add_task(node_id, end, end, state) - - def handle_frontier_add(self, object): - node_id = int(object.message) - self.unresolved_states[node_id] = ( - object.timestamp, CommandState.FRONTIER) - - def handle_frontier_remove(self, object): - pass - - def handle_stopped_add(self, object): - node_str, reason = object.message.split(":") - node_id = int(node_str) - if reason == "error": - self.unresolved_states[node_id] = ( - object.timestamp, CommandState.STOPPED_ERROR) - elif reason == "network": - self.unresolved_states[node_id] = ( - object.timestamp, 
CommandState.STOPPED_NETWORK) - else: - assert False - - def handle_stopped_remove(self, object): - node_id = int(object.message) - assert node_id in self.unresolved_states - start, state = self.unresolved_states.pop(node_id) - end = object.timestamp - self.add_task(node_id, start, end, state) - - def handle_waiting_add(self, object): - node_id = int(object.message) - self.unresolved_states[node_id] = ( - object.timestamp, CommandState.WAITING) - - def handle_waiting_remove(self, object): - node_id = int(object.message) - assert node_id in self.unresolved_states - start, state = self.unresolved_states.pop(node_id) - end = object.timestamp - self.add_task(node_id, start, end, state) - - def handle_commit(self, object): - nodes = [str(node) for node in object.message.split(",")] - for node in nodes: - self.marks.append(dict(y=node, x=object.timestamp, event="Commit")) - - def handle_bash(self, object): - self.bash_timestamp = datetime.strptime(object.message, "%M:%S.%f") - print(datetime.strptime(object.message, "%M:%S.%f")) - pass - - -def adjust_timestamp(state_set: SchedulingStateSet, trace_object): - t = state_set.start_timestamp - print(datetime(1900, 1, 1, 0, 0, 0) + (datetime.combine(date.min, - trace_object.timestamp) - datetime.combine(date.min, t))) - trace_object.timestamp = datetime(1900, 1, 1, 0, 0, 0) + (datetime.combine( - date.min, trace_object.timestamp) - datetime.combine(date.min, t)) - - -def parse_trace_objects(trace_file): - with open(trace_file) as logfile: - lines = logfile.read().split("\n") - lines = [tuple(line.split("|")[1:]) - for line in lines if line.startswith("TRACE|")] - trace_objects = [PashSpecTraceObject(parser.parse( - timestamp), action, message) for timestamp, action, message in lines] - return trace_objects - - -def main(): - states = SchedulingStateSet() - trace_file = os.path.join(os.path.abspath(sys.argv[1])) - lines = parse_trace_objects(trace_file) - for object in lines: - action = object.action - if action == "Nodes": - 
states.handle_node(object) - elif action == "ExecutingAdd": - adjust_timestamp(states, object) - states.handle_executing_add(object) - elif action == "ExecutingSandboxAdd": - adjust_timestamp(states, object) - states.handle_executing_sandbox_add(object) - if action == "ExecutingRemove": - adjust_timestamp(states, object) - states.handle_executing_remove(object) - elif action == "StoppedAdd": - adjust_timestamp(states, object) - states.handle_stopped_add(object) - elif action == "StoppedRemove": - adjust_timestamp(states, object) - states.handle_stopped_remove(object) - elif action == "WaitingAdd": - adjust_timestamp(states, object) - states.handle_waiting_add(object) - elif action == "WaitingRemove": - adjust_timestamp(states, object) - states.handle_waiting_remove(object) - elif action == "Commit": - adjust_timestamp(states, object) - states.handle_commit(object) - elif action == "Bash": - states.handle_bash(object) - else: - assert False, f"Not implemented handle for action: {action}" - states.plot() - - -if __name__ == "__main__": - main() From 5aa58346608de5d00bd4d1a503aabde871ab30c1 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 21 Sep 2023 03:44:38 -0600 Subject: [PATCH 69/90] Add option to start speculation on first wait --- parallel-orch/config.py | 3 ++- parallel-orch/partial_program_order.py | 18 ++++++++++++++---- parallel-orch/scheduler_server.py | 8 ++++++++ 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/parallel-orch/config.py b/parallel-orch/config.py index 48761c76..0796435b 100644 --- a/parallel-orch/config.py +++ b/parallel-orch/config.py @@ -39,7 +39,7 @@ def log_root(msg, *args, **kwargs): 'PASH_TOP', 'PASH_TOP_LEVEL','RANDOM', 'LOGNAME', 'MACHTYPE', 'MOTD_SHOWN', 'OPTERR', 'OPTIND', 'PPID', 'PROMPT_COMMAND', 'PS4', 'SHELL', 'SHELLOPTS', 'SHLVL', 'TERM', 'UID', 'USER', 'XDG_SESSION_ID'} -SIGNIFICANT_VARS = {'foo', 'bar', 'baz','file1', 'file2', 'file3', 'file4', 'file5', 'LC_ALL', 'nchars'} +SIGNIFICANT_VARS = {'foo', 
'bar', 'baz', 'file1', 'file2', 'file3', 'file4', 'file5', 'LC_ALL', 'nchars', 'filename'} START_TIME = time.time() @@ -47,3 +47,4 @@ def log_root(msg, *args, **kwargs): sandbox_killing = False all_node_env_resolution = False +speculate_immidiately = False \ No newline at end of file diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 307ef0f8..715e5f53 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -408,17 +408,23 @@ def init_partial_order(self): self.init_workset() logging.debug(f'Initialized workset') self.populate_to_be_resolved_dict() - self.init_latest_env_files() + if config.speculate_immidiately: + self.init_latest_env_files() logging.debug(f'To be resolved sets per node:') logging.debug(self.to_be_resolved) logging.info(f'Initialized the partial order!') self.log_partial_program_order_info() - assert(self.valid()) - def init_latest_env_files(self): + + def init_latest_env_files(self, node=None): + if node is None: + env_to_assign = self.initial_env_file + else: + env_to_assign = self.get_new_env_file_for_node(node) for node_id in self.get_all_non_committed(): - self.set_latest_env_file_for_node(node_id, self.initial_env_file) + self.set_latest_env_file_for_node(node_id, env_to_assign) + def init_workset(self): self.workset = self.get_all_non_committed_standard_nodes() @@ -1244,6 +1250,10 @@ def attempt_move_stopped_to_workset(self): ## TODO: Eventually, in the future, let's add here some form of limit def schedule_work(self, limit=0): + if not config.speculate_immidiately and \ + self.get_latest_env_file_for_node(self.get_standard_source_nodes()[0]) is None: + logging.debug("Not scheduling work yet, waiting for first Wait") + return # self.log_partial_program_order_info() logging.debug("Rerunning stopped commands") # attempt_move_stopped_to_workset() needs to happen before the node execution diff --git a/parallel-orch/scheduler_server.py 
b/parallel-orch/scheduler_server.py index 4267d946..82d7d2ca 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -34,6 +34,10 @@ def parse_args(): action="store_true", default=None, help="When receiving a wait check for env changes between the current node and all other waiting nodes, instead of only examining the current wait node.") + parser.add_argument("--speculate-immidiately", + action="store_true", + default=False, + help="Speculate immidiately instead of waiting for the first Wait message.") args, unknown_args = parser.parse_known_args() return args @@ -103,6 +107,9 @@ def handle_wait(self, input_cmd: str, connection): ## Set the new env file for the node self.partial_program_order.set_new_env_file_for_node(node_id, pash_runtime_vars_file_str) + + self.partial_program_order.init_latest_env_files(node_id) + ## Attempt to rerun all pending nodes self.partial_program_order.attempt_rerun_pending_nodes() @@ -306,6 +313,7 @@ def main(): # Set optimization options config.sandbox_killing = args.sandbox_killing config.all_node_env_resolution = args.env_check_all_nodes_on_wait + config.speculate_immidiately = args.speculate_immidiately scheduler = Scheduler(config.SCHEDULER_SOCKET) scheduler.run() From 0bcf4632f967b657496403a011e51059f2a20a4a Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 21 Sep 2023 04:40:48 -0600 Subject: [PATCH 70/90] Add extra check when setting most recent env --- parallel-orch/scheduler_server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 82d7d2ca..7ef78ed9 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -108,7 +108,8 @@ def handle_wait(self, input_cmd: str, connection): ## Set the new env file for the node self.partial_program_order.set_new_env_file_for_node(node_id, pash_runtime_vars_file_str) - self.partial_program_order.init_latest_env_files(node_id) + if 
not config.speculate_immidiately and node_id == self.partial_program_order.get_standard_source_nodes()[0]: + self.partial_program_order.init_latest_env_files(node_id) ## Attempt to rerun all pending nodes self.partial_program_order.attempt_rerun_pending_nodes() From f96e20a0d88893ffdcca22d6043534d616e0d098 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 21 Sep 2023 05:35:51 -0600 Subject: [PATCH 71/90] Add early on-wait-received env resolution check --- parallel-orch/partial_program_order.py | 49 ++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 715e5f53..7e4a3b01 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -286,6 +286,7 @@ def __init__(self, nodes, edges, initial_env_file): self.run_after = defaultdict(set) self.pending_to_execute = set() self.to_be_resolved_prev = {} + self.prechecked_env = set() def __str__(self): return f"NODES: {len(self.nodes.keys())} | ADJACENCY: {self.adjacency}" @@ -656,7 +657,9 @@ def __kill_all_currently_executing_and_schedule_restart(self, node_ids: "list[No for cmd_id in nodes_to_kill: self.__kill_node(cmd_id) most_recent_new_env = self.get_most_recent_possible_new_env_for_node(cmd_id) + self.prechecked_env.discard(cmd_id) if most_recent_new_env is not None: + self.set_latest_env_file_for_node(cmd_id, most_recent_new_env) self.workset.remove(cmd_id) log_time_delta_from_named_timestamp("PartialOrder", "RunNode", cmd_id) @@ -664,6 +667,16 @@ def __kill_all_currently_executing_and_schedule_restart(self, node_ids: "list[No # Our new workset is the nodes that were killed # Previous workset got killed self.workset.extend(nodes_to_kill) + + def __kill_executing_node_and_schedule_restart(self, node_id: NodeId): + self.__kill_node(node_id) + most_recent_new_env = self.get_most_recent_possible_new_env_for_node(node_id) + if most_recent_new_env is not None: + 
self.set_latest_env_file_for_node(node_id, most_recent_new_env) + self.workset.remove(node_id) + log_time_delta_from_named_timestamp("PartialOrder", "RunNode", node_id) + log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", node_id, key=f"PostExecResolution-{node_id}") + self.workset.extend(node_id) def __kill_node(self, cmd_id: "NodeId"): logging.debug(f'Killing and restarting node {cmd_id} because some workspaces have to be committed') @@ -1432,16 +1445,47 @@ def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_i logging.debug("No significant differences found:") return False + def resolve_most_recent_envs_check_only_wait_node_early(self, node_id: NodeId, restarted_cmds=None): + if self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), + self.get_latest_env_file_for_node(node_id)): + logging.debug(f"[Early] Significant differences found between new and latest env files for {node_id}.") + logging.debug(f"[Early] Assigning node {node_id} new env (Wait) as the new latest env and re-executing.") + self.set_latest_env_file_for_node(node_id, self.get_new_env_file_for_node(node_id)) + if node_id not in self.workset: + self.workset.append(node_id) + + nodes_to_kill = self.get_currently_executing().copy() + self.__kill_all_currently_executing_and_schedule_restart([node_id]) + for waiting_for_frontend_node in self.waiting_for_frontend: + if waiting_for_frontend_node not in self.workset: + self.workset.append(waiting_for_frontend_node) + most_recent_new_env = self.get_most_recent_possible_new_env_for_node(waiting_for_frontend_node) + self.prechecked_env.discard(waiting_for_frontend_node) + self.set_latest_env_file_for_node(waiting_for_frontend_node, most_recent_new_env) + assert(self.get_new_env_file_for_node(node_id) is not None) + assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None) + self.log_partial_program_order_info() + logging.debug("-") + 
self.waiting_for_frontend = set() + self.populate_to_be_resolved_dict() + else: + self.prechecked_env.add(node_id) + + + def maybe_resolve_most_recent_envs_and_continue_resolution(self, node_id: NodeId): if node_id in self.waiting_for_frontend: - logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.") + logging.debug(f"Node {node_id} received its new env from runtime, continuing full env resolution.") self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) + else: + logging.debug(f"Node {node_id} received its new env from runtime, continuing early env resolution.") + self.resolve_most_recent_envs_check_only_wait_node_early(node_id) def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(self, node_id: NodeId, restarted_cmds=None): logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.") # Node is no longer waiting to be resolved. It might have not been waiting at all. 
self.waiting_for_frontend.discard(node_id) - if self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), + if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), self.get_latest_env_file_for_node(node_id)): logging.debug(f"Significant differences found between new and latest env files for {node_id}.") logging.debug(f"Assigning node {node_id} new env (Wait) as the new latest env and re-executing.") @@ -1459,6 +1503,7 @@ def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node self.workset.append(waiting_for_frontend_node) most_recent_new_env = self.get_most_recent_possible_new_env_for_node(waiting_for_frontend_node) self.set_latest_env_file_for_node(waiting_for_frontend_node, most_recent_new_env) + self.prechecked_env.discard(waiting_for_frontend_node) assert(self.get_new_env_file_for_node(node_id) is not None) assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None) self.log_partial_program_order_info() From b7d6077dea3d7da482cd20c82358597b2b72c701 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 21 Sep 2023 07:15:53 -0600 Subject: [PATCH 72/90] Add check for initial env on non-po nodes --- parallel-orch/partial_program_order.py | 28 ++++++++++++++------------ parallel-orch/scheduler_server.py | 9 +++++++-- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 7e4a3b01..0f57b43a 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -667,16 +667,7 @@ def __kill_all_currently_executing_and_schedule_restart(self, node_ids: "list[No # Our new workset is the nodes that were killed # Previous workset got killed self.workset.extend(nodes_to_kill) - - def __kill_executing_node_and_schedule_restart(self, node_id: NodeId): - self.__kill_node(node_id) - 
most_recent_new_env = self.get_most_recent_possible_new_env_for_node(node_id) - if most_recent_new_env is not None: - self.set_latest_env_file_for_node(node_id, most_recent_new_env) - self.workset.remove(node_id) - log_time_delta_from_named_timestamp("PartialOrder", "RunNode", node_id) - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", node_id, key=f"PostExecResolution-{node_id}") - self.workset.extend(node_id) + def __kill_node(self, cmd_id: "NodeId"): logging.debug(f'Killing and restarting node {cmd_id} because some workspaces have to be committed') @@ -1263,8 +1254,12 @@ def attempt_move_stopped_to_workset(self): ## TODO: Eventually, in the future, let's add here some form of limit def schedule_work(self, limit=0): - if not config.speculate_immidiately and \ - self.get_latest_env_file_for_node(self.get_standard_source_nodes()[0]) is None: + + if not config.speculate_immidiately: + starting_env_node = self.get_source_nodes() + ## It means we have a loop node at the start + ## In that case, we roll back to the original initial env + if len(starting_env_node) > 0 and self.get_latest_env_file_for_node(starting_env_node[0]) is None: logging.debug("Not scheduling work yet, waiting for first Wait") return # self.log_partial_program_order_info() @@ -1357,6 +1352,7 @@ def attempt_rerun_pending_nodes(self): self.pending_to_execute.discard(node) self.set_latest_env_file_for_node(node, self.get_new_env_file_for_node(node_id)) restarted_nodes.add(node) + self.prechecked_env.discard(node) new_run_after_nodes.discard(node) self.run_after[node_id] = new_run_after_nodes return restarted_nodes @@ -1446,11 +1442,17 @@ def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_i return False def resolve_most_recent_envs_check_only_wait_node_early(self, node_id: NodeId, restarted_cmds=None): - if self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), + # We check whether we received a wait for 
a node we haven't yet unrolled. + # We need to first unroll the node and then speculate about it. + # TODO: Maybe we could move unrolling earlier in handle_wait()? + if not self.is_node_id(node_id): + return + if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), self.get_latest_env_file_for_node(node_id)): logging.debug(f"[Early] Significant differences found between new and latest env files for {node_id}.") logging.debug(f"[Early] Assigning node {node_id} new env (Wait) as the new latest env and re-executing.") self.set_latest_env_file_for_node(node_id, self.get_new_env_file_for_node(node_id)) + self.prechecked_env.discard(node_id) if node_id not in self.workset: self.workset.append(node_id) diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 7ef78ed9..24cfdebe 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -108,8 +108,11 @@ def handle_wait(self, input_cmd: str, connection): ## Set the new env file for the node self.partial_program_order.set_new_env_file_for_node(node_id, pash_runtime_vars_file_str) - if not config.speculate_immidiately and node_id == self.partial_program_order.get_standard_source_nodes()[0]: - self.partial_program_order.init_latest_env_files(node_id) + if not config.speculate_immidiately: + starting_env_node = self.partial_program_order.get_source_nodes() + if len(starting_env_node) > 0 and self.partial_program_order.get_latest_env_file_for_node(starting_env_node[0]) is None: + logging.debug("Initializing latest env and speculating") + self.partial_program_order.init_latest_env_files(node_id) ## Attempt to rerun all pending nodes self.partial_program_order.attempt_rerun_pending_nodes() @@ -121,6 +124,8 @@ def handle_wait(self, input_cmd: str, connection): ## forward and so on. 
self.partial_program_order.wait_received(node_id) + # self.partial_program_order.maybe_resolve_most_recent_envs_and_continue_resolution(node_id) + ## If the node_id is already committed, just return its exit code if node_id in self.partial_program_order.get_committed(): # TODO: Env check and if no conflicts, commit From eb218dff9ca2a44eda50c98f6f4fc258c45d618b Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 21 Sep 2023 07:57:31 -0600 Subject: [PATCH 73/90] move env check on wait after loop unrolling --- parallel-orch/partial_program_order.py | 2 -- parallel-orch/scheduler_server.py | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 0f57b43a..a7da234a 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -1445,8 +1445,6 @@ def resolve_most_recent_envs_check_only_wait_node_early(self, node_id: NodeId, r # We check whether we received a wait for a node we haven't yet unrolled. # We need to first unroll the node and then speculate about it. # TODO: Maybe we could move unrolling earlier in handle_wait()? 
- if not self.is_node_id(node_id): - return if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), self.get_latest_env_file_for_node(node_id)): logging.debug(f"[Early] Significant differences found between new and latest env files for {node_id}.") diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 24cfdebe..cf242082 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -117,14 +117,12 @@ def handle_wait(self, input_cmd: str, connection): ## Attempt to rerun all pending nodes self.partial_program_order.attempt_rerun_pending_nodes() - ## Attempt to resolve environment differences on waiting partial order nodes - self.partial_program_order.maybe_resolve_most_recent_envs_and_continue_resolution(node_id) - ## Inform the partial order that we received a wait for a node so that it can push loops ## forward and so on. self.partial_program_order.wait_received(node_id) - # self.partial_program_order.maybe_resolve_most_recent_envs_and_continue_resolution(node_id) + # Moved this below wait_received, in order to support unrolled loop nodes + self.partial_program_order.maybe_resolve_most_recent_envs_and_continue_resolution(node_id) ## If the node_id is already committed, just return its exit code if node_id in self.partial_program_order.get_committed(): From 62ae3d8af34b14831a3dda6517c5a912c3058b85 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 21 Sep 2023 08:10:02 -0600 Subject: [PATCH 74/90] Simplify Node 0 env initialization --- parallel-orch/partial_program_order.py | 21 +++++++++++++-------- parallel-orch/scheduler_server.py | 8 +++----- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index a7da234a..a4ab583b 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -616,6 
+616,16 @@ def add_to_write_set(self, node_id: NodeId, item: str): def add_to_speculated(self, node_id: NodeId): self.speculated = self.speculated.union([node_id]) + def is_first_node_when_env_is_uninitialized(self, speculate_immidiately): + if not speculate_immidiately: + starting_env_node = self.get_source_nodes() + ## We may have a loop node at the start + ## In that case, we roll back to the initial env + if len(starting_env_node) > 0 and self.get_latest_env_file_for_node(starting_env_node[0]) is None: + logging.debug("Initializing latest env and speculating") + return True + return False + # Check if the specific command can be resolved. # KK 2023-05-04 I am not even sure what this function does and why is it useful. def cmd_can_be_resolved(self, node_id: int) -> bool: @@ -1254,14 +1264,9 @@ def attempt_move_stopped_to_workset(self): ## TODO: Eventually, in the future, let's add here some form of limit def schedule_work(self, limit=0): - - if not config.speculate_immidiately: - starting_env_node = self.get_source_nodes() - ## It means we have a loop node at the start - ## In that case, we roll back to the original initial env - if len(starting_env_node) > 0 and self.get_latest_env_file_for_node(starting_env_node[0]) is None: - logging.debug("Not scheduling work yet, waiting for first Wait") - return + if self.is_first_node_when_env_is_uninitialized(config.speculate_immidiately): + logging.debug("Not scheduling work yet, waiting for first Wait") + return # self.log_partial_program_order_info() logging.debug("Rerunning stopped commands") # attempt_move_stopped_to_workset() needs to happen before the node execution diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index cf242082..11483592 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -108,11 +108,9 @@ def handle_wait(self, input_cmd: str, connection): ## Set the new env file for the node 
self.partial_program_order.set_new_env_file_for_node(node_id, pash_runtime_vars_file_str) - if not config.speculate_immidiately: - starting_env_node = self.partial_program_order.get_source_nodes() - if len(starting_env_node) > 0 and self.partial_program_order.get_latest_env_file_for_node(starting_env_node[0]) is None: - logging.debug("Initializing latest env and speculating") - self.partial_program_order.init_latest_env_files(node_id) + if self.partial_program_order.is_first_node_when_env_is_uninitialized(config.speculate_immidiately): + logging.debug("Initializing latest env and speculating") + self.partial_program_order.init_latest_env_files(node_id) ## Attempt to rerun all pending nodes self.partial_program_order.attempt_rerun_pending_nodes() From 749a77feb9d9cc6acad04366b46af4b937027e1e Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 21 Sep 2023 10:26:02 -0600 Subject: [PATCH 75/90] Refactor env resolution to improve quality --- parallel-orch/partial_program_order.py | 97 +++++++++----------------- 1 file changed, 33 insertions(+), 64 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index a4ab583b..b533a57b 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -1446,91 +1446,60 @@ def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_i logging.debug("No significant differences found:") return False + def update_env_and_restart_nodes(self, node_id: NodeId): + logging.debug(f"Significant differences found between new and latest env files for {node_id}.") + logging.debug(f"Assigning node {node_id} new env (Wait) as the new latest env and re-executing.") + self.set_latest_env_file_for_node(node_id, self.get_new_env_file_for_node(node_id)) + self.prechecked_env.discard(node_id) + if node_id not in self.workset: + self.workset.append(node_id) + self.__kill_all_currently_executing_and_schedule_restart([node_id]) + for 
waiting_for_frontend_node in self.waiting_for_frontend: + if waiting_for_frontend_node not in self.workset: + self.workset.append(waiting_for_frontend_node) + most_recent_new_env = self.get_most_recent_possible_new_env_for_node(waiting_for_frontend_node) + self.set_latest_env_file_for_node(waiting_for_frontend_node, most_recent_new_env) + self.prechecked_env.discard(waiting_for_frontend_node) + assert(self.get_new_env_file_for_node(node_id) is not None) + assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None) + self.log_partial_program_order_info() + logging.debug("-") + self.waiting_for_frontend = set() + self.populate_to_be_resolved_dict() + def resolve_most_recent_envs_check_only_wait_node_early(self, node_id: NodeId, restarted_cmds=None): - # We check whether we received a wait for a node we haven't yet unrolled. - # We need to first unroll the node and then speculate about it. - # TODO: Maybe we could move unrolling earlier in handle_wait()? if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), self.get_latest_env_file_for_node(node_id)): - logging.debug(f"[Early] Significant differences found between new and latest env files for {node_id}.") - logging.debug(f"[Early] Assigning node {node_id} new env (Wait) as the new latest env and re-executing.") - self.set_latest_env_file_for_node(node_id, self.get_new_env_file_for_node(node_id)) - self.prechecked_env.discard(node_id) - if node_id not in self.workset: - self.workset.append(node_id) - - nodes_to_kill = self.get_currently_executing().copy() - self.__kill_all_currently_executing_and_schedule_restart([node_id]) - for waiting_for_frontend_node in self.waiting_for_frontend: - if waiting_for_frontend_node not in self.workset: - self.workset.append(waiting_for_frontend_node) - most_recent_new_env = self.get_most_recent_possible_new_env_for_node(waiting_for_frontend_node) - 
self.prechecked_env.discard(waiting_for_frontend_node) - self.set_latest_env_file_for_node(waiting_for_frontend_node, most_recent_new_env) - assert(self.get_new_env_file_for_node(node_id) is not None) - assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None) - self.log_partial_program_order_info() - logging.debug("-") - self.waiting_for_frontend = set() - self.populate_to_be_resolved_dict() + self.update_env_and_restart_nodes(node_id) else: self.prechecked_env.add(node_id) - - - - def maybe_resolve_most_recent_envs_and_continue_resolution(self, node_id: NodeId): - if node_id in self.waiting_for_frontend: - logging.debug(f"Node {node_id} received its new env from runtime, continuing full env resolution.") - self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) - else: - logging.debug(f"Node {node_id} received its new env from runtime, continuing early env resolution.") - self.resolve_most_recent_envs_check_only_wait_node_early(node_id) def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(self, node_id: NodeId, restarted_cmds=None): logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.") - # Node is no longer waiting to be resolved. It might have not been waiting at all. 
self.waiting_for_frontend.discard(node_id) if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), self.get_latest_env_file_for_node(node_id)): - logging.debug(f"Significant differences found between new and latest env files for {node_id}.") - logging.debug(f"Assigning node {node_id} new env (Wait) as the new latest env and re-executing.") - # If there are significant differences, set the new env as the latest (the one to run Riker with) - self.set_latest_env_file_for_node(node_id, self.get_new_env_file_for_node(node_id)) - # Add the node to the workset again - if node_id not in self.workset: - self.workset.append(node_id) - # Kill and restart all currently executing commands - # The envs are updated inside __kill_all_currently_executing_and_schedule_restart - self.__kill_all_currently_executing_and_schedule_restart([node_id]) - # For all other nodes not killed, we update the latest env and restart them - for waiting_for_frontend_node in self.waiting_for_frontend: - if waiting_for_frontend_node not in self.workset: - self.workset.append(waiting_for_frontend_node) - most_recent_new_env = self.get_most_recent_possible_new_env_for_node(waiting_for_frontend_node) - self.set_latest_env_file_for_node(waiting_for_frontend_node, most_recent_new_env) - self.prechecked_env.discard(waiting_for_frontend_node) - assert(self.get_new_env_file_for_node(node_id) is not None) - assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None) - self.log_partial_program_order_info() - logging.debug("-") - self.waiting_for_frontend = set() - self.populate_to_be_resolved_dict() + self.update_env_and_restart_nodes(node_id) else: logging.debug(f"Finding sets of commands that can be resolved after {node_id} finished executing and got its latest env") assert(node_id not in self.stopped) log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "WaitingToResolve", node_id) 
self.add_to_speculated(node_id) - ## We can now call the general resolution method that determines which commands - ## can be resolved (all their dependencies are done executing), and resolves them. - - # We will however attempt to resolve dependencies early for the remaining nodes self.resolve_dependencies_early(node_id) restarted_cmds = self.attempt_rerun_pending_nodes() - logging.debug(f"Restarted after successfull env resolution {restarted_cmds}") + logging.debug(f"Restarted after successful env resolution {restarted_cmds}") self.log_partial_program_order_info() self.resolve_commands_that_can_be_resolved_and_push_frontier() assert(self.valid()) - + + def maybe_resolve_most_recent_envs_and_continue_resolution(self, node_id: NodeId): + if node_id in self.waiting_for_frontend: + logging.debug(f"Node {node_id} received its new env from runtime, continuing full env resolution.") + self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) + else: + logging.debug(f"Node {node_id} received its new env from runtime, continuing early env resolution.") + self.resolve_most_recent_envs_check_only_wait_node_early(node_id) + def new_and_latest_env_files_have_significant_differences(self, new_env_file, latest_env_file): # Early resolution if same files are compared if new_env_file == latest_env_file: From 541873f9bd93d06f236c03b9bd3813a02fe90d2d Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 21 Sep 2023 10:31:48 -0600 Subject: [PATCH 76/90] Remove sleep while killing processes --- parallel-orch/util.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/parallel-orch/util.py b/parallel-orch/util.py index 8d5e5101..c9cb5d60 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -161,7 +161,11 @@ def kill_process_tree(pid, sig=signal.SIGTERM): logging.critical("PROCESS LOOKUP ERROR") # Check if processes are still alive - time.sleep(0.01) - - alive_processes = [f"{proc}-({proc.status()})" for proc in 
processes if proc.is_running()] + alive_processes = [] + for proc in processes: + try: + if proc.is_running(): + alive_processes.append(f"{proc}-({proc.status()})") + except: + pass return alive_processes From ea169bf5dd5df94b1d9155300f038bd2fb41a5bf Mon Sep 17 00:00:00 2001 From: gliargovas Date: Thu, 21 Sep 2023 11:29:26 -0600 Subject: [PATCH 77/90] Use most recent pash instance --- deps/pash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/pash b/deps/pash index e84a7125..760da58b 160000 --- a/deps/pash +++ b/deps/pash @@ -1 +1 @@ -Subproject commit e84a7125c813098beea659af7cce566f99e3266c +Subproject commit 760da58b6687a304b12931c5a3fef987a7327b74 From 97dea9f669b0635e78d732a020c235d4fd88b0ce Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 25 Sep 2023 01:19:21 -0600 Subject: [PATCH 78/90] Add a reversed instance of 1.sh --- report/benchmarks/dgsh/1_reversed.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 report/benchmarks/dgsh/1_reversed.sh diff --git a/report/benchmarks/dgsh/1_reversed.sh b/report/benchmarks/dgsh/1_reversed.sh new file mode 100644 index 00000000..6c728b6d --- /dev/null +++ b/report/benchmarks/dgsh/1_reversed.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +## Initialize the necessary temporary files +file1=$(mktemp) +cat $INPUT_FILE >"$file1" +printf 'File type:\t' +file - <"$file1" + +printf 'Original size:\t' +wc -c <"$file1" + +printf 'gzip:\t\t' +gzip -c <"$file1" | wc -c + +printf 'bzip2:\t\t' +bzip2 -c <"$file1" | wc -c + +printf 'xz:\t\t' +xz -c <"$file1" | wc -c From 8ba96532289a49b7238c0c1a223ccf519c8ff03c Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 25 Sep 2023 04:40:41 -0600 Subject: [PATCH 79/90] Add improved config for running dgsh benchmarks --- report/benchmark_config.json | 41 +++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/report/benchmark_config.json b/report/benchmark_config.json index 60454a1e..4da3ed1b 100644 --- 
a/report/benchmark_config.json +++ b/report/benchmark_config.json @@ -6,20 +6,6 @@ "command": "{TEST_SCRIPT_DIR}/dgsh/6.sh", "orch_args": "-d 2" }, - { - "name": "Dgsh 1.sh - 120M", - "env": ["INPUT_FILE={RESOURCE_DIR}/in120M.xml"], - "pre_execution_script": ["wget -nc -O in120M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.xml"], - "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh", - "orch_args": "-d 2" - }, - { - "name": "Dgsh 1.sh - 700M", - "env": ["INPUT_FILE={RESOURCE_DIR}/in700M.xml"], - "pre_execution_script": ["wget -nc -O in700M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/pir/psd7003.xml"], - "command": "{TEST_SCRIPT_DIR}/dgsh/1.sh", - "orch_args": "-d 2" - }, { "name": "Dgsh 2.sh (no function) - Τry Repo", "pre_execution_script": ["git clone https://github.com/binpash/try.git"], @@ -77,11 +63,18 @@ "orch_args": "-d 2" }, { - "name": "Dgsh 7.sh (no func)", + "name": "7.sh - kill", "env": ["INPUT_FILE={RESOURCE_DIR}/words.txt"], - "pre_execution_script": ["wget -nc -O words.txt https://raw.githubusercontent.com/dwyl/english-words/master/words.txt"], + "pre_execution_script": ["wget -nc -O weblog.log https://raw.githubusercontent.com/elastic/examples/master/Common%20Data%20Formats/apache_logs/apache_logs"], "command": "{TEST_SCRIPT_DIR}/dgsh/8_no_func.sh", - "orch_args": "-d 2" + "orch_args": "-d 2 --sandbox-killing" + }, + { + "name": "7.sh - kill", + "env": ["INPUT_FILE={RESOURCE_DIR}/words.txt"], + "pre_execution_script": ["wget -nc -O weblog.log https://raw.githubusercontent.com/elastic/examples/master/Common%20Data%20Formats/apache_logs/apache_logs"], + "command": "{TEST_SCRIPT_DIR}/dgsh/8_no_func.sh", + "orch_args": "-d 2 --sandbox-killing" }, { "name": "Dgsh 9.sh - Riker Repo", @@ -90,5 +83,19 @@ "command": "{TEST_SCRIPT_DIR}/dgsh/9.sh", "working_dir": "{RESOURCE_DIR}/riker", "orch_args": "-d 2" + }, + { + "name": "18.sh - 120M", + "env": ["INPUT_FILE={RESOURCE_DIR}/in120M.xml"], + 
"pre_execution_script": ["wget -nc -O in120M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.xml"], + "command": "{TEST_SCRIPT_DIR}/dgsh/18.sh", + "orch_args": "-d 2" + }, + { + "name": "18.sh - 120M - kill", + "env": ["INPUT_FILE={RESOURCE_DIR}/in120M.xml"], + "pre_execution_script": ["wget -nc -O in120M.xml http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.xml"], + "command": "{TEST_SCRIPT_DIR}/dgsh/18.sh", + "orch_args": "-d 2 --sandbox-killing" } ] From 74de062f5dcad8124661278e61db133d29c9f1cf Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 25 Sep 2023 04:56:55 -0600 Subject: [PATCH 80/90] Use correct speedup calculation --- report/benchmark_report.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/report/benchmark_report.py b/report/benchmark_report.py index 586c20a6..0eab564b 100644 --- a/report/benchmark_report.py +++ b/report/benchmark_report.py @@ -89,19 +89,24 @@ def run_command_with_orch(command, orch_args, working_dir=os.getcwd()): return (end_time - start_time, stdout.decode('utf-8'), stderr.decode('utf-8')) def compare_results(bash_output, orch_output): - bash_lines = bash_output.splitlines() - orch_lines = orch_output.splitlines() + if len(bash_output) <= 10000: + orch_lines = orch_output.splitlines()[:10000] + else: + bash_lines = bash_lines[:10000] + orch_lines = orch_output.splitlines()[:10000] # Compare lines d = difflib.ndiff(bash_lines, orch_lines) return [diff for diff in d if diff.startswith('- ') or diff.startswith('+ ')] def print_results(benchmark_name, bash_time, orch_time, diff_lines, diff_percentage): - if orch_time < bash_time: - comparison_result = f"hs is {round(diff_percentage/100, 1)}x ({diff_percentage:.2f}%) faster than Bash" + if bash_time > orch_time: + speedup = bash_time / orch_time + comparison_result = f"hs is {speedup:.2f}x ({bash_time - orch_time:.2f}s) faster than Bash" else: - comparison_result = f"hs is 
{round(diff_percentage/100, 1)}x ({diff_percentage:.2f}%) slower than Bash" + speedup = orch_time / bash_time + comparison_result = f"hs is {speedup:.2f}x ({orch_time - bash_time:.2f}s) slower than Bash" print("-" * 40) print(f"Results for benchmark: {benchmark_name}") print(f"Bash Execution Time: {round(bash_time, 3)}s") From 09f35792a7ef9f54e3278733bc7d6bb6e423fe01 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 25 Sep 2023 05:15:26 -0600 Subject: [PATCH 81/90] Move env resolution after loop unrolling but before po progression --- parallel-orch/scheduler_server.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 11483592..cc4067f4 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -117,14 +117,15 @@ def handle_wait(self, input_cmd: str, connection): ## Inform the partial order that we received a wait for a node so that it can push loops ## forward and so on. 
- self.partial_program_order.wait_received(node_id) - + self.partial_program_order.maybe_unroll(node_id) + # Moved this below wait_received, in order to support unrolled loop nodes self.partial_program_order.maybe_resolve_most_recent_envs_and_continue_resolution(node_id) + self.partial_program_order.wait_received(node_id) + ## If the node_id is already committed, just return its exit code if node_id in self.partial_program_order.get_committed(): - # TODO: Env check and if no conflicts, commit logging.debug(f'Node: {node_id} found in committed, responding immediately!') self.waiting_for_response[node_id] = connection self.respond_to_pending_wait(node_id) From 25c854db9fa7a7045fd07d64525167dfd7c9ac8d Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 25 Sep 2023 05:44:28 -0600 Subject: [PATCH 82/90] Make update_and_restart_nodes work with the transitive closure of the given node --- parallel-orch/partial_program_order.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index b533a57b..3289d077 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -660,7 +660,7 @@ def cmd_can_be_resolved(self, node_id: int) -> bool: logging.debug(f' >> Able to resolve {node_id}') return True - def __kill_all_currently_executing_and_schedule_restart(self, node_ids: "list[NodeId]", start=None): + def __kill_all_currently_executing_and_schedule_restart(self, start=None): nodes_to_kill = self.get_currently_executing() if start is not None: nodes_to_kill = [node_id for node_id in nodes_to_kill if node_id in self.get_transitive_closure([start])] @@ -1453,10 +1453,12 @@ def update_env_and_restart_nodes(self, node_id: NodeId): self.prechecked_env.discard(node_id) if node_id not in self.workset: self.workset.append(node_id) - self.__kill_all_currently_executing_and_schedule_restart([node_id]) + 
self.__kill_all_currently_executing_and_schedule_restart(start=node_id) + new_waiting_for_frontend = self.waiting_for_frontend.copy() for waiting_for_frontend_node in self.waiting_for_frontend: - if waiting_for_frontend_node not in self.workset: + if waiting_for_frontend_node not in self.workset and waiting_for_frontend_node in self.get_transitive_closure([node_id]): self.workset.append(waiting_for_frontend_node) + new_waiting_for_frontend.remove(waiting_for_frontend_node) most_recent_new_env = self.get_most_recent_possible_new_env_for_node(waiting_for_frontend_node) self.set_latest_env_file_for_node(waiting_for_frontend_node, most_recent_new_env) self.prechecked_env.discard(waiting_for_frontend_node) @@ -1464,7 +1466,7 @@ def update_env_and_restart_nodes(self, node_id: NodeId): assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None) self.log_partial_program_order_info() logging.debug("-") - self.waiting_for_frontend = set() + self.waiting_for_frontend = new_waiting_for_frontend self.populate_to_be_resolved_dict() def resolve_most_recent_envs_check_only_wait_node_early(self, node_id: NodeId, restarted_cmds=None): From 353101f98323c5a2b40341e515bef5da1297737e Mon Sep 17 00:00:00 2001 From: gliargovas Date: Mon, 25 Sep 2023 23:59:40 -0600 Subject: [PATCH 83/90] Clarify hs is not a shell --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 44cce850..db90fe82 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ### Overview -`hs` is an out-of-order shell designed to execute script commands in a speculative parallel manner. It achieves this by tracing the script's execution, and if an error arises due to speculative execution, the script re-executes the necessary parts to ensure correct outcomes. The project aims to boost the parallel execution of shell scripts, reducing their runtime and enhancing efficiency. +`hs` is a system for executing shell scripts out of order. 
It achieves this by tracing the script's execution, and if an error arises due to speculative execution, the script re-executes the necessary parts to ensure correct outcomes. The project aims to boost the parallel execution of shell scripts, reducing their runtime and enhancing efficiency. ### Structure From f9fb245501e0feba387a24880f6e18517a3340a4 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 26 Sep 2023 00:05:44 -0600 Subject: [PATCH 84/90] Capitalize config global variables --- parallel-orch/config.py | 8 ++++---- parallel-orch/partial_program_order.py | 10 +++++----- parallel-orch/scheduler_server.py | 12 ++++++------ parallel-orch/util.py | 6 +++--- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/parallel-orch/config.py b/parallel-orch/config.py index 0796435b..e252040e 100644 --- a/parallel-orch/config.py +++ b/parallel-orch/config.py @@ -43,8 +43,8 @@ def log_root(msg, *args, **kwargs): START_TIME = time.time() -named_timestamps = {} +NAMED_TIMESTAMPS = {} -sandbox_killing = False -all_node_env_resolution = False -speculate_immidiately = False \ No newline at end of file +SANDBOX_KILLING = False +ALL_NODE_ENV_RESOLUTION = False +SPECULATE_IMMEDIATELY = False \ No newline at end of file diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 3289d077..00d0dd06 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -409,7 +409,7 @@ def init_partial_order(self): self.init_workset() logging.debug(f'Initialized workset') self.populate_to_be_resolved_dict() - if config.speculate_immidiately: + if config.SPECULATE_IMMEDIATELY: self.init_latest_env_files() logging.debug(f'To be resolved sets per node:') logging.debug(self.to_be_resolved) @@ -616,8 +616,8 @@ def add_to_write_set(self, node_id: NodeId, item: str): def add_to_speculated(self, node_id: NodeId): self.speculated = self.speculated.union([node_id]) - def 
is_first_node_when_env_is_uninitialized(self, speculate_immidiately): - if not speculate_immidiately: + def is_first_node_when_env_is_uninitialized(self, speculate_immediately): + if not speculate_immediately: starting_env_node = self.get_source_nodes() ## We may have a loop node at the start ## In that case, we roll back to the initial env @@ -750,7 +750,7 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self): else: logging.debug(f" > Nodes to be committed this round: {to_commit}") logging.trace(f"Commit|"+",".join(str(node_id) for node_id in to_commit)) - if config.sandbox_killing: + if config.SANDBOX_KILLING: logging.info("Sandbox killing") self.__kill_all_currently_executing_and_schedule_restart(to_commit) log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling") @@ -1264,7 +1264,7 @@ def attempt_move_stopped_to_workset(self): ## TODO: Eventually, in the future, let's add here some form of limit def schedule_work(self, limit=0): - if self.is_first_node_when_env_is_uninitialized(config.speculate_immidiately): + if self.is_first_node_when_env_is_uninitialized(config.SPECULATE_IMMEDIATELY): logging.debug("Not scheduling work yet, waiting for first Wait") return # self.log_partial_program_order_info() diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index cc4067f4..ebc16ac7 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -34,10 +34,10 @@ def parse_args(): action="store_true", default=None, help="When receiving a wait check for env changes between the current node and all other waiting nodes, instead of only examining the current wait node.") - parser.add_argument("--speculate-immidiately", + parser.add_argument("--speculate-immediately", action="store_true", default=False, - help="Speculate immidiately instead of waiting for the first Wait message.") + help="Speculate immediately instead of waiting for the first Wait message.") args, unknown_args = 
parser.parse_known_args() return args @@ -108,7 +108,7 @@ def handle_wait(self, input_cmd: str, connection): ## Set the new env file for the node self.partial_program_order.set_new_env_file_for_node(node_id, pash_runtime_vars_file_str) - if self.partial_program_order.is_first_node_when_env_is_uninitialized(config.speculate_immidiately): + if self.partial_program_order.is_first_node_when_env_is_uninitialized(config.SPECULATE_IMMEDIATELY): logging.debug("Initializing latest env and speculating") self.partial_program_order.init_latest_env_files(node_id) @@ -314,9 +314,9 @@ def main(): # logging.getLogger().setLevel(logging.TRACE) # Set optimization options - config.sandbox_killing = args.sandbox_killing - config.all_node_env_resolution = args.env_check_all_nodes_on_wait - config.speculate_immidiately = args.speculate_immidiately + config.SANDBOX_KILLING = args.sandbox_killing + config.ALL_NODE_ENV_RESOLUTION = args.env_check_all_nodes_on_wait + config.SPECULATE_IMMEDIATELY = args.speculate_immediately scheduler = Scheduler(config.SCHEDULER_SOCKET) scheduler.run() diff --git a/parallel-orch/util.py b/parallel-orch/util.py index c9cb5d60..1ed96118 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -103,12 +103,12 @@ def log_time_delta_from_start(module: str, action: str, node=None): def set_named_timestamp(action: str, node=None, key=None): if key is None: key = f"{action}{',' + str(node) if node is not None else ''}" - config.named_timestamps[key] = time.time() + config.NAMED_TIMESTAMPS[key] = time.time() def invalidate_named_timestamp(action: str, node=None, key=None): if key is None: key = f"{action}{',' + str(node) if node is not None else ''}" - del config.named_timestamps[key] + del config.NAMED_TIMESTAMPS[key] def log_time_delta_from_start_and_set_named_timestamp(module: str, action: str, node=None, key=None): try: @@ -121,7 +121,7 @@ def log_time_delta_from_named_timestamp(module: str, action: str, node=None, key try: if key is None: key = 
f"{action}{',' + str(node) if node is not None else ''}" - logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}|Step time:{to_milliseconds_str(time.time() - config.named_timestamps[key])}") + logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}|Step time:{to_milliseconds_str(time.time() - config.NAMED_TIMESTAMPS[key])}") if invalidate: invalidate_named_timestamp(action, node, key) except KeyError: From 54ec04287e96e918cdc3a62b96c22fb4da600cc2 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 26 Sep 2023 00:06:40 -0600 Subject: [PATCH 85/90] Removeredundant commented-out assertion --- parallel-orch/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/parallel-orch/config.py b/parallel-orch/config.py index e252040e..8e170ec6 100644 --- a/parallel-orch/config.py +++ b/parallel-orch/config.py @@ -28,7 +28,6 @@ def log_root(msg, *args, **kwargs): ## Ensure that PASH_TMP_PREFIX is set by pa.sh -# assert(not os.getenv('PASH_SPEC_TMP_PREFIX') is None) PASH_SPEC_TMP_PREFIX = os.getenv('PASH_SPEC_TMP_PREFIX') SOCKET_BUF_SIZE = 8192 From 5c7ef20c9b4cfe39023104ebb55c163ed54ddaaa Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 26 Sep 2023 00:19:09 -0600 Subject: [PATCH 86/90] Addexplanatory comments and future full PO assertions --- parallel-orch/partial_program_order.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 00d0dd06..cf6575d4 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -283,6 +283,8 @@ def __init__(self, nodes, edges, initial_env_file): self.latest_envs = {} self.initial_env_file = initial_env_file self.waiting_for_frontend = set() + ## In case we spot a dependency meaning a node must execute 
after another node, it will appear here + ## Contains the nodes to execute only after the key node finishes execution self.run_after = defaultdict(set) self.pending_to_execute = set() self.to_be_resolved_prev = {} @@ -367,10 +369,17 @@ def get_most_recent_possible_new_env_for_node(self, node_id) -> str: most_recent_env_node = node_id while self.get_new_env_file_for_node(most_recent_env_node) is None: predecessor = self.get_prev(most_recent_env_node) + + ## This will trigger when we move to full Partial Orders + assert len(predecessor) <= 1 + + ## If there are no predecessors for a node it means we are at the source + ## so there is no point to search further back if len(predecessor) == 0: - return None + break else: most_recent_env_node = predecessor[0] + return self.get_new_env_file_for_node(most_recent_env_node) ## This returns all previous nodes of a sub partial order From 06c1e074a75cf5d8365016422cbffb0adde3f30d Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 26 Sep 2023 01:41:12 -0600 Subject: [PATCH 87/90] Refactor dependency resolution by abstracting early and late resolution --- parallel-orch/partial_program_order.py | 96 ++++++++++++++------------ 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index cf6575d4..3bc59266 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -702,40 +702,6 @@ def __kill_node(self, cmd_id: "NodeId"): logging.critical(proc) else: logging.debug("All processes were successfully terminated.") - - def resolve_dependencies_early(self, node_id=None): - to_check = {node for node in self.waiting_for_frontend if node not in self.speculated} - if node_id: - to_check.add(node_id) - node_id_has_dependency = False - for second_cmd_id in to_check: - ## reverse sort breaks because it does not guarantee that the new env has arrived - for first_cmd_id in sorted(self.to_be_resolved[second_cmd_id], 
reverse=True): - if self.rw_sets.get(first_cmd_id) is not None: - if self.has_forward_dependency(first_cmd_id, second_cmd_id): - # if second_cmd_id not in self.workset and self.check_if_to_be_resolved_entry_would_change(second_cmd_id): - node_id_has_dependency = True - self.waiting_for_frontend.discard(second_cmd_id) - self.run_after[first_cmd_id].add(second_cmd_id) - self.pending_to_execute.add(second_cmd_id) - # self.workset.append(second_cmd_id) - logging.debug(f"Early resolution: Rerunning node {second_cmd_id} after {first_cmd_id} because of a dependency") - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", second_cmd_id) - break - # if node_id_has_dependency == True: - self.populate_to_be_resolved_dict() - for node in self.pending_to_execute: - prev_to_be_resoved = self.to_be_resolved_prev.get(node) - if prev_to_be_resoved is None: - return - elif set(self.to_be_resolved[node]) == set(prev_to_be_resoved): - # Not caring about this dependency because env has not yet changed - logging.debug() - self.pending_to_execute.remove(node) - for k, v in self.run_after.items(): - if node in v: - self.run_after[k].remove(node) - return def resolve_commands_that_can_be_resolved_and_push_frontier(self): # This may be obsolete since we only resolve one node at a time @@ -765,23 +731,61 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self): log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling") self.commit_cmd_workspaces(to_commit) + def check_dependencies(self, cmds_to_check, get_first_cmd_ids_fn, update_state_due_to_a_dependency_fn): + for second_cmd_id in cmds_to_check: + for first_cmd_id in get_first_cmd_ids_fn(second_cmd_id): + + if self.rw_sets.get(first_cmd_id) is not None and self.has_forward_dependency(first_cmd_id, second_cmd_id): + update_state_due_to_a_dependency_fn(first_cmd_id, second_cmd_id) + + # Internal function, modified the run_after dict and the pending_to_execute set + def __populate_run_after_dict(self): 
+ for node in self.pending_to_execute.copy(): + prev_to_be_resolved = self.to_be_resolved_prev.get(node) + if prev_to_be_resolved is None: + return + # Check if env has changed since last comparison + elif set(self.to_be_resolved[node]) == set(prev_to_be_resolved): + # Not caring about this dependency because env has not yet changed + self.pending_to_execute.remove(node) + for k, v in self.run_after.items(): + if node in v: + self.run_after[k].remove(node) + + ## Spots dependencies and updates the state. + ## Safe to call everywhere + def resolve_dependencies_early(self, node_id=None): + def get_first_cmd_ids(second_cmd_id): + return sorted(self.to_be_resolved[second_cmd_id], reverse=True) + + def update_state_due_to_a_dependency(first_cmd_id, second_cmd_id): + self.waiting_for_frontend.discard(second_cmd_id) + self.run_after[first_cmd_id].add(second_cmd_id) + self.pending_to_execute.add(second_cmd_id) + logging.debug(f"Early resolution: Rerunning node {second_cmd_id} after {first_cmd_id} because of a dependency") + log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", second_cmd_id) + + to_check = {node for node in self.waiting_for_frontend if node not in self.speculated} + if node_id is not None: + to_check.add(node_id) + self.check_dependencies(to_check, get_first_cmd_ids, update_state_due_to_a_dependency) + self.populate_to_be_resolved_dict() + self.__populate_run_after_dict() def resolve_dependencies(self, cmds_to_resolve): - # Init stuff + def get_first_cmd_ids(second_cmd_id): + return sorted([cmd_id for cmd_id in self.to_be_resolved[second_cmd_id] if cmd_id not in self.stopped]) + + def update_state_due_to_a_dependency(first_cmd_id, second_cmd_id): + logging.debug(f' > Command {second_cmd_id} was added to the workset, due to a forward dependency with {first_cmd_id}') + new_workset.add(second_cmd_id) + new_workset = set() - for second_cmd_id in sorted(cmds_to_resolve): - first_cmd_ids = sorted([cmd_id for cmd_id in 
self.to_be_resolved[second_cmd_id] if cmd_id not in self.stopped]) - for first_cmd_id in first_cmd_ids: - if second_cmd_id not in new_workset: - ## We only check for forward dependencies if the first node is not a loop (abstract) node - if self.is_loop_node(first_cmd_id): - logging.debug(f' > Skipping dependency check with node {first_cmd_id} because it is a loop node') - continue - if self.has_forward_dependency(first_cmd_id, second_cmd_id): - logging.debug(f' > Command {second_cmd_id} was added to the workset, due to a forward dependency with {first_cmd_id}') - new_workset.add(second_cmd_id) + self.check_dependencies(sorted(cmds_to_resolve), get_first_cmd_ids, update_state_due_to_a_dependency) + return new_workset + ## Resolve all the forward dependencies and update the workset ## Forward dependency is when a command's output is the same ## as the input of a following command From 2f9372cdd21376037641a69344cee78720cd477b Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 26 Sep 2023 01:43:55 -0600 Subject: [PATCH 88/90] Remove redundant arg --- parallel-orch/scheduler_server.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index ebc16ac7..2d673e71 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -30,14 +30,6 @@ def parse_args(): action="store_true", default=False, help="Kill any running overlay instances before commiting to the lower layer") - parser.add_argument("--env-check-all-nodes-on-wait", - action="store_true", - default=None, - help="When receiving a wait check for env changes between the current node and all other waiting nodes, instead of only examining the current wait node.") - parser.add_argument("--speculate-immediately", - action="store_true", - default=False, - help="Speculate immediately instead of waiting for the first Wait message.") args, unknown_args = parser.parse_known_args() return args From 
929070bb5cfc575af9ef181cb51264545c56e11c Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 26 Sep 2023 01:45:08 -0600 Subject: [PATCH 89/90] Use most recent pash changes --- deps/pash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/pash b/deps/pash index 760da58b..9044f6db 160000 --- a/deps/pash +++ b/deps/pash @@ -1 +1 @@ -Subproject commit 760da58b6687a304b12931c5a3fef987a7327b74 +Subproject commit 9044f6dbb79f2bd90a9076453932fe842ea8ba09 From 8935a843a7c573db1af86e53a1a17af7e0433725 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 26 Sep 2023 02:51:37 -0600 Subject: [PATCH 90/90] Resolve merge conflicts with main correctly --- parallel-orch/config.py | 1 - parallel-orch/executor.py | 14 +++++++------- parallel-orch/partial_program_order.py | 14 +++++++------- parallel-orch/scheduler_server.py | 9 ++++++--- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/parallel-orch/config.py b/parallel-orch/config.py index 8e170ec6..6d9b2abf 100644 --- a/parallel-orch/config.py +++ b/parallel-orch/config.py @@ -45,5 +45,4 @@ def log_root(msg, *args, **kwargs): NAMED_TIMESTAMPS = {} SANDBOX_KILLING = False -ALL_NODE_ENV_RESOLUTION = False SPECULATE_IMMEDIATELY = False \ No newline at end of file diff --git a/parallel-orch/executor.py b/parallel-orch/executor.py index 3b33025c..0349285e 100644 --- a/parallel-orch/executor.py +++ b/parallel-orch/executor.py @@ -12,18 +12,18 @@ def async_run_and_trace_command_return_trace(command, node_id, latest_env_file, trace_file = util.ptempfile() stdout_file = util.ptempfile() stderr_file = util.ptempfile() - post_exec_env = util.ptempfile() + post_execution_env_file = util.ptempfile() logging.debug(f'Scheduler: Stdout file for: {node_id} is: {stdout_file}') logging.debug(f'Scheduler: Stderr file for: {node_id} is: {stderr_file}') logging.debug(f'Scheduler: Trace file for: {node_id}: {trace_file}') - process = async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, 
stdout_file, stderr_file, latest_env_file, post_exec_env, speculate_mode) - return process, trace_file, stdout_file, stderr_file, post_exec_env + process = async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, speculate_mode) + return process, trace_file, stdout_file, stderr_file, post_execution_env_file def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, node_id, latest_env_file): - process, trace_file, stdout_file, stderr_file, post_exec_env = async_run_and_trace_command_return_trace(command, node_id, latest_env_file, speculate_mode=True) - return process, trace_file, stdout_file, stderr_file, post_exec_env + process, trace_file, stdout_file, stderr_file, post_execution_env_file = async_run_and_trace_command_return_trace(command, node_id, latest_env_file, speculate_mode=True) + return process, trace_file, stdout_file, stderr_file, post_execution_env_file -def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_exec_env, speculate_mode=False): +def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, speculate_mode=False): ## Call Riker to execute the command run_script = f'{config.PASH_SPEC_TOP}/parallel-orch/run_command.sh' args = ["/bin/bash", run_script, command, trace_file, stdout_file, latest_env_file] @@ -32,7 +32,7 @@ def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, nod else: args.append("standard") args.append(str(node_id)) - args.append(post_exec_env) + args.append(post_execution_env_file) # Save output to temporary files to not saturate the memory logging.debug(args) process = subprocess.Popen(args, stdout=None, stderr=None) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 
ce27ad57..8aabcd7f 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -15,17 +15,17 @@ class CompletedNodeInfo: - def __init__(self, exit_code, post_exec_env, stdout_file, sandbox_dir): + def __init__(self, exit_code, post_execution_env_file, stdout_file, sandbox_dir): self.exit_code = exit_code - self.post_exec_env = post_exec_env + self.post_execution_env_file = post_execution_env_file self.stdout_file = stdout_file self.sandbox_dir = sandbox_dir def get_exit_code(self): return self.exit_code - def get_post_exec_env(self): - return self.post_exec_env + def get_post_execution_env_file(self): + return self.post_execution_env_file def get_stdout_file(self): return self.stdout_file @@ -34,7 +34,7 @@ def get_sandbox_dir(self): return self.sandbox_dir def __str__(self): - return f'CompletedNodeInfo(ec:{self.get_exit_code()}, env:{self.get_post_exec_env()}, stdout:{self.get_stdout_file()}, sandbox:{self.get_sandbox_dir()})' + return f'CompletedNodeInfo(ec:{self.get_exit_code()}, env:{self.get_post_execution_env_file()}, stdout:{self.get_stdout_file()}, sandbox:{self.get_sandbox_dir()})' ## This class is used for both loop contexts and loop iters ## The indices go from inner to outer @@ -690,7 +690,7 @@ def __kill_all_currently_executing_and_schedule_restart(self, start=None): def __kill_node(self, cmd_id: "NodeId"): logging.debug(f'Killing and restarting node {cmd_id} because some workspaces have to be committed') - proc_to_kill, trace_file, _stdout, _stderr, _post_exec_env = self.commands_currently_executing.pop(cmd_id) + proc_to_kill, trace_file, _stdout, _stderr, _post_execution_env_file = self.commands_currently_executing.pop(cmd_id) # Add the trace file to the banned file list so we know to ignore the CommandExecComplete response self.banned_files.add(trace_file) @@ -1399,7 +1399,7 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand ## Save the completed node info. 
Note that if the node doesn't commit ## this information will be invalid and rewritten the next time execution ## is completed for this node. - completed_node_info = CompletedNodeInfo(cmd_exit_code, post_exec_env, stdout, sandbox_dir) + completed_node_info = CompletedNodeInfo(cmd_exit_code, post_execution_env_file, stdout, sandbox_dir) self.nodes[node_id].set_completed_info(completed_node_info) ## We no longer add failed commands to the stopped set, diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 4ba8ee66..29ebddbf 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -29,7 +29,11 @@ def parse_args(): parser.add_argument("--sandbox-killing", action="store_true", default=False, - help="Kill any running overlay instances before commiting to the lower layer") + help="Kill any running overlay instances before commiting to the lower layer") + parser.add_argument("--speculate-immediately", + action="store_true", + default=False, + help="Speculate immediately instead of waiting for the first Wait message.") args, unknown_args = parser.parse_known_args() return args @@ -162,7 +166,7 @@ def respond_to_pending_wait(self, node_id: int): ## Get the completed node info node = self.partial_program_order.get_node(node_id) completed_node_info = node.get_completed_node_info() - msg = f'{completed_node_info.get_exit_code()} {completed_node_info.get_post_exec_env()} {completed_node_info.get_stdout_file()}' + msg = f'{completed_node_info.get_exit_code()} {completed_node_info.get_post_execution_env_file()} {completed_node_info.get_stdout_file()}' response = success_response(msg) ## Send the response self.respond_to_frontend_core(node_id, response) @@ -307,7 +311,6 @@ def main(): # Set optimization options config.SANDBOX_KILLING = args.sandbox_killing - config.ALL_NODE_ENV_RESOLUTION = args.env_check_all_nodes_on_wait config.SPECULATE_IMMEDIATELY = args.speculate_immediately scheduler = 
Scheduler(config.SCHEDULER_SOCKET) scheduler.run()