diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 193e1af2..1e187580 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -31,6 +31,10 @@ jobs: if: github.event.pull_request.draft == false steps: - uses: actions/checkout@v2 + - name: Set up Python 3.11 + uses: actions/setup-python@v2 + with: + python-version: '3.11' - name: Running Correctness Tests run: | cd .. diff --git a/.gitignore b/.gitignore index ca6b382a..6f4d6192 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ scripts/vars.sh *__pycache__* .DS_Store +*~ +\#*\# +.\#* \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index ba4f5c51..bf6ada7c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,3 @@ -[submodule "deps/riker"] - path = deps/riker - url = https://github.com/angelhof/riker.git - branch = eric-custom-db-store [submodule "deps/pash"] path = deps/pash url = https://github.com/binpash/pash.git diff --git a/deps/riker b/deps/riker deleted file mode 160000 index f3bee7ba..00000000 --- a/deps/riker +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f3bee7ba19b8834199ff49dac53852f09f03338a diff --git a/deps/try b/deps/try index 37bbf7da..ba6a9061 160000 --- a/deps/try +++ b/deps/try @@ -1 +1 @@ -Subproject commit 37bbf7da5bfde97f598c3327c9582d9b08d7e264 +Subproject commit ba6a90615944203a95d5a86638447da34e539d1b diff --git a/parallel-orch/executor.py b/parallel-orch/executor.py index 0349285e..2ec5e962 100644 --- a/parallel-orch/executor.py +++ b/parallel-orch/executor.py @@ -8,31 +8,33 @@ # and traces them with Riker. # All commands are run inside an overlay sandbox. 
def async_run_and_trace_command_return_trace(command, concrete_node_id, execution_id, pre_execution_env_file, speculate_mode=False):
    """Start `command` under tracing inside a freshly created sandbox.

    Temporary files for the trace, stdout, stderr and the post-execution
    environment are allocated with ``util.ptempfile`` and a sandbox is created
    with ``util.create_sandbox``; the launch itself is delegated to
    ``async_run_and_trace_command_return_trace_in_sandbox``.

    Returns:
        The 7-tuple ``(process, trace_file, stdout_file, stderr_file,
        pre_execution_env_file, post_execution_env_file, sandbox_dir)``.
    """
    trace_file = util.ptempfile()
    stdout_file = util.ptempfile()
    stderr_file = util.ptempfile()
    post_execution_env_file = util.ptempfile()
    sandbox_dir, tmp_dir = util.create_sandbox()
    logging.debug(f'Scheduler: Stdout file for: {concrete_node_id} is: {stdout_file}')
    logging.debug(f'Scheduler: Stderr file for: {concrete_node_id} is: {stderr_file}')
    logging.debug(f'Scheduler: Trace file for: {concrete_node_id}: {trace_file}')
    process = async_run_and_trace_command_return_trace_in_sandbox(
        command, execution_id, trace_file, concrete_node_id,
        stdout_file, stderr_file,
        pre_execution_env_file, post_execution_env_file,
        sandbox_dir, tmp_dir, speculate_mode)
    return process, trace_file, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir


def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, execution_id, concrete_node_id, pre_execution_env_file):
    """Speculative variant of ``async_run_and_trace_command_return_trace``.

    Returns:
        The 6-tuple ``(process, trace_file, stdout_file, stderr_file,
        post_execution_env_file, sandbox_dir)``.
    """
    # The callee takes (command, concrete_node_id, execution_id, ...) whereas
    # this wrapper receives (command, execution_id, concrete_node_id, ...):
    # pass by keyword so the two ids cannot be swapped.
    # The callee also returns seven values (it echoes the pre-execution env
    # file back), so all seven must be unpacked here.
    (process, trace_file, stdout_file, stderr_file,
     _pre_execution_env_file, post_execution_env_file, sandbox_dir) = \
        async_run_and_trace_command_return_trace(
            command,
            concrete_node_id=concrete_node_id,
            execution_id=execution_id,
            pre_execution_env_file=pre_execution_env_file,
            speculate_mode=True)
    return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir
class NodeState(Enum):
    """Lifecycle states of a concrete node."""
    INIT = auto()
    READY = auto()
    COMMITTED = auto()
    STOP = auto()
    SPECULATED = auto()
    EXECUTING = auto()
    SPEC_EXECUTING = auto()
    UNSAFE = auto()


def state_pstr(state: NodeState):
    """Return a fixed-width (6 character) label for `state`.

    Labels are right-aligned so that every state prints with the same width.
    """
    same_length_state_str = {
        NodeState.INIT: '  INIT',
        NodeState.READY: ' READY',
        NodeState.COMMITTED: 'COMMIT',
        NodeState.STOP: '  STOP',
        NodeState.SPECULATED: 'SPEC_F',
        NodeState.EXECUTING: '   EXE',
        NodeState.SPEC_EXECUTING: 'SPEC_E',
        NodeState.UNSAFE: 'UNSAFE'
    }
    return same_length_state_str[state]


class RWSet:
    """A pair of sets: the items a command read and the items it wrote."""

    def __init__(self, read_set: set, write_set: set):
        self.read_set = read_set
        self.write_set = write_set

    def add_to_read_set(self, item: str):
        self.read_set.add(item)

    def add_to_write_set(self, item: str):
        self.write_set.add(item)

    def get_read_set(self) -> set:
        return self.read_set

    def get_write_set(self) -> set:
        return self.write_set

    def has_conflict(self, other: 'RWSet') -> bool:
        """Return True when `self` and `other` have a W/R, R/W or W/W overlap."""
        return bool(self.get_conflict(other))

    def get_conflict(self, other: 'RWSet') -> set:
        """Return every item involved in a W/R, R/W or W/W overlap."""
        write_read = self.write_set & other.read_set
        read_write = self.read_set & other.write_set
        write_write = self.write_set & other.write_set
        return write_read | read_write | write_write

    def __str__(self):
        return f"RW(R:{self.get_read_set()}, W:{self.get_write_set()})"


class NodeId:
    """Identifier of an abstract node, wrapping the integer `id_`."""

    def __init__(self, id_: int):
        self.id_ = id_

    def get_non_iter_id(self):
        """Return a fresh NodeId carrying the same `id_`."""
        return NodeId(self.id_)

    def __repr__(self):
        ## TODO: Represent it using n.
        output = f'{self.id_}'
        return output

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        # Comparing against a foreign object (e.g. None) must answer False
        # instead of raising AttributeError.
        if not isinstance(other, NodeId):
            return NotImplemented
        return self.id_ == other.id_

    def __ne__(self, other):
        return not(self == other)

    def __lt__(self, obj):
        # Order by the id itself; ordering by the string form would sort
        # 10 before 9.
        if not isinstance(obj, NodeId):
            return NotImplemented
        return self.id_ < obj.id_

    def __gt__(self, obj):
        if not isinstance(obj, NodeId):
            return NotImplemented
        return self.id_ > obj.id_

    @staticmethod
    def parse_node_id(node_id_str: str):
        """Build a NodeId from the decimal string `node_id_str`."""
        return NodeId(int(node_id_str))
@dataclass
class ExecCtxt:
    """Handles and file paths belonging to one launched execution."""
    process: Popen
    trace_file: str
    stdout: str
    stderr: str
    pre_env_file: str
    post_env_file: str
    sandbox_dir: str


@dataclass
class ExecResult:
    """Exit code and process id recorded for an execution."""
    exit_code: int
    proc_id: int


class LoopStack:
    """A list of loop contexts or loop iteration counters."""

    def __init__(self, loop_contexts_or_iters=None):
        if loop_contexts_or_iters is None:
            self.loops = []
        else:
            self.loops = loop_contexts_or_iters

    def __repr__(self):
        ## TODO: Represent it using 'it', 'it0', 'it1', etc
        ##       or -(iters)- in front of it.
        output = "-".join([str(it) for it in self.loops])
        return output

    def __eq__(self, other):
        # Foreign operands compare unequal instead of raising AttributeError.
        if not isinstance(other, LoopStack):
            return NotImplemented
        return self.loops == other.loops


@dataclass
class Node:
    """An abstract program node: id, command text, ASTs and loop context."""
    id_: "NodeId"
    cmd: str
    asts: "list[AstNode]"
    loop_context: LoopStack

    def __init__(self, id_, cmd, asts, loop_context=None):
        self.id_ = id_
        self.cmd = cmd
        self.asts = asts
        # A missing loop context means the node is outside any loop.
        self.loop_context = loop_context if loop_context else LoopStack()


class ConcreteNodeId:
    """A node id paired with the loop iteration counters it was reached under."""

    def __init__(self, node_id: "NodeId", loop_iters=()):
        self.node_id = node_id
        # Stored as a tuple so the id is hashable.
        self.loop_iters = tuple(loop_iters)

    def __repr__(self):
        return f'cnid({self.node_id.id_})'

    def __hash__(self):
        return hash((self.node_id, self.loop_iters))

    def __eq__(self, other):
        # Foreign operands compare unequal instead of raising AttributeError.
        if not isinstance(other, ConcreteNodeId):
            return NotImplemented
        return self.node_id == other.node_id and self.loop_iters == other.loop_iters

    def __str__(self):
        # Format: "<node_id>@" followed by "-<iter>" per loop iteration.
        return f'{self.node_id}@' + ''.join(['-' + str(n) for n in self.loop_iters])

    @staticmethod
    def parse(input_str):
        """Inverse of ``__str__``: parse ``"<node_id>@-<i>-<j>..."``."""
        node_id_str, loop_iters_str = input_str.split('@')
        # The text after '@' starts with '-', so the first split item is
        # empty and is dropped.
        return ConcreteNodeId(NodeId(int(node_id_str)),
                              [int(cnt) for cnt in loop_iters_str.split('-')[1:]])
class ConcreteNode:
    """A concrete instance of an abstract `Node`, with its execution state.

    Tracks the node's `NodeState`, the context of its current execution
    (`exec_ctxt`), the result of a finished execution (`exec_result`) and the
    read/write set gathered from the execution trace.
    """
    cnid: "ConcreteNodeId"
    abstract_node: "Node"
    state: "NodeState"
    # Used for identifying the most recent valid execution
    exec_id: int
    # Nodes to check for fs dependencies before this node can be committed
    # for this particular execution of the main sandbox.
    # No need to do the same for the background sandbox since it will always get committed.
    to_be_resolved_snapshot: "set[NodeId]"
    # Read and write sets for this node
    rwset: "RWSet"
    # The wait trace file for this node
    wait_env_file: str
    # This can only be set while in the frontier and the background node execution is enabled
    # TODO: For now ignore this. Maybe there is a better way to do this.
    # background_sandbox: Sandbox
    exec_ctxt: "ExecCtxt"
    exec_result: "ExecResult"

    # Variables that are ignored when two environment files are compared.
    _ENV_IGNORE_VARS = frozenset([
        "_", 'RANDOM', "msg", "pash_runtime_final_status", "pash_previous_set_status",
        "pash_runtime_shell_variables_file", "from_set", "output_variable_file",
        "pash_loop_iter_counters", "daemon_response", "vars_file",
        "pash_speculative_command_id", "prev_env", "PREVIOUS_SET_STATUS",
        "BASH_LINENO", "response_args", "stdout_file", "pash_spec_command_id",
        "cmd_exit_code", "pash_set_to_add"])
    # `declare` line shapes understood by the environment parser; compiled
    # once instead of on every comparison.
    _ENV_REGEXES = (
        re.compile(r'declare (?:-x|--)? (\w+)="([^"]*)"'),
        re.compile(r'declare -i (\w+)="(\d+)"'),
        re.compile(r'declare -a (\w+)=(\([^)]+\))'),
    )

    def __init__(self, cnid: "ConcreteNodeId", node: "Node"):
        self.cnid = cnid
        self.abstract_node = node
        self.state = NodeState.INIT
        self.tracefile = None
        self.rwset = None
        self.wait_env_file = None
        self.to_be_resolved_snapshot = None
        self.exec_ctxt = None
        # Must exist before any execution: execution_outcome() reads it.
        self.exec_result = None
        self.exec_id = None

    def __str__(self):
        return f'Node(id:{self.id_}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, wait_env_file:{self.wait_env_file}, exec_ctxt:{self.exec_ctxt})'

    def __repr__(self):
        return str(self)

    @property
    def id_(self):
        return self.abstract_node.id_

    @property
    def cmd(self):
        return self.abstract_node.cmd

    @property
    def asts(self):
        return self.abstract_node.asts

    def pretty_state_repr(self):
        return f'{state_pstr(self.state)},{self.id_},{self.cmd}'

    def is_initialized(self):
        return self.state == NodeState.INIT

    def is_ready(self):
        return self.state == NodeState.READY

    def is_committed(self):
        return self.state == NodeState.COMMITTED

    def is_stopped(self):
        return self.state == NodeState.STOP

    def is_speculated(self):
        return self.state == NodeState.SPECULATED

    def is_executing(self):
        return self.state == NodeState.EXECUTING

    def is_spec_executing(self):
        return self.state == NodeState.SPEC_EXECUTING

    def is_unsafe(self):
        return self.state == NodeState.UNSAFE

    def start_command(self, env_file: str, speculate=False):
        """Launch this node's command and record its execution context.

        A fresh execution id is generated for every launch.
        """
        # TODO: implement speculate
        # TODO: built-in commands
        cmd = self.cmd
        execute_func = executor.async_run_and_trace_command_return_trace
        # Set the execution id
        self.exec_id = util.generate_id()
        self.exec_ctxt = ExecCtxt(*execute_func(cmd, self.cnid, self.exec_id, env_file))

    def execution_outcome(self) -> Tuple[int, str, str]:
        """Return ``(exit_code, post_env_file, stdout_file)`` of the finished execution."""
        assert self.exec_result is not None
        return self.exec_result.exit_code, self.exec_ctxt.post_env_file, self.exec_ctxt.stdout

    def command_unsafe(self):
        return not analysis.safe_to_execute(self.asts, {})

    ##                      ##
    ## Transition Functions ##
    ##                      ##

    def transition_from_init_to_ready(self):
        assert self.state == NodeState.INIT
        self.state = NodeState.READY
        self.rwset = RWSet(set(), set())
        # Also, probably unroll here?

    def transition_from_ready_to_unsafe(self):
        assert self.state == NodeState.READY
        self.state = NodeState.UNSAFE

    def kill(self):
        """Kill the process of the running execution."""
        assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]
        self.exec_ctxt.process.kill()

    def reset_to_ready(self):
        """Abort/discard the current execution and move back to READY."""
        assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING,
                              NodeState.SPECULATED]

        logging.info(f"Resetting node {self.id_} to ready {self.exec_id}")
        # We reset the exec id so if we receive a message
        # due to a race condition, we will ignore it.
        self.exec_id = None

        # TODO: make this more sophisticated
        if self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]:
            self.kill()

        # Probably delete them from tmpfs too
        process = self.exec_ctxt.process
        if process.poll() is None:
            # Exceptions will be handled inside the call so we don't have to worry
            util.kill_process_tree(process.pid, sig=signal.SIGKILL)

        self.exec_ctxt = None
        self.exec_result = None
        self.state = NodeState.READY

    def start_executing(self, env_file):
        assert self.state == NodeState.READY
        self.start_command(env_file)
        self.state = NodeState.EXECUTING

    def start_spec_executing(self, env_file):
        assert self.state == NodeState.READY
        self.start_command(env_file, speculate=True)
        self.state = NodeState.SPEC_EXECUTING

    def _record_exec_result(self):
        """Store the exit code and pid of the current execution's process."""
        process = self.exec_ctxt.process
        # Keyword arguments: ExecResult is declared as (exit_code, proc_id),
        # so passing (pid, returncode) positionally would swap the two.
        self.exec_result = ExecResult(exit_code=process.returncode,
                                      proc_id=process.pid)

    def commit_frontier_execution(self):
        assert self.state == NodeState.EXECUTING
        self._record_exec_result()
        self.gather_fs_actions()
        executor.commit_workspace(self.exec_ctxt.sandbox_dir)
        self.state = NodeState.COMMITTED

    def finish_spec_execution(self):
        assert self.state == NodeState.SPEC_EXECUTING
        self._record_exec_result()
        self.gather_fs_actions()
        self.state = NodeState.SPECULATED

    def commit_speculated(self):
        assert self.state == NodeState.SPECULATED
        executor.commit_workspace(self.exec_ctxt.sandbox_dir)
        self.state = NodeState.COMMITTED

    def transition_from_stopped_to_executing(self, env_file=None):
        # Same READY -> EXECUTING transition as start_executing().
        self.start_executing(env_file)

    def transition_to_committed(self):
        # `in` cannot be applied to a single enum member; compare with ==.
        assert self.state == NodeState.SPECULATED
        self.state = NodeState.COMMITTED
        # TODO

    def transition_from_spec_executing_to_speculated(self):
        pass

    def commit_unsafe_node(self):
        assert self.state == NodeState.UNSAFE
        self.state = NodeState.COMMITTED

    def update_rw_set(self, rw_set):
        self.rwset = rw_set

    def gather_fs_actions(self) -> "RWSet":
        """Read the execution trace and store the resulting read/write set.

        A missing trace file yields an empty read/write set.

        Returns:
            The RWSet that was stored on the node.
        """
        assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]
        sandbox_dir = self.exec_ctxt.sandbox_dir
        trace_file = self.exec_ctxt.trace_file
        try:
            trace_object = executor.read_trace(sandbox_dir, trace_file)
        except FileNotFoundError:
            self.update_rw_set(RWSet(set(), set()))
            return self.rwset
        read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object)
        self.update_rw_set(RWSet(read_set, write_set))
        return self.rwset

    def get_rw_set(self):
        # if self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]:
        #     self.gather_fs_actions()
        return self.rwset

    @classmethod
    def _parse_env(cls, content):
        """Parse `declare ...` lines into a dict, skipping ignored variables."""
        env_vars = {}
        for line in content.splitlines():
            if line.startswith('#') or not line.strip():
                continue
            for regex in cls._ENV_REGEXES:
                match = regex.match(line)
                if match:
                    key, value = match.groups()
                    if key not in cls._ENV_IGNORE_VARS:
                        env_vars[key] = value
        return env_vars

    def has_env_conflict_with(self, other_env) -> bool:
        """Tell whether the pre-execution env differs from the env file `other_env`.

        A variable that is missing on either side, or whose value differs,
        is logged and counts as a conflict.
        """
        # Early return if paths are the same
        if self.exec_ctxt.pre_env_file == other_env:
            return False

        with open(self.exec_ctxt.pre_env_file, 'r') as file:
            node_env_vars = self._parse_env(file.read())

        with open(other_env, 'r') as file:
            other_env_vars = self._parse_env(file.read())

        conflict_exists = False
        for key in set(node_env_vars.keys()).union(other_env_vars.keys()):
            if key not in node_env_vars:
                logging.critical(f"Variable {key} missing in node environment")
                conflict_exists = True
            elif key not in other_env_vars:
                logging.critical(f"Variable {key} missing in other environment")
                conflict_exists = True
            elif node_env_vars[key] != other_env_vars[key]:
                logging.critical(f"Variable {key} differs: node environment has {node_env_vars[key]}, other has {other_env_vars[key]}")
                conflict_exists = True

        return conflict_exists
class HSBasicBlock:
    """A non-empty run of nodes; its loop context is that of its first node."""

    def __init__(self, nodes: "list[Node]"):
        if len(nodes) == 0:
            raise ValueError('basic block size 0')
        self.nodes = nodes

    def __str__(self):
        return ''.join([node.cmd.strip() + '\n' for node in self.nodes])

    @property
    def loop_context(self):
        return self.nodes[0].loop_context

    @property
    def node_ids(self):
        return [node.id_ for node in self.nodes]

    def get_node(self, node_id: "NodeId") -> "Node":
        """Return the single node of this block whose id equals `node_id`."""
        nodes = [node for node in self.nodes if node.id_ == node_id]
        assert len(nodes) == 1
        return nodes[0]


class HSProg:
    """A program: abstract nodes, their edges, and the derived basic blocks."""
    abstract_nodes: "dict[NodeId, Node]"
    adjacency: "dict[NodeId, list[NodeId]]"
    inverse_adjacency: "dict[NodeId, list[NodeId]]"
    # No class-level default here: a shared list would be appended to by
    # every instance. The list is created per instance instead.
    basic_blocks: "list[HSBasicBlock]"
    block_adjacency: "dict[int, list[int]]"
    BB_ENTER = -1
    BB_EXIT = -2

    def __init__(self, abstract_nodes: "dict[NodeId, Node]",
                 edges: "dict[NodeId, list[NodeId]]"):
        self.abstract_nodes = abstract_nodes
        self.adjacency = edges
        self.inverse_adjacency = util.invert_graph(abstract_nodes, edges)
        self.construct_basic_blocks()
        util.debug_log(str(self))

    def construct_basic_blocks(self):
        """Group the nodes into basic blocks and compute block adjacency.

        A new block starts whenever the loop context changes or the previous
        node's command is `break`.

        Raises:
            ValueError: if the program has no nodes.
        """
        blocks = []
        node_list = []
        block_id = LoopStack()
        for node in self.abstract_nodes.values():
            same_context = node.loop_context == block_id
            after_break = bool(node_list) and node_list[-1].cmd == 'break'
            if same_context and not after_break:
                node_list.append(node)
                continue
            # node_list can be empty here for a conditional at the very
            # beginning of the program.
            if node_list:
                blocks.append(HSBasicBlock(node_list))
            node_list = [node]
            block_id = node.loop_context
        if not node_list:
            raise ValueError('empty hsprog')
        blocks.append(HSBasicBlock(node_list))
        self.basic_blocks = blocks

        # TODO: the algorithm here is wrong,
        # echo 1
        # for i in {1..n}; do
        #     echo 2
        # done
        # for i in {1..m}; do
        #     echo 3
        # done
        # echo 4
        #
        # echo 1 can goto echo 2, echo 3, or echo 4
        self.block_adjacency = {}
        last_bb_id = len(blocks) - 1
        for bb_id, bb in enumerate(blocks):
            if bb_id == last_bb_id:
                self.block_adjacency[bb_id] = [HSProg.BB_EXIT]
                break
            # the fallthrough edge
            self.block_adjacency[bb_id] = [bb_id + 1]

            # The jump edge: the first block with the same loop context,
            # searching forward and then wrapping around to the start.
            for next_bb_id in chain(range(bb_id + 1, len(blocks)),
                                    range(0, bb_id + 1)):
                if blocks[next_bb_id].loop_context == bb.loop_context:
                    self.block_adjacency[bb_id].append(next_bb_id)
                    break
            else:
                raise ValueError('no jump block')

    def is_start_of_block(self, node_id: "NodeId"):
        """Return True when `node_id` is the first node of some basic block."""
        return any(bb.nodes[0].id_ == node_id for bb in self.basic_blocks)

    def find_basic_block(self, node_id: "NodeId"):
        """Return the basic block containing `node_id`.

        Raises:
            ValueError: if no block contains the node.
        """
        for bb in self.basic_blocks:
            for node in bb.nodes:
                if node.id_ == node_id:
                    return bb
        raise ValueError('no such node_id')

    def __str__(self):
        return 'prog:\n' + '\n'.join(
            [f'block {i}:\n' + str(bb) + f'goto block {self.block_adjacency[i]}\n' for i, bb in enumerate(self.basic_blocks)])
same length but 0s as values - def new_zeroed_loop_stack(self): - return [0 for i in self.loops] - - def get_outer(self): - return self.loops[-1] - - def pop_outer(self): - return self.loops.pop() - - def add_inner(self, loop_iter_id: int): - self.loops.insert(0, loop_iter_id) - - def outer_to_inner(self): - return self.loops[::-1] - - def index(self, loop_iter_id: int) -> int: - return self.loops.index(loop_iter_id) - - def get(self, index: int): - return self.loops[index] - - def __repr__(self): - ## TODO: Represent it using 'it', 'it0', 'it1', etc - ## or -(iters)- in front of it. - output = "-".join([str(it) for it in self.loops]) - return output - - def __eq__(self, other): - if not len(self.loops) == len(other.loops): - return False - for i in range(len(self.loops)): - if not self.loops[i] == other.loops[i]: - return False - return True - - -class NodeId: - def __init__(self, id: int, loop_iters=None): - self.id = id - - if loop_iters is None: - self.loop_iters = LoopStack() - else: - assert(isinstance(loop_iters, LoopStack)) - self.loop_iters = loop_iters - - def has_iters(self): - return not self.loop_iters.is_empty() - - def get_iters(self): - return copy.deepcopy(self.loop_iters) - - def get_non_iter_id(self): - return NodeId(self.id) - - ## Returns a new NodeId - def generate_new_node_id_with_another_iter(self, new_iter: int): - ## This node already contains iterations for the outer loops potentially - ## so we just need to add another inner iteration - new_iters = copy.deepcopy(self.loop_iters) - new_iters.add_inner(new_iter) - - new_node_id = NodeId(self.id, new_iters) - return new_node_id - - def __repr__(self): - ## TODO: Represent it using n. 
- output = f'{self.id}' - if not self.loop_iters.is_empty(): - output += f'+{self.loop_iters}' - return output - - def __hash__(self): - return hash(str(self)) - - def __eq__(self, other): - return self.loop_iters == other.loop_iters and self.id == other.id - - def __ne__(self, other): - # Not strictly necessary, but to avoid having both x==y and x!=y - # True at the same time - return not(self == other) - - ## TODO: Define this correctly if it is to be used for something other than dictionary indexing - def __lt__(self, obj): - return (str(self) < str(obj)) - - def __gt__(self, obj): - return (str(self) > str(obj)) - - # def __le__(self, obj): - # return ((self.b) <= (obj.b)) - - # def __ge__(self, obj): - # return ((self.b) >= (obj.b)) +from collections import deque -def parse_node_id(node_id_str: str) -> NodeId: - if "+" in node_id_str: - node_id_int, iters_str = node_id_str.split("+") - iters = [int(it) for it in iters_str.split("-")] - return NodeId(int(node_id_int), LoopStack(iters)) - else: - return NodeId(int(node_id_str), LoopStack()) +PROG_LOG = '[PROG_LOG] ' +EVENT_LOG = '[EVENT_LOG] ' -class Node: - id: NodeId - cmd: str - asts: "list[AstNode]" - loop_context: LoopStack - - def __init__(self, id, cmd, asts, loop_context: LoopStack): - self.id = id - self.cmd = cmd - self.asts = asts - ## There can only be a single AST per node, and this - ## must be a command. 
- assert(len(asts) == 1) - # Check that the node contains only CommandNode(s) - analysis.validate_node(asts[0]) - self.cmd_no_redir = trace.remove_command_redir(self.cmd) - self.loop_context = loop_context - ## Keep track of how many iterations of this loop node we have unrolled - if not loop_context.is_empty(): - self.current_iters = loop_context.new_zeroed_loop_stack() - - def __str__(self): - # return f"ID: {self.id}\nCMD: {self.cmd}\nR: {self.read_set}\nW: {self.write_set}" - return self.cmd - - def __repr__(self): - # return f"ID: {self.id}\nCMD: {self.cmd}\nR: {self.read_set}\nW: {self.write_set}" - return f'N({self.cmd})' - - def get_cmd(self) -> str: - return self.cmd - - def get_cmd_no_redir(self) -> str: - return self.cmd_no_redir - - def get_loop_context(self) -> LoopStack: - return self.loop_context - - def in_loop(self) -> bool: - return not self.loop_context.is_empty() - - ## KK 2023-05-17 Does this generate the correct iteration even in nested loops? - def get_next_iter(self, loop_id: int) -> int: - assert(self.in_loop()) - assert(self.loop_context.get_outer() == loop_id) - loop_id_index_in_loop_context_stack = self.loop_context.index(loop_id) - self.current_iters[loop_id_index_in_loop_context_stack] += 1 - return self.current_iters[loop_id_index_in_loop_context_stack] - - ## Note: This information is valid only after a node is committed. - ## It might be set even before that, but it should only be retrieved when - ## a node is committed. 
def event_log(s):
    """Log `s` at INFO level, prefixed with the EVENT_LOG marker."""
    message = EVENT_LOG + s
    logging.info(message)


def progress_log(s):
    """Log `s` at INFO level, prefixed with the PROG_LOG marker."""
    message = PROG_LOG + s
    logging.info(message)
- self.to_be_resolved = {} - self.speculated = set() - ## Contains the most recent sandbox directory paths - self.sandbox_dirs = {} - ## Commands that were killed by riker - ## we should keep those in the workset but not execute them - ## until they reach the frontier - self.stopped = set() - ## Commands deemed unsafe from our analysis, that have to be executed - ## in the original shell (e.g., shell primitives) - ## Invariant: self.unsafe \subseteq self.stopped - self.unsafe = set() - self.committed_order = [] - self.commit_state = {} - ## Counts the times a node was (re)executed - self.executions = {node_id: 0 for node_id in self.nodes.keys()} - self.banned_files = set() - self.new_envs = {} - self.latest_envs = {} - self.initial_env_file = initial_env_file - self.waiting_for_frontend = set() - ## In case we spot a dependency meaning a node must execute after another node, it will appear here - ## Contains the nodes to execute only after the key node finishes execution - self.run_after = defaultdict(set) - self.pending_to_execute = set() - self.to_be_resolved_prev = {} - self.prechecked_env = set() - - def __str__(self): - return f"NODES: {len(self.nodes.keys())} | ADJACENCY: {self.adjacency}" - - def get_source_nodes(self) -> list: - sources = set() - for to_id, from_ids in self.inverse_adjacency.items(): - if len(from_ids) == 0: - sources.add(to_id) - return list(sources) - - def get_standard_source_nodes(self) -> list: - source_nodes = self.get_source_nodes() - return self.filter_standard_nodes(source_nodes) - - ## This returns the minimum w.r.t. to the PO of a bunch of node_ids. 
- ## In a real partial order, this could be many, - def get_min(self, node_ids: "list[NodeId]") -> "list[NodeId]": - potential_minima = set(copy.deepcopy(node_ids)) - for node_id in node_ids: - tc = self.get_transitive_closure([node_id]) - ## Remove the node itself from its transitive closure - tc.remove(node_id) - ## If a node is found in the tc of another node, then - ## it is not a minimum - for nid in tc: - potential_minima.discard(nid) - ## KK 2023-05-22 This will be removed at some point but I keep it here - ## for now for easier bug finding. - # logging.debug(f"Potential minima: {potential_minima}") - assert(len(potential_minima) == 1) - return list(potential_minima) - - ## This returns all previous nodes of a sub partial order - def get_sub_po_source_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": - # assert(self.is_closed_sub_partial_order(node_ids)) - source_nodes = list() - node_set = set(node_ids) - for node_id in node_ids: - prev_ids_set = set(self.get_prev(node_id)) - ## KK 2023-05-04 is it ever the case that some (but not all) prev nodes might be outside. I don't think so - if len(prev_ids_set) == 0 or \ - not prev_ids_set.issubset(node_set): - source_nodes.append(node_id) - - ## KK 2024-05-03: I don't see how we can get multiple sources with the current structure - assert(len(source_nodes) == 1) - return source_nodes - - def get_sub_po_sink_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": - # assert(self.is_closed_sub_partial_order(node_ids)) - sink_nodes = list() - node_set = set(node_ids) - for node_id in node_ids: - next_ids_set = set(self.get_next(node_id)) - ## KK 2023-05-04 is it ever the case that some (but not all) prev nodes might be outside. 
I don't think so - if len(next_ids_set) == 0 or \ - not next_ids_set.issubset(node_set): - sink_nodes.append(node_id) - - ## KK 2024-05-03: I don't see how we can get multiple sink with the current structure - assert(len(sink_nodes) == 1) - return sink_nodes - - def set_new_env_file_for_node(self, node_id: NodeId, new_env_file: str): - self.new_envs[node_id] = new_env_file - - def get_new_env_file_for_node(self, node_id: NodeId) -> str: - return self.new_envs.get(node_id) - - def set_latest_env_file_for_node(self, node_id: NodeId, latest_env_file: str): - self.latest_envs[node_id] = latest_env_file - - def get_latest_env_file_for_node(self, node_id: NodeId) -> str: - return self.latest_envs.get(node_id) - - def get_most_recent_possible_new_env_for_node(self, node_id) -> str: - most_recent_env_node = node_id - while self.get_new_env_file_for_node(most_recent_env_node) is None: - predecessor = self.get_prev(most_recent_env_node) - - ## This will trigger when we move to full Partial Orders - assert len(predecessor) <= 1 - - ## If there are no predecessors for a node it means we are at the source - ## so there is no point to search further back - if len(predecessor) == 0: - break - else: - most_recent_env_node = predecessor[0] - - return self.get_new_env_file_for_node(most_recent_env_node) - - ## This returns all previous nodes of a sub partial order - def get_sub_po_prev_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": - # assert(self.is_closed_sub_partial_order(node_ids)) - prev_nodes = set() - node_set = set(node_ids) - for node_id in node_ids: - prev_ids_set = set(self.get_prev(node_id)) - prev_nodes = prev_nodes.union(prev_ids_set - node_set) - - ## KK 2024-05-03: I don't see how we can get multiple sources with the current structure - assert(len(prev_nodes) <= 1) - return list(prev_nodes) - - ## TODO: Implement this correctly. 
I have thought of a naive algorithm that - ## does a BFS forward and backward for each node and if we first see a - ## node outside of the set and then one inside it means that the subset is not closed. - def is_closed_sub_partial_order(self, node_ids: "list[NodeId]") -> bool: - # node_set = set(node_ids) - # visited_set = set() - # for node_id in node_ids: - # prev_ids_set = set(self.get_prev(node_id)) - # next_id_set = set(self.get_next(node_id)) - # ## If one of the previous or next nodes is not in the node set - # ## it means that the sub partial order is not closed. - # if not node_set.issuperset(prev_ids_set.union(next_id_set)): - # return False - - return True - - def init_partial_order(self): - ## Initialize the frontier with all non-loop source nodes - self.frontier = self.get_standard_source_nodes() - ## Initialize the workset - self.init_workset() - logging.debug(f'Initialized workset') - self.populate_to_be_resolved_dict() - if config.SPECULATE_IMMEDIATELY: - self.init_latest_env_files() - logging.debug(f'To be resolved sets per node:') - logging.debug(self.to_be_resolved) - logging.info(f'Initialized the partial order!') - self.log_partial_program_order_info() - assert(self.valid()) - - - def init_latest_env_files(self, node=None): - if node is None: - env_to_assign = self.initial_env_file - else: - env_to_assign = self.get_new_env_file_for_node(node) - for node_id in self.get_all_non_committed(): - self.set_latest_env_file_for_node(node_id, env_to_assign) - - - def init_workset(self): - self.workset = self.get_all_non_committed_standard_nodes() - - ## Check if the partial order is done - def is_completed(self) -> bool: - return len(self.get_all_non_committed_standard_nodes()) == 0 - - def get_workset(self) -> list: - return self.workset - - def get_unsafe(self) -> set: - return copy.deepcopy(self.unsafe) - - ## Only return the stopped that are not unsafe - def get_stopped_safe(self) -> set: - return copy.deepcopy(self.stopped.difference(self.unsafe)) 
- - ## When we remove a command from unsafe we always remove from stopped too - def remove_from_unsafe(self, node_id: NodeId): - self.unsafe.remove(node_id) - self.stopped.remove(node_id) - - def get_committed(self) -> set: - return copy.deepcopy(self.committed) - - def get_committed_list(self) -> list: - return sorted(list(self.committed)) - - def is_committed(self, node_id: NodeId) -> bool: - return node_id in self.committed - - def init_inverse_adjacency(self): - self.inverse_adjacency = {i: [] for i in self.nodes.keys()} - for from_id, to_ids in self.adjacency.items(): - for to_id in to_ids: - self.inverse_adjacency[to_id].append(from_id) - - # ## TODO: (When there is time) Define a function that checks that the graph is valid - ## TODO: Call valid and add assertiosn for loops here. + frontier: set # Set of nodes at the frontier + # Di: I'm going to ignore this for now and implement the feature without a local data structure + # Later we can add this back as a caching mechanism to avoid doing RWSet + # intersections of files all the time + # run_after: "dict[NodeId, list[Node]]" # Nodes that should run after certain conditions + + # Mapping of concrete nodes to lists of uncommitted concrete nodes the precedes them. + # It is the snapshot of the reachable uncommited concrete nodes from prev_concrete_node graph + # at the time the concrete node enters execution. 
So if there is fs conflict in them, + # it needs to be rerun + to_be_resolved: "dict[NodeId, list[Node]]" + concrete_nodes: "dict[NodeId, Node]" + + def __init__(self, abstract_nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId]]"): + self.hsprog = HSProg(abstract_nodes, edges) + self.concrete_nodes: dict[ConcreteNodeId, ConcreteNode] = {} + self.frontier = set() + # self.run_after = {} + self.prev_concrete_node: dict[ConcreteNodeId, list[ConcreteNodeId]] = {} + self.to_be_resolved: dict[ConcreteNodeId, list[ConcreteNodeId]] = {} + + @property + def abstract_nodes(self): + return self.hsprog.abstract_nodes + + @property + def adjacency(self): + return self.hsprog.adjacency + + @property + def inverse_adjacency(self): + return self.hsprog.inverse_adjacency + + def commit_node(self, node): + # Logic to handle committing a node + node.transition_to_committed() + # Maybe update dependencies here + # etc. + + def get_concrete_node(self, concrete_node_id: ConcreteNodeId) -> ConcreteNode: + return self.concrete_nodes[concrete_node_id] + + def get_all_nodes(self): + return [node for node in self.concrete_nodes.values()] + + def get_committed_nodes(self): + return [node for node in self.concrete_nodes.values() if node.is_committed()] + + def get_ready_nodes(self): + return [(cnid, n) for cnid, n in self.concrete_nodes.items() if n.is_ready()] + + def get_executing_nodes(self): + return [node for node in self.concrete_nodes.values() if node.is_executing()] + + def get_spec_executing_nodes(self): + return [node for node in self.concrete_nodes.values() if node.is_spec_executing()] + + def get_executing_normal_and_spec_nodes(self): + return [node for node in self.concrete_nodes.values() if node.is_executing() or node.is_spec_executing()] + + def get_speculated_nodes(self): + return [node for node in self.concrete_nodes.values() if node.is_speculated()] + + def get_uncommitted_nodes(self): + return [node for node in self.concrete_nodes.values() if not 
node.is_committed()] + + def get_frontier(self): + return self.frontier + + def log_info(self): + logging.info(f"Nodes: {self.concrete_nodes}") + logging.info(f"Adjacency: {self.adjacency}") + logging.info(f"Inverse adjacency: {self.inverse_adjacency}") + self.log_state() + + def log_state(self): + for node in self.concrete_nodes.values(): + progress_log(node.pretty_state_repr()) + progress_log('') + + def get_schedulable_nodes(self) -> list[ConcreteNodeId]: + return [concrete_node_id for concrete_node_id, _ in self.get_ready_nodes()] + + def get_prev_nodes(self, concrete_node_id: ConcreteNodeId) -> "list[ConcreteNodeId]": + return self.prev_concrete_node[concrete_node_id][:] + + def get_all_next(self, current_node_id: ConcreteNodeId, visited=None) -> "set[NodeId]": + all_next = set() + def reachable_rec(cur, reachable): + if cur in reachable: + return + reachable.add(cur) + for n in self.get_next_nodes(cur): + reachable_rec(n, reachable) + for n in self.get_next_nodes(current_node_id): + reachable_rec(n, all_next) + return all_next + + + def get_all_previous(self, current_node_id: ConcreteNodeId, visited=None) -> "set[NodeId]": + all_prev = set() + def reachable_rec(cur, reachable): + if cur in reachable: + return + reachable.add(cur) + for n in self.get_prev_nodes(cur): + reachable_rec(n, reachable) + for n in self.get_prev_nodes(current_node_id): + reachable_rec(n, all_prev) + return all_prev + + # TODO: fixme + # def get_all_next_uncommitted(self, node_id: NodeId) -> "set[NodeId]": + # next = self.get_all_next(node_id) + # return set([node for node in next if not self.concrete_nodes[node].is_committed()]) + + def get_all_previous_uncommitted(self, concrete_node_id: ConcreteNodeId) -> "set[ConcreteNodeId]": + previous = self.get_all_previous(concrete_node_id) + return set([cnid for cnid in previous if not self.concrete_nodes[cnid].is_committed()]) + + def adjust_to_be_resolved_dict_entry(self, concrete_node_id: ConcreteNodeId): + node = 
self.concrete_nodes.get(concrete_node_id) + if node.is_committed(): + self.to_be_resolved[concrete_node_id] = [] + elif node.is_ready(): + self.to_be_resolved[concrete_node_id] = self.get_all_previous_uncommitted(concrete_node_id) + + def init_to_be_resolved_dict(self): + for node_id in self.concrete_nodes: + self.adjust_to_be_resolved_dict_entry(node_id) + + def adjust_to_be_resolved_dict(self): + # TODO: this design seems to require the function to be called + # each time before a node entering EXECUTING or SPEC_EXECUTING + # to be optimal (that is, it might keep more things in the list). + # It's safe as is so I'm not touching it. + for node_id in self.to_be_resolved.keys(): + self.adjust_to_be_resolved_dict_entry(node_id) + + #TODO: Add partial order invariant checks def valid(self): - logging.debug("Checking partial order validity...") - self.log_partial_program_order_info() - valid1 = self.loop_nodes_valid() - ## TODO: Add a check that for x, y : NodeIds, x < y iff x is a predecessor to x - ## This is necessary due to the `hypothetical_before` method. 
- - ## Any command in unsafe must also be in stopped - valid2 = self.unsafe.issubset(self.stopped) - - ## TODO: Fix the checks below because they do not work currently - ## TODO: Check that committed is prefix closed w.r.t partial order - return valid1 and valid2 - - ## Checks if loop nodes are all valid, i.e., that there are no loop nodes handled like normal ones, - ## e.g., in workset, frontier etc - ## - ## Note that loop nodes can be in the committed set (after we are done executing all iterations of a loop) - def loop_nodes_valid(self): - # GL 2023-07-08: This works without get_all_next_non_committed_nodes(), not sure why - forbidden_sets = self.get_all_next_non_committed_nodes() + \ - self.get_workset() + \ - list(self.stopped) + \ - list(self.commands_currently_executing.keys()) - loop_nodes_in_forbidden_sets = [node_id for node_id in forbidden_sets - if self.is_loop_node(node_id)] - return len(loop_nodes_in_forbidden_sets) == 0 - - def __len__(self): - return len(self.nodes) - - def get_node(self, node_id:NodeId) -> Node: - return self.nodes[node_id] - - def is_node_id(self, node_id:NodeId) -> bool: - return node_id in self.nodes - - def get_node_loop_context(self, node_id: NodeId) -> LoopStack: - return self.get_node(node_id).get_loop_context() - - def get_all_non_committed(self) -> "list[NodeId]": - all_node_ids = self.nodes.keys() - non_committed_node_ids = [node_id for node_id in all_node_ids - if not self.is_committed(node_id)] - return non_committed_node_ids - - ## This adds a node to the committed set and saves important information - def commit_node(self, node_id: NodeId): - logging.debug(f" > Commiting node {node_id}") - self.committed.add(node_id) - - - def is_loop_node(self, node_id:NodeId) -> bool: - return self.get_node(node_id).in_loop() - - ## Only keeps standard (non-loop) nodes - def filter_standard_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": - return [node_id for node_id in node_ids - if not self.is_loop_node(node_id)] - - def 
filter_loop_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": - return [node_id for node_id in node_ids - if self.is_loop_node(node_id)] - - ## This creates a new node_id and then creates a mapping from the node and iteration id to this node id - ## TODO: Currently doesn't work with nested loops - def create_node_id_with_one_less_loop_from_loop_node(self, node_id: NodeId, loop_id: int) -> NodeId: - node = self.get_node(node_id) - logging.debug(f' >>> Node: {node}') - logging.debug(f' >>> its loops: {node.loop_context} --- {node.current_iters}') - - new_iter = node.get_next_iter(loop_id) - ## Creates a new node id where we have appended the new iter - new_node_id = node_id.generate_new_node_id_with_another_iter(new_iter) - logging.debug(f' >>> new node_id with another iter: {new_node_id}') - return new_node_id - - - ## Returns all non committed non-loop nodes - def get_all_non_committed_standard_nodes(self) -> "list[NodeId]": - all_non_committed = self.get_all_non_committed() - logging.debug(f"All non committed nodes: {all_non_committed}") - return self.filter_standard_nodes(all_non_committed) - - def get_next(self, node_id:NodeId) -> "list[NodeId]": - return self.adjacency[node_id][:] - - def get_prev(self, node_id:NodeId) -> "list[NodeId]": - return self.inverse_adjacency[node_id][:] - - def add_edge(self, from_id: NodeId, to_id: NodeId): - ## KK 2023-05-04 Is it a problem that we append? 
Maybe we should make that a set - self.adjacency[from_id].append(to_id) - self.inverse_adjacency[to_id].append(from_id) - - def remove_edge(self, from_id: NodeId, to_id: NodeId): - self.adjacency[from_id].remove(to_id) - self.inverse_adjacency[to_id].remove(from_id) - - def get_transitive_closure(self, target_node_ids:"list[NodeId]") -> "list[NodeId]": - all_next_transitive = set(target_node_ids) - next_work = target_node_ids.copy() - while len(next_work) > 0: - node_id = next_work.pop() - successors = set(self.get_next(node_id)) - new_next = successors - all_next_transitive - all_next_transitive = all_next_transitive.union(successors) - next_work.extend(new_next) - return list(all_next_transitive) - - def get_inverse_transitive_closure(self, target_node_ids:"list[NodeId]") -> "list[NodeId]": - all_prev_transitive = set(target_node_ids) - next_work = target_node_ids.copy() - while len(next_work) > 0: - node_id = next_work.pop() - predecessors = set(self.get_prev(node_id)) - new_prev = predecessors - all_prev_transitive - all_prev_transitive = all_prev_transitive.union(predecessors) - next_work.extend(new_prev) - return list(all_prev_transitive) - - def get_transitive_closure_if_can_be_resolved(self, can_be_resolved: list, target_node_ids: list) -> list: - all_next_transitive = set(target_node_ids) - next_work = target_node_ids.copy() - while len(next_work) > 0: - node_id = next_work.pop() - successors = {next_node_id for next_node_id in self.get_next(node_id) if next_node_id in can_be_resolved} - new_next = successors - all_next_transitive - all_next_transitive = all_next_transitive.union(successors) - next_work.extend(new_next) - return list(all_next_transitive) - - def update_rw_set(self, node_id, rw_set): - self.rw_sets[node_id] = rw_set - - def get_rw_set(self, node_id) -> RWSet: - return self.rw_sets[node_id] - - def get_rw_sets(self) -> dict: - return self.rw_sets - - def add_to_read_set(self, node_id: NodeId, item: str): - 
self.rw_sets[node_id].add_to_read_set(item) - - def add_to_write_set(self, node_id: NodeId, item: str): - self.rw_sets[node_id].add_to_write_set(item) - - def add_to_speculated(self, node_id: NodeId): - self.speculated = self.speculated.union([node_id]) - - def is_first_node_when_env_is_uninitialized(self, speculate_immediately): - if not speculate_immediately: - starting_env_node = self.get_source_nodes() - ## We may have a loop node at the start - ## In that case, we roll back to the initial env - if len(starting_env_node) > 0 and self.get_latest_env_file_for_node(starting_env_node[0]) is None: - logging.debug("Initializing latest env and speculating") - return True - return False - - # Check if the specific command can be resolved. - # KK 2023-05-04 I am not even sure what this function does and why is it useful. - def cmd_can_be_resolved(self, node_id: int) -> bool: - logging.debug(f'Checking if node {node_id} can be resolved...') - ## Get inverse_transitive_closure to find all nodes that are before this one - inverse_tc_node_ids = self.get_inverse_transitive_closure([node_id]) - - ## Out of those nodes, filter out the non-committed ones - non_committed_nodes_in_inverse_tc = [node_id for node_id in inverse_tc_node_ids - if not self.is_committed(node_id)] - logging.debug(f' > Non committed nodes that are predecessors to {node_id} are: {non_committed_nodes_in_inverse_tc}') - - currently_executing_ids = self.get_currently_executing() - logging.debug(f' > Currently executing: {currently_executing_ids}') - - ## TODO: Make this check more efficient - for other_node_id in non_committed_nodes_in_inverse_tc: - ## If one of the non-committed nodes in the inverse_tc is currently executing then - ## we can't resolve this command - ## KK 2023-05-04 This is not sufficient. In the future (where we don't speculate everything at once) - ## there might be a case where nothing is executing but a command can still not be resolved. 
- if other_node_id in currently_executing_ids: - logging.debug(f' >> Cannot resolve {node_id}: Node {other_node_id} in non committed inverse tc is currently executing') - return False - - ## If there exists a loop node that is not committed before the command then we cannot resolve. - if self.is_loop_node(other_node_id): - logging.debug(f' >> Cannot resolve {node_id}: Node {other_node_id} in non committed inverse tc is a loop node') - return False - - ## Otherwise we can return - logging.debug(f' >> Able to resolve {node_id}') return True - - def __kill_all_currently_executing_and_schedule_restart(self, start=None): - nodes_to_kill = self.get_currently_executing() - if start is not None: - nodes_to_kill = [node_id for node_id in nodes_to_kill if node_id in self.get_transitive_closure([start])] - for cmd_id in nodes_to_kill: - self.__kill_node(cmd_id) - most_recent_new_env = self.get_most_recent_possible_new_env_for_node(cmd_id) - self.prechecked_env.discard(cmd_id) - if most_recent_new_env is not None: - - self.set_latest_env_file_for_node(cmd_id, most_recent_new_env) - self.workset.remove(cmd_id) - log_time_delta_from_named_timestamp("PartialOrder", "RunNode", cmd_id) - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", cmd_id, key=f"PostExecResolution-{cmd_id}") - # Our new workset is the nodes that were killed - # Previous workset got killed - self.workset.extend(nodes_to_kill) - - - def __kill_node(self, cmd_id: "NodeId"): - logging.debug(f'Killing and restarting node {cmd_id} because some workspaces have to be committed') - proc_to_kill, trace_file, _stdout, _stderr, _post_execution_env_file = self.commands_currently_executing.pop(cmd_id) - # Add the trace file to the banned file list so we know to ignore the CommandExecComplete response - self.banned_files.add(trace_file) - - alive_after_kill = util.kill_process_tree(proc_to_kill.pid) - - if alive_after_kill: - logging.critical("Processes still alive after attempting to kill:") - for 
proc in alive_after_kill: - logging.critical(proc) - else: - logging.debug("All processes were successfully terminated.") - - def resolve_commands_that_can_be_resolved_and_push_frontier(self): - # This may be obsolete since we only resolve one node at a time - # cmds_to_resolve = self.__pop_cmds_to_resolve_from_speculated() - # assert len(cmds_to_resolve) <= 1 - if len(self.speculated) == 0: - cmds_to_resolve = [] - else: - cmds_to_resolve = [self.speculated.pop()] - logging.debug(f"Commands to check for dependencies this round are: {sorted(cmds_to_resolve)}") - logging.debug(f"Commands that cannot be resolved this round are: {sorted(self.speculated)}") - ## Resolve dependencies for the commands that can actually be resolved - to_commit = self.__resolve_dependencies_continuous_and_move_frontier(cmds_to_resolve) - for cmd in to_commit: - log_time_delta_from_named_timestamp("PartialOrder", "ResolveDependencies", cmd) - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", cmd, key=f"PostExecResolution-{cmd}") - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProcKilling") - - if len(to_commit) == 0: - logging.debug(" > No nodes to be committed this round") - else: - logging.debug(f" > Nodes to be committed this round: {to_commit}") - logging.trace(f"Commit|"+",".join(str(node_id) for node_id in to_commit)) - if config.SANDBOX_KILLING: - logging.info("Sandbox killing") - self.__kill_all_currently_executing_and_schedule_restart(to_commit) - log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling") - self.commit_cmd_workspaces(to_commit) - - def check_dependencies(self, cmds_to_check, get_first_cmd_ids_fn, update_state_due_to_a_dependency_fn): - for second_cmd_id in cmds_to_check: - for first_cmd_id in get_first_cmd_ids_fn(second_cmd_id): - - if self.rw_sets.get(first_cmd_id) is not None and self.has_forward_dependency(first_cmd_id, second_cmd_id): - update_state_due_to_a_dependency_fn(first_cmd_id, second_cmd_id) - - # 
Internal function, modified the run_after dict and the pending_to_execute set - def __populate_run_after_dict(self): - for node in self.pending_to_execute.copy(): - prev_to_be_resolved = self.to_be_resolved_prev.get(node) - if prev_to_be_resolved is None: - return - # Check if env has changed since last comparison - elif set(self.to_be_resolved[node]) == set(prev_to_be_resolved): - # Not caring about this dependency because env has not yet changed - self.pending_to_execute.remove(node) - for k, v in self.run_after.items(): - if node in v: - self.run_after[k].remove(node) - - ## Spots dependencies and updates the state. - ## Safe to call everywhere - def resolve_dependencies_early(self, node_id=None): - def get_first_cmd_ids(second_cmd_id): - return sorted(self.to_be_resolved[second_cmd_id], reverse=True) - - def update_state_due_to_a_dependency(first_cmd_id, second_cmd_id): - self.waiting_for_frontend.discard(second_cmd_id) - self.run_after[first_cmd_id].add(second_cmd_id) - self.pending_to_execute.add(second_cmd_id) - logging.debug(f"Early resolution: Rerunning node {second_cmd_id} after {first_cmd_id} because of a dependency") - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", second_cmd_id) - - to_check = {node for node in self.waiting_for_frontend if node not in self.speculated} - if node_id is not None: - to_check.add(node_id) - self.check_dependencies(to_check, get_first_cmd_ids, update_state_due_to_a_dependency) - self.populate_to_be_resolved_dict() - self.__populate_run_after_dict() - - def resolve_dependencies(self, cmds_to_resolve): - def get_first_cmd_ids(second_cmd_id): - return sorted([cmd_id for cmd_id in self.to_be_resolved[second_cmd_id] if cmd_id not in self.stopped]) - - def update_state_due_to_a_dependency(first_cmd_id, second_cmd_id): - logging.debug(f' > Command {second_cmd_id} was added to the workset, due to a forward dependency with {first_cmd_id}') - new_workset.add(second_cmd_id) - - new_workset = set() - 
self.check_dependencies(sorted(cmds_to_resolve), get_first_cmd_ids, update_state_due_to_a_dependency) - - return new_workset - - ## Resolve all the forward dependencies and update the workset - ## Forward dependency is when a command's output is the same - ## as the input of a following command - def __resolve_dependencies_continuous_and_move_frontier(self, cmds_to_resolve): - self.log_partial_program_order_info() - for cmd in cmds_to_resolve: - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ResolveDependencies", cmd) - - logging.debug(f"Commands to be checked for dependencies: {sorted(cmds_to_resolve)}") - logging.debug(" --- Starting dependency resolution --- ") - new_workset = self.resolve_dependencies(cmds_to_resolve) - - logging.debug(" > Modifying workset accordingly") - # New workset contains previous unresolved commands and resolved commands with dependencies that have not been stopped - workset_old = self.workset.copy() - self.workset = [cmd_id for cmd_id in self.workset if cmd_id not in cmds_to_resolve and cmd_id not in self.stopped] - self.workset.extend(list(new_workset)) - workset_diff = set(self.workset) - set(workset_old) - logging.trace(f"WorksetAdd|{','.join(str(cmd_id) for cmd_id in workset_diff)}") + def fetch_fs_actions(self): + for node in self.get_executing_normal_and_spec_nodes(): + node.gather_fs_actions() - # Keep the previous committed state - old_committed = self.get_committed() - - # We want stopped commands to not enter the workset again yet - assert(set(self.workset).isdisjoint(self.stopped)) - - self.__frontier_commit_and_push() - # self.log_partial_program_order_info() - return set(self.get_committed()) - old_committed - - - ## This method checks if nid1 would be before nid2 if nid2 was part of the PO. - ## - ## Therefore it does not just check edges, but rather computes if it would be before - ## based on ids and loop iterations. - ## - ## 1. 
Check if the loop ids of the two abstract parents of both nodes differ - ## thus showing that one is before the other - ## 2. If all loop ids are the same, now we can actually compare iterations. - ## If a node is in the same loop ids but in a later iteration then it is later. - ## 3. If all iterations are the same too, then we just compare node ids - ## - ## KK 2023-05-22 This is a complex procedure, I wonder if we can simplify it in some way - def hypothetical_before(self, nid1: NodeId, nid2: NodeId): - raw_id1 = nid1.get_non_iter_id() - ## Get all loop ids that nid1 could be in - loop_ids1 = self.get_node_loop_context(raw_id1) - - raw_id2 = nid1.get_non_iter_id() - ## Get all loop ids that nid2 could be in - loop_ids2 = self.get_node_loop_context(raw_id2) - - i = 0 - while i < len(loop_ids1) and i < len(loop_ids2): - loop_id_1 = loop_ids1.get(len(loop_ids1) - 1 - i) - loop_id_2 = loop_ids2.get(len(loop_ids2) - 1 - i) - ## If the first node is in a previous loop than the second, - ## then we are done. - if loop_id_1 < loop_id_2: + def _has_fs_deps(self, concrete_node_id: ConcreteNodeId): + node_of_interest : ConcreteNode = self.get_concrete_node(concrete_node_id) + for nid in self.to_be_resolved[concrete_node_id]: + node: ConcreteNode = self.get_concrete_node(nid) + if node.get_rw_set().has_conflict(node_of_interest.get_rw_set()): return True - elif loop_id_1 > loop_id_2: - return False - - ## We need to keep going - i += 1 - - ## If we reach this, we know that both nodes are in the same loops up to i - ## so we now compare iterations and node identifiers. - - iters1 = nid1.get_iters() - iters2 = nid2.get_iters() - - i = 0 - while i < len(iters1) and i < len(iters2): - iter1 = iters1.get(len(iters1) - 1 - i) - iter2 = iters2.get(len(iters2) - 1 - i) - ## If the first node is in a previous iteration than the second, - ## then we are done. 
- if iter1 < iter2: - return True - elif iter1 > iter2: - return False - ## We need to keep going - i += 1 - - ## We now know that their common prefix of iterations is the same - - ## Check if the node could potentially generate other nodes that are bigger - ## i.e., if it is more abstract. If so, then it is not smaller. - common_loop_depth = min(len(loop_ids1), len(loop_ids2)) - abstract_depth1 = max(common_loop_depth - len(iters1), 0) - abstract_depth2 = max(common_loop_depth - len(iters2), 0) - if abstract_depth1 < abstract_depth2: - return True - elif abstract_depth1 > abstract_depth2: - return False - - return nid1.id < nid2.id - - - def progress_po_due_to_wait(self, node_id: NodeId): - logging.debug(f"Checking if we can progress the partial order after having received a wait for {node_id}") - ## The node might not be part of the partial order if it corresponds to - ## a loop node iteration. In this case, we just need to make sure that - ## we commit the right previous loop nodes that are relevant to it. 
- if not self.is_node_id(node_id): - ## TODO: This check is not correct currently, it works for now, but when we move to full partial orders it wont anymore, - ## due to the check happening with < in hypothetical before - logging.debug(f" > Node {node_id} is not part of the PO so we compute the nodes that would be before it...") - all_non_committed = self.get_all_non_committed() - all_non_committed_loop_nodes = self.filter_loop_nodes(all_non_committed) - non_committed_loop_nodes_that_would_be_predecessors = [n_id for n_id in all_non_committed_loop_nodes - if self.hypothetical_before(n_id, node_id)] - - new_committed_nodes = non_committed_loop_nodes_that_would_be_predecessors - - else: - logging.debug(f" > Node {node_id} is part of the PO so we just check its predecessors following the inverse edges...") - ## If the node is in the PO, then we can proceed normally and find its predecessors and commit them - - ## Get inverse_transitive_closure to find all nodes that are before this one - inverse_tc_node_ids = self.get_inverse_transitive_closure([node_id]) - - ## Out of those nodes, filter out the non-committed loop ones - non_committed_loop_nodes_in_inverse_tc = [node_id for node_id in inverse_tc_node_ids - if not self.is_committed(node_id) and - self.is_loop_node(node_id)] - logging.debug(f'Non committed loop nodes that are predecessors to {node_id} are: {non_committed_loop_nodes_in_inverse_tc}') - - new_committed_nodes = non_committed_loop_nodes_in_inverse_tc - - ## And "close them" - ## TODO: This is a hack here, we need to have a proper method that commits - ## nodes and does whatever else is needed to do (e.g., add new nodes to frontier) - logging.debug(f'Adding following loop nodes to committed: {new_committed_nodes}') - for node_id in new_committed_nodes: - self.commit_node(node_id) - - ## Since we committed some nodes, let's make sure that we also push the frontier - ## TODO: Can we do this in a less hacky method? 
By using a well-defined commit_node_and_push_frontier method? - if len(new_committed_nodes) > 0: - new_nodes_sinks = self.get_sub_po_sink_nodes(new_committed_nodes) - assert(len(new_nodes_sinks) == 1) - new_nodes_sink = new_nodes_sinks[0] - logging.debug(f'The sink of the newly committed loop nodes is {new_nodes_sink}') - - next_nodes = self.get_next(new_nodes_sink) - next_standard_nodes = self.filter_standard_nodes(next_nodes) - logging.trace(f"Adding its next nodes to the frontier|{','.join(str(node_id) for node_id in next_standard_nodes)}") - self.frontier.extend(next_standard_nodes) - - - - ## TODO: Add some form of validity assertion after we are done with this. - ## Just to make sure that we haven't violated the continuity of the committed set. - - ## We check if something can be resolved and stepped forward here - ## KK 2023-05-10 This seems to work for all tests (so it might be idempotent - ## since in many tests there is nothing new to resolve after a wait) - self.resolve_commands_that_can_be_resolved_and_push_frontier() - - ## When the frontend sends a wait for a node, it means that execution in the frontend has - ## already surpassed all nodes prior to it. This is particularly important for loops, - ## since we can't always statically predict how many iterations they will do, so the only - ## definitive way to know that they are done is to receive a wait for a node after them. - def wait_received(self, node_id: NodeId): - ## Whenever we receive a wait for a node, we always need to check and "commit" all prior loop nodes - ## since we know that they won't have any more iterations (the JIT frontend has already passed them). - - ## We first have to push and progress the PO due to the wait and then unroll - ## KK 2023-05-22 Currently this checks whether a still nonexistent node is - ## would be a successor of existing nodes to commit some of - ## them if needed. Unfortunately, to make this check for a non-existent - ## node is very complex and not elegant. 
- ## TODO: Could we swap unrolling and progressing so that we always - ## check if a node can be progressed by checking edges? - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id) - self.progress_po_due_to_wait(node_id) - log_time_delta_from_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id) - - - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id) - ## Unroll some nodes if needed. - if node_id.has_iters(): - ## TODO: This unrolling can also happen and be moved to speculation. - ## For now we are being conservative and that is why it only happens here - ## TODO: Move this to the scheduler.schedule_work() (if we have a loop node waiting for response and we are not unrolled, unroll to create work) - self.maybe_unroll(node_id) - - assert(self.valid()) - - def find_outer_loop_sub_partial_order(self, loop_id: int, nodes_subset: "list[NodeId]") -> "list[NodeId]": - loop_node_ids = [] - for node_id in nodes_subset: - loop_context = self.get_node_loop_context(node_id) - ## Note: this only checks for the nodes that have this loop id as their outer loop - if not loop_context.is_empty() and loop_id == loop_context.get_outer(): - loop_node_ids.append(node_id) - ## TODO: Assert that this is closed w.r.t. partial order - return loop_node_ids - - - ## This function unrolls a single loop, by first finding all its nodes (they must be contiguous) and then creating new versions of them - ## that are concretized. Its second argument describes which subset of all partial order nodes we want to look at. - ## That is necessary because when unrolling nested loops, we might end up in a situation where we have unrolled the - ## outer loop, but some of the newly created nodes might still be loop nodes (so we might have loop nodes for the same loop in multiple locations). 
- def unroll_single_loop(self, loop_id: int, nodes_subset: "list[NodeId]"): - logging.info(f'Unrolling loop with id: {loop_id}') - all_loop_node_ids = self.find_outer_loop_sub_partial_order(loop_id, nodes_subset) - - ## We don't want to unroll already committed nodes - loop_node_ids = [nid for nid in all_loop_node_ids - if not self.is_committed(nid)] - - logging.debug(f'Node ids for loop: {loop_id} are: {loop_node_ids}') - - ## Create the new nodes and remap adjacencies accordingly - node_mappings = {} - for node_id in loop_node_ids: - node = self.get_node(node_id) - new_loop_node_id = self.create_node_id_with_one_less_loop_from_loop_node(node_id, loop_id) - node_mappings[node_id] = new_loop_node_id - ## The new node has one less loop context than the previous one - node_loop_contexts = node.get_loop_context() - logging.debug(f'Node: {node_id} loop_contexts: {node_loop_contexts}') - assert(node_loop_contexts.get_outer() == loop_id) - new_node_loop_contexts = copy.deepcopy(node_loop_contexts) - new_node_loop_contexts.pop_outer() - - ## Create the new node - self.nodes[new_loop_node_id] = Node(new_loop_node_id, node.cmd, node.asts, new_node_loop_contexts) - self.executions[new_loop_node_id] = 0 - logging.debug(f'New loop ids: {node_mappings}') - - ## Create the new adjacencies, by mapping adjacencies in the node set to the new node ids - ## and leaving outside adjacencies as they are - for _, new_node_id in node_mappings.items(): - self.adjacency[new_node_id] = [] - - for node_id, new_node_id in node_mappings.items(): - old_prev_ids = self.get_prev(node_id) - ## Modify all id to be in the new set except for the - new_prev_ids = PartialProgramOrder.map_using_mapping(old_prev_ids, node_mappings) - self.inverse_adjacency[new_node_id] = new_prev_ids - for new_prev_id in new_prev_ids: - self.adjacency[new_prev_id].append(new_node_id) - - ## TODO: The rest of the code here makes assumptions about the shape of the partial order - - ## Modify the previous node of the loop 
nodes - new_nodes_sinks = self.get_sub_po_sink_nodes(list(node_mappings.values())) - assert(len(new_nodes_sinks) == 1) - new_nodes_sink = new_nodes_sinks[0] - logging.debug(f'The sink of the new iteration for loop: {loop_id} is {new_nodes_sink}') - - old_nodes_sources = self.get_sub_po_source_nodes(list(node_mappings.keys())) - assert(len(old_nodes_sources) == 1) - old_nodes_source = old_nodes_sources[0] - - old_next_node_ids = self.get_next(new_nodes_sink) - assert(len(old_next_node_ids) <= 1) - - previous_ids = self.get_sub_po_prev_nodes(loop_node_ids) - assert(len(previous_ids) <= 1) - - ## Add a new edge between the new_sink (concrete iter) and the old_source (loop po) - self.add_edge(new_nodes_sink, old_nodes_source) - - ## Remove the old previous edge of the old_source if it exists - if len(previous_ids) == 1: - previous_id = previous_ids[0] - logging.debug(f'Previous node id for loop: {loop_id} is {previous_id}') - self.remove_edge(from_id=previous_id, - to_id=old_nodes_source) - - - ## Return the new first node and all node mappings - return node_mappings[old_nodes_source], node_mappings.values() - - ## Static method that just maps using a node mapping dictionary or leaves them as - ## they are if not - def map_using_mapping(node_ids: "list[NodeId]", mapping) -> "list[NodeId]": - new_node_ids = [] - for node_id in node_ids: - if node_id in mapping: - new_id = copy.deepcopy(mapping[node_id]) - else: - new_id = copy.deepcopy(node_id) - new_node_ids.append(new_id) - return new_node_ids - - ## This unrolls a sequence of loops by unrolling each loop outside-in - def unroll_loops(self, loop_contexts: LoopStack) -> NodeId: - logging.debug(f'Unrolling the following loops: {loop_contexts}') - - ## All new node_ids - all_new_node_ids = set() - relevant_node_ids = list(self.nodes.keys()) - for loop_ctx in loop_contexts.outer_to_inner(): - new_first_node_id, new_node_ids = self.unroll_single_loop(loop_ctx, relevant_node_ids) - logging.debug(f'New node ids after 
unrolling: {new_node_ids}') - ## Update all new nodes that we have added - all_new_node_ids.update(new_node_ids) - - ## Re-set the relevant node ids to only the new nodes (if we unrolled a big loop once, - ## we just want to look at those new unrolled nodes for the next unrolling). - relevant_node_ids = new_node_ids - - logging.debug(f' >>> Edges after unrolling : {self.adjacency}') - logging.debug(f' >>> Inv Edges after unrolling: {self.inverse_adjacency}') - - ## Add all new standard nodes to the workset (since they have to be tracked) - for new_node_id in all_new_node_ids: - if not self.is_loop_node(new_node_id): - self.workset.append(new_node_id) - ## GL: 08-24-2023: This might not the best way to treat this as we need - ## to update the env half way through the loop. - ## For now, we just copy the env from the parent loop node - non_iter_id = new_node_id.get_non_iter_id() - logging.debug(f"Copying latest env from loop context to loop node: {non_iter_id} -> {new_node_id}") - self.latest_envs[new_node_id] = self.latest_envs[non_iter_id] - - ## KK 2023-05-22 Do we need to correctly populate the resolved set of next commands - ## after unrolling the loop. - - return new_first_node_id - - ## This unrolls a loop given a target concrete node id - def unroll_loop_node(self, target_concrete_node_id: NodeId): - raw_node_id = target_concrete_node_id.get_non_iter_id() - assert(self.is_loop_node(raw_node_id)) - - logging.debug(f'Edges: {self.adjacency}') - - ## Find the closest non-committed successor with this node id - ## Note: This is necessary because we might need to unroll only a subset of the loops that a node is part of. - ## This is relevant when we have nested loops. 
- all_non_committed = self.get_all_non_committed() - all_non_committed_loop_nodes = self.filter_loop_nodes(all_non_committed) - logging.debug(f'All non committed loop nodes: {all_non_committed_loop_nodes}') - source_node_ids = self.get_min(all_non_committed_loop_nodes) - ## Note: This assertion might not hold once we have actual partial orders - assert(len(source_node_ids) == 1) - node_id = source_node_ids[0] - logging.debug(f'Closest non-committed loop node successor with raw_id {raw_node_id} is: {node_id}') - loop_contexts = self.get_node_loop_context(node_id) - - - ## Unroll all loops that this node is in - new_first_node_id = self.unroll_loops(loop_contexts) - - ## TODO: This needs to change when we modify unrolling to happen speculatively too - ## TODO: This needs to properly add the node to frontier and to resolve dictionary - - # GL 2023-05-22: __frontier_commit_and_push() should be called here instead of step_forward() - # Although without it the test cases pass - self.frontier.append(new_first_node_id) - - ## At the end of unrolling the target node must be part of the PO - assert(self.is_node_id(target_concrete_node_id)) - - - def maybe_unroll(self, node_id: NodeId) -> NodeId: - ## Only unrolls this node if it doesn't already exist in the PO - if not self.is_node_id(node_id): - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "Unrolling", node_id) - self.unroll_loop_node(node_id) - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "Unrolling", node_id) - ## The node_id must be part of the PO after unrolling, otherwise we did something wrong - assert(self.is_node_id(node_id)) - - - ## Pushes the frontier forward as much as possible for all commands in it that can be committed - ## This function is not safe to call on its own, since it might leave the PO in a broken state - ## It should be called right after - def __frontier_commit_and_push(self): - logging.debug(" > Commiting and pushing frontier") - logging.debug(f' > 
Frontier: {self.frontier}') - changes_in_frontier = True - while changes_in_frontier: - new_frontier = [] - changes_in_frontier = False - # Second condition below may be unecessary - for frontier_node in self.frontier: - ## If a node is not in the workset it means that it is actually done executing - ## KK 2023-05-10 Do we need all these conditions in here? Some might be redundant? - if frontier_node not in self.get_currently_executing() \ - and frontier_node not in self.get_committed() \ - and frontier_node not in self.stopped \ - and frontier_node not in self.speculated \ - and frontier_node not in self.workset \ - and not self.is_loop_node(frontier_node) \ - and frontier_node not in self.waiting_for_frontend: - ## Commit the node - self.commit_node(frontier_node) - - ## Add its non-loop successors to the frontier - next_nodes = self.get_next(frontier_node) - next_standard_nodes = self.filter_standard_nodes(next_nodes) - logging.trace(f"FrontierAdd|{','.join(str(node_id) for node_id in next_standard_nodes)}") - new_frontier.extend(next_standard_nodes) - - ## There are some changes in the frontier so we need to reenter the loop - changes_in_frontier = True - # If node is still being executed, we cannot progress further - else: - new_frontier.extend([frontier_node]) - if frontier_node in self.get_currently_executing(): - logging.debug(f" > Node {frontier_node} is still being executed") - elif frontier_node in self.get_committed(): - logging.debug(f" > Node {frontier_node} is already committed") - elif frontier_node in self.stopped: - logging.debug(f" > Node {frontier_node} is stopped") - elif frontier_node in self.speculated: - logging.debug(f" > Node {frontier_node} is speculated") - elif frontier_node in self.workset: - logging.debug(f" > Node {frontier_node} is in the workset") - elif self.is_loop_node(frontier_node): - logging.debug(f" > Node {frontier_node} is a loop node") - elif frontier_node in self.waiting_for_frontend: - logging.debug(f" > Node 
{frontier_node} is waiting for frontend") - logging.debug(f" > Not commiting node {frontier_node}, keeping in frontier") - - ## Update the frontier to the new frontier - self.frontier = new_frontier - - - ## For a file - dir forward dependency to exist, - ## we need the succeding command to attempt to read anything that is a subpath of the - ## write set of the preceeding command. - ## e.g. in: W1: {/foo/} | R2: {/f1, /foo/f2, /foo/bar/f3} - ## /foo/f2 and /foo/bar/f3 will trigger the dependency check. - def has_dir_file_dependency(self, first_cmd_set, second_cmd_set): - # Get all directory paths without the "/" in the end - dirs = {dir_path[:-1] for dir_path in first_cmd_set if dir_path.endswith("/")} - # Get all files in a separate set - to_check = {filepath for filepath in second_cmd_set if not filepath.endswith("/")} - for dir in dirs: - for other_path in to_check: - if self.is_subpath(dir, other_path): - logging.debug(f' > File forward dependency found C1:({dir}) C2:({other_path})') - return True return False - - def is_subpath(self, dir, other_path): - other_path.startswith(os.path.abspath(dir)+os.sep) - - def has_forward_dependency(self, first_id, second_id): - first_write_set = set(self.rw_sets[first_id].get_write_set()) - second_read_set = set(self.rw_sets[second_id].get_read_set()) - logging.debug(f'Checking dependencies between {first_id} and {second_id}') - if not first_write_set.isdisjoint(second_read_set): - logging.debug(f' > Forward dependency found {first_write_set.intersection(second_read_set)}') - return True - - elif self.has_dir_file_dependency(first_write_set, second_read_set): - return True - else: - logging.debug(f' > No dependencies') - return False - - def get_all_next_non_committed_nodes(self) -> "list[NodeId]": - next_non_committed_nodes = [] - for cmd_id in self.get_all_non_committed(): - if cmd_id in self.workset and self.is_next_non_committed_node(cmd_id): - next_non_committed_nodes.append(cmd_id) - return next_non_committed_nodes - - 
def is_next_non_committed_node(self, node_id: NodeId) -> bool: - # We want the predecessor to be committed and the current node to not be committed - for prev_node in self.get_prev(node_id): - if not (self.is_committed(prev_node) and not self.is_committed(node_id)): - return False - return True - - # This command never leaves the partial order at a broken state - # It is always safe to call it - def attempt_move_stopped_to_workset(self): - new_stopped = self.stopped.copy() - ## We never remove stopped commands that are unsafe - ## from the stopped set to be reexecuted. - for cmd_id in self.get_stopped_safe(): - if self.is_next_non_committed_node(cmd_id): - self.workset.append(cmd_id) - logging.debug(f"StoppedRemove|{cmd_id}") - new_stopped.remove(cmd_id) - self.to_be_resolved[cmd_id] = [] - self.stopped = new_stopped - - ## TODO: Eventually, in the future, let's add here some form of limit - def schedule_work(self, limit=0): - if self.is_first_node_when_env_is_uninitialized(config.SPECULATE_IMMEDIATELY): - logging.debug("Not scheduling work yet, waiting for first Wait") - return - # self.log_partial_program_order_info() - logging.debug("Rerunning stopped commands") - # attempt_move_stopped_to_workset() needs to happen before the node execution - self.attempt_move_stopped_to_workset() - ## GL 2023-07-05 populate_to_be_resolved_dict() is OK to call anywhere, - ## __frontier_commit_and_push() is not safe to call here - self.populate_to_be_resolved_dict() - - ## TODO: Move loop unrolling here for speculation too - - for cmd_id in self.get_workset(): - # We only need to schedule non-committed and non-executing nodes - if not (cmd_id in self.get_committed() or \ - cmd_id in self.commands_currently_executing): - self.schedule_node(cmd_id) - assert(self.valid()) - - # Nodes to be scheduled are always not committed and not executing - def schedule_node(self, cmd_id): - # This replaced the old frontier check - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", 
"RunNode", cmd_id) - if self.is_next_non_committed_node(cmd_id): - # TODO: run this and before committing kill any speculated commands still executing - self.run_cmd_non_blocking(cmd_id) - else: - if not cmd_id in self.speculated: - self.speculate_cmd_non_blocking(cmd_id) - return - - ## Run a command and add it to the dictionary of executing ones - def run_cmd_non_blocking(self, node_id: NodeId): - ## A command should only be run if it's in the frontier, otherwise it should be spec run - logging.debug(f'Running command: {node_id} {self.get_node(node_id)}') - logging.debug(f"ExecutingAdd|{node_id}") - self.to_be_resolved_prev[node_id] = self.to_be_resolved[node_id].copy() - self.execute_cmd_core(node_id, speculate=False) - - ## Run a command and add it to the dictionary of executing ones - def speculate_cmd_non_blocking(self, node_id: NodeId): - logging.debug(f'Speculating command: {node_id} {self.get_node(node_id)}') - ## TODO: Since these (this and the function above) - ## are relevant for the report maker, - ## add them in some library (e.g., trace_for_report) - ## so that we don't accidentally delete them. - logging.debug(f"ExecutingSandboxAdd|{node_id}") - self.execute_cmd_core(node_id, speculate=True) - - def execute_cmd_core(self, node_id: NodeId, speculate=False): - node = self.get_node(node_id) - ## TODO: Read and pass the actual variables in this - variables = {} - is_safe = analysis.safe_to_execute(node.asts, variables) - if not is_safe: - logging.debug(f'Command: "{node}" is not safe to execute, sending to the original shell to execute...') - - ## Keep some state around to determine that this command is not safe to execute. - self.stopped.add(node_id) - self.unsafe.add(node_id) - ## TODO: After we respond to the wait, we need to invalidate all later - ## commands as if they had dependencies with it. In the future, - ## we can be smarter with it. 
Many unsafe commands will not have - ## other side-effects, so we don't need to invalidate anything after them. - return - - cmd = node.get_cmd() - self.executions[node_id] += 1 - env_file_to_execute_with = self.get_latest_env_file_for_node(node_id) - logging.debug(f"Executing with environment file: {env_file_to_execute_with}") - if speculate: - execute_func = executor.async_run_and_trace_command_return_trace_in_sandbox_speculate - else: - execute_func = executor.async_run_and_trace_command_return_trace - - proc, trace_file, stdout, stderr, post_execution_env_file = execute_func(cmd, node_id, env_file_to_execute_with) - self.commands_currently_executing[node_id] = (proc, trace_file, stdout, stderr, post_execution_env_file) - logging.debug(f" >>>>> Command {node_id} - {proc.pid} just started executing - {post_execution_env_file}") - - # This method attempts to add to workset (rerun) - # any command that found to have a dependency through early resolution - def attempt_rerun_pending_nodes(self): - restarted_nodes = set() - for node_id, run_after_nodes in self.run_after.items(): - new_run_after_nodes = run_after_nodes.copy() - if self.get_new_env_file_for_node(node_id) is not None and node_id not in self.pending_to_execute and node_id not in self.get_currently_executing(): - for node in run_after_nodes: - if node not in self.get_currently_executing(): - logging.debug(f"Running node {node} after execution of {node_id}") - self.workset.append(node) - self.pending_to_execute.discard(node) - self.set_latest_env_file_for_node(node, self.get_new_env_file_for_node(node_id)) - restarted_nodes.add(node) - self.prechecked_env.discard(node) - new_run_after_nodes.discard(node) - self.run_after[node_id] = new_run_after_nodes - return restarted_nodes - - def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sandbox_dir: str): - log_time_delta_from_named_timestamp("PartialOrder", "RunNode", node_id) - 
log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolution", node_id, key=f"PostExecResolution-{node_id}") - - logging.debug(f" --- Node {node_id}, just finished execution ---") - self.sandbox_dirs[node_id] = sandbox_dir - ## TODO: Store variable file somewhere so that we can return when wait - - _proc, trace_file, stdout, stderr, post_execution_env_file = self.commands_currently_executing.pop(node_id) - - logging.trace(f"ExecutingRemove|{node_id}") - # Handle stopped by riker due to network access - if int(riker_exit_code) == 159: - logging.debug(f" > Adding {node_id} to stopped because it tried to access the network.") - logging.trace(f"StoppedAdd|{node_id}:network") - self.stopped.add(node_id) - else: - trace_object = executor.read_trace(sandbox_dir, trace_file) - cmd_exit_code = trace.parse_exit_code(trace_object) - - ## Save the completed node info. Note that if the node doesn't commit - ## this information will be invalid and rewritten the next time execution - ## is completed for this node. 
- completed_node_info = CompletedNodeInfo(cmd_exit_code, post_execution_env_file, stdout, sandbox_dir) - self.nodes[node_id].set_completed_info(completed_node_info) - - ## We no longer add failed commands to the stopped set, - ## because this leads to more repetitions than needed - ## and does not allow us to properly speculate commands - read_set, write_set = trace.parse_and_gather_cmd_rw_sets(trace_object) - rw_set = RWSet(read_set, write_set) - self.update_rw_set(node_id, rw_set) - - if node_id in self.stopped: - logging.debug(f"Nothing new to be resolved since {node_id} exited with an error.") - if node_id in self.workset: - self.workset.remove(node_id) - logging.debug(f"WorksetRemove|{node_id}") - # If no commands can be resolved this round, - # do nothing and wait until a new command finishes executing - logging.debug("No resolvable nodes were found in this round, nothing will change...") - return - - - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolutionECCheck", node_id, key=f"PostExecResolution-{node_id}", invalidate=False) - # Remove from workset and add it again later if necessary - self.workset.remove(node_id) - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolutionFrontendWait", node_id) - - ## Here we check if the most recent env has been received. If not, we cannot resolve anything just yet. - if self.get_new_env_file_for_node(node_id) is None: - logging.debug(f"Node {node_id} has not received its latest env from runtime yet. Waiting...") - self.waiting_for_frontend.add(node_id) - - # We will however attempt to resolve dependencies early - self.resolve_dependencies_early(node_id) - restarted_cmds = self.attempt_rerun_pending_nodes() - self.log_partial_program_order_info() - ## Here we continue with the normal execution flow - else: - logging.debug(f"Node {node_id} has already received its latest env from runtime. 
Examining differences...") - self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) - - #TODO: Remove ths in the future - we need a more robust approach to check for env diffs. - def exclude_insignificant_diffs(self, env_diff_dict): - return {k: v for k, v in env_diff_dict.items() if k not in config.INSIGNIFICANT_VARS} - - #TODO: Remove ths in the future - we need a more robust approach to check for env diffs. - def include_only_significant_vars(self, env_diff_dict): - return {k: v for k, v in env_diff_dict.items() if k in config.SIGNIFICANT_VARS} - - def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_in_both): - # Exclude insignificant differences - only_in_new_sig = self.include_only_significant_vars(only_in_new) - only_in_latest_sig = self.include_only_significant_vars(only_in_latest) - different_in_both_sig = self.include_only_significant_vars(different_in_both) - # If still diffs are present, return False - if len(only_in_new_sig) > 0 or len(only_in_latest_sig) > 0 or len(different_in_both_sig) > 0: - logging.debug("Significant differences found:") - logging.debug(f"Unique to new (Wait): {only_in_new_sig}") - logging.debug(f"Unique to latest (Before Riker): {only_in_latest_sig}") - logging.debug(f"Differing values: {different_in_both_sig}") - return True - else: - logging.debug("No significant differences found:") - return False - - def update_env_and_restart_nodes(self, node_id: NodeId): - logging.debug(f"Significant differences found between new and latest env files for {node_id}.") - logging.debug(f"Assigning node {node_id} new env (Wait) as the new latest env and re-executing.") - self.set_latest_env_file_for_node(node_id, self.get_new_env_file_for_node(node_id)) - self.prechecked_env.discard(node_id) - if node_id not in self.workset: - self.workset.append(node_id) - self.__kill_all_currently_executing_and_schedule_restart(start=node_id) - new_waiting_for_frontend = 
self.waiting_for_frontend.copy() - for waiting_for_frontend_node in self.waiting_for_frontend: - if waiting_for_frontend_node not in self.workset and waiting_for_frontend_node in self.get_transitive_closure([node_id]): - self.workset.append(waiting_for_frontend_node) - new_waiting_for_frontend.remove(waiting_for_frontend_node) - most_recent_new_env = self.get_most_recent_possible_new_env_for_node(waiting_for_frontend_node) - self.set_latest_env_file_for_node(waiting_for_frontend_node, most_recent_new_env) - self.prechecked_env.discard(waiting_for_frontend_node) - assert(self.get_new_env_file_for_node(node_id) is not None) - assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None) - self.log_partial_program_order_info() - logging.debug("-") - self.waiting_for_frontend = new_waiting_for_frontend - self.populate_to_be_resolved_dict() - - def resolve_most_recent_envs_check_only_wait_node_early(self, node_id: NodeId, restarted_cmds=None): - if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), - self.get_latest_env_file_for_node(node_id)): - self.update_env_and_restart_nodes(node_id) - else: - self.prechecked_env.add(node_id) - def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(self, node_id: NodeId, restarted_cmds=None): - logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.") - self.waiting_for_frontend.discard(node_id) - if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), - self.get_latest_env_file_for_node(node_id)): - self.update_env_and_restart_nodes(node_id) + # TODO: It's currently designed this way to avoid reading trace file all the time + # When we have complex caching code for this we can make this go away + def has_fs_deps(self, concrete_node_id: ConcreteNodeId): + self.fetch_fs_actions() + 
self._has_fs_deps(concrete_node_id) + + ### external handler events ### + + def schedule_work(self, concrete_node_id: ConcreteNodeId, env_file: str): + event_log("schedule_work") + self.get_concrete_node(concrete_node_id).start_executing(env_file) + + def schedule_spec_work(self, concrete_node_id: ConcreteNodeId, env_file: str): + event_log("schedule_spec") + self.adjust_to_be_resolved_dict_entry(concrete_node_id) + self.get_concrete_node(concrete_node_id).start_spec_executing(env_file) + + def handle_complete(self, concrete_node_id: ConcreteNodeId, has_pending_wait: bool, + current_env: str): + event_log(f"handle_complete {concrete_node_id}") + node = self.get_concrete_node(concrete_node_id) + # TODO: complete the state matching + if node.is_executing(): + node.commit_frontier_execution() + self.adjust_to_be_resolved_dict() + elif node.is_spec_executing(): + if self.has_fs_deps(concrete_node_id): + node.reset_to_ready() + # otherwise it stays in ready state and waits to be scheduled by the scheduler + if has_pending_wait: + node.start_executing(current_env) + else: + node.finish_spec_execution() + if has_pending_wait: + node.commit_speculated() + self.adjust_to_be_resolved_dict() else: - logging.debug(f"Finding sets of commands that can be resolved after {node_id} finished executing and got its latest env") - assert(node_id not in self.stopped) - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "WaitingToResolve", node_id) - self.add_to_speculated(node_id) - self.resolve_dependencies_early(node_id) - restarted_cmds = self.attempt_rerun_pending_nodes() - logging.debug(f"Restarted after successful env resolution {restarted_cmds}") - self.log_partial_program_order_info() - self.resolve_commands_that_can_be_resolved_and_push_frontier() - assert(self.valid()) - - def maybe_resolve_most_recent_envs_and_continue_resolution(self, node_id: NodeId): - if node_id in self.waiting_for_frontend: - logging.debug(f"Node {node_id} received its new env from 
runtime, continuing full env resolution.") - self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) + assert False + + def reset_succeeding_nodes(self, node_id: NodeId, env_file: str): + # TODO: fixme + pass + # for uncommitted_node_id in self.get_all_next(node_id): + # uncommitted_node = self.get_concrete_node(uncommitted_node_id) + # if uncommitted_node.is_spec_executing(): + # uncommitted_node.reset_to_ready() + # # uncommitted_node.start_spec_executing(env_file) + + def adding_new_basic_block(self, concrete_node_id: ConcreteNodeId): + basic_block = self.hsprog.find_basic_block(concrete_node_id.node_id) + if len(self.concrete_nodes) != 0: + prev_concrete_node_id = next(reversed(self.concrete_nodes)) else: - logging.debug(f"Node {node_id} received its new env from runtime, continuing early env resolution.") - self.resolve_most_recent_envs_check_only_wait_node_early(node_id) - - def new_and_latest_env_files_have_significant_differences(self, new_env_file, latest_env_file): - # Early resolution if same files are compared - if new_env_file == latest_env_file: - logging.debug(f"Env files are the same. 
No need to compare.") - return False - logging.debug(f"Comparing new and latest env files: {new_env_file} {latest_env_file}") - assert(latest_env_file is not None) - - new_env = executor.read_env_file(new_env_file) - latest_env = executor.read_env_file(latest_env_file) - - only_in_new, only_in_latest, different_in_both = util.compare_env_strings(new_env, latest_env) - - return self.significant_diff_in_env_dicts(only_in_new, only_in_latest, different_in_both) - - def print_cmd_stderr(self, stderr): - # stdout.seek(0) - # print(stdout.read().decode(), end="") - stderr.seek(0) - print(stderr.read().decode(), file=sys.stderr, end="") - - def commit_cmd_workspaces(self, to_commit_ids): - for cmd_id in sorted(to_commit_ids): - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "CommitNode", cmd_id) - workspace = self.sandbox_dirs[cmd_id] - if workspace != "": - logging.debug(f" (!) Committing workspace of cmd {cmd_id} found in {workspace}") - commit_workspace_out = executor.commit_workspace(workspace) - logging.debug(commit_workspace_out.decode()) + prev_concrete_node_id = None + loop_iters = concrete_node_id.loop_iters + for abstract_node_id in basic_block.node_ids: + new_concrete_node_id = ConcreteNodeId(abstract_node_id, loop_iters) + new_concrete_node = ConcreteNode(new_concrete_node_id, + basic_block.get_node(abstract_node_id)) + new_concrete_node.transition_from_init_to_ready() + if new_concrete_node.command_unsafe(): + new_concrete_node.transition_from_ready_to_unsafe() + self.concrete_nodes[new_concrete_node_id] = new_concrete_node + if prev_concrete_node_id is not None: + self.prev_concrete_node[new_concrete_node_id] = [prev_concrete_node_id] else: - logging.debug(f" (!) 
No need to commit workspace of cmd {cmd_id} as it was run in the main workspace") - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "CommitNode", cmd_id) - - def log_rw_sets(self): - logging.debug("====== RW Sets " + "=" * 65) - for node_id, rw_set in self.rw_sets.items(): - logging.debug(f"ID:{node_id} | R.size:{len(rw_set.get_read_set()) if rw_set is not None else None} | W:{rw_set.get_write_set() if rw_set is not None else None}") - - def log_partial_program_order_info(self): - logging.debug(f"=" * 80) - logging.debug(f"WORKSET: {self.get_workset()}") - logging.debug(f"COMMITTED: {self.get_committed_list()}") - logging.debug(f"FRONTIER: {self.frontier}") - logging.debug(f"EXECUTING: {list(self.commands_currently_executing.keys())}") - logging.debug(f"STOPPED: {list(self.stopped)}") - logging.debug(f" of which UNSAFE: {list(self.get_unsafe())}") - logging.debug(f"WAITING: {sorted(list(self.speculated))}") - logging.debug(f"for FRONTEND: {sorted(list(self.waiting_for_frontend))}") - logging.debug(f"TO RESOLVE: {self.to_be_resolved}") - logging.debug(f"PENDING TO EXEC: {self.pending_to_execute}") - logging.debug(f"RUN AFTER: {self.run_after}") - logging.debug(f"New envs: {self.new_envs}") - logging.debug(f"Latest envs: {self.latest_envs}") - self.log_rw_sets() - logging.debug(f"=" * 80) - - ## TODO: Document how this finds the to be resolved dict - def populate_to_be_resolved_dict(self): - logging.debug("Populating the resolved dictionary for all nodes") - for node_id in self.nodes: - if self.is_committed(node_id): - logging.debug(f" > Node: {node_id} is committed, emptying its dict") - self.to_be_resolved[node_id] = [] - continue - # We don't want to modify the set of nodes to check for dependencies for this node - # as it started running before previous cmds had started executing - elif node_id in self.speculated: - logging.debug(f" > Node: {node_id} is waiting to be resolved, skipping...") - continue - elif node_id in 
self.get_currently_executing(): - logging.debug(f" > Node: {node_id} is currently executing, skipping...") - continue - elif node_id in self.waiting_for_frontend: - logging.debug(f" > Node: {node_id} is currently waiting for frontend, skipping...") - continue + self.prev_concrete_node[new_concrete_node_id] = [] + prev_concrete_node_id = new_concrete_node_id + assert concrete_node_id in self.concrete_nodes + + def finish_wait_unsafe(self, concrete_node_id: ConcreteNodeId): + node = self.concrete_nodes[concrete_node_id] + node.commit_unsafe_node() + + def handle_wait(self, concrete_node_id: ConcreteNodeId, env_file: str): + event_log(f"handle_wait {concrete_node_id}") + + if not concrete_node_id in self.concrete_nodes: + abstract_node_id = concrete_node_id.node_id + assert self.hsprog.is_start_of_block(abstract_node_id) + self.adding_new_basic_block(concrete_node_id) + util.debug_log("try to add concrete node here") + util.debug_log(repr(self.prev_concrete_node)) + util.debug_log("") + node = self.get_concrete_node(concrete_node_id) + + # Invalid state check + if node.is_committed() or node.is_initialized(): + logging.error(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}') + raise Exception(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}') + + if node.is_ready(): + node.start_executing(env_file) + elif node.is_unsafe(): + pass + elif node.is_stopped(): + if node in self.get_frontier(): + logging.info(f'Node {concrete_node_id} is stopped and in the frontier.') + node.transition_from_stopped_to_executing(env_file) else: - logging.debug(f" > Node: {node_id} is not executing or waiting to be resolved (speculated) so we modify its set.") - self.to_be_resolved[node_id] = [] - traversal = [] - relevant_committed = self.get_committed() - if node_id not in relevant_committed: - to_add = self.get_prev(node_id).copy() - traversal = to_add.copy() - to_be_resolved_nodes_ids = to_add.copy() - while len(traversal) > 0: - current_node_id = 
traversal.pop(0) - if current_node_id not in relevant_committed: - to_add = self.get_prev(current_node_id) - to_be_resolved_nodes_ids.extend(to_add) - traversal.extend(to_add) - self.to_be_resolved[node_id] = to_be_resolved_nodes_ids.copy() - self.to_be_resolved[node_id] = list(set(self.to_be_resolved[node_id]) - set(relevant_committed)) - logging.debug(f' |> New to be resolved set: {self.to_be_resolved[node_id]}') - - def get_currently_executing(self) -> list: - return sorted(list(self.commands_currently_executing.keys())) - - def log_executions(self): - logging.debug("---------- (Re)executions ------------") - for cmd in sorted(self.get_committed_list()): - logging.debug(f" CMD {cmd} executed {self.executions[cmd]} times") - logging.debug(f"Executions|{cmd},{self.executions[cmd]}") - logging.debug(f" Total (re)executions: {sum(list(self.executions.values()))}") - logging.debug(f"TotalExec|{sum(list(self.executions.values()))}") - logging.debug("--------------------------------------") - - -## TODO: Try to move those to PaSh and import them here -def parse_cmd_from_file(file_path: str) -> "tuple[str,list[AstNode]]": - logging.debug(f'Parsing: {file_path}') - with open(file_path) as f: - cmd = f.read() - asts = analysis.parse_shell_to_asts(file_path) - return cmd, asts - -def parse_edge_line(line: str) -> "tuple[int, int]": - from_str, to_str = line.split(" -> ") - return (int(from_str), int(to_str)) - -def parse_loop_context_line(line: str) -> "tuple[int, list[int]]": - node_id, loop_contexts_raw = line.split("-loop_ctx-") - if loop_contexts_raw != "": - loop_contexts_str = loop_contexts_raw.split(",") - loop_contexts = [int(loop_ctx) for loop_ctx in loop_contexts_str] - else: - loop_contexts = [] - return int(node_id), loop_contexts - -def parse_loop_contexts(lines): - loop_contexts = {} - for line in lines: - node_id, loop_ctx = parse_loop_context_line(line) - loop_contexts[node_id] = loop_ctx - - return loop_contexts - -def 
parse_partial_program_order_from_file(file_path: str) -> PartialProgramOrder: - with open(file_path) as f: - raw_lines = f.readlines() - - ## Filter comments and remove new lines - lines = [line.rstrip() for line in raw_lines - if not line.startswith("#")] - - ## The directory in which cmd_files are - cmds_directory = str(lines[0]) - logging.debug(f'Cmds are stored in: {cmds_directory}') - - ## The initial env file - initial_env_file = str(lines[1]) - - ## The number of nodes - number_of_nodes = int(lines[2]) - logging.debug(f'Number of po cmds: {number_of_nodes}') - - ## The loop context for each node - loop_context_start=3 - loop_context_end=number_of_nodes+3 - loop_context_lines = lines[loop_context_start:loop_context_end] - loop_contexts = parse_loop_contexts(loop_context_lines) - logging.debug(f'Loop contexts: {loop_contexts}') - - ## The rest of the lines are edge_lines - edge_lines = lines[loop_context_end:] - logging.debug(f'Edges: {edge_lines}') - - nodes = {} - for i in range(number_of_nodes): - file_path = f'{cmds_directory}/{i}' - cmd, asts = parse_cmd_from_file(file_path) - loop_ctx = loop_contexts[i] - nodes[NodeId(i)] = Node(NodeId(i), cmd, - asts=asts, - loop_context=LoopStack(loop_ctx)) - - edges = {NodeId(i) : [] for i in range(number_of_nodes)} - for edge_line in edge_lines: - from_id, to_id = parse_edge_line(edge_line) - edges[NodeId(from_id)].append(NodeId(to_id)) - - logging.trace(f"Nodes|{','.join([str(node) for node in nodes])}") - logging.trace(f"Edges|{edges}") - return PartialProgramOrder(nodes, edges, initial_env_file) + logging.info(f'Node {concrete_node_id} is stopped but not in the frontier.') + elif node.is_speculated(): + # Check if env conflicts exist + if node.has_env_conflict_with(env_file): + util.debug_log(f'prev_env: {node.exec_ctxt.pre_env_file}, real: {env_file}') + node.reset_to_ready() + node.start_executing(env_file) + self.reset_succeeding_nodes(concrete_node_id, env_file) + # Optimization: It would make sense to perform 
the checks independently, + # and if fs conflict, then update the run after dict. + elif self.has_fs_deps(concrete_node_id): + node.reset_to_ready() + node.start_executing(env_file) + else: + node.commit_speculated() + self.adjust_to_be_resolved_dict() + elif node.is_executing(): + if node.has_env_conflict_with(env_file): + node.reset_to_ready() + node.start_executing(env_file) + self.reset_succeeding_nodes(concrete_node_id, env_file) + elif node.is_spec_executing(): + if node.has_env_conflict_with(env_file): + node.reset_to_ready() + self.reset_succeeding_nodes(concrete_node_id, env_file) + else: + logging.error(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}') + raise Exception(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}') + + def eager_fs_killing(self): + event_log("try to eagerly kill conflicted speculation") + to_be_killed = [] + self.fetch_fs_actions() + for node in self.get_all_nodes(): + if ((node.is_speculated() or node.is_spec_executing()) + and self._has_fs_deps(node.cnid)): + to_be_killed.append(node) + for node in to_be_killed: + node.reset_to_ready() diff --git a/parallel-orch/run_command.sh b/parallel-orch/run_command.sh index e2c46e92..9fe48f26 100755 --- a/parallel-orch/run_command.sh +++ b/parallel-orch/run_command.sh @@ -5,10 +5,12 @@ export CMD_STRING=${1?No command was given to execute} export TRACE_FILE=${2?No trace file path given} export STDOUT_FILE=${3?No stdout file given} export LATEST_ENV_FILE=${4?No env file to run with given} -export EXEC_MODE=${5?No execution mode given} -export CMD_ID=${6?No command id given} -export POST_EXEC_ENV=${7?No Riker env file given} - +export SANDBOX_DIR=${5?No sandbox dir given} +export TMPDIR=${6?No tmp dir given} +export EXEC_MODE=${7?No execution mode given} +export CMD_ID=${8?No command id given} +export POST_EXEC_ENV=${9?No Riker env file given} +export EXECUTION_ID=${10?No execution id given} ## KK 2023-04-24: Not sure this should be run every time we 
run a command ## GL 2023-07-08: Tests seem to pass without it @@ -23,10 +25,10 @@ else exit 1 fi -mkdir -p /tmp/pash_spec/a -mkdir -p /tmp/pash_spec/b -export SANDBOX_DIR="$(mktemp -d /tmp/pash_spec/a/sandbox_XXXXXXX)/" -export TEMPDIR="$(mktemp -d /tmp/pash_spec/b/sandbox_XXXXXXX)" +# mkdir -p /tmp/pash_spec/a +# mkdir -p /tmp/pash_spec/b +# export SANDBOX_DIR="$(mktemp -d /tmp/pash_spec/a/sandbox_XXXXXXX)/" +# export TEMPDIR="$(mktemp -d /tmp/pash_spec/b/sandbox_XXXXXXX)" # echo tempdir $TEMPDIR # echo sandbox $SANDBOX_DIR @@ -40,5 +42,5 @@ out=`head -3 $SANDBOX_DIR/upperdir/$TRACE_FILE` ## Assumes "${PASH_SPEC_SCHEDULER_SOCKET}" is set and exported ## Pass the proper exit code -msg="CommandExecComplete:${CMD_ID}|Exit code:${exit_code}|Sandbox dir:${SANDBOX_DIR}|Trace file:${TRACE_FILE}|Tempdir:${TEMPDIR}" +msg="CommandExecComplete:${CMD_ID}|Exec id:${EXECUTION_ID}|Exit code:${exit_code}|Sandbox dir:${SANDBOX_DIR}|Trace file:${TRACE_FILE}|Tempdir:${TEMPDIR}" daemon_response=$(pash_spec_communicate_scheduler_just_send "$msg") # Blocking step, daemon will not send response until it's safe to continue diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 29ebddbf..74a8583d 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -1,9 +1,10 @@ import argparse import logging import signal -from util import * +import util import config -from partial_program_order import parse_partial_program_order_from_file, LoopStack, NodeId, parse_node_id +from partial_program_order import PartialProgramOrder, NodeId +from node import LoopStack, ConcreteNodeId ## ## A scheduler server @@ -18,11 +19,11 @@ def handler(signum, frame): def parse_args(): parser = argparse.ArgumentParser(add_help=False) ## TODO: Import the arguments so that they are not duplicated here and in orch - parser.add_argument("-d", "--debug-level", - type=int, + parser.add_argument("-d", "--debug-level", + type=int, default=0, help="Set debugging 
level") - parser.add_argument("-f", "--log_file", + parser.add_argument("-f", "--log_file", type=str, default=None, help="Set logging output file. Default: stdout") @@ -34,7 +35,7 @@ def parse_args(): action="store_true", default=False, help="Speculate immediately instead of waiting for the first Wait message.") - + args, unknown_args = parser.parse_known_args() return args @@ -56,18 +57,20 @@ def error_response(string): class Scheduler: """ Schedules a partial order of commands to run out-of-order Flow: - input cmd -> + input cmd -> | Daemon Start -> Receive whens tarting | Init -> Read the partial order from a file | CommandExecComplete -> A command completed its execution | Wait -> The JIT component waits for the results of a specific command | Done -> We are done """ + window: int # Integer representing the window + latest_env: str # This variable should be initialized by the first wait, and always have a value since def __init__(self, socket_file): - ## TODO: Add all the orchestrator state here (it should just be the partial order) + self.window = 0 self.done = False - self.socket = init_unix_socket(socket_file) + self.socket = util.init_unix_socket(socket_file) ## A map containing connections for node_ids that are waiting for a response self.waiting_for_response = {} self.partial_program_order = None @@ -76,203 +79,119 @@ def handle_init(self, input_cmd: str): assert(input_cmd.startswith("Init")) partial_order_file = input_cmd.split(":")[1].rstrip() logging.debug(f'Scheduler: Received partial_order_file: {partial_order_file}') - self.partial_program_order = parse_partial_program_order_from_file(partial_order_file) - self.partial_program_order.init_partial_order() - - def __parse_wait(self, input_cmd: str) -> "tuple[NodeId, str]": - try: - node_id_component, loop_iter_counter_component, pash_runtime_vars_file_component = input_cmd.rstrip().split("|") - raw_node_id_int = int(node_id_component.split(":")[1].rstrip()) - loop_counters_str = 
loop_iter_counter_component.split(":")[1].rstrip() - pash_runtime_vars_file_str = pash_runtime_vars_file_component.split(":")[1].rstrip() - if loop_counters_str == "None": - node_id = NodeId(raw_node_id_int), pash_runtime_vars_file_str - else: - loop_counters = [int(cnt) for cnt in loop_counters_str.split("-")] - node_id = NodeId(raw_node_id_int, LoopStack(loop_counters)), pash_runtime_vars_file_str - return node_id - except: - raise Exception(f'Parsing failure for line: {input_cmd}') + self.partial_program_order = util.parse_partial_program_order_from_file(partial_order_file) + util.debug_log(str(self.partial_program_order.hsprog)) def handle_wait(self, input_cmd: str, connection): - assert(input_cmd.startswith("Wait")) - ## We have received this message by the JIT, which waits for a node_id to - ## finish execution. - node_id, pash_runtime_vars_file_str = self.__parse_wait(input_cmd) - logging.debug(f'Scheduler: Received wait for node_id: {node_id}|New env file: {pash_runtime_vars_file_str}') - - ## Set the new env file for the node - self.partial_program_order.set_new_env_file_for_node(node_id, pash_runtime_vars_file_str) - - if self.partial_program_order.is_first_node_when_env_is_uninitialized(config.SPECULATE_IMMEDIATELY): - logging.debug("Initializing latest env and speculating") - self.partial_program_order.init_latest_env_files(node_id) - - ## Attempt to rerun all pending nodes - self.partial_program_order.attempt_rerun_pending_nodes() - - ## Inform the partial order that we received a wait for a node so that it can push loops - ## forward and so on. 
- self.partial_program_order.maybe_unroll(node_id) - - # Moved this below wait_received, in order to support unrolled loop nodes - self.partial_program_order.maybe_resolve_most_recent_envs_and_continue_resolution(node_id) - - self.partial_program_order.wait_received(node_id) - - ## If the node_id is already committed, just return its exit code - if node_id in self.partial_program_order.get_committed(): - logging.debug(f'Node: {node_id} found in committed, responding immediately!') - self.waiting_for_response[node_id] = connection - self.respond_to_pending_wait(node_id) - elif node_id in self.partial_program_order.get_unsafe(): - logging.debug(f'Node: {node_id} found in unsafe, it must be executed in the original shell!') - self.waiting_for_response[node_id] = connection - self.respond_unsafe_to_pending_wait(node_id) - else: - ## Command has not executed yet, so we need to wait for it - logging.debug(f'Node: {node_id} has not finished execution, waiting for response...') - self.waiting_for_response[node_id] = connection - - - def __parse_command_exec_complete(self, input_cmd: str) -> "tuple[int, int]": - try: - components = input_cmd.rstrip().split("|") - command_id = parse_node_id(components[0].split(":")[1]) - exit_code = int(components[1].split(":")[1]) - sandbox_dir = components[2].split(":")[1] - trace_file = components[3].split(":")[1] - return command_id, exit_code, sandbox_dir, trace_file - except: - raise Exception(f'Parsing failure for line: {input_cmd}') - - def respond_unsafe_to_pending_wait(self, node_id: int): - assert(node_id in self.partial_program_order.get_unsafe()) - - ## First remove node_id from unsafe and stopped and add to committed - ## since it will be executed immediately in the original shell - self.partial_program_order.remove_from_unsafe(node_id) - self.partial_program_order.commit_node(node_id) - - response = unsafe_response("") - - ## Send the response - self.respond_to_frontend_core(node_id, response) - - - ## TODO: send riker env 
here - def respond_to_pending_wait(self, node_id: int): - logging.debug(f'Responding to pending wait for node: {node_id}') - ## Get the completed node info - node = self.partial_program_order.get_node(node_id) - completed_node_info = node.get_completed_node_info() - msg = f'{completed_node_info.get_exit_code()} {completed_node_info.get_post_execution_env_file()} {completed_node_info.get_stdout_file()}' - response = success_response(msg) - ## Send the response - self.respond_to_frontend_core(node_id, response) - - - def respond_to_frontend_core(self, node_id: NodeId, response: str): - assert(node_id in self.waiting_for_response) - ## Get the connection that we need to respond to - connection = self.waiting_for_response.pop(node_id) - socket_respond(connection, response) - connection.close() - - def handle_command_exec_complete(self, input_cmd: str): - assert(input_cmd.startswith("CommandExecComplete:")) - ## Read the node id from the command argument - cmd_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_complete(input_cmd) - if trace_file in self.partial_program_order.banned_files: - logging.debug(f'CommandExecComplete: {cmd_id} ignored') - return - ## Gather RWset, resolve dependencies, and progress graph - self.partial_program_order.command_execution_completed(cmd_id, exit_code, sandbox_dir) - - ## If there is a connection waiting for this node_id, respond to it - if cmd_id in self.waiting_for_response and cmd_id in self.partial_program_order.get_committed(): - self.respond_to_pending_wait(cmd_id) + concrete_node_id, env_file = self.__parse_wait(input_cmd) + self.waiting_for_response[concrete_node_id] = connection + logging.info(f'Scheduler: Received wait message - {concrete_node_id}.') + self.latest_env = env_file + self.partial_program_order.handle_wait(concrete_node_id, env_file) + concrete_node = self.partial_program_order.get_concrete_node(concrete_node_id) + if concrete_node.is_committed(): + self.respond_to_pending_wait(concrete_node_id) 
+ elif concrete_node.is_unsafe(): + self.partial_program_order.finish_wait_unsafe(concrete_node_id) + self.respond_to_wait_on_unsafe(concrete_node_id) def process_next_cmd(self): - connection, input_cmd = socket_get_next_cmd(self.socket) + connection, input_cmd = util.socket_get_next_cmd(self.socket) if(input_cmd.startswith("Init")): - log_time_delta_from_start_and_set_named_timestamp("Scheduler", "PartialOrderInit") connection.close() self.handle_init(input_cmd) - ## TODO: Read the partial order from the given file - log_time_delta_from_named_timestamp("Scheduler", "PartialOrderInit") elif (input_cmd.startswith("Daemon Start") or input_cmd == ""): - log_time_delta_from_start_and_set_named_timestamp("Scheduler", "DaemonStart") + logging.info(f'Scheduler: Received daemon start message.') connection.close() - ## This happens when pa.sh first connects to daemon to see if it is on - logging.debug(f'PaSh made first contact with scheduler server.') - log_time_delta_from_named_timestamp("Scheduler", "DaemonStart") elif (input_cmd.startswith("CommandExecComplete:")): - log_time_delta_from_start_and_set_named_timestamp("Scheduler", "CommandExecComplete") - ## We have received this message from an a runner (tracer +isolation) - ## The runner should have already parsed RWsets and serialized them to - ## a file. 
- connection.close() - self.handle_command_exec_complete(input_cmd) - log_time_delta_from_named_timestamp("Scheduler", "CommandExecComplete") + node_id, exec_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) + if self.partial_program_order.get_concrete_node(node_id).exec_id == exec_id: + logging.info(f'Scheduler: Received command exec complete message - {node_id}.') + self.partial_program_order.handle_complete(node_id, node_id in self.waiting_for_response, self.latest_env) + + if self.partial_program_order.get_concrete_node(node_id).is_committed(): + self.respond_to_pending_wait(node_id) + else: + logging.info(f'Scheduler: Received command exec complete message for a killed instance, ignoring - {node_id}.') elif (input_cmd.startswith("Wait")): - log_time_delta_from_start_and_set_named_timestamp("Scheduler", "Wait") self.handle_wait(input_cmd, connection) - log_time_delta_from_named_timestamp("Scheduler", "Wait") elif (input_cmd.startswith("Done")): - log_time_delta_from_start_and_set_named_timestamp("Scheduler", "Done") - logging.debug(f'Scheduler server received shutdown message.') - logging.debug(f'The partial order was successfully completed.') - if not self.partial_program_order.is_completed(): - logging.debug(" |- some nodes were skipped completed.") - socket_respond(connection, success_response("All finished!")) - self.partial_program_order.log_executions() + util.socket_respond(connection, success_response("All finished!")) + self.partial_program_order.log_info() self.done = True - log_time_delta_from_named_timestamp("Scheduler", "Done") else: logging.error(error_response(f'Error: Unsupported command: {input_cmd}')) raise Exception(f'Error: Unsupported command: {input_cmd}') - def check_unsafe_and_waiting(self): - ## If a command is waiting and also deemed to be unsafe, we need to respond - waiting_for_response = set(self.waiting_for_response.keys()) - unsafe = set(self.partial_program_order.get_unsafe()) - unsafe_and_waiting = 
unsafe.intersection(waiting_for_response) - if len(unsafe_and_waiting) > 0: - assert(len(unsafe_and_waiting) == 1) - logging.debug(f'Unsafe and waiting for response nodes: {unsafe_and_waiting}') - logging.debug(f'Sending responses to them: {unsafe_and_waiting}') - unsafe_and_waiting_id = list(unsafe_and_waiting)[0] - self.respond_unsafe_to_pending_wait(unsafe_and_waiting_id) - - ## This function schedules commands for execution until our capacity is reached - ## - ## It should add some work (if possible), and then return immediately. - ## It is called once per loop iteration, making sure that there is always work happening - def schedule_work(self): - log_time_delta_from_start_and_set_named_timestamp("Scheduler", "ScheduleWork") - self.partial_program_order.schedule_work() + def respond_to_frontend_core(self, node_id: NodeId, response: str): + assert(node_id in self.waiting_for_response) + ## Get the connection that we need to respond to + connection = self.waiting_for_response.pop(node_id) + util.socket_respond(connection, response) + connection.close() + + def respond_to_wait_on_unsafe(self, node_id: ConcreteNodeId): + response = unsafe_response('') + self.respond_to_frontend_core(node_id, response) + + def respond_to_pending_wait(self, node_id: ConcreteNodeId): + logging.debug(f'Responding to pending wait for node: {node_id}') + ## Get the completed node info + node = self.partial_program_order.get_concrete_node(node_id) + msg = '{} {} {}'.format(*node.execution_outcome()) + response = success_response(msg) + + ## Send the response + self.respond_to_frontend_core(node_id, response) + + def __parse_wait(self, input_cmd: str) -> "tuple[ConcreteNodeId, str]": + try: + node_id_component, loop_iter_counter_component, pash_runtime_vars_file_component = input_cmd.rstrip().split("|") + node_id = NodeId(int(node_id_component.split(":")[1].rstrip())) + loop_counters_str = loop_iter_counter_component.split(":")[1].rstrip() + pash_env_filename = 
pash_runtime_vars_file_component.split(":")[1].rstrip() + if loop_counters_str == "None": + return ConcreteNodeId(node_id), pash_env_filename + else: + loop_counters = [int(cnt) for cnt in loop_counters_str.split("-")] + return ConcreteNodeId(node_id, loop_counters), pash_env_filename + except: + raise Exception(f'Parsing failure for line: {input_cmd}') + + def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]": + try: + components = input_cmd.rstrip().split("|") + command_id = ConcreteNodeId.parse(components[0].split(":")[1]) + exec_id = int(components[1].split(":")[1]) + exit_code = int(components[2].split(":")[1]) + sandbox_dir = components[3].split(":")[1] + trace_file = components[4].split(":")[1] + return command_id, exec_id, exit_code, sandbox_dir, trace_file + except: + raise Exception(f'Parsing failure for line: {input_cmd}') + - ## Respond to any waiting nodes that have been deemed to be unsafe - self.check_unsafe_and_waiting() - log_time_delta_from_named_timestamp("Scheduler", "ScheduleWork") + def schedule_work(self): + concrete_node_ids = self.partial_program_order.get_schedulable_nodes() + for n in concrete_node_ids[:2]: + self.partial_program_order.schedule_spec_work(n, self.latest_env) def run(self): ## The first command should be the daemon start self.process_next_cmd() - + ## The second command should be the partial order init self.process_next_cmd() - + self.partial_program_order.log_state() while not self.done: - ## Schedule some work (if we are already at capacity this will return immediately) - self.schedule_work() - ## Process a single request self.process_next_cmd() - # If workset is empty we should end. 
- # TODO: ec checks fail for now + self.partial_program_order.log_state() + self.schedule_work() + self.partial_program_order.log_state() + self.partial_program_order.eager_fs_killing() + self.partial_program_order.log_state() self.socket.close() self.shutdown() @@ -281,15 +200,14 @@ def shutdown(self): logging.debug("PaSh-Spec scheduler is shutting down...") logging.debug("PaSh-Spec scheduler shut down successfully...") self.terminate_pending_commands() - - def terminate_pending_commands(self): - for _node_id, cmd_info in self.partial_program_order.commands_currently_executing.items(): - proc, _trace_file, _stdout, _stderr, _variable_file = cmd_info - proc.terminate() + def terminate_pending_commands(self): + for node in self.partial_program_order.get_executing_normal_and_spec_nodes(): + proc, _trace_file, _stdout, _stderr, _variable_file, _ = node.get_main_sandbox() + logging.debug(f'Killing: {proc}') + # proc.terminate() def main(): - log_time_delta_from_start("Scheduler", "Scheduler Init") args = init() # Format logging @@ -297,8 +215,8 @@ def main(): if args.log_file is None: logging.basicConfig(format="%(levelname)s|%(asctime)s|%(message)s") else: - logging.basicConfig(format="%(levelname)s|%(asctime)s|%(message)s", - filename=f"{os.path.abspath(args.log_file)}", + logging.basicConfig(format="%(levelname)s|%(asctime)s|%(message)s", + filename=f"{os.path.abspath(args.log_file)}", filemode="w") # Set debug level @@ -306,15 +224,13 @@ def main(): logging.getLogger().setLevel(logging.INFO) elif args.debug_level >= 2: logging.getLogger().setLevel(logging.DEBUG) - # elif args.debug_level >= 3: - # logging.getLogger().setLevel(logging.TRACE) - + # Set optimization options config.SANDBOX_KILLING = args.sandbox_killing config.SPECULATE_IMMEDIATELY = args.speculate_immediately scheduler = Scheduler(config.SCHEDULER_SOCKET) scheduler.run() - + if __name__ == "__main__": main() diff --git a/parallel-orch/template_script_to_execute.sh 
b/parallel-orch/template_script_to_execute.sh index a63596d1..5af85d64 100755 --- a/parallel-orch/template_script_to_execute.sh +++ b/parallel-orch/template_script_to_execute.sh @@ -1,35 +1,39 @@ #!/bin/bash -touch "$TEMPDIR/Rikerfile" +# touch "$TEMPDIR/Rikerfile" -## We source the latest env file -## TODO: Executing through $RUNTIME_DIR/pash_source_declare_vars.sh fails. Figure out why. -echo "source $LATEST_ENV_FILE" > "$TEMPDIR/Rikerfile" +# ## We source the latest env file +# ## TODO: Executing through $RUNTIME_DIR/pash_source_declare_vars.sh fails. Figure out why. +# echo "source $LATEST_ENV_FILE" > "$TEMPDIR/Rikerfile" -## Save the script to execute in the sandboxdir -echo $CMD_STRING >> "$TEMPDIR/Rikerfile" +# ## Save the script to execute in the sandboxdir +# echo $CMD_STRING >> "$TEMPDIR/Rikerfile" -## Add command to export Riker's environment variables after run is complete to a file -echo "source $RUNTIME_DIR/pash_declare_vars.sh $POST_EXEC_ENV" >> "$TEMPDIR/Rikerfile" +# ## Add command to export Riker's environment variables after run is complete to a file +# echo "source $RUNTIME_DIR/pash_declare_vars.sh $POST_EXEC_ENV" >> "$TEMPDIR/Rikerfile" -if [ $speculate_flag -eq 1 ]; then - rkr_cmd="rkr" -else - rkr_cmd="rkr --frontier" -fi +# if [ $speculate_flag -eq 1 ]; then +# rkr_cmd="rkr" +# else +# rkr_cmd="rkr --frontier" +# fi -cat "$TEMPDIR/Rikerfile" 1>&2 +# cat "$TEMPDIR/Rikerfile" 1>&2 -$rkr_cmd --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile" -exit_code="$?" +# $rkr_cmd --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile" +# exit_code="$?" 
-if [ "$exit_code" -eq 0 ]; then - echo "first riker run done (Node: ${CMD_ID})" 1>&2 -else - echo "Riker error: first Riker command failed with EC $exit_code - (Node: ${CMD_ID})" 1>&2 -fi +# if [ "$exit_code" -eq 0 ]; then +# echo "first riker run done (Node: ${CMD_ID})" 1>&2 +# else +# echo "Riker error: first Riker command failed with EC $exit_code - (Node: ${CMD_ID})" 1>&2 +# fi -rkr --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile" --debug trace -o "$TRACE_FILE" > /dev/null -echo 'second riker run done' 1>&2 + +# rkr --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile" --debug trace -o "$TRACE_FILE" > /dev/null +# echo 'second riker run done' 1>&2 +source $LATEST_ENV_FILE +strace -y -f --seccomp-bpf --trace=fork,clone,%file -o $TRACE_FILE bash -c "source $LATEST_ENV_FILE; $CMD_STRING" +exit_code=$? (exit $exit_code) diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py new file mode 100644 index 00000000..d0696944 --- /dev/null +++ b/parallel-orch/trace_v2.py @@ -0,0 +1,325 @@ +import re +import logging +import os.path +import sys +from typing import Tuple +from dataclasses import dataclass + +# Global TODOs: +# handle pwd, such that open and stat can work + +# not handled: listxattr, llistxattr, getxattr, pivot_root, mount, umount2 +# setxattr lsetxattr removexattr lremovexattr, fanotify_mark, renameat2, chroot, quotactl +# handled individually openat, open, chdir, clone, rename +# TODO: link, symlink, renameat, symlinkat +r_first_path_set = set(['execve', 'stat', 'lstat', 'access', 'statfs', + 'readlink', 'execve', 'getxattr', 'lgetxattr']) +w_first_path_set = set(['mkdir', 'rmdir', 'truncate', 'creat', 'chmod', 'chown', + 'lchown', 'utime', 'mknod', 'utimes', 'acct', 'unlink']) +r_fd_path_set = set(['fstatat', 'newfstatat', 'statx', 'name_to_handle_at', + 'readlinkat', 'faccessat', 'execveat', 'faccessat2']) +w_fd_path_set = set(['unlinkat', 'utimensat', 'mkdirat', 'mknodat', 'fchownat', 'futimeat', + 'unlinkat', 'linkat', 'fchmodat', 'utimensat']) 
+ignore_set = set(['getpid', 'getcwd']) + + +@dataclass +class ExitStatus: + exitcode: int + +def parse_info(l): + if "exited" in l: + start = len("+++ exited with ") + end = -len(" +++") + return ExitStatus(int(l[start:end])) + elif 'killed' in l: + return ExitStatus(-1) + else: + raise ValueError + +@dataclass +class RFile: + fname: str + def __init__(self, fname): + self.fname = os.path.normpath(fname) + +@dataclass +class WFile: + fname: str + def __init__(self, fname): + self.fname = os.path.normpath(fname) + +class Context: + def __init__(self): + self.line_dict = {} + self.curdir_dict = {} + self.pid_group_dict = {} + + def do_clone(self, parent, child): + self.pid_group_dict[child] = parent + + def set_dir(self, path, pid=None): + self.curdir_fallback = path + if pid and pid in self.pid_group_dict: + pid = self.pid_group_dict[pid] + if pid: + self.curdir_dict[pid] = path + + def get_dir(self, pid: int): + if pid in self.pid_group_dict: + pid = self.pid_group_dict[pid] + if not pid in self.curdir_dict: + self.curdir_dict[pid] = self.curdir_fallback + return self.curdir_dict[pid] + + def push_half_line(self, pid: int, l): + index = l.find('') + len('resumed>') + total_line = self.line_dict[pid] + l[index:].strip() + del self.line_dict[pid] + return total_line + +def parse_string(s): + s = s.strip() + # handling cases such as utimensat + # if the open fails we will mark the file + # as a read when we handle return value anyway so it's fine + if s == 'NULL': + return '' + if not s[0] == '"' or not s[-1] == '"': + import pdb + pdb.set_trace() + assert s[0] == '"' and s[-1] == '"' + return bytes(s[1:-1], "utf-8").decode("unicode_escape") + +def between(s, d1, d2): + return s.find(d1) + len(d1), s.rfind(d2) + +def is_absolute(path): + return path[0] == '/' + +def is_ret_err(ret: str): + ret = ret.strip() + return ret[0] == '-' + +def convert_absolute(cur_dir, path): + if is_absolute(path): + return path + else: + return os.path.join(cur_dir, path) + +def 
get_path_first_path(pid, args, ctx): + a = parse_string(args.split(sep=',', maxsplit=1)[0]) + return convert_absolute(ctx.get_dir(pid), a) + +def parse_r_first_path(pid, args, ret, ctx): + return RFile(get_path_first_path(pid, args, ctx)) + +def parse_w_first_path(pid, args, ret, ctx): + path = get_path_first_path(pid, args, ctx) + if is_ret_err(ret): + return RFile(path) + else: + return WFile(path) + +def get_path_at(pid, positions, args, ctx): + args = args.split(sep=',') + if isinstance(positions, list): + rets = [] + for x in args: + rets.append(convert_absolute(ctx.get_dir(pid), parse_string(x))) + return rets + else: + return convert_absolute(ctx.get_dir(pid), parse_string(x)) + +def parse_rename(pid, args, ret, ctx): + path_a, path_b = get_path_at(pid, [0, 1], args, ctx) + return WFile(path_a), WFile(path_b) + +def parse_link(pid, args, ret, ctx): + path_a, path_b = get_path_at(pid, [0, 1], args, ctx) + return RFile(path_a), WFile(path_b) + + +def parse_chdir(pid, args, ret, ctx): + new_path = get_path_first_path(pid, args, ctx) + if not is_ret_err(ret): + ctx.set_dir(new_path, pid) + return RFile(new_path) + +def handle_open_flag(flags): + if 'O_RDONLY' in flags: + return 'r' + else: + return 'w' + +def handle_open_common(total_path, flags, ret): + if handle_open_flag(flags) == 'r': + return RFile(total_path) + if is_ret_err(ret): + return RFile(total_path) + return WFile(total_path) + +def parse_openat(args, ret): + if args.count(',') <= 2: + dfd, path, flags = args.split(',', maxsplit=2) + else: + dfd, path, flags, _ = args.split(',', maxsplit=3) + path = parse_string(path) + if is_absolute(path): + total_path = path + else: + begin, end = between(dfd, '<', '>') + pwd = dfd[begin:end] + total_path = os.path.join(pwd, path) + return handle_open_common(total_path, flags, ret) + +def parse_open(pid, args, ret, ctx): + total_path = get_path_first_path(pid, args, ctx) + flags = args.split(',')[1] + return handle_open_common(total_path, flags, ret) + +def 
get_path_from_fd_path(args): + a0, a1, _ = args.split(sep=',', maxsplit=2) + a1 = parse_string(a1) + if len(a1) and a1[0] == '/': + return a1 + else: + begin, end = between(a0, '<', '>') + a0 = a0[begin:end] + return os.path.join(a0, a1) + +def parse_r_fd_path(args, ret): + return RFile(get_path_from_fd_path(args)) + +def parse_w_fd_path(args, ret): + if is_ret_err(ret): + return RFile(get_path_from_fd_path(args)) + else: + return WFile(get_path_from_fd_path(args)) + +def has_clone_fs(flags): + if 'CLONE_FS' in flags: + return True + else: + return False + +def parse_clone(pid, args, ret, ctx): + try: + child = int(ret) + except ValueError: + child = -1 + if child < 0: + return + arg_list = [x.strip() for x in args.split(',')] + flags = [arg for arg in arg_list if arg.startswith('flags=')][0] + flags = flags[len('flags='):] + if has_clone_fs(flags): + ctx.do_clone(pid, child) + +def parse_syscall(pid, syscall, args, ret, ctx): + if syscall in r_first_path_set: + return parse_r_first_path(pid, args, ret, ctx) + elif syscall in w_first_path_set: + return parse_w_first_path(pid, args, ret, ctx) + elif syscall == 'openat': + return parse_openat(args, ret) + elif syscall == 'chdir': + return parse_chdir(pid, args, ret, ctx) + elif syscall == 'open': + return parse_open(pid, args, ret, ctx) + elif syscall in r_fd_path_set: + return parse_r_fd_path(args, ret) + elif syscall in w_fd_path_set: + return parse_w_fd_path(args, ret) + elif syscall == 'rename': + return parse_rename(pid, args, ret, ctx) + elif syscall == 'clone': + return parse_clone(pid, args, ret, ctx) + elif syscall in ignore_set: + return None + else: + raise ValueError('Unclassified syscall ' + syscall) + +def strip_pid(l): + if l[0].isdigit(): + pair = l.split(maxsplit=1) + return int(pair[0]), pair[1] + else: + raise ValueError('expect pid') + +def handle_info(l): + if '+++' in l: + return True, parse_info(l) + elif '---' in l: + return True, None + else: + return False, None + +def parse_line(l, ctx): + 
if len(l) == 0: + return None + pid, l = strip_pid(l) + is_info, info = handle_info(l) + if is_info: + return info + if not len(l): + return None + if "" in l: + l = ctx.pop_complete_line(pid, l) + lparen = l.find('(') + equals = l.rfind('=') + rparen = l[:equals].rfind(')') + if not (lparen >= 0 and equals >= 0 and rparen >= 0): + return None + syscall = l[:lparen] + ret = l[equals+1:] + args = l[lparen+1:rparen] + return parse_syscall(pid, syscall, args, ret, ctx) + +def parse_exit_code(trace_object) -> int: + if len(trace_object) == 0 or trace_object[0] == '': + return None + l = trace_object[0] + first_pid, _ = strip_pid(l) + for l in trace_object: + pid, tmpl = strip_pid(l) + is_info, info = handle_info(tmpl) + if is_info and pid == first_pid and isinstance(info, ExitStatus): + return info.exitcode + raise ValueError("No exitcode") + +def parse_and_gather_cmd_rw_sets(trace_object) -> Tuple[set, set]: + ctx = Context() + ctx.set_dir(os.getcwd()) + read_set = set() + write_set = set() + for l in trace_object: + try: + record = parse_line(l, ctx) + except Exception: + logging.debug(l) + raise ValueError("error while parsing trace") + if type(record) is RFile and record.fname != '/dev/tty': + read_set.add(record.fname) + elif type(record) is WFile and record.fname != '/dev/tty': + write_set.add(record.fname) + return read_set, write_set + +def main(fname): + ctx = Context() + ctx.set_dir(os.getcwd()) + with open(fname) as f: + for l in f: + record = parse_line(l, ctx) + if record: + print(record) + +if __name__ == '__main__': + main(sys.argv[1]) diff --git a/parallel-orch/util.py b/parallel-orch/util.py index 1ed96118..3886a816 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -8,6 +8,14 @@ import re import psutil import signal +import analysis +from node import Node, NodeId, LoopStack +from partial_program_order import PartialProgramOrder + +DEBUG_LOG = '[DEBUG_LOG] ' + +def debug_log(s): + logging.debug(DEBUG_LOG + s) def ptempfile(): fd, name = 
tempfile.mkstemp(dir=config.PASH_SPEC_TMP_PREFIX) @@ -15,6 +23,13 @@ def ptempfile(): os.close(fd) return name +def create_sandbox(): + os.makedirs("/tmp/pash_spec/a", exist_ok=True) + os.makedirs("/tmp/pash_spec/b", exist_ok=True) + sdir = tempfile.mkdtemp(dir="/tmp/pash_spec/a", prefix="sandbox_") + tdir = tempfile.mkdtemp(dir="/tmp/pash_spec/b", prefix="sandbox_") + return sdir, tdir + def init_unix_socket(socket_file: str) -> socket.socket: server_address = socket_file @@ -32,11 +47,11 @@ def init_unix_socket(socket_file: str) -> socket.socket: logging.debug("SocketManager: Created socket") sock.bind(server_address) - logging.debug("SocketManager: Successfully bound to socket") + logging.debug("SocketManager: Successfully bound to socket") ## TODO: Check if we need to configure the backlog - sock.listen() - logging.debug("SocketManager: Listenting on socket") + sock.listen() + logging.debug("SocketManager: Listenting on socket") return sock @@ -51,7 +66,7 @@ def socket_get_next_cmd(sock: socket.socket) -> "tuple[socket.socket, str]" : ## ## We need to ensure that we read a command at once or the command was empty (only relevant in the first invocation) assert(str_data.endswith("\n") or str_data == "") - + return (connection, str_data) def socket_respond(connection: socket.socket, message: str): @@ -73,7 +88,7 @@ def parse_env_string_to_dict(content): result = {key: value for key, value in scalar_vars_string} result.update({key: int(value) for key, value in scalar_vars_int}) result.update({key: value for key, value in array_vars}) - + return result def compare_dicts(dict1, dict2): @@ -104,19 +119,19 @@ def set_named_timestamp(action: str, node=None, key=None): if key is None: key = f"{action}{',' + str(node) if node is not None else ''}" config.NAMED_TIMESTAMPS[key] = time.time() - + def invalidate_named_timestamp(action: str, node=None, key=None): if key is None: key = f"{action}{',' + str(node) if node is not None else ''}" del config.NAMED_TIMESTAMPS[key] - + 
def log_time_delta_from_start_and_set_named_timestamp(module: str, action: str, node=None, key=None): try: set_named_timestamp(action, node, key) logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}") except KeyError: logging.error(f"Named timestamp {key} already exists") - + def log_time_delta_from_named_timestamp(module: str, action: str, node=None, key=None, invalidate=True): try: if key is None: @@ -137,7 +152,7 @@ def get_all_child_processes(pid): parent = psutil.Process(pid) except psutil.NoSuchProcess: return [] - + children = parent.children(recursive=True) parent_of_parent = parent.parent() logging.critical("PARENT_PROCESS: " + str(parent_of_parent)) @@ -169,3 +184,92 @@ def kill_process_tree(pid, sig=signal.SIGTERM): except: pass return alive_processes + + +## TODO: Try to move those to PaSh and import them here +def parse_cmd_from_file(file_path: str) -> "tuple[str,list[AstNode]]": + logging.debug(f'Parsing: {file_path}') + with open(file_path) as f: + cmd = f.read() + asts = analysis.parse_shell_to_asts(file_path) + return cmd, asts + +def parse_edge_line(line: str) -> "tuple[int, int]": + from_str, to_str = line.split(" -> ") + return (int(from_str), int(to_str)) + +def parse_loop_context_line(line: str) -> "tuple[int, list[int]]": + node_id, loop_contexts_raw = line.split("-loop_ctx-") + if loop_contexts_raw != "": + loop_contexts_str = loop_contexts_raw.split(",") + loop_contexts = [int(loop_ctx) for loop_ctx in loop_contexts_str] + else: + loop_contexts = [] + return int(node_id), loop_contexts + +def parse_loop_contexts(lines): + loop_contexts = {} + for line in lines: + node_id, loop_ctx = parse_loop_context_line(line) + loop_contexts[node_id] = loop_ctx + return loop_contexts + +def parse_partial_program_order_from_file(file_path: str): + with open(file_path) as f: + raw_lines = f.readlines() + + ## Filter comments and remove new lines + lines = 
[line.rstrip() for line in raw_lines + if not line.startswith("#")] + + ## The directory in which cmd_files are + cmds_directory = str(lines[0]) + logging.debug(f'Cmds are stored in: {cmds_directory}') + + ## The initial env file + initial_env_file = str(lines[1]) + + ## The number of nodes + number_of_nodes = int(lines[2]) + logging.debug(f'Number of po cmds: {number_of_nodes}') + + ## The loop context for each node + loop_context_start=3 + loop_context_end=number_of_nodes+3 + loop_context_lines = lines[loop_context_start:loop_context_end] + loop_contexts = parse_loop_contexts(loop_context_lines) + logging.debug(f'Loop contexts: {loop_contexts}') + + ## The rest of the lines are edge_lines + edge_lines = lines[loop_context_end:] + logging.debug(f'Edges: {edge_lines}') + + ab_nodes = {} + for i in range(number_of_nodes): + file_path = f'{cmds_directory}/{i}' + cmd, asts = parse_cmd_from_file(file_path) + loop_ctx = loop_contexts[i] + ab_nodes[NodeId(i)] = Node(NodeId(i), cmd.strip(), + asts=asts, + loop_context=LoopStack(loop_ctx)) + + edges = {NodeId(i) : [] for i in range(number_of_nodes)} + for edge_line in edge_lines: + from_id, to_id = parse_edge_line(edge_line) + edges[NodeId(from_id)].append(NodeId(to_id)) + + logging.info(f"Nodes|{','.join([str(node) for node in ab_nodes])}") + logging.info(f"Edges|{edges}") + return PartialProgramOrder(ab_nodes, edges) + +def generate_id() -> int: + return int(time.time() * 1000000) + +# nodes is iterable of node +# edges is dict[node, list[node]] +def invert_graph(nodes, edges): + graph = {n: [] for n in nodes} + for from_id, to_ids in edges.items(): + for to_id in to_ids: + graph[to_id].append(from_id) + return graph diff --git a/report/benchmark_plots.py b/report/benchmark_plots.py old mode 100755 new mode 100644 index c74f099d..177a8bdf --- a/report/benchmark_plots.py +++ b/report/benchmark_plots.py @@ -1,5 +1,7 @@ import os import matplotlib.pyplot as plt +import numpy as np + # Set the plotting style if desired # 
plt.style.use('ggplot') # Example: ggplot style @@ -33,7 +35,9 @@ def plot_benchmark_times_combined(benchmarks, bash_times, orch_times, output_dir [i + bar_width / 2 for i in range(len(benchmarks))], benchmarks) ax.legend() - save_plot(output_dir, filename) + plt.tight_layout() + plt.savefig(os.path.join(output_dir, f"{filename}.pdf")) + # Plots individual comparison charts for each benchmark. def plot_benchmark_times_individual(benchmarks, bash_times, orch_times, output_dir, filename): @@ -50,26 +54,62 @@ def plot_benchmark_times_individual(benchmarks, bash_times, orch_times, output_d save_plot(output_dir, filename) -# Plots a Gantt chart of activities. -def plot_gantt(activities, output_dir, filename, simple=False): - if simple: - activities = [activity for activity in activities if activity[0].startswith("RunNode,") or activity[0] == "Wait"] - - fig_height = max(5, len(activities) * 0.3) - fig, ax = plt.subplots(figsize=(15, fig_height)) - - activities.sort(key=lambda x: x[1]) - bar_height = 0.8 - gap = 0.2 - - for index, (action, start_time, duration) in enumerate(activities): - ax.broken_barh([(start_time, duration)], (index * (bar_height + gap), bar_height), facecolors='blue') - ax.text(start_time + duration / 2, index * (bar_height + gap) + bar_height / 2, action, - ha='center', va='center', fontsize=6, color='white') - - setup_ax(ax, 'Time (ms)', '', f'Gantt Chart of {filename.strip("_gantt.pdf")}', [], []) - ax.set_yticks([i * (bar_height + gap) + bar_height / 2 for i in range(len(activities))]) - ax.set_yticklabels([activity[0] for activity in activities], fontsize=8) +def sort_node_ids(node_ids): + def parse_id(node_id): + parts = node_id.split('+') + concrete_id = int(parts[0]) + iter_ids = tuple(int(iter_id) for iter_id in parts[1].split('-')) if len(parts) > 1 else () + return (concrete_id,) + iter_ids + + sorted_ids = sorted(node_ids, key=parse_id, reverse=True) + return sorted_ids + +def plot_prog_blocks(prog_blocks, output_dir, filename): + # 
Define colors for different statuses + colors = { + 'READY': 'red', + 'EXE': 'orange', + 'SPEC_E': 'blue', + 'SPEC_F': 'lightblue', + 'COMMIT': 'green', + 'UNSAFE': 'purple', + 'INIT': 'grey' + } + + first_time = prog_blocks[0][0] + times = [(block[0] - first_time).total_seconds() for block in prog_blocks] + + unsorted_node_ids = {node[0] for block in prog_blocks for node in block[1]} + node_ids = sort_node_ids(unsorted_node_ids) # Sort the node IDs using the custom sorting function + + statuses = {node_id: [] for node_id in node_ids} + + for block_time, nodes in prog_blocks: + elapsed_time = (block_time - first_time).total_seconds() + for node_id, status in nodes: + statuses[node_id].append((elapsed_time, status)) + + fig_height = 0.5 * len(node_ids) + fig_width = 12 # Fixed width + fig, ax = plt.subplots(figsize=(fig_width, fig_height)) + status_legend_handles = {} + + for node_id in node_ids: + y_pos = node_ids.index(node_id) + for i, (start_time, status) in enumerate(statuses[node_id]): + end_time = times[-1] if i == len(statuses[node_id]) - 1 else statuses[node_id][i + 1][0] + color = colors.get(status, 'grey') + ax.broken_barh([(start_time, end_time - start_time)], (y_pos - 0.4, 0.8), facecolors=color) + if status not in status_legend_handles: + status_legend_handles[status] = plt.Rectangle((0, 0), 1, 1, fc=color) + + ax.set_xlabel("Time since first tick (seconds)") + ax.set_ylabel("Node ID") + ax.set_title("Node Status Over Time") + ax.set_yticks(np.arange(len(node_ids))) + ax.set_yticklabels(node_ids) ax.grid(True) + ax.legend(status_legend_handles.values(), status_legend_handles.keys(), loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3, frameon=True) - save_plot(output_dir, filename) + plt.tight_layout() + plt.savefig(os.path.join(output_dir, f"{filename}.pdf")) diff --git a/report/benchmark_runner.py b/report/benchmark_runner.py old mode 100755 new mode 100644 index f62e380d..9cee12bb --- a/report/benchmark_runner.py +++ b/report/benchmark_runner.py 
@@ -1,48 +1,67 @@ import csv +from typing import List from command_executor import CommandExecutor +from config_parser import BenchmarkConfig from result_analyzer import ResultAnalyzer from report_generator import ReportGenerator import benchmark_plots import os +from pprint import pprint class BenchmarkRunner: - def __init__(self, benchmarks, args): + def __init__(self, benchmarks: "List[BenchmarkConfig]", args): self.benchmarks = benchmarks self.args = args self.results = [] self.activities = {} + + def __repr__(self): + return (f"BenchmarkRunner(benchmarks={self.benchmarks!r}, " + f"args={self.args!r}, results={self.results!r})") + + def __str__(self): + return (f"Benchmark Runner:\n" + f" Benchmarks: {self.benchmarks}\n" + f" Arguments: {self.args}\n" + f" Results: {self.results}") def run_all_benchmarks(self): for benchmark in self.benchmarks: self.run_benchmark(benchmark) - def run_benchmark(self, benchmark): + def run_benchmark(self, benchmark: BenchmarkConfig): # Setup environment and pre-execution commands benchmark.setup_environment() if self.args.verbose: # Print verbose information print(f"\n---------> Running benchmark: {benchmark.name} <---------\n") - print(f"Environment Variables: {benchmark.env}") + print(">", benchmark) for pre_command in benchmark.pre_execution_script: CommandExecutor.run_pre_execution_command(pre_command, os.environ.get('RESOURCE_DIR'), self.args.verbose) + if benchmark.command_working_dir: + workdir = benchmark.command_working_dir + else: + workdir = os.environ.get('TEST_SCRIPT_DIR') + + # Execute the benchmark bash_time, bash_output, _ = CommandExecutor.run_command( benchmark.command.split(" "), - os.environ.get('TEST_SCRIPT_DIR'), + workdir, self.args.verbose) orch_time, orch_output, orch_log = CommandExecutor.run_command_with_orch( benchmark.command.split(" "), benchmark.orch_args, - os.environ.get('TEST_SCRIPT_DIR'), + workdir, os.environ.get('ORCH_COMMAND'), self.args.verbose) - activities = 
ResultAnalyzer.parse_logs_into_activities(orch_log) - self.activities[benchmark.name] = activities + prog_blocks = ResultAnalyzer.process_results(orch_log) + # pprint(prog_blocks) # Analyze and compare results diff_lines = ResultAnalyzer.compare_results(bash_output, orch_output) @@ -52,8 +71,8 @@ def run_benchmark(self, benchmark): ReportGenerator.print_results(benchmark.name, bash_time, orch_time, diff_lines, verbose=self.args.verbose) if not self.args.no_logs: ReportGenerator.save_log_data(orch_log, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_log.log") - - ResultAnalyzer.analyze_node_execution_times(orch_log, benchmark.name, os.environ.get('REPORT_OUTPUT_DIR'), self.args.verbose) + + self.activities[benchmark.name] = prog_blocks def generate_reports(self): @@ -76,10 +95,6 @@ def generate_plots(self): # Plot Gantt charts for each benchmark for benchmark in self.benchmarks: - activities = self.activities.get(benchmark.name, []) + activities = self.activities.get(benchmark.name) if activities: - benchmark_plots.plot_gantt(activities, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_gantt", simple=self.args.full_gantt) - - - - \ No newline at end of file + benchmark_plots.plot_prog_blocks(activities, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_progress") diff --git a/report/benchmarks/dgsh/17.sh b/report/benchmarks/dgsh/17.sh old mode 100644 new mode 100755 index 52ada7b4..df2788ff --- a/report/benchmarks/dgsh/17.sh +++ b/report/benchmarks/dgsh/17.sh @@ -38,20 +38,13 @@ file2=$(mktemp) file3=$(mktemp) file4=$(mktemp) -# Save the ls output to a temporary file -ls -n > "$file1" +cat $INPUT_FILE > $file1 -# Reorder fields in DIR-like way -awk '!/^total/ {print $6, $7, $8, $1, sprintf("%8d", $5), $9}' "$file1" > "$file2" +# Extract columns 5 and 6, save to temp1 +cut -d ',' -f 5-6 "$file1" > "$file2" -# Count number of files -wc -l "$file1" | tr -d \\n > "$file3" -echo -n ' File(s) ' >> "$file3" -awk '{s += $5} END {printf("%d bytes\n", 
s)}' "$file1" >> "$file3" +# Extract columns 2, 3, and 4, save to temp2 +cut -d ',' -f 2-4 "$file1" > "$file3" -# Count number of directories and print label for number of dirs and calculate free bytes -grep -c '^d' "$file1" | tr -d \\n > "$file4" -df -h . | awk '!/Use%/{print " Dir(s) " $4 " bytes free"}' >> "$file4" - -# Display the results -cat "$file2" "$file3" "$file4" +# Combine the columns +paste -d ',' "$file2" "$file3" diff --git a/report/benchmarks/dgsh/18.sh b/report/benchmarks/dgsh/18.sh old mode 100644 new mode 100755 index dac2892b..87c886f6 --- a/report/benchmarks/dgsh/18.sh +++ b/report/benchmarks/dgsh/18.sh @@ -29,21 +29,31 @@ file_details_file=$(mktemp) file_count_file=$(mktemp) dir_count_file=$(mktemp) byte_count_file=$(mktemp) +#!/bin/sh + +# Create temporary files +free_space_file=$(mktemp) +file_details_file=$(mktemp) +file_count_file=$(mktemp) +dir_count_file=$(mktemp) +byte_count_file=$(mktemp) + +# Base directory for the listing # Get free space df -h . | awk '!/Use%/{print $4}' > "$free_space_file" -# List details of files and directories -ls -l | awk '!/^total/ {print $6, $7, $8, $1, sprintf("%8d", $5), $9}' > "$file_details_file" +# Recursively list details of files +find . -type f -exec ls -l {} + | awk '{print $6, $7, $8, $1, sprintf("%8d", $5), $9}' > "$file_details_file" # Count number of files -ls -l | grep -v '^total' | grep -v '^d' | wc -l > "$file_count_file" +find . -type f | wc -l > "$file_count_file" # Count number of directories -ls -l | grep '^d' | wc -l > "$dir_count_file" +find . -type d | wc -l > "$dir_count_file" -# Calculate total bytes -ls -l | awk '{if($1 != "total") s += $5} END {print s}' > "$byte_count_file" +# Calculate total bytes for files +find . 
-type f -exec stat --format="%s" {} + | awk '{s+=$1} END {print s}' > "$byte_count_file" # Display the results cat "$file_details_file" diff --git a/report/benchmarks/dgsh/3.sh b/report/benchmarks/dgsh/3.sh old mode 100644 new mode 100755 diff --git a/report/benchmarks/dgsh/4.sh b/report/benchmarks/dgsh/4.sh old mode 100644 new mode 100755 diff --git a/report/benchmarks/dgsh/5.sh b/report/benchmarks/dgsh/5.sh index 0f2c5855..47a52b82 100755 --- a/report/benchmarks/dgsh/5.sh +++ b/report/benchmarks/dgsh/5.sh @@ -33,25 +33,22 @@ file1=$(mktemp) file2=$(mktemp) file3=$(mktemp) -file4=$(mktemp) - -# export LC_ALL=C cat $INPUT_FILE >"$file1" # Find errors # Obtain list of words in text -cat "$file1" | -tr -cs A-Za-z \\n | -tr A-Z a-z | -sort -u > "$file2" +cat "$file1" | +tr '[:upper:]' '[:lower:]' | +sed 's/[^a-z]/\n/g' | +grep -v '^$' | +sort | +uniq | +grep -v '^$' > "$file2" # Ensure dictionary is compatibly sorted -cat "$file1" | sort /usr/share/dict/words > "$file3" # List errors as a set difference -comm -23 "$file2" "$file3" > "$file4" - -fgrep -f "$file4" -i --color -w -C 2 "$file1" +comm -23 "$file2" "$file3" diff --git a/report/benchmarks/dgsh/6.sh b/report/benchmarks/dgsh/6.sh index 53828e0f..ec48a28f 100755 --- a/report/benchmarks/dgsh/6.sh +++ b/report/benchmarks/dgsh/6.sh @@ -42,9 +42,6 @@ file5=$(mktemp) cat $INPUT_FILE > $file1 -# Consistent sorting across machines -# export LC_ALL=C - # Stream input from file and split input one word per line # Create list of unique words tr -cs a-zA-Z '\n' < "$file1" | diff --git a/report/benchmarks/dgsh/8_no_func.sh b/report/benchmarks/dgsh/8_no_func.sh index ae676918..e4f760e3 100755 --- a/report/benchmarks/dgsh/8_no_func.sh +++ b/report/benchmarks/dgsh/8_no_func.sh @@ -38,9 +38,6 @@ # limitations under the License. 
# -# Consistent sorting across machines -# export LC_ALL=C - # Temporary files file1=$(mktemp) file2=$(mktemp) @@ -48,7 +45,6 @@ file3=$(mktemp) file4=$(mktemp) cat $INPUT_FILE > $file1 -cat $file1 # Split input one word per line tr -cs a-zA-Z '\n' < "$file1" > "$file2" @@ -86,7 +82,7 @@ awk '{count[$1]++} END {for (i in count) print count[i], i}' | sort -rn | tee "$file3" # Print relative -# echo "Relative character frequency" -# awk -v NCHARS=$nchars 'BEGIN { -# OFMT = "%.2g%%"} -# {print $1, $2, $1 / NCHARS * 100}' "$file3" \ No newline at end of file +echo "Relative character frequency" +awk -v NCHARS=$nchars 'BEGIN { + OFMT = "%.2g%%"} + {print $1, $2, $1 / NCHARS * 100}' "$file3" \ No newline at end of file diff --git a/report/benchmarks/dgsh/9.sh b/report/benchmarks/dgsh/9.sh old mode 100644 new mode 100755 index 88ee52a0..f090d36f --- a/report/benchmarks/dgsh/9.sh +++ b/report/benchmarks/dgsh/9.sh @@ -33,7 +33,7 @@ file2=$(mktemp) file3=$(mktemp) # Find object files and print defined symbols -find "$INPUT" -name "*.o" | xargs nm > "$file1" +find . 
-type f -name "*.o" | xargs nm > "$file1" # List all defined (exported) symbols awk 'NF == 3 && $2 ~ /[A-Z]/ {print $3}' "$file1" | sort > "$file2" diff --git a/report/benchmarks/dgsh/setup/config.json b/report/benchmarks/dgsh/setup/config.json index 1660c620..1e27be57 100644 --- a/report/benchmarks/dgsh/setup/config.json +++ b/report/benchmarks/dgsh/setup/config.json @@ -1,7 +1,7 @@ -[ +[ { "name": "Dgsh 1.sh", - "env": ["INPUT_FILE={{RESOURCE_DIR}}/testmini.csv"], + "env": ["INPUT_FILE={{RESOURCE_DIR}}/dblp.xml"], "command": "{{TEST_SCRIPT_DIR}}/dgsh/1.sh", "orch_args": "-d 2" }, @@ -13,8 +13,14 @@ }, { "name": "Dgsh 3.sh", - "command": "{{TEST_SCRIPT_DIR}}/dgsh/3.sh", - "working_dir": "{{RESOURCE_DIR}}/linux", + "command": "{{TEST_SCRIPT_DIR}}/3.sh", + "working_dir": "{{RESOURCE_DIR}}/linux/", + "orch_args": "-d 2" + }, + { + "name": "Dgsh 4.sh", + "command": "{{TEST_SCRIPT_DIR}}/4.sh", + "working_dir": "{{RESOURCE_DIR}}/linux/", "orch_args": "-d 2" }, { @@ -24,8 +30,8 @@ "orch_args": "-d 2" }, { - "name": "Dgsh 6.sh", - "env": ["INPUT_FILE={{RESOURCE_DIR}}/larger_file.txt"], + "name": "6.sh", + "env": ["INPUT_FILE={{RESOURCE_DIR}}/pg100.txt"], "command": "{{TEST_SCRIPT_DIR}}/6.sh", "orch_args": "-d 2" }, @@ -37,8 +43,26 @@ }, { "name": "8.sh", - "env": ["INPUT_FILE={{RESOURCE_DIR}}/larger_file.txt"], + "env": ["INPUT_FILE={{RESOURCE_DIR}}/pg100.txt"], "command": "{{TEST_SCRIPT_DIR}}/8_no_func.sh", "orch_args": "-d 2" + }, + { + "name": "9.sh", + "env": ["INPUT_FILE={{RESOURCE_DIR}}/pg100.txt"], + "command": "{{TEST_SCRIPT_DIR}}/9.sh", + "orch_args": "-d 2" + }, + { + "name": "17.sh", + "env": ["INPUT_FILE={{RESOURCE_DIR}}/goods_classification.csv"], + "command": "{{TEST_SCRIPT_DIR}}/17.sh", + "orch_args": "-d 2" + }, + { + "name": "18.sh", + "working_dir": "{{RESOURCE_DIR}}/linux/", + "command": "{{TEST_SCRIPT_DIR}}/18.sh", + "orch_args": "-d 2" } -] +] \ No newline at end of file diff --git a/report/command_executor.py b/report/command_executor.py old mode 
100755 new mode 100644 diff --git a/report/config_parser.py b/report/config_parser.py old mode 100755 new mode 100644 index fa6eeb50..512a89ab --- a/report/config_parser.py +++ b/report/config_parser.py @@ -3,10 +3,12 @@ class BenchmarkConfig: - def __init__(self, name, env, pre_execution_script, command, orch_args): + + def __init__(self, name, env, pre_execution_script, command_working_dir, command, orch_args): self.name = name self.env = [self.replace_env_var(e) for e in env] self.pre_execution_script = [self.replace_env_var(script) for script in pre_execution_script] + self.command_working_dir = self.replace_env_var(command_working_dir) self.command = self.replace_env_var(command) self.orch_args = self.replace_env_var(orch_args) @@ -21,6 +23,7 @@ def __str__(self): return (f"Benchmark '{self.name}':\n" f" Environment Variables: {env_str}\n" f" Pre-execution Script: {pre_exec_str}\n" + f" Command Working Directory: {self.command_working_dir}\n" f" Command: {self.command}\n" f" Orchestrator Arguments: {self.orch_args}") @@ -59,6 +62,7 @@ def parse_config(self): name=config.get('name'), env=config.get('env', []), pre_execution_script=config.get('pre_execution_script', []), + command_working_dir=config.get('working_dir', ""), command=config.get('command'), orch_args=config.get('orch_args', "") ) @@ -66,3 +70,4 @@ def parse_config(self): def get_benchmarks(self): return self.benchmarks + diff --git a/report/report_generator.py b/report/report_generator.py old mode 100755 new mode 100644 diff --git a/report/result_analyzer.py b/report/result_analyzer.py old mode 100755 new mode 100644 index 8729ab79..e0a7d1df --- a/report/result_analyzer.py +++ b/report/result_analyzer.py @@ -1,21 +1,37 @@ -import difflib +from datetime import datetime +import matplotlib.pyplot as plt +import matplotlib.dates as mdates import hashlib -import os -import csv +import numpy as np class ResultAnalyzer: @staticmethod - def parse_logs_into_activities(log_data): - info_lines = 
[line.replace("INFO:root:>|", "").split("|") for line in log_data.split("\n") if line.startswith("INFO:root:>|")] - activities = [] - for line in info_lines: - if len(line) == 4: - activity = line[1] - end_time = float(line[2].split(":")[1].rstrip("ms")) - step_time = float(line[3].split(":")[1].rstrip("ms")) - start_time = end_time - step_time - activities.append((activity, start_time, step_time)) - return activities + def process_results(orch_log): + log_lines = orch_log.split("\n") + prog_blocks = [] + current_block = [] + block_start_time = None + + for line in log_lines: + if line.startswith("INFO|") and "[PROG_LOG]" in line: + parts = line.split("|") + time_str = parts[1] + log_content = parts[2].strip() + if log_content == "[PROG_LOG]": + # Start of a new block + if current_block: + prog_blocks.append((block_start_time, current_block)) + current_block = [] + block_start_time = datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S,%f") + else: + # Continuing the current block + state, node_id, command = log_content.replace("[PROG_LOG] ", "").split(",", 2) + current_block.append((node_id.strip(), state.strip())) + # Append the last block if not empty + if current_block: + prog_blocks.append((block_start_time, current_block)) + + return prog_blocks @staticmethod def compare_results(bash_output, orch_output, max_lines=1000): @@ -35,56 +51,4 @@ def compare_results(bash_output, orch_output, max_lines=1000): if hash_value not in bash_hashes: diffs.append(f'+ {line}') - return diffs - - @staticmethod - def analyze_node_execution_times(orch_output, benchmark_name, output_dir, verbose): - node_times_dict = ResultAnalyzer.extract_node_times(orch_output) - - if verbose: - ResultAnalyzer.print_node_execution_times(node_times_dict) - - ResultAnalyzer.generate_node_times_csv(node_times_dict, benchmark_name, output_dir) - - @staticmethod - def print_node_execution_times(node_times_dict): - print("-" * 40) - print("Node Execution Times:") - for node in 
sorted(node_times_dict.keys()): - times = node_times_dict[node] - num_executions = len(times) - time_lost = sum(times) - times[-1] if times else 0 - times_str = ', '.join(f'{time:7.2f}ms' for time in times) - print(f"Node {node:2d}: Executions: {num_executions}, Time Lost: {time_lost:7.2f}ms Times = {times_str} ") - print("-" * 40) - - @staticmethod - def generate_node_times_csv(node_times_dict, benchmark_name, output_dir): - csv_filename = os.path.join(output_dir, f"{benchmark_name}_execution_times.csv") - with open(csv_filename, 'w', newline='') as csv_file: - writer = csv.writer(csv_file) - writer.writerow(["Node", "Execution Times (ms)", "Number of Executions", "Time Lost (ms)"]) - for node in sorted(node_times_dict.keys()): - times = node_times_dict[node] - num_executions = len(times) - time_lost = sum(times) - times[-1] if times else 0 - writer.writerow([node, ', '.join(str(time) for time in times), num_executions, time_lost]) - - @staticmethod - def extract_node_times(orch_output): - node_times_dict = {} - - relevant_lines = [line.replace("INFO:root:>|PartialOrder|RunNode,", "") - for line in orch_output.split("\n") - if line.startswith("INFO:root:>|PartialOrder|RunNode,") and "Step time:" in line] - - for line in relevant_lines: - parts = line.split("|") - node_id = int(parts[0]) - time = float(parts[2].split(":")[1][:-2]) # Extract step time - - if node_id not in node_times_dict: - node_times_dict[node_id] = [] - node_times_dict[node_id].append(time) - - return node_times_dict + return diffs \ No newline at end of file diff --git a/report/run_benchmarks.py b/report/run_benchmarks.py old mode 100755 new mode 100644 index bdacc596..5d560e69 --- a/report/run_benchmarks.py +++ b/report/run_benchmarks.py @@ -13,7 +13,7 @@ def print_startup_info(args): print(f" {arg + ':':13s} {value}") print("> Environment Variables:") - for env_var in ['ORCH_TOP', 'WORKING_DIR', 'TEST_SCRIPT_DIR', 'RESOURCE_DIR', 'PASH_TOP', 'PASH_SPEC_TOP']: + for env_var in ['ORCH_TOP', 
'WORKING_DIR', 'TEST_SCRIPT_DIR', 'RESOURCE_DIR', 'REPORT_OUTPUT_DIR', 'PASH_TOP', 'PASH_SPEC_TOP', 'ORCH_COMMAND']: print(f" {env_var + ':':17s} {os.environ.get(env_var)}") def parse_args(): @@ -22,10 +22,10 @@ def parse_args(): parser.add_argument('--no-logs', action='store_true', help="Do not save log files of benchmark runs.") parser.add_argument('--csv-output', action='store_true', help="Generate and save results in CSV format.") parser.add_argument('--verbose', action='store_true', help="Enable verbose output.") - parser.add_argument('--full-gantt', action='store_false', help="Generate a full Gantt chart for each benchmark.") - parser.add_argument('--config-file', type=str, default='benchmark_config.json', help="Path to the benchmark configuration file. Default is 'benchmark_config.json'.") + parser.add_argument('--config-file', type=str, default=None, help="Path to the benchmark configuration file. Default is 'benchmark_config.json'.") parser.add_argument('--setup-script', type=str, default=None, help="Path to a setup script to run before running any other benchmark.") parser.add_argument('--subset', type=str, default=None, help="Name of a subset of benchmarks to run. Will instead download and store outputs in the dir with the specified name.") + parser.add_argument('--no-setup', action='store_true', help="Do not run any setup script before running benchmarks. Assumes subset is also set.") return parser.parse_args() # Sets the required environment variables for the benchmarking process. 
@@ -72,11 +72,16 @@ def main(): args = parse_args() set_environment_variables(args) - - - # Use the config file path from arguments - config_file_path = os.path.join(os.environ['WORKING_DIR'], args.config_file) + if args.config_file is not None: + if args.verbose: + print(f"Config File: {args.config_file}") + config_file_path = os.path.join(os.environ['WORKING_DIR'], args.config_file) + elif args.subset: + config_file_path = os.path.join(os.environ['TEST_SCRIPT_DIR'], "setup", "config.json") + else: + config_file_path = os.path.join(os.environ['WORKING_DIR'], "benchmark_config.json") + # Parse benchmark configurations config_parser = ConfigParser(os.path.join(os.environ['WORKING_DIR'], config_file_path)) config_parser.parse_config() @@ -86,6 +91,21 @@ def main(): if args.verbose: print_startup_info(args) + print(config_parser) + + + if args.setup_script: + if args.verbose: + print(f"Running setup script: {args.setup_script}") + subprocess.run(['bash', args.setup_script]) + elif args.subset and not args.no_setup: + setup_script = os.path.join(os.environ['TEST_SCRIPT_DIR'], "setup", 'setup.sh') + if os.path.exists(setup_script): + if args.verbose: + print(f"Running setup script: {args.setup_script}") + subprocess.run(['bash', setup_script]) + elif args.verbose: + print(f"No setup script found in {os.environ['TEST_SCRIPT_DIR']}, ignoring") # Initialize and run the BenchmarkRunner runner = BenchmarkRunner(config_parser.get_benchmarks(), args) diff --git a/report/time_script.py b/report/time_script.py new file mode 100644 index 00000000..3ef96f2e --- /dev/null +++ b/report/time_script.py @@ -0,0 +1,50 @@ +import subprocess +import time +import csv +import sys + +def format_time(seconds): + # Format time as ss.msms where msms is in milliseconds + return "{:.4f}".format(seconds) + +def time_commands(shell_script_path): + # Read the shell script + with open(shell_script_path, 'r') as file: + commands = file.readlines() + + # Prepare the CSV output + csv_filename = 
shell_script_path + "_timing.csv" + with open(csv_filename, 'w', newline='') as csvfile: + csvwriter = csv.writer(csvfile) + csvwriter.writerow(["Command", "Time (seconds)"]) + + # Initialize total time + total_time = 0.0 + + # Execute each command and time it + for command in commands: + command = command.strip() + if command and not command.startswith('#'): # Ignore empty lines and comments + start_time = time.time() + try: + # Run the command + result = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + execution_time = time.time() - start_time + # Write the result to the CSV with formatted time + csvwriter.writerow([command, format_time(execution_time)]) + total_time += execution_time + except subprocess.CalledProcessError as e: + print(f"An error occurred while executing the command: {command}") + print(e.output.decode()) + sys.exit(1) + + # Write the total time with formatted time + csvwriter.writerow(["Total", format_time(total_time)]) + + print(f"Timing results written to {csv_filename}") + +# Usage: python time_script.py /path/to/your/script.sh +if len(sys.argv) > 1: + time_commands(sys.argv[1]) +else: + print("Please provide the path to the shell script as an argument.") diff --git a/scripts/install_deps_ubuntu20.sh b/scripts/install_deps_ubuntu20.sh index 8d239d2a..1bcf4122 100755 --- a/scripts/install_deps_ubuntu20.sh +++ b/scripts/install_deps_ubuntu20.sh @@ -1,20 +1,17 @@ #!/bin/bash -## Install Riker's dependencies sudo apt-get update -sudo apt install -y make clang llvm git gcc python3-cram file graphviz libtool -sudo update-alternatives --install /usr/bin/cram cram /usr/bin/cram3 100 +# TODO: some of these are Riker dependencies are no longer needed. 
+sudo apt install -y make git python3-cram file graphviz libtool python3-matplotlib libcap2-bin mergerfs export PASH_SPEC_TOP=${PASH_SPEC_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)} export PASH_TOP=${PASH_TOP:-$PASH_SPEC_TOP/deps/pash} -pip3 install $PASH_SPEC_TOP/requirements.txt - ## Download submodule dependencies git submodule update --init --recursive -## Install Riker -(cd deps/riker; make; sudo make install) +# Install try +(cd deps/try; ./setup.sh) ## Install PaSh (cd deps/pash; ./scripts/distro-deps.sh; ./scripts/setup-pash.sh) diff --git a/test/misc/cat_and_sleep.sh b/test/misc/cat_and_sleep.sh new file mode 100755 index 00000000..77dbcbc5 --- /dev/null +++ b/test/misc/cat_and_sleep.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +cat $2 >> $3 +sleep $1 diff --git a/test/test_orch.sh b/test/test_orch.sh index 5cb27d4a..572b4057 100755 --- a/test/test_orch.sh +++ b/test/test_orch.sh @@ -370,6 +370,17 @@ test_command_var_assignments_2(){ $shell $2/test_command_var_assignments_2.sh } +test_early_stop1() +{ + local shell=$1 + $shell $2/test_early_stop1.sh +} + +test_early_stop2() +{ + local shell=$1 + $shell $2/test_early_stop2.sh +} ## TODO: make more loop tests with nested loops and commands after the loop diff --git a/test/test_scripts/test_early_stop1.sh b/test/test_scripts/test_early_stop1.sh new file mode 100644 index 00000000..51500540 --- /dev/null +++ b/test/test_scripts/test_early_stop1.sh @@ -0,0 +1,7 @@ +$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out1" +$MISC_SCRIPT_DIR/sleep_and_echo.sh 2 "output text" "$test_output_dir/out2" +$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out1" "$test_output_dir/out3" +$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out2" "$test_output_dir/out4" +$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out2" "$test_output_dir/out5" + + diff --git a/test/test_scripts/test_early_stop2.sh b/test/test_scripts/test_early_stop2.sh new file mode 100644 index 
00000000..bf63e408 --- /dev/null +++ b/test/test_scripts/test_early_stop2.sh @@ -0,0 +1,9 @@ +$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out1" +$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out0" +$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out0" +$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out0" +$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out1" "$test_output_dir/out2" +$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out1" "$test_output_dir/out3" + + + diff --git a/test/tracer_test/Makefile b/test/tracer_test/Makefile new file mode 100644 index 00000000..74677e5f --- /dev/null +++ b/test/tracer_test/Makefile @@ -0,0 +1,13 @@ +objs = chdir pid fork symlink getcwd retval mkdir + +CFLAGS=-O2 + +.PHONY: all clean + +all: $(objs) thread + +thread: thread.c + $(CC) $(CFLAGS) -pthread $< -o $@ + +clean: + rm $(objs) diff --git a/test/tracer_test/chdir.c b/test/tracer_test/chdir.c new file mode 100644 index 00000000..e2282bfd --- /dev/null +++ b/test/tracer_test/chdir.c @@ -0,0 +1,28 @@ +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); +} + +int main(void) +{ + int fd, ret; + reset(); + ret = chdir(TMPDIR); + if (ret < 0) + exit(1); + fd = syscall(SYS_open, "a", O_RDONLY); + if (fd < 0) + exit(1); + close(fd); + return 0; +} diff --git a/test/tracer_test/fork.c b/test/tracer_test/fork.c new file mode 100644 index 00000000..5a9533ae --- /dev/null +++ b/test/tracer_test/fork.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); + ret = system("mkdir " TMPDIR "/a"); + ret = system("mkdir " TMPDIR "/b"); +} + +int main(void) +{ + int fd, ret; + reset(); + ret = fork(); + 
if (ret == 0) { + ret = chdir(TMPDIR "/a"); + if (ret < 0) + exit(1); + } else { + ret = chdir(TMPDIR "/b"); + if (ret < 0) + exit(1); + } + fd = syscall(SYS_open, "f", O_RDONLY); + if (fd < 0) + exit(1); + close(fd); + return 0; +} diff --git a/test/tracer_test/getcwd.c b/test/tracer_test/getcwd.c new file mode 100644 index 00000000..71b4f3bd --- /dev/null +++ b/test/tracer_test/getcwd.c @@ -0,0 +1,28 @@ +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); +} + +int main(void) +{ + int fd, ret; + char name[1024]; + reset(); + ret = chdir(TMPDIR); + if (ret < 0) + exit(1); + ret = syscall(SYS_getcwd, name, 1024); + if (ret < 0) + exit(1); + return 0; +} diff --git a/test/tracer_test/mkdir.c b/test/tracer_test/mkdir.c new file mode 100644 index 00000000..6903e395 --- /dev/null +++ b/test/tracer_test/mkdir.c @@ -0,0 +1,23 @@ +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); +} + +int main(void) +{ + int fd, ret; + reset(); + ret = system("mkdir -p " TMPDIR "/a"); + ret = system("mkdir -p " TMPDIR "/a"); + return 0; +} diff --git a/test/tracer_test/pid.c b/test/tracer_test/pid.c new file mode 100644 index 00000000..28b9cc04 --- /dev/null +++ b/test/tracer_test/pid.c @@ -0,0 +1,26 @@ +#include +#include +#include +#include + +#define NUM_CALLS 100000 + +int main() { + struct timeval start, end; + long seconds, useconds; + double mtime; + + gettimeofday(&start, NULL); // get the start time + for (int i = 0; i < NUM_CALLS; ++i) { + syscall(SYS_getpid); + } + gettimeofday(&end, NULL); // get the end time + + seconds = end.tv_sec - start.tv_sec; + useconds = end.tv_usec - start.tv_usec; + mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5; + + printf("Elapsed time for getpid syscall: %.3f 
milliseconds\n", mtime); + printf("Average time per getpid syscall: %.3f microseconds\n", mtime * 1000 / NUM_CALLS); + return 0; +} diff --git a/test/tracer_test/retval.c b/test/tracer_test/retval.c new file mode 100644 index 00000000..986ab538 --- /dev/null +++ b/test/tracer_test/retval.c @@ -0,0 +1,22 @@ +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); +} + +int main(void) +{ + int fd, ret; + char name[1024]; + reset(); + return 112; +} diff --git a/test/tracer_test/symlink.c b/test/tracer_test/symlink.c new file mode 100644 index 00000000..613b58dd --- /dev/null +++ b/test/tracer_test/symlink.c @@ -0,0 +1,39 @@ +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); + ret = system("mkdir " TMPDIR "/a"); + ret = system("mkdir " TMPDIR "/b"); + ret = system("touch " TMPDIR "/b/f"); + ret = system("ln -s " TMPDIR "/b/f " TMPDIR "/a/f"); +} + +int main(void) +{ + int fd, ret; + reset(); + /* if (ret == 0) { */ + /* ret = chdir(TMPDIR "/a"); */ + /* if (ret < 0) */ + /* exit(1); */ + /* } else { */ + /* ret = chdir(TMPDIR "/b"); */ + /* if (ret < 0) */ + /* exit(1); */ + /* } */ + ret = chdir(TMPDIR "/a"); + fd = syscall(SYS_open, "f", O_RDONLY); + if (fd < 0) + exit(1); + close(fd); + return 0; +} diff --git a/test/tracer_test/thread.c b/test/tracer_test/thread.c new file mode 100644 index 00000000..3d109917 --- /dev/null +++ b/test/tracer_test/thread.c @@ -0,0 +1,41 @@ +#include +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); + ret = system("mkdir " TMPDIR "/a"); + ret = system("mkdir " TMPDIR "/b"); + ret = system("touch " TMPDIR "/a/f"); +} + +void 
*threaded_chdir(void *p) +{ + int ret; + ret = chdir(TMPDIR "/a"); + return NULL; +} + +int main(void) +{ + pthread_t child; + int fd, ret; + reset(); + ret = chdir(TMPDIR "/b"); + ret = pthread_create(&child, NULL, threaded_chdir, NULL); + ret = pthread_join(child, NULL); + fd = syscall(SYS_open, "f", O_RDONLY); + fd = syscall(SYS_open, "g", O_RDONLY); + if (fd < 0) + exit(1); + close(fd); + return 0; +}