From 09960a10d275c02a474ec2854fe11933790a7812 Mon Sep 17 00:00:00 2001 From: SleepyMug Date: Fri, 27 Oct 2023 08:19:42 -0400 Subject: [PATCH 01/39] handling basic cases --- parallel-orch/trace_v2.py | 163 ++++++++++++++++++++++++++++++++++++++ test/tracer_test/Makefile | 10 +++ test/tracer_test/chdir.c | 27 +++++++ test/tracer_test/pid.c | 26 ++++++ 4 files changed, 226 insertions(+) create mode 100644 parallel-orch/trace_v2.py create mode 100644 test/tracer_test/Makefile create mode 100644 test/tracer_test/chdir.c create mode 100644 test/tracer_test/pid.c diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py new file mode 100644 index 00000000..125614f2 --- /dev/null +++ b/parallel-orch/trace_v2.py @@ -0,0 +1,163 @@ +import re +import os.path +import sys +from dataclasses import dataclass + +# Global TODOs: +# handle pwd, such that open and stat can work + +def parse_info(l): + return 0 + +@dataclass +class RFile: + fname: str + +@dataclass +class WFile: + fname: str + +# openat +r_first_path_set = set(['execve', 'stat', 'lstat', 'access', 'statfs']) +w_first_path_set = set(['mkdir']) +r_fd_path_set = set(['fstatat', 'newfstatat']) +w_fd_path_set = set(['unlinkat']) +ignore_set = set(['getpid']) + +def parse_string(s): + s = s.strip() + assert s[0] == '"' and s[-1] == '"' + return bytes(s[1:-1], "utf-8").decode("unicode_escape") + +def between(s, d1, d2): + return s.find(d1) + len(d1), s.rfind(d2) + +def get_path_first_path(args): + a, _ = args.split(sep=',', maxsplit=1) + return parse_string(a) + +def parse_r_first_path(args, ret): + return RFile(get_path_first_path(args)) + +def parse_w_first_path(args, ret): + path = get_path_first_path(args) + if is_ret_enoent(ret): + return RFile(path) + else: + return WFile(path) + +def is_open_flag(flags): + if 'O_RDONLY' in flags: + return 'r' + else: + return 'w' + +def is_absolute(path): + return path[0] == '/' + +def is_ret_enoent(ret): + return 'ENOENT' in ret + +def parse_openat(args, ret): + if args.count(',') <= 2: + dfd, path, flags = args.split(',', maxsplit=2) + else: + dfd, path, flags, _ = args.split(',', maxsplit=3) + path = parse_string(path) + if is_absolute(path): + total_path = path + else: + begin, end = between(dfd, '<', '>') + pwd = dfd[begin:end] + total_path = os.path.join(pwd, path) + if is_open_flag(flags) == 'r': + return RFile(total_path) + if is_ret_enoent(ret): + return RFile(total_path) + return WFile(total_path) + +def parse_chdir(args, ret): + return None + +def get_path_from_fd_path(args): + a0, a1, _ = args.split(sep=',', maxsplit=2) + a1 = parse_string(a1) + if a1[0] == '/': + return a1 + else: + begin, end = between(a0, '<', '>') + a0 = a0[begin:end] + return os.path.join(a0, a1) + +def parse_r_fd_path(args, ret): + return RFile(get_path_from_fd_path(args)) + +def parse_w_fd_path(args, ret): + if is_ret_enoent(ret): + return RFile(get_path_from_fd_path(args)) + else: + return WFile(get_path_from_fd_path(args)) + +def parse_syscall(syscall, args, ret): + if syscall in r_first_path_set: + return parse_r_first_path(args, ret) + elif syscall in w_first_path_set: + return parse_w_first_path(args, ret) + elif syscall == 'openat': + return parse_openat(args, ret) + elif syscall == 'chdir': + return parse_chdir(args, ret) + elif syscall in r_fd_path_set: + return parse_r_fd_path(args, ret) + elif syscall in w_fd_path_set: + return parse_w_fd_path(args, ret) + elif syscall in ignore_set: + return None + else: + raise ValueError('Unclassified syscall ' + syscall) + +def strip_prefix(l): + if l[0].isdigit(): + return l.split(' ', maxsplit=1)[1] + else: + return l + +def handle_info(l): + if '+++' in l: + return True, parse_info(l) + elif '---' in l: + return True, None + else: + return False, None + +def parse_line(l): + is_info, info = handle_info(l) + if is_info: + return info + if not len(l): + return None + l = strip_prefix(l) + lparen = l.find('(') + rparen = l.rfind(')') + equals = l.rfind('=') + syscall = l[:lparen] + args = l[lparen+1:rparen] + ret = l[equals+1] + return parse_syscall(syscall, args, ret) + + +def main(fname): + with open(fname) as f: + s = f.read() + for l in s.split('\n'): + print(parse_line(l)) + +debug_g = r''' +start: ESCAPED_STRING + +%import common.ESCAPED_STRING +''' +if __name__ == '__main__': + # parser = lark.Lark(debug_g) + # parser.parse('"lskjkf"') + main(sys.argv[1]) diff --git a/test/tracer_test/Makefile b/test/tracer_test/Makefile new file mode 100644 index 00000000..350528a2 --- /dev/null +++ b/test/tracer_test/Makefile @@ -0,0 +1,10 @@ +objs = chdir pid + +CFLAGS=-O2 + +.PHONY: all clean + +all: $(objs) + +clean: + rm $(objs) diff --git a/test/tracer_test/chdir.c b/test/tracer_test/chdir.c new file mode 100644 index 00000000..a7a66a9e --- /dev/null +++ b/test/tracer_test/chdir.c @@ -0,0 +1,27 @@ +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); +} + +int main(void) +{ + int fd, ret; + reset(); + ret = chdir(TMPDIR); + if (ret < 0) + exit(1); + fd = open("a", O_RDONLY); + if (fd < 0) + exit(1); + close(fd); + return 0; +} diff --git a/test/tracer_test/pid.c b/test/tracer_test/pid.c new file mode 100644 index 00000000..28b9cc04 --- /dev/null +++ b/test/tracer_test/pid.c @@ -0,0 +1,26 @@ +#include +#include +#include +#include + +#define NUM_CALLS 100000 + +int main() { + struct timeval start, end; + long seconds, useconds; + double mtime; + + gettimeofday(&start, NULL); // get the start time + for (int i = 0; i < NUM_CALLS; ++i) { + syscall(SYS_getpid); + } + gettimeofday(&end, NULL); // get the end time + + seconds = end.tv_sec - start.tv_sec; + useconds = end.tv_usec - start.tv_usec; + mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5; + + printf("Elapsed time for getpid syscall: %.3f milliseconds\n", mtime); + printf("Average time per getpid syscall: %.3f microseconds\n", mtime * 1000 / NUM_CALLS); + return 0; +} From c189d274571440cd4db41dfaab16618b0e78d515 Mon Sep 17 00:00:00 2001 From: SleepyMug Date: Sun, 29 Oct 2023 03:34:32 -0400 Subject: [PATCH 02/39] changes to support clone and chdir --- parallel-orch/trace_v2.py | 180 +++++++++++++++++++++++++++---------- test/tracer_test/Makefile | 7 +- test/tracer_test/chdir.c | 3 +- test/tracer_test/fork.c | 37 ++++++++ test/tracer_test/symlink.c | 39 ++++++++ test/tracer_test/thread.c | 41 +++++++++ 6 files changed, 256 insertions(+), 51 deletions(-) create mode 100644 test/tracer_test/fork.c create mode 100644 test/tracer_test/symlink.c create mode 100644 test/tracer_test/thread.c diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py index 125614f2..824829e0 100644 --- a/parallel-orch/trace_v2.py +++ b/parallel-orch/trace_v2.py @@ -12,51 +12,110 @@ def parse_info(l): @dataclass class RFile: fname: str + def __init__(self, fname): + self.fname = os.path.normpath(fname) @dataclass class WFile: fname: str + def __init__(self, fname): + self.fname = os.path.normpath(fname) -# openat +class Context: + def __init__(self): + self.line_dict = {} + self.curdir_dict = {} + self.pid_group_dict = {} + + def do_clone(self, parent, child): + self.pid_group_dict[child] = parent + + def set_dir(self, path, pid=None): + self.curdir_fallback = path + if pid and pid in self.pid_group_dict: + pid = self.pid_group_dict[pid] + if pid: + self.curdir_dict[pid] = path + + def get_dir(self, pid: int): + if pid in self.pid_group_dict: + pid = self.pid_group_dict[pid] + if not pid in self.curdir_dict: + self.curdir_dict[pid] = self.curdir_fallback + return self.curdir_dict[pid] + + def push_half_line(self, pid: int, l): + index = l.find('') + len('resumed>') + total_line = self.line_dict[pid] + l[index:].strip() + del self.line_dict[pid] + return total_line + +# openat, open, chdir, clone r_first_path_set = set(['execve', 'stat', 'lstat', 'access', 'statfs']) w_first_path_set = set(['mkdir']) r_fd_path_set = set(['fstatat', 'newfstatat']) -w_fd_path_set = set(['unlinkat']) +w_fd_path_set = set(['unlinkat', 'utimensat']) ignore_set = set(['getpid']) def parse_string(s): s = s.strip() + # handling cases such as utimensat + # if the open fails we will mark the file + # as a read when we handle return value anyway so it's fine + if s == 'NULL': + return '' assert s[0] == '"' and s[-1] == '"' return bytes(s[1:-1], "utf-8").decode("unicode_escape") def between(s, d1, d2): return s.find(d1) + len(d1), s.rfind(d2) -def get_path_first_path(args): - a, _ = args.split(sep=',', maxsplit=1) - return parse_string(a) +def is_absolute(path): + return path[0] == '/' + +def is_ret_err(ret: str): + ret = ret.strip() + return ret[0] == '-' + +def get_path_first_path(pid, args, ctx): + a = args.split(sep=',', maxsplit=1)[0] + if is_absolute(a): + return parse_string(a) + else: + return os.path.join(ctx.get_dir(pid), parse_string(a)) -def parse_r_first_path(args, ret): - return RFile(get_path_first_path(args)) +def parse_r_first_path(pid, args, ret, ctx): + return RFile(get_path_first_path(pid, args, ctx)) -def parse_w_first_path(args, ret): - path = get_path_first_path(args) - if is_ret_enoent(ret): +def parse_w_first_path(pid, args, ret, ctx): + path = get_path_first_path(pid, args, ctx) + if is_ret_err(ret): return RFile(path) else: return WFile(path) -def is_open_flag(flags): +def parse_chdir(pid, args, ret, ctx): + new_path = get_path_first_path(pid, args, ctx) + if not is_ret_err(ret): + ctx.set_dir(new_path, pid) + return RFile(new_path) + +def handle_open_flag(flags): if 'O_RDONLY' in flags: return 'r' else: return 'w' -def is_absolute(path): - return path[0] == '/' - -def is_ret_enoent(ret): - return 'ENOENT' in ret +def handle_open_common(total_path, flags, ret): + if handle_open_flag(flags) == 'r': + return RFile(total_path) + if is_ret_err(ret): + return RFile(total_path) + return WFile(total_path) def parse_openat(args, ret): if args.count(',') <= 2: @@ -70,19 +129,17 @@ def parse_openat(args, ret): begin, end = between(dfd, '<', '>') pwd = dfd[begin:end] total_path = os.path.join(pwd, path) - if is_open_flag(flags) == 'r': - return RFile(total_path) - if is_ret_enoent(ret): - return RFile(total_path) - return WFile(total_path) - -def parse_chdir(args, ret): - return None + return handle_open_common(total_path, flags, ret) +def parse_open(pid, args, ret, ctx): + total_path = get_path_first_path(pid, args, ctx) + flags = args.split(',')[1] + return handle_open_common(total_path, flags, ret) + def get_path_from_fd_path(args): a0, a1, _ = args.split(sep=',', maxsplit=2) a1 = parse_string(a1) - if a1[0] == '/': + if len(a1) and a1[0] == '/': return a1 else: begin, end = between(a0, '<', '>') @@ -93,34 +150,58 @@ def parse_r_fd_path(args, ret): return RFile(get_path_from_fd_path(args)) def parse_w_fd_path(args, ret): - if is_ret_enoent(ret): + if is_ret_err(ret): return RFile(get_path_from_fd_path(args)) else: return WFile(get_path_from_fd_path(args)) + +def has_clone_fs(flags): + if 'CLONE_FS' in flags: + return True + else: + return False + +def parse_clone(pid, args, ret, ctx): + try: + child = int(ret) + except ValueError: + child = -1 + if child < 0: + return + arg_list = [x.strip() for x in args.split(',')] + flags = [arg for arg in arg_list if arg.startswith('flags=')][0] + flags = flags[len('flags='):] + if has_clone_fs(flags): + ctx.do_clone(pid, child) -def parse_syscall(syscall, args, ret): +def parse_syscall(pid, syscall, args, ret, ctx): if syscall in r_first_path_set: - return parse_r_first_path(args, ret) + return parse_r_first_path(pid, args, ret, ctx) elif syscall in w_first_path_set: - return parse_w_first_path(args, ret) + return parse_w_first_path(pid, args, ret, ctx) elif syscall == 'openat': return parse_openat(args, ret) elif syscall == 'chdir': - return parse_chdir(args, ret) + return parse_chdir(pid, args, ret, ctx) + elif syscall == 'open': + return parse_open(pid, args, ret, ctx) elif syscall in r_fd_path_set: return parse_r_fd_path(args, ret) elif syscall in w_fd_path_set: return parse_w_fd_path(args, ret) + elif syscall == 'clone': + return parse_clone(pid, args, ret, ctx) elif syscall in ignore_set: return None else: raise ValueError('Unclassified syscall ' + syscall) -def strip_prefix(l): +def strip_pid(l): if l[0].isdigit(): - return l.split(' ', maxsplit=1)[1] + pair = l.split(' ', maxsplit=1) + return int(pair[0]), pair[1] else: - return l + raise ValueError('expect pid') def handle_info(l): if '+++' in l: @@ -129,35 +210,38 @@ def handle_info(l): return True, None else: return False, None - -def parse_line(l): + +def parse_line(l, ctx): is_info, info = handle_info(l) if is_info: return info if not len(l): return None - l = strip_prefix(l) + pid, l = strip_pid(l) + if "" in l: + l = ctx.pop_complete_line(pid, l) lparen = l.find('(') - rparen = l.rfind(')') equals = l.rfind('=') + rparen = l[:equals].rfind(')') + assert lparen >= 0 and equals >= 0 and rparen >= 0 syscall = l[:lparen] + ret = l[equals+1:] args = l[lparen+1:rparen] - ret = l[equals+1] - return parse_syscall(syscall, args, ret) + return parse_syscall(pid, syscall, args, ret, ctx) + - def main(fname): with open(fname) as f: s = f.read() + ctx = Context() + ctx.set_dir(os.getcwd()) for l in s.split('\n'): - print(parse_line(l)) - -debug_g = r''' -start: ESCAPED_STRING + record = parse_line(l, ctx) + if record: + print(record) -%import common.ESCAPED_STRING -''' if __name__ == '__main__': - # parser = lark.Lark(debug_g) - # parser.parse('"lskjkf"') main(sys.argv[1]) diff --git a/test/tracer_test/Makefile b/test/tracer_test/Makefile index 350528a2..b7e40cac 100644 --- a/test/tracer_test/Makefile +++ b/test/tracer_test/Makefile @@ -1,10 +1,13 @@ -objs = chdir pid +objs = chdir pid fork symlink CFLAGS=-O2 .PHONY: all clean -all: $(objs) +all: $(objs) thread + +thread: thread.c + $(CC) $(CFLAGS) -pthread $< -o $@ clean: rm $(objs) diff --git a/test/tracer_test/chdir.c b/test/tracer_test/chdir.c index a7a66a9e..e2282bfd 100644 --- a/test/tracer_test/chdir.c +++ b/test/tracer_test/chdir.c @@ -2,6 +2,7 @@ #include #include #include +#include #define TMPDIR "/tmp/hs_tracer_test" @@ -19,7 +20,7 @@ int main(void) ret = chdir(TMPDIR); if (ret < 0) exit(1); - fd = open("a", O_RDONLY); + fd = syscall(SYS_open, "a", O_RDONLY); if (fd < 0) exit(1); close(fd); diff --git a/test/tracer_test/fork.c b/test/tracer_test/fork.c new file mode 100644 index 00000000..5a9533ae --- /dev/null +++ b/test/tracer_test/fork.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); + ret = system("mkdir " TMPDIR "/a"); + ret = system("mkdir " TMPDIR "/b"); +} + +int main(void) +{ + int fd, ret; + reset(); + ret = fork(); + if (ret == 0) { + ret = chdir(TMPDIR "/a"); + if (ret < 0) + exit(1); + } else { + ret = chdir(TMPDIR "/b"); + if (ret < 0) + exit(1); + } + fd = syscall(SYS_open, "f", O_RDONLY); + if (fd < 0) + exit(1); + close(fd); + return 0; +} diff --git a/test/tracer_test/symlink.c b/test/tracer_test/symlink.c new file mode 100644 index 00000000..613b58dd --- /dev/null +++ b/test/tracer_test/symlink.c @@ -0,0 +1,39 @@ +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); + ret = system("mkdir " TMPDIR "/a"); + ret = system("mkdir " TMPDIR "/b"); + ret = system("touch " TMPDIR "/b/f"); + ret = system("ln -s " TMPDIR "/b/f " TMPDIR "/a/f"); +} + +int main(void) +{ + int fd, ret; + reset(); + /* if (ret == 0) { */ + /* ret = chdir(TMPDIR "/a"); */ + /* if (ret < 0) */ + /* exit(1); */ + /* } else { */ + /* ret = chdir(TMPDIR "/b"); */ + /* if (ret < 0) */ + /* exit(1); */ + /* } */ + ret = chdir(TMPDIR "/a"); + fd = syscall(SYS_open, "f", O_RDONLY); + if (fd < 0) + exit(1); + close(fd); + return 0; +} diff --git a/test/tracer_test/thread.c b/test/tracer_test/thread.c new file mode 100644 index 00000000..3d109917 --- /dev/null +++ b/test/tracer_test/thread.c @@ -0,0 +1,41 @@ +#include +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); + ret = system("mkdir " TMPDIR "/a"); + ret = system("mkdir " TMPDIR "/b"); + ret = system("touch " TMPDIR "/a/f"); +} + +void *threaded_chdir(void *p) +{ + int ret; + ret = chdir(TMPDIR "/a"); + return NULL; +} + +int main(void) +{ + pthread_t child; + int fd, ret; + reset(); + ret = chdir(TMPDIR "/b"); + ret = pthread_create(&child, NULL, threaded_chdir, NULL); + ret = pthread_join(child, NULL); + fd = syscall(SYS_open, "f", O_RDONLY); + fd = syscall(SYS_open, "g", O_RDONLY); + if (fd < 0) + exit(1); + close(fd); + return 0; +} From e7bb52bbfc61a3193eb1a366f3c2ab19fd458c6e Mon Sep 17 00:00:00 2001 From: SleepyMug Date: Sun, 29 Oct 2023 04:02:13 -0400 Subject: [PATCH 03/39] correct realtime pipe behavior --- parallel-orch/trace_v2.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py index 824829e0..8eee73a1 100644 --- a/parallel-orch/trace_v2.py +++ b/parallel-orch/trace_v2.py @@ -234,14 +234,13 @@ def parse_line(l, ctx): def main(fname): - with open(fname) as f: - s = f.read() ctx = Context() ctx.set_dir(os.getcwd()) - for l in s.split('\n'): - record = parse_line(l, ctx) - if record: - print(record) + with open(fname) as f: + for l in f: + record = parse_line(l, ctx) + if record: + print(record) if __name__ == '__main__': main(sys.argv[1]) From 1cb10416a3900996e5ef5e9250aef4a7b2b0e99b Mon Sep 17 00:00:00 2001 From: Guest Date: Sat, 11 Nov 2023 09:06:43 -0500 Subject: [PATCH 04/39] basic integration --- parallel-orch/partial_program_order.py | 5 ++- parallel-orch/template_script_to_execute.sh | 50 +++++++++++---------- parallel-orch/trace_v2.py | 44 ++++++++++++++++-- 3 files changed, 70 insertions(+), 29 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 8aabcd7f..4aff7c81 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -7,6 +7,7 @@ import config import executor import trace +import trace_v2 from util import * import util from collections import defaultdict @@ -1394,7 +1395,7 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand self.stopped.add(node_id) else: trace_object = executor.read_trace(sandbox_dir, trace_file) - cmd_exit_code = trace.parse_exit_code(trace_object) + cmd_exit_code = trace_v2.parse_exit_code(trace_object) ## Save the completed node info. Note that if the node doesn't commit ## this information will be invalid and rewritten the next time execution @@ -1405,7 +1406,7 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand ## We no longer add failed commands to the stopped set, ## because this leads to more repetitions than needed ## and does not allow us to properly speculate commands - read_set, write_set = trace.parse_and_gather_cmd_rw_sets(trace_object) + read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object) rw_set = RWSet(read_set, write_set) self.update_rw_set(node_id, rw_set) diff --git a/parallel-orch/template_script_to_execute.sh b/parallel-orch/template_script_to_execute.sh index a63596d1..d9f11171 100755 --- a/parallel-orch/template_script_to_execute.sh +++ b/parallel-orch/template_script_to_execute.sh @@ -1,35 +1,39 @@ #!/bin/bash -touch "$TEMPDIR/Rikerfile" +# touch "$TEMPDIR/Rikerfile" -## We source the latest env file -## TODO: Executing through $RUNTIME_DIR/pash_source_declare_vars.sh fails. Figure out why. -echo "source $LATEST_ENV_FILE" > "$TEMPDIR/Rikerfile" +# ## We source the latest env file +# ## TODO: Executing through $RUNTIME_DIR/pash_source_declare_vars.sh fails. Figure out why. +# echo "source $LATEST_ENV_FILE" > "$TEMPDIR/Rikerfile" -## Save the script to execute in the sandboxdir -echo $CMD_STRING >> "$TEMPDIR/Rikerfile" +# ## Save the script to execute in the sandboxdir +# echo $CMD_STRING >> "$TEMPDIR/Rikerfile" -## Add command to export Riker's environment variables after run is complete to a file -echo "source $RUNTIME_DIR/pash_declare_vars.sh $POST_EXEC_ENV" >> "$TEMPDIR/Rikerfile" +# ## Add command to export Riker's environment variables after run is complete to a file +# echo "source $RUNTIME_DIR/pash_declare_vars.sh $POST_EXEC_ENV" >> "$TEMPDIR/Rikerfile" -if [ $speculate_flag -eq 1 ]; then - rkr_cmd="rkr" -else - rkr_cmd="rkr --frontier" -fi +# if [ $speculate_flag -eq 1 ]; then +# rkr_cmd="rkr" +# else +# rkr_cmd="rkr --frontier" +# fi -cat "$TEMPDIR/Rikerfile" 1>&2 +# cat "$TEMPDIR/Rikerfile" 1>&2 -$rkr_cmd --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile" -exit_code="$?" +# $rkr_cmd --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile" +# exit_code="$?" -if [ "$exit_code" -eq 0 ]; then - echo "first riker run done (Node: ${CMD_ID})" 1>&2 -else - echo "Riker error: first Riker command failed with EC $exit_code - (Node: ${CMD_ID})" 1>&2 -fi +# if [ "$exit_code" -eq 0 ]; then +# echo "first riker run done (Node: ${CMD_ID})" 1>&2 +# else +# echo "Riker error: first Riker command failed with EC $exit_code - (Node: ${CMD_ID})" 1>&2 +# fi + + +# rkr --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile" --debug trace -o "$TRACE_FILE" > /dev/null +# echo 'second riker run done' 1>&2 +source $LATEST_ENV_FILE +eval $(echo "strace -y -f --seccomp-bpf --trace=fork,clone,%file -o $TRACE_FILE $CMD_STRING") -rkr --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile" --debug trace -o "$TRACE_FILE" > /dev/null -echo 'second riker run done' 1>&2 (exit $exit_code) diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py index 8eee73a1..8594e4c6 100644 --- a/parallel-orch/trace_v2.py +++ b/parallel-orch/trace_v2.py @@ -1,13 +1,25 @@ import re import os.path import sys +from typing import Tuple from dataclasses import dataclass # Global TODOs: # handle pwd, such that open and stat can work +@dataclass +class ExitStatus: + exitcode: int + def parse_info(l): - return 0 + if "exited" in l: + start = len("+++ exited with ") + end = -len(" +++") + return ExitStatus(int(l[start:end])) + elif 'killed' in l: + return ExitStatus(-1) + else: + raise ValueError @dataclass class RFile: @@ -59,7 +71,7 @@ def pop_complete_line(self, pid: int, l): w_first_path_set = set(['mkdir']) r_fd_path_set = set(['fstatat', 'newfstatat']) w_fd_path_set = set(['unlinkat', 'utimensat']) -ignore_set = set(['getpid']) +ignore_set = set(['getpid', 'getcwd']) def parse_string(s): s = s.strip() @@ -198,7 +210,7 @@ def parse_syscall(pid, syscall, args, ret, ctx): def strip_pid(l): if l[0].isdigit(): - pair = l.split(' ', maxsplit=1) + pair = l.split(maxsplit=1) return int(pair[0]), pair[1] else: raise ValueError('expect pid') @@ -212,12 +224,12 @@ def handle_info(l): return False, None def parse_line(l, ctx): + pid, l = strip_pid(l) is_info, info = handle_info(l) if is_info: return info if not len(l): return None - pid, l = strip_pid(l) if " int: + if len(trace_object) < 1: + return None + l = trace_object[0] + first_pid, _ = strip_pid(l) + for l in trace_object: + pid, tmpl = strip_pid(l) + is_info, info = handle_info(tmpl) + if is_info and pid == first_pid and isinstance(info, ExitStatus): + return info.exitcode + raise ValueError("No exitcode") + +def parse_and_gather_cmd_rw_sets(trace_object) -> Tuple[set, set]: + ctx = Context() + ctx.set_dir(os.getcwd()) + read_set = set() + write_set = set() + for l in trace_object: + record = parse_line(l, ctx) + if type(record) is RFile: + read_set.add(record.fname) + elif type(record) is WFile: + write_set.add(record.fname) + return read_set, write_set def main(fname): ctx = Context() From 8e439b6943ad9e1d9302f519cfa802d0ad9b58c1 Mon Sep 17 00:00:00 2001 From: Guest Date: Sat, 11 Nov 2023 17:49:25 -0500 Subject: [PATCH 05/39] more test cases --- test/tracer_test/Makefile | 2 +- test/tracer_test/getcwd.c | 28 ++++++++++++++++++++++++++++ test/tracer_test/mkdir.c | 23 +++++++++++++++++++++++ test/tracer_test/retval.c | 22 ++++++++++++++++++++++ 4 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 test/tracer_test/getcwd.c create mode 100644 test/tracer_test/mkdir.c create mode 100644 test/tracer_test/retval.c diff --git a/test/tracer_test/Makefile b/test/tracer_test/Makefile index b7e40cac..74677e5f 100644 --- a/test/tracer_test/Makefile +++ b/test/tracer_test/Makefile @@ -1,4 +1,4 @@ -objs = chdir pid fork symlink +objs = chdir pid fork symlink getcwd retval mkdir CFLAGS=-O2 diff --git a/test/tracer_test/getcwd.c b/test/tracer_test/getcwd.c new file mode 100644 index 00000000..71b4f3bd --- /dev/null +++ b/test/tracer_test/getcwd.c @@ -0,0 +1,28 @@ +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); +} + +int main(void) +{ + int fd, ret; + char name[1024]; + reset(); + ret = chdir(TMPDIR); + if (ret < 0) + exit(1); + ret = syscall(SYS_getcwd, name, 1024); + if (ret < 0) + exit(1); + return 0; +} diff --git a/test/tracer_test/mkdir.c b/test/tracer_test/mkdir.c new file mode 100644 index 00000000..6903e395 --- /dev/null +++ b/test/tracer_test/mkdir.c @@ -0,0 +1,23 @@ +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); +} + +int main(void) +{ + int fd, ret; + reset(); + ret = system("mkdir -p " TMPDIR "/a"); + ret = system("mkdir -p " TMPDIR "/a"); + return 0; +} diff --git a/test/tracer_test/retval.c b/test/tracer_test/retval.c new file mode 100644 index 00000000..986ab538 --- /dev/null +++ b/test/tracer_test/retval.c @@ -0,0 +1,22 @@ +#include +#include +#include +#include +#include + +#define TMPDIR "/tmp/hs_tracer_test" + +void reset(void) +{ + int ret; + ret = system("rm -rf " TMPDIR); + ret = system("mkdir " TMPDIR); +} + +int main(void) +{ + int fd, ret; + char name[1024]; + reset(); + return 112; +} From 379d616546c0f47c4eeabbb5f17aa1e01a528ae1 Mon Sep 17 00:00:00 2001 From: Guest Date: Sun, 12 Nov 2023 01:00:10 -0500 Subject: [PATCH 06/39] catch exit code --- parallel-orch/template_script_to_execute.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parallel-orch/template_script_to_execute.sh b/parallel-orch/template_script_to_execute.sh index d9f11171..ea36ac4e 100755 --- a/parallel-orch/template_script_to_execute.sh +++ b/parallel-orch/template_script_to_execute.sh @@ -34,6 +34,6 @@ # echo 'second riker run done' 1>&2 source $LATEST_ENV_FILE eval $(echo "strace -y -f --seccomp-bpf --trace=fork,clone,%file -o $TRACE_FILE $CMD_STRING") - +exit_code=$? (exit $exit_code) From 9e921e572c510e4b4503b330c9f72a2d70aa448d Mon Sep 17 00:00:00 2001 From: gliargovas Date: Wed, 15 Nov 2023 05:54:40 -0500 Subject: [PATCH 07/39] Fix bug in scheduler state updating --- parallel-orch/partial_program_order.py | 1 + 1 file changed, 1 insertion(+) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 4aff7c81..13c739ba 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -1368,6 +1368,7 @@ def attempt_rerun_pending_nodes(self): for node in run_after_nodes: if node not in self.get_currently_executing(): logging.debug(f"Running node {node} after execution of {node_id}") + self.waiting_for_frontend.discard(node) self.workset.append(node) self.pending_to_execute.discard(node) self.set_latest_env_file_for_node(node, self.get_new_env_file_for_node(node_id)) From 43c91758080f887c6db18a9c179e0a79b2181b28 Mon Sep 17 00:00:00 2001 From: SleepyMug Date: Wed, 13 Dec 2023 14:48:50 -0500 Subject: [PATCH 08/39] support for eager dependency resolving --- parallel-orch/executor.py | 13 +- parallel-orch/partial_program_order.py | 256 ++++++++++++++----------- parallel-orch/run_command.sh | 17 +- parallel-orch/scheduler_server.py | 20 +- parallel-orch/trace_v2.py | 62 ++++-- parallel-orch/util.py | 7 + test/misc/cat_and_sleep.sh | 4 + test/test_orch.sh | 11 ++ test/test_scripts/test_early_stop1.sh | 7 + test/test_scripts/test_early_stop2.sh | 9 + 10 files changed, 265 insertions(+), 141 deletions(-) create mode 100755 test/misc/cat_and_sleep.sh create mode 100644 test/test_scripts/test_early_stop1.sh create mode 100644 test/test_scripts/test_early_stop2.sh diff --git a/parallel-orch/executor.py b/parallel-orch/executor.py index 0349285e..526b2dd4 100644 --- a/parallel-orch/executor.py +++ b/parallel-orch/executor.py @@ -13,20 +13,21 @@ def async_run_and_trace_command_return_trace(command, node_id, latest_env_file, stdout_file = util.ptempfile() stderr_file = util.ptempfile() post_execution_env_file = util.ptempfile() + sandbox_dir, tmp_dir = util.create_sandbox() logging.debug(f'Scheduler: Stdout file for: {node_id} is: {stdout_file}') logging.debug(f'Scheduler: Stderr file for: {node_id} is: {stderr_file}') logging.debug(f'Scheduler: Trace file for: {node_id}: {trace_file}') - process = async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, speculate_mode) - return process, trace_file, stdout_file, stderr_file, post_execution_env_file + process = async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode) + return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, node_id, latest_env_file): - process, trace_file, stdout_file, stderr_file, post_execution_env_file = async_run_and_trace_command_return_trace(command, node_id, latest_env_file, speculate_mode=True) - return process, trace_file, stdout_file, stderr_file, post_execution_env_file + process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, node_id, latest_env_file, speculate_mode=True) + return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir -def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, speculate_mode=False): +def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False): ## Call Riker to execute the command run_script = f'{config.PASH_SPEC_TOP}/parallel-orch/run_command.sh' - args = ["/bin/bash", run_script, command, trace_file, stdout_file, latest_env_file] + args = ["/bin/bash", run_script, command, trace_file, stdout_file, latest_env_file, sandbox_dir, tmp_dir] if speculate_mode: args.append("speculate") else: diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 13c739ba..2242d9a4 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -30,7 +30,7 @@ def get_post_execution_env_file(self): def get_stdout_file(self): return self.stdout_file - + def get_sandbox_dir(self): return self.sandbox_dir @@ -61,7 +61,7 @@ def get_outer(self): def pop_outer(self): return self.loops.pop() - + def add_inner(self, loop_iter_id: int): self.loops.insert(0, loop_iter_id) @@ -92,16 +92,16 @@ def __eq__(self, other): class NodeId: def __init__(self, id: int, loop_iters=None): self.id = id - + if loop_iters is None: self.loop_iters = LoopStack() else: assert(isinstance(loop_iters, LoopStack)) self.loop_iters = loop_iters - + def has_iters(self): return not self.loop_iters.is_empty() - + def get_iters(self): return copy.deepcopy(self.loop_iters) @@ -135,17 +135,17 @@ def __ne__(self, other): # Not strictly necessary, but to avoid having both x==y and x!=y # True at the same time return not(self == other) - + ## TODO: Define this correctly if it is to be used for something other than dictionary indexing def __lt__(self, obj): return (str(self) < str(obj)) - + def __gt__(self, obj): return (str(self) > str(obj)) - + # def __le__(self, obj): # return ((self.b) <= (obj.b)) - + # def __ge__(self, obj): # return ((self.b) >= (obj.b)) @@ -191,10 +191,10 @@ def get_cmd(self) -> str: def get_cmd_no_redir(self) -> str: return self.cmd_no_redir - + def get_loop_context(self) -> LoopStack: return self.loop_context - + def in_loop(self) -> bool: return not self.loop_context.is_empty() @@ -211,11 +211,10 @@ def get_next_iter(self, loop_id: int) -> int: ## a node is committed. def set_completed_info(self, completed_node_info: CompletedNodeInfo): self.completed_node_info = completed_node_info - + def get_completed_node_info(self) -> CompletedNodeInfo: return self.completed_node_info - class RWSet: def __init__(self, read_set: set, write_set: set): @@ -258,7 +257,7 @@ def __init__(self, nodes, edges, initial_env_file): ## A dictionary from cmd_ids that are currently executing that contains their trace_files self.commands_currently_executing = {} ## A dictionary that contains information about completed nodes - ## from cmd_id -> CompletedNodeInfo + ## from cmd_id -> CompletedNodeInfo ## Note: this dictionary does not contain information ## TODO: Delete this self.completed_node_info = {} @@ -290,7 +289,7 @@ def __init__(self, nodes, edges, initial_env_file): self.pending_to_execute = set() self.to_be_resolved_prev = {} self.prechecked_env = set() - + def __str__(self): return f"NODES: {len(self.nodes.keys())} | ADJACENCY: {self.adjacency}" @@ -334,11 +333,11 @@ def get_sub_po_source_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": if len(prev_ids_set) == 0 or \ not prev_ids_set.issubset(node_set): source_nodes.append(node_id) - + ## KK 2024-05-03: I don't see how we can get multiple sources with the current structure assert(len(source_nodes) == 1) return source_nodes - + def get_sub_po_sink_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": # assert(self.is_closed_sub_partial_order(node_ids)) sink_nodes = list() @@ -349,31 +348,31 @@ def get_sub_po_sink_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": if len(next_ids_set) == 0 or \ not next_ids_set.issubset(node_set): sink_nodes.append(node_id) - + ## KK 2024-05-03: I don't see how we can get multiple sink with the current structure assert(len(sink_nodes) == 1) return sink_nodes - + def set_new_env_file_for_node(self, node_id: NodeId, new_env_file: str): self.new_envs[node_id] = new_env_file - + def get_new_env_file_for_node(self, node_id: NodeId) -> str: return self.new_envs.get(node_id) - + def set_latest_env_file_for_node(self, node_id: NodeId, latest_env_file: str): self.latest_envs[node_id] = latest_env_file - + def get_latest_env_file_for_node(self, node_id: NodeId) -> str: return self.latest_envs.get(node_id) - + def get_most_recent_possible_new_env_for_node(self, node_id) -> str: most_recent_env_node = node_id while self.get_new_env_file_for_node(most_recent_env_node) is None: predecessor = self.get_prev(most_recent_env_node) - + ## This will trigger when we move to full Partial Orders assert len(predecessor) <= 1 - + ## If there are no predecessors for a node it means we are at the source ## so there is no point to search further back if len(predecessor) == 0: @@ -391,7 +390,7 @@ def get_sub_po_prev_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": for node_id in node_ids: prev_ids_set = set(self.get_prev(node_id)) prev_nodes = prev_nodes.union(prev_ids_set - node_set) - + ## KK 2024-05-03: I don't see how we can get multiple sources with the current structure assert(len(prev_nodes) <= 1) return list(prev_nodes) @@ -424,9 +423,9 @@ def init_partial_order(self): logging.debug(f'To be resolved sets per node:') logging.debug(self.to_be_resolved) logging.info(f'Initialized the partial order!') - self.log_partial_program_order_info() + # self.log_partial_program_order_info() assert(self.valid()) - + def init_latest_env_files(self, node=None): if node is None: @@ -449,7 +448,7 @@ def get_workset(self) -> list: def get_unsafe(self) -> set: return copy.deepcopy(self.unsafe) - + ## Only return the stopped that are not unsafe def get_stopped_safe(self) -> set: return copy.deepcopy(self.stopped.difference(self.unsafe)) @@ -478,7 +477,7 @@ def init_inverse_adjacency(self): ## TODO: Call valid and add assertiosn for loops here. def valid(self): logging.debug("Checking partial order validity...") - self.log_partial_program_order_info() + # self.log_partial_program_order_info() valid1 = self.loop_nodes_valid() ## TODO: Add a check that for x, y : NodeIds, x < y iff x is a predecessor to x ## This is necessary due to the `hypothetical_before` method. @@ -500,7 +499,7 @@ def loop_nodes_valid(self): self.get_workset() + \ list(self.stopped) + \ list(self.commands_currently_executing.keys()) - loop_nodes_in_forbidden_sets = [node_id for node_id in forbidden_sets + loop_nodes_in_forbidden_sets = [node_id for node_id in forbidden_sets if self.is_loop_node(node_id)] return len(loop_nodes_in_forbidden_sets) == 0 @@ -519,9 +518,9 @@ def get_node_loop_context(self, node_id: NodeId) -> LoopStack: def get_all_non_committed(self) -> "list[NodeId]": all_node_ids = self.nodes.keys() non_committed_node_ids = [node_id for node_id in all_node_ids - if not self.is_committed(node_id)] + if not self.is_committed(node_id)] return non_committed_node_ids - + ## This adds a node to the committed set and saves important information def commit_node(self, node_id: NodeId): logging.debug(f" > Commiting node {node_id}") @@ -535,7 +534,7 @@ def is_loop_node(self, node_id:NodeId) -> bool: def filter_standard_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": return [node_id for node_id in node_ids if not self.is_loop_node(node_id)] - + def filter_loop_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": return [node_id for node_id in node_ids if self.is_loop_node(node_id)] @@ -565,15 +564,15 @@ def get_next(self, node_id:NodeId) -> "list[NodeId]": def get_prev(self, node_id:NodeId) -> "list[NodeId]": return self.inverse_adjacency[node_id][:] - + def add_edge(self, from_id: NodeId, to_id: NodeId): ## KK 2023-05-04 Is it a problem that we append? Maybe we should make that a set self.adjacency[from_id].append(to_id) self.inverse_adjacency[to_id].append(from_id) - + def remove_edge(self, from_id: NodeId, to_id: NodeId): self.adjacency[from_id].remove(to_id) - self.inverse_adjacency[to_id].remove(from_id) + self.inverse_adjacency[to_id].remove(from_id) def get_transitive_closure(self, target_node_ids:"list[NodeId]") -> "list[NodeId]": all_next_transitive = set(target_node_ids) @@ -585,7 +584,7 @@ def get_transitive_closure(self, target_node_ids:"list[NodeId]") -> "list[NodeId all_next_transitive = all_next_transitive.union(successors) next_work.extend(new_next) return list(all_next_transitive) - + def get_inverse_transitive_closure(self, target_node_ids:"list[NodeId]") -> "list[NodeId]": all_prev_transitive = set(target_node_ids) next_work = target_node_ids.copy() @@ -607,13 +606,13 @@ def get_transitive_closure_if_can_be_resolved(self, can_be_resolved: list, targe all_next_transitive = all_next_transitive.union(successors) next_work.extend(new_next) return list(all_next_transitive) - + def update_rw_set(self, node_id, rw_set): self.rw_sets[node_id] = rw_set def get_rw_set(self, node_id) -> RWSet: return self.rw_sets[node_id] - + def get_rw_sets(self) -> dict: return self.rw_sets @@ -635,7 +634,7 @@ def is_first_node_when_env_is_uninitialized(self, speculate_immediately): logging.debug("Initializing latest env and speculating") return True return False - + # Check if the specific command can be resolved. # KK 2023-05-04 I am not even sure what this function does and why is it useful. def cmd_can_be_resolved(self, node_id: int) -> bool: @@ -669,7 +668,7 @@ def cmd_can_be_resolved(self, node_id: int) -> bool: ## Otherwise we can return logging.debug(f' >> Able to resolve {node_id}') return True - + def __kill_all_currently_executing_and_schedule_restart(self, start=None): nodes_to_kill = self.get_currently_executing() if start is not None: @@ -679,7 +678,7 @@ def __kill_all_currently_executing_and_schedule_restart(self, start=None): most_recent_new_env = self.get_most_recent_possible_new_env_for_node(cmd_id) self.prechecked_env.discard(cmd_id) if most_recent_new_env is not None: - + self.set_latest_env_file_for_node(cmd_id, most_recent_new_env) self.workset.remove(cmd_id) log_time_delta_from_named_timestamp("PartialOrder", "RunNode", cmd_id) @@ -691,7 +690,7 @@ def __kill_all_currently_executing_and_schedule_restart(self, start=None): def __kill_node(self, cmd_id: "NodeId"): logging.debug(f'Killing and restarting node {cmd_id} because some workspaces have to be committed') - proc_to_kill, trace_file, _stdout, _stderr, _post_execution_env_file = self.commands_currently_executing.pop(cmd_id) + proc_to_kill, trace_file, _stdout, _stderr, _post_execution_env_file, _ = self.commands_currently_executing.pop(cmd_id) # Add the trace file to the banned file list so we know to ignore the CommandExecComplete response self.banned_files.add(trace_file) @@ -720,7 +719,7 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self): log_time_delta_from_named_timestamp("PartialOrder", "ResolveDependencies", cmd) log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", cmd, key=f"PostExecResolution-{cmd}") log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProcKilling") - + if len(to_commit) == 0: logging.debug(" > No nodes to be committed this round") else: @@ -735,7 +734,7 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self): def check_dependencies(self, cmds_to_check, get_first_cmd_ids_fn, update_state_due_to_a_dependency_fn): for second_cmd_id in cmds_to_check: for first_cmd_id in get_first_cmd_ids_fn(second_cmd_id): - + if self.rw_sets.get(first_cmd_id) is not None and self.has_forward_dependency(first_cmd_id, second_cmd_id): update_state_due_to_a_dependency_fn(first_cmd_id, second_cmd_id) @@ -780,10 +779,10 @@ def get_first_cmd_ids(second_cmd_id): def update_state_due_to_a_dependency(first_cmd_id, second_cmd_id): logging.debug(f' > Command {second_cmd_id} was added to the workset, due to a forward dependency with {first_cmd_id}') new_workset.add(second_cmd_id) - + new_workset = set() self.check_dependencies(sorted(cmds_to_resolve), get_first_cmd_ids, update_state_due_to_a_dependency) - + return new_workset @@ -791,14 +790,14 @@ def update_state_due_to_a_dependency(first_cmd_id, second_cmd_id): ## Forward dependency is when a command's output is the same ## as the input of a following command def __resolve_dependencies_continuous_and_move_frontier(self, cmds_to_resolve): - self.log_partial_program_order_info() + # self.log_partial_program_order_info() for cmd in cmds_to_resolve: log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ResolveDependencies", cmd) - + logging.debug(f"Commands to be checked for dependencies: {sorted(cmds_to_resolve)}") logging.debug(" --- Starting dependency resolution --- ") new_workset = self.resolve_dependencies(cmds_to_resolve) - + logging.debug(" > Modifying workset accordingly") # New workset contains previous unresolved commands and resolved commands with dependencies that have not been stopped workset_old = self.workset.copy() @@ -823,8 +822,8 @@ def __resolve_dependencies_continuous_and_move_frontier(self, cmds_to_resolve): ## Therefore it does not just check edges, but rather computes if it would be before ## based on ids and loop iterations. ## - ## 1. Check if the loop ids of the two abstract parents of both nodes differ - ## thus showing that one is before the other + ## 1. Check if the loop ids of the two abstract parents of both nodes differ + ## thus showing that one is before the other ## 2. If all loop ids are the same, now we can actually compare iterations. ## If a node is in the same loop ids but in a later iteration then it is later. ## 3. If all iterations are the same too, then we just compare node ids @@ -853,7 +852,7 @@ def hypothetical_before(self, nid1: NodeId, nid2: NodeId): ## We need to keep going i += 1 - ## If we reach this, we know that both nodes are in the same loops up to i + ## If we reach this, we know that both nodes are in the same loops up to i ## so we now compare iterations and node identifiers. iters1 = nid1.get_iters() @@ -900,7 +899,7 @@ def progress_po_due_to_wait(self, node_id: NodeId): all_non_committed_loop_nodes = self.filter_loop_nodes(all_non_committed) non_committed_loop_nodes_that_would_be_predecessors = [n_id for n_id in all_non_committed_loop_nodes if self.hypothetical_before(n_id, node_id)] - + new_committed_nodes = non_committed_loop_nodes_that_would_be_predecessors else: @@ -915,7 +914,7 @@ def progress_po_due_to_wait(self, node_id: NodeId): if not self.is_committed(node_id) and self.is_loop_node(node_id)] logging.debug(f'Non committed loop nodes that are predecessors to {node_id} are: {non_committed_loop_nodes_in_inverse_tc}') - + new_committed_nodes = non_committed_loop_nodes_in_inverse_tc ## And "close them" @@ -924,7 +923,7 @@ def progress_po_due_to_wait(self, node_id: NodeId): logging.debug(f'Adding following loop nodes to committed: {new_committed_nodes}') for node_id in new_committed_nodes: self.commit_node(node_id) - + ## Since we committed some nodes, let's make sure that we also push the frontier ## TODO: Can we do this in a less hacky method? By using a well-defined commit_node_and_push_frontier method? if len(new_committed_nodes) > 0: @@ -942,26 +941,26 @@ def progress_po_due_to_wait(self, node_id: NodeId): ## TODO: Add some form of validity assertion after we are done with this. ## Just to make sure that we haven't violated the continuity of the committed set. - + ## We check if something can be resolved and stepped forward here ## KK 2023-05-10 This seems to work for all tests (so it might be idempotent ## since in many tests there is nothing new to resolve after a wait) self.resolve_commands_that_can_be_resolved_and_push_frontier() ## When the frontend sends a wait for a node, it means that execution in the frontend has - ## already surpassed all nodes prior to it. This is particularly important for loops, + ## already surpassed all nodes prior to it. This is particularly important for loops, ## since we can't always statically predict how many iterations they will do, so the only ## definitive way to know that they are done is to receive a wait for a node after them. def wait_received(self, node_id: NodeId): ## Whenever we receive a wait for a node, we always need to check and "commit" all prior loop nodes ## since we know that they won't have any more iterations (the JIT frontend has already passed them). - + ## We first have to push and progress the PO due to the wait and then unroll ## KK 2023-05-22 Currently this checks whether a still nonexistent node is - ## would be a successor of existing nodes to commit some of + ## would be a successor of existing nodes to commit some of ## them if needed. Unfortunately, to make this check for a non-existent - ## node is very complex and not elegant. - ## TODO: Could we swap unrolling and progressing so that we always + ## node is very complex and not elegant. + ## TODO: Could we swap unrolling and progressing so that we always ## check if a node can be progressed by checking edges? log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id) self.progress_po_due_to_wait(node_id) @@ -975,7 +974,7 @@ def wait_received(self, node_id: NodeId): ## For now we are being conservative and that is why it only happens here ## TODO: Move this to the scheduler.schedule_work() (if we have a loop node waiting for response and we are not unrolled, unroll to create work) self.maybe_unroll(node_id) - + assert(self.valid()) def find_outer_loop_sub_partial_order(self, loop_id: int, nodes_subset: "list[NodeId]") -> "list[NodeId]": @@ -996,13 +995,13 @@ def find_outer_loop_sub_partial_order(self, loop_id: int, nodes_subset: "list[No def unroll_single_loop(self, loop_id: int, nodes_subset: "list[NodeId]"): logging.info(f'Unrolling loop with id: {loop_id}') all_loop_node_ids = self.find_outer_loop_sub_partial_order(loop_id, nodes_subset) - + ## We don't want to unroll already committed nodes loop_node_ids = [nid for nid in all_loop_node_ids if not self.is_committed(nid)] logging.debug(f'Node ids for loop: {loop_id} are: {loop_node_ids}') - + ## Create the new nodes and remap adjacencies accordingly node_mappings = {} for node_id in loop_node_ids: @@ -1028,7 +1027,7 @@ def unroll_single_loop(self, loop_id: int, nodes_subset: "list[NodeId]"): for node_id, new_node_id in node_mappings.items(): old_prev_ids = self.get_prev(node_id) - ## Modify all id to be in the new set except for the + ## Modify all id to be in the new set except for the new_prev_ids = PartialProgramOrder.map_using_mapping(old_prev_ids, node_mappings) self.inverse_adjacency[new_node_id] = new_prev_ids for new_prev_id in new_prev_ids: @@ -1091,7 +1090,7 @@ def unroll_loops(self, loop_contexts: LoopStack) -> NodeId: ## Update all new nodes that we have added all_new_node_ids.update(new_node_ids) - ## Re-set the relevant node ids to only the new nodes (if we unrolled a big loop once, + ## Re-set the relevant node ids to only the new nodes (if we unrolled a big loop once, ## we just want to look at those new unrolled nodes for the next unrolling). relevant_node_ids = new_node_ids @@ -1103,7 +1102,7 @@ def unroll_loops(self, loop_contexts: LoopStack) -> NodeId: if not self.is_loop_node(new_node_id): self.workset.append(new_node_id) ## GL: 08-24-2023: This might not the best way to treat this as we need - ## to update the env half way through the loop. + ## to update the env half way through the loop. ## For now, we just copy the env from the parent loop node non_iter_id = new_node_id.get_non_iter_id() logging.debug(f"Copying latest env from loop context to loop node: {non_iter_id} -> {new_node_id}") @@ -1140,7 +1139,7 @@ def unroll_loop_node(self, target_concrete_node_id: NodeId): ## TODO: This needs to change when we modify unrolling to happen speculatively too ## TODO: This needs to properly add the node to frontier and to resolve dictionary - + # GL 2023-05-22: __frontier_commit_and_push() should be called here instead of step_forward() # Although without it the test cases pass self.frontier.append(new_first_node_id) @@ -1212,7 +1211,7 @@ def __frontier_commit_and_push(self): ## Update the frontier to the new frontier self.frontier = new_frontier - + ## For a file - dir forward dependency to exist, ## we need the succeding command to attempt to read anything that is a subpath of the @@ -1230,13 +1229,13 @@ def has_dir_file_dependency(self, first_cmd_set, second_cmd_set): logging.debug(f' > File forward dependency found C1:({dir}) C2:({other_path})') return True return False - + def is_subpath(self, dir, other_path): other_path.startswith(os.path.abspath(dir)+os.sep) def has_forward_dependency(self, first_id, second_id): first_write_set = set(self.rw_sets[first_id].get_write_set()) - second_read_set = set(self.rw_sets[second_id].get_read_set()) + second_read_set = set(self.rw_sets[second_id].get_read_set()).union(set(self.rw_sets[second_id].get_write_set())) logging.debug(f'Checking dependencies between {first_id} and {second_id}') if not first_write_set.isdisjoint(second_read_set): logging.debug(f' > Forward dependency found {first_write_set.intersection(second_read_set)}') @@ -1247,14 +1246,14 @@ def has_forward_dependency(self, first_id, second_id): else: logging.debug(f' > No dependencies') return False - + def get_all_next_non_committed_nodes(self) -> "list[NodeId]": next_non_committed_nodes = [] for cmd_id in self.get_all_non_committed(): if cmd_id in self.workset and self.is_next_non_committed_node(cmd_id): next_non_committed_nodes.append(cmd_id) return next_non_committed_nodes - + def is_next_non_committed_node(self, node_id: NodeId) -> bool: # We want the predecessor to be committed and the current node to not be committed for prev_node in self.get_prev(node_id): @@ -1288,10 +1287,13 @@ def schedule_work(self, limit=0): ## GL 2023-07-05 populate_to_be_resolved_dict() is OK to call anywhere, ## __frontier_commit_and_push() is not safe to call here self.populate_to_be_resolved_dict() - + ## TODO: Move loop unrolling here for speculation too + conflicted_nodes = self.nodes_with_uncommited_conflict() for cmd_id in self.get_workset(): + if cmd_id in conflicted_nodes: + continue # We only need to schedule non-committed and non-executing nodes if not (cmd_id in self.get_committed() or \ cmd_id in self.commands_currently_executing): @@ -1323,7 +1325,7 @@ def speculate_cmd_non_blocking(self, node_id: NodeId): logging.debug(f'Speculating command: {node_id} {self.get_node(node_id)}') ## TODO: Since these (this and the function above) ## are relevant for the report maker, - ## add them in some library (e.g., trace_for_report) + ## add them in some library (e.g., trace_for_report) ## so that we don't accidentally delete them. logging.debug(f"ExecutingSandboxAdd|{node_id}") self.execute_cmd_core(node_id, speculate=True) @@ -1335,7 +1337,7 @@ def execute_cmd_core(self, node_id: NodeId, speculate=False): is_safe = analysis.safe_to_execute(node.asts, variables) if not is_safe: logging.debug(f'Command: "{node}" is not safe to execute, sending to the original shell to execute...') - + ## Keep some state around to determine that this command is not safe to execute. self.stopped.add(node_id) self.unsafe.add(node_id) @@ -1354,11 +1356,44 @@ def execute_cmd_core(self, node_id: NodeId, speculate=False): else: execute_func = executor.async_run_and_trace_command_return_trace - proc, trace_file, stdout, stderr, post_execution_env_file = execute_func(cmd, node_id, env_file_to_execute_with) - self.commands_currently_executing[node_id] = (proc, trace_file, stdout, stderr, post_execution_env_file) + proc, trace_file, stdout, stderr, post_execution_env_file, sandbox_dir = execute_func(cmd, node_id, env_file_to_execute_with) + self.commands_currently_executing[node_id] = (proc, trace_file, stdout, stderr, post_execution_env_file, sandbox_dir) logging.debug(f" >>>>> Command {node_id} - {proc.pid} just started executing - {post_execution_env_file}") + + def nodes_with_uncommited_conflict(self): + uncommited_run_after = [node_id for node_id in self.run_after if node_id not in self.committed] + total_conflicts = set() + for node_id in uncommited_run_after: + conflicts = self.run_after[node_id] + total_conflicts.update(conflicts) + return total_conflicts - # This method attempts to add to workset (rerun) + def kill_and_stop(self, node_id: NodeId): + proc, _, _, _, _, _ = self.commands_currently_executing.pop(node_id) + util.kill_process_tree(proc.pid, sig=signal.SIGTERM) + + def early_stop_using_dep(self): + for node_id, info_tuple in self.commands_currently_executing.items(): + trace_file = info_tuple[1] + sandbox_dir = info_tuple[5] + try: + trace_object = executor.read_trace(sandbox_dir, trace_file) + except FileNotFoundError: + continue + logging.info(f'going forward') + read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object) + rw_set = RWSet(read_set, write_set) + self.update_rw_set(node_id, rw_set) + for node_id in self.commands_currently_executing: + self.resolve_dependencies_early(node_id) + self.log_partial_program_order_info() + conflicts = self.nodes_with_uncommited_conflict() + to_be_killed = [node_id for node_id in self.commands_currently_executing if node_id in conflicts] + logging.info(f'>>>>>>>>>>>>>>>>> to be killed: {to_be_killed}') + for node_id in to_be_killed: + self.kill_and_stop(node_id) + + # This method attempts to add to workset (rerun) # any command that found to have a dependency through early resolution def attempt_rerun_pending_nodes(self): restarted_nodes = set() @@ -1377,17 +1412,22 @@ def attempt_rerun_pending_nodes(self): new_run_after_nodes.discard(node) self.run_after[node_id] = new_run_after_nodes return restarted_nodes + + def set_sandbox(self, node_id, sandbox_dir): + self.sandbox_dirs[node_id] = sandbox_dir def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sandbox_dir: str): log_time_delta_from_named_timestamp("PartialOrder", "RunNode", node_id) log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolution", node_id, key=f"PostExecResolution-{node_id}") - + logging.debug(f" --- Node {node_id}, just finished execution ---") self.sandbox_dirs[node_id] = sandbox_dir ## TODO: Store variable file somewhere so that we can return when wait - - _proc, trace_file, stdout, stderr, post_execution_env_file = self.commands_currently_executing.pop(node_id) - + if not node_id in self.commands_currently_executing: + return + _proc, trace_file, stdout, stderr, post_execution_env_file, sandbox_dir = self.commands_currently_executing.pop(node_id) + if not sandbox_dir == self.sandbox_dirs[node_id]: + return logging.trace(f"ExecutingRemove|{node_id}") # Handle stopped by riker due to network access if int(riker_exit_code) == 159: @@ -1403,8 +1443,8 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand ## is completed for this node. completed_node_info = CompletedNodeInfo(cmd_exit_code, post_execution_env_file, stdout, sandbox_dir) self.nodes[node_id].set_completed_info(completed_node_info) - - ## We no longer add failed commands to the stopped set, + + ## We no longer add failed commands to the stopped set, ## because this leads to more repetitions than needed ## and does not allow us to properly speculate commands read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object) @@ -1416,26 +1456,24 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand if node_id in self.workset: self.workset.remove(node_id) logging.debug(f"WorksetRemove|{node_id}") - # If no commands can be resolved this round, + # If no commands can be resolved this round, # do nothing and wait until a new command finishes executing logging.debug("No resolvable nodes were found in this round, nothing will change...") return - - + + log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolutionECCheck", node_id, key=f"PostExecResolution-{node_id}", invalidate=False) - # Remove from workset and add it again later if necessary - self.workset.remove(node_id) log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolutionFrontendWait", node_id) - + ## Here we check if the most recent env has been received. If not, we cannot resolve anything just yet. if self.get_new_env_file_for_node(node_id) is None: logging.debug(f"Node {node_id} has not received its latest env from runtime yet. Waiting...") self.waiting_for_frontend.add(node_id) - + # We will however attempt to resolve dependencies early self.resolve_dependencies_early(node_id) restarted_cmds = self.attempt_rerun_pending_nodes() - self.log_partial_program_order_info() + # self.log_partial_program_order_info() ## Here we continue with the normal execution flow else: logging.debug(f"Node {node_id} has already received its latest env from runtime. Examining differences...") @@ -1444,11 +1482,11 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand #TODO: Remove ths in the future - we need a more robust approach to check for env diffs. def exclude_insignificant_diffs(self, env_diff_dict): return {k: v for k, v in env_diff_dict.items() if k not in config.INSIGNIFICANT_VARS} - + #TODO: Remove ths in the future - we need a more robust approach to check for env diffs. def include_only_significant_vars(self, env_diff_dict): return {k: v for k, v in env_diff_dict.items() if k in config.SIGNIFICANT_VARS} - + def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_in_both): # Exclude insignificant differences only_in_new_sig = self.include_only_significant_vars(only_in_new) @@ -1464,7 +1502,7 @@ def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_i else: logging.debug("No significant differences found:") return False - + def update_env_and_restart_nodes(self, node_id: NodeId): logging.debug(f"Significant differences found between new and latest env files for {node_id}.") logging.debug(f"Assigning node {node_id} new env (Wait) as the new latest env and re-executing.") @@ -1483,13 +1521,13 @@ def update_env_and_restart_nodes(self, node_id: NodeId): self.prechecked_env.discard(waiting_for_frontend_node) assert(self.get_new_env_file_for_node(node_id) is not None) assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None) - self.log_partial_program_order_info() + # self.log_partial_program_order_info() logging.debug("-") self.waiting_for_frontend = new_waiting_for_frontend self.populate_to_be_resolved_dict() def resolve_most_recent_envs_check_only_wait_node_early(self, node_id: NodeId, restarted_cmds=None): - if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), + if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), self.get_latest_env_file_for_node(node_id)): self.update_env_and_restart_nodes(node_id) else: @@ -1498,7 +1536,7 @@ def resolve_most_recent_envs_check_only_wait_node_early(self, node_id: NodeId, r def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(self, node_id: NodeId, restarted_cmds=None): logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.") self.waiting_for_frontend.discard(node_id) - if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), + if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), self.get_latest_env_file_for_node(node_id)): self.update_env_and_restart_nodes(node_id) else: @@ -1509,7 +1547,7 @@ def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node self.resolve_dependencies_early(node_id) restarted_cmds = self.attempt_rerun_pending_nodes() logging.debug(f"Restarted after successful env resolution {restarted_cmds}") - self.log_partial_program_order_info() + # self.log_partial_program_order_info() self.resolve_commands_that_can_be_resolved_and_push_frontier() assert(self.valid()) @@ -1528,10 +1566,10 @@ def new_and_latest_env_files_have_significant_differences(self, new_env_file, la return False logging.debug(f"Comparing new and latest env files: {new_env_file} {latest_env_file}") assert(latest_env_file is not None) - + new_env = executor.read_env_file(new_env_file) latest_env = executor.read_env_file(latest_env_file) - + only_in_new, only_in_latest, different_in_both = util.compare_env_strings(new_env, latest_env) return self.significant_diff_in_env_dicts(only_in_new, only_in_latest, different_in_both) @@ -1557,7 +1595,7 @@ def commit_cmd_workspaces(self, to_commit_ids): def log_rw_sets(self): logging.debug("====== RW Sets " + "=" * 65) for node_id, rw_set in self.rw_sets.items(): - logging.debug(f"ID:{node_id} | R.size:{len(rw_set.get_read_set()) if rw_set is not None else None} | W:{rw_set.get_write_set() if rw_set is not None else None}") + logging.debug(f"ID:{node_id} | R:{[f for f in rw_set.get_read_set() if 'output_' in f] if rw_set else None} | W:{rw_set.get_write_set() if rw_set is not None else None}") def log_partial_program_order_info(self): logging.debug(f"=" * 80) @@ -1660,7 +1698,7 @@ def parse_loop_contexts(lines): def parse_partial_program_order_from_file(file_path: str) -> PartialProgramOrder: with open(file_path) as f: raw_lines = f.readlines() - + ## Filter comments and remove new lines lines = [line.rstrip() for line in raw_lines if not line.startswith("#")] @@ -1692,15 +1730,15 @@ def parse_partial_program_order_from_file(file_path: str) -> PartialProgramOrder file_path = f'{cmds_directory}/{i}' cmd, asts = parse_cmd_from_file(file_path) loop_ctx = loop_contexts[i] - nodes[NodeId(i)] = Node(NodeId(i), cmd, - asts=asts, + nodes[NodeId(i)] = Node(NodeId(i), cmd, + asts=asts, loop_context=LoopStack(loop_ctx)) edges = {NodeId(i) : [] for i in range(number_of_nodes)} for edge_line in edge_lines: from_id, to_id = parse_edge_line(edge_line) edges[NodeId(from_id)].append(NodeId(to_id)) - + logging.trace(f"Nodes|{','.join([str(node) for node in nodes])}") logging.trace(f"Edges|{edges}") return PartialProgramOrder(nodes, edges, initial_env_file) diff --git a/parallel-orch/run_command.sh b/parallel-orch/run_command.sh index e2c46e92..2d3597c1 100755 --- a/parallel-orch/run_command.sh +++ b/parallel-orch/run_command.sh @@ -5,10 +5,11 @@ export CMD_STRING=${1?No command was given to execute} export TRACE_FILE=${2?No trace file path given} export STDOUT_FILE=${3?No stdout file given} export LATEST_ENV_FILE=${4?No env file to run with given} -export EXEC_MODE=${5?No execution mode given} -export CMD_ID=${6?No command id given} -export POST_EXEC_ENV=${7?No Riker env file given} - +export SANDBOX_DIR=${5?No sandbox dir given} +export TMPDIR=${6?No tmp dir given} +export EXEC_MODE=${7?No execution mode given} +export CMD_ID=${8?No command id given} +export POST_EXEC_ENV=${9?No Riker env file given} ## KK 2023-04-24: Not sure this should be run every time we run a command ## GL 2023-07-08: Tests seem to pass without it @@ -23,10 +24,10 @@ else exit 1 fi -mkdir -p /tmp/pash_spec/a -mkdir -p /tmp/pash_spec/b -export SANDBOX_DIR="$(mktemp -d /tmp/pash_spec/a/sandbox_XXXXXXX)/" -export TEMPDIR="$(mktemp -d /tmp/pash_spec/b/sandbox_XXXXXXX)" +# mkdir -p /tmp/pash_spec/a +# mkdir -p /tmp/pash_spec/b +# export SANDBOX_DIR="$(mktemp -d /tmp/pash_spec/a/sandbox_XXXXXXX)/" +# export TEMPDIR="$(mktemp -d /tmp/pash_spec/b/sandbox_XXXXXXX)" # echo tempdir $TEMPDIR # echo sandbox $SANDBOX_DIR diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 29ebddbf..436eb642 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -34,7 +34,7 @@ def parse_args(): action="store_true", default=False, help="Speculate immediately instead of waiting for the first Wait message.") - + args, unknown_args = parser.parse_known_args() return args @@ -135,7 +135,7 @@ def handle_wait(self, input_cmd: str, connection): self.waiting_for_response[node_id] = connection - def __parse_command_exec_complete(self, input_cmd: str) -> "tuple[int, int]": + def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]": try: components = input_cmd.rstrip().split("|") command_id = parse_node_id(components[0].split(":")[1]) @@ -179,10 +179,15 @@ def respond_to_frontend_core(self, node_id: NodeId, response: str): socket_respond(connection, response) connection.close() + def handle_command_exec_start(self, input_cmd): + assert(input_cmd.startswith("CommandExecStart:")) + cmd_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) + self.partial_program_order.set_sandbox(cmd_id, sandbox_dir) + def handle_command_exec_complete(self, input_cmd: str): assert(input_cmd.startswith("CommandExecComplete:")) ## Read the node id from the command argument - cmd_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_complete(input_cmd) + cmd_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) if trace_file in self.partial_program_order.banned_files: logging.debug(f'CommandExecComplete: {cmd_id} ignored') return @@ -230,6 +235,9 @@ def process_next_cmd(self): self.partial_program_order.log_executions() self.done = True log_time_delta_from_named_timestamp("Scheduler", "Done") + elif input_cmd.startswith("CommandExecStart:"): + #TODO: add logging stuff + self.handle_command_exec_start(input_cmd) else: logging.error(error_response(f'Error: Unsupported command: {input_cmd}')) raise Exception(f'Error: Unsupported command: {input_cmd}') @@ -267,12 +275,16 @@ def run(self): while not self.done: + # TODO: wrap this around something probably + self.partial_program_order.early_stop_using_dep() + ## Schedule some work (if we are already at capacity this will return immediately) self.schedule_work() ## Process a single request self.process_next_cmd() # If workset is empty we should end. # TODO: ec checks fail for now + self.socket.close() self.shutdown() @@ -284,7 +296,7 @@ def shutdown(self): def terminate_pending_commands(self): for _node_id, cmd_info in self.partial_program_order.commands_currently_executing.items(): - proc, _trace_file, _stdout, _stderr, _variable_file = cmd_info + proc, _trace_file, _stdout, _stderr, _variable_file, _ = cmd_info proc.terminate() diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py index 8594e4c6..1331cbb2 100644 --- a/parallel-orch/trace_v2.py +++ b/parallel-orch/trace_v2.py @@ -7,6 +7,21 @@ # Global TODOs: # handle pwd, such that open and stat can work +# not handled: listxattr, llistxattr, getxattr, lgetxattr, pivot_root, mount, umount2 +# setxattr lsetxattr removexattr lremovexattr, fanotify_mark, renameat2, chroot, quotactl +# handled individually openat, open, chdir, clone, rename +# TODO: link, symlink, renameat, symlinkat +r_first_path_set = set(['execve', 'stat', 'lstat', 'access', 'statfs', + 'readlink', 'execve']) +w_first_path_set = set(['mkdir', 'rmdir', 'truncate', 'creat', 'chmod', 'chown', + 'lchown', 'utime', 'mknod', 'utimes', 'acct', 'unlink']) +r_fd_path_set = set(['fstatat', 'newfstatat', 'statx', 'name_to_handle_at', + 'readlinkat', 'faccessat', 'execveat']) +w_fd_path_set = set(['unlinkat', 'utimensat', 'mkdirat', 'mknodat', 'fchownat', 'futimeat', + 'unlinkat', 'linkat', 'fchmodat', 'utimensat']) +ignore_set = set(['getpid', 'getcwd']) + + @dataclass class ExitStatus: exitcode: int @@ -66,13 +81,6 @@ def pop_complete_line(self, pid: int, l): del self.line_dict[pid] return total_line -# openat, open, chdir, clone -r_first_path_set = set(['execve', 'stat', 'lstat', 'access', 'statfs']) -w_first_path_set = set(['mkdir']) -r_fd_path_set = set(['fstatat', 'newfstatat']) -w_fd_path_set = set(['unlinkat', 'utimensat']) -ignore_set = set(['getpid', 'getcwd']) - def parse_string(s): s = s.strip() # handling cases such as utimensat @@ -93,12 +101,15 @@ def is_ret_err(ret: str): ret = ret.strip() return ret[0] == '-' -def get_path_first_path(pid, args, ctx): - a = args.split(sep=',', maxsplit=1)[0] - if is_absolute(a): - return parse_string(a) +def convert_absolute(cur_dir, path): + if is_absolute(path): + return path else: - return os.path.join(ctx.get_dir(pid), parse_string(a)) + return os.path.join(cur_dir, path) + +def get_path_first_path(pid, args, ctx): + a = parse_string(args.split(sep=',', maxsplit=1)[0]) + return convert_absolute(ctx.get_dir(pid), a) def parse_r_first_path(pid, args, ret, ctx): return RFile(get_path_first_path(pid, args, ctx)) @@ -110,6 +121,27 @@ def parse_w_first_path(pid, args, ret, ctx): else: return WFile(path) +def get_path_at(pid, positions, args, ctx): + args = args.split(sep=',') + if isinstance(positions, list): + rets = [] + for x in args: + rets.append(convert_absolute(ctx.get_dir(pid), parse_string(x))) + return rets + else: + return convert_absolute(ctx.get_dir(pid), parse_string(x)) + +def parse_rename(pid, args, ret, ctx): + path_a, path_b = get_path_at(pid, [0, 1], args, ctx) + return WFile(path_a), WFile(path_b) + +def parse_link(pid, args, ret, ctx): + path_a, path_b = get_path_at(pid, [0, 1], args, ctx) + return RFile(path_a), WFile(path_b) + +def parse_renameat(pid, args, ret, ctx): + + def parse_chdir(pid, args, ret, ctx): new_path = get_path_first_path(pid, args, ctx) if not is_ret_err(ret): @@ -201,6 +233,8 @@ def parse_syscall(pid, syscall, args, ret, ctx): return parse_r_fd_path(args, ret) elif syscall in w_fd_path_set: return parse_w_fd_path(args, ret) + elif syscall == 'rename': + return parse_rename(pid, args, ret, ctx) elif syscall == 'clone': return parse_clone(pid, args, ret, ctx) elif syscall in ignore_set: @@ -263,9 +297,9 @@ def parse_and_gather_cmd_rw_sets(trace_object) -> Tuple[set, set]: write_set = set() for l in trace_object: record = parse_line(l, ctx) - if type(record) is RFile: + if type(record) is RFile and record.fname != '/dev/tty': read_set.add(record.fname) - elif type(record) is WFile: + elif type(record) is WFile and record.fname != '/dev/tty': write_set.add(record.fname) return read_set, write_set diff --git a/parallel-orch/util.py b/parallel-orch/util.py index 1ed96118..799cd379 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -15,6 +15,13 @@ def ptempfile(): os.close(fd) return name +def create_sandbox(): + os.makedirs("/tmp/pash_spec/a", exist_ok=True) + os.makedirs("/tmp/pash_spec/b", exist_ok=True) + sdir = tempfile.mkdtemp(dir="/tmp/pash_spec/a", prefix="sandbox_") + tdir = tempfile.mkdtemp(dir="/tmp/pash_spec/b", prefix="sandbox_") + return sdir, tdir + def init_unix_socket(socket_file: str) -> socket.socket: server_address = socket_file diff --git a/test/misc/cat_and_sleep.sh b/test/misc/cat_and_sleep.sh new file mode 100755 index 00000000..77dbcbc5 --- /dev/null +++ b/test/misc/cat_and_sleep.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +cat $2 >> $3 +sleep $1 diff --git a/test/test_orch.sh b/test/test_orch.sh index 5cb27d4a..572b4057 100755 --- a/test/test_orch.sh +++ b/test/test_orch.sh @@ -370,6 +370,17 @@ test_command_var_assignments_2(){ $shell $2/test_command_var_assignments_2.sh } +test_early_stop1() +{ + local shell=$1 + $shell $2/test_early_stop1.sh +} + +test_early_stop2() +{ + local shell=$1 + $shell $2/test_early_stop2.sh +} ## TODO: make more loop tests with nested loops and commands after the loop diff --git a/test/test_scripts/test_early_stop1.sh b/test/test_scripts/test_early_stop1.sh new file mode 100644 index 00000000..cfec04d0 --- /dev/null +++ b/test/test_scripts/test_early_stop1.sh @@ -0,0 +1,7 @@ +$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out1" +$MISC_SCRIPT_DIR/sleep_and_echo.sh 2 "output text" "$test_output_dir/out2" +$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out2" "$test_output_dir/out3" +$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out2" "$test_output_dir/out4" +$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out2" "$test_output_dir/out5" + + diff --git a/test/test_scripts/test_early_stop2.sh b/test/test_scripts/test_early_stop2.sh new file mode 100644 index 00000000..bf63e408 --- /dev/null +++ b/test/test_scripts/test_early_stop2.sh @@ -0,0 +1,9 @@ +$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out1" +$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out0" +$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out0" +$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out0" +$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out1" "$test_output_dir/out2" +$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out1" "$test_output_dir/out3" + + + From 258e818847ba6a58278d2802cf838e338c254f14 Mon Sep 17 00:00:00 2001 From: Guest Date: Wed, 13 Dec 2023 15:22:40 -0500 Subject: [PATCH 09/39] fix wrong commit --- parallel-orch/trace_v2.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py index 1331cbb2..dab3309c 100644 --- a/parallel-orch/trace_v2.py +++ b/parallel-orch/trace_v2.py @@ -139,8 +139,6 @@ def parse_link(pid, args, ret, ctx): path_a, path_b = get_path_at(pid, [0, 1], args, ctx) return RFile(path_a), WFile(path_b) -def parse_renameat(pid, args, ret, ctx): - def parse_chdir(pid, args, ret, ctx): new_path = get_path_first_path(pid, args, ctx) From 413df3e1572371b4b54d940d39f3a17e795fda8f Mon Sep 17 00:00:00 2001 From: Guest Date: Wed, 13 Dec 2023 16:08:16 -0500 Subject: [PATCH 10/39] fix parsing when partial line exists in the trace --- parallel-orch/trace_v2.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py index dab3309c..003e880c 100644 --- a/parallel-orch/trace_v2.py +++ b/parallel-orch/trace_v2.py @@ -1,4 +1,5 @@ import re +import logging import os.path import sys from typing import Tuple @@ -270,7 +271,8 @@ def parse_line(l, ctx): lparen = l.find('(') equals = l.rfind('=') rparen = l[:equals].rfind(')') - assert lparen >= 0 and equals >= 0 and rparen >= 0 + if not (lparen >= 0 and equals >= 0 and rparen >= 0): + return None syscall = l[:lparen] ret = l[equals+1:] args = l[lparen+1:rparen] @@ -294,7 +296,11 @@ def parse_and_gather_cmd_rw_sets(trace_object) -> Tuple[set, set]: read_set = set() write_set = set() for l in trace_object: - record = parse_line(l, ctx) + try: + record = parse_line(l, ctx) + except Exception: + logging.debug(l) + raise ValueError("error while parsing trace") if type(record) is RFile and record.fname != '/dev/tty': read_set.add(record.fname) elif type(record) is WFile and record.fname != '/dev/tty': From 23fe3f59997d428e68e6371e7b430727010b4ed2 Mon Sep 17 00:00:00 2001 From: Guest Date: Wed, 13 Dec 2023 17:02:26 -0500 Subject: [PATCH 11/39] fix readline behavior --- parallel-orch/executor.py | 2 +- parallel-orch/trace_v2.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/parallel-orch/executor.py b/parallel-orch/executor.py index 526b2dd4..2d58deaf 100644 --- a/parallel-orch/executor.py +++ b/parallel-orch/executor.py @@ -57,7 +57,7 @@ def read_trace(sandbox_dir, trace_file): path = f"{sandbox_dir}/upperdir/{trace_file}" logging.debug(f'Reading trace from: {path}') with open(path) as f: - return f.readlines() + return f.read().split('\n')[:-1] def read_env_file(env_file, sandbox_dir=None): if sandbox_dir is None: diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py index 003e880c..10c52c47 100644 --- a/parallel-orch/trace_v2.py +++ b/parallel-orch/trace_v2.py @@ -257,6 +257,8 @@ def handle_info(l): return False, None def parse_line(l, ctx): + if len(l) == 0: + return None pid, l = strip_pid(l) is_info, info = handle_info(l) if is_info: @@ -279,7 +281,7 @@ def parse_line(l, ctx): return parse_syscall(pid, syscall, args, ret, ctx) def parse_exit_code(trace_object) -> int: - if len(trace_object) < 1: + if len(trace_object) == 0 or trace_object[0] == '': return None l = trace_object[0] first_pid, _ = strip_pid(l) From 88aedee43401fd1d7496c63f418f2ed8a6e909a3 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 9 Jan 2024 10:32:19 -0700 Subject: [PATCH 12/39] Start the refatoring. Implement basic structures and operations --- parallel-orch/node.py | 197 +++ parallel-orch/partial_program_order.py | 1782 +----------------------- parallel-orch/scheduler_server.py | 253 +--- parallel-orch/util.py | 82 ++ 4 files changed, 403 insertions(+), 1911 deletions(-) create mode 100644 parallel-orch/node.py diff --git a/parallel-orch/node.py b/parallel-orch/node.py new file mode 100644 index 00000000..616e1b58 --- /dev/null +++ b/parallel-orch/node.py @@ -0,0 +1,197 @@ +from enum import Enum, auto + +class NodeState(Enum): + INIT = auto() + READY = auto() + COMMITTED = auto() + STOP = auto() + SPECULATED = auto() + EXECUTING = auto() + SPEC_EXECUTING = auto() + UNSAFE = auto() + +class Sandbox: + def __init__(self, trace_file, exit_code, post_execution_env_file, stdout_file, sandbox_dir): + # These get predetermined prior to the execution + self.trace_file = trace_file + self.post_execution_env_file = post_execution_env_file + self.stdout_file = stdout_file + self.sandbox_dir = sandbox_dir + # These get set after execution is done + self.exit_code = None + self.proc_id = None + + def set_exit_code(self, exit_code): + self.exit_code = exit_code + + def set_proc_id(self, proc_id): + self.proc_id = proc_id + + def get_exit_code(self): + return self.exit_code + + def get_post_execution_env_file(self): + return self.post_execution_env_file + + def get_stdout_file(self): + return self.stdout_file + + def get_sandbox_dir(self): + return self.sandbox_dir + + def get_trace_file(self): + return self.trace_file + + def __str__(self): + return f'Sandbox(trace:{self.get_trace_file}, ec:{self.get_exit_code()}, env:{self.get_post_execution_env_file()}, stdout:{self.get_stdout_file()}, sandbox:{self.get_sandbox_dir()})' + +class RWSet: + + def __init__(self, read_set: set, write_set: set): + self.read_set = read_set + self.write_set = write_set + + def add_to_read_set(self, item: str): + self.read_set.add(item) + + def add_to_write_set(self, item: str): + self.write_set.add(item) + + def get_read_set(self) -> set: + return self.read_set + + def get_write_set(self) -> set: + return self.write_set + + def __str__(self): + return f"RW(R:{self.get_read_set()}, W:{self.get_write_set()})" + + +class NodeId: + + #TODO: Implement iteration support + + def __init__(self, id: int): + self.id = id + + def get_non_iter_id(self): + return NodeId(self.id) + + def __repr__(self): + ## TODO: Represent it using n. + output = f'{self.id}' + return output + + def __hash__(self): + return hash(str(self)) + + def __eq__(self, other): + # return self.loop_iters == other.loop_iters and self.id == other.id + return self.id == other.id + + def __ne__(self, other): + return not(self == other) + + def __lt__(self, obj): + return (str(self) < str(obj)) + + def __gt__(self, obj): + return (str(self) > str(obj)) + + @staticmethod + def parse_node_id(node_id_str: str): + return NodeId(int(node_id_str)) + + +class Node: + id: NodeId + cmd: str + asts: "list[AstNode]" + state: NodeState + # Nodes to check for fs dependencies before this node can be committed + # for this particular execution of the main sandbox. + # No need to do the same for the background sandbox since it will always get committed. + to_be_resolved_snapshot: "set[NodeId]" + # Read and write sets for this node + rwset: RWSet + # This contains the sandbox and execution info for a spec-executing node + # (or plain executing node if frontier background node execution is not enabled) + main_sandbox: Sandbox + # This can only be set while in the frontier and the background node execution is enabled + background_sandbox: Sandbox + + + def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"): + self.id = node_id + self.cmd = cmd + self.asts = asts + # The node's state + self.state = NodeState.INIT + self.tracefile = None + self.rwset = None + # The + self.to_be_resolved_snapshot = None + + self.main_sandbox = None + + self.background_sandbox = None + + + def is_initialized(self): + return self.state == NodeState.INIT + + def is_ready(self): + return self.state == NodeState.READY + + def is_committed(self): + return self.state == NodeState.COMMITTED + + def is_stopped(self): + return self.state == NodeState.STOP + + def is_speculated(self): + return self.state == NodeState.SPECULATED + + def is_executing(self): + return self.state == NodeState.EXECUTING + + def is_spec_executing(self): + return self.state == NodeState.SPEC_EXECUTING + + def is_unsafe(self): + return self.state == NodeState.UNSAFE + + def get_main_sandbox(self): + return self.main_sandbox + + + ## ## + ## Transition Functions ## + ## ## + + def transition_to_ready(self): + assert self.state == NodeState.INIT + self.state = NodeState.READY + # Initialize data structures here + + def transition_to_executing(self): + assert self.state == NodeState.READY + self.state = NodeState.EXECUTING + # TODO + + def transition_to_spec_executing(self): + assert self.state == NodeState.READY + self.state = NodeState.SPEC_EXECUTING + # TODO + + def transition_to_committed(self): + assert self.state in [NodeState.EXECUTING, NodeState.SPECULATED] + self.state = NodeState.COMMITTED + # TODO + + # TODO: other transition functions + + + # Do we need this here of should we handle everything on scheduler server and ppo? + def handle_event(self, event_msg): + pass # TODO diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 2242d9a4..e4728d36 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -1,1744 +1,70 @@ -import copy +from node import NodeId, Node import logging -import os -import sys - -import analysis -import config -import executor -import trace -import trace_v2 -from util import * -import util -from collections import defaultdict - -from shasta.ast_node import AstNode, CommandNode, PipeNode - - -class CompletedNodeInfo: - def __init__(self, exit_code, post_execution_env_file, stdout_file, sandbox_dir): - self.exit_code = exit_code - self.post_execution_env_file = post_execution_env_file - self.stdout_file = stdout_file - self.sandbox_dir = sandbox_dir - - def get_exit_code(self): - return self.exit_code - - def get_post_execution_env_file(self): - return self.post_execution_env_file - - def get_stdout_file(self): - return self.stdout_file - - def get_sandbox_dir(self): - return self.sandbox_dir - - def __str__(self): - return f'CompletedNodeInfo(ec:{self.get_exit_code()}, env:{self.get_post_execution_env_file()}, stdout:{self.get_stdout_file()}, sandbox:{self.get_sandbox_dir()})' - -## This class is used for both loop contexts and loop iters -## The indices go from inner to outer -class LoopStack: - def __init__(self, loop_contexts_or_iters=None): - if loop_contexts_or_iters is None: - self.loops = [] - else: - self.loops = loop_contexts_or_iters - - def is_empty(self): - return len(self.loops) == 0 - - def __len__(self): - return len(self.loops) - - ## Generates a new loop stack with the same length but 0s as values - def new_zeroed_loop_stack(self): - return [0 for i in self.loops] - - def get_outer(self): - return self.loops[-1] - - def pop_outer(self): - return self.loops.pop() - - def add_inner(self, loop_iter_id: int): - self.loops.insert(0, loop_iter_id) - - def outer_to_inner(self): - return self.loops[::-1] - - def index(self, loop_iter_id: int) -> int: - return self.loops.index(loop_iter_id) - - def get(self, index: int): - return self.loops[index] - - def __repr__(self): - ## TODO: Represent it using 'it', 'it0', 'it1', etc - ## or -(iters)- in front of it. - output = "-".join([str(it) for it in self.loops]) - return output - - def __eq__(self, other): - if not len(self.loops) == len(other.loops): - return False - for i in range(len(self.loops)): - if not self.loops[i] == other.loops[i]: - return False - return True - - -class NodeId: - def __init__(self, id: int, loop_iters=None): - self.id = id - - if loop_iters is None: - self.loop_iters = LoopStack() - else: - assert(isinstance(loop_iters, LoopStack)) - self.loop_iters = loop_iters - - def has_iters(self): - return not self.loop_iters.is_empty() - - def get_iters(self): - return copy.deepcopy(self.loop_iters) - - def get_non_iter_id(self): - return NodeId(self.id) - - ## Returns a new NodeId - def generate_new_node_id_with_another_iter(self, new_iter: int): - ## This node already contains iterations for the outer loops potentially - ## so we just need to add another inner iteration - new_iters = copy.deepcopy(self.loop_iters) - new_iters.add_inner(new_iter) - - new_node_id = NodeId(self.id, new_iters) - return new_node_id - - def __repr__(self): - ## TODO: Represent it using n. - output = f'{self.id}' - if not self.loop_iters.is_empty(): - output += f'+{self.loop_iters}' - return output - - def __hash__(self): - return hash(str(self)) - - def __eq__(self, other): - return self.loop_iters == other.loop_iters and self.id == other.id - - def __ne__(self, other): - # Not strictly necessary, but to avoid having both x==y and x!=y - # True at the same time - return not(self == other) - - ## TODO: Define this correctly if it is to be used for something other than dictionary indexing - def __lt__(self, obj): - return (str(self) < str(obj)) - - def __gt__(self, obj): - return (str(self) > str(obj)) - - # def __le__(self, obj): - # return ((self.b) <= (obj.b)) - - # def __ge__(self, obj): - # return ((self.b) >= (obj.b)) - -def parse_node_id(node_id_str: str) -> NodeId: - if "+" in node_id_str: - node_id_int, iters_str = node_id_str.split("+") - iters = [int(it) for it in iters_str.split("-")] - return NodeId(int(node_id_int), LoopStack(iters)) - else: - return NodeId(int(node_id_str), LoopStack()) - -class Node: - id: NodeId - cmd: str - asts: "list[AstNode]" - loop_context: LoopStack - - def __init__(self, id, cmd, asts, loop_context: LoopStack): - self.id = id - self.cmd = cmd - self.asts = asts - ## There can only be a single AST per node, and this - ## must be a command. - assert(len(asts) == 1) - # Check that the node contains only CommandNode(s) - analysis.validate_node(asts[0]) - self.cmd_no_redir = trace.remove_command_redir(self.cmd) - self.loop_context = loop_context - ## Keep track of how many iterations of this loop node we have unrolled - if not loop_context.is_empty(): - self.current_iters = loop_context.new_zeroed_loop_stack() - - def __str__(self): - # return f"ID: {self.id}\nCMD: {self.cmd}\nR: {self.read_set}\nW: {self.write_set}" - return self.cmd - - def __repr__(self): - # return f"ID: {self.id}\nCMD: {self.cmd}\nR: {self.read_set}\nW: {self.write_set}" - return f'N({self.cmd})' - - def get_cmd(self) -> str: - return self.cmd - - def get_cmd_no_redir(self) -> str: - return self.cmd_no_redir - - def get_loop_context(self) -> LoopStack: - return self.loop_context - - def in_loop(self) -> bool: - return not self.loop_context.is_empty() - - ## KK 2023-05-17 Does this generate the correct iteration even in nested loops? - def get_next_iter(self, loop_id: int) -> int: - assert(self.in_loop()) - assert(self.loop_context.get_outer() == loop_id) - loop_id_index_in_loop_context_stack = self.loop_context.index(loop_id) - self.current_iters[loop_id_index_in_loop_context_stack] += 1 - return self.current_iters[loop_id_index_in_loop_context_stack] - - ## Note: This information is valid only after a node is committed. - ## It might be set even before that, but it should only be retrieved when - ## a node is committed. - def set_completed_info(self, completed_node_info: CompletedNodeInfo): - self.completed_node_info = completed_node_info - - def get_completed_node_info(self) -> CompletedNodeInfo: - return self.completed_node_info - -class RWSet: - - def __init__(self, read_set: set, write_set: set): - self.read_set = read_set - self.write_set = write_set - - def add_to_read_set(self, item: str): - self.read_set.add(item) - - def add_to_write_set(self, item: str): - self.write_set.add(item) - - def get_read_set(self) -> set: - return self.read_set - - def get_write_set(self) -> set: - return self.write_set - - def __str__(self): - return f"RW(R:{self.get_read_set()}, W:{self.get_write_set()})" class PartialProgramOrder: - - def __init__(self, nodes, edges, initial_env_file): + frontier: set # Set of nodes at the frontier + run_after: set # Nodes that should run after certain conditions + window: int # Integer representing the window + to_be_resolved: "dict[NodeId, list[Node]]" # Mapping of nodes to lists of uncommitted nodes + nodes: "dict[NodeId, Node]" + adjacency: "dict[NodeId, list[NodeId]]" + inverse_adjacency: "dict[NodeId, list[NodeId]]" + + def __init__(self, nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId]]"): self.nodes = nodes - # TODO: consider changing values to sets instead of lists self.adjacency = edges - self.init_inverse_adjacency() - ## TODO: KK: Is it OK if we modify adjacency lists on the fly while processing the partial-order? - ## TODO: Remember to modify inverse_adjacency - ## self.committed is an add-only set, we never remove - ## TODO: For loop modify committed, workset, frontier, stopped - ## TODO: Add assertions that committed etc do not contain loop nodes - self.committed = set() - ## Nodes that are in the frontier can only move to committed - self.frontier = [] - self.rw_sets = {node_id: None for node_id in self.nodes.keys()} - self.workset = [] - ## A dictionary from cmd_ids that are currently executing that contains their trace_files - self.commands_currently_executing = {} - ## A dictionary that contains information about completed nodes - ## from cmd_id -> CompletedNodeInfo - ## Note: this dictionary does not contain information - ## TODO: Delete this - self.completed_node_info = {} - ## KK 2023-05-09 @Giorgo What is the difference of the following two? - self.to_be_resolved = {} - self.speculated = set() - ## Contains the most recent sandbox directory paths - self.sandbox_dirs = {} - ## Commands that were killed by riker - ## we should keep those in the workset but not execute them - ## until they reach the frontier - self.stopped = set() - ## Commands deemed unsafe from our analysis, that have to be executed - ## in the original shell (e.g., shell primitives) - ## Invariant: self.unsafe \subseteq self.stopped - self.unsafe = set() - self.committed_order = [] - self.commit_state = {} - ## Counts the times a node was (re)executed - self.executions = {node_id: 0 for node_id in self.nodes.keys()} - self.banned_files = set() - self.new_envs = {} - self.latest_envs = {} - self.initial_env_file = initial_env_file - self.waiting_for_frontend = set() - ## In case we spot a dependency meaning a node must execute after another node, it will appear here - ## Contains the nodes to execute only after the key node finishes execution - self.run_after = defaultdict(set) - self.pending_to_execute = set() - self.to_be_resolved_prev = {} - self.prechecked_env = set() - - def __str__(self): - return f"NODES: {len(self.nodes.keys())} | ADJACENCY: {self.adjacency}" - - def get_source_nodes(self) -> list: - sources = set() - for to_id, from_ids in self.inverse_adjacency.items(): - if len(from_ids) == 0: - sources.add(to_id) - return list(sources) - - def get_standard_source_nodes(self) -> list: - source_nodes = self.get_source_nodes() - return self.filter_standard_nodes(source_nodes) - - ## This returns the minimum w.r.t. to the PO of a bunch of node_ids. - ## In a real partial order, this could be many, - def get_min(self, node_ids: "list[NodeId]") -> "list[NodeId]": - potential_minima = set(copy.deepcopy(node_ids)) - for node_id in node_ids: - tc = self.get_transitive_closure([node_id]) - ## Remove the node itself from its transitive closure - tc.remove(node_id) - ## If a node is found in the tc of another node, then - ## it is not a minimum - for nid in tc: - potential_minima.discard(nid) - ## KK 2023-05-22 This will be removed at some point but I keep it here - ## for now for easier bug finding. - # logging.debug(f"Potential minima: {potential_minima}") - assert(len(potential_minima) == 1) - return list(potential_minima) - - ## This returns all previous nodes of a sub partial order - def get_sub_po_source_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": - # assert(self.is_closed_sub_partial_order(node_ids)) - source_nodes = list() - node_set = set(node_ids) - for node_id in node_ids: - prev_ids_set = set(self.get_prev(node_id)) - ## KK 2023-05-04 is it ever the case that some (but not all) prev nodes might be outside. I don't think so - if len(prev_ids_set) == 0 or \ - not prev_ids_set.issubset(node_set): - source_nodes.append(node_id) - - ## KK 2024-05-03: I don't see how we can get multiple sources with the current structure - assert(len(source_nodes) == 1) - return source_nodes - - def get_sub_po_sink_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": - # assert(self.is_closed_sub_partial_order(node_ids)) - sink_nodes = list() - node_set = set(node_ids) - for node_id in node_ids: - next_ids_set = set(self.get_next(node_id)) - ## KK 2023-05-04 is it ever the case that some (but not all) prev nodes might be outside. I don't think so - if len(next_ids_set) == 0 or \ - not next_ids_set.issubset(node_set): - sink_nodes.append(node_id) - - ## KK 2024-05-03: I don't see how we can get multiple sink with the current structure - assert(len(sink_nodes) == 1) - return sink_nodes - - def set_new_env_file_for_node(self, node_id: NodeId, new_env_file: str): - self.new_envs[node_id] = new_env_file - - def get_new_env_file_for_node(self, node_id: NodeId) -> str: - return self.new_envs.get(node_id) - - def set_latest_env_file_for_node(self, node_id: NodeId, latest_env_file: str): - self.latest_envs[node_id] = latest_env_file - - def get_latest_env_file_for_node(self, node_id: NodeId) -> str: - return self.latest_envs.get(node_id) - - def get_most_recent_possible_new_env_for_node(self, node_id) -> str: - most_recent_env_node = node_id - while self.get_new_env_file_for_node(most_recent_env_node) is None: - predecessor = self.get_prev(most_recent_env_node) - - ## This will trigger when we move to full Partial Orders - assert len(predecessor) <= 1 - - ## If there are no predecessors for a node it means we are at the source - ## so there is no point to search further back - if len(predecessor) == 0: - break - else: - most_recent_env_node = predecessor[0] - - return self.get_new_env_file_for_node(most_recent_env_node) - - ## This returns all previous nodes of a sub partial order - def get_sub_po_prev_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": - # assert(self.is_closed_sub_partial_order(node_ids)) - prev_nodes = set() - node_set = set(node_ids) - for node_id in node_ids: - prev_ids_set = set(self.get_prev(node_id)) - prev_nodes = prev_nodes.union(prev_ids_set - node_set) - - ## KK 2024-05-03: I don't see how we can get multiple sources with the current structure - assert(len(prev_nodes) <= 1) - return list(prev_nodes) - - ## TODO: Implement this correctly. I have thought of a naive algorithm that - ## does a BFS forward and backward for each node and if we first see a - ## node outside of the set and then one inside it means that the subset is not closed. - def is_closed_sub_partial_order(self, node_ids: "list[NodeId]") -> bool: - # node_set = set(node_ids) - # visited_set = set() - # for node_id in node_ids: - # prev_ids_set = set(self.get_prev(node_id)) - # next_id_set = set(self.get_next(node_id)) - # ## If one of the previous or next nodes is not in the node set - # ## it means that the sub partial order is not closed. - # if not node_set.issuperset(prev_ids_set.union(next_id_set)): - # return False - - return True - + self.inverse_adjacency = self.init_inverse_adjacency() + self.frontier = set() + self.run_after = set() + self.window = 0 + self.to_be_resolved = {} + def init_partial_order(self): - ## Initialize the frontier with all non-loop source nodes - self.frontier = self.get_standard_source_nodes() - ## Initialize the workset - self.init_workset() - logging.debug(f'Initialized workset') - self.populate_to_be_resolved_dict() - if config.SPECULATE_IMMEDIATELY: - self.init_latest_env_files() - logging.debug(f'To be resolved sets per node:') - logging.debug(self.to_be_resolved) - logging.info(f'Initialized the partial order!') - # self.log_partial_program_order_info() - assert(self.valid()) - - - def init_latest_env_files(self, node=None): - if node is None: - env_to_assign = self.initial_env_file - else: - env_to_assign = self.get_new_env_file_for_node(node) - for node_id in self.get_all_non_committed(): - self.set_latest_env_file_for_node(node_id, env_to_assign) - - - def init_workset(self): - self.workset = self.get_all_non_committed_standard_nodes() + for node_id, node in self.nodes.items(): + node.transition_to_ready() + # TODO: Implement the rest of the partial order initialization - ## Check if the partial order is done - def is_completed(self) -> bool: - return len(self.get_all_non_committed_standard_nodes()) == 0 - - def get_workset(self) -> list: - return self.workset - - def get_unsafe(self) -> set: - return copy.deepcopy(self.unsafe) - - ## Only return the stopped that are not unsafe - def get_stopped_safe(self) -> set: - return copy.deepcopy(self.stopped.difference(self.unsafe)) - - ## When we remove a command from unsafe we always remove from stopped too - def remove_from_unsafe(self, node_id: NodeId): - self.unsafe.remove(node_id) - self.stopped.remove(node_id) - - def get_committed(self) -> set: - return copy.deepcopy(self.committed) - - def get_committed_list(self) -> list: - return sorted(list(self.committed)) - - def is_committed(self, node_id: NodeId) -> bool: - return node_id in self.committed + def commit_node(self, node): + # Logic to handle committing a node + node.transition_to_committed() + # Maybe update dependencies here + # etc. def init_inverse_adjacency(self): - self.inverse_adjacency = {i: [] for i in self.nodes.keys()} + inverse_adjacency = {i: [] for i in self.nodes.keys()} for from_id, to_ids in self.adjacency.items(): for to_id in to_ids: - self.inverse_adjacency[to_id].append(from_id) - - # ## TODO: (When there is time) Define a function that checks that the graph is valid - ## TODO: Call valid and add assertiosn for loops here. - def valid(self): - logging.debug("Checking partial order validity...") - # self.log_partial_program_order_info() - valid1 = self.loop_nodes_valid() - ## TODO: Add a check that for x, y : NodeIds, x < y iff x is a predecessor to x - ## This is necessary due to the `hypothetical_before` method. - - ## Any command in unsafe must also be in stopped - valid2 = self.unsafe.issubset(self.stopped) - - ## TODO: Fix the checks below because they do not work currently - ## TODO: Check that committed is prefix closed w.r.t partial order - return valid1 and valid2 - - ## Checks if loop nodes are all valid, i.e., that there are no loop nodes handled like normal ones, - ## e.g., in workset, frontier etc - ## - ## Note that loop nodes can be in the committed set (after we are done executing all iterations of a loop) - def loop_nodes_valid(self): - # GL 2023-07-08: This works without get_all_next_non_committed_nodes(), not sure why - forbidden_sets = self.get_all_next_non_committed_nodes() + \ - self.get_workset() + \ - list(self.stopped) + \ - list(self.commands_currently_executing.keys()) - loop_nodes_in_forbidden_sets = [node_id for node_id in forbidden_sets - if self.is_loop_node(node_id)] - return len(loop_nodes_in_forbidden_sets) == 0 - - def __len__(self): - return len(self.nodes) - - def get_node(self, node_id:NodeId) -> Node: + inverse_adjacency[to_id].append(from_id) + return inverse_adjacency + + def get_node(self, node_id: NodeId) -> Node: return self.nodes[node_id] - - def is_node_id(self, node_id:NodeId) -> bool: - return node_id in self.nodes - - def get_node_loop_context(self, node_id: NodeId) -> LoopStack: - return self.get_node(node_id).get_loop_context() - - def get_all_non_committed(self) -> "list[NodeId]": - all_node_ids = self.nodes.keys() - non_committed_node_ids = [node_id for node_id in all_node_ids - if not self.is_committed(node_id)] - return non_committed_node_ids - - ## This adds a node to the committed set and saves important information - def commit_node(self, node_id: NodeId): - logging.debug(f" > Commiting node {node_id}") - self.committed.add(node_id) - - - def is_loop_node(self, node_id:NodeId) -> bool: - return self.get_node(node_id).in_loop() - - ## Only keeps standard (non-loop) nodes - def filter_standard_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": - return [node_id for node_id in node_ids - if not self.is_loop_node(node_id)] - - def filter_loop_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]": - return [node_id for node_id in node_ids - if self.is_loop_node(node_id)] - - ## This creates a new node_id and then creates a mapping from the node and iteration id to this node id - ## TODO: Currently doesn't work with nested loops - def create_node_id_with_one_less_loop_from_loop_node(self, node_id: NodeId, loop_id: int) -> NodeId: - node = self.get_node(node_id) - logging.debug(f' >>> Node: {node}') - logging.debug(f' >>> its loops: {node.loop_context} --- {node.current_iters}') - - new_iter = node.get_next_iter(loop_id) - ## Creates a new node id where we have appended the new iter - new_node_id = node_id.generate_new_node_id_with_another_iter(new_iter) - logging.debug(f' >>> new node_id with another iter: {new_node_id}') - return new_node_id - - - ## Returns all non committed non-loop nodes - def get_all_non_committed_standard_nodes(self) -> "list[NodeId]": - all_non_committed = self.get_all_non_committed() - logging.debug(f"All non committed nodes: {all_non_committed}") - return self.filter_standard_nodes(all_non_committed) - - def get_next(self, node_id:NodeId) -> "list[NodeId]": - return self.adjacency[node_id][:] - - def get_prev(self, node_id:NodeId) -> "list[NodeId]": - return self.inverse_adjacency[node_id][:] - - def add_edge(self, from_id: NodeId, to_id: NodeId): - ## KK 2023-05-04 Is it a problem that we append? Maybe we should make that a set - self.adjacency[from_id].append(to_id) - self.inverse_adjacency[to_id].append(from_id) - - def remove_edge(self, from_id: NodeId, to_id: NodeId): - self.adjacency[from_id].remove(to_id) - self.inverse_adjacency[to_id].remove(from_id) - - def get_transitive_closure(self, target_node_ids:"list[NodeId]") -> "list[NodeId]": - all_next_transitive = set(target_node_ids) - next_work = target_node_ids.copy() - while len(next_work) > 0: - node_id = next_work.pop() - successors = set(self.get_next(node_id)) - new_next = successors - all_next_transitive - all_next_transitive = all_next_transitive.union(successors) - next_work.extend(new_next) - return list(all_next_transitive) - - def get_inverse_transitive_closure(self, target_node_ids:"list[NodeId]") -> "list[NodeId]": - all_prev_transitive = set(target_node_ids) - next_work = target_node_ids.copy() - while len(next_work) > 0: - node_id = next_work.pop() - predecessors = set(self.get_prev(node_id)) - new_prev = predecessors - all_prev_transitive - all_prev_transitive = all_prev_transitive.union(predecessors) - next_work.extend(new_prev) - return list(all_prev_transitive) - - def get_transitive_closure_if_can_be_resolved(self, can_be_resolved: list, target_node_ids: list) -> list: - all_next_transitive = set(target_node_ids) - next_work = target_node_ids.copy() - while len(next_work) > 0: - node_id = next_work.pop() - successors = {next_node_id for next_node_id in self.get_next(node_id) if next_node_id in can_be_resolved} - new_next = successors - all_next_transitive - all_next_transitive = all_next_transitive.union(successors) - next_work.extend(new_next) - return list(all_next_transitive) - - def update_rw_set(self, node_id, rw_set): - self.rw_sets[node_id] = rw_set - - def get_rw_set(self, node_id) -> RWSet: - return self.rw_sets[node_id] - - def get_rw_sets(self) -> dict: - return self.rw_sets - - def add_to_read_set(self, node_id: NodeId, item: str): - self.rw_sets[node_id].add_to_read_set(item) - - def add_to_write_set(self, node_id: NodeId, item: str): - self.rw_sets[node_id].add_to_write_set(item) - - def add_to_speculated(self, node_id: NodeId): - self.speculated = self.speculated.union([node_id]) - - def is_first_node_when_env_is_uninitialized(self, speculate_immediately): - if not speculate_immediately: - starting_env_node = self.get_source_nodes() - ## We may have a loop node at the start - ## In that case, we roll back to the initial env - if len(starting_env_node) > 0 and self.get_latest_env_file_for_node(starting_env_node[0]) is None: - logging.debug("Initializing latest env and speculating") - return True - return False - - # Check if the specific command can be resolved. - # KK 2023-05-04 I am not even sure what this function does and why is it useful. - def cmd_can_be_resolved(self, node_id: int) -> bool: - logging.debug(f'Checking if node {node_id} can be resolved...') - ## Get inverse_transitive_closure to find all nodes that are before this one - inverse_tc_node_ids = self.get_inverse_transitive_closure([node_id]) - - ## Out of those nodes, filter out the non-committed ones - non_committed_nodes_in_inverse_tc = [node_id for node_id in inverse_tc_node_ids - if not self.is_committed(node_id)] - logging.debug(f' > Non committed nodes that are predecessors to {node_id} are: {non_committed_nodes_in_inverse_tc}') - - currently_executing_ids = self.get_currently_executing() - logging.debug(f' > Currently executing: {currently_executing_ids}') - - ## TODO: Make this check more efficient - for other_node_id in non_committed_nodes_in_inverse_tc: - ## If one of the non-committed nodes in the inverse_tc is currently executing then - ## we can't resolve this command - ## KK 2023-05-04 This is not sufficient. In the future (where we don't speculate everything at once) - ## there might be a case where nothing is executing but a command can still not be resolved. - if other_node_id in currently_executing_ids: - logging.debug(f' >> Cannot resolve {node_id}: Node {other_node_id} in non committed inverse tc is currently executing') - return False - - ## If there exists a loop node that is not committed before the command then we cannot resolve. - if self.is_loop_node(other_node_id): - logging.debug(f' >> Cannot resolve {node_id}: Node {other_node_id} in non committed inverse tc is a loop node') - return False - - ## Otherwise we can return - logging.debug(f' >> Able to resolve {node_id}') - return True - - def __kill_all_currently_executing_and_schedule_restart(self, start=None): - nodes_to_kill = self.get_currently_executing() - if start is not None: - nodes_to_kill = [node_id for node_id in nodes_to_kill if node_id in self.get_transitive_closure([start])] - for cmd_id in nodes_to_kill: - self.__kill_node(cmd_id) - most_recent_new_env = self.get_most_recent_possible_new_env_for_node(cmd_id) - self.prechecked_env.discard(cmd_id) - if most_recent_new_env is not None: - - self.set_latest_env_file_for_node(cmd_id, most_recent_new_env) - self.workset.remove(cmd_id) - log_time_delta_from_named_timestamp("PartialOrder", "RunNode", cmd_id) - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", cmd_id, key=f"PostExecResolution-{cmd_id}") - # Our new workset is the nodes that were killed - # Previous workset got killed - self.workset.extend(nodes_to_kill) - - - def __kill_node(self, cmd_id: "NodeId"): - logging.debug(f'Killing and restarting node {cmd_id} because some workspaces have to be committed') - proc_to_kill, trace_file, _stdout, _stderr, _post_execution_env_file, _ = self.commands_currently_executing.pop(cmd_id) - # Add the trace file to the banned file list so we know to ignore the CommandExecComplete response - self.banned_files.add(trace_file) - - alive_after_kill = util.kill_process_tree(proc_to_kill.pid) - - if alive_after_kill: - logging.critical("Processes still alive after attempting to kill:") - for proc in alive_after_kill: - logging.critical(proc) - else: - logging.debug("All processes were successfully terminated.") - - def resolve_commands_that_can_be_resolved_and_push_frontier(self): - # This may be obsolete since we only resolve one node at a time - # cmds_to_resolve = self.__pop_cmds_to_resolve_from_speculated() - # assert len(cmds_to_resolve) <= 1 - if len(self.speculated) == 0: - cmds_to_resolve = [] - else: - cmds_to_resolve = [self.speculated.pop()] - logging.debug(f"Commands to check for dependencies this round are: {sorted(cmds_to_resolve)}") - logging.debug(f"Commands that cannot be resolved this round are: {sorted(self.speculated)}") - ## Resolve dependencies for the commands that can actually be resolved - to_commit = self.__resolve_dependencies_continuous_and_move_frontier(cmds_to_resolve) - for cmd in to_commit: - log_time_delta_from_named_timestamp("PartialOrder", "ResolveDependencies", cmd) - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", cmd, key=f"PostExecResolution-{cmd}") - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProcKilling") - - if len(to_commit) == 0: - logging.debug(" > No nodes to be committed this round") - else: - logging.debug(f" > Nodes to be committed this round: {to_commit}") - logging.trace(f"Commit|"+",".join(str(node_id) for node_id in to_commit)) - if config.SANDBOX_KILLING: - logging.info("Sandbox killing") - self.__kill_all_currently_executing_and_schedule_restart(to_commit) - log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling") - self.commit_cmd_workspaces(to_commit) - - def check_dependencies(self, cmds_to_check, get_first_cmd_ids_fn, update_state_due_to_a_dependency_fn): - for second_cmd_id in cmds_to_check: - for first_cmd_id in get_first_cmd_ids_fn(second_cmd_id): - - if self.rw_sets.get(first_cmd_id) is not None and self.has_forward_dependency(first_cmd_id, second_cmd_id): - update_state_due_to_a_dependency_fn(first_cmd_id, second_cmd_id) - - # Internal function, modified the run_after dict and the pending_to_execute set - def __populate_run_after_dict(self): - for node in self.pending_to_execute.copy(): - prev_to_be_resolved = self.to_be_resolved_prev.get(node) - if prev_to_be_resolved is None: - return - # Check if env has changed since last comparison - elif set(self.to_be_resolved[node]) == set(prev_to_be_resolved): - # Not caring about this dependency because env has not yet changed - self.pending_to_execute.remove(node) - for k, v in self.run_after.items(): - if node in v: - self.run_after[k].remove(node) - - ## Spots dependencies and updates the state. - ## Safe to call everywhere - def resolve_dependencies_early(self, node_id=None): - def get_first_cmd_ids(second_cmd_id): - return sorted(self.to_be_resolved[second_cmd_id], reverse=True) - - def update_state_due_to_a_dependency(first_cmd_id, second_cmd_id): - self.waiting_for_frontend.discard(second_cmd_id) - self.run_after[first_cmd_id].add(second_cmd_id) - self.pending_to_execute.add(second_cmd_id) - logging.debug(f"Early resolution: Rerunning node {second_cmd_id} after {first_cmd_id} because of a dependency") - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", second_cmd_id) - - to_check = {node for node in self.waiting_for_frontend if node not in self.speculated} - if node_id is not None: - to_check.add(node_id) - self.check_dependencies(to_check, get_first_cmd_ids, update_state_due_to_a_dependency) - self.populate_to_be_resolved_dict() - self.__populate_run_after_dict() - - def resolve_dependencies(self, cmds_to_resolve): - def get_first_cmd_ids(second_cmd_id): - return sorted([cmd_id for cmd_id in self.to_be_resolved[second_cmd_id] if cmd_id not in self.stopped]) - - def update_state_due_to_a_dependency(first_cmd_id, second_cmd_id): - logging.debug(f' > Command {second_cmd_id} was added to the workset, due to a forward dependency with {first_cmd_id}') - new_workset.add(second_cmd_id) - - new_workset = set() - self.check_dependencies(sorted(cmds_to_resolve), get_first_cmd_ids, update_state_due_to_a_dependency) - - return new_workset - - - ## Resolve all the forward dependencies and update the workset - ## Forward dependency is when a command's output is the same - ## as the input of a following command - def __resolve_dependencies_continuous_and_move_frontier(self, cmds_to_resolve): - # self.log_partial_program_order_info() - for cmd in cmds_to_resolve: - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ResolveDependencies", cmd) - - logging.debug(f"Commands to be checked for dependencies: {sorted(cmds_to_resolve)}") - logging.debug(" --- Starting dependency resolution --- ") - new_workset = self.resolve_dependencies(cmds_to_resolve) - - logging.debug(" > Modifying workset accordingly") - # New workset contains previous unresolved commands and resolved commands with dependencies that have not been stopped - workset_old = self.workset.copy() - self.workset = [cmd_id for cmd_id in self.workset if cmd_id not in cmds_to_resolve and cmd_id not in self.stopped] - self.workset.extend(list(new_workset)) - workset_diff = set(self.workset) - set(workset_old) - logging.trace(f"WorksetAdd|{','.join(str(cmd_id) for cmd_id in workset_diff)}") - - # Keep the previous committed state - old_committed = self.get_committed() - - # We want stopped commands to not enter the workset again yet - assert(set(self.workset).isdisjoint(self.stopped)) - - self.__frontier_commit_and_push() - # self.log_partial_program_order_info() - return set(self.get_committed()) - old_committed - - - ## This method checks if nid1 would be before nid2 if nid2 was part of the PO. - ## - ## Therefore it does not just check edges, but rather computes if it would be before - ## based on ids and loop iterations. - ## - ## 1. Check if the loop ids of the two abstract parents of both nodes differ - ## thus showing that one is before the other - ## 2. If all loop ids are the same, now we can actually compare iterations. - ## If a node is in the same loop ids but in a later iteration then it is later. - ## 3. If all iterations are the same too, then we just compare node ids - ## - ## KK 2023-05-22 This is a complex procedure, I wonder if we can simplify it in some way - def hypothetical_before(self, nid1: NodeId, nid2: NodeId): - raw_id1 = nid1.get_non_iter_id() - ## Get all loop ids that nid1 could be in - loop_ids1 = self.get_node_loop_context(raw_id1) - - raw_id2 = nid1.get_non_iter_id() - ## Get all loop ids that nid2 could be in - loop_ids2 = self.get_node_loop_context(raw_id2) - - i = 0 - while i < len(loop_ids1) and i < len(loop_ids2): - loop_id_1 = loop_ids1.get(len(loop_ids1) - 1 - i) - loop_id_2 = loop_ids2.get(len(loop_ids2) - 1 - i) - ## If the first node is in a previous loop than the second, - ## then we are done. - if loop_id_1 < loop_id_2: - return True - elif loop_id_1 > loop_id_2: - return False - - ## We need to keep going - i += 1 - - ## If we reach this, we know that both nodes are in the same loops up to i - ## so we now compare iterations and node identifiers. - - iters1 = nid1.get_iters() - iters2 = nid2.get_iters() - - i = 0 - while i < len(iters1) and i < len(iters2): - iter1 = iters1.get(len(iters1) - 1 - i) - iter2 = iters2.get(len(iters2) - 1 - i) - ## If the first node is in a previous iteration than the second, - ## then we are done. - if iter1 < iter2: - return True - elif iter1 > iter2: - return False - ## We need to keep going - i += 1 - - ## We now know that their common prefix of iterations is the same - - ## Check if the node could potentially generate other nodes that are bigger - ## i.e., if it is more abstract. If so, then it is not smaller. - common_loop_depth = min(len(loop_ids1), len(loop_ids2)) - abstract_depth1 = max(common_loop_depth - len(iters1), 0) - abstract_depth2 = max(common_loop_depth - len(iters2), 0) - if abstract_depth1 < abstract_depth2: - return True - elif abstract_depth1 > abstract_depth2: - return False - - return nid1.id < nid2.id - - - def progress_po_due_to_wait(self, node_id: NodeId): - logging.debug(f"Checking if we can progress the partial order after having received a wait for {node_id}") - ## The node might not be part of the partial order if it corresponds to - ## a loop node iteration. In this case, we just need to make sure that - ## we commit the right previous loop nodes that are relevant to it. - if not self.is_node_id(node_id): - ## TODO: This check is not correct currently, it works for now, but when we move to full partial orders it wont anymore, - ## due to the check happening with < in hypothetical before - logging.debug(f" > Node {node_id} is not part of the PO so we compute the nodes that would be before it...") - all_non_committed = self.get_all_non_committed() - all_non_committed_loop_nodes = self.filter_loop_nodes(all_non_committed) - non_committed_loop_nodes_that_would_be_predecessors = [n_id for n_id in all_non_committed_loop_nodes - if self.hypothetical_before(n_id, node_id)] - - new_committed_nodes = non_committed_loop_nodes_that_would_be_predecessors - - else: - logging.debug(f" > Node {node_id} is part of the PO so we just check its predecessors following the inverse edges...") - ## If the node is in the PO, then we can proceed normally and find its predecessors and commit them - - ## Get inverse_transitive_closure to find all nodes that are before this one - inverse_tc_node_ids = self.get_inverse_transitive_closure([node_id]) - - ## Out of those nodes, filter out the non-committed loop ones - non_committed_loop_nodes_in_inverse_tc = [node_id for node_id in inverse_tc_node_ids - if not self.is_committed(node_id) and - self.is_loop_node(node_id)] - logging.debug(f'Non committed loop nodes that are predecessors to {node_id} are: {non_committed_loop_nodes_in_inverse_tc}') - - new_committed_nodes = non_committed_loop_nodes_in_inverse_tc - - ## And "close them" - ## TODO: This is a hack here, we need to have a proper method that commits - ## nodes and does whatever else is needed to do (e.g., add new nodes to frontier) - logging.debug(f'Adding following loop nodes to committed: {new_committed_nodes}') - for node_id in new_committed_nodes: - self.commit_node(node_id) - - ## Since we committed some nodes, let's make sure that we also push the frontier - ## TODO: Can we do this in a less hacky method? By using a well-defined commit_node_and_push_frontier method? - if len(new_committed_nodes) > 0: - new_nodes_sinks = self.get_sub_po_sink_nodes(new_committed_nodes) - assert(len(new_nodes_sinks) == 1) - new_nodes_sink = new_nodes_sinks[0] - logging.debug(f'The sink of the newly committed loop nodes is {new_nodes_sink}') - - next_nodes = self.get_next(new_nodes_sink) - next_standard_nodes = self.filter_standard_nodes(next_nodes) - logging.trace(f"Adding its next nodes to the frontier|{','.join(str(node_id) for node_id in next_standard_nodes)}") - self.frontier.extend(next_standard_nodes) - - - - ## TODO: Add some form of validity assertion after we are done with this. - ## Just to make sure that we haven't violated the continuity of the committed set. - - ## We check if something can be resolved and stepped forward here - ## KK 2023-05-10 This seems to work for all tests (so it might be idempotent - ## since in many tests there is nothing new to resolve after a wait) - self.resolve_commands_that_can_be_resolved_and_push_frontier() - - ## When the frontend sends a wait for a node, it means that execution in the frontend has - ## already surpassed all nodes prior to it. This is particularly important for loops, - ## since we can't always statically predict how many iterations they will do, so the only - ## definitive way to know that they are done is to receive a wait for a node after them. - def wait_received(self, node_id: NodeId): - ## Whenever we receive a wait for a node, we always need to check and "commit" all prior loop nodes - ## since we know that they won't have any more iterations (the JIT frontend has already passed them). - - ## We first have to push and progress the PO due to the wait and then unroll - ## KK 2023-05-22 Currently this checks whether a still nonexistent node is - ## would be a successor of existing nodes to commit some of - ## them if needed. Unfortunately, to make this check for a non-existent - ## node is very complex and not elegant. - ## TODO: Could we swap unrolling and progressing so that we always - ## check if a node can be progressed by checking edges? - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id) - self.progress_po_due_to_wait(node_id) - log_time_delta_from_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id) - - - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id) - ## Unroll some nodes if needed. - if node_id.has_iters(): - ## TODO: This unrolling can also happen and be moved to speculation. - ## For now we are being conservative and that is why it only happens here - ## TODO: Move this to the scheduler.schedule_work() (if we have a loop node waiting for response and we are not unrolled, unroll to create work) - self.maybe_unroll(node_id) - - assert(self.valid()) - - def find_outer_loop_sub_partial_order(self, loop_id: int, nodes_subset: "list[NodeId]") -> "list[NodeId]": - loop_node_ids = [] - for node_id in nodes_subset: - loop_context = self.get_node_loop_context(node_id) - ## Note: this only checks for the nodes that have this loop id as their outer loop - if not loop_context.is_empty() and loop_id == loop_context.get_outer(): - loop_node_ids.append(node_id) - ## TODO: Assert that this is closed w.r.t. partial order - return loop_node_ids - - - ## This function unrolls a single loop, by first finding all its nodes (they must be contiguous) and then creating new versions of them - ## that are concretized. Its second argument describes which subset of all partial order nodes we want to look at. - ## That is necessary because when unrolling nested loops, we might end up in a situation where we have unrolled the - ## outer loop, but some of the newly created nodes might still be loop nodes (so we might have loop nodes for the same loop in multiple locations). - def unroll_single_loop(self, loop_id: int, nodes_subset: "list[NodeId]"): - logging.info(f'Unrolling loop with id: {loop_id}') - all_loop_node_ids = self.find_outer_loop_sub_partial_order(loop_id, nodes_subset) - - ## We don't want to unroll already committed nodes - loop_node_ids = [nid for nid in all_loop_node_ids - if not self.is_committed(nid)] - - logging.debug(f'Node ids for loop: {loop_id} are: {loop_node_ids}') - - ## Create the new nodes and remap adjacencies accordingly - node_mappings = {} - for node_id in loop_node_ids: - node = self.get_node(node_id) - new_loop_node_id = self.create_node_id_with_one_less_loop_from_loop_node(node_id, loop_id) - node_mappings[node_id] = new_loop_node_id - ## The new node has one less loop context than the previous one - node_loop_contexts = node.get_loop_context() - logging.debug(f'Node: {node_id} loop_contexts: {node_loop_contexts}') - assert(node_loop_contexts.get_outer() == loop_id) - new_node_loop_contexts = copy.deepcopy(node_loop_contexts) - new_node_loop_contexts.pop_outer() - - ## Create the new node - self.nodes[new_loop_node_id] = Node(new_loop_node_id, node.cmd, node.asts, new_node_loop_contexts) - self.executions[new_loop_node_id] = 0 - logging.debug(f'New loop ids: {node_mappings}') - - ## Create the new adjacencies, by mapping adjacencies in the node set to the new node ids - ## and leaving outside adjacencies as they are - for _, new_node_id in node_mappings.items(): - self.adjacency[new_node_id] = [] - - for node_id, new_node_id in node_mappings.items(): - old_prev_ids = self.get_prev(node_id) - ## Modify all id to be in the new set except for the - new_prev_ids = PartialProgramOrder.map_using_mapping(old_prev_ids, node_mappings) - self.inverse_adjacency[new_node_id] = new_prev_ids - for new_prev_id in new_prev_ids: - self.adjacency[new_prev_id].append(new_node_id) - - ## TODO: The rest of the code here makes assumptions about the shape of the partial order - - ## Modify the previous node of the loop nodes - new_nodes_sinks = self.get_sub_po_sink_nodes(list(node_mappings.values())) - assert(len(new_nodes_sinks) == 1) - new_nodes_sink = new_nodes_sinks[0] - logging.debug(f'The sink of the new iteration for loop: {loop_id} is {new_nodes_sink}') - - old_nodes_sources = self.get_sub_po_source_nodes(list(node_mappings.keys())) - assert(len(old_nodes_sources) == 1) - old_nodes_source = old_nodes_sources[0] - - old_next_node_ids = self.get_next(new_nodes_sink) - assert(len(old_next_node_ids) <= 1) - - previous_ids = self.get_sub_po_prev_nodes(loop_node_ids) - assert(len(previous_ids) <= 1) - - ## Add a new edge between the new_sink (concrete iter) and the old_source (loop po) - self.add_edge(new_nodes_sink, old_nodes_source) - - ## Remove the old previous edge of the old_source if it exists - if len(previous_ids) == 1: - previous_id = previous_ids[0] - logging.debug(f'Previous node id for loop: {loop_id} is {previous_id}') - self.remove_edge(from_id=previous_id, - to_id=old_nodes_source) - - - ## Return the new first node and all node mappings - return node_mappings[old_nodes_source], node_mappings.values() - - ## Static method that just maps using a node mapping dictionary or leaves them as - ## they are if not - def map_using_mapping(node_ids: "list[NodeId]", mapping) -> "list[NodeId]": - new_node_ids = [] - for node_id in node_ids: - if node_id in mapping: - new_id = copy.deepcopy(mapping[node_id]) - else: - new_id = copy.deepcopy(node_id) - new_node_ids.append(new_id) - return new_node_ids - - ## This unrolls a sequence of loops by unrolling each loop outside-in - def unroll_loops(self, loop_contexts: LoopStack) -> NodeId: - logging.debug(f'Unrolling the following loops: {loop_contexts}') - - ## All new node_ids - all_new_node_ids = set() - relevant_node_ids = list(self.nodes.keys()) - for loop_ctx in loop_contexts.outer_to_inner(): - new_first_node_id, new_node_ids = self.unroll_single_loop(loop_ctx, relevant_node_ids) - logging.debug(f'New node ids after unrolling: {new_node_ids}') - ## Update all new nodes that we have added - all_new_node_ids.update(new_node_ids) - - ## Re-set the relevant node ids to only the new nodes (if we unrolled a big loop once, - ## we just want to look at those new unrolled nodes for the next unrolling). - relevant_node_ids = new_node_ids - - logging.debug(f' >>> Edges after unrolling : {self.adjacency}') - logging.debug(f' >>> Inv Edges after unrolling: {self.inverse_adjacency}') - - ## Add all new standard nodes to the workset (since they have to be tracked) - for new_node_id in all_new_node_ids: - if not self.is_loop_node(new_node_id): - self.workset.append(new_node_id) - ## GL: 08-24-2023: This might not the best way to treat this as we need - ## to update the env half way through the loop. - ## For now, we just copy the env from the parent loop node - non_iter_id = new_node_id.get_non_iter_id() - logging.debug(f"Copying latest env from loop context to loop node: {non_iter_id} -> {new_node_id}") - self.latest_envs[new_node_id] = self.latest_envs[non_iter_id] - - ## KK 2023-05-22 Do we need to correctly populate the resolved set of next commands - ## after unrolling the loop. - - return new_first_node_id - - ## This unrolls a loop given a target concrete node id - def unroll_loop_node(self, target_concrete_node_id: NodeId): - raw_node_id = target_concrete_node_id.get_non_iter_id() - assert(self.is_loop_node(raw_node_id)) - - logging.debug(f'Edges: {self.adjacency}') - - ## Find the closest non-committed successor with this node id - ## Note: This is necessary because we might need to unroll only a subset of the loops that a node is part of. - ## This is relevant when we have nested loops. - all_non_committed = self.get_all_non_committed() - all_non_committed_loop_nodes = self.filter_loop_nodes(all_non_committed) - logging.debug(f'All non committed loop nodes: {all_non_committed_loop_nodes}') - source_node_ids = self.get_min(all_non_committed_loop_nodes) - ## Note: This assertion might not hold once we have actual partial orders - assert(len(source_node_ids) == 1) - node_id = source_node_ids[0] - logging.debug(f'Closest non-committed loop node successor with raw_id {raw_node_id} is: {node_id}') - loop_contexts = self.get_node_loop_context(node_id) - - - ## Unroll all loops that this node is in - new_first_node_id = self.unroll_loops(loop_contexts) - - ## TODO: This needs to change when we modify unrolling to happen speculatively too - ## TODO: This needs to properly add the node to frontier and to resolve dictionary - - # GL 2023-05-22: __frontier_commit_and_push() should be called here instead of step_forward() - # Although without it the test cases pass - self.frontier.append(new_first_node_id) - - ## At the end of unrolling the target node must be part of the PO - assert(self.is_node_id(target_concrete_node_id)) - - - def maybe_unroll(self, node_id: NodeId) -> NodeId: - ## Only unrolls this node if it doesn't already exist in the PO - if not self.is_node_id(node_id): - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "Unrolling", node_id) - self.unroll_loop_node(node_id) - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "Unrolling", node_id) - ## The node_id must be part of the PO after unrolling, otherwise we did something wrong - assert(self.is_node_id(node_id)) - - - ## Pushes the frontier forward as much as possible for all commands in it that can be committed - ## This function is not safe to call on its own, since it might leave the PO in a broken state - ## It should be called right after - def __frontier_commit_and_push(self): - logging.debug(" > Commiting and pushing frontier") - logging.debug(f' > Frontier: {self.frontier}') - changes_in_frontier = True - while changes_in_frontier: - new_frontier = [] - changes_in_frontier = False - # Second condition below may be unecessary - for frontier_node in self.frontier: - ## If a node is not in the workset it means that it is actually done executing - ## KK 2023-05-10 Do we need all these conditions in here? Some might be redundant? - if frontier_node not in self.get_currently_executing() \ - and frontier_node not in self.get_committed() \ - and frontier_node not in self.stopped \ - and frontier_node not in self.speculated \ - and frontier_node not in self.workset \ - and not self.is_loop_node(frontier_node) \ - and frontier_node not in self.waiting_for_frontend: - ## Commit the node - self.commit_node(frontier_node) - - ## Add its non-loop successors to the frontier - next_nodes = self.get_next(frontier_node) - next_standard_nodes = self.filter_standard_nodes(next_nodes) - logging.trace(f"FrontierAdd|{','.join(str(node_id) for node_id in next_standard_nodes)}") - new_frontier.extend(next_standard_nodes) - - ## There are some changes in the frontier so we need to reenter the loop - changes_in_frontier = True - # If node is still being executed, we cannot progress further - else: - new_frontier.extend([frontier_node]) - if frontier_node in self.get_currently_executing(): - logging.debug(f" > Node {frontier_node} is still being executed") - elif frontier_node in self.get_committed(): - logging.debug(f" > Node {frontier_node} is already committed") - elif frontier_node in self.stopped: - logging.debug(f" > Node {frontier_node} is stopped") - elif frontier_node in self.speculated: - logging.debug(f" > Node {frontier_node} is speculated") - elif frontier_node in self.workset: - logging.debug(f" > Node {frontier_node} is in the workset") - elif self.is_loop_node(frontier_node): - logging.debug(f" > Node {frontier_node} is a loop node") - elif frontier_node in self.waiting_for_frontend: - logging.debug(f" > Node {frontier_node} is waiting for frontend") - logging.debug(f" > Not commiting node {frontier_node}, keeping in frontier") - - ## Update the frontier to the new frontier - self.frontier = new_frontier - - - ## For a file - dir forward dependency to exist, - ## we need the succeding command to attempt to read anything that is a subpath of the - ## write set of the preceeding command. - ## e.g. in: W1: {/foo/} | R2: {/f1, /foo/f2, /foo/bar/f3} - ## /foo/f2 and /foo/bar/f3 will trigger the dependency check. - def has_dir_file_dependency(self, first_cmd_set, second_cmd_set): - # Get all directory paths without the "/" in the end - dirs = {dir_path[:-1] for dir_path in first_cmd_set if dir_path.endswith("/")} - # Get all files in a separate set - to_check = {filepath for filepath in second_cmd_set if not filepath.endswith("/")} - for dir in dirs: - for other_path in to_check: - if self.is_subpath(dir, other_path): - logging.debug(f' > File forward dependency found C1:({dir}) C2:({other_path})') - return True - return False - - def is_subpath(self, dir, other_path): - other_path.startswith(os.path.abspath(dir)+os.sep) - - def has_forward_dependency(self, first_id, second_id): - first_write_set = set(self.rw_sets[first_id].get_write_set()) - second_read_set = set(self.rw_sets[second_id].get_read_set()).union(set(self.rw_sets[second_id].get_write_set())) - logging.debug(f'Checking dependencies between {first_id} and {second_id}') - if not first_write_set.isdisjoint(second_read_set): - logging.debug(f' > Forward dependency found {first_write_set.intersection(second_read_set)}') - return True - - elif self.has_dir_file_dependency(first_write_set, second_read_set): - return True - else: - logging.debug(f' > No dependencies') - return False - - def get_all_next_non_committed_nodes(self) -> "list[NodeId]": - next_non_committed_nodes = [] - for cmd_id in self.get_all_non_committed(): - if cmd_id in self.workset and self.is_next_non_committed_node(cmd_id): - next_non_committed_nodes.append(cmd_id) - return next_non_committed_nodes - - def is_next_non_committed_node(self, node_id: NodeId) -> bool: - # We want the predecessor to be committed and the current node to not be committed - for prev_node in self.get_prev(node_id): - if not (self.is_committed(prev_node) and not self.is_committed(node_id)): - return False - return True - - # This command never leaves the partial order at a broken state - # It is always safe to call it - def attempt_move_stopped_to_workset(self): - new_stopped = self.stopped.copy() - ## We never remove stopped commands that are unsafe - ## from the stopped set to be reexecuted. - for cmd_id in self.get_stopped_safe(): - if self.is_next_non_committed_node(cmd_id): - self.workset.append(cmd_id) - logging.debug(f"StoppedRemove|{cmd_id}") - new_stopped.remove(cmd_id) - self.to_be_resolved[cmd_id] = [] - self.stopped = new_stopped - - ## TODO: Eventually, in the future, let's add here some form of limit - def schedule_work(self, limit=0): - if self.is_first_node_when_env_is_uninitialized(config.SPECULATE_IMMEDIATELY): - logging.debug("Not scheduling work yet, waiting for first Wait") - return - # self.log_partial_program_order_info() - logging.debug("Rerunning stopped commands") - # attempt_move_stopped_to_workset() needs to happen before the node execution - self.attempt_move_stopped_to_workset() - ## GL 2023-07-05 populate_to_be_resolved_dict() is OK to call anywhere, - ## __frontier_commit_and_push() is not safe to call here - self.populate_to_be_resolved_dict() - - ## TODO: Move loop unrolling here for speculation too - - conflicted_nodes = self.nodes_with_uncommited_conflict() - for cmd_id in self.get_workset(): - if cmd_id in conflicted_nodes: - continue - # We only need to schedule non-committed and non-executing nodes - if not (cmd_id in self.get_committed() or \ - cmd_id in self.commands_currently_executing): - self.schedule_node(cmd_id) - assert(self.valid()) - - # Nodes to be scheduled are always not committed and not executing - def schedule_node(self, cmd_id): - # This replaced the old frontier check - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "RunNode", cmd_id) - if self.is_next_non_committed_node(cmd_id): - # TODO: run this and before committing kill any speculated commands still executing - self.run_cmd_non_blocking(cmd_id) - else: - if not cmd_id in self.speculated: - self.speculate_cmd_non_blocking(cmd_id) - return - - ## Run a command and add it to the dictionary of executing ones - def run_cmd_non_blocking(self, node_id: NodeId): - ## A command should only be run if it's in the frontier, otherwise it should be spec run - logging.debug(f'Running command: {node_id} {self.get_node(node_id)}') - logging.debug(f"ExecutingAdd|{node_id}") - self.to_be_resolved_prev[node_id] = self.to_be_resolved[node_id].copy() - self.execute_cmd_core(node_id, speculate=False) - - ## Run a command and add it to the dictionary of executing ones - def speculate_cmd_non_blocking(self, node_id: NodeId): - logging.debug(f'Speculating command: {node_id} {self.get_node(node_id)}') - ## TODO: Since these (this and the function above) - ## are relevant for the report maker, - ## add them in some library (e.g., trace_for_report) - ## so that we don't accidentally delete them. - logging.debug(f"ExecutingSandboxAdd|{node_id}") - self.execute_cmd_core(node_id, speculate=True) - - def execute_cmd_core(self, node_id: NodeId, speculate=False): - node = self.get_node(node_id) - ## TODO: Read and pass the actual variables in this - variables = {} - is_safe = analysis.safe_to_execute(node.asts, variables) - if not is_safe: - logging.debug(f'Command: "{node}" is not safe to execute, sending to the original shell to execute...') - - ## Keep some state around to determine that this command is not safe to execute. - self.stopped.add(node_id) - self.unsafe.add(node_id) - ## TODO: After we respond to the wait, we need to invalidate all later - ## commands as if they had dependencies with it. In the future, - ## we can be smarter with it. Many unsafe commands will not have - ## other side-effects, so we don't need to invalidate anything after them. - return - - cmd = node.get_cmd() - self.executions[node_id] += 1 - env_file_to_execute_with = self.get_latest_env_file_for_node(node_id) - logging.debug(f"Executing with environment file: {env_file_to_execute_with}") - if speculate: - execute_func = executor.async_run_and_trace_command_return_trace_in_sandbox_speculate - else: - execute_func = executor.async_run_and_trace_command_return_trace - - proc, trace_file, stdout, stderr, post_execution_env_file, sandbox_dir = execute_func(cmd, node_id, env_file_to_execute_with) - self.commands_currently_executing[node_id] = (proc, trace_file, stdout, stderr, post_execution_env_file, sandbox_dir) - logging.debug(f" >>>>> Command {node_id} - {proc.pid} just started executing - {post_execution_env_file}") - - def nodes_with_uncommited_conflict(self): - uncommited_run_after = [node_id for node_id in self.run_after if node_id not in self.committed] - total_conflicts = set() - for node_id in uncommited_run_after: - conflicts = self.run_after[node_id] - total_conflicts.update(conflicts) - return total_conflicts - - def kill_and_stop(self, node_id: NodeId): - proc, _, _, _, _, _ = self.commands_currently_executing.pop(node_id) - util.kill_process_tree(proc.pid, sig=signal.SIGTERM) - - def early_stop_using_dep(self): - for node_id, info_tuple in self.commands_currently_executing.items(): - trace_file = info_tuple[1] - sandbox_dir = info_tuple[5] - try: - trace_object = executor.read_trace(sandbox_dir, trace_file) - except FileNotFoundError: - continue - logging.info(f'going forward') - read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object) - rw_set = RWSet(read_set, write_set) - self.update_rw_set(node_id, rw_set) - for node_id in self.commands_currently_executing: - self.resolve_dependencies_early(node_id) - self.log_partial_program_order_info() - conflicts = self.nodes_with_uncommited_conflict() - to_be_killed = [node_id for node_id in self.commands_currently_executing if node_id in conflicts] - logging.info(f'>>>>>>>>>>>>>>>>> to be killed: {to_be_killed}') - for node_id in to_be_killed: - self.kill_and_stop(node_id) - - # This method attempts to add to workset (rerun) - # any command that found to have a dependency through early resolution - def attempt_rerun_pending_nodes(self): - restarted_nodes = set() - for node_id, run_after_nodes in self.run_after.items(): - new_run_after_nodes = run_after_nodes.copy() - if self.get_new_env_file_for_node(node_id) is not None and node_id not in self.pending_to_execute and node_id not in self.get_currently_executing(): - for node in run_after_nodes: - if node not in self.get_currently_executing(): - logging.debug(f"Running node {node} after execution of {node_id}") - self.waiting_for_frontend.discard(node) - self.workset.append(node) - self.pending_to_execute.discard(node) - self.set_latest_env_file_for_node(node, self.get_new_env_file_for_node(node_id)) - restarted_nodes.add(node) - self.prechecked_env.discard(node) - new_run_after_nodes.discard(node) - self.run_after[node_id] = new_run_after_nodes - return restarted_nodes - - def set_sandbox(self, node_id, sandbox_dir): - self.sandbox_dirs[node_id] = sandbox_dir - def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sandbox_dir: str): - log_time_delta_from_named_timestamp("PartialOrder", "RunNode", node_id) - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolution", node_id, key=f"PostExecResolution-{node_id}") - - logging.debug(f" --- Node {node_id}, just finished execution ---") - self.sandbox_dirs[node_id] = sandbox_dir - ## TODO: Store variable file somewhere so that we can return when wait - if not node_id in self.commands_currently_executing: - return - _proc, trace_file, stdout, stderr, post_execution_env_file, sandbox_dir = self.commands_currently_executing.pop(node_id) - if not sandbox_dir == self.sandbox_dirs[node_id]: - return - logging.trace(f"ExecutingRemove|{node_id}") - # Handle stopped by riker due to network access - if int(riker_exit_code) == 159: - logging.debug(f" > Adding {node_id} to stopped because it tried to access the network.") - logging.trace(f"StoppedAdd|{node_id}:network") - self.stopped.add(node_id) - else: - trace_object = executor.read_trace(sandbox_dir, trace_file) - cmd_exit_code = trace_v2.parse_exit_code(trace_object) - - ## Save the completed node info. Note that if the node doesn't commit - ## this information will be invalid and rewritten the next time execution - ## is completed for this node. - completed_node_info = CompletedNodeInfo(cmd_exit_code, post_execution_env_file, stdout, sandbox_dir) - self.nodes[node_id].set_completed_info(completed_node_info) - - ## We no longer add failed commands to the stopped set, - ## because this leads to more repetitions than needed - ## and does not allow us to properly speculate commands - read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object) - rw_set = RWSet(read_set, write_set) - self.update_rw_set(node_id, rw_set) - - if node_id in self.stopped: - logging.debug(f"Nothing new to be resolved since {node_id} exited with an error.") - if node_id in self.workset: - self.workset.remove(node_id) - logging.debug(f"WorksetRemove|{node_id}") - # If no commands can be resolved this round, - # do nothing and wait until a new command finishes executing - logging.debug("No resolvable nodes were found in this round, nothing will change...") - return - - - log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolutionECCheck", node_id, key=f"PostExecResolution-{node_id}", invalidate=False) - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolutionFrontendWait", node_id) - - ## Here we check if the most recent env has been received. If not, we cannot resolve anything just yet. - if self.get_new_env_file_for_node(node_id) is None: - logging.debug(f"Node {node_id} has not received its latest env from runtime yet. Waiting...") - self.waiting_for_frontend.add(node_id) - - # We will however attempt to resolve dependencies early - self.resolve_dependencies_early(node_id) - restarted_cmds = self.attempt_rerun_pending_nodes() - # self.log_partial_program_order_info() - ## Here we continue with the normal execution flow - else: - logging.debug(f"Node {node_id} has already received its latest env from runtime. Examining differences...") - self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) - - #TODO: Remove ths in the future - we need a more robust approach to check for env diffs. - def exclude_insignificant_diffs(self, env_diff_dict): - return {k: v for k, v in env_diff_dict.items() if k not in config.INSIGNIFICANT_VARS} - - #TODO: Remove ths in the future - we need a more robust approach to check for env diffs. - def include_only_significant_vars(self, env_diff_dict): - return {k: v for k, v in env_diff_dict.items() if k in config.SIGNIFICANT_VARS} - - def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_in_both): - # Exclude insignificant differences - only_in_new_sig = self.include_only_significant_vars(only_in_new) - only_in_latest_sig = self.include_only_significant_vars(only_in_latest) - different_in_both_sig = self.include_only_significant_vars(different_in_both) - # If still diffs are present, return False - if len(only_in_new_sig) > 0 or len(only_in_latest_sig) > 0 or len(different_in_both_sig) > 0: - logging.debug("Significant differences found:") - logging.debug(f"Unique to new (Wait): {only_in_new_sig}") - logging.debug(f"Unique to latest (Before Riker): {only_in_latest_sig}") - logging.debug(f"Differing values: {different_in_both_sig}") - return True - else: - logging.debug("No significant differences found:") - return False - - def update_env_and_restart_nodes(self, node_id: NodeId): - logging.debug(f"Significant differences found between new and latest env files for {node_id}.") - logging.debug(f"Assigning node {node_id} new env (Wait) as the new latest env and re-executing.") - self.set_latest_env_file_for_node(node_id, self.get_new_env_file_for_node(node_id)) - self.prechecked_env.discard(node_id) - if node_id not in self.workset: - self.workset.append(node_id) - self.__kill_all_currently_executing_and_schedule_restart(start=node_id) - new_waiting_for_frontend = self.waiting_for_frontend.copy() - for waiting_for_frontend_node in self.waiting_for_frontend: - if waiting_for_frontend_node not in self.workset and waiting_for_frontend_node in self.get_transitive_closure([node_id]): - self.workset.append(waiting_for_frontend_node) - new_waiting_for_frontend.remove(waiting_for_frontend_node) - most_recent_new_env = self.get_most_recent_possible_new_env_for_node(waiting_for_frontend_node) - self.set_latest_env_file_for_node(waiting_for_frontend_node, most_recent_new_env) - self.prechecked_env.discard(waiting_for_frontend_node) - assert(self.get_new_env_file_for_node(node_id) is not None) - assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None) - # self.log_partial_program_order_info() - logging.debug("-") - self.waiting_for_frontend = new_waiting_for_frontend - self.populate_to_be_resolved_dict() - - def resolve_most_recent_envs_check_only_wait_node_early(self, node_id: NodeId, restarted_cmds=None): - if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), - self.get_latest_env_file_for_node(node_id)): - self.update_env_and_restart_nodes(node_id) - else: - self.prechecked_env.add(node_id) - - def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(self, node_id: NodeId, restarted_cmds=None): - logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.") - self.waiting_for_frontend.discard(node_id) - if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), - self.get_latest_env_file_for_node(node_id)): - self.update_env_and_restart_nodes(node_id) - else: - logging.debug(f"Finding sets of commands that can be resolved after {node_id} finished executing and got its latest env") - assert(node_id not in self.stopped) - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "WaitingToResolve", node_id) - self.add_to_speculated(node_id) - self.resolve_dependencies_early(node_id) - restarted_cmds = self.attempt_rerun_pending_nodes() - logging.debug(f"Restarted after successful env resolution {restarted_cmds}") - # self.log_partial_program_order_info() - self.resolve_commands_that_can_be_resolved_and_push_frontier() - assert(self.valid()) - - def maybe_resolve_most_recent_envs_and_continue_resolution(self, node_id: NodeId): - if node_id in self.waiting_for_frontend: - logging.debug(f"Node {node_id} received its new env from runtime, continuing full env resolution.") - self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id) - else: - logging.debug(f"Node {node_id} received its new env from runtime, continuing early env resolution.") - self.resolve_most_recent_envs_check_only_wait_node_early(node_id) - - def new_and_latest_env_files_have_significant_differences(self, new_env_file, latest_env_file): - # Early resolution if same files are compared - if new_env_file == latest_env_file: - logging.debug(f"Env files are the same. No need to compare.") - return False - logging.debug(f"Comparing new and latest env files: {new_env_file} {latest_env_file}") - assert(latest_env_file is not None) - - new_env = executor.read_env_file(new_env_file) - latest_env = executor.read_env_file(latest_env_file) - - only_in_new, only_in_latest, different_in_both = util.compare_env_strings(new_env, latest_env) - - return self.significant_diff_in_env_dicts(only_in_new, only_in_latest, different_in_both) - - def print_cmd_stderr(self, stderr): - # stdout.seek(0) - # print(stdout.read().decode(), end="") - stderr.seek(0) - print(stderr.read().decode(), file=sys.stderr, end="") - - def commit_cmd_workspaces(self, to_commit_ids): - for cmd_id in sorted(to_commit_ids): - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "CommitNode", cmd_id) - workspace = self.sandbox_dirs[cmd_id] - if workspace != "": - logging.debug(f" (!) Committing workspace of cmd {cmd_id} found in {workspace}") - commit_workspace_out = executor.commit_workspace(workspace) - logging.debug(commit_workspace_out.decode()) - else: - logging.debug(f" (!) No need to commit workspace of cmd {cmd_id} as it was run in the main workspace") - log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "CommitNode", cmd_id) - - def log_rw_sets(self): - logging.debug("====== RW Sets " + "=" * 65) - for node_id, rw_set in self.rw_sets.items(): - logging.debug(f"ID:{node_id} | R:{[f for f in rw_set.get_read_set() if 'output_' in f] if rw_set else None} | W:{rw_set.get_write_set() if rw_set is not None else None}") - - def log_partial_program_order_info(self): - logging.debug(f"=" * 80) - logging.debug(f"WORKSET: {self.get_workset()}") - logging.debug(f"COMMITTED: {self.get_committed_list()}") - logging.debug(f"FRONTIER: {self.frontier}") - logging.debug(f"EXECUTING: {list(self.commands_currently_executing.keys())}") - logging.debug(f"STOPPED: {list(self.stopped)}") - logging.debug(f" of which UNSAFE: {list(self.get_unsafe())}") - logging.debug(f"WAITING: {sorted(list(self.speculated))}") - logging.debug(f"for FRONTEND: {sorted(list(self.waiting_for_frontend))}") - logging.debug(f"TO RESOLVE: {self.to_be_resolved}") - logging.debug(f"PENDING TO EXEC: {self.pending_to_execute}") - logging.debug(f"RUN AFTER: {self.run_after}") - logging.debug(f"New envs: {self.new_envs}") - logging.debug(f"Latest envs: {self.latest_envs}") - self.log_rw_sets() - logging.debug(f"=" * 80) - - ## TODO: Document how this finds the to be resolved dict - def populate_to_be_resolved_dict(self): - logging.debug("Populating the resolved dictionary for all nodes") - for node_id in self.nodes: - if self.is_committed(node_id): - logging.debug(f" > Node: {node_id} is committed, emptying its dict") - self.to_be_resolved[node_id] = [] - continue - # We don't want to modify the set of nodes to check for dependencies for this node - # as it started running before previous cmds had started executing - elif node_id in self.speculated: - logging.debug(f" > Node: {node_id} is waiting to be resolved, skipping...") - continue - elif node_id in self.get_currently_executing(): - logging.debug(f" > Node: {node_id} is currently executing, skipping...") - continue - elif node_id in self.waiting_for_frontend: - logging.debug(f" > Node: {node_id} is currently waiting for frontend, skipping...") - continue - else: - logging.debug(f" > Node: {node_id} is not executing or waiting to be resolved (speculated) so we modify its set.") - self.to_be_resolved[node_id] = [] - traversal = [] - relevant_committed = self.get_committed() - if node_id not in relevant_committed: - to_add = self.get_prev(node_id).copy() - traversal = to_add.copy() - to_be_resolved_nodes_ids = to_add.copy() - while len(traversal) > 0: - current_node_id = traversal.pop(0) - if current_node_id not in relevant_committed: - to_add = self.get_prev(current_node_id) - to_be_resolved_nodes_ids.extend(to_add) - traversal.extend(to_add) - self.to_be_resolved[node_id] = to_be_resolved_nodes_ids.copy() - self.to_be_resolved[node_id] = list(set(self.to_be_resolved[node_id]) - set(relevant_committed)) - logging.debug(f' |> New to be resolved set: {self.to_be_resolved[node_id]}') - - def get_currently_executing(self) -> list: - return sorted(list(self.commands_currently_executing.keys())) - - def log_executions(self): - logging.debug("---------- (Re)executions ------------") - for cmd in sorted(self.get_committed_list()): - logging.debug(f" CMD {cmd} executed {self.executions[cmd]} times") - logging.debug(f"Executions|{cmd},{self.executions[cmd]}") - logging.debug(f" Total (re)executions: {sum(list(self.executions.values()))}") - logging.debug(f"TotalExec|{sum(list(self.executions.values()))}") - logging.debug("--------------------------------------") - - -## TODO: Try to move those to PaSh and import them here -def parse_cmd_from_file(file_path: str) -> "tuple[str,list[AstNode]]": - logging.debug(f'Parsing: {file_path}') - with open(file_path) as f: - cmd = f.read() - asts = analysis.parse_shell_to_asts(file_path) - return cmd, asts - -def parse_edge_line(line: str) -> "tuple[int, int]": - from_str, to_str = line.split(" -> ") - return (int(from_str), int(to_str)) - -def parse_loop_context_line(line: str) -> "tuple[int, list[int]]": - node_id, loop_contexts_raw = line.split("-loop_ctx-") - if loop_contexts_raw != "": - loop_contexts_str = loop_contexts_raw.split(",") - loop_contexts = [int(loop_ctx) for loop_ctx in loop_contexts_str] - else: - loop_contexts = [] - return int(node_id), loop_contexts - -def parse_loop_contexts(lines): - loop_contexts = {} - for line in lines: - node_id, loop_ctx = parse_loop_context_line(line) - loop_contexts[node_id] = loop_ctx - - return loop_contexts - -def parse_partial_program_order_from_file(file_path: str) -> PartialProgramOrder: - with open(file_path) as f: - raw_lines = f.readlines() - - ## Filter comments and remove new lines - lines = [line.rstrip() for line in raw_lines - if not line.startswith("#")] - - ## The directory in which cmd_files are - cmds_directory = str(lines[0]) - logging.debug(f'Cmds are stored in: {cmds_directory}') - - ## The initial env file - initial_env_file = str(lines[1]) - - ## The number of nodes - number_of_nodes = int(lines[2]) - logging.debug(f'Number of po cmds: {number_of_nodes}') - - ## The loop context for each node - loop_context_start=3 - loop_context_end=number_of_nodes+3 - loop_context_lines = lines[loop_context_start:loop_context_end] - loop_contexts = parse_loop_contexts(loop_context_lines) - logging.debug(f'Loop contexts: {loop_contexts}') - - ## The rest of the lines are edge_lines - edge_lines = lines[loop_context_end:] - logging.debug(f'Edges: {edge_lines}') - - nodes = {} - for i in range(number_of_nodes): - file_path = f'{cmds_directory}/{i}' - cmd, asts = parse_cmd_from_file(file_path) - loop_ctx = loop_contexts[i] - nodes[NodeId(i)] = Node(NodeId(i), cmd, - asts=asts, - loop_context=LoopStack(loop_ctx)) + def get_committed_nodes(self): + return [node for node in self.nodes.values() if node.is_committed()] + + def get_ready_nodes(self): + return [node for node in self.nodes.values() if node.is_ready()] + + def get_executing_nodes(self): + return [node for node in self.nodes.values() if node.is_executing()] + + def get_spec_executing_nodes(self): + return [node for node in self.nodes.values() if node.is_spec_executing()] + + def get_executing_normal_and_speculated_nodes(self): + return [node for node in self.nodes.values() if node.is_executing() or node.is_spec_executing()] + + def get_speculated_nodes(self): + return [node for node in self.nodes.values() if node.is_speculated()] + + def get_uncommitted_nodes(self): + return [node for node in self.nodes.values() if not node.is_committed()] - edges = {NodeId(i) : [] for i in range(number_of_nodes)} - for edge_line in edge_lines: - from_id, to_id = parse_edge_line(edge_line) - edges[NodeId(from_id)].append(NodeId(to_id)) + def log_state(self): + for node in self.nodes.values(): + logging.info(f"Node {node.id}: {node.state}") - logging.trace(f"Nodes|{','.join([str(node) for node in nodes])}") - logging.trace(f"Edges|{edges}") - return PartialProgramOrder(nodes, edges, initial_env_file) + def schedule_work(self): + pass diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 436eb642..e04b2e92 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -1,9 +1,9 @@ import argparse import logging import signal -from util import * +import util import config -from partial_program_order import parse_partial_program_order_from_file, LoopStack, NodeId, parse_node_id +from partial_program_order import PartialProgramOrder, NodeId ## ## A scheduler server @@ -67,7 +67,7 @@ class Scheduler: def __init__(self, socket_file): ## TODO: Add all the orchestrator state here (it should just be the partial order) self.done = False - self.socket = init_unix_socket(socket_file) + self.socket = util.init_unix_socket(socket_file) ## A map containing connections for node_ids that are waiting for a response self.waiting_for_response = {} self.partial_program_order = None @@ -76,69 +76,83 @@ def handle_init(self, input_cmd: str): assert(input_cmd.startswith("Init")) partial_order_file = input_cmd.split(":")[1].rstrip() logging.debug(f'Scheduler: Received partial_order_file: {partial_order_file}') - self.partial_program_order = parse_partial_program_order_from_file(partial_order_file) + self.partial_program_order = util.parse_partial_program_order_from_file(partial_order_file) self.partial_program_order.init_partial_order() + + def process_next_cmd(self): + connection, input_cmd = util.socket_get_next_cmd(self.socket) + + if(input_cmd.startswith("Init")): + connection.close() + self.handle_init(input_cmd) + ## TODO: Read the partial order from the given file + elif (input_cmd.startswith("Daemon Start") or input_cmd == ""): + logging.info(f'Scheduler: Received daemon start message.') + connection.close() + elif (input_cmd.startswith("CommandExecComplete:")): + node_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) + logging.info(f'Scheduler: Received command exec complete message - {node_id}.') + connection.close() + elif (input_cmd.startswith("Wait")): + node_id, _ = self.__parse_wait(input_cmd) + self.waiting_for_response[node_id] = connection + logging.info(f'Scheduler: Received wait message - {node_id}.') + self.respond_to_pending_wait(node_id) + + elif (input_cmd.startswith("Done")): + # if not self.partial_program_order.is_completed(): + # logging.debug(" |- some nodes were skipped completed.") + util.socket_respond(connection, success_response("All finished!")) + self.partial_program_order.log_state() + self.done = True + elif input_cmd.startswith("CommandExecStart:"): + node_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) + logging.info(f'Scheduler: Received command exec start message - {input_cmd}.') + # self.handle_command_exec_start(input_cmd) + else: + logging.error(error_response(f'Error: Unsupported command: {input_cmd}')) + raise Exception(f'Error: Unsupported command: {input_cmd}') + + def respond_to_frontend_core(self, node_id: NodeId, response: str): + assert(node_id in self.waiting_for_response) + ## Get the connection that we need to respond to + connection = self.waiting_for_response.pop(node_id) + util.socket_respond(connection, response) + connection.close() + + def respond_to_pending_wait(self, node_id: int): + logging.debug(f'Responding to pending wait for node: {node_id}') + ## Get the completed node info + node = self.partial_program_order.get_node(node_id) + completed_node_info = node.get_main_sandbox() + # George: Currently I don't init the sandbox info anywhere since there is no execution + msg = f'{completed_node_info.get_exit_code()} {completed_node_info.get_post_execution_env_file()} {completed_node_info.get_stdout_file()}' + response = success_response(msg) + ## Send the response + self.respond_to_frontend_core(node_id, response) + def __parse_wait(self, input_cmd: str) -> "tuple[NodeId, str]": try: node_id_component, loop_iter_counter_component, pash_runtime_vars_file_component = input_cmd.rstrip().split("|") raw_node_id_int = int(node_id_component.split(":")[1].rstrip()) loop_counters_str = loop_iter_counter_component.split(":")[1].rstrip() pash_runtime_vars_file_str = pash_runtime_vars_file_component.split(":")[1].rstrip() - if loop_counters_str == "None": - node_id = NodeId(raw_node_id_int), pash_runtime_vars_file_str - else: - loop_counters = [int(cnt) for cnt in loop_counters_str.split("-")] - node_id = NodeId(raw_node_id_int, LoopStack(loop_counters)), pash_runtime_vars_file_str + # TODO Implement loops correctly + # if loop_counters_str == "None": + # node_id = NodeId(raw_node_id_int), pash_runtime_vars_file_str + # else: + # loop_counters = [int(cnt) for cnt in loop_counters_str.split("-")] + # node_id = NodeId(raw_node_id_int, LoopStack(loop_counters)), pash_runtime_vars_file_str + node_id = NodeId(raw_node_id_int), pash_runtime_vars_file_str return node_id except: raise Exception(f'Parsing failure for line: {input_cmd}') - - def handle_wait(self, input_cmd: str, connection): - assert(input_cmd.startswith("Wait")) - ## We have received this message by the JIT, which waits for a node_id to - ## finish execution. - node_id, pash_runtime_vars_file_str = self.__parse_wait(input_cmd) - logging.debug(f'Scheduler: Received wait for node_id: {node_id}|New env file: {pash_runtime_vars_file_str}') - - ## Set the new env file for the node - self.partial_program_order.set_new_env_file_for_node(node_id, pash_runtime_vars_file_str) - if self.partial_program_order.is_first_node_when_env_is_uninitialized(config.SPECULATE_IMMEDIATELY): - logging.debug("Initializing latest env and speculating") - self.partial_program_order.init_latest_env_files(node_id) - - ## Attempt to rerun all pending nodes - self.partial_program_order.attempt_rerun_pending_nodes() - - ## Inform the partial order that we received a wait for a node so that it can push loops - ## forward and so on. - self.partial_program_order.maybe_unroll(node_id) - - # Moved this below wait_received, in order to support unrolled loop nodes - self.partial_program_order.maybe_resolve_most_recent_envs_and_continue_resolution(node_id) - - self.partial_program_order.wait_received(node_id) - - ## If the node_id is already committed, just return its exit code - if node_id in self.partial_program_order.get_committed(): - logging.debug(f'Node: {node_id} found in committed, responding immediately!') - self.waiting_for_response[node_id] = connection - self.respond_to_pending_wait(node_id) - elif node_id in self.partial_program_order.get_unsafe(): - logging.debug(f'Node: {node_id} found in unsafe, it must be executed in the original shell!') - self.waiting_for_response[node_id] = connection - self.respond_unsafe_to_pending_wait(node_id) - else: - ## Command has not executed yet, so we need to wait for it - logging.debug(f'Node: {node_id} has not finished execution, waiting for response...') - self.waiting_for_response[node_id] = connection - - def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]": try: components = input_cmd.rstrip().split("|") - command_id = parse_node_id(components[0].split(":")[1]) + command_id = NodeId.parse_node_id(components[0].split(":")[1]) exit_code = int(components[1].split(":")[1]) sandbox_dir = components[2].split(":")[1] trace_file = components[3].split(":")[1] @@ -146,126 +160,10 @@ def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]": except: raise Exception(f'Parsing failure for line: {input_cmd}') - def respond_unsafe_to_pending_wait(self, node_id: int): - assert(node_id in self.partial_program_order.get_unsafe()) - ## First remove node_id from unsafe and stopped and add to committed - ## since it will be executed immediately in the original shell - self.partial_program_order.remove_from_unsafe(node_id) - self.partial_program_order.commit_node(node_id) - - response = unsafe_response("") - - ## Send the response - self.respond_to_frontend_core(node_id, response) - - - ## TODO: send riker env here - def respond_to_pending_wait(self, node_id: int): - logging.debug(f'Responding to pending wait for node: {node_id}') - ## Get the completed node info - node = self.partial_program_order.get_node(node_id) - completed_node_info = node.get_completed_node_info() - msg = f'{completed_node_info.get_exit_code()} {completed_node_info.get_post_execution_env_file()} {completed_node_info.get_stdout_file()}' - response = success_response(msg) - ## Send the response - self.respond_to_frontend_core(node_id, response) - - - def respond_to_frontend_core(self, node_id: NodeId, response: str): - assert(node_id in self.waiting_for_response) - ## Get the connection that we need to respond to - connection = self.waiting_for_response.pop(node_id) - socket_respond(connection, response) - connection.close() - - def handle_command_exec_start(self, input_cmd): - assert(input_cmd.startswith("CommandExecStart:")) - cmd_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) - self.partial_program_order.set_sandbox(cmd_id, sandbox_dir) - - def handle_command_exec_complete(self, input_cmd: str): - assert(input_cmd.startswith("CommandExecComplete:")) - ## Read the node id from the command argument - cmd_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) - if trace_file in self.partial_program_order.banned_files: - logging.debug(f'CommandExecComplete: {cmd_id} ignored') - return - ## Gather RWset, resolve dependencies, and progress graph - self.partial_program_order.command_execution_completed(cmd_id, exit_code, sandbox_dir) - - ## If there is a connection waiting for this node_id, respond to it - if cmd_id in self.waiting_for_response and cmd_id in self.partial_program_order.get_committed(): - self.respond_to_pending_wait(cmd_id) - - def process_next_cmd(self): - connection, input_cmd = socket_get_next_cmd(self.socket) - - if(input_cmd.startswith("Init")): - log_time_delta_from_start_and_set_named_timestamp("Scheduler", "PartialOrderInit") - connection.close() - self.handle_init(input_cmd) - ## TODO: Read the partial order from the given file - log_time_delta_from_named_timestamp("Scheduler", "PartialOrderInit") - elif (input_cmd.startswith("Daemon Start") or input_cmd == ""): - log_time_delta_from_start_and_set_named_timestamp("Scheduler", "DaemonStart") - connection.close() - ## This happens when pa.sh first connects to daemon to see if it is on - logging.debug(f'PaSh made first contact with scheduler server.') - log_time_delta_from_named_timestamp("Scheduler", "DaemonStart") - elif (input_cmd.startswith("CommandExecComplete:")): - log_time_delta_from_start_and_set_named_timestamp("Scheduler", "CommandExecComplete") - ## We have received this message from an a runner (tracer +isolation) - ## The runner should have already parsed RWsets and serialized them to - ## a file. - connection.close() - self.handle_command_exec_complete(input_cmd) - log_time_delta_from_named_timestamp("Scheduler", "CommandExecComplete") - elif (input_cmd.startswith("Wait")): - log_time_delta_from_start_and_set_named_timestamp("Scheduler", "Wait") - self.handle_wait(input_cmd, connection) - log_time_delta_from_named_timestamp("Scheduler", "Wait") - elif (input_cmd.startswith("Done")): - log_time_delta_from_start_and_set_named_timestamp("Scheduler", "Done") - logging.debug(f'Scheduler server received shutdown message.') - logging.debug(f'The partial order was successfully completed.') - if not self.partial_program_order.is_completed(): - logging.debug(" |- some nodes were skipped completed.") - socket_respond(connection, success_response("All finished!")) - self.partial_program_order.log_executions() - self.done = True - log_time_delta_from_named_timestamp("Scheduler", "Done") - elif input_cmd.startswith("CommandExecStart:"): - #TODO: add logging stuff - self.handle_command_exec_start(input_cmd) - else: - logging.error(error_response(f'Error: Unsupported command: {input_cmd}')) - raise Exception(f'Error: Unsupported command: {input_cmd}') - - def check_unsafe_and_waiting(self): - ## If a command is waiting and also deemed to be unsafe, we need to respond - waiting_for_response = set(self.waiting_for_response.keys()) - unsafe = set(self.partial_program_order.get_unsafe()) - unsafe_and_waiting = unsafe.intersection(waiting_for_response) - if len(unsafe_and_waiting) > 0: - assert(len(unsafe_and_waiting) == 1) - logging.debug(f'Unsafe and waiting for response nodes: {unsafe_and_waiting}') - logging.debug(f'Sending responses to them: {unsafe_and_waiting}') - unsafe_and_waiting_id = list(unsafe_and_waiting)[0] - self.respond_unsafe_to_pending_wait(unsafe_and_waiting_id) - - ## This function schedules commands for execution until our capacity is reached - ## - ## It should add some work (if possible), and then return immediately. - ## It is called once per loop iteration, making sure that there is always work happening def schedule_work(self): - log_time_delta_from_start_and_set_named_timestamp("Scheduler", "ScheduleWork") self.partial_program_order.schedule_work() - ## Respond to any waiting nodes that have been deemed to be unsafe - self.check_unsafe_and_waiting() - log_time_delta_from_named_timestamp("Scheduler", "ScheduleWork") - def run(self): ## The first command should be the daemon start self.process_next_cmd() @@ -273,17 +171,9 @@ def run(self): ## The second command should be the partial order init self.process_next_cmd() - while not self.done: - # TODO: wrap this around something probably - self.partial_program_order.early_stop_using_dep() - - ## Schedule some work (if we are already at capacity this will return immediately) self.schedule_work() - ## Process a single request self.process_next_cmd() - # If workset is empty we should end. - # TODO: ec checks fail for now self.socket.close() self.shutdown() @@ -295,13 +185,12 @@ def shutdown(self): self.terminate_pending_commands() def terminate_pending_commands(self): - for _node_id, cmd_info in self.partial_program_order.commands_currently_executing.items(): - proc, _trace_file, _stdout, _stderr, _variable_file, _ = cmd_info - proc.terminate() - + for node in self.partial_program_order.get_executing_normal_and_speculated_nodes(): + proc, _trace_file, _stdout, _stderr, _variable_file, _ = node.get_main_sandbox() + logging.debug(f'Killing: {proc}') + # proc.terminate() def main(): - log_time_delta_from_start("Scheduler", "Scheduler Init") args = init() # Format logging @@ -318,9 +207,7 @@ def main(): logging.getLogger().setLevel(logging.INFO) elif args.debug_level >= 2: logging.getLogger().setLevel(logging.DEBUG) - # elif args.debug_level >= 3: - # logging.getLogger().setLevel(logging.TRACE) - + # Set optimization options config.SANDBOX_KILLING = args.sandbox_killing config.SPECULATE_IMMEDIATELY = args.speculate_immediately diff --git a/parallel-orch/util.py b/parallel-orch/util.py index 799cd379..af01273f 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -8,6 +8,9 @@ import re import psutil import signal +import analysis +from node import Node, NodeId +from partial_program_order import PartialProgramOrder def ptempfile(): fd, name = tempfile.mkstemp(dir=config.PASH_SPEC_TMP_PREFIX) @@ -176,3 +179,82 @@ def kill_process_tree(pid, sig=signal.SIGTERM): except: pass return alive_processes + + +## TODO: Try to move those to PaSh and import them here +def parse_cmd_from_file(file_path: str) -> "tuple[str,list[AstNode]]": + logging.debug(f'Parsing: {file_path}') + with open(file_path) as f: + cmd = f.read() + asts = analysis.parse_shell_to_asts(file_path) + return cmd, asts + +def parse_edge_line(line: str) -> "tuple[int, int]": + from_str, to_str = line.split(" -> ") + return (int(from_str), int(to_str)) + +def parse_loop_context_line(line: str) -> "tuple[int, list[int]]": + node_id, loop_contexts_raw = line.split("-loop_ctx-") + if loop_contexts_raw != "": + loop_contexts_str = loop_contexts_raw.split(",") + loop_contexts = [int(loop_ctx) for loop_ctx in loop_contexts_str] + else: + loop_contexts = [] + return int(node_id), loop_contexts + +def parse_loop_contexts(lines): + loop_contexts = {} + for line in lines: + node_id, loop_ctx = parse_loop_context_line(line) + loop_contexts[node_id] = loop_ctx + return loop_contexts + + +def parse_partial_program_order_from_file(file_path: str): + with open(file_path) as f: + raw_lines = f.readlines() + + ## Filter comments and remove new lines + lines = [line.rstrip() for line in raw_lines + if not line.startswith("#")] + + ## The directory in which cmd_files are + cmds_directory = str(lines[0]) + logging.debug(f'Cmds are stored in: {cmds_directory}') + + ## The initial env file + initial_env_file = str(lines[1]) + + ## The number of nodes + number_of_nodes = int(lines[2]) + logging.debug(f'Number of po cmds: {number_of_nodes}') + + ## The loop context for each node + loop_context_start=3 + loop_context_end=number_of_nodes+3 + loop_context_lines = lines[loop_context_start:loop_context_end] + loop_contexts = parse_loop_contexts(loop_context_lines) + logging.debug(f'Loop contexts: {loop_contexts}') + + ## The rest of the lines are edge_lines + edge_lines = lines[loop_context_end:] + logging.debug(f'Edges: {edge_lines}') + + nodes = {} + for i in range(number_of_nodes): + file_path = f'{cmds_directory}/{i}' + cmd, asts = parse_cmd_from_file(file_path) + # loop_ctx = loop_contexts[i] + # nodes[NodeId(i)] = Node(NodeId(i), cmd, + # asts=asts, + # loop_context=LoopStack(loop_ctx)) + nodes[NodeId(i)] = Node(NodeId(i), cmd, asts=asts) + + edges = {NodeId(i) : [] for i in range(number_of_nodes)} + for edge_line in edge_lines: + from_id, to_id = parse_edge_line(edge_line) + edges[NodeId(from_id)].append(NodeId(to_id)) + + logging.info(f"Nodes|{','.join([str(node) for node in nodes])}") + logging.info(f"Edges|{edges}") + return PartialProgramOrder(nodes, edges) \ No newline at end of file From c82cf3b7cfb5c54bca5620065884dd6379371a54 Mon Sep 17 00:00:00 2001 From: George Liargkovas Date: Fri, 12 Jan 2024 00:01:13 +0200 Subject: [PATCH 13/39] Progress the refactor --- parallel-orch/node.py | 9 +- parallel-orch/partial_program_order.py | 134 ++++++++++++++++++++++++- parallel-orch/scheduler_server.py | 10 +- 3 files changed, 149 insertions(+), 4 deletions(-) diff --git a/parallel-orch/node.py b/parallel-orch/node.py index 616e1b58..1401b988 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -136,6 +136,11 @@ def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"): self.background_sandbox = None + def __str__(self): + return f'Node(id:{self.id}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, main_sandbox:{self.main_sandbox}, background_sandbox:{self.background_sandbox})' + + def __repr__(self): + return str(self) def is_initialized(self): return self.state == NodeState.INIT @@ -169,11 +174,13 @@ def get_main_sandbox(self): ## Transition Functions ## ## ## - def transition_to_ready(self): + def transition_from_init_to_ready(self): assert self.state == NodeState.INIT self.state = NodeState.READY # Initialize data structures here + # Also, probably unroll here? + def transition_to_executing(self): assert self.state == NodeState.READY self.state = NodeState.EXECUTING diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index e4728d36..23d41589 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -1,5 +1,6 @@ from node import NodeId, Node import logging +from collections import deque class PartialProgramOrder: @@ -20,9 +21,24 @@ def __init__(self, nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId self.window = 0 self.to_be_resolved = {} + # def init_partial_order(self): + # self.init_workset() + # logging.debug(f'Initialized workset') + # self.populate_to_be_resolved_dict() + # if config.SPECULATE_IMMEDIATELY: + # self.init_latest_env_files() + # logging.debug(f'To be resolved sets per node:') + # logging.debug(self.to_be_resolved) + # logging.info(f'Initialized the partial order!') + # # self.log_partial_program_order_info() + # assert(self.valid()) + def init_partial_order(self): for node_id, node in self.nodes.items(): - node.transition_to_ready() + if node.is_initialized(): + node.transition_from_init_to_ready() + + self.frontier = self.get_standard_source_nodes() # TODO: Implement the rest of the partial order initialization def commit_node(self, node): @@ -61,6 +77,15 @@ def get_speculated_nodes(self): def get_uncommitted_nodes(self): return [node for node in self.nodes.values() if not node.is_committed()] + + def get_frontier(self): + return self.frontier + + def log_info(self): + logging.info(f"Nodes: {self.nodes}") + logging.info(f"Adjacency: {self.adjacency}") + logging.info(f"Inverse adjacency: {self.inverse_adjacency}") + self.log_state() def log_state(self): for node in self.nodes.values(): @@ -68,3 +93,110 @@ def log_state(self): def schedule_work(self): pass + + def get_source_nodes(self) -> list: + sources = set() + for to_id, from_ids in self.inverse_adjacency.items(): + if len(from_ids) == 0: + sources.add(to_id) + return list(sources) + + ## Returns the next non-committed normal node + def progress_frontier(self) -> "list[NodeId]": + return self.get_next_frontier_nodes(self.get_frontier()) + + def get_next_nodes(self, node_id:NodeId) -> "list[NodeId]": + return self.adjacency[node_id][:] + + def get_prev_nodes(self, node_id:NodeId) -> "list[NodeId]": + return self.inverse_adjacency[node_id][:] + + def get_source_nodes(self) -> list: + sources = set() + for to_id, from_ids in self.inverse_adjacency.items(): + if len(from_ids) == 0: + sources.add(to_id) + return list(sources) + + def get_standard_source_nodes(self) -> list: + source_nodes = self.get_source_nodes() + # TODO: Filter out loop nodes + # return self.filter_standard_nodes(source_nodes) + return source_nodes + + + + + def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]": + # TODO: filter non-loop nodes + visited = set() + to_visit = [(node_id, 0) for node_id in start_nodes] # Pair each start node with depth 0 + non_committed_nodes = set() + first_non_committed_depth = None + + while to_visit: + current_node_id, depth = to_visit.pop() + if current_node_id in visited: + continue + + visited.add(current_node_id) + current_node = self.nodes.get(current_node_id) + + if not current_node.is_committed(): + if first_non_committed_depth is None: + first_non_committed_depth = depth + elif depth > first_non_committed_depth: + # Do not consider nodes deeper than the first non-committed depth + continue + + non_committed_nodes.add(current_node_id) + + if first_non_committed_depth is None or depth < first_non_committed_depth: + next_nodes = self.get_next_nodes(current_node_id) # Use the provided method to get next nodes + for neighbor in next_nodes: + if neighbor not in visited: + to_visit.append((neighbor, depth + 1)) # Increase depth for neighbors + + return non_committed_nodes + + def get_all_next(self, current_node_id: NodeId, visited=None) -> "set[NodeId]": + if visited is None: + visited = set() + visited.add(current_node_id) + + all_next_nodes = set([current_node_id]) + for neighbor in self.get_next_nodes(current_node_id): + if neighbor not in visited: + all_next_nodes.update(self.get_all_next(neighbor, visited)) + + return all_next_nodes + + + def get_all_previous(self, current_node_id: NodeId, visited=None) -> "set[NodeId]": + if visited is None: + visited = set() + visited.add(current_node_id) + + all_previous_nodes = set([current_node_id]) + for neighbor in self.get_prev_nodes(current_node_id): + if neighbor not in visited: + all_previous_nodes.update(self.get_all_previous(neighbor, visited)) + + return all_previous_nodes + + def get_all_previous_uncommitted(self, node_id: NodeId) -> "set[NodeId]": + previous = self.get_all_previous(node_id) + return set([node for node in previous if not self.nodes[node].is_committed()]) + + + def init_to_be_resolved_dict(self): + for node_id in self.nodes.keys(): + self.to_be_resolved[node_id] = ... + + def init_to_be_resolved_dict(self): + for node_id in self.nodes.keys(): + self.to_be_resolved[node_id] = ... + + + def adjust_to_be_resolved_dict_entry(self, node_id: NodeId): + diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index e04b2e92..e1de16fa 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -104,8 +104,12 @@ def process_next_cmd(self): # if not self.partial_program_order.is_completed(): # logging.debug(" |- some nodes were skipped completed.") util.socket_respond(connection, success_response("All finished!")) - self.partial_program_order.log_state() + self.partial_program_order.log_info() self.done = True + nodes = self.partial_program_order.nodes + for k, v in nodes.items(): + logging.info(self.partial_program_order.progress_frontier()) + logging.info(f"{k} {self.partial_program_order.get_next_frontier_nodes([k])}") elif input_cmd.startswith("CommandExecStart:"): node_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) logging.info(f'Scheduler: Received command exec start message - {input_cmd}.') @@ -127,8 +131,10 @@ def respond_to_pending_wait(self, node_id: int): node = self.partial_program_order.get_node(node_id) completed_node_info = node.get_main_sandbox() # George: Currently I don't init the sandbox info anywhere since there is no execution - msg = f'{completed_node_info.get_exit_code()} {completed_node_info.get_post_execution_env_file()} {completed_node_info.get_stdout_file()}' + # msg = f'{completed_node_info.get_exit_code()} {completed_node_info.get_post_execution_env_file()} {completed_node_info.get_stdout_file()}' + msg = "0 foo bar bax qux" response = success_response(msg) + ## Send the response self.respond_to_frontend_core(node_id, response) From 6b3140a373fb716efed5ad662ebf5a0a204d1274 Mon Sep 17 00:00:00 2001 From: Guest Date: Fri, 12 Jan 2024 01:29:28 -0500 Subject: [PATCH 14/39] basic sequential execution --- parallel-orch/node.py | 55 +++++++++++++++++++++----- parallel-orch/partial_program_order.py | 13 +++--- parallel-orch/scheduler_server.py | 19 +++++---- 3 files changed, 63 insertions(+), 24 deletions(-) diff --git a/parallel-orch/node.py b/parallel-orch/node.py index 1401b988..0454a9a1 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -1,3 +1,7 @@ +import executor +from dataclasses import dataclass +from subprocess import Popen +from typing import Tuple from enum import Enum, auto class NodeState(Enum): @@ -102,9 +106,23 @@ def __gt__(self, obj): def parse_node_id(node_id_str: str): return NodeId(int(node_id_str)) - +@dataclass +class ExecCtxt: + process: Popen + trace_file: str + stdout: str + stderr: str + post_env_file: str + sandbox_dir: str + +@dataclass +class ExecResult: + exit_code: int + proc_id: int + + class Node: - id: NodeId + id_: NodeId cmd: str asts: "list[AstNode]" state: NodeState @@ -119,25 +137,27 @@ class Node: main_sandbox: Sandbox # This can only be set while in the frontier and the background node execution is enabled background_sandbox: Sandbox - + exec_ctxt: ExecCtxt + exec_result: ExecResult def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"): - self.id = node_id + self.id_ = node_id self.cmd = cmd self.asts = asts # The node's state self.state = NodeState.INIT self.tracefile = None self.rwset = None - # The + # The self.to_be_resolved_snapshot = None self.main_sandbox = None self.background_sandbox = None + self.exec_ctxt = None def __str__(self): - return f'Node(id:{self.id}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, main_sandbox:{self.main_sandbox}, background_sandbox:{self.background_sandbox})' + return f'Node(id:{self.id_}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, main_sandbox:{self.main_sandbox}, background_sandbox:{self.background_sandbox})' def __repr__(self): return str(self) @@ -169,7 +189,18 @@ def is_unsafe(self): def get_main_sandbox(self): return self.main_sandbox - + + def start_command(self, env_file: str, speculate=False): + # TODO: implement speculate + # TODO: built-in commands + cmd = self.cmd + execute_func = executor.async_run_and_trace_command_return_trace + self.exec_ctxt = ExecCtxt(*execute_func(cmd, self.id_, env_file)) + + def execution_outcome(self) -> Tuple[int, str, str]: + assert self.exec_result is not None + return self.exec_result.exit_code, self.exec_ctxt.post_env_file, self.exec_ctxt.stdout + ## ## ## Transition Functions ## ## ## @@ -181,11 +212,17 @@ def transition_from_init_to_ready(self): # Also, probably unroll here? - def transition_to_executing(self): + def start_executing(self, env_file): assert self.state == NodeState.READY + self.start_command(env_file) self.state = NodeState.EXECUTING - # TODO + def commit_frontier_execution(self): + assert self.state == NodeState.EXECUTING + self.state = NodeState.COMMITTED + self.exec_result = ExecResult(self.exec_ctxt.process.pid, self.exec_ctxt.process.returncode) + executor.commit_workspace(self.exec_ctxt.sandbox_dir) + def transition_to_spec_executing(self): assert self.state == NodeState.READY self.state = NodeState.SPEC_EXECUTING diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 23d41589..e4e8d3ed 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -89,10 +89,10 @@ def log_info(self): def log_state(self): for node in self.nodes.values(): - logging.info(f"Node {node.id}: {node.state}") + logging.info(f"Node {node.id_}: {node.state}") - def schedule_work(self): - pass + def schedule_work(self, node_id: NodeId, env_file: str): + self.get_node(node_id).start_executing(env_file) def get_source_nodes(self) -> list: sources = set() @@ -122,10 +122,7 @@ def get_standard_source_nodes(self) -> list: source_nodes = self.get_source_nodes() # TODO: Filter out loop nodes # return self.filter_standard_nodes(source_nodes) - return source_nodes - - - + return source_nodes def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]": # TODO: filter non-loop nodes @@ -199,4 +196,4 @@ def init_to_be_resolved_dict(self): def adjust_to_be_resolved_dict_entry(self, node_id: NodeId): - + pass diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index e1de16fa..80dd1206 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -92,13 +92,19 @@ def process_next_cmd(self): connection.close() elif (input_cmd.startswith("CommandExecComplete:")): node_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) - logging.info(f'Scheduler: Received command exec complete message - {node_id}.') connection.close() + logging.info(f'Scheduler: Received command exec complete message - {node_id}.') + node = self.partial_program_order.get_node(node_id) + # TODO: condition here to do different things based on node state + node.commit_frontier_execution() + self.respond_to_pending_wait(node_id) elif (input_cmd.startswith("Wait")): - node_id, _ = self.__parse_wait(input_cmd) + node_id, env_file = self.__parse_wait(input_cmd) self.waiting_for_response[node_id] = connection logging.info(f'Scheduler: Received wait message - {node_id}.') - self.respond_to_pending_wait(node_id) + node = self.partial_program_order.get_node(node_id) + # TODO: condition here to do different things based on node state + self.partial_program_order.schedule_work(node_id, env_file) elif (input_cmd.startswith("Done")): # if not self.partial_program_order.is_completed(): @@ -130,9 +136,7 @@ def respond_to_pending_wait(self, node_id: int): ## Get the completed node info node = self.partial_program_order.get_node(node_id) completed_node_info = node.get_main_sandbox() - # George: Currently I don't init the sandbox info anywhere since there is no execution - # msg = f'{completed_node_info.get_exit_code()} {completed_node_info.get_post_execution_env_file()} {completed_node_info.get_stdout_file()}' - msg = "0 foo bar bax qux" + msg = '{} {} {}'.format(*node.execution_outcome()) response = success_response(msg) ## Send the response @@ -168,7 +172,8 @@ def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]": def schedule_work(self): - self.partial_program_order.schedule_work() + # self.partial_program_order.schedule_work() + pass def run(self): ## The first command should be the daemon start From 91901e75af5c26ea05033df643c988ab45a21737 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Sat, 13 Jan 2024 15:24:05 -0700 Subject: [PATCH 15/39] Cleanup and implement transitions triggered by Wait --- parallel-orch/executor.py | 14 ++-- parallel-orch/node.py | 105 ++++++++++--------------- parallel-orch/partial_program_order.py | 96 ++++++++++++++-------- parallel-orch/scheduler_server.py | 35 ++++----- 4 files changed, 129 insertions(+), 121 deletions(-) diff --git a/parallel-orch/executor.py b/parallel-orch/executor.py index 2d58deaf..93439c4b 100644 --- a/parallel-orch/executor.py +++ b/parallel-orch/executor.py @@ -8,7 +8,7 @@ # and traces them with Riker. # All commands are run inside an overlay sandbox. -def async_run_and_trace_command_return_trace(command, node_id, latest_env_file, speculate_mode=False): +def async_run_and_trace_command_return_trace(command, node_id, pre_execution_env_file, speculate_mode=False): trace_file = util.ptempfile() stdout_file = util.ptempfile() stderr_file = util.ptempfile() @@ -17,17 +17,17 @@ def async_run_and_trace_command_return_trace(command, node_id, latest_env_file, logging.debug(f'Scheduler: Stdout file for: {node_id} is: {stdout_file}') logging.debug(f'Scheduler: Stderr file for: {node_id} is: {stderr_file}') logging.debug(f'Scheduler: Trace file for: {node_id}: {trace_file}') - process = async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode) - return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir + process = async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode) + return process, trace_file, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir -def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, node_id, latest_env_file): - process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, node_id, latest_env_file, speculate_mode=True) +def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, node_id, pre_execution_env_file): + process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, node_id, pre_execution_env_file, speculate_mode=True) return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir -def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False): +def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False): ## Call Riker to execute the command run_script = f'{config.PASH_SPEC_TOP}/parallel-orch/run_command.sh' - args = ["/bin/bash", run_script, command, trace_file, stdout_file, latest_env_file, sandbox_dir, tmp_dir] + args = ["/bin/bash", run_script, command, trace_file, stdout_file, pre_execution_env_file, sandbox_dir, tmp_dir] if speculate_mode: args.append("speculate") else: diff --git a/parallel-orch/node.py b/parallel-orch/node.py index 0454a9a1..b0008661 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -1,3 +1,4 @@ +import logging import executor from dataclasses import dataclass from subprocess import Popen @@ -14,40 +15,6 @@ class NodeState(Enum): SPEC_EXECUTING = auto() UNSAFE = auto() -class Sandbox: - def __init__(self, trace_file, exit_code, post_execution_env_file, stdout_file, sandbox_dir): - # These get predetermined prior to the execution - self.trace_file = trace_file - self.post_execution_env_file = post_execution_env_file - self.stdout_file = stdout_file - self.sandbox_dir = sandbox_dir - # These get set after execution is done - self.exit_code = None - self.proc_id = None - - def set_exit_code(self, exit_code): - self.exit_code = exit_code - - def set_proc_id(self, proc_id): - self.proc_id = proc_id - - def get_exit_code(self): - return self.exit_code - - def get_post_execution_env_file(self): - return self.post_execution_env_file - - def get_stdout_file(self): - return self.stdout_file - - def get_sandbox_dir(self): - return self.sandbox_dir - - def get_trace_file(self): - return self.trace_file - - def __str__(self): - return f'Sandbox(trace:{self.get_trace_file}, ec:{self.get_exit_code()}, env:{self.get_post_execution_env_file()}, stdout:{self.get_stdout_file()}, sandbox:{self.get_sandbox_dir()})' class RWSet: @@ -75,15 +42,15 @@ class NodeId: #TODO: Implement iteration support - def __init__(self, id: int): - self.id = id + def __init__(self, id_: int): + self.id_ = id_ def get_non_iter_id(self): - return NodeId(self.id) + return NodeId(self.id_) def __repr__(self): ## TODO: Represent it using n. - output = f'{self.id}' + output = f'{self.id_}' return output def __hash__(self): @@ -91,7 +58,7 @@ def __hash__(self): def __eq__(self, other): # return self.loop_iters == other.loop_iters and self.id == other.id - return self.id == other.id + return self.id_ == other.id_ def __ne__(self, other): return not(self == other) @@ -112,6 +79,7 @@ class ExecCtxt: trace_file: str stdout: str stderr: str + pre_env_file: str post_env_file: str sandbox_dir: str @@ -132,11 +100,11 @@ class Node: to_be_resolved_snapshot: "set[NodeId]" # Read and write sets for this node rwset: RWSet - # This contains the sandbox and execution info for a spec-executing node - # (or plain executing node if frontier background node execution is not enabled) - main_sandbox: Sandbox + # The wait trace file for this node + wait_env_file: str # This can only be set while in the frontier and the background node execution is enabled - background_sandbox: Sandbox + # TODO: For now ignore this. Maybe there is a better way to do this. + # background_sandbox: Sandbox exec_ctxt: ExecCtxt exec_result: ExecResult @@ -144,20 +112,15 @@ def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"): self.id_ = node_id self.cmd = cmd self.asts = asts - # The node's state self.state = NodeState.INIT self.tracefile = None self.rwset = None - # The + self.wait_env_file = None self.to_be_resolved_snapshot = None - - self.main_sandbox = None - - self.background_sandbox = None self.exec_ctxt = None def __str__(self): - return f'Node(id:{self.id_}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, main_sandbox:{self.main_sandbox}, background_sandbox:{self.background_sandbox})' + return f'Node(id:{self.id_}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, wait_env_file:{self.wait_env_file}, exec_ctxt:{self.exec_ctxt})' def __repr__(self): return str(self) @@ -185,10 +148,6 @@ def is_spec_executing(self): def is_unsafe(self): return self.state == NodeState.UNSAFE - - def get_main_sandbox(self): - return self.main_sandbox - def start_command(self, env_file: str, speculate=False): # TODO: implement speculate @@ -218,24 +177,44 @@ def start_executing(self, env_file): self.state = NodeState.EXECUTING def commit_frontier_execution(self): - assert self.state == NodeState.EXECUTING + assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING] self.state = NodeState.COMMITTED self.exec_result = ExecResult(self.exec_ctxt.process.pid, self.exec_ctxt.process.returncode) executor.commit_workspace(self.exec_ctxt.sandbox_dir) - def transition_to_spec_executing(self): + + def _attempt_start_command(self, env_file, speculate=False): + if self.wait_env_file is not None: + self.start_command(env_file=self.wait_env_file, speculate=speculate) + elif env_file is not None: + self.start_command(env_file=env_file, speculate=speculate) + else: + logging.error(f'Error: No valid execution env for Node {self.id_}') + + def transition_from_ready_to_executing(self, env_file=None): + assert self.state == NodeState.READY + self.state = NodeState.EXECUTING + self._attempt_start_command(env_file) + + def transition_from_ready_to_spec_executing(self, env_file=None): assert self.state == NodeState.READY self.state = NodeState.SPEC_EXECUTING - # TODO + self._attempt_start_command(env_file, speculate=True) + + def transition_from_stopped_to_executing(self, env_file=None): + assert self.state == NodeState.READY + self.state = NodeState.EXECUTING + self._attempt_start_command(env_file) def transition_to_committed(self): - assert self.state in [NodeState.EXECUTING, NodeState.SPECULATED] + assert self.state in NodeState.SPECULATED self.state = NodeState.COMMITTED # TODO - # TODO: other transition functions - + def transition_from_spec_executing_to_speculated(self): + pass - # Do we need this here of should we handle everything on scheduler server and ppo? - def handle_event(self, event_msg): - pass # TODO + def set_wait_env_file(self, env_file: str): + assert self.state in [NodeState.READY, NodeState.EXECUTING, NodeState.SPEC_EXECUTING, NodeState.STOP, NodeState.SPECULATED] + self.post_env_file = env_file + \ No newline at end of file diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index e4e8d3ed..b94aad2c 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -20,24 +20,13 @@ def __init__(self, nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId self.run_after = set() self.window = 0 self.to_be_resolved = {} - - # def init_partial_order(self): - # self.init_workset() - # logging.debug(f'Initialized workset') - # self.populate_to_be_resolved_dict() - # if config.SPECULATE_IMMEDIATELY: - # self.init_latest_env_files() - # logging.debug(f'To be resolved sets per node:') - # logging.debug(self.to_be_resolved) - # logging.info(f'Initialized the partial order!') - # # self.log_partial_program_order_info() - # assert(self.valid()) - + def init_partial_order(self): for node_id, node in self.nodes.items(): if node.is_initialized(): node.transition_from_init_to_ready() - + + # Init frontier self.frontier = self.get_standard_source_nodes() # TODO: Implement the rest of the partial order initialization @@ -80,7 +69,7 @@ def get_uncommitted_nodes(self): def get_frontier(self): return self.frontier - + def log_info(self): logging.info(f"Nodes: {self.nodes}") logging.info(f"Adjacency: {self.adjacency}") @@ -94,12 +83,6 @@ def log_state(self): def schedule_work(self, node_id: NodeId, env_file: str): self.get_node(node_id).start_executing(env_file) - def get_source_nodes(self) -> list: - sources = set() - for to_id, from_ids in self.inverse_adjacency.items(): - if len(from_ids) == 0: - sources.add(to_id) - return list(sources) ## Returns the next non-committed normal node def progress_frontier(self) -> "list[NodeId]": @@ -111,7 +94,7 @@ def get_next_nodes(self, node_id:NodeId) -> "list[NodeId]": def get_prev_nodes(self, node_id:NodeId) -> "list[NodeId]": return self.inverse_adjacency[node_id][:] - def get_source_nodes(self) -> list: + def get_source_nodes(self) -> "list[NodeId]": sources = set() for to_id, from_ids in self.inverse_adjacency.items(): if len(from_ids) == 0: @@ -185,15 +168,66 @@ def get_all_previous_uncommitted(self, node_id: NodeId) -> "set[NodeId]": previous = self.get_all_previous(node_id) return set([node for node in previous if not self.nodes[node].is_committed()]) + def adjust_to_be_resolved_dict_entry(self, node_id: NodeId): + node = self.nodes.get(node_id) + if node.is_committed(): + self.to_be_resolved[node_id] = [] + elif node.is_ready(): + self.to_be_resolved[node_id] = self.get_all_previous_uncommitted(node_id) + + def adjust_to_be_resolved_dict(self): + for node_id in self.to_be_resolved.keys(): + self.adjust_to_be_resolved_dict_entry(node_id) + - def init_to_be_resolved_dict(self): - for node_id in self.nodes.keys(): - self.to_be_resolved[node_id] = ... + #TODO: Add partial order invariant checks + def valid(self): + return True + + def handle_wait(self, node_id: NodeId, env_file: str): + node = self.get_node(node_id) + + # Invalid state check + if node.is_committed() or node.is_unsafe() or node.is_initialized(): + logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}') + raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}') - def init_to_be_resolved_dict(self): - for node_id in self.nodes.keys(): - self.to_be_resolved[node_id] = ... - - def adjust_to_be_resolved_dict_entry(self, node_id: NodeId): - pass + # For all the valid states, set the wait env file + # Q to @Di: Do we need to make the wait env file a node attribute + # (same for most recent env file) or is it ok to just pass it around here? + # We might use it in the future so maybe we shouldn't drop it. + node.set_wait_env_file(env_file) + + + if node.is_ready(): + if node.id_ in self.get_frontier(): + node.transition_from_ready_to_executing(env_file) + else: + node.transition_from_ready_to_spec_executing(env_file) + elif node.is_stopped(): + if node in self.get_frontier(): + logging.info(f'Node {node_id} is stopped and in the frontier.') + node.transition_from_stopped_to_executing(env_file) + else: + logging.info(f'Node {node_id} is stopped but not in the frontier.') + elif node.is_speculated(): + pass + # TODO: handle this case + # Check if env conflicts exist + # Check fs deps + # If no env or fs conflicts, then commit the node + elif node.is_executing(): + # Do nothing + pass + elif node.is_spec_executing(): + # Do nothing + pass + else: + logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}') + raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}') + + # TODO: think about this + # self.schedule_work_single_node() + # self.schedule_work_all_nodes() + diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 80dd1206..19d43865 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -65,7 +65,6 @@ class Scheduler: """ def __init__(self, socket_file): - ## TODO: Add all the orchestrator state here (it should just be the partial order) self.done = False self.socket = util.init_unix_socket(socket_file) ## A map containing connections for node_ids that are waiting for a response @@ -78,7 +77,21 @@ def handle_init(self, input_cmd: str): logging.debug(f'Scheduler: Received partial_order_file: {partial_order_file}') self.partial_program_order = util.parse_partial_program_order_from_file(partial_order_file) self.partial_program_order.init_partial_order() + + def handle_command_exec_complete(): + # TODO: Implement + pass + + def handle_command_exec_start(): + # TODO: Implement + pass + + def handle_wait(self, input_cmd: str, connection): + node_id, env_file = self.__parse_wait(input_cmd) + self.waiting_for_response[node_id] = connection + logging.info(f'Scheduler: Received wait message - {node_id}.') + self.partial_program_order.handle_wait(node_id, env_file) def process_next_cmd(self): connection, input_cmd = util.socket_get_next_cmd(self.socket) @@ -99,23 +112,11 @@ def process_next_cmd(self): node.commit_frontier_execution() self.respond_to_pending_wait(node_id) elif (input_cmd.startswith("Wait")): - node_id, env_file = self.__parse_wait(input_cmd) - self.waiting_for_response[node_id] = connection - logging.info(f'Scheduler: Received wait message - {node_id}.') - node = self.partial_program_order.get_node(node_id) - # TODO: condition here to do different things based on node state - self.partial_program_order.schedule_work(node_id, env_file) - + self.handle_wait(input_cmd, connection) elif (input_cmd.startswith("Done")): - # if not self.partial_program_order.is_completed(): - # logging.debug(" |- some nodes were skipped completed.") util.socket_respond(connection, success_response("All finished!")) self.partial_program_order.log_info() self.done = True - nodes = self.partial_program_order.nodes - for k, v in nodes.items(): - logging.info(self.partial_program_order.progress_frontier()) - logging.info(f"{k} {self.partial_program_order.get_next_frontier_nodes([k])}") elif input_cmd.startswith("CommandExecStart:"): node_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) logging.info(f'Scheduler: Received command exec start message - {input_cmd}.') @@ -135,7 +136,6 @@ def respond_to_pending_wait(self, node_id: int): logging.debug(f'Responding to pending wait for node: {node_id}') ## Get the completed node info node = self.partial_program_order.get_node(node_id) - completed_node_info = node.get_main_sandbox() msg = '{} {} {}'.format(*node.execution_outcome()) response = success_response(msg) @@ -171,10 +171,6 @@ def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]": raise Exception(f'Parsing failure for line: {input_cmd}') - def schedule_work(self): - # self.partial_program_order.schedule_work() - pass - def run(self): ## The first command should be the daemon start self.process_next_cmd() @@ -183,7 +179,6 @@ def run(self): self.process_next_cmd() while not self.done: - self.schedule_work() self.process_next_cmd() self.socket.close() From 7b1741f1f68a74781f289e2400cdf81bab7704d2 Mon Sep 17 00:00:00 2001 From: Guest Date: Sun, 14 Jan 2024 12:15:17 -0500 Subject: [PATCH 16/39] simple speculation enabled --- parallel-orch/node.py | 102 ++++++++++++++++----- parallel-orch/partial_program_order.py | 117 +++++++++++++++++-------- parallel-orch/scheduler_server.py | 20 +++-- 3 files changed, 175 insertions(+), 64 deletions(-) diff --git a/parallel-orch/node.py b/parallel-orch/node.py index b0008661..65b2ce2b 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -1,5 +1,6 @@ import logging import executor +import trace_v2 from dataclasses import dataclass from subprocess import Popen from typing import Tuple @@ -34,6 +35,14 @@ def get_read_set(self) -> set: def get_write_set(self) -> set: return self.write_set + def has_conflict(self, other: 'RWSet') -> bool: + if (self.write_set.intersection(other.read_set) or + self.read_set.intersection(other.write_set) or + self.write_set.intersection(other.write_set)): + return True + else: + return False + def __str__(self): return f"RW(R:{self.get_read_set()}, W:{self.get_write_set()})" @@ -159,7 +168,8 @@ def start_command(self, env_file: str, speculate=False): def execution_outcome(self) -> Tuple[int, str, str]: assert self.exec_result is not None return self.exec_result.exit_code, self.exec_ctxt.post_env_file, self.exec_ctxt.stdout - + + ## ## ## Transition Functions ## ## ## @@ -171,35 +181,61 @@ def transition_from_init_to_ready(self): # Also, probably unroll here? + def reset_to_ready(self): + assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING, + NodeState.SPECULATED] + # Probably delete them from tmpfs too + self.exec_ctxt = None + self.exec_result = None + self.rwset = None + self.state = NodeState.READY + def start_executing(self, env_file): assert self.state == NodeState.READY self.start_command(env_file) self.state = NodeState.EXECUTING + def start_spec_executing(self, env_file): + assert self.state == NodeState.READY + self.start_command(env_file, speculate=True) + self.state = NodeState.SPEC_EXECUTING + def commit_frontier_execution(self): - assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING] + assert self.state == NodeState.EXECUTING + self.exec_result = ExecResult(self.exec_ctxt.process.pid, self.exec_ctxt.process.returncode) + self.gather_fs_actions() + executor.commit_workspace(self.exec_ctxt.sandbox_dir) self.state = NodeState.COMMITTED + + def finish_spec_execution(self): + assert self.state == NodeState.SPEC_EXECUTING self.exec_result = ExecResult(self.exec_ctxt.process.pid, self.exec_ctxt.process.returncode) + self.gather_fs_actions() + self.state = NodeState.SPECULATED + + + def commit_speculated(self): + assert self.state == NodeState.SPECULATED executor.commit_workspace(self.exec_ctxt.sandbox_dir) - + self.state = NodeState.COMMITTED - def _attempt_start_command(self, env_file, speculate=False): - if self.wait_env_file is not None: - self.start_command(env_file=self.wait_env_file, speculate=speculate) - elif env_file is not None: - self.start_command(env_file=env_file, speculate=speculate) - else: - logging.error(f'Error: No valid execution env for Node {self.id_}') + # def _attempt_start_command(self, env_file, speculate=False): + # if self.wait_env_file is not None: + # self.start_command(env_file=self.wait_env_file, speculate=speculate) + # elif env_file is not None: + # self.start_command(env_file=env_file, speculate=speculate) + # else: + # logging.error(f'Error: No valid execution env for Node {self.id_}') - def transition_from_ready_to_executing(self, env_file=None): - assert self.state == NodeState.READY - self.state = NodeState.EXECUTING - self._attempt_start_command(env_file) + # def transition_from_ready_to_executing(self, env_file=None): + # assert self.state == NodeState.READY + # self.state = NodeState.EXECUTING + # self._attempt_start_command(env_file) - def transition_from_ready_to_spec_executing(self, env_file=None): - assert self.state == NodeState.READY - self.state = NodeState.SPEC_EXECUTING - self._attempt_start_command(env_file, speculate=True) + # def transition_from_ready_to_spec_executing(self, env_file=None): + # assert self.state == NodeState.READY + # self.state = NodeState.SPEC_EXECUTING + # self._attempt_start_command(env_file, speculate=True) def transition_from_stopped_to_executing(self, env_file=None): assert self.state == NodeState.READY @@ -214,7 +250,29 @@ def transition_to_committed(self): def transition_from_spec_executing_to_speculated(self): pass - def set_wait_env_file(self, env_file: str): - assert self.state in [NodeState.READY, NodeState.EXECUTING, NodeState.SPEC_EXECUTING, NodeState.STOP, NodeState.SPECULATED] - self.post_env_file = env_file - \ No newline at end of file + + def update_rw_set(self, rw_set): + self.rwset = rw_set + + def gather_fs_actions(self) -> RWSet: + assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING] + sandbox_dir = self.exec_ctxt.sandbox_dir + trace_file = self.exec_ctxt.trace_file + try: + trace_object = executor.read_trace(sandbox_dir, trace_file) + except FileNotFoundError: + self.update_rw_set(RWSet(set(), set())) + return + read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object) + rw_set = RWSet(read_set, write_set) + self.update_rw_set(rw_set) + + def get_rw_set(self): + # if self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]: + # self.gather_fs_actions() + return self.rwset + + # def set_wait_env_file(self, env_file: str): + # assert self.state in [NodeState.READY, NodeState.EXECUTING, NodeState.SPEC_EXECUTING, NodeState.STOP, NodeState.SPECULATED] + # self.post_env_file = env_file + diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index b94aad2c..1b46b834 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -6,7 +6,6 @@ class PartialProgramOrder: frontier: set # Set of nodes at the frontier run_after: set # Nodes that should run after certain conditions - window: int # Integer representing the window to_be_resolved: "dict[NodeId, list[Node]]" # Mapping of nodes to lists of uncommitted nodes nodes: "dict[NodeId, Node]" adjacency: "dict[NodeId, list[NodeId]]" @@ -18,14 +17,15 @@ def __init__(self, nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId self.inverse_adjacency = self.init_inverse_adjacency() self.frontier = set() self.run_after = set() - self.window = 0 - self.to_be_resolved = {} + self.to_be_resolved = {} def init_partial_order(self): for node_id, node in self.nodes.items(): if node.is_initialized(): node.transition_from_init_to_ready() - + + self.init_to_be_resolved_dict() + logging.info(self.to_be_resolved) # Init frontier self.frontier = self.get_standard_source_nodes() # TODO: Implement the rest of the partial order initialization @@ -80,9 +80,14 @@ def log_state(self): for node in self.nodes.values(): logging.info(f"Node {node.id_}: {node.state}") + def get_schedulable_nodes(self) -> list[NodeId]: + return [node.id_ for node in self.get_ready_nodes()] + def schedule_work(self, node_id: NodeId, env_file: str): self.get_node(node_id).start_executing(env_file) - + + def schedule_spec_work(self, node_id: NodeId, env_file: str): + self.get_node(node_id).start_spec_executing(env_file) ## Returns the next non-committed normal node def progress_frontier(self) -> "list[NodeId]": @@ -140,29 +145,29 @@ def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]": return non_committed_nodes def get_all_next(self, current_node_id: NodeId, visited=None) -> "set[NodeId]": - if visited is None: - visited = set() - visited.add(current_node_id) - - all_next_nodes = set([current_node_id]) - for neighbor in self.get_next_nodes(current_node_id): - if neighbor not in visited: - all_next_nodes.update(self.get_all_next(neighbor, visited)) - - return all_next_nodes + all_next = set() + def reachable_rec(cur, reachable): + if cur in reachable: + return + reachable.add(cur) + for n in self.get_next_nodes(cur): + reachable_rec(n, reachable) + for n in self.get_next_nodes(current_node_id): + reachable_rec(n, all_next) + return all_next def get_all_previous(self, current_node_id: NodeId, visited=None) -> "set[NodeId]": - if visited is None: - visited = set() - visited.add(current_node_id) - - all_previous_nodes = set([current_node_id]) - for neighbor in self.get_prev_nodes(current_node_id): - if neighbor not in visited: - all_previous_nodes.update(self.get_all_previous(neighbor, visited)) - - return all_previous_nodes + all_prev = set() + def reachable_rec(cur, reachable): + if cur in reachable: + return + reachable.add(cur) + for n in self.get_prev_nodes(cur): + reachable_rec(n, reachable) + for n in self.get_prev_nodes(current_node_id): + reachable_rec(n, all_prev) + return all_prev def get_all_previous_uncommitted(self, node_id: NodeId) -> "set[NodeId]": previous = self.get_all_previous(node_id) @@ -175,14 +180,52 @@ def adjust_to_be_resolved_dict_entry(self, node_id: NodeId): elif node.is_ready(): self.to_be_resolved[node_id] = self.get_all_previous_uncommitted(node_id) + def init_to_be_resolved_dict(self): + for node_id in self.nodes: + self.adjust_to_be_resolved_dict_entry(node_id) + def adjust_to_be_resolved_dict(self): + # TODO: this design seems to require the function to be called + # each time before a node entering EXECUTING or SPEC_EXECUTING + # to be optimal (that is, it might keep more things in the list). + # It's safe as is so I'm not touching it. for node_id in self.to_be_resolved.keys(): self.adjust_to_be_resolved_dict_entry(node_id) - - + #TODO: Add partial order invariant checks def valid(self): return True + + def has_fs_deps(self, node_id: NodeId): + node_of_interest : Node = self.get_node(node_id) + for node in self.get_executing_normal_and_speculated_nodes(): + node.gather_fs_actions() + for nid in self.to_be_resolved[node_id]: + node: Node = self.get_node(nid) + if node.get_rw_set().has_conflict(node_of_interest.get_rw_set()): + return True + return False + + def handle_complete(self, node_id: NodeId, has_pending_wait: bool, + current_env: str): + node = self.get_node(node_id) + # TODO: complete the state matching + if node.is_executing(): + node.commit_frontier_execution() + self.adjust_to_be_resolved_dict() + elif node.is_spec_executing(): + if self.has_fs_deps(node_id): + node.reset_to_ready() + # otherwise it stays in ready state and waits to be scheduled by the scheduler + if has_pending_wait: + node.start_executing(current_env) + else: + node.finish_spec_execution() + if has_pending_wait: + node.commit_speculated() + self.adjust_to_be_resolved_dict() + else: + assert False def handle_wait(self, node_id: NodeId, env_file: str): node = self.get_node(node_id) @@ -197,14 +240,11 @@ def handle_wait(self, node_id: NodeId, env_file: str): # Q to @Di: Do we need to make the wait env file a node attribute # (same for most recent env file) or is it ok to just pass it around here? # We might use it in the future so maybe we shouldn't drop it. - node.set_wait_env_file(env_file) - + # TODO: remove this? + # node.set_wait_env_file(env_file) if node.is_ready(): - if node.id_ in self.get_frontier(): - node.transition_from_ready_to_executing(env_file) - else: - node.transition_from_ready_to_spec_executing(env_file) + node.start_executing(env_file) elif node.is_stopped(): if node in self.get_frontier(): logging.info(f'Node {node_id} is stopped and in the frontier.') @@ -212,11 +252,14 @@ def handle_wait(self, node_id: NodeId, env_file: str): else: logging.info(f'Node {node_id} is stopped but not in the frontier.') elif node.is_speculated(): - pass # TODO: handle this case # Check if env conflicts exist - # Check fs deps - # If no env or fs conflicts, then commit the node + if self.has_fs_deps(node_id): + node.reset_to_ready() + node.start_executing(env_file) + else: + node.commit_speculated() + self.adjust_to_be_resolved_dict() elif node.is_executing(): # Do nothing pass @@ -230,4 +273,4 @@ def handle_wait(self, node_id: NodeId, env_file: str): # TODO: think about this # self.schedule_work_single_node() # self.schedule_work_all_nodes() - + diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 19d43865..82d543cd 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -63,8 +63,11 @@ class Scheduler: | Wait -> The JIT component waits for the results of a specific command | Done -> We are done """ + window: int # Integer representing the window + latest_env: str # This variable should be initialized by the first wait, and always have a value since def __init__(self, socket_file): + self.window = 0 self.done = False self.socket = util.init_unix_socket(socket_file) ## A map containing connections for node_ids that are waiting for a response @@ -91,7 +94,10 @@ def handle_wait(self, input_cmd: str, connection): node_id, env_file = self.__parse_wait(input_cmd) self.waiting_for_response[node_id] = connection logging.info(f'Scheduler: Received wait message - {node_id}.') + self.latest_env = env_file self.partial_program_order.handle_wait(node_id, env_file) + if self.partial_program_order.get_node(node_id).is_committed(): + self.respond_to_pending_wait(node_id) def process_next_cmd(self): connection, input_cmd = util.socket_get_next_cmd(self.socket) @@ -105,12 +111,10 @@ def process_next_cmd(self): connection.close() elif (input_cmd.startswith("CommandExecComplete:")): node_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) - connection.close() logging.info(f'Scheduler: Received command exec complete message - {node_id}.') - node = self.partial_program_order.get_node(node_id) - # TODO: condition here to do different things based on node state - node.commit_frontier_execution() - self.respond_to_pending_wait(node_id) + self.partial_program_order.handle_complete(node_id, node_id in self.waiting_for_response, self.latest_env) + if self.partial_program_order.get_node(node_id).is_committed(): + self.respond_to_pending_wait(node_id) elif (input_cmd.startswith("Wait")): self.handle_wait(input_cmd, connection) elif (input_cmd.startswith("Done")): @@ -171,6 +175,11 @@ def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]": raise Exception(f'Parsing failure for line: {input_cmd}') + def schedule_work(self): + nodes = self.partial_program_order.get_schedulable_nodes() + if len(nodes): + self.partial_program_order.schedule_spec_work(nodes[0], self.latest_env) + def run(self): ## The first command should be the daemon start self.process_next_cmd() @@ -180,6 +189,7 @@ def run(self): while not self.done: self.process_next_cmd() + self.schedule_work() self.socket.close() self.shutdown() From 0737b2f54298458600702b25c22d90114afa8845 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Sun, 14 Jan 2024 16:43:59 -0700 Subject: [PATCH 17/39] Env check and cleanup --- parallel-orch/node.py | 66 ++++++++++++++++---------- parallel-orch/partial_program_order.py | 39 ++++++++------- parallel-orch/scheduler_server.py | 1 - 3 files changed, 63 insertions(+), 43 deletions(-) diff --git a/parallel-orch/node.py b/parallel-orch/node.py index 65b2ce2b..621c2981 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -1,4 +1,5 @@ import logging +import re import executor import trace_v2 from dataclasses import dataclass @@ -177,19 +178,28 @@ def execution_outcome(self) -> Tuple[int, str, str]: def transition_from_init_to_ready(self): assert self.state == NodeState.INIT self.state = NodeState.READY - # Initialize data structures here - # Also, probably unroll here? + def kill(self): + assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING] + self.exec_ctxt.process.kill() + def reset_to_ready(self): assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING, NodeState.SPECULATED] + + # Q for @Di: Should we kill the process here? + if self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]: + self.kill() + # Probably delete them from tmpfs too self.exec_ctxt = None self.exec_result = None self.rwset = None self.state = NodeState.READY + + def start_executing(self, env_file): assert self.state == NodeState.READY self.start_command(env_file) @@ -219,24 +229,6 @@ def commit_speculated(self): executor.commit_workspace(self.exec_ctxt.sandbox_dir) self.state = NodeState.COMMITTED - # def _attempt_start_command(self, env_file, speculate=False): - # if self.wait_env_file is not None: - # self.start_command(env_file=self.wait_env_file, speculate=speculate) - # elif env_file is not None: - # self.start_command(env_file=env_file, speculate=speculate) - # else: - # logging.error(f'Error: No valid execution env for Node {self.id_}') - - # def transition_from_ready_to_executing(self, env_file=None): - # assert self.state == NodeState.READY - # self.state = NodeState.EXECUTING - # self._attempt_start_command(env_file) - - # def transition_from_ready_to_spec_executing(self, env_file=None): - # assert self.state == NodeState.READY - # self.state = NodeState.SPEC_EXECUTING - # self._attempt_start_command(env_file, speculate=True) - def transition_from_stopped_to_executing(self, env_file=None): assert self.state == NodeState.READY self.state = NodeState.EXECUTING @@ -271,8 +263,34 @@ def get_rw_set(self): # if self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]: # self.gather_fs_actions() return self.rwset + + def has_env_conflict_with(self, other_env) -> bool: + # Early return if paths are the same + if self.exec_ctxt.pre_env_file == other_env: + return False + + ignore_vars = set(['RANDOM']) - # def set_wait_env_file(self, env_file: str): - # assert self.state in [NodeState.READY, NodeState.EXECUTING, NodeState.SPEC_EXECUTING, NodeState.STOP, NodeState.SPECULATED] - # self.post_env_file = env_file - + re_scalar_string = re.compile(r'declare (?:-x|--)? (\w+)="([^"]*)"') + re_scalar_int = re.compile(r'declare -i (\w+)="(\d+)"') + re_array = re.compile(r'declare -a (\w+)=(\([^)]+\))') + + def parse_env(content): + env_vars = {} + for line in content.splitlines(): + if line.startswith('#') or not line.strip(): + continue + for regex in [re_scalar_string, re_scalar_int, re_array]: + match = regex.match(line) + if match: + key, value = match.groups() + if key not in ignore_vars: + env_vars[key] = value + return env_vars + + with open(self.exec_ctxt.pre_env_file, 'r') as file: + node_env_vars = parse_env(file.read()) + + with open(other_env, 'r') as file: + other_env_vars = parse_env(file.read()) + return node_env_vars != other_env_vars diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 1b46b834..6579c787 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -169,10 +169,14 @@ def reachable_rec(cur, reachable): reachable_rec(n, all_prev) return all_prev + def get_all_next_uncommitted(self, node_id: NodeId) -> "set[NodeId]": + next = self.get_all_next(node_id) + return set([node for node in next if not self.nodes[node].is_committed()]) + def get_all_previous_uncommitted(self, node_id: NodeId) -> "set[NodeId]": previous = self.get_all_previous(node_id) return set([node for node in previous if not self.nodes[node].is_committed()]) - + def adjust_to_be_resolved_dict_entry(self, node_id: NodeId): node = self.nodes.get(node_id) if node.is_committed(): @@ -234,14 +238,6 @@ def handle_wait(self, node_id: NodeId, env_file: str): if node.is_committed() or node.is_unsafe() or node.is_initialized(): logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}') raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}') - - - # For all the valid states, set the wait env file - # Q to @Di: Do we need to make the wait env file a node attribute - # (same for most recent env file) or is it ok to just pass it around here? - # We might use it in the future so maybe we shouldn't drop it. - # TODO: remove this? - # node.set_wait_env_file(env_file) if node.is_ready(): node.start_executing(env_file) @@ -254,23 +250,30 @@ def handle_wait(self, node_id: NodeId, env_file: str): elif node.is_speculated(): # TODO: handle this case # Check if env conflicts exist - if self.has_fs_deps(node_id): + + + if node.has_env_conflict_with(env_file) or self.has_fs_deps(node_id): + ## TODO: Optimization + ## FIXME: Currently causes AssertionError: assert(node_id in self.waiting_for_response) + # An env conflict means that every following node + # will have the same env conflict + # therefore, we have to reset them all + # for uncommitted_node_id in self.get_all_next_uncommitted(node_id): + # uncommitted_node = self.get_node(uncommitted_node_id) + # uncommitted_node.reset_to_ready() + # uncommitted_node.start_executing(env_file) node.reset_to_ready() node.start_executing(env_file) else: node.commit_speculated() self.adjust_to_be_resolved_dict() + elif node.is_executing(): - # Do nothing - pass + # Do nothing + pass elif node.is_spec_executing(): - # Do nothing + # Do nothing pass else: logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}') raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}') - - # TODO: think about this - # self.schedule_work_single_node() - # self.schedule_work_all_nodes() - diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 82d543cd..b637f930 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -105,7 +105,6 @@ def process_next_cmd(self): if(input_cmd.startswith("Init")): connection.close() self.handle_init(input_cmd) - ## TODO: Read the partial order from the given file elif (input_cmd.startswith("Daemon Start") or input_cmd == ""): logging.info(f'Scheduler: Received daemon start message.') connection.close() From 47d32dc72e95776c46abcf8303faa6a14f947beb Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 16 Jan 2024 02:51:18 -0700 Subject: [PATCH 18/39] Fix env checking and add exec instanse wait msg matching --- parallel-orch/executor.py | 11 +++---- parallel-orch/node.py | 18 +++++++++--- parallel-orch/partial_program_order.py | 40 +++++++++++++------------- parallel-orch/scheduler_server.py | 34 ++++++++++------------ parallel-orch/util.py | 5 +++- 5 files changed, 59 insertions(+), 49 deletions(-) diff --git a/parallel-orch/executor.py b/parallel-orch/executor.py index 93439c4b..52bc3d9f 100644 --- a/parallel-orch/executor.py +++ b/parallel-orch/executor.py @@ -8,7 +8,7 @@ # and traces them with Riker. # All commands are run inside an overlay sandbox. -def async_run_and_trace_command_return_trace(command, node_id, pre_execution_env_file, speculate_mode=False): +def async_run_and_trace_command_return_trace(command, node_id, execution_id, pre_execution_env_file, speculate_mode=False): trace_file = util.ptempfile() stdout_file = util.ptempfile() stderr_file = util.ptempfile() @@ -17,14 +17,14 @@ def async_run_and_trace_command_return_trace(command, node_id, pre_execution_env logging.debug(f'Scheduler: Stdout file for: {node_id} is: {stdout_file}') logging.debug(f'Scheduler: Stderr file for: {node_id} is: {stderr_file}') logging.debug(f'Scheduler: Trace file for: {node_id}: {trace_file}') - process = async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode) + process = async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode) return process, trace_file, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir -def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, node_id, pre_execution_env_file): - process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, node_id, pre_execution_env_file, speculate_mode=True) +def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, execution_id, node_id, pre_execution_env_file): + process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, execution_id, node_id, pre_execution_env_file, speculate_mode=True) return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir -def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False): +def async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False): ## Call Riker to execute the command run_script = f'{config.PASH_SPEC_TOP}/parallel-orch/run_command.sh' args = ["/bin/bash", run_script, command, trace_file, stdout_file, pre_execution_env_file, sandbox_dir, tmp_dir] @@ -34,6 +34,7 @@ def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, nod args.append("standard") args.append(str(node_id)) args.append(post_execution_env_file) + args.append(str(execution_id)) # Save output to temporary files to not saturate the memory logging.debug(args) process = subprocess.Popen(args, stdout=None, stderr=None) diff --git a/parallel-orch/node.py b/parallel-orch/node.py index 621c2981..3bf56b23 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -6,6 +6,7 @@ from subprocess import Popen from typing import Tuple from enum import Enum, auto +import util class NodeState(Enum): INIT = auto() @@ -104,6 +105,8 @@ class Node: cmd: str asts: "list[AstNode]" state: NodeState + # Used for identifying the most recent valid execution + exec_id: int # Nodes to check for fs dependencies before this node can be committed # for this particular execution of the main sandbox. # No need to do the same for the background sandbox since it will always get committed. @@ -128,6 +131,7 @@ def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"): self.wait_env_file = None self.to_be_resolved_snapshot = None self.exec_ctxt = None + self.exec_id = None def __str__(self): return f'Node(id:{self.id_}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, wait_env_file:{self.wait_env_file}, exec_ctxt:{self.exec_ctxt})' @@ -164,7 +168,9 @@ def start_command(self, env_file: str, speculate=False): # TODO: built-in commands cmd = self.cmd execute_func = executor.async_run_and_trace_command_return_trace - self.exec_ctxt = ExecCtxt(*execute_func(cmd, self.id_, env_file)) + # Set the execution id + self.exec_id = util.generate_id() + self.exec_ctxt = ExecCtxt(*execute_func(cmd, self.id_, self.exec_id, env_file)) def execution_outcome(self) -> Tuple[int, str, str]: assert self.exec_result is not None @@ -188,7 +194,12 @@ def reset_to_ready(self): assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING, NodeState.SPECULATED] - # Q for @Di: Should we kill the process here? + logging.info(f"Resetting node {self.id_} to ready {self.exec_id}") + # We reset the exec id so if we receive a message + # due to a race condition, we will ignore it. + self.exec_id = None + + # TODO: make this more sophisticated if self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]: self.kill() @@ -197,9 +208,8 @@ def reset_to_ready(self): self.exec_result = None self.rwset = None self.state = NodeState.READY - - + def start_executing(self, env_file): assert self.state == NodeState.READY self.start_command(env_file) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 6579c787..bdf80ab0 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -230,7 +230,13 @@ def handle_complete(self, node_id: NodeId, has_pending_wait: bool, self.adjust_to_be_resolved_dict() else: assert False - + + def reset_succeeding_nodes_and_start_exec(self, node_id: NodeId, env_file: str): + for uncommitted_node_id in self.get_all_next_uncommitted(node_id): + uncommitted_node = self.get_node(uncommitted_node_id) + uncommitted_node.reset_to_ready() + uncommitted_node.start_spec_executing(env_file) + def handle_wait(self, node_id: NodeId, env_file: str): node = self.get_node(node_id) @@ -238,6 +244,7 @@ def handle_wait(self, node_id: NodeId, env_file: str): if node.is_committed() or node.is_unsafe() or node.is_initialized(): logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}') raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}') + if node.is_ready(): node.start_executing(env_file) @@ -248,32 +255,25 @@ def handle_wait(self, node_id: NodeId, env_file: str): else: logging.info(f'Node {node_id} is stopped but not in the frontier.') elif node.is_speculated(): - # TODO: handle this case # Check if env conflicts exist - - - if node.has_env_conflict_with(env_file) or self.has_fs_deps(node_id): - ## TODO: Optimization - ## FIXME: Currently causes AssertionError: assert(node_id in self.waiting_for_response) - # An env conflict means that every following node - # will have the same env conflict - # therefore, we have to reset them all - # for uncommitted_node_id in self.get_all_next_uncommitted(node_id): - # uncommitted_node = self.get_node(uncommitted_node_id) - # uncommitted_node.reset_to_ready() - # uncommitted_node.start_executing(env_file) + if node.has_env_conflict_with(env_file): + node.reset_to_ready() + node.start_executing(env_file) + self.reset_succeeding_nodes_and_start_exec(node_id, env_file) + # Optimization: It would make sense to perform the checks independently, + # and if fs conflict, then update the run after dict. + elif self.has_fs_deps(node_id): node.reset_to_ready() node.start_executing(env_file) else: node.commit_speculated() self.adjust_to_be_resolved_dict() - - elif node.is_executing(): - # Do nothing - pass + elif node.is_executing(): + if node.has_env_conflict_with(env_file): + self.reset_succeeding_nodes_and_start_exec(node_id, env_file) elif node.is_spec_executing(): - # Do nothing - pass + if node.has_env_conflict_with(env_file): + self.reset_succeeding_nodes_and_start_exec(node_id, env_file) else: logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}') raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}') diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index b637f930..71586429 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -80,15 +80,6 @@ def handle_init(self, input_cmd: str): logging.debug(f'Scheduler: Received partial_order_file: {partial_order_file}') self.partial_program_order = util.parse_partial_program_order_from_file(partial_order_file) self.partial_program_order.init_partial_order() - - def handle_command_exec_complete(): - # TODO: Implement - pass - - def handle_command_exec_start(): - # TODO: Implement - pass - def handle_wait(self, input_cmd: str, connection): node_id, env_file = self.__parse_wait(input_cmd) @@ -109,11 +100,15 @@ def process_next_cmd(self): logging.info(f'Scheduler: Received daemon start message.') connection.close() elif (input_cmd.startswith("CommandExecComplete:")): - node_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) - logging.info(f'Scheduler: Received command exec complete message - {node_id}.') - self.partial_program_order.handle_complete(node_id, node_id in self.waiting_for_response, self.latest_env) - if self.partial_program_order.get_node(node_id).is_committed(): - self.respond_to_pending_wait(node_id) + node_id, exec_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) + if self.partial_program_order.get_node(node_id).exec_id == exec_id: + logging.info(f'Scheduler: Received command exec complete message - {node_id}.') + self.partial_program_order.handle_complete(node_id, node_id in self.waiting_for_response, self.latest_env) + + if self.partial_program_order.get_node(node_id).is_committed(): + self.respond_to_pending_wait(node_id) + else: + logging.info(f'Scheduler: Received command exec complete message for a killed instance, ignoring - {node_id}.') elif (input_cmd.startswith("Wait")): self.handle_wait(input_cmd, connection) elif (input_cmd.startswith("Done")): @@ -121,7 +116,7 @@ def process_next_cmd(self): self.partial_program_order.log_info() self.done = True elif input_cmd.startswith("CommandExecStart:"): - node_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) + node_id, exec_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) logging.info(f'Scheduler: Received command exec start message - {input_cmd}.') # self.handle_command_exec_start(input_cmd) else: @@ -166,10 +161,11 @@ def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]": try: components = input_cmd.rstrip().split("|") command_id = NodeId.parse_node_id(components[0].split(":")[1]) - exit_code = int(components[1].split(":")[1]) - sandbox_dir = components[2].split(":")[1] - trace_file = components[3].split(":")[1] - return command_id, exit_code, sandbox_dir, trace_file + exec_id = int(components[1].split(":")[1]) + exit_code = int(components[2].split(":")[1]) + sandbox_dir = components[3].split(":")[1] + trace_file = components[4].split(":")[1] + return command_id, exec_id, exit_code, sandbox_dir, trace_file except: raise Exception(f'Parsing failure for line: {input_cmd}') diff --git a/parallel-orch/util.py b/parallel-orch/util.py index af01273f..dc0abdcc 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -257,4 +257,7 @@ def parse_partial_program_order_from_file(file_path: str): logging.info(f"Nodes|{','.join([str(node) for node in nodes])}") logging.info(f"Edges|{edges}") - return PartialProgramOrder(nodes, edges) \ No newline at end of file + return PartialProgramOrder(nodes, edges) + +def generate_id() -> int: + return int(time.time() * 1000000) From 80523521a45e63eb17e52ce6e5f952080eb3d7c1 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 16 Jan 2024 02:52:10 -0700 Subject: [PATCH 19/39] Update script to receive exec id from scheduler --- parallel-orch/run_command.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parallel-orch/run_command.sh b/parallel-orch/run_command.sh index 2d3597c1..9fe48f26 100755 --- a/parallel-orch/run_command.sh +++ b/parallel-orch/run_command.sh @@ -10,6 +10,7 @@ export TMPDIR=${6?No tmp dir given} export EXEC_MODE=${7?No execution mode given} export CMD_ID=${8?No command id given} export POST_EXEC_ENV=${9?No Riker env file given} +export EXECUTION_ID=${10?No execution id given} ## KK 2023-04-24: Not sure this should be run every time we run a command ## GL 2023-07-08: Tests seem to pass without it @@ -41,5 +42,5 @@ out=`head -3 $SANDBOX_DIR/upperdir/$TRACE_FILE` ## Assumes "${PASH_SPEC_SCHEDULER_SOCKET}" is set and exported ## Pass the proper exit code -msg="CommandExecComplete:${CMD_ID}|Exit code:${exit_code}|Sandbox dir:${SANDBOX_DIR}|Trace file:${TRACE_FILE}|Tempdir:${TEMPDIR}" +msg="CommandExecComplete:${CMD_ID}|Exec id:${EXECUTION_ID}|Exit code:${exit_code}|Sandbox dir:${SANDBOX_DIR}|Trace file:${TRACE_FILE}|Tempdir:${TEMPDIR}" daemon_response=$(pash_spec_communicate_scheduler_just_send "$msg") # Blocking step, daemon will not send response until it's safe to continue From 19b6a19fcc19d2d0076625316e0e7929c34e8f34 Mon Sep 17 00:00:00 2001 From: Guest Date: Sun, 21 Jan 2024 04:32:47 -0500 Subject: [PATCH 20/39] adding eager killing, WIP --- parallel-orch/node.py | 24 +++++++++- parallel-orch/partial_program_order.py | 63 ++++++++++++++++++++------ parallel-orch/scheduler_server.py | 10 ++-- parallel-orch/util.py | 2 +- 4 files changed, 81 insertions(+), 18 deletions(-) diff --git a/parallel-orch/node.py b/parallel-orch/node.py index 65b2ce2b..0f758412 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -1,6 +1,8 @@ import logging import executor import trace_v2 +import util +import signal from dataclasses import dataclass from subprocess import Popen from typing import Tuple @@ -16,6 +18,18 @@ class NodeState(Enum): SPEC_EXECUTING = auto() UNSAFE = auto() +def state_pstr(state: NodeState): + same_length_state_str = { + NodeState.INIT: ' INIT', + NodeState.READY: ' READY', + NodeState.COMMITTED: 'COMMIT', + NodeState.STOP: ' STOP', + NodeState.SPECULATED: 'SPEC_F', + NodeState.EXECUTING: ' EXE', + NodeState.SPEC_EXECUTING: 'SPEC_E', + NodeState.UNSAFE: 'UNSAFE' + } + return same_length_state_str[state] class RWSet: @@ -134,6 +148,9 @@ def __str__(self): def __repr__(self): return str(self) + def pretty_state_repr(self): + return f'{state_pstr(self.state)} {self.cmd}' + def is_initialized(self): return self.state == NodeState.INIT @@ -185,6 +202,11 @@ def reset_to_ready(self): assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING, NodeState.SPECULATED] # Probably delete them from tmpfs too + process = self.exec_ctxt.process + if process.poll() is None: + # Exceptions will be handled inside the call so we don't have to worry + util.kill_process_tree(process.pid, sig=signal.SIGKILL) + self.exec_ctxt = None self.exec_result = None self.rwset = None @@ -253,7 +275,7 @@ def transition_from_spec_executing_to_speculated(self): def update_rw_set(self, rw_set): self.rwset = rw_set - + def gather_fs_actions(self) -> RWSet: assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING] sandbox_dir = self.exec_ctxt.sandbox_dir diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 1b46b834..0786aaeb 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -2,10 +2,21 @@ import logging from collections import deque +PROG_LOG = '[PROG_LOG] ' +EVENT_LOG = '[EVENT_LOG] ' +def event_log(s): + logging.info(EVENT_LOG + s) + +def progress_log(s): + logging.info(PROG_LOG + s) + class PartialProgramOrder: frontier: set # Set of nodes at the frontier - run_after: set # Nodes that should run after certain conditions + # Di: I'm going to ignore this for now and implement the feature without a local data structure + # Later we can add this back as a caching mechanism to avoid doing RWSet + # intersections of files all the time + # run_after: "dict[NodeId, list[Node]]" # Nodes that should run after certain conditions to_be_resolved: "dict[NodeId, list[Node]]" # Mapping of nodes to lists of uncommitted nodes nodes: "dict[NodeId, Node]" adjacency: "dict[NodeId, list[NodeId]]" @@ -16,7 +27,7 @@ def __init__(self, nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId self.adjacency = edges self.inverse_adjacency = self.init_inverse_adjacency() self.frontier = set() - self.run_after = set() + # self.run_after = {} self.to_be_resolved = {} def init_partial_order(self): @@ -58,7 +69,7 @@ def get_executing_nodes(self): def get_spec_executing_nodes(self): return [node for node in self.nodes.values() if node.is_spec_executing()] - def get_executing_normal_and_speculated_nodes(self): + def get_executing_normal_and_spec_nodes(self): return [node for node in self.nodes.values() if node.is_executing() or node.is_spec_executing()] def get_speculated_nodes(self): @@ -78,17 +89,12 @@ def log_info(self): def log_state(self): for node in self.nodes.values(): - logging.info(f"Node {node.id_}: {node.state}") + progress_log(node.pretty_state_repr()) + progress_log('') def get_schedulable_nodes(self) -> list[NodeId]: return [node.id_ for node in self.get_ready_nodes()] - def schedule_work(self, node_id: NodeId, env_file: str): - self.get_node(node_id).start_executing(env_file) - - def schedule_spec_work(self, node_id: NodeId, env_file: str): - self.get_node(node_id).start_spec_executing(env_file) - ## Returns the next non-committed normal node def progress_frontier(self) -> "list[NodeId]": return self.get_next_frontier_nodes(self.get_frontier()) @@ -196,18 +202,37 @@ def adjust_to_be_resolved_dict(self): def valid(self): return True - def has_fs_deps(self, node_id: NodeId): - node_of_interest : Node = self.get_node(node_id) - for node in self.get_executing_normal_and_speculated_nodes(): + def fetch_fs_actions(self): + for node in self.get_executing_normal_and_spec_nodes(): node.gather_fs_actions() + + def _has_fs_deps(self, node_id: NodeId): + node_of_interest : Node = self.get_node(node_id) for nid in self.to_be_resolved[node_id]: node: Node = self.get_node(nid) if node.get_rw_set().has_conflict(node_of_interest.get_rw_set()): return True return False + + # TODO: It's currently designed this way to avoid reading trace file all the time + # When we have complex caching code for this we can make this go away + def has_fs_deps(self, node_id:NodeId): + self.fetch_fs_actions() + self._has_fs_deps(node_id) + + ### external handler events ### + + def schedule_work(self, node_id: NodeId, env_file: str): + event_log("schedule_work") + self.get_node(node_id).start_executing(env_file) + + def schedule_spec_work(self, node_id: NodeId, env_file: str): + event_log("schedule_spec") + self.get_node(node_id).start_spec_executing(env_file) def handle_complete(self, node_id: NodeId, has_pending_wait: bool, current_env: str): + event_log(f"handle_complete {node_id}") node = self.get_node(node_id) # TODO: complete the state matching if node.is_executing(): @@ -224,10 +249,13 @@ def handle_complete(self, node_id: NodeId, has_pending_wait: bool, if has_pending_wait: node.commit_speculated() self.adjust_to_be_resolved_dict() + elif node.is_ready(): + pass else: assert False def handle_wait(self, node_id: NodeId, env_file: str): + event_log(f"handle_wait {node_id}") node = self.get_node(node_id) # Invalid state check @@ -274,3 +302,12 @@ def handle_wait(self, node_id: NodeId, env_file: str): # self.schedule_work_single_node() # self.schedule_work_all_nodes() + def eager_fs_killing(self): + event_log("try to eagerly kill conflicted speculation") + to_be_killed = [] + self.fetch_fs_actions() + for node in self.get_spec_executing_nodes(): + if self._has_fs_deps(node.id_): + to_be_killed.append(node) + for node in to_be_killed: + node.reset_to_ready() diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 82d543cd..d303f0ec 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -186,11 +186,15 @@ def run(self): ## The second command should be the partial order init self.process_next_cmd() - + + self.partial_program_order.log_state() while not self.done: self.process_next_cmd() + self.partial_program_order.log_state() self.schedule_work() - + self.partial_program_order.log_state() + self.partial_program_order.eager_fs_killing() + self.partial_program_order.log_state() self.socket.close() self.shutdown() @@ -201,7 +205,7 @@ def shutdown(self): self.terminate_pending_commands() def terminate_pending_commands(self): - for node in self.partial_program_order.get_executing_normal_and_speculated_nodes(): + for node in self.partial_program_order.get_executing_normal_and_spec_nodes(): proc, _trace_file, _stdout, _stderr, _variable_file, _ = node.get_main_sandbox() logging.debug(f'Killing: {proc}') # proc.terminate() diff --git a/parallel-orch/util.py b/parallel-orch/util.py index af01273f..a4c725b0 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -257,4 +257,4 @@ def parse_partial_program_order_from_file(file_path: str): logging.info(f"Nodes|{','.join([str(node) for node in nodes])}") logging.info(f"Edges|{edges}") - return PartialProgramOrder(nodes, edges) \ No newline at end of file + return PartialProgramOrder(nodes, edges) From f96b81613159104ccc5643a249b552ed01141e4b Mon Sep 17 00:00:00 2001 From: Guest Date: Sun, 21 Jan 2024 04:40:09 -0500 Subject: [PATCH 21/39] fixing reset following node on env change --- parallel-orch/partial_program_order.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index bdf80ab0..11a0167b 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -232,10 +232,11 @@ def handle_complete(self, node_id: NodeId, has_pending_wait: bool, assert False def reset_succeeding_nodes_and_start_exec(self, node_id: NodeId, env_file: str): - for uncommitted_node_id in self.get_all_next_uncommitted(node_id): + for uncommitted_node_id in self.get_all_next(node_id): uncommitted_node = self.get_node(uncommitted_node_id) - uncommitted_node.reset_to_ready() - uncommitted_node.start_spec_executing(env_file) + if uncommitted_node.is_spec_executing(): + uncommitted_node.reset_to_ready() + # uncommitted_node.start_spec_executing(env_file) def handle_wait(self, node_id: NodeId, env_file: str): node = self.get_node(node_id) From ff736f7789bce3fbee22467c49b0049a3e62cc5a Mon Sep 17 00:00:00 2001 From: Guest Date: Mon, 5 Feb 2024 13:13:38 -0500 Subject: [PATCH 22/39] Create HSProg class and ConcreteNode class. Also remove trailing whitespaces. --- parallel-orch/node.py | 86 ++++++++++++------ parallel-orch/partial_program_order.py | 119 ++++++++++++------------- parallel-orch/util.py | 40 ++++++--- 3 files changed, 146 insertions(+), 99 deletions(-) diff --git a/parallel-orch/node.py b/parallel-orch/node.py index 36747ac5..6b83873e 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -63,15 +63,15 @@ def get_conflict(self, other: 'RWSet') -> set: return self.write_set.intersection(other.read_set).union( self.read_set.intersection(other.write_set)).union( self.write_set.intersection(other.write_set)) - + def __str__(self): return f"RW(R:{self.get_read_set()}, W:{self.get_write_set()})" class NodeId: - + #TODO: Implement iteration support - + def __init__(self, id_: int): self.id_ = id_ @@ -103,6 +103,27 @@ def __gt__(self, obj): def parse_node_id(node_id_str: str): return NodeId(int(node_id_str)) +class AbstractNode: + def __init__(self, node_id: NodeId): + self.node_id = node_id + +class HSBasicBlock: + def __init__(self, nodes): + self.nodes = nodes + +class HSProg: + abstract_nodes: "dict[NodeId, AbstractNode]" + adjacency: "dict[NodeId, list[NodeId]]" + inverse_adjacency: "dict[NodeId, list[NodeId]]" + def __init__(self, abstract_nodes: dict[NodeId, AbstractNode], + edges: dict[NodeId, list[NodeId]]): + self.abstract_nodes = abstract_nodes + self.adjacency = edges + self.inverse_adjacency = util.invert_graph(abstract_nodes, edges) + + + + @dataclass class ExecCtxt: process: Popen @@ -117,12 +138,15 @@ class ExecCtxt: class ExecResult: exit_code: int proc_id: int - - + +@dataclass class Node: id_: NodeId cmd: str asts: "list[AstNode]" + +class ConcreteNode: + abstract_node: AbstractNode state: NodeState # Used for identifying the most recent valid execution exec_id: int @@ -139,11 +163,9 @@ class Node: # background_sandbox: Sandbox exec_ctxt: ExecCtxt exec_result: ExecResult - - def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"): - self.id_ = node_id - self.cmd = cmd - self.asts = asts + + def __init__(self, node: Node): + self.abstract_node = node self.state = NodeState.INIT self.tracefile = None self.rwset = None @@ -154,34 +176,46 @@ def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"): def __str__(self): return f'Node(id:{self.id_}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, wait_env_file:{self.wait_env_file}, exec_ctxt:{self.exec_ctxt})' - + def __repr__(self): return str(self) + @property + def id_(self): + return self.abstract_node.id_ + + @property + def cmd(self): + return self.abstract_node.cmd + + @property + def asts(self): + return self.abstract_node.asts + def pretty_state_repr(self): return f'{state_pstr(self.state)} {self.cmd}' - + def is_initialized(self): return self.state == NodeState.INIT - + def is_ready(self): return self.state == NodeState.READY - + def is_committed(self): return self.state == NodeState.COMMITTED - + def is_stopped(self): return self.state == NodeState.STOP - + def is_speculated(self): return self.state == NodeState.SPECULATED def is_executing(self): return self.state == NodeState.EXECUTING - + def is_spec_executing(self): return self.state == NodeState.SPEC_EXECUTING - + def is_unsafe(self): return self.state == NodeState.UNSAFE @@ -202,7 +236,7 @@ def execution_outcome(self) -> Tuple[int, str, str]: ## ## ## Transition Functions ## ## ## - + def transition_from_init_to_ready(self): assert self.state == NodeState.INIT self.state = NodeState.READY @@ -215,16 +249,16 @@ def kill(self): def reset_to_ready(self): assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING, NodeState.SPECULATED] - + logging.info(f"Resetting node {self.id_} to ready {self.exec_id}") - # We reset the exec id so if we receive a message + # We reset the exec id so if we receive a message # due to a race condition, we will ignore it. self.exec_id = None - + # TODO: make this more sophisticated if self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]: self.kill() - + # Probably delete them from tmpfs too process = self.exec_ctxt.process if process.poll() is None: @@ -245,7 +279,7 @@ def start_spec_executing(self, env_file): assert self.state == NodeState.READY self.start_command(env_file, speculate=True) self.state = NodeState.SPEC_EXECUTING - + def commit_frontier_execution(self): assert self.state == NodeState.EXECUTING self.exec_result = ExecResult(self.exec_ctxt.process.pid, self.exec_ctxt.process.returncode) @@ -305,8 +339,8 @@ def has_env_conflict_with(self, other_env) -> bool: if self.exec_ctxt.pre_env_file == other_env: return False - ignore_vars = set(['RANDOM']) - + ignore_vars = set(['RANDOM']) + re_scalar_string = re.compile(r'declare (?:-x|--)? (\w+)="([^"]*)"') re_scalar_int = re.compile(r'declare -i (\w+)="(\d+)"') re_array = re.compile(r'declare -a (\w+)=(\([^)]+\))') diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 475221bb..a3f545fd 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -1,10 +1,9 @@ -from node import NodeId, Node +from node import NodeId, Node, ConcreteNode, HSProg import logging from collections import deque PROG_LOG = '[PROG_LOG] ' EVENT_LOG = '[EVENT_LOG] ' -DEBUG_LOG = '[DEBUG_LOG] ' def event_log(s): logging.info(EVENT_LOG + s) @@ -12,9 +11,6 @@ def event_log(s): def progress_log(s): logging.info(PROG_LOG + s) -def debug_log(s): - logging.debug(DEBUG_LOG + s) - class PartialProgramOrder: frontier: set # Set of nodes at the frontier # Di: I'm going to ignore this for now and implement the feature without a local data structure @@ -22,20 +18,18 @@ class PartialProgramOrder: # intersections of files all the time # run_after: "dict[NodeId, list[Node]]" # Nodes that should run after certain conditions to_be_resolved: "dict[NodeId, list[Node]]" # Mapping of nodes to lists of uncommitted nodes - nodes: "dict[NodeId, Node]" - adjacency: "dict[NodeId, list[NodeId]]" - inverse_adjacency: "dict[NodeId, list[NodeId]]" - - def __init__(self, nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId]]"): - self.nodes = nodes - self.adjacency = edges - self.inverse_adjacency = self.init_inverse_adjacency() + concrete_nodes: "dict[NodeId, Node]" + + def __init__(self, abstract_nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId]]"): + self.hsprog = HSProg(abstract_nodes, edges) + self.concrete_nodes = {node_id: ConcreteNode(ab_node) for node_id, ab_node + in abstract_nodes.items()} self.frontier = set() # self.run_after = {} self.to_be_resolved = {} def init_partial_order(self): - for node_id, node in self.nodes.items(): + for node_id, node in self.concrete_nodes.items(): if node.is_initialized(): node.transition_from_init_to_ready() @@ -45,63 +39,68 @@ def init_partial_order(self): self.frontier = self.get_standard_source_nodes() # TODO: Implement the rest of the partial order initialization + @property + def abstract_nodes(self): + return self.hsprog.abstract_nodes + + @property + def adjacency(self): + return self.hsprog.adjacency + + @property + def inverse_adjacency(self): + return self.hsprog.inverse_adjacency + def commit_node(self, node): # Logic to handle committing a node node.transition_to_committed() - # Maybe update dependencies here + # Maybe update dependencies here # etc. - def init_inverse_adjacency(self): - inverse_adjacency = {i: [] for i in self.nodes.keys()} - for from_id, to_ids in self.adjacency.items(): - for to_id in to_ids: - inverse_adjacency[to_id].append(from_id) - return inverse_adjacency - def get_node(self, node_id: NodeId) -> Node: - return self.nodes[node_id] + return self.concrete_nodes[node_id] def get_all_nodes(self): - return [node for node in self.nodes.values()] - + return [node for node in self.concrete_nodes.values()] + def get_committed_nodes(self): - return [node for node in self.nodes.values() if node.is_committed()] - + return [node for node in self.concrete_nodes.values() if node.is_committed()] + def get_ready_nodes(self): - return [node for node in self.nodes.values() if node.is_ready()] - + return [node for node in self.concrete_nodes.values() if node.is_ready()] + def get_executing_nodes(self): - return [node for node in self.nodes.values() if node.is_executing()] - + return [node for node in self.concrete_nodes.values() if node.is_executing()] + def get_spec_executing_nodes(self): - return [node for node in self.nodes.values() if node.is_spec_executing()] - + return [node for node in self.concrete_nodes.values() if node.is_spec_executing()] + def get_executing_normal_and_spec_nodes(self): - return [node for node in self.nodes.values() if node.is_executing() or node.is_spec_executing()] - + return [node for node in self.concrete_nodes.values() if node.is_executing() or node.is_spec_executing()] + def get_speculated_nodes(self): - return [node for node in self.nodes.values() if node.is_speculated()] - + return [node for node in self.concrete_nodes.values() if node.is_speculated()] + def get_uncommitted_nodes(self): - return [node for node in self.nodes.values() if not node.is_committed()] - + return [node for node in self.concrete_nodes.values() if not node.is_committed()] + def get_frontier(self): return self.frontier - + def log_info(self): - logging.info(f"Nodes: {self.nodes}") + logging.info(f"Nodes: {self.concrete_nodes}") logging.info(f"Adjacency: {self.adjacency}") logging.info(f"Inverse adjacency: {self.inverse_adjacency}") self.log_state() def log_state(self): - for node in self.nodes.values(): + for node in self.concrete_nodes.values(): progress_log(node.pretty_state_repr()) progress_log('') def get_schedulable_nodes(self) -> list[NodeId]: return [node.id_ for node in self.get_ready_nodes()] - + ## Returns the next non-committed normal node def progress_frontier(self) -> "list[NodeId]": return self.get_next_frontier_nodes(self.get_frontier()) @@ -111,19 +110,19 @@ def get_next_nodes(self, node_id:NodeId) -> "list[NodeId]": def get_prev_nodes(self, node_id:NodeId) -> "list[NodeId]": return self.inverse_adjacency[node_id][:] - + def get_source_nodes(self) -> "list[NodeId]": sources = set() for to_id, from_ids in self.inverse_adjacency.items(): if len(from_ids) == 0: sources.add(to_id) return list(sources) - + def get_standard_source_nodes(self) -> list: source_nodes = self.get_source_nodes() # TODO: Filter out loop nodes # return self.filter_standard_nodes(source_nodes) - return source_nodes + return source_nodes def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]": # TODO: filter non-loop nodes @@ -138,7 +137,7 @@ def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]": continue visited.add(current_node_id) - current_node = self.nodes.get(current_node_id) + current_node = self.concrete_nodes.get(current_node_id) if not current_node.is_committed(): if first_non_committed_depth is None: @@ -156,7 +155,7 @@ def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]": to_visit.append((neighbor, depth + 1)) # Increase depth for neighbors return non_committed_nodes - + def get_all_next(self, current_node_id: NodeId, visited=None) -> "set[NodeId]": all_next = set() def reachable_rec(cur, reachable): @@ -181,24 +180,24 @@ def reachable_rec(cur, reachable): for n in self.get_prev_nodes(current_node_id): reachable_rec(n, all_prev) return all_prev - + def get_all_next_uncommitted(self, node_id: NodeId) -> "set[NodeId]": next = self.get_all_next(node_id) - return set([node for node in next if not self.nodes[node].is_committed()]) - + return set([node for node in next if not self.concrete_nodes[node].is_committed()]) + def get_all_previous_uncommitted(self, node_id: NodeId) -> "set[NodeId]": previous = self.get_all_previous(node_id) - return set([node for node in previous if not self.nodes[node].is_committed()]) + return set([node for node in previous if not self.concrete_nodes[node].is_committed()]) def adjust_to_be_resolved_dict_entry(self, node_id: NodeId): - node = self.nodes.get(node_id) + node = self.concrete_nodes.get(node_id) if node.is_committed(): self.to_be_resolved[node_id] = [] elif node.is_ready(): self.to_be_resolved[node_id] = self.get_all_previous_uncommitted(node_id) def init_to_be_resolved_dict(self): - for node_id in self.nodes: + for node_id in self.concrete_nodes: self.adjust_to_be_resolved_dict_entry(node_id) def adjust_to_be_resolved_dict(self): @@ -216,7 +215,7 @@ def valid(self): def fetch_fs_actions(self): for node in self.get_executing_normal_and_spec_nodes(): node.gather_fs_actions() - + def _has_fs_deps(self, node_id: NodeId): node_of_interest : Node = self.get_node(node_id) for nid in self.to_be_resolved[node_id]: @@ -230,9 +229,9 @@ def _has_fs_deps(self, node_id: NodeId): def has_fs_deps(self, node_id:NodeId): self.fetch_fs_actions() self._has_fs_deps(node_id) - + ### external handler events ### - + def schedule_work(self, node_id: NodeId, env_file: str): event_log("schedule_work") self.get_node(node_id).start_executing(env_file) @@ -241,7 +240,7 @@ def schedule_spec_work(self, node_id: NodeId, env_file: str): event_log("schedule_spec") self.adjust_to_be_resolved_dict_entry(node_id) self.get_node(node_id).start_spec_executing(env_file) - + def handle_complete(self, node_id: NodeId, has_pending_wait: bool, current_env: str): event_log(f"handle_complete {node_id}") @@ -279,7 +278,7 @@ def handle_wait(self, node_id: NodeId, env_file: str): if node.is_committed() or node.is_unsafe() or node.is_initialized(): logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}') raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}') - + if node.is_ready(): node.start_executing(env_file) @@ -312,7 +311,7 @@ def handle_wait(self, node_id: NodeId, env_file: str): else: logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}') raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}') - + def eager_fs_killing(self): event_log("try to eagerly kill conflicted speculation") to_be_killed = [] diff --git a/parallel-orch/util.py b/parallel-orch/util.py index dc0abdcc..b9947233 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -12,6 +12,11 @@ from node import Node, NodeId from partial_program_order import PartialProgramOrder +DEBUG_LOG = '[DEBUG_LOG] ' + +def debug_log(s): + logging.debug(DEBUG_LOG + s) + def ptempfile(): fd, name = tempfile.mkstemp(dir=config.PASH_SPEC_TMP_PREFIX) ## TODO: Get a name without opening the fd too if possible @@ -42,11 +47,11 @@ def init_unix_socket(socket_file: str) -> socket.socket: logging.debug("SocketManager: Created socket") sock.bind(server_address) - logging.debug("SocketManager: Successfully bound to socket") + logging.debug("SocketManager: Successfully bound to socket") ## TODO: Check if we need to configure the backlog - sock.listen() - logging.debug("SocketManager: Listenting on socket") + sock.listen() + logging.debug("SocketManager: Listenting on socket") return sock @@ -61,7 +66,7 @@ def socket_get_next_cmd(sock: socket.socket) -> "tuple[socket.socket, str]" : ## ## We need to ensure that we read a command at once or the command was empty (only relevant in the first invocation) assert(str_data.endswith("\n") or str_data == "") - + return (connection, str_data) def socket_respond(connection: socket.socket, message: str): @@ -83,7 +88,7 @@ def parse_env_string_to_dict(content): result = {key: value for key, value in scalar_vars_string} result.update({key: int(value) for key, value in scalar_vars_int}) result.update({key: value for key, value in array_vars}) - + return result def compare_dicts(dict1, dict2): @@ -114,19 +119,19 @@ def set_named_timestamp(action: str, node=None, key=None): if key is None: key = f"{action}{',' + str(node) if node is not None else ''}" config.NAMED_TIMESTAMPS[key] = time.time() - + def invalidate_named_timestamp(action: str, node=None, key=None): if key is None: key = f"{action}{',' + str(node) if node is not None else ''}" del config.NAMED_TIMESTAMPS[key] - + def log_time_delta_from_start_and_set_named_timestamp(module: str, action: str, node=None, key=None): try: set_named_timestamp(action, node, key) logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}") except KeyError: logging.error(f"Named timestamp {key} already exists") - + def log_time_delta_from_named_timestamp(module: str, action: str, node=None, key=None, invalidate=True): try: if key is None: @@ -147,7 +152,7 @@ def get_all_child_processes(pid): parent = psutil.Process(pid) except psutil.NoSuchProcess: return [] - + children = parent.children(recursive=True) parent_of_parent = parent.parent() logging.critical("PARENT_PROCESS: " + str(parent_of_parent)) @@ -240,7 +245,7 @@ def parse_partial_program_order_from_file(file_path: str): edge_lines = lines[loop_context_end:] logging.debug(f'Edges: {edge_lines}') - nodes = {} + ab_nodes = {} for i in range(number_of_nodes): file_path = f'{cmds_directory}/{i}' cmd, asts = parse_cmd_from_file(file_path) @@ -248,16 +253,25 @@ def parse_partial_program_order_from_file(file_path: str): # nodes[NodeId(i)] = Node(NodeId(i), cmd, # asts=asts, # loop_context=LoopStack(loop_ctx)) - nodes[NodeId(i)] = Node(NodeId(i), cmd, asts=asts) + ab_nodes[NodeId(i)] = Node(NodeId(i), cmd, asts=asts) edges = {NodeId(i) : [] for i in range(number_of_nodes)} for edge_line in edge_lines: from_id, to_id = parse_edge_line(edge_line) edges[NodeId(from_id)].append(NodeId(to_id)) - logging.info(f"Nodes|{','.join([str(node) for node in nodes])}") + logging.info(f"Nodes|{','.join([str(node) for node in ab_nodes])}") logging.info(f"Edges|{edges}") - return PartialProgramOrder(nodes, edges) + return PartialProgramOrder(ab_nodes, edges) def generate_id() -> int: return int(time.time() * 1000000) + +# nodes is iterable of node +# edges is dict[node, list[node]] +def invert_graph(nodes, edges): + graph = {n: [] for n in nodes} + for from_id, to_ids in edges.items(): + for to_id in to_ids: + graph[to_id].append(from_id) + return graph From 9151f34d87bb6353d2a3a7e11e5f8e18160d0a11 Mon Sep 17 00:00:00 2001 From: Guest Date: Tue, 6 Feb 2024 00:13:50 -0500 Subject: [PATCH 23/39] minimal working loop --- parallel-orch/executor.py | 18 +-- parallel-orch/node.py | 183 +++++++++++++++++++++---- parallel-orch/partial_program_order.py | 165 +++++++++++++--------- parallel-orch/scheduler_server.py | 43 +++--- parallel-orch/util.py | 12 +- 5 files changed, 292 insertions(+), 129 deletions(-) diff --git a/parallel-orch/executor.py b/parallel-orch/executor.py index 52bc3d9f..2ec5e962 100644 --- a/parallel-orch/executor.py +++ b/parallel-orch/executor.py @@ -8,23 +8,23 @@ # and traces them with Riker. # All commands are run inside an overlay sandbox. -def async_run_and_trace_command_return_trace(command, node_id, execution_id, pre_execution_env_file, speculate_mode=False): +def async_run_and_trace_command_return_trace(command, concrete_node_id, execution_id, pre_execution_env_file, speculate_mode=False): trace_file = util.ptempfile() stdout_file = util.ptempfile() stderr_file = util.ptempfile() post_execution_env_file = util.ptempfile() sandbox_dir, tmp_dir = util.create_sandbox() - logging.debug(f'Scheduler: Stdout file for: {node_id} is: {stdout_file}') - logging.debug(f'Scheduler: Stderr file for: {node_id} is: {stderr_file}') - logging.debug(f'Scheduler: Trace file for: {node_id}: {trace_file}') - process = async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode) + logging.debug(f'Scheduler: Stdout file for: {concrete_node_id} is: {stdout_file}') + logging.debug(f'Scheduler: Stderr file for: {concrete_node_id} is: {stderr_file}') + logging.debug(f'Scheduler: Trace file for: {concrete_node_id}: {trace_file}') + process = async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, trace_file, concrete_node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode) return process, trace_file, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir -def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, execution_id, node_id, pre_execution_env_file): - process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, execution_id, node_id, pre_execution_env_file, speculate_mode=True) +def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, execution_id, concrete_node_id, pre_execution_env_file): + process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, execution_id, concrete_node_id, pre_execution_env_file, speculate_mode=True) return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir -def async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False): +def async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, trace_file, concrete_node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False): ## Call Riker to execute the command run_script = f'{config.PASH_SPEC_TOP}/parallel-orch/run_command.sh' args = ["/bin/bash", run_script, command, trace_file, stdout_file, pre_execution_env_file, sandbox_dir, tmp_dir] @@ -32,7 +32,7 @@ def async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, t args.append("speculate") else: args.append("standard") - args.append(str(node_id)) + args.append(str(concrete_node_id)) args.append(post_execution_env_file) args.append(str(execution_id)) # Save output to temporary files to not saturate the memory diff --git a/parallel-orch/node.py b/parallel-orch/node.py index 6b83873e..914f5f44 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -1,3 +1,4 @@ +from itertools import chain import logging import re import executor @@ -69,9 +70,6 @@ def __str__(self): class NodeId: - - #TODO: Implement iteration support - def __init__(self, id_: int): self.id_ = id_ @@ -103,27 +101,6 @@ def __gt__(self, obj): def parse_node_id(node_id_str: str): return NodeId(int(node_id_str)) -class AbstractNode: - def __init__(self, node_id: NodeId): - self.node_id = node_id - -class HSBasicBlock: - def __init__(self, nodes): - self.nodes = nodes - -class HSProg: - abstract_nodes: "dict[NodeId, AbstractNode]" - adjacency: "dict[NodeId, list[NodeId]]" - inverse_adjacency: "dict[NodeId, list[NodeId]]" - def __init__(self, abstract_nodes: dict[NodeId, AbstractNode], - edges: dict[NodeId, list[NodeId]]): - self.abstract_nodes = abstract_nodes - self.adjacency = edges - self.inverse_adjacency = util.invert_graph(abstract_nodes, edges) - - - - @dataclass class ExecCtxt: process: Popen @@ -139,14 +116,59 @@ class ExecResult: exit_code: int proc_id: int +class LoopStack: + def __init__(self, loop_contexts_or_iters=None): + if loop_contexts_or_iters is None: + self.loops = [] + else: + self.loops = loop_contexts_or_iters + + def __repr__(self): + ## TODO: Represent it using 'it', 'it0', 'it1', etc + ## or -(iters)- in front of it. + output = "-".join([str(it) for it in self.loops]) + return output + def __eq__(self, other): + return self.loops == other.loops + @dataclass class Node: id_: NodeId cmd: str asts: "list[AstNode]" + loop_context: LoopStack + + def __init__(self, id_, cmd, asts, loop_context=None): + self.id_ = id_ + self.cmd = cmd + self.asts = asts + self.loop_context = loop_context if loop_context else LoopStack() + +class ConcreteNodeId: + def __init__(self, node_id: NodeId, loop_iters = list()): + self.node_id = node_id + self.loop_iters = tuple(loop_iters) + + def __repr__(self): + return f'cnid({self.node_id.id_})' + + def __hash__(self): + return hash((self.node_id, self.loop_iters)) + + def __eq__(self, other): + return self.node_id == other.node_id and self.loop_iters == other.loop_iters + + def __str__(self): + return f'{self.node_id}@' + ''.join(['-' + str(n) for n in self.loop_iters]) + @staticmethod + def parse(input_str): + node_id_str, loop_iters_str = input_str.split('@') + return ConcreteNodeId(NodeId(int(node_id_str)), [int(cnt) for cnt in loop_iters_str.split('-')[1:]]) + class ConcreteNode: - abstract_node: AbstractNode + cnid: ConcreteNodeId + abstract_node: Node state: NodeState # Used for identifying the most recent valid execution exec_id: int @@ -164,7 +186,8 @@ class ConcreteNode: exec_ctxt: ExecCtxt exec_result: ExecResult - def __init__(self, node: Node): + def __init__(self, cnid: ConcreteNodeId, node: Node): + self.cnid = cnid self.abstract_node = node self.state = NodeState.INIT self.tracefile = None @@ -226,7 +249,7 @@ def start_command(self, env_file: str, speculate=False): execute_func = executor.async_run_and_trace_command_return_trace # Set the execution id self.exec_id = util.generate_id() - self.exec_ctxt = ExecCtxt(*execute_func(cmd, self.id_, self.exec_id, env_file)) + self.exec_ctxt = ExecCtxt(*execute_func(cmd, self.cnid, self.exec_id, env_file)) def execution_outcome(self) -> Tuple[int, str, str]: assert self.exec_result is not None @@ -364,3 +387,109 @@ def parse_env(content): with open(other_env, 'r') as file: other_env_vars = parse_env(file.read()) return node_env_vars != other_env_vars + + +class HSBasicBlock: + def __init__(self, nodes: list[Node]): + if len(nodes) == 0: + raise ValueError('basic block size 0') + self.nodes = nodes + + def __str__(self): + return ''.join([node.cmd for node in self.nodes]) + + @property + def loop_context(self): + return self.nodes[0].loop_context + + @property + def node_ids(self): + return [node.id_ for node in self.nodes] + + def get_node(self, node_id: NodeId) -> Node: + nodes = [node for node in self.nodes if node.id_ == node_id] + assert len(nodes) == 1 + return nodes[0] + +class HSProg: + abstract_nodes: "dict[NodeId, Node]" + adjacency: "dict[NodeId, list[NodeId]]" + inverse_adjacency: "dict[NodeId, list[NodeId]]" + basic_blocks: list[HSBasicBlock] = [] + block_adjacency: "dict[int, list[int]]" + BB_ENTER = -1 + BB_EXIT = -2 + + def __init__(self, abstract_nodes: dict[NodeId, Node], + edges: dict[NodeId, list[NodeId]]): + self.abstract_nodes = abstract_nodes + self.adjacency = edges + self.inverse_adjacency = util.invert_graph(abstract_nodes, edges) + self.construct_basic_blocks() + util.debug_log(str(self)) + + def construct_basic_blocks(self): + node_list = [] + block_id = LoopStack() + for node in self.abstract_nodes.values(): + if node.loop_context == block_id: + node_list.append(node) + else: + basic_block = HSBasicBlock(node_list) + self.basic_blocks.append(basic_block) + node_list = [node] + block_id = node.loop_context + basic_block = HSBasicBlock(node_list) + self.basic_blocks.append(basic_block) + if len(self.basic_blocks) == 0: + raise ValueError('empty hsprog') + + # TODO: the algorithm here is wrong, + # echo 1 + # for i in {1..n}; do + # echo 2 + # done + # for i in {1..m}; do + # echo 3 + # done + # echo 4 + # + # echo 1 can goto echo 2, echo 3, or echo 4 + self.block_adjacency = {} + prev_blocks = {tuple(): self.basic_blocks[0]} + for bb_id, bb in enumerate(self.basic_blocks): + # the fallthrough edge + if bb_id != len(self.basic_blocks) - 1: + self.block_adjacency[bb_id] = [bb_id + 1] + else: + self.block_adjacency[bb_id] = [HSProg.BB_EXIT] + break + + for next_bb_id in chain(range(bb_id + 1, len(self.basic_blocks)), + range(0, bb_id + 1)): + next_bb = self.basic_blocks[next_bb_id] + if next_bb.loop_context == bb.loop_context: + self.block_adjacency[bb_id].append(next_bb_id) + break + else: + raise ValueError('no jump block') + + def is_start_of_block(self, node_id: NodeId): + for bb in self.basic_blocks: + bb : HSBasicBlock + if bb.nodes[0].id_ == node_id: + return True + return False + + def find_basic_block(self, node_id: NodeId): + for bb in self.basic_blocks: + bb : HSBasicBlock + for node in bb.nodes: + if node.id_ == node_id: + return bb + raise ValueError('no such node_id') + + def __str__(self): + return 'prog:\n' + '\n'.join( + [f'block {i}:\n' + str(bb) + f'goto block {self.block_adjacency[i]}\n' for i, bb in enumerate(self.basic_blocks)]) + diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index a3f545fd..03da8a47 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -1,5 +1,6 @@ -from node import NodeId, Node, ConcreteNode, HSProg +from node import NodeId, Node, ConcreteNodeId, ConcreteNode, HSProg, HSBasicBlock import logging +import util from collections import deque PROG_LOG = '[PROG_LOG] ' @@ -17,27 +18,32 @@ class PartialProgramOrder: # Later we can add this back as a caching mechanism to avoid doing RWSet # intersections of files all the time # run_after: "dict[NodeId, list[Node]]" # Nodes that should run after certain conditions - to_be_resolved: "dict[NodeId, list[Node]]" # Mapping of nodes to lists of uncommitted nodes + + # Mapping of concrete nodes to lists of uncommitted concrete nodes the precedes them. + # It is the snapshot of the reachable uncommited concrete nodes from prev_concrete_node graph + # at the time the concrete node enters execution. So if there is fs conflict in them, + # it needs to be rerun + to_be_resolved: "dict[NodeId, list[Node]]" concrete_nodes: "dict[NodeId, Node]" def __init__(self, abstract_nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId]]"): self.hsprog = HSProg(abstract_nodes, edges) - self.concrete_nodes = {node_id: ConcreteNode(ab_node) for node_id, ab_node - in abstract_nodes.items()} + self.concrete_nodes: dict[ConcreteNodeId, ConcreteNode] = {} self.frontier = set() # self.run_after = {} - self.to_be_resolved = {} + self.prev_concrete_node: dict[ConcreteNodeId, list[ConcreteNodeId]] = {} + self.to_be_resolved: dict[ConcreteNodeId, list[ConcreteNodeId]] = {} - def init_partial_order(self): - for node_id, node in self.concrete_nodes.items(): - if node.is_initialized(): - node.transition_from_init_to_ready() + # def init_partial_order(self): + # for node_id, node in self.concrete_nodes.items(): + # if node.is_initialized(): + # node.transition_from_init_to_ready() - self.init_to_be_resolved_dict() - logging.info(self.to_be_resolved) - # Init frontier - self.frontier = self.get_standard_source_nodes() - # TODO: Implement the rest of the partial order initialization + # self.init_to_be_resolved_dict() + # logging.info(self.to_be_resolved) + # # Init frontier + # self.frontier = self.get_standard_source_nodes() + # # TODO: Implement the rest of the partial order initialization @property def abstract_nodes(self): @@ -56,9 +62,9 @@ def commit_node(self, node): node.transition_to_committed() # Maybe update dependencies here # etc. - - def get_node(self, node_id: NodeId) -> Node: - return self.concrete_nodes[node_id] + + def get_concrete_node(self, concrete_node_id: ConcreteNodeId) -> ConcreteNode: + return self.concrete_nodes[concrete_node_id] def get_all_nodes(self): return [node for node in self.concrete_nodes.values()] @@ -67,7 +73,7 @@ def get_committed_nodes(self): return [node for node in self.concrete_nodes.values() if node.is_committed()] def get_ready_nodes(self): - return [node for node in self.concrete_nodes.values() if node.is_ready()] + return [(cnid, n) for cnid, n in self.concrete_nodes.items() if n.is_ready()] def get_executing_nodes(self): return [node for node in self.concrete_nodes.values() if node.is_executing()] @@ -98,8 +104,8 @@ def log_state(self): progress_log(node.pretty_state_repr()) progress_log('') - def get_schedulable_nodes(self) -> list[NodeId]: - return [node.id_ for node in self.get_ready_nodes()] + def get_schedulable_nodes(self) -> list[ConcreteNodeId]: + return [concrete_node_id for concrete_node_id, _ in self.get_ready_nodes()] ## Returns the next non-committed normal node def progress_frontier(self) -> "list[NodeId]": @@ -108,8 +114,8 @@ def progress_frontier(self) -> "list[NodeId]": def get_next_nodes(self, node_id:NodeId) -> "list[NodeId]": return self.adjacency[node_id][:] - def get_prev_nodes(self, node_id:NodeId) -> "list[NodeId]": - return self.inverse_adjacency[node_id][:] + def get_prev_nodes(self, concrete_node_id: ConcreteNodeId) -> "list[ConcreteNodeId]": + return self.prev_concrete_node[concrete_node_id][:] def get_source_nodes(self) -> "list[NodeId]": sources = set() @@ -156,7 +162,7 @@ def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]": return non_committed_nodes - def get_all_next(self, current_node_id: NodeId, visited=None) -> "set[NodeId]": + def get_all_next(self, current_node_id: ConcreteNodeId, visited=None) -> "set[NodeId]": all_next = set() def reachable_rec(cur, reachable): if cur in reachable: @@ -169,7 +175,7 @@ def reachable_rec(cur, reachable): return all_next - def get_all_previous(self, current_node_id: NodeId, visited=None) -> "set[NodeId]": + def get_all_previous(self, current_node_id: ConcreteNodeId, visited=None) -> "set[NodeId]": all_prev = set() def reachable_rec(cur, reachable): if cur in reachable: @@ -181,20 +187,21 @@ def reachable_rec(cur, reachable): reachable_rec(n, all_prev) return all_prev - def get_all_next_uncommitted(self, node_id: NodeId) -> "set[NodeId]": - next = self.get_all_next(node_id) - return set([node for node in next if not self.concrete_nodes[node].is_committed()]) + # TODO: fixme + # def get_all_next_uncommitted(self, node_id: NodeId) -> "set[NodeId]": + # next = self.get_all_next(node_id) + # return set([node for node in next if not self.concrete_nodes[node].is_committed()]) - def get_all_previous_uncommitted(self, node_id: NodeId) -> "set[NodeId]": - previous = self.get_all_previous(node_id) - return set([node for node in previous if not self.concrete_nodes[node].is_committed()]) + def get_all_previous_uncommitted(self, concrete_node_id: ConcreteNodeId) -> "set[ConcreteNodeId]": + previous = self.get_all_previous(concrete_node_id) + return set([cnid for cnid in previous if not self.concrete_nodes[cnid].is_committed()]) - def adjust_to_be_resolved_dict_entry(self, node_id: NodeId): - node = self.concrete_nodes.get(node_id) + def adjust_to_be_resolved_dict_entry(self, concrete_node_id: ConcreteNodeId): + node = self.concrete_nodes.get(concrete_node_id) if node.is_committed(): - self.to_be_resolved[node_id] = [] + self.to_be_resolved[concrete_node_id] = [] elif node.is_ready(): - self.to_be_resolved[node_id] = self.get_all_previous_uncommitted(node_id) + self.to_be_resolved[concrete_node_id] = self.get_all_previous_uncommitted(concrete_node_id) def init_to_be_resolved_dict(self): for node_id in self.concrete_nodes: @@ -216,10 +223,10 @@ def fetch_fs_actions(self): for node in self.get_executing_normal_and_spec_nodes(): node.gather_fs_actions() - def _has_fs_deps(self, node_id: NodeId): - node_of_interest : Node = self.get_node(node_id) - for nid in self.to_be_resolved[node_id]: - node: Node = self.get_node(nid) + def _has_fs_deps(self, concrete_node_id: ConcreteNodeId): + node_of_interest : ConcreteNode = self.get_concrete_node(concrete_node_id) + for nid in self.to_be_resolved[concrete_node_id]: + node: ConcreteNode = self.get_concrete_node(nid) if node.get_rw_set().has_conflict(node_of_interest.get_rw_set()): return True return False @@ -234,17 +241,17 @@ def has_fs_deps(self, node_id:NodeId): def schedule_work(self, node_id: NodeId, env_file: str): event_log("schedule_work") - self.get_node(node_id).start_executing(env_file) + self.get_concrete_node(node_id).start_executing(env_file) - def schedule_spec_work(self, node_id: NodeId, env_file: str): + def schedule_spec_work(self, concrete_node_id: ConcreteNodeId, env_file: str): event_log("schedule_spec") - self.adjust_to_be_resolved_dict_entry(node_id) - self.get_node(node_id).start_spec_executing(env_file) + self.adjust_to_be_resolved_dict_entry(concrete_node_id) + self.get_concrete_node(concrete_node_id).start_spec_executing(env_file) def handle_complete(self, node_id: NodeId, has_pending_wait: bool, current_env: str): event_log(f"handle_complete {node_id}") - node = self.get_node(node_id) + node = self.get_concrete_node(node_id) # TODO: complete the state matching if node.is_executing(): node.commit_frontier_execution() @@ -264,39 +271,69 @@ def handle_complete(self, node_id: NodeId, has_pending_wait: bool, assert False def reset_succeeding_nodes(self, node_id: NodeId, env_file: str): - for uncommitted_node_id in self.get_all_next(node_id): - uncommitted_node = self.get_node(uncommitted_node_id) - if uncommitted_node.is_spec_executing(): - uncommitted_node.reset_to_ready() - # uncommitted_node.start_spec_executing(env_file) - - def handle_wait(self, node_id: NodeId, env_file: str): - event_log(f"handle_wait {node_id}") - node = self.get_node(node_id) + # TODO: fixme + pass + # for uncommitted_node_id in self.get_all_next(node_id): + # uncommitted_node = self.get_concrete_node(uncommitted_node_id) + # if uncommitted_node.is_spec_executing(): + # uncommitted_node.reset_to_ready() + # # uncommitted_node.start_spec_executing(env_file) + + def adding_new_basic_block(self, concrete_node_id: ConcreteNodeId): + basic_block = self.hsprog.find_basic_block(concrete_node_id.node_id) + if len(self.concrete_nodes) != 0: + prev_concrete_node_id = next(reversed(self.concrete_nodes)) + else: + prev_concrete_node_id = None + loop_iters = concrete_node_id.loop_iters + for abstract_node_id in basic_block.node_ids: + new_concrete_node_id = ConcreteNodeId(abstract_node_id, loop_iters) + new_concrete_node = ConcreteNode(new_concrete_node_id, + basic_block.get_node(abstract_node_id)) + new_concrete_node.transition_from_init_to_ready() + self.concrete_nodes[new_concrete_node_id] = new_concrete_node + if prev_concrete_node_id is not None: + self.prev_concrete_node[new_concrete_node_id] = [prev_concrete_node_id] + else: + self.prev_concrete_node[new_concrete_node_id] = [] + prev_concrete_node_id = new_concrete_node_id + assert concrete_node_id in self.concrete_nodes + + def handle_wait(self, concrete_node_id: ConcreteNodeId, env_file: str): + event_log(f"handle_wait {concrete_node_id}") + + if not concrete_node_id in self.concrete_nodes: + abstract_node_id = concrete_node_id.node_id + assert self.hsprog.is_start_of_block(abstract_node_id) + self.adding_new_basic_block(concrete_node_id) + util.debug_log("try to add concrete node here") + util.debug_log(repr(self.prev_concrete_node)) + util.debug_log("") + node = self.get_concrete_node(concrete_node_id) # Invalid state check if node.is_committed() or node.is_unsafe() or node.is_initialized(): - logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}') - raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}') - + logging.error(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}') + raise Exception(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}') if node.is_ready(): node.start_executing(env_file) elif node.is_stopped(): if node in self.get_frontier(): - logging.info(f'Node {node_id} is stopped and in the frontier.') + logging.info(f'Node {concrete_node_id} is stopped and in the frontier.') node.transition_from_stopped_to_executing(env_file) else: - logging.info(f'Node {node_id} is stopped but not in the frontier.') + logging.info(f'Node {concrete_node_id} is stopped but not in the frontier.') elif node.is_speculated(): # Check if env conflicts exist if node.has_env_conflict_with(env_file): + util.debug_log(f'prev_env: {node.exec_ctxt.pre_env_file}, real: {env_file}') node.reset_to_ready() node.start_executing(env_file) - self.reset_succeeding_nodes(node_id, env_file) + self.reset_succeeding_nodes(concrete_node_id, env_file) # Optimization: It would make sense to perform the checks independently, # and if fs conflict, then update the run after dict. - elif self.has_fs_deps(node_id): + elif self.has_fs_deps(concrete_node_id): node.reset_to_ready() node.start_executing(env_file) else: @@ -304,13 +341,13 @@ def handle_wait(self, node_id: NodeId, env_file: str): self.adjust_to_be_resolved_dict() elif node.is_executing(): if node.has_env_conflict_with(env_file): - self.reset_succeeding_nodes(node_id, env_file) + self.reset_succeeding_nodes(concrete_node_id, env_file) elif node.is_spec_executing(): if node.has_env_conflict_with(env_file): - self.reset_succeeding_nodes(node_id, env_file) + self.reset_succeeding_nodes(concrete_node_id, env_file) else: - logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}') - raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}') + logging.error(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}') + raise Exception(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}') def eager_fs_killing(self): event_log("try to eagerly kill conflicted speculation") @@ -318,7 +355,7 @@ def eager_fs_killing(self): self.fetch_fs_actions() for node in self.get_all_nodes(): if ((node.is_speculated() or node.is_spec_executing()) - and self._has_fs_deps(node.id_)): + and self._has_fs_deps(node.cnid)): to_be_killed.append(node) for node in to_be_killed: node.reset_to_ready() diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 8e67f539..2a7cc5a2 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -4,6 +4,7 @@ import util import config from partial_program_order import PartialProgramOrder, NodeId +from node import LoopStack, ConcreteNodeId ## ## A scheduler server @@ -79,15 +80,15 @@ def handle_init(self, input_cmd: str): partial_order_file = input_cmd.split(":")[1].rstrip() logging.debug(f'Scheduler: Received partial_order_file: {partial_order_file}') self.partial_program_order = util.parse_partial_program_order_from_file(partial_order_file) - self.partial_program_order.init_partial_order() + util.debug_log(str(self.partial_program_order.hsprog)) def handle_wait(self, input_cmd: str, connection): - node_id, env_file = self.__parse_wait(input_cmd) - self.waiting_for_response[node_id] = connection - logging.info(f'Scheduler: Received wait message - {node_id}.') + concrete_node_id, env_file = self.__parse_wait(input_cmd) + self.waiting_for_response[concrete_node_id] = connection + logging.info(f'Scheduler: Received wait message - {concrete_node_id}.') self.latest_env = env_file - self.partial_program_order.handle_wait(node_id, env_file) - if self.partial_program_order.get_node(node_id).is_committed(): + self.partial_program_order.handle_wait(concrete_node_id, env_file) + if self.partial_program_order.get_concrete_node(concrete_node_id).is_committed(): self.respond_to_pending_wait(node_id) def process_next_cmd(self): @@ -101,11 +102,11 @@ def process_next_cmd(self): connection.close() elif (input_cmd.startswith("CommandExecComplete:")): node_id, exec_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) - if self.partial_program_order.get_node(node_id).exec_id == exec_id: + if self.partial_program_order.get_concrete_node(node_id).exec_id == exec_id: logging.info(f'Scheduler: Received command exec complete message - {node_id}.') self.partial_program_order.handle_complete(node_id, node_id in self.waiting_for_response, self.latest_env) - if self.partial_program_order.get_node(node_id).is_committed(): + if self.partial_program_order.get_concrete_node(node_id).is_committed(): self.respond_to_pending_wait(node_id) else: logging.info(f'Scheduler: Received command exec complete message for a killed instance, ignoring - {node_id}.') @@ -116,6 +117,7 @@ def process_next_cmd(self): self.partial_program_order.log_info() self.done = True elif input_cmd.startswith("CommandExecStart:"): + assert False node_id, exec_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) logging.info(f'Scheduler: Received command exec start message - {input_cmd}.') # self.handle_command_exec_start(input_cmd) @@ -133,7 +135,7 @@ def respond_to_frontend_core(self, node_id: NodeId, response: str): def respond_to_pending_wait(self, node_id: int): logging.debug(f'Responding to pending wait for node: {node_id}') ## Get the completed node info - node = self.partial_program_order.get_node(node_id) + node = self.partial_program_order.get_concrete_node(node_id) msg = '{} {} {}'.format(*node.execution_outcome()) response = success_response(msg) @@ -143,24 +145,21 @@ def respond_to_pending_wait(self, node_id: int): def __parse_wait(self, input_cmd: str) -> "tuple[NodeId, str]": try: node_id_component, loop_iter_counter_component, pash_runtime_vars_file_component = input_cmd.rstrip().split("|") - raw_node_id_int = int(node_id_component.split(":")[1].rstrip()) + node_id = NodeId(int(node_id_component.split(":")[1].rstrip())) loop_counters_str = loop_iter_counter_component.split(":")[1].rstrip() - pash_runtime_vars_file_str = pash_runtime_vars_file_component.split(":")[1].rstrip() - # TODO Implement loops correctly - # if loop_counters_str == "None": - # node_id = NodeId(raw_node_id_int), pash_runtime_vars_file_str - # else: - # loop_counters = [int(cnt) for cnt in loop_counters_str.split("-")] - # node_id = NodeId(raw_node_id_int, LoopStack(loop_counters)), pash_runtime_vars_file_str - node_id = NodeId(raw_node_id_int), pash_runtime_vars_file_str - return node_id + pash_env_filename = pash_runtime_vars_file_component.split(":")[1].rstrip() + if loop_counters_str == "None": + return ConcreteNodeId(node_id), pash_env_filename + else: + loop_counters = [int(cnt) for cnt in loop_counters_str.split("-")] + return ConcreteNodeId(node_id, loop_counters), pash_env_filename except: raise Exception(f'Parsing failure for line: {input_cmd}') def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]": try: components = input_cmd.rstrip().split("|") - command_id = NodeId.parse_node_id(components[0].split(":")[1]) + command_id = ConcreteNodeId.parse(components[0].split(":")[1]) exec_id = int(components[1].split(":")[1]) exit_code = int(components[2].split(":")[1]) sandbox_dir = components[3].split(":")[1] @@ -171,8 +170,8 @@ def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]": def schedule_work(self): - nodes = self.partial_program_order.get_schedulable_nodes() - for n in nodes[:2]: + concrete_node_ids = self.partial_program_order.get_schedulable_nodes() + for n in concrete_node_ids[:2]: self.partial_program_order.schedule_spec_work(n, self.latest_env) def run(self): diff --git a/parallel-orch/util.py b/parallel-orch/util.py index b9947233..3886a816 100644 --- a/parallel-orch/util.py +++ b/parallel-orch/util.py @@ -9,7 +9,7 @@ import psutil import signal import analysis -from node import Node, NodeId +from node import Node, NodeId, LoopStack from partial_program_order import PartialProgramOrder DEBUG_LOG = '[DEBUG_LOG] ' @@ -214,7 +214,6 @@ def parse_loop_contexts(lines): loop_contexts[node_id] = loop_ctx return loop_contexts - def parse_partial_program_order_from_file(file_path: str): with open(file_path) as f: raw_lines = f.readlines() @@ -249,11 +248,10 @@ def parse_partial_program_order_from_file(file_path: str): for i in range(number_of_nodes): file_path = f'{cmds_directory}/{i}' cmd, asts = parse_cmd_from_file(file_path) - # loop_ctx = loop_contexts[i] - # nodes[NodeId(i)] = Node(NodeId(i), cmd, - # asts=asts, - # loop_context=LoopStack(loop_ctx)) - ab_nodes[NodeId(i)] = Node(NodeId(i), cmd, asts=asts) + loop_ctx = loop_contexts[i] + ab_nodes[NodeId(i)] = Node(NodeId(i), cmd.strip(), + asts=asts, + loop_context=LoopStack(loop_ctx)) edges = {NodeId(i) : [] for i in range(number_of_nodes)} for edge_line in edge_lines: From 87c68f5a2a75b228b2b61f4ec0d091078748e915 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 6 Feb 2024 07:06:30 -0700 Subject: [PATCH 24/39] Add ignore vars in env resolution --- parallel-orch/node.py | 25 ++++++++++++++++++++++--- parallel-orch/scheduler_server.py | 2 +- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/parallel-orch/node.py b/parallel-orch/node.py index 914f5f44..c94a5f0a 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -362,8 +362,13 @@ def has_env_conflict_with(self, other_env) -> bool: if self.exec_ctxt.pre_env_file == other_env: return False - ignore_vars = set(['RANDOM']) - + ignore_vars = set(["_", 'RANDOM', "msg", "pash_runtime_final_status", "pash_previous_set_status", + "pash_runtime_shell_variables_file", "from_set", "output_variable_file", + "pash_loop_iter_counters", "daemon_response", "vars_file", + "pash_speculative_command_id", "prev_env", "PREVIOUS_SET_STATUS", + "BASH_LINENO", "response_args", "stdout_file", "pash_spec_command_id", + "cmd_exit_code", "pash_set_to_add"]) + re_scalar_string = re.compile(r'declare (?:-x|--)? (\w+)="([^"]*)"') re_scalar_int = re.compile(r'declare -i (\w+)="(\d+)"') re_array = re.compile(r'declare -a (\w+)=(\([^)]+\))') @@ -386,7 +391,21 @@ def parse_env(content): with open(other_env, 'r') as file: other_env_vars = parse_env(file.read()) - return node_env_vars != other_env_vars + + conflict_exists = False + for key in set(node_env_vars.keys()).union(other_env_vars.keys()): + if key not in node_env_vars: + logging.critical(f"Variable {key} missing in node environment") + conflict_exists = True + elif key not in other_env_vars: + logging.critical(f"Variable {key} missing in other environment") + conflict_exists = True + elif node_env_vars[key] != other_env_vars[key]: + logging.critical(f"Variable {key} differs: node environment has {node_env_vars[key]}, other has {other_env_vars[key]}") + conflict_exists = True + + return conflict_exists + class HSBasicBlock: diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 2a7cc5a2..ba109ba6 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -89,7 +89,7 @@ def handle_wait(self, input_cmd: str, connection): self.latest_env = env_file self.partial_program_order.handle_wait(concrete_node_id, env_file) if self.partial_program_order.get_concrete_node(concrete_node_id).is_committed(): - self.respond_to_pending_wait(node_id) + self.respond_to_pending_wait(concrete_node_id) def process_next_cmd(self): connection, input_cmd = util.socket_get_next_cmd(self.socket) From 3e76190c36b05b02651bcc8cc5e37ddd8a07b90d Mon Sep 17 00:00:00 2001 From: gliargovas Date: Tue, 6 Feb 2024 07:23:12 -0700 Subject: [PATCH 25/39] Reset current node on env conflict --- parallel-orch/partial_program_order.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 03da8a47..c74cad30 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -341,9 +341,12 @@ def handle_wait(self, concrete_node_id: ConcreteNodeId, env_file: str): self.adjust_to_be_resolved_dict() elif node.is_executing(): if node.has_env_conflict_with(env_file): + node.reset_to_ready() + node.start_executing(env_file) self.reset_succeeding_nodes(concrete_node_id, env_file) elif node.is_spec_executing(): if node.has_env_conflict_with(env_file): + node.reset_to_ready() self.reset_succeeding_nodes(concrete_node_id, env_file) else: logging.error(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}') From bf3c212c76e7f8390adae1c6b40f57e939cb5d2c Mon Sep 17 00:00:00 2001 From: Guest Date: Tue, 6 Feb 2024 10:43:29 -0500 Subject: [PATCH 26/39] fix empty first bb --- parallel-orch/node.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/parallel-orch/node.py b/parallel-orch/node.py index c94a5f0a..3a1b3c7a 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -415,7 +415,7 @@ def __init__(self, nodes: list[Node]): self.nodes = nodes def __str__(self): - return ''.join([node.cmd for node in self.nodes]) + return ''.join([node.cmd.strip() + '\n' for node in self.nodes]) @property def loop_context(self): @@ -429,7 +429,7 @@ def get_node(self, node_id: NodeId) -> Node: nodes = [node for node in self.nodes if node.id_ == node_id] assert len(nodes) == 1 return nodes[0] - + class HSProg: abstract_nodes: "dict[NodeId, Node]" adjacency: "dict[NodeId, list[NodeId]]" @@ -451,11 +451,15 @@ def construct_basic_blocks(self): node_list = [] block_id = LoopStack() for node in self.abstract_nodes.values(): - if node.loop_context == block_id: + if (node.loop_context == block_id and + not (len(node_list) >= 1 and node_list[-1].cmd == 'break')): node_list.append(node) else: - basic_block = HSBasicBlock(node_list) - self.basic_blocks.append(basic_block) + if len(node_list) != 0: + # This branch happens for conditional at the beginning + # of the program + basic_block = HSBasicBlock(node_list) + self.basic_blocks.append(basic_block) node_list = [node] block_id = node.loop_context basic_block = HSBasicBlock(node_list) From 5116a34985cea0fb5433ef76fed6ae60e3f7b543 Mon Sep 17 00:00:00 2001 From: Guest Date: Tue, 6 Feb 2024 15:54:09 -0500 Subject: [PATCH 27/39] fix the break and unsafe command for now --- parallel-orch/node.py | 12 ++++++++++++ parallel-orch/partial_program_order.py | 12 ++++++++++-- parallel-orch/scheduler_server.py | 19 +++++++++++-------- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/parallel-orch/node.py b/parallel-orch/node.py index 3a1b3c7a..65c4de42 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -10,6 +10,7 @@ from typing import Tuple from enum import Enum, auto import util +import analysis class NodeState(Enum): INIT = auto() @@ -255,6 +256,9 @@ def execution_outcome(self) -> Tuple[int, str, str]: assert self.exec_result is not None return self.exec_result.exit_code, self.exec_ctxt.post_env_file, self.exec_ctxt.stdout + def command_unsafe(self): + return not analysis.safe_to_execute(self.asts, {}) + ## ## ## Transition Functions ## @@ -263,8 +267,13 @@ def execution_outcome(self) -> Tuple[int, str, str]: def transition_from_init_to_ready(self): assert self.state == NodeState.INIT self.state = NodeState.READY + self.rwset = RWSet(set(), set()) # Also, probably unroll here? + def transition_from_ready_to_unsafe(self): + assert self.state == NodeState.READY + self.state = NodeState.UNSAFE + def kill(self): assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING] self.exec_ctxt.process.kill() @@ -335,6 +344,9 @@ def transition_to_committed(self): def transition_from_spec_executing_to_speculated(self): pass + def commit_unsafe_node(self): + assert self.state == NodeState.UNSAFE + self.state = NodeState.COMMITTED def update_rw_set(self, rw_set): self.rwset = rw_set diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index c74cad30..7bbdde5f 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -291,6 +291,8 @@ def adding_new_basic_block(self, concrete_node_id: ConcreteNodeId): new_concrete_node = ConcreteNode(new_concrete_node_id, basic_block.get_node(abstract_node_id)) new_concrete_node.transition_from_init_to_ready() + if new_concrete_node.command_unsafe(): + new_concrete_node.transition_from_ready_to_unsafe() self.concrete_nodes[new_concrete_node_id] = new_concrete_node if prev_concrete_node_id is not None: self.prev_concrete_node[new_concrete_node_id] = [prev_concrete_node_id] @@ -298,7 +300,11 @@ def adding_new_basic_block(self, concrete_node_id: ConcreteNodeId): self.prev_concrete_node[new_concrete_node_id] = [] prev_concrete_node_id = new_concrete_node_id assert concrete_node_id in self.concrete_nodes - + + def finish_wait_unsafe(self, concrete_node_id: ConcreteNodeId): + node = self.concrete_nodes[concrete_node_id] + node.commit_unsafe_node() + def handle_wait(self, concrete_node_id: ConcreteNodeId, env_file: str): event_log(f"handle_wait {concrete_node_id}") @@ -312,12 +318,14 @@ def handle_wait(self, concrete_node_id: ConcreteNodeId, env_file: str): node = self.get_concrete_node(concrete_node_id) # Invalid state check - if node.is_committed() or node.is_unsafe() or node.is_initialized(): + if node.is_committed() or node.is_initialized(): logging.error(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}') raise Exception(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}') if node.is_ready(): node.start_executing(env_file) + elif node.is_unsafe(): + pass elif node.is_stopped(): if node in self.get_frontier(): logging.info(f'Node {concrete_node_id} is stopped and in the frontier.') diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index ba109ba6..1c2025d8 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -88,8 +88,12 @@ def handle_wait(self, input_cmd: str, connection): logging.info(f'Scheduler: Received wait message - {concrete_node_id}.') self.latest_env = env_file self.partial_program_order.handle_wait(concrete_node_id, env_file) - if self.partial_program_order.get_concrete_node(concrete_node_id).is_committed(): + concrete_node = self.partial_program_order.get_concrete_node(concrete_node_id) + if concrete_node.is_committed(): self.respond_to_pending_wait(concrete_node_id) + elif concrete_node.is_unsafe(): + self.partial_program_order.finish_wait_unsafe(concrete_node_id) + self.respond_to_wait_on_unsafe(concrete_node_id) def process_next_cmd(self): connection, input_cmd = util.socket_get_next_cmd(self.socket) @@ -116,11 +120,6 @@ def process_next_cmd(self): util.socket_respond(connection, success_response("All finished!")) self.partial_program_order.log_info() self.done = True - elif input_cmd.startswith("CommandExecStart:"): - assert False - node_id, exec_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd) - logging.info(f'Scheduler: Received command exec start message - {input_cmd}.') - # self.handle_command_exec_start(input_cmd) else: logging.error(error_response(f'Error: Unsupported command: {input_cmd}')) raise Exception(f'Error: Unsupported command: {input_cmd}') @@ -132,7 +131,11 @@ def respond_to_frontend_core(self, node_id: NodeId, response: str): util.socket_respond(connection, response) connection.close() - def respond_to_pending_wait(self, node_id: int): + def respond_to_wait_on_unsafe(self, node_id: ConcreteNodeId): + response = unsafe_response('') + self.respond_to_frontend_core(node_id, response) + + def respond_to_pending_wait(self, node_id: ConcreteNodeId): logging.debug(f'Responding to pending wait for node: {node_id}') ## Get the completed node info node = self.partial_program_order.get_concrete_node(node_id) @@ -142,7 +145,7 @@ def respond_to_pending_wait(self, node_id: int): ## Send the response self.respond_to_frontend_core(node_id, response) - def __parse_wait(self, input_cmd: str) -> "tuple[NodeId, str]": + def __parse_wait(self, input_cmd: str) -> "tuple[ConcreteNodeId, str]": try: node_id_component, loop_iter_counter_component, pash_runtime_vars_file_component = input_cmd.rstrip().split("|") node_id = NodeId(int(node_id_component.split(":")[1].rstrip())) From 9bd4c8b0b1606b5889d030923e00704d433d2263 Mon Sep 17 00:00:00 2001 From: Guest Date: Tue, 6 Feb 2024 17:45:11 -0500 Subject: [PATCH 28/39] cleanup --- parallel-orch/node.py | 19 +++--- parallel-orch/partial_program_order.py | 85 ++++---------------------- parallel-orch/scheduler_server.py | 28 ++++----- 3 files changed, 34 insertions(+), 98 deletions(-) diff --git a/parallel-orch/node.py b/parallel-orch/node.py index 65c4de42..9bb8ae22 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -152,7 +152,7 @@ def __init__(self, node_id: NodeId, loop_iters = list()): def __repr__(self): return f'cnid({self.node_id.id_})' - + def __hash__(self): return hash((self.node_id, self.loop_iters)) @@ -166,7 +166,7 @@ def __str__(self): def parse(input_str): node_id_str, loop_iters_str = input_str.split('@') return ConcreteNodeId(NodeId(int(node_id_str)), [int(cnt) for cnt in loop_iters_str.split('-')[1:]]) - + class ConcreteNode: cnid: ConcreteNodeId abstract_node: Node @@ -258,7 +258,7 @@ def execution_outcome(self) -> Tuple[int, str, str]: def command_unsafe(self): return not analysis.safe_to_execute(self.asts, {}) - + ## ## ## Transition Functions ## @@ -380,7 +380,7 @@ def has_env_conflict_with(self, other_env) -> bool: "pash_speculative_command_id", "prev_env", "PREVIOUS_SET_STATUS", "BASH_LINENO", "response_args", "stdout_file", "pash_spec_command_id", "cmd_exit_code", "pash_set_to_add"]) - + re_scalar_string = re.compile(r'declare (?:-x|--)? (\w+)="([^"]*)"') re_scalar_int = re.compile(r'declare -i (\w+)="(\d+)"') re_array = re.compile(r'declare -a (\w+)=(\([^)]+\))') @@ -403,7 +403,7 @@ def parse_env(content): with open(other_env, 'r') as file: other_env_vars = parse_env(file.read()) - + conflict_exists = False for key in set(node_env_vars.keys()).union(other_env_vars.keys()): if key not in node_env_vars: @@ -415,11 +415,11 @@ def parse_env(content): elif node_env_vars[key] != other_env_vars[key]: logging.critical(f"Variable {key} differs: node environment has {node_env_vars[key]}, other has {other_env_vars[key]}") conflict_exists = True - + return conflict_exists - + class HSBasicBlock: def __init__(self, nodes: list[Node]): if len(nodes) == 0: @@ -463,7 +463,7 @@ def construct_basic_blocks(self): node_list = [] block_id = LoopStack() for node in self.abstract_nodes.values(): - if (node.loop_context == block_id and + if (node.loop_context == block_id and not (len(node_list) >= 1 and node_list[-1].cmd == 'break')): node_list.append(node) else: @@ -523,8 +523,7 @@ def find_basic_block(self, node_id: NodeId): if node.id_ == node_id: return bb raise ValueError('no such node_id') - + def __str__(self): return 'prog:\n' + '\n'.join( [f'block {i}:\n' + str(bb) + f'goto block {self.block_adjacency[i]}\n' for i, bb in enumerate(self.basic_blocks)]) - diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py index 7bbdde5f..73ad58ef 100644 --- a/parallel-orch/partial_program_order.py +++ b/parallel-orch/partial_program_order.py @@ -34,17 +34,6 @@ def __init__(self, abstract_nodes: "dict[NodeId, Node]", edges: "dict[NodeId, li self.prev_concrete_node: dict[ConcreteNodeId, list[ConcreteNodeId]] = {} self.to_be_resolved: dict[ConcreteNodeId, list[ConcreteNodeId]] = {} - # def init_partial_order(self): - # for node_id, node in self.concrete_nodes.items(): - # if node.is_initialized(): - # node.transition_from_init_to_ready() - - # self.init_to_be_resolved_dict() - # logging.info(self.to_be_resolved) - # # Init frontier - # self.frontier = self.get_standard_source_nodes() - # # TODO: Implement the rest of the partial order initialization - @property def abstract_nodes(self): return self.hsprog.abstract_nodes @@ -56,13 +45,13 @@ def adjacency(self): @property def inverse_adjacency(self): return self.hsprog.inverse_adjacency - + def commit_node(self, node): # Logic to handle committing a node node.transition_to_committed() # Maybe update dependencies here # etc. - + def get_concrete_node(self, concrete_node_id: ConcreteNodeId) -> ConcreteNode: return self.concrete_nodes[concrete_node_id] @@ -107,61 +96,9 @@ def log_state(self): def get_schedulable_nodes(self) -> list[ConcreteNodeId]: return [concrete_node_id for concrete_node_id, _ in self.get_ready_nodes()] - ## Returns the next non-committed normal node - def progress_frontier(self) -> "list[NodeId]": - return self.get_next_frontier_nodes(self.get_frontier()) - - def get_next_nodes(self, node_id:NodeId) -> "list[NodeId]": - return self.adjacency[node_id][:] - def get_prev_nodes(self, concrete_node_id: ConcreteNodeId) -> "list[ConcreteNodeId]": return self.prev_concrete_node[concrete_node_id][:] - def get_source_nodes(self) -> "list[NodeId]": - sources = set() - for to_id, from_ids in self.inverse_adjacency.items(): - if len(from_ids) == 0: - sources.add(to_id) - return list(sources) - - def get_standard_source_nodes(self) -> list: - source_nodes = self.get_source_nodes() - # TODO: Filter out loop nodes - # return self.filter_standard_nodes(source_nodes) - return source_nodes - - def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]": - # TODO: filter non-loop nodes - visited = set() - to_visit = [(node_id, 0) for node_id in start_nodes] # Pair each start node with depth 0 - non_committed_nodes = set() - first_non_committed_depth = None - - while to_visit: - current_node_id, depth = to_visit.pop() - if current_node_id in visited: - continue - - visited.add(current_node_id) - current_node = self.concrete_nodes.get(current_node_id) - - if not current_node.is_committed(): - if first_non_committed_depth is None: - first_non_committed_depth = depth - elif depth > first_non_committed_depth: - # Do not consider nodes deeper than the first non-committed depth - continue - - non_committed_nodes.add(current_node_id) - - if first_non_committed_depth is None or depth < first_non_committed_depth: - next_nodes = self.get_next_nodes(current_node_id) # Use the provided method to get next nodes - for neighbor in next_nodes: - if neighbor not in visited: - to_visit.append((neighbor, depth + 1)) # Increase depth for neighbors - - return non_committed_nodes - def get_all_next(self, current_node_id: ConcreteNodeId, visited=None) -> "set[NodeId]": all_next = set() def reachable_rec(cur, reachable): @@ -233,31 +170,31 @@ def _has_fs_deps(self, concrete_node_id: ConcreteNodeId): # TODO: It's currently designed this way to avoid reading trace file all the time # When we have complex caching code for this we can make this go away - def has_fs_deps(self, node_id:NodeId): + def has_fs_deps(self, concrete_node_id: ConcreteNodeId): self.fetch_fs_actions() - self._has_fs_deps(node_id) + self._has_fs_deps(concrete_node_id) ### external handler events ### - def schedule_work(self, node_id: NodeId, env_file: str): + def schedule_work(self, concrete_node_id: ConcreteNodeId, env_file: str): event_log("schedule_work") - self.get_concrete_node(node_id).start_executing(env_file) + self.get_concrete_node(concrete_node_id).start_executing(env_file) def schedule_spec_work(self, concrete_node_id: ConcreteNodeId, env_file: str): event_log("schedule_spec") self.adjust_to_be_resolved_dict_entry(concrete_node_id) self.get_concrete_node(concrete_node_id).start_spec_executing(env_file) - def handle_complete(self, node_id: NodeId, has_pending_wait: bool, + def handle_complete(self, concrete_node_id: ConcreteNodeId, has_pending_wait: bool, current_env: str): - event_log(f"handle_complete {node_id}") - node = self.get_concrete_node(node_id) + event_log(f"handle_complete {concrete_node_id}") + node = self.get_concrete_node(concrete_node_id) # TODO: complete the state matching if node.is_executing(): node.commit_frontier_execution() self.adjust_to_be_resolved_dict() elif node.is_spec_executing(): - if self.has_fs_deps(node_id): + if self.has_fs_deps(concrete_node_id): node.reset_to_ready() # otherwise it stays in ready state and waits to be scheduled by the scheduler if has_pending_wait: @@ -304,7 +241,7 @@ def adding_new_basic_block(self, concrete_node_id: ConcreteNodeId): def finish_wait_unsafe(self, concrete_node_id: ConcreteNodeId): node = self.concrete_nodes[concrete_node_id] node.commit_unsafe_node() - + def handle_wait(self, concrete_node_id: ConcreteNodeId, env_file: str): event_log(f"handle_wait {concrete_node_id}") diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py index 1c2025d8..74a8583d 100644 --- a/parallel-orch/scheduler_server.py +++ b/parallel-orch/scheduler_server.py @@ -19,11 +19,11 @@ def handler(signum, frame): def parse_args(): parser = argparse.ArgumentParser(add_help=False) ## TODO: Import the arguments so that they are not duplicated here and in orch - parser.add_argument("-d", "--debug-level", - type=int, + parser.add_argument("-d", "--debug-level", + type=int, default=0, help="Set debugging level") - parser.add_argument("-f", "--log_file", + parser.add_argument("-f", "--log_file", type=str, default=None, help="Set logging output file. Default: stdout") @@ -57,7 +57,7 @@ def error_response(string): class Scheduler: """ Schedules a partial order of commands to run out-of-order Flow: - input cmd -> + input cmd -> | Daemon Start -> Receive whens tarting | Init -> Read the partial order from a file | CommandExecComplete -> A command completed its execution @@ -109,7 +109,7 @@ def process_next_cmd(self): if self.partial_program_order.get_concrete_node(node_id).exec_id == exec_id: logging.info(f'Scheduler: Received command exec complete message - {node_id}.') self.partial_program_order.handle_complete(node_id, node_id in self.waiting_for_response, self.latest_env) - + if self.partial_program_order.get_concrete_node(node_id).is_committed(): self.respond_to_pending_wait(node_id) else: @@ -134,14 +134,14 @@ def respond_to_frontend_core(self, node_id: NodeId, response: str): def respond_to_wait_on_unsafe(self, node_id: ConcreteNodeId): response = unsafe_response('') self.respond_to_frontend_core(node_id, response) - + def respond_to_pending_wait(self, node_id: ConcreteNodeId): logging.debug(f'Responding to pending wait for node: {node_id}') ## Get the completed node info node = self.partial_program_order.get_concrete_node(node_id) msg = '{} {} {}'.format(*node.execution_outcome()) response = success_response(msg) - + ## Send the response self.respond_to_frontend_core(node_id, response) @@ -158,7 +158,7 @@ def __parse_wait(self, input_cmd: str) -> "tuple[ConcreteNodeId, str]": return ConcreteNodeId(node_id, loop_counters), pash_env_filename except: raise Exception(f'Parsing failure for line: {input_cmd}') - + def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]": try: components = input_cmd.rstrip().split("|") @@ -176,11 +176,11 @@ def schedule_work(self): concrete_node_ids = self.partial_program_order.get_schedulable_nodes() for n in concrete_node_ids[:2]: self.partial_program_order.schedule_spec_work(n, self.latest_env) - + def run(self): ## The first command should be the daemon start self.process_next_cmd() - + ## The second command should be the partial order init self.process_next_cmd() @@ -200,7 +200,7 @@ def shutdown(self): logging.debug("PaSh-Spec scheduler is shutting down...") logging.debug("PaSh-Spec scheduler shut down successfully...") self.terminate_pending_commands() - + def terminate_pending_commands(self): for node in self.partial_program_order.get_executing_normal_and_spec_nodes(): proc, _trace_file, _stdout, _stderr, _variable_file, _ = node.get_main_sandbox() @@ -215,8 +215,8 @@ def main(): if args.log_file is None: logging.basicConfig(format="%(levelname)s|%(asctime)s|%(message)s") else: - logging.basicConfig(format="%(levelname)s|%(asctime)s|%(message)s", - filename=f"{os.path.abspath(args.log_file)}", + logging.basicConfig(format="%(levelname)s|%(asctime)s|%(message)s", + filename=f"{os.path.abspath(args.log_file)}", filemode="w") # Set debug level @@ -230,7 +230,7 @@ def main(): config.SPECULATE_IMMEDIATELY = args.speculate_immediately scheduler = Scheduler(config.SCHEDULER_SOCKET) scheduler.run() - + if __name__ == "__main__": main() From 8f7a62bcbd84ce0bd643c798573518b75fc1b1a6 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 9 Feb 2024 01:31:25 -0700 Subject: [PATCH 29/39] Adjust benchmark infra to the new scheduler logs --- report/benchmark_plots.py | 84 +++++++++++++++++++++++--------- report/benchmark_runner.py | 36 +++++++++----- report/command_executor.py | 0 report/config_parser.py | 2 + report/report_generator.py | 0 report/result_analyzer.py | 98 ++++++++++++-------------------------- report/run_benchmarks.py | 33 ++++++++++--- 7 files changed, 145 insertions(+), 108 deletions(-) mode change 100755 => 100644 report/benchmark_plots.py mode change 100755 => 100644 report/benchmark_runner.py mode change 100755 => 100644 report/command_executor.py mode change 100755 => 100644 report/config_parser.py mode change 100755 => 100644 report/report_generator.py mode change 100755 => 100644 report/result_analyzer.py mode change 100755 => 100644 report/run_benchmarks.py diff --git a/report/benchmark_plots.py b/report/benchmark_plots.py old mode 100755 new mode 100644 index c74f099d..177a8bdf --- a/report/benchmark_plots.py +++ b/report/benchmark_plots.py @@ -1,5 +1,7 @@ import os import matplotlib.pyplot as plt +import numpy as np + # Set the plotting style if desired # plt.style.use('ggplot') # Example: ggplot style @@ -33,7 +35,9 @@ def plot_benchmark_times_combined(benchmarks, bash_times, orch_times, output_dir [i + bar_width / 2 for i in range(len(benchmarks))], benchmarks) ax.legend() - save_plot(output_dir, filename) + plt.tight_layout() + plt.savefig(os.path.join(output_dir, f"{filename}.pdf")) + # Plots individual comparison charts for each benchmark. def plot_benchmark_times_individual(benchmarks, bash_times, orch_times, output_dir, filename): @@ -50,26 +54,62 @@ def plot_benchmark_times_individual(benchmarks, bash_times, orch_times, output_d save_plot(output_dir, filename) -# Plots a Gantt chart of activities. -def plot_gantt(activities, output_dir, filename, simple=False): - if simple: - activities = [activity for activity in activities if activity[0].startswith("RunNode,") or activity[0] == "Wait"] - - fig_height = max(5, len(activities) * 0.3) - fig, ax = plt.subplots(figsize=(15, fig_height)) - - activities.sort(key=lambda x: x[1]) - bar_height = 0.8 - gap = 0.2 - - for index, (action, start_time, duration) in enumerate(activities): - ax.broken_barh([(start_time, duration)], (index * (bar_height + gap), bar_height), facecolors='blue') - ax.text(start_time + duration / 2, index * (bar_height + gap) + bar_height / 2, action, - ha='center', va='center', fontsize=6, color='white') - - setup_ax(ax, 'Time (ms)', '', f'Gantt Chart of {filename.strip("_gantt.pdf")}', [], []) - ax.set_yticks([i * (bar_height + gap) + bar_height / 2 for i in range(len(activities))]) - ax.set_yticklabels([activity[0] for activity in activities], fontsize=8) +def sort_node_ids(node_ids): + def parse_id(node_id): + parts = node_id.split('+') + concrete_id = int(parts[0]) + iter_ids = tuple(int(iter_id) for iter_id in parts[1].split('-')) if len(parts) > 1 else () + return (concrete_id,) + iter_ids + + sorted_ids = sorted(node_ids, key=parse_id, reverse=True) + return sorted_ids + +def plot_prog_blocks(prog_blocks, output_dir, filename): + # Define colors for different statuses + colors = { + 'READY': 'red', + 'EXE': 'orange', + 'SPEC_E': 'blue', + 'SPEC_F': 'lightblue', + 'COMMIT': 'green', + 'UNSAFE': 'purple', + 'INIT': 'grey' + } + + first_time = prog_blocks[0][0] + times = [(block[0] - first_time).total_seconds() for block in prog_blocks] + + unsorted_node_ids = {node[0] for block in prog_blocks for node in block[1]} + node_ids = sort_node_ids(unsorted_node_ids) # Sort the node IDs using the custom sorting function + + statuses = {node_id: [] for node_id in node_ids} + + for block_time, nodes in prog_blocks: + elapsed_time = (block_time - first_time).total_seconds() + for node_id, status in nodes: + statuses[node_id].append((elapsed_time, status)) + + fig_height = 0.5 * len(node_ids) + fig_width = 12 # Fixed width + fig, ax = plt.subplots(figsize=(fig_width, fig_height)) + status_legend_handles = {} + + for node_id in node_ids: + y_pos = node_ids.index(node_id) + for i, (start_time, status) in enumerate(statuses[node_id]): + end_time = times[-1] if i == len(statuses[node_id]) - 1 else statuses[node_id][i + 1][0] + color = colors.get(status, 'grey') + ax.broken_barh([(start_time, end_time - start_time)], (y_pos - 0.4, 0.8), facecolors=color) + if status not in status_legend_handles: + status_legend_handles[status] = plt.Rectangle((0, 0), 1, 1, fc=color) + + ax.set_xlabel("Time since first tick (seconds)") + ax.set_ylabel("Node ID") + ax.set_title("Node Status Over Time") + ax.set_yticks(np.arange(len(node_ids))) + ax.set_yticklabels(node_ids) ax.grid(True) + ax.legend(status_legend_handles.values(), status_legend_handles.keys(), loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3, frameon=True) - save_plot(output_dir, filename) + plt.tight_layout() + plt.savefig(os.path.join(output_dir, f"{filename}.pdf")) diff --git a/report/benchmark_runner.py b/report/benchmark_runner.py old mode 100755 new mode 100644 index f62e380d..a7531c5c --- a/report/benchmark_runner.py +++ b/report/benchmark_runner.py @@ -1,30 +1,43 @@ import csv +from typing import List from command_executor import CommandExecutor +from config_parser import BenchmarkConfig from result_analyzer import ResultAnalyzer from report_generator import ReportGenerator import benchmark_plots import os +from pprint import pprint class BenchmarkRunner: - def __init__(self, benchmarks, args): + def __init__(self, benchmarks: "List[BenchmarkConfig]", args): self.benchmarks = benchmarks self.args = args self.results = [] self.activities = {} + + def __repr__(self): + return (f"BenchmarkRunner(benchmarks={self.benchmarks!r}, " + f"args={self.args!r}, results={self.results!r})") + + def __str__(self): + return (f"Benchmark Runner:\n" + f" Benchmarks: {self.benchmarks}\n" + f" Arguments: {self.args}\n" + f" Results: {self.results}") def run_all_benchmarks(self): for benchmark in self.benchmarks: self.run_benchmark(benchmark) - def run_benchmark(self, benchmark): + def run_benchmark(self, benchmark: BenchmarkConfig): # Setup environment and pre-execution commands benchmark.setup_environment() if self.args.verbose: # Print verbose information print(f"\n---------> Running benchmark: {benchmark.name} <---------\n") - print(f"Environment Variables: {benchmark.env}") + print(">", benchmark) for pre_command in benchmark.pre_execution_script: CommandExecutor.run_pre_execution_command(pre_command, os.environ.get('RESOURCE_DIR'), self.args.verbose) @@ -41,8 +54,8 @@ def run_benchmark(self, benchmark): os.environ.get('ORCH_COMMAND'), self.args.verbose) - activities = ResultAnalyzer.parse_logs_into_activities(orch_log) - self.activities[benchmark.name] = activities + prog_blocks = ResultAnalyzer.process_results(orch_log) + # pprint(prog_blocks) # Analyze and compare results diff_lines = ResultAnalyzer.compare_results(bash_output, orch_output) @@ -52,8 +65,8 @@ def run_benchmark(self, benchmark): ReportGenerator.print_results(benchmark.name, bash_time, orch_time, diff_lines, verbose=self.args.verbose) if not self.args.no_logs: ReportGenerator.save_log_data(orch_log, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_log.log") - - ResultAnalyzer.analyze_node_execution_times(orch_log, benchmark.name, os.environ.get('REPORT_OUTPUT_DIR'), self.args.verbose) + + self.activities[benchmark.name] = prog_blocks def generate_reports(self): @@ -76,10 +89,7 @@ def generate_plots(self): # Plot Gantt charts for each benchmark for benchmark in self.benchmarks: - activities = self.activities.get(benchmark.name, []) + activities = self.activities.get(benchmark.name) if activities: - benchmark_plots.plot_gantt(activities, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_gantt", simple=self.args.full_gantt) - - - - \ No newline at end of file + print(os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_gantt") + benchmark_plots.plot_prog_blocks(activities, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_gantt") diff --git a/report/command_executor.py b/report/command_executor.py old mode 100755 new mode 100644 diff --git a/report/config_parser.py b/report/config_parser.py old mode 100755 new mode 100644 index fa6eeb50..790713e5 --- a/report/config_parser.py +++ b/report/config_parser.py @@ -3,6 +3,7 @@ class BenchmarkConfig: + def __init__(self, name, env, pre_execution_script, command, orch_args): self.name = name self.env = [self.replace_env_var(e) for e in env] @@ -66,3 +67,4 @@ def parse_config(self): def get_benchmarks(self): return self.benchmarks + diff --git a/report/report_generator.py b/report/report_generator.py old mode 100755 new mode 100644 diff --git a/report/result_analyzer.py b/report/result_analyzer.py old mode 100755 new mode 100644 index 8729ab79..e0a7d1df --- a/report/result_analyzer.py +++ b/report/result_analyzer.py @@ -1,21 +1,37 @@ -import difflib +from datetime import datetime +import matplotlib.pyplot as plt +import matplotlib.dates as mdates import hashlib -import os -import csv +import numpy as np class ResultAnalyzer: @staticmethod - def parse_logs_into_activities(log_data): - info_lines = [line.replace("INFO:root:>|", "").split("|") for line in log_data.split("\n") if line.startswith("INFO:root:>|")] - activities = [] - for line in info_lines: - if len(line) == 4: - activity = line[1] - end_time = float(line[2].split(":")[1].rstrip("ms")) - step_time = float(line[3].split(":")[1].rstrip("ms")) - start_time = end_time - step_time - activities.append((activity, start_time, step_time)) - return activities + def process_results(orch_log): + log_lines = orch_log.split("\n") + prog_blocks = [] + current_block = [] + block_start_time = None + + for line in log_lines: + if line.startswith("INFO|") and "[PROG_LOG]" in line: + parts = line.split("|") + time_str = parts[1] + log_content = parts[2].strip() + if log_content == "[PROG_LOG]": + # Start of a new block + if current_block: + prog_blocks.append((block_start_time, current_block)) + current_block = [] + block_start_time = datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S,%f") + else: + # Continuing the current block + state, node_id, command = log_content.replace("[PROG_LOG] ", "").split(",", 2) + current_block.append((node_id.strip(), state.strip())) + # Append the last block if not empty + if current_block: + prog_blocks.append((block_start_time, current_block)) + + return prog_blocks @staticmethod def compare_results(bash_output, orch_output, max_lines=1000): @@ -35,56 +51,4 @@ def compare_results(bash_output, orch_output, max_lines=1000): if hash_value not in bash_hashes: diffs.append(f'+ {line}') - return diffs - - @staticmethod - def analyze_node_execution_times(orch_output, benchmark_name, output_dir, verbose): - node_times_dict = ResultAnalyzer.extract_node_times(orch_output) - - if verbose: - ResultAnalyzer.print_node_execution_times(node_times_dict) - - ResultAnalyzer.generate_node_times_csv(node_times_dict, benchmark_name, output_dir) - - @staticmethod - def print_node_execution_times(node_times_dict): - print("-" * 40) - print("Node Execution Times:") - for node in sorted(node_times_dict.keys()): - times = node_times_dict[node] - num_executions = len(times) - time_lost = sum(times) - times[-1] if times else 0 - times_str = ', '.join(f'{time:7.2f}ms' for time in times) - print(f"Node {node:2d}: Executions: {num_executions}, Time Lost: {time_lost:7.2f}ms Times = {times_str} ") - print("-" * 40) - - @staticmethod - def generate_node_times_csv(node_times_dict, benchmark_name, output_dir): - csv_filename = os.path.join(output_dir, f"{benchmark_name}_execution_times.csv") - with open(csv_filename, 'w', newline='') as csv_file: - writer = csv.writer(csv_file) - writer.writerow(["Node", "Execution Times (ms)", "Number of Executions", "Time Lost (ms)"]) - for node in sorted(node_times_dict.keys()): - times = node_times_dict[node] - num_executions = len(times) - time_lost = sum(times) - times[-1] if times else 0 - writer.writerow([node, ', '.join(str(time) for time in times), num_executions, time_lost]) - - @staticmethod - def extract_node_times(orch_output): - node_times_dict = {} - - relevant_lines = [line.replace("INFO:root:>|PartialOrder|RunNode,", "") - for line in orch_output.split("\n") - if line.startswith("INFO:root:>|PartialOrder|RunNode,") and "Step time:" in line] - - for line in relevant_lines: - parts = line.split("|") - node_id = int(parts[0]) - time = float(parts[2].split(":")[1][:-2]) # Extract step time - - if node_id not in node_times_dict: - node_times_dict[node_id] = [] - node_times_dict[node_id].append(time) - - return node_times_dict + return diffs \ No newline at end of file diff --git a/report/run_benchmarks.py b/report/run_benchmarks.py old mode 100755 new mode 100644 index bdacc596..1bf98ce2 --- a/report/run_benchmarks.py +++ b/report/run_benchmarks.py @@ -13,7 +13,7 @@ def print_startup_info(args): print(f" {arg + ':':13s} {value}") print("> Environment Variables:") - for env_var in ['ORCH_TOP', 'WORKING_DIR', 'TEST_SCRIPT_DIR', 'RESOURCE_DIR', 'PASH_TOP', 'PASH_SPEC_TOP']: + for env_var in ['ORCH_TOP', 'WORKING_DIR', 'TEST_SCRIPT_DIR', 'RESOURCE_DIR', 'REPORT_OUTPUT_DIR', 'PASH_TOP', 'PASH_SPEC_TOP', 'ORCH_COMMAND']: print(f" {env_var + ':':17s} {os.environ.get(env_var)}") def parse_args(): @@ -23,9 +23,10 @@ def parse_args(): parser.add_argument('--csv-output', action='store_true', help="Generate and save results in CSV format.") parser.add_argument('--verbose', action='store_true', help="Enable verbose output.") parser.add_argument('--full-gantt', action='store_false', help="Generate a full Gantt chart for each benchmark.") - parser.add_argument('--config-file', type=str, default='benchmark_config.json', help="Path to the benchmark configuration file. Default is 'benchmark_config.json'.") + parser.add_argument('--config-file', type=str, default=None, help="Path to the benchmark configuration file. Default is 'benchmark_config.json'.") parser.add_argument('--setup-script', type=str, default=None, help="Path to a setup script to run before running any other benchmark.") parser.add_argument('--subset', type=str, default=None, help="Name of a subset of benchmarks to run. Will instead download and store outputs in the dir with the specified name.") + parser.add_argument('--no-setup', action='store_true', help="Do not run any setup script before running benchmarks. Assumes subset is also set.") return parser.parse_args() # Sets the required environment variables for the benchmarking process. @@ -72,11 +73,16 @@ def main(): args = parse_args() set_environment_variables(args) - - - # Use the config file path from arguments - config_file_path = os.path.join(os.environ['WORKING_DIR'], args.config_file) + if args.config_file is not None: + if args.verbose: + print(f"Config File: {args.config_file}") + config_file_path = os.path.join(os.environ['WORKING_DIR'], args.config_file) + elif args.subset: + config_file_path = os.path.join(os.environ['TEST_SCRIPT_DIR'], "setup", "config.json") + else: + config_file_path = os.path.join(os.environ['WORKING_DIR'], "benchmark_config.json") + # Parse benchmark configurations config_parser = ConfigParser(os.path.join(os.environ['WORKING_DIR'], config_file_path)) config_parser.parse_config() @@ -86,6 +92,21 @@ def main(): if args.verbose: print_startup_info(args) + print(config_parser) + + + if args.setup_script: + if args.verbose: + print(f"Running setup script: {args.setup_script}") + subprocess.run(['bash', args.setup_script]) + elif args.subset and not args.no_setup: + setup_script = os.path.join(os.environ['TEST_SCRIPT_DIR'], "setup", 'setup.sh') + if os.path.exists(setup_script): + if args.verbose: + print(f"Running setup script: {args.setup_script}") + subprocess.run(['bash', setup_script]) + elif args.verbose: + print(f"No setup script found in {os.environ['TEST_SCRIPT_DIR']}, ignoring") # Initialize and run the BenchmarkRunner runner = BenchmarkRunner(config_parser.get_benchmarks(), args) From a1118589a2d1c923a0756f266f301207872f9ce7 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 9 Feb 2024 01:33:04 -0700 Subject: [PATCH 30/39] Print node id when logging node state --- parallel-orch/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parallel-orch/node.py b/parallel-orch/node.py index 9bb8ae22..1f34e1b3 100644 --- a/parallel-orch/node.py +++ b/parallel-orch/node.py @@ -217,7 +217,7 @@ def asts(self): return self.abstract_node.asts def pretty_state_repr(self): - return f'{state_pstr(self.state)} {self.cmd}' + return f'{state_pstr(self.state)},{self.id_},{self.cmd}' def is_initialized(self): return self.state == NodeState.INIT From c6e3df0e57165e1f20ab2819bba1d23a05a60258 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Fri, 9 Feb 2024 02:53:12 -0700 Subject: [PATCH 31/39] Add custom workdir definition functionality --- report/benchmark_runner.py | 13 +++++++--- report/config_parser.py | 5 +++- report/run_benchmarks.py | 1 - report/time_script.py | 50 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 6 deletions(-) create mode 100644 report/time_script.py diff --git a/report/benchmark_runner.py b/report/benchmark_runner.py index a7531c5c..9cee12bb 100644 --- a/report/benchmark_runner.py +++ b/report/benchmark_runner.py @@ -42,15 +42,21 @@ def run_benchmark(self, benchmark: BenchmarkConfig): for pre_command in benchmark.pre_execution_script: CommandExecutor.run_pre_execution_command(pre_command, os.environ.get('RESOURCE_DIR'), self.args.verbose) + if benchmark.command_working_dir: + workdir = benchmark.command_working_dir + else: + workdir = os.environ.get('TEST_SCRIPT_DIR') + + # Execute the benchmark bash_time, bash_output, _ = CommandExecutor.run_command( benchmark.command.split(" "), - os.environ.get('TEST_SCRIPT_DIR'), + workdir, self.args.verbose) orch_time, orch_output, orch_log = CommandExecutor.run_command_with_orch( benchmark.command.split(" "), benchmark.orch_args, - os.environ.get('TEST_SCRIPT_DIR'), + workdir, os.environ.get('ORCH_COMMAND'), self.args.verbose) @@ -91,5 +97,4 @@ def generate_plots(self): for benchmark in self.benchmarks: activities = self.activities.get(benchmark.name) if activities: - print(os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_gantt") - benchmark_plots.plot_prog_blocks(activities, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_gantt") + benchmark_plots.plot_prog_blocks(activities, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_progress") diff --git a/report/config_parser.py b/report/config_parser.py index 790713e5..512a89ab 100644 --- a/report/config_parser.py +++ b/report/config_parser.py @@ -4,10 +4,11 @@ class BenchmarkConfig: - def __init__(self, name, env, pre_execution_script, command, orch_args): + def __init__(self, name, env, pre_execution_script, command_working_dir, command, orch_args): self.name = name self.env = [self.replace_env_var(e) for e in env] self.pre_execution_script = [self.replace_env_var(script) for script in pre_execution_script] + self.command_working_dir = self.replace_env_var(command_working_dir) self.command = self.replace_env_var(command) self.orch_args = self.replace_env_var(orch_args) @@ -22,6 +23,7 @@ def __str__(self): return (f"Benchmark '{self.name}':\n" f" Environment Variables: {env_str}\n" f" Pre-execution Script: {pre_exec_str}\n" + f" Command Working Directory: {self.command_working_dir}\n" f" Command: {self.command}\n" f" Orchestrator Arguments: {self.orch_args}") @@ -60,6 +62,7 @@ def parse_config(self): name=config.get('name'), env=config.get('env', []), pre_execution_script=config.get('pre_execution_script', []), + command_working_dir=config.get('working_dir', ""), command=config.get('command'), orch_args=config.get('orch_args', "") ) diff --git a/report/run_benchmarks.py b/report/run_benchmarks.py index 1bf98ce2..5d560e69 100644 --- a/report/run_benchmarks.py +++ b/report/run_benchmarks.py @@ -22,7 +22,6 @@ def parse_args(): parser.add_argument('--no-logs', action='store_true', help="Do not save log files of benchmark runs.") parser.add_argument('--csv-output', action='store_true', help="Generate and save results in CSV format.") parser.add_argument('--verbose', action='store_true', help="Enable verbose output.") - parser.add_argument('--full-gantt', action='store_false', help="Generate a full Gantt chart for each benchmark.") parser.add_argument('--config-file', type=str, default=None, help="Path to the benchmark configuration file. Default is 'benchmark_config.json'.") parser.add_argument('--setup-script', type=str, default=None, help="Path to a setup script to run before running any other benchmark.") parser.add_argument('--subset', type=str, default=None, help="Name of a subset of benchmarks to run. Will instead download and store outputs in the dir with the specified name.") diff --git a/report/time_script.py b/report/time_script.py new file mode 100644 index 00000000..3ef96f2e --- /dev/null +++ b/report/time_script.py @@ -0,0 +1,50 @@ +import subprocess +import time +import csv +import sys + +def format_time(seconds): + # Format time as ss.msms where msms is in milliseconds + return "{:.4f}".format(seconds) + +def time_commands(shell_script_path): + # Read the shell script + with open(shell_script_path, 'r') as file: + commands = file.readlines() + + # Prepare the CSV output + csv_filename = shell_script_path + "_timing.csv" + with open(csv_filename, 'w', newline='') as csvfile: + csvwriter = csv.writer(csvfile) + csvwriter.writerow(["Command", "Time (seconds)"]) + + # Initialize total time + total_time = 0.0 + + # Execute each command and time it + for command in commands: + command = command.strip() + if command and not command.startswith('#'): # Ignore empty lines and comments + start_time = time.time() + try: + # Run the command + result = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + execution_time = time.time() - start_time + # Write the result to the CSV with formatted time + csvwriter.writerow([command, format_time(execution_time)]) + total_time += execution_time + except subprocess.CalledProcessError as e: + print(f"An error occurred while executing the command: {command}") + print(e.output.decode()) + sys.exit(1) + + # Write the total time with formatted time + csvwriter.writerow(["Total", format_time(total_time)]) + + print(f"Timing results written to {csv_filename}") + +# Usage: python time_script.py /path/to/your/script.sh +if len(sys.argv) > 1: + time_commands(sys.argv[1]) +else: + print("Please provide the path to the shell script as an argument.") From fadd38207ab2a0357a7e7368cb7acc1f2768c264 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Sun, 11 Feb 2024 12:39:28 -0700 Subject: [PATCH 32/39] Fix config --- report/benchmarks/dgsh/setup/config.json | 40 +++++++++++++++++++----- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/report/benchmarks/dgsh/setup/config.json b/report/benchmarks/dgsh/setup/config.json index 1660c620..1e27be57 100644 --- a/report/benchmarks/dgsh/setup/config.json +++ b/report/benchmarks/dgsh/setup/config.json @@ -1,7 +1,7 @@ -[ +[ { "name": "Dgsh 1.sh", - "env": ["INPUT_FILE={{RESOURCE_DIR}}/testmini.csv"], + "env": ["INPUT_FILE={{RESOURCE_DIR}}/dblp.xml"], "command": "{{TEST_SCRIPT_DIR}}/dgsh/1.sh", "orch_args": "-d 2" }, @@ -13,8 +13,14 @@ }, { "name": "Dgsh 3.sh", - "command": "{{TEST_SCRIPT_DIR}}/dgsh/3.sh", - "working_dir": "{{RESOURCE_DIR}}/linux", + "command": "{{TEST_SCRIPT_DIR}}/3.sh", + "working_dir": "{{RESOURCE_DIR}}/linux/", + "orch_args": "-d 2" + }, + { + "name": "Dgsh 4.sh", + "command": "{{TEST_SCRIPT_DIR}}/4.sh", + "working_dir": "{{RESOURCE_DIR}}/linux/", "orch_args": "-d 2" }, { @@ -24,8 +30,8 @@ "orch_args": "-d 2" }, { - "name": "Dgsh 6.sh", - "env": ["INPUT_FILE={{RESOURCE_DIR}}/larger_file.txt"], + "name": "6.sh", + "env": ["INPUT_FILE={{RESOURCE_DIR}}/pg100.txt"], "command": "{{TEST_SCRIPT_DIR}}/6.sh", "orch_args": "-d 2" }, @@ -37,8 +43,26 @@ }, { "name": "8.sh", - "env": ["INPUT_FILE={{RESOURCE_DIR}}/larger_file.txt"], + "env": ["INPUT_FILE={{RESOURCE_DIR}}/pg100.txt"], "command": "{{TEST_SCRIPT_DIR}}/8_no_func.sh", "orch_args": "-d 2" + }, + { + "name": "9.sh", + "env": ["INPUT_FILE={{RESOURCE_DIR}}/pg100.txt"], + "command": "{{TEST_SCRIPT_DIR}}/9.sh", + "orch_args": "-d 2" + }, + { + "name": "17.sh", + "env": ["INPUT_FILE={{RESOURCE_DIR}}/goods_classification.csv"], + "command": "{{TEST_SCRIPT_DIR}}/17.sh", + "orch_args": "-d 2" + }, + { + "name": "18.sh", + "working_dir": "{{RESOURCE_DIR}}/linux/", + "command": "{{TEST_SCRIPT_DIR}}/18.sh", + "orch_args": "-d 2" } -] +] \ No newline at end of file From a98b95603e6042dfe6b59f735d59aa27f8558991 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Sun, 11 Feb 2024 12:40:48 -0700 Subject: [PATCH 33/39] Fix benchmarks to work properly --- report/benchmarks/dgsh/17.sh | 21 +++++++-------------- report/benchmarks/dgsh/18.sh | 22 ++++++++++++++++------ report/benchmarks/dgsh/3.sh | 0 report/benchmarks/dgsh/4.sh | 0 report/benchmarks/dgsh/5.sh | 19 ++++++++----------- report/benchmarks/dgsh/6.sh | 3 --- report/benchmarks/dgsh/8_no_func.sh | 12 ++++-------- report/benchmarks/dgsh/9.sh | 2 +- 8 files changed, 36 insertions(+), 43 deletions(-) mode change 100644 => 100755 report/benchmarks/dgsh/17.sh mode change 100644 => 100755 report/benchmarks/dgsh/18.sh mode change 100644 => 100755 report/benchmarks/dgsh/3.sh mode change 100644 => 100755 report/benchmarks/dgsh/4.sh mode change 100644 => 100755 report/benchmarks/dgsh/9.sh diff --git a/report/benchmarks/dgsh/17.sh b/report/benchmarks/dgsh/17.sh old mode 100644 new mode 100755 index 52ada7b4..df2788ff --- a/report/benchmarks/dgsh/17.sh +++ b/report/benchmarks/dgsh/17.sh @@ -38,20 +38,13 @@ file2=$(mktemp) file3=$(mktemp) file4=$(mktemp) -# Save the ls output to a temporary file -ls -n > "$file1" +cat $INPUT_FILE > $file1 -# Reorder fields in DIR-like way -awk '!/^total/ {print $6, $7, $8, $1, sprintf("%8d", $5), $9}' "$file1" > "$file2" +# Extract columns 5 and 6, save to temp1 +cut -d ',' -f 5-6 "$file1" > "$file2" -# Count number of files -wc -l "$file1" | tr -d \\n > "$file3" -echo -n ' File(s) ' >> "$file3" -awk '{s += $5} END {printf("%d bytes\n", s)}' "$file1" >> "$file3" +# Extract columns 2, 3, and 4, save to temp2 +cut -d ',' -f 2-4 "$file1" > "$file3" -# Count number of directories and print label for number of dirs and calculate free bytes -grep -c '^d' "$file1" | tr -d \\n > "$file4" -df -h . | awk '!/Use%/{print " Dir(s) " $4 " bytes free"}' >> "$file4" - -# Display the results -cat "$file2" "$file3" "$file4" +# Combine the columns +paste -d ',' "$file2" "$file3" diff --git a/report/benchmarks/dgsh/18.sh b/report/benchmarks/dgsh/18.sh old mode 100644 new mode 100755 index dac2892b..87c886f6 --- a/report/benchmarks/dgsh/18.sh +++ b/report/benchmarks/dgsh/18.sh @@ -29,21 +29,31 @@ file_details_file=$(mktemp) file_count_file=$(mktemp) dir_count_file=$(mktemp) byte_count_file=$(mktemp) +#!/bin/sh + +# Create temporary files +free_space_file=$(mktemp) +file_details_file=$(mktemp) +file_count_file=$(mktemp) +dir_count_file=$(mktemp) +byte_count_file=$(mktemp) + +# Base directory for the listing # Get free space df -h . | awk '!/Use%/{print $4}' > "$free_space_file" -# List details of files and directories -ls -l | awk '!/^total/ {print $6, $7, $8, $1, sprintf("%8d", $5), $9}' > "$file_details_file" +# Recursively list details of files +find . -type f -exec ls -l {} + | awk '{print $6, $7, $8, $1, sprintf("%8d", $5), $9}' > "$file_details_file" # Count number of files -ls -l | grep -v '^total' | grep -v '^d' | wc -l > "$file_count_file" +find . -type f | wc -l > "$file_count_file" # Count number of directories -ls -l | grep '^d' | wc -l > "$dir_count_file" +find . -type d | wc -l > "$dir_count_file" -# Calculate total bytes -ls -l | awk '{if($1 != "total") s += $5} END {print s}' > "$byte_count_file" +# Calculate total bytes for files +find . -type f -exec stat --format="%s" {} + | awk '{s+=$1} END {print s}' > "$byte_count_file" # Display the results cat "$file_details_file" diff --git a/report/benchmarks/dgsh/3.sh b/report/benchmarks/dgsh/3.sh old mode 100644 new mode 100755 diff --git a/report/benchmarks/dgsh/4.sh b/report/benchmarks/dgsh/4.sh old mode 100644 new mode 100755 diff --git a/report/benchmarks/dgsh/5.sh b/report/benchmarks/dgsh/5.sh index 0f2c5855..47a52b82 100755 --- a/report/benchmarks/dgsh/5.sh +++ b/report/benchmarks/dgsh/5.sh @@ -33,25 +33,22 @@ file1=$(mktemp) file2=$(mktemp) file3=$(mktemp) -file4=$(mktemp) - -# export LC_ALL=C cat $INPUT_FILE >"$file1" # Find errors # Obtain list of words in text -cat "$file1" | -tr -cs A-Za-z \\n | -tr A-Z a-z | -sort -u > "$file2" +cat "$file1" | +tr '[:upper:]' '[:lower:]' | +sed 's/[^a-z]/\n/g' | +grep -v '^$' | +sort | +uniq | +grep -v '^$' > "$file2" # Ensure dictionary is compatibly sorted -cat "$file1" | sort /usr/share/dict/words > "$file3" # List errors as a set difference -comm -23 "$file2" "$file3" > "$file4" - -fgrep -f "$file4" -i --color -w -C 2 "$file1" +comm -23 "$file2" "$file3" diff --git a/report/benchmarks/dgsh/6.sh b/report/benchmarks/dgsh/6.sh index 53828e0f..ec48a28f 100755 --- a/report/benchmarks/dgsh/6.sh +++ b/report/benchmarks/dgsh/6.sh @@ -42,9 +42,6 @@ file5=$(mktemp) cat $INPUT_FILE > $file1 -# Consistent sorting across machines -# export LC_ALL=C - # Stream input from file and split input one word per line # Create list of unique words tr -cs a-zA-Z '\n' < "$file1" | diff --git a/report/benchmarks/dgsh/8_no_func.sh b/report/benchmarks/dgsh/8_no_func.sh index ae676918..e4f760e3 100755 --- a/report/benchmarks/dgsh/8_no_func.sh +++ b/report/benchmarks/dgsh/8_no_func.sh @@ -38,9 +38,6 @@ # limitations under the License. # -# Consistent sorting across machines -# export LC_ALL=C - # Temporary files file1=$(mktemp) file2=$(mktemp) @@ -48,7 +45,6 @@ file3=$(mktemp) file4=$(mktemp) cat $INPUT_FILE > $file1 -cat $file1 # Split input one word per line tr -cs a-zA-Z '\n' < "$file1" > "$file2" @@ -86,7 +82,7 @@ awk '{count[$1]++} END {for (i in count) print count[i], i}' | sort -rn | tee "$file3" # Print relative -# echo "Relative character frequency" -# awk -v NCHARS=$nchars 'BEGIN { -# OFMT = "%.2g%%"} -# {print $1, $2, $1 / NCHARS * 100}' "$file3" \ No newline at end of file +echo "Relative character frequency" +awk -v NCHARS=$nchars 'BEGIN { + OFMT = "%.2g%%"} + {print $1, $2, $1 / NCHARS * 100}' "$file3" \ No newline at end of file diff --git a/report/benchmarks/dgsh/9.sh b/report/benchmarks/dgsh/9.sh old mode 100644 new mode 100755 index 88ee52a0..f090d36f --- a/report/benchmarks/dgsh/9.sh +++ b/report/benchmarks/dgsh/9.sh @@ -33,7 +33,7 @@ file2=$(mktemp) file3=$(mktemp) # Find object files and print defined symbols -find "$INPUT" -name "*.o" | xargs nm > "$file1" +find . -type f -name "*.o" | xargs nm > "$file1" # List all defined (exported) symbols awk 'NF == 3 && $2 ~ /[A-Z]/ {print $3}' "$file1" | sort > "$file2" From 0450d5d47fcb0a84c100fed8b8fee45e0956eb73 Mon Sep 17 00:00:00 2001 From: Guest Date: Sun, 11 Feb 2024 17:29:17 -0500 Subject: [PATCH 34/39] make strace spawn shell such that it traces the redirection --- parallel-orch/template_script_to_execute.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parallel-orch/template_script_to_execute.sh b/parallel-orch/template_script_to_execute.sh index ea36ac4e..5af85d64 100755 --- a/parallel-orch/template_script_to_execute.sh +++ b/parallel-orch/template_script_to_execute.sh @@ -33,7 +33,7 @@ # rkr --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile" --debug trace -o "$TRACE_FILE" > /dev/null # echo 'second riker run done' 1>&2 source $LATEST_ENV_FILE -eval $(echo "strace -y -f --seccomp-bpf --trace=fork,clone,%file -o $TRACE_FILE $CMD_STRING") +strace -y -f --seccomp-bpf --trace=fork,clone,%file -o $TRACE_FILE bash -c "source $LATEST_ENV_FILE; $CMD_STRING" exit_code=$? (exit $exit_code) From 5be2a6cce5c3e2ca4245f9e5bf63b75ae381b756 Mon Sep 17 00:00:00 2001 From: Guest Date: Mon, 12 Feb 2024 14:28:29 -0500 Subject: [PATCH 35/39] adding getxattr, lgetxattr, and faccessat2 into tracing --- parallel-orch/trace_v2.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py index 10c52c47..d0696944 100644 --- a/parallel-orch/trace_v2.py +++ b/parallel-orch/trace_v2.py @@ -8,16 +8,16 @@ # Global TODOs: # handle pwd, such that open and stat can work -# not handled: listxattr, llistxattr, getxattr, lgetxattr, pivot_root, mount, umount2 +# not handled: listxattr, llistxattr, getxattr, pivot_root, mount, umount2 # setxattr lsetxattr removexattr lremovexattr, fanotify_mark, renameat2, chroot, quotactl # handled individually openat, open, chdir, clone, rename # TODO: link, symlink, renameat, symlinkat r_first_path_set = set(['execve', 'stat', 'lstat', 'access', 'statfs', - 'readlink', 'execve']) + 'readlink', 'execve', 'getxattr', 'lgetxattr']) w_first_path_set = set(['mkdir', 'rmdir', 'truncate', 'creat', 'chmod', 'chown', 'lchown', 'utime', 'mknod', 'utimes', 'acct', 'unlink']) r_fd_path_set = set(['fstatat', 'newfstatat', 'statx', 'name_to_handle_at', - 'readlinkat', 'faccessat', 'execveat']) + 'readlinkat', 'faccessat', 'execveat', 'faccessat2']) w_fd_path_set = set(['unlinkat', 'utimensat', 'mkdirat', 'mknodat', 'fchownat', 'futimeat', 'unlinkat', 'linkat', 'fchmodat', 'utimensat']) ignore_set = set(['getpid', 'getcwd']) @@ -89,6 +89,9 @@ def parse_string(s): # as a read when we handle return value anyway so it's fine if s == 'NULL': return '' + if not s[0] == '"' or not s[-1] == '"': + import pdb + pdb.set_trace() assert s[0] == '"' and s[-1] == '"' return bytes(s[1:-1], "utf-8").decode("unicode_escape") From 6f2266f3df6ceb3e098d87e82c96bbce6ac307da Mon Sep 17 00:00:00 2001 From: gliargovas Date: Sat, 17 Feb 2024 15:57:25 +0000 Subject: [PATCH 36/39] Use most recent try branch --- deps/try | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/try b/deps/try index 37bbf7da..ba6a9061 160000 --- a/deps/try +++ b/deps/try @@ -1 +1 @@ -Subproject commit 37bbf7da5bfde97f598c3327c9582d9b08d7e264 +Subproject commit ba6a90615944203a95d5a86638447da34e539d1b From 1172d7fbb12f29c7348a78c0319e895a3b1b4a97 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Sat, 17 Feb 2024 16:08:08 +0000 Subject: [PATCH 37/39] Remove Riker dependency --- .gitmodules | 4 ---- deps/riker | 1 - scripts/install_deps_ubuntu20.sh | 10 ++-------- 3 files changed, 2 insertions(+), 13 deletions(-) delete mode 160000 deps/riker diff --git a/.gitmodules b/.gitmodules index ba4f5c51..bf6ada7c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,3 @@ -[submodule "deps/riker"] - path = deps/riker - url = https://github.com/angelhof/riker.git - branch = eric-custom-db-store [submodule "deps/pash"] path = deps/pash url = https://github.com/binpash/pash.git diff --git a/deps/riker b/deps/riker deleted file mode 160000 index f3bee7ba..00000000 --- a/deps/riker +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f3bee7ba19b8834199ff49dac53852f09f03338a diff --git a/scripts/install_deps_ubuntu20.sh b/scripts/install_deps_ubuntu20.sh index 8d239d2a..f884202c 100755 --- a/scripts/install_deps_ubuntu20.sh +++ b/scripts/install_deps_ubuntu20.sh @@ -1,20 +1,14 @@ #!/bin/bash -## Install Riker's dependencies sudo apt-get update -sudo apt install -y make clang llvm git gcc python3-cram file graphviz libtool -sudo update-alternatives --install /usr/bin/cram cram /usr/bin/cram3 100 +# TODO: some of these are Riker dependencies are no longer needed. +sudo apt install -y make clang llvm git gcc python3-cram file graphviz libtool python3-matplotlib export PASH_SPEC_TOP=${PASH_SPEC_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)} export PASH_TOP=${PASH_TOP:-$PASH_SPEC_TOP/deps/pash} -pip3 install $PASH_SPEC_TOP/requirements.txt - ## Download submodule dependencies git submodule update --init --recursive -## Install Riker -(cd deps/riker; make; sudo make install) - ## Install PaSh (cd deps/pash; ./scripts/distro-deps.sh; ./scripts/setup-pash.sh) From d0851fdd5988b6584e96d3c2464af29ff508cbc6 Mon Sep 17 00:00:00 2001 From: gliargovas Date: Sat, 17 Feb 2024 16:11:36 +0000 Subject: [PATCH 38/39] Update config to also setup try --- scripts/install_deps_ubuntu20.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/install_deps_ubuntu20.sh b/scripts/install_deps_ubuntu20.sh index f884202c..1bcf4122 100755 --- a/scripts/install_deps_ubuntu20.sh +++ b/scripts/install_deps_ubuntu20.sh @@ -2,7 +2,7 @@ sudo apt-get update # TODO: some of these are Riker dependencies are no longer needed. -sudo apt install -y make clang llvm git gcc python3-cram file graphviz libtool python3-matplotlib +sudo apt install -y make git python3-cram file graphviz libtool python3-matplotlib libcap2-bin mergerfs export PASH_SPEC_TOP=${PASH_SPEC_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)} export PASH_TOP=${PASH_TOP:-$PASH_SPEC_TOP/deps/pash} @@ -10,5 +10,8 @@ export PASH_TOP=${PASH_TOP:-$PASH_SPEC_TOP/deps/pash} ## Download submodule dependencies git submodule update --init --recursive +# Install try +(cd deps/try; ./setup.sh) + ## Install PaSh (cd deps/pash; ./scripts/distro-deps.sh; ./scripts/setup-pash.sh) From c569b8313a3a96f7dee6897d7cb5c637ea450a7b Mon Sep 17 00:00:00 2001 From: gliargovas Date: Sat, 17 Feb 2024 16:12:23 +0000 Subject: [PATCH 39/39] Make CI use Python 3.11 --- .github/workflows/tests.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 193e1af2..1e187580 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -31,6 +31,10 @@ jobs: if: github.event.pull_request.draft == false steps: - uses: actions/checkout@v2 + - name: Set up Python 3.11 + uses: actions/setup-python@v2 + with: + python-version: '3.11' - name: Running Correctness Tests run: | cd ..