From 09960a10d275c02a474ec2854fe11933790a7812 Mon Sep 17 00:00:00 2001
From: SleepyMug <di_jin@brown.edu>
Date: Fri, 27 Oct 2023 08:19:42 -0400
Subject: [PATCH 01/39] handling basic cases

---
 parallel-orch/trace_v2.py | 163 ++++++++++++++++++++++++++++++++++++++
 test/tracer_test/Makefile |  10 +++
 test/tracer_test/chdir.c  |  27 +++++++
 test/tracer_test/pid.c    |  26 ++++++
 4 files changed, 226 insertions(+)
 create mode 100644 parallel-orch/trace_v2.py
 create mode 100644 test/tracer_test/Makefile
 create mode 100644 test/tracer_test/chdir.c
 create mode 100644 test/tracer_test/pid.c

diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py
new file mode 100644
index 00000000..125614f2
--- /dev/null
+++ b/parallel-orch/trace_v2.py
@@ -0,0 +1,163 @@
+import re
+import os.path
+import sys
+from dataclasses import dataclass
+
+# Global TODOs:
+# handle pwd, such that open and stat can work
+
+def parse_info(l):
+    return 0
+
+@dataclass
+class RFile:
+    fname: str
+
+@dataclass
+class WFile:
+    fname: str
+
+# openat
+r_first_path_set = set(['execve', 'stat', 'lstat', 'access', 'statfs'])
+w_first_path_set = set(['mkdir'])
+r_fd_path_set = set(['fstatat', 'newfstatat'])
+w_fd_path_set = set(['unlinkat'])
+ignore_set = set(['getpid'])
+
+def parse_string(s):
+    s = s.strip()
+    assert s[0] == '"' and s[-1] == '"'
+    return bytes(s[1:-1], "utf-8").decode("unicode_escape")
+
+def between(s, d1, d2):
+    return s.find(d1) + len(d1), s.rfind(d2)
+
+def get_path_first_path(args):
+    a, _ = args.split(sep=',', maxsplit=1)
+    return parse_string(a)
+
+def parse_r_first_path(args, ret):
+    return RFile(get_path_first_path(args))
+
+def parse_w_first_path(args, ret):
+    path = get_path_first_path(args)    
+    if is_ret_enoent(ret):
+        return RFile(path)
+    else:
+        return WFile(path)
+
+def is_open_flag(flags):
+    if 'O_RDONLY' in flags:
+        return 'r'
+    else:
+        return 'w'
+
+def is_absolute(path):
+    return path[0] == '/'
+
+def is_ret_enoent(ret):
+    return 'ENOENT' in ret
+
+def parse_openat(args, ret):
+    if args.count(',') <= 2:
+        dfd, path, flags = args.split(',', maxsplit=2)
+    else:
+        dfd, path, flags, _ = args.split(',', maxsplit=3)
+    path = parse_string(path)
+    if is_absolute(path):
+        total_path = path
+    else:
+        begin, end = between(dfd, '<', '>')
+        pwd = dfd[begin:end]
+        total_path = os.path.join(pwd, path)
+    if is_open_flag(flags) == 'r':
+        return RFile(total_path)
+    if is_ret_enoent(ret):
+        return RFile(total_path)
+    return WFile(total_path)
+
+def parse_chdir(args, ret):
+    return None
+
+def get_path_from_fd_path(args):
+    a0, a1, _ = args.split(sep=',', maxsplit=2)
+    a1 = parse_string(a1)
+    if a1[0] == '/':
+        return a1
+    else:
+        begin, end = between(a0, '<', '>')
+        a0 = a0[begin:end]
+        return os.path.join(a0, a1)
+
+def parse_r_fd_path(args, ret):
+    return RFile(get_path_from_fd_path(args))
+
+def parse_w_fd_path(args, ret):
+    if is_ret_enoent(ret):
+        return RFile(get_path_from_fd_path(args))
+    else:
+        return WFile(get_path_from_fd_path(args))
+    
+def parse_syscall(syscall, args, ret):
+    if syscall in r_first_path_set:
+        return parse_r_first_path(args, ret)
+    elif syscall in w_first_path_set:
+        return parse_w_first_path(args, ret)
+    elif syscall == 'openat':
+        return parse_openat(args, ret)
+    elif syscall == 'chdir':
+        return parse_chdir(args, ret)
+    elif syscall in r_fd_path_set:
+        return parse_r_fd_path(args, ret)
+    elif syscall in w_fd_path_set:
+        return parse_w_fd_path(args, ret)
+    elif syscall in ignore_set:
+        return None
+    else:
+        raise ValueError('Unclassified syscall ' + syscall)
+
+def strip_prefix(l):
+    if l[0].isdigit():
+        return l.split(' ', maxsplit=1)[1]
+    else:
+        return l
+
+def handle_info(l):
+    if '+++' in l:
+        return True, parse_info(l)
+    elif '---' in l:
+        return True, None
+    else:
+        return False, None
+        
+def parse_line(l):
+    is_info, info = handle_info(l)
+    if is_info:
+        return info
+    if not len(l):
+        return None
+    l = strip_prefix(l)
+    lparen = l.find('(')
+    rparen = l.rfind(')')
+    equals = l.rfind('=')
+    syscall = l[:lparen]
+    args = l[lparen+1:rparen]
+    ret = l[equals+1]
+    return parse_syscall(syscall, args, ret)
+
+    
+def main(fname):
+    with open(fname) as f:
+        s = f.read()
+    for l in s.split('\n'):
+        print(parse_line(l))
+
+debug_g = r'''
+start: ESCAPED_STRING
+
+%import common.ESCAPED_STRING
+'''
+if __name__ == '__main__':
+    # parser = lark.Lark(debug_g)
+    # parser.parse('"lskjkf"')
+    main(sys.argv[1])
diff --git a/test/tracer_test/Makefile b/test/tracer_test/Makefile
new file mode 100644
index 00000000..350528a2
--- /dev/null
+++ b/test/tracer_test/Makefile
@@ -0,0 +1,10 @@
+objs = chdir pid
+
+CFLAGS=-O2
+
+.PHONY: all clean
+
+all: $(objs)
+
+clean:
+	rm $(objs)
diff --git a/test/tracer_test/chdir.c b/test/tracer_test/chdir.c
new file mode 100644
index 00000000..a7a66a9e
--- /dev/null
+++ b/test/tracer_test/chdir.c
@@ -0,0 +1,27 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#define TMPDIR "/tmp/hs_tracer_test"
+
+void reset(void)
+{
+	int ret;
+	ret = system("rm -rf " TMPDIR);
+	ret = system("mkdir " TMPDIR);
+}
+
+int main(void)
+{
+	int fd, ret;
+	reset();
+	ret = chdir(TMPDIR);
+	if (ret < 0)
+		exit(1);
+	fd = open("a", O_RDONLY);
+	if (fd < 0)
+		exit(1);
+	close(fd);
+	return 0;
+}
diff --git a/test/tracer_test/pid.c b/test/tracer_test/pid.c
new file mode 100644
index 00000000..28b9cc04
--- /dev/null
+++ b/test/tracer_test/pid.c
@@ -0,0 +1,26 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+
+#define NUM_CALLS 100000
+
+int main() {
+    struct timeval start, end;
+    long seconds, useconds;
+    double mtime;
+
+    gettimeofday(&start, NULL); // get the start time
+    for (int i = 0; i < NUM_CALLS; ++i) {
+        syscall(SYS_getpid);
+    }
+    gettimeofday(&end, NULL);  // get the end time
+
+    seconds  = end.tv_sec  - start.tv_sec;
+    useconds = end.tv_usec - start.tv_usec;
+    mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5;
+
+    printf("Elapsed time for getpid syscall: %.3f milliseconds\n", mtime);
+    printf("Average time per getpid syscall: %.3f microseconds\n", mtime * 1000 / NUM_CALLS);
+    return 0;
+}

From c189d274571440cd4db41dfaab16618b0e78d515 Mon Sep 17 00:00:00 2001
From: SleepyMug <di_jin@brown.edu>
Date: Sun, 29 Oct 2023 03:34:32 -0400
Subject: [PATCH 02/39] changes to support clone and chdir

---
 parallel-orch/trace_v2.py  | 180 +++++++++++++++++++++++++++----------
 test/tracer_test/Makefile  |   7 +-
 test/tracer_test/chdir.c   |   3 +-
 test/tracer_test/fork.c    |  37 ++++++++
 test/tracer_test/symlink.c |  39 ++++++++
 test/tracer_test/thread.c  |  41 +++++++++
 6 files changed, 256 insertions(+), 51 deletions(-)
 create mode 100644 test/tracer_test/fork.c
 create mode 100644 test/tracer_test/symlink.c
 create mode 100644 test/tracer_test/thread.c

diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py
index 125614f2..824829e0 100644
--- a/parallel-orch/trace_v2.py
+++ b/parallel-orch/trace_v2.py
@@ -12,51 +12,110 @@ def parse_info(l):
 @dataclass
 class RFile:
     fname: str
+    def __init__(self, fname):
+        self.fname = os.path.normpath(fname)
 
 @dataclass
 class WFile:
     fname: str
+    def __init__(self, fname):
+        self.fname = os.path.normpath(fname)
 
-# openat
+class Context:
+    def __init__(self):
+        self.line_dict = {}
+        self.curdir_dict = {}
+        self.pid_group_dict = {}
+
+    def do_clone(self, parent, child):
+        self.pid_group_dict[child] = parent
+        
+    def set_dir(self, path, pid=None):
+        self.curdir_fallback = path
+        if pid and pid in self.pid_group_dict:
+            pid = self.pid_group_dict[pid]
+        if pid:
+            self.curdir_dict[pid] = path
+
+    def get_dir(self, pid: int):
+        if pid in self.pid_group_dict:
+            pid = self.pid_group_dict[pid]
+        if not pid in self.curdir_dict:
+            self.curdir_dict[pid] = self.curdir_fallback
+        return self.curdir_dict[pid]
+
+    def push_half_line(self, pid: int, l):
+        index = l.find('<unfinished')
+        self.line_dict[pid] = l[:index].strip()
+
+    def pop_complete_line(self, pid: int, l):
+        index = l.find('resumed>') + len('resumed>')
+        total_line = self.line_dict[pid] + l[index:].strip()
+        del self.line_dict[pid]
+        return total_line
+
+# openat, open, chdir, clone
 r_first_path_set = set(['execve', 'stat', 'lstat', 'access', 'statfs'])
 w_first_path_set = set(['mkdir'])
 r_fd_path_set = set(['fstatat', 'newfstatat'])
-w_fd_path_set = set(['unlinkat'])
+w_fd_path_set = set(['unlinkat', 'utimensat'])
 ignore_set = set(['getpid'])
 
 def parse_string(s):
     s = s.strip()
+    # handling cases such as utimensat
+    # if the open fails we will mark the file
+    # as a read when we handle return value anyway so it's fine
+    if s == 'NULL':
+        return ''
     assert s[0] == '"' and s[-1] == '"'
     return bytes(s[1:-1], "utf-8").decode("unicode_escape")
 
 def between(s, d1, d2):
     return s.find(d1) + len(d1), s.rfind(d2)
 
-def get_path_first_path(args):
-    a, _ = args.split(sep=',', maxsplit=1)
-    return parse_string(a)
+def is_absolute(path):
+    return path[0] == '/'
+
+def is_ret_err(ret: str):
+    ret = ret.strip()
+    return ret[0] == '-'
+
+def get_path_first_path(pid, args, ctx):
+    a = args.split(sep=',', maxsplit=1)[0]
+    if is_absolute(a):
+        return parse_string(a)
+    else:
+        return os.path.join(ctx.get_dir(pid), parse_string(a))
 
-def parse_r_first_path(args, ret):
-    return RFile(get_path_first_path(args))
+def parse_r_first_path(pid, args, ret, ctx):
+    return RFile(get_path_first_path(pid, args, ctx))
 
-def parse_w_first_path(args, ret):
-    path = get_path_first_path(args)    
-    if is_ret_enoent(ret):
+def parse_w_first_path(pid, args, ret, ctx):
+    path = get_path_first_path(pid, args, ctx)
+    if is_ret_err(ret):
         return RFile(path)
     else:
         return WFile(path)
 
-def is_open_flag(flags):
+def parse_chdir(pid, args, ret, ctx):
+    new_path = get_path_first_path(pid, args, ctx)
+    if not is_ret_err(ret):
+        ctx.set_dir(new_path, pid)
+    return RFile(new_path)
+
+def handle_open_flag(flags):
     if 'O_RDONLY' in flags:
         return 'r'
     else:
         return 'w'
 
-def is_absolute(path):
-    return path[0] == '/'
-
-def is_ret_enoent(ret):
-    return 'ENOENT' in ret
+def handle_open_common(total_path, flags, ret):
+    if handle_open_flag(flags) == 'r':
+        return RFile(total_path)
+    if is_ret_err(ret):
+        return RFile(total_path)
+    return WFile(total_path)
 
 def parse_openat(args, ret):
     if args.count(',') <= 2:
@@ -70,19 +129,17 @@ def parse_openat(args, ret):
         begin, end = between(dfd, '<', '>')
         pwd = dfd[begin:end]
         total_path = os.path.join(pwd, path)
-    if is_open_flag(flags) == 'r':
-        return RFile(total_path)
-    if is_ret_enoent(ret):
-        return RFile(total_path)
-    return WFile(total_path)
-
-def parse_chdir(args, ret):
-    return None
+    return handle_open_common(total_path, flags, ret)
 
+def parse_open(pid, args, ret, ctx):
+    total_path = get_path_first_path(pid, args, ctx)
+    flags = args.split(',')[1]
+    return handle_open_common(total_path, flags, ret)
+    
 def get_path_from_fd_path(args):
     a0, a1, _ = args.split(sep=',', maxsplit=2)
     a1 = parse_string(a1)
-    if a1[0] == '/':
+    if len(a1) and a1[0] == '/':
         return a1
     else:
         begin, end = between(a0, '<', '>')
@@ -93,34 +150,58 @@ def parse_r_fd_path(args, ret):
     return RFile(get_path_from_fd_path(args))
 
 def parse_w_fd_path(args, ret):
-    if is_ret_enoent(ret):
+    if is_ret_err(ret):
         return RFile(get_path_from_fd_path(args))
     else:
         return WFile(get_path_from_fd_path(args))
+
+def has_clone_fs(flags):
+    if 'CLONE_FS' in flags:
+        return True
+    else:
+        return False
+
+def parse_clone(pid, args, ret, ctx):
+    try:
+        child = int(ret)
+    except ValueError:
+        child = -1
+    if child < 0:
+        return
+    arg_list = [x.strip() for x in args.split(',')]
+    flags = [arg for arg in arg_list if arg.startswith('flags=')][0]
+    flags = flags[len('flags='):]
+    if has_clone_fs(flags):
+        ctx.do_clone(pid, child)
     
-def parse_syscall(syscall, args, ret):
+def parse_syscall(pid, syscall, args, ret, ctx):
     if syscall in r_first_path_set:
-        return parse_r_first_path(args, ret)
+        return parse_r_first_path(pid, args, ret, ctx)
     elif syscall in w_first_path_set:
-        return parse_w_first_path(args, ret)
+        return parse_w_first_path(pid, args, ret, ctx)
     elif syscall == 'openat':
         return parse_openat(args, ret)
     elif syscall == 'chdir':
-        return parse_chdir(args, ret)
+        return parse_chdir(pid, args, ret, ctx)
+    elif syscall == 'open':
+        return parse_open(pid, args, ret, ctx)
     elif syscall in r_fd_path_set:
         return parse_r_fd_path(args, ret)
     elif syscall in w_fd_path_set:
         return parse_w_fd_path(args, ret)
+    elif syscall == 'clone':
+        return parse_clone(pid, args, ret, ctx)
     elif syscall in ignore_set:
         return None
     else:
         raise ValueError('Unclassified syscall ' + syscall)
 
-def strip_prefix(l):
+def strip_pid(l):
     if l[0].isdigit():
-        return l.split(' ', maxsplit=1)[1]
+        pair = l.split(' ', maxsplit=1)
+        return int(pair[0]), pair[1]
     else:
-        return l
+        raise ValueError('expect pid')
 
 def handle_info(l):
     if '+++' in l:
@@ -129,35 +210,38 @@ def handle_info(l):
         return True, None
     else:
         return False, None
-        
-def parse_line(l):
+
+def parse_line(l, ctx):
     is_info, info = handle_info(l)
     if is_info:
         return info
     if not len(l):
         return None
-    l = strip_prefix(l)
+    pid, l = strip_pid(l)
+    if "<unfinished" in l:
+        ctx.push_half_line(pid, l)
+        return None
+    elif "resumed>" in l:
+        l = ctx.pop_complete_line(pid, l)
     lparen = l.find('(')
-    rparen = l.rfind(')')
     equals = l.rfind('=')
+    rparen = l[:equals].rfind(')')
+    assert lparen >= 0 and equals >= 0 and rparen >= 0
     syscall = l[:lparen]
+    ret = l[equals+1:]
     args = l[lparen+1:rparen]
-    ret = l[equals+1]
-    return parse_syscall(syscall, args, ret)
+    return parse_syscall(pid, syscall, args, ret, ctx)
+
 
-    
 def main(fname):
     with open(fname) as f:
         s = f.read()
+    ctx = Context()
+    ctx.set_dir(os.getcwd())
     for l in s.split('\n'):
-        print(parse_line(l))
-
-debug_g = r'''
-start: ESCAPED_STRING
+        record = parse_line(l, ctx)
+        if record:
+            print(record)
 
-%import common.ESCAPED_STRING
-'''
 if __name__ == '__main__':
-    # parser = lark.Lark(debug_g)
-    # parser.parse('"lskjkf"')
     main(sys.argv[1])
diff --git a/test/tracer_test/Makefile b/test/tracer_test/Makefile
index 350528a2..b7e40cac 100644
--- a/test/tracer_test/Makefile
+++ b/test/tracer_test/Makefile
@@ -1,10 +1,13 @@
-objs = chdir pid
+objs = chdir pid fork symlink
 
 CFLAGS=-O2
 
 .PHONY: all clean
 
-all: $(objs)
+all: $(objs) thread
+
+thread: thread.c
+	$(CC) $(CFLAGS) -pthread $< -o $@
 
 clean:
 	rm $(objs)
diff --git a/test/tracer_test/chdir.c b/test/tracer_test/chdir.c
index a7a66a9e..e2282bfd 100644
--- a/test/tracer_test/chdir.c
+++ b/test/tracer_test/chdir.c
@@ -2,6 +2,7 @@
 #include <fcntl.h>
 #include <unistd.h>
 #include <stdlib.h>
+#include <syscall.h>
 
 #define TMPDIR "/tmp/hs_tracer_test"
 
@@ -19,7 +20,7 @@ int main(void)
 	ret = chdir(TMPDIR);
 	if (ret < 0)
 		exit(1);
-	fd = open("a", O_RDONLY);
+	fd = syscall(SYS_open, "a", O_RDONLY);
 	if (fd < 0)
 		exit(1);
 	close(fd);
diff --git a/test/tracer_test/fork.c b/test/tracer_test/fork.c
new file mode 100644
index 00000000..5a9533ae
--- /dev/null
+++ b/test/tracer_test/fork.c
@@ -0,0 +1,37 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <syscall.h>
+
+#define TMPDIR "/tmp/hs_tracer_test"
+
+void reset(void)
+{
+ 	int ret;
+	ret = system("rm -rf " TMPDIR);
+	ret = system("mkdir " TMPDIR);
+	ret = system("mkdir " TMPDIR "/a");
+	ret = system("mkdir " TMPDIR "/b");
+}
+
+int main(void)
+{
+ 	int fd, ret;
+	reset();
+	ret = fork();
+	if (ret == 0) {
+		ret = chdir(TMPDIR "/a");
+		if (ret < 0)
+			exit(1);
+	} else {
+		ret = chdir(TMPDIR "/b");
+		if (ret < 0)
+			exit(1);
+	}
+	fd = syscall(SYS_open, "f", O_RDONLY);
+	if (fd < 0)
+                exit(1);
+	close(fd);
+	return 0;
+}
diff --git a/test/tracer_test/symlink.c b/test/tracer_test/symlink.c
new file mode 100644
index 00000000..613b58dd
--- /dev/null
+++ b/test/tracer_test/symlink.c
@@ -0,0 +1,39 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <syscall.h>
+
+#define TMPDIR "/tmp/hs_tracer_test"
+
+void reset(void)
+{
+ 	int ret;
+	ret = system("rm -rf " TMPDIR);
+	ret = system("mkdir " TMPDIR);
+	ret = system("mkdir " TMPDIR "/a");
+	ret = system("mkdir " TMPDIR "/b");
+	ret = system("touch " TMPDIR "/b/f");
+	ret = system("ln -s " TMPDIR "/b/f " TMPDIR "/a/f");
+}
+
+int main(void)
+{
+ 	int fd, ret;
+	reset();
+	/* if (ret == 0) { */
+	/* 	ret = chdir(TMPDIR "/a"); */
+	/* 	if (ret < 0) */
+	/* 		exit(1); */
+	/* } else { */
+	/* 	ret = chdir(TMPDIR "/b"); */
+	/* 	if (ret < 0) */
+	/* 		exit(1); */
+	/* } */
+	ret = chdir(TMPDIR "/a");
+	fd = syscall(SYS_open, "f", O_RDONLY);
+	if (fd < 0)
+                exit(1);
+	close(fd);
+	return 0;
+}
diff --git a/test/tracer_test/thread.c b/test/tracer_test/thread.c
new file mode 100644
index 00000000..3d109917
--- /dev/null
+++ b/test/tracer_test/thread.c
@@ -0,0 +1,41 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <syscall.h>
+#include <pthread.h>
+
+#define TMPDIR "/tmp/hs_tracer_test"
+
+void reset(void)
+{
+ 	int ret;
+	ret = system("rm -rf " TMPDIR);
+	ret = system("mkdir " TMPDIR);
+	ret = system("mkdir " TMPDIR "/a");
+	ret = system("mkdir " TMPDIR "/b");
+	ret = system("touch " TMPDIR "/a/f");
+}
+
+void *threaded_chdir(void *p)
+{
+	int ret;
+	ret = chdir(TMPDIR "/a");
+	return NULL;
+}
+
+int main(void)
+{
+	pthread_t child;
+ 	int fd, ret;
+	reset();
+	ret = chdir(TMPDIR "/b");
+	ret = pthread_create(&child, NULL, threaded_chdir, NULL);
+	ret = pthread_join(child, NULL);
+	fd = syscall(SYS_open, "f", O_RDONLY);
+	fd = syscall(SYS_open, "g", O_RDONLY);
+	if (fd < 0)
+                exit(1);
+	close(fd);
+	return 0;
+}

From e7bb52bbfc61a3193eb1a366f3c2ab19fd458c6e Mon Sep 17 00:00:00 2001
From: SleepyMug <di_jin@brown.edu>
Date: Sun, 29 Oct 2023 04:02:13 -0400
Subject: [PATCH 03/39] correct realtime pipe behavior

---
 parallel-orch/trace_v2.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py
index 824829e0..8eee73a1 100644
--- a/parallel-orch/trace_v2.py
+++ b/parallel-orch/trace_v2.py
@@ -234,14 +234,13 @@ def parse_line(l, ctx):
 
 
 def main(fname):
-    with open(fname) as f:
-        s = f.read()
     ctx = Context()
     ctx.set_dir(os.getcwd())
-    for l in s.split('\n'):
-        record = parse_line(l, ctx)
-        if record:
-            print(record)
+    with open(fname) as f:
+        for l in f:
+            record = parse_line(l, ctx)
+            if record:
+                print(record)
 
 if __name__ == '__main__':
     main(sys.argv[1])

From 1cb10416a3900996e5ef5e9250aef4a7b2b0e99b Mon Sep 17 00:00:00 2001
From: Guest <guest@neutron>
Date: Sat, 11 Nov 2023 09:06:43 -0500
Subject: [PATCH 04/39] basic integration

---
 parallel-orch/partial_program_order.py      |  5 ++-
 parallel-orch/template_script_to_execute.sh | 50 +++++++++++----------
 parallel-orch/trace_v2.py                   | 44 ++++++++++++++++--
 3 files changed, 70 insertions(+), 29 deletions(-)

diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index 8aabcd7f..4aff7c81 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -7,6 +7,7 @@
 import config
 import executor
 import trace
+import trace_v2
 from util import *
 import util
 from collections import defaultdict
@@ -1394,7 +1395,7 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand
             self.stopped.add(node_id)
         else:
             trace_object = executor.read_trace(sandbox_dir, trace_file)
-            cmd_exit_code = trace.parse_exit_code(trace_object)
+            cmd_exit_code = trace_v2.parse_exit_code(trace_object)
 
             ## Save the completed node info. Note that if the node doesn't commit
             ##  this information will be invalid and rewritten the next time execution
@@ -1405,7 +1406,7 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand
             ## We no longer add failed commands to the stopped set, 
             ## because this leads to more repetitions than needed
             ## and does not allow us to properly speculate commands
-            read_set, write_set = trace.parse_and_gather_cmd_rw_sets(trace_object)
+            read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object)
             rw_set = RWSet(read_set, write_set)
             self.update_rw_set(node_id, rw_set)
 
diff --git a/parallel-orch/template_script_to_execute.sh b/parallel-orch/template_script_to_execute.sh
index a63596d1..d9f11171 100755
--- a/parallel-orch/template_script_to_execute.sh
+++ b/parallel-orch/template_script_to_execute.sh
@@ -1,35 +1,39 @@
 #!/bin/bash
 
-touch "$TEMPDIR/Rikerfile"
+# touch "$TEMPDIR/Rikerfile"
 
-## We source the latest env file
-## TODO: Executing through $RUNTIME_DIR/pash_source_declare_vars.sh fails. Figure out why.
-echo "source $LATEST_ENV_FILE" > "$TEMPDIR/Rikerfile"
+# ## We source the latest env file
+# ## TODO: Executing through $RUNTIME_DIR/pash_source_declare_vars.sh fails. Figure out why.
+# echo "source $LATEST_ENV_FILE" > "$TEMPDIR/Rikerfile"
 
-## Save the script to execute in the sandboxdir
-echo $CMD_STRING >> "$TEMPDIR/Rikerfile"
+# ## Save the script to execute in the sandboxdir
+# echo $CMD_STRING >> "$TEMPDIR/Rikerfile"
 
-## Add command to export Riker's environment variables after run is complete to a file
-echo "source $RUNTIME_DIR/pash_declare_vars.sh $POST_EXEC_ENV" >> "$TEMPDIR/Rikerfile"
+# ## Add command to export Riker's environment variables after run is complete to a file
+# echo "source $RUNTIME_DIR/pash_declare_vars.sh $POST_EXEC_ENV" >> "$TEMPDIR/Rikerfile"
 
-if [ $speculate_flag -eq 1 ]; then
-    rkr_cmd="rkr"
-else
-    rkr_cmd="rkr --frontier"
-fi
+# if [ $speculate_flag -eq 1 ]; then
+#     rkr_cmd="rkr"
+# else
+#     rkr_cmd="rkr --frontier"
+# fi
 
-cat "$TEMPDIR/Rikerfile" 1>&2
+# cat "$TEMPDIR/Rikerfile" 1>&2
 
-$rkr_cmd --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile"
-exit_code="$?"
+# $rkr_cmd --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile"
+# exit_code="$?"
 
-if [ "$exit_code" -eq 0 ]; then
-    echo "first riker run done (Node: ${CMD_ID})" 1>&2
-else
-    echo "Riker error: first Riker command failed with EC $exit_code - (Node: ${CMD_ID})" 1>&2
-fi
+# if [ "$exit_code" -eq 0 ]; then
+#     echo "first riker run done (Node: ${CMD_ID})" 1>&2
+# else
+#     echo "Riker error: first Riker command failed with EC $exit_code - (Node: ${CMD_ID})" 1>&2
+# fi
+
+
+# rkr --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile" --debug trace -o "$TRACE_FILE" > /dev/null
+# echo 'second riker run done' 1>&2
+source $LATEST_ENV_FILE
+eval $(echo "strace -y -f  --seccomp-bpf --trace=fork,clone,%file -o $TRACE_FILE $CMD_STRING")
 
-rkr --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile" --debug trace -o "$TRACE_FILE" > /dev/null
-echo 'second riker run done' 1>&2
 
 (exit $exit_code)
diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py
index 8eee73a1..8594e4c6 100644
--- a/parallel-orch/trace_v2.py
+++ b/parallel-orch/trace_v2.py
@@ -1,13 +1,25 @@
 import re
 import os.path
 import sys
+from typing import Tuple
 from dataclasses import dataclass
 
 # Global TODOs:
 # handle pwd, such that open and stat can work
 
+@dataclass
+class ExitStatus:
+    exitcode: int
+    
 def parse_info(l):
-    return 0
+    if "exited" in l:
+        start = len("+++ exited with ")
+        end = -len(" +++")
+        return ExitStatus(int(l[start:end]))
+    elif 'killed' in l:
+        return ExitStatus(-1)
+    else:
+        raise ValueError
 
 @dataclass
 class RFile:
@@ -59,7 +71,7 @@ def pop_complete_line(self, pid: int, l):
 w_first_path_set = set(['mkdir'])
 r_fd_path_set = set(['fstatat', 'newfstatat'])
 w_fd_path_set = set(['unlinkat', 'utimensat'])
-ignore_set = set(['getpid'])
+ignore_set = set(['getpid', 'getcwd'])
 
 def parse_string(s):
     s = s.strip()
@@ -198,7 +210,7 @@ def parse_syscall(pid, syscall, args, ret, ctx):
 
 def strip_pid(l):
     if l[0].isdigit():
-        pair = l.split(' ', maxsplit=1)
+        pair = l.split(maxsplit=1)
         return int(pair[0]), pair[1]
     else:
         raise ValueError('expect pid')
@@ -212,12 +224,12 @@ def handle_info(l):
         return False, None
 
 def parse_line(l, ctx):
+    pid, l = strip_pid(l)
     is_info, info = handle_info(l)
     if is_info:
         return info
     if not len(l):
         return None
-    pid, l = strip_pid(l)
     if "<unfinished" in l:
         ctx.push_half_line(pid, l)
         return None
@@ -232,6 +244,30 @@ def parse_line(l, ctx):
     args = l[lparen+1:rparen]
     return parse_syscall(pid, syscall, args, ret, ctx)
 
+def parse_exit_code(trace_object) -> int:
+    if len(trace_object) < 1:
+        return None
+    l = trace_object[0]
+    first_pid, _ = strip_pid(l)
+    for l in trace_object:
+        pid, tmpl = strip_pid(l)
+        is_info, info = handle_info(tmpl)
+        if is_info and pid == first_pid and isinstance(info, ExitStatus):
+            return info.exitcode
+    raise ValueError("No exitcode")
+
+def parse_and_gather_cmd_rw_sets(trace_object) -> Tuple[set, set]:
+    ctx = Context()
+    ctx.set_dir(os.getcwd())
+    read_set = set()
+    write_set = set()
+    for l in trace_object:
+        record = parse_line(l, ctx)
+        if type(record) is RFile:
+            read_set.add(record.fname)
+        elif type(record) is WFile:
+            write_set.add(record.fname)
+    return read_set, write_set
 
 def main(fname):
     ctx = Context()

From 8e439b6943ad9e1d9302f519cfa802d0ad9b58c1 Mon Sep 17 00:00:00 2001
From: Guest <guest@neutron>
Date: Sat, 11 Nov 2023 17:49:25 -0500
Subject: [PATCH 05/39] more test cases

---
 test/tracer_test/Makefile |  2 +-
 test/tracer_test/getcwd.c | 28 ++++++++++++++++++++++++++++
 test/tracer_test/mkdir.c  | 23 +++++++++++++++++++++++
 test/tracer_test/retval.c | 22 ++++++++++++++++++++++
 4 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 test/tracer_test/getcwd.c
 create mode 100644 test/tracer_test/mkdir.c
 create mode 100644 test/tracer_test/retval.c

diff --git a/test/tracer_test/Makefile b/test/tracer_test/Makefile
index b7e40cac..74677e5f 100644
--- a/test/tracer_test/Makefile
+++ b/test/tracer_test/Makefile
@@ -1,4 +1,4 @@
-objs = chdir pid fork symlink
+objs = chdir pid fork symlink getcwd retval mkdir
 
 CFLAGS=-O2
 
diff --git a/test/tracer_test/getcwd.c b/test/tracer_test/getcwd.c
new file mode 100644
index 00000000..71b4f3bd
--- /dev/null
+++ b/test/tracer_test/getcwd.c
@@ -0,0 +1,28 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <syscall.h>
+
+#define TMPDIR "/tmp/hs_tracer_test"
+
+void reset(void)
+{
+	int ret;
+	ret = system("rm -rf " TMPDIR);
+	ret = system("mkdir " TMPDIR);
+}
+
+int main(void)
+{
+	int fd, ret;
+	char name[1024];
+	reset();
+	ret = chdir(TMPDIR);
+	if (ret < 0)
+		exit(1);
+	ret = syscall(SYS_getcwd, name, 1024);
+	if (ret < 0)
+		exit(1);
+	return 0;
+}
diff --git a/test/tracer_test/mkdir.c b/test/tracer_test/mkdir.c
new file mode 100644
index 00000000..6903e395
--- /dev/null
+++ b/test/tracer_test/mkdir.c
@@ -0,0 +1,23 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <syscall.h>
+
+#define TMPDIR "/tmp/hs_tracer_test"
+
+void reset(void)
+{
+	int ret;
+	ret = system("rm -rf " TMPDIR);
+	ret = system("mkdir " TMPDIR);
+}
+
+int main(void)
+{
+	int fd, ret;
+	reset();
+	ret = system("mkdir -p " TMPDIR "/a");
+	ret = system("mkdir -p " TMPDIR "/a");
+	return 0;
+}
diff --git a/test/tracer_test/retval.c b/test/tracer_test/retval.c
new file mode 100644
index 00000000..986ab538
--- /dev/null
+++ b/test/tracer_test/retval.c
@@ -0,0 +1,22 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <syscall.h>
+
+#define TMPDIR "/tmp/hs_tracer_test"
+
+void reset(void)
+{
+	int ret;
+	ret = system("rm -rf " TMPDIR);
+	ret = system("mkdir " TMPDIR);
+}
+
+int main(void)
+{
+	int fd, ret;
+	char name[1024];
+	reset();
+	return 112;
+}

From 379d616546c0f47c4eeabbb5f17aa1e01a528ae1 Mon Sep 17 00:00:00 2001
From: Guest <guest@neutron>
Date: Sun, 12 Nov 2023 01:00:10 -0500
Subject: [PATCH 06/39] catch exit code

---
 parallel-orch/template_script_to_execute.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parallel-orch/template_script_to_execute.sh b/parallel-orch/template_script_to_execute.sh
index d9f11171..ea36ac4e 100755
--- a/parallel-orch/template_script_to_execute.sh
+++ b/parallel-orch/template_script_to_execute.sh
@@ -34,6 +34,6 @@
 # echo 'second riker run done' 1>&2
 source $LATEST_ENV_FILE
 eval $(echo "strace -y -f  --seccomp-bpf --trace=fork,clone,%file -o $TRACE_FILE $CMD_STRING")
-
+exit_code=$?
 
 (exit $exit_code)

From 9e921e572c510e4b4503b330c9f72a2d70aa448d Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Wed, 15 Nov 2023 05:54:40 -0500
Subject: [PATCH 07/39] Fix bug in scheduler state updating

---
 parallel-orch/partial_program_order.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index 4aff7c81..13c739ba 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -1368,6 +1368,7 @@ def attempt_rerun_pending_nodes(self):
                 for node in run_after_nodes:
                     if node not in self.get_currently_executing():
                         logging.debug(f"Running node {node} after execution of {node_id}")
+                        self.waiting_for_frontend.discard(node)
                         self.workset.append(node)
                         self.pending_to_execute.discard(node)
                         self.set_latest_env_file_for_node(node, self.get_new_env_file_for_node(node_id))

From 43c91758080f887c6db18a9c179e0a79b2181b28 Mon Sep 17 00:00:00 2001
From: SleepyMug <di_jin@brown.edu>
Date: Wed, 13 Dec 2023 14:48:50 -0500
Subject: [PATCH 08/39] support for eager dependency resolving

---
 parallel-orch/executor.py              |  13 +-
 parallel-orch/partial_program_order.py | 256 ++++++++++++++-----------
 parallel-orch/run_command.sh           |  17 +-
 parallel-orch/scheduler_server.py      |  20 +-
 parallel-orch/trace_v2.py              |  62 ++++--
 parallel-orch/util.py                  |   7 +
 test/misc/cat_and_sleep.sh             |   4 +
 test/test_orch.sh                      |  11 ++
 test/test_scripts/test_early_stop1.sh  |   7 +
 test/test_scripts/test_early_stop2.sh  |   9 +
 10 files changed, 265 insertions(+), 141 deletions(-)
 create mode 100755 test/misc/cat_and_sleep.sh
 create mode 100644 test/test_scripts/test_early_stop1.sh
 create mode 100644 test/test_scripts/test_early_stop2.sh

diff --git a/parallel-orch/executor.py b/parallel-orch/executor.py
index 0349285e..526b2dd4 100644
--- a/parallel-orch/executor.py
+++ b/parallel-orch/executor.py
@@ -13,20 +13,21 @@ def async_run_and_trace_command_return_trace(command, node_id, latest_env_file,
     stdout_file = util.ptempfile()
     stderr_file = util.ptempfile()
     post_execution_env_file = util.ptempfile()
+    sandbox_dir, tmp_dir = util.create_sandbox()
     logging.debug(f'Scheduler: Stdout file for: {node_id} is: {stdout_file}')
     logging.debug(f'Scheduler: Stderr file for: {node_id} is: {stderr_file}')
     logging.debug(f'Scheduler: Trace file for: {node_id}: {trace_file}')
-    process = async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, speculate_mode)
-    return process, trace_file, stdout_file, stderr_file, post_execution_env_file
+    process = async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode)
+    return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir
 
 def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, node_id, latest_env_file):
-    process, trace_file, stdout_file, stderr_file, post_execution_env_file = async_run_and_trace_command_return_trace(command, node_id, latest_env_file, speculate_mode=True)
-    return process, trace_file, stdout_file, stderr_file, post_execution_env_file
+    process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, node_id, latest_env_file, speculate_mode=True)
+    return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir
 
-def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, speculate_mode=False):
+def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False):
     ## Call Riker to execute the command
     run_script = f'{config.PASH_SPEC_TOP}/parallel-orch/run_command.sh'
-    args = ["/bin/bash", run_script, command, trace_file, stdout_file, latest_env_file]
+    args = ["/bin/bash", run_script, command, trace_file, stdout_file, latest_env_file, sandbox_dir, tmp_dir]
     if speculate_mode:
         args.append("speculate")
     else:
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index 13c739ba..2242d9a4 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -30,7 +30,7 @@ def get_post_execution_env_file(self):
 
     def get_stdout_file(self):
         return self.stdout_file
-    
+
     def get_sandbox_dir(self):
         return self.sandbox_dir
 
@@ -61,7 +61,7 @@ def get_outer(self):
 
     def pop_outer(self):
         return self.loops.pop()
-    
+
     def add_inner(self, loop_iter_id: int):
         self.loops.insert(0, loop_iter_id)
 
@@ -92,16 +92,16 @@ def __eq__(self, other):
 class NodeId:
     def __init__(self, id: int, loop_iters=None):
         self.id = id
-        
+
         if loop_iters is None:
             self.loop_iters = LoopStack()
         else:
             assert(isinstance(loop_iters, LoopStack))
             self.loop_iters = loop_iters
-    
+
     def has_iters(self):
         return not self.loop_iters.is_empty()
-    
+
     def get_iters(self):
         return copy.deepcopy(self.loop_iters)
 
@@ -135,17 +135,17 @@ def __ne__(self, other):
         # Not strictly necessary, but to avoid having both x==y and x!=y
         # True at the same time
         return not(self == other)
-    
+
     ## TODO: Define this correctly if it is to be used for something other than dictionary indexing
     def __lt__(self, obj):
         return (str(self) < str(obj))
-  
+
     def __gt__(self, obj):
         return (str(self) > str(obj))
-  
+
     # def __le__(self, obj):
     #     return ((self.b) <= (obj.b))
-  
+
     # def __ge__(self, obj):
     #     return ((self.b) >= (obj.b))
 
@@ -191,10 +191,10 @@ def get_cmd(self) -> str:
 
     def get_cmd_no_redir(self) -> str:
         return self.cmd_no_redir
-    
+
     def get_loop_context(self) -> LoopStack:
         return self.loop_context
-    
+
     def in_loop(self) -> bool:
         return not self.loop_context.is_empty()
 
@@ -211,11 +211,10 @@ def get_next_iter(self, loop_id: int) -> int:
     ##         a node is committed.
     def set_completed_info(self, completed_node_info: CompletedNodeInfo):
         self.completed_node_info = completed_node_info
-    
+
     def get_completed_node_info(self) -> CompletedNodeInfo:
         return self.completed_node_info
 
-
 class RWSet:
 
     def __init__(self, read_set: set, write_set: set):
@@ -258,7 +257,7 @@ def __init__(self, nodes, edges, initial_env_file):
         ## A dictionary from cmd_ids that are currently executing that contains their trace_files
         self.commands_currently_executing = {}
         ## A dictionary that contains information about completed nodes
-        ## from cmd_id -> CompletedNodeInfo 
+        ## from cmd_id -> CompletedNodeInfo
         ## Note: this dictionary does not contain information
         ## TODO: Delete this
         self.completed_node_info = {}
@@ -290,7 +289,7 @@ def __init__(self, nodes, edges, initial_env_file):
         self.pending_to_execute = set()
         self.to_be_resolved_prev = {}
         self.prechecked_env = set()
-            
+
     def __str__(self):
         return f"NODES: {len(self.nodes.keys())} | ADJACENCY: {self.adjacency}"
 
@@ -334,11 +333,11 @@ def get_sub_po_source_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]":
             if len(prev_ids_set) == 0 or \
                 not prev_ids_set.issubset(node_set):
                 source_nodes.append(node_id)
-        
+
         ## KK 2024-05-03: I don't see how we can get multiple sources with the current structure
         assert(len(source_nodes) == 1)
         return source_nodes
-    
+
     def get_sub_po_sink_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]":
         # assert(self.is_closed_sub_partial_order(node_ids))
         sink_nodes = list()
@@ -349,31 +348,31 @@ def get_sub_po_sink_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]":
             if len(next_ids_set) == 0 or \
                 not next_ids_set.issubset(node_set):
                 sink_nodes.append(node_id)
-        
+
         ## KK 2024-05-03: I don't see how we can get multiple sink with the current structure
         assert(len(sink_nodes) == 1)
         return sink_nodes
-    
+
     def set_new_env_file_for_node(self, node_id: NodeId, new_env_file: str):
         self.new_envs[node_id] = new_env_file
-        
+
     def get_new_env_file_for_node(self, node_id: NodeId) -> str:
         return self.new_envs.get(node_id)
-    
+
     def set_latest_env_file_for_node(self, node_id: NodeId, latest_env_file: str):
         self.latest_envs[node_id] = latest_env_file
-        
+
     def get_latest_env_file_for_node(self, node_id: NodeId) -> str:
         return self.latest_envs.get(node_id)
-    
+
     def get_most_recent_possible_new_env_for_node(self, node_id) -> str:
         most_recent_env_node = node_id
         while self.get_new_env_file_for_node(most_recent_env_node) is None:
             predecessor = self.get_prev(most_recent_env_node)
-            
+
             ## This will trigger when we move to full Partial Orders
             assert len(predecessor) <= 1
-            
+
             ## If there are no predecessors for a node it means we are at the source
             ## so there is no point to search further back
             if len(predecessor) == 0:
@@ -391,7 +390,7 @@ def get_sub_po_prev_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]":
         for node_id in node_ids:
             prev_ids_set = set(self.get_prev(node_id))
             prev_nodes = prev_nodes.union(prev_ids_set - node_set)
-        
+
         ## KK 2024-05-03: I don't see how we can get multiple sources with the current structure
         assert(len(prev_nodes) <= 1)
         return list(prev_nodes)
@@ -424,9 +423,9 @@ def init_partial_order(self):
         logging.debug(f'To be resolved sets per node:')
         logging.debug(self.to_be_resolved)
         logging.info(f'Initialized the partial order!')
-        self.log_partial_program_order_info()
+        # self.log_partial_program_order_info()
         assert(self.valid())
-        
+
 
     def init_latest_env_files(self, node=None):
         if node is None:
@@ -449,7 +448,7 @@ def get_workset(self) -> list:
 
     def get_unsafe(self) -> set:
         return copy.deepcopy(self.unsafe)
-    
+
     ## Only return the stopped that are not unsafe
     def get_stopped_safe(self) -> set:
         return copy.deepcopy(self.stopped.difference(self.unsafe))
@@ -478,7 +477,7 @@ def init_inverse_adjacency(self):
     ## TODO: Call valid and add assertiosn for loops here.
     def valid(self):
         logging.debug("Checking partial order validity...")
-        self.log_partial_program_order_info()
+        # self.log_partial_program_order_info()
         valid1 = self.loop_nodes_valid()
         ## TODO: Add a check that for x, y : NodeIds, x < y iff x is a predecessor to x
         ##       This is necessary due to the `hypothetical_before` method.
@@ -500,7 +499,7 @@ def loop_nodes_valid(self):
                          self.get_workset() + \
                          list(self.stopped) + \
                          list(self.commands_currently_executing.keys())
-        loop_nodes_in_forbidden_sets = [node_id for node_id in forbidden_sets 
+        loop_nodes_in_forbidden_sets = [node_id for node_id in forbidden_sets
                                 if self.is_loop_node(node_id)]
         return len(loop_nodes_in_forbidden_sets) == 0
 
@@ -519,9 +518,9 @@ def get_node_loop_context(self, node_id: NodeId) -> LoopStack:
     def get_all_non_committed(self) -> "list[NodeId]":
         all_node_ids = self.nodes.keys()
         non_committed_node_ids = [node_id for node_id in all_node_ids
-                                  if not self.is_committed(node_id)]  
+                                  if not self.is_committed(node_id)]
         return non_committed_node_ids
-    
+
     ## This adds a node to the committed set and saves important information
     def commit_node(self, node_id: NodeId):
         logging.debug(f" > Commiting node {node_id}")
@@ -535,7 +534,7 @@ def is_loop_node(self, node_id:NodeId) -> bool:
     def filter_standard_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]":
         return [node_id for node_id in node_ids
                 if not self.is_loop_node(node_id)]
-    
+
     def filter_loop_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]":
         return [node_id for node_id in node_ids
                 if self.is_loop_node(node_id)]
@@ -565,15 +564,15 @@ def get_next(self, node_id:NodeId) -> "list[NodeId]":
 
     def get_prev(self, node_id:NodeId) -> "list[NodeId]":
         return self.inverse_adjacency[node_id][:]
-        
+
     def add_edge(self, from_id: NodeId, to_id: NodeId):
         ## KK 2023-05-04 Is it a problem that we append? Maybe we should make that a set
         self.adjacency[from_id].append(to_id)
         self.inverse_adjacency[to_id].append(from_id)
-        
+
     def remove_edge(self, from_id: NodeId, to_id: NodeId):
         self.adjacency[from_id].remove(to_id)
-        self.inverse_adjacency[to_id].remove(from_id)        
+        self.inverse_adjacency[to_id].remove(from_id)
 
     def get_transitive_closure(self, target_node_ids:"list[NodeId]") -> "list[NodeId]":
         all_next_transitive = set(target_node_ids)
@@ -585,7 +584,7 @@ def get_transitive_closure(self, target_node_ids:"list[NodeId]") -> "list[NodeId
             all_next_transitive = all_next_transitive.union(successors)
             next_work.extend(new_next)
         return list(all_next_transitive)
-    
+
     def get_inverse_transitive_closure(self, target_node_ids:"list[NodeId]") -> "list[NodeId]":
         all_prev_transitive = set(target_node_ids)
         next_work = target_node_ids.copy()
@@ -607,13 +606,13 @@ def get_transitive_closure_if_can_be_resolved(self, can_be_resolved: list, targe
             all_next_transitive = all_next_transitive.union(successors)
             next_work.extend(new_next)
         return list(all_next_transitive)
-    
+
     def update_rw_set(self, node_id, rw_set):
         self.rw_sets[node_id] = rw_set
 
     def get_rw_set(self, node_id) -> RWSet:
         return self.rw_sets[node_id]
-    
+
     def get_rw_sets(self) -> dict:
         return self.rw_sets
 
@@ -635,7 +634,7 @@ def is_first_node_when_env_is_uninitialized(self, speculate_immediately):
                 logging.debug("Initializing latest env and speculating")
                 return True
         return False
-    
+
     # Check if the specific command can be resolved.
     # KK 2023-05-04 I am not even sure what this function does and why is it useful.
     def cmd_can_be_resolved(self, node_id: int) -> bool:
@@ -669,7 +668,7 @@ def cmd_can_be_resolved(self, node_id: int) -> bool:
         ## Otherwise we can return
         logging.debug(f' >> Able to resolve {node_id}')
         return True
-    
+
     def __kill_all_currently_executing_and_schedule_restart(self, start=None):
         nodes_to_kill = self.get_currently_executing()
         if start is not None:
@@ -679,7 +678,7 @@ def __kill_all_currently_executing_and_schedule_restart(self, start=None):
             most_recent_new_env = self.get_most_recent_possible_new_env_for_node(cmd_id)
             self.prechecked_env.discard(cmd_id)
             if most_recent_new_env is not None:
-                
+
                 self.set_latest_env_file_for_node(cmd_id, most_recent_new_env)
             self.workset.remove(cmd_id)
             log_time_delta_from_named_timestamp("PartialOrder", "RunNode", cmd_id)
@@ -691,7 +690,7 @@ def __kill_all_currently_executing_and_schedule_restart(self, start=None):
 
     def __kill_node(self, cmd_id: "NodeId"):
         logging.debug(f'Killing and restarting node {cmd_id} because some workspaces have to be committed')
-        proc_to_kill, trace_file, _stdout, _stderr, _post_execution_env_file = self.commands_currently_executing.pop(cmd_id)
+        proc_to_kill, trace_file, _stdout, _stderr, _post_execution_env_file, _ = self.commands_currently_executing.pop(cmd_id)
         # Add the trace file to the banned file list so we know to ignore the CommandExecComplete response
         self.banned_files.add(trace_file)
 
@@ -720,7 +719,7 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self):
             log_time_delta_from_named_timestamp("PartialOrder", "ResolveDependencies", cmd)
             log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", cmd, key=f"PostExecResolution-{cmd}")
             log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProcKilling")
-        
+
         if len(to_commit) == 0:
             logging.debug(" > No nodes to be committed this round")
         else:
@@ -735,7 +734,7 @@ def resolve_commands_that_can_be_resolved_and_push_frontier(self):
     def check_dependencies(self, cmds_to_check, get_first_cmd_ids_fn, update_state_due_to_a_dependency_fn):
         for second_cmd_id in cmds_to_check:
             for first_cmd_id in get_first_cmd_ids_fn(second_cmd_id):
-                
+
                 if self.rw_sets.get(first_cmd_id) is not None and self.has_forward_dependency(first_cmd_id, second_cmd_id):
                     update_state_due_to_a_dependency_fn(first_cmd_id, second_cmd_id)
 
@@ -780,10 +779,10 @@ def get_first_cmd_ids(second_cmd_id):
         def update_state_due_to_a_dependency(first_cmd_id, second_cmd_id):
             logging.debug(f' > Command {second_cmd_id} was added to the workset, due to a forward dependency with {first_cmd_id}')
             new_workset.add(second_cmd_id)
-        
+
         new_workset = set()
         self.check_dependencies(sorted(cmds_to_resolve), get_first_cmd_ids, update_state_due_to_a_dependency)
-        
+
         return new_workset
 
 
@@ -791,14 +790,14 @@ def update_state_due_to_a_dependency(first_cmd_id, second_cmd_id):
     ## Forward dependency is when a command's output is the same
     ## as the input of a following command
     def __resolve_dependencies_continuous_and_move_frontier(self, cmds_to_resolve):
-        self.log_partial_program_order_info()
+        # self.log_partial_program_order_info()
         for cmd in cmds_to_resolve:
             log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ResolveDependencies", cmd)
-        
+
         logging.debug(f"Commands to be checked for dependencies: {sorted(cmds_to_resolve)}")
         logging.debug(" --- Starting dependency resolution --- ")
         new_workset = self.resolve_dependencies(cmds_to_resolve)
-        
+
         logging.debug(" > Modifying workset accordingly")
         # New workset contains previous unresolved commands and resolved commands with dependencies that have not been stopped
         workset_old = self.workset.copy()
@@ -823,8 +822,8 @@ def __resolve_dependencies_continuous_and_move_frontier(self, cmds_to_resolve):
     ## Therefore it does not just check edges, but rather computes if it would be before
     ##  based on ids and loop iterations.
     ##
-    ## 1. Check if the loop ids of the two abstract parents of both nodes differ 
-    ##     thus showing that one is before the other 
+    ## 1. Check if the loop ids of the two abstract parents of both nodes differ
+    ##     thus showing that one is before the other
     ## 2. If all loop ids are the same, now we can actually compare iterations.
     ##     If a node is in the same loop ids but in a later iteration then it is later.
     ## 3. If all iterations are the same too, then we just compare node ids
@@ -853,7 +852,7 @@ def hypothetical_before(self, nid1: NodeId, nid2: NodeId):
             ## We need to keep going
             i += 1
 
-        ## If we reach this, we know that both nodes are in the same loops up to i 
+        ## If we reach this, we know that both nodes are in the same loops up to i
         ##  so we now compare iterations and node identifiers.
 
         iters1 = nid1.get_iters()
@@ -900,7 +899,7 @@ def progress_po_due_to_wait(self, node_id: NodeId):
             all_non_committed_loop_nodes = self.filter_loop_nodes(all_non_committed)
             non_committed_loop_nodes_that_would_be_predecessors = [n_id for n_id in all_non_committed_loop_nodes
                                                                    if self.hypothetical_before(n_id, node_id)]
-            
+
             new_committed_nodes = non_committed_loop_nodes_that_would_be_predecessors
 
         else:
@@ -915,7 +914,7 @@ def progress_po_due_to_wait(self, node_id: NodeId):
                                                     if not self.is_committed(node_id) and
                                                     self.is_loop_node(node_id)]
             logging.debug(f'Non committed loop nodes that are predecessors to {node_id} are: {non_committed_loop_nodes_in_inverse_tc}')
-            
+
             new_committed_nodes = non_committed_loop_nodes_in_inverse_tc
 
         ## And "close them"
@@ -924,7 +923,7 @@ def progress_po_due_to_wait(self, node_id: NodeId):
         logging.debug(f'Adding following loop nodes to committed: {new_committed_nodes}')
         for node_id in new_committed_nodes:
             self.commit_node(node_id)
-        
+
         ## Since we committed some nodes, let's make sure that we also push the frontier
         ## TODO: Can we do this in a less hacky method? By using a well-defined commit_node_and_push_frontier method?
         if len(new_committed_nodes) > 0:
@@ -942,26 +941,26 @@ def progress_po_due_to_wait(self, node_id: NodeId):
 
         ## TODO: Add some form of validity assertion after we are done with this.
         ##       Just to make sure that we haven't violated the continuity of the committed set.
-        
+
         ## We check if something can be resolved and stepped forward here
         ## KK 2023-05-10 This seems to work for all tests (so it might be idempotent
         ##                since in many tests there is nothing new to resolve after a wait)
         self.resolve_commands_that_can_be_resolved_and_push_frontier()
 
     ## When the frontend sends a wait for a node, it means that execution in the frontend has
-    ## already surpassed all nodes prior to it. This is particularly important for loops, 
+    ## already surpassed all nodes prior to it. This is particularly important for loops,
     ## since we can't always statically predict how many iterations they will do, so the only
     ## definitive way to know that they are done is to receive a wait for a node after them.
     def wait_received(self, node_id: NodeId):
         ## Whenever we receive a wait for a node, we always need to check and "commit" all prior loop nodes
         ##   since we know that they won't have any more iterations (the JIT frontend has already passed them).
-        
+
         ## We first have to push and progress the PO due to the wait and then unroll
         ## KK 2023-05-22 Currently this checks whether a still nonexistent node is
-        ##               would be a successor of existing nodes to commit some of 
+        ##               would be a successor of existing nodes to commit some of
         ##               them if needed. Unfortunately, to make this check for a non-existent
-        ##               node is very complex and not elegant. 
-        ## TODO: Could we swap unrolling and progressing so that we always 
+        ##               node is very complex and not elegant.
+        ## TODO: Could we swap unrolling and progressing so that we always
         ##        check if a node can be progressed by checking edges?
         log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id)
         self.progress_po_due_to_wait(node_id)
@@ -975,7 +974,7 @@ def wait_received(self, node_id: NodeId):
             ##       For now we are being conservative and that is why it only happens here
             ## TODO: Move this to the scheduler.schedule_work() (if we have a loop node waiting for response and we are not unrolled, unroll to create work)
             self.maybe_unroll(node_id)
-        
+
         assert(self.valid())
 
     def find_outer_loop_sub_partial_order(self, loop_id: int, nodes_subset: "list[NodeId]") -> "list[NodeId]":
@@ -996,13 +995,13 @@ def find_outer_loop_sub_partial_order(self, loop_id: int, nodes_subset: "list[No
     def unroll_single_loop(self, loop_id: int, nodes_subset: "list[NodeId]"):
         logging.info(f'Unrolling loop with id: {loop_id}')
         all_loop_node_ids = self.find_outer_loop_sub_partial_order(loop_id, nodes_subset)
-        
+
         ## We don't want to unroll already committed nodes
         loop_node_ids = [nid for nid in all_loop_node_ids
                          if not self.is_committed(nid)]
 
         logging.debug(f'Node ids for loop: {loop_id} are: {loop_node_ids}')
-        
+
         ## Create the new nodes and remap adjacencies accordingly
         node_mappings = {}
         for node_id in loop_node_ids:
@@ -1028,7 +1027,7 @@ def unroll_single_loop(self, loop_id: int, nodes_subset: "list[NodeId]"):
 
         for node_id, new_node_id in node_mappings.items():
             old_prev_ids = self.get_prev(node_id)
-            ## Modify all id to be in the new set except for the 
+            ## Modify all id to be in the new set except for the
             new_prev_ids = PartialProgramOrder.map_using_mapping(old_prev_ids, node_mappings)
             self.inverse_adjacency[new_node_id] = new_prev_ids
             for new_prev_id in new_prev_ids:
@@ -1091,7 +1090,7 @@ def unroll_loops(self, loop_contexts: LoopStack) -> NodeId:
             ## Update all new nodes that we have added
             all_new_node_ids.update(new_node_ids)
 
-            ## Re-set the relevant node ids to only the new nodes (if we unrolled a big loop once, 
+            ## Re-set the relevant node ids to only the new nodes (if we unrolled a big loop once,
             ##  we just want to look at those new unrolled nodes for the next unrolling).
             relevant_node_ids = new_node_ids
 
@@ -1103,7 +1102,7 @@ def unroll_loops(self, loop_contexts: LoopStack) -> NodeId:
             if not self.is_loop_node(new_node_id):
                 self.workset.append(new_node_id)
                 ## GL: 08-24-2023: This might not the best way to treat this as we need
-                ## to update the env half way through the loop. 
+                ## to update the env half way through the loop.
                 ## For now, we just copy the env from the parent loop node
                 non_iter_id = new_node_id.get_non_iter_id()
                 logging.debug(f"Copying latest env from loop context to loop node: {non_iter_id} -> {new_node_id}")
@@ -1140,7 +1139,7 @@ def unroll_loop_node(self, target_concrete_node_id: NodeId):
 
         ## TODO: This needs to change when we modify unrolling to happen speculatively too
         ## TODO: This needs to properly add the node to frontier and to resolve dictionary
-        
+
         # GL 2023-05-22: __frontier_commit_and_push() should be called here instead of step_forward()
         # Although without it the test cases pass
         self.frontier.append(new_first_node_id)
@@ -1212,7 +1211,7 @@ def __frontier_commit_and_push(self):
 
             ## Update the frontier to the new frontier
             self.frontier = new_frontier
-    
+
 
     ## For a file - dir forward dependency to exist,
     ## we need the succeding command to attempt to read anything that is a subpath of the
@@ -1230,13 +1229,13 @@ def has_dir_file_dependency(self, first_cmd_set, second_cmd_set):
                     logging.debug(f' > File forward dependency found C1:({dir}) C2:({other_path})')
                     return True
         return False
-    
+
     def is_subpath(self, dir, other_path):
         other_path.startswith(os.path.abspath(dir)+os.sep)
 
     def has_forward_dependency(self, first_id, second_id):
         first_write_set = set(self.rw_sets[first_id].get_write_set())
-        second_read_set = set(self.rw_sets[second_id].get_read_set())
+        second_read_set = set(self.rw_sets[second_id].get_read_set()).union(set(self.rw_sets[second_id].get_write_set()))
         logging.debug(f'Checking dependencies between {first_id} and {second_id}')
         if not first_write_set.isdisjoint(second_read_set):
             logging.debug(f' > Forward dependency found {first_write_set.intersection(second_read_set)}')
@@ -1247,14 +1246,14 @@ def has_forward_dependency(self, first_id, second_id):
         else:
             logging.debug(f' > No dependencies')
             return False
-        
+
     def get_all_next_non_committed_nodes(self) -> "list[NodeId]":
         next_non_committed_nodes = []
         for cmd_id in self.get_all_non_committed():
             if cmd_id in self.workset and self.is_next_non_committed_node(cmd_id):
                 next_non_committed_nodes.append(cmd_id)
         return next_non_committed_nodes
-    
+
     def is_next_non_committed_node(self, node_id: NodeId) -> bool:
         # We want the predecessor to be committed and the current node to not be committed
         for prev_node in self.get_prev(node_id):
@@ -1288,10 +1287,13 @@ def schedule_work(self, limit=0):
         ## GL 2023-07-05 populate_to_be_resolved_dict() is OK to call anywhere,
         ##            __frontier_commit_and_push() is not safe to call here
         self.populate_to_be_resolved_dict()
-        
+
         ## TODO: Move loop unrolling here for speculation too
 
+        conflicted_nodes = self.nodes_with_uncommited_conflict()
         for cmd_id in self.get_workset():
+            if cmd_id in conflicted_nodes:
+                continue
             # We only need to schedule non-committed and non-executing nodes
             if not (cmd_id in self.get_committed() or \
                cmd_id in self.commands_currently_executing):
@@ -1323,7 +1325,7 @@ def speculate_cmd_non_blocking(self, node_id: NodeId):
         logging.debug(f'Speculating command: {node_id} {self.get_node(node_id)}')
         ## TODO: Since these (this and the function above)
         ##       are relevant for the report maker,
-        ##       add them in some library (e.g., trace_for_report) 
+        ##       add them in some library (e.g., trace_for_report)
         ##       so that we don't accidentally delete them.
         logging.debug(f"ExecutingSandboxAdd|{node_id}")
         self.execute_cmd_core(node_id, speculate=True)
@@ -1335,7 +1337,7 @@ def execute_cmd_core(self, node_id: NodeId, speculate=False):
         is_safe = analysis.safe_to_execute(node.asts, variables)
         if not is_safe:
             logging.debug(f'Command: "{node}" is not safe to execute, sending to the original shell to execute...')
-            
+
             ## Keep some state around to determine that this command is not safe to execute.
             self.stopped.add(node_id)
             self.unsafe.add(node_id)
@@ -1354,11 +1356,44 @@ def execute_cmd_core(self, node_id: NodeId, speculate=False):
         else:
             execute_func = executor.async_run_and_trace_command_return_trace
 
-        proc, trace_file, stdout, stderr, post_execution_env_file = execute_func(cmd, node_id, env_file_to_execute_with)
-        self.commands_currently_executing[node_id] = (proc, trace_file, stdout, stderr, post_execution_env_file)
+        proc, trace_file, stdout, stderr, post_execution_env_file, sandbox_dir = execute_func(cmd, node_id, env_file_to_execute_with)
+        self.commands_currently_executing[node_id] = (proc, trace_file, stdout, stderr, post_execution_env_file, sandbox_dir)
         logging.debug(f" >>>>> Command {node_id} - {proc.pid} just started executing - {post_execution_env_file}")
+
+    def nodes_with_uncommited_conflict(self):
+        uncommited_run_after = [node_id for node_id in self.run_after if node_id not in self.committed]
+        total_conflicts = set()
+        for node_id in uncommited_run_after:
+            conflicts = self.run_after[node_id]
+            total_conflicts.update(conflicts)
+        return total_conflicts
         
-    # This method attempts to add to workset (rerun) 
+    def kill_and_stop(self, node_id: NodeId):
+        proc, _, _, _, _, _ = self.commands_currently_executing.pop(node_id)
+        util.kill_process_tree(proc.pid, sig=signal.SIGTERM)
+
+    def early_stop_using_dep(self):
+        for node_id, info_tuple in self.commands_currently_executing.items():
+            trace_file = info_tuple[1]
+            sandbox_dir = info_tuple[5]
+            try:
+                trace_object = executor.read_trace(sandbox_dir, trace_file)
+            except FileNotFoundError:
+                continue
+            logging.info(f'going forward')
+            read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object)
+            rw_set = RWSet(read_set, write_set)
+            self.update_rw_set(node_id, rw_set)
+        for node_id in self.commands_currently_executing:
+            self.resolve_dependencies_early(node_id)
+        self.log_partial_program_order_info()
+        conflicts = self.nodes_with_uncommited_conflict()
+        to_be_killed = [node_id for node_id in self.commands_currently_executing if node_id in conflicts]
+        logging.info(f'>>>>>>>>>>>>>>>>> to be killed: {to_be_killed}')
+        for node_id in to_be_killed:
+            self.kill_and_stop(node_id)
+
+    # This method attempts to add to workset (rerun)
     # any command that found to have a dependency through early resolution
     def attempt_rerun_pending_nodes(self):
         restarted_nodes = set()
@@ -1377,17 +1412,22 @@ def attempt_rerun_pending_nodes(self):
                         new_run_after_nodes.discard(node)
             self.run_after[node_id] = new_run_after_nodes
         return restarted_nodes
+
+    def set_sandbox(self, node_id, sandbox_dir):
+        self.sandbox_dirs[node_id] = sandbox_dir
     
     def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sandbox_dir: str):
         log_time_delta_from_named_timestamp("PartialOrder", "RunNode", node_id)
         log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolution", node_id, key=f"PostExecResolution-{node_id}")
-        
+
         logging.debug(f" --- Node {node_id}, just finished execution ---")
         self.sandbox_dirs[node_id] = sandbox_dir
         ## TODO: Store variable file somewhere so that we can return when wait
-
-        _proc, trace_file, stdout, stderr, post_execution_env_file = self.commands_currently_executing.pop(node_id)
-
+        if not node_id in self.commands_currently_executing:
+            return
+        _proc, trace_file, stdout, stderr, post_execution_env_file, sandbox_dir = self.commands_currently_executing.pop(node_id)
+        if not sandbox_dir == self.sandbox_dirs[node_id]:
+            return
         logging.trace(f"ExecutingRemove|{node_id}")
         # Handle stopped by riker due to network access
         if int(riker_exit_code) == 159:
@@ -1403,8 +1443,8 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand
             ##  is completed for this node.
             completed_node_info = CompletedNodeInfo(cmd_exit_code, post_execution_env_file, stdout, sandbox_dir)
             self.nodes[node_id].set_completed_info(completed_node_info)
-            
-            ## We no longer add failed commands to the stopped set, 
+
+            ## We no longer add failed commands to the stopped set,
             ## because this leads to more repetitions than needed
             ## and does not allow us to properly speculate commands
             read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object)
@@ -1416,26 +1456,24 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand
             if node_id in self.workset:
                 self.workset.remove(node_id)
                 logging.debug(f"WorksetRemove|{node_id}")
-            # If no commands can be resolved this round, 
+            # If no commands can be resolved this round,
             # do nothing and wait until a new command finishes executing
             logging.debug("No resolvable nodes were found in this round, nothing will change...")
             return
-        
-        
+
+
         log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolutionECCheck", node_id, key=f"PostExecResolution-{node_id}", invalidate=False)
-        # Remove from workset and add it again later if necessary
-        self.workset.remove(node_id)
         log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolutionFrontendWait", node_id)
-        
+
         ## Here we check if the most recent env has been received. If not, we cannot resolve anything just yet.
         if self.get_new_env_file_for_node(node_id) is None:
             logging.debug(f"Node {node_id} has not received its latest env from runtime yet. Waiting...")
             self.waiting_for_frontend.add(node_id)
-            
+
             # We will however attempt to resolve dependencies early
             self.resolve_dependencies_early(node_id)
             restarted_cmds = self.attempt_rerun_pending_nodes()
-            self.log_partial_program_order_info()
+            # self.log_partial_program_order_info()
         ## Here we continue with the normal execution flow
         else:
             logging.debug(f"Node {node_id} has already received its latest env from runtime. Examining differences...")
@@ -1444,11 +1482,11 @@ def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sand
     #TODO: Remove ths in the future - we need a more robust approach to check for env diffs.
     def exclude_insignificant_diffs(self, env_diff_dict):
         return {k: v for k, v in env_diff_dict.items() if k not in config.INSIGNIFICANT_VARS}
-    
+
     #TODO: Remove ths in the future - we need a more robust approach to check for env diffs.
     def include_only_significant_vars(self, env_diff_dict):
         return {k: v for k, v in env_diff_dict.items() if k in config.SIGNIFICANT_VARS}
-    
+
     def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_in_both):
         # Exclude insignificant differences
         only_in_new_sig = self.include_only_significant_vars(only_in_new)
@@ -1464,7 +1502,7 @@ def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_i
         else:
             logging.debug("No significant differences found:")
             return False
-        
+
     def update_env_and_restart_nodes(self, node_id: NodeId):
         logging.debug(f"Significant differences found between new and latest env files for {node_id}.")
         logging.debug(f"Assigning node {node_id} new env (Wait) as the new latest env and re-executing.")
@@ -1483,13 +1521,13 @@ def update_env_and_restart_nodes(self, node_id: NodeId):
             self.prechecked_env.discard(waiting_for_frontend_node)
             assert(self.get_new_env_file_for_node(node_id) is not None)
             assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None)
-        self.log_partial_program_order_info()
+        # self.log_partial_program_order_info()
         logging.debug("-")
         self.waiting_for_frontend = new_waiting_for_frontend
         self.populate_to_be_resolved_dict()
 
     def resolve_most_recent_envs_check_only_wait_node_early(self, node_id: NodeId, restarted_cmds=None):
-        if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), 
+        if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id),
                                                                     self.get_latest_env_file_for_node(node_id)):
             self.update_env_and_restart_nodes(node_id)
         else:
@@ -1498,7 +1536,7 @@ def resolve_most_recent_envs_check_only_wait_node_early(self, node_id: NodeId, r
     def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(self, node_id: NodeId, restarted_cmds=None):
         logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.")
         self.waiting_for_frontend.discard(node_id)
-        if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id), 
+        if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id),
                                                                     self.get_latest_env_file_for_node(node_id)):
             self.update_env_and_restart_nodes(node_id)
         else:
@@ -1509,7 +1547,7 @@ def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node
             self.resolve_dependencies_early(node_id)
             restarted_cmds = self.attempt_rerun_pending_nodes()
             logging.debug(f"Restarted after successful env resolution {restarted_cmds}")
-            self.log_partial_program_order_info()
+            # self.log_partial_program_order_info()
             self.resolve_commands_that_can_be_resolved_and_push_frontier()
             assert(self.valid())
 
@@ -1528,10 +1566,10 @@ def new_and_latest_env_files_have_significant_differences(self, new_env_file, la
             return False
         logging.debug(f"Comparing new and latest env files: {new_env_file} {latest_env_file}")
         assert(latest_env_file is not None)
-        
+
         new_env = executor.read_env_file(new_env_file)
         latest_env = executor.read_env_file(latest_env_file)
-        
+
         only_in_new, only_in_latest, different_in_both = util.compare_env_strings(new_env, latest_env)
 
         return self.significant_diff_in_env_dicts(only_in_new, only_in_latest, different_in_both)
@@ -1557,7 +1595,7 @@ def commit_cmd_workspaces(self, to_commit_ids):
     def log_rw_sets(self):
         logging.debug("====== RW Sets " + "=" * 65)
         for node_id, rw_set in self.rw_sets.items():
-            logging.debug(f"ID:{node_id} | R.size:{len(rw_set.get_read_set()) if rw_set is not None else None} | W:{rw_set.get_write_set() if rw_set is not None else None}")
+            logging.debug(f"ID:{node_id} | R:{[f for f in rw_set.get_read_set() if 'output_' in f] if rw_set else None} | W:{rw_set.get_write_set() if rw_set is not None else None}")
 
     def log_partial_program_order_info(self):
         logging.debug(f"=" * 80)
@@ -1660,7 +1698,7 @@ def parse_loop_contexts(lines):
 def parse_partial_program_order_from_file(file_path: str) -> PartialProgramOrder:
     with open(file_path) as f:
         raw_lines = f.readlines()
-    
+
     ## Filter comments and remove new lines
     lines = [line.rstrip() for line in raw_lines
              if not line.startswith("#")]
@@ -1692,15 +1730,15 @@ def parse_partial_program_order_from_file(file_path: str) -> PartialProgramOrder
         file_path = f'{cmds_directory}/{i}'
         cmd, asts = parse_cmd_from_file(file_path)
         loop_ctx = loop_contexts[i]
-        nodes[NodeId(i)] = Node(NodeId(i), cmd, 
-                                asts=asts, 
+        nodes[NodeId(i)] = Node(NodeId(i), cmd,
+                                asts=asts,
                                 loop_context=LoopStack(loop_ctx))
 
     edges = {NodeId(i) : [] for i in range(number_of_nodes)}
     for edge_line in edge_lines:
         from_id, to_id = parse_edge_line(edge_line)
         edges[NodeId(from_id)].append(NodeId(to_id))
-    
+
     logging.trace(f"Nodes|{','.join([str(node) for node in nodes])}")
     logging.trace(f"Edges|{edges}")
     return PartialProgramOrder(nodes, edges, initial_env_file)
diff --git a/parallel-orch/run_command.sh b/parallel-orch/run_command.sh
index e2c46e92..2d3597c1 100755
--- a/parallel-orch/run_command.sh
+++ b/parallel-orch/run_command.sh
@@ -5,10 +5,11 @@ export CMD_STRING=${1?No command was given to execute}
 export TRACE_FILE=${2?No trace file path given}
 export STDOUT_FILE=${3?No stdout file given}
 export LATEST_ENV_FILE=${4?No env file to run with given}
-export EXEC_MODE=${5?No execution mode given}
-export CMD_ID=${6?No command id given}
-export POST_EXEC_ENV=${7?No Riker env file given}
-
+export SANDBOX_DIR=${5?No sandbox dir given}
+export TMPDIR=${6?No tmp dir given}
+export EXEC_MODE=${7?No execution mode given}
+export CMD_ID=${8?No command id given}
+export POST_EXEC_ENV=${9?No Riker env file given}
 
 ## KK 2023-04-24: Not sure this should be run every time we run a command
 ## GL 2023-07-08: Tests seem to pass without it
@@ -23,10 +24,10 @@ else
     exit 1
 fi
 
-mkdir -p /tmp/pash_spec/a
-mkdir -p /tmp/pash_spec/b
-export SANDBOX_DIR="$(mktemp -d /tmp/pash_spec/a/sandbox_XXXXXXX)/"
-export TEMPDIR="$(mktemp -d /tmp/pash_spec/b/sandbox_XXXXXXX)"
+# mkdir -p /tmp/pash_spec/a
+# mkdir -p /tmp/pash_spec/b
+# export SANDBOX_DIR="$(mktemp -d /tmp/pash_spec/a/sandbox_XXXXXXX)/"
+# export TEMPDIR="$(mktemp -d /tmp/pash_spec/b/sandbox_XXXXXXX)"
 # echo tempdir $TEMPDIR
 # echo sandbox $SANDBOX_DIR
 
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index 29ebddbf..436eb642 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -34,7 +34,7 @@ def parse_args():
                     action="store_true",
                     default=False,
                     help="Speculate immediately instead of waiting for the first Wait message.")
-    
+
     args, unknown_args = parser.parse_known_args()
     return args
 
@@ -135,7 +135,7 @@ def handle_wait(self, input_cmd: str, connection):
             self.waiting_for_response[node_id] = connection
 
 
-    def __parse_command_exec_complete(self, input_cmd: str) -> "tuple[int, int]":
+    def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]":
         try:
             components = input_cmd.rstrip().split("|")
             command_id = parse_node_id(components[0].split(":")[1])
@@ -179,10 +179,15 @@ def respond_to_frontend_core(self, node_id: NodeId, response: str):
         socket_respond(connection, response)
         connection.close()
 
+    def handle_command_exec_start(self, input_cmd):
+        assert(input_cmd.startswith("CommandExecStart:"))
+        cmd_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
+        self.partial_program_order.set_sandbox(cmd_id, sandbox_dir)
+        
     def handle_command_exec_complete(self, input_cmd: str):
         assert(input_cmd.startswith("CommandExecComplete:"))
         ## Read the node id from the command argument
-        cmd_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_complete(input_cmd)
+        cmd_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
         if trace_file in self.partial_program_order.banned_files:
             logging.debug(f'CommandExecComplete: {cmd_id} ignored')
             return
@@ -230,6 +235,9 @@ def process_next_cmd(self):
             self.partial_program_order.log_executions()
             self.done = True
             log_time_delta_from_named_timestamp("Scheduler", "Done")
+        elif input_cmd.startswith("CommandExecStart:"):
+            #TODO: add logging stuff
+            self.handle_command_exec_start(input_cmd)
         else:
             logging.error(error_response(f'Error: Unsupported command: {input_cmd}'))
             raise Exception(f'Error: Unsupported command: {input_cmd}')
@@ -267,12 +275,16 @@ def run(self):
         
 
         while not self.done:
+            # TODO: wrap this around something probably
+            self.partial_program_order.early_stop_using_dep()
+
             ## Schedule some work (if we are already at capacity this will return immediately)
             self.schedule_work()
             ## Process a single request
             self.process_next_cmd()
             # If workset is empty we should end.
             # TODO: ec checks fail for now
+
         self.socket.close()
         self.shutdown()
 
@@ -284,7 +296,7 @@ def shutdown(self):
         
     def terminate_pending_commands(self):
         for _node_id, cmd_info in self.partial_program_order.commands_currently_executing.items():
-            proc, _trace_file, _stdout, _stderr, _variable_file = cmd_info
+            proc, _trace_file, _stdout, _stderr, _variable_file, _ = cmd_info
             proc.terminate()
 
 
diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py
index 8594e4c6..1331cbb2 100644
--- a/parallel-orch/trace_v2.py
+++ b/parallel-orch/trace_v2.py
@@ -7,6 +7,21 @@
 # Global TODOs:
 # handle pwd, such that open and stat can work
 
+# not handled: listxattr, llistxattr, getxattr, lgetxattr, pivot_root, mount, umount2
+# setxattr lsetxattr removexattr lremovexattr, fanotify_mark, renameat2, chroot, quotactl
+# handled individually openat, open, chdir, clone, rename
+# TODO: link, symlink, renameat, symlinkat
+r_first_path_set = set(['execve', 'stat', 'lstat', 'access', 'statfs',
+                        'readlink', 'execve'])
+w_first_path_set = set(['mkdir', 'rmdir', 'truncate', 'creat', 'chmod', 'chown',
+                        'lchown', 'utime', 'mknod', 'utimes', 'acct', 'unlink'])
+r_fd_path_set = set(['fstatat', 'newfstatat', 'statx', 'name_to_handle_at',
+                     'readlinkat', 'faccessat', 'execveat'])
+w_fd_path_set = set(['unlinkat', 'utimensat', 'mkdirat', 'mknodat', 'fchownat', 'futimeat',
+                     'unlinkat', 'linkat', 'fchmodat', 'utimensat'])
+ignore_set = set(['getpid', 'getcwd'])
+
+
 @dataclass
 class ExitStatus:
     exitcode: int
@@ -66,13 +81,6 @@ def pop_complete_line(self, pid: int, l):
         del self.line_dict[pid]
         return total_line
 
-# openat, open, chdir, clone
-r_first_path_set = set(['execve', 'stat', 'lstat', 'access', 'statfs'])
-w_first_path_set = set(['mkdir'])
-r_fd_path_set = set(['fstatat', 'newfstatat'])
-w_fd_path_set = set(['unlinkat', 'utimensat'])
-ignore_set = set(['getpid', 'getcwd'])
-
 def parse_string(s):
     s = s.strip()
     # handling cases such as utimensat
@@ -93,12 +101,15 @@ def is_ret_err(ret: str):
     ret = ret.strip()
     return ret[0] == '-'
 
-def get_path_first_path(pid, args, ctx):
-    a = args.split(sep=',', maxsplit=1)[0]
-    if is_absolute(a):
-        return parse_string(a)
+def convert_absolute(cur_dir, path):
+    if is_absolute(path):
+        return path
     else:
-        return os.path.join(ctx.get_dir(pid), parse_string(a))
+        return os.path.join(cur_dir, path)
+
+def get_path_first_path(pid, args, ctx):
+    a = parse_string(args.split(sep=',', maxsplit=1)[0])
+    return convert_absolute(ctx.get_dir(pid), a)
 
 def parse_r_first_path(pid, args, ret, ctx):
     return RFile(get_path_first_path(pid, args, ctx))
@@ -110,6 +121,27 @@ def parse_w_first_path(pid, args, ret, ctx):
     else:
         return WFile(path)
 
+def get_path_at(pid, positions, args, ctx):  # positions: one index, or a list of indices, into the comma-split args
+    args = args.split(sep=',')
+    if isinstance(positions, list):
+        rets = []
+        for x in positions:  # resolve only the requested arguments, in the requested order
+            rets.append(convert_absolute(ctx.get_dir(pid), parse_string(args[x])))
+        return rets
+    else:
+        return convert_absolute(ctx.get_dir(pid), parse_string(args[positions]))
+
+def parse_rename(pid, args, ret, ctx):
+    path_a, path_b = get_path_at(pid, [0, 1], args, ctx)
+    return WFile(path_a), WFile(path_b)
+
+def parse_link(pid, args, ret, ctx):
+    path_a, path_b = get_path_at(pid, [0, 1], args, ctx)
+    return RFile(path_a), WFile(path_b)
+
+def parse_renameat(pid, args, ret, ctx):
+    
+
 def parse_chdir(pid, args, ret, ctx):
     new_path = get_path_first_path(pid, args, ctx)
     if not is_ret_err(ret):
@@ -201,6 +233,8 @@ def parse_syscall(pid, syscall, args, ret, ctx):
         return parse_r_fd_path(args, ret)
     elif syscall in w_fd_path_set:
         return parse_w_fd_path(args, ret)
+    elif syscall == 'rename':
+        return parse_rename(pid, args, ret, ctx)
     elif syscall == 'clone':
         return parse_clone(pid, args, ret, ctx)
     elif syscall in ignore_set:
@@ -263,9 +297,9 @@ def parse_and_gather_cmd_rw_sets(trace_object) -> Tuple[set, set]:
     write_set = set()
     for l in trace_object:
         record = parse_line(l, ctx)
-        if type(record) is RFile:
+        if type(record) is RFile and record.fname != '/dev/tty':
             read_set.add(record.fname)
-        elif type(record) is WFile:
+        elif type(record) is WFile and record.fname != '/dev/tty':
             write_set.add(record.fname)
     return read_set, write_set
 
diff --git a/parallel-orch/util.py b/parallel-orch/util.py
index 1ed96118..799cd379 100644
--- a/parallel-orch/util.py
+++ b/parallel-orch/util.py
@@ -15,6 +15,13 @@ def ptempfile():
     os.close(fd)
     return name
 
+def create_sandbox():
+    os.makedirs("/tmp/pash_spec/a", exist_ok=True)
+    os.makedirs("/tmp/pash_spec/b", exist_ok=True)
+    sdir = tempfile.mkdtemp(dir="/tmp/pash_spec/a", prefix="sandbox_")
+    tdir = tempfile.mkdtemp(dir="/tmp/pash_spec/b", prefix="sandbox_")
+    return sdir, tdir
+
 def init_unix_socket(socket_file: str) -> socket.socket:
     server_address = socket_file
 
diff --git a/test/misc/cat_and_sleep.sh b/test/misc/cat_and_sleep.sh
new file mode 100755
index 00000000..77dbcbc5
--- /dev/null
+++ b/test/misc/cat_and_sleep.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+cat "$2" >> "$3"  # append input file ($2) to output file ($3); quoted so paths are not word-split
+sleep "$1"  # then sleep for $1 seconds
diff --git a/test/test_orch.sh b/test/test_orch.sh
index 5cb27d4a..572b4057 100755
--- a/test/test_orch.sh
+++ b/test/test_orch.sh
@@ -370,6 +370,17 @@ test_command_var_assignments_2(){
     $shell $2/test_command_var_assignments_2.sh
 }
 
+test_early_stop1()
+{
+    local shell=$1
+    $shell $2/test_early_stop1.sh
+}
+
+test_early_stop2()
+{
+    local shell=$1
+    $shell $2/test_early_stop2.sh
+}
 
 ## TODO: make more loop tests with nested loops and commands after the loop
 
diff --git a/test/test_scripts/test_early_stop1.sh b/test/test_scripts/test_early_stop1.sh
new file mode 100644
index 00000000..cfec04d0
--- /dev/null
+++ b/test/test_scripts/test_early_stop1.sh
@@ -0,0 +1,7 @@
+$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out1"
+$MISC_SCRIPT_DIR/sleep_and_echo.sh 2 "output text" "$test_output_dir/out2"
+$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out2" "$test_output_dir/out3"
+$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out2" "$test_output_dir/out4"
+$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out2" "$test_output_dir/out5"
+
+
diff --git a/test/test_scripts/test_early_stop2.sh b/test/test_scripts/test_early_stop2.sh
new file mode 100644
index 00000000..bf63e408
--- /dev/null
+++ b/test/test_scripts/test_early_stop2.sh
@@ -0,0 +1,9 @@
+$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out1"
+$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out0"
+$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out0"
+$MISC_SCRIPT_DIR/sleep_and_echo.sh 0 "output text" "$test_output_dir/out0"
+$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out1" "$test_output_dir/out2"
+$MISC_SCRIPT_DIR/cat_and_sleep.sh 2 "$test_output_dir/out1" "$test_output_dir/out3"
+
+
+

From 258e818847ba6a58278d2802cf838e338c254f14 Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Wed, 13 Dec 2023 15:22:40 -0500
Subject: [PATCH 09/39] fix wrong commit

---
 parallel-orch/trace_v2.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py
index 1331cbb2..dab3309c 100644
--- a/parallel-orch/trace_v2.py
+++ b/parallel-orch/trace_v2.py
@@ -139,8 +139,6 @@ def parse_link(pid, args, ret, ctx):
     path_a, path_b = get_path_at(pid, [0, 1], args, ctx)
     return RFile(path_a), WFile(path_b)
 
-def parse_renameat(pid, args, ret, ctx):
-    
 
 def parse_chdir(pid, args, ret, ctx):
     new_path = get_path_first_path(pid, args, ctx)

From 413df3e1572371b4b54d940d39f3a17e795fda8f Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Wed, 13 Dec 2023 16:08:16 -0500
Subject: [PATCH 10/39] fix parsing when partial line exists in the trace

---
 parallel-orch/trace_v2.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py
index dab3309c..003e880c 100644
--- a/parallel-orch/trace_v2.py
+++ b/parallel-orch/trace_v2.py
@@ -1,4 +1,5 @@
 import re
+import logging
 import os.path
 import sys
 from typing import Tuple
@@ -270,7 +271,8 @@ def parse_line(l, ctx):
     lparen = l.find('(')
     equals = l.rfind('=')
     rparen = l[:equals].rfind(')')
-    assert lparen >= 0 and equals >= 0 and rparen >= 0
+    if not (lparen >= 0 and equals >= 0 and rparen >= 0):
+        return None
     syscall = l[:lparen]
     ret = l[equals+1:]
     args = l[lparen+1:rparen]
@@ -294,7 +296,11 @@ def parse_and_gather_cmd_rw_sets(trace_object) -> Tuple[set, set]:
     read_set = set()
     write_set = set()
     for l in trace_object:
-        record = parse_line(l, ctx)
+        try:
+            record = parse_line(l, ctx)
+        except Exception:
+            logging.debug(l)
+            raise ValueError("error while parsing trace")
         if type(record) is RFile and record.fname != '/dev/tty':
             read_set.add(record.fname)
         elif type(record) is WFile and record.fname != '/dev/tty':

From 23fe3f59997d428e68e6371e7b430727010b4ed2 Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Wed, 13 Dec 2023 17:02:26 -0500
Subject: [PATCH 11/39] fix readline behavior

---
 parallel-orch/executor.py | 2 +-
 parallel-orch/trace_v2.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/parallel-orch/executor.py b/parallel-orch/executor.py
index 526b2dd4..2d58deaf 100644
--- a/parallel-orch/executor.py
+++ b/parallel-orch/executor.py
@@ -57,7 +57,7 @@ def read_trace(sandbox_dir, trace_file):
         path = f"{sandbox_dir}/upperdir/{trace_file}"
     logging.debug(f'Reading trace from: {path}')
     with open(path) as f:
-        return f.readlines()
+        return f.read().split('\n')[:-1]
     
 def read_env_file(env_file, sandbox_dir=None):
     if sandbox_dir is None:
diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py
index 003e880c..10c52c47 100644
--- a/parallel-orch/trace_v2.py
+++ b/parallel-orch/trace_v2.py
@@ -257,6 +257,8 @@ def handle_info(l):
         return False, None
 
 def parse_line(l, ctx):
+    if len(l) == 0:
+        return None
     pid, l = strip_pid(l)
     is_info, info = handle_info(l)
     if is_info:
@@ -279,7 +281,7 @@ def parse_line(l, ctx):
     return parse_syscall(pid, syscall, args, ret, ctx)
 
 def parse_exit_code(trace_object) -> int:
-    if len(trace_object) < 1:
+    if len(trace_object) == 0 or trace_object[0] == '':
         return None
     l = trace_object[0]
     first_pid, _ = strip_pid(l)

From 88aedee43401fd1d7496c63f418f2ed8a6e909a3 Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Tue, 9 Jan 2024 10:32:19 -0700
Subject: [PATCH 12/39] Start the refactoring. Implement basic structures and
 operations

---
 parallel-orch/node.py                  |  197 +++
 parallel-orch/partial_program_order.py | 1782 +-----------------------
 parallel-orch/scheduler_server.py      |  253 +---
 parallel-orch/util.py                  |   82 ++
 4 files changed, 403 insertions(+), 1911 deletions(-)
 create mode 100644 parallel-orch/node.py

diff --git a/parallel-orch/node.py b/parallel-orch/node.py
new file mode 100644
index 00000000..616e1b58
--- /dev/null
+++ b/parallel-orch/node.py
@@ -0,0 +1,197 @@
+from enum import Enum, auto
+
+class NodeState(Enum):
+    INIT = auto()
+    READY = auto()
+    COMMITTED = auto()
+    STOP = auto()
+    SPECULATED = auto()
+    EXECUTING = auto()
+    SPEC_EXECUTING = auto()
+    UNSAFE = auto()
+
+class Sandbox:
+    def __init__(self, trace_file, exit_code, post_execution_env_file, stdout_file, sandbox_dir):
+        # These get predetermined prior to the execution
+        self.trace_file = trace_file
+        self.post_execution_env_file = post_execution_env_file
+        self.stdout_file = stdout_file
+        self.sandbox_dir = sandbox_dir
+        # These get set after execution is done
+        self.exit_code = None
+        self.proc_id = None
+        
+    def set_exit_code(self, exit_code):
+        self.exit_code = exit_code
+        
+    def set_proc_id(self, proc_id):
+        self.proc_id = proc_id
+
+    def get_exit_code(self):
+        return self.exit_code
+
+    def get_post_execution_env_file(self):
+        return self.post_execution_env_file
+
+    def get_stdout_file(self):
+        return self.stdout_file
+
+    def get_sandbox_dir(self):
+        return self.sandbox_dir
+
+    def get_trace_file(self):
+        return self.trace_file
+
+    def __str__(self):  # one-line summary of the sandbox's files and exit code, for logging
+        return f'Sandbox(trace:{self.get_trace_file()}, ec:{self.get_exit_code()}, env:{self.get_post_execution_env_file()}, stdout:{self.get_stdout_file()}, sandbox:{self.get_sandbox_dir()})'
+
+class RWSet:
+
+    def __init__(self, read_set: set, write_set: set):
+        self.read_set = read_set
+        self.write_set = write_set
+
+    def add_to_read_set(self, item: str):
+        self.read_set.add(item)
+
+    def add_to_write_set(self, item: str):
+        self.write_set.add(item)
+
+    def get_read_set(self) -> set:
+        return self.read_set
+
+    def get_write_set(self) -> set:
+        return self.write_set
+
+    def __str__(self):
+        return f"RW(R:{self.get_read_set()}, W:{self.get_write_set()})"
+
+
+class NodeId:
+    
+    #TODO: Implement iteration support
+    
+    def __init__(self, id: int):
+        self.id = id
+
+    def get_non_iter_id(self):
+        return NodeId(self.id)
+
+    def __repr__(self):
+        ## TODO: Represent it using n.
+        output = f'{self.id}'
+        return output
+
+    def __hash__(self):
+        return hash(str(self))
+
+    def __eq__(self, other):
+        # return self.loop_iters == other.loop_iters and self.id == other.id
+        return self.id == other.id
+
+    def __ne__(self, other):
+        return not(self == other)
+
+    def __lt__(self, obj):
+        return (str(self) < str(obj))
+
+    def __gt__(self, obj):
+        return (str(self) > str(obj))
+
+    @staticmethod
+    def parse_node_id(node_id_str: str):
+        return NodeId(int(node_id_str))
+
+
+class Node:
+    id: NodeId
+    cmd: str
+    asts: "list[AstNode]"
+    state: NodeState
+    # Nodes to check for fs dependencies before this node can be committed
+    # for this particular execution of the main sandbox.
+    # No need to do the same for the background sandbox since it will always get committed.
+    to_be_resolved_snapshot: "set[NodeId]"
+    # Read and write sets for this node
+    rwset: RWSet
+    # This contains the sandbox and execution info for a spec-executing node 
+    # (or plain executing node if frontier background node execution is not enabled)
+    main_sandbox: Sandbox
+    # This can only be set while in the frontier and the background node execution is enabled
+    background_sandbox: Sandbox
+    
+    
+    def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"):
+        self.id = node_id
+        self.cmd = cmd
+        self.asts = asts
+        # The node's state
+        self.state = NodeState.INIT
+        self.tracefile = None
+        self.rwset = None
+        # The 
+        self.to_be_resolved_snapshot = None
+        
+        self.main_sandbox = None
+        
+        self.background_sandbox = None
+
+
+    def is_initialized(self):
+        return self.state == NodeState.INIT
+    
+    def is_ready(self):
+        return self.state == NodeState.READY
+    
+    def is_committed(self):
+        return self.state == NodeState.COMMITTED
+    
+    def is_stopped(self):
+        return self.state == NodeState.STOP
+    
+    def is_speculated(self):
+        return self.state == NodeState.SPECULATED
+
+    def is_executing(self):
+        return self.state == NodeState.EXECUTING
+    
+    def is_spec_executing(self):
+        return self.state == NodeState.SPEC_EXECUTING
+    
+    def is_unsafe(self):
+        return self.state == NodeState.UNSAFE
+    
+    def get_main_sandbox(self):
+        return self.main_sandbox
+    
+    
+    ##                                      ##
+    ##          Transition Functions        ##
+    ##                                      ##
+    
+    def transition_to_ready(self):
+        assert self.state == NodeState.INIT
+        self.state = NodeState.READY
+        # Initialize data structures here
+
+    def transition_to_executing(self):
+        assert self.state == NodeState.READY
+        self.state = NodeState.EXECUTING
+        # TODO
+
+    def transition_to_spec_executing(self):
+        assert self.state == NodeState.READY
+        self.state = NodeState.SPEC_EXECUTING
+        # TODO
+
+    def transition_to_committed(self):
+        assert self.state in [NodeState.EXECUTING, NodeState.SPECULATED]
+        self.state = NodeState.COMMITTED
+        # TODO
+
+    # TODO: other transition functions
+
+
+    # Do we need this here of should we handle everything on scheduler server and ppo?
+    def handle_event(self, event_msg):
+        pass # TODO
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index 2242d9a4..e4728d36 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -1,1744 +1,70 @@
-import copy
+from node import NodeId, Node
 import logging
-import os
-import sys
-
-import analysis
-import config
-import executor
-import trace
-import trace_v2
-from util import *
-import util
-from collections import defaultdict
-
-from shasta.ast_node import AstNode, CommandNode, PipeNode
-
-
-class CompletedNodeInfo:
-    def __init__(self, exit_code, post_execution_env_file, stdout_file, sandbox_dir):
-        self.exit_code = exit_code
-        self.post_execution_env_file = post_execution_env_file
-        self.stdout_file = stdout_file
-        self.sandbox_dir = sandbox_dir
-
-    def get_exit_code(self):
-        return self.exit_code
-
-    def get_post_execution_env_file(self):
-        return self.post_execution_env_file
-
-    def get_stdout_file(self):
-        return self.stdout_file
-
-    def get_sandbox_dir(self):
-        return self.sandbox_dir
-
-    def __str__(self):
-        return f'CompletedNodeInfo(ec:{self.get_exit_code()}, env:{self.get_post_execution_env_file()}, stdout:{self.get_stdout_file()}, sandbox:{self.get_sandbox_dir()})'
-
-## This class is used for both loop contexts and loop iters
-## The indices go from inner to outer
-class LoopStack:
-    def __init__(self, loop_contexts_or_iters=None):
-        if loop_contexts_or_iters is None:
-            self.loops = []
-        else:
-            self.loops = loop_contexts_or_iters
-
-    def is_empty(self):
-        return len(self.loops) == 0
-
-    def __len__(self):
-        return len(self.loops)
-
-    ## Generates a new loop stack with the same length but 0s as values
-    def new_zeroed_loop_stack(self):
-        return [0 for i in self.loops]
-
-    def get_outer(self):
-        return self.loops[-1]
-
-    def pop_outer(self):
-        return self.loops.pop()
-
-    def add_inner(self, loop_iter_id: int):
-        self.loops.insert(0, loop_iter_id)
-
-    def outer_to_inner(self):
-        return self.loops[::-1]
-
-    def index(self, loop_iter_id: int) -> int:
-        return self.loops.index(loop_iter_id)
-
-    def get(self, index: int):
-        return self.loops[index]
-
-    def __repr__(self):
-        ## TODO: Represent it using 'it', 'it0', 'it1', etc
-        ##       or -(iters)- in front of it.
-        output = "-".join([str(it) for it in self.loops])
-        return output
-
-    def __eq__(self, other):
-        if not len(self.loops) == len(other.loops):
-            return False
-        for i in range(len(self.loops)):
-            if not self.loops[i] == other.loops[i]:
-                return False
-        return True
-
-
-class NodeId:
-    def __init__(self, id: int, loop_iters=None):
-        self.id = id
-
-        if loop_iters is None:
-            self.loop_iters = LoopStack()
-        else:
-            assert(isinstance(loop_iters, LoopStack))
-            self.loop_iters = loop_iters
-
-    def has_iters(self):
-        return not self.loop_iters.is_empty()
-
-    def get_iters(self):
-        return copy.deepcopy(self.loop_iters)
-
-    def get_non_iter_id(self):
-        return NodeId(self.id)
-
-    ## Returns a new NodeId
-    def generate_new_node_id_with_another_iter(self, new_iter: int):
-        ## This node already contains iterations for the outer loops potentially
-        ##  so we just need to add another inner iteration
-        new_iters = copy.deepcopy(self.loop_iters)
-        new_iters.add_inner(new_iter)
-
-        new_node_id = NodeId(self.id, new_iters)
-        return new_node_id
-
-    def __repr__(self):
-        ## TODO: Represent it using n.
-        output = f'{self.id}'
-        if not self.loop_iters.is_empty():
-            output += f'+{self.loop_iters}'
-        return output
-
-    def __hash__(self):
-        return hash(str(self))
-
-    def __eq__(self, other):
-        return self.loop_iters == other.loop_iters and self.id == other.id
-
-    def __ne__(self, other):
-        # Not strictly necessary, but to avoid having both x==y and x!=y
-        # True at the same time
-        return not(self == other)
-
-    ## TODO: Define this correctly if it is to be used for something other than dictionary indexing
-    def __lt__(self, obj):
-        return (str(self) < str(obj))
-
-    def __gt__(self, obj):
-        return (str(self) > str(obj))
-
-    # def __le__(self, obj):
-    #     return ((self.b) <= (obj.b))
-
-    # def __ge__(self, obj):
-    #     return ((self.b) >= (obj.b))
-
-def parse_node_id(node_id_str: str) -> NodeId:
-    if "+" in node_id_str:
-        node_id_int, iters_str = node_id_str.split("+")
-        iters = [int(it) for it in iters_str.split("-")]
-        return NodeId(int(node_id_int), LoopStack(iters))
-    else:
-        return NodeId(int(node_id_str), LoopStack())
-
-class Node:
-    id: NodeId
-    cmd: str
-    asts: "list[AstNode]"
-    loop_context: LoopStack
-
-    def __init__(self, id, cmd, asts, loop_context: LoopStack):
-        self.id = id
-        self.cmd = cmd
-        self.asts = asts
-        ## There can only be a single AST per node, and this
-        ##  must be a command.
-        assert(len(asts) == 1)
-        # Check that the node contains only CommandNode(s)
-        analysis.validate_node(asts[0])
-        self.cmd_no_redir = trace.remove_command_redir(self.cmd)
-        self.loop_context = loop_context
-        ## Keep track of how many iterations of this loop node we have unrolled
-        if not loop_context.is_empty():
-            self.current_iters = loop_context.new_zeroed_loop_stack()
-
-    def __str__(self):
-        # return f"ID: {self.id}\nCMD: {self.cmd}\nR: {self.read_set}\nW: {self.write_set}"
-        return self.cmd
-
-    def __repr__(self):
-        # return f"ID: {self.id}\nCMD: {self.cmd}\nR: {self.read_set}\nW: {self.write_set}"
-        return f'N({self.cmd})'
-
-    def get_cmd(self) -> str:
-        return self.cmd
-
-    def get_cmd_no_redir(self) -> str:
-        return self.cmd_no_redir
-
-    def get_loop_context(self) -> LoopStack:
-        return self.loop_context
-
-    def in_loop(self) -> bool:
-        return not self.loop_context.is_empty()
-
-    ## KK 2023-05-17 Does this generate the correct iteration even in nested loops?
-    def get_next_iter(self, loop_id: int) -> int:
-        assert(self.in_loop())
-        assert(self.loop_context.get_outer() == loop_id)
-        loop_id_index_in_loop_context_stack = self.loop_context.index(loop_id)
-        self.current_iters[loop_id_index_in_loop_context_stack] += 1
-        return self.current_iters[loop_id_index_in_loop_context_stack]
-
-    ## Note: This information is valid only after a node is committed.
-    ##       It might be set even before that, but it should only be retrieved when
-    ##         a node is committed.
-    def set_completed_info(self, completed_node_info: CompletedNodeInfo):
-        self.completed_node_info = completed_node_info
-
-    def get_completed_node_info(self) -> CompletedNodeInfo:
-        return self.completed_node_info
-
-class RWSet:
-
-    def __init__(self, read_set: set, write_set: set):
-        self.read_set = read_set
-        self.write_set = write_set
-
-    def add_to_read_set(self, item: str):
-        self.read_set.add(item)
-
-    def add_to_write_set(self, item: str):
-        self.write_set.add(item)
-
-    def get_read_set(self) -> set:
-        return self.read_set
-
-    def get_write_set(self) -> set:
-        return self.write_set
-
-    def __str__(self):
-        return f"RW(R:{self.get_read_set()}, W:{self.get_write_set()})"
 
 
 class PartialProgramOrder:
-
-    def __init__(self, nodes, edges, initial_env_file):
+    frontier: set  # Set of nodes at the frontier
+    run_after: set  # Nodes that should run after certain conditions
+    window: int  # Integer representing the window
+    to_be_resolved: "dict[NodeId, list[Node]]"  # Mapping of nodes to lists of uncommitted nodes
+    nodes: "dict[NodeId, Node]"
+    adjacency: "dict[NodeId, list[NodeId]]"
+    inverse_adjacency: "dict[NodeId, list[NodeId]]"
+    
+    def __init__(self, nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId]]"):
         self.nodes = nodes
-        # TODO: consider changing values to sets instead of lists
         self.adjacency = edges
-        self.init_inverse_adjacency()
-        ## TODO: KK: Is it OK if we modify adjacency lists on the fly while processing the partial-order?
-        ## TODO: Remember to modify inverse_adjacency
-        ## self.committed is an add-only set, we never remove
-        ## TODO: For loop modify committed, workset, frontier, stopped
-        ## TODO: Add assertions that committed etc do not contain loop nodes
-        self.committed = set()
-        ## Nodes that are in the frontier can only move to committed
-        self.frontier = []
-        self.rw_sets = {node_id: None for node_id in self.nodes.keys()}
-        self.workset = []
-        ## A dictionary from cmd_ids that are currently executing that contains their trace_files
-        self.commands_currently_executing = {}
-        ## A dictionary that contains information about completed nodes
-        ## from cmd_id -> CompletedNodeInfo
-        ## Note: this dictionary does not contain information
-        ## TODO: Delete this
-        self.completed_node_info = {}
-        ## KK 2023-05-09 @Giorgo What is the difference of the following two?
-        self.to_be_resolved = {}
-        self.speculated = set()
-        ## Contains the most recent sandbox directory paths
-        self.sandbox_dirs = {}
-        ## Commands that were killed by riker
-        ## we should keep those in the workset but not execute them
-        ## until they reach the frontier
-        self.stopped = set()
-        ## Commands deemed unsafe from our analysis, that have to be executed
-        ##  in the original shell (e.g., shell primitives)
-        ## Invariant: self.unsafe \subseteq self.stopped
-        self.unsafe = set()
-        self.committed_order = []
-        self.commit_state = {}
-        ## Counts the times a node was (re)executed
-        self.executions = {node_id: 0 for node_id in self.nodes.keys()}
-        self.banned_files = set()
-        self.new_envs = {}
-        self.latest_envs = {}
-        self.initial_env_file = initial_env_file
-        self.waiting_for_frontend = set()
-        ## In case we spot a dependency meaning a node must execute after another node, it will appear here
-        ## Contains the nodes to execute only after the key node finishes execution
-        self.run_after = defaultdict(set)
-        self.pending_to_execute = set()
-        self.to_be_resolved_prev = {}
-        self.prechecked_env = set()
-
-    def __str__(self):
-        return f"NODES: {len(self.nodes.keys())} | ADJACENCY: {self.adjacency}"
-
-    def get_source_nodes(self) -> list:
-        sources = set()
-        for to_id, from_ids in self.inverse_adjacency.items():
-            if len(from_ids) == 0:
-                sources.add(to_id)
-        return list(sources)
-
-    def get_standard_source_nodes(self) -> list:
-        source_nodes = self.get_source_nodes()
-        return self.filter_standard_nodes(source_nodes)
-
-    ## This returns the minimum w.r.t. to the PO of a bunch of node_ids.
-    ## In a real partial order, this could be many,
-    def get_min(self, node_ids: "list[NodeId]") -> "list[NodeId]":
-        potential_minima = set(copy.deepcopy(node_ids))
-        for node_id in node_ids:
-            tc = self.get_transitive_closure([node_id])
-            ## Remove the node itself from its transitive closure
-            tc.remove(node_id)
-            ## If a node is found in the tc of another node, then
-            ##  it is not a minimum
-            for nid in tc:
-                potential_minima.discard(nid)
-        ## KK 2023-05-22 This will be removed at some point but I keep it here
-        ##    for now for easier bug finding.
-        # logging.debug(f"Potential minima: {potential_minima}")
-        assert(len(potential_minima) == 1)
-        return list(potential_minima)
-
-    ## This returns all previous nodes of a sub partial order
-    def get_sub_po_source_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]":
-        # assert(self.is_closed_sub_partial_order(node_ids))
-        source_nodes = list()
-        node_set = set(node_ids)
-        for node_id in node_ids:
-            prev_ids_set = set(self.get_prev(node_id))
-            ## KK 2023-05-04 is it ever the case that some (but not all) prev nodes might be outside. I don't think so
-            if len(prev_ids_set) == 0 or \
-                not prev_ids_set.issubset(node_set):
-                source_nodes.append(node_id)
-
-        ## KK 2024-05-03: I don't see how we can get multiple sources with the current structure
-        assert(len(source_nodes) == 1)
-        return source_nodes
-
-    def get_sub_po_sink_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]":
-        # assert(self.is_closed_sub_partial_order(node_ids))
-        sink_nodes = list()
-        node_set = set(node_ids)
-        for node_id in node_ids:
-            next_ids_set = set(self.get_next(node_id))
-            ## KK 2023-05-04 is it ever the case that some (but not all) prev nodes might be outside. I don't think so
-            if len(next_ids_set) == 0 or \
-                not next_ids_set.issubset(node_set):
-                sink_nodes.append(node_id)
-
-        ## KK 2024-05-03: I don't see how we can get multiple sink with the current structure
-        assert(len(sink_nodes) == 1)
-        return sink_nodes
-
-    def set_new_env_file_for_node(self, node_id: NodeId, new_env_file: str):
-        self.new_envs[node_id] = new_env_file
-
-    def get_new_env_file_for_node(self, node_id: NodeId) -> str:
-        return self.new_envs.get(node_id)
-
-    def set_latest_env_file_for_node(self, node_id: NodeId, latest_env_file: str):
-        self.latest_envs[node_id] = latest_env_file
-
-    def get_latest_env_file_for_node(self, node_id: NodeId) -> str:
-        return self.latest_envs.get(node_id)
-
-    def get_most_recent_possible_new_env_for_node(self, node_id) -> str:
-        most_recent_env_node = node_id
-        while self.get_new_env_file_for_node(most_recent_env_node) is None:
-            predecessor = self.get_prev(most_recent_env_node)
-
-            ## This will trigger when we move to full Partial Orders
-            assert len(predecessor) <= 1
-
-            ## If there are no predecessors for a node it means we are at the source
-            ## so there is no point to search further back
-            if len(predecessor) == 0:
-                break
-            else:
-                most_recent_env_node = predecessor[0]
-
-        return self.get_new_env_file_for_node(most_recent_env_node)
-
-    ## This returns all previous nodes of a sub partial order
-    def get_sub_po_prev_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]":
-        # assert(self.is_closed_sub_partial_order(node_ids))
-        prev_nodes = set()
-        node_set = set(node_ids)
-        for node_id in node_ids:
-            prev_ids_set = set(self.get_prev(node_id))
-            prev_nodes = prev_nodes.union(prev_ids_set - node_set)
-
-        ## KK 2024-05-03: I don't see how we can get multiple sources with the current structure
-        assert(len(prev_nodes) <= 1)
-        return list(prev_nodes)
-
-    ## TODO: Implement this correctly. I have thought of a naive algorithm that
-    ##       does a BFS forward and backward for each node and if we first see a
-    ##       node outside of the set and then one inside it means that the subset is not closed.
-    def is_closed_sub_partial_order(self, node_ids: "list[NodeId]") -> bool:
-        # node_set = set(node_ids)
-        # visited_set = set()
-        # for node_id in node_ids:
-        #     prev_ids_set = set(self.get_prev(node_id))
-        #     next_id_set = set(self.get_next(node_id))
-        #     ## If one of the previous or next nodes is not in the node set
-        #     ## it means that the sub partial order is not closed.
-        #     if not node_set.issuperset(prev_ids_set.union(next_id_set)):
-        #         return False
-
-        return True
-
+        self.inverse_adjacency = self.init_inverse_adjacency()
+        self.frontier = set()
+        self.run_after = set()
+        self.window = 0
+        self.to_be_resolved = {} 
+        
     def init_partial_order(self):
-        ## Initialize the frontier with all non-loop source nodes
-        self.frontier = self.get_standard_source_nodes()
-        ## Initialize the workset
-        self.init_workset()
-        logging.debug(f'Initialized workset')
-        self.populate_to_be_resolved_dict()
-        if config.SPECULATE_IMMEDIATELY:
-            self.init_latest_env_files()
-        logging.debug(f'To be resolved sets per node:')
-        logging.debug(self.to_be_resolved)
-        logging.info(f'Initialized the partial order!')
-        # self.log_partial_program_order_info()
-        assert(self.valid())
-
-
-    def init_latest_env_files(self, node=None):
-        if node is None:
-            env_to_assign = self.initial_env_file
-        else:
-            env_to_assign = self.get_new_env_file_for_node(node)
-        for node_id in self.get_all_non_committed():
-            self.set_latest_env_file_for_node(node_id, env_to_assign)
-
-
-    def init_workset(self):
-        self.workset = self.get_all_non_committed_standard_nodes()
+        for node_id, node in self.nodes.items():
+            node.transition_to_ready()
+        # TODO: Implement the rest of the partial order initialization
 
-    ## Check if the partial order is done
-    def is_completed(self) -> bool:
-        return len(self.get_all_non_committed_standard_nodes()) == 0
-
-    def get_workset(self) -> list:
-        return self.workset
-
-    def get_unsafe(self) -> set:
-        return copy.deepcopy(self.unsafe)
-
-    ## Only return the stopped that are not unsafe
-    def get_stopped_safe(self) -> set:
-        return copy.deepcopy(self.stopped.difference(self.unsafe))
-
-    ## When we remove a command from unsafe we always remove from stopped too
-    def remove_from_unsafe(self, node_id: NodeId):
-        self.unsafe.remove(node_id)
-        self.stopped.remove(node_id)
-
-    def get_committed(self) -> set:
-        return copy.deepcopy(self.committed)
-
-    def get_committed_list(self) -> list:
-        return sorted(list(self.committed))
-
-    def is_committed(self, node_id: NodeId) -> bool:
-        return node_id in self.committed
+    def commit_node(self, node):
+        # Commit the given node by delegating to its own transition_to_committed() method.
+        node.transition_to_committed()
+        # TODO: maybe update dependencies here, etc.
+        # (nothing beyond the state transition is implemented yet)
 
     def init_inverse_adjacency(self):
-        self.inverse_adjacency = {i: [] for i in self.nodes.keys()}
+        inverse_adjacency = {i: [] for i in self.nodes.keys()}
         for from_id, to_ids in self.adjacency.items():
             for to_id in to_ids:
-                self.inverse_adjacency[to_id].append(from_id)
-
-    # ## TODO: (When there is time) Define a function that checks that the graph is valid
-    ## TODO: Call valid and add assertiosn for loops here.
-    def valid(self):
-        logging.debug("Checking partial order validity...")
-        # self.log_partial_program_order_info()
-        valid1 = self.loop_nodes_valid()
-        ## TODO: Add a check that for x, y : NodeIds, x < y iff x is a predecessor to x
-        ##       This is necessary due to the `hypothetical_before` method.
-
-        ## Any command in unsafe must also be in stopped
-        valid2 = self.unsafe.issubset(self.stopped)
-
-        ## TODO: Fix the checks below because they do not work currently
-        ## TODO: Check that committed is prefix closed w.r.t partial order
-        return valid1 and valid2
-
-    ## Checks if loop nodes are all valid, i.e., that there are no loop nodes handled like normal ones,
-    ##   e.g., in workset, frontier etc
-    ##
-    ## Note that loop nodes can be in the committed set (after we are done executing all iterations of a loop)
-    def loop_nodes_valid(self):
-        # GL 2023-07-08: This works without get_all_next_non_committed_nodes(), not sure why
-        forbidden_sets = self.get_all_next_non_committed_nodes() + \
-                         self.get_workset() + \
-                         list(self.stopped) + \
-                         list(self.commands_currently_executing.keys())
-        loop_nodes_in_forbidden_sets = [node_id for node_id in forbidden_sets
-                                if self.is_loop_node(node_id)]
-        return len(loop_nodes_in_forbidden_sets) == 0
-
-    def __len__(self):
-        return len(self.nodes)
-
-    def get_node(self, node_id:NodeId) -> Node:
+                inverse_adjacency[to_id].append(from_id)
+        return inverse_adjacency
+    
+    def get_node(self, node_id: NodeId) -> Node:
         return self.nodes[node_id]
-
-    def is_node_id(self, node_id:NodeId) -> bool:
-        return node_id in self.nodes
-
-    def get_node_loop_context(self, node_id: NodeId) -> LoopStack:
-        return self.get_node(node_id).get_loop_context()
-
-    def get_all_non_committed(self) -> "list[NodeId]":
-        all_node_ids = self.nodes.keys()
-        non_committed_node_ids = [node_id for node_id in all_node_ids
-                                  if not self.is_committed(node_id)]
-        return non_committed_node_ids
-
-    ## This adds a node to the committed set and saves important information
-    def commit_node(self, node_id: NodeId):
-        logging.debug(f" > Commiting node {node_id}")
-        self.committed.add(node_id)
-
-
-    def is_loop_node(self, node_id:NodeId) -> bool:
-        return self.get_node(node_id).in_loop()
-
-    ## Only keeps standard (non-loop) nodes
-    def filter_standard_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]":
-        return [node_id for node_id in node_ids
-                if not self.is_loop_node(node_id)]
-
-    def filter_loop_nodes(self, node_ids: "list[NodeId]") -> "list[NodeId]":
-        return [node_id for node_id in node_ids
-                if self.is_loop_node(node_id)]
-
-    ## This creates a new node_id and then creates a mapping from the node and iteration id to this node id
-    ## TODO: Currently doesn't work with nested loops
-    def create_node_id_with_one_less_loop_from_loop_node(self, node_id: NodeId, loop_id: int) -> NodeId:
-        node = self.get_node(node_id)
-        logging.debug(f' >>> Node: {node}')
-        logging.debug(f' >>> its loops: {node.loop_context} --- {node.current_iters}')
-
-        new_iter = node.get_next_iter(loop_id)
-        ## Creates a new node id where we have appended the new iter
-        new_node_id = node_id.generate_new_node_id_with_another_iter(new_iter)
-        logging.debug(f' >>> new node_id with another iter: {new_node_id}')
-        return new_node_id
-
-
-    ## Returns all non committed non-loop nodes
-    def get_all_non_committed_standard_nodes(self) -> "list[NodeId]":
-        all_non_committed = self.get_all_non_committed()
-        logging.debug(f"All non committed nodes: {all_non_committed}")
-        return self.filter_standard_nodes(all_non_committed)
-
-    def get_next(self, node_id:NodeId) -> "list[NodeId]":
-        return self.adjacency[node_id][:]
-
-    def get_prev(self, node_id:NodeId) -> "list[NodeId]":
-        return self.inverse_adjacency[node_id][:]
-
-    def add_edge(self, from_id: NodeId, to_id: NodeId):
-        ## KK 2023-05-04 Is it a problem that we append? Maybe we should make that a set
-        self.adjacency[from_id].append(to_id)
-        self.inverse_adjacency[to_id].append(from_id)
-
-    def remove_edge(self, from_id: NodeId, to_id: NodeId):
-        self.adjacency[from_id].remove(to_id)
-        self.inverse_adjacency[to_id].remove(from_id)
-
-    def get_transitive_closure(self, target_node_ids:"list[NodeId]") -> "list[NodeId]":
-        all_next_transitive = set(target_node_ids)
-        next_work = target_node_ids.copy()
-        while len(next_work) > 0:
-            node_id = next_work.pop()
-            successors = set(self.get_next(node_id))
-            new_next = successors - all_next_transitive
-            all_next_transitive = all_next_transitive.union(successors)
-            next_work.extend(new_next)
-        return list(all_next_transitive)
-
-    def get_inverse_transitive_closure(self, target_node_ids:"list[NodeId]") -> "list[NodeId]":
-        all_prev_transitive = set(target_node_ids)
-        next_work = target_node_ids.copy()
-        while len(next_work) > 0:
-            node_id = next_work.pop()
-            predecessors = set(self.get_prev(node_id))
-            new_prev = predecessors - all_prev_transitive
-            all_prev_transitive = all_prev_transitive.union(predecessors)
-            next_work.extend(new_prev)
-        return list(all_prev_transitive)
-
-    def get_transitive_closure_if_can_be_resolved(self, can_be_resolved: list, target_node_ids: list) -> list:
-        all_next_transitive = set(target_node_ids)
-        next_work = target_node_ids.copy()
-        while len(next_work) > 0:
-            node_id = next_work.pop()
-            successors = {next_node_id for next_node_id in self.get_next(node_id) if next_node_id in can_be_resolved}
-            new_next = successors - all_next_transitive
-            all_next_transitive = all_next_transitive.union(successors)
-            next_work.extend(new_next)
-        return list(all_next_transitive)
-
-    def update_rw_set(self, node_id, rw_set):
-        self.rw_sets[node_id] = rw_set
-
-    def get_rw_set(self, node_id) -> RWSet:
-        return self.rw_sets[node_id]
-
-    def get_rw_sets(self) -> dict:
-        return self.rw_sets
-
-    def add_to_read_set(self, node_id: NodeId, item: str):
-        self.rw_sets[node_id].add_to_read_set(item)
-
-    def add_to_write_set(self, node_id: NodeId, item: str):
-        self.rw_sets[node_id].add_to_write_set(item)
-
-    def add_to_speculated(self, node_id: NodeId):
-        self.speculated = self.speculated.union([node_id])
-
-    def is_first_node_when_env_is_uninitialized(self, speculate_immediately):
-        if not speculate_immediately:
-            starting_env_node = self.get_source_nodes()
-            ## We may have a loop node at the start
-            ## In that case, we roll back to the initial env
-            if len(starting_env_node) > 0 and self.get_latest_env_file_for_node(starting_env_node[0]) is None:
-                logging.debug("Initializing latest env and speculating")
-                return True
-        return False
-
-    # Check if the specific command can be resolved.
-    # KK 2023-05-04 I am not even sure what this function does and why is it useful.
-    def cmd_can_be_resolved(self, node_id: int) -> bool:
-        logging.debug(f'Checking if node {node_id} can be resolved...')
-        ## Get inverse_transitive_closure to find all nodes that are before this one
-        inverse_tc_node_ids = self.get_inverse_transitive_closure([node_id])
-
-        ## Out of those nodes, filter out the non-committed ones
-        non_committed_nodes_in_inverse_tc = [node_id for node_id in inverse_tc_node_ids
-                                                  if not self.is_committed(node_id)]
-        logging.debug(f' > Non committed nodes that are predecessors to {node_id} are: {non_committed_nodes_in_inverse_tc}')
-
-        currently_executing_ids = self.get_currently_executing()
-        logging.debug(f' > Currently executing: {currently_executing_ids}')
-
-        ## TODO: Make this check more efficient
-        for other_node_id in non_committed_nodes_in_inverse_tc:
-            ## If one of the non-committed nodes in the inverse_tc is currently executing then
-            ## we can't resolve this command
-            ## KK 2023-05-04 This is not sufficient. In the future (where we don't speculate everything at once)
-            ##               there might be a case where nothing is executing but a command can still not be resolved.
-            if other_node_id in currently_executing_ids:
-                logging.debug(f' >> Cannot resolve {node_id}: Node {other_node_id} in non committed inverse tc is currently executing')
-                return False
-
-            ## If there exists a loop node that is not committed before the command then we cannot resolve.
-            if self.is_loop_node(other_node_id):
-                logging.debug(f' >> Cannot resolve {node_id}: Node {other_node_id} in non committed inverse tc is a loop node')
-                return False
-
-        ## Otherwise we can return
-        logging.debug(f' >> Able to resolve {node_id}')
-        return True
-
-    def __kill_all_currently_executing_and_schedule_restart(self, start=None):
-        nodes_to_kill = self.get_currently_executing()
-        if start is not None:
-            nodes_to_kill = [node_id for node_id in nodes_to_kill if node_id in self.get_transitive_closure([start])]
-        for cmd_id in nodes_to_kill:
-            self.__kill_node(cmd_id)
-            most_recent_new_env = self.get_most_recent_possible_new_env_for_node(cmd_id)
-            self.prechecked_env.discard(cmd_id)
-            if most_recent_new_env is not None:
-
-                self.set_latest_env_file_for_node(cmd_id, most_recent_new_env)
-            self.workset.remove(cmd_id)
-            log_time_delta_from_named_timestamp("PartialOrder", "RunNode", cmd_id)
-            log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", cmd_id, key=f"PostExecResolution-{cmd_id}")
-        # Our new workset is the nodes that were killed
-        # Previous workset got killed
-        self.workset.extend(nodes_to_kill)
-
-
-    def __kill_node(self, cmd_id: "NodeId"):
-        logging.debug(f'Killing and restarting node {cmd_id} because some workspaces have to be committed')
-        proc_to_kill, trace_file, _stdout, _stderr, _post_execution_env_file, _ = self.commands_currently_executing.pop(cmd_id)
-        # Add the trace file to the banned file list so we know to ignore the CommandExecComplete response
-        self.banned_files.add(trace_file)
-
-        alive_after_kill = util.kill_process_tree(proc_to_kill.pid)
-
-        if alive_after_kill:
-            logging.critical("Processes still alive after attempting to kill:")
-            for proc in alive_after_kill:
-                logging.critical(proc)
-        else:
-            logging.debug("All processes were successfully terminated.")
-
-    def resolve_commands_that_can_be_resolved_and_push_frontier(self):
-        # This may be obsolete since we only resolve one node at a time
-        # cmds_to_resolve = self.__pop_cmds_to_resolve_from_speculated()
-        # assert len(cmds_to_resolve) <= 1
-        if len(self.speculated) == 0:
-            cmds_to_resolve = []
-        else:
-            cmds_to_resolve = [self.speculated.pop()]
-        logging.debug(f"Commands to check for dependencies this round are: {sorted(cmds_to_resolve)}")
-        logging.debug(f"Commands that cannot be resolved this round are: {sorted(self.speculated)}")
-        ## Resolve dependencies for the commands that can actually be resolved
-        to_commit = self.__resolve_dependencies_continuous_and_move_frontier(cmds_to_resolve)
-        for cmd in to_commit:
-            log_time_delta_from_named_timestamp("PartialOrder", "ResolveDependencies", cmd)
-            log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", cmd, key=f"PostExecResolution-{cmd}")
-            log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProcKilling")
-
-        if len(to_commit) == 0:
-            logging.debug(" > No nodes to be committed this round")
-        else:
-            logging.debug(f" > Nodes to be committed this round: {to_commit}")
-            logging.trace(f"Commit|"+",".join(str(node_id) for node_id in to_commit))
-            if config.SANDBOX_KILLING:
-                logging.info("Sandbox killing")
-                self.__kill_all_currently_executing_and_schedule_restart(to_commit)
-            log_time_delta_from_named_timestamp("PartialOrder", "ProcKilling")
-            self.commit_cmd_workspaces(to_commit)
-
-    def check_dependencies(self, cmds_to_check, get_first_cmd_ids_fn, update_state_due_to_a_dependency_fn):
-        for second_cmd_id in cmds_to_check:
-            for first_cmd_id in get_first_cmd_ids_fn(second_cmd_id):
-
-                if self.rw_sets.get(first_cmd_id) is not None and self.has_forward_dependency(first_cmd_id, second_cmd_id):
-                    update_state_due_to_a_dependency_fn(first_cmd_id, second_cmd_id)
-
-    # Internal function, modified the run_after dict and the pending_to_execute set
-    def __populate_run_after_dict(self):
-        for node in self.pending_to_execute.copy():
-            prev_to_be_resolved = self.to_be_resolved_prev.get(node)
-            if prev_to_be_resolved is None:
-                return
-            # Check if env has changed since last comparison
-            elif set(self.to_be_resolved[node]) == set(prev_to_be_resolved):
-                # Not caring about this dependency because env has not yet changed
-                self.pending_to_execute.remove(node)
-                for k, v in self.run_after.items():
-                    if node in v:
-                        self.run_after[k].remove(node)
-
-    ## Spots dependencies and updates the state.
-    ## Safe to call everywhere
-    def resolve_dependencies_early(self, node_id=None):
-        def get_first_cmd_ids(second_cmd_id):
-            return sorted(self.to_be_resolved[second_cmd_id], reverse=True)
-
-        def update_state_due_to_a_dependency(first_cmd_id, second_cmd_id):
-            self.waiting_for_frontend.discard(second_cmd_id)
-            self.run_after[first_cmd_id].add(second_cmd_id)
-            self.pending_to_execute.add(second_cmd_id)
-            logging.debug(f"Early resolution: Rerunning node {second_cmd_id} after {first_cmd_id} because of a dependency")
-            log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolution", second_cmd_id)
-
-        to_check = {node for node in self.waiting_for_frontend if node not in self.speculated}
-        if node_id is not None:
-            to_check.add(node_id)
-        self.check_dependencies(to_check, get_first_cmd_ids, update_state_due_to_a_dependency)
-        self.populate_to_be_resolved_dict()
-        self.__populate_run_after_dict()
-
-    def resolve_dependencies(self, cmds_to_resolve):
-        def get_first_cmd_ids(second_cmd_id):
-            return sorted([cmd_id for cmd_id in self.to_be_resolved[second_cmd_id] if cmd_id not in self.stopped])
-
-        def update_state_due_to_a_dependency(first_cmd_id, second_cmd_id):
-            logging.debug(f' > Command {second_cmd_id} was added to the workset, due to a forward dependency with {first_cmd_id}')
-            new_workset.add(second_cmd_id)
-
-        new_workset = set()
-        self.check_dependencies(sorted(cmds_to_resolve), get_first_cmd_ids, update_state_due_to_a_dependency)
-
-        return new_workset
-
-
-    ## Resolve all the forward dependencies and update the workset
-    ## Forward dependency is when a command's output is the same
-    ## as the input of a following command
-    def __resolve_dependencies_continuous_and_move_frontier(self, cmds_to_resolve):
-        # self.log_partial_program_order_info()
-        for cmd in cmds_to_resolve:
-            log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ResolveDependencies", cmd)
-
-        logging.debug(f"Commands to be checked for dependencies: {sorted(cmds_to_resolve)}")
-        logging.debug(" --- Starting dependency resolution --- ")
-        new_workset = self.resolve_dependencies(cmds_to_resolve)
-
-        logging.debug(" > Modifying workset accordingly")
-        # New workset contains previous unresolved commands and resolved commands with dependencies that have not been stopped
-        workset_old = self.workset.copy()
-        self.workset = [cmd_id for cmd_id in self.workset if cmd_id not in cmds_to_resolve and cmd_id not in self.stopped]
-        self.workset.extend(list(new_workset))
-        workset_diff = set(self.workset) - set(workset_old)
-        logging.trace(f"WorksetAdd|{','.join(str(cmd_id) for cmd_id in workset_diff)}")
-
-        # Keep the previous committed state
-        old_committed = self.get_committed()
-
-        # We want stopped commands to not enter the workset again yet
-        assert(set(self.workset).isdisjoint(self.stopped))
-
-        self.__frontier_commit_and_push()
-        # self.log_partial_program_order_info()
-        return set(self.get_committed()) - old_committed
-
-
-    ## This method checks if nid1 would be before nid2 if nid2 was part of the PO.
-    ##
-    ## Therefore it does not just check edges, but rather computes if it would be before
-    ##  based on ids and loop iterations.
-    ##
-    ## 1. Check if the loop ids of the two abstract parents of both nodes differ
-    ##     thus showing that one is before the other
-    ## 2. If all loop ids are the same, now we can actually compare iterations.
-    ##     If a node is in the same loop ids but in a later iteration then it is later.
-    ## 3. If all iterations are the same too, then we just compare node ids
-    ##
-    ## KK 2023-05-22 This is a complex procedure, I wonder if we can simplify it in some way
-    def hypothetical_before(self, nid1: NodeId, nid2: NodeId):
-        raw_id1 = nid1.get_non_iter_id()
-        ## Get all loop ids that nid1 could be in
-        loop_ids1 = self.get_node_loop_context(raw_id1)
-
-        raw_id2 = nid1.get_non_iter_id()
-        ## Get all loop ids that nid2 could be in
-        loop_ids2 = self.get_node_loop_context(raw_id2)
-
-        i = 0
-        while i < len(loop_ids1) and i < len(loop_ids2):
-            loop_id_1 = loop_ids1.get(len(loop_ids1) - 1 - i)
-            loop_id_2 = loop_ids2.get(len(loop_ids2) - 1 - i)
-            ## If the first node is in a previous loop than the second,
-            ##  then we are done.
-            if loop_id_1 < loop_id_2:
-                return True
-            elif loop_id_1 > loop_id_2:
-                return False
-
-            ## We need to keep going
-            i += 1
-
-        ## If we reach this, we know that both nodes are in the same loops up to i
-        ##  so we now compare iterations and node identifiers.
-
-        iters1 = nid1.get_iters()
-        iters2 = nid2.get_iters()
-
-        i = 0
-        while i < len(iters1) and i < len(iters2):
-            iter1 = iters1.get(len(iters1) - 1 - i)
-            iter2 = iters2.get(len(iters2) - 1 - i)
-            ## If the first node is in a previous iteration than the second,
-            ##  then we are done.
-            if iter1 < iter2:
-                return True
-            elif iter1 > iter2:
-                return False
-            ## We need to keep going
-            i += 1
-
-        ## We now know that their common prefix of iterations is the same
-
-        ## Check if the node could potentially generate other nodes that are bigger
-        ##  i.e., if it is more abstract. If so, then it is not smaller.
-        common_loop_depth = min(len(loop_ids1), len(loop_ids2))
-        abstract_depth1 = max(common_loop_depth - len(iters1), 0)
-        abstract_depth2 = max(common_loop_depth - len(iters2), 0)
-        if abstract_depth1 < abstract_depth2:
-            return True
-        elif abstract_depth1 > abstract_depth2:
-            return False
-
-        return nid1.id < nid2.id
-
-
-    def progress_po_due_to_wait(self, node_id: NodeId):
-        logging.debug(f"Checking if we can progress the partial order after having received a wait for {node_id}")
-        ## The node might not be part of the partial order if it corresponds to
-        ##  a loop node iteration. In this case, we just need to make sure that
-        ##  we commit the right previous loop nodes that are relevant to it.
-        if not self.is_node_id(node_id):
-            ## TODO: This check is not correct currently, it works for now, but when we move to full partial orders it wont anymore,
-            ##        due to the check happening with < in hypothetical before
-            logging.debug(f" > Node {node_id} is not part of the PO so we compute the nodes that would be before it...")
-            all_non_committed = self.get_all_non_committed()
-            all_non_committed_loop_nodes = self.filter_loop_nodes(all_non_committed)
-            non_committed_loop_nodes_that_would_be_predecessors = [n_id for n_id in all_non_committed_loop_nodes
-                                                                   if self.hypothetical_before(n_id, node_id)]
-
-            new_committed_nodes = non_committed_loop_nodes_that_would_be_predecessors
-
-        else:
-            logging.debug(f" > Node {node_id} is part of the PO so we just check its predecessors following the inverse edges...")
-            ## If the node is in the PO, then we can proceed normally and find its predecessors and commit them
-
-            ## Get inverse_transitive_closure to find all nodes that are before this one
-            inverse_tc_node_ids = self.get_inverse_transitive_closure([node_id])
-
-            ## Out of those nodes, filter out the non-committed loop ones
-            non_committed_loop_nodes_in_inverse_tc = [node_id for node_id in inverse_tc_node_ids
-                                                    if not self.is_committed(node_id) and
-                                                    self.is_loop_node(node_id)]
-            logging.debug(f'Non committed loop nodes that are predecessors to {node_id} are: {non_committed_loop_nodes_in_inverse_tc}')
-
-            new_committed_nodes = non_committed_loop_nodes_in_inverse_tc
-
-        ## And "close them"
-        ## TODO: This is a hack here, we need to have a proper method that commits
-        ##       nodes and does whatever else is needed to do (e.g., add new nodes to frontier)
-        logging.debug(f'Adding following loop nodes to committed: {new_committed_nodes}')
-        for node_id in new_committed_nodes:
-            self.commit_node(node_id)
-
-        ## Since we committed some nodes, let's make sure that we also push the frontier
-        ## TODO: Can we do this in a less hacky method? By using a well-defined commit_node_and_push_frontier method?
-        if len(new_committed_nodes) > 0:
-            new_nodes_sinks = self.get_sub_po_sink_nodes(new_committed_nodes)
-            assert(len(new_nodes_sinks) == 1)
-            new_nodes_sink = new_nodes_sinks[0]
-            logging.debug(f'The sink of the newly committed loop nodes is {new_nodes_sink}')
-
-            next_nodes = self.get_next(new_nodes_sink)
-            next_standard_nodes = self.filter_standard_nodes(next_nodes)
-            logging.trace(f"Adding its next nodes to the frontier|{','.join(str(node_id) for node_id in next_standard_nodes)}")
-            self.frontier.extend(next_standard_nodes)
-
-
-
-        ## TODO: Add some form of validity assertion after we are done with this.
-        ##       Just to make sure that we haven't violated the continuity of the committed set.
-
-        ## We check if something can be resolved and stepped forward here
-        ## KK 2023-05-10 This seems to work for all tests (so it might be idempotent
-        ##                since in many tests there is nothing new to resolve after a wait)
-        self.resolve_commands_that_can_be_resolved_and_push_frontier()
-
-    ## When the frontend sends a wait for a node, it means that execution in the frontend has
-    ## already surpassed all nodes prior to it. This is particularly important for loops,
-    ## since we can't always statically predict how many iterations they will do, so the only
-    ## definitive way to know that they are done is to receive a wait for a node after them.
-    def wait_received(self, node_id: NodeId):
-        ## Whenever we receive a wait for a node, we always need to check and "commit" all prior loop nodes
-        ##   since we know that they won't have any more iterations (the JIT frontend has already passed them).
-
-        ## We first have to push and progress the PO due to the wait and then unroll
-        ## KK 2023-05-22 Currently this checks whether a still nonexistent node is
-        ##               would be a successor of existing nodes to commit some of
-        ##               them if needed. Unfortunately, to make this check for a non-existent
-        ##               node is very complex and not elegant.
-        ## TODO: Could we swap unrolling and progressing so that we always
-        ##        check if a node can be progressed by checking edges?
-        log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id)
-        self.progress_po_due_to_wait(node_id)
-        log_time_delta_from_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id)
-
-
-        log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "ProgressingPoDueToWait", node_id)
-        ## Unroll some nodes if needed.
-        if node_id.has_iters():
-            ## TODO: This unrolling can also happen and be moved to speculation.
-            ##       For now we are being conservative and that is why it only happens here
-            ## TODO: Move this to the scheduler.schedule_work() (if we have a loop node waiting for response and we are not unrolled, unroll to create work)
-            self.maybe_unroll(node_id)
-
-        assert(self.valid())
-
-    def find_outer_loop_sub_partial_order(self, loop_id: int, nodes_subset: "list[NodeId]") -> "list[NodeId]":
-        loop_node_ids = []
-        for node_id in nodes_subset:
-            loop_context = self.get_node_loop_context(node_id)
-            ## Note: this only checks for the nodes that have this loop id as their outer loop
-            if not loop_context.is_empty() and loop_id == loop_context.get_outer():
-                loop_node_ids.append(node_id)
-        ## TODO: Assert that this is closed w.r.t. partial order
-        return loop_node_ids
-
-
-    ## This function unrolls a single loop, by first finding all its nodes (they must be contiguous) and then creating new versions of them
-    ## that are concretized. Its second argument describes which subset of all partial order nodes we want to look at.
-    ## That is necessary because when unrolling nested loops, we might end up in a situation where we have unrolled the
-    ## outer loop, but some of the newly created nodes might still be loop nodes (so we might have loop nodes for the same loop in multiple locations).
-    def unroll_single_loop(self, loop_id: int, nodes_subset: "list[NodeId]"):
-        logging.info(f'Unrolling loop with id: {loop_id}')
-        all_loop_node_ids = self.find_outer_loop_sub_partial_order(loop_id, nodes_subset)
-
-        ## We don't want to unroll already committed nodes
-        loop_node_ids = [nid for nid in all_loop_node_ids
-                         if not self.is_committed(nid)]
-
-        logging.debug(f'Node ids for loop: {loop_id} are: {loop_node_ids}')
-
-        ## Create the new nodes and remap adjacencies accordingly
-        node_mappings = {}
-        for node_id in loop_node_ids:
-            node = self.get_node(node_id)
-            new_loop_node_id = self.create_node_id_with_one_less_loop_from_loop_node(node_id, loop_id)
-            node_mappings[node_id] = new_loop_node_id
-            ## The new node has one less loop context than the previous one
-            node_loop_contexts = node.get_loop_context()
-            logging.debug(f'Node: {node_id} loop_contexts: {node_loop_contexts}')
-            assert(node_loop_contexts.get_outer() == loop_id)
-            new_node_loop_contexts = copy.deepcopy(node_loop_contexts)
-            new_node_loop_contexts.pop_outer()
-
-            ## Create the new node
-            self.nodes[new_loop_node_id] = Node(new_loop_node_id, node.cmd, node.asts, new_node_loop_contexts)
-            self.executions[new_loop_node_id] = 0
-        logging.debug(f'New loop ids: {node_mappings}')
-
-        ## Create the new adjacencies, by mapping adjacencies in the node set to the new node ids
-        ## and leaving outside adjacencies as they are
-        for _, new_node_id in node_mappings.items():
-            self.adjacency[new_node_id] = []
-
-        for node_id, new_node_id in node_mappings.items():
-            old_prev_ids = self.get_prev(node_id)
-            ## Modify all id to be in the new set except for the
-            new_prev_ids = PartialProgramOrder.map_using_mapping(old_prev_ids, node_mappings)
-            self.inverse_adjacency[new_node_id] = new_prev_ids
-            for new_prev_id in new_prev_ids:
-                self.adjacency[new_prev_id].append(new_node_id)
-
-        ## TODO: The rest of the code here makes assumptions about the shape of the partial order
-
-        ## Modify the previous node of the loop nodes
-        new_nodes_sinks = self.get_sub_po_sink_nodes(list(node_mappings.values()))
-        assert(len(new_nodes_sinks) == 1)
-        new_nodes_sink = new_nodes_sinks[0]
-        logging.debug(f'The sink of the new iteration for loop: {loop_id} is {new_nodes_sink}')
-
-        old_nodes_sources = self.get_sub_po_source_nodes(list(node_mappings.keys()))
-        assert(len(old_nodes_sources) == 1)
-        old_nodes_source = old_nodes_sources[0]
-
-        old_next_node_ids = self.get_next(new_nodes_sink)
-        assert(len(old_next_node_ids) <= 1)
-
-        previous_ids = self.get_sub_po_prev_nodes(loop_node_ids)
-        assert(len(previous_ids) <= 1)
-
-        ## Add a new edge between the new_sink (concrete iter) and the old_source (loop po)
-        self.add_edge(new_nodes_sink, old_nodes_source)
-
-        ## Remove the old previous edge of the old_source if it exists
-        if len(previous_ids) == 1:
-            previous_id = previous_ids[0]
-            logging.debug(f'Previous node id for loop: {loop_id} is {previous_id}')
-            self.remove_edge(from_id=previous_id,
-                             to_id=old_nodes_source)
-
-
-        ## Return the new first node and all node mappings
-        return node_mappings[old_nodes_source], node_mappings.values()
-
-    ## Static method that just maps using a node mapping dictionary or leaves them as
-    ## they are if not
-    def map_using_mapping(node_ids: "list[NodeId]", mapping) -> "list[NodeId]":
-        new_node_ids = []
-        for node_id in node_ids:
-            if node_id in mapping:
-                new_id = copy.deepcopy(mapping[node_id])
-            else:
-                new_id = copy.deepcopy(node_id)
-            new_node_ids.append(new_id)
-        return new_node_ids
-
-    ## This unrolls a sequence of loops by unrolling each loop outside-in
-    def unroll_loops(self, loop_contexts: LoopStack) -> NodeId:
-        logging.debug(f'Unrolling the following loops: {loop_contexts}')
-
-        ## All new node_ids
-        all_new_node_ids = set()
-        relevant_node_ids = list(self.nodes.keys())
-        for loop_ctx in loop_contexts.outer_to_inner():
-            new_first_node_id, new_node_ids = self.unroll_single_loop(loop_ctx, relevant_node_ids)
-            logging.debug(f'New node ids after unrolling: {new_node_ids}')
-            ## Update all new nodes that we have added
-            all_new_node_ids.update(new_node_ids)
-
-            ## Re-set the relevant node ids to only the new nodes (if we unrolled a big loop once,
-            ##  we just want to look at those new unrolled nodes for the next unrolling).
-            relevant_node_ids = new_node_ids
-
-            logging.debug(f' >>> Edges after unrolling    : {self.adjacency}')
-            logging.debug(f' >>> Inv Edges after unrolling: {self.inverse_adjacency}')
-
-        ## Add all new standard nodes to the workset (since they have to be tracked)
-        for new_node_id in all_new_node_ids:
-            if not self.is_loop_node(new_node_id):
-                self.workset.append(new_node_id)
-                ## GL: 08-24-2023: This might not the best way to treat this as we need
-                ## to update the env half way through the loop.
-                ## For now, we just copy the env from the parent loop node
-                non_iter_id = new_node_id.get_non_iter_id()
-                logging.debug(f"Copying latest env from loop context to loop node: {non_iter_id} -> {new_node_id}")
-                self.latest_envs[new_node_id] = self.latest_envs[non_iter_id]
-
-        ## KK 2023-05-22 Do we need to correctly populate the resolved set of next commands
-        ##               after unrolling the loop.
-
-        return new_first_node_id
-
-    ## This unrolls a loop given a target concrete node id
-    def unroll_loop_node(self, target_concrete_node_id: NodeId):
-        raw_node_id = target_concrete_node_id.get_non_iter_id()
-        assert(self.is_loop_node(raw_node_id))
-
-        logging.debug(f'Edges: {self.adjacency}')
-
-        ## Find the closest non-committed successor with this node id
-        ## Note: This is necessary because we might need to unroll only a subset of the loops that a node is part of.
-        ##       This is relevant when we have nested loops.
-        all_non_committed = self.get_all_non_committed()
-        all_non_committed_loop_nodes = self.filter_loop_nodes(all_non_committed)
-        logging.debug(f'All non committed loop nodes: {all_non_committed_loop_nodes}')
-        source_node_ids = self.get_min(all_non_committed_loop_nodes)
-        ## Note: This assertion might not hold once we have actual partial orders
-        assert(len(source_node_ids) == 1)
-        node_id = source_node_ids[0]
-        logging.debug(f'Closest non-committed loop node successor with raw_id {raw_node_id} is: {node_id}')
-        loop_contexts = self.get_node_loop_context(node_id)
-
-
-        ## Unroll all loops that this node is in
-        new_first_node_id = self.unroll_loops(loop_contexts)
-
-        ## TODO: This needs to change when we modify unrolling to happen speculatively too
-        ## TODO: This needs to properly add the node to frontier and to resolve dictionary
-
-        # GL 2023-05-22: __frontier_commit_and_push() should be called here instead of step_forward()
-        # Although without it the test cases pass
-        self.frontier.append(new_first_node_id)
-
-        ## At the end of unrolling the target node must be part of the PO
-        assert(self.is_node_id(target_concrete_node_id))
-
-
-    def maybe_unroll(self, node_id: NodeId) -> NodeId:
-        ## Only unrolls this node if it doesn't already exist in the PO
-        if not self.is_node_id(node_id):
-            log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "Unrolling", node_id)
-            self.unroll_loop_node(node_id)
-            log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "Unrolling", node_id)
-        ## The node_id must be part of the PO after unrolling, otherwise we did something wrong
-        assert(self.is_node_id(node_id))
-
-
-    ## Pushes the frontier forward as much as possible for all commands in it that can be committed
-    ## This function is not safe to call on its own, since it might leave the PO in a broken state
-    ## It should be called right after
-    def __frontier_commit_and_push(self):
-        logging.debug(" > Commiting and pushing frontier")
-        logging.debug(f' > Frontier: {self.frontier}')
-        changes_in_frontier = True
-        while changes_in_frontier:
-            new_frontier = []
-            changes_in_frontier = False
-            # Second condition below may be unecessary
-            for frontier_node in self.frontier:
-                ## If a node is not in the workset it means that it is actually done executing
-                ## KK 2023-05-10 Do we need all these conditions in here? Some might be redundant?
-                if frontier_node not in self.get_currently_executing() \
-                    and frontier_node not in self.get_committed() \
-                    and frontier_node not in self.stopped \
-                    and frontier_node not in self.speculated \
-                    and frontier_node not in self.workset \
-                    and not self.is_loop_node(frontier_node) \
-                    and frontier_node not in self.waiting_for_frontend:
-                    ## Commit the node
-                    self.commit_node(frontier_node)
-
-                    ## Add its non-loop successors to the frontier
-                    next_nodes = self.get_next(frontier_node)
-                    next_standard_nodes = self.filter_standard_nodes(next_nodes)
-                    logging.trace(f"FrontierAdd|{','.join(str(node_id) for node_id in next_standard_nodes)}")
-                    new_frontier.extend(next_standard_nodes)
-
-                    ## There are some changes in the frontier so we need to reenter the loop
-                    changes_in_frontier = True
-                # If node is still being executed, we cannot progress further
-                else:
-                    new_frontier.extend([frontier_node])
-                    if frontier_node in self.get_currently_executing():
-                        logging.debug(f" > Node {frontier_node} is still being executed")
-                    elif frontier_node in self.get_committed():
-                        logging.debug(f" > Node {frontier_node} is already committed")
-                    elif frontier_node in self.stopped:
-                        logging.debug(f" > Node {frontier_node} is stopped")
-                    elif frontier_node in self.speculated:
-                        logging.debug(f" > Node {frontier_node} is speculated")
-                    elif frontier_node in self.workset:
-                        logging.debug(f" > Node {frontier_node} is in the workset")
-                    elif self.is_loop_node(frontier_node):
-                        logging.debug(f" > Node {frontier_node} is a loop node")
-                    elif frontier_node in self.waiting_for_frontend:
-                        logging.debug(f" > Node {frontier_node} is waiting for frontend")
-                    logging.debug(f" > Not commiting node {frontier_node}, keeping in frontier")
-
-            ## Update the frontier to the new frontier
-            self.frontier = new_frontier
-
-
-    ## For a file - dir forward dependency to exist,
-    ## we need the succeding command to attempt to read anything that is a subpath of the
-    ## write set of the preceeding command.
-    ## e.g. in: W1: {/foo/}  | R2: {/f1, /foo/f2, /foo/bar/f3}
-    ## /foo/f2 and /foo/bar/f3 will trigger the dependency check.
-    def has_dir_file_dependency(self, first_cmd_set, second_cmd_set):
-        # Get all directory paths without the "/" in the end
-        dirs = {dir_path[:-1] for dir_path in first_cmd_set if dir_path.endswith("/")}
-        # Get all files in a separate set
-        to_check = {filepath for filepath in second_cmd_set if not filepath.endswith("/")}
-        for dir in dirs:
-            for other_path in to_check:
-                if self.is_subpath(dir, other_path):
-                    logging.debug(f' > File forward dependency found C1:({dir}) C2:({other_path})')
-                    return True
-        return False
-
-    def is_subpath(self, dir, other_path):
-        other_path.startswith(os.path.abspath(dir)+os.sep)
-
-    def has_forward_dependency(self, first_id, second_id):
-        first_write_set = set(self.rw_sets[first_id].get_write_set())
-        second_read_set = set(self.rw_sets[second_id].get_read_set()).union(set(self.rw_sets[second_id].get_write_set()))
-        logging.debug(f'Checking dependencies between {first_id} and {second_id}')
-        if not first_write_set.isdisjoint(second_read_set):
-            logging.debug(f' > Forward dependency found {first_write_set.intersection(second_read_set)}')
-            return True
-
-        elif self.has_dir_file_dependency(first_write_set, second_read_set):
-            return True
-        else:
-            logging.debug(f' > No dependencies')
-            return False
-
-    def get_all_next_non_committed_nodes(self) -> "list[NodeId]":
-        next_non_committed_nodes = []
-        for cmd_id in self.get_all_non_committed():
-            if cmd_id in self.workset and self.is_next_non_committed_node(cmd_id):
-                next_non_committed_nodes.append(cmd_id)
-        return next_non_committed_nodes
-
-    def is_next_non_committed_node(self, node_id: NodeId) -> bool:
-        # We want the predecessor to be committed and the current node to not be committed
-        for prev_node in self.get_prev(node_id):
-            if not (self.is_committed(prev_node) and not self.is_committed(node_id)):
-                return False
-        return True
-
-    # This command never leaves the partial order at a broken state
-    # It is always safe to call it
-    def attempt_move_stopped_to_workset(self):
-        new_stopped = self.stopped.copy()
-        ## We never remove stopped commands that are unsafe
-        ## from the stopped set to be reexecuted.
-        for cmd_id in self.get_stopped_safe():
-            if self.is_next_non_committed_node(cmd_id):
-                self.workset.append(cmd_id)
-                logging.debug(f"StoppedRemove|{cmd_id}")
-                new_stopped.remove(cmd_id)
-                self.to_be_resolved[cmd_id] = []
-        self.stopped = new_stopped
-
-    ## TODO: Eventually, in the future, let's add here some form of limit
-    def schedule_work(self, limit=0):
-        if self.is_first_node_when_env_is_uninitialized(config.SPECULATE_IMMEDIATELY):
-            logging.debug("Not scheduling work yet, waiting for first Wait")
-            return
-        # self.log_partial_program_order_info()
-        logging.debug("Rerunning stopped commands")
-        # attempt_move_stopped_to_workset() needs to happen before the node execution
-        self.attempt_move_stopped_to_workset()
-        ## GL 2023-07-05 populate_to_be_resolved_dict() is OK to call anywhere,
-        ##            __frontier_commit_and_push() is not safe to call here
-        self.populate_to_be_resolved_dict()
-
-        ## TODO: Move loop unrolling here for speculation too
-
-        conflicted_nodes = self.nodes_with_uncommited_conflict()
-        for cmd_id in self.get_workset():
-            if cmd_id in conflicted_nodes:
-                continue
-            # We only need to schedule non-committed and non-executing nodes
-            if not (cmd_id in self.get_committed() or \
-               cmd_id in self.commands_currently_executing):
-                self.schedule_node(cmd_id)
-        assert(self.valid())
-
-    # Nodes to be scheduled are always not committed and not executing
-    def schedule_node(self, cmd_id):
-        # This replaced the old frontier check
-        log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "RunNode", cmd_id)
-        if self.is_next_non_committed_node(cmd_id):
-            # TODO: run this and before committing kill any speculated commands still executing
-            self.run_cmd_non_blocking(cmd_id)
-        else:
-            if not cmd_id in self.speculated:
-                self.speculate_cmd_non_blocking(cmd_id)
-        return
-
-    ## Run a command and add it to the dictionary of executing ones
-    def run_cmd_non_blocking(self, node_id: NodeId):
-        ## A command should only be run if it's in the frontier, otherwise it should be spec run
-        logging.debug(f'Running command: {node_id} {self.get_node(node_id)}')
-        logging.debug(f"ExecutingAdd|{node_id}")
-        self.to_be_resolved_prev[node_id] = self.to_be_resolved[node_id].copy()
-        self.execute_cmd_core(node_id, speculate=False)
-
-    ## Run a command and add it to the dictionary of executing ones
-    def speculate_cmd_non_blocking(self, node_id: NodeId):
-        logging.debug(f'Speculating command: {node_id} {self.get_node(node_id)}')
-        ## TODO: Since these (this and the function above)
-        ##       are relevant for the report maker,
-        ##       add them in some library (e.g., trace_for_report)
-        ##       so that we don't accidentally delete them.
-        logging.debug(f"ExecutingSandboxAdd|{node_id}")
-        self.execute_cmd_core(node_id, speculate=True)
-
-    def execute_cmd_core(self, node_id: NodeId, speculate=False):
-        node = self.get_node(node_id)
-        ## TODO: Read and pass the actual variables in this
-        variables = {}
-        is_safe = analysis.safe_to_execute(node.asts, variables)
-        if not is_safe:
-            logging.debug(f'Command: "{node}" is not safe to execute, sending to the original shell to execute...')
-
-            ## Keep some state around to determine that this command is not safe to execute.
-            self.stopped.add(node_id)
-            self.unsafe.add(node_id)
-            ## TODO: After we respond to the wait, we need to invalidate all later
-            ##        commands as if they had dependencies with it. In the future,
-            ##        we can be smarter with it. Many unsafe commands will not have
-            ##        other side-effects, so we don't need to invalidate anything after them.
-            return
-
-        cmd = node.get_cmd()
-        self.executions[node_id] += 1
-        env_file_to_execute_with = self.get_latest_env_file_for_node(node_id)
-        logging.debug(f"Executing with environment file: {env_file_to_execute_with}")
-        if speculate:
-            execute_func = executor.async_run_and_trace_command_return_trace_in_sandbox_speculate
-        else:
-            execute_func = executor.async_run_and_trace_command_return_trace
-
-        proc, trace_file, stdout, stderr, post_execution_env_file, sandbox_dir = execute_func(cmd, node_id, env_file_to_execute_with)
-        self.commands_currently_executing[node_id] = (proc, trace_file, stdout, stderr, post_execution_env_file, sandbox_dir)
-        logging.debug(f" >>>>> Command {node_id} - {proc.pid} just started executing - {post_execution_env_file}")
-
-    def nodes_with_uncommited_conflict(self):
-        uncommited_run_after = [node_id for node_id in self.run_after if node_id not in self.committed]
-        total_conflicts = set()
-        for node_id in uncommited_run_after:
-            conflicts = self.run_after[node_id]
-            total_conflicts.update(conflicts)
-        return total_conflicts
-        
-    def kill_and_stop(self, node_id: NodeId):
-        proc, _, _, _, _, _ = self.commands_currently_executing.pop(node_id)
-        util.kill_process_tree(proc.pid, sig=signal.SIGTERM)
-
-    def early_stop_using_dep(self):
-        for node_id, info_tuple in self.commands_currently_executing.items():
-            trace_file = info_tuple[1]
-            sandbox_dir = info_tuple[5]
-            try:
-                trace_object = executor.read_trace(sandbox_dir, trace_file)
-            except FileNotFoundError:
-                continue
-            logging.info(f'going forward')
-            read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object)
-            rw_set = RWSet(read_set, write_set)
-            self.update_rw_set(node_id, rw_set)
-        for node_id in self.commands_currently_executing:
-            self.resolve_dependencies_early(node_id)
-        self.log_partial_program_order_info()
-        conflicts = self.nodes_with_uncommited_conflict()
-        to_be_killed = [node_id for node_id in self.commands_currently_executing if node_id in conflicts]
-        logging.info(f'>>>>>>>>>>>>>>>>> to be killed: {to_be_killed}')
-        for node_id in to_be_killed:
-            self.kill_and_stop(node_id)
-
-    # This method attempts to add to workset (rerun)
-    # any command that found to have a dependency through early resolution
-    def attempt_rerun_pending_nodes(self):
-        restarted_nodes = set()
-        for node_id, run_after_nodes in self.run_after.items():
-            new_run_after_nodes = run_after_nodes.copy()
-            if self.get_new_env_file_for_node(node_id) is not None and node_id not in self.pending_to_execute and node_id not in self.get_currently_executing():
-                for node in run_after_nodes:
-                    if node not in self.get_currently_executing():
-                        logging.debug(f"Running node {node} after execution of {node_id}")
-                        self.waiting_for_frontend.discard(node)
-                        self.workset.append(node)
-                        self.pending_to_execute.discard(node)
-                        self.set_latest_env_file_for_node(node, self.get_new_env_file_for_node(node_id))
-                        restarted_nodes.add(node)
-                        self.prechecked_env.discard(node)
-                        new_run_after_nodes.discard(node)
-            self.run_after[node_id] = new_run_after_nodes
-        return restarted_nodes
-
-    def set_sandbox(self, node_id, sandbox_dir):
-        self.sandbox_dirs[node_id] = sandbox_dir
     
-    def command_execution_completed(self, node_id: NodeId, riker_exit_code:int, sandbox_dir: str):
-        log_time_delta_from_named_timestamp("PartialOrder", "RunNode", node_id)
-        log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolution", node_id, key=f"PostExecResolution-{node_id}")
-
-        logging.debug(f" --- Node {node_id}, just finished execution ---")
-        self.sandbox_dirs[node_id] = sandbox_dir
-        ## TODO: Store variable file somewhere so that we can return when wait
-        if not node_id in self.commands_currently_executing:
-            return
-        _proc, trace_file, stdout, stderr, post_execution_env_file, sandbox_dir = self.commands_currently_executing.pop(node_id)
-        if not sandbox_dir == self.sandbox_dirs[node_id]:
-            return
-        logging.trace(f"ExecutingRemove|{node_id}")
-        # Handle stopped by riker due to network access
-        if int(riker_exit_code) == 159:
-            logging.debug(f" > Adding {node_id} to stopped because it tried to access the network.")
-            logging.trace(f"StoppedAdd|{node_id}:network")
-            self.stopped.add(node_id)
-        else:
-            trace_object = executor.read_trace(sandbox_dir, trace_file)
-            cmd_exit_code = trace_v2.parse_exit_code(trace_object)
-
-            ## Save the completed node info. Note that if the node doesn't commit
-            ##  this information will be invalid and rewritten the next time execution
-            ##  is completed for this node.
-            completed_node_info = CompletedNodeInfo(cmd_exit_code, post_execution_env_file, stdout, sandbox_dir)
-            self.nodes[node_id].set_completed_info(completed_node_info)
-
-            ## We no longer add failed commands to the stopped set,
-            ## because this leads to more repetitions than needed
-            ## and does not allow us to properly speculate commands
-            read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object)
-            rw_set = RWSet(read_set, write_set)
-            self.update_rw_set(node_id, rw_set)
-
-        if node_id in self.stopped:
-            logging.debug(f"Nothing new to be resolved since {node_id} exited with an error.")
-            if node_id in self.workset:
-                self.workset.remove(node_id)
-                logging.debug(f"WorksetRemove|{node_id}")
-            # If no commands can be resolved this round,
-            # do nothing and wait until a new command finishes executing
-            logging.debug("No resolvable nodes were found in this round, nothing will change...")
-            return
-
-
-        log_time_delta_from_named_timestamp("PartialOrder", "PostExecResolutionECCheck", node_id, key=f"PostExecResolution-{node_id}", invalidate=False)
-        log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "PostExecResolutionFrontendWait", node_id)
-
-        ## Here we check if the most recent env has been received. If not, we cannot resolve anything just yet.
-        if self.get_new_env_file_for_node(node_id) is None:
-            logging.debug(f"Node {node_id} has not received its latest env from runtime yet. Waiting...")
-            self.waiting_for_frontend.add(node_id)
-
-            # We will however attempt to resolve dependencies early
-            self.resolve_dependencies_early(node_id)
-            restarted_cmds = self.attempt_rerun_pending_nodes()
-            # self.log_partial_program_order_info()
-        ## Here we continue with the normal execution flow
-        else:
-            logging.debug(f"Node {node_id} has already received its latest env from runtime. Examining differences...")
-            self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id)
-
-    #TODO: Remove ths in the future - we need a more robust approach to check for env diffs.
-    def exclude_insignificant_diffs(self, env_diff_dict):
-        return {k: v for k, v in env_diff_dict.items() if k not in config.INSIGNIFICANT_VARS}
-
-    #TODO: Remove ths in the future - we need a more robust approach to check for env diffs.
-    def include_only_significant_vars(self, env_diff_dict):
-        return {k: v for k, v in env_diff_dict.items() if k in config.SIGNIFICANT_VARS}
-
-    def significant_diff_in_env_dicts(self, only_in_new, only_in_latest, different_in_both):
-        # Exclude insignificant differences
-        only_in_new_sig = self.include_only_significant_vars(only_in_new)
-        only_in_latest_sig = self.include_only_significant_vars(only_in_latest)
-        different_in_both_sig = self.include_only_significant_vars(different_in_both)
-        # If still diffs are present, return False
-        if len(only_in_new_sig) > 0 or len(only_in_latest_sig) > 0 or len(different_in_both_sig) > 0:
-            logging.debug("Significant differences found:")
-            logging.debug(f"Unique to new (Wait):            {only_in_new_sig}")
-            logging.debug(f"Unique to latest (Before Riker): {only_in_latest_sig}")
-            logging.debug(f"Differing values:                {different_in_both_sig}")
-            return True
-        else:
-            logging.debug("No significant differences found:")
-            return False
-
-    def update_env_and_restart_nodes(self, node_id: NodeId):
-        logging.debug(f"Significant differences found between new and latest env files for {node_id}.")
-        logging.debug(f"Assigning node {node_id} new env (Wait) as the new latest env and re-executing.")
-        self.set_latest_env_file_for_node(node_id, self.get_new_env_file_for_node(node_id))
-        self.prechecked_env.discard(node_id)
-        if node_id not in self.workset:
-            self.workset.append(node_id)
-        self.__kill_all_currently_executing_and_schedule_restart(start=node_id)
-        new_waiting_for_frontend = self.waiting_for_frontend.copy()
-        for waiting_for_frontend_node in self.waiting_for_frontend:
-            if waiting_for_frontend_node not in self.workset and waiting_for_frontend_node in self.get_transitive_closure([node_id]):
-                self.workset.append(waiting_for_frontend_node)
-                new_waiting_for_frontend.remove(waiting_for_frontend_node)
-            most_recent_new_env = self.get_most_recent_possible_new_env_for_node(waiting_for_frontend_node)
-            self.set_latest_env_file_for_node(waiting_for_frontend_node, most_recent_new_env)
-            self.prechecked_env.discard(waiting_for_frontend_node)
-            assert(self.get_new_env_file_for_node(node_id) is not None)
-            assert(self.get_latest_env_file_for_node(waiting_for_frontend_node) is not None)
-        # self.log_partial_program_order_info()
-        logging.debug("-")
-        self.waiting_for_frontend = new_waiting_for_frontend
-        self.populate_to_be_resolved_dict()
-
-    def resolve_most_recent_envs_check_only_wait_node_early(self, node_id: NodeId, restarted_cmds=None):
-        if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id),
-                                                                    self.get_latest_env_file_for_node(node_id)):
-            self.update_env_and_restart_nodes(node_id)
-        else:
-            self.prechecked_env.add(node_id)
-
-    def resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(self, node_id: NodeId, restarted_cmds=None):
-        logging.debug(f"Node {node_id} received its latest env from runtime, continuing resolution.")
-        self.waiting_for_frontend.discard(node_id)
-        if node_id not in self.prechecked_env and self.new_and_latest_env_files_have_significant_differences(self.get_new_env_file_for_node(node_id),
-                                                                    self.get_latest_env_file_for_node(node_id)):
-            self.update_env_and_restart_nodes(node_id)
-        else:
-            logging.debug(f"Finding sets of commands that can be resolved after {node_id} finished executing and got its latest env")
-            assert(node_id not in self.stopped)
-            log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "WaitingToResolve", node_id)
-            self.add_to_speculated(node_id)
-            self.resolve_dependencies_early(node_id)
-            restarted_cmds = self.attempt_rerun_pending_nodes()
-            logging.debug(f"Restarted after successful env resolution {restarted_cmds}")
-            # self.log_partial_program_order_info()
-            self.resolve_commands_that_can_be_resolved_and_push_frontier()
-            assert(self.valid())
-
-    def maybe_resolve_most_recent_envs_and_continue_resolution(self, node_id: NodeId):
-        if node_id in self.waiting_for_frontend:
-            logging.debug(f"Node {node_id} received its new env from runtime, continuing full env resolution.")
-            self.resolve_most_recent_envs_and_continue_command_execution_check_only_wait_node(node_id)
-        else:
-            logging.debug(f"Node {node_id} received its new env from runtime, continuing early env resolution.")
-            self.resolve_most_recent_envs_check_only_wait_node_early(node_id)
-
-    def new_and_latest_env_files_have_significant_differences(self, new_env_file, latest_env_file):
-        # Early resolution if same files are compared
-        if new_env_file == latest_env_file:
-            logging.debug(f"Env files are the same. No need to compare.")
-            return False
-        logging.debug(f"Comparing new and latest env files: {new_env_file} {latest_env_file}")
-        assert(latest_env_file is not None)
-
-        new_env = executor.read_env_file(new_env_file)
-        latest_env = executor.read_env_file(latest_env_file)
-
-        only_in_new, only_in_latest, different_in_both = util.compare_env_strings(new_env, latest_env)
-
-        return self.significant_diff_in_env_dicts(only_in_new, only_in_latest, different_in_both)
-
-    def print_cmd_stderr(self, stderr):
-        # stdout.seek(0)
-        # print(stdout.read().decode(), end="")
-        stderr.seek(0)
-        print(stderr.read().decode(), file=sys.stderr, end="")
-
-    def commit_cmd_workspaces(self, to_commit_ids):
-        for cmd_id in sorted(to_commit_ids):
-            log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "CommitNode", cmd_id)
-            workspace = self.sandbox_dirs[cmd_id]
-            if workspace != "":
-                logging.debug(f" (!) Committing workspace of cmd {cmd_id} found in {workspace}")
-                commit_workspace_out = executor.commit_workspace(workspace)
-                logging.debug(commit_workspace_out.decode())
-            else:
-                logging.debug(f" (!) No need to commit workspace of cmd {cmd_id} as it was run in the main workspace")
-            log_time_delta_from_start_and_set_named_timestamp("PartialOrder", "CommitNode", cmd_id)
-
-    def log_rw_sets(self):
-        logging.debug("====== RW Sets " + "=" * 65)
-        for node_id, rw_set in self.rw_sets.items():
-            logging.debug(f"ID:{node_id} | R:{[f for f in rw_set.get_read_set() if 'output_' in f] if rw_set else None} | W:{rw_set.get_write_set() if rw_set is not None else None}")
-
-    def log_partial_program_order_info(self):
-        logging.debug(f"=" * 80)
-        logging.debug(f"WORKSET:          {self.get_workset()}")
-        logging.debug(f"COMMITTED:        {self.get_committed_list()}")
-        logging.debug(f"FRONTIER:         {self.frontier}")
-        logging.debug(f"EXECUTING:        {list(self.commands_currently_executing.keys())}")
-        logging.debug(f"STOPPED:          {list(self.stopped)}")
-        logging.debug(f" of which UNSAFE: {list(self.get_unsafe())}")
-        logging.debug(f"WAITING:          {sorted(list(self.speculated))}")
-        logging.debug(f"for FRONTEND:     {sorted(list(self.waiting_for_frontend))}")
-        logging.debug(f"TO RESOLVE:       {self.to_be_resolved}")
-        logging.debug(f"PENDING TO EXEC:  {self.pending_to_execute}")
-        logging.debug(f"RUN AFTER:        {self.run_after}")
-        logging.debug(f"New envs:         {self.new_envs}")
-        logging.debug(f"Latest envs:      {self.latest_envs}")
-        self.log_rw_sets()
-        logging.debug(f"=" * 80)
-
-    ## TODO: Document how this finds the to be resolved dict
-    def populate_to_be_resolved_dict(self):
-        logging.debug("Populating the resolved dictionary for all nodes")
-        for node_id in self.nodes:
-            if self.is_committed(node_id):
-                logging.debug(f" > Node: {node_id} is committed, emptying its dict")
-                self.to_be_resolved[node_id] = []
-                continue
-            # We don't want to modify the set of nodes to check for dependencies for this node
-            # as it started running before previous cmds had started executing
-            elif node_id in self.speculated:
-                logging.debug(f" > Node: {node_id} is waiting to be resolved, skipping...")
-                continue
-            elif node_id in self.get_currently_executing():
-                logging.debug(f" > Node: {node_id} is currently executing, skipping...")
-                continue
-            elif node_id in self.waiting_for_frontend:
-                logging.debug(f" > Node: {node_id} is currently waiting for frontend, skipping...")
-                continue
-            else:
-                logging.debug(f" > Node: {node_id} is not executing or waiting to be resolved (speculated) so we modify its set.")
-                self.to_be_resolved[node_id] = []
-                traversal = []
-                relevant_committed = self.get_committed()
-                if node_id not in relevant_committed:
-                    to_add = self.get_prev(node_id).copy()
-                    traversal = to_add.copy()
-                    to_be_resolved_nodes_ids = to_add.copy()
-                while len(traversal) > 0:
-                    current_node_id = traversal.pop(0)
-                    if current_node_id not in relevant_committed:
-                        to_add = self.get_prev(current_node_id)
-                        to_be_resolved_nodes_ids.extend(to_add)
-                        traversal.extend(to_add)
-                self.to_be_resolved[node_id] = to_be_resolved_nodes_ids.copy()
-                self.to_be_resolved[node_id] = list(set(self.to_be_resolved[node_id]) - set(relevant_committed))
-                logging.debug(f' |> New to be resolved set: {self.to_be_resolved[node_id]}')
-
-    def get_currently_executing(self) -> list:
-        return sorted(list(self.commands_currently_executing.keys()))
-
-    def log_executions(self):
-        logging.debug("---------- (Re)executions ------------")
-        for cmd in sorted(self.get_committed_list()):
-            logging.debug(f" CMD {cmd} executed {self.executions[cmd]} times")
-            logging.debug(f"Executions|{cmd},{self.executions[cmd]}")
-        logging.debug(f" Total (re)executions: {sum(list(self.executions.values()))}")
-        logging.debug(f"TotalExec|{sum(list(self.executions.values()))}")
-        logging.debug("--------------------------------------")
-
-
-## TODO: Try to move those to PaSh and import them here
-def parse_cmd_from_file(file_path: str) -> "tuple[str,list[AstNode]]":
-    logging.debug(f'Parsing: {file_path}')
-    with open(file_path) as f:
-        cmd = f.read()
-    asts = analysis.parse_shell_to_asts(file_path)
-    return cmd, asts
-
-def parse_edge_line(line: str) -> "tuple[int, int]":
-    from_str, to_str = line.split(" -> ")
-    return (int(from_str), int(to_str))
-
-def parse_loop_context_line(line: str) -> "tuple[int, list[int]]":
-    node_id, loop_contexts_raw = line.split("-loop_ctx-")
-    if loop_contexts_raw != "":
-        loop_contexts_str = loop_contexts_raw.split(",")
-        loop_contexts = [int(loop_ctx) for loop_ctx in loop_contexts_str]
-    else:
-        loop_contexts = []
-    return int(node_id), loop_contexts
-
-def parse_loop_contexts(lines):
-    loop_contexts = {}
-    for line in lines:
-        node_id, loop_ctx = parse_loop_context_line(line)
-        loop_contexts[node_id] = loop_ctx
-
-    return loop_contexts
-
-def parse_partial_program_order_from_file(file_path: str) -> PartialProgramOrder:
-    with open(file_path) as f:
-        raw_lines = f.readlines()
-
-    ## Filter comments and remove new lines
-    lines = [line.rstrip() for line in raw_lines
-             if not line.startswith("#")]
-
-    ## The directory in which cmd_files are
-    cmds_directory = str(lines[0])
-    logging.debug(f'Cmds are stored in: {cmds_directory}')
-
-    ## The initial env file
-    initial_env_file = str(lines[1])
-
-    ## The number of nodes
-    number_of_nodes = int(lines[2])
-    logging.debug(f'Number of po cmds: {number_of_nodes}')
-
-    ## The loop context for each node
-    loop_context_start=3
-    loop_context_end=number_of_nodes+3
-    loop_context_lines = lines[loop_context_start:loop_context_end]
-    loop_contexts = parse_loop_contexts(loop_context_lines)
-    logging.debug(f'Loop contexts: {loop_contexts}')
-
-    ## The rest of the lines are edge_lines
-    edge_lines = lines[loop_context_end:]
-    logging.debug(f'Edges: {edge_lines}')
-
-    nodes = {}
-    for i in range(number_of_nodes):
-        file_path = f'{cmds_directory}/{i}'
-        cmd, asts = parse_cmd_from_file(file_path)
-        loop_ctx = loop_contexts[i]
-        nodes[NodeId(i)] = Node(NodeId(i), cmd,
-                                asts=asts,
-                                loop_context=LoopStack(loop_ctx))
+    def get_committed_nodes(self):  # every node whose is_committed() holds
+        return [n for n in self.nodes.values() if n.is_committed()]
+    
+    def get_ready_nodes(self):  # every node whose is_ready() holds
+        return [n for n in self.nodes.values() if n.is_ready()]
+    
+    def get_executing_nodes(self):  # every node whose is_executing() holds
+        return [n for n in self.nodes.values() if n.is_executing()]
+    
+    def get_spec_executing_nodes(self):  # every node whose is_spec_executing() holds
+        return [n for n in self.nodes.values() if n.is_spec_executing()]
+    
+    def get_executing_normal_and_speculated_nodes(self):  # union of the executing and spec-executing nodes
+        return [n for n in self.nodes.values() if n.is_executing() or n.is_spec_executing()]
+    
+    def get_speculated_nodes(self):  # every node whose is_speculated() holds
+        return [n for n in self.nodes.values() if n.is_speculated()]
+    
+    def get_uncommitted_nodes(self):  # complement of get_committed_nodes()
+        return [n for n in self.nodes.values() if not n.is_committed()]
 
-    edges = {NodeId(i) : [] for i in range(number_of_nodes)}
-    for edge_line in edge_lines:
-        from_id, to_id = parse_edge_line(edge_line)
-        edges[NodeId(from_id)].append(NodeId(to_id))
+    def log_state(self):  # emit one INFO record per node: its id and its state
+        for node in self.nodes.values():
+            logging.info(f"Node {node.id}: {node.state}")
 
-    logging.trace(f"Nodes|{','.join([str(node) for node in nodes])}")
-    logging.trace(f"Edges|{edges}")
-    return PartialProgramOrder(nodes, edges, initial_env_file)
+    def schedule_work(self):
+        pass  # placeholder: this version performs no scheduling
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index 436eb642..e04b2e92 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -1,9 +1,9 @@
 import argparse
 import logging
 import signal
-from util import *
+import util
 import config
-from partial_program_order import parse_partial_program_order_from_file, LoopStack, NodeId, parse_node_id
+from partial_program_order import PartialProgramOrder, NodeId
 
 ##
 ## A scheduler server
@@ -67,7 +67,7 @@ class Scheduler:
     def __init__(self, socket_file):
         ## TODO: Add all the orchestrator state here (it should just be the partial order)
         self.done = False
-        self.socket = init_unix_socket(socket_file)
+        self.socket = util.init_unix_socket(socket_file)
         ## A map containing connections for node_ids that are waiting for a response
         self.waiting_for_response = {}
         self.partial_program_order = None
@@ -76,69 +76,83 @@ def handle_init(self, input_cmd: str):
         assert(input_cmd.startswith("Init"))
         partial_order_file = input_cmd.split(":")[1].rstrip()
         logging.debug(f'Scheduler: Received partial_order_file: {partial_order_file}')
-        self.partial_program_order = parse_partial_program_order_from_file(partial_order_file)
+        self.partial_program_order = util.parse_partial_program_order_from_file(partial_order_file)
         self.partial_program_order.init_partial_order()
 
+
+    def process_next_cmd(self):
+        """Read one message from the socket and dispatch on its prefix; raise on unknown input."""
+        connection, input_cmd = util.socket_get_next_cmd(self.socket)
+        if(input_cmd.startswith("Init")):
+            connection.close()
+            self.handle_init(input_cmd)
+            ## TODO: Read the partial order from the given file
+        elif (input_cmd.startswith("Daemon Start") or input_cmd == ""):
+            logging.info(f'Scheduler: Received daemon start message.')
+            connection.close()
+        elif (input_cmd.startswith("CommandExecComplete:")):
+            node_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
+            logging.info(f'Scheduler: Received command exec complete message - {node_id}.')
+            connection.close()
+        elif (input_cmd.startswith("Wait")):
+            node_id, _ = self.__parse_wait(input_cmd)
+            self.waiting_for_response[node_id] = connection
+            logging.info(f'Scheduler: Received wait message - {node_id}.')
+            ## respond_to_pending_wait answers on the parked connection and closes it
+            self.respond_to_pending_wait(node_id)
+        elif (input_cmd.startswith("Done")):
+            # if not self.partial_program_order.is_completed():
+            #     logging.debug(" |- some nodes were skipped completed.")
+            util.socket_respond(connection, success_response("All finished!"))
+            self.partial_program_order.log_state()
+            self.done = True
+        elif input_cmd.startswith("CommandExecStart:"):
+            node_id, _exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
+            logging.info(f'Scheduler: Received command exec start message - {input_cmd}.')
+            # self.handle_command_exec_start(input_cmd)
+        else:
+            logging.error(error_response(f'Error: Unsupported command: {input_cmd}'))
+            raise Exception(f'Error: Unsupported command: {input_cmd}')
+
+    def respond_to_frontend_core(self, node_id: NodeId, response: str):
+        """Send `response` on the connection parked for `node_id`, then close that connection."""
+        assert node_id in self.waiting_for_response
+        parked_conn = self.waiting_for_response.pop(node_id)
+        util.socket_respond(parked_conn, response)
+        parked_conn.close()
+
+    def respond_to_pending_wait(self, node_id: int):
+        """Answer the parked Wait of `node_id` with its exit code, post-execution env file and stdout file."""
+        logging.debug(f'Responding to pending wait for node: {node_id}')
+        # George: Currently I don't init the sandbox info anywhere since there is no execution
+        sandbox_info = self.partial_program_order.get_node(node_id).get_main_sandbox()
+        exit_code = sandbox_info.get_exit_code()
+        env_file = sandbox_info.get_post_execution_env_file()
+        stdout_file = sandbox_info.get_stdout_file()
+        reply = success_response(f'{exit_code} {env_file} {stdout_file}')
+        self.respond_to_frontend_core(node_id, reply)
+
     def __parse_wait(self, input_cmd: str) -> "tuple[NodeId, str]":
         try:
             node_id_component, loop_iter_counter_component, pash_runtime_vars_file_component = input_cmd.rstrip().split("|")
             raw_node_id_int = int(node_id_component.split(":")[1].rstrip())
             loop_counters_str = loop_iter_counter_component.split(":")[1].rstrip()
             pash_runtime_vars_file_str = pash_runtime_vars_file_component.split(":")[1].rstrip()
-            if loop_counters_str == "None":
-                node_id = NodeId(raw_node_id_int), pash_runtime_vars_file_str
-            else:
-                loop_counters = [int(cnt) for cnt in loop_counters_str.split("-")]
-                node_id = NodeId(raw_node_id_int, LoopStack(loop_counters)), pash_runtime_vars_file_str           
+            # TODO Implement loops correctly
+            # if loop_counters_str == "None":
+            #     node_id = NodeId(raw_node_id_int), pash_runtime_vars_file_str
+            # else:
+            #     loop_counters = [int(cnt) for cnt in loop_counters_str.split("-")]
+            #     node_id = NodeId(raw_node_id_int, LoopStack(loop_counters)), pash_runtime_vars_file_str      
+            node_id = NodeId(raw_node_id_int), pash_runtime_vars_file_str     
             return node_id
         except:
             raise Exception(f'Parsing failure for line: {input_cmd}')
-
-    def handle_wait(self, input_cmd: str, connection):
-        assert(input_cmd.startswith("Wait"))
-        ## We have received this message by the JIT, which waits for a node_id to
-        ## finish execution.
-        node_id, pash_runtime_vars_file_str = self.__parse_wait(input_cmd)        
-        logging.debug(f'Scheduler: Received wait for node_id: {node_id}|New env file: {pash_runtime_vars_file_str}')
-
-        ## Set the new env file for the node
-        self.partial_program_order.set_new_env_file_for_node(node_id, pash_runtime_vars_file_str)
         
-        if self.partial_program_order.is_first_node_when_env_is_uninitialized(config.SPECULATE_IMMEDIATELY):
-            logging.debug("Initializing latest env and speculating")
-            self.partial_program_order.init_latest_env_files(node_id)
-        
-        ## Attempt to rerun all pending nodes
-        self.partial_program_order.attempt_rerun_pending_nodes()
-
-        ## Inform the partial order that we received a wait for a node so that it can push loops
-        ## forward and so on.
-        self.partial_program_order.maybe_unroll(node_id)
-        
-        # Moved this below wait_received, in order to support unrolled loop nodes
-        self.partial_program_order.maybe_resolve_most_recent_envs_and_continue_resolution(node_id)
-        
-        self.partial_program_order.wait_received(node_id)
-
-        ## If the node_id is already committed, just return its exit code
-        if node_id in self.partial_program_order.get_committed():
-            logging.debug(f'Node: {node_id} found in committed, responding immediately!')
-            self.waiting_for_response[node_id] = connection
-            self.respond_to_pending_wait(node_id)
-        elif node_id in self.partial_program_order.get_unsafe():
-            logging.debug(f'Node: {node_id} found in unsafe, it must be executed in the original shell!')
-            self.waiting_for_response[node_id] = connection
-            self.respond_unsafe_to_pending_wait(node_id)
-        else:
-            ## Command has not executed yet, so we need to wait for it
-            logging.debug(f'Node: {node_id} has not finished execution, waiting for response...')
-            self.waiting_for_response[node_id] = connection
-
-
     def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]":
         try:
             components = input_cmd.rstrip().split("|")
-            command_id = parse_node_id(components[0].split(":")[1])
+            command_id = NodeId.parse_node_id(components[0].split(":")[1])
             exit_code = int(components[1].split(":")[1])
             sandbox_dir = components[2].split(":")[1]
             trace_file = components[3].split(":")[1]
@@ -146,126 +160,10 @@ def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]":
         except:
             raise Exception(f'Parsing failure for line: {input_cmd}')
 
-    def respond_unsafe_to_pending_wait(self, node_id: int):
-        assert(node_id in self.partial_program_order.get_unsafe())
 
-        ## First remove node_id from unsafe and stopped and add to committed
-        ##  since it will be executed immediately in the original shell
-        self.partial_program_order.remove_from_unsafe(node_id)
-        self.partial_program_order.commit_node(node_id)
-
-        response = unsafe_response("")
-
-        ## Send the response
-        self.respond_to_frontend_core(node_id, response)
-
-
-    ## TODO: send riker env here
-    def respond_to_pending_wait(self, node_id: int):
-        logging.debug(f'Responding to pending wait for node: {node_id}')
-        ## Get the completed node info
-        node = self.partial_program_order.get_node(node_id)
-        completed_node_info = node.get_completed_node_info()
-        msg = f'{completed_node_info.get_exit_code()} {completed_node_info.get_post_execution_env_file()} {completed_node_info.get_stdout_file()}'
-        response = success_response(msg)
-        ## Send the response
-        self.respond_to_frontend_core(node_id, response)
-
-
-    def respond_to_frontend_core(self, node_id: NodeId, response: str):
-        assert(node_id in self.waiting_for_response)
-        ## Get the connection that we need to respond to
-        connection = self.waiting_for_response.pop(node_id)
-        socket_respond(connection, response)
-        connection.close()
-
-    def handle_command_exec_start(self, input_cmd):
-        assert(input_cmd.startswith("CommandExecStart:"))
-        cmd_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
-        self.partial_program_order.set_sandbox(cmd_id, sandbox_dir)
-        
-    def handle_command_exec_complete(self, input_cmd: str):
-        assert(input_cmd.startswith("CommandExecComplete:"))
-        ## Read the node id from the command argument
-        cmd_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
-        if trace_file in self.partial_program_order.banned_files:
-            logging.debug(f'CommandExecComplete: {cmd_id} ignored')
-            return
-        ## Gather RWset, resolve dependencies, and progress graph
-        self.partial_program_order.command_execution_completed(cmd_id, exit_code, sandbox_dir)
-
-        ## If there is a connection waiting for this node_id, respond to it
-        if cmd_id in self.waiting_for_response and cmd_id in self.partial_program_order.get_committed():
-            self.respond_to_pending_wait(cmd_id)
-
-    def process_next_cmd(self):
-        connection, input_cmd = socket_get_next_cmd(self.socket)
-
-        if(input_cmd.startswith("Init")):
-            log_time_delta_from_start_and_set_named_timestamp("Scheduler", "PartialOrderInit")
-            connection.close()
-            self.handle_init(input_cmd)
-            ## TODO: Read the partial order from the given file
-            log_time_delta_from_named_timestamp("Scheduler", "PartialOrderInit")
-        elif (input_cmd.startswith("Daemon Start") or input_cmd == ""):
-            log_time_delta_from_start_and_set_named_timestamp("Scheduler", "DaemonStart")
-            connection.close()
-            ## This happens when pa.sh first connects to daemon to see if it is on
-            logging.debug(f'PaSh made first contact with scheduler server.')
-            log_time_delta_from_named_timestamp("Scheduler", "DaemonStart")
-        elif (input_cmd.startswith("CommandExecComplete:")):
-            log_time_delta_from_start_and_set_named_timestamp("Scheduler", "CommandExecComplete")
-            ## We have received this message from an a runner (tracer +isolation)
-            ## The runner should have already parsed RWsets and serialized them to
-            ## a file.
-            connection.close()
-            self.handle_command_exec_complete(input_cmd)
-            log_time_delta_from_named_timestamp("Scheduler", "CommandExecComplete")
-        elif (input_cmd.startswith("Wait")):
-            log_time_delta_from_start_and_set_named_timestamp("Scheduler", "Wait")
-            self.handle_wait(input_cmd, connection)
-            log_time_delta_from_named_timestamp("Scheduler", "Wait")
-        elif (input_cmd.startswith("Done")):
-            log_time_delta_from_start_and_set_named_timestamp("Scheduler", "Done")
-            logging.debug(f'Scheduler server received shutdown message.')
-            logging.debug(f'The partial order was successfully completed.')
-            if not self.partial_program_order.is_completed():
-                logging.debug(" |- some nodes were skipped completed.")
-            socket_respond(connection, success_response("All finished!"))
-            self.partial_program_order.log_executions()
-            self.done = True
-            log_time_delta_from_named_timestamp("Scheduler", "Done")
-        elif input_cmd.startswith("CommandExecStart:"):
-            #TODO: add logging stuff
-            self.handle_command_exec_start(input_cmd)
-        else:
-            logging.error(error_response(f'Error: Unsupported command: {input_cmd}'))
-            raise Exception(f'Error: Unsupported command: {input_cmd}')
-
-    def check_unsafe_and_waiting(self):
-        ## If a command is waiting and also deemed to be unsafe, we need to respond
-        waiting_for_response = set(self.waiting_for_response.keys())
-        unsafe = set(self.partial_program_order.get_unsafe())
-        unsafe_and_waiting = unsafe.intersection(waiting_for_response)
-        if len(unsafe_and_waiting) > 0:
-            assert(len(unsafe_and_waiting) == 1)
-            logging.debug(f'Unsafe and waiting for response nodes: {unsafe_and_waiting}')
-            logging.debug(f'Sending responses to them: {unsafe_and_waiting}')
-            unsafe_and_waiting_id = list(unsafe_and_waiting)[0]
-            self.respond_unsafe_to_pending_wait(unsafe_and_waiting_id)
-
-    ## This function schedules commands for execution until our capacity is reached
-    ##
-    ## It should add some work (if possible), and then return immediately.
-    ## It is called once per loop iteration, making sure that there is always work happening
     def schedule_work(self):
-        log_time_delta_from_start_and_set_named_timestamp("Scheduler", "ScheduleWork")
         self.partial_program_order.schedule_work()
 
-        ## Respond to any waiting nodes that have been deemed to be unsafe
-        self.check_unsafe_and_waiting()
-        log_time_delta_from_named_timestamp("Scheduler", "ScheduleWork")
-
     def run(self):
         ## The first command should be the daemon start
         self.process_next_cmd()
@@ -273,17 +171,9 @@ def run(self):
         ## The second command should be the partial order init
         self.process_next_cmd()
         
-
         while not self.done:
-            # TODO: wrap this around something probably
-            self.partial_program_order.early_stop_using_dep()
-
-            ## Schedule some work (if we are already at capacity this will return immediately)
             self.schedule_work()
-            ## Process a single request
             self.process_next_cmd()
-            # If workset is empty we should end.
-            # TODO: ec checks fail for now
 
         self.socket.close()
         self.shutdown()
@@ -295,13 +185,12 @@ def shutdown(self):
         self.terminate_pending_commands()
         
     def terminate_pending_commands(self):
-        for _node_id, cmd_info in self.partial_program_order.commands_currently_executing.items():
-            proc, _trace_file, _stdout, _stderr, _variable_file, _ = cmd_info
-            proc.terminate()
-
+        for node in self.partial_program_order.get_executing_normal_and_speculated_nodes():
+            proc, _trace_file, _stdout, _stderr, _variable_file, _ = node.get_main_sandbox()
+            logging.debug(f'Killing: {proc}')
+            # proc.terminate()
 
 def main():
-    log_time_delta_from_start("Scheduler", "Scheduler Init")
     args = init()
 
     # Format logging
@@ -318,9 +207,7 @@ def main():
         logging.getLogger().setLevel(logging.INFO)
     elif args.debug_level >= 2:
         logging.getLogger().setLevel(logging.DEBUG)
-    # elif args.debug_level >= 3:
-    #     logging.getLogger().setLevel(logging.TRACE)
-    
+
     # Set optimization options
     config.SANDBOX_KILLING = args.sandbox_killing
     config.SPECULATE_IMMEDIATELY = args.speculate_immediately
diff --git a/parallel-orch/util.py b/parallel-orch/util.py
index 799cd379..af01273f 100644
--- a/parallel-orch/util.py
+++ b/parallel-orch/util.py
@@ -8,6 +8,9 @@
 import re
 import psutil
 import signal
+import analysis
+from node import Node, NodeId
+from partial_program_order import PartialProgramOrder
 
 def ptempfile():
     fd, name = tempfile.mkstemp(dir=config.PASH_SPEC_TMP_PREFIX)
@@ -176,3 +179,82 @@ def kill_process_tree(pid, sig=signal.SIGTERM):
         except:
             pass
     return alive_processes
+
+
+## TODO: Try to move those to PaSh and import them here
+def parse_cmd_from_file(file_path: str) -> "tuple[str,list[AstNode]]":
+    logging.debug(f'Parsing: {file_path}')
+    with open(file_path) as f:
+        cmd = f.read()
+    asts = analysis.parse_shell_to_asts(file_path)
+    return cmd, asts
+
+def parse_edge_line(line: str) -> "tuple[int, int]":
+    from_str, to_str = line.split(" -> ")
+    return (int(from_str), int(to_str))
+
+def parse_loop_context_line(line: str) -> "tuple[int, list[int]]":
+    node_id, loop_contexts_raw = line.split("-loop_ctx-")
+    if loop_contexts_raw != "":
+        loop_contexts_str = loop_contexts_raw.split(",")
+        loop_contexts = [int(loop_ctx) for loop_ctx in loop_contexts_str]
+    else:
+        loop_contexts = []
+    return int(node_id), loop_contexts
+
+def parse_loop_contexts(lines):
+    loop_contexts = {}
+    for line in lines:
+        node_id, loop_ctx = parse_loop_context_line(line)
+        loop_contexts[node_id] = loop_ctx
+    return loop_contexts
+
+
+def parse_partial_program_order_from_file(file_path: str):
+    with open(file_path) as f:
+        raw_lines = f.readlines()
+
+    ## Filter comments and remove new lines
+    lines = [line.rstrip() for line in raw_lines
+            if not line.startswith("#")]
+
+    ## The directory in which cmd_files are
+    cmds_directory = str(lines[0])
+    logging.debug(f'Cmds are stored in: {cmds_directory}')
+
+    ## The initial env file
+    initial_env_file = str(lines[1])
+
+    ## The number of nodes
+    number_of_nodes = int(lines[2])
+    logging.debug(f'Number of po cmds: {number_of_nodes}')
+
+    ## The loop context for each node
+    loop_context_start=3
+    loop_context_end=number_of_nodes+3
+    loop_context_lines = lines[loop_context_start:loop_context_end]
+    loop_contexts = parse_loop_contexts(loop_context_lines)
+    logging.debug(f'Loop contexts: {loop_contexts}')
+
+    ## The rest of the lines are edge_lines
+    edge_lines = lines[loop_context_end:]
+    logging.debug(f'Edges: {edge_lines}')
+
+    nodes = {}
+    for i in range(number_of_nodes):
+        file_path = f'{cmds_directory}/{i}'
+        cmd, asts = parse_cmd_from_file(file_path)
+        # loop_ctx = loop_contexts[i]
+        # nodes[NodeId(i)] = Node(NodeId(i), cmd,
+        #                         asts=asts,
+        #                         loop_context=LoopStack(loop_ctx))
+        nodes[NodeId(i)] = Node(NodeId(i), cmd, asts=asts)
+
+    edges = {NodeId(i) : [] for i in range(number_of_nodes)}
+    for edge_line in edge_lines:
+        from_id, to_id = parse_edge_line(edge_line)
+        edges[NodeId(from_id)].append(NodeId(to_id))
+
+    logging.info(f"Nodes|{','.join([str(node) for node in nodes])}")
+    logging.info(f"Edges|{edges}")
+    return PartialProgramOrder(nodes, edges)
\ No newline at end of file

From c82cf3b7cfb5c54bca5620065884dd6379371a54 Mon Sep 17 00:00:00 2001
From: George Liargkovas <gliargovas@aueb.gr>
Date: Fri, 12 Jan 2024 00:01:13 +0200
Subject: [PATCH 13/39] Progress the refactor

---
 parallel-orch/node.py                  |   9 +-
 parallel-orch/partial_program_order.py | 134 ++++++++++++++++++++++++-
 parallel-orch/scheduler_server.py      |  10 +-
 3 files changed, 149 insertions(+), 4 deletions(-)

diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index 616e1b58..1401b988 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -136,6 +136,11 @@ def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"):
         
         self.background_sandbox = None
 
+    def __str__(self):
+        return f'Node(id:{self.id}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, main_sandbox:{self.main_sandbox}, background_sandbox:{self.background_sandbox})'
+    
+    def __repr__(self):
+        return str(self)
 
     def is_initialized(self):
         return self.state == NodeState.INIT
@@ -169,11 +174,13 @@ def get_main_sandbox(self):
     ##          Transition Functions        ##
     ##                                      ##
     
-    def transition_to_ready(self):
+    def transition_from_init_to_ready(self):
         assert self.state == NodeState.INIT
         self.state = NodeState.READY
         # Initialize data structures here
 
+        # Also, probably unroll here?
+
     def transition_to_executing(self):
         assert self.state == NodeState.READY
         self.state = NodeState.EXECUTING
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index e4728d36..23d41589 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -1,5 +1,6 @@
 from node import NodeId, Node
 import logging
+from collections import deque
 
 
 class PartialProgramOrder:
@@ -20,9 +21,24 @@ def __init__(self, nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId
         self.window = 0
         self.to_be_resolved = {} 
         
+    # def init_partial_order(self):
+    #     self.init_workset()
+    #     logging.debug(f'Initialized workset')
+    #     self.populate_to_be_resolved_dict()
+    #     if config.SPECULATE_IMMEDIATELY:
+    #         self.init_latest_env_files()
+    #     logging.debug(f'To be resolved sets per node:')
+    #     logging.debug(self.to_be_resolved)
+    #     logging.info(f'Initialized the partial order!')
+    #     # self.log_partial_program_order_info()
+    #     assert(self.valid())
+    
     def init_partial_order(self):
         for node_id, node in self.nodes.items():
-            node.transition_to_ready()
+            if node.is_initialized():
+                node.transition_from_init_to_ready()
+                
+        self.frontier = self.get_standard_source_nodes()
         # TODO: Implement the rest of the partial order initialization
 
     def commit_node(self, node):
@@ -61,6 +77,15 @@ def get_speculated_nodes(self):
     
     def get_uncommitted_nodes(self):
         return [node for node in self.nodes.values() if not node.is_committed()]
+    
+    def get_frontier(self):
+        return self.frontier
+
+    def log_info(self):
+        logging.info(f"Nodes: {self.nodes}")
+        logging.info(f"Adjacency: {self.adjacency}")
+        logging.info(f"Inverse adjacency: {self.inverse_adjacency}")
+        self.log_state()
 
     def log_state(self):
         for node in self.nodes.values():
@@ -68,3 +93,110 @@ def log_state(self):
 
     def schedule_work(self):
         pass
+    
+    def get_source_nodes(self) -> list:
+        sources = set()
+        for to_id, from_ids in self.inverse_adjacency.items():
+            if len(from_ids) == 0:
+                sources.add(to_id)
+        return list(sources)
+    
+    ## Returns the next non-committed normal node
+    def progress_frontier(self) -> "list[NodeId]":
+        return self.get_next_frontier_nodes(self.get_frontier())
+
+    def get_next_nodes(self, node_id:NodeId) -> "list[NodeId]":
+        return self.adjacency[node_id][:]
+
+    def get_prev_nodes(self, node_id:NodeId) -> "list[NodeId]":
+        return self.inverse_adjacency[node_id][:]
+    
+    def get_source_nodes(self) -> list:
+        sources = set()
+        for to_id, from_ids in self.inverse_adjacency.items():
+            if len(from_ids) == 0:
+                sources.add(to_id)
+        return list(sources)
+    
+    def get_standard_source_nodes(self) -> list:
+        source_nodes = self.get_source_nodes()
+        # TODO: Filter out loop nodes
+        # return self.filter_standard_nodes(source_nodes)
+        return source_nodes
+    
+    
+
+
+    def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]":
+        # TODO: filter non-loop nodes
+        visited = set()
+        to_visit = [(node_id, 0) for node_id in start_nodes]  # Pair each start node with depth 0
+        non_committed_nodes = set()
+        first_non_committed_depth = None
+
+        while to_visit:
+            current_node_id, depth = to_visit.pop()
+            if current_node_id in visited:
+                continue
+
+            visited.add(current_node_id)
+            current_node = self.nodes.get(current_node_id)
+
+            if not current_node.is_committed():
+                if first_non_committed_depth is None:
+                    first_non_committed_depth = depth
+                elif depth > first_non_committed_depth:
+                    # Do not consider nodes deeper than the first non-committed depth
+                    continue
+
+                non_committed_nodes.add(current_node_id)
+
+            if first_non_committed_depth is None or depth < first_non_committed_depth:
+                next_nodes = self.get_next_nodes(current_node_id)  # Use the provided method to get next nodes
+                for neighbor in next_nodes:
+                    if neighbor not in visited:
+                        to_visit.append((neighbor, depth + 1))  # Increase depth for neighbors
+
+        return non_committed_nodes
+    
+    def get_all_next(self, current_node_id: NodeId, visited=None) -> "set[NodeId]":
+        if visited is None:
+            visited = set()
+        visited.add(current_node_id)
+
+        all_next_nodes = set([current_node_id])
+        for neighbor in self.get_next_nodes(current_node_id):
+            if neighbor not in visited:
+                all_next_nodes.update(self.get_all_next(neighbor, visited))
+
+        return all_next_nodes
+
+
+    def get_all_previous(self, current_node_id: NodeId, visited=None) -> "set[NodeId]":
+        if visited is None:
+            visited = set()
+        visited.add(current_node_id)
+
+        all_previous_nodes = set([current_node_id])
+        for neighbor in self.get_prev_nodes(current_node_id):
+            if neighbor not in visited:
+                all_previous_nodes.update(self.get_all_previous(neighbor, visited))
+
+        return all_previous_nodes
+    
+    def get_all_previous_uncommitted(self, node_id: NodeId) -> "set[NodeId]":
+        previous = self.get_all_previous(node_id)
+        return set([node for node in previous if not self.nodes[node].is_committed()])
+        
+    
+    def init_to_be_resolved_dict(self):
+        for node_id in self.nodes.keys():
+            self.to_be_resolved[node_id] = ...
+        
+    def init_to_be_resolved_dict(self):
+        for node_id in self.nodes.keys():
+            self.to_be_resolved[node_id] = ...
+            
+    
+    def adjust_to_be_resolved_dict_entry(self, node_id: NodeId):
+        
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index e04b2e92..e1de16fa 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -104,8 +104,12 @@ def process_next_cmd(self):
             # if not self.partial_program_order.is_completed():
             #     logging.debug(" |- some nodes were skipped completed.")
             util.socket_respond(connection, success_response("All finished!"))
-            self.partial_program_order.log_state()
+            self.partial_program_order.log_info()
             self.done = True
+            nodes = self.partial_program_order.nodes
+            for k, v in nodes.items():
+                logging.info(self.partial_program_order.progress_frontier())
+                logging.info(f"{k} {self.partial_program_order.get_next_frontier_nodes([k])}")
         elif input_cmd.startswith("CommandExecStart:"):
             node_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
             logging.info(f'Scheduler: Received command exec start message - {input_cmd}.')
@@ -127,8 +131,10 @@ def respond_to_pending_wait(self, node_id: int):
         node = self.partial_program_order.get_node(node_id)
         completed_node_info = node.get_main_sandbox()
         # George: Currently I don't init the sandbox info anywhere since there is no execution
-        msg = f'{completed_node_info.get_exit_code()} {completed_node_info.get_post_execution_env_file()} {completed_node_info.get_stdout_file()}'
+        # msg = f'{completed_node_info.get_exit_code()} {completed_node_info.get_post_execution_env_file()} {completed_node_info.get_stdout_file()}'
+        msg = "0 foo bar bax qux"
         response = success_response(msg)
+        
         ## Send the response
         self.respond_to_frontend_core(node_id, response)
 

From 6b3140a373fb716efed5ad662ebf5a0a204d1274 Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Fri, 12 Jan 2024 01:29:28 -0500
Subject: [PATCH 14/39] basic sequential execution

---
 parallel-orch/node.py                  | 55 +++++++++++++++++++++-----
 parallel-orch/partial_program_order.py | 13 +++---
 parallel-orch/scheduler_server.py      | 19 +++++----
 3 files changed, 63 insertions(+), 24 deletions(-)

diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index 1401b988..0454a9a1 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -1,3 +1,7 @@
+import executor
+from dataclasses import dataclass
+from subprocess import Popen
+from typing import Tuple
 from enum import Enum, auto
 
 class NodeState(Enum):
@@ -102,9 +106,23 @@ def __gt__(self, obj):
     def parse_node_id(node_id_str: str):
         return NodeId(int(node_id_str))
 
-
+@dataclass
+class ExecCtxt:
+    process: Popen
+    trace_file: str
+    stdout: str
+    stderr: str
+    post_env_file: str
+    sandbox_dir: str
+
+@dataclass
+class ExecResult:
+    exit_code: int
+    proc_id: int
+    
+    
 class Node:
-    id: NodeId
+    id_: NodeId
     cmd: str
     asts: "list[AstNode]"
     state: NodeState
@@ -119,25 +137,27 @@ class Node:
     main_sandbox: Sandbox
     # This can only be set while in the frontier and the background node execution is enabled
     background_sandbox: Sandbox
-    
+    exec_ctxt: ExecCtxt
+    exec_result: ExecResult
     
     def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"):
-        self.id = node_id
+        self.id_ = node_id
         self.cmd = cmd
         self.asts = asts
         # The node's state
         self.state = NodeState.INIT
         self.tracefile = None
         self.rwset = None
-        # The 
+        # The
         self.to_be_resolved_snapshot = None
         
         self.main_sandbox = None
         
         self.background_sandbox = None
+        self.exec_ctxt = None
 
     def __str__(self):
-        return f'Node(id:{self.id}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, main_sandbox:{self.main_sandbox}, background_sandbox:{self.background_sandbox})'
+        return f'Node(id:{self.id_}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, main_sandbox:{self.main_sandbox}, background_sandbox:{self.background_sandbox})'
     
     def __repr__(self):
         return str(self)
@@ -169,7 +189,18 @@ def is_unsafe(self):
     def get_main_sandbox(self):
         return self.main_sandbox
     
-    
+
+    def start_command(self, env_file: str, speculate=False):
+        # TODO: implement speculate
+        # TODO: built-in commands
+        cmd = self.cmd
+        execute_func = executor.async_run_and_trace_command_return_trace
+        self.exec_ctxt = ExecCtxt(*execute_func(cmd, self.id_, env_file))
+
+    def execution_outcome(self) -> Tuple[int, str, str]:
+        assert self.exec_result is not None
+        return self.exec_result.exit_code, self.exec_ctxt.post_env_file, self.exec_ctxt.stdout
+        
     ##                                      ##
     ##          Transition Functions        ##
     ##                                      ##
@@ -181,11 +212,17 @@ def transition_from_init_to_ready(self):
 
         # Also, probably unroll here?
 
-    def transition_to_executing(self):
+    def start_executing(self, env_file):
         assert self.state == NodeState.READY
+        self.start_command(env_file)
         self.state = NodeState.EXECUTING
-        # TODO
 
+    def commit_frontier_execution(self):  # EXECUTING -> COMMITTED: record the process result, then commit its workspace
+        assert self.state == NodeState.EXECUTING
+        self.state = NodeState.COMMITTED
+        self.exec_result = ExecResult(exit_code=self.exec_ctxt.process.returncode, proc_id=self.exec_ctxt.process.pid)  # by keyword: fields are (exit_code, proc_id)
+        executor.commit_workspace(self.exec_ctxt.sandbox_dir)
+        
     def transition_to_spec_executing(self):
         assert self.state == NodeState.READY
         self.state = NodeState.SPEC_EXECUTING
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index 23d41589..e4e8d3ed 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -89,10 +89,10 @@ def log_info(self):
 
     def log_state(self):
         for node in self.nodes.values():
-            logging.info(f"Node {node.id}: {node.state}")
+            logging.info(f"Node {node.id_}: {node.state}")
 
-    def schedule_work(self):
-        pass
+    def schedule_work(self, node_id: NodeId, env_file: str):
+        self.get_node(node_id).start_executing(env_file)
     
     def get_source_nodes(self) -> list:
         sources = set()
@@ -122,10 +122,7 @@ def get_standard_source_nodes(self) -> list:
         source_nodes = self.get_source_nodes()
         # TODO: Filter out loop nodes
         # return self.filter_standard_nodes(source_nodes)
-        return source_nodes
-    
-    
-
+        return source_nodes    
 
     def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]":
         # TODO: filter non-loop nodes
@@ -199,4 +196,4 @@ def init_to_be_resolved_dict(self):
             
     
     def adjust_to_be_resolved_dict_entry(self, node_id: NodeId):
-        
+        pass
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index e1de16fa..80dd1206 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -92,13 +92,19 @@ def process_next_cmd(self):
             connection.close()
         elif (input_cmd.startswith("CommandExecComplete:")):
             node_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
-            logging.info(f'Scheduler: Received command exec complete message - {node_id}.')
             connection.close()
+            logging.info(f'Scheduler: Received command exec complete message - {node_id}.')
+            node = self.partial_program_order.get_node(node_id)
+            # TODO: condition here to do different things based on node state
+            node.commit_frontier_execution()
+            self.respond_to_pending_wait(node_id)
         elif (input_cmd.startswith("Wait")):
-            node_id, _ = self.__parse_wait(input_cmd)
+            node_id, env_file = self.__parse_wait(input_cmd)
             self.waiting_for_response[node_id] = connection
             logging.info(f'Scheduler: Received wait message - {node_id}.')
-            self.respond_to_pending_wait(node_id)
+            node = self.partial_program_order.get_node(node_id)
+            # TODO: condition here to do different things based on node state
+            self.partial_program_order.schedule_work(node_id, env_file)
             
         elif (input_cmd.startswith("Done")):
             # if not self.partial_program_order.is_completed():
@@ -130,9 +136,7 @@ def respond_to_pending_wait(self, node_id: int):
         ## Get the completed node info
         node = self.partial_program_order.get_node(node_id)
         completed_node_info = node.get_main_sandbox()
-        # George: Currently I don't init the sandbox info anywhere since there is no execution
-        # msg = f'{completed_node_info.get_exit_code()} {completed_node_info.get_post_execution_env_file()} {completed_node_info.get_stdout_file()}'
-        msg = "0 foo bar bax qux"
+        msg = '{} {} {}'.format(*node.execution_outcome())
         response = success_response(msg)
         
         ## Send the response
@@ -168,7 +172,8 @@ def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]":
 
 
     def schedule_work(self):
-        self.partial_program_order.schedule_work()
+        # self.partial_program_order.schedule_work()
+        pass
 
     def run(self):
         ## The first command should be the daemon start

From 91901e75af5c26ea05033df643c988ab45a21737 Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Sat, 13 Jan 2024 15:24:05 -0700
Subject: [PATCH 15/39] Cleanup and implement transitions triggered by Wait

---
 parallel-orch/executor.py              |  14 ++--
 parallel-orch/node.py                  | 105 ++++++++++---------------
 parallel-orch/partial_program_order.py |  96 ++++++++++++++--------
 parallel-orch/scheduler_server.py      |  35 ++++-----
 4 files changed, 129 insertions(+), 121 deletions(-)

diff --git a/parallel-orch/executor.py b/parallel-orch/executor.py
index 2d58deaf..93439c4b 100644
--- a/parallel-orch/executor.py
+++ b/parallel-orch/executor.py
@@ -8,7 +8,7 @@
 # and traces them with Riker. 
 # All commands are run inside an overlay sandbox.
 
-def async_run_and_trace_command_return_trace(command, node_id, latest_env_file, speculate_mode=False):
+def async_run_and_trace_command_return_trace(command, node_id, pre_execution_env_file, speculate_mode=False):
     trace_file = util.ptempfile()
     stdout_file = util.ptempfile()
     stderr_file = util.ptempfile()
@@ -17,17 +17,17 @@ def async_run_and_trace_command_return_trace(command, node_id, latest_env_file,
     logging.debug(f'Scheduler: Stdout file for: {node_id} is: {stdout_file}')
     logging.debug(f'Scheduler: Stderr file for: {node_id} is: {stderr_file}')
     logging.debug(f'Scheduler: Trace file for: {node_id}: {trace_file}')
-    process = async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode)
-    return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir
+    process = async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode)
+    return process, trace_file, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir
 
-def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, node_id, latest_env_file):
-    process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, node_id, latest_env_file, speculate_mode=True)
+def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, node_id, pre_execution_env_file):
+    process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, node_id, pre_execution_env_file, speculate_mode=True)
     return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir
 
-def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, latest_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False):
+def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False):
     ## Call Riker to execute the command
     run_script = f'{config.PASH_SPEC_TOP}/parallel-orch/run_command.sh'
-    args = ["/bin/bash", run_script, command, trace_file, stdout_file, latest_env_file, sandbox_dir, tmp_dir]
+    args = ["/bin/bash", run_script, command, trace_file, stdout_file, pre_execution_env_file, sandbox_dir, tmp_dir]
     if speculate_mode:
         args.append("speculate")
     else:
diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index 0454a9a1..b0008661 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -1,3 +1,4 @@
+import logging
 import executor
 from dataclasses import dataclass
 from subprocess import Popen
@@ -14,40 +15,6 @@ class NodeState(Enum):
     SPEC_EXECUTING = auto()
     UNSAFE = auto()
 
-class Sandbox:
-    def __init__(self, trace_file, exit_code, post_execution_env_file, stdout_file, sandbox_dir):
-        # These get predetermined prior to the execution
-        self.trace_file = trace_file
-        self.post_execution_env_file = post_execution_env_file
-        self.stdout_file = stdout_file
-        self.sandbox_dir = sandbox_dir
-        # These get set after execution is done
-        self.exit_code = None
-        self.proc_id = None
-        
-    def set_exit_code(self, exit_code):
-        self.exit_code = exit_code
-        
-    def set_proc_id(self, proc_id):
-        self.proc_id = proc_id
-
-    def get_exit_code(self):
-        return self.exit_code
-
-    def get_post_execution_env_file(self):
-        return self.post_execution_env_file
-
-    def get_stdout_file(self):
-        return self.stdout_file
-
-    def get_sandbox_dir(self):
-        return self.sandbox_dir
-
-    def get_trace_file(self):
-        return self.trace_file
-
-    def __str__(self):
-        return f'Sandbox(trace:{self.get_trace_file}, ec:{self.get_exit_code()}, env:{self.get_post_execution_env_file()}, stdout:{self.get_stdout_file()}, sandbox:{self.get_sandbox_dir()})'
 
 class RWSet:
 
@@ -75,15 +42,15 @@ class NodeId:
     
     #TODO: Implement iteration support
     
-    def __init__(self, id: int):
-        self.id = id
+    def __init__(self, id_: int):
+        self.id_ = id_
 
     def get_non_iter_id(self):
-        return NodeId(self.id)
+        return NodeId(self.id_)
 
     def __repr__(self):
         ## TODO: Represent it using n.
-        output = f'{self.id}'
+        output = f'{self.id_}'
         return output
 
     def __hash__(self):
@@ -91,7 +58,7 @@ def __hash__(self):
 
     def __eq__(self, other):
         # return self.loop_iters == other.loop_iters and self.id == other.id
-        return self.id == other.id
+        return self.id_ == other.id_
 
     def __ne__(self, other):
         return not(self == other)
@@ -112,6 +79,7 @@ class ExecCtxt:
     trace_file: str
     stdout: str
     stderr: str
+    pre_env_file: str
     post_env_file: str
     sandbox_dir: str
 
@@ -132,11 +100,11 @@ class Node:
     to_be_resolved_snapshot: "set[NodeId]"
     # Read and write sets for this node
     rwset: RWSet
-    # This contains the sandbox and execution info for a spec-executing node 
-    # (or plain executing node if frontier background node execution is not enabled)
-    main_sandbox: Sandbox
+    # The env file received with the Wait message for this node
+    wait_env_file: str
     # This can only be set while in the frontier and the background node execution is enabled
-    background_sandbox: Sandbox
+    # TODO: For now ignore this. Maybe there is a better way to do this.
+    # background_sandbox: Sandbox
     exec_ctxt: ExecCtxt
     exec_result: ExecResult
     
@@ -144,20 +112,15 @@ def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"):
         self.id_ = node_id
         self.cmd = cmd
         self.asts = asts
-        # The node's state
         self.state = NodeState.INIT
         self.tracefile = None
         self.rwset = None
-        # The
+        self.wait_env_file = None
         self.to_be_resolved_snapshot = None
-        
-        self.main_sandbox = None
-        
-        self.background_sandbox = None
         self.exec_ctxt = None
 
     def __str__(self):
-        return f'Node(id:{self.id_}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, main_sandbox:{self.main_sandbox}, background_sandbox:{self.background_sandbox})'
+        return f'Node(id:{self.id_}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, wait_env_file:{self.wait_env_file}, exec_ctxt:{self.exec_ctxt})'
     
     def __repr__(self):
         return str(self)
@@ -185,10 +148,6 @@ def is_spec_executing(self):
     
     def is_unsafe(self):
         return self.state == NodeState.UNSAFE
-    
-    def get_main_sandbox(self):
-        return self.main_sandbox
-    
 
     def start_command(self, env_file: str, speculate=False):
         # TODO: implement speculate
@@ -218,24 +177,44 @@ def start_executing(self, env_file):
         self.state = NodeState.EXECUTING
 
     def commit_frontier_execution(self):
-        assert self.state == NodeState.EXECUTING
+        assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]
         self.state = NodeState.COMMITTED
         self.exec_result = ExecResult(self.exec_ctxt.process.pid, self.exec_ctxt.process.returncode)
         executor.commit_workspace(self.exec_ctxt.sandbox_dir)
         
-    def transition_to_spec_executing(self):
+
+    def _attempt_start_command(self, env_file, speculate=False):
+        if self.wait_env_file is not None:
+            self.start_command(env_file=self.wait_env_file, speculate=speculate)
+        elif env_file is not None:
+            self.start_command(env_file=env_file, speculate=speculate)
+        else:
+            logging.error(f'Error: No valid execution env for Node {self.id_}')
+
+    def transition_from_ready_to_executing(self, env_file=None):
+        assert self.state == NodeState.READY
+        self.state = NodeState.EXECUTING
+        self._attempt_start_command(env_file)
+
+    def transition_from_ready_to_spec_executing(self, env_file=None):
         assert self.state == NodeState.READY
         self.state = NodeState.SPEC_EXECUTING
-        # TODO
+        self._attempt_start_command(env_file, speculate=True)
+
+    def transition_from_stopped_to_executing(self, env_file=None):
+        assert self.state == NodeState.READY
+        self.state = NodeState.EXECUTING
+        self._attempt_start_command(env_file)
 
     def transition_to_committed(self):
-        assert self.state in [NodeState.EXECUTING, NodeState.SPECULATED]
+        assert self.state == NodeState.SPECULATED
         self.state = NodeState.COMMITTED
         # TODO
 
-    # TODO: other transition functions
-
+    def transition_from_spec_executing_to_speculated(self):
+        pass
 
-    # Do we need this here of should we handle everything on scheduler server and ppo?
-    def handle_event(self, event_msg):
-        pass # TODO
+    def set_wait_env_file(self, env_file: str):
+        assert self.state in [NodeState.READY, NodeState.EXECUTING, NodeState.SPEC_EXECUTING, NodeState.STOP, NodeState.SPECULATED]
+        self.post_env_file = env_file
+    
\ No newline at end of file
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index e4e8d3ed..b94aad2c 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -20,24 +20,13 @@ def __init__(self, nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId
         self.run_after = set()
         self.window = 0
         self.to_be_resolved = {} 
-        
-    # def init_partial_order(self):
-    #     self.init_workset()
-    #     logging.debug(f'Initialized workset')
-    #     self.populate_to_be_resolved_dict()
-    #     if config.SPECULATE_IMMEDIATELY:
-    #         self.init_latest_env_files()
-    #     logging.debug(f'To be resolved sets per node:')
-    #     logging.debug(self.to_be_resolved)
-    #     logging.info(f'Initialized the partial order!')
-    #     # self.log_partial_program_order_info()
-    #     assert(self.valid())
-    
+
     def init_partial_order(self):
         for node_id, node in self.nodes.items():
             if node.is_initialized():
                 node.transition_from_init_to_ready()
-                
+        
+        # Init frontier
         self.frontier = self.get_standard_source_nodes()
         # TODO: Implement the rest of the partial order initialization
 
@@ -80,7 +69,7 @@ def get_uncommitted_nodes(self):
     
     def get_frontier(self):
         return self.frontier
-
+    
     def log_info(self):
         logging.info(f"Nodes: {self.nodes}")
         logging.info(f"Adjacency: {self.adjacency}")
@@ -94,12 +83,6 @@ def log_state(self):
     def schedule_work(self, node_id: NodeId, env_file: str):
         self.get_node(node_id).start_executing(env_file)
     
-    def get_source_nodes(self) -> list:
-        sources = set()
-        for to_id, from_ids in self.inverse_adjacency.items():
-            if len(from_ids) == 0:
-                sources.add(to_id)
-        return list(sources)
     
     ## Returns the next non-committed normal node
     def progress_frontier(self) -> "list[NodeId]":
@@ -111,7 +94,7 @@ def get_next_nodes(self, node_id:NodeId) -> "list[NodeId]":
     def get_prev_nodes(self, node_id:NodeId) -> "list[NodeId]":
         return self.inverse_adjacency[node_id][:]
     
-    def get_source_nodes(self) -> list:
+    def get_source_nodes(self) -> "list[NodeId]":
         sources = set()
         for to_id, from_ids in self.inverse_adjacency.items():
             if len(from_ids) == 0:
@@ -185,15 +168,66 @@ def get_all_previous_uncommitted(self, node_id: NodeId) -> "set[NodeId]":
         previous = self.get_all_previous(node_id)
         return set([node for node in previous if not self.nodes[node].is_committed()])
         
+    def adjust_to_be_resolved_dict_entry(self, node_id: NodeId):
+        node = self.nodes.get(node_id)
+        if node.is_committed():
+            self.to_be_resolved[node_id] = []
+        elif node.is_ready():
+            self.to_be_resolved[node_id] = self.get_all_previous_uncommitted(node_id)
+
+    def adjust_to_be_resolved_dict(self):
+        for node_id in self.to_be_resolved.keys():
+            self.adjust_to_be_resolved_dict_entry(node_id)
+            
     
-    def init_to_be_resolved_dict(self):
-        for node_id in self.nodes.keys():
-            self.to_be_resolved[node_id] = ...
+    #TODO: Add partial order invariant checks
+    def valid(self):
+        return True
+    
+    def handle_wait(self, node_id: NodeId, env_file: str):
+        node = self.get_node(node_id)
+
+        # Invalid state check
+        if node.is_committed() or node.is_unsafe() or node.is_initialized():
+            logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}')
+            raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}')
         
-    def init_to_be_resolved_dict(self):
-        for node_id in self.nodes.keys():
-            self.to_be_resolved[node_id] = ...
-            
     
-    def adjust_to_be_resolved_dict_entry(self, node_id: NodeId):
-        pass
+        # For all the valid states, set the wait env file
+        # Q to @Di: Do we need to make the wait env file a node attribute 
+        # (same for most recent env file) or is it ok to just pass it around here?
+        # We might use it in the future so maybe we shouldn't drop it.
+        node.set_wait_env_file(env_file)
+                
+
+        if node.is_ready():
+            if node.id_ in self.get_frontier():
+                node.transition_from_ready_to_executing(env_file)
+            else:
+                node.transition_from_ready_to_spec_executing(env_file)
+        elif node.is_stopped():
+            if node in self.get_frontier():
+                logging.info(f'Node {node_id} is stopped and in the frontier.')
+                node.transition_from_stopped_to_executing(env_file)
+            else:
+                logging.info(f'Node {node_id} is stopped but not in the frontier.')
+        elif node.is_speculated():
+            pass
+            # TODO: handle this case
+            # Check if env conflicts exist
+            # Check fs deps
+            # If no env or fs conflicts, then commit the node
+        elif node.is_executing(): 
+            # Do nothing 
+            pass 
+        elif node.is_spec_executing():
+            # Do nothing 
+            pass
+        else:
+            logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}')
+            raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}')
+
+        # TODO: think about this
+        # self.schedule_work_single_node()
+        # self.schedule_work_all_nodes()
+
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index 80dd1206..19d43865 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -65,7 +65,6 @@ class Scheduler:
     """
 
     def __init__(self, socket_file):
-        ## TODO: Add all the orchestrator state here (it should just be the partial order)
         self.done = False
         self.socket = util.init_unix_socket(socket_file)
         ## A map containing connections for node_ids that are waiting for a response
@@ -78,7 +77,21 @@ def handle_init(self, input_cmd: str):
         logging.debug(f'Scheduler: Received partial_order_file: {partial_order_file}')
         self.partial_program_order = util.parse_partial_program_order_from_file(partial_order_file)
         self.partial_program_order.init_partial_order()
+        
+    def handle_command_exec_complete(self):
+        # TODO: Implement
+        pass
+    
+    def handle_command_exec_start(self):
+        # TODO: Implement
+        pass
+    
 
+    def handle_wait(self, input_cmd: str, connection):
+        node_id, env_file = self.__parse_wait(input_cmd)
+        self.waiting_for_response[node_id] = connection
+        logging.info(f'Scheduler: Received wait message - {node_id}.')
+        self.partial_program_order.handle_wait(node_id, env_file)
 
     def process_next_cmd(self):
         connection, input_cmd = util.socket_get_next_cmd(self.socket)
@@ -99,23 +112,11 @@ def process_next_cmd(self):
             node.commit_frontier_execution()
             self.respond_to_pending_wait(node_id)
         elif (input_cmd.startswith("Wait")):
-            node_id, env_file = self.__parse_wait(input_cmd)
-            self.waiting_for_response[node_id] = connection
-            logging.info(f'Scheduler: Received wait message - {node_id}.')
-            node = self.partial_program_order.get_node(node_id)
-            # TODO: condition here to do different things based on node state
-            self.partial_program_order.schedule_work(node_id, env_file)
-            
+            self.handle_wait(input_cmd, connection)
         elif (input_cmd.startswith("Done")):
-            # if not self.partial_program_order.is_completed():
-            #     logging.debug(" |- some nodes were skipped completed.")
             util.socket_respond(connection, success_response("All finished!"))
             self.partial_program_order.log_info()
             self.done = True
-            nodes = self.partial_program_order.nodes
-            for k, v in nodes.items():
-                logging.info(self.partial_program_order.progress_frontier())
-                logging.info(f"{k} {self.partial_program_order.get_next_frontier_nodes([k])}")
         elif input_cmd.startswith("CommandExecStart:"):
             node_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
             logging.info(f'Scheduler: Received command exec start message - {input_cmd}.')
@@ -135,7 +136,6 @@ def respond_to_pending_wait(self, node_id: int):
         logging.debug(f'Responding to pending wait for node: {node_id}')
         ## Get the completed node info
         node = self.partial_program_order.get_node(node_id)
-        completed_node_info = node.get_main_sandbox()
         msg = '{} {} {}'.format(*node.execution_outcome())
         response = success_response(msg)
         
@@ -171,10 +171,6 @@ def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]":
             raise Exception(f'Parsing failure for line: {input_cmd}')
 
 
-    def schedule_work(self):
-        # self.partial_program_order.schedule_work()
-        pass
-
     def run(self):
         ## The first command should be the daemon start
         self.process_next_cmd()
@@ -183,7 +179,6 @@ def run(self):
         self.process_next_cmd()
         
         while not self.done:
-            self.schedule_work()
             self.process_next_cmd()
 
         self.socket.close()

From 7b1741f1f68a74781f289e2400cdf81bab7704d2 Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Sun, 14 Jan 2024 12:15:17 -0500
Subject: [PATCH 16/39] simple speculation enabled

---
 parallel-orch/node.py                  | 102 ++++++++++++++++-----
 parallel-orch/partial_program_order.py | 117 +++++++++++++++++--------
 parallel-orch/scheduler_server.py      |  20 +++--
 3 files changed, 175 insertions(+), 64 deletions(-)

diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index b0008661..65b2ce2b 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -1,5 +1,6 @@
 import logging
 import executor
+import trace_v2
 from dataclasses import dataclass
 from subprocess import Popen
 from typing import Tuple
@@ -34,6 +35,14 @@ def get_read_set(self) -> set:
     def get_write_set(self) -> set:
         return self.write_set
 
+    def has_conflict(self, other: 'RWSet') -> bool:
+        if (self.write_set.intersection(other.read_set) or
+            self.read_set.intersection(other.write_set) or
+            self.write_set.intersection(other.write_set)):
+            return True
+        else:
+            return False
+        
     def __str__(self):
         return f"RW(R:{self.get_read_set()}, W:{self.get_write_set()})"
 
@@ -159,7 +168,8 @@ def start_command(self, env_file: str, speculate=False):
     def execution_outcome(self) -> Tuple[int, str, str]:
         assert self.exec_result is not None
         return self.exec_result.exit_code, self.exec_ctxt.post_env_file, self.exec_ctxt.stdout
-        
+
+
     ##                                      ##
     ##          Transition Functions        ##
     ##                                      ##
@@ -171,35 +181,61 @@ def transition_from_init_to_ready(self):
 
         # Also, probably unroll here?
 
+    def reset_to_ready(self):
+        assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING,
+                              NodeState.SPECULATED]
+        # Probably delete them from tmpfs too
+        self.exec_ctxt = None
+        self.exec_result = None
+        self.rwset = None
+        self.state = NodeState.READY
+        
     def start_executing(self, env_file):
         assert self.state == NodeState.READY
         self.start_command(env_file)
         self.state = NodeState.EXECUTING
 
+    def start_spec_executing(self, env_file):
+        assert self.state == NodeState.READY
+        self.start_command(env_file, speculate=True)
+        self.state = NodeState.SPEC_EXECUTING
+        
     def commit_frontier_execution(self):
-        assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]
+        assert self.state == NodeState.EXECUTING
+        self.exec_result = ExecResult(self.exec_ctxt.process.pid, self.exec_ctxt.process.returncode)
+        self.gather_fs_actions()
+        executor.commit_workspace(self.exec_ctxt.sandbox_dir)
         self.state = NodeState.COMMITTED
+
+    def finish_spec_execution(self):
+        assert self.state == NodeState.SPEC_EXECUTING
         self.exec_result = ExecResult(self.exec_ctxt.process.pid, self.exec_ctxt.process.returncode)
+        self.gather_fs_actions()
+        self.state = NodeState.SPECULATED
+
+
+    def commit_speculated(self):
+        assert self.state == NodeState.SPECULATED
         executor.commit_workspace(self.exec_ctxt.sandbox_dir)
-        
+        self.state = NodeState.COMMITTED
 
-    def _attempt_start_command(self, env_file, speculate=False):
-        if self.wait_env_file is not None:
-            self.start_command(env_file=self.wait_env_file, speculate=speculate)
-        elif env_file is not None:
-            self.start_command(env_file=env_file, speculate=speculate)
-        else:
-            logging.error(f'Error: No valid execution env for Node {self.id_}')
+    # def _attempt_start_command(self, env_file, speculate=False):
+    #     if self.wait_env_file is not None:
+    #         self.start_command(env_file=self.wait_env_file, speculate=speculate)
+    #     elif env_file is not None:
+    #         self.start_command(env_file=env_file, speculate=speculate)
+    #     else:
+    #         logging.error(f'Error: No valid execution env for Node {self.id_}')
 
-    def transition_from_ready_to_executing(self, env_file=None):
-        assert self.state == NodeState.READY
-        self.state = NodeState.EXECUTING
-        self._attempt_start_command(env_file)
+    # def transition_from_ready_to_executing(self, env_file=None):
+    #     assert self.state == NodeState.READY
+    #     self.state = NodeState.EXECUTING
+    #     self._attempt_start_command(env_file)
 
-    def transition_from_ready_to_spec_executing(self, env_file=None):
-        assert self.state == NodeState.READY
-        self.state = NodeState.SPEC_EXECUTING
-        self._attempt_start_command(env_file, speculate=True)
+    # def transition_from_ready_to_spec_executing(self, env_file=None):
+    #     assert self.state == NodeState.READY
+    #     self.state = NodeState.SPEC_EXECUTING
+    #     self._attempt_start_command(env_file, speculate=True)
 
     def transition_from_stopped_to_executing(self, env_file=None):
         assert self.state == NodeState.READY
@@ -214,7 +250,29 @@ def transition_to_committed(self):
     def transition_from_spec_executing_to_speculated(self):
         pass
 
-    def set_wait_env_file(self, env_file: str):
-        assert self.state in [NodeState.READY, NodeState.EXECUTING, NodeState.SPEC_EXECUTING, NodeState.STOP, NodeState.SPECULATED]
-        self.post_env_file = env_file
-    
\ No newline at end of file
+
+    def update_rw_set(self, rw_set):
+        self.rwset = rw_set
+    
+    def gather_fs_actions(self) -> None:
+        assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]
+        sandbox_dir = self.exec_ctxt.sandbox_dir
+        trace_file = self.exec_ctxt.trace_file
+        try:
+            trace_object = executor.read_trace(sandbox_dir, trace_file)
+        except FileNotFoundError:
+            self.update_rw_set(RWSet(set(), set()))
+            return
+        read_set, write_set = trace_v2.parse_and_gather_cmd_rw_sets(trace_object)
+        rw_set = RWSet(read_set, write_set)
+        self.update_rw_set(rw_set)
+
+    def get_rw_set(self):
+        # if self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]:
+        #     self.gather_fs_actions()
+        return self.rwset
+        
+    # def set_wait_env_file(self, env_file: str):
+    #     assert self.state in [NodeState.READY, NodeState.EXECUTING, NodeState.SPEC_EXECUTING, NodeState.STOP, NodeState.SPECULATED]
+    #     self.post_env_file = env_file
+    
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index b94aad2c..1b46b834 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -6,7 +6,6 @@
 class PartialProgramOrder:
     frontier: set  # Set of nodes at the frontier
     run_after: set  # Nodes that should run after certain conditions
-    window: int  # Integer representing the window
     to_be_resolved: "dict[NodeId, list[Node]]"  # Mapping of nodes to lists of uncommitted nodes
     nodes: "dict[NodeId, Node]"
     adjacency: "dict[NodeId, list[NodeId]]"
@@ -18,14 +17,15 @@ def __init__(self, nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId
         self.inverse_adjacency = self.init_inverse_adjacency()
         self.frontier = set()
         self.run_after = set()
-        self.window = 0
-        self.to_be_resolved = {} 
+        self.to_be_resolved = {}
 
     def init_partial_order(self):
         for node_id, node in self.nodes.items():
             if node.is_initialized():
                 node.transition_from_init_to_ready()
-        
+
+        self.init_to_be_resolved_dict()
+        logging.info(self.to_be_resolved)
         # Init frontier
         self.frontier = self.get_standard_source_nodes()
         # TODO: Implement the rest of the partial order initialization
@@ -80,9 +80,14 @@ def log_state(self):
         for node in self.nodes.values():
             logging.info(f"Node {node.id_}: {node.state}")
 
+    def get_schedulable_nodes(self) -> "list[NodeId]":
+        return [node.id_ for node in self.get_ready_nodes()]
+            
     def schedule_work(self, node_id: NodeId, env_file: str):
         self.get_node(node_id).start_executing(env_file)
-    
+
+    def schedule_spec_work(self, node_id: NodeId, env_file: str):
+        self.get_node(node_id).start_spec_executing(env_file)
     
     ## Returns the next non-committed normal node
     def progress_frontier(self) -> "list[NodeId]":
@@ -140,29 +145,29 @@ def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]":
         return non_committed_nodes
     
     def get_all_next(self, current_node_id: NodeId, visited=None) -> "set[NodeId]":
-        if visited is None:
-            visited = set()
-        visited.add(current_node_id)
-
-        all_next_nodes = set([current_node_id])
-        for neighbor in self.get_next_nodes(current_node_id):
-            if neighbor not in visited:
-                all_next_nodes.update(self.get_all_next(neighbor, visited))
-
-        return all_next_nodes
+        all_next = set()
+        def reachable_rec(cur, reachable):
+            if cur in reachable:
+                return
+            reachable.add(cur)
+            for n in self.get_next_nodes(cur):
+                reachable_rec(n, reachable)
+        for n in self.get_next_nodes(current_node_id):
+            reachable_rec(n, all_next)
+        return all_next
 
 
     def get_all_previous(self, current_node_id: NodeId, visited=None) -> "set[NodeId]":
-        if visited is None:
-            visited = set()
-        visited.add(current_node_id)
-
-        all_previous_nodes = set([current_node_id])
-        for neighbor in self.get_prev_nodes(current_node_id):
-            if neighbor not in visited:
-                all_previous_nodes.update(self.get_all_previous(neighbor, visited))
-
-        return all_previous_nodes
+        all_prev = set()
+        def reachable_rec(cur, reachable):
+            if cur in reachable:
+                return
+            reachable.add(cur)
+            for n in self.get_prev_nodes(cur):
+                reachable_rec(n, reachable)
+        for n in self.get_prev_nodes(current_node_id):
+            reachable_rec(n, all_prev)
+        return all_prev
     
     def get_all_previous_uncommitted(self, node_id: NodeId) -> "set[NodeId]":
         previous = self.get_all_previous(node_id)
@@ -175,14 +180,52 @@ def adjust_to_be_resolved_dict_entry(self, node_id: NodeId):
         elif node.is_ready():
             self.to_be_resolved[node_id] = self.get_all_previous_uncommitted(node_id)
 
+    def init_to_be_resolved_dict(self):
+        for node_id in self.nodes:
+            self.adjust_to_be_resolved_dict_entry(node_id)
+
     def adjust_to_be_resolved_dict(self):
+        # TODO: this design seems to require the function to be called
+        # each time before a node enters EXECUTING or SPEC_EXECUTING
+        # to be optimal (otherwise it might keep more things in the list).
+        # It's safe as is, so I'm not touching it.
         for node_id in self.to_be_resolved.keys():
             self.adjust_to_be_resolved_dict_entry(node_id)
-            
-    
+
     #TODO: Add partial order invariant checks
     def valid(self):
         return True
+
+    def has_fs_deps(self, node_id: NodeId):
+        node_of_interest : Node = self.get_node(node_id)
+        for node in self.get_executing_normal_and_speculated_nodes():
+            node.gather_fs_actions()
+        for nid in self.to_be_resolved[node_id]:
+            node: Node = self.get_node(nid)
+            if node.get_rw_set().has_conflict(node_of_interest.get_rw_set()):
+                return True
+        return False
+    
+    def handle_complete(self, node_id: NodeId, has_pending_wait: bool,
+                        current_env: str):
+        node = self.get_node(node_id)
+        # TODO: complete the state matching
+        if node.is_executing():
+            node.commit_frontier_execution()
+            self.adjust_to_be_resolved_dict()
+        elif node.is_spec_executing():
+            if self.has_fs_deps(node_id):
+                node.reset_to_ready()
+                # otherwise it stays in ready state and waits to be scheduled by the scheduler
+                if has_pending_wait:
+                    node.start_executing(current_env)
+            else:
+                node.finish_spec_execution()
+                if has_pending_wait:
+                    node.commit_speculated()
+                    self.adjust_to_be_resolved_dict()
+        else:
+            assert False
     
     def handle_wait(self, node_id: NodeId, env_file: str):
         node = self.get_node(node_id)
@@ -197,14 +240,11 @@ def handle_wait(self, node_id: NodeId, env_file: str):
         # Q to @Di: Do we need to make the wait env file a node attribute 
         # (same for most recent env file) or is it ok to just pass it around here?
         # We might use it in the future so maybe we shouldn't drop it.
-        node.set_wait_env_file(env_file)
-                
+        # TODO: remove this?
+        # node.set_wait_env_file(env_file)
 
         if node.is_ready():
-            if node.id_ in self.get_frontier():
-                node.transition_from_ready_to_executing(env_file)
-            else:
-                node.transition_from_ready_to_spec_executing(env_file)
+            node.start_executing(env_file)
         elif node.is_stopped():
             if node in self.get_frontier():
                 logging.info(f'Node {node_id} is stopped and in the frontier.')
@@ -212,11 +252,14 @@ def handle_wait(self, node_id: NodeId, env_file: str):
             else:
                 logging.info(f'Node {node_id} is stopped but not in the frontier.')
         elif node.is_speculated():
-            pass
             # TODO: handle this case
             # Check if env conflicts exist
-            # Check fs deps
-            # If no env or fs conflicts, then commit the node
+            if self.has_fs_deps(node_id):
+                node.reset_to_ready()
+                node.start_executing(env_file)
+            else:
+                node.commit_speculated()
+                self.adjust_to_be_resolved_dict()
         elif node.is_executing(): 
             # Do nothing 
             pass 
@@ -230,4 +273,4 @@ def handle_wait(self, node_id: NodeId, env_file: str):
         # TODO: think about this
         # self.schedule_work_single_node()
         # self.schedule_work_all_nodes()
-
+        
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index 19d43865..82d543cd 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -63,8 +63,11 @@ class Scheduler:
                     |   Wait -> The JIT component waits for the results of a specific command
                     |   Done -> We are done
     """
+    window: int  # Integer representing the window
+    latest_env: str # This variable should be initialized by the first wait, and always have a value since
 
     def __init__(self, socket_file):
+        self.window = 0
         self.done = False
         self.socket = util.init_unix_socket(socket_file)
         ## A map containing connections for node_ids that are waiting for a response
@@ -91,7 +94,10 @@ def handle_wait(self, input_cmd: str, connection):
         node_id, env_file = self.__parse_wait(input_cmd)
         self.waiting_for_response[node_id] = connection
         logging.info(f'Scheduler: Received wait message - {node_id}.')
+        self.latest_env = env_file
         self.partial_program_order.handle_wait(node_id, env_file)
+        if self.partial_program_order.get_node(node_id).is_committed():
+            self.respond_to_pending_wait(node_id)
 
     def process_next_cmd(self):
         connection, input_cmd = util.socket_get_next_cmd(self.socket)
@@ -105,12 +111,10 @@ def process_next_cmd(self):
             connection.close()
         elif (input_cmd.startswith("CommandExecComplete:")):
             node_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
-            connection.close()
             logging.info(f'Scheduler: Received command exec complete message - {node_id}.')
-            node = self.partial_program_order.get_node(node_id)
-            # TODO: condition here to do different things based on node state
-            node.commit_frontier_execution()
-            self.respond_to_pending_wait(node_id)
+            self.partial_program_order.handle_complete(node_id, node_id in self.waiting_for_response, self.latest_env)
+            if self.partial_program_order.get_node(node_id).is_committed():
+                self.respond_to_pending_wait(node_id)
         elif (input_cmd.startswith("Wait")):
             self.handle_wait(input_cmd, connection)
         elif (input_cmd.startswith("Done")):
@@ -171,6 +175,11 @@ def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]":
             raise Exception(f'Parsing failure for line: {input_cmd}')
 
 
+    def schedule_work(self):
+        nodes = self.partial_program_order.get_schedulable_nodes()
+        if len(nodes):
+            self.partial_program_order.schedule_spec_work(nodes[0], self.latest_env)
+        
     def run(self):
         ## The first command should be the daemon start
         self.process_next_cmd()
@@ -180,6 +189,7 @@ def run(self):
         
         while not self.done:
             self.process_next_cmd()
+            self.schedule_work()
 
         self.socket.close()
         self.shutdown()

From 0737b2f54298458600702b25c22d90114afa8845 Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Sun, 14 Jan 2024 16:43:59 -0700
Subject: [PATCH 17/39] Env check and cleanup

---
 parallel-orch/node.py                  | 66 ++++++++++++++++----------
 parallel-orch/partial_program_order.py | 39 ++++++++-------
 parallel-orch/scheduler_server.py      |  1 -
 3 files changed, 63 insertions(+), 43 deletions(-)

diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index 65b2ce2b..621c2981 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -1,4 +1,5 @@
 import logging
+import re
 import executor
 import trace_v2
 from dataclasses import dataclass
@@ -177,19 +178,28 @@ def execution_outcome(self) -> Tuple[int, str, str]:
     def transition_from_init_to_ready(self):
         assert self.state == NodeState.INIT
         self.state = NodeState.READY
-        # Initialize data structures here
-
         # Also, probably unroll here?
 
+    def kill(self):
+        assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]
+        self.exec_ctxt.process.kill()
+
     def reset_to_ready(self):
         assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING,
                               NodeState.SPECULATED]
+        
+        # Q for @Di: Should we kill the process here?
+        if self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]:
+            self.kill()
+        
         # Probably delete them from tmpfs too
         self.exec_ctxt = None
         self.exec_result = None
         self.rwset = None
         self.state = NodeState.READY
         
+
+        
     def start_executing(self, env_file):
         assert self.state == NodeState.READY
         self.start_command(env_file)
@@ -219,24 +229,6 @@ def commit_speculated(self):
         executor.commit_workspace(self.exec_ctxt.sandbox_dir)
         self.state = NodeState.COMMITTED
 
-    # def _attempt_start_command(self, env_file, speculate=False):
-    #     if self.wait_env_file is not None:
-    #         self.start_command(env_file=self.wait_env_file, speculate=speculate)
-    #     elif env_file is not None:
-    #         self.start_command(env_file=env_file, speculate=speculate)
-    #     else:
-    #         logging.error(f'Error: No valid execution env for Node {self.id_}')
-
-    # def transition_from_ready_to_executing(self, env_file=None):
-    #     assert self.state == NodeState.READY
-    #     self.state = NodeState.EXECUTING
-    #     self._attempt_start_command(env_file)
-
-    # def transition_from_ready_to_spec_executing(self, env_file=None):
-    #     assert self.state == NodeState.READY
-    #     self.state = NodeState.SPEC_EXECUTING
-    #     self._attempt_start_command(env_file, speculate=True)
-
     def transition_from_stopped_to_executing(self, env_file=None):
         assert self.state == NodeState.READY
         self.state = NodeState.EXECUTING
@@ -271,8 +263,34 @@ def get_rw_set(self):
         # if self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]:
         #     self.gather_fs_actions()
         return self.rwset
+
+    def has_env_conflict_with(self, other_env) -> bool:
+        # Early return if paths are the same
+        if self.exec_ctxt.pre_env_file == other_env:
+            return False
+
+        ignore_vars = set(['RANDOM'])  
         
-    # def set_wait_env_file(self, env_file: str):
-    #     assert self.state in [NodeState.READY, NodeState.EXECUTING, NodeState.SPEC_EXECUTING, NodeState.STOP, NodeState.SPECULATED]
-    #     self.post_env_file = env_file
-    
+        re_scalar_string = re.compile(r'declare (?:-x|--)? (\w+)="([^"]*)"')
+        re_scalar_int = re.compile(r'declare -i (\w+)="(\d+)"')
+        re_array = re.compile(r'declare -a (\w+)=(\([^)]+\))')
+
+        def parse_env(content):
+            env_vars = {}
+            for line in content.splitlines():
+                if line.startswith('#') or not line.strip():
+                    continue
+                for regex in [re_scalar_string, re_scalar_int, re_array]:
+                    match = regex.match(line)
+                    if match:
+                        key, value = match.groups()
+                        if key not in ignore_vars:
+                            env_vars[key] = value
+            return env_vars
+
+        with open(self.exec_ctxt.pre_env_file, 'r') as file:
+            node_env_vars = parse_env(file.read())
+
+        with open(other_env, 'r') as file:
+            other_env_vars = parse_env(file.read())
+        return node_env_vars != other_env_vars
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index 1b46b834..6579c787 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -169,10 +169,14 @@ def reachable_rec(cur, reachable):
             reachable_rec(n, all_prev)
         return all_prev
     
+    def get_all_next_uncommitted(self, node_id: NodeId) -> "set[NodeId]":
+        next = self.get_all_next(node_id)
+        return set([node for node in next if not self.nodes[node].is_committed()])
+    
     def get_all_previous_uncommitted(self, node_id: NodeId) -> "set[NodeId]":
         previous = self.get_all_previous(node_id)
         return set([node for node in previous if not self.nodes[node].is_committed()])
-        
+
     def adjust_to_be_resolved_dict_entry(self, node_id: NodeId):
         node = self.nodes.get(node_id)
         if node.is_committed():
@@ -234,14 +238,6 @@ def handle_wait(self, node_id: NodeId, env_file: str):
         if node.is_committed() or node.is_unsafe() or node.is_initialized():
             logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}')
             raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}')
-        
-    
-        # For all the valid states, set the wait env file
-        # Q to @Di: Do we need to make the wait env file a node attribute 
-        # (same for most recent env file) or is it ok to just pass it around here?
-        # We might use it in the future so maybe we shouldn't drop it.
-        # TODO: remove this?
-        # node.set_wait_env_file(env_file)
 
         if node.is_ready():
             node.start_executing(env_file)
@@ -254,23 +250,30 @@ def handle_wait(self, node_id: NodeId, env_file: str):
         elif node.is_speculated():
             # TODO: handle this case
             # Check if env conflicts exist
-            if self.has_fs_deps(node_id):
+            
+            
+            if node.has_env_conflict_with(env_file) or self.has_fs_deps(node_id):
+                ## TODO: Optimization
+                ## FIXME: Currently causes AssertionError: assert(node_id in self.waiting_for_response)
+                # An env conflict means that every following node 
+                # will have the same env conflict
+                # therefore, we have to reset them all
+                # for uncommitted_node_id in self.get_all_next_uncommitted(node_id):
+                #     uncommitted_node = self.get_node(uncommitted_node_id)
+                #     uncommitted_node.reset_to_ready()
+                #     uncommitted_node.start_executing(env_file)
                 node.reset_to_ready()
                 node.start_executing(env_file)
             else:
                 node.commit_speculated()
                 self.adjust_to_be_resolved_dict()
+            
         elif node.is_executing(): 
-            # Do nothing 
-            pass 
+            # Do nothing
+            pass
         elif node.is_spec_executing():
-            # Do nothing 
+            # Do nothing
             pass
         else:
             logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}')
             raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}')
-
-        # TODO: think about this
-        # self.schedule_work_single_node()
-        # self.schedule_work_all_nodes()
-        
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index 82d543cd..b637f930 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -105,7 +105,6 @@ def process_next_cmd(self):
         if(input_cmd.startswith("Init")):
             connection.close()
             self.handle_init(input_cmd)
-            ## TODO: Read the partial order from the given file
         elif (input_cmd.startswith("Daemon Start") or input_cmd == ""):
             logging.info(f'Scheduler: Received daemon start message.')
             connection.close()

From 47d32dc72e95776c46abcf8303faa6a14f947beb Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Tue, 16 Jan 2024 02:51:18 -0700
Subject: [PATCH 18/39] Fix env checking and add exec instance wait msg
 matching

---
 parallel-orch/executor.py              | 11 +++----
 parallel-orch/node.py                  | 18 +++++++++---
 parallel-orch/partial_program_order.py | 40 +++++++++++++-------------
 parallel-orch/scheduler_server.py      | 34 ++++++++++------------
 parallel-orch/util.py                  |  5 +++-
 5 files changed, 59 insertions(+), 49 deletions(-)

diff --git a/parallel-orch/executor.py b/parallel-orch/executor.py
index 93439c4b..52bc3d9f 100644
--- a/parallel-orch/executor.py
+++ b/parallel-orch/executor.py
@@ -8,7 +8,7 @@
 # and traces them with Riker. 
 # All commands are run inside an overlay sandbox.
 
-def async_run_and_trace_command_return_trace(command, node_id, pre_execution_env_file, speculate_mode=False):
+def async_run_and_trace_command_return_trace(command, node_id, execution_id, pre_execution_env_file, speculate_mode=False):
     trace_file = util.ptempfile()
     stdout_file = util.ptempfile()
     stderr_file = util.ptempfile()
@@ -17,14 +17,14 @@ def async_run_and_trace_command_return_trace(command, node_id, pre_execution_env
     logging.debug(f'Scheduler: Stdout file for: {node_id} is: {stdout_file}')
     logging.debug(f'Scheduler: Stderr file for: {node_id} is: {stderr_file}')
     logging.debug(f'Scheduler: Trace file for: {node_id}: {trace_file}')
-    process = async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode)
+    process = async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode)
     return process, trace_file, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir
 
-def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, node_id, pre_execution_env_file):
-    process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, node_id, pre_execution_env_file, speculate_mode=True)
+def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, execution_id, node_id, pre_execution_env_file):
+    process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, execution_id, node_id, pre_execution_env_file, speculate_mode=True)
     return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir
 
-def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False):
+def async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False):
     ## Call Riker to execute the command
     run_script = f'{config.PASH_SPEC_TOP}/parallel-orch/run_command.sh'
     args = ["/bin/bash", run_script, command, trace_file, stdout_file, pre_execution_env_file, sandbox_dir, tmp_dir]
@@ -34,6 +34,7 @@ def async_run_and_trace_command_return_trace_in_sandbox(command, trace_file, nod
         args.append("standard")
     args.append(str(node_id))
     args.append(post_execution_env_file)
+    args.append(str(execution_id))
     # Save output to temporary files to not saturate the memory
     logging.debug(args)
     process = subprocess.Popen(args, stdout=None, stderr=None)
diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index 621c2981..3bf56b23 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -6,6 +6,7 @@
 from subprocess import Popen
 from typing import Tuple
 from enum import Enum, auto
+import util
 
 class NodeState(Enum):
     INIT = auto()
@@ -104,6 +105,8 @@ class Node:
     cmd: str
     asts: "list[AstNode]"
     state: NodeState
+    # Used for identifying the most recent valid execution
+    exec_id: int
     # Nodes to check for fs dependencies before this node can be committed
     # for this particular execution of the main sandbox.
     # No need to do the same for the background sandbox since it will always get committed.
@@ -128,6 +131,7 @@ def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"):
         self.wait_env_file = None
         self.to_be_resolved_snapshot = None
         self.exec_ctxt = None
+        self.exec_id = None
 
     def __str__(self):
         return f'Node(id:{self.id_}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, wait_env_file:{self.wait_env_file}, exec_ctxt:{self.exec_ctxt})'
@@ -164,7 +168,9 @@ def start_command(self, env_file: str, speculate=False):
         # TODO: built-in commands
         cmd = self.cmd
         execute_func = executor.async_run_and_trace_command_return_trace
-        self.exec_ctxt = ExecCtxt(*execute_func(cmd, self.id_, env_file))
+        # Set the execution id
+        self.exec_id = util.generate_id()
+        self.exec_ctxt = ExecCtxt(*execute_func(cmd, self.id_, self.exec_id, env_file))
 
     def execution_outcome(self) -> Tuple[int, str, str]:
         assert self.exec_result is not None
@@ -188,7 +194,12 @@ def reset_to_ready(self):
         assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING,
                               NodeState.SPECULATED]
         
-        # Q for @Di: Should we kill the process here?
+        logging.info(f"Resetting node {self.id_} to ready {self.exec_id}")
+        # We reset the exec id so if we receive a message 
+        # due to a race condition, we will ignore it.
+        self.exec_id = None
+        
+        # TODO: make this more sophisticated
         if self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]:
             self.kill()
         
@@ -197,9 +208,8 @@ def reset_to_ready(self):
         self.exec_result = None
         self.rwset = None
         self.state = NodeState.READY
-        
 
-        
+
     def start_executing(self, env_file):
         assert self.state == NodeState.READY
         self.start_command(env_file)
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index 6579c787..bdf80ab0 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -230,7 +230,13 @@ def handle_complete(self, node_id: NodeId, has_pending_wait: bool,
                     self.adjust_to_be_resolved_dict()
         else:
             assert False
-    
+
+    def reset_succeeding_nodes_and_start_exec(self, node_id: NodeId, env_file: str):
+        for uncommitted_node_id in self.get_all_next_uncommitted(node_id):
+            uncommitted_node = self.get_node(uncommitted_node_id)
+            uncommitted_node.reset_to_ready()
+            uncommitted_node.start_spec_executing(env_file)
+
     def handle_wait(self, node_id: NodeId, env_file: str):
         node = self.get_node(node_id)
 
@@ -238,6 +244,7 @@ def handle_wait(self, node_id: NodeId, env_file: str):
         if node.is_committed() or node.is_unsafe() or node.is_initialized():
             logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}')
             raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}')
+        
 
         if node.is_ready():
             node.start_executing(env_file)
@@ -248,32 +255,25 @@ def handle_wait(self, node_id: NodeId, env_file: str):
             else:
                 logging.info(f'Node {node_id} is stopped but not in the frontier.')
         elif node.is_speculated():
-            # TODO: handle this case
             # Check if env conflicts exist
-            
-            
-            if node.has_env_conflict_with(env_file) or self.has_fs_deps(node_id):
-                ## TODO: Optimization
-                ## FIXME: Currently causes AssertionError: assert(node_id in self.waiting_for_response)
-                # An env conflict means that every following node 
-                # will have the same env conflict
-                # therefore, we have to reset them all
-                # for uncommitted_node_id in self.get_all_next_uncommitted(node_id):
-                #     uncommitted_node = self.get_node(uncommitted_node_id)
-                #     uncommitted_node.reset_to_ready()
-                #     uncommitted_node.start_executing(env_file)
+            if node.has_env_conflict_with(env_file):
+                node.reset_to_ready()
+                node.start_executing(env_file)
+                self.reset_succeeding_nodes_and_start_exec(node_id, env_file)
+            # Optimization: It would make sense to perform the checks independently,
+            # and if fs conflict, then update the run after dict.
+            elif self.has_fs_deps(node_id):
                 node.reset_to_ready()
                 node.start_executing(env_file)
             else:
                 node.commit_speculated()
                 self.adjust_to_be_resolved_dict()
-            
-        elif node.is_executing(): 
-            # Do nothing
-            pass
+        elif node.is_executing():
+            if node.has_env_conflict_with(env_file):
+                self.reset_succeeding_nodes_and_start_exec(node_id, env_file)
         elif node.is_spec_executing():
-            # Do nothing
-            pass
+            if node.has_env_conflict_with(env_file):
+                self.reset_succeeding_nodes_and_start_exec(node_id, env_file)
         else:
             logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}')
             raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}')
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index b637f930..71586429 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -80,15 +80,6 @@ def handle_init(self, input_cmd: str):
         logging.debug(f'Scheduler: Received partial_order_file: {partial_order_file}')
         self.partial_program_order = util.parse_partial_program_order_from_file(partial_order_file)
         self.partial_program_order.init_partial_order()
-        
-    def handle_command_exec_complete():
-        # TODO: Implement
-        pass
-    
-    def handle_command_exec_start():
-        # TODO: Implement
-        pass
-    
 
     def handle_wait(self, input_cmd: str, connection):
         node_id, env_file = self.__parse_wait(input_cmd)
@@ -109,11 +100,15 @@ def process_next_cmd(self):
             logging.info(f'Scheduler: Received daemon start message.')
             connection.close()
         elif (input_cmd.startswith("CommandExecComplete:")):
-            node_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
-            logging.info(f'Scheduler: Received command exec complete message - {node_id}.')
-            self.partial_program_order.handle_complete(node_id, node_id in self.waiting_for_response, self.latest_env)
-            if self.partial_program_order.get_node(node_id).is_committed():
-                self.respond_to_pending_wait(node_id)
+            node_id, exec_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
+            if self.partial_program_order.get_node(node_id).exec_id == exec_id:
+                logging.info(f'Scheduler: Received command exec complete message - {node_id}.')
+                self.partial_program_order.handle_complete(node_id, node_id in self.waiting_for_response, self.latest_env)
+                
+                if self.partial_program_order.get_node(node_id).is_committed():
+                    self.respond_to_pending_wait(node_id)
+            else:
+                logging.info(f'Scheduler: Received command exec complete message for a killed instance, ignoring - {node_id}.')
         elif (input_cmd.startswith("Wait")):
             self.handle_wait(input_cmd, connection)
         elif (input_cmd.startswith("Done")):
@@ -121,7 +116,7 @@ def process_next_cmd(self):
             self.partial_program_order.log_info()
             self.done = True
         elif input_cmd.startswith("CommandExecStart:"):
-            node_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
+            node_id, exec_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
             logging.info(f'Scheduler: Received command exec start message - {input_cmd}.')
             # self.handle_command_exec_start(input_cmd)
         else:
@@ -166,10 +161,11 @@ def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]":
         try:
             components = input_cmd.rstrip().split("|")
             command_id = NodeId.parse_node_id(components[0].split(":")[1])
-            exit_code = int(components[1].split(":")[1])
-            sandbox_dir = components[2].split(":")[1]
-            trace_file = components[3].split(":")[1]
-            return command_id, exit_code, sandbox_dir, trace_file
+            exec_id = int(components[1].split(":")[1])
+            exit_code = int(components[2].split(":")[1])
+            sandbox_dir = components[3].split(":")[1]
+            trace_file = components[4].split(":")[1]
+            return command_id, exec_id, exit_code, sandbox_dir, trace_file
         except:
             raise Exception(f'Parsing failure for line: {input_cmd}')
 
diff --git a/parallel-orch/util.py b/parallel-orch/util.py
index af01273f..dc0abdcc 100644
--- a/parallel-orch/util.py
+++ b/parallel-orch/util.py
@@ -257,4 +257,7 @@ def parse_partial_program_order_from_file(file_path: str):
 
     logging.info(f"Nodes|{','.join([str(node) for node in nodes])}")
     logging.info(f"Edges|{edges}")
-    return PartialProgramOrder(nodes, edges)
\ No newline at end of file
+    return PartialProgramOrder(nodes, edges)
+
+def generate_id() -> int:
+    return int(time.time() * 1000000)

From 80523521a45e63eb17e52ce6e5f952080eb3d7c1 Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Tue, 16 Jan 2024 02:52:10 -0700
Subject: [PATCH 19/39] Update script to receive exec id from scheduler

---
 parallel-orch/run_command.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/parallel-orch/run_command.sh b/parallel-orch/run_command.sh
index 2d3597c1..9fe48f26 100755
--- a/parallel-orch/run_command.sh
+++ b/parallel-orch/run_command.sh
@@ -10,6 +10,7 @@ export TMPDIR=${6?No tmp dir given}
 export EXEC_MODE=${7?No execution mode given}
 export CMD_ID=${8?No command id given}
 export POST_EXEC_ENV=${9?No Riker env file given}
+export EXECUTION_ID=${10?No execution id given}
 
 ## KK 2023-04-24: Not sure this should be run every time we run a command
 ## GL 2023-07-08: Tests seem to pass without it
@@ -41,5 +42,5 @@ out=`head -3 $SANDBOX_DIR/upperdir/$TRACE_FILE`
 ## Assumes "${PASH_SPEC_SCHEDULER_SOCKET}" is set and exported
 
 ## Pass the proper exit code
-msg="CommandExecComplete:${CMD_ID}|Exit code:${exit_code}|Sandbox dir:${SANDBOX_DIR}|Trace file:${TRACE_FILE}|Tempdir:${TEMPDIR}"
+msg="CommandExecComplete:${CMD_ID}|Exec id:${EXECUTION_ID}|Exit code:${exit_code}|Sandbox dir:${SANDBOX_DIR}|Trace file:${TRACE_FILE}|Tempdir:${TEMPDIR}"
 daemon_response=$(pash_spec_communicate_scheduler_just_send "$msg") # Blocking step, daemon will not send response until it's safe to continue

From 19b6a19fcc19d2d0076625316e0e7929c34e8f34 Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Sun, 21 Jan 2024 04:32:47 -0500
Subject: [PATCH 20/39] adding eager killing, WIP

---
 parallel-orch/node.py                  | 24 +++++++++-
 parallel-orch/partial_program_order.py | 63 ++++++++++++++++++++------
 parallel-orch/scheduler_server.py      | 10 ++--
 parallel-orch/util.py                  |  2 +-
 4 files changed, 81 insertions(+), 18 deletions(-)

diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index 65b2ce2b..0f758412 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -1,6 +1,8 @@
 import logging
 import executor
 import trace_v2
+import util
+import signal
 from dataclasses import dataclass
 from subprocess import Popen
 from typing import Tuple
@@ -16,6 +18,18 @@ class NodeState(Enum):
     SPEC_EXECUTING = auto()
     UNSAFE = auto()
 
+def state_pstr(state: NodeState):
+    same_length_state_str = {
+        NodeState.INIT:           '  INIT',
+        NodeState.READY:          ' READY',
+        NodeState.COMMITTED:      'COMMIT',
+        NodeState.STOP:           '  STOP',
+        NodeState.SPECULATED:     'SPEC_F',
+        NodeState.EXECUTING:      '   EXE',
+        NodeState.SPEC_EXECUTING: 'SPEC_E',
+        NodeState.UNSAFE:         'UNSAFE'
+    }
+    return same_length_state_str[state]
 
 class RWSet:
 
@@ -134,6 +148,9 @@ def __str__(self):
     def __repr__(self):
         return str(self)
 
+    def pretty_state_repr(self):
+        return f'{state_pstr(self.state)} {self.cmd}'
+    
     def is_initialized(self):
         return self.state == NodeState.INIT
     
@@ -185,6 +202,11 @@ def reset_to_ready(self):
         assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING,
                               NodeState.SPECULATED]
         # Probably delete them from tmpfs too
+        process = self.exec_ctxt.process
+        if process.poll() is None:
+            # Exceptions will be handled inside the call so we don't have to worry
+            util.kill_process_tree(process.pid, sig=signal.SIGKILL)
+
         self.exec_ctxt = None
         self.exec_result = None
         self.rwset = None
@@ -253,7 +275,7 @@ def transition_from_spec_executing_to_speculated(self):
 
     def update_rw_set(self, rw_set):
         self.rwset = rw_set
-    
+
     def gather_fs_actions(self) -> RWSet:
         assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]
         sandbox_dir = self.exec_ctxt.sandbox_dir
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index 1b46b834..0786aaeb 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -2,10 +2,21 @@
 import logging
 from collections import deque
 
+PROG_LOG = '[PROG_LOG] '
+EVENT_LOG = '[EVENT_LOG] '
 
+def event_log(s):
+    logging.info(EVENT_LOG + s)
+
+def progress_log(s):
+    logging.info(PROG_LOG + s)
+    
 class PartialProgramOrder:
     frontier: set  # Set of nodes at the frontier
-    run_after: set  # Nodes that should run after certain conditions
+    # Di: I'm going to ignore this for now and implement the feature without a local data structure
+    # Later we can add this back as a caching mechanism to avoid doing RWSet
+    # intersections of files all the time
+    # run_after: "dict[NodeId, list[Node]]"  # Nodes that should run after certain conditions
     to_be_resolved: "dict[NodeId, list[Node]]"  # Mapping of nodes to lists of uncommitted nodes
     nodes: "dict[NodeId, Node]"
     adjacency: "dict[NodeId, list[NodeId]]"
@@ -16,7 +27,7 @@ def __init__(self, nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId
         self.adjacency = edges
         self.inverse_adjacency = self.init_inverse_adjacency()
         self.frontier = set()
-        self.run_after = set()
+        # self.run_after = {}
         self.to_be_resolved = {}
 
     def init_partial_order(self):
@@ -58,7 +69,7 @@ def get_executing_nodes(self):
     def get_spec_executing_nodes(self):
         return [node for node in self.nodes.values() if node.is_spec_executing()]
     
-    def get_executing_normal_and_speculated_nodes(self):
+    def get_executing_normal_and_spec_nodes(self):
         return [node for node in self.nodes.values() if node.is_executing() or node.is_spec_executing()]
     
     def get_speculated_nodes(self):
@@ -78,17 +89,12 @@ def log_info(self):
 
     def log_state(self):
         for node in self.nodes.values():
-            logging.info(f"Node {node.id_}: {node.state}")
+            progress_log(node.pretty_state_repr())
+        progress_log('')
 
     def get_schedulable_nodes(self) -> list[NodeId]:
         return [node.id_ for node in self.get_ready_nodes()]
             
-    def schedule_work(self, node_id: NodeId, env_file: str):
-        self.get_node(node_id).start_executing(env_file)
-
-    def schedule_spec_work(self, node_id: NodeId, env_file: str):
-        self.get_node(node_id).start_spec_executing(env_file)
-    
     ## Returns the next non-committed normal node
     def progress_frontier(self) -> "list[NodeId]":
         return self.get_next_frontier_nodes(self.get_frontier())
@@ -196,18 +202,37 @@ def adjust_to_be_resolved_dict(self):
     def valid(self):
         return True
 
-    def has_fs_deps(self, node_id: NodeId):
-        node_of_interest : Node = self.get_node(node_id)
-        for node in self.get_executing_normal_and_speculated_nodes():
+    def fetch_fs_actions(self):
+        for node in self.get_executing_normal_and_spec_nodes():
             node.gather_fs_actions()
+            
+    def _has_fs_deps(self, node_id: NodeId):
+        node_of_interest : Node = self.get_node(node_id)
         for nid in self.to_be_resolved[node_id]:
             node: Node = self.get_node(nid)
             if node.get_rw_set().has_conflict(node_of_interest.get_rw_set()):
                 return True
         return False
+
+    # TODO: It's currently designed this way to avoid reading trace file all the time
+    # When we have complex caching code for this we can make this go away
+    def has_fs_deps(self, node_id:NodeId):
+        self.fetch_fs_actions()
+        self._has_fs_deps(node_id)
+    
+    ### external handler events ###
+    
+    def schedule_work(self, node_id: NodeId, env_file: str):
+        event_log("schedule_work")
+        self.get_node(node_id).start_executing(env_file)
+
+    def schedule_spec_work(self, node_id: NodeId, env_file: str):
+        event_log("schedule_spec")
+        self.get_node(node_id).start_spec_executing(env_file)
     
     def handle_complete(self, node_id: NodeId, has_pending_wait: bool,
                         current_env: str):
+        event_log(f"handle_complete {node_id}")
         node = self.get_node(node_id)
         # TODO: complete the state matching
         if node.is_executing():
@@ -224,10 +249,13 @@ def handle_complete(self, node_id: NodeId, has_pending_wait: bool,
                 if has_pending_wait:
                     node.commit_speculated()
                     self.adjust_to_be_resolved_dict()
+        elif node.is_ready():
+            pass        
         else:
             assert False
     
     def handle_wait(self, node_id: NodeId, env_file: str):
+        event_log(f"handle_wait {node_id}")
         node = self.get_node(node_id)
 
         # Invalid state check
@@ -274,3 +302,12 @@ def handle_wait(self, node_id: NodeId, env_file: str):
         # self.schedule_work_single_node()
         # self.schedule_work_all_nodes()
         
+    def eager_fs_killing(self):
+        event_log("try to eagerly kill conflicted speculation")
+        to_be_killed = []
+        self.fetch_fs_actions()
+        for node in self.get_spec_executing_nodes():
+            if self._has_fs_deps(node.id_):
+                to_be_killed.append(node)
+        for node in to_be_killed:
+            node.reset_to_ready()
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index 82d543cd..d303f0ec 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -186,11 +186,15 @@ def run(self):
         
         ## The second command should be the partial order init
         self.process_next_cmd()
-        
+
+        self.partial_program_order.log_state()
         while not self.done:
             self.process_next_cmd()
+            self.partial_program_order.log_state()
             self.schedule_work()
-
+            self.partial_program_order.log_state()
+            self.partial_program_order.eager_fs_killing()
+            self.partial_program_order.log_state()
         self.socket.close()
         self.shutdown()
 
@@ -201,7 +205,7 @@ def shutdown(self):
         self.terminate_pending_commands()
         
     def terminate_pending_commands(self):
-        for node in self.partial_program_order.get_executing_normal_and_speculated_nodes():
+        for node in self.partial_program_order.get_executing_normal_and_spec_nodes():
             proc, _trace_file, _stdout, _stderr, _variable_file, _ = node.get_main_sandbox()
             logging.debug(f'Killing: {proc}')
             # proc.terminate()
diff --git a/parallel-orch/util.py b/parallel-orch/util.py
index af01273f..a4c725b0 100644
--- a/parallel-orch/util.py
+++ b/parallel-orch/util.py
@@ -257,4 +257,4 @@ def parse_partial_program_order_from_file(file_path: str):
 
     logging.info(f"Nodes|{','.join([str(node) for node in nodes])}")
     logging.info(f"Edges|{edges}")
-    return PartialProgramOrder(nodes, edges)
\ No newline at end of file
+    return PartialProgramOrder(nodes, edges)

From f96b81613159104ccc5643a249b552ed01141e4b Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Sun, 21 Jan 2024 04:40:09 -0500
Subject: [PATCH 21/39] Fix resetting of succeeding nodes on env change

---
 parallel-orch/partial_program_order.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index bdf80ab0..11a0167b 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -232,10 +232,11 @@ def handle_complete(self, node_id: NodeId, has_pending_wait: bool,
             assert False
 
     def reset_succeeding_nodes_and_start_exec(self, node_id: NodeId, env_file: str):
-        for uncommitted_node_id in self.get_all_next_uncommitted(node_id):
+        for uncommitted_node_id in self.get_all_next(node_id):
             uncommitted_node = self.get_node(uncommitted_node_id)
-            uncommitted_node.reset_to_ready()
-            uncommitted_node.start_spec_executing(env_file)
+            if uncommitted_node.is_spec_executing():
+                uncommitted_node.reset_to_ready()
+            # uncommitted_node.start_spec_executing(env_file)
 
     def handle_wait(self, node_id: NodeId, env_file: str):
         node = self.get_node(node_id)

From ff736f7789bce3fbee22467c49b0049a3e62cc5a Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Mon, 5 Feb 2024 13:13:38 -0500
Subject: [PATCH 22/39] Create HSProg class and ConcreteNode class. Also remove
 trailing whitespace.

---
 parallel-orch/node.py                  |  86 ++++++++++++------
 parallel-orch/partial_program_order.py | 119 ++++++++++++-------------
 parallel-orch/util.py                  |  40 ++++++---
 3 files changed, 146 insertions(+), 99 deletions(-)

diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index 36747ac5..6b83873e 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -63,15 +63,15 @@ def get_conflict(self, other: 'RWSet') -> set:
         return self.write_set.intersection(other.read_set).union(
             self.read_set.intersection(other.write_set)).union(
                 self.write_set.intersection(other.write_set))
-        
+
     def __str__(self):
         return f"RW(R:{self.get_read_set()}, W:{self.get_write_set()})"
 
 
 class NodeId:
-    
+
     #TODO: Implement iteration support
-    
+
     def __init__(self, id_: int):
         self.id_ = id_
 
@@ -103,6 +103,27 @@ def __gt__(self, obj):
     def parse_node_id(node_id_str: str):
         return NodeId(int(node_id_str))
 
+class AbstractNode:
+    def __init__(self, node_id: NodeId):
+        self.node_id = node_id
+
+class HSBasicBlock:
+    def __init__(self, nodes):
+        self.nodes = nodes
+
+class HSProg:
+    abstract_nodes: "dict[NodeId, AbstractNode]"
+    adjacency: "dict[NodeId, list[NodeId]]"
+    inverse_adjacency: "dict[NodeId, list[NodeId]]"
+    def __init__(self, abstract_nodes: dict[NodeId, AbstractNode],
+                 edges: dict[NodeId, list[NodeId]]):
+        self.abstract_nodes = abstract_nodes
+        self.adjacency = edges
+        self.inverse_adjacency = util.invert_graph(abstract_nodes, edges)
+
+
+
+
 @dataclass
 class ExecCtxt:
     process: Popen
@@ -117,12 +138,15 @@ class ExecCtxt:
 class ExecResult:
     exit_code: int
     proc_id: int
-    
-    
+
+@dataclass
 class Node:
     id_: NodeId
     cmd: str
     asts: "list[AstNode]"
+
+class ConcreteNode:
+    abstract_node: AbstractNode
     state: NodeState
     # Used for identifying the most recent valid execution
     exec_id: int
@@ -139,11 +163,9 @@ class Node:
     # background_sandbox: Sandbox
     exec_ctxt: ExecCtxt
     exec_result: ExecResult
-    
-    def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"):
-        self.id_ = node_id
-        self.cmd = cmd
-        self.asts = asts
+
+    def __init__(self, node: Node):
+        self.abstract_node = node
         self.state = NodeState.INIT
         self.tracefile = None
         self.rwset = None
@@ -154,34 +176,46 @@ def __init__(self, node_id: NodeId, cmd: str, asts: "list[AstNode]"):
 
     def __str__(self):
         return f'Node(id:{self.id_}, cmd:{self.cmd}, state:{self.state}, rwset:{self.rwset}, to_be_resolved_snapshot:{self.to_be_resolved_snapshot}, wait_env_file:{self.wait_env_file}, exec_ctxt:{self.exec_ctxt})'
-    
+
     def __repr__(self):
         return str(self)
 
+    @property
+    def id_(self):
+        return self.abstract_node.id_
+
+    @property
+    def cmd(self):
+        return self.abstract_node.cmd
+
+    @property
+    def asts(self):
+        return self.abstract_node.asts
+
     def pretty_state_repr(self):
         return f'{state_pstr(self.state)} {self.cmd}'
-    
+
     def is_initialized(self):
         return self.state == NodeState.INIT
-    
+
     def is_ready(self):
         return self.state == NodeState.READY
-    
+
     def is_committed(self):
         return self.state == NodeState.COMMITTED
-    
+
     def is_stopped(self):
         return self.state == NodeState.STOP
-    
+
     def is_speculated(self):
         return self.state == NodeState.SPECULATED
 
     def is_executing(self):
         return self.state == NodeState.EXECUTING
-    
+
     def is_spec_executing(self):
         return self.state == NodeState.SPEC_EXECUTING
-    
+
     def is_unsafe(self):
         return self.state == NodeState.UNSAFE
 
@@ -202,7 +236,7 @@ def execution_outcome(self) -> Tuple[int, str, str]:
     ##                                      ##
     ##          Transition Functions        ##
     ##                                      ##
-    
+
     def transition_from_init_to_ready(self):
         assert self.state == NodeState.INIT
         self.state = NodeState.READY
@@ -215,16 +249,16 @@ def kill(self):
     def reset_to_ready(self):
         assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING,
                               NodeState.SPECULATED]
-        
+
         logging.info(f"Resetting node {self.id_} to ready {self.exec_id}")
-        # We reset the exec id so if we receive a message 
+        # We reset the exec id so if we receive a message
         # due to a race condition, we will ignore it.
         self.exec_id = None
-        
+
         # TODO: make this more sophisticated
         if self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]:
             self.kill()
-        
+
         # Probably delete them from tmpfs too
         process = self.exec_ctxt.process
         if process.poll() is None:
@@ -245,7 +279,7 @@ def start_spec_executing(self, env_file):
         assert self.state == NodeState.READY
         self.start_command(env_file, speculate=True)
         self.state = NodeState.SPEC_EXECUTING
-        
+
     def commit_frontier_execution(self):
         assert self.state == NodeState.EXECUTING
         self.exec_result = ExecResult(self.exec_ctxt.process.pid, self.exec_ctxt.process.returncode)
@@ -305,8 +339,8 @@ def has_env_conflict_with(self, other_env) -> bool:
         if self.exec_ctxt.pre_env_file == other_env:
             return False
 
-        ignore_vars = set(['RANDOM'])  
-        
+        ignore_vars = set(['RANDOM'])
+
         re_scalar_string = re.compile(r'declare (?:-x|--)? (\w+)="([^"]*)"')
         re_scalar_int = re.compile(r'declare -i (\w+)="(\d+)"')
         re_array = re.compile(r'declare -a (\w+)=(\([^)]+\))')
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index 475221bb..a3f545fd 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -1,10 +1,9 @@
-from node import NodeId, Node
+from node import NodeId, Node, ConcreteNode, HSProg
 import logging
 from collections import deque
 
 PROG_LOG = '[PROG_LOG] '
 EVENT_LOG = '[EVENT_LOG] '
-DEBUG_LOG = '[DEBUG_LOG] '
 
 def event_log(s):
     logging.info(EVENT_LOG + s)
@@ -12,9 +11,6 @@ def event_log(s):
 def progress_log(s):
     logging.info(PROG_LOG + s)
 
-def debug_log(s):
-    logging.debug(DEBUG_LOG + s)
-    
 class PartialProgramOrder:
     frontier: set  # Set of nodes at the frontier
     # Di: I'm going to ignore this for now and implement the feature without a local data structure
@@ -22,20 +18,18 @@ class PartialProgramOrder:
     # intersections of files all the time
     # run_after: "dict[NodeId, list[Node]]"  # Nodes that should run after certain conditions
     to_be_resolved: "dict[NodeId, list[Node]]"  # Mapping of nodes to lists of uncommitted nodes
-    nodes: "dict[NodeId, Node]"
-    adjacency: "dict[NodeId, list[NodeId]]"
-    inverse_adjacency: "dict[NodeId, list[NodeId]]"
-    
-    def __init__(self, nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId]]"):
-        self.nodes = nodes
-        self.adjacency = edges
-        self.inverse_adjacency = self.init_inverse_adjacency()
+    concrete_nodes: "dict[NodeId, Node]"
+
+    def __init__(self, abstract_nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId]]"):
+        self.hsprog = HSProg(abstract_nodes, edges)
+        self.concrete_nodes = {node_id: ConcreteNode(ab_node) for node_id, ab_node
+                               in abstract_nodes.items()}
         self.frontier = set()
         # self.run_after = {}
         self.to_be_resolved = {}
 
     def init_partial_order(self):
-        for node_id, node in self.nodes.items():
+        for node_id, node in self.concrete_nodes.items():
             if node.is_initialized():
                 node.transition_from_init_to_ready()
 
@@ -45,63 +39,68 @@ def init_partial_order(self):
         self.frontier = self.get_standard_source_nodes()
         # TODO: Implement the rest of the partial order initialization
 
+    @property
+    def abstract_nodes(self):
+        return self.hsprog.abstract_nodes
+
+    @property
+    def adjacency(self):
+        return self.hsprog.adjacency
+
+    @property
+    def inverse_adjacency(self):
+        return self.hsprog.inverse_adjacency
+        
     def commit_node(self, node):
         # Logic to handle committing a node
         node.transition_to_committed()
-        # Maybe update dependencies here 
+        # Maybe update dependencies here
         # etc.
 
-    def init_inverse_adjacency(self):
-        inverse_adjacency = {i: [] for i in self.nodes.keys()}
-        for from_id, to_ids in self.adjacency.items():
-            for to_id in to_ids:
-                inverse_adjacency[to_id].append(from_id)
-        return inverse_adjacency
-    
     def get_node(self, node_id: NodeId) -> Node:
-        return self.nodes[node_id]
+        return self.concrete_nodes[node_id]
 
     def get_all_nodes(self):
-        return [node for node in self.nodes.values()]
-    
+        return [node for node in self.concrete_nodes.values()]
+
     def get_committed_nodes(self):
-        return [node for node in self.nodes.values() if node.is_committed()]
-    
+        return [node for node in self.concrete_nodes.values() if node.is_committed()]
+
     def get_ready_nodes(self):
-        return [node for node in self.nodes.values() if node.is_ready()]
-    
+        return [node for node in self.concrete_nodes.values() if node.is_ready()]
+
     def get_executing_nodes(self):
-        return [node for node in self.nodes.values() if node.is_executing()]
-    
+        return [node for node in self.concrete_nodes.values() if node.is_executing()]
+
     def get_spec_executing_nodes(self):
-        return [node for node in self.nodes.values() if node.is_spec_executing()]
-    
+        return [node for node in self.concrete_nodes.values() if node.is_spec_executing()]
+
     def get_executing_normal_and_spec_nodes(self):
-        return [node for node in self.nodes.values() if node.is_executing() or node.is_spec_executing()]
-    
+        return [node for node in self.concrete_nodes.values() if node.is_executing() or node.is_spec_executing()]
+
     def get_speculated_nodes(self):
-        return [node for node in self.nodes.values() if node.is_speculated()]
-    
+        return [node for node in self.concrete_nodes.values() if node.is_speculated()]
+
     def get_uncommitted_nodes(self):
-        return [node for node in self.nodes.values() if not node.is_committed()]
-    
+        return [node for node in self.concrete_nodes.values() if not node.is_committed()]
+
     def get_frontier(self):
         return self.frontier
-    
+
     def log_info(self):
-        logging.info(f"Nodes: {self.nodes}")
+        logging.info(f"Nodes: {self.concrete_nodes}")
         logging.info(f"Adjacency: {self.adjacency}")
         logging.info(f"Inverse adjacency: {self.inverse_adjacency}")
         self.log_state()
 
     def log_state(self):
-        for node in self.nodes.values():
+        for node in self.concrete_nodes.values():
             progress_log(node.pretty_state_repr())
         progress_log('')
 
     def get_schedulable_nodes(self) -> list[NodeId]:
         return [node.id_ for node in self.get_ready_nodes()]
-            
+
     ## Returns the next non-committed normal node
     def progress_frontier(self) -> "list[NodeId]":
         return self.get_next_frontier_nodes(self.get_frontier())
@@ -111,19 +110,19 @@ def get_next_nodes(self, node_id:NodeId) -> "list[NodeId]":
 
     def get_prev_nodes(self, node_id:NodeId) -> "list[NodeId]":
         return self.inverse_adjacency[node_id][:]
-    
+
     def get_source_nodes(self) -> "list[NodeId]":
         sources = set()
         for to_id, from_ids in self.inverse_adjacency.items():
             if len(from_ids) == 0:
                 sources.add(to_id)
         return list(sources)
-    
+
     def get_standard_source_nodes(self) -> list:
         source_nodes = self.get_source_nodes()
         # TODO: Filter out loop nodes
         # return self.filter_standard_nodes(source_nodes)
-        return source_nodes    
+        return source_nodes
 
     def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]":
         # TODO: filter non-loop nodes
@@ -138,7 +137,7 @@ def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]":
                 continue
 
             visited.add(current_node_id)
-            current_node = self.nodes.get(current_node_id)
+            current_node = self.concrete_nodes.get(current_node_id)
 
             if not current_node.is_committed():
                 if first_non_committed_depth is None:
@@ -156,7 +155,7 @@ def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]":
                         to_visit.append((neighbor, depth + 1))  # Increase depth for neighbors
 
         return non_committed_nodes
-    
+
     def get_all_next(self, current_node_id: NodeId, visited=None) -> "set[NodeId]":
         all_next = set()
         def reachable_rec(cur, reachable):
@@ -181,24 +180,24 @@ def reachable_rec(cur, reachable):
         for n in self.get_prev_nodes(current_node_id):
             reachable_rec(n, all_prev)
         return all_prev
-    
+
     def get_all_next_uncommitted(self, node_id: NodeId) -> "set[NodeId]":
         next = self.get_all_next(node_id)
-        return set([node for node in next if not self.nodes[node].is_committed()])
-    
+        return set([node for node in next if not self.concrete_nodes[node].is_committed()])
+
     def get_all_previous_uncommitted(self, node_id: NodeId) -> "set[NodeId]":
         previous = self.get_all_previous(node_id)
-        return set([node for node in previous if not self.nodes[node].is_committed()])
+        return set([node for node in previous if not self.concrete_nodes[node].is_committed()])
 
     def adjust_to_be_resolved_dict_entry(self, node_id: NodeId):
-        node = self.nodes.get(node_id)
+        node = self.concrete_nodes.get(node_id)
         if node.is_committed():
             self.to_be_resolved[node_id] = []
         elif node.is_ready():
             self.to_be_resolved[node_id] = self.get_all_previous_uncommitted(node_id)
 
     def init_to_be_resolved_dict(self):
-        for node_id in self.nodes:
+        for node_id in self.concrete_nodes:
             self.adjust_to_be_resolved_dict_entry(node_id)
 
     def adjust_to_be_resolved_dict(self):
@@ -216,7 +215,7 @@ def valid(self):
     def fetch_fs_actions(self):
         for node in self.get_executing_normal_and_spec_nodes():
             node.gather_fs_actions()
-            
+
     def _has_fs_deps(self, node_id: NodeId):
         node_of_interest : Node = self.get_node(node_id)
         for nid in self.to_be_resolved[node_id]:
@@ -230,9 +229,9 @@ def _has_fs_deps(self, node_id: NodeId):
     def has_fs_deps(self, node_id:NodeId):
         self.fetch_fs_actions()
         self._has_fs_deps(node_id)
-    
+
     ### external handler events ###
-    
+
     def schedule_work(self, node_id: NodeId, env_file: str):
         event_log("schedule_work")
         self.get_node(node_id).start_executing(env_file)
@@ -241,7 +240,7 @@ def schedule_spec_work(self, node_id: NodeId, env_file: str):
         event_log("schedule_spec")
         self.adjust_to_be_resolved_dict_entry(node_id)
         self.get_node(node_id).start_spec_executing(env_file)
-    
+
     def handle_complete(self, node_id: NodeId, has_pending_wait: bool,
                         current_env: str):
         event_log(f"handle_complete {node_id}")
@@ -279,7 +278,7 @@ def handle_wait(self, node_id: NodeId, env_file: str):
         if node.is_committed() or node.is_unsafe() or node.is_initialized():
             logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}')
             raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}')
-        
+
 
         if node.is_ready():
             node.start_executing(env_file)
@@ -312,7 +311,7 @@ def handle_wait(self, node_id: NodeId, env_file: str):
         else:
             logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}')
             raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}')
-        
+
     def eager_fs_killing(self):
         event_log("try to eagerly kill conflicted speculation")
         to_be_killed = []
diff --git a/parallel-orch/util.py b/parallel-orch/util.py
index dc0abdcc..b9947233 100644
--- a/parallel-orch/util.py
+++ b/parallel-orch/util.py
@@ -12,6 +12,11 @@
 from node import Node, NodeId
 from partial_program_order import PartialProgramOrder
 
+DEBUG_LOG = '[DEBUG_LOG] '
+
+def debug_log(s):
+    logging.debug(DEBUG_LOG + s)
+
 def ptempfile():
     fd, name = tempfile.mkstemp(dir=config.PASH_SPEC_TMP_PREFIX)
     ## TODO: Get a name without opening the fd too if possible
@@ -42,11 +47,11 @@ def init_unix_socket(socket_file: str) -> socket.socket:
     logging.debug("SocketManager: Created socket")
 
     sock.bind(server_address)
-    logging.debug("SocketManager: Successfully bound to socket")    
+    logging.debug("SocketManager: Successfully bound to socket")
 
     ## TODO: Check if we need to configure the backlog
-    sock.listen()    
-    logging.debug("SocketManager: Listenting on socket")    
+    sock.listen()
+    logging.debug("SocketManager: Listenting on socket")
 
     return sock
 
@@ -61,7 +66,7 @@ def socket_get_next_cmd(sock: socket.socket) -> "tuple[socket.socket, str]" :
     ##
     ## We need to ensure that we read a command at once or the command was empty (only relevant in the first invocation)
     assert(str_data.endswith("\n") or str_data == "")
-    
+
     return (connection, str_data)
 
 def socket_respond(connection: socket.socket, message: str):
@@ -83,7 +88,7 @@ def parse_env_string_to_dict(content):
     result = {key: value for key, value in scalar_vars_string}
     result.update({key: int(value) for key, value in scalar_vars_int})
     result.update({key: value for key, value in array_vars})
-    
+
     return result
 
 def compare_dicts(dict1, dict2):
@@ -114,19 +119,19 @@ def set_named_timestamp(action: str, node=None, key=None):
     if key is None:
         key = f"{action}{',' + str(node) if node is not None else ''}"
     config.NAMED_TIMESTAMPS[key] = time.time()
-    
+
 def invalidate_named_timestamp(action: str, node=None, key=None):
     if key is None:
         key = f"{action}{',' + str(node) if node is not None else ''}"
     del config.NAMED_TIMESTAMPS[key]
-    
+
 def log_time_delta_from_start_and_set_named_timestamp(module: str, action: str, node=None, key=None):
     try:
         set_named_timestamp(action, node, key)
         logging.info(f">|{module}|{action}{',' + str(node) if node is not None else ''}|Time from start:{to_milliseconds_str(time.time() - config.START_TIME)}")
     except KeyError:
         logging.error(f"Named timestamp {key} already exists")
-    
+
 def log_time_delta_from_named_timestamp(module: str, action: str, node=None, key=None, invalidate=True):
     try:
         if key is None:
@@ -147,7 +152,7 @@ def get_all_child_processes(pid):
         parent = psutil.Process(pid)
     except psutil.NoSuchProcess:
         return []
-    
+
     children = parent.children(recursive=True)
     parent_of_parent = parent.parent()
     logging.critical("PARENT_PROCESS: " + str(parent_of_parent))
@@ -240,7 +245,7 @@ def parse_partial_program_order_from_file(file_path: str):
     edge_lines = lines[loop_context_end:]
     logging.debug(f'Edges: {edge_lines}')
 
-    nodes = {}
+    ab_nodes = {}
     for i in range(number_of_nodes):
         file_path = f'{cmds_directory}/{i}'
         cmd, asts = parse_cmd_from_file(file_path)
@@ -248,16 +253,25 @@ def parse_partial_program_order_from_file(file_path: str):
         # nodes[NodeId(i)] = Node(NodeId(i), cmd,
         #                         asts=asts,
         #                         loop_context=LoopStack(loop_ctx))
-        nodes[NodeId(i)] = Node(NodeId(i), cmd, asts=asts)
+        ab_nodes[NodeId(i)] = Node(NodeId(i), cmd, asts=asts)
 
     edges = {NodeId(i) : [] for i in range(number_of_nodes)}
     for edge_line in edge_lines:
         from_id, to_id = parse_edge_line(edge_line)
         edges[NodeId(from_id)].append(NodeId(to_id))
 
-    logging.info(f"Nodes|{','.join([str(node) for node in nodes])}")
+    logging.info(f"Nodes|{','.join([str(node) for node in ab_nodes])}")
     logging.info(f"Edges|{edges}")
-    return PartialProgramOrder(nodes, edges)
+    return PartialProgramOrder(ab_nodes, edges)
 
 def generate_id() -> int:
     return int(time.time() * 1000000)
+
+# nodes is iterable of node
+# edges is dict[node, list[node]]
+def invert_graph(nodes, edges):
+    graph = {n: [] for n in nodes}
+    for from_id, to_ids in edges.items():
+        for to_id in to_ids:
+            graph[to_id].append(from_id)
+    return graph

From 9151f34d87bb6353d2a3a7e11e5f8e18160d0a11 Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Tue, 6 Feb 2024 00:13:50 -0500
Subject: [PATCH 23/39] minimal working loop

---
 parallel-orch/executor.py              |  18 +--
 parallel-orch/node.py                  | 183 +++++++++++++++++++++----
 parallel-orch/partial_program_order.py | 165 +++++++++++++---------
 parallel-orch/scheduler_server.py      |  43 +++---
 parallel-orch/util.py                  |  12 +-
 5 files changed, 292 insertions(+), 129 deletions(-)

diff --git a/parallel-orch/executor.py b/parallel-orch/executor.py
index 52bc3d9f..2ec5e962 100644
--- a/parallel-orch/executor.py
+++ b/parallel-orch/executor.py
@@ -8,23 +8,23 @@
 # and traces them with Riker. 
 # All commands are run inside an overlay sandbox.
 
-def async_run_and_trace_command_return_trace(command, node_id, execution_id, pre_execution_env_file, speculate_mode=False):
+def async_run_and_trace_command_return_trace(command, concrete_node_id, execution_id, pre_execution_env_file, speculate_mode=False):
     trace_file = util.ptempfile()
     stdout_file = util.ptempfile()
     stderr_file = util.ptempfile()
     post_execution_env_file = util.ptempfile()
     sandbox_dir, tmp_dir = util.create_sandbox()
-    logging.debug(f'Scheduler: Stdout file for: {node_id} is: {stdout_file}')
-    logging.debug(f'Scheduler: Stderr file for: {node_id} is: {stderr_file}')
-    logging.debug(f'Scheduler: Trace file for: {node_id}: {trace_file}')
-    process = async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode)
+    logging.debug(f'Scheduler: Stdout file for: {concrete_node_id} is: {stdout_file}')
+    logging.debug(f'Scheduler: Stderr file for: {concrete_node_id} is: {stderr_file}')
+    logging.debug(f'Scheduler: Trace file for: {concrete_node_id}: {trace_file}')
+    process = async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, trace_file, concrete_node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode)
     return process, trace_file, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir
 
-def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, execution_id, node_id, pre_execution_env_file):
-    process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, execution_id, node_id, pre_execution_env_file, speculate_mode=True)
+def async_run_and_trace_command_return_trace_in_sandbox_speculate(command, execution_id, concrete_node_id, pre_execution_env_file):
+    process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir = async_run_and_trace_command_return_trace(command, execution_id, concrete_node_id, pre_execution_env_file, speculate_mode=True)
     return process, trace_file, stdout_file, stderr_file, post_execution_env_file, sandbox_dir
 
-def async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, trace_file, node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False):
+def async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, trace_file, concrete_node_id, stdout_file, stderr_file, pre_execution_env_file, post_execution_env_file, sandbox_dir, tmp_dir, speculate_mode=False):
     ## Call Riker to execute the command
     run_script = f'{config.PASH_SPEC_TOP}/parallel-orch/run_command.sh'
     args = ["/bin/bash", run_script, command, trace_file, stdout_file, pre_execution_env_file, sandbox_dir, tmp_dir]
@@ -32,7 +32,7 @@ def async_run_and_trace_command_return_trace_in_sandbox(command, execution_id, t
         args.append("speculate")
     else:
         args.append("standard")
-    args.append(str(node_id))
+    args.append(str(concrete_node_id))
     args.append(post_execution_env_file)
     args.append(str(execution_id))
     # Save output to temporary files to not saturate the memory
diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index 6b83873e..914f5f44 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -1,3 +1,4 @@
+from itertools import chain
 import logging
 import re
 import executor
@@ -69,9 +70,6 @@ def __str__(self):
 
 
 class NodeId:
-
-    #TODO: Implement iteration support
-
     def __init__(self, id_: int):
         self.id_ = id_
 
@@ -103,27 +101,6 @@ def __gt__(self, obj):
     def parse_node_id(node_id_str: str):
         return NodeId(int(node_id_str))
 
-class AbstractNode:
-    def __init__(self, node_id: NodeId):
-        self.node_id = node_id
-
-class HSBasicBlock:
-    def __init__(self, nodes):
-        self.nodes = nodes
-
-class HSProg:
-    abstract_nodes: "dict[NodeId, AbstractNode]"
-    adjacency: "dict[NodeId, list[NodeId]]"
-    inverse_adjacency: "dict[NodeId, list[NodeId]]"
-    def __init__(self, abstract_nodes: dict[NodeId, AbstractNode],
-                 edges: dict[NodeId, list[NodeId]]):
-        self.abstract_nodes = abstract_nodes
-        self.adjacency = edges
-        self.inverse_adjacency = util.invert_graph(abstract_nodes, edges)
-
-
-
-
 @dataclass
 class ExecCtxt:
     process: Popen
@@ -139,14 +116,59 @@ class ExecResult:
     exit_code: int
     proc_id: int
 
+class LoopStack:
+    def __init__(self, loop_contexts_or_iters=None):
+        if loop_contexts_or_iters is None:
+            self.loops = []
+        else:
+            self.loops = loop_contexts_or_iters
+
+    def __repr__(self):
+        ## TODO: Represent it using 'it', 'it0', 'it1', etc
+        ##       or -(iters)- in front of it.
+        output = "-".join([str(it) for it in self.loops])
+        return output
+    def __eq__(self, other):
+        return self.loops == other.loops
+
 @dataclass
 class Node:
     id_: NodeId
     cmd: str
     asts: "list[AstNode]"
+    loop_context: LoopStack
+
+    def __init__(self, id_, cmd, asts, loop_context=None):
+        self.id_ = id_
+        self.cmd = cmd
+        self.asts = asts
+        self.loop_context = loop_context if loop_context else LoopStack()
+
+class ConcreteNodeId:
+    def __init__(self, node_id: NodeId, loop_iters = list()):
+        self.node_id = node_id
+        self.loop_iters = tuple(loop_iters)
+
+    def __repr__(self):
+        return f'cnid({self.node_id.id_})'
+        
+    def __hash__(self):
+        return hash((self.node_id, self.loop_iters))
+
+    def __eq__(self, other):
+        return self.node_id == other.node_id and self.loop_iters == other.loop_iters
+
+    def __str__(self):
+        return f'{self.node_id}@' + ''.join(['-' + str(n) for n in self.loop_iters])
 
+    @staticmethod
+    def parse(input_str):
+        node_id_str, loop_iters_str = input_str.split('@')
+        return ConcreteNodeId(NodeId(int(node_id_str)), [int(cnt) for cnt in loop_iters_str.split('-')[1:]])
+    
 class ConcreteNode:
-    abstract_node: AbstractNode
+    cnid: ConcreteNodeId
+    abstract_node: Node
     state: NodeState
     # Used for identifying the most recent valid execution
     exec_id: int
@@ -164,7 +186,8 @@ class ConcreteNode:
     exec_ctxt: ExecCtxt
     exec_result: ExecResult
 
-    def __init__(self, node: Node):
+    def __init__(self, cnid: ConcreteNodeId, node: Node):
+        self.cnid = cnid
         self.abstract_node = node
         self.state = NodeState.INIT
         self.tracefile = None
@@ -226,7 +249,7 @@ def start_command(self, env_file: str, speculate=False):
         execute_func = executor.async_run_and_trace_command_return_trace
         # Set the execution id
         self.exec_id = util.generate_id()
-        self.exec_ctxt = ExecCtxt(*execute_func(cmd, self.id_, self.exec_id, env_file))
+        self.exec_ctxt = ExecCtxt(*execute_func(cmd, self.cnid, self.exec_id, env_file))
 
     def execution_outcome(self) -> Tuple[int, str, str]:
         assert self.exec_result is not None
@@ -364,3 +387,109 @@ def parse_env(content):
         with open(other_env, 'r') as file:
             other_env_vars = parse_env(file.read())
         return node_env_vars != other_env_vars
+
+    
+class HSBasicBlock:
+    def __init__(self, nodes: list[Node]):
+        if len(nodes) == 0:
+            raise ValueError('basic block size 0')
+        self.nodes = nodes
+
+    def __str__(self):
+        return ''.join([node.cmd for node in self.nodes])
+
+    @property
+    def loop_context(self):
+        return self.nodes[0].loop_context
+
+    @property
+    def node_ids(self):
+        return [node.id_ for node in self.nodes]
+
+    def get_node(self, node_id: NodeId) -> Node:
+        nodes = [node for node in self.nodes if node.id_ == node_id]
+        assert len(nodes) == 1
+        return nodes[0]
+        
+class HSProg:
+    abstract_nodes: "dict[NodeId, Node]"
+    adjacency: "dict[NodeId, list[NodeId]]"
+    inverse_adjacency: "dict[NodeId, list[NodeId]]"
+    basic_blocks: list[HSBasicBlock] = []
+    block_adjacency: "dict[int, list[int]]"
+    BB_ENTER = -1
+    BB_EXIT = -2
+
+    def __init__(self, abstract_nodes: dict[NodeId, Node],
+                 edges: dict[NodeId, list[NodeId]]):
+        self.abstract_nodes = abstract_nodes
+        self.adjacency = edges
+        self.inverse_adjacency = util.invert_graph(abstract_nodes, edges)
+        self.construct_basic_blocks()
+        util.debug_log(str(self))
+
+    def construct_basic_blocks(self):
+        node_list = []
+        block_id = LoopStack()
+        for node in self.abstract_nodes.values():
+            if node.loop_context == block_id:
+                node_list.append(node)
+            else:
+                basic_block = HSBasicBlock(node_list)
+                self.basic_blocks.append(basic_block)
+                node_list = [node]
+                block_id = node.loop_context
+        basic_block = HSBasicBlock(node_list)
+        self.basic_blocks.append(basic_block)
+        if len(self.basic_blocks) == 0:
+            raise ValueError('empty hsprog')
+
+        # TODO: the algorithm here is wrong. Counterexample:
+        # echo 1
+        # for i in {1..n}; do
+        #   echo 2
+        # done
+        # for i in {1..m}; do
+        #   echo 3
+        # done
+        # echo 4
+        #
+        # Here echo 1 can go to echo 2, echo 3, or echo 4
+        self.block_adjacency = {}
+        prev_blocks = {tuple(): self.basic_blocks[0]}
+        for bb_id, bb in enumerate(self.basic_blocks):
+            # the fallthrough edge
+            if bb_id != len(self.basic_blocks) - 1:
+                self.block_adjacency[bb_id] = [bb_id + 1]
+            else:
+                self.block_adjacency[bb_id] = [HSProg.BB_EXIT]
+                break
+
+            for next_bb_id in chain(range(bb_id + 1, len(self.basic_blocks)),
+                                    range(0, bb_id + 1)):
+                next_bb = self.basic_blocks[next_bb_id]
+                if next_bb.loop_context == bb.loop_context:
+                    self.block_adjacency[bb_id].append(next_bb_id)
+                    break
+            else:
+                raise ValueError('no jump block')
+
+    def is_start_of_block(self, node_id: NodeId):
+        for bb in self.basic_blocks:
+            bb : HSBasicBlock
+            if bb.nodes[0].id_ == node_id:
+                return True
+        return False
+
+    def find_basic_block(self, node_id: NodeId):
+        for bb in self.basic_blocks:
+            bb : HSBasicBlock
+            for node in bb.nodes:
+                if node.id_ == node_id:
+                    return bb
+        raise ValueError('no such node_id')
+    
+    def __str__(self):
+        return 'prog:\n' + '\n'.join(
+            [f'block {i}:\n' + str(bb) + f'goto block {self.block_adjacency[i]}\n' for i, bb in enumerate(self.basic_blocks)])
+    
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index a3f545fd..03da8a47 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -1,5 +1,6 @@
-from node import NodeId, Node, ConcreteNode, HSProg
+from node import NodeId, Node, ConcreteNodeId, ConcreteNode, HSProg, HSBasicBlock
 import logging
+import util
 from collections import deque
 
 PROG_LOG = '[PROG_LOG] '
@@ -17,27 +18,32 @@ class PartialProgramOrder:
     # Later we can add this back as a caching mechanism to avoid doing RWSet
     # intersections of files all the time
     # run_after: "dict[NodeId, list[Node]]"  # Nodes that should run after certain conditions
-    to_be_resolved: "dict[NodeId, list[Node]]"  # Mapping of nodes to lists of uncommitted nodes
+
+    # Mapping of concrete nodes to lists of uncommitted concrete nodes that precede them.
+    # It is a snapshot of the uncommitted concrete nodes reachable in the prev_concrete_node graph
+    # at the time the concrete node enters execution. So if there is an fs conflict with any of them,
+    # the concrete node needs to be rerun.
+    to_be_resolved: "dict[NodeId, list[Node]]"
     concrete_nodes: "dict[NodeId, Node]"
 
     def __init__(self, abstract_nodes: "dict[NodeId, Node]", edges: "dict[NodeId, list[NodeId]]"):
         self.hsprog = HSProg(abstract_nodes, edges)
-        self.concrete_nodes = {node_id: ConcreteNode(ab_node) for node_id, ab_node
-                               in abstract_nodes.items()}
+        self.concrete_nodes: dict[ConcreteNodeId, ConcreteNode] = {}
         self.frontier = set()
         # self.run_after = {}
-        self.to_be_resolved = {}
+        self.prev_concrete_node: dict[ConcreteNodeId, list[ConcreteNodeId]] = {}
+        self.to_be_resolved: dict[ConcreteNodeId, list[ConcreteNodeId]] = {}
 
-    def init_partial_order(self):
-        for node_id, node in self.concrete_nodes.items():
-            if node.is_initialized():
-                node.transition_from_init_to_ready()
+    # def init_partial_order(self):
+    #     for node_id, node in self.concrete_nodes.items():
+    #         if node.is_initialized():
+    #             node.transition_from_init_to_ready()
 
-        self.init_to_be_resolved_dict()
-        logging.info(self.to_be_resolved)
-        # Init frontier
-        self.frontier = self.get_standard_source_nodes()
-        # TODO: Implement the rest of the partial order initialization
+    #     self.init_to_be_resolved_dict()
+    #     logging.info(self.to_be_resolved)
+    #     # Init frontier
+    #     self.frontier = self.get_standard_source_nodes()
+    #     # TODO: Implement the rest of the partial order initialization
 
     @property
     def abstract_nodes(self):
@@ -56,9 +62,9 @@ def commit_node(self, node):
         node.transition_to_committed()
         # Maybe update dependencies here
         # etc.
-
-    def get_node(self, node_id: NodeId) -> Node:
-        return self.concrete_nodes[node_id]
+        
+    def get_concrete_node(self, concrete_node_id: ConcreteNodeId) -> ConcreteNode:
+        return self.concrete_nodes[concrete_node_id]
 
     def get_all_nodes(self):
         return [node for node in self.concrete_nodes.values()]
@@ -67,7 +73,7 @@ def get_committed_nodes(self):
         return [node for node in self.concrete_nodes.values() if node.is_committed()]
 
     def get_ready_nodes(self):
-        return [node for node in self.concrete_nodes.values() if node.is_ready()]
+        return [(cnid, n) for cnid, n in self.concrete_nodes.items() if n.is_ready()]
 
     def get_executing_nodes(self):
         return [node for node in self.concrete_nodes.values() if node.is_executing()]
@@ -98,8 +104,8 @@ def log_state(self):
             progress_log(node.pretty_state_repr())
         progress_log('')
 
-    def get_schedulable_nodes(self) -> list[NodeId]:
-        return [node.id_ for node in self.get_ready_nodes()]
+    def get_schedulable_nodes(self) -> list[ConcreteNodeId]:
+        return [concrete_node_id for concrete_node_id, _ in self.get_ready_nodes()]
 
     ## Returns the next non-committed normal node
     def progress_frontier(self) -> "list[NodeId]":
@@ -108,8 +114,8 @@ def progress_frontier(self) -> "list[NodeId]":
     def get_next_nodes(self, node_id:NodeId) -> "list[NodeId]":
         return self.adjacency[node_id][:]
 
-    def get_prev_nodes(self, node_id:NodeId) -> "list[NodeId]":
-        return self.inverse_adjacency[node_id][:]
+    def get_prev_nodes(self, concrete_node_id: ConcreteNodeId) -> "list[ConcreteNodeId]":
+        return self.prev_concrete_node[concrete_node_id][:]
 
     def get_source_nodes(self) -> "list[NodeId]":
         sources = set()
@@ -156,7 +162,7 @@ def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]":
 
         return non_committed_nodes
 
-    def get_all_next(self, current_node_id: NodeId, visited=None) -> "set[NodeId]":
+    def get_all_next(self, current_node_id: ConcreteNodeId, visited=None) -> "set[NodeId]":
         all_next = set()
         def reachable_rec(cur, reachable):
             if cur in reachable:
@@ -169,7 +175,7 @@ def reachable_rec(cur, reachable):
         return all_next
 
 
-    def get_all_previous(self, current_node_id: NodeId, visited=None) -> "set[NodeId]":
+    def get_all_previous(self, current_node_id: ConcreteNodeId, visited=None) -> "set[NodeId]":
         all_prev = set()
         def reachable_rec(cur, reachable):
             if cur in reachable:
@@ -181,20 +187,21 @@ def reachable_rec(cur, reachable):
             reachable_rec(n, all_prev)
         return all_prev
 
-    def get_all_next_uncommitted(self, node_id: NodeId) -> "set[NodeId]":
-        next = self.get_all_next(node_id)
-        return set([node for node in next if not self.concrete_nodes[node].is_committed()])
+    # TODO: fixme
+    # def get_all_next_uncommitted(self, node_id: NodeId) -> "set[NodeId]":
+    #     next = self.get_all_next(node_id)
+    #     return set([node for node in next if not self.concrete_nodes[node].is_committed()])
 
-    def get_all_previous_uncommitted(self, node_id: NodeId) -> "set[NodeId]":
-        previous = self.get_all_previous(node_id)
-        return set([node for node in previous if not self.concrete_nodes[node].is_committed()])
+    def get_all_previous_uncommitted(self, concrete_node_id: ConcreteNodeId) -> "set[ConcreteNodeId]":
+        previous = self.get_all_previous(concrete_node_id)
+        return set([cnid for cnid in previous if not self.concrete_nodes[cnid].is_committed()])
 
-    def adjust_to_be_resolved_dict_entry(self, node_id: NodeId):
-        node = self.concrete_nodes.get(node_id)
+    def adjust_to_be_resolved_dict_entry(self, concrete_node_id: ConcreteNodeId):
+        node = self.concrete_nodes.get(concrete_node_id)
         if node.is_committed():
-            self.to_be_resolved[node_id] = []
+            self.to_be_resolved[concrete_node_id] = []
         elif node.is_ready():
-            self.to_be_resolved[node_id] = self.get_all_previous_uncommitted(node_id)
+            self.to_be_resolved[concrete_node_id] = self.get_all_previous_uncommitted(concrete_node_id)
 
     def init_to_be_resolved_dict(self):
         for node_id in self.concrete_nodes:
@@ -216,10 +223,10 @@ def fetch_fs_actions(self):
         for node in self.get_executing_normal_and_spec_nodes():
             node.gather_fs_actions()
 
-    def _has_fs_deps(self, node_id: NodeId):
-        node_of_interest : Node = self.get_node(node_id)
-        for nid in self.to_be_resolved[node_id]:
-            node: Node = self.get_node(nid)
+    def _has_fs_deps(self, concrete_node_id: ConcreteNodeId):
+        node_of_interest : ConcreteNode = self.get_concrete_node(concrete_node_id)
+        for nid in self.to_be_resolved[concrete_node_id]:
+            node: ConcreteNode = self.get_concrete_node(nid)
             if node.get_rw_set().has_conflict(node_of_interest.get_rw_set()):
                 return True
         return False
@@ -234,17 +241,17 @@ def has_fs_deps(self, node_id:NodeId):
 
     def schedule_work(self, node_id: NodeId, env_file: str):
         event_log("schedule_work")
-        self.get_node(node_id).start_executing(env_file)
+        self.get_concrete_node(node_id).start_executing(env_file)
 
-    def schedule_spec_work(self, node_id: NodeId, env_file: str):
+    def schedule_spec_work(self, concrete_node_id: ConcreteNodeId, env_file: str):
         event_log("schedule_spec")
-        self.adjust_to_be_resolved_dict_entry(node_id)
-        self.get_node(node_id).start_spec_executing(env_file)
+        self.adjust_to_be_resolved_dict_entry(concrete_node_id)
+        self.get_concrete_node(concrete_node_id).start_spec_executing(env_file)
 
     def handle_complete(self, node_id: NodeId, has_pending_wait: bool,
                         current_env: str):
         event_log(f"handle_complete {node_id}")
-        node = self.get_node(node_id)
+        node = self.get_concrete_node(node_id)
         # TODO: complete the state matching
         if node.is_executing():
             node.commit_frontier_execution()
@@ -264,39 +271,69 @@ def handle_complete(self, node_id: NodeId, has_pending_wait: bool,
             assert False
 
     def reset_succeeding_nodes(self, node_id: NodeId, env_file: str):
-        for uncommitted_node_id in self.get_all_next(node_id):
-            uncommitted_node = self.get_node(uncommitted_node_id)
-            if uncommitted_node.is_spec_executing():
-                uncommitted_node.reset_to_ready()
-            # uncommitted_node.start_spec_executing(env_file)
-
-    def handle_wait(self, node_id: NodeId, env_file: str):
-        event_log(f"handle_wait {node_id}")
-        node = self.get_node(node_id)
+        # TODO: fixme
+        pass
+        # for uncommitted_node_id in self.get_all_next(node_id):
+        #     uncommitted_node = self.get_concrete_node(uncommitted_node_id)
+        #     if uncommitted_node.is_spec_executing():
+        #         uncommitted_node.reset_to_ready()
+        #     # uncommitted_node.start_spec_executing(env_file)
+
+    def adding_new_basic_block(self, concrete_node_id: ConcreteNodeId):
+        basic_block = self.hsprog.find_basic_block(concrete_node_id.node_id)
+        if len(self.concrete_nodes) != 0:
+            prev_concrete_node_id = next(reversed(self.concrete_nodes))
+        else:
+            prev_concrete_node_id = None
+        loop_iters = concrete_node_id.loop_iters
+        for abstract_node_id in basic_block.node_ids:
+            new_concrete_node_id = ConcreteNodeId(abstract_node_id, loop_iters)
+            new_concrete_node = ConcreteNode(new_concrete_node_id,
+                                             basic_block.get_node(abstract_node_id))
+            new_concrete_node.transition_from_init_to_ready()
+            self.concrete_nodes[new_concrete_node_id] = new_concrete_node
+            if prev_concrete_node_id is not None:
+                self.prev_concrete_node[new_concrete_node_id] = [prev_concrete_node_id]
+            else:
+                self.prev_concrete_node[new_concrete_node_id] = []
+            prev_concrete_node_id = new_concrete_node_id
+        assert concrete_node_id in self.concrete_nodes
+            
+    def handle_wait(self, concrete_node_id: ConcreteNodeId, env_file: str):
+        event_log(f"handle_wait {concrete_node_id}")
+
+        if not concrete_node_id in self.concrete_nodes:
+            abstract_node_id = concrete_node_id.node_id
+            assert self.hsprog.is_start_of_block(abstract_node_id)
+            self.adding_new_basic_block(concrete_node_id)
+            util.debug_log("try to add concrete node here")
+            util.debug_log(repr(self.prev_concrete_node))
+            util.debug_log("")
+        node = self.get_concrete_node(concrete_node_id)
 
         # Invalid state check
         if node.is_committed() or node.is_unsafe() or node.is_initialized():
-            logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}')
-            raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}')
-
+            logging.error(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}')
+            raise Exception(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}')
 
         if node.is_ready():
             node.start_executing(env_file)
         elif node.is_stopped():
             if node in self.get_frontier():
-                logging.info(f'Node {node_id} is stopped and in the frontier.')
+                logging.info(f'Node {concrete_node_id} is stopped and in the frontier.')
                 node.transition_from_stopped_to_executing(env_file)
             else:
-                logging.info(f'Node {node_id} is stopped but not in the frontier.')
+                logging.info(f'Node {concrete_node_id} is stopped but not in the frontier.')
         elif node.is_speculated():
             # Check if env conflicts exist
             if node.has_env_conflict_with(env_file):
+                util.debug_log(f'prev_env: {node.exec_ctxt.pre_env_file}, real: {env_file}')
                 node.reset_to_ready()
                 node.start_executing(env_file)
-                self.reset_succeeding_nodes(node_id, env_file)
+                self.reset_succeeding_nodes(concrete_node_id, env_file)
             # Optimization: It would make sense to perform the checks independently,
             # and if fs conflict, then update the run after dict.
-            elif self.has_fs_deps(node_id):
+            elif self.has_fs_deps(concrete_node_id):
                 node.reset_to_ready()
                 node.start_executing(env_file)
             else:
@@ -304,13 +341,13 @@ def handle_wait(self, node_id: NodeId, env_file: str):
                 self.adjust_to_be_resolved_dict()
         elif node.is_executing():
             if node.has_env_conflict_with(env_file):
-                self.reset_succeeding_nodes(node_id, env_file)
+                self.reset_succeeding_nodes(concrete_node_id, env_file)
         elif node.is_spec_executing():
             if node.has_env_conflict_with(env_file):
-                self.reset_succeeding_nodes(node_id, env_file)
+                self.reset_succeeding_nodes(concrete_node_id, env_file)
         else:
-            logging.error(f'Error: Node {node_id} is in an invalid state: {node.state}')
-            raise Exception(f'Error: Node {node_id} is in an invalid state: {node.state}')
+            logging.error(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}')
+            raise Exception(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}')
 
     def eager_fs_killing(self):
         event_log("try to eagerly kill conflicted speculation")
@@ -318,7 +355,7 @@ def eager_fs_killing(self):
         self.fetch_fs_actions()
         for node in self.get_all_nodes():
             if ((node.is_speculated() or node.is_spec_executing())
-                and self._has_fs_deps(node.id_)):
+                and self._has_fs_deps(node.cnid)):
                 to_be_killed.append(node)
         for node in to_be_killed:
             node.reset_to_ready()
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index 8e67f539..2a7cc5a2 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -4,6 +4,7 @@
 import util
 import config
 from partial_program_order import PartialProgramOrder, NodeId
+from node import LoopStack, ConcreteNodeId
 
 ##
 ## A scheduler server
@@ -79,15 +80,15 @@ def handle_init(self, input_cmd: str):
         partial_order_file = input_cmd.split(":")[1].rstrip()
         logging.debug(f'Scheduler: Received partial_order_file: {partial_order_file}')
         self.partial_program_order = util.parse_partial_program_order_from_file(partial_order_file)
-        self.partial_program_order.init_partial_order()
+        util.debug_log(str(self.partial_program_order.hsprog))
 
     def handle_wait(self, input_cmd: str, connection):
-        node_id, env_file = self.__parse_wait(input_cmd)
-        self.waiting_for_response[node_id] = connection
-        logging.info(f'Scheduler: Received wait message - {node_id}.')
+        concrete_node_id, env_file = self.__parse_wait(input_cmd)
+        self.waiting_for_response[concrete_node_id] = connection
+        logging.info(f'Scheduler: Received wait message - {concrete_node_id}.')
         self.latest_env = env_file
-        self.partial_program_order.handle_wait(node_id, env_file)
-        if self.partial_program_order.get_node(node_id).is_committed():
+        self.partial_program_order.handle_wait(concrete_node_id, env_file)
+        if self.partial_program_order.get_concrete_node(concrete_node_id).is_committed():
             self.respond_to_pending_wait(node_id)
 
     def process_next_cmd(self):
@@ -101,11 +102,11 @@ def process_next_cmd(self):
             connection.close()
         elif (input_cmd.startswith("CommandExecComplete:")):
             node_id, exec_id, exit_code, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
-            if self.partial_program_order.get_node(node_id).exec_id == exec_id:
+            if self.partial_program_order.get_concrete_node(node_id).exec_id == exec_id:
                 logging.info(f'Scheduler: Received command exec complete message - {node_id}.')
                 self.partial_program_order.handle_complete(node_id, node_id in self.waiting_for_response, self.latest_env)
                 
-                if self.partial_program_order.get_node(node_id).is_committed():
+                if self.partial_program_order.get_concrete_node(node_id).is_committed():
                     self.respond_to_pending_wait(node_id)
             else:
                 logging.info(f'Scheduler: Received command exec complete message for a killed instance, ignoring - {node_id}.')
@@ -116,6 +117,7 @@ def process_next_cmd(self):
             self.partial_program_order.log_info()
             self.done = True
         elif input_cmd.startswith("CommandExecStart:"):
+            assert False
             node_id, exec_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
             logging.info(f'Scheduler: Received command exec start message - {input_cmd}.')
             # self.handle_command_exec_start(input_cmd)
@@ -133,7 +135,7 @@ def respond_to_frontend_core(self, node_id: NodeId, response: str):
     def respond_to_pending_wait(self, node_id: int):
         logging.debug(f'Responding to pending wait for node: {node_id}')
         ## Get the completed node info
-        node = self.partial_program_order.get_node(node_id)
+        node = self.partial_program_order.get_concrete_node(node_id)
         msg = '{} {} {}'.format(*node.execution_outcome())
         response = success_response(msg)
         
@@ -143,24 +145,21 @@ def respond_to_pending_wait(self, node_id: int):
     def __parse_wait(self, input_cmd: str) -> "tuple[NodeId, str]":
         try:
             node_id_component, loop_iter_counter_component, pash_runtime_vars_file_component = input_cmd.rstrip().split("|")
-            raw_node_id_int = int(node_id_component.split(":")[1].rstrip())
+            node_id = NodeId(int(node_id_component.split(":")[1].rstrip()))
             loop_counters_str = loop_iter_counter_component.split(":")[1].rstrip()
-            pash_runtime_vars_file_str = pash_runtime_vars_file_component.split(":")[1].rstrip()
-            # TODO Implement loops correctly
-            # if loop_counters_str == "None":
-            #     node_id = NodeId(raw_node_id_int), pash_runtime_vars_file_str
-            # else:
-            #     loop_counters = [int(cnt) for cnt in loop_counters_str.split("-")]
-            #     node_id = NodeId(raw_node_id_int, LoopStack(loop_counters)), pash_runtime_vars_file_str      
-            node_id = NodeId(raw_node_id_int), pash_runtime_vars_file_str     
-            return node_id
+            pash_env_filename = pash_runtime_vars_file_component.split(":")[1].rstrip()
+            if loop_counters_str == "None":
+                return ConcreteNodeId(node_id), pash_env_filename
+            else:
+                loop_counters = [int(cnt) for cnt in loop_counters_str.split("-")]
+                return ConcreteNodeId(node_id, loop_counters), pash_env_filename
         except:
             raise Exception(f'Parsing failure for line: {input_cmd}')
         
     def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]":
         try:
             components = input_cmd.rstrip().split("|")
-            command_id = NodeId.parse_node_id(components[0].split(":")[1])
+            command_id = ConcreteNodeId.parse(components[0].split(":")[1])
             exec_id = int(components[1].split(":")[1])
             exit_code = int(components[2].split(":")[1])
             sandbox_dir = components[3].split(":")[1]
@@ -171,8 +170,8 @@ def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]":
 
 
     def schedule_work(self):
-        nodes = self.partial_program_order.get_schedulable_nodes()
-        for n in nodes[:2]:
+        concrete_node_ids = self.partial_program_order.get_schedulable_nodes()
+        for n in concrete_node_ids[:2]:
             self.partial_program_order.schedule_spec_work(n, self.latest_env)
         
     def run(self):
diff --git a/parallel-orch/util.py b/parallel-orch/util.py
index b9947233..3886a816 100644
--- a/parallel-orch/util.py
+++ b/parallel-orch/util.py
@@ -9,7 +9,7 @@
 import psutil
 import signal
 import analysis
-from node import Node, NodeId
+from node import Node, NodeId, LoopStack
 from partial_program_order import PartialProgramOrder
 
 DEBUG_LOG = '[DEBUG_LOG] '
@@ -214,7 +214,6 @@ def parse_loop_contexts(lines):
         loop_contexts[node_id] = loop_ctx
     return loop_contexts
 
-
 def parse_partial_program_order_from_file(file_path: str):
     with open(file_path) as f:
         raw_lines = f.readlines()
@@ -249,11 +248,10 @@ def parse_partial_program_order_from_file(file_path: str):
     for i in range(number_of_nodes):
         file_path = f'{cmds_directory}/{i}'
         cmd, asts = parse_cmd_from_file(file_path)
-        # loop_ctx = loop_contexts[i]
-        # nodes[NodeId(i)] = Node(NodeId(i), cmd,
-        #                         asts=asts,
-        #                         loop_context=LoopStack(loop_ctx))
-        ab_nodes[NodeId(i)] = Node(NodeId(i), cmd, asts=asts)
+        loop_ctx = loop_contexts[i]
+        ab_nodes[NodeId(i)] = Node(NodeId(i), cmd.strip(),
+                                   asts=asts,
+                                   loop_context=LoopStack(loop_ctx))
 
     edges = {NodeId(i) : [] for i in range(number_of_nodes)}
     for edge_line in edge_lines:

From 87c68f5a2a75b228b2b61f4ec0d091078748e915 Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Tue, 6 Feb 2024 07:06:30 -0700
Subject: [PATCH 24/39] Add ignore vars in env resolution

---
 parallel-orch/node.py             | 25 ++++++++++++++++++++++---
 parallel-orch/scheduler_server.py |  2 +-
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index 914f5f44..c94a5f0a 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -362,8 +362,13 @@ def has_env_conflict_with(self, other_env) -> bool:
         if self.exec_ctxt.pre_env_file == other_env:
             return False
 
-        ignore_vars = set(['RANDOM'])
-
+        ignore_vars = set(["_", 'RANDOM', "msg", "pash_runtime_final_status", "pash_previous_set_status",
+                           "pash_runtime_shell_variables_file", "from_set", "output_variable_file",
+                           "pash_loop_iter_counters", "daemon_response", "vars_file",
+                           "pash_speculative_command_id", "prev_env", "PREVIOUS_SET_STATUS",
+                           "BASH_LINENO", "response_args", "stdout_file", "pash_spec_command_id",
+                           "cmd_exit_code", "pash_set_to_add"])
+        
         re_scalar_string = re.compile(r'declare (?:-x|--)? (\w+)="([^"]*)"')
         re_scalar_int = re.compile(r'declare -i (\w+)="(\d+)"')
         re_array = re.compile(r'declare -a (\w+)=(\([^)]+\))')
@@ -386,7 +391,21 @@ def parse_env(content):
 
         with open(other_env, 'r') as file:
             other_env_vars = parse_env(file.read())
-        return node_env_vars != other_env_vars
+        
+        conflict_exists = False
+        for key in set(node_env_vars.keys()).union(other_env_vars.keys()):
+            if key not in node_env_vars:
+                logging.critical(f"Variable {key} missing in node environment")
+                conflict_exists = True
+            elif key not in other_env_vars:
+                logging.critical(f"Variable {key} missing in other environment")
+                conflict_exists = True
+            elif node_env_vars[key] != other_env_vars[key]:
+                logging.critical(f"Variable {key} differs: node environment has {node_env_vars[key]}, other has {other_env_vars[key]}")
+                conflict_exists = True
+        
+        return conflict_exists
+
 
     
 class HSBasicBlock:
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index 2a7cc5a2..ba109ba6 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -89,7 +89,7 @@ def handle_wait(self, input_cmd: str, connection):
         self.latest_env = env_file
         self.partial_program_order.handle_wait(concrete_node_id, env_file)
         if self.partial_program_order.get_concrete_node(concrete_node_id).is_committed():
-            self.respond_to_pending_wait(node_id)
+            self.respond_to_pending_wait(concrete_node_id)
 
     def process_next_cmd(self):
         connection, input_cmd = util.socket_get_next_cmd(self.socket)

From 3e76190c36b05b02651bcc8cc5e37ddd8a07b90d Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Tue, 6 Feb 2024 07:23:12 -0700
Subject: [PATCH 25/39] Reset current node on env conflict

---
 parallel-orch/partial_program_order.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index 03da8a47..c74cad30 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -341,9 +341,12 @@ def handle_wait(self, concrete_node_id: ConcreteNodeId, env_file: str):
                 self.adjust_to_be_resolved_dict()
         elif node.is_executing():
             if node.has_env_conflict_with(env_file):
+                node.reset_to_ready()
+                node.start_executing(env_file)
                 self.reset_succeeding_nodes(concrete_node_id, env_file)
         elif node.is_spec_executing():
             if node.has_env_conflict_with(env_file):
+                node.reset_to_ready()
                 self.reset_succeeding_nodes(concrete_node_id, env_file)
         else:
             logging.error(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}')

From bf3c212c76e7f8390adae1c6b40f57e939cb5d2c Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Tue, 6 Feb 2024 10:43:29 -0500
Subject: [PATCH 26/39] fix empty first bb

---
 parallel-orch/node.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index c94a5f0a..3a1b3c7a 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -415,7 +415,7 @@ def __init__(self, nodes: list[Node]):
         self.nodes = nodes
 
     def __str__(self):
-        return ''.join([node.cmd for node in self.nodes])
+        return ''.join([node.cmd.strip() + '\n' for node in self.nodes])
 
     @property
     def loop_context(self):
@@ -429,7 +429,7 @@ def get_node(self, node_id: NodeId) -> Node:
         nodes = [node for node in self.nodes if node.id_ == node_id]
         assert len(nodes) == 1
         return nodes[0]
-        
+
 class HSProg:
     abstract_nodes: "dict[NodeId, Node]"
     adjacency: "dict[NodeId, list[NodeId]]"
@@ -451,11 +451,15 @@ def construct_basic_blocks(self):
         node_list = []
         block_id = LoopStack()
         for node in self.abstract_nodes.values():
-            if node.loop_context == block_id:
+            if (node.loop_context == block_id and 
+                not (len(node_list) >= 1 and node_list[-1].cmd == 'break')):
                 node_list.append(node)
             else:
-                basic_block = HSBasicBlock(node_list)
-                self.basic_blocks.append(basic_block)
+                if len(node_list) != 0:
+                    # This branch happens for conditional at the beginning
+                    # of the program
+                    basic_block = HSBasicBlock(node_list)
+                    self.basic_blocks.append(basic_block)
                 node_list = [node]
                 block_id = node.loop_context
         basic_block = HSBasicBlock(node_list)

From 5116a34985cea0fb5433ef76fed6ae60e3f7b543 Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Tue, 6 Feb 2024 15:54:09 -0500
Subject: [PATCH 27/39] fix the break and unsafe command for now

---
 parallel-orch/node.py                  | 12 ++++++++++++
 parallel-orch/partial_program_order.py | 12 ++++++++++--
 parallel-orch/scheduler_server.py      | 19 +++++++++++--------
 3 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index 3a1b3c7a..65c4de42 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -10,6 +10,7 @@
 from typing import Tuple
 from enum import Enum, auto
 import util
+import analysis
 
 class NodeState(Enum):
     INIT = auto()
@@ -255,6 +256,9 @@ def execution_outcome(self) -> Tuple[int, str, str]:
         assert self.exec_result is not None
         return self.exec_result.exit_code, self.exec_ctxt.post_env_file, self.exec_ctxt.stdout
 
+    def command_unsafe(self):
+        return not analysis.safe_to_execute(self.asts, {})
+        
 
     ##                                      ##
     ##          Transition Functions        ##
@@ -263,8 +267,13 @@ def execution_outcome(self) -> Tuple[int, str, str]:
     def transition_from_init_to_ready(self):
         assert self.state == NodeState.INIT
         self.state = NodeState.READY
+        self.rwset = RWSet(set(), set())
         # Also, probably unroll here?
 
+    def transition_from_ready_to_unsafe(self):
+        assert self.state == NodeState.READY
+        self.state = NodeState.UNSAFE
+
     def kill(self):
         assert self.state in [NodeState.EXECUTING, NodeState.SPEC_EXECUTING]
         self.exec_ctxt.process.kill()
@@ -335,6 +344,9 @@ def transition_to_committed(self):
     def transition_from_spec_executing_to_speculated(self):
         pass
 
+    def commit_unsafe_node(self):
+        assert self.state == NodeState.UNSAFE
+        self.state = NodeState.COMMITTED
 
     def update_rw_set(self, rw_set):
         self.rwset = rw_set
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index c74cad30..7bbdde5f 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -291,6 +291,8 @@ def adding_new_basic_block(self, concrete_node_id: ConcreteNodeId):
             new_concrete_node = ConcreteNode(new_concrete_node_id,
                                              basic_block.get_node(abstract_node_id))
             new_concrete_node.transition_from_init_to_ready()
+            if new_concrete_node.command_unsafe():
+                new_concrete_node.transition_from_ready_to_unsafe()
             self.concrete_nodes[new_concrete_node_id] = new_concrete_node
             if prev_concrete_node_id is not None:
                 self.prev_concrete_node[new_concrete_node_id] = [prev_concrete_node_id]
@@ -298,7 +300,11 @@ def adding_new_basic_block(self, concrete_node_id: ConcreteNodeId):
                 self.prev_concrete_node[new_concrete_node_id] = []
             prev_concrete_node_id = new_concrete_node_id
         assert concrete_node_id in self.concrete_nodes
-            
+
+    def finish_wait_unsafe(self, concrete_node_id: ConcreteNodeId):
+        node = self.concrete_nodes[concrete_node_id]
+        node.commit_unsafe_node()
+        
     def handle_wait(self, concrete_node_id: ConcreteNodeId, env_file: str):
         event_log(f"handle_wait {concrete_node_id}")
 
@@ -312,12 +318,14 @@ def handle_wait(self, concrete_node_id: ConcreteNodeId, env_file: str):
         node = self.get_concrete_node(concrete_node_id)
 
         # Invalid state check
-        if node.is_committed() or node.is_unsafe() or node.is_initialized():
+        if node.is_committed() or node.is_initialized():
             logging.error(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}')
             raise Exception(f'Error: Node {concrete_node_id} is in an invalid state: {node.state}')
 
         if node.is_ready():
             node.start_executing(env_file)
+        elif node.is_unsafe():
+            pass
         elif node.is_stopped():
             if node in self.get_frontier():
                 logging.info(f'Node {concrete_node_id} is stopped and in the frontier.')
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index ba109ba6..1c2025d8 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -88,8 +88,12 @@ def handle_wait(self, input_cmd: str, connection):
         logging.info(f'Scheduler: Received wait message - {concrete_node_id}.')
         self.latest_env = env_file
         self.partial_program_order.handle_wait(concrete_node_id, env_file)
-        if self.partial_program_order.get_concrete_node(concrete_node_id).is_committed():
+        concrete_node = self.partial_program_order.get_concrete_node(concrete_node_id)
+        if concrete_node.is_committed():
             self.respond_to_pending_wait(concrete_node_id)
+        elif concrete_node.is_unsafe():
+            self.partial_program_order.finish_wait_unsafe(concrete_node_id)
+            self.respond_to_wait_on_unsafe(concrete_node_id)
 
     def process_next_cmd(self):
         connection, input_cmd = util.socket_get_next_cmd(self.socket)
@@ -116,11 +120,6 @@ def process_next_cmd(self):
             util.socket_respond(connection, success_response("All finished!"))
             self.partial_program_order.log_info()
             self.done = True
-        elif input_cmd.startswith("CommandExecStart:"):
-            assert False
-            node_id, exec_id, sandbox_dir, trace_file = self.__parse_command_exec_x(input_cmd)
-            logging.info(f'Scheduler: Received command exec start message - {input_cmd}.')
-            # self.handle_command_exec_start(input_cmd)
         else:
             logging.error(error_response(f'Error: Unsupported command: {input_cmd}'))
             raise Exception(f'Error: Unsupported command: {input_cmd}')
@@ -132,7 +131,11 @@ def respond_to_frontend_core(self, node_id: NodeId, response: str):
         util.socket_respond(connection, response)
         connection.close()
 
-    def respond_to_pending_wait(self, node_id: int):
+    def respond_to_wait_on_unsafe(self, node_id: ConcreteNodeId):
+        response = unsafe_response('')
+        self.respond_to_frontend_core(node_id, response)
+        
+    def respond_to_pending_wait(self, node_id: ConcreteNodeId):
         logging.debug(f'Responding to pending wait for node: {node_id}')
         ## Get the completed node info
         node = self.partial_program_order.get_concrete_node(node_id)
@@ -142,7 +145,7 @@ def respond_to_pending_wait(self, node_id: int):
         ## Send the response
         self.respond_to_frontend_core(node_id, response)
 
-    def __parse_wait(self, input_cmd: str) -> "tuple[NodeId, str]":
+    def __parse_wait(self, input_cmd: str) -> "tuple[ConcreteNodeId, str]":
         try:
             node_id_component, loop_iter_counter_component, pash_runtime_vars_file_component = input_cmd.rstrip().split("|")
             node_id = NodeId(int(node_id_component.split(":")[1].rstrip()))

From 9bd4c8b0b1606b5889d030923e00704d433d2263 Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Tue, 6 Feb 2024 17:45:11 -0500
Subject: [PATCH 28/39] cleanup

---
 parallel-orch/node.py                  | 19 +++---
 parallel-orch/partial_program_order.py | 85 ++++----------------------
 parallel-orch/scheduler_server.py      | 28 ++++-----
 3 files changed, 34 insertions(+), 98 deletions(-)

diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index 65c4de42..9bb8ae22 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -152,7 +152,7 @@ def __init__(self, node_id: NodeId, loop_iters = list()):
 
     def __repr__(self):
         return f'cnid({self.node_id.id_})'
-        
+
     def __hash__(self):
         return hash((self.node_id, self.loop_iters))
 
@@ -166,7 +166,7 @@ def __str__(self):
     def parse(input_str):
         node_id_str, loop_iters_str = input_str.split('@')
         return ConcreteNodeId(NodeId(int(node_id_str)), [int(cnt) for cnt in loop_iters_str.split('-')[1:]])
-    
+
 class ConcreteNode:
     cnid: ConcreteNodeId
     abstract_node: Node
@@ -258,7 +258,7 @@ def execution_outcome(self) -> Tuple[int, str, str]:
 
     def command_unsafe(self):
         return not analysis.safe_to_execute(self.asts, {})
-        
+
 
     ##                                      ##
     ##          Transition Functions        ##
@@ -380,7 +380,7 @@ def has_env_conflict_with(self, other_env) -> bool:
                            "pash_speculative_command_id", "prev_env", "PREVIOUS_SET_STATUS",
                            "BASH_LINENO", "response_args", "stdout_file", "pash_spec_command_id",
                            "cmd_exit_code", "pash_set_to_add"])
-        
+
         re_scalar_string = re.compile(r'declare (?:-x|--)? (\w+)="([^"]*)"')
         re_scalar_int = re.compile(r'declare -i (\w+)="(\d+)"')
         re_array = re.compile(r'declare -a (\w+)=(\([^)]+\))')
@@ -403,7 +403,7 @@ def parse_env(content):
 
         with open(other_env, 'r') as file:
             other_env_vars = parse_env(file.read())
-        
+
         conflict_exists = False
         for key in set(node_env_vars.keys()).union(other_env_vars.keys()):
             if key not in node_env_vars:
@@ -415,11 +415,11 @@ def parse_env(content):
             elif node_env_vars[key] != other_env_vars[key]:
                 logging.critical(f"Variable {key} differs: node environment has {node_env_vars[key]}, other has {other_env_vars[key]}")
                 conflict_exists = True
-        
+
         return conflict_exists
 
 
-    
+
 class HSBasicBlock:
     def __init__(self, nodes: list[Node]):
         if len(nodes) == 0:
@@ -463,7 +463,7 @@ def construct_basic_blocks(self):
         node_list = []
         block_id = LoopStack()
         for node in self.abstract_nodes.values():
-            if (node.loop_context == block_id and 
+            if (node.loop_context == block_id and
                 not (len(node_list) >= 1 and node_list[-1].cmd == 'break')):
                 node_list.append(node)
             else:
@@ -523,8 +523,7 @@ def find_basic_block(self, node_id: NodeId):
                 if node.id_ == node_id:
                     return bb
         raise ValueError('no such node_id')
-    
+
     def __str__(self):
         return 'prog:\n' + '\n'.join(
             [f'block {i}:\n' + str(bb) + f'goto block {self.block_adjacency[i]}\n' for i, bb in enumerate(self.basic_blocks)])
-    
diff --git a/parallel-orch/partial_program_order.py b/parallel-orch/partial_program_order.py
index 7bbdde5f..73ad58ef 100644
--- a/parallel-orch/partial_program_order.py
+++ b/parallel-orch/partial_program_order.py
@@ -34,17 +34,6 @@ def __init__(self, abstract_nodes: "dict[NodeId, Node]", edges: "dict[NodeId, li
         self.prev_concrete_node: dict[ConcreteNodeId, list[ConcreteNodeId]] = {}
         self.to_be_resolved: dict[ConcreteNodeId, list[ConcreteNodeId]] = {}
 
-    # def init_partial_order(self):
-    #     for node_id, node in self.concrete_nodes.items():
-    #         if node.is_initialized():
-    #             node.transition_from_init_to_ready()
-
-    #     self.init_to_be_resolved_dict()
-    #     logging.info(self.to_be_resolved)
-    #     # Init frontier
-    #     self.frontier = self.get_standard_source_nodes()
-    #     # TODO: Implement the rest of the partial order initialization
-
     @property
     def abstract_nodes(self):
         return self.hsprog.abstract_nodes
@@ -56,13 +45,13 @@ def adjacency(self):
     @property
     def inverse_adjacency(self):
         return self.hsprog.inverse_adjacency
-        
+
     def commit_node(self, node):
         # Logic to handle committing a node
         node.transition_to_committed()
         # Maybe update dependencies here
         # etc.
-        
+
     def get_concrete_node(self, concrete_node_id: ConcreteNodeId) -> ConcreteNode:
         return self.concrete_nodes[concrete_node_id]
 
@@ -107,61 +96,9 @@ def log_state(self):
     def get_schedulable_nodes(self) -> list[ConcreteNodeId]:
         return [concrete_node_id for concrete_node_id, _ in self.get_ready_nodes()]
 
-    ## Returns the next non-committed normal node
-    def progress_frontier(self) -> "list[NodeId]":
-        return self.get_next_frontier_nodes(self.get_frontier())
-
-    def get_next_nodes(self, node_id:NodeId) -> "list[NodeId]":
-        return self.adjacency[node_id][:]
-
     def get_prev_nodes(self, concrete_node_id: ConcreteNodeId) -> "list[ConcreteNodeId]":
         return self.prev_concrete_node[concrete_node_id][:]
 
-    def get_source_nodes(self) -> "list[NodeId]":
-        sources = set()
-        for to_id, from_ids in self.inverse_adjacency.items():
-            if len(from_ids) == 0:
-                sources.add(to_id)
-        return list(sources)
-
-    def get_standard_source_nodes(self) -> list:
-        source_nodes = self.get_source_nodes()
-        # TODO: Filter out loop nodes
-        # return self.filter_standard_nodes(source_nodes)
-        return source_nodes
-
-    def get_next_frontier_nodes(self, start_nodes: "list[NodeId]") -> "set[int]":
-        # TODO: filter non-loop nodes
-        visited = set()
-        to_visit = [(node_id, 0) for node_id in start_nodes]  # Pair each start node with depth 0
-        non_committed_nodes = set()
-        first_non_committed_depth = None
-
-        while to_visit:
-            current_node_id, depth = to_visit.pop()
-            if current_node_id in visited:
-                continue
-
-            visited.add(current_node_id)
-            current_node = self.concrete_nodes.get(current_node_id)
-
-            if not current_node.is_committed():
-                if first_non_committed_depth is None:
-                    first_non_committed_depth = depth
-                elif depth > first_non_committed_depth:
-                    # Do not consider nodes deeper than the first non-committed depth
-                    continue
-
-                non_committed_nodes.add(current_node_id)
-
-            if first_non_committed_depth is None or depth < first_non_committed_depth:
-                next_nodes = self.get_next_nodes(current_node_id)  # Use the provided method to get next nodes
-                for neighbor in next_nodes:
-                    if neighbor not in visited:
-                        to_visit.append((neighbor, depth + 1))  # Increase depth for neighbors
-
-        return non_committed_nodes
-
     def get_all_next(self, current_node_id: ConcreteNodeId, visited=None) -> "set[NodeId]":
         all_next = set()
         def reachable_rec(cur, reachable):
@@ -233,31 +170,31 @@ def _has_fs_deps(self, concrete_node_id: ConcreteNodeId):
 
     # TODO: It's currently designed this way to avoid reading trace file all the time
     # When we have complex caching code for this we can make this go away
-    def has_fs_deps(self, node_id:NodeId):
+    def has_fs_deps(self, concrete_node_id: ConcreteNodeId):
         self.fetch_fs_actions()
-        self._has_fs_deps(node_id)
+        self._has_fs_deps(concrete_node_id)
 
     ### external handler events ###
 
-    def schedule_work(self, node_id: NodeId, env_file: str):
+    def schedule_work(self, concrete_node_id: ConcreteNodeId, env_file: str):
         event_log("schedule_work")
-        self.get_concrete_node(node_id).start_executing(env_file)
+        self.get_concrete_node(concrete_node_id).start_executing(env_file)
 
     def schedule_spec_work(self, concrete_node_id: ConcreteNodeId, env_file: str):
         event_log("schedule_spec")
         self.adjust_to_be_resolved_dict_entry(concrete_node_id)
         self.get_concrete_node(concrete_node_id).start_spec_executing(env_file)
 
-    def handle_complete(self, node_id: NodeId, has_pending_wait: bool,
+    def handle_complete(self, concrete_node_id: ConcreteNodeId, has_pending_wait: bool,
                         current_env: str):
-        event_log(f"handle_complete {node_id}")
-        node = self.get_concrete_node(node_id)
+        event_log(f"handle_complete {concrete_node_id}")
+        node = self.get_concrete_node(concrete_node_id)
         # TODO: complete the state matching
         if node.is_executing():
             node.commit_frontier_execution()
             self.adjust_to_be_resolved_dict()
         elif node.is_spec_executing():
-            if self.has_fs_deps(node_id):
+            if self.has_fs_deps(concrete_node_id):
                 node.reset_to_ready()
                 # otherwise it stays in ready state and waits to be scheduled by the scheduler
                 if has_pending_wait:
@@ -304,7 +241,7 @@ def adding_new_basic_block(self, concrete_node_id: ConcreteNodeId):
     def finish_wait_unsafe(self, concrete_node_id: ConcreteNodeId):
         node = self.concrete_nodes[concrete_node_id]
         node.commit_unsafe_node()
-        
+
     def handle_wait(self, concrete_node_id: ConcreteNodeId, env_file: str):
         event_log(f"handle_wait {concrete_node_id}")
 
diff --git a/parallel-orch/scheduler_server.py b/parallel-orch/scheduler_server.py
index 1c2025d8..74a8583d 100644
--- a/parallel-orch/scheduler_server.py
+++ b/parallel-orch/scheduler_server.py
@@ -19,11 +19,11 @@ def handler(signum, frame):
 def parse_args():
     parser = argparse.ArgumentParser(add_help=False)
     ## TODO: Import the arguments so that they are not duplicated here and in orch
-    parser.add_argument("-d", "--debug-level", 
-                        type=int, 
+    parser.add_argument("-d", "--debug-level",
+                        type=int,
                         default=0,
                         help="Set debugging level")
-    parser.add_argument("-f", "--log_file", 
+    parser.add_argument("-f", "--log_file",
                         type=str,
                         default=None,
                         help="Set logging output file. Default: stdout")
@@ -57,7 +57,7 @@ def error_response(string):
 class Scheduler:
     """ Schedules a partial order of commands to run out-of-order
     Flow:
-        input cmd -> 
+        input cmd ->
                     |   Daemon Start -> Receive whens tarting
                     |   Init -> Read the partial order from a file
                     |   CommandExecComplete -> A command completed its execution
@@ -109,7 +109,7 @@ def process_next_cmd(self):
             if self.partial_program_order.get_concrete_node(node_id).exec_id == exec_id:
                 logging.info(f'Scheduler: Received command exec complete message - {node_id}.')
                 self.partial_program_order.handle_complete(node_id, node_id in self.waiting_for_response, self.latest_env)
-                
+
                 if self.partial_program_order.get_concrete_node(node_id).is_committed():
                     self.respond_to_pending_wait(node_id)
             else:
@@ -134,14 +134,14 @@ def respond_to_frontend_core(self, node_id: NodeId, response: str):
     def respond_to_wait_on_unsafe(self, node_id: ConcreteNodeId):
         response = unsafe_response('')
         self.respond_to_frontend_core(node_id, response)
-        
+
     def respond_to_pending_wait(self, node_id: ConcreteNodeId):
         logging.debug(f'Responding to pending wait for node: {node_id}')
         ## Get the completed node info
         node = self.partial_program_order.get_concrete_node(node_id)
         msg = '{} {} {}'.format(*node.execution_outcome())
         response = success_response(msg)
-        
+
         ## Send the response
         self.respond_to_frontend_core(node_id, response)
 
@@ -158,7 +158,7 @@ def __parse_wait(self, input_cmd: str) -> "tuple[ConcreteNodeId, str]":
                 return ConcreteNodeId(node_id, loop_counters), pash_env_filename
         except:
             raise Exception(f'Parsing failure for line: {input_cmd}')
-        
+
     def __parse_command_exec_x(self, input_cmd: str) -> "tuple[int, int]":
         try:
             components = input_cmd.rstrip().split("|")
@@ -176,11 +176,11 @@ def schedule_work(self):
         concrete_node_ids = self.partial_program_order.get_schedulable_nodes()
         for n in concrete_node_ids[:2]:
             self.partial_program_order.schedule_spec_work(n, self.latest_env)
-        
+
     def run(self):
         ## The first command should be the daemon start
         self.process_next_cmd()
-        
+
         ## The second command should be the partial order init
         self.process_next_cmd()
 
@@ -200,7 +200,7 @@ def shutdown(self):
         logging.debug("PaSh-Spec scheduler is shutting down...")
         logging.debug("PaSh-Spec scheduler shut down successfully...")
         self.terminate_pending_commands()
-        
+
     def terminate_pending_commands(self):
         for node in self.partial_program_order.get_executing_normal_and_spec_nodes():
             proc, _trace_file, _stdout, _stderr, _variable_file, _ = node.get_main_sandbox()
@@ -215,8 +215,8 @@ def main():
     if args.log_file is None:
         logging.basicConfig(format="%(levelname)s|%(asctime)s|%(message)s")
     else:
-        logging.basicConfig(format="%(levelname)s|%(asctime)s|%(message)s", 
-                            filename=f"{os.path.abspath(args.log_file)}", 
+        logging.basicConfig(format="%(levelname)s|%(asctime)s|%(message)s",
+                            filename=f"{os.path.abspath(args.log_file)}",
                             filemode="w")
 
     # Set debug level
@@ -230,7 +230,7 @@ def main():
     config.SPECULATE_IMMEDIATELY = args.speculate_immediately
     scheduler = Scheduler(config.SCHEDULER_SOCKET)
     scheduler.run()
-   
+
 
 if __name__ == "__main__":
     main()

From 8f7a62bcbd84ce0bd643c798573518b75fc1b1a6 Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Fri, 9 Feb 2024 01:31:25 -0700
Subject: [PATCH 29/39] Adjust benchmark infra to the new scheduler logs

---
 report/benchmark_plots.py  | 84 +++++++++++++++++++++++---------
 report/benchmark_runner.py | 36 +++++++++-----
 report/command_executor.py |  0
 report/config_parser.py    |  2 +
 report/report_generator.py |  0
 report/result_analyzer.py  | 98 ++++++++++++--------------------------
 report/run_benchmarks.py   | 33 ++++++++++---
 7 files changed, 145 insertions(+), 108 deletions(-)
 mode change 100755 => 100644 report/benchmark_plots.py
 mode change 100755 => 100644 report/benchmark_runner.py
 mode change 100755 => 100644 report/command_executor.py
 mode change 100755 => 100644 report/config_parser.py
 mode change 100755 => 100644 report/report_generator.py
 mode change 100755 => 100644 report/result_analyzer.py
 mode change 100755 => 100644 report/run_benchmarks.py

diff --git a/report/benchmark_plots.py b/report/benchmark_plots.py
old mode 100755
new mode 100644
index c74f099d..177a8bdf
--- a/report/benchmark_plots.py
+++ b/report/benchmark_plots.py
@@ -1,5 +1,7 @@
 import os
 import matplotlib.pyplot as plt
+import numpy as np
+
 
 # Set the plotting style if desired
 # plt.style.use('ggplot')  # Example: ggplot style
@@ -33,7 +35,9 @@ def plot_benchmark_times_combined(benchmarks, bash_times, orch_times, output_dir
              [i + bar_width / 2 for i in range(len(benchmarks))], benchmarks)
     ax.legend()
 
-    save_plot(output_dir, filename)
+    plt.tight_layout()
+    plt.savefig(os.path.join(output_dir, f"{filename}.pdf"))
+
 
 # Plots individual comparison charts for each benchmark.
 def plot_benchmark_times_individual(benchmarks, bash_times, orch_times, output_dir, filename):
@@ -50,26 +54,62 @@ def plot_benchmark_times_individual(benchmarks, bash_times, orch_times, output_d
 
     save_plot(output_dir, filename)
 
-# Plots a Gantt chart of activities.
-def plot_gantt(activities, output_dir, filename, simple=False):
-    if simple:
-        activities = [activity for activity in activities if activity[0].startswith("RunNode,") or activity[0] == "Wait"]
-
-    fig_height = max(5, len(activities) * 0.3)
-    fig, ax = plt.subplots(figsize=(15, fig_height))
-
-    activities.sort(key=lambda x: x[1])
-    bar_height = 0.8
-    gap = 0.2
-
-    for index, (action, start_time, duration) in enumerate(activities):
-        ax.broken_barh([(start_time, duration)], (index * (bar_height + gap), bar_height), facecolors='blue')
-        ax.text(start_time + duration / 2, index * (bar_height + gap) + bar_height / 2, action, 
-                ha='center', va='center', fontsize=6, color='white')
-
-    setup_ax(ax, 'Time (ms)', '', f'Gantt Chart of {filename.strip("_gantt.pdf")}', [], [])
-    ax.set_yticks([i * (bar_height + gap) + bar_height / 2 for i in range(len(activities))])
-    ax.set_yticklabels([activity[0] for activity in activities], fontsize=8)
+def sort_node_ids(node_ids):
+    def parse_id(node_id):
+        parts = node_id.split('+')
+        concrete_id = int(parts[0])
+        iter_ids = tuple(int(iter_id) for iter_id in parts[1].split('-')) if len(parts) > 1 else ()
+        return (concrete_id,) + iter_ids
+
+    sorted_ids = sorted(node_ids, key=parse_id, reverse=True)
+    return sorted_ids
+
+def plot_prog_blocks(prog_blocks, output_dir, filename):
+    # Define colors for different statuses
+    colors = {
+        'READY': 'red',
+        'EXE': 'orange',
+        'SPEC_E': 'blue',
+        'SPEC_F': 'lightblue',
+        'COMMIT': 'green',
+        'UNSAFE': 'purple',
+        'INIT': 'grey'
+    }
+
+    first_time = prog_blocks[0][0]
+    times = [(block[0] - first_time).total_seconds() for block in prog_blocks]
+
+    unsorted_node_ids = {node[0] for block in prog_blocks for node in block[1]}
+    node_ids = sort_node_ids(unsorted_node_ids)  # Sort the node IDs using the custom sorting function
+
+    statuses = {node_id: [] for node_id in node_ids}
+
+    for block_time, nodes in prog_blocks:
+        elapsed_time = (block_time - first_time).total_seconds()
+        for node_id, status in nodes:
+            statuses[node_id].append((elapsed_time, status))
+
+    fig_height = 0.5 * len(node_ids)
+    fig_width = 12  # Fixed width
+    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
+    status_legend_handles = {}
+
+    for node_id in node_ids:
+        y_pos = node_ids.index(node_id)
+        for i, (start_time, status) in enumerate(statuses[node_id]):
+            end_time = times[-1] if i == len(statuses[node_id]) - 1 else statuses[node_id][i + 1][0]
+            color = colors.get(status, 'grey')
+            ax.broken_barh([(start_time, end_time - start_time)], (y_pos - 0.4, 0.8), facecolors=color)
+            if status not in status_legend_handles:
+                status_legend_handles[status] = plt.Rectangle((0, 0), 1, 1, fc=color)
+
+    ax.set_xlabel("Time since first tick (seconds)")
+    ax.set_ylabel("Node ID")
+    ax.set_title("Node Status Over Time")
+    ax.set_yticks(np.arange(len(node_ids)))
+    ax.set_yticklabels(node_ids)
     ax.grid(True)
+    ax.legend(status_legend_handles.values(), status_legend_handles.keys(), loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3, frameon=True)
 
-    save_plot(output_dir, filename)
+    plt.tight_layout()
+    plt.savefig(os.path.join(output_dir, f"{filename}.pdf"))
diff --git a/report/benchmark_runner.py b/report/benchmark_runner.py
old mode 100755
new mode 100644
index f62e380d..a7531c5c
--- a/report/benchmark_runner.py
+++ b/report/benchmark_runner.py
@@ -1,30 +1,43 @@
 import csv
+from typing import List
 from command_executor import CommandExecutor
+from config_parser import BenchmarkConfig
 from result_analyzer import ResultAnalyzer
 from report_generator import ReportGenerator
 import benchmark_plots
 import os
+from pprint import pprint
 
 class BenchmarkRunner:
-    def __init__(self, benchmarks, args):
+    def __init__(self, benchmarks: "List[BenchmarkConfig]", args):
         self.benchmarks = benchmarks
         self.args = args
         self.results = []
         self.activities = {}
+        
+    def __repr__(self):
+        return (f"BenchmarkRunner(benchmarks={self.benchmarks!r}, "
+                f"args={self.args!r}, results={self.results!r})")
+        
+    def __str__(self):
+        return (f"Benchmark Runner:\n"
+                f"  Benchmarks: {self.benchmarks}\n"
+                f"  Arguments: {self.args}\n"
+                f"  Results: {self.results}")
 
 
     def run_all_benchmarks(self):
         for benchmark in self.benchmarks:
             self.run_benchmark(benchmark)
 
-    def run_benchmark(self, benchmark):
+    def run_benchmark(self, benchmark: BenchmarkConfig):
         # Setup environment and pre-execution commands
         benchmark.setup_environment()
 
         if self.args.verbose:
             # Print verbose information
             print(f"\n---------> Running benchmark: {benchmark.name} <---------\n")
-            print(f"Environment Variables: {benchmark.env}")
+            print(">", benchmark)
         
         for pre_command in benchmark.pre_execution_script:
             CommandExecutor.run_pre_execution_command(pre_command, os.environ.get('RESOURCE_DIR'), self.args.verbose)
@@ -41,8 +54,8 @@ def run_benchmark(self, benchmark):
             os.environ.get('ORCH_COMMAND'),
             self.args.verbose)
         
-        activities = ResultAnalyzer.parse_logs_into_activities(orch_log)
-        self.activities[benchmark.name] = activities
+        prog_blocks = ResultAnalyzer.process_results(orch_log)
+        # pprint(prog_blocks)
 
         # Analyze and compare results
         diff_lines = ResultAnalyzer.compare_results(bash_output, orch_output)
@@ -52,8 +65,8 @@ def run_benchmark(self, benchmark):
         ReportGenerator.print_results(benchmark.name, bash_time, orch_time, diff_lines, verbose=self.args.verbose)
         if not self.args.no_logs:
             ReportGenerator.save_log_data(orch_log, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_log.log")
-        
-        ResultAnalyzer.analyze_node_execution_times(orch_log, benchmark.name, os.environ.get('REPORT_OUTPUT_DIR'), self.args.verbose)
+            
+        self.activities[benchmark.name] = prog_blocks
 
 
     def generate_reports(self):
@@ -76,10 +89,7 @@ def generate_plots(self):
 
         # Plot Gantt charts for each benchmark
         for benchmark in self.benchmarks:
-            activities = self.activities.get(benchmark.name, [])
+            activities = self.activities.get(benchmark.name)
             if activities:
-                benchmark_plots.plot_gantt(activities, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_gantt", simple=self.args.full_gantt)
-
-    
-
-    
\ No newline at end of file
+                print(os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_gantt")
+                benchmark_plots.plot_prog_blocks(activities, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_gantt")
diff --git a/report/command_executor.py b/report/command_executor.py
old mode 100755
new mode 100644
diff --git a/report/config_parser.py b/report/config_parser.py
old mode 100755
new mode 100644
index fa6eeb50..790713e5
--- a/report/config_parser.py
+++ b/report/config_parser.py
@@ -3,6 +3,7 @@
 
 
 class BenchmarkConfig:
+    
     def __init__(self, name, env, pre_execution_script, command, orch_args):
         self.name = name
         self.env = [self.replace_env_var(e) for e in env]
@@ -66,3 +67,4 @@ def parse_config(self):
 
     def get_benchmarks(self):
         return self.benchmarks
+    
diff --git a/report/report_generator.py b/report/report_generator.py
old mode 100755
new mode 100644
diff --git a/report/result_analyzer.py b/report/result_analyzer.py
old mode 100755
new mode 100644
index 8729ab79..e0a7d1df
--- a/report/result_analyzer.py
+++ b/report/result_analyzer.py
@@ -1,21 +1,37 @@
-import difflib
+from datetime import datetime
+import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
 import hashlib
-import os
-import csv
+import numpy as np
 
 class ResultAnalyzer:
     @staticmethod
-    def parse_logs_into_activities(log_data):
-        info_lines = [line.replace("INFO:root:>|", "").split("|") for line in log_data.split("\n") if line.startswith("INFO:root:>|")]
-        activities = []
-        for line in info_lines:
-            if len(line) == 4:
-                activity = line[1]
-                end_time = float(line[2].split(":")[1].rstrip("ms"))
-                step_time = float(line[3].split(":")[1].rstrip("ms"))
-                start_time = end_time - step_time
-                activities.append((activity, start_time, step_time))
-        return activities
+    def process_results(orch_log):
+        log_lines = orch_log.split("\n")
+        prog_blocks = []
+        current_block = []
+        block_start_time = None
+
+        for line in log_lines:
+            if line.startswith("INFO|") and "[PROG_LOG]" in line:
+                parts = line.split("|")
+                time_str = parts[1]
+                log_content = parts[2].strip()
+                if log_content == "[PROG_LOG]":
+                    # Start of a new block
+                    if current_block:
+                        prog_blocks.append((block_start_time, current_block))
+                        current_block = []
+                    block_start_time = datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S,%f")
+                else:
+                    # Continuing the current block
+                    state, node_id, command = log_content.replace("[PROG_LOG] ", "").split(",", 2)
+                    current_block.append((node_id.strip(), state.strip()))
+        # Append the last block if not empty
+        if current_block:
+            prog_blocks.append((block_start_time, current_block))
+
+        return prog_blocks
 
     @staticmethod
     def compare_results(bash_output, orch_output, max_lines=1000):
@@ -35,56 +51,4 @@ def compare_results(bash_output, orch_output, max_lines=1000):
             if hash_value not in bash_hashes:
                 diffs.append(f'+ {line}')
 
-        return diffs
-    
-    @staticmethod
-    def analyze_node_execution_times(orch_output, benchmark_name, output_dir, verbose):
-        node_times_dict = ResultAnalyzer.extract_node_times(orch_output)
-
-        if verbose:
-            ResultAnalyzer.print_node_execution_times(node_times_dict)
-
-        ResultAnalyzer.generate_node_times_csv(node_times_dict, benchmark_name, output_dir)
-
-    @staticmethod
-    def print_node_execution_times(node_times_dict):
-        print("-" * 40)
-        print("Node Execution Times:")
-        for node in sorted(node_times_dict.keys()):
-            times = node_times_dict[node]
-            num_executions = len(times)
-            time_lost = sum(times) - times[-1] if times else 0
-            times_str = ', '.join(f'{time:7.2f}ms' for time in times)
-            print(f"Node {node:2d}: Executions: {num_executions}, Time Lost: {time_lost:7.2f}ms Times = {times_str} ")
-        print("-" * 40)
-        
-    @staticmethod
-    def generate_node_times_csv(node_times_dict, benchmark_name, output_dir):
-        csv_filename = os.path.join(output_dir, f"{benchmark_name}_execution_times.csv")
-        with open(csv_filename, 'w', newline='') as csv_file:
-            writer = csv.writer(csv_file)
-            writer.writerow(["Node", "Execution Times (ms)", "Number of Executions", "Time Lost (ms)"])
-            for node in sorted(node_times_dict.keys()):
-                times = node_times_dict[node]
-                num_executions = len(times)
-                time_lost = sum(times) - times[-1] if times else 0
-                writer.writerow([node, ', '.join(str(time) for time in times), num_executions, time_lost])
-
-    @staticmethod
-    def extract_node_times(orch_output):
-        node_times_dict = {}
-
-        relevant_lines = [line.replace("INFO:root:>|PartialOrder|RunNode,", "") 
-                        for line in orch_output.split("\n") 
-                        if line.startswith("INFO:root:>|PartialOrder|RunNode,") and "Step time:" in line]
-
-        for line in relevant_lines:
-            parts = line.split("|")
-            node_id = int(parts[0])
-            time = float(parts[2].split(":")[1][:-2])  # Extract step time
-
-            if node_id not in node_times_dict:
-                node_times_dict[node_id] = []
-            node_times_dict[node_id].append(time)
-
-        return node_times_dict
+        return diffs
\ No newline at end of file
diff --git a/report/run_benchmarks.py b/report/run_benchmarks.py
old mode 100755
new mode 100644
index bdacc596..1bf98ce2
--- a/report/run_benchmarks.py
+++ b/report/run_benchmarks.py
@@ -13,7 +13,7 @@ def print_startup_info(args):
         print(f"    {arg + ':':13s} {value}")
 
     print("> Environment Variables:")
-    for env_var in ['ORCH_TOP', 'WORKING_DIR', 'TEST_SCRIPT_DIR', 'RESOURCE_DIR', 'PASH_TOP', 'PASH_SPEC_TOP']:
+    for env_var in ['ORCH_TOP', 'WORKING_DIR', 'TEST_SCRIPT_DIR', 'RESOURCE_DIR', 'REPORT_OUTPUT_DIR', 'PASH_TOP', 'PASH_SPEC_TOP', 'ORCH_COMMAND']:
         print(f"    {env_var + ':':17s} {os.environ.get(env_var)}")
 
 def parse_args():
@@ -23,9 +23,10 @@ def parse_args():
     parser.add_argument('--csv-output', action='store_true', help="Generate and save results in CSV format.")
     parser.add_argument('--verbose', action='store_true', help="Enable verbose output.")
     parser.add_argument('--full-gantt', action='store_false', help="Generate a full Gantt chart for each benchmark.")
-    parser.add_argument('--config-file', type=str, default='benchmark_config.json', help="Path to the benchmark configuration file. Default is 'benchmark_config.json'.")
+    parser.add_argument('--config-file', type=str, default=None, help="Path to the benchmark configuration file. Default is 'benchmark_config.json'.")
     parser.add_argument('--setup-script', type=str, default=None, help="Path to a setup script to run before running any other benchmark.")
     parser.add_argument('--subset', type=str, default=None, help="Name of a subset of benchmarks to run. Will instead download and store outputs in the dir with the specified name.")
+    parser.add_argument('--no-setup', action='store_true', help="Do not run any setup script before running benchmarks. Assumes subset is also set.")
     return parser.parse_args()
 
 # Sets the required environment variables for the benchmarking process.
@@ -72,11 +73,16 @@ def main():
     args = parse_args()
     
     set_environment_variables(args)
-    
-
-    # Use the config file path from arguments
-    config_file_path = os.path.join(os.environ['WORKING_DIR'], args.config_file)
 
+    if args.config_file is not None:
+        if args.verbose:
+            print(f"Config File: {args.config_file}")
+        config_file_path = os.path.join(os.environ['WORKING_DIR'], args.config_file)
+    elif args.subset:
+        config_file_path = os.path.join(os.environ['TEST_SCRIPT_DIR'], "setup", "config.json")
+    else:
+        config_file_path = os.path.join(os.environ['WORKING_DIR'], "benchmark_config.json")
+        
     # Parse benchmark configurations
     config_parser = ConfigParser(os.path.join(os.environ['WORKING_DIR'], config_file_path))
     config_parser.parse_config()
@@ -86,6 +92,21 @@ def main():
     
     if args.verbose:
         print_startup_info(args)
+        print(config_parser)
+
+    
+    if args.setup_script:  # an explicitly supplied script takes precedence
+        if args.verbose:
+            print(f"Running setup script: {args.setup_script}")
+        subprocess.run(['bash', args.setup_script])
+    elif args.subset and not args.no_setup:  # otherwise try the subset's bundled setup/setup.sh
+        setup_script = os.path.join(os.environ['TEST_SCRIPT_DIR'], "setup", 'setup.sh')
+        if os.path.exists(setup_script):
+            if args.verbose:
+                print(f"Running setup script: {setup_script}")  # report the path actually run; args.setup_script is unset here
+            subprocess.run(['bash', setup_script])
+        elif args.verbose:
+            print(f"No setup script found in {os.environ['TEST_SCRIPT_DIR']}, ignoring")
     
     # Initialize and run the BenchmarkRunner
     runner = BenchmarkRunner(config_parser.get_benchmarks(), args)

From a1118589a2d1c923a0756f266f301207872f9ce7 Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Fri, 9 Feb 2024 01:33:04 -0700
Subject: [PATCH 30/39] Print node id when logging node state

---
 parallel-orch/node.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parallel-orch/node.py b/parallel-orch/node.py
index 9bb8ae22..1f34e1b3 100644
--- a/parallel-orch/node.py
+++ b/parallel-orch/node.py
@@ -217,7 +217,7 @@ def asts(self):
         return self.abstract_node.asts
 
     def pretty_state_repr(self):
-        return f'{state_pstr(self.state)} {self.cmd}'
+        return f'{state_pstr(self.state)},{self.id_},{self.cmd}'
 
     def is_initialized(self):
         return self.state == NodeState.INIT

From c6e3df0e57165e1f20ab2819bba1d23a05a60258 Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Fri, 9 Feb 2024 02:53:12 -0700
Subject: [PATCH 31/39] Add custom workdir definition functionality

---
 report/benchmark_runner.py | 13 +++++++---
 report/config_parser.py    |  5 +++-
 report/run_benchmarks.py   |  1 -
 report/time_script.py      | 50 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 6 deletions(-)
 create mode 100644 report/time_script.py

diff --git a/report/benchmark_runner.py b/report/benchmark_runner.py
index a7531c5c..9cee12bb 100644
--- a/report/benchmark_runner.py
+++ b/report/benchmark_runner.py
@@ -42,15 +42,21 @@ def run_benchmark(self, benchmark: BenchmarkConfig):
         for pre_command in benchmark.pre_execution_script:
             CommandExecutor.run_pre_execution_command(pre_command, os.environ.get('RESOURCE_DIR'), self.args.verbose)
 
+        if benchmark.command_working_dir:
+            workdir = benchmark.command_working_dir
+        else:
+            workdir = os.environ.get('TEST_SCRIPT_DIR')
+        
+        
         # Execute the benchmark
         bash_time, bash_output, _ = CommandExecutor.run_command(
             benchmark.command.split(" "), 
-            os.environ.get('TEST_SCRIPT_DIR'), 
+            workdir, 
             self.args.verbose)
         orch_time, orch_output, orch_log = CommandExecutor.run_command_with_orch(
             benchmark.command.split(" "), 
             benchmark.orch_args, 
-            os.environ.get('TEST_SCRIPT_DIR'), 
+            workdir, 
             os.environ.get('ORCH_COMMAND'),
             self.args.verbose)
         
@@ -91,5 +97,4 @@ def generate_plots(self):
         for benchmark in self.benchmarks:
             activities = self.activities.get(benchmark.name)
             if activities:
-                print(os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_gantt")
-                benchmark_plots.plot_prog_blocks(activities, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_gantt")
+                benchmark_plots.plot_prog_blocks(activities, os.environ.get('REPORT_OUTPUT_DIR'), f"{benchmark.name}_progress")
diff --git a/report/config_parser.py b/report/config_parser.py
index 790713e5..512a89ab 100644
--- a/report/config_parser.py
+++ b/report/config_parser.py
@@ -4,10 +4,11 @@
 
 class BenchmarkConfig:
     
-    def __init__(self, name, env, pre_execution_script, command, orch_args):
+    def __init__(self, name, env, pre_execution_script, command_working_dir, command, orch_args):
         self.name = name
         self.env = [self.replace_env_var(e) for e in env]
         self.pre_execution_script = [self.replace_env_var(script) for script in pre_execution_script]
+        self.command_working_dir = self.replace_env_var(command_working_dir)
         self.command = self.replace_env_var(command)
         self.orch_args = self.replace_env_var(orch_args)
         
@@ -22,6 +23,7 @@ def __str__(self):
         return (f"Benchmark '{self.name}':\n"
                 f"  Environment Variables: {env_str}\n"
                 f"  Pre-execution Script: {pre_exec_str}\n"
+                f"  Command Working Directory: {self.command_working_dir}\n"
                 f"  Command: {self.command}\n"
                 f"  Orchestrator Arguments: {self.orch_args}")
 
@@ -60,6 +62,7 @@ def parse_config(self):
                     name=config.get('name'),
                     env=config.get('env', []),
                     pre_execution_script=config.get('pre_execution_script', []),
+                    command_working_dir=config.get('working_dir', ""),
                     command=config.get('command'),
                     orch_args=config.get('orch_args', "")
                 )
diff --git a/report/run_benchmarks.py b/report/run_benchmarks.py
index 1bf98ce2..5d560e69 100644
--- a/report/run_benchmarks.py
+++ b/report/run_benchmarks.py
@@ -22,7 +22,6 @@ def parse_args():
     parser.add_argument('--no-logs', action='store_true', help="Do not save log files of benchmark runs.")
     parser.add_argument('--csv-output', action='store_true', help="Generate and save results in CSV format.")
     parser.add_argument('--verbose', action='store_true', help="Enable verbose output.")
-    parser.add_argument('--full-gantt', action='store_false', help="Generate a full Gantt chart for each benchmark.")
     parser.add_argument('--config-file', type=str, default=None, help="Path to the benchmark configuration file. Default is 'benchmark_config.json'.")
     parser.add_argument('--setup-script', type=str, default=None, help="Path to a setup script to run before running any other benchmark.")
     parser.add_argument('--subset', type=str, default=None, help="Name of a subset of benchmarks to run. Will instead download and store outputs in the dir with the specified name.")
diff --git a/report/time_script.py b/report/time_script.py
new file mode 100644
index 00000000..3ef96f2e
--- /dev/null
+++ b/report/time_script.py
@@ -0,0 +1,50 @@
+import subprocess
+import time
+import csv
+import sys
+
+def format_time(seconds):
+    # Format time in seconds with four decimal places (e.g. "12.3457")
+    return "{:.4f}".format(seconds)
+
+def time_commands(shell_script_path):
+    # Read the shell script
+    with open(shell_script_path, 'r') as file:
+        commands = file.readlines()
+    
+    # Prepare the CSV output
+    csv_filename = shell_script_path + "_timing.csv"
+    with open(csv_filename, 'w', newline='') as csvfile:
+        csvwriter = csv.writer(csvfile)
+        csvwriter.writerow(["Command", "Time (seconds)"])
+        
+        # Initialize total time
+        total_time = 0.0
+        
+        # Execute each command and time it
+        for command in commands:
+            command = command.strip()
+            if command and not command.startswith('#'):  # Ignore empty lines and comments
+                start_time = time.time()
+                try:
+                    # Run the command
+                    result = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                    execution_time = time.time() - start_time
+                    # Write the result to the CSV with formatted time
+                    csvwriter.writerow([command, format_time(execution_time)])
+                    total_time += execution_time
+                except subprocess.CalledProcessError as e:
+                    print(f"An error occurred while executing the command: {command}")
+                    print(e.output.decode())
+                    sys.exit(1)
+        
+        # Write the total time with formatted time
+        csvwriter.writerow(["Total", format_time(total_time)])
+
+    print(f"Timing results written to {csv_filename}")
+
+# Usage: python time_script.py /path/to/your/script.sh
+if len(sys.argv) > 1:
+    time_commands(sys.argv[1])
+else:
+    print("Please provide the path to the shell script as an argument.")

From fadd38207ab2a0357a7e7368cb7acc1f2768c264 Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Sun, 11 Feb 2024 12:39:28 -0700
Subject: [PATCH 32/39] Fix config

---
 report/benchmarks/dgsh/setup/config.json | 40 +++++++++++++++++++-----
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/report/benchmarks/dgsh/setup/config.json b/report/benchmarks/dgsh/setup/config.json
index 1660c620..1e27be57 100644
--- a/report/benchmarks/dgsh/setup/config.json
+++ b/report/benchmarks/dgsh/setup/config.json
@@ -1,7 +1,7 @@
-[
+[ 
     {
         "name": "Dgsh 1.sh",
-        "env": ["INPUT_FILE={{RESOURCE_DIR}}/testmini.csv"],
+        "env": ["INPUT_FILE={{RESOURCE_DIR}}/dblp.xml"],
         "command": "{{TEST_SCRIPT_DIR}}/dgsh/1.sh",
         "orch_args": "-d 2"
     },
@@ -13,8 +13,14 @@
     },
     {
         "name": "Dgsh 3.sh",
-        "command": "{{TEST_SCRIPT_DIR}}/dgsh/3.sh",
-        "working_dir": "{{RESOURCE_DIR}}/linux",
+        "command": "{{TEST_SCRIPT_DIR}}/3.sh",
+        "working_dir": "{{RESOURCE_DIR}}/linux/",
+        "orch_args": "-d 2"
+    },
+    {
+        "name": "Dgsh 4.sh",
+        "command": "{{TEST_SCRIPT_DIR}}/4.sh",
+        "working_dir": "{{RESOURCE_DIR}}/linux/",
         "orch_args": "-d 2"
     },
     {
@@ -24,8 +30,8 @@
         "orch_args": "-d 2"
     },
     {
-        "name": "Dgsh 6.sh",
-        "env": ["INPUT_FILE={{RESOURCE_DIR}}/larger_file.txt"],
+        "name": "6.sh",
+        "env": ["INPUT_FILE={{RESOURCE_DIR}}/pg100.txt"],
         "command": "{{TEST_SCRIPT_DIR}}/6.sh",
         "orch_args": "-d 2"
     },
@@ -37,8 +43,26 @@
     },
     {
         "name": "8.sh",
-        "env": ["INPUT_FILE={{RESOURCE_DIR}}/larger_file.txt"],
+        "env": ["INPUT_FILE={{RESOURCE_DIR}}/pg100.txt"],
         "command": "{{TEST_SCRIPT_DIR}}/8_no_func.sh",
         "orch_args": "-d 2"
+    },
+    {
+        "name": "9.sh",
+        "env": ["INPUT_FILE={{RESOURCE_DIR}}/pg100.txt"],
+        "command": "{{TEST_SCRIPT_DIR}}/9.sh",
+        "orch_args": "-d 2"
+    },
+    {
+        "name": "17.sh",
+        "env": ["INPUT_FILE={{RESOURCE_DIR}}/goods_classification.csv"],
+        "command": "{{TEST_SCRIPT_DIR}}/17.sh",
+        "orch_args": "-d 2"
+    },
+    {
+        "name": "18.sh",
+        "working_dir": "{{RESOURCE_DIR}}/linux/",
+        "command": "{{TEST_SCRIPT_DIR}}/18.sh",
+        "orch_args": "-d 2"
     }
-]
+]
\ No newline at end of file

From a98b95603e6042dfe6b59f735d59aa27f8558991 Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Sun, 11 Feb 2024 12:40:48 -0700
Subject: [PATCH 33/39] Fix benchmarks to work properly

---
 report/benchmarks/dgsh/17.sh        | 21 +++++++--------------
 report/benchmarks/dgsh/18.sh        | 22 ++++++++++++++++------
 report/benchmarks/dgsh/3.sh         |  0
 report/benchmarks/dgsh/4.sh         |  0
 report/benchmarks/dgsh/5.sh         | 19 ++++++++-----------
 report/benchmarks/dgsh/6.sh         |  3 ---
 report/benchmarks/dgsh/8_no_func.sh | 12 ++++--------
 report/benchmarks/dgsh/9.sh         |  2 +-
 8 files changed, 36 insertions(+), 43 deletions(-)
 mode change 100644 => 100755 report/benchmarks/dgsh/17.sh
 mode change 100644 => 100755 report/benchmarks/dgsh/18.sh
 mode change 100644 => 100755 report/benchmarks/dgsh/3.sh
 mode change 100644 => 100755 report/benchmarks/dgsh/4.sh
 mode change 100644 => 100755 report/benchmarks/dgsh/9.sh

diff --git a/report/benchmarks/dgsh/17.sh b/report/benchmarks/dgsh/17.sh
old mode 100644
new mode 100755
index 52ada7b4..df2788ff
--- a/report/benchmarks/dgsh/17.sh
+++ b/report/benchmarks/dgsh/17.sh
@@ -38,20 +38,13 @@ file2=$(mktemp)
 file3=$(mktemp)
 file4=$(mktemp)
 
-# Save the ls output to a temporary file
-ls -n > "$file1"
+cat $INPUT_FILE > $file1
 
-# Reorder fields in DIR-like way
-awk '!/^total/ {print $6, $7, $8, $1, sprintf("%8d", $5), $9}' "$file1" > "$file2"
+# Extract columns 5 and 6, save to file2
+cut -d ',' -f 5-6 "$file1" > "$file2"
 
-# Count number of files
-wc -l "$file1" | tr -d \\n > "$file3"
-echo -n ' File(s) ' >> "$file3"
-awk '{s += $5} END {printf("%d bytes\n", s)}' "$file1" >> "$file3"
+# Extract columns 2, 3, and 4, save to file3
+cut -d ',' -f 2-4 "$file1" > "$file3"
 
-# Count number of directories and print label for number of dirs and calculate free bytes
-grep -c '^d' "$file1" | tr -d \\n > "$file4"
-df -h . | awk '!/Use%/{print " Dir(s) " $4 " bytes free"}' >> "$file4"
-
-# Display the results
-cat "$file2" "$file3" "$file4"
+# Combine the columns
+paste -d ',' "$file2" "$file3"
diff --git a/report/benchmarks/dgsh/18.sh b/report/benchmarks/dgsh/18.sh
old mode 100644
new mode 100755
index dac2892b..87c886f6
--- a/report/benchmarks/dgsh/18.sh
+++ b/report/benchmarks/dgsh/18.sh
@@ -29,21 +29,31 @@ file_details_file=$(mktemp)
 file_count_file=$(mktemp)
 dir_count_file=$(mktemp)
 byte_count_file=$(mktemp)
+#!/bin/sh
+
+# Create temporary files
+free_space_file=$(mktemp)
+file_details_file=$(mktemp)
+file_count_file=$(mktemp)
+dir_count_file=$(mktemp)
+byte_count_file=$(mktemp)
+
+# Base directory for the listing
 
 # Get free space
 df -h . | awk '!/Use%/{print $4}' > "$free_space_file"
 
-# List details of files and directories
-ls -l | awk '!/^total/ {print $6, $7, $8, $1, sprintf("%8d", $5), $9}' > "$file_details_file"
+# Recursively list details of files
+find . -type f -exec ls -l {} + | awk '{print $6, $7, $8, $1, sprintf("%8d", $5), $9}' > "$file_details_file"
 
 # Count number of files
-ls -l | grep -v '^total' | grep -v '^d' | wc -l > "$file_count_file"
+find . -type f | wc -l > "$file_count_file"
 
 # Count number of directories
-ls -l | grep '^d' | wc -l > "$dir_count_file"
+find . -type d | wc -l > "$dir_count_file"
 
-# Calculate total bytes
-ls -l | awk '{if($1 != "total") s += $5} END {print s}' > "$byte_count_file"
+# Calculate total bytes for files
+find . -type f -exec stat --format="%s" {} + | awk '{s+=$1} END {print s}' > "$byte_count_file"
 
 # Display the results
 cat "$file_details_file"
diff --git a/report/benchmarks/dgsh/3.sh b/report/benchmarks/dgsh/3.sh
old mode 100644
new mode 100755
diff --git a/report/benchmarks/dgsh/4.sh b/report/benchmarks/dgsh/4.sh
old mode 100644
new mode 100755
diff --git a/report/benchmarks/dgsh/5.sh b/report/benchmarks/dgsh/5.sh
index 0f2c5855..47a52b82 100755
--- a/report/benchmarks/dgsh/5.sh
+++ b/report/benchmarks/dgsh/5.sh
@@ -33,25 +33,22 @@
 file1=$(mktemp)
 file2=$(mktemp)
 file3=$(mktemp)
-file4=$(mktemp)
-
-# export LC_ALL=C
 
 cat $INPUT_FILE >"$file1"
 
 # Find errors
 
 # Obtain list of words in text
-cat "$file1" |
-tr -cs A-Za-z \\n |
-tr A-Z a-z |
-sort -u > "$file2"
+cat "$file1" | 
+tr '[:upper:]' '[:lower:]' | 
+sed 's/[^a-z]/\n/g' | 
+grep -v '^$' | 
+sort | 
+uniq | 
+grep -v '^$' > "$file2"
 
 # Ensure dictionary is compatibly sorted
-cat "$file1" |
 sort /usr/share/dict/words > "$file3"
 
 # List errors as a set difference
-comm -23 "$file2" "$file3" > "$file4"
-
-fgrep -f "$file4" -i --color -w -C 2 "$file1"
+comm -23 "$file2" "$file3" 
diff --git a/report/benchmarks/dgsh/6.sh b/report/benchmarks/dgsh/6.sh
index 53828e0f..ec48a28f 100755
--- a/report/benchmarks/dgsh/6.sh
+++ b/report/benchmarks/dgsh/6.sh
@@ -42,9 +42,6 @@ file5=$(mktemp)
 
 cat $INPUT_FILE > $file1
 
-# Consistent sorting across machines
-# export LC_ALL=C
-
 # Stream input from file and split input one word per line
 # Create list of unique words
 tr -cs a-zA-Z '\n' < "$file1" |
diff --git a/report/benchmarks/dgsh/8_no_func.sh b/report/benchmarks/dgsh/8_no_func.sh
index ae676918..e4f760e3 100755
--- a/report/benchmarks/dgsh/8_no_func.sh
+++ b/report/benchmarks/dgsh/8_no_func.sh
@@ -38,9 +38,6 @@
 #  limitations under the License.
 #
 
-# Consistent sorting across machines
-# export LC_ALL=C
-
 # Temporary files
 file1=$(mktemp)
 file2=$(mktemp)
@@ -48,7 +45,6 @@ file3=$(mktemp)
 file4=$(mktemp)
 
 cat $INPUT_FILE > $file1
-cat $file1
 
 # Split input one word per line
 tr -cs a-zA-Z '\n' < "$file1" > "$file2"
@@ -86,7 +82,7 @@ awk '{count[$1]++} END {for (i in count) print count[i], i}' |
 sort -rn | tee "$file3"
 
 # Print relative
-# echo "Relative character frequency"
-# awk -v NCHARS=$nchars 'BEGIN {
-# 		OFMT = "%.2g%%"}
-# 		{print $1, $2, $1 / NCHARS * 100}' "$file3"
\ No newline at end of file
+echo "Relative character frequency"
+awk -v NCHARS=$nchars 'BEGIN {
+		OFMT = "%.2g%%"}
+		{print $1, $2, $1 / NCHARS * 100}' "$file3"
\ No newline at end of file
diff --git a/report/benchmarks/dgsh/9.sh b/report/benchmarks/dgsh/9.sh
old mode 100644
new mode 100755
index 88ee52a0..f090d36f
--- a/report/benchmarks/dgsh/9.sh
+++ b/report/benchmarks/dgsh/9.sh
@@ -33,7 +33,7 @@ file2=$(mktemp)
 file3=$(mktemp)
 
 # Find object files and print defined symbols
-find "$INPUT" -name "*.o" | xargs nm > "$file1"
+find . -type f -name "*.o" | xargs nm > "$file1"
 
 # List all defined (exported) symbols
 awk 'NF == 3 && $2 ~ /[A-Z]/ {print $3}' "$file1" | sort > "$file2"

From 0450d5d47fcb0a84c100fed8b8fee45e0956eb73 Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Sun, 11 Feb 2024 17:29:17 -0500
Subject: [PATCH 34/39] make strace spawn shell such that it traces the
 redirection

---
 parallel-orch/template_script_to_execute.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parallel-orch/template_script_to_execute.sh b/parallel-orch/template_script_to_execute.sh
index ea36ac4e..5af85d64 100755
--- a/parallel-orch/template_script_to_execute.sh
+++ b/parallel-orch/template_script_to_execute.sh
@@ -33,7 +33,7 @@
 # rkr --db "$TEMPDIR" --rikerfile "$TEMPDIR/Rikerfile" --debug trace -o "$TRACE_FILE" > /dev/null
 # echo 'second riker run done' 1>&2
 source $LATEST_ENV_FILE
-eval $(echo "strace -y -f  --seccomp-bpf --trace=fork,clone,%file -o $TRACE_FILE $CMD_STRING")
+strace -y -f  --seccomp-bpf --trace=fork,clone,%file -o $TRACE_FILE bash -c "source $LATEST_ENV_FILE; $CMD_STRING"
 exit_code=$?
 
 (exit $exit_code)

From 5be2a6cce5c3e2ca4245f9e5bf63b75ae381b756 Mon Sep 17 00:00:00 2001
From: Guest <di_jin@brown.edu>
Date: Mon, 12 Feb 2024 14:28:29 -0500
Subject: [PATCH 35/39] adding getxattr, lgetxattr, and faccessat2 into tracing

---
 parallel-orch/trace_v2.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/parallel-orch/trace_v2.py b/parallel-orch/trace_v2.py
index 10c52c47..d0696944 100644
--- a/parallel-orch/trace_v2.py
+++ b/parallel-orch/trace_v2.py
@@ -8,16 +8,16 @@
 # Global TODOs:
 # handle pwd, such that open and stat can work
 
-# not handled: listxattr, llistxattr, getxattr, lgetxattr, pivot_root, mount, umount2
+# not handled: listxattr, llistxattr, pivot_root, mount, umount2
 # setxattr lsetxattr removexattr lremovexattr, fanotify_mark, renameat2, chroot, quotactl
 # handled individually openat, open, chdir, clone, rename
 # TODO: link, symlink, renameat, symlinkat
 r_first_path_set = set(['execve', 'stat', 'lstat', 'access', 'statfs',
-                        'readlink', 'execve'])
+                        'readlink', 'execve', 'getxattr', 'lgetxattr'])
 w_first_path_set = set(['mkdir', 'rmdir', 'truncate', 'creat', 'chmod', 'chown',
                         'lchown', 'utime', 'mknod', 'utimes', 'acct', 'unlink'])
 r_fd_path_set = set(['fstatat', 'newfstatat', 'statx', 'name_to_handle_at',
-                     'readlinkat', 'faccessat', 'execveat'])
+                     'readlinkat', 'faccessat', 'execveat', 'faccessat2'])
 w_fd_path_set = set(['unlinkat', 'utimensat', 'mkdirat', 'mknodat', 'fchownat', 'futimeat',
                      'unlinkat', 'linkat', 'fchmodat', 'utimensat'])
 ignore_set = set(['getpid', 'getcwd'])
@@ -89,6 +89,9 @@ def parse_string(s):
     # as a read when we handle return value anyway so it's fine
     if s == 'NULL':
         return ''
+    if not s[0] == '"' or not s[-1] == '"':
+        import pdb
+        pdb.set_trace()
     assert s[0] == '"' and s[-1] == '"'
     return bytes(s[1:-1], "utf-8").decode("unicode_escape")
 

From 6f2266f3df6ceb3e098d87e82c96bbce6ac307da Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Sat, 17 Feb 2024 15:57:25 +0000
Subject: [PATCH 36/39] Use most recent try branch

---
 deps/try | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deps/try b/deps/try
index 37bbf7da..ba6a9061 160000
--- a/deps/try
+++ b/deps/try
@@ -1 +1 @@
-Subproject commit 37bbf7da5bfde97f598c3327c9582d9b08d7e264
+Subproject commit ba6a90615944203a95d5a86638447da34e539d1b

From 1172d7fbb12f29c7348a78c0319e895a3b1b4a97 Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Sat, 17 Feb 2024 16:08:08 +0000
Subject: [PATCH 37/39] Remove Riker dependency

---
 .gitmodules                      |  4 ----
 deps/riker                       |  1 -
 scripts/install_deps_ubuntu20.sh | 10 ++--------
 3 files changed, 2 insertions(+), 13 deletions(-)
 delete mode 160000 deps/riker

diff --git a/.gitmodules b/.gitmodules
index ba4f5c51..bf6ada7c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,3 @@
-[submodule "deps/riker"]
-	path = deps/riker
-	url = https://github.com/angelhof/riker.git
-	branch = eric-custom-db-store
 [submodule "deps/pash"]
 	path = deps/pash
 	url = https://github.com/binpash/pash.git
diff --git a/deps/riker b/deps/riker
deleted file mode 160000
index f3bee7ba..00000000
--- a/deps/riker
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit f3bee7ba19b8834199ff49dac53852f09f03338a
diff --git a/scripts/install_deps_ubuntu20.sh b/scripts/install_deps_ubuntu20.sh
index 8d239d2a..f884202c 100755
--- a/scripts/install_deps_ubuntu20.sh
+++ b/scripts/install_deps_ubuntu20.sh
@@ -1,20 +1,14 @@
 #!/bin/bash
 
-## Install Riker's dependencies
 sudo apt-get update
-sudo apt install -y make clang llvm git gcc python3-cram file graphviz libtool
-sudo update-alternatives --install /usr/bin/cram cram /usr/bin/cram3 100
+# TODO: some of these are Riker dependencies that are no longer needed.
+sudo apt install -y make clang llvm git gcc python3-cram file graphviz libtool python3-matplotlib
 
 export PASH_SPEC_TOP=${PASH_SPEC_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)}
 export PASH_TOP=${PASH_TOP:-$PASH_SPEC_TOP/deps/pash}
 
-pip3 install $PASH_SPEC_TOP/requirements.txt
-
 ## Download submodule dependencies
 git submodule update --init --recursive
 
-## Install Riker
-(cd deps/riker; make; sudo make install)
-
 ## Install PaSh
 (cd deps/pash; ./scripts/distro-deps.sh; ./scripts/setup-pash.sh)

From d0851fdd5988b6584e96d3c2464af29ff508cbc6 Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Sat, 17 Feb 2024 16:11:36 +0000
Subject: [PATCH 38/39] Update config to also setup try

---
 scripts/install_deps_ubuntu20.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/install_deps_ubuntu20.sh b/scripts/install_deps_ubuntu20.sh
index f884202c..1bcf4122 100755
--- a/scripts/install_deps_ubuntu20.sh
+++ b/scripts/install_deps_ubuntu20.sh
@@ -2,7 +2,7 @@
 
 sudo apt-get update
 # TODO: some of these are Riker dependencies that are no longer needed.
-sudo apt install -y make clang llvm git gcc python3-cram file graphviz libtool python3-matplotlib
+sudo apt install -y make git python3-cram file graphviz libtool python3-matplotlib libcap2-bin mergerfs
 
 export PASH_SPEC_TOP=${PASH_SPEC_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)}
 export PASH_TOP=${PASH_TOP:-$PASH_SPEC_TOP/deps/pash}
@@ -10,5 +10,8 @@ export PASH_TOP=${PASH_TOP:-$PASH_SPEC_TOP/deps/pash}
 ## Download submodule dependencies
 git submodule update --init --recursive
 
+# Install try
+(cd deps/try; ./setup.sh)
+
 ## Install PaSh
 (cd deps/pash; ./scripts/distro-deps.sh; ./scripts/setup-pash.sh)

From c569b8313a3a96f7dee6897d7cb5c637ea450a7b Mon Sep 17 00:00:00 2001
From: gliargovas <gliargovas@aueb.gr>
Date: Sat, 17 Feb 2024 16:12:23 +0000
Subject: [PATCH 39/39] Make CI use Python 3.11

---
 .github/workflows/tests.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 193e1af2..1e187580 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -31,6 +31,10 @@ jobs:
       if: github.event.pull_request.draft == false
       steps:
       - uses: actions/checkout@v2 
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.11'
       - name: Running Correctness Tests
         run: |
           cd ..