Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
2d16cd7
some refactoring of spec support
angelhof Apr 27, 2023
57a0dc4
Add some initial scaffolding for loop support
angelhof Apr 27, 2023
ca702f3
Fix an issue in installation
angelhof May 2, 2023
80a2255
Add comments on how to address the loop issue
angelhof May 2, 2023
c135d1d
push pash
angelhof May 2, 2023
ece0f17
Add a comment about how to unroll a loop concretely
angelhof May 2, 2023
bfc9877
follow latest pash on loop iterations
angelhof May 2, 2023
198a493
checkpoint
angelhof May 3, 2023
84dbdb2
checkpoint
angelhof May 3, 2023
057fe54
Correctly pass loop counters to scheduler from pash-jit
angelhof May 3, 2023
8570725
Merge remote-tracking branch 'origin/main' into loop-support
angelhof May 3, 2023
bb9be4e
add some assertions to check that loop nodes cannot be executed
angelhof May 4, 2023
52ad1d0
Push a checkpoint before doing the big change of changing all NodeIds…
angelhof May 4, 2023
cf97e13
Make NodeId its own class
angelhof May 4, 2023
9ecd8d5
Start unrolling the loop
angelhof May 4, 2023
8a0d787
rough first draft of loops seems to be working
angelhof May 4, 2023
0001a66
Solve a bug where the rkr cache was there and so there was no reexecu…
angelhof May 4, 2023
7f16341
first draft of loop works, next fix multiple commands in the same loop
angelhof May 4, 2023
6691454
refactor dependency resolution
angelhof May 8, 2023
004b506
Fix the issue with multiple commands in the same loop
angelhof May 8, 2023
edea974
follow latest pash branch
angelhof May 8, 2023
92fdefa
Find non committed nodes properly
angelhof May 8, 2023
bc76c94
correctly reroute edges when unrolling a loop
angelhof May 8, 2023
0c7f88f
Remove an unnecessary cursed old method
angelhof May 8, 2023
f75ba68
Refactor cmd_can_be_resolved so that it is easier to work with
angelhof May 8, 2023
5cbb019
Add loop nodes to committed once we receive a later wait
angelhof May 8, 2023
f446671
Refactor PO
angelhof May 9, 2023
ed1db37
Merge remote-tracking branch 'origin/main' into loop-support
angelhof May 9, 2023
03300ab
Add a loop test
angelhof May 10, 2023
6100325
Refactor step_forward
angelhof May 10, 2023
d78fc9b
Clean up a side-effectful pure function
angelhof May 10, 2023
4825e77
Fix commands after loop and some more refactoring
angelhof May 10, 2023
d2352b6
remove loop
angelhof May 10, 2023
b6e7b76
Follow pash future
angelhof May 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
747 changes: 627 additions & 120 deletions parallel-orch/partial_program_order.py

Large diffs are not rendered by default.

46 changes: 36 additions & 10 deletions parallel-orch/scheduler_server.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import argparse
import copy
import logging
import signal
from util import *
import config
from partial_program_order import parse_partial_program_order_from_file
import sys
from partial_program_order import parse_partial_program_order_from_file, NodeId, parse_node_id

##
## A scheduler server
Expand Down Expand Up @@ -66,19 +68,43 @@ def handle_init(self, input_cmd: str):
partial_order_file = input_cmd.split(":")[1].rstrip()
logging.debug(f'Scheduler: Received partial_order_file: {partial_order_file}')
self.partial_program_order = parse_partial_program_order_from_file(partial_order_file)
self.partial_program_order.init_workset()
logging.debug(f'Parsed partial program order:')
self.partial_program_order.populate_to_be_resolved_dict([])
logging.debug(f'To be resolved sets per node:')
logging.debug(self.partial_program_order.to_be_resolved)
self.partial_program_order.init_partial_order()

def __parse_wait(self, input_cmd: str):
    """Parse a wait message into a node id and its loop iteration counters.

    The message is expected to have exactly two ``|``-separated components,
    each of the form ``<label>:<value>``:

    * the first value is an integer node id, wrapped in a ``NodeId``;
    * the second value is either the literal string ``None`` (no loop
      counters) or a ``-``-separated list of integers.

    Returns:
        A ``(node_id, loop_counters)`` tuple, where ``loop_counters`` is a
        (possibly empty) list of ints.

    Raises:
        Exception: if the line cannot be parsed; the underlying error is
            attached as ``__cause__``.
    """
    try:
        node_id_component, loop_iter_counter_component = input_cmd.rstrip().split("|")
        node_id = NodeId(int(node_id_component.split(":")[1].rstrip()))
        loop_counters_str = loop_iter_counter_component.split(":")[1].rstrip()
        if loop_counters_str == "None":
            loop_counters = []
        else:
            loop_counters = [int(cnt) for cnt in loop_counters_str.split("-")]
        return node_id, loop_counters
    except Exception as err:
        ## Catch only regular errors (not KeyboardInterrupt/SystemExit, which a
        ## bare `except:` would also swallow) and keep the root cause chained
        ## so that the real parsing problem shows up in the traceback.
        raise Exception(f'Parsing failure for line: {input_cmd}') from err

def handle_wait(self, input_cmd: str, connection):
assert(input_cmd.startswith("Wait"))
## We have received this message by the JIT, which waits for a node_id to
## finish execution.
node_id = int(input_cmd.split(":")[1].rstrip())
logging.debug(f'Scheduler: Received wait for node_id: {node_id}')

raw_node_id, loop_counters = self.__parse_wait(input_cmd)
logging.debug(f'Scheduler: Received wait for node_id: {raw_node_id} with loop counters: {loop_counters}')

if self.partial_program_order.is_loop_node(raw_node_id):
node_id = NodeId(raw_node_id.id, loop_counters)
if not self.partial_program_order.is_node_id(node_id):
## TODO: This unrolling can also happen and be moved to speculation.
## For now we are being conservative and that is why it only happens here
## TODO: Move this to the scheduler.schedule_work() (if we have a loop node waiting for response and we are not unrolled, unroll to create work)
self.partial_program_order.unroll_loop_node(raw_node_id)
else:
## If we are not in a loop, then the node id corresponds to the concrete node
node_id = raw_node_id

## Inform the partial order that we received a wait for a node so that it can push loops
## forward and so on.
self.partial_program_order.wait_received(node_id)

## If the node_id is already committed, just return its exit code
if node_id in self.partial_program_order.get_committed():
logging.debug(f'Node: {node_id} found in committed, responding immediately!')
Expand All @@ -94,7 +120,7 @@ def handle_wait(self, input_cmd: str, connection):
def __parse_command_exec_complete(self, input_cmd: str) -> "tuple[int, int]":
try:
components = input_cmd.rstrip().split("|")
command_id = int(components[0].split(":")[1])
command_id = parse_node_id(components[0].split(":")[1])
exit_code = int(components[1].split(":")[1])
sandbox_dir = components[2].split(":")[1]
return command_id, exit_code, sandbox_dir
Expand Down
5 changes: 5 additions & 0 deletions parallel-orch/template_script_to_execute.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

## TODO: Pass frontier flag here instead of separate scripts

## Clean up the riker directory
## KK 2023-05-04 should this be done somewhere else? Could this interfere with overlay fs?
## TODO: Can we just ask riker to use a different cache (or put the cache to /dev/null)
## since we never really want it to take the cache into account
rm -rf ./.rkr

## Save the script to execute in the sandboxdir
echo $CMD_STRING > ./Rikerfile
Expand Down
3 changes: 3 additions & 0 deletions scripts/install_deps_ubuntu20.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/bash

export PASH_SPEC_TOP=${PASH_SPEC_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)}
export PASH_TOP=${PASH_TOP:-$PASH_SPEC_TOP/deps/pash}

## Install Riker's dependencies
sudo apt-get update
sudo apt install -y make clang llvm git gcc python3-cram file graphviz
Expand Down
10 changes: 10 additions & 0 deletions test/test_orch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ run_test()
test_orch_ec=$?

## Print stderr
## TODO: Fix this to print the stderr continuously by doing the execution checking inside pash-spec
if [ $DEBUG -ge 1 ]; then
cat "$stderr_file" 1>&2
fi
Expand Down Expand Up @@ -295,6 +296,14 @@ test_stdout()
$shell $2/test_stdout.sh
}

## Run the loop test script.
##   $1: the shell command used to execute the script
##   $2: the directory that contains test_loop.sh
## NOTE(review): $shell is expanded unquoted, so a value containing spaces is
## word-split into a command plus arguments -- presumably intentional; confirm
## against the callers before quoting it.
test_loop()
{
local shell=$1
$shell $2/test_loop.sh
}

## TODO: make more loop tests with nested loops and commands after the loop

# We run all tests composed with && to exit on the first that fails
if [ "$#" -eq 0 ]; then
run_test test1_1 # "1 2 2 1"
Expand Down Expand Up @@ -323,6 +332,7 @@ if [ "$#" -eq 0 ]; then
run_test test9_2
run_test test9_3
run_test test_stdout
run_test test_loop
else
for testname in $@
do
Expand Down
13 changes: 13 additions & 0 deletions test/test_scripts/test_loop.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
## Loop test: one command before the loop, a three-iteration loop whose body
## has multiple commands (echo, sleep, echo), and one command after the loop.
echo hi
for i in 1 2 3; do
echo hi1
sleep 1
echo hi2
done
echo hi3

## Future loop tests must include:
## 1. A single loop with a single command without anything else
## 2. Multiple commands in the same loop
## 3. Nested loops
## 4. Commands before and after a loop