In [None]:
# Ray aDAG Developer Guide - Hands-on Walkthrough

## 1. Introduction to Ray aDAGs
# Note: Transition to slides to explain "What is Ray aDAG?" and "Why Use aDAGs?"
# (Discuss performance benefits and specific use cases like LLM inference.)

# Note: this walkthrough requires both torch and ray. Both are already installed in the image
# provided for Ray Summit Training 2024.

In [None]:
# Step 2: Define and Create Actors with Ray Core
import ray

@ray.remote
class EchoActor:
    """Minimal Ray actor whose only method hands its argument straight back."""

    def echo(self, msg):
        """Return ``msg`` unchanged."""
        return msg

# Create two EchoActor instances; `a` and `b` are the handles that the
# following cells chain together.
a, b = (EchoActor.remote() for _ in range(2))

In [None]:
# Send "hello" to actor `a`, then pass the resulting reference on to actor `b`
# and fetch the final value.
msg_ref = b.echo.remote(a.echo.remote("hello"))
print(ray.get(msg_ref))  # Expected output: "hello"

In [None]:
## 3. Using Ray aDAGs for Performance Optimization
# Note: Transition to slides to explain "How Ray Core traditionally executes tasks" 
# and "Challenges with dynamic control flow" (discuss overheads with serialization and object store).

# Step 3: Define and Execute with Ray DAG API (Classic Ray Core)
import ray.dag
import time

In [None]:
# Define a lazy DAG: the input placeholder feeds a.echo, whose output feeds b.echo.
# `dag` is the terminal node that later cells execute and compile.
with ray.dag.InputNode() as inp:
    intermediate_inp = a.echo.bind(inp)
    dag = b.echo.bind(intermediate_inp)

In [None]:
# Run the lazy DAG once per input value and print each result
for word in ("hello", "world"):
    print(ray.get(dag.execute(word)))

In [None]:
# Time five back-to-back executions of the (uncompiled) DAG
for _ in range(5):
    start = time.perf_counter()
    ray.get(dag.execute("hello"))
    elapsed = time.perf_counter() - start
    print("Took", elapsed)

In [None]:
## 4. Optimizing with Ray aDAGs

# Step 4: Compile the DAG with the aDAG backend; the next cell times its execution
# so the result can be compared with the timings of the uncompiled DAG.
adag = dag.experimental_compile()

In [None]:
# Time five back-to-back executions of the compiled aDAG
for _ in range(5):
    start = time.perf_counter()
    ray.get(adag.execute("hello"))
    elapsed = time.perf_counter() - start
    print("Took", elapsed)

In [None]:
# Tear down the compiled DAG before moving on to the next example
adag.teardown()

In [None]:
## 5. [BONUS #1] Multi-Actor Execution in Ray aDAG

# Step 5: Executing Across Multiple Actors with Ray aDAG
# Create N fresh EchoActor instances; each one becomes its own branch of the multi-output DAG
N = 3
actors = [EchoActor.remote() for _ in range(N)]


In [None]:

# Fan the single input out to every actor and gather all results in one output node
with ray.dag.InputNode() as inp:
    outputs = []
    for actor in actors:
        outputs.append(actor.echo.bind(inp))
    dag = ray.dag.MultiOutputNode(outputs)


In [None]:

# Compile the multi-output DAG, run it once, and show the per-actor results
adag = dag.experimental_compile()
results = ray.get(adag.execute("hello"))
print(results)  # Expected: ["hello", "hello", "hello"]


In [None]:

# Tear down the compiled multi-output DAG before moving on to the GPU example
adag.teardown()



In [None]:
## 6. [BONUS #2] GPU-GPU Communication with aDAGs

# Note: Transition to slides to discuss "GPU-GPU communication and NCCL".

# Step 6: GPU to GPU Data Transfer Example
import torch
from ray.experimental.channel.torch_tensor_type import TorchTensorType

@ray.remote(num_gpus=1)
class GPUSender:
    """Ray actor (declared with ``num_gpus=1``) that produces tensors on the CUDA device."""

    def send(self, shape):
        """Return a zero-filled tensor of the given ``shape`` created with ``device="cuda"``."""
        return torch.zeros(shape, device="cuda")
@ray.remote(num_gpus=1)
class GPUReceiver:
    """Ray actor (declared with ``num_gpus=1``) that consumes a tensor and reports its shape."""

    def recv(self, tensor: torch.Tensor):
        """Check that ``tensor`` lives on a CUDA device and return ``tensor.shape``.

        Raises:
            AssertionError: if ``tensor.device.type`` is not ``"cuda"``.
        """
        assert tensor.device.type == "cuda"
        return tensor.shape


In [None]:
# Instantiate one GPU-backed actor for each side of the transfer
sender, receiver = GPUSender.remote(), GPUReceiver.remote()


In [None]:
# Build the sender -> receiver graph, marking the sender's output as a torch tensor,
# then compile it.
# NOTE(review): TorchTensorType() is constructed with default arguments here. If NCCL
# transport is intended (as the section title suggests), confirm whether an explicit
# transport argument is needed for the installed Ray version.
with ray.dag.InputNode() as inp:
    sent = sender.send.bind(inp).with_type_hint(TorchTensorType())
    dag = receiver.recv.bind(sent)
adag = dag.experimental_compile()


In [None]:
# Execute the DAG and check the results.
# recv() returns tensor.shape, which is expected to compare equal to the tuple (10,).
try:
    assert ray.get(adag.execute((10, ))) == (10, )
finally:
    # Tear down even if execution or the assertion fails, so the GPU actors are
    # not left attached to a compiled DAG when the cell errors out.
    adag.teardown()

## 7. Conclusion and Summary
# Note: Transition to slides for summarizing key takeaways and discussing 
# limitations of aDAGs (e.g., actor constraints, NCCL).

In [None]:
# NOTE(review): this cell repeats steps 2-6 that already appear as separate cells
# earlier in the notebook; confirm whether the duplicate is intentional.
# Send "hello" to actor `a`, then pass the resulting reference on to actor `b`
# and fetch the final value.
msg_ref = b.echo.remote(a.echo.remote("hello"))
print(ray.get(msg_ref))  # Expected output: "hello"
## 3. Using Ray aDAGs for Performance Optimization

# Note: Transition to slides to explain "How Ray Core traditionally executes tasks" 
# and "Challenges with dynamic control flow" (discuss overheads with serialization and object store).
# Step 3: Define and Execute with Ray DAG API (Classic Ray Core)
import ray.dag
import time

# Define a lazy DAG: the input placeholder feeds a.echo, whose output feeds b.echo
with ray.dag.InputNode() as inp:
    intermediate_inp = a.echo.bind(inp)
    dag = b.echo.bind(intermediate_inp)
# Execute the DAG with inputs
print(ray.get(dag.execute("hello")))
print(ray.get(dag.execute("world")))
# Time the execution of the uncompiled DAG
for _ in range(5):
    start = time.perf_counter()
    ray.get(dag.execute("hello"))
    print("Took", time.perf_counter() - start)
## 4. Optimizing with Ray aDAGs

# Step 4: Compile and Execute with aDAG Backend
# Compile the DAG for aDAG backend

adag = dag.experimental_compile()
try:
    # Execute the aDAG and measure the time
    for _ in range(5):
        start = time.perf_counter()
        ray.get(adag.execute("hello"))
        print("Took", time.perf_counter() - start)
finally:
    # Tear down the DAG even if an execution fails, so actors `a` and `b` are
    # not left attached to a compiled DAG.
    adag.teardown()

## 5. [BONUS #1] Multi-Actor Execution in Ray aDAG

# Step 5: Executing Across Multiple Actors with Ray aDAG
# Create multiple actors
N = 3
actors = [EchoActor.remote() for _ in range(N)]
# Define the DAG with multiple outputs: the same input is bound to every actor
with ray.dag.InputNode() as inp:
    outputs = [actor.echo.bind(inp) for actor in actors]
    dag = ray.dag.MultiOutputNode(outputs)
# Compile and execute the DAG
adag = dag.experimental_compile()
try:
    print(ray.get(adag.execute("hello")))  # Expected: ["hello", "hello", "hello"]
finally:
    # Tear down the DAG even if execution fails, so the actors are not left
    # attached to a compiled DAG.
    adag.teardown()

## 6. [BONUS #2] GPU-GPU Communication with aDAGs

# Note: Transition to slides to discuss "GPU-GPU communication and NCCL".

# Step 6: GPU to GPU Data Transfer Example
import torch
from ray.experimental.channel.torch_tensor_type import TorchTensorType

# NOTE(review): GPUSender is already defined in an earlier cell; this definition rebinds the name.
@ray.remote(num_gpus=1)
class GPUSender:
    """Ray actor (declared with ``num_gpus=1``) that produces tensors on the CUDA device."""

    def send(self, shape):
        """Return a zero-filled tensor of the given ``shape`` created with ``device="cuda"``."""
        return torch.zeros(shape, device="cuda")
# NOTE(review): GPUReceiver is already defined in an earlier cell; this definition rebinds the name.
@ray.remote(num_gpus=1)
class GPUReceiver:
    """Ray actor (declared with ``num_gpus=1``) that consumes a tensor and reports its shape."""

    def recv(self, tensor: torch.Tensor):
        """Check that ``tensor`` lives on a CUDA device and return ``tensor.shape``.

        Raises:
            AssertionError: if ``tensor.device.type`` is not ``"cuda"``.
        """
        assert tensor.device.type == "cuda"
        return tensor.shape
# Create the sender and receiver actors
sender = GPUSender.remote()
receiver = GPUReceiver.remote()
# Define and compile a DAG for GPU-GPU communication
# NOTE(review): TorchTensorType() is constructed with default arguments here. If NCCL
# transport is intended (as the section title suggests), confirm whether an explicit
# transport argument is needed for the installed Ray version.
with ray.dag.InputNode() as inp:
    dag = sender.send.bind(inp)
    dag = dag.with_type_hint(TorchTensorType())
    dag = receiver.recv.bind(dag)
adag = dag.experimental_compile()
try:
    # Execute the DAG and check the results.
    # recv() returns tensor.shape, which is expected to compare equal to the tuple (10,).
    assert ray.get(adag.execute((10, ))) == (10, )
finally:
    # Tear down even if execution or the assertion fails, so the GPU actors are
    # not left attached to a compiled DAG.
    adag.teardown()

#

In [None]:
# 7. Conclusion and Summary
# Note: Transition to slides for summarizing key takeaways and discussing 
# limitations of aDAGs (e.g., actor constraints, NCCL).