# 13.3. Automatic Parallelism


In [None]:
import torch
from d2l import torch as d2l

# NOTE: The following code needs GPUs to run; it indexes two devices
# (devices[0] and devices[1]).



In [None]:
# Devices used by the benchmarks below. The later code indexes devices[0] and
# devices[1], so it presumably expects try_all_gpus() to return at least two
# GPU devices -- TODO confirm against the d2l helper.
devices = d2l.try_all_gpus()
def run(x: torch.Tensor):
    """Multiply ``x`` by itself 1000 times and collect every product.

    Args:
        x: A 2-D tensor that can be matrix-multiplied with itself
            (``x.mm(x)`` is called on it).

    Returns:
        A list of 1000 tensors, each the result of ``x.mm(x)``.
    """
    products = []
    for _ in range(1000):
        products.append(x.mm(x))
    return products

# One 1000x1000 random matrix per device, so each workload stays on its own GPU.
x_gpu1 = torch.rand(size=(1000, 1000), device=devices[0])
x_gpu2 = torch.rand(size=(1000, 1000), device=devices[1])

# Run each workload once before timing anything, then wait for both devices
# to finish so the warm-up work does not leak into the measurements below.
run(x_gpu1)
run(x_gpu2)  # Warm-up all devices
torch.cuda.synchronize(devices[0])
torch.cuda.synchronize(devices[1])

# Baseline: time each device on its own. The synchronize call sits inside the
# timed region so the timer only stops once the device has finished its work.
with d2l.Benchmark('GPU1 time'):
    run(x_gpu1)
    torch.cuda.synchronize(devices[0])

with d2l.Benchmark('GPU2 time'):
    run(x_gpu2)
    torch.cuda.synchronize(devices[1])

# Run both GPUs in parallel
with d2l.Benchmark('GPU1 & GPU2'):
    run(x_gpu1)
    run(x_gpu2)
    # torch.cuda.synchronize() without an argument waits only on the current
    # device, so each GPU is synchronized explicitly; otherwise the timer
    # could stop while the other device is still working.
    torch.cuda.synchronize(devices[0])
    torch.cuda.synchronize(devices[1])

In [None]:
def copy_to_cpu(x, non_blocking=False):
    """Move every tensor in ``x`` to the CPU.

    Args:
        x: An iterable of tensors.
        non_blocking: Forwarded to ``Tensor.to`` for each element.

    Returns:
        A new list holding the result of ``.to('cpu', ...)`` for each element
        of ``x``, in the same order.
    """
    host_copies = []
    for tensor in x:
        host_copies.append(tensor.to('cpu', non_blocking=non_blocking))
    return host_copies

# Time the computation and the device-to-host copy as two separate steps.
with d2l.Benchmark('Run on GPU1'):
    y = run(x_gpu1)
    # Wait on the device that holds x_gpu1 explicitly instead of relying on
    # it being the current CUDA device.
    torch.cuda.synchronize(devices[0])

with d2l.Benchmark('Copy to CPU'):
    y_cpu = copy_to_cpu(y)
    torch.cuda.synchronize(devices[0])

In [None]:
# Compute and copy inside a single timed region, requesting non-blocking
# copies. The expectation is that this is much faster than running the two
# steps separately -- NOTE(review): confirm on the target hardware.
with d2l.Benchmark('Run on GPU1 and copy to CPU'):
    y = run(x_gpu1)
    y_cpu = copy_to_cpu(y, non_blocking=True)  # Non-blocking copy
    # Non-blocking copies may still be in flight when copy_to_cpu returns, so
    # y_cpu should only be read after this synchronize. The device is named
    # explicitly rather than relying on it being the current CUDA device.
    torch.cuda.synchronize(devices[0])