In [None]:
import dace
import numpy as np
from dace.transformation.interstate import GPUTransformSDFG
from dace.frontend.common import op_repository as oprepo


# Examples

In [None]:
# Example: sum 32 values into a single element using 1 block of 1 warp (32 threads).
@dace.program
def myprog(a: dace.float64[32]):
    # Output buffer of shape [1, 1]; the tasklet's output memlet below targets it.
    b = dace.ndarray([1,1], dtype=dace.float64)
    # The outer map has a single iteration (one block); the inner map has 32 iterations.
    for blockIdx_x in dace.map[0:1]:
        for threadIdx_x in dace.map[0:32]:
            with dace.tasklet:
                # Input memlet: each iteration reads its own element of `a`.
                ain << a[threadIdx_x]
                # Output memlet to `b`, annotated with -1.
                # NOTE(review): -1 presumably marks a dynamic (conditional) write; the other
                # kernels in this file spell it b(-1)[idx] -- confirm both forms are equivalent.
                aaout >> b[0](-1)
                aout = ain
                # Shuffle-down tree reduction with a full 0xffffffff mask: offsets 16, 8, 4, 2, 1
                # fold the 32 per-thread values so that threadIdx_x == 0 ends up with the total.
                aout += __shfl_down_sync(0xffffffff, aout, 16)
                aout += __shfl_down_sync(0xffffffff, aout, 8)
                aout += __shfl_down_sync(0xffffffff, aout, 4)
                aout += __shfl_down_sync(0xffffffff, aout, 2)
                aout += __shfl_down_sync(0xffffffff, aout, 1) 
                # Only the first thread stores the accumulated value.
                if threadIdx_x == 0:
                    aaout = aout
                
    return b


In [None]:
# Example: 16x32 -> 16 row sums using 1 block of 16 warps (one 32-wide row per warp).
@dace.program
def myprog(a: dace.float64[16,32]):
    # One output element per input row.
    b = dace.ndarray([16], dtype=dace.float64)
    # Single-iteration outer map; the inner 16x32 map yields (threadIdx_y, threadIdx_x) = (row, column).
    for blockIdx_x in dace.map[0:1]:
        for threadIdx_y, threadIdx_x in dace.map[0:16,0:32]:
            with dace.tasklet:                   
                ain << a[threadIdx_y,threadIdx_x]
                # Output memlet to this row's slot, annotated with -1
                # (presumably DaCe's dynamic-access marker, since only column 0 writes -- confirm).
                aaout >> b(-1)[threadIdx_y]
                
                aout = ain
                # Shuffle-down tree reduction over offsets 16, 8, 4, 2, 1 with a full 0xffffffff mask.
                aout += __shfl_down_sync(0xffffffff, aout, 16)
                aout += __shfl_down_sync(0xffffffff, aout, 8)
                aout += __shfl_down_sync(0xffffffff, aout, 4)
                aout += __shfl_down_sync(0xffffffff, aout, 2)
                aout += __shfl_down_sync(0xffffffff, aout, 1) 
                # Only column 0 stores the accumulated value for its row.
                if threadIdx_x == 0:
                    aaout = aout

    return b

In [None]:
# Example: 32x16 -> 32 row sums using 1 block of 16 warps (32 * 16 = 512 threads).
@dace.program
def myprog(a: dace.float64[32,16]):
    # One output element per input row.
    b = dace.ndarray([32], dtype=dace.float64)
    # Single-iteration outer map; the inner 32x16 map yields (threadIdx_y, threadIdx_x) = (row, column).
    for blockIdx_x in dace.map[0:1]:
        for threadIdx_y, threadIdx_x in dace.map[0:32,0:16]:
            with dace.tasklet:                   
                ain << a[threadIdx_y,threadIdx_x]
                # Output memlet to this row's slot, annotated with -1
                # (presumably DaCe's dynamic-access marker, since only column 0 writes -- confirm).
                aaout >> b(-1)[threadIdx_y]
                
                aout = ain
                # Rows are 16 wide, so the shuffle-down tree starts at offset 8 (then 4, 2, 1).
                # NOTE(review): if threadIdx_x is the fastest-varying lane index, two rows share
                # one 32-lane warp and the upper columns of a row pull values from the next row;
                # that would not reach column 0, which is the only value stored -- confirm mapping.
                aout += __shfl_down_sync(0xffffffff, aout, 8)
                aout += __shfl_down_sync(0xffffffff, aout, 4)
                aout += __shfl_down_sync(0xffffffff, aout, 2)
                aout += __shfl_down_sync(0xffffffff, aout, 1) 
                # Only column 0 stores the accumulated value for its row.
                if threadIdx_x == 0:
                    aaout = aout

    return b

In [None]:
# 512x8 -> 512x1 using 4 blocks of 32 warps (128 rows x 8 columns = 1024 threads per block)
@dace.program
def myprog(a: dace.float64[512,8]):
    # One output element per input row.
    b = dace.ndarray([512], dtype=dace.float64)
    # Each block covers 128 rows x 8 columns = 1024 threads, the most CUDA allows
    # in a single thread block; 4 blocks cover all 512 rows.
    for blockIdx_x in dace.map[0:4]:
        for threadIdx_y, threadIdx_x in dace.map[0:128,0:8]:
            with dace.tasklet:
                # Each thread reads one element; the row index is offset by the block's first row.
                ain << a[blockIdx_x*128+threadIdx_y,threadIdx_x]
                # Conditional write (only column 0 stores), hence the -1 annotation.
                aaout >> b(-1)[blockIdx_x*128+threadIdx_y]

                aout = ain
                # Rows are 8 wide, so the shuffle-down tree starts at offset 4.
                aout += __shfl_down_sync(0xffffffff, aout, 4)
                aout += __shfl_down_sync(0xffffffff, aout, 2)
                aout += __shfl_down_sync(0xffffffff, aout, 1)
                # Only column 0 stores the row total.
                if threadIdx_x == 0:
                    aaout = aout

    return b

# Transform to GPU, keep thread-block map
sdfg = myprog.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, {'sequential_innermaps': False})

# Test: the GPU row sums must match NumPy
a = np.random.rand(512,8)
b = sdfg(a)
assert np.allclose(b, np.sum(a, axis=1))

In [None]:
# Display the array returned by the most recent sdfg call.
b

# Kernels

## Nx16 reduce

In [None]:
# Symbolic input dimensions: H rows, W columns.
H = dace.symbol('H')
W = dace.symbol('W')

# Row-sum kernel for 16-wide rows: every block handles 64 rows x 16 columns.
#   inputs     -- H x W float64 array; the body indexes columns 0..15 and rows up to
#                 gridDim_x*64 - 1 (assumes W == 16 and H == gridDim_x*64 -- not checked here)
#   gridDim_x  -- number of iterations of the outer (block) map
#   blockDim_y -- accepted but never read; the block height is fixed at 64 by the map range
# Returns a length-H float64 array of row sums.
@dace.program
def myprog(inputs: dace.float64[H,W], gridDim_x: dace.int64, blockDim_y: dace.int64):
    b = dace.ndarray([H], dtype=dace.float64)
    for blockIdx_x in dace.map[0:gridDim_x]:
        for threadIdx_y, threadIdx_x in dace.map[0:64,0:16]:
            with dace.tasklet:                   
                # Global row = block offset + row within the block.
                ain << inputs[blockIdx_x*64+threadIdx_y,threadIdx_x]
                # Output memlet annotated with -1 (presumably a dynamic write; only column 0 stores).
                aaout >> b(-1)[blockIdx_x*64+threadIdx_y]
                aout = ain
                # Shuffle-down tree reduction over offsets 8, 4, 2, 1 for a 16-wide row.
                aout += __shfl_down_sync(0xffffffff, aout, 8)
                aout += __shfl_down_sync(0xffffffff, aout, 4)
                aout += __shfl_down_sync(0xffffffff, aout, 2)
                aout += __shfl_down_sync(0xffffffff, aout, 1) 
                # Only column 0 stores the row total.
                if threadIdx_x == 0:
                    aaout = aout

    return b

# Lower the kernel to an SDFG and offload it to the GPU, keeping the inner map as the thread-block map
sdfg = myprog.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, options=dict(sequential_innermaps=False))

# Check: GPU row sums of a random 512x16 input must agree with NumPy
a = np.random.rand(512, 16)
b = sdfg(
    inputs=a,
    H=512,
    W=16,
    gridDim_x=8,
    blockDim_y=64,
)
assert np.allclose(b, a.sum(axis=1))

In [None]:
# Display the array returned by the most recent sdfg call.
b

## Nx8 reduce

In [None]:
# Symbolic input dimensions: H rows, W columns.
H = dace.symbol('H')
W = dace.symbol('W')

# Row-sum kernel for 8-wide rows: every block handles 128 rows x 8 columns.
#   inputs     -- H x W float64 array (expects W == 8 and H == gridDim_x*128; not checked)
#   gridDim_x  -- number of blocks, i.e. iterations of the outer map
#   blockDim_y -- accepted for signature parity with the other kernels but not read;
#                 the block height is fixed at 128 by the map range
# Returns a length-H float64 array of row sums.
@dace.program
def myprog(inputs: dace.float64[H,W], gridDim_x: dace.int64, blockDim_y: dace.int64):
    b = dace.ndarray([H], dtype=dace.float64)
    # The grid size comes from the caller (as in the Nx16/Nx4/Nx2 kernels) rather than
    # being hard-coded. 128 x 8 = 1024 threads keeps the block within CUDA's
    # 1024-threads-per-block limit.
    for blockIdx_x in dace.map[0:gridDim_x]:
        for threadIdx_y, threadIdx_x in dace.map[0:128,0:8]:
            with dace.tasklet:
                # Global row = block offset + row within the block.
                ain << inputs[blockIdx_x*128+threadIdx_y,threadIdx_x]
                # Conditional write (only column 0 stores), hence the -1 annotation.
                aaout >> b(-1)[blockIdx_x*128+threadIdx_y]
                aout = ain
                # Rows are 8 wide, so the shuffle-down tree starts at offset 4.
                aout += __shfl_down_sync(0xffffffff, aout, 4)
                aout += __shfl_down_sync(0xffffffff, aout, 2)
                aout += __shfl_down_sync(0xffffffff, aout, 1)
                # Only column 0 stores the row total.
                if threadIdx_x == 0:
                    aaout = aout

    return b

# Transform to GPU, keep thread-block map
sdfg = myprog.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, {'sequential_innermaps': False})

# Test: 512 rows / 128 rows per block = 4 blocks
a = np.random.rand(512,8)
b = sdfg(H=512,W=8,inputs=a,gridDim_x=4,blockDim_y=128)
assert np.allclose(b, np.sum(a, axis=1))

In [None]:
# Display the array returned by the most recent sdfg call.
b

## Nx4 reduce

In [None]:
# Symbolic input dimensions: H rows, W columns.
H = dace.symbol('H')
W = dace.symbol('W')

# Row-sum kernel for 4-wide rows: every block handles 64 rows x 4 columns.
#   inputs     -- H x W float64 array; the body indexes columns 0..3 and rows up to
#                 gridDim_x*64 - 1 (assumes W == 4 and H == gridDim_x*64 -- not checked here)
#   gridDim_x  -- number of iterations of the outer (block) map
#   blockDim_y -- accepted but never read; the block height is fixed at 64 by the map range
# Returns a length-H float64 array of row sums.
@dace.program
def myprog(inputs: dace.float64[H,W], gridDim_x: dace.int64, blockDim_y: dace.int64):
    b = dace.ndarray([H], dtype=dace.float64)
    for blockIdx_x in dace.map[0:gridDim_x]:
        for threadIdx_y, threadIdx_x in dace.map[0:64,0:4]:
            with dace.tasklet:                   
                # Global row = block offset + row within the block.
                ain << inputs[blockIdx_x*64+threadIdx_y,threadIdx_x]
                # Output memlet annotated with -1 (presumably a dynamic write; only column 0 stores).
                aaout >> b(-1)[blockIdx_x*64+threadIdx_y]
                aout = ain
                # Shuffle-down tree reduction over offsets 2, 1 for a 4-wide row.
                aout += __shfl_down_sync(0xffffffff, aout, 2)
                aout += __shfl_down_sync(0xffffffff, aout, 1) 
                # Only column 0 stores the row total.
                if threadIdx_x == 0:
                    aaout = aout

    return b

# Lower the kernel to an SDFG and offload it to the GPU, keeping the inner map as the thread-block map
sdfg = myprog.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, options=dict(sequential_innermaps=False))

# Check: GPU row sums of a random 256x4 input must agree with NumPy
a = np.random.rand(256, 4)
b = sdfg(
    inputs=a,
    H=256,
    W=4,
    gridDim_x=4,
    blockDim_y=64,
)
assert np.allclose(b, a.sum(axis=1))

## Nx2 reduce

In [None]:
# Symbolic input dimensions: H rows, W columns.
H = dace.symbol('H')
W = dace.symbol('W')

# Row-sum kernel for 2-wide rows: every block handles 128 rows x 2 columns.
#   inputs     -- H x W float64 array; the body indexes columns 0..1 and rows up to
#                 gridDim_x*128 - 1 (assumes W == 2 and H == gridDim_x*128 -- not checked here)
#   gridDim_x  -- number of iterations of the outer (block) map
#   blockDim_y -- accepted but never read; the block height is fixed at 128 by the map range
# Returns a length-H float64 array of row sums.
@dace.program
def myprog(inputs: dace.float64[H,W], gridDim_x: dace.int64, blockDim_y: dace.int64):
    b = dace.ndarray([H], dtype=dace.float64)
    for blockIdx_x in dace.map[0:gridDim_x]:
        for threadIdx_y, threadIdx_x in dace.map[0:128,0:2]:
            with dace.tasklet:                   
                # Global row = block offset + row within the block.
                ain << inputs[blockIdx_x*128+threadIdx_y,threadIdx_x]
                # Output memlet annotated with -1 (presumably a dynamic write; only column 0 stores).
                aaout >> b(-1)[blockIdx_x*128+threadIdx_y]
                aout = ain
                # A single shuffle-down by 1 adds column 1 into column 0.
                aout += __shfl_down_sync(0xffffffff, aout, 1) 
                # Only column 0 stores the row total.
                if threadIdx_x == 0:
                    aaout = aout

    return b

# Lower the kernel to an SDFG and offload it to the GPU, keeping the inner map as the thread-block map
sdfg = myprog.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, options=dict(sequential_innermaps=False))

# Check: GPU row sums of a random 512x2 input must agree with NumPy
a = np.random.rand(512, 2)
b = sdfg(
    inputs=a,
    H=512,
    W=2,
    gridDim_x=4,
    blockDim_y=128,
)
assert np.allclose(b, a.sum(axis=1))

In [None]:
# Re-lower the currently defined `myprog` and move it to the GPU,
# leaving the inner map in place as the thread-block map
sdfg = myprog.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, options=dict(sequential_innermaps=False))



In [None]:
# Test the SDFG built from the Nx2 kernel. That kernel takes symbolic H/W plus the
# launch parameters, so every argument is passed by keyword and the input is
# 2 columns wide; 128 rows fit in a single 128-row block.
a = np.random.rand(128,2)
b = sdfg(H=128,W=2,inputs=a,gridDim_x=1,blockDim_y=128)
assert np.allclose(b, np.sum(a, axis=1))

In [None]:
# Display the array returned by the most recent sdfg call.
b

In [None]:
# NumPy reference: per-row sums of the current input `a`, for comparison with `b`.
np.sum(a, axis=1)

In [None]:

# Symbolic input dimensions: H rows, W columns.
H = dace.symbol('H')
W = dace.symbol('W')

# Row-sum kernel for 8-wide rows with a caller-chosen grid size:
# every block handles 128 rows x 8 columns.
#   inputs     -- H x W float64 array (expects W == 8 and H == gridDim_x*128; not checked)
#   gridDim_x  -- number of blocks, i.e. iterations of the outer map
#   blockDim_y -- accepted for signature parity with the other kernels but not read;
#                 the block height is fixed at 128 by the map range
# Returns a length-H float64 array of row sums.
@dace.program
def myprog(inputs: dace.float64[H,W], gridDim_x: dace.int64, blockDim_y: dace.int64):
    b = dace.ndarray([H], dtype=dace.float64)
    for blockIdx_x in dace.map[0:gridDim_x]:
        # 128 x 8 = 1024 threads: CUDA rejects launches with more than 1024 threads per block.
        for threadIdx_y, threadIdx_x in dace.map[0:128,0:8]:
            with dace.tasklet:
                # Global row = block offset + row within the block.
                ain << inputs[blockIdx_x*128+threadIdx_y,threadIdx_x]
                # Conditional write (only column 0 stores), hence the -1 annotation.
                aaout >> b(-1)[blockIdx_x*128+threadIdx_y]
                aout = ain
                # Rows are 8 wide, so the shuffle-down tree starts at offset 4.
                aout += __shfl_down_sync(0xffffffff, aout, 4)
                aout += __shfl_down_sync(0xffffffff, aout, 2)
                aout += __shfl_down_sync(0xffffffff, aout, 1)
                # Only column 0 stores the row total.
                if threadIdx_x == 0:
                    aaout = aout

    return b

# Transform to GPU, keep thread-block map
sdfg = myprog.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, {'sequential_innermaps': False})

# Test: derive the grid size from the input height (128 rows per block)
hh=512
ww=8
tt=hh//128

a = np.random.rand(hh,ww)
b = sdfg(H=hh,W=ww,inputs=a,gridDim_x=tt,blockDim_y=128)
assert np.allclose(b, np.sum(a, axis=1))

In [None]:
# NumPy reference: per-row sums of the current input `a`, for comparison with `b`.
np.sum(a, axis=1)

In [None]:
# Display the array returned by the most recent sdfg call.
b