In [1]:
import dace
import numpy as np
from dace.transformation.interstate import GPUTransformSDFG
from dace.frontend.common import op_repository as oprepo


## 4096x2 -> 4096x1 using 4 blocks of 1024 threads with method THREAD REDUCE (each thread sums one row)

In [None]:
# Row-wise sum of a fixed-size 4096x2 float64 array, written as nested DaCe maps.
#
# The two outer maps are named after CUDA's blockIdx.x / threadIdx.x: 4 outer
# iterations times 1024 inner iterations address the 4096 rows, one row per
# (blockIdx_x, threadIdx_x) pair. How the maps are actually scheduled is
# decided by the transformations applied to the SDFG after conversion, not by
# anything inside this function.
#
# Args:
#     inputs: float64 array of shape [4096, 2].
# Returns:
#     float64 array of shape [4096]; element r holds inputs[r, 0] + inputs[r, 1].
@dace.program
def test1(inputs: dace.float64[4096, 2]):
    outputs = dace.ndarray([4096], dtype=dace.float64)
    # Zero-fill first; every element is overwritten below because
    # blockIdx_x*1024 + threadIdx_x spans 0..4095.
    outputs[:] = 0
    for blockIdx_x in dace.map[0:4]:
        for threadIdx_x in dace.map[0:1024]:
            # Per-row accumulator for the row owned by this (block, thread) pair.
            value = dace.float64(0)
            # NOTE(review): `+=` inside a dace.map -- presumably DaCe treats this
            # as a reduction over i rather than a plain sequential loop; confirm.
            for i in dace.map[0:2]:
                value += inputs[blockIdx_x*1024+threadIdx_x,i]
            
            outputs[blockIdx_x*1024+threadIdx_x] = value
            
    return outputs

# Transform to GPU, keep thread-block map (inner maps are not made sequential)
sdfg = test1.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, {'sequential_innermaps': False})

# Test against NumPy's row-wise sum. The generator is seeded so that a failing
# run can be reproduced with exactly the same input.
rng = np.random.default_rng(0)
a = rng.random((4096, 2))
b = sdfg(a)
c = np.sum(a, axis=1)
# Same tolerances as np.allclose's defaults, but a failure reports which
# elements differ and by how much instead of a bare AssertionError.
np.testing.assert_allclose(b, c, rtol=1e-5, atol=1e-8)

In [72]:
# Symbolic sizes used in the shape annotation of thread_reduce's `inputs`
# (H rows, W columns); their concrete values are supplied when the SDFG is called.
H = dace.symbol('H')
W = dace.symbol('W')

# Row-wise sum of an H x W float64 array with a caller-chosen launch shape.
#
# The outer maps are named after CUDA's blockIdx.x / threadIdx.x. Each
# (blockIdx_x, threadIdx_x) pair handles row blockIdx_x*blockDim_x + threadIdx_x
# and sums that row's W entries.
#
# Only rows in [0, gridDim_x*blockDim_x) are computed: rows at or beyond that
# product keep the initial 0, and nothing in this function checks that the
# product stays within H, so callers are expected to pass
# gridDim_x*blockDim_x == H.
#
# Args:
#     inputs: float64 array of shape [H, W].
#     gridDim_x: extent of the outer (block) map.
#     blockDim_x: extent of the inner (thread) map.
# Returns:
#     float64 array of shape [H] with the per-row sums.
@dace.program
def thread_reduce(inputs: dace.float64[H, W], gridDim_x: dace.int64, blockDim_x: dace.int64):
    outputs = dace.ndarray([H], dtype=dace.float64)
    # Zero-fill so rows not reached by any (block, thread) pair read as 0.
    outputs[:] = 0
    for blockIdx_x in dace.map[0:gridDim_x]:
        # NOTE(review): plain alias of blockDim_x used only for the row index
        # below; the reason for not using blockDim_x directly is not visible
        # here -- possibly a frontend workaround, confirm before removing.
        newDim = blockDim_x
        for threadIdx_x in dace.map[0:blockDim_x]:
            # Per-row accumulator for the row owned by this (block, thread) pair.
            value = dace.float64(0)
            # NOTE(review): `+=` inside a dace.map -- presumably DaCe treats this
            # as a reduction over i rather than a plain sequential loop; confirm.
            for i in dace.map[0:W]:
                value += inputs[blockIdx_x*newDim+threadIdx_x,i]
            
            outputs[blockIdx_x*newDim+threadIdx_x] = value
            
    return outputs

# Transform to GPU, keep thread-block map (inner maps are not made sequential)
sdfg = thread_reduce.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, {'sequential_innermaps': False})


# Test configuration. Lower-case names are used on purpose so the DaCe
# symbols H and W defined above are not shadowed.
n_rows, n_cols = 4096, 2
grid_dim, block_dim = 8, 512
# thread_reduce only fills rows [0, grid_dim * block_dim) and does no bounds
# checking, so the launch shape has to cover the rows exactly.
assert grid_dim * block_dim == n_rows, "grid_dim * block_dim must equal the number of rows"

# Seeded generator so that a failing run can be reproduced with the same input.
rng = np.random.default_rng(0)
a = rng.random((n_rows, n_cols))
b = sdfg(H=n_rows, W=n_cols, inputs=a, gridDim_x=grid_dim, blockDim_x=block_dim)
c = np.sum(a, axis=1)
# Same tolerances as np.allclose's defaults, but a failure reports which
# elements differ and by how much instead of a bare AssertionError.
np.testing.assert_allclose(b, c, rtol=1e-5, atol=1e-8)