In [1]:
import dace
import numpy as np
from dace.transformation.interstate import GPUTransformSDFG
from dace.frontend.common import op_repository as oprepo

@oprepo.replaces('warpReduce_sum')
def warpReduce_sum(pv, sdfg: dace.SDFG, state: dace.SDFGState, x: str) -> str:
    """Frontend replacement for calls to ``warpReduce_sum(x)``.

    Adds a C++ tasklet to ``state`` that passes the data container ``x`` to
    ``dace::warpReduce<dace::ReductionType::Sum, T>::reduce`` and writes the
    result to a newly created transient.

    :param pv: First argument supplied to the replacement; not used here.
    :param sdfg: SDFG that owns ``x`` and receives the new transient.
    :param state: State in which the tasklet and access nodes are created.
    :param x: Name of the input data container in ``sdfg.arrays``.
    :return: Name of the transient that holds the tasklet's output.
    """
    in_desc = sdfg.arrays[x]
    # The result container mirrors the input's shape, dtype and storage.
    result_name, _ = sdfg.add_temp_transient(in_desc.shape, in_desc.dtype, in_desc.storage)
    elem_ctype = in_desc.dtype.ctype

    # C++ body of the tasklet; the element type is baked into the template arguments.
    reduce_code = (f'\n       __out = dace::warpReduce<dace::ReductionType::Sum, {elem_ctype}>::reduce(__a);'
                   '\n   ')
    tasklet = state.add_tasklet('warpReduce', {'__a'}, {'__out'}, reduce_code, dace.Language.CPP)

    # Wire: read(x) --__a--> tasklet --__out--> write(result_name)
    src = state.add_read(x)
    dst = state.add_write(result_name)
    state.add_edge(src, None, tasklet, '__a', dace.Memlet(data=x))
    state.add_edge(tasklet, '__out', dst, None, dace.Memlet(data=result_name))

    return result_name

## 128x128 -> 1x128 using 8 blocks with method WARP READ WARP REDUCE

In [2]:
# Column-wise sum of a 128x128 matrix into a 128-element vector.
# The loop variables are named after CUDA indices: the outer map spans a 2x4
# range (8 iterations, "blocks") and each inner map spans 32x32 ("threads").
@dace.program
def reduce_test1(inputs: dace.float64[128, 128]):
    outputs = dace.ndarray([128], dtype=dace.float64)
    outputs[:] = 0
    for blockIdx_y, blockIdx_x in dace.map[0:2, 0:4]:
        # Per-block 32x32 scratch tile, explicitly placed in GPU shared memory.
        shared = dace.ndarray([32,32], dtype=dace.float64, storage=dace.StorageType.GPU_Shared)
        for threadIdx_y, threadIdx_x in dace.map[0:32,0:32]:
            value = dace.float64(0)
            # Each thread adds two input rows that lie 64 apart (i = 0, 1) in
            # column 32*blockIdx_x + threadIdx_x.
            for i in dace.map[0:2]:
                value += inputs[64*i+32*blockIdx_y+threadIdx_y, 32*blockIdx_x+threadIdx_x]
            # Stored transposed ([x, y]): the map below reads it back as [y, x].
            shared[threadIdx_x,threadIdx_y] = value
        for threadIdx_y, threadIdx_x in dace.map[0:32,0:32]:
            # After the transposed read, threads with the same threadIdx_y hold
            # the 32 partial sums of input column 32*blockIdx_x + threadIdx_y.
            # NOTE(review): presumably those 32 threads form one warp — confirm.
            reduced = warpReduce_sum(shared[threadIdx_y,threadIdx_x])
            # Only the thread with threadIdx_x == 0 accumulates the result.
            if threadIdx_x==0:
                # The index does not depend on blockIdx_y, so both blockIdx_y
                # values accumulate into the same output element.
                outputs[32*blockIdx_x+threadIdx_y] += reduced
    return outputs

# Transform to GPU, keep thread-block map
sdfg = reduce_test1.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, {'sequential_innermaps': False})

# Test against NumPy's column-wise sum.
# A fixed seed keeps the input, and therefore any failure, reproducible when
# the notebook is re-run from a fresh kernel.
a = np.random.default_rng(0).random((128, 128))
b = sdfg(a)
c = np.sum(a, axis=0)
# Same tolerances as np.allclose's defaults, but a failure reports which
# elements differ and by how much instead of a bare AssertionError.
np.testing.assert_allclose(b, c, rtol=1e-5, atol=1e-8)

## 128x128 -> 1x128 using 8 blocks with method THREAD REDUCE

In [3]:
# Column-wise sum of a 128x128 matrix into a 128-element vector.
# NOTE(review): this body is line-for-line identical to reduce_test1 and still
# calls warpReduce_sum, while the section heading says "THREAD REDUCE" —
# confirm whether a different reduction scheme was intended here.
@dace.program
def reduce_test2(inputs: dace.float64[128, 128]):
    outputs = dace.ndarray([128], dtype=dace.float64)
    outputs[:] = 0
    for blockIdx_y, blockIdx_x in dace.map[0:2, 0:4]:
        # Per-block 32x32 scratch tile, explicitly placed in GPU shared memory.
        shared = dace.ndarray([32,32], dtype=dace.float64, storage=dace.StorageType.GPU_Shared)
        for threadIdx_y, threadIdx_x in dace.map[0:32,0:32]:
            value = dace.float64(0)
            # Each thread adds two input rows that lie 64 apart (i = 0, 1) in
            # column 32*blockIdx_x + threadIdx_x.
            for i in dace.map[0:2]:
                value += inputs[64*i+32*blockIdx_y+threadIdx_y, 32*blockIdx_x+threadIdx_x]
            # Stored transposed ([x, y]): the map below reads it back as [y, x].
            shared[threadIdx_x,threadIdx_y] = value
        for threadIdx_y, threadIdx_x in dace.map[0:32,0:32]:
            # After the transposed read, threads with the same threadIdx_y hold
            # the 32 partial sums of input column 32*blockIdx_x + threadIdx_y.
            reduced = warpReduce_sum(shared[threadIdx_y,threadIdx_x])
            # Only the thread with threadIdx_x == 0 accumulates the result.
            if threadIdx_x==0:
                # The index does not depend on blockIdx_y, so both blockIdx_y
                # values accumulate into the same output element.
                outputs[32*blockIdx_x+threadIdx_y] += reduced
    return outputs

# Transform to GPU, keep thread-block map
sdfg = reduce_test2.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, {'sequential_innermaps': False})

# Test against NumPy's column-wise sum.
# A fixed seed keeps the input, and therefore any failure, reproducible when
# the notebook is re-run from a fresh kernel.
a = np.random.default_rng(0).random((128, 128))
b = sdfg(a)
c = np.sum(a, axis=0)
# Same tolerances as np.allclose's defaults, but a failure reports which
# elements differ and by how much instead of a bare AssertionError.
np.testing.assert_allclose(b, c, rtol=1e-5, atol=1e-8)

## 8x4096 -> 1x4096 using 8 blocks with Multiple Blocks per Column (use atomic add)

In [4]:
# Column-wise sum of an 8x4096 matrix into a 4096-element vector.
# The loop variables are named after CUDA indices: the outer map spans a 2x4
# range (8 iterations, "blocks") and the inner map spans 32x32 ("threads").
@dace.program
def reduce_test3(inputs: dace.float64[8, 4096]):
    outputs = dace.ndarray([4096], dtype=dace.float64)
    outputs[:] = 0
    for blockIdx_y, blockIdx_x in dace.map[0:2, 0:4]:
        for threadIdx_y, threadIdx_x in dace.map[0:32,0:32]:
            value = dace.float64(0)
            # Each thread owns one column (1024*blockIdx_x + 32*threadIdx_y +
            # threadIdx_x) and adds the four rows 2*i + blockIdx_y, i.e. the
            # rows whose parity equals blockIdx_y.
            for i in dace.map[0:4]:
                value += inputs[2*i+blockIdx_y, 1024*blockIdx_x+32*threadIdx_y+threadIdx_x]
            # The output index does not depend on blockIdx_y, so both
            # blockIdx_y values accumulate into the same element.
            outputs[1024*blockIdx_x+32*threadIdx_y+threadIdx_x] += value
    return outputs

# Transform to GPU, keep thread-block map
sdfg = reduce_test3.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, {'sequential_innermaps': False})

# Test against NumPy's column-wise sum.
# A fixed seed keeps the input, and therefore any failure, reproducible when
# the notebook is re-run from a fresh kernel.
a = np.random.default_rng(0).random((8, 4096))
b = sdfg(a)
c = np.sum(a, axis=0)
# Same tolerances as np.allclose's defaults, but a failure reports which
# elements differ and by how much instead of a bare AssertionError.
np.testing.assert_allclose(b, c, rtol=1e-5, atol=1e-8)