# Multiple Warps per Row

In [104]:
import dace
import numpy as np
from dace.transformation.interstate import GPUTransformSDFG, StateFusion
from dace.frontend.common import op_repository as oprepo

In [105]:
@oprepo.replaces('warpReduce_sum')
def warpReduce_sum(pv, sdfg: dace.SDFG, state: dace.SDFGState, x: str) -> str:
    """Insert a C++ tasklet that calls ``dace::warpReduce<Sum, T>::reduce`` on ``x``.

    Registered through ``oprepo.replaces`` under the name ``warpReduce_sum``.

    :param pv: Not used by this replacement.
    :param sdfg: SDFG that owns ``x``; a new temp transient is added to it.
    :param state: State that receives the tasklet, both access nodes and the edges.
    :param x: Name of the input data container in ``sdfg.arrays``.
    :return: Name of the transient that receives the tasklet's ``__out`` connector.
    """
    src_desc = sdfg.arrays[x]
    # The result container copies shape, dtype and storage from the input descriptor.
    result_name, _ = sdfg.add_temp_transient(src_desc.shape, src_desc.dtype, src_desc.storage)
    element_ctype = src_desc.dtype.ctype

    # The literal's internal whitespace is deliberately left untouched.
    cpp_body = f'''
       __out = dace::warpReduce<dace::ReductionType::Sum, {element_ctype}>::reduce(__a);
   '''
    reduce_tasklet = state.add_tasklet('warpReduce', {'__a'}, {'__out'}, cpp_body, dace.Language.CPP)

    # Wire: x -> tasklet.__a and tasklet.__out -> result transient.
    source_node = state.add_read(x)
    sink_node = state.add_write(result_name)
    state.add_edge(source_node, None, reduce_tasklet, '__a', dace.Memlet(data=x))
    state.add_edge(reduce_tasklet, '__out', sink_node, None, dace.Memlet(data=result_name))
    return result_name

### Define the symbolic array sizes

In [106]:
# Symbolic extents used in the `dace.float64[H, W]` annotations of the programs in this file:
# H is the first (row) extent, W the second (column) extent.
H, W = dace.symbol('H'), dace.symbol('W')

### 120x640 -> 120x1 using 120 blocks, 20 warps each block, one block for one row 

In [107]:
# Row-wise sum of `inputs` ([H, W]) into a length-H vector.
# The outer map runs once per row; the inner map runs `wn` x 32 iterations per row, each
# reading column `warp * 32 + lane`. Callers are expected to pass wn == W / 32 so that
# every column of the row is visited exactly once.
@dace.program
def AB_MWPR_1(inputs: dace.float64[H, W], wn: dace.int64):
    outputs = dace.ndarray([H], dtype=dace.float64)
    outputs[:] = 0
    for blk in dace.map[0:H]:
        for warp, lane in dace.map[0:wn, 0:32]:
            column = warp * 32 + lane
            warp_total = warpReduce_sum(inputs[blk, column])
            # Only lane 0 of each group of 32 contributes its reduced value to the row sum.
            if lane == 0:
                outputs[blk] += warp_total
    return outputs

In [108]:
# Lower AB_MWPR_1 to an SDFG and apply the GPU transformation without
# sequentializing the inner maps.
sdfg1 = AB_MWPR_1.to_sdfg()
sdfg1.apply_transformations(GPUTransformSDFG, dict(sequential_innermaps=False))

# 120 x 640 random input; each row is split into groups of 32 columns.
h, w = 120, 640
warp_num = w // 32
a1 = np.random.rand(h, w)

# Compare the SDFG result against the NumPy row sums.
b1 = sdfg1(H=h, W=w, inputs=a1, wn=warp_num)
c1 = a1.sum(axis=1)
assert np.allclose(b1, c1)

In [109]:
# Row-wise sum of `inputs` ([H, W]) into a length-H vector: one outer-map iteration per
# row, `blockDim_y` x 32 inner iterations per row, each reading column `warp * 32 + lane`.
#
# Disabled extension kept for reference (not active code): let `num_blocks_per_row`
# blocks share one row by mapping over [0:num_blocks_per_row, 0:H] as
# (blockIdx_y, blockIdx_x). Each thread would start at
#     row_id = blockIdx_x
#     col_id = 32*blockDim_y*blockIdx_y + 32*warp_id + thread_id
# and, for `loopNum` steps, add inputs[row_id, col_id] to a float64 accumulator when
# col_id < W, advancing col_id by
#     delta = 32*blockDim_y*num_blocks_per_row
# before the warp reduction. That variant needs the extra parameters
# `num_blocks_per_row` and `loopNum`.
@dace.program
def AtomicReduceToGlobalMem(inputs: dace.float64[H, W], blockDim_y: dace.int64):
    outputs = dace.ndarray([H], dtype=dace.float64)
    outputs[:] = 0
    for row_id in dace.map[0:H]:
        for warp, lane in dace.map[0:blockDim_y, 0:32]:
            column = warp * 32 + lane
            warp_total = warpReduce_sum(inputs[row_id, column])
            # Only lane 0 of each group of 32 adds the reduced value to the row sum.
            if lane == 0:
                outputs[row_id] += warp_total
    return outputs

In [110]:
# Lower AtomicReduceToGlobalMem to an SDFG and apply the GPU transformation without
# sequentializing the inner maps.
sdfg = AtomicReduceToGlobalMem.to_sdfg()
sdfg.apply_transformations(GPUTransformSDFG, dict(sequential_innermaps=False))

row, col = 12, 640
# Not passed to the SDFG call below; kept for the disabled multi-block-per-row variant.
num_blocks_per_row, loopNum = 1, 1
WarpNum = 20

test_input = np.random.rand(row, col)
test_output = sdfg(H=row, W=col, inputs=test_input, blockDim_y=WarpNum)

# Compare against the NumPy row sums.
expected_output = test_input.sum(axis=1)
assert np.allclose(test_output, expected_output)

### 40x640 -> 40x1 using 25 blocks, 32 warps per block, 20 warps per row

In [111]:
# Row-wise sum of `inputs` ([H, W]) into a length-H vector, with the matrix treated as a
# flat sequence that is cut into chunks of 32 * 32 = 1024 elements, one chunk per
# outer-map iteration.
#
# The row length used to be hard-coded as 640; it is now taken from the symbol W so the
# indexing always agrees with the declared shape of `inputs`.
#
# Requirements on the caller:
#   * bn * 1024 == H * W, otherwise elements are skipped or rows >= H are indexed;
#   * W is a multiple of 32, so that the 32 lanes reduced together lie in one row.
@dace.program
def AB_MWPR_2(inputs: dace.float64[H, W], bn: dace.int64):
    outputs = dace.ndarray([H], dtype=dace.float64)
    outputs[:] = 0
    for block_id in dace.map[0:bn]:
        for warp_id, thread_id in dace.map[0:32, 0:32]:
            # Flat element index of this iteration.
            index = block_id * 1024 + warp_id * 32 + thread_id
            # Split the flat index into (row, column) using the actual row length W.
            index_x = dace.int64(index / W)
            index_y = index % W
            value = warpReduce_sum(inputs[index_x, index_y])
            # Only lane 0 of each group of 32 adds the reduced value to the row sum.
            if thread_id == 0:
                outputs[index_x] += value
    return outputs

In [112]:
# Lower AB_MWPR_2 to an SDFG and apply the GPU transformation without
# sequentializing the inner maps.
sdfg2 = AB_MWPR_2.to_sdfg()
sdfg2.apply_transformations(GPUTransformSDFG, {'sequential_innermaps': False})

h, w = 120, 640
# Each outer-map iteration of AB_MWPR_2 covers 32 * 32 = 1024 consecutive elements
# (index = block_id * 1024 + warp_id * 32 + thread_id), so exactly h * w / 1024 blocks
# are needed. Dividing by 512 launched twice as many blocks; the extra ones computed row
# indices >= h and accessed `inputs` and `outputs` out of bounds.
if (h * w) % 1024 != 0:
    raise ValueError("h * w must be a multiple of 1024 (32 warps x 32 threads per block)")
block_num = h * w // 1024

a2 = np.random.rand(h, w)
b2 = sdfg2(H = h, W = w, inputs = a2, bn = block_num)
# Compare against the NumPy row sums.
c2 = np.sum(a2, axis=1)
assert np.allclose(b2, c2)

### 120x640 -> 120x1 using 15 blocks, 32 warps each block, one block for 8 rows

In [113]:
# Row-wise sum of `inputs` ([H, W]) into a length-H vector where each outer-map iteration
# handles `rpb` rows, with `wpr` groups of 32 lanes per row.
#
# Phase 1: every (row-in-block, warp-in-row, lane) iteration accumulates `ln` strided
#          elements of its row and stores the partial sum in the 32 x 32 `shared` buffer.
# Phase 2: each group of 32 lanes reduces its 32 partial sums and lane 0 adds the result
#          to the output row.
#
# Requirements on the caller:
#   * bn * rpb == H and ln * wpr * 32 == W, so every element is visited exactly once;
#   * rpb * wpr <= 32, because `shared` has a fixed first extent of 32.
#
# The warp sum is kept in a per-iteration value instead of a shared `reduced[warp_id_x]`
# slot: all `wpr` warps of a row wrote different sums to that one slot and read it back
# without synchronization, so a warp could add another warp's sum to the output.
@dace.program
def AB_MWPR_3(inputs: dace.float64[H, W], bn: dace.int64, ln: dace.int64, rpb: dace.int64, wpr: dace.int64):
    outputs = dace.ndarray([H], dtype=dace.float64)
    outputs[:] = 0
    for block_id in dace.map[0:bn]:
        _rpb = dace.int32(rpb)
        _wpr = dace.int32(wpr)
        shared = dace.ndarray([32,32], dtype=dace.float64, storage=dace.StorageType.GPU_Shared)
        for warp_id_x, warp_id_y, thread_id in dace.map[0:rpb, 0:wpr, 0:32]:
            value = dace.float64(0)
            for i in dace.map[0:ln]:
                value += inputs[block_id * _rpb + warp_id_x 
                    ,i * _wpr * 32 + warp_id_y * 32 + thread_id]
            shared[warp_id_x*wpr+warp_id_y, thread_id] = value
        for warp_id_x, warp_id_y, thread_id in dace.map[0:rpb, 0:wpr, 0:32]:
            warp_total = warpReduce_sum(shared[warp_id_x * wpr + warp_id_y, thread_id])
            # Only lane 0 of each group of 32 adds the reduced value to the row sum.
            if thread_id == 0:
                outputs[block_id * rpb + warp_id_x] += warp_total
    return outputs

In [114]:
# Lower AB_MWPR_3 to an SDFG and apply the GPU transformation without
# sequentializing the inner maps.
sdfg3 = AB_MWPR_3.to_sdfg()
sdfg3.apply_transformations(GPUTransformSDFG, dict(sequential_innermaps=False))

h, w = 120, 640
row_per_block = 8
# 32 warps per block are split evenly over the rows handled by one block.
warp_per_row = 32 // row_per_block
block_num = h // row_per_block
# Elements handled by one block (w * row_per_block) divided by 1024 threads.
loop_num = w * row_per_block // 1024

a3 = np.random.rand(h, w)
b3 = sdfg3(H=h, W=w, inputs=a3, bn=block_num, ln=loop_num, rpb=row_per_block, wpr=warp_per_row)

# Compare against the NumPy row sums.
c3 = a3.sum(axis=1)
assert np.allclose(b3, c3)