From 7b231823e4c6c472409cc77e5dbdb4281a938173 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 12 Apr 2018 14:55:55 -0700 Subject: [PATCH 01/19] intel gpu conv schedule added --- topi/python/topi/intel_gpu/__init__.py | 5 + topi/python/topi/intel_gpu/conv2d.py | 294 +++++++++++++++++++++++++ 2 files changed, 299 insertions(+) create mode 100644 topi/python/topi/intel_gpu/__init__.py create mode 100644 topi/python/topi/intel_gpu/conv2d.py diff --git a/topi/python/topi/intel_gpu/__init__.py b/topi/python/topi/intel_gpu/__init__.py new file mode 100644 index 000000000000..336b1508f977 --- /dev/null +++ b/topi/python/topi/intel_gpu/__init__.py @@ -0,0 +1,5 @@ +# pylint: disable=redefined-builtin, wildcard-import +"""Intel Gen9 GPU specific declaration and schedules.""" +from __future__ import absolute_import as _abs + +from .conv2d import * diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py new file mode 100644 index 000000000000..7380aeabfe81 --- /dev/null +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -0,0 +1,294 @@ +# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return +"""conv2d schedule on Intel GPU""" + +from __future__ import absolute_import as _abs + +import numpy as np +import tvm + +from .. import generic +from .. import util +from .. import tag +from ..nn import pad +from ..nn.conv2d import conv2d +from ..nn.util import get_pad_tuple +from ..util import simplify + + +##### SCHEDULE UTILITIES ##### +def fuse_and_bind(s, tensor, axis=None, num_thread=None): + """ fuse all the axis and bind to GPU threads """ + axis = axis or s[tensor].op.axis + fused = s[tensor].fuse(*axis) + max_threads = tvm.target.current_target(allow_none=False).max_num_threads + bx, tx = s[tensor].split(fused, num_thread or max_threads) + s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) + s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) + return bx, tx + +def split_and_bind(s, tensor, x, x_factor=1): + bx, tx = s[tensor].split(x, factor = x_factor) + s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) + s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) + return bx, tx + +def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None): + """ tile and bind to GPU threads """ + x_factor = x_factor or y_factor + yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor) + s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) + s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) + s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) + return yo, xo, yi, xi + +def cache_tile_and_bind3d(s, tensor, z, y, x, z_factor = 2, y_factor=None, x_factor=None): + """ tile and bind cache to GPU threads""" + x_factor = x_factor or z_factor + y_factor = y_factor or z_factor + zo, zi = s[tensor].split(z, z_factor) + yo, yi = s[tensor].split(y, y_factor) + xo, xi = s[tensor].split(x, x_factor) + s[tensor].reorder(zo, yo, xo, zi, yi, xi) + s[tensor].bind(zi, tvm.thread_axis("threadIdx.z")) + s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) + s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + return zo, yo, xo, zi, yi, xi + +def cache_tile_and_bind(s, tensor, y, x, y_factor=2, x_factor=None): + """ tile and bind cache to GPU threads""" + x_factor = x_factor or y_factor + yo, yi = s[tensor].split(y, y_factor) + xo, xi = s[tensor].split(x, x_factor) + s[tensor].reorder(yo, xo, yi, xi) + s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) + s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + return yo, xo, yi, xi + +def 
cache_split_and_bind(s, tensor, x, x_factor=1): + xo, xi = s[tensor].split(x, x_factor) + s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + return xo + +def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None): + """ tile and bind 3d """ + y_factor = y_factor or z_factor + x_factor = x_factor or y_factor + zo, zi = s[tensor].split(z, z_factor) + yo, yi = s[tensor].split(y, y_factor) + xo, xi = s[tensor].split(x, x_factor) + s[tensor].reorder(zo, yo, xo ,zi, yi, xi) + + thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") + thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") + thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") + s[tensor].bind(zo, tvm.thread_axis("blockIdx.z")) + s[tensor].bind(zi, thread_z) + s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) + s[tensor].bind(yi, thread_y) + s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) + s[tensor].bind(xi, thread_x) + return xi, thread_z, thread_y, thread_x + +@conv2d.register(["intel_gpu"]) +def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): + """Conv2D operator for Intel GPU backend. + + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + kernel : tvm.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + + layout : str + layout of data + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + assert layout == 'NCHW', "only support NCHW convolution on intel gpu" + assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" + assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." + + out_dtype = data.dtype + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + kernel_shape = util.get_const_tuple(kernel.shape) + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + + return _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype) + +@generic.schedule_conv2d_nchw.register(["intel_gpu"]) +def schedule_conv2d_nchw(outs): + """Schedule for conv2d_nchw for Intel GPU + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of conv2d_nchw + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for conv2d_nchw. 
+ """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + """inline all one-to-one-mapping operators except the last stage (output)""" + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag or "1_16" in op.tag: + _schedule_cl_spatialpack(s,op) + + traverse(outs[0].op) + return s + +def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'): + batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] + num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] + pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + stride_h, stride_w = stride + else: + stride_h, stride_w = stride, stride + + out_channel = num_filter + out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) + # pad 3 and 2 in order to achieve 1, 3, 229, 229 + pad_before = [0, 0, pad_top, pad_left] + pad_after = [0, 0, pad_down, pad_right] + temp = pad(data, pad_before, pad_after, name="pad_temp") + + rc = tvm.reduce_axis((0, in_channel), name='rc') + ry = tvm.reduce_axis((0, kernel_h), name='ry') + rx = tvm.reduce_axis((0, kernel_w), name='rx') + + if stride_h == 2: + if num_filter + kernel_h == 515: + conv_tag = "4_4" + else: + conv_tag = "4_5" + elif kernel_h == 3: + if num_filter == 512: + conv_tag = "2_7" + else: + conv_tag = "2_14" + else: + conv_tag = "1_16" + + return tvm.compute( + (batch, out_channel, out_height, out_width), + lambda nn, ff, yy, xx: tvm.sum( + temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * + kernel[ff, rc, ry, rx].astype(out_dtype), + axis=[rc, ry, rx]), tag=conv_tag) + +def _schedule_cl_spatialpack(s, op): + output = op.output(0) + temp = op.input_tensors[0] + kernel = op.input_tensors[1] +# temp_S = s.cache_read(temp, "shared", [output]) + temp_W = s.cache_read(temp, "warp", [output]) + + if output.op in s.outputs: + out = output + out_L = s.cache_write(out, "local") + else: + s[output].compute_inline() + out = s.outputs[0] + s[output].set_scope("local") + out_L = output + kernel_L = s.cache_read(kernel, "local", [out_L]) + _, _, out_height, out_width = [util.get_const_int(x) for x in out_L.shape] + _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] + if "1_16" in op.tag: + OUTPUT_BLOCK_HEIGHT = 1 + OUTPUT_BLOCK_WIDTH = 16 + num_threads_y = 1 + num_threads_x = 64 + elif "2_14" in op.tag: + OUTPUT_BLOCK_HEIGHT = 2 + OUTPUT_BLOCK_WIDTH = 14 + num_threads_y = 1 + num_threads_x = temp_h + elif "2_7" in op.tag: + OUTPUT_BLOCK_HEIGHT = 2 + OUTPUT_BLOCK_WIDTH = 7 + num_threads_y = 10 + num_threads_x = 9 + elif "4_5" in op.tag: + OUTPUT_BLOCK_HEIGHT = 4 + OUTPUT_BLOCK_WIDTH = 5 + num_threads_y = 1 + num_threads_x = 235 + elif "4_4" in op.tag: + OUTPUT_BLOCK_HEIGHT = 4 + OUTPUT_BLOCK_WIDTH = 4 + num_threads_y = 1 + num_threads_x = 17 + + PREFETCH = 4 + SUBGROUP_SIZE = 16 + IN_BLOCK_ARRAY_SIZE = 13 + STRIDE_SIZE_Y = out_height // OUTPUT_BLOCK_HEIGHT + STRIDE_SIZE_X = out_width // OUTPUT_BLOCK_WIDTH + + _, co, oh, ow = s[out].op.axis + ooh, ioh = s[out].split(oh, factor = OUTPUT_BLOCK_HEIGHT) + oow, iow = s[out].split(ow, factor = 
OUTPUT_BLOCK_WIDTH) + s[out].reorder(_, co, ooh, oow, ioh, iow) + + tx, thread_z, thread_y, thread_x = tile_and_bind3d(s, out, oow, ooh, co, 1, 1, 16) + + s[out_L].compute_at(s[out], tx) + + i, oc, h, w = s[out_L].op.axis + + oh, ih = s[out_L].split(h, factor = OUTPUT_BLOCK_HEIGHT) + ow, iw = s[out_L].split(w, factor = OUTPUT_BLOCK_WIDTH) + rc, ry, rx = s[out_L].op.reduce_axis + s[out_L].reorder(i, oc, oh, ow, rc, ry, rx, ih, iw) +# s[out_L].unroll(ry) +# s[out_L].unroll(rx) + + # data load from global to padded data global + _, ci, h, w = s[temp].op.axis + # prameter should be changed for each workload + tile_and_bind(s, temp, h, w, num_threads_y, num_threads_x) + +# s[temp_S].compute_at(s[out_L], rc) + s[temp_W].compute_at(s[out_L], rc) +# _, ci, h, w = s[temp_S].op.axis + _, ci, h, w = s[temp_W].op.axis + + zo, zi = s[temp_W].split(ci, 1) + yo, yi = s[temp_W].split(h, 1) + xo, xi = s[temp_W].split(w, 16) + s[temp_W].reorder(zo, yo, xo, zi, yi, xi) + s[temp_W].bind(zi, thread_z) + s[temp_W].bind(yi, thread_y) + s[temp_W].bind(xi, thread_x) +# s[temp_S].storage_align(s[temp_S].op.axis[2], 16, 0) + s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) + s[kernel_L].compute_at(s[out_L], rx) From ba97cbe76a564a4a74defea5a0075bc507a285d2 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Fri, 13 Apr 2018 00:53:59 -0700 Subject: [PATCH 02/19] conv2d output packing added --- topi/python/topi/intel_gpu/conv2d.py | 138 +++++++++++++++++---------- 1 file changed, 88 insertions(+), 50 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 7380aeabfe81..658be93c2443 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -175,7 +175,7 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float out_channel = num_filter out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - # pad 3 and 2 in order to achieve 1, 3, 229, 229 + oshape = (batch, out_channel, out_height, out_width) pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down, pad_right] temp = pad(data, pad_before, pad_after, name="pad_temp") @@ -184,65 +184,97 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') + block_w = 0 + block_h = 0 if stride_h == 2: if num_filter + kernel_h == 515: conv_tag = "4_4" + block_h = 4 + block_w = 4 else: conv_tag = "4_5" + block_h = 4 + block_w = 5 elif kernel_h == 3: if num_filter == 512: conv_tag = "2_7" + block_h = 2 + block_w = 7 else: conv_tag = "2_14" + block_h = 2 + block_w = 14 else: conv_tag = "1_16" + block_h = 1 + block_w = 16 - return tvm.compute( - (batch, out_channel, out_height, out_width), + c_h = 0 + c_w = 0 + + if out_height % block_h == 0: + c_h = out_height + else: + c_h = (out_height // block_h + 1) * block_h + + if out_width % block_w == 0: + c_w = out_width + else: + c_w = (out_width // block_w + 1) * block_w + + cshape = (batch, out_channel, c_h, c_w) + + conv = tvm.compute( + cshape, lambda nn, ff, yy, xx: tvm.sum( temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * kernel[ff, rc, ry, rx].astype(out_dtype), - axis=[rc, ry, rx]), tag=conv_tag) + axis=[rc, ry, rx]), tag=conv_tag, name='conv') + + output = tvm.compute( + oshape, + lambda nn, ff, yy, xx: + conv[nn][ff][yy][xx], + name='output_unpack', 
tag=conv_tag) + +# if out_height % block_h ==0 and out_width % block_w == 0: +# return conv + + return output def _schedule_cl_spatialpack(s, op): output = op.output(0) - temp = op.input_tensors[0] - kernel = op.input_tensors[1] -# temp_S = s.cache_read(temp, "shared", [output]) - temp_W = s.cache_read(temp, "warp", [output]) + _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] - if output.op in s.outputs: - out = output - out_L = s.cache_write(out, "local") - else: - s[output].compute_inline() - out = s.outputs[0] - s[output].set_scope("local") - out_L = output - kernel_L = s.cache_read(kernel, "local", [out_L]) - _, _, out_height, out_width = [util.get_const_int(x) for x in out_L.shape] + conv = op.input_tensors[0] + temp = s[conv].op.input_tensors[0] + kernel = s[conv].op.input_tensors[1] + temp_W = s.cache_read(temp, "warp", [conv]) + conv_L = s.cache_write(conv, "local") + + kernel_L = s.cache_read(kernel, "local", [conv_L]) _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] - if "1_16" in op.tag: + if "1_16" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 1 OUTPUT_BLOCK_WIDTH = 16 num_threads_y = 1 num_threads_x = 64 - elif "2_14" in op.tag: + elif "2_14" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 14 num_threads_y = 1 num_threads_x = temp_h - elif "2_7" in op.tag: + elif "2_7" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 7 num_threads_y = 10 num_threads_x = 9 - elif "4_5" in op.tag: + elif "4_5" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 5 num_threads_y = 1 num_threads_x = 235 - elif "4_4" in op.tag: + elif "4_4" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 4 num_threads_y = 1 @@ -250,38 +282,33 @@ def _schedule_cl_spatialpack(s, op): PREFETCH = 4 SUBGROUP_SIZE = 16 - IN_BLOCK_ARRAY_SIZE = 13 STRIDE_SIZE_Y = out_height // OUTPUT_BLOCK_HEIGHT STRIDE_SIZE_X = out_width // OUTPUT_BLOCK_WIDTH - _, co, oh, ow = s[out].op.axis - ooh, ioh = s[out].split(oh, factor = OUTPUT_BLOCK_HEIGHT) - oow, iow = s[out].split(ow, factor = OUTPUT_BLOCK_WIDTH) - s[out].reorder(_, co, ooh, oow, ioh, iow) - - tx, thread_z, thread_y, thread_x = tile_and_bind3d(s, out, oow, ooh, co, 1, 1, 16) - - s[out_L].compute_at(s[out], tx) - - i, oc, h, w = s[out_L].op.axis - - oh, ih = s[out_L].split(h, factor = OUTPUT_BLOCK_HEIGHT) - ow, iw = s[out_L].split(w, factor = OUTPUT_BLOCK_WIDTH) - rc, ry, rx = s[out_L].op.reduce_axis - s[out_L].reorder(i, oc, oh, ow, rc, ry, rx, ih, iw) -# s[out_L].unroll(ry) -# s[out_L].unroll(rx) - - # data load from global to padded data global + # schedule conv + _, co, oh, ow = s[conv].op.axis + ooh, ioh = s[conv].split(oh, factor = OUTPUT_BLOCK_HEIGHT) + oow, iow = s[conv].split(ow, factor = OUTPUT_BLOCK_WIDTH) + s[conv].reorder(_, co, ooh, oow, ioh, iow) + tx, thread_z, thread_y, thread_x = tile_and_bind3d(s, conv, oow, ooh, co, 1, 1, 16) + + # schedule conv_L + s[conv_L].compute_at(s[conv], tx) + i, oc, h, w = s[conv_L].op.axis + oh, ih = s[conv_L].split(h, factor = OUTPUT_BLOCK_HEIGHT) + ow, iw = s[conv_L].split(w, factor = OUTPUT_BLOCK_WIDTH) + rc, ry, rx = s[conv_L].op.reduce_axis + s[conv_L].reorder(i, oc, oh, ow, rc, ry, rx, ih, iw) +# s[conv_L].unroll(ry) +# s[conv_L].unroll(rx) + + # schedule temp _, ci, h, w = s[temp].op.axis - # prameter should be changed for each workload tile_and_bind(s, temp, h, w, num_threads_y, num_threads_x) -# s[temp_S].compute_at(s[out_L], rc) - s[temp_W].compute_at(s[out_L], rc) -# _, ci, h, w = s[temp_S].op.axis + # schedule temp_W + 
s[temp_W].compute_at(s[conv_L], rc) _, ci, h, w = s[temp_W].op.axis - zo, zi = s[temp_W].split(ci, 1) yo, yi = s[temp_W].split(h, 1) xo, xi = s[temp_W].split(w, 16) @@ -289,6 +316,17 @@ def _schedule_cl_spatialpack(s, op): s[temp_W].bind(zi, thread_z) s[temp_W].bind(yi, thread_y) s[temp_W].bind(xi, thread_x) -# s[temp_S].storage_align(s[temp_S].op.axis[2], 16, 0) s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) - s[kernel_L].compute_at(s[out_L], rx) + # schedule kernel_L + s[kernel_L].compute_at(s[conv_L], rx) + + # schedule output + if output.op in s.outputs: + out = output + else: + s[output].compute_inline() + out = s.outputs[0] + + _, co, h, w = s[out].op.axis + tile_and_bind3d(s, out, w, h, co, 1, 1, 64) + From 2579bfbded26fb1313766b046a897aa5ca255ad2 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 17 Apr 2018 16:37:36 -0700 Subject: [PATCH 03/19] intel gpu conv2d updated --- topi/python/topi/intel_gpu/conv2d.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 658be93c2443..cf219fad29f6 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -295,10 +295,8 @@ def _schedule_cl_spatialpack(s, op): # schedule conv_L s[conv_L].compute_at(s[conv], tx) i, oc, h, w = s[conv_L].op.axis - oh, ih = s[conv_L].split(h, factor = OUTPUT_BLOCK_HEIGHT) - ow, iw = s[conv_L].split(w, factor = OUTPUT_BLOCK_WIDTH) rc, ry, rx = s[conv_L].op.reduce_axis - s[conv_L].reorder(i, oc, oh, ow, rc, ry, rx, ih, iw) + s[conv_L].reorder(i, oc, rc, ry, rx, h, w) # s[conv_L].unroll(ry) # s[conv_L].unroll(rx) @@ -318,7 +316,13 @@ def _schedule_cl_spatialpack(s, op): s[temp_W].bind(xi, thread_x) s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) # schedule kernel_L - s[kernel_L].compute_at(s[conv_L], rx) + if "2_14" in s[conv].op.tag: +# i, oc, h, w = s[conv_L].op.axis +# s[conv_L].reorder(i, oc, rc, ry, h, w, rx) + s[kernel_L].compute_at(s[conv_L], ry) +# s[conv_L].vectorize(rx) + else: + s[kernel_L].compute_at(s[conv_L], rx) # schedule output if output.op in s.outputs: From c0821be6389a880bb491b80c58410068dd666cea Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 19 Apr 2018 11:19:05 -0700 Subject: [PATCH 04/19] minor changes --- topi/python/topi/intel_gpu/conv2d.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index cf219fad29f6..6e44fecc6add 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -280,11 +280,6 @@ def _schedule_cl_spatialpack(s, op): num_threads_y = 1 num_threads_x = 17 - PREFETCH = 4 - SUBGROUP_SIZE = 16 - STRIDE_SIZE_Y = out_height // OUTPUT_BLOCK_HEIGHT - STRIDE_SIZE_X = out_width // OUTPUT_BLOCK_WIDTH - # schedule conv _, co, oh, ow = s[conv].op.axis ooh, ioh = s[conv].split(oh, factor = OUTPUT_BLOCK_HEIGHT) From 436e3359b9a0a5e628d25ac632585e726d114b3a Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 3 May 2018 12:21:21 -0700 Subject: [PATCH 05/19] conv2d packing schedule fixed --- topi/python/topi/intel_gpu/conv2d.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 6e44fecc6add..55dd4dfab0ab 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -257,28 +257,18 @@ def _schedule_cl_spatialpack(s, op): if "1_16" in s[conv].op.tag: 
OUTPUT_BLOCK_HEIGHT = 1 OUTPUT_BLOCK_WIDTH = 16 - num_threads_y = 1 - num_threads_x = 64 elif "2_14" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 14 - num_threads_y = 1 - num_threads_x = temp_h elif "2_7" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 7 - num_threads_y = 10 - num_threads_x = 9 elif "4_5" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 5 - num_threads_y = 1 - num_threads_x = 235 elif "4_4" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 4 - num_threads_y = 1 - num_threads_x = 17 # schedule conv _, co, oh, ow = s[conv].op.axis @@ -292,12 +282,16 @@ def _schedule_cl_spatialpack(s, op): i, oc, h, w = s[conv_L].op.axis rc, ry, rx = s[conv_L].op.reduce_axis s[conv_L].reorder(i, oc, rc, ry, rx, h, w) -# s[conv_L].unroll(ry) -# s[conv_L].unroll(rx) + s[conv_L].unroll(ry) + s[conv_L].unroll(rx) # schedule temp + num_thread_z = 1 + num_thread_y = 16 + num_thread_x = 16 + _, ci, h, w = s[temp].op.axis - tile_and_bind(s, temp, h, w, num_threads_y, num_threads_x) + tile_and_bind3d(s, temp, ci, h, w, num_thread_z, num_thread_y, num_thread_x) # schedule temp_W s[temp_W].compute_at(s[conv_L], rc) @@ -327,5 +321,5 @@ def _schedule_cl_spatialpack(s, op): out = s.outputs[0] _, co, h, w = s[out].op.axis - tile_and_bind3d(s, out, w, h, co, 1, 1, 64) + tile_and_bind3d(s, out, w, h, co, 4, 8, 8) From e1628c118f8a5146a4f039b46cdb21d93ad5d771 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 3 May 2018 16:07:48 -0700 Subject: [PATCH 06/19] conv2d.py --- topi/python/topi/intel_gpu/conv2d.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 55dd4dfab0ab..a934aa6aaeae 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -282,8 +282,9 @@ def _schedule_cl_spatialpack(s, op): i, oc, h, w = s[conv_L].op.axis rc, ry, rx = s[conv_L].op.reduce_axis s[conv_L].reorder(i, oc, rc, ry, rx, h, w) - s[conv_L].unroll(ry) - s[conv_L].unroll(rx) + if kernel.shape[3].value != 7: + s[conv_L].unroll(ry) + s[conv_L].unroll(rx) # schedule temp num_thread_z = 1 @@ -306,10 +307,7 @@ def _schedule_cl_spatialpack(s, op): s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) # schedule kernel_L if "2_14" in s[conv].op.tag: -# i, oc, h, w = s[conv_L].op.axis -# s[conv_L].reorder(i, oc, rc, ry, h, w, rx) s[kernel_L].compute_at(s[conv_L], ry) -# s[conv_L].vectorize(rx) else: s[kernel_L].compute_at(s[conv_L], rx) From c1b83d95a22900f5457401e4ffb9da5812a3435c Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 3 May 2018 16:56:39 -0700 Subject: [PATCH 07/19] weight reordering added --- topi/python/topi/intel_gpu/conv2d.py | 70 ++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index a934aa6aaeae..b5f06cbb21c0 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -222,24 +222,29 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float else: c_w = (out_width // block_w + 1) * block_w - cshape = (batch, out_channel, c_h, c_w) + nv = 16 + cshape = (batch, out_channel // nv, c_h, c_w, nv) + kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv) + + kernel_vec = tvm.compute( + kvshape, + lambda co, ci, kh, kw, vc: + kernel[co*nv + vc][ci][kh][kw], name='kernel_vec') conv = tvm.compute( cshape, - lambda nn, ff, yy, xx: tvm.sum( + 
lambda nn, ff, yy, xx, vc:\ + tvm.sum( temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * - kernel[ff, rc, ry, rx].astype(out_dtype), + kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), axis=[rc, ry, rx]), tag=conv_tag, name='conv') output = tvm.compute( oshape, lambda nn, ff, yy, xx: - conv[nn][ff][yy][xx], + conv[nn][ff//nv][yy][xx][ff%nv], name='output_unpack', tag=conv_tag) -# if out_height % block_h ==0 and out_width % block_w == 0: -# return conv - return output def _schedule_cl_spatialpack(s, op): @@ -248,51 +253,69 @@ def _schedule_cl_spatialpack(s, op): conv = op.input_tensors[0] temp = s[conv].op.input_tensors[0] - kernel = s[conv].op.input_tensors[1] + kernel_vec = s[conv].op.input_tensors[1] + kernel = s[kernel_vec].op.input_tensors[0] temp_W = s.cache_read(temp, "warp", [conv]) conv_L = s.cache_write(conv, "local") - kernel_L = s.cache_read(kernel, "local", [conv_L]) + kernel_L = s.cache_read(kernel_vec, "local", [conv_L]) _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] if "1_16" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 1 OUTPUT_BLOCK_WIDTH = 16 +# kernel_vec_tile = "fuse" elif "2_14" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 14 +# kernel_vec_tile = "fuse" elif "2_7" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 7 +# kernel_vec_tile = "inline" elif "4_5" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 5 +# kernel_vec_tile = "inline" elif "4_4" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 4 +# kernel_vec_tile = "fuse" # schedule conv - _, co, oh, ow = s[conv].op.axis + z_factor = 1 + y_factor = 1 + x_factor = 16 + thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") + thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") + thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") + _, co, oh, ow, vc = s[conv].op.axis ooh, ioh = s[conv].split(oh, factor = OUTPUT_BLOCK_HEIGHT) oow, iow = s[conv].split(ow, factor = OUTPUT_BLOCK_WIDTH) - s[conv].reorder(_, co, ooh, oow, ioh, iow) - tx, thread_z, thread_y, thread_x = tile_and_bind3d(s, conv, oow, ooh, co, 1, 1, 16) + s[conv].reorder(_, co, ooh, oow, vc, ioh, iow) + coo, coi = s[conv].split(co, nparts = 1) + ooho, oohi = s[conv].split(ooh, factor = z_factor) + oowo, oowi = s[conv].split(oow, factor = y_factor) + vco, vci = s[conv].split(vc, factor = x_factor) + s[conv].reorder(_, coo, vco, ooho, oowo, coi, oohi, oowi, vci, ioh, iow) + s[conv].bind(oohi, thread_z) + s[conv].bind(oowi, thread_y) + s[conv].bind(vci, thread_x) + s[conv].bind(ooho, tvm.thread_axis("blockIdx.z")) + s[conv].bind(oowo, tvm.thread_axis("blockIdx.y")) + s[conv].bind(coi, tvm.thread_axis("blockIdx.x")) # schedule conv_L - s[conv_L].compute_at(s[conv], tx) - i, oc, h, w = s[conv_L].op.axis + s[conv_L].compute_at(s[conv], vci) + i, oc, h, w, vc = s[conv_L].op.axis rc, ry, rx = s[conv_L].op.reduce_axis - s[conv_L].reorder(i, oc, rc, ry, rx, h, w) + s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) if kernel.shape[3].value != 7: s[conv_L].unroll(ry) s[conv_L].unroll(rx) # schedule temp - num_thread_z = 1 - num_thread_y = 16 - num_thread_x = 16 - _, ci, h, w = s[temp].op.axis - tile_and_bind3d(s, temp, ci, h, w, num_thread_z, num_thread_y, num_thread_x) + tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) # schedule temp_W s[temp_W].compute_at(s[conv_L], rc) @@ -305,6 +328,13 @@ def _schedule_cl_spatialpack(s, op): s[temp_W].bind(yi, thread_y) s[temp_W].bind(xi, thread_x) s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) + + #schedule kernel +# if 
kernel_vec_tile == "fuse": +# fuse_and_bind(s, kernel_vec, num_thread = 256) +# else: + s[kernel_vec].compute_inline() + # schedule kernel_L if "2_14" in s[conv].op.tag: s[kernel_L].compute_at(s[conv_L], ry) From ed68a27de269962ccf0277d6d460beaa61f113d4 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 17 May 2018 13:24:02 -0700 Subject: [PATCH 08/19] code reduce --- topi/python/topi/intel_gpu/conv2d.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index b5f06cbb21c0..6080a8c46592 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -209,17 +209,13 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float block_h = 1 block_w = 16 - c_h = 0 - c_w = 0 + c_h = out_height + c_w = out_width - if out_height % block_h == 0: - c_h = out_height - else: + if not out_height % block_h == 0: c_h = (out_height // block_h + 1) * block_h - if out_width % block_w == 0: - c_w = out_width - else: + if not out_width % block_w == 0: c_w = (out_width // block_w + 1) * block_w nv = 16 From 97f0771411d21361ea1e44cbad254b1d1468cd4d Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Mon, 21 May 2018 16:20:25 -0700 Subject: [PATCH 09/19] kernel consist packing added with improved end-to-end performance --- topi/python/topi/intel_gpu/conv2d.py | 121 ++++++++------------------- 1 file changed, 36 insertions(+), 85 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 6080a8c46592..3ac7117cc99d 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -10,66 +10,16 @@ from .. import util from .. import tag from ..nn import pad -from ..nn.conv2d import conv2d +from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, _get_workload from ..nn.util import get_pad_tuple from ..util import simplify +import nnvm +import nnvm.symbol as sym +from nnvm.top import registry as reg -##### SCHEDULE UTILITIES ##### -def fuse_and_bind(s, tensor, axis=None, num_thread=None): - """ fuse all the axis and bind to GPU threads """ - axis = axis or s[tensor].op.axis - fused = s[tensor].fuse(*axis) - max_threads = tvm.target.current_target(allow_none=False).max_num_threads - bx, tx = s[tensor].split(fused, num_thread or max_threads) - s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) - return bx, tx - -def split_and_bind(s, tensor, x, x_factor=1): - bx, tx = s[tensor].split(x, factor = x_factor) - s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) - s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) - return bx, tx - -def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None): - """ tile and bind to GPU threads """ - x_factor = x_factor or y_factor - yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor) - s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) - s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) - s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) - return yo, xo, yi, xi - -def cache_tile_and_bind3d(s, tensor, z, y, x, z_factor = 2, y_factor=None, x_factor=None): - """ tile and bind cache to GPU threads""" - x_factor = x_factor or z_factor - y_factor = y_factor or z_factor - zo, zi = s[tensor].split(z, z_factor) - yo, yi = s[tensor].split(y, y_factor) - xo, xi = s[tensor].split(x, x_factor) - s[tensor].reorder(zo, yo, xo, zi, yi, xi) - 
s[tensor].bind(zi, tvm.thread_axis("threadIdx.z")) - s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) - return zo, yo, xo, zi, yi, xi - -def cache_tile_and_bind(s, tensor, y, x, y_factor=2, x_factor=None): - """ tile and bind cache to GPU threads""" - x_factor = x_factor or y_factor - yo, yi = s[tensor].split(y, y_factor) - xo, xi = s[tensor].split(x, x_factor) - s[tensor].reorder(yo, xo, yi, xi) - s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) - return yo, xo, yi, xi - -def cache_split_and_bind(s, tensor, x, x_factor=1): - xo, xi = s[tensor].split(x, x_factor) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) - return xo +##### SCHEDULE UTILITIES ##### def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None): """ tile and bind 3d """ y_factor = y_factor or z_factor @@ -90,8 +40,27 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None s[tensor].bind(xi, thread_x) return xi, thread_z, thread_y, thread_x -@conv2d.register(["intel_gpu"]) -def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): +@conv2d_alter_layout.register(["intel_gpu"]) +def _alter_conv2d_layout(attrs, inputs, tinfos): + copy_inputs = [s for s in inputs] + + data = tinfos[0] + kernel = tinfos[1] + + import ast + padding = ast.literal_eval(attrs['padding']) + stride = ast.literal_eval(attrs['strides']) + + wkl = _get_workload(data, kernel, stride, padding, data.dtype) + oc_bn = 16 + + new_attrs = {k: attrs[k] for k in attrs.keys()} + new_attrs['kernel_layout'] = 'OIHW%do' % (oc_bn) + + return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) + +@conv2d_NCHWc.register(["intel_gpu"]) +def _decl_conv2d(data, kernel, num_filter, kernel_size, stride, padding, out_dtype='float32'): """Conv2D operator for Intel GPU backend. Parameters @@ -100,7 +69,7 @@ def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32 4-D with shape [batch, in_channel, in_height, in_width] kernel : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] + 5-D with shape [num_filter, in_channel, filter_height, filter_width, nnum_filter_vec] stride : int or a list/tuple of two ints stride size, or [stride_height, stride_width] @@ -116,7 +85,6 @@ def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32 output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ - assert layout == 'NCHW', "only support NCHW convolution on intel gpu" assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." 
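As an illustrative aside (not part of the patch series): the 'OIHW%do' kernel_layout that _alter_conv2d_layout requests above is the same repack that the kernel_vec compute in _decl_cl_spatialpack performs, i.e. kernel_vec[co, ci, kh, kw, vc] = kernel[co*nv + vc, ci, kh, kw] with nv = oc_bn = 16. A minimal numpy sketch with a hypothetical kernel shape:

import numpy as np

nv = 16                                                   # oc_bn chosen above
kernel = np.random.rand(64, 32, 3, 3).astype('float32')   # OIHW, hypothetical shape
# OIHW -> OIHW16o: split the output-channel axis by nv and move the inner factor last,
# matching kernel_vec[co, ci, kh, kw, vc] = kernel[co*nv + vc, ci, kh, kw]
kernel_vec = kernel.reshape(64 // nv, nv, 32, 3, 3).transpose(0, 2, 3, 4, 1)
assert kernel_vec.shape == (4, 32, 3, 3, 16)
assert kernel_vec[1, 5, 2, 0, 3] == kernel[1 * nv + 3, 5, 2, 0]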
@@ -128,10 +96,10 @@ def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32 else: HSTR, WSTR = stride, stride - return _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype) + return _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype) -@generic.schedule_conv2d_nchw.register(["intel_gpu"]) -def schedule_conv2d_nchw(outs): +@generic.schedule_conv2d_NCHWc.register(["intel_gpu"]) +def schedule_conv2d_nchw(num_filter, kernel_size, stride, padding, outs): """Schedule for conv2d_nchw for Intel GPU Parameters @@ -162,9 +130,10 @@ def traverse(op): traverse(outs[0].op) return s -def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'): +def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype='float16'): batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] - num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] + num_filter, channel, kernel_h, kernel_w, nv = [util.get_const_int(x) for x in kernel.shape] + num_filter = num_filter * nv pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) if isinstance(stride, (tuple, list)): @@ -218,21 +187,14 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float if not out_width % block_w == 0: c_w = (out_width // block_w + 1) * block_w - nv = 16 cshape = (batch, out_channel // nv, c_h, c_w, nv) - kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv) - - kernel_vec = tvm.compute( - kvshape, - lambda co, ci, kh, kw, vc: - kernel[co*nv + vc][ci][kh][kw], name='kernel_vec') conv = tvm.compute( cshape, lambda nn, ff, yy, xx, vc:\ tvm.sum( temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * - kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), + kernel[ff, rc, ry, rx, vc].astype(out_dtype), axis=[rc, ry, rx]), tag=conv_tag, name='conv') output = tvm.compute( @@ -249,33 +211,27 @@ def _schedule_cl_spatialpack(s, op): conv = op.input_tensors[0] temp = s[conv].op.input_tensors[0] - kernel_vec = s[conv].op.input_tensors[1] - kernel = s[kernel_vec].op.input_tensors[0] + kernel = s[conv].op.input_tensors[1] temp_W = s.cache_read(temp, "warp", [conv]) conv_L = s.cache_write(conv, "local") - kernel_L = s.cache_read(kernel_vec, "local", [conv_L]) + kernel_L = s.cache_read(kernel, "local", [conv_L]) _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] if "1_16" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 1 OUTPUT_BLOCK_WIDTH = 16 -# kernel_vec_tile = "fuse" elif "2_14" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 14 -# kernel_vec_tile = "fuse" elif "2_7" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 7 -# kernel_vec_tile = "inline" elif "4_5" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 5 -# kernel_vec_tile = "inline" elif "4_4" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 4 -# kernel_vec_tile = "fuse" # schedule conv z_factor = 1 @@ -326,10 +282,6 @@ def _schedule_cl_spatialpack(s, op): s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) #schedule kernel -# if kernel_vec_tile == "fuse": -# fuse_and_bind(s, kernel_vec, num_thread = 256) -# else: - s[kernel_vec].compute_inline() # schedule kernel_L if "2_14" in s[conv].op.tag: @@ -346,4 +298,3 @@ def _schedule_cl_spatialpack(s, op): _, co, h, w = s[out].op.axis tile_and_bind3d(s, out, w, h, co, 4, 8, 8) - From b2f450fcc52945dec9d957b30b15405e6de6cb69 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 22 May 2018 
12:57:03 -0700 Subject: [PATCH 10/19] merged conv2d_nchw and conv2d_NCHWc --- topi/python/topi/intel_gpu/conv2d.py | 252 ++++++++++++++++++++++++++- 1 file changed, 247 insertions(+), 5 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 3ac7117cc99d..1445016129a6 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -96,10 +96,10 @@ def _decl_conv2d(data, kernel, num_filter, kernel_size, stride, padding, out_dt else: HSTR, WSTR = stride, stride - return _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype) + return _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype) @generic.schedule_conv2d_NCHWc.register(["intel_gpu"]) -def schedule_conv2d_nchw(num_filter, kernel_size, stride, padding, outs): +def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding, outs): """Schedule for conv2d_nchw for Intel GPU Parameters @@ -125,12 +125,12 @@ def traverse(op): if tensor.op.input_tensors: traverse(tensor.op) if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag or "1_16" in op.tag: - _schedule_cl_spatialpack(s,op) + _schedule_cl_spatialpack_NCHWc(s,op) traverse(outs[0].op) return s -def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype='float16'): +def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16'): batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] num_filter, channel, kernel_h, kernel_w, nv = [util.get_const_int(x) for x in kernel.shape] num_filter = num_filter * nv @@ -205,7 +205,7 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype='float16'): return output -def _schedule_cl_spatialpack(s, op): +def _schedule_cl_spatialpack_NCHWc(s, op): output = op.output(0) _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] @@ -298,3 +298,245 @@ def _schedule_cl_spatialpack(s, op): _, co, h, w = s[out].op.axis tile_and_bind3d(s, out, w, h, co, 4, 8, 8) + + +@conv2d.register(["intel_gpu"]) +def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): + """Conv2D operator for Intel GPU backend. + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + kernel : tvm.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + padding : int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + layout : str + layout of data + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + assert layout == 'NCHW', "only support NCHW convolution on intel gpu" + assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" + assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." 
+ + out_dtype = data.dtype + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + kernel_shape = util.get_const_tuple(kernel.shape) + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + + return _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype) + +@generic.schedule_conv2d_nchw.register(["intel_gpu"]) +def schedule_conv2d_nchw(outs): + """Schedule for conv2d_nchw for Intel GPU + Parameters + ---------- + outs: Array of Tensor + The computation graph description of conv2d_nchw + in the format of an array of tensors. + Returns + ------- + s: Schedule + The computation schedule for conv2d_nchw. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + """inline all one-to-one-mapping operators except the last stage (output)""" + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag or "1_16" in op.tag: + _schedule_cl_spatialpack(s,op) + + traverse(outs[0].op) + return s + +def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'): + batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] + num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] + pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + stride_h, stride_w = stride + else: + stride_h, stride_w = stride, stride + + out_channel = num_filter + out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) + oshape = (batch, out_channel, out_height, out_width) + pad_before = [0, 0, pad_top, pad_left] + pad_after = [0, 0, pad_down, pad_right] + temp = pad(data, pad_before, pad_after, name="pad_temp") + + rc = tvm.reduce_axis((0, in_channel), name='rc') + ry = tvm.reduce_axis((0, kernel_h), name='ry') + rx = tvm.reduce_axis((0, kernel_w), name='rx') + + block_w = 0 + block_h = 0 + if stride_h == 2: + if num_filter + kernel_h == 515: + conv_tag = "4_4" + block_h = 4 + block_w = 4 + else: + conv_tag = "4_5" + block_h = 4 + block_w = 5 + elif kernel_h == 3: + if num_filter == 512: + conv_tag = "2_7" + block_h = 2 + block_w = 7 + else: + conv_tag = "2_14" + block_h = 2 + block_w = 14 + else: + conv_tag = "1_16" + block_h = 1 + block_w = 16 + + c_h = out_height + c_w = out_width + + if not out_height % block_h == 0: + c_h = (out_height // block_h + 1) * block_h + + if not out_width % block_w == 0: + c_w = (out_width // block_w + 1) * block_w + + nv = 16 + cshape = (batch, out_channel // nv, c_h, c_w, nv) + kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv) + + kernel_vec = tvm.compute( + kvshape, + lambda co, ci, kh, kw, vc: + kernel[co*nv + vc][ci][kh][kw], name='kernel_vec') + + conv = tvm.compute( + cshape, + lambda nn, ff, yy, xx, vc:\ + tvm.sum( + temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * + kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), + axis=[rc, ry, rx]), tag=conv_tag, name='conv') + + output = tvm.compute( + oshape, + lambda nn, ff, yy, xx: + conv[nn][ff//nv][yy][xx][ff%nv], + name='output_unpack', tag=conv_tag) + + return output + +def _schedule_cl_spatialpack(s, op): + output = op.output(0) + _, _, 
out_height, out_width = [util.get_const_int(x) for x in output.shape] + + conv = op.input_tensors[0] + temp = s[conv].op.input_tensors[0] + kernel_vec = s[conv].op.input_tensors[1] + kernel = s[kernel_vec].op.input_tensors[0] + temp_W = s.cache_read(temp, "warp", [conv]) + conv_L = s.cache_write(conv, "local") + + kernel_L = s.cache_read(kernel_vec, "local", [conv_L]) + _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] + + if "1_16" in s[conv].op.tag: + OUTPUT_BLOCK_HEIGHT = 1 + OUTPUT_BLOCK_WIDTH = 16 + elif "2_14" in s[conv].op.tag: + OUTPUT_BLOCK_HEIGHT = 2 + OUTPUT_BLOCK_WIDTH = 14 + elif "2_7" in s[conv].op.tag: + OUTPUT_BLOCK_HEIGHT = 2 + OUTPUT_BLOCK_WIDTH = 7 + elif "4_5" in s[conv].op.tag: + OUTPUT_BLOCK_HEIGHT = 4 + OUTPUT_BLOCK_WIDTH = 5 + elif "4_4" in s[conv].op.tag: + OUTPUT_BLOCK_HEIGHT = 4 + OUTPUT_BLOCK_WIDTH = 4 + + # schedule conv + z_factor = 1 + y_factor = 1 + x_factor = 16 + thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") + thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") + thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") + _, co, oh, ow, vc = s[conv].op.axis + ooh, ioh = s[conv].split(oh, factor = OUTPUT_BLOCK_HEIGHT) + oow, iow = s[conv].split(ow, factor = OUTPUT_BLOCK_WIDTH) + s[conv].reorder(_, co, ooh, oow, vc, ioh, iow) + coo, coi = s[conv].split(co, nparts = 1) + ooho, oohi = s[conv].split(ooh, factor = z_factor) + oowo, oowi = s[conv].split(oow, factor = y_factor) + vco, vci = s[conv].split(vc, factor = x_factor) + s[conv].reorder(_, coo, vco, ooho, oowo, coi, oohi, oowi, vci, ioh, iow) + s[conv].bind(oohi, thread_z) + s[conv].bind(oowi, thread_y) + s[conv].bind(vci, thread_x) + s[conv].bind(ooho, tvm.thread_axis("blockIdx.z")) + s[conv].bind(oowo, tvm.thread_axis("blockIdx.y")) + s[conv].bind(coi, tvm.thread_axis("blockIdx.x")) + + # schedule conv_L + s[conv_L].compute_at(s[conv], vci) + i, oc, h, w, vc = s[conv_L].op.axis + rc, ry, rx = s[conv_L].op.reduce_axis + s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) + if kernel.shape[3].value != 7: + s[conv_L].unroll(ry) + s[conv_L].unroll(rx) + + # schedule temp + _, ci, h, w = s[temp].op.axis + tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) + + # schedule temp_W + s[temp_W].compute_at(s[conv_L], rc) + _, ci, h, w = s[temp_W].op.axis + zo, zi = s[temp_W].split(ci, 1) + yo, yi = s[temp_W].split(h, 1) + xo, xi = s[temp_W].split(w, 16) + s[temp_W].reorder(zo, yo, xo, zi, yi, xi) + s[temp_W].bind(zi, thread_z) + s[temp_W].bind(yi, thread_y) + s[temp_W].bind(xi, thread_x) + s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) + + s[kernel_vec].compute_inline() + + # schedule kernel_L + if "2_14" in s[conv].op.tag: + s[kernel_L].compute_at(s[conv_L], ry) + else: + s[kernel_L].compute_at(s[conv_L], rx) + + # schedule output + if output.op in s.outputs: + out = output + else: + s[output].compute_inline() + out = s.outputs[0] + + _, co, h, w = s[out].op.axis + tile_and_bind3d(s, out, w, h, co, 4, 8, 8) From 27265fd106bfc47f032adbbe530e25754bafed08 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 22 May 2018 15:17:18 -0700 Subject: [PATCH 11/19] resnet50 end-to-end perform added --- topi/python/topi/intel_gpu/conv2d.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 1445016129a6..c394d79f6744 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -216,7 +216,7 @@ def _schedule_cl_spatialpack_NCHWc(s, op): conv_L 
= s.cache_write(conv, "local") kernel_L = s.cache_read(kernel, "local", [conv_L]) - _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] + _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] if "1_16" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 1 OUTPUT_BLOCK_WIDTH = 16 @@ -260,7 +260,17 @@ def _schedule_cl_spatialpack_NCHWc(s, op): s[conv_L].compute_at(s[conv], vci) i, oc, h, w, vc = s[conv_L].op.axis rc, ry, rx = s[conv_L].op.reduce_axis - s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) + if in_channel == 2048: + rco, rci = s[conv_L].split(rc, nparts = 128) + s[conv_L].unroll(rci) + s[conv_L].reorder(i, oc, rco, rci, ry, rx, vc, h, w) + s[temp_W].compute_at(s[conv_L], rco) + else: + s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) + s[temp_W].compute_at(s[conv_L], rc) + if kernel.shape[3].value != 7: + s[conv_L].unroll(ry) + s[conv_L].unroll(rx) if kernel.shape[3].value != 7: s[conv_L].unroll(ry) s[conv_L].unroll(rx) @@ -270,7 +280,6 @@ def _schedule_cl_spatialpack_NCHWc(s, op): tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) # schedule temp_W - s[temp_W].compute_at(s[conv_L], rc) _, ci, h, w = s[temp_W].op.axis zo, zi = s[temp_W].split(ci, 1) yo, yi = s[temp_W].split(h, 1) @@ -457,7 +466,7 @@ def _schedule_cl_spatialpack(s, op): conv_L = s.cache_write(conv, "local") kernel_L = s.cache_read(kernel_vec, "local", [conv_L]) - _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] + _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] if "1_16" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 1 @@ -502,7 +511,14 @@ def _schedule_cl_spatialpack(s, op): s[conv_L].compute_at(s[conv], vci) i, oc, h, w, vc = s[conv_L].op.axis rc, ry, rx = s[conv_L].op.reduce_axis - s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) + if in_channel == 2048: + rco, rci = s[conv_L].split(rc, nparts = 128) + s[conv_L].unroll(rci) + s[conv_L].reorder(i, oc, rco, rci, ry, rx, vc, h, w) + s[temp_W].compute_at(s[conv_L], rco) + else: + s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) + s[temp_W].compute_at(s[conv_L], rc) if kernel.shape[3].value != 7: s[conv_L].unroll(ry) s[conv_L].unroll(rx) @@ -512,7 +528,6 @@ def _schedule_cl_spatialpack(s, op): tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) # schedule temp_W - s[temp_W].compute_at(s[conv_L], rc) _, ci, h, w = s[temp_W].op.axis zo, zi = s[temp_W].split(ci, 1) yo, yi = s[temp_W].split(h, 1) From d4aed968ddf85b9eff512d41d356e1285b11cb4b Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Mon, 11 Jun 2018 23:28:10 -0700 Subject: [PATCH 12/19] multibox_prior gpu ir added, clip tests udpated --- topi/python/topi/cuda/__init__.py | 3 + topi/python/topi/cuda/ssd/__init__.py | 5 + topi/python/topi/cuda/ssd/multibox.py | 328 ++++++++++++++++++++++++++ topi/python/topi/cuda/vision.py | 29 ++- topi/tests/python/test_topi_clip.py | 14 +- topi/tests/python/test_topi_vision.py | 13 +- 6 files changed, 379 insertions(+), 13 deletions(-) create mode 100644 topi/python/topi/cuda/ssd/__init__.py create mode 100644 topi/python/topi/cuda/ssd/multibox.py diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index dbf00ebeb52b..dff76fc4ebf5 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -18,3 +18,6 @@ from .vision import schedule_region from .vision import schedule_reorg from .nn import schedule_lrn, schedule_l2_normalize +from .vision import schedule_multibox_prior +from . 
import ssd +from .ssd import multibox_prior diff --git a/topi/python/topi/cuda/ssd/__init__.py b/topi/python/topi/cuda/ssd/__init__.py new file mode 100644 index 000000000000..d680c578e7aa --- /dev/null +++ b/topi/python/topi/cuda/ssd/__init__.py @@ -0,0 +1,5 @@ +# pylint: disable=wildcard-import +"""VISION network operators""" +from __future__ import absolute_import as _abs + +from .multibox import * diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py new file mode 100644 index 000000000000..fde2b182141e --- /dev/null +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -0,0 +1,328 @@ +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +"""SSD multibox operators""" +from __future__ import absolute_import as _abs +import math +import tvm + +from tvm import api + +import topi + +from ..nms import nms +from topi.vision.ssd import multibox_prior +from topi.vision.ssd import multibox_detection +from topi.vision.ssd import multibox_transform_loc + +def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): + """Low level IR routing for multibox_prior operator. + + Parameters + ---------- + data : Buffer + Input data buffer. + + out : Buffer + Output buffer. + + sizes : tuple of float + Tuple of sizes for anchor boxes. + + ratios : tuple of float + Tuple of ratios for anchor boxes. + + steps : Tuple of int + Priorbox step across y and x, -1 for auto calculation. + + offsets : tuple of int + Priorbox center offsets, y and x respectively. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + tx = tvm.thread_axis("threadIdx.x") + ty = tvm.thread_axis("threadIdx.y") + bx = tvm.thread_axis("blockIdx.x") + by = tvm.thread_axis("blockIdx.y") + ib = tvm.ir_builder.create() + p_out = ib.buffer_ptr(out) + in_height = data.shape[2] + in_width = data.shape[3] + nthread_tx = max_threads + nthread_bx = in_height // max_threads + 1 + nthread_ty = max_threads + nthread_by = in_width // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(ty, "thread_extent", nthread_ty) + ib.scope_attr(bx, "thread_extent", nthread_bx) + ib.scope_attr(by, "thread_extent", nthread_by) + + num_sizes = len(sizes) + num_ratios = len(ratios) + size_ratio_concat = sizes + ratios + steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height + steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width + offset_h = offsets[0] + offset_w = offsets[1] + + i = bx * max_threads + tx + j = by * max_threads + ty + with ib.if_scope((i < in_height)): + with ib.if_scope((j < in_width)): + center_h = (i + offset_h) * steps_h + center_w = (j + offset_w) * steps_w + + for k in range(num_sizes + num_ratios - 1): + w = tvm.select(k < num_sizes, + size_ratio_concat[k] * in_height / in_width / 2.0, + size_ratio_concat[0] * in_height / in_width * + math.sqrt(size_ratio_concat[k + 1]) / 2.0) + h = tvm.select(k < num_sizes, size_ratio_concat[k] / 2.0, + size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0) + count = (i * in_width * (num_sizes + num_ratios - 1) + + j * (num_sizes + num_ratios - 1) + k) * 4 + p_out[count] = center_w - w + p_out[count + 1] = center_h - h + p_out[count + 2] = center_w + w + p_out[count + 3] = center_h + h + + body = ib.get() + return body + + +@multibox_prior.register("cuda") +def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): + """Generate prior(anchor) boxes from data, 
sizes and ratios. + + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, c_in, h_in, w_in]] + + sizes : tuple of float + Tuple of sizes for anchor boxes. + + ratios : tuple of float + Tuple of ratios for anchor boxes. + + steps : Tuple of int + Priorbox step across y and x, -1 for auto calculation. + + offsets : tuple of int + Priorbox center offsets, y and x respectively. + + clip : boolean + Whether to clip out-of-boundary boxes. + + Returns + ------- + out : tvm.Tensor + 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] + """ + num_sizes = len(sizes) + num_ratios = len(ratios) + oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4) + out = tvm.extern(oshape, [data], lambda ins, outs: + multibox_prior_ir(ins[0], outs[0], sizes, ratios, steps, offsets), + tag="multibox_prior") + if clip: + out = topi.clip(out, 0, 1) + return out + + +def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, threshold, variances): + """Low level IR routing for transform location in multibox_detection operator. + + Parameters + ---------- + cls_prob : Buffer + Buffer of class probabilities. + + loc_pred : Buffer + Buffer of location regression predictions. + + anchor : Buffer + Buffer of prior anchor boxes. + + valid_count : Buffer + Buffer of number of valid output boxes. + + out : Buffer + Output buffer. + + clip : boolean + Whether to clip out-of-boundary boxes. + + threshold : float + Threshold to be a positive prediction. + + variances : tuple of float + Variances to be decoded from box regression output. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, vh): + """Transform prior anchor box to output box through location predictions. 
+ """ + al = anchor[anchor_base_idx] + at = anchor[anchor_base_idx + 1] + ar = anchor[anchor_base_idx + 2] + ab = anchor[anchor_base_idx + 3] + aw = ar - al + ah = ab - at + ax = (al + ar) / 2.0 + ay = (at + ab) / 2.0 + px = loc[loc_base_idx] + py = loc[loc_base_idx + 1] + pw = loc[loc_base_idx + 2] + ph = loc[loc_base_idx + 3] + ox = px * vx * aw + ax + oy = py * vy * ah + ay + ow = tvm.exp(pw * vw) * aw / 2.0 + oh = tvm.exp(ph * vh) * ah / 2.0 + return tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox - ow)), ox - ow), \ + tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \ + tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \ + tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh) + + batch_size = cls_prob.shape[0] + num_classes = cls_prob.shape[1] + num_anchors = cls_prob.shape[2] + + ib = tvm.ir_builder.create() + p_cls_prob = ib.buffer_ptr(cls_prob) + p_loc_pred = ib.buffer_ptr(loc_pred) + p_anchor = ib.buffer_ptr(anchor) + p_valid_count = ib.buffer_ptr(valid_count) + p_out = ib.buffer_ptr(out) + with ib.for_range(0, batch_size, for_type="parallel", name="n") as n: + p_valid_count[n] = 0 + with ib.for_range(0, num_anchors, name="i") as i: + # Find the predicted class id and probability + score = ib.allocate('float32', (1,), name="score", scope="local") + cls_id = ib.allocate('int32', (1,), name="id", scope="local") + score[0] = -1.0 + cls_id[0] = 0 + with ib.for_range(0, num_classes, name="j") as j: + with ib.if_scope(j > 0): + temp = p_cls_prob[n * num_anchors * num_classes + j * num_anchors + i] + cls_id[0] = tvm.select(temp > score[0], j, cls_id[0]) + score[0] = tvm.make.Max(temp, score[0]) + with ib.if_scope(tvm.all(cls_id[0] > 0, score[0] < threshold)): + cls_id[0] = 0 + # [id, prob, xmin, ymin, xmax, ymax] + # Remove background, restore original id + with ib.if_scope(cls_id[0] > 0): + out_base_idx = n * num_anchors * 6 + p_valid_count[n] * 6 + p_out[out_base_idx] = cls_id[0] - 1.0 + p_out[out_base_idx + 1] = score[0] + offset = i * 4 + p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \ + p_out[out_base_idx + 5] = transform_loc(p_loc_pred, n * num_anchors * 4 + offset, + p_anchor, offset, clip, variances[0], + variances[1], variances[2], variances[3]) + p_valid_count[n] += 1 + + return ib.get() + + +@multibox_transform_loc.register(["cuda", "gpu"]) +def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, + variances=(0.1, 0.1, 0.2, 0.2)): + """Location transformation for multibox detection + + Parameters + ---------- + cls_prob : tvm.Tensor + Class probabilities. + + loc_pred : tvm.Tensor + Location regression predictions. + + anchor : tvm.Tensor + Prior anchor boxes. + + clip : boolean + Whether to clip out-of-boundary boxes. + + threshold : float + Threshold to be a positive prediction. + + variances : tuple of float + Variances to be decoded from box regression output. + + Returns + ------- + out : tvm.Tensor + 3-D tensor with shape (batch_size, num_anchors, 6) + + valid_count : tvm.Tensor + 1-D tensor with shape (batch_size,), number of valid anchor boxes. 
+ """ + batch_size = cls_prob.shape[0] + num_anchors = anchor.shape[1] + oshape = (batch_size, num_anchors, 6) + # Define data alignment for intermediate buffer + valid_count_dtype = "int32" + valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype, + "valid_count_buf", data_alignment=4) + out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8) + valid_count, out = \ + tvm.extern([(batch_size,), oshape], + [cls_prob, loc_pred, anchor], + lambda ins, outs: transform_loc_ir( + ins[0], ins[1], ins[2], outs[0], outs[1], clip, threshold, variances), + dtype=[valid_count_dtype, cls_prob.dtype], + out_buffers=[valid_count_buf, out_buf], + tag="multibox_transform_loc") + return out, valid_count + + +@multibox_detection.register(["cuda", "gpu"]) +def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, + force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): + """Convert multibox detection predictions. + + Parameters + ---------- + cls_prob : tvm.Tensor + Class probabilities. + + loc_pred : tvm.Tensor + Location regression predictions. + + anchor : tvm.Tensor + Prior anchor boxes. + + clip : boolean + Whether to clip out-of-boundary boxes. + + nms_threshold : float + Non-maximum suppression threshold. + + force_suppress : boolean + Whether to suppress all detections regardless of class_id. + + threshold : float + Threshold to be a positive prediction. + + variances : tuple of float + Variances to be decoded from box regression output. + + nms_topk : int + Keep maximum top k detections before nms, -1 for no limit. + + Returns + ------- + out : tvm.Tensor + 3-D tensor with shape (batch_size, num_anchors, 6) + """ + inter_out, valid_count = mutibox_transform_loc(cls_prob, loc_pred, anchor, + clip, threshold, variances) + out = nms(inter_out, valid_count, nms_threshold, force_suppress, nms_topk) + return out diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 106d13665ad8..64774fc79254 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -4,6 +4,8 @@ import tvm from .. import generic from .. import cpp +from .. import tag +import topi @generic.schedule_reorg.register(["cuda", "gpu"]) def schedule_reorg(outs): @@ -42,7 +44,7 @@ def schedule_region(outs): return cpp.cuda.schedule_region(cpp_target, outs) @generic.schedule_multibox_prior.register(["cuda", "gpu"]) -def schedule_multibox_prior(out): +def schedule_multibox_prior(outs): """Schedule for multibox_prior operator. Parameters @@ -56,10 +58,31 @@ def schedule_multibox_prior(out): s: Schedule The computation schedule for multibox_prior. 
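A usage sketch for the GPU path registered below, mirroring the unit test updated later in this series (the shapes, sizes and ratios here are illustrative):

import tvm
import topi

data = tvm.placeholder((1, 32, 19, 19), name="data")
with tvm.target.create("cuda"):
    # output shape: (1, 19 * 19 * (num_sizes + num_ratios - 1), 4) = (1, 1083, 4)
    anchors = topi.cuda.ssd.multibox_prior(data, sizes=(1.0, 0.5), ratios=(1.0, 2.0), clip=True)
    s = topi.generic.schedule_multibox_prior(anchors)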
""" - raise RuntimeError("Currently multibox_prior only supports CPU.") + target = tvm.target.current_target() + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + def traverse(op): + """inline all one-to-one-mapping operators except the last stage (output)""" + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + #TODO: should be injected automatically + else: + x = op.output(0) + fused = s[x].fuse(*s[x].op.axis) + num_thread = tvm.target.current_target(allow_none=False).max_num_threads + bx, tx = s[x].split(fused, factor=num_thread) + s[x].bind(bx, tvm.thread_axis("blockIdx.x")) + s[x].bind(tx, tvm.thread_axis("threadIdx.x")) + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + traverse(outs[0].op) + return s @generic.schedule_multibox_detection.register(["cuda", "gpu"]) -def schedule_multibox_detection(out): +def schedule_multibox_detection(outs): """Schedule for multibox_detection operator. Parameters diff --git a/topi/tests/python/test_topi_clip.py b/topi/tests/python/test_topi_clip.py index 52da4922e1d6..041565433bcc 100644 --- a/topi/tests/python/test_topi_clip.py +++ b/topi/tests/python/test_topi_clip.py @@ -20,23 +20,27 @@ def get_ref_data(): a_np, b_np = get_ref_data() def check_device(device): - if not tvm.module.enabled(device): + ctx = tvm.context(device, 0) + if not ctx.exist: print("Skip because %s is not enabled" % device) return - ctx = tvm.cpu(0) if device == "llvm" else tvm.gpu(0) + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_injective(B) + a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) f = tvm.build(s, [A, B], device, name="clip") f(a, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in ['llvm']: + for device in ['llvm', 'opencl']: check_device(device) def test_clip(): - verify_clip(1024, -127, 127, 'int8') - verify_clip(1024, -127, 127, 'int16') verify_clip(1024, -127, 127, 'float32') + verify_clip(1024, -127, 127, 'int16') + verify_clip(1024, -127, 127, 'int8') if __name__ == "__main__": diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 3c624726d562..9ef5445dc5e8 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -46,7 +46,6 @@ def check_device(device): def verify_multibox_prior(dshape, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): data = tvm.placeholder(dshape, name="data") - out = ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) dtype = data.dtype input_data = np.random.uniform(size=dshape).astype(dtype) @@ -88,15 +87,19 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): + if device == 'llvm': + out = ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) + else: + out = topi.cuda.ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) s = topi.generic.schedule_multibox_prior(out) tvm_input_data = tvm.nd.array(input_data, ctx) tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), ctx) f = tvm.build(s, [data, out], device) f(tvm_input_data, tvm_out) - np.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-4) + np.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-3) - for device in ['llvm']: + for device in ['llvm', 'opencl']: check_device(device) @@ -146,6 +149,6 @@ def check_device(device): 
if __name__ == "__main__": - test_nms() +# test_nms() test_multibox_prior() - test_multibox_detection() \ No newline at end of file +# test_multibox_detection() From 0aef45e886198e02ddcae0089c511fc5acd461b9 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Wed, 13 Jun 2018 13:49:23 -0700 Subject: [PATCH 13/19] topi vision test typo fixed --- topi/tests/python/test_topi_vision.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 9ef5445dc5e8..8ee594ddd35b 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -149,6 +149,6 @@ def check_device(device): if __name__ == "__main__": -# test_nms() + test_nms() test_multibox_prior() -# test_multibox_detection() + test_multibox_detection() From d2f3948083d86d2d5e97f44d55f0c3ab36735efa Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Fri, 15 Jun 2018 10:58:53 -0700 Subject: [PATCH 14/19] default gpu schedule added for vision --- topi/python/topi/cuda/vision.py | 42 +++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 64774fc79254..da787c64e2ee 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -7,6 +7,31 @@ from .. import tag import topi +def _default_schedule(outs): + """Default schedule for gpu.""" + target = tvm.target.current_target() + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + def traverse(op): + """inline all one-to-one-mapping operators except the last stage (output)""" + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + #TODO: should be injected automatically + else: + x = op.output(0) + fused = s[x].fuse(*s[x].op.axis) + num_thread = tvm.target.current_target(allow_none=False).max_num_threads + bx, tx = s[x].split(fused, factor=num_thread) + s[x].bind(bx, tvm.thread_axis("blockIdx.x")) + s[x].bind(tx, tvm.thread_axis("threadIdx.x")) + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + traverse(outs[0].op) + return s + @generic.schedule_reorg.register(["cuda", "gpu"]) def schedule_reorg(outs): """Schedule for reorg operator. @@ -43,6 +68,23 @@ def schedule_region(outs): cpp_target = cpp.TEST_create_target(target.target_name) return cpp.cuda.schedule_region(cpp_target, outs) +@generic.schedule_nms.register(["cuda", "gpu"]) +def schedule_nms(outs): + """Schedule for non-maximum suppression + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of nms + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs) + @generic.schedule_multibox_prior.register(["cuda", "gpu"]) def schedule_multibox_prior(outs): """Schedule for multibox_prior operator. 
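The _default_schedule helper introduced in this patch simply flattens every axis of the output and binds the flattened loop to one grid of CUDA threads. A minimal standalone sketch of that fuse/split/bind pattern (the elementwise op and the hard-coded factor of 64 are illustrative; the real helper queries max_num_threads from the current target):

import tvm

n = tvm.var("n")
A = tvm.placeholder((n, 6), name="A")
B = tvm.compute((n, 6), lambda i, j: A[i, j] + 1.0, name="B")

s = tvm.create_schedule(B.op)
fused = s[B].fuse(*s[B].op.axis)            # flatten all output axes into one
bx, tx = s[B].split(fused, factor=64)       # 64 stands in for target.max_num_threads
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
print(tvm.lower(s, [A, B], simple_mode=True))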
From 52eb115c2d44b67cf3b68f8874bd4d0e9a81b218 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 21 Jun 2018 10:26:34 -0700 Subject: [PATCH 15/19] nms gpu added --- topi/python/topi/cuda/__init__.py | 1 + topi/python/topi/cuda/nms.py | 328 ++++++++++++++++++++++++++++++ 2 files changed, 329 insertions(+) create mode 100644 topi/python/topi/cuda/nms.py diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index dff76fc4ebf5..c0e3c5e173f2 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -21,3 +21,4 @@ from .vision import schedule_multibox_prior from . import ssd from .ssd import multibox_prior +from .nms import * diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py new file mode 100644 index 000000000000..f7e65040b555 --- /dev/null +++ b/topi/python/topi/cuda/nms.py @@ -0,0 +1,328 @@ +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +"""Non-maximum suppression operator""" +import tvm + +from tvm import api +from topi.vision import nms +import math +import numpy as np + +def sort_ir(data, index, output, axis, is_descend): + def swap(a, b): + a, b = b, a + max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + tx = tvm.thread_axis("threadIdx.x") + ty = tvm.thread_axis("threadIdx.y") + bx = tvm.thread_axis("blockIdx.x") + by = tvm.thread_axis("blockIdx.y") + ib = tvm.ir_builder.create() + p_data = ib.buffer_ptr(data) + p_index = ib.buffer_ptr(index) + p_out = ib.buffer_ptr(output) + dshape = p_index[index.shape[0]-1] + data_new = ib.allocate("float32", dshape, name="data_new", scope="global") + index_new = ib.allocate("int32", dshape, name="index_new", scope="global") +# p_index_new = ib.buffer_ptr(index_new) +# p_data_new = ib.buffer_ptr(data_new) + ndim = len(data.shape) + assert data.dtype == "float32", "Currently only supports input dtype to be float32" + assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim + + axis_mul_before = 1 + axis_mul_after = 1 + if axis < 0: + axis = ndim + axis + for i in range(0, ndim): + if i < axis: + axis_mul_before *= data.shape[i] + elif i > axis: + axis_mul_after *= data.shape[i] + + nthread_tx = max_threads + nthread_bx = axis_mul_before // max_threads + 1 + nthread_ty = max_threads + nthread_by = axis_mul_after // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(ty, "thread_extent", nthread_ty) + ib.scope_attr(bx, "thread_extent", nthread_bx) + ib.scope_attr(by, "thread_extent", nthread_by) + i = bx * max_threads + tx + j = by * max_threads + ty + tid = i * nthread_tx * nthread_bx + j + + with ib.if_scope(i < axis_mul_before): + with ib.if_scope(j < axis_mul_after): + current_sort_num = p_index[i * axis_mul_after + j] + base_idx = i * data.shape[axis] * axis_mul_after + j + with ib.for_range(0, current_sort_num, name = "k") as k: + full_idx = base_idx + k * axis_mul_after + index_new[k] = k + data_new[k] = p_data[full_idx] + # sync_threads + # sorting + size = current_sort_num + with ib.if_scope(tid < size - 1): + with ib.for_range(0, size - 1, name = "level") as level: + with ib.if_scope(tid % 2 == (level & 1)): + with ib.if_scope(~((data_new[tid] < data_new[tid + 1]) ^ is_descend)): + swap(data_new[tid], data_new[tid+1]) + swap(index_new[tid], index_new[tid+1]) + #convert back + with ib.for_range(0, data.shape[axis], for_type = "unroll", name = "l") as l: + p_out[base_idx + l * axis_mul_after] = tvm.select(k < size, index_new[l], l) + + body = 
ib.get() + input(body) + return body + + +def OddEvenTransposeSort_ir(data, index, is_descend): + """ Low level IR routing for sorting operation on GPUs. + + Parameters + ---------- + ---------- + """ + def swap(a, b): + a, b = b, a + + max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib = tvm.ir_builder.create() + p_data = ib.buffer_ptr(data) + p_index = ib.buffer_ptr(index) + in_size = data.shape + nthread_tx = max_threads + nthread_bx = in_size // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + with ib.if_scope(tid < in_size - 1): + with ib.for_range(0, in_size - 1, name = "level", for_type = "unroll") as level: + with ib.if_scope(tid % 2 == (1 & level)): + with ib.if_scope(~(is_descend ^ (p_data[tid] < p_data[tid+1]))): # xnor comp + # doing swap on global mem which is non-efficient + swap(p_data[tid], p_data[tid+1]) + swap(p_index[tid], p_index[tid+1]) + body = ib.get() + return body + + +def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk): + """Low level IR routing for transform location in multibox_detection operator. + + Parameters + ---------- + data: Buffer + Buffer of output boxes with class and score. + + sort_result : Buffer + Buffer of output box indexes sorted by score. + + valid_count : Buffer + Buffer of number of valid output boxes. + + out : Buffer + Output buffer. + + nms_threshold : float + Non-maximum suppression threshold. + + force_suppress : boolean + Whether to suppress all detections regardless of class_id. + + nms_topk : int + Keep maximum top k detections before nms, -1 for no limit. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + def calculate_overlap(out_tensor, box_a_idx, box_b_idx): + """Calculate overlap of two boxes. 
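The overlap computed here is the usual intersection-over-union of two corner-encoded boxes; a NumPy sketch of the same arithmetic (illustrative only):

import numpy as np

def iou(a, b):
    # a, b: boxes as [xmin, ymin, xmax, ymax]
    w = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    h = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = w * h
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return 0.0 if union <= 0.0 else inter / union

print(iou(np.array([0.1, 0.1, 0.5, 0.5]), np.array([0.2, 0.2, 0.6, 0.6])))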
+ """ + w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) + - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx])) + h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) + - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1])) + i = w * h + u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \ + (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \ + (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \ + (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i + return tvm.select(u <= 0.0, 0.0, i / u) + + max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + tx = tvm.thread_axis("threadIdx.x") + ty = tvm.thread_axis("threadIdx.y") + bx = tvm.thread_axis("blockIdx.x") + by = tvm.thread_axis("blockIdx.y") + ib = tvm.ir_builder.create() + p_data = ib.buffer_ptr(data) + p_sort_result = ib.buffer_ptr(sort_result) + p_valid_count = ib.buffer_ptr(valid_count) + p_out = ib.buffer_ptr(out) + batch_size = out.shape[0] + num_anchors = out.shape[1] + nthread_tx = max_threads + nthread_bx = num_anchors // max_threads + 1 + nthread_ty = max_threads + nthread_by = 6 // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(ty, "thread_extent", nthread_ty) + ib.scope_attr(bx, "thread_extent", nthread_bx) + ib.scope_attr(by, "thread_extent", nthread_by) + i = bx * max_threads + tx + j = by * max_threads + ty + + nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold) + nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk) + force_suppress_node = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0) + with ib.for_range(0, batch_size, for_type="unroll", name="n") as n: + with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1, + p_valid_count[0] > 0)): + # Reorder output + nkeep = tvm.select(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]), + nms_topk, p_valid_count[n]) + with ib.if_scope(i < nkeep): + with ib.if_scope(j < 6): + p_out[(n * num_anchors * 6 + + i * 6 + j)] = p_data[(n * num_anchors * 6 + + p_sort_result[n * num_anchors + i] * 6 + j)] + with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])): + with ib.if_scope(i < p_valid_count[n] - nkeep): + with ib.if_scope(j < 6): + p_out[(n * num_anchors * 6 + + (i + nkeep) * 6 + j)] = p_data[(n * num_anchors * 6 + + (i + nkeep) * 6 + j)] + # Apply nms + with ib.if_scope(i < p_valid_count[n]): + offset_i = i * 6 + with ib.if_scope(p_out[n * num_anchors * 6 + offset_i] >= 0): + with ib.if_scope(j < p_valid_count[n]): + offset_j = j * 6 + with ib.if_scope(tvm.all(j > i, p_out[n * num_anchors * 6 + + offset_j] >= 0)): + with ib.if_scope(tvm.any(force_suppress_node > 0, + p_out[n * num_anchors * 6 + offset_i] == + p_out[n * num_anchors * 6 + offset_j])): + # When force_suppress == True or class_id equals + iou = calculate_overlap(p_out, n * num_anchors * 6 + offset_i + 2, + n * num_anchors * 6 + offset_j + 2) + with ib.if_scope(iou >= nms_threshold): + p_out[n * num_anchors * 6 + offset_j] = -1.0 + with ib.else_scope(): + with ib.if_scope(i < p_valid_count[n]): + with ib.if_scope(j < 6): + p_out[(n * num_anchors * 6 + + i * 6 + j)] = p_data[n * num_anchors * 6 + i * 6 + j] + # Set invalid entry to be -1 + with ib.if_scope(i < num_anchors - p_valid_count[n]): + with ib.if_scope(j < 6): + p_out[n * num_anchors * 6 + (i + p_valid_count[n]) * 6 + j] = -1.0 + body = ib.get() + 
input(body) + return body + +#@sort.register("cuda") +def sort(data, index, is_descend): + oshape = data.shape + out = tvm.extern(oshape, [data, index], lambda ins, outs: + OddEvenTransposeSort_ir(ins[0], ins[1], is_descend), + tag="sort_gpu") + return out + +@nms.register("cuda") +def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): + """Non-maximum suppression operator for object detection. + + Parameters + ---------- + data: tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors, 6]. + The last dimension should be in format of + [class_id, score, box_left, box_top, box_right, box_bottom]. + + valid_count : tvm.Tensor + 1-D tensor for valid number of boxes. + + nms_threshold : float + Non-maximum suppression threshold. + + force_suppress : boolean + Whether to suppress all detections regardless of class_id. + + nms_topk : int + Keep maximum top k detections before nms, -1 for no limit. + + Returns + ------- + out : tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors, 6]. + + Example + -------- + .. code-block:: python + + # An example to use nms + dshape = (1, 5, 6) + data = tvm.placeholder(dshape, name="data") + valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") + nms_threshold = 0.7 + force_suppress = True + nms_topk = -1 + out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) + np_data = np.random.uniform(dshape) + np_valid_count = np.array([4]) + s = topi.generic.schedule_nms(out) + f = tvm.build(s, [data, valid_count, out], "llvm") + ctx = tvm.cpu() + tvm_data = tvm.nd.array(np_data, ctx) + tvm_valid_count = tvm.nd.array(np_valid_count, ctx) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) + f(tvm_data, tvm_valid_count, tvm_out) + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + valid_count_dtype = "int32" + valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype, + "valid_count_buf", data_alignment=4) + data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + score_axis = 1 + score_shape = (batch_size, num_anchors) + score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis]) + score_tensor_buf = api.decl_buffer(score_tensor.shape, data.dtype, + "score_tensor_buf", data_alignment=8) + sort_tensor_dtype = "int32" + sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype, + "sort_tensor_buf", data_alignment=8) + +# dshape = (valid_count[valid_count.shape[0]-1],) +# sorter = tvm.placeholder(dshape, name = "sorter", dtype = "float32") +# index = tvm.placeholder(dshape, name = "index", dtype = "int32") +# sorter_buf = api.decl_buffer(sorter.shape, sorter.dtype, +# "sorter_data_buf", data_alignment=8) +# index_buf = api.decl_buffer(index.shape, index.dtype, +# "sorter_index_buf", data_alignment=8) + + sort_tensor = \ + tvm.extern(score_shape, + [score_tensor, valid_count], + lambda ins, outs: sort_ir( + ins[0], ins[1], outs[0], score_axis, True), + dtype=sort_tensor_dtype, + in_buffers=[score_tensor_buf, valid_count_buf], + out_buffers=sort_tensor_buf, + name="nms_sort") + out = \ + tvm.extern(data.shape, + [data, sort_tensor, valid_count], + lambda ins, outs: nms_ir( + ins[0], ins[1], ins[2], outs[0], nms_threshold, + force_suppress, nms_topk), + dtype="float32", + in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], + tag="nms") + return out From 172167a24f5babd6541f70aa9141974c1fb76a8b Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 3 Jul 2018 11:35:24 -0700 Subject: [PATCH 16/19] sorting 
debugged, detection completed --- topi/python/topi/cuda/nms.py | 157 +++++++++++++------------- topi/python/topi/cuda/ssd/multibox.py | 86 +++++++++----- topi/python/topi/cuda/vision.py | 33 ++---- topi/tests/python/test_topi_vision.py | 14 ++- 4 files changed, 156 insertions(+), 134 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index f7e65040b555..10f3ec2683d9 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -6,11 +6,19 @@ from topi.vision import nms import math import numpy as np +from ..nn import tag def sort_ir(data, index, output, axis, is_descend): def swap(a, b): a, b = b, a - max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + def OETS(ib, tid, data_new, index_new, is_descend, size): + with ib.if_scope(tid < size - 1): + with ib.for_range(0, size - 1, name = "level") as level: + with ib.if_scope(tid % 2 == (level & 1)): + with ib.if_scope(~((data_new[tid] < data_new[tid + 1]) ^ is_descend)): + swap(data_new[tid], data_new[tid+1]) + swap(index_new[tid], index_new[tid+1]) + max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) tx = tvm.thread_axis("threadIdx.x") ty = tvm.thread_axis("threadIdx.y") bx = tvm.thread_axis("blockIdx.x") @@ -19,11 +27,6 @@ def swap(a, b): p_data = ib.buffer_ptr(data) p_index = ib.buffer_ptr(index) p_out = ib.buffer_ptr(output) - dshape = p_index[index.shape[0]-1] - data_new = ib.allocate("float32", dshape, name="data_new", scope="global") - index_new = ib.allocate("int32", dshape, name="index_new", scope="global") -# p_index_new = ib.buffer_ptr(index_new) -# p_data_new = ib.buffer_ptr(data_new) ndim = len(data.shape) assert data.dtype == "float32", "Currently only supports input dtype to be float32" assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim @@ -38,73 +41,80 @@ def swap(a, b): elif i > axis: axis_mul_after *= data.shape[i] - nthread_tx = max_threads - nthread_bx = axis_mul_before // max_threads + 1 - nthread_ty = max_threads - nthread_by = axis_mul_after // max_threads + 1 - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(ty, "thread_extent", nthread_ty) - ib.scope_attr(bx, "thread_extent", nthread_bx) - ib.scope_attr(by, "thread_extent", nthread_by) - i = bx * max_threads + tx - j = by * max_threads + ty - tid = i * nthread_tx * nthread_bx + j - - with ib.if_scope(i < axis_mul_before): - with ib.if_scope(j < axis_mul_after): - current_sort_num = p_index[i * axis_mul_after + j] - base_idx = i * data.shape[axis] * axis_mul_after + j - with ib.for_range(0, current_sort_num, name = "k") as k: - full_idx = base_idx + k * axis_mul_after - index_new[k] = k - data_new[k] = p_data[full_idx] - # sync_threads - # sorting - size = current_sort_num - with ib.if_scope(tid < size - 1): - with ib.for_range(0, size - 1, name = "level") as level: - with ib.if_scope(tid % 2 == (level & 1)): - with ib.if_scope(~((data_new[tid] < data_new[tid + 1]) ^ is_descend)): - swap(data_new[tid], data_new[tid+1]) - swap(index_new[tid], index_new[tid+1]) - #convert back - with ib.for_range(0, data.shape[axis], for_type = "unroll", name = "l") as l: - p_out[base_idx + l * axis_mul_after] = tvm.select(k < size, index_new[l], l) - - body = ib.get() - input(body) - return body - - -def OddEvenTransposeSort_ir(data, index, is_descend): - """ Low level IR routing for sorting operation on GPUs. 
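The rewritten sort_ir in this hunk first turns the per-row valid counts into segment offsets with a step-doubling (Hillis-Steele style) prefix sum, then runs an odd-even transposition sort on each row's segment of the flat scratch buffers. Per row, the net effect is an argsort of the first valid_count scores, with the remaining slots keeping their own index; a NumPy reference of that result (illustrative only):

import numpy as np

def sort_ref(scores, valid_count, is_descend=True):
    # scores: (batch, num_anchors); valid_count: (batch,)
    batch, num_anchors = scores.shape
    out = np.zeros((batch, num_anchors), dtype="int32")
    for n in range(batch):
        k = int(valid_count[n])
        order = np.argsort(scores[n, :k])           # indices of the valid prefix, ascending
        if is_descend:
            order = order[::-1]
        out[n, :k] = order
        out[n, k:] = np.arange(k, num_anchors)      # padding keeps its own position
    return out

print(sort_ref(np.array([[0.8, 0.7, 0.4, 0.9, 0.5]]), np.array([4])))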
- - Parameters - ---------- - ---------- - """ - def swap(a, b): - a, b = b, a + dshape = 0 + for i in range(0, len(index.shape)): + dshape += index.shape[i] + dshape = tvm.select(dshape > axis_mul_before*axis_mul_after, dshape, axis_mul_before*axis_mul_after) - max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") - ib = tvm.ir_builder.create() - p_data = ib.buffer_ptr(data) - p_index = ib.buffer_ptr(index) - in_size = data.shape + sizes_temp = ib.allocate("int32", dshape, name="sizes_temp", scope="global") + sizes = ib.allocate("int32", dshape, name="sizes", scope="global") + temp_index = ib.allocate("int32", dshape, name="temp_index", scope = "local") + temp_data = ib.allocate("float32", dshape, name="temp_data", scope = "local") + data_new = ib.allocate("float32", dshape, name="data_new", scope="global") + index_new = ib.allocate("int32", dshape, name="index_new", scope="global") nthread_tx = max_threads - nthread_bx = in_size // max_threads + 1 + nthread_bx = dshape // max_threads + 1 ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - with ib.if_scope(tid < in_size - 1): - with ib.for_range(0, in_size - 1, name = "level", for_type = "unroll") as level: - with ib.if_scope(tid % 2 == (1 & level)): - with ib.if_scope(~(is_descend ^ (p_data[tid] < p_data[tid+1]))): # xnor comp - # doing swap on global mem which is non-efficient - swap(p_data[tid], p_data[tid+1]) - swap(p_index[tid], p_index[tid+1]) + + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + sizes[tid] = p_index[tid] + sizes_temp[tid] = p_index[tid] + + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after).astype("float32"))) + 1, name="k") as k: + with ib.if_scope(tid- (tvm.const(1, "int32") << k) >= 0): + with ib.if_scope(k % 2 == 0): + sizes[tid] += sizes_temp[tid - (tvm.const(1, "int32") << k)] + sizes_temp[tid] = sizes[tid] + with ib.else_scope(): + sizes_temp[tid] += sizes[tid - (tvm.const(1, "int32") << k)] + sizes[tid] = sizes_temp[tid] +# sizes[tid] += sizes[tid - (tvm.const(1, "int32") << k)] + + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + i = tid / axis_mul_after + j = tid % axis_mul_after + current_sort_num = p_index[tid] + base_idx = i * data.shape[axis] * axis_mul_after + j + with ib.for_range(0, current_sort_num, name = "k") as k: + full_idx = base_idx + k * axis_mul_after + with ib.if_scope(tid == 0): + start = 0 + with ib.else_scope(): + start = sizes[tid-1] + index_new[start + k] = k + data_new[start + k] = p_data[full_idx] + + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + with ib.if_scope(tid == 0): + start = 0 + with ib.else_scope(): + start = sizes[tid-1] + # OddEvenTransposeSort + with ib.for_range(0, p_index[tid], name = "k") as k: + with ib.for_range(0, p_index[tid] - 1, name = "i") as i: + with ib.if_scope(i % 2 == (k & 1)): + with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) ^ is_descend) == False): + temp_data[tid] = data_new[i+start] + data_new[i+start] = data_new[i+start+1] + data_new[i+start+1] = temp_data[tid] + temp_index[tid] = index_new[i+start] + index_new[i+start] = index_new[i+start+1] + index_new[i+start+1] = temp_index[tid] + + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + i = tid / axis_mul_after + j = tid % axis_mul_after + current_sort_num = p_index[tid] + base_idx 
= i * data.shape[axis] * axis_mul_after + j + with ib.for_range(0, data.shape[axis], name = "k") as k: + with ib.if_scope(tid == 0): + start = 0 + with ib.else_scope(): + start = sizes[tid-1] + p_out[base_idx + k * axis_mul_after] = tvm.select(k < current_sort_num, index_new[k+start], k) body = ib.get() return body @@ -223,7 +233,6 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): with ib.if_scope(j < 6): p_out[n * num_anchors * 6 + (i + p_valid_count[n]) * 6 + j] = -1.0 body = ib.get() - input(body) return body #@sort.register("cuda") @@ -292,21 +301,13 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) score_axis = 1 score_shape = (batch_size, num_anchors) - score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis]) + score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis], name="score_tensor") score_tensor_buf = api.decl_buffer(score_tensor.shape, data.dtype, "score_tensor_buf", data_alignment=8) sort_tensor_dtype = "int32" sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype, "sort_tensor_buf", data_alignment=8) -# dshape = (valid_count[valid_count.shape[0]-1],) -# sorter = tvm.placeholder(dshape, name = "sorter", dtype = "float32") -# index = tvm.placeholder(dshape, name = "index", dtype = "int32") -# sorter_buf = api.decl_buffer(sorter.shape, sorter.dtype, -# "sorter_data_buf", data_alignment=8) -# index_buf = api.decl_buffer(index.shape, index.dtype, -# "sorter_index_buf", data_alignment=8) - sort_tensor = \ tvm.extern(score_shape, [score_tensor, valid_count], diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index fde2b182141e..147160333f0d 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -195,38 +195,64 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, num_anchors = cls_prob.shape[2] ib = tvm.ir_builder.create() + temp_score = ib.allocate('float32', (batch_size * (num_classes -1) * num_anchors, ), name="temp_score", scope="global") + score = ib.allocate('float32', (batch_size * num_anchors, ), name="score", scope="local") + cls_id = ib.allocate('int32', (batch_size * num_anchors, ), name="id", scope="local") + flag = ib.allocate('int32', (batch_size * num_anchors, ), name="flag", scope="global") + max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + nthread_tx = max_threads + nthread_bx = (batch_size * num_anchors * num_classes) // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx p_cls_prob = ib.buffer_ptr(cls_prob) p_loc_pred = ib.buffer_ptr(loc_pred) p_anchor = ib.buffer_ptr(anchor) p_valid_count = ib.buffer_ptr(valid_count) p_out = ib.buffer_ptr(out) - with ib.for_range(0, batch_size, for_type="parallel", name="n") as n: + with ib.if_scope(tid < batch_size * num_anchors * num_classes): + n = tid / (num_anchors * num_classes) + j = (tid % (num_anchors * num_classes)) / num_anchors + i = tid % num_anchors + with ib.if_scope(j > 0): + temp_score[n * num_anchors * num_classes + i * (num_classes - 1) + j-1] = p_cls_prob[tid] p_valid_count[n] = 0 - with ib.for_range(0, num_anchors, name="i") as i: - # Find the predicted class id and probability - score = ib.allocate('float32', (1,), name="score", 
scope="local") - cls_id = ib.allocate('int32', (1,), name="id", scope="local") - score[0] = -1.0 - cls_id[0] = 0 - with ib.for_range(0, num_classes, name="j") as j: - with ib.if_scope(j > 0): - temp = p_cls_prob[n * num_anchors * num_classes + j * num_anchors + i] - cls_id[0] = tvm.select(temp > score[0], j, cls_id[0]) - score[0] = tvm.make.Max(temp, score[0]) - with ib.if_scope(tvm.all(cls_id[0] > 0, score[0] < threshold)): - cls_id[0] = 0 - # [id, prob, xmin, ymin, xmax, ymax] - # Remove background, restore original id - with ib.if_scope(cls_id[0] > 0): - out_base_idx = n * num_anchors * 6 + p_valid_count[n] * 6 - p_out[out_base_idx] = cls_id[0] - 1.0 - p_out[out_base_idx + 1] = score[0] - offset = i * 4 - p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \ - p_out[out_base_idx + 5] = transform_loc(p_loc_pred, n * num_anchors * 4 + offset, - p_anchor, offset, clip, variances[0], - variances[1], variances[2], variances[3]) - p_valid_count[n] += 1 + with ib.if_scope(tid < batch_size * num_anchors): + n = tid / num_anchors + i = tid % num_anchors + score[tid] = -1.0 + cls_id[tid] = 0 + with ib.for_range(0, num_classes-1, name="k") as k: + temp = temp_score[tid * (num_classes-1) + k] + cls_id[tid] = tvm.select(temp > score[tid], k + 1, cls_id[tid]) + score[tid] = tvm.make.Max(temp, score[tid]) + with ib.if_scope(tvm.all(cls_id[tid] > 0, score[tid] < threshold)): + cls_id[tid] = 0 + with ib.if_scope(cls_id[tid] > 0): + flag[tid] = 1 + with ib.else_scope(): + flag[tid] = 0 + with ib.if_scope(tid < batch_size): + with ib.for_range(0, num_anchors, name="k") as k: + with ib.if_scope(k > 0): + flag[tid * num_anchors + k] += flag[tid * num_anchors + k - 1] + p_valid_count[tid] = flag[tid * num_anchors + num_anchors - 1] + with ib.if_scope(tid < batch_size * num_anchors): + n = tid / num_anchors + i = tid % num_anchors + with ib.if_scope(cls_id[tid] > 0): + with ib.if_scope(i == 0): + out_base_id = n * num_anchors * 6 + with ib.else_scope(): + out_base_idx = n * num_anchors * 6 + flag[tid - 1] * 6 + p_out[out_base_idx] = cls_id[tid] - 1.0 + p_out[out_base_idx + 1] = score[tid] + p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \ + p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4, p_anchor, i*4, + clip, variances[0], variances[1], + variances[2], variances[3]) return ib.get() @@ -258,6 +284,8 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 Returns ------- + ret : tuple of tvm.Tensor composed of + out : tvm.Tensor 3-D tensor with shape (batch_size, num_anchors, 6) @@ -280,7 +308,7 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 dtype=[valid_count_dtype, cls_prob.dtype], out_buffers=[valid_count_buf, out_buf], tag="multibox_transform_loc") - return out, valid_count + return [out, valid_count] @multibox_detection.register(["cuda", "gpu"]) @@ -322,7 +350,7 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm out : tvm.Tensor 3-D tensor with shape (batch_size, num_anchors, 6) """ - inter_out, valid_count = mutibox_transform_loc(cls_prob, loc_pred, anchor, + inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = nms(inter_out, valid_count, nms_threshold, force_suppress, nms_topk) + out = nms(inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) return out diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index da787c64e2ee..136d62dbe998 100644 --- 
a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -14,6 +14,14 @@ def _default_schedule(outs): s = tvm.create_schedule([x.op for x in outs]) def traverse(op): """inline all one-to-one-mapping operators except the last stage (output)""" + if "nms" in op.tag: + sort = op.input_tensors[1] + score = s[sort].op.input_tensors[0] + fused = s[score].fuse(*s[score].op.axis) + num_thread = tvm.target.current_target(allow_none=False).max_num_threads + bx, tx = s[score].split(fused, factor=num_thread) + s[score].bind(bx, tvm.thread_axis("blockIdx.x")) + s[score].bind(tx, tvm.thread_axis("threadIdx.x")) if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() @@ -100,28 +108,7 @@ def schedule_multibox_prior(outs): s: Schedule The computation schedule for multibox_prior. """ - target = tvm.target.current_target() - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - def traverse(op): - """inline all one-to-one-mapping operators except the last stage (output)""" - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - #TODO: should be injected automatically - else: - x = op.output(0) - fused = s[x].fuse(*s[x].op.axis) - num_thread = tvm.target.current_target(allow_none=False).max_num_threads - bx, tx = s[x].split(fused, factor=num_thread) - s[x].bind(bx, tvm.thread_axis("blockIdx.x")) - s[x].bind(tx, tvm.thread_axis("threadIdx.x")) - for tensor in op.input_tensors: - if tensor.op.input_tensors: - traverse(tensor.op) - - traverse(outs[0].op) - return s + return _default_schedule(outs) @generic.schedule_multibox_detection.register(["cuda", "gpu"]) def schedule_multibox_detection(outs): @@ -138,4 +125,4 @@ def schedule_multibox_detection(outs): s: Schedule The computation schedule for multibox_detection. 
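A usage sketch for this schedule, following the device branch added to the unit test below (the tiny shapes are illustrative; actually building for "cuda" additionally requires a CUDA-enabled TVM):

import tvm
import topi

batch_size, num_anchors, num_classes = 1, 3, 3
cls_prob = tvm.placeholder((batch_size, num_anchors, num_classes), name="cls_prob")
loc_preds = tvm.placeholder((batch_size, num_anchors * 4), name="loc_preds")
anchors = tvm.placeholder((1, num_anchors, 4), name="anchors")
with tvm.target.create("cuda"):
    out = topi.cuda.ssd.multibox_detection(cls_prob, loc_preds, anchors)
    s = topi.generic.schedule_multibox_detection(out)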
""" - raise RuntimeError("Currently multibox_detection only supports CPU.") + return _default_schedule(outs) diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 8ee594ddd35b..959b10f82ca5 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -14,7 +14,6 @@ def test_nms(): nms_threshold = 0.7 force_suppress = True nms_topk = 2 - out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], @@ -31,6 +30,10 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): + if device == 'llvm': + out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) + else: + out = topi.cuda.nms(data, valid_count, nms_threshold, force_suppress, nms_topk) s = topi.generic.schedule_nms(out) tvm_data = tvm.nd.array(np_data, ctx) @@ -40,7 +43,7 @@ def check_device(device): f(tvm_data, tvm_valid_count, tvm_out) np.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) - for device in ['llvm']: + for device in ['llvm', 'opencl']: check_device(device) @@ -116,7 +119,6 @@ def test_multibox_detection(): cls_prob = tvm.placeholder((batch_size, num_anchors, num_classes), name="cls_prob") loc_preds = tvm.placeholder((batch_size, num_anchors * 4), name="loc_preds") anchors = tvm.placeholder((1, num_anchors, 4), name="anchors") - out = ssd.multibox_detection(cls_prob, loc_preds, anchors) # Manually create test case np_cls_prob = np.array([[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45], [0.7, 0.1, 0.2]]]) @@ -134,6 +136,10 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): + if device == 'llvm': + out = ssd.multibox_detection(cls_prob, loc_preds, anchors) + else: + out = topi.cuda.ssd.multibox_detection(cls_prob, loc_preds, anchors) s = topi.generic.schedule_multibox_detection(out) tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), ctx) @@ -144,7 +150,7 @@ def check_device(device): f(tvm_cls_prob, tvm_loc_preds, tvm_anchors, tvm_out) np.testing.assert_allclose(tvm_out.asnumpy(), expected_np_out, rtol=1e-4) - for device in ['llvm']: + for device in ['llvm', 'opencl']: check_device(device) From df0a2d9d697fe436858bec46d15661efc57ee9b1 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Fri, 6 Jul 2018 13:50:39 -0700 Subject: [PATCH 17/19] minor changes --- topi/python/topi/cuda/__init__.py | 6 ++---- topi/python/topi/cuda/nms.py | 11 +---------- topi/python/topi/cuda/ssd/multibox.py | 9 +++++---- topi/python/topi/cuda/vision.py | 19 ++++++++++++++++++- topi/tests/python/test_topi_conv2d_nchw.py | 19 ++++++++++++++++++- 5 files changed, 44 insertions(+), 20 deletions(-) diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index c0e3c5e173f2..b8740f811ff7 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -15,10 +15,8 @@ from .pooling import schedule_pool, schedule_global_pool from .conv2d_transpose_nchw import schedule_conv2d_transpose_nchw from .extern import schedule_extern -from .vision import schedule_region -from .vision import schedule_reorg from .nn import schedule_lrn, schedule_l2_normalize -from .vision import schedule_multibox_prior +from .vision import * from . 
import ssd -from .ssd import multibox_prior +from .ssd import * from .nms import * diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 10f3ec2683d9..231796ecb1da 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -71,7 +71,6 @@ def OETS(ib, tid, data_new, index_new, is_descend, size): with ib.else_scope(): sizes_temp[tid] += sizes[tid - (tvm.const(1, "int32") << k)] sizes[tid] = sizes_temp[tid] -# sizes[tid] += sizes[tid - (tvm.const(1, "int32") << k)] with ib.if_scope(tid < axis_mul_before * axis_mul_after): i = tid / axis_mul_after @@ -235,15 +234,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): body = ib.get() return body -#@sort.register("cuda") -def sort(data, index, is_descend): - oshape = data.shape - out = tvm.extern(oshape, [data, index], lambda ins, outs: - OddEvenTransposeSort_ir(ins[0], ins[1], is_descend), - tag="sort_gpu") - return out - -@nms.register("cuda") +@nms.register(["cuda", "gpu"]) def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): """Non-maximum suppression operator for object detection. diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index 147160333f0d..1f9ce14c23f4 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -30,7 +30,7 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): ratios : tuple of float Tuple of ratios for anchor boxes. - steps : Tuple of int + steps : Tuple of float Priorbox step across y and x, -1 for auto calculation. offsets : tuple of int @@ -92,7 +92,7 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): return body -@multibox_prior.register("cuda") +@multibox_prior.register(["cuda", "gpu"]) def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): """Generate prior(anchor) boxes from data, sizes and ratios. @@ -107,7 +107,7 @@ def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, ratios : tuple of float Tuple of ratios for anchor boxes. - steps : Tuple of int + steps : Tuple of float Priorbox step across y and x, -1 for auto calculation. offsets : tuple of int @@ -254,7 +254,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, clip, variances[0], variances[1], variances[2], variances[3]) - return ib.get() + body = ib.get() + return body @multibox_transform_loc.register(["cuda", "gpu"]) diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 136d62dbe998..19b1db5827d8 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -25,7 +25,6 @@ def traverse(op): if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() - #TODO: should be injected automatically else: x = op.output(0) fused = s[x].fuse(*s[x].op.axis) @@ -110,6 +109,24 @@ def schedule_multibox_prior(outs): """ return _default_schedule(outs) +@generic.schedule_multibox_transform_loc.register(["cuda", "gpu"]) +def schedule_multibox_transform_loc(outs): + """Schedule for multibox_transform_loc + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of + multibox_transform_loc in the format + of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. 
+ """ + return _default_schedule(outs) + @generic.schedule_multibox_detection.register(["cuda", "gpu"]) def schedule_multibox_detection(outs): """Schedule for multibox_detection operator. diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py index 0d55a9163466..7f3935f3aad7 100644 --- a/topi/tests/python/test_topi_conv2d_nchw.py +++ b/topi/tests/python/test_topi_conv2d_nchw.py @@ -8,6 +8,8 @@ from topi.util import get_const_tuple def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): + print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) + in_height = in_width = in_size A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') @@ -59,7 +61,7 @@ def check_device(device): def test_conv2d_nchw(): - # ResNet18 worklaods + # ResNet18 workloads verify_conv2d_nchw(1, 3, 224, 64, 7, 2, 3) verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1) verify_conv2d_nchw(1, 64, 56, 64, 1, 1, 0) @@ -72,6 +74,21 @@ def test_conv2d_nchw(): verify_conv2d_nchw(1, 256, 14, 512, 3, 2, 1) verify_conv2d_nchw(1, 256, 14, 512, 1, 2, 0) verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1) + # ResNet 50 workloads + verify_conv2d_nchw(1, 64, 56, 256, 1, 1, 0) + verify_conv2d_nchw(1, 256, 56, 64, 1, 1, 0) + verify_conv2d_nchw(1, 256, 56, 128, 1, 2, 0) + verify_conv2d_nchw(1, 128, 28, 512, 1, 1, 0) + verify_conv2d_nchw(1, 256, 56, 512, 1, 2, 0) + verify_conv2d_nchw(1, 512, 28, 128, 1, 1, 0) + verify_conv2d_nchw(1, 512, 28, 256, 1, 2, 0) + verify_conv2d_nchw(1, 256, 14, 1024, 1, 1, 0) + verify_conv2d_nchw(1, 512, 28, 1024, 1, 2, 0) + verify_conv2d_nchw(1, 1024, 14, 256, 1, 1, 0) + verify_conv2d_nchw(1, 1024, 14, 512, 1, 2, 0) + verify_conv2d_nchw(1, 512, 7, 2048, 1, 2, 0) + verify_conv2d_nchw(1, 1024, 14, 2048, 1, 2, 0) + verify_conv2d_nchw(1, 2048, 7, 512, 1, 1, 0) # Vgg16 workloads verify_conv2d_nchw(1, 128, 122, 128, 3, 1, 1) # Super resolution workloads From ff87074a82fa2757300fe486f67a07cfdd09a10f Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Fri, 6 Jul 2018 19:54:59 -0700 Subject: [PATCH 18/19] pylint error fixed --- topi/python/topi/cuda/nms.py | 124 ++++++++++++++++---------- topi/python/topi/cuda/ssd/multibox.py | 27 +++--- topi/python/topi/cuda/vision.py | 3 +- 3 files changed, 95 insertions(+), 59 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 231796ecb1da..4d4e402de5c2 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -1,28 +1,42 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison """Non-maximum suppression operator""" +import math import tvm from tvm import api from topi.vision import nms -import math -import numpy as np -from ..nn import tag + def sort_ir(data, index, output, axis, is_descend): - def swap(a, b): - a, b = b, a - def OETS(ib, tid, data_new, index_new, is_descend, size): - with ib.if_scope(tid < size - 1): - with ib.for_range(0, size - 1, name = "level") as level: - with ib.if_scope(tid % 2 == (level & 1)): - with ib.if_scope(~((data_new[tid] < data_new[tid + 1]) ^ is_descend)): - swap(data_new[tid], data_new[tid+1]) - swap(index_new[tid], index_new[tid+1]) - max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + """Low level IR to do sorting on the GPU, same usage as tvm.contrib.sort.argsort on 
the CPU. + + Parameters + ---------- + data: Buffer + 2D Buffer of input boxes' score with shape [batch_size, num_anchors]. + + index : Buffer + Buffer of number of valid number of boxes. + + output : Buffer + Output buffer of indicies of sorted tensor. + + axis : int + The axis used for sorting. + + is_descend : bool + If the sorted data is in descending order. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + + max_threads = int( + tvm.target.current_target(allow_none=False).max_num_threads) tx = tvm.thread_axis("threadIdx.x") - ty = tvm.thread_axis("threadIdx.y") bx = tvm.thread_axis("blockIdx.x") - by = tvm.thread_axis("blockIdx.y") ib = tvm.ir_builder.create() p_data = ib.buffer_ptr(data) p_index = ib.buffer_ptr(index) @@ -44,12 +58,14 @@ def OETS(ib, tid, data_new, index_new, is_descend, size): dshape = 0 for i in range(0, len(index.shape)): dshape += index.shape[i] - dshape = tvm.select(dshape > axis_mul_before*axis_mul_after, dshape, axis_mul_before*axis_mul_after) + dshape = tvm.select(dshape > axis_mul_before*axis_mul_after, dshape, + axis_mul_before*axis_mul_after) - sizes_temp = ib.allocate("int32", dshape, name="sizes_temp", scope="global") + sizes_temp = ib.allocate( + "int32", dshape, name="sizes_temp", scope="global") sizes = ib.allocate("int32", dshape, name="sizes", scope="global") - temp_index = ib.allocate("int32", dshape, name="temp_index", scope = "local") - temp_data = ib.allocate("float32", dshape, name="temp_data", scope = "local") + temp_index = ib.allocate("int32", dshape, name="temp_index", scope="local") + temp_data = ib.allocate("float32", dshape, name="temp_data", scope="local") data_new = ib.allocate("float32", dshape, name="data_new", scope="global") index_new = ib.allocate("int32", dshape, name="index_new", scope="global") nthread_tx = max_threads @@ -63,13 +79,16 @@ def OETS(ib, tid, data_new, index_new, is_descend, size): sizes_temp[tid] = p_index[tid] with ib.if_scope(tid < axis_mul_before * axis_mul_after): - with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after).astype("float32"))) + 1, name="k") as k: - with ib.if_scope(tid- (tvm.const(1, "int32") << k) >= 0): + with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after) \ + .astype("float32"))) + 1, name="k") as k: + with ib.if_scope(tid - (tvm.const(1, "int32") << k) >= 0): with ib.if_scope(k % 2 == 0): - sizes[tid] += sizes_temp[tid - (tvm.const(1, "int32") << k)] + sizes[tid] += sizes_temp[tid - ( + tvm.const(1, "int32") << k)] sizes_temp[tid] = sizes[tid] with ib.else_scope(): - sizes_temp[tid] += sizes[tid - (tvm.const(1, "int32") << k)] + sizes_temp[tid] += sizes[tid - ( + tvm.const(1, "int32") << k)] sizes[tid] = sizes_temp[tid] with ib.if_scope(tid < axis_mul_before * axis_mul_after): @@ -77,7 +96,7 @@ def OETS(ib, tid, data_new, index_new, is_descend, size): j = tid % axis_mul_after current_sort_num = p_index[tid] base_idx = i * data.shape[axis] * axis_mul_after + j - with ib.for_range(0, current_sort_num, name = "k") as k: + with ib.for_range(0, current_sort_num, name="k") as k: full_idx = base_idx + k * axis_mul_after with ib.if_scope(tid == 0): start = 0 @@ -92,10 +111,11 @@ def OETS(ib, tid, data_new, index_new, is_descend, size): with ib.else_scope(): start = sizes[tid-1] # OddEvenTransposeSort - with ib.for_range(0, p_index[tid], name = "k") as k: - with ib.for_range(0, p_index[tid] - 1, name = "i") as i: + with ib.for_range(0, p_index[tid], name="k") as k: + with ib.for_range(0, p_index[tid] - 1, name="i") as i: with 
ib.if_scope(i % 2 == (k & 1)): - with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) ^ is_descend) == False): + with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) ^ + is_descend) == False): temp_data[tid] = data_new[i+start] data_new[i+start] = data_new[i+start+1] data_new[i+start+1] = temp_data[tid] @@ -108,12 +128,14 @@ def OETS(ib, tid, data_new, index_new, is_descend, size): j = tid % axis_mul_after current_sort_num = p_index[tid] base_idx = i * data.shape[axis] * axis_mul_after + j - with ib.for_range(0, data.shape[axis], name = "k") as k: + with ib.for_range(0, data.shape[axis], name="k") as k: with ib.if_scope(tid == 0): start = 0 with ib.else_scope(): start = sizes[tid-1] - p_out[base_idx + k * axis_mul_after] = tvm.select(k < current_sort_num, index_new[k+start], k) + p_out[base_idx + k * axis_mul_after] = tvm.select( + k < current_sort_num, + index_new[k+start], k) body = ib.get() return body @@ -163,7 +185,8 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i return tvm.select(u <= 0.0, 0.0, i / u) - max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + max_threads = int(math.sqrt( + tvm.target.current_target(allow_none=False).max_num_threads)) tx = tvm.thread_axis("threadIdx.x") ty = tvm.thread_axis("threadIdx.y") bx = tvm.thread_axis("blockIdx.x") @@ -186,15 +209,19 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): i = bx * max_threads + tx j = by * max_threads + ty - nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold) + nms_threshold_node = tvm.make.node( + "FloatImm", dtype="float32", value=nms_threshold) nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk) - force_suppress_node = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0) + force_suppress_node = tvm.make.node( + "IntImm", dtype="int32", value=1 if force_suppress else 0) with ib.for_range(0, batch_size, for_type="unroll", name="n") as n: - with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1, - p_valid_count[0] > 0)): + with ib.if_scope( + tvm.all(nms_threshold_node > 0, nms_threshold_node < 1, + p_valid_count[0] > 0)): # Reorder output - nkeep = tvm.select(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]), - nms_topk, p_valid_count[n]) + nkeep = tvm.select( + tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]), + nms_topk, p_valid_count[n]) with ib.if_scope(i < nkeep): with ib.if_scope(j < 6): p_out[(n * num_anchors * 6 @@ -218,10 +245,12 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): p_out[n * num_anchors * 6 + offset_i] == p_out[n * num_anchors * 6 + offset_j])): # When force_suppress == True or class_id equals - iou = calculate_overlap(p_out, n * num_anchors * 6 + offset_i + 2, - n * num_anchors * 6 + offset_j + 2) + iou = calculate_overlap( + p_out, n * num_anchors * 6 + offset_i + 2, + n * num_anchors * 6 + offset_j + 2) with ib.if_scope(iou >= nms_threshold): - p_out[n * num_anchors * 6 + offset_j] = -1.0 + p_out[ + n * num_anchors * 6 + offset_j] = -1.0 with ib.else_scope(): with ib.if_scope(i < p_valid_count[n]): with ib.if_scope(j < 6): @@ -230,12 +259,14 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): # Set invalid entry to be -1 with ib.if_scope(i < num_anchors - p_valid_count[n]): with ib.if_scope(j < 6): - p_out[n * num_anchors * 6 + (i + p_valid_count[n]) * 6 + j] = -1.0 + p_out[n * num_anchors * 6 + (i + + p_valid_count[n]) * 6 + j] = -1.0 body = 
ib.get() return body + @nms.register(["cuda", "gpu"]) -def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): +def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): """Non-maximum suppression operator for object detection. Parameters @@ -269,7 +300,8 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) # An example to use nms dshape = (1, 5, 6) data = tvm.placeholder(dshape, name="data") - valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") + valid_count = tvm.placeholder( + (dshape[0],), dtype="int32", name="valid_count") nms_threshold = 0.7 force_suppress = True nms_topk = -1 @@ -289,10 +321,12 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) valid_count_dtype = "int32" valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4) - data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + data_buf = api.decl_buffer( + data.shape, data.dtype, "data_buf", data_alignment=8) score_axis = 1 score_shape = (batch_size, num_anchors) - score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis], name="score_tensor") + score_tensor = tvm.compute( + score_shape, lambda i, j: data[i, j, score_axis], name="score_tensor") score_tensor_buf = api.decl_buffer(score_tensor.shape, data.dtype, "score_tensor_buf", data_alignment=8) sort_tensor_dtype = "int32" diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index 1f9ce14c23f4..c22e7a513d7d 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements """SSD multibox operators""" from __future__ import absolute_import as _abs import math @@ -8,10 +8,10 @@ import topi -from ..nms import nms from topi.vision.ssd import multibox_prior from topi.vision.ssd import multibox_detection from topi.vision.ssd import multibox_transform_loc +from ..nms import nms def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): """Low level IR routing for multibox_prior operator. @@ -93,7 +93,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): @multibox_prior.register(["cuda", "gpu"]) -def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): +def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), \ + offsets=(0.5, 0.5), clip=False): """Generate prior(anchor) boxes from data, sizes and ratios. 
Parameters @@ -195,7 +196,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, num_anchors = cls_prob.shape[2] ib = tvm.ir_builder.create() - temp_score = ib.allocate('float32', (batch_size * (num_classes -1) * num_anchors, ), name="temp_score", scope="global") + temp_score = ib.allocate('float32', (batch_size * (num_classes -1) * num_anchors, \ + ), name="temp_score", scope="global") score = ib.allocate('float32', (batch_size * num_anchors, ), name="score", scope="local") cls_id = ib.allocate('int32', (batch_size * num_anchors, ), name="id", scope="local") flag = ib.allocate('int32', (batch_size * num_anchors, ), name="flag", scope="global") @@ -217,7 +219,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, j = (tid % (num_anchors * num_classes)) / num_anchors i = tid % num_anchors with ib.if_scope(j > 0): - temp_score[n * num_anchors * num_classes + i * (num_classes - 1) + j-1] = p_cls_prob[tid] + temp_score[n * num_anchors * num_classes + i * (num_classes - 1) + j-1] = \ + p_cls_prob[tid] p_valid_count[n] = 0 with ib.if_scope(tid < batch_size * num_anchors): n = tid / num_anchors @@ -243,8 +246,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, n = tid / num_anchors i = tid % num_anchors with ib.if_scope(cls_id[tid] > 0): - with ib.if_scope(i == 0): - out_base_id = n * num_anchors * 6 + with ib.if_scope(tid == 0): + out_base_idx = n * num_anchors * 6 with ib.else_scope(): out_base_idx = n * num_anchors * 6 + flag[tid - 1] * 6 p_out[out_base_idx] = cls_id[tid] - 1.0 @@ -259,8 +262,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, @multibox_transform_loc.register(["cuda", "gpu"]) -def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, - variances=(0.1, 0.1, 0.2, 0.2)): +def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, + variances=(0.1, 0.1, 0.2, 0.2)): """Location transformation for multibox detection Parameters @@ -313,8 +316,8 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 @multibox_detection.register(["cuda", "gpu"]) -def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, - force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): +def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, + force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): """Convert multibox detection predictions. Parameters @@ -352,6 +355,6 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm 3-D tensor with shape (batch_size, num_anchors, 6) """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, - clip, threshold, variances) + clip, threshold, variances) out = nms(inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) return out diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 19b1db5827d8..c5d94b5ab4de 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -1,11 +1,10 @@ -# pylint: disable=invalid-name, unused-variable, unused-argument +# pylint: disable=invalid-name, unused-variable, unused-argument, no-member """Schedule for vision operators""" from __future__ import absolute_import as _abs import tvm from .. import generic from .. import cpp from .. 
import tag -import topi def _default_schedule(outs): """Default schedule for gpu.""" From e1d67e90ee5c798d394e31b953f6dd52207ab0bb Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Fri, 6 Jul 2018 19:57:40 -0700 Subject: [PATCH 19/19] removing intel gpu folder --- topi/python/topi/intel_gpu/__init__.py | 5 - topi/python/topi/intel_gpu/conv2d.py | 557 ------------------------- 2 files changed, 562 deletions(-) delete mode 100644 topi/python/topi/intel_gpu/__init__.py delete mode 100644 topi/python/topi/intel_gpu/conv2d.py diff --git a/topi/python/topi/intel_gpu/__init__.py b/topi/python/topi/intel_gpu/__init__.py deleted file mode 100644 index 336b1508f977..000000000000 --- a/topi/python/topi/intel_gpu/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# pylint: disable=redefined-builtin, wildcard-import -"""Intel Gen9 GPU specific declaration and schedules.""" -from __future__ import absolute_import as _abs - -from .conv2d import * diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py deleted file mode 100644 index c394d79f6744..000000000000 --- a/topi/python/topi/intel_gpu/conv2d.py +++ /dev/null @@ -1,557 +0,0 @@ -# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return -"""conv2d schedule on Intel GPU""" - -from __future__ import absolute_import as _abs - -import numpy as np -import tvm - -from .. import generic -from .. import util -from .. import tag -from ..nn import pad -from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, _get_workload -from ..nn.util import get_pad_tuple -from ..util import simplify - -import nnvm -import nnvm.symbol as sym -from nnvm.top import registry as reg - - -##### SCHEDULE UTILITIES ##### -def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None): - """ tile and bind 3d """ - y_factor = y_factor or z_factor - x_factor = x_factor or y_factor - zo, zi = s[tensor].split(z, z_factor) - yo, yi = s[tensor].split(y, y_factor) - xo, xi = s[tensor].split(x, x_factor) - s[tensor].reorder(zo, yo, xo ,zi, yi, xi) - - thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") - thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") - thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") - s[tensor].bind(zo, tvm.thread_axis("blockIdx.z")) - s[tensor].bind(zi, thread_z) - s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) - s[tensor].bind(yi, thread_y) - s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(xi, thread_x) - return xi, thread_z, thread_y, thread_x - -@conv2d_alter_layout.register(["intel_gpu"]) -def _alter_conv2d_layout(attrs, inputs, tinfos): - copy_inputs = [s for s in inputs] - - data = tinfos[0] - kernel = tinfos[1] - - import ast - padding = ast.literal_eval(attrs['padding']) - stride = ast.literal_eval(attrs['strides']) - - wkl = _get_workload(data, kernel, stride, padding, data.dtype) - oc_bn = 16 - - new_attrs = {k: attrs[k] for k in attrs.keys()} - new_attrs['kernel_layout'] = 'OIHW%do' % (oc_bn) - - return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) - -@conv2d_NCHWc.register(["intel_gpu"]) -def _decl_conv2d(data, kernel, num_filter, kernel_size, stride, padding, out_dtype='float32'): - """Conv2D operator for Intel GPU backend. 
- - Parameters - ---------- - data : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - - kernel : tvm.Tensor - 5-D with shape [num_filter, in_channel, filter_height, filter_width, nnum_filter_vec] - - stride : int or a list/tuple of two ints - stride size, or [stride_height, stride_width] - - padding : int or a list/tuple of two ints - padding size, or [pad_height, pad_width] - - layout : str - layout of data - - Returns - ------- - output : tvm.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" - assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." - - out_dtype = data.dtype - HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) - kernel_shape = util.get_const_tuple(kernel.shape) - if isinstance(stride, (tuple, list)): - HSTR, WSTR = stride - else: - HSTR, WSTR = stride, stride - - return _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype) - -@generic.schedule_conv2d_NCHWc.register(["intel_gpu"]) -def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding, outs): - """Schedule for conv2d_nchw for Intel GPU - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of conv2d_nchw - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for conv2d_nchw. - """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - - def traverse(op): - """inline all one-to-one-mapping operators except the last stage (output)""" - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors: - traverse(tensor.op) - if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag or "1_16" in op.tag: - _schedule_cl_spatialpack_NCHWc(s,op) - - traverse(outs[0].op) - return s - -def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16'): - batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] - num_filter, channel, kernel_h, kernel_w, nv = [util.get_const_int(x) for x in kernel.shape] - num_filter = num_filter * nv - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) - - if isinstance(stride, (tuple, list)): - stride_h, stride_w = stride - else: - stride_h, stride_w = stride, stride - - out_channel = num_filter - out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - oshape = (batch, out_channel, out_height, out_width) - pad_before = [0, 0, pad_top, pad_left] - pad_after = [0, 0, pad_down, pad_right] - temp = pad(data, pad_before, pad_after, name="pad_temp") - - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - - block_w = 0 - block_h = 0 - if stride_h == 2: - if num_filter + kernel_h == 515: - conv_tag = "4_4" - block_h = 4 - block_w = 4 - else: - conv_tag = "4_5" - block_h = 4 - block_w = 5 - elif kernel_h == 3: - if num_filter == 512: - conv_tag = "2_7" - block_h = 2 - block_w = 7 - else: - conv_tag = "2_14" - block_h = 2 - block_w = 14 - else: - conv_tag = "1_16" - block_h = 1 - block_w = 16 - - c_h = out_height - c_w = out_width - - if not out_height % block_h == 0: - c_h = (out_height // block_h + 1) * block_h 
- - if not out_width % block_w == 0: - c_w = (out_width // block_w + 1) * block_w - - cshape = (batch, out_channel // nv, c_h, c_w, nv) - - conv = tvm.compute( - cshape, - lambda nn, ff, yy, xx, vc:\ - tvm.sum( - temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * - kernel[ff, rc, ry, rx, vc].astype(out_dtype), - axis=[rc, ry, rx]), tag=conv_tag, name='conv') - - output = tvm.compute( - oshape, - lambda nn, ff, yy, xx: - conv[nn][ff//nv][yy][xx][ff%nv], - name='output_unpack', tag=conv_tag) - - return output - -def _schedule_cl_spatialpack_NCHWc(s, op): - output = op.output(0) - _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] - - conv = op.input_tensors[0] - temp = s[conv].op.input_tensors[0] - kernel = s[conv].op.input_tensors[1] - temp_W = s.cache_read(temp, "warp", [conv]) - conv_L = s.cache_write(conv, "local") - - kernel_L = s.cache_read(kernel, "local", [conv_L]) - _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] - if "1_16" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 1 - OUTPUT_BLOCK_WIDTH = 16 - elif "2_14" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 2 - OUTPUT_BLOCK_WIDTH = 14 - elif "2_7" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 2 - OUTPUT_BLOCK_WIDTH = 7 - elif "4_5" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 4 - OUTPUT_BLOCK_WIDTH = 5 - elif "4_4" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 4 - OUTPUT_BLOCK_WIDTH = 4 - - # schedule conv - z_factor = 1 - y_factor = 1 - x_factor = 16 - thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") - thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") - thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") - _, co, oh, ow, vc = s[conv].op.axis - ooh, ioh = s[conv].split(oh, factor = OUTPUT_BLOCK_HEIGHT) - oow, iow = s[conv].split(ow, factor = OUTPUT_BLOCK_WIDTH) - s[conv].reorder(_, co, ooh, oow, vc, ioh, iow) - coo, coi = s[conv].split(co, nparts = 1) - ooho, oohi = s[conv].split(ooh, factor = z_factor) - oowo, oowi = s[conv].split(oow, factor = y_factor) - vco, vci = s[conv].split(vc, factor = x_factor) - s[conv].reorder(_, coo, vco, ooho, oowo, coi, oohi, oowi, vci, ioh, iow) - s[conv].bind(oohi, thread_z) - s[conv].bind(oowi, thread_y) - s[conv].bind(vci, thread_x) - s[conv].bind(ooho, tvm.thread_axis("blockIdx.z")) - s[conv].bind(oowo, tvm.thread_axis("blockIdx.y")) - s[conv].bind(coi, tvm.thread_axis("blockIdx.x")) - - # schedule conv_L - s[conv_L].compute_at(s[conv], vci) - i, oc, h, w, vc = s[conv_L].op.axis - rc, ry, rx = s[conv_L].op.reduce_axis - if in_channel == 2048: - rco, rci = s[conv_L].split(rc, nparts = 128) - s[conv_L].unroll(rci) - s[conv_L].reorder(i, oc, rco, rci, ry, rx, vc, h, w) - s[temp_W].compute_at(s[conv_L], rco) - else: - s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) - s[temp_W].compute_at(s[conv_L], rc) - if kernel.shape[3].value != 7: - s[conv_L].unroll(ry) - s[conv_L].unroll(rx) - if kernel.shape[3].value != 7: - s[conv_L].unroll(ry) - s[conv_L].unroll(rx) - - # schedule temp - _, ci, h, w = s[temp].op.axis - tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) - - # schedule temp_W - _, ci, h, w = s[temp_W].op.axis - zo, zi = s[temp_W].split(ci, 1) - yo, yi = s[temp_W].split(h, 1) - xo, xi = s[temp_W].split(w, 16) - s[temp_W].reorder(zo, yo, xo, zi, yi, xi) - s[temp_W].bind(zi, thread_z) - s[temp_W].bind(yi, thread_y) - s[temp_W].bind(xi, thread_x) - s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) - - #schedule kernel - - # schedule kernel_L - if "2_14" in s[conv].op.tag: - s[kernel_L].compute_at(s[conv_L], ry) - else: - 
s[kernel_L].compute_at(s[conv_L], rx) - - # schedule output - if output.op in s.outputs: - out = output - else: - s[output].compute_inline() - out = s.outputs[0] - - _, co, h, w = s[out].op.axis - tile_and_bind3d(s, out, w, h, co, 4, 8, 8) - - -@conv2d.register(["intel_gpu"]) -def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): - """Conv2D operator for Intel GPU backend. - Parameters - ---------- - data : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - kernel : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] - stride : int or a list/tuple of two ints - stride size, or [stride_height, stride_width] - padding : int or a list/tuple of two ints - padding size, or [pad_height, pad_width] - layout : str - layout of data - Returns - ------- - output : tvm.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - assert layout == 'NCHW', "only support NCHW convolution on intel gpu" - assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" - assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." - - out_dtype = data.dtype - HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) - kernel_shape = util.get_const_tuple(kernel.shape) - if isinstance(stride, (tuple, list)): - HSTR, WSTR = stride - else: - HSTR, WSTR = stride, stride - - return _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype) - -@generic.schedule_conv2d_nchw.register(["intel_gpu"]) -def schedule_conv2d_nchw(outs): - """Schedule for conv2d_nchw for Intel GPU - Parameters - ---------- - outs: Array of Tensor - The computation graph description of conv2d_nchw - in the format of an array of tensors. - Returns - ------- - s: Schedule - The computation schedule for conv2d_nchw. 
- """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - - def traverse(op): - """inline all one-to-one-mapping operators except the last stage (output)""" - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors: - traverse(tensor.op) - if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag or "1_16" in op.tag: - _schedule_cl_spatialpack(s,op) - - traverse(outs[0].op) - return s - -def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'): - batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] - num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) - - if isinstance(stride, (tuple, list)): - stride_h, stride_w = stride - else: - stride_h, stride_w = stride, stride - - out_channel = num_filter - out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - oshape = (batch, out_channel, out_height, out_width) - pad_before = [0, 0, pad_top, pad_left] - pad_after = [0, 0, pad_down, pad_right] - temp = pad(data, pad_before, pad_after, name="pad_temp") - - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - - block_w = 0 - block_h = 0 - if stride_h == 2: - if num_filter + kernel_h == 515: - conv_tag = "4_4" - block_h = 4 - block_w = 4 - else: - conv_tag = "4_5" - block_h = 4 - block_w = 5 - elif kernel_h == 3: - if num_filter == 512: - conv_tag = "2_7" - block_h = 2 - block_w = 7 - else: - conv_tag = "2_14" - block_h = 2 - block_w = 14 - else: - conv_tag = "1_16" - block_h = 1 - block_w = 16 - - c_h = out_height - c_w = out_width - - if not out_height % block_h == 0: - c_h = (out_height // block_h + 1) * block_h - - if not out_width % block_w == 0: - c_w = (out_width // block_w + 1) * block_w - - nv = 16 - cshape = (batch, out_channel // nv, c_h, c_w, nv) - kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv) - - kernel_vec = tvm.compute( - kvshape, - lambda co, ci, kh, kw, vc: - kernel[co*nv + vc][ci][kh][kw], name='kernel_vec') - - conv = tvm.compute( - cshape, - lambda nn, ff, yy, xx, vc:\ - tvm.sum( - temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * - kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), - axis=[rc, ry, rx]), tag=conv_tag, name='conv') - - output = tvm.compute( - oshape, - lambda nn, ff, yy, xx: - conv[nn][ff//nv][yy][xx][ff%nv], - name='output_unpack', tag=conv_tag) - - return output - -def _schedule_cl_spatialpack(s, op): - output = op.output(0) - _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] - - conv = op.input_tensors[0] - temp = s[conv].op.input_tensors[0] - kernel_vec = s[conv].op.input_tensors[1] - kernel = s[kernel_vec].op.input_tensors[0] - temp_W = s.cache_read(temp, "warp", [conv]) - conv_L = s.cache_write(conv, "local") - - kernel_L = s.cache_read(kernel_vec, "local", [conv_L]) - _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] - - if "1_16" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 1 - OUTPUT_BLOCK_WIDTH = 16 - elif "2_14" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 2 - OUTPUT_BLOCK_WIDTH = 14 - elif "2_7" in s[conv].op.tag: - 
OUTPUT_BLOCK_HEIGHT = 2 - OUTPUT_BLOCK_WIDTH = 7 - elif "4_5" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 4 - OUTPUT_BLOCK_WIDTH = 5 - elif "4_4" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 4 - OUTPUT_BLOCK_WIDTH = 4 - - # schedule conv - z_factor = 1 - y_factor = 1 - x_factor = 16 - thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") - thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") - thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") - _, co, oh, ow, vc = s[conv].op.axis - ooh, ioh = s[conv].split(oh, factor = OUTPUT_BLOCK_HEIGHT) - oow, iow = s[conv].split(ow, factor = OUTPUT_BLOCK_WIDTH) - s[conv].reorder(_, co, ooh, oow, vc, ioh, iow) - coo, coi = s[conv].split(co, nparts = 1) - ooho, oohi = s[conv].split(ooh, factor = z_factor) - oowo, oowi = s[conv].split(oow, factor = y_factor) - vco, vci = s[conv].split(vc, factor = x_factor) - s[conv].reorder(_, coo, vco, ooho, oowo, coi, oohi, oowi, vci, ioh, iow) - s[conv].bind(oohi, thread_z) - s[conv].bind(oowi, thread_y) - s[conv].bind(vci, thread_x) - s[conv].bind(ooho, tvm.thread_axis("blockIdx.z")) - s[conv].bind(oowo, tvm.thread_axis("blockIdx.y")) - s[conv].bind(coi, tvm.thread_axis("blockIdx.x")) - - # schedule conv_L - s[conv_L].compute_at(s[conv], vci) - i, oc, h, w, vc = s[conv_L].op.axis - rc, ry, rx = s[conv_L].op.reduce_axis - if in_channel == 2048: - rco, rci = s[conv_L].split(rc, nparts = 128) - s[conv_L].unroll(rci) - s[conv_L].reorder(i, oc, rco, rci, ry, rx, vc, h, w) - s[temp_W].compute_at(s[conv_L], rco) - else: - s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) - s[temp_W].compute_at(s[conv_L], rc) - if kernel.shape[3].value != 7: - s[conv_L].unroll(ry) - s[conv_L].unroll(rx) - - # schedule temp - _, ci, h, w = s[temp].op.axis - tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) - - # schedule temp_W - _, ci, h, w = s[temp_W].op.axis - zo, zi = s[temp_W].split(ci, 1) - yo, yi = s[temp_W].split(h, 1) - xo, xi = s[temp_W].split(w, 16) - s[temp_W].reorder(zo, yo, xo, zi, yi, xi) - s[temp_W].bind(zi, thread_z) - s[temp_W].bind(yi, thread_y) - s[temp_W].bind(xi, thread_x) - s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) - - s[kernel_vec].compute_inline() - - # schedule kernel_L - if "2_14" in s[conv].op.tag: - s[kernel_L].compute_at(s[conv_L], ry) - else: - s[kernel_L].compute_at(s[conv_L], rx) - - # schedule output - if output.op in s.outputs: - out = output - else: - s[output].compute_inline() - out = s.outputs[0] - - _, co, h, w = s[out].op.axis - tile_and_bind3d(s, out, w, h, co, 4, 8, 8)
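
For reference, the following minimal Python sketch (not part of either patch; the helper names select_output_block / padded_extent and the sample workload are illustrative only) reproduces the output-block selection and rounding logic that _decl_cl_spatialpack and _decl_cl_spatialpack_NCHWc apply before building the packed compute:

# Standalone sketch of the block selection used by the spatial-pack declarations.
# The helper names are hypothetical; the decision table mirrors the
# stride/kernel/filter checks in the deleted conv2d.py above.
def select_output_block(num_filter, kernel_h, stride_h):
    """Return (conv_tag, block_h, block_w) the way the declarations above would."""
    if stride_h == 2:
        if num_filter + kernel_h == 515:  # e.g. 512 filters with a 3x3 kernel
            return "4_4", 4, 4
        return "4_5", 4, 5
    if kernel_h == 3:
        if num_filter == 512:
            return "2_7", 2, 7
        return "2_14", 2, 14
    return "1_16", 1, 16

def padded_extent(extent, block):
    """Round an output extent up to the next multiple of the block size."""
    return extent if extent % block == 0 else (extent // block + 1) * block

# Example workload: 256 filters, 3x3 kernel, stride 1, 28x28 output.
tag, block_h, block_w = select_output_block(num_filter=256, kernel_h=3, stride_h=1)
c_h = padded_extent(28, block_h)
c_w = padded_extent(28, block_w)
print(tag, block_h, block_w, c_h, c_w)  # 2_14 2 14 28 28

The conv_tag picked here is what the schedules' traverse() later matches on ("4_4", "4_5", "2_7", "2_14", "1_16"), and the rounded c_h/c_w extents size the packed (batch, out_channel // 16, c_h, c_w, 16) conv buffer that the output_unpack stage reshapes back to NCHW.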