From 7b231823e4c6c472409cc77e5dbdb4281a938173 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 12 Apr 2018 14:55:55 -0700 Subject: [PATCH 01/19] intel gpu conv schedule added --- topi/python/topi/intel_gpu/__init__.py | 5 + topi/python/topi/intel_gpu/conv2d.py | 294 +++++++++++++++++++++++++ 2 files changed, 299 insertions(+) create mode 100644 topi/python/topi/intel_gpu/__init__.py create mode 100644 topi/python/topi/intel_gpu/conv2d.py diff --git a/topi/python/topi/intel_gpu/__init__.py b/topi/python/topi/intel_gpu/__init__.py new file mode 100644 index 000000000000..336b1508f977 --- /dev/null +++ b/topi/python/topi/intel_gpu/__init__.py @@ -0,0 +1,5 @@ +# pylint: disable=redefined-builtin, wildcard-import +"""Intel Gen9 GPU specific declaration and schedules.""" +from __future__ import absolute_import as _abs + +from .conv2d import * diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py new file mode 100644 index 000000000000..7380aeabfe81 --- /dev/null +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -0,0 +1,294 @@ +# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return +"""conv2d schedule on Intel GPU""" + +from __future__ import absolute_import as _abs + +import numpy as np +import tvm + +from .. import generic +from .. import util +from .. import tag +from ..nn import pad +from ..nn.conv2d import conv2d +from ..nn.util import get_pad_tuple +from ..util import simplify + + +##### SCHEDULE UTILITIES ##### +def fuse_and_bind(s, tensor, axis=None, num_thread=None): + """ fuse all the axis and bind to GPU threads """ + axis = axis or s[tensor].op.axis + fused = s[tensor].fuse(*axis) + max_threads = tvm.target.current_target(allow_none=False).max_num_threads + bx, tx = s[tensor].split(fused, num_thread or max_threads) + s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) + s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) + return bx, tx + +def split_and_bind(s, tensor, x, x_factor=1): + bx, tx = s[tensor].split(x, factor = x_factor) + s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) + s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) + return bx, tx + +def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None): + """ tile and bind to GPU threads """ + x_factor = x_factor or y_factor + yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor) + s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) + s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) + s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) + return yo, xo, yi, xi + +def cache_tile_and_bind3d(s, tensor, z, y, x, z_factor = 2, y_factor=None, x_factor=None): + """ tile and bind cache to GPU threads""" + x_factor = x_factor or z_factor + y_factor = y_factor or z_factor + zo, zi = s[tensor].split(z, z_factor) + yo, yi = s[tensor].split(y, y_factor) + xo, xi = s[tensor].split(x, x_factor) + s[tensor].reorder(zo, yo, xo, zi, yi, xi) + s[tensor].bind(zi, tvm.thread_axis("threadIdx.z")) + s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) + s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + return zo, yo, xo, zi, yi, xi + +def cache_tile_and_bind(s, tensor, y, x, y_factor=2, x_factor=None): + """ tile and bind cache to GPU threads""" + x_factor = x_factor or y_factor + yo, yi = s[tensor].split(y, y_factor) + xo, xi = s[tensor].split(x, x_factor) + s[tensor].reorder(yo, xo, yi, xi) + s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) + s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + return yo, xo, yi, xi + +def 
cache_split_and_bind(s, tensor, x, x_factor=1): + xo, xi = s[tensor].split(x, x_factor) + s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + return xo + +def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None): + """ tile and bind 3d """ + y_factor = y_factor or z_factor + x_factor = x_factor or y_factor + zo, zi = s[tensor].split(z, z_factor) + yo, yi = s[tensor].split(y, y_factor) + xo, xi = s[tensor].split(x, x_factor) + s[tensor].reorder(zo, yo, xo ,zi, yi, xi) + + thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") + thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") + thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") + s[tensor].bind(zo, tvm.thread_axis("blockIdx.z")) + s[tensor].bind(zi, thread_z) + s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) + s[tensor].bind(yi, thread_y) + s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) + s[tensor].bind(xi, thread_x) + return xi, thread_z, thread_y, thread_x + +@conv2d.register(["intel_gpu"]) +def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): + """Conv2D operator for Intel GPU backend. + + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + kernel : tvm.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + + layout : str + layout of data + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + assert layout == 'NCHW', "only support NCHW convolution on intel gpu" + assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" + assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." + + out_dtype = data.dtype + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + kernel_shape = util.get_const_tuple(kernel.shape) + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + + return _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype) + +@generic.schedule_conv2d_nchw.register(["intel_gpu"]) +def schedule_conv2d_nchw(outs): + """Schedule for conv2d_nchw for Intel GPU + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of conv2d_nchw + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for conv2d_nchw. 
+ """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + """inline all one-to-one-mapping operators except the last stage (output)""" + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag or "1_16" in op.tag: + _schedule_cl_spatialpack(s,op) + + traverse(outs[0].op) + return s + +def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'): + batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] + num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] + pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + stride_h, stride_w = stride + else: + stride_h, stride_w = stride, stride + + out_channel = num_filter + out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) + # pad 3 and 2 in order to achieve 1, 3, 229, 229 + pad_before = [0, 0, pad_top, pad_left] + pad_after = [0, 0, pad_down, pad_right] + temp = pad(data, pad_before, pad_after, name="pad_temp") + + rc = tvm.reduce_axis((0, in_channel), name='rc') + ry = tvm.reduce_axis((0, kernel_h), name='ry') + rx = tvm.reduce_axis((0, kernel_w), name='rx') + + if stride_h == 2: + if num_filter + kernel_h == 515: + conv_tag = "4_4" + else: + conv_tag = "4_5" + elif kernel_h == 3: + if num_filter == 512: + conv_tag = "2_7" + else: + conv_tag = "2_14" + else: + conv_tag = "1_16" + + return tvm.compute( + (batch, out_channel, out_height, out_width), + lambda nn, ff, yy, xx: tvm.sum( + temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * + kernel[ff, rc, ry, rx].astype(out_dtype), + axis=[rc, ry, rx]), tag=conv_tag) + +def _schedule_cl_spatialpack(s, op): + output = op.output(0) + temp = op.input_tensors[0] + kernel = op.input_tensors[1] +# temp_S = s.cache_read(temp, "shared", [output]) + temp_W = s.cache_read(temp, "warp", [output]) + + if output.op in s.outputs: + out = output + out_L = s.cache_write(out, "local") + else: + s[output].compute_inline() + out = s.outputs[0] + s[output].set_scope("local") + out_L = output + kernel_L = s.cache_read(kernel, "local", [out_L]) + _, _, out_height, out_width = [util.get_const_int(x) for x in out_L.shape] + _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] + if "1_16" in op.tag: + OUTPUT_BLOCK_HEIGHT = 1 + OUTPUT_BLOCK_WIDTH = 16 + num_threads_y = 1 + num_threads_x = 64 + elif "2_14" in op.tag: + OUTPUT_BLOCK_HEIGHT = 2 + OUTPUT_BLOCK_WIDTH = 14 + num_threads_y = 1 + num_threads_x = temp_h + elif "2_7" in op.tag: + OUTPUT_BLOCK_HEIGHT = 2 + OUTPUT_BLOCK_WIDTH = 7 + num_threads_y = 10 + num_threads_x = 9 + elif "4_5" in op.tag: + OUTPUT_BLOCK_HEIGHT = 4 + OUTPUT_BLOCK_WIDTH = 5 + num_threads_y = 1 + num_threads_x = 235 + elif "4_4" in op.tag: + OUTPUT_BLOCK_HEIGHT = 4 + OUTPUT_BLOCK_WIDTH = 4 + num_threads_y = 1 + num_threads_x = 17 + + PREFETCH = 4 + SUBGROUP_SIZE = 16 + IN_BLOCK_ARRAY_SIZE = 13 + STRIDE_SIZE_Y = out_height // OUTPUT_BLOCK_HEIGHT + STRIDE_SIZE_X = out_width // OUTPUT_BLOCK_WIDTH + + _, co, oh, ow = s[out].op.axis + ooh, ioh = s[out].split(oh, factor = OUTPUT_BLOCK_HEIGHT) + oow, iow = s[out].split(ow, factor = 
OUTPUT_BLOCK_WIDTH) + s[out].reorder(_, co, ooh, oow, ioh, iow) + + tx, thread_z, thread_y, thread_x = tile_and_bind3d(s, out, oow, ooh, co, 1, 1, 16) + + s[out_L].compute_at(s[out], tx) + + i, oc, h, w = s[out_L].op.axis + + oh, ih = s[out_L].split(h, factor = OUTPUT_BLOCK_HEIGHT) + ow, iw = s[out_L].split(w, factor = OUTPUT_BLOCK_WIDTH) + rc, ry, rx = s[out_L].op.reduce_axis + s[out_L].reorder(i, oc, oh, ow, rc, ry, rx, ih, iw) +# s[out_L].unroll(ry) +# s[out_L].unroll(rx) + + # data load from global to padded data global + _, ci, h, w = s[temp].op.axis + # prameter should be changed for each workload + tile_and_bind(s, temp, h, w, num_threads_y, num_threads_x) + +# s[temp_S].compute_at(s[out_L], rc) + s[temp_W].compute_at(s[out_L], rc) +# _, ci, h, w = s[temp_S].op.axis + _, ci, h, w = s[temp_W].op.axis + + zo, zi = s[temp_W].split(ci, 1) + yo, yi = s[temp_W].split(h, 1) + xo, xi = s[temp_W].split(w, 16) + s[temp_W].reorder(zo, yo, xo, zi, yi, xi) + s[temp_W].bind(zi, thread_z) + s[temp_W].bind(yi, thread_y) + s[temp_W].bind(xi, thread_x) +# s[temp_S].storage_align(s[temp_S].op.axis[2], 16, 0) + s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) + s[kernel_L].compute_at(s[out_L], rx) From ba97cbe76a564a4a74defea5a0075bc507a285d2 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Fri, 13 Apr 2018 00:53:59 -0700 Subject: [PATCH 02/19] conv2d output packing added --- topi/python/topi/intel_gpu/conv2d.py | 138 +++++++++++++++++---------- 1 file changed, 88 insertions(+), 50 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 7380aeabfe81..658be93c2443 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -175,7 +175,7 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float out_channel = num_filter out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - # pad 3 and 2 in order to achieve 1, 3, 229, 229 + oshape = (batch, out_channel, out_height, out_width) pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down, pad_right] temp = pad(data, pad_before, pad_after, name="pad_temp") @@ -184,65 +184,97 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') + block_w = 0 + block_h = 0 if stride_h == 2: if num_filter + kernel_h == 515: conv_tag = "4_4" + block_h = 4 + block_w = 4 else: conv_tag = "4_5" + block_h = 4 + block_w = 5 elif kernel_h == 3: if num_filter == 512: conv_tag = "2_7" + block_h = 2 + block_w = 7 else: conv_tag = "2_14" + block_h = 2 + block_w = 14 else: conv_tag = "1_16" + block_h = 1 + block_w = 16 - return tvm.compute( - (batch, out_channel, out_height, out_width), + c_h = 0 + c_w = 0 + + if out_height % block_h == 0: + c_h = out_height + else: + c_h = (out_height // block_h + 1) * block_h + + if out_width % block_w == 0: + c_w = out_width + else: + c_w = (out_width // block_w + 1) * block_w + + cshape = (batch, out_channel, c_h, c_w) + + conv = tvm.compute( + cshape, lambda nn, ff, yy, xx: tvm.sum( temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * kernel[ff, rc, ry, rx].astype(out_dtype), - axis=[rc, ry, rx]), tag=conv_tag) + axis=[rc, ry, rx]), tag=conv_tag, name='conv') + + output = tvm.compute( + oshape, + lambda nn, ff, yy, xx: + conv[nn][ff][yy][xx], + name='output_unpack', 
tag=conv_tag) + +# if out_height % block_h ==0 and out_width % block_w == 0: +# return conv + + return output def _schedule_cl_spatialpack(s, op): output = op.output(0) - temp = op.input_tensors[0] - kernel = op.input_tensors[1] -# temp_S = s.cache_read(temp, "shared", [output]) - temp_W = s.cache_read(temp, "warp", [output]) + _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] - if output.op in s.outputs: - out = output - out_L = s.cache_write(out, "local") - else: - s[output].compute_inline() - out = s.outputs[0] - s[output].set_scope("local") - out_L = output - kernel_L = s.cache_read(kernel, "local", [out_L]) - _, _, out_height, out_width = [util.get_const_int(x) for x in out_L.shape] + conv = op.input_tensors[0] + temp = s[conv].op.input_tensors[0] + kernel = s[conv].op.input_tensors[1] + temp_W = s.cache_read(temp, "warp", [conv]) + conv_L = s.cache_write(conv, "local") + + kernel_L = s.cache_read(kernel, "local", [conv_L]) _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] - if "1_16" in op.tag: + if "1_16" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 1 OUTPUT_BLOCK_WIDTH = 16 num_threads_y = 1 num_threads_x = 64 - elif "2_14" in op.tag: + elif "2_14" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 14 num_threads_y = 1 num_threads_x = temp_h - elif "2_7" in op.tag: + elif "2_7" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 7 num_threads_y = 10 num_threads_x = 9 - elif "4_5" in op.tag: + elif "4_5" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 5 num_threads_y = 1 num_threads_x = 235 - elif "4_4" in op.tag: + elif "4_4" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 4 num_threads_y = 1 @@ -250,38 +282,33 @@ def _schedule_cl_spatialpack(s, op): PREFETCH = 4 SUBGROUP_SIZE = 16 - IN_BLOCK_ARRAY_SIZE = 13 STRIDE_SIZE_Y = out_height // OUTPUT_BLOCK_HEIGHT STRIDE_SIZE_X = out_width // OUTPUT_BLOCK_WIDTH - _, co, oh, ow = s[out].op.axis - ooh, ioh = s[out].split(oh, factor = OUTPUT_BLOCK_HEIGHT) - oow, iow = s[out].split(ow, factor = OUTPUT_BLOCK_WIDTH) - s[out].reorder(_, co, ooh, oow, ioh, iow) - - tx, thread_z, thread_y, thread_x = tile_and_bind3d(s, out, oow, ooh, co, 1, 1, 16) - - s[out_L].compute_at(s[out], tx) - - i, oc, h, w = s[out_L].op.axis - - oh, ih = s[out_L].split(h, factor = OUTPUT_BLOCK_HEIGHT) - ow, iw = s[out_L].split(w, factor = OUTPUT_BLOCK_WIDTH) - rc, ry, rx = s[out_L].op.reduce_axis - s[out_L].reorder(i, oc, oh, ow, rc, ry, rx, ih, iw) -# s[out_L].unroll(ry) -# s[out_L].unroll(rx) - - # data load from global to padded data global + # schedule conv + _, co, oh, ow = s[conv].op.axis + ooh, ioh = s[conv].split(oh, factor = OUTPUT_BLOCK_HEIGHT) + oow, iow = s[conv].split(ow, factor = OUTPUT_BLOCK_WIDTH) + s[conv].reorder(_, co, ooh, oow, ioh, iow) + tx, thread_z, thread_y, thread_x = tile_and_bind3d(s, conv, oow, ooh, co, 1, 1, 16) + + # schedule conv_L + s[conv_L].compute_at(s[conv], tx) + i, oc, h, w = s[conv_L].op.axis + oh, ih = s[conv_L].split(h, factor = OUTPUT_BLOCK_HEIGHT) + ow, iw = s[conv_L].split(w, factor = OUTPUT_BLOCK_WIDTH) + rc, ry, rx = s[conv_L].op.reduce_axis + s[conv_L].reorder(i, oc, oh, ow, rc, ry, rx, ih, iw) +# s[conv_L].unroll(ry) +# s[conv_L].unroll(rx) + + # schedule temp _, ci, h, w = s[temp].op.axis - # prameter should be changed for each workload tile_and_bind(s, temp, h, w, num_threads_y, num_threads_x) -# s[temp_S].compute_at(s[out_L], rc) - s[temp_W].compute_at(s[out_L], rc) -# _, ci, h, w = s[temp_S].op.axis + # schedule temp_W + 
s[temp_W].compute_at(s[conv_L], rc) _, ci, h, w = s[temp_W].op.axis - zo, zi = s[temp_W].split(ci, 1) yo, yi = s[temp_W].split(h, 1) xo, xi = s[temp_W].split(w, 16) @@ -289,6 +316,17 @@ def _schedule_cl_spatialpack(s, op): s[temp_W].bind(zi, thread_z) s[temp_W].bind(yi, thread_y) s[temp_W].bind(xi, thread_x) -# s[temp_S].storage_align(s[temp_S].op.axis[2], 16, 0) s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) - s[kernel_L].compute_at(s[out_L], rx) + # schedule kernel_L + s[kernel_L].compute_at(s[conv_L], rx) + + # schedule output + if output.op in s.outputs: + out = output + else: + s[output].compute_inline() + out = s.outputs[0] + + _, co, h, w = s[out].op.axis + tile_and_bind3d(s, out, w, h, co, 1, 1, 64) + From 2579bfbded26fb1313766b046a897aa5ca255ad2 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 17 Apr 2018 16:37:36 -0700 Subject: [PATCH 03/19] intel gpu conv2d updated --- topi/python/topi/intel_gpu/conv2d.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 658be93c2443..cf219fad29f6 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -295,10 +295,8 @@ def _schedule_cl_spatialpack(s, op): # schedule conv_L s[conv_L].compute_at(s[conv], tx) i, oc, h, w = s[conv_L].op.axis - oh, ih = s[conv_L].split(h, factor = OUTPUT_BLOCK_HEIGHT) - ow, iw = s[conv_L].split(w, factor = OUTPUT_BLOCK_WIDTH) rc, ry, rx = s[conv_L].op.reduce_axis - s[conv_L].reorder(i, oc, oh, ow, rc, ry, rx, ih, iw) + s[conv_L].reorder(i, oc, rc, ry, rx, h, w) # s[conv_L].unroll(ry) # s[conv_L].unroll(rx) @@ -318,7 +316,13 @@ def _schedule_cl_spatialpack(s, op): s[temp_W].bind(xi, thread_x) s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) # schedule kernel_L - s[kernel_L].compute_at(s[conv_L], rx) + if "2_14" in s[conv].op.tag: +# i, oc, h, w = s[conv_L].op.axis +# s[conv_L].reorder(i, oc, rc, ry, h, w, rx) + s[kernel_L].compute_at(s[conv_L], ry) +# s[conv_L].vectorize(rx) + else: + s[kernel_L].compute_at(s[conv_L], rx) # schedule output if output.op in s.outputs: From c0821be6389a880bb491b80c58410068dd666cea Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 19 Apr 2018 11:19:05 -0700 Subject: [PATCH 04/19] minor changes --- topi/python/topi/intel_gpu/conv2d.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index cf219fad29f6..6e44fecc6add 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -280,11 +280,6 @@ def _schedule_cl_spatialpack(s, op): num_threads_y = 1 num_threads_x = 17 - PREFETCH = 4 - SUBGROUP_SIZE = 16 - STRIDE_SIZE_Y = out_height // OUTPUT_BLOCK_HEIGHT - STRIDE_SIZE_X = out_width // OUTPUT_BLOCK_WIDTH - # schedule conv _, co, oh, ow = s[conv].op.axis ooh, ioh = s[conv].split(oh, factor = OUTPUT_BLOCK_HEIGHT) From 436e3359b9a0a5e628d25ac632585e726d114b3a Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 3 May 2018 12:21:21 -0700 Subject: [PATCH 05/19] conv2d packing schedule fixed --- topi/python/topi/intel_gpu/conv2d.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 6e44fecc6add..55dd4dfab0ab 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -257,28 +257,18 @@ def _schedule_cl_spatialpack(s, op): if "1_16" in s[conv].op.tag: 
OUTPUT_BLOCK_HEIGHT = 1 OUTPUT_BLOCK_WIDTH = 16 - num_threads_y = 1 - num_threads_x = 64 elif "2_14" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 14 - num_threads_y = 1 - num_threads_x = temp_h elif "2_7" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 7 - num_threads_y = 10 - num_threads_x = 9 elif "4_5" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 5 - num_threads_y = 1 - num_threads_x = 235 elif "4_4" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 4 - num_threads_y = 1 - num_threads_x = 17 # schedule conv _, co, oh, ow = s[conv].op.axis @@ -292,12 +282,16 @@ def _schedule_cl_spatialpack(s, op): i, oc, h, w = s[conv_L].op.axis rc, ry, rx = s[conv_L].op.reduce_axis s[conv_L].reorder(i, oc, rc, ry, rx, h, w) -# s[conv_L].unroll(ry) -# s[conv_L].unroll(rx) + s[conv_L].unroll(ry) + s[conv_L].unroll(rx) # schedule temp + num_thread_z = 1 + num_thread_y = 16 + num_thread_x = 16 + _, ci, h, w = s[temp].op.axis - tile_and_bind(s, temp, h, w, num_threads_y, num_threads_x) + tile_and_bind3d(s, temp, ci, h, w, num_thread_z, num_thread_y, num_thread_x) # schedule temp_W s[temp_W].compute_at(s[conv_L], rc) @@ -327,5 +321,5 @@ def _schedule_cl_spatialpack(s, op): out = s.outputs[0] _, co, h, w = s[out].op.axis - tile_and_bind3d(s, out, w, h, co, 1, 1, 64) + tile_and_bind3d(s, out, w, h, co, 4, 8, 8) From e1628c118f8a5146a4f039b46cdb21d93ad5d771 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 3 May 2018 16:07:48 -0700 Subject: [PATCH 06/19] conv2d.py --- topi/python/topi/intel_gpu/conv2d.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 55dd4dfab0ab..a934aa6aaeae 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -282,8 +282,9 @@ def _schedule_cl_spatialpack(s, op): i, oc, h, w = s[conv_L].op.axis rc, ry, rx = s[conv_L].op.reduce_axis s[conv_L].reorder(i, oc, rc, ry, rx, h, w) - s[conv_L].unroll(ry) - s[conv_L].unroll(rx) + if kernel.shape[3].value != 7: + s[conv_L].unroll(ry) + s[conv_L].unroll(rx) # schedule temp num_thread_z = 1 @@ -306,10 +307,7 @@ def _schedule_cl_spatialpack(s, op): s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) # schedule kernel_L if "2_14" in s[conv].op.tag: -# i, oc, h, w = s[conv_L].op.axis -# s[conv_L].reorder(i, oc, rc, ry, h, w, rx) s[kernel_L].compute_at(s[conv_L], ry) -# s[conv_L].vectorize(rx) else: s[kernel_L].compute_at(s[conv_L], rx) From c1b83d95a22900f5457401e4ffb9da5812a3435c Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 3 May 2018 16:56:39 -0700 Subject: [PATCH 07/19] weight reordering added --- topi/python/topi/intel_gpu/conv2d.py | 70 ++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index a934aa6aaeae..b5f06cbb21c0 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -222,24 +222,29 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float else: c_w = (out_width // block_w + 1) * block_w - cshape = (batch, out_channel, c_h, c_w) + nv = 16 + cshape = (batch, out_channel // nv, c_h, c_w, nv) + kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv) + + kernel_vec = tvm.compute( + kvshape, + lambda co, ci, kh, kw, vc: + kernel[co*nv + vc][ci][kh][kw], name='kernel_vec') conv = tvm.compute( cshape, - lambda nn, ff, yy, xx: tvm.sum( + 
lambda nn, ff, yy, xx, vc:\ + tvm.sum( temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * - kernel[ff, rc, ry, rx].astype(out_dtype), + kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), axis=[rc, ry, rx]), tag=conv_tag, name='conv') output = tvm.compute( oshape, lambda nn, ff, yy, xx: - conv[nn][ff][yy][xx], + conv[nn][ff//nv][yy][xx][ff%nv], name='output_unpack', tag=conv_tag) -# if out_height % block_h ==0 and out_width % block_w == 0: -# return conv - return output def _schedule_cl_spatialpack(s, op): @@ -248,51 +253,69 @@ def _schedule_cl_spatialpack(s, op): conv = op.input_tensors[0] temp = s[conv].op.input_tensors[0] - kernel = s[conv].op.input_tensors[1] + kernel_vec = s[conv].op.input_tensors[1] + kernel = s[kernel_vec].op.input_tensors[0] temp_W = s.cache_read(temp, "warp", [conv]) conv_L = s.cache_write(conv, "local") - kernel_L = s.cache_read(kernel, "local", [conv_L]) + kernel_L = s.cache_read(kernel_vec, "local", [conv_L]) _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] if "1_16" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 1 OUTPUT_BLOCK_WIDTH = 16 +# kernel_vec_tile = "fuse" elif "2_14" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 14 +# kernel_vec_tile = "fuse" elif "2_7" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 7 +# kernel_vec_tile = "inline" elif "4_5" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 5 +# kernel_vec_tile = "inline" elif "4_4" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 4 +# kernel_vec_tile = "fuse" # schedule conv - _, co, oh, ow = s[conv].op.axis + z_factor = 1 + y_factor = 1 + x_factor = 16 + thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") + thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") + thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") + _, co, oh, ow, vc = s[conv].op.axis ooh, ioh = s[conv].split(oh, factor = OUTPUT_BLOCK_HEIGHT) oow, iow = s[conv].split(ow, factor = OUTPUT_BLOCK_WIDTH) - s[conv].reorder(_, co, ooh, oow, ioh, iow) - tx, thread_z, thread_y, thread_x = tile_and_bind3d(s, conv, oow, ooh, co, 1, 1, 16) + s[conv].reorder(_, co, ooh, oow, vc, ioh, iow) + coo, coi = s[conv].split(co, nparts = 1) + ooho, oohi = s[conv].split(ooh, factor = z_factor) + oowo, oowi = s[conv].split(oow, factor = y_factor) + vco, vci = s[conv].split(vc, factor = x_factor) + s[conv].reorder(_, coo, vco, ooho, oowo, coi, oohi, oowi, vci, ioh, iow) + s[conv].bind(oohi, thread_z) + s[conv].bind(oowi, thread_y) + s[conv].bind(vci, thread_x) + s[conv].bind(ooho, tvm.thread_axis("blockIdx.z")) + s[conv].bind(oowo, tvm.thread_axis("blockIdx.y")) + s[conv].bind(coi, tvm.thread_axis("blockIdx.x")) # schedule conv_L - s[conv_L].compute_at(s[conv], tx) - i, oc, h, w = s[conv_L].op.axis + s[conv_L].compute_at(s[conv], vci) + i, oc, h, w, vc = s[conv_L].op.axis rc, ry, rx = s[conv_L].op.reduce_axis - s[conv_L].reorder(i, oc, rc, ry, rx, h, w) + s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) if kernel.shape[3].value != 7: s[conv_L].unroll(ry) s[conv_L].unroll(rx) # schedule temp - num_thread_z = 1 - num_thread_y = 16 - num_thread_x = 16 - _, ci, h, w = s[temp].op.axis - tile_and_bind3d(s, temp, ci, h, w, num_thread_z, num_thread_y, num_thread_x) + tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) # schedule temp_W s[temp_W].compute_at(s[conv_L], rc) @@ -305,6 +328,13 @@ def _schedule_cl_spatialpack(s, op): s[temp_W].bind(yi, thread_y) s[temp_W].bind(xi, thread_x) s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) + + #schedule kernel +# if 
kernel_vec_tile == "fuse": +# fuse_and_bind(s, kernel_vec, num_thread = 256) +# else: + s[kernel_vec].compute_inline() + # schedule kernel_L if "2_14" in s[conv].op.tag: s[kernel_L].compute_at(s[conv_L], ry) From ed68a27de269962ccf0277d6d460beaa61f113d4 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 17 May 2018 13:24:02 -0700 Subject: [PATCH 08/19] code reduce --- topi/python/topi/intel_gpu/conv2d.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index b5f06cbb21c0..6080a8c46592 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -209,17 +209,13 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float block_h = 1 block_w = 16 - c_h = 0 - c_w = 0 + c_h = out_height + c_w = out_width - if out_height % block_h == 0: - c_h = out_height - else: + if not out_height % block_h == 0: c_h = (out_height // block_h + 1) * block_h - if out_width % block_w == 0: - c_w = out_width - else: + if not out_width % block_w == 0: c_w = (out_width // block_w + 1) * block_w nv = 16 From 97f0771411d21361ea1e44cbad254b1d1468cd4d Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Mon, 21 May 2018 16:20:25 -0700 Subject: [PATCH 09/19] kernel consist packing added with improved end-to-end performance --- topi/python/topi/intel_gpu/conv2d.py | 121 ++++++++------------------- 1 file changed, 36 insertions(+), 85 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 6080a8c46592..3ac7117cc99d 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -10,66 +10,16 @@ from .. import util from .. import tag from ..nn import pad -from ..nn.conv2d import conv2d +from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, _get_workload from ..nn.util import get_pad_tuple from ..util import simplify +import nnvm +import nnvm.symbol as sym +from nnvm.top import registry as reg -##### SCHEDULE UTILITIES ##### -def fuse_and_bind(s, tensor, axis=None, num_thread=None): - """ fuse all the axis and bind to GPU threads """ - axis = axis or s[tensor].op.axis - fused = s[tensor].fuse(*axis) - max_threads = tvm.target.current_target(allow_none=False).max_num_threads - bx, tx = s[tensor].split(fused, num_thread or max_threads) - s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) - return bx, tx - -def split_and_bind(s, tensor, x, x_factor=1): - bx, tx = s[tensor].split(x, factor = x_factor) - s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) - s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) - return bx, tx - -def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None): - """ tile and bind to GPU threads """ - x_factor = x_factor or y_factor - yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor) - s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) - s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) - s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) - return yo, xo, yi, xi - -def cache_tile_and_bind3d(s, tensor, z, y, x, z_factor = 2, y_factor=None, x_factor=None): - """ tile and bind cache to GPU threads""" - x_factor = x_factor or z_factor - y_factor = y_factor or z_factor - zo, zi = s[tensor].split(z, z_factor) - yo, yi = s[tensor].split(y, y_factor) - xo, xi = s[tensor].split(x, x_factor) - s[tensor].reorder(zo, yo, xo, zi, yi, xi) - 
s[tensor].bind(zi, tvm.thread_axis("threadIdx.z")) - s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) - return zo, yo, xo, zi, yi, xi - -def cache_tile_and_bind(s, tensor, y, x, y_factor=2, x_factor=None): - """ tile and bind cache to GPU threads""" - x_factor = x_factor or y_factor - yo, yi = s[tensor].split(y, y_factor) - xo, xi = s[tensor].split(x, x_factor) - s[tensor].reorder(yo, xo, yi, xi) - s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) - return yo, xo, yi, xi - -def cache_split_and_bind(s, tensor, x, x_factor=1): - xo, xi = s[tensor].split(x, x_factor) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) - return xo +##### SCHEDULE UTILITIES ##### def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None): """ tile and bind 3d """ y_factor = y_factor or z_factor @@ -90,8 +40,27 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None s[tensor].bind(xi, thread_x) return xi, thread_z, thread_y, thread_x -@conv2d.register(["intel_gpu"]) -def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): +@conv2d_alter_layout.register(["intel_gpu"]) +def _alter_conv2d_layout(attrs, inputs, tinfos): + copy_inputs = [s for s in inputs] + + data = tinfos[0] + kernel = tinfos[1] + + import ast + padding = ast.literal_eval(attrs['padding']) + stride = ast.literal_eval(attrs['strides']) + + wkl = _get_workload(data, kernel, stride, padding, data.dtype) + oc_bn = 16 + + new_attrs = {k: attrs[k] for k in attrs.keys()} + new_attrs['kernel_layout'] = 'OIHW%do' % (oc_bn) + + return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) + +@conv2d_NCHWc.register(["intel_gpu"]) +def _decl_conv2d(data, kernel, num_filter, kernel_size, stride, padding, out_dtype='float32'): """Conv2D operator for Intel GPU backend. Parameters @@ -100,7 +69,7 @@ def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32 4-D with shape [batch, in_channel, in_height, in_width] kernel : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] + 5-D with shape [num_filter, in_channel, filter_height, filter_width, nnum_filter_vec] stride : int or a list/tuple of two ints stride size, or [stride_height, stride_width] @@ -116,7 +85,6 @@ def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32 output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ - assert layout == 'NCHW', "only support NCHW convolution on intel gpu" assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." 
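As an illustrative aside (not part of the patch series): the 'OIHW%do' kernel_layout that _alter_conv2d_layout requests above is the same repack that the kernel_vec compute in _decl_cl_spatialpack performs, i.e. kernel_vec[co, ci, kh, kw, vc] = kernel[co*nv + vc, ci, kh, kw] with nv = oc_bn = 16. A minimal numpy sketch with a hypothetical kernel shape:

import numpy as np

nv = 16                                                   # oc_bn chosen above
kernel = np.random.rand(64, 32, 3, 3).astype('float32')   # OIHW, hypothetical shape
# OIHW -> OIHW16o: split the output-channel axis by nv and move the inner factor last,
# matching kernel_vec[co, ci, kh, kw, vc] = kernel[co*nv + vc, ci, kh, kw]
kernel_vec = kernel.reshape(64 // nv, nv, 32, 3, 3).transpose(0, 2, 3, 4, 1)
assert kernel_vec.shape == (4, 32, 3, 3, 16)
assert kernel_vec[1, 5, 2, 0, 3] == kernel[1 * nv + 3, 5, 2, 0]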
@@ -128,10 +96,10 @@ def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32 else: HSTR, WSTR = stride, stride - return _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype) + return _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype) -@generic.schedule_conv2d_nchw.register(["intel_gpu"]) -def schedule_conv2d_nchw(outs): +@generic.schedule_conv2d_NCHWc.register(["intel_gpu"]) +def schedule_conv2d_nchw(num_filter, kernel_size, stride, padding, outs): """Schedule for conv2d_nchw for Intel GPU Parameters @@ -162,9 +130,10 @@ def traverse(op): traverse(outs[0].op) return s -def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'): +def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype='float16'): batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] - num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] + num_filter, channel, kernel_h, kernel_w, nv = [util.get_const_int(x) for x in kernel.shape] + num_filter = num_filter * nv pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) if isinstance(stride, (tuple, list)): @@ -218,21 +187,14 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float if not out_width % block_w == 0: c_w = (out_width // block_w + 1) * block_w - nv = 16 cshape = (batch, out_channel // nv, c_h, c_w, nv) - kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv) - - kernel_vec = tvm.compute( - kvshape, - lambda co, ci, kh, kw, vc: - kernel[co*nv + vc][ci][kh][kw], name='kernel_vec') conv = tvm.compute( cshape, lambda nn, ff, yy, xx, vc:\ tvm.sum( temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * - kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), + kernel[ff, rc, ry, rx, vc].astype(out_dtype), axis=[rc, ry, rx]), tag=conv_tag, name='conv') output = tvm.compute( @@ -249,33 +211,27 @@ def _schedule_cl_spatialpack(s, op): conv = op.input_tensors[0] temp = s[conv].op.input_tensors[0] - kernel_vec = s[conv].op.input_tensors[1] - kernel = s[kernel_vec].op.input_tensors[0] + kernel = s[conv].op.input_tensors[1] temp_W = s.cache_read(temp, "warp", [conv]) conv_L = s.cache_write(conv, "local") - kernel_L = s.cache_read(kernel_vec, "local", [conv_L]) + kernel_L = s.cache_read(kernel, "local", [conv_L]) _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] if "1_16" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 1 OUTPUT_BLOCK_WIDTH = 16 -# kernel_vec_tile = "fuse" elif "2_14" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 14 -# kernel_vec_tile = "fuse" elif "2_7" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 2 OUTPUT_BLOCK_WIDTH = 7 -# kernel_vec_tile = "inline" elif "4_5" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 5 -# kernel_vec_tile = "inline" elif "4_4" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 4 OUTPUT_BLOCK_WIDTH = 4 -# kernel_vec_tile = "fuse" # schedule conv z_factor = 1 @@ -326,10 +282,6 @@ def _schedule_cl_spatialpack(s, op): s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) #schedule kernel -# if kernel_vec_tile == "fuse": -# fuse_and_bind(s, kernel_vec, num_thread = 256) -# else: - s[kernel_vec].compute_inline() # schedule kernel_L if "2_14" in s[conv].op.tag: @@ -346,4 +298,3 @@ def _schedule_cl_spatialpack(s, op): _, co, h, w = s[out].op.axis tile_and_bind3d(s, out, w, h, co, 4, 8, 8) - From b2f450fcc52945dec9d957b30b15405e6de6cb69 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 22 May 2018 
12:57:03 -0700 Subject: [PATCH 10/19] merged conv2d_nchw and conv2d_NCHWc --- topi/python/topi/intel_gpu/conv2d.py | 252 ++++++++++++++++++++++++++- 1 file changed, 247 insertions(+), 5 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 3ac7117cc99d..1445016129a6 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -96,10 +96,10 @@ def _decl_conv2d(data, kernel, num_filter, kernel_size, stride, padding, out_dt else: HSTR, WSTR = stride, stride - return _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype) + return _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype) @generic.schedule_conv2d_NCHWc.register(["intel_gpu"]) -def schedule_conv2d_nchw(num_filter, kernel_size, stride, padding, outs): +def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding, outs): """Schedule for conv2d_nchw for Intel GPU Parameters @@ -125,12 +125,12 @@ def traverse(op): if tensor.op.input_tensors: traverse(tensor.op) if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag or "1_16" in op.tag: - _schedule_cl_spatialpack(s,op) + _schedule_cl_spatialpack_NCHWc(s,op) traverse(outs[0].op) return s -def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype='float16'): +def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16'): batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] num_filter, channel, kernel_h, kernel_w, nv = [util.get_const_int(x) for x in kernel.shape] num_filter = num_filter * nv @@ -205,7 +205,7 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype='float16'): return output -def _schedule_cl_spatialpack(s, op): +def _schedule_cl_spatialpack_NCHWc(s, op): output = op.output(0) _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] @@ -298,3 +298,245 @@ def _schedule_cl_spatialpack(s, op): _, co, h, w = s[out].op.axis tile_and_bind3d(s, out, w, h, co, 4, 8, 8) + + +@conv2d.register(["intel_gpu"]) +def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): + """Conv2D operator for Intel GPU backend. + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + kernel : tvm.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + padding : int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + layout : str + layout of data + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + assert layout == 'NCHW', "only support NCHW convolution on intel gpu" + assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" + assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." 
+ + out_dtype = data.dtype + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + kernel_shape = util.get_const_tuple(kernel.shape) + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + + return _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype) + +@generic.schedule_conv2d_nchw.register(["intel_gpu"]) +def schedule_conv2d_nchw(outs): + """Schedule for conv2d_nchw for Intel GPU + Parameters + ---------- + outs: Array of Tensor + The computation graph description of conv2d_nchw + in the format of an array of tensors. + Returns + ------- + s: Schedule + The computation schedule for conv2d_nchw. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + """inline all one-to-one-mapping operators except the last stage (output)""" + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag or "1_16" in op.tag: + _schedule_cl_spatialpack(s,op) + + traverse(outs[0].op) + return s + +def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'): + batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] + num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] + pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + stride_h, stride_w = stride + else: + stride_h, stride_w = stride, stride + + out_channel = num_filter + out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) + oshape = (batch, out_channel, out_height, out_width) + pad_before = [0, 0, pad_top, pad_left] + pad_after = [0, 0, pad_down, pad_right] + temp = pad(data, pad_before, pad_after, name="pad_temp") + + rc = tvm.reduce_axis((0, in_channel), name='rc') + ry = tvm.reduce_axis((0, kernel_h), name='ry') + rx = tvm.reduce_axis((0, kernel_w), name='rx') + + block_w = 0 + block_h = 0 + if stride_h == 2: + if num_filter + kernel_h == 515: + conv_tag = "4_4" + block_h = 4 + block_w = 4 + else: + conv_tag = "4_5" + block_h = 4 + block_w = 5 + elif kernel_h == 3: + if num_filter == 512: + conv_tag = "2_7" + block_h = 2 + block_w = 7 + else: + conv_tag = "2_14" + block_h = 2 + block_w = 14 + else: + conv_tag = "1_16" + block_h = 1 + block_w = 16 + + c_h = out_height + c_w = out_width + + if not out_height % block_h == 0: + c_h = (out_height // block_h + 1) * block_h + + if not out_width % block_w == 0: + c_w = (out_width // block_w + 1) * block_w + + nv = 16 + cshape = (batch, out_channel // nv, c_h, c_w, nv) + kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv) + + kernel_vec = tvm.compute( + kvshape, + lambda co, ci, kh, kw, vc: + kernel[co*nv + vc][ci][kh][kw], name='kernel_vec') + + conv = tvm.compute( + cshape, + lambda nn, ff, yy, xx, vc:\ + tvm.sum( + temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * + kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), + axis=[rc, ry, rx]), tag=conv_tag, name='conv') + + output = tvm.compute( + oshape, + lambda nn, ff, yy, xx: + conv[nn][ff//nv][yy][xx][ff%nv], + name='output_unpack', tag=conv_tag) + + return output + +def _schedule_cl_spatialpack(s, op): + output = op.output(0) + _, _, 
out_height, out_width = [util.get_const_int(x) for x in output.shape] + + conv = op.input_tensors[0] + temp = s[conv].op.input_tensors[0] + kernel_vec = s[conv].op.input_tensors[1] + kernel = s[kernel_vec].op.input_tensors[0] + temp_W = s.cache_read(temp, "warp", [conv]) + conv_L = s.cache_write(conv, "local") + + kernel_L = s.cache_read(kernel_vec, "local", [conv_L]) + _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] + + if "1_16" in s[conv].op.tag: + OUTPUT_BLOCK_HEIGHT = 1 + OUTPUT_BLOCK_WIDTH = 16 + elif "2_14" in s[conv].op.tag: + OUTPUT_BLOCK_HEIGHT = 2 + OUTPUT_BLOCK_WIDTH = 14 + elif "2_7" in s[conv].op.tag: + OUTPUT_BLOCK_HEIGHT = 2 + OUTPUT_BLOCK_WIDTH = 7 + elif "4_5" in s[conv].op.tag: + OUTPUT_BLOCK_HEIGHT = 4 + OUTPUT_BLOCK_WIDTH = 5 + elif "4_4" in s[conv].op.tag: + OUTPUT_BLOCK_HEIGHT = 4 + OUTPUT_BLOCK_WIDTH = 4 + + # schedule conv + z_factor = 1 + y_factor = 1 + x_factor = 16 + thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") + thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") + thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") + _, co, oh, ow, vc = s[conv].op.axis + ooh, ioh = s[conv].split(oh, factor = OUTPUT_BLOCK_HEIGHT) + oow, iow = s[conv].split(ow, factor = OUTPUT_BLOCK_WIDTH) + s[conv].reorder(_, co, ooh, oow, vc, ioh, iow) + coo, coi = s[conv].split(co, nparts = 1) + ooho, oohi = s[conv].split(ooh, factor = z_factor) + oowo, oowi = s[conv].split(oow, factor = y_factor) + vco, vci = s[conv].split(vc, factor = x_factor) + s[conv].reorder(_, coo, vco, ooho, oowo, coi, oohi, oowi, vci, ioh, iow) + s[conv].bind(oohi, thread_z) + s[conv].bind(oowi, thread_y) + s[conv].bind(vci, thread_x) + s[conv].bind(ooho, tvm.thread_axis("blockIdx.z")) + s[conv].bind(oowo, tvm.thread_axis("blockIdx.y")) + s[conv].bind(coi, tvm.thread_axis("blockIdx.x")) + + # schedule conv_L + s[conv_L].compute_at(s[conv], vci) + i, oc, h, w, vc = s[conv_L].op.axis + rc, ry, rx = s[conv_L].op.reduce_axis + s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) + if kernel.shape[3].value != 7: + s[conv_L].unroll(ry) + s[conv_L].unroll(rx) + + # schedule temp + _, ci, h, w = s[temp].op.axis + tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) + + # schedule temp_W + s[temp_W].compute_at(s[conv_L], rc) + _, ci, h, w = s[temp_W].op.axis + zo, zi = s[temp_W].split(ci, 1) + yo, yi = s[temp_W].split(h, 1) + xo, xi = s[temp_W].split(w, 16) + s[temp_W].reorder(zo, yo, xo, zi, yi, xi) + s[temp_W].bind(zi, thread_z) + s[temp_W].bind(yi, thread_y) + s[temp_W].bind(xi, thread_x) + s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) + + s[kernel_vec].compute_inline() + + # schedule kernel_L + if "2_14" in s[conv].op.tag: + s[kernel_L].compute_at(s[conv_L], ry) + else: + s[kernel_L].compute_at(s[conv_L], rx) + + # schedule output + if output.op in s.outputs: + out = output + else: + s[output].compute_inline() + out = s.outputs[0] + + _, co, h, w = s[out].op.axis + tile_and_bind3d(s, out, w, h, co, 4, 8, 8) From 27265fd106bfc47f032adbbe530e25754bafed08 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 22 May 2018 15:17:18 -0700 Subject: [PATCH 11/19] resnet50 end-to-end perform added --- topi/python/topi/intel_gpu/conv2d.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py index 1445016129a6..c394d79f6744 100644 --- a/topi/python/topi/intel_gpu/conv2d.py +++ b/topi/python/topi/intel_gpu/conv2d.py @@ -216,7 +216,7 @@ def _schedule_cl_spatialpack_NCHWc(s, op): conv_L 
= s.cache_write(conv, "local") kernel_L = s.cache_read(kernel, "local", [conv_L]) - _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] + _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] if "1_16" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 1 OUTPUT_BLOCK_WIDTH = 16 @@ -260,7 +260,17 @@ def _schedule_cl_spatialpack_NCHWc(s, op): s[conv_L].compute_at(s[conv], vci) i, oc, h, w, vc = s[conv_L].op.axis rc, ry, rx = s[conv_L].op.reduce_axis - s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) + if in_channel == 2048: + rco, rci = s[conv_L].split(rc, nparts = 128) + s[conv_L].unroll(rci) + s[conv_L].reorder(i, oc, rco, rci, ry, rx, vc, h, w) + s[temp_W].compute_at(s[conv_L], rco) + else: + s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) + s[temp_W].compute_at(s[conv_L], rc) + if kernel.shape[3].value != 7: + s[conv_L].unroll(ry) + s[conv_L].unroll(rx) if kernel.shape[3].value != 7: s[conv_L].unroll(ry) s[conv_L].unroll(rx) @@ -270,7 +280,6 @@ def _schedule_cl_spatialpack_NCHWc(s, op): tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) # schedule temp_W - s[temp_W].compute_at(s[conv_L], rc) _, ci, h, w = s[temp_W].op.axis zo, zi = s[temp_W].split(ci, 1) yo, yi = s[temp_W].split(h, 1) @@ -457,7 +466,7 @@ def _schedule_cl_spatialpack(s, op): conv_L = s.cache_write(conv, "local") kernel_L = s.cache_read(kernel_vec, "local", [conv_L]) - _, _, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] + _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] if "1_16" in s[conv].op.tag: OUTPUT_BLOCK_HEIGHT = 1 @@ -502,7 +511,14 @@ def _schedule_cl_spatialpack(s, op): s[conv_L].compute_at(s[conv], vci) i, oc, h, w, vc = s[conv_L].op.axis rc, ry, rx = s[conv_L].op.reduce_axis - s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) + if in_channel == 2048: + rco, rci = s[conv_L].split(rc, nparts = 128) + s[conv_L].unroll(rci) + s[conv_L].reorder(i, oc, rco, rci, ry, rx, vc, h, w) + s[temp_W].compute_at(s[conv_L], rco) + else: + s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) + s[temp_W].compute_at(s[conv_L], rc) if kernel.shape[3].value != 7: s[conv_L].unroll(ry) s[conv_L].unroll(rx) @@ -512,7 +528,6 @@ def _schedule_cl_spatialpack(s, op): tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) # schedule temp_W - s[temp_W].compute_at(s[conv_L], rc) _, ci, h, w = s[temp_W].op.axis zo, zi = s[temp_W].split(ci, 1) yo, yi = s[temp_W].split(h, 1) From d4aed968ddf85b9eff512d41d356e1285b11cb4b Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Mon, 11 Jun 2018 23:28:10 -0700 Subject: [PATCH 12/19] multibox_prior gpu ir added, clip tests udpated --- topi/python/topi/cuda/__init__.py | 3 + topi/python/topi/cuda/ssd/__init__.py | 5 + topi/python/topi/cuda/ssd/multibox.py | 328 ++++++++++++++++++++++++++ topi/python/topi/cuda/vision.py | 29 ++- topi/tests/python/test_topi_clip.py | 14 +- topi/tests/python/test_topi_vision.py | 13 +- 6 files changed, 379 insertions(+), 13 deletions(-) create mode 100644 topi/python/topi/cuda/ssd/__init__.py create mode 100644 topi/python/topi/cuda/ssd/multibox.py diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index dbf00ebeb52b..dff76fc4ebf5 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -18,3 +18,6 @@ from .vision import schedule_region from .vision import schedule_reorg from .nn import schedule_lrn, schedule_l2_normalize +from .vision import schedule_multibox_prior +from . 
import ssd +from .ssd import multibox_prior diff --git a/topi/python/topi/cuda/ssd/__init__.py b/topi/python/topi/cuda/ssd/__init__.py new file mode 100644 index 000000000000..d680c578e7aa --- /dev/null +++ b/topi/python/topi/cuda/ssd/__init__.py @@ -0,0 +1,5 @@ +# pylint: disable=wildcard-import +"""VISION network operators""" +from __future__ import absolute_import as _abs + +from .multibox import * diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py new file mode 100644 index 000000000000..fde2b182141e --- /dev/null +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -0,0 +1,328 @@ +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +"""SSD multibox operators""" +from __future__ import absolute_import as _abs +import math +import tvm + +from tvm import api + +import topi + +from ..nms import nms +from topi.vision.ssd import multibox_prior +from topi.vision.ssd import multibox_detection +from topi.vision.ssd import multibox_transform_loc + +def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): + """Low level IR routing for multibox_prior operator. + + Parameters + ---------- + data : Buffer + Input data buffer. + + out : Buffer + Output buffer. + + sizes : tuple of float + Tuple of sizes for anchor boxes. + + ratios : tuple of float + Tuple of ratios for anchor boxes. + + steps : Tuple of int + Priorbox step across y and x, -1 for auto calculation. + + offsets : tuple of int + Priorbox center offsets, y and x respectively. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + tx = tvm.thread_axis("threadIdx.x") + ty = tvm.thread_axis("threadIdx.y") + bx = tvm.thread_axis("blockIdx.x") + by = tvm.thread_axis("blockIdx.y") + ib = tvm.ir_builder.create() + p_out = ib.buffer_ptr(out) + in_height = data.shape[2] + in_width = data.shape[3] + nthread_tx = max_threads + nthread_bx = in_height // max_threads + 1 + nthread_ty = max_threads + nthread_by = in_width // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(ty, "thread_extent", nthread_ty) + ib.scope_attr(bx, "thread_extent", nthread_bx) + ib.scope_attr(by, "thread_extent", nthread_by) + + num_sizes = len(sizes) + num_ratios = len(ratios) + size_ratio_concat = sizes + ratios + steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height + steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width + offset_h = offsets[0] + offset_w = offsets[1] + + i = bx * max_threads + tx + j = by * max_threads + ty + with ib.if_scope((i < in_height)): + with ib.if_scope((j < in_width)): + center_h = (i + offset_h) * steps_h + center_w = (j + offset_w) * steps_w + + for k in range(num_sizes + num_ratios - 1): + w = tvm.select(k < num_sizes, + size_ratio_concat[k] * in_height / in_width / 2.0, + size_ratio_concat[0] * in_height / in_width * + math.sqrt(size_ratio_concat[k + 1]) / 2.0) + h = tvm.select(k < num_sizes, size_ratio_concat[k] / 2.0, + size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0) + count = (i * in_width * (num_sizes + num_ratios - 1) + + j * (num_sizes + num_ratios - 1) + k) * 4 + p_out[count] = center_w - w + p_out[count + 1] = center_h - h + p_out[count + 2] = center_w + w + p_out[count + 3] = center_h + h + + body = ib.get() + return body + + +@multibox_prior.register("cuda") +def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): + """Generate prior(anchor) boxes from data, 
sizes and ratios. + + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, c_in, h_in, w_in]] + + sizes : tuple of float + Tuple of sizes for anchor boxes. + + ratios : tuple of float + Tuple of ratios for anchor boxes. + + steps : Tuple of int + Priorbox step across y and x, -1 for auto calculation. + + offsets : tuple of int + Priorbox center offsets, y and x respectively. + + clip : boolean + Whether to clip out-of-boundary boxes. + + Returns + ------- + out : tvm.Tensor + 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] + """ + num_sizes = len(sizes) + num_ratios = len(ratios) + oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4) + out = tvm.extern(oshape, [data], lambda ins, outs: + multibox_prior_ir(ins[0], outs[0], sizes, ratios, steps, offsets), + tag="multibox_prior") + if clip: + out = topi.clip(out, 0, 1) + return out + + +def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, threshold, variances): + """Low level IR routing for transform location in multibox_detection operator. + + Parameters + ---------- + cls_prob : Buffer + Buffer of class probabilities. + + loc_pred : Buffer + Buffer of location regression predictions. + + anchor : Buffer + Buffer of prior anchor boxes. + + valid_count : Buffer + Buffer of number of valid output boxes. + + out : Buffer + Output buffer. + + clip : boolean + Whether to clip out-of-boundary boxes. + + threshold : float + Threshold to be a positive prediction. + + variances : tuple of float + Variances to be decoded from box regression output. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, vh): + """Transform prior anchor box to output box through location predictions. 
+ """ + al = anchor[anchor_base_idx] + at = anchor[anchor_base_idx + 1] + ar = anchor[anchor_base_idx + 2] + ab = anchor[anchor_base_idx + 3] + aw = ar - al + ah = ab - at + ax = (al + ar) / 2.0 + ay = (at + ab) / 2.0 + px = loc[loc_base_idx] + py = loc[loc_base_idx + 1] + pw = loc[loc_base_idx + 2] + ph = loc[loc_base_idx + 3] + ox = px * vx * aw + ax + oy = py * vy * ah + ay + ow = tvm.exp(pw * vw) * aw / 2.0 + oh = tvm.exp(ph * vh) * ah / 2.0 + return tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox - ow)), ox - ow), \ + tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \ + tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \ + tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh) + + batch_size = cls_prob.shape[0] + num_classes = cls_prob.shape[1] + num_anchors = cls_prob.shape[2] + + ib = tvm.ir_builder.create() + p_cls_prob = ib.buffer_ptr(cls_prob) + p_loc_pred = ib.buffer_ptr(loc_pred) + p_anchor = ib.buffer_ptr(anchor) + p_valid_count = ib.buffer_ptr(valid_count) + p_out = ib.buffer_ptr(out) + with ib.for_range(0, batch_size, for_type="parallel", name="n") as n: + p_valid_count[n] = 0 + with ib.for_range(0, num_anchors, name="i") as i: + # Find the predicted class id and probability + score = ib.allocate('float32', (1,), name="score", scope="local") + cls_id = ib.allocate('int32', (1,), name="id", scope="local") + score[0] = -1.0 + cls_id[0] = 0 + with ib.for_range(0, num_classes, name="j") as j: + with ib.if_scope(j > 0): + temp = p_cls_prob[n * num_anchors * num_classes + j * num_anchors + i] + cls_id[0] = tvm.select(temp > score[0], j, cls_id[0]) + score[0] = tvm.make.Max(temp, score[0]) + with ib.if_scope(tvm.all(cls_id[0] > 0, score[0] < threshold)): + cls_id[0] = 0 + # [id, prob, xmin, ymin, xmax, ymax] + # Remove background, restore original id + with ib.if_scope(cls_id[0] > 0): + out_base_idx = n * num_anchors * 6 + p_valid_count[n] * 6 + p_out[out_base_idx] = cls_id[0] - 1.0 + p_out[out_base_idx + 1] = score[0] + offset = i * 4 + p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \ + p_out[out_base_idx + 5] = transform_loc(p_loc_pred, n * num_anchors * 4 + offset, + p_anchor, offset, clip, variances[0], + variances[1], variances[2], variances[3]) + p_valid_count[n] += 1 + + return ib.get() + + +@multibox_transform_loc.register(["cuda", "gpu"]) +def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, + variances=(0.1, 0.1, 0.2, 0.2)): + """Location transformation for multibox detection + + Parameters + ---------- + cls_prob : tvm.Tensor + Class probabilities. + + loc_pred : tvm.Tensor + Location regression predictions. + + anchor : tvm.Tensor + Prior anchor boxes. + + clip : boolean + Whether to clip out-of-boundary boxes. + + threshold : float + Threshold to be a positive prediction. + + variances : tuple of float + Variances to be decoded from box regression output. + + Returns + ------- + out : tvm.Tensor + 3-D tensor with shape (batch_size, num_anchors, 6) + + valid_count : tvm.Tensor + 1-D tensor with shape (batch_size,), number of valid anchor boxes. 
+ """ + batch_size = cls_prob.shape[0] + num_anchors = anchor.shape[1] + oshape = (batch_size, num_anchors, 6) + # Define data alignment for intermediate buffer + valid_count_dtype = "int32" + valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype, + "valid_count_buf", data_alignment=4) + out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8) + valid_count, out = \ + tvm.extern([(batch_size,), oshape], + [cls_prob, loc_pred, anchor], + lambda ins, outs: transform_loc_ir( + ins[0], ins[1], ins[2], outs[0], outs[1], clip, threshold, variances), + dtype=[valid_count_dtype, cls_prob.dtype], + out_buffers=[valid_count_buf, out_buf], + tag="multibox_transform_loc") + return out, valid_count + + +@multibox_detection.register(["cuda", "gpu"]) +def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, + force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): + """Convert multibox detection predictions. + + Parameters + ---------- + cls_prob : tvm.Tensor + Class probabilities. + + loc_pred : tvm.Tensor + Location regression predictions. + + anchor : tvm.Tensor + Prior anchor boxes. + + clip : boolean + Whether to clip out-of-boundary boxes. + + nms_threshold : float + Non-maximum suppression threshold. + + force_suppress : boolean + Whether to suppress all detections regardless of class_id. + + threshold : float + Threshold to be a positive prediction. + + variances : tuple of float + Variances to be decoded from box regression output. + + nms_topk : int + Keep maximum top k detections before nms, -1 for no limit. + + Returns + ------- + out : tvm.Tensor + 3-D tensor with shape (batch_size, num_anchors, 6) + """ + inter_out, valid_count = mutibox_transform_loc(cls_prob, loc_pred, anchor, + clip, threshold, variances) + out = nms(inter_out, valid_count, nms_threshold, force_suppress, nms_topk) + return out diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 106d13665ad8..64774fc79254 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -4,6 +4,8 @@ import tvm from .. import generic from .. import cpp +from .. import tag +import topi @generic.schedule_reorg.register(["cuda", "gpu"]) def schedule_reorg(outs): @@ -42,7 +44,7 @@ def schedule_region(outs): return cpp.cuda.schedule_region(cpp_target, outs) @generic.schedule_multibox_prior.register(["cuda", "gpu"]) -def schedule_multibox_prior(out): +def schedule_multibox_prior(outs): """Schedule for multibox_prior operator. Parameters @@ -56,10 +58,31 @@ def schedule_multibox_prior(out): s: Schedule The computation schedule for multibox_prior. 
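A usage sketch for the GPU path registered below, mirroring the unit test updated later in this series (the shapes, sizes and ratios here are illustrative):

import tvm
import topi

data = tvm.placeholder((1, 32, 19, 19), name="data")
with tvm.target.create("cuda"):
    # output shape: (1, 19 * 19 * (num_sizes + num_ratios - 1), 4) = (1, 1083, 4)
    anchors = topi.cuda.ssd.multibox_prior(data, sizes=(1.0, 0.5), ratios=(1.0, 2.0), clip=True)
    s = topi.generic.schedule_multibox_prior(anchors)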
""" - raise RuntimeError("Currently multibox_prior only supports CPU.") + target = tvm.target.current_target() + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + def traverse(op): + """inline all one-to-one-mapping operators except the last stage (output)""" + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + #TODO: should be injected automatically + else: + x = op.output(0) + fused = s[x].fuse(*s[x].op.axis) + num_thread = tvm.target.current_target(allow_none=False).max_num_threads + bx, tx = s[x].split(fused, factor=num_thread) + s[x].bind(bx, tvm.thread_axis("blockIdx.x")) + s[x].bind(tx, tvm.thread_axis("threadIdx.x")) + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + traverse(outs[0].op) + return s @generic.schedule_multibox_detection.register(["cuda", "gpu"]) -def schedule_multibox_detection(out): +def schedule_multibox_detection(outs): """Schedule for multibox_detection operator. Parameters diff --git a/topi/tests/python/test_topi_clip.py b/topi/tests/python/test_topi_clip.py index 52da4922e1d6..041565433bcc 100644 --- a/topi/tests/python/test_topi_clip.py +++ b/topi/tests/python/test_topi_clip.py @@ -20,23 +20,27 @@ def get_ref_data(): a_np, b_np = get_ref_data() def check_device(device): - if not tvm.module.enabled(device): + ctx = tvm.context(device, 0) + if not ctx.exist: print("Skip because %s is not enabled" % device) return - ctx = tvm.cpu(0) if device == "llvm" else tvm.gpu(0) + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_injective(B) + a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) f = tvm.build(s, [A, B], device, name="clip") f(a, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in ['llvm']: + for device in ['llvm', 'opencl']: check_device(device) def test_clip(): - verify_clip(1024, -127, 127, 'int8') - verify_clip(1024, -127, 127, 'int16') verify_clip(1024, -127, 127, 'float32') + verify_clip(1024, -127, 127, 'int16') + verify_clip(1024, -127, 127, 'int8') if __name__ == "__main__": diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 3c624726d562..9ef5445dc5e8 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -46,7 +46,6 @@ def check_device(device): def verify_multibox_prior(dshape, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): data = tvm.placeholder(dshape, name="data") - out = ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) dtype = data.dtype input_data = np.random.uniform(size=dshape).astype(dtype) @@ -88,15 +87,19 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): + if device == 'llvm': + out = ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) + else: + out = topi.cuda.ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) s = topi.generic.schedule_multibox_prior(out) tvm_input_data = tvm.nd.array(input_data, ctx) tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), ctx) f = tvm.build(s, [data, out], device) f(tvm_input_data, tvm_out) - np.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-4) + np.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-3) - for device in ['llvm']: + for device in ['llvm', 'opencl']: check_device(device) @@ -146,6 +149,6 @@ def check_device(device): 
if __name__ == "__main__": - test_nms() +# test_nms() test_multibox_prior() - test_multibox_detection() \ No newline at end of file +# test_multibox_detection() From 0aef45e886198e02ddcae0089c511fc5acd461b9 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Wed, 13 Jun 2018 13:49:23 -0700 Subject: [PATCH 13/19] topi vision test typo fixed --- topi/tests/python/test_topi_vision.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 9ef5445dc5e8..8ee594ddd35b 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -149,6 +149,6 @@ def check_device(device): if __name__ == "__main__": -# test_nms() + test_nms() test_multibox_prior() -# test_multibox_detection() + test_multibox_detection() From d2f3948083d86d2d5e97f44d55f0c3ab36735efa Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Fri, 15 Jun 2018 10:58:53 -0700 Subject: [PATCH 14/19] default gpu schedule added for vision --- topi/python/topi/cuda/vision.py | 42 +++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 64774fc79254..da787c64e2ee 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -7,6 +7,31 @@ from .. import tag import topi +def _default_schedule(outs): + """Default schedule for gpu.""" + target = tvm.target.current_target() + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + def traverse(op): + """inline all one-to-one-mapping operators except the last stage (output)""" + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + #TODO: should be injected automatically + else: + x = op.output(0) + fused = s[x].fuse(*s[x].op.axis) + num_thread = tvm.target.current_target(allow_none=False).max_num_threads + bx, tx = s[x].split(fused, factor=num_thread) + s[x].bind(bx, tvm.thread_axis("blockIdx.x")) + s[x].bind(tx, tvm.thread_axis("threadIdx.x")) + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + traverse(outs[0].op) + return s + @generic.schedule_reorg.register(["cuda", "gpu"]) def schedule_reorg(outs): """Schedule for reorg operator. @@ -43,6 +68,23 @@ def schedule_region(outs): cpp_target = cpp.TEST_create_target(target.target_name) return cpp.cuda.schedule_region(cpp_target, outs) +@generic.schedule_nms.register(["cuda", "gpu"]) +def schedule_nms(outs): + """Schedule for non-maximum suppression + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of nms + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs) + @generic.schedule_multibox_prior.register(["cuda", "gpu"]) def schedule_multibox_prior(outs): """Schedule for multibox_prior operator. 
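The _default_schedule helper introduced in this patch simply flattens every axis of the output and binds the flattened loop to one grid of CUDA threads. A minimal standalone sketch of that fuse/split/bind pattern (the elementwise op and the hard-coded factor of 64 are illustrative; the real helper queries max_num_threads from the current target):

import tvm

n = tvm.var("n")
A = tvm.placeholder((n, 6), name="A")
B = tvm.compute((n, 6), lambda i, j: A[i, j] + 1.0, name="B")

s = tvm.create_schedule(B.op)
fused = s[B].fuse(*s[B].op.axis)            # flatten all output axes into one
bx, tx = s[B].split(fused, factor=64)       # 64 stands in for target.max_num_threads
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
print(tvm.lower(s, [A, B], simple_mode=True))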
From 52eb115c2d44b67cf3b68f8874bd4d0e9a81b218 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 21 Jun 2018 10:26:34 -0700 Subject: [PATCH 15/19] nms gpu added --- topi/python/topi/cuda/__init__.py | 1 + topi/python/topi/cuda/nms.py | 328 ++++++++++++++++++++++++++++++ 2 files changed, 329 insertions(+) create mode 100644 topi/python/topi/cuda/nms.py diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index dff76fc4ebf5..c0e3c5e173f2 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -21,3 +21,4 @@ from .vision import schedule_multibox_prior from . import ssd from .ssd import multibox_prior +from .nms import * diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py new file mode 100644 index 000000000000..f7e65040b555 --- /dev/null +++ b/topi/python/topi/cuda/nms.py @@ -0,0 +1,328 @@ +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +"""Non-maximum suppression operator""" +import tvm + +from tvm import api +from topi.vision import nms +import math +import numpy as np + +def sort_ir(data, index, output, axis, is_descend): + def swap(a, b): + a, b = b, a + max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + tx = tvm.thread_axis("threadIdx.x") + ty = tvm.thread_axis("threadIdx.y") + bx = tvm.thread_axis("blockIdx.x") + by = tvm.thread_axis("blockIdx.y") + ib = tvm.ir_builder.create() + p_data = ib.buffer_ptr(data) + p_index = ib.buffer_ptr(index) + p_out = ib.buffer_ptr(output) + dshape = p_index[index.shape[0]-1] + data_new = ib.allocate("float32", dshape, name="data_new", scope="global") + index_new = ib.allocate("int32", dshape, name="index_new", scope="global") +# p_index_new = ib.buffer_ptr(index_new) +# p_data_new = ib.buffer_ptr(data_new) + ndim = len(data.shape) + assert data.dtype == "float32", "Currently only supports input dtype to be float32" + assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim + + axis_mul_before = 1 + axis_mul_after = 1 + if axis < 0: + axis = ndim + axis + for i in range(0, ndim): + if i < axis: + axis_mul_before *= data.shape[i] + elif i > axis: + axis_mul_after *= data.shape[i] + + nthread_tx = max_threads + nthread_bx = axis_mul_before // max_threads + 1 + nthread_ty = max_threads + nthread_by = axis_mul_after // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(ty, "thread_extent", nthread_ty) + ib.scope_attr(bx, "thread_extent", nthread_bx) + ib.scope_attr(by, "thread_extent", nthread_by) + i = bx * max_threads + tx + j = by * max_threads + ty + tid = i * nthread_tx * nthread_bx + j + + with ib.if_scope(i < axis_mul_before): + with ib.if_scope(j < axis_mul_after): + current_sort_num = p_index[i * axis_mul_after + j] + base_idx = i * data.shape[axis] * axis_mul_after + j + with ib.for_range(0, current_sort_num, name = "k") as k: + full_idx = base_idx + k * axis_mul_after + index_new[k] = k + data_new[k] = p_data[full_idx] + # sync_threads + # sorting + size = current_sort_num + with ib.if_scope(tid < size - 1): + with ib.for_range(0, size - 1, name = "level") as level: + with ib.if_scope(tid % 2 == (level & 1)): + with ib.if_scope(~((data_new[tid] < data_new[tid + 1]) ^ is_descend)): + swap(data_new[tid], data_new[tid+1]) + swap(index_new[tid], index_new[tid+1]) + #convert back + with ib.for_range(0, data.shape[axis], for_type = "unroll", name = "l") as l: + p_out[base_idx + l * axis_mul_after] = tvm.select(k < size, index_new[l], l) + + body = 
ib.get() + input(body) + return body + + +def OddEvenTransposeSort_ir(data, index, is_descend): + """ Low level IR routing for sorting operation on GPUs. + + Parameters + ---------- + ---------- + """ + def swap(a, b): + a, b = b, a + + max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib = tvm.ir_builder.create() + p_data = ib.buffer_ptr(data) + p_index = ib.buffer_ptr(index) + in_size = data.shape + nthread_tx = max_threads + nthread_bx = in_size // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + with ib.if_scope(tid < in_size - 1): + with ib.for_range(0, in_size - 1, name = "level", for_type = "unroll") as level: + with ib.if_scope(tid % 2 == (1 & level)): + with ib.if_scope(~(is_descend ^ (p_data[tid] < p_data[tid+1]))): # xnor comp + # doing swap on global mem which is non-efficient + swap(p_data[tid], p_data[tid+1]) + swap(p_index[tid], p_index[tid+1]) + body = ib.get() + return body + + +def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk): + """Low level IR routing for transform location in multibox_detection operator. + + Parameters + ---------- + data: Buffer + Buffer of output boxes with class and score. + + sort_result : Buffer + Buffer of output box indexes sorted by score. + + valid_count : Buffer + Buffer of number of valid output boxes. + + out : Buffer + Output buffer. + + nms_threshold : float + Non-maximum suppression threshold. + + force_suppress : boolean + Whether to suppress all detections regardless of class_id. + + nms_topk : int + Keep maximum top k detections before nms, -1 for no limit. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + def calculate_overlap(out_tensor, box_a_idx, box_b_idx): + """Calculate overlap of two boxes. 
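The overlap computed here is the usual intersection-over-union of two corner-encoded boxes; a NumPy sketch of the same arithmetic (illustrative only):

import numpy as np

def iou(a, b):
    # a, b: boxes as [xmin, ymin, xmax, ymax]
    w = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    h = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = w * h
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return 0.0 if union <= 0.0 else inter / union

print(iou(np.array([0.1, 0.1, 0.5, 0.5]), np.array([0.2, 0.2, 0.6, 0.6])))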
+ """ + w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) + - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx])) + h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) + - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1])) + i = w * h + u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \ + (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \ + (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \ + (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i + return tvm.select(u <= 0.0, 0.0, i / u) + + max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + tx = tvm.thread_axis("threadIdx.x") + ty = tvm.thread_axis("threadIdx.y") + bx = tvm.thread_axis("blockIdx.x") + by = tvm.thread_axis("blockIdx.y") + ib = tvm.ir_builder.create() + p_data = ib.buffer_ptr(data) + p_sort_result = ib.buffer_ptr(sort_result) + p_valid_count = ib.buffer_ptr(valid_count) + p_out = ib.buffer_ptr(out) + batch_size = out.shape[0] + num_anchors = out.shape[1] + nthread_tx = max_threads + nthread_bx = num_anchors // max_threads + 1 + nthread_ty = max_threads + nthread_by = 6 // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(ty, "thread_extent", nthread_ty) + ib.scope_attr(bx, "thread_extent", nthread_bx) + ib.scope_attr(by, "thread_extent", nthread_by) + i = bx * max_threads + tx + j = by * max_threads + ty + + nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold) + nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk) + force_suppress_node = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0) + with ib.for_range(0, batch_size, for_type="unroll", name="n") as n: + with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1, + p_valid_count[0] > 0)): + # Reorder output + nkeep = tvm.select(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]), + nms_topk, p_valid_count[n]) + with ib.if_scope(i < nkeep): + with ib.if_scope(j < 6): + p_out[(n * num_anchors * 6 + + i * 6 + j)] = p_data[(n * num_anchors * 6 + + p_sort_result[n * num_anchors + i] * 6 + j)] + with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])): + with ib.if_scope(i < p_valid_count[n] - nkeep): + with ib.if_scope(j < 6): + p_out[(n * num_anchors * 6 + + (i + nkeep) * 6 + j)] = p_data[(n * num_anchors * 6 + + (i + nkeep) * 6 + j)] + # Apply nms + with ib.if_scope(i < p_valid_count[n]): + offset_i = i * 6 + with ib.if_scope(p_out[n * num_anchors * 6 + offset_i] >= 0): + with ib.if_scope(j < p_valid_count[n]): + offset_j = j * 6 + with ib.if_scope(tvm.all(j > i, p_out[n * num_anchors * 6 + + offset_j] >= 0)): + with ib.if_scope(tvm.any(force_suppress_node > 0, + p_out[n * num_anchors * 6 + offset_i] == + p_out[n * num_anchors * 6 + offset_j])): + # When force_suppress == True or class_id equals + iou = calculate_overlap(p_out, n * num_anchors * 6 + offset_i + 2, + n * num_anchors * 6 + offset_j + 2) + with ib.if_scope(iou >= nms_threshold): + p_out[n * num_anchors * 6 + offset_j] = -1.0 + with ib.else_scope(): + with ib.if_scope(i < p_valid_count[n]): + with ib.if_scope(j < 6): + p_out[(n * num_anchors * 6 + + i * 6 + j)] = p_data[n * num_anchors * 6 + i * 6 + j] + # Set invalid entry to be -1 + with ib.if_scope(i < num_anchors - p_valid_count[n]): + with ib.if_scope(j < 6): + p_out[n * num_anchors * 6 + (i + p_valid_count[n]) * 6 + j] = -1.0 + body = ib.get() + 
input(body) + return body + +#@sort.register("cuda") +def sort(data, index, is_descend): + oshape = data.shape + out = tvm.extern(oshape, [data, index], lambda ins, outs: + OddEvenTransposeSort_ir(ins[0], ins[1], is_descend), + tag="sort_gpu") + return out + +@nms.register("cuda") +def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): + """Non-maximum suppression operator for object detection. + + Parameters + ---------- + data: tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors, 6]. + The last dimension should be in format of + [class_id, score, box_left, box_top, box_right, box_bottom]. + + valid_count : tvm.Tensor + 1-D tensor for valid number of boxes. + + nms_threshold : float + Non-maximum suppression threshold. + + force_suppress : boolean + Whether to suppress all detections regardless of class_id. + + nms_topk : int + Keep maximum top k detections before nms, -1 for no limit. + + Returns + ------- + out : tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors, 6]. + + Example + -------- + .. code-block:: python + + # An example to use nms + dshape = (1, 5, 6) + data = tvm.placeholder(dshape, name="data") + valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") + nms_threshold = 0.7 + force_suppress = True + nms_topk = -1 + out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) + np_data = np.random.uniform(dshape) + np_valid_count = np.array([4]) + s = topi.generic.schedule_nms(out) + f = tvm.build(s, [data, valid_count, out], "llvm") + ctx = tvm.cpu() + tvm_data = tvm.nd.array(np_data, ctx) + tvm_valid_count = tvm.nd.array(np_valid_count, ctx) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) + f(tvm_data, tvm_valid_count, tvm_out) + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + valid_count_dtype = "int32" + valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype, + "valid_count_buf", data_alignment=4) + data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + score_axis = 1 + score_shape = (batch_size, num_anchors) + score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis]) + score_tensor_buf = api.decl_buffer(score_tensor.shape, data.dtype, + "score_tensor_buf", data_alignment=8) + sort_tensor_dtype = "int32" + sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype, + "sort_tensor_buf", data_alignment=8) + +# dshape = (valid_count[valid_count.shape[0]-1],) +# sorter = tvm.placeholder(dshape, name = "sorter", dtype = "float32") +# index = tvm.placeholder(dshape, name = "index", dtype = "int32") +# sorter_buf = api.decl_buffer(sorter.shape, sorter.dtype, +# "sorter_data_buf", data_alignment=8) +# index_buf = api.decl_buffer(index.shape, index.dtype, +# "sorter_index_buf", data_alignment=8) + + sort_tensor = \ + tvm.extern(score_shape, + [score_tensor, valid_count], + lambda ins, outs: sort_ir( + ins[0], ins[1], outs[0], score_axis, True), + dtype=sort_tensor_dtype, + in_buffers=[score_tensor_buf, valid_count_buf], + out_buffers=sort_tensor_buf, + name="nms_sort") + out = \ + tvm.extern(data.shape, + [data, sort_tensor, valid_count], + lambda ins, outs: nms_ir( + ins[0], ins[1], ins[2], outs[0], nms_threshold, + force_suppress, nms_topk), + dtype="float32", + in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], + tag="nms") + return out From 172167a24f5babd6541f70aa9141974c1fb76a8b Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 3 Jul 2018 11:35:24 -0700 Subject: [PATCH 16/19] sorting 
debugged, detection completed --- topi/python/topi/cuda/nms.py | 157 +++++++++++++------------- topi/python/topi/cuda/ssd/multibox.py | 86 +++++++++----- topi/python/topi/cuda/vision.py | 33 ++---- topi/tests/python/test_topi_vision.py | 14 ++- 4 files changed, 156 insertions(+), 134 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index f7e65040b555..10f3ec2683d9 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -6,11 +6,19 @@ from topi.vision import nms import math import numpy as np +from ..nn import tag def sort_ir(data, index, output, axis, is_descend): def swap(a, b): a, b = b, a - max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + def OETS(ib, tid, data_new, index_new, is_descend, size): + with ib.if_scope(tid < size - 1): + with ib.for_range(0, size - 1, name = "level") as level: + with ib.if_scope(tid % 2 == (level & 1)): + with ib.if_scope(~((data_new[tid] < data_new[tid + 1]) ^ is_descend)): + swap(data_new[tid], data_new[tid+1]) + swap(index_new[tid], index_new[tid+1]) + max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) tx = tvm.thread_axis("threadIdx.x") ty = tvm.thread_axis("threadIdx.y") bx = tvm.thread_axis("blockIdx.x") @@ -19,11 +27,6 @@ def swap(a, b): p_data = ib.buffer_ptr(data) p_index = ib.buffer_ptr(index) p_out = ib.buffer_ptr(output) - dshape = p_index[index.shape[0]-1] - data_new = ib.allocate("float32", dshape, name="data_new", scope="global") - index_new = ib.allocate("int32", dshape, name="index_new", scope="global") -# p_index_new = ib.buffer_ptr(index_new) -# p_data_new = ib.buffer_ptr(data_new) ndim = len(data.shape) assert data.dtype == "float32", "Currently only supports input dtype to be float32" assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim @@ -38,73 +41,80 @@ def swap(a, b): elif i > axis: axis_mul_after *= data.shape[i] - nthread_tx = max_threads - nthread_bx = axis_mul_before // max_threads + 1 - nthread_ty = max_threads - nthread_by = axis_mul_after // max_threads + 1 - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(ty, "thread_extent", nthread_ty) - ib.scope_attr(bx, "thread_extent", nthread_bx) - ib.scope_attr(by, "thread_extent", nthread_by) - i = bx * max_threads + tx - j = by * max_threads + ty - tid = i * nthread_tx * nthread_bx + j - - with ib.if_scope(i < axis_mul_before): - with ib.if_scope(j < axis_mul_after): - current_sort_num = p_index[i * axis_mul_after + j] - base_idx = i * data.shape[axis] * axis_mul_after + j - with ib.for_range(0, current_sort_num, name = "k") as k: - full_idx = base_idx + k * axis_mul_after - index_new[k] = k - data_new[k] = p_data[full_idx] - # sync_threads - # sorting - size = current_sort_num - with ib.if_scope(tid < size - 1): - with ib.for_range(0, size - 1, name = "level") as level: - with ib.if_scope(tid % 2 == (level & 1)): - with ib.if_scope(~((data_new[tid] < data_new[tid + 1]) ^ is_descend)): - swap(data_new[tid], data_new[tid+1]) - swap(index_new[tid], index_new[tid+1]) - #convert back - with ib.for_range(0, data.shape[axis], for_type = "unroll", name = "l") as l: - p_out[base_idx + l * axis_mul_after] = tvm.select(k < size, index_new[l], l) - - body = ib.get() - input(body) - return body - - -def OddEvenTransposeSort_ir(data, index, is_descend): - """ Low level IR routing for sorting operation on GPUs. 
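The rewritten sort_ir in this hunk first turns the per-row valid counts into segment offsets with a step-doubling (Hillis-Steele style) prefix sum, then runs an odd-even transposition sort on each row's segment of the flat scratch buffers. Per row, the net effect is an argsort of the first valid_count scores, with the remaining slots keeping their own index; a NumPy reference of that result (illustrative only):

import numpy as np

def sort_ref(scores, valid_count, is_descend=True):
    # scores: (batch, num_anchors); valid_count: (batch,)
    batch, num_anchors = scores.shape
    out = np.zeros((batch, num_anchors), dtype="int32")
    for n in range(batch):
        k = int(valid_count[n])
        order = np.argsort(scores[n, :k])           # indices of the valid prefix, ascending
        if is_descend:
            order = order[::-1]
        out[n, :k] = order
        out[n, k:] = np.arange(k, num_anchors)      # padding keeps its own position
    return out

print(sort_ref(np.array([[0.8, 0.7, 0.4, 0.9, 0.5]]), np.array([4])))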
- - Parameters - ---------- - ---------- - """ - def swap(a, b): - a, b = b, a + dshape = 0 + for i in range(0, len(index.shape)): + dshape += index.shape[i] + dshape = tvm.select(dshape > axis_mul_before*axis_mul_after, dshape, axis_mul_before*axis_mul_after) - max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") - ib = tvm.ir_builder.create() - p_data = ib.buffer_ptr(data) - p_index = ib.buffer_ptr(index) - in_size = data.shape + sizes_temp = ib.allocate("int32", dshape, name="sizes_temp", scope="global") + sizes = ib.allocate("int32", dshape, name="sizes", scope="global") + temp_index = ib.allocate("int32", dshape, name="temp_index", scope = "local") + temp_data = ib.allocate("float32", dshape, name="temp_data", scope = "local") + data_new = ib.allocate("float32", dshape, name="data_new", scope="global") + index_new = ib.allocate("int32", dshape, name="index_new", scope="global") nthread_tx = max_threads - nthread_bx = in_size // max_threads + 1 + nthread_bx = dshape // max_threads + 1 ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - with ib.if_scope(tid < in_size - 1): - with ib.for_range(0, in_size - 1, name = "level", for_type = "unroll") as level: - with ib.if_scope(tid % 2 == (1 & level)): - with ib.if_scope(~(is_descend ^ (p_data[tid] < p_data[tid+1]))): # xnor comp - # doing swap on global mem which is non-efficient - swap(p_data[tid], p_data[tid+1]) - swap(p_index[tid], p_index[tid+1]) + + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + sizes[tid] = p_index[tid] + sizes_temp[tid] = p_index[tid] + + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after).astype("float32"))) + 1, name="k") as k: + with ib.if_scope(tid- (tvm.const(1, "int32") << k) >= 0): + with ib.if_scope(k % 2 == 0): + sizes[tid] += sizes_temp[tid - (tvm.const(1, "int32") << k)] + sizes_temp[tid] = sizes[tid] + with ib.else_scope(): + sizes_temp[tid] += sizes[tid - (tvm.const(1, "int32") << k)] + sizes[tid] = sizes_temp[tid] +# sizes[tid] += sizes[tid - (tvm.const(1, "int32") << k)] + + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + i = tid / axis_mul_after + j = tid % axis_mul_after + current_sort_num = p_index[tid] + base_idx = i * data.shape[axis] * axis_mul_after + j + with ib.for_range(0, current_sort_num, name = "k") as k: + full_idx = base_idx + k * axis_mul_after + with ib.if_scope(tid == 0): + start = 0 + with ib.else_scope(): + start = sizes[tid-1] + index_new[start + k] = k + data_new[start + k] = p_data[full_idx] + + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + with ib.if_scope(tid == 0): + start = 0 + with ib.else_scope(): + start = sizes[tid-1] + # OddEvenTransposeSort + with ib.for_range(0, p_index[tid], name = "k") as k: + with ib.for_range(0, p_index[tid] - 1, name = "i") as i: + with ib.if_scope(i % 2 == (k & 1)): + with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) ^ is_descend) == False): + temp_data[tid] = data_new[i+start] + data_new[i+start] = data_new[i+start+1] + data_new[i+start+1] = temp_data[tid] + temp_index[tid] = index_new[i+start] + index_new[i+start] = index_new[i+start+1] + index_new[i+start+1] = temp_index[tid] + + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + i = tid / axis_mul_after + j = tid % axis_mul_after + current_sort_num = p_index[tid] + base_idx 
= i * data.shape[axis] * axis_mul_after + j + with ib.for_range(0, data.shape[axis], name = "k") as k: + with ib.if_scope(tid == 0): + start = 0 + with ib.else_scope(): + start = sizes[tid-1] + p_out[base_idx + k * axis_mul_after] = tvm.select(k < current_sort_num, index_new[k+start], k) body = ib.get() return body @@ -223,7 +233,6 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): with ib.if_scope(j < 6): p_out[n * num_anchors * 6 + (i + p_valid_count[n]) * 6 + j] = -1.0 body = ib.get() - input(body) return body #@sort.register("cuda") @@ -292,21 +301,13 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) score_axis = 1 score_shape = (batch_size, num_anchors) - score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis]) + score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis], name="score_tensor") score_tensor_buf = api.decl_buffer(score_tensor.shape, data.dtype, "score_tensor_buf", data_alignment=8) sort_tensor_dtype = "int32" sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype, "sort_tensor_buf", data_alignment=8) -# dshape = (valid_count[valid_count.shape[0]-1],) -# sorter = tvm.placeholder(dshape, name = "sorter", dtype = "float32") -# index = tvm.placeholder(dshape, name = "index", dtype = "int32") -# sorter_buf = api.decl_buffer(sorter.shape, sorter.dtype, -# "sorter_data_buf", data_alignment=8) -# index_buf = api.decl_buffer(index.shape, index.dtype, -# "sorter_index_buf", data_alignment=8) - sort_tensor = \ tvm.extern(score_shape, [score_tensor, valid_count], diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index fde2b182141e..147160333f0d 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -195,38 +195,64 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, num_anchors = cls_prob.shape[2] ib = tvm.ir_builder.create() + temp_score = ib.allocate('float32', (batch_size * (num_classes -1) * num_anchors, ), name="temp_score", scope="global") + score = ib.allocate('float32', (batch_size * num_anchors, ), name="score", scope="local") + cls_id = ib.allocate('int32', (batch_size * num_anchors, ), name="id", scope="local") + flag = ib.allocate('int32', (batch_size * num_anchors, ), name="flag", scope="global") + max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + nthread_tx = max_threads + nthread_bx = (batch_size * num_anchors * num_classes) // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx p_cls_prob = ib.buffer_ptr(cls_prob) p_loc_pred = ib.buffer_ptr(loc_pred) p_anchor = ib.buffer_ptr(anchor) p_valid_count = ib.buffer_ptr(valid_count) p_out = ib.buffer_ptr(out) - with ib.for_range(0, batch_size, for_type="parallel", name="n") as n: + with ib.if_scope(tid < batch_size * num_anchors * num_classes): + n = tid / (num_anchors * num_classes) + j = (tid % (num_anchors * num_classes)) / num_anchors + i = tid % num_anchors + with ib.if_scope(j > 0): + temp_score[n * num_anchors * num_classes + i * (num_classes - 1) + j-1] = p_cls_prob[tid] p_valid_count[n] = 0 - with ib.for_range(0, num_anchors, name="i") as i: - # Find the predicted class id and probability - score = ib.allocate('float32', (1,), name="score", 
scope="local") - cls_id = ib.allocate('int32', (1,), name="id", scope="local") - score[0] = -1.0 - cls_id[0] = 0 - with ib.for_range(0, num_classes, name="j") as j: - with ib.if_scope(j > 0): - temp = p_cls_prob[n * num_anchors * num_classes + j * num_anchors + i] - cls_id[0] = tvm.select(temp > score[0], j, cls_id[0]) - score[0] = tvm.make.Max(temp, score[0]) - with ib.if_scope(tvm.all(cls_id[0] > 0, score[0] < threshold)): - cls_id[0] = 0 - # [id, prob, xmin, ymin, xmax, ymax] - # Remove background, restore original id - with ib.if_scope(cls_id[0] > 0): - out_base_idx = n * num_anchors * 6 + p_valid_count[n] * 6 - p_out[out_base_idx] = cls_id[0] - 1.0 - p_out[out_base_idx + 1] = score[0] - offset = i * 4 - p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \ - p_out[out_base_idx + 5] = transform_loc(p_loc_pred, n * num_anchors * 4 + offset, - p_anchor, offset, clip, variances[0], - variances[1], variances[2], variances[3]) - p_valid_count[n] += 1 + with ib.if_scope(tid < batch_size * num_anchors): + n = tid / num_anchors + i = tid % num_anchors + score[tid] = -1.0 + cls_id[tid] = 0 + with ib.for_range(0, num_classes-1, name="k") as k: + temp = temp_score[tid * (num_classes-1) + k] + cls_id[tid] = tvm.select(temp > score[tid], k + 1, cls_id[tid]) + score[tid] = tvm.make.Max(temp, score[tid]) + with ib.if_scope(tvm.all(cls_id[tid] > 0, score[tid] < threshold)): + cls_id[tid] = 0 + with ib.if_scope(cls_id[tid] > 0): + flag[tid] = 1 + with ib.else_scope(): + flag[tid] = 0 + with ib.if_scope(tid < batch_size): + with ib.for_range(0, num_anchors, name="k") as k: + with ib.if_scope(k > 0): + flag[tid * num_anchors + k] += flag[tid * num_anchors + k - 1] + p_valid_count[tid] = flag[tid * num_anchors + num_anchors - 1] + with ib.if_scope(tid < batch_size * num_anchors): + n = tid / num_anchors + i = tid % num_anchors + with ib.if_scope(cls_id[tid] > 0): + with ib.if_scope(i == 0): + out_base_id = n * num_anchors * 6 + with ib.else_scope(): + out_base_idx = n * num_anchors * 6 + flag[tid - 1] * 6 + p_out[out_base_idx] = cls_id[tid] - 1.0 + p_out[out_base_idx + 1] = score[tid] + p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \ + p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4, p_anchor, i*4, + clip, variances[0], variances[1], + variances[2], variances[3]) return ib.get() @@ -258,6 +284,8 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 Returns ------- + ret : tuple of tvm.Tensor composed of + out : tvm.Tensor 3-D tensor with shape (batch_size, num_anchors, 6) @@ -280,7 +308,7 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 dtype=[valid_count_dtype, cls_prob.dtype], out_buffers=[valid_count_buf, out_buf], tag="multibox_transform_loc") - return out, valid_count + return [out, valid_count] @multibox_detection.register(["cuda", "gpu"]) @@ -322,7 +350,7 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm out : tvm.Tensor 3-D tensor with shape (batch_size, num_anchors, 6) """ - inter_out, valid_count = mutibox_transform_loc(cls_prob, loc_pred, anchor, + inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = nms(inter_out, valid_count, nms_threshold, force_suppress, nms_topk) + out = nms(inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) return out diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index da787c64e2ee..136d62dbe998 100644 --- 
a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -14,6 +14,14 @@ def _default_schedule(outs): s = tvm.create_schedule([x.op for x in outs]) def traverse(op): """inline all one-to-one-mapping operators except the last stage (output)""" + if "nms" in op.tag: + sort = op.input_tensors[1] + score = s[sort].op.input_tensors[0] + fused = s[score].fuse(*s[score].op.axis) + num_thread = tvm.target.current_target(allow_none=False).max_num_threads + bx, tx = s[score].split(fused, factor=num_thread) + s[score].bind(bx, tvm.thread_axis("blockIdx.x")) + s[score].bind(tx, tvm.thread_axis("threadIdx.x")) if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() @@ -100,28 +108,7 @@ def schedule_multibox_prior(outs): s: Schedule The computation schedule for multibox_prior. """ - target = tvm.target.current_target() - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - def traverse(op): - """inline all one-to-one-mapping operators except the last stage (output)""" - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - #TODO: should be injected automatically - else: - x = op.output(0) - fused = s[x].fuse(*s[x].op.axis) - num_thread = tvm.target.current_target(allow_none=False).max_num_threads - bx, tx = s[x].split(fused, factor=num_thread) - s[x].bind(bx, tvm.thread_axis("blockIdx.x")) - s[x].bind(tx, tvm.thread_axis("threadIdx.x")) - for tensor in op.input_tensors: - if tensor.op.input_tensors: - traverse(tensor.op) - - traverse(outs[0].op) - return s + return _default_schedule(outs) @generic.schedule_multibox_detection.register(["cuda", "gpu"]) def schedule_multibox_detection(outs): @@ -138,4 +125,4 @@ def schedule_multibox_detection(outs): s: Schedule The computation schedule for multibox_detection. 
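A usage sketch for this schedule, following the device branch added to the unit test below (the tiny shapes are illustrative; actually building for "cuda" additionally requires a CUDA-enabled TVM):

import tvm
import topi

batch_size, num_anchors, num_classes = 1, 3, 3
cls_prob = tvm.placeholder((batch_size, num_anchors, num_classes), name="cls_prob")
loc_preds = tvm.placeholder((batch_size, num_anchors * 4), name="loc_preds")
anchors = tvm.placeholder((1, num_anchors, 4), name="anchors")
with tvm.target.create("cuda"):
    out = topi.cuda.ssd.multibox_detection(cls_prob, loc_preds, anchors)
    s = topi.generic.schedule_multibox_detection(out)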
""" - raise RuntimeError("Currently multibox_detection only supports CPU.") + return _default_schedule(outs) diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 8ee594ddd35b..959b10f82ca5 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -14,7 +14,6 @@ def test_nms(): nms_threshold = 0.7 force_suppress = True nms_topk = 2 - out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], @@ -31,6 +30,10 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): + if device == 'llvm': + out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) + else: + out = topi.cuda.nms(data, valid_count, nms_threshold, force_suppress, nms_topk) s = topi.generic.schedule_nms(out) tvm_data = tvm.nd.array(np_data, ctx) @@ -40,7 +43,7 @@ def check_device(device): f(tvm_data, tvm_valid_count, tvm_out) np.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) - for device in ['llvm']: + for device in ['llvm', 'opencl']: check_device(device) @@ -116,7 +119,6 @@ def test_multibox_detection(): cls_prob = tvm.placeholder((batch_size, num_anchors, num_classes), name="cls_prob") loc_preds = tvm.placeholder((batch_size, num_anchors * 4), name="loc_preds") anchors = tvm.placeholder((1, num_anchors, 4), name="anchors") - out = ssd.multibox_detection(cls_prob, loc_preds, anchors) # Manually create test case np_cls_prob = np.array([[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45], [0.7, 0.1, 0.2]]]) @@ -134,6 +136,10 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): + if device == 'llvm': + out = ssd.multibox_detection(cls_prob, loc_preds, anchors) + else: + out = topi.cuda.ssd.multibox_detection(cls_prob, loc_preds, anchors) s = topi.generic.schedule_multibox_detection(out) tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), ctx) @@ -144,7 +150,7 @@ def check_device(device): f(tvm_cls_prob, tvm_loc_preds, tvm_anchors, tvm_out) np.testing.assert_allclose(tvm_out.asnumpy(), expected_np_out, rtol=1e-4) - for device in ['llvm']: + for device in ['llvm', 'opencl']: check_device(device) From df0a2d9d697fe436858bec46d15661efc57ee9b1 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Fri, 6 Jul 2018 13:50:39 -0700 Subject: [PATCH 17/19] minor changes --- topi/python/topi/cuda/__init__.py | 6 ++---- topi/python/topi/cuda/nms.py | 11 +---------- topi/python/topi/cuda/ssd/multibox.py | 9 +++++---- topi/python/topi/cuda/vision.py | 19 ++++++++++++++++++- topi/tests/python/test_topi_conv2d_nchw.py | 19 ++++++++++++++++++- 5 files changed, 44 insertions(+), 20 deletions(-) diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index c0e3c5e173f2..b8740f811ff7 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -15,10 +15,8 @@ from .pooling import schedule_pool, schedule_global_pool from .conv2d_transpose_nchw import schedule_conv2d_transpose_nchw from .extern import schedule_extern -from .vision import schedule_region -from .vision import schedule_reorg from .nn import schedule_lrn, schedule_l2_normalize -from .vision import schedule_multibox_prior +from .vision import * from . 
import ssd -from .ssd import multibox_prior +from .ssd import * from .nms import * diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 10f3ec2683d9..231796ecb1da 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -71,7 +71,6 @@ def OETS(ib, tid, data_new, index_new, is_descend, size): with ib.else_scope(): sizes_temp[tid] += sizes[tid - (tvm.const(1, "int32") << k)] sizes[tid] = sizes_temp[tid] -# sizes[tid] += sizes[tid - (tvm.const(1, "int32") << k)] with ib.if_scope(tid < axis_mul_before * axis_mul_after): i = tid / axis_mul_after @@ -235,15 +234,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): body = ib.get() return body -#@sort.register("cuda") -def sort(data, index, is_descend): - oshape = data.shape - out = tvm.extern(oshape, [data, index], lambda ins, outs: - OddEvenTransposeSort_ir(ins[0], ins[1], is_descend), - tag="sort_gpu") - return out - -@nms.register("cuda") +@nms.register(["cuda", "gpu"]) def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): """Non-maximum suppression operator for object detection. diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index 147160333f0d..1f9ce14c23f4 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -30,7 +30,7 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): ratios : tuple of float Tuple of ratios for anchor boxes. - steps : Tuple of int + steps : Tuple of float Priorbox step across y and x, -1 for auto calculation. offsets : tuple of int @@ -92,7 +92,7 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): return body -@multibox_prior.register("cuda") +@multibox_prior.register(["cuda", "gpu"]) def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): """Generate prior(anchor) boxes from data, sizes and ratios. @@ -107,7 +107,7 @@ def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, ratios : tuple of float Tuple of ratios for anchor boxes. - steps : Tuple of int + steps : Tuple of float Priorbox step across y and x, -1 for auto calculation. offsets : tuple of int @@ -254,7 +254,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, clip, variances[0], variances[1], variances[2], variances[3]) - return ib.get() + body = ib.get() + return body @multibox_transform_loc.register(["cuda", "gpu"]) diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 136d62dbe998..19b1db5827d8 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -25,7 +25,6 @@ def traverse(op): if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() - #TODO: should be injected automatically else: x = op.output(0) fused = s[x].fuse(*s[x].op.axis) @@ -110,6 +109,24 @@ def schedule_multibox_prior(outs): """ return _default_schedule(outs) +@generic.schedule_multibox_transform_loc.register(["cuda", "gpu"]) +def schedule_multibox_transform_loc(outs): + """Schedule for multibox_transform_loc + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of + multibox_transform_loc in the format + of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. 
+ """ + return _default_schedule(outs) + @generic.schedule_multibox_detection.register(["cuda", "gpu"]) def schedule_multibox_detection(outs): """Schedule for multibox_detection operator. diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py index 0d55a9163466..7f3935f3aad7 100644 --- a/topi/tests/python/test_topi_conv2d_nchw.py +++ b/topi/tests/python/test_topi_conv2d_nchw.py @@ -8,6 +8,8 @@ from topi.util import get_const_tuple def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): + print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) + in_height = in_width = in_size A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') @@ -59,7 +61,7 @@ def check_device(device): def test_conv2d_nchw(): - # ResNet18 worklaods + # ResNet18 workloads verify_conv2d_nchw(1, 3, 224, 64, 7, 2, 3) verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1) verify_conv2d_nchw(1, 64, 56, 64, 1, 1, 0) @@ -72,6 +74,21 @@ def test_conv2d_nchw(): verify_conv2d_nchw(1, 256, 14, 512, 3, 2, 1) verify_conv2d_nchw(1, 256, 14, 512, 1, 2, 0) verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1) + # ResNet 50 workloads + verify_conv2d_nchw(1, 64, 56, 256, 1, 1, 0) + verify_conv2d_nchw(1, 256, 56, 64, 1, 1, 0) + verify_conv2d_nchw(1, 256, 56, 128, 1, 2, 0) + verify_conv2d_nchw(1, 128, 28, 512, 1, 1, 0) + verify_conv2d_nchw(1, 256, 56, 512, 1, 2, 0) + verify_conv2d_nchw(1, 512, 28, 128, 1, 1, 0) + verify_conv2d_nchw(1, 512, 28, 256, 1, 2, 0) + verify_conv2d_nchw(1, 256, 14, 1024, 1, 1, 0) + verify_conv2d_nchw(1, 512, 28, 1024, 1, 2, 0) + verify_conv2d_nchw(1, 1024, 14, 256, 1, 1, 0) + verify_conv2d_nchw(1, 1024, 14, 512, 1, 2, 0) + verify_conv2d_nchw(1, 512, 7, 2048, 1, 2, 0) + verify_conv2d_nchw(1, 1024, 14, 2048, 1, 2, 0) + verify_conv2d_nchw(1, 2048, 7, 512, 1, 1, 0) # Vgg16 workloads verify_conv2d_nchw(1, 128, 122, 128, 3, 1, 1) # Super resolution workloads From ff87074a82fa2757300fe486f67a07cfdd09a10f Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Fri, 6 Jul 2018 19:54:59 -0700 Subject: [PATCH 18/19] pylint error fixed --- topi/python/topi/cuda/nms.py | 124 ++++++++++++++++---------- topi/python/topi/cuda/ssd/multibox.py | 27 +++--- topi/python/topi/cuda/vision.py | 3 +- 3 files changed, 95 insertions(+), 59 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 231796ecb1da..4d4e402de5c2 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -1,28 +1,42 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison """Non-maximum suppression operator""" +import math import tvm from tvm import api from topi.vision import nms -import math -import numpy as np -from ..nn import tag + def sort_ir(data, index, output, axis, is_descend): - def swap(a, b): - a, b = b, a - def OETS(ib, tid, data_new, index_new, is_descend, size): - with ib.if_scope(tid < size - 1): - with ib.for_range(0, size - 1, name = "level") as level: - with ib.if_scope(tid % 2 == (level & 1)): - with ib.if_scope(~((data_new[tid] < data_new[tid + 1]) ^ is_descend)): - swap(data_new[tid], data_new[tid+1]) - swap(index_new[tid], index_new[tid+1]) - max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + """Low level IR to do sorting on the GPU, same usage as tvm.contrib.sort.argsort on 
the CPU. + + Parameters + ---------- + data: Buffer + 2D Buffer of input boxes' score with shape [batch_size, num_anchors]. + + index : Buffer + Buffer of number of valid number of boxes. + + output : Buffer + Output buffer of indicies of sorted tensor. + + axis : int + The axis used for sorting. + + is_descend : bool + If the sorted data is in descending order. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + + max_threads = int( + tvm.target.current_target(allow_none=False).max_num_threads) tx = tvm.thread_axis("threadIdx.x") - ty = tvm.thread_axis("threadIdx.y") bx = tvm.thread_axis("blockIdx.x") - by = tvm.thread_axis("blockIdx.y") ib = tvm.ir_builder.create() p_data = ib.buffer_ptr(data) p_index = ib.buffer_ptr(index) @@ -44,12 +58,14 @@ def OETS(ib, tid, data_new, index_new, is_descend, size): dshape = 0 for i in range(0, len(index.shape)): dshape += index.shape[i] - dshape = tvm.select(dshape > axis_mul_before*axis_mul_after, dshape, axis_mul_before*axis_mul_after) + dshape = tvm.select(dshape > axis_mul_before*axis_mul_after, dshape, + axis_mul_before*axis_mul_after) - sizes_temp = ib.allocate("int32", dshape, name="sizes_temp", scope="global") + sizes_temp = ib.allocate( + "int32", dshape, name="sizes_temp", scope="global") sizes = ib.allocate("int32", dshape, name="sizes", scope="global") - temp_index = ib.allocate("int32", dshape, name="temp_index", scope = "local") - temp_data = ib.allocate("float32", dshape, name="temp_data", scope = "local") + temp_index = ib.allocate("int32", dshape, name="temp_index", scope="local") + temp_data = ib.allocate("float32", dshape, name="temp_data", scope="local") data_new = ib.allocate("float32", dshape, name="data_new", scope="global") index_new = ib.allocate("int32", dshape, name="index_new", scope="global") nthread_tx = max_threads @@ -63,13 +79,16 @@ def OETS(ib, tid, data_new, index_new, is_descend, size): sizes_temp[tid] = p_index[tid] with ib.if_scope(tid < axis_mul_before * axis_mul_after): - with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after).astype("float32"))) + 1, name="k") as k: - with ib.if_scope(tid- (tvm.const(1, "int32") << k) >= 0): + with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after) \ + .astype("float32"))) + 1, name="k") as k: + with ib.if_scope(tid - (tvm.const(1, "int32") << k) >= 0): with ib.if_scope(k % 2 == 0): - sizes[tid] += sizes_temp[tid - (tvm.const(1, "int32") << k)] + sizes[tid] += sizes_temp[tid - ( + tvm.const(1, "int32") << k)] sizes_temp[tid] = sizes[tid] with ib.else_scope(): - sizes_temp[tid] += sizes[tid - (tvm.const(1, "int32") << k)] + sizes_temp[tid] += sizes[tid - ( + tvm.const(1, "int32") << k)] sizes[tid] = sizes_temp[tid] with ib.if_scope(tid < axis_mul_before * axis_mul_after): @@ -77,7 +96,7 @@ def OETS(ib, tid, data_new, index_new, is_descend, size): j = tid % axis_mul_after current_sort_num = p_index[tid] base_idx = i * data.shape[axis] * axis_mul_after + j - with ib.for_range(0, current_sort_num, name = "k") as k: + with ib.for_range(0, current_sort_num, name="k") as k: full_idx = base_idx + k * axis_mul_after with ib.if_scope(tid == 0): start = 0 @@ -92,10 +111,11 @@ def OETS(ib, tid, data_new, index_new, is_descend, size): with ib.else_scope(): start = sizes[tid-1] # OddEvenTransposeSort - with ib.for_range(0, p_index[tid], name = "k") as k: - with ib.for_range(0, p_index[tid] - 1, name = "i") as i: + with ib.for_range(0, p_index[tid], name="k") as k: + with ib.for_range(0, p_index[tid] - 1, name="i") as i: with 
ib.if_scope(i % 2 == (k & 1)): - with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) ^ is_descend) == False): + with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) ^ + is_descend) == False): temp_data[tid] = data_new[i+start] data_new[i+start] = data_new[i+start+1] data_new[i+start+1] = temp_data[tid] @@ -108,12 +128,14 @@ def OETS(ib, tid, data_new, index_new, is_descend, size): j = tid % axis_mul_after current_sort_num = p_index[tid] base_idx = i * data.shape[axis] * axis_mul_after + j - with ib.for_range(0, data.shape[axis], name = "k") as k: + with ib.for_range(0, data.shape[axis], name="k") as k: with ib.if_scope(tid == 0): start = 0 with ib.else_scope(): start = sizes[tid-1] - p_out[base_idx + k * axis_mul_after] = tvm.select(k < current_sort_num, index_new[k+start], k) + p_out[base_idx + k * axis_mul_after] = tvm.select( + k < current_sort_num, + index_new[k+start], k) body = ib.get() return body @@ -163,7 +185,8 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i return tvm.select(u <= 0.0, 0.0, i / u) - max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + max_threads = int(math.sqrt( + tvm.target.current_target(allow_none=False).max_num_threads)) tx = tvm.thread_axis("threadIdx.x") ty = tvm.thread_axis("threadIdx.y") bx = tvm.thread_axis("blockIdx.x") @@ -186,15 +209,19 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): i = bx * max_threads + tx j = by * max_threads + ty - nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold) + nms_threshold_node = tvm.make.node( + "FloatImm", dtype="float32", value=nms_threshold) nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk) - force_suppress_node = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0) + force_suppress_node = tvm.make.node( + "IntImm", dtype="int32", value=1 if force_suppress else 0) with ib.for_range(0, batch_size, for_type="unroll", name="n") as n: - with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1, - p_valid_count[0] > 0)): + with ib.if_scope( + tvm.all(nms_threshold_node > 0, nms_threshold_node < 1, + p_valid_count[0] > 0)): # Reorder output - nkeep = tvm.select(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]), - nms_topk, p_valid_count[n]) + nkeep = tvm.select( + tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]), + nms_topk, p_valid_count[n]) with ib.if_scope(i < nkeep): with ib.if_scope(j < 6): p_out[(n * num_anchors * 6 @@ -218,10 +245,12 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): p_out[n * num_anchors * 6 + offset_i] == p_out[n * num_anchors * 6 + offset_j])): # When force_suppress == True or class_id equals - iou = calculate_overlap(p_out, n * num_anchors * 6 + offset_i + 2, - n * num_anchors * 6 + offset_j + 2) + iou = calculate_overlap( + p_out, n * num_anchors * 6 + offset_i + 2, + n * num_anchors * 6 + offset_j + 2) with ib.if_scope(iou >= nms_threshold): - p_out[n * num_anchors * 6 + offset_j] = -1.0 + p_out[ + n * num_anchors * 6 + offset_j] = -1.0 with ib.else_scope(): with ib.if_scope(i < p_valid_count[n]): with ib.if_scope(j < 6): @@ -230,12 +259,14 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): # Set invalid entry to be -1 with ib.if_scope(i < num_anchors - p_valid_count[n]): with ib.if_scope(j < 6): - p_out[n * num_anchors * 6 + (i + p_valid_count[n]) * 6 + j] = -1.0 + p_out[n * num_anchors * 6 + (i + + p_valid_count[n]) * 6 + j] = -1.0 body = 
ib.get() return body + @nms.register(["cuda", "gpu"]) -def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): +def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): """Non-maximum suppression operator for object detection. Parameters @@ -269,7 +300,8 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) # An example to use nms dshape = (1, 5, 6) data = tvm.placeholder(dshape, name="data") - valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") + valid_count = tvm.placeholder( + (dshape[0],), dtype="int32", name="valid_count") nms_threshold = 0.7 force_suppress = True nms_topk = -1 @@ -289,10 +321,12 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) valid_count_dtype = "int32" valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4) - data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + data_buf = api.decl_buffer( + data.shape, data.dtype, "data_buf", data_alignment=8) score_axis = 1 score_shape = (batch_size, num_anchors) - score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis], name="score_tensor") + score_tensor = tvm.compute( + score_shape, lambda i, j: data[i, j, score_axis], name="score_tensor") score_tensor_buf = api.decl_buffer(score_tensor.shape, data.dtype, "score_tensor_buf", data_alignment=8) sort_tensor_dtype = "int32" diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index 1f9ce14c23f4..c22e7a513d7d 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements """SSD multibox operators""" from __future__ import absolute_import as _abs import math @@ -8,10 +8,10 @@ import topi -from ..nms import nms from topi.vision.ssd import multibox_prior from topi.vision.ssd import multibox_detection from topi.vision.ssd import multibox_transform_loc +from ..nms import nms def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): """Low level IR routing for multibox_prior operator. @@ -93,7 +93,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): @multibox_prior.register(["cuda", "gpu"]) -def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): +def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), \ + offsets=(0.5, 0.5), clip=False): """Generate prior(anchor) boxes from data, sizes and ratios. 
Parameters @@ -195,7 +196,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, num_anchors = cls_prob.shape[2] ib = tvm.ir_builder.create() - temp_score = ib.allocate('float32', (batch_size * (num_classes -1) * num_anchors, ), name="temp_score", scope="global") + temp_score = ib.allocate('float32', (batch_size * (num_classes -1) * num_anchors, \ + ), name="temp_score", scope="global") score = ib.allocate('float32', (batch_size * num_anchors, ), name="score", scope="local") cls_id = ib.allocate('int32', (batch_size * num_anchors, ), name="id", scope="local") flag = ib.allocate('int32', (batch_size * num_anchors, ), name="flag", scope="global") @@ -217,7 +219,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, j = (tid % (num_anchors * num_classes)) / num_anchors i = tid % num_anchors with ib.if_scope(j > 0): - temp_score[n * num_anchors * num_classes + i * (num_classes - 1) + j-1] = p_cls_prob[tid] + temp_score[n * num_anchors * num_classes + i * (num_classes - 1) + j-1] = \ + p_cls_prob[tid] p_valid_count[n] = 0 with ib.if_scope(tid < batch_size * num_anchors): n = tid / num_anchors @@ -243,8 +246,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, n = tid / num_anchors i = tid % num_anchors with ib.if_scope(cls_id[tid] > 0): - with ib.if_scope(i == 0): - out_base_id = n * num_anchors * 6 + with ib.if_scope(tid == 0): + out_base_idx = n * num_anchors * 6 with ib.else_scope(): out_base_idx = n * num_anchors * 6 + flag[tid - 1] * 6 p_out[out_base_idx] = cls_id[tid] - 1.0 @@ -259,8 +262,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, @multibox_transform_loc.register(["cuda", "gpu"]) -def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, - variances=(0.1, 0.1, 0.2, 0.2)): +def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, + variances=(0.1, 0.1, 0.2, 0.2)): """Location transformation for multibox detection Parameters @@ -313,8 +316,8 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 @multibox_detection.register(["cuda", "gpu"]) -def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, - force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): +def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, + force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): """Convert multibox detection predictions. Parameters @@ -352,6 +355,6 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm 3-D tensor with shape (batch_size, num_anchors, 6) """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, - clip, threshold, variances) + clip, threshold, variances) out = nms(inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) return out diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 19b1db5827d8..c5d94b5ab4de 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -1,11 +1,10 @@ -# pylint: disable=invalid-name, unused-variable, unused-argument +# pylint: disable=invalid-name, unused-variable, unused-argument, no-member """Schedule for vision operators""" from __future__ import absolute_import as _abs import tvm from .. import generic from .. import cpp from .. 
import tag -import topi def _default_schedule(outs): """Default schedule for gpu.""" From e1d67e90ee5c798d394e31b953f6dd52207ab0bb Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Fri, 6 Jul 2018 19:57:40 -0700 Subject: [PATCH 19/19] removing intel gpu folder --- topi/python/topi/intel_gpu/__init__.py | 5 - topi/python/topi/intel_gpu/conv2d.py | 557 ------------------------- 2 files changed, 562 deletions(-) delete mode 100644 topi/python/topi/intel_gpu/__init__.py delete mode 100644 topi/python/topi/intel_gpu/conv2d.py diff --git a/topi/python/topi/intel_gpu/__init__.py b/topi/python/topi/intel_gpu/__init__.py deleted file mode 100644 index 336b1508f977..000000000000 --- a/topi/python/topi/intel_gpu/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# pylint: disable=redefined-builtin, wildcard-import -"""Intel Gen9 GPU specific declaration and schedules.""" -from __future__ import absolute_import as _abs - -from .conv2d import * diff --git a/topi/python/topi/intel_gpu/conv2d.py b/topi/python/topi/intel_gpu/conv2d.py deleted file mode 100644 index c394d79f6744..000000000000 --- a/topi/python/topi/intel_gpu/conv2d.py +++ /dev/null @@ -1,557 +0,0 @@ -# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return -"""conv2d schedule on Intel GPU""" - -from __future__ import absolute_import as _abs - -import numpy as np -import tvm - -from .. import generic -from .. import util -from .. import tag -from ..nn import pad -from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, _get_workload -from ..nn.util import get_pad_tuple -from ..util import simplify - -import nnvm -import nnvm.symbol as sym -from nnvm.top import registry as reg - - -##### SCHEDULE UTILITIES ##### -def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None): - """ tile and bind 3d """ - y_factor = y_factor or z_factor - x_factor = x_factor or y_factor - zo, zi = s[tensor].split(z, z_factor) - yo, yi = s[tensor].split(y, y_factor) - xo, xi = s[tensor].split(x, x_factor) - s[tensor].reorder(zo, yo, xo ,zi, yi, xi) - - thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") - thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") - thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") - s[tensor].bind(zo, tvm.thread_axis("blockIdx.z")) - s[tensor].bind(zi, thread_z) - s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) - s[tensor].bind(yi, thread_y) - s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(xi, thread_x) - return xi, thread_z, thread_y, thread_x - -@conv2d_alter_layout.register(["intel_gpu"]) -def _alter_conv2d_layout(attrs, inputs, tinfos): - copy_inputs = [s for s in inputs] - - data = tinfos[0] - kernel = tinfos[1] - - import ast - padding = ast.literal_eval(attrs['padding']) - stride = ast.literal_eval(attrs['strides']) - - wkl = _get_workload(data, kernel, stride, padding, data.dtype) - oc_bn = 16 - - new_attrs = {k: attrs[k] for k in attrs.keys()} - new_attrs['kernel_layout'] = 'OIHW%do' % (oc_bn) - - return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) - -@conv2d_NCHWc.register(["intel_gpu"]) -def _decl_conv2d(data, kernel, num_filter, kernel_size, stride, padding, out_dtype='float32'): - """Conv2D operator for Intel GPU backend. 
- - Parameters - ---------- - data : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - - kernel : tvm.Tensor - 5-D with shape [num_filter, in_channel, filter_height, filter_width, nnum_filter_vec] - - stride : int or a list/tuple of two ints - stride size, or [stride_height, stride_width] - - padding : int or a list/tuple of two ints - padding size, or [pad_height, pad_width] - - layout : str - layout of data - - Returns - ------- - output : tvm.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" - assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." - - out_dtype = data.dtype - HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) - kernel_shape = util.get_const_tuple(kernel.shape) - if isinstance(stride, (tuple, list)): - HSTR, WSTR = stride - else: - HSTR, WSTR = stride, stride - - return _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype) - -@generic.schedule_conv2d_NCHWc.register(["intel_gpu"]) -def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding, outs): - """Schedule for conv2d_nchw for Intel GPU - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of conv2d_nchw - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for conv2d_nchw. - """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - - def traverse(op): - """inline all one-to-one-mapping operators except the last stage (output)""" - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors: - traverse(tensor.op) - if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag or "1_16" in op.tag: - _schedule_cl_spatialpack_NCHWc(s,op) - - traverse(outs[0].op) - return s - -def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16'): - batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] - num_filter, channel, kernel_h, kernel_w, nv = [util.get_const_int(x) for x in kernel.shape] - num_filter = num_filter * nv - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) - - if isinstance(stride, (tuple, list)): - stride_h, stride_w = stride - else: - stride_h, stride_w = stride, stride - - out_channel = num_filter - out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - oshape = (batch, out_channel, out_height, out_width) - pad_before = [0, 0, pad_top, pad_left] - pad_after = [0, 0, pad_down, pad_right] - temp = pad(data, pad_before, pad_after, name="pad_temp") - - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - - block_w = 0 - block_h = 0 - if stride_h == 2: - if num_filter + kernel_h == 515: - conv_tag = "4_4" - block_h = 4 - block_w = 4 - else: - conv_tag = "4_5" - block_h = 4 - block_w = 5 - elif kernel_h == 3: - if num_filter == 512: - conv_tag = "2_7" - block_h = 2 - block_w = 7 - else: - conv_tag = "2_14" - block_h = 2 - block_w = 14 - else: - conv_tag = "1_16" - block_h = 1 - block_w = 16 - - c_h = out_height - c_w = out_width - - if not out_height % block_h == 0: - c_h = (out_height // block_h + 1) * block_h 
- - if not out_width % block_w == 0: - c_w = (out_width // block_w + 1) * block_w - - cshape = (batch, out_channel // nv, c_h, c_w, nv) - - conv = tvm.compute( - cshape, - lambda nn, ff, yy, xx, vc:\ - tvm.sum( - temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * - kernel[ff, rc, ry, rx, vc].astype(out_dtype), - axis=[rc, ry, rx]), tag=conv_tag, name='conv') - - output = tvm.compute( - oshape, - lambda nn, ff, yy, xx: - conv[nn][ff//nv][yy][xx][ff%nv], - name='output_unpack', tag=conv_tag) - - return output - -def _schedule_cl_spatialpack_NCHWc(s, op): - output = op.output(0) - _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] - - conv = op.input_tensors[0] - temp = s[conv].op.input_tensors[0] - kernel = s[conv].op.input_tensors[1] - temp_W = s.cache_read(temp, "warp", [conv]) - conv_L = s.cache_write(conv, "local") - - kernel_L = s.cache_read(kernel, "local", [conv_L]) - _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] - if "1_16" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 1 - OUTPUT_BLOCK_WIDTH = 16 - elif "2_14" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 2 - OUTPUT_BLOCK_WIDTH = 14 - elif "2_7" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 2 - OUTPUT_BLOCK_WIDTH = 7 - elif "4_5" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 4 - OUTPUT_BLOCK_WIDTH = 5 - elif "4_4" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 4 - OUTPUT_BLOCK_WIDTH = 4 - - # schedule conv - z_factor = 1 - y_factor = 1 - x_factor = 16 - thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") - thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") - thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") - _, co, oh, ow, vc = s[conv].op.axis - ooh, ioh = s[conv].split(oh, factor = OUTPUT_BLOCK_HEIGHT) - oow, iow = s[conv].split(ow, factor = OUTPUT_BLOCK_WIDTH) - s[conv].reorder(_, co, ooh, oow, vc, ioh, iow) - coo, coi = s[conv].split(co, nparts = 1) - ooho, oohi = s[conv].split(ooh, factor = z_factor) - oowo, oowi = s[conv].split(oow, factor = y_factor) - vco, vci = s[conv].split(vc, factor = x_factor) - s[conv].reorder(_, coo, vco, ooho, oowo, coi, oohi, oowi, vci, ioh, iow) - s[conv].bind(oohi, thread_z) - s[conv].bind(oowi, thread_y) - s[conv].bind(vci, thread_x) - s[conv].bind(ooho, tvm.thread_axis("blockIdx.z")) - s[conv].bind(oowo, tvm.thread_axis("blockIdx.y")) - s[conv].bind(coi, tvm.thread_axis("blockIdx.x")) - - # schedule conv_L - s[conv_L].compute_at(s[conv], vci) - i, oc, h, w, vc = s[conv_L].op.axis - rc, ry, rx = s[conv_L].op.reduce_axis - if in_channel == 2048: - rco, rci = s[conv_L].split(rc, nparts = 128) - s[conv_L].unroll(rci) - s[conv_L].reorder(i, oc, rco, rci, ry, rx, vc, h, w) - s[temp_W].compute_at(s[conv_L], rco) - else: - s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) - s[temp_W].compute_at(s[conv_L], rc) - if kernel.shape[3].value != 7: - s[conv_L].unroll(ry) - s[conv_L].unroll(rx) - if kernel.shape[3].value != 7: - s[conv_L].unroll(ry) - s[conv_L].unroll(rx) - - # schedule temp - _, ci, h, w = s[temp].op.axis - tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) - - # schedule temp_W - _, ci, h, w = s[temp_W].op.axis - zo, zi = s[temp_W].split(ci, 1) - yo, yi = s[temp_W].split(h, 1) - xo, xi = s[temp_W].split(w, 16) - s[temp_W].reorder(zo, yo, xo, zi, yi, xi) - s[temp_W].bind(zi, thread_z) - s[temp_W].bind(yi, thread_y) - s[temp_W].bind(xi, thread_x) - s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) - - #schedule kernel - - # schedule kernel_L - if "2_14" in s[conv].op.tag: - s[kernel_L].compute_at(s[conv_L], ry) - else: - 
s[kernel_L].compute_at(s[conv_L], rx) - - # schedule output - if output.op in s.outputs: - out = output - else: - s[output].compute_inline() - out = s.outputs[0] - - _, co, h, w = s[out].op.axis - tile_and_bind3d(s, out, w, h, co, 4, 8, 8) - - -@conv2d.register(["intel_gpu"]) -def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): - """Conv2D operator for Intel GPU backend. - Parameters - ---------- - data : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - kernel : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] - stride : int or a list/tuple of two ints - stride size, or [stride_height, stride_width] - padding : int or a list/tuple of two ints - padding size, or [pad_height, pad_width] - layout : str - layout of data - Returns - ------- - output : tvm.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - assert layout == 'NCHW', "only support NCHW convolution on intel gpu" - assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" - assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." - - out_dtype = data.dtype - HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) - kernel_shape = util.get_const_tuple(kernel.shape) - if isinstance(stride, (tuple, list)): - HSTR, WSTR = stride - else: - HSTR, WSTR = stride, stride - - return _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype) - -@generic.schedule_conv2d_nchw.register(["intel_gpu"]) -def schedule_conv2d_nchw(outs): - """Schedule for conv2d_nchw for Intel GPU - Parameters - ---------- - outs: Array of Tensor - The computation graph description of conv2d_nchw - in the format of an array of tensors. - Returns - ------- - s: Schedule - The computation schedule for conv2d_nchw. 
- """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - - def traverse(op): - """inline all one-to-one-mapping operators except the last stage (output)""" - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors: - traverse(tensor.op) - if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag or "1_16" in op.tag: - _schedule_cl_spatialpack(s,op) - - traverse(outs[0].op) - return s - -def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'): - batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] - num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) - - if isinstance(stride, (tuple, list)): - stride_h, stride_w = stride - else: - stride_h, stride_w = stride, stride - - out_channel = num_filter - out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - oshape = (batch, out_channel, out_height, out_width) - pad_before = [0, 0, pad_top, pad_left] - pad_after = [0, 0, pad_down, pad_right] - temp = pad(data, pad_before, pad_after, name="pad_temp") - - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - - block_w = 0 - block_h = 0 - if stride_h == 2: - if num_filter + kernel_h == 515: - conv_tag = "4_4" - block_h = 4 - block_w = 4 - else: - conv_tag = "4_5" - block_h = 4 - block_w = 5 - elif kernel_h == 3: - if num_filter == 512: - conv_tag = "2_7" - block_h = 2 - block_w = 7 - else: - conv_tag = "2_14" - block_h = 2 - block_w = 14 - else: - conv_tag = "1_16" - block_h = 1 - block_w = 16 - - c_h = out_height - c_w = out_width - - if not out_height % block_h == 0: - c_h = (out_height // block_h + 1) * block_h - - if not out_width % block_w == 0: - c_w = (out_width // block_w + 1) * block_w - - nv = 16 - cshape = (batch, out_channel // nv, c_h, c_w, nv) - kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv) - - kernel_vec = tvm.compute( - kvshape, - lambda co, ci, kh, kw, vc: - kernel[co*nv + vc][ci][kh][kw], name='kernel_vec') - - conv = tvm.compute( - cshape, - lambda nn, ff, yy, xx, vc:\ - tvm.sum( - temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * - kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), - axis=[rc, ry, rx]), tag=conv_tag, name='conv') - - output = tvm.compute( - oshape, - lambda nn, ff, yy, xx: - conv[nn][ff//nv][yy][xx][ff%nv], - name='output_unpack', tag=conv_tag) - - return output - -def _schedule_cl_spatialpack(s, op): - output = op.output(0) - _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] - - conv = op.input_tensors[0] - temp = s[conv].op.input_tensors[0] - kernel_vec = s[conv].op.input_tensors[1] - kernel = s[kernel_vec].op.input_tensors[0] - temp_W = s.cache_read(temp, "warp", [conv]) - conv_L = s.cache_write(conv, "local") - - kernel_L = s.cache_read(kernel_vec, "local", [conv_L]) - _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] - - if "1_16" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 1 - OUTPUT_BLOCK_WIDTH = 16 - elif "2_14" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 2 - OUTPUT_BLOCK_WIDTH = 14 - elif "2_7" in s[conv].op.tag: - 
OUTPUT_BLOCK_HEIGHT = 2 - OUTPUT_BLOCK_WIDTH = 7 - elif "4_5" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 4 - OUTPUT_BLOCK_WIDTH = 5 - elif "4_4" in s[conv].op.tag: - OUTPUT_BLOCK_HEIGHT = 4 - OUTPUT_BLOCK_WIDTH = 4 - - # schedule conv - z_factor = 1 - y_factor = 1 - x_factor = 16 - thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") - thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") - thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") - _, co, oh, ow, vc = s[conv].op.axis - ooh, ioh = s[conv].split(oh, factor = OUTPUT_BLOCK_HEIGHT) - oow, iow = s[conv].split(ow, factor = OUTPUT_BLOCK_WIDTH) - s[conv].reorder(_, co, ooh, oow, vc, ioh, iow) - coo, coi = s[conv].split(co, nparts = 1) - ooho, oohi = s[conv].split(ooh, factor = z_factor) - oowo, oowi = s[conv].split(oow, factor = y_factor) - vco, vci = s[conv].split(vc, factor = x_factor) - s[conv].reorder(_, coo, vco, ooho, oowo, coi, oohi, oowi, vci, ioh, iow) - s[conv].bind(oohi, thread_z) - s[conv].bind(oowi, thread_y) - s[conv].bind(vci, thread_x) - s[conv].bind(ooho, tvm.thread_axis("blockIdx.z")) - s[conv].bind(oowo, tvm.thread_axis("blockIdx.y")) - s[conv].bind(coi, tvm.thread_axis("blockIdx.x")) - - # schedule conv_L - s[conv_L].compute_at(s[conv], vci) - i, oc, h, w, vc = s[conv_L].op.axis - rc, ry, rx = s[conv_L].op.reduce_axis - if in_channel == 2048: - rco, rci = s[conv_L].split(rc, nparts = 128) - s[conv_L].unroll(rci) - s[conv_L].reorder(i, oc, rco, rci, ry, rx, vc, h, w) - s[temp_W].compute_at(s[conv_L], rco) - else: - s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w) - s[temp_W].compute_at(s[conv_L], rc) - if kernel.shape[3].value != 7: - s[conv_L].unroll(ry) - s[conv_L].unroll(rx) - - # schedule temp - _, ci, h, w = s[temp].op.axis - tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) - - # schedule temp_W - _, ci, h, w = s[temp_W].op.axis - zo, zi = s[temp_W].split(ci, 1) - yo, yi = s[temp_W].split(h, 1) - xo, xi = s[temp_W].split(w, 16) - s[temp_W].reorder(zo, yo, xo, zi, yi, xi) - s[temp_W].bind(zi, thread_z) - s[temp_W].bind(yi, thread_y) - s[temp_W].bind(xi, thread_x) - s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) - - s[kernel_vec].compute_inline() - - # schedule kernel_L - if "2_14" in s[conv].op.tag: - s[kernel_L].compute_at(s[conv_L], ry) - else: - s[kernel_L].compute_at(s[conv_L], rx) - - # schedule output - if output.op in s.outputs: - out = output - else: - s[output].compute_inline() - out = s.outputs[0] - - _, co, h, w = s[out].op.axis - tile_and_bind3d(s, out, w, h, co, 4, 8, 8)
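
For reference, the following minimal Python sketch (not part of either patch; the helper names select_output_block / padded_extent and the sample workload are illustrative only) reproduces the output-block selection and rounding logic that _decl_cl_spatialpack and _decl_cl_spatialpack_NCHWc apply before building the packed compute:

# Standalone sketch of the block selection used by the spatial-pack declarations.
# The helper names are hypothetical; the decision table mirrors the
# stride/kernel/filter checks in the deleted conv2d.py above.
def select_output_block(num_filter, kernel_h, stride_h):
    """Return (conv_tag, block_h, block_w) the way the declarations above would."""
    if stride_h == 2:
        if num_filter + kernel_h == 515:  # e.g. 512 filters with a 3x3 kernel
            return "4_4", 4, 4
        return "4_5", 4, 5
    if kernel_h == 3:
        if num_filter == 512:
            return "2_7", 2, 7
        return "2_14", 2, 14
    return "1_16", 1, 16

def padded_extent(extent, block):
    """Round an output extent up to the next multiple of the block size."""
    return extent if extent % block == 0 else (extent // block + 1) * block

# Example workload: 256 filters, 3x3 kernel, stride 1, 28x28 output.
tag, block_h, block_w = select_output_block(num_filter=256, kernel_h=3, stride_h=1)
c_h = padded_extent(28, block_h)
c_w = padded_extent(28, block_w)
print(tag, block_h, block_w, c_h, c_w)  # 2_14 2 14 28 28

The conv_tag picked here is what the schedules' traverse() later matches on ("4_4", "4_5", "2_7", "2_14", "1_16"), and the rounded c_h/c_w extents size the packed (batch, out_channel // 16, c_h, c_w, 16) conv buffer that the output_unpack stage reshapes back to NCHW.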