[ARM_CPU] Conv2d int8 intrinsic for cortex-A72 #10310

Merged: 6 commits, Feb 23, 2022
2 changes: 2 additions & 0 deletions python/tvm/relay/backend/te_compiler.py
@@ -171,6 +171,8 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True)
        The best op implementation and the corresponding output tensors.
    """
    all_impls = get_valid_implementations(op, attrs, inputs, out_type, target)
+    if len(all_impls) == 0:
+        raise RuntimeError(f"No valid {op} implementations for {target}")
    best_plevel_impl = max(all_impls, key=lambda x: x.plevel)

    # Disable autotvm if auto_scheduler is enabled.
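For context, a quick illustration of the failure mode the new guard avoids: without it, an empty implementation list would flow into the max() call below and raise a generic ValueError that names neither the op nor the target. The snippet is illustrative only and uses stand-in values rather than any TVM API.

try:
    max([], key=lambda impl: impl.plevel)
except ValueError as err:
    print(err)  # "max() arg is an empty sequence" -- far less actionable than the new RuntimeError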
36 changes: 25 additions & 11 deletions python/tvm/relay/op/strategy/arm_cpu.py
@@ -94,12 +94,18 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
name="conv2d_nchw_spatial_pack.arm_cpu",
)

# Intel x86 conv2d schedule.
strategy.add_implementation(
wrap_compute_conv2d(topi.x86.conv2d_nchw),
wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
name="conv2d_nchw.x86",
)
if topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype):
strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_int8),
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_int8),
name="conv2d_nchw_int8.arm_cpu",
)
else:
strategy.add_implementation(
wrap_compute_conv2d(topi.x86.conv2d_nchw),
wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
name="conv2d_nchw.x86",
)

# check if winograd algorithm is applicable
_, _, kh, kw = get_const_tuple(kernel.shape)
@@ -256,11 +262,19 @@
def conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
    """conv2d_NCHWc adopted from x86"""
    strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True),
-        wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc),
-        name="conv2d_NCHWc.x86",
-    )
+    data, kernel = inputs
+    if topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype):
+        strategy.add_implementation(
+            wrap_compute_conv2d(topi.arm_cpu.conv2d_NCHWc_int8, True, True),
+            wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NCHWc_int8),
+            name="conv2d_NCHWc_int8.arm_cpu",
+        )
+    else:
+        strategy.add_implementation(
+            wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True),
+            wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc),
+            name="conv2d_NCHWc.x86",
+        )
    return strategy


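A hedged sketch of the dispatch condition used in both strategies above: the arm_cpu int8 implementations are only registered when data and kernel are both int8 and the current target reports hardware support. The target string below is illustrative, and the result also depends on the local LLVM build.

import tvm
from tvm import topi

with tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+neon"):
    # int8 data and kernel -> the conv2d_nchw_int8.arm_cpu / conv2d_NCHWc_int8.arm_cpu path is added
    print(topi.arm_cpu.is_int8_hw_support("int8", "int8"))
    # any other dtype combination falls back to the x86 schedules
    print(topi.arm_cpu.is_int8_hw_support("float32", "float32"))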
6 changes: 6 additions & 0 deletions python/tvm/topi/arm_cpu/arm_utils.py
@@ -64,6 +64,12 @@ def is_aarch64_arm():
return "aarch64" in target.attrs.get("mtriple", "")


def is_neon_available():
"""Check if neon instructions are available"""
target = tvm.target.Target.current(allow_none=False)
return "+neon" in target.mattr


def get_tiling_B_interleaved_t(interleave_A):
"""Compute the tiling information for matrix B', where B'
is the transposed and interleaved version of matrix B in C=A*B.
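Usage sketch for the new helper (the target strings are illustrative): the feature flags come straight from the target's -mattr list, so "+neon" has to be requested explicitly for is_neon_available() to return True.

import tvm
from tvm.topi.arm_cpu.arm_utils import is_neon_available

with tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+neon,+v8.2a"):
    print(is_neon_available())  # True: "+neon" is in target.mattr
with tvm.target.Target("llvm -mtriple=aarch64-linux-gnu"):
    print(is_neon_available())  # False: no "+neon" flag was requested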
105 changes: 103 additions & 2 deletions python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -24,10 +24,12 @@
from tvm import relay
from tvm import autotvm

-from ..nn import conv2d_alter_layout
+from ..nn import conv2d_alter_layout, conv2d_legalize
from ..utils import get_const_tuple
from ..x86.conv2d import _get_default_config as _get_x86_default_config
+from .conv2d_int8 import is_int8_hw_support
from .arm_utils import get_tiling_B_interleaved_t
+from ..generic.conv2d import conv2d_alter_int8_common

logger = logging.getLogger("topi")

@@ -257,7 +259,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
assert data_layout == "NCHW" and kernel_layout == "OIHW"
if cfg.is_fallback:
_get_x86_default_config(
cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False, data_layout
cfg,
data_tensor,
kernel_tensor,
strides,
padding,
dilation,
out_dtype,
False,
data_layout,
)
batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
@@ -333,6 +343,57 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs)

+    if topi_tmpl == "conv2d_NCHWc_int8.arm_cpu":
+        assert data_layout == "NCHW" and kernel_layout == "OIHW"
+        if cfg.is_fallback:
+            _get_default_config_int8(
+                cfg,
+                data_tensor,
+                kernel_tensor,
+                strides,
+                padding,
+                dilation,
+                out_dtype,
+                False,
+                data_layout,
+            )
+
+        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
+        out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
+        ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+        n_elems = 8
+
+        # update new attrs
+        new_attrs["channels"] = out_channel
+        new_attrs["data_layout"] = "NCHW%dc" % ic_bn
+        new_attrs["kernel_layout"] = "OIHW{:n}i{:n}o{:n}i".format(ic_bn // n_elems, oc_bn, n_elems)
+        new_attrs["out_layout"] = "NCHW%dc" % oc_bn
+
+        # Store altered operator's config.
+        new_data = te.placeholder(
+            (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
+        )
+        new_kernel = te.placeholder(
+            (out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn // n_elems, oc_bn, n_elems),
+            dtype=kernel_dtype,
+        )
+        new_workload = autotvm.task.args_to_workload(
+            [
+                new_data,
+                new_kernel,
+                strides,
+                padding,
+                dilation,
+                new_attrs["data_layout"],
+                new_attrs["out_layout"],
+                out_dtype,
+            ],
+            topi_tmpl,
+        )
+        dispatch_ctx.update(target, new_workload, cfg)
+        return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)
+
    if topi_tmpl == "conv2d_NHWC_quantized_interleaved.arm_cpu":
        assert data_layout == "NHWC" and kernel_layout == "HWIO"
        KH, KW, _, OC = get_const_tuple(kernel.shape)
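A worked example of the layout strings built in the conv2d_NCHWc_int8.arm_cpu branch above; the tile sizes are illustrative, the real ic_bn/oc_bn come from the AutoTVM config, and n_elems is fixed at 8.

ic_bn, oc_bn, n_elems = 8, 16, 8  # illustrative tile sizes
print("NCHW%dc" % ic_bn)                                               # data_layout:   NCHW8c
print("OIHW{:n}i{:n}o{:n}i".format(ic_bn // n_elems, oc_bn, n_elems))  # kernel_layout: OIHW1i16o8i
print("NCHW%dc" % oc_bn)                                               # out_layout:    NCHW16c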
@@ -365,3 +426,43 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
            inputs[0], new_kernel_expr, **new_attrs
        )
    return None
+
+
+@conv2d_legalize.register("arm_cpu")
+def _conv2d_legalize(attrs, inputs, arg_types):
+    """Legalizes Conv2D op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    arg_types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr
+    """
+    # Collect the input tensors.
+    data_tensor, kernel_tensor = arg_types[0], arg_types[1]
+    data_dtype = data_tensor.dtype
+    kernel_dtype = kernel_tensor.dtype
+
+    # Collect the output tensor.
+    output_tensor = arg_types[2]
+
+    # Collect the input exprs.
+    data, kernel = inputs
+
+    # ARM vector instructions operate on the same dtype for data and kernel;
+    # we pass the kernel dtype here and conv2d_alter_int8_common converts the
+    # data to the correct datatype.
+    if is_int8_hw_support(kernel_dtype, kernel_dtype):
+        # ARM intrinsics need the datatypes of data and kernel to be the same
+        return conv2d_alter_int8_common(
+            data, data_tensor, kernel, kernel_tensor, output_tensor, attrs, kernel_dtype, 8, 8
+        )
+    return None
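A hedged end-to-end sketch of what this legalization does; shapes, channel counts, and the target string are illustrative, and the exact rewritten graph depends on the TVM build. A mixed uint8 x int8 conv2d is rewritten so both operands use the int8 dtype expected by the ARM intrinsics.

import tvm
from tvm import relay

data = relay.var("data", shape=(1, 32, 56, 56), dtype="uint8")
weight = relay.var("weight", shape=(32, 32, 3, 3), dtype="int8")
conv = relay.nn.conv2d(
    data, weight, kernel_size=(3, 3), channels=32, padding=(1, 1), out_dtype="int32"
)
mod = tvm.IRModule.from_expr(relay.Function([data, weight], conv))

seq = tvm.transform.Sequential([relay.transform.InferType(), relay.transform.Legalize()])
with tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+neon"):
    print(seq(mod))  # the data operand is cast/shifted so data and weight are both int8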