From 8c955008b44daf2ccfc9f07c3eb4d5cfc3e5ad64 Mon Sep 17 00:00:00 2001 From: nverke Date: Tue, 2 Aug 2022 21:53:27 +0000 Subject: [PATCH 1/3] Add tests for 2 operators that utilize Qualcomm HVX intrinsics. --- .../python/contrib/test_hexagon/conv_uint8.py | 159 ++++++++++++++ .../test_hexagon/conv_uint8_hvx_intrin.py | 138 +++++++++++++ .../test_hexagon/mmul_unit8_hvx_intrin.py | 189 +++++++++++++++++ .../test_hexagon/quantization_utils.py | 44 ++++ .../test_hexagon/test_conv_hvx_intrinsics.py | 142 +++++++++++++ .../test_hexagon/test_mmul_hvx_intrinsics.py | 195 ++++++++++++++++++ 6 files changed, 867 insertions(+) create mode 100644 tests/python/contrib/test_hexagon/conv_uint8.py create mode 100644 tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py create mode 100644 tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py create mode 100644 tests/python/contrib/test_hexagon/quantization_utils.py create mode 100644 tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py create mode 100644 tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py diff --git a/tests/python/contrib/test_hexagon/conv_uint8.py b/tests/python/contrib/test_hexagon/conv_uint8.py new file mode 100644 index 000000000000..264e6f77bbdf --- /dev/null +++ b/tests/python/contrib/test_hexagon/conv_uint8.py @@ -0,0 +1,159 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import tvm + +from numbers import Integral +from tvm import te + +def get_const_int(expr): + """Verifies expr is integer and get the constant value. + + Parameters + ---------- + expr : tvm.Expr or int + The input expression. + + Returns + ------- + out_value : int + The output. + """ + if isinstance(expr, Integral): + return expr + if not isinstance(expr, tvm.tir.IntImm): + ana = tvm.arith.Analyzer() + expr = ana.simplify(expr) + if not isinstance(expr, tvm.tir.IntImm): + raise ValueError("Expect value to be constant int") + return int(expr.value) + +def get_const_tuple(in_tuple): + """Verifies input tuple is IntImm or Var, returns tuple of int or Var. + + Parameters + ---------- + in_tuple : tuple of Expr + The input. + + Returns + ------- + out_tuple : tuple of int + The output. + """ + ret = [] + ana = None + for elem in in_tuple: + if isinstance(elem, (tvm.tir.Var, tvm.tir.expr.Any)): + ret.append(elem) + elif not isinstance(elem, (tvm.tir.IntImm, int)): + ana = tvm.arith.Analyzer() if ana is None else ana + elem = ana.simplify(elem) + if not isinstance(elem, tvm.tir.IntImm): + ret.append(elem) + else: + ret.append(get_const_int(elem)) + else: + ret.append(get_const_int(elem)) + return tuple(ret) + +def Pad(Input, padding): + batch, in_height, in_width, in_channel = Input.shape + return te.compute( + (batch, in_height + 2 * padding, in_width + 2 * padding, in_channel), + lambda nn, yy, xx, cc: tvm.tir.if_then_else( + te.all( + yy >= padding, + yy - padding < in_height, + xx >= padding, + xx - padding < in_width, + ), + Input[nn, yy - padding, xx - padding, cc], + tvm.tir.const(0, Input.dtype), + ), + name="Apad", + ) + +def schedule_qconv2d_nhwc(outs, target, device): + s = te.create_schedule([x.op for x in outs]) + x = outs[0] + nn, yy, xx, cc = s[x].op.axis + px1, px2 = s[x].split(nn, nparts=1) + return s + +def qconv2d_nhwc(Input, in_offset, Filter, filt_offset, stride, padding, out_dtype=None): + if out_dtype is None: + out_dtype = Input.dtype + + batch, in_height, in_width, in_channel = Input.shape + filt_height, filt_width, _, num_filter = Filter.shape + # Input is already padded. No need to add padding while computing + # out_height and out_width. + out_height = (in_height - filt_height) // stride + 1 + out_width = (in_width - filt_width) // stride + 1 + out_channel = num_filter + + rc = te.reduce_axis((0, in_channel), name="rc") + ry = te.reduce_axis((0, filt_height), name="ry") + rx = te.reduce_axis((0, filt_width), name="rx") + + return te.compute( + (batch, out_height, out_width, out_channel), + lambda nn, yy, xx, ff: te.sum( + (Input[nn, yy * stride + ry, xx * stride + rx, rc] - in_offset) + * (Filter[ry, rx, rc, ff] - filt_offset).astype(out_dtype), + axis=[rc, ry, rx], + ), + tag="qconv2d_nhwc", + ) + + +def run_conv_te(hexagon_session, a, w, a_offset, w_offset, padding): + + # Input tensor size + A = te.placeholder(a.shape, name="A", dtype="uint8") + W = te.placeholder(w.shape, name="W", dtype="uint8") + + # Pad input and create computation for quantized conv2d + Apad = Pad(A, padding) + B = qconv2d_nhwc(Apad, a_offset, W, w_offset, 1, padding) + target_hexagon = tvm.target.hexagon("v68", link_params=True) + device = hexagon_session.device + s = schedule_qconv2d_nhwc([B], target_hexagon, device) + nn, yy, xx, cc = s[B].op.axis + yo, yi = s[B].split(yy, nparts=1) + s[Apad].compute_at(s[B], yi) + s[B].vectorize(cc) + + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), device=device) + func_te = tvm.build(s, [A, W, B], target=tvm.target.Target(target_hexagon, host=target_hexagon), name="quant_conv2d") + + module_te = hexagon_session.load_module(func_te) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) + w_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) + + module_te(a_hexagon, w_hexagon, b_hexagon) + evaluator = module_te.time_evaluator(module_te.entry_name, hexagon_session.device, number=1, repeat=1) + mean_ms = evaluator(a_hexagon, w_hexagon, b_hexagon).mean * 1000 + + out = b_hexagon.numpy() + + return out, mean_ms + \ No newline at end of file diff --git a/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py new file mode 100644 index 000000000000..44154bf29767 --- /dev/null +++ b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import math + +from tvm.script import tir as T + +def get_conv_uint8_hvx_intrin(input_shape, kernel_shape, a_offset, w_offset, mem_scope): + VRMPY_WIDTH = 128 + + batches, input_size, _, in_c = input_shape + w_size, _, _, filters = kernel_shape + out_size = input_size + + input_padding = w_size // 2 + + # For this usage of vrmpy it loads 4 bytes for vv from the kernel. In order + # for this implementation to not mix output data there will need to be kernel + # padding to round to the nearest multiple of 4. + kernel_width_padding = 4 - w_size % 4 + padded_kernel_width = w_size + kernel_width_padding + + # vrmpy buffer loads are always 128B and will go out of bounds for the + # implementation written here if there is not sufficient padding. This + # means that for this implementation it must always be a multiple of 128 + # and have the standard padding and the padding needed for the kernel + # window (4) + if (input_size % VRMPY_WIDTH != 0): + input_width_padding = (VRMPY_WIDTH - (input_size) % VRMPY_WIDTH) + input_padding + kernel_width_padding + else: + input_width_padding = input_padding + kernel_width_padding + + padded_input_height = input_size + 2 * input_padding + padded_input_width = input_size + input_padding + input_width_padding + + # vrmpy output buffer loads will go out of bounds for this implementation + # if there is not proper padding. + padded_output_width = VRMPY_WIDTH * (padded_input_width // VRMPY_WIDTH) + 3 + + # The number of vrmpy loads (128B) needed to complete a horizontal frame of the input. + w_steps = math.ceil(input_size / VRMPY_WIDTH) + + # The number of vrmpy loads (4B) needed to complete a horizontal frame of the kernel. + kw_steps = math.ceil(w_size / 4) + + @T.prim_func + def conv2d_vrmpy(a: T.handle, w: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A_local = T.match_buffer(a, [T.cast(padded_input_height, dtype="int32") * T.cast(padded_input_width, dtype="int32")], dtype="uint8", offset_factor=1, scope=mem_scope) + W_local = T.match_buffer(w, [T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")], dtype="uint8", offset_factor=1, scope=mem_scope) + C_local = T.match_buffer(c, [T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")], dtype="int32", offset_factor=1, scope=mem_scope) + with T.block("root"): + T.reads(A_local[0: T.cast(padded_input_height, dtype="int32") * T.cast(padded_input_width, dtype="int32")], W_local[0: T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")]) + T.writes(C_local[0: T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")]) + for y, x_o, x_i, rx_o, ry in T.grid(input_size, w_steps, 4, kw_steps, w_size): + C_local[T.ramp(y * T.cast(padded_output_width, dtype="int32") + x_o * 128 + x_i, 4, 32)] += T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"), + T.uint32(2), + T.reinterpret(A_local[T.ramp((y + ry) * T.cast(padded_input_width, dtype="int32") + x_o * 128 + 4 * rx_o + x_i, 1, 128)], dtype = "int32x32"), + T.reinterpret(W_local[T.ramp(ry * T.cast(padded_kernel_width, dtype="int32") + rx_o * 4, 1, 4)], dtype = "int32"), + dtype="int32x32" + ) + + @T.prim_func + def conv2d_vrmpy_desc(a: T.handle, w: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A_local = T.match_buffer(a, [padded_input_height, padded_input_width], dtype="uint8", offset_factor=1, scope=mem_scope) + W_local = T.match_buffer(w, [w_size, padded_kernel_width], dtype="uint8", offset_factor=1, scope=mem_scope) + C_local = T.match_buffer(c, [out_size, padded_output_width], dtype="int32", offset_factor=1, scope=mem_scope) + with T.block("root"): + for y, x, ry, rx in T.grid(input_size, input_size, w_size, w_size): + with T.block("C"): + y, x, ry, rx = T.axis.remap("SSRR", [y, x, ry, rx]) + C_local[y, x] = C_local[y, x] + T.cast(A_local[y + ry, x + rx], "int32") * T.cast(W_local[ry, rx], "int32") + + + @T.prim_func + def operator(a: T.handle, w: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [batches, input_size, input_size, in_c], dtype="uint8", offset_factor=1) + W = T.match_buffer(w, [w_size, w_size, in_c, filters], dtype="uint8", offset_factor=1) + C = T.match_buffer(c, [batches, out_size, out_size, filters], dtype="int32", offset_factor=1) + A_local = T.alloc_buffer([batches, padded_input_height, padded_input_width, in_c], dtype="uint8", scope=mem_scope) + W_local = T.alloc_buffer([w_size, padded_kernel_width, in_c, filters], dtype="uint8", scope=mem_scope) + C_local = T.alloc_buffer([batches, filters, out_size, padded_output_width], dtype="int32", scope=mem_scope) + with T.block("root"): + for n, y, x, c in T.grid(batches, padded_input_height, padded_input_width, in_c): + with T.block("A_local"): + nn, yy, xx, cc = T.axis.remap("SSSS", [n, y, x, c]) + T.reads(A[nn, yy - T.cast(input_padding, dtype="int32"), xx - T.cast(input_padding, dtype="int32"), cc]) + T.writes(A_local[nn, yy, xx, cc]) + A_local[nn, yy, xx, cc] = T.if_then_else( + T.cast(input_padding, dtype="int32") <= yy and yy < T.cast(padded_input_height, dtype="int32") - T.cast(input_padding, dtype="int32") and + T.cast(input_padding, dtype="int32") <= xx and xx < T.cast(padded_input_width, dtype="int32") - T.cast(input_width_padding, dtype="int32"), + A[nn, yy - T.cast(input_padding, dtype="int32"), xx - T.cast(input_padding, dtype="int32"), cc] - T.cast(a_offset, dtype="uint8"), + T.uint8(0), + dtype="uint8" + ) + for y, x, c, f in T.grid(w_size, padded_kernel_width, in_c, filters): + with T.block("W_local"): + yy, xx, cc, ff = T.axis.remap("SSSS", [y, x, c, f]) + T.reads(W[yy, xx, cc, ff]) + T.writes(W_local[yy, xx, cc, ff]) + W_local[yy, xx, cc, ff] = T.if_then_else( + xx < T.cast(padded_kernel_width, dtype="int32") - T.cast(kernel_width_padding, dtype="int32"), + W[yy, xx, cc, ff] - T.cast(w_offset, dtype="uint8"), + T.uint8(0), + dtype="uint8" + ) + for n, f, y, x in T.grid(batches, filters, out_size, padded_output_width): + with T.block("C_local_init"): + n, f, y, x = T.axis.remap("SSSS", [n, f, y, x]) + C_local[n, f, y, x] = 0 + for n, f, y, x, ry, rx, rc in T.grid(batches, filters, input_size, input_size, w_size, w_size, in_c): + with T.block("C"): + n, f, y, x, ry, rx, rc = T.axis.remap("SSSSRRR", [n, f, y, x, ry, rx, rc]) + C_local[n, f, y, x] = C_local[n, f, y, x] + T.cast(A_local[n, y + ry, x + rx, rc], "int32") * T.cast(W_local[ry, rx, rc, f], "int32") + for n, f, y, x in T.grid(batches, filters, out_size, out_size): + with T.block("C_local"): + n, f, y, x = T.axis.remap("SSSS", [n, f, y, x]) + C[n, y, x, f] = C_local[n, f, y, x] + + return conv2d_vrmpy_desc, conv2d_vrmpy, operator + \ No newline at end of file diff --git a/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py new file mode 100644 index 000000000000..5d8c5ed328a8 --- /dev/null +++ b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py @@ -0,0 +1,189 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from tvm.script import tir as T + +w_spit__b = "llvm.hexagon.S2.vsplatrb" # Q6_R_vsplatb_R +v_spit__w = "llvm.hexagon.V6.lvsplatw.128B" +v_rmpy__uv_uw_acc = "llvm.hexagon.V6.vrmpyub.acc.128B" +v_rmpy__uv_uw = "llvm.hexagon.V6.vrmpyub.128B" +v_sub = "llvm.hexagon.V6.vsubw.128B" + +def get_mm_uint8_intrin(in_m, in_n, in_k): + blocks = in_k // 32 + unrolled_rows = in_m // 16 + + @T.prim_func + def mm_uint8_intrinsic(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle): + A = T.match_buffer(a, [T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], dtype="uint8") + B = T.match_buffer(b, [T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="uint8") + C = T.match_buffer(c, [T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="int32") + OFFSETS = T.match_buffer(offsets, [2], dtype="uint8") + with T.block("root"): + T.reads(A[0: T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], B[0: T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], OFFSETS[0:2]) + T.writes(C[0: T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")]) + for i in T.serial(in_n): + for s in T.serial(blocks): + C[T.ramp(((s * 32) + (i * T.cast(in_k, dtype="int32"))), 1, 32)] = T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), T.uint32(1), (( T.cast(OFFSETS[0], dtype="int32") * T.cast(OFFSETS[1], dtype="int32")) * T.cast(in_m, dtype="int32")), dtype="int32x32") + for blok, ro in T.grid(blocks, unrolled_rows): + b_offset = T.cast(OFFSETS[1], dtype="int32") + a_offset = T.cast(OFFSETS[0], dtype="int32") + out_index = blok * 32 + i * T.cast(in_k, dtype="int32") + + B_index_unrolled = blok * 128 + (ro * 16 * T.cast(in_k, dtype="int32")) + B_index_unrolled_2 = blok * 128 + (ro * 16 + 4) * T.cast(in_k, dtype="int32") + B_index_unrolled_3 = blok * 128 + (ro * 16 + 8) * T.cast(in_k, dtype="int32") + B_index_unrolled_4 = blok * 128 + (ro * 16 + 12) * T.cast(in_k, dtype="int32") + + A_index_unrolled = ro * 16 + i * T.cast(in_m, dtype="int32") + A_index_unrolled_2 = A_index_unrolled + 4 + A_index_unrolled_3 = A_index_unrolled + 8 + A_index_unrolled_4 = A_index_unrolled + 12 + + a_b_vrmpy_accumulation_unrolled = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), # instruction + T.uint32(3), # number of inputs + C[T.ramp(out_index, 1, 32)], # accumulation location + T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype = "int32x32"), # 32 4 byte inputs (Vu) to vrmpy + T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype = "int32"), # 4 byte input (Rt) to vrmpy + dtype = "int32x32" # output datatype + ) + + a_b_vrmpy_accumulation_unrolled_1 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_vrmpy_accumulation_unrolled, + T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype = "int32x32"), + T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype = "int32"), + dtype = "int32x32" + ) + + a_b_vrmpy_accumulation_unrolled_2 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_vrmpy_accumulation_unrolled_1, + T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype = "int32x32"), + T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype = "int32"), + dtype = "int32x32" + ) + + la_b_vrmpy_accumulation_unrolled_3 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_vrmpy_accumulation_unrolled_2, + T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype = "int32x32"), + T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_b = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"), + T.uint32(2), + T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), T.uint32(1), a_offset, dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_a = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_b, + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype = "int32"),dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_b1 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_a, + T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_a1 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_b1, + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype = "int32"),dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_b2 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_a1, + T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_a2 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_b2, + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype = "int32"),dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_b3 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_a2, + T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_a3 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_b3, + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), T.uint32(1), T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype = "int32"), dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), T.uint32(1), b_offset, dtype = "int32"), + dtype = "int32x32" + ) + + C[T.ramp(out_index, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vsubw.128B"), + T.uint32(2), + la_b_vrmpy_accumulation_unrolled_3, + a_b_offsets_vrmpy_accumulation_unrolled_a3, + dtype = "int32x32" + ) + + + @T.prim_func + def mmul_desc(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [in_n, in_m], dtype="uint8") + B = T.match_buffer(b, [in_m, in_k], dtype="uint8") + C = T.match_buffer(c, [in_n, in_k], dtype="int32") + OFFSETS = T.match_buffer(offsets, [2], dtype="uint8") + # body + with T.block("root"): + for i0, i1, i2 in T.grid(in_m, in_n, in_k): + with T.block("C"): + y, x, j = T.axis.remap("SSR", [i0, i1, i2]) + C[y, x] = C[y, x] + T.cast(A[y, j] - OFFSETS[0], "int32") * T.cast(B[j, x] - OFFSETS[1], "int32") + + return mmul_desc, mm_uint8_intrinsic + + diff --git a/tests/python/contrib/test_hexagon/quantization_utils.py b/tests/python/contrib/test_hexagon/quantization_utils.py new file mode 100644 index 000000000000..3ebcfe3a805e --- /dev/null +++ b/tests/python/contrib/test_hexagon/quantization_utils.py @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +def quantize_uint8(val, minval, maxval): + range = max(0.0001, maxval - minval) + resize_amt = 255.0 / range + value_f = (val - minval) * resize_amt + value_i = round(value_f, 8) + if value_i < 0: + return 0 + elif value_i > 255: + return 255 + else: + return int(value_i) + + +def dequantize(val, minval, maxval): + range = max(0.0001, maxval - minval) + stepsize = range / 4294967296 + return val * stepsize + + +def quantize_array(in_f, size): + in_q = [] + # 0 must lie in interval [min,max] for quantization to work correctly. + in_min = min(0, min(in_f)) + in_max = max(0, max(in_f)) + for i in range(size): + in_q.append(quantize_uint8(in_f[i], in_min, in_max)) + return in_q, in_min, in_max \ No newline at end of file diff --git a/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py new file mode 100644 index 000000000000..0ab7cbd734a0 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py @@ -0,0 +1,142 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import tvm.testing +import tvm.topi.testing + +from numpy.random import default_rng +from tvm.tir.function import TensorIntrin + +from tests.python.contrib.test_hexagon.conv_uint8 import get_const_tuple, run_conv_te +from tests.python.contrib.test_hexagon.conv_uint8_hvx_intrin import get_conv_uint8_hvx_intrin +from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8 + +class TestConvHVX: + + def create_inputs(input_shape, filter_shape, mem_scope): + + w_size, _, _, _ = filter_shape + input_padding = w_size // 2 + + rng = default_rng() + a = rng.integers(1, 255, input_shape, dtype="uint8") + w = rng.integers(1, 8, filter_shape, dtype="uint8") + + a_q, a_min, a_max = quantize_array(a.reshape(a.size), a.size) + w_q, b_min, b_max = quantize_array(w.reshape(w.size), w.size) + + a_q = np.array(a_q, dtype="uint8").reshape(input_shape) + w_q = np.array(w_q, dtype="uint8").reshape(filter_shape) + + a_offset = quantize_uint8(0.0, a_min, a_max) + w_offset = quantize_uint8(0.0, b_min, b_max) + + a_f = np.array(a_q, dtype="uint8").reshape(get_const_tuple(a.shape)) + w_f = np.array(w_q, dtype="uint8").reshape(get_const_tuple(w.shape)) + expected_output = tvm.topi.testing.conv2d_nhwc_python(a_f, w_f, 1, input_padding).astype("int32") + + return a_q, w_q, a_offset, w_offset, expected_output, mem_scope + + + a, w, a_offset, w_offset, expected_output, mem_scope = tvm.testing.parameters( + (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "local")), + (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global")), + (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global.vtcm")), + (create_inputs((1, 128, 128, 3), (7, 7, 3, 1), "local")), + (create_inputs((1, 128, 128, 3), (5, 5, 3, 1), "local")), + (create_inputs((1, 128, 128, 3), (3, 3, 3, 1), "local")), + (create_inputs((4, 128, 128, 1), (3, 3, 1, 4), "local")), + (create_inputs((2, 32, 32, 32), (7, 7, 32, 2), "local")), + (create_inputs((2, 34, 34, 29), (5, 5, 29, 2), "local")), + (create_inputs((1, 512, 512, 1), (9, 9, 1, 1), "local")), + ) + + @tvm.testing.requires_hexagon + def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope): + + #TODO even sized kernels and stride are currently not working. + + batches, input_size, _, in_c = a.shape + w_size, _, _, filters = w.shape + + out_height = (input_size - w_size + 2 * (w_size // 2)) + 1 + out_width = (input_size - w_size + 2 * (w_size // 2)) + 1 + out_shape = (batches, out_height, out_width, filters) + c = np.zeros(out_shape, dtype="int32") + + conv2d_vrmpy_description, conv2d_vrmpy_intrinsic, conv2d_operator = get_conv_uint8_hvx_intrin(a.shape, w.shape, a_offset, w_offset, mem_scope) + + intrin_name = "conv2d.uint8_{}x{}x{}x{}_{}".format(input_size, input_size, w_size, w_size, mem_scope) + try: + TensorIntrin.register(intrin_name, conv2d_vrmpy_description, conv2d_vrmpy_intrinsic) + except: + print("Intrinsic already registered.") + + ir_module = conv2d_operator + sch = tvm.tir.Schedule(ir_module, debug_mask="all") + + block = sch.get_block("C") + + w_block_local = sch.get_block("W_local") + sch.transform_layout(w_block_local, buffer=("write", 0), index_map=lambda h, w, c, f: (f, c, h, w)) + + a_block_local = sch.get_block("A_local") + sch.transform_layout(a_block_local, buffer=("write", 0), index_map=lambda b, h, w, c: (b, c, h, w)) + + n, f, y, x, ry, rx, rc = sch.get_loops(block) + sch.reorder(n, f, rc, y, x, ry, rx) + + sch.tensorize(y, intrin_name) + + target_hexagon = tvm.target.hexagon("v68", link_params=True) + + A = tvm.tir.decl_buffer(a.shape, name="A", dtype="uint8") + W = tvm.tir.decl_buffer(w.shape, name="W", dtype="uint8") + C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32") + + func_tir = tvm.build( + sch.mod, [A, W, C], tvm.target.Target(target_hexagon, host=target_hexagon), name="hvx_op" + ) + + module = hexagon_session.load_module(func_tir) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) + w_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + + module(a_hexagon, w_hexagon, c_hexagon) + out = c_hexagon.numpy() + out = out[:,:,:out_width,:] + + tvm.testing.assert_allclose(out, expected_output) + + timer = module.time_evaluator(module.entry_name, hexagon_session.device, number=1, repeat=1) + time_ms = timer(a_hexagon, w_hexagon, c_hexagon).mean * 1000 + print("Input Shape: {} Kernel Shape: {} Mem_scope: {}. HVX: {} ms.".format(a.shape, w.shape, mem_scope, time_ms)) + + @tvm.testing.requires_hexagon + def test_te_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope): + batches, input_size, _, in_c = a.shape + w_size, _, _, filters = w.shape + baseline_output, baseline_time = run_conv_te(hexagon_session, a, w, a_offset, w_offset, w_size // 2) + tvm.testing.assert_allclose(baseline_output, expected_output) + print("Input Shape: {} Kernel Shape: {}. TE Baseline: {} ms".format(a.shape, w.shape, baseline_time)) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py new file mode 100644 index 000000000000..161ff305c5c1 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py @@ -0,0 +1,195 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import tvm +import tvm.testing + +from numpy.random import default_rng +from tvm.script import tir as T +from tvm.tir.function import TensorIntrin + +from tests.python.contrib.test_hexagon.mmul_unit8_hvx_intrin import get_mm_uint8_intrin +from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8 + +UNROLL_FACTOR = 4 # This must match the hard-coded unrolling in mm_uint8_intrinsic(). + +def can_tensorize(n, m, k): + return m % (4 * UNROLL_FACTOR) == k % 32 == 0 + +def blockify_matrix(B): + """ + inputs + ------ + B : numpy 2D array (of M x K ) to be blockified + + outputs + ------- + BB : blockified B as array of dimensions (M/4) x K x 4 + + blockification is in preparation for HVX ops on 128B 'vectors' + assuming input is of type int8 or uint8, and matrix multiplication uses vrmpy to accumulate to int32 + - once for each block - then K/32 such vectors will be required to carry out the operation. + + if B is height x width = M x K, then output is array of K blocks in x direction, + i.e. as [block1, block2, ... blockK ] where each block has dimensions M/4 x 4. + Specifically: + + B = [ B[1,1] B[1,2] ... B[1,K] ] + [ B[2,1] B[2,2] ... B[2,K] ] + . + . + . + [ B[M,1] B[M,2] ... B[M,K] ] + + + BB = | | | + | | | + [ B[1,1] B[2,1] B[3,1] B[4,1] | B[1,2] B[2,2] B[3,2] B[4,2] | ... | B[1,K] B[2,K] B[3,K] B[4,K] ] + [ B[5,1] B[6,1] B[7,1] B[8,1] | B[5,2] B[6,2] B[7,2] B[8,2] | ... | B[5,K] B[6,K] B[7,K] B[8,K] ] + . | . | . | . + . | . | . | . + . | . | . | . + [ B[M-3,1] B[M-2,1] B[M-1,1] B[M,1] | B[M-3,2] B[M-2,2] B[M-1,2] B[M,2] | ... | B[M-3,K] B[M-2,K] B[M-1,K] B[M,K] ] + | | | + | | | + | | | + | | | + ^ | ^ | | ^ + | | | | | | + | | | | | | + | | | | | | + block 1 block 2 | | block K + + """ + BT, M, K = B.shape + assert M % 4 == 0 + out_height = M // 4 + out_shape = tuple((BT, out_height, K, 4)) + BB = np.zeros(out_shape).astype(B.dtype) # block form of B + for bt in range(BT): + for j in range(0, out_height): + for k in range(0, K): + for b in range(0, 4): + y = b + 4 * j + BB[bt, j, k, b] = B[bt, y, k] + return BB + +def setup_test(b, m, n, k): + a_shape = (b, n, m) + b_shape = (b, m, k) + + rng = default_rng() + a = rng.integers(1, 16, a_shape, dtype="uint8") + b = rng.integers(1, 16, b_shape, dtype="uint8") + + a_q, a_min, a_max = quantize_array(a.reshape(a.size), a.size) + b_q, b_min, b_max = quantize_array(b.reshape(b.size), b.size) + a_q = np.array(a_q, dtype="uint8").reshape(a_shape) + b_q = np.array(b_q, dtype="uint8").reshape(b_shape) + a_offset = quantize_uint8(0.0, a_min, a_max) + b_offset = quantize_uint8(0.0, b_min, b_max) + if can_tensorize(n, m, k): + bb = blockify_matrix(b_q) # blockification is only used by the tensorized version + bb = bb.reshape(-1) # go via 1D rep in case there are stride / offset issues + bb = bb.reshape(b_shape) + else: + bb = [] + + a_f = np.array(a_q, dtype="int32").reshape(a_shape) + b_f = np.array(b_q, dtype="int32").reshape(b_shape) + expected_output = np.matmul(a_f, b_f) + + intrin_name = "mm.uint8_{}x{}x{}".format(m, n, k) + try: + TensorIntrin.register(intrin_name, *get_mm_uint8_intrin(m, n, k)) + except: + print("Intrinsic already registered.") + + return a_q, b_q, bb, a_offset, b_offset, intrin_name, expected_output + +class TestMatMulVec: + + batches, m, n, k = tvm.testing.parameters( + (1, 128, 768, 768), + (1, 128, 768, 3072), + (1, 128, 3072, 768), + (1, 128, 128, 64), + (1, 128, 64, 128), + ) + + @tvm.testing.requires_hexagon + def test_matmul_intrinsics(self, hexagon_session, batches, m, n, k): + + out_shape = (batches, n, k) + + a_q, b_q, bb, a_offset, b_offset, intrin_name, out_ref = setup_test(batches, m, n, k) + + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [batches, n, m], dtype="uint8") + B = T.match_buffer(b, [batches, m, k], dtype="uint8") + C = T.match_buffer(c, [batches, n, k], dtype="int32") + OFFSETS = T.match_buffer(offsets, [2], dtype="uint8") + # body + with T.block("root"): + for i0, i1, i2, i3 in T.grid(batches, m, n, k): + with T.block("C"): + batch, y, x, j = T.axis.remap("SSSR", [i0, i1, i2, i3]) + C[batch, y, x] = C[batch, y, x] + T.cast(A[batch, y, j] - OFFSETS[0], "int32") * T.cast(B[batch, j, x] - OFFSETS[1], "int32") + + ir_module = operator + sch = tvm.tir.Schedule(ir_module, debug_mask="all") + + block = sch.get_block("C") + _, y, _, _ = sch.get_loops(block) + sch.tensorize(y, intrin_name) + + A = tvm.tir.decl_buffer(a_q.shape, name="A", dtype="uint8") + B = tvm.tir.decl_buffer(b_q.shape, name="B", dtype="uint8") + C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32") + OFFSETS = tvm.tir.decl_buffer((2), name="OFFSETS", dtype="uint8") + + target_hexagon = tvm.target.hexagon("v68", link_params=True) + func_tir = tvm.build(sch.mod, [A, B, C, OFFSETS], tvm.target.Target(target_hexagon, host=target_hexagon), name="qmmul_vrmpy") + module = hexagon_session.load_module(func_tir) + + c = np.zeros(out_shape, dtype="int32") + offsets = np.array([a_offset, b_offset], dtype="uint8") + + a_hexagon = tvm.runtime.ndarray.array(a_q, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(bb, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + offsets_hexagon = tvm.runtime.ndarray.array(offsets, device=hexagon_session.device) + + module(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon) + evaluator = module.time_evaluator(module.entry_name, hexagon_session.device, number=1) + time_ms = evaluator(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon).mean * 1e3 + print("Input Shape: {}. Conv time elapsed: {} ms".format((batches, m, n, k), time_ms)) + + out = c_hexagon.numpy() + out_a = out.reshape(batches * n * k) + out_req, _, _ = quantize_array(out_a, batches * n * k) + out_req = np.array(out_req).reshape(batches, n, k) + + out_ref_a = out_ref.reshape(batches * n * k) + out_ref_q, _, _ = quantize_array(out_ref_a, batches * n * k) + out_ref_q = np.array(out_ref_q).reshape(batches, n, k) + + tvm.testing.assert_allclose(out_req, out_ref_q, atol=2.0, rtol=0.0) + \ No newline at end of file From 775f66c0cd17d9a2356804b914ca191dab6f5ce8 Mon Sep 17 00:00:00 2001 From: nverke Date: Tue, 2 Aug 2022 21:53:27 +0000 Subject: [PATCH 2/3] Add tests for 2 operators that utilize Qualcomm HVX intrinsics. --- .../python/contrib/test_hexagon/conv_uint8.py | 159 ++++++++++++++ .../test_hexagon/conv_uint8_hvx_intrin.py | 138 +++++++++++++ .../test_hexagon/mmul_unit8_hvx_intrin.py | 189 +++++++++++++++++ .../test_hexagon/quantization_utils.py | 44 ++++ .../test_hexagon/test_conv_hvx_intrinsics.py | 142 +++++++++++++ .../test_hexagon/test_mmul_hvx_intrinsics.py | 195 ++++++++++++++++++ 6 files changed, 867 insertions(+) create mode 100644 tests/python/contrib/test_hexagon/conv_uint8.py create mode 100644 tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py create mode 100644 tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py create mode 100644 tests/python/contrib/test_hexagon/quantization_utils.py create mode 100644 tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py create mode 100644 tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py diff --git a/tests/python/contrib/test_hexagon/conv_uint8.py b/tests/python/contrib/test_hexagon/conv_uint8.py new file mode 100644 index 000000000000..264e6f77bbdf --- /dev/null +++ b/tests/python/contrib/test_hexagon/conv_uint8.py @@ -0,0 +1,159 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import tvm + +from numbers import Integral +from tvm import te + +def get_const_int(expr): + """Verifies expr is integer and get the constant value. + + Parameters + ---------- + expr : tvm.Expr or int + The input expression. + + Returns + ------- + out_value : int + The output. + """ + if isinstance(expr, Integral): + return expr + if not isinstance(expr, tvm.tir.IntImm): + ana = tvm.arith.Analyzer() + expr = ana.simplify(expr) + if not isinstance(expr, tvm.tir.IntImm): + raise ValueError("Expect value to be constant int") + return int(expr.value) + +def get_const_tuple(in_tuple): + """Verifies input tuple is IntImm or Var, returns tuple of int or Var. + + Parameters + ---------- + in_tuple : tuple of Expr + The input. + + Returns + ------- + out_tuple : tuple of int + The output. + """ + ret = [] + ana = None + for elem in in_tuple: + if isinstance(elem, (tvm.tir.Var, tvm.tir.expr.Any)): + ret.append(elem) + elif not isinstance(elem, (tvm.tir.IntImm, int)): + ana = tvm.arith.Analyzer() if ana is None else ana + elem = ana.simplify(elem) + if not isinstance(elem, tvm.tir.IntImm): + ret.append(elem) + else: + ret.append(get_const_int(elem)) + else: + ret.append(get_const_int(elem)) + return tuple(ret) + +def Pad(Input, padding): + batch, in_height, in_width, in_channel = Input.shape + return te.compute( + (batch, in_height + 2 * padding, in_width + 2 * padding, in_channel), + lambda nn, yy, xx, cc: tvm.tir.if_then_else( + te.all( + yy >= padding, + yy - padding < in_height, + xx >= padding, + xx - padding < in_width, + ), + Input[nn, yy - padding, xx - padding, cc], + tvm.tir.const(0, Input.dtype), + ), + name="Apad", + ) + +def schedule_qconv2d_nhwc(outs, target, device): + s = te.create_schedule([x.op for x in outs]) + x = outs[0] + nn, yy, xx, cc = s[x].op.axis + px1, px2 = s[x].split(nn, nparts=1) + return s + +def qconv2d_nhwc(Input, in_offset, Filter, filt_offset, stride, padding, out_dtype=None): + if out_dtype is None: + out_dtype = Input.dtype + + batch, in_height, in_width, in_channel = Input.shape + filt_height, filt_width, _, num_filter = Filter.shape + # Input is already padded. No need to add padding while computing + # out_height and out_width. + out_height = (in_height - filt_height) // stride + 1 + out_width = (in_width - filt_width) // stride + 1 + out_channel = num_filter + + rc = te.reduce_axis((0, in_channel), name="rc") + ry = te.reduce_axis((0, filt_height), name="ry") + rx = te.reduce_axis((0, filt_width), name="rx") + + return te.compute( + (batch, out_height, out_width, out_channel), + lambda nn, yy, xx, ff: te.sum( + (Input[nn, yy * stride + ry, xx * stride + rx, rc] - in_offset) + * (Filter[ry, rx, rc, ff] - filt_offset).astype(out_dtype), + axis=[rc, ry, rx], + ), + tag="qconv2d_nhwc", + ) + + +def run_conv_te(hexagon_session, a, w, a_offset, w_offset, padding): + + # Input tensor size + A = te.placeholder(a.shape, name="A", dtype="uint8") + W = te.placeholder(w.shape, name="W", dtype="uint8") + + # Pad input and create computation for quantized conv2d + Apad = Pad(A, padding) + B = qconv2d_nhwc(Apad, a_offset, W, w_offset, 1, padding) + target_hexagon = tvm.target.hexagon("v68", link_params=True) + device = hexagon_session.device + s = schedule_qconv2d_nhwc([B], target_hexagon, device) + nn, yy, xx, cc = s[B].op.axis + yo, yi = s[B].split(yy, nparts=1) + s[Apad].compute_at(s[B], yi) + s[B].vectorize(cc) + + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), device=device) + func_te = tvm.build(s, [A, W, B], target=tvm.target.Target(target_hexagon, host=target_hexagon), name="quant_conv2d") + + module_te = hexagon_session.load_module(func_te) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) + w_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) + + module_te(a_hexagon, w_hexagon, b_hexagon) + evaluator = module_te.time_evaluator(module_te.entry_name, hexagon_session.device, number=1, repeat=1) + mean_ms = evaluator(a_hexagon, w_hexagon, b_hexagon).mean * 1000 + + out = b_hexagon.numpy() + + return out, mean_ms + \ No newline at end of file diff --git a/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py new file mode 100644 index 000000000000..44154bf29767 --- /dev/null +++ b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import math + +from tvm.script import tir as T + +def get_conv_uint8_hvx_intrin(input_shape, kernel_shape, a_offset, w_offset, mem_scope): + VRMPY_WIDTH = 128 + + batches, input_size, _, in_c = input_shape + w_size, _, _, filters = kernel_shape + out_size = input_size + + input_padding = w_size // 2 + + # For this usage of vrmpy it loads 4 bytes for vv from the kernel. In order + # for this implementation to not mix output data there will need to be kernel + # padding to round to the nearest multiple of 4. + kernel_width_padding = 4 - w_size % 4 + padded_kernel_width = w_size + kernel_width_padding + + # vrmpy buffer loads are always 128B and will go out of bounds for the + # implementation written here if there is not sufficient padding. This + # means that for this implementation it must always be a multiple of 128 + # and have the standard padding and the padding needed for the kernel + # window (4) + if (input_size % VRMPY_WIDTH != 0): + input_width_padding = (VRMPY_WIDTH - (input_size) % VRMPY_WIDTH) + input_padding + kernel_width_padding + else: + input_width_padding = input_padding + kernel_width_padding + + padded_input_height = input_size + 2 * input_padding + padded_input_width = input_size + input_padding + input_width_padding + + # vrmpy output buffer loads will go out of bounds for this implementation + # if there is not proper padding. + padded_output_width = VRMPY_WIDTH * (padded_input_width // VRMPY_WIDTH) + 3 + + # The number of vrmpy loads (128B) needed to complete a horizontal frame of the input. + w_steps = math.ceil(input_size / VRMPY_WIDTH) + + # The number of vrmpy loads (4B) needed to complete a horizontal frame of the kernel. + kw_steps = math.ceil(w_size / 4) + + @T.prim_func + def conv2d_vrmpy(a: T.handle, w: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A_local = T.match_buffer(a, [T.cast(padded_input_height, dtype="int32") * T.cast(padded_input_width, dtype="int32")], dtype="uint8", offset_factor=1, scope=mem_scope) + W_local = T.match_buffer(w, [T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")], dtype="uint8", offset_factor=1, scope=mem_scope) + C_local = T.match_buffer(c, [T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")], dtype="int32", offset_factor=1, scope=mem_scope) + with T.block("root"): + T.reads(A_local[0: T.cast(padded_input_height, dtype="int32") * T.cast(padded_input_width, dtype="int32")], W_local[0: T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")]) + T.writes(C_local[0: T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")]) + for y, x_o, x_i, rx_o, ry in T.grid(input_size, w_steps, 4, kw_steps, w_size): + C_local[T.ramp(y * T.cast(padded_output_width, dtype="int32") + x_o * 128 + x_i, 4, 32)] += T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"), + T.uint32(2), + T.reinterpret(A_local[T.ramp((y + ry) * T.cast(padded_input_width, dtype="int32") + x_o * 128 + 4 * rx_o + x_i, 1, 128)], dtype = "int32x32"), + T.reinterpret(W_local[T.ramp(ry * T.cast(padded_kernel_width, dtype="int32") + rx_o * 4, 1, 4)], dtype = "int32"), + dtype="int32x32" + ) + + @T.prim_func + def conv2d_vrmpy_desc(a: T.handle, w: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A_local = T.match_buffer(a, [padded_input_height, padded_input_width], dtype="uint8", offset_factor=1, scope=mem_scope) + W_local = T.match_buffer(w, [w_size, padded_kernel_width], dtype="uint8", offset_factor=1, scope=mem_scope) + C_local = T.match_buffer(c, [out_size, padded_output_width], dtype="int32", offset_factor=1, scope=mem_scope) + with T.block("root"): + for y, x, ry, rx in T.grid(input_size, input_size, w_size, w_size): + with T.block("C"): + y, x, ry, rx = T.axis.remap("SSRR", [y, x, ry, rx]) + C_local[y, x] = C_local[y, x] + T.cast(A_local[y + ry, x + rx], "int32") * T.cast(W_local[ry, rx], "int32") + + + @T.prim_func + def operator(a: T.handle, w: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [batches, input_size, input_size, in_c], dtype="uint8", offset_factor=1) + W = T.match_buffer(w, [w_size, w_size, in_c, filters], dtype="uint8", offset_factor=1) + C = T.match_buffer(c, [batches, out_size, out_size, filters], dtype="int32", offset_factor=1) + A_local = T.alloc_buffer([batches, padded_input_height, padded_input_width, in_c], dtype="uint8", scope=mem_scope) + W_local = T.alloc_buffer([w_size, padded_kernel_width, in_c, filters], dtype="uint8", scope=mem_scope) + C_local = T.alloc_buffer([batches, filters, out_size, padded_output_width], dtype="int32", scope=mem_scope) + with T.block("root"): + for n, y, x, c in T.grid(batches, padded_input_height, padded_input_width, in_c): + with T.block("A_local"): + nn, yy, xx, cc = T.axis.remap("SSSS", [n, y, x, c]) + T.reads(A[nn, yy - T.cast(input_padding, dtype="int32"), xx - T.cast(input_padding, dtype="int32"), cc]) + T.writes(A_local[nn, yy, xx, cc]) + A_local[nn, yy, xx, cc] = T.if_then_else( + T.cast(input_padding, dtype="int32") <= yy and yy < T.cast(padded_input_height, dtype="int32") - T.cast(input_padding, dtype="int32") and + T.cast(input_padding, dtype="int32") <= xx and xx < T.cast(padded_input_width, dtype="int32") - T.cast(input_width_padding, dtype="int32"), + A[nn, yy - T.cast(input_padding, dtype="int32"), xx - T.cast(input_padding, dtype="int32"), cc] - T.cast(a_offset, dtype="uint8"), + T.uint8(0), + dtype="uint8" + ) + for y, x, c, f in T.grid(w_size, padded_kernel_width, in_c, filters): + with T.block("W_local"): + yy, xx, cc, ff = T.axis.remap("SSSS", [y, x, c, f]) + T.reads(W[yy, xx, cc, ff]) + T.writes(W_local[yy, xx, cc, ff]) + W_local[yy, xx, cc, ff] = T.if_then_else( + xx < T.cast(padded_kernel_width, dtype="int32") - T.cast(kernel_width_padding, dtype="int32"), + W[yy, xx, cc, ff] - T.cast(w_offset, dtype="uint8"), + T.uint8(0), + dtype="uint8" + ) + for n, f, y, x in T.grid(batches, filters, out_size, padded_output_width): + with T.block("C_local_init"): + n, f, y, x = T.axis.remap("SSSS", [n, f, y, x]) + C_local[n, f, y, x] = 0 + for n, f, y, x, ry, rx, rc in T.grid(batches, filters, input_size, input_size, w_size, w_size, in_c): + with T.block("C"): + n, f, y, x, ry, rx, rc = T.axis.remap("SSSSRRR", [n, f, y, x, ry, rx, rc]) + C_local[n, f, y, x] = C_local[n, f, y, x] + T.cast(A_local[n, y + ry, x + rx, rc], "int32") * T.cast(W_local[ry, rx, rc, f], "int32") + for n, f, y, x in T.grid(batches, filters, out_size, out_size): + with T.block("C_local"): + n, f, y, x = T.axis.remap("SSSS", [n, f, y, x]) + C[n, y, x, f] = C_local[n, f, y, x] + + return conv2d_vrmpy_desc, conv2d_vrmpy, operator + \ No newline at end of file diff --git a/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py new file mode 100644 index 000000000000..5d8c5ed328a8 --- /dev/null +++ b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py @@ -0,0 +1,189 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from tvm.script import tir as T + +w_spit__b = "llvm.hexagon.S2.vsplatrb" # Q6_R_vsplatb_R +v_spit__w = "llvm.hexagon.V6.lvsplatw.128B" +v_rmpy__uv_uw_acc = "llvm.hexagon.V6.vrmpyub.acc.128B" +v_rmpy__uv_uw = "llvm.hexagon.V6.vrmpyub.128B" +v_sub = "llvm.hexagon.V6.vsubw.128B" + +def get_mm_uint8_intrin(in_m, in_n, in_k): + blocks = in_k // 32 + unrolled_rows = in_m // 16 + + @T.prim_func + def mm_uint8_intrinsic(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle): + A = T.match_buffer(a, [T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], dtype="uint8") + B = T.match_buffer(b, [T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="uint8") + C = T.match_buffer(c, [T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="int32") + OFFSETS = T.match_buffer(offsets, [2], dtype="uint8") + with T.block("root"): + T.reads(A[0: T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], B[0: T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], OFFSETS[0:2]) + T.writes(C[0: T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")]) + for i in T.serial(in_n): + for s in T.serial(blocks): + C[T.ramp(((s * 32) + (i * T.cast(in_k, dtype="int32"))), 1, 32)] = T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), T.uint32(1), (( T.cast(OFFSETS[0], dtype="int32") * T.cast(OFFSETS[1], dtype="int32")) * T.cast(in_m, dtype="int32")), dtype="int32x32") + for blok, ro in T.grid(blocks, unrolled_rows): + b_offset = T.cast(OFFSETS[1], dtype="int32") + a_offset = T.cast(OFFSETS[0], dtype="int32") + out_index = blok * 32 + i * T.cast(in_k, dtype="int32") + + B_index_unrolled = blok * 128 + (ro * 16 * T.cast(in_k, dtype="int32")) + B_index_unrolled_2 = blok * 128 + (ro * 16 + 4) * T.cast(in_k, dtype="int32") + B_index_unrolled_3 = blok * 128 + (ro * 16 + 8) * T.cast(in_k, dtype="int32") + B_index_unrolled_4 = blok * 128 + (ro * 16 + 12) * T.cast(in_k, dtype="int32") + + A_index_unrolled = ro * 16 + i * T.cast(in_m, dtype="int32") + A_index_unrolled_2 = A_index_unrolled + 4 + A_index_unrolled_3 = A_index_unrolled + 8 + A_index_unrolled_4 = A_index_unrolled + 12 + + a_b_vrmpy_accumulation_unrolled = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), # instruction + T.uint32(3), # number of inputs + C[T.ramp(out_index, 1, 32)], # accumulation location + T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype = "int32x32"), # 32 4 byte inputs (Vu) to vrmpy + T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype = "int32"), # 4 byte input (Rt) to vrmpy + dtype = "int32x32" # output datatype + ) + + a_b_vrmpy_accumulation_unrolled_1 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_vrmpy_accumulation_unrolled, + T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype = "int32x32"), + T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype = "int32"), + dtype = "int32x32" + ) + + a_b_vrmpy_accumulation_unrolled_2 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_vrmpy_accumulation_unrolled_1, + T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype = "int32x32"), + T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype = "int32"), + dtype = "int32x32" + ) + + la_b_vrmpy_accumulation_unrolled_3 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_vrmpy_accumulation_unrolled_2, + T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype = "int32x32"), + T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_b = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"), + T.uint32(2), + T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), T.uint32(1), a_offset, dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_a = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_b, + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype = "int32"),dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_b1 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_a, + T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_a1 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_b1, + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype = "int32"),dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_b2 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_a1, + T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_a2 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_b2, + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype = "int32"),dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_b3 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_a2, + T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"), + dtype = "int32x32" + ) + + a_b_offsets_vrmpy_accumulation_unrolled_a3 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_b3, + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), T.uint32(1), T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype = "int32"), dtype = "int32x32"), + T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), T.uint32(1), b_offset, dtype = "int32"), + dtype = "int32x32" + ) + + C[T.ramp(out_index, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vsubw.128B"), + T.uint32(2), + la_b_vrmpy_accumulation_unrolled_3, + a_b_offsets_vrmpy_accumulation_unrolled_a3, + dtype = "int32x32" + ) + + + @T.prim_func + def mmul_desc(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [in_n, in_m], dtype="uint8") + B = T.match_buffer(b, [in_m, in_k], dtype="uint8") + C = T.match_buffer(c, [in_n, in_k], dtype="int32") + OFFSETS = T.match_buffer(offsets, [2], dtype="uint8") + # body + with T.block("root"): + for i0, i1, i2 in T.grid(in_m, in_n, in_k): + with T.block("C"): + y, x, j = T.axis.remap("SSR", [i0, i1, i2]) + C[y, x] = C[y, x] + T.cast(A[y, j] - OFFSETS[0], "int32") * T.cast(B[j, x] - OFFSETS[1], "int32") + + return mmul_desc, mm_uint8_intrinsic + + diff --git a/tests/python/contrib/test_hexagon/quantization_utils.py b/tests/python/contrib/test_hexagon/quantization_utils.py new file mode 100644 index 000000000000..3ebcfe3a805e --- /dev/null +++ b/tests/python/contrib/test_hexagon/quantization_utils.py @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +def quantize_uint8(val, minval, maxval): + range = max(0.0001, maxval - minval) + resize_amt = 255.0 / range + value_f = (val - minval) * resize_amt + value_i = round(value_f, 8) + if value_i < 0: + return 0 + elif value_i > 255: + return 255 + else: + return int(value_i) + + +def dequantize(val, minval, maxval): + range = max(0.0001, maxval - minval) + stepsize = range / 4294967296 + return val * stepsize + + +def quantize_array(in_f, size): + in_q = [] + # 0 must lie in interval [min,max] for quantization to work correctly. + in_min = min(0, min(in_f)) + in_max = max(0, max(in_f)) + for i in range(size): + in_q.append(quantize_uint8(in_f[i], in_min, in_max)) + return in_q, in_min, in_max \ No newline at end of file diff --git a/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py new file mode 100644 index 000000000000..0ab7cbd734a0 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py @@ -0,0 +1,142 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import tvm.testing +import tvm.topi.testing + +from numpy.random import default_rng +from tvm.tir.function import TensorIntrin + +from tests.python.contrib.test_hexagon.conv_uint8 import get_const_tuple, run_conv_te +from tests.python.contrib.test_hexagon.conv_uint8_hvx_intrin import get_conv_uint8_hvx_intrin +from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8 + +class TestConvHVX: + + def create_inputs(input_shape, filter_shape, mem_scope): + + w_size, _, _, _ = filter_shape + input_padding = w_size // 2 + + rng = default_rng() + a = rng.integers(1, 255, input_shape, dtype="uint8") + w = rng.integers(1, 8, filter_shape, dtype="uint8") + + a_q, a_min, a_max = quantize_array(a.reshape(a.size), a.size) + w_q, b_min, b_max = quantize_array(w.reshape(w.size), w.size) + + a_q = np.array(a_q, dtype="uint8").reshape(input_shape) + w_q = np.array(w_q, dtype="uint8").reshape(filter_shape) + + a_offset = quantize_uint8(0.0, a_min, a_max) + w_offset = quantize_uint8(0.0, b_min, b_max) + + a_f = np.array(a_q, dtype="uint8").reshape(get_const_tuple(a.shape)) + w_f = np.array(w_q, dtype="uint8").reshape(get_const_tuple(w.shape)) + expected_output = tvm.topi.testing.conv2d_nhwc_python(a_f, w_f, 1, input_padding).astype("int32") + + return a_q, w_q, a_offset, w_offset, expected_output, mem_scope + + + a, w, a_offset, w_offset, expected_output, mem_scope = tvm.testing.parameters( + (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "local")), + (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global")), + (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global.vtcm")), + (create_inputs((1, 128, 128, 3), (7, 7, 3, 1), "local")), + (create_inputs((1, 128, 128, 3), (5, 5, 3, 1), "local")), + (create_inputs((1, 128, 128, 3), (3, 3, 3, 1), "local")), + (create_inputs((4, 128, 128, 1), (3, 3, 1, 4), "local")), + (create_inputs((2, 32, 32, 32), (7, 7, 32, 2), "local")), + (create_inputs((2, 34, 34, 29), (5, 5, 29, 2), "local")), + (create_inputs((1, 512, 512, 1), (9, 9, 1, 1), "local")), + ) + + @tvm.testing.requires_hexagon + def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope): + + #TODO even sized kernels and stride are currently not working. + + batches, input_size, _, in_c = a.shape + w_size, _, _, filters = w.shape + + out_height = (input_size - w_size + 2 * (w_size // 2)) + 1 + out_width = (input_size - w_size + 2 * (w_size // 2)) + 1 + out_shape = (batches, out_height, out_width, filters) + c = np.zeros(out_shape, dtype="int32") + + conv2d_vrmpy_description, conv2d_vrmpy_intrinsic, conv2d_operator = get_conv_uint8_hvx_intrin(a.shape, w.shape, a_offset, w_offset, mem_scope) + + intrin_name = "conv2d.uint8_{}x{}x{}x{}_{}".format(input_size, input_size, w_size, w_size, mem_scope) + try: + TensorIntrin.register(intrin_name, conv2d_vrmpy_description, conv2d_vrmpy_intrinsic) + except: + print("Intrinsic already registered.") + + ir_module = conv2d_operator + sch = tvm.tir.Schedule(ir_module, debug_mask="all") + + block = sch.get_block("C") + + w_block_local = sch.get_block("W_local") + sch.transform_layout(w_block_local, buffer=("write", 0), index_map=lambda h, w, c, f: (f, c, h, w)) + + a_block_local = sch.get_block("A_local") + sch.transform_layout(a_block_local, buffer=("write", 0), index_map=lambda b, h, w, c: (b, c, h, w)) + + n, f, y, x, ry, rx, rc = sch.get_loops(block) + sch.reorder(n, f, rc, y, x, ry, rx) + + sch.tensorize(y, intrin_name) + + target_hexagon = tvm.target.hexagon("v68", link_params=True) + + A = tvm.tir.decl_buffer(a.shape, name="A", dtype="uint8") + W = tvm.tir.decl_buffer(w.shape, name="W", dtype="uint8") + C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32") + + func_tir = tvm.build( + sch.mod, [A, W, C], tvm.target.Target(target_hexagon, host=target_hexagon), name="hvx_op" + ) + + module = hexagon_session.load_module(func_tir) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) + w_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + + module(a_hexagon, w_hexagon, c_hexagon) + out = c_hexagon.numpy() + out = out[:,:,:out_width,:] + + tvm.testing.assert_allclose(out, expected_output) + + timer = module.time_evaluator(module.entry_name, hexagon_session.device, number=1, repeat=1) + time_ms = timer(a_hexagon, w_hexagon, c_hexagon).mean * 1000 + print("Input Shape: {} Kernel Shape: {} Mem_scope: {}. HVX: {} ms.".format(a.shape, w.shape, mem_scope, time_ms)) + + @tvm.testing.requires_hexagon + def test_te_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope): + batches, input_size, _, in_c = a.shape + w_size, _, _, filters = w.shape + baseline_output, baseline_time = run_conv_te(hexagon_session, a, w, a_offset, w_offset, w_size // 2) + tvm.testing.assert_allclose(baseline_output, expected_output) + print("Input Shape: {} Kernel Shape: {}. TE Baseline: {} ms".format(a.shape, w.shape, baseline_time)) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py new file mode 100644 index 000000000000..161ff305c5c1 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py @@ -0,0 +1,195 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import tvm +import tvm.testing + +from numpy.random import default_rng +from tvm.script import tir as T +from tvm.tir.function import TensorIntrin + +from tests.python.contrib.test_hexagon.mmul_unit8_hvx_intrin import get_mm_uint8_intrin +from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8 + +UNROLL_FACTOR = 4 # This must match the hard-coded unrolling in mm_uint8_intrinsic(). + +def can_tensorize(n, m, k): + return m % (4 * UNROLL_FACTOR) == k % 32 == 0 + +def blockify_matrix(B): + """ + inputs + ------ + B : numpy 2D array (of M x K ) to be blockified + + outputs + ------- + BB : blockified B as array of dimensions (M/4) x K x 4 + + blockification is in preparation for HVX ops on 128B 'vectors' + assuming input is of type int8 or uint8, and matrix multiplication uses vrmpy to accumulate to int32 + - once for each block - then K/32 such vectors will be required to carry out the operation. + + if B is height x width = M x K, then output is array of K blocks in x direction, + i.e. as [block1, block2, ... blockK ] where each block has dimensions M/4 x 4. + Specifically: + + B = [ B[1,1] B[1,2] ... B[1,K] ] + [ B[2,1] B[2,2] ... B[2,K] ] + . + . + . + [ B[M,1] B[M,2] ... B[M,K] ] + + + BB = | | | + | | | + [ B[1,1] B[2,1] B[3,1] B[4,1] | B[1,2] B[2,2] B[3,2] B[4,2] | ... | B[1,K] B[2,K] B[3,K] B[4,K] ] + [ B[5,1] B[6,1] B[7,1] B[8,1] | B[5,2] B[6,2] B[7,2] B[8,2] | ... | B[5,K] B[6,K] B[7,K] B[8,K] ] + . | . | . | . + . | . | . | . + . | . | . | . + [ B[M-3,1] B[M-2,1] B[M-1,1] B[M,1] | B[M-3,2] B[M-2,2] B[M-1,2] B[M,2] | ... | B[M-3,K] B[M-2,K] B[M-1,K] B[M,K] ] + | | | + | | | + | | | + | | | + ^ | ^ | | ^ + | | | | | | + | | | | | | + | | | | | | + block 1 block 2 | | block K + + """ + BT, M, K = B.shape + assert M % 4 == 0 + out_height = M // 4 + out_shape = tuple((BT, out_height, K, 4)) + BB = np.zeros(out_shape).astype(B.dtype) # block form of B + for bt in range(BT): + for j in range(0, out_height): + for k in range(0, K): + for b in range(0, 4): + y = b + 4 * j + BB[bt, j, k, b] = B[bt, y, k] + return BB + +def setup_test(b, m, n, k): + a_shape = (b, n, m) + b_shape = (b, m, k) + + rng = default_rng() + a = rng.integers(1, 16, a_shape, dtype="uint8") + b = rng.integers(1, 16, b_shape, dtype="uint8") + + a_q, a_min, a_max = quantize_array(a.reshape(a.size), a.size) + b_q, b_min, b_max = quantize_array(b.reshape(b.size), b.size) + a_q = np.array(a_q, dtype="uint8").reshape(a_shape) + b_q = np.array(b_q, dtype="uint8").reshape(b_shape) + a_offset = quantize_uint8(0.0, a_min, a_max) + b_offset = quantize_uint8(0.0, b_min, b_max) + if can_tensorize(n, m, k): + bb = blockify_matrix(b_q) # blockification is only used by the tensorized version + bb = bb.reshape(-1) # go via 1D rep in case there are stride / offset issues + bb = bb.reshape(b_shape) + else: + bb = [] + + a_f = np.array(a_q, dtype="int32").reshape(a_shape) + b_f = np.array(b_q, dtype="int32").reshape(b_shape) + expected_output = np.matmul(a_f, b_f) + + intrin_name = "mm.uint8_{}x{}x{}".format(m, n, k) + try: + TensorIntrin.register(intrin_name, *get_mm_uint8_intrin(m, n, k)) + except: + print("Intrinsic already registered.") + + return a_q, b_q, bb, a_offset, b_offset, intrin_name, expected_output + +class TestMatMulVec: + + batches, m, n, k = tvm.testing.parameters( + (1, 128, 768, 768), + (1, 128, 768, 3072), + (1, 128, 3072, 768), + (1, 128, 128, 64), + (1, 128, 64, 128), + ) + + @tvm.testing.requires_hexagon + def test_matmul_intrinsics(self, hexagon_session, batches, m, n, k): + + out_shape = (batches, n, k) + + a_q, b_q, bb, a_offset, b_offset, intrin_name, out_ref = setup_test(batches, m, n, k) + + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [batches, n, m], dtype="uint8") + B = T.match_buffer(b, [batches, m, k], dtype="uint8") + C = T.match_buffer(c, [batches, n, k], dtype="int32") + OFFSETS = T.match_buffer(offsets, [2], dtype="uint8") + # body + with T.block("root"): + for i0, i1, i2, i3 in T.grid(batches, m, n, k): + with T.block("C"): + batch, y, x, j = T.axis.remap("SSSR", [i0, i1, i2, i3]) + C[batch, y, x] = C[batch, y, x] + T.cast(A[batch, y, j] - OFFSETS[0], "int32") * T.cast(B[batch, j, x] - OFFSETS[1], "int32") + + ir_module = operator + sch = tvm.tir.Schedule(ir_module, debug_mask="all") + + block = sch.get_block("C") + _, y, _, _ = sch.get_loops(block) + sch.tensorize(y, intrin_name) + + A = tvm.tir.decl_buffer(a_q.shape, name="A", dtype="uint8") + B = tvm.tir.decl_buffer(b_q.shape, name="B", dtype="uint8") + C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32") + OFFSETS = tvm.tir.decl_buffer((2), name="OFFSETS", dtype="uint8") + + target_hexagon = tvm.target.hexagon("v68", link_params=True) + func_tir = tvm.build(sch.mod, [A, B, C, OFFSETS], tvm.target.Target(target_hexagon, host=target_hexagon), name="qmmul_vrmpy") + module = hexagon_session.load_module(func_tir) + + c = np.zeros(out_shape, dtype="int32") + offsets = np.array([a_offset, b_offset], dtype="uint8") + + a_hexagon = tvm.runtime.ndarray.array(a_q, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(bb, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + offsets_hexagon = tvm.runtime.ndarray.array(offsets, device=hexagon_session.device) + + module(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon) + evaluator = module.time_evaluator(module.entry_name, hexagon_session.device, number=1) + time_ms = evaluator(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon).mean * 1e3 + print("Input Shape: {}. Conv time elapsed: {} ms".format((batches, m, n, k), time_ms)) + + out = c_hexagon.numpy() + out_a = out.reshape(batches * n * k) + out_req, _, _ = quantize_array(out_a, batches * n * k) + out_req = np.array(out_req).reshape(batches, n, k) + + out_ref_a = out_ref.reshape(batches * n * k) + out_ref_q, _, _ = quantize_array(out_ref_a, batches * n * k) + out_ref_q = np.array(out_ref_q).reshape(batches, n, k) + + tvm.testing.assert_allclose(out_req, out_ref_q, atol=2.0, rtol=0.0) + \ No newline at end of file From 81b2e9bb213e6c96c41d314920efdac7025eb714 Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Wed, 3 Aug 2022 09:55:44 -0700 Subject: [PATCH 3/3] Run and apply lint changes. --- .../python/contrib/test_hexagon/conv_uint8.py | 57 +-- .../test_hexagon/conv_uint8_hvx_intrin.py | 328 ++++++++++++------ .../test_hexagon/mmul_unit8_hvx_intrin.py | 210 +++++++---- .../test_hexagon/quantization_utils.py | 3 +- .../test_hexagon/test_conv_hvx_intrinsics.py | 66 ++-- .../test_hexagon/test_mmul_hvx_intrinsics.py | 28 +- 6 files changed, 463 insertions(+), 229 deletions(-) diff --git a/tests/python/contrib/test_hexagon/conv_uint8.py b/tests/python/contrib/test_hexagon/conv_uint8.py index 264e6f77bbdf..9f4b00ec2c9f 100644 --- a/tests/python/contrib/test_hexagon/conv_uint8.py +++ b/tests/python/contrib/test_hexagon/conv_uint8.py @@ -21,27 +21,29 @@ from numbers import Integral from tvm import te + def get_const_int(expr): - """Verifies expr is integer and get the constant value. - - Parameters - ---------- - expr : tvm.Expr or int - The input expression. - - Returns - ------- - out_value : int - The output. - """ - if isinstance(expr, Integral): - return expr - if not isinstance(expr, tvm.tir.IntImm): - ana = tvm.arith.Analyzer() - expr = ana.simplify(expr) - if not isinstance(expr, tvm.tir.IntImm): - raise ValueError("Expect value to be constant int") - return int(expr.value) + """Verifies expr is integer and get the constant value. + + Parameters + ---------- + expr : tvm.Expr or int + The input expression. + + Returns + ------- + out_value : int + The output. + """ + if isinstance(expr, Integral): + return expr + if not isinstance(expr, tvm.tir.IntImm): + ana = tvm.arith.Analyzer() + expr = ana.simplify(expr) + if not isinstance(expr, tvm.tir.IntImm): + raise ValueError("Expect value to be constant int") + return int(expr.value) + def get_const_tuple(in_tuple): """Verifies input tuple is IntImm or Var, returns tuple of int or Var. @@ -72,6 +74,7 @@ def get_const_tuple(in_tuple): ret.append(get_const_int(elem)) return tuple(ret) + def Pad(Input, padding): batch, in_height, in_width, in_channel = Input.shape return te.compute( @@ -89,6 +92,7 @@ def Pad(Input, padding): name="Apad", ) + def schedule_qconv2d_nhwc(outs, target, device): s = te.create_schedule([x.op for x in outs]) x = outs[0] @@ -96,6 +100,7 @@ def schedule_qconv2d_nhwc(outs, target, device): px1, px2 = s[x].split(nn, nparts=1) return s + def qconv2d_nhwc(Input, in_offset, Filter, filt_offset, stride, padding, out_dtype=None): if out_dtype is None: out_dtype = Input.dtype @@ -141,7 +146,12 @@ def run_conv_te(hexagon_session, a, w, a_offset, w_offset, padding): s[B].vectorize(cc) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), device=device) - func_te = tvm.build(s, [A, W, B], target=tvm.target.Target(target_hexagon, host=target_hexagon), name="quant_conv2d") + func_te = tvm.build( + s, + [A, W, B], + target=tvm.target.Target(target_hexagon, host=target_hexagon), + name="quant_conv2d", + ) module_te = hexagon_session.load_module(func_te) @@ -150,10 +160,11 @@ def run_conv_te(hexagon_session, a, w, a_offset, w_offset, padding): b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) module_te(a_hexagon, w_hexagon, b_hexagon) - evaluator = module_te.time_evaluator(module_te.entry_name, hexagon_session.device, number=1, repeat=1) + evaluator = module_te.time_evaluator( + module_te.entry_name, hexagon_session.device, number=1, repeat=1 + ) mean_ms = evaluator(a_hexagon, w_hexagon, b_hexagon).mean * 1000 out = b_hexagon.numpy() return out, mean_ms - \ No newline at end of file diff --git a/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py index 44154bf29767..a0b1d7de798f 100644 --- a/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py +++ b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py @@ -19,120 +19,220 @@ from tvm.script import tir as T + def get_conv_uint8_hvx_intrin(input_shape, kernel_shape, a_offset, w_offset, mem_scope): - VRMPY_WIDTH = 128 - - batches, input_size, _, in_c = input_shape - w_size, _, _, filters = kernel_shape - out_size = input_size - - input_padding = w_size // 2 - - # For this usage of vrmpy it loads 4 bytes for vv from the kernel. In order - # for this implementation to not mix output data there will need to be kernel - # padding to round to the nearest multiple of 4. - kernel_width_padding = 4 - w_size % 4 - padded_kernel_width = w_size + kernel_width_padding - - # vrmpy buffer loads are always 128B and will go out of bounds for the - # implementation written here if there is not sufficient padding. This - # means that for this implementation it must always be a multiple of 128 - # and have the standard padding and the padding needed for the kernel - # window (4) - if (input_size % VRMPY_WIDTH != 0): - input_width_padding = (VRMPY_WIDTH - (input_size) % VRMPY_WIDTH) + input_padding + kernel_width_padding - else: - input_width_padding = input_padding + kernel_width_padding - - padded_input_height = input_size + 2 * input_padding - padded_input_width = input_size + input_padding + input_width_padding - - # vrmpy output buffer loads will go out of bounds for this implementation - # if there is not proper padding. - padded_output_width = VRMPY_WIDTH * (padded_input_width // VRMPY_WIDTH) + 3 - - # The number of vrmpy loads (128B) needed to complete a horizontal frame of the input. - w_steps = math.ceil(input_size / VRMPY_WIDTH) - - # The number of vrmpy loads (4B) needed to complete a horizontal frame of the kernel. - kw_steps = math.ceil(w_size / 4) - - @T.prim_func - def conv2d_vrmpy(a: T.handle, w: T.handle, c: T.handle): - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A_local = T.match_buffer(a, [T.cast(padded_input_height, dtype="int32") * T.cast(padded_input_width, dtype="int32")], dtype="uint8", offset_factor=1, scope=mem_scope) - W_local = T.match_buffer(w, [T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")], dtype="uint8", offset_factor=1, scope=mem_scope) - C_local = T.match_buffer(c, [T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")], dtype="int32", offset_factor=1, scope=mem_scope) - with T.block("root"): - T.reads(A_local[0: T.cast(padded_input_height, dtype="int32") * T.cast(padded_input_width, dtype="int32")], W_local[0: T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")]) - T.writes(C_local[0: T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")]) - for y, x_o, x_i, rx_o, ry in T.grid(input_size, w_steps, 4, kw_steps, w_size): - C_local[T.ramp(y * T.cast(padded_output_width, dtype="int32") + x_o * 128 + x_i, 4, 32)] += T.call_llvm_intrin( - T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"), - T.uint32(2), - T.reinterpret(A_local[T.ramp((y + ry) * T.cast(padded_input_width, dtype="int32") + x_o * 128 + 4 * rx_o + x_i, 1, 128)], dtype = "int32x32"), - T.reinterpret(W_local[T.ramp(ry * T.cast(padded_kernel_width, dtype="int32") + rx_o * 4, 1, 4)], dtype = "int32"), - dtype="int32x32" + VRMPY_WIDTH = 128 + + batches, input_size, _, in_c = input_shape + w_size, _, _, filters = kernel_shape + out_size = input_size + + input_padding = w_size // 2 + + # For this usage of vrmpy it loads 4 bytes for vv from the kernel. In order + # for this implementation to not mix output data there will need to be kernel + # padding to round to the nearest multiple of 4. + kernel_width_padding = 4 - w_size % 4 + padded_kernel_width = w_size + kernel_width_padding + + # vrmpy buffer loads are always 128B and will go out of bounds for the + # implementation written here if there is not sufficient padding. This + # means that for this implementation it must always be a multiple of 128 + # and have the standard padding and the padding needed for the kernel + # window (4) + if input_size % VRMPY_WIDTH != 0: + input_width_padding = ( + (VRMPY_WIDTH - (input_size) % VRMPY_WIDTH) + input_padding + kernel_width_padding + ) + else: + input_width_padding = input_padding + kernel_width_padding + + padded_input_height = input_size + 2 * input_padding + padded_input_width = input_size + input_padding + input_width_padding + + # vrmpy output buffer loads will go out of bounds for this implementation + # if there is not proper padding. + padded_output_width = VRMPY_WIDTH * (padded_input_width // VRMPY_WIDTH) + 3 + + # The number of vrmpy loads (128B) needed to complete a horizontal frame of the input. + w_steps = math.ceil(input_size / VRMPY_WIDTH) + + # The number of vrmpy loads (4B) needed to complete a horizontal frame of the kernel. + kw_steps = math.ceil(w_size / 4) + + @T.prim_func + def conv2d_vrmpy(a: T.handle, w: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A_local = T.match_buffer( + a, + [ + T.cast(padded_input_height, dtype="int32") + * T.cast(padded_input_width, dtype="int32") + ], + dtype="uint8", + offset_factor=1, + scope=mem_scope, + ) + W_local = T.match_buffer( + w, + [T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")], + dtype="uint8", + offset_factor=1, + scope=mem_scope, + ) + C_local = T.match_buffer( + c, + [T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")], + dtype="int32", + offset_factor=1, + scope=mem_scope, + ) + with T.block("root"): + T.reads( + A_local[ + 0 : T.cast(padded_input_height, dtype="int32") + * T.cast(padded_input_width, dtype="int32") + ], + W_local[ + 0 : T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32") + ], + ) + T.writes( + C_local[ + 0 : T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32") + ] + ) + for y, x_o, x_i, rx_o, ry in T.grid(input_size, w_steps, 4, kw_steps, w_size): + C_local[ + T.ramp(y * T.cast(padded_output_width, dtype="int32") + x_o * 128 + x_i, 4, 32) + ] += T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"), + T.uint32(2), + T.reinterpret( + A_local[ + T.ramp( + (y + ry) * T.cast(padded_input_width, dtype="int32") + + x_o * 128 + + 4 * rx_o + + x_i, + 1, + 128, + ) + ], + dtype="int32x32", + ), + T.reinterpret( + W_local[ + T.ramp(ry * T.cast(padded_kernel_width, dtype="int32") + rx_o * 4, 1, 4) + ], + dtype="int32", + ), + dtype="int32x32", + ) + + @T.prim_func + def conv2d_vrmpy_desc(a: T.handle, w: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A_local = T.match_buffer( + a, + [padded_input_height, padded_input_width], + dtype="uint8", + offset_factor=1, + scope=mem_scope, + ) + W_local = T.match_buffer( + w, [w_size, padded_kernel_width], dtype="uint8", offset_factor=1, scope=mem_scope + ) + C_local = T.match_buffer( + c, [out_size, padded_output_width], dtype="int32", offset_factor=1, scope=mem_scope + ) + with T.block("root"): + for y, x, ry, rx in T.grid(input_size, input_size, w_size, w_size): + with T.block("C"): + y, x, ry, rx = T.axis.remap("SSRR", [y, x, ry, rx]) + C_local[y, x] = C_local[y, x] + T.cast( + A_local[y + ry, x + rx], "int32" + ) * T.cast(W_local[ry, rx], "int32") + + @T.prim_func + def operator(a: T.handle, w: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer( + a, [batches, input_size, input_size, in_c], dtype="uint8", offset_factor=1 + ) + W = T.match_buffer(w, [w_size, w_size, in_c, filters], dtype="uint8", offset_factor=1) + C = T.match_buffer( + c, [batches, out_size, out_size, filters], dtype="int32", offset_factor=1 + ) + A_local = T.alloc_buffer( + [batches, padded_input_height, padded_input_width, in_c], dtype="uint8", scope=mem_scope + ) + W_local = T.alloc_buffer( + [w_size, padded_kernel_width, in_c, filters], dtype="uint8", scope=mem_scope + ) + C_local = T.alloc_buffer( + [batches, filters, out_size, padded_output_width], dtype="int32", scope=mem_scope + ) + with T.block("root"): + for n, y, x, c in T.grid(batches, padded_input_height, padded_input_width, in_c): + with T.block("A_local"): + nn, yy, xx, cc = T.axis.remap("SSSS", [n, y, x, c]) + T.reads( + A[ + nn, + yy - T.cast(input_padding, dtype="int32"), + xx - T.cast(input_padding, dtype="int32"), + cc, + ] + ) + T.writes(A_local[nn, yy, xx, cc]) + A_local[nn, yy, xx, cc] = T.if_then_else( + T.cast(input_padding, dtype="int32") <= yy + and yy + < T.cast(padded_input_height, dtype="int32") + - T.cast(input_padding, dtype="int32") + and T.cast(input_padding, dtype="int32") <= xx + and xx + < T.cast(padded_input_width, dtype="int32") + - T.cast(input_width_padding, dtype="int32"), + A[ + nn, + yy - T.cast(input_padding, dtype="int32"), + xx - T.cast(input_padding, dtype="int32"), + cc, + ] + - T.cast(a_offset, dtype="uint8"), + T.uint8(0), + dtype="uint8", + ) + for y, x, c, f in T.grid(w_size, padded_kernel_width, in_c, filters): + with T.block("W_local"): + yy, xx, cc, ff = T.axis.remap("SSSS", [y, x, c, f]) + T.reads(W[yy, xx, cc, ff]) + T.writes(W_local[yy, xx, cc, ff]) + W_local[yy, xx, cc, ff] = T.if_then_else( + xx + < T.cast(padded_kernel_width, dtype="int32") + - T.cast(kernel_width_padding, dtype="int32"), + W[yy, xx, cc, ff] - T.cast(w_offset, dtype="uint8"), + T.uint8(0), + dtype="uint8", ) + for n, f, y, x in T.grid(batches, filters, out_size, padded_output_width): + with T.block("C_local_init"): + n, f, y, x = T.axis.remap("SSSS", [n, f, y, x]) + C_local[n, f, y, x] = 0 + for n, f, y, x, ry, rx, rc in T.grid( + batches, filters, input_size, input_size, w_size, w_size, in_c + ): + with T.block("C"): + n, f, y, x, ry, rx, rc = T.axis.remap("SSSSRRR", [n, f, y, x, ry, rx, rc]) + C_local[n, f, y, x] = C_local[n, f, y, x] + T.cast( + A_local[n, y + ry, x + rx, rc], "int32" + ) * T.cast(W_local[ry, rx, rc, f], "int32") + for n, f, y, x in T.grid(batches, filters, out_size, out_size): + with T.block("C_local"): + n, f, y, x = T.axis.remap("SSSS", [n, f, y, x]) + C[n, y, x, f] = C_local[n, f, y, x] - @T.prim_func - def conv2d_vrmpy_desc(a: T.handle, w: T.handle, c: T.handle): - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A_local = T.match_buffer(a, [padded_input_height, padded_input_width], dtype="uint8", offset_factor=1, scope=mem_scope) - W_local = T.match_buffer(w, [w_size, padded_kernel_width], dtype="uint8", offset_factor=1, scope=mem_scope) - C_local = T.match_buffer(c, [out_size, padded_output_width], dtype="int32", offset_factor=1, scope=mem_scope) - with T.block("root"): - for y, x, ry, rx in T.grid(input_size, input_size, w_size, w_size): - with T.block("C"): - y, x, ry, rx = T.axis.remap("SSRR", [y, x, ry, rx]) - C_local[y, x] = C_local[y, x] + T.cast(A_local[y + ry, x + rx], "int32") * T.cast(W_local[ry, rx], "int32") - - - @T.prim_func - def operator(a: T.handle, w: T.handle, c: T.handle) -> None: - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [batches, input_size, input_size, in_c], dtype="uint8", offset_factor=1) - W = T.match_buffer(w, [w_size, w_size, in_c, filters], dtype="uint8", offset_factor=1) - C = T.match_buffer(c, [batches, out_size, out_size, filters], dtype="int32", offset_factor=1) - A_local = T.alloc_buffer([batches, padded_input_height, padded_input_width, in_c], dtype="uint8", scope=mem_scope) - W_local = T.alloc_buffer([w_size, padded_kernel_width, in_c, filters], dtype="uint8", scope=mem_scope) - C_local = T.alloc_buffer([batches, filters, out_size, padded_output_width], dtype="int32", scope=mem_scope) - with T.block("root"): - for n, y, x, c in T.grid(batches, padded_input_height, padded_input_width, in_c): - with T.block("A_local"): - nn, yy, xx, cc = T.axis.remap("SSSS", [n, y, x, c]) - T.reads(A[nn, yy - T.cast(input_padding, dtype="int32"), xx - T.cast(input_padding, dtype="int32"), cc]) - T.writes(A_local[nn, yy, xx, cc]) - A_local[nn, yy, xx, cc] = T.if_then_else( - T.cast(input_padding, dtype="int32") <= yy and yy < T.cast(padded_input_height, dtype="int32") - T.cast(input_padding, dtype="int32") and - T.cast(input_padding, dtype="int32") <= xx and xx < T.cast(padded_input_width, dtype="int32") - T.cast(input_width_padding, dtype="int32"), - A[nn, yy - T.cast(input_padding, dtype="int32"), xx - T.cast(input_padding, dtype="int32"), cc] - T.cast(a_offset, dtype="uint8"), - T.uint8(0), - dtype="uint8" - ) - for y, x, c, f in T.grid(w_size, padded_kernel_width, in_c, filters): - with T.block("W_local"): - yy, xx, cc, ff = T.axis.remap("SSSS", [y, x, c, f]) - T.reads(W[yy, xx, cc, ff]) - T.writes(W_local[yy, xx, cc, ff]) - W_local[yy, xx, cc, ff] = T.if_then_else( - xx < T.cast(padded_kernel_width, dtype="int32") - T.cast(kernel_width_padding, dtype="int32"), - W[yy, xx, cc, ff] - T.cast(w_offset, dtype="uint8"), - T.uint8(0), - dtype="uint8" - ) - for n, f, y, x in T.grid(batches, filters, out_size, padded_output_width): - with T.block("C_local_init"): - n, f, y, x = T.axis.remap("SSSS", [n, f, y, x]) - C_local[n, f, y, x] = 0 - for n, f, y, x, ry, rx, rc in T.grid(batches, filters, input_size, input_size, w_size, w_size, in_c): - with T.block("C"): - n, f, y, x, ry, rx, rc = T.axis.remap("SSSSRRR", [n, f, y, x, ry, rx, rc]) - C_local[n, f, y, x] = C_local[n, f, y, x] + T.cast(A_local[n, y + ry, x + rx, rc], "int32") * T.cast(W_local[ry, rx, rc, f], "int32") - for n, f, y, x in T.grid(batches, filters, out_size, out_size): - with T.block("C_local"): - n, f, y, x = T.axis.remap("SSSS", [n, f, y, x]) - C[n, y, x, f] = C_local[n, f, y, x] - - return conv2d_vrmpy_desc, conv2d_vrmpy, operator - \ No newline at end of file + return conv2d_vrmpy_desc, conv2d_vrmpy, operator diff --git a/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py index 5d8c5ed328a8..46cb816eeb56 100644 --- a/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py +++ b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py @@ -23,153 +23,239 @@ v_rmpy__uv_uw = "llvm.hexagon.V6.vrmpyub.128B" v_sub = "llvm.hexagon.V6.vsubw.128B" + def get_mm_uint8_intrin(in_m, in_n, in_k): blocks = in_k // 32 unrolled_rows = in_m // 16 @T.prim_func def mm_uint8_intrinsic(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle): - A = T.match_buffer(a, [T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], dtype="uint8") - B = T.match_buffer(b, [T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="uint8") - C = T.match_buffer(c, [T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="int32") + A = T.match_buffer( + a, [T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], dtype="uint8" + ) + B = T.match_buffer( + b, [T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="uint8" + ) + C = T.match_buffer( + c, [T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="int32" + ) OFFSETS = T.match_buffer(offsets, [2], dtype="uint8") - with T.block("root"): - T.reads(A[0: T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], B[0: T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], OFFSETS[0:2]) - T.writes(C[0: T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")]) + with T.block("root"): + T.reads( + A[0 : T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], + B[0 : T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], + OFFSETS[0:2], + ) + T.writes(C[0 : T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")]) for i in T.serial(in_n): for s in T.serial(blocks): - C[T.ramp(((s * 32) + (i * T.cast(in_k, dtype="int32"))), 1, 32)] = T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), T.uint32(1), (( T.cast(OFFSETS[0], dtype="int32") * T.cast(OFFSETS[1], dtype="int32")) * T.cast(in_m, dtype="int32")), dtype="int32x32") - for blok, ro in T.grid(blocks, unrolled_rows): - b_offset = T.cast(OFFSETS[1], dtype="int32") - a_offset = T.cast(OFFSETS[0], dtype="int32") + C[ + T.ramp(((s * 32) + (i * T.cast(in_k, dtype="int32"))), 1, 32) + ] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), + T.uint32(1), + ( + (T.cast(OFFSETS[0], dtype="int32") * T.cast(OFFSETS[1], dtype="int32")) + * T.cast(in_m, dtype="int32") + ), + dtype="int32x32", + ) + for blok, ro in T.grid(blocks, unrolled_rows): + b_offset = T.cast(OFFSETS[1], dtype="int32") + a_offset = T.cast(OFFSETS[0], dtype="int32") out_index = blok * 32 + i * T.cast(in_k, dtype="int32") - + B_index_unrolled = blok * 128 + (ro * 16 * T.cast(in_k, dtype="int32")) B_index_unrolled_2 = blok * 128 + (ro * 16 + 4) * T.cast(in_k, dtype="int32") B_index_unrolled_3 = blok * 128 + (ro * 16 + 8) * T.cast(in_k, dtype="int32") B_index_unrolled_4 = blok * 128 + (ro * 16 + 12) * T.cast(in_k, dtype="int32") - + A_index_unrolled = ro * 16 + i * T.cast(in_m, dtype="int32") A_index_unrolled_2 = A_index_unrolled + 4 A_index_unrolled_3 = A_index_unrolled + 8 A_index_unrolled_4 = A_index_unrolled + 12 - + a_b_vrmpy_accumulation_unrolled = T.call_llvm_intrin( - T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), # instruction - T.uint32(3), # number of inputs - C[T.ramp(out_index, 1, 32)], # accumulation location - T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype = "int32x32"), # 32 4 byte inputs (Vu) to vrmpy - T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype = "int32"), # 4 byte input (Rt) to vrmpy - dtype = "int32x32" # output datatype + T.llvm_lookup_intrinsic_id( + "llvm.hexagon.V6.vrmpyub.acc.128B" + ), # instruction + T.uint32(3), # number of inputs + C[T.ramp(out_index, 1, 32)], # accumulation location + T.reinterpret( + B[T.ramp(B_index_unrolled, 1, 128)], dtype="int32x32" + ), # 32 4 byte inputs (Vu) to vrmpy + T.reinterpret( + A[T.ramp(A_index_unrolled, 1, 4)], dtype="int32" + ), # 4 byte input (Rt) to vrmpy + dtype="int32x32", # output datatype ) a_b_vrmpy_accumulation_unrolled_1 = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), T.uint32(3), a_b_vrmpy_accumulation_unrolled, - T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype = "int32x32"), - T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype = "int32"), - dtype = "int32x32" + T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype="int32x32"), + T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype="int32"), + dtype="int32x32", ) a_b_vrmpy_accumulation_unrolled_2 = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), T.uint32(3), a_b_vrmpy_accumulation_unrolled_1, - T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype = "int32x32"), - T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype = "int32"), - dtype = "int32x32" + T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype="int32x32"), + T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype="int32"), + dtype="int32x32", ) la_b_vrmpy_accumulation_unrolled_3 = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), T.uint32(3), a_b_vrmpy_accumulation_unrolled_2, - T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype = "int32x32"), - T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype = "int32"), - dtype = "int32x32" + T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype="int32x32"), + T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype="int32"), + dtype="int32x32", ) a_b_offsets_vrmpy_accumulation_unrolled_b = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"), T.uint32(2), - T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype = "int32x32"), - T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), T.uint32(1), a_offset, dtype = "int32"), - dtype = "int32x32" + T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype="int32x32"), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + a_offset, + dtype="int32", + ), + dtype="int32x32", ) a_b_offsets_vrmpy_accumulation_unrolled_a = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), T.uint32(3), a_b_offsets_vrmpy_accumulation_unrolled_b, - T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype = "int32"),dtype = "int32x32"), - T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"), - dtype = "int32x32" + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), + T.uint32(1), + T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype="int32"), + dtype="int32x32", + ), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + b_offset, + dtype="int32", + ), + dtype="int32x32", ) a_b_offsets_vrmpy_accumulation_unrolled_b1 = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), T.uint32(3), a_b_offsets_vrmpy_accumulation_unrolled_a, - T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype = "int32x32"), - T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"), - dtype = "int32x32" + T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype="int32x32"), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + a_offset, + dtype="int32", + ), + dtype="int32x32", ) a_b_offsets_vrmpy_accumulation_unrolled_a1 = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), T.uint32(3), a_b_offsets_vrmpy_accumulation_unrolled_b1, - T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype = "int32"),dtype = "int32x32"), - T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"), - dtype = "int32x32" + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), + T.uint32(1), + T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype="int32"), + dtype="int32x32", + ), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + b_offset, + dtype="int32", + ), + dtype="int32x32", ) a_b_offsets_vrmpy_accumulation_unrolled_b2 = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), T.uint32(3), a_b_offsets_vrmpy_accumulation_unrolled_a1, - T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype = "int32x32"), - T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"), - dtype = "int32x32" + T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype="int32x32"), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + a_offset, + dtype="int32", + ), + dtype="int32x32", ) a_b_offsets_vrmpy_accumulation_unrolled_a2 = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), T.uint32(3), a_b_offsets_vrmpy_accumulation_unrolled_b2, - T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype = "int32"),dtype = "int32x32"), - T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"), - dtype = "int32x32" + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), + T.uint32(1), + T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype="int32"), + dtype="int32x32", + ), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + b_offset, + dtype="int32", + ), + dtype="int32x32", ) a_b_offsets_vrmpy_accumulation_unrolled_b3 = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), T.uint32(3), a_b_offsets_vrmpy_accumulation_unrolled_a2, - T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype = "int32x32"), - T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"), - dtype = "int32x32" + T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype="int32x32"), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + a_offset, + dtype="int32", + ), + dtype="int32x32", ) a_b_offsets_vrmpy_accumulation_unrolled_a3 = T.call_llvm_intrin( - T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), - T.uint32(3), + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), a_b_offsets_vrmpy_accumulation_unrolled_b3, - T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), T.uint32(1), T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype = "int32"), dtype = "int32x32"), - T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), T.uint32(1), b_offset, dtype = "int32"), - dtype = "int32x32" + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), + T.uint32(1), + T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype="int32"), + dtype="int32x32", + ), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + b_offset, + dtype="int32", + ), + dtype="int32x32", ) C[T.ramp(out_index, 1, 32)] = T.call_llvm_intrin( - T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vsubw.128B"), - T.uint32(2), + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vsubw.128B"), + T.uint32(2), la_b_vrmpy_accumulation_unrolled_3, a_b_offsets_vrmpy_accumulation_unrolled_a3, - dtype = "int32x32" + dtype="int32x32", ) - @T.prim_func def mmul_desc(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) @@ -178,12 +264,12 @@ def mmul_desc(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None: C = T.match_buffer(c, [in_n, in_k], dtype="int32") OFFSETS = T.match_buffer(offsets, [2], dtype="uint8") # body - with T.block("root"): + with T.block("root"): for i0, i1, i2 in T.grid(in_m, in_n, in_k): with T.block("C"): y, x, j = T.axis.remap("SSR", [i0, i1, i2]) - C[y, x] = C[y, x] + T.cast(A[y, j] - OFFSETS[0], "int32") * T.cast(B[j, x] - OFFSETS[1], "int32") + C[y, x] = C[y, x] + T.cast(A[y, j] - OFFSETS[0], "int32") * T.cast( + B[j, x] - OFFSETS[1], "int32" + ) return mmul_desc, mm_uint8_intrinsic - - diff --git a/tests/python/contrib/test_hexagon/quantization_utils.py b/tests/python/contrib/test_hexagon/quantization_utils.py index 3ebcfe3a805e..96d3d4b38a13 100644 --- a/tests/python/contrib/test_hexagon/quantization_utils.py +++ b/tests/python/contrib/test_hexagon/quantization_utils.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. + def quantize_uint8(val, minval, maxval): range = max(0.0001, maxval - minval) resize_amt = 255.0 / range @@ -41,4 +42,4 @@ def quantize_array(in_f, size): in_max = max(0, max(in_f)) for i in range(size): in_q.append(quantize_uint8(in_f[i], in_min, in_max)) - return in_q, in_min, in_max \ No newline at end of file + return in_q, in_min, in_max diff --git a/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py index 0ab7cbd734a0..ac60eb5d67af 100644 --- a/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py +++ b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py @@ -26,8 +26,8 @@ from tests.python.contrib.test_hexagon.conv_uint8_hvx_intrin import get_conv_uint8_hvx_intrin from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8 -class TestConvHVX: +class TestConvHVX: def create_inputs(input_shape, filter_shape, mem_scope): w_size, _, _, _ = filter_shape @@ -48,11 +48,12 @@ def create_inputs(input_shape, filter_shape, mem_scope): a_f = np.array(a_q, dtype="uint8").reshape(get_const_tuple(a.shape)) w_f = np.array(w_q, dtype="uint8").reshape(get_const_tuple(w.shape)) - expected_output = tvm.topi.testing.conv2d_nhwc_python(a_f, w_f, 1, input_padding).astype("int32") + expected_output = tvm.topi.testing.conv2d_nhwc_python(a_f, w_f, 1, input_padding).astype( + "int32" + ) return a_q, w_q, a_offset, w_offset, expected_output, mem_scope - a, w, a_offset, w_offset, expected_output, mem_scope = tvm.testing.parameters( (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "local")), (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global")), @@ -67,9 +68,11 @@ def create_inputs(input_shape, filter_shape, mem_scope): ) @tvm.testing.requires_hexagon - def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope): + def test_vrmpy_conv( + self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope + ): - #TODO even sized kernels and stride are currently not working. + # TODO even sized kernels and stride are currently not working. batches, input_size, _, in_c = a.shape w_size, _, _, filters = w.shape @@ -79,9 +82,15 @@ def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_ou out_shape = (batches, out_height, out_width, filters) c = np.zeros(out_shape, dtype="int32") - conv2d_vrmpy_description, conv2d_vrmpy_intrinsic, conv2d_operator = get_conv_uint8_hvx_intrin(a.shape, w.shape, a_offset, w_offset, mem_scope) + ( + conv2d_vrmpy_description, + conv2d_vrmpy_intrinsic, + conv2d_operator, + ) = get_conv_uint8_hvx_intrin(a.shape, w.shape, a_offset, w_offset, mem_scope) - intrin_name = "conv2d.uint8_{}x{}x{}x{}_{}".format(input_size, input_size, w_size, w_size, mem_scope) + intrin_name = "conv2d.uint8_{}x{}x{}x{}_{}".format( + input_size, input_size, w_size, w_size, mem_scope + ) try: TensorIntrin.register(intrin_name, conv2d_vrmpy_description, conv2d_vrmpy_intrinsic) except: @@ -89,14 +98,18 @@ def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_ou ir_module = conv2d_operator sch = tvm.tir.Schedule(ir_module, debug_mask="all") - + block = sch.get_block("C") w_block_local = sch.get_block("W_local") - sch.transform_layout(w_block_local, buffer=("write", 0), index_map=lambda h, w, c, f: (f, c, h, w)) + sch.transform_layout( + w_block_local, buffer=("write", 0), index_map=lambda h, w, c, f: (f, c, h, w) + ) a_block_local = sch.get_block("A_local") - sch.transform_layout(a_block_local, buffer=("write", 0), index_map=lambda b, h, w, c: (b, c, h, w)) + sch.transform_layout( + a_block_local, buffer=("write", 0), index_map=lambda b, h, w, c: (b, c, h, w) + ) n, f, y, x, ry, rx, rc = sch.get_loops(block) sch.reorder(n, f, rc, y, x, ry, rx) @@ -104,15 +117,18 @@ def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_ou sch.tensorize(y, intrin_name) target_hexagon = tvm.target.hexagon("v68", link_params=True) - + A = tvm.tir.decl_buffer(a.shape, name="A", dtype="uint8") W = tvm.tir.decl_buffer(w.shape, name="W", dtype="uint8") C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32") func_tir = tvm.build( - sch.mod, [A, W, C], tvm.target.Target(target_hexagon, host=target_hexagon), name="hvx_op" + sch.mod, + [A, W, C], + tvm.target.Target(target_hexagon, host=target_hexagon), + name="hvx_op", ) - + module = hexagon_session.load_module(func_tir) a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) @@ -121,22 +137,32 @@ def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_ou module(a_hexagon, w_hexagon, c_hexagon) out = c_hexagon.numpy() - out = out[:,:,:out_width,:] - + out = out[:, :, :out_width, :] + tvm.testing.assert_allclose(out, expected_output) - + timer = module.time_evaluator(module.entry_name, hexagon_session.device, number=1, repeat=1) time_ms = timer(a_hexagon, w_hexagon, c_hexagon).mean * 1000 - print("Input Shape: {} Kernel Shape: {} Mem_scope: {}. HVX: {} ms.".format(a.shape, w.shape, mem_scope, time_ms)) + print( + "Input Shape: {} Kernel Shape: {} Mem_scope: {}. HVX: {} ms.".format( + a.shape, w.shape, mem_scope, time_ms + ) + ) @tvm.testing.requires_hexagon def test_te_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope): batches, input_size, _, in_c = a.shape w_size, _, _, filters = w.shape - baseline_output, baseline_time = run_conv_te(hexagon_session, a, w, a_offset, w_offset, w_size // 2) + baseline_output, baseline_time = run_conv_te( + hexagon_session, a, w, a_offset, w_offset, w_size // 2 + ) tvm.testing.assert_allclose(baseline_output, expected_output) - print("Input Shape: {} Kernel Shape: {}. TE Baseline: {} ms".format(a.shape, w.shape, baseline_time)) - + print( + "Input Shape: {} Kernel Shape: {}. TE Baseline: {} ms".format( + a.shape, w.shape, baseline_time + ) + ) + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py index 161ff305c5c1..d025b2b48751 100644 --- a/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py +++ b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py @@ -28,9 +28,11 @@ UNROLL_FACTOR = 4 # This must match the hard-coded unrolling in mm_uint8_intrinsic(). + def can_tensorize(n, m, k): return m % (4 * UNROLL_FACTOR) == k % 32 == 0 + def blockify_matrix(B): """ inputs @@ -89,6 +91,7 @@ def blockify_matrix(B): BB[bt, j, k, b] = B[bt, y, k] return BB + def setup_test(b, m, n, k): a_shape = (b, n, m) b_shape = (b, m, k) @@ -122,6 +125,7 @@ def setup_test(b, m, n, k): return a_q, b_q, bb, a_offset, b_offset, intrin_name, expected_output + class TestMatMulVec: batches, m, n, k = tvm.testing.parameters( @@ -134,8 +138,8 @@ class TestMatMulVec: @tvm.testing.requires_hexagon def test_matmul_intrinsics(self, hexagon_session, batches, m, n, k): - - out_shape = (batches, n, k) + + out_shape = (batches, n, k) a_q, b_q, bb, a_offset, b_offset, intrin_name, out_ref = setup_test(batches, m, n, k) @@ -151,32 +155,39 @@ def operator(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None: for i0, i1, i2, i3 in T.grid(batches, m, n, k): with T.block("C"): batch, y, x, j = T.axis.remap("SSSR", [i0, i1, i2, i3]) - C[batch, y, x] = C[batch, y, x] + T.cast(A[batch, y, j] - OFFSETS[0], "int32") * T.cast(B[batch, j, x] - OFFSETS[1], "int32") + C[batch, y, x] = C[batch, y, x] + T.cast( + A[batch, y, j] - OFFSETS[0], "int32" + ) * T.cast(B[batch, j, x] - OFFSETS[1], "int32") ir_module = operator sch = tvm.tir.Schedule(ir_module, debug_mask="all") - + block = sch.get_block("C") _, y, _, _ = sch.get_loops(block) sch.tensorize(y, intrin_name) - + A = tvm.tir.decl_buffer(a_q.shape, name="A", dtype="uint8") B = tvm.tir.decl_buffer(b_q.shape, name="B", dtype="uint8") C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32") OFFSETS = tvm.tir.decl_buffer((2), name="OFFSETS", dtype="uint8") target_hexagon = tvm.target.hexagon("v68", link_params=True) - func_tir = tvm.build(sch.mod, [A, B, C, OFFSETS], tvm.target.Target(target_hexagon, host=target_hexagon), name="qmmul_vrmpy") + func_tir = tvm.build( + sch.mod, + [A, B, C, OFFSETS], + tvm.target.Target(target_hexagon, host=target_hexagon), + name="qmmul_vrmpy", + ) module = hexagon_session.load_module(func_tir) c = np.zeros(out_shape, dtype="int32") offsets = np.array([a_offset, b_offset], dtype="uint8") - + a_hexagon = tvm.runtime.ndarray.array(a_q, device=hexagon_session.device) b_hexagon = tvm.runtime.ndarray.array(bb, device=hexagon_session.device) c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) offsets_hexagon = tvm.runtime.ndarray.array(offsets, device=hexagon_session.device) - + module(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon) evaluator = module.time_evaluator(module.entry_name, hexagon_session.device, number=1) time_ms = evaluator(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon).mean * 1e3 @@ -192,4 +203,3 @@ def operator(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None: out_ref_q = np.array(out_ref_q).reshape(batches, n, k) tvm.testing.assert_allclose(out_req, out_ref_q, atol=2.0, rtol=0.0) - \ No newline at end of file