diff --git a/tests/python/contrib/test_hexagon/conv_uint8.py b/tests/python/contrib/test_hexagon/conv_uint8.py new file mode 100644 index 000000000000..9f4b00ec2c9f --- /dev/null +++ b/tests/python/contrib/test_hexagon/conv_uint8.py @@ -0,0 +1,170 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import tvm + +from numbers import Integral +from tvm import te + + +def get_const_int(expr): + """Verifies expr is integer and get the constant value. + + Parameters + ---------- + expr : tvm.Expr or int + The input expression. + + Returns + ------- + out_value : int + The output. + """ + if isinstance(expr, Integral): + return expr + if not isinstance(expr, tvm.tir.IntImm): + ana = tvm.arith.Analyzer() + expr = ana.simplify(expr) + if not isinstance(expr, tvm.tir.IntImm): + raise ValueError("Expect value to be constant int") + return int(expr.value) + + +def get_const_tuple(in_tuple): + """Verifies input tuple is IntImm or Var, returns tuple of int or Var. + + Parameters + ---------- + in_tuple : tuple of Expr + The input. + + Returns + ------- + out_tuple : tuple of int + The output. + """ + ret = [] + ana = None + for elem in in_tuple: + if isinstance(elem, (tvm.tir.Var, tvm.tir.expr.Any)): + ret.append(elem) + elif not isinstance(elem, (tvm.tir.IntImm, int)): + ana = tvm.arith.Analyzer() if ana is None else ana + elem = ana.simplify(elem) + if not isinstance(elem, tvm.tir.IntImm): + ret.append(elem) + else: + ret.append(get_const_int(elem)) + else: + ret.append(get_const_int(elem)) + return tuple(ret) + + +def Pad(Input, padding): + batch, in_height, in_width, in_channel = Input.shape + return te.compute( + (batch, in_height + 2 * padding, in_width + 2 * padding, in_channel), + lambda nn, yy, xx, cc: tvm.tir.if_then_else( + te.all( + yy >= padding, + yy - padding < in_height, + xx >= padding, + xx - padding < in_width, + ), + Input[nn, yy - padding, xx - padding, cc], + tvm.tir.const(0, Input.dtype), + ), + name="Apad", + ) + + +def schedule_qconv2d_nhwc(outs, target, device): + s = te.create_schedule([x.op for x in outs]) + x = outs[0] + nn, yy, xx, cc = s[x].op.axis + px1, px2 = s[x].split(nn, nparts=1) + return s + + +def qconv2d_nhwc(Input, in_offset, Filter, filt_offset, stride, padding, out_dtype=None): + if out_dtype is None: + out_dtype = Input.dtype + + batch, in_height, in_width, in_channel = Input.shape + filt_height, filt_width, _, num_filter = Filter.shape + # Input is already padded. No need to add padding while computing + # out_height and out_width. + out_height = (in_height - filt_height) // stride + 1 + out_width = (in_width - filt_width) // stride + 1 + out_channel = num_filter + + rc = te.reduce_axis((0, in_channel), name="rc") + ry = te.reduce_axis((0, filt_height), name="ry") + rx = te.reduce_axis((0, filt_width), name="rx") + + return te.compute( + (batch, out_height, out_width, out_channel), + lambda nn, yy, xx, ff: te.sum( + (Input[nn, yy * stride + ry, xx * stride + rx, rc] - in_offset) + * (Filter[ry, rx, rc, ff] - filt_offset).astype(out_dtype), + axis=[rc, ry, rx], + ), + tag="qconv2d_nhwc", + ) + + +def run_conv_te(hexagon_session, a, w, a_offset, w_offset, padding): + + # Input tensor size + A = te.placeholder(a.shape, name="A", dtype="uint8") + W = te.placeholder(w.shape, name="W", dtype="uint8") + + # Pad input and create computation for quantized conv2d + Apad = Pad(A, padding) + B = qconv2d_nhwc(Apad, a_offset, W, w_offset, 1, padding) + target_hexagon = tvm.target.hexagon("v68", link_params=True) + device = hexagon_session.device + s = schedule_qconv2d_nhwc([B], target_hexagon, device) + nn, yy, xx, cc = s[B].op.axis + yo, yi = s[B].split(yy, nparts=1) + s[Apad].compute_at(s[B], yi) + s[B].vectorize(cc) + + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), device=device) + func_te = tvm.build( + s, + [A, W, B], + target=tvm.target.Target(target_hexagon, host=target_hexagon), + name="quant_conv2d", + ) + + module_te = hexagon_session.load_module(func_te) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) + w_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) + + module_te(a_hexagon, w_hexagon, b_hexagon) + evaluator = module_te.time_evaluator( + module_te.entry_name, hexagon_session.device, number=1, repeat=1 + ) + mean_ms = evaluator(a_hexagon, w_hexagon, b_hexagon).mean * 1000 + + out = b_hexagon.numpy() + + return out, mean_ms diff --git a/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py new file mode 100644 index 000000000000..a0b1d7de798f --- /dev/null +++ b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py @@ -0,0 +1,238 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import math + +from tvm.script import tir as T + + +def get_conv_uint8_hvx_intrin(input_shape, kernel_shape, a_offset, w_offset, mem_scope): + VRMPY_WIDTH = 128 + + batches, input_size, _, in_c = input_shape + w_size, _, _, filters = kernel_shape + out_size = input_size + + input_padding = w_size // 2 + + # For this usage of vrmpy it loads 4 bytes for vv from the kernel. In order + # for this implementation to not mix output data there will need to be kernel + # padding to round to the nearest multiple of 4. + kernel_width_padding = 4 - w_size % 4 + padded_kernel_width = w_size + kernel_width_padding + + # vrmpy buffer loads are always 128B and will go out of bounds for the + # implementation written here if there is not sufficient padding. This + # means that for this implementation it must always be a multiple of 128 + # and have the standard padding and the padding needed for the kernel + # window (4) + if input_size % VRMPY_WIDTH != 0: + input_width_padding = ( + (VRMPY_WIDTH - (input_size) % VRMPY_WIDTH) + input_padding + kernel_width_padding + ) + else: + input_width_padding = input_padding + kernel_width_padding + + padded_input_height = input_size + 2 * input_padding + padded_input_width = input_size + input_padding + input_width_padding + + # vrmpy output buffer loads will go out of bounds for this implementation + # if there is not proper padding. + padded_output_width = VRMPY_WIDTH * (padded_input_width // VRMPY_WIDTH) + 3 + + # The number of vrmpy loads (128B) needed to complete a horizontal frame of the input. + w_steps = math.ceil(input_size / VRMPY_WIDTH) + + # The number of vrmpy loads (4B) needed to complete a horizontal frame of the kernel. + kw_steps = math.ceil(w_size / 4) + + @T.prim_func + def conv2d_vrmpy(a: T.handle, w: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A_local = T.match_buffer( + a, + [ + T.cast(padded_input_height, dtype="int32") + * T.cast(padded_input_width, dtype="int32") + ], + dtype="uint8", + offset_factor=1, + scope=mem_scope, + ) + W_local = T.match_buffer( + w, + [T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")], + dtype="uint8", + offset_factor=1, + scope=mem_scope, + ) + C_local = T.match_buffer( + c, + [T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")], + dtype="int32", + offset_factor=1, + scope=mem_scope, + ) + with T.block("root"): + T.reads( + A_local[ + 0 : T.cast(padded_input_height, dtype="int32") + * T.cast(padded_input_width, dtype="int32") + ], + W_local[ + 0 : T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32") + ], + ) + T.writes( + C_local[ + 0 : T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32") + ] + ) + for y, x_o, x_i, rx_o, ry in T.grid(input_size, w_steps, 4, kw_steps, w_size): + C_local[ + T.ramp(y * T.cast(padded_output_width, dtype="int32") + x_o * 128 + x_i, 4, 32) + ] += T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"), + T.uint32(2), + T.reinterpret( + A_local[ + T.ramp( + (y + ry) * T.cast(padded_input_width, dtype="int32") + + x_o * 128 + + 4 * rx_o + + x_i, + 1, + 128, + ) + ], + dtype="int32x32", + ), + T.reinterpret( + W_local[ + T.ramp(ry * T.cast(padded_kernel_width, dtype="int32") + rx_o * 4, 1, 4) + ], + dtype="int32", + ), + dtype="int32x32", + ) + + @T.prim_func + def conv2d_vrmpy_desc(a: T.handle, w: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A_local = T.match_buffer( + a, + [padded_input_height, padded_input_width], + dtype="uint8", + offset_factor=1, + scope=mem_scope, + ) + W_local = T.match_buffer( + w, [w_size, padded_kernel_width], dtype="uint8", offset_factor=1, scope=mem_scope + ) + C_local = T.match_buffer( + c, [out_size, padded_output_width], dtype="int32", offset_factor=1, scope=mem_scope + ) + with T.block("root"): + for y, x, ry, rx in T.grid(input_size, input_size, w_size, w_size): + with T.block("C"): + y, x, ry, rx = T.axis.remap("SSRR", [y, x, ry, rx]) + C_local[y, x] = C_local[y, x] + T.cast( + A_local[y + ry, x + rx], "int32" + ) * T.cast(W_local[ry, rx], "int32") + + @T.prim_func + def operator(a: T.handle, w: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer( + a, [batches, input_size, input_size, in_c], dtype="uint8", offset_factor=1 + ) + W = T.match_buffer(w, [w_size, w_size, in_c, filters], dtype="uint8", offset_factor=1) + C = T.match_buffer( + c, [batches, out_size, out_size, filters], dtype="int32", offset_factor=1 + ) + A_local = T.alloc_buffer( + [batches, padded_input_height, padded_input_width, in_c], dtype="uint8", scope=mem_scope + ) + W_local = T.alloc_buffer( + [w_size, padded_kernel_width, in_c, filters], dtype="uint8", scope=mem_scope + ) + C_local = T.alloc_buffer( + [batches, filters, out_size, padded_output_width], dtype="int32", scope=mem_scope + ) + with T.block("root"): + for n, y, x, c in T.grid(batches, padded_input_height, padded_input_width, in_c): + with T.block("A_local"): + nn, yy, xx, cc = T.axis.remap("SSSS", [n, y, x, c]) + T.reads( + A[ + nn, + yy - T.cast(input_padding, dtype="int32"), + xx - T.cast(input_padding, dtype="int32"), + cc, + ] + ) + T.writes(A_local[nn, yy, xx, cc]) + A_local[nn, yy, xx, cc] = T.if_then_else( + T.cast(input_padding, dtype="int32") <= yy + and yy + < T.cast(padded_input_height, dtype="int32") + - T.cast(input_padding, dtype="int32") + and T.cast(input_padding, dtype="int32") <= xx + and xx + < T.cast(padded_input_width, dtype="int32") + - T.cast(input_width_padding, dtype="int32"), + A[ + nn, + yy - T.cast(input_padding, dtype="int32"), + xx - T.cast(input_padding, dtype="int32"), + cc, + ] + - T.cast(a_offset, dtype="uint8"), + T.uint8(0), + dtype="uint8", + ) + for y, x, c, f in T.grid(w_size, padded_kernel_width, in_c, filters): + with T.block("W_local"): + yy, xx, cc, ff = T.axis.remap("SSSS", [y, x, c, f]) + T.reads(W[yy, xx, cc, ff]) + T.writes(W_local[yy, xx, cc, ff]) + W_local[yy, xx, cc, ff] = T.if_then_else( + xx + < T.cast(padded_kernel_width, dtype="int32") + - T.cast(kernel_width_padding, dtype="int32"), + W[yy, xx, cc, ff] - T.cast(w_offset, dtype="uint8"), + T.uint8(0), + dtype="uint8", + ) + for n, f, y, x in T.grid(batches, filters, out_size, padded_output_width): + with T.block("C_local_init"): + n, f, y, x = T.axis.remap("SSSS", [n, f, y, x]) + C_local[n, f, y, x] = 0 + for n, f, y, x, ry, rx, rc in T.grid( + batches, filters, input_size, input_size, w_size, w_size, in_c + ): + with T.block("C"): + n, f, y, x, ry, rx, rc = T.axis.remap("SSSSRRR", [n, f, y, x, ry, rx, rc]) + C_local[n, f, y, x] = C_local[n, f, y, x] + T.cast( + A_local[n, y + ry, x + rx, rc], "int32" + ) * T.cast(W_local[ry, rx, rc, f], "int32") + for n, f, y, x in T.grid(batches, filters, out_size, out_size): + with T.block("C_local"): + n, f, y, x = T.axis.remap("SSSS", [n, f, y, x]) + C[n, y, x, f] = C_local[n, f, y, x] + + return conv2d_vrmpy_desc, conv2d_vrmpy, operator diff --git a/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py new file mode 100644 index 000000000000..46cb816eeb56 --- /dev/null +++ b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py @@ -0,0 +1,275 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from tvm.script import tir as T + +w_spit__b = "llvm.hexagon.S2.vsplatrb" # Q6_R_vsplatb_R +v_spit__w = "llvm.hexagon.V6.lvsplatw.128B" +v_rmpy__uv_uw_acc = "llvm.hexagon.V6.vrmpyub.acc.128B" +v_rmpy__uv_uw = "llvm.hexagon.V6.vrmpyub.128B" +v_sub = "llvm.hexagon.V6.vsubw.128B" + + +def get_mm_uint8_intrin(in_m, in_n, in_k): + blocks = in_k // 32 + unrolled_rows = in_m // 16 + + @T.prim_func + def mm_uint8_intrinsic(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle): + A = T.match_buffer( + a, [T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], dtype="uint8" + ) + B = T.match_buffer( + b, [T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="uint8" + ) + C = T.match_buffer( + c, [T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="int32" + ) + OFFSETS = T.match_buffer(offsets, [2], dtype="uint8") + with T.block("root"): + T.reads( + A[0 : T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], + B[0 : T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], + OFFSETS[0:2], + ) + T.writes(C[0 : T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")]) + for i in T.serial(in_n): + for s in T.serial(blocks): + C[ + T.ramp(((s * 32) + (i * T.cast(in_k, dtype="int32"))), 1, 32) + ] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), + T.uint32(1), + ( + (T.cast(OFFSETS[0], dtype="int32") * T.cast(OFFSETS[1], dtype="int32")) + * T.cast(in_m, dtype="int32") + ), + dtype="int32x32", + ) + for blok, ro in T.grid(blocks, unrolled_rows): + b_offset = T.cast(OFFSETS[1], dtype="int32") + a_offset = T.cast(OFFSETS[0], dtype="int32") + out_index = blok * 32 + i * T.cast(in_k, dtype="int32") + + B_index_unrolled = blok * 128 + (ro * 16 * T.cast(in_k, dtype="int32")) + B_index_unrolled_2 = blok * 128 + (ro * 16 + 4) * T.cast(in_k, dtype="int32") + B_index_unrolled_3 = blok * 128 + (ro * 16 + 8) * T.cast(in_k, dtype="int32") + B_index_unrolled_4 = blok * 128 + (ro * 16 + 12) * T.cast(in_k, dtype="int32") + + A_index_unrolled = ro * 16 + i * T.cast(in_m, dtype="int32") + A_index_unrolled_2 = A_index_unrolled + 4 + A_index_unrolled_3 = A_index_unrolled + 8 + A_index_unrolled_4 = A_index_unrolled + 12 + + a_b_vrmpy_accumulation_unrolled = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id( + "llvm.hexagon.V6.vrmpyub.acc.128B" + ), # instruction + T.uint32(3), # number of inputs + C[T.ramp(out_index, 1, 32)], # accumulation location + T.reinterpret( + B[T.ramp(B_index_unrolled, 1, 128)], dtype="int32x32" + ), # 32 4 byte inputs (Vu) to vrmpy + T.reinterpret( + A[T.ramp(A_index_unrolled, 1, 4)], dtype="int32" + ), # 4 byte input (Rt) to vrmpy + dtype="int32x32", # output datatype + ) + + a_b_vrmpy_accumulation_unrolled_1 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_vrmpy_accumulation_unrolled, + T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype="int32x32"), + T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype="int32"), + dtype="int32x32", + ) + + a_b_vrmpy_accumulation_unrolled_2 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_vrmpy_accumulation_unrolled_1, + T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype="int32x32"), + T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype="int32"), + dtype="int32x32", + ) + + la_b_vrmpy_accumulation_unrolled_3 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_vrmpy_accumulation_unrolled_2, + T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype="int32x32"), + T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype="int32"), + dtype="int32x32", + ) + + a_b_offsets_vrmpy_accumulation_unrolled_b = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"), + T.uint32(2), + T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype="int32x32"), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + a_offset, + dtype="int32", + ), + dtype="int32x32", + ) + + a_b_offsets_vrmpy_accumulation_unrolled_a = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_b, + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), + T.uint32(1), + T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype="int32"), + dtype="int32x32", + ), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + b_offset, + dtype="int32", + ), + dtype="int32x32", + ) + + a_b_offsets_vrmpy_accumulation_unrolled_b1 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_a, + T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype="int32x32"), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + a_offset, + dtype="int32", + ), + dtype="int32x32", + ) + + a_b_offsets_vrmpy_accumulation_unrolled_a1 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_b1, + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), + T.uint32(1), + T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype="int32"), + dtype="int32x32", + ), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + b_offset, + dtype="int32", + ), + dtype="int32x32", + ) + + a_b_offsets_vrmpy_accumulation_unrolled_b2 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_a1, + T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype="int32x32"), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + a_offset, + dtype="int32", + ), + dtype="int32x32", + ) + + a_b_offsets_vrmpy_accumulation_unrolled_a2 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_b2, + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), + T.uint32(1), + T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype="int32"), + dtype="int32x32", + ), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + b_offset, + dtype="int32", + ), + dtype="int32x32", + ) + + a_b_offsets_vrmpy_accumulation_unrolled_b3 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_a2, + T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype="int32x32"), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + a_offset, + dtype="int32", + ), + dtype="int32x32", + ) + + a_b_offsets_vrmpy_accumulation_unrolled_a3 = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + a_b_offsets_vrmpy_accumulation_unrolled_b3, + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), + T.uint32(1), + T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype="int32"), + dtype="int32x32", + ), + T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), + T.uint32(1), + b_offset, + dtype="int32", + ), + dtype="int32x32", + ) + + C[T.ramp(out_index, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vsubw.128B"), + T.uint32(2), + la_b_vrmpy_accumulation_unrolled_3, + a_b_offsets_vrmpy_accumulation_unrolled_a3, + dtype="int32x32", + ) + + @T.prim_func + def mmul_desc(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [in_n, in_m], dtype="uint8") + B = T.match_buffer(b, [in_m, in_k], dtype="uint8") + C = T.match_buffer(c, [in_n, in_k], dtype="int32") + OFFSETS = T.match_buffer(offsets, [2], dtype="uint8") + # body + with T.block("root"): + for i0, i1, i2 in T.grid(in_m, in_n, in_k): + with T.block("C"): + y, x, j = T.axis.remap("SSR", [i0, i1, i2]) + C[y, x] = C[y, x] + T.cast(A[y, j] - OFFSETS[0], "int32") * T.cast( + B[j, x] - OFFSETS[1], "int32" + ) + + return mmul_desc, mm_uint8_intrinsic diff --git a/tests/python/contrib/test_hexagon/quantization_utils.py b/tests/python/contrib/test_hexagon/quantization_utils.py new file mode 100644 index 000000000000..96d3d4b38a13 --- /dev/null +++ b/tests/python/contrib/test_hexagon/quantization_utils.py @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def quantize_uint8(val, minval, maxval): + range = max(0.0001, maxval - minval) + resize_amt = 255.0 / range + value_f = (val - minval) * resize_amt + value_i = round(value_f, 8) + if value_i < 0: + return 0 + elif value_i > 255: + return 255 + else: + return int(value_i) + + +def dequantize(val, minval, maxval): + range = max(0.0001, maxval - minval) + stepsize = range / 4294967296 + return val * stepsize + + +def quantize_array(in_f, size): + in_q = [] + # 0 must lie in interval [min,max] for quantization to work correctly. + in_min = min(0, min(in_f)) + in_max = max(0, max(in_f)) + for i in range(size): + in_q.append(quantize_uint8(in_f[i], in_min, in_max)) + return in_q, in_min, in_max diff --git a/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py new file mode 100644 index 000000000000..ac60eb5d67af --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import tvm.testing +import tvm.topi.testing + +from numpy.random import default_rng +from tvm.tir.function import TensorIntrin + +from tests.python.contrib.test_hexagon.conv_uint8 import get_const_tuple, run_conv_te +from tests.python.contrib.test_hexagon.conv_uint8_hvx_intrin import get_conv_uint8_hvx_intrin +from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8 + + +class TestConvHVX: + def create_inputs(input_shape, filter_shape, mem_scope): + + w_size, _, _, _ = filter_shape + input_padding = w_size // 2 + + rng = default_rng() + a = rng.integers(1, 255, input_shape, dtype="uint8") + w = rng.integers(1, 8, filter_shape, dtype="uint8") + + a_q, a_min, a_max = quantize_array(a.reshape(a.size), a.size) + w_q, b_min, b_max = quantize_array(w.reshape(w.size), w.size) + + a_q = np.array(a_q, dtype="uint8").reshape(input_shape) + w_q = np.array(w_q, dtype="uint8").reshape(filter_shape) + + a_offset = quantize_uint8(0.0, a_min, a_max) + w_offset = quantize_uint8(0.0, b_min, b_max) + + a_f = np.array(a_q, dtype="uint8").reshape(get_const_tuple(a.shape)) + w_f = np.array(w_q, dtype="uint8").reshape(get_const_tuple(w.shape)) + expected_output = tvm.topi.testing.conv2d_nhwc_python(a_f, w_f, 1, input_padding).astype( + "int32" + ) + + return a_q, w_q, a_offset, w_offset, expected_output, mem_scope + + a, w, a_offset, w_offset, expected_output, mem_scope = tvm.testing.parameters( + (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "local")), + (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global")), + (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global.vtcm")), + (create_inputs((1, 128, 128, 3), (7, 7, 3, 1), "local")), + (create_inputs((1, 128, 128, 3), (5, 5, 3, 1), "local")), + (create_inputs((1, 128, 128, 3), (3, 3, 3, 1), "local")), + (create_inputs((4, 128, 128, 1), (3, 3, 1, 4), "local")), + (create_inputs((2, 32, 32, 32), (7, 7, 32, 2), "local")), + (create_inputs((2, 34, 34, 29), (5, 5, 29, 2), "local")), + (create_inputs((1, 512, 512, 1), (9, 9, 1, 1), "local")), + ) + + @tvm.testing.requires_hexagon + def test_vrmpy_conv( + self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope + ): + + # TODO even sized kernels and stride are currently not working. + + batches, input_size, _, in_c = a.shape + w_size, _, _, filters = w.shape + + out_height = (input_size - w_size + 2 * (w_size // 2)) + 1 + out_width = (input_size - w_size + 2 * (w_size // 2)) + 1 + out_shape = (batches, out_height, out_width, filters) + c = np.zeros(out_shape, dtype="int32") + + ( + conv2d_vrmpy_description, + conv2d_vrmpy_intrinsic, + conv2d_operator, + ) = get_conv_uint8_hvx_intrin(a.shape, w.shape, a_offset, w_offset, mem_scope) + + intrin_name = "conv2d.uint8_{}x{}x{}x{}_{}".format( + input_size, input_size, w_size, w_size, mem_scope + ) + try: + TensorIntrin.register(intrin_name, conv2d_vrmpy_description, conv2d_vrmpy_intrinsic) + except: + print("Intrinsic already registered.") + + ir_module = conv2d_operator + sch = tvm.tir.Schedule(ir_module, debug_mask="all") + + block = sch.get_block("C") + + w_block_local = sch.get_block("W_local") + sch.transform_layout( + w_block_local, buffer=("write", 0), index_map=lambda h, w, c, f: (f, c, h, w) + ) + + a_block_local = sch.get_block("A_local") + sch.transform_layout( + a_block_local, buffer=("write", 0), index_map=lambda b, h, w, c: (b, c, h, w) + ) + + n, f, y, x, ry, rx, rc = sch.get_loops(block) + sch.reorder(n, f, rc, y, x, ry, rx) + + sch.tensorize(y, intrin_name) + + target_hexagon = tvm.target.hexagon("v68", link_params=True) + + A = tvm.tir.decl_buffer(a.shape, name="A", dtype="uint8") + W = tvm.tir.decl_buffer(w.shape, name="W", dtype="uint8") + C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32") + + func_tir = tvm.build( + sch.mod, + [A, W, C], + tvm.target.Target(target_hexagon, host=target_hexagon), + name="hvx_op", + ) + + module = hexagon_session.load_module(func_tir) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) + w_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + + module(a_hexagon, w_hexagon, c_hexagon) + out = c_hexagon.numpy() + out = out[:, :, :out_width, :] + + tvm.testing.assert_allclose(out, expected_output) + + timer = module.time_evaluator(module.entry_name, hexagon_session.device, number=1, repeat=1) + time_ms = timer(a_hexagon, w_hexagon, c_hexagon).mean * 1000 + print( + "Input Shape: {} Kernel Shape: {} Mem_scope: {}. HVX: {} ms.".format( + a.shape, w.shape, mem_scope, time_ms + ) + ) + + @tvm.testing.requires_hexagon + def test_te_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope): + batches, input_size, _, in_c = a.shape + w_size, _, _, filters = w.shape + baseline_output, baseline_time = run_conv_te( + hexagon_session, a, w, a_offset, w_offset, w_size // 2 + ) + tvm.testing.assert_allclose(baseline_output, expected_output) + print( + "Input Shape: {} Kernel Shape: {}. TE Baseline: {} ms".format( + a.shape, w.shape, baseline_time + ) + ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py new file mode 100644 index 000000000000..d025b2b48751 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py @@ -0,0 +1,205 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import tvm +import tvm.testing + +from numpy.random import default_rng +from tvm.script import tir as T +from tvm.tir.function import TensorIntrin + +from tests.python.contrib.test_hexagon.mmul_unit8_hvx_intrin import get_mm_uint8_intrin +from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8 + +UNROLL_FACTOR = 4 # This must match the hard-coded unrolling in mm_uint8_intrinsic(). + + +def can_tensorize(n, m, k): + return m % (4 * UNROLL_FACTOR) == k % 32 == 0 + + +def blockify_matrix(B): + """ + inputs + ------ + B : numpy 2D array (of M x K ) to be blockified + + outputs + ------- + BB : blockified B as array of dimensions (M/4) x K x 4 + + blockification is in preparation for HVX ops on 128B 'vectors' + assuming input is of type int8 or uint8, and matrix multiplication uses vrmpy to accumulate to int32 + - once for each block - then K/32 such vectors will be required to carry out the operation. + + if B is height x width = M x K, then output is array of K blocks in x direction, + i.e. as [block1, block2, ... blockK ] where each block has dimensions M/4 x 4. + Specifically: + + B = [ B[1,1] B[1,2] ... B[1,K] ] + [ B[2,1] B[2,2] ... B[2,K] ] + . + . + . + [ B[M,1] B[M,2] ... B[M,K] ] + + + BB = | | | + | | | + [ B[1,1] B[2,1] B[3,1] B[4,1] | B[1,2] B[2,2] B[3,2] B[4,2] | ... | B[1,K] B[2,K] B[3,K] B[4,K] ] + [ B[5,1] B[6,1] B[7,1] B[8,1] | B[5,2] B[6,2] B[7,2] B[8,2] | ... | B[5,K] B[6,K] B[7,K] B[8,K] ] + . | . | . | . + . | . | . | . + . | . | . | . + [ B[M-3,1] B[M-2,1] B[M-1,1] B[M,1] | B[M-3,2] B[M-2,2] B[M-1,2] B[M,2] | ... | B[M-3,K] B[M-2,K] B[M-1,K] B[M,K] ] + | | | + | | | + | | | + | | | + ^ | ^ | | ^ + | | | | | | + | | | | | | + | | | | | | + block 1 block 2 | | block K + + """ + BT, M, K = B.shape + assert M % 4 == 0 + out_height = M // 4 + out_shape = tuple((BT, out_height, K, 4)) + BB = np.zeros(out_shape).astype(B.dtype) # block form of B + for bt in range(BT): + for j in range(0, out_height): + for k in range(0, K): + for b in range(0, 4): + y = b + 4 * j + BB[bt, j, k, b] = B[bt, y, k] + return BB + + +def setup_test(b, m, n, k): + a_shape = (b, n, m) + b_shape = (b, m, k) + + rng = default_rng() + a = rng.integers(1, 16, a_shape, dtype="uint8") + b = rng.integers(1, 16, b_shape, dtype="uint8") + + a_q, a_min, a_max = quantize_array(a.reshape(a.size), a.size) + b_q, b_min, b_max = quantize_array(b.reshape(b.size), b.size) + a_q = np.array(a_q, dtype="uint8").reshape(a_shape) + b_q = np.array(b_q, dtype="uint8").reshape(b_shape) + a_offset = quantize_uint8(0.0, a_min, a_max) + b_offset = quantize_uint8(0.0, b_min, b_max) + if can_tensorize(n, m, k): + bb = blockify_matrix(b_q) # blockification is only used by the tensorized version + bb = bb.reshape(-1) # go via 1D rep in case there are stride / offset issues + bb = bb.reshape(b_shape) + else: + bb = [] + + a_f = np.array(a_q, dtype="int32").reshape(a_shape) + b_f = np.array(b_q, dtype="int32").reshape(b_shape) + expected_output = np.matmul(a_f, b_f) + + intrin_name = "mm.uint8_{}x{}x{}".format(m, n, k) + try: + TensorIntrin.register(intrin_name, *get_mm_uint8_intrin(m, n, k)) + except: + print("Intrinsic already registered.") + + return a_q, b_q, bb, a_offset, b_offset, intrin_name, expected_output + + +class TestMatMulVec: + + batches, m, n, k = tvm.testing.parameters( + (1, 128, 768, 768), + (1, 128, 768, 3072), + (1, 128, 3072, 768), + (1, 128, 128, 64), + (1, 128, 64, 128), + ) + + @tvm.testing.requires_hexagon + def test_matmul_intrinsics(self, hexagon_session, batches, m, n, k): + + out_shape = (batches, n, k) + + a_q, b_q, bb, a_offset, b_offset, intrin_name, out_ref = setup_test(batches, m, n, k) + + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [batches, n, m], dtype="uint8") + B = T.match_buffer(b, [batches, m, k], dtype="uint8") + C = T.match_buffer(c, [batches, n, k], dtype="int32") + OFFSETS = T.match_buffer(offsets, [2], dtype="uint8") + # body + with T.block("root"): + for i0, i1, i2, i3 in T.grid(batches, m, n, k): + with T.block("C"): + batch, y, x, j = T.axis.remap("SSSR", [i0, i1, i2, i3]) + C[batch, y, x] = C[batch, y, x] + T.cast( + A[batch, y, j] - OFFSETS[0], "int32" + ) * T.cast(B[batch, j, x] - OFFSETS[1], "int32") + + ir_module = operator + sch = tvm.tir.Schedule(ir_module, debug_mask="all") + + block = sch.get_block("C") + _, y, _, _ = sch.get_loops(block) + sch.tensorize(y, intrin_name) + + A = tvm.tir.decl_buffer(a_q.shape, name="A", dtype="uint8") + B = tvm.tir.decl_buffer(b_q.shape, name="B", dtype="uint8") + C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32") + OFFSETS = tvm.tir.decl_buffer((2), name="OFFSETS", dtype="uint8") + + target_hexagon = tvm.target.hexagon("v68", link_params=True) + func_tir = tvm.build( + sch.mod, + [A, B, C, OFFSETS], + tvm.target.Target(target_hexagon, host=target_hexagon), + name="qmmul_vrmpy", + ) + module = hexagon_session.load_module(func_tir) + + c = np.zeros(out_shape, dtype="int32") + offsets = np.array([a_offset, b_offset], dtype="uint8") + + a_hexagon = tvm.runtime.ndarray.array(a_q, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(bb, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + offsets_hexagon = tvm.runtime.ndarray.array(offsets, device=hexagon_session.device) + + module(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon) + evaluator = module.time_evaluator(module.entry_name, hexagon_session.device, number=1) + time_ms = evaluator(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon).mean * 1e3 + print("Input Shape: {}. Conv time elapsed: {} ms".format((batches, m, n, k), time_ms)) + + out = c_hexagon.numpy() + out_a = out.reshape(batches * n * k) + out_req, _, _ = quantize_array(out_a, batches * n * k) + out_req = np.array(out_req).reshape(batches, n, k) + + out_ref_a = out_ref.reshape(batches * n * k) + out_ref_q, _, _ = quantize_array(out_ref_a, batches * n * k) + out_ref_q = np.array(out_ref_q).reshape(batches, n, k) + + tvm.testing.assert_allclose(out_req, out_ref_q, atol=2.0, rtol=0.0)