From 8c955008b44daf2ccfc9f07c3eb4d5cfc3e5ad64 Mon Sep 17 00:00:00 2001
From: nverke <nverke@octoml.ai>
Date: Tue, 2 Aug 2022 21:53:27 +0000
Subject: [PATCH 1/3] Add tests for 2 operators that utilize Qualcomm HVX
 intrinsics.

---
 .../python/contrib/test_hexagon/conv_uint8.py | 159 ++++++++++++++
 .../test_hexagon/conv_uint8_hvx_intrin.py     | 138 +++++++++++++
 .../test_hexagon/mmul_unit8_hvx_intrin.py     | 189 +++++++++++++++++
 .../test_hexagon/quantization_utils.py        |  44 ++++
 .../test_hexagon/test_conv_hvx_intrinsics.py  | 142 +++++++++++++
 .../test_hexagon/test_mmul_hvx_intrinsics.py  | 195 ++++++++++++++++++
 6 files changed, 867 insertions(+)
 create mode 100644 tests/python/contrib/test_hexagon/conv_uint8.py
 create mode 100644 tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py
 create mode 100644 tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py
 create mode 100644 tests/python/contrib/test_hexagon/quantization_utils.py
 create mode 100644 tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py
 create mode 100644 tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py

diff --git a/tests/python/contrib/test_hexagon/conv_uint8.py b/tests/python/contrib/test_hexagon/conv_uint8.py
new file mode 100644
index 000000000000..264e6f77bbdf
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/conv_uint8.py
@@ -0,0 +1,159 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import tvm
+
+from numbers import Integral
+from tvm import te
+
+def get_const_int(expr):
+            """Verifies expr is integer and get the constant value.
+
+            Parameters
+            ----------
+            expr : tvm.Expr or int
+                The input expression.
+
+            Returns
+            -------
+            out_value : int
+                The output.
+            """
+            if isinstance(expr, Integral):
+                return expr
+            if not isinstance(expr, tvm.tir.IntImm):
+                ana = tvm.arith.Analyzer()
+                expr = ana.simplify(expr)
+            if not isinstance(expr, tvm.tir.IntImm):
+                raise ValueError("Expect value to be constant int")
+            return int(expr.value)
+
+def get_const_tuple(in_tuple):
+    """Verifies input tuple is IntImm or Var, returns tuple of int or Var.
+
+    Parameters
+    ----------
+    in_tuple : tuple of Expr
+        The input.
+
+    Returns
+    -------
+    out_tuple : tuple of int
+        The output.
+    """
+    ret = []
+    ana = None
+    for elem in in_tuple:
+        if isinstance(elem, (tvm.tir.Var, tvm.tir.expr.Any)):
+            ret.append(elem)
+        elif not isinstance(elem, (tvm.tir.IntImm, int)):
+            ana = tvm.arith.Analyzer() if ana is None else ana
+            elem = ana.simplify(elem)
+            if not isinstance(elem, tvm.tir.IntImm):
+                ret.append(elem)
+            else:
+                ret.append(get_const_int(elem))
+        else:
+            ret.append(get_const_int(elem))
+    return tuple(ret)
+
+def Pad(Input, padding):
+    batch, in_height, in_width, in_channel = Input.shape
+    return te.compute(
+        (batch, in_height + 2 * padding, in_width + 2 * padding, in_channel),
+        lambda nn, yy, xx, cc: tvm.tir.if_then_else(
+            te.all(
+                yy >= padding,
+                yy - padding < in_height,
+                xx >= padding,
+                xx - padding < in_width,
+            ),
+            Input[nn, yy - padding, xx - padding, cc],
+            tvm.tir.const(0, Input.dtype),
+        ),
+        name="Apad",
+    )
+
+def schedule_qconv2d_nhwc(outs, target, device):
+    s = te.create_schedule([x.op for x in outs])
+    x = outs[0]
+    nn, yy, xx, cc = s[x].op.axis
+    px1, px2 = s[x].split(nn, nparts=1)
+    return s
+
+def qconv2d_nhwc(Input, in_offset, Filter, filt_offset, stride, padding, out_dtype=None):
+    if out_dtype is None:
+        out_dtype = Input.dtype
+
+    batch, in_height, in_width, in_channel = Input.shape
+    filt_height, filt_width, _, num_filter = Filter.shape
+    # Input is already padded. No need to add padding while computing
+    # out_height and out_width.
+    out_height = (in_height - filt_height) // stride + 1
+    out_width = (in_width - filt_width) // stride + 1
+    out_channel = num_filter
+
+    rc = te.reduce_axis((0, in_channel), name="rc")
+    ry = te.reduce_axis((0, filt_height), name="ry")
+    rx = te.reduce_axis((0, filt_width), name="rx")
+
+    return te.compute(
+        (batch, out_height, out_width, out_channel),
+        lambda nn, yy, xx, ff: te.sum(
+            (Input[nn, yy * stride + ry, xx * stride + rx, rc] - in_offset)
+            * (Filter[ry, rx, rc, ff] - filt_offset).astype(out_dtype),
+            axis=[rc, ry, rx],
+        ),
+        tag="qconv2d_nhwc",
+    )
+
+
+def run_conv_te(hexagon_session, a, w, a_offset, w_offset, padding):
+
+    # Input tensor size
+    A = te.placeholder(a.shape, name="A", dtype="uint8")
+    W = te.placeholder(w.shape, name="W", dtype="uint8")
+
+    # Pad input and create computation for quantized conv2d
+    Apad = Pad(A, padding)
+    B = qconv2d_nhwc(Apad, a_offset, W, w_offset, 1, padding)
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    device = hexagon_session.device
+    s = schedule_qconv2d_nhwc([B], target_hexagon, device)
+    nn, yy, xx, cc = s[B].op.axis
+    yo, yi = s[B].split(yy, nparts=1)
+    s[Apad].compute_at(s[B], yi)
+    s[B].vectorize(cc)
+
+    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), device=device)
+    func_te = tvm.build(s, [A, W, B], target=tvm.target.Target(target_hexagon, host=target_hexagon), name="quant_conv2d")
+
+    module_te = hexagon_session.load_module(func_te)
+
+    a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device)
+    w_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device)
+    b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device)
+
+    module_te(a_hexagon, w_hexagon, b_hexagon)
+    evaluator = module_te.time_evaluator(module_te.entry_name, hexagon_session.device, number=1, repeat=1)
+    mean_ms = evaluator(a_hexagon, w_hexagon, b_hexagon).mean * 1000
+
+    out = b_hexagon.numpy()
+
+    return out, mean_ms
+    
\ No newline at end of file
diff --git a/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py
new file mode 100644
index 000000000000..44154bf29767
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py
@@ -0,0 +1,138 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import math
+
+from tvm.script import tir as T
+
+def get_conv_uint8_hvx_intrin(input_shape, kernel_shape, a_offset, w_offset, mem_scope):
+        VRMPY_WIDTH = 128
+        
+        batches, input_size, _, in_c = input_shape
+        w_size, _, _, filters = kernel_shape
+        out_size = input_size
+        
+        input_padding = w_size // 2
+        
+        # For this usage of vrmpy it loads 4 bytes for vv from the kernel. In order
+        # for this implementation to not mix output data there will need to be kernel 
+        # padding to round to the nearest multiple of 4.
+        kernel_width_padding = 4 - w_size % 4 
+        padded_kernel_width = w_size + kernel_width_padding
+
+        # vrmpy buffer loads are always 128B and will go out of bounds for the 
+        # implementation written here if there is not sufficient padding. This
+        # means that for this implementation it must always be a multiple of 128
+        # and have the standard padding and the padding needed for the kernel 
+        # window (4)
+        if (input_size % VRMPY_WIDTH != 0):
+            input_width_padding = (VRMPY_WIDTH - (input_size) % VRMPY_WIDTH) + input_padding + kernel_width_padding
+        else:
+            input_width_padding = input_padding + kernel_width_padding
+
+        padded_input_height = input_size + 2 * input_padding
+        padded_input_width = input_size + input_padding + input_width_padding
+
+        # vrmpy output buffer loads will go out of bounds for this implementation 
+        # if there is not proper padding. 
+        padded_output_width = VRMPY_WIDTH * (padded_input_width // VRMPY_WIDTH) + 3
+        
+        # The number of vrmpy loads (128B) needed to complete a horizontal frame of the input.
+        w_steps = math.ceil(input_size / VRMPY_WIDTH)
+
+        # The number of vrmpy loads (4B) needed to complete a horizontal frame of the kernel. 
+        kw_steps = math.ceil(w_size / 4)
+
+        @T.prim_func
+        def conv2d_vrmpy(a: T.handle, w: T.handle, c: T.handle):
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            A_local = T.match_buffer(a, [T.cast(padded_input_height, dtype="int32") * T.cast(padded_input_width, dtype="int32")], dtype="uint8", offset_factor=1, scope=mem_scope)
+            W_local = T.match_buffer(w, [T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")], dtype="uint8", offset_factor=1, scope=mem_scope)
+            C_local = T.match_buffer(c, [T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")], dtype="int32", offset_factor=1, scope=mem_scope)
+            with T.block("root"):
+                T.reads(A_local[0: T.cast(padded_input_height, dtype="int32") * T.cast(padded_input_width, dtype="int32")], W_local[0: T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")])
+                T.writes(C_local[0: T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")])
+                for y, x_o, x_i, rx_o, ry in T.grid(input_size, w_steps, 4, kw_steps, w_size):
+                    C_local[T.ramp(y * T.cast(padded_output_width, dtype="int32") + x_o * 128 + x_i, 4, 32)] += T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"),
+                        T.uint32(2),
+                        T.reinterpret(A_local[T.ramp((y + ry) * T.cast(padded_input_width, dtype="int32") + x_o * 128 + 4 * rx_o + x_i, 1, 128)], dtype = "int32x32"),
+                        T.reinterpret(W_local[T.ramp(ry * T.cast(padded_kernel_width, dtype="int32") + rx_o * 4, 1, 4)], dtype = "int32"),
+                        dtype="int32x32"
+                    )
+
+        @T.prim_func
+        def conv2d_vrmpy_desc(a: T.handle, w: T.handle, c: T.handle):
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            A_local = T.match_buffer(a, [padded_input_height, padded_input_width], dtype="uint8", offset_factor=1, scope=mem_scope)
+            W_local = T.match_buffer(w, [w_size, padded_kernel_width], dtype="uint8", offset_factor=1, scope=mem_scope)
+            C_local = T.match_buffer(c, [out_size, padded_output_width], dtype="int32", offset_factor=1, scope=mem_scope)
+            with T.block("root"):
+                for y, x, ry, rx in T.grid(input_size, input_size, w_size, w_size):
+                    with T.block("C"):
+                        y, x, ry, rx = T.axis.remap("SSRR", [y, x, ry, rx])
+                        C_local[y, x] = C_local[y, x] + T.cast(A_local[y + ry, x + rx], "int32") * T.cast(W_local[ry, rx], "int32")
+
+
+        @T.prim_func
+        def operator(a: T.handle, w: T.handle, c: T.handle) -> None:
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            A = T.match_buffer(a, [batches, input_size, input_size, in_c], dtype="uint8", offset_factor=1)
+            W = T.match_buffer(w, [w_size, w_size, in_c, filters], dtype="uint8", offset_factor=1)
+            C = T.match_buffer(c, [batches, out_size, out_size, filters], dtype="int32", offset_factor=1)
+            A_local = T.alloc_buffer([batches, padded_input_height, padded_input_width, in_c], dtype="uint8", scope=mem_scope)
+            W_local = T.alloc_buffer([w_size, padded_kernel_width, in_c, filters], dtype="uint8", scope=mem_scope)
+            C_local = T.alloc_buffer([batches, filters, out_size, padded_output_width], dtype="int32", scope=mem_scope)
+            with T.block("root"):
+                for n, y, x, c in T.grid(batches, padded_input_height, padded_input_width, in_c):
+                    with T.block("A_local"):
+                        nn, yy, xx, cc = T.axis.remap("SSSS", [n, y, x, c])
+                        T.reads(A[nn, yy - T.cast(input_padding, dtype="int32"), xx - T.cast(input_padding, dtype="int32"), cc])
+                        T.writes(A_local[nn, yy, xx, cc])
+                        A_local[nn, yy, xx, cc] = T.if_then_else(
+                            T.cast(input_padding, dtype="int32") <= yy and yy < T.cast(padded_input_height, dtype="int32") - T.cast(input_padding, dtype="int32") and 
+                            T.cast(input_padding, dtype="int32") <= xx and xx < T.cast(padded_input_width, dtype="int32") - T.cast(input_width_padding, dtype="int32"), 
+                            A[nn, yy - T.cast(input_padding, dtype="int32"), xx - T.cast(input_padding, dtype="int32"), cc] - T.cast(a_offset, dtype="uint8"), 
+                            T.uint8(0), 
+                            dtype="uint8"
+                        )
+                for y, x, c, f in T.grid(w_size, padded_kernel_width, in_c, filters):
+                    with T.block("W_local"):
+                        yy, xx, cc, ff = T.axis.remap("SSSS", [y, x, c, f])
+                        T.reads(W[yy, xx, cc, ff])
+                        T.writes(W_local[yy, xx, cc, ff])
+                        W_local[yy, xx, cc, ff] = T.if_then_else(
+                            xx < T.cast(padded_kernel_width, dtype="int32") - T.cast(kernel_width_padding, dtype="int32"), 
+                            W[yy, xx, cc, ff] - T.cast(w_offset, dtype="uint8"), 
+                            T.uint8(0), 
+                            dtype="uint8"
+                        )
+                for n, f, y, x in T.grid(batches, filters, out_size, padded_output_width):
+                    with T.block("C_local_init"):
+                        n, f, y, x = T.axis.remap("SSSS", [n, f, y, x])
+                        C_local[n, f, y, x] = 0
+                for n, f, y, x, ry, rx, rc in T.grid(batches, filters, input_size, input_size, w_size, w_size, in_c):
+                    with T.block("C"):
+                        n, f, y, x, ry, rx, rc = T.axis.remap("SSSSRRR", [n, f, y, x, ry, rx, rc])
+                        C_local[n, f, y, x] = C_local[n, f, y, x] + T.cast(A_local[n, y + ry, x + rx, rc], "int32") * T.cast(W_local[ry, rx, rc, f], "int32")
+                for n, f, y, x in T.grid(batches, filters, out_size, out_size):
+                    with T.block("C_local"):
+                        n, f, y, x = T.axis.remap("SSSS", [n, f, y, x])
+                        C[n, y, x, f] = C_local[n, f, y, x]
+
+        return conv2d_vrmpy_desc, conv2d_vrmpy, operator
+        
\ No newline at end of file
diff --git a/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py
new file mode 100644
index 000000000000..5d8c5ed328a8
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py
@@ -0,0 +1,189 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from tvm.script import tir as T
+
+w_spit__b = "llvm.hexagon.S2.vsplatrb"  # Q6_R_vsplatb_R
+v_spit__w = "llvm.hexagon.V6.lvsplatw.128B"
+v_rmpy__uv_uw_acc = "llvm.hexagon.V6.vrmpyub.acc.128B"
+v_rmpy__uv_uw = "llvm.hexagon.V6.vrmpyub.128B"
+v_sub = "llvm.hexagon.V6.vsubw.128B"
+
+def get_mm_uint8_intrin(in_m, in_n, in_k):
+    blocks = in_k // 32
+    unrolled_rows = in_m // 16
+
+    @T.prim_func
+    def mm_uint8_intrinsic(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle):
+        A = T.match_buffer(a, [T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], dtype="uint8")
+        B = T.match_buffer(b, [T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="uint8")
+        C = T.match_buffer(c, [T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="int32")
+        OFFSETS = T.match_buffer(offsets, [2], dtype="uint8")
+        with T.block("root"): 
+            T.reads(A[0: T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], B[0: T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], OFFSETS[0:2])
+            T.writes(C[0: T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")])
+            for i in T.serial(in_n):
+                for s in T.serial(blocks):
+                    C[T.ramp(((s * 32) + (i * T.cast(in_k, dtype="int32"))), 1, 32)] = T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), T.uint32(1), (( T.cast(OFFSETS[0], dtype="int32") * T.cast(OFFSETS[1], dtype="int32")) * T.cast(in_m, dtype="int32")), dtype="int32x32")
+                for blok, ro  in T.grid(blocks, unrolled_rows):
+                    b_offset =  T.cast(OFFSETS[1], dtype="int32")
+                    a_offset =  T.cast(OFFSETS[0], dtype="int32")
+                    out_index = blok * 32 + i * T.cast(in_k, dtype="int32")
+                    
+                    B_index_unrolled = blok * 128 + (ro * 16 * T.cast(in_k, dtype="int32"))
+                    B_index_unrolled_2 = blok * 128 + (ro * 16 + 4) * T.cast(in_k, dtype="int32")
+                    B_index_unrolled_3 = blok * 128 + (ro * 16 + 8) * T.cast(in_k, dtype="int32")
+                    B_index_unrolled_4 = blok * 128 + (ro * 16 + 12) * T.cast(in_k, dtype="int32")
+                    
+                    A_index_unrolled = ro * 16 + i * T.cast(in_m, dtype="int32")
+                    A_index_unrolled_2 = A_index_unrolled + 4
+                    A_index_unrolled_3 = A_index_unrolled + 8
+                    A_index_unrolled_4 = A_index_unrolled + 12
+                    
+                    a_b_vrmpy_accumulation_unrolled = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), # instruction
+                        T.uint32(3), # number of inputs
+                        C[T.ramp(out_index, 1, 32)], # accumulation location
+                        T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype = "int32x32"), # 32 4 byte inputs (Vu) to vrmpy 
+                        T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype = "int32"), # 4 byte input (Rt) to vrmpy
+                        dtype = "int32x32" # output datatype
+                    )
+
+                    a_b_vrmpy_accumulation_unrolled_1 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_vrmpy_accumulation_unrolled,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype = "int32x32"),
+                        T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_vrmpy_accumulation_unrolled_2 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_vrmpy_accumulation_unrolled_1,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype = "int32x32"),
+                        T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    la_b_vrmpy_accumulation_unrolled_3 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_vrmpy_accumulation_unrolled_2,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype = "int32x32"),
+                        T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_b = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"),
+                        T.uint32(2),
+                        T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), T.uint32(1), a_offset, dtype = "int32"), 
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_a = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_b,
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype = "int32"),dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_b1 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_a,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_a1 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_b1,
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype = "int32"),dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_b2 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_a1,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_a2 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_b2,
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype = "int32"),dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_b3 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_a2,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_a3 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), 
+                        T.uint32(3), 
+                        a_b_offsets_vrmpy_accumulation_unrolled_b3,
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), T.uint32(1), T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype = "int32"), dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), T.uint32(1), b_offset, dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    C[T.ramp(out_index, 1, 32)] = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vsubw.128B"), 
+                        T.uint32(2), 
+                        la_b_vrmpy_accumulation_unrolled_3,
+                        a_b_offsets_vrmpy_accumulation_unrolled_a3,
+                        dtype = "int32x32"
+                    )
+
+
+    @T.prim_func
+    def mmul_desc(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, [in_n, in_m], dtype="uint8")
+        B = T.match_buffer(b, [in_m, in_k], dtype="uint8")
+        C = T.match_buffer(c, [in_n, in_k], dtype="int32")
+        OFFSETS = T.match_buffer(offsets, [2], dtype="uint8")
+        # body
+        with T.block("root"): 
+            for i0, i1, i2 in T.grid(in_m, in_n, in_k):
+                with T.block("C"):
+                    y, x, j = T.axis.remap("SSR", [i0, i1, i2])
+                    C[y, x] = C[y, x] + T.cast(A[y, j] - OFFSETS[0], "int32") * T.cast(B[j, x] - OFFSETS[1], "int32")
+
+    return mmul_desc, mm_uint8_intrinsic
+
+
diff --git a/tests/python/contrib/test_hexagon/quantization_utils.py b/tests/python/contrib/test_hexagon/quantization_utils.py
new file mode 100644
index 000000000000..3ebcfe3a805e
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/quantization_utils.py
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+def quantize_uint8(val, minval, maxval):
+    range = max(0.0001, maxval - minval)
+    resize_amt = 255.0 / range
+    value_f = (val - minval) * resize_amt
+    value_i = round(value_f, 8)
+    if value_i < 0:
+        return 0
+    elif value_i > 255:
+        return 255
+    else:
+        return int(value_i)
+
+
+def dequantize(val, minval, maxval):
+    range = max(0.0001, maxval - minval)
+    stepsize = range / 4294967296
+    return val * stepsize
+
+
+def quantize_array(in_f, size):
+    in_q = []
+    # 0 must lie in interval [min,max] for quantization to work correctly.
+    in_min = min(0, min(in_f))
+    in_max = max(0, max(in_f))
+    for i in range(size):
+        in_q.append(quantize_uint8(in_f[i], in_min, in_max))
+    return in_q, in_min, in_max
\ No newline at end of file
diff --git a/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py
new file mode 100644
index 000000000000..0ab7cbd734a0
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py
@@ -0,0 +1,142 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import tvm.testing
+import tvm.topi.testing
+
+from numpy.random import default_rng
+from tvm.tir.function import TensorIntrin
+
+from tests.python.contrib.test_hexagon.conv_uint8 import get_const_tuple, run_conv_te
+from tests.python.contrib.test_hexagon.conv_uint8_hvx_intrin import get_conv_uint8_hvx_intrin
+from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8
+
+class TestConvHVX:
+
+    def create_inputs(input_shape, filter_shape, mem_scope):
+
+        w_size, _, _, _ = filter_shape
+        input_padding = w_size // 2
+
+        rng = default_rng()
+        a = rng.integers(1, 255, input_shape, dtype="uint8")
+        w = rng.integers(1, 8, filter_shape, dtype="uint8")
+
+        a_q, a_min, a_max = quantize_array(a.reshape(a.size), a.size)
+        w_q, b_min, b_max = quantize_array(w.reshape(w.size), w.size)
+
+        a_q = np.array(a_q, dtype="uint8").reshape(input_shape)
+        w_q = np.array(w_q, dtype="uint8").reshape(filter_shape)
+
+        a_offset = quantize_uint8(0.0, a_min, a_max)
+        w_offset = quantize_uint8(0.0, b_min, b_max)
+
+        a_f = np.array(a_q, dtype="uint8").reshape(get_const_tuple(a.shape))
+        w_f = np.array(w_q, dtype="uint8").reshape(get_const_tuple(w.shape))
+        expected_output = tvm.topi.testing.conv2d_nhwc_python(a_f, w_f, 1, input_padding).astype("int32")
+
+        return a_q, w_q, a_offset, w_offset, expected_output, mem_scope
+
+
+    a, w, a_offset, w_offset, expected_output, mem_scope = tvm.testing.parameters(
+        (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "local")),
+        (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global")),
+        (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global.vtcm")),
+        (create_inputs((1, 128, 128, 3), (7, 7, 3, 1), "local")),
+        (create_inputs((1, 128, 128, 3), (5, 5, 3, 1), "local")),
+        (create_inputs((1, 128, 128, 3), (3, 3, 3, 1), "local")),
+        (create_inputs((4, 128, 128, 1), (3, 3, 1, 4), "local")),
+        (create_inputs((2, 32, 32, 32), (7, 7, 32, 2), "local")),
+        (create_inputs((2, 34, 34, 29), (5, 5, 29, 2), "local")),
+        (create_inputs((1, 512, 512, 1), (9, 9, 1, 1), "local")),
+    )
+
+    @tvm.testing.requires_hexagon
+    def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope):
+
+        #TODO even sized kernels and stride are currently not working. 
+
+        batches, input_size, _, in_c = a.shape
+        w_size, _, _, filters = w.shape
+
+        out_height = (input_size - w_size + 2 * (w_size // 2)) + 1
+        out_width = (input_size - w_size + 2 * (w_size // 2)) + 1
+        out_shape = (batches, out_height, out_width, filters)
+        c = np.zeros(out_shape, dtype="int32")
+
+        conv2d_vrmpy_description, conv2d_vrmpy_intrinsic, conv2d_operator = get_conv_uint8_hvx_intrin(a.shape, w.shape, a_offset, w_offset, mem_scope)
+
+        intrin_name = "conv2d.uint8_{}x{}x{}x{}_{}".format(input_size, input_size, w_size, w_size, mem_scope)
+        try:
+            TensorIntrin.register(intrin_name, conv2d_vrmpy_description, conv2d_vrmpy_intrinsic)
+        except:
+            print("Intrinsic already registered.")
+
+        ir_module = conv2d_operator
+        sch = tvm.tir.Schedule(ir_module, debug_mask="all")
+        
+        block = sch.get_block("C")
+
+        w_block_local = sch.get_block("W_local")
+        sch.transform_layout(w_block_local, buffer=("write", 0), index_map=lambda h, w, c, f: (f, c, h, w))
+
+        a_block_local = sch.get_block("A_local")
+        sch.transform_layout(a_block_local, buffer=("write", 0), index_map=lambda b, h, w, c: (b, c, h, w))
+
+        n, f, y, x, ry, rx, rc = sch.get_loops(block)
+        sch.reorder(n, f, rc, y, x, ry, rx)
+
+        sch.tensorize(y, intrin_name)
+
+        target_hexagon = tvm.target.hexagon("v68", link_params=True)
+        
+        A = tvm.tir.decl_buffer(a.shape, name="A", dtype="uint8")
+        W = tvm.tir.decl_buffer(w.shape, name="W", dtype="uint8")
+        C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32")
+
+        func_tir = tvm.build(
+            sch.mod, [A, W, C], tvm.target.Target(target_hexagon, host=target_hexagon), name="hvx_op"
+        )
+        
+        module = hexagon_session.load_module(func_tir)
+
+        a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device)
+        w_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device)
+        c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device)
+
+        module(a_hexagon, w_hexagon, c_hexagon)
+        out = c_hexagon.numpy()
+        out = out[:,:,:out_width,:]
+    
+        tvm.testing.assert_allclose(out, expected_output)
+        
+        timer = module.time_evaluator(module.entry_name, hexagon_session.device, number=1, repeat=1)
+        time_ms = timer(a_hexagon, w_hexagon, c_hexagon).mean * 1000
+        print("Input Shape: {} Kernel Shape: {} Mem_scope: {}. HVX: {} ms.".format(a.shape, w.shape, mem_scope, time_ms))
+
+    @tvm.testing.requires_hexagon
+    def test_te_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope):
+        batches, input_size, _, in_c = a.shape
+        w_size, _, _, filters = w.shape
+        baseline_output, baseline_time = run_conv_te(hexagon_session, a, w, a_offset, w_offset, w_size // 2)
+        tvm.testing.assert_allclose(baseline_output, expected_output)
+        print("Input Shape: {} Kernel Shape: {}. TE Baseline: {} ms".format(a.shape, w.shape, baseline_time))
+    
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py
new file mode 100644
index 000000000000..161ff305c5c1
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py
@@ -0,0 +1,195 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import tvm
+import tvm.testing
+
+from numpy.random import default_rng
+from tvm.script import tir as T
+from tvm.tir.function import TensorIntrin
+
+from tests.python.contrib.test_hexagon.mmul_unit8_hvx_intrin import get_mm_uint8_intrin
+from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8
+
+UNROLL_FACTOR = 4  # This must match the hard-coded unrolling in mm_uint8_intrinsic().
+
+def can_tensorize(n, m, k):
+    return m % (4 * UNROLL_FACTOR) == k % 32 == 0
+
+def blockify_matrix(B):
+    """
+    inputs
+    ------
+    B        : numpy 2D array (of M x K ) to be blockified
+
+    outputs
+    -------
+    BB       : blockified B as array of dimensions (M/4) x K x 4
+
+    blockification is in preparation for HVX ops on 128B 'vectors'
+    assuming input is of type int8 or uint8, and matrix multiplication uses vrmpy to accumulate to int32
+    - once for each block - then K/32 such vectors will be required to carry out the operation.
+
+    if B is height x width = M x K, then output is array of K blocks in x direction,
+    i.e. as [block1, block2, ... blockK ] where each block has dimensions M/4 x 4.
+    Specifically:
+
+     B = [ B[1,1]     B[1,2]      ...   B[1,K] ]
+         [ B[2,1]     B[2,2]      ...   B[2,K] ]
+                         .
+                         .
+                         .
+         [ B[M,1]     B[M,2]      ...   B[M,K] ]
+
+
+     BB =                                 |                                   |     |
+                                          |                                   |     |
+     [  B[1,1]   B[2,1]   B[3,1]   B[4,1] | B[1,2]   B[2,2]   B[3,2]   B[4,2] | ... | B[1,K]   B[2,K]   B[3,K]   B[4,K] ]
+     [  B[5,1]   B[6,1]   B[7,1]   B[8,1] | B[5,2]   B[6,2]   B[7,2]   B[8,2] | ... | B[5,K]   B[6,K]   B[7,K]   B[8,K] ]
+                        .                 |                 .                 |  .  |                 .
+                        .                 |                 .                 |  .  |                 .
+                        .                 |                 .                 |  .  |                 .
+     [  B[M-3,1] B[M-2,1] B[M-1,1] B[M,1] | B[M-3,2] B[M-2,2] B[M-1,2] B[M,2] | ... | B[M-3,K] B[M-2,K] B[M-1,K] B[M,K] ]
+                                          |                                   |     |
+                                          |                                   |     |
+                                          |                                   |     |
+                                          |                                   |     |
+                      ^                   |                 ^                 |     |                 ^
+                      |                   |                 |                 |     |                 |
+                      |                   |                 |                 |     |                 |
+                      |                   |                 |                 |     |                 |
+                    block 1                               block 2             |     |               block K
+
+    """
+    BT, M, K = B.shape
+    assert M % 4 == 0
+    out_height = M // 4
+    out_shape = tuple((BT, out_height, K, 4))
+    BB = np.zeros(out_shape).astype(B.dtype)  # block form of B
+    for bt in range(BT):
+        for j in range(0, out_height):
+            for k in range(0, K):
+                for b in range(0, 4):
+                    y = b + 4 * j
+                    BB[bt, j, k, b] = B[bt, y, k]
+    return BB
+
+def setup_test(b, m, n, k):
+    a_shape = (b, n, m)
+    b_shape = (b, m, k)
+
+    rng = default_rng()
+    a = rng.integers(1, 16, a_shape, dtype="uint8")
+    b = rng.integers(1, 16, b_shape, dtype="uint8")
+
+    a_q, a_min, a_max = quantize_array(a.reshape(a.size), a.size)
+    b_q, b_min, b_max = quantize_array(b.reshape(b.size), b.size)
+    a_q = np.array(a_q, dtype="uint8").reshape(a_shape)
+    b_q = np.array(b_q, dtype="uint8").reshape(b_shape)
+    a_offset = quantize_uint8(0.0, a_min, a_max)
+    b_offset = quantize_uint8(0.0, b_min, b_max)
+    if can_tensorize(n, m, k):
+        bb = blockify_matrix(b_q)  # blockification is only used by the tensorized version
+        bb = bb.reshape(-1)  # go via 1D rep in case there are stride / offset issues
+        bb = bb.reshape(b_shape)
+    else:
+        bb = []
+
+    a_f = np.array(a_q, dtype="int32").reshape(a_shape)
+    b_f = np.array(b_q, dtype="int32").reshape(b_shape)
+    expected_output = np.matmul(a_f, b_f)
+
+    intrin_name = "mm.uint8_{}x{}x{}".format(m, n, k)
+    try:
+        TensorIntrin.register(intrin_name, *get_mm_uint8_intrin(m, n, k))
+    except:
+        print("Intrinsic already registered.")
+
+    return a_q, b_q, bb, a_offset, b_offset, intrin_name, expected_output
+
+class TestMatMulVec:
+
+    batches, m, n, k = tvm.testing.parameters(
+        (1, 128, 768, 768),
+        (1, 128, 768, 3072),
+        (1, 128, 3072, 768),
+        (1, 128, 128, 64),
+        (1, 128, 64, 128),
+    )
+
+    @tvm.testing.requires_hexagon
+    def test_matmul_intrinsics(self, hexagon_session, batches, m, n, k):
+        
+        out_shape = (batches, n, k) 
+
+        a_q, b_q, bb, a_offset, b_offset, intrin_name, out_ref = setup_test(batches, m, n, k)
+
+        @T.prim_func
+        def operator(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None:
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            A = T.match_buffer(a, [batches, n, m], dtype="uint8")
+            B = T.match_buffer(b, [batches, m, k], dtype="uint8")
+            C = T.match_buffer(c, [batches, n, k], dtype="int32")
+            OFFSETS = T.match_buffer(offsets, [2], dtype="uint8")
+            # body
+            with T.block("root"):
+                for i0, i1, i2, i3 in T.grid(batches, m, n, k):
+                    with T.block("C"):
+                        batch, y, x, j = T.axis.remap("SSSR", [i0, i1, i2, i3])
+                        C[batch, y, x] = C[batch, y, x] + T.cast(A[batch, y, j] - OFFSETS[0], "int32") * T.cast(B[batch, j, x] - OFFSETS[1], "int32")
+
+        ir_module = operator
+        sch = tvm.tir.Schedule(ir_module, debug_mask="all")
+        
+        block = sch.get_block("C")
+        _, y, _, _ = sch.get_loops(block)
+        sch.tensorize(y, intrin_name)
+        
+        A = tvm.tir.decl_buffer(a_q.shape, name="A", dtype="uint8")
+        B = tvm.tir.decl_buffer(b_q.shape, name="B", dtype="uint8")
+        C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32")
+        OFFSETS = tvm.tir.decl_buffer((2), name="OFFSETS", dtype="uint8")
+
+        target_hexagon = tvm.target.hexagon("v68", link_params=True)
+        func_tir = tvm.build(sch.mod, [A, B, C, OFFSETS], tvm.target.Target(target_hexagon, host=target_hexagon), name="qmmul_vrmpy")
+        module = hexagon_session.load_module(func_tir)
+
+        c = np.zeros(out_shape, dtype="int32")
+        offsets = np.array([a_offset, b_offset], dtype="uint8")
+        
+        a_hexagon = tvm.runtime.ndarray.array(a_q, device=hexagon_session.device)
+        b_hexagon = tvm.runtime.ndarray.array(bb, device=hexagon_session.device)
+        c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device)
+        offsets_hexagon = tvm.runtime.ndarray.array(offsets, device=hexagon_session.device)
+        
+        module(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon)
+        evaluator = module.time_evaluator(module.entry_name, hexagon_session.device, number=1)
+        time_ms = evaluator(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon).mean * 1e3
+        print("Input Shape: {}. Conv time elapsed: {} ms".format((batches, m, n, k), time_ms))
+
+        out = c_hexagon.numpy()
+        out_a = out.reshape(batches * n * k)
+        out_req, _, _ = quantize_array(out_a, batches * n * k)
+        out_req = np.array(out_req).reshape(batches, n, k)
+
+        out_ref_a = out_ref.reshape(batches * n * k)
+        out_ref_q, _, _ = quantize_array(out_ref_a, batches * n * k)
+        out_ref_q = np.array(out_ref_q).reshape(batches, n, k)
+
+        tvm.testing.assert_allclose(out_req, out_ref_q, atol=2.0, rtol=0.0)
+        
\ No newline at end of file

From 775f66c0cd17d9a2356804b914ca191dab6f5ce8 Mon Sep 17 00:00:00 2001
From: nverke <nverke@octoml.ai>
Date: Tue, 2 Aug 2022 21:53:27 +0000
Subject: [PATCH 2/3] Add tests for 2 operators that utilize Qualcomm HVX
 intrinsics.

---
 .../python/contrib/test_hexagon/conv_uint8.py | 159 ++++++++++++++
 .../test_hexagon/conv_uint8_hvx_intrin.py     | 138 +++++++++++++
 .../test_hexagon/mmul_unit8_hvx_intrin.py     | 189 +++++++++++++++++
 .../test_hexagon/quantization_utils.py        |  44 ++++
 .../test_hexagon/test_conv_hvx_intrinsics.py  | 142 +++++++++++++
 .../test_hexagon/test_mmul_hvx_intrinsics.py  | 195 ++++++++++++++++++
 6 files changed, 867 insertions(+)
 create mode 100644 tests/python/contrib/test_hexagon/conv_uint8.py
 create mode 100644 tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py
 create mode 100644 tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py
 create mode 100644 tests/python/contrib/test_hexagon/quantization_utils.py
 create mode 100644 tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py
 create mode 100644 tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py

diff --git a/tests/python/contrib/test_hexagon/conv_uint8.py b/tests/python/contrib/test_hexagon/conv_uint8.py
new file mode 100644
index 000000000000..264e6f77bbdf
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/conv_uint8.py
@@ -0,0 +1,159 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import tvm
+
+from numbers import Integral
+from tvm import te
+
+def get_const_int(expr):
+            """Verifies expr is integer and get the constant value.
+
+            Parameters
+            ----------
+            expr : tvm.Expr or int
+                The input expression.
+
+            Returns
+            -------
+            out_value : int
+                The output.
+            """
+            if isinstance(expr, Integral):
+                return expr
+            if not isinstance(expr, tvm.tir.IntImm):
+                ana = tvm.arith.Analyzer()
+                expr = ana.simplify(expr)
+            if not isinstance(expr, tvm.tir.IntImm):
+                raise ValueError("Expect value to be constant int")
+            return int(expr.value)
+
+def get_const_tuple(in_tuple):
+    """Verifies input tuple is IntImm or Var, returns tuple of int or Var.
+
+    Parameters
+    ----------
+    in_tuple : tuple of Expr
+        The input.
+
+    Returns
+    -------
+    out_tuple : tuple of int
+        The output.
+    """
+    ret = []
+    ana = None
+    for elem in in_tuple:
+        if isinstance(elem, (tvm.tir.Var, tvm.tir.expr.Any)):
+            ret.append(elem)
+        elif not isinstance(elem, (tvm.tir.IntImm, int)):
+            ana = tvm.arith.Analyzer() if ana is None else ana
+            elem = ana.simplify(elem)
+            if not isinstance(elem, tvm.tir.IntImm):
+                ret.append(elem)
+            else:
+                ret.append(get_const_int(elem))
+        else:
+            ret.append(get_const_int(elem))
+    return tuple(ret)
+
+def Pad(Input, padding):
+    batch, in_height, in_width, in_channel = Input.shape
+    return te.compute(
+        (batch, in_height + 2 * padding, in_width + 2 * padding, in_channel),
+        lambda nn, yy, xx, cc: tvm.tir.if_then_else(
+            te.all(
+                yy >= padding,
+                yy - padding < in_height,
+                xx >= padding,
+                xx - padding < in_width,
+            ),
+            Input[nn, yy - padding, xx - padding, cc],
+            tvm.tir.const(0, Input.dtype),
+        ),
+        name="Apad",
+    )
+
+def schedule_qconv2d_nhwc(outs, target, device):
+    s = te.create_schedule([x.op for x in outs])
+    x = outs[0]
+    nn, yy, xx, cc = s[x].op.axis
+    px1, px2 = s[x].split(nn, nparts=1)
+    return s
+
+def qconv2d_nhwc(Input, in_offset, Filter, filt_offset, stride, padding, out_dtype=None):
+    if out_dtype is None:
+        out_dtype = Input.dtype
+
+    batch, in_height, in_width, in_channel = Input.shape
+    filt_height, filt_width, _, num_filter = Filter.shape
+    # Input is already padded. No need to add padding while computing
+    # out_height and out_width.
+    out_height = (in_height - filt_height) // stride + 1
+    out_width = (in_width - filt_width) // stride + 1
+    out_channel = num_filter
+
+    rc = te.reduce_axis((0, in_channel), name="rc")
+    ry = te.reduce_axis((0, filt_height), name="ry")
+    rx = te.reduce_axis((0, filt_width), name="rx")
+
+    return te.compute(
+        (batch, out_height, out_width, out_channel),
+        lambda nn, yy, xx, ff: te.sum(
+            (Input[nn, yy * stride + ry, xx * stride + rx, rc] - in_offset)
+            * (Filter[ry, rx, rc, ff] - filt_offset).astype(out_dtype),
+            axis=[rc, ry, rx],
+        ),
+        tag="qconv2d_nhwc",
+    )
+
+
+def run_conv_te(hexagon_session, a, w, a_offset, w_offset, padding):
+
+    # Input tensor size
+    A = te.placeholder(a.shape, name="A", dtype="uint8")
+    W = te.placeholder(w.shape, name="W", dtype="uint8")
+
+    # Pad input and create computation for quantized conv2d
+    Apad = Pad(A, padding)
+    B = qconv2d_nhwc(Apad, a_offset, W, w_offset, 1, padding)
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    device = hexagon_session.device
+    s = schedule_qconv2d_nhwc([B], target_hexagon, device)
+    nn, yy, xx, cc = s[B].op.axis
+    yo, yi = s[B].split(yy, nparts=1)
+    s[Apad].compute_at(s[B], yi)
+    s[B].vectorize(cc)
+
+    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), device=device)
+    func_te = tvm.build(s, [A, W, B], target=tvm.target.Target(target_hexagon, host=target_hexagon), name="quant_conv2d")
+
+    module_te = hexagon_session.load_module(func_te)
+
+    a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device)
+    w_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device)
+    b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device)
+
+    module_te(a_hexagon, w_hexagon, b_hexagon)
+    evaluator = module_te.time_evaluator(module_te.entry_name, hexagon_session.device, number=1, repeat=1)
+    mean_ms = evaluator(a_hexagon, w_hexagon, b_hexagon).mean * 1000
+
+    out = b_hexagon.numpy()
+
+    return out, mean_ms
+    
\ No newline at end of file
diff --git a/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py
new file mode 100644
index 000000000000..44154bf29767
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py
@@ -0,0 +1,138 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import math
+
+from tvm.script import tir as T
+
+def get_conv_uint8_hvx_intrin(input_shape, kernel_shape, a_offset, w_offset, mem_scope):
+        VRMPY_WIDTH = 128
+        
+        batches, input_size, _, in_c = input_shape
+        w_size, _, _, filters = kernel_shape
+        out_size = input_size
+        
+        input_padding = w_size // 2
+        
+        # For this usage of vrmpy it loads 4 bytes for vv from the kernel. In order
+        # for this implementation to not mix output data there will need to be kernel 
+        # padding to round to the nearest multiple of 4.
+        kernel_width_padding = 4 - w_size % 4 
+        padded_kernel_width = w_size + kernel_width_padding
+
+        # vrmpy buffer loads are always 128B and will go out of bounds for the 
+        # implementation written here if there is not sufficient padding. This
+        # means that for this implementation it must always be a multiple of 128
+        # and have the standard padding and the padding needed for the kernel 
+        # window (4)
+        if (input_size % VRMPY_WIDTH != 0):
+            input_width_padding = (VRMPY_WIDTH - (input_size) % VRMPY_WIDTH) + input_padding + kernel_width_padding
+        else:
+            input_width_padding = input_padding + kernel_width_padding
+
+        padded_input_height = input_size + 2 * input_padding
+        padded_input_width = input_size + input_padding + input_width_padding
+
+        # vrmpy output buffer loads will go out of bounds for this implementation 
+        # if there is not proper padding. 
+        padded_output_width = VRMPY_WIDTH * (padded_input_width // VRMPY_WIDTH) + 3
+        
+        # The number of vrmpy loads (128B) needed to complete a horizontal frame of the input.
+        w_steps = math.ceil(input_size / VRMPY_WIDTH)
+
+        # The number of vrmpy loads (4B) needed to complete a horizontal frame of the kernel. 
+        kw_steps = math.ceil(w_size / 4)
+
+        @T.prim_func
+        def conv2d_vrmpy(a: T.handle, w: T.handle, c: T.handle):
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            A_local = T.match_buffer(a, [T.cast(padded_input_height, dtype="int32") * T.cast(padded_input_width, dtype="int32")], dtype="uint8", offset_factor=1, scope=mem_scope)
+            W_local = T.match_buffer(w, [T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")], dtype="uint8", offset_factor=1, scope=mem_scope)
+            C_local = T.match_buffer(c, [T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")], dtype="int32", offset_factor=1, scope=mem_scope)
+            with T.block("root"):
+                T.reads(A_local[0: T.cast(padded_input_height, dtype="int32") * T.cast(padded_input_width, dtype="int32")], W_local[0: T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")])
+                T.writes(C_local[0: T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")])
+                for y, x_o, x_i, rx_o, ry in T.grid(input_size, w_steps, 4, kw_steps, w_size):
+                    C_local[T.ramp(y * T.cast(padded_output_width, dtype="int32") + x_o * 128 + x_i, 4, 32)] += T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"),
+                        T.uint32(2),
+                        T.reinterpret(A_local[T.ramp((y + ry) * T.cast(padded_input_width, dtype="int32") + x_o * 128 + 4 * rx_o + x_i, 1, 128)], dtype = "int32x32"),
+                        T.reinterpret(W_local[T.ramp(ry * T.cast(padded_kernel_width, dtype="int32") + rx_o * 4, 1, 4)], dtype = "int32"),
+                        dtype="int32x32"
+                    )
+
+        @T.prim_func
+        def conv2d_vrmpy_desc(a: T.handle, w: T.handle, c: T.handle):
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            A_local = T.match_buffer(a, [padded_input_height, padded_input_width], dtype="uint8", offset_factor=1, scope=mem_scope)
+            W_local = T.match_buffer(w, [w_size, padded_kernel_width], dtype="uint8", offset_factor=1, scope=mem_scope)
+            C_local = T.match_buffer(c, [out_size, padded_output_width], dtype="int32", offset_factor=1, scope=mem_scope)
+            with T.block("root"):
+                for y, x, ry, rx in T.grid(input_size, input_size, w_size, w_size):
+                    with T.block("C"):
+                        y, x, ry, rx = T.axis.remap("SSRR", [y, x, ry, rx])
+                        C_local[y, x] = C_local[y, x] + T.cast(A_local[y + ry, x + rx], "int32") * T.cast(W_local[ry, rx], "int32")
+
+
+        @T.prim_func
+        def operator(a: T.handle, w: T.handle, c: T.handle) -> None:
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            A = T.match_buffer(a, [batches, input_size, input_size, in_c], dtype="uint8", offset_factor=1)
+            W = T.match_buffer(w, [w_size, w_size, in_c, filters], dtype="uint8", offset_factor=1)
+            C = T.match_buffer(c, [batches, out_size, out_size, filters], dtype="int32", offset_factor=1)
+            A_local = T.alloc_buffer([batches, padded_input_height, padded_input_width, in_c], dtype="uint8", scope=mem_scope)
+            W_local = T.alloc_buffer([w_size, padded_kernel_width, in_c, filters], dtype="uint8", scope=mem_scope)
+            C_local = T.alloc_buffer([batches, filters, out_size, padded_output_width], dtype="int32", scope=mem_scope)
+            with T.block("root"):
+                for n, y, x, c in T.grid(batches, padded_input_height, padded_input_width, in_c):
+                    with T.block("A_local"):
+                        nn, yy, xx, cc = T.axis.remap("SSSS", [n, y, x, c])
+                        T.reads(A[nn, yy - T.cast(input_padding, dtype="int32"), xx - T.cast(input_padding, dtype="int32"), cc])
+                        T.writes(A_local[nn, yy, xx, cc])
+                        A_local[nn, yy, xx, cc] = T.if_then_else(
+                            T.cast(input_padding, dtype="int32") <= yy and yy < T.cast(padded_input_height, dtype="int32") - T.cast(input_padding, dtype="int32") and 
+                            T.cast(input_padding, dtype="int32") <= xx and xx < T.cast(padded_input_width, dtype="int32") - T.cast(input_width_padding, dtype="int32"), 
+                            A[nn, yy - T.cast(input_padding, dtype="int32"), xx - T.cast(input_padding, dtype="int32"), cc] - T.cast(a_offset, dtype="uint8"), 
+                            T.uint8(0), 
+                            dtype="uint8"
+                        )
+                for y, x, c, f in T.grid(w_size, padded_kernel_width, in_c, filters):
+                    with T.block("W_local"):
+                        yy, xx, cc, ff = T.axis.remap("SSSS", [y, x, c, f])
+                        T.reads(W[yy, xx, cc, ff])
+                        T.writes(W_local[yy, xx, cc, ff])
+                        W_local[yy, xx, cc, ff] = T.if_then_else(
+                            xx < T.cast(padded_kernel_width, dtype="int32") - T.cast(kernel_width_padding, dtype="int32"), 
+                            W[yy, xx, cc, ff] - T.cast(w_offset, dtype="uint8"), 
+                            T.uint8(0), 
+                            dtype="uint8"
+                        )
+                for n, f, y, x in T.grid(batches, filters, out_size, padded_output_width):
+                    with T.block("C_local_init"):
+                        n, f, y, x = T.axis.remap("SSSS", [n, f, y, x])
+                        C_local[n, f, y, x] = 0
+                for n, f, y, x, ry, rx, rc in T.grid(batches, filters, input_size, input_size, w_size, w_size, in_c):
+                    with T.block("C"):
+                        n, f, y, x, ry, rx, rc = T.axis.remap("SSSSRRR", [n, f, y, x, ry, rx, rc])
+                        C_local[n, f, y, x] = C_local[n, f, y, x] + T.cast(A_local[n, y + ry, x + rx, rc], "int32") * T.cast(W_local[ry, rx, rc, f], "int32")
+                for n, f, y, x in T.grid(batches, filters, out_size, out_size):
+                    with T.block("C_local"):
+                        n, f, y, x = T.axis.remap("SSSS", [n, f, y, x])
+                        C[n, y, x, f] = C_local[n, f, y, x]
+
+        return conv2d_vrmpy_desc, conv2d_vrmpy, operator
+        
\ No newline at end of file
diff --git a/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py
new file mode 100644
index 000000000000..5d8c5ed328a8
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py
@@ -0,0 +1,189 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from tvm.script import tir as T
+
+w_spit__b = "llvm.hexagon.S2.vsplatrb"  # Q6_R_vsplatb_R
+v_spit__w = "llvm.hexagon.V6.lvsplatw.128B"
+v_rmpy__uv_uw_acc = "llvm.hexagon.V6.vrmpyub.acc.128B"
+v_rmpy__uv_uw = "llvm.hexagon.V6.vrmpyub.128B"
+v_sub = "llvm.hexagon.V6.vsubw.128B"
+
+def get_mm_uint8_intrin(in_m, in_n, in_k):
+    blocks = in_k // 32
+    unrolled_rows = in_m // 16
+
+    @T.prim_func
+    def mm_uint8_intrinsic(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle):
+        A = T.match_buffer(a, [T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], dtype="uint8")
+        B = T.match_buffer(b, [T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="uint8")
+        C = T.match_buffer(c, [T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="int32")
+        OFFSETS = T.match_buffer(offsets, [2], dtype="uint8")
+        with T.block("root"): 
+            T.reads(A[0: T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], B[0: T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], OFFSETS[0:2])
+            T.writes(C[0: T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")])
+            for i in T.serial(in_n):
+                for s in T.serial(blocks):
+                    C[T.ramp(((s * 32) + (i * T.cast(in_k, dtype="int32"))), 1, 32)] = T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), T.uint32(1), (( T.cast(OFFSETS[0], dtype="int32") * T.cast(OFFSETS[1], dtype="int32")) * T.cast(in_m, dtype="int32")), dtype="int32x32")
+                for blok, ro  in T.grid(blocks, unrolled_rows):
+                    b_offset =  T.cast(OFFSETS[1], dtype="int32")
+                    a_offset =  T.cast(OFFSETS[0], dtype="int32")
+                    out_index = blok * 32 + i * T.cast(in_k, dtype="int32")
+                    
+                    B_index_unrolled = blok * 128 + (ro * 16 * T.cast(in_k, dtype="int32"))
+                    B_index_unrolled_2 = blok * 128 + (ro * 16 + 4) * T.cast(in_k, dtype="int32")
+                    B_index_unrolled_3 = blok * 128 + (ro * 16 + 8) * T.cast(in_k, dtype="int32")
+                    B_index_unrolled_4 = blok * 128 + (ro * 16 + 12) * T.cast(in_k, dtype="int32")
+                    
+                    A_index_unrolled = ro * 16 + i * T.cast(in_m, dtype="int32")
+                    A_index_unrolled_2 = A_index_unrolled + 4
+                    A_index_unrolled_3 = A_index_unrolled + 8
+                    A_index_unrolled_4 = A_index_unrolled + 12
+                    
+                    a_b_vrmpy_accumulation_unrolled = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), # instruction
+                        T.uint32(3), # number of inputs
+                        C[T.ramp(out_index, 1, 32)], # accumulation location
+                        T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype = "int32x32"), # 32 4 byte inputs (Vu) to vrmpy 
+                        T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype = "int32"), # 4 byte input (Rt) to vrmpy
+                        dtype = "int32x32" # output datatype
+                    )
+
+                    a_b_vrmpy_accumulation_unrolled_1 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_vrmpy_accumulation_unrolled,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype = "int32x32"),
+                        T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_vrmpy_accumulation_unrolled_2 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_vrmpy_accumulation_unrolled_1,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype = "int32x32"),
+                        T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    la_b_vrmpy_accumulation_unrolled_3 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_vrmpy_accumulation_unrolled_2,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype = "int32x32"),
+                        T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_b = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"),
+                        T.uint32(2),
+                        T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), T.uint32(1), a_offset, dtype = "int32"), 
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_a = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_b,
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype = "int32"),dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_b1 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_a,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_a1 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_b1,
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype = "int32"),dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_b2 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_a1,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_a2 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_b2,
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype = "int32"),dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_b3 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_a2,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_a3 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), 
+                        T.uint32(3), 
+                        a_b_offsets_vrmpy_accumulation_unrolled_b3,
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), T.uint32(1), T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype = "int32"), dtype = "int32x32"),
+                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), T.uint32(1), b_offset, dtype = "int32"),
+                        dtype = "int32x32"
+                    )
+
+                    C[T.ramp(out_index, 1, 32)] = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vsubw.128B"), 
+                        T.uint32(2), 
+                        la_b_vrmpy_accumulation_unrolled_3,
+                        a_b_offsets_vrmpy_accumulation_unrolled_a3,
+                        dtype = "int32x32"
+                    )
+
+
+    @T.prim_func
+    def mmul_desc(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, [in_n, in_m], dtype="uint8")
+        B = T.match_buffer(b, [in_m, in_k], dtype="uint8")
+        C = T.match_buffer(c, [in_n, in_k], dtype="int32")
+        OFFSETS = T.match_buffer(offsets, [2], dtype="uint8")
+        # body
+        with T.block("root"): 
+            for i0, i1, i2 in T.grid(in_m, in_n, in_k):
+                with T.block("C"):
+                    y, x, j = T.axis.remap("SSR", [i0, i1, i2])
+                    C[y, x] = C[y, x] + T.cast(A[y, j] - OFFSETS[0], "int32") * T.cast(B[j, x] - OFFSETS[1], "int32")
+
+    return mmul_desc, mm_uint8_intrinsic
+
+
diff --git a/tests/python/contrib/test_hexagon/quantization_utils.py b/tests/python/contrib/test_hexagon/quantization_utils.py
new file mode 100644
index 000000000000..3ebcfe3a805e
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/quantization_utils.py
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+def quantize_uint8(val, minval, maxval):
+    range = max(0.0001, maxval - minval)
+    resize_amt = 255.0 / range
+    value_f = (val - minval) * resize_amt
+    value_i = round(value_f, 8)
+    if value_i < 0:
+        return 0
+    elif value_i > 255:
+        return 255
+    else:
+        return int(value_i)
+
+
+def dequantize(val, minval, maxval):
+    range = max(0.0001, maxval - minval)
+    stepsize = range / 4294967296
+    return val * stepsize
+
+
+def quantize_array(in_f, size):
+    in_q = []
+    # 0 must lie in interval [min,max] for quantization to work correctly.
+    in_min = min(0, min(in_f))
+    in_max = max(0, max(in_f))
+    for i in range(size):
+        in_q.append(quantize_uint8(in_f[i], in_min, in_max))
+    return in_q, in_min, in_max
\ No newline at end of file
diff --git a/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py
new file mode 100644
index 000000000000..0ab7cbd734a0
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py
@@ -0,0 +1,142 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import tvm.testing
+import tvm.topi.testing
+
+from numpy.random import default_rng
+from tvm.tir.function import TensorIntrin
+
+from tests.python.contrib.test_hexagon.conv_uint8 import get_const_tuple, run_conv_te
+from tests.python.contrib.test_hexagon.conv_uint8_hvx_intrin import get_conv_uint8_hvx_intrin
+from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8
+
+class TestConvHVX:
+
+    def create_inputs(input_shape, filter_shape, mem_scope):
+
+        w_size, _, _, _ = filter_shape
+        input_padding = w_size // 2
+
+        rng = default_rng()
+        a = rng.integers(1, 255, input_shape, dtype="uint8")
+        w = rng.integers(1, 8, filter_shape, dtype="uint8")
+
+        a_q, a_min, a_max = quantize_array(a.reshape(a.size), a.size)
+        w_q, b_min, b_max = quantize_array(w.reshape(w.size), w.size)
+
+        a_q = np.array(a_q, dtype="uint8").reshape(input_shape)
+        w_q = np.array(w_q, dtype="uint8").reshape(filter_shape)
+
+        a_offset = quantize_uint8(0.0, a_min, a_max)
+        w_offset = quantize_uint8(0.0, b_min, b_max)
+
+        a_f = np.array(a_q, dtype="uint8").reshape(get_const_tuple(a.shape))
+        w_f = np.array(w_q, dtype="uint8").reshape(get_const_tuple(w.shape))
+        expected_output = tvm.topi.testing.conv2d_nhwc_python(a_f, w_f, 1, input_padding).astype("int32")
+
+        return a_q, w_q, a_offset, w_offset, expected_output, mem_scope
+
+
+    a, w, a_offset, w_offset, expected_output, mem_scope = tvm.testing.parameters(
+        (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "local")),
+        (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global")),
+        (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global.vtcm")),
+        (create_inputs((1, 128, 128, 3), (7, 7, 3, 1), "local")),
+        (create_inputs((1, 128, 128, 3), (5, 5, 3, 1), "local")),
+        (create_inputs((1, 128, 128, 3), (3, 3, 3, 1), "local")),
+        (create_inputs((4, 128, 128, 1), (3, 3, 1, 4), "local")),
+        (create_inputs((2, 32, 32, 32), (7, 7, 32, 2), "local")),
+        (create_inputs((2, 34, 34, 29), (5, 5, 29, 2), "local")),
+        (create_inputs((1, 512, 512, 1), (9, 9, 1, 1), "local")),
+    )
+
+    @tvm.testing.requires_hexagon
+    def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope):
+
+        #TODO even sized kernels and stride are currently not working. 
+
+        batches, input_size, _, in_c = a.shape
+        w_size, _, _, filters = w.shape
+
+        out_height = (input_size - w_size + 2 * (w_size // 2)) + 1
+        out_width = (input_size - w_size + 2 * (w_size // 2)) + 1
+        out_shape = (batches, out_height, out_width, filters)
+        c = np.zeros(out_shape, dtype="int32")
+
+        conv2d_vrmpy_description, conv2d_vrmpy_intrinsic, conv2d_operator = get_conv_uint8_hvx_intrin(a.shape, w.shape, a_offset, w_offset, mem_scope)
+
+        intrin_name = "conv2d.uint8_{}x{}x{}x{}_{}".format(input_size, input_size, w_size, w_size, mem_scope)
+        try:
+            TensorIntrin.register(intrin_name, conv2d_vrmpy_description, conv2d_vrmpy_intrinsic)
+        except:
+            print("Intrinsic already registered.")
+
+        ir_module = conv2d_operator
+        sch = tvm.tir.Schedule(ir_module, debug_mask="all")
+        
+        block = sch.get_block("C")
+
+        w_block_local = sch.get_block("W_local")
+        sch.transform_layout(w_block_local, buffer=("write", 0), index_map=lambda h, w, c, f: (f, c, h, w))
+
+        a_block_local = sch.get_block("A_local")
+        sch.transform_layout(a_block_local, buffer=("write", 0), index_map=lambda b, h, w, c: (b, c, h, w))
+
+        n, f, y, x, ry, rx, rc = sch.get_loops(block)
+        sch.reorder(n, f, rc, y, x, ry, rx)
+
+        sch.tensorize(y, intrin_name)
+
+        target_hexagon = tvm.target.hexagon("v68", link_params=True)
+        
+        A = tvm.tir.decl_buffer(a.shape, name="A", dtype="uint8")
+        W = tvm.tir.decl_buffer(w.shape, name="W", dtype="uint8")
+        C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32")
+
+        func_tir = tvm.build(
+            sch.mod, [A, W, C], tvm.target.Target(target_hexagon, host=target_hexagon), name="hvx_op"
+        )
+        
+        module = hexagon_session.load_module(func_tir)
+
+        a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device)
+        w_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device)
+        c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device)
+
+        module(a_hexagon, w_hexagon, c_hexagon)
+        out = c_hexagon.numpy()
+        out = out[:,:,:out_width,:]
+    
+        tvm.testing.assert_allclose(out, expected_output)
+        
+        timer = module.time_evaluator(module.entry_name, hexagon_session.device, number=1, repeat=1)
+        time_ms = timer(a_hexagon, w_hexagon, c_hexagon).mean * 1000
+        print("Input Shape: {} Kernel Shape: {} Mem_scope: {}. HVX: {} ms.".format(a.shape, w.shape, mem_scope, time_ms))
+
+    @tvm.testing.requires_hexagon
+    def test_te_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope):
+        batches, input_size, _, in_c = a.shape
+        w_size, _, _, filters = w.shape
+        baseline_output, baseline_time = run_conv_te(hexagon_session, a, w, a_offset, w_offset, w_size // 2)
+        tvm.testing.assert_allclose(baseline_output, expected_output)
+        print("Input Shape: {} Kernel Shape: {}. TE Baseline: {} ms".format(a.shape, w.shape, baseline_time))
+    
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py
new file mode 100644
index 000000000000..161ff305c5c1
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py
@@ -0,0 +1,195 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import tvm
+import tvm.testing
+
+from numpy.random import default_rng
+from tvm.script import tir as T
+from tvm.tir.function import TensorIntrin
+
+from tests.python.contrib.test_hexagon.mmul_unit8_hvx_intrin import get_mm_uint8_intrin
+from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8
+
+UNROLL_FACTOR = 4  # This must match the hard-coded unrolling in mm_uint8_intrinsic().
+
+def can_tensorize(n, m, k):
+    return m % (4 * UNROLL_FACTOR) == k % 32 == 0
+
+def blockify_matrix(B):
+    """
+    inputs
+    ------
+    B        : numpy 2D array (of M x K ) to be blockified
+
+    outputs
+    -------
+    BB       : blockified B as array of dimensions (M/4) x K x 4
+
+    blockification is in preparation for HVX ops on 128B 'vectors'
+    assuming input is of type int8 or uint8, and matrix multiplication uses vrmpy to accumulate to int32
+    - once for each block - then K/32 such vectors will be required to carry out the operation.
+
+    if B is height x width = M x K, then output is array of K blocks in x direction,
+    i.e. as [block1, block2, ... blockK ] where each block has dimensions M/4 x 4.
+    Specifically:
+
+     B = [ B[1,1]     B[1,2]      ...   B[1,K] ]
+         [ B[2,1]     B[2,2]      ...   B[2,K] ]
+                         .
+                         .
+                         .
+         [ B[M,1]     B[M,2]      ...   B[M,K] ]
+
+
+     BB =                                 |                                   |     |
+                                          |                                   |     |
+     [  B[1,1]   B[2,1]   B[3,1]   B[4,1] | B[1,2]   B[2,2]   B[3,2]   B[4,2] | ... | B[1,K]   B[2,K]   B[3,K]   B[4,K] ]
+     [  B[5,1]   B[6,1]   B[7,1]   B[8,1] | B[5,2]   B[6,2]   B[7,2]   B[8,2] | ... | B[5,K]   B[6,K]   B[7,K]   B[8,K] ]
+                        .                 |                 .                 |  .  |                 .
+                        .                 |                 .                 |  .  |                 .
+                        .                 |                 .                 |  .  |                 .
+     [  B[M-3,1] B[M-2,1] B[M-1,1] B[M,1] | B[M-3,2] B[M-2,2] B[M-1,2] B[M,2] | ... | B[M-3,K] B[M-2,K] B[M-1,K] B[M,K] ]
+                                          |                                   |     |
+                                          |                                   |     |
+                                          |                                   |     |
+                                          |                                   |     |
+                      ^                   |                 ^                 |     |                 ^
+                      |                   |                 |                 |     |                 |
+                      |                   |                 |                 |     |                 |
+                      |                   |                 |                 |     |                 |
+                    block 1                               block 2             |     |               block K
+
+    """
+    BT, M, K = B.shape
+    assert M % 4 == 0
+    out_height = M // 4
+    out_shape = tuple((BT, out_height, K, 4))
+    BB = np.zeros(out_shape).astype(B.dtype)  # block form of B
+    for bt in range(BT):
+        for j in range(0, out_height):
+            for k in range(0, K):
+                for b in range(0, 4):
+                    y = b + 4 * j
+                    BB[bt, j, k, b] = B[bt, y, k]
+    return BB
+
+def setup_test(b, m, n, k):
+    a_shape = (b, n, m)
+    b_shape = (b, m, k)
+
+    rng = default_rng()
+    a = rng.integers(1, 16, a_shape, dtype="uint8")
+    b = rng.integers(1, 16, b_shape, dtype="uint8")
+
+    a_q, a_min, a_max = quantize_array(a.reshape(a.size), a.size)
+    b_q, b_min, b_max = quantize_array(b.reshape(b.size), b.size)
+    a_q = np.array(a_q, dtype="uint8").reshape(a_shape)
+    b_q = np.array(b_q, dtype="uint8").reshape(b_shape)
+    a_offset = quantize_uint8(0.0, a_min, a_max)
+    b_offset = quantize_uint8(0.0, b_min, b_max)
+    if can_tensorize(n, m, k):
+        bb = blockify_matrix(b_q)  # blockification is only used by the tensorized version
+        bb = bb.reshape(-1)  # go via 1D rep in case there are stride / offset issues
+        bb = bb.reshape(b_shape)
+    else:
+        bb = []
+
+    a_f = np.array(a_q, dtype="int32").reshape(a_shape)
+    b_f = np.array(b_q, dtype="int32").reshape(b_shape)
+    expected_output = np.matmul(a_f, b_f)
+
+    intrin_name = "mm.uint8_{}x{}x{}".format(m, n, k)
+    try:
+        TensorIntrin.register(intrin_name, *get_mm_uint8_intrin(m, n, k))
+    except:
+        print("Intrinsic already registered.")
+
+    return a_q, b_q, bb, a_offset, b_offset, intrin_name, expected_output
+
+class TestMatMulVec:
+
+    batches, m, n, k = tvm.testing.parameters(
+        (1, 128, 768, 768),
+        (1, 128, 768, 3072),
+        (1, 128, 3072, 768),
+        (1, 128, 128, 64),
+        (1, 128, 64, 128),
+    )
+
+    @tvm.testing.requires_hexagon
+    def test_matmul_intrinsics(self, hexagon_session, batches, m, n, k):
+        
+        out_shape = (batches, n, k) 
+
+        a_q, b_q, bb, a_offset, b_offset, intrin_name, out_ref = setup_test(batches, m, n, k)
+
+        @T.prim_func
+        def operator(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None:
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            A = T.match_buffer(a, [batches, n, m], dtype="uint8")
+            B = T.match_buffer(b, [batches, m, k], dtype="uint8")
+            C = T.match_buffer(c, [batches, n, k], dtype="int32")
+            OFFSETS = T.match_buffer(offsets, [2], dtype="uint8")
+            # body
+            with T.block("root"):
+                for i0, i1, i2, i3 in T.grid(batches, m, n, k):
+                    with T.block("C"):
+                        batch, y, x, j = T.axis.remap("SSSR", [i0, i1, i2, i3])
+                        C[batch, y, x] = C[batch, y, x] + T.cast(A[batch, y, j] - OFFSETS[0], "int32") * T.cast(B[batch, j, x] - OFFSETS[1], "int32")
+
+        ir_module = operator
+        sch = tvm.tir.Schedule(ir_module, debug_mask="all")
+        
+        block = sch.get_block("C")
+        _, y, _, _ = sch.get_loops(block)
+        sch.tensorize(y, intrin_name)
+        
+        A = tvm.tir.decl_buffer(a_q.shape, name="A", dtype="uint8")
+        B = tvm.tir.decl_buffer(b_q.shape, name="B", dtype="uint8")
+        C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32")
+        OFFSETS = tvm.tir.decl_buffer((2), name="OFFSETS", dtype="uint8")
+
+        target_hexagon = tvm.target.hexagon("v68", link_params=True)
+        func_tir = tvm.build(sch.mod, [A, B, C, OFFSETS], tvm.target.Target(target_hexagon, host=target_hexagon), name="qmmul_vrmpy")
+        module = hexagon_session.load_module(func_tir)
+
+        c = np.zeros(out_shape, dtype="int32")
+        offsets = np.array([a_offset, b_offset], dtype="uint8")
+        
+        a_hexagon = tvm.runtime.ndarray.array(a_q, device=hexagon_session.device)
+        b_hexagon = tvm.runtime.ndarray.array(bb, device=hexagon_session.device)
+        c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device)
+        offsets_hexagon = tvm.runtime.ndarray.array(offsets, device=hexagon_session.device)
+        
+        module(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon)
+        evaluator = module.time_evaluator(module.entry_name, hexagon_session.device, number=1)
+        time_ms = evaluator(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon).mean * 1e3
+        print("Input Shape: {}. Conv time elapsed: {} ms".format((batches, m, n, k), time_ms))
+
+        out = c_hexagon.numpy()
+        out_a = out.reshape(batches * n * k)
+        out_req, _, _ = quantize_array(out_a, batches * n * k)
+        out_req = np.array(out_req).reshape(batches, n, k)
+
+        out_ref_a = out_ref.reshape(batches * n * k)
+        out_ref_q, _, _ = quantize_array(out_ref_a, batches * n * k)
+        out_ref_q = np.array(out_ref_q).reshape(batches, n, k)
+
+        tvm.testing.assert_allclose(out_req, out_ref_q, atol=2.0, rtol=0.0)
+        
\ No newline at end of file

From 81b2e9bb213e6c96c41d314920efdac7025eb714 Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@octoml.ai>
Date: Wed, 3 Aug 2022 09:55:44 -0700
Subject: [PATCH 3/3] Run and apply lint changes.

---
 .../python/contrib/test_hexagon/conv_uint8.py |  57 +--
 .../test_hexagon/conv_uint8_hvx_intrin.py     | 328 ++++++++++++------
 .../test_hexagon/mmul_unit8_hvx_intrin.py     | 210 +++++++----
 .../test_hexagon/quantization_utils.py        |   3 +-
 .../test_hexagon/test_conv_hvx_intrinsics.py  |  66 ++--
 .../test_hexagon/test_mmul_hvx_intrinsics.py  |  28 +-
 6 files changed, 463 insertions(+), 229 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/conv_uint8.py b/tests/python/contrib/test_hexagon/conv_uint8.py
index 264e6f77bbdf..9f4b00ec2c9f 100644
--- a/tests/python/contrib/test_hexagon/conv_uint8.py
+++ b/tests/python/contrib/test_hexagon/conv_uint8.py
@@ -21,27 +21,29 @@
 from numbers import Integral
 from tvm import te
 
+
 def get_const_int(expr):
-            """Verifies expr is integer and get the constant value.
-
-            Parameters
-            ----------
-            expr : tvm.Expr or int
-                The input expression.
-
-            Returns
-            -------
-            out_value : int
-                The output.
-            """
-            if isinstance(expr, Integral):
-                return expr
-            if not isinstance(expr, tvm.tir.IntImm):
-                ana = tvm.arith.Analyzer()
-                expr = ana.simplify(expr)
-            if not isinstance(expr, tvm.tir.IntImm):
-                raise ValueError("Expect value to be constant int")
-            return int(expr.value)
+    """Verifies expr is integer and get the constant value.
+
+    Parameters
+    ----------
+    expr : tvm.Expr or int
+        The input expression.
+
+    Returns
+    -------
+    out_value : int
+        The output.
+    """
+    if isinstance(expr, Integral):
+        return expr
+    if not isinstance(expr, tvm.tir.IntImm):
+        ana = tvm.arith.Analyzer()
+        expr = ana.simplify(expr)
+    if not isinstance(expr, tvm.tir.IntImm):
+        raise ValueError("Expect value to be constant int")
+    return int(expr.value)
+
 
 def get_const_tuple(in_tuple):
     """Verifies input tuple is IntImm or Var, returns tuple of int or Var.
@@ -72,6 +74,7 @@ def get_const_tuple(in_tuple):
             ret.append(get_const_int(elem))
     return tuple(ret)
 
+
 def Pad(Input, padding):
     batch, in_height, in_width, in_channel = Input.shape
     return te.compute(
@@ -89,6 +92,7 @@ def Pad(Input, padding):
         name="Apad",
     )
 
+
 def schedule_qconv2d_nhwc(outs, target, device):
     s = te.create_schedule([x.op for x in outs])
     x = outs[0]
@@ -96,6 +100,7 @@ def schedule_qconv2d_nhwc(outs, target, device):
     px1, px2 = s[x].split(nn, nparts=1)
     return s
 
+
 def qconv2d_nhwc(Input, in_offset, Filter, filt_offset, stride, padding, out_dtype=None):
     if out_dtype is None:
         out_dtype = Input.dtype
@@ -141,7 +146,12 @@ def run_conv_te(hexagon_session, a, w, a_offset, w_offset, padding):
     s[B].vectorize(cc)
 
     b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), device=device)
-    func_te = tvm.build(s, [A, W, B], target=tvm.target.Target(target_hexagon, host=target_hexagon), name="quant_conv2d")
+    func_te = tvm.build(
+        s,
+        [A, W, B],
+        target=tvm.target.Target(target_hexagon, host=target_hexagon),
+        name="quant_conv2d",
+    )
 
     module_te = hexagon_session.load_module(func_te)
 
@@ -150,10 +160,11 @@ def run_conv_te(hexagon_session, a, w, a_offset, w_offset, padding):
     b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device)
 
     module_te(a_hexagon, w_hexagon, b_hexagon)
-    evaluator = module_te.time_evaluator(module_te.entry_name, hexagon_session.device, number=1, repeat=1)
+    evaluator = module_te.time_evaluator(
+        module_te.entry_name, hexagon_session.device, number=1, repeat=1
+    )
     mean_ms = evaluator(a_hexagon, w_hexagon, b_hexagon).mean * 1000
 
     out = b_hexagon.numpy()
 
     return out, mean_ms
-    
\ No newline at end of file
diff --git a/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py
index 44154bf29767..a0b1d7de798f 100644
--- a/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py
+++ b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py
@@ -19,120 +19,220 @@
 
 from tvm.script import tir as T
 
+
 def get_conv_uint8_hvx_intrin(input_shape, kernel_shape, a_offset, w_offset, mem_scope):
-        VRMPY_WIDTH = 128
-        
-        batches, input_size, _, in_c = input_shape
-        w_size, _, _, filters = kernel_shape
-        out_size = input_size
-        
-        input_padding = w_size // 2
-        
-        # For this usage of vrmpy it loads 4 bytes for vv from the kernel. In order
-        # for this implementation to not mix output data there will need to be kernel 
-        # padding to round to the nearest multiple of 4.
-        kernel_width_padding = 4 - w_size % 4 
-        padded_kernel_width = w_size + kernel_width_padding
-
-        # vrmpy buffer loads are always 128B and will go out of bounds for the 
-        # implementation written here if there is not sufficient padding. This
-        # means that for this implementation it must always be a multiple of 128
-        # and have the standard padding and the padding needed for the kernel 
-        # window (4)
-        if (input_size % VRMPY_WIDTH != 0):
-            input_width_padding = (VRMPY_WIDTH - (input_size) % VRMPY_WIDTH) + input_padding + kernel_width_padding
-        else:
-            input_width_padding = input_padding + kernel_width_padding
-
-        padded_input_height = input_size + 2 * input_padding
-        padded_input_width = input_size + input_padding + input_width_padding
-
-        # vrmpy output buffer loads will go out of bounds for this implementation 
-        # if there is not proper padding. 
-        padded_output_width = VRMPY_WIDTH * (padded_input_width // VRMPY_WIDTH) + 3
-        
-        # The number of vrmpy loads (128B) needed to complete a horizontal frame of the input.
-        w_steps = math.ceil(input_size / VRMPY_WIDTH)
-
-        # The number of vrmpy loads (4B) needed to complete a horizontal frame of the kernel. 
-        kw_steps = math.ceil(w_size / 4)
-
-        @T.prim_func
-        def conv2d_vrmpy(a: T.handle, w: T.handle, c: T.handle):
-            T.func_attr({"global_symbol": "main", "tir.noalias": True})
-            A_local = T.match_buffer(a, [T.cast(padded_input_height, dtype="int32") * T.cast(padded_input_width, dtype="int32")], dtype="uint8", offset_factor=1, scope=mem_scope)
-            W_local = T.match_buffer(w, [T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")], dtype="uint8", offset_factor=1, scope=mem_scope)
-            C_local = T.match_buffer(c, [T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")], dtype="int32", offset_factor=1, scope=mem_scope)
-            with T.block("root"):
-                T.reads(A_local[0: T.cast(padded_input_height, dtype="int32") * T.cast(padded_input_width, dtype="int32")], W_local[0: T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")])
-                T.writes(C_local[0: T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")])
-                for y, x_o, x_i, rx_o, ry in T.grid(input_size, w_steps, 4, kw_steps, w_size):
-                    C_local[T.ramp(y * T.cast(padded_output_width, dtype="int32") + x_o * 128 + x_i, 4, 32)] += T.call_llvm_intrin(
-                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"),
-                        T.uint32(2),
-                        T.reinterpret(A_local[T.ramp((y + ry) * T.cast(padded_input_width, dtype="int32") + x_o * 128 + 4 * rx_o + x_i, 1, 128)], dtype = "int32x32"),
-                        T.reinterpret(W_local[T.ramp(ry * T.cast(padded_kernel_width, dtype="int32") + rx_o * 4, 1, 4)], dtype = "int32"),
-                        dtype="int32x32"
+    VRMPY_WIDTH = 128
+
+    batches, input_size, _, in_c = input_shape
+    w_size, _, _, filters = kernel_shape
+    out_size = input_size
+
+    input_padding = w_size // 2
+
+    # For this usage of vrmpy it loads 4 bytes for vv from the kernel. In order
+    # for this implementation to not mix output data there will need to be kernel
+    # padding to round to the nearest multiple of 4.
+    kernel_width_padding = 4 - w_size % 4
+    padded_kernel_width = w_size + kernel_width_padding
+
+    # vrmpy buffer loads are always 128B and will go out of bounds for the
+    # implementation written here if there is not sufficient padding. This
+    # means that for this implementation it must always be a multiple of 128
+    # and have the standard padding and the padding needed for the kernel
+    # window (4)
+    if input_size % VRMPY_WIDTH != 0:
+        input_width_padding = (
+            (VRMPY_WIDTH - (input_size) % VRMPY_WIDTH) + input_padding + kernel_width_padding
+        )
+    else:
+        input_width_padding = input_padding + kernel_width_padding
+
+    padded_input_height = input_size + 2 * input_padding
+    padded_input_width = input_size + input_padding + input_width_padding
+
+    # vrmpy output buffer loads will go out of bounds for this implementation
+    # if there is not proper padding.
+    padded_output_width = VRMPY_WIDTH * (padded_input_width // VRMPY_WIDTH) + 3
+
+    # The number of vrmpy loads (128B) needed to complete a horizontal frame of the input.
+    w_steps = math.ceil(input_size / VRMPY_WIDTH)
+
+    # The number of vrmpy loads (4B) needed to complete a horizontal frame of the kernel.
+    kw_steps = math.ceil(w_size / 4)
+
+    @T.prim_func
+    def conv2d_vrmpy(a: T.handle, w: T.handle, c: T.handle):
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A_local = T.match_buffer(
+            a,
+            [
+                T.cast(padded_input_height, dtype="int32")
+                * T.cast(padded_input_width, dtype="int32")
+            ],
+            dtype="uint8",
+            offset_factor=1,
+            scope=mem_scope,
+        )
+        W_local = T.match_buffer(
+            w,
+            [T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")],
+            dtype="uint8",
+            offset_factor=1,
+            scope=mem_scope,
+        )
+        C_local = T.match_buffer(
+            c,
+            [T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")],
+            dtype="int32",
+            offset_factor=1,
+            scope=mem_scope,
+        )
+        with T.block("root"):
+            T.reads(
+                A_local[
+                    0 : T.cast(padded_input_height, dtype="int32")
+                    * T.cast(padded_input_width, dtype="int32")
+                ],
+                W_local[
+                    0 : T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")
+                ],
+            )
+            T.writes(
+                C_local[
+                    0 : T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")
+                ]
+            )
+            for y, x_o, x_i, rx_o, ry in T.grid(input_size, w_steps, 4, kw_steps, w_size):
+                C_local[
+                    T.ramp(y * T.cast(padded_output_width, dtype="int32") + x_o * 128 + x_i, 4, 32)
+                ] += T.call_llvm_intrin(
+                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"),
+                    T.uint32(2),
+                    T.reinterpret(
+                        A_local[
+                            T.ramp(
+                                (y + ry) * T.cast(padded_input_width, dtype="int32")
+                                + x_o * 128
+                                + 4 * rx_o
+                                + x_i,
+                                1,
+                                128,
+                            )
+                        ],
+                        dtype="int32x32",
+                    ),
+                    T.reinterpret(
+                        W_local[
+                            T.ramp(ry * T.cast(padded_kernel_width, dtype="int32") + rx_o * 4, 1, 4)
+                        ],
+                        dtype="int32",
+                    ),
+                    dtype="int32x32",
+                )
+
+    @T.prim_func
+    def conv2d_vrmpy_desc(a: T.handle, w: T.handle, c: T.handle):
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A_local = T.match_buffer(
+            a,
+            [padded_input_height, padded_input_width],
+            dtype="uint8",
+            offset_factor=1,
+            scope=mem_scope,
+        )
+        W_local = T.match_buffer(
+            w, [w_size, padded_kernel_width], dtype="uint8", offset_factor=1, scope=mem_scope
+        )
+        C_local = T.match_buffer(
+            c, [out_size, padded_output_width], dtype="int32", offset_factor=1, scope=mem_scope
+        )
+        with T.block("root"):
+            for y, x, ry, rx in T.grid(input_size, input_size, w_size, w_size):
+                with T.block("C"):
+                    y, x, ry, rx = T.axis.remap("SSRR", [y, x, ry, rx])
+                    C_local[y, x] = C_local[y, x] + T.cast(
+                        A_local[y + ry, x + rx], "int32"
+                    ) * T.cast(W_local[ry, rx], "int32")
+
+    @T.prim_func
+    def operator(a: T.handle, w: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(
+            a, [batches, input_size, input_size, in_c], dtype="uint8", offset_factor=1
+        )
+        W = T.match_buffer(w, [w_size, w_size, in_c, filters], dtype="uint8", offset_factor=1)
+        C = T.match_buffer(
+            c, [batches, out_size, out_size, filters], dtype="int32", offset_factor=1
+        )
+        A_local = T.alloc_buffer(
+            [batches, padded_input_height, padded_input_width, in_c], dtype="uint8", scope=mem_scope
+        )
+        W_local = T.alloc_buffer(
+            [w_size, padded_kernel_width, in_c, filters], dtype="uint8", scope=mem_scope
+        )
+        C_local = T.alloc_buffer(
+            [batches, filters, out_size, padded_output_width], dtype="int32", scope=mem_scope
+        )
+        with T.block("root"):
+            for n, y, x, c in T.grid(batches, padded_input_height, padded_input_width, in_c):
+                with T.block("A_local"):
+                    nn, yy, xx, cc = T.axis.remap("SSSS", [n, y, x, c])
+                    T.reads(
+                        A[
+                            nn,
+                            yy - T.cast(input_padding, dtype="int32"),
+                            xx - T.cast(input_padding, dtype="int32"),
+                            cc,
+                        ]
+                    )
+                    T.writes(A_local[nn, yy, xx, cc])
+                    A_local[nn, yy, xx, cc] = T.if_then_else(
+                        T.cast(input_padding, dtype="int32") <= yy
+                        and yy
+                        < T.cast(padded_input_height, dtype="int32")
+                        - T.cast(input_padding, dtype="int32")
+                        and T.cast(input_padding, dtype="int32") <= xx
+                        and xx
+                        < T.cast(padded_input_width, dtype="int32")
+                        - T.cast(input_width_padding, dtype="int32"),
+                        A[
+                            nn,
+                            yy - T.cast(input_padding, dtype="int32"),
+                            xx - T.cast(input_padding, dtype="int32"),
+                            cc,
+                        ]
+                        - T.cast(a_offset, dtype="uint8"),
+                        T.uint8(0),
+                        dtype="uint8",
+                    )
+            for y, x, c, f in T.grid(w_size, padded_kernel_width, in_c, filters):
+                with T.block("W_local"):
+                    yy, xx, cc, ff = T.axis.remap("SSSS", [y, x, c, f])
+                    T.reads(W[yy, xx, cc, ff])
+                    T.writes(W_local[yy, xx, cc, ff])
+                    W_local[yy, xx, cc, ff] = T.if_then_else(
+                        xx
+                        < T.cast(padded_kernel_width, dtype="int32")
+                        - T.cast(kernel_width_padding, dtype="int32"),
+                        W[yy, xx, cc, ff] - T.cast(w_offset, dtype="uint8"),
+                        T.uint8(0),
+                        dtype="uint8",
                     )
+            for n, f, y, x in T.grid(batches, filters, out_size, padded_output_width):
+                with T.block("C_local_init"):
+                    n, f, y, x = T.axis.remap("SSSS", [n, f, y, x])
+                    C_local[n, f, y, x] = 0
+            for n, f, y, x, ry, rx, rc in T.grid(
+                batches, filters, input_size, input_size, w_size, w_size, in_c
+            ):
+                with T.block("C"):
+                    n, f, y, x, ry, rx, rc = T.axis.remap("SSSSRRR", [n, f, y, x, ry, rx, rc])
+                    C_local[n, f, y, x] = C_local[n, f, y, x] + T.cast(
+                        A_local[n, y + ry, x + rx, rc], "int32"
+                    ) * T.cast(W_local[ry, rx, rc, f], "int32")
+            for n, f, y, x in T.grid(batches, filters, out_size, out_size):
+                with T.block("C_local"):
+                    n, f, y, x = T.axis.remap("SSSS", [n, f, y, x])
+                    C[n, y, x, f] = C_local[n, f, y, x]
 
-        @T.prim_func
-        def conv2d_vrmpy_desc(a: T.handle, w: T.handle, c: T.handle):
-            T.func_attr({"global_symbol": "main", "tir.noalias": True})
-            A_local = T.match_buffer(a, [padded_input_height, padded_input_width], dtype="uint8", offset_factor=1, scope=mem_scope)
-            W_local = T.match_buffer(w, [w_size, padded_kernel_width], dtype="uint8", offset_factor=1, scope=mem_scope)
-            C_local = T.match_buffer(c, [out_size, padded_output_width], dtype="int32", offset_factor=1, scope=mem_scope)
-            with T.block("root"):
-                for y, x, ry, rx in T.grid(input_size, input_size, w_size, w_size):
-                    with T.block("C"):
-                        y, x, ry, rx = T.axis.remap("SSRR", [y, x, ry, rx])
-                        C_local[y, x] = C_local[y, x] + T.cast(A_local[y + ry, x + rx], "int32") * T.cast(W_local[ry, rx], "int32")
-
-
-        @T.prim_func
-        def operator(a: T.handle, w: T.handle, c: T.handle) -> None:
-            T.func_attr({"global_symbol": "main", "tir.noalias": True})
-            A = T.match_buffer(a, [batches, input_size, input_size, in_c], dtype="uint8", offset_factor=1)
-            W = T.match_buffer(w, [w_size, w_size, in_c, filters], dtype="uint8", offset_factor=1)
-            C = T.match_buffer(c, [batches, out_size, out_size, filters], dtype="int32", offset_factor=1)
-            A_local = T.alloc_buffer([batches, padded_input_height, padded_input_width, in_c], dtype="uint8", scope=mem_scope)
-            W_local = T.alloc_buffer([w_size, padded_kernel_width, in_c, filters], dtype="uint8", scope=mem_scope)
-            C_local = T.alloc_buffer([batches, filters, out_size, padded_output_width], dtype="int32", scope=mem_scope)
-            with T.block("root"):
-                for n, y, x, c in T.grid(batches, padded_input_height, padded_input_width, in_c):
-                    with T.block("A_local"):
-                        nn, yy, xx, cc = T.axis.remap("SSSS", [n, y, x, c])
-                        T.reads(A[nn, yy - T.cast(input_padding, dtype="int32"), xx - T.cast(input_padding, dtype="int32"), cc])
-                        T.writes(A_local[nn, yy, xx, cc])
-                        A_local[nn, yy, xx, cc] = T.if_then_else(
-                            T.cast(input_padding, dtype="int32") <= yy and yy < T.cast(padded_input_height, dtype="int32") - T.cast(input_padding, dtype="int32") and 
-                            T.cast(input_padding, dtype="int32") <= xx and xx < T.cast(padded_input_width, dtype="int32") - T.cast(input_width_padding, dtype="int32"), 
-                            A[nn, yy - T.cast(input_padding, dtype="int32"), xx - T.cast(input_padding, dtype="int32"), cc] - T.cast(a_offset, dtype="uint8"), 
-                            T.uint8(0), 
-                            dtype="uint8"
-                        )
-                for y, x, c, f in T.grid(w_size, padded_kernel_width, in_c, filters):
-                    with T.block("W_local"):
-                        yy, xx, cc, ff = T.axis.remap("SSSS", [y, x, c, f])
-                        T.reads(W[yy, xx, cc, ff])
-                        T.writes(W_local[yy, xx, cc, ff])
-                        W_local[yy, xx, cc, ff] = T.if_then_else(
-                            xx < T.cast(padded_kernel_width, dtype="int32") - T.cast(kernel_width_padding, dtype="int32"), 
-                            W[yy, xx, cc, ff] - T.cast(w_offset, dtype="uint8"), 
-                            T.uint8(0), 
-                            dtype="uint8"
-                        )
-                for n, f, y, x in T.grid(batches, filters, out_size, padded_output_width):
-                    with T.block("C_local_init"):
-                        n, f, y, x = T.axis.remap("SSSS", [n, f, y, x])
-                        C_local[n, f, y, x] = 0
-                for n, f, y, x, ry, rx, rc in T.grid(batches, filters, input_size, input_size, w_size, w_size, in_c):
-                    with T.block("C"):
-                        n, f, y, x, ry, rx, rc = T.axis.remap("SSSSRRR", [n, f, y, x, ry, rx, rc])
-                        C_local[n, f, y, x] = C_local[n, f, y, x] + T.cast(A_local[n, y + ry, x + rx, rc], "int32") * T.cast(W_local[ry, rx, rc, f], "int32")
-                for n, f, y, x in T.grid(batches, filters, out_size, out_size):
-                    with T.block("C_local"):
-                        n, f, y, x = T.axis.remap("SSSS", [n, f, y, x])
-                        C[n, y, x, f] = C_local[n, f, y, x]
-
-        return conv2d_vrmpy_desc, conv2d_vrmpy, operator
-        
\ No newline at end of file
+    return conv2d_vrmpy_desc, conv2d_vrmpy, operator
diff --git a/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py
index 5d8c5ed328a8..46cb816eeb56 100644
--- a/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py
+++ b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py
@@ -23,153 +23,239 @@
 v_rmpy__uv_uw = "llvm.hexagon.V6.vrmpyub.128B"
 v_sub = "llvm.hexagon.V6.vsubw.128B"
 
+
 def get_mm_uint8_intrin(in_m, in_n, in_k):
     blocks = in_k // 32
     unrolled_rows = in_m // 16
 
     @T.prim_func
     def mm_uint8_intrinsic(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle):
-        A = T.match_buffer(a, [T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], dtype="uint8")
-        B = T.match_buffer(b, [T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="uint8")
-        C = T.match_buffer(c, [T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="int32")
+        A = T.match_buffer(
+            a, [T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], dtype="uint8"
+        )
+        B = T.match_buffer(
+            b, [T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="uint8"
+        )
+        C = T.match_buffer(
+            c, [T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="int32"
+        )
         OFFSETS = T.match_buffer(offsets, [2], dtype="uint8")
-        with T.block("root"): 
-            T.reads(A[0: T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], B[0: T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], OFFSETS[0:2])
-            T.writes(C[0: T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")])
+        with T.block("root"):
+            T.reads(
+                A[0 : T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")],
+                B[0 : T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")],
+                OFFSETS[0:2],
+            )
+            T.writes(C[0 : T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")])
             for i in T.serial(in_n):
                 for s in T.serial(blocks):
-                    C[T.ramp(((s * 32) + (i * T.cast(in_k, dtype="int32"))), 1, 32)] = T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), T.uint32(1), (( T.cast(OFFSETS[0], dtype="int32") * T.cast(OFFSETS[1], dtype="int32")) * T.cast(in_m, dtype="int32")), dtype="int32x32")
-                for blok, ro  in T.grid(blocks, unrolled_rows):
-                    b_offset =  T.cast(OFFSETS[1], dtype="int32")
-                    a_offset =  T.cast(OFFSETS[0], dtype="int32")
+                    C[
+                        T.ramp(((s * 32) + (i * T.cast(in_k, dtype="int32"))), 1, 32)
+                    ] = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),
+                        T.uint32(1),
+                        (
+                            (T.cast(OFFSETS[0], dtype="int32") * T.cast(OFFSETS[1], dtype="int32"))
+                            * T.cast(in_m, dtype="int32")
+                        ),
+                        dtype="int32x32",
+                    )
+                for blok, ro in T.grid(blocks, unrolled_rows):
+                    b_offset = T.cast(OFFSETS[1], dtype="int32")
+                    a_offset = T.cast(OFFSETS[0], dtype="int32")
                     out_index = blok * 32 + i * T.cast(in_k, dtype="int32")
-                    
+
                     B_index_unrolled = blok * 128 + (ro * 16 * T.cast(in_k, dtype="int32"))
                     B_index_unrolled_2 = blok * 128 + (ro * 16 + 4) * T.cast(in_k, dtype="int32")
                     B_index_unrolled_3 = blok * 128 + (ro * 16 + 8) * T.cast(in_k, dtype="int32")
                     B_index_unrolled_4 = blok * 128 + (ro * 16 + 12) * T.cast(in_k, dtype="int32")
-                    
+
                     A_index_unrolled = ro * 16 + i * T.cast(in_m, dtype="int32")
                     A_index_unrolled_2 = A_index_unrolled + 4
                     A_index_unrolled_3 = A_index_unrolled + 8
                     A_index_unrolled_4 = A_index_unrolled + 12
-                    
+
                     a_b_vrmpy_accumulation_unrolled = T.call_llvm_intrin(
-                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), # instruction
-                        T.uint32(3), # number of inputs
-                        C[T.ramp(out_index, 1, 32)], # accumulation location
-                        T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype = "int32x32"), # 32 4 byte inputs (Vu) to vrmpy 
-                        T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype = "int32"), # 4 byte input (Rt) to vrmpy
-                        dtype = "int32x32" # output datatype
+                        T.llvm_lookup_intrinsic_id(
+                            "llvm.hexagon.V6.vrmpyub.acc.128B"
+                        ),  # instruction
+                        T.uint32(3),  # number of inputs
+                        C[T.ramp(out_index, 1, 32)],  # accumulation location
+                        T.reinterpret(
+                            B[T.ramp(B_index_unrolled, 1, 128)], dtype="int32x32"
+                        ),  # 32 4 byte inputs (Vu) to vrmpy
+                        T.reinterpret(
+                            A[T.ramp(A_index_unrolled, 1, 4)], dtype="int32"
+                        ),  # 4 byte input (Rt) to vrmpy
+                        dtype="int32x32",  # output datatype
                     )
 
                     a_b_vrmpy_accumulation_unrolled_1 = T.call_llvm_intrin(
                         T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
                         T.uint32(3),
                         a_b_vrmpy_accumulation_unrolled,
-                        T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype = "int32x32"),
-                        T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype = "int32"),
-                        dtype = "int32x32"
+                        T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype="int32x32"),
+                        T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype="int32"),
+                        dtype="int32x32",
                     )
 
                     a_b_vrmpy_accumulation_unrolled_2 = T.call_llvm_intrin(
                         T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
                         T.uint32(3),
                         a_b_vrmpy_accumulation_unrolled_1,
-                        T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype = "int32x32"),
-                        T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype = "int32"),
-                        dtype = "int32x32"
+                        T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype="int32x32"),
+                        T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype="int32"),
+                        dtype="int32x32",
                     )
 
                     la_b_vrmpy_accumulation_unrolled_3 = T.call_llvm_intrin(
                         T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
                         T.uint32(3),
                         a_b_vrmpy_accumulation_unrolled_2,
-                        T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype = "int32x32"),
-                        T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype = "int32"),
-                        dtype = "int32x32"
+                        T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype="int32x32"),
+                        T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype="int32"),
+                        dtype="int32x32",
                     )
 
                     a_b_offsets_vrmpy_accumulation_unrolled_b = T.call_llvm_intrin(
                         T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"),
                         T.uint32(2),
-                        T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype = "int32x32"),
-                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), T.uint32(1), a_offset, dtype = "int32"), 
-                        dtype = "int32x32"
+                        T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype="int32x32"),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            a_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
                     )
 
                     a_b_offsets_vrmpy_accumulation_unrolled_a = T.call_llvm_intrin(
                         T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
                         T.uint32(3),
                         a_b_offsets_vrmpy_accumulation_unrolled_b,
-                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype = "int32"),dtype = "int32x32"),
-                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"),
-                        dtype = "int32x32"
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),
+                            T.uint32(1),
+                            T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype="int32"),
+                            dtype="int32x32",
+                        ),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            b_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
                     )
 
                     a_b_offsets_vrmpy_accumulation_unrolled_b1 = T.call_llvm_intrin(
                         T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
                         T.uint32(3),
                         a_b_offsets_vrmpy_accumulation_unrolled_a,
-                        T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype = "int32x32"),
-                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"),
-                        dtype = "int32x32"
+                        T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype="int32x32"),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            a_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
                     )
 
                     a_b_offsets_vrmpy_accumulation_unrolled_a1 = T.call_llvm_intrin(
                         T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
                         T.uint32(3),
                         a_b_offsets_vrmpy_accumulation_unrolled_b1,
-                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype = "int32"),dtype = "int32x32"),
-                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"),
-                        dtype = "int32x32"
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),
+                            T.uint32(1),
+                            T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype="int32"),
+                            dtype="int32x32",
+                        ),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            b_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
                     )
 
                     a_b_offsets_vrmpy_accumulation_unrolled_b2 = T.call_llvm_intrin(
                         T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
                         T.uint32(3),
                         a_b_offsets_vrmpy_accumulation_unrolled_a1,
-                        T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype = "int32x32"),
-                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"),
-                        dtype = "int32x32"
+                        T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype="int32x32"),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            a_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
                     )
 
                     a_b_offsets_vrmpy_accumulation_unrolled_a2 = T.call_llvm_intrin(
                         T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
                         T.uint32(3),
                         a_b_offsets_vrmpy_accumulation_unrolled_b2,
-                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),T.uint32(1),T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype = "int32"),dtype = "int32x32"),
-                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),b_offset,dtype = "int32"),
-                        dtype = "int32x32"
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),
+                            T.uint32(1),
+                            T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype="int32"),
+                            dtype="int32x32",
+                        ),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            b_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
                     )
 
                     a_b_offsets_vrmpy_accumulation_unrolled_b3 = T.call_llvm_intrin(
                         T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
                         T.uint32(3),
                         a_b_offsets_vrmpy_accumulation_unrolled_a2,
-                        T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype = "int32x32"),
-                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),T.uint32(1),a_offset,dtype = "int32"),
-                        dtype = "int32x32"
+                        T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype="int32x32"),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            a_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
                     )
 
                     a_b_offsets_vrmpy_accumulation_unrolled_a3 = T.call_llvm_intrin(
-                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), 
-                        T.uint32(3), 
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
                         a_b_offsets_vrmpy_accumulation_unrolled_b3,
-                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"), T.uint32(1), T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype = "int32"), dtype = "int32x32"),
-                        T.call_llvm_intrin(T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"), T.uint32(1), b_offset, dtype = "int32"),
-                        dtype = "int32x32"
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),
+                            T.uint32(1),
+                            T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype="int32"),
+                            dtype="int32x32",
+                        ),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            b_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
                     )
 
                     C[T.ramp(out_index, 1, 32)] = T.call_llvm_intrin(
-                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vsubw.128B"), 
-                        T.uint32(2), 
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vsubw.128B"),
+                        T.uint32(2),
                         la_b_vrmpy_accumulation_unrolled_3,
                         a_b_offsets_vrmpy_accumulation_unrolled_a3,
-                        dtype = "int32x32"
+                        dtype="int32x32",
                     )
 
-
     @T.prim_func
     def mmul_desc(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
@@ -178,12 +264,12 @@ def mmul_desc(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None:
         C = T.match_buffer(c, [in_n, in_k], dtype="int32")
         OFFSETS = T.match_buffer(offsets, [2], dtype="uint8")
         # body
-        with T.block("root"): 
+        with T.block("root"):
             for i0, i1, i2 in T.grid(in_m, in_n, in_k):
                 with T.block("C"):
                     y, x, j = T.axis.remap("SSR", [i0, i1, i2])
-                    C[y, x] = C[y, x] + T.cast(A[y, j] - OFFSETS[0], "int32") * T.cast(B[j, x] - OFFSETS[1], "int32")
+                    C[y, x] = C[y, x] + T.cast(A[y, j] - OFFSETS[0], "int32") * T.cast(
+                        B[j, x] - OFFSETS[1], "int32"
+                    )
 
     return mmul_desc, mm_uint8_intrinsic
-
-
diff --git a/tests/python/contrib/test_hexagon/quantization_utils.py b/tests/python/contrib/test_hexagon/quantization_utils.py
index 3ebcfe3a805e..96d3d4b38a13 100644
--- a/tests/python/contrib/test_hexagon/quantization_utils.py
+++ b/tests/python/contrib/test_hexagon/quantization_utils.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+
 def quantize_uint8(val, minval, maxval):
     range = max(0.0001, maxval - minval)
     resize_amt = 255.0 / range
@@ -41,4 +42,4 @@ def quantize_array(in_f, size):
     in_max = max(0, max(in_f))
     for i in range(size):
         in_q.append(quantize_uint8(in_f[i], in_min, in_max))
-    return in_q, in_min, in_max
\ No newline at end of file
+    return in_q, in_min, in_max
diff --git a/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py
index 0ab7cbd734a0..ac60eb5d67af 100644
--- a/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py
+++ b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py
@@ -26,8 +26,8 @@
 from tests.python.contrib.test_hexagon.conv_uint8_hvx_intrin import get_conv_uint8_hvx_intrin
 from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8
 
-class TestConvHVX:
 
+class TestConvHVX:
     def create_inputs(input_shape, filter_shape, mem_scope):
 
         w_size, _, _, _ = filter_shape
@@ -48,11 +48,12 @@ def create_inputs(input_shape, filter_shape, mem_scope):
 
         a_f = np.array(a_q, dtype="uint8").reshape(get_const_tuple(a.shape))
         w_f = np.array(w_q, dtype="uint8").reshape(get_const_tuple(w.shape))
-        expected_output = tvm.topi.testing.conv2d_nhwc_python(a_f, w_f, 1, input_padding).astype("int32")
+        expected_output = tvm.topi.testing.conv2d_nhwc_python(a_f, w_f, 1, input_padding).astype(
+            "int32"
+        )
 
         return a_q, w_q, a_offset, w_offset, expected_output, mem_scope
 
-
     a, w, a_offset, w_offset, expected_output, mem_scope = tvm.testing.parameters(
         (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "local")),
         (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global")),
@@ -67,9 +68,11 @@ def create_inputs(input_shape, filter_shape, mem_scope):
     )
 
     @tvm.testing.requires_hexagon
-    def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope):
+    def test_vrmpy_conv(
+        self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope
+    ):
 
-        #TODO even sized kernels and stride are currently not working. 
+        # TODO even sized kernels and stride are currently not working.
 
         batches, input_size, _, in_c = a.shape
         w_size, _, _, filters = w.shape
@@ -79,9 +82,15 @@ def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_ou
         out_shape = (batches, out_height, out_width, filters)
         c = np.zeros(out_shape, dtype="int32")
 
-        conv2d_vrmpy_description, conv2d_vrmpy_intrinsic, conv2d_operator = get_conv_uint8_hvx_intrin(a.shape, w.shape, a_offset, w_offset, mem_scope)
+        (
+            conv2d_vrmpy_description,
+            conv2d_vrmpy_intrinsic,
+            conv2d_operator,
+        ) = get_conv_uint8_hvx_intrin(a.shape, w.shape, a_offset, w_offset, mem_scope)
 
-        intrin_name = "conv2d.uint8_{}x{}x{}x{}_{}".format(input_size, input_size, w_size, w_size, mem_scope)
+        intrin_name = "conv2d.uint8_{}x{}x{}x{}_{}".format(
+            input_size, input_size, w_size, w_size, mem_scope
+        )
         try:
             TensorIntrin.register(intrin_name, conv2d_vrmpy_description, conv2d_vrmpy_intrinsic)
         except:
@@ -89,14 +98,18 @@ def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_ou
 
         ir_module = conv2d_operator
         sch = tvm.tir.Schedule(ir_module, debug_mask="all")
-        
+
         block = sch.get_block("C")
 
         w_block_local = sch.get_block("W_local")
-        sch.transform_layout(w_block_local, buffer=("write", 0), index_map=lambda h, w, c, f: (f, c, h, w))
+        sch.transform_layout(
+            w_block_local, buffer=("write", 0), index_map=lambda h, w, c, f: (f, c, h, w)
+        )
 
         a_block_local = sch.get_block("A_local")
-        sch.transform_layout(a_block_local, buffer=("write", 0), index_map=lambda b, h, w, c: (b, c, h, w))
+        sch.transform_layout(
+            a_block_local, buffer=("write", 0), index_map=lambda b, h, w, c: (b, c, h, w)
+        )
 
         n, f, y, x, ry, rx, rc = sch.get_loops(block)
         sch.reorder(n, f, rc, y, x, ry, rx)
@@ -104,15 +117,18 @@ def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_ou
         sch.tensorize(y, intrin_name)
 
         target_hexagon = tvm.target.hexagon("v68", link_params=True)
-        
+
         A = tvm.tir.decl_buffer(a.shape, name="A", dtype="uint8")
         W = tvm.tir.decl_buffer(w.shape, name="W", dtype="uint8")
         C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32")
 
         func_tir = tvm.build(
-            sch.mod, [A, W, C], tvm.target.Target(target_hexagon, host=target_hexagon), name="hvx_op"
+            sch.mod,
+            [A, W, C],
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            name="hvx_op",
         )
-        
+
         module = hexagon_session.load_module(func_tir)
 
         a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device)
@@ -121,22 +137,32 @@ def test_vrmpy_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_ou
 
         module(a_hexagon, w_hexagon, c_hexagon)
         out = c_hexagon.numpy()
-        out = out[:,:,:out_width,:]
-    
+        out = out[:, :, :out_width, :]
+
         tvm.testing.assert_allclose(out, expected_output)
-        
+
         timer = module.time_evaluator(module.entry_name, hexagon_session.device, number=1, repeat=1)
         time_ms = timer(a_hexagon, w_hexagon, c_hexagon).mean * 1000
-        print("Input Shape: {} Kernel Shape: {} Mem_scope: {}. HVX: {} ms.".format(a.shape, w.shape, mem_scope, time_ms))
+        print(
+            "Input Shape: {} Kernel Shape: {} Mem_scope: {}. HVX: {} ms.".format(
+                a.shape, w.shape, mem_scope, time_ms
+            )
+        )
 
     @tvm.testing.requires_hexagon
     def test_te_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope):
         batches, input_size, _, in_c = a.shape
         w_size, _, _, filters = w.shape
-        baseline_output, baseline_time = run_conv_te(hexagon_session, a, w, a_offset, w_offset, w_size // 2)
+        baseline_output, baseline_time = run_conv_te(
+            hexagon_session, a, w, a_offset, w_offset, w_size // 2
+        )
         tvm.testing.assert_allclose(baseline_output, expected_output)
-        print("Input Shape: {} Kernel Shape: {}. TE Baseline: {} ms".format(a.shape, w.shape, baseline_time))
-    
+        print(
+            "Input Shape: {} Kernel Shape: {}. TE Baseline: {} ms".format(
+                a.shape, w.shape, baseline_time
+            )
+        )
+
 
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py
index 161ff305c5c1..d025b2b48751 100644
--- a/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py
+++ b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py
@@ -28,9 +28,11 @@
 
 UNROLL_FACTOR = 4  # This must match the hard-coded unrolling in mm_uint8_intrinsic().
 
+
 def can_tensorize(n, m, k):
     return m % (4 * UNROLL_FACTOR) == k % 32 == 0
 
+
 def blockify_matrix(B):
     """
     inputs
@@ -89,6 +91,7 @@ def blockify_matrix(B):
                     BB[bt, j, k, b] = B[bt, y, k]
     return BB
 
+
 def setup_test(b, m, n, k):
     a_shape = (b, n, m)
     b_shape = (b, m, k)
@@ -122,6 +125,7 @@ def setup_test(b, m, n, k):
 
     return a_q, b_q, bb, a_offset, b_offset, intrin_name, expected_output
 
+
 class TestMatMulVec:
 
     batches, m, n, k = tvm.testing.parameters(
@@ -134,8 +138,8 @@ class TestMatMulVec:
 
     @tvm.testing.requires_hexagon
     def test_matmul_intrinsics(self, hexagon_session, batches, m, n, k):
-        
-        out_shape = (batches, n, k) 
+
+        out_shape = (batches, n, k)
 
         a_q, b_q, bb, a_offset, b_offset, intrin_name, out_ref = setup_test(batches, m, n, k)
 
@@ -151,32 +155,39 @@ def operator(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None:
                 for i0, i1, i2, i3 in T.grid(batches, m, n, k):
                     with T.block("C"):
                         batch, y, x, j = T.axis.remap("SSSR", [i0, i1, i2, i3])
-                        C[batch, y, x] = C[batch, y, x] + T.cast(A[batch, y, j] - OFFSETS[0], "int32") * T.cast(B[batch, j, x] - OFFSETS[1], "int32")
+                        C[batch, y, x] = C[batch, y, x] + T.cast(
+                            A[batch, y, j] - OFFSETS[0], "int32"
+                        ) * T.cast(B[batch, j, x] - OFFSETS[1], "int32")
 
         ir_module = operator
         sch = tvm.tir.Schedule(ir_module, debug_mask="all")
-        
+
         block = sch.get_block("C")
         _, y, _, _ = sch.get_loops(block)
         sch.tensorize(y, intrin_name)
-        
+
         A = tvm.tir.decl_buffer(a_q.shape, name="A", dtype="uint8")
         B = tvm.tir.decl_buffer(b_q.shape, name="B", dtype="uint8")
         C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32")
         OFFSETS = tvm.tir.decl_buffer((2), name="OFFSETS", dtype="uint8")
 
         target_hexagon = tvm.target.hexagon("v68", link_params=True)
-        func_tir = tvm.build(sch.mod, [A, B, C, OFFSETS], tvm.target.Target(target_hexagon, host=target_hexagon), name="qmmul_vrmpy")
+        func_tir = tvm.build(
+            sch.mod,
+            [A, B, C, OFFSETS],
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            name="qmmul_vrmpy",
+        )
         module = hexagon_session.load_module(func_tir)
 
         c = np.zeros(out_shape, dtype="int32")
         offsets = np.array([a_offset, b_offset], dtype="uint8")
-        
+
         a_hexagon = tvm.runtime.ndarray.array(a_q, device=hexagon_session.device)
         b_hexagon = tvm.runtime.ndarray.array(bb, device=hexagon_session.device)
         c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device)
         offsets_hexagon = tvm.runtime.ndarray.array(offsets, device=hexagon_session.device)
-        
+
         module(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon)
         evaluator = module.time_evaluator(module.entry_name, hexagon_session.device, number=1)
         time_ms = evaluator(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon).mean * 1e3
@@ -192,4 +203,3 @@ def operator(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None:
         out_ref_q = np.array(out_ref_q).reshape(batches, n, k)
 
         tvm.testing.assert_allclose(out_req, out_ref_q, atol=2.0, rtol=0.0)
-        
\ No newline at end of file