diff --git a/tests/python/contrib/test_hexagon/conv_uint8.py b/tests/python/contrib/test_hexagon/conv_uint8.py
new file mode 100644
index 000000000000..9f4b00ec2c9f
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/conv_uint8.py
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import tvm
+
+from numbers import Integral
+from tvm import te
+
+
+def get_const_int(expr):
+    """Verifies expr is integer and get the constant value.
+
+    Parameters
+    ----------
+    expr : tvm.Expr or int
+        The input expression.
+
+    Returns
+    -------
+    out_value : int
+        The output.
+    """
+    if isinstance(expr, Integral):
+        return expr
+    if not isinstance(expr, tvm.tir.IntImm):
+        ana = tvm.arith.Analyzer()
+        expr = ana.simplify(expr)
+    if not isinstance(expr, tvm.tir.IntImm):
+        raise ValueError("Expect value to be constant int")
+    return int(expr.value)
+
+
+def get_const_tuple(in_tuple):
+    """Verifies input tuple is IntImm or Var, returns tuple of int or Var.
+
+    Parameters
+    ----------
+    in_tuple : tuple of Expr
+        The input.
+
+    Returns
+    -------
+    out_tuple : tuple of int
+        The output.
+    """
+    ret = []
+    ana = None
+    for elem in in_tuple:
+        if isinstance(elem, (tvm.tir.Var, tvm.tir.expr.Any)):
+            ret.append(elem)
+        elif not isinstance(elem, (tvm.tir.IntImm, int)):
+            ana = tvm.arith.Analyzer() if ana is None else ana
+            elem = ana.simplify(elem)
+            if not isinstance(elem, tvm.tir.IntImm):
+                ret.append(elem)
+            else:
+                ret.append(get_const_int(elem))
+        else:
+            ret.append(get_const_int(elem))
+    return tuple(ret)
+
+
+def Pad(Input, padding):
+    batch, in_height, in_width, in_channel = Input.shape
+    return te.compute(
+        (batch, in_height + 2 * padding, in_width + 2 * padding, in_channel),
+        lambda nn, yy, xx, cc: tvm.tir.if_then_else(
+            te.all(
+                yy >= padding,
+                yy - padding < in_height,
+                xx >= padding,
+                xx - padding < in_width,
+            ),
+            Input[nn, yy - padding, xx - padding, cc],
+            tvm.tir.const(0, Input.dtype),
+        ),
+        name="Apad",
+    )
+
+
+def schedule_qconv2d_nhwc(outs, target, device):
+    s = te.create_schedule([x.op for x in outs])
+    x = outs[0]
+    nn, yy, xx, cc = s[x].op.axis
+    px1, px2 = s[x].split(nn, nparts=1)
+    return s
+
+
+def qconv2d_nhwc(Input, in_offset, Filter, filt_offset, stride, padding, out_dtype=None):
+    if out_dtype is None:
+        out_dtype = Input.dtype
+
+    batch, in_height, in_width, in_channel = Input.shape
+    filt_height, filt_width, _, num_filter = Filter.shape
+    # Input is already padded. No need to add padding while computing
+    # out_height and out_width.
+    out_height = (in_height - filt_height) // stride + 1
+    out_width = (in_width - filt_width) // stride + 1
+    out_channel = num_filter
+
+    rc = te.reduce_axis((0, in_channel), name="rc")
+    ry = te.reduce_axis((0, filt_height), name="ry")
+    rx = te.reduce_axis((0, filt_width), name="rx")
+
+    return te.compute(
+        (batch, out_height, out_width, out_channel),
+        lambda nn, yy, xx, ff: te.sum(
+            (Input[nn, yy * stride + ry, xx * stride + rx, rc] - in_offset)
+            * (Filter[ry, rx, rc, ff] - filt_offset).astype(out_dtype),
+            axis=[rc, ry, rx],
+        ),
+        tag="qconv2d_nhwc",
+    )
+
+
+def run_conv_te(hexagon_session, a, w, a_offset, w_offset, padding):
+
+    # Input tensor size
+    A = te.placeholder(a.shape, name="A", dtype="uint8")
+    W = te.placeholder(w.shape, name="W", dtype="uint8")
+
+    # Pad input and create computation for quantized conv2d
+    Apad = Pad(A, padding)
+    B = qconv2d_nhwc(Apad, a_offset, W, w_offset, 1, padding)
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    device = hexagon_session.device
+    s = schedule_qconv2d_nhwc([B], target_hexagon, device)
+    nn, yy, xx, cc = s[B].op.axis
+    yo, yi = s[B].split(yy, nparts=1)
+    s[Apad].compute_at(s[B], yi)
+    s[B].vectorize(cc)
+
+    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), device=device)
+    func_te = tvm.build(
+        s,
+        [A, W, B],
+        target=tvm.target.Target(target_hexagon, host=target_hexagon),
+        name="quant_conv2d",
+    )
+
+    module_te = hexagon_session.load_module(func_te)
+
+    a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device)
+    w_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device)
+    b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device)
+
+    module_te(a_hexagon, w_hexagon, b_hexagon)
+    evaluator = module_te.time_evaluator(
+        module_te.entry_name, hexagon_session.device, number=1, repeat=1
+    )
+    mean_ms = evaluator(a_hexagon, w_hexagon, b_hexagon).mean * 1000
+
+    out = b_hexagon.numpy()
+
+    return out, mean_ms
diff --git a/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py
new file mode 100644
index 000000000000..a0b1d7de798f
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/conv_uint8_hvx_intrin.py
@@ -0,0 +1,238 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import math
+
+from tvm.script import tir as T
+
+
+def get_conv_uint8_hvx_intrin(input_shape, kernel_shape, a_offset, w_offset, mem_scope):
+    VRMPY_WIDTH = 128
+
+    batches, input_size, _, in_c = input_shape
+    w_size, _, _, filters = kernel_shape
+    out_size = input_size
+
+    input_padding = w_size // 2
+
+    # For this usage of vrmpy it loads 4 bytes for vv from the kernel. In order
+    # for this implementation to not mix output data there will need to be kernel
+    # padding to round to the nearest multiple of 4.
+    kernel_width_padding = 4 - w_size % 4
+    padded_kernel_width = w_size + kernel_width_padding
+
+    # vrmpy buffer loads are always 128B and will go out of bounds for the
+    # implementation written here if there is not sufficient padding. This
+    # means that for this implementation it must always be a multiple of 128
+    # and have the standard padding and the padding needed for the kernel
+    # window (4)
+    if input_size % VRMPY_WIDTH != 0:
+        input_width_padding = (
+            (VRMPY_WIDTH - (input_size) % VRMPY_WIDTH) + input_padding + kernel_width_padding
+        )
+    else:
+        input_width_padding = input_padding + kernel_width_padding
+
+    padded_input_height = input_size + 2 * input_padding
+    padded_input_width = input_size + input_padding + input_width_padding
+
+    # vrmpy output buffer loads will go out of bounds for this implementation
+    # if there is not proper padding.
+    padded_output_width = VRMPY_WIDTH * (padded_input_width // VRMPY_WIDTH) + 3
+
+    # The number of vrmpy loads (128B) needed to complete a horizontal frame of the input.
+    w_steps = math.ceil(input_size / VRMPY_WIDTH)
+
+    # The number of vrmpy loads (4B) needed to complete a horizontal frame of the kernel.
+    kw_steps = math.ceil(w_size / 4)
+
+    @T.prim_func
+    def conv2d_vrmpy(a: T.handle, w: T.handle, c: T.handle):
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A_local = T.match_buffer(
+            a,
+            [
+                T.cast(padded_input_height, dtype="int32")
+                * T.cast(padded_input_width, dtype="int32")
+            ],
+            dtype="uint8",
+            offset_factor=1,
+            scope=mem_scope,
+        )
+        W_local = T.match_buffer(
+            w,
+            [T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")],
+            dtype="uint8",
+            offset_factor=1,
+            scope=mem_scope,
+        )
+        C_local = T.match_buffer(
+            c,
+            [T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")],
+            dtype="int32",
+            offset_factor=1,
+            scope=mem_scope,
+        )
+        with T.block("root"):
+            T.reads(
+                A_local[
+                    0 : T.cast(padded_input_height, dtype="int32")
+                    * T.cast(padded_input_width, dtype="int32")
+                ],
+                W_local[
+                    0 : T.cast(w_size, dtype="int32") * T.cast(padded_kernel_width, dtype="int32")
+                ],
+            )
+            T.writes(
+                C_local[
+                    0 : T.cast(out_size, dtype="int32") * T.cast(padded_output_width, dtype="int32")
+                ]
+            )
+            for y, x_o, x_i, rx_o, ry in T.grid(input_size, w_steps, 4, kw_steps, w_size):
+                C_local[
+                    T.ramp(y * T.cast(padded_output_width, dtype="int32") + x_o * 128 + x_i, 4, 32)
+                ] += T.call_llvm_intrin(
+                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"),
+                    T.uint32(2),
+                    T.reinterpret(
+                        A_local[
+                            T.ramp(
+                                (y + ry) * T.cast(padded_input_width, dtype="int32")
+                                + x_o * 128
+                                + 4 * rx_o
+                                + x_i,
+                                1,
+                                128,
+                            )
+                        ],
+                        dtype="int32x32",
+                    ),
+                    T.reinterpret(
+                        W_local[
+                            T.ramp(ry * T.cast(padded_kernel_width, dtype="int32") + rx_o * 4, 1, 4)
+                        ],
+                        dtype="int32",
+                    ),
+                    dtype="int32x32",
+                )
+
+    @T.prim_func
+    def conv2d_vrmpy_desc(a: T.handle, w: T.handle, c: T.handle):
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A_local = T.match_buffer(
+            a,
+            [padded_input_height, padded_input_width],
+            dtype="uint8",
+            offset_factor=1,
+            scope=mem_scope,
+        )
+        W_local = T.match_buffer(
+            w, [w_size, padded_kernel_width], dtype="uint8", offset_factor=1, scope=mem_scope
+        )
+        C_local = T.match_buffer(
+            c, [out_size, padded_output_width], dtype="int32", offset_factor=1, scope=mem_scope
+        )
+        with T.block("root"):
+            for y, x, ry, rx in T.grid(input_size, input_size, w_size, w_size):
+                with T.block("C"):
+                    y, x, ry, rx = T.axis.remap("SSRR", [y, x, ry, rx])
+                    C_local[y, x] = C_local[y, x] + T.cast(
+                        A_local[y + ry, x + rx], "int32"
+                    ) * T.cast(W_local[ry, rx], "int32")
+
+    @T.prim_func
+    def operator(a: T.handle, w: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(
+            a, [batches, input_size, input_size, in_c], dtype="uint8", offset_factor=1
+        )
+        W = T.match_buffer(w, [w_size, w_size, in_c, filters], dtype="uint8", offset_factor=1)
+        C = T.match_buffer(
+            c, [batches, out_size, out_size, filters], dtype="int32", offset_factor=1
+        )
+        A_local = T.alloc_buffer(
+            [batches, padded_input_height, padded_input_width, in_c], dtype="uint8", scope=mem_scope
+        )
+        W_local = T.alloc_buffer(
+            [w_size, padded_kernel_width, in_c, filters], dtype="uint8", scope=mem_scope
+        )
+        C_local = T.alloc_buffer(
+            [batches, filters, out_size, padded_output_width], dtype="int32", scope=mem_scope
+        )
+        with T.block("root"):
+            for n, y, x, c in T.grid(batches, padded_input_height, padded_input_width, in_c):
+                with T.block("A_local"):
+                    nn, yy, xx, cc = T.axis.remap("SSSS", [n, y, x, c])
+                    T.reads(
+                        A[
+                            nn,
+                            yy - T.cast(input_padding, dtype="int32"),
+                            xx - T.cast(input_padding, dtype="int32"),
+                            cc,
+                        ]
+                    )
+                    T.writes(A_local[nn, yy, xx, cc])
+                    A_local[nn, yy, xx, cc] = T.if_then_else(
+                        T.cast(input_padding, dtype="int32") <= yy
+                        and yy
+                        < T.cast(padded_input_height, dtype="int32")
+                        - T.cast(input_padding, dtype="int32")
+                        and T.cast(input_padding, dtype="int32") <= xx
+                        and xx
+                        < T.cast(padded_input_width, dtype="int32")
+                        - T.cast(input_width_padding, dtype="int32"),
+                        A[
+                            nn,
+                            yy - T.cast(input_padding, dtype="int32"),
+                            xx - T.cast(input_padding, dtype="int32"),
+                            cc,
+                        ]
+                        - T.cast(a_offset, dtype="uint8"),
+                        T.uint8(0),
+                        dtype="uint8",
+                    )
+            for y, x, c, f in T.grid(w_size, padded_kernel_width, in_c, filters):
+                with T.block("W_local"):
+                    yy, xx, cc, ff = T.axis.remap("SSSS", [y, x, c, f])
+                    T.reads(W[yy, xx, cc, ff])
+                    T.writes(W_local[yy, xx, cc, ff])
+                    W_local[yy, xx, cc, ff] = T.if_then_else(
+                        xx
+                        < T.cast(padded_kernel_width, dtype="int32")
+                        - T.cast(kernel_width_padding, dtype="int32"),
+                        W[yy, xx, cc, ff] - T.cast(w_offset, dtype="uint8"),
+                        T.uint8(0),
+                        dtype="uint8",
+                    )
+            for n, f, y, x in T.grid(batches, filters, out_size, padded_output_width):
+                with T.block("C_local_init"):
+                    n, f, y, x = T.axis.remap("SSSS", [n, f, y, x])
+                    C_local[n, f, y, x] = 0
+            for n, f, y, x, ry, rx, rc in T.grid(
+                batches, filters, input_size, input_size, w_size, w_size, in_c
+            ):
+                with T.block("C"):
+                    n, f, y, x, ry, rx, rc = T.axis.remap("SSSSRRR", [n, f, y, x, ry, rx, rc])
+                    C_local[n, f, y, x] = C_local[n, f, y, x] + T.cast(
+                        A_local[n, y + ry, x + rx, rc], "int32"
+                    ) * T.cast(W_local[ry, rx, rc, f], "int32")
+            for n, f, y, x in T.grid(batches, filters, out_size, out_size):
+                with T.block("C_local"):
+                    n, f, y, x = T.axis.remap("SSSS", [n, f, y, x])
+                    C[n, y, x, f] = C_local[n, f, y, x]
+
+    return conv2d_vrmpy_desc, conv2d_vrmpy, operator
diff --git a/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py
new file mode 100644
index 000000000000..46cb816eeb56
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/mmul_unit8_hvx_intrin.py
@@ -0,0 +1,275 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from tvm.script import tir as T
+
+w_spit__b = "llvm.hexagon.S2.vsplatrb"  # Q6_R_vsplatb_R
+v_spit__w = "llvm.hexagon.V6.lvsplatw.128B"
+v_rmpy__uv_uw_acc = "llvm.hexagon.V6.vrmpyub.acc.128B"
+v_rmpy__uv_uw = "llvm.hexagon.V6.vrmpyub.128B"
+v_sub = "llvm.hexagon.V6.vsubw.128B"
+
+
+def get_mm_uint8_intrin(in_m, in_n, in_k):
+    blocks = in_k // 32
+    unrolled_rows = in_m // 16
+
+    @T.prim_func
+    def mm_uint8_intrinsic(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle):
+        A = T.match_buffer(
+            a, [T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")], dtype="uint8"
+        )
+        B = T.match_buffer(
+            b, [T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="uint8"
+        )
+        C = T.match_buffer(
+            c, [T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")], dtype="int32"
+        )
+        OFFSETS = T.match_buffer(offsets, [2], dtype="uint8")
+        with T.block("root"):
+            T.reads(
+                A[0 : T.cast(in_n, dtype="int32") * T.cast(in_m, dtype="int32")],
+                B[0 : T.cast(in_m, dtype="int32") * T.cast(in_k, dtype="int32")],
+                OFFSETS[0:2],
+            )
+            T.writes(C[0 : T.cast(in_n, dtype="int32") * T.cast(in_k, dtype="int32")])
+            for i in T.serial(in_n):
+                for s in T.serial(blocks):
+                    C[
+                        T.ramp(((s * 32) + (i * T.cast(in_k, dtype="int32"))), 1, 32)
+                    ] = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),
+                        T.uint32(1),
+                        (
+                            (T.cast(OFFSETS[0], dtype="int32") * T.cast(OFFSETS[1], dtype="int32"))
+                            * T.cast(in_m, dtype="int32")
+                        ),
+                        dtype="int32x32",
+                    )
+                for blok, ro in T.grid(blocks, unrolled_rows):
+                    b_offset = T.cast(OFFSETS[1], dtype="int32")
+                    a_offset = T.cast(OFFSETS[0], dtype="int32")
+                    out_index = blok * 32 + i * T.cast(in_k, dtype="int32")
+
+                    B_index_unrolled = blok * 128 + (ro * 16 * T.cast(in_k, dtype="int32"))
+                    B_index_unrolled_2 = blok * 128 + (ro * 16 + 4) * T.cast(in_k, dtype="int32")
+                    B_index_unrolled_3 = blok * 128 + (ro * 16 + 8) * T.cast(in_k, dtype="int32")
+                    B_index_unrolled_4 = blok * 128 + (ro * 16 + 12) * T.cast(in_k, dtype="int32")
+
+                    A_index_unrolled = ro * 16 + i * T.cast(in_m, dtype="int32")
+                    A_index_unrolled_2 = A_index_unrolled + 4
+                    A_index_unrolled_3 = A_index_unrolled + 8
+                    A_index_unrolled_4 = A_index_unrolled + 12
+
+                    a_b_vrmpy_accumulation_unrolled = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id(
+                            "llvm.hexagon.V6.vrmpyub.acc.128B"
+                        ),  # instruction
+                        T.uint32(3),  # number of inputs
+                        C[T.ramp(out_index, 1, 32)],  # accumulation location
+                        T.reinterpret(
+                            B[T.ramp(B_index_unrolled, 1, 128)], dtype="int32x32"
+                        ),  # 32 4 byte inputs (Vu) to vrmpy
+                        T.reinterpret(
+                            A[T.ramp(A_index_unrolled, 1, 4)], dtype="int32"
+                        ),  # 4 byte input (Rt) to vrmpy
+                        dtype="int32x32",  # output datatype
+                    )
+
+                    a_b_vrmpy_accumulation_unrolled_1 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_vrmpy_accumulation_unrolled,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype="int32x32"),
+                        T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype="int32"),
+                        dtype="int32x32",
+                    )
+
+                    a_b_vrmpy_accumulation_unrolled_2 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_vrmpy_accumulation_unrolled_1,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype="int32x32"),
+                        T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype="int32"),
+                        dtype="int32x32",
+                    )
+
+                    la_b_vrmpy_accumulation_unrolled_3 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_vrmpy_accumulation_unrolled_2,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype="int32x32"),
+                        T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype="int32"),
+                        dtype="int32x32",
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_b = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.128B"),
+                        T.uint32(2),
+                        T.reinterpret(B[T.ramp(B_index_unrolled, 1, 128)], dtype="int32x32"),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            a_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_a = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_b,
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),
+                            T.uint32(1),
+                            T.reinterpret(A[T.ramp(A_index_unrolled, 1, 4)], dtype="int32"),
+                            dtype="int32x32",
+                        ),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            b_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_b1 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_a,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_2, 1, 128)], dtype="int32x32"),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            a_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_a1 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_b1,
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),
+                            T.uint32(1),
+                            T.reinterpret(A[T.ramp(A_index_unrolled_2, 1, 4)], dtype="int32"),
+                            dtype="int32x32",
+                        ),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            b_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_b2 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_a1,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_3, 1, 128)], dtype="int32x32"),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            a_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_a2 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_b2,
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),
+                            T.uint32(1),
+                            T.reinterpret(A[T.ramp(A_index_unrolled_3, 1, 4)], dtype="int32"),
+                            dtype="int32x32",
+                        ),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            b_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_b3 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_a2,
+                        T.reinterpret(B[T.ramp(B_index_unrolled_4, 1, 128)], dtype="int32x32"),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            a_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
+                    )
+
+                    a_b_offsets_vrmpy_accumulation_unrolled_a3 = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                        T.uint32(3),
+                        a_b_offsets_vrmpy_accumulation_unrolled_b3,
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.lvsplatw.128B"),
+                            T.uint32(1),
+                            T.reinterpret(A[T.ramp(A_index_unrolled_4, 1, 4)], dtype="int32"),
+                            dtype="int32x32",
+                        ),
+                        T.call_llvm_intrin(
+                            T.llvm_lookup_intrinsic_id("llvm.hexagon.S2.vsplatrb"),
+                            T.uint32(1),
+                            b_offset,
+                            dtype="int32",
+                        ),
+                        dtype="int32x32",
+                    )
+
+                    C[T.ramp(out_index, 1, 32)] = T.call_llvm_intrin(
+                        T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vsubw.128B"),
+                        T.uint32(2),
+                        la_b_vrmpy_accumulation_unrolled_3,
+                        a_b_offsets_vrmpy_accumulation_unrolled_a3,
+                        dtype="int32x32",
+                    )
+
+    @T.prim_func
+    def mmul_desc(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, [in_n, in_m], dtype="uint8")
+        B = T.match_buffer(b, [in_m, in_k], dtype="uint8")
+        C = T.match_buffer(c, [in_n, in_k], dtype="int32")
+        OFFSETS = T.match_buffer(offsets, [2], dtype="uint8")
+        # body
+        with T.block("root"):
+            for i0, i1, i2 in T.grid(in_m, in_n, in_k):
+                with T.block("C"):
+                    y, x, j = T.axis.remap("SSR", [i0, i1, i2])
+                    C[y, x] = C[y, x] + T.cast(A[y, j] - OFFSETS[0], "int32") * T.cast(
+                        B[j, x] - OFFSETS[1], "int32"
+                    )
+
+    return mmul_desc, mm_uint8_intrinsic
diff --git a/tests/python/contrib/test_hexagon/quantization_utils.py b/tests/python/contrib/test_hexagon/quantization_utils.py
new file mode 100644
index 000000000000..96d3d4b38a13
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/quantization_utils.py
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def quantize_uint8(val, minval, maxval):
+    range = max(0.0001, maxval - minval)
+    resize_amt = 255.0 / range
+    value_f = (val - minval) * resize_amt
+    value_i = round(value_f, 8)
+    if value_i < 0:
+        return 0
+    elif value_i > 255:
+        return 255
+    else:
+        return int(value_i)
+
+
+def dequantize(val, minval, maxval):
+    range = max(0.0001, maxval - minval)
+    stepsize = range / 4294967296
+    return val * stepsize
+
+
+def quantize_array(in_f, size):
+    in_q = []
+    # 0 must lie in interval [min,max] for quantization to work correctly.
+    in_min = min(0, min(in_f))
+    in_max = max(0, max(in_f))
+    for i in range(size):
+        in_q.append(quantize_uint8(in_f[i], in_min, in_max))
+    return in_q, in_min, in_max
diff --git a/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py
new file mode 100644
index 000000000000..ac60eb5d67af
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_conv_hvx_intrinsics.py
@@ -0,0 +1,168 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import tvm.testing
+import tvm.topi.testing
+
+from numpy.random import default_rng
+from tvm.tir.function import TensorIntrin
+
+from tests.python.contrib.test_hexagon.conv_uint8 import get_const_tuple, run_conv_te
+from tests.python.contrib.test_hexagon.conv_uint8_hvx_intrin import get_conv_uint8_hvx_intrin
+from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8
+
+
+class TestConvHVX:
+    def create_inputs(input_shape, filter_shape, mem_scope):
+
+        w_size, _, _, _ = filter_shape
+        input_padding = w_size // 2
+
+        rng = default_rng()
+        a = rng.integers(1, 255, input_shape, dtype="uint8")
+        w = rng.integers(1, 8, filter_shape, dtype="uint8")
+
+        a_q, a_min, a_max = quantize_array(a.reshape(a.size), a.size)
+        w_q, b_min, b_max = quantize_array(w.reshape(w.size), w.size)
+
+        a_q = np.array(a_q, dtype="uint8").reshape(input_shape)
+        w_q = np.array(w_q, dtype="uint8").reshape(filter_shape)
+
+        a_offset = quantize_uint8(0.0, a_min, a_max)
+        w_offset = quantize_uint8(0.0, b_min, b_max)
+
+        a_f = np.array(a_q, dtype="uint8").reshape(get_const_tuple(a.shape))
+        w_f = np.array(w_q, dtype="uint8").reshape(get_const_tuple(w.shape))
+        expected_output = tvm.topi.testing.conv2d_nhwc_python(a_f, w_f, 1, input_padding).astype(
+            "int32"
+        )
+
+        return a_q, w_q, a_offset, w_offset, expected_output, mem_scope
+
+    a, w, a_offset, w_offset, expected_output, mem_scope = tvm.testing.parameters(
+        (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "local")),
+        (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global")),
+        (create_inputs((2, 128, 128, 3), (3, 3, 3, 2), "global.vtcm")),
+        (create_inputs((1, 128, 128, 3), (7, 7, 3, 1), "local")),
+        (create_inputs((1, 128, 128, 3), (5, 5, 3, 1), "local")),
+        (create_inputs((1, 128, 128, 3), (3, 3, 3, 1), "local")),
+        (create_inputs((4, 128, 128, 1), (3, 3, 1, 4), "local")),
+        (create_inputs((2, 32, 32, 32), (7, 7, 32, 2), "local")),
+        (create_inputs((2, 34, 34, 29), (5, 5, 29, 2), "local")),
+        (create_inputs((1, 512, 512, 1), (9, 9, 1, 1), "local")),
+    )
+
+    @tvm.testing.requires_hexagon
+    def test_vrmpy_conv(
+        self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope
+    ):
+
+        # TODO even sized kernels and stride are currently not working.
+
+        batches, input_size, _, in_c = a.shape
+        w_size, _, _, filters = w.shape
+
+        out_height = (input_size - w_size + 2 * (w_size // 2)) + 1
+        out_width = (input_size - w_size + 2 * (w_size // 2)) + 1
+        out_shape = (batches, out_height, out_width, filters)
+        c = np.zeros(out_shape, dtype="int32")
+
+        (
+            conv2d_vrmpy_description,
+            conv2d_vrmpy_intrinsic,
+            conv2d_operator,
+        ) = get_conv_uint8_hvx_intrin(a.shape, w.shape, a_offset, w_offset, mem_scope)
+
+        intrin_name = "conv2d.uint8_{}x{}x{}x{}_{}".format(
+            input_size, input_size, w_size, w_size, mem_scope
+        )
+        try:
+            TensorIntrin.register(intrin_name, conv2d_vrmpy_description, conv2d_vrmpy_intrinsic)
+        except:
+            print("Intrinsic already registered.")
+
+        ir_module = conv2d_operator
+        sch = tvm.tir.Schedule(ir_module, debug_mask="all")
+
+        block = sch.get_block("C")
+
+        w_block_local = sch.get_block("W_local")
+        sch.transform_layout(
+            w_block_local, buffer=("write", 0), index_map=lambda h, w, c, f: (f, c, h, w)
+        )
+
+        a_block_local = sch.get_block("A_local")
+        sch.transform_layout(
+            a_block_local, buffer=("write", 0), index_map=lambda b, h, w, c: (b, c, h, w)
+        )
+
+        n, f, y, x, ry, rx, rc = sch.get_loops(block)
+        sch.reorder(n, f, rc, y, x, ry, rx)
+
+        sch.tensorize(y, intrin_name)
+
+        target_hexagon = tvm.target.hexagon("v68", link_params=True)
+
+        A = tvm.tir.decl_buffer(a.shape, name="A", dtype="uint8")
+        W = tvm.tir.decl_buffer(w.shape, name="W", dtype="uint8")
+        C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32")
+
+        func_tir = tvm.build(
+            sch.mod,
+            [A, W, C],
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            name="hvx_op",
+        )
+
+        module = hexagon_session.load_module(func_tir)
+
+        a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device)
+        w_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device)
+        c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device)
+
+        module(a_hexagon, w_hexagon, c_hexagon)
+        out = c_hexagon.numpy()
+        out = out[:, :, :out_width, :]
+
+        tvm.testing.assert_allclose(out, expected_output)
+
+        timer = module.time_evaluator(module.entry_name, hexagon_session.device, number=1, repeat=1)
+        time_ms = timer(a_hexagon, w_hexagon, c_hexagon).mean * 1000
+        print(
+            "Input Shape: {} Kernel Shape: {} Mem_scope: {}. HVX: {} ms.".format(
+                a.shape, w.shape, mem_scope, time_ms
+            )
+        )
+
+    @tvm.testing.requires_hexagon
+    def test_te_conv(self, hexagon_session, a, w, a_offset, w_offset, expected_output, mem_scope):
+        batches, input_size, _, in_c = a.shape
+        w_size, _, _, filters = w.shape
+        baseline_output, baseline_time = run_conv_te(
+            hexagon_session, a, w, a_offset, w_offset, w_size // 2
+        )
+        tvm.testing.assert_allclose(baseline_output, expected_output)
+        print(
+            "Input Shape: {} Kernel Shape: {}. TE Baseline: {} ms".format(
+                a.shape, w.shape, baseline_time
+            )
+        )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py
new file mode 100644
index 000000000000..d025b2b48751
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_mmul_hvx_intrinsics.py
@@ -0,0 +1,205 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import tvm
+import tvm.testing
+
+from numpy.random import default_rng
+from tvm.script import tir as T
+from tvm.tir.function import TensorIntrin
+
+from tests.python.contrib.test_hexagon.mmul_unit8_hvx_intrin import get_mm_uint8_intrin
+from tests.python.contrib.test_hexagon.quantization_utils import quantize_array, quantize_uint8
+
+UNROLL_FACTOR = 4  # This must match the hard-coded unrolling in mm_uint8_intrinsic().
+
+
+def can_tensorize(n, m, k):
+    return m % (4 * UNROLL_FACTOR) == k % 32 == 0
+
+
+def blockify_matrix(B):
+    """
+    inputs
+    ------
+    B        : numpy 2D array (of M x K ) to be blockified
+
+    outputs
+    -------
+    BB       : blockified B as array of dimensions (M/4) x K x 4
+
+    blockification is in preparation for HVX ops on 128B 'vectors'
+    assuming input is of type int8 or uint8, and matrix multiplication uses vrmpy to accumulate to int32
+    - once for each block - then K/32 such vectors will be required to carry out the operation.
+
+    if B is height x width = M x K, then output is array of K blocks in x direction,
+    i.e. as [block1, block2, ... blockK ] where each block has dimensions M/4 x 4.
+    Specifically:
+
+     B = [ B[1,1]     B[1,2]      ...   B[1,K] ]
+         [ B[2,1]     B[2,2]      ...   B[2,K] ]
+                         .
+                         .
+                         .
+         [ B[M,1]     B[M,2]      ...   B[M,K] ]
+
+
+     BB =                                 |                                   |     |
+                                          |                                   |     |
+     [  B[1,1]   B[2,1]   B[3,1]   B[4,1] | B[1,2]   B[2,2]   B[3,2]   B[4,2] | ... | B[1,K]   B[2,K]   B[3,K]   B[4,K] ]
+     [  B[5,1]   B[6,1]   B[7,1]   B[8,1] | B[5,2]   B[6,2]   B[7,2]   B[8,2] | ... | B[5,K]   B[6,K]   B[7,K]   B[8,K] ]
+                        .                 |                 .                 |  .  |                 .
+                        .                 |                 .                 |  .  |                 .
+                        .                 |                 .                 |  .  |                 .
+     [  B[M-3,1] B[M-2,1] B[M-1,1] B[M,1] | B[M-3,2] B[M-2,2] B[M-1,2] B[M,2] | ... | B[M-3,K] B[M-2,K] B[M-1,K] B[M,K] ]
+                                          |                                   |     |
+                                          |                                   |     |
+                                          |                                   |     |
+                                          |                                   |     |
+                      ^                   |                 ^                 |     |                 ^
+                      |                   |                 |                 |     |                 |
+                      |                   |                 |                 |     |                 |
+                      |                   |                 |                 |     |                 |
+                    block 1                               block 2             |     |               block K
+
+    """
+    BT, M, K = B.shape
+    assert M % 4 == 0
+    out_height = M // 4
+    out_shape = tuple((BT, out_height, K, 4))
+    BB = np.zeros(out_shape).astype(B.dtype)  # block form of B
+    for bt in range(BT):
+        for j in range(0, out_height):
+            for k in range(0, K):
+                for b in range(0, 4):
+                    y = b + 4 * j
+                    BB[bt, j, k, b] = B[bt, y, k]
+    return BB
+
+
+def setup_test(b, m, n, k):
+    a_shape = (b, n, m)
+    b_shape = (b, m, k)
+
+    rng = default_rng()
+    a = rng.integers(1, 16, a_shape, dtype="uint8")
+    b = rng.integers(1, 16, b_shape, dtype="uint8")
+
+    a_q, a_min, a_max = quantize_array(a.reshape(a.size), a.size)
+    b_q, b_min, b_max = quantize_array(b.reshape(b.size), b.size)
+    a_q = np.array(a_q, dtype="uint8").reshape(a_shape)
+    b_q = np.array(b_q, dtype="uint8").reshape(b_shape)
+    a_offset = quantize_uint8(0.0, a_min, a_max)
+    b_offset = quantize_uint8(0.0, b_min, b_max)
+    if can_tensorize(n, m, k):
+        bb = blockify_matrix(b_q)  # blockification is only used by the tensorized version
+        bb = bb.reshape(-1)  # go via 1D rep in case there are stride / offset issues
+        bb = bb.reshape(b_shape)
+    else:
+        bb = []
+
+    a_f = np.array(a_q, dtype="int32").reshape(a_shape)
+    b_f = np.array(b_q, dtype="int32").reshape(b_shape)
+    expected_output = np.matmul(a_f, b_f)
+
+    intrin_name = "mm.uint8_{}x{}x{}".format(m, n, k)
+    try:
+        TensorIntrin.register(intrin_name, *get_mm_uint8_intrin(m, n, k))
+    except:
+        print("Intrinsic already registered.")
+
+    return a_q, b_q, bb, a_offset, b_offset, intrin_name, expected_output
+
+
+class TestMatMulVec:
+
+    batches, m, n, k = tvm.testing.parameters(
+        (1, 128, 768, 768),
+        (1, 128, 768, 3072),
+        (1, 128, 3072, 768),
+        (1, 128, 128, 64),
+        (1, 128, 64, 128),
+    )
+
+    @tvm.testing.requires_hexagon
+    def test_matmul_intrinsics(self, hexagon_session, batches, m, n, k):
+
+        out_shape = (batches, n, k)
+
+        a_q, b_q, bb, a_offset, b_offset, intrin_name, out_ref = setup_test(batches, m, n, k)
+
+        @T.prim_func
+        def operator(a: T.handle, b: T.handle, c: T.handle, offsets: T.handle) -> None:
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            A = T.match_buffer(a, [batches, n, m], dtype="uint8")
+            B = T.match_buffer(b, [batches, m, k], dtype="uint8")
+            C = T.match_buffer(c, [batches, n, k], dtype="int32")
+            OFFSETS = T.match_buffer(offsets, [2], dtype="uint8")
+            # body
+            with T.block("root"):
+                for i0, i1, i2, i3 in T.grid(batches, m, n, k):
+                    with T.block("C"):
+                        batch, y, x, j = T.axis.remap("SSSR", [i0, i1, i2, i3])
+                        C[batch, y, x] = C[batch, y, x] + T.cast(
+                            A[batch, y, j] - OFFSETS[0], "int32"
+                        ) * T.cast(B[batch, j, x] - OFFSETS[1], "int32")
+
+        ir_module = operator
+        sch = tvm.tir.Schedule(ir_module, debug_mask="all")
+
+        block = sch.get_block("C")
+        _, y, _, _ = sch.get_loops(block)
+        sch.tensorize(y, intrin_name)
+
+        A = tvm.tir.decl_buffer(a_q.shape, name="A", dtype="uint8")
+        B = tvm.tir.decl_buffer(b_q.shape, name="B", dtype="uint8")
+        C = tvm.tir.decl_buffer(out_shape, name="C", dtype="int32")
+        OFFSETS = tvm.tir.decl_buffer((2), name="OFFSETS", dtype="uint8")
+
+        target_hexagon = tvm.target.hexagon("v68", link_params=True)
+        func_tir = tvm.build(
+            sch.mod,
+            [A, B, C, OFFSETS],
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            name="qmmul_vrmpy",
+        )
+        module = hexagon_session.load_module(func_tir)
+
+        c = np.zeros(out_shape, dtype="int32")
+        offsets = np.array([a_offset, b_offset], dtype="uint8")
+
+        a_hexagon = tvm.runtime.ndarray.array(a_q, device=hexagon_session.device)
+        b_hexagon = tvm.runtime.ndarray.array(bb, device=hexagon_session.device)
+        c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device)
+        offsets_hexagon = tvm.runtime.ndarray.array(offsets, device=hexagon_session.device)
+
+        module(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon)
+        evaluator = module.time_evaluator(module.entry_name, hexagon_session.device, number=1)
+        time_ms = evaluator(a_hexagon, b_hexagon, c_hexagon, offsets_hexagon).mean * 1e3
+        print("Input Shape: {}. Conv time elapsed: {} ms".format((batches, m, n, k), time_ms))
+
+        out = c_hexagon.numpy()
+        out_a = out.reshape(batches * n * k)
+        out_req, _, _ = quantize_array(out_a, batches * n * k)
+        out_req = np.array(out_req).reshape(batches, n, k)
+
+        out_ref_a = out_ref.reshape(batches * n * k)
+        out_ref_q, _, _ = quantize_array(out_ref_a, batches * n * k)
+        out_ref_q = np.array(out_ref_q).reshape(batches, n, k)
+
+        tvm.testing.assert_allclose(out_req, out_ref_q, atol=2.0, rtol=0.0)