From 58244be9417aa6ba3cdb4feba44328e65ff5fad9 Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Tue, 5 Sep 2023 11:40:59 +0300
Subject: [PATCH 1/2] [VM][Adreno] Fix buffer usage for weights in the VM

In the VM flow, `fn->attrs` doesn't contain information about
`kernel_layout`, but this value is available from `expr_attrib`, the
attributes of the call itself. In this PR the `CanUseBuffers` function
was modified to take the kernel layout directly, so it works with both
the graph executor and the VM.

A new test that checks the memory scopes reported by the VM was added.
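
For reference, the core of the change, quoted from the diff below: the
kernel layout is read from the call attributes rather than `fn->attrs`.

    String kernel_layout = expr_attrib.as<Conv2DAttrs>()
                               ? expr_attrib.as<Conv2DAttrs>()->kernel_layout
                               : expr_attrib.as<Conv2DWinogradAttrs>()->kernel_layout;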
---
 .../transforms/annotate_texture_storage.cc    |  8 ++-
 .../test_conv2d_nchw_texture.py               | 72 ++++++++++++++++---
 .../opencl_texture/utils/adreno_utils.py      | 18 ++---
 3 files changed, 73 insertions(+), 25 deletions(-)

diff --git a/src/relay/transforms/annotate_texture_storage.cc b/src/relay/transforms/annotate_texture_storage.cc
index 4921cef4c8c2..01d47b69530b 100644
--- a/src/relay/transforms/annotate_texture_storage.cc
+++ b/src/relay/transforms/annotate_texture_storage.cc
@@ -174,8 +174,11 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
             for (const auto& ttype : FlattenTupleType(fn->params[i]->checked_type())) {
               std::string scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
               if (expr_attrib.as<Conv2DAttrs>() || expr_attrib.as<Conv2DWinogradAttrs>()) {
+                String kernel_layout = expr_attrib.as<Conv2DAttrs>()
+                                           ? expr_attrib.as<Conv2DAttrs>()->kernel_layout
+                                           : expr_attrib.as<Conv2DWinogradAttrs>()->kernel_layout;
                 if ((i == weights_pos) && !ttype->dtype.is_float16() &&
-                    CanUseBuffers(call->args[i], ttype->shape, fn->attrs)) {
+                    CanUseBuffers(call->args[i], ttype->shape, kernel_layout)) {
                   buffers_params.insert(fn->params[i]);
                   buffers_args.insert(call->args[i]);
                   scope = "global";
@@ -426,10 +429,9 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
   }
 
   bool CanUseBuffers(const Expr param, const Array<PrimExpr> shape,
-                     const tvm::DictAttrs param_attrs) const {
+                     const String kernel_layout) const {
     bool use_buffer = false;
     if (param.as<ConstantNode>() && shape.size() == 5) {
-      auto kernel_layout = param_attrs.GetAttr<String>("kernel_layout");
       if (kernel_layout == "HWOI4o" || kernel_layout == "HWIO4o") {
         int a0 = shape[0].as<IntImmNode>()->value;
         int a1 = shape[1].as<IntImmNode>()->value;
diff --git a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
index 3c9c3f2caf1e..5fd86adf6f69 100644
--- a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
@@ -692,7 +692,6 @@ def test_residual_block(remote, target, executor_type, dtype):
             {"data": input_shape},
             {"data": dtype},
             target,
-            static_memory_scope,
         )
 
 
@@ -803,8 +802,6 @@ def test_concat(remote, target, executor_type, dtype):
         "",
     ]
 
-    static_memory_scope = []
-
     if executor_type == "ge":
         build_run_compare(
             remote,
@@ -823,7 +820,6 @@ def test_concat(remote, target, executor_type, dtype):
             {"data": input_shape},
             {"data": dtype},
             target,
-            static_memory_scope,
         )
 
 
@@ -968,7 +964,6 @@ def test_pooling_branching_texture_params(remote, target, executor_type, dtype):
             {"data": input_shape},
             {"data": dtype},
             target,
-            static_memory_scope,
         )
 
 
@@ -1111,7 +1106,6 @@ def test_branching_texture_params(remote, target, executor_type, dtype):
             {"data": input_shape},
             {"data": dtype},
             target,
-            static_memory_scope,
         )
 
 
@@ -1212,7 +1206,6 @@ def test_conv2d_different_lowering_same_op(remote, target, executor_type, dtype)
             {"data": input_shape},
             {"data": dtype},
             target,
-            static_memory_scope,
         )
 
 
@@ -1380,7 +1373,6 @@ def test_injective_nwo_inputs1(remote, target, executor_type, dtype):
             {"data": input_shape},
             {"data": dtype},
             target,
-            static_memory_scope,
         )
 
 
@@ -1495,7 +1487,6 @@ def test_injective_nwo_inputs2(remote, target, executor_type, dtype):
             {"data": input_shape},
             {"data": dtype},
             target,
-            static_memory_scope,
         )
 
 
@@ -1534,5 +1525,68 @@ def test_conv2d_to_3_channels(remote, target, executor_type, dtype):
         )
 
 
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_weight_on_buffers(remote, target, executor_type, dtype):
+    target = "opencl -device=adreno"
+    input_shape = (1, 64, 75, 75)
+    filter_shape = (64, 64, 3, 3)
+    bias_shape = (64,)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W = relay.var("weight", shape=filter_shape, dtype=dtype)
+    BS = relay.var("bias", shape=bias_shape, dtype=dtype)
+    conv = relay.nn.conv2d(A, W, padding=[1, 1, 1, 1], channels=64, kernel_size=(3, 3))
+    conv = relay.nn.bias_add(conv, BS)
+    conv = relay.op.nn.relu(conv)
+
+    mod = relay.Function([A, W, BS], conv)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data = np.zeros(filter_shape).astype(dtype)
+    bias_data = np.zeros(bias_shape).astype(dtype)
+    initializer("weight", filter_data)
+    initializer("bias", bias_data)
+    params1 = {
+        "weight": tvm.nd.array(filter_data),
+        "bias": tvm.nd.array(bias_data),
+    }
+
+    if executor_type == "ge":
+        static_memory_scope = [
+            "",
+            "global.texture",
+            "global",
+            "global.texture-weight",
+            "",
+            "",
+        ]
+        build_run_compare(
+            remote,
+            mod,
+            params1,
+            {"data": input_shape},
+            {"data": dtype},
+            target,
+            static_memory_scope,
+        )
+    else:
+        static_memory_scope = """
+        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
+        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
+        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
+        VM VirtualDevice[3]: device type 4, id 0 and mem_scope global
+        VM VirtualDevice[4]: device type 4, id 0 and mem_scope global.texture-weight
+        """
+        build_run_compare_vm(
+            remote,
+            mod,
+            params1,
+            {"data": input_shape},
+            {"data": dtype},
+            target,
+            static_memory_scope,
+        )
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/utils/adreno_utils.py b/tests/python/relay/opencl_texture/utils/adreno_utils.py
index 309243df1624..722922f544d8 100644
--- a/tests/python/relay/opencl_texture/utils/adreno_utils.py
+++ b/tests/python/relay/opencl_texture/utils/adreno_utils.py
@@ -161,19 +161,11 @@ def build_run_compare_vm(
                 tvm_mod_nchwc, target=target, target_host=target_host, params=params1
             )
 
-    # TODO(echuraev): enable scope checking
-    ## verification that storage_scope has expected textures scopes
-    # graph_json = json.loads(graph)
-    # if "storage_scope" in graph_json["attrs"]:
-    #    assert (
-    #        len(static_mem_scopes) == len(graph_json["attrs"]["storage_scope"][1])
-    #        or len(static_mem_scopes) == 0
-    #    )
-    # else:
-    #    assert len(static_mem_scopes) == 0
-
-    # for i in range(0, len(static_mem_scopes)):
-    #    assert static_mem_scopes[i] == graph_json["attrs"]["storage_scope"][1][i]
+    if len(static_mem_scopes) > 0:
+        mem_scopes_lines = static_mem_scopes.strip().split('\n')
+        vm_lines = vmc._get_virtual_devices().strip().split('\n')
+        for i in range(0, len(mem_scopes_lines)):
+            assert mem_scopes_lines[i].strip() == vm_lines[i].strip()
 
     if remote is None:
         dev = tvm.opencl()

From a9b41daa099e7614e748be3ab994ff1ff7ede7ae Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Wed, 6 Sep 2023 09:15:57 +0300
Subject: [PATCH 2/2] Fix CI

Update the expected memory scopes in test_concat and use double-quoted
strings in adreno_utils.py for consistent string style.
---
 .../python/relay/opencl_texture/test_conv2d_nchw_texture.py  | 5 +++--
 tests/python/relay/opencl_texture/utils/adreno_utils.py      | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
index 5fd86adf6f69..1dd5ca2abd00 100644
--- a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
@@ -789,11 +789,12 @@ def test_concat(remote, target, executor_type, dtype):
 
     static_memory_scope = [
         "",
+        "global.texture",
         "global",
         "global.texture-weight",
-        "global.texture-weight",
         "global",
-        "global.texture-weight",
+        "global.texture-nhwc",
+        "global",
         "global.texture-weight",
         "",
         "",
diff --git a/tests/python/relay/opencl_texture/utils/adreno_utils.py b/tests/python/relay/opencl_texture/utils/adreno_utils.py
index 722922f544d8..d9e52f8847a7 100644
--- a/tests/python/relay/opencl_texture/utils/adreno_utils.py
+++ b/tests/python/relay/opencl_texture/utils/adreno_utils.py
@@ -162,8 +162,8 @@ def build_run_compare_vm(
             )
 
     if len(static_mem_scopes) > 0:
-        mem_scopes_lines = static_mem_scopes.strip().split('\n')
-        vm_lines = vmc._get_virtual_devices().strip().split('\n')
+        mem_scopes_lines = static_mem_scopes.strip().split("\n")
+        vm_lines = vmc._get_virtual_devices().strip().split("\n")
         for i in range(0, len(mem_scopes_lines)):
             assert mem_scopes_lines[i].strip() == vm_lines[i].strip()