Move the allocates of AoT codegen to be TVMBAWs
* Adding more comments and descriptions
* Modified the test case to use primitive relay

Change-Id: Ia18a169d94bded3f81af7b3081c7d1ac29c669bc
manupak committed Sep 22, 2021
1 parent d72a9e6 commit 321ba2c
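
For reference, TVMBAW is shorthand for TVMBackendAllocWorkspace, the TVM C backend API through which generated code requests workspace memory. As a reminder, the two calls involved are declared as follows in include/tvm/runtime/c_backend_api.h (TVM_DLL export macro omitted here):

    #include <stdint.h>

    /* Allocates a temporary workspace on the given device; returns NULL on
     * failure. dtype_code_hint/dtype_bits_hint only hint at the element type. */
    void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes,
                                   int dtype_code_hint, int dtype_bits_hint);

    /* Frees a workspace previously returned by TVMBackendAllocWorkspace;
     * returns 0 on success. */
    int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr);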
Showing 4 changed files with 39 additions and 30 deletions.
6 changes: 3 additions & 3 deletions src/relay/backend/aot_executor_codegen.cc
@@ -624,9 +624,9 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     for (auto kv : storage_device_map_) {
       for (auto sid : kv.second->storage_ids) {
         // The buffer_var is created with storage_scope to be global.workspace to be serviced by
-        // TVMBAWs, explicitly. The reasoning being the executor allocates should be serviced by
-        // TVMBAWs as the data could be accessed by many devices and should not be lowered to the
-        // stack. For more details please refer to the discussion here:
+        // TVMBackendAllocWorkspace(TVMBAW) calls, explicitly. The reasoning being the executor
+        // allocates should be serviced by TVMBAWs as the data could be accessed by many devices and
+        // should not be lowered to the stack. For more details please refer to the discussion here:
         // https://github.com/apache/tvm/issues/9022
         te::Var buffer_var(MakeString("sid_", sid),
                            PointerType(PrimType(DataType::Int(8)), "global.workspace"));
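
To make the effect concrete, here is a rough sketch of the C the AoT executor codegen emits for one such sid_ buffer after this change. The function and buffer names, sizes, and overall shape are assumptions for illustration; the real emitted source differs in detail:

    #include <stddef.h>
    #include <stdint.h>
    #include <tvm/runtime/c_backend_api.h>

    /* Hypothetical excerpt of a generated AoT main: the intermediate tensor
     * "sid_3" is serviced by TVMBAW calls rather than lowered to a stack array. */
    static int32_t run_model_sketch(float* data, float* weight, float* output) {
      /* device_type 1 = kDLCPU; 256 bytes = 1*1*4*4*4 float32 elements;
       * hints: dtype code 2 (float), 32 bits. */
      void* sid_3 = TVMBackendAllocWorkspace(1, 0, (uint64_t)256, 2, 32);
      if (sid_3 == NULL) {
        return -1;
      }
      /* ... invoke the operator PrimFuncs that write to and read from sid_3 ... */
      if (TVMBackendFreeWorkspace(1, 0, sid_3) != 0) {
        return -1;
      }
      return 0;
    }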
4 changes: 4 additions & 0 deletions src/tir/transforms/lower_tvm_builtin.cc
@@ -113,6 +113,10 @@ class BuiltinLower : public StmtExprMutator {
     op = stmt.as<AllocateNode>();
     // Get constant allocation bound.
     int64_t nbytes = GetVectorBytes(op->dtype);
+    // If the buffers are for CPU and have global scope,
+    // and are smaller than the runtime::kMaxStackAlloca heuristic,
+    // they are not serviced with TVMBackendAllocWorkspace calls
+    // and are placed on the stack instead.
     if (device_type_.defined()) {
       if (const auto* dev_type = device_type_.as<IntImmNode>()) {
         auto storage_scope = Downcast<PointerType>(op->buffer_var->type_annotation)->storage_scope;
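
The heuristic the new comment refers to can be sketched as the predicate below. The constant stands in for runtime::kMaxStackAlloca, and the function itself is illustrative, not TVM's actual implementation:

    #include <stdint.h>
    #include <string.h>

    enum { MAX_STACK_ALLOCA = 1024 };  /* stands in for runtime::kMaxStackAlloca */
    enum { DEVICE_CPU = 1 };           /* kDLCPU */

    /* Illustrative decision: small, constant-size CPU buffers in plain "global"
     * scope may live on the stack; "global.workspace" buffers must always be
     * serviced by TVMBackendAllocWorkspace so any device can access them. */
    static int lowers_to_stack_alloca(int device_type, const char* storage_scope,
                                      int64_t constant_nbytes) {
      return device_type == DEVICE_CPU && strcmp(storage_scope, "global") == 0 &&
             constant_nbytes > 0 && constant_nbytes <= MAX_STACK_ALLOCA;
    }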
1 change: 1 addition & 0 deletions src/tir/transforms/storage_rewrite.cc
@@ -478,6 +478,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     uint64_t bits_offset{0};
   };

+  // Checks whether the storage_scope is specially tagged for a specific memory.
   bool IsSpecialTaggedMemory(const StorageScope& scope) {
     return scope.tag.length() != 0 && scope.tag != ".dyn" && scope.tag != ".workspace";
   }
58 changes: 31 additions & 27 deletions tests/python/relay/aot/test_crt_aot.py
@@ -591,38 +591,42 @@ def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_work


 def test_aot_codegen_backend_alloc_workspace_calls():
-    dtype = "float32"
-
-    # These shapes should create small tensors that would
-    # get lowered to stack allocations in the CPU PrimFuncs.
-    # However, the AoT executor codegen should retain them
-    # as TVMBAW calls
-    ishape = (1, 4, 4, 4)
-    wshape = (4, 4, 3, 3)
-
-    data0 = relay.var("data", shape=ishape, dtype=dtype)
-    weight0 = relay.var("weight", shape=wshape, dtype=dtype)
-    out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1), groups=1)
-    main_f = relay.Function([data0, weight0], out)
-    mod = tvm.IRModule()
-    mod["main"] = main_f
-    mod = transform.InferType()(mod)
-
-    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-    w1_data = np.random.uniform(0, 1, wshape).astype(dtype)
-
-    inputs = OrderedDict([("data", i_data), ("weight", w1_data)])
-    output_list = generate_ref_data(mod, inputs)
-
+    """This test checks whether AoT lowering creates TVMBackendAllocWorkspace calls"""
+
+    # The %data and %weight shapes in the following primitive Relay should create
+    # small tensors that would get lowered to stack allocations in the CPU PrimFuncs.
+    # However, the AoT executor codegen should retain them as TVMBAW calls
+    relay_mod = tvm.parser.fromtext(
+        """
+        #[version = "0.0.5"]
+        def @main(%data: Tensor[(1, 4, 4, 4), float32], %weight: Tensor[(4, 4, 3, 3), float32], src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 4, 4, 4), float32] {
+          %0 = fn (%p02: Tensor[(1, 4, 4, 4), float32], Primitive=1, hash="9332b3872fb5292c", src_layout="NCHW", dst_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] {
+            layout_transform(%p02, src_layout="NCHW", dst_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */
+          };
+          %1 = fn (%p03: Tensor[(4, 4, 3, 3), float32], Primitive=1, hash="9f0b2b8a24a4dab3", src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 1, 3, 3, 4, 4), float32] {
+            layout_transform(%p03, src_layout="OIHW", dst_layout="OIHW4i4o") /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */
+          };
+          %2 = %0(%data) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */;
+          %3 = %1(%weight) /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */;
+          %4 = fn (%p01: Tensor[(1, 1, 4, 4, 4), float32], %p1: Tensor[(1, 1, 3, 3, 4, 4), float32], out_layout="NCHW4c", kernel_layout="OIHW4i4o", Primitive=1, data_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] {
+            nn.contrib_conv2d_NCHWc(%p01, %p1, padding=[1, 1, 1, 1], channels=4, kernel_size=[3, 3], data_layout="NCHW4c", kernel_layout="OIHW4i4o", out_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */
+          };
+          %5 = %4(%2, %3) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */;
+          %6 = fn (%p0: Tensor[(1, 1, 4, 4, 4), float32], Primitive=1, src_layout="NCHW4c", dst_layout="NCHW") -> Tensor[(1, 4, 4, 4), float32] {
+            layout_transform(%p0, src_layout="NCHW4c", dst_layout="NCHW") /* ty=Tensor[(1, 4, 4, 4), float32] */
+          };
+          %6(%5) /* ty=Tensor[(1, 4, 4, 4), float32] */
+        }
+        """
+    )
     compiled_runtime_modules = compile_models(
-        AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
+        AOTTestModel(module=relay_mod, inputs=None, outputs=None),
         "c",
         True,
     )

     source = compiled_runtime_modules[0].lib.imported_modules[0].get_source()
-    # There should be three TVMBackendAllocWorkspace calls generated
-    # for the above snippet of code
+    # There should be three allocates created for three primitive relay function
+    # calls in the main for the above relay snippet.
     assert source.count("TVMBackendAllocWorkspace") == 3


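Note: in the module above, the three retained allocates correspond to the intermediate tensors %2, %3 and %5 produced by the primitive function calls; the final %6(%5) result is written into the caller-provided output buffer and so needs no workspace call. This mapping is inferred from the snippet rather than stated in the commit.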
