Move the allocates of AoT codegen to be TVMBAWs
* Adding more comments and descriptions
* Modified the test case to use primitive relay

Change-Id: Ia18a169d94bded3f81af7b3081c7d1ac29c669bc
manupak committed Sep 22, 2021
1 parent d72a9e6 commit 321ba2c
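
For reference, TVMBAW is shorthand for TVMBackendAllocWorkspace, the TVM C backend API through which generated code requests workspace memory. As a reminder, the two calls involved are declared as follows in include/tvm/runtime/c_backend_api.h (TVM_DLL export macro omitted here):

    #include <stdint.h>

    /* Allocates a temporary workspace on the given device; returns NULL on
     * failure. dtype_code_hint/dtype_bits_hint only hint at the element type. */
    void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes,
                                   int dtype_code_hint, int dtype_bits_hint);

    /* Frees a workspace previously returned by TVMBackendAllocWorkspace;
     * returns 0 on success. */
    int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr);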
Showing 4 changed files with 39 additions and 30 deletions.
6 changes: 3 additions & 3 deletions src/relay/backend/aot_executor_codegen.cc
@@ -624,9 +624,9 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     for (auto kv : storage_device_map_) {
       for (auto sid : kv.second->storage_ids) {
         // The buffer_var is created with storage_scope to be global.workspace to be serviced by
-        // TVMBAWs, explicitly. The reasoning being the executor allocates should be serviced by
-        // TVMBAWs as the data could be accessed by many devices and should not be lowered to the
-        // stack. For more details please refer to the discussion here:
+        // TVMBackendAllocWorkspace(TVMBAW) calls, explicitly. The reasoning being the executor
+        // allocates should be serviced by TVMBAWs as the data could be accessed by many devices and
+        // should not be lowered to the stack. For more details please refer to the discussion here:
         // https://github.com/apache/tvm/issues/9022
         te::Var buffer_var(MakeString("sid_", sid),
                            PointerType(PrimType(DataType::Int(8)), "global.workspace"));
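
To make the effect concrete, here is a rough sketch of the C the AoT executor codegen emits for one such sid_ buffer after this change. The function and buffer names, sizes, and overall shape are assumptions for illustration; the real emitted source differs in detail:

    #include <stddef.h>
    #include <stdint.h>
    #include <tvm/runtime/c_backend_api.h>

    /* Hypothetical excerpt of a generated AoT main: the intermediate tensor
     * "sid_3" is serviced by TVMBAW calls rather than lowered to a stack array. */
    static int32_t run_model_sketch(float* data, float* weight, float* output) {
      /* device_type 1 = kDLCPU; 256 bytes = 1*1*4*4*4 float32 elements;
       * hints: dtype code 2 (float), 32 bits. */
      void* sid_3 = TVMBackendAllocWorkspace(1, 0, (uint64_t)256, 2, 32);
      if (sid_3 == NULL) {
        return -1;
      }
      /* ... invoke the operator PrimFuncs that write to and read from sid_3 ... */
      if (TVMBackendFreeWorkspace(1, 0, sid_3) != 0) {
        return -1;
      }
      return 0;
    }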
4 changes: 4 additions & 0 deletions src/tir/transforms/lower_tvm_builtin.cc
@@ -113,6 +113,10 @@ class BuiltinLower : public StmtExprMutator {
     op = stmt.as<AllocateNode>();
     // Get constant allocation bound.
     int64_t nbytes = GetVectorBytes(op->dtype);
+    // If the buffers are for CPU and have global scope,
+    // and are smaller than the runtime::kMaxStackAlloca heuristic,
+    // they are not serviced with TVMBackendAllocWorkspace calls
+    // and are placed on the stack instead.
     if (device_type_.defined()) {
       if (const auto* dev_type = device_type_.as<IntImmNode>()) {
         auto storage_scope = Downcast<PointerType>(op->buffer_var->type_annotation)->storage_scope;
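
The heuristic the new comment refers to can be sketched as the predicate below. The constant stands in for runtime::kMaxStackAlloca, and the function itself is illustrative, not TVM's actual implementation:

    #include <stdint.h>
    #include <string.h>

    enum { MAX_STACK_ALLOCA = 1024 };  /* stands in for runtime::kMaxStackAlloca */
    enum { DEVICE_CPU = 1 };           /* kDLCPU */

    /* Illustrative decision: small, constant-size CPU buffers in plain "global"
     * scope may live on the stack; "global.workspace" buffers must always be
     * serviced by TVMBackendAllocWorkspace so any device can access them. */
    static int lowers_to_stack_alloca(int device_type, const char* storage_scope,
                                      int64_t constant_nbytes) {
      return device_type == DEVICE_CPU && strcmp(storage_scope, "global") == 0 &&
             constant_nbytes > 0 && constant_nbytes <= MAX_STACK_ALLOCA;
    }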
1 change: 1 addition & 0 deletions src/tir/transforms/storage_rewrite.cc
@@ -478,6 +478,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     uint64_t bits_offset{0};
   };

+  // Checks whether the storage_scope is specially tagged for a specific memory.
   bool IsSpecialTaggedMemory(const StorageScope& scope) {
     return scope.tag.length() != 0 && scope.tag != ".dyn" && scope.tag != ".workspace";
   }
58 changes: 31 additions & 27 deletions tests/python/relay/aot/test_crt_aot.py
@@ -591,38 +591,42 @@ def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_work


 def test_aot_codegen_backend_alloc_workspace_calls():
-    dtype = "float32"
-
-    # These shapes should create small tensors that would
-    # get lowered to stack allocations in the CPU PrimFuncs.
-    # However, the AoT executor codegen should retain them
-    # as TVMBAW calls
-    ishape = (1, 4, 4, 4)
-    wshape = (4, 4, 3, 3)
-
-    data0 = relay.var("data", shape=ishape, dtype=dtype)
-    weight0 = relay.var("weight", shape=wshape, dtype=dtype)
-    out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1), groups=1)
-    main_f = relay.Function([data0, weight0], out)
-    mod = tvm.IRModule()
-    mod["main"] = main_f
-    mod = transform.InferType()(mod)
-
-    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
-    w1_data = np.random.uniform(0, 1, wshape).astype(dtype)
-
-    inputs = OrderedDict([("data", i_data), ("weight", w1_data)])
-    output_list = generate_ref_data(mod, inputs)
-
+    """This test checks whether AoT lowering creates TVMBackendAllocWorkspace calls"""
+
+    # The %data and %weight shapes in the following primitive Relay should create
+    # small tensors that would get lowered to stack allocations in the CPU PrimFuncs.
+    # However, the AoT executor codegen should retain them as TVMBAW calls
+    relay_mod = tvm.parser.fromtext(
+        """
+        #[version = "0.0.5"]
+        def @main(%data: Tensor[(1, 4, 4, 4), float32], %weight: Tensor[(4, 4, 3, 3), float32], src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 4, 4, 4), float32] {
+          %0 = fn (%p02: Tensor[(1, 4, 4, 4), float32], Primitive=1, hash="9332b3872fb5292c", src_layout="NCHW", dst_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] {
+            layout_transform(%p02, src_layout="NCHW", dst_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */
+          };
+          %1 = fn (%p03: Tensor[(4, 4, 3, 3), float32], Primitive=1, hash="9f0b2b8a24a4dab3", src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 1, 3, 3, 4, 4), float32] {
+            layout_transform(%p03, src_layout="OIHW", dst_layout="OIHW4i4o") /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */
+          };
+          %2 = %0(%data) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */;
+          %3 = %1(%weight) /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */;
+          %4 = fn (%p01: Tensor[(1, 1, 4, 4, 4), float32], %p1: Tensor[(1, 1, 3, 3, 4, 4), float32], out_layout="NCHW4c", kernel_layout="OIHW4i4o", Primitive=1, data_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] {
+            nn.contrib_conv2d_NCHWc(%p01, %p1, padding=[1, 1, 1, 1], channels=4, kernel_size=[3, 3], data_layout="NCHW4c", kernel_layout="OIHW4i4o", out_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */
+          };
+          %5 = %4(%2, %3) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */;
+          %6 = fn (%p0: Tensor[(1, 1, 4, 4, 4), float32], Primitive=1, src_layout="NCHW4c", dst_layout="NCHW") -> Tensor[(1, 4, 4, 4), float32] {
+            layout_transform(%p0, src_layout="NCHW4c", dst_layout="NCHW") /* ty=Tensor[(1, 4, 4, 4), float32] */
+          };
+          %6(%5) /* ty=Tensor[(1, 4, 4, 4), float32] */
+        }
+        """
+    )
     compiled_runtime_modules = compile_models(
-        AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
+        AOTTestModel(module=relay_mod, inputs=None, outputs=None),
         "c",
         True,
     )

     source = compiled_runtime_modules[0].lib.imported_modules[0].get_source()
-    # There should be three TVMBackendAllocWorkspace calls generated
-    # for the above snippet of code
+    # There should be three allocates created for three primitive relay function
+    # calls in the main for the above relay snippet.
     assert source.count("TVMBackendAllocWorkspace") == 3


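Note: in the module above, the three retained allocates correspond to the intermediate tensors %2, %3 and %5 produced by the primitive function calls; the final %6(%5) result is written into the caller-provided output buffer and so needs no workspace call. This mapping is inferred from the snippet rather than stated in the commit.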
