-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Closed
Labels
Description
When using an alloc_buffer that is not initialized or written out, CompactBufferAllocation and LoopVectorize passes perform invalid rewrites. Technically these could be considered correct because the changes have no effect on output, but they are very confusing when inspecting the lowered code.
Example script:
import tvm
from tvm.script import tir as T
@T.prim_func
def blis_gemm_microkernel_template(c: T.handle):
A_pack = T.alloc_buffer((8,), "float32", scope="local")
B_pack = T.alloc_buffer((8,), "float32", scope="local")
C = T.match_buffer(c, (8,8))
for loop in range(1000):
for rii in T.unroll(8):
for rjj in T.vectorized(8):
C[rii, rjj] += A_pack[rii] * B_pack[rjj]
if __name__ == "__main__":
with tvm.transform.PassContext(opt_level=3):
print(tvm.lower(blis_gemm_microkernel_template, "llvm -mcpu=znver3"))
with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CompactBufferAllocation"]):
print(tvm.lower(blis_gemm_microkernel_template, "llvm -mcpu=znver3"))
Output:
@main = primfn(c: handle) -> ()
attr = {"global_symbol": "main", "tir.noalias": True}
buffers = {C: Buffer(C_1: Pointer(global float32), float32, [64], [])}
buffer_map = {c: C} {
allocate(A_pack: Pointer(local float32x8), float32x8, [1]), storage_scope = local;
allocate(B_pack: Pointer(local float32x8), float32x8, [1]), storage_scope = local;
for (loop: int32, 0, 1000) {
C[ramp(0, 1, 8)] = (C[ramp(0, 1, 8)] + (A_pack_1: Buffer(A_pack, float32x8, [1], [], scope="local")[0]*B_pack_1: Buffer(B_pack, float32x8, [1], [], scope="local")[0]))
C[ramp(8, 1, 8)] = (C[ramp(8, 1, 8)] + (A_pack_1[0]*B_pack_1[0]))
C[ramp(16, 1, 8)] = (C[ramp(16, 1, 8)] + (A_pack_1[0]*B_pack_1[0]))
C[ramp(24, 1, 8)] = (C[ramp(24, 1, 8)] + (A_pack_1[0]*B_pack_1[0]))
C[ramp(32, 1, 8)] = (C[ramp(32, 1, 8)] + (A_pack_1[0]*B_pack_1[0]))
C[ramp(40, 1, 8)] = (C[ramp(40, 1, 8)] + (A_pack_1[0]*B_pack_1[0]))
C[ramp(48, 1, 8)] = (C[ramp(48, 1, 8)] + (A_pack_1[0]*B_pack_1[0]))
C[ramp(56, 1, 8)] = (C[ramp(56, 1, 8)] + (A_pack_1[0]*B_pack_1[0]))
}
}
@main = primfn(c: handle) -> ()
attr = {"global_symbol": "main", "tir.noalias": True}
buffers = {C: Buffer(C_1: Pointer(global float32), float32, [64], [])}
buffer_map = {c: C} {
allocate(A_pack: Pointer(local float32x8), float32x8, [8]), storage_scope = local;
allocate(B_pack: Pointer(local float32), float32, [64]), storage_scope = local;
for (loop: int32, 0, 1000) {
C[ramp(0, 1, 8)] = (C[ramp(0, 1, 8)] + (A_pack_1: Buffer(A_pack, float32x8, [8], [], scope="local")[0]*B_pack_1: Buffer(B_pack, float32, [64], [], scope="local")[ramp(0, 9, 8)]))
C[ramp(8, 1, 8)] = (C[ramp(8, 1, 8)] + (A_pack_1[1]*B_pack_1[ramp(0, 9, 8)]))
C[ramp(16, 1, 8)] = (C[ramp(16, 1, 8)] + (A_pack_1[2]*B_pack_1[ramp(0, 9, 8)]))
C[ramp(24, 1, 8)] = (C[ramp(24, 1, 8)] + (A_pack_1[3]*B_pack_1[ramp(0, 9, 8)]))
C[ramp(32, 1, 8)] = (C[ramp(32, 1, 8)] + (A_pack_1[4]*B_pack_1[ramp(0, 9, 8)]))
C[ramp(40, 1, 8)] = (C[ramp(40, 1, 8)] + (A_pack_1[5]*B_pack_1[ramp(0, 9, 8)]))
C[ramp(48, 1, 8)] = (C[ramp(48, 1, 8)] + (A_pack_1[6]*B_pack_1[ramp(0, 9, 8)]))
C[ramp(56, 1, 8)] = (C[ramp(56, 1, 8)] + (A_pack_1[7]*B_pack_1[ramp(0, 9, 8)]))
}
}
The first output incorrectly has A_pack_1 accesses as all being 0 when they should be 0-7. The second output has an incorrect ramp with a stride of 9.
Reactions are currently unavailable