[compiler] fix bug in RemoveCopy (#289)
as title
XG-zheng committed May 29, 2024
1 parent b1914a7 commit c34abdc
Showing 4 changed files with 101 additions and 38 deletions.
15 changes: 11 additions & 4 deletions compiler/include/byteir/Dialect/MemRef/Utils/MemEffect.h
@@ -41,12 +41,19 @@ void getAllAlias(Operation *op,
llvm::SmallVectorImpl<SmallVector<Value>> &aliases,
bool skipNonOverlapedSubviews = false);

bool maybeOpOperandWrite(OpOperand &opOpernad);

bool maybeOpOperandRead(OpOperand &opOpernad);

// Note: this method would collect all **potential** read/write uses on given
// aliases
void getMemEffects(llvm::SmallVectorImpl<OpMemEffectOrder> &memEffects,
llvm::ArrayRef<SmallVector<Value>> aliases,
llvm::DenseMap<Operation *, unsigned> &opToIdx,
unsigned pivot);
using OperandMemEffectFn = std::function<bool(OpOperand &)>;
void getMemEffects(
llvm::SmallVectorImpl<OpMemEffectOrder> &memEffects,
llvm::ArrayRef<SmallVector<Value>> aliases,
llvm::DenseMap<Operation *, unsigned> &opToIdx, unsigned pivot,
const OperandMemEffectFn &hasReadEffect = maybeOpOperandRead,
const OperandMemEffectFn &hasWriteEffect = maybeOpOperandWrite);

} // namespace mlir

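A minimal caller-side sketch of the new signature, assuming aliases, opToIdx, and copyOp are already populated as in the RemoveCopy pattern below; the custom predicate treats operands of func.return as reads, while hasWriteEffect keeps its default of maybeOpOperandWrite:

// Sketch only: mirrors the call added in RemoveCopy.cpp below.
auto hasReadEffectFn = [](OpOperand &opOperand) -> bool {
  // A buffer consumed by func.return escapes the function, so count it as a read.
  return maybeOpOperandRead(opOperand) ||
         llvm::isa<func::ReturnOp>(opOperand.getOwner());
};
SmallVector<OpMemEffectOrder, 2> memEffects(2);
getMemEffects(memEffects, aliases, opToIdx, /*pivot=*/opToIdx[copyOp],
              /*hasReadEffect=*/hasReadEffectFn);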
18 changes: 17 additions & 1 deletion compiler/lib/Dialect/MemRef/Transforms/RemoveCopy.cpp
@@ -112,16 +112,32 @@ class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {
}
}

auto srcMemSpace = src.getType().cast<MemRefType>().getMemorySpace();
auto dstMemSpace = target.getType().cast<MemRefType>().getMemorySpace();
if (srcMemSpace && dstMemSpace && srcMemSpace != dstMemSpace) {
return failure();
}

SmallVector<SmallVector<Value>, 2> aliases(2);
getAllAlias(copyOp, aliases, /*skipNonOverlapedSubviews*/ true);
aliases[0].push_back(copyOp.getSource());

llvm::DenseMap<Operation *, unsigned> opToIdx;
unsigned idx = 0;
copyOp->getBlock()->walk<WalkOrder::PreOrder>(
[&](Operation *inner) { opToIdx[inner] = idx++; });

SmallVector<OpMemEffectOrder, 2> memEffects(2);
getMemEffects(memEffects, aliases, opToIdx, opToIdx[copyOp]);
auto hasReadEffectFn = [](OpOperand &opOpernad) -> bool {
if (maybeOpOperandRead(opOpernad) ||
llvm::isa<func::ReturnOp>(opOpernad.getOwner())) {
return true;
}
return false;
};

getMemEffects(memEffects, aliases, opToIdx, opToIdx[copyOp],
hasReadEffectFn);

auto hasReadAfterWrite = [&](ArrayRef<Operation *> reads,
ArrayRef<Operation *> writes) {
18 changes: 9 additions & 9 deletions compiler/lib/Dialect/MemRef/Utils/MemEffect.cpp
@@ -24,8 +24,7 @@
using namespace mlir;
using namespace llvm;

namespace {
static bool maybeOpOperandWrite(OpOperand &opOpernad) {
bool mlir::maybeOpOperandWrite(OpOperand &opOpernad) {
if (auto memEffect =
dyn_cast<MemoryEffectOpInterface>(opOpernad.getOwner())) {
return memEffect.getEffectOnValue<MemoryEffects::Write>(opOpernad.get())
@@ -34,15 +34,14 @@ static bool maybeOpOperandWrite(OpOperand &opOpernad) {
return true;
}

static bool maybeOpOperandRead(OpOperand &opOpernad) {
bool mlir::maybeOpOperandRead(OpOperand &opOpernad) {
if (auto memEffect =
dyn_cast<MemoryEffectOpInterface>(opOpernad.getOwner())) {
return memEffect.getEffectOnValue<MemoryEffects::Read>(opOpernad.get())
.has_value();
}
return true;
}
} // namespace

void mlir::getAllAlias(Operation *op,
SmallVectorImpl<SmallVector<Value>> &aliases,
@@ -75,23 +73,25 @@ void mlir::getAllAlias(Operation *op,
void mlir::getMemEffects(SmallVectorImpl<OpMemEffectOrder> &memEffects,
ArrayRef<SmallVector<Value>> aliases,
llvm::DenseMap<Operation *, unsigned> &opToIdx,
unsigned pivot) {
unsigned pivot,
const OperandMemEffectFn &hasReadEffect,
const OperandMemEffectFn &hasWriteEffect) {
for (const auto &en : llvm::enumerate(aliases)) {
for (auto val : en.value()) {
for (auto &use : val.getUses()) {
auto user = use.getOwner();
if (opToIdx[user] < pivot) {
if (maybeOpOperandRead(use)) {
if (hasReadEffect(use)) {
memEffects[en.index()].before.reads.push_back(user);
}
if (maybeOpOperandWrite(use)) {
if (hasWriteEffect(use)) {
memEffects[en.index()].before.writes.push_back(user);
}
} else if (opToIdx[user] > pivot) {
if (maybeOpOperandRead(use)) {
if (hasReadEffect(use)) {
memEffects[en.index()].after.reads.push_back(user);
}
if (maybeOpOperandWrite(use)) {
if (hasWriteEffect(use)) {
memEffects[en.index()].after.writes.push_back(user);
}
}
88 changes: 64 additions & 24 deletions compiler/test/Dialect/MemRef/removeCopy.mlir
@@ -489,14 +489,14 @@ module attributes {byre.container_module} {
// CHECK-LABEL: func.func @src_alloc_shape_transform_0(
// CHECK-SAME: %[[VAL_0:.*]]: memref<2x224x224x3xf16, "gpuhost"> {byre.argname = "input_tensor@Cast", byre.argtype = 1 : i32},
// CHECK-SAME: %[[VAL_1:.*]]: memref<2x1001xf16, "gpuhost"> {byre.argname = "softmax_tensor@Cast", byre.argtype = 2 : i32}) attributes {byre.entry_point, byteir.entry_point = {inputs = ["input_tensor@Cast"], outputs = ["softmax_tensor@Cast"]}} {
// CHECK: %[[VAL_2:.*]] = memref.collapse_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<2x224x224x3xf16, "gpuhost"> into memref<301056xf16, "gpuhost">
// CHECK: %[[VAL_3:.*]] = memref.collapse_shape %[[VAL_1]] {{\[\[}}0, 1]] : memref<2x1001xf16, "gpuhost"> into memref<2002xf16, "gpuhost">
// CHECK: %[[VAL_4:.*]] = memref.alloc() : memref<2002xf16, "gpu">
// CHECK: %[[VAL_5:.*]] = memref.expand_shape %[[VAL_2]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<301056xf16, "gpuhost"> into memref<2x224x1x672xf16, "gpuhost">
// CHECK: %[[VAL_6:.*]] = memref.alloc() : memref<2x224x1x672xf16, "gpu">
// CHECK: memref.copy %[[VAL_5]], %[[VAL_6]] : memref<2x224x1x672xf16, "gpuhost"> to memref<2x224x1x672xf16, "gpu">
// CHECK: byre.compute @foo(%[[VAL_6]], %[[VAL_4]]) {device = "gpu", kernel_name = "main_gpu", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "gpu">, memref<2002xf16, "gpu">
// CHECK: memref.copy %[[VAL_4]], %[[VAL_3]] : memref<2002xf16, "gpu"> to memref<2002xf16, "gpuhost">
// CHECK-DAG: %[[VAL_2:.*]] = memref.collapse_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<2x224x224x3xf16, "gpuhost"> into memref<301056xf16, "gpuhost">
// CHECK-DAG: %[[VAL_3:.*]] = memref.collapse_shape %[[VAL_1]] {{\[\[}}0, 1]] : memref<2x1001xf16, "gpuhost"> into memref<2002xf16, "gpuhost">
// CHECK-DAG: %[[VAL_4:.*]] = memref.alloc() : memref<2002xf16, "gpu">
// CHECK-DAG: %[[VAL_5:.*]] = memref.expand_shape %[[VAL_2]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<301056xf16, "gpuhost"> into memref<2x224x1x672xf16, "gpuhost">
// CHECK-DAG: %[[VAL_6:.*]] = memref.alloc() : memref<2x224x1x672xf16, "gpu">
// CHECK-DAG: memref.copy %[[VAL_5]], %[[VAL_6]] : memref<2x224x1x672xf16, "gpuhost"> to memref<2x224x1x672xf16, "gpu">
// CHECK-DAG: byre.compute @foo(%[[VAL_6]], %[[VAL_4]]) {device = "gpu", kernel_name = "main_gpu", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "gpu">, memref<2002xf16, "gpu">
// CHECK-DAG: memref.copy %[[VAL_4]], %[[VAL_3]] : memref<2002xf16, "gpu"> to memref<2002xf16, "gpuhost">
// CHECK: return
// CHECK: }

@@ -522,14 +522,14 @@ module attributes {byre.container_module} {
// CHECK-LABEL: func.func @src_alloc_shape_transform_1(
// CHECK-SAME: %[[VAL_0:.*]]: memref<2x224x224x3xf16, "gpuhost"> {byre.argname = "input_tensor@Cast", byre.argtype = 1 : i32},
// CHECK-SAME: %[[VAL_1:.*]]: memref<2x1001xf16, "gpuhost"> {byre.argname = "softmax_tensor@Cast", byre.argtype = 2 : i32}) attributes {byre.entry_point, byteir.entry_point = {inputs = ["input_tensor@Cast"], outputs = ["softmax_tensor@Cast"]}} {
// CHECK: %[[VAL_2:.*]] = memref.collapse_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<2x224x224x3xf16, "gpuhost"> into memref<301056xf16, "gpuhost">
// CHECK: %[[VAL_3:.*]] = memref.expand_shape %[[VAL_1]] {{\[\[}}0], [1, 2]] : memref<2x1001xf16, "gpuhost"> into memref<2x1x1001xf16, "gpuhost">
// CHECK: %[[VAL_4:.*]] = memref.alloc() : memref<2x1x1001xf16, "gpu">
// CHECK: %[[VAL_5:.*]] = memref.expand_shape %[[VAL_2]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<301056xf16, "gpuhost"> into memref<2x224x1x672xf16, "gpuhost">
// CHECK: %[[VAL_6:.*]] = memref.alloc() : memref<2x224x1x672xf16, "gpu">
// CHECK: memref.copy %[[VAL_5]], %[[VAL_6]] : memref<2x224x1x672xf16, "gpuhost"> to memref<2x224x1x672xf16, "gpu">
// CHECK: byre.compute @foo(%[[VAL_6]], %[[VAL_4]]) {device = "gpu", kernel_name = "main_gpu", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "gpu">, memref<2x1x1001xf16, "gpu">
// CHECK: memref.copy %[[VAL_4]], %[[VAL_3]] : memref<2x1x1001xf16, "gpu"> to memref<2x1x1001xf16, "gpuhost">
// CHECK-DAG: %[[VAL_2:.*]] = memref.collapse_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<2x224x224x3xf16, "gpuhost"> into memref<301056xf16, "gpuhost">
// CHECK-DAG: %[[VAL_3:.*]] = memref.expand_shape %[[VAL_1]] {{\[\[}}0], [1, 2]] : memref<2x1001xf16, "gpuhost"> into memref<2x1x1001xf16, "gpuhost">
// CHECK-DAG: %[[VAL_4:.*]] = memref.alloc() : memref<2x1x1001xf16, "gpu">
// CHECK-DAG: %[[VAL_5:.*]] = memref.expand_shape %[[VAL_2]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<301056xf16, "gpuhost"> into memref<2x224x1x672xf16, "gpuhost">
// CHECK-DAG: %[[VAL_6:.*]] = memref.alloc() : memref<2x224x1x672xf16, "gpu">
// CHECK-DAG: memref.copy %[[VAL_5]], %[[VAL_6]] : memref<2x224x1x672xf16, "gpuhost"> to memref<2x224x1x672xf16, "gpu">
// CHECK-DAG: byre.compute @foo(%[[VAL_6]], %[[VAL_4]]) {device = "gpu", kernel_name = "main_gpu", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "gpu">, memref<2x1x1001xf16, "gpu">
// CHECK-DAG: memref.copy %[[VAL_4]], %[[VAL_3]] : memref<2x1x1001xf16, "gpu"> to memref<2x1x1001xf16, "gpuhost">
// CHECK: return
// CHECK: }

@@ -554,14 +554,14 @@ module attributes {byre.container_module} {
// CHECK-LABEL: func.func @src_alloc_shape_transform_2(
// CHECK-SAME: %[[VAL_0:.*]]: memref<2x224x224x3xf16, "gpuhost"> {byre.argname = "input_tensor@Cast", byre.argtype = 1 : i32},
// CHECK-SAME: %[[VAL_1:.*]]: memref<2x1001xf16, "gpuhost"> {byre.argname = "softmax_tensor@Cast", byre.argtype = 2 : i32}) attributes {byre.entry_point, byteir.entry_point = {inputs = ["input_tensor@Cast"], outputs = ["softmax_tensor@Cast"]}} {
// CHECK: %[[VAL_2:.*]] = memref.collapse_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<2x224x224x3xf16, "gpuhost"> into memref<301056xf16, "gpuhost">
// CHECK: %[[VAL_3:.*]] = memref.expand_shape %[[VAL_1]] {{\[\[}}0], [1, 2]] : memref<2x1001xf16, "gpuhost"> into memref<2x1x1001xf16, "gpuhost">
// CHECK: %[[VAL_4:.*]] = memref.alloc() : memref<2x1x1001xf16, "gpu">
// CHECK: %[[VAL_5:.*]] = memref.expand_shape %[[VAL_2]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<301056xf16, "gpuhost"> into memref<2x224x1x672xf16, "gpuhost">
// CHECK: %[[VAL_6:.*]] = memref.alloc() : memref<2x224x1x672xf16, "gpu">
// CHECK: memref.copy %[[VAL_5]], %[[VAL_6]] : memref<2x224x1x672xf16, "gpuhost"> to memref<2x224x1x672xf16, "gpu">
// CHECK: byre.compute @foo(%[[VAL_6]], %[[VAL_4]]) {device = "gpu", kernel_name = "main_gpu", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "gpu">, memref<2x1x1001xf16, "gpu">
// CHECK: memref.copy %[[VAL_4]], %[[VAL_3]] : memref<2x1x1001xf16, "gpu"> to memref<2x1x1001xf16, "gpuhost">
// CHECK-DAG: %[[VAL_2:.*]] = memref.collapse_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<2x224x224x3xf16, "gpuhost"> into memref<301056xf16, "gpuhost">
// CHECK-DAG: %[[VAL_3:.*]] = memref.expand_shape %[[VAL_1]] {{\[\[}}0], [1, 2]] : memref<2x1001xf16, "gpuhost"> into memref<2x1x1001xf16, "gpuhost">
// CHECK-DAG: %[[VAL_4:.*]] = memref.alloc() : memref<2x1x1001xf16, "gpu">
// CHECK-DAG: %[[VAL_5:.*]] = memref.expand_shape %[[VAL_2]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<301056xf16, "gpuhost"> into memref<2x224x1x672xf16, "gpuhost">
// CHECK-DAG: %[[VAL_6:.*]] = memref.alloc() : memref<2x224x1x672xf16, "gpu">
// CHECK-DAG: memref.copy %[[VAL_5]], %[[VAL_6]] : memref<2x224x1x672xf16, "gpuhost"> to memref<2x224x1x672xf16, "gpu">
// CHECK-DAG: byre.compute @foo(%[[VAL_6]], %[[VAL_4]]) {device = "gpu", kernel_name = "main_gpu", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "gpu">, memref<2x1x1001xf16, "gpu">
// CHECK-DAG: memref.copy %[[VAL_4]], %[[VAL_3]] : memref<2x1x1001xf16, "gpu"> to memref<2x1x1001xf16, "gpuhost">
// CHECK: return
// CHECK: }

@@ -599,3 +599,43 @@ func.func @src_alloc_shape_transform_3(%arg0: memref<2x1x1001xf16>, %arg1: memre
// CHECK: }
// CHECK: return
// CHECK: }

// -----

func.func @insert_slice(%arg0: memref<1024x9xf32>, %arg1: memref<1024x9xf32>) -> (memref<1024x9xf32>, memref<1024x9xf32>) attributes {__byteir_elementwise_fusion__} {
%subview = memref.subview %arg0[0, 0] [1024, 4] [1, 1] : memref<1024x9xf32> to memref<1024x4xf32, strided<[9, 1]>>
%alloc = memref.alloc() : memref<1024x9xf32>
memref.copy %arg1, %alloc : memref<1024x9xf32> to memref<1024x9xf32>
%subview_0 = memref.subview %alloc[0, 0] [1024, 4] [1, 1] : memref<1024x9xf32> to memref<1024x4xf32, strided<[9, 1]>>
memref.copy %subview, %subview_0 : memref<1024x4xf32, strided<[9, 1]>> to memref<1024x4xf32, strided<[9, 1]>>
return %alloc, %arg1 : memref<1024x9xf32>, memref<1024x9xf32>
}

// CHECK-LABEL: func.func @insert_slice
// CHECK: memref.copy
// CHECK: memref.copy

// -----

module attributes {byre.container_module} {
func.func @h2dCopy(%arg0: memref<2x224x224x3xf16, "cpu"> {byre.argname = "input_tensor@Cast", byre.argtype = 1 : i32}, %arg1: memref<2x1001xf16, "cuda"> {byre.argname = "softmax_tensor@Cast", byre.argtype = 2 : i32}) attributes {byre.entry_point} {
%collapse_shape = memref.collapse_shape %arg0 [[0, 1, 2, 3]] {device = "cpu"} : memref<2x224x224x3xf16, "cpu"> into memref<301056xf16, "cpu">
%expand_shape = memref.expand_shape %collapse_shape [[0, 1, 2, 3]] {device = "cpu"} : memref<301056xf16, "cpu"> into memref<2x224x1x672xf16, "cpu">
%alloc = memref.alloc() : memref<2x1x1x1001xf16, "cuda">
%alloc_0 = memref.alloc() : memref<2x224x1x672xf16, "cuda">
memref.copy %expand_shape, %alloc_0 : memref<2x224x1x672xf16, "cpu"> to memref<2x224x1x672xf16, "cuda">
byre.compute @cudaComputeOp(%alloc_0, %alloc) {device = "cuda", kernel_name = "main_cuda", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "cuda">, memref<2x1x1x1001xf16, "cuda">
%alloc_1 = memref.alloc() : memref<2x1x1x1001xf16, "cpu">
memref.copy %alloc, %alloc_1 : memref<2x1x1x1001xf16, "cuda"> to memref<2x1x1x1001xf16, "cpu">
%collapse_shape_2 = memref.collapse_shape %alloc_1 [[0], [1, 2, 3]] {device = "cpu"} : memref<2x1x1x1001xf16, "cpu"> into memref<2x1001xf16, "cpu">
memref.copy %collapse_shape_2, %arg1 : memref<2x1001xf16, "cpu"> to memref<2x1001xf16, "cuda">
return
}
}

// CHECK-LABEL: func.func @h2dCopy
// CHECK: memref.copy
// CHECK: memref.copy
// CHECK: memref.copy

