[compiler] fix bug in RemoveCopy (#289)
as title
XG-zheng committed May 29, 2024
1 parent b1914a7 commit c34abdc
Showing 4 changed files with 101 additions and 38 deletions.
15 changes: 11 additions & 4 deletions compiler/include/byteir/Dialect/MemRef/Utils/MemEffect.h
@@ -41,12 +41,19 @@ void getAllAlias(Operation *op,
llvm::SmallVectorImpl<SmallVector<Value>> &aliases,
bool skipNonOverlapedSubviews = false);

bool maybeOpOperandWrite(OpOperand &opOpernad);

bool maybeOpOperandRead(OpOperand &opOpernad);

// Note: this method would collect all **potential** read/write uses on given
// aliases
void getMemEffects(llvm::SmallVectorImpl<OpMemEffectOrder> &memEffects,
llvm::ArrayRef<SmallVector<Value>> aliases,
llvm::DenseMap<Operation *, unsigned> &opToIdx,
unsigned pivot);
using OperandMemEffectFn = std::function<bool(OpOperand &)>;
void getMemEffects(
llvm::SmallVectorImpl<OpMemEffectOrder> &memEffects,
llvm::ArrayRef<SmallVector<Value>> aliases,
llvm::DenseMap<Operation *, unsigned> &opToIdx, unsigned pivot,
const OperandMemEffectFn &hasReadEffect = maybeOpOperandRead,
const OperandMemEffectFn &hasWriteEffect = maybeOpOperandWrite);

} // namespace mlir

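A minimal caller-side sketch of the new signature, assuming aliases, opToIdx, and copyOp are already populated as in the RemoveCopy pattern below; the custom predicate treats operands of func.return as reads, while hasWriteEffect keeps its default of maybeOpOperandWrite:

// Sketch only: mirrors the call added in RemoveCopy.cpp below.
auto hasReadEffectFn = [](OpOperand &opOperand) -> bool {
  // A buffer consumed by func.return escapes the function, so count it as a read.
  return maybeOpOperandRead(opOperand) ||
         llvm::isa<func::ReturnOp>(opOperand.getOwner());
};
SmallVector<OpMemEffectOrder, 2> memEffects(2);
getMemEffects(memEffects, aliases, opToIdx, /*pivot=*/opToIdx[copyOp],
              /*hasReadEffect=*/hasReadEffectFn);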
18 changes: 17 additions & 1 deletion compiler/lib/Dialect/MemRef/Transforms/RemoveCopy.cpp
@@ -112,16 +112,32 @@ class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {
}
}

auto srcMemSpace = src.getType().cast<MemRefType>().getMemorySpace();
auto dstMemSpace = target.getType().cast<MemRefType>().getMemorySpace();
if (srcMemSpace && dstMemSpace && srcMemSpace != dstMemSpace) {
return failure();
}

SmallVector<SmallVector<Value>, 2> aliases(2);
getAllAlias(copyOp, aliases, /*skipNonOverlapedSubviews*/ true);
aliases[0].push_back(copyOp.getSource());

llvm::DenseMap<Operation *, unsigned> opToIdx;
unsigned idx = 0;
copyOp->getBlock()->walk<WalkOrder::PreOrder>(
[&](Operation *inner) { opToIdx[inner] = idx++; });

SmallVector<OpMemEffectOrder, 2> memEffects(2);
getMemEffects(memEffects, aliases, opToIdx, opToIdx[copyOp]);
auto hasReadEffectFn = [](OpOperand &opOpernad) -> bool {
if (maybeOpOperandRead(opOpernad) ||
llvm::isa<func::ReturnOp>(opOpernad.getOwner())) {
return true;
}
return false;
};

getMemEffects(memEffects, aliases, opToIdx, opToIdx[copyOp],
hasReadEffectFn);

auto hasReadAfterWrite = [&](ArrayRef<Operation *> reads,
ArrayRef<Operation *> writes) {
18 changes: 9 additions & 9 deletions compiler/lib/Dialect/MemRef/Utils/MemEffect.cpp
@@ -24,8 +24,7 @@
using namespace mlir;
using namespace llvm;

namespace {
static bool maybeOpOperandWrite(OpOperand &opOpernad) {
bool mlir::maybeOpOperandWrite(OpOperand &opOpernad) {
if (auto memEffect =
dyn_cast<MemoryEffectOpInterface>(opOpernad.getOwner())) {
return memEffect.getEffectOnValue<MemoryEffects::Write>(opOpernad.get())
@@ -34,15 +34,14 @@ static bool maybeOpOperandWrite(OpOperand &opOpernad) {
return true;
}

static bool maybeOpOperandRead(OpOperand &opOpernad) {
bool mlir::maybeOpOperandRead(OpOperand &opOpernad) {
if (auto memEffect =
dyn_cast<MemoryEffectOpInterface>(opOpernad.getOwner())) {
return memEffect.getEffectOnValue<MemoryEffects::Read>(opOpernad.get())
.has_value();
}
return true;
}
} // namespace

void mlir::getAllAlias(Operation *op,
SmallVectorImpl<SmallVector<Value>> &aliases,
@@ -75,23 +73,25 @@ void mlir::getAllAlias(Operation *op,
void mlir::getMemEffects(SmallVectorImpl<OpMemEffectOrder> &memEffects,
ArrayRef<SmallVector<Value>> aliases,
llvm::DenseMap<Operation *, unsigned> &opToIdx,
unsigned pivot) {
unsigned pivot,
const OperandMemEffectFn &hasReadEffect,
const OperandMemEffectFn &hasWriteEffect) {
for (const auto &en : llvm::enumerate(aliases)) {
for (auto val : en.value()) {
for (auto &use : val.getUses()) {
auto user = use.getOwner();
if (opToIdx[user] < pivot) {
if (maybeOpOperandRead(use)) {
if (hasReadEffect(use)) {
memEffects[en.index()].before.reads.push_back(user);
}
if (maybeOpOperandWrite(use)) {
if (hasWriteEffect(use)) {
memEffects[en.index()].before.writes.push_back(user);
}
} else if (opToIdx[user] > pivot) {
if (maybeOpOperandRead(use)) {
if (hasReadEffect(use)) {
memEffects[en.index()].after.reads.push_back(user);
}
if (maybeOpOperandWrite(use)) {
if (hasWriteEffect(use)) {
memEffects[en.index()].after.writes.push_back(user);
}
}
88 changes: 64 additions & 24 deletions compiler/test/Dialect/MemRef/removeCopy.mlir
@@ -489,14 +489,14 @@ module attributes {byre.container_module} {
// CHECK-LABEL: func.func @src_alloc_shape_transform_0(
// CHECK-SAME: %[[VAL_0:.*]]: memref<2x224x224x3xf16, "gpuhost"> {byre.argname = "input_tensor@Cast", byre.argtype = 1 : i32},
// CHECK-SAME: %[[VAL_1:.*]]: memref<2x1001xf16, "gpuhost"> {byre.argname = "softmax_tensor@Cast", byre.argtype = 2 : i32}) attributes {byre.entry_point, byteir.entry_point = {inputs = ["input_tensor@Cast"], outputs = ["softmax_tensor@Cast"]}} {
// CHECK: %[[VAL_2:.*]] = memref.collapse_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<2x224x224x3xf16, "gpuhost"> into memref<301056xf16, "gpuhost">
// CHECK: %[[VAL_3:.*]] = memref.collapse_shape %[[VAL_1]] {{\[\[}}0, 1]] : memref<2x1001xf16, "gpuhost"> into memref<2002xf16, "gpuhost">
// CHECK: %[[VAL_4:.*]] = memref.alloc() : memref<2002xf16, "gpu">
// CHECK: %[[VAL_5:.*]] = memref.expand_shape %[[VAL_2]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<301056xf16, "gpuhost"> into memref<2x224x1x672xf16, "gpuhost">
// CHECK: %[[VAL_6:.*]] = memref.alloc() : memref<2x224x1x672xf16, "gpu">
// CHECK: memref.copy %[[VAL_5]], %[[VAL_6]] : memref<2x224x1x672xf16, "gpuhost"> to memref<2x224x1x672xf16, "gpu">
// CHECK: byre.compute @foo(%[[VAL_6]], %[[VAL_4]]) {device = "gpu", kernel_name = "main_gpu", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "gpu">, memref<2002xf16, "gpu">
// CHECK: memref.copy %[[VAL_4]], %[[VAL_3]] : memref<2002xf16, "gpu"> to memref<2002xf16, "gpuhost">
// CHECK-DAG: %[[VAL_2:.*]] = memref.collapse_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<2x224x224x3xf16, "gpuhost"> into memref<301056xf16, "gpuhost">
// CHECK-DAG: %[[VAL_3:.*]] = memref.collapse_shape %[[VAL_1]] {{\[\[}}0, 1]] : memref<2x1001xf16, "gpuhost"> into memref<2002xf16, "gpuhost">
// CHECK-DAG: %[[VAL_4:.*]] = memref.alloc() : memref<2002xf16, "gpu">
// CHECK-DAG: %[[VAL_5:.*]] = memref.expand_shape %[[VAL_2]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<301056xf16, "gpuhost"> into memref<2x224x1x672xf16, "gpuhost">
// CHECK-DAG: %[[VAL_6:.*]] = memref.alloc() : memref<2x224x1x672xf16, "gpu">
// CHECK-DAG: memref.copy %[[VAL_5]], %[[VAL_6]] : memref<2x224x1x672xf16, "gpuhost"> to memref<2x224x1x672xf16, "gpu">
// CHECK-DAG: byre.compute @foo(%[[VAL_6]], %[[VAL_4]]) {device = "gpu", kernel_name = "main_gpu", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "gpu">, memref<2002xf16, "gpu">
// CHECK-DAG: memref.copy %[[VAL_4]], %[[VAL_3]] : memref<2002xf16, "gpu"> to memref<2002xf16, "gpuhost">
// CHECK: return
// CHECK: }

@@ -522,14 +522,14 @@ module attributes {byre.container_module} {
// CHECK-LABEL: func.func @src_alloc_shape_transform_1(
// CHECK-SAME: %[[VAL_0:.*]]: memref<2x224x224x3xf16, "gpuhost"> {byre.argname = "input_tensor@Cast", byre.argtype = 1 : i32},
// CHECK-SAME: %[[VAL_1:.*]]: memref<2x1001xf16, "gpuhost"> {byre.argname = "softmax_tensor@Cast", byre.argtype = 2 : i32}) attributes {byre.entry_point, byteir.entry_point = {inputs = ["input_tensor@Cast"], outputs = ["softmax_tensor@Cast"]}} {
// CHECK: %[[VAL_2:.*]] = memref.collapse_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<2x224x224x3xf16, "gpuhost"> into memref<301056xf16, "gpuhost">
// CHECK: %[[VAL_3:.*]] = memref.expand_shape %[[VAL_1]] {{\[\[}}0], [1, 2]] : memref<2x1001xf16, "gpuhost"> into memref<2x1x1001xf16, "gpuhost">
// CHECK: %[[VAL_4:.*]] = memref.alloc() : memref<2x1x1001xf16, "gpu">
// CHECK: %[[VAL_5:.*]] = memref.expand_shape %[[VAL_2]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<301056xf16, "gpuhost"> into memref<2x224x1x672xf16, "gpuhost">
// CHECK: %[[VAL_6:.*]] = memref.alloc() : memref<2x224x1x672xf16, "gpu">
// CHECK: memref.copy %[[VAL_5]], %[[VAL_6]] : memref<2x224x1x672xf16, "gpuhost"> to memref<2x224x1x672xf16, "gpu">
// CHECK: byre.compute @foo(%[[VAL_6]], %[[VAL_4]]) {device = "gpu", kernel_name = "main_gpu", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "gpu">, memref<2x1x1001xf16, "gpu">
// CHECK: memref.copy %[[VAL_4]], %[[VAL_3]] : memref<2x1x1001xf16, "gpu"> to memref<2x1x1001xf16, "gpuhost">
// CHECK-DAG: %[[VAL_2:.*]] = memref.collapse_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<2x224x224x3xf16, "gpuhost"> into memref<301056xf16, "gpuhost">
// CHECK-DAG: %[[VAL_3:.*]] = memref.expand_shape %[[VAL_1]] {{\[\[}}0], [1, 2]] : memref<2x1001xf16, "gpuhost"> into memref<2x1x1001xf16, "gpuhost">
// CHECK-DAG: %[[VAL_4:.*]] = memref.alloc() : memref<2x1x1001xf16, "gpu">
// CHECK-DAG: %[[VAL_5:.*]] = memref.expand_shape %[[VAL_2]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<301056xf16, "gpuhost"> into memref<2x224x1x672xf16, "gpuhost">
// CHECK-DAG: %[[VAL_6:.*]] = memref.alloc() : memref<2x224x1x672xf16, "gpu">
// CHECK-DAG: memref.copy %[[VAL_5]], %[[VAL_6]] : memref<2x224x1x672xf16, "gpuhost"> to memref<2x224x1x672xf16, "gpu">
// CHECK-DAG: byre.compute @foo(%[[VAL_6]], %[[VAL_4]]) {device = "gpu", kernel_name = "main_gpu", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "gpu">, memref<2x1x1001xf16, "gpu">
// CHECK-DAG: memref.copy %[[VAL_4]], %[[VAL_3]] : memref<2x1x1001xf16, "gpu"> to memref<2x1x1001xf16, "gpuhost">
// CHECK: return
// CHECK: }

@@ -554,14 +554,14 @@ module attributes {byre.container_module} {
// CHECK-LABEL: func.func @src_alloc_shape_transform_2(
// CHECK-SAME: %[[VAL_0:.*]]: memref<2x224x224x3xf16, "gpuhost"> {byre.argname = "input_tensor@Cast", byre.argtype = 1 : i32},
// CHECK-SAME: %[[VAL_1:.*]]: memref<2x1001xf16, "gpuhost"> {byre.argname = "softmax_tensor@Cast", byre.argtype = 2 : i32}) attributes {byre.entry_point, byteir.entry_point = {inputs = ["input_tensor@Cast"], outputs = ["softmax_tensor@Cast"]}} {
// CHECK: %[[VAL_2:.*]] = memref.collapse_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<2x224x224x3xf16, "gpuhost"> into memref<301056xf16, "gpuhost">
// CHECK: %[[VAL_3:.*]] = memref.expand_shape %[[VAL_1]] {{\[\[}}0], [1, 2]] : memref<2x1001xf16, "gpuhost"> into memref<2x1x1001xf16, "gpuhost">
// CHECK: %[[VAL_4:.*]] = memref.alloc() : memref<2x1x1001xf16, "gpu">
// CHECK: %[[VAL_5:.*]] = memref.expand_shape %[[VAL_2]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<301056xf16, "gpuhost"> into memref<2x224x1x672xf16, "gpuhost">
// CHECK: %[[VAL_6:.*]] = memref.alloc() : memref<2x224x1x672xf16, "gpu">
// CHECK: memref.copy %[[VAL_5]], %[[VAL_6]] : memref<2x224x1x672xf16, "gpuhost"> to memref<2x224x1x672xf16, "gpu">
// CHECK: byre.compute @foo(%[[VAL_6]], %[[VAL_4]]) {device = "gpu", kernel_name = "main_gpu", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "gpu">, memref<2x1x1001xf16, "gpu">
// CHECK: memref.copy %[[VAL_4]], %[[VAL_3]] : memref<2x1x1001xf16, "gpu"> to memref<2x1x1001xf16, "gpuhost">
// CHECK-DAG: %[[VAL_2:.*]] = memref.collapse_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<2x224x224x3xf16, "gpuhost"> into memref<301056xf16, "gpuhost">
// CHECK-DAG: %[[VAL_3:.*]] = memref.expand_shape %[[VAL_1]] {{\[\[}}0], [1, 2]] : memref<2x1001xf16, "gpuhost"> into memref<2x1x1001xf16, "gpuhost">
// CHECK-DAG: %[[VAL_4:.*]] = memref.alloc() : memref<2x1x1001xf16, "gpu">
// CHECK-DAG: %[[VAL_5:.*]] = memref.expand_shape %[[VAL_2]] {{\[\[}}0, 1, 2, 3]] {device = "gpuhost"} : memref<301056xf16, "gpuhost"> into memref<2x224x1x672xf16, "gpuhost">
// CHECK-DAG: %[[VAL_6:.*]] = memref.alloc() : memref<2x224x1x672xf16, "gpu">
// CHECK-DAG: memref.copy %[[VAL_5]], %[[VAL_6]] : memref<2x224x1x672xf16, "gpuhost"> to memref<2x224x1x672xf16, "gpu">
// CHECK-DAG: byre.compute @foo(%[[VAL_6]], %[[VAL_4]]) {device = "gpu", kernel_name = "main_gpu", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "gpu">, memref<2x1x1001xf16, "gpu">
// CHECK-DAG: memref.copy %[[VAL_4]], %[[VAL_3]] : memref<2x1x1001xf16, "gpu"> to memref<2x1x1001xf16, "gpuhost">
// CHECK: return
// CHECK: }

@@ -599,3 +599,43 @@ func.func @src_alloc_shape_transform_3(%arg0: memref<2x1x1001xf16>, %arg1: memre
// CHECK: }
// CHECK: return
// CHECK: }

// -----

func.func @insert_slice(%arg0: memref<1024x9xf32>, %arg1: memref<1024x9xf32>) -> (memref<1024x9xf32>, memref<1024x9xf32>) attributes {__byteir_elementwise_fusion__} {
%subview = memref.subview %arg0[0, 0] [1024, 4] [1, 1] : memref<1024x9xf32> to memref<1024x4xf32, strided<[9, 1]>>
%alloc = memref.alloc() : memref<1024x9xf32>
memref.copy %arg1, %alloc : memref<1024x9xf32> to memref<1024x9xf32>
%subview_0 = memref.subview %alloc[0, 0] [1024, 4] [1, 1] : memref<1024x9xf32> to memref<1024x4xf32, strided<[9, 1]>>
memref.copy %subview, %subview_0 : memref<1024x4xf32, strided<[9, 1]>> to memref<1024x4xf32, strided<[9, 1]>>
return %alloc, %arg1 : memref<1024x9xf32>, memref<1024x9xf32>
}

// CHECK-LABEL: func.func @insert_slice
// CHECK: memref.copy
// CHECK: memref.copy

// -----

module attributes {byre.container_module} {
func.func @h2dCopy(%arg0: memref<2x224x224x3xf16, "cpu"> {byre.argname = "input_tensor@Cast", byre.argtype = 1 : i32}, %arg1: memref<2x1001xf16, "cuda"> {byre.argname = "softmax_tensor@Cast", byre.argtype = 2 : i32}) attributes {byre.entry_point} {
%collapse_shape = memref.collapse_shape %arg0 [[0, 1, 2, 3]] {device = "cpu"} : memref<2x224x224x3xf16, "cpu"> into memref<301056xf16, "cpu">
%expand_shape = memref.expand_shape %collapse_shape [[0, 1, 2, 3]] {device = "cpu"} : memref<301056xf16, "cpu"> into memref<2x224x1x672xf16, "cpu">
%alloc = memref.alloc() : memref<2x1x1x1001xf16, "cuda">
%alloc_0 = memref.alloc() : memref<2x224x1x672xf16, "cuda">
memref.copy %expand_shape, %alloc_0 : memref<2x224x1x672xf16, "cpu"> to memref<2x224x1x672xf16, "cuda">
byre.compute @cudaComputeOp(%alloc_0, %alloc) {device = "cuda", kernel_name = "main_cuda", memory_effects = [1 : i32, 2 : i32]} : memref<2x224x1x672xf16, "cuda">, memref<2x1x1x1001xf16, "cuda">
%alloc_1 = memref.alloc() : memref<2x1x1x1001xf16, "cpu">
memref.copy %alloc, %alloc_1 : memref<2x1x1x1001xf16, "cuda"> to memref<2x1x1x1001xf16, "cpu">
%collapse_shape_2 = memref.collapse_shape %alloc_1 [[0], [1, 2, 3]] {device = "cpu"} : memref<2x1x1x1001xf16, "cpu"> into memref<2x1001xf16, "cpu">
memref.copy %collapse_shape_2, %arg1 : memref<2x1001xf16, "cpu"> to memref<2x1001xf16, "cuda">
return
}
}

// CHECK-LABEL: func.func @h2dCopy
// CHECK: memref.copy
// CHECK: memref.copy
// CHECK: memref.copy

