diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile index 7e9359140..b3300eba9 100644 --- a/examples/BuddyNext/makefile +++ b/examples/BuddyNext/makefile @@ -164,3 +164,296 @@ next-attention-fusion-run: -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-concat-matmul-lower: + @${MLIR_OPT} ./next-concat-matmul.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts \ + -o ./log.mlir + +next-concat-matmul-translate: + @${MLIR_OPT} ./next-concat-matmul.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + 
-finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll + +next-concat-matmul-run: + @${MLIR_OPT} ./next-concat-matmul.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-eliminate-add-zero-lower: + @${MLIR_OPT} ./next-eliminate-add-zero.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + 
-finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts \ + -o ./log.mlir + +next-eliminate-add-zero-translate: + @${MLIR_OPT} ./next-eliminate-add-zero.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll + +next-eliminate-add-zero-run: + @${MLIR_OPT} ./next-eliminate-add-zero.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + 
-convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + + +next-eliminate-identity-lower: + @${MLIR_OPT} ./next-eliminate-identity.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts \ + -o ./log.mlir + +next-eliminate-identity-translate: + @${MLIR_OPT} ./next-eliminate-identity.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + 
-convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll + + +next-eliminate-identity-run: + @${MLIR_OPT} ./next-eliminate-identity.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} diff --git a/examples/BuddyNext/next-concat-matmul.mlir b/examples/BuddyNext/next-concat-matmul.mlir new file mode 100644 index 000000000..87b7d0249 --- /dev/null +++ b/examples/BuddyNext/next-concat-matmul.mlir @@ -0,0 +1,113 @@ +module { + func.func private @printMemrefF32(tensor<*xf32>) + func.func private @rtclock() -> f64 + + func.func @qkv_compute_original(%arg1: tensor<4096x4096xf32>, %arg2: tensor<4096x4096xf32>, %arg3: tensor<4096x4096xf32>) { + // %41 = tosa.mul %40, %39 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32> + %41 = arith.constant dense<1.0> : tensor<1x40x4096xf32> + + %t0_original = call @rtclock() : () -> f64 + + %42 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %43 = tosa.transpose 
%arg1, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %44 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32> + %45 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%44, %43 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_6 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %46 = tosa.reshape %45 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + %47 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %48 = tosa.transpose %arg2, %47 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %49 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32> + %50 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%49, %48 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_7 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %51 = tosa.reshape %50 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + %52 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %53 = tosa.transpose %arg3, %52 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %54 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32> + %55 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%54, %53 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_8 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %56 = tosa.reshape %55 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + %t1_original = call @rtclock() : () -> f64 + + %tensor_unranked_46 = tensor.cast %46 : tensor<1x40x4096xf32> to tensor<*xf32> + %tensor_unranked_51 = tensor.cast %51 : tensor<1x40x4096xf32> to tensor<*xf32> + %tensor_unranked_56 = tensor.cast %56 : tensor<1x40x4096xf32> to 
tensor<*xf32> + + // Print results. + call @printMemrefF32(%tensor_unranked_46) : (tensor<*xf32>) -> () + call @printMemrefF32(%tensor_unranked_51) : (tensor<*xf32>) -> () + call @printMemrefF32(%tensor_unranked_56) : (tensor<*xf32>) -> () + + %t_original = arith.subf %t1_original, %t0_original : f64 + vector.print str "original operation time: " + vector.print %t_original : f64 + + return + } + + func.func @qkv_compute_optimized(%arg1: tensor<4096x4096xf32>, %arg2: tensor<4096x4096xf32>, %arg3: tensor<4096x4096xf32>) { + %41 = arith.constant dense<1.0> : tensor<1x40x4096xf32> + + %t0_optimized = call @rtclock() : () -> f64 + + %42 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %43 = tosa.transpose %arg1, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %47 = tosa.transpose %arg2, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %52 = tosa.transpose %arg3, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + + %concat_weights = "tosa.concat"(%43, %47, %52) {axis = 1 : i32} : (tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<4096x4096xf32>) -> tensor<4096x12288xf32> + + %44 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<40x12288xf32> + %45 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%44, %concat_weights : tensor<40x4096xf32>, tensor<4096x12288xf32>) outs(%cst_6 : tensor<40x12288xf32>) -> tensor<40x12288xf32> + + + %t1_optimized = call @rtclock() : () -> f64 + // %extracted_slice_15 = tensor.extract_slice %59[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %res_w1 = tensor.extract_slice %45[0, 0] [40, 4096] [1, 1] : tensor<40x12288xf32> to tensor<40x4096xf32> + %res_w2 = tensor.extract_slice %45[0, 4096] [40, 4096] [1, 1] : tensor<40x12288xf32> to tensor<40x4096xf32> + %res_w3 = tensor.extract_slice %45[0, 8192] [40, 4096] 
[1, 1] : tensor<40x12288xf32> to tensor<40x4096xf32> + + %46 = tosa.reshape %res_w1 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + %51 = tosa.reshape %res_w2 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + %56 = tosa.reshape %res_w3 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + // %result_W1 = "tosa.slice"(%45) {offsets = array<i64: 0, 0>, sizes = array<i64: 40, 4096>} : (tensor<40x12288xf32>) -> tensor<40x4096xf32> + // %result_W2 = "tosa.slice"(%45) {offsets = array<i64: 0, 4096>, sizes = array<i64: 40, 4096>} : (tensor<40x12288xf32>) -> tensor<40x4096xf32> + // %result_W3 = "tosa.slice"(%45) {offsets = array<i64: 0, 8192>, sizes = array<i64: 40, 4096>} : (tensor<40x12288xf32>) -> tensor<40x4096xf32> + + // %46 = tosa.reshape %result_W1 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + // %51 = tosa.reshape %result_W2 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + // %56 = tosa.reshape %result_W3 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + %tensor_unranked_46 = tensor.cast %46 : tensor<1x40x4096xf32> to tensor<*xf32> + %tensor_unranked_51 = tensor.cast %51 : tensor<1x40x4096xf32> to tensor<*xf32> + %tensor_unranked_56 = tensor.cast %56 : tensor<1x40x4096xf32> to tensor<*xf32> + + // Print results. 
+ call @printMemrefF32(%tensor_unranked_46) : (tensor<*xf32>) -> () + call @printMemrefF32(%tensor_unranked_51) : (tensor<*xf32>) -> () + call @printMemrefF32(%tensor_unranked_56) : (tensor<*xf32>) -> () + + %t_optimized = arith.subf %t1_optimized, %t0_optimized : f64 + vector.print str "optimized operation time: " + vector.print %t_optimized : f64 + + return + } + + + func.func @main() { + %arg1 = arith.constant dense<0.0> : tensor<4096x4096xf32> + %arg2 = arith.constant dense<1.0> : tensor<4096x4096xf32> + %arg3 = arith.constant dense<2.0> : tensor<4096x4096xf32> + + call @qkv_compute_original(%arg1, %arg2, %arg3) : (tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<4096x4096xf32>) -> () + + call @qkv_compute_optimized(%arg1, %arg2, %arg3) : (tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<4096x4096xf32>) -> () + + return + } +} diff --git a/examples/BuddyNext/next-eliminate-add-zero.mlir b/examples/BuddyNext/next-eliminate-add-zero.mlir new file mode 100644 index 000000000..c40c073ec --- /dev/null +++ b/examples/BuddyNext/next-eliminate-add-zero.mlir @@ -0,0 +1,54 @@ +module { + func.func private @printMemrefF32(tensor<*xf32>) + func.func private @rtclock() -> f64 + + func.func @uvue_original() { + %t0_original = call @rtclock() : () -> f64 + + %84 = arith.constant dense<2.0> : tensor<1x32x40x128xf32> + %92 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32> + %93 = tosa.add %84, %92 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32> + %94 = tosa.reshape %93 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32> + + %t1_original = call @rtclock() : () -> f64 + %tensor_unranked = tensor.cast %94 : tensor<32x40x128xf32> to tensor<*xf32> + + // Print results. 
+ call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + + + %t_original = arith.subf %t1_original, %t0_original : f64 + vector.print str "original operation time: " + vector.print %t_original : f64 + return + } + + func.func @uve_optimized() { + %t0_optimized = call @rtclock() : () -> f64 + + %84 = arith.constant dense<2.0> : tensor<1x32x40x128xf32> + %94 = tosa.reshape %84 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32> + %t1_optimized = call @rtclock() : () -> f64 + + %tensor_unranked = tensor.cast %94 : tensor<32x40x128xf32> to tensor<*xf32> + + // Print results. + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + + %t_optimized = arith.subf %t1_optimized, %t0_optimized : f64 + vector.print str "optimized operation time: " + vector.print %t_optimized : f64 + return + } + + + func.func @main() { + %84 = arith.constant dense<2.0> : tensor<1x32x40x128xf32> + + call @uvue_original() : () -> () + + call @uve_optimized() : () -> () + + return + } +} diff --git a/examples/BuddyNext/next-eliminate-identity.mlir b/examples/BuddyNext/next-eliminate-identity.mlir new file mode 100644 index 000000000..424cac3b7 --- /dev/null +++ b/examples/BuddyNext/next-eliminate-identity.mlir @@ -0,0 +1,49 @@ +module { + func.func private @printMemrefF32(tensor<*xf32>) + func.func private @rtclock() -> f64 + + func.func @ie_original() { + %t0_original = call @rtclock() : () -> f64 + + %119 = arith.constant dense<1.0> : tensor<1x40x32x128xf32> + %120 = tosa.identity %119 : (tensor<1x40x32x128xf32>) -> tensor<1x40x32x128xf32> + %121 = tosa.reshape %120 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32> + %t1_original = call @rtclock() : () -> f64 + + %tensor_unranked = tensor.cast %121 : tensor<1x40x4096xf32> to tensor<*xf32> + + // Print results. 
+ call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + + %t_original = arith.subf %t1_original, %t0_original : f64 + vector.print str "original operation time: " + vector.print %t_original : f64 + return + } + + func.func @ie_optimized() { + %t0_optimized = call @rtclock() : () -> f64 + + %119 = arith.constant dense<1.0> : tensor<1x40x32x128xf32> + %121 = tosa.reshape %119 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32> + %t1_optimized = call @rtclock() : () -> f64 + + %tensor_unranked = tensor.cast %121 : tensor<1x40x4096xf32> to tensor<*xf32> + + // Print results. + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + %t_optimized = arith.subf %t1_optimized, %t0_optimized : f64 + vector.print str "optimized operation time: " + vector.print %t_optimized : f64 + return + } + + + func.func @main() { + + call @ie_original() : () -> () + call @ie_optimized() : () -> () + + return + } +}