diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile index 7e9359140..b3300eba9 100644 --- a/examples/BuddyNext/makefile +++ b/examples/BuddyNext/makefile @@ -164,3 +164,296 @@ next-attention-fusion-run: -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-concat-matmul-lower: + @${MLIR_OPT} ./next-concat-matmul.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts \ + -o ./log.mlir + +next-concat-matmul-translate: + @${MLIR_OPT} ./next-concat-matmul.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + 
-finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll + +next-concat-matmul-run: + @${MLIR_OPT} ./next-concat-matmul.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-eliminate-add-zero-lower: + @${MLIR_OPT} ./next-eliminate-add-zero.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + 
-finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts \ + -o ./log.mlir + +next-eliminate-add-zero-translate: + @${MLIR_OPT} ./next-eliminate-add-zero.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll + +next-eliminate-add-zero-run: + @${MLIR_OPT} ./next-eliminate-add-zero.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + 
-convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + + +next-eliminate-identity-lower: + @${MLIR_OPT} ./next-eliminate-identity.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts \ + -o ./log.mlir + +next-eliminate-identity-translate: + @${MLIR_OPT} ./next-eliminate-identity.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + 
-convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll + + +next-eliminate-identity-run: + @${MLIR_OPT} ./next-eliminate-identity.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} diff --git a/examples/BuddyNext/next-concat-matmul.mlir b/examples/BuddyNext/next-concat-matmul.mlir new file mode 100644 index 000000000..87b7d0249 --- /dev/null +++ b/examples/BuddyNext/next-concat-matmul.mlir @@ -0,0 +1,113 @@ +module { + func.func private @printMemrefF32(tensor<*xf32>) + func.func private @rtclock() -> f64 + + func.func @qkv_compute_original(%arg1: tensor<4096x4096xf32>, %arg2: tensor<4096x4096xf32>, %arg3: tensor<4096x4096xf32>) { + // %41 = tosa.mul %40, %39 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32> + %41 = arith.constant dense<1.0> : tensor<1x40x4096xf32> + + %t0_original = call @rtclock() : () -> f64 + + %42 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %43 = tosa.transpose 
%arg1, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %44 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32> + %45 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%44, %43 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_6 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %46 = tosa.reshape %45 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + %47 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %48 = tosa.transpose %arg2, %47 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %49 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32> + %50 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%49, %48 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_7 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %51 = tosa.reshape %50 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + %52 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %53 = tosa.transpose %arg3, %52 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %54 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32> + %55 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%54, %53 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_8 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %56 = tosa.reshape %55 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + %t1_original = call @rtclock() : () -> f64 + + %tensor_unranked_46 = tensor.cast %46 : tensor<1x40x4096xf32> to tensor<*xf32> + %tensor_unranked_51 = tensor.cast %51 : tensor<1x40x4096xf32> to tensor<*xf32> + %tensor_unranked_56 = tensor.cast %56 : tensor<1x40x4096xf32> to 
tensor<*xf32> + + // Print results. + call @printMemrefF32(%tensor_unranked_46) : (tensor<*xf32>) -> () + call @printMemrefF32(%tensor_unranked_51) : (tensor<*xf32>) -> () + call @printMemrefF32(%tensor_unranked_56) : (tensor<*xf32>) -> () + + %t_original = arith.subf %t1_original, %t0_original : f64 + vector.print str "original operation time: " + vector.print %t_original : f64 + + return + } + + func.func @qkv_compute_optimized(%arg1: tensor<4096x4096xf32>, %arg2: tensor<4096x4096xf32>, %arg3: tensor<4096x4096xf32>) { + %41 = arith.constant dense<1.0> : tensor<1x40x4096xf32> + + %t0_optimized = call @rtclock() : () -> f64 + + %42 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %43 = tosa.transpose %arg1, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %47 = tosa.transpose %arg2, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %52 = tosa.transpose %arg3, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + + %concat_weights = "tosa.concat"(%43, %47, %52) {axis = 1 : i32} : (tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<4096x4096xf32>) -> tensor<4096x12288xf32> + + %44 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<40x12288xf32> + %45 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%44, %concat_weights : tensor<40x4096xf32>, tensor<4096x12288xf32>) outs(%cst_6 : tensor<40x12288xf32>) -> tensor<40x12288xf32> + + + %t1_optimized = call @rtclock() : () -> f64 + // %extracted_slice_15 = tensor.extract_slice %59[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %res_w1 = tensor.extract_slice %45[0, 0] [40, 4096] [1, 1] : tensor<40x12288xf32> to tensor<40x4096xf32> + %res_w2 = tensor.extract_slice %45[0, 4096] [40, 4096] [1, 1] : tensor<40x12288xf32> to tensor<40x4096xf32> + %res_w3 = tensor.extract_slice %45[0, 8192] [40, 4096] 
[1, 1] : tensor<40x12288xf32> to tensor<40x4096xf32> + + %46 = tosa.reshape %res_w1 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + %51 = tosa.reshape %res_w2 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + %56 = tosa.reshape %res_w3 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + // %result_W1 = "tosa.slice"(%45) {offsets = array<i64: 0, 0>, sizes = array<i64: 40, 4096>} : (tensor<40x12288xf32>) -> tensor<40x4096xf32> + // %result_W2 = "tosa.slice"(%45) {offsets = array<i64: 0, 4096>, sizes = array<i64: 40, 4096>} : (tensor<40x12288xf32>) -> tensor<40x4096xf32> + // %result_W3 = "tosa.slice"(%45) {offsets = array<i64: 0, 8192>, sizes = array<i64: 40, 4096>} : (tensor<40x12288xf32>) -> tensor<40x4096xf32> + + // %46 = tosa.reshape %result_W1 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + // %51 = tosa.reshape %result_W2 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + // %56 = tosa.reshape %result_W3 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + %tensor_unranked_46 = tensor.cast %46 : tensor<1x40x4096xf32> to tensor<*xf32> + %tensor_unranked_51 = tensor.cast %51 : tensor<1x40x4096xf32> to tensor<*xf32> + %tensor_unranked_56 = tensor.cast %56 : tensor<1x40x4096xf32> to tensor<*xf32> + + // Print results. 
+ call @printMemrefF32(%tensor_unranked_46) : (tensor<*xf32>) -> () + call @printMemrefF32(%tensor_unranked_51) : (tensor<*xf32>) -> () + call @printMemrefF32(%tensor_unranked_56) : (tensor<*xf32>) -> () + + %t_optimized = arith.subf %t1_optimized, %t0_optimized : f64 + vector.print str "optimized operation time: " + vector.print %t_optimized : f64 + + return + } + + + func.func @main() { + %arg1 = arith.constant dense<0.0> : tensor<4096x4096xf32> + %arg2 = arith.constant dense<1.0> : tensor<4096x4096xf32> + %arg3 = arith.constant dense<2.0> : tensor<4096x4096xf32> + + call @qkv_compute_original(%arg1, %arg2, %arg3) : (tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<4096x4096xf32>) -> () + + call @qkv_compute_optimized(%arg1, %arg2, %arg3) : (tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<4096x4096xf32>) -> () + + return + } +} diff --git a/examples/BuddyNext/next-eliminate-add-zero.mlir b/examples/BuddyNext/next-eliminate-add-zero.mlir new file mode 100644 index 000000000..c40c073ec --- /dev/null +++ b/examples/BuddyNext/next-eliminate-add-zero.mlir @@ -0,0 +1,54 @@ +module { + func.func private @printMemrefF32(tensor<*xf32>) + func.func private @rtclock() -> f64 + + func.func @uvue_original() { + %t0_original = call @rtclock() : () -> f64 + + %84 = arith.constant dense<2.0> : tensor<1x32x40x128xf32> + %92 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32> + %93 = tosa.add %84, %92 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32> + %94 = tosa.reshape %93 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32> + + %t1_original = call @rtclock() : () -> f64 + %tensor_unranked = tensor.cast %94 : tensor<32x40x128xf32> to tensor<*xf32> + + // Print results. 
+ call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + + + %t_original = arith.subf %t1_original, %t0_original : f64 + vector.print str "original operation time: " + vector.print %t_original : f64 + return + } + + func.func @uve_optimized() { + %t0_optimized = call @rtclock() : () -> f64 + + %84 = arith.constant dense<2.0> : tensor<1x32x40x128xf32> + %94 = tosa.reshape %84 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32> + %t1_optimized = call @rtclock() : () -> f64 + + %tensor_unranked = tensor.cast %94 : tensor<32x40x128xf32> to tensor<*xf32> + + // Print results. + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + + %t_optimized = arith.subf %t1_optimized, %t0_optimized : f64 + vector.print str "optimized operation time: " + vector.print %t_optimized : f64 + return + } + + + func.func @main() { + %84 = arith.constant dense<2.0> : tensor<1x32x40x128xf32> + + call @uvue_original() : () -> () + + call @uve_optimized() : () -> () + + return + } +} diff --git a/examples/BuddyNext/next-eliminate-identity.mlir b/examples/BuddyNext/next-eliminate-identity.mlir new file mode 100644 index 000000000..424cac3b7 --- /dev/null +++ b/examples/BuddyNext/next-eliminate-identity.mlir @@ -0,0 +1,49 @@ +module { + func.func private @printMemrefF32(tensor<*xf32>) + func.func private @rtclock() -> f64 + + func.func @ie_original() { + %t0_original = call @rtclock() : () -> f64 + + %119 = arith.constant dense<1.0> : tensor<1x40x32x128xf32> + %120 = tosa.identity %119 : (tensor<1x40x32x128xf32>) -> tensor<1x40x32x128xf32> + %121 = tosa.reshape %120 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32> + %t1_original = call @rtclock() : () -> f64 + + %tensor_unranked = tensor.cast %121 : tensor<1x40x4096xf32> to tensor<*xf32> + + // Print results. 
+ call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + + %t_original = arith.subf %t1_original, %t0_original : f64 + vector.print str "original operation time: " + vector.print %t_original : f64 + return + } + + func.func @ie_optimized() { + %t0_optimized = call @rtclock() : () -> f64 + + %119 = arith.constant dense<1.0> : tensor<1x40x32x128xf32> + %121 = tosa.reshape %119 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32> + %t1_optimized = call @rtclock() : () -> f64 + + %tensor_unranked = tensor.cast %121 : tensor<1x40x4096xf32> to tensor<*xf32> + + // Print results. + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + %t_optimized = arith.subf %t1_optimized, %t0_optimized : f64 + vector.print str "optimized operation time: " + vector.print %t_optimized : f64 + return + } + + + func.func @main() { + + call @ie_original() : () -> () + call @ie_optimized() : () -> () + + return + } +}