Merge f3aeceb into 912e6bc

albiol2004 · web-flow · commit 709cb99fb6ea · 2026-04-09T22:27:26.000+02:00
diff --git a/README.md b/README.md
@@ -59,7 +59,7 @@ The IRON Python API for Ryzen™ AI NPUs is described in the following paper:
 | [Reduction]() | Reduction | bfloat16 | | | 🟡 |  |
 | [Dequant](./aie_kernels/generic/expand.cc) | Dequant Q4NX from [AWQ](https://github.com/mit-han-lab/llm-awq) to bfloat16 | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/dequant/](./iron/operators/dequant/) |
 | [RELU](./aie_kernels/aie2/relu.cc) | RELU | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/relu/](./iron/operators/relu/) |
-| [Leaky RELU](./aie_kernels/aie2p/leaky_relu.cc) (WIP) | Leaky RELU kernel | bfloat16 | | ✓ | ⚪ | [iron/operators/leaky_relu/](./iron/operators/leaky_relu/) |
+| [Leaky RELU](./aie_kernels/aie2/leaky_relu.cc) | Leaky RELU | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/leaky_relu/](./iron/operators/leaky_relu/) |
 | [GELU](./aie_kernels/aie2/gelu.cc) | GELU | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/gelu/](./iron/operators/gelu/) |
 | [LayerNorm](./aie_kernels/aie2/layer_norm.cc) | LayerNorm | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/layer_norm/](./iron/operators/layer_norm/) |
 | [Convolution]() | Convolution | bfloat16 | | | 🟡 |  |
diff --git a/aie_kernels/aie2/leaky_relu.cc b/aie_kernels/aie2/leaky_relu.cc
@@ -0,0 +1,47 @@
+// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "../aie_kernel_utils.h"
+
+#include <aie_api/aie.hpp>
+#include <stdint.h>
+
+using namespace aie;
+// Element-wise Leaky ReLU on bfloat16: c[i] = max(a[i], alpha * a[i]), processed 16 lanes per loop iteration.
+void leaky_relu_vectorized_bf16(bfloat16 *restrict a,      // input buffer, read as 16-element vectors
+                                bfloat16 *restrict c,      // output buffer, written as 16-element vectors
+                                const int32_t vector_size, // element count used as the loop bound
+                                const bfloat16 alpha)      // slope multiplied into every input element
+{
+    event0(); // NOTE(review): presumably a trace/profiling marker for kernel start; defined outside this file
+
+    auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)a);  // cast is a no-op: a is already bfloat16 *
+    auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)c); // cast is a no-op: c is already bfloat16 *
+
+    // Replicate the scalar alpha into all 16 lanes once, outside the loop
+    vector<bfloat16, 16> alpha_vec = aie::broadcast<bfloat16, 16>(alpha);
+
+    AIE_PREPARE_FOR_PIPELINING // NOTE(review): macro from aie_kernel_utils.h; presumably a loop-pipelining hint
+    AIE_LOOP_MIN_ITERATION_COUNT(16) // NOTE(review): presumably promises >= 16 iterations (vector_size >= 256); confirm
+    for (int i = 0; i < vector_size; i += 16) { // no tail handling: the last iteration always covers a full 16 lanes
+        vector<bfloat16, 16> input = *it_in++;
+        // Leaky RELU computed as max(x, alpha * x), e.g. with alpha = 0.01.
+        // This matches "x if x >= 0 else alpha * x" only for 0 <= alpha <= 1; outside that range max() picks the wrong branch.
+        vector<bfloat16, 16> alpha_times_input = aie::mul(input, alpha_vec);
+        vector<bfloat16, 16> output = aie::max(input, alpha_times_input);
+        *it_out++ = output; // store 16 results and advance the output iterator
+    }
+
+    event1(); // NOTE(review): presumably the matching marker for kernel end
+
+    return;
+}
+
+extern "C" { // C linkage: the function below is exported under the unmangled name leaky_relu_bf16
+// Entry point: forwards input, output, input_size and alpha unchanged to leaky_relu_vectorized_bf16.
+void leaky_relu_bf16(bfloat16 *restrict input, bfloat16 *restrict output, int input_size, bfloat16 alpha)
+{
+    leaky_relu_vectorized_bf16(input, output, input_size, alpha); // input_size becomes the callee's vector_size
+}
+
+} // extern "C"
diff --git a/iron/operators/leaky_relu/design.py b/iron/operators/leaky_relu/design.py
@@ -50,7 +50,7 @@ def my_leaky_relu(
     leaky_relu_fcn = Kernel(
         "leaky_relu_bf16",
         "leaky_relu.o",
-        [line_type, line_type, np.int32, np.dtype[xfr_dtype]],
+        [line_type, line_type, np.int32, xfr_dtype],
     )
 
     # Task for the core to perform
diff --git a/iron/operators/leaky_relu/test.py b/iron/operators/leaky_relu/test.py
@@ -25,7 +25,6 @@ def get_params():
 @pytest.mark.parametrize(
     "input_length,num_aie_columns,num_channels,tile_size,alpha", get_params()
 )
-@pytest.mark.skip(reason="Leaky ReLU is currently broken (#36)")
 @pytest.mark.metrics(
     Latency=r"Latency \(us\): (?P<value>[\d\.]+)",
     Bandwidth=r"Effective Bandwidth: (?P<value>[\d\.e\+-]+) GB/s",

Original file line number	Diff line number	Diff line change
`@@ -50,7 +50,7 @@ def my_leaky_relu(`
`50`	`50`	`leaky_relu_fcn = Kernel(`
`51`	`51`	`"leaky_relu_bf16",`
`52`	`52`	`"leaky_relu.o",`
`53`		`- [line_type, line_type, np.int32, np.dtype[xfr_dtype]],`
	`53`	`+ [line_type, line_type, np.int32, xfr_dtype],`
`54`	`54`	`)`
`55`	`55`
`56`	`56`	`# Task for the core to perform`
Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,6 @@ def get_params():`
`25`	`25`	`@pytest.mark.parametrize(`
`26`	`26`	`"input_length,num_aie_columns,num_channels,tile_size,alpha", get_params()`
`27`	`27`	`)`
`28`		`-@pytest.mark.skip(reason="Leaky ReLU is currently broken (#36)")`
`29`	`28`	`@pytest.mark.metrics(`
`30`	`29`	`Latency=r"Latency \(us\): (?P<value>[\d\.]+)",`
`31`	`30`	`Bandwidth=r"Effective Bandwidth: (?P<value>[\d\.e\+-]+) GB/s",`