Skip to content

Commit 709cb99

Browse files
authored
Merge f3aeceb into 912e6bc
2 parents 912e6bc + f3aeceb commit 709cb99

File tree

4 files changed

+49
-3
lines changed

4 files changed

+49
-3
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ The IRON Python API for Ryzen™ AI NPUs is described in the following paper:
5959
| [Reduction]() | Reduction | bfloat16 | | | 🟡 | |
6060
| [Dequant](./aie_kernels/generic/expand.cc) | Dequant Q4NX from [AWQ](https://github.com/mit-han-lab/llm-awq) to bfloat16 | bfloat16 ||| 🟢 | [iron/operators/dequant/](./iron/operators/dequant/) |
6161
| [RELU](./aie_kernels/aie2/relu.cc) | RELU | bfloat16 ||| 🟢 | [iron/operators/relu/](./iron/operators/relu/) |
62-
| [Leaky RELU](./aie_kernels/aie2p/leaky_relu.cc) (WIP) | Leaky RELU kernel | bfloat16 | || | [iron/operators/leaky_relu/](./iron/operators/leaky_relu/) |
62+
| [Leaky RELU](./aie_kernels/aie2/leaky_relu.cc) | Leaky RELU | bfloat16 | || 🟢 | [iron/operators/leaky_relu/](./iron/operators/leaky_relu/) |
6363
| [GELU](./aie_kernels/aie2/gelu.cc) | GELU | bfloat16 ||| 🟢 | [iron/operators/gelu/](./iron/operators/gelu/) |
6464
| [LayerNorm](./aie_kernels/aie2/layer_norm.cc) | LayerNorm | bfloat16 ||| 🟢 | [iron/operators/layer_norm/](./iron/operators/layer_norm/) |
6565
| [Convolution]() | Convolution | bfloat16 | | | 🟡 | |

aie_kernels/aie2/leaky_relu.cc

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#include "../aie_kernel_utils.h"
5+
6+
#include <aie_api/aie.hpp>
7+
#include <stdint.h>
8+
9+
using namespace aie;
10+
11+
/**
 * Element-wise Leaky ReLU over a bfloat16 buffer, processed 16 lanes at a time.
 *
 * Every output element is computed as max(x, alpha * x). That expression
 * matches the usual Leaky ReLU definition (x when x > 0, alpha * x otherwise)
 * only while alpha <= 1; alpha is typically a small value such as 0.01.
 *
 * @param a           Input buffer, read one 16-element vector per iteration.
 * @param c           Output buffer, written one 16-element vector per iteration.
 * @param vector_size Number of elements to process. The loop always works in
 *                    whole 16-element vectors, so a count that is not a
 *                    multiple of 16 still reads and writes a full final vector.
 * @param alpha       Slope multiplied into the input before taking the max.
 */
void leaky_relu_vectorized_bf16(bfloat16 *restrict a,
                                bfloat16 *restrict c,
                                const int32_t vector_size,
                                const bfloat16 alpha)
{
    constexpr int kLanes = 16;
    using bf16_vec = aie::vector<bfloat16, kLanes>;

    // NOTE(review): event0()/event1() are declared outside this file;
    // presumably they bracket the kernel for tracing/profiling -- confirm.
    event0();

    auto src = aie::begin_restrict_vector<kLanes>(static_cast<bfloat16 *>(a));
    auto dst = aie::begin_restrict_vector<kLanes>(static_cast<bfloat16 *>(c));

    // Same slope in every lane, built once outside the loop.
    const bf16_vec slope = aie::broadcast<bfloat16, kLanes>(alpha);

    // NOTE(review): these macros are not defined in this file. The second one
    // appears to promise the compiler at least 16 loop iterations (that would
    // mean vector_size >= 256) -- confirm against the callers' tile sizes.
    AIE_PREPARE_FOR_PIPELINING
    AIE_LOOP_MIN_ITERATION_COUNT(16)
    for (int offset = 0; offset < vector_size; offset += kLanes) {
        const bf16_vec x = *src++;
        // Materialise alpha * x as a bfloat16 vector before comparing.
        const bf16_vec scaled = aie::mul(x, slope);
        *dst++ = aie::max(x, scaled);
    }

    event1();
}
39+
40+
extern "C" {
41+
42+
void leaky_relu_bf16(bfloat16 *restrict input, bfloat16 *restrict output, int input_size, bfloat16 alpha)
43+
{
44+
leaky_relu_vectorized_bf16(input, output, input_size, alpha);
45+
}
46+
47+
} // extern "C"

iron/operators/leaky_relu/design.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def my_leaky_relu(
5050
leaky_relu_fcn = Kernel(
5151
"leaky_relu_bf16",
5252
"leaky_relu.o",
53-
[line_type, line_type, np.int32, np.dtype[xfr_dtype]],
53+
[line_type, line_type, np.int32, xfr_dtype],
5454
)
5555

5656
# Task for the core to perform

iron/operators/leaky_relu/test.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ def get_params():
2525
@pytest.mark.parametrize(
2626
"input_length,num_aie_columns,num_channels,tile_size,alpha", get_params()
2727
)
28-
@pytest.mark.skip(reason="Leaky ReLU is currently broken (#36)")
2928
@pytest.mark.metrics(
3029
Latency=r"Latency \(us\): (?P<value>[\d\.]+)",
3130
Bandwidth=r"Effective Bandwidth: (?P<value>[\d\.e\+-]+) GB/s",

0 commit comments

Comments
 (0)