amd
diff --git a/‎applications/llama_3.2_1b/configs/llama32_1b.json‎
Lines changed: 4 additions & 3 deletions b/‎applications/llama_3.2_1b/configs/llama32_1b.json‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎applications/llama_3.2_1b/src/block/feed_forward.py‎
Lines changed: 45 additions & 8 deletions b/‎applications/llama_3.2_1b/src/block/feed_forward.py‎
Lines changed: 45 additions & 8 deletions
diff --git a/‎applications/llama_3.2_1b/src/compilation.py‎
Lines changed: 9 additions & 2 deletions b/‎applications/llama_3.2_1b/src/compilation.py‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎applications/llama_3.2_1b/src/model_with_json.py‎
Lines changed: 9 additions & 0 deletions b/‎applications/llama_3.2_1b/src/model_with_json.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎applications/llama_3.2_1b/src/operator/aie_base.py‎
Lines changed: 3 additions & 0 deletions b/‎applications/llama_3.2_1b/src/operator/aie_base.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎applications/llama_3.2_1b/src/operator/aie_elementwise_mul.py‎
Lines changed: 29 additions & 5 deletions b/‎applications/llama_3.2_1b/src/operator/aie_elementwise_mul.py‎
Lines changed: 29 additions & 5 deletions
@@ -12,9 +12,10 @@
     "rope_base": 500000.0,
     "dtype": "bfloat16",
     "use_aie_final_norm": true,
-    "use_aie_ffn_gemm": true,
-    "use_aie_ffn_silu": true,
-    "use_aie_ffn_mul": true,
+    "use_aie_ffn_gemm": false,
+    "use_aie_ffn_silu": false,
+    "use_aie_ffn_mul": false,
+    "use_aie_ffn_swiglu": true,
     "use_aie_attn_projection_gemm": true,
     "use_aie_rope": true,
     "use_aie_norm1": true,
 
@@ -13,6 +13,9 @@
 from src.operator.aie_gemm import AIEGEMM
 from src.operator.aie_gemv import AIEGEMV
 from src.operator.aie_silu import AIESiLU
+from src.operator.aie_swiglu_prefill import AIESwiGLUPrefill
+from src.operator.aie_swiglu_decode import AIESwiGLUDecode
+from ml_dtypes import bfloat16
 
 
 class FeedForward(nn.Module):
@@ -25,6 +28,16 @@ def __init__(
         super().__init__()
         self.cfg = cfg.copy()
 
+        assert (
+            cfg["use_aie_ffn_swiglu"]
+            and not (
+                cfg["use_aie_ffn_silu"]
+                or cfg["use_aie_ffn_gemm"]
+                or cfg["use_aie_ffn_mul"]
+            )
+            or not cfg["use_aie_ffn_swiglu"]
+        ), "Cannot mix fused SwiGLU with individual AIE operators."
+
         self.emb_dim = cfg["emb_dim"]
         self.hidden_dim = cfg["hidden_dim"]
 
@@ -36,10 +49,17 @@ def __init__(
         else:
             self.silu = nn.SiLU()
 
-        self.emb_dim = cfg["emb_dim"]
-        self.hidden_dim = cfg["hidden_dim"]
+        if self.cfg["use_aie_ffn_swiglu"]:
+            self.aie_swiglu_prefill = AIESwiGLUPrefill(
+                seq_len=prompt_length,
+                embedding_dim=self.emb_dim,
+                hidden_dim=self.hidden_dim,
+            )
+            if self.cfg["use_kv_cache"]:
+                self.aie_swiglu_decode = AIESwiGLUDecode(
+                    embedding_dim=self.emb_dim, hidden_dim=self.hidden_dim
+                )
 
-        # Initialize FFN up and down projections
         if self.cfg["use_aie_ffn_gemm"]:
             if self.cfg["use_kv_cache"]:
                 M_prefill = prompt_length
@@ -108,8 +128,15 @@ def forward(self, x):
             or (len(x.shape) == 3 and x.shape[0] == 1 and x.shape[1] == 1)
         )
 
+        is_prefill = not is_vector or not self.cfg["use_kv_cache"]
         is_decode_with_kv = is_vector and self.cfg["use_kv_cache"]
 
+        if self.cfg["use_aie_ffn_swiglu"]:
+            if is_prefill:
+                return self.aie_swiglu_prefill(x)
+            else:
+                return self.aie_swiglu_decode(x)
+
         if is_decode_with_kv and self.cfg["use_aie_gemv"]:
             x_fc1 = self.aie_fc1_gemv(x)
             x_fc2 = self.aie_fc2_gemv(x)
@@ -131,6 +158,21 @@ def forward(self, x):
             return self.fc3(x).view(original_shape)
 
     def assign_weights(self, l, fc1, fc2, fc3):
+        if self.cfg["use_kv_cache"] and self.cfg["use_aie_gemv"]:
+            self.aie_fc1_gemv.weight = fc1
+            self.aie_fc2_gemv.weight = fc2
+            self.aie_fc3_gemv.weight = fc3
+
+        if self.cfg["use_aie_ffn_swiglu"]:
+            self.aie_swiglu_prefill.weights_1 = fc1
+            self.aie_swiglu_prefill.weights_2 = fc2
+            self.aie_swiglu_prefill.weights_3 = fc3
+            if self.cfg["use_kv_cache"]:
+                self.aie_swiglu_decode.weights_1 = fc1
+                self.aie_swiglu_decode.weights_2 = fc2
+                self.aie_swiglu_decode.weights_3 = fc3
+            return
+
         self.fc1.weight = assign(
             self.fc1.weight,
             fc1,
@@ -146,8 +188,3 @@ def assign_weights(self, l, fc1, fc2, fc3):
             fc3,
             f"model.layers.{l}.mlp.down_proj.weight",
         )
-
-        if self.cfg["use_kv_cache"] and self.cfg["use_aie_gemv"]:
-            self.aie_fc1_gemv.weight = fc1
-            self.aie_fc2_gemv.weight = fc2
-            self.aie_fc3_gemv.weight = fc3
 
@@ -113,10 +113,13 @@ class SourceArtifact(CompilationArtifact):
 
 
 class XclbinArtifact(CompilationArtifact):
-    def __init__(self, path, depends, kernel_name="MLIR_AIE", extra_flags=None):
+    def __init__(
+        self, path, depends, kernel_name="MLIR_AIE", extra_flags=None, xclbin_input=None
+    ):
         super().__init__(path, depends)
         self.kernel_name = kernel_name
         self.extra_flags = extra_flags if extra_flags is not None else []
+        self.xclbin_input = xclbin_input
 
 
 class InstsBinArtifact(CompilationArtifact):
@@ -295,6 +298,10 @@ def compile(self, artifacts):
                     "--xclbin-name=" + str(first_xclbin.path),
                     "--xclbin-kernel-name=" + first_xclbin.kernel_name,
                 ]
+                if first_xclbin.xclbin_input is not None:
+                    compile_cmd += [
+                        "--xclbin-input=" + str(first_xclbin.xclbin_input.path)
+                    ]
             if do_compile_insts_bin:
                 first_insts_bin = mlir_sources_to_insts_bins[mlir_source][
                     0
@@ -414,7 +421,7 @@ def _rename_symbols(self, artifact):
         result = subprocess.run(cmd, capture_output=True, text=True)
 
         if result.returncode == 0:
-            logging.info(f"Successfully renamed symbols in: {artifact.path.name}")
+            logging.debug(f"Successfully renamed symbols in: {artifact.path.name}")
         else:
             raise RuntimeError(f"Symbol renaming failed: {result.stderr}")
 
 
@@ -37,6 +37,7 @@ def dtype_from_string(inp):
     "use_aie_ffn_gemm":             (bool,              False,         "[FFN] GEMM"),
     "use_aie_ffn_mul":              (bool,              False,         "[FFN] Elementwise Mul"),
     "use_aie_ffn_silu":             (bool,              False,         "[FFN] SiLU"),
+    "use_aie_ffn_swiglu":           (bool,              False,         "[FFN] Runlist-based SwiGLU"),
     "use_aie_residual":             (bool,              False,         "[Transformer] Residual Addition"),
     "use_aie_norm1":                (bool,              False,         "[Transformer] Pre Norm"),
     "use_aie_norm2":                (bool,              False,         "[Transformer] Post Norm"),
@@ -81,6 +82,14 @@ def format_option(name, value):
         dont_print |= {"use_aie_regular_mha"}
     else:
         dont_print |= {"use_aie_fused_mha"}
+    if cfg["use_aie_ffn_swiglu"]:
+        dont_print |= {
+            "use_aie_ffn_gemm",
+            "use_aie_ffn_mul",
+            "use_aie_ffn_silu",
+        }
+    else:
+        dont_print |= {"use_aie_ffn_swiglu"}
 
     console.print(
         "AIE Configuration ([green]✔[/green] = AIE NPU / [red]✘[/red] = CPU):",
 
@@ -60,6 +60,9 @@ def prepare_runtime(cls):
             cls.static_data_pool[buffer_data] = bo
 
         for op in cls.registered_operators:
+            if len(op.kernels) == 0:
+                # Operator is likely used as a sub-operator in another operator and does not need any setup.
+                continue
             logging.info(f"Preparing runtime for AIE operator: {op.__class__.__name__}")
 
             # Set up for each kernel
 
@@ -15,12 +15,21 @@
     PythonGeneratedMLIRArtifact,
 )
 from ..utils import torch_to_numpy, numpy_to_torch
+from pathlib import Path
 
 
 class AIEElementwiseMul(AIEOperatorBase):
     """AIE-accelerated element-wise multiplication"""
 
-    def __init__(self, size, num_columns=None, num_channels=None, tile_size=None):
+    def __init__(
+        self,
+        size,
+        num_columns=None,
+        num_channels=None,
+        tile_size=None,
+        trace_size=0,
+        do_set_up=True,
+    ):
         self.size = size
 
         # Enforce ShimDMA limits for elementwise_mul (uses 2 inputs per core)
@@ -37,12 +46,13 @@ def __init__(self, size, num_columns=None, num_channels=None, tile_size=None):
         self.num_columns = num_columns
         self.num_channels = num_channels
         self.tile_size = tile_size
+        self.trace_size = trace_size
+        self.do_set_up = do_set_up
 
         AIEOperatorBase.__init__(self)
 
-    def set_up(self):
-        # Compilation artifacts
-        file_name_base = f"mul_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t"
+    def get_artifacts(self, prefix="eltwise_mul_"):
+        file_name_base = f"{prefix}{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t"
 
         mlir_artifact = PythonGeneratedMLIRArtifact.new(
             f"{file_name_base}.mlir",
@@ -57,7 +67,7 @@ def set_up(self):
                 self.num_columns,
                 self.num_channels,
                 self.tile_size,
-                0,
+                self.trace_size,
             ],
         )
 
@@ -75,6 +85,20 @@ def set_up(self):
             f"{file_name_base}.bin", depends=[mlir_artifact]
         )
 
+        return xclbin_artifact, insts_artifact
+
+    def set_up(self):
+        # If this operator is only used as a sub-operator in another operator that sets it up, we should skip the setup here as those artifacts and buffers may not be needed.
+        if not self.do_set_up:
+            return
+
+        # Compilation artifacts
+        xclbin_artifact, insts_artifact = self.get_artifacts()
+
+        # Override device_type in the mlir_artifact's callback_args if needed
+        mlir_artifact = xclbin_artifact.depends[0]
+        mlir_artifact.callback_args[0] = self.device_manager.device_type
+
         artifacts = [xclbin_artifact, insts_artifact]
         self.add_artifacts(artifacts)