amd
diff --git a/aie_kernels/aie2p/softmax.cc b/aie_kernels/aie2p/softmax.cc
Lines changed: 8 additions & 0 deletions
diff --git a/aie_kernels/generic/mv.cc b/aie_kernels/generic/mv.cc
Lines changed: 11 additions & 14 deletions
diff --git a/conftest.py b/conftest.py
Lines changed: 3 additions & 1 deletion
diff --git a/iron/applications/llama_3.2_1b/src/block/feed_forward.py b/iron/applications/llama_3.2_1b/src/block/feed_forward.py
Lines changed: 1 addition & 1 deletion
diff --git a/iron/applications/llama_3.2_1b/src/block/gqa.py b/iron/applications/llama_3.2_1b/src/block/gqa.py
Lines changed: 0 additions & 1 deletion
diff --git a/iron/applications/llama_3.2_1b/src/block/transformer.py b/iron/applications/llama_3.2_1b/src/block/transformer.py
Lines changed: 0 additions & 2 deletions
diff --git a/iron/applications/llama_3.2_1b/src/model_with_json.py b/iron/applications/llama_3.2_1b/src/model_with_json.py
Lines changed: 0 additions & 2 deletions
diff --git a/iron/common/__init__.py b/iron/common/__init__.py
Lines changed: 11 additions & 3 deletions
diff --git a/iron/common/aie_base.py b/iron/common/aie_base.py
Lines changed: 0 additions & 229 deletions
@@ -177,4 +177,12 @@ void partial_softmax_bf16(bfloat16 *restrict input,
     partial_softmax_alias_bf16(input, output, scale_buffer, input_size, row_idx, num_rows, scale);
 }
 
+void mask_bf16(bfloat16 *inout, const int32 unmasked_size, const int32 total_size) // Overwrites inout[unmasked_size, total_size) with -infinity, in place
+{
+    // Elements below index unmasked_size are left untouched. TODO: Optimize this to use vector code
+    for (int32 i = unmasked_size; i < total_size; i++) { // runs zero times when unmasked_size >= total_size
+        inout[i] = (bfloat16)(-INFINITY); // NOTE(review): presumably so a later softmax gives these entries zero weight - confirm against caller
+    }
+}
+
 } // extern "C"
@@ -15,6 +15,10 @@
 
 #include <aie_api/aie.hpp>
 
+#ifndef VEC_SIZE
+#define VEC_SIZE 64
+#endif
+
 void matvec_scalar(uint32_t m,
                    uint32_t k,
                    const bfloat16 *__restrict a,
@@ -40,22 +44,17 @@ Matrix-vector multiplication kernel
  - c: Pointer to the output vector
  - r: Vector size; data from the matrix and vector will be loaded in and processed in chunks of this size
 */
-template <uint32_t r>
-void matvec_vectorized(uint32_t m,
-                       uint32_t k,
-                       const bfloat16 *__restrict a,
-                       const bfloat16 *__restrict b,
-                       bfloat16 *__restrict c)
+template <uint32_t r, uint32_t k>
+void matvec_vectorized(uint32_t m, const bfloat16 *__restrict a, const bfloat16 *__restrict b, bfloat16 *__restrict c)
 {
     ::aie::set_rounding(aie::rounding_mode::conv_even);
     bfloat16 *c_end = c + m;
     const bfloat16 *b_end = b + k;
     for (; c < c_end; c++) {
         aie::accum acc = aie::zeros<accfloat, r>();
-        // The following two pragmas enable pipelining the zero-overhead loop, but they do assume that k is at least
-        // two. This assumption should hold for any useful use of this function; if k were one, this would be a simple
-        // scalar multiplication of a vector.
-        AIE_LOOP_MIN_ITERATION_COUNT(2)
+        // The following pragma enables pipelining the zero-overhead loop, but it does assume that there are at
+        // least two iterations of the loop, i.e. k >= 2*r. This pragma will break the code if that is not the case!
+        AIE_LOOP_MIN_ITERATION_COUNT(k / VEC_SIZE)
         for (const bfloat16 *__restrict b_cur = b; b_cur < b_end; b_cur += r, a += r) {
             aie::vector<bfloat16, r> a_vec = aie::load_v<r>(a);
             aie::vector<bfloat16, r> b_vec = aie::load_v<r>(b_cur);
@@ -72,25 +71,23 @@ extern "C" {
  * `c`.  */
 
 void matvec_scalar_bf16_bf16(uint32_t m,
-                             uint32_t k,
                              uint32_t row_offset,
                              const bfloat16 *__restrict a_in,
                              const bfloat16 *__restrict b_in,
                              bfloat16 *__restrict c_out)
 {
     c_out += row_offset;
-    matvec_scalar(m, k, a_in, b_in, c_out);
+    matvec_scalar(m, DIM_K, a_in, b_in, c_out);
 }
 
 void matvec_vectorized_bf16_bf16(uint32_t m,
-                                 uint32_t k,
                                  uint32_t row_offset,
                                  const bfloat16 *__restrict a_in,
                                  const bfloat16 *__restrict b_in,
                                  bfloat16 *__restrict c_out)
 {
     c_out += row_offset;
-    matvec_vectorized<64>(m, k, a_in, b_in, c_out);
+    matvec_vectorized<VEC_SIZE, DIM_K>(m, a_in, b_in, c_out);
 }
 
 } // extern "C"
@@ -16,7 +16,9 @@
 @pytest.fixture
 def aie_context():
     """Create a fresh AIEContext for each test"""
-    return AIEContext()
+    ctx = AIEContext()
+    yield ctx
+    ctx.device_manager.reset()
 
 
 def pytest_addoption(parser):
 
@@ -116,7 +116,7 @@ def __init__(
             )
 
         if self.cfg["use_kv_cache"] and self.cfg["use_aie_ffn_gemv"]:
-            aie_gemv_config = {"num_aie_columns": 8, "is_mv": False}
+            aie_gemv_config = {"num_aie_columns": 8}
             # FC1 and FC2: emb_dim -> hidden_dim
             self.aie_fc1_gemv = AIEGEMV(
                 M=self.hidden_dim,
 
@@ -133,7 +133,6 @@ def __init__(
 
             aie_gemv_config = {
                 "num_aie_columns": 8,
-                "is_mv": False,
                 "use_static_weight": True,
             }
             self.aie_query_gemv = AIEGEMV(
 
@@ -104,7 +104,6 @@ def __init__(
             self.aie_residual_add_prefill = AIEElementwiseAdd(
                 size=max_prefill_size,
                 num_aie_columns=8,
-                num_channels=2,
                 tile_size=cfg["emb_dim"],
             )
 
@@ -114,7 +113,6 @@ def __init__(
                 self.aie_residual_add_decode = AIEElementwiseAdd(
                     size=decode_size,
                     num_aie_columns=1,
-                    num_channels=2,
                     tile_size=cfg["emb_dim"],
                 )
             else:
 
@@ -197,9 +197,7 @@ def __init__(
             )
             aie_gemv_config = {
                 "num_aie_columns": 8,
-                "is_mv": True,
                 "use_static_weight": True,
-                "num_aie_columns": 8,
                 "tile_size_input": 4,
                 "tile_size_output": 32,
             }
 
@@ -3,8 +3,16 @@
 
 """Common utilities and base classes for IRON operators."""
 
-from .aie_base import AIEOperatorBase, AIEOperatorConstraintError
-from .aie_context import AIEContext
+from .base import (
+    AIEOperatorBase,
+    MLIROperator,
+    CompositeOperator,
+    CompositeCallable,
+    AIEBuffer,
+    SingleXclbinCallable,
+    AIERuntimeArgSpec,
+)
+from .context import AIEContext
 from .compilation import (
     XclbinArtifact,
     InstsBinArtifact,
@@ -13,4 +21,4 @@
     SourceArtifact,
     PythonGeneratedMLIRArtifact,
 )
-from .aie_device_manager import AIEDeviceManager
+from .device_manager import AIEDeviceManager
Original file line number	Diff line number	Diff line change
`@@ -116,7 +116,7 @@ def __init__(`
`116`	`116`	`)`
`117`	`117`
`118`	`118`	`if self.cfg["use_kv_cache"] and self.cfg["use_aie_ffn_gemv"]:`
`119`		`- aie_gemv_config = {"num_aie_columns": 8, "is_mv": False}`
	`119`	`+ aie_gemv_config = {"num_aie_columns": 8}`
`120`	`120`	`# FC1 and FC2: emb_dim -> hidden_dim`
`121`	`121`	`self.aie_fc1_gemv = AIEGEMV(`
`122`	`122`	`M=self.hidden_dim,`
Original file line number	Diff line number	Diff line change
`@@ -133,7 +133,6 @@ def __init__(`
`133`	`133`
`134`	`134`	`aie_gemv_config = {`
`135`	`135`	`"num_aie_columns": 8,`
`136`		`- "is_mv": False,`
`137`	`136`	`"use_static_weight": True,`
`138`	`137`	`}`
`139`	`138`	`self.aie_query_gemv = AIEGEMV(`
Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,6 @@ def __init__(`
`104`	`104`	`self.aie_residual_add_prefill = AIEElementwiseAdd(`
`105`	`105`	`size=max_prefill_size,`
`106`	`106`	`num_aie_columns=8,`
`107`		`- num_channels=2,`
`108`	`107`	`tile_size=cfg["emb_dim"],`
`109`	`108`	`)`
`110`	`109`
`@@ -114,7 +113,6 @@ def __init__(`
`114`	`113`	`self.aie_residual_add_decode = AIEElementwiseAdd(`
`115`	`114`	`size=decode_size,`
`116`	`115`	`num_aie_columns=1,`
`117`		`- num_channels=2,`
`118`	`116`	`tile_size=cfg["emb_dim"],`
`119`	`117`	`)`
`120`	`118`	`else:`
Original file line number	Diff line number	Diff line change
`@@ -197,9 +197,7 @@ def __init__(`
`197`	`197`	`)`
`198`	`198`	`aie_gemv_config = {`
`199`	`199`	`"num_aie_columns": 8,`
`200`		`- "is_mv": True,`
`201`	`200`	`"use_static_weight": True,`
`202`		`- "num_aie_columns": 8,`
`203`	`201`	`"tile_size_input": 4,`
`204`	`202`	`"tile_size_output": 32,`
`205`	`203`	`}`