
Commit 818b783

Merge b53e764 into 130b6ea
2 parents (130b6ea + b53e764), merge commit 818b783

12 files changed: +455 −162 lines changed


applications/llama_3.2_1b/configs/llama32_1b.json

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
   "use_aie_residual": true,
   "use_aie_regular_mha": false,
   "use_aie_fused_mha": true,
-  "use_aie_final_gemm": false,
+  "use_aie_final_gemm": true,
   "rope_freq": {
     "factor": 32.0,
     "low_freq_factor": 1.0,

applications/llama_3.2_1b/inference.py

Lines changed: 1 addition & 1 deletion
@@ -400,7 +400,7 @@ def set_prefill_time():
     parser.add_argument(
         "--prompt_len",
         type=int,
-        default=64,
+        default=2048,
         help="Truncate prompt to this many tokens.",
     )
     parser.add_argument(

applications/llama_3.2_1b/src/block/gqa.py

Lines changed: 11 additions & 12 deletions
@@ -163,38 +163,37 @@ def forward(self, x, mask, angles, input_pos=None):
             # Decode phase with KV cache - use GEMV for single token
             # weight.T @ input, which is vector-matrix multiplication (So, is_mv=False)
             x_flat = x.reshape(1, -1)  # Shape: (1, d_in)
-            input_dtype = x.dtype
 
             queries_flat = self.aie_query_gemv(x_flat)
-            queries = queries_flat.reshape(b, num_tokens, self.d_out).to(input_dtype)
+            queries = queries_flat.reshape(b, num_tokens, self.d_out)
 
             keys_flat = self.aie_key_gemv(x_flat)
             keys = keys_flat.reshape(
                 b, num_tokens, self.num_kv_groups * self.head_dim
-            ).to(input_dtype)
+            )
 
             values_flat = self.aie_value_gemv(x_flat)
             values = values_flat.reshape(
                 b, num_tokens, self.num_kv_groups * self.head_dim
-            ).to(input_dtype)
+            )
 
         elif self.cfg["use_aie_attn_projection_gemm"]:
             # Prefill phase - use GEMM for multiple tokens
             x_flat = x.reshape(-1, d_in)
             input_dtype = x.dtype
 
             queries_flat = self.aie_query(x_flat)
-            queries = queries_flat.reshape(b, num_tokens, self.d_out).to(input_dtype)
+            queries = queries_flat.reshape(b, num_tokens, self.d_out)
 
             keys_flat = self.aie_key(x_flat)
             keys = keys_flat.reshape(
                 b, num_tokens, self.num_kv_groups * self.head_dim
-            ).to(input_dtype)
+            )
 
             values_flat = self.aie_value(x_flat)
             values = values_flat.reshape(
                 b, num_tokens, self.num_kv_groups * self.head_dim
-            ).to(input_dtype)
+            )
         else:
             queries = self.W_query(x)
             keys = self.W_key(x)
@@ -348,9 +347,9 @@ def apply_rope_and_transpose(tensor, num_heads_dim, angle_slice):
         def my_mha(queries, keys, values):
             inv_scale = 1 / np.sqrt(values.shape[-1])
             context_vec = torch.nn.functional.scaled_dot_product_attention(
-                queries.to(torch.bfloat16).to("cpu"),
-                keys.to(torch.bfloat16).to("cpu"),
-                values.to(torch.bfloat16).to("cpu"),
+                queries,
+                keys,
+                values,
                 dropout_p=0.0,
                 is_causal=True,
                 scale=inv_scale,
@@ -384,11 +383,11 @@ def my_mha(queries, keys, values):
         if self.cfg["use_kv_cache"] and is_decode and self.cfg["use_aie_gemv"]:
             context_vec_flat = context_vec.reshape(1, -1)
             output_flat = self.aie_out_proj_gemv(context_vec_flat)
-            context_vec = output_flat.reshape(b, num_tokens, self.d_out).to(input_dtype)
+            context_vec = output_flat.reshape(b, num_tokens, self.d_out)
         elif self.cfg["use_aie_attn_projection_gemm"]:
             context_vec_flat = context_vec.reshape(-1, self.d_out)
             output_flat = self.aie_out_proj(context_vec_flat)
-            context_vec = output_flat.reshape(b, num_tokens, self.d_out).to(input_dtype)
+            context_vec = output_flat.reshape(b, num_tokens, self.d_out)
         else:
             context_vec = self.out_proj(context_vec)

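Note on the deletions above: they drop the `.to(input_dtype)` casts after the AIE projections and the `.to(torch.bfloat16).to("cpu")` casts around `scaled_dot_product_attention`. A minimal standalone sketch (not repository code) of why removing a matching-dtype/device cast is behavior-preserving:

import torch

# Stand-in for an AIE projection output: already bfloat16 and already on the CPU.
x = torch.randn(1, 2048, dtype=torch.bfloat16)

# .to() with the tensor's current dtype/device is a no-op that returns the same object,
# so dropping such casts cannot change results when dtypes and devices already match.
assert x.to(torch.bfloat16) is x
assert x.to("cpu") is x

# scaled_dot_product_attention accepts bfloat16 inputs directly.
q = k = v = torch.randn(1, 8, 4, 64, dtype=torch.bfloat16)
out = torch.nn.functional.scaled_dot_product_attention(
    q, k, v, dropout_p=0.0, is_causal=True, scale=1 / (64 ** 0.5)
)
print(out.dtype)  # torch.bfloat16
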
applications/llama_3.2_1b/src/model_with_json.py

Lines changed: 79 additions & 16 deletions
@@ -12,7 +12,10 @@
 from pathlib import Path
 from src.block.transformer import TransformerBlock
 from operators.rope.rope_utils import compute_rope_params
-from operators import AIERMSNorm
+from operators import (
+    AIERMSNorm,
+    AIEGEMM,
+)
 from rich.console import Console
 from rich.text import Text
 
@@ -169,13 +172,37 @@ def __init__(
             self.cfg["emb_dim"], eps=1e-5, dtype=self.cfg["dtype"]
         )
 
-        # Depedns on use_aie_final_gemm
-        self.out_head = nn.Linear(
-            self.cfg["emb_dim"],
-            self.cfg["vocab_size"],
-            bias=False,
-            dtype=self.cfg["dtype"],
-        )
+        # Offload final linear layer if enabled
+        if self.cfg.get("use_aie_final_gemm", False):
+            # Since this GEMM has such a large N dimension, partition the N dimension by 4,
+            # and GEMM will execute for a workload of that smaller N dimension across different buffers of B and C
+            aie_config_prefill = {
+                "num_aie_columns": 8,
+                "tile_m": 64,
+                "tile_k": 64,
+                "tile_n": 64,
+                "b_col_maj": True,
+                "use_static_weight": True,
+                "separate_c_tiles": True,
+                "partition_N": 4,
+            }
+            if self.cfg["use_kv_cache"]:
+                M_for_gemm = self.prompt_length
+            else:
+                M_for_gemm = self.prompt_length + self.num_tokens
+            self.out_head = AIEGEMM(
+                M=M_for_gemm,
+                K=self.cfg["emb_dim"],
+                N=self.cfg["vocab_size"],
+                **aie_config_prefill,
+            )
+        else:
+            self.out_head = nn.Linear(
+                self.cfg["emb_dim"],
+                self.cfg["vocab_size"],
+                bias=False,
+                dtype=self.cfg["dtype"],
+            )
 
         # Reusable utilities
         cos, sin = compute_rope_params(
@@ -194,6 +221,22 @@ def forward(self, in_idx, input_pos=None, use_kv_cache=False):
         tok_embeds = self.tok_emb(in_idx)
         x = tok_embeds
 
+        # Check if input is a vector (decode phase) or matrix (prefill phase)
+        # Handle 1D: (emb_dim,), 2D: (1, emb_dim), or 3D: (1, 1, emb_dim)
+        is_vector = (
+            len(x.shape) == 1
+            or (len(x.shape) == 2 and x.shape[0] == 1)
+            or (len(x.shape) == 3 and x.shape[0] == 1 and x.shape[1] == 1)
+        )
+
+        # (batch, sequence, embedding) where sequence=1 indicates decode
+        if len(x.shape) == 3:
+            is_decode_with_kv = (x.shape[1] == 1) and self.cfg["use_kv_cache"]
+        elif len(x.shape) == 2:
+            is_decode_with_kv = (x.shape[0] == 1) and self.cfg["use_kv_cache"]
+        else:
+            is_decode_with_kv = False
+
         num_tokens = x.shape[1]
 
         # During generation phase with KV cache, don't create a mask
@@ -219,19 +262,39 @@ def forward(self, in_idx, input_pos=None, use_kv_cache=False):
         else:
             x = self.final_norm(x)
 
-        logits = self.out_head(x.to(self.cfg["dtype"]))
+        if is_decode_with_kv and self.cfg["use_aie_gemv"]:
+            # TODO: Offload to NPU
+            # logits = self.aie_out_head_gemv(x)
+            logits = self.out_head(x)
+        else:
+            logits = self.out_head(x)
 
         return logits
 
-    def assign_weights(self, final_norm):
+    def assign_weights(self, final_norm, out_head, out_head_name):
         if self.cfg.get("use_aie_final_norm", False):
            self.aie_final_norm_prefill.weight = final_norm
            if self.cfg["use_kv_cache"]:
                self.aie_final_norm_decode.weight = final_norm
-            return
+        else:
+            self.final_norm.weight = assign(
+                self.final_norm.weight,
+                final_norm,
+                f"model.norm.weight",
+            )
 
-        self.final_norm.weight = assign(
-            self.final_norm.weight,
-            final_norm,
-            f"model.norm.weight",
-        )
+        # TODO: Offload GEMV to NPU
+        # if self.cfg["use_kv_cache"] and self.cfg["use_aie_gemv"]:
+        #     self.aie_out_head_gemv.weight = out_head
+        if self.cfg["use_aie_final_gemm"]:
+            # Want column-major for B
+            self.out_head.weight = out_head.T
+            # TODO: Create separate linear layers for prefill and decode (with gemm/gemv)
+            # if self.cfg["use_kv_cache"]:
+            #     self.out_head.weight = out_head.T
+        else:
+            self.out_head.weight = assign(
+                self.out_head.weight,
+                out_head,
+                out_head_name,
+            )

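The forward pass now distinguishes decode from prefill by input shape before choosing the output-head path. A standalone sketch of the same shape rule (illustrative values only; Llama 3.2 1B's emb_dim is 2048), assuming the KV cache is enabled:

import torch

def is_decode_with_kv(x, use_kv_cache=True):
    # (batch, sequence, embedding): sequence length 1 means a single decode token
    if len(x.shape) == 3:
        return x.shape[1] == 1 and use_kv_cache
    # (sequence, embedding): a single row is one decode token
    if len(x.shape) == 2:
        return x.shape[0] == 1 and use_kv_cache
    return False

emb_dim = 2048
print(is_decode_with_kv(torch.empty(1, 1, emb_dim)))   # True  -> decode path
print(is_decode_with_kv(torch.empty(1, 64, emb_dim)))  # False -> prefill path
print(is_decode_with_kv(torch.empty(1, emb_dim)))      # True  -> decode path

With the KV cache enabled, only the prefill pass reaches the AIEGEMM head (the decode path is earmarked for a future GEMV offload in the TODOs above), which is presumably why M_for_gemm is sized to the prompt length in that case.
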
applications/llama_3.2_1b/src/utils.py

Lines changed: 2 additions & 10 deletions
@@ -126,18 +126,10 @@ def load_weights_into_llama(model, param_config, params):
     )
 
     # Load output layer weights
-    model.assign_weights(params["model.norm.weight"])
-
     if "lm_head.weight" in params.keys():
-        model.out_head.weight = assign(
-            model.out_head.weight, params["lm_head.weight"], "lm_head.weight"
-        )
+        model.assign_weights(params["model.norm.weight"], params["lm_head.weight"], "lm_head.weight")
     else:
-        model.out_head.weight = assign(
-            model.out_head.weight,
-            params["model.embed_tokens.weight"],
-            "model.embed_tokens.weight",
-        )
+        model.assign_weights(params["model.norm.weight"], params["model.embed_tokens.weight"], "model.embed_tokens.weight")
 
 
 def text_to_token_ids(text, tokenizer):

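The loader now routes both the final-norm and output-head tensors through `model.assign_weights`; the `else` branch covers tied-embedding checkpoints, where `model.embed_tokens.weight` stands in for a missing `lm_head.weight`. A toy sketch of that dispatch (hypothetical dicts and shapes, not the real checkpoint):

import torch

emb_dim, vocab = 16, 32
tied = {
    "model.norm.weight": torch.ones(emb_dim),
    "model.embed_tokens.weight": torch.randn(vocab, emb_dim),
}
untied = dict(tied, **{"lm_head.weight": torch.randn(vocab, emb_dim)})

for name, params in [("tied", tied), ("untied", untied)]:
    head_name = (
        "lm_head.weight" if "lm_head.weight" in params else "model.embed_tokens.weight"
    )
    print(name, "->", head_name)
# tied   -> model.embed_tokens.weight  (the embedding matrix doubles as the output head)
# untied -> lm_head.weight
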
operators/common/aie_base.py

Lines changed: 45 additions & 8 deletions
@@ -98,18 +98,48 @@ def prepare_runtime(cls):
             )
 
             # If multiple buffers (of the same binned size) are used in the
-            # same kernel invocation, they require separate allocations.
+            # same kernel invocation OR across different invocations with shared
+            # buffers, they require separate allocations.
             conflicting_buffers = {}  # map buffer -> {set of conflicting buffers}
-            for kernel, *args in op.runlist:
+            buffer_to_runlist_entries = {}  # map buffer -> set of runlist entry indices
+
+            # First pass: track which buffers appear in which runlist entries
+            for idx, (kernel, *args) in enumerate(op.runlist):
+                for arg in args:
+                    buffer_to_runlist_entries.setdefault(arg, set()).add(idx)
+
+            # Second pass: determine conflicts
+            for idx, (kernel, *args) in enumerate(op.runlist):
                 for arg in args:
                     if arg in op.buffer_static_data:
                         # Static buffers never conflict
                         continue
-                    # Conflict only exists if buffers are in the same size pool
                     pool_sz = get_pool_sz(op.buffers[arg])
+
+                    # Buffers conflict if they're in the same runlist entry
                     conflicting_args = {
                         a for a in args if get_pool_sz(op.buffers[a]) == pool_sz
                     } - {arg}
+
+                    # Also conflict with buffers in other runlist entries that share
+                    # a buffer with this entry
+                    for other_arg in args:
+                        if other_arg == arg:
+                            continue
+                        for other_idx in buffer_to_runlist_entries.get(
+                            other_arg, set()
+                        ):
+                            if other_idx != idx:
+                                _, *other_args = op.runlist[other_idx]
+                                conflicting_args.update(
+                                    {
+                                        a
+                                        for a in other_args
+                                        if get_pool_sz(op.buffers[a]) == pool_sz
+                                        and a != arg
+                                    }
+                                )
+
                     conflicting_buffers[arg] = conflicting_buffers.get(
                         arg, set()
                     ).union(conflicting_args)
@@ -244,12 +274,19 @@ def add_to_runlist(self, kernel_name, *args):
     def get_bo(self, buffer_name):
         return self.buffer_bos[buffer_name]
 
-    def read_buffer(self, buffer_name, shape, dtype=bfloat16):
+    def read_buffer(self, buffer_name, shape, copy=False, dtype=bfloat16):
         """Read buffer and return values as a numpy array"""
-        size = np.prod(shape) * np.dtype(dtype).itemsize
-        output_bytes = self.get_bo(buffer_name).read(size, 0)
-        output_data_flat = np.frombuffer(output_bytes, dtype=dtype)
-        return output_data_flat.reshape(*shape)
+        # Total bytes
+        size = int(np.prod(shape)) * np.dtype(dtype).itemsize
+
+        # Map once; map() should return a Python buffer interface over the BO
+        mv = self.get_bo(buffer_name).map()
+
+        # Create a NumPy view over mapped memory (zero-copy)
+        arr = np.frombuffer(mv, dtype=dtype, count=np.prod(shape))
+        if copy:
+            return arr.copy()
+        return arr.reshape(shape)
 
     def read_buffer_as_torch(self, buffer_name, shape, dtype=bfloat16):
         return numpy_to_torch(self.read_buffer(buffer_name, shape, dtype))

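The first hunk widens conflict detection: a non-static buffer now conflicts not only with same-pool buffers in its own runlist entry but also with same-pool buffers in any other entry reachable through a shared buffer. A standalone toy sketch of that rule (plain dicts with hypothetical names, no XRT objects):

# Toy runlist: (kernel_name, buffer_args...). Pool size keyed per buffer.
runlist = [("gemm", "a", "b"), ("gemv", "b", "c")]
pool_sz = {"a": 1024, "b": 1024, "c": 1024}
static = set()  # buffers with static data never conflict

entries_of = {}  # buffer -> indices of the runlist entries it appears in
for idx, (_, *args) in enumerate(runlist):
    for arg in args:
        entries_of.setdefault(arg, set()).add(idx)

conflicts = {}
for idx, (_, *args) in enumerate(runlist):
    for arg in args:
        if arg in static:
            continue
        # Same entry, same pool size.
        found = {a for a in args if pool_sz[a] == pool_sz[arg]} - {arg}
        # Other entries reached through a shared buffer, same pool size.
        for other in args:
            if other == arg:
                continue
            for other_idx in entries_of[other] - {idx}:
                _, *other_args = runlist[other_idx]
                found |= {a for a in other_args
                          if pool_sz[a] == pool_sz[arg] and a != arg}
        conflicts[arg] = conflicts.get(arg, set()) | found

print(conflicts)  # "a" conflicts with {"b", "c"}: entry 1 is reached via the shared "b"

The second hunk replaces the explicit BO read with a mapped view, so `read_buffer` now returns zero-copy data; the new `copy=True` flag appears to be the escape hatch for callers that must keep the values after the underlying buffer is reused.
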
operators/common/discover_tests.py

Lines changed: 2 additions & 3 deletions
@@ -93,11 +93,10 @@ def generate_test_list(operators_dir, output_dir=None, extensive=False):
         test_script = test_parts[0]
         test_args = " ".join(test_parts[1:]) if len(test_parts) > 1 else ""
 
-        # Wrap command to run from /tmp to avoid sys.path issues
         if test_args:
-            wrapped_command = f"cd /tmp && python3 {test_script} {test_args}"
+            wrapped_command = f"cd {output_dir} && python3 {test_script} {test_args}"
         else:
-            wrapped_command = f"cd /tmp && python3 {test_script}"
+            wrapped_command = f"cd {output_dir} && python3 {test_script}"
 
         # Generate test file content
         content = f"""run = '{wrapped_command}'

operators/common/utils.py

Lines changed: 27 additions & 8 deletions
@@ -23,14 +23,33 @@
 
 
 def torch_to_numpy(tensor: torch.Tensor) -> np.ndarray:
-    if tensor.dtype == torch.bfloat16:
-        float_arr = tensor.float().detach().cpu().numpy()
-        return float_arr.astype(bfloat16)
-    return tensor.detach().cpu().numpy()
+    # Detach (to drop grad) and ensure on CPU
+    t = tensor.detach()
+    if t.device.type != 'cpu':
+        t = t.cpu()
+    # Ensure contiguous for safe view operations
+    if not t.is_contiguous():
+        t = t.contiguous()
+
+    if t.dtype == torch.bfloat16:
+        # Zero-copy reinterpret: view the same memory as uint16, then as NumPy bfloat16
+        # This avoids numeric conversion and extra passes over memory.
+        u16_np = t.view(torch.uint16).numpy()  # shares memory, zero-copy
+        return u16_np.view(np.dtype('bfloat16'))  # reinterpret, zero-copy
+
+    # For supported dtypes, this is already zero-copy
+    return t.numpy()
 
 
 def numpy_to_torch(array: np.ndarray) -> torch.Tensor:
-    device = torch.device("cpu")
-    if array.dtype == bfloat16:
-        return torch.from_numpy(array.astype(np.float32)).to(torch.bfloat16).to(device)
-    return torch.from_numpy(array).to(device)
+    # Ensure contiguous to let from_numpy create a view
+    if not array.flags['C_CONTIGUOUS']:
+        array = np.ascontiguousarray(array)
+
+    if array.dtype == np.dtype('bfloat16'):
+        # reinterpret the same memory as uint16, then view as torch.bfloat16
+        t_u16 = torch.from_numpy(array.view(np.uint16))  # zero-copy
+        return t_u16.view(torch.bfloat16)  # view, zero-copy
+
+    # For supported dtypes, from_numpy is already zero-copy
+    return torch.from_numpy(array)

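Both converters now reinterpret the raw 16-bit payload instead of round-tripping through float32. A hedged round-trip check of the same view trick, assuming the NumPy bfloat16 dtype comes from the ml_dtypes package (the module's actual import may differ) and a PyTorch build that has torch.uint16 (2.3+):

import numpy as np
import torch
from ml_dtypes import bfloat16  # assumption: this package registers NumPy's bfloat16 dtype

t = torch.randn(4, 8, dtype=torch.bfloat16)

# torch -> numpy: view the same 2-byte payload as uint16, then as bfloat16 (no numeric conversion)
a = t.view(torch.uint16).numpy().view(bfloat16)

# numpy -> torch: the reverse reinterpretation
t2 = torch.from_numpy(a.view(np.uint16)).view(torch.bfloat16)

# Bit-exact round trip; the intermediate array shares memory with the original tensor
assert torch.equal(t, t2)
print(a.dtype, a.shape)  # bfloat16 (4, 8)
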