diff --git a/tests/test-flash-decoding-custom-op.cpp b/tests/test-flash-decoding-custom-op.cpp
index f2246b363864b..6d9a2262e2f44 100644
--- a/tests/test-flash-decoding-custom-op.cpp
+++ b/tests/test-flash-decoding-custom-op.cpp
@@ -375,9 +375,9 @@ int main() {
     // Adjust fp16_window to fit within kv_len for this test
     size_t fp16_window = std::min((size_t)kv_len, (size_t)32);
     size_t quant_len = kv_len - fp16_window > 0 ? kv_len - fp16_window : 0;
-    size_t fp16_nb1 = head_dim * ggml_type_size(k->type);
-    size_t fp16_nb2 = fp16_window * fp16_nb1;
-    size_t fp16_nb3 = fp16_nb2 * n_kv_heads;
+    size_t fp16_nb1 = k->nb[1];
+    size_t fp16_nb2 = k->nb[2];
+    size_t fp16_nb3 = k->nb[3];
 
     size_t quant_nb1 = head_dim * ggml_type_size(k->type);
     size_t quant_nb2 = quant_len * quant_nb1;
@@ -398,80 +398,68 @@ int main() {
     // Create Q4_0 quantized tensors for k_quant and v_quant if we have quantized tokens
    if (quant_len > 0) {
        printf("Creating simple Q4_0 quantized tensors for %zu tokens\n", quant_len);
-
-        // Calculate total elements for the quantized portion
-        size_t total_elements = head_dim * quant_len * n_kv_heads;
-
-        // Create simple 1D tensors for quantization (based on successful test_unified_cache_copy.cpp example)
-        ggml_tensor * k_quant_src = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, total_elements);
-        ggml_tensor * v_quant_src = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, total_elements);
-        k_quant = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, total_elements);
-        v_quant = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, total_elements);
-
-        printf("Created 1D tensors: src=%zu elements, dst=%zu elements\n",
-               total_elements, total_elements);
-        printf("K_src: %zu bytes, K_quant: %zu bytes\n",
+
+        ggml_tensor * k_quant_src = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,
+                                                       head_dim, quant_len,
+                                                       n_kv_heads, 1);
+        ggml_tensor * v_quant_src = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,
+                                                       head_dim, quant_len,
+                                                       n_kv_heads, 1);
+
+        k_quant = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,
+                                     head_dim, quant_len,
+                                     n_kv_heads, 1);
+        v_quant = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,
+                                     head_dim, quant_len,
+                                     n_kv_heads, 1);
+
+        printf("K_quant_src bytes: %zu, K_quant bytes: %zu\n",
               ggml_nbytes(k_quant_src), ggml_nbytes(k_quant));
-
-        // Fill source tensors with data from the quantized portion (tokens fp16_window to fp16_window+quant_len)
-        ggml_fp16_t* k_src_data = (ggml_fp16_t*)k_quant_src->data;
-        ggml_fp16_t* v_src_data = (ggml_fp16_t*)v_quant_src->data;
-        ggml_fp16_t* k_orig_data = (ggml_fp16_t*)k->data;
-        ggml_fp16_t* v_orig_data = (ggml_fp16_t*)v->data;
-
-        // Copy data from the quantized portion to the 1D tensors
-        size_t idx = 0;
+
+        ggml_fp16_t * k_src_data = (ggml_fp16_t *)k_quant_src->data;
+        ggml_fp16_t * v_src_data = (ggml_fp16_t *)v_quant_src->data;
+        ggml_fp16_t * k_orig_data = (ggml_fp16_t *)k->data;
+        ggml_fp16_t * v_orig_data = (ggml_fp16_t *)v->data;
+
        for (size_t h = 0; h < n_kv_heads; h++) {
            for (size_t t = 0; t < quant_len; t++) {
                for (size_t d = 0; d < head_dim; d++) {
-                    // Source position: token (fp16_window + t) in original tensor
-                    size_t orig_idx = d + (fp16_window + t) * head_dim + h * head_dim * GGML_PAD(kv_len, n_pad);
-
-                    k_src_data[idx] = k_orig_data[orig_idx];
-                    v_src_data[idx] = v_orig_data[orig_idx];
-                    idx++;
+                    size_t orig_idx = d + (fp16_window + t) * head_dim +
+                                      h * head_dim * GGML_PAD(kv_len, n_pad);
+                    size_t dst_idx  = d + t * head_dim +
+                                      h * head_dim * quant_len;
+                    k_src_data[dst_idx] = k_orig_data[orig_idx];
+                    v_src_data[dst_idx] = v_orig_data[orig_idx];
                }
            }
        }
-
+
        printf("Data copy completed successfully\n");
-
-        // Use ggml_cpy to quantize the data from F16 to Q4_0 (based on successful example)
-        printf("Creating ggml_cpy operations...\n");
+
        ggml_tensor * k_quantize_op = ggml_cpy(ctx, k_quant_src, k_quant);
        ggml_tensor * v_quantize_op = ggml_cpy(ctx, v_quant_src, v_quant);
-
-        printf("ggml_cpy operations created successfully\n");
-
-        // Build quantization graph and execute it
-        printf("Building computation graph...\n");
+
        struct ggml_cgraph * graph_quantize = ggml_new_graph(ctx);
        ggml_build_forward_expand(graph_quantize, k_quantize_op);
        ggml_build_forward_expand(graph_quantize, v_quantize_op);
-
+
        printf("Computing quantization (F16 -> Q4_0)...\n");
        enum ggml_status status_quantize = ggml_graph_compute_with_ctx(ctx, graph_quantize, n_threads);
-
+
        if (status_quantize != GGML_STATUS_SUCCESS) {
            printf("ERROR: Quantization failed with status: %d\n", status_quantize);
            ggml_free(ctx);
            return 1;
        }
-
+
        printf("Quantization completed successfully\n");
-
-        // Now we need to create 4D views of our 1D quantized tensors for the flash attention
-        // Reshape the 1D quantized tensors back to 4D for flash attention compatibility
-        printf("Creating 4D views for flash attention...\n");
-
-        // For flash attention, we need 4D tensors with the correct shape
-        // We can't use ggml_view_4d on quantized tensors directly due to size constraints
-        // Instead, we'll work with the 1D tensors and let the flash attention handle the reshape
-
-        printf("K_quant final shape: 1D tensor with %ld elements, type: %s\n",
-               k_quant->ne[0], ggml_type_name(k_quant->type));
-        printf("V_quant final shape: 1D tensor with %ld elements, type: %s\n",
-               v_quant->ne[0], ggml_type_name(v_quant->type));
+
+        printf("K_quant final shape: [%ld, %ld, %ld, %ld] type: %s\n",
+               k_quant->ne[0], k_quant->ne[1], k_quant->ne[2], k_quant->ne[3],
+               ggml_type_name(k_quant->type));
+        printf("V_quant final shape: [%ld, %ld, %ld, %ld] type: %s\n",
+               v_quant->ne[0], v_quant->ne[1], v_quant->ne[2], v_quant->ne[3],
+               ggml_type_name(v_quant->type));
    } else {
        printf("No quantized tokens to create (quant_len = 0)\n");
    }
@@ -700,7 +688,7 @@ int main() {
    for (int h = 0; h < n_kv_heads; h++) {
        for (int s = 0; s < kv_len; s++) {
            for (int d = 0; d < head_dim; d++) {
-                int ggml_idx = d + s * head_dim + h * head_dim * kv_len;
+                int ggml_idx = d + s * head_dim + h * head_dim * GGML_PAD(kv_len, n_pad);
                int torch_idx = h * kv_len * head_dim + s * head_dim + d;
                // Convert F16 to F32
                k_torch_data[torch_idx] = ggml_fp16_to_fp32(((ggml_fp16_t*)k->data)[ggml_idx]);
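Note on the indexing fix in the last hunk: the F16 K/V cache stores each head's rows padded to a multiple of n_pad, so the per-head stride is GGML_PAD(kv_len, n_pad) * head_dim, not kv_len * head_dim. The standalone sketch below is not part of the patch; it uses hypothetical values for head_dim, kv_len, n_pad, and n_kv_heads, and pad_up is a stand-in for ggml's GGML_PAD macro. It shows how the unpadded index drifts off target for every head after the first:

#include <cstdio>
#include <cstddef>

// Stand-in for ggml's GGML_PAD(x, n): round x up to a multiple of n.
static size_t pad_up(size_t x, size_t n) { return ((x + n - 1) / n) * n; }

int main() {
    // Hypothetical cache geometry; the test derives these from its KV setup.
    const size_t head_dim   = 4;
    const size_t kv_len     = 6;
    const size_t n_pad      = 8;   // rows per head are padded to a multiple of this
    const size_t n_kv_heads = 2;

    const size_t padded_len = pad_up(kv_len, n_pad); // 8 rows per head, not 6

    for (size_t h = 0; h < n_kv_heads; h++) {
        for (size_t s = 0; s < kv_len; s++) {
            for (size_t d = 0; d < head_dim; d++) {
                // Unpadded stride: correct only for h == 0.
                size_t bad_idx  = d + s * head_dim + h * head_dim * kv_len;
                // Padded stride, as in the patched test.
                size_t good_idx = d + s * head_dim + h * head_dim * padded_len;
                if (bad_idx != good_idx) {
                    printf("h=%zu s=%zu d=%zu: unpadded=%zu padded=%zu (off by %zu)\n",
                           h, s, d, bad_idx, good_idx, good_idx - bad_idx);
                }
            }
        }
    }
    return 0;
}

With these values, every element of head 1 reads head_dim * (padded_len - kv_len) = 8 elements too early under the unpadded stride, which is exactly the mismatch the GGML_PAD change in the test removes.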