diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 5e65c58d2455e..dcaf028eb9aa6 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -7322,9 +7322,9 @@ void ggml_compute_forward_flash_attn_ext_mixed(
             const int64_t q_head_start = kv_head * rk2;
             const int64_t q_head_end   = q_head_start + rk2;
 
-            for (int64_t q_head = q_head_start; q_head < q_head_end; ++ q_head) {
+                for (int64_t q_head = q_head_start; q_head < q_head_end; ++ q_head) {
                 for (int64_t q_pos = 0; q_pos < SEQ_LEN; ++ q_pos) {
-                    float* mp = (float*) mask->data + q_pos * nek1;
+                    float* mp = (float*) ((char *) mask->data + q_pos * mask->nb[1]);
                     if (mp[kv_pos] == -INFINITY) {
                         continue;
                     }
diff --git a/tests/test-flash-decoding-custom-op.cpp b/tests/test-flash-decoding-custom-op.cpp
index 6181f028fdbbb..fee97347f98a9 100644
--- a/tests/test-flash-decoding-custom-op.cpp
+++ b/tests/test-flash-decoding-custom-op.cpp
@@ -57,7 +57,7 @@ static void fill_random_f32(ggml_tensor * dst, size_t n_rows, size_t n_cols, flo
     GGML_TENSOR_LOCALS(int64_t, nedst, dst, ne)
 
     float* data = (float*)dst->data;
-    size_t row_stride = nedst1;
+    size_t row_stride = nedst0;
 
     static std::random_device rd;
     static std::mt19937 gen(rd());