diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 5e65c58d2455e..dcaf028eb9aa6 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7322,9 +7322,9 @@ void ggml_compute_forward_flash_attn_ext_mixed( const int64_t q_head_start = kv_head * rk2; const int64_t q_head_end = q_head_start + rk2; - for (int64_t q_head = q_head_start; q_head < q_head_end; ++ q_head) { + for (int64_t q_head = q_head_start; q_head < q_head_end; ++ q_head) { for (int64_t q_pos = 0; q_pos < SEQ_LEN; ++ q_pos) { - float* mp = (float*) mask->data + q_pos * nek1; + float* mp = (float*) ((char *) mask->data + q_pos * mask->nb[1]); if (mp[kv_pos] == -INFINITY) { continue; } diff --git a/tests/test-flash-decoding-custom-op.cpp b/tests/test-flash-decoding-custom-op.cpp index 6181f028fdbbb..fee97347f98a9 100644 --- a/tests/test-flash-decoding-custom-op.cpp +++ b/tests/test-flash-decoding-custom-op.cpp @@ -57,7 +57,7 @@ static void fill_random_f32(ggml_tensor * dst, size_t n_rows, size_t n_cols, flo GGML_TENSOR_LOCALS(int64_t, nedst, dst, ne) float* data = (float*)dst->data; - size_t row_stride = nedst1; + size_t row_stride = nedst0; static std::random_device rd; static std::mt19937 gen(rd());