From cd3f7db61502603d5baa2f2f0d7891a618e5a555 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 4 May 2026 20:47:09 +0000
Subject: [PATCH 01/10] Upgrade llama.cpp from b8953 to b8962

Breaking changes in b8962 that affect this project:
- task_params::to_json() drops speculative.n_max/n_min/p_min from output;
  only speculative.type remains. Update test_server.cpp accordingly.

Breaking changes in b8962 that don't affect project code directly:
- struct cpu_params renamed to common_cpu_params (and related functions)
- common_params_speculative restructured with nested sub-structs (.draft.*,
  .ngram_cache.*, .ngram_mod.*, etc.)
- common_arg::is_sparam split into is_sampling + is_spec

New in b8962:
- common_speculative_n_max() / common_speculative_n_min() public API
- CANN backend: fused SwiGLU/GeGLU, softplus, set, cumsum, diag, fill,
  tri, solve_tri ops; improved L2 norm, cross entropy, get/set_rows
- Vulkan: timestamp query sync fix
- WebGPU: Q1_0 quantization support; SSM scan x/B/C overlap handling

https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk
---
 CLAUDE.md                    | 9 +++++++--
 CMakeLists.txt               | 2 +-
 README.md                    | 2 +-
 src/test/cpp/test_server.cpp | 6 +++---
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 0d787ea2..2eeb875a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI.
 
-Current llama.cpp pinned version: **b8953**
+Current llama.cpp pinned version: **b8962**
 
 ## Upgrading CUDA Version
 
@@ -183,7 +183,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
 `ggml/include/ggml.h`, `ggml/include/ggml-backend.h`, `ggml/include/ggml-opt.h`,
 `ggml-alloc.h`, `ggml-cpu.h`, `peg-parser.h`, `base64.hpp`
 
-**Known breaking changes by version range** (b5022 → b8953):
+**Known breaking changes by version range** (b5022 → b8962):
 
 | Version | File | Change |
 |---------|------|--------|
@@ -218,6 +218,11 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
 | ~b8913–b8953 | `tools/server/server-http.h` | New `uploaded_file` struct; `files` map type changed from `map<string, raw_buffer>` to `map<string, uploaded_file>`; upstream server sources compiled directly — no project impact |
 | ~b8913–b8953 | `src/llama-quant.cpp` | Default quantization ftype changed from `LLAMA_FTYPE_MOSTLY_Q5_1` to `LLAMA_FTYPE_MOSTLY_Q8_0`; upstream only |
 | ~b8913–b8953 | `src/models/llama.cpp`, `qwen3.cpp`, `qwen3moe.cpp` | Removed duplicate `ggml_mul` for `wo_s` scale (now handled exclusively by `build_attn`); upstream only |
+| ~b8953–b8962 | `common/common.h` | `struct cpu_params` → `struct common_cpu_params`; `cpu_get_num_physical_cores()` → `common_cpu_get_num_physical_cores()`; `cpu_get_num_math()` → `common_cpu_get_num_math()`; not used directly by project |
+| ~b8953–b8962 | `common/common.h` | `common_params_speculative` fully restructured with nested sub-structs: `.mparams_dft`/`.model_dft`/`.cparams_dft`/`.n_max`/`.n_min`/`.p_split`/`.p_min` → `.draft.mparams`/`.draft.model`/`.draft.cparams`/`.draft.n_max`/`.draft.n_min`/`.draft.p_split`/`.draft.p_min`; ngram fields moved to `.ngram_cache`/`.ngram_mod`/`.ngram_simple`/etc sub-structs; not referenced by project directly |
+| ~b8953–b8962 | `common/arg.h` | `is_sparam` bool split into `is_sampling` + `is_spec`; `set_sparam()` split into `set_sampling()` + `set_spec()`; not used by project |
+| ~b8953–b8962 | `tools/server/server-task.cpp` | `task_params::to_json()` drops `"speculative.n_max"`, `"speculative.n_min"`, `"speculative.p_min"` from output; only `"speculative.type"` remains; test `SlotParamsToJson.SpeculativeFields_Present` updated accordingly |
+| ~b8953–b8962 | `common/speculative.h` | New public API: `common_speculative_n_max()` and `common_speculative_n_min()` added; server-context.cpp uses these instead of direct field access; no project changes required |
 
 ## Build Commands
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d7213b7..c597272d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,7 +97,7 @@ set(GGML_AVX512  OFF CACHE BOOL "" FORCE)
 FetchContent_Declare(
 	llama.cpp
 	GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-	GIT_TAG        b8953
+	GIT_TAG        b8962
 )
 FetchContent_MakeAvailable(llama.cpp)
 
diff --git a/README.md b/README.md
index bd2e05e0..e9abd56d 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational)
-[![llama.cpp b8953](https://img.shields.io/badge/llama.cpp-%23b8953-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8953)
+[![llama.cpp b8962](https://img.shields.io/badge/llama.cpp-%23b8962-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8962)
 
 # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)
 
diff --git a/src/test/cpp/test_server.cpp b/src/test/cpp/test_server.cpp
index 82801deb..ee56d790 100644
--- a/src/test/cpp/test_server.cpp
+++ b/src/test/cpp/test_server.cpp
@@ -243,9 +243,9 @@ TEST(SlotParamsToJson, SpeculativeFields_Present) {
     task_params p;
     const json j = p.to_json();
 
-    EXPECT_TRUE(j.contains("speculative.n_max"));
-    EXPECT_TRUE(j.contains("speculative.n_min"));
-    EXPECT_TRUE(j.contains("speculative.p_min"));
+    // b8962: only speculative.type is serialised; n_max/n_min/p_min are
+    // input-only (consumed by params_from_json_cmpl, not emitted by to_json)
+    EXPECT_TRUE(j.contains("speculative.type"));
 }
 
 TEST(SlotParamsToJson, GrammarTriggers_IsArrayByDefault) {

From 24be512027d4381273553c555ceea14a6a785570 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 4 May 2026 20:53:39 +0000
Subject: [PATCH 02/10] Upgrade llama.cpp from b8962 to b8982
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Breaking changes in b8982 that don't affect project code directly:
- common_sampler_accept: 3rd param renamed accept_grammar → is_generated;
  semantics broadened so false also skips reasoning budget update
- common_reasoning_budget_init: two overloads merged; prefill_tokens param
  removed; callers feed prefill via llama_sampler_accept() loop after init
- ggml_cuda_op_ssm_conv: new optional bias_add_node param; SSM_CONV+ADD+SILU
  CUDA fusion now supported
- speculative.cpp: p_min confidence check moved before result push (fix:
  low-confidence draft tokens now discarded entirely, not appended then ignored)
- server-context.cpp: n_draft_total accounting moved to generation site (fix)

New in b8982:
- Reasoning budget re-arms on subsequent <think> tags (multi-block support)
- CUDA: flash attention for DKQ=320/DV=256 (Mistral Small 4, GQA=32)
- CUDA: fused SSM_CONV + channel-wise bias ADD + SiLU kernel
- CUDA: NVFP4 native Blackwell MMQ path (unified with MXFP4 via template)
- CUDA: quantize_mmq_fp4_cuda replaces quantize_mmq_mxfp4_cuda (covers both)
- ARM: SVE Q8_0 4x8 GEMM kernel for 256-bit SVE with MATMUL_INT8
- PPC: big-endian / AIX tinyBLAS fallback path
- Vulkan: Q4_K scale extraction rewritten via packed uint32 reads (bug fix)
- WebGPU: flash-attn NONE path guard; subgroup-matrix path gated on capability
- ggml: version patch bumped to 0.10.1; backend-meta AllReduce delay fix;
  RISCV SpacemiT xsmtvdotii extension support
- common/log: singleton intentionally leaked to avoid Windows DLL teardown hang

https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk
---
 CLAUDE.md      | 9 +++++++--
 CMakeLists.txt | 2 +-
 README.md      | 2 +-
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 2eeb875a..27d6caa3 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI.
 
-Current llama.cpp pinned version: **b8962**
+Current llama.cpp pinned version: **b8982**
 
 ## Upgrading CUDA Version
 
@@ -183,7 +183,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
 `ggml/include/ggml.h`, `ggml/include/ggml-backend.h`, `ggml/include/ggml-opt.h`,
 `ggml-alloc.h`, `ggml-cpu.h`, `peg-parser.h`, `base64.hpp`
 
-**Known breaking changes by version range** (b5022 → b8962):
+**Known breaking changes by version range** (b5022 → b8982):
 
 | Version | File | Change |
 |---------|------|--------|
@@ -223,6 +223,11 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
 | ~b8953–b8962 | `common/arg.h` | `is_sparam` bool split into `is_sampling` + `is_spec`; `set_sparam()` split into `set_sampling()` + `set_spec()`; not used by project |
 | ~b8953–b8962 | `tools/server/server-task.cpp` | `task_params::to_json()` drops `"speculative.n_max"`, `"speculative.n_min"`, `"speculative.p_min"` from output; only `"speculative.type"` remains; test `SlotParamsToJson.SpeculativeFields_Present` updated accordingly |
 | ~b8953–b8962 | `common/speculative.h` | New public API: `common_speculative_n_max()` and `common_speculative_n_min()` added; server-context.cpp uses these instead of direct field access; no project changes required |
+| ~b8962–b8982 | `common/sampling.h` | `common_sampler_accept` 3rd param renamed `accept_grammar` → `is_generated`; semantics broadened: `false` now also skips reasoning budget update (not just grammar); no project call sites affected |
+| ~b8962–b8982 | `common/reasoning-budget.h` | Two `common_reasoning_budget_init` overloads merged: `prefill_tokens` variant removed; new single overload takes `initial_state = REASONING_BUDGET_IDLE`; prefill now fed via `llama_sampler_accept()` loop after init; not called directly by project |
+| ~b8962–b8982 | `ggml/src/ggml-cuda/ssm-conv.cuh` | `ggml_cuda_op_ssm_conv` gained optional `bias_add_node` param; `SSM_CONV + ADD + SILU` fusion now supported; internal CUDA code, no project changes required |
+| ~b8962–b8982 | `common/speculative.cpp` | Draft token confidence check (`p_min`) moved before push to result: low-confidence tokens are now discarded entirely rather than included then ignored; behavior fix, no project changes required |
+| ~b8962–b8982 | `tools/server/server-context.cpp` | `n_draft_total` accounting moved to draft generation site instead of acceptance site (bug fix); upstream only |
 
 ## Build Commands
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c597272d..08c7e423 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,7 +97,7 @@ set(GGML_AVX512  OFF CACHE BOOL "" FORCE)
 FetchContent_Declare(
 	llama.cpp
 	GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-	GIT_TAG        b8962
+	GIT_TAG        b8982
 )
 FetchContent_MakeAvailable(llama.cpp)
 
diff --git a/README.md b/README.md
index e9abd56d..370ce1c5 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational)
-[![llama.cpp b8962](https://img.shields.io/badge/llama.cpp-%23b8962-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8962)
+[![llama.cpp b8982](https://img.shields.io/badge/llama.cpp-%23b8982-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8982)
 
 # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)
 

From e701e036eebecb4b96e0a25be872c0afe68a0cd0 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 4 May 2026 20:58:07 +0000
Subject: [PATCH 03/10] Upgrade llama.cpp from b8982 to b8994
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No project C++ changes required. Key upstream changes:
- CUDA: fixed swapped get/set_tensor_2d_async function pointers
- Vulkan: added dpitch param to buffer write 2d, implements set/get_tensor_2d
- speculative.cpp: checkpoint helpers renamed (draft_ prefix removed), ckpt_size removed
- arg.cpp: CLI option typo fixed: --spec--draft-p-split → --spec-draft-p-split
- mmap: Windows >2 GB file fix using _ftelli64/_fseeki64
- httplib: bumped to v0.43.2 (Windows FILE_SHARE_WRITE, DNS cancel, mbedTLS fixes)
- server-context: LLAMA_TRACE env variable for slot acceptance tracing
All 413 C++ tests pass.

https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk
---
 CLAUDE.md      | 9 ++++++++-
 CMakeLists.txt | 2 +-
 README.md      | 2 +-
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 27d6caa3..bbb344ff 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI.
 
-Current llama.cpp pinned version: **b8982**
+Current llama.cpp pinned version: **b8994**
 
 ## Upgrading CUDA Version
 
@@ -228,6 +228,13 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
 | ~b8962–b8982 | `ggml/src/ggml-cuda/ssm-conv.cuh` | `ggml_cuda_op_ssm_conv` gained optional `bias_add_node` param; `SSM_CONV + ADD + SILU` fusion now supported; internal CUDA code, no project changes required |
 | ~b8962–b8982 | `common/speculative.cpp` | Draft token confidence check (`p_min`) moved before push to result: low-confidence tokens are now discarded entirely rather than included then ignored; behavior fix, no project changes required |
 | ~b8962–b8982 | `tools/server/server-context.cpp` | `n_draft_total` accounting moved to draft generation site instead of acceptance site (bug fix); upstream only |
+| ~b8982–b8994 | `ggml/src/ggml-cuda/ggml-cuda.cu` | `ggml_backend_cuda_i` struct: `.get_tensor_2d_async` and `.set_tensor_2d_async` function pointers were swapped (get pointed to set impl and vice versa); now corrected; internal CUDA backend, no project changes required |
+| ~b8982–b8994 | `ggml/src/ggml-vulkan/ggml-vulkan.cpp` | `ggml_vk_buffer_write_2d_async` and `ggml_vk_buffer_write_2d` gained a `dpitch` parameter; Vulkan now implements `set_tensor_2d`/`get_tensor_2d` in buffer interface; internal backend code, no project changes required |
+| ~b8982–b8994 | `common/speculative.cpp` | Checkpoint helpers renamed: `draft_create_checkpoint` → `create_checkpoint`, `draft_restore_checkpoint` → `restore_checkpoint`; `ckpt_size` field removed (size computed from context directly); internal speculative module, not called by project |
+| ~b8982–b8994 | `common/arg.cpp` | CLI option typo fixed: `--spec--draft-p-split` → `--spec-draft-p-split` (extra dash removed); CLI-only, no project changes required |
+| ~b8982–b8994 | `src/llama-mmap.cpp` | Windows large-file (>2 GB) fix: `ftell`/`fseek` replaced with `_ftelli64`/`_fseeki64`; upstream only |
+| ~b8982–b8994 | `tools/server/httplib.h` | cpp-httplib bumped to v0.43.2: Windows `FILE_SHARE_WRITE` fix, Linux DNS cancel race fix, mbedTLS `close_notify` fix; upstream server header, no project changes required |
+| ~b8982–b8994 | `tools/server/server-context.cpp` | New `LLAMA_TRACE` env variable enables slot acceptance tracing; upstream only |
 
 ## Build Commands
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 08c7e423..cdeb9695 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,7 +97,7 @@ set(GGML_AVX512  OFF CACHE BOOL "" FORCE)
 FetchContent_Declare(
 	llama.cpp
 	GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-	GIT_TAG        b8982
+	GIT_TAG        b8994
 )
 FetchContent_MakeAvailable(llama.cpp)
 
diff --git a/README.md b/README.md
index 370ce1c5..ab0b5514 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational)
-[![llama.cpp b8982](https://img.shields.io/badge/llama.cpp-%23b8982-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8982)
+[![llama.cpp b8994](https://img.shields.io/badge/llama.cpp-%23b8994-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8994)
 
 # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)
 

From 78f6b1ffb458adc82773dee07f2444a1293e689a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 4 May 2026 21:01:19 +0000
Subject: [PATCH 04/10] Upgrade llama.cpp from b8994 to b9004
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No project C++ changes required. All 413 C++ unit tests pass.

b8994→b9004 upstream changes (no project impact):
- Vulkan FA: separate k_type/v_type params in coopmat2 pipeline; CREATE_FA_CM2_MIXED macro; new spec constants 12-15 (FaTypeK/FaTypeV/FaBlockBytesK/FaBlockBytesV); DECODEFUNC/NEEDS_INIT_IQ_SHMEM macros removed
- WebGPU: vectorized mul_mat condition fix (removed dst->ne[1] % 4 == 0 guard)
- Hexagon HTP: FA exp2 half-precision option; unary-op non-contiguous tensor fix
- webUI: major Svelte/TypeScript component reorganization (no C++ impact)

https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk
---
 CLAUDE.md      | 8 ++++++--
 CMakeLists.txt | 2 +-
 README.md      | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index bbb344ff..3cc3ca20 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI.
 
-Current llama.cpp pinned version: **b8994**
+Current llama.cpp pinned version: **b9004**
 
 ## Upgrading CUDA Version
 
@@ -183,7 +183,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
 `ggml/include/ggml.h`, `ggml/include/ggml-backend.h`, `ggml/include/ggml-opt.h`,
 `ggml-alloc.h`, `ggml-cpu.h`, `peg-parser.h`, `base64.hpp`
 
-**Known breaking changes by version range** (b5022 → b8982):
+**Known breaking changes by version range** (b5022 → b9004):
 
 | Version | File | Change |
 |---------|------|--------|
@@ -235,6 +235,10 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
 | ~b8982–b8994 | `src/llama-mmap.cpp` | Windows large-file (>2 GB) fix: `ftell`/`fseek` replaced with `_ftelli64`/`_fseeki64`; upstream only |
 | ~b8982–b8994 | `tools/server/httplib.h` | cpp-httplib bumped to v0.43.2: Windows `FILE_SHARE_WRITE` fix, Linux DNS cancel race fix, mbedTLS `close_notify` fix; upstream server header, no project changes required |
 | ~b8982–b8994 | `tools/server/server-context.cpp` | New `LLAMA_TRACE` env variable enables slot acceptance tracing; upstream only |
+| ~b8994–b9004 | `ggml/src/ggml-vulkan/ggml-vulkan.cpp` | `vk_fa_pipeline_state` gains `k_type`/`v_type` fields; `get_fa_tuning_params_coopmat2` now takes separate `k_type`/`v_type` params; mixed K/V type FA pipeline creation refactored to `CREATE_FA_CM2_MIXED()` macro; `flash_attn_cm2.comp` shader uses runtime `FaTypeK`/`FaTypeV` spec constants (spec constants 12–15 added); `DECODEFUNC`/`NEEDS_INIT_IQ_SHMEM` macros removed; internal Vulkan backend, no project changes required |
+| ~b8994–b9004 | `ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp` | `get_mul_mat_fast_pipeline` vectorized-path condition fixed: `dst->ne[1] % 4 == 0` check removed (was preventing vectorization for non-multiple-of-4 batch sizes); internal WebGPU backend, no project changes required |
+| ~b8994–b9004 | `ggml/src/ggml-hexagon/` | Hexagon HTP backend: FA `exp2` half-precision option, unary-op non-contiguous tensor fix; internal DSP backend, no project changes required |
+| ~b8994–b9004 | `tools/server/webui/` | Major frontend component reorganization (Svelte/TypeScript); purely UI, no C++ or JNI impact |
 
 ## Build Commands
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cdeb9695..ef4bc32c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,7 +97,7 @@ set(GGML_AVX512  OFF CACHE BOOL "" FORCE)
 FetchContent_Declare(
 	llama.cpp
 	GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-	GIT_TAG        b8994
+	GIT_TAG        b9004
 )
 FetchContent_MakeAvailable(llama.cpp)
 
diff --git a/README.md b/README.md
index ab0b5514..a202f6c0 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational)
-[![llama.cpp b8994](https://img.shields.io/badge/llama.cpp-%23b8994-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8994)
+[![llama.cpp b9004](https://img.shields.io/badge/llama.cpp-%23b9004-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9004)
 
 # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)
 

From bb7818f1a415f62a1b06d55e7572c7114e2742d4 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 4 May 2026 21:06:04 +0000
Subject: [PATCH 05/10] Upgrade llama.cpp from b9004 to b9016
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No project C++ changes required. All 413 C++ unit tests pass.

b9004→b9016 upstream changes (no project impact):
- llama-io.h: llama_io_read_i interface refactored (read/read_to → read/read_tensor);
  llama_io_write_buffer/llama_io_read_buffer now batch backend tensor ops in destructors
- server-context.cpp: static server_get_checkpoint renamed to
  server_prompt_checkpoint_update (in-place ref param)
- arg.cpp: speculative decoding CLI args renamed to --spec-draft-* prefix;
  env vars renamed LLAMA_ARG_DRAFT_* → LLAMA_ARG_SPEC_DRAFT_*
- ggml-cuda: PCI bus ID via cudaDeviceGetPCIBusId (buffer 16→32 bytes)
- ggml-opencl: Adreno MoE MXFP4 GPU-side router reorder; new ns kernels
- ggml-vulkan: GGML_VK_MAX_NODES macro removed
- ggml-webgpu: row_norm gains GGML_OP_NORM support + type parameterization
- llama-model: rope_yarn_log_mul get_key required flag fixed (false not 0.0f)
- common/chat: extract common_chat_templates_generation_prompt helper

https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk
---
 CLAUDE.md      | 13 +++++++++++--
 CMakeLists.txt |  2 +-
 README.md      |  2 +-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 3cc3ca20..13a2cd85 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI.
 
-Current llama.cpp pinned version: **b9004**
+Current llama.cpp pinned version: **b9016**
 
 ## Upgrading CUDA Version
 
@@ -183,7 +183,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
 `ggml/include/ggml.h`, `ggml/include/ggml-backend.h`, `ggml/include/ggml-opt.h`,
 `ggml-alloc.h`, `ggml-cpu.h`, `peg-parser.h`, `base64.hpp`
 
-**Known breaking changes by version range** (b5022 → b9004):
+**Known breaking changes by version range** (b5022 → b9016):
 
 | Version | File | Change |
 |---------|------|--------|
@@ -239,6 +239,15 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
 | ~b8994–b9004 | `ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp` | `get_mul_mat_fast_pipeline` vectorized-path condition fixed: `dst->ne[1] % 4 == 0` check removed (was preventing vectorization for non-multiple-of-4 batch sizes); internal WebGPU backend, no project changes required |
 | ~b8994–b9004 | `ggml/src/ggml-hexagon/` | Hexagon HTP backend: FA `exp2` half-precision option, unary-op non-contiguous tensor fix; internal DSP backend, no project changes required |
 | ~b8994–b9004 | `tools/server/webui/` | Major frontend component reorganization (Svelte/TypeScript); purely UI, no C++ or JNI impact |
+| ~b9004–b9016 | `src/llama-io.h` | `llama_io_read_i` interface changed: `read(size_t)→read(void*,size_t)`, `read_to(void*,size_t)` removed, new `read_tensor(tensor,offset,size)` added; `llama_io_write_buffer`/`llama_io_read_buffer` now batch backend tensor ops in destructors for performance; internal state-save/load path, not called by project |
+| ~b9004–b9016 | `tools/server/server-context.cpp` | Static `server_get_checkpoint()` (returns by value) renamed to `server_prompt_checkpoint_update()` (takes `server_prompt_checkpoint &` by reference, in-place update); compiled directly into jllama, no call site in project code |
+| ~b9004–b9016 | `common/arg.cpp` + docs | Speculative decoding CLI args renamed: `--draft`/`--draft-n`/`--draft-max` → deprecated (use `--spec-draft-n-max`); `-md`/`--model-draft` → `--spec-draft-model`; `-hfd`/`--hf-repo-draft` → `--spec-draft-hf`; `--spec-ngram-size-n/m/min-hits` → type-specific `--spec-ngram-simple-*`/`--spec-ngram-map-k-*`/`--spec-ngram-map-k4v-*`; env vars similarly renamed (`LLAMA_ARG_DRAFT_MAX`→`LLAMA_ARG_SPEC_DRAFT_N_MAX`, etc.); CLI-level only, Java layer passes params via JSON struct fields, no JNI impact |
+| ~b9004–b9016 | `ggml/src/ggml-cuda/ggml-cuda.cu` | PCI bus ID detection replaced `snprintf` with `cudaDeviceGetPCIBusId` (buffer 16→32 bytes); HIP/MUSA compat headers gain `cudaDeviceGetPCIBusId` alias; internal CUDA backend |
+| ~b9004–b9016 | `ggml/src/ggml-opencl/` | Adreno MoE MXFP4: new `kernel_convert_block_mxfp4_trans4_ns`/`restore` kernels in `cvt.cl`; new `gemm_moe_mxfp4_f32_ns`, `gemv_moe_mxfp4_f32_ns`, `moe_reorder_b`, `moe_sort_by_expert` kernel files; GPU-side router reorder replaces CPU-side preprocessing; `q_img` created for GEMM path; internal OpenCL backend |
+| ~b9004–b9016 | `ggml/src/ggml-vulkan/ggml-vulkan.cpp` | `GGML_VK_MAX_NODES 8192` macro removed (node limit now determined differently); internal Vulkan backend |
+| ~b9004–b9016 | `ggml/src/ggml-webgpu/` | `ggml_webgpu_row_norm_pipeline_key` gains `src_type`/`dst_type` fields; `GGML_OP_NORM` now supported alongside `GGML_OP_RMS_NORM`/`GGML_OP_L2_NORM`; `row_norm.wgsl` gains SRC_TYPE/DST_TYPE parameterization and NORM two-pass algorithm; internal WebGPU backend |
+| ~b9004–b9016 | `src/llama-model.cpp` | `rope_yarn_log_mul` `get_key` call changed from `required=0.0f` to `required=false`; fixes Mistral YaRN log_mul loading; internal model loading, no project impact |
+| ~b9004–b9016 | `common/chat.cpp` | `common_chat_templates_generation_prompt()` extracted from `common_chat_templates_apply_jinja()`; internal refactor, no API change |
 
 ## Build Commands
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ef4bc32c..c5294f1e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,7 +97,7 @@ set(GGML_AVX512  OFF CACHE BOOL "" FORCE)
 FetchContent_Declare(
 	llama.cpp
 	GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-	GIT_TAG        b9004
+	GIT_TAG        b9016
 )
 FetchContent_MakeAvailable(llama.cpp)
 
diff --git a/README.md b/README.md
index a202f6c0..905fbc4a 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational)
-[![llama.cpp b9004](https://img.shields.io/badge/llama.cpp-%23b9004-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9004)
+[![llama.cpp b9016](https://img.shields.io/badge/llama.cpp-%23b9016-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9016)
 
 # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)
 

From a368ab7df94df8d74a9c4c0466ca51421d911b75 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 4 May 2026 21:24:26 +0000
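Subject: [PATCH 06/10] Upgrade llama.cpp from b9016 to b9022

No project C++ changes required; only CLAUDE.md, CMakeLists.txt and
README.md are touched. Notable upstream changes (see the CLAUDE.md table
for the full list):
- llama_model becomes an abstract base class with per-architecture
  subclasses in src/models/; the public C API is unchanged
- src/models/: many model files renamed (e.g. the -iswa suffix dropped)
  and new model files added

https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk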
---
 CLAUDE.md      | 13 +++++++++++--
 CMakeLists.txt |  2 +-
 README.md      |  2 +-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 13a2cd85..95c74282 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI.
 
-Current llama.cpp pinned version: **b9016**
+Current llama.cpp pinned version: **b9022**
 
 ## Upgrading CUDA Version
 
@@ -183,7 +183,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
 `ggml/include/ggml.h`, `ggml/include/ggml-backend.h`, `ggml/include/ggml-opt.h`,
 `ggml-alloc.h`, `ggml-cpu.h`, `peg-parser.h`, `base64.hpp`
 
-**Known breaking changes by version range** (b5022 → b9016):
+**Known breaking changes by version range** (b5022 → b9022):
 
 | Version | File | Change |
 |---------|------|--------|
@@ -248,6 +248,15 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
 | ~b9004–b9016 | `ggml/src/ggml-webgpu/` | `ggml_webgpu_row_norm_pipeline_key` gains `src_type`/`dst_type` fields; `GGML_OP_NORM` now supported alongside `GGML_OP_RMS_NORM`/`GGML_OP_L2_NORM`; `row_norm.wgsl` gains SRC_TYPE/DST_TYPE parameterization and NORM two-pass algorithm; internal WebGPU backend |
 | ~b9004–b9016 | `src/llama-model.cpp` | `rope_yarn_log_mul` `get_key` call changed from `required=0.0f` to `required=false`; fixes Mistral YaRN log_mul loading; internal model loading, no project impact |
 | ~b9004–b9016 | `common/chat.cpp` | `common_chat_templates_generation_prompt()` extracted from `common_chat_templates_apply_jinja()`; internal refactor, no API change |
+| ~b9016–b9022 | `src/llama-model.h` + `src/llama-model.cpp` + `src/models/` | `llama_model` becomes abstract base with pure virtual methods (`load_stats`, `load_hparams`, `load_vocab`, `load_tensors`, `load_arch_hparams`, `load_arch_tensors`, `build_arch_graph`); `load_arch()` removed; new intermediate `llama_model_base` class provides concrete implementations; per-arch subclasses (e.g. `llama_model_llama`, `llama_model_gemma2`) in `src/models/`; factory `llama_model_create(llm_arch, params)` and `llama_model_create(ml, params)` replace direct instantiation; `LLAMA_LOAD_LOCALS` convenience macro added; public C API (`llama_model_load_from_file` etc.) unchanged — no project impact |
+| ~b9016–b9022 | `src/models/` | Many model files renamed: `cohere2-iswa.cpp`→`cohere2.cpp`, `gemma2-iswa.cpp`→`gemma2.cpp`, `gemma3n-iswa.cpp`→`gemma3n.cpp`, `gemma4-iswa.cpp`→`gemma4.cpp`, `mimo2-iswa.cpp`→`mimo2.cpp`, `openai-moe-iswa.cpp`→`openai-moe.cpp`, `pangu-embedded.cpp`→`pangu-embed.cpp`, `qwen3vl-moe.cpp`→`qwen3vlmoe.cpp`, `step35-iswa.cpp`→`step35.cpp`; new model files added (`deepseek2ocr.cpp`, `glm-dsa.cpp`, `granite-moe.cpp`, `hunyuan-vl.cpp`, `jina-bert-v2/v3.cpp`, `lfm2moe.cpp`, `llama-embed.cpp`, `mamba2.cpp`, `minicpm.cpp`, `mistral4.cpp`, `nemotron-h-moe.cpp`, `nomic-bert.cpp`, `nomic-bert-moe.cpp`, `phimoe.cpp`); upstream only, no project changes required |
+| ~b9016–b9022 | `tools/server/server-context.cpp` | Static function `server_prompt_checkpoint_update` (renamed in b9016) now takes a `server_prompt_checkpoint &` reference parameter instead of returning the checkpoint by value; compiled directly into jllama, no project call site |
+| ~b9016–b9022 | `tools/server/server-tools.cpp` | New built-in `get_datetime` tool added via new `server_tool_get_datetime` struct in `build_tools()`; no project changes required (handled automatically by compiled upstream source) |
+| ~b9016–b9022 | `common/chat-auto-parser-generator.cpp` | `force_tools` variable removed from `build_tool_parser_json_native`, `build_tool_parser_tag_json`, `build_tool_parser_tag_tagged`; content before tool calls is now always `p.optional(p.content(...))` regardless of `tool_choice=required`; upstream only, no project changes required |
+| ~b9016–b9022 | `common/chat-peg-parser.h/cpp` | New `optspace(const std::string & tag)` method added to `common_chat_peg_builder`; makes leading/trailing spaces in reasoning tags optional; upstream only, no project changes required |
+| ~b9016–b9022 | `common/reasoning-budget.cpp` | Forced token logit now set to `+INFINITY` (previously left at whatever the model computed); reasoning budget enforcement is now absolute; upstream only, no project changes required |
+| ~b9016–b9022 | `common/chat.cpp` | `thinking_start_tag` and `thinking_end_tag` now trimmed via `trim_whitespace()`; upstream only, no project changes required |
+| ~b9016–b9022 | `examples/diffusion/` | `diffusion_generate` extracted from `diffusion-cli.cpp` to new `diffusion.h`/`diffusion.cpp` static library; enum names prefixed: `ORIGIN`→`DIFFUSION_ALGORITHM_ORIGIN`, `TIMESTEP_BASED`→`DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED` etc.; examples only, no project changes required |
 
 ## Build Commands
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c5294f1e..efbfaa02 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,7 +97,7 @@ set(GGML_AVX512  OFF CACHE BOOL "" FORCE)
 FetchContent_Declare(
 	llama.cpp
 	GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-	GIT_TAG        b9016
+	GIT_TAG        b9022
 )
 FetchContent_MakeAvailable(llama.cpp)
 
diff --git a/README.md b/README.md
index 905fbc4a..d7222dce 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational)
-[![llama.cpp b9016](https://img.shields.io/badge/llama.cpp-%23b9016-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9016)
+[![llama.cpp b9022](https://img.shields.io/badge/llama.cpp-%23b9022-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9022)
 
 # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)
 

From 12d62ffb6b8bc02a2518d51f065cb7ce56a723cc Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 4 May 2026 21:32:24 +0000
Subject: [PATCH 07/10] Add ReasoningFormat enum and reasoning budget to
 InferenceParameters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- New ReasoningFormat enum (none/auto/deepseek/deepseek-legacy) mapping to
  the reasoning_format JSON field accepted by the server
- InferenceParameters.setReasoningFormat(ReasoningFormat) — controls how
  thinking tokens from models like DeepSeek-R1 and QwQ are extracted
- InferenceParameters.setReasoningBudgetTokens(int) — caps the number of
  reasoning tokens emitted before the model is forced to its response (-1 = unlimited)
- 4 new C++ tests for reasoning_budget_tokens parsing in params_from_json_cmpl
  (default -1, positive value, zero, explicit -1); total now 417/417 passing

https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk
---
 CLAUDE.md                                     |  2 +-
 .../de/kherud/llama/InferenceParameters.java  | 29 ++++++++++++
 .../de/kherud/llama/args/ReasoningFormat.java | 46 +++++++++++++++++++
 src/test/cpp/test_server.cpp                  | 28 +++++++++++
 4 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 src/main/java/de/kherud/llama/args/ReasoningFormat.java

diff --git a/CLAUDE.md b/CLAUDE.md
index 95c74282..4a5096ad 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -426,7 +426,7 @@ ctest --test-dir build --output-on-failure -R "ResultsToJson"
 | `src/test/cpp/test_json_helpers.cpp` | 42 | All functions in `json_helpers.hpp`: `get_result_error_message`, `results_to_json`, `rerank_results_to_json`, `parse_encoding_format`, `extract_embedding_prompt`, `is_infill_request`, `parse_slot_prompt_similarity`, `parse_positive_int_config` |
 | `src/test/cpp/test_jni_helpers.cpp` | 36 | All functions in `jni_helpers.hpp` using a zero-filled `JNINativeInterface_` mock |
 
-**Current total: 413 tests (all passing).** Branch: `claude/refactor-java-llama-d3lua`.
+**Current total: 417 tests (all passing).** Branch: `claude/determined-volta-T8AoQ`.
 
 #### Upstream source location (in CMake build tree)
 
diff --git a/src/main/java/de/kherud/llama/InferenceParameters.java b/src/main/java/de/kherud/llama/InferenceParameters.java
index 70e94401..e18a86b6 100644
--- a/src/main/java/de/kherud/llama/InferenceParameters.java
+++ b/src/main/java/de/kherud/llama/InferenceParameters.java
@@ -5,6 +5,7 @@
 import java.util.Map;
 
 import de.kherud.llama.args.MiroStat;
+import de.kherud.llama.args.ReasoningFormat;
 import de.kherud.llama.args.Sampler;
 
 /**
@@ -52,6 +53,8 @@ public final class InferenceParameters extends JsonParameters {
 	private static final String PARAM_USE_JINJA = "use_jinja";
 	private static final String PARAM_CHAT_TEMPLATE_KWARGS = "chat_template_kwargs";
 	private static final String PARAM_MESSAGES = "messages";
+	private static final String PARAM_REASONING_FORMAT = "reasoning_format";
+	private static final String PARAM_REASONING_BUDGET_TOKENS = "reasoning_budget_tokens";
 
 	public InferenceParameters(String prompt) {
 		// we always need a prompt
@@ -545,6 +548,32 @@ public InferenceParameters setMessages(String systemMessage, List<Pair<String, S
         return this;
     }
 
+	/**
+	 * Set how reasoning/thinking tokens emitted by models like DeepSeek-R1 and QwQ are
+	 * extracted and returned. Only effective when chat-template rendering is active
+	 * ({@link #setUseChatTemplate(boolean)}).
+	 *
+	 * @param reasoningFormat the format used to handle thinking tokens
+	 * @return this builder
+	 */
+	public InferenceParameters setReasoningFormat(ReasoningFormat reasoningFormat) {
+		parameters.put(PARAM_REASONING_FORMAT, toJsonString(reasoningFormat.getArgValue()));
+		return this;
+	}
+
+	/**
+	 * Limit the number of reasoning tokens a thinking model (e.g. DeepSeek-R1, QwQ) may
+	 * emit before it is forced to stop reasoning and begin its response.
+	 * A value of {@code -1} (the default) disables the budget.
+	 *
+	 * @param budgetTokens maximum reasoning tokens (-1 = unlimited)
+	 * @return this builder
+	 */
+	public InferenceParameters setReasoningBudgetTokens(int budgetTokens) {
+		parameters.put(PARAM_REASONING_BUDGET_TOKENS, String.valueOf(budgetTokens));
+		return this;
+	}
+
 	InferenceParameters setStream(boolean stream) {
 		parameters.put(PARAM_STREAM, String.valueOf(stream));
 		return this;
diff --git a/src/main/java/de/kherud/llama/args/ReasoningFormat.java b/src/main/java/de/kherud/llama/args/ReasoningFormat.java
new file mode 100644
index 00000000..56ace933
--- /dev/null
+++ b/src/main/java/de/kherud/llama/args/ReasoningFormat.java
@@ -0,0 +1,46 @@
+package de.kherud.llama.args;
+
+/**
+ * Controls how reasoning/thinking tokens produced by models like DeepSeek-R1 and QwQ are
+ * extracted and returned in the response.
+ *
+ * <p>Passed as {@code "reasoning_format"} in inference requests. Only meaningful when the model
+ * uses a thinking tag (e.g. {@code <think>...</think>}) and chat-template rendering is active
+ * ({@link de.kherud.llama.InferenceParameters#setUseChatTemplate(boolean)}).
+ */
+public enum ReasoningFormat implements CliArg {
+
+    /**
+     * Reasoning tokens are left in-line; no extraction is performed.
+     */
+    NONE("none"),
+
+    /**
+     * Automatically detect the reasoning format from the model's chat template.
+     * Equivalent to {@link #DEEPSEEK} in most cases.
+     */
+    AUTO("auto"),
+
+    /**
+     * Extract thinking-tag content into a separate {@code reasoning_content} field,
+     * including in streaming deltas.
+     */
+    DEEPSEEK("deepseek"),
+
+    /**
+     * Legacy DeepSeek format: extract thinking content into {@code reasoning_content} in
+     * non-streaming mode; leave inline in {@code <think>} tags during streaming.
+     */
+    DEEPSEEK_LEGACY("deepseek-legacy");
+
+    private final String argValue;
+
+    ReasoningFormat(String argValue) {
+        this.argValue = argValue;
+    }
+
+    @Override
+    public String getArgValue() {
+        return argValue;
+    }
+}
diff --git a/src/test/cpp/test_server.cpp b/src/test/cpp/test_server.cpp
index ee56d790..5d5235e6 100644
--- a/src/test/cpp/test_server.cpp
+++ b/src/test/cpp/test_server.cpp
@@ -1677,6 +1677,34 @@ TEST(ParamsFromJsonCmpl, NCmpl_AliasedFromN) {
     EXPECT_EQ(p.n_cmpl, 1);
 }
 
+// ============================================================
+// params_from_json_cmpl — reasoning_budget_tokens
+//   reasoning_budget_tokens defaults to -1 (disabled).
+//   Any explicit value is stored directly in sampling.reasoning_budget_tokens.
+//   The tag-tokenisation paths (start/end/message) are skipped when tags are empty,
+//   so these tests do not require a vocab pointer.
+// ============================================================
+
+TEST(ParamsFromJsonCmpl, ReasoningBudgetTokens_Default_IsMinusOne) {
+    const auto p = parse_params({});
+    EXPECT_EQ(p.sampling.reasoning_budget_tokens, -1);
+}
+
+TEST(ParamsFromJsonCmpl, ReasoningBudgetTokens_SetPositive) {
+    const auto p = parse_params({{"reasoning_budget_tokens", 512}});
+    EXPECT_EQ(p.sampling.reasoning_budget_tokens, 512);
+}
+
+TEST(ParamsFromJsonCmpl, ReasoningBudgetTokens_Zero) {
+    const auto p = parse_params({{"reasoning_budget_tokens", 0}});
+    EXPECT_EQ(p.sampling.reasoning_budget_tokens, 0);
+}
+
+TEST(ParamsFromJsonCmpl, ReasoningBudgetTokens_ExplicitMinusOne_Disabled) {
+    const auto p = parse_params({{"reasoning_budget_tokens", -1}});
+    EXPECT_EQ(p.sampling.reasoning_budget_tokens, -1);
+}
+
 // ============================================================
 // params_from_json_cmpl — grammar type routing
 //   Three distinct paths set grammar.type:

From df7ccf4f0a53f35ee63e6d6ae311d929b373f51e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 4 May 2026 21:36:21 +0000
Subject: [PATCH 08/10] Add InferenceParametersTest coverage for
 setReasoningFormat and setReasoningBudgetTokens

Tests all four ReasoningFormat enum values (none/auto/deepseek/deepseek-legacy)
and the three budget token cases (positive, zero, -1/disabled), matching the
pattern of every other setter in InferenceParameters.

https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk
---
 .../kherud/llama/InferenceParametersTest.java | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/src/test/java/de/kherud/llama/InferenceParametersTest.java b/src/test/java/de/kherud/llama/InferenceParametersTest.java
index 211706a2..c711b9bb 100644
--- a/src/test/java/de/kherud/llama/InferenceParametersTest.java
+++ b/src/test/java/de/kherud/llama/InferenceParametersTest.java
@@ -7,6 +7,7 @@
 import java.util.Map;
 
 import de.kherud.llama.args.MiroStat;
+import de.kherud.llama.args.ReasoningFormat;
 import de.kherud.llama.args.Sampler;
 import org.junit.Test;
 
@@ -229,6 +230,52 @@ public void testSetChatTemplateKwargsEmpty() {
 		assertEquals("{}", params.parameters.get("chat_template_kwargs"));
 	}
 
+	// -------------------------------------------------------------------------
+	// ReasoningFormat / ReasoningBudgetTokens
+	// -------------------------------------------------------------------------
+
+	@Test
+	public void testSetReasoningFormatNone() {
+		InferenceParameters params = new InferenceParameters("").setReasoningFormat(ReasoningFormat.NONE);
+		assertEquals("\"none\"", params.parameters.get("reasoning_format"));
+	}
+
+	@Test
+	public void testSetReasoningFormatAuto() {
+		InferenceParameters params = new InferenceParameters("").setReasoningFormat(ReasoningFormat.AUTO);
+		assertEquals("\"auto\"", params.parameters.get("reasoning_format"));
+	}
+
+	@Test
+	public void testSetReasoningFormatDeepseek() {
+		InferenceParameters params = new InferenceParameters("").setReasoningFormat(ReasoningFormat.DEEPSEEK);
+		assertEquals("\"deepseek\"", params.parameters.get("reasoning_format"));
+	}
+
+	@Test
+	public void testSetReasoningFormatDeepseekLegacy() {
+		InferenceParameters params = new InferenceParameters("").setReasoningFormat(ReasoningFormat.DEEPSEEK_LEGACY);
+		assertEquals("\"deepseek-legacy\"", params.parameters.get("reasoning_format"));
+	}
+
+	@Test
+	public void testSetReasoningBudgetTokensPositive() {
+		InferenceParameters params = new InferenceParameters("").setReasoningBudgetTokens(512);
+		assertEquals("512", params.parameters.get("reasoning_budget_tokens"));
+	}
+
+	@Test
+	public void testSetReasoningBudgetTokensZero() {
+		InferenceParameters params = new InferenceParameters("").setReasoningBudgetTokens(0);
+		assertEquals("0", params.parameters.get("reasoning_budget_tokens"));
+	}
+
+	@Test
+	public void testSetReasoningBudgetTokensDisabled() {
+		InferenceParameters params = new InferenceParameters("").setReasoningBudgetTokens(-1);
+		assertEquals("-1", params.parameters.get("reasoning_budget_tokens"));
+	}
+
 	// -------------------------------------------------------------------------
 	// MiroStat
 	// -------------------------------------------------------------------------

From c4172da6153ba0dc285cd7aac226cc0b570ea343 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 4 May 2026 21:48:56 +0000
Subject: [PATCH 09/10] Add mmproj/reasoning/sigma/sleep-idle to Java API; fix
 --cache-idle-slots bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug fix:
- ModelFlag.CLEAR_IDLE/NO_CLEAR_IDLE mapped to non-existent --clear-idle /
  --no-clear-idle; corrected to --cache-idle-slots / --no-cache-idle-slots
  (the actual llama.cpp CLI flags since b8841)

New ModelParameters:
- setMmproj(String), setMmprojUrl(String), enableMmprojAuto(),
  enableMmprojOffload() — vision model projection file for LLaVA / Gemma3 /
  Qwen2-VL; previously impossible to configure from Java
- setReasoningFormat(ReasoningFormat) — model-level default reasoning format
- setReasoningBudget(int) — model-level default reasoning token budget
- setSleepIdleSeconds(int) — put the server to sleep after N seconds of idle time
- ModelFlag.MMPROJ_AUTO / MMPROJ_OFFLOAD (31 flags total)

New InferenceParameters:
- setTopNSigma(float) — per-request sigma sampling threshold

New ChatResponseParser:
- extractChoiceReasoningContent(String/JsonNode) — reads
  choices[0].message.reasoning_content so callers can access thinking-model
  reasoning output without parsing raw JSON themselves

Tests: 435 Java tests passing (27 new); 417/417 C++ tests passing

https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk
---
 .../de/kherud/llama/InferenceParameters.java  |  14 +++
 .../java/de/kherud/llama/ModelParameters.java |  79 ++++++++++++++
 .../java/de/kherud/llama/args/ModelFlag.java  |  10 +-
 .../kherud/llama/json/ChatResponseParser.java |  30 ++++++
 .../kherud/llama/InferenceParametersTest.java |  16 +++
 .../llama/ModelParametersExtendedTest.java    |  24 ++---
 .../de/kherud/llama/ModelParametersTest.java  | 102 ++++++++++++++++++
 .../de/kherud/llama/args/ModelFlagTest.java   |   8 +-
 .../llama/json/ChatResponseParserTest.java    |  51 +++++++++
 9 files changed, 317 insertions(+), 17 deletions(-)

diff --git a/src/main/java/de/kherud/llama/InferenceParameters.java b/src/main/java/de/kherud/llama/InferenceParameters.java
index e18a86b6..0e341d26 100644
--- a/src/main/java/de/kherud/llama/InferenceParameters.java
+++ b/src/main/java/de/kherud/llama/InferenceParameters.java
@@ -53,6 +53,7 @@ public final class InferenceParameters extends JsonParameters {
 	private static final String PARAM_USE_JINJA = "use_jinja";
 	private static final String PARAM_CHAT_TEMPLATE_KWARGS = "chat_template_kwargs";
 	private static final String PARAM_MESSAGES = "messages";
+	private static final String PARAM_TOP_N_SIGMA = "top_n_sigma";
 	private static final String PARAM_REASONING_FORMAT = "reasoning_format";
 	private static final String PARAM_REASONING_BUDGET_TOKENS = "reasoning_budget_tokens";
 
@@ -548,6 +549,19 @@ public InferenceParameters setMessages(String systemMessage, List<Pair<String, S
         return this;
     }
 
+	/**
+	 * Set top-n-sigma sampling threshold (default: -1.0, disabled).
+	 * Only tokens whose logit is within {@code n} standard deviations of the maximum logit
+	 * are kept for sampling. Effective values are typically in the range 1.0–3.0.
+	 *
+	 * @param topNSigma the sigma threshold (-1.0 = disabled)
+	 * @return this builder
+	 */
+	public InferenceParameters setTopNSigma(float topNSigma) {
+		parameters.put(PARAM_TOP_N_SIGMA, String.valueOf(topNSigma));
+		return this;
+	}
+
 	/**
 	 * Set how reasoning/thinking tokens emitted by models like DeepSeek-R1 and QwQ are
 	 * extracted and returned. Only effective when chat-template rendering is active
diff --git a/src/main/java/de/kherud/llama/ModelParameters.java b/src/main/java/de/kherud/llama/ModelParameters.java
index 0aabec8c..1f739a5c 100644
--- a/src/main/java/de/kherud/llama/ModelParameters.java
+++ b/src/main/java/de/kherud/llama/ModelParameters.java
@@ -1307,6 +1307,85 @@ public ModelParameters setModelDraft(String modelDraft) {
         return this;
     }
 
+    /**
+     * Set the multimodal projection model file for vision-capable models (LLaVA, Gemma3, Qwen2-VL, etc.).
+     *
+     * @param mmproj path to the mmproj model file
+     * @return this builder
+     */
+    public ModelParameters setMmproj(String mmproj) {
+        parameters.put("--mmproj", mmproj);
+        return this;
+    }
+
+    /**
+     * Set a URL to download the multimodal projection model file.
+     *
+     * @param url URL of the mmproj model file
+     * @return this builder
+     */
+    public ModelParameters setMmprojUrl(String url) {
+        parameters.put("--mmproj-url", url);
+        return this;
+    }
+
+    /**
+     * Enable automatic detection and loading of the mmproj model (e.g. when loading from Hugging Face).
+     *
+     * @return this builder
+     */
+    public ModelParameters enableMmprojAuto() {
+        return setFlag(ModelFlag.MMPROJ_AUTO);
+    }
+
+    /**
+     * Enable offloading of the mmproj model to the GPU.
+     *
+     * @return this builder
+     */
+    public ModelParameters enableMmprojOffload() {
+        return setFlag(ModelFlag.MMPROJ_OFFLOAD);
+    }
+
+    /**
+     * Set the default reasoning format for all requests handled by this model instance.
+     * Individual requests can override this via
+     * {@link InferenceParameters#setReasoningFormat(de.kherud.llama.args.ReasoningFormat)}.
+     *
+     * @param format the reasoning format for thinking-model output
+     * @return this builder
+     */
+    public ModelParameters setReasoningFormat(de.kherud.llama.args.ReasoningFormat format) {
+        parameters.put("--reasoning-format", format.getArgValue());
+        return this;
+    }
+
+    /**
+     * Set the default reasoning token budget for all requests.
+     * Use {@code -1} to disable the budget (unlimited reasoning tokens).
+     * Individual requests can override this via
+     * {@link InferenceParameters#setReasoningBudgetTokens(int)}.
+     *
+     * @param budget maximum reasoning tokens per request (-1 = unlimited)
+     * @return this builder
+     */
+    public ModelParameters setReasoningBudget(int budget) {
+        parameters.put("--reasoning-budget", String.valueOf(budget));
+        return this;
+    }
+
+    /**
+     * Set the number of seconds of idle time after which the server goes to sleep
+     * (it is not shut down). Useful for resource management in on-demand deployments.
+     *
+     * @param seconds idle timeout in seconds before the server sleeps
+     * @return this builder
+     */
+    public ModelParameters setSleepIdleSeconds(int seconds) {
+        parameters.put("--sleep-idle-seconds", String.valueOf(seconds));
+        return this;
+    }
+
     /**
      * Enable jinja for templating
      *
diff --git a/src/main/java/de/kherud/llama/args/ModelFlag.java b/src/main/java/de/kherud/llama/args/ModelFlag.java
index 056b9260..b903b6d7 100644
--- a/src/main/java/de/kherud/llama/args/ModelFlag.java
+++ b/src/main/java/de/kherud/llama/args/ModelFlag.java
@@ -93,10 +93,16 @@ public enum ModelFlag {
     NO_KV_UNIFIED("--no-kv-unified"),
 
     /** Enable saving and clearing idle slots when a new task starts. */
-    CLEAR_IDLE("--clear-idle"),
+    CLEAR_IDLE("--cache-idle-slots"),
 
     /** Disable saving and clearing idle slots. */
-    NO_CLEAR_IDLE("--no-clear-idle");
+    NO_CLEAR_IDLE("--no-cache-idle-slots"),
+
+    /** Automatically detect and load the mmproj vision projection model. */
+    MMPROJ_AUTO("--mmproj-auto"),
+
+    /** Offload the mmproj vision projection model to the GPU. */
+    MMPROJ_OFFLOAD("--mmproj-offload");
 
     private final String cliFlag;
 
diff --git a/src/main/java/de/kherud/llama/json/ChatResponseParser.java b/src/main/java/de/kherud/llama/json/ChatResponseParser.java
index ce7ce230..81029e3d 100644
--- a/src/main/java/de/kherud/llama/json/ChatResponseParser.java
+++ b/src/main/java/de/kherud/llama/json/ChatResponseParser.java
@@ -33,6 +33,36 @@ public class ChatResponseParser {
     /** Shared Jackson mapper; thread-safe and reused across all instances. */
     public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 
+    /**
+     * Extract the reasoning/thinking content from an OAI chat completion JSON string.
+     * Navigates {@code choices[0].message.reasoning_content}.
+     *
+     * <p>Thinking models (DeepSeek-R1, QwQ, Qwen3) populate this field when
+     * {@code reasoning_format} is {@code "deepseek"} or {@code "auto"}. Returns an
+     * empty string when no reasoning content is present or when the JSON is malformed.
+     *
+     * @param json OAI-compatible chat completion JSON string
+     * @return the reasoning content string, or {@code ""} on any failure
+     */
+    public String extractChoiceReasoningContent(String json) {
+        try {
+            return extractChoiceReasoningContent(OBJECT_MAPPER.readTree(json));
+        } catch (IOException e) {
+            return "";
+        }
+    }
+
+    /**
+     * Extract the reasoning/thinking content from a pre-parsed OAI chat completion node.
+     * Navigates {@code choices[0].message.reasoning_content} via Jackson path API.
+     *
+     * @param node pre-parsed OAI chat completion response node
+     * @return the reasoning content string, or {@code ""} if absent
+     */
+    public String extractChoiceReasoningContent(JsonNode node) {
+        return node.path("choices").path(0).path("message").path("reasoning_content").asText("");
+    }
+
     /**
      * Extract the assistant's reply text from an OAI chat completion JSON string.
      * Navigates {@code choices[0].message.content} via Jackson.
diff --git a/src/test/java/de/kherud/llama/InferenceParametersTest.java b/src/test/java/de/kherud/llama/InferenceParametersTest.java
index c711b9bb..f09052f3 100644
--- a/src/test/java/de/kherud/llama/InferenceParametersTest.java
+++ b/src/test/java/de/kherud/llama/InferenceParametersTest.java
@@ -230,6 +230,22 @@ public void testSetChatTemplateKwargsEmpty() {
 		assertEquals("{}", params.parameters.get("chat_template_kwargs"));
 	}
 
+	// -------------------------------------------------------------------------
+	// setTopNSigma
+	// -------------------------------------------------------------------------
+
+	@Test
+	public void testSetTopNSigmaEnabled() {
+		InferenceParameters params = new InferenceParameters("").setTopNSigma(2.0f);
+		assertEquals("2.0", params.parameters.get("top_n_sigma"));
+	}
+
+	@Test
+	public void testSetTopNSigmaDisabled() {
+		InferenceParameters params = new InferenceParameters("").setTopNSigma(-1.0f);
+		assertEquals("-1.0", params.parameters.get("top_n_sigma"));
+	}
+
 	// -------------------------------------------------------------------------
 	// ReasoningFormat / ReasoningBudgetTokens
 	// -------------------------------------------------------------------------
diff --git a/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java b/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java
index 36f0fe35..ae7b0a8a 100644
--- a/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java
+++ b/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java
@@ -460,31 +460,31 @@ public void testSetCacheRamMibDisabled() {
     @Test
     public void testSetClearIdleTrue() {
         ModelParameters p = new ModelParameters().setClearIdle(true);
-        assertTrue(p.parameters.containsKey("--clear-idle"));
-        assertNull(p.parameters.get("--clear-idle"));
-        assertFalse(p.parameters.containsKey("--no-clear-idle"));
+        assertTrue(p.parameters.containsKey("--cache-idle-slots"));
+        assertNull(p.parameters.get("--cache-idle-slots"));
+        assertFalse(p.parameters.containsKey("--no-cache-idle-slots"));
     }
 
     @Test
     public void testSetClearIdleFalse() {
         ModelParameters p = new ModelParameters().setClearIdle(false);
-        assertTrue(p.parameters.containsKey("--no-clear-idle"));
-        assertNull(p.parameters.get("--no-clear-idle"));
-        assertFalse(p.parameters.containsKey("--clear-idle"));
+        assertTrue(p.parameters.containsKey("--no-cache-idle-slots"));
+        assertNull(p.parameters.get("--no-cache-idle-slots"));
+        assertFalse(p.parameters.containsKey("--cache-idle-slots"));
     }
 
     @Test
     public void testSetClearIdleFlipFromTrueToFalse() {
         ModelParameters p = new ModelParameters().setClearIdle(true).setClearIdle(false);
-        assertTrue(p.parameters.containsKey("--no-clear-idle"));
-        assertFalse(p.parameters.containsKey("--clear-idle"));
+        assertTrue(p.parameters.containsKey("--no-cache-idle-slots"));
+        assertFalse(p.parameters.containsKey("--cache-idle-slots"));
     }
 
     @Test
     public void testSetClearIdleFlipFromFalseToTrue() {
         ModelParameters p = new ModelParameters().setClearIdle(false).setClearIdle(true);
-        assertTrue(p.parameters.containsKey("--clear-idle"));
-        assertFalse(p.parameters.containsKey("--no-clear-idle"));
+        assertTrue(p.parameters.containsKey("--cache-idle-slots"));
+        assertFalse(p.parameters.containsKey("--no-cache-idle-slots"));
     }
 
     @Test
@@ -496,10 +496,10 @@ public void testKvUnifiedCacheRamClearIdleChaining() {
                 .setClearIdle(true);
         assertTrue(p.parameters.containsKey("--kv-unified"));
         assertEquals("8192", p.parameters.get("--cache-ram"));
-        assertTrue(p.parameters.containsKey("--clear-idle"));
+        assertTrue(p.parameters.containsKey("--cache-idle-slots"));
         // Opposite flags must be absent
         assertFalse(p.parameters.containsKey("--no-kv-unified"));
-        assertFalse(p.parameters.containsKey("--no-clear-idle"));
+        assertFalse(p.parameters.containsKey("--no-cache-idle-slots"));
     }
 
     @Test
diff --git a/src/test/java/de/kherud/llama/ModelParametersTest.java b/src/test/java/de/kherud/llama/ModelParametersTest.java
index 271e570f..1204813f 100644
--- a/src/test/java/de/kherud/llama/ModelParametersTest.java
+++ b/src/test/java/de/kherud/llama/ModelParametersTest.java
@@ -399,4 +399,106 @@ public void testBuilderChainingReturnsSameInstance() {
 		assertSame(p, p.setGpuLayers(10));
 		assertSame(p, p.enableEmbedding());
 	}
+
+	// -------------------------------------------------------------------------
+	// mmproj — vision model projection file/url
+	// -------------------------------------------------------------------------
+
+	@Test
+	public void testSetMmproj() {
+		ModelParameters p = new ModelParameters().setMmproj("/models/mmproj.gguf");
+		assertEquals("/models/mmproj.gguf", p.parameters.get("--mmproj"));
+	}
+
+	@Test
+	public void testSetMmprojUrl() {
+		ModelParameters p = new ModelParameters().setMmprojUrl("https://example.com/mmproj.gguf");
+		assertEquals("https://example.com/mmproj.gguf", p.parameters.get("--mmproj-url"));
+	}
+
+	@Test
+	public void testEnableMmprojAuto() {
+		ModelParameters p = new ModelParameters().enableMmprojAuto();
+		assertTrue(p.parameters.containsKey("--mmproj-auto"));
+	}
+
+	@Test
+	public void testEnableMmprojOffload() {
+		ModelParameters p = new ModelParameters().enableMmprojOffload();
+		assertTrue(p.parameters.containsKey("--mmproj-offload"));
+	}
+
+	// -------------------------------------------------------------------------
+	// Reasoning format / budget — model-level defaults for thinking models
+	// -------------------------------------------------------------------------
+
+	@Test
+	public void testSetReasoningFormatNone() {
+		ModelParameters p = new ModelParameters().setReasoningFormat(de.kherud.llama.args.ReasoningFormat.NONE);
+		assertEquals("none", p.parameters.get("--reasoning-format"));
+	}
+
+	@Test
+	public void testSetReasoningFormatAuto() {
+		ModelParameters p = new ModelParameters().setReasoningFormat(de.kherud.llama.args.ReasoningFormat.AUTO);
+		assertEquals("auto", p.parameters.get("--reasoning-format"));
+	}
+
+	@Test
+	public void testSetReasoningFormatDeepseek() {
+		ModelParameters p = new ModelParameters().setReasoningFormat(de.kherud.llama.args.ReasoningFormat.DEEPSEEK);
+		assertEquals("deepseek", p.parameters.get("--reasoning-format"));
+	}
+
+	@Test
+	public void testSetReasoningFormatDeepseekLegacy() {
+		ModelParameters p = new ModelParameters().setReasoningFormat(de.kherud.llama.args.ReasoningFormat.DEEPSEEK_LEGACY);
+		assertEquals("deepseek-legacy", p.parameters.get("--reasoning-format"));
+	}
+
+	@Test
+	public void testSetReasoningBudgetPositive() {
+		ModelParameters p = new ModelParameters().setReasoningBudget(1024);
+		assertEquals("1024", p.parameters.get("--reasoning-budget"));
+	}
+
+	@Test
+	public void testSetReasoningBudgetDisabled() {
+		ModelParameters p = new ModelParameters().setReasoningBudget(-1);
+		assertEquals("-1", p.parameters.get("--reasoning-budget"));
+	}
+
+	// -------------------------------------------------------------------------
+	// setSleepIdleSeconds
+	// -------------------------------------------------------------------------
+
+	@Test
+	public void testSetSleepIdleSeconds() {
+		ModelParameters p = new ModelParameters().setSleepIdleSeconds(60);
+		assertEquals("60", p.parameters.get("--sleep-idle-seconds"));
+	}
+
+	@Test
+	public void testSetSleepIdleSecondsZero() {
+		ModelParameters p = new ModelParameters().setSleepIdleSeconds(0);
+		assertEquals("0", p.parameters.get("--sleep-idle-seconds"));
+	}
+
+	// -------------------------------------------------------------------------
+	// setClearIdle — correct flag names (regression)
+	// -------------------------------------------------------------------------
+
+	@Test
+	public void testSetClearIdleTrue_usesCacheIdleSlotsFlag() {
+		ModelParameters p = new ModelParameters().setClearIdle(true);
+		assertTrue(p.parameters.containsKey("--cache-idle-slots"));
+		assertFalse(p.parameters.containsKey("--no-cache-idle-slots"));
+	}
+
+	@Test
+	public void testSetClearIdleFalse_usesNoCacheIdleSlotsFlag() {
+		ModelParameters p = new ModelParameters().setClearIdle(false);
+		assertTrue(p.parameters.containsKey("--no-cache-idle-slots"));
+		assertFalse(p.parameters.containsKey("--cache-idle-slots"));
+	}
 }
diff --git a/src/test/java/de/kherud/llama/args/ModelFlagTest.java b/src/test/java/de/kherud/llama/args/ModelFlagTest.java
index 16ce3e44..294efe5d 100644
--- a/src/test/java/de/kherud/llama/args/ModelFlagTest.java
+++ b/src/test/java/de/kherud/llama/args/ModelFlagTest.java
@@ -42,8 +42,10 @@ public static Collection<Object[]> data() {
             {ModelFlag.VOCAB_ONLY,             "--vocab-only"},
             {ModelFlag.KV_UNIFIED,             "--kv-unified"},
             {ModelFlag.NO_KV_UNIFIED,          "--no-kv-unified"},
-            {ModelFlag.CLEAR_IDLE,             "--clear-idle"},
-            {ModelFlag.NO_CLEAR_IDLE,          "--no-clear-idle"},
+            {ModelFlag.CLEAR_IDLE,             "--cache-idle-slots"},
+            {ModelFlag.NO_CLEAR_IDLE,          "--no-cache-idle-slots"},
+            {ModelFlag.MMPROJ_AUTO,            "--mmproj-auto"},
+            {ModelFlag.MMPROJ_OFFLOAD,         "--mmproj-offload"},
         });
     }
 
@@ -66,7 +68,7 @@ public void testGetCliFlag() {
 
     @Test
     public void testEnumCount() {
-        assertEquals(29, ModelFlag.values().length);
+        assertEquals(31, ModelFlag.values().length);
     }
 
     @Test
diff --git a/src/test/java/de/kherud/llama/json/ChatResponseParserTest.java b/src/test/java/de/kherud/llama/json/ChatResponseParserTest.java
index 69572862..c3c13a56 100644
--- a/src/test/java/de/kherud/llama/json/ChatResponseParserTest.java
+++ b/src/test/java/de/kherud/llama/json/ChatResponseParserTest.java
@@ -96,6 +96,57 @@ public void testExtractChoiceContent_nodeMultipleChoices_takesFirst() throws Exc
         assertEquals("First", parser.extractChoiceContent(node));
     }
 
+    // ------------------------------------------------------------------
+    // extractChoiceReasoningContent
+    // ------------------------------------------------------------------
+
+    @Test
+    public void testExtractChoiceReasoningContent_present() {
+        String json = "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"The answer is 42.\"," +
+                "\"reasoning_content\":\"Let me think step by step...\"}}]}";
+        assertEquals("Let me think step by step...", parser.extractChoiceReasoningContent(json));
+    }
+
+    @Test
+    public void testExtractChoiceReasoningContent_absent_returnsEmpty() {
+        String json = "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"Hello\"}}]}";
+        assertEquals("", parser.extractChoiceReasoningContent(json));
+    }
+
+    @Test
+    public void testExtractChoiceReasoningContent_emptyString() {
+        String json = "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"Hi\"," +
+                "\"reasoning_content\":\"\"}}]}";
+        assertEquals("", parser.extractChoiceReasoningContent(json));
+    }
+
+    @Test
+    public void testExtractChoiceReasoningContent_missingChoices_returnsEmpty() {
+        String json = "{\"id\":\"x\",\"object\":\"chat.completion\"}";
+        assertEquals("", parser.extractChoiceReasoningContent(json));
+    }
+
+    @Test
+    public void testExtractChoiceReasoningContent_malformedJson_returnsEmpty() {
+        assertEquals("", parser.extractChoiceReasoningContent("{not json"));
+    }
+
+    @Test
+    public void testExtractChoiceReasoningContent_multiline() {
+        String json = "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"42\"," +
+                "\"reasoning_content\":\"Step 1: identify the question.\\nStep 2: answer it.\"}}]}";
+        assertEquals("Step 1: identify the question.\nStep 2: answer it.",
+                parser.extractChoiceReasoningContent(json));
+    }
+
+    @Test
+    public void testExtractChoiceReasoningContent_node() throws Exception {
+        JsonNode node = MAPPER.readTree(
+                "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"ok\"," +
+                "\"reasoning_content\":\"thinking...\"}}]}");
+        assertEquals("thinking...", parser.extractChoiceReasoningContent(node));
+    }
+
     // ------------------------------------------------------------------
     // extractUsageField
     // ------------------------------------------------------------------

From 08f372afb7fe6f7ad89b3a3b850396f34c61fb5d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 5 May 2026 07:13:25 +0000
Subject: [PATCH 10/10] Fix setDraftMax/setDraftMin throwing on b9016+
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

llama.cpp b9016 removed --draft-max and --draft-min: the handler now
unconditionally throws std::invalid_argument at parse time. Calling
setDraftMax() or setDraftMin() (already covered by existing tests but
not exercised in CI without a draft model) caused models to fail to
load with no useful error.

Fix:
- setDraftMax → --spec-draft-n-max  (was --draft-max, removed)
- setDraftMin → --spec-draft-n-min  (was --draft-min, removed)

Also updated still-aliased flags to the canonical --spec-draft-*
names for forward compatibility:
- setDraftPMin → --spec-draft-p-min
- setCtxSizeDraft → --spec-draft-ctx-size
- setDeviceDraft → --spec-draft-device
- setGpuLayersDraft → --spec-draft-ngl
- setModelDraft → --spec-draft-model

Tests updated to expect the new flag names; setDraftMax/setDraftMin
tests now also assert the broken old flag is absent.

https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk
---
 CLAUDE.md                                     |  2 +-
 .../java/de/kherud/llama/ModelParameters.java | 16 +++++++-------
 .../llama/ModelParametersExtendedTest.java    | 22 +++++++++++++------
 3 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 4a5096ad..a3d36133 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -241,7 +241,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
 | ~b8994–b9004 | `tools/server/webui/` | Major frontend component reorganization (Svelte/TypeScript); purely UI, no C++ or JNI impact |
 | ~b9004–b9016 | `src/llama-io.h` | `llama_io_read_i` interface changed: `read(size_t)→read(void*,size_t)`, `read_to(void*,size_t)` removed, new `read_tensor(tensor,offset,size)` added; `llama_io_write_buffer`/`llama_io_read_buffer` now batch backend tensor ops in destructors for performance; internal state-save/load path, not called by project |
 | ~b9004–b9016 | `tools/server/server-context.cpp` | Static `server_get_checkpoint()` (returns by value) renamed to `server_prompt_checkpoint_update()` (takes `server_prompt_checkpoint &` by reference, in-place update); compiled directly into jllama, no call site in project code |
-| ~b9004–b9016 | `common/arg.cpp` + docs | Speculative decoding CLI args renamed: `--draft`/`--draft-n`/`--draft-max` → deprecated (use `--spec-draft-n-max`); `-md`/`--model-draft` → `--spec-draft-model`; `-hfd`/`--hf-repo-draft` → `--spec-draft-hf`; `--spec-ngram-size-n/m/min-hits` → type-specific `--spec-ngram-simple-*`/`--spec-ngram-map-k-*`/`--spec-ngram-map-k4v-*`; env vars similarly renamed (`LLAMA_ARG_DRAFT_MAX`→`LLAMA_ARG_SPEC_DRAFT_N_MAX`, etc.); CLI-level only, Java layer passes params via JSON struct fields, no JNI impact |
+| ~b9004–b9016 | `common/arg.cpp` + docs | Speculative decoding CLI args renamed or removed: `--draft`/`--draft-n`/`--draft-max` and `--draft-min`/`--draft-n-min` were **REMOVED** (their handlers `throw` `std::invalid_argument` at parse time; they are not merely deprecated); other draft flags (`--draft-p-min`, `--ctx-size-draft`, `--device-draft`, `--gpu-layers-draft`, `--model-draft`) are kept as aliases for the new canonical `--spec-draft-*` names. **Java impact**: `ModelParameters.setDraftMax`/`setDraftMin` emitted the removed flags, so model load threw; both now emit the canonical `--spec-draft-n-max`/`--spec-draft-n-min`. Other `set*Draft` methods were updated to the canonical names for forward compatibility. Env vars were also renamed (`LLAMA_ARG_DRAFT_MAX`→`LLAMA_ARG_SPEC_DRAFT_N_MAX`, etc.) |
 | ~b9004–b9016 | `ggml/src/ggml-cuda/ggml-cuda.cu` | PCI bus ID detection replaced `snprintf` with `cudaDeviceGetPCIBusId` (buffer 16→32 bytes); HIP/MUSA compat headers gain `cudaDeviceGetPCIBusId` alias; internal CUDA backend |
 | ~b9004–b9016 | `ggml/src/ggml-opencl/` | Adreno MoE MXFP4: new `kernel_convert_block_mxfp4_trans4_ns`/`restore` kernels in `cvt.cl`; new `gemm_moe_mxfp4_f32_ns`, `gemv_moe_mxfp4_f32_ns`, `moe_reorder_b`, `moe_sort_by_expert` kernel files; GPU-side router reorder replaces CPU-side preprocessing; `q_img` created for GEMM path; internal OpenCL backend |
 | ~b9004–b9016 | `ggml/src/ggml-vulkan/ggml-vulkan.cpp` | `GGML_VK_MAX_NODES 8192` macro removed (node limit now determined differently); internal Vulkan backend |
diff --git a/src/main/java/de/kherud/llama/ModelParameters.java b/src/main/java/de/kherud/llama/ModelParameters.java
index 1f739a5c..7c037eb9 100644
--- a/src/main/java/de/kherud/llama/ModelParameters.java
+++ b/src/main/java/de/kherud/llama/ModelParameters.java
@@ -1231,13 +1231,13 @@ public ModelParameters enableLogTimestamps() {
     }
 
     /**
-     * Set the number of tokens to draft for speculative decoding.
+     * Set the maximum number of tokens to draft for speculative decoding.
      *
      * @param draftMax the number of tokens to draft for speculative decoding
      * @return this builder
      */
     public ModelParameters setDraftMax(int draftMax) {
-        parameters.put("--draft-max", String.valueOf(draftMax));
+        parameters.put("--spec-draft-n-max", String.valueOf(draftMax));
         return this;
     }
 
@@ -1248,7 +1248,7 @@ public ModelParameters setDraftMax(int draftMax) {
      * @return this builder
      */
     public ModelParameters setDraftMin(int draftMin) {
-        parameters.put("--draft-min", String.valueOf(draftMin));
+        parameters.put("--spec-draft-n-min", String.valueOf(draftMin));
         return this;
     }
 
@@ -1259,7 +1259,7 @@ public ModelParameters setDraftMin(int draftMin) {
      * @return this builder
      */
     public ModelParameters setDraftPMin(float draftPMin) {
-        parameters.put("--draft-p-min", String.valueOf(draftPMin));
+        parameters.put("--spec-draft-p-min", String.valueOf(draftPMin));
         return this;
     }
 
@@ -1270,7 +1270,7 @@ public ModelParameters setDraftPMin(float draftPMin) {
      * @return this builder
      */
     public ModelParameters setCtxSizeDraft(int ctxSizeDraft) {
-        parameters.put("--ctx-size-draft", String.valueOf(ctxSizeDraft));
+        parameters.put("--spec-draft-ctx-size", String.valueOf(ctxSizeDraft));
         return this;
     }
 
@@ -1281,7 +1281,7 @@ public ModelParameters setCtxSizeDraft(int ctxSizeDraft) {
      * @return this builder
      */
     public ModelParameters setDeviceDraft(String deviceDraft) {
-        parameters.put("--device-draft", deviceDraft);
+        parameters.put("--spec-draft-device", deviceDraft);
         return this;
     }
 
@@ -1292,7 +1292,7 @@ public ModelParameters setDeviceDraft(String deviceDraft) {
      * @return this builder
      */
     public ModelParameters setGpuLayersDraft(int gpuLayersDraft) {
-        parameters.put("--gpu-layers-draft", String.valueOf(gpuLayersDraft));
+        parameters.put("--spec-draft-ngl", String.valueOf(gpuLayersDraft));
         return this;
     }
 
@@ -1303,7 +1303,7 @@ public ModelParameters setGpuLayersDraft(int gpuLayersDraft) {
      * @return this builder
      */
     public ModelParameters setModelDraft(String modelDraft) {
-        parameters.put("--model-draft", modelDraft);
+        parameters.put("--spec-draft-model", modelDraft);
         return this;
     }
 
diff --git a/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java b/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java
index ae7b0a8a..d3945f4a 100644
--- a/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java
+++ b/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java
@@ -894,43 +894,51 @@ public void testAddControlVector() {
     @Test
     public void testSetModelDraft() {
         ModelParameters p = new ModelParameters().setModelDraft("/path/to/draft.gguf");
-        assertEquals("/path/to/draft.gguf", p.parameters.get("--model-draft"));
+        assertEquals("/path/to/draft.gguf", p.parameters.get("--spec-draft-model"));
     }
 
     @Test
     public void testSetCtxSizeDraft() {
         ModelParameters p = new ModelParameters().setCtxSizeDraft(512);
-        assertEquals("512", p.parameters.get("--ctx-size-draft"));
+        assertEquals("512", p.parameters.get("--spec-draft-ctx-size"));
     }
 
     @Test
     public void testSetDeviceDraft() {
         ModelParameters p = new ModelParameters().setDeviceDraft("cuda0");
-        assertEquals("cuda0", p.parameters.get("--device-draft"));
+        assertEquals("cuda0", p.parameters.get("--spec-draft-device"));
     }
 
     @Test
     public void testSetGpuLayersDraft() {
         ModelParameters p = new ModelParameters().setGpuLayersDraft(16);
-        assertEquals("16", p.parameters.get("--gpu-layers-draft"));
+        assertEquals("16", p.parameters.get("--spec-draft-ngl"));
     }
 
     @Test
     public void testSetDraftMax() {
+        // Regression: --draft-max was REMOVED in b9016 and now throws std::invalid_argument
+        // at model load. Must use --spec-draft-n-max.
         ModelParameters p = new ModelParameters().setDraftMax(8);
-        assertEquals("8", p.parameters.get("--draft-max"));
+        assertEquals("8", p.parameters.get("--spec-draft-n-max"));
+        assertFalse("--draft-max throws on b9016+; must not appear in args",
+                p.parameters.containsKey("--draft-max"));
     }
 
     @Test
     public void testSetDraftMin() {
+        // Regression: --draft-min was REMOVED in b9016 and now throws std::invalid_argument
+        // at model load. Must use --spec-draft-n-min.
         ModelParameters p = new ModelParameters().setDraftMin(2);
-        assertEquals("2", p.parameters.get("--draft-min"));
+        assertEquals("2", p.parameters.get("--spec-draft-n-min"));
+        assertFalse("--draft-min throws on b9016+; must not appear in args",
+                p.parameters.containsKey("--draft-min"));
     }
 
     @Test
     public void testSetDraftPMin() {
         ModelParameters p = new ModelParameters().setDraftPMin(0.5f);
-        assertEquals("0.5", p.parameters.get("--draft-p-min"));
+        assertEquals("0.5", p.parameters.get("--spec-draft-p-min"));
     }
 
     // -------------------------------------------------------------------------