From cd3f7db61502603d5baa2f2f0d7891a618e5a555 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 20:47:09 +0000 Subject: [PATCH 01/10] Upgrade llama.cpp from b8953 to b8962 Breaking changes in b8962 that affect this project: - task_params::to_json() drops speculative.n_max/n_min/p_min from output; only speculative.type remains. Update test_server.cpp accordingly. Breaking changes in b8962 that don't affect project code directly: - struct cpu_params renamed to common_cpu_params (and related functions) - common_params_speculative restructured with nested sub-structs (.draft.*, .ngram_cache.*, .ngram_mod.*, etc.) - common_arg::is_sparam split into is_sampling + is_spec New in b8962: - common_speculative_n_max() / common_speculative_n_min() public API - CANN backend: fused SwiGLU/GeGLU, softplus, set, cumsum, diag, fill, tri, solve_tri ops; improved L2 norm, cross entropy, get/set_rows - Vulkan: timestamp query sync fix - WebGPU: Q1_0 quantization support; SSM scan x/B/C overlap handling https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk --- CLAUDE.md | 9 +++++++-- CMakeLists.txt | 2 +- README.md | 2 +- src/test/cpp/test_server.cpp | 6 +++--- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 0d787ea2..2eeb875a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI. -Current llama.cpp pinned version: **b8953** +Current llama.cpp pinned version: **b8962** ## Upgrading CUDA Version @@ -183,7 +183,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. 
ren `ggml/include/ggml.h`, `ggml/include/ggml-backend.h`, `ggml/include/ggml-opt.h`, `ggml-alloc.h`, `ggml-cpu.h`, `peg-parser.h`, `base64.hpp` -**Known breaking changes by version range** (b5022 → b8953): +**Known breaking changes by version range** (b5022 → b8962): | Version | File | Change | |---------|------|--------| @@ -218,6 +218,11 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren | ~b8913–b8953 | `tools/server/server-http.h` | New `uploaded_file` struct; `files` map type changed from `map` to `map`; upstream server sources compiled directly — no project impact | | ~b8913–b8953 | `src/llama-quant.cpp` | Default quantization ftype changed from `LLAMA_FTYPE_MOSTLY_Q5_1` to `LLAMA_FTYPE_MOSTLY_Q8_0`; upstream only | | ~b8913–b8953 | `src/models/llama.cpp`, `qwen3.cpp`, `qwen3moe.cpp` | Removed duplicate `ggml_mul` for `wo_s` scale (now handled exclusively by `build_attn`); upstream only | +| ~b8953–b8962 | `common/common.h` | `struct cpu_params` → `struct common_cpu_params`; `cpu_get_num_physical_cores()` → `common_cpu_get_num_physical_cores()`; `cpu_get_num_math()` → `common_cpu_get_num_math()`; not used directly by project | +| ~b8953–b8962 | `common/common.h` | `common_params_speculative` fully restructured with nested sub-structs: `.mparams_dft`/`.model_dft`/`.cparams_dft`/`.n_max`/`.n_min`/`.p_split`/`.p_min` → `.draft.mparams`/`.draft.model`/`.draft.cparams`/`.draft.n_max`/`.draft.n_min`/`.draft.p_split`/`.draft.p_min`; ngram fields moved to `.ngram_cache`/`.ngram_mod`/`.ngram_simple`/etc sub-structs; not referenced by project directly | +| ~b8953–b8962 | `common/arg.h` | `is_sparam` bool split into `is_sampling` + `is_spec`; `set_sparam()` split into `set_sampling()` + `set_spec()`; not used by project | +| ~b8953–b8962 | `tools/server/server-task.cpp` | `task_params::to_json()` drops `"speculative.n_max"`, `"speculative.n_min"`, `"speculative.p_min"` from output; only `"speculative.type"` remains; test 
`SlotParamsToJson.SpeculativeFields_Present` updated accordingly | +| ~b8953–b8962 | `common/speculative.h` | New public API: `common_speculative_n_max()` and `common_speculative_n_min()` added; server-context.cpp uses these instead of direct field access; no project changes required | ## Build Commands diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d7213b7..c597272d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,7 @@ set(GGML_AVX512 OFF CACHE BOOL "" FORCE) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b8953 + GIT_TAG b8962 ) FetchContent_MakeAvailable(llama.cpp) diff --git a/README.md b/README.md index bd2e05e0..e9abd56d 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational) -[![llama.cpp b8953](https://img.shields.io/badge/llama.cpp-%23b8953-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8953) +[![llama.cpp b8962](https://img.shields.io/badge/llama.cpp-%23b8962-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8962) # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) diff --git a/src/test/cpp/test_server.cpp b/src/test/cpp/test_server.cpp index 82801deb..ee56d790 100644 --- a/src/test/cpp/test_server.cpp +++ b/src/test/cpp/test_server.cpp @@ -243,9 +243,9 @@ TEST(SlotParamsToJson, SpeculativeFields_Present) { task_params p; const json j = p.to_json(); - EXPECT_TRUE(j.contains("speculative.n_max")); - EXPECT_TRUE(j.contains("speculative.n_min")); - EXPECT_TRUE(j.contains("speculative.p_min")); + // b8962: only speculative.type is serialised; n_max/n_min/p_min are + // input-only (consumed by params_from_json_cmpl, not emitted by to_json) + EXPECT_TRUE(j.contains("speculative.type")); } TEST(SlotParamsToJson, GrammarTriggers_IsArrayByDefault) { From 24be512027d4381273553c555ceea14a6a785570 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 20:53:39 +0000 
Subject: [PATCH 02/10] Upgrade llama.cpp from b8962 to b8982 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Breaking changes in b8982 that don't affect project code directly: - common_sampler_accept: 3rd param renamed accept_grammar → is_generated; semantics broadened so false also skips reasoning budget update - common_reasoning_budget_init: two overloads merged; prefill_tokens param removed; callers feed prefill via llama_sampler_accept() loop after init - ggml_cuda_op_ssm_conv: new optional bias_add_node param; SSM_CONV+ADD+SILU CUDA fusion now supported - speculative.cpp: p_min confidence check moved before result push (fix: low-confidence draft tokens now discarded entirely, not appended then ignored) - server-context.cpp: n_draft_total accounting moved to generation site (fix) New in b8982: - Reasoning budget re-arms on subsequent `<think>` tags (multi-block support) - CUDA: flash attention for DKQ=320/DV=256 (Mistral Small 4, GQA=32) - CUDA: fused SSM_CONV + channel-wise bias ADD + SiLU kernel - CUDA: NVFP4 native Blackwell MMQ path (unified with MXFP4 via template) - CUDA: quantize_mmq_fp4_cuda replaces quantize_mmq_mxfp4_cuda (covers both) - ARM: SVE Q8_0 4x8 GEMM kernel for 256-bit SVE with MATMUL_INT8 - PPC: big-endian / AIX tinyBLAS fallback path - Vulkan: Q4_K scale extraction rewritten via packed uint32 reads (bug fix) - WebGPU: flash-attn NONE path guard; subgroup-matrix path gated on capability - ggml: version patch bumped to 0.10.1; backend-meta AllReduce delay fix; RISCV SpacemiT xsmtvdotii extension support - common/log: singleton intentionally leaked to avoid Windows DLL teardown hang https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk --- CLAUDE.md | 9 +++++++-- CMakeLists.txt | 2 +- README.md | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 2eeb875a..27d6caa3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code
(claude.ai/code) when working with co Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI. -Current llama.cpp pinned version: **b8962** +Current llama.cpp pinned version: **b8982** ## Upgrading CUDA Version @@ -183,7 +183,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren `ggml/include/ggml.h`, `ggml/include/ggml-backend.h`, `ggml/include/ggml-opt.h`, `ggml-alloc.h`, `ggml-cpu.h`, `peg-parser.h`, `base64.hpp` -**Known breaking changes by version range** (b5022 → b8962): +**Known breaking changes by version range** (b5022 → b8982): | Version | File | Change | |---------|------|--------| @@ -223,6 +223,11 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren | ~b8953–b8962 | `common/arg.h` | `is_sparam` bool split into `is_sampling` + `is_spec`; `set_sparam()` split into `set_sampling()` + `set_spec()`; not used by project | | ~b8953–b8962 | `tools/server/server-task.cpp` | `task_params::to_json()` drops `"speculative.n_max"`, `"speculative.n_min"`, `"speculative.p_min"` from output; only `"speculative.type"` remains; test `SlotParamsToJson.SpeculativeFields_Present` updated accordingly | | ~b8953–b8962 | `common/speculative.h` | New public API: `common_speculative_n_max()` and `common_speculative_n_min()` added; server-context.cpp uses these instead of direct field access; no project changes required | +| ~b8962–b8982 | `common/sampling.h` | `common_sampler_accept` 3rd param renamed `accept_grammar` → `is_generated`; semantics broadened: `false` now also skips reasoning budget update (not just grammar); no project call sites affected | +| ~b8962–b8982 | `common/reasoning-budget.h` | Two overloads merged: `prefill_tokens` variant removed; new single overload takes `initial_state = REASONING_BUDGET_IDLE`; prefill now fed via `llama_sampler_accept()` loop after 
init; not called directly by project | +| ~b8962–b8982 | `ggml/src/ggml-cuda/ssm-conv.cuh` | `ggml_cuda_op_ssm_conv` gained optional `bias_add_node` param; `SSM_CONV + ADD + SILU` fusion now supported; internal CUDA code, no project changes required | +| ~b8962–b8982 | `common/speculative.cpp` | Draft token confidence check (`p_min`) moved before push to result: low-confidence tokens are now discarded entirely rather than included then ignored; behavior fix, no project changes required | +| ~b8962–b8982 | `tools/server/server-context.cpp` | `n_draft_total` accounting moved to draft generation site instead of acceptance site (bug fix); upstream only | ## Build Commands diff --git a/CMakeLists.txt b/CMakeLists.txt index c597272d..08c7e423 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,7 @@ set(GGML_AVX512 OFF CACHE BOOL "" FORCE) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b8962 + GIT_TAG b8982 ) FetchContent_MakeAvailable(llama.cpp) diff --git a/README.md b/README.md index e9abd56d..370ce1c5 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational) -[![llama.cpp b8962](https://img.shields.io/badge/llama.cpp-%23b8962-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8962) +[![llama.cpp b8982](https://img.shields.io/badge/llama.cpp-%23b8982-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8982) # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) From e701e036eebecb4b96e0a25be872c0afe68a0cd0 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 20:58:07 +0000 Subject: [PATCH 03/10] Upgrade llama.cpp from b8982 to b8994 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No project C++ changes required. 
Key upstream changes: - CUDA: fixed swapped get/set_tensor_2d_async function pointers - Vulkan: added dpitch param to buffer write 2d, implements set/get_tensor_2d - speculative.cpp: checkpoint helpers renamed (draft_ prefix removed), ckpt_size removed - arg.cpp: CLI typo --spec--draft-p-split → --spec-draft-p-split - mmap: Windows >2 GB file fix using _ftelli64/_fseeki64 - httplib: bumped to v0.43.2 (Windows FILE_SHARE_WRITE, DNS cancel, mbedTLS fixes) - server-context: LLAMA_TRACE env variable for slot acceptance tracing All 413 C++ tests pass. https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk --- CLAUDE.md | 9 ++++++++- CMakeLists.txt | 2 +- README.md | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 27d6caa3..bbb344ff 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI. -Current llama.cpp pinned version: **b8982** +Current llama.cpp pinned version: **b8994** ## Upgrading CUDA Version @@ -228,6 +228,13 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. 
ren | ~b8962–b8982 | `ggml/src/ggml-cuda/ssm-conv.cuh` | `ggml_cuda_op_ssm_conv` gained optional `bias_add_node` param; `SSM_CONV + ADD + SILU` fusion now supported; internal CUDA code, no project changes required | | ~b8962–b8982 | `common/speculative.cpp` | Draft token confidence check (`p_min`) moved before push to result: low-confidence tokens are now discarded entirely rather than included then ignored; behavior fix, no project changes required | | ~b8962–b8982 | `tools/server/server-context.cpp` | `n_draft_total` accounting moved to draft generation site instead of acceptance site (bug fix); upstream only | +| ~b8982–b8994 | `ggml/src/ggml-cuda.cu` | `ggml_backend_cuda_i` struct: `.get_tensor_2d_async` and `.set_tensor_2d_async` function pointers were swapped (get pointed to set impl and vice versa); corrected; internal CUDA backend, no project changes required | +| ~b8982–b8994 | `ggml/src/ggml-vulkan.cpp` | `ggml_vk_buffer_write_2d_async` and `ggml_vk_buffer_write_2d` gained a `dpitch` parameter; Vulkan now implements `set_tensor_2d`/`get_tensor_2d` in buffer interface; internal backend code, no project changes required | +| ~b8982–b8994 | `common/speculative.cpp` | Checkpoint helpers renamed: `draft_create_checkpoint` → `create_checkpoint`, `draft_restore_checkpoint` → `restore_checkpoint`; `ckpt_size` field removed (size computed from context directly); internal speculative module, not called by project | +| ~b8982–b8994 | `common/arg.cpp` | CLI option typo fixed: `--spec--draft-p-split` → `--spec-draft-p-split` (extra dash removed); CLI-only, no project changes required | +| ~b8982–b8994 | `src/llama-mmap.cpp` | Windows large-file (>2 GB) fix: `ftell`/`fseek` replaced with `_ftelli64`/`_fseeki64`; upstream only | +| ~b8982–b8994 | `tools/server/httplib.h` | cpp-httplib bumped to v0.43.2: Windows `FILE_SHARE_WRITE` fix, Linux DNS cancel race fix, mbedTLS `close_notify` fix; upstream server header, no project changes required | +| ~b8982–b8994 | 
`tools/server/server-context.cpp` | New `LLAMA_TRACE` env variable enables slot acceptance tracing; upstream only | ## Build Commands diff --git a/CMakeLists.txt b/CMakeLists.txt index 08c7e423..cdeb9695 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,7 @@ set(GGML_AVX512 OFF CACHE BOOL "" FORCE) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b8982 + GIT_TAG b8994 ) FetchContent_MakeAvailable(llama.cpp) diff --git a/README.md b/README.md index 370ce1c5..ab0b5514 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational) -[![llama.cpp b8982](https://img.shields.io/badge/llama.cpp-%23b8982-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8982) +[![llama.cpp b8994](https://img.shields.io/badge/llama.cpp-%23b8994-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8994) # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) From 78f6b1ffb458adc82773dee07f2444a1293e689a Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:01:19 +0000 Subject: [PATCH 04/10] Upgrade llama.cpp from b8994 to b9004 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No project C++ changes required. All 413 C++ unit tests pass. 
b8994→b9004 upstream changes (no project impact): - Vulkan FA: separate k_type/v_type params in coopmat2 pipeline; CREATE_FA_CM2_MIXED macro; new spec constants 12-15 (FaTypeK/FaTypeV/FaBlockBytesK/FaBlockBytesV); DECODEFUNC/NEEDS_INIT_IQ_SHMEM macros removed - WebGPU: vectorized mul_mat condition fix (removed dst->ne[1] % 4 == 0 guard) - Hexagon HTP: FA exp2 half-precision option; unary-op non-contiguous tensor fix - webUI: major Svelte/TypeScript component reorganization (no C++ impact) https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk --- CLAUDE.md | 8 ++++++-- CMakeLists.txt | 2 +- README.md | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index bbb344ff..3cc3ca20 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI. -Current llama.cpp pinned version: **b8994** +Current llama.cpp pinned version: **b9004** ## Upgrading CUDA Version @@ -183,7 +183,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren `ggml/include/ggml.h`, `ggml/include/ggml-backend.h`, `ggml/include/ggml-opt.h`, `ggml-alloc.h`, `ggml-cpu.h`, `peg-parser.h`, `base64.hpp` -**Known breaking changes by version range** (b5022 → b8982): +**Known breaking changes by version range** (b5022 → b9004): | Version | File | Change | |---------|------|--------| @@ -235,6 +235,10 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. 
ren | ~b8982–b8994 | `src/llama-mmap.cpp` | Windows large-file (>2 GB) fix: `ftell`/`fseek` replaced with `_ftelli64`/`_fseeki64`; upstream only | | ~b8982–b8994 | `tools/server/httplib.h` | cpp-httplib bumped to v0.43.2: Windows `FILE_SHARE_WRITE` fix, Linux DNS cancel race fix, mbedTLS `close_notify` fix; upstream server header, no project changes required | | ~b8982–b8994 | `tools/server/server-context.cpp` | New `LLAMA_TRACE` env variable enables slot acceptance tracing; upstream only | +| ~b8994–b9004 | `ggml/src/ggml-vulkan/ggml-vulkan.cpp` | `vk_fa_pipeline_state` gains `k_type`/`v_type` fields; `get_fa_tuning_params_coopmat2` now takes separate `k_type`/`v_type` params; mixed K/V type FA pipeline creation refactored to `CREATE_FA_CM2_MIXED()` macro; `flash_attn_cm2.comp` shader uses runtime `FaTypeK`/`FaTypeV` spec constants (spec constants 12–15 added); `DECODEFUNC`/`NEEDS_INIT_IQ_SHMEM` macros removed; internal Vulkan backend, no project changes required | +| ~b8994–b9004 | `ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp` | `get_mul_mat_fast_pipeline` vectorized-path condition fixed: `dst->ne[1] % 4 == 0` check removed (was preventing vectorization for non-multiple-of-4 batch sizes); internal WebGPU backend, no project changes required | +| ~b8994–b9004 | `ggml/src/ggml-hexagon/` | Hexagon HTP backend: FA `exp2` half-precision option, unary-op non-contiguous tensor fix; internal DSP backend, no project changes required | +| ~b8994–b9004 | `tools/server/webui/` | Major frontend component reorganization (Svelte/TypeScript); purely UI, no C++ or JNI impact | ## Build Commands diff --git a/CMakeLists.txt b/CMakeLists.txt index cdeb9695..ef4bc32c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,7 @@ set(GGML_AVX512 OFF CACHE BOOL "" FORCE) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b8994 + GIT_TAG b9004 ) FetchContent_MakeAvailable(llama.cpp) diff --git a/README.md b/README.md index 
ab0b5514..a202f6c0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational) -[![llama.cpp b8994](https://img.shields.io/badge/llama.cpp-%23b8994-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8994) +[![llama.cpp b9004](https://img.shields.io/badge/llama.cpp-%23b9004-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9004) # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) From bb7818f1a415f62a1b06d55e7572c7114e2742d4 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:06:04 +0000 Subject: [PATCH 05/10] Upgrade llama.cpp from b9004 to b9016 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No project C++ changes required. All 413 C++ unit tests pass. b9004→b9016 upstream changes (no project impact): - llama-io.h: read_i interface refactored (read/read_to → read/read_tensor); llama_io_write/read_buffer batch backend tensor ops in destructors - server-context.cpp: static server_get_checkpoint renamed to server_prompt_checkpoint_update (in-place ref param) - arg.cpp: speculative decoding CLI args renamed to --spec-draft-* prefix; env vars renamed LLAMA_ARG_DRAFT_* → LLAMA_ARG_SPEC_DRAFT_* - ggml-cuda: PCI bus ID via cudaDeviceGetPCIBusId (buffer 16→32 bytes) - ggml-opencl: Adreno MoE MXFP4 GPU-side router reorder; new ns kernels - ggml-vulkan: GGML_VK_MAX_NODES macro removed - ggml-webgpu: row_norm gains GGML_OP_NORM support + type parameterization - llama-model: rope_yarn_log_mul get_key required flag fixed (false not 0.0f) - common/chat: extract common_chat_templates_generation_prompt helper https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk --- CLAUDE.md | 13 +++++++++++-- CMakeLists.txt | 2 +- README.md | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 3cc3ca20..13a2cd85 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides 
guidance to Claude Code (claude.ai/code) when working with co Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI. -Current llama.cpp pinned version: **b9004** +Current llama.cpp pinned version: **b9016** ## Upgrading CUDA Version @@ -183,7 +183,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren `ggml/include/ggml.h`, `ggml/include/ggml-backend.h`, `ggml/include/ggml-opt.h`, `ggml-alloc.h`, `ggml-cpu.h`, `peg-parser.h`, `base64.hpp` -**Known breaking changes by version range** (b5022 → b9004): +**Known breaking changes by version range** (b5022 → b9016): | Version | File | Change | |---------|------|--------| @@ -239,6 +239,15 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren | ~b8994–b9004 | `ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp` | `get_mul_mat_fast_pipeline` vectorized-path condition fixed: `dst->ne[1] % 4 == 0` check removed (was preventing vectorization for non-multiple-of-4 batch sizes); internal WebGPU backend, no project changes required | | ~b8994–b9004 | `ggml/src/ggml-hexagon/` | Hexagon HTP backend: FA `exp2` half-precision option, unary-op non-contiguous tensor fix; internal DSP backend, no project changes required | | ~b8994–b9004 | `tools/server/webui/` | Major frontend component reorganization (Svelte/TypeScript); purely UI, no C++ or JNI impact | +| ~b9004–b9016 | `src/llama-io.h` | `llama_io_read_i` interface changed: `read(size_t)→read(void*,size_t)`, `read_to(void*,size_t)` removed, new `read_tensor(tensor,offset,size)` added; `llama_io_write_buffer`/`llama_io_read_buffer` now batch backend tensor ops in destructors for performance; internal state-save/load path, not called by project | +| ~b9004–b9016 | `tools/server/server-context.cpp` | Static `server_get_checkpoint()` (returns by value) renamed to 
`server_prompt_checkpoint_update()` (takes `server_prompt_checkpoint &` by reference, in-place update); compiled directly into jllama, no call site in project code | +| ~b9004–b9016 | `common/arg.cpp` + docs | Speculative decoding CLI args renamed: `--draft`/`--draft-n`/`--draft-max` → deprecated (use `--spec-draft-n-max`); `-md`/`--model-draft` → `--spec-draft-model`; `-hfd`/`--hf-repo-draft` → `--spec-draft-hf`; `--spec-ngram-size-n/m/min-hits` → type-specific `--spec-ngram-simple-*`/`--spec-ngram-map-k-*`/`--spec-ngram-map-k4v-*`; env vars similarly renamed (`LLAMA_ARG_DRAFT_MAX`→`LLAMA_ARG_SPEC_DRAFT_N_MAX`, etc.); CLI-level only, Java layer passes params via JSON struct fields, no JNI impact | +| ~b9004–b9016 | `ggml/src/ggml-cuda/ggml-cuda.cu` | PCI bus ID detection replaced `snprintf` with `cudaDeviceGetPCIBusId` (buffer 16→32 bytes); HIP/MUSA compat headers gain `cudaDeviceGetPCIBusId` alias; internal CUDA backend | +| ~b9004–b9016 | `ggml/src/ggml-opencl/` | Adreno MoE MXFP4: new `kernel_convert_block_mxfp4_trans4_ns`/`restore` kernels in `cvt.cl`; new `gemm_moe_mxfp4_f32_ns`, `gemv_moe_mxfp4_f32_ns`, `moe_reorder_b`, `moe_sort_by_expert` kernel files; GPU-side router reorder replaces CPU-side preprocessing; `q_img` created for GEMM path; internal OpenCL backend | +| ~b9004–b9016 | `ggml/src/ggml-vulkan/ggml-vulkan.cpp` | `GGML_VK_MAX_NODES 8192` macro removed (node limit now determined differently); internal Vulkan backend | +| ~b9004–b9016 | `ggml/src/ggml-webgpu/` | `ggml_webgpu_row_norm_pipeline_key` gains `src_type`/`dst_type` fields; `GGML_OP_NORM` now supported alongside `GGML_OP_RMS_NORM`/`GGML_OP_L2_NORM`; `row_norm.wgsl` gains SRC_TYPE/DST_TYPE parameterization and NORM two-pass algorithm; internal WebGPU backend | +| ~b9004–b9016 | `src/llama-model.cpp` | `rope_yarn_log_mul` `get_key` call changed from `required=0.0f` to `required=false`; fixes Mistral YaRN log_mul loading; internal model loading, no project impact | +| ~b9004–b9016 | 
`common/chat.cpp` | `common_chat_templates_generation_prompt()` extracted from `common_chat_templates_apply_jinja()`; internal refactor, no API change | ## Build Commands diff --git a/CMakeLists.txt b/CMakeLists.txt index ef4bc32c..c5294f1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,7 @@ set(GGML_AVX512 OFF CACHE BOOL "" FORCE) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b9004 + GIT_TAG b9016 ) FetchContent_MakeAvailable(llama.cpp) diff --git a/README.md b/README.md index a202f6c0..905fbc4a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational) -[![llama.cpp b9004](https://img.shields.io/badge/llama.cpp-%23b9004-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9004) +[![llama.cpp b9016](https://img.shields.io/badge/llama.cpp-%23b9016-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9016) # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) From a368ab7df94df8d74a9c4c0466ca51421d911b75 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:24:26 +0000 Subject: [PATCH 06/10] Upgrade llama.cpp from b9016 to b9022 https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk --- CLAUDE.md | 13 +++++++++++-- CMakeLists.txt | 2 +- README.md | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 13a2cd85..95c74282 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI. 
-Current llama.cpp pinned version: **b9016** +Current llama.cpp pinned version: **b9022** ## Upgrading CUDA Version @@ -183,7 +183,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren `ggml/include/ggml.h`, `ggml/include/ggml-backend.h`, `ggml/include/ggml-opt.h`, `ggml-alloc.h`, `ggml-cpu.h`, `peg-parser.h`, `base64.hpp` -**Known breaking changes by version range** (b5022 → b9016): +**Known breaking changes by version range** (b5022 → b9022): | Version | File | Change | |---------|------|--------| @@ -248,6 +248,15 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren | ~b9004–b9016 | `ggml/src/ggml-webgpu/` | `ggml_webgpu_row_norm_pipeline_key` gains `src_type`/`dst_type` fields; `GGML_OP_NORM` now supported alongside `GGML_OP_RMS_NORM`/`GGML_OP_L2_NORM`; `row_norm.wgsl` gains SRC_TYPE/DST_TYPE parameterization and NORM two-pass algorithm; internal WebGPU backend | | ~b9004–b9016 | `src/llama-model.cpp` | `rope_yarn_log_mul` `get_key` call changed from `required=0.0f` to `required=false`; fixes Mistral YaRN log_mul loading; internal model loading, no project impact | | ~b9004–b9016 | `common/chat.cpp` | `common_chat_templates_generation_prompt()` extracted from `common_chat_templates_apply_jinja()`; internal refactor, no API change | +| ~b9016–b9022 | `src/llama-model.h` + `src/llama-model.cpp` + `src/models/` | `llama_model` becomes abstract base with pure virtual methods (`load_stats`, `load_hparams`, `load_vocab`, `load_tensors`, `load_arch_hparams`, `load_arch_tensors`, `build_arch_graph`); `load_arch()` removed; new intermediate `llama_model_base` class provides concrete implementations; per-arch subclasses (e.g. `llama_model_llama`, `llama_model_gemma2`) in `src/models/`; factory `llama_model_create(llm_arch, params)` and `llama_model_create(ml, params)` replace direct instantiation; `LLAMA_LOAD_LOCALS` convenience macro added; public C API (`llama_model_load_from_file` etc.) 
unchanged — no project impact | +| ~b9016–b9022 | `src/models/` | Many model files renamed: `cohere2-iswa.cpp`→`cohere2.cpp`, `gemma2-iswa.cpp`→`gemma2.cpp`, `gemma3n-iswa.cpp`→`gemma3n.cpp`, `gemma4-iswa.cpp`→`gemma4.cpp`, `mimo2-iswa.cpp`→`mimo2.cpp`, `openai-moe-iswa.cpp`→`openai-moe.cpp`, `pangu-embedded.cpp`→`pangu-embed.cpp`, `qwen3vl-moe.cpp`→`qwen3vlmoe.cpp`, `step35-iswa.cpp`→`step35.cpp`; new model files added (`deepseek2ocr.cpp`, `glm-dsa.cpp`, `granite-moe.cpp`, `hunyuan-vl.cpp`, `jina-bert-v2/v3.cpp`, `lfm2moe.cpp`, `llama-embed.cpp`, `mamba2.cpp`, `minicpm.cpp`, `mistral4.cpp`, `nemotron-h-moe.cpp`, `nomic-bert.cpp`, `nomic-bert-moe.cpp`, `phimoe.cpp`); upstream only, no project changes required | +| ~b9016–b9022 | `tools/server/server-context.cpp` | `server_prompt_checkpoint_update` (the renamed function from b9016) static function signature changed from returning by value to taking `server_prompt_checkpoint &` by reference; compiled directly into jllama, no project call site | +| ~b9016–b9022 | `tools/server/server-tools.cpp` | New built-in `get_datetime` tool added via new `server_tool_get_datetime` struct in `build_tools()`; no project changes required (handled automatically by compiled upstream source) | +| ~b9016–b9022 | `common/chat-auto-parser-generator.cpp` | `force_tools` variable removed from `build_tool_parser_json_native`, `build_tool_parser_tag_json`, `build_tool_parser_tag_tagged`; content before tool calls is now always `p.optional(p.content(...))` regardless of `tool_choice=required`; upstream only, no project changes required | +| ~b9016–b9022 | `common/chat-peg-parser.h/cpp` | New `optspace(const std::string & tag)` method added to `common_chat_peg_builder`; makes leading/trailing spaces in reasoning tags optional; upstream only, no project changes required | +| ~b9016–b9022 | `common/reasoning-budget.cpp` | Forced token logit now set to `+INFINITY` (previously left at whatever the model computed); reasoning budget enforcement is 
now absolute; upstream only, no project changes required | +| ~b9016–b9022 | `common/chat.cpp` | `thinking_start_tag` and `thinking_end_tag` now trimmed via `trim_whitespace()`; upstream only, no project changes required | +| ~b9016–b9022 | `examples/diffusion/` | `diffusion_generate` extracted from `diffusion-cli.cpp` to new `diffusion.h`/`diffusion.cpp` static library; enum names prefixed: `ORIGIN`→`DIFFUSION_ALGORITHM_ORIGIN`, `TIMESTEP_BASED`→`DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED` etc.; examples only, no project changes required | ## Build Commands diff --git a/CMakeLists.txt b/CMakeLists.txt index c5294f1e..efbfaa02 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,7 @@ set(GGML_AVX512 OFF CACHE BOOL "" FORCE) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b9016 + GIT_TAG b9022 ) FetchContent_MakeAvailable(llama.cpp) diff --git a/README.md b/README.md index 905fbc4a..d7222dce 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational) -[![llama.cpp b9016](https://img.shields.io/badge/llama.cpp-%23b9016-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9016) +[![llama.cpp b9022](https://img.shields.io/badge/llama.cpp-%23b9022-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9022) # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) From 12d62ffb6b8bc02a2518d51f065cb7ce56a723cc Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:32:24 +0000 Subject: [PATCH 07/10] Add ReasoningFormat enum and reasoning budget to InferenceParameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New ReasoningFormat enum (none/auto/deepseek/deepseek-legacy) mapping to the reasoning_format JSON field accepted by the server - InferenceParameters.setReasoningFormat(ReasoningFormat) — controls how thinking tokens from models like DeepSeek-R1 and 
QwQ are extracted - InferenceParameters.setReasoningBudgetTokens(int) — caps the number of reasoning tokens emitted before the model is forced to its response (-1 = unlimited) - 4 new C++ tests for reasoning_budget_tokens parsing in params_from_json_cmpl (default -1, positive value, zero, explicit -1); total now 417/417 passing https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk --- CLAUDE.md | 2 +- .../de/kherud/llama/InferenceParameters.java | 29 ++++++++++++ .../de/kherud/llama/args/ReasoningFormat.java | 46 +++++++++++++++++++ src/test/cpp/test_server.cpp | 28 +++++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 src/main/java/de/kherud/llama/args/ReasoningFormat.java diff --git a/CLAUDE.md b/CLAUDE.md index 95c74282..4a5096ad 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -426,7 +426,7 @@ ctest --test-dir build --output-on-failure -R "ResultsToJson" | `src/test/cpp/test_json_helpers.cpp` | 42 | All functions in `json_helpers.hpp`: `get_result_error_message`, `results_to_json`, `rerank_results_to_json`, `parse_encoding_format`, `extract_embedding_prompt`, `is_infill_request`, `parse_slot_prompt_similarity`, `parse_positive_int_config` | | `src/test/cpp/test_jni_helpers.cpp` | 36 | All functions in `jni_helpers.hpp` using a zero-filled `JNINativeInterface_` mock | -**Current total: 413 tests (all passing).** Branch: `claude/refactor-java-llama-d3lua`. +**Current total: 417 tests (all passing).** Branch: `claude/determined-volta-T8AoQ`. 
#### Upstream source location (in CMake build tree) diff --git a/src/main/java/de/kherud/llama/InferenceParameters.java b/src/main/java/de/kherud/llama/InferenceParameters.java index 70e94401..e18a86b6 100644 --- a/src/main/java/de/kherud/llama/InferenceParameters.java +++ b/src/main/java/de/kherud/llama/InferenceParameters.java @@ -5,6 +5,7 @@ import java.util.Map; import de.kherud.llama.args.MiroStat; +import de.kherud.llama.args.ReasoningFormat; import de.kherud.llama.args.Sampler; /** @@ -52,6 +53,8 @@ public final class InferenceParameters extends JsonParameters { private static final String PARAM_USE_JINJA = "use_jinja"; private static final String PARAM_CHAT_TEMPLATE_KWARGS = "chat_template_kwargs"; private static final String PARAM_MESSAGES = "messages"; + private static final String PARAM_REASONING_FORMAT = "reasoning_format"; + private static final String PARAM_REASONING_BUDGET_TOKENS = "reasoning_budget_tokens"; public InferenceParameters(String prompt) { // we always need a prompt @@ -545,6 +548,32 @@ public InferenceParameters setMessages(String systemMessage, ListPassed as {@code "reasoning_format"} in inference requests. Only meaningful when the model + * uses a thinking tag (e.g. {@code <think>...</think>}) and chat-template rendering is active + * ({@link de.kherud.llama.InferenceParameters#setUseChatTemplate(boolean)}). + */ +public enum ReasoningFormat implements CliArg { + + /** + * Reasoning tokens are left in-line; no extraction is performed. + */ + NONE("none"), + + /** + * Automatically detect the reasoning format from the model's chat template. + * Equivalent to {@link #DEEPSEEK} in most cases. + */ + AUTO("auto"), + + /** + * Extract thinking-tag content into a separate {@code reasoning_content} field, + * including in streaming deltas. + */ + DEEPSEEK("deepseek"), + + /** + * Legacy DeepSeek format: extract thinking content into {@code reasoning_content} in + * non-streaming mode; leave inline in {@code <think>} tags during streaming.
+ */ + DEEPSEEK_LEGACY("deepseek-legacy"); + + private final String argValue; + + ReasoningFormat(String argValue) { + this.argValue = argValue; + } + + @Override + public String getArgValue() { + return argValue; + } +} diff --git a/src/test/cpp/test_server.cpp b/src/test/cpp/test_server.cpp index ee56d790..5d5235e6 100644 --- a/src/test/cpp/test_server.cpp +++ b/src/test/cpp/test_server.cpp @@ -1677,6 +1677,34 @@ TEST(ParamsFromJsonCmpl, NCmpl_AliasedFromN) { EXPECT_EQ(p.n_cmpl, 1); } +// ============================================================ +// params_from_json_cmpl — reasoning_budget_tokens +// reasoning_budget_tokens defaults to -1 (disabled). +// Any explicit value is stored directly in sampling.reasoning_budget_tokens. +// The tag-tokenisation paths (start/end/message) are skipped when tags are empty, +// so these tests do not require a vocab pointer. +// ============================================================ + +TEST(ParamsFromJsonCmpl, ReasoningBudgetTokens_Default_IsMinusOne) { + const auto p = parse_params({}); + EXPECT_EQ(p.sampling.reasoning_budget_tokens, -1); +} + +TEST(ParamsFromJsonCmpl, ReasoningBudgetTokens_SetPositive) { + const auto p = parse_params({{"reasoning_budget_tokens", 512}}); + EXPECT_EQ(p.sampling.reasoning_budget_tokens, 512); +} + +TEST(ParamsFromJsonCmpl, ReasoningBudgetTokens_Zero) { + const auto p = parse_params({{"reasoning_budget_tokens", 0}}); + EXPECT_EQ(p.sampling.reasoning_budget_tokens, 0); +} + +TEST(ParamsFromJsonCmpl, ReasoningBudgetTokens_ExplicitMinusOne_Disabled) { + const auto p = parse_params({{"reasoning_budget_tokens", -1}}); + EXPECT_EQ(p.sampling.reasoning_budget_tokens, -1); +} + // ============================================================ // params_from_json_cmpl — grammar type routing // Three distinct paths set grammar.type: From df7ccf4f0a53f35ee63e6d6ae311d929b373f51e Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:36:21 +0000 Subject: [PATCH 08/10] Add 
InferenceParametersTest coverage for setReasoningFormat and setReasoningBudgetTokens Tests all four ReasoningFormat enum values (none/auto/deepseek/deepseek-legacy) and the three budget token cases (positive, zero, -1/disabled), matching the pattern of every other setter in InferenceParameters. https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk --- .../kherud/llama/InferenceParametersTest.java | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/test/java/de/kherud/llama/InferenceParametersTest.java b/src/test/java/de/kherud/llama/InferenceParametersTest.java index 211706a2..c711b9bb 100644 --- a/src/test/java/de/kherud/llama/InferenceParametersTest.java +++ b/src/test/java/de/kherud/llama/InferenceParametersTest.java @@ -7,6 +7,7 @@ import java.util.Map; import de.kherud.llama.args.MiroStat; +import de.kherud.llama.args.ReasoningFormat; import de.kherud.llama.args.Sampler; import org.junit.Test; @@ -229,6 +230,52 @@ public void testSetChatTemplateKwargsEmpty() { assertEquals("{}", params.parameters.get("chat_template_kwargs")); } + // ------------------------------------------------------------------------- + // ReasoningFormat / ReasoningBudgetTokens + // ------------------------------------------------------------------------- + + @Test + public void testSetReasoningFormatNone() { + InferenceParameters params = new InferenceParameters("").setReasoningFormat(ReasoningFormat.NONE); + assertEquals("\"none\"", params.parameters.get("reasoning_format")); + } + + @Test + public void testSetReasoningFormatAuto() { + InferenceParameters params = new InferenceParameters("").setReasoningFormat(ReasoningFormat.AUTO); + assertEquals("\"auto\"", params.parameters.get("reasoning_format")); + } + + @Test + public void testSetReasoningFormatDeepseek() { + InferenceParameters params = new InferenceParameters("").setReasoningFormat(ReasoningFormat.DEEPSEEK); + assertEquals("\"deepseek\"", params.parameters.get("reasoning_format")); + } + + @Test + 
public void testSetReasoningFormatDeepseekLegacy() { + InferenceParameters params = new InferenceParameters("").setReasoningFormat(ReasoningFormat.DEEPSEEK_LEGACY); + assertEquals("\"deepseek-legacy\"", params.parameters.get("reasoning_format")); + } + + @Test + public void testSetReasoningBudgetTokensPositive() { + InferenceParameters params = new InferenceParameters("").setReasoningBudgetTokens(512); + assertEquals("512", params.parameters.get("reasoning_budget_tokens")); + } + + @Test + public void testSetReasoningBudgetTokensZero() { + InferenceParameters params = new InferenceParameters("").setReasoningBudgetTokens(0); + assertEquals("0", params.parameters.get("reasoning_budget_tokens")); + } + + @Test + public void testSetReasoningBudgetTokensDisabled() { + InferenceParameters params = new InferenceParameters("").setReasoningBudgetTokens(-1); + assertEquals("-1", params.parameters.get("reasoning_budget_tokens")); + } + // ------------------------------------------------------------------------- // MiroStat // ------------------------------------------------------------------------- From c4172da6153ba0dc285cd7aac226cc0b570ea343 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:48:56 +0000 Subject: [PATCH 09/10] Add mmproj/reasoning/sigma/sleep-idle to Java API; fix --cache-idle-slots bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug fix: - ModelFlag.CLEAR_IDLE/NO_CLEAR_IDLE mapped to non-existent --clear-idle / --no-clear-idle; corrected to --cache-idle-slots / --no-cache-idle-slots (the actual llama.cpp CLI flags since b8841) New ModelParameters: - setMmproj(String), setMmprojUrl(String), enableMmprojAuto(), enableMmprojOffload() — vision model projection file for LLaVA / Gemma3 / Qwen2-VL; previously impossible to configure from Java - setReasoningFormat(ReasoningFormat) — model-level default reasoning format - setReasoningBudget(int) — model-level default reasoning token budget - 
setSleepIdleSeconds(int) — auto-shutdown after N seconds of idle time - ModelFlag.MMPROJ_AUTO / MMPROJ_OFFLOAD (31 flags total) New InferenceParameters: - setTopNSigma(float) — per-request sigma sampling threshold New ChatResponseParser: - extractChoiceReasoningContent(String/JsonNode) — reads choices[0].message.reasoning_content so callers can access thinking-model reasoning output without parsing raw JSON themselves Tests: 435 Java tests passing (27 new); 417/417 C++ tests passing https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk --- .../de/kherud/llama/InferenceParameters.java | 14 +++ .../java/de/kherud/llama/ModelParameters.java | 79 ++++++++++++++ .../java/de/kherud/llama/args/ModelFlag.java | 10 +- .../kherud/llama/json/ChatResponseParser.java | 30 ++++++ .../kherud/llama/InferenceParametersTest.java | 16 +++ .../llama/ModelParametersExtendedTest.java | 24 ++--- .../de/kherud/llama/ModelParametersTest.java | 102 ++++++++++++++++++ .../de/kherud/llama/args/ModelFlagTest.java | 8 +- .../llama/json/ChatResponseParserTest.java | 51 +++++++++ 9 files changed, 317 insertions(+), 17 deletions(-) diff --git a/src/main/java/de/kherud/llama/InferenceParameters.java b/src/main/java/de/kherud/llama/InferenceParameters.java index e18a86b6..0e341d26 100644 --- a/src/main/java/de/kherud/llama/InferenceParameters.java +++ b/src/main/java/de/kherud/llama/InferenceParameters.java @@ -53,6 +53,7 @@ public final class InferenceParameters extends JsonParameters { private static final String PARAM_USE_JINJA = "use_jinja"; private static final String PARAM_CHAT_TEMPLATE_KWARGS = "chat_template_kwargs"; private static final String PARAM_MESSAGES = "messages"; + private static final String PARAM_TOP_N_SIGMA = "top_n_sigma"; private static final String PARAM_REASONING_FORMAT = "reasoning_format"; private static final String PARAM_REASONING_BUDGET_TOKENS = "reasoning_budget_tokens"; @@ -548,6 +549,19 @@ public InferenceParameters setMessages(String systemMessage, ListThinking 
models (DeepSeek-R1, QwQ, Qwen3) populate this field when + * {@code reasoning_format} is {@code "deepseek"} or {@code "auto"}. Returns an + * empty string when no reasoning content is present or when the JSON is malformed. + * + * @param json OAI-compatible chat completion JSON string + * @return the reasoning content string, or {@code ""} on any failure + */ + public String extractChoiceReasoningContent(String json) { + try { + return extractChoiceReasoningContent(OBJECT_MAPPER.readTree(json)); + } catch (IOException e) { + return ""; + } + } + + /** + * Extract the reasoning/thinking content from a pre-parsed OAI chat completion node. + * Navigates {@code choices[0].message.reasoning_content} via Jackson path API. + * + * @param node pre-parsed OAI chat completion response node + * @return the reasoning content string, or {@code ""} if absent + */ + public String extractChoiceReasoningContent(JsonNode node) { + return node.path("choices").path(0).path("message").path("reasoning_content").asText(""); + } + /** * Extract the assistant's reply text from an OAI chat completion JSON string. * Navigates {@code choices[0].message.content} via Jackson. 
diff --git a/src/test/java/de/kherud/llama/InferenceParametersTest.java b/src/test/java/de/kherud/llama/InferenceParametersTest.java index c711b9bb..f09052f3 100644 --- a/src/test/java/de/kherud/llama/InferenceParametersTest.java +++ b/src/test/java/de/kherud/llama/InferenceParametersTest.java @@ -230,6 +230,22 @@ public void testSetChatTemplateKwargsEmpty() { assertEquals("{}", params.parameters.get("chat_template_kwargs")); } + // ------------------------------------------------------------------------- + // setTopNSigma + // ------------------------------------------------------------------------- + + @Test + public void testSetTopNSigmaEnabled() { + InferenceParameters params = new InferenceParameters("").setTopNSigma(2.0f); + assertEquals("2.0", params.parameters.get("top_n_sigma")); + } + + @Test + public void testSetTopNSigmaDisabled() { + InferenceParameters params = new InferenceParameters("").setTopNSigma(-1.0f); + assertEquals("-1.0", params.parameters.get("top_n_sigma")); + } + // ------------------------------------------------------------------------- // ReasoningFormat / ReasoningBudgetTokens // ------------------------------------------------------------------------- diff --git a/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java b/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java index 36f0fe35..ae7b0a8a 100644 --- a/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java +++ b/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java @@ -460,31 +460,31 @@ public void testSetCacheRamMibDisabled() { @Test public void testSetClearIdleTrue() { ModelParameters p = new ModelParameters().setClearIdle(true); - assertTrue(p.parameters.containsKey("--clear-idle")); - assertNull(p.parameters.get("--clear-idle")); - assertFalse(p.parameters.containsKey("--no-clear-idle")); + assertTrue(p.parameters.containsKey("--cache-idle-slots")); + assertNull(p.parameters.get("--cache-idle-slots")); + 
assertFalse(p.parameters.containsKey("--no-cache-idle-slots")); } @Test public void testSetClearIdleFalse() { ModelParameters p = new ModelParameters().setClearIdle(false); - assertTrue(p.parameters.containsKey("--no-clear-idle")); - assertNull(p.parameters.get("--no-clear-idle")); - assertFalse(p.parameters.containsKey("--clear-idle")); + assertTrue(p.parameters.containsKey("--no-cache-idle-slots")); + assertNull(p.parameters.get("--no-cache-idle-slots")); + assertFalse(p.parameters.containsKey("--cache-idle-slots")); } @Test public void testSetClearIdleFlipFromTrueToFalse() { ModelParameters p = new ModelParameters().setClearIdle(true).setClearIdle(false); - assertTrue(p.parameters.containsKey("--no-clear-idle")); - assertFalse(p.parameters.containsKey("--clear-idle")); + assertTrue(p.parameters.containsKey("--no-cache-idle-slots")); + assertFalse(p.parameters.containsKey("--cache-idle-slots")); } @Test public void testSetClearIdleFlipFromFalseToTrue() { ModelParameters p = new ModelParameters().setClearIdle(false).setClearIdle(true); - assertTrue(p.parameters.containsKey("--clear-idle")); - assertFalse(p.parameters.containsKey("--no-clear-idle")); + assertTrue(p.parameters.containsKey("--cache-idle-slots")); + assertFalse(p.parameters.containsKey("--no-cache-idle-slots")); } @Test @@ -496,10 +496,10 @@ public void testKvUnifiedCacheRamClearIdleChaining() { .setClearIdle(true); assertTrue(p.parameters.containsKey("--kv-unified")); assertEquals("8192", p.parameters.get("--cache-ram")); - assertTrue(p.parameters.containsKey("--clear-idle")); + assertTrue(p.parameters.containsKey("--cache-idle-slots")); // Opposite flags must be absent assertFalse(p.parameters.containsKey("--no-kv-unified")); - assertFalse(p.parameters.containsKey("--no-clear-idle")); + assertFalse(p.parameters.containsKey("--no-cache-idle-slots")); } @Test diff --git a/src/test/java/de/kherud/llama/ModelParametersTest.java b/src/test/java/de/kherud/llama/ModelParametersTest.java index 
271e570f..1204813f 100644 --- a/src/test/java/de/kherud/llama/ModelParametersTest.java +++ b/src/test/java/de/kherud/llama/ModelParametersTest.java @@ -399,4 +399,106 @@ public void testBuilderChainingReturnsSameInstance() { assertSame(p, p.setGpuLayers(10)); assertSame(p, p.enableEmbedding()); } + + // ------------------------------------------------------------------------- + // mmproj — vision model projection file/url + // ------------------------------------------------------------------------- + + @Test + public void testSetMmproj() { + ModelParameters p = new ModelParameters().setMmproj("/models/mmproj.gguf"); + assertEquals("/models/mmproj.gguf", p.parameters.get("--mmproj")); + } + + @Test + public void testSetMmprojUrl() { + ModelParameters p = new ModelParameters().setMmprojUrl("https://example.com/mmproj.gguf"); + assertEquals("https://example.com/mmproj.gguf", p.parameters.get("--mmproj-url")); + } + + @Test + public void testEnableMmprojAuto() { + ModelParameters p = new ModelParameters().enableMmprojAuto(); + assertTrue(p.parameters.containsKey("--mmproj-auto")); + } + + @Test + public void testEnableMmprojOffload() { + ModelParameters p = new ModelParameters().enableMmprojOffload(); + assertTrue(p.parameters.containsKey("--mmproj-offload")); + } + + // ------------------------------------------------------------------------- + // Reasoning format / budget — model-level defaults for thinking models + // ------------------------------------------------------------------------- + + @Test + public void testSetReasoningFormatNone() { + ModelParameters p = new ModelParameters().setReasoningFormat(de.kherud.llama.args.ReasoningFormat.NONE); + assertEquals("none", p.parameters.get("--reasoning-format")); + } + + @Test + public void testSetReasoningFormatAuto() { + ModelParameters p = new ModelParameters().setReasoningFormat(de.kherud.llama.args.ReasoningFormat.AUTO); + assertEquals("auto", p.parameters.get("--reasoning-format")); + } + + @Test + public void 
testSetReasoningFormatDeepseek() { + ModelParameters p = new ModelParameters().setReasoningFormat(de.kherud.llama.args.ReasoningFormat.DEEPSEEK); + assertEquals("deepseek", p.parameters.get("--reasoning-format")); + } + + @Test + public void testSetReasoningFormatDeepseekLegacy() { + ModelParameters p = new ModelParameters().setReasoningFormat(de.kherud.llama.args.ReasoningFormat.DEEPSEEK_LEGACY); + assertEquals("deepseek-legacy", p.parameters.get("--reasoning-format")); + } + + @Test + public void testSetReasoningBudgetPositive() { + ModelParameters p = new ModelParameters().setReasoningBudget(1024); + assertEquals("1024", p.parameters.get("--reasoning-budget")); + } + + @Test + public void testSetReasoningBudgetDisabled() { + ModelParameters p = new ModelParameters().setReasoningBudget(-1); + assertEquals("-1", p.parameters.get("--reasoning-budget")); + } + + // ------------------------------------------------------------------------- + // setSleepIdleSeconds + // ------------------------------------------------------------------------- + + @Test + public void testSetSleepIdleSeconds() { + ModelParameters p = new ModelParameters().setSleepIdleSeconds(60); + assertEquals("60", p.parameters.get("--sleep-idle-seconds")); + } + + @Test + public void testSetSleepIdleSecondsZero() { + ModelParameters p = new ModelParameters().setSleepIdleSeconds(0); + assertEquals("0", p.parameters.get("--sleep-idle-seconds")); + } + + // ------------------------------------------------------------------------- + // setClearIdle / setKvUnified — correct flag names (regression) + // ------------------------------------------------------------------------- + + @Test + public void testSetClearIdleTrue_usesCacheIdleSlotsFlag() { + ModelParameters p = new ModelParameters().setClearIdle(true); + assertTrue(p.parameters.containsKey("--cache-idle-slots")); + assertFalse(p.parameters.containsKey("--no-cache-idle-slots")); + } + + @Test + public void 
testSetClearIdleFalse_usesNoCacheIdleSlotsFlag() { + ModelParameters p = new ModelParameters().setClearIdle(false); + assertTrue(p.parameters.containsKey("--no-cache-idle-slots")); + assertFalse(p.parameters.containsKey("--cache-idle-slots")); + } } diff --git a/src/test/java/de/kherud/llama/args/ModelFlagTest.java b/src/test/java/de/kherud/llama/args/ModelFlagTest.java index 16ce3e44..294efe5d 100644 --- a/src/test/java/de/kherud/llama/args/ModelFlagTest.java +++ b/src/test/java/de/kherud/llama/args/ModelFlagTest.java @@ -42,8 +42,10 @@ public static Collection data() { {ModelFlag.VOCAB_ONLY, "--vocab-only"}, {ModelFlag.KV_UNIFIED, "--kv-unified"}, {ModelFlag.NO_KV_UNIFIED, "--no-kv-unified"}, - {ModelFlag.CLEAR_IDLE, "--clear-idle"}, - {ModelFlag.NO_CLEAR_IDLE, "--no-clear-idle"}, + {ModelFlag.CLEAR_IDLE, "--cache-idle-slots"}, + {ModelFlag.NO_CLEAR_IDLE, "--no-cache-idle-slots"}, + {ModelFlag.MMPROJ_AUTO, "--mmproj-auto"}, + {ModelFlag.MMPROJ_OFFLOAD, "--mmproj-offload"}, }); } @@ -66,7 +68,7 @@ public void testGetCliFlag() { @Test public void testEnumCount() { - assertEquals(29, ModelFlag.values().length); + assertEquals(31, ModelFlag.values().length); } @Test diff --git a/src/test/java/de/kherud/llama/json/ChatResponseParserTest.java b/src/test/java/de/kherud/llama/json/ChatResponseParserTest.java index 69572862..c3c13a56 100644 --- a/src/test/java/de/kherud/llama/json/ChatResponseParserTest.java +++ b/src/test/java/de/kherud/llama/json/ChatResponseParserTest.java @@ -96,6 +96,57 @@ public void testExtractChoiceContent_nodeMultipleChoices_takesFirst() throws Exc assertEquals("First", parser.extractChoiceContent(node)); } + // ------------------------------------------------------------------ + // extractChoiceReasoningContent + // ------------------------------------------------------------------ + + @Test + public void testExtractChoiceReasoningContent_present() { + String json = "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"The answer is 
42.\"," + + "\"reasoning_content\":\"Let me think step by step...\"}}]}"; + assertEquals("Let me think step by step...", parser.extractChoiceReasoningContent(json)); + } + + @Test + public void testExtractChoiceReasoningContent_absent_returnsEmpty() { + String json = "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"Hello\"}}]}"; + assertEquals("", parser.extractChoiceReasoningContent(json)); + } + + @Test + public void testExtractChoiceReasoningContent_emptyString() { + String json = "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"Hi\"," + + "\"reasoning_content\":\"\"}}]}"; + assertEquals("", parser.extractChoiceReasoningContent(json)); + } + + @Test + public void testExtractChoiceReasoningContent_missingChoices_returnsEmpty() { + String json = "{\"id\":\"x\",\"object\":\"chat.completion\"}"; + assertEquals("", parser.extractChoiceReasoningContent(json)); + } + + @Test + public void testExtractChoiceReasoningContent_malformedJson_returnsEmpty() { + assertEquals("", parser.extractChoiceReasoningContent("{not json")); + } + + @Test + public void testExtractChoiceReasoningContent_multiline() { + String json = "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"42\"," + + "\"reasoning_content\":\"Step 1: identify the question.\\nStep 2: answer it.\"}}]}"; + assertEquals("Step 1: identify the question.\nStep 2: answer it.", + parser.extractChoiceReasoningContent(json)); + } + + @Test + public void testExtractChoiceReasoningContent_node() throws Exception { + JsonNode node = MAPPER.readTree( + "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"ok\"," + + "\"reasoning_content\":\"thinking...\"}}]}"); + assertEquals("thinking...", parser.extractChoiceReasoningContent(node)); + } + // ------------------------------------------------------------------ // extractUsageField // ------------------------------------------------------------------ From 08f372afb7fe6f7ad89b3a3b850396f34c61fb5d Mon Sep 17 00:00:00 2001 
From: Claude Date: Tue, 5 May 2026 07:13:25 +0000 Subject: [PATCH 10/10] Fix setDraftMax/setDraftMin throwing on b9016+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit llama.cpp b9016 removed --draft-max and --draft-min: the handler now unconditionally throws std::invalid_argument at parse time. Calling setDraftMax() or setDraftMin() (already covered by existing tests but not exercised in CI without a draft model) caused models to fail to load with no useful error. Fix: - setDraftMax → --spec-draft-n-max (was --draft-max, removed) - setDraftMin → --spec-draft-n-min (was --draft-min, removed) Also updated still-aliased flags to the canonical --spec-draft-* names for forward compatibility: - setDraftPMin → --spec-draft-p-min - setCtxSizeDraft → --spec-draft-ctx-size - setDeviceDraft → --spec-draft-device - setGpuLayersDraft → --spec-draft-ngl - setModelDraft → --spec-draft-model Tests updated to expect the new flag names; setDraftMax/setDraftMin tests now also assert the broken old flag is absent. https://claude.ai/code/session_018Xi5jWrcJ257WyCx6C2Cpk --- CLAUDE.md | 2 +- .../java/de/kherud/llama/ModelParameters.java | 16 +++++++------- .../llama/ModelParametersExtendedTest.java | 22 +++++++++++++------ 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 4a5096ad..a3d36133 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -241,7 +241,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. 
ren | ~b8994–b9004 | `tools/server/webui/` | Major frontend component reorganization (Svelte/TypeScript); purely UI, no C++ or JNI impact | | ~b9004–b9016 | `src/llama-io.h` | `llama_io_read_i` interface changed: `read(size_t)→read(void*,size_t)`, `read_to(void*,size_t)` removed, new `read_tensor(tensor,offset,size)` added; `llama_io_write_buffer`/`llama_io_read_buffer` now batch backend tensor ops in destructors for performance; internal state-save/load path, not called by project | | ~b9004–b9016 | `tools/server/server-context.cpp` | Static `server_get_checkpoint()` (returns by value) renamed to `server_prompt_checkpoint_update()` (takes `server_prompt_checkpoint &` by reference, in-place update); compiled directly into jllama, no call site in project code | -| ~b9004–b9016 | `common/arg.cpp` + docs | Speculative decoding CLI args renamed: `--draft`/`--draft-n`/`--draft-max` → deprecated (use `--spec-draft-n-max`); `-md`/`--model-draft` → `--spec-draft-model`; `-hfd`/`--hf-repo-draft` → `--spec-draft-hf`; `--spec-ngram-size-n/m/min-hits` → type-specific `--spec-ngram-simple-*`/`--spec-ngram-map-k-*`/`--spec-ngram-map-k4v-*`; env vars similarly renamed (`LLAMA_ARG_DRAFT_MAX`→`LLAMA_ARG_SPEC_DRAFT_N_MAX`, etc.); CLI-level only, Java layer passes params via JSON struct fields, no JNI impact | +| ~b9004–b9016 | `common/arg.cpp` + docs | Speculative decoding CLI args renamed: `--draft`/`--draft-n`/`--draft-max` and `--draft-min`/`--draft-n-min` were **REMOVED** (handler `throw`s `std::invalid_argument` at parse time, not just deprecated); other draft flags (`--draft-p-min`, `--ctx-size-draft`, `--device-draft`, `--gpu-layers-draft`, `--model-draft`) kept as aliases for new canonical `--spec-draft-*` names. **Java impact**: `ModelParameters.setDraftMax`/`setDraftMin` produced removed flags → threw at model load; fixed to canonical `--spec-draft-n-max`/`--spec-draft-n-min`. Other `set*Draft` methods updated to canonical names for forward compatibility. 
Env vars also renamed (`LLAMA_ARG_DRAFT_MAX`→`LLAMA_ARG_SPEC_DRAFT_N_MAX`, etc.) | | ~b9004–b9016 | `ggml/src/ggml-cuda/ggml-cuda.cu` | PCI bus ID detection replaced `snprintf` with `cudaDeviceGetPCIBusId` (buffer 16→32 bytes); HIP/MUSA compat headers gain `cudaDeviceGetPCIBusId` alias; internal CUDA backend | | ~b9004–b9016 | `ggml/src/ggml-opencl/` | Adreno MoE MXFP4: new `kernel_convert_block_mxfp4_trans4_ns`/`restore` kernels in `cvt.cl`; new `gemm_moe_mxfp4_f32_ns`, `gemv_moe_mxfp4_f32_ns`, `moe_reorder_b`, `moe_sort_by_expert` kernel files; GPU-side router reorder replaces CPU-side preprocessing; `q_img` created for GEMM path; internal OpenCL backend | | ~b9004–b9016 | `ggml/src/ggml-vulkan/ggml-vulkan.cpp` | `GGML_VK_MAX_NODES 8192` macro removed (node limit now determined differently); internal Vulkan backend | diff --git a/src/main/java/de/kherud/llama/ModelParameters.java b/src/main/java/de/kherud/llama/ModelParameters.java index 1f739a5c..7c037eb9 100644 --- a/src/main/java/de/kherud/llama/ModelParameters.java +++ b/src/main/java/de/kherud/llama/ModelParameters.java @@ -1231,13 +1231,13 @@ public ModelParameters enableLogTimestamps() { } /** - * Set the number of tokens to draft for speculative decoding. + * Set the maximum number of tokens to draft for speculative decoding. 
* * @param draftMax the number of tokens to draft for speculative decoding * @return this builder */ public ModelParameters setDraftMax(int draftMax) { - parameters.put("--draft-max", String.valueOf(draftMax)); + parameters.put("--spec-draft-n-max", String.valueOf(draftMax)); return this; } @@ -1248,7 +1248,7 @@ public ModelParameters setDraftMax(int draftMax) { * @return this builder */ public ModelParameters setDraftMin(int draftMin) { - parameters.put("--draft-min", String.valueOf(draftMin)); + parameters.put("--spec-draft-n-min", String.valueOf(draftMin)); return this; } @@ -1259,7 +1259,7 @@ public ModelParameters setDraftMin(int draftMin) { * @return this builder */ public ModelParameters setDraftPMin(float draftPMin) { - parameters.put("--draft-p-min", String.valueOf(draftPMin)); + parameters.put("--spec-draft-p-min", String.valueOf(draftPMin)); return this; } @@ -1270,7 +1270,7 @@ public ModelParameters setDraftPMin(float draftPMin) { * @return this builder */ public ModelParameters setCtxSizeDraft(int ctxSizeDraft) { - parameters.put("--ctx-size-draft", String.valueOf(ctxSizeDraft)); + parameters.put("--spec-draft-ctx-size", String.valueOf(ctxSizeDraft)); return this; } @@ -1281,7 +1281,7 @@ public ModelParameters setCtxSizeDraft(int ctxSizeDraft) { * @return this builder */ public ModelParameters setDeviceDraft(String deviceDraft) { - parameters.put("--device-draft", deviceDraft); + parameters.put("--spec-draft-device", deviceDraft); return this; } @@ -1292,7 +1292,7 @@ public ModelParameters setDeviceDraft(String deviceDraft) { * @return this builder */ public ModelParameters setGpuLayersDraft(int gpuLayersDraft) { - parameters.put("--gpu-layers-draft", String.valueOf(gpuLayersDraft)); + parameters.put("--spec-draft-ngl", String.valueOf(gpuLayersDraft)); return this; } @@ -1303,7 +1303,7 @@ public ModelParameters setGpuLayersDraft(int gpuLayersDraft) { * @return this builder */ public ModelParameters setModelDraft(String modelDraft) { - 
parameters.put("--model-draft", modelDraft); + parameters.put("--spec-draft-model", modelDraft); return this; } diff --git a/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java b/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java index ae7b0a8a..d3945f4a 100644 --- a/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java +++ b/src/test/java/de/kherud/llama/ModelParametersExtendedTest.java @@ -894,43 +894,51 @@ public void testAddControlVector() { @Test public void testSetModelDraft() { ModelParameters p = new ModelParameters().setModelDraft("/path/to/draft.gguf"); - assertEquals("/path/to/draft.gguf", p.parameters.get("--model-draft")); + assertEquals("/path/to/draft.gguf", p.parameters.get("--spec-draft-model")); } @Test public void testSetCtxSizeDraft() { ModelParameters p = new ModelParameters().setCtxSizeDraft(512); - assertEquals("512", p.parameters.get("--ctx-size-draft")); + assertEquals("512", p.parameters.get("--spec-draft-ctx-size")); } @Test public void testSetDeviceDraft() { ModelParameters p = new ModelParameters().setDeviceDraft("cuda0"); - assertEquals("cuda0", p.parameters.get("--device-draft")); + assertEquals("cuda0", p.parameters.get("--spec-draft-device")); } @Test public void testSetGpuLayersDraft() { ModelParameters p = new ModelParameters().setGpuLayersDraft(16); - assertEquals("16", p.parameters.get("--gpu-layers-draft")); + assertEquals("16", p.parameters.get("--spec-draft-ngl")); } @Test public void testSetDraftMax() { + // Regression: --draft-max was REMOVED in b9016 and now throws std::invalid_argument + // at model load. Must use --spec-draft-n-max. 
ModelParameters p = new ModelParameters().setDraftMax(8); - assertEquals("8", p.parameters.get("--draft-max")); + assertEquals("8", p.parameters.get("--spec-draft-n-max")); + assertFalse("--draft-max throws on b9016+; must not appear in args", + p.parameters.containsKey("--draft-max")); } @Test public void testSetDraftMin() { + // Regression: --draft-min was REMOVED in b9016 and now throws std::invalid_argument + // at model load. Must use --spec-draft-n-min. ModelParameters p = new ModelParameters().setDraftMin(2); - assertEquals("2", p.parameters.get("--draft-min")); + assertEquals("2", p.parameters.get("--spec-draft-n-min")); + assertFalse("--draft-min throws on b9016+; must not appear in args", + p.parameters.containsKey("--draft-min")); } @Test public void testSetDraftPMin() { ModelParameters p = new ModelParameters().setDraftPMin(0.5f); - assertEquals("0.5", p.parameters.get("--draft-p-min")); + assertEquals("0.5", p.parameters.get("--spec-draft-p-min")); } // -------------------------------------------------------------------------