From 8ba227869df3bb581cf4c9ff2725f1ae382da5e6 Mon Sep 17 00:00:00 2001
From: Douglas Hanley
Date: Wed, 6 Aug 2025 11:19:55 -0500
Subject: [PATCH] always use seq_id=0 for generation; provide strftime_now to
 templates

---
 llama_cpp/llama.py             |  4 ++--
 llama_cpp/llama_chat_format.py | 36 ++++++++++++++++++++++++++---------------
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 2e93670e6..f72f0bf09 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -637,7 +637,7 @@ def eval(self, tokens: Sequence[int]):
         Args:
             tokens: The list of tokens to evaluate.
         """
-        self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+        self._ctx.kv_cache_seq_rm(0, self.n_tokens, -1)
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
             n_past = self.n_tokens
@@ -945,7 +945,7 @@ def generate(
 
             if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]:
                 self.n_tokens = sample_idx
-                self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+                self._ctx.kv_cache_seq_rm(0, self.n_tokens, -1)
                 break
 
             if self.draft_model is not None:
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 2a7d575ce..cb283ebdb 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -8,6 +8,7 @@
 import random
 import string
 
+from datetime import datetime
 from contextlib import ExitStack
 from typing import (
     Any,
@@ -214,6 +215,10 @@ def __init__(
             lstrip_blocks=True,
         ).from_string(self.template)
 
+    @staticmethod
+    def strftime_now(f: str) -> str:
+        return datetime.now().strftime(f)
+
     def __call__(
         self,
         *,
@@ -237,6 +242,7 @@ def raise_exception(message: str):
             function_call=function_call,
             tools=tools,
             tool_choice=tool_choice,
+            strftime_now=self.strftime_now,
         )
 
         stopping_criteria = None
@@ -2752,10 +2758,10 @@ def _create_bitmap_from_bytes(self, image_bytes: bytes):
             (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
             len(image_bytes)
         )
-        
+
         if bitmap is None:
             raise ValueError("Failed to create bitmap from image bytes")
-        
+
         return bitmap
 
     def __call__(
@@ -2814,10 +2820,10 @@ def __call__(
             trim_blocks=True,
             lstrip_blocks=True,
         ).from_string(self.CHAT_FORMAT)
-        
+
         # Get the default media marker
         media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8')
-        
+
         # Replace image URLs with media markers in the template
         text = template.render(
             messages=messages,
@@ -2825,7 +2831,7 @@ def __call__(
             eos_token=llama.detokenize([llama.token_eos()]),
             bos_token=llama.detokenize([llama.token_bos()]),
         )
-        
+
         # Replace image URLs in text with media markers
         for image_url in image_urls:
             text = text.replace(image_url, media_marker)
@@ -2875,40 +2881,40 @@ def __call__(
         # Process each chunk
         n_past = llama_cpp.llama_pos(0)
         n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks)
-        
+
         for i in range(n_chunks):
             chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i)
             if chunk is None:
                 continue
 
             chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk)
-            
+
             if chunk_type == self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_TEXT:
                 # Handle text chunk
                 n_tokens_out = ctypes.c_size_t()
                 tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(
                     chunk, ctypes.byref(n_tokens_out)
                 )
-                
+
                 if tokens_ptr and n_tokens_out.value > 0:
                     # Convert ctypes array to Python list
                     tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)]
-                    
+
                     if llama.n_tokens + len(tokens) > llama.n_ctx():
                         raise ValueError(
                             f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}"
                         )
                     llama.eval(tokens)
-                    
+
             elif chunk_type in [self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO]:
                 # Handle image/audio chunk using helper
                 chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)
-                
+
                 if llama.n_tokens + chunk_n_tokens > llama.n_ctx():
                     raise ValueError(
                         f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}"
                     )
-                
+
                 new_n_past = llama_cpp.llama_pos(0)
                 result = self._mtmd_cpp.mtmd_helper_eval_chunk_single(
                     self.mtmd_ctx,
@@ -2920,10 +2926,10 @@ def __call__(
                     False,  # logits_last
                     ctypes.byref(new_n_past)
                 )
-                
+
                 if result != 0:
                     raise ValueError(f"Failed to evaluate chunk: error code {result}")
-                
+
                 # Update llama's token count
                 llama.n_tokens = new_n_past.value
 
@@ -3013,7 +3019,7 @@ def __call__(
             grammar=grammar,
             logit_bias=logit_bias,
         )
-        
+
         if tool is not None:
            tool_name = tool["function"]["name"]
            return _convert_completion_to_chat_function(
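
With strftime_now exposed in the render context, a Jinja2 chat template can stamp the current date, as some upstream chat templates expect. Below is a minimal usage sketch, not part of the patch: the template string is made up for illustration, and it assumes the Jinja2ChatFormatter constructor and ChatFormatterResponse fields keep their current shape.

    from llama_cpp.llama_chat_format import Jinja2ChatFormatter

    # Hypothetical template; the strftime_now() call is what this patch enables.
    template = (
        "{% for message in messages %}"
        "<|{{ message.role }}|>\n{{ message.content }}\n"
        "{% endfor %}"
        "Today's date is {{ strftime_now('%d %b %Y') }}.\n"
    )

    formatter = Jinja2ChatFormatter(
        template=template,
        eos_token="</s>",
        bos_token="<s>",
        add_generation_prompt=True,
    )

    response = formatter(messages=[{"role": "user", "content": "hi"}])
    print(response.prompt)  # rendered prompt now includes the current date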