From 9fb9f5caa540649ade610660964167bae670320c Mon Sep 17 00:00:00 2001 From: twaka Date: Fri, 5 Apr 2024 14:58:56 +0900 Subject: [PATCH 1/8] implement min_tokens --- llama_cpp/llama.py | 30 ++++++++++++++++++++++++++++++ llama_cpp/server/types.py | 6 ++++++ 2 files changed, 36 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5acc112d1..39b7b6d9c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -925,6 +925,7 @@ def _create_completion( prompt: Union[str, List[int]], suffix: Optional[str] = None, max_tokens: Optional[int] = 16, + min_tokens: Optional[int] = 1, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1051,6 +1052,26 @@ def logit_bias_processor( else (self._n_ctx - len(prompt_tokens)) ) + if min_tokens is not None: + def min_length_logits_processor( + input_ids: npt.NDArray[np.intc], + scores: npt.NDArray[np.single], + ) -> npt.NDArray[np.single]: + print(f"{input_ids=}, {len(prompt_tokens)=}, {len(input_ids)=}, {self._token_eos=}") + # Does it make sense to copy the whole array or can we just overwrite the original one? + new_scores = np.copy(scores) + if len(input_ids) - len(prompt_tokens) < min_tokens: + new_scores[self._token_eos] = -np.inf + return new_scores + + _min_length_logits_processor = LogitsProcessorList([min_length_logits_processor]) + if logits_processor is None: + logits_processor = _min_length_logits_processor + else: + logits_processor = logits_processor.extend(_min_length_logits_processor) + else: + assert False + if stop != []: stop_sequences = [s.encode("utf-8") for s in stop] else: @@ -1469,6 +1490,7 @@ def create_completion( prompt: Union[str, List[int]], suffix: Optional[str] = None, max_tokens: Optional[int] = 16, + min_tokens: Optional[int] = 1, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1498,6 +1520,7 @@ def create_completion( prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. + min_tokens: The minimum number of tokens to generate. temperature: The temperature to use for sampling. top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -1532,6 +1555,7 @@ def create_completion( prompt=prompt, suffix=suffix, max_tokens=-1 if max_tokens is None else max_tokens, + min_tokens=min_tokens, temperature=temperature, top_p=top_p, min_p=min_p, @@ -1566,6 +1590,7 @@ def __call__( prompt: str, suffix: Optional[str] = None, max_tokens: Optional[int] = 16, + min_tokens: Optional[int] = 1, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1595,6 +1620,7 @@ def __call__( prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. + min_tokens: The minimum number of tokens to generate. temperature: The temperature to use for sampling. top_p: The top-p value to use for nucleus sampling. 
Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -1629,6 +1655,7 @@ def __call__( prompt=prompt, suffix=suffix, max_tokens=max_tokens, + min_tokens=min_tokens, temperature=temperature, top_p=top_p, min_p=min_p, @@ -1670,6 +1697,7 @@ def create_chat_completion( seed: Optional[int] = None, response_format: Optional[ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, + min_tokens: Optional[int] = 1, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1704,6 +1732,7 @@ def create_chat_completion( seed: The seed to use for sampling. response_format: The response format to use for the chat completion. Use { "type": "json_object" } to contstrain output to only valid json. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. + min_tokens: The minimum number of tokens to generate. presence_penalty: The penalty to apply to tokens based on their presence in the prompt. frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt. repeat_penalty: The penalty to apply to repeated tokens. @@ -1741,6 +1770,7 @@ def create_chat_completion( seed=seed, response_format=response_format, max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index a20b3940f..e441a2740 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -16,6 +16,10 @@ default=16, ge=1, description="The maximum number of tokens to generate." ) +min_tokens_field = Field( + default=1, ge=1, description="The minimum number of tokens to generate." +) + temperature_field = Field( default=0.8, description="Adjust the randomness of the generated text.\n\n" @@ -111,6 +115,7 @@ class CreateCompletionRequest(BaseModel): max_tokens: Optional[int] = Field( default=16, ge=0, description="The maximum number of tokens to generate." ) + min_tokens: Optional[int] = min_tokens_field temperature: float = temperature_field top_p: float = top_p_field min_p: float = min_p_field @@ -206,6 +211,7 @@ class CreateChatCompletionRequest(BaseModel): default=None, description="The maximum number of tokens to generate. Defaults to inf", ) + min_tokens: Optional[int] = min_tokens_field logprobs: Optional[bool] = Field( default=False, description="Whether to output the logprobs or not. 
Default is True" From a9c2ff7f26acccc596d6219d126083899f1057ba Mon Sep 17 00:00:00 2001 From: twaka Date: Fri, 5 Apr 2024 15:28:14 +0900 Subject: [PATCH 2/8] set default to 0 --- llama_cpp/llama.py | 19 ++++++++----------- llama_cpp/server/types.py | 6 +++--- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 39b7b6d9c..1820a5d2a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -925,7 +925,7 @@ def _create_completion( prompt: Union[str, List[int]], suffix: Optional[str] = None, max_tokens: Optional[int] = 16, - min_tokens: Optional[int] = 1, + min_tokens: int = 0, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1052,12 +1052,11 @@ def logit_bias_processor( else (self._n_ctx - len(prompt_tokens)) ) - if min_tokens is not None: + if min_tokens > 0: def min_length_logits_processor( input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single], ) -> npt.NDArray[np.single]: - print(f"{input_ids=}, {len(prompt_tokens)=}, {len(input_ids)=}, {self._token_eos=}") # Does it make sense to copy the whole array or can we just overwrite the original one? new_scores = np.copy(scores) if len(input_ids) - len(prompt_tokens) < min_tokens: @@ -1069,8 +1068,6 @@ def min_length_logits_processor( logits_processor = _min_length_logits_processor else: logits_processor = logits_processor.extend(_min_length_logits_processor) - else: - assert False if stop != []: stop_sequences = [s.encode("utf-8") for s in stop] @@ -1490,7 +1487,7 @@ def create_completion( prompt: Union[str, List[int]], suffix: Optional[str] = None, max_tokens: Optional[int] = 16, - min_tokens: Optional[int] = 1, + min_tokens: int = 0, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1520,7 +1517,7 @@ def create_completion( prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. - min_tokens: The minimum number of tokens to generate. + min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop). temperature: The temperature to use for sampling. top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -1590,7 +1587,7 @@ def __call__( prompt: str, suffix: Optional[str] = None, max_tokens: Optional[int] = 16, - min_tokens: Optional[int] = 1, + min_tokens: int = 0, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1620,7 +1617,7 @@ def __call__( prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. - min_tokens: The minimum number of tokens to generate. + min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop). temperature: The temperature to use for sampling. top_p: The top-p value to use for nucleus sampling. 
Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -1697,7 +1694,7 @@ def create_chat_completion( seed: Optional[int] = None, response_format: Optional[ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - min_tokens: Optional[int] = 1, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1732,7 +1729,7 @@ def create_chat_completion( seed: The seed to use for sampling. response_format: The response format to use for the chat completion. Use { "type": "json_object" } to contstrain output to only valid json. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. - min_tokens: The minimum number of tokens to generate. + min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop). presence_penalty: The penalty to apply to tokens based on their presence in the prompt. frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt. repeat_penalty: The penalty to apply to repeated tokens. diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index e441a2740..926e0c5ed 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -17,7 +17,7 @@ ) min_tokens_field = Field( - default=1, ge=1, description="The minimum number of tokens to generate." + default=0, ge=0, description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop)." ) temperature_field = Field( @@ -115,7 +115,7 @@ class CreateCompletionRequest(BaseModel): max_tokens: Optional[int] = Field( default=16, ge=0, description="The maximum number of tokens to generate." ) - min_tokens: Optional[int] = min_tokens_field + min_tokens: int = min_tokens_field temperature: float = temperature_field top_p: float = top_p_field min_p: float = min_p_field @@ -211,7 +211,7 @@ class CreateChatCompletionRequest(BaseModel): default=None, description="The maximum number of tokens to generate. Defaults to inf", ) - min_tokens: Optional[int] = min_tokens_field + min_tokens: int = min_tokens_field logprobs: Optional[bool] = Field( default=False, description="Whether to output the logprobs or not. 
Default is True" From b4c9762c8d0af6d054f8e8bfbe5f8b23990c172f Mon Sep 17 00:00:00 2001 From: twaka Date: Mon, 8 Apr 2024 18:00:40 +0900 Subject: [PATCH 3/8] pass min_tokens --- llama_cpp/llama_chat_format.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 9da6b9800..f2cd4e5d9 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -71,6 +71,7 @@ def __call__( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -522,6 +523,7 @@ def chat_completion_handler( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -613,6 +615,7 @@ def chat_completion_handler( stop=stop, seed=seed, max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1346,6 +1349,7 @@ def functionary_chat_handler( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1552,6 +1556,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): stream=stream, stop=["user:", ""], max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1628,6 +1633,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): stream=False, grammar=grammar, max_tokens=max_tokens, + min_tokens=min_tokens, temperature=temperature, top_p=top_p, top_k=top_k, @@ -1705,6 +1711,7 @@ def functionary_v1_v2_chat_handler( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1921,6 +1928,7 @@ def prepare_messages_for_inference( stream=stream, stop=stop, max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1980,6 +1988,7 @@ def create_completion(prompt, stop, grammar): stream=stream, stop=stop, max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -2577,6 +2586,7 @@ def __call__( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -3159,6 +3169,7 @@ def chatml_function_calling( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -3288,6 +3299,7 @@ def chatml_function_calling( stream=stream, stop=stop, max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, 
frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3341,6 +3353,7 @@ def chatml_function_calling( stream=stream, stop=stop, max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3385,6 +3398,7 @@ def chatml_function_calling( stream=False, stop=[":"], max_tokens=None, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3413,6 +3427,7 @@ def chatml_function_calling( stop=["<|im_end|>"], logprobs=top_logprobs if logprobs else None, max_tokens=None, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3460,6 +3475,7 @@ def chatml_function_calling( stream=False, stop=stop, max_tokens=None, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3487,6 +3503,7 @@ def chatml_function_calling( stream=False, stop=stop, max_tokens=None, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, From 138300aaa7c8fce86e5d20edbf9f7789582bb21d Mon Sep 17 00:00:00 2001 From: twaka Date: Mon, 8 Apr 2024 19:23:09 +0900 Subject: [PATCH 4/8] fix --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1820a5d2a..178cf9c8b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1057,11 +1057,11 @@ def min_length_logits_processor( input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single], ) -> npt.NDArray[np.single]: - # Does it make sense to copy the whole array or can we just overwrite the original one? 
-                new_scores = np.copy(scores)
                 if len(input_ids) - len(prompt_tokens) < min_tokens:
+                    new_scores = np.copy(scores)
                     new_scores[self._token_eos] = -np.inf
-                return new_scores
+                    return new_scores
+                return scores

             _min_length_logits_processor = LogitsProcessorList([min_length_logits_processor])
             if logits_processor is None:

From e440b035a407ac13aad17bb7e0a153bcb61ab572 Mon Sep 17 00:00:00 2001
From: twaka
Date: Mon, 8 Apr 2024 19:46:02 +0900
Subject: [PATCH 5/8] remove copy

---
 llama_cpp/llama.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 178cf9c8b..477fb1bbe 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1058,9 +1058,7 @@ def min_length_logits_processor(
                 scores: npt.NDArray[np.single],
             ) -> npt.NDArray[np.single]:
                 if len(input_ids) - len(prompt_tokens) < min_tokens:
-                    new_scores = np.copy(scores)
-                    new_scores[self._token_eos] = -np.inf
-                    return new_scores
+                    scores[self._token_eos] = -np.inf
                 return scores

             _min_length_logits_processor = LogitsProcessorList([min_length_logits_processor])

From 878399121e248202bb01aed012805fe975b9eb20 Mon Sep 17 00:00:00 2001
From: twaka
Date: Wed, 8 May 2024 23:07:21 +0900
Subject: [PATCH 6/8] implement MinTokensLogitsProcessor

---
 llama_cpp/llama.py             | 41 +++++++++++++---------------------
 llama_cpp/llama_chat_format.py | 17 --------------
 llama_cpp/server/app.py        | 20 +++++++++++++++++
 3 files changed, 36 insertions(+), 42 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 477fb1bbe..caca04a4d 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -925,7 +925,6 @@ def _create_completion(
         prompt: Union[str, List[int]],
         suffix: Optional[str] = None,
         max_tokens: Optional[int] = 16,
-        min_tokens: int = 0,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,
@@ -1052,21 +1051,6 @@ def logit_bias_processor(
             else (self._n_ctx - len(prompt_tokens))
         )

-        if min_tokens > 0:
-            def min_length_logits_processor(
-                input_ids: npt.NDArray[np.intc],
-                scores: npt.NDArray[np.single],
-            ) -> npt.NDArray[np.single]:
-                if len(input_ids) - len(prompt_tokens) < min_tokens:
-                    scores[self._token_eos] = -np.inf
-                return scores
-
-            _min_length_logits_processor = LogitsProcessorList([min_length_logits_processor])
-            if logits_processor is None:
-                logits_processor = _min_length_logits_processor
-            else:
-                logits_processor = logits_processor.extend(_min_length_logits_processor)
-
         if stop != []:
             stop_sequences = [s.encode("utf-8") for s in stop]
         else:
@@ -1485,7 +1469,6 @@ def create_completion(
         prompt: Union[str, List[int]],
         suffix: Optional[str] = None,
         max_tokens: Optional[int] = 16,
-        min_tokens: int = 0,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,
@@ -1515,7 +1498,6 @@ def create_completion(
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
             max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
-            min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
             min_p: The min-p value to use for minimum p sampling.
Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -1550,7 +1532,6 @@ def create_completion( prompt=prompt, suffix=suffix, max_tokens=-1 if max_tokens is None else max_tokens, - min_tokens=min_tokens, temperature=temperature, top_p=top_p, min_p=min_p, @@ -1585,7 +1566,6 @@ def __call__( prompt: str, suffix: Optional[str] = None, max_tokens: Optional[int] = 16, - min_tokens: int = 0, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1615,7 +1595,6 @@ def __call__( prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. - min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop). temperature: The temperature to use for sampling. top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -1650,7 +1629,6 @@ def __call__( prompt=prompt, suffix=suffix, max_tokens=max_tokens, - min_tokens=min_tokens, temperature=temperature, top_p=top_p, min_p=min_p, @@ -1692,7 +1670,6 @@ def create_chat_completion( seed: Optional[int] = None, response_format: Optional[ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1727,7 +1704,6 @@ def create_chat_completion( seed: The seed to use for sampling. response_format: The response format to use for the chat completion. Use { "type": "json_object" } to contstrain output to only valid json. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. - min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop). presence_penalty: The penalty to apply to tokens based on their presence in the prompt. frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt. repeat_penalty: The penalty to apply to repeated tokens. 
@@ -1765,7 +1741,6 @@ def create_chat_completion( seed=seed, response_format=response_format, max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -2103,3 +2078,19 @@ def __call__( self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single] ) -> bool: return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) + + +class MinTokensLogitsProcessor(LogitsProcessor): + def __init__(self, min_tokens: int, token_eos: int): + self.min_tokens = min_tokens + self.token_eos = token_eos + self.prompt_tokens = None + + def __call__( + self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single] + ) -> npt.NDArray[np.single]: + if self.prompt_tokens is None: + self.prompt_tokens = len(input_ids) + if len(input_ids) - self.prompt_tokens < self.min_tokens: + scores[self.token_eos] = -np.inf + return scores diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f2cd4e5d9..9da6b9800 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -71,7 +71,6 @@ def __call__( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -523,7 +522,6 @@ def chat_completion_handler( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -615,7 +613,6 @@ def chat_completion_handler( stop=stop, seed=seed, max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1349,7 +1346,6 @@ def functionary_chat_handler( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1556,7 +1552,6 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): stream=stream, stop=["user:", ""], max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1633,7 +1628,6 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): stream=False, grammar=grammar, max_tokens=max_tokens, - min_tokens=min_tokens, temperature=temperature, top_p=top_p, top_k=top_k, @@ -1711,7 +1705,6 @@ def functionary_v1_v2_chat_handler( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1928,7 +1921,6 @@ def prepare_messages_for_inference( stream=stream, stop=stop, max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1988,7 +1980,6 @@ def create_completion(prompt, stop, grammar): stream=stream, stop=stop, max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -2586,7 +2577,6 @@ def __call__( 
llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -3169,7 +3159,6 @@ def chatml_function_calling( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -3299,7 +3288,6 @@ def chatml_function_calling( stream=stream, stop=stop, max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3353,7 +3341,6 @@ def chatml_function_calling( stream=stream, stop=stop, max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3398,7 +3385,6 @@ def chatml_function_calling( stream=False, stop=[":"], max_tokens=None, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3427,7 +3413,6 @@ def chatml_function_calling( stop=["<|im_end|>"], logprobs=top_logprobs if logprobs else None, max_tokens=None, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3475,7 +3460,6 @@ def chatml_function_calling( stream=False, stop=stop, max_tokens=None, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3503,7 +3487,6 @@ def chatml_function_calling( stream=False, stop=stop, max_tokens=None, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 4cf10d1f6..05d797b45 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -275,6 +275,7 @@ async def create_completion( "best_of", "logit_bias_type", "user", + "min_tokens", } kwargs = body.model_dump(exclude=exclude) @@ -288,6 +289,15 @@ async def create_completion( if body.grammar is not None: kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + if body.min_tokens is not None: + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList([ + llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos()) + ]) + if "logits_processor" not in kwargs: + kwargs["logits_processor"] = _min_tokens_logits_processor + else: + kwargs["logits_processor"].extend(_min_tokens_logits_processor) + iterator_or_completion: Union[ llama_cpp.CreateCompletionResponse, Iterator[llama_cpp.CreateCompletionStreamResponse], @@ -445,6 +455,7 @@ async def create_chat_completion( "n", "logit_bias_type", "user", + "min_tokens", } kwargs = body.model_dump(exclude=exclude) llama = llama_proxy(body.model) @@ -458,6 +469,15 @@ async def create_chat_completion( if body.grammar is not None: kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + if body.min_tokens is not None: + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList([ + llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos()) + ]) + if "logits_processor" not in kwargs: + kwargs["logits_processor"] = _min_tokens_logits_processor + else: + kwargs["logits_processor"].extend(_min_tokens_logits_processor) + iterator_or_completion: Union[ 
llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk] ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) From 5220aa96762ef87e15728c3fb87b243c5d4dc038 Mon Sep 17 00:00:00 2001 From: twaka Date: Wed, 8 May 2024 23:11:23 +0900 Subject: [PATCH 7/8] format --- llama_cpp/server/app.py | 12 ++++++------ llama_cpp/server/types.py | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 05d797b45..6c09e7ac8 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -290,9 +290,9 @@ async def create_completion( kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) if body.min_tokens is not None: - _min_tokens_logits_processor = llama_cpp.LogitsProcessorList([ - llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos()) - ]) + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + ) if "logits_processor" not in kwargs: kwargs["logits_processor"] = _min_tokens_logits_processor else: @@ -470,9 +470,9 @@ async def create_chat_completion( kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) if body.min_tokens is not None: - _min_tokens_logits_processor = llama_cpp.LogitsProcessorList([ - llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos()) - ]) + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + ) if "logits_processor" not in kwargs: kwargs["logits_processor"] = _min_tokens_logits_processor else: diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index 926e0c5ed..a75f9e55b 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -17,7 +17,9 @@ ) min_tokens_field = Field( - default=0, ge=0, description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop)." + default=0, + ge=0, + description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).", ) temperature_field = Field( From 24a4bb83cae8917127b92948fb48bfc8ced860e3 Mon Sep 17 00:00:00 2001 From: twaka Date: Wed, 8 May 2024 23:15:43 +0900 Subject: [PATCH 8/8] fix condition --- llama_cpp/server/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 6c09e7ac8..4cda4af7a 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -289,7 +289,7 @@ async def create_completion( if body.grammar is not None: kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) - if body.min_tokens is not None: + if body.min_tokens > 0: _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] ) @@ -469,7 +469,7 @@ async def create_chat_completion( if body.grammar is not None: kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) - if body.min_tokens is not None: + if body.min_tokens > 0: _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] )
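
A usage sketch for reference (not taken from the patches above; the model path and the prompt are placeholder assumptions). After PATCH 6/8, min_tokens is handled in the server layer and the library exposes the building block MinTokensLogitsProcessor, so the same behavior can be obtained through the Python API by constructing the processor and passing it via logits_processor, mirroring the wiring added to llama_cpp/server/app.py:

    import llama_cpp

    # Placeholder path; any local GGUF model works here.
    llm = llama_cpp.Llama(model_path="./models/model.gguf")

    # Mask the end-of-sequence logit until at least 16 completion tokens have
    # been sampled, the same masking the server applies when a request sets
    # "min_tokens": 16.
    processors = llama_cpp.LogitsProcessorList(
        [llama_cpp.MinTokensLogitsProcessor(16, llm.token_eos())]
    )

    out = llm.create_completion(
        prompt="Q: Name the planets in the solar system. A:",
        max_tokens=64,
        logits_processor=processors,
    )
    print(out["choices"][0]["text"])

Because the processor records the prompt length on its first call, a fresh instance is needed for each generation, which is why the server builds the LogitsProcessorList inside the request handlers rather than once at startup. When the OpenAI-compatible server is used instead, the same effect is available by sending min_tokens (default 0, constrained to ge=0) in the body of /v1/completions or /v1/chat/completions; as the field description notes, generation may still end before min_tokens tokens if max_tokens or a stop sequence is reached first.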