From 9fb9f5caa540649ade610660964167bae670320c Mon Sep 17 00:00:00 2001 From: twaka Date: Fri, 5 Apr 2024 14:58:56 +0900 Subject: [PATCH 1/8] implement min_tokens --- llama_cpp/llama.py | 30 ++++++++++++++++++++++++++++++ llama_cpp/server/types.py | 6 ++++++ 2 files changed, 36 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5acc112d1..39b7b6d9c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -925,6 +925,7 @@ def _create_completion( prompt: Union[str, List[int]], suffix: Optional[str] = None, max_tokens: Optional[int] = 16, + min_tokens: Optional[int] = 1, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1051,6 +1052,26 @@ def logit_bias_processor( else (self._n_ctx - len(prompt_tokens)) ) + if min_tokens is not None: + def min_length_logits_processor( + input_ids: npt.NDArray[np.intc], + scores: npt.NDArray[np.single], + ) -> npt.NDArray[np.single]: + print(f"{input_ids=}, {len(prompt_tokens)=}, {len(input_ids)=}, {self._token_eos=}") + # Does it make sense to copy the whole array or can we just overwrite the original one? + new_scores = np.copy(scores) + if len(input_ids) - len(prompt_tokens) < min_tokens: + new_scores[self._token_eos] = -np.inf + return new_scores + + _min_length_logits_processor = LogitsProcessorList([min_length_logits_processor]) + if logits_processor is None: + logits_processor = _min_length_logits_processor + else: + logits_processor = logits_processor.extend(_min_length_logits_processor) + else: + assert False + if stop != []: stop_sequences = [s.encode("utf-8") for s in stop] else: @@ -1469,6 +1490,7 @@ def create_completion( prompt: Union[str, List[int]], suffix: Optional[str] = None, max_tokens: Optional[int] = 16, + min_tokens: Optional[int] = 1, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1498,6 +1520,7 @@ def create_completion( prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. + min_tokens: The minimum number of tokens to generate. temperature: The temperature to use for sampling. top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -1532,6 +1555,7 @@ def create_completion( prompt=prompt, suffix=suffix, max_tokens=-1 if max_tokens is None else max_tokens, + min_tokens=min_tokens, temperature=temperature, top_p=top_p, min_p=min_p, @@ -1566,6 +1590,7 @@ def __call__( prompt: str, suffix: Optional[str] = None, max_tokens: Optional[int] = 16, + min_tokens: Optional[int] = 1, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1595,6 +1620,7 @@ def __call__( prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. + min_tokens: The minimum number of tokens to generate. temperature: The temperature to use for sampling. top_p: The top-p value to use for nucleus sampling. 
Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -1629,6 +1655,7 @@ def __call__( prompt=prompt, suffix=suffix, max_tokens=max_tokens, + min_tokens=min_tokens, temperature=temperature, top_p=top_p, min_p=min_p, @@ -1670,6 +1697,7 @@ def create_chat_completion( seed: Optional[int] = None, response_format: Optional[ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, + min_tokens: Optional[int] = 1, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1704,6 +1732,7 @@ def create_chat_completion( seed: The seed to use for sampling. response_format: The response format to use for the chat completion. Use { "type": "json_object" } to contstrain output to only valid json. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. + min_tokens: The minimum number of tokens to generate. presence_penalty: The penalty to apply to tokens based on their presence in the prompt. frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt. repeat_penalty: The penalty to apply to repeated tokens. @@ -1741,6 +1770,7 @@ def create_chat_completion( seed=seed, response_format=response_format, max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index a20b3940f..e441a2740 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -16,6 +16,10 @@ default=16, ge=1, description="The maximum number of tokens to generate." ) +min_tokens_field = Field( + default=1, ge=1, description="The minimum number of tokens to generate." +) + temperature_field = Field( default=0.8, description="Adjust the randomness of the generated text.\n\n" @@ -111,6 +115,7 @@ class CreateCompletionRequest(BaseModel): max_tokens: Optional[int] = Field( default=16, ge=0, description="The maximum number of tokens to generate." ) + min_tokens: Optional[int] = min_tokens_field temperature: float = temperature_field top_p: float = top_p_field min_p: float = min_p_field @@ -206,6 +211,7 @@ class CreateChatCompletionRequest(BaseModel): default=None, description="The maximum number of tokens to generate. Defaults to inf", ) + min_tokens: Optional[int] = min_tokens_field logprobs: Optional[bool] = Field( default=False, description="Whether to output the logprobs or not. 
Default is True" From a9c2ff7f26acccc596d6219d126083899f1057ba Mon Sep 17 00:00:00 2001 From: twaka Date: Fri, 5 Apr 2024 15:28:14 +0900 Subject: [PATCH 2/8] set default to 0 --- llama_cpp/llama.py | 19 ++++++++----------- llama_cpp/server/types.py | 6 +++--- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 39b7b6d9c..1820a5d2a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -925,7 +925,7 @@ def _create_completion( prompt: Union[str, List[int]], suffix: Optional[str] = None, max_tokens: Optional[int] = 16, - min_tokens: Optional[int] = 1, + min_tokens: int = 0, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1052,12 +1052,11 @@ def logit_bias_processor( else (self._n_ctx - len(prompt_tokens)) ) - if min_tokens is not None: + if min_tokens > 0: def min_length_logits_processor( input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single], ) -> npt.NDArray[np.single]: - print(f"{input_ids=}, {len(prompt_tokens)=}, {len(input_ids)=}, {self._token_eos=}") # Does it make sense to copy the whole array or can we just overwrite the original one? new_scores = np.copy(scores) if len(input_ids) - len(prompt_tokens) < min_tokens: @@ -1069,8 +1068,6 @@ def min_length_logits_processor( logits_processor = _min_length_logits_processor else: logits_processor = logits_processor.extend(_min_length_logits_processor) - else: - assert False if stop != []: stop_sequences = [s.encode("utf-8") for s in stop] @@ -1490,7 +1487,7 @@ def create_completion( prompt: Union[str, List[int]], suffix: Optional[str] = None, max_tokens: Optional[int] = 16, - min_tokens: Optional[int] = 1, + min_tokens: int = 0, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1520,7 +1517,7 @@ def create_completion( prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. - min_tokens: The minimum number of tokens to generate. + min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop). temperature: The temperature to use for sampling. top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -1590,7 +1587,7 @@ def __call__( prompt: str, suffix: Optional[str] = None, max_tokens: Optional[int] = 16, - min_tokens: Optional[int] = 1, + min_tokens: int = 0, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1620,7 +1617,7 @@ def __call__( prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. - min_tokens: The minimum number of tokens to generate. + min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop). temperature: The temperature to use for sampling. top_p: The top-p value to use for nucleus sampling. 
Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -1697,7 +1694,7 @@ def create_chat_completion( seed: Optional[int] = None, response_format: Optional[ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - min_tokens: Optional[int] = 1, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1732,7 +1729,7 @@ def create_chat_completion( seed: The seed to use for sampling. response_format: The response format to use for the chat completion. Use { "type": "json_object" } to contstrain output to only valid json. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. - min_tokens: The minimum number of tokens to generate. + min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop). presence_penalty: The penalty to apply to tokens based on their presence in the prompt. frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt. repeat_penalty: The penalty to apply to repeated tokens. diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index e441a2740..926e0c5ed 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -17,7 +17,7 @@ ) min_tokens_field = Field( - default=1, ge=1, description="The minimum number of tokens to generate." + default=0, ge=0, description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop)." ) temperature_field = Field( @@ -115,7 +115,7 @@ class CreateCompletionRequest(BaseModel): max_tokens: Optional[int] = Field( default=16, ge=0, description="The maximum number of tokens to generate." ) - min_tokens: Optional[int] = min_tokens_field + min_tokens: int = min_tokens_field temperature: float = temperature_field top_p: float = top_p_field min_p: float = min_p_field @@ -211,7 +211,7 @@ class CreateChatCompletionRequest(BaseModel): default=None, description="The maximum number of tokens to generate. Defaults to inf", ) - min_tokens: Optional[int] = min_tokens_field + min_tokens: int = min_tokens_field logprobs: Optional[bool] = Field( default=False, description="Whether to output the logprobs or not. 
Default is True" From b4c9762c8d0af6d054f8e8bfbe5f8b23990c172f Mon Sep 17 00:00:00 2001 From: twaka Date: Mon, 8 Apr 2024 18:00:40 +0900 Subject: [PATCH 3/8] pass min_tokens --- llama_cpp/llama_chat_format.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 9da6b9800..f2cd4e5d9 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -71,6 +71,7 @@ def __call__( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -522,6 +523,7 @@ def chat_completion_handler( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -613,6 +615,7 @@ def chat_completion_handler( stop=stop, seed=seed, max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1346,6 +1349,7 @@ def functionary_chat_handler( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1552,6 +1556,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): stream=stream, stop=["user:", ""], max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1628,6 +1633,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): stream=False, grammar=grammar, max_tokens=max_tokens, + min_tokens=min_tokens, temperature=temperature, top_p=top_p, top_k=top_k, @@ -1705,6 +1711,7 @@ def functionary_v1_v2_chat_handler( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1921,6 +1928,7 @@ def prepare_messages_for_inference( stream=stream, stop=stop, max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1980,6 +1988,7 @@ def create_completion(prompt, stop, grammar): stream=stream, stop=stop, max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -2577,6 +2586,7 @@ def __call__( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -3159,6 +3169,7 @@ def chatml_function_calling( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, + min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -3288,6 +3299,7 @@ def chatml_function_calling( stream=stream, stop=stop, max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, 
frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3341,6 +3353,7 @@ def chatml_function_calling( stream=stream, stop=stop, max_tokens=max_tokens, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3385,6 +3398,7 @@ def chatml_function_calling( stream=False, stop=[":"], max_tokens=None, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3413,6 +3427,7 @@ def chatml_function_calling( stop=["<|im_end|>"], logprobs=top_logprobs if logprobs else None, max_tokens=None, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3460,6 +3475,7 @@ def chatml_function_calling( stream=False, stop=stop, max_tokens=None, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3487,6 +3503,7 @@ def chatml_function_calling( stream=False, stop=stop, max_tokens=None, + min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, From 138300aaa7c8fce86e5d20edbf9f7789582bb21d Mon Sep 17 00:00:00 2001 From: twaka Date: Mon, 8 Apr 2024 19:23:09 +0900 Subject: [PATCH 4/8] fix --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1820a5d2a..178cf9c8b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1057,11 +1057,11 @@ def min_length_logits_processor( input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single], ) -> npt.NDArray[np.single]: - # Does it make sense to copy the whole array or can we just overwrite the original one? 
-                new_scores = np.copy(scores)
                 if len(input_ids) - len(prompt_tokens) < min_tokens:
+                    new_scores = np.copy(scores)
                     new_scores[self._token_eos] = -np.inf
-                return new_scores
+                    return new_scores
+                return scores

             _min_length_logits_processor = LogitsProcessorList([min_length_logits_processor])
             if logits_processor is None:

From e440b035a407ac13aad17bb7e0a153bcb61ab572 Mon Sep 17 00:00:00 2001
From: twaka
Date: Mon, 8 Apr 2024 19:46:02 +0900
Subject: [PATCH 5/8] remove copy

---
 llama_cpp/llama.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 178cf9c8b..477fb1bbe 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1058,9 +1058,7 @@ def min_length_logits_processor(
                 scores: npt.NDArray[np.single],
             ) -> npt.NDArray[np.single]:
                 if len(input_ids) - len(prompt_tokens) < min_tokens:
-                    new_scores = np.copy(scores)
-                    new_scores[self._token_eos] = -np.inf
-                    return new_scores
+                    scores[self._token_eos] = -np.inf
                 return scores

             _min_length_logits_processor = LogitsProcessorList([min_length_logits_processor])

From 878399121e248202bb01aed012805fe975b9eb20 Mon Sep 17 00:00:00 2001
From: twaka
Date: Wed, 8 May 2024 23:07:21 +0900
Subject: [PATCH 6/8] implement MinTokensLogitsProcessor

---
 llama_cpp/llama.py             | 41 +++++++++++++---------------------
 llama_cpp/llama_chat_format.py | 17 --------------
 llama_cpp/server/app.py        | 20 +++++++++++++++++
 3 files changed, 36 insertions(+), 42 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 477fb1bbe..caca04a4d 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -925,7 +925,6 @@ def _create_completion(
         prompt: Union[str, List[int]],
         suffix: Optional[str] = None,
         max_tokens: Optional[int] = 16,
-        min_tokens: int = 0,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,
@@ -1052,21 +1051,6 @@ def logit_bias_processor(
             else (self._n_ctx - len(prompt_tokens))
         )

-        if min_tokens > 0:
-            def min_length_logits_processor(
-                input_ids: npt.NDArray[np.intc],
-                scores: npt.NDArray[np.single],
-            ) -> npt.NDArray[np.single]:
-                if len(input_ids) - len(prompt_tokens) < min_tokens:
-                    scores[self._token_eos] = -np.inf
-                return scores
-
-            _min_length_logits_processor = LogitsProcessorList([min_length_logits_processor])
-            if logits_processor is None:
-                logits_processor = _min_length_logits_processor
-            else:
-                logits_processor = logits_processor.extend(_min_length_logits_processor)
-
         if stop != []:
             stop_sequences = [s.encode("utf-8") for s in stop]
         else:
@@ -1485,7 +1469,6 @@ def create_completion(
         prompt: Union[str, List[int]],
         suffix: Optional[str] = None,
         max_tokens: Optional[int] = 16,
-        min_tokens: int = 0,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,
@@ -1515,7 +1498,6 @@ def create_completion(
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
             max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
-            min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
             min_p: The min-p value to use for minimum p sampling.
Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -1550,7 +1532,6 @@ def create_completion( prompt=prompt, suffix=suffix, max_tokens=-1 if max_tokens is None else max_tokens, - min_tokens=min_tokens, temperature=temperature, top_p=top_p, min_p=min_p, @@ -1585,7 +1566,6 @@ def __call__( prompt: str, suffix: Optional[str] = None, max_tokens: Optional[int] = 16, - min_tokens: int = 0, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1615,7 +1595,6 @@ def __call__( prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. - min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop). temperature: The temperature to use for sampling. top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -1650,7 +1629,6 @@ def __call__( prompt=prompt, suffix=suffix, max_tokens=max_tokens, - min_tokens=min_tokens, temperature=temperature, top_p=top_p, min_p=min_p, @@ -1692,7 +1670,6 @@ def create_chat_completion( seed: Optional[int] = None, response_format: Optional[ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1727,7 +1704,6 @@ def create_chat_completion( seed: The seed to use for sampling. response_format: The response format to use for the chat completion. Use { "type": "json_object" } to contstrain output to only valid json. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. - min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop). presence_penalty: The penalty to apply to tokens based on their presence in the prompt. frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt. repeat_penalty: The penalty to apply to repeated tokens. 
@@ -1765,7 +1741,6 @@ def create_chat_completion( seed=seed, response_format=response_format, max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -2103,3 +2078,19 @@ def __call__( self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single] ) -> bool: return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) + + +class MinTokensLogitsProcessor(LogitsProcessor): + def __init__(self, min_tokens: int, token_eos: int): + self.min_tokens = min_tokens + self.token_eos = token_eos + self.prompt_tokens = None + + def __call__( + self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single] + ) -> npt.NDArray[np.single]: + if self.prompt_tokens is None: + self.prompt_tokens = len(input_ids) + if len(input_ids) - self.prompt_tokens < self.min_tokens: + scores[self.token_eos] = -np.inf + return scores diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f2cd4e5d9..9da6b9800 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -71,7 +71,6 @@ def __call__( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -523,7 +522,6 @@ def chat_completion_handler( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -615,7 +613,6 @@ def chat_completion_handler( stop=stop, seed=seed, max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1349,7 +1346,6 @@ def functionary_chat_handler( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1556,7 +1552,6 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): stream=stream, stop=["user:", ""], max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1633,7 +1628,6 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): stream=False, grammar=grammar, max_tokens=max_tokens, - min_tokens=min_tokens, temperature=temperature, top_p=top_p, top_k=top_k, @@ -1711,7 +1705,6 @@ def functionary_v1_v2_chat_handler( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1928,7 +1921,6 @@ def prepare_messages_for_inference( stream=stream, stop=stop, max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1988,7 +1980,6 @@ def create_completion(prompt, stop, grammar): stream=stream, stop=stop, max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -2586,7 +2577,6 @@ def __call__( 
llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -3169,7 +3159,6 @@ def chatml_function_calling( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - min_tokens: int = 0, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -3299,7 +3288,6 @@ def chatml_function_calling( stream=stream, stop=stop, max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3353,7 +3341,6 @@ def chatml_function_calling( stream=stream, stop=stop, max_tokens=max_tokens, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3398,7 +3385,6 @@ def chatml_function_calling( stream=False, stop=[":"], max_tokens=None, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3427,7 +3413,6 @@ def chatml_function_calling( stop=["<|im_end|>"], logprobs=top_logprobs if logprobs else None, max_tokens=None, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3475,7 +3460,6 @@ def chatml_function_calling( stream=False, stop=stop, max_tokens=None, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -3503,7 +3487,6 @@ def chatml_function_calling( stream=False, stop=stop, max_tokens=None, - min_tokens=min_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 4cf10d1f6..05d797b45 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -275,6 +275,7 @@ async def create_completion( "best_of", "logit_bias_type", "user", + "min_tokens", } kwargs = body.model_dump(exclude=exclude) @@ -288,6 +289,15 @@ async def create_completion( if body.grammar is not None: kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + if body.min_tokens is not None: + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList([ + llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos()) + ]) + if "logits_processor" not in kwargs: + kwargs["logits_processor"] = _min_tokens_logits_processor + else: + kwargs["logits_processor"].extend(_min_tokens_logits_processor) + iterator_or_completion: Union[ llama_cpp.CreateCompletionResponse, Iterator[llama_cpp.CreateCompletionStreamResponse], @@ -445,6 +455,7 @@ async def create_chat_completion( "n", "logit_bias_type", "user", + "min_tokens", } kwargs = body.model_dump(exclude=exclude) llama = llama_proxy(body.model) @@ -458,6 +469,15 @@ async def create_chat_completion( if body.grammar is not None: kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + if body.min_tokens is not None: + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList([ + llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos()) + ]) + if "logits_processor" not in kwargs: + kwargs["logits_processor"] = _min_tokens_logits_processor + else: + kwargs["logits_processor"].extend(_min_tokens_logits_processor) + iterator_or_completion: Union[ 
llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk] ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) From 5220aa96762ef87e15728c3fb87b243c5d4dc038 Mon Sep 17 00:00:00 2001 From: twaka Date: Wed, 8 May 2024 23:11:23 +0900 Subject: [PATCH 7/8] format --- llama_cpp/server/app.py | 12 ++++++------ llama_cpp/server/types.py | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 05d797b45..6c09e7ac8 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -290,9 +290,9 @@ async def create_completion( kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) if body.min_tokens is not None: - _min_tokens_logits_processor = llama_cpp.LogitsProcessorList([ - llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos()) - ]) + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + ) if "logits_processor" not in kwargs: kwargs["logits_processor"] = _min_tokens_logits_processor else: @@ -470,9 +470,9 @@ async def create_chat_completion( kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) if body.min_tokens is not None: - _min_tokens_logits_processor = llama_cpp.LogitsProcessorList([ - llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos()) - ]) + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + ) if "logits_processor" not in kwargs: kwargs["logits_processor"] = _min_tokens_logits_processor else: diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index 926e0c5ed..a75f9e55b 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -17,7 +17,9 @@ ) min_tokens_field = Field( - default=0, ge=0, description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop)." + default=0, + ge=0, + description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).", ) temperature_field = Field( From 24a4bb83cae8917127b92948fb48bfc8ced860e3 Mon Sep 17 00:00:00 2001 From: twaka Date: Wed, 8 May 2024 23:15:43 +0900 Subject: [PATCH 8/8] fix condition --- llama_cpp/server/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 6c09e7ac8..4cda4af7a 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -289,7 +289,7 @@ async def create_completion( if body.grammar is not None: kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) - if body.min_tokens is not None: + if body.min_tokens > 0: _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] ) @@ -469,7 +469,7 @@ async def create_chat_completion( if body.grammar is not None: kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) - if body.min_tokens is not None: + if body.min_tokens > 0: _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] )
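
A usage sketch for reference (not taken from the patches above; the model path and the prompt are placeholder assumptions). After PATCH 6/8, min_tokens is handled in the server layer and the library exposes the building block MinTokensLogitsProcessor, so the same behavior can be obtained through the Python API by constructing the processor and passing it via logits_processor, mirroring the wiring added to llama_cpp/server/app.py:

    import llama_cpp

    # Placeholder path; any local GGUF model works here.
    llm = llama_cpp.Llama(model_path="./models/model.gguf")

    # Mask the end-of-sequence logit until at least 16 completion tokens have
    # been sampled, the same masking the server applies when a request sets
    # "min_tokens": 16.
    processors = llama_cpp.LogitsProcessorList(
        [llama_cpp.MinTokensLogitsProcessor(16, llm.token_eos())]
    )

    out = llm.create_completion(
        prompt="Q: Name the planets in the solar system. A:",
        max_tokens=64,
        logits_processor=processors,
    )
    print(out["choices"][0]["text"])

Because the processor records the prompt length on its first call, a fresh instance is needed for each generation, which is why the server builds the LogitsProcessorList inside the request handlers rather than once at startup. When the OpenAI-compatible server is used instead, the same effect is available by sending min_tokens (default 0, constrained to ge=0) in the body of /v1/completions or /v1/chat/completions; as the field description notes, generation may still end before min_tokens tokens if max_tokens or a stop sequence is reached first.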