From f9f1d40bd71508ea8d2ebe6c894a877c1712e82e Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 22:41:31 +0100 Subject: [PATCH 01/41] add token count, flexible batch size and kwargs to vllm class --- promptolution/llms/vllm.py | 77 ++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index d99c542..ddd60fa 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -32,22 +32,25 @@ class VLLM(BaseLLM): Methods: get_response: Generate responses for a list of prompts. + get_token_count: Get the current count of input and output tokens. + reset_token_count: Reset the token counters to zero. """ def __init__( self, model_id: str, - batch_size: int = 64, + batch_size: int | None = None, max_generated_tokens: int = 256, temperature: float = 0.1, top_p: float = 0.9, - model_storage_path: str = None, - token: str = None, + model_storage_path: str | None = None, + token: str | None = None, dtype: str = "auto", tensor_parallel_size: int = 1, gpu_memory_utilization: float = 0.95, max_model_len: int = 2048, trust_remote_code: bool = False, + **kwargs, ): """Initialize the VLLM with a specific model. @@ -64,31 +67,45 @@ def __init__( gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95. max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. + **kwargs: Additional keyword arguments to pass to the LLM class initialization. Note: This method sets up a vLLM engine with specified parameters for efficient inference. """ - self.batch_size = batch_size self.dtype = dtype self.tensor_parallel_size = tensor_parallel_size self.gpu_memory_utilization = gpu_memory_utilization self.max_model_len = max_model_len self.trust_remote_code = trust_remote_code + # Initialize token counters + self.input_token_count = 0 + self.output_token_count = 0 + # Configure sampling parameters self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens) - # Initialize the vLLM engine - self.llm = LLM( - model=model_id, - tokenizer=model_id, - dtype=self.dtype, - tensor_parallel_size=self.tensor_parallel_size, - gpu_memory_utilization=self.gpu_memory_utilization, - max_model_len=self.max_model_len, - download_dir=model_storage_path, - trust_remote_code=self.trust_remote_code, - ) + # Initialize the vLLM engine with both explicit parameters and any additional kwargs + llm_params = { + "model": model_id, + "tokenizer": model_id, + "dtype": self.dtype, + "tensor_parallel_size": self.tensor_parallel_size, + "gpu_memory_utilization": self.gpu_memory_utilization, + "max_model_len": self.max_model_len, + "download_dir": model_storage_path, + "trust_remote_code": self.trust_remote_code, + **kwargs, + } + + self.llm = LLM(**llm_params) + + if batch_size is None: + gpu_blocks = self.llm.llm_engine.model_executor.cache_config.num_gpu_blocks + block_size = self.llm.llm_engine.model_executor.cache_config.block_size + self.batch_size = (gpu_blocks * block_size / self.max_model_len) * 0.95 + else: + self.batch_size = batch_size # Initialize tokenizer separately for potential pre-processing self.tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -104,6 +121,7 @@ def get_response(self, inputs: list[str]): Note: This method uses vLLM's batched generation capabilities for efficient inference. 
+ It also counts input and output tokens. """ prompts = [ self.tokenizer.apply_chat_template( @@ -119,16 +137,45 @@ def get_response(self, inputs: list[str]): for input in inputs ] + # Count input tokens + for prompt in prompts: + input_tokens = self.tokenizer.encode(prompt) + self.input_token_count += len(input_tokens) + # generate responses for self.batch_size prompts at the same time all_responses = [] for i in range(0, len(prompts), self.batch_size): batch = prompts[i : i + self.batch_size] outputs = self.llm.generate(batch, self.sampling_params) responses = [output.outputs[0].text for output in outputs] + + # Count output tokens + for response in responses: + output_tokens = self.tokenizer.encode(response) + self.output_token_count += len(output_tokens) + all_responses.extend(responses) return all_responses + def get_token_count(self): + """Get the current count of input and output tokens. + + Returns: + dict: A dictionary containing the input and output token counts. + """ + return { + "input_tokens": self.input_token_count, + "output_tokens": self.output_token_count, + "total_tokens": self.input_token_count + self.output_token_count, + } + + def reset_token_count(self): + """Reset the token counters to zero.""" + self.input_token_count = 0 + self.output_token_count = 0 + logger.info("Token counters have been reset.") + def __del__(self): """Cleanup method to delete the LLM instance and free up GPU memory.""" del self.llm From b20495fdc7cfc05999300ee5ec29cca28a1cfde2 Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 22:55:52 +0100 Subject: [PATCH 02/41] add testing script for implementation --- scripts/llm_test_run.py | 94 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 scripts/llm_test_run.py diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py new file mode 100644 index 0000000..65930c4 --- /dev/null +++ b/scripts/llm_test_run.py @@ -0,0 +1,94 @@ +"""Test script for measuring raw LLM inference performance on a dataset.""" +import argparse +import time +from logging import Logger + +import numpy as np +import pandas as pd +from promptolution.config import Config +from promptolution.llms import get_llm +from promptolution.predictors import Classificator +from promptolution.tasks import get_task +from tqdm import tqdm + +logger = Logger(__name__) + + +def main(): + """Run inference test on a dataset using a specified LLM.""" + parser = argparse.ArgumentParser() + parser.add_argument("--model") + parser.add_argument("--output") + parser.add_argument("--datasets", default=["agnews", "subj"]) + parser.add_argument("--token", default=None) + parser.add_argument("--batch-size", default=None) + parser.add_argument("--model-storage-path", default=None) + args = parser.parse_args() + + start_time = time.time() + + if "vllm" in args.model: + llm = get_llm( + args.model, + batch_size=args.batch_size, + model_storage_path=args.model_storage_path, + revision="main", + ) + else: + llm = get_llm(args.model, args.token) + + results = pd.DataFrame() + + for dataset in args.datasets: + config = Config( + evaluation_llm=args.model, + ds_path=f"data_sets/cls/{dataset}/", + task_name=dataset, + api_token=args.token, + n_eval_samples=200, + ) + + task = get_task(config, split="dev") + predictor = Classificator(llm, classes=task.classes) + + prompt = task.initial_population + + xs = task.xs[: config.n_eval_samples] + ys = task.ys[: config.n_eval_samples] + + for prompt in tqdm(task.initial_population): + preds, seqs = 
predictor.predict(prompt, xs, return_seq=True) + + scores = [] + for i in range(len(xs)): + scores.append(1 if preds[0][i] == ys[i] else 0) + + # clean up the sequences + seqs = [seq.replace("\n", "").strip() for seq in seqs] + + # if single prompts should be stored + # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) + # df.to_csv(args.output + "_detailed", index=False) + + accuracy = np.array(scores).mean() + + results = pd.DataFrame( + dict( + model=args.model, + dataset=dataset, + prompt=prompt, + accuracy=accuracy, + n_samples=len(xs), + ), + index=[0], + ) + results.to_csv(args.output, mode="a", header=False, index=False) + print(llm.get_token_count()) + llm.reset_token_count() + + total_inference_time = time.time() - start_time + print(f"Total inference took {total_inference_time:.2f} seconds") + + +if __name__ == "__main__": + main() From e27fa6ce0f084d6cee25a2d76f5823ffc248cbaa Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 23:06:42 +0100 Subject: [PATCH 03/41] fix batch size calculation --- promptolution/llms/vllm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index ddd60fa..30d78da 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -103,7 +103,8 @@ def __init__( if batch_size is None: gpu_blocks = self.llm.llm_engine.model_executor.cache_config.num_gpu_blocks block_size = self.llm.llm_engine.model_executor.cache_config.block_size - self.batch_size = (gpu_blocks * block_size / self.max_model_len) * 0.95 + self.batch_size = int((gpu_blocks * block_size / self.max_model_len) * 0.95) + logger.info(f"Batch size set to {self.batch_size} based on GPU memory.") else: self.batch_size = batch_size From 01eeb6d30874a8e43c7486859e5abbde17bd5f8b Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 23:21:16 +0100 Subject: [PATCH 04/41] small changes --- scripts/llm_test_run.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 65930c4..40b9938 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -83,11 +83,10 @@ def main(): index=[0], ) results.to_csv(args.output, mode="a", header=False, index=False) - print(llm.get_token_count()) - llm.reset_token_count() total_inference_time = time.time() - start_time - print(f"Total inference took {total_inference_time:.2f} seconds") + print(f"Total inference took {total_inference_time:.2f} seconds and required {llm.get_token_count()} tokens.") + print(f"Results saved to {args.output}") if __name__ == "__main__": From 045ffb8722be7c5b6e78180bf8e02415bc9e9c35 Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 23:27:02 +0100 Subject: [PATCH 05/41] add revision test --- scripts/llm_test_run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 40b9938..4de7131 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -22,6 +22,7 @@ def main(): parser.add_argument("--datasets", default=["agnews", "subj"]) parser.add_argument("--token", default=None) parser.add_argument("--batch-size", default=None) + parser.add_argument("--revision", default="main") parser.add_argument("--model-storage-path", default=None) args = parser.parse_args() @@ -32,7 +33,7 @@ def main(): args.model, batch_size=args.batch_size, model_storage_path=args.model_storage_path, - revision="main", + revision=args.revision, ) else: llm = get_llm(args.model, args.token) From 
ad54496f0f4d64b96a3399764ad494378ab8b986 Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 23:28:15 +0100 Subject: [PATCH 06/41] add argument to parser --- scripts/llm_test_run.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 4de7131..69d2328 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -23,6 +23,7 @@ def main(): parser.add_argument("--token", default=None) parser.add_argument("--batch-size", default=None) parser.add_argument("--revision", default="main") + parser.add_argument("--max-model-len", default=None) parser.add_argument("--model-storage-path", default=None) args = parser.parse_args() @@ -32,6 +33,7 @@ def main(): llm = get_llm( args.model, batch_size=args.batch_size, + max_model_len=args.max_model_len, model_storage_path=args.model_storage_path, revision=args.revision, ) From fc8d7790b2c21151e231509ece92338fc1783ea8 Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 23:41:34 +0100 Subject: [PATCH 07/41] max model len to int --- scripts/llm_test_run.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 69d2328..35519eb 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -29,11 +29,14 @@ def main(): start_time = time.time() + if args.max_model_len is not None: + max_model_len = int(args.max_model_len) + if "vllm" in args.model: llm = get_llm( args.model, batch_size=args.batch_size, - max_model_len=args.max_model_len, + max_model_len=max_model_len, model_storage_path=args.model_storage_path, revision=args.revision, ) From 469117c4b9b4b08c703c79fb95a6697e5fb42dbf Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 23:49:37 +0100 Subject: [PATCH 08/41] remove script --- scripts/llm_test_run.py | 99 ----------------------------------------- 1 file changed, 99 deletions(-) delete mode 100644 scripts/llm_test_run.py diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py deleted file mode 100644 index 35519eb..0000000 --- a/scripts/llm_test_run.py +++ /dev/null @@ -1,99 +0,0 @@ -"""Test script for measuring raw LLM inference performance on a dataset.""" -import argparse -import time -from logging import Logger - -import numpy as np -import pandas as pd -from promptolution.config import Config -from promptolution.llms import get_llm -from promptolution.predictors import Classificator -from promptolution.tasks import get_task -from tqdm import tqdm - -logger = Logger(__name__) - - -def main(): - """Run inference test on a dataset using a specified LLM.""" - parser = argparse.ArgumentParser() - parser.add_argument("--model") - parser.add_argument("--output") - parser.add_argument("--datasets", default=["agnews", "subj"]) - parser.add_argument("--token", default=None) - parser.add_argument("--batch-size", default=None) - parser.add_argument("--revision", default="main") - parser.add_argument("--max-model-len", default=None) - parser.add_argument("--model-storage-path", default=None) - args = parser.parse_args() - - start_time = time.time() - - if args.max_model_len is not None: - max_model_len = int(args.max_model_len) - - if "vllm" in args.model: - llm = get_llm( - args.model, - batch_size=args.batch_size, - max_model_len=max_model_len, - model_storage_path=args.model_storage_path, - revision=args.revision, - ) - else: - llm = get_llm(args.model, args.token) - - results = pd.DataFrame() - - for dataset in args.datasets: - config = Config( - evaluation_llm=args.model, - 
ds_path=f"data_sets/cls/{dataset}/", - task_name=dataset, - api_token=args.token, - n_eval_samples=200, - ) - - task = get_task(config, split="dev") - predictor = Classificator(llm, classes=task.classes) - - prompt = task.initial_population - - xs = task.xs[: config.n_eval_samples] - ys = task.ys[: config.n_eval_samples] - - for prompt in tqdm(task.initial_population): - preds, seqs = predictor.predict(prompt, xs, return_seq=True) - - scores = [] - for i in range(len(xs)): - scores.append(1 if preds[0][i] == ys[i] else 0) - - # clean up the sequences - seqs = [seq.replace("\n", "").strip() for seq in seqs] - - # if single prompts should be stored - # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) - # df.to_csv(args.output + "_detailed", index=False) - - accuracy = np.array(scores).mean() - - results = pd.DataFrame( - dict( - model=args.model, - dataset=dataset, - prompt=prompt, - accuracy=accuracy, - n_samples=len(xs), - ), - index=[0], - ) - results.to_csv(args.output, mode="a", header=False, index=False) - - total_inference_time = time.time() - start_time - print(f"Total inference took {total_inference_time:.2f} seconds and required {llm.get_token_count()} tokens.") - print(f"Results saved to {args.output}") - - -if __name__ == "__main__": - main() From 6b543fa9ad73d24106e9f1383f13d77f3d7349af Mon Sep 17 00:00:00 2001 From: mo374z Date: Fri, 7 Mar 2025 00:21:55 +0100 Subject: [PATCH 09/41] Change version and Release notes --- docs/release-notes.md | 5 +++++ pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/release-notes.md b/docs/release-notes.md index 20b97b7..7be8e79 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,5 +1,10 @@ # Release Notes +## Release v1.2.1 +### What's changed +#### Added features +* New features for the VLLM Wrapper (automatic batch size determination, accepting kwargs and token count) + ## Release v1.2.0 ### What's changed #### Added features diff --git a/pyproject.toml b/pyproject.toml index e4f5be3..06cbcfc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "promptolution" -version = "1.2.0" +version = "1.2.1" description = "" authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"] readme = "README.md" From 619ce65e374dbfa4849919e33f94e7fd7f4ec26d Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 7 Mar 2025 11:41:04 +0100 Subject: [PATCH 10/41] changed callback behaviour and impelemented token count callback --- promptolution/callbacks.py | 47 ++++++++++++++++-- promptolution/llms/api_llm.py | 2 +- promptolution/llms/base_llm.py | 55 +++++++++++++++++++++- promptolution/llms/local_llm.py | 2 +- promptolution/llms/vllm.py | 29 +----------- promptolution/optimizers/base_optimizer.py | 13 +++-- promptolution/optimizers/evoprompt_de.py | 6 ++- promptolution/optimizers/evoprompt_ga.py | 5 +- promptolution/optimizers/opro.py | 4 +- promptolution/utils/prompt_creation.py | 10 ++-- 10 files changed, 126 insertions(+), 47 deletions(-) diff --git a/promptolution/callbacks.py b/promptolution/callbacks.py index fe655d6..b4f75af 100644 --- a/promptolution/callbacks.py +++ b/promptolution/callbacks.py @@ -14,24 +14,33 @@ def on_step_end(self, optimizer): Args: optimizer: The optimizer object that called the callback. + + Returns: + Bool: True if the optimization should continue, False if it should stop. """ - pass + return True def on_epoch_end(self, optimizer): """Called at the end of each optimization epoch. 
Args: optimizer: The optimizer object that called the callback. + + Returns: + Bool: True if the optimization should continue, False if it should stop. """ - pass + return True def on_train_end(self, optimizer): """Called at the end of the entire optimization process. Args: optimizer: The optimizer object that called the callback. + + Returns: + Bool: True if the optimization should continue, False if it should stop. """ - pass + return True class LoggerCallback(Callback): @@ -57,6 +66,8 @@ def on_step_end(self, optimizer): self.logger.critical(f"*** Prompt {i}: Score: {score}") self.logger.critical(f"{prompt}") + return True + def on_train_end(self, optimizer, logs=None): """Log information at the end of training. @@ -66,6 +77,8 @@ def on_train_end(self, optimizer, logs=None): """ self.logger.critical(f"Training ended - {logs}") + return True + class CSVCallback(Callback): """Callback for saving optimization progress to a CSV file. @@ -105,13 +118,15 @@ def on_step_end(self, optimizer): ) df.to_csv(self.path, mode="a", header=False, index=False) + return True + def on_train_end(self, optimizer): """Called at the end of training. Args: optimizer: The optimizer object that called the callback. """ - pass + return True class BestPromptCallback(Callback): @@ -139,6 +154,8 @@ def on_step_end(self, optimizer): self.best_score = optimizer.scores[0] self.best_prompt = optimizer.prompts[0] + return True + def get_best_prompt(self): """Get the best prompt and score achieved during optimization. @@ -173,6 +190,8 @@ def on_step_end(self, optimizer): """ self.pbar.update(1) + return True + def on_train_end(self, optimizer): """Close the progress bar at the end of training. @@ -180,3 +199,23 @@ def on_train_end(self, optimizer): optimizer: The optimizer object that called the callback. """ self.pbar.close() + + return True + + +class TokenCountCallback(Callback): + """Callback for stopping optimization based on the total token count.""" + + def __init__(self, max_tokens_for_termination): + """Initialize the TokenCountCallback.""" + self.max_tokens_for_termination = max_tokens_for_termination + + def on_step_end(self, optimizer): + """Check if the total token count exceeds the maximum allowed. If so, stop the optimization.""" + token_counts = optimizer.predictor.llm.get_token_count() + total_token_count = token_counts["total_tokens"] + + if total_token_count > self.max_tokens_for_termination: + return False + + return True diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index cf966bf..db920de 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -80,7 +80,7 @@ def __init__(self, model_id: str, token: str = None): else: self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token) - def get_response(self, prompts: List[str]) -> List[str]: + def _get_response(self, prompts: List[str]) -> List[str]: """Get responses for a list of prompts in a synchronous manner. This method includes retry logic for handling connection errors and rate limits. diff --git a/promptolution/llms/base_llm.py b/promptolution/llms/base_llm.py index 7f0e95d..c222b6d 100644 --- a/promptolution/llms/base_llm.py +++ b/promptolution/llms/base_llm.py @@ -18,10 +18,61 @@ class BaseLLM(ABC): def __init__(self, *args, **kwargs): """Initialize the LLM.""" - pass + self.input_token_count = 0 + self.output_token_count = 0 + + def get_token_count(self): + """Get the current count of input and output tokens. 
+ + Returns: + dict: A dictionary containing the input and output token counts. + """ + return { + "input_tokens": self.input_token_count, + "output_tokens": self.output_token_count, + "total_tokens": self.input_token_count + self.output_token_count, + } + + def reset_token_count(self): + """Reset the token counters to zero.""" + self.input_token_count = 0 + self.output_token_count = 0 + + def update_token_count(self, inputs: List[str], outputs: List[str]): + """Update the token count based on the given inputs and outputs. + + Args: + inputs (List[str]): A list of input prompts. + outputs (List[str]): A list of generated responses. + """ + input_tokens = sum([len(i.split()) for i in inputs]) + output_tokens = sum([len(o.split()) for o in outputs]) + self.input_token_count += input_tokens + self.output_token_count += output_tokens + + def get_response(self, prompts: str) -> str: + """Generate responses for the given prompts. + + This method calls the _get_response method to generate responses + for the given prompts. It also updates the token count for the + input and output tokens. + + Args: + prompts (str or List[str]): Input prompt(s). If a single string is provided, + it's converted to a list containing that string. + + Returns: + List[str]: A list of generated responses, one for each input prompt. + """ + if isinstance(prompts, str): + prompts = [prompts] + responses = self._get_response(prompts) + self.update_token_count(prompts, responses) + + return responses @abstractmethod - def get_response(self, prompts: List[str]) -> List[str]: + def _get_response(self, prompts: List[str]) -> List[str]: """Generate responses for the given prompts. This method should be implemented by subclasses to define how diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py index 074bf01..a58675e 100644 --- a/promptolution/llms/local_llm.py +++ b/promptolution/llms/local_llm.py @@ -48,7 +48,7 @@ def __init__(self, model_id: str, batch_size=8): self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id self.pipeline.tokenizer.padding_side = "left" - def get_response(self, prompts: list[str]): + def _get_response(self, prompts: list[str]): """Generate responses for a list of prompts using the local language model. Args: diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 30d78da..39157a1 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -78,10 +78,6 @@ def __init__( self.max_model_len = max_model_len self.trust_remote_code = trust_remote_code - # Initialize token counters - self.input_token_count = 0 - self.output_token_count = 0 - # Configure sampling parameters self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens) @@ -111,7 +107,7 @@ def __init__( # Initialize tokenizer separately for potential pre-processing self.tokenizer = AutoTokenizer.from_pretrained(model_id) - def get_response(self, inputs: list[str]): + def _get_response(self, inputs: list[str]): """Generate responses for a list of prompts using the vLLM engine. 
Args: @@ -150,33 +146,10 @@ def get_response(self, inputs: list[str]): outputs = self.llm.generate(batch, self.sampling_params) responses = [output.outputs[0].text for output in outputs] - # Count output tokens - for response in responses: - output_tokens = self.tokenizer.encode(response) - self.output_token_count += len(output_tokens) - all_responses.extend(responses) return all_responses - def get_token_count(self): - """Get the current count of input and output tokens. - - Returns: - dict: A dictionary containing the input and output token counts. - """ - return { - "input_tokens": self.input_token_count, - "output_tokens": self.output_token_count, - "total_tokens": self.input_token_count + self.output_token_count, - } - - def reset_token_count(self): - """Reset the token counters to zero.""" - self.input_token_count = 0 - self.output_token_count = 0 - logger.info("Token counters have been reset.") - def __del__(self): """Cleanup method to delete the LLM instance and free up GPU memory.""" del self.llm diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index 2cac685..95ec7c2 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -61,18 +61,24 @@ def optimize(self, n_steps: int) -> List[str]: def _on_step_end(self): """Call all registered callbacks at the end of each optimization step.""" + continue_optimization = True for callback in self.callbacks: - callback.on_step_end(self) + continue_optimization &= callback.on_step_end(self) # if any callback returns False, end the optimization + + return continue_optimization def _on_epoch_end(self): """Call all registered callbacks at the end of each optimization epoch.""" + continue_optimization = True for callback in self.callbacks: - callback.on_epoch_end(self) + continue_optimization &= callback._on_epoch_end(self) # if any callback returns False, end the optimization + + return continue_optimization def _on_train_end(self): """Call all registered callbacks at the end of the entire optimization process.""" for callback in self.callbacks: - callback.on_train_end(self) + callback._on_train_end(self) class DummyOptimizer(BaseOptimizer): @@ -111,4 +117,5 @@ def optimize(self, n_steps) -> list[str]: self._on_step_end() self._on_epoch_end() self._on_train_end() + return self.prompts diff --git a/promptolution/optimizers/evoprompt_de.py b/promptolution/optimizers/evoprompt_de.py index 17d74b3..f44556e 100644 --- a/promptolution/optimizers/evoprompt_de.py +++ b/promptolution/optimizers/evoprompt_de.py @@ -89,7 +89,11 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts[i] = child_prompts[i] self.scores[i] = child_scores[i] - self._on_step_end() + continue_optimization = self._on_step_end() + + if not continue_optimization: + break self._on_train_end() + return self.prompts diff --git a/promptolution/optimizers/evoprompt_ga.py b/promptolution/optimizers/evoprompt_ga.py index 2ec789b..f6efcb8 100644 --- a/promptolution/optimizers/evoprompt_ga.py +++ b/promptolution/optimizers/evoprompt_ga.py @@ -77,7 +77,10 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts = [prompt for _, prompt in sorted(zip(scores, prompts), reverse=True)][: len(self.prompts)] self.scores = sorted(scores, reverse=True)[: len(self.prompts)] - self._on_step_end() + continue_optimization = self._on_step_end() + if not continue_optimization: + break + return self.prompts def _crossover(self, prompts, scores) -> str: diff --git a/promptolution/optimizers/opro.py 
b/promptolution/optimizers/opro.py index 3c71f4e..ef6f6fd 100644 --- a/promptolution/optimizers/opro.py +++ b/promptolution/optimizers/opro.py @@ -89,7 +89,9 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts.append(prompt) self.scores.append(score) - self._on_step_end() + continue_optimization = self._on_step_end() + if not continue_optimization: + break self._on_epoch_end() diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index d85edd9..e0c7c0b 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -56,7 +56,7 @@ def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = """ if isinstance(task, ClassificationTask): # if classification task sample such that all classes are represented - unique_classes, counts = np.unique(task.ys, return_counts=True) + unique_labels, counts = np.unique(task.ys, return_counts=True) proportions = counts / len(task.ys) samples_per_class = np.round(proportions * n_samples).astype(int) samples_per_class = np.maximum(samples_per_class, 1) @@ -64,8 +64,8 @@ def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = # sample xs = [] ys = [] - for cls, n_samples in zip(unique_classes, samples_per_class): - indices = np.where(task.ys == cls)[0] + for label, n_samples in zip(unique_labels, samples_per_class): + indices = np.where(task.ys == label)[0] indices = np.random.choice(indices, n_samples, replace=False) xs.extend(task.xs[indices]) ys.extend(task.ys[indices]) @@ -78,9 +78,9 @@ def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = meta_prompt = PROMPT_CREATION_TEMPLATE if meta_prompt is None else meta_prompt examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) - meta_prompt = meta_prompt.replace("", examples) prompt = llm.get_response([meta_prompt])[0] prompt = prompt.split("")[0].split("")[-1] + prompt = prompt.strip() return prompt From 2588664f2caf0bdbf8046b72841fa28992a51d95 Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 7 Mar 2025 12:41:27 +0100 Subject: [PATCH 11/41] added super inits --- promptolution/llms/api_llm.py | 1 + promptolution/llms/local_llm.py | 2 + promptolution/llms/vllm.py | 2 + scripts/llm_test_run.py | 98 +++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+) create mode 100644 scripts/llm_test_run.py diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index db920de..14a70da 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -73,6 +73,7 @@ def __init__(self, model_id: str, token: str = None): Raises: ValueError: If an unknown model identifier is provided. """ + super().__init__() if "claude" in model_id: self.model = ChatAnthropic(model=model_id, api_key=token) elif "gpt" in model_id: diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py index a58675e..577d4a0 100644 --- a/promptolution/llms/local_llm.py +++ b/promptolution/llms/local_llm.py @@ -35,6 +35,8 @@ def __init__(self, model_id: str, batch_size=8): This method sets up a text generation pipeline with bfloat16 precision, automatic device mapping, and specific generation parameters. 
""" + super().__init__() + self.pipeline = transformers.pipeline( "text-generation", model=model_id, diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 39157a1..5380e87 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -72,6 +72,8 @@ def __init__( Note: This method sets up a vLLM engine with specified parameters for efficient inference. """ + super().__init__() + self.dtype = dtype self.tensor_parallel_size = tensor_parallel_size self.gpu_memory_utilization = gpu_memory_utilization diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py new file mode 100644 index 0000000..15b1203 --- /dev/null +++ b/scripts/llm_test_run.py @@ -0,0 +1,98 @@ +"""Test script for measuring raw LLM inference performance on a dataset.""" +import argparse +import time +from logging import Logger + +import numpy as np +import pandas as pd +from promptolution.config import Config +from promptolution.llms import get_llm +from promptolution.predictors import Classificator +from promptolution.tasks import get_task +from tqdm import tqdm + +logger = Logger(__name__) + +# TODO: Align this script with how we import datasets in capo + + +"""Run inference test on a dataset using a specified LLM.""" +parser = argparse.ArgumentParser() +parser.add_argument("--model") +parser.add_argument("--output") +parser.add_argument("--datasets", default=["subj"]) +parser.add_argument("--token", default=None) +parser.add_argument("--batch-size", default=None) +parser.add_argument("--revision", default="main") +parser.add_argument("--max-model-len", default=None) +parser.add_argument("--model-storage-path", default=None) +args = parser.parse_args() + +start_time = time.time() + +if args.max_model_len is not None: + max_model_len = int(args.max_model_len) + +if "vllm" in args.model: + llm = get_llm( + args.model, + batch_size=args.batch_size, + max_model_len=max_model_len, + model_storage_path=args.model_storage_path, + revision=args.revision, + ) +else: + llm = get_llm(args.model, args.token) + +results = pd.DataFrame() + +for dataset in args.datasets: + config = Config( + evaluation_llm=args.model, + ds_path=f"data_sets/cls/{dataset}/", + task_name=dataset, + api_token=args.token, + n_eval_samples=200, + ) + + task = get_task(config, split="dev") + predictor = Classificator(llm, classes=task.classes) + + prompts = [task.initial_population[0]] + + xs = task.xs[: config.n_eval_samples] + ys = task.ys[: config.n_eval_samples] + + for prompt in tqdm(prompts): + preds, seqs = predictor.predict(prompt, xs, return_seq=True) + + scores = [] + for i in range(len(xs)): + scores.append(1 if preds[0][i] == ys[i] else 0) + + # clean up the sequences + seqs = [seq.replace("\n", "").strip() for seq in seqs] + + # if single prompts should be stored + # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) + # df.to_csv(args.output + "_detailed", index=False) + + accuracy = np.array(scores).mean() + + results = pd.DataFrame( + dict( + model=args.model, + dataset=dataset, + prompt=prompt, + accuracy=accuracy, + n_samples=len(xs), + ), + index=[0], + ) + results.to_csv(args.output, mode="a", header=False, index=False) + +total_inference_time = time.time() - start_time +print( + f"Total inference took {total_inference_time:.2f} seconds and required {llm.get_token_count()} tokens." 
+) +print(f"Results saved to {args.output}") From 8c365c72c3753bb37ddd9f5572f828398b859802 Mon Sep 17 00:00:00 2001 From: Tom Zehle Date: Sat, 8 Mar 2025 13:09:55 +0100 Subject: [PATCH 12/41] allow for splits not based on white space (such as new line break etc) --- promptolution/predictors/classificator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promptolution/predictors/classificator.py b/promptolution/predictors/classificator.py index f33bfc6..c23278a 100644 --- a/promptolution/predictors/classificator.py +++ b/promptolution/predictors/classificator.py @@ -44,7 +44,7 @@ def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray response = [] for pred in preds: predicted_class = self.classes[0] # use first class as default pred - for word in pred.split(" "): + for word in pred.split(): word = "".join([c for c in word if c.isalnum()]) if word in self.classes: predicted_class = word From 7e7d2b57aafb158b37dc50b9cf0eb1c4b0878e89 Mon Sep 17 00:00:00 2001 From: finitearth Date: Sat, 8 Mar 2025 17:55:49 +0100 Subject: [PATCH 13/41] include task descriptions --- promptolution/predictors/classificator.py | 57 ++++++++++++++++++++++- promptolution/templates.py | 8 ++++ promptolution/utils/prompt_creation.py | 12 +++-- 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/promptolution/predictors/classificator.py b/promptolution/predictors/classificator.py index c23278a..89eb5d4 100644 --- a/promptolution/predictors/classificator.py +++ b/promptolution/predictors/classificator.py @@ -7,7 +7,7 @@ from promptolution.predictors.base_predictor import BasePredictor -class Classificator(BasePredictor): +class FirstOccurrenceClassificator(BasePredictor): """A predictor class for classification tasks using language models. This class takes a language model and a list of classes, and provides a method @@ -33,6 +33,10 @@ def __init__(self, llm, classes, *args, **kwargs): """ super().__init__(llm) self.classes = classes + self.extraction_description = ( + f"The task is to classify the texts into one of those classes: {', '.join(classes)}." + "The first occurrence of a valid class label in the prediction is used as the predicted class." + ) def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray: """Extract class labels from the predictions, based on the list of valid class labels. @@ -54,3 +58,54 @@ def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray response = np.array(response).reshape(*shape) return response + + +class MarkerBasedClassificator(BasePredictor): + """A predictor class for classification tasks using language models. + + This class takes a language model and a list of classes, and provides a method + to predict classes for given prompts and input data. The class labels are extracted. + + Attributes: + llm: The language model used for generating predictions. + classes (List[str]): The list of valid class labels. + marker (str): The marker to use for extracting the class label. + + Inherits from: + BasePredictor: The base class for predictors in the promptolution library. + """ + + def __init__(self, llm, classes, marker="", *args, **kwargs): + """Initialize the Classificator. + + Args: + llm: The language model to use for predictions. + classes (List[str]): The list of valid class labels. + marker (str): The marker to use for extracting the class label. + *args, **kwargs: Additional arguments for the BasePredictor. 
+ """ + super().__init__(llm) + self.classes = classes + self.marker = marker + self.extraction_description = ( + f"The task is to classify the texts into one of those classes: {','.join(classes)}." + f"The class label is extracted from the text following the marker: {marker}." + ) + + def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray: + """Extract class labels from the predictions, by extracting the text following the marker. + + Args: + preds: The raw predictions from the language model. + shape: The shape of the output array: (n_prompts, n_samples). + """ + response = [] + for pred in preds: + predicted_class = pred.split(self.marker)[-1].strip() + if predicted_class not in self.classes: + predicted_class = self.classes[0] + + response.append(predicted_class) + + response = np.array(response).reshape(*shape) + return response diff --git a/promptolution/templates.py b/promptolution/templates.py index 05d7ae3..18c0765 100644 --- a/promptolution/templates.py +++ b/promptolution/templates.py @@ -114,3 +114,11 @@ The instruction was""" + +PROMPT_CREATION_TEMPLATE_TD = """You are asked to give the corresponding prompt that gives the following outputs given these inputs for the following task: . +Return it starting with and ending with tags. +Include the name of the output classes in the prompt. + + + +The instruction was""" diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index e0c7c0b..07f8c16 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -7,7 +7,7 @@ from promptolution.llms.base_llm import BaseLLM from promptolution.tasks.base_task import BaseTask from promptolution.tasks.classification_tasks import ClassificationTask -from promptolution.templates import PROMPT_CREATION_TEMPLATE, PROMPT_VARIATION_TEMPLATE +from promptolution.templates import PROMPT_CREATION_TEMPLATE, PROMPT_CREATION_TEMPLATE_TD, PROMPT_VARIATION_TEMPLATE def create_prompt_variation(prompt: Union[List[str], str], llm: BaseLLM, meta_prompt: str = None) -> List[str]: @@ -35,7 +35,9 @@ def create_prompt_variation(prompt: Union[List[str], str], llm: BaseLLM, meta_pr return varied_prompts -def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = None, n_samples: int = 3) -> List[str]: +def create_prompts_from_samples( + task: BaseTask, llm: BaseLLM, meta_prompt: str = None, n_samples: int = 3, task_description: str = None +) -> List[str]: """Generate a set of prompts from dataset examples sampled from a given task. Idea taken from the paper Zhou et al. (2021) https://arxiv.org/pdf/2211.01910 @@ -50,6 +52,7 @@ def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = meta_prompt (str): The meta prompt to use for generating the prompts. If None, a default meta prompt is used. n_samples (int): The number of samples to use for generating prompts. + task_description (str): The description of the task to include in the prompt. Returns: List[str]: A list of generated prompts. 
@@ -76,7 +79,10 @@ def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = xs = task.xs[indices].tolist() ys = task.ys[indices].tolist() - meta_prompt = PROMPT_CREATION_TEMPLATE if meta_prompt is None else meta_prompt + if meta_prompt is None: + meta_prompt = PROMPT_CREATION_TEMPLATE + if task_description is None: + meta_prompt = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) meta_prompt = meta_prompt.replace("", examples) prompt = llm.get_response([meta_prompt])[0] From edcd28dc3d7b3a4ddab99edadf51ccd7c1aaa272 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 19:33:27 +0100 Subject: [PATCH 14/41] add tokenizer based token count to vllm class --- promptolution/llms/vllm.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 5380e87..f558458 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -2,6 +2,7 @@ from logging import INFO, Logger +from typing import List try: import torch @@ -32,8 +33,7 @@ class VLLM(BaseLLM): Methods: get_response: Generate responses for a list of prompts. - get_token_count: Get the current count of input and output tokens. - reset_token_count: Reset the token counters to zero. + update_token_count: Update the token count based on the given inputs and outputs. """ def __init__( @@ -152,6 +152,21 @@ def _get_response(self, inputs: list[str]): return all_responses + def update_token_count(self, inputs: List[str], outputs: List[str]): + """Update the token count based on the given inputs and outputs. + + Uses the tokenizer to count the tokens. + + Args: + inputs (List[str]): A list of input prompts. + outputs (List[str]): A list of generated responses. 
+ """ + for input in inputs: + self.input_token_count += len(self.tokenizer.encode(input)) + + for output in outputs: + self.output_token_count += len(self.tokenizer.encode(output)) + def __del__(self): """Cleanup method to delete the LLM instance and free up GPU memory.""" del self.llm From f2d73d4c8a65defdc6546a2f24ae9f775a1921ab Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 21:55:49 +0100 Subject: [PATCH 15/41] update test run script --- scripts/opro_test_run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index 474af3e..854f543 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -16,13 +16,13 @@ def main(): """Run a test run for the Opro optimizer.""" config = Config( - meta_llm="meta-llama/Meta-Llama-3-8B-Instruct", + meta_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", ds_path="data_sets/agnews", task_name="agnews", n_steps=10, optimizer="opro", - downstream_llm="meta-llama/Meta-Llama-3-8B-Instruct", - evaluation_llm="meta-llama/Meta-Llama-3-8B-Instruct", + downstream_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", + evaluation_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", ) task = get_task(config, split="dev") @@ -37,7 +37,7 @@ def main(): callbacks=[LoggerCallback(logger)], n_samples=5, ) - prompts = optimizer.optimize(n_steps=10) + prompts = optimizer.optimize(n_steps=2) logger.info(f"Optimized prompts: {prompts}") From a725384a3c536bd9e12ebe9756008ef302c20158 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:13:43 +0100 Subject: [PATCH 16/41] use classifiers accordingly --- promptolution/predictors/__init__.py | 16 ++++++++++------ scripts/llm_test_run.py | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/promptolution/predictors/__init__.py b/promptolution/predictors/__init__.py index d850759..65705c0 100644 --- a/promptolution/predictors/__init__.py +++ b/promptolution/predictors/__init__.py @@ -3,25 +3,26 @@ from promptolution.llms import get_llm from .base_predictor import DummyPredictor -from .classificator import Classificator +from .classificator import FirstOccurrenceClassificator, MarkerBasedClassificator -def get_predictor(name, *args, **kwargs): +def get_predictor(name, type: str = "first_occurrence", *args, **kwargs): """Factory function to create and return a predictor instance based on the provided name. This function supports two types of predictors: 1. DummyPredictor: A mock predictor for testing purposes. - 2. Classificator: A real predictor using a language model for classification tasks. + 2. FirstOccurrenceClassificator: A real predictor using a language model for classification tasks. Args: name (str): Identifier for the predictor to use. Special case: - "dummy" for DummyPredictor - - Any other string for Classificator with the specified LLM + - Any other string for FirstOccurrenceClassificator with the specified LLM + type () *args: Variable length argument list passed to the predictor constructor. **kwargs: Arbitrary keyword arguments passed to the predictor constructor. Returns: - An instance of DummyPredictor or Classificator based on the name. + An instance of DummyPredictor or FirstOccurrenceClassificator based on the name. Notes: - For non-dummy predictors, this function calls get_llm to obtain the language model. 
@@ -36,4 +37,7 @@ def get_predictor(name, *args, **kwargs): downstream_llm = get_llm(name) - return Classificator(downstream_llm, *args, **kwargs) + if type == "first_occurrence": + return FirstOccurrenceClassificator(downstream_llm, *args, **kwargs) + elif type == "marker": + return MarkerBasedClassificator(downstream_llm, *args, **kwargs) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 15b1203..442475a 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -7,7 +7,7 @@ import pandas as pd from promptolution.config import Config from promptolution.llms import get_llm -from promptolution.predictors import Classificator +from promptolution.predictors import FirstOccurrenceClassificator from promptolution.tasks import get_task from tqdm import tqdm @@ -56,7 +56,7 @@ ) task = get_task(config, split="dev") - predictor = Classificator(llm, classes=task.classes) + predictor = FirstOccurrenceClassificator(llm, classes=task.classes) prompts = [task.initial_population[0]] From b0f7931fada3116823f00fa72fa5b324037e57cc Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:15:21 +0100 Subject: [PATCH 17/41] small fix --- scripts/opro_test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index 854f543..dc343df 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -17,7 +17,7 @@ def main(): """Run a test run for the Opro optimizer.""" config = Config( meta_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", - ds_path="data_sets/agnews", + ds_path="data_sets/cls/agnews", task_name="agnews", n_steps=10, optimizer="opro", From 30e171282936970e3498f2f7a4ea72f93df443af Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:17:04 +0100 Subject: [PATCH 18/41] add storage path --- scripts/opro_test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index dc343df..db71ba3 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -28,7 +28,7 @@ def main(): task = get_task(config, split="dev") predictor = get_predictor(config.evaluation_llm, classes=task.classes) - llm = get_llm(config.meta_llm) + llm = get_llm(config.meta_llm, model_storage_path="../models/") optimizer = Opro( llm, initial_prompts=task.initial_population, From 80b19d2ef2ef8e3f4cac54a8c3c445c9757d2a67 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:21:33 +0100 Subject: [PATCH 19/41] helpers should use classificator --- promptolution/helpers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 9d776a9..345d849 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -9,7 +9,7 @@ from promptolution.exemplar_selectors import get_exemplar_selector from promptolution.llms import get_llm from promptolution.optimizers import get_optimizer -from promptolution.predictors import Classificator +from promptolution.predictors import FirstOccurrenceClassificator from promptolution.tasks import get_task @@ -38,7 +38,7 @@ def run_optimization(config: Config): """ task = get_task(config) llm = get_llm(config.meta_llm, token=config.api_token) - predictor = Classificator(llm, classes=task.classes) + predictor = FirstOccurrenceClassificator(llm, classes=task.classes) if config.init_pop_size: init_pop = np.random.choice(task.initial_population, size=config.init_pop_size, replace=True) @@ -76,7 +76,7 @@ def run_evaluation(config: 
Config, prompts: List[str]): task = get_task(config, split="test") llm = get_llm(config.evaluation_llm, token=config.api_token) - predictor = Classificator(llm, classes=task.classes) + predictor = FirstOccurrenceClassificator(llm, classes=task.classes) scores = task.evaluate(prompts, predictor, subsample=True, n_samples=config.n_eval_samples) df = pd.DataFrame(dict(prompt=prompts, score=scores)) From ec4861ae5a008d62e12aba57f38582ea1f96fdca Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:40:28 +0100 Subject: [PATCH 20/41] use different model --- scripts/opro_test_run.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index db71ba3..6a5b1f6 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -16,13 +16,13 @@ def main(): """Run a test run for the Opro optimizer.""" config = Config( - meta_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", + meta_llm="vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4", ds_path="data_sets/cls/agnews", task_name="agnews", n_steps=10, optimizer="opro", - downstream_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", - evaluation_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", + downstream_llm="vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4", + evaluation_llm="vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4", ) task = get_task(config, split="dev") From bf7f1df50fc79bc3d9cf2fcf558ae709142a1e8b Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:45:14 +0100 Subject: [PATCH 21/41] changes in opro test --- scripts/opro_test_run.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index 6a5b1f6..f7cf0c0 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -28,7 +28,12 @@ def main(): task = get_task(config, split="dev") predictor = get_predictor(config.evaluation_llm, classes=task.classes) - llm = get_llm(config.meta_llm, model_storage_path="../models/") + llm = get_llm( + config.meta_llm, + max_model_len=512, + model_storage_path="../models/", + revision="main" + ) optimizer = Opro( llm, initial_prompts=task.initial_population, From 3969e03b8fe6f5bf9366b3629deebd82b371b975 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:57:40 +0100 Subject: [PATCH 22/41] change get_predictor function --- promptolution/predictors/__init__.py | 6 ++---- scripts/opro_test_run.py | 10 ++++++---- scripts/prompt_creation_run.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/promptolution/predictors/__init__.py b/promptolution/predictors/__init__.py index 65705c0..9d4d5b4 100644 --- a/promptolution/predictors/__init__.py +++ b/promptolution/predictors/__init__.py @@ -6,7 +6,7 @@ from .classificator import FirstOccurrenceClassificator, MarkerBasedClassificator -def get_predictor(name, type: str = "first_occurrence", *args, **kwargs): +def get_predictor(downstream_llm=None, type: str = "first_occurrence", *args, **kwargs): """Factory function to create and return a predictor instance based on the provided name. 
This function supports two types of predictors: @@ -32,11 +32,9 @@ def get_predictor(name, type: str = "first_occurrence", *args, **kwargs): >>> dummy_pred = get_predictor("dummy", classes=["A", "B", "C"]) >>> real_pred = get_predictor("gpt-3.5-turbo", classes=["positive", "negative"]) """ - if name == "dummy": + if downstream_llm is None: return DummyPredictor("", *args, **kwargs) - downstream_llm = get_llm(name) - if type == "first_occurrence": return FirstOccurrenceClassificator(downstream_llm, *args, **kwargs) elif type == "marker": diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index f7cf0c0..81e670d 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -15,18 +15,19 @@ def main(): """Run a test run for the Opro optimizer.""" + llm_name = "vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4" + config = Config( - meta_llm="vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4", + meta_llm=llm_name, ds_path="data_sets/cls/agnews", task_name="agnews", n_steps=10, optimizer="opro", - downstream_llm="vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4", - evaluation_llm="vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4", + downstream_llm=llm_name, + evaluation_llm=llm_name, ) task = get_task(config, split="dev") - predictor = get_predictor(config.evaluation_llm, classes=task.classes) llm = get_llm( config.meta_llm, @@ -34,6 +35,7 @@ def main(): model_storage_path="../models/", revision="main" ) + predictor = get_predictor(llm, classes=task.classes) optimizer = Opro( llm, initial_prompts=task.initial_population, diff --git a/scripts/prompt_creation_run.py b/scripts/prompt_creation_run.py index 4c17694..f7d54c3 100644 --- a/scripts/prompt_creation_run.py +++ b/scripts/prompt_creation_run.py @@ -21,7 +21,7 @@ def main(): llm = get_llm("meta-llama/Meta-Llama-3-8B-Instruct") task = get_task(config, split="dev") - predictor = get_predictor("meta-llama/Meta-Llama-3-8B-Instruct", classes=task.classes) + predictor = get_predictor(llm, classes=task.classes) init_prompts = create_prompts_from_samples(task, llm) logger.critical(f"Initial prompts: {init_prompts}") From bd05cd80cee6c3b5491a6bbf19336edff8089c9c Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 23:04:58 +0100 Subject: [PATCH 23/41] fix callback calling --- promptolution/optimizers/base_optimizer.py | 4 ++-- scripts/opro_test_run.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index 95ec7c2..e150c1a 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -71,14 +71,14 @@ def _on_epoch_end(self): """Call all registered callbacks at the end of each optimization epoch.""" continue_optimization = True for callback in self.callbacks: - continue_optimization &= callback._on_epoch_end(self) # if any callback returns False, end the optimization + continue_optimization &= callback.on_epoch_end(self) # if any callback returns False, end the optimization return continue_optimization def _on_train_end(self): """Call all registered callbacks at the end of the entire optimization process.""" for callback in self.callbacks: - callback._on_train_end(self) + callback.on_train_end(self) class DummyOptimizer(BaseOptimizer): diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index 81e670d..e6654a0 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -31,7 +31,7 @@ def main(): llm = get_llm( config.meta_llm, - max_model_len=512, + max_model_len=2000, 
model_storage_path="../models/", revision="main" ) From 96e1bf613b3c100b1725aa9f040903181d4f3266 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 23:11:02 +0100 Subject: [PATCH 24/41] change optimizer test run script --- scripts/opro_test_run.py | 53 ------------------------- scripts/optimizer_test_run.py | 73 +++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 53 deletions(-) delete mode 100644 scripts/opro_test_run.py create mode 100644 scripts/optimizer_test_run.py diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py deleted file mode 100644 index e6654a0..0000000 --- a/scripts/opro_test_run.py +++ /dev/null @@ -1,53 +0,0 @@ -"""Test run for the Opro optimizer.""" - -from logging import Logger - -from promptolution.callbacks import LoggerCallback -from promptolution.llms import get_llm -from promptolution.optimizers import Opro -from promptolution.predictors import get_predictor -from promptolution.tasks import get_task - -from promptolution.config import Config - -logger = Logger(__name__) - - -def main(): - """Run a test run for the Opro optimizer.""" - llm_name = "vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4" - - config = Config( - meta_llm=llm_name, - ds_path="data_sets/cls/agnews", - task_name="agnews", - n_steps=10, - optimizer="opro", - downstream_llm=llm_name, - evaluation_llm=llm_name, - - ) - task = get_task(config, split="dev") - - llm = get_llm( - config.meta_llm, - max_model_len=2000, - model_storage_path="../models/", - revision="main" - ) - predictor = get_predictor(llm, classes=task.classes) - optimizer = Opro( - llm, - initial_prompts=task.initial_population, - task=task, - predictor=predictor, - callbacks=[LoggerCallback(logger)], - n_samples=5, - ) - prompts = optimizer.optimize(n_steps=2) - - logger.info(f"Optimized prompts: {prompts}") - - -if __name__ == "__main__": - main() diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py new file mode 100644 index 0000000..03fc79e --- /dev/null +++ b/scripts/optimizer_test_run.py @@ -0,0 +1,73 @@ +"""Test run for the Opro optimizer.""" +import argparse +from logging import Logger + +from promptolution.callbacks import LoggerCallback +from promptolution.llms import get_llm +from promptolution.optimizers import Opro, EvoPromptDE, EvoPromptGA +from promptolution.predictors import get_predictor +from promptolution.tasks import get_task + +from promptolution.config import Config + +logger = Logger(__name__) + +"""Run a test run for any of the implemented optimizers.""" +parser = argparse.ArgumentParser() +parser.add_argument("--model") +parser.add_argument("--model-storage-path", default="../models/") +parser.add_argument("--optimizer", default="evoprompt_de") +parser.add_argument("--n-steps", default=10) +args = parser.parse_args() + +config = Config( + meta_llm=args.model, + ds_path="data_sets/cls/agnews", + task_name="agnews", + n_steps=10, + optimizer="opro", + downstream_llm=args.model, + evaluation_llm=args.model, + +) +task = get_task(config, split="dev") + +llm = get_llm( + config.meta_llm, + max_model_len=2000, + model_storage_path=args.model_storage_path, + revision="main" +) +predictor = get_predictor(llm, classes=task.classes) + +if args.optimizer == "evoprompt_de": + optimizer = EvoPromptDE( + llm, + initial_prompts=task.initial_population, + task=task, + predictor=predictor, + callbacks=[LoggerCallback(logger)], + n_samples=5, + ) +elif args.optimizer == "evoprompt_ga": + optimizer = EvoPromptGA( + llm, + initial_prompts=task.initial_population, + task=task, 
+ predictor=predictor, + callbacks=[LoggerCallback(logger)], + n_samples=5, + ) +else: + optimizer = Opro( + llm, + initial_prompts=task.initial_population, + task=task, + predictor=predictor, + callbacks=[LoggerCallback(logger)], + n_samples=5, + ) + +prompts = optimizer.optimize(n_steps=args.n_steps) + +logger.info(f"Optimized prompts: {prompts}") From 62c8de79c2eec0418f23d1f7612b4296b7e6986d Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 23:15:41 +0100 Subject: [PATCH 25/41] small alignments --- scripts/optimizer_test_run.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index 03fc79e..e165993 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -42,7 +42,7 @@ if args.optimizer == "evoprompt_de": optimizer = EvoPromptDE( - llm, + meta_llm=llm, initial_prompts=task.initial_population, task=task, predictor=predictor, @@ -51,7 +51,7 @@ ) elif args.optimizer == "evoprompt_ga": optimizer = EvoPromptGA( - llm, + meta_llm=llm, initial_prompts=task.initial_population, task=task, predictor=predictor, @@ -60,7 +60,7 @@ ) else: optimizer = Opro( - llm, + meta_llm=llm, initial_prompts=task.initial_population, task=task, predictor=predictor, From 1aa56067e289a8b0f4e7efb7a42205444edfbc27 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 23:17:27 +0100 Subject: [PATCH 26/41] small alignments --- scripts/optimizer_test_run.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index e165993..9abf794 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -47,7 +47,6 @@ task=task, predictor=predictor, callbacks=[LoggerCallback(logger)], - n_samples=5, ) elif args.optimizer == "evoprompt_ga": optimizer = EvoPromptGA( @@ -56,7 +55,6 @@ task=task, predictor=predictor, callbacks=[LoggerCallback(logger)], - n_samples=5, ) else: optimizer = Opro( From 7214658b25e3bbc3e6f3bf00b709d9f5863036ea Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 23:19:57 +0100 Subject: [PATCH 27/41] small alignments --- scripts/optimizer_test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index 9abf794..8676fda 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -17,7 +17,7 @@ parser.add_argument("--model") parser.add_argument("--model-storage-path", default="../models/") parser.add_argument("--optimizer", default="evoprompt_de") -parser.add_argument("--n-steps", default=10) +parser.add_argument("--n-steps", type=int, default=10) args = parser.parse_args() config = Config( From 0b15410863d4d036fa7113dd1820825932acdf04 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sun, 9 Mar 2025 00:05:49 +0100 Subject: [PATCH 28/41] some changes to match the current optimizer implementation --- promptolution/config.py | 1 + promptolution/helpers.py | 5 +-- promptolution/llms/api_llm.py | 4 +-- promptolution/optimizers/__init__.py | 10 ++++-- scripts/optimizer_test_run.py | 51 +++++----------------------- 5 files changed, 21 insertions(+), 50 deletions(-) diff --git a/promptolution/config.py b/promptolution/config.py index dac2d9a..ca07522 100644 --- a/promptolution/config.py +++ b/promptolution/config.py @@ -56,6 +56,7 @@ class Config: include_task_desc: bool = True donor_random: bool = False random_seed: int = 42 + model_storage_path: Optional[Path] = Path("../models/") selection_mode: 
Optional[Literal["random", "wheel", "tour"]] = "random" meta_bs: Optional[int] = None downstream_bs: Optional[int] = None diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 345d849..0420b17 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -27,7 +27,7 @@ def run_experiment(config: Config): return df -def run_optimization(config: Config): +def run_optimization(config: Config, callbacks: List = None): """Run the optimization phase of the experiment. Args: @@ -37,7 +37,7 @@ def run_optimization(config: Config): List[str]: The optimized list of prompts. """ task = get_task(config) - llm = get_llm(config.meta_llm, token=config.api_token) + llm = get_llm(config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path) predictor = FirstOccurrenceClassificator(llm, classes=task.classes) if config.init_pop_size: @@ -52,6 +52,7 @@ def run_optimization(config: Config): task=task, predictor=predictor, n_eval_samples=config.n_eval_samples, + callbacks=callbacks, ) prompts = optimizer.optimize(n_steps=config.n_steps) diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index 14a70da..91c9942 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -3,7 +3,7 @@ import asyncio import time from logging import INFO, Logger -from typing import List +from typing import Any, List import nest_asyncio import openai @@ -63,7 +63,7 @@ class APILLM(BaseLLM): get_response_async: Asynchronously get responses for a list of prompts. """ - def __init__(self, model_id: str, token: str = None): + def __init__(self, model_id: str, token: str = None, **kwargs: Any): """Initialize the APILLM with a specific model. Args: diff --git a/promptolution/optimizers/__init__.py b/promptolution/optimizers/__init__.py index 7e386a2..acde31e 100644 --- a/promptolution/optimizers/__init__.py +++ b/promptolution/optimizers/__init__.py @@ -51,9 +51,13 @@ def get_optimizer( if optimizer == "dummy": return DummyOptimizer(*args, **kwargs) if config.optimizer == "evopromptde": - return EvoPromptDE(donor_random=config.donor_random, *args, **kwargs) + if include_task_desc: + return EvoPromptDE(prompt_template=EVOPROMPT_DE_TEMPLATE_TD, *args, **kwargs) + return EvoPromptDE(prompt_template=EVOPROMPT_DE_TEMPLATE, *args, **kwargs) if config.optimizer == "evopromptga": - return EvoPromptGA(selection_mode=config.selection_mode, *args, **kwargs) + if include_task_desc: + return EvoPromptGA(prompt_template=EVOPROMPT_GA_TEMPLATE_TD, *args, **kwargs) + return EvoPromptGA(prompt_template=EVOPROMPT_GA_TEMPLATE, *args, **kwargs) if config.optimizer == "opro": - return Opro(*args, **kwargs) + return Opro(prompt_template=OPRO_TEMPLATE, *args, **kwargs) raise ValueError(f"Unknown optimizer: {config.optimizer}") diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index 8676fda..4f1a851 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -3,10 +3,7 @@ from logging import Logger from promptolution.callbacks import LoggerCallback -from promptolution.llms import get_llm -from promptolution.optimizers import Opro, EvoPromptDE, EvoPromptGA -from promptolution.predictors import get_predictor -from promptolution.tasks import get_task +from promptolution.helpers import run_optimization from promptolution.config import Config @@ -16,56 +13,24 @@ parser = argparse.ArgumentParser() parser.add_argument("--model") parser.add_argument("--model-storage-path", default="../models/") -parser.add_argument("--optimizer", 
default="evoprompt_de") +parser.add_argument("--optimizer", default="evopromptde") parser.add_argument("--n-steps", type=int, default=10) +parser.add_argument("--token", default=None) args = parser.parse_args() config = Config( meta_llm=args.model, ds_path="data_sets/cls/agnews", task_name="agnews", - n_steps=10, - optimizer="opro", + n_steps=args.n_steps, + optimizer=args.optimizer, downstream_llm=args.model, evaluation_llm=args.model, - -) -task = get_task(config, split="dev") - -llm = get_llm( - config.meta_llm, - max_model_len=2000, + include_task_desc=True, + api_token=args.token, model_storage_path=args.model_storage_path, - revision="main" ) -predictor = get_predictor(llm, classes=task.classes) - -if args.optimizer == "evoprompt_de": - optimizer = EvoPromptDE( - meta_llm=llm, - initial_prompts=task.initial_population, - task=task, - predictor=predictor, - callbacks=[LoggerCallback(logger)], - ) -elif args.optimizer == "evoprompt_ga": - optimizer = EvoPromptGA( - meta_llm=llm, - initial_prompts=task.initial_population, - task=task, - predictor=predictor, - callbacks=[LoggerCallback(logger)], - ) -else: - optimizer = Opro( - meta_llm=llm, - initial_prompts=task.initial_population, - task=task, - predictor=predictor, - callbacks=[LoggerCallback(logger)], - n_samples=5, - ) -prompts = optimizer.optimize(n_steps=args.n_steps) +prompts = run_optimization(config, callbacks=[LoggerCallback(logger)]) logger.info(f"Optimized prompts: {prompts}") From 39679788e437d1fd6ab86db5aecea62ec1833092 Mon Sep 17 00:00:00 2001 From: finitearth Date: Sun, 9 Mar 2025 11:38:08 +0100 Subject: [PATCH 29/41] changes in template and config --- promptolution/config.py | 7 +++++-- promptolution/helpers.py | 10 ++++++++-- promptolution/optimizers/__init__.py | 22 +++++++++++++++------- promptolution/optimizers/opro.py | 1 - promptolution/templates.py | 17 +++++++++++++++-- scripts/optimizer_test_run.py | 2 +- 6 files changed, 44 insertions(+), 15 deletions(-) diff --git a/promptolution/config.py b/promptolution/config.py index ca07522..25e254c 100644 --- a/promptolution/config.py +++ b/promptolution/config.py @@ -17,15 +17,17 @@ class Config: ds_path (str): Path to the dataset. Should not be None if used. n_steps (int): Number of optimization steps. Should not be None if used. optimizer (str): Name of the optimizer to use. Should not be None if used. + predictor (str): Name of the predictor to use. Defaults to "FirstOccurenceClassificator". meta_llm (str): Name of the meta language model. Should not be None if used. downstream_llm (str): Name of the downstream language model. Should not be None if used. evaluation_llm (str): Name of the evaluation language model. Should not be None if used. init_pop_size (int): Initial population size. Defaults to 10. logging_dir (str): Directory for logging. Defaults to "logs/run.csv". experiment_name (str): Name of the experiment. Defaults to "experiment". - include_task_desc (bool): Whether to include task description. Defaults to False. + task_description (str): Task Description fed to the optimizer. Defaults to None. donor_random (bool): Whether to use random donor prompts for EvoPromptDE. Defaults to False. random_seed (int): Random seed for reproducibility. Defaults to 42. + model_storage_path (str): Path to the model storage directory (used for VLLM). Defaults to "../models/". selection_mode (str): Selection mode for EvoPromptGA. Defaults to "random". meta_bs (int): Batch size for local meta LLM. Should not be None if llm is run locally. Defaults to None. 
downstream_bs (int): Batch size for local downstream LLM. @@ -46,6 +48,7 @@ class Config: task_name: str = None ds_path: Path = None optimizer: str = None + predictor: Literal["MarkerBasedClassificator", "FirstOccurenceClassificator"] = "FirstOccurenceClassificator" meta_llm: str = None downstream_llm: str = None evaluation_llm: str = None @@ -53,7 +56,7 @@ class Config: init_pop_size: int = None logging_dir: Path = Path("logs/run.csv") experiment_name: str = "experiment" - include_task_desc: bool = True + task_description: str = None donor_random: bool = False random_seed: int = 42 model_storage_path: Optional[Path] = Path("../models/") diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 0420b17..52472ea 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -9,7 +9,7 @@ from promptolution.exemplar_selectors import get_exemplar_selector from promptolution.llms import get_llm from promptolution.optimizers import get_optimizer -from promptolution.predictors import FirstOccurrenceClassificator +from promptolution.predictors import FirstOccurrenceClassificator, MarkerBasedClassificator from promptolution.tasks import get_task @@ -38,7 +38,12 @@ def run_optimization(config: Config, callbacks: List = None): """ task = get_task(config) llm = get_llm(config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path) - predictor = FirstOccurrenceClassificator(llm, classes=task.classes) + if config.predictor == "MarkerBasedClassificator": + predictor = MarkerBasedClassificator(llm, classes=task.classes) + elif config.predictor == "FirstOccurenceClassificator": + predictor = FirstOccurrenceClassificator(llm, classes=task.classes) + else: + raise ValueError(f"Predictor {config.predictor} not supported.") if config.init_pop_size: init_pop = np.random.choice(task.initial_population, size=config.init_pop_size, replace=True) @@ -53,6 +58,7 @@ def run_optimization(config: Config, callbacks: List = None): predictor=predictor, n_eval_samples=config.n_eval_samples, callbacks=callbacks, + task_description=predictor.extraction_description, ) prompts = optimizer.optimize(n_steps=config.n_steps) diff --git a/promptolution/optimizers/__init__.py b/promptolution/optimizers/__init__.py index acde31e..09c57fa 100644 --- a/promptolution/optimizers/__init__.py +++ b/promptolution/optimizers/__init__.py @@ -6,6 +6,7 @@ EVOPROMPT_GA_TEMPLATE, EVOPROMPT_GA_TEMPLATE_TD, OPRO_TEMPLATE, + OPRO_TEMPLATE_TD, ) from .base_optimizer import DummyOptimizer @@ -15,7 +16,7 @@ def get_optimizer( - config=None, optimizer: str = None, include_task_desc: bool = None, meta_prompt: str = None, *args, **kwargs + config=None, optimizer: str = None, meta_prompt: str = None, task_description: str = None, *args, **kwargs ): """Factory function to create and return an optimizer instance based on the provided configuration. @@ -30,6 +31,7 @@ def get_optimizer( - Any other string for the specified optimizer class include_task_desc (bool): Flag to include task description in the prompt. meta_prompt (str): Meta prompt for the optimizer. + task_description (str): Task description for the optimizer. *args: Variable length argument list passed to the optimizer constructor. 
**kwargs: Arbitrary keyword arguments passed to the optimizer constructor @@ -42,8 +44,8 @@ def get_optimizer( if optimizer is None: optimizer = config.optimizer - if include_task_desc is None: - include_task_desc = config.include_task_desc + if task_description is None: + task_description = config.task_description if config is not None and meta_prompt is None: meta_prompt = config.meta_prompt @@ -51,13 +53,19 @@ def get_optimizer( if optimizer == "dummy": return DummyOptimizer(*args, **kwargs) if config.optimizer == "evopromptde": - if include_task_desc: - return EvoPromptDE(prompt_template=EVOPROMPT_DE_TEMPLATE_TD, *args, **kwargs) + if task_description is not None: + return EvoPromptDE( + prompt_template=EVOPROMPT_DE_TEMPLATE_TD.replace("", task_description), *args, **kwargs + ) return EvoPromptDE(prompt_template=EVOPROMPT_DE_TEMPLATE, *args, **kwargs) if config.optimizer == "evopromptga": - if include_task_desc: - return EvoPromptGA(prompt_template=EVOPROMPT_GA_TEMPLATE_TD, *args, **kwargs) + if task_description is not None: + return EvoPromptGA( + prompt_template=EVOPROMPT_GA_TEMPLATE_TD.replace("", task_description), *args, **kwargs + ) return EvoPromptGA(prompt_template=EVOPROMPT_GA_TEMPLATE, *args, **kwargs) if config.optimizer == "opro": + if task_description is not None: + return Opro(prompt_template=OPRO_TEMPLATE_TD.replace("", task_description), *args, **kwargs) return Opro(prompt_template=OPRO_TEMPLATE, *args, **kwargs) raise ValueError(f"Unknown optimizer: {config.optimizer}") diff --git a/promptolution/optimizers/opro.py b/promptolution/optimizers/opro.py index ef6f6fd..7ef3616 100644 --- a/promptolution/optimizers/opro.py +++ b/promptolution/optimizers/opro.py @@ -36,7 +36,6 @@ def __init__(self, meta_llm: BaseLLM, n_samples: int = 2, prompt_template: str = self.meta_prompt = prompt_template if prompt_template else OPRO_TEMPLATE super().__init__(**args) - self.meta_prompt = self.meta_prompt.replace("", self.task.description) self.scores = [ self.task.evaluate(p, self.predictor, subsample=True, n_samples=self.n_eval_samples)[0] diff --git a/promptolution/templates.py b/promptolution/templates.py index 18c0765..6cbc39e 100644 --- a/promptolution/templates.py +++ b/promptolution/templates.py @@ -86,8 +86,21 @@ 1.""" -OPRO_TEMPLATE = """Your task is to generate an instruction for the following task: - +OPRO_TEMPLATE = """Your task is to generate an instruction. + +Below are some previous instructions with their scores. The score ranges from 0 to 100. + + + +Here are some examples of the target dataset: + + +Generate a new instruction bracketed with and ending it with that is different from all the instructions above and has a higher score than all the instructions above. The instruction should be concise, effective, and generally applicable to the task described. + +Your new instruction:""" + +OPRO_TEMPLATE_TD = """Your task is to generate an instruction for the following task: + Below are some previous instructions with their scores. The score ranges from 0 to 100. 
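[Editor's note — not part of the patch] For orientation, a condensed sketch of how the refactored factory above is meant to be driven, mirroring the run_optimization helper changed in this same patch; the config, llm, task, and predictor objects are assumed to be constructed as in promptolution/helpers.py, and the description string is invented for illustration:

    from promptolution.optimizers import get_optimizer

    # Sketch only: `config`, `llm`, `task`, and `predictor` are assumed to exist,
    # built as in run_optimization() in promptolution/helpers.py.
    optimizer = get_optimizer(
        config,
        meta_llm=llm,
        initial_prompts=task.initial_population,
        task=task,
        predictor=predictor,
        task_description="Classify each text into one of the listed classes.",  # invented example
    )
    prompts = optimizer.optimize(n_steps=config.n_steps)

When task_description is given (or taken from config), the factory substitutes it into the *_TD meta-prompt template; otherwise the plain template without a task description is used.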
diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index 4f1a851..eaa683b 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -22,11 +22,11 @@ meta_llm=args.model, ds_path="data_sets/cls/agnews", task_name="agnews", + predictor = "MarkerBasedClassificator", n_steps=args.n_steps, optimizer=args.optimizer, downstream_llm=args.model, evaluation_llm=args.model, - include_task_desc=True, api_token=args.token, model_storage_path=args.model_storage_path, ) From 9f8c0b6080ee0a791c51ec06aca62e14aca49f33 Mon Sep 17 00:00:00 2001 From: finitearth Date: Sun, 9 Mar 2025 12:15:48 +0100 Subject: [PATCH 30/41] allow for batching of prompt creation --- promptolution/utils/prompt_creation.py | 76 ++++++++++++++------------ 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 07f8c16..08e88dd 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -36,7 +36,12 @@ def create_prompt_variation(prompt: Union[List[str], str], llm: BaseLLM, meta_pr def create_prompts_from_samples( - task: BaseTask, llm: BaseLLM, meta_prompt: str = None, n_samples: int = 3, task_description: str = None + task: BaseTask, + llm: BaseLLM, + meta_prompt: str = None, + n_samples: int = 3, + task_description: str = None, + n_prompts: int = 1, ) -> List[str]: """Generate a set of prompts from dataset examples sampled from a given task. @@ -53,40 +58,43 @@ def create_prompts_from_samples( If None, a default meta prompt is used. n_samples (int): The number of samples to use for generating prompts. task_description (str): The description of the task to include in the prompt. + n_prompts (int): The number of prompts to generate. Returns: List[str]: A list of generated prompts. 
""" - if isinstance(task, ClassificationTask): - # if classification task sample such that all classes are represented - unique_labels, counts = np.unique(task.ys, return_counts=True) - proportions = counts / len(task.ys) - samples_per_class = np.round(proportions * n_samples).astype(int) - samples_per_class = np.maximum(samples_per_class, 1) - - # sample - xs = [] - ys = [] - for label, n_samples in zip(unique_labels, samples_per_class): - indices = np.where(task.ys == label)[0] - indices = np.random.choice(indices, n_samples, replace=False) - xs.extend(task.xs[indices]) - ys.extend(task.ys[indices]) - - else: - # if not classification task, sample randomly - indices = np.random.choice(len(task.xs), n_samples, replace=False) - xs = task.xs[indices].tolist() - ys = task.ys[indices].tolist() - - if meta_prompt is None: - meta_prompt = PROMPT_CREATION_TEMPLATE - if task_description is None: - meta_prompt = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) - examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) - meta_prompt = meta_prompt.replace("", examples) - prompt = llm.get_response([meta_prompt])[0] - prompt = prompt.split("")[0].split("")[-1] - prompt = prompt.strip() - - return prompt + meta_prompts = [] + for _ in range(n_prompts): + if isinstance(task, ClassificationTask): + # if classification task sample such that all classes are represented + unique_labels, counts = np.unique(task.ys, return_counts=True) + proportions = counts / len(task.ys) + samples_per_class = np.round(proportions * n_samples).astype(int) + samples_per_class = np.maximum(samples_per_class, 1) + + # sample + xs = [] + ys = [] + for label, n_samples in zip(unique_labels, samples_per_class): + indices = np.where(task.ys == label)[0] + indices = np.random.choice(indices, n_samples, replace=False) + xs.extend(task.xs[indices]) + ys.extend(task.ys[indices]) + + else: + # if not classification task, sample randomly + indices = np.random.choice(len(task.xs), n_samples, replace=False) + xs = task.xs[indices].tolist() + ys = task.ys[indices].tolist() + + if meta_prompt is None: + meta_prompt = PROMPT_CREATION_TEMPLATE + if task_description is None: + meta_prompt = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) + examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) + meta_prompt = meta_prompt.replace("", examples) + meta_prompts.append(meta_prompt) + prompts = llm.get_response(meta_prompts) + prompts = [prompt.split("")[0].split("")[-1].strip() for prompt in prompts] + + return prompts From 8ecc6a8bfdef8835d69e29b9c9de49eaa2ef0838 Mon Sep 17 00:00:00 2001 From: Moritz Schlager <87517800+mo374z@users.noreply.github.com> Date: Sun, 9 Mar 2025 21:45:49 +0100 Subject: [PATCH 31/41] v1.3.0 (#34) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Feature/workflows (#8) * chore: add codeowners file * chore: add python poetry action and docs workflow * chore: update pre-commit file * chore: update docs * chore: update logo * chore: add cicd pipeline for automated deployment * chore: update poetry version * chore: fix action versioning * chore: add gitattributes to ignore line count in jupyter notebooks * chore: add and update docstrings * chore: fix end of files * chore: update action versions * Update README.md --------- Co-authored-by: mo374z * Fix/workflows (#11) * chore: fix workflow execution * chore: fix version check in CICD pipeline * Opro implementation (#7) * update gitignore * initial implementation of opro * 
formatting of prompt template * added opro test run * opro refinements * fixed sampling error * add docs to opro * fix pre commit issues# * fix pre commit issues# * fixed end of line * Patch/pre commit config (#10) * fixed pre commit config and removed end of file line breaks in tempaltes * added / * Feature/prompt generation (#12) * added prompt_creation.py * change version * Create LICENSE (#14) * Refactor/remove deepinfra (#16) * Remove deepinfra file * change langchain-community version * Usability patches (#15) * renamed get_tasks to get_task and change functionality accordingly. moved templates and data_sets * init * move templates to templates.py * Add nested asyncio to make it useable in notebooks * Update README.md * changed getting_started.ipynb and created helper functions * added sampling of initial population * fixed config * fixed callbacks * adjust runs * fix run evaluation api token * fix naming convention in opro, remove on epoch end for logger callback, fixed to allow for numeric values in class names * Update promptolution/llms/api_llm.py Co-authored-by: Timo Heiß <87521684+timo282@users.noreply.github.com> * fixed comments * Update pyproject.toml * resolve comments --------- Co-authored-by: mo374z Co-authored-by: Timo Heiß <87521684+timo282@users.noreply.github.com> Co-authored-by: Moritz Schlager <87517800+mo374z@users.noreply.github.com> * Feature/examplar selection (#17) * implemented random selector * added random search selector * increased version count * fix typos * Update promptolution/predictors/base_predictor.py Co-authored-by: Timo Heiß <87521684+timo282@users.noreply.github.com> * Update promptolution/tasks/classification_tasks.py Co-authored-by: Timo Heiß <87521684+timo282@users.noreply.github.com> * resolve comments * resolve comments --------- Co-authored-by: Timo Heiß <87521684+timo282@users.noreply.github.com> * Chore/docs release notes (#18) * Update release-notes.md * Fix release note links * revert Chore/docs release notes (#18)" This reverts commit e23dd743cf7e1eefc89746409619dc947bd6d349. 
* revert last commit * updated release notes and read me * Feature/read from df (#21) * Delete Experiment files * Removed config necessities * improved opro meta-prompts * added read from data frame feature * changed required python version to 3.9 * Update pyproject.toml * Update release-notes.md * merge * merge * resolve merge mistakes * delete duplicated lines * Update release-notes.md (#24) * Fix/dependencies (#28) * delete poetry.lock and upgrade transformers dependency * Update release-notes.md * Add vllm as feature and a llm_test_run_script * small fixes in vllm class * differentiate between vllm and api inference * set up experiment over multiple tasks and prompts * change csv saving * add base llm super class * add changes from PR review * change some VLLM params * fix tensor parallel size to 1 * experiment with batch size * experiment with larger batch sizes * add continuous batch llm * remove arg * remove continuous batch inference try * add batching to vllm * add batching in script * Add release notes and increase version number * remove llm_test_run.py script * change system prompt * Fix/vllm (#33) * add token count, flexible batch size and kwargs to vllm class * add testing script for implementation * fix batch size calculation * small changes * add revision test * add argument to parser * max model len to int * remove script * Change version and Release notes * changed callback behaviour and impelemented token count callback * added super inits * allow for splits not based on white space (such as new line break etc) * include task descriptions * add tokenizer based token count to vllm class * update test run script * use classifiers accordingly * small fix * add storage path * helpers should use classificator * use different model * changes in opro test * change get_predictor function * fix callback calling * change optimizer test run script * small alignments * small alignments * small alignments * some changes to match the current optimizer implementation * changes in template and config * allow for batching of prompt creation * update release notes and version * extend csvcallback functionality * change callback csv export * change step time calculation * small changes * remove llm_test_run script * update release notes * fix issues in token stepswise calculation * small fix --------- Co-authored-by: finitearth * implement changes from review * add typing to token count callback --------- Co-authored-by: Timo Heiß <87521684+timo282@users.noreply.github.com> Co-authored-by: Tom Zehle Co-authored-by: Timo Heiß --- .gitignore | 1 + docs/release-notes.md | 14 +++ promptolution/callbacks.py | 121 ++++++++++++++++++--- promptolution/config.py | 8 +- promptolution/helpers.py | 17 ++- promptolution/llms/api_llm.py | 7 +- promptolution/llms/base_llm.py | 59 +++++++++- promptolution/llms/local_llm.py | 4 +- promptolution/llms/vllm.py | 70 +++++++++--- promptolution/optimizers/__init__.py | 24 +++- promptolution/optimizers/base_optimizer.py | 12 +- promptolution/optimizers/evoprompt_de.py | 6 +- promptolution/optimizers/evoprompt_ga.py | 5 +- promptolution/optimizers/opro.py | 5 +- promptolution/predictors/__init__.py | 45 ++++---- promptolution/predictors/classificator.py | 59 +++++++++- promptolution/templates.py | 25 ++++- promptolution/utils/prompt_creation.py | 78 +++++++------ pyproject.toml | 2 +- scripts/opro_test_run.py | 46 -------- scripts/optimizer_test_run.py | 36 ++++++ scripts/prompt_creation_run.py | 2 +- 22 files changed, 480 insertions(+), 166 deletions(-) delete 
mode 100644 scripts/opro_test_run.py create mode 100644 scripts/optimizer_test_run.py diff --git a/.gitignore b/.gitignore index 5786ca0..088f43a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ __pycache__/ temp/ dist/ outputs/ +results/ poetry.lock diff --git a/docs/release-notes.md b/docs/release-notes.md index 20b97b7..8ea09ca 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,5 +1,19 @@ # Release Notes +## Release v1.3.0 +### What's changed +#### Added features +* new features for the VLLM Wrapper (automatic batch size determination, accepting kwargs) +* allow callbacks to terminate optimization run +* add token count functionality +* renamed "Classificator"-Predictor to "FirstOccurenceClassificator" +* introduced "MarkerBasedClassifcator" +* automatic task description creation +* use task description in prompt creation +* implement CSV callbacks + +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.2.0...v1.3.0) + ## Release v1.2.0 ### What's changed #### Added features diff --git a/promptolution/callbacks.py b/promptolution/callbacks.py index fe655d6..48a9b3e 100644 --- a/promptolution/callbacks.py +++ b/promptolution/callbacks.py @@ -1,7 +1,10 @@ """Callback classes for logging, saving, and tracking optimization progress.""" import os +import time +from typing import Literal +import numpy as np import pandas as pd from tqdm import tqdm @@ -14,24 +17,33 @@ def on_step_end(self, optimizer): Args: optimizer: The optimizer object that called the callback. + + Returns: + Bool: True if the optimization should continue, False if it should stop. """ - pass + return True def on_epoch_end(self, optimizer): """Called at the end of each optimization epoch. Args: optimizer: The optimizer object that called the callback. + + Returns: + Bool: True if the optimization should continue, False if it should stop. """ - pass + return True def on_train_end(self, optimizer): """Called at the end of the entire optimization process. Args: optimizer: The optimizer object that called the callback. + + Returns: + Bool: True if the optimization should continue, False if it should stop. """ - pass + return True class LoggerCallback(Callback): @@ -57,6 +69,8 @@ def on_step_end(self, optimizer): self.logger.critical(f"*** Prompt {i}: Score: {score}") self.logger.critical(f"{prompt}") + return True + def on_train_end(self, optimizer, logs=None): """Log information at the end of training. @@ -64,7 +78,12 @@ def on_train_end(self, optimizer, logs=None): optimizer: The optimizer object that called the callback. logs: Additional information to log. """ - self.logger.critical(f"Training ended - {logs}") + if logs is None: + self.logger.critical("Training ended") + else: + self.logger.critical(f"Training ended - {logs}") + + return True class CSVCallback(Callback): @@ -73,25 +92,25 @@ class CSVCallback(Callback): This callback saves prompts and scores at each step to a CSV file. Attributes: - path (str): The path to the CSV file. + dir (str): Directory the CSV file is saved to. step (int): The current step number. """ - def __init__(self, path): + def __init__(self, dir): """Initialize the CSVCallback. Args: - path (str): The path to the CSV file. + dir (str): Directory the CSV file is saved to. 
""" - # if dir does not exist - if not os.path.exists(os.path.dirname(path)): - os.makedirs(os.path.dirname(path)) - - # create file in path with header: "step,prompt,score" - with open(path, "w") as f: - f.write("step,prompt,score\n") - self.path = path + if not os.path.exists(dir): + os.makedirs(dir) + + self.dir = dir self.step = 0 + self.input_tokens = 0 + self.output_tokens = 0 + self.start_time = time.time() + self.step_time = time.time() def on_step_end(self, optimizer): """Save prompts and scores to csv. @@ -101,9 +120,25 @@ def on_step_end(self, optimizer): """ self.step += 1 df = pd.DataFrame( - {"step": [self.step] * len(optimizer.prompts), "prompt": optimizer.prompts, "score": optimizer.scores} + { + "step": [self.step] * len(optimizer.prompts), + "input_tokens": [optimizer.meta_llm.input_token_count - self.input_tokens] * len(optimizer.prompts), + "output_tokens": [optimizer.meta_llm.output_token_count - self.output_tokens] * len(optimizer.prompts), + "time_elapsed": [time.time() - self.step_time] * len(optimizer.prompts), + "score": optimizer.scores, + "prompt": optimizer.prompts, + } ) - df.to_csv(self.path, mode="a", header=False, index=False) + self.step_time = time.time() + self.input_tokens = optimizer.meta_llm.input_token_count + self.output_tokens = optimizer.meta_llm.output_token_count + + if not os.path.exists(self.dir + "step_results.csv"): + df.to_csv(self.dir + "step_results.csv", index=False) + else: + df.to_csv(self.dir + "step_results.csv", mode="a", header=False, index=False) + + return True def on_train_end(self, optimizer): """Called at the end of training. @@ -111,7 +146,24 @@ def on_train_end(self, optimizer): Args: optimizer: The optimizer object that called the callback. """ - pass + df = pd.DataFrame( + dict( + steps=self.step, + input_tokens=optimizer.meta_llm.input_token_count, + output_tokens=optimizer.meta_llm.output_token_count, + time_elapsed=time.time() - self.start_time, + score=np.array(optimizer.scores).mean(), + best_prompts=str(optimizer.prompts), + ), + index=[0], + ) + + if not os.path.exists(self.dir + "train_results.csv"): + df.to_csv(self.dir + "train_results.csv", index=False) + else: + df.to_csv(self.dir + "train_results.csv", mode="a", header=False, index=False) + + return True class BestPromptCallback(Callback): @@ -139,6 +191,8 @@ def on_step_end(self, optimizer): self.best_score = optimizer.scores[0] self.best_prompt = optimizer.prompts[0] + return True + def get_best_prompt(self): """Get the best prompt and score achieved during optimization. @@ -173,6 +227,8 @@ def on_step_end(self, optimizer): """ self.pbar.update(1) + return True + def on_train_end(self, optimizer): """Close the progress bar at the end of training. @@ -180,3 +236,32 @@ def on_train_end(self, optimizer): optimizer: The optimizer object that called the callback. """ self.pbar.close() + + return True + + +class TokenCountCallback(Callback): + """Callback for stopping optimization based on the total token count.""" + + def __init__( + self, + max_tokens_for_termination: int, + token_type_for_termination: Literal["input_tokens", "output_tokens", "total_tokens"], + ): + """Initialize the TokenCountCallback. + + Args: + max_tokens_for_termination (int): Maximum number of tokens which is allowed befor the algorithm is stopped. + token_type_for_termination (str): Can be one of either "input_tokens", "output_tokens" or "total_tokens". 
+ """ + self.max_tokens_for_termination = max_tokens_for_termination + self.token_type_for_termination = token_type_for_termination + + def on_step_end(self, optimizer): + """Check if the total token count exceeds the maximum allowed. If so, stop the optimization.""" + token_counts = optimizer.predictor.llm.get_token_count() + + if token_counts[self.token_type_for_termination] > self.max_tokens_for_termination: + return False + + return True diff --git a/promptolution/config.py b/promptolution/config.py index dac2d9a..25e254c 100644 --- a/promptolution/config.py +++ b/promptolution/config.py @@ -17,15 +17,17 @@ class Config: ds_path (str): Path to the dataset. Should not be None if used. n_steps (int): Number of optimization steps. Should not be None if used. optimizer (str): Name of the optimizer to use. Should not be None if used. + predictor (str): Name of the predictor to use. Defaults to "FirstOccurenceClassificator". meta_llm (str): Name of the meta language model. Should not be None if used. downstream_llm (str): Name of the downstream language model. Should not be None if used. evaluation_llm (str): Name of the evaluation language model. Should not be None if used. init_pop_size (int): Initial population size. Defaults to 10. logging_dir (str): Directory for logging. Defaults to "logs/run.csv". experiment_name (str): Name of the experiment. Defaults to "experiment". - include_task_desc (bool): Whether to include task description. Defaults to False. + task_description (str): Task Description fed to the optimizer. Defaults to None. donor_random (bool): Whether to use random donor prompts for EvoPromptDE. Defaults to False. random_seed (int): Random seed for reproducibility. Defaults to 42. + model_storage_path (str): Path to the model storage directory (used for VLLM). Defaults to "../models/". selection_mode (str): Selection mode for EvoPromptGA. Defaults to "random". meta_bs (int): Batch size for local meta LLM. Should not be None if llm is run locally. Defaults to None. downstream_bs (int): Batch size for local downstream LLM. @@ -46,6 +48,7 @@ class Config: task_name: str = None ds_path: Path = None optimizer: str = None + predictor: Literal["MarkerBasedClassificator", "FirstOccurenceClassificator"] = "FirstOccurenceClassificator" meta_llm: str = None downstream_llm: str = None evaluation_llm: str = None @@ -53,9 +56,10 @@ class Config: init_pop_size: int = None logging_dir: Path = Path("logs/run.csv") experiment_name: str = "experiment" - include_task_desc: bool = True + task_description: str = None donor_random: bool = False random_seed: int = 42 + model_storage_path: Optional[Path] = Path("../models/") selection_mode: Optional[Literal["random", "wheel", "tour"]] = "random" meta_bs: Optional[int] = None downstream_bs: Optional[int] = None diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 9d776a9..52472ea 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -9,7 +9,7 @@ from promptolution.exemplar_selectors import get_exemplar_selector from promptolution.llms import get_llm from promptolution.optimizers import get_optimizer -from promptolution.predictors import Classificator +from promptolution.predictors import FirstOccurrenceClassificator, MarkerBasedClassificator from promptolution.tasks import get_task @@ -27,7 +27,7 @@ def run_experiment(config: Config): return df -def run_optimization(config: Config): +def run_optimization(config: Config, callbacks: List = None): """Run the optimization phase of the experiment. 
Args: @@ -37,8 +37,13 @@ def run_optimization(config: Config): List[str]: The optimized list of prompts. """ task = get_task(config) - llm = get_llm(config.meta_llm, token=config.api_token) - predictor = Classificator(llm, classes=task.classes) + llm = get_llm(config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path) + if config.predictor == "MarkerBasedClassificator": + predictor = MarkerBasedClassificator(llm, classes=task.classes) + elif config.predictor == "FirstOccurenceClassificator": + predictor = FirstOccurrenceClassificator(llm, classes=task.classes) + else: + raise ValueError(f"Predictor {config.predictor} not supported.") if config.init_pop_size: init_pop = np.random.choice(task.initial_population, size=config.init_pop_size, replace=True) @@ -52,6 +57,8 @@ def run_optimization(config: Config): task=task, predictor=predictor, n_eval_samples=config.n_eval_samples, + callbacks=callbacks, + task_description=predictor.extraction_description, ) prompts = optimizer.optimize(n_steps=config.n_steps) @@ -76,7 +83,7 @@ def run_evaluation(config: Config, prompts: List[str]): task = get_task(config, split="test") llm = get_llm(config.evaluation_llm, token=config.api_token) - predictor = Classificator(llm, classes=task.classes) + predictor = FirstOccurrenceClassificator(llm, classes=task.classes) scores = task.evaluate(prompts, predictor, subsample=True, n_samples=config.n_eval_samples) df = pd.DataFrame(dict(prompt=prompts, score=scores)) diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index cf966bf..91c9942 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -3,7 +3,7 @@ import asyncio import time from logging import INFO, Logger -from typing import List +from typing import Any, List import nest_asyncio import openai @@ -63,7 +63,7 @@ class APILLM(BaseLLM): get_response_async: Asynchronously get responses for a list of prompts. """ - def __init__(self, model_id: str, token: str = None): + def __init__(self, model_id: str, token: str = None, **kwargs: Any): """Initialize the APILLM with a specific model. Args: @@ -73,6 +73,7 @@ def __init__(self, model_id: str, token: str = None): Raises: ValueError: If an unknown model identifier is provided. """ + super().__init__() if "claude" in model_id: self.model = ChatAnthropic(model=model_id, api_key=token) elif "gpt" in model_id: @@ -80,7 +81,7 @@ def __init__(self, model_id: str, token: str = None): else: self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token) - def get_response(self, prompts: List[str]) -> List[str]: + def _get_response(self, prompts: List[str]) -> List[str]: """Get responses for a list of prompts in a synchronous manner. This method includes retry logic for handling connection errors and rate limits. diff --git a/promptolution/llms/base_llm.py b/promptolution/llms/base_llm.py index 7f0e95d..dbe26fa 100644 --- a/promptolution/llms/base_llm.py +++ b/promptolution/llms/base_llm.py @@ -1,10 +1,13 @@ """Base module for LLMs in the promptolution library.""" +import logging from abc import ABC, abstractmethod from typing import List import numpy as np +logger = logging.getLogger(__name__) + class BaseLLM(ABC): """Abstract base class for Language Models in the promptolution library. @@ -18,10 +21,62 @@ class BaseLLM(ABC): def __init__(self, *args, **kwargs): """Initialize the LLM.""" - pass + self.input_token_count = 0 + self.output_token_count = 0 + + def get_token_count(self): + """Get the current count of input and output tokens. 
+ + Returns: + dict: A dictionary containing the input and output token counts. + """ + return { + "input_tokens": self.input_token_count, + "output_tokens": self.output_token_count, + "total_tokens": self.input_token_count + self.output_token_count, + } + + def reset_token_count(self): + """Reset the token counters to zero.""" + self.input_token_count = 0 + self.output_token_count = 0 + + def update_token_count(self, inputs: List[str], outputs: List[str]): + """Update the token count based on the given inputs and outputs. + + Args: + inputs (List[str]): A list of input prompts. + outputs (List[str]): A list of generated responses. + """ + logger.warning("Token count is approximated using word count split by whitespace, not an actual tokenizer.") + input_tokens = sum([len(i.split()) for i in inputs]) + output_tokens = sum([len(o.split()) for o in outputs]) + self.input_token_count += input_tokens + self.output_token_count += output_tokens + + def get_response(self, prompts: str) -> str: + """Generate responses for the given prompts. + + This method calls the _get_response method to generate responses + for the given prompts. It also updates the token count for the + input and output tokens. + + Args: + prompts (str or List[str]): Input prompt(s). If a single string is provided, + it's converted to a list containing that string. + + Returns: + List[str]: A list of generated responses, one for each input prompt. + """ + if isinstance(prompts, str): + prompts = [prompts] + responses = self._get_response(prompts) + self.update_token_count(prompts, responses) + + return responses @abstractmethod - def get_response(self, prompts: List[str]) -> List[str]: + def _get_response(self, prompts: List[str]) -> List[str]: """Generate responses for the given prompts. This method should be implemented by subclasses to define how diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py index 074bf01..577d4a0 100644 --- a/promptolution/llms/local_llm.py +++ b/promptolution/llms/local_llm.py @@ -35,6 +35,8 @@ def __init__(self, model_id: str, batch_size=8): This method sets up a text generation pipeline with bfloat16 precision, automatic device mapping, and specific generation parameters. """ + super().__init__() + self.pipeline = transformers.pipeline( "text-generation", model=model_id, @@ -48,7 +50,7 @@ def __init__(self, model_id: str, batch_size=8): self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id self.pipeline.tokenizer.padding_side = "left" - def get_response(self, prompts: list[str]): + def _get_response(self, prompts: list[str]): """Generate responses for a list of prompts using the local language model. Args: diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index d99c542..f558458 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -2,6 +2,7 @@ from logging import INFO, Logger +from typing import List try: import torch @@ -32,22 +33,24 @@ class VLLM(BaseLLM): Methods: get_response: Generate responses for a list of prompts. + update_token_count: Update the token count based on the given inputs and outputs. 
""" def __init__( self, model_id: str, - batch_size: int = 64, + batch_size: int | None = None, max_generated_tokens: int = 256, temperature: float = 0.1, top_p: float = 0.9, - model_storage_path: str = None, - token: str = None, + model_storage_path: str | None = None, + token: str | None = None, dtype: str = "auto", tensor_parallel_size: int = 1, gpu_memory_utilization: float = 0.95, max_model_len: int = 2048, trust_remote_code: bool = False, + **kwargs, ): """Initialize the VLLM with a specific model. @@ -64,11 +67,13 @@ def __init__( gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95. max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. + **kwargs: Additional keyword arguments to pass to the LLM class initialization. Note: This method sets up a vLLM engine with specified parameters for efficient inference. """ - self.batch_size = batch_size + super().__init__() + self.dtype = dtype self.tensor_parallel_size = tensor_parallel_size self.gpu_memory_utilization = gpu_memory_utilization @@ -78,22 +83,33 @@ def __init__( # Configure sampling parameters self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens) - # Initialize the vLLM engine - self.llm = LLM( - model=model_id, - tokenizer=model_id, - dtype=self.dtype, - tensor_parallel_size=self.tensor_parallel_size, - gpu_memory_utilization=self.gpu_memory_utilization, - max_model_len=self.max_model_len, - download_dir=model_storage_path, - trust_remote_code=self.trust_remote_code, - ) + # Initialize the vLLM engine with both explicit parameters and any additional kwargs + llm_params = { + "model": model_id, + "tokenizer": model_id, + "dtype": self.dtype, + "tensor_parallel_size": self.tensor_parallel_size, + "gpu_memory_utilization": self.gpu_memory_utilization, + "max_model_len": self.max_model_len, + "download_dir": model_storage_path, + "trust_remote_code": self.trust_remote_code, + **kwargs, + } + + self.llm = LLM(**llm_params) + + if batch_size is None: + gpu_blocks = self.llm.llm_engine.model_executor.cache_config.num_gpu_blocks + block_size = self.llm.llm_engine.model_executor.cache_config.block_size + self.batch_size = int((gpu_blocks * block_size / self.max_model_len) * 0.95) + logger.info(f"Batch size set to {self.batch_size} based on GPU memory.") + else: + self.batch_size = batch_size # Initialize tokenizer separately for potential pre-processing self.tokenizer = AutoTokenizer.from_pretrained(model_id) - def get_response(self, inputs: list[str]): + def _get_response(self, inputs: list[str]): """Generate responses for a list of prompts using the vLLM engine. Args: @@ -104,6 +120,7 @@ def get_response(self, inputs: list[str]): Note: This method uses vLLM's batched generation capabilities for efficient inference. + It also counts input and output tokens. 
""" prompts = [ self.tokenizer.apply_chat_template( @@ -119,16 +136,37 @@ def get_response(self, inputs: list[str]): for input in inputs ] + # Count input tokens + for prompt in prompts: + input_tokens = self.tokenizer.encode(prompt) + self.input_token_count += len(input_tokens) + # generate responses for self.batch_size prompts at the same time all_responses = [] for i in range(0, len(prompts), self.batch_size): batch = prompts[i : i + self.batch_size] outputs = self.llm.generate(batch, self.sampling_params) responses = [output.outputs[0].text for output in outputs] + all_responses.extend(responses) return all_responses + def update_token_count(self, inputs: List[str], outputs: List[str]): + """Update the token count based on the given inputs and outputs. + + Uses the tokenizer to count the tokens. + + Args: + inputs (List[str]): A list of input prompts. + outputs (List[str]): A list of generated responses. + """ + for input in inputs: + self.input_token_count += len(self.tokenizer.encode(input)) + + for output in outputs: + self.output_token_count += len(self.tokenizer.encode(output)) + def __del__(self): """Cleanup method to delete the LLM instance and free up GPU memory.""" del self.llm diff --git a/promptolution/optimizers/__init__.py b/promptolution/optimizers/__init__.py index 7e386a2..09c57fa 100644 --- a/promptolution/optimizers/__init__.py +++ b/promptolution/optimizers/__init__.py @@ -6,6 +6,7 @@ EVOPROMPT_GA_TEMPLATE, EVOPROMPT_GA_TEMPLATE_TD, OPRO_TEMPLATE, + OPRO_TEMPLATE_TD, ) from .base_optimizer import DummyOptimizer @@ -15,7 +16,7 @@ def get_optimizer( - config=None, optimizer: str = None, include_task_desc: bool = None, meta_prompt: str = None, *args, **kwargs + config=None, optimizer: str = None, meta_prompt: str = None, task_description: str = None, *args, **kwargs ): """Factory function to create and return an optimizer instance based on the provided configuration. @@ -30,6 +31,7 @@ def get_optimizer( - Any other string for the specified optimizer class include_task_desc (bool): Flag to include task description in the prompt. meta_prompt (str): Meta prompt for the optimizer. + task_description (str): Task description for the optimizer. *args: Variable length argument list passed to the optimizer constructor. 
**kwargs: Arbitrary keyword arguments passed to the optimizer constructor @@ -42,8 +44,8 @@ def get_optimizer( if optimizer is None: optimizer = config.optimizer - if include_task_desc is None: - include_task_desc = config.include_task_desc + if task_description is None: + task_description = config.task_description if config is not None and meta_prompt is None: meta_prompt = config.meta_prompt @@ -51,9 +53,19 @@ def get_optimizer( if optimizer == "dummy": return DummyOptimizer(*args, **kwargs) if config.optimizer == "evopromptde": - return EvoPromptDE(donor_random=config.donor_random, *args, **kwargs) + if task_description is not None: + return EvoPromptDE( + prompt_template=EVOPROMPT_DE_TEMPLATE_TD.replace("", task_description), *args, **kwargs + ) + return EvoPromptDE(prompt_template=EVOPROMPT_DE_TEMPLATE, *args, **kwargs) if config.optimizer == "evopromptga": - return EvoPromptGA(selection_mode=config.selection_mode, *args, **kwargs) + if task_description is not None: + return EvoPromptGA( + prompt_template=EVOPROMPT_GA_TEMPLATE_TD.replace("", task_description), *args, **kwargs + ) + return EvoPromptGA(prompt_template=EVOPROMPT_GA_TEMPLATE, *args, **kwargs) if config.optimizer == "opro": - return Opro(*args, **kwargs) + if task_description is not None: + return Opro(prompt_template=OPRO_TEMPLATE_TD.replace("", task_description), *args, **kwargs) + return Opro(prompt_template=OPRO_TEMPLATE, *args, **kwargs) raise ValueError(f"Unknown optimizer: {config.optimizer}") diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index 2cac685..bfb828a 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -1,5 +1,6 @@ """Base class for prompt optimizers.""" +import time from abc import ABC, abstractmethod from typing import Callable, List @@ -61,13 +62,19 @@ def optimize(self, n_steps: int) -> List[str]: def _on_step_end(self): """Call all registered callbacks at the end of each optimization step.""" + continue_optimization = True for callback in self.callbacks: - callback.on_step_end(self) + continue_optimization &= callback.on_step_end(self) # if any callback returns False, end the optimization + + return continue_optimization def _on_epoch_end(self): """Call all registered callbacks at the end of each optimization epoch.""" + continue_optimization = True for callback in self.callbacks: - callback.on_epoch_end(self) + continue_optimization &= callback.on_epoch_end(self) # if any callback returns False, end the optimization + + return continue_optimization def _on_train_end(self): """Call all registered callbacks at the end of the entire optimization process.""" @@ -111,4 +118,5 @@ def optimize(self, n_steps) -> list[str]: self._on_step_end() self._on_epoch_end() self._on_train_end() + return self.prompts diff --git a/promptolution/optimizers/evoprompt_de.py b/promptolution/optimizers/evoprompt_de.py index 17d74b3..f44556e 100644 --- a/promptolution/optimizers/evoprompt_de.py +++ b/promptolution/optimizers/evoprompt_de.py @@ -89,7 +89,11 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts[i] = child_prompts[i] self.scores[i] = child_scores[i] - self._on_step_end() + continue_optimization = self._on_step_end() + + if not continue_optimization: + break self._on_train_end() + return self.prompts diff --git a/promptolution/optimizers/evoprompt_ga.py b/promptolution/optimizers/evoprompt_ga.py index 2ec789b..f6efcb8 100644 --- a/promptolution/optimizers/evoprompt_ga.py +++ 
b/promptolution/optimizers/evoprompt_ga.py @@ -77,7 +77,10 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts = [prompt for _, prompt in sorted(zip(scores, prompts), reverse=True)][: len(self.prompts)] self.scores = sorted(scores, reverse=True)[: len(self.prompts)] - self._on_step_end() + continue_optimization = self._on_step_end() + if not continue_optimization: + break + return self.prompts def _crossover(self, prompts, scores) -> str: diff --git a/promptolution/optimizers/opro.py b/promptolution/optimizers/opro.py index 3c71f4e..7ef3616 100644 --- a/promptolution/optimizers/opro.py +++ b/promptolution/optimizers/opro.py @@ -36,7 +36,6 @@ def __init__(self, meta_llm: BaseLLM, n_samples: int = 2, prompt_template: str = self.meta_prompt = prompt_template if prompt_template else OPRO_TEMPLATE super().__init__(**args) - self.meta_prompt = self.meta_prompt.replace("", self.task.description) self.scores = [ self.task.evaluate(p, self.predictor, subsample=True, n_samples=self.n_eval_samples)[0] @@ -89,7 +88,9 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts.append(prompt) self.scores.append(score) - self._on_step_end() + continue_optimization = self._on_step_end() + if not continue_optimization: + break self._on_epoch_end() diff --git a/promptolution/predictors/__init__.py b/promptolution/predictors/__init__.py index d850759..e5aa69c 100644 --- a/promptolution/predictors/__init__.py +++ b/promptolution/predictors/__init__.py @@ -1,39 +1,38 @@ """Module for LLM predictors.""" -from promptolution.llms import get_llm +from typing import Literal from .base_predictor import DummyPredictor -from .classificator import Classificator +from .classificator import FirstOccurrenceClassificator, MarkerBasedClassificator -def get_predictor(name, *args, **kwargs): - """Factory function to create and return a predictor instance based on the provided name. +def get_predictor( + downstream_llm=None, type: Literal["first_occurence", "marker"] = "first_occurrence", *args, **kwargs +): + """Factory function to create and return a predictor instance. - This function supports two types of predictors: - 1. DummyPredictor: A mock predictor for testing purposes. - 2. Classificator: A real predictor using a language model for classification tasks. + This function supports three types of predictors: + 1. DummyPredictor: A mock predictor for testing purposes when no downstream_llm is provided. + 2. FirstOccurrenceClassificator: A predictor that classifies based on first occurrence of the label. + 3. MarkerBasedClassificator: A predictor that classifies based on a marker. Args: - name (str): Identifier for the predictor to use. Special case: - - "dummy" for DummyPredictor - - Any other string for Classificator with the specified LLM + downstream_llm: The language model to use for prediction. If None, returns a DummyPredictor. + type (Literal["first_occurrence", "marker"]): The type of predictor to create: + - "first_occurrence" (default) for FirstOccurrenceClassificator + - "marker" for MarkerBasedClassificator *args: Variable length argument list passed to the predictor constructor. **kwargs: Arbitrary keyword arguments passed to the predictor constructor. Returns: - An instance of DummyPredictor or Classificator based on the name. - - Notes: - - For non-dummy predictors, this function calls get_llm to obtain the language model. - - The batch_size for the language model is currently commented out and not used. 
- - Examples: - >>> dummy_pred = get_predictor("dummy", classes=["A", "B", "C"]) - >>> real_pred = get_predictor("gpt-3.5-turbo", classes=["positive", "negative"]) + An instance of DummyPredictor, FirstOccurrenceClassificator, or MarkerBasedClassificator. """ - if name == "dummy": + if downstream_llm is None: return DummyPredictor("", *args, **kwargs) - downstream_llm = get_llm(name) - - return Classificator(downstream_llm, *args, **kwargs) + if type == "first_occurrence": + return FirstOccurrenceClassificator(downstream_llm, *args, **kwargs) + elif type == "marker": + return MarkerBasedClassificator(downstream_llm, *args, **kwargs) + else: + raise ValueError(f"Invalid predictor type: '{type}'") diff --git a/promptolution/predictors/classificator.py b/promptolution/predictors/classificator.py index f33bfc6..89eb5d4 100644 --- a/promptolution/predictors/classificator.py +++ b/promptolution/predictors/classificator.py @@ -7,7 +7,7 @@ from promptolution.predictors.base_predictor import BasePredictor -class Classificator(BasePredictor): +class FirstOccurrenceClassificator(BasePredictor): """A predictor class for classification tasks using language models. This class takes a language model and a list of classes, and provides a method @@ -33,6 +33,10 @@ def __init__(self, llm, classes, *args, **kwargs): """ super().__init__(llm) self.classes = classes + self.extraction_description = ( + f"The task is to classify the texts into one of those classes: {', '.join(classes)}." + "The first occurrence of a valid class label in the prediction is used as the predicted class." + ) def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray: """Extract class labels from the predictions, based on the list of valid class labels. @@ -44,7 +48,7 @@ def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray response = [] for pred in preds: predicted_class = self.classes[0] # use first class as default pred - for word in pred.split(" "): + for word in pred.split(): word = "".join([c for c in word if c.isalnum()]) if word in self.classes: predicted_class = word @@ -54,3 +58,54 @@ def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray response = np.array(response).reshape(*shape) return response + + +class MarkerBasedClassificator(BasePredictor): + """A predictor class for classification tasks using language models. + + This class takes a language model and a list of classes, and provides a method + to predict classes for given prompts and input data. The class labels are extracted. + + Attributes: + llm: The language model used for generating predictions. + classes (List[str]): The list of valid class labels. + marker (str): The marker to use for extracting the class label. + + Inherits from: + BasePredictor: The base class for predictors in the promptolution library. + """ + + def __init__(self, llm, classes, marker="", *args, **kwargs): + """Initialize the Classificator. + + Args: + llm: The language model to use for predictions. + classes (List[str]): The list of valid class labels. + marker (str): The marker to use for extracting the class label. + *args, **kwargs: Additional arguments for the BasePredictor. + """ + super().__init__(llm) + self.classes = classes + self.marker = marker + self.extraction_description = ( + f"The task is to classify the texts into one of those classes: {','.join(classes)}." + f"The class label is extracted from the text following the marker: {marker}." 
+ ) + + def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray: + """Extract class labels from the predictions, by extracting the text following the marker. + + Args: + preds: The raw predictions from the language model. + shape: The shape of the output array: (n_prompts, n_samples). + """ + response = [] + for pred in preds: + predicted_class = pred.split(self.marker)[-1].strip() + if predicted_class not in self.classes: + predicted_class = self.classes[0] + + response.append(predicted_class) + + response = np.array(response).reshape(*shape) + return response diff --git a/promptolution/templates.py b/promptolution/templates.py index 05d7ae3..6cbc39e 100644 --- a/promptolution/templates.py +++ b/promptolution/templates.py @@ -86,8 +86,21 @@ 1.""" -OPRO_TEMPLATE = """Your task is to generate an instruction for the following task: - +OPRO_TEMPLATE = """Your task is to generate an instruction. + +Below are some previous instructions with their scores. The score ranges from 0 to 100. + + + +Here are some examples of the target dataset: + + +Generate a new instruction bracketed with and ending it with that is different from all the instructions above and has a higher score than all the instructions above. The instruction should be concise, effective, and generally applicable to the task described. + +Your new instruction:""" + +OPRO_TEMPLATE_TD = """Your task is to generate an instruction for the following task: + Below are some previous instructions with their scores. The score ranges from 0 to 100. @@ -114,3 +127,11 @@ The instruction was""" + +PROMPT_CREATION_TEMPLATE_TD = """You are asked to give the corresponding prompt that gives the following outputs given these inputs for the following task: . +Return it starting with and ending with tags. +Include the name of the output classes in the prompt. + + + +The instruction was""" diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index d85edd9..08e88dd 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -7,7 +7,7 @@ from promptolution.llms.base_llm import BaseLLM from promptolution.tasks.base_task import BaseTask from promptolution.tasks.classification_tasks import ClassificationTask -from promptolution.templates import PROMPT_CREATION_TEMPLATE, PROMPT_VARIATION_TEMPLATE +from promptolution.templates import PROMPT_CREATION_TEMPLATE, PROMPT_CREATION_TEMPLATE_TD, PROMPT_VARIATION_TEMPLATE def create_prompt_variation(prompt: Union[List[str], str], llm: BaseLLM, meta_prompt: str = None) -> List[str]: @@ -35,7 +35,14 @@ def create_prompt_variation(prompt: Union[List[str], str], llm: BaseLLM, meta_pr return varied_prompts -def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = None, n_samples: int = 3) -> List[str]: +def create_prompts_from_samples( + task: BaseTask, + llm: BaseLLM, + meta_prompt: str = None, + n_samples: int = 3, + task_description: str = None, + n_prompts: int = 1, +) -> List[str]: """Generate a set of prompts from dataset examples sampled from a given task. Idea taken from the paper Zhou et al. (2021) https://arxiv.org/pdf/2211.01910 @@ -50,37 +57,44 @@ def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = meta_prompt (str): The meta prompt to use for generating the prompts. If None, a default meta prompt is used. n_samples (int): The number of samples to use for generating prompts. + task_description (str): The description of the task to include in the prompt. 
+ n_prompts (int): The number of prompts to generate. Returns: List[str]: A list of generated prompts. """ - if isinstance(task, ClassificationTask): - # if classification task sample such that all classes are represented - unique_classes, counts = np.unique(task.ys, return_counts=True) - proportions = counts / len(task.ys) - samples_per_class = np.round(proportions * n_samples).astype(int) - samples_per_class = np.maximum(samples_per_class, 1) - - # sample - xs = [] - ys = [] - for cls, n_samples in zip(unique_classes, samples_per_class): - indices = np.where(task.ys == cls)[0] - indices = np.random.choice(indices, n_samples, replace=False) - xs.extend(task.xs[indices]) - ys.extend(task.ys[indices]) - - else: - # if not classification task, sample randomly - indices = np.random.choice(len(task.xs), n_samples, replace=False) - xs = task.xs[indices].tolist() - ys = task.ys[indices].tolist() - - meta_prompt = PROMPT_CREATION_TEMPLATE if meta_prompt is None else meta_prompt - examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) - meta_prompt = meta_prompt.replace("")[0].split("")[-1] - - return prompt + meta_prompts = [] + for _ in range(n_prompts): + if isinstance(task, ClassificationTask): + # if classification task sample such that all classes are represented + unique_labels, counts = np.unique(task.ys, return_counts=True) + proportions = counts / len(task.ys) + samples_per_class = np.round(proportions * n_samples).astype(int) + samples_per_class = np.maximum(samples_per_class, 1) + + # sample + xs = [] + ys = [] + for label, n_samples in zip(unique_labels, samples_per_class): + indices = np.where(task.ys == label)[0] + indices = np.random.choice(indices, n_samples, replace=False) + xs.extend(task.xs[indices]) + ys.extend(task.ys[indices]) + + else: + # if not classification task, sample randomly + indices = np.random.choice(len(task.xs), n_samples, replace=False) + xs = task.xs[indices].tolist() + ys = task.ys[indices].tolist() + + if meta_prompt is None: + meta_prompt = PROMPT_CREATION_TEMPLATE + if task_description is None: + meta_prompt = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) + examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) + meta_prompt = meta_prompt.replace("", examples) + meta_prompts.append(meta_prompt) + prompts = llm.get_response(meta_prompts) + prompts = [prompt.split("")[0].split("")[-1].strip() for prompt in prompts] + + return prompts diff --git a/pyproject.toml b/pyproject.toml index e4f5be3..bd5b6ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "promptolution" -version = "1.2.0" +version = "1.3.0" description = "" authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"] readme = "README.md" diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py deleted file mode 100644 index 474af3e..0000000 --- a/scripts/opro_test_run.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Test run for the Opro optimizer.""" - -from logging import Logger - -from promptolution.callbacks import LoggerCallback -from promptolution.llms import get_llm -from promptolution.optimizers import Opro -from promptolution.predictors import get_predictor -from promptolution.tasks import get_task - -from promptolution.config import Config - -logger = Logger(__name__) - - -def main(): - """Run a test run for the Opro optimizer.""" - config = Config( - meta_llm="meta-llama/Meta-Llama-3-8B-Instruct", - ds_path="data_sets/agnews", - task_name="agnews", - n_steps=10, - optimizer="opro", - 
downstream_llm="meta-llama/Meta-Llama-3-8B-Instruct", - evaluation_llm="meta-llama/Meta-Llama-3-8B-Instruct", - - ) - task = get_task(config, split="dev") - predictor = get_predictor(config.evaluation_llm, classes=task.classes) - - llm = get_llm(config.meta_llm) - optimizer = Opro( - llm, - initial_prompts=task.initial_population, - task=task, - predictor=predictor, - callbacks=[LoggerCallback(logger)], - n_samples=5, - ) - prompts = optimizer.optimize(n_steps=10) - - logger.info(f"Optimized prompts: {prompts}") - - -if __name__ == "__main__": - main() diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py new file mode 100644 index 0000000..d60efb9 --- /dev/null +++ b/scripts/optimizer_test_run.py @@ -0,0 +1,36 @@ +"""Test run for the Opro optimizer.""" +import argparse +from logging import Logger + +from promptolution.callbacks import LoggerCallback, CSVCallback +from promptolution.helpers import run_optimization + +from promptolution.config import Config + +logger = Logger(__name__) + +"""Run a test run for any of the implemented optimizers.""" +parser = argparse.ArgumentParser() +parser.add_argument("--model") +parser.add_argument("--model-storage-path", default="../models/") +parser.add_argument("--optimizer", default="evopromptde") +parser.add_argument("--n-steps", type=int, default=10) +parser.add_argument("--token", default=None) +args = parser.parse_args() + +config = Config( + meta_llm=args.model, + ds_path="data_sets/cls/agnews", + task_name="agnews", + predictor="FirstOccurenceClassificator", + n_steps=args.n_steps, + optimizer=args.optimizer, + downstream_llm=args.model, + evaluation_llm=args.model, + api_token=args.token, + model_storage_path=args.model_storage_path, +) + +prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/{args.model}/")]) + +logger.info(f"Optimized prompts: {prompts}") diff --git a/scripts/prompt_creation_run.py b/scripts/prompt_creation_run.py index 4c17694..f7d54c3 100644 --- a/scripts/prompt_creation_run.py +++ b/scripts/prompt_creation_run.py @@ -21,7 +21,7 @@ def main(): llm = get_llm("meta-llama/Meta-Llama-3-8B-Instruct") task = get_task(config, split="dev") - predictor = get_predictor("meta-llama/Meta-Llama-3-8B-Instruct", classes=task.classes) + predictor = get_predictor(llm, classes=task.classes) init_prompts = create_prompts_from_samples(task, llm) logger.critical(f"Initial prompts: {init_prompts}") From 859831cb15ac7ade08405c616b0f6d82819957cd Mon Sep 17 00:00:00 2001 From: finitearth Date: Sun, 9 Mar 2025 22:52:30 +0100 Subject: [PATCH 32/41] fixed prompt creation with task description --- promptolution/utils/prompt_creation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 08e88dd..9667a48 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -89,7 +89,7 @@ def create_prompts_from_samples( if meta_prompt is None: meta_prompt = PROMPT_CREATION_TEMPLATE - if task_description is None: + if task_description is not None: meta_prompt = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) meta_prompt = meta_prompt.replace("", examples) From f53e4d2efe1f614e33fa140e43a5e79fafad4785 Mon Sep 17 00:00:00 2001 From: finitearth Date: Mon, 10 Mar 2025 00:06:49 +0100 Subject: [PATCH 33/41] make classifaction task for prompt creation optional --- 
promptolution/utils/prompt_creation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 9667a48..560b464 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -42,6 +42,7 @@ def create_prompts_from_samples( n_samples: int = 3, task_description: str = None, n_prompts: int = 1, + get_uniform_labels: bool = False, ) -> List[str]: """Generate a set of prompts from dataset examples sampled from a given task. @@ -59,13 +60,14 @@ def create_prompts_from_samples( n_samples (int): The number of samples to use for generating prompts. task_description (str): The description of the task to include in the prompt. n_prompts (int): The number of prompts to generate. + get_uniform_labels (bool): If True, samples are selected such that all classes are represented. Returns: List[str]: A list of generated prompts. """ meta_prompts = [] for _ in range(n_prompts): - if isinstance(task, ClassificationTask): + if isinstance(task, ClassificationTask) and get_uniform_labels: # if classification task sample such that all classes are represented unique_labels, counts = np.unique(task.ys, return_counts=True) proportions = counts / len(task.ys) From c0630393a36e9b4dcc749548d78b6c7f9fcc21ee Mon Sep 17 00:00:00 2001 From: finitearth Date: Mon, 10 Mar 2025 00:10:04 +0100 Subject: [PATCH 34/41] fix meta_prompt_template --- promptolution/utils/prompt_creation.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 560b464..718ad76 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -65,6 +65,10 @@ def create_prompts_from_samples( Returns: List[str]: A list of generated prompts. """ + if meta_prompt is None: + meta_prompt_template = PROMPT_CREATION_TEMPLATE + if task_description is not None: + meta_prompt_template = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) meta_prompts = [] for _ in range(n_prompts): if isinstance(task, ClassificationTask) and get_uniform_labels: @@ -89,13 +93,10 @@ def create_prompts_from_samples( xs = task.xs[indices].tolist() ys = task.ys[indices].tolist() - if meta_prompt is None: - meta_prompt = PROMPT_CREATION_TEMPLATE - if task_description is not None: - meta_prompt = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) - meta_prompt = meta_prompt.replace("", examples) + meta_prompt = meta_prompt_template.replace("", examples) meta_prompts.append(meta_prompt) + prompts = llm.get_response(meta_prompts) prompts = [prompt.split("")[0].split("")[-1].strip() for prompt in prompts] From 5e0b8f7a2fcc493199dddf7cc09a2321e8872bde Mon Sep 17 00:00:00 2001 From: finitearth Date: Mon, 10 Mar 2025 00:24:25 +0100 Subject: [PATCH 35/41] enable not forcing class output for marker based classifactor --- promptolution/predictors/classificator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/promptolution/predictors/classificator.py b/promptolution/predictors/classificator.py index 89eb5d4..bb05930 100644 --- a/promptolution/predictors/classificator.py +++ b/promptolution/predictors/classificator.py @@ -75,12 +75,12 @@ class MarkerBasedClassificator(BasePredictor): BasePredictor: The base class for predictors in the promptolution library. 
""" - def __init__(self, llm, classes, marker="", *args, **kwargs): + def __init__(self, llm, classes=None, marker="", *args, **kwargs): """Initialize the Classificator. Args: llm: The language model to use for predictions. - classes (List[str]): The list of valid class labels. + classes (List[str]): The list of valid class labels. If None, does not force any class. marker (str): The marker to use for extracting the class label. *args, **kwargs: Additional arguments for the BasePredictor. """ @@ -101,11 +101,11 @@ def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray """ response = [] for pred in preds: - predicted_class = pred.split(self.marker)[-1].strip() - if predicted_class not in self.classes: - predicted_class = self.classes[0] + pred = pred.split(self.marker)[-1].strip() + if self.classes is not None and pred not in self.classes: + pred = self.classes[0] - response.append(predicted_class) + response.append(pred) response = np.array(response).reshape(*shape) return response From eeb6995a04727d19e402a8da49af1d6e565a9b51 Mon Sep 17 00:00:00 2001 From: finitearth Date: Tue, 11 Mar 2025 13:12:55 +0100 Subject: [PATCH 36/41] updated callbacks --- promptolution/callbacks.py | 21 ++++++++++++--------- promptolution/optimizers/evoprompt_ga.py | 1 + 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/promptolution/callbacks.py b/promptolution/callbacks.py index 48a9b3e..452aeae 100644 --- a/promptolution/callbacks.py +++ b/promptolution/callbacks.py @@ -1,7 +1,7 @@ """Callback classes for logging, saving, and tracking optimization progress.""" import os -import time +from datetime import datetime from typing import Literal import numpy as np @@ -64,7 +64,8 @@ def __init__(self, logger): def on_step_end(self, optimizer): """Log information about the current step.""" self.step += 1 - self.logger.critical(f"✨Step {self.step} ended✨") + time = datetime.now().strftime("%d-%m-%y %H:%M:%S:%f") + self.logger.critical(f"{time} - ✨Step {self.step} ended✨") for i, (prompt, score) in enumerate(zip(optimizer.prompts, optimizer.scores)): self.logger.critical(f"*** Prompt {i}: Score: {score}") self.logger.critical(f"{prompt}") @@ -78,10 +79,11 @@ def on_train_end(self, optimizer, logs=None): optimizer: The optimizer object that called the callback. logs: Additional information to log. """ + time = datetime.now().strftime("%d-%m-%y %H:%M:%S:%f") if logs is None: - self.logger.critical("Training ended") + self.logger.critical(f"{time} - Training ended") else: - self.logger.critical(f"Training ended - {logs}") + self.logger.critical(f"{time} - Training ended - {logs}") return True @@ -109,8 +111,8 @@ def __init__(self, dir): self.step = 0 self.input_tokens = 0 self.output_tokens = 0 - self.start_time = time.time() - self.step_time = time.time() + self.start_time = datetime.now() + self.step_time = datetime.now() def on_step_end(self, optimizer): """Save prompts and scores to csv. 
@@ -124,12 +126,12 @@ def on_step_end(self, optimizer): "step": [self.step] * len(optimizer.prompts), "input_tokens": [optimizer.meta_llm.input_token_count - self.input_tokens] * len(optimizer.prompts), "output_tokens": [optimizer.meta_llm.output_token_count - self.output_tokens] * len(optimizer.prompts), - "time_elapsed": [time.time() - self.step_time] * len(optimizer.prompts), + "time_elapsed": [(datetime.now() - self.step_time).total_seconds()] * len(optimizer.prompts), "score": optimizer.scores, "prompt": optimizer.prompts, } ) - self.step_time = time.time() + self.step_time = datetime.now() self.input_tokens = optimizer.meta_llm.input_token_count self.output_tokens = optimizer.meta_llm.output_token_count @@ -151,7 +153,8 @@ def on_train_end(self, optimizer): steps=self.step, input_tokens=optimizer.meta_llm.input_token_count, output_tokens=optimizer.meta_llm.output_token_count, - time_elapsed=time.time() - self.start_time, + time_elapsed=(datetime.now() - self.start_time).total_seconds(), + time=datetime.now(), score=np.array(optimizer.scores).mean(), best_prompts=str(optimizer.prompts), ), diff --git a/promptolution/optimizers/evoprompt_ga.py b/promptolution/optimizers/evoprompt_ga.py index f6efcb8..b26ff53 100644 --- a/promptolution/optimizers/evoprompt_ga.py +++ b/promptolution/optimizers/evoprompt_ga.py @@ -81,6 +81,7 @@ def optimize(self, n_steps: int) -> List[str]: if not continue_optimization: break + self._on_train_end() return self.prompts def _crossover(self, prompts, scores) -> str: From 147052e180550412be4ac388f0ae40b577d2167b Mon Sep 17 00:00:00 2001 From: finitearth Date: Tue, 11 Mar 2025 13:30:15 +0100 Subject: [PATCH 37/41] add seeding to vllm and sampling params --- promptolution/llms/vllm.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index f558458..2021fea 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -44,12 +44,12 @@ def __init__( temperature: float = 0.1, top_p: float = 0.9, model_storage_path: str | None = None, - token: str | None = None, dtype: str = "auto", tensor_parallel_size: int = 1, gpu_memory_utilization: float = 0.95, max_model_len: int = 2048, trust_remote_code: bool = False, + seed: int = 42, **kwargs, ): """Initialize the VLLM with a specific model. @@ -67,6 +67,7 @@ def __init__( gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95. max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. + seed (int, optional): Random seed for the model. Defaults to 42. **kwargs: Additional keyword arguments to pass to the LLM class initialization. 
Note: @@ -81,7 +82,9 @@ def __init__( self.trust_remote_code = trust_remote_code # Configure sampling parameters - self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens) + self.sampling_params = SamplingParams( + temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens, seed=seed + ) # Initialize the vLLM engine with both explicit parameters and any additional kwargs llm_params = { @@ -93,6 +96,7 @@ def __init__( "max_model_len": self.max_model_len, "download_dir": model_storage_path, "trust_remote_code": self.trust_remote_code, + "seed": seed, **kwargs, } @@ -136,11 +140,6 @@ def _get_response(self, inputs: list[str]): for input in inputs ] - # Count input tokens - for prompt in prompts: - input_tokens = self.tokenizer.encode(prompt) - self.input_token_count += len(input_tokens) - # generate responses for self.batch_size prompts at the same time all_responses = [] for i in range(0, len(prompts), self.batch_size): From 984220b442e72414b9aef93ec91d68df2c34564c Mon Sep 17 00:00:00 2001 From: mo374z Date: Tue, 11 Mar 2025 18:06:45 +0100 Subject: [PATCH 38/41] add random seed to test script --- promptolution/helpers.py | 4 +++- scripts/optimizer_test_run.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 52472ea..28180ea 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -37,7 +37,9 @@ def run_optimization(config: Config, callbacks: List = None): List[str]: The optimized list of prompts. """ task = get_task(config) - llm = get_llm(config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path) + llm = get_llm( + config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path, seed=config.random_seed + ) if config.predictor == "MarkerBasedClassificator": predictor = MarkerBasedClassificator(llm, classes=task.classes) elif config.predictor == "FirstOccurenceClassificator": diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index d60efb9..808fc5a 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -16,6 +16,7 @@ parser.add_argument("--optimizer", default="evopromptde") parser.add_argument("--n-steps", type=int, default=10) parser.add_argument("--token", default=None) +parser.add_argument("--seed", type=int, default=187) args = parser.parse_args() config = Config( @@ -29,8 +30,9 @@ evaluation_llm=args.model, api_token=args.token, model_storage_path=args.model_storage_path, + random_seed=args.seed, ) -prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/{args.model}/")]) +prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/seedingtest/{args.model}/")]) logger.info(f"Optimized prompts: {prompts}") From aa26e5fa0051b1fe1a677b7af1286275782ae15c Mon Sep 17 00:00:00 2001 From: mo374z Date: Tue, 11 Mar 2025 18:27:32 +0100 Subject: [PATCH 39/41] align with token / no token --- promptolution/helpers.py | 9 +++++---- scripts/optimizer_test_run.py | 5 ++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 28180ea..da70be7 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -27,7 +27,7 @@ def run_experiment(config: Config): return df -def run_optimization(config: Config, callbacks: List = None): +def run_optimization(config: Config, callbacks: List = None, use_token: bool =
False): """Run the optimization phase of the experiment. Args: @@ -37,9 +37,10 @@ def run_optimization(config: Config, callbacks: List = None): List[str]: The optimized list of prompts. """ task = get_task(config) - llm = get_llm( - config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path, seed=config.random_seed - ) + if use_token: + llm = get_llm(config.meta_llm, token=config.api_token) + else: + llm = get_llm(config.meta_llm, model_storage_path=config.model_storage_path, seed=config.random_seed) if config.predictor == "MarkerBasedClassificator": predictor = MarkerBasedClassificator(llm, classes=task.classes) elif config.predictor == "FirstOccurenceClassificator": diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index 808fc5a..802208e 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -33,6 +33,9 @@ random_seed=args.seed, ) -prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/seedingtest/{args.model}/")]) +if args.token is None: + prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/seedingtest/{args.model}/")]) +else: + prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/seedingtest/{args.model}/")], use_token=True) logger.info(f"Optimized prompts: {prompts}") From 5b483df7a3b293d751ae399eeae3dcc74dd3b47c Mon Sep 17 00:00:00 2001 From: mo374z Date: Tue, 11 Mar 2025 18:38:21 +0100 Subject: [PATCH 40/41] delete script --- scripts/llm_test_run.py | 98 ----------------------------------------- 1 file changed, 98 deletions(-) delete mode 100644 scripts/llm_test_run.py diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py deleted file mode 100644 index 442475a..0000000 --- a/scripts/llm_test_run.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Test script for measuring raw LLM inference performance on a dataset.""" -import argparse -import time -from logging import Logger - -import numpy as np -import pandas as pd -from promptolution.config import Config -from promptolution.llms import get_llm -from promptolution.predictors import FirstOccurrenceClassificator -from promptolution.tasks import get_task -from tqdm import tqdm - -logger = Logger(__name__) - -# TODO: Align this script with how we import datasets in capo - - -"""Run inference test on a dataset using a specified LLM.""" -parser = argparse.ArgumentParser() -parser.add_argument("--model") -parser.add_argument("--output") -parser.add_argument("--datasets", default=["subj"]) -parser.add_argument("--token", default=None) -parser.add_argument("--batch-size", default=None) -parser.add_argument("--revision", default="main") -parser.add_argument("--max-model-len", default=None) -parser.add_argument("--model-storage-path", default=None) -args = parser.parse_args() - -start_time = time.time() - -if args.max_model_len is not None: - max_model_len = int(args.max_model_len) - -if "vllm" in args.model: - llm = get_llm( - args.model, - batch_size=args.batch_size, - max_model_len=max_model_len, - model_storage_path=args.model_storage_path, - revision=args.revision, - ) -else: - llm = get_llm(args.model, args.token) - -results = pd.DataFrame() - -for dataset in args.datasets: - config = Config( - evaluation_llm=args.model, - ds_path=f"data_sets/cls/{dataset}/", - task_name=dataset, - api_token=args.token, - n_eval_samples=200, - ) - - task = get_task(config, split="dev") - predictor = FirstOccurrenceClassificator(llm, classes=task.classes) - - prompts 
= [task.initial_population[0]] - - xs = task.xs[: config.n_eval_samples] - ys = task.ys[: config.n_eval_samples] - - for prompt in tqdm(prompts): - preds, seqs = predictor.predict(prompt, xs, return_seq=True) - - scores = [] - for i in range(len(xs)): - scores.append(1 if preds[0][i] == ys[i] else 0) - - # clean up the sequences - seqs = [seq.replace("\n", "").strip() for seq in seqs] - - # if single prompts should be stored - # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) - # df.to_csv(args.output + "_detailed", index=False) - - accuracy = np.array(scores).mean() - - results = pd.DataFrame( - dict( - model=args.model, - dataset=dataset, - prompt=prompt, - accuracy=accuracy, - n_samples=len(xs), - ), - index=[0], - ) - results.to_csv(args.output, mode="a", header=False, index=False) - -total_inference_time = time.time() - start_time -print( - f"Total inference took {total_inference_time:.2f} seconds and required {llm.get_token_count()} tokens." -) -print(f"Results saved to {args.output}") From 39c58e47340eacd63d510d00b74367ed068d049a Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 12 Mar 2025 16:53:15 +0100 Subject: [PATCH 41/41] fix prompt creation if else --- promptolution/utils/prompt_creation.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 718ad76..85a613e 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -65,10 +65,15 @@ def create_prompts_from_samples( Returns: List[str]: A list of generated prompts. """ - if meta_prompt is None: + if meta_prompt is None and task_description is None: meta_prompt_template = PROMPT_CREATION_TEMPLATE - if task_description is not None: + elif meta_prompt is None and task_description is not None: meta_prompt_template = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) + elif meta_prompt is not None and task_description is None: + meta_prompt_template = meta_prompt + elif meta_prompt is not None and task_description is not None: + meta_prompt_template = meta_prompt.replace("", task_description) + meta_prompts = [] for _ in range(n_prompts): if isinstance(task, ClassificationTask) and get_uniform_labels:
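Usage sketch: the snippet below shows one way the pieces introduced in these patches could be combined — a Config as built in scripts/optimizer_test_run.py, the boolean-returning callback protocol from base_optimizer.py, and the meta LLM token counters added to the vLLM wrapper — to stop a run once a token budget is exceeded. It is a minimal, untested sketch: the model identifier, dataset path, token budget, and the TokenBudgetCallback class are illustrative assumptions, and it presumes any object exposing on_step_end/on_epoch_end/on_train_end can be passed as a callback.

# Minimal sketch: stop an optimization run on a token budget (placeholder values).
from logging import Logger

from promptolution.callbacks import LoggerCallback
from promptolution.config import Config
from promptolution.helpers import run_optimization

logger = Logger(__name__)


class TokenBudgetCallback:
    """Hypothetical callback: end the run once the meta LLM has used too many tokens."""

    def __init__(self, max_tokens):
        self.max_tokens = max_tokens

    def on_step_end(self, optimizer):
        # input_token_count / output_token_count are maintained by the vLLM wrapper (patch 01).
        used = optimizer.meta_llm.input_token_count + optimizer.meta_llm.output_token_count
        return used < self.max_tokens  # returning False ends the optimization early

    def on_epoch_end(self, optimizer):
        return True

    def on_train_end(self, optimizer):
        return True


config = Config(
    meta_llm="vllm-meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model id
    ds_path="data_sets/cls/agnews",
    task_name="agnews",
    predictor="MarkerBasedClassificator",
    optimizer="evopromptga",
    n_steps=10,
    downstream_llm="vllm-meta-llama/Meta-Llama-3-8B-Instruct",
    evaluation_llm="vllm-meta-llama/Meta-Llama-3-8B-Instruct",
    api_token=None,
    model_storage_path="../models/",
    random_seed=42,
)

# Budget of 500k tokens is an arbitrary example value.
prompts = run_optimization(config, callbacks=[LoggerCallback(logger), TokenBudgetCallback(500_000)])
logger.info(f"Optimized prompts: {prompts}")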