10 changes: 10 additions & 0 deletions docs/release-notes.md
@@ -1,5 +1,15 @@
# Release Notes

## Release v1.3.2
### What's changed
#### Added features
* Allow configuration and evaluation of system prompts in all LLM classes
* CSVCallback is now FileOutputCallback and can write Parquet files in addition to CSV

#### Further changes
* Fixed LLM call templates in VLLM
* Refined the OPRO implementation to follow the paper more closely

## Release v1.3.1
### What's changed
#### Added features
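For users upgrading from v1.3.1, the callback rename noted above is a one-line change; a minimal sketch (the output directory is illustrative, and `file_type` defaults to `"parquet"`):

```python
# v1.3.1:
#   from promptolution.callbacks import CSVCallback
#   callback = CSVCallback("results/")

# v1.3.2: same directory argument; file_type selects "csv" or "parquet"
from promptolution.callbacks import FileOutputCallback

callback = FileOutputCallback("results/", file_type="csv")
```
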
78 changes: 30 additions & 48 deletions promptolution/callbacks.py
@@ -88,32 +88,37 @@ def on_train_end(self, optimizer, logs=None):
return True


class CSVCallback(Callback):
"""Callback for saving optimization progress to a CSV file.
class FileOutputCallback(Callback):
"""Callback for saving optimization progress to a specified file type.

This callback saves prompts and scores at each step to a CSV file.
This callback saves information about each step to a file.

Attributes:
dir (str): Directory the CSV file is saved to.
dir (str): Directory the file is saved to.
step (int): The current step number.
file_type (str): The type of file to save the output to.
"""

def __init__(self, dir):
"""Initialize the CSVCallback.
def __init__(self, dir, file_type: Literal["parquet", "csv"] = "parquet"):
"""Initialize the FileOutputCallback.

Args:
dir (str): Directory the CSV file is saved to.
file_type (str): The type of file to save the output to.
"""
if not os.path.exists(dir):
os.makedirs(dir)

self.dir = dir
self.dir = dir
self.file_type = file_type

if file_type == "parquet":
self.path = dir + "/step_results.parquet"
elif file_type == "csv":
self.path = dir + "/step_results.csv"
else:
raise ValueError(f"File type {file_type} not supported.")

self.step = 0
self.input_tokens = 0
self.output_tokens = 0
self.start_time = datetime.now()
self.step_time = datetime.now()

def on_step_end(self, optimizer):
"""Save prompts and scores to csv.
@@ -125,47 +130,24 @@ def on_step_end(self, optimizer):
df = pd.DataFrame(
{
"step": [self.step] * len(optimizer.prompts),
"input_tokens": [optimizer.meta_llm.input_token_count - self.input_tokens] * len(optimizer.prompts),
"output_tokens": [optimizer.meta_llm.output_token_count - self.output_tokens] * len(optimizer.prompts),
"time_elapsed": [(datetime.now() - self.step_time).total_seconds()] * len(optimizer.prompts),
"input_tokens": [optimizer.meta_llm.input_token_count] * len(optimizer.prompts),
"output_tokens": [optimizer.meta_llm.output_token_count] * len(optimizer.prompts),
"time": [datetime.now().total_seconds()] * len(optimizer.prompts),
"score": optimizer.scores,
"prompt": optimizer.prompts,
}
)
self.step_time = datetime.now()
self.input_tokens = optimizer.meta_llm.input_token_count
self.output_tokens = optimizer.meta_llm.output_token_count

if not os.path.exists(self.dir + "step_results.csv"):
df.to_csv(self.dir + "step_results.csv", index=False)
else:
df.to_csv(self.dir + "step_results.csv", mode="a", header=False, index=False)

return True

def on_train_end(self, optimizer):
"""Called at the end of training.

Args:
optimizer: The optimizer object that called the callback.
"""
df = pd.DataFrame(
dict(
steps=self.step,
input_tokens=optimizer.meta_llm.input_token_count,
output_tokens=optimizer.meta_llm.output_token_count,
time_elapsed=(datetime.now() - self.start_time).total_seconds(),
time=datetime.now(),
score=np.array(optimizer.scores).mean(),
best_prompts=str(optimizer.prompts),
),
index=[0],
)

if not os.path.exists(self.dir + "train_results.csv"):
df.to_csv(self.dir + "train_results.csv", index=False)
else:
df.to_csv(self.dir + "train_results.csv", mode="a", header=False, index=False)
if self.file_type == "parquet":
if self.step == 1:
df.to_parquet(self.path, index=False)
else:
# pandas' to_parquet has no append mode; read the existing file and rewrite it
existing = pd.read_parquet(self.path)
pd.concat([existing, df], ignore_index=True).to_parquet(self.path, index=False)
elif self.file_type == "csv":
if self.step == 1:
df.to_csv(self.path, index=False)
else:
df.to_csv(self.path, mode="a", header=False, index=False)

return True

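A quick look at what the new callback produces, using only the constructor shown above; how the callback is attached to an optimizer is not part of this diff, so the run itself is left out:

```python
from promptolution.callbacks import FileOutputCallback

# One row per candidate prompt per step; columns per the DataFrame above:
# step, input_tokens, output_tokens, time, score, prompt
cb = FileOutputCallback("results", file_type="parquet")
print(cb.path)  # results/step_results.parquet

# After an optimization run, the log can be loaded back with pandas:
# import pandas as pd
# pd.read_parquet(cb.path)
```
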
10 changes: 6 additions & 4 deletions promptolution/llms/api_llm.py
@@ -10,19 +10,20 @@
import requests
from langchain_anthropic import ChatAnthropic
from langchain_community.chat_models.deepinfra import ChatDeepInfra, ChatDeepInfraException
from langchain_core.messages import HumanMessage
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI

from promptolution.llms.base_llm import BaseLLM

logger = Logger(__name__)


async def invoke_model(prompt, model, semaphore):
async def invoke_model(prompt, system_prompt, model, semaphore):
"""Asynchronously invoke a language model with retry logic.

Args:
prompt (str): The input prompt for the model.
system_prompt (str): The system prompt for the model.
model: The language model to invoke.
semaphore (asyncio.Semaphore): Semaphore to limit concurrent calls.

@@ -39,7 +40,7 @@ async def invoke_model(prompt, model, semaphore):

while attempts < max_retries:
try:
response = await model.ainvoke([HumanMessage(content=prompt)])
response = await model.ainvoke([SystemMessage(content=system_prompt), HumanMessage(content=prompt)])
return response.content
except ChatDeepInfraException as e:
print(f"DeepInfra error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds...")
@@ -80,13 +81,14 @@ def __init__(self, model_id: str, token: str = None, **kwargs: Any):
else:
self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token)

def _get_response(self, prompts: List[str]) -> List[str]:
def _get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
"""Get responses for a list of prompts in a synchronous manner.

This method includes retry logic for handling connection errors and rate limits.

Args:
prompts (list[str]): List of input prompts.
system_prompts (list[str], optional): List of system prompts. If not provided, the default system prompt is used.

Returns:
list[str]: List of model responses.
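The semaphore argument exists to cap the number of in-flight requests; a self-contained sketch of the same asyncio pattern, with `fake_model` standing in for `model.ainvoke` (the retry logic is omitted):

```python
import asyncio


async def fake_model(system_prompt: str, prompt: str) -> str:
    await asyncio.sleep(0.01)  # stands in for a network round trip
    return f"[{system_prompt}] {prompt}"


async def invoke(prompt: str, system_prompt: str, semaphore: asyncio.Semaphore) -> str:
    async with semaphore:  # at most N coroutines enter this block at once
        return await fake_model(system_prompt, prompt)


async def main() -> None:
    semaphore = asyncio.Semaphore(8)  # example cap of 8 concurrent calls
    prompts = [f"prompt {i}" for i in range(20)]
    responses = await asyncio.gather(
        *(invoke(p, "You are a helpful assistant.", semaphore) for p in prompts)
    )
    print(len(responses))  # 20


asyncio.run(main())
```
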
18 changes: 13 additions & 5 deletions promptolution/llms/base_llm.py
@@ -6,6 +6,8 @@

import numpy as np

from promptolution.templates import DEFAULT_SYS_PROMPT

logger = logging.getLogger(__name__)


@@ -54,7 +56,7 @@ def update_token_count(self, inputs: List[str], outputs: List[str]):
self.input_token_count += input_tokens
self.output_token_count += output_tokens

def get_response(self, prompts: str) -> str:
def get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
"""Generate responses for the given prompts.

This method calls the _get_response method to generate responses
@@ -64,31 +66,37 @@ def get_response(self, prompts: str) -> str:
Args:
prompts (str or List[str]): Input prompt(s). If a single string is provided,
it's converted to a list containing that string.
system_prompts (str or List[str]): System prompt(s) to provide context to the model.

Returns:
List[str]: A list of generated responses, one for each input prompt.
"""
if system_prompts is None:
system_prompts = DEFAULT_SYS_PROMPT
if isinstance(prompts, str):
prompts = [prompts]
responses = self._get_response(prompts)
self.update_token_count(prompts, responses)
if isinstance(system_prompts, str):
system_prompts = [system_prompts] * len(prompts)
responses = self._get_response(prompts, system_prompts)
self.update_token_count(prompts + system_prompts, responses)

return responses

@abstractmethod
def _get_response(self, prompts: List[str]) -> List[str]:
def _get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
"""Generate responses for the given prompts.

This method should be implemented by subclasses to define how
the LLM generates responses.

Args:
prompts (List[str]): A list of input prompts.
system_prompts (List[str]): A list of system prompts to provide context to the model.

Returns:
List[str]: A list of generated responses corresponding to the input prompts.
"""
pass
raise NotImplementedError


class DummyLLM(BaseLLM):
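The argument handling added to get_response is small enough to restate on its own; a condensed, self-contained sketch of the broadcasting behaviour (DEFAULT_SYS_PROMPT as defined in templates.py below):

```python
from typing import List, Optional, Tuple, Union

DEFAULT_SYS_PROMPT = "You are a helpful assistant."


def normalize(
    prompts: Union[str, List[str]],
    system_prompts: Optional[Union[str, List[str]]] = None,
) -> Tuple[List[str], List[str]]:
    """Mirror of the prompt/system-prompt handling in BaseLLM.get_response (condensed)."""
    if system_prompts is None:
        system_prompts = DEFAULT_SYS_PROMPT
    if isinstance(prompts, str):
        prompts = [prompts]
    if isinstance(system_prompts, str):
        # a single system prompt is broadcast to every user prompt
        system_prompts = [system_prompts] * len(prompts)
    return prompts, system_prompts


print(normalize(["hi", "bye"]))
# (['hi', 'bye'], ['You are a helpful assistant.', 'You are a helpful assistant.'])
```
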
8 changes: 6 additions & 2 deletions promptolution/llms/local_llm.py
@@ -50,7 +50,7 @@ def __init__(self, model_id: str, batch_size=8):
self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id
self.pipeline.tokenizer.padding_side = "left"

def _get_response(self, prompts: list[str]):
def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[str]:
"""Generate responses for a list of prompts using the local language model.

Args:
@@ -63,8 +63,12 @@ def _get_response(self, prompts: list[str]):
This method uses torch.no_grad() for inference to reduce memory usage.
It handles both single and batch inputs, ensuring consistent output format.
"""
inputs = []
for prompt, sys_prompt in zip(prompts, system_prompts):
inputs.append([{"role": "system", "content": sys_prompt}, {"role": "user", "content": prompt}])

with torch.no_grad():
response = self.pipeline(prompts, pad_token_id=self.pipeline.tokenizer.eos_token_id)
response = self.pipeline(inputs, pad_token_id=self.pipeline.tokenizer.eos_token_id)

if len(response) != 1:
response = [r[0] if isinstance(r, list) else r for r in response]
9 changes: 5 additions & 4 deletions promptolution/llms/vllm.py
@@ -108,7 +108,7 @@ def __init__(
# Initialize tokenizer separately for potential pre-processing
self.tokenizer = AutoTokenizer.from_pretrained(model_id)

def _get_response(self, inputs: list[str]):
def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[str]:
"""Generate responses for a list of prompts using the vLLM engine.

Args:
@@ -126,13 +126,14 @@ def _get_response(self, inputs: list[str]):
[
{
"role": "system",
"content": "You are a helpful assistant.",
"content": sys_prompt,
},
{"role": "user", "content": input},
{"role": "user", "content": prompt},
],
tokenize=False,
add_generation_prompt=True,
)
for input in inputs
for prompt, sys_prompt in zip(prompts, system_prompts)
]

# generate responses for self.batch_size prompts at the same time
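The per-prompt string handed to the vLLM engine can be inspected directly with the tokenizer; a small sketch (the checkpoint is only an example of an instruction-tuned model with a chat template, and fetching it needs network access):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # example checkpoint

text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": "You are a terse sentiment classifier."},
        {"role": "user", "content": "Classify: the movie was wonderful."},
    ],
    tokenize=False,
    add_generation_prompt=True,
)
print(text)  # the fully formatted prompt string, system prompt included
```
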
8 changes: 6 additions & 2 deletions promptolution/predictors/base_predictor.py
@@ -31,7 +31,9 @@ def __init__(self, llm: BaseLLM):
"""
self.llm = llm

def predict(self, prompts: List[str], xs: np.ndarray, return_seq: bool = False) -> np.ndarray:
def predict(
self, prompts: List[str], xs: np.ndarray, system_prompts: List[str] = None, return_seq: bool = False
) -> np.ndarray:
"""Abstract method to make predictions based on prompts and input data.

Args:
@@ -48,7 +50,9 @@ def predict(self, prompts: List[str], xs: np.ndarray, return_seq: bool = False)
if isinstance(prompts, str):
prompts = [prompts]

outputs = self.llm.get_response([prompt + "\n" + x for prompt in prompts for x in xs])
outputs = self.llm.get_response(
[prompt + "\n" + x for prompt in prompts for x in xs], system_prompts=system_prompts
)
preds = self._extract_preds(outputs)

shape = (len(prompts), len(xs))
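For reference, the flattening and reshaping that predict performs around the LLM call can be shown with plain numpy; the upper-casing below stands in for llm.get_response:

```python
import numpy as np

prompts = ["P1", "P2"]
xs = np.array(["a", "b", "c"])

# Same ordering as predict: all inputs for the first prompt, then the second, ...
flat_inputs = [p + "\n" + x for p in prompts for x in xs]
flat_outputs = np.array([s.upper() for s in flat_inputs])  # stand-in for the LLM responses

preds = flat_outputs.reshape(len(prompts), len(xs))
print(preds.shape)  # (2, 3): one row of predictions per prompt
```
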
5 changes: 3 additions & 2 deletions promptolution/tasks/base_task.py
@@ -21,12 +21,13 @@ def __init__(self, *args, **kwargs):
pass

@abstractmethod
def evaluate(self, prompts: List[str], predictor) -> np.ndarray:
def evaluate(self, prompts: List[str], predictor, system_prompts: List[str] = None) -> np.ndarray:
"""Abstract method to evaluate prompts using a given predictor.

Args:
prompts (List[str]): List of prompts to evaluate.
predictor: The predictor to use for evaluation.
system_prompts (List[str]): List of system prompts passed to the predictor.

Returns:
np.ndarray: Array of evaluation scores for each prompt.
@@ -58,7 +59,7 @@ def __init__(self):
self.ys = np.array(["positive", "negative", "positive"])
self.classes = ["negative", "positive"]

def evaluate(self, prompts: List[str], predictor) -> np.ndarray:
def evaluate(self, prompts: List[str], predictor, system_prompts=None) -> np.ndarray:
"""Generate random evaluation scores for the given prompts.

Args:
4 changes: 3 additions & 1 deletion promptolution/tasks/classification_tasks.py
@@ -63,6 +63,7 @@ def evaluate(
self,
prompts: List[str],
predictor: BasePredictor,
system_prompts: List[str] = None,
n_samples: int = 20,
subsample: bool = False,
return_seq: bool = False,
@@ -72,6 +73,7 @@
Args:
prompts (List[str]): List of prompts to evaluate.
predictor (BasePredictor): Predictor to use for evaluation.
system_prompts (List[str], optional): List of system prompts passed to the predictor. Defaults to None.
n_samples (int, optional): Number of samples to use if subsampling. Defaults to 20.
subsample (bool, optional): Whether to use subsampling.
If set to true, samples a different subset per call. Defaults to False.
@@ -95,7 +97,7 @@ def evaluate(
ys_subsample = self.ys[indices]

# Make predictions on the subsample
preds = predictor.predict(prompts, xs_subsample, return_seq=return_seq)
preds = predictor.predict(prompts, xs_subsample, system_prompts=system_prompts, return_seq=return_seq)

if return_seq:
preds, seqs = preds
1 change: 1 addition & 0 deletions promptolution/templates.py
@@ -1,3 +1,4 @@
DEFAULT_SYS_PROMPT = "You are a helpful assistant."
EVOPROMPT_DE_TEMPLATE = """Please follow the instruction step-by-step to generate a better prompt.
Identifying the different parts between Prompt 1 and Prompt 2:
Prompt 1: Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "promptolution"
version = "1.3.1"
version = "1.3.2"
description = ""
authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"]
readme = "README.md"
4 changes: 2 additions & 2 deletions scripts/evoprompt_ga_test.py
@@ -4,7 +4,7 @@
import random
from logging import Logger

from promptolution.callbacks import LoggerCallback, CSVCallback, TokenCountCallback
from promptolution.callbacks import LoggerCallback, FileOutputCallback, TokenCountCallback
from promptolution.templates import EVOPROMPT_GA_TEMPLATE
from promptolution.helpers import get_llm
from promptolution.tasks import ClassificationTask
@@ -30,7 +30,7 @@

callbacks = [
LoggerCallback(logger),
CSVCallback(args.output_dir),
FileOutputCallback(args.output_dir, file_type="csv"),
TokenCountCallback(100000, "input_tokens"),
]
