diff --git a/docs/release-notes.md b/docs/release-notes.md
index 3a81e29..21579c6 100644
--- a/docs/release-notes.md
+++ b/docs/release-notes.md
@@ -1,5 +1,15 @@
 # Release Notes
 
+## Release v1.3.2
+### What's changed
+#### Added features
+* Allow configuration and evaluation of system prompts in all LLM classes
+* CSVCallback is now FileOutputCallback and can write Parquet files
+
+#### Further changes
+* Fixed LLM call templates in vLLM
+* Refined the OPRO implementation to be closer to the paper
+
 ## Release v1.3.1
 ### What's changed
 #### Added features
diff --git a/promptolution/callbacks.py b/promptolution/callbacks.py
index cfcca6a..27f104d 100644
--- a/promptolution/callbacks.py
+++ b/promptolution/callbacks.py
@@ -88,32 +88,37 @@ def on_train_end(self, optimizer, logs=None):
         return True
 
 
-class CSVCallback(Callback):
-    """Callback for saving optimization progress to a CSV file.
+class FileOutputCallback(Callback):
+    """Callback for saving optimization progress to a specified file type.
 
-    This callback saves prompts and scores at each step to a CSV file.
+    This callback saves information about each step to a file.
 
     Attributes:
-        dir (str): Directory the CSV file is saved to.
+        dir (str): Directory the file is saved to.
         step (int): The current step number.
+        file_type (str): The type of file to save the output to.
     """
 
-    def __init__(self, dir):
-        """Initialize the CSVCallback.
+    def __init__(self, dir, file_type: Literal["parquet", "csv"] = "parquet"):
+        """Initialize the FileOutputCallback.
 
         Args:
             dir (str): Directory the CSV file is saved to.
+            file_type (str): The type of file to save the output to.
         """
         if not os.path.exists(dir):
             os.makedirs(dir)
-        self.dir = dir
-        self.dir = dir
+        self.file_type = file_type
+
+        if file_type == "parquet":
+            self.path = dir + "/step_results.parquet"
+        elif file_type == "csv":
+            self.path = dir + "/step_results.csv"
+        else:
+            raise ValueError(f"File type {file_type} not supported.")
+
         self.step = 0
-        self.input_tokens = 0
-        self.output_tokens = 0
-        self.start_time = datetime.now()
-        self.step_time = datetime.now()
 
     def on_step_end(self, optimizer):
         """Save prompts and scores to csv.
@@ -125,47 +130,24 @@ def on_step_end(self, optimizer):
         df = pd.DataFrame(
             {
                 "step": [self.step] * len(optimizer.prompts),
-                "input_tokens": [optimizer.meta_llm.input_token_count - self.input_tokens] * len(optimizer.prompts),
-                "output_tokens": [optimizer.meta_llm.output_token_count - self.output_tokens] * len(optimizer.prompts),
-                "time_elapsed": [(datetime.now() - self.step_time).total_seconds()] * len(optimizer.prompts),
+                "input_tokens": [optimizer.meta_llm.input_token_count] * len(optimizer.prompts),
+                "output_tokens": [optimizer.meta_llm.output_token_count] * len(optimizer.prompts),
+                "time": [datetime.now().timestamp()] * len(optimizer.prompts),
                 "score": optimizer.scores,
                 "prompt": optimizer.prompts,
             }
         )
-        self.step_time = datetime.now()
-        self.input_tokens = optimizer.meta_llm.input_token_count
-        self.output_tokens = optimizer.meta_llm.output_token_count
-        if not os.path.exists(self.dir + "step_results.csv"):
-            df.to_csv(self.dir + "step_results.csv", index=False)
-        else:
-            df.to_csv(self.dir + "step_results.csv", mode="a", header=False, index=False)
-
-        return True
-
-    def on_train_end(self, optimizer):
-        """Called at the end of training.
-
-        Args:
-            optimizer: The optimizer object that called the callback.
- """ - df = pd.DataFrame( - dict( - steps=self.step, - input_tokens=optimizer.meta_llm.input_token_count, - output_tokens=optimizer.meta_llm.output_token_count, - time_elapsed=(datetime.now() - self.start_time).total_seconds(), - time=datetime.now(), - score=np.array(optimizer.scores).mean(), - best_prompts=str(optimizer.prompts), - ), - index=[0], - ) - - if not os.path.exists(self.dir + "train_results.csv"): - df.to_csv(self.dir + "train_results.csv", index=False) - else: - df.to_csv(self.dir + "train_results.csv", mode="a", header=False, index=False) + if self.file_type == "parquet": + if self.step == 1: + df.to_parquet(self.path, index=False) + else: + df.to_parquet(self.path, mode="a", index=False) + elif self.file_type == "csv": + if self.step == 1: + df.to_csv(self.path, index=False) + else: + df.to_csv(self.path, mode="a", header=False, index=False) return True diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index 4c8e6ec..d00bc91 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -10,7 +10,7 @@ import requests from langchain_anthropic import ChatAnthropic from langchain_community.chat_models.deepinfra import ChatDeepInfra, ChatDeepInfraException -from langchain_core.messages import HumanMessage +from langchain_core.messages import HumanMessage, SystemMessage from langchain_openai import ChatOpenAI from promptolution.llms.base_llm import BaseLLM @@ -18,11 +18,12 @@ logger = Logger(__name__) -async def invoke_model(prompt, model, semaphore): +async def invoke_model(prompt, system_prompt, model, semaphore): """Asynchronously invoke a language model with retry logic. Args: prompt (str): The input prompt for the model. + system_prompt (str): The system prompt for the model. model: The language model to invoke. semaphore (asyncio.Semaphore): Semaphore to limit concurrent calls. @@ -39,7 +40,7 @@ async def invoke_model(prompt, model, semaphore): while attempts < max_retries: try: - response = await model.ainvoke([HumanMessage(content=prompt)]) + response = await model.ainvoke([SystemMessage(content=system_prompt), HumanMessage(content=prompt)]) return response.content except ChatDeepInfraException as e: print(f"DeepInfra error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds...") @@ -80,13 +81,14 @@ def __init__(self, model_id: str, token: str = None, **kwargs: Any): else: self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token) - def _get_response(self, prompts: List[str]) -> List[str]: + def _get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]: """Get responses for a list of prompts in a synchronous manner. This method includes retry logic for handling connection errors and rate limits. Args: prompts (list[str]): List of input prompts. + system_prompts (list[str]): List of system prompts. If not provided, uses default system_prompts Returns: list[str]: List of model responses. 
diff --git a/promptolution/llms/base_llm.py b/promptolution/llms/base_llm.py
index 438ccf1..081b67e 100644
--- a/promptolution/llms/base_llm.py
+++ b/promptolution/llms/base_llm.py
@@ -6,6 +6,8 @@
 import numpy as np
 
+from promptolution.templates import DEFAULT_SYS_PROMPT
+
 logger = logging.getLogger(__name__)
 
 
@@ -54,7 +56,7 @@ def update_token_count(self, inputs: List[str], outputs: List[str]):
         self.input_token_count += input_tokens
         self.output_token_count += output_tokens
 
-    def get_response(self, prompts: str) -> str:
+    def get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
         """Generate responses for the given prompts.
 
         This method calls the _get_response method to generate responses
@@ -64,19 +66,24 @@ def get_response(self, prompts: str) -> str:
         Args:
             prompts (str or List[str]): Input prompt(s). If a single string is provided,
                 it's converted to a list containing that string.
+            system_prompts (str or List[str]): System prompt(s) to provide context to the model.
 
         Returns:
             List[str]: A list of generated responses, one for each input prompt.
         """
+        if system_prompts is None:
+            system_prompts = DEFAULT_SYS_PROMPT
         if isinstance(prompts, str):
             prompts = [prompts]
-        responses = self._get_response(prompts)
-        self.update_token_count(prompts, responses)
+        if isinstance(system_prompts, str):
+            system_prompts = [system_prompts] * len(prompts)
+        responses = self._get_response(prompts, system_prompts)
+        self.update_token_count(prompts + system_prompts, responses)
         return responses
 
     @abstractmethod
-    def _get_response(self, prompts: List[str]) -> List[str]:
+    def _get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
         """Generate responses for the given prompts.
 
         This method should be implemented by subclasses to define how
@@ -84,11 +91,12 @@ def _get_response(self, prompts: List[str]) -> List[str]:
         Args:
             prompts (List[str]): A list of input prompts.
+            system_prompts (List[str]): A list of system prompts to provide context to the model.
 
         Returns:
             List[str]: A list of generated responses corresponding to the input prompts.
         """
-        pass
+        raise NotImplementedError
 
 
 class DummyLLM(BaseLLM):
diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py
index 577d4a0..46afe17 100644
--- a/promptolution/llms/local_llm.py
+++ b/promptolution/llms/local_llm.py
@@ -50,7 +50,7 @@ def __init__(self, model_id: str, batch_size=8):
         self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id
         self.pipeline.tokenizer.padding_side = "left"
 
-    def _get_response(self, prompts: list[str]):
+    def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[str]:
         """Generate responses for a list of prompts using the local language model.
 
         Args:
@@ -63,8 +63,12 @@ def _get_response(self, prompts: list[str]):
         This method uses torch.no_grad() for inference to reduce memory usage.
         It handles both single and batch inputs, ensuring consistent output format.
""" + inputs = [] + for prompt, sys_prompt in zip(prompts, system_prompts): + inputs.append([{"role": "system", "prompt": sys_prompt}, {"role": "user", "prompt": prompt}]) + with torch.no_grad(): - response = self.pipeline(prompts, pad_token_id=self.pipeline.tokenizer.eos_token_id) + response = self.pipeline(inputs, pad_token_id=self.pipeline.tokenizer.eos_token_id) if len(response) != 1: response = [r[0] if isinstance(r, list) else r for r in response] diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index e3706c5..3ced55f 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -108,7 +108,7 @@ def __init__( # Initialize tokenizer separately for potential pre-processing self.tokenizer = AutoTokenizer.from_pretrained(model_id) - def _get_response(self, inputs: list[str]): + def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[str]: """Generate responses for a list of prompts using the vLLM engine. Args: @@ -126,13 +126,14 @@ def _get_response(self, inputs: list[str]): [ { "role": "system", - "content": "You are a helpful assistant.", + "content": sys_prompt, }, - {"role": "user", "content": input}, + {"role": "user", "content": prompt}, ], tokenize=False, + add_generation_prompt=True, ) - for input in inputs + for prompt, sys_prompt in zip(prompts, system_prompts) ] # generate responses for self.batch_size prompts at the same time diff --git a/promptolution/predictors/base_predictor.py b/promptolution/predictors/base_predictor.py index 2674a17..6cc8358 100644 --- a/promptolution/predictors/base_predictor.py +++ b/promptolution/predictors/base_predictor.py @@ -31,7 +31,9 @@ def __init__(self, llm: BaseLLM): """ self.llm = llm - def predict(self, prompts: List[str], xs: np.ndarray, return_seq: bool = False) -> np.ndarray: + def predict( + self, prompts: List[str], xs: np.ndarray, system_prompts: List[str] = None, return_seq: bool = False + ) -> np.ndarray: """Abstract method to make predictions based on prompts and input data. Args: @@ -48,7 +50,9 @@ def predict(self, prompts: List[str], xs: np.ndarray, return_seq: bool = False) if isinstance(prompts, str): prompts = [prompts] - outputs = self.llm.get_response([prompt + "\n" + x for prompt in prompts for x in xs]) + outputs = self.llm.get_response( + [prompt + "\n" + x for prompt in prompts for x in xs], system_prompts=system_prompts + ) preds = self._extract_preds(outputs) shape = (len(prompts), len(xs)) diff --git a/promptolution/tasks/base_task.py b/promptolution/tasks/base_task.py index 1dc1583..403a800 100644 --- a/promptolution/tasks/base_task.py +++ b/promptolution/tasks/base_task.py @@ -21,12 +21,13 @@ def __init__(self, *args, **kwargs): pass @abstractmethod - def evaluate(self, prompts: List[str], predictor) -> np.ndarray: + def evaluate(self, prompts: List[str], predictor, system_promtps: List[str] = None) -> np.ndarray: """Abstract method to evaluate prompts using a given predictor. Args: prompts (List[str]): List of prompts to evaluate. predictor: The predictor to use for evaluation. + system_promtps (List[str]): List of system prompts to evaluate. Returns: np.ndarray: Array of evaluation scores for each prompt. @@ -58,7 +59,7 @@ def __init__(self): self.ys = np.array(["positive", "negative", "positive"]) self.classes = ["negative", "positive"] - def evaluate(self, prompts: List[str], predictor) -> np.ndarray: + def evaluate(self, prompts: List[str], predictor, system_prompts=None) -> np.ndarray: """Generate random evaluation scores for the given prompts. 
         Args:
diff --git a/promptolution/tasks/classification_tasks.py b/promptolution/tasks/classification_tasks.py
index 82823d3..5e093af 100644
--- a/promptolution/tasks/classification_tasks.py
+++ b/promptolution/tasks/classification_tasks.py
@@ -63,6 +63,7 @@ def evaluate(
         self,
         prompts: List[str],
         predictor: BasePredictor,
+        system_prompts: List[str] = None,
         n_samples: int = 20,
         subsample: bool = False,
         return_seq: bool = False,
@@ -72,6 +73,7 @@
         Args:
             prompts (List[str]): List of prompts to evaluate.
             predictor (BasePredictor): Predictor to use for evaluation.
+            system_prompts (List[str], optional): List of system prompts passed to the predictor. Defaults to None.
             n_samples (int, optional): Number of samples to use if subsampling. Defaults to 20.
             subsample (bool, optional): Whether to use subsampling.
                 If set to true, samples a different subset per call. Defaults to False.
@@ -95,7 +97,7 @@
             ys_subsample = self.ys[indices]
 
         # Make predictions on the subsample
-        preds = predictor.predict(prompts, xs_subsample, return_seq=return_seq)
+        preds = predictor.predict(prompts, xs_subsample, system_prompts=system_prompts, return_seq=return_seq)
 
         if return_seq:
             preds, seqs = preds
diff --git a/promptolution/templates.py b/promptolution/templates.py
index 6cbc39e..1151b9d 100644
--- a/promptolution/templates.py
+++ b/promptolution/templates.py
@@ -1,3 +1,4 @@
+DEFAULT_SYS_PROMPT = "You are a helpful assistant."
 EVOPROMPT_DE_TEMPLATE = """Please follow the instruction step-by-step to generate a better prompt.
 Identifying the different parts between Prompt 1 and Prompt 2:
 Prompt 1: Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great.
diff --git a/pyproject.toml b/pyproject.toml
index cc2d2e5..d8bc054 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "promptolution"
-version = "1.3.1"
+version = "1.3.2"
 description = ""
 authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"]
 readme = "README.md"
diff --git a/scripts/evoprompt_ga_test.py b/scripts/evoprompt_ga_test.py
index 0d13e40..dd3f21e 100644
--- a/scripts/evoprompt_ga_test.py
+++ b/scripts/evoprompt_ga_test.py
@@ -4,7 +4,7 @@
 import random
 from logging import Logger
 
-from promptolution.callbacks import LoggerCallback, CSVCallback, TokenCountCallback
+from promptolution.callbacks import LoggerCallback, FileOutputCallback, TokenCountCallback
 from promptolution.templates import EVOPROMPT_GA_TEMPLATE
 from promptolution.helpers import get_llm
 from promptolution.tasks import ClassificationTask
@@ -30,7 +30,7 @@
 callbacks = [
     LoggerCallback(logger),
-    CSVCallback(args.output_dir),
+    FileOutputCallback(args.output_dir, file_type="csv"),
     TokenCountCallback(100000, "input_tokens"),
 ]
diff --git a/scripts/evoprompt_ga_test_gsm8k.py b/scripts/evoprompt_ga_test_gsm8k.py
index 123db3d..f52bc43 100644
--- a/scripts/evoprompt_ga_test_gsm8k.py
+++ b/scripts/evoprompt_ga_test_gsm8k.py
@@ -4,7 +4,7 @@
 import random
 from logging import Logger
 
-from promptolution.callbacks import LoggerCallback, CSVCallback, TokenCountCallback
+from promptolution.callbacks import LoggerCallback, TokenCountCallback, FileOutputCallback
 from promptolution.templates import EVOPROMPT_GA_TEMPLATE
 from promptolution.helpers import get_llm
 from promptolution.tasks import ClassificationTask
@@ -30,7 +30,7 @@
 callbacks = [
     LoggerCallback(logger),
-    CSVCallback(args.output_dir),
+    FileOutputCallback(args.output_dir, file_type="csv"),
     TokenCountCallback(100000, "input_tokens"),
 ]
diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py
deleted file mode 100644
index 802208e..0000000
--- a/scripts/optimizer_test_run.py
+++ /dev/null
@@ -1,41 +0,0 @@
-"""Test run for the Opro optimizer."""
-import argparse
-from logging import Logger
-
-from promptolution.callbacks import LoggerCallback, CSVCallback
-from promptolution.helpers import run_optimization
-
-from promptolution.config import Config
-
-logger = Logger(__name__)
-
-"""Run a test run for any of the implemented optimizers."""
-parser = argparse.ArgumentParser()
-parser.add_argument("--model")
-parser.add_argument("--model-storage-path", default="../models/")
-parser.add_argument("--optimizer", default="evopromptde")
-parser.add_argument("--n-steps", type=int, default=10)
-parser.add_argument("--token", default=None)
-parser.add_argument("--seed", type=int, default=187)
-args = parser.parse_args()
-
-config = Config(
-    meta_llm=args.model,
-    ds_path="data_sets/cls/agnews",
-    task_name="agnews",
-    predictor="FirstOccurenceClassificator",
-    n_steps=args.n_steps,
-    optimizer=args.optimizer,
-    downstream_llm=args.model,
-    evaluation_llm=args.model,
-    api_token=args.token,
-    model_storage_path=args.model_storage_path,
-    random_seed=args.seed,
-)
-
-if args.token is None:
-    prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/seedingtest/{args.model}/")])
-else:
-    prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/seedingtest/{args.model}/")], use_token=True)
-
-logger.info(f"Optimized prompts: {prompts}")
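# Usage sketch: the system-prompt plumbing added in this release. This snippet is illustrative
# and not part of the patch; it assumes `llm`, `predictor`, and `task` are already-constructed
# promptolution objects (e.g. an API LLM, a classification predictor, and a ClassificationTask);
# their setup is hypothetical here and not defined by this diff.
import numpy as np

prompts = ["Classify the sentiment of the given text as positive or negative."]  # candidate prompts
xs = np.array(["A great movie!"])  # raw task inputs, as expected by predictors
system_prompt = "You are a terse sentiment classifier."  # hypothetical system prompt

# BaseLLM.get_response accepts a single system prompt (broadcast to every prompt) or one per
# prompt; if omitted, DEFAULT_SYS_PROMPT from promptolution.templates is used.
responses = llm.get_response(prompts, system_prompts=system_prompt)

# Predictors and tasks forward the same keyword down to the underlying LLM.
preds = predictor.predict(prompts, xs, system_prompts=system_prompt)
scores = task.evaluate(prompts, predictor, system_prompts=system_prompt)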