10 changes: 10 additions & 0 deletions docs/release-notes.md
@@ -1,5 +1,15 @@
# Release Notes

## Release v1.3.2
### What's changed
#### Added features
* Allow configuration and evaluation of system prompts in all LLM classes
* CSVCallback is now FileOutputCallback and can write Parquet files in addition to CSV

#### Further changes
* Fixed LLM call templates in VLLM
* Refined the OPRO implementation to follow the paper more closely

## Release v1.3.1
### What's changed
#### Added features
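For users upgrading from v1.3.1, the callback rename noted above is a one-line change; a minimal sketch (the output directory is illustrative, and `file_type` defaults to `"parquet"`):

```python
# v1.3.1:
#   from promptolution.callbacks import CSVCallback
#   callback = CSVCallback("results/")

# v1.3.2: same directory argument; file_type selects "csv" or "parquet"
from promptolution.callbacks import FileOutputCallback

callback = FileOutputCallback("results/", file_type="csv")
```
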
78 changes: 30 additions & 48 deletions promptolution/callbacks.py
@@ -88,32 +88,37 @@ def on_train_end(self, optimizer, logs=None):
return True


class CSVCallback(Callback):
"""Callback for saving optimization progress to a CSV file.
class FileOutputCallback(Callback):
"""Callback for saving optimization progress to a specified file type.

This callback saves prompts and scores at each step to a CSV file.
This callback saves information about each step to a file.

Attributes:
dir (str): Directory the CSV file is saved to.
dir (str): Directory the file is saved to.
step (int): The current step number.
file_type (str): The type of file to save the output to.
"""

def __init__(self, dir):
"""Initialize the CSVCallback.
def __init__(self, dir, file_type: Literal["parquet", "csv"] = "parquet"):
"""Initialize the FileOutputCallback.

Args:
dir (str): Directory the CSV file is saved to.
file_type (str): The type of file to save the output to.
"""
if not os.path.exists(dir):
os.makedirs(dir)

self.dir = dir
self.dir = dir
self.file_type = file_type

if file_type == "parquet":
self.path = dir + "/step_results.parquet"
elif file_type == "csv":
self.path = dir + "/step_results.csv"
else:
raise ValueError(f"File type {file_type} not supported.")

self.step = 0
self.input_tokens = 0
self.output_tokens = 0
self.start_time = datetime.now()
self.step_time = datetime.now()

def on_step_end(self, optimizer):
"""Save prompts and scores to csv.
@@ -125,47 +130,24 @@ def on_step_end(self, optimizer):
df = pd.DataFrame(
{
"step": [self.step] * len(optimizer.prompts),
"input_tokens": [optimizer.meta_llm.input_token_count - self.input_tokens] * len(optimizer.prompts),
"output_tokens": [optimizer.meta_llm.output_token_count - self.output_tokens] * len(optimizer.prompts),
"time_elapsed": [(datetime.now() - self.step_time).total_seconds()] * len(optimizer.prompts),
"input_tokens": [optimizer.meta_llm.input_token_count] * len(optimizer.prompts),
"output_tokens": [optimizer.meta_llm.output_token_count] * len(optimizer.prompts),
"time": [datetime.now().total_seconds()] * len(optimizer.prompts),
"score": optimizer.scores,
"prompt": optimizer.prompts,
}
)
self.step_time = datetime.now()
self.input_tokens = optimizer.meta_llm.input_token_count
self.output_tokens = optimizer.meta_llm.output_token_count

if not os.path.exists(self.dir + "step_results.csv"):
df.to_csv(self.dir + "step_results.csv", index=False)
else:
df.to_csv(self.dir + "step_results.csv", mode="a", header=False, index=False)

return True

def on_train_end(self, optimizer):
"""Called at the end of training.

Args:
optimizer: The optimizer object that called the callback.
"""
df = pd.DataFrame(
dict(
steps=self.step,
input_tokens=optimizer.meta_llm.input_token_count,
output_tokens=optimizer.meta_llm.output_token_count,
time_elapsed=(datetime.now() - self.start_time).total_seconds(),
time=datetime.now(),
score=np.array(optimizer.scores).mean(),
best_prompts=str(optimizer.prompts),
),
index=[0],
)

if not os.path.exists(self.dir + "train_results.csv"):
df.to_csv(self.dir + "train_results.csv", index=False)
else:
df.to_csv(self.dir + "train_results.csv", mode="a", header=False, index=False)
if self.file_type == "parquet":
if self.step == 1:
df.to_parquet(self.path, index=False)
else:
# pandas' to_parquet has no append mode; read the existing file and rewrite it
existing = pd.read_parquet(self.path)
pd.concat([existing, df], ignore_index=True).to_parquet(self.path, index=False)
elif self.file_type == "csv":
if self.step == 1:
df.to_csv(self.path, index=False)
else:
df.to_csv(self.path, mode="a", header=False, index=False)

return True

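A quick look at what the new callback produces, using only the constructor shown above; how the callback is attached to an optimizer is not part of this diff, so the run itself is left out:

```python
from promptolution.callbacks import FileOutputCallback

# One row per candidate prompt per step; columns per the DataFrame above:
# step, input_tokens, output_tokens, time, score, prompt
cb = FileOutputCallback("results", file_type="parquet")
print(cb.path)  # results/step_results.parquet

# After an optimization run, the log can be loaded back with pandas:
# import pandas as pd
# pd.read_parquet(cb.path)
```
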
10 changes: 6 additions & 4 deletions promptolution/llms/api_llm.py
@@ -10,19 +10,20 @@
import requests
from langchain_anthropic import ChatAnthropic
from langchain_community.chat_models.deepinfra import ChatDeepInfra, ChatDeepInfraException
from langchain_core.messages import HumanMessage
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI

from promptolution.llms.base_llm import BaseLLM

logger = Logger(__name__)


async def invoke_model(prompt, model, semaphore):
async def invoke_model(prompt, system_prompt, model, semaphore):
"""Asynchronously invoke a language model with retry logic.

Args:
prompt (str): The input prompt for the model.
system_prompt (str): The system prompt for the model.
model: The language model to invoke.
semaphore (asyncio.Semaphore): Semaphore to limit concurrent calls.

@@ -39,7 +40,7 @@ async def invoke_model(prompt, model, semaphore):

while attempts < max_retries:
try:
response = await model.ainvoke([HumanMessage(content=prompt)])
response = await model.ainvoke([SystemMessage(content=system_prompt), HumanMessage(content=prompt)])
return response.content
except ChatDeepInfraException as e:
print(f"DeepInfra error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds...")
@@ -80,13 +81,14 @@ def __init__(self, model_id: str, token: str = None, **kwargs: Any):
else:
self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token)

def _get_response(self, prompts: List[str]) -> List[str]:
def _get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
"""Get responses for a list of prompts in a synchronous manner.

This method includes retry logic for handling connection errors and rate limits.

Args:
prompts (list[str]): List of input prompts.
system_prompts (list[str], optional): List of system prompts. If not provided, the default system prompt is used.

Returns:
list[str]: List of model responses.
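The semaphore argument exists to cap the number of in-flight requests; a self-contained sketch of the same asyncio pattern, with `fake_model` standing in for `model.ainvoke` (the retry logic is omitted):

```python
import asyncio


async def fake_model(system_prompt: str, prompt: str) -> str:
    await asyncio.sleep(0.01)  # stands in for a network round trip
    return f"[{system_prompt}] {prompt}"


async def invoke(prompt: str, system_prompt: str, semaphore: asyncio.Semaphore) -> str:
    async with semaphore:  # at most N coroutines enter this block at once
        return await fake_model(system_prompt, prompt)


async def main() -> None:
    semaphore = asyncio.Semaphore(8)  # example cap of 8 concurrent calls
    prompts = [f"prompt {i}" for i in range(20)]
    responses = await asyncio.gather(
        *(invoke(p, "You are a helpful assistant.", semaphore) for p in prompts)
    )
    print(len(responses))  # 20


asyncio.run(main())
```
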
18 changes: 13 additions & 5 deletions promptolution/llms/base_llm.py
@@ -6,6 +6,8 @@

import numpy as np

from promptolution.templates import DEFAULT_SYS_PROMPT

logger = logging.getLogger(__name__)


@@ -54,7 +56,7 @@ def update_token_count(self, inputs: List[str], outputs: List[str]):
self.input_token_count += input_tokens
self.output_token_count += output_tokens

def get_response(self, prompts: str) -> str:
def get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
"""Generate responses for the given prompts.

This method calls the _get_response method to generate responses
@@ -64,31 +66,37 @@ def get_response(self, prompts: str) -> str:
Args:
prompts (str or List[str]): Input prompt(s). If a single string is provided,
it's converted to a list containing that string.
system_prompts (str or List[str]): System prompt(s) to provide context to the model.

Returns:
List[str]: A list of generated responses, one for each input prompt.
"""
if system_prompts is None:
system_prompts = DEFAULT_SYS_PROMPT
if isinstance(prompts, str):
prompts = [prompts]
responses = self._get_response(prompts)
self.update_token_count(prompts, responses)
if isinstance(system_prompts, str):
system_prompts = [system_prompts] * len(prompts)
responses = self._get_response(prompts, system_prompts)
self.update_token_count(prompts + system_prompts, responses)

return responses

@abstractmethod
def _get_response(self, prompts: List[str]) -> List[str]:
def _get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
"""Generate responses for the given prompts.

This method should be implemented by subclasses to define how
the LLM generates responses.

Args:
prompts (List[str]): A list of input prompts.
system_prompts (List[str]): A list of system prompts to provide context to the model.

Returns:
List[str]: A list of generated responses corresponding to the input prompts.
"""
pass
raise NotImplementedError


class DummyLLM(BaseLLM):
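The argument handling added to get_response is small enough to restate on its own; a condensed, self-contained sketch of the broadcasting behaviour (DEFAULT_SYS_PROMPT as defined in templates.py below):

```python
from typing import List, Optional, Tuple, Union

DEFAULT_SYS_PROMPT = "You are a helpful assistant."


def normalize(
    prompts: Union[str, List[str]],
    system_prompts: Optional[Union[str, List[str]]] = None,
) -> Tuple[List[str], List[str]]:
    """Mirror of the prompt/system-prompt handling in BaseLLM.get_response (condensed)."""
    if system_prompts is None:
        system_prompts = DEFAULT_SYS_PROMPT
    if isinstance(prompts, str):
        prompts = [prompts]
    if isinstance(system_prompts, str):
        # a single system prompt is broadcast to every user prompt
        system_prompts = [system_prompts] * len(prompts)
    return prompts, system_prompts


print(normalize(["hi", "bye"]))
# (['hi', 'bye'], ['You are a helpful assistant.', 'You are a helpful assistant.'])
```
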
8 changes: 6 additions & 2 deletions promptolution/llms/local_llm.py
@@ -50,7 +50,7 @@ def __init__(self, model_id: str, batch_size=8):
self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id
self.pipeline.tokenizer.padding_side = "left"

def _get_response(self, prompts: list[str]):
def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[str]:
"""Generate responses for a list of prompts using the local language model.

Args:
@@ -63,8 +63,12 @@ def _get_response(self, prompts: list[str]):
This method uses torch.no_grad() for inference to reduce memory usage.
It handles both single and batch inputs, ensuring consistent output format.
"""
inputs = []
for prompt, sys_prompt in zip(prompts, system_prompts):
inputs.append([{"role": "system", "content": sys_prompt}, {"role": "user", "content": prompt}])

with torch.no_grad():
response = self.pipeline(prompts, pad_token_id=self.pipeline.tokenizer.eos_token_id)
response = self.pipeline(inputs, pad_token_id=self.pipeline.tokenizer.eos_token_id)

if len(response) != 1:
response = [r[0] if isinstance(r, list) else r for r in response]
9 changes: 5 additions & 4 deletions promptolution/llms/vllm.py
@@ -108,7 +108,7 @@ def __init__(
# Initialize tokenizer separately for potential pre-processing
self.tokenizer = AutoTokenizer.from_pretrained(model_id)

def _get_response(self, inputs: list[str]):
def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[str]:
"""Generate responses for a list of prompts using the vLLM engine.

Args:
@@ -126,13 +126,14 @@ def _get_response(self, inputs: list[str]):
[
{
"role": "system",
"content": "You are a helpful assistant.",
"content": sys_prompt,
},
{"role": "user", "content": input},
{"role": "user", "content": prompt},
],
tokenize=False,
add_generation_prompt=True,
)
for input in inputs
for prompt, sys_prompt in zip(prompts, system_prompts)
]

# generate responses for self.batch_size prompts at the same time
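The per-prompt string handed to the vLLM engine can be inspected directly with the tokenizer; a small sketch (the checkpoint is only an example of an instruction-tuned model with a chat template, and fetching it needs network access):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # example checkpoint

text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": "You are a terse sentiment classifier."},
        {"role": "user", "content": "Classify: the movie was wonderful."},
    ],
    tokenize=False,
    add_generation_prompt=True,
)
print(text)  # the fully formatted prompt string, system prompt included
```
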
8 changes: 6 additions & 2 deletions promptolution/predictors/base_predictor.py
@@ -31,7 +31,9 @@ def __init__(self, llm: BaseLLM):
"""
self.llm = llm

def predict(self, prompts: List[str], xs: np.ndarray, return_seq: bool = False) -> np.ndarray:
def predict(
self, prompts: List[str], xs: np.ndarray, system_prompts: List[str] = None, return_seq: bool = False
) -> np.ndarray:
"""Abstract method to make predictions based on prompts and input data.

Args:
@@ -48,7 +50,9 @@ def predict(self, prompts: List[str], xs: np.ndarray, return_seq: bool = False)
if isinstance(prompts, str):
prompts = [prompts]

outputs = self.llm.get_response([prompt + "\n" + x for prompt in prompts for x in xs])
outputs = self.llm.get_response(
[prompt + "\n" + x for prompt in prompts for x in xs], system_prompts=system_prompts
)
preds = self._extract_preds(outputs)

shape = (len(prompts), len(xs))
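For reference, the flattening and reshaping that predict performs around the LLM call can be shown with plain numpy; the upper-casing below stands in for llm.get_response:

```python
import numpy as np

prompts = ["P1", "P2"]
xs = np.array(["a", "b", "c"])

# Same ordering as predict: all inputs for the first prompt, then the second, ...
flat_inputs = [p + "\n" + x for p in prompts for x in xs]
flat_outputs = np.array([s.upper() for s in flat_inputs])  # stand-in for the LLM responses

preds = flat_outputs.reshape(len(prompts), len(xs))
print(preds.shape)  # (2, 3): one row of predictions per prompt
```
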
5 changes: 3 additions & 2 deletions promptolution/tasks/base_task.py
@@ -21,12 +21,13 @@ def __init__(self, *args, **kwargs):
pass

@abstractmethod
def evaluate(self, prompts: List[str], predictor) -> np.ndarray:
def evaluate(self, prompts: List[str], predictor, system_prompts: List[str] = None) -> np.ndarray:
"""Abstract method to evaluate prompts using a given predictor.

Args:
prompts (List[str]): List of prompts to evaluate.
predictor: The predictor to use for evaluation.
system_prompts (List[str]): List of system prompts passed to the predictor.

Returns:
np.ndarray: Array of evaluation scores for each prompt.
@@ -58,7 +59,7 @@ def __init__(self):
self.ys = np.array(["positive", "negative", "positive"])
self.classes = ["negative", "positive"]

def evaluate(self, prompts: List[str], predictor) -> np.ndarray:
def evaluate(self, prompts: List[str], predictor, system_prompts=None) -> np.ndarray:
"""Generate random evaluation scores for the given prompts.

Args:
4 changes: 3 additions & 1 deletion promptolution/tasks/classification_tasks.py
@@ -63,6 +63,7 @@ def evaluate(
self,
prompts: List[str],
predictor: BasePredictor,
system_prompts: List[str] = None,
n_samples: int = 20,
subsample: bool = False,
return_seq: bool = False,
@@ -72,6 +73,7 @@
Args:
prompts (List[str]): List of prompts to evaluate.
predictor (BasePredictor): Predictor to use for evaluation.
system_prompts (List[str], optional): List of system prompts passed to the predictor. Defaults to None.
n_samples (int, optional): Number of samples to use if subsampling. Defaults to 20.
subsample (bool, optional): Whether to use subsampling.
If set to true, samples a different subset per call. Defaults to False.
@@ -95,7 +97,7 @@ def evaluate(
ys_subsample = self.ys[indices]

# Make predictions on the subsample
preds = predictor.predict(prompts, xs_subsample, return_seq=return_seq)
preds = predictor.predict(prompts, xs_subsample, system_prompts=system_prompts, return_seq=return_seq)

if return_seq:
preds, seqs = preds
1 change: 1 addition & 0 deletions promptolution/templates.py
@@ -1,3 +1,4 @@
DEFAULT_SYS_PROMPT = "You are a helpful assistant."
EVOPROMPT_DE_TEMPLATE = """Please follow the instruction step-by-step to generate a better prompt.
Identifying the different parts between Prompt 1 and Prompt 2:
Prompt 1: Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "promptolution"
version = "1.3.1"
version = "1.3.2"
description = ""
authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"]
readme = "README.md"
4 changes: 2 additions & 2 deletions scripts/evoprompt_ga_test.py
@@ -4,7 +4,7 @@
import random
from logging import Logger

from promptolution.callbacks import LoggerCallback, CSVCallback, TokenCountCallback
from promptolution.callbacks import LoggerCallback, FileOutputCallback, TokenCountCallback
from promptolution.templates import EVOPROMPT_GA_TEMPLATE
from promptolution.helpers import get_llm
from promptolution.tasks import ClassificationTask
@@ -30,7 +30,7 @@

callbacks = [
LoggerCallback(logger),
CSVCallback(args.output_dir),
FileOutputCallback(args.output_dir, file_type="csv"),
TokenCountCallback(100000, "input_tokens"),
]
