diff --git a/.coverage b/.coverage new file mode 100644 index 0000000..2e7af62 Binary files /dev/null and b/.coverage differ diff --git a/.github/actions/python-poetry/action.yml b/.github/actions/python-poetry/action.yml index f094d99..d2b0e62 100644 --- a/.github/actions/python-poetry/action.yml +++ b/.github/actions/python-poetry/action.yml @@ -34,5 +34,6 @@ runs: path: .venv key: ${{ runner.os }}-poetry-${{ inputs.poetry-version }}-${{ hashFiles('**/poetry.lock') }} - name: Install dependencies - run: poetry install --no-interaction --no-root --only ${{ inputs.groups }} + run: + poetry install --no-interaction --no-root --only ${{ inputs.groups }} shell: bash diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index be286c3..00477eb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,11 +21,11 @@ jobs: - name: Set up Python and Poetry uses: ./.github/actions/python-poetry with: - groups: dev + groups: main,dev,test - name: Run pre-commit uses: pre-commit/action@v3.0.1 - # - name: Run tests - # run: poetry run python -m pytest + - name: Run tests + run: poetry run python -m pytest build: runs-on: ubuntu-latest diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b2c41f8..f848a97 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -4,6 +4,7 @@ on: push: branches: - main + workflow_call: permissions: contents: write @@ -22,6 +23,10 @@ jobs: with: groups: "docs" + - name: Generate notebook examples + run: | + poetry run jupyter nbconvert --to markdown --allow-errors --output-dir docs/examples notebooks/*.ipynb + - name: Deploy docs run: | poetry run mkdocs gh-deploy --force diff --git a/.gitignore b/.gitignore index 088f43a..6034def 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ dist/ outputs/ results/ poetry.lock +CLAUDE.md +**/CLAUDE.local.md diff --git a/README.md b/README.md index 971fc73..8718aeb 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,8 @@ ![promptolution](https://github.com/user-attachments/assets/84c050bd-61a1-4f2e-bc4e-874d9b4a69af) + +[![CI](https://github.com/finitearth/promptolution/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/finitearth/promptolution/actions/workflows/ci.yml) +[![Docs](https://github.com/finitearth/promptolution/actions/workflows/docs.yml/badge.svg?branch=main)](https://github.com/finitearth/promptolution/actions/workflows/docs.yml) + # Promptolution Promptolution is a library that provides a modular and extensible framework for implementing prompt tuning experiments. It offers a user-friendly interface to assemble the core components for various prompt optimization tasks. @@ -6,14 +10,24 @@ In addition, this repository contains our experiments for the paper "Towards Cos This project was developed by [Timo Heiß](https://www.linkedin.com/in/timo-heiss/), [Moritz Schlager](https://www.linkedin.com/in/moritz-schlager/) and [Tom Zehle](https://www.linkedin.com/in/tom-zehle/). 
+## Getting Started +Take a look at our getting started notebook: [getting_started.ipynb](https://github.com/finitearth/promptolution/blob/main/notebooks/getting_started.ipynb) + ## Installation Use pip to install our library: ``` -pip install promptolution +pip install promptolution[api] +``` + +If you want to run your prompt optimization locally, either via transformers or vLLM, consider running: + +``` +pip install promptolution[vllm,transformers] ``` + Alternatively, clone the repository, run ``` @@ -58,17 +72,5 @@ Create API Keys for the models you want to use: - Detailed logging and callback system for optimization analysis -## Getting Started -Take a look at our getting started notebook: [getting_started.py](https://github.com/finitearth/promptolution/blob/main/notebooks/getting_started.ipynb) -## Reproduce our Experiments - -We provide scripts and configs for all our experiments. Run experiments based on config via: - -``` -poetry run python scripts/experiment_runs.py --experiment "configs/.ini" -``` -where `.ini` is a config based on our templates. - - -This project was developed for seminar "AutoML in the age of large pre-trained models" at LMU Munich. +This project was developed for the "AutoML in the age of large pre-trained models" seminar at LMU Munich. diff --git a/docs/api/config.md b/docs/api/config.md index e973813..8793eb5 100644 --- a/docs/api/config.md +++ b/docs/api/config.md @@ -1,6 +1,6 @@ # Config -::: promptolution.config.Config +::: promptolution.config.ExperimentConfig options: show_root_heading: true show_root_full_path: false diff --git a/docs/examples/getting_started.md b/docs/examples/getting_started.md new file mode 100644 index 0000000..57c8f17 --- /dev/null +++ b/docs/examples/getting_started.md @@ -0,0 +1,383 @@ +# Getting started + +## Before you start + +In this notebook we give you a short introduction into the workings of promptolution. + +We will use the OpenAI-API to demonstrate the functionality of promptolution, however we also provide a local LLM, as well as a vLLM backend. You can also change the `base_url` in the config, in order to use any other api, that follows the OpenAI API standard. + +Thanks for giving it a try! + +## Installs + + +```python +# ! pip install promptolution +``` + +## Imports + + +```python +import pandas as pd +from promptolution import ExperimentConfig, run_experiment +import nest_asyncio +nest_asyncio.apply() # we need this only because we are in a notebook +``` + +## set up llms, predictor, tasks and optimizer + +Here we set up our dataset. We use the subjectivity dataset from hugging face, but of course here you may want to use your own dataset. + +Just make sure, to name the input column "x" and the target column "y", as well as providing a short dataset description. + + +```python +df = pd.read_csv("hf://datasets/tasksource/subjectivity/train.csv") +df = df.rename(columns={"Sentence": "x", "Label": "y"}) +df = df.replace({"OBJ": "objective", "SUBJ": "subjective"}) + +task_description = "The dataset contains sentences labeled as either subjective or objective. "\ + "The task is to classify each sentence as either subjective or objective. " \ + "The class mentioned first in the response of the LLM will be the prediction." +``` + +We definied some initial prompts, however you may also take a look at `create_prompts_from_samples` in order to automatically generate them. + + +```python +init_prompts = [ + 'Classify the given text as either an objective or subjective statement based on the tone and language used: e.g. 
the tone and language used should indicate whether the statement is a neutral, factual summary (objective) or an expression of opinion or emotional tone (subjective). Include the output classes "objective" or "subjective" in the prompt.', + 'What kind of statement is the following text: [Insert text here]? Is it or ?', + 'Identify whether a sentence is objective or subjective by analyzing the tone, language, and underlying perspective. Consider the emotion, opinion, and bias present in the sentence. Are the authors presenting objective facts or expressing a personal point of view? The output will be either "objective" (output class: objective) or "subjective" (output class: subjective).', + 'Classify the following sentences as either objective or subjective, indicating the name of the output classes: [input sentence]. Output classes: objective, subjective', + '_query a text about legal or corporate-related issues, and predict whether the tone is objective or subjective, outputting the corresponding class "objective" for non-subjective language or "subjective" for subjective language_', + 'Classify a statement as either "subjective" or "objective" based on whether it reflects a personal opinion or a verifiable fact. The output classes to include are "objective" and "subjective".', + 'Classify the text as objective or subjective based on its tone and language.', + 'Classify the text as objective or subjective based on the presence of opinions or facts. Output classes: objective, subjective.', + 'Classify the given text as objective or subjective based on its tone, focusing on its intention, purpose, and level of personal opinion or emotional appeal, with outputs including classes such as objective or subjective.', + "Categorize the text as either objective or subjective, considering whether it presents neutral information or expresses a personal opinion/bias.\n\nObjective: The text has a neutral tone and presents factual information about the actions of Democrats in Congress and the union's negotiations.\n\nSubjective: The text has a evaluative tone and expresses a positive/negative opinion/evaluation about the past performance of the country.", + 'Given a sentence, classify it as either "objective" or "subjective" based on its tone and language, considering the presence of third-person pronouns, neutral language, and opinions. Classify the output as "objective" if the tone is neutral and detached, focusing on facts and data, or as "subjective" if the tone is evaluative, emotive, or biased.', + 'Identify whether the given sentence is subjective or objective, then correspondingly output "objective" or "subjective" in the form of ", (e.g. "objective"), without quotes. Please note that the subjective orientation typically describes a sentence where the writer expresses their own opinion or attitude, whereas an objective sentence presents facts or information without personal involvement or bias. 
' +] +``` + +We will be now using the gpt + + +```python +token = open("../deepinfratoken.txt", "r").read() +``` + + +```python +config = ExperimentConfig( + task_description=task_description, + prompts=init_prompts, + n_steps=3, + optimizer="evopromptga", + api_url="https://api.openai.com/v1", + llm="gpt-4o-mini-2024-07-18", + token=token, +) +``` + + +```python +prompts = run_experiment(df, config) +``` + + + --------------------------------------------------------------------------- + + RateLimitError Traceback (most recent call last) + + Cell In[48], line 1 + ----> 1 prompts = run_experiment(df, config) + + + File ~\Documents\programming\promptolution\promptolution\helpers.py:32, in run_experiment(df, config) + 30 train_df = df.sample(frac=0.8, random_state=42) + 31 test_df = df.drop(train_df.index) + ---> 32 prompts = run_optimization(train_df, config) + 33 df_prompt_scores = run_evaluation(test_df, config, prompts) + 35 return df_prompt_scores + + + File ~\Documents\programming\promptolution\promptolution\helpers.py:59, in run_optimization(df, config) + 51 task = get_task(df, config) + 52 optimizer = get_optimizer( + 53 predictor=predictor, + 54 meta_llm=llm, + 55 task=task, + 56 config=config, + 57 ) + ---> 59 prompts = optimizer.optimize(n_steps=config.n_steps) + 61 if config.prepend_exemplars: + 62 selector = get_exemplar_selector(config.exemplar_selector, task, predictor) + + + File :15, in optimize(self, n_steps) + + + File ~\Documents\programming\promptolution\promptolution\optimizers\evoprompt_ga.py:69, in EvoPromptGA._pre_optimization_loop(self) + 67 logger.warning(f"Initial sequences: {seq}") + 68 else: + ---> 69 self.scores = self.task.evaluate( + 70 self.prompts, self.predictor, subsample=True, n_samples=self.n_eval_samples + 71 ).tolist() + 72 # sort prompts by score + 73 self.prompts = [prompt for _, prompt in sorted(zip(self.scores, self.prompts), reverse=True)] + + + File ~\Documents\programming\promptolution\promptolution\tasks\classification_tasks.py:101, in ClassificationTask.evaluate(self, prompts, predictor, system_prompts, n_samples, subsample, return_seq) + 98 ys_subsample = self.ys[indices] + 100 # Make predictions on the subsample + --> 101 preds = predictor.predict(prompts, xs_subsample, system_prompts=system_prompts, return_seq=return_seq) + 103 if return_seq: + 104 preds, seqs = preds + + + File ~\Documents\programming\promptolution\promptolution\predictors\base_predictor.py:57, in BasePredictor.predict(self, prompts, xs, system_prompts, return_seq) + 54 if isinstance(prompts, str): + 55 prompts = [prompts] + ---> 57 outputs = self.llm.get_response( + 58 [prompt + "\n" + x for prompt in prompts for x in xs], system_prompts=system_prompts + 59 ) + 60 preds = self._extract_preds(outputs) + 62 shape = (len(prompts), len(xs)) + + + File ~\Documents\programming\promptolution\promptolution\llms\base_llm.py:97, in BaseLLM.get_response(self, prompts, system_prompts) + 95 if isinstance(system_prompts, str): + 96 system_prompts = [system_prompts] * len(prompts) + ---> 97 responses = self._get_response(prompts, system_prompts) + 98 self.update_token_count(prompts + system_prompts, responses) + 100 return responses + + + File ~\Documents\programming\promptolution\promptolution\llms\api_llm.py:82, in APILLM._get_response(self, prompts, system_prompts) + 79 def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]: + 80 # Setup for async execution in sync context + 81 loop = asyncio.get_event_loop() + ---> 82 responses = 
loop.run_until_complete(self._get_response_async(prompts, system_prompts)) + 83 return responses + + + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\nest_asyncio.py:98, in _patch_loop..run_until_complete(self, future) + 95 if not f.done(): + 96 raise RuntimeError( + 97 'Event loop stopped before Future completed.') + ---> 98 return f.result() + + + File ~\AppData\Local\Programs\Python\Python312\Lib\asyncio\futures.py:203, in Future.result(self) + 201 self.__log_traceback = False + 202 if self._exception is not None: + --> 203 raise self._exception.with_traceback(self._exception_tb) + 204 return self._result + + + File ~\AppData\Local\Programs\Python\Python312\Lib\asyncio\tasks.py:316, in Task.__step_run_and_handle_result(***failed resolving arguments***) + 314 result = coro.send(None) + 315 else: + --> 316 result = coro.throw(exc) + 317 except StopIteration as exc: + 318 if self._must_cancel: + 319 # Task is cancelled right before coro stops. + + + File ~\Documents\programming\promptolution\promptolution\llms\api_llm.py:90, in APILLM._get_response_async(self, prompts, system_prompts) + 85 async def _get_response_async(self, prompts: List[str], system_prompts: List[str]) -> List[str]: + 86 tasks = [ + 87 _invoke_model(prompt, system_prompt, self.max_tokens, self.llm, self.client, self.semaphore) + 88 for prompt, system_prompt in zip(prompts, system_prompts) + 89 ] + ---> 90 responses = await asyncio.gather(*tasks) + 91 return [response.choices[0].message.content for response in responses] + + + File ~\AppData\Local\Programs\Python\Python312\Lib\asyncio\tasks.py:385, in Task.__wakeup(self, future) + 383 def __wakeup(self, future): + 384 try: + --> 385 future.result() + 386 except BaseException as exc: + 387 # This may also be a cancellation. + 388 self.__step(exc) + + + File ~\AppData\Local\Programs\Python\Python312\Lib\asyncio\tasks.py:314, in Task.__step_run_and_handle_result(***failed resolving arguments***) + 310 try: + 311 if exc is None: + 312 # We use the `send` method directly, because coroutines + 313 # don't have `__iter__` and `__next__` methods. + --> 314 result = coro.send(None) + 315 else: + 316 result = coro.throw(exc) + + + File ~\Documents\programming\promptolution\promptolution\llms\api_llm.py:25, in _invoke_model(prompt, system_prompt, max_tokens, model_id, client, semaphore) + 23 async with semaphore: + 24 messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}] + ---> 25 response = await client.chat.completions.create( + 26 model=model_id, + 27 messages=messages, + 28 max_tokens=max_tokens, + 29 ) + 30 return response + + + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\resources\chat\completions\completions.py:2032, in AsyncCompletions.create(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, reasoning_effort, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, web_search_options, extra_headers, extra_query, extra_body, timeout) + 1989 @required_args(["messages", "model"], ["messages", "model", "stream"]) + 1990 async def create( + 1991 self, + (...) 
2029 timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + 2030 ) -> ChatCompletion | AsyncStream[ChatCompletionChunk]: + 2031 validate_response_format(response_format) + -> 2032 return await self._post( + 2033 "/chat/completions", + 2034 body=await async_maybe_transform( + 2035 { + 2036 "messages": messages, + 2037 "model": model, + 2038 "audio": audio, + 2039 "frequency_penalty": frequency_penalty, + 2040 "function_call": function_call, + 2041 "functions": functions, + 2042 "logit_bias": logit_bias, + 2043 "logprobs": logprobs, + 2044 "max_completion_tokens": max_completion_tokens, + 2045 "max_tokens": max_tokens, + 2046 "metadata": metadata, + 2047 "modalities": modalities, + 2048 "n": n, + 2049 "parallel_tool_calls": parallel_tool_calls, + 2050 "prediction": prediction, + 2051 "presence_penalty": presence_penalty, + 2052 "reasoning_effort": reasoning_effort, + 2053 "response_format": response_format, + 2054 "seed": seed, + 2055 "service_tier": service_tier, + 2056 "stop": stop, + 2057 "store": store, + 2058 "stream": stream, + 2059 "stream_options": stream_options, + 2060 "temperature": temperature, + 2061 "tool_choice": tool_choice, + 2062 "tools": tools, + 2063 "top_logprobs": top_logprobs, + 2064 "top_p": top_p, + 2065 "user": user, + 2066 "web_search_options": web_search_options, + 2067 }, + 2068 completion_create_params.CompletionCreateParamsStreaming + 2069 if stream + 2070 else completion_create_params.CompletionCreateParamsNonStreaming, + 2071 ), + 2072 options=make_request_options( + 2073 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + 2074 ), + 2075 cast_to=ChatCompletion, + 2076 stream=stream or False, + 2077 stream_cls=AsyncStream[ChatCompletionChunk], + 2078 ) + + + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1805, in AsyncAPIClient.post(self, path, cast_to, body, files, options, stream, stream_cls) + 1791 async def post( + 1792 self, + 1793 path: str, + (...) 1800 stream_cls: type[_AsyncStreamT] | None = None, + 1801 ) -> ResponseT | _AsyncStreamT: + 1802 opts = FinalRequestOptions.construct( + 1803 method="post", url=path, json_data=body, files=await async_to_httpx_files(files), **options + 1804 ) + -> 1805 return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls) + + + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1495, in AsyncAPIClient.request(self, cast_to, options, stream, stream_cls, remaining_retries) + 1492 else: + 1493 retries_taken = 0 + -> 1495 return await self._request( + 1496 cast_to=cast_to, + 1497 options=options, + 1498 stream=stream, + 1499 stream_cls=stream_cls, + 1500 retries_taken=retries_taken, + 1501 ) + + + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1585, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) + 1583 if remaining_retries > 0 and self._should_retry(err.response): + 1584 await err.response.aclose() + -> 1585 return await self._retry_request( + 1586 input_options, + 1587 cast_to, + 1588 retries_taken=retries_taken, + 1589 response_headers=err.response.headers, + 1590 stream=stream, + 1591 stream_cls=stream_cls, + 1592 ) + 1594 # If the response is streamed then we need to explicitly read the response + 1595 # to completion before attempting to access the response text. 
+ 1596 if not err.response.is_closed: + + + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1632, in AsyncAPIClient._retry_request(self, options, cast_to, retries_taken, response_headers, stream, stream_cls) + 1628 log.info("Retrying request to %s in %f seconds", options.url, timeout) + 1630 await anyio.sleep(timeout) + -> 1632 return await self._request( + 1633 options=options, + 1634 cast_to=cast_to, + 1635 retries_taken=retries_taken + 1, + 1636 stream=stream, + 1637 stream_cls=stream_cls, + 1638 ) + + + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1585, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) + 1583 if remaining_retries > 0 and self._should_retry(err.response): + 1584 await err.response.aclose() + -> 1585 return await self._retry_request( + 1586 input_options, + 1587 cast_to, + 1588 retries_taken=retries_taken, + 1589 response_headers=err.response.headers, + 1590 stream=stream, + 1591 stream_cls=stream_cls, + 1592 ) + 1594 # If the response is streamed then we need to explicitly read the response + 1595 # to completion before attempting to access the response text. + 1596 if not err.response.is_closed: + + + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1632, in AsyncAPIClient._retry_request(self, options, cast_to, retries_taken, response_headers, stream, stream_cls) + 1628 log.info("Retrying request to %s in %f seconds", options.url, timeout) + 1630 await anyio.sleep(timeout) + -> 1632 return await self._request( + 1633 options=options, + 1634 cast_to=cast_to, + 1635 retries_taken=retries_taken + 1, + 1636 stream=stream, + 1637 stream_cls=stream_cls, + 1638 ) + + + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1600, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) + 1597 await err.response.aread() + 1599 log.debug("Re-raising status error") + -> 1600 raise self._make_status_error_from_response(err.response) from None + 1602 return await self._process_response( + 1603 cast_to=cast_to, + 1604 options=options, + (...) 1608 retries_taken=retries_taken, + 1609 ) + + + RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-3DmWJfR4tphuKTSzcsMB3vHF on requests per min (RPM): Limit 500, Used 500, Requested 1. Please try again in 120ms. 
Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}} + + + +```python +prompts +``` diff --git a/docs/release-notes.md b/docs/release-notes.md index 4710d15..41aaf04 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,5 +1,19 @@ # Release Notes +## Release v1.4.0 +### What's changed +#### Added features +* Reworked APILLM to allow for calls to any API that follows the OpenAI API format +* Added graceful failing in optimization runs, allowing to obtain results after an error +* Reworked configs to ExperimentConfig, allowing to parse any attributes + +### Further Changes: +* Reworked getting started notebook +* Added tests for the entire package, covering roughly 80% of the codebase +* Reworked dependency and import structure to allow the usage of a subset of the package + +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.3.2...v1.4.0) + ## Release v1.3.2 ### What's changed #### Added features diff --git a/mkdocs.yml b/mkdocs.yml index cd00a5a..d9e7ee3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -53,6 +53,8 @@ nav: - Tasks: api/tasks.md - Callbacks: api/callbacks.md - Config: api/config.md + - Tutorials: + - Getting Started: examples/getting_started.md markdown_extensions: - pymdownx.highlight: diff --git a/notebooks/getting_started.ipynb b/notebooks/getting_started.ipynb new file mode 100644 index 0000000..df61e85 --- /dev/null +++ b/notebooks/getting_started.ipynb @@ -0,0 +1,387 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting Started with Promptolution\n", + "\n", + "## Welcome to Promptolution! \n", + "\n", + "Discover a powerful tool for evolving and optimizing your LLM prompts. This notebook provides a friendly introduction to Promptolution's core functionality.\n", + "\n", + "We're excited to have you try Promptolution - let's get started!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Installation\n", + "Install Promptolution with a single command" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install promptolution[api]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from promptolution import ExperimentConfig, run_experiment\n", + "import nest_asyncio\n", + "nest_asyncio.apply() # Required for notebook environments" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Up Your Experiment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare the data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below, we're using a subsample of the subjectivity dataset from Hugging Face as an example. When using your own dataset, simply ensure you name the input column \"x\" and the target column \"y\", and provide a brief description of your task, that will parsed to the meta-llm during optimization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"hf://datasets/tasksource/subjectivity/train.csv\").sample(400)\n", + "df = df.rename(columns={\"Sentence\": \"x\", \"Label\": \"y\"})\n", + "df = df.replace({\"OBJ\": \"objective\", \"SUBJ\": \"subjective\"})\n", + "\n", + "task_description = \"The dataset contains sentences labeled as either subjective or objective. \"\\\n", + " \"The task is to classify each sentence as either subjective or objective. \" \\\n", + " \"The class mentioned first in the response of the LLM will be the prediction.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating Inital Prompts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We've defined some starter prompts below, but feel free to experiment! You might also want to explore create_prompts_from_samples to automatically generate initial prompts based on your data." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "init_prompts = [\n", + " 'Classify the given text as either an objective or subjective statement based on the tone and language used: e.g. the tone and language used should indicate whether the statement is a neutral, factual summary (objective) or an expression of opinion or emotional tone (subjective). Include the output classes \"objective\" or \"subjective\" in the prompt.',\n", + " 'What kind of statement is the following text: [Insert text here]? Is it or ?',\n", + " 'Identify whether a sentence is objective or subjective by analyzing the tone, language, and underlying perspective. Consider the emotion, opinion, and bias present in the sentence. Are the authors presenting objective facts or expressing a personal point of view? The output will be either \"objective\" (output class: objective) or \"subjective\" (output class: subjective).',\n", + " 'Classify the following sentences as either objective or subjective, indicating the name of the output classes: [input sentence]. Output classes: objective, subjective',\n", + " '_query a text about legal or corporate-related issues, and predict whether the tone is objective or subjective, outputting the corresponding class \"objective\" for non-subjective language or \"subjective\" for subjective language_',\n", + " 'Classify a statement as either \"subjective\" or \"objective\" based on whether it reflects a personal opinion or a verifiable fact. The output classes to include are \"objective\" and \"subjective\".',\n", + " 'Classify the text as objective or subjective based on its tone and language.',\n", + " 'Classify the text as objective or subjective based on the presence of opinions or facts. 
Output classes: objective, subjective.',\n", + " 'Classify the given text as objective or subjective based on its tone, focusing on its intention, purpose, and level of personal opinion or emotional appeal, with outputs including classes such as objective or subjective.',\n", + " \"Categorize the text as either objective or subjective, considering whether it presents neutral information or expresses a personal opinion/bias.\\n\\nObjective: The text has a neutral tone and presents factual information about the actions of Democrats in Congress and the union's negotiations.\\n\\nSubjective: The text has a evaluative tone and expresses a positive/negative opinion/evaluation about the past performance of the country.\",\n", + " 'Given a sentence, classify it as either \"objective\" or \"subjective\" based on its tone and language, considering the presence of third-person pronouns, neutral language, and opinions. Classify the output as \"objective\" if the tone is neutral and detached, focusing on facts and data, or as \"subjective\" if the tone is evaluative, emotive, or biased.',\n", + " 'Identify whether the given sentence is subjective or objective, then correspondingly output \"objective\" or \"subjective\" in the form of \", (e.g. \"objective\"), without quotes. Please note that the subjective orientation typically describes a sentence where the writer expresses their own opinion or attitude, whereas an objective sentence presents facts or information without personal involvement or bias. '\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure Your LLM" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Promptolution offers three flexible ways to access language models:\n", + "\n", + "1. Local LLMs (using the Transformers library)\n", + "1. vLLM backend (for efficient serving of large language models)\n", + "1. API-based LLMs (compatible with any provider following the OpenAI standard)\n", + "\n", + "For this demonstration, we'll use the DeepInfra API, but you can easily switch to other providers like Anthropic or OpenAI by simply changing the base_url and llm string in the configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "token = \"YOUR_API_KEY\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's an explanation of each configuration parameter in the ExperimentConfig:\n", + "\n", + "- `task_description`: A string describing the task you're optimizing prompts for. This is used to provide the meta-llm with context about your task.\n", + "- `prompts`: A list of initial prompt strings that will be used as the starting point for optimization.\n", + "- `n_steps`: The number of optimization steps to run. Higher values allow more exploration and refinement but require more API calls and computational resources.\n", + "- `optimizer`: The algorithm used for prompt optimization. Currently we support \"evopromptga\", \"evopromptde\", and \"opro\".\n", + "- `api_url`: The API endpoint URL used to access the language model. This example uses DeepInfra's API which follows the OpenAI standard.\n", + "- `llm`: The LLM to use for the experiment, as both downstream and meta LLM.\n", + "- `token`: Your API authentication token required to access the language model service." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "config = ExperimentConfig(\n", + " task_description=task_description,\n", + " prompts=init_prompts,\n", + " n_steps=10,\n", + " optimizer=\"evopromptga\",\n", + " api_url=\"https://api.deepinfra.com/v1/openai\",\n", + " llm=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n", + " token=token,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Your Experiment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With everything configured, you're ready to evolve your prompts:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "prompts = run_experiment(df, config)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
prompt    score
2Determine the textual authenticity of the given sentences by distinguishing objective, factual information from subjective opinions, biases, and personal perspectives. Provide the predicted classification as the primary class label extracted within <final_answer> and </final_answer>, with a classification of either 'objective' or 'subjective'0.6875
5Classify the sentences as objective (empirical evidence) or subjective (personal opinions), taking into account potential biases and factual information while considering the context, and return the classification as 'objective' or 'subjective' within <final_answer> and </final_answer>.0.6125
10Determine the sentiment of the given sentences by categorizing them as either objective or subjective, considering the presence of personal opinions, biases, or factual information. Output the predicted classification (objective or subjective) as the first class label extracted within the markers <final_answer> and </final_answer>.0.6125
4Determine the objective or subjective nature of the input sentences by examining the presence of opinion, bias, or factual information and classify them accordingly, outputting the predicted classification as either 'objective' or 'subjective' with the corresponding label extracted from the text and marked as <final_answer> and </final_answer>, providing accurate and concise results.0.5875
6Classify each sentence as either objective, conveying verifiable facts, or subjective, expressing personal opinions or perspectives, and output the predicted classification within the markers <final_answer> and </final_answer>, ensuring accurate identification of factual and opinion-based statements.0.5875
11Consider the input sentences' neutrality, fact-based content, and writer's perspective to accurately classify them as either 'objective' or 'subjective', providing the predicted classification within <final_answer> and </final_answer>0.5750
7Accurately categorize the sentences as objective or subjective, accounting for biases, factual data, and linguistic features, and provide the predicted classification within <final_answer> and </final_answer>, indicating 'objective' or 'subjective'0.5625
1Classify each sentence as objective (facts) or subjective (opinions) and output the predicted classification within <final_answer> and </final_answer>.0.5375
3Determine the objective or subjective nature of the input sentences by examining the presence of opinion, bias, or factual information, and output the predicted classification with the extracted label between <final_answer> and </final_answer>, labeling it as either 'objective' or 'subjective'0.5125
9Determine the sentiment of the provided sentence, categorizing it as objective (factual) or subjective (personal), and report your classification within <final_answer> and </final_answer>, stating either 'objective' or 'subjective'.0.4875
0Classify the provided sentences as either objective, containing verifiable facts, or subjective, featuring personal opinions or biases, and extract the classification as the first response within the markers <final_answer> </final_answer>.0.4500
8Determine the type of each sentence as objective (facts) or subjective (opinions), considering potential biases and factual information, and provide your predicted classification within <final_answer> and </final_answer>.0.4250
\n", + "
" + ], + "text/plain": [ + " prompt \\\n", + "2 Determine the textual authenticity of the given sentences by distinguishing objective, factual information from subjective opinions, biases, and personal perspectives. Provide the predicted classification as the primary class label extracted within and , with a classification of either 'objective' or 'subjective' \n", + "5 Classify the sentences as objective (empirical evidence) or subjective (personal opinions), taking into account potential biases and factual information while considering the context, and return the classification as 'objective' or 'subjective' within and . \n", + "10 Determine the sentiment of the given sentences by categorizing them as either objective or subjective, considering the presence of personal opinions, biases, or factual information. Output the predicted classification (objective or subjective) as the first class label extracted within the markers and . \n", + "4 Determine the objective or subjective nature of the input sentences by examining the presence of opinion, bias, or factual information and classify them accordingly, outputting the predicted classification as either 'objective' or 'subjective' with the corresponding label extracted from the text and marked as and , providing accurate and concise results. \n", + "6 Classify each sentence as either objective, conveying verifiable facts, or subjective, expressing personal opinions or perspectives, and output the predicted classification within the markers and , ensuring accurate identification of factual and opinion-based statements. \n", + "11 Consider the input sentences' neutrality, fact-based content, and writer's perspective to accurately classify them as either 'objective' or 'subjective', providing the predicted classification within and \n", + "7 Accurately categorize the sentences as objective or subjective, accounting for biases, factual data, and linguistic features, and provide the predicted classification within and , indicating 'objective' or 'subjective' \n", + "1 Classify each sentence as objective (facts) or subjective (opinions) and output the predicted classification within and . \n", + "3 Determine the objective or subjective nature of the input sentences by examining the presence of opinion, bias, or factual information, and output the predicted classification with the extracted label between and , labeling it as either 'objective' or 'subjective' \n", + "9 Determine the sentiment of the provided sentence, categorizing it as objective (factual) or subjective (personal), and report your classification within and , stating either 'objective' or 'subjective'. \n", + "0 Classify the provided sentences as either objective, containing verifiable facts, or subjective, featuring personal opinions or biases, and extract the classification as the first response within the markers . \n", + "8 Determine the type of each sentence as objective (facts) or subjective (opinions), considering potential biases and factual information, and provide your predicted classification within and . 
\n", + "\n", + " score \n", + "2 0.6875 \n", + "5 0.6125 \n", + "10 0.6125 \n", + "4 0.5875 \n", + "6 0.5875 \n", + "11 0.5750 \n", + "7 0.5625 \n", + "1 0.5375 \n", + "3 0.5125 \n", + "9 0.4875 \n", + "0 0.4500 \n", + "8 0.4250 " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, most optimized prompts are semantically very similar, however they often differ heavily in performance. This is exactly what we observed in our experiments across various LLMs and datasets. Running prompt optimization is an easy way to gain significant performance improvements on your task for free!\n", + "\n", + "If you run into any issues while using Promptolution, please feel free to contact us. We're also happy to receive support through pull requests and other contributions to the project.\n", + "\n", + "\n", + "Happy prompt optimizing! πŸš€βœ¨ We can't wait to see what you build with Promptolution! πŸ€–πŸ’‘" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/promptolution/__init__.py b/promptolution/__init__.py index 3efcb5a..0d9eeb2 100644 --- a/promptolution/__init__.py +++ b/promptolution/__init__.py @@ -1,3 +1,11 @@ -"""Promptolution: A library for prompt tuning.""" +"""Promptolution: A framework for prompt optimization and a zoo of prompt optimization algorithms.""" -from . import callbacks, config, llms, optimizers, predictors, tasks +# Import main configuration classes +from .config import ExperimentConfig +from .helpers import run_evaluation, run_experiment, run_optimization + +# Import factory functions +from .llms import get_llm +from .optimizers import get_optimizer +from .predictors import get_predictor +from .tasks import get_task diff --git a/promptolution/callbacks.py b/promptolution/callbacks.py index ab9f73d..1895dd2 100644 --- a/promptolution/callbacks.py +++ b/promptolution/callbacks.py @@ -1,22 +1,36 @@ """Callback classes for logging, saving, and tracking optimization progress.""" import os +from abc import ABC from datetime import datetime from typing import Literal -import numpy as np import pandas as pd from tqdm import tqdm -class Callback: - """Base class for optimization callbacks.""" +class BaseCallback(ABC): + """Base class for optimization callbacks. + + Callbacks can be used to monitor the optimization process, save checkpoints, + log metrics, or implement early stopping criteria. + + """ + + def __init__(self, **kwargs): + """Initialize the callback with a configuration. + + Args: + config: Configuration for the callback. + **kwargs: Additional keyword arguments. + """ + pass def on_step_end(self, optimizer): """Called at the end of each optimization step. Args: - optimizer: The optimizer object that called the callback. + optimizer: The optimizer object that called the callback. Returns: Bool: True if the optimization should continue, False if it should stop. 
@@ -27,7 +41,7 @@ def on_epoch_end(self, optimizer): """Called at the end of each optimization epoch. Args: - optimizer: The optimizer object that called the callback. + optimizer: The optimizer object that called the callback. Returns: Bool: True if the optimization should continue, False if it should stop. @@ -38,7 +52,7 @@ def on_train_end(self, optimizer): """Called at the end of the entire optimization process. Args: - optimizer: The optimizer object that called the callback. + optimizer: The optimizer object that called the callback. Returns: Bool: True if the optimization should continue, False if it should stop. @@ -46,7 +60,7 @@ def on_train_end(self, optimizer): return True -class LoggerCallback(Callback): +class LoggerCallback(BaseCallback): """Callback for logging optimization progress. This callback logs information about each step, epoch, and the end of training. @@ -93,7 +107,7 @@ def on_train_end(self, optimizer, logs=None): return True -class FileOutputCallback(Callback): +class FileOutputCallback(BaseCallback): """Callback for saving optimization progress to a specified file type. This callback saves information about each step to a file. @@ -137,7 +151,7 @@ def on_step_end(self, optimizer): "step": [self.step] * len(optimizer.prompts), "input_tokens": [optimizer.meta_llm.input_token_count] * len(optimizer.prompts), "output_tokens": [optimizer.meta_llm.output_token_count] * len(optimizer.prompts), - "time": [datetime.now().total_seconds()] * len(optimizer.prompts), + "time": [datetime.now().timestamp()] * len(optimizer.prompts), "score": optimizer.scores, "prompt": optimizer.prompts, } @@ -157,7 +171,7 @@ def on_step_end(self, optimizer): return True -class BestPromptCallback(Callback): +class BestPromptCallback(BaseCallback): """Callback for tracking the best prompt during optimization. This callback keeps track of the prompt with the highest score. @@ -193,7 +207,7 @@ def get_best_prompt(self): return self.best_prompt, self.best_score -class ProgressBarCallback(Callback): +class ProgressBarCallback(BaseCallback): """Callback for displaying a progress bar during optimization. This callback uses tqdm to display a progress bar that updates at each step. @@ -231,7 +245,7 @@ def on_train_end(self, optimizer): return True -class TokenCountCallback(Callback): +class TokenCountCallback(BaseCallback): """Callback for stopping optimization based on the total token count.""" def __init__( diff --git a/promptolution/config.py b/promptolution/config.py index c7d2f0c..5af15b8 100644 --- a/promptolution/config.py +++ b/promptolution/config.py @@ -1,134 +1,62 @@ """Configuration class for the promptolution library.""" -import configparser -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Literal, Optional +from logging import Logger +from typing import Set +logger = Logger(__name__) -@dataclass -class Config: + +class ExperimentConfig: """Configuration class for the promptolution library. - This class handles loading and parsing of configuration settings, - either from a config file or from keyword arguments. - - Attributes: - task_name (str): Name of the task. Should not be None if used. - ds_path (str): Path to the dataset. Should not be None if used. - n_steps (int): Number of optimization steps. Should not be None if used. - optimizer (str): Name of the optimizer to use. Should not be None if used. - predictor (str): Name of the predictor to use. Defaults to "FirstOccurenceClassificator". 
- meta_llm (str): Name of the meta language model. Should not be None if used. - downstream_llm (str): Name of the downstream language model. Should not be None if used. - evaluation_llm (str): Name of the evaluation language model. Should not be None if used. - init_pop_size (int): Initial population size. Defaults to 10. - initial_prompts (List[str]): Initial prompts for the optimizer. Defaults to None. - logging_dir (str): Directory for logging. Defaults to "logs/run.csv". - experiment_name (str): Name of the experiment. Defaults to "experiment". - task_description (str): Task Description fed to the optimizer. Defaults to None. - donor_random (bool): Whether to use random donor prompts for EvoPromptDE. Defaults to False. - random_seed (int): Random seed for reproducibility. Defaults to 42. - model_storage_path (str): Path to the model storage directory (used for VLLM). Defaults to "../models/". - selection_mode (str): Selection mode for EvoPromptGA. Defaults to "random". - meta_bs (int): Batch size for local meta LLM. Should not be None if llm is run locally. Defaults to None. - downstream_bs (int): Batch size for local downstream LLM. - Should not be None if llm is run locally Defaults to None. - api_token (str): API token for different APIs, as implemented in LLM classes. - Should not be None if APILLM is used. Defaults to None. - meta_prompt (str): Prompt template for the meta LLM. - If None is set, default meta_prompts from template.py will be used. Defaults to None. - prepend_exemplars (bool): rather to do exemplar search and prepend few-shot examples. Defaults to False. - n_exemplars (int): how many exemplars to prepend. Only used if prepend_exemplars is True. Defaults to 5. - exemplar_selector (str): which exemplar selector to use. Should not be None if preped_exemplars is True. - Defaults to None. - n_ds_samples_to_meta (int): how many examples to show of the ds to show to meta-llm - (not applicable to every optimizer) - n_eval_samples (int): how many examples to show to evaluation llm for evaluation. + This is a unified configuration class that handles all experiment settings. + It provides validation and tracking of used fields. 
""" - task_name: str = None - ds_path: Path = None - optimizer: str = None - predictor: Literal["MarkerBasedClassificator", "FirstOccurenceClassificator"] = "FirstOccurenceClassificator" - meta_llm: str = None - downstream_llm: str = None - evaluation_llm: str = None - n_steps: int = None - init_pop_size: int = None - initial_prompts: List[str] = None - logging_dir: Path = Path("logs/run.csv") - experiment_name: str = "experiment" - task_description: str = None - donor_random: bool = False - random_seed: int = 42 - model_storage_path: Optional[Path] = Path("../models/") - selection_mode: Optional[Literal["random", "wheel", "tour"]] = "random" - meta_bs: Optional[int] = None - downstream_bs: Optional[int] = None - api_token: Optional[str] = None - meta_prompt: Optional[str] = None - prepend_exemplars: Optional[bool] = False - n_exemplars: Optional[int] = 5 - exemplar_selector: Optional[str] = None - n_ds_samples_to_meta: Optional[int] = 2 - n_eval_samples: Optional[int] = 20 - - def __post_init__(self): - """Validate the configuration after initialization.""" - self._validate_config() - - @classmethod - def from_dict(cls, config_dict: Dict[str, Any]) -> "Config": - """Create a Config instance from a dictionary.""" - return cls(**cls._process_config_dict(config_dict)) - - @classmethod - def from_file(cls, config_path: Path) -> "Config": - """Create a Config instance from a configuration file.""" - if not config_path.exists(): - raise FileNotFoundError(f"Configuration file not found: {config_path}") - - config = configparser.ConfigParser() - config.read(config_path) - - config_dict = {key: value for section in config.sections() for key, value in config[section].items()} - - return cls.from_dict(config_dict) - - @classmethod - def _process_config_dict(cls, config_dict: Dict[str, Any]) -> Dict[str, Any]: - """Process and validate the configuration dictionary.""" - processed_dict = {} - for field in cls.__dataclass_fields__.values(): - if field.name in config_dict: - value = config_dict[field.name] - if field.type == Path: - processed_dict[field.name] = Path(value) - elif field.type == bool: - processed_dict[field.name] = str(value).lower() == "true" - elif field.type == int: - processed_dict[field.name] = int(value) - else: - processed_dict[field.name] = value - elif field.default == field.default_factory: # Check if field is required - raise ValueError(f"Required configuration parameter '{field.name}' is missing") - - unknown_args = set(config_dict.keys()) - set(cls.__dataclass_fields__.keys()) - if unknown_args: - print(f"Warning: Unexpected configuration arguments: {', '.join(unknown_args)}") - - return processed_dict - - def _validate_config(self): - """Validate the configuration settings.""" - if self.meta_llm is not None: - if "local" in self.meta_llm and self.meta_bs is None: - raise ValueError("'meta_bs' must be specified for local meta_llm") - if "local" in self.downstream_llm and self.downstream_bs is None: - raise ValueError("'downstream_bs' must be specified for local downstream_llm") - if self.api_token is None: - print("Warning: No API token provided. 
Using default tokens from token files.") - - def to_dict(self) -> Dict[str, Any]: - """Convert the Config instance to a dictionary.""" - return {field.name: getattr(self, field.name) for field in self.__dataclass_fields__.values()} + def __init__(self, **kwargs): + """Initialize the configuration with the provided keyword arguments.""" + self._used_attributes: Set[str] = set() + for key, value in kwargs.items(): + setattr(self, key, value) + + def __getattribute__(self, name): + """Override attribute access to track used attributes.""" + # Get the attribute using the standard mechanism + value = object.__getattribute__(self, name) + + if not name.startswith("_") and not callable(value): + self._used_attributes.add(name) + + return value + + def apply_to(self, obj): + """Apply matching attributes from this config to an existing object. + + Examines each attribute of the target object and updates it if a matching + attribute exists in the config. + + Args: + obj: The object to update with config values + + Returns: + The updated object + """ + for attr_name in dir(obj): + if attr_name.startswith("_") or not isinstance( + getattr(obj, attr_name), (str, int, float, list, type(None)) + ): + continue + + if hasattr(self, attr_name) and getattr(self, attr_name) is not None: + setattr(obj, attr_name, getattr(self, attr_name)) + + return obj + + def validate(self): + """Check if any attributes were not used and run validation. + + Does not raise an error, but logs a warning if any attributes are unused or validation fails. + """ + all_attributes = {k for k in self.__dict__ if not k.startswith("_")} + unused_attributes = all_attributes - self._used_attributes + if unused_attributes: + logger.warning(f"Unused configuration attributes: {unused_attributes}") diff --git a/promptolution/exemplar_selectors/base_exemplar_selector.py b/promptolution/exemplar_selectors/base_exemplar_selector.py index 8e81b86..5cc28c3 100644 --- a/promptolution/exemplar_selectors/base_exemplar_selector.py +++ b/promptolution/exemplar_selectors/base_exemplar_selector.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod +from promptolution.config import ExperimentConfig from promptolution.predictors.base_predictor import BasePredictor from promptolution.tasks.base_task import BaseTask @@ -13,16 +14,20 @@ class BaseExemplarSelector(ABC): that all exemplar selectors should implement. """ - def __init__(self, task: BaseTask, predictor: BasePredictor): + def __init__(self, task: BaseTask, predictor: BasePredictor, config: ExperimentConfig = None): """Initialize the BaseExemplarSelector. Args: task (BaseTask): An object representing the task to be performed. predictor (BasePredictor): An object capable of making predictions based on prompts. + config (ExperimentConfig, optional): ExperimentConfig overwriting the defaults """ self.task = task self.predictor = predictor + if config is not None: + config.apply_to(self) + @abstractmethod def select_exemplars(self, prompt: str, n_examples: int = 5) -> str: """Select exemplars based on the given prompt. diff --git a/promptolution/exemplar_selectors/random_selector.py b/promptolution/exemplar_selectors/random_selector.py index 5fe01ae..2b79801 100644 --- a/promptolution/exemplar_selectors/random_selector.py +++ b/promptolution/exemplar_selectors/random_selector.py @@ -12,16 +12,17 @@ class RandomSelector(BaseExemplarSelector): those that are evaluated as correct until the desired number of exemplars is reached. 
""" - def __init__(self, task: BaseTask, predictor: BasePredictor, desired_score: int = 1): + def __init__(self, task: BaseTask, predictor: BasePredictor, desired_score: int = 1, config=None): """Initialize the RandomSelector. Args: task (BaseTask): An object representing the task to be performed. predictor (BasePredictor): An object capable of making predictions based on prompts. desired_score (int, optional): The desired score for the exemplars. Defaults to 1. + config: ExperimentConfig overriding the defaults """ - super().__init__(task, predictor) self.desired_score = desired_score + super().__init__(task, predictor, config) def select_exemplars(self, prompt, n_examples: int = 5): """Select exemplars using a random selection strategy. diff --git a/promptolution/helpers.py b/promptolution/helpers.py index e58884b..601db56 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -1,34 +1,41 @@ """Helper functions for the usage of the libary.""" -from logging import Logger +from logging import getLogger from typing import List -import numpy as np import pandas as pd -from promptolution.config import Config +from promptolution.callbacks import LoggerCallback +from promptolution.config import ExperimentConfig from promptolution.exemplar_selectors import get_exemplar_selector from promptolution.llms import get_llm from promptolution.optimizers import get_optimizer -from promptolution.predictors import FirstOccurrenceClassificator, MarkerBasedClassificator -from promptolution.tasks import ClassificationTask +from promptolution.predictors import get_predictor +from promptolution.tasks import get_task +logger = getLogger(__name__) -def run_experiment(config: Config): + +def run_experiment(df: pd.DataFrame, config: ExperimentConfig): """Run a full experiment based on the provided configuration. Args: + df (pd.DataFrame): Input DataFrame containing the data. config (Config): Configuration object for the experiment. Returns: pd.DataFrame: A DataFrame containing the prompts and their scores. """ - prompts = run_optimization(config) - df = run_evaluation(config, prompts) - return df + # train test split + train_df = df.sample(frac=0.8, random_state=42) + test_df = df.drop(train_df.index) + prompts = run_optimization(train_df, config) + df_prompt_scores = run_evaluation(test_df, config, prompts) + + return df_prompt_scores -def run_optimization(config: Config, callbacks: List = None, use_token: bool = False): +def run_optimization(df, config: ExperimentConfig): """Run the optimization phase of the experiment. Args: @@ -37,55 +44,45 @@ def run_optimization(config: Config, callbacks: List = None, use_token: bool = F Returns: List[str]: The optimized list of prompts. 
""" - task = ClassificationTask(config) - if use_token: - llm = get_llm(config.meta_llm, token=config.api_token) - else: - llm = get_llm(config.meta_llm, model_storage_path=config.model_storage_path, seed=config.random_seed) - if config.predictor == "MarkerBasedClassificator": - predictor = MarkerBasedClassificator(llm, classes=task.classes) - elif config.predictor == "FirstOccurenceClassificator": - predictor = FirstOccurrenceClassificator(llm, classes=task.classes) - else: - raise ValueError(f"Predictor {config.predictor} not supported.") + llm = get_llm(config=config) + predictor = get_predictor(llm, config=config) + config.task_description = config.task_description + " " + predictor.extraction_description + task = get_task(df, config) optimizer = get_optimizer( - config, + predictor=predictor, meta_llm=llm, - initial_prompts=config.intial_prompts, task=task, - predictor=predictor, - n_eval_samples=config.n_eval_samples, - callbacks=callbacks, - task_description=predictor.extraction_description, + config=config, ) prompts = optimizer.optimize(n_steps=config.n_steps) - if config.prepend_exemplars: + if hasattr(config, "prepend_exemplars") and config.prepend_exemplars: selector = get_exemplar_selector(config.exemplar_selector, task, predictor) prompts = [selector.select_exemplars(p, n_examples=config.n_exemplars) for p in prompts] return prompts -def run_evaluation(df: pd.DataFrame, config: Config, prompts: List[str]): +def run_evaluation(df: pd.DataFrame, config: ExperimentConfig, prompts: List[str]): """Run the evaluation phase of the experiment. Args: + df (pd.DataFrame): Input DataFrame containing the data. config (Config): Configuration object for the experiment. prompts (List[str]): List of prompts to evaluate. Returns: pd.DataFrame: A DataFrame containing the prompts and their scores. """ - task = ClassificationTask(df, description=config.task_description) + task = get_task(df, config) - llm = get_llm(config.evaluation_llm, token=config.api_token) - predictor = FirstOccurrenceClassificator(llm, classes=task.classes) + llm = get_llm(config=config) + predictor = get_predictor(llm, config=config) - scores = task.evaluate(prompts, predictor, subsample=True, n_samples=config.n_eval_samples) + scores = task.evaluate(prompts, predictor) df = pd.DataFrame(dict(prompt=prompts, score=scores)) - df = df.sort_values("score", ascending=False) + df = df.sort_values("score", ascending=False, ignore_index=True) return df diff --git a/promptolution/llms/__init__.py b/promptolution/llms/__init__.py index 95a22bd..aa01a09 100644 --- a/promptolution/llms/__init__.py +++ b/promptolution/llms/__init__.py @@ -1,12 +1,14 @@ """Module for Large Language Models.""" +from promptolution.config import ExperimentConfig + from .api_llm import APILLM from .base_llm import DummyLLM from .local_llm import LocalLLM from .vllm import VLLM -def get_llm(model_id: str, *args, **kwargs): +def get_llm(model_id: str = None, config: ExperimentConfig = None): """Factory function to create and return a language model instance based on the provided model_id. This function supports three types of language models: @@ -21,18 +23,20 @@ def get_llm(model_id: str, *args, **kwargs): - "local-{model_name}" for LocalLLM - "vllm-{model_name}" for VLLM - Any other string for APILLM - *args: Variable length argument list passed to the LLM constructor. - **kwargs: Arbitrary keyword arguments passed to the LLM constructor. + config (ExperimentConfig, optional): ExperimentConfig overwriting defaults. 
Returns: An instance of DummyLLM, LocalLLM, or APILLM based on the model_id. """ + if model_id is None: + model_id = config.llm if model_id == "dummy": - return DummyLLM(*args, **kwargs) + return DummyLLM(config) if "local" in model_id: model_id = "-".join(model_id.split("-")[1:]) - return LocalLLM(model_id, *args, **kwargs) + return LocalLLM(model_id, config) if "vllm" in model_id: model_id = "-".join(model_id.split("-")[1:]) - return VLLM(model_id, *args, **kwargs) - return APILLM(model_id, *args, **kwargs) + return VLLM(model_id, config=config) + + return APILLM(llm=model_id, config=config) diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index d00bc91..00db034 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -1,144 +1,112 @@ """Module to interface with various language models through their respective APIs.""" -import asyncio -import time -from logging import Logger -from typing import Any, List -import nest_asyncio -import openai -import requests -from langchain_anthropic import ChatAnthropic -from langchain_community.chat_models.deepinfra import ChatDeepInfra, ChatDeepInfraException -from langchain_core.messages import HumanMessage, SystemMessage -from langchain_openai import ChatOpenAI +try: + import asyncio -from promptolution.llms.base_llm import BaseLLM + from openai import AsyncOpenAI -logger = Logger(__name__) + import_successful = True +except ImportError: + import_successful = False +from logging import Logger +from typing import Any, List -async def invoke_model(prompt, system_prompt, model, semaphore): - """Asynchronously invoke a language model with retry logic. +from promptolution.config import ExperimentConfig +from promptolution.llms.base_llm import BaseLLM - Args: - prompt (str): The input prompt for the model. - system_prompt (str): The system prompt for the model. - model: The language model to invoke. - semaphore (asyncio.Semaphore): Semaphore to limit concurrent calls. +logger = Logger(__name__) - Returns: - str: The model's response content. - Raises: - ChatDeepInfraException: If all retry attempts fail. - """ +async def _invoke_model(prompt, system_prompt, max_tokens, model_id, client, semaphore, max_retries=20, retry_delay=5): async with semaphore: - max_retries = 100 - delay = 3 - attempts = 0 + messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}] - while attempts < max_retries: + for attempt in range(max_retries + 1): # +1 for the initial attempt try: - response = await model.ainvoke([SystemMessage(content=system_prompt), HumanMessage(content=prompt)]) - return response.content - except ChatDeepInfraException as e: - print(f"DeepInfra error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds...") - attempts += 1 - await asyncio.sleep(delay) + response = await client.chat.completions.create( + model=model_id, + messages=messages, + max_tokens=max_tokens, + ) + return response + except Exception as e: + if attempt < max_retries: + # Calculate exponential backoff with jitter + logger.warning( + f"API call failed (attempt {attempt + 1} / {max_retries + 1}): {str(e)}. " + f"Retrying in {retry_delay:.2f} seconds..." + ) + await asyncio.sleep(retry_delay) + else: + # Log the final failure and re-raise the exception + logger.error(f"API call failed after {max_retries + 1} attempts: {str(e)}") + raise class APILLM(BaseLLM): - """A class to interface with various language models through their respective APIs. 
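# A minimal sketch of the model_id conventions handled by the get_llm factory above.
# The model names and API key are placeholders, and the local/vllm variants additionally
# require the optional transformers/vllm dependency groups.
from promptolution.config import ExperimentConfig
from promptolution.llms import get_llm

api_llm = get_llm(
    "gpt-4o-mini",
    config=ExperimentConfig(api_url="https://api.openai.com/v1", token="sk-..."),
)  # any other model_id falls through to APILLM
# local_llm = get_llm("local-Qwen/Qwen2.5-1.5B-Instruct")       # LocalLLM via transformers
# vllm_llm  = get_llm("vllm-meta-llama/Llama-3.1-8B-Instruct")  # VLLM engine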
+ """A class to interface with language models through their respective APIs. - This class supports Claude (Anthropic), GPT (OpenAI), and LLaMA (DeepInfra) models. - It handles API key management, model initialization, and provides methods for - both synchronous and asynchronous inference. + This class provides a unified interface for making API calls to language models + using the OpenAI client library. It handles rate limiting through semaphores + and supports both synchronous and asynchronous operations. Attributes: - model: The initialized language model instance. - - Methods: - get_response: Synchronously get responses for a list of prompts. - get_response_async: Asynchronously get responses for a list of prompts. + model_id (str): Identifier for the model to use. + client (AsyncOpenAI): The initialized API client. + max_tokens (int): Maximum number of tokens in model responses. + semaphore (asyncio.Semaphore): Semaphore to limit concurrent API calls. """ - def __init__(self, model_id: str, token: str = None, **kwargs: Any): - """Initialize the APILLM with a specific model. - - Args: - model_id (str): Identifier for the model to use. - token (str): API key for the model. - - Raises: - ValueError: If an unknown model identifier is provided. - """ - super().__init__() - if "claude" in model_id: - self.model = ChatAnthropic(model=model_id, api_key=token) - elif "gpt" in model_id: - self.model = ChatOpenAI(model=model_id, api_key=token) - else: - self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token) - - def _get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]: - """Get responses for a list of prompts in a synchronous manner. - - This method includes retry logic for handling connection errors and rate limits. + def __init__( + self, + api_url: str = None, + llm: str = None, + token: str = None, + max_concurrent_calls=50, + max_tokens=512, + config: ExperimentConfig = None, + ): + """Initialize the APILLM with a specific model and API configuration. Args: - prompts (list[str]): List of input prompts. - system_prompts (list[str]): List of system prompts. If not provided, uses default system_prompts - - Returns: - list[str]: List of model responses. + api_url (str): The base URL for the API endpoint. + llm (str): Identifier for the model to use. + token (str, optional): API key for authentication. Defaults to None. + max_concurrent_calls (int, optional): Maximum number of concurrent API calls. Defaults to 50. + max_tokens (int, optional): Maximum number of tokens in model responses. Defaults to 512. + config (ExperimentConfig, optional): ExperimentConfig overwriting defaults. Raises: - requests.exceptions.ConnectionError: If max retries are exceeded. + ImportError: If required libraries are not installed. """ - max_retries = 100 - delay = 3 - attempts = 0 - - nest_asyncio.apply() - - while attempts < max_retries: - try: - responses = asyncio.run(self.get_response_async(prompts)) - return responses - except requests.exceptions.ConnectionError as e: - attempts += 1 - logger.critical( - f"Connection error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds..." - ) - time.sleep(delay) - except openai.RateLimitError as e: - attempts += 1 - logger.critical( - f"Rate limit error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds..." - ) - time.sleep(delay) - - # If the loop exits, it means max retries were reached - raise requests.exceptions.ConnectionError("Max retries exceeded. 
Connection could not be established.") - - async def get_response_async(self, prompts: list[str], max_concurrent_calls=200) -> list[str]: - """Asynchronously get responses for a list of prompts. - - This method uses a semaphore to limit the number of concurrent API calls. - - Args: - prompts (list[str]): List of input prompts. - max_concurrent_calls (int): Maximum number of concurrent API calls allowed. - - Returns: - list[str]: List of model responses. - """ - semaphore = asyncio.Semaphore(max_concurrent_calls) - tasks = [] - - for prompt in prompts: - tasks.append(invoke_model(prompt, self.model, semaphore)) + if not import_successful: + raise ImportError( + "Could not import at least one of the required libraries: openai, asyncio. " + "Please ensure they are installed in your environment." + ) + + self.api_url = api_url + self.llm = llm + self.token = token + self.max_concurrent_calls = max_concurrent_calls + self.max_tokens = max_tokens + + super().__init__(config=config) + self.client = AsyncOpenAI(base_url=self.api_url, api_key=self.token) + self.semaphore = asyncio.Semaphore(self.max_concurrent_calls) + + def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]: + # Setup for async execution in sync context + loop = asyncio.get_event_loop() + responses = loop.run_until_complete(self._get_response_async(prompts, system_prompts)) + return responses + async def _get_response_async(self, prompts: List[str], system_prompts: List[str]) -> List[str]: + tasks = [ + _invoke_model(prompt, system_prompt, self.max_tokens, self.llm, self.client, self.semaphore) + for prompt, system_prompt in zip(prompts, system_prompts) + ] responses = await asyncio.gather(*tasks) - return responses + return [response.choices[0].message.content for response in responses] diff --git a/promptolution/llms/base_llm.py b/promptolution/llms/base_llm.py index 1a79d29..592e248 100644 --- a/promptolution/llms/base_llm.py +++ b/promptolution/llms/base_llm.py @@ -2,10 +2,12 @@ import logging from abc import ABC, abstractmethod -from typing import List +from dataclasses import asdict, dataclass, field +from typing import Any, Dict, List, Optional, Union import numpy as np +from promptolution.config import ExperimentConfig from promptolution.templates import DEFAULT_SYS_PROMPT logger = logging.getLogger(__name__) @@ -15,14 +17,26 @@ class BaseLLM(ABC): """Abstract base class for Language Models in the promptolution library. This class defines the interface that all concrete LLM implementations should follow. + It's designed to track which configuration parameters are actually used. - Methods: - get_response: An abstract method that should be implemented by subclasses - to generate responses for given prompts. + Attributes: + config (LLMModelConfig): Configuration for the language model. + input_token_count (int): Count of input tokens processed. + output_token_count (int): Count of output tokens generated. """ - def __init__(self, *args, **kwargs): - """Initialize the LLM.""" + def __init__(self, config: ExperimentConfig = None): + """Initialize the LLM with a configuration or direct parameters. + + This constructor supports both config-based and direct parameter initialization + for backward compatibility. + + Args: + config (Optional[Union[Dict[str, Any], LLMModelConfig]]): Configuration for the LLM. 
+ """ + if config is not None: + config.apply_to(self) + # Initialize token counters self.input_token_count = 0 self.output_token_count = 0 @@ -46,11 +60,12 @@ def reset_token_count(self): def update_token_count(self, inputs: List[str], outputs: List[str]): """Update the token count based on the given inputs and outputs. + It uses a simple tokenization method (splitting by whitespace) to count tokens in the base class. + Args: inputs (List[str]): A list of input prompts. outputs (List[str]): A list of generated responses. """ - logger.warning("Token count is approximated using word count split by whitespace, not an actual tokenizer.") input_tokens = sum([len(i.split()) for i in inputs]) output_tokens = sum([len(o.split()) for o in outputs]) self.input_token_count += input_tokens @@ -91,7 +106,7 @@ def set_generation_seed(self, seed: int): pass @abstractmethod - def _get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]: + def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]: """Generate responses for the given prompts. This method should be implemented by subclasses to define how @@ -115,7 +130,7 @@ class DummyLLM(BaseLLM): complex natural language processing. """ - def _get_response(self, prompts: str) -> str: + def _get_response(self, prompts: list[str], system_prompts: list[str]) -> str: """Generate random responses for the given prompts. This method creates silly, random responses enclosed in tags. @@ -128,8 +143,6 @@ def _get_response(self, prompts: str) -> str: Returns: List[str]: A list of randomly generated responses, one for each input prompt. """ - if isinstance(prompts, str): - prompts = [prompts] results = [] for p in prompts: r = np.random.rand() diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py index 46afe17..dabeb42 100644 --- a/promptolution/llms/local_llm.py +++ b/promptolution/llms/local_llm.py @@ -1,12 +1,11 @@ -"""Module for running language models locally using the Hugging Face Transformers library.""" +"""Module for running LLMs locally using the Hugging Face Transformers library.""" try: import torch import transformers -except ImportError as e: - import logging - logger = logging.getLogger(__name__) - logger.warning(f"Could not import torch or transformers in local_llm.py: {e}") + imports_successful = True +except ImportError: + imports_successful = False from promptolution.llms.base_llm import BaseLLM @@ -24,19 +23,23 @@ class LocalLLM(BaseLLM): get_response: Generate responses for a list of prompts. """ - def __init__(self, model_id: str, batch_size=8): + def __init__(self, model_id: str, batch_size=8, config=None): """Initialize the LocalLLM with a specific model. Args: model_id (str): The identifier of the model to use (e.g., "gpt2", "facebook/opt-1.3b"). batch_size (int, optional): The batch size for text generation. Defaults to 8. + config (ExperimentConfig, optional): ExperimentConfig overwriting defaults. Note: This method sets up a text generation pipeline with bfloat16 precision, automatic device mapping, and specific generation parameters. """ - super().__init__() - + if not imports_successful: + raise ImportError( + "Could not import at least one of the required libraries: torch, transformers. " + "Please ensure they are installed in your environment." 
+ ) self.pipeline = transformers.pipeline( "text-generation", model=model_id, @@ -49,6 +52,7 @@ def __init__(self, model_id: str, batch_size=8): ) self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id self.pipeline.tokenizer.padding_side = "left" + super().__init__(config) def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[str]: """Generate responses for a list of prompts using the local language model. @@ -78,8 +82,5 @@ def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[s def __del__(self): """Cleanup method to delete the pipeline and free up GPU memory.""" - try: - del self.pipeline - torch.cuda.empty_cache() - except Exception as e: - logger.warning(f"Error during LocalLLM cleanup: {e}") + del self.pipeline + torch.cuda.empty_cache() diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index ec6505e..6e06dac 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -4,6 +4,7 @@ from logging import Logger from typing import List +from promptolution.config import ExperimentConfig from promptolution.llms.base_llm import BaseLLM logger = Logger(__name__) @@ -12,8 +13,10 @@ import torch from transformers import AutoTokenizer from vllm import LLM, SamplingParams -except ImportError as e: - logger.warning(f"Could not import vllm, torch or transformers in vllm.py: {e}") + + imports_successful = True +except ImportError: + imports_successful = False class VLLM(BaseLLM): @@ -46,7 +49,8 @@ def __init__( max_model_len: int = 2048, trust_remote_code: bool = False, seed: int = 42, - **kwargs, + llm_kwargs: dict = None, + config: ExperimentConfig = None, ): """Initialize the VLLM with a specific model. @@ -63,12 +67,17 @@ def __init__( max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. seed (int, optional): Random seed for the model. Defaults to 42. - **kwargs: Additional keyword arguments to pass to the LLM class initialization. + llm_kwargs (dict, optional): Additional keyword arguments for the LLM. Defaults to None. + config (ExperimentConfig, optional): ExperimentConfig overwriting defaults. Note: This method sets up a vLLM engine with specified parameters for efficient inference. """ - super().__init__() + if not imports_successful: + raise ImportError( + "Could not import at least one of the required libraries: torch, transformers, vllm. " + "Please ensure they are installed in your environment." + ) self.dtype = dtype self.tensor_parallel_size = tensor_parallel_size @@ -81,6 +90,8 @@ def __init__( temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens, seed=seed ) + if llm_kwargs is None: + llm_kwargs = {} # Initialize the vLLM engine with both explicit parameters and any additional kwargs llm_params = { "model": model_id, @@ -92,7 +103,7 @@ def __init__( "download_dir": model_storage_path, "trust_remote_code": self.trust_remote_code, "seed": seed, - **kwargs, + **llm_kwargs, } self.llm = LLM(**llm_params) @@ -108,6 +119,8 @@ def __init__( # Initialize tokenizer separately for potential pre-processing self.tokenizer = AutoTokenizer.from_pretrained(model_id) + super().__init__(config) + def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[str]: """Generate responses for a list of prompts using the vLLM engine. @@ -169,8 +182,3 @@ def set_generation_seed(self, seed): seed (int): Random seed for text generation. 
""" self.sampling_params.seed = seed - - def __del__(self): - """Cleanup method to delete the LLM instance and free up GPU memory.""" - del self.llm - torch.cuda.empty_cache() diff --git a/promptolution/optimizers/__init__.py b/promptolution/optimizers/__init__.py index 09c57fa..6d5e00d 100644 --- a/promptolution/optimizers/__init__.py +++ b/promptolution/optimizers/__init__.py @@ -1,5 +1,11 @@ """Module for prompt optimizers.""" +from typing import Literal + +from promptolution.config import ExperimentConfig +from promptolution.llms.base_llm import BaseLLM +from promptolution.predictors.base_predictor import BasePredictor +from promptolution.tasks.base_task import BaseTask from promptolution.templates import ( EVOPROMPT_DE_TEMPLATE, EVOPROMPT_DE_TEMPLATE_TD, @@ -16,56 +22,59 @@ def get_optimizer( - config=None, optimizer: str = None, meta_prompt: str = None, task_description: str = None, *args, **kwargs + predictor: BasePredictor, + meta_llm: BaseLLM, + task: BaseTask, + optimizer: Literal["evopromptde", "evopromptga", "opro"] = None, + meta_prompt: str = None, + task_description: str = None, + config: ExperimentConfig = None, ): - """Factory function to create and return an optimizer instance based on the provided configuration. - - This function selects and instantiates the appropriate optimizer class based on the - 'optimizer' field in the config object. Alternatively you can pass the relevant parameters. - It supports three types of optimizers: 'dummy', 'evopromptde', 'evopromptga', and 'opro'. + """Creates and returns an optimizer instance based on provided parameters. Args: - config (Config): Configuration object containing the optimizer type. - optimizer (str): Identifier for the optimizer to use. Special cases: - - "dummy" for DummyOptimizer - - Any other string for the specified optimizer class - include_task_desc (bool): Flag to include task description in the prompt. - meta_prompt (str): Meta prompt for the optimizer. - task_description (str): Task description for the optimizer. - *args: Variable length argument list passed to the optimizer constructor. - **kwargs: Arbitrary keyword arguments passed to the optimizer constructor + predictor: The predictor used for prompt evaluation + meta_llm: The language model used for generating meta-prompts + task: The task object used for evaluating prompts + optimizer: String identifying which optimizer to use + meta_prompt: Meta prompt text for the optimizer + task_description: Description of the task for the optimizer + config: Configuration object with default parameters Returns: - An instance of the specified optimizer class. + An optimizer instance Raises: - ValueError: If an unknown optimizer type is specified in the config. 
+ ValueError: If an unknown optimizer type is specified """ if optimizer is None: optimizer = config.optimizer - if task_description is None: task_description = config.task_description - - if config is not None and meta_prompt is None: + if meta_prompt is None and hasattr(config, "meta_prompt"): meta_prompt = config.meta_prompt if optimizer == "dummy": - return DummyOptimizer(*args, **kwargs) + return DummyOptimizer(predictor=predictor, config=config) + if config.optimizer == "evopromptde": - if task_description is not None: - return EvoPromptDE( - prompt_template=EVOPROMPT_DE_TEMPLATE_TD.replace("", task_description), *args, **kwargs - ) - return EvoPromptDE(prompt_template=EVOPROMPT_DE_TEMPLATE, *args, **kwargs) + template = ( + EVOPROMPT_DE_TEMPLATE_TD.replace("", task_description) + if task_description + else EVOPROMPT_DE_TEMPLATE + ) + return EvoPromptDE(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) + if config.optimizer == "evopromptga": - if task_description is not None: - return EvoPromptGA( - prompt_template=EVOPROMPT_GA_TEMPLATE_TD.replace("", task_description), *args, **kwargs - ) - return EvoPromptGA(prompt_template=EVOPROMPT_GA_TEMPLATE, *args, **kwargs) + template = ( + EVOPROMPT_GA_TEMPLATE_TD.replace("", task_description) + if task_description + else EVOPROMPT_GA_TEMPLATE + ) + return EvoPromptGA(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) + if config.optimizer == "opro": - if task_description is not None: - return Opro(prompt_template=OPRO_TEMPLATE_TD.replace("", task_description), *args, **kwargs) - return Opro(prompt_template=OPRO_TEMPLATE, *args, **kwargs) + template = OPRO_TEMPLATE_TD.replace("", task_description) if task_description else OPRO_TEMPLATE + return Opro(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) + raise ValueError(f"Unknown optimizer: {config.optimizer}") diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index 20bbc6f..1acbceb 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -1,65 +1,112 @@ -"""Base class for prompt optimizers.""" +"""Base module for optimizers in the promptolution library.""" -import time from abc import ABC, abstractmethod +from logging import getLogger from typing import Callable, List -from promptolution.predictors.base_predictor import BasePredictor +from promptolution.config import ExperimentConfig from promptolution.tasks.base_task import BaseTask +logger = getLogger(__name__) + class BaseOptimizer(ABC): """Abstract base class for prompt optimizers. This class defines the basic structure and interface for prompt optimization algorithms. - Concrete optimizer implementations should inherit from this class and implement - the `optimize` method. Attributes: + config (OptimizerConfig): Configuration for the optimizer. prompts (List[str]): List of current prompts being optimized. task (BaseTask): The task object used for evaluating prompts. callbacks (List[Callable]): List of callback functions to be called during optimization. predictor: The predictor used for prompt evaluation (if applicable). - - Args: - initial_prompts (List[str]): Initial set of prompts to start optimization with. - task (BaseTask): Task object for prompt evaluation. - callbacks (List[Callable], optional): List of callback functions. Defaults to an empty list. - predictor (optional): Predictor for prompt evaluation. 
Defaults to None. """ def __init__( self, - initial_prompts: list[str], + predictor, task: BaseTask, - callbacks: list[Callable] = [], - predictor: BasePredictor = None, - verbosity: int = 0, + initial_prompts: List[str], + callbacks: List[Callable] = None, + config: ExperimentConfig = None, ): - """Initialize the BaseOptimizer.""" + """Initialize the optimizer with a configuration and/or direct parameters. + + Args: + initial_prompts: Initial set of prompts to start optimization with. + task: Task object for prompt evaluation. + callbacks: List of callback functions. + predictor: Predictor for prompt evaluation. + config: Configuration for the optimizer. + """ + # Set up optimizer state self.prompts = initial_prompts self.task = task - self.callbacks = callbacks + self.callbacks = callbacks or [] self.predictor = predictor - self.verbosity = verbosity + self.verbosity = 0 + + if config is not None: + config.apply_to(self) + + self.config = config - @abstractmethod def optimize(self, n_steps: int) -> List[str]: - """Abstract method to perform the optimization process. + """Perform the optimization process. This method should be implemented by concrete optimizer classes to define the specific optimization algorithm. Args: - n_steps (int): Number of optimization steps to perform. + n_steps: Number of optimization steps to perform. If None, uses the value from config. Returns: - List[str]: The optimized list of prompts after all steps. + The optimized list of prompts after all steps. + """ + # validate config + if self.config is not None: + self.config.validate() + self._pre_optimization_loop() + + for _ in range(n_steps): + try: + self.prompts = self._step() + except Exception as e: + # exit training loop and gracefully fail + logger.error(f"Error during optimization step: {e}") + logger.error("Exiting optimization loop.") + break + + # Callbacks at the end of each step + continue_optimization = self._on_step_end() + if not continue_optimization: + break + + self._on_train_end() - Raises: - NotImplementedError: If not implemented by a concrete class. + return self.prompts + + @abstractmethod + def _pre_optimization_loop(self): + """Prepare for the optimization loop. + + This method should be implemented by concrete optimizer classes to define + any setup required before the optimization loop starts. + """ + pass + + @abstractmethod + def _step(self): + """Perform a single optimization step. + + This method should be implemented by concrete optimizer classes to define + the specific optimization step. + + Returns: + The optimized list of prompts after the step. """ - raise NotImplementedError + pass def _on_step_end(self): """Call all registered callbacks at the end of each optimization step.""" @@ -69,14 +116,6 @@ def _on_step_end(self): return continue_optimization - def _on_epoch_end(self): - """Call all registered callbacks at the end of each optimization epoch.""" - continue_optimization = True - for callback in self.callbacks: - continue_optimization &= callback.on_epoch_end(self) # if any callback returns False, end the optimization - - return continue_optimization - def _on_train_end(self): """Call all registered callbacks at the end of the entire optimization process.""" for callback in self.callbacks: @@ -99,7 +138,7 @@ class DummyOptimizer(BaseOptimizer): **kwargs: Arbitrary keyword arguments (unused). 
""" - def __init__(self, initial_prompts, *args, **kwargs): + def __init__(self, initial_prompts): """Initialize the DummyOptimizer.""" self.callbacks = [] self.prompts = initial_prompts @@ -117,7 +156,6 @@ def optimize(self, n_steps) -> list[str]: List[str]: The original list of prompts, unchanged. """ self._on_step_end() - self._on_epoch_end() self._on_train_end() return self.prompts diff --git a/promptolution/optimizers/evoprompt_de.py b/promptolution/optimizers/evoprompt_de.py index b3f58ee..c6e2b82 100644 --- a/promptolution/optimizers/evoprompt_de.py +++ b/promptolution/optimizers/evoprompt_de.py @@ -4,8 +4,12 @@ import numpy as np +from promptolution.callbacks import BaseCallback +from promptolution.config import ExperimentConfig from promptolution.llms.base_llm import BaseLLM from promptolution.optimizers.base_optimizer import BaseOptimizer +from promptolution.predictors.base_predictor import BasePredictor +from promptolution.tasks.base_task import BaseTask class EvoPromptDE(BaseOptimizer): @@ -27,81 +31,79 @@ class EvoPromptDE(BaseOptimizer): prompt_template (str): Template for meta-prompts. meta_llm: Language model for child prompt generation. donor_random (bool, optional): Whether to use a random donor. Defaults to False. - **args: Additional arguments passed to the BaseOptimizer. + n_eval_samples (int, optional): Number of samples for evaluation. Defaults to 20. + config (ExperimentConfig, optional): Configuration for the experiment. """ def __init__( self, - prompt_template: str = None, - meta_llm: BaseLLM = None, + predictor: BasePredictor, + task: BaseTask, + prompt_template: str, + meta_llm: BaseLLM, + initial_prompts: List[str] = None, donor_random: bool = False, n_eval_samples: int = 20, - **args + callbacks: List[BaseCallback] = None, + config: ExperimentConfig = None, ): """Initialize the EvoPromptDE optimizer.""" self.prompt_template = prompt_template self.n_eval_samples = n_eval_samples self.donor_random = donor_random - assert meta_llm is not None, "A meta language model must be provided." self.meta_llm = meta_llm - super().__init__(**args) + super().__init__( + predictor=predictor, task=task, initial_prompts=initial_prompts, callbacks=callbacks, config=config + ) - def optimize(self, n_steps: int) -> List[str]: + def _pre_optimization_loop(self): + self.scores = self.task.evaluate(self.prompts, self.predictor, subsample=True, n_samples=self.n_eval_samples) + self.prompts = [prompt for _, prompt in sorted(zip(self.scores, self.prompts), reverse=True)] + self.scores = sorted(self.scores, reverse=True) + + def _step(self) -> List[str]: """Perform the optimization process for a specified number of steps. This method iteratively improves the prompts using a differential evolution strategy. It evaluates prompts, generates new prompts using the DE algorithm, and replaces prompts if the new ones perform better. - Args: - n_steps (int): Number of optimization steps to perform. Returns: List[str]: The optimized list of prompts after all steps. 
""" - self.scores = self.task.evaluate(self.prompts, self.predictor, subsample=True, n_samples=self.n_eval_samples) - self.prompts = [prompt for _, prompt in sorted(zip(self.scores, self.prompts), reverse=True)] - self.scores = sorted(self.scores, reverse=True) - - for _ in range(n_steps): - cur_best = self.prompts[0] - meta_prompts = [] - for i in range(len(self.prompts)): - # create meta prompts - old_prompt = self.prompts[i] - - candidates = [prompt for prompt in self.prompts if prompt != old_prompt] - a, b, c = np.random.choice(candidates, size=3, replace=False) - - if not self.donor_random: - c = cur_best - - meta_prompt = ( - self.prompt_template.replace("", old_prompt) - .replace("", a) - .replace("", b) - .replace("", c) - ) - - meta_prompts.append(meta_prompt) - - child_prompts = self.meta_llm.get_response(meta_prompts) - child_prompts = [prompt.split("")[-1].split("")[0].strip() for prompt in child_prompts] - - child_scores = self.task.evaluate( - child_prompts, self.predictor, subsample=True, n_samples=self.n_eval_samples + cur_best = self.prompts[0] + meta_prompts = [] + for i in range(len(self.prompts)): + # create meta prompts + old_prompt = self.prompts[i] + + candidates = [prompt for prompt in self.prompts if prompt != old_prompt] + a, b, c = np.random.choice(candidates, size=3, replace=False) + + if not self.donor_random: + c = cur_best + + meta_prompt = ( + self.prompt_template.replace("", old_prompt) + .replace("", a) + .replace("", b) + .replace("", c) ) - for i in range(len(self.prompts)): - if child_scores[i] > self.scores[i]: - self.prompts[i] = child_prompts[i] - self.scores[i] = child_scores[i] + meta_prompts.append(meta_prompt) + + child_prompts = self.meta_llm.get_response(meta_prompts) + child_prompts = [prompt.split("")[-1].split("")[0].strip() for prompt in child_prompts] - continue_optimization = self._on_step_end() + child_scores = self.task.evaluate(child_prompts, self.predictor, subsample=True, n_samples=self.n_eval_samples) - if not continue_optimization: - break + for i in range(len(self.prompts)): + if child_scores[i] > self.scores[i]: + self.prompts[i] = child_prompts[i] + self.scores[i] = child_scores[i] - self._on_train_end() + self.prompts = [prompt for _, prompt in sorted(zip(self.scores, self.prompts), reverse=True)] + self.scores = sorted(self.scores, reverse=True) return self.prompts diff --git a/promptolution/optimizers/evoprompt_ga.py b/promptolution/optimizers/evoprompt_ga.py index 204e2e5..d06b0fb 100644 --- a/promptolution/optimizers/evoprompt_ga.py +++ b/promptolution/optimizers/evoprompt_ga.py @@ -5,8 +5,12 @@ import numpy as np +from promptolution.callbacks import BaseCallback +from promptolution.config import ExperimentConfig from promptolution.llms.base_llm import BaseLLM from promptolution.optimizers.base_optimizer import BaseOptimizer +from promptolution.predictors.base_predictor import BasePredictor +from promptolution.tasks.base_task import BaseTask logger = Logger(__name__) @@ -30,7 +34,6 @@ class EvoPromptGA(BaseOptimizer): prompt_template (str): Template for meta-prompts. meta_llm: Language model for child prompt generation. selection_mode (str, optional): Parent selection method. Defaults to "wheel". - **args: Additional arguments passed to the BaseOptimizer. Raises: AssertionError: If an invalid selection mode is provided. 
@@ -38,36 +41,29 @@ class EvoPromptGA(BaseOptimizer): def __init__( self, - prompt_template: str = None, - meta_llm: BaseLLM = None, + predictor: BasePredictor, + task: BaseTask, + prompt_template: str, + meta_llm: BaseLLM, + initial_prompts: List[str] = None, selection_mode: str = "wheel", n_eval_samples: int = 20, - **args, + callbacks: List[BaseCallback] = None, + config: ExperimentConfig = None, ): """Initialize the EvoPromptGA optimizer.""" self.prompt_template = prompt_template self.n_eval_samples = n_eval_samples - assert meta_llm is not None, "Meta_llm is required" self.meta_llm = meta_llm - assert selection_mode in ["random", "wheel", "tour"], "Invalid selection mode." self.selection_mode = selection_mode - super().__init__(**args) + super().__init__( + predictor=predictor, initial_prompts=initial_prompts, task=task, callbacks=callbacks, config=config + ) + assert self.selection_mode in ["random", "wheel", "tour"], "Invalid selection mode." - def optimize(self, n_steps: int) -> List[str]: - """Perform the optimization process for a specified number of steps. - - This method iteratively improves the prompts using genetic algorithm techniques. - It evaluates prompts, performs crossover to generate new prompts, and selects - the best prompts for the next generation. - - Args: - n_steps (int): Number of optimization steps to perform. - - Returns: - List[str]: The optimized list of prompts after all steps. - """ + def _pre_optimization_loop(self): # get scores from task - if self.verbosity > 1: + if self.verbosity > 1: # pragma: no cover self.scores, seq = self.task.evaluate( self.prompts, self.predictor, subsample=True, n_samples=self.n_eval_samples, return_seq=True ) @@ -82,38 +78,33 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts = [prompt for _, prompt in sorted(zip(self.scores, self.prompts), reverse=True)] self.scores = sorted(self.scores, reverse=True) - for _ in range(n_steps): - new_prompts = self._crossover(self.prompts, self.scores) - prompts = self.prompts + new_prompts - - if self.verbosity > 1: - logger.warning(f"Prompts: {prompts}") + def _step(self) -> List[str]: + new_prompts = self._crossover(self.prompts, self.scores) + prompts = self.prompts + new_prompts - # evaluate new prompts - if self.verbosity > 1: - new_scores, seq = self.task.evaluate( - prompts, self.predictor, subsample=True, n_samples=self.n_eval_samples, return_seq=True - ) - new_scores = new_scores.tolist() - logger.warning(f"Scores: {new_scores}") - logger.warning(f"Sequences: {seq}") + if self.verbosity > 1: # pragma: no cover + logger.warning(f"Prompts: {prompts}") - else: - new_scores = self.task.evaluate( - new_prompts, self.predictor, subsample=True, n_samples=self.n_eval_samples - ).tolist() + # evaluate new prompts + if self.verbosity > 1: # pragma: no cover + new_scores, seq = self.task.evaluate( + prompts, self.predictor, subsample=True, n_samples=self.n_eval_samples, return_seq=True + ) + new_scores = new_scores.tolist() + logger.warning(f"Scores: {new_scores}") + logger.warning(f"Sequences: {seq}") - scores = self.scores + new_scores + else: + new_scores = self.task.evaluate( + new_prompts, self.predictor, subsample=True, n_samples=self.n_eval_samples + ).tolist() - # sort scores and prompts - self.prompts = [prompt for _, prompt in sorted(zip(scores, prompts), reverse=True)][: len(self.prompts)] - self.scores = sorted(scores, reverse=True)[: len(self.prompts)] + scores = self.scores + new_scores - continue_optimization = self._on_step_end() - if not continue_optimization: - 
break + # sort scores and prompts + self.prompts = [prompt for _, prompt in sorted(zip(scores, prompts), reverse=True)][: len(self.prompts)] + self.scores = sorted(scores, reverse=True)[: len(self.prompts)] - self._on_train_end() return self.prompts def _crossover(self, prompts, scores) -> str: @@ -159,7 +150,7 @@ def _crossover(self, prompts, scores) -> str: meta_prompts.append(meta_prompt) child_prompts = self.meta_llm.get_response(meta_prompts) - if self.verbosity > 1: + if self.verbosity > 1: # pragma: no cover logger.warning("meta_prompts:") logger.warning(meta_prompts) logger.warning("child_prompts:") diff --git a/promptolution/optimizers/opro.py b/promptolution/optimizers/opro.py index aae22e8..c6822fc 100644 --- a/promptolution/optimizers/opro.py +++ b/promptolution/optimizers/opro.py @@ -1,11 +1,15 @@ """Module implementing the OPRO (Optimization by PROmpting) algorithm.""" -from typing import Dict, List, Optional +from typing import List, Optional import numpy as np +from promptolution.callbacks import BaseCallback +from promptolution.config import ExperimentConfig from promptolution.llms.base_llm import BaseLLM from promptolution.optimizers.base_optimizer import BaseOptimizer +from promptolution.predictors.base_predictor import BasePredictor +from promptolution.tasks.base_task import BaseTask from promptolution.templates import OPRO_TEMPLATE @@ -21,31 +25,40 @@ class Opro(BaseOptimizer): def __init__( self, + predictor: BasePredictor, + task: BaseTask, + prompt_template: Optional[str], meta_llm: BaseLLM, - prompt_template: Optional[str] = None, + initial_prompts: List[str] = None, max_num_instructions: int = 20, num_instructions_per_step: int = 8, num_few_shots: int = 3, - **kwargs, + callbacks: List[BaseCallback] = None, + config: ExperimentConfig = None, ) -> None: """Initialize the OPRO optimizer. Args: - df_few_shots: DataFrame with few-shot examples (must have 'input' and 'target' columns) + predictor: Predictor for prompt evaluation + task: Task object for prompt evaluation meta_llm: LLM that generates improved prompts + initial_prompts: Initial set of prompts to start optimization with prompt_template: Custom meta prompt template (uses OPRO_TEMPLATE if None) max_num_instructions: Maximum previous instructions to include in meta prompt num_instructions_per_step: Number of prompts to generate in each step num_few_shots: Number of few-shot examples to include (0 for none) - **kwargs: Additional arguments passed to the BaseOptimizer + callbacks: List of callback functions + config: ExperimentConfig overwriting default parameters """ - super().__init__(**kwargs) self.meta_llm = meta_llm self.meta_prompt_template = prompt_template if prompt_template else OPRO_TEMPLATE self.max_num_instructions = max_num_instructions self.num_instructions_per_step = num_instructions_per_step self.num_few_shots = num_few_shots + super().__init__( + predictor=predictor, task=task, initial_prompts=initial_prompts, callbacks=callbacks, config=config + ) def _sample_examples(self) -> str: """Sample few-shot examples from the dataset. @@ -89,61 +102,42 @@ def _add_prompt_and_score(self, prompt: str, score: float) -> None: self.prompts = [self.prompts[i] for i in keep_indices] self.scores = [self.scores[i] for i in keep_indices] - def optimize(self, n_steps: int) -> List[str]: - """Run the OPRO optimization process. 
- - Args: - n_steps: Number of optimization steps to perform - - Returns: - List of all prompts generated during optimization - """ + def _pre_optimization_loop(self): self.scores = list(self.task.evaluate(self.prompts, self.predictor)) self.meta_prompt = self.meta_prompt_template.replace("", self._format_instructions()).replace( "", self._sample_examples() ) - for _ in range(n_steps): - duplicate_prompts = 0 - for _ in range(self.num_instructions_per_step): - generation_seed = np.random.randint(0, int(1e9)) - self.meta_llm.set_generation_seed(generation_seed) + def _step(self) -> List[str]: + duplicate_prompts = 0 + for _ in range(self.num_instructions_per_step): + generation_seed = np.random.randint(0, int(1e9)) + self.meta_llm.set_generation_seed(generation_seed) - if self.verbosity > 1: - print(f"Seed: {generation_seed}") - response = self.meta_llm.get_response([self.meta_prompt])[0] + if self.verbosity > 1: # pragma: no cover + print(f"Seed: {generation_seed}") + response = self.meta_llm.get_response([self.meta_prompt])[0] - prompt = response.split("")[-1].split("")[0].strip() + prompt = response.split("")[-1].split("")[0].strip() - if prompt in self.prompts: - duplicate_prompts += 1 - continue + if prompt in self.prompts: + duplicate_prompts += 1 + continue - score = self.task.evaluate(prompt, self.predictor)[0] + score = self.task.evaluate(prompt, self.predictor)[0] - self._add_prompt_and_score(prompt, score) + self._add_prompt_and_score(prompt, score) - if self.verbosity > 1: - print(f"New Instruction: {prompt}\nScore: {score}\n") + if self.verbosity > 1: # pragma: no cover + print(f"New Instruction: {prompt}\nScore: {score}\n") - # Update meta prompt - self.meta_prompt = self.meta_prompt_template.replace("", self._format_instructions()).replace( - "", self._sample_examples() - ) - - if self.verbosity > 1: - print(f"New meta prompt:\n{self.meta_prompt}\n") - - continue_optimization = self._on_step_end() - - if not continue_optimization: - break + # Update meta prompt + self.meta_prompt = self.meta_prompt_template.replace("", self._format_instructions()).replace( + "", self._sample_examples() + ) - # stop optimization if all generated prompts are duplicates (converged) - if duplicate_prompts == self.num_instructions_per_step: - if self.verbosity > 0: - print("All generated prompts are duplicates. Stopping optimization.") - break + if self.verbosity > 1: # pragma: no cover + print(f"New meta prompt:\n{self.meta_prompt}\n") self._on_train_end() return self.prompts diff --git a/promptolution/predictors/__init__.py b/promptolution/predictors/__init__.py index e5aa69c..85bc9ac 100644 --- a/promptolution/predictors/__init__.py +++ b/promptolution/predictors/__init__.py @@ -6,9 +6,7 @@ from .classificator import FirstOccurrenceClassificator, MarkerBasedClassificator -def get_predictor( - downstream_llm=None, type: Literal["first_occurence", "marker"] = "first_occurrence", *args, **kwargs -): +def get_predictor(downstream_llm=None, type: Literal["first_occurrence", "marker"] = "marker", *args, **kwargs): """Factory function to create and return a predictor instance. 
This function supports three types of predictors: diff --git a/promptolution/predictors/base_predictor.py b/promptolution/predictors/base_predictor.py index 6cc8358..2350117 100644 --- a/promptolution/predictors/base_predictor.py +++ b/promptolution/predictors/base_predictor.py @@ -1,51 +1,50 @@ -"""Base module for predictors.""" +"""Base module for predictors in the promptolution library.""" -from abc import abstractmethod -from typing import List, Tuple +from abc import ABC, abstractmethod +from dataclasses import asdict, dataclass, field +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np +from promptolution.config import ExperimentConfig from promptolution.llms.base_llm import BaseLLM -class BasePredictor: +class BasePredictor(ABC): """Abstract base class for predictors in the promptolution library. This class defines the interface that all concrete predictor implementations should follow. Attributes: llm: The language model used for generating predictions. - - - Methods: - predict: An abstract method that should be implemented by subclasses - to make predictions based on prompts and input data. + classes (List[str]): The list of valid class labels. + config (ExperimentConfig): Experiment configuration overwriting defaults """ - def __init__(self, llm: BaseLLM): - """Initialize the BasePredictor. + def __init__(self, llm: Optional[BaseLLM] = None, config: ExperimentConfig = None): + """Initialize the predictor with a language model and configuration. Args: - llm: The language model to use for predictions. - classes (List[str]): The list of valid class labels. + llm: Language model to use for prediction. + config: Configuration for the predictor. """ self.llm = llm + if config is not None: + config.apply_to(self) + def predict( self, prompts: List[str], xs: np.ndarray, system_prompts: List[str] = None, return_seq: bool = False ) -> np.ndarray: """Abstract method to make predictions based on prompts and input data. Args: - prompts (List[str]): List of prompts to use for prediction. - xs (np.ndarray): Array of input data. - return_seq (bool, optional): whether to return the generating sequence + prompts: Prompt or list of prompts to use for prediction. + xs: Array of input data. + return_seq: Whether to return the generating sequence. Returns: - np.ndarray: Array of predictions. - - Raises: - NotImplementedError: If not implemented by a subclass. + Array of predictions, optionally with sequences. """ if isinstance(prompts, str): prompts = [prompts] @@ -71,12 +70,16 @@ def predict( return preds + @abstractmethod def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray: """Extract class labels from the predictions, based on the list of valid class labels. Args: preds: The raw predictions from the language model. shape: The shape of the output array: (n_prompts, n_samples). + + Returns: + np.ndarray: Extracted predictions with shape (n_prompts, n_samples). """ raise NotImplementedError @@ -89,36 +92,43 @@ class DummyPredictor(BasePredictor): Attributes: model_id (str): Always set to "dummy". classes (List[str]): List of possible class labels. - - Methods: - predict: Generates random predictions for the given prompts and input data. """ - def __init__(self, model_id, classes, *args, **kwargs): - """Initialize the DummyPredictor. 
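# A minimal sketch of the get_predictor factory with its new default ("marker").
# Whether extra keyword arguments such as `classes` are forwarded to the underlying
# classificator is an assumption here; the labels and inputs are made up, and `llm`
# is assumed to exist.
import numpy as np
from promptolution.predictors import get_predictor

predictor = get_predictor(llm, type="marker", classes=["positive", "negative"])
preds = predictor.predict(
    "Decide whether the review is positive or negative.",
    np.array(["loved it", "utterly disappointing"]),
)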
+ def predict( + self, prompts: Union[str, List[str]], xs: np.ndarray, return_seq: bool = False + ) -> Union[np.ndarray, Tuple[np.ndarray, List[str]]]: + """Generate random predictions for the given prompts and input data. + + Args: + prompts: Prompt or list of prompts (ignored in this implementation). + xs: Array of input data (only the length is used). + return_seq: Whether to return sequences. - Parameters - ---------- - model_id : str - Model identifier string. - classes : list - List of possible class labels. + Returns: + Array of random predictions, optionally with sequences. """ - self.model_id = "dummy" - self.classes = classes + if isinstance(prompts, str): + prompts = [prompts] - def predict( - self, - prompts: List[str], - xs: np.ndarray, - ) -> np.ndarray: - """Generate random predictions for the given prompts and input data. + preds = np.array([np.random.choice(self.classes, len(xs)) for _ in prompts]) + + if return_seq: + # Generate fake sequences + seqs = [f"Input: {x}\nOutput: {np.random.choice(self.classes)}" for x in xs] + return preds, seqs + + return preds + + def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray: + """Extract class labels from the predictions. + + This is a dummy implementation that returns random predictions. Args: - prompts (List[str]): List of prompts (ignored in this implementation). - xs (np.ndarray): Array of input data (only the length is used). + preds: The raw predictions from the language model (ignored). + shape: The shape of the output array: (n_prompts, n_samples). Returns: - np.ndarray: 2D array of random predictions, shape (len(prompts), len(xs)). + np.ndarray: Random predictions. """ - return np.array([np.random.choice(self.classes, len(xs)) for _ in prompts]) + return np.array([np.random.choice(self.classes, shape[1]) for _ in range(shape[0])]) diff --git a/promptolution/predictors/classificator.py b/promptolution/predictors/classificator.py index 86999ed..2621a86 100644 --- a/promptolution/predictors/classificator.py +++ b/promptolution/predictors/classificator.py @@ -4,6 +4,7 @@ import numpy as np +from promptolution.config import ExperimentConfig from promptolution.predictors.base_predictor import BasePredictor @@ -19,19 +20,20 @@ class FirstOccurrenceClassificator(BasePredictor): Attributes: llm: The language model used for generating predictions. classes (List[str]): The list of valid class labels. + config (ExperimentConfig): Experiment configuration overwriting defaults. Inherits from: BasePredictor: The base class for predictors in the promptolution library. """ - def __init__(self, llm, classes, *args, **kwargs): + def __init__(self, llm, classes, config: ExperimentConfig = None): """Initialize the Classificator. Args: llm: The language model to use for predictions. classes (List[str]): The list of valid class labels. + config: Experiment configuration overwriting defaults. """ - super().__init__(llm) assert all([c.islower() for c in classes]), "Class labels should be lowercase." self.classes = classes @@ -40,6 +42,8 @@ def __init__(self, llm, classes, *args, **kwargs): "The first occurrence of a valid class label in the prediction is used as the predicted class." ) + super().__init__(llm, config) + def _extract_preds(self, preds: List[str]) -> np.ndarray: """Extract class labels from the predictions, based on the list of valid class labels. @@ -76,7 +80,14 @@ class MarkerBasedClassificator(BasePredictor): BasePredictor: The base class for predictors in the promptolution library. 
""" - def __init__(self, llm, classes=None, begin_marker="", end_marker="", *args, **kwargs): + def __init__( + self, + llm, + classes=None, + begin_marker="", + end_marker="", + config: ExperimentConfig = None, + ): """Initialize the Classificator. Args: @@ -84,9 +95,8 @@ def __init__(self, llm, classes=None, begin_marker="", end_marker= classes (List[str]): The list of valid class labels. If None, does not force any class. begin_marker (str): The marker to use for extracting the class label. end_marker (str): The marker to use for extracting the class label. - *args, **kwargs: Additional arguments for the BasePredictor. + config: Experiment configuration overwriting defaults. """ - super().__init__(llm) self.classes = classes self.begin_marker = begin_marker self.end_marker = end_marker @@ -101,6 +111,8 @@ def __init__(self, llm, classes=None, begin_marker="", end_marker= else: self.extraction_description = f"The class label is extracted from the text that are between these markers: {begin_marker} and {end_marker}." + super().__init__(llm, config) + def _extract_preds(self, preds: List[str]) -> np.ndarray: """Extract class labels from the predictions, by extracting the text following the marker. diff --git a/promptolution/tasks/__init__.py b/promptolution/tasks/__init__.py index 73f38da..2da06e7 100644 --- a/promptolution/tasks/__init__.py +++ b/promptolution/tasks/__init__.py @@ -1,3 +1,21 @@ """Module for task-related functions and classes.""" -from promptolution.tasks.base_task import BaseTask, DummyTask +import pandas as pd + +from promptolution.config import ExperimentConfig +from promptolution.tasks.base_task import BaseTask from promptolution.tasks.classification_tasks import ClassificationTask + + +def get_task(df: pd.DataFrame, config: ExperimentConfig) -> BaseTask: + """Get the task based on the provided DataFrame and configuration. + + So far only ClassificationTask is supported. + + Args: + df (pd.DataFrame): Input DataFrame containing the data. + config (ExperimentConfig): Configuration for the experiment. + + Returns: + BaseTask: An instance of a task class based on the provided DataFrame and configuration. + """ + return ClassificationTask(df, config) diff --git a/promptolution/tasks/base_task.py b/promptolution/tasks/base_task.py index 403a800..0928119 100644 --- a/promptolution/tasks/base_task.py +++ b/promptolution/tasks/base_task.py @@ -5,6 +5,8 @@ import numpy as np +from promptolution.config import ExperimentConfig + class BaseTask(ABC): """Abstract base class for tasks in the promptolution library. @@ -16,9 +18,10 @@ class BaseTask(ABC): to evaluate prompts using a given predictor. """ - def __init__(self, *args, **kwargs): + def __init__(self, config: ExperimentConfig = None): """Initialize the BaseTask.""" - pass + if config is not None: + config.apply_to(self) @abstractmethod def evaluate(self, prompts: List[str], predictor, system_promtps: List[str] = None) -> np.ndarray: diff --git a/promptolution/tasks/classification_tasks.py b/promptolution/tasks/classification_tasks.py index 37ea90e..125bd29 100644 --- a/promptolution/tasks/classification_tasks.py +++ b/promptolution/tasks/classification_tasks.py @@ -6,6 +6,7 @@ import pandas as pd from sklearn.metrics import accuracy_score +from promptolution.config import ExperimentConfig from promptolution.predictors.base_predictor import BasePredictor from promptolution.tasks.base_task import BaseTask @@ -23,6 +24,7 @@ class ClassificationTask(BaseTask): ys (np.ndarray): Array of labels. 
initial_prompts (List[str]): Initial set of prompts to start optimization with. metric (Callable): Metric to use for evaluation. + config (ExperimentConfig): Configuration for the experiment. Inherits from: BaseTask: The base class for tasks in the promptolution library. @@ -36,6 +38,7 @@ def __init__( x_column: str = "x", y_column: str = "y", metric: Callable = accuracy_score, + config: ExperimentConfig = None, ): """Initialize the ClassificationTask from a pandas DataFrame. @@ -47,12 +50,14 @@ def __init__( y_column (str, optional): Name of the column containing labels. Defaults to "y". seed (int, optional): Random seed for reproducibility. Defaults to 42. metric (Callable, optional): Metric to use for evaluation. Defaults to accuracy_score. + config (ExperimentConfig, optional): ExperimentConfig overwriting the defaults. """ - super().__init__() self.description = description self.initial_prompts = initial_prompts self.metric = metric + super().__init__(config) + df[y_column] = df[y_column].str.lower() self.classes = df[y_column].unique() diff --git a/pyproject.toml b/pyproject.toml index d8bc054..2dbeb3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,38 +1,49 @@ [tool.poetry] name = "promptolution" -version = "1.3.2" -description = "" +version = "1.4.0" +description = "A framework for prompt optimization and a zoo of prompt optimization algorithms." authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"] readme = "README.md" [tool.poetry.dependencies] python = "^3.9" numpy = "^1.26.0" -langchain-anthropic = "^0.1.22" -langchain-openai = "^0.1.21" -langchain-core = "^0.2.29" -langchain-community = "^0.2.12" pandas = "^2.2.2" tqdm = "^4.66.5" scikit-learn = "^1.5.2" -vllm = "^0.7.3" -datasets = "^3.3.2" +fastparquet = "^2024.11.0" -[tool.poetry.group.dev.dependencies] -matplotlib = "^3.9.2" -seaborn = "^0.13.2" +[tool.poetry.group.api.dependencies] +openai = "^1.0.0" +requests = "^2.31.0" + +[tool.poetry.group.vllm.dependencies] +vllm = "^0.8.3" + +[tool.poetry.group.transformers.dependencies] transformers = "^4.48.0" + +[tool.poetry.group.dev.dependencies] black = "^24.4.2" flake8 = "^7.1.0" isort = "^5.13.2" pre-commit = "^3.7.1" ipykernel = "^6.29.5" +[tool.poetry.group.test.dependencies] +pytest = "^8.3.5" +pytest-cov = "^6.1.1" +openai = "^1.0.0" +requests = "^2.31.0" +vllm = "^0.8.2" +transformers = "^4.48.0" [tool.poetry.group.docs.dependencies] mkdocs = "^1.6.1" mkdocs-material = "^9.5.39" mkdocstrings = {version = "^0.26.1", extras = ["python"]} +jupyter = "^1.1.1" +nbconvert = "^7.16.6" [build-system] requires = ["poetry-core"] @@ -46,4 +57,4 @@ line_length = 120 profile = "black" [tool.pydocstyle] -convention = "google" +convention = "google" \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..38be796 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* \ No newline at end of file diff --git a/scripts/api_test.py b/scripts/api_test.py new file mode 100644 index 0000000..8ae585c --- /dev/null +++ b/scripts/api_test.py @@ -0,0 +1,70 @@ +"""Test run for the Opro optimizer.""" + +import argparse +import random +from logging import Logger + +from promptolution.callbacks import LoggerCallback +from promptolution.templates import EVOPROMPT_GA_TEMPLATE +from promptolution.tasks import ClassificationTask +from promptolution.predictors import MarkerBasedClassificator +from promptolution.optimizers import EvoPromptGA +from datasets 
import load_dataset + +from promptolution.llms.api_llm import APILLM + +logger = Logger(__name__) + +"""Run a test run for any of the implemented optimizers.""" +parser = argparse.ArgumentParser() +parser.add_argument("--base-url", default="https://api.openai.com/v1") +parser.add_argument("--model", default="gpt-4o-2024-08-06") +# parser.add_argument("--base-url", default="https://api.deepinfra.com/v1/openai") +# parser.add_argument("--model", default="meta-llama/Meta-Llama-3-8B-Instruct") +# parser.add_argument("--base-url", default="https://api.anthropic.com/v1/") +# parser.add_argument("--model", default="claude-3-haiku-20240307") +parser.add_argument("--n-steps", type=int, default=2) +parser.add_argument("--token", default=None) +args = parser.parse_args() + +df = load_dataset("SetFit/ag_news", split="train", revision="main").to_pandas().sample(300) + +df["input"] = df["text"] +df["target"] = df["label_text"] + +task = ClassificationTask( + df, + description="The dataset contains news articles categorized into four classes: World, Sports, Business, and Tech. The task is to classify each news article into one of the four categories.", + x_column="input", + y_column="target", +) + +initial_prompts = [ + "Classify this news article as World, Sports, Business, or Tech. Provide your answer between and tags.", + "Read the following news article and determine which category it belongs to: World, Sports, Business, or Tech. Your classification must be placed between markers.", + "Your task is to identify whether this news article belongs to World, Sports, Business, or Tech news. Provide your classification between the markers .", + "Conduct a thorough analysis of the provided news article and classify it as belonging to one of these four categories: World, Sports, Business, or Tech. 
Your answer should be presented within markers.", +] + +llm = APILLM(api_url=args.base_url, llm=args.model, token=args.token) +downstream_llm = llm +meta_llm = llm + +predictor = MarkerBasedClassificator(downstream_llm, classes=task.classes) + +callbacks = [LoggerCallback(logger)] + +optimizer = EvoPromptGA( + task=task, + prompt_template=EVOPROMPT_GA_TEMPLATE, + predictor=predictor, + meta_llm=meta_llm, + initial_prompts=initial_prompts, + callbacks=callbacks, + n_eval_samples=20, + verbosity=2, # for debugging +) + +best_prompts = optimizer.optimize(n_steps=args.n_steps) + +logger.info(f"Optimized prompts: {best_prompts}") diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py new file mode 100644 index 0000000..15b1203 --- /dev/null +++ b/scripts/llm_test_run.py @@ -0,0 +1,98 @@ +"""Test script for measuring raw LLM inference performance on a dataset.""" +import argparse +import time +from logging import Logger + +import numpy as np +import pandas as pd +from promptolution.config import Config +from promptolution.llms import get_llm +from promptolution.predictors import Classificator +from promptolution.tasks import get_task +from tqdm import tqdm + +logger = Logger(__name__) + +# TODO: Align this script with how we import datasets in capo + + +"""Run inference test on a dataset using a specified LLM.""" +parser = argparse.ArgumentParser() +parser.add_argument("--model") +parser.add_argument("--output") +parser.add_argument("--datasets", default=["subj"]) +parser.add_argument("--token", default=None) +parser.add_argument("--batch-size", default=None) +parser.add_argument("--revision", default="main") +parser.add_argument("--max-model-len", default=None) +parser.add_argument("--model-storage-path", default=None) +args = parser.parse_args() + +start_time = time.time() + +if args.max_model_len is not None: + max_model_len = int(args.max_model_len) + +if "vllm" in args.model: + llm = get_llm( + args.model, + batch_size=args.batch_size, + max_model_len=max_model_len, + model_storage_path=args.model_storage_path, + revision=args.revision, + ) +else: + llm = get_llm(args.model, args.token) + +results = pd.DataFrame() + +for dataset in args.datasets: + config = Config( + evaluation_llm=args.model, + ds_path=f"data_sets/cls/{dataset}/", + task_name=dataset, + api_token=args.token, + n_eval_samples=200, + ) + + task = get_task(config, split="dev") + predictor = Classificator(llm, classes=task.classes) + + prompts = [task.initial_population[0]] + + xs = task.xs[: config.n_eval_samples] + ys = task.ys[: config.n_eval_samples] + + for prompt in tqdm(prompts): + preds, seqs = predictor.predict(prompt, xs, return_seq=True) + + scores = [] + for i in range(len(xs)): + scores.append(1 if preds[0][i] == ys[i] else 0) + + # clean up the sequences + seqs = [seq.replace("\n", "").strip() for seq in seqs] + + # if single prompts should be stored + # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) + # df.to_csv(args.output + "_detailed", index=False) + + accuracy = np.array(scores).mean() + + results = pd.DataFrame( + dict( + model=args.model, + dataset=dataset, + prompt=prompt, + accuracy=accuracy, + n_samples=len(xs), + ), + index=[0], + ) + results.to_csv(args.output, mode="a", header=False, index=False) + +total_inference_time = time.time() - start_time +print( + f"Total inference took {total_inference_time:.2f} seconds and required {llm.get_token_count()} tokens." 
+) +print(f"Results saved to {args.output}") diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py new file mode 100644 index 0000000..a6c9562 --- /dev/null +++ b/tests/callbacks/test_callbacks.py @@ -0,0 +1,276 @@ +import os +import pytest +import pandas as pd +from unittest.mock import MagicMock, patch + +from promptolution.callbacks import ( + BaseCallback, LoggerCallback, FileOutputCallback, + BestPromptCallback, ProgressBarCallback, TokenCountCallback +) +from tests.mocks.mock_llm import MockLLM + + +@pytest.fixture +def mock_optimizer(): + """Create a mock optimizer with the necessary attributes for callbacks.""" + optimizer = MagicMock() + optimizer.prompts = ["Prompt 1", "Prompt 2", "Prompt 3"] + optimizer.scores = [0.8, 0.7, 0.6] + optimizer.meta_llm = MockLLM() + optimizer.meta_llm.input_token_count = 100 + optimizer.meta_llm.output_token_count = 50 + + # Add predictor with LLM for TokenCountCallback + optimizer.predictor = MagicMock() + optimizer.predictor.llm = MockLLM() + optimizer.predictor.llm.input_token_count = 200 + optimizer.predictor.llm.output_token_count = 100 + + return optimizer + + +@pytest.fixture +def mock_logger(): + """Create a mock logger for testing LoggerCallback.""" + logger = MagicMock() + logger.critical = MagicMock() + return logger + + +def test_base_callback(): + """Test the base Callback class.""" + callback = BaseCallback() + + # Test default method implementations + assert callback.on_step_end(None) is True + assert callback.on_epoch_end(None) is True + assert callback.on_train_end(None) is True + + +def test_logger_callback(mock_optimizer, mock_logger): + """Test the LoggerCallback class.""" + callback = LoggerCallback(mock_logger) + + # Test initialization + assert callback.logger == mock_logger + assert callback.step == 0 + + # Test on_step_end + result = callback.on_step_end(mock_optimizer) + assert result is True + assert callback.step == 1 + + # Verify logger was called + assert mock_logger.critical.call_count >= 5 # Time + Step + 3 prompts + + # Test on_train_end + result = callback.on_train_end(mock_optimizer) + assert result is True + + # Test on_train_end with logs + result = callback.on_train_end(mock_optimizer, logs="Test logs") + assert result is True + + # Verify logger was called with logs + mock_logger.critical.assert_any_call(mock_logger.critical.call_args_list[-1][0][0]) + + +def test_file_output_callback_csv(mock_optimizer, tmpdir): + """Test the FileOutputCallback with CSV output.""" + output_dir = str(tmpdir.mkdir("test_output")) + callback = FileOutputCallback(dir=output_dir, file_type="csv") + + # Test initialization + assert callback.file_type == "csv" + assert callback.path == output_dir + "/step_results.csv" + assert callback.step == 0 + + # Test on_step_end - first step + result = callback.on_step_end(mock_optimizer) + assert result is True + assert callback.step == 1 + + # Verify file was created + assert os.path.exists(callback.path) + + # Read the CSV file and verify content + df = pd.read_csv(callback.path) + assert len(df) == 3 # 3 prompts + assert "step" in df.columns + assert "score" in df.columns + assert "prompt" in df.columns + assert all(df["step"] == 1) + + # Test on_step_end - second step + result = callback.on_step_end(mock_optimizer) + assert result is True + assert callback.step == 2 + + # Verify file was updated + df = pd.read_csv(callback.path) + assert len(df) == 6 # 3 prompts Γ— 2 steps + assert set(df["step"]) == {1, 2} + + +def 
test_file_output_callback_parquet(mock_optimizer, tmpdir): + """Test the FileOutputCallback with Parquet output.""" + output_dir = str(tmpdir.mkdir("test_output_parquet")) + callback = FileOutputCallback(dir=output_dir, file_type="parquet") + + # Test initialization + assert callback.file_type == "parquet" + assert callback.path == output_dir + "/step_results.parquet" + + # Test on_step_end - first step + result = callback.on_step_end(mock_optimizer) + assert result is True + + # Verify file was created + assert os.path.exists(callback.path) + + # Read the Parquet file and verify content + df = pd.read_parquet(callback.path) + assert len(df) == 3 # 3 prompts + assert "step" in df.columns + assert "score" in df.columns + assert "prompt" in df.columns + assert all(df["step"] == 1) + + +def test_file_output_callback_invalid_type(): + """Test FileOutputCallback with invalid file type.""" + with pytest.raises(ValueError): + FileOutputCallback(dir="test", file_type="invalid") + + +def test_best_prompt_callback(mock_optimizer): + """Test the BestPromptCallback class.""" + callback = BestPromptCallback() + + # Test initialization + assert callback.best_prompt == "" + assert callback.best_score == -99999 + + # Test on_step_end + result = callback.on_step_end(mock_optimizer) + assert result is True + assert callback.best_prompt == "Prompt 1" + assert callback.best_score == 0.8 + + # Test with better score + mock_optimizer.scores = [0.9, 0.7, 0.6] + mock_optimizer.prompts = ["Better Prompt", "Prompt 2", "Prompt 3"] + + result = callback.on_step_end(mock_optimizer) + assert result is True + assert callback.best_prompt == "Better Prompt" + assert callback.best_score == 0.9 + + # Test with worse score + mock_optimizer.scores = [0.7, 0.6, 0.5] + mock_optimizer.prompts = ["Worse Prompt", "Prompt 2", "Prompt 3"] + + result = callback.on_step_end(mock_optimizer) + assert result is True + assert callback.best_prompt == "Better Prompt" # Unchanged + assert callback.best_score == 0.9 # Unchanged + + # Test get_best_prompt + best_prompt, best_score = callback.get_best_prompt() + assert best_prompt == "Better Prompt" + assert best_score == 0.9 + + +def test_progress_bar_callback(): + """Test the ProgressBarCallback class.""" + with patch('promptolution.callbacks.tqdm') as mock_tqdm: + mock_pbar = MagicMock() + mock_tqdm.return_value = mock_pbar + + # Create callback + callback = ProgressBarCallback(total_steps=10) + + # Verify tqdm was called with correct arguments + mock_tqdm.assert_called_once_with(total=10) + + # Test on_step_end + result = callback.on_step_end(None) + assert result is True + mock_pbar.update.assert_called_once_with(1) + + # Test on_train_end + result = callback.on_train_end(None) + assert result is True + mock_pbar.close.assert_called_once() + + +def test_token_count_callback(mock_optimizer): + """Test the TokenCountCallback class.""" + # Test with input tokens + callback = TokenCountCallback( + max_tokens_for_termination=300, + token_type_for_termination="input_tokens" + ) + + # Replace get_token_count with our own function + def get_token_count_under_limit(): + return { + "input_tokens": 200, + "output_tokens": 100, + "total_tokens": 300 + } + + # Replace the method directly + mock_optimizer.predictor.llm.get_token_count = get_token_count_under_limit + + # Should continue as we're below the max + result = callback.on_step_end(mock_optimizer) + assert result is True + + # Now replace with a function that exceeds the limit + def get_token_count_over_limit(): + return { + "input_tokens": 400, 
# Over the limit + "output_tokens": 100, + "total_tokens": 500 + } + + mock_optimizer.predictor.llm.get_token_count = get_token_count_over_limit + + # Should stop + result = callback.on_step_end(mock_optimizer) + assert result is False + + # Test with output tokens + callback = TokenCountCallback( + max_tokens_for_termination=150, + token_type_for_termination="output_tokens" + ) + + # Use the same approach - replace with function + def get_token_count_output_under_limit(): + return { + "input_tokens": 200, + "output_tokens": 100, # Under the limit + "total_tokens": 300 + } + + mock_optimizer.predictor.llm.get_token_count = get_token_count_output_under_limit + + # Should continue as we're below the max + result = callback.on_step_end(mock_optimizer) + assert result is True + + # Now replace with a function that exceeds the output tokens limit + def get_token_count_output_over_limit(): + return { + "input_tokens": 200, + "output_tokens": 200, # Over the limit + "total_tokens": 400 + } + + mock_optimizer.predictor.llm.get_token_count = get_token_count_output_over_limit + + # Should stop + result = callback.on_step_end(mock_optimizer) + assert result is False \ No newline at end of file diff --git a/tests/callbacks/test_callbacks_integration.py b/tests/callbacks/test_callbacks_integration.py new file mode 100644 index 0000000..1da210a --- /dev/null +++ b/tests/callbacks/test_callbacks_integration.py @@ -0,0 +1,295 @@ +import os +import pytest +import pandas as pd +from unittest.mock import MagicMock, patch + +from promptolution.callbacks import ( + BaseCallback, LoggerCallback, FileOutputCallback, + BestPromptCallback, ProgressBarCallback, TokenCountCallback +) +from tests.mocks.mock_llm import MockLLM + + +@pytest.fixture +def mock_optimizer(): + """Create a mock optimizer with the necessary attributes for callbacks.""" + optimizer = MagicMock() + optimizer.prompts = ["Prompt 1", "Prompt 2", "Prompt 3"] + optimizer.scores = [0.8, 0.7, 0.6] + optimizer.meta_llm = MockLLM() + optimizer.meta_llm.input_token_count = 100 + optimizer.meta_llm.output_token_count = 50 + + # Add predictor with LLM for TokenCountCallback + optimizer.predictor = MagicMock() + optimizer.predictor.llm = MockLLM() + optimizer.predictor.llm.input_token_count = 200 + optimizer.predictor.llm.output_token_count = 100 + + return optimizer + + +@pytest.fixture +def mock_logger(): + """Create a mock logger for testing LoggerCallback.""" + logger = MagicMock() + logger.critical = MagicMock() + return logger + + +def test_base_callback(): + """Test the base Callback class.""" + callback = BaseCallback() + + # Test default method implementations + assert callback.on_step_end(None) is True + assert callback.on_epoch_end(None) is True + assert callback.on_train_end(None) is True + + +def test_logger_callback(mock_optimizer, mock_logger): + """Test the LoggerCallback class.""" + callback = LoggerCallback(mock_logger) + + # Test initialization + assert callback.logger == mock_logger + assert callback.step == 0 + + # Test on_step_end + result = callback.on_step_end(mock_optimizer) + assert result is True + assert callback.step == 1 + + # Verify logger was called + assert mock_logger.critical.call_count >= 5 # Time + Step + 3 prompts + + # Test on_train_end + result = callback.on_train_end(mock_optimizer) + assert result is True + + # Test on_train_end with logs + result = callback.on_train_end(mock_optimizer, logs="Test logs") + assert result is True + + # Verify logger was called with logs + 
mock_logger.critical.assert_any_call(mock_logger.critical.call_args_list[-1][0][0]) + + +def test_file_output_callback_csv(mock_optimizer, tmpdir): + """Test the FileOutputCallback with CSV output.""" + output_dir = str(tmpdir.mkdir("test_output")) + callback = FileOutputCallback(dir=output_dir, file_type="csv") + + # Test initialization + assert callback.file_type == "csv" + assert callback.path == output_dir + "/step_results.csv" + assert callback.step == 0 + + # Test on_step_end - first step + result = callback.on_step_end(mock_optimizer) + assert result is True + assert callback.step == 1 + + # Verify file was created + assert os.path.exists(callback.path) + + # Read the CSV file and verify content + df = pd.read_csv(callback.path) + assert len(df) == 3 # 3 prompts + assert "step" in df.columns + assert "score" in df.columns + assert "prompt" in df.columns + assert all(df["step"] == 1) + + # Test on_step_end - second step + result = callback.on_step_end(mock_optimizer) + assert result is True + assert callback.step == 2 + + # Verify file was updated + df = pd.read_csv(callback.path) + assert len(df) == 6 # 3 prompts Γ— 2 steps + assert set(df["step"]) == {1, 2} + + +def test_file_output_callback_parquet(mock_optimizer, tmpdir): + """Test the FileOutputCallback with Parquet output.""" + output_dir = str(tmpdir.mkdir("test_output_parquet")) + callback = FileOutputCallback(dir=output_dir, file_type="parquet") + + # Test initialization + assert callback.file_type == "parquet" + assert callback.path == output_dir + "/step_results.parquet" + + # Test on_step_end - first step + result = callback.on_step_end(mock_optimizer) + assert result is True + + # Verify file was created + assert os.path.exists(callback.path) + + # Read the Parquet file and verify content + df = pd.read_parquet(callback.path) + assert len(df) == 3 # 3 prompts + assert "step" in df.columns + assert "score" in df.columns + assert "prompt" in df.columns + assert all(df["step"] == 1) + + +def test_file_output_callback_invalid_type(): + """Test FileOutputCallback with invalid file type.""" + with pytest.raises(ValueError): + FileOutputCallback(dir="test", file_type="invalid") + + +def test_best_prompt_callback(mock_optimizer): + """Test the BestPromptCallback class.""" + callback = BestPromptCallback() + + # Test initialization + assert callback.best_prompt == "" + assert callback.best_score == -99999 + + # Test on_step_end + result = callback.on_step_end(mock_optimizer) + assert result is True + assert callback.best_prompt == "Prompt 1" + assert callback.best_score == 0.8 + + # Test with better score + mock_optimizer.scores = [0.9, 0.7, 0.6] + mock_optimizer.prompts = ["Better Prompt", "Prompt 2", "Prompt 3"] + + result = callback.on_step_end(mock_optimizer) + assert result is True + assert callback.best_prompt == "Better Prompt" + assert callback.best_score == 0.9 + + # Test with worse score + mock_optimizer.scores = [0.7, 0.6, 0.5] + mock_optimizer.prompts = ["Worse Prompt", "Prompt 2", "Prompt 3"] + + result = callback.on_step_end(mock_optimizer) + assert result is True + assert callback.best_prompt == "Better Prompt" # Unchanged + assert callback.best_score == 0.9 # Unchanged + + # Test get_best_prompt + best_prompt, best_score = callback.get_best_prompt() + assert best_prompt == "Better Prompt" + assert best_score == 0.9 + + +def test_progress_bar_callback(): + """Test the ProgressBarCallback class.""" + with patch('promptolution.callbacks.tqdm') as mock_tqdm: + mock_pbar = MagicMock() + mock_tqdm.return_value = 
mock_pbar + + # Create callback + callback = ProgressBarCallback(total_steps=10) + + # Verify tqdm was called with correct arguments + mock_tqdm.assert_called_once_with(total=10) + + # Test on_step_end + result = callback.on_step_end(None) + assert result is True + mock_pbar.update.assert_called_once_with(1) + + # Test on_train_end + result = callback.on_train_end(None) + assert result is True + mock_pbar.close.assert_called_once() + + +def test_token_count_callback(mock_optimizer): + """Test the TokenCountCallback class.""" + # Test with input tokens + callback = TokenCountCallback( + max_tokens_for_termination=300, + token_type_for_termination="input_tokens" + ) + + # Create a mock method for get_token_count instead of trying to modify the existing one + token_counts = { + "input_tokens": 200, + "output_tokens": 100, + "total_tokens": 300 + } + mock_optimizer.predictor.llm.get_token_count = MagicMock(return_value=token_counts) + + # Should continue as we're below the max + result = callback.on_step_end(mock_optimizer) + assert result is True + + # Now exceed the max + token_counts_over_limit = { + "input_tokens": 400, # Over the limit + "output_tokens": 100, + "total_tokens": 500 + } + mock_optimizer.predictor.llm.get_token_count = MagicMock(return_value=token_counts_over_limit) + + # Should stop + result = callback.on_step_end(mock_optimizer) + assert result is False + + # Test with output tokens + callback = TokenCountCallback( + max_tokens_for_termination=150, + token_type_for_termination="output_tokens" + ) + + token_counts = { + "input_tokens": 200, + "output_tokens": 100, + "total_tokens": 300 + } + mock_optimizer.predictor.llm.get_token_count = MagicMock(return_value=token_counts) + + # Should continue as we're below the max + result = callback.on_step_end(mock_optimizer) + assert result is True + + # Now exceed the max + token_counts_over_limit = { + "input_tokens": 200, + "output_tokens": 200, # Over the limit + "total_tokens": 400 + } + mock_optimizer.predictor.llm.get_token_count = MagicMock(return_value=token_counts_over_limit) + + # Should stop + result = callback.on_step_end(mock_optimizer) + assert result is False + + # Test with total tokens + callback = TokenCountCallback( + max_tokens_for_termination=500, + token_type_for_termination="total_tokens" + ) + + token_counts = { + "input_tokens": 200, + "output_tokens": 100, + "total_tokens": 300 + } + mock_optimizer.predictor.llm.get_token_count = MagicMock(return_value=token_counts) + + # Should continue as we're below the max + result = callback.on_step_end(mock_optimizer) + assert result is True + + # Now exceed the max + token_counts_over_limit = { + "input_tokens": 300, + "output_tokens": 300, + "total_tokens": 600 # Over the limit + } + mock_optimizer.predictor.llm.get_token_count = MagicMock(return_value=token_counts_over_limit) + + # Should stop + result = callback.on_step_end(mock_optimizer) + assert result is False \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..4ad14f0 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,60 @@ +# tests/conftest.py +import pytest + +from promptolution.config import ExperimentConfig +from tests.mocks.mock_llm import MockLLM +from tests.mocks.mock_task import MockTask +from tests.mocks.mock_predictor import MockPredictor + +from tests.mocks.mock_optimizer import MockOptimizer + +@pytest.fixture +def base_optimizer_config(): + """Fixture providing a basic optimizer configuration.""" + return ExperimentConfig( + 
optimizer_name="test_optimizer", + n_steps=5, + population_size=8, + random_seed=42 + ) + + +@pytest.fixture +def initial_prompts(): + """Fixture providing initial prompts for optimizer testing.""" + return [ + "Classify the sentiment of the text.", + "Determine if the text is positive or negative.", + "Analyze the sentiment in the following text." + ] + + +@pytest.fixture +def dummy_task(): + """Fixture providing a dummy task for optimizer testing.""" + task = MockTask(predetermined_scores=[0.6, 0.7, 0.8]) + return task + + +@pytest.fixture +def dummy_predictor(): + """Fixture providing a dummy predictor for optimizer testing.""" + return MockPredictor( + classes=["positive", "neutral", "negative"] + ) + + +@pytest.fixture +def dummy_llm(): + """Fixture providing a dummy LLM for optimizer testing.""" + llm = MockLLM() + llm._get_response = lambda prompts, system_prompts: [ + "Generated prompt for test" for _ in prompts + ] + return llm + + +@pytest.fixture +def mock_optimizer(): + """Fixture providing a MockOptimizer for testing.""" + return MockOptimizer() \ No newline at end of file diff --git a/tests/helpers/test_helpers.py b/tests/helpers/test_helpers.py new file mode 100644 index 0000000..dd212f1 --- /dev/null +++ b/tests/helpers/test_helpers.py @@ -0,0 +1,316 @@ +import pytest +import pandas as pd +import numpy as np +from unittest.mock import patch, MagicMock + +from promptolution.helpers import run_experiment, run_optimization, run_evaluation +from promptolution.config import ExperimentConfig +from tests.mocks.mock_llm import MockLLM +from tests.mocks.mock_predictor import MockPredictor +from tests.mocks.mock_task import MockTask + + +@pytest.fixture +def sample_df(): + """Fixture providing a sample DataFrame for testing.""" + data = { + 'x': [ + "This product is amazing!", + "I'm disappointed with this purchase.", + "The quality is average, nothing special.", + "Worst product ever, avoid at all costs!", + "Decent product, does what it's supposed to." + ], + 'y': [ + "positive", + "negative", + "neutral", + "negative", + "positive" + ] + } + return pd.DataFrame(data) + + +@pytest.fixture +def experiment_config(): + """Fixture providing a configuration for experiments.""" + return ExperimentConfig( + optimizer_name="evoprompt_ga", + task_name="classification", + task_description="Classify sentiment.", + llm_name="mock", + predictor_name="first_occurrence", + classes=["positive", "neutral", "negative"], + n_steps=2, + prepend_exemplars=False + ) + + +@pytest.fixture +def experiment_config_with_exemplars(): + """Fixture providing a configuration with exemplars enabled.""" + return ExperimentConfig( + optimizer_name="evoprompt_ga", + task_name="classification", + task_description="Classify sentiment.", + llm_name="mock", + predictor_name="first_occurrence", + classes=["positive", "neutral", "negative"], + n_steps=2, + prepend_exemplars=True, + exemplar_selector="random", + n_exemplars=2 + ) + + +@patch('promptolution.helpers.get_llm') +@patch('promptolution.helpers.get_predictor') +@patch('promptolution.helpers.get_task') +@patch('promptolution.helpers.get_optimizer') +def test_run_optimization(mock_get_optimizer, mock_get_task, mock_get_predictor, mock_get_llm, + sample_df, experiment_config): + """Test the run_optimization function.""" + # Set up mocks + mock_llm = MockLLM() + mock_predictor = MockPredictor(classes=experiment_config.classes) + mock_predictor.extraction_description = "Extraction description." 
+ mock_task = MockTask() + mock_optimizer = MagicMock() + + # Configure mocks to return our test objects + mock_get_llm.return_value = mock_llm + mock_get_predictor.return_value = mock_predictor + mock_get_task.return_value = mock_task + mock_get_optimizer.return_value = mock_optimizer + + # Set up optimizer to return some prompts + optimized_prompts = [ + "Classify this as positive or negative:", + "Determine the sentiment (positive/negative/neutral):", + "Is this text positive, negative, or neutral?" + ] + mock_optimizer.optimize.return_value = optimized_prompts + + # Run the function + result = run_optimization(sample_df, experiment_config) + + # Verify the results + assert result == optimized_prompts + + # Verify mocks were called + mock_get_llm.assert_called_once_with(config=experiment_config) + mock_get_predictor.assert_called_once_with(mock_llm, config=experiment_config) + mock_get_task.assert_called_once_with(sample_df, experiment_config) + mock_get_optimizer.assert_called_once_with( + predictor=mock_predictor, + meta_llm=mock_llm, + task=mock_task, + config=experiment_config + ) + mock_optimizer.optimize.assert_called_once_with(n_steps=experiment_config.n_steps) + + +@patch('promptolution.helpers.get_llm') +@patch('promptolution.helpers.get_predictor') +@patch('promptolution.helpers.get_task') +@patch('promptolution.helpers.get_optimizer') +@patch('promptolution.helpers.get_exemplar_selector') +def test_run_optimization_with_exemplars(mock_get_exemplar_selector, mock_get_optimizer, + mock_get_task, mock_get_predictor, mock_get_llm, + sample_df, experiment_config_with_exemplars): + """Test run_optimization with exemplar selection enabled.""" + # Set up mocks + mock_llm = MockLLM() + mock_predictor = MockPredictor(classes=experiment_config_with_exemplars.classes) + mock_predictor.extraction_description = "Extraction description." + mock_task = MockTask() + mock_optimizer = MagicMock() + mock_selector = MagicMock() + + # Configure mocks to return our test objects + mock_get_llm.return_value = mock_llm + mock_get_predictor.return_value = mock_predictor + mock_get_task.return_value = mock_task + mock_get_optimizer.return_value = mock_optimizer + mock_get_exemplar_selector.return_value = mock_selector + + # Set up optimizer to return some prompts + optimized_prompts = [ + "Classify this as positive or negative:", + "Determine the sentiment (positive/negative/neutral):" + ] + mock_optimizer.optimize.return_value = optimized_prompts + + # Set up exemplar selector + exemplar_prompts = [ + "Example 1: 'Great product!' - positive\nExample 2: 'Terrible!' - negative\nClassify this as positive or negative:", + "Example 1: 'Great product!' - positive\nExample 2: 'Terrible!' 
- negative\nDetermine the sentiment (positive/negative/neutral):" + ] + mock_selector.select_exemplars.side_effect = exemplar_prompts + + # Run the function + result = run_optimization(sample_df, experiment_config_with_exemplars) + + # Verify the results + assert result == exemplar_prompts + + # Verify mocks were called + mock_get_llm.assert_called_once_with(config=experiment_config_with_exemplars) + mock_get_predictor.assert_called_once_with(mock_llm, config=experiment_config_with_exemplars) + mock_get_task.assert_called_once_with(sample_df, experiment_config_with_exemplars) + mock_get_optimizer.assert_called_once_with( + predictor=mock_predictor, + meta_llm=mock_llm, + task=mock_task, + config=experiment_config_with_exemplars + ) + mock_optimizer.optimize.assert_called_once_with(n_steps=experiment_config_with_exemplars.n_steps) + + # Verify exemplar selector was called + mock_get_exemplar_selector.assert_called_once_with( + experiment_config_with_exemplars.exemplar_selector, + mock_task, + mock_predictor + ) + assert mock_selector.select_exemplars.call_count == 2 + + +@patch('promptolution.helpers.get_llm') +@patch('promptolution.helpers.get_predictor') +@patch('promptolution.helpers.get_task') +def test_run_evaluation(mock_get_task, mock_get_predictor, mock_get_llm, + sample_df, experiment_config): + """Test the run_evaluation function.""" + # Set up mocks + mock_llm = MockLLM() + mock_predictor = MockPredictor() + + # Use MagicMock instead of MockTask + mock_task = MagicMock() + mock_task.classes = ["positive", "neutral", "negative"] + + # Configure mocks to return our test objects + mock_get_llm.return_value = mock_llm + mock_get_predictor.return_value = mock_predictor + mock_get_task.return_value = mock_task + + # Set up task to return scores + prompts = [ + "Classify this as positive or negative:", + "Determine the sentiment (positive/negative/neutral):", + "Is this text positive, negative, or neutral?" 
+ ] + + # Now this will work because mock_task is a MagicMock + mock_task.evaluate.return_value = np.array([0.8, 0.7, 0.9]) + + # Run the function + result = run_evaluation(sample_df, experiment_config, prompts) + + # Verify the results + assert isinstance(result, pd.DataFrame) + assert "prompt" in result.columns + assert "score" in result.columns + assert len(result) == 3 + + # Verify the DataFrame is sorted by score (descending) + assert result.iloc[0]["score"] == 0.9 + assert result.iloc[1]["score"] == 0.8 + assert result.iloc[2]["score"] == 0.7 + + # Verify mocks were called + mock_get_llm.assert_called_once_with(config=experiment_config) + mock_get_predictor.assert_called_once_with(mock_llm, config=experiment_config) + mock_get_task.assert_called_once_with(sample_df, experiment_config) + mock_task.evaluate.assert_called_once_with(prompts, mock_predictor) + + +@patch('promptolution.helpers.run_optimization') +@patch('promptolution.helpers.run_evaluation') +def test_run_experiment(mock_run_evaluation, mock_run_optimization, + sample_df, experiment_config): + """Test the run_experiment function.""" + # Set up mocks + optimized_prompts = [ + "Classify this as positive or negative:", + "Determine the sentiment (positive/negative/neutral):" + ] + mock_run_optimization.return_value = optimized_prompts + + # Create a sample results DataFrame + eval_results = pd.DataFrame({ + "prompt": optimized_prompts, + "score": [0.8, 0.7] + }) + mock_run_evaluation.return_value = eval_results + + # Run the function + result = run_experiment(sample_df, experiment_config) + + # Verify results + assert result is eval_results + + # Verify the train-test split + mock_run_optimization_args = mock_run_optimization.call_args[0] + mock_run_evaluation_args = mock_run_evaluation.call_args[0] + + train_df = mock_run_optimization_args[0] + test_df = mock_run_evaluation_args[0] + + # Check that we have a 80-20 split + assert len(train_df) == 4 # 80% of 5 rows + assert len(test_df) == 1 # 20% of 5 rows + + # Check that no data is lost + assert len(train_df) + len(test_df) == len(sample_df) + + # Verify the prompts were passed to evaluation + assert mock_run_evaluation.call_args[0][2] == optimized_prompts + +def test_helpers_integration(sample_df, experiment_config): + """Integration test for helper functions - this tests the full experiment flow.""" + # This test will use the actual functions but with mocked components + with patch('promptolution.helpers.get_llm') as mock_get_llm, \ + patch('promptolution.helpers.get_predictor') as mock_get_predictor, \ + patch('promptolution.helpers.get_task') as mock_get_task, \ + patch('promptolution.helpers.get_optimizer') as mock_get_optimizer: + + # Set up mocks + mock_llm = MockLLM() + mock_predictor = MockPredictor(classes=experiment_config.classes) + mock_predictor.extraction_description = "Extract the sentiment." 
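+        # Unlike the unit tests above, only the component factories are patched here,
+        # so run_experiment's own train/test split and optimize -> evaluate flow run for real.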
+ + # Use a MagicMock instead of MockTask + mock_task = MagicMock() + mock_task.classes = ["positive", "neutral", "negative"] + mock_task.evaluate = MagicMock(return_value=np.array([0.85, 0.75])) + + mock_optimizer = MagicMock() + + # Configure mocks + mock_get_llm.return_value = mock_llm + mock_get_predictor.return_value = mock_predictor + mock_get_task.return_value = mock_task + mock_get_optimizer.return_value = mock_optimizer + + # Set up optimizer to return prompts + optimized_prompts = [ + "Classify sentiment:", + "Determine if positive/negative:" + ] + mock_optimizer.optimize.return_value = optimized_prompts + + # Run the experiment + result = run_experiment(sample_df, experiment_config) + + # Verify results + assert isinstance(result, pd.DataFrame) + assert len(result) == 2 + assert all(p in result["prompt"].values for p in optimized_prompts) + + # Verify optimization was called + mock_optimizer.optimize.assert_called_once() + + # Verify evaluation was called + mock_task.evaluate.assert_called() \ No newline at end of file diff --git a/tests/llms/test_api_llm.py b/tests/llms/test_api_llm.py new file mode 100644 index 0000000..8529d2c --- /dev/null +++ b/tests/llms/test_api_llm.py @@ -0,0 +1,39 @@ +from unittest.mock import MagicMock, patch + +from promptolution.llms.api_llm import APILLM + + +def test_api_llm_initialization(): + """Test that APILLM initializes correctly.""" + # Create patches for all dependencies + with patch('promptolution.llms.api_llm.AsyncOpenAI') as mock_client_class, \ + patch('promptolution.llms.api_llm.asyncio') as mock_asyncio: + + # Configure the mocks + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + mock_semaphore = MagicMock() + mock_asyncio.Semaphore.return_value = mock_semaphore + + # Create APILLM instance + api_llm = APILLM( + api_url="https://api.example.com", + llm="gpt-4", + token="test-token", + max_concurrent_calls=10 + ) + + # Verify AsyncOpenAI was called correctly + mock_client_class.assert_called_once() + args, kwargs = mock_client_class.call_args + assert kwargs["base_url"] == "https://api.example.com" + assert kwargs["api_key"] == "test-token" + + # Verify semaphore was created + mock_asyncio.Semaphore.assert_called_once_with(10) + + # Verify instance attributes + assert api_llm.api_url == "https://api.example.com" + assert api_llm.llm == "gpt-4" + assert api_llm.max_concurrent_calls == 10 \ No newline at end of file diff --git a/tests/llms/test_base_llm.py b/tests/llms/test_base_llm.py new file mode 100644 index 0000000..26fe1c7 --- /dev/null +++ b/tests/llms/test_base_llm.py @@ -0,0 +1,26 @@ +from promptolution.llms.base_llm import DummyLLM + +def test_base_llm_token_counting(): + """Test token counting functionality.""" + llm = DummyLLM() + + # Get initial token count + initial_count = llm.get_token_count() + assert initial_count["input_tokens"] == 0 + assert initial_count["output_tokens"] == 0 + + # Process some text + prompts = ["This is a test prompt with several words."] + llm.get_response(prompts) + + # Check updated token count + updated_count = llm.get_token_count() + assert updated_count["input_tokens"] > 0 + assert updated_count["output_tokens"] > 0 + + # Reset token count + llm.reset_token_count() + reset_count = llm.get_token_count() + assert reset_count["input_tokens"] == 0 + assert reset_count["output_tokens"] == 0 + diff --git a/tests/llms/test_local_llm.py b/tests/llms/test_local_llm.py new file mode 100644 index 0000000..966abb1 --- /dev/null +++ b/tests/llms/test_local_llm.py @@ -0,0 +1,80 
@@ +import pytest +from unittest.mock import MagicMock, patch + +from promptolution.llms.local_llm import LocalLLM + + +@pytest.fixture +def mock_local_dependencies(): + """Set up mocks for LocalLLM dependencies.""" + with patch('promptolution.llms.local_llm.transformers') as mock_transformers, \ + patch('promptolution.llms.local_llm.torch') as mock_torch: + + # Configure mock pipeline + mock_pipeline = MagicMock() + mock_pipeline.return_value = [{"generated_text": "Mock response 1"}, {"generated_text": "Mock response 2"}] + mock_transformers.pipeline.return_value = mock_pipeline + + # Configure mock tokenizer + mock_pipeline.tokenizer = MagicMock() + mock_pipeline.tokenizer.pad_token_id = None + mock_pipeline.tokenizer.eos_token_id = 50256 + mock_pipeline.tokenizer.padding_side = None + + yield { + 'transformers': mock_transformers, + 'pipeline': mock_pipeline, + 'torch': mock_torch + } + + +def test_local_llm_initialization(mock_local_dependencies): + """Test that LocalLLM initializes correctly.""" + # Create LocalLLM instance + local_llm = LocalLLM( + model_id="gpt2", + batch_size=4 + ) + + # Verify pipeline was created correctly + mock_local_dependencies['transformers'].pipeline.assert_called_once_with( + "text-generation", + model="gpt2", + model_kwargs={"torch_dtype": mock_local_dependencies['torch'].bfloat16}, + device_map="auto", + max_new_tokens=256, + batch_size=4, + num_return_sequences=1, + return_full_text=False, + ) + + # Verify tokenizer attributes were set + assert local_llm.pipeline.tokenizer.pad_token_id == local_llm.pipeline.tokenizer.eos_token_id + assert local_llm.pipeline.tokenizer.padding_side == "left" + + +def test_local_llm_get_response(mock_local_dependencies): + """Test that LocalLLM._get_response works correctly.""" + # Create LocalLLM instance + local_llm = LocalLLM(model_id="gpt2") + + # Mock torch.no_grad context + with patch('promptolution.llms.local_llm.torch.no_grad') as mock_no_grad: + mock_no_grad.return_value.__enter__ = MagicMock() + mock_no_grad.return_value.__exit__ = MagicMock() + + # Call _get_response + prompts = ["Test prompt 1", "Test prompt 2"] + system_prompts = ["Be helpful", "Be concise"] + responses = local_llm._get_response(prompts, system_prompts) + + # Verify pipeline was called + local_llm.pipeline.assert_called_once() + + # Verify torch.no_grad was used + mock_no_grad.assert_called_once() + + # Verify responses + assert len(responses) == 2 + assert responses[0] == "Mock response 1" + assert responses[1] == "Mock response 2" \ No newline at end of file diff --git a/tests/llms/test_vllm.py b/tests/llms/test_vllm.py new file mode 100644 index 0000000..8bd7096 --- /dev/null +++ b/tests/llms/test_vllm.py @@ -0,0 +1,112 @@ +import pytest +from unittest.mock import MagicMock, patch + +from promptolution.llms.vllm import VLLM + +@pytest.fixture +def mock_vllm_dependencies(): + """Set up comprehensive mocks for VLLM dependencies.""" + # Mock the key components + with patch('promptolution.llms.vllm.LLM') as mock_llm_class, \ + patch('promptolution.llms.vllm.SamplingParams') as mock_sampling_params, \ + patch('promptolution.llms.vllm.AutoTokenizer') as mock_tokenizer_class, \ + patch('promptolution.llms.vllm.torch') as mock_torch: + + # Create and configure mock LLM + mock_llm = MagicMock() + mock_llm_class.return_value = mock_llm + + # Configure LLM engine with cache config for batch size calculation + mock_cache_config = MagicMock() + mock_cache_config.num_gpu_blocks = 100 + mock_cache_config.block_size = 16 + + mock_executor = MagicMock() + 
mock_executor.cache_config = mock_cache_config + + mock_engine = MagicMock() + mock_engine.model_executor = mock_executor + + mock_llm.llm_engine = mock_engine + + # Set up the generate method to return appropriate number of responses + def mock_generate_side_effect(prompts_list, *args, **kwargs): + """Return one output per input prompt""" + return [ + MagicMock(outputs=[MagicMock(text=f"Mocked response for prompt {i}")]) + for i, _ in enumerate(prompts_list) + ] + + # Use side_effect instead of return_value for dynamic behavior + mock_llm.generate.side_effect = mock_generate_side_effect + + # Configure mock tokenizer + mock_tokenizer = MagicMock() + mock_tokenizer.encode.return_value = [1, 2, 3, 4, 5] + mock_tokenizer.apply_chat_template.return_value = "" + mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer + + yield { + 'llm_class': mock_llm_class, + 'llm': mock_llm, + 'tokenizer_class': mock_tokenizer_class, + 'tokenizer': mock_tokenizer, + 'sampling_params': mock_sampling_params + } + + +def test_vllm_get_response(mock_vllm_dependencies): + """Test that VLLM._get_response works correctly with explicit batch_size.""" + # Create VLLM instance with explicit batch_size to avoid calculation + vllm = VLLM( + model_id="mock-model", + batch_size=4 # Set an explicit batch_size to avoid computation + ) + + # Call get_response + prompts = ["Test prompt 1", "Test prompt 2"] + system_prompts = ["Be helpful", "Be concise"] + responses = vllm._get_response(prompts, system_prompts) + + # Verify tokenizer was used correctly + assert mock_vllm_dependencies['tokenizer'].apply_chat_template.call_count == 2 + + # Verify LLM generate was called + mock_vllm_dependencies['llm'].generate.assert_called_once() + + # Verify responses + assert len(responses) == 2 + assert responses[0] == "Mocked response for prompt 0" + assert responses[1] == "Mocked response for prompt 1" + + +def test_vllm_with_auto_batch_size(mock_vllm_dependencies): + """Test VLLM with automatic batch size calculation.""" + # Create VLLM instance with batch_size=None to trigger auto calculation + vllm = VLLM( + model_id="mock-model", + batch_size=None, + max_model_len=2048 + ) + + # Force a non-zero batch size + mock_vllm_dependencies['llm'].llm_engine.model_executor.cache_config.num_gpu_blocks = 1000 + + # Create a new instance to recalculate batch size + vllm = VLLM( + model_id="mock-model", + batch_size=None, + max_model_len=2048 + ) + + # Verify batch_size is greater than zero + assert vllm.batch_size > 0, "Batch size should be greater than zero" + + # Test with a single prompt + prompts = ["Test prompt"] + system_prompts = ["Be helpful"] + responses = vllm._get_response(prompts, system_prompts) + + # Verify we get exactly one response for one prompt + assert len(responses) == 1 + assert responses[0] == "Mocked response for prompt 0" \ No newline at end of file diff --git a/tests/mocks/mock_llm.py b/tests/mocks/mock_llm.py new file mode 100644 index 0000000..9f3859a --- /dev/null +++ b/tests/mocks/mock_llm.py @@ -0,0 +1,76 @@ +from typing import List + +from promptolution.llms.base_llm import BaseLLM + + +class MockLLM(BaseLLM): + """Mock LLM for testing purposes. + + This class allows precise control over responses for testing without loading actual models. + """ + + def __init__(self, predetermined_responses=None, *args, **kwargs): + """Initialize the MockLLM with optional predetermined responses. 
+ + Args: + predetermined_responses (Dict or List): Mapping from prompts to responses, + or a list of responses to return in sequence. + *args, **kwargs: Arguments to pass to BaseLLM. + """ + super().__init__(*args, **kwargs) + self.predetermined_responses = predetermined_responses or {} + self.call_history = [] + self.response_index = 0 + + def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]: + """Generate predetermined responses for the given prompts. + + Records the inputs for later verification in tests. + + Args: + prompts (List[str]): Input prompts + system_prompts (List[str]): System prompts + + Returns: + List[str]: Predetermined responses + """ + # Record the call for test assertions + self.call_history.append({ + 'prompts': prompts, + 'system_prompts': system_prompts + }) + + # Handle case where there's a single system prompt for multiple prompts + if len(system_prompts) == 1 and len(prompts) > 1: + system_prompts = system_prompts * len(prompts) + + results = [] + for i, prompt in enumerate(prompts): + # Handle dictionary-based responses + if isinstance(self.predetermined_responses, dict): + # Try exact match first + if prompt in self.predetermined_responses: + results.append(self.predetermined_responses[prompt]) + # Try system prompt combination + elif i < len(system_prompts) and (prompt, system_prompts[i]) in self.predetermined_responses: + results.append(self.predetermined_responses[(prompt, system_prompts[i])]) + # Default response + else: + results.append(f"Mock response for: {prompt}") + # Handle list-based responses (return in sequence) + elif isinstance(self.predetermined_responses, list): + if self.response_index < len(self.predetermined_responses): + results.append(self.predetermined_responses[self.response_index]) + self.response_index += 1 + else: + results.append(f"Mock response for: {prompt}") + # Default fallback + else: + results.append(f"Mock response for: {prompt}") + + return results + + def reset(self): + """Reset the call history and response index.""" + self.call_history = [] + self.response_index = 0 \ No newline at end of file diff --git a/tests/mocks/mock_optimizer.py b/tests/mocks/mock_optimizer.py new file mode 100644 index 0000000..90332c7 --- /dev/null +++ b/tests/mocks/mock_optimizer.py @@ -0,0 +1,56 @@ +from unittest.mock import MagicMock + +class MockOptimizer: + """Mock optimizer for testing callbacks and other components. + + This class simulates an optimizer without requiring actual optimization processes. + """ + + def __init__(self, prompts=None, scores=None): + """Initialize the MockOptimizer with optional prompts and scores. + + Args: + prompts: List of prompts to use (defaults to sample prompts) + scores: List of scores to use (defaults to sample scores) + """ + self.prompts = prompts or [ + "Sample prompt 1", + "Sample prompt 2", + "Sample prompt 3" + ] + self.scores = scores or [0.8, 0.7, 0.6] + + # Create mock LLMs + self.meta_llm = MagicMock() + self.meta_llm.input_token_count = 100 + self.meta_llm.output_token_count = 50 + + # Create mock predictor + self.predictor = MagicMock() + self.predictor.llm = MagicMock() + + # Set up token counting + self.token_counts = { + "input_tokens": 200, + "output_tokens": 100, + "total_tokens": 300 + } + self.predictor.llm.get_token_count = MagicMock(return_value=self.token_counts) + + def set_token_counts(self, input_tokens=None, output_tokens=None, total_tokens=None): + """Set custom token counts for testing. 
+ + Args: + input_tokens: Number of input tokens + output_tokens: Number of output tokens + total_tokens: Total number of tokens + """ + if input_tokens is not None: + self.token_counts["input_tokens"] = input_tokens + if output_tokens is not None: + self.token_counts["output_tokens"] = output_tokens + if total_tokens is not None: + self.token_counts["total_tokens"] = total_tokens + + # Update the mock method + self.predictor.llm.get_token_count = MagicMock(return_value=self.token_counts) \ No newline at end of file diff --git a/tests/mocks/mock_predictor.py b/tests/mocks/mock_predictor.py new file mode 100644 index 0000000..53e21b5 --- /dev/null +++ b/tests/mocks/mock_predictor.py @@ -0,0 +1,66 @@ +from typing import List, Optional, Tuple +import numpy as np + +from promptolution.llms.base_llm import BaseLLM +from promptolution.predictors.base_predictor import BasePredictor + + +class MockPredictor(BasePredictor): + """Mock predictor for testing purposes. + + This class allows precise control over prediction behavior for testing + without loading actual models or running real inference. + """ + + def __init__( + self, + llm: Optional[BaseLLM] = None, + classes: List[str] = None, + predetermined_predictions: Optional[dict] = None, + *args, + **kwargs + ): + """Initialize the MockPredictor. + + Args: + llm: Language model to use (can be a MockLLM) + classes: List of possible class labels + predetermined_predictions: Dictionary mapping inputs to predictions + *args, **kwargs: Additional arguments to pass to BasePredictor + """ + super().__init__(llm=llm, *args, **kwargs) + self.classes = classes or ["class_a", "class_b", "class_c"] + self.predetermined_predictions = predetermined_predictions or {} + self.call_history = [] + + def _extract_preds(self, preds: List[str], shape: Tuple[int, int] = None) -> np.ndarray: + """Extract predictions based on predetermined mapping or default behavior. + + Args: + preds: Raw text predictions + shape: Shape for reshaping results (optional) + + Returns: + np.ndarray: Extracted predictions + """ + # Record call for test assertions + self.call_history.append({ + 'preds': preds, + 'shape': shape + }) + + results = [] + for pred in preds: + if pred in self.predetermined_predictions: + results.append(self.predetermined_predictions[pred]) + else: + # Default to first class if no match + results.append(self.classes[0]) + + results_array = np.array(results) + + # Reshape if shape is provided + if shape is not None: + results_array = results_array.reshape(shape) + + return results_array \ No newline at end of file diff --git a/tests/mocks/mock_task.py b/tests/mocks/mock_task.py new file mode 100644 index 0000000..061366d --- /dev/null +++ b/tests/mocks/mock_task.py @@ -0,0 +1,99 @@ +import numpy as np +from typing import List + +from promptolution.tasks.base_task import BaseTask + + +class MockTask(BaseTask): + """Mock task for testing optimizers. + + This class simulates a classification task without requiring + actual data or model inference. + """ + + def __init__(self, predetermined_scores=None): + """Initialize the MockTask with optional predetermined scores. + + Args: + predetermined_scores: Dictionary mapping prompts to scores, + or a list of scores to return in sequence, or a function + that generates scores based on prompts. 
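+                Illustrative examples (not used by any test directly):
+                MockTask(predetermined_scores={"Classify:": 0.9}) or
+                MockTask(predetermined_scores=[0.6, 0.7, 0.8]).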
+ """ + super().__init__() + self.predetermined_scores = predetermined_scores or {} + self.call_history = [] + self.score_index = 0 + + # Default attributes similar to ClassificationTask + self.description = "Mock classification task" + self.classes = ["positive", "neutral", "negative"] + self.xs = np.array(["Sample text 1", "Sample text 2", "Sample text 3"]) + self.ys = np.array(["positive", "negative", "neutral"]) + self.initial_prompts = ["Classify:", "Determine:"] + + def evaluate( + self, + prompts: List[str], + predictor, + system_prompts: List[str] = None, + subsample: bool = False, + n_samples: int = 20, + return_seq: bool = False + ) -> np.ndarray: + """Evaluate prompts with predetermined scores. + + Args: + prompts: List of prompts to evaluate + predictor: Predictor (ignored in mock) + system_prompts: System prompts (ignored in mock) + subsample: Whether to subsample (ignored in mock) + n_samples: Number of samples (ignored in mock) + return_seq: Whether to return sequences + + Returns: + np.ndarray of scores, and optionally sequences + """ + # Record the call + self.call_history.append({ + 'prompts': prompts, + 'predictor': predictor, + 'system_prompts': system_prompts, + 'subsample': subsample, + 'n_samples': n_samples, + 'return_seq': return_seq + }) + + scores = [] + for prompt in prompts: + # Handle different types of predetermined_scores + if callable(self.predetermined_scores): + # If it's a function, call it with the prompt + score = self.predetermined_scores(prompt) + elif isinstance(self.predetermined_scores, dict) and prompt in self.predetermined_scores: + # If it's a dict, look up the prompt + score = self.predetermined_scores[prompt] + elif isinstance(self.predetermined_scores, list): + # If it's a list, return items in sequence (cycling if needed) + if self.score_index < len(self.predetermined_scores): + score = self.predetermined_scores[self.score_index] + self.score_index = (self.score_index + 1) % len(self.predetermined_scores) + else: + score = 0.5 # Default score + else: + # Generate a somewhat predictable score based on prompt length + # (longer prompts get slightly higher scores) + score = 0.5 + 0.01 * (len(prompt) % 10) + + scores.append(score) + + scores_array = np.array(scores) + + if return_seq: + # Generate dummy sequences + seqs = [ + [f"Input: {x}\nOutput: {prompt}" for x in self.xs] + for prompt in prompts + ] + return scores_array, seqs + + return scores_array \ No newline at end of file diff --git a/tests/optimizers/test_evoprompt_de.py b/tests/optimizers/test_evoprompt_de.py new file mode 100644 index 0000000..473c24d --- /dev/null +++ b/tests/optimizers/test_evoprompt_de.py @@ -0,0 +1,160 @@ +import pytest +import numpy as np +from unittest.mock import patch + +from promptolution.optimizers.evoprompt_de import EvoPromptDE +from tests.mocks.mock_llm import MockLLM +from tests.mocks.mock_task import MockTask +from tests.mocks.mock_predictor import MockPredictor + + +@pytest.fixture +def meta_llm_for_de(): + """Fixture providing a MockLLM for EvoPromptDE with appropriate responses.""" + llm = MockLLM() + + # Set up response generation for meta prompts + def get_response_for_de(prompts, system_prompts=None): + return [f"DE improved prompt {i}" for i in range(len(prompts))] + + llm.get_response = get_response_for_de + + return llm + + +@pytest.fixture +def initial_prompts(): + """Fixture providing a set of initial prompts for testing.""" + return [ + "Classify the following text as positive or negative.", + "Determine if the sentiment of the text is 
positive or negative.", + "Is the following text positive or negative?", + "Analyze the sentiment in the following text.", + "Evaluate the sentiment of the text.", + ] + + +@pytest.fixture +def mock_task_for_de(): + """Fixture providing a MockTask for EvoPromptDE.""" + # Create scores that improve for "DE improved" prompts + def score_function(prompt): + if "DE improved" in prompt: + return 0.85 # Higher score for DE-generated prompts + return 0.7 # Base score for initial prompts + + return MockTask(predetermined_scores=score_function) + + +def test_evoprompt_de_initialization(meta_llm_for_de, initial_prompts, mock_task_for_de): + """Test that EvoPromptDE initializes correctly.""" + optimizer = EvoPromptDE( + predictor=MockPredictor(), + task=mock_task_for_de, + initial_prompts=initial_prompts, + prompt_template="Create a new prompt from: , , , ", + meta_llm=meta_llm_for_de, + donor_random=False, + n_eval_samples=15 + ) + + # Verify initialization + assert optimizer.meta_llm == meta_llm_for_de + assert optimizer.prompt_template == "Create a new prompt from: , , , " + assert optimizer.donor_random == False + assert optimizer.n_eval_samples == 15 + assert optimizer.prompts == initial_prompts + + +def test_evoprompt_de_pre_optimization_loop(meta_llm_for_de, initial_prompts, mock_task_for_de): + """Test the _pre_optimization_loop method.""" + optimizer = EvoPromptDE( + predictor=MockPredictor(), + task=mock_task_for_de, + initial_prompts=initial_prompts, + prompt_template="Create a new prompt from: , , , ", + meta_llm=meta_llm_for_de + ) + + # Call pre-optimization loop + optimizer._pre_optimization_loop() + + # Verify that scores were evaluated + assert hasattr(optimizer, 'scores') + assert len(optimizer.scores) == len(initial_prompts) + + # Verify that prompts and scores were sorted by score (descending) + assert all(optimizer.scores[i] >= optimizer.scores[i+1] for i in range(len(optimizer.scores)-1)) + + +def test_evoprompt_de_step_with_donor_random(meta_llm_for_de, initial_prompts, mock_task_for_de): + """Test the _step method with donor_random=True.""" + optimizer = EvoPromptDE( + predictor=MockPredictor(), + task=mock_task_for_de, + initial_prompts=initial_prompts, + prompt_template="Create a new prompt from: , , , ", + meta_llm=meta_llm_for_de, + donor_random=True + ) + + # Set up initial state + optimizer.prompts = initial_prompts + optimizer.scores = [0.7, 0.75, 0.65, 0.8, 0.6] + + # Run step + new_prompts = optimizer._step() + + # Verify that step returned expected prompts + assert len(new_prompts) == len(initial_prompts) + + # Prompts should be sorted by score + assert all(optimizer.scores[i] >= optimizer.scores[i+1] for i in range(len(optimizer.scores)-1)) + + +def test_evoprompt_de_step_with_best_donor(meta_llm_for_de, initial_prompts, mock_task_for_de): + """Test the _step method with donor_random=False (using best prompt as donor).""" + optimizer = EvoPromptDE( + predictor=MockPredictor(), + task=mock_task_for_de, + initial_prompts=initial_prompts, + prompt_template="Create a new prompt from: , , , ", + meta_llm=meta_llm_for_de, + donor_random=False + ) + + # Set up initial state + optimizer.prompts = initial_prompts + optimizer.scores = [0.8, 0.7, 0.6, 0.4, 0.3] # First prompt is best + + # Patch np.random.choice to control randomness + with patch('numpy.random.choice') as mock_choice: + # Set up mock to return predictable choices + mock_choice.side_effect = lambda arr, size=None, replace=None: np.array([arr[0], arr[1], arr[2]]) + + # Run step + new_prompts = optimizer._step() 
+ + # Verify that step returned expected prompts + assert len(new_prompts) == len(initial_prompts) + + +def test_evoprompt_de_optimize(meta_llm_for_de, initial_prompts, mock_task_for_de): + """Test the optimize method.""" + optimizer = EvoPromptDE( + predictor=MockPredictor(), + task=mock_task_for_de, + initial_prompts=initial_prompts, + prompt_template="Create a new prompt from: , , , ", + meta_llm=meta_llm_for_de + ) + + # Run optimization for 2 steps + optimized_prompts = optimizer.optimize(2) + + # Verify that optimization completed and returned prompts + assert len(optimized_prompts) == len(initial_prompts) + + # The prompts should have been improved + assert any("DE improved" in prompt for prompt in optimized_prompts) + \ No newline at end of file diff --git a/tests/optimizers/test_evoprompt_ga.py b/tests/optimizers/test_evoprompt_ga.py new file mode 100644 index 0000000..64090be --- /dev/null +++ b/tests/optimizers/test_evoprompt_ga.py @@ -0,0 +1,251 @@ +import pytest + +from promptolution.optimizers.evoprompt_ga import EvoPromptGA +from promptolution.config import ExperimentConfig +from tests.mocks.mock_llm import MockLLM +from tests.mocks.mock_task import MockTask +from tests.mocks.mock_predictor import MockPredictor + + +@pytest.fixture +def meta_llm_mock(): + """Fixture providing a MockLLM configured for meta-prompt responses.""" + # Responses for meta prompts that extract the prompt between tags + responses = { + # Simple replacement pattern for testing + "Combine these prompts to create a better one: and .": "Combined prompt", + "Improve upon these prompts: and .": "Improved prompt", + } + + # For more complex patterns, we can use a function + def meta_prompt_response_generator(prompt): + if "Combine these prompts" in prompt: + # Extract the prompts from the meta-prompt + parts = prompt.split("")[1].split("")[0] + return f"Combined: {parts[:20]}..." 
+ elif "Improve upon" in prompt: + return "Improved version of prompt" + else: + return "Default meta-response" + + # Create a more sophisticated mock that generates responses based on inputs + llm = MockLLM() + llm._get_response = lambda prompts, system_prompts: [ + f"Meta-generated prompt for input {i}" for i, _ in enumerate(prompts) + ] + + return llm + + +@pytest.fixture +def initial_prompts(): + """Fixture providing a set of initial prompts for testing.""" + return [ + "Classify the following text as positive or negative.", + "Determine if the sentiment of the text is positive or negative.", + "Is the following text positive or negative?", + ] + + +@pytest.fixture +def experiment_config(): + """Fixture providing a basic experiment configuration.""" + return ExperimentConfig( + optimizer_name="test_optimizer", + n_steps=3, + population_size=3, + random_seed=42 + ) + + +@pytest.fixture +def mock_task_with_scores(): + """Fixture providing a MockTask with predetermined scoring behavior.""" + # A function that generates scores based on the prompt + def score_function(prompt): + # Prefer longer prompts for testing purposes + return min(0.9, 0.5 + 0.01 * len(prompt)) + + return MockTask(predetermined_scores=score_function) + + +def test_evoprompt_ga_initialization(meta_llm_mock, initial_prompts, experiment_config, mock_task_with_scores): + """Test that EvoPromptGA initializes correctly.""" + optimizer = EvoPromptGA( + prompt_template="Combine these prompts to create a better one: and .", + meta_llm=meta_llm_mock, + selection_mode="random", + config=experiment_config, + initial_prompts=initial_prompts, + task=mock_task_with_scores, + predictor=MockPredictor() + ) + + # Verify initialization + assert optimizer.prompt_template == "Combine these prompts to create a better one: and ." 
+ assert optimizer.meta_llm == meta_llm_mock + assert optimizer.selection_mode == "random" + assert optimizer.config.optimizer_name == "test_optimizer" + assert optimizer.config.n_steps == 3 + assert optimizer.config.population_size == 3 + assert optimizer.task == mock_task_with_scores + assert optimizer.prompts == initial_prompts + + +def test_evoprompt_ga_pre_optimization_loop(meta_llm_mock, initial_prompts, experiment_config, mock_task_with_scores): + """Test the _pre_optimization_loop method.""" + optimizer = EvoPromptGA( + prompt_template="Combine these prompts to create a better one: and .", + meta_llm=meta_llm_mock, + selection_mode="random", + config=experiment_config, + initial_prompts=initial_prompts, + task=mock_task_with_scores, + predictor=MockPredictor() + ) + + # Call _pre_optimization_loop directly + optimizer._pre_optimization_loop() + + # Verify that the task's evaluate method was called + assert len(mock_task_with_scores.call_history) == 1 + assert mock_task_with_scores.call_history[0]['prompts'] == initial_prompts + + # Verify that scores and prompts were set and sorted + assert hasattr(optimizer, 'scores') + assert len(optimizer.scores) == len(initial_prompts) + + # Check if sorting by score happened + assert optimizer.scores == sorted(optimizer.scores, reverse=True) + + +def test_evoprompt_ga_crossover(meta_llm_mock, initial_prompts, experiment_config): + """Test the _crossover method.""" + optimizer = EvoPromptGA( + prompt_template="Combine these prompts to create a better one: and .", + meta_llm=meta_llm_mock, + selection_mode="random", + config=experiment_config, + initial_prompts=initial_prompts, + task=MockTask(), + predictor=MockPredictor() + ) + + # Set up scores for testing + optimizer.prompts = initial_prompts + optimizer.scores = [0.8, 0.7, 0.6] + + # Test random selection mode + optimizer.selection_mode = "random" + child_prompts_random = optimizer._crossover(optimizer.prompts, optimizer.scores) + assert len(child_prompts_random) == len(initial_prompts) + + # Test wheel selection mode + optimizer.selection_mode = "wheel" + child_prompts_wheel = optimizer._crossover(optimizer.prompts, optimizer.scores) + assert len(child_prompts_wheel) == len(initial_prompts) + + # Test tournament selection mode + optimizer.selection_mode = "tour" + child_prompts_tour = optimizer._crossover(optimizer.prompts, optimizer.scores) + assert len(child_prompts_tour) == len(initial_prompts) + + +def test_evoprompt_ga_step(meta_llm_mock, initial_prompts, experiment_config, mock_task_with_scores): + """Test the _step method.""" + optimizer = EvoPromptGA( + prompt_template="Combine these prompts to create a better one: and .", + meta_llm=meta_llm_mock, + selection_mode="random", + config=experiment_config, + initial_prompts=initial_prompts, + task=mock_task_with_scores, + predictor=MockPredictor() + ) + + # Set up state for testing + optimizer.prompts = initial_prompts + optimizer.scores = [0.8, 0.7, 0.6] + + # Call the step method + new_prompts = optimizer._step() + + # Verify results + assert len(new_prompts) == len(initial_prompts) + + # Check that task.evaluate was called + assert len(mock_task_with_scores.call_history) >= 1 + + # Check that scores were updated + assert hasattr(optimizer, 'scores') + assert len(optimizer.scores) == len(initial_prompts) + + +def test_evoprompt_ga_optimize(meta_llm_mock, initial_prompts, experiment_config, mock_task_with_scores): + """Test the optimize method.""" + # Create mock callback + mock_callback = type('MockCallback', (), { + 'on_step_end': 
lambda self, optimizer: True, + 'on_train_end': lambda self, optimizer: None + })() + + optimizer = EvoPromptGA( + prompt_template="Combine these prompts to create a better one: and .", + meta_llm=meta_llm_mock, + selection_mode="random", + config=experiment_config, + initial_prompts=initial_prompts, + task=mock_task_with_scores, + predictor=MockPredictor(), + callbacks=[mock_callback] + ) + + # Run optimization + optimized_prompts = optimizer.optimize(2) + + # Verify results + assert len(optimized_prompts) == len(initial_prompts) + + # Check that the optimization process produced different prompts + assert optimized_prompts != initial_prompts + + # Check that task.evaluate was called multiple times + assert len(mock_task_with_scores.call_history) >= 2 + + +def test_evoprompt_ga_with_early_stopping(meta_llm_mock, initial_prompts, experiment_config, mock_task_with_scores): + """Test optimization with early stopping.""" + # Create mock callback that stops after first step + call_count = 0 + mock_callback = type('MockCallback', (), { + 'on_step_end': lambda self, optimizer: False if call_count > 0 else True, + 'on_train_end': lambda self, optimizer: None + })() + + # Mock the callback's on_step_end to count calls and stop after the first + def mock_on_step_end(optimizer): + nonlocal call_count + call_count += 1 + return call_count <= 1 + + mock_callback.on_step_end = mock_on_step_end + + optimizer = EvoPromptGA( + prompt_template="Combine these prompts to create a better one: and .", + meta_llm=meta_llm_mock, + selection_mode="random", + config=experiment_config, + initial_prompts=initial_prompts, + task=mock_task_with_scores, + predictor=MockPredictor(), + callbacks=[mock_callback] + ) + + # Run optimization for 5 steps, but it should stop after 1 + optimized_prompts = optimizer.optimize(5) + + # Verify results + assert len(optimized_prompts) == len(initial_prompts) + + # Check that the callback was called exactly twice (once for stopping) + assert call_count == 2 \ No newline at end of file diff --git a/tests/optimizers/test_opro.py b/tests/optimizers/test_opro.py new file mode 100644 index 0000000..5e36982 --- /dev/null +++ b/tests/optimizers/test_opro.py @@ -0,0 +1,261 @@ +import pytest +import numpy as np + +from promptolution.optimizers.opro import Opro +from tests.mocks.mock_llm import MockLLM +from tests.mocks.mock_task import MockTask +from tests.mocks.mock_predictor import MockPredictor + + +@pytest.fixture +def meta_llm_for_opro(): + """Fixture providing a MockLLM for OPRO with appropriate responses.""" + llm = MockLLM() + + # Have the LLM return responses with tags + def get_response_with_tags(prompts, system_prompts=None): + return ["Improved classification prompt: Identify the sentiment as positive, neutral, or negative."] + + # Instead of overriding _get_response, set up the mock to use get_response + llm.get_response = get_response_with_tags + + return llm + + +@pytest.fixture +def initial_prompts(): + """Fixture providing a set of initial prompts for testing.""" + return [ + "Classify the following text as positive or negative.", + "Determine if the sentiment of the text is positive or negative.", + "Is the following text positive or negative?", + ] + + +@pytest.fixture +def mock_task_for_opro(): + """Fixture providing a MockTask configured for OPRO testing.""" + task = MockTask(predetermined_scores=[0.7, 0.8, 0.75]) + + # For OPRO's _sample_examples method + task.xs = np.array([ + "This is amazing!", + "I'm disappointed with this.", + "The quality is average." 
+ ]) + task.ys = np.array(["positive", "negative", "neutral"]) + + return task + + +def test_opro_initialization(meta_llm_for_opro, initial_prompts, mock_task_for_opro): + """Test that OPRO initializes correctly.""" + optimizer = Opro( + predictor=MockPredictor(), + task=mock_task_for_opro, + initial_prompts=initial_prompts, + prompt_template="\n\n", + meta_llm=meta_llm_for_opro, + max_num_instructions=10, + num_instructions_per_step=4, + num_few_shots=2 + ) + + # Verify initialization + assert optimizer.meta_llm == meta_llm_for_opro + assert optimizer.meta_prompt_template == "\n\n" + assert optimizer.max_num_instructions == 10 + assert optimizer.num_instructions_per_step == 4 + assert optimizer.num_few_shots == 2 + assert optimizer.prompts == initial_prompts + + +def test_opro_sample_examples(meta_llm_for_opro, initial_prompts, mock_task_for_opro): + """Test the _sample_examples method.""" + optimizer = Opro( + predictor=MockPredictor(), + task=mock_task_for_opro, + initial_prompts=initial_prompts, + prompt_template="\n\n", + meta_llm=meta_llm_for_opro, + num_few_shots=2 + ) + + # Sample examples + examples = optimizer._sample_examples() + + # Verify that examples were sampled + assert isinstance(examples, str) + assert "Input:" in examples + assert "Output:" in examples + + # Should have 2 examples (as set in num_few_shots) + assert examples.count("Input:") == 2 + assert examples.count("Output:") == 2 + + +def test_opro_format_instructions(meta_llm_for_opro, initial_prompts, mock_task_for_opro): + """Test the _format_instructions method.""" + optimizer = Opro( + predictor=MockPredictor(), + task=mock_task_for_opro, + initial_prompts=initial_prompts, + prompt_template="\n\n", + meta_llm=meta_llm_for_opro + ) + + # Set scores for testing + optimizer.prompts = initial_prompts + optimizer.scores = [0.7, 0.9, 0.5] + + # Format instructions + instructions = optimizer._format_instructions() + + # Verify that instructions were formatted + assert isinstance(instructions, str) + assert "text:" in instructions + assert "score:" in instructions + + # Should have formatted all prompts + assert instructions.count("text:") == 3 + assert instructions.count("score:") == 3 + + +def test_opro_add_prompt_and_score(meta_llm_for_opro, initial_prompts, mock_task_for_opro): + """Test the _add_prompt_and_score method.""" + optimizer = Opro( + predictor=MockPredictor(), + task=mock_task_for_opro, + initial_prompts=initial_prompts, + prompt_template="\n\n", + meta_llm=meta_llm_for_opro, + max_num_instructions=2 # Small value to test truncation + ) + + # Set initial state + optimizer.prompts = ["Prompt 1", "Prompt 2"] + optimizer.scores = [0.7, 0.6] + + # Add a new prompt with higher score + optimizer._add_prompt_and_score("Prompt 3", 0.8) + + # Verify that the prompt was added and the list was truncated + assert len(optimizer.prompts) == 2 + assert "Prompt 3" in optimizer.prompts + assert "Prompt 1" in optimizer.prompts + assert "Prompt 2" not in optimizer.prompts # Should be removed as lowest score + + # Verify that adding a duplicate prompt is ignored + optimizer._add_prompt_and_score("Prompt 3", 0.9) + assert len(optimizer.prompts) == 2 + assert optimizer.scores == [0.7, 0.8] # Score should not be updated + + +def test_opro_pre_optimization_loop(meta_llm_for_opro, initial_prompts, mock_task_for_opro): + """Test the _pre_optimization_loop method.""" + optimizer = Opro( + predictor=MockPredictor(), + task=mock_task_for_opro, + initial_prompts=initial_prompts, + prompt_template="\n\n", + meta_llm=meta_llm_for_opro 
+ ) + + # Run pre-optimization loop + optimizer._pre_optimization_loop() + + # Verify that scores were evaluated + assert hasattr(optimizer, 'scores') + assert len(optimizer.scores) == len(initial_prompts) + + # Verify that meta_prompt was created + assert hasattr(optimizer, 'meta_prompt') + assert isinstance(optimizer.meta_prompt, str) + assert "" not in optimizer.meta_prompt # Should be replaced + assert "" not in optimizer.meta_prompt # Should be replaced + +def test_opro_step(meta_llm_for_opro, initial_prompts, mock_task_for_opro): + """Test the _step method.""" + # Configure mock_task to return predetermined scores for any prompt + def evaluate_mock(prompts, predictor, **kwargs): + if isinstance(prompts, str): + if "New unique prompt" in prompts: + return np.array([0.9]) # Even higher score for our special prompt + return np.array([0.85]) # Higher score for new prompts + return np.array([0.7] * len(prompts)) + + mock_task_for_opro.evaluate = evaluate_mock + + # Create a response function that returns unique, identifiable prompts + def get_unique_response(prompts, system_prompts=None): + return ["New unique prompt #12345: Determine sentiment clearly."] + + # Replace the get_response method + meta_llm_for_opro.get_response = get_unique_response + + # Create a smaller initial prompt set to avoid issues + smaller_initial_set = ["Prompt 1", "Prompt 2"] + + optimizer = Opro( + predictor=MockPredictor(), + task=mock_task_for_opro, + initial_prompts=smaller_initial_set, + prompt_template="\n\n", + meta_llm=meta_llm_for_opro, + num_instructions_per_step=1 # Generate 1 prompt per step for simplicity + ) + + # Set up initial state + optimizer.prompts = smaller_initial_set + optimizer.scores = [0.7, 0.65] + optimizer.meta_prompt = "Meta prompt with instructions and examples" + + # Run step + new_prompts = optimizer._step() + + # Verify that new prompts were generated + assert len(new_prompts) >= len(smaller_initial_set) + + # At least one prompt should contain our unique identifier + assert any("New unique prompt" in prompt for prompt in new_prompts) + +def test_opro_optimize(meta_llm_for_opro, initial_prompts, mock_task_for_opro): + """Test the optimize method.""" + # Configure mock_task to return predetermined scores + def evaluate_mock(prompts, predictor, **kwargs): + if isinstance(prompts, str): + # Return a higher score for a specific new prompt format + if "New optimized prompt" in prompts: + return np.array([0.9]) + return np.array([0.85]) # Higher score for other new prompts + return np.array([0.7] * len(prompts)) + + mock_task_for_opro.evaluate = evaluate_mock + + # Ensure the meta_llm returns a distinctly different prompt + def get_unique_response(prompts, system_prompts=None): + # Return a clearly new prompt that won't be in initial_prompts + return ["New optimized prompt: Analyze the text and classify sentiment."] + + meta_llm_for_opro.get_response = get_unique_response + + # Create OPRO with a smaller subset of initial prompts to avoid test issues + smaller_initial_set = initial_prompts[:2] # Just take the first two prompts + + optimizer = Opro( + predictor=MockPredictor(), + task=mock_task_for_opro, + initial_prompts=smaller_initial_set, + prompt_template="\n\n", + meta_llm=meta_llm_for_opro, + num_instructions_per_step=1 # Generate 1 prompt per step for simplicity + ) + + # Run optimization for 2 steps + optimized_prompts = optimizer.optimize(2) + + # Verify that optimization completed and returned prompts + assert len(optimized_prompts) >= len(smaller_initial_set) + + # Check if 
any new prompts were added (instead of comparing the entire lists) + assert any("New optimized prompt" in prompt for prompt in optimized_prompts) \ No newline at end of file diff --git a/tests/optimizers/test_optimizer_integration.py b/tests/optimizers/test_optimizer_integration.py new file mode 100644 index 0000000..5cf075f --- /dev/null +++ b/tests/optimizers/test_optimizer_integration.py @@ -0,0 +1,122 @@ +import pytest + +from promptolution.optimizers.evoprompt_ga import EvoPromptGA +from promptolution.optimizers.evoprompt_de import EvoPromptDE +from promptolution.optimizers.opro import Opro +from tests.mocks.mock_llm import MockLLM +from tests.mocks.mock_task import MockTask +from tests.mocks.mock_predictor import MockPredictor + + +@pytest.fixture +def meta_llm(): + """Fixture providing a common MockLLM for all optimizers.""" + llm = MockLLM() + + # Set up response generation for meta prompts + def get_response_for_meta(prompts, system_prompts=None): + return [f"Optimized prompt {i}" for i in range(len(prompts))] + + llm.get_response = get_response_for_meta + + return llm + + +@pytest.fixture +def initial_prompts(): + """Fixture providing a set of initial prompts for testing.""" + return [ + "Classify the following text as positive or negative.", + "Determine if the sentiment of the text is positive or negative.", + "Is the following text positive or negative?", + "Analyze the sentiment of this text and categorize as positive or negative.", + "Evaluate whether the sentiment expressed is positive or negative.", + ] + + +@pytest.fixture +def mock_score_improving_task(): + """Fixture providing a MockTask with scores that improve over iterations.""" + iteration_count = [0] # Use a list to maintain state across calls + + def score_function(prompt): + # Return higher scores for later iterations + if "Optimized prompt" in prompt: + # Extract prompt number + try: + num = int(prompt.split("Optimized prompt")[1].strip()) + return min(0.95, 0.7 + 0.05 * num) # Increase score with prompt number + except (ValueError, IndexError): + pass + + # For any optimized prompt without a number + return 0.85 + + # Base score increases slightly with each iteration + iteration_count[0] += 1 + return min(0.8, 0.6 + 0.02 * iteration_count[0]) + + return MockTask(predetermined_scores=score_function) + +def test_compare_optimizers(meta_llm, initial_prompts, mock_score_improving_task): + """Integration test comparing the three optimizers.""" + # Create predictors and task + predictor = MockPredictor() + + # Make a copy of initial_prompts to avoid cross-test contamination + original_prompts = initial_prompts.copy() + + # Create optimizers + ga_optimizer = EvoPromptGA( + predictor=predictor, + task=mock_score_improving_task, + initial_prompts=original_prompts, + prompt_template="Combine these prompts to create a better one: and .", + meta_llm=meta_llm, + selection_mode="random", + ) + + de_optimizer = EvoPromptDE( + predictor=predictor, + task=mock_score_improving_task, + initial_prompts=original_prompts, + prompt_template="Create a new prompt from: , , , ", + meta_llm=meta_llm, + donor_random=False, + ) + + opro_optimizer = Opro( + predictor=predictor, + task=mock_score_improving_task, + initial_prompts=original_prompts, + prompt_template="\n\n", + meta_llm=meta_llm, + num_instructions_per_step=2, + ) + + # Run optimization for each optimizer + ga_prompts = ga_optimizer.optimize(2) + de_prompts = de_optimizer.optimize(2) + opro_prompts = opro_optimizer.optimize(2) + + # Verify that all optimizers completed and 
returned prompts + assert len(ga_prompts) == len(initial_prompts) + assert len(de_prompts) == len(initial_prompts) + assert len(opro_prompts) >= len(initial_prompts) + + # Verify that the optimized prompts are different from the initial prompts + assert ga_prompts != initial_prompts + assert de_prompts != initial_prompts + print(f"OPRO Prompts: {opro_prompts}") + print("original_prompts: ", initial_prompts) + assert any(prompt not in initial_prompts for prompt in opro_prompts) + + # Evaluate the final best prompt from each optimizer to compare + ga_score = mock_score_improving_task.evaluate([ga_prompts[0]], predictor)[0] + de_score = mock_score_improving_task.evaluate([de_prompts[0]], predictor)[0] + opro_score = mock_score_improving_task.evaluate([opro_prompts[0]], predictor)[0] + + # All optimizers should have improved the prompts + assert ga_score > 0.6 + assert de_score > 0.6 + assert opro_score > 0.6 \ No newline at end of file diff --git a/tests/predictors/test_base_predictor.py b/tests/predictors/test_base_predictor.py new file mode 100644 index 0000000..f507e56 --- /dev/null +++ b/tests/predictors/test_base_predictor.py @@ -0,0 +1,99 @@ +import pytest +import numpy as np + +from tests.mocks.mock_llm import MockLLM +from tests.mocks.mock_predictor import MockPredictor + + +@pytest.fixture +def mock_llm_for_predictor(): + """Fixture providing a MockLLM with predetermined responses for predictor testing.""" + responses = { + "Classify this text:\nI love this product!": "The sentiment is positive.", + "Classify this text:\nI hate this product!": "The sentiment is negative.", + "Classify this text:\nThis product is okay.": "The sentiment is neutral." + } + return MockLLM(predetermined_responses=responses) + + +@pytest.fixture +def mock_predictor_with_llm(mock_llm_for_predictor): + """Fixture providing a MockPredictor with a MockLLM.""" + predictions = { + "The sentiment is positive.": "positive", + "The sentiment is negative.": "negative", + "The sentiment is neutral.": "neutral" # Make sure this mapping exists + } + classes = ["positive", "neutral", "negative"] + + return MockPredictor( + llm=mock_llm_for_predictor, + classes=classes, + predetermined_predictions=predictions + ) + + +def test_predictor_predict_flow(mock_predictor_with_llm): + """Test the basic prediction flow from prompt to final prediction.""" + # Input data + prompts = ["Classify this text:"] + xs = np.array(["I love this product!", "I hate this product!"]) + + # Call predict + predictions = mock_predictor_with_llm.predict(prompts, xs) + + # Verify shape and content of predictions + assert predictions.shape == (1, 2) + assert predictions[0, 0] == "positive" + assert predictions[0, 1] == "negative" + + # Verify LLM was called with correct prompts + assert len(mock_predictor_with_llm.llm.call_history) == 1 + assert mock_predictor_with_llm.llm.call_history[0]['prompts'] == [ + "Classify this text:\nI love this product!", + "Classify this text:\nI hate this product!" 
+ ] + + +def test_predictor_with_return_seq(mock_predictor_with_llm): + """Test prediction with return_seq=True.""" + # Input data + prompts = ["Classify this text:"] + xs = np.array(["This product is okay."]) + + # Call predict with return_seq=True + predictions, sequences = mock_predictor_with_llm.predict(prompts, xs, return_seq=True) + + # Verify predictions + assert predictions.shape == (1, 1) + assert predictions[0, 0] == "neutral" + + # Verify sequences + assert len(sequences) == 1 + assert isinstance(sequences[0], np.ndarray) + assert "This product is okay." in sequences[0][0] + + +def test_multiple_prompts(mock_predictor_with_llm): + """Test prediction with multiple prompts.""" + # Input data with multiple prompts + prompts = ["Classify this text:", "Rate this text:"] + xs = np.array(["I love this product!"]) + + # Mock LLM responses for the second prompt + mock_predictor_with_llm.llm.predetermined_responses.update({ + "Rate this text:\nI love this product!": "The rating is 5/5." + }) + + # Add mapping for the new response + mock_predictor_with_llm.predetermined_predictions.update({ + "The rating is 5/5.": "positive" + }) + + # Call predict + predictions = mock_predictor_with_llm.predict(prompts, xs) + + # Verify shape and content + assert predictions.shape == (2, 1) + assert predictions[0, 0] == "positive" # First prompt result + assert predictions[1, 0] == "positive" # Second prompt result \ No newline at end of file diff --git a/tests/predictors/test_classificator_errors.py b/tests/predictors/test_classificator_errors.py new file mode 100644 index 0000000..047a944 --- /dev/null +++ b/tests/predictors/test_classificator_errors.py @@ -0,0 +1,39 @@ +import pytest +import numpy as np + +from promptolution.predictors.classificator import FirstOccurrenceClassificator, MarkerBasedClassificator +from tests.mocks.mock_llm import MockLLM + + +def test_invalid_class_labels(): + """Test that classifier raises an assertion error for invalid class labels.""" + mock_llm = MockLLM() + + # Classes should be lowercase + invalid_classes = ["Positive", "Neutral", "Negative"] + + # Should raise an assertion error + with pytest.raises(AssertionError): + FirstOccurrenceClassificator(llm=mock_llm, classes=invalid_classes) + + with pytest.raises(AssertionError): + MarkerBasedClassificator(llm=mock_llm, classes=invalid_classes) + + +def test_marker_based_missing_markers(): + """Test MarkerBasedClassificator behavior when markers are missing.""" + mock_llm = MockLLM(predetermined_responses={ + "Classify: Missing markers": "This response doesn't have the markers at all." 
+ }) + + classifier = MarkerBasedClassificator( + llm=mock_llm, + classes=["positive", "neutral", "negative"] + ) + + # When markers are missing, it should default to first class + prompts = ["Classify:"] + xs = np.array(["Missing markers"]) + predictions = classifier.predict(prompts, xs) + + assert predictions[0, 0] == "positive" # Should default to first class \ No newline at end of file diff --git a/tests/predictors/test_classificator_system_prompts.py b/tests/predictors/test_classificator_system_prompts.py new file mode 100644 index 0000000..c9ad58a --- /dev/null +++ b/tests/predictors/test_classificator_system_prompts.py @@ -0,0 +1,55 @@ +import pytest +import numpy as np + +from promptolution.predictors.classificator import FirstOccurrenceClassificator +from tests.mocks.mock_llm import MockLLM + + +@pytest.fixture +def mock_llm_with_system_prompts(): + """Fixture providing a MockLLM that respects system prompts.""" + responses = { + # Responses with default system prompt + "Classify: I love this product!": "This is positive feedback.", + + # Responses with custom system prompts + ("Classify: I love this product!", "Be specific"): "This text shows a clear positive sentiment.", + ("Classify: I love this product!", "Be brief"): "positive", + + # Additional examples + ("Classify: I hate this product!", "Be specific"): "The text expresses strong negative sentiment.", + ("Classify: I hate this product!", "Be brief"): "negative" + } + return MockLLM(predetermined_responses=responses) + + +def test_classificator_with_system_prompts(mock_llm_with_system_prompts): + """Test classificators with custom system prompts.""" + classes = ["positive", "neutral", "negative"] + + # Create classifier + classifier = FirstOccurrenceClassificator( + llm=mock_llm_with_system_prompts, + classes=classes + ) + + # Test with default system prompt + prompts = ["Classify:"] + xs = np.array(["I love this product!"]) + default_predictions = classifier.predict(prompts, xs) + assert default_predictions[0, 0] == "positive" + + # Test with custom system prompt + custom_system_prompts = ["Be specific"] + specific_predictions = classifier.predict(prompts, xs, system_prompts=custom_system_prompts) + + # The prediction should be the same, but the LLM would have received a different system prompt + assert specific_predictions[0, 0] == "positive" + + # Verify system prompt was passed correctly + assert mock_llm_with_system_prompts.call_history[-1]['system_prompts'] == custom_system_prompts + + # Test with "be brief" system prompt + brief_system_prompts = ["Be brief"] + brief_predictions = classifier.predict(prompts, xs, system_prompts=brief_system_prompts) + assert brief_predictions[0, 0] == "positive" \ No newline at end of file diff --git a/tests/predictors/test_classificators.py b/tests/predictors/test_classificators.py new file mode 100644 index 0000000..a3536e4 --- /dev/null +++ b/tests/predictors/test_classificators.py @@ -0,0 +1,174 @@ +import pytest +import numpy as np + +from promptolution.predictors.classificator import FirstOccurrenceClassificator, MarkerBasedClassificator +from tests.mocks.mock_llm import MockLLM + + +@pytest.fixture +def sentiment_classes(): + """Fixture providing sentiment classes for testing.""" + return ["positive", "neutral", "negative"] + + +@pytest.fixture +def mock_llm_for_first_occurrence(): + """Fixture providing a MockLLM for FirstOccurrenceClassificator testing.""" + responses = { + "Classify:\nI love this product!": "This text expresses positive sentiment about the product.", + 
"Classify:\nI hate this product!": "The sentiment here is clearly negative regarding the item.", + "Classify:\nThis product is okay.": "The text shows a neutral stance on the product.", + "Classify:\nInteresting product": "The sentiment is hard to determine, but not strongly leaning." + } + return MockLLM(predetermined_responses=responses) + + +@pytest.fixture +def mock_llm_for_marker_based(): + """Fixture providing a MockLLM for MarkerBasedClassificator testing.""" + responses = { + "Classify:\nI love this product!": "Let me analyze this... positive", + "Classify:\nI hate this product!": "I can detect that this is negative sentiment", + "Classify:\nThis product is okay.": "After consideration neutral seems appropriate", + "Classify:\nInteresting product": "Not entirely clear but neutral is best", + "Classify:\nBroken item": "This shows bad sentiment" # Invalid class for testing default + } + return MockLLM(predetermined_responses=responses) + + +def test_first_occurrence_classificator(mock_llm_for_first_occurrence, sentiment_classes): + """Test the FirstOccurrenceClassificator.""" + # Create classifier + classifier = FirstOccurrenceClassificator( + llm=mock_llm_for_first_occurrence, + classes=sentiment_classes + ) + + # Test with multiple inputs + prompts = ["Classify:"] + xs = np.array(["I love this product!", "I hate this product!", "This product is okay."]) + + # Make predictions + predictions = classifier.predict(prompts, xs) + + # Verify shape and content + assert predictions.shape == (1, 3) + assert predictions[0, 0] == "positive" + assert predictions[0, 1] == "negative" + assert predictions[0, 2] == "neutral" + + # Test with input that doesn't contain a class directly + ambiguous_input = np.array(["Interesting product"]) + ambiguous_predictions = classifier.predict(prompts, ambiguous_input) + + # Should default to first class if no match + assert ambiguous_predictions[0, 0] == "positive" + + +def test_marker_based_classificator(mock_llm_for_marker_based, sentiment_classes): + """Test the MarkerBasedClassificator.""" + # Create classifier + classifier = MarkerBasedClassificator( + llm=mock_llm_for_marker_based, + classes=sentiment_classes, + begin_marker="", + end_marker="" + ) + + # Test with multiple inputs + prompts = ["Classify:"] + xs = np.array(["I love this product!", "I hate this product!", "This product is okay."]) + + # Make predictions + predictions = classifier.predict(prompts, xs) + + # Verify shape and content + assert predictions.shape == (1, 3) + assert predictions[0, 0] == "positive" + assert predictions[0, 1] == "negative" + assert predictions[0, 2] == "neutral" + + # Test with invalid class label + invalid_input = np.array(["Broken item"]) + invalid_predictions = classifier.predict(prompts, invalid_input) + + # Should default to first class if invalid + assert invalid_predictions[0, 0] == "positive" + + +def test_marker_based_without_classes(mock_llm_for_marker_based): + """Test MarkerBasedClassificator without predefined classes.""" + # Create classifier without classes + classifier = MarkerBasedClassificator( + llm=mock_llm_for_marker_based, + classes=None, # No class restrictions + begin_marker="", + end_marker="" + ) + + # Test with multiple inputs + prompts = ["Classify:"] + xs = np.array(["I love this product!", "Broken item"]) + + # Make predictions + predictions = classifier.predict(prompts, xs) + + # Verify shape and content - should accept any value between markers + assert predictions.shape == (1, 2) + assert predictions[0, 0] == "positive" + assert 
predictions[0, 1] == "bad" # Should accept "bad" as it's between markers + + +def test_multiple_prompts_with_classificators(mock_llm_for_first_occurrence, sentiment_classes): + """Test using multiple prompts with classificators.""" + # Create classifier + classifier = FirstOccurrenceClassificator( + llm=mock_llm_for_first_occurrence, + classes=sentiment_classes + ) + + # Add responses for a second prompt + mock_llm_for_first_occurrence.predetermined_responses.update({ + "Rate:\nI love this product!": "This deserves a positive rating.", + "Rate:\nI hate this product!": "I would rate this as negative." + }) + + # Test with multiple prompts + prompts = ["Classify:", "Rate:"] + xs = np.array(["I love this product!", "I hate this product!"]) + + # Make predictions + predictions = classifier.predict(prompts, xs) + + # Verify shape and content + assert predictions.shape == (2, 2) # (n_prompts, n_samples) + assert predictions[0, 0] == "positive" # First prompt, first sample + assert predictions[0, 1] == "negative" # First prompt, second sample + assert predictions[1, 0] == "positive" # Second prompt, first sample + assert predictions[1, 1] == "negative" # Second prompt, second sample + + +def test_sequence_return_with_classificators(mock_llm_for_marker_based, sentiment_classes): + """Test return_seq parameter with classificators.""" + # Create classifier + classifier = MarkerBasedClassificator( + llm=mock_llm_for_marker_based, + classes=sentiment_classes + ) + + # Test with return_seq=True + prompts = ["Classify:"] + xs = np.array(["I love this product!"]) + + # Make predictions with sequences + predictions, sequences = classifier.predict(prompts, xs, return_seq=True) + + # Verify predictions + assert predictions.shape == (1, 1) + assert predictions[0, 0] == "positive" + + # Verify sequences + assert len(sequences) == 1 + assert len(sequences[0]) == 1 + assert "I love this product!" in sequences[0][0] + assert "Let me analyze this..." 
in sequences[0][0] \ No newline at end of file diff --git a/tests/tasks/test_classification_task_mocked_df.py b/tests/tasks/test_classification_task_mocked_df.py new file mode 100644 index 0000000..96aa6b7 --- /dev/null +++ b/tests/tasks/test_classification_task_mocked_df.py @@ -0,0 +1,60 @@ +import pytest +from unittest.mock import MagicMock +import numpy as np +import pandas as pd + +from promptolution.tasks.classification_tasks import ClassificationTask +from tests.mocks.mock_predictor import MockPredictor + + +@pytest.fixture +def mock_dataframe(): + """Create a mock DataFrame with controlled behavior.""" + # Create a mock DataFrame + mock_df = MagicMock(spec=pd.DataFrame) + + # Set up the y_column with lowercase values + mock_df.__getitem__.return_value.str.lower = MagicMock(return_value=mock_df.__getitem__.return_value) + + # Set up unique values for classes + mock_df.__getitem__.return_value.unique.return_value = np.array(["positive", "negative", "neutral"]) + + # Set up values for xs and ys + mock_df.__getitem__.return_value.values = np.array(["Example 1", "Example 2", "Example 3"]) + + return mock_df + + +def test_task_with_mocked_dataframe(mock_dataframe): + """Test ClassificationTask with a mocked DataFrame.""" + # Initialize task with mocked DataFrame + task = ClassificationTask( + df=mock_dataframe, + description="Mocked DataFrame test", + initial_prompts=["Classify:"], + x_column="x", + y_column="y" + ) + + # Verify that the mocked data was used + assert len(task.classes) == 3 + assert len(task.xs) == 3 + assert len(task.ys) == 3 + + # Create mock predictor + mock_predictor = MockPredictor( + classes=["positive", "negative", "neutral"], + predetermined_predictions={} + ) + + # Override predict to return controlled values + mock_predictor.predict = MagicMock(return_value=np.array([["positive", "negative", "neutral"]])) + + # Evaluate + scores = task.evaluate(["Classify:"], mock_predictor) + + # Mock predictor.predict should have been called once + mock_predictor.predict.assert_called_once() + + # Scores should be an array of length 1 + assert scores.shape == (1,) \ No newline at end of file diff --git a/tests/tasks/test_classifications_tasks.py b/tests/tasks/test_classifications_tasks.py new file mode 100644 index 0000000..fdef369 --- /dev/null +++ b/tests/tasks/test_classifications_tasks.py @@ -0,0 +1,205 @@ +import pytest +import numpy as np +import pandas as pd +from sklearn.metrics import accuracy_score + +from promptolution.tasks.classification_tasks import ClassificationTask +from promptolution.predictors.classificator import FirstOccurrenceClassificator +from tests.mocks.mock_llm import MockLLM + + +@pytest.fixture +def sample_classification_df(): + """Fixture providing a sample classification DataFrame for testing.""" + data = { + 'x': [ + "This product is amazing!", + "I'm disappointed with this purchase.", + "The quality is average, nothing special.", + "Worst product ever, avoid at all costs!", + "Decent product, does what it's supposed to." 
+ ], + 'y': [ + "positive", + "negative", + "neutral", + "negative", + "positive" + ] + } + return pd.DataFrame(data) + + +@pytest.fixture +def mock_llm_for_classification(): + """Fixture providing a MockLLM with predetermined responses for classification.""" + responses = { + "Classify sentiment: This product is amazing!": "This shows positive sentiment.", + "Classify sentiment: I'm disappointed with this purchase.": "This is negative feedback.", + "Classify sentiment: The quality is average, nothing special.": "This is a neutral review.", + "Classify sentiment: Worst product ever, avoid at all costs!": "Very negative review.", + "Classify sentiment: Decent product, does what it's supposed to.": "Somewhat positive feedback.", + + "Rate the text: This product is amazing!": "Rating: positive", + "Rate the text: I'm disappointed with this purchase.": "Rating: negative", + "Rate the text: The quality is average, nothing special.": "Rating: neutral", + "Rate the text: Worst product ever, avoid at all costs!": "Rating: negative", + "Rate the text: Decent product, does what it's supposed to.": "Rating: positive" + } + return MockLLM(predetermined_responses=responses) + + +@pytest.fixture +def classification_task(sample_classification_df): + """Fixture providing a ClassificationTask instance.""" + initial_prompts = ["Classify sentiment:", "Rate the text:"] + return ClassificationTask( + df=sample_classification_df, + description="Sentiment classification task", + initial_prompts=initial_prompts, + x_column="x", + y_column="y", + metric=accuracy_score + ) + + +@pytest.fixture +def classifier_predictor(mock_llm_for_classification): + """Fixture providing a FirstOccurrenceClassificator instance.""" + return FirstOccurrenceClassificator( + llm=mock_llm_for_classification, + classes=["positive", "neutral", "negative"] + ) + + +def test_classification_task_initialization(sample_classification_df): + """Test that ClassificationTask initializes correctly.""" + task = ClassificationTask( + df=sample_classification_df, + description="Sentiment classification task", + initial_prompts=["Classify:"], + x_column="x", + y_column="y" + ) + + # Verify attributes + assert task.description == "Sentiment classification task" + assert len(task.classes) == 3 + assert set(task.classes) == set(["positive", "neutral", "negative"]) + assert len(task.xs) == 5 + assert len(task.ys) == 5 + assert task.initial_prompts == ["Classify:"] + assert task.metric == accuracy_score + + +def test_task_evaluate(classification_task, classifier_predictor): + """Test the evaluate method of ClassificationTask.""" + # Evaluate with a single prompt + prompts = ["Classify sentiment:"] + scores = classification_task.evaluate(prompts, classifier_predictor) + + # Verify scores + assert isinstance(scores, np.ndarray) + assert scores.shape == (1,) # One score per prompt + assert 0 <= scores[0] <= 1 # Score should be between 0 and 1 + + # Evaluate with multiple prompts + prompts = ["Classify sentiment:", "Rate the text:"] + scores = classification_task.evaluate(prompts, classifier_predictor) + + # Verify scores for multiple prompts + assert scores.shape == (2,) # Two scores, one per prompt + assert all(0 <= score <= 1 for score in scores) + + +def test_task_evaluate_with_subsampling(classification_task, classifier_predictor): + """Test the evaluate method with subsampling.""" + prompts = ["Classify sentiment:"] + + # Evaluate with subsampling + n_samples = 3 + scores = classification_task.evaluate( + prompts, + classifier_predictor, + 
n_samples=n_samples, + subsample=True + ) + + # Verify scores + assert scores.shape == (1,) # One score per prompt + + # Test with a different random seed to ensure different subsamples + with pytest.raises(AssertionError, match=r'.*Arrays are not equal.*'): + # Use a different random seed to force different subsampling + np.random.seed(42) + scores1 = classification_task.evaluate( + prompts, + classifier_predictor, + n_samples=n_samples, + subsample=True + ) + + np.random.seed(43) + scores2 = classification_task.evaluate( + prompts, + classifier_predictor, + n_samples=n_samples, + subsample=True + ) + + # This should fail because the subsamples should be different + np.testing.assert_array_equal(scores1, scores2) + + +def test_task_evaluate_with_return_seq(classification_task, classifier_predictor): + """Test the evaluate method with return_seq=True.""" + prompts = ["Classify sentiment:"] + + # Evaluate with return_seq=True + scores, seqs = classification_task.evaluate( + prompts, + classifier_predictor, + n_samples=2, + subsample=True, + return_seq=True + ) + + # Verify scores and sequences + assert scores.shape == (1,) # One score per prompt + assert len(seqs) == 1 # One list of sequences per prompt + assert len(seqs[0]) == 2 # Two sequences per prompt (n_samples=2) + + # Check that sequences contain input text + for seq in seqs[0]: + assert any(sample_text in seq for sample_text in classification_task.xs) + + +def test_task_evaluate_with_system_prompts(classification_task, classifier_predictor, mock_llm_for_classification): + """Test the evaluate method with system prompts.""" + # Add responses for system prompts + mock_llm_for_classification.predetermined_responses.update({ + ("Classify sentiment: This product is amazing!", "Be concise"): "positive", + ("Classify sentiment: I'm disappointed with this purchase.", "Be concise"): "negative", + ("Classify sentiment: The quality is average, nothing special.", "Be concise"): "neutral", + ("Classify sentiment: Worst product ever, avoid at all costs!", "Be concise"): "negative", + ("Classify sentiment: Decent product, does what it's supposed to.", "Be concise"): "positive" + }) + + prompts = ["Classify sentiment:"] + system_prompts = ["Be concise"] + + # Evaluate with system prompts + scores = classification_task.evaluate( + prompts, + classifier_predictor, + system_prompts=system_prompts + ) + + # Verify scores + assert scores.shape == (1,) + + # Verify that system prompts were passed through to the LLM + assert any( + call['system_prompts'] == system_prompts + for call in mock_llm_for_classification.call_history + ) \ No newline at end of file
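
The mocks introduced under `tests/mocks` (MockLLM, MockTask, MockPredictor) are designed to compose, so an optimizer can be exercised end-to-end without any real model inference. Below is a minimal sketch of that composition, assuming the import paths, constructor signatures, and `optimize()` call shown in the tests above; the `prompt_template` string is illustrative only and omits placeholder tokens, just as the test fixtures above do.

```python
# Minimal smoke test composing the mock infrastructure from this change set.
# Assumes the import paths and signatures shown in the diff above; the
# prompt_template string is illustrative and omits placeholder tokens.
from promptolution.optimizers.evoprompt_ga import EvoPromptGA
from tests.mocks.mock_llm import MockLLM
from tests.mocks.mock_task import MockTask
from tests.mocks.mock_predictor import MockPredictor


def test_evoprompt_ga_smoke():
    # Meta-LLM that returns a recognizable rewrite for every meta-prompt it sees
    meta_llm = MockLLM()
    meta_llm.get_response = lambda prompts, system_prompts=None: [
        f"Optimized prompt {i}" for i in range(len(prompts))
    ]

    # Task that scores rewritten prompts higher than the initial ones
    task = MockTask(
        predetermined_scores=lambda prompt: 0.9 if "Optimized" in prompt else 0.6
    )

    initial_prompts = [
        "Classify the following text as positive or negative.",
        "Determine if the sentiment of the text is positive or negative.",
        "Is the following text positive or negative?",
    ]

    optimizer = EvoPromptGA(
        predictor=MockPredictor(),
        task=task,
        initial_prompts=initial_prompts,
        prompt_template="Combine these prompts to create a better one: and .",
        meta_llm=meta_llm,
        selection_mode="random",
    )

    optimized = optimizer.optimize(2)

    # Mirrors the assertions in test_optimizer_integration.py
    assert len(optimized) == len(initial_prompts)
    assert optimized != initial_prompts
    assert len(task.call_history) >= 2
```

Because MockTask records every `evaluate()` call in `call_history`, assertions about how often, and with which prompts, the optimizer queried the task come for free.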