diff --git a/aixplain/factories/finetune_factory.py b/aixplain/factories/finetune_factory/__init__.py
similarity index 64%
rename from aixplain/factories/finetune_factory.py
rename to aixplain/factories/finetune_factory/__init__.py
index 91708f4c..b4963a0d 100644
--- a/aixplain/factories/finetune_factory.py
+++ b/aixplain/factories/finetune_factory/__init__.py
@@ -24,10 +24,13 @@
 import logging
 from typing import Dict, List, Optional, Text
 import json
+from aixplain.factories.finetune_factory.prompt_validator import validate_prompt
+from aixplain.modules.finetune import Finetune
+from aixplain.modules.finetune.cost import FinetuneCost
+from aixplain.modules.finetune.hyperparameters import Hyperparameters
+from aixplain.modules.finetune.peft import Peft
 from aixplain.modules.dataset import Dataset
 from aixplain.modules.model import Model
-from aixplain.modules.finetune import Finetune
-from aixplain.modules.finetune_cost import FinetuneCost
 from aixplain.utils import config
 from aixplain.utils.file_utils import _request_with_retry
 from urllib.parse import urljoin
@@ -59,7 +62,15 @@ def _create_cost_from_response(cls, response: Dict) -> FinetuneCost:
 
     @classmethod
     def create(
-        cls, name: Text, dataset_list: List[Dataset], model: Model, train_percentage: float = 100, dev_percentage: float = 0
+        cls,
+        name: Text,
+        dataset_list: List[Dataset],
+        model: Model,
+        prompt: Optional[Text] = None,
+        hyperparameters: Optional[Hyperparameters] = None,
+        peft: Optional[Peft] = None,
+        train_percentage: Optional[float] = 100,
+        dev_percentage: Optional[float] = 0,
     ) -> Finetune:
         """Create a Finetune object with the provided information.
 
@@ -67,9 +78,11 @@
             name (Text): Name of the Finetune.
             dataset_list (List[Dataset]): List of Datasets to be used for fine-tuning.
             model (Model): Model to be fine-tuned.
+            prompt (Text, optional): Fine-tuning prompt. Defaults to None.
+            hyperparameters (Hyperparameters, optional): Hyperparameters for fine-tuning. Defaults to None.
+            peft (Peft, optional): PEFT (Parameter-Efficient Fine-Tuning) configuration. Defaults to None.
             train_percentage (float, optional): Percentage of training samples. Defaults to 100.
             dev_percentage (float, optional): Percentage of development samples. Defaults to 0.
-
         Returns:
             Finetune: The Finetune object created with the provided information or None if there was an error.
""" @@ -78,24 +91,42 @@ def create( assert ( train_percentage + dev_percentage <= 100 ), f"Create FineTune: Train percentage + dev percentage ({train_percentage + dev_percentage}) must be less than or equal to one" + if prompt is not None: + prompt = validate_prompt(prompt, dataset_list) try: url = urljoin(cls.backend_url, f"sdk/finetune/cost-estimation") headers = {"Authorization": f"Token {cls.api_key}", "Content-Type": "application/json"} - payload = json.dumps( - { - "datasets": [ - {"datasetId": dataset.id, "trainPercentage": train_percentage, "devPercentage": dev_percentage} - for dataset in dataset_list - ], - "sourceModelId": model.id, - } - ) + payload = { + "datasets": [ + {"datasetId": dataset.id, "trainPercentage": train_percentage, "devPercentage": dev_percentage} + for dataset in dataset_list + ], + "sourceModelId": model.id, + } + parameters = {} + if prompt is not None: + parameters["prompt"] = prompt + if hyperparameters is not None: + parameters["hyperparameters"] = hyperparameters.to_dict() + if peft is not None: + parameters["peft"] = peft.to_dict() + payload["parameters"] = parameters logging.info(f"Start service for POST Create FineTune - {url} - {headers} - {json.dumps(payload)}") - r = _request_with_retry("post", url, headers=headers, data=payload) + r = _request_with_retry("post", url, headers=headers, json=payload) resp = r.json() logging.info(f"Response for POST Create FineTune - Status {resp}") cost = cls._create_cost_from_response(resp) - return Finetune(name, dataset_list, model, cost, train_percentage=train_percentage, dev_percentage=dev_percentage) + return Finetune( + name, + dataset_list, + model, + cost, + train_percentage=train_percentage, + dev_percentage=dev_percentage, + prompt=prompt, + hyperparameters=hyperparameters, + peft=peft, + ) except Exception: error_message = f"Create FineTune: Error with payload {json.dumps(payload)}" logging.exception(error_message) diff --git a/aixplain/factories/finetune_factory/prompt_validator.py b/aixplain/factories/finetune_factory/prompt_validator.py new file mode 100644 index 00000000..2ed753e1 --- /dev/null +++ b/aixplain/factories/finetune_factory/prompt_validator.py @@ -0,0 +1,41 @@ +from typing import List, Text +from aixplain.modules.dataset import Dataset +import re + + +def _get_data_list(dataset: Dataset): + flatten_target_values = [item for sublist in list(dataset.target_data.values()) for item in sublist] + data_list = list(dataset.source_data.values()) + flatten_target_values + return data_list + + +def validate_prompt(prompt: Text, dataset_list: List[Dataset]) -> Text: + result_prompt = prompt + referenced_data = set(re.findall("<<(.+?)>>", prompt)) + for dataset in dataset_list: + data_list = _get_data_list(dataset) + for data in data_list: + if data.id in referenced_data: + result_prompt = result_prompt.replace(f"<<{data.id}>>", f"<<{data.name}>>") + referenced_data.remove(data.id) + referenced_data.add(data.name) + + # check if dataset list has same data name and it is referenced + name_set = set() + for dataset in dataset_list: + data_list = _get_data_list(dataset) + for data in data_list: + assert not ( + data.name in name_set and data.name in referenced_data + ), "Datasets must not have more than one referenced data with same name" + name_set.add(data.name) + + # check if all referenced data have a respective data in dataset list + for dataset in dataset_list: + data_list = _get_data_list(dataset) + for data in data_list: + if data.name in referenced_data: + result_prompt = 
result_prompt.replace(f"<<{data.name}>>", f"{{{data.name}}}") + referenced_data.remove(data.name) + assert len(referenced_data) == 0, "Referenced data are not present in dataset list" + return result_prompt diff --git a/aixplain/factories/model_factory.py b/aixplain/factories/model_factory.py index 2e04bf97..950cd25e 100644 --- a/aixplain/factories/model_factory.py +++ b/aixplain/factories/model_factory.py @@ -90,7 +90,7 @@ def get(cls, model_id: Text, api_key: Optional[Text] = None) -> Model: headers = {"x-aixplain-key": f"{cls.aixplain_key}", "Content-Type": "application/json"} else: headers = {"Authorization": f"Token {cls.api_key}", "Content-Type": "application/json"} - logging.info(f"Start service for GET Metric - {url} - {headers}") + logging.info(f"Start service for GET Model - {url} - {headers}") r = _request_with_retry("get", url, headers=headers) resp = r.json() # set api key diff --git a/aixplain/modules/__init__.py b/aixplain/modules/__init__.py index 5d58847b..0902eaf4 100644 --- a/aixplain/modules/__init__.py +++ b/aixplain/modules/__init__.py @@ -28,7 +28,6 @@ from .metric import Metric from .model import Model from .pipeline import Pipeline -from .finetune import Finetune +from .finetune import Finetune, FinetuneCost from .benchmark import Benchmark from .benchmark_job import BenchmarkJob -from .finetune_cost import FinetuneCost diff --git a/aixplain/modules/finetune.py b/aixplain/modules/finetune/__init__.py similarity index 69% rename from aixplain/modules/finetune.py rename to aixplain/modules/finetune/__init__.py index 3bc86ad7..cf311f8e 100644 --- a/aixplain/modules/finetune.py +++ b/aixplain/modules/finetune/__init__.py @@ -20,17 +20,20 @@ Description: FineTune Class """ -from typing import List, Text +from typing import List, Text, Optional import logging -from aixplain.utils.file_utils import _request_with_retry import json from urllib.parse import urljoin -from aixplain.utils import config +from aixplain.modules.finetune.cost import FinetuneCost +from aixplain.modules.finetune.hyperparameters import Hyperparameters +from aixplain.modules.finetune.peft import Peft from aixplain.factories.model_factory import ModelFactory from aixplain.modules.asset import Asset from aixplain.modules.dataset import Dataset from aixplain.modules.model import Model -from aixplain.modules.finetune_cost import FinetuneCost + +from aixplain.utils import config +from aixplain.utils.file_utils import _request_with_retry class Finetune(Asset): @@ -47,6 +50,9 @@ class Finetune(Asset): version (Text): Version of the FineTune. train_percentage (float): Percentage of training samples. dev_percentage (float): Percentage of development samples. + prompt (Text): Fine-tuning prompt. + hyperparameters (Hyperparameters): Hyperparameters for fine-tuning. + peft (Peft): PEFT (Parameter-Efficient Fine-Tuning) configuration. additional_info (dict): Additional information to be saved with the FineTune. backend_url (str): URL of the backend. api_key (str): The TEAM API key used for authentication. 
@@ -58,12 +64,15 @@ def __init__(
         dataset_list: List[Dataset],
         model: Model,
         cost: FinetuneCost,
-        id: Text = "",
-        description: Text = "",
-        supplier: Text = "aiXplain",
-        version: Text = "1.0",
-        train_percentage: float = 100,
-        dev_percentage: float = 0,
+        id: Optional[Text] = "",
+        description: Optional[Text] = "",
+        supplier: Optional[Text] = "aiXplain",
+        version: Optional[Text] = "1.0",
+        train_percentage: Optional[float] = 100,
+        dev_percentage: Optional[float] = 0,
+        prompt: Optional[Text] = None,
+        hyperparameters: Optional[Hyperparameters] = None,
+        peft: Optional[Peft] = None,
         **additional_info,
     ) -> None:
         """Create a FineTune with the necessary information.
@@ -79,6 +88,9 @@ def __init__(
             version (Text, optional): Version of the FineTune. Defaults to "1.0".
             train_percentage (float, optional): Percentage of training samples. Defaults to 100.
             dev_percentage (float, optional): Percentage of development samples. Defaults to 0.
+            prompt (Text, optional): Fine-tuning prompt. Defaults to None.
+            hyperparameters (Hyperparameters, optional): Hyperparameters for fine-tuning. Defaults to None.
+            peft (Peft, optional): PEFT (Parameter-Efficient Fine-Tuning) configuration. Defaults to None.
             **additional_info: Additional information to be saved with the FineTune.
         """
         super().__init__(id, name, description, supplier, version)
@@ -87,6 +99,9 @@ def __init__(
         self.cost = cost
         self.train_percentage = train_percentage
         self.dev_percentage = dev_percentage
+        self.prompt = prompt
+        self.hyperparameters = hyperparameters
+        self.peft = peft
         self.additional_info = additional_info
         self.backend_url = config.BACKEND_URL
         self.api_key = config.TEAM_API_KEY
@@ -102,22 +117,28 @@ def start(self) -> Model:
         try:
             url = urljoin(self.backend_url, f"sdk/finetune")
             headers = {"Authorization": f"Token {self.api_key}", "Content-Type": "application/json"}
-            payload = json.dumps(
-                {
-                    "name": self.name,
-                    "datasets": [
-                        {
-                            "datasetId": dataset.id,
-                            "trainSamplesPercentage": self.train_percentage,
-                            "devSamplesPercentage": self.dev_percentage,
-                        }
-                        for dataset in self.dataset_list
-                    ],
-                    "sourceModelId": self.model.id,
-                }
-            )
+            payload = {
+                "name": self.name,
+                "datasets": [
+                    {
+                        "datasetId": dataset.id,
+                        "trainSamplesPercentage": self.train_percentage,
+                        "devSamplesPercentage": self.dev_percentage,
+                    }
+                    for dataset in self.dataset_list
+                ],
+                "sourceModelId": self.model.id,
+            }
+            parameters = {}
+            if self.prompt is not None:
+                parameters["prompt"] = self.prompt
+            if self.hyperparameters is not None:
+                parameters["hyperparameters"] = self.hyperparameters.to_dict()
+            if self.peft is not None:
+                parameters["peft"] = self.peft.to_dict()
+            payload["parameters"] = parameters
             logging.info(f"Start service for POST Start FineTune - {url} - {headers} - {json.dumps(payload)}")
-            r = _request_with_retry("post", url, headers=headers, data=payload)
+            r = _request_with_retry("post", url, headers=headers, json=payload)
             resp = r.json()
             logging.info(f"Response for POST Start FineTune - Name: {self.name} / Status {resp}")
             return ModelFactory().get(resp["id"])
diff --git a/aixplain/modules/finetune_cost.py b/aixplain/modules/finetune/cost.py
similarity index 100%
rename from aixplain/modules/finetune_cost.py
rename to aixplain/modules/finetune/cost.py
diff --git a/aixplain/modules/finetune/hyperparameters.py b/aixplain/modules/finetune/hyperparameters.py
new file mode 100644
index 00000000..3a68a9d7
--- /dev/null
+++ b/aixplain/modules/finetune/hyperparameters.py
@@ -0,0 +1,17 @@
+from dataclasses import dataclass
+from dataclasses_json import dataclass_json
+
+
+@dataclass_json
+@dataclass
+class Hyperparameters(object):
+    epochs: int = 4
+    train_batch_size: int = 4
+    eval_batch_size: int = 4
+    learning_rate: float = 2e-5
+    warmup_steps: int = 500
+    generation_max_length: int = 225
+    tokenizer_batch_size: int = 256
+    gradient_checkpointing: bool = False
+    gradient_accumulation_steps: int = 1
+    max_seq_length: int = 4096
diff --git a/aixplain/modules/finetune/peft.py b/aixplain/modules/finetune/peft.py
new file mode 100644
index 00000000..d17efecf
--- /dev/null
+++ b/aixplain/modules/finetune/peft.py
@@ -0,0 +1,10 @@
+from dataclasses import dataclass
+from dataclasses_json import dataclass_json
+
+
+@dataclass_json
+@dataclass
+class Peft(object):
+    peft_lora_r: int = 8
+    peft_lora_alpha: int = 32
+    peft_lora_dropout: float = 0.05
diff --git a/aixplain/modules/model.py b/aixplain/modules/model.py
index a92bde55..440181a1 100644
--- a/aixplain/modules/model.py
+++ b/aixplain/modules/model.py
@@ -279,3 +279,17 @@ def check_finetune_status(self):
             message = f"Status {status_code} - {message}"
             error_message = f"Check FineTune status Model: Error {message}"
             logging.exception(error_message)
+
+    def delete(self) -> None:
+        """Delete Model service"""
+        try:
+            url = urljoin(self.backend_url, f"sdk/models/{self.id}")
+            headers = {"Authorization": f"Token {self.api_key}", "Content-Type": "application/json"}
+            logging.info(f"Start service for DELETE Model - {url} - {headers}")
+            r = _request_with_retry("delete", url, headers=headers)
+            if r.status_code != 200:
+                raise Exception()
+        except Exception:
+            message = "Model Deletion Error: Make sure the model exists and you are the owner."
+            logging.error(message)
+            raise Exception(f"{message}")
diff --git a/pyproject.toml b/pyproject.toml
index 4ec45221..ab7b901e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,8 @@ dependencies = [
     "validators>=0.20.0",
     "filetype>=1.2.0",
     "click>=8.1.7",
-    "PyYAML>=6.0.1"
+    "PyYAML>=6.0.1",
+    "dataclasses-json==0.6.1"
 ]
 
 [project.urls]
diff --git a/tests/functional/finetune/data/finetune_test_end2end.json b/tests/functional/finetune/data/finetune_test_end2end.json
new file mode 100644
index 00000000..ae7e7bb5
--- /dev/null
+++ b/tests/functional/finetune/data/finetune_test_end2end.json
@@ -0,0 +1,12 @@
+[
+    {
+        "model_name": "Chat GPT 3.5",
+        "dataset_name": "Test text generation dataset",
+        "inference_data": "Hello!"
+    },
+    {
+        "model_name": "GPT2",
+        "dataset_name": "Test text generation dataset",
+        "inference_data": "Hello!"
+    }
+]
\ No newline at end of file
diff --git a/tests/functional/finetune/data/finetune_test_list_data.json b/tests/functional/finetune/data/finetune_test_list_data.json
index 4f322075..b5b13a57 100644
--- a/tests/functional/finetune/data/finetune_test_list_data.json
+++ b/tests/functional/finetune/data/finetune_test_list_data.json
@@ -1,11 +1,5 @@
 [
     {
-        "function": "translation",
-        "source_language": {"language": "en", "dialect": ""},
-        "target_language": {"language": "fr", "dialect": ""}
-    },
-    {
-        "function": "speech-recognition",
-        "source_language": {"language": "en", "dialect": ""}
+        "function": "text-generation"
     }
 ]
\ No newline at end of file
diff --git a/tests/functional/finetune/finetune_functional_test.py b/tests/functional/finetune/finetune_functional_test.py
index 62fe3892..f5143be6 100644
--- a/tests/functional/finetune/finetune_functional_test.py
+++ b/tests/functional/finetune/finetune_functional_test.py
@@ -24,13 +24,13 @@
 from aixplain.factories import ModelFactory
 from aixplain.factories import DatasetFactory
 from aixplain.factories import FinetuneFactory
-from aixplain.modules import FinetuneCost
+from aixplain.modules.finetune.cost import FinetuneCost
 from aixplain.enums import Function, Language
 import pytest
 
 TIMEOUT = 20000.0
-RUN_FILE = "tests/functional/finetune/data/finetune_test_run_data.json"
+RUN_FILE = "tests/functional/finetune/data/finetune_test_end2end.json"
 LIST_FILE = "tests/functional/finetune/data/finetune_test_list_data.json"
@@ -47,9 +47,10 @@ def run_input_map(request):
     return request.param
 
 def list_input_map(request):
     return request.param
 
-def test_run(run_input_map):
-    model = ModelFactory.get(run_input_map["model_id"])
-    dataset_list = [DatasetFactory.get(run_input_map["dataset_id"])]
+
+def test_end2end_text_generation(run_input_map):
+    model = ModelFactory.list(query=run_input_map["model_name"], is_finetunable=True)["results"][0]
+    dataset_list = [DatasetFactory.list(query=run_input_map["dataset_name"])["results"][0]]
     finetune = FinetuneFactory.create(str(uuid.uuid4()), dataset_list, model)
     assert type(finetune.cost) is FinetuneCost
     cost_map = finetune.cost.to_dict()
@@ -64,6 +65,10 @@ def test_run(run_input_map):
     assert status != "failed"
     end = time.time()
     assert finetune_model.check_finetune_status() == "onboarded"
+    result = finetune_model.run(run_input_map["inference_data"])
+    assert result is not None
+    finetune_model.delete()
+
 
 def test_list_finetunable_models(list_input_map):
     model_list = ModelFactory.list(
@@ -71,5 +76,5 @@ def test_list_finetunable_models(list_input_map):
         source_languages=Language(list_input_map["source_language"]) if "source_language" in list_input_map else None,
         target_languages=Language(list_input_map["target_language"]) if "target_language" in list_input_map else None,
         is_finetunable=True,
-    )
+    )["results"]
     assert len(model_list) > 0
diff --git a/tests/functional/general_assets/asset_functional_test.py b/tests/functional/general_assets/asset_functional_test.py
index d8c79594..0c410df2 100644
--- a/tests/functional/general_assets/asset_functional_test.py
+++ b/tests/functional/general_assets/asset_functional_test.py
@@ -68,3 +68,9 @@ def test_model_query():
     models = ModelFactory.list(query=query)["results"]
     for model in models:
         assert query in model.name
+
+
+def test_model_deletion():
+    model = ModelFactory.get("640b517694bf816d35a59125")
+    with pytest.raises(Exception):
+        model.delete()
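
Reviewer notes (not part of the patch):

1. For reference, a minimal sketch of how the new create() options fit together.
   The query strings and the <<question>> placeholder are hypothetical: they assume
   a finetunable text-generation model and a dataset with a data column named
   "question" are available on the platform. Note that the list() factories return
   a dict with a "results" list, as used in the updated tests.

    from aixplain.factories import DatasetFactory, FinetuneFactory, ModelFactory
    from aixplain.modules.finetune.hyperparameters import Hyperparameters
    from aixplain.modules.finetune.peft import Peft

    # Hypothetical lookups; any finetunable model and text-generation dataset work.
    model = ModelFactory.list(query="GPT2", is_finetunable=True)["results"][0]
    dataset = DatasetFactory.list(query="Test text generation dataset")["results"][0]

    finetune = FinetuneFactory.create(
        name="my-finetune",
        dataset_list=[dataset],
        model=model,
        # <<...>> references a dataset column by name (or id); validate_prompt
        # rewrites it to the {question} form before the request is sent.
        prompt="Answer the question: <<question>>",
        hyperparameters=Hyperparameters(epochs=2, learning_rate=1e-5),
        peft=Peft(peft_lora_r=16),
    )
    print(finetune.cost.to_dict())     # cost estimate from sdk/finetune/cost-estimation
    finetune_model = finetune.start()  # POSTs to sdk/finetune and returns a Model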
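2. The placeholder rewriting in prompt_validator.py is easiest to see on a toy
   example. SimpleNamespace stands in for a Dataset and its data columns here;
   the ids and names are made up, and the sketch relies on validate_prompt using
   only duck typing (no isinstance checks):

    from types import SimpleNamespace

    from aixplain.factories.finetune_factory.prompt_validator import validate_prompt

    # Fake dataset: one source column ("question"), one target column ("answer").
    # target_data values are lists, matching what _get_data_list flattens.
    question = SimpleNamespace(id="650abc", name="question")
    answer = SimpleNamespace(id="650abd", name="answer")
    dataset = SimpleNamespace(
        source_data={"question": question},
        target_data={"answer": [answer]},
    )

    # Id references are first normalized to names, then names to {braces}.
    print(validate_prompt("Q: <<650abc>> A: <<answer>>", [dataset]))
    # -> "Q: {question} A: {answer}"

    # A reference with no matching column trips the final assertion:
    # validate_prompt("Q: <<missing>>", [dataset])  # AssertionError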
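3. The Hyperparameters and Peft dataclasses lean on the new dataclasses-json pin
   in pyproject.toml: the @dataclass_json decorator generates the to_dict() that
   create() and start() place under the payload's "parameters" key. A standalone
   sketch mirroring peft.py:

    from dataclasses import dataclass

    from dataclasses_json import dataclass_json


    @dataclass_json
    @dataclass
    class Peft:
        peft_lora_r: int = 8
        peft_lora_alpha: int = 32
        peft_lora_dropout: float = 0.05


    # to_dict() comes from @dataclass_json; no hand-written serializer needed.
    print(Peft(peft_lora_r=16).to_dict())
    # {'peft_lora_r': 16, 'peft_lora_alpha': 32, 'peft_lora_dropout': 0.05}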