diff --git a/aixplain/enums/data_type.py b/aixplain/enums/data_type.py
index d13542f9..fa79d070 100644
--- a/aixplain/enums/data_type.py
+++ b/aixplain/enums/data_type.py
@@ -35,4 +35,4 @@ class DataType(Enum):
VIDEO = "video"
def __str__(self):
- return self._value_
\ No newline at end of file
+ return self._value_
diff --git a/aixplain/factories/pipeline_factory.py b/aixplain/factories/pipeline_factory.py
index 078bcae6..404a5556 100644
--- a/aixplain/factories/pipeline_factory.py
+++ b/aixplain/factories/pipeline_factory.py
@@ -22,6 +22,7 @@
"""
import json
import logging
+import os
from typing import Dict, List, Optional, Text, Union
from aixplain.enums.data_type import DataType
from aixplain.enums.function import Function
@@ -207,7 +208,7 @@ def list(
output_data_types = [output_data_types]
payload["inputDataTypes"] = [data_type.value for data_type in output_data_types]
- logging.info(f"Start service for POST List Dataset - {url} - {headers} - {json.dumps(payload)}")
+ logging.info(f"Start service for POST List Pipeline - {url} - {headers} - {json.dumps(payload)}")
r = _request_with_retry("post", url, headers=headers, json=payload)
resp = r.json()
@@ -220,3 +221,40 @@ def list(
for pipeline in results:
pipelines.append(cls.__from_response(pipeline))
return {"results": pipelines, "page_total": page_total, "page_number": page_number, "total": total}
+
+ @classmethod
+ def create(cls, name: Text, pipeline: Union[Text, Dict], status: Text = "draft") -> Pipeline:
+ """Pipeline Creation
+
+ Args:
+ name (Text): Pipeline Name
+ pipeline (Union[Text, Dict]): Pipeline as a Python dictionary or a path to a JSON file
+ status (Text, optional): Status of the pipeline. Currently only draft pipelines can be saved. Defaults to "draft".
+
+ Raises:
+ Exception: Currently only the creation of draft pipelines is supported
+
+ Returns:
+ Pipeline: instance of the new pipeline
+ """
+ try:
+ assert status == "draft", "Pipeline Creation Error: Currently only the creation of draft pipelines is supported."
+ if isinstance(pipeline, str):
+ _, ext = os.path.splitext(pipeline)
+ assert (
+ os.path.exists(pipeline) and ext == ".json"
+ ), "Pipeline Creation Error: Make sure the pipeline to be saved is in a JSON file."
+ with open(pipeline) as f:
+ pipeline = json.load(f)
+
+ # prepare payload
+ payload = {"name": name, "status": "draft", "architecture": pipeline}
+ url = urljoin(cls.backend_url, "sdk/pipelines")
+ headers = {"Authorization": f"Token {config.TEAM_API_KEY}", "Content-Type": "application/json"}
+ logging.info(f"Start service for POST Create Pipeline - {url} - {headers} - {json.dumps(payload)}")
+ r = _request_with_retry("post", url, headers=headers, json=payload)
+ response = r.json()
+
+ return Pipeline(response["id"], name, config.TEAM_API_KEY)
+ except Exception as e:
+ raise Exception(e)
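
For reference, a minimal usage sketch of the new `create` method (the pipeline name and file path below are illustrative; assumes a valid `TEAM_API_KEY` is configured):

```python
import json
from aixplain.factories import PipelineFactory

# From a JSON file on disk (the path must exist and end in ".json")
draft = PipelineFactory.create(name="my-draft-pipeline", pipeline="pipeline.json")

# Or from a Python dictionary holding the architecture
with open("pipeline.json") as f:
    architecture = json.load(f)
draft = PipelineFactory.create(name="my-draft-pipeline", pipeline=architecture)

print(draft.id)  # ID assigned by the backend on creation
```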
diff --git a/aixplain/modules/metadata.py b/aixplain/modules/metadata.py
index f7fac5f5..07007ebe 100644
--- a/aixplain/modules/metadata.py
+++ b/aixplain/modules/metadata.py
@@ -43,6 +43,7 @@ def __init__(
file_extension: Optional[FileType] = None,
languages: List[Language] = [],
dsubtype: DataSubtype = DataSubtype.OTHER,
+ id: Optional[Text] = None,
**kwargs
) -> None:
"""MetaData Class
@@ -62,6 +63,7 @@ def __init__(
file_extension (Optional[FileType], optional): File extension (e.g. CSV, TXT, etc.). Defaults to None.
languages (List[Language], optional): List of languages which the data consists of. Defaults to [].
dsubtype (DataSubtype, optional): Data subtype (e.g., age, topic, race, split, etc.), used in datasets metadata. Defaults to Other.
+ id (Optional[Text], optional): Data ID. Defaults to None.
"""
self.name = name
if isinstance(dtype, str):
@@ -91,4 +93,5 @@ def __init__(
language = Language(language)
self.languages.append(language)
self.dsubtype = dsubtype
+ self.id = id
self.kwargs = kwargs
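
A quick sketch of the new optional `id` field on `MetaData` (the column name and ID value are hypothetical):

```python
from aixplain.modules import MetaData

# The data ID can now be carried alongside the usual metadata fields
meta = MetaData(name="transcripts", dtype="text", storage_type="text", id="0123456789abcdef01234567")
```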
diff --git a/aixplain/modules/pipeline.py b/aixplain/modules/pipeline.py
index 011f3114..3de49756 100644
--- a/aixplain/modules/pipeline.py
+++ b/aixplain/modules/pipeline.py
@@ -23,11 +23,13 @@
import time
import json
+import os
import logging
from aixplain.modules.asset import Asset
from aixplain.utils import config
from aixplain.utils.file_utils import _request_with_retry
from typing import Dict, Optional, Text, Union
+from urllib.parse import urljoin
class Pipeline(Asset):
@@ -306,3 +308,32 @@ def run_async(
if resp is not None:
response["error"] = resp
return response
+
+ def update(self, pipeline: Union[Text, Dict]):
+ """Update Pipeline
+
+ Args:
+ pipeline (Union[Text, Dict]): Pipeline as a Python dictionary or a path to a JSON file
+
+ Raises:
+ Exception: raised when the pipeline is not passed as a dictionary or a path to a valid JSON file.
+ """
+ try:
+ if isinstance(pipeline, str):
+ _, ext = os.path.splitext(pipeline)
+ assert (
+ os.path.exists(pipeline) and ext == ".json"
+ ), "Pipeline Update Error: Make sure the pipeline to be saved is in a JSON file."
+ with open(pipeline) as f:
+ pipeline = json.load(f)
+
+ # prepare payload
+ payload = {"name": self.name, "status": "draft", "architecture": pipeline}
+ url = urljoin(config.BACKEND_URL, f"sdk/pipelines/{self.id}")
+ headers = {"Authorization": f"Token {config.TEAM_API_KEY}", "Content-Type": "application/json"}
+ logging.info(f"Start service for PUT Update Pipeline - {url} - {headers} - {json.dumps(payload)}")
+ r = _request_with_retry("put", url, headers=headers, json=payload)
+ response = r.json()
+ logging.info(f"Pipeline {response['id']} Updated.")
+ except Exception as e:
+ raise Exception(e)
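
And a companion sketch for the new `update` method (assumes `draft` is the `Pipeline` returned by `PipelineFactory.create` above and that `pipeline.json` holds the revised architecture):

```python
# Only the architecture changes; the payload is rebuilt with the
# pipeline's current name and "draft" status.
draft.update(pipeline="pipeline.json")             # from a JSON file
draft.update(pipeline={"nodes": [], "links": []})  # or from a dict
```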
diff --git a/aixplain/processes/data_onboarding/onboard_functions.py b/aixplain/processes/data_onboarding/onboard_functions.py
index 091458fd..35a64e12 100644
--- a/aixplain/processes/data_onboarding/onboard_functions.py
+++ b/aixplain/processes/data_onboarding/onboard_functions.py
@@ -97,11 +97,11 @@ def process_data_files(
-1,
0,
)
- if metadata.dtype in [DataType.AUDIO, DataType.IMAGE] or metadata.dsubtype == DataSubtype.INTERVAL:
+ if metadata.dtype in [DataType.AUDIO, DataType.IMAGE, DataType.LABEL] or metadata.dsubtype == DataSubtype.INTERVAL:
files, data_column_idx, start_column_idx, end_column_idx, nrows = process_media_files.run(
metadata=metadata, paths=paths, folder=folder
)
- elif metadata.dtype in [DataType.TEXT, DataType.LABEL]:
+ elif metadata.dtype in [DataType.TEXT]:
files, data_column_idx, nrows = process_text_files.run(metadata=metadata, paths=paths, folder=folder)
return files, data_column_idx, start_column_idx, end_column_idx, nrows
diff --git a/aixplain/processes/data_onboarding/process_media_files.py b/aixplain/processes/data_onboarding/process_media_files.py
index 3f95b1e3..8b333d72 100644
--- a/aixplain/processes/data_onboarding/process_media_files.py
+++ b/aixplain/processes/data_onboarding/process_media_files.py
@@ -16,6 +16,7 @@
from pathlib import Path
from tqdm import tqdm
from typing import List, Tuple
+from urllib.parse import urlparse
AUDIO_MAX_SIZE = 50000000
IMAGE_TEXT_MAX_SIZE = 25000000
@@ -45,6 +46,11 @@ def run(metadata: MetaData, paths: List, folder: Path, batch_size: int = 100) ->
Returns:
Tuple[List[File], int, int, int]: list of s3 links; data, start and end columns index, and number of rows
"""
+ if metadata.dtype != DataType.LABEL:
+ assert (
+ metadata.storage_type != StorageType.TEXT
+ ), f'Data Asset Onboarding Error: Column "{metadata.name}" of type "{metadata.dtype}" cannot be stored in text.'
+
# if files are stored locally, create a folder to store it
media_folder = Path(".")
if metadata.storage_type == StorageType.FILE:
@@ -95,6 +101,10 @@ def run(metadata: MetaData, paths: List, folder: Path, batch_size: int = 100) ->
assert (
os.path.getsize(media_path) <= AUDIO_MAX_SIZE
), f'Data Asset Onboarding Error: Local audio file "{media_path}" exceeds the size limit of 50 MB.'
+ elif metadata.dtype == DataType.LABEL:
+ assert (
+ os.path.getsize(media_path) <= IMAGE_TEXT_MAX_SIZE
+ ), f'Data Asset Onboarding Error: Local label file "{media_path}" exceeds the size limit of 25 MB.'
else:
assert (
os.path.getsize(media_path) <= IMAGE_TEXT_MAX_SIZE
diff --git a/aixplain/processes/data_onboarding/process_text_files.py b/aixplain/processes/data_onboarding/process_text_files.py
index 48db3f4e..1ba7f47e 100644
--- a/aixplain/processes/data_onboarding/process_text_files.py
+++ b/aixplain/processes/data_onboarding/process_text_files.py
@@ -26,7 +26,7 @@ def process_text(content: str, storage_type: StorageType) -> Text:
Text: textual content
"""
if storage_type == StorageType.FILE:
- # Check the size of file and assert a limit of 50 MB
+ # Check the size of the file and assert a limit of 25 MB
assert (
os.path.getsize(content) <= 25000000
), f'Data Asset Onboarding Error: Local text file "{content}" exceeds the size limit of 25 MB.'
diff --git a/docs/assets/architecture.png b/docs/assets/architecture.png
new file mode 100644
index 00000000..91131c72
Binary files /dev/null and b/docs/assets/architecture.png differ
diff --git a/docs/development/developer_guide.md b/docs/development/developer_guide.md
index cfcedbe6..aaabc8f5 100644
--- a/docs/development/developer_guide.md
+++ b/docs/development/developer_guide.md
@@ -40,8 +40,13 @@ set LOG_LEVEL=DEBUG
%env LOG_LEVEL=DEBUG
```
+## Architecture
-## Data Asset Onboard
+### Diagram
+
+![Architecture](../assets/architecture.png)
+
+### Data Asset Onboard
The image below depicts the onboard process of a data asset (e.g. corpora and datasets):
diff --git a/docs/samples/label_dataset_onboarding/corpus/images/1.jpg b/docs/samples/label_dataset_onboarding/corpus/images/1.jpg
new file mode 100644
index 00000000..ae3d592c
Binary files /dev/null and b/docs/samples/label_dataset_onboarding/corpus/images/1.jpg differ
diff --git a/docs/samples/label_dataset_onboarding/corpus/images/2.png b/docs/samples/label_dataset_onboarding/corpus/images/2.png
new file mode 100644
index 00000000..ba23ab11
Binary files /dev/null and b/docs/samples/label_dataset_onboarding/corpus/images/2.png differ
diff --git a/docs/samples/label_dataset_onboarding/corpus/index.csv b/docs/samples/label_dataset_onboarding/corpus/index.csv
new file mode 100644
index 00000000..69ba347a
--- /dev/null
+++ b/docs/samples/label_dataset_onboarding/corpus/index.csv
@@ -0,0 +1,3 @@
+,images,labels
+0,corpus/images/1.jpg,corpus/labels/1.json
+1,corpus/images/2.png,corpus/labels/2.json
diff --git a/docs/samples/label_dataset_onboarding/corpus/labels/1.json b/docs/samples/label_dataset_onboarding/corpus/labels/1.json
new file mode 100644
index 00000000..6947447f
--- /dev/null
+++ b/docs/samples/label_dataset_onboarding/corpus/labels/1.json
@@ -0,0 +1,9 @@
+{
+ "data": "arcade",
+ "boundingBox": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ }
+}
\ No newline at end of file
diff --git a/docs/samples/label_dataset_onboarding/corpus/labels/2.json b/docs/samples/label_dataset_onboarding/corpus/labels/2.json
new file mode 100644
index 00000000..b990cfd3
--- /dev/null
+++ b/docs/samples/label_dataset_onboarding/corpus/labels/2.json
@@ -0,0 +1,9 @@
+{
+ "data": "building",
+ "boundingBox": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ }
+}
\ No newline at end of file
diff --git a/docs/samples/label_dataset_onboarding/label_dataset_onboarding.ipynb b/docs/samples/label_dataset_onboarding/label_dataset_onboarding.ipynb
new file mode 100644
index 00000000..f499dd51
--- /dev/null
+++ b/docs/samples/label_dataset_onboarding/label_dataset_onboarding.ipynb
@@ -0,0 +1,399 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Image Label Detection Dataset Onboarding\n",
+ "\n",
+ "This notebook demonstrates how to onboard a dataset with label data into aiXplain platform using its SDK."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Credentials\n",
+ "\n",
+ "To use the aiXplain SDK, you may be registered in our platform and have an API key. The step-by-step on how to do it is better described [here](/docs/user/api_setup.md)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "os.environ[\"TEAM_API_KEY\"] = \"YOUR_TEAM_API_KEY_HERE\""
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Data\n",
+ "\n",
+ "In this example we will show how to onboard a sample dataset of images and their corresponding labels. To onboard it, the data needs to be depicted in a CSV file, which will be fed to the SDK. \n",
+ "\n",
+ "Label data should have be one or more elements in a JSON file according to one of the following structure:\n",
+ "\n",
+ "```json\n",
+ "{\n",
+ " \"data\": \"TEXT_AUDIO_LABEL\",\n",
+ " \"boundingBox\": {\n",
+ " \"start\": 0, // start character\n",
+ " \"end\": 0, // end character\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "{\n",
+ " \"data\": \"AUDIO_LABEL\",\n",
+ " \"boundingBox\": {\n",
+ " \"start\": 0, // start second\n",
+ " \"end\": 0 // end second\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "{\n",
+ " \"data\": \"IMAGE_LABEL\",\n",
+ " \"boundingBox\": {\n",
+ " \"top\": 0, // top percentage of the image\n",
+ " \"bottom\": 0, // bottom percentage of the image\n",
+ " \"left\": 0, // left percentage of the image\n",
+ " \"right\": 0 // right percentage of the image\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "{\n",
+ " \"data\": \"VIDEO_LABEL\",\n",
+ " \"boundingBox\": {\n",
+ " \"start\": 0, // start second\n",
+ " \"end\": 0, // end second\n",
+ " \"top\": 0, // top percentage of the image\n",
+ " \"bottom\": 0, // bottom percentage of the image\n",
+ " \"left\": 0, // left percentage of the image\n",
+ " \"right\": 0 // right percentage of the image\n",
+ " }\n",
+ "}\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/1p/jbswfpbs73q5qbbh78dzj5xm0000gn/T/ipykernel_47954/611755932.py:1: DeprecationWarning: \n",
+ "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
+ "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
+ "but was not found to be installed on your system.\n",
+ "If this would cause problems for you,\n",
+ "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
+ " \n",
+ " import pandas as pd\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " images | \n",
+ " labels | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " corpus/images/1.jpg | \n",
+ " corpus/labels/1.json | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " corpus/images/2.png | \n",
+ " corpus/labels/2.json | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 images labels\n",
+ "0 0 corpus/images/1.jpg corpus/labels/1.json\n",
+ "1 1 corpus/images/2.png corpus/labels/2.json"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "upload_file = \"corpus/index.csv\"\n",
+ "data = pd.read_csv(upload_file)\n",
+ "data"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Import\n",
+ "\n",
+ "Let's now import the necessary classes to onboard the corpus."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from aixplain.enums import DataType, DataSubtype, Function, Language, License, StorageType\n",
+ "from aixplain.factories import DatasetFactory\n",
+ "from aixplain.modules import MetaData"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Metadata\n",
+ "\n",
+ "Besides the CSV file, a schema must be fed to the SDK giving some information about the input and output data to be onboarded, such as: \n",
+ "\n",
+ "1. Data Name\n",
+ "2. Data Type: Audio, Text, Image, Video, Label, etc.\n",
+ "3. Storage Type: whether the data is depicted in the CSV (Text), in a local file (File) or in a public link (URL)\n",
+ "4. Start Column (optional): the column which depicts the beginning of the segment in the original file\n",
+ "5. End Column (optional): the column which depicts the end of the segment in the original file\n",
+ "6. Languages (optional): the languages depicted in the data"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's instantiate the metadata for the images:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_meta = MetaData(\n",
+ " name=\"images\", \n",
+ " dtype=\"image\", \n",
+ " storage_type=\"file\", \n",
+ ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now for the labels...\n",
+ "\n",
+ "(See how we can use enumerations instead of strings to specify some information)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "label_meta = MetaData(\n",
+ " name=\"labels\", \n",
+ " dtype=DataType.LABEL, \n",
+ " storage_type=StorageType.FILE,\n",
+ ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's now create the schemas for the input and output data of the dataset. Since this is a image label detection dataset, the images will be set as the input and the labels as the output data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "input_schema = [image_meta]\n",
+ "output_schema = [label_meta]"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally we can called the `create` method to onboard the dataset, specifying the name, description, license, path to the content files and schemas. \n",
+ "\n",
+ "See that a Dataset ID will be provided as response together with the status of the onboarding process."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " Dataset's inputs onboard progress: 0%| | 0/1 [00:00, ?it/s]\n",
+ "\u001b[A\n",
+ " Dataset's inputs onboard progress: 100%|██████████| 1/1 [00:06<00:00, 6.71s/it]\n",
+ " Dataset's outputs onboard progress: 0%| | 0/1 [00:00, ?it/s]\n",
+ "\u001b[A\n",
+ " Dataset's outputs onboard progress: 100%|██████████| 1/1 [00:02<00:00, 2.51s/it]\n",
+ " Dataset's hypotheses onboard progress: 0it [00:00, ?it/s]\n",
+ " Dataset's meta onboard progress: 0it [00:00, ?it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'status': 'onboarding', 'asset_id': '6615453db2166233fe1ab291'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "payload = DatasetFactory.create(\n",
+ " name=\"dataset_onboarding_demo\",\n",
+ " description=\"This is an image label detection corpus\",\n",
+ " license=License.MIT,\n",
+ " function=Function.IMAGE_LABEL_DETECTION,\n",
+ " content_path=upload_file,\n",
+ " input_schema=input_schema,\n",
+ " output_schema=output_schema\n",
+ ")\n",
+ "print(payload)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can then check the dataset using the `get` method."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:root:Start service for GET Dataset - https://dev-platform-api.aixplain.com/sdk/datasets/6615453db2166233fe1ab291/overview - {'Authorization': 'Token 9136c08bf02b5552885b9f2a5e0fae517d81ff2fa6fe7084a3adb655c4aa7215', 'Content-Type': 'application/json'}\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'id': '6615453db2166233fe1ab291',\n",
+ " 'name': 'dataset_onboarding_demo',\n",
+ " 'description': 'This is an image label detection corpus',\n",
+ " 'supplier': 'aiXplain',\n",
+ " 'version': '1.0',\n",
+ " 'license': ,\n",
+ " 'privacy': ,\n",
+ " 'cost': 0,\n",
+ " 'onboard_status': ,\n",
+ " 'function': ,\n",
+ " 'source_data': {'images': },\n",
+ " 'target_data': {'labels': []},\n",
+ " 'hypotheses': {},\n",
+ " 'metadata': {},\n",
+ " 'tags': [],\n",
+ " 'length': None,\n",
+ " 'kwargs': {}}"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset = DatasetFactory.get(payload[\"asset_id\"])\n",
+ "dataset.__dict__"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
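
As a complement to the notebook, a small sketch of how a label file in the documented image bounding-box format could be generated (the path and values are illustrative; coordinates are percentages of the image):

```python
import json

label = {
    "data": "building",
    "boundingBox": {"top": 10, "bottom": 60, "left": 5, "right": 40},
}

# Write the label next to the ones indexed by corpus/index.csv
with open("corpus/labels/3.json", "w") as f:
    json.dump(label, f, indent=2)
```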
diff --git a/docs/user/user_doc.md b/docs/user/user_doc.md
index 6b52e7d0..89efa478 100644
--- a/docs/user/user_doc.md
+++ b/docs/user/user_doc.md
@@ -286,6 +286,9 @@ Using the aiXplain SDK, you can also onboard your dataset into the aiXplain plat
- Machine translation dataset directly from s3:
- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Asnjeq5JQ9pV6UUQ2Z20XtrjnoaFD0nf?usp=sharing)
+- Image Label Detection Dataset:
+ - [Link](../samples/label_dataset_onboarding/label_dataset_onboarding.ipynb)
+
## FineTune
[FineTune](https://aixplain.com/platform/finetune) allows you to customize models by tuning them using your data and enhancing their performance. Set up and start fine-tuning with a few lines of code. Once fine-tuning is complete, the model will be deployed into your assets, ready for you to use.
diff --git a/pyproject.toml b/pyproject.toml
index 9ad67878..112c8f9a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ namespaces = true
[project]
name = "aiXplain"
-version = "0.2.5rc"
+version = "0.2.12"
description = "aiXplain SDK adds AI functions to software."
readme = "README.md"
requires-python = ">=3.5, <4"
@@ -51,7 +51,7 @@ dependencies = [
"filetype>=1.2.0",
"click>=7.1.2,<8.0.0",
"PyYAML>=6.0.1",
- "dataclasses-json==0.6.1"
+ "dataclasses-json>=0.5.2"
]
[project.urls]
diff --git a/tests/functional/finetune/data/finetune_test_end2end.json b/tests/functional/finetune/data/finetune_test_end2end.json
index ead1fd88..80768de9 100644
--- a/tests/functional/finetune/data/finetune_test_end2end.json
+++ b/tests/functional/finetune/data/finetune_test_end2end.json
@@ -10,17 +10,17 @@
{
"model_name": "aiR",
"model_id": "6499cc946eb5633de15d82a1",
- "dataset_name": "Test search dataset metadata",
+ "dataset_name": "Test search dataset",
"inference_data": "Hello!",
"required_dev": false,
- "search_metadata": true
+ "search_metadata": false
},
{
"model_name": "vectara",
"model_id": "655e20f46eb563062a1aa301",
- "dataset_name": "Test search dataset metadata",
+ "dataset_name": "Test search dataset",
"inference_data": "Hello!",
"required_dev": false,
- "search_metadata": true
+ "search_metadata": false
}
]
\ No newline at end of file
diff --git a/tests/functional/finetune/finetune_functional_test.py b/tests/functional/finetune/finetune_functional_test.py
index 94693f05..ffa9ad5a 100644
--- a/tests/functional/finetune/finetune_functional_test.py
+++ b/tests/functional/finetune/finetune_functional_test.py
@@ -83,6 +83,8 @@ def test_end2end(run_input_map):
time.sleep(5)
end = time.time()
assert finetune_model.check_finetune_status().model_status.value == "onboarded"
+ time.sleep(30)
+ print(f"Model dict: {finetune_model.__dict__}")
result = finetune_model.run(run_input_map["inference_data"])
print(f"Result: {result}")
assert result is not None
diff --git a/tests/functional/model/hf_onboarding_test.py b/tests/functional/model/hf_onboarding_test.py
index b70b0580..47a38361 100644
--- a/tests/functional/model/hf_onboarding_test.py
+++ b/tests/functional/model/hf_onboarding_test.py
@@ -1,11 +1,14 @@
__author__ = "michaellam"
-import time
+import pytest
+import time
from aixplain.factories.model_factory import ModelFactory
from tests.test_utils import delete_asset
from aixplain.utils import config
+
+@pytest.mark.skip(reason="Model Deployment is deactivated for improvements.")
def test_deploy_model():
# Start the deployment
model_name = "Test Model"
@@ -26,6 +29,8 @@ def test_deploy_model():
# Clean up
delete_asset(model_id, config.TEAM_API_KEY)
+
+@pytest.mark.skip(reason="Model Deployment is deactivated for improvements.")
def test_nonexistent_model():
# Start the deployment
model_name = "Test Model"
@@ -34,6 +39,8 @@ def test_nonexistent_model():
assert response["statusCode"] == 400
assert response["message"] == "err.unable_to_onboard_model"
+
+@pytest.mark.skip(reason="Model Deployment is deactivated for improvements.")
def test_size_limit():
# Start the deployment
model_name = "Test Model"
@@ -42,10 +49,12 @@ def test_size_limit():
assert response["statusCode"] == 400
assert response["message"] == "err.unable_to_onboard_model"
+
+@pytest.mark.skip(reason="Model Deployment is deactivated for improvements.")
def test_gated_model():
# Start the deployment
model_name = "Test Model"
repo_id = "meta-llama/Llama-2-7b-hf"
response = ModelFactory.deploy_huggingface_model(model_name, repo_id, "mock_key")
assert response["statusCode"] == 400
- assert response["message"] == "err.unable_to_onboard_model"
\ No newline at end of file
+ assert response["message"] == "err.unable_to_onboard_model"
diff --git a/tests/functional/pipelines/create_test.py b/tests/functional/pipelines/create_test.py
new file mode 100644
index 00000000..f2c1a9c9
--- /dev/null
+++ b/tests/functional/pipelines/create_test.py
@@ -0,0 +1,64 @@
+__author__ = "thiagocastroferreira"
+
+"""
+Copyright 2022 The aiXplain SDK authors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import json
+import pytest
+from aixplain.factories import PipelineFactory
+from aixplain.modules import Pipeline
+from uuid import uuid4
+
+
+def test_create_pipeline_from_json():
+ pipeline_json = "tests/functional/pipelines/data/pipeline.json"
+ pipeline_name = str(uuid4())
+ pipeline = PipelineFactory.create(name=pipeline_name, pipeline=pipeline_json)
+
+ assert isinstance(pipeline, Pipeline)
+ assert pipeline.id != ""
+
+
+def test_create_pipeline_from_string():
+ pipeline_json = "tests/functional/pipelines/data/pipeline.json"
+ with open(pipeline_json) as f:
+ pipeline_dict = json.load(f)
+
+ pipeline_name = str(uuid4())
+ pipeline = PipelineFactory.create(name=pipeline_name, pipeline=pipeline_dict)
+
+ assert isinstance(pipeline, Pipeline)
+ assert pipeline.id != ""
+
+
+def test_update_pipeline():
+ pipeline_json = "tests/functional/pipelines/data/pipeline.json"
+ with open(pipeline_json) as f:
+ pipeline_dict = json.load(f)
+
+ pipeline_name = str(uuid4())
+ pipeline = PipelineFactory.create(name=pipeline_name, pipeline=pipeline_dict)
+
+ pipeline.update(pipeline=pipeline_json)
+ assert isinstance(pipeline, Pipeline)
+ assert pipeline.id != ""
+
+
+def test_create_pipeline_wrong_path():
+ pipeline_name = str(uuid4())
+
+ with pytest.raises(Exception):
+ pipeline = PipelineFactory.create(name=pipeline_name, pipeline="/")
diff --git a/tests/functional/pipelines/data/pipeline.json b/tests/functional/pipelines/data/pipeline.json
new file mode 100644
index 00000000..f48d6d4d
--- /dev/null
+++ b/tests/functional/pipelines/data/pipeline.json
@@ -0,0 +1,100 @@
+{
+ "links": [
+ {
+ "from": 0,
+ "to": 1,
+ "paramMapping": [
+ {
+ "from": "input",
+ "to": "text"
+ }
+ ]
+ },
+ {
+ "from": 1,
+ "to": 2,
+ "paramMapping": [
+ {
+ "from": "data",
+ "to": "text"
+ }
+ ]
+ },
+ {
+ "from": 2,
+ "to": 3,
+ "paramMapping": [
+ {
+ "from": "data",
+ "to": "output"
+ }
+ ]
+ }
+ ],
+ "nodes": [
+ {
+ "number": 0,
+ "type": "INPUT"
+ },
+ {
+ "number": 1,
+ "type": "ASSET",
+ "function": "sentiment-analysis",
+ "inputValues": [
+ {
+ "code": "language",
+ "value": "en"
+ },
+ {
+ "code": "text",
+ "dataType": "text"
+ }
+ ],
+ "assetId": "6172874f720b09325cbcdc33",
+ "assetType": "MODEL",
+ "autoSelectOptions": [],
+ "functionType": "AI",
+ "status": "Exists",
+ "outputValues": [
+ {
+ "code": "data",
+ "dataType": "label"
+ }
+ ]
+ },
+ {
+ "number": 2,
+ "type": "ASSET",
+ "function": "translation",
+ "inputValues": [
+ {
+ "code": "sourcelanguage",
+ "value": "en"
+ },
+ {
+ "code": "targetlanguage",
+ "value": "es"
+ },
+ {
+ "code": "text",
+ "dataType": "text"
+ }
+ ],
+ "assetId": "61b097551efecf30109d3316",
+ "assetType": "MODEL",
+ "autoSelectOptions": [],
+ "functionType": "AI",
+ "status": "Exists",
+ "outputValues": [
+ {
+ "code": "data",
+ "dataType": "text"
+ }
+ ]
+ },
+ {
+ "number": 3,
+ "type": "OUTPUT"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/tests/functional/pipelines/run_test.py b/tests/functional/pipelines/run_test.py
index e7af6c4e..e4389587 100644
--- a/tests/functional/pipelines/run_test.py
+++ b/tests/functional/pipelines/run_test.py
@@ -109,24 +109,24 @@ def test_run_multipipe_with_datasets(batchmode: bool):
assert response["status"] == "SUCCESS"
-@pytest.mark.parametrize("batchmode", [True, False])
-def test_run_segment_reconstruct(batchmode: bool):
+def test_run_segment_reconstruct():
pipeline = PipelineFactory.list(query="Segmentation/Reconstruction Functional Test - DO NOT DELETE")["results"][0]
- response = pipeline.run("https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav", **{"batchmode": batchmode})
+ response = pipeline.run("https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav")
assert response["status"] == "SUCCESS"
output = response["data"][0]
assert output["label"] == "Output 1"
-@pytest.mark.parametrize("batchmode", [True, False])
-def test_run_metric(batchmode: bool):
+def test_run_metric():
pipeline = PipelineFactory.list(query="ASR Metric Functional Test - DO NOT DELETE")["results"][0]
- response = pipeline.run({
- "AudioInput": "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav",
- "ReferenceInput": "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.txt"
- }, **{"batchmode": batchmode})
-
+ response = pipeline.run(
+ {
+ "AudioInput": "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav",
+ "ReferenceInput": "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.txt",
+ }
+ )
+
assert response["status"] == "SUCCESS"
assert len(response["data"]) == 2
assert response["data"][0]["label"] in ["TranscriptOutput", "ScoreOutput"]
@@ -134,34 +134,30 @@ def test_run_metric(batchmode: bool):
@pytest.mark.parametrize(
- "batchmode,input_data,output_data",
+ "input_data,output_data",
[
- (True, "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav", "AudioOutput"),
- (False, "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav", "AudioOutput"),
- (True, "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.txt", "TextOutput"),
- (False, "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.txt", "TextOutput")
- ]
+ ("https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav", "AudioOutput"),
+ ("https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.txt", "TextOutput"),
+ ],
)
-def test_run_router(batchmode: bool, input_data: str, output_data: str):
+def test_run_router(input_data: str, output_data: str):
pipeline = PipelineFactory.list(query="Router Test - DO NOT DELETE")["results"][0]
- response = pipeline.run(input_data, **{"batchmode": batchmode})
-
+ response = pipeline.run(input_data)
+
assert response["status"] == "SUCCESS"
assert response["data"][0]["label"] == output_data
@pytest.mark.parametrize(
- "batchmode,input_data,output_data",
+ "input_data,output_data",
[
- (True, "I love it.", "PositiveOutput"),
- (False, "I love it.", "PositiveOutput"),
- (True, "I hate it.", "NegativeOutput"),
- (False, "I hate it.", "NegativeOutput")
- ]
+ ("I love it.", "PositiveOutput"),
+ ("I hate it.", "NegativeOutput"),
+ ],
)
-def test_run_decision(batchmode: bool, input_data: str, output_data: str):
+def test_run_decision(input_data: str, output_data: str):
pipeline = PipelineFactory.list(query="Decision Test - DO NOT DELETE")["results"][0]
- response = pipeline.run(input_data, **{"batchmode": batchmode})
-
+ response = pipeline.run(input_data)
+
assert response["status"] == "SUCCESS"
- assert response["data"][0]["label"] == output_data
\ No newline at end of file
+ assert response["data"][0]["label"] == output_data
diff --git a/tests/image_upload_e2e_test.py b/tests/image_upload_e2e_test.py
index 5e46c325..0e2ccbc5 100644
--- a/tests/image_upload_e2e_test.py
+++ b/tests/image_upload_e2e_test.py
@@ -6,8 +6,10 @@
from tests.test_utils import delete_asset, delete_service_account
from aixplain.utils import config
import docker
-import os
+import pytest
+
+@pytest.mark.skip(reason="Model Upload is deactivated for improvements.")
def test_create_and_upload_model():
# List the host machines
host_response = ModelFactory.list_host_machines()
@@ -44,7 +46,7 @@ def test_create_and_upload_model():
# Log into the image repository.
login_response = ModelFactory.asset_repo_login()
-
+
assert login_response["username"] == "AWS"
assert login_response["registry"] == "535945872701.dkr.ecr.us-east-1.amazonaws.com"
assert "password" in login_response.keys()
@@ -55,12 +57,12 @@ def test_create_and_upload_model():
# Push an image to ECR
# os.system("aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 535945872701.dkr.ecr.us-east-1.amazonaws.com")
- low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')
+ low_level_client = docker.APIClient(base_url="unix://var/run/docker.sock")
# low_level_client.pull("535945872701.dkr.ecr.us-east-1.amazonaws.com/bash")
# low_level_client.tag("535945872701.dkr.ecr.us-east-1.amazonaws.com/bash", f"{registry}/{repo_name}")
low_level_client.pull("bash")
low_level_client.tag("bash", f"{registry}/{repo_name}")
- low_level_client.push(f"{registry}/{repo_name}", auth_config={"username":username, "password":password})
+ low_level_client.push(f"{registry}/{repo_name}", auth_config={"username": username, "password": password})
# Send an email to finalize onboarding process
ModelFactory.onboard_model(model_id, "latest", "fake_hash")
diff --git a/tests/image_upload_functional_test.py b/tests/image_upload_functional_test.py
index 0d6aa219..b9dd3ebf 100644
--- a/tests/image_upload_functional_test.py
+++ b/tests/image_upload_functional_test.py
@@ -6,6 +6,8 @@
from aixplain.factories.model_factory import ModelFactory
import pytest
+
+@pytest.mark.skip(reason="Model Upload is deactivated for improvements.")
def test_login():
response = ModelFactory.asset_repo_login()
assert response["username"] == "AWS"
@@ -15,6 +17,8 @@ def test_login():
# Test cleanup
delete_service_account(config.TEAM_API_KEY)
+
+@pytest.mark.skip(reason="Model Upload is deactivated for improvements.")
def test_create_asset_repo():
with open(Path("tests/test_requests/create_asset_request.json")) as f:
mock_register_payload = json.load(f)
@@ -33,6 +37,8 @@ def test_create_asset_repo():
# Test cleanup
delete_asset(response["id"], config.TEAM_API_KEY)
+
+@pytest.mark.skip(reason="Model Upload is deactivated for improvements.")
def test_list_host_machines():
response = ModelFactory.list_host_machines()
for hosting_machine_dict in response:
@@ -42,6 +48,8 @@ def test_list_host_machines():
assert "memory" in hosting_machine_dict.keys()
assert "hourlyCost" in hosting_machine_dict.keys()
+
+@pytest.mark.skip(reason="Model Upload is deactivated for improvements.")
def test_get_functions():
# Verbose
response = ModelFactory.list_functions(True)
@@ -53,7 +61,7 @@ def test_get_functions():
assert "name" in item.keys()
# Non-verbose
- response = ModelFactory.list_functions() # Not verbose by default
+ response = ModelFactory.list_functions() # Not verbose by default
items = response["items"]
for item in items:
assert "output" not in item.keys()
@@ -61,6 +69,7 @@ def test_get_functions():
assert "id" not in item.keys()
assert "name" in item.keys()
+
@pytest.mark.skip(reason="Not included in first release")
def list_image_repo_tags():
response = ModelFactory.list_image_repo_tags()
diff --git a/tests/image_upload_test.py b/tests/image_upload_test.py
index bb120533..fb919171 100644
--- a/tests/image_upload_test.py
+++ b/tests/image_upload_test.py
@@ -13,8 +13,9 @@
API_FIXED_HEADER = {"x-api-key": f"{config.TEAM_API_KEY}", "Content-Type": "application/json"}
+@pytest.mark.skip(reason="Model Upload is deactivated for improvements.")
def test_login():
- url = urljoin(config.BACKEND_URL, f"sdk/ecr/login")
+ url = urljoin(config.BACKEND_URL, f"sdk/ecr/login")
with requests_mock.Mocker() as mock:
with open(Path("tests/mock_responses/login_response.json")) as f:
mock_json = json.load(f)
@@ -22,8 +23,10 @@ def test_login():
creds = ModelFactory.asset_repo_login(config.TEAM_API_KEY)
assert creds == mock_json
+
+@pytest.mark.skip(reason="Model Upload is deactivated for improvements.")
def test_create_asset_repo():
- url_register = urljoin(config.BACKEND_URL, f"sdk/models/register")
+ url_register = urljoin(config.BACKEND_URL, f"sdk/models/register")
url_function = urljoin(config.BACKEND_URL, f"sdk/functions")
with requests_mock.Mocker() as mock:
with open(Path("tests/mock_responses/create_asset_repo_response.json")) as f:
@@ -32,12 +35,15 @@ def test_create_asset_repo():
with open(Path("tests/mock_responses/list_functions_response.json")) as f:
mock_json_functions = json.load(f)
mock.get(url_function, headers=AUTH_FIXED_HEADER, json=mock_json_functions)
- model_id = ModelFactory.create_asset_repo("mock_name", "mock_machines", "mock_version",
- "mock_description", "Speech Recognition", "en", config.TEAM_API_KEY)
+ model_id = ModelFactory.create_asset_repo(
+ "mock_name", "mock_machines", "mock_version", "mock_description", "Speech Recognition", "en", config.TEAM_API_KEY
+ )
assert model_id == mock_json_register
+
+@pytest.mark.skip(reason="Model Upload is deactivated for improvements.")
def test_list_host_machines():
- url = urljoin(config.BACKEND_URL, f"sdk/hosting-machines")
+ url = urljoin(config.BACKEND_URL, f"sdk/hosting-machines")
with requests_mock.Mocker() as mock:
with open(Path("tests/mock_responses/list_host_machines_response.json")) as f:
mock_json = json.load(f)
@@ -49,8 +55,10 @@ def test_list_host_machines():
for key in machine_dict.keys():
assert machine_dict[key] == mock_json_dict[key]
+
+@pytest.mark.skip(reason="Model Upload is deactivated for improvements.")
def test_get_functions():
- url = urljoin(config.BACKEND_URL, f"sdk/functions")
+ url = urljoin(config.BACKEND_URL, f"sdk/functions")
with requests_mock.Mocker() as mock:
with open(Path("tests/mock_responses/list_functions_response.json")) as f:
mock_json = json.load(f)
@@ -58,10 +66,11 @@ def test_get_functions():
functions = ModelFactory.list_functions(config.TEAM_API_KEY)
assert functions == mock_json
+
@pytest.mark.skip(reason="Not currently supported.")
def test_list_image_repo_tags():
model_id = "mock_id"
- url = urljoin(config.BACKEND_URL, f"sdk/models/{model_id}/images")
+ url = urljoin(config.BACKEND_URL, f"sdk/models/{model_id}/images")
with requests_mock.Mocker() as mock:
with open(Path("tests/mock_responses/list_image_repo_tags_response.json")) as f:
mock_json = json.load(f)
diff --git a/tests/unit/pipeline_test.py b/tests/unit/pipeline_test.py
new file mode 100644
index 00000000..68a399aa
--- /dev/null
+++ b/tests/unit/pipeline_test.py
@@ -0,0 +1,39 @@
+__author__ = "thiagocastroferreira"
+
+"""
+Copyright 2022 The aiXplain SDK authors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from dotenv import load_dotenv
+
+load_dotenv()
+import requests_mock
+from aixplain.utils import config
+from aixplain.factories import PipelineFactory
+from aixplain.modules import Pipeline
+from urllib.parse import urljoin
+import pytest
+
+
+def test_create_pipeline():
+ with requests_mock.Mocker() as mock:
+ url = urljoin(config.BACKEND_URL, "sdk/pipelines")
+ headers = {"x-api-key": config.TEAM_API_KEY, "Content-Type": "application/json"}
+ ref_response = {"id": "12345"}
+ mock.post(url, headers=headers, json=ref_response)
+ ref_pipeline = Pipeline(id="12345", name="Pipeline Test", api_key=config.TEAM_API_KEY)
+ hyp_pipeline = PipelineFactory.create(pipeline={}, name="Pipeline Test")
+ assert hyp_pipeline.id == ref_pipeline.id
+ assert hyp_pipeline.name == ref_pipeline.name