Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion aixplain/enums/data_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ class DataType(Enum):
VIDEO = "video"

def __str__(self):
return self._value_
return self._value_
40 changes: 39 additions & 1 deletion aixplain/factories/pipeline_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"""
import json
import logging
import os
from typing import Dict, List, Optional, Text, Union
from aixplain.enums.data_type import DataType
from aixplain.enums.function import Function
Expand Down Expand Up @@ -207,7 +208,7 @@ def list(
output_data_types = [output_data_types]
payload["inputDataTypes"] = [data_type.value for data_type in output_data_types]

logging.info(f"Start service for POST List Dataset - {url} - {headers} - {json.dumps(payload)}")
logging.info(f"Start service for POST List Pipeline - {url} - {headers} - {json.dumps(payload)}")
r = _request_with_retry("post", url, headers=headers, json=payload)
resp = r.json()

Expand All @@ -220,3 +221,40 @@ def list(
for pipeline in results:
pipelines.append(cls.__from_response(pipeline))
return {"results": pipelines, "page_total": page_total, "page_number": page_number, "total": total}

@classmethod
def create(cls, name: Text, pipeline: Union[Text, Dict], status: Text = "draft") -> Pipeline:
    """Create a new draft pipeline in the backend.

    Args:
        name (Text): Pipeline Name
        pipeline (Union[Text, Dict]): Pipeline architecture as a Python dictionary
            or a path to a JSON file containing it
        status (Text, optional): Status of the pipeline. Currently only draft
            pipelines can be saved. Defaults to "draft".

    Raises:
        Exception: if `status` is not "draft", if a path argument does not point to
            an existing .json file, or if the service call fails

    Returns:
        Pipeline: instance of the new pipeline
    """
    try:
        assert status == "draft", "Pipeline Creation Error: Currently only draft pipelines can be created."
        # A string argument is treated as a path to a JSON file holding the architecture
        if isinstance(pipeline, str):
            _, ext = os.path.splitext(pipeline)
            assert (
                os.path.exists(pipeline) and ext == ".json"
            ), "Pipeline Creation Error: Make sure the pipeline to be saved is in a JSON file."
            with open(pipeline) as f:
                pipeline = json.load(f)

        # prepare payload; status is forced to "draft" (the only supported value, see assert above)
        payload = {"name": name, "status": "draft", "architecture": pipeline}
        url = urljoin(cls.backend_url, "sdk/pipelines")
        headers = {"Authorization": f"Token {config.TEAM_API_KEY}", "Content-Type": "application/json"}
        logging.info(f"Start service for POST Create Pipeline - {url} - {headers} - {json.dumps(payload)}")
        r = _request_with_retry("post", url, headers=headers, json=payload)
        response = r.json()

        return Pipeline(response["id"], name, config.TEAM_API_KEY)
    except Exception as e:
        # chain the original exception so the traceback is not lost
        raise Exception(e) from e
3 changes: 3 additions & 0 deletions aixplain/modules/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def __init__(
file_extension: Optional[FileType] = None,
languages: List[Language] = [],
dsubtype: DataSubtype = DataSubtype.OTHER,
id: Optional[Text] = None,
**kwargs
) -> None:
"""MetaData Class
Expand All @@ -62,6 +63,7 @@ def __init__(
file_extension (Optional[FileType], optional): File extension (e.g. CSV, TXT, etc.). Defaults to None.
languages (List[Language], optional): List of languages which the data consists of. Defaults to [].
dsubtype (DataSubtype, optional): Data subtype (e.g., age, topic, race, split, etc.), used in datasets metadata. Defaults to Other.
id (Optional[Text], optional): Data ID. Defaults to None.
"""
self.name = name
if isinstance(dtype, str):
Expand Down Expand Up @@ -91,4 +93,5 @@ def __init__(
language = Language(language)
self.languages.append(language)
self.dsubtype = dsubtype
self.id = id
self.kwargs = kwargs
31 changes: 31 additions & 0 deletions aixplain/modules/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@

import time
import json
import os
import logging
from aixplain.modules.asset import Asset
from aixplain.utils import config
from aixplain.utils.file_utils import _request_with_retry
from typing import Dict, Optional, Text, Union
from urllib.parse import urljoin


class Pipeline(Asset):
Expand Down Expand Up @@ -306,3 +308,32 @@ def run_async(
if resp is not None:
response["error"] = resp
return response

def update(self, pipeline: Union[Text, Dict]):
    """Update the draft pipeline's architecture in the backend.

    Args:
        pipeline (Union[Text, Dict]): Pipeline architecture as a Python dictionary
            or a path to a JSON file containing it

    Raises:
        Exception: if a path argument does not point to an existing .json file,
            or if the service call fails
    """
    try:
        # A string argument is treated as a path to a JSON file holding the architecture
        if isinstance(pipeline, str):
            _, ext = os.path.splitext(pipeline)
            assert (
                os.path.exists(pipeline) and ext == ".json"
            ), "Pipeline Update Error: Make sure the pipeline to be saved is in a JSON file."
            with open(pipeline) as f:
                pipeline = json.load(f)

        # prepare payload; only draft pipelines can be updated, so status is fixed
        payload = {"name": self.name, "status": "draft", "architecture": pipeline}
        url = urljoin(config.BACKEND_URL, f"sdk/pipelines/{self.id}")
        headers = {"Authorization": f"Token {config.TEAM_API_KEY}", "Content-Type": "application/json"}
        logging.info(f"Start service for PUT Update Pipeline - {url} - {headers} - {json.dumps(payload)}")
        r = _request_with_retry("put", url, headers=headers, json=payload)
        response = r.json()
        logging.info(f"Pipeline {response['id']} Updated.")
    except Exception as e:
        # chain the original exception so the traceback is not lost
        raise Exception(e) from e
4 changes: 2 additions & 2 deletions aixplain/processes/data_onboarding/onboard_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,11 @@ def process_data_files(
-1,
0,
)
if metadata.dtype in [DataType.AUDIO, DataType.IMAGE] or metadata.dsubtype == DataSubtype.INTERVAL:
if metadata.dtype in [DataType.AUDIO, DataType.IMAGE, DataType.LABEL] or metadata.dsubtype == DataSubtype.INTERVAL:
files, data_column_idx, start_column_idx, end_column_idx, nrows = process_media_files.run(
metadata=metadata, paths=paths, folder=folder
)
elif metadata.dtype in [DataType.TEXT, DataType.LABEL]:
elif metadata.dtype in [DataType.TEXT]:
files, data_column_idx, nrows = process_text_files.run(metadata=metadata, paths=paths, folder=folder)
return files, data_column_idx, start_column_idx, end_column_idx, nrows

Expand Down
24 changes: 24 additions & 0 deletions aixplain/processes/data_onboarding/process_media_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from pathlib import Path
from tqdm import tqdm
from typing import List, Tuple
from urllib.parse import urlparse

AUDIO_MAX_SIZE = 50000000
IMAGE_TEXT_MAX_SIZE = 25000000
Expand Down Expand Up @@ -45,6 +46,15 @@ def run(metadata: MetaData, paths: List, folder: Path, batch_size: int = 100) ->
Returns:
Tuple[List[File], int, int, int]: list of s3 links; data, start and end columns index, and number of rows
"""
if metadata.dtype == DataType.LABEL:
assert (
metadata.storage_type != StorageType.TEXT
), f'Data Asset Onboarding Error: Column "{metadata.name}" of type "{metadata.dtype}" can not be stored in text. Label data should be stored in a JSON file.'
else:
assert (
metadata.storage_type != StorageType.TEXT
), f'Data Asset Onboarding Error: Column "{metadata.name}" of type "{metadata.dtype}" can not be stored in text.'

# if files are stored locally, create a folder to store it
media_folder = Path(".")
if metadata.storage_type == StorageType.FILE:
Expand Down Expand Up @@ -95,6 +105,14 @@ def run(metadata: MetaData, paths: List, folder: Path, batch_size: int = 100) ->
assert (
os.path.getsize(media_path) <= AUDIO_MAX_SIZE
), f'Data Asset Onboarding Error: Local audio file "{media_path}" exceeds the size limit of 50 MB.'
elif metadata.dtype == DataType.LABEL:
assert (
os.path.getsize(media_path) <= IMAGE_TEXT_MAX_SIZE
), f'Data Asset Onboarding Error: JSON file with labels "{media_path}" exceeds the size limit of 25 MB.'
_, extension = os.path.splitext(media_path)
assert (
extension == ".json"
), f'Data Asset Onboarding Error: Label data should be stored in a JSON file and "{media_path}" is not one.'
else:
assert (
os.path.getsize(media_path) <= IMAGE_TEXT_MAX_SIZE
Expand All @@ -105,6 +123,12 @@ def run(metadata: MetaData, paths: List, folder: Path, batch_size: int = 100) ->
shutil.copy2(media_path, new_path)
batch.append(fname)
else:
if metadata.dtype == DataType.LABEL:
path = urlparse(media_path).path
_, extension = os.path.splitext(path)
assert (
extension == ".json"
), f'Data Asset Onboarding Error: Label data should be stored in a JSON file and "{media_path}" is not one.'
batch.append(media_path)

# crop intervals can not be used with interval data types
Expand Down
2 changes: 1 addition & 1 deletion aixplain/processes/data_onboarding/process_text_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def process_text(content: str, storage_type: StorageType) -> Text:
Text: textual content
"""
if storage_type == StorageType.FILE:
# Check the size of file and assert a limit of 50 MB
# Check the size of file and assert a limit of 25 MB
assert (
os.path.getsize(content) <= 25000000
), f'Data Asset Onboarding Error: Local text file "{content}" exceeds the size limit of 25 MB.'
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 3 additions & 0 deletions docs/samples/label_dataset_onboarding/corpus/index.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
,images,labels
0,corpus/images/1.jpg,corpus/labels/1.json
1,corpus/images/2.png,corpus/labels/2.json
9 changes: 9 additions & 0 deletions docs/samples/label_dataset_onboarding/corpus/labels/1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"data": "arcade",
"boundingBox": {
"top": 0,
"bottom": 0,
"left": 0,
"right": 0
}
}
9 changes: 9 additions & 0 deletions docs/samples/label_dataset_onboarding/corpus/labels/2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"data": "building",
"boundingBox": {
"top": 0,
"bottom": 0,
"left": 0,
"right": 0
}
}
Loading