diff --git a/aixplain/enums/data_type.py b/aixplain/enums/data_type.py index d13542f9..fa79d070 100644 --- a/aixplain/enums/data_type.py +++ b/aixplain/enums/data_type.py @@ -35,4 +35,4 @@ class DataType(Enum): VIDEO = "video" def __str__(self): - return self._value_ \ No newline at end of file + return self._value_ diff --git a/aixplain/modules/metadata.py b/aixplain/modules/metadata.py index f7fac5f5..07007ebe 100644 --- a/aixplain/modules/metadata.py +++ b/aixplain/modules/metadata.py @@ -43,6 +43,7 @@ def __init__( file_extension: Optional[FileType] = None, languages: List[Language] = [], dsubtype: DataSubtype = DataSubtype.OTHER, + id: Optional[Text] = None, **kwargs ) -> None: """MetaData Class @@ -62,6 +63,7 @@ def __init__( file_extension (Optional[FileType], optional): File extension (e.g. CSV, TXT, etc.). Defaults to None. languages (List[Language], optional): List of languages which the data consists of. Defaults to []. dsubtype (DataSubtype, optional): Data subtype (e.g., age, topic, race, split, etc.), used in datasets metadata. Defaults to Other. + id (Optional[Text], optional): Data ID. Defaults to None. 
""" self.name = name if isinstance(dtype, str): @@ -91,4 +93,5 @@ def __init__( language = Language(language) self.languages.append(language) self.dsubtype = dsubtype + self.id = id self.kwargs = kwargs diff --git a/aixplain/processes/data_onboarding/onboard_functions.py b/aixplain/processes/data_onboarding/onboard_functions.py index 091458fd..35a64e12 100644 --- a/aixplain/processes/data_onboarding/onboard_functions.py +++ b/aixplain/processes/data_onboarding/onboard_functions.py @@ -97,11 +97,11 @@ def process_data_files( -1, 0, ) - if metadata.dtype in [DataType.AUDIO, DataType.IMAGE] or metadata.dsubtype == DataSubtype.INTERVAL: + if metadata.dtype in [DataType.AUDIO, DataType.IMAGE, DataType.LABEL] or metadata.dsubtype == DataSubtype.INTERVAL: files, data_column_idx, start_column_idx, end_column_idx, nrows = process_media_files.run( metadata=metadata, paths=paths, folder=folder ) - elif metadata.dtype in [DataType.TEXT, DataType.LABEL]: + elif metadata.dtype in [DataType.TEXT]: files, data_column_idx, nrows = process_text_files.run(metadata=metadata, paths=paths, folder=folder) return files, data_column_idx, start_column_idx, end_column_idx, nrows diff --git a/aixplain/processes/data_onboarding/process_media_files.py b/aixplain/processes/data_onboarding/process_media_files.py index 3f95b1e3..c0009eca 100644 --- a/aixplain/processes/data_onboarding/process_media_files.py +++ b/aixplain/processes/data_onboarding/process_media_files.py @@ -16,6 +16,7 @@ from pathlib import Path from tqdm import tqdm from typing import List, Tuple +from urllib.parse import urlparse AUDIO_MAX_SIZE = 50000000 IMAGE_TEXT_MAX_SIZE = 25000000 @@ -45,6 +46,15 @@ def run(metadata: MetaData, paths: List, folder: Path, batch_size: int = 100) -> Returns: Tuple[List[File], int, int, int]: list of s3 links; data, start and end columns index, and number of rows """ + if metadata.dtype == DataType.LABEL: + assert ( + metadata.storage_type != StorageType.TEXT + ), f'Data Asset Onboarding Error: 
Column "{metadata.name}" of type "{metadata.dtype}" can not be stored in text. Label data should be stored in a JSON file.' + else: + assert ( + metadata.storage_type != StorageType.TEXT + ), f'Data Asset Onboarding Error: Column "{metadata.name}" of type "{metadata.dtype}" can not be stored in text.' + # if files are stored locally, create a folder to store it media_folder = Path(".") if metadata.storage_type == StorageType.FILE: @@ -95,6 +105,14 @@ def run(metadata: MetaData, paths: List, folder: Path, batch_size: int = 100) -> assert ( os.path.getsize(media_path) <= AUDIO_MAX_SIZE ), f'Data Asset Onboarding Error: Local audio file "{media_path}" exceeds the size limit of 50 MB.' + elif metadata.dtype == DataType.LABEL: + assert ( + os.path.getsize(media_path) <= IMAGE_TEXT_MAX_SIZE + ), f'Data Asset Onboarding Error: JSON file with labels "{media_path}" exceeds the size limit of 25 MB.' + _, extension = os.path.splitext(media_path) + assert ( + extension == ".json" + ), f'Data Asset Onboarding Error: Label data should be stored in a JSON file and "{media_path}" is not one.' else: assert ( os.path.getsize(media_path) <= IMAGE_TEXT_MAX_SIZE @@ -105,6 +123,12 @@ def run(metadata: MetaData, paths: List, folder: Path, batch_size: int = 100) -> shutil.copy2(media_path, new_path) batch.append(fname) else: + if metadata.dtype == DataType.LABEL: + path = urlparse(media_path).path + _, extension = os.path.splitext(path) + assert ( + extension == ".json" + ), f'Data Asset Onboarding Error: Label data should be stored in a JSON file and "{media_path}" is not one.' 
batch.append(media_path) # crop intervals can not be used with interval data types diff --git a/aixplain/processes/data_onboarding/process_text_files.py b/aixplain/processes/data_onboarding/process_text_files.py index 48db3f4e..1ba7f47e 100644 --- a/aixplain/processes/data_onboarding/process_text_files.py +++ b/aixplain/processes/data_onboarding/process_text_files.py @@ -26,7 +26,7 @@ def process_text(content: str, storage_type: StorageType) -> Text: Text: textual content """ if storage_type == StorageType.FILE: - # Check the size of file and assert a limit of 50 MB + # Check the size of file and assert a limit of 25 MB assert ( os.path.getsize(content) <= 25000000 ), f'Data Asset Onboarding Error: Local text file "{content}" exceeds the size limit of 25 MB.' diff --git a/docs/samples/label_dataset_onboarding/corpus/images/1.jpg b/docs/samples/label_dataset_onboarding/corpus/images/1.jpg new file mode 100644 index 00000000..ae3d592c Binary files /dev/null and b/docs/samples/label_dataset_onboarding/corpus/images/1.jpg differ diff --git a/docs/samples/label_dataset_onboarding/corpus/images/2.png b/docs/samples/label_dataset_onboarding/corpus/images/2.png new file mode 100644 index 00000000..ba23ab11 Binary files /dev/null and b/docs/samples/label_dataset_onboarding/corpus/images/2.png differ diff --git a/docs/samples/label_dataset_onboarding/corpus/index.csv b/docs/samples/label_dataset_onboarding/corpus/index.csv new file mode 100644 index 00000000..69ba347a --- /dev/null +++ b/docs/samples/label_dataset_onboarding/corpus/index.csv @@ -0,0 +1,3 @@ +,images,labels +0,corpus/images/1.jpg,corpus/labels/1.json +1,corpus/images/2.png,corpus/labels/2.json diff --git a/docs/samples/label_dataset_onboarding/corpus/labels/1.json b/docs/samples/label_dataset_onboarding/corpus/labels/1.json new file mode 100644 index 00000000..6947447f --- /dev/null +++ b/docs/samples/label_dataset_onboarding/corpus/labels/1.json @@ -0,0 +1,9 @@ +{ + "data": "arcade", + "boundingBox": { + 
"top": 0, + "bottom": 0, + "left": 0, + "right": 0 + } +} \ No newline at end of file diff --git a/docs/samples/label_dataset_onboarding/corpus/labels/2.json b/docs/samples/label_dataset_onboarding/corpus/labels/2.json new file mode 100644 index 00000000..b990cfd3 --- /dev/null +++ b/docs/samples/label_dataset_onboarding/corpus/labels/2.json @@ -0,0 +1,9 @@ +{ + "data": "building", + "boundingBox": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + } +} \ No newline at end of file diff --git a/docs/samples/label_dataset_onboarding/label_dataset_onboarding.ipynb b/docs/samples/label_dataset_onboarding/label_dataset_onboarding.ipynb new file mode 100644 index 00000000..f499dd51 --- /dev/null +++ b/docs/samples/label_dataset_onboarding/label_dataset_onboarding.ipynb @@ -0,0 +1,399 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Image Label Detection Dataset Onboarding\n", + "\n", + "This notebook demonstrates how to onboard a dataset with label data into aiXplain platform using its SDK." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Credentials\n", + "\n", + "To use the aiXplain SDK, you may be registered in our platform and have an API key. The step-by-step on how to do it is better described [here](/docs/user/api_setup.md)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"TEAM_API_KEY\"] = \"YOUR_TEAM_API_KEY_HERE\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data\n", + "\n", + "In this example we will show how to onboard a sample dataset of images and their corresponding labels. To onboard it, the data needs to be depicted in a CSV file, which will be fed to the SDK. 
\n", + "\n", + "Label data should have be one or more elements in a JSON file according to one of the following structure:\n", + "\n", + "```json\n", + "{\n", + " \"data\": \"TEXT_AUDIO_LABEL\",\n", + " \"boundingBox\": {\n", + " \"start\": 0, // start character\n", + " \"end\": 0, // end character\n", + " }\n", + "}\n", + "\n", + "{\n", + " \"data\": \"AUDIO_LABEL\",\n", + " \"boundingBox\": {\n", + " \"start\": 0, // start second\n", + " \"end\": 0 // end second\n", + " }\n", + "}\n", + "\n", + "{\n", + " \"data\": \"IMAGE_LABEL\",\n", + " \"boundingBox\": {\n", + " \"top\": 0, // top percentage of the image\n", + " \"bottom\": 0, // bottom percentage of the image\n", + " \"left\": 0, // left percentage of the image\n", + " \"right\": 0 // right percentage of the image\n", + " }\n", + "}\n", + "\n", + "{\n", + " \"data\": \"VIDEO_LABEL\",\n", + " \"boundingBox\": {\n", + " \"start\": 0, // start second\n", + " \"end\": 0, // end second\n", + " \"top\": 0, // top percentage of the image\n", + " \"bottom\": 0, // bottom percentage of the image\n", + " \"left\": 0, // left percentage of the image\n", + " \"right\": 0 // right percentage of the image\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/1p/jbswfpbs73q5qbbh78dzj5xm0000gn/T/ipykernel_47954/611755932.py:1: DeprecationWarning: \n", + "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n", + "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n", + "but was not found to be installed on your system.\n", + "If this would cause problems for you,\n", + "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n", + " \n", + " import pandas as pd\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0imageslabels
00corpus/images/1.jpgcorpus/labels/1.json
11corpus/images/2.pngcorpus/labels/2.json
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 images labels\n", + "0 0 corpus/images/1.jpg corpus/labels/1.json\n", + "1 1 corpus/images/2.png corpus/labels/2.json" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "upload_file = \"corpus/index.csv\"\n", + "data = pd.read_csv(upload_file)\n", + "data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import\n", + "\n", + "Let's now import the necessary classes to onboard the corpus." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from aixplain.enums import DataType, DataSubtype, Function, Language, License, StorageType\n", + "from aixplain.factories import DatasetFactory\n", + "from aixplain.modules import MetaData" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Metadata\n", + "\n", + "Besides the CSV file, a schema must be fed to the SDK giving some information about the input and output data to be onboarded, such as: \n", + "\n", + "1. Data Name\n", + "2. Data Type: Audio, Text, Image, Video, Label, etc.\n", + "3. Storage Type: whether the data is depicted in the CSV (Text), in a local file (File) or in a public link (URL)\n", + "4. Start Column (optional): the column which depicts the beginning of the segment in the original file\n", + "5. End Column (optional): the column which depicts the end of the segment in the original file\n", + "6. 
Languages (optional): the languages depicted in the data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's instantiate the metadata for the images:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "image_meta = MetaData(\n", + " name=\"images\", \n", + " dtype=\"image\", \n", + " storage_type=\"file\", \n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now for the labels...\n", + "\n", + "(See how we can use enumerations instead of strings to specify some information)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "label_meta = MetaData(\n", + " name=\"labels\", \n", + " dtype=DataType.LABEL, \n", + " storage_type=StorageType.FILE,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now create the schemas for the input and output data of the dataset. Since this is an image label detection dataset, the images will be set as the input and the labels as the output data." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "input_schema = [image_meta]\n", + "output_schema = [label_meta]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally we can call the `create` method to onboard the dataset, specifying the name, description, license, path to the content files and schemas. \n", + "\n", + "See that a Dataset ID will be provided as response together with the status of the onboarding process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " Dataset's inputs onboard progress: 0%| | 0/1 [00:00,\n", + " 'privacy': ,\n", + " 'cost': 0,\n", + " 'onboard_status': ,\n", + " 'function': ,\n", + " 'source_data': {'images': },\n", + " 'target_data': {'labels': []},\n", + " 'hypotheses': {},\n", + " 'metadata': {},\n", + " 'tags': [],\n", + " 'length': None,\n", + " 'kwargs': {}}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = DatasetFactory.get(payload[\"asset_id\"])\n", + "dataset.__dict__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/user/user_doc.md b/docs/user/user_doc.md index 5b19c273..400ad0d2 100644 --- a/docs/user/user_doc.md +++ b/docs/user/user_doc.md @@ -282,6 +282,9 @@ Using the aiXplain SDK, you can also onboard your dataset into the aiXplain plat - Machine translation dataset directly from s3: - [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Asnjeq5JQ9pV6UUQ2Z20XtrjnoaFD0nf?usp=sharing) +- Image Label Detection Dataset: + - [Link](../samples/label_dataset_onboarding/label_dataset_onboarding.ipynb) + ## FineTune [FineTune](https://aixplain.com/platform/finetune) allows you to customize models by tuning them using your data and enhancing their performance. 
Set up and start fine-tuning with a few lines of code. Once fine-tuning is complete, the model will be deployed into your assets, ready for you to use. diff --git a/tests/functional/pipelines/run_test.py b/tests/functional/pipelines/run_test.py index e7af6c4e..e4389587 100644 --- a/tests/functional/pipelines/run_test.py +++ b/tests/functional/pipelines/run_test.py @@ -109,24 +109,24 @@ def test_run_multipipe_with_datasets(batchmode: bool): assert response["status"] == "SUCCESS" -@pytest.mark.parametrize("batchmode", [True, False]) -def test_run_segment_reconstruct(batchmode: bool): +def test_run_segment_reconstruct(): pipeline = PipelineFactory.list(query="Segmentation/Reconstruction Functional Test - DO NOT DELETE")["results"][0] - response = pipeline.run("https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav", **{"batchmode": batchmode}) + response = pipeline.run("https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav") assert response["status"] == "SUCCESS" output = response["data"][0] assert output["label"] == "Output 1" -@pytest.mark.parametrize("batchmode", [True, False]) -def test_run_metric(batchmode: bool): +def test_run_metric(): pipeline = PipelineFactory.list(query="ASR Metric Functional Test - DO NOT DELETE")["results"][0] - response = pipeline.run({ - "AudioInput": "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav", - "ReferenceInput": "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.txt" - }, **{"batchmode": batchmode}) - + response = pipeline.run( + { + "AudioInput": "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav", + "ReferenceInput": "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.txt", + } + ) + assert response["status"] == "SUCCESS" assert len(response["data"]) == 2 assert response["data"][0]["label"] in ["TranscriptOutput", "ScoreOutput"] @@ -134,34 +134,30 @@ def test_run_metric(batchmode: bool): 
@pytest.mark.parametrize( - "batchmode,input_data,output_data", + "input_data,output_data", [ - (True, "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav", "AudioOutput"), - (False, "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav", "AudioOutput"), - (True, "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.txt", "TextOutput"), - (False, "https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.txt", "TextOutput") - ] + ("https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.wav", "AudioOutput"), + ("https://aixplain-platform-assets.s3.amazonaws.com/samples/en/CPAC1x2.txt", "TextOutput"), + ], ) -def test_run_router(batchmode: bool, input_data: str, output_data: str): +def test_run_router(input_data: str, output_data: str): pipeline = PipelineFactory.list(query="Router Test - DO NOT DELETE")["results"][0] - response = pipeline.run(input_data, **{"batchmode": batchmode}) - + response = pipeline.run(input_data) + assert response["status"] == "SUCCESS" assert response["data"][0]["label"] == output_data @pytest.mark.parametrize( - "batchmode,input_data,output_data", + "input_data,output_data", [ - (True, "I love it.", "PositiveOutput"), - (False, "I love it.", "PositiveOutput"), - (True, "I hate it.", "NegativeOutput"), - (False, "I hate it.", "NegativeOutput") - ] + ("I love it.", "PositiveOutput"), + ("I hate it.", "NegativeOutput"), + ], ) -def test_run_decision(batchmode: bool, input_data: str, output_data: str): +def test_run_decision(input_data: str, output_data: str): pipeline = PipelineFactory.list(query="Decision Test - DO NOT DELETE")["results"][0] - response = pipeline.run(input_data, **{"batchmode": batchmode}) - + response = pipeline.run(input_data) + assert response["status"] == "SUCCESS" - assert response["data"][0]["label"] == output_data \ No newline at end of file + assert response["data"][0]["label"] == output_data