In [None]:
#| default_exp components.datablob

Note: 

While writing docstrings, please use the syntax below for linking methods/classes, so that the methods/classes get highlighted in the browser and clicking on them takes the user to the linked function.

    - To link a method from the same class or file, please use the `method_name` format.
    - To link a method from a different class (which can also be in a separate file), please use the `Classname.method_name` format.

In [None]:
from airt.testing import activate_by_import

[INFO] airt.testing.activate_by_import: Testing environment activated.


In [None]:
#| export

from typing import *

In [None]:
#| exporti

import os
import requests
from contextlib import contextmanager

import pandas as pd
from fastcore.foundation import patch
from datetime import datetime, timedelta
from pathlib import Path
from tqdm import tqdm

from airt.logger import get_logger, set_level
from airt.helper import (
    get_data,
    post_data,
    delete_data,
    add_ready_column,
    generate_df,
    get_values_from_item,
    get_attributes_from_instances,
    add_example_to_docs
)

from airt.components.client import Client
from airt.components.datasource import DataSource
from airt.components.progress_status import ProgressStatus
from airt.constant import CLIENT_DB_USERNAME, CLIENT_DB_PASSWORD

In [None]:
from urllib.parse import quote_plus as urlquote

import logging
import pytest
import tempfile
import shutil
import json

from sqlmodel import create_engine
from azure.identity import DefaultAzureCredential
from azure.mgmt.storage import StorageManagementClient

import airt.sanitizer
from airt.docstring.helpers import run_examples_from_docstring
from airt.constant import SERVICE_USERNAME, SERVICE_PASSWORD

In [None]:
#| exporti

logger = get_logger(__name__)

In [None]:
# Sanity check for the module logger: show its effective level and make sure
# it is INFO before emitting one record per severity.
effective_level = logger.getEffectiveLevel()
display(effective_level)
assert effective_level == logging.INFO

# One sample message for each severity, emitted in increasing order.
for emit_record, sample_message in (
    (logger.debug, "This is a debug message"),
    (logger.info, "This is an info"),
    (logger.warning, "This is a warning"),
    (logger.error, "This is an error"),
):
    emit_record(sample_message)

20

[INFO] __main__: This is an info
[ERROR] __main__: This is an error


In [None]:
# Fixture values for this notebook; the cell carries no `#| export` directive.
# NOTE(review): presumably consumed only by the test cells further down - confirm.

# S3 location of the sample e-commerce behaviour dataset.
TEST_S3_URI = "s3://test-airt-service/ecommerce_behavior_notebooks"
# S3 location whose name suggests the same dataset in CSV form - TODO confirm.
TEST_S3_CSV_URI = "s3://test-airt-service/ecommerce_behavior_csv"
# Azure Blob Storage location of the sample dataset.
TEST_AZURE_URI = "https://testairtservice.blob.core.windows.net/test-container/ecommerce_behavior_notebooks"
# All-zero UUID used as a placeholder identifier in tests.
RANDOM_UUID_FOR_TESTING = "00000000-0000-0000-0000-000000000000"

In [None]:
#| exporti

# Region names matching the defaults documented by the DataBlob factory
# methods: "westeurope" for Azure Blob Storage and "eu-west-1" for AWS S3.
DEFAULT_AZURE_BLOB_STORAGE_REGION = "westeurope"
DEFAULT_S3_REGION = "eu-west-1"

In [None]:
#| export


class DataBlob:
    """A class for importing and processing data from sources such as CSV/parquet files, databases, AWS S3 buckets, and Azure Blob Storage.

    Currently, the only way to instantiate the DataBlob class is to call one of the following factory methods:
    `from_local`, `from_mysql`, `from_clickhouse`, `from_s3`, or `from_azure_blob_storage`, which import the data in
    the parquet file format from:

     - a local CSV/parquet file,

     - a MySQL database,

     - a ClickHouse database,

     - an AWS S3 bucket, and

     - an Azure Blob Storage respectively.

    We intend to support additional databases and storage mediums in future releases.
    """

    # Datablob field names that make up the "basic" column set.
    # NOTE(review): presumably the columns shown in tabular listings - confirm against the callers.
    BASIC_DB_COLS = [
        "uuid",
        "datasources",
        "type",
        "source",
        "region",
        "cloud_provider",
        "tags",
        "pulled_on",
        "completed_steps",
        "total_steps",
        "folder_size",
    ]

    # The basic column set extended with the user, error and disabled fields.
    ALL_DB_COLS = BASIC_DB_COLS + ["user", "error", "disabled"]

    # Mapping from a field name to the more explicit name it is renamed to.
    COLS_TO_RENAME = {
        "uuid": "datablob_uuid",
        "datasources": "datasource_uuids",
        "user": "user_uuid",
    }

    # Class-level list of (str, str) pairs, empty by default.
    # NOTE(review): judging by the name these are (cloud_provider, region) pairs - confirm where it is populated.
    _default_provider_and_regions: List[Tuple[str, str]] = []

    def __init__(
        self,
        uuid: str,
        type: Optional[str] = None,
        source: Optional[str] = None,
        region: Optional[str] = None,
        cloud_provider: Optional[str] = None,
        datasources: Optional[List[str]] = None,
        total_steps: Optional[int] = None,
        completed_steps: Optional[int] = None,
        folder_size: Optional[int] = None,
        disabled: Optional[bool] = None,
        pulled_on: Optional[str] = None,
        user: Optional[str] = None,
        tags: Optional[List] = None,
        error: Optional[str] = None,
    ):
        """Constructs a new DataBlob instance.

        Warning:
            Do not construct this object directly by calling the constructor, please use `from_s3`, `from_azure_blob_storage`, 
            `from_mysql`, `from_clickhouse` or `from_local` methods instead.

        Args:
            uuid: Datablob uuid.
            source: The URI of the data that was used to create the datablob.
            type: The type of source used to generate the datablob. Depending on the source type, one of the following 
                values will be assigned: "s3", "local", "db", or "azure_blob_storage".
            region: The destination cloud provider's region to store the datablob. If None (default value) then the default region will be assigned based on the cloud provider.
            cloud_provider: Cloud storage provider's name to store the datablob. Currently, the API only supports **aws** and **azure** as cloud storage providers.
            datasources: The uuids of the datasources created from the datablob.
            total_steps: The number of steps required to upload the datablob to the server.
            completed_steps: The number of steps completed during the datablob's upload to the server.
            folder_size: The uploaded datablob's size in bytes.
            disabled: A flag that indicates the datablob's status. If the datablob is deleted, then **False** will be set.
            pulled_on: The most recent date the datablob was uploaded.
            user: The uuid of the user who created the datablob.
            tags: Tag names associated with the datablob.
            error: Contains the error message if the processing of the datablob fails.
        """
        self.uuid = uuid
        self.type = type
        self.source = source
        self.region = region
        self.cloud_provider = cloud_provider
        self.datasources = datasources
        self.total_steps = total_steps
        self.completed_steps = completed_steps
        self.folder_size = folder_size
        self.disabled = disabled
        self.pulled_on = pulled_on
        self.user = user
        self.tags = tags
        self.error = error

    @staticmethod
    def _get_tag_name_and_datasource_id(res: Dict[str, Any]) -> Dict[str, Any]:
        """Get tag name and datasource ids as string seperated by comma.

        Args:
            res: The response object.

        Returns:
            The modified response object with tag name and datasource ids as string seperated by comma.
        """
        res["tags"] = get_values_from_item(res["tags"], "name")
        res["datasources"] = get_values_from_item(res["datasources"])

        return res

    @staticmethod
    def from_s3(
        *,
        uri: str,
        access_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        cloud_provider: Optional[str] = None,
        region: Optional[str] = None,
        tag: Optional[str] = None,
    ) -> "DataBlob":
        """Create and return a datablob that encapsulates the data from an AWS S3 bucket.

        Args:
            uri: AWS S3 bucket uri.
            access_key: Access key for the S3 bucket. If **None** (default value), then the value
                from **AWS_ACCESS_KEY_ID** environment variable will be used.
            secret_key: Secret key for the S3 bucket. If **None** (default value), then the value
                from **AWS_SECRET_ACCESS_KEY** environment variable will be used.
            cloud_provider: The destination cloud storage provider's name to store the datablob. Currently, the API only supports **aws** and **azure** as cloud storage providers.
                If **None** (default value), then **aws** will be used as the cloud storage provider.
            region: The region of the destination cloud provider where the datablob will be stored. If **None** (default value) then the default region will be assigned based on
                the cloud provider. In the case of **aws**, the datablob's source bucket region will be used, whereas **azure** will use **westeurope**. The supported AWS regions
                are: ap-northeast-1, ap-northeast-2, ap-south-1, ap-southeast-1, ap-southeast-2, ca-central-1, eu-central-1, eu-north-1, eu-west-1, eu-west-2, eu-west-3, sa-east-1,
                us-east-1, us-east-2, us-west-1, us-west-2. The supported Azure Blob Storage regions are: australiacentral, australiacentral2, australiaeast, australiasoutheast,
                brazilsouth, canadacentral, canadaeast, centralindia, centralus, eastasia, eastus, eastus2, francecentral, francesouth, germanynorth, germanywestcentral, japaneast,
                japanwest, koreacentral, koreasouth, northcentralus, northeurope, norwayeast, norwaywest, southafricanorth, southafricawest, southcentralus, southeastasia, southindia,
                switzerlandnorth, switzerlandwest, uaecentral, uaenorth, uksouth, ukwest, westcentralus, westeurope, westindia, westus, westus2.
            tag: A string to tag the datablob. If not passed, then the tag **latest** will be assigned to the datablob.

        Returns:
            An instance of the `DataBlob` class.

        Raises:
            KeyError: If a key is not passed and the matching environment variable is not set.
            ValueError: If parameters to the API are invalid.
            ConnectionError: If the server address is invalid or not reachable.

        Here's an example of how to create a Datablob from an AWS S3 bucket:

        Example:
            ```python
            # Importing necessary libraries
            from airt.client import Client, DataBlob

            # Authenticate
            Client.get_token(username="{fill in username}", password="{fill in password}")

            # Create a datablob
            # In this example, the access_key and the secret_key are set in the
            # AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables. The region
            # is set to eu-west-3, feel free to change the cloud provider and the region
            # to suit your needs.
            db = DataBlob.from_s3(
                uri="{fill in uri}",
                cloud_provider="aws",
                region="eu-west-3"
            )

            # Display the status in a progress bar
            db.progress_bar()

            # Print the details of the newly created datablob
            # If the upload is successful, the ready flag should be set to True
            print(db.details())
            ```
        """
        # Credentials that were not passed in are read from the environment,
        # access key first; a missing variable raises KeyError.
        if access_key is None:
            access_key = os.environ["AWS_ACCESS_KEY_ID"]
        if secret_key is None:
            secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]

        cloud_provider, region = DataBlob._get_cloud_provider_and_region(cloud_provider=cloud_provider, region=region, set_source_region=True) # type: ignore

        payload = {
            "uri": uri,
            "access_key": access_key,
            "secret_key": secret_key,
            "region": region,
            "cloud_provider": cloud_provider,
            "tag": tag,
        }
        created = Client._post_data(relative_url="/datablob/from_s3", json=payload)

        # Only the identifying fields of the response are kept on the new instance.
        return DataBlob(
            uuid=created["uuid"], type=created["type"], source=created["source"]
        )
    
    @classmethod
    def from_azure_blob_storage(
        cls,
        uri: str,
        credential: str,
        cloud_provider: Optional[str] = None,
        region: Optional[str] = None,
        tag: Optional[str] = None,
    ) -> "DataBlob":
        """Create and return a datablob that encapsulates the data from an Azure Blob Storage.

        Args:
            uri: Azure Blob Storage URI of the source file.
            credential: Credential to access the Azure Blob Storage.
            cloud_provider: The destination cloud storage provider's name to store the datablob. Currently, the API only supports **aws** and **azure** as cloud storage providers.
                If **None** (default value), then **azure** will be used as the cloud storage provider.
            region: The destination cloud provider's region to store the datablob. If **None** (default value) then the default region will be assigned based on the cloud
                provider. In the case of **aws**, **eu-west-1** will be used and in the case of **azure**, **westeurope** will be used. The supported AWS regions
                are: ap-northeast-1, ap-northeast-2, ap-south-1, ap-southeast-1, ap-southeast-2, ca-central-1, eu-central-1, eu-north-1, eu-west-1, eu-west-2, eu-west-3, sa-east-1,
                us-east-1, us-east-2, us-west-1, us-west-2. The supported Azure Blob Storage regions are: australiacentral, australiacentral2, australiaeast, australiasoutheast,
                brazilsouth, canadacentral, canadaeast, centralindia, centralus, eastasia, eastus, eastus2, francecentral, francesouth, germanynorth, germanywestcentral, japaneast,
                japanwest, koreacentral, koreasouth, northcentralus, northeurope, norwayeast, norwaywest, southafricanorth, southafricawest, southcentralus, southeastasia, southindia,
                switzerlandnorth, switzerlandwest, uaecentral, uaenorth, uksouth, ukwest, westcentralus, westeurope, westindia, westus, westus2.
            tag: A string to tag the datablob. If not passed, then the tag **latest** will be assigned to the datablob.

        Returns:
            An instance of the `DataBlob` class.

        Raises:
            ValueError: If parameters to the API are invalid.
            ConnectionError: If the server address is invalid or not reachable.

        To create a Datablob from Azure Blob Storage, you must have a valid Azure Blob Storage credential.

        If you don't know how to get the Azure Blob Storage credential, you can follow the Python example below. It shows one of the ways to get the Azure Blob Storage credential.

        - If you don't already have them, please install the Azure Storage Management (azure-mgmt-storage) and Azure Resource Management (azure-mgmt-resource) python client libraries using pip.

        - Ensure the following four environment variables are set in your current working environment with appropriate values.

            - AZURE_TENANT_ID

            - AZURE_CLIENT_ID

            - AZURE_CLIENT_SECRET

            - AZURE_SUBSCRIPTION_ID

        - Assign the resource group name to the azure_group_name variable and the storage account name to the azure_storage_account_name variable.

        - Below is sample code that creates a datablob and stores it in S3. Please copy it and replace the placeholders with appropriate values.

        Example:
            ```python
            # Importing necessary libraries
            import os

            from azure.identity import DefaultAzureCredential
            from azure.mgmt.storage import StorageManagementClient

            from airt.client import Client, DataBlob

            # Create a credential for accessing Azure Blob Storage
            # Setting the required environment variables
            os.environ["AZURE_SUBSCRIPTION_ID"] = "{fill in azure_subscription_id}"
            os.environ["AZURE_CLIENT_ID"] = "{fill in azure_client_id}"
            os.environ["AZURE_CLIENT_SECRET"] = "{fill in azure_client_secret}"
            os.environ["AZURE_TENANT_ID"]= "{fill in azure_tenant_id}"

            # Setting the resource group name and storage account name
            azure_group_name = "{fill in azure_group_name}"
            azure_storage_account_name = "{fill in azure_storage_account_name}"

            # Retrieving the credential
            azure_storage_client = StorageManagementClient(
                DefaultAzureCredential(), os.environ["AZURE_SUBSCRIPTION_ID"]
            )
            azure_storage_keys = azure_storage_client.storage_accounts.list_keys(
                azure_group_name, azure_storage_account_name
            )
            azure_storage_keys = {v.key_name: v.value for v in azure_storage_keys.keys}
            credential = azure_storage_keys['key1']


            # Authenticate
            Client.get_token(username="{fill in username}", password="{fill in password}")

            # Create a datablob
            # In this example, the datablob will be stored in an AWS S3 bucket. The region
            # is set to eu-west-1 (default), feel free to change the cloud provider and
            # the region to suit your needs.
            db = DataBlob.from_azure_blob_storage(
                uri="{fill in uri}",
                cloud_provider="aws",
                credential=credential
            )

            # Display the status in a progress bar
            db.progress_bar()

            # Print the details of the newly created datablob
            # If the upload is successful, the ready flag should be set to True
            print(db.details())
            ```
        """
        # "azure" is passed as the provider to fall back on for this source.
        cloud_provider, region = DataBlob._get_cloud_provider_and_region(cloud_provider=cloud_provider, region=region, default_cloud_provider="azure") # type: ignore

        payload = {
            "uri": uri,
            "credential": credential,
            "region": region,
            "cloud_provider": cloud_provider,
            "tag": tag,
        }
        created = Client._post_data(
            relative_url="/datablob/from_azure_blob_storage", json=payload
        )

        # Only the identifying fields of the response are kept on the new instance.
        return DataBlob(
            uuid=created["uuid"], type=created["type"], source=created["source"]
        )

    @staticmethod
    def from_mysql(
        *,
        host: str,
        database: str,
        table: str,
        port: int = 3306,
        cloud_provider: Optional[str] = None,
        region: Optional[str] = None,
        username: Optional[str] = None,
        password: Optional[str] = None,
        tag: Optional[str] = None,
    ) -> "DataBlob":
        """Create and return a datablob that encapsulates the data from a MySQL database.

        If the database requires authentication, pass the username/password as parameters or store them in
        the **AIRT_CLIENT_DB_USERNAME** and **AIRT_CLIENT_DB_PASSWORD** environment variables.

        Args:
            host: Remote database host name.
            database: Database name.
            table: Table name.
            port: Host port number. If not passed, then the default value **3306** will be used.
            cloud_provider: The destination cloud storage provider's name to store the datablob. Currently, the API only supports **aws** and **azure** as cloud storage providers.
                If **None** (default value), then **aws** will be used as the cloud storage provider.
            region: The destination cloud provider's region to store the datablob. If **None** (default value) then the default region will be assigned based on the cloud
                provider. In the case of **aws**, **eu-west-1** will be used and in the case of **azure**, **westeurope** will be used. The supported AWS regions
                are: ap-northeast-1, ap-northeast-2, ap-south-1, ap-southeast-1, ap-southeast-2, ca-central-1, eu-central-1, eu-north-1, eu-west-1, eu-west-2, eu-west-3, sa-east-1,
                us-east-1, us-east-2, us-west-1, us-west-2. The supported Azure Blob Storage regions are: australiacentral, australiacentral2, australiaeast, australiasoutheast,
                brazilsouth, canadacentral, canadaeast, centralindia, centralus, eastasia, eastus, eastus2, francecentral, francesouth, germanynorth, germanywestcentral, japaneast,
                japanwest, koreacentral, koreasouth, northcentralus, northeurope, norwayeast, norwaywest, southafricanorth, southafricawest, southcentralus, southeastasia, southindia,
                switzerlandnorth, switzerlandwest, uaecentral, uaenorth, uksouth, ukwest, westcentralus, westeurope, westindia, westus, westus2.
            username: Database username. If not passed, the default value **"root"** will be used unless the value is explicitly set in the environment variable
                **AIRT_CLIENT_DB_USERNAME**.
            password: Database password. If not passed, the default value **""** will be used unless the value is explicitly set in the environment variable
                **AIRT_CLIENT_DB_PASSWORD**.
            tag: A string to tag the datablob. If not passed, then the tag **latest** will be assigned to the datablob.

        Returns:
           An instance of the `DataBlob` class.

        Raises:
            ValueError: If parameters to the API are invalid.
            ConnectionError: If the server address is invalid or not reachable.

        Here's an example of how to create a Datablob from a MySQL database:

        Example:
            ```python
            # Importing necessary libraries
            from airt.client import Client, DataBlob

            # Authenticate
            Client.get_token(username="{fill in username}", password="{fill in password}")

            # Create a datablob
            # In this example, the datablob will be stored in an AWS S3 bucket. The region
            # is set to eu-west-3, feel free to change the cloud provider and the region
            # to suit your needs.
            db = DataBlob.from_mysql(
                username="{fill in database_username}",
                password="{fill in database_password}",
                host="{fill in host}",
                database="{fill in database}",
                table="{fill in table}",
                port="{fill in port}",
                cloud_provider="aws",
                region="eu-west-3"
            )

            # Display the status in a progress bar
            db.progress_bar()

            # Print the details of the newly created datablob
            # If the upload is successful, the ready flag should be set to True
            print(db.details())
            ```
        """
        # Credentials that were not passed in come from the environment, with
        # "root" and the empty string as the last-resort defaults.
        if username is None:
            username = os.environ.get(CLIENT_DB_USERNAME, "root")
        if password is None:
            password = os.environ.get(CLIENT_DB_PASSWORD, "")

        cloud_provider, region = DataBlob._get_cloud_provider_and_region(cloud_provider, region) # type: ignore

        payload = {
            "host": host,
            "port": port,
            "username": username,
            "password": password,
            "database": database,
            "table": table,
            "region": region,
            "cloud_provider": cloud_provider,
            "tag": tag,
        }
        created = Client._post_data(relative_url="/datablob/from_mysql", json=payload)

        # Only the identifying fields of the response are kept on the new instance.
        return DataBlob(
            uuid=created["uuid"], type=created["type"], source=created["source"]
        )

    @staticmethod
    def from_clickhouse(
        *,
        host: str,
        database: str,
        table: str,
        protocol: str,
        index_column: str,
        timestamp_column: str,
        port: int = 0,
        cloud_provider: Optional[str] = None,
        region: Optional[str] = None,
        username: Optional[str] = None,
        password: Optional[str] = None,
        filters: Optional[Dict[str, Any]] = None,
        tag: Optional[str] = None,
    ) -> "DataBlob":
        """Create and return a datablob that encapsulates the data from a ClickHouse database.

        If the database requires authentication, pass the username/password as parameters or store them in
        the **CLICKHOUSE_USERNAME** and **CLICKHOUSE_PASSWORD** environment variables.

        Args:
            host: Remote database host name.
            database: Database name.
            table: Table name.
            protocol: Protocol to use. The valid values are "native" and "http".
            index_column: The column to use as index (row labels).
            timestamp_column: Timestamp column name in the table.
            port: Host port number. If not passed, then the default value **0** will be used.
            cloud_provider: The destination cloud storage provider's name to store the datablob. Currently, the API only supports **aws** and **azure** as cloud storage providers.
                If **None** (default value), then **aws** will be used as the cloud storage provider.
            region: The destination cloud provider's region to store the datablob. If **None** (default value) then the default region will be assigned based on the cloud
                provider. In the case of **aws**, **eu-west-1** will be used and in the case of **azure**, **westeurope** will be used. The supported AWS regions
                are: ap-northeast-1, ap-northeast-2, ap-south-1, ap-southeast-1, ap-southeast-2, ca-central-1, eu-central-1, eu-north-1, eu-west-1, eu-west-2, eu-west-3, sa-east-1,
                us-east-1, us-east-2, us-west-1, us-west-2. The supported Azure Blob Storage regions are: australiacentral, australiacentral2, australiaeast, australiasoutheast,
                brazilsouth, canadacentral, canadaeast, centralindia, centralus, eastasia, eastus, eastus2, francecentral, francesouth, germanynorth, germanywestcentral, japaneast,
                japanwest, koreacentral, koreasouth, northcentralus, northeurope, norwayeast, norwaywest, southafricanorth, southafricawest, southcentralus, southeastasia, southindia,
                switzerlandnorth, switzerlandwest, uaecentral, uaenorth, uksouth, ukwest, westcentralus, westeurope, westindia, westus, westus2.
            username: Database username. If not passed, the default value **"root"** will be used unless the value is explicitly set in the environment variable
                **CLICKHOUSE_USERNAME**.
            password: Database password. If not passed, the default value **""** (an empty string) will be used unless the value is explicitly set in the environment variable
                **CLICKHOUSE_PASSWORD**.
            filters: Additional parameters to be used when importing data. For example, if you want to filter and extract data only for a specific user_id, pass {"user_id": 1}.
            tag: A string to tag the datablob. If not passed, then the tag **latest** will be assigned to the datablob.

        Returns:
           An instance of the `DataBlob` class.

        Raises:
            ValueError: If parameters to the API are invalid.
            ConnectionError: If the server address is invalid or not reachable.

        Here's an example of how to create a Datablob from a ClickHouse database:

        Example:
            ```python
            # Importing necessary libraries
            from airt.client import Client, DataBlob

            # Authenticate
            Client.get_token(username="{fill in username}", password="{fill in password}")

            # Create a datablob
            # In this example, the datablob will be stored in an AWS S3 bucket. The region
            # is set to eu-west-3, feel free to change the cloud provider and the region
            # to suit your needs.
            db = DataBlob.from_clickhouse(
                username="{fill in database_username}",
                password="{fill in database_password}",
                host="{fill in host}",
                database="{fill in database}",
                table="{fill in table}",
                index_column="{fill in index_column}",
                timestamp_column="{fill in timestamp_column}",
                port="{fill in port}",
                filters={fill in filters},
                protocol="native",
                cloud_provider="aws",
                region="eu-west-3"
            )

            # Display the status in a progress bar
            db.progress_bar()

            # Print the details of the newly created datablob
            # If the upload is successful, the ready flag should be set to True
            print(db.details())
            ```
        """
        # Credentials that were not passed in come from the environment. The
        # last-resort defaults are "root" for the username and the empty string
        # for the password.
        if username is None:
            username = os.environ.get("CLICKHOUSE_USERNAME", "root")
        if password is None:
            password = os.environ.get("CLICKHOUSE_PASSWORD", "")

        cloud_provider, region = DataBlob._get_cloud_provider_and_region(cloud_provider, region) # type: ignore

        json_req = dict(
            host=host,
            database=database,
            table=table,
            protocol=protocol,
            port=port,
            username=username,
            password=password,
            index_column=index_column,
            timestamp_column=timestamp_column,
            filters=filters,
            region=region,
            cloud_provider=cloud_provider,
            tag=tag,
        )

        # Plain string literal: the URL has no placeholders to interpolate.
        response = Client._post_data(
            relative_url="/datablob/from_clickhouse", json=json_req
        )

        # Only the identifying fields of the response are kept on the new instance.
        return DataBlob(
            uuid=response["uuid"], type=response["type"], source=response["source"]
        )

    @staticmethod
    def _upload_to_s3_with_retry(
        file_to_upload: Path,
        presigned_url: str,
        presigned_fields: Dict[str, Any],
        max_retry: int = 3,
        curr_iteration: int = 1,
    ):
        """Upload local files to s3 using presigned url

        Args:
            file_to_upload: path of file to upload
            presigned_url: presigned url to upload to
            presigned_fields: presigned fields provided by boto3
            max_retry: maximum retry count
            curr_iteration: current iteration count for internal use
        """
        try:
            with open(file_to_upload, "rb") as f:
                files = {"file": (str(file_to_upload), f)}
                response = requests.post(
                    presigned_url, data=presigned_fields, files=files
                )
                if not response.status_code == 204:
                    raise ValueError(response.text)

        except requests.exceptions.ConnectionError as e:
            if curr_iteration == max_retry:
                raise e
            DataBlob._upload_to_s3_with_retry(
                file_to_upload,
                presigned_url,
                presigned_fields,
                max_retry,
                curr_iteration + 1,
            )

    @staticmethod
    def from_local(
        path: Union[str, Path],
        cloud_provider: Optional[str] = None,
        region: Optional[str] = None,
        tag: Optional[str] = None,
        show_progress: Optional[bool] = True,
    ) -> "DataBlob":
        """Create and return a datablob from local file.

        The API currently allows users to create datablobs from CSV or Parquet files. We intend to support additional file formats in future releases.

        Args:
            path: The relative or absolute path to a local file or to a directory containing the source files.
                If a directory is passed, every entry directly inside it is uploaded.
            cloud_provider: The destination cloud storage provider's name to store the datablob. Currently, the API only supports **aws** and **azure** as cloud storage providers.
                If **None** (default value), then **aws**  will be used as the cloud storage provider.
            region: The destination cloud provider's region to store the datablob. If **None** (default value) then the default region will be assigned based on the cloud
                provider. In the case of **aws**, **eu-west-1** will be used and in the case of **azure**, **westeurope** will be used. The supported AWS regions
                are: ap-northeast-1, ap-northeast-2, ap-south-1, ap-southeast-1, ap-southeast-2, ca-central-1, eu-central-1, eu-north-1, eu-west-1, eu-west-2, eu-west-3, sa-east-1,
                us-east-1, us-east-2, us-west-1, us-west-2. The supported Azure Blob Storage regions are: australiacentral, australiacentral2, australiaeast, australiasoutheast,
                brazilsouth, canadacentral, canadaeast, centralindia, centralus, eastasia, eastus, eastus2, francecentral, francesouth, germanynorth, germanywestcentral, japaneast,
                japanwest, koreacentral, koreasouth, northcentralus, northeurope, norwayeast, norwaywest, southafricanorth, southafricawest, southcentralus, southeastasia, southindia,
                switzerlandnorth, switzerlandwest, uaecentral, uaenorth, uksouth, ukwest, westcentralus, westeurope, westindia, westus, westus2.
            tag: A string to tag the datablob. If not passed, then the tag **latest** will be assigned to the datablob.
            show_progress: Flag to set the progressbar visibility. If not passed, then the default value **True** will be used.

        Returns:
           An instance of the `DataBlob` class.

        Raises:
            ValueError: If parameters to the API are invalid, or if the cloud storage rejects a file upload.
            ConnectionError: If the server address is invalid or not reachable.

        Here's an example of how to create a Datablob from a local file:

        Example:
            ```python
            # Importing necessary libraries
            from  airt.client import Client, DataBlob

            # Authenticate
            Client.get_token(username="{fill in username}", password="{fill in password}")

            # Create a datablob
            # In this example, the datablob will be stored in an AWS S3 bucket. The region
            # is set to eu-west-3, feel free to change the cloud provider and the region 
            # to suit your needs.
            db = DataBlob.from_local(
                path="{fill in path}",
                cloud_provider="aws",
                region="eu-west-3"
            )

            # Display the status in a progress bar
            db.progress_bar()

            # Print the details of the newly created datablob
            # If the upload is successful, the ready flag should be set to True
            print(db.details())

            ```
        """
        path = Path(path)
        cloud_provider, region = DataBlob._get_cloud_provider_and_region(cloud_provider, region) # type: ignore

        # Step 1: register the upload with the server and get the presigned URL
        response = Client._post_data(
            relative_url="/datablob/from_local/start",
            json=dict(
                path=f"local:{path}",
                region=region,
                cloud_provider=cloud_provider,
                tag=tag,
            ),
        )

        # Step 2: upload the local file(s) using the presigned URL
        files = list(path.glob("*")) if path.is_dir() else [path]

        # The context manager closes the progress bar even when an upload raises,
        # so a failed upload does not leave a dangling bar behind.
        with tqdm(total=len(files), disable=not show_progress) as progress:
            for file_to_upload in files:
                DataBlob._upload_to_s3_with_retry(
                    file_to_upload=file_to_upload,
                    presigned_url=response["presigned"]["url"],
                    presigned_fields=response["presigned"]["fields"],
                )
                progress.update()

        return DataBlob(uuid=response["uuid"], type=response["type"])

    @staticmethod
    def ls(
        offset: int = 0,
        limit: int = 100,
        disabled: bool = False,
        completed: bool = False,
    ) -> List["DataBlob"]:
        """Return the list of DataBlob instances

        Args:
            offset: The number of datablobs to skip at the beginning. Defaults to **0**.
            limit: The maximum number of datablobs to return from the server. Defaults to **100**.
            disabled: If set to **True**, then only the deleted datablobs will be returned.
                With the default value **False**, only the active datablobs are returned.
            completed: If set to **True**, then only the datablobs that are successfully downloaded
                to the server will be returned. With the default value **False**, all the
                datablobs are returned.

        Returns:
            A list of DataBlob instances available in the server.

        Raises:
            ConnectionError: If the server address is invalid or not reachable.
        """
        query = f"disabled={disabled}&completed={completed}&offset={offset}&limit={limit}"
        records = Client._get_data(relative_url=f"/datablob/?{query}")

        # Keys copied from every server record into the DataBlob constructor
        fields = (
            "uuid",
            "type",
            "source",
            "region",
            "cloud_provider",
            "datasources",
            "total_steps",
            "completed_steps",
            "folder_size",
            "disabled",
            "pulled_on",
            "user",
            "tags",
            "error",
        )

        return [
            DataBlob(**{field: record[field] for field in fields})
            for record in records
        ]

    @staticmethod
    def as_df(dbx: List["DataBlob"]) -> pd.DataFrame:
        """Return the details of datablob instances as a pandas dataframe.

        Args:
            dbx: List of datablob instances.

        Returns:
            Details of all the datablobs in a dataframe.

        Raises:
            ConnectionError: If the server address is invalid or not reachable.
        """
        db_lists = get_attributes_from_instances(dbx, DataBlob.ALL_DB_COLS)  # type: ignore

        # Keep the helper's return value. The previous loop only rebound its loop
        # variable, so it relied on the helper mutating each record in place.
        db_lists = [DataBlob._get_tag_name_and_datasource_id(db) for db in db_lists]

        lists_df = generate_df(db_lists, DataBlob.BASIC_DB_COLS)
        df = add_ready_column(lists_df)

        return df.rename(columns=DataBlob.COLS_TO_RENAME)

    def is_ready(self) -> bool:
        """Check if the method's progress is complete.

        !!! info

            This method will return `True` immediately and will not wait for the progress to finish
            if the datablob is created using the `from_local` method.

        Returns:
            **True** if the upload progress is completed, else **False**.
        """
        # Only non-local datablobs are checked against the server;
        # datablobs of type "local" are reported as ready right away.
        if self.type not in ["local"]:
            return ProgressStatus(relative_url=f"/datablob/{self.uuid}").is_ready()

        return True

    def progress_bar(self, sleep_for: Union[int, float] = 5, timeout: int = 0):
        """Blocks the execution and displays a progress bar showing the remote action progress.

        !!! info

            This method will not check the progress if the datablob is created using the
            `from_local` method.

        Args:
            sleep_for: The time interval in seconds between successive API calls.
            timeout: The maximum time allowed in seconds for the asynchronous call to complete.
                If the call does not complete in time, the progress bar will be terminated.

        Raises:
            ConnectionError: If the server address is invalid or not reachable.
            TimeoutError: in case of connection timeout.
        """
        # Datablobs of type "local" are not tracked, so there is nothing to display.
        if self.type in ["local"]:
            return

        tracker = ProgressStatus(
            relative_url=f"/datablob/{self.uuid}",
            sleep_for=sleep_for,
            timeout=timeout,
        )
        tracker.progress_bar()

    def wait(self, sleep_for: Union[int, float] = 1, timeout: int = 0):
        """Blocks execution while waiting for the remote action to complete.

        !!! info

            This method will not check the progress if the datablob is created using the
            `from_local` method.

        Args:
            sleep_for: The time interval in seconds between successive API calls.
            timeout: The maximum time allowed in seconds for the asynchronous call to complete.

        Raises:
            ConnectionError: If the server address is invalid or not reachable.
            TimeoutError: in case of timeout.
        """
        # Datablobs of type "local" are not tracked, so there is nothing to wait for.
        if self.type in ["local"]:
            return

        tracker = ProgressStatus(
            relative_url=f"/datablob/{self.uuid}",
            sleep_for=sleep_for,
            timeout=timeout,
        )
        tracker.wait()

    def to_datasource(
        self,
        *,
        file_type:str,
        index_column: str,
        sort_by: Union[str, List[str]],
        deduplicate_data: bool = False,
        blocksize: str = "256MB",
        **kwargs,
    ) -> DataSource:
        # Placeholder that only declares the keyword-only signature.
        # NOTE(review): presumably replaced by a `@patch`-ed implementation
        # elsewhere in this notebook (as is done for `details`) - confirm.
        raise NotImplementedError()

    def details(self) -> pd.DataFrame:
        # Placeholder that only declares the signature; the working implementation
        # is attached to the class later in this notebook with `@patch`.
        raise NotImplementedError()

    def tag(self, name: str) -> pd.DataFrame:
        # Placeholder that only declares the signature.
        # NOTE(review): presumably replaced by a `@patch`-ed implementation
        # elsewhere in this notebook (as is done for `details`) - confirm.
        raise NotImplementedError()

    def delete(self) -> pd.DataFrame:
        # Placeholder that only declares the signature.
        # NOTE(review): presumably replaced by a `@patch`-ed implementation
        # elsewhere in this notebook (as is done for `details`) - confirm.
        raise NotImplementedError()

In [None]:
#| exporti

def _docstring_example():
    """
    Example:
        ```python
        # Importing necessary libraries
        from  airt.client import Client, DataBlob

        # Authenticate
        Client.get_token(username="{fill in username}", password="{fill in password}")

        # Create a datablob
        # In this example, the datablob will be stored in an AWS S3 bucket. The 
        # access_key and the secret_key are set in the AWS_ACCESS_KEY_ID and 
        # AWS_SECRET_ACCESS_KEY environment variables, and the region is set to 
        # eu-west-3; feel free to change the cloud provider and the region to 
        # suit your needs.
        db = DataBlob.from_s3(
            uri="{fill in uri}",
            cloud_provider="aws",
            region="eu-west-3"
        )

        # Display the status in a progress bar
        # Call the wait method to wait for the progress to finish but
        # without displaying an interactive progress bar.
        db.progress_bar()
        
        # Display the ready status
        # If the datablob is successfully uploaded, True will be returned.
        print(db.is_ready())

        # Print the details of the newly created datablob
        print(db.details())
        
        # Display the details of all datablob created by the currently
        # logged-in user
        print(DataBlob.as_df(DataBlob.ls()))
        
        # Create a datasource
        ds = db.to_datasource(
            file_type="{fill in file_type}",
            index_column="{fill in index_column}",
            sort_by="{fill in sort_by}",
        )

        # Display the status in a progress bar
        ds.progress_bar()

        # Display the head of the data to ensure everything is fine.
        print(ds.head())
        
        # Tag the datablob
        print(db.tag(name="{fill in tag_name}"))

        # Delete the datablob
        print(db.delete())
        ```
    """
    # This function only carries the docstring above. The docstring is data:
    # other cells pass `_docstring_example.__doc__` to `add_example_to_docs`
    # and run it through `run_examples_from_docstring`, so its text must not
    # be reworded casually.
    pass

In [None]:
# Run example for _docstring_example
# SERVICE_USERNAME / SERVICE_PASSWORD hold the names of the environment variables
# to read; a missing variable raises KeyError here.
# NOTE(review): the keyword arguments appear to fill the matching "{fill in ...}"
# placeholders of the docstring example - confirm against run_examples_from_docstring.
username = os.environ[SERVICE_USERNAME]
password = os.environ[SERVICE_PASSWORD]

run_examples_from_docstring(
    _docstring_example,
    username=username,
    password=password,
    uri=TEST_S3_URI,
    file_type="parquet",
    index_column="user_id",
    sort_by="event_time",
    tag_name="v1.0"
)

In [None]:
#| exporti

# Attach the shared example held in `_docstring_example.__doc__` to the class
# itself and to each of the methods listed below.
add_example_to_docs(DataBlob, _docstring_example.__doc__) # type: ignore
add_example_to_docs(DataBlob.ls, _docstring_example.__doc__) # type: ignore
add_example_to_docs(DataBlob.as_df, _docstring_example.__doc__) # type: ignore
add_example_to_docs(DataBlob.wait, _docstring_example.__doc__) # type: ignore
add_example_to_docs(DataBlob.is_ready, _docstring_example.__doc__) # type: ignore
add_example_to_docs(DataBlob.progress_bar, _docstring_example.__doc__) # type: ignore

In [None]:
#| export


@patch(cls_method=True)
@contextmanager
def set_default_cloud_provider(
    cls: DataBlob, cloud_provider: str, region: Optional[str] = None
) -> Iterator[None]:
    """Sets the default destination value for the cloud_provider and the region.

    Whenever you call the from_\\* methods of the `DataBlob` class inside this context manager, the destination cloud_provider and region set in this context
    will be passed to the from_\\* methods, unless you explicitly override it in the parameter.

    The previously active default is restored when the context exits, even if the code inside the context raises an exception.

    Args:
        cloud_provider: The destination cloud storage provider's name to store the datablob. Currently, the API only supports **aws** and **azure** as cloud storage providers.
        region: The destination cloud provider's region to store the datablob. The supported AWS regions are: ap-northeast-1, ap-northeast-2, ap-south-1, ap-southeast-1,
            ap-southeast-2, ca-central-1, eu-central-1, eu-north-1, eu-west-1, eu-west-2, eu-west-3, sa-east-1, us-east-1, us-east-2, us-west-1, us-west-2. The supported
            Azure Blob Storage regions are: australiacentral, australiacentral2, australiaeast, australiasoutheast, brazilsouth, canadacentral, canadaeast, centralindia,
            centralus, eastasia, eastus, eastus2, francecentral, francesouth, germanynorth, germanywestcentral, japaneast, japanwest, koreacentral, koreasouth,
            northcentralus, northeurope, norwayeast, norwaywest, southafricanorth, southafricawest, southcentralus, southeastasia, southindia, switzerlandnorth,
            switzerlandwest, uaecentral, uaenorth, uksouth, ukwest, westcentralus, westeurope, westindia, westus, westus2.

    Returns:
        A context manager that specifies the cloud provider and region to use.

    Here's an example of creating a datablob from Azure Blob Storage and storing it in AWS S3:

    Example:
        ```python
        # Importing necessary libraries
        import os

        from azure.identity import DefaultAzureCredential
        from azure.mgmt.storage import StorageManagementClient

        from  airt.client import Client, DataBlob

        # Create a credential for accessing Azure Blob Storage
        # Setting the required environment variables
        os.environ["AZURE_SUBSCRIPTION_ID"] = "{fill in azure_subscription_id}"
        os.environ["AZURE_CLIENT_ID"] = "{fill in azure_client_id}"
        os.environ["AZURE_CLIENT_SECRET"] = "{fill in azure_client_secret}"
        os.environ["AZURE_TENANT_ID"]= "{fill in azure_tenant_id}"

        # Setting the resource group name and storage account name
        azure_group_name = "{fill in azure_group_name}"
        azure_storage_account_name = "{fill in azure_storage_account_name}"

        # Retrieving the credential
        azure_storage_client = StorageManagementClient(
            DefaultAzureCredential(), os.environ["AZURE_SUBSCRIPTION_ID"]
        )
        azure_storage_keys = azure_storage_client.storage_accounts.list_keys(
            azure_group_name, azure_storage_account_name
        )
        azure_storage_keys = {v.key_name: v.value for v in azure_storage_keys.keys}
        credential = azure_storage_keys['key1']


        # Authenticate
        Client.get_token(username="{fill in username}", password="{fill in password}")

        # Create a datablob
        # In this example, the datablobs created inside the context manager will be 
        # stored in an AWS S3 bucket with the region set to eu-west-3.
        with DataBlob.set_default_cloud_provider(
            cloud_provider="aws",
            region="eu-west-3"
        ):
            db = DataBlob.from_azure_blob_storage(
                uri="{fill in uri}",
                credential=credential
            )
        
        # Display the status in a progress bar    
        db.progress_bar()
        
        # Print the details of the newly created datablob
        # If the upload is successful, the ready flag should be set to True
        print(db.details())
        ```
    """

    # Contexts nest: each one pushes its pair and the innermost pair is the active default.
    cls._default_provider_and_regions.append((cloud_provider, region))  # type: ignore

    try:
        yield
    finally:
        # Pop in `finally` so an exception raised inside the with-block cannot
        # leave this default active for later, unrelated calls.
        cls._default_provider_and_regions.pop()


@patch(cls_method=True)
def _get_default_provider_and_regions(
    cls: DataBlob,
) -> Tuple[Optional[str], Optional[str]]:
    """Return the most recently pushed default (cloud_provider, region) pair.

    Returns:
        The last entry of `_default_provider_and_regions`, or `(None, None)` when
        no default is currently set.
    """
    defaults = cls._default_provider_and_regions
    return defaults[-1] if defaults else (None, None)


@patch(cls_method=True)
def _get_cloud_provider_and_region(
    cls: DataBlob,
    cloud_provider: Optional[str] = None,
    region: Optional[str] = None,
    set_source_region: Optional[bool] = False,
    default_cloud_provider: str = "aws",
) -> Tuple[str, Optional[str]]:
    """Resolve the effective cloud provider and region for a request.

    Args:
        cloud_provider: Explicitly requested cloud provider, if any.
        region: Explicitly requested region, if any. Requires `cloud_provider`.
        set_source_region: If true and no region could be resolved for a
            non-azure provider, **None** is returned as the region instead of
            the default S3 region.
        default_cloud_provider: Provider to use when neither the arguments nor
            the active default supply one.

    Returns:
        A `(cloud_provider, region)` tuple.

    Raises:
        ValueError: If a region is passed without a cloud provider.
    """
    if cloud_provider is None:
        if region is not None:
            raise ValueError("You must specify a cloud_provider if are specifying a region.")

        # Nothing was passed explicitly: consult the currently active default.
        cloud_provider, region = cls._get_default_provider_and_regions() # type: ignore

    provider = default_cloud_provider if cloud_provider is None else cloud_provider

    if region is not None:
        return provider, region

    if provider == "azure":
        return provider, DEFAULT_AZURE_BLOB_STORAGE_REGION

    return provider, (None if set_source_region else DEFAULT_S3_REGION)

In [None]:
# A region without a cloud provider is rejected
with pytest.raises(ValueError) as e:
    DataBlob._get_cloud_provider_and_region(cloud_provider=None, region="US")
display(e.value)

_cloud_provider, _region = DataBlob._get_cloud_provider_and_region(cloud_provider=None, region=None, default_cloud_provider="azure")
display(_cloud_provider, _region)
assert _cloud_provider == "azure"
assert _region == DEFAULT_AZURE_BLOB_STORAGE_REGION, _region

_cloud_provider, _region = DataBlob._get_cloud_provider_and_region(cloud_provider=None, region=None)
display(_cloud_provider, _region)
assert _cloud_provider == "aws"
assert _region == DEFAULT_S3_REGION, _region

_cloud_provider, _region = DataBlob._get_cloud_provider_and_region(cloud_provider=None, region=None, set_source_region=True)
display(_cloud_provider, _region)
assert _cloud_provider == "aws"
assert _region is None, _region

_cloud_provider, _region = DataBlob._get_cloud_provider_and_region("azure")
display(_cloud_provider, _region)
assert _cloud_provider == "azure"
assert _region == DEFAULT_AZURE_BLOB_STORAGE_REGION, _region

_cloud_provider, _region = DataBlob._get_cloud_provider_and_region("aws")
display(_cloud_provider, _region)
assert _cloud_provider == "aws"
assert _region == DEFAULT_S3_REGION, _region

with DataBlob.set_default_cloud_provider("azure", "US"):
    # The context default applies only when neither argument is passed
    assert DataBlob._get_cloud_provider_and_region() == ("azure", "US")

    # An explicit provider bypasses the context default, including its region
    _cloud_provider, _region = DataBlob._get_cloud_provider_and_region("azure")
    display(_cloud_provider, _region)
    assert _cloud_provider == "azure"
    assert _region == "westeurope", _region
    
    with DataBlob.set_default_cloud_provider("aws", "US"):
        # The innermost context wins
        assert DataBlob._get_cloud_provider_and_region() == ("aws", "US")

        _cloud_provider, _region = DataBlob._get_cloud_provider_and_region("aws")
        display(_cloud_provider, _region)
        assert _cloud_provider == "aws"
        assert _region == "eu-west-1", _region
        
        with DataBlob.set_default_cloud_provider("aws"):
            # A context without a region falls back to the provider's default region
            assert DataBlob._get_cloud_provider_and_region() == ("aws", DEFAULT_S3_REGION)

            _cloud_provider, _region = DataBlob._get_cloud_provider_and_region("aws")
            display(_cloud_provider, _region)
            assert _cloud_provider == "aws"
            assert _region == "eu-west-1", _region

        # Leaving the inner context restores the enclosing default
        assert DataBlob._get_cloud_provider_and_region() == ("aws", "US")
                    
        _cloud_provider, _region = DataBlob._get_cloud_provider_and_region("azure")
        display(_cloud_provider, _region)
        assert _cloud_provider == "azure"
        assert _region == "westeurope", _region

# Leaving the outermost context restores the behaviour asserted before entering it
assert DataBlob._get_cloud_provider_and_region() == ("aws", DEFAULT_S3_REGION)

ValueError('You must specify a cloud_provider if are specifying a region.')

'azure'

'westeurope'

'aws'

'eu-west-1'

'aws'

None

'azure'

'westeurope'

'aws'

'eu-west-1'

'azure'

'westeurope'

'aws'

'eu-west-1'

'aws'

'eu-west-1'

'azure'

'westeurope'

In [None]:
# Run example for DataBlob.set_default_cloud_provider
# Service and Azure credentials are read from environment variables; a missing
# variable raises KeyError here.
# NOTE(review): the keyword arguments appear to fill the matching "{fill in ...}"
# placeholders of the docstring example - confirm against run_examples_from_docstring.

username = os.environ[SERVICE_USERNAME]
password = os.environ[SERVICE_PASSWORD]

run_examples_from_docstring(
    DataBlob.set_default_cloud_provider,
    azure_subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
    azure_client_id=os.environ["AZURE_CLIENT_ID"],
    azure_client_secret=os.environ["AZURE_CLIENT_SECRET"],
    azure_tenant_id=os.environ["AZURE_TENANT_ID"],
    azure_group_name="test-airt-service",
    azure_storage_account_name="testairtservice",
    username=username,
    password=password,
    uri="https://testairtservice.blob.core.windows.net/test-container/ecommerce_behavior_notebooks",
)

<module>:6: No type or annotation for parameter 'cloud_provider'
<module>:7: No type or annotation for parameter 'region'
<module>:15: No type or annotation for returned value 1


In [None]:
# Run example for DataBlob.from_s3
# Service credentials are read from the environment variables whose names are
# stored in SERVICE_USERNAME / SERVICE_PASSWORD; a missing variable raises KeyError.

username = os.environ[SERVICE_USERNAME]
password = os.environ[SERVICE_PASSWORD]

run_examples_from_docstring(
    DataBlob.from_s3,
    username=username,
    password=password,
    uri=TEST_S3_URI
)

<module>:3: No type or annotation for parameter 'uri'
<module>:4: No type or annotation for parameter 'access_key'
<module>:6: No type or annotation for parameter 'secret_key'
<module>:8: No type or annotation for parameter 'cloud_provider'
<module>:10: No type or annotation for parameter 'region'
<module>:17: No type or annotation for parameter 'tag'
<module>:20: No type or annotation for returned value 1
Failed to parse annotation from 'Name' node: 'NoneType' object has no attribute 'resolve'
Failed to parse annotation from 'Name' node: 'NoneType' object has no attribute 'resolve'


In [None]:
# Run example for DataBlob.from_azure_blob_storage
# Service and Azure credentials are read from environment variables; a missing
# variable raises KeyError here.

username = os.environ[SERVICE_USERNAME]
password = os.environ[SERVICE_PASSWORD]

run_examples_from_docstring(
    DataBlob.from_azure_blob_storage,
    azure_subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
    azure_client_id=os.environ["AZURE_CLIENT_ID"],
    azure_client_secret=os.environ["AZURE_CLIENT_SECRET"],
    azure_tenant_id=os.environ["AZURE_TENANT_ID"],
    azure_group_name="test-airt-service",
    azure_storage_account_name="testairtservice",
    username=username,
    password=password,
    uri="https://testairtservice.blob.core.windows.net/test-container/ecommerce_behavior_notebooks",
)

<module>:3: No type or annotation for parameter 'uri'
<module>:4: No type or annotation for parameter 'credential'
<module>:5: No type or annotation for parameter 'cloud_provider'
<module>:7: No type or annotation for parameter 'region'
<module>:14: No type or annotation for parameter 'tag'
<module>:17: No type or annotation for returned value 1
Failed to parse annotation from 'Name' node: 'NoneType' object has no attribute 'resolve'
Failed to parse annotation from 'Name' node: 'NoneType' object has no attribute 'resolve'


In [None]:
# Run example for DataBlob.from_mysql
# Creating a test_table
# Connection settings for the test database are read from environment variables;
# a missing variable raises KeyError here. These module-level names are also
# read by get_db_engine() below.
database_username=os.environ["DB_USERNAME"]
database_password=os.environ["DB_PASSWORD"]
host=os.environ["DB_HOST"]
port=os.environ["DB_PORT"]
database=os.environ["DB_DATABASE"]
database_server=os.environ["DB_DATABASE_SERVER"]
table="test_db_pull"

def get_db_engine():
    """Create a database engine from the module-level connection settings.

    The password is URL-quoted so that special characters do not break the
    connection string.
    """
    return create_engine(
        f"{database_server}://{database_username}:{urlquote(database_password)}@{host}:{port}/{database}"
    )

with tempfile.TemporaryDirectory(prefix="test_s3_download_") as d:
    !aws s3 sync {TEST_S3_URI} {d}
    !ls {d}
    
    engine = get_db_engine()
    
    df = pd.read_parquet(d)
    try:
        df.to_sql("test_db_pull", con=engine, if_exists="fail")
    except ValueError as e:
        display(e)

run_examples_from_docstring(
    DataBlob.from_mysql,
    username=os.environ[SERVICE_USERNAME],
    password=os.environ[SERVICE_PASSWORD],
    database_username=database_username,
    database_password=database_password,
    host=host,
    database=database,
    table=table,
    port=port,
)

download: s3://test-airt-service/ecommerce_behavior_notebooks/_common_metadata to ../../../tmp/test_s3_download_br6k91fp/_common_metadata
download: s3://test-airt-service/ecommerce_behavior_notebooks/_metadata to ../../../tmp/test_s3_download_br6k91fp/_metadata
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.15.parquet to ../../../tmp/test_s3_download_br6k91fp/part.15.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.1.parquet to ../../../tmp/test_s3_download_br6k91fp/part.1.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.0.parquet to ../../../tmp/test_s3_download_br6k91fp/part.0.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.11.parquet to ../../../tmp/test_s3_download_br6k91fp/part.11.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.12.parquet to ../../../tmp/test_s3_download_br6k91fp/part.12.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/

<module>:6: No type or annotation for parameter 'host'
<module>:7: No type or annotation for parameter 'database'
<module>:8: No type or annotation for parameter 'table'
<module>:9: No type or annotation for parameter 'port'
<module>:10: No type or annotation for parameter 'cloud_provider'
<module>:12: No type or annotation for parameter 'region'
<module>:19: No type or annotation for parameter 'username'
<module>:21: No type or annotation for parameter 'password'
<module>:23: No type or annotation for parameter 'tag'
<module>:26: No type or annotation for returned value 1
Failed to parse annotation from 'Name' node: 'NoneType' object has no attribute 'resolve'
Failed to parse annotation from 'Name' node: 'NoneType' object has no attribute 'resolve'


In [None]:
# Run example for DataBlob.from_clickhouse
# NOTE(review): the ClickHouse settings are read with os.environ.get, so a missing
# variable is passed to the example as None instead of raising KeyError here
# (unlike the service credentials, which use os.environ[...]) - confirm this is intended.


run_examples_from_docstring(
    DataBlob.from_clickhouse,
    username=os.environ[SERVICE_USERNAME],
    password=os.environ[SERVICE_PASSWORD],
    database_username=os.environ.get("CLICKHOUSE_USERNAME"),
    database_password=os.environ.get("CLICKHOUSE_PASSWORD"),
    host=os.environ.get("CLICKHOUSE_HOST"),
    database=os.environ.get("CLICKHOUSE_DATABASE"),
    table=os.environ.get("CLICKHOUSE_EVENTS_TABLE"),
    index_column = "PersonId",
    timestamp_column = "OccurredTimeTicks",
    filters = "{'AccountId': 312571}",
    port="0",
)

<module>:6: No type or annotation for parameter 'host'
<module>:7: No type or annotation for parameter 'database'
<module>:8: No type or annotation for parameter 'table'
<module>:9: No type or annotation for parameter 'protocol'
<module>:10: No type or annotation for parameter 'index_column'
<module>:11: No type or annotation for parameter 'timestamp_column'
<module>:12: No type or annotation for parameter 'port'
<module>:13: No type or annotation for parameter 'cloud_provider'
<module>:15: No type or annotation for parameter 'region'
<module>:22: No type or annotation for parameter 'username'
<module>:24: No type or annotation for parameter 'password'
<module>:26: No type or annotation for parameter 'filters'
<module>:27: No type or annotation for parameter 'tag'
<module>:30: No type or annotation for returned value 1
Failed to parse annotation from 'Name' node: 'NoneType' object has no attribute 'resolve'
Failed to parse annotation from 'Name' node: 'NoneType' object has no attribute

In [None]:
# Run example for DataBlob.from_local
# Helper function to download a sample csv file into the temp directory for DataBlob.from_local

def get_test_csv_path() -> Path:
    """Downloads the account_312571_events from the s3 bucket and stores it in temp folder. 
    Finally converts the downloaded account_312571_events files to a csv file and returns the
    path of the temp folder and the temp csv file.
    """
    temp_dirpath = Path(tempfile.mkdtemp(prefix="test_s3_download_"))

    !aws s3 sync {TEST_S3_URI} {temp_dirpath / "parquet"}

    parquet_path = Path(temp_dirpath / "parquet")
    csv_dirpath = Path(temp_dirpath / "csv")
    os.mkdir(csv_dirpath) 
    
    for i, f in enumerate(list(parquet_path.glob("*.parquet"))):
        df = pd.read_parquet(f)
        df.to_csv(csv_dirpath / f"file-{i}.csv", index=False)

    display(list(csv_dirpath.glob("*")))

    return temp_dirpath, csv_dirpath, parquet_path

# Create temp directory
# `temp_dir` is the root of the downloaded test data; only the CSV directory is
# passed to the from_local example below.
temp_dir, csv_dirpath, parquet_path = get_test_csv_path()

run_examples_from_docstring(
    DataBlob.from_local,
    username=os.environ[SERVICE_USERNAME],
    password=os.environ[SERVICE_PASSWORD],
    path=str(csv_dirpath)
)

download: s3://test-airt-service/ecommerce_behavior_notebooks/_common_metadata to ../../../tmp/test_s3_download_3mlmcv5d/parquet/_common_metadata
download: s3://test-airt-service/ecommerce_behavior_notebooks/_metadata to ../../../tmp/test_s3_download_3mlmcv5d/parquet/_metadata
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.10.parquet to ../../../tmp/test_s3_download_3mlmcv5d/parquet/part.10.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.11.parquet to ../../../tmp/test_s3_download_3mlmcv5d/parquet/part.11.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.15.parquet to ../../../tmp/test_s3_download_3mlmcv5d/parquet/part.15.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.14.parquet to ../../../tmp/test_s3_download_3mlmcv5d/parquet/part.14.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.13.parquet to ../../../tmp/test_s3_download_3mlmcv5d/parquet/part.13.parquet
do

[Path('/tmp/test_s3_download_3mlmcv5d/csv/file-15.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-13.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-17.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-4.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-19.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-3.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-18.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-2.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-11.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-9.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-0.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-7.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-5.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-1.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-16.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-6.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv/file-8.csv'),
 Path('/tmp/test_s3_download_3mlmcv5d/csv

<module>:5: No type or annotation for parameter 'path'
<module>:6: No type or annotation for parameter 'cloud_provider'
<module>:8: No type or annotation for parameter 'region'
<module>:15: No type or annotation for parameter 'tag'
<module>:16: No type or annotation for parameter 'show_progress'
<module>:19: No type or annotation for returned value 1
Failed to parse annotation from 'Name' node: 'NoneType' object has no attribute 'resolve'
Failed to parse annotation from 'Name' node: 'NoneType' object has no attribute 'resolve'


In [None]:
# Tests for DataBlob._get_tag_name_and_datasource_id:
# Case: an empty datasource list is expected to become "<none>" and the tag list
# is expected to collapse to the tag name.
RANDOM_UUID_FOR_TESTING = "00000000-0000-0000-0000-000000000000"

res = {
    "id": RANDOM_UUID_FOR_TESTING,
    "datasources": [],
    "tags": [{"id": 1, "name": "latest", "created": "2022-03-25T07:22:07"}],
}

expected = {"id": RANDOM_UUID_FOR_TESTING, "datasources": "<none>", "tags": "latest"}

actual = DataBlob._get_tag_name_and_datasource_id(res)
display(f"{actual=}")
assert actual == expected

"actual={'id': '00000000-0000-0000-0000-000000000000', 'datasources': '<none>', 'tags': 'latest'}"

In [None]:
# Tests for DataBlob._get_tag_name_and_datasource_id:
# Case: several datasource ids are expected to be joined into one ", "-separated
# string. Relies on RANDOM_UUID_FOR_TESTING defined in the previous test cell.

res = {
    "id": RANDOM_UUID_FOR_TESTING,
    "datasources": [ RANDOM_UUID_FOR_TESTING, RANDOM_UUID_FOR_TESTING, RANDOM_UUID_FOR_TESTING],
    "tags": [{"id": 1, "name": "latest", "created": "2022-03-25T07:22:07"}],
}

expected = {"id": RANDOM_UUID_FOR_TESTING, "datasources": f"{RANDOM_UUID_FOR_TESTING}, {RANDOM_UUID_FOR_TESTING}, {RANDOM_UUID_FOR_TESTING}", "tags": "latest"}

actual = DataBlob._get_tag_name_and_datasource_id(res)

display(f"{actual=}")
assert actual == expected

"actual={'id': '00000000-0000-0000-0000-000000000000', 'datasources': '00000000-0000-0000-0000-000000000000, 00000000-0000-0000-0000-000000000000, 00000000-0000-0000-0000-000000000000', 'tags': 'latest'}"

In [None]:
def remove_hypens_from_id(id: str) -> str:
    """Strip every hyphen from an identifier string.

    Args:
        id: The identifier to clean, e.g. a UUID in its hyphenated form.

    Returns:
        The identifier with all "-" characters removed.
    """
    return id.replace("-", "")

In [None]:
actual = remove_hypens_from_id(RANDOM_UUID_FOR_TESTING)
assert len(actual) == 32
actual

'00000000000000000000000000000000'

In [None]:
#| export


@patch
def details(self: DataBlob) -> pd.DataFrame:
    """Return details of a datablob.

    Returns:
        A single-row pandas dataframe holding the details of the datablob.

    Raises:
        ConnectionError: If the server address is invalid or not reachable.
    """
    raw_details = Client._get_data(relative_url=f"/datablob/{self.uuid}")

    # Collapse the nested tags/datasources entries into plain display strings
    flat_details = DataBlob._get_tag_name_and_datasource_id(raw_details)

    # Select the full column set, then apply the COLS_TO_RENAME column mapping
    details_df = pd.DataFrame([flat_details])[DataBlob.ALL_DB_COLS].rename(
        columns=DataBlob.COLS_TO_RENAME
    )

    return add_ready_column(details_df)

In [None]:
#| exporti

add_example_to_docs(DataBlob.details, _docstring_example.__doc__) # type: ignore

In [None]:
#| exporti


DataBlob.details.__doc__ = DataBlob.details.__doc__ + f"\n    Columns in the resulting dataframe are: {', '.join(DataBlob.ALL_DB_COLS)}." # type: ignore

In [None]:
details_doc = DataBlob.details.__doc__

display(details_doc)
assert ', '.join(DataBlob.ALL_DB_COLS) in details_doc

'Return details of a datablob.\n\nReturns:\n    The datablob details as a pandas dataframe.\n\nRaises:\n    ConnectionError: If the server address is invalid or not reachable.\n\n\nExample:\n    ```python\n    # Importing necessary libraries\n    from  airt.client import Client, DataBlob\n\n    # Authenticate\n    Client.get_token(username="{fill in username}", password="{fill in password}")\n\n    # Create a datablob\n    # In this example, the datablob will be stored in an AWS S3 bucket. The \n    # access_key and the secret_key are set in the AWS_ACCESS_KEY_ID and \n    # AWS_SECRET_ACCESS_KEY environment variables, and the region is set to \n    # eu-west-3; feel free to change the cloud provider and the region to \n    # suit your needs.\n    db = DataBlob.from_s3(\n        uri="{fill in uri}",\n        cloud_provider="aws",\n        region="eu-west-3"\n    )\n\n    # Display the status in a progress bar\n    # Call the wait method to wait for the progress to finish but\n    # wit

In [None]:
# Helper function to create a s3 datablob

# Authenticate
Client.get_token()

# Module-level cache for the datablob most recently created by generate_db
_db = None


@contextmanager
def generate_db(
    cloud_provider: Optional[str] = "aws",
    region: Optional[str] = None,
    force_create: bool = False,
):
    """Yield an S3 datablob for the tests, reusing the cached one when possible.

    A new datablob is pulled from TEST_S3_URI only when nothing is cached yet
    or when `force_create` is set; `progress_bar()` is called on it before it
    is yielded. Otherwise the cached datablob is yielded unchanged, in which
    case `cloud_provider` and `region` are ignored.

    Args:
        cloud_provider: Default cloud provider to use while creating the datablob.
        region: Default region to use while creating the datablob.
        force_create: If True, always create a new datablob and replace the cache.

    Yields:
        The cached (or freshly created) datablob.
    """
    global _db

    needs_new_db = force_create or _db is None
    if needs_new_db:
        default_provider = DataBlob.set_default_cloud_provider(
            cloud_provider=cloud_provider, region=region
        )
        with default_provider:
            _db = DataBlob.from_s3(
                uri=TEST_S3_URI,
                access_key=os.environ["AWS_ACCESS_KEY_ID"],
                secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
            )
            _db.progress_bar()

    yield _db

In [None]:
# Tests for DataBlob.details

for region in [None, "eu-west-1"]:
    
    with generate_db(region=region, force_create=True) as db:
        df = db.details()
        assert df.datablob_uuid[0] == db.uuid
        assert len(remove_hypens_from_id(df.datablob_uuid[0])) == 32
        assert df.shape == (1, len(DataBlob.ALL_DB_COLS) -1 ), df.shape

        display(f"{df['tags'].item()=}")
        display(df)
        assert df["source"][0] == TEST_S3_URI
        if region is not None:
            assert df["region"][0] == region
        else:
            assert len(df["region"][0]) > 0

100%|██████████| 1/1 [00:45<00:00, 45.65s/it]


"df['tags'].item()='latest'"

Unnamed: 0,datablob_uuid,datasource_uuids,type,source,region,cloud_provider,tags,pulled_on,folder_size,user_uuid,error,disabled,ready
0,a67d7ceb-b5f3-4cfd-bc08-043442cc8973,<none>,s3,s3://test-airt-service/ecommerce_behavior_note...,eu-west-3,aws,latest,2023-02-23T09:50:51,10191763,d78ee2d4-9135-4dcd-8e96-21e127ba32c6,,False,True


100%|██████████| 1/1 [00:40<00:00, 40.59s/it]


"df['tags'].item()='latest'"

Unnamed: 0,datablob_uuid,datasource_uuids,type,source,region,cloud_provider,tags,pulled_on,folder_size,user_uuid,error,disabled,ready
0,28d97722-fbdc-4c8b-a6c4-65672407bce9,<none>,s3,s3://test-airt-service/ecommerce_behavior_note...,eu-west-1,aws,latest,2023-02-23T09:51:48,10191763,d78ee2d4-9135-4dcd-8e96-21e127ba32c6,,False,True


In [None]:
# Tests for DataBlob.from_s3: Setting the cloud_provider to azure

with generate_db(cloud_provider="azure", region="westeurope", force_create=True) as db:

    display(f"{db.uuid=}")
    assert len(remove_hypens_from_id(db.uuid)) == 32

    display(f"{db.is_ready()=}")
    assert db.is_ready()
    assert db.source == TEST_S3_URI

100%|██████████| 1/1 [01:41<00:00, 101.39s/it]


"db.uuid='a146f37d-44f9-4763-8cd6-f9a76414eef6'"

'db.is_ready()=True'

In [None]:
# Tests for DataBlob.from_s3
# Testing negative scenario. Passing invalid s3 url

fake_uri = "s3://fake-bucket-not-existing/fake-object-not-existing"
with DataBlob.set_default_cloud_provider(cloud_provider = "aws", region="eu-west-1"):
    db = DataBlob.from_s3(uri=fake_uri)

display(f"{db.uuid=}")
assert len(remove_hypens_from_id(db.uuid)) == 32
assert db.source == fake_uri

with pytest.raises(ValueError) as e:
    db.progress_bar()
    
display(f"{str(e.value)}")

assert "An error occurred (NoSuchBucket) when calling the ListObjects operation" in str(e.value)

"db.uuid='98d15c9f-17e1-4ba5-bae5-80ee3a5c482c'"

  0%|          | 0/1 [00:05<?, ?it/s]


'An error occurred (NoSuchBucket) when calling the ListObjects operation: The specified bucket does not exist'

In [None]:
# Tests for DataBlob.from_azure_blob_storage: Positive scenario: Passing the credential in the parameter

storage_client = StorageManagementClient(
    DefaultAzureCredential(), os.environ["AZURE_SUBSCRIPTION_ID"]
)
keys = storage_client.storage_accounts.list_keys(
    "test-airt-service", "testairtservice"
)
credential = keys.keys[0].value

for region in ["westeurope", "northeurope"]:
#     with DataBlob.set_default_azure_blob_storage_region(region):
    with DataBlob.set_default_cloud_provider(cloud_provider="azure", region=region):
        db = DataBlob.from_azure_blob_storage(uri=TEST_AZURE_URI, credential=credential)

        display(f"{db.uuid=}")
        assert len(remove_hypens_from_id(db.uuid)) == 32

        display(f"{db.is_ready()=}")
        assert not db.is_ready()
        db.progress_bar()

        display(f"{db.is_ready()=}")
        assert db.is_ready()
        assert db.source == TEST_AZURE_URI
        
        df = db.details()
        display(df)
        assert df["region"][0] == region

"db.uuid='a64cb073-eff9-4725-8bba-95896776eb07'"

'db.is_ready()=False'

100%|██████████| 1/1 [00:50<00:00, 50.75s/it]


'db.is_ready()=True'

Unnamed: 0,datablob_uuid,datasource_uuids,type,source,region,cloud_provider,tags,pulled_on,folder_size,user_uuid,error,disabled,ready
0,a64cb073-eff9-4725-8bba-95896776eb07,<none>,azure_blob_storage,https://testairtservice.blob.core.windows.net/...,westeurope,azure,latest,2023-02-23T09:54:40,10191763,d78ee2d4-9135-4dcd-8e96-21e127ba32c6,,False,True


"db.uuid='290a0e04-0f66-4ffe-b51d-a9fd7cda6c25'"

'db.is_ready()=False'

100%|██████████| 1/1 [01:05<00:00, 65.88s/it]


'db.is_ready()=True'

Unnamed: 0,datablob_uuid,datasource_uuids,type,source,region,cloud_provider,tags,pulled_on,folder_size,user_uuid,error,disabled,ready
0,290a0e04-0f66-4ffe-b51d-a9fd7cda6c25,<none>,azure_blob_storage,https://testairtservice.blob.core.windows.net/...,northeurope,azure,latest,2023-02-23T09:55:58,10191763,d78ee2d4-9135-4dcd-8e96-21e127ba32c6,,False,True


In [None]:
# Tests for DataBlob.from_azure_blob_storage: Positive scenario: Setting the cloud_provider to aws


db = DataBlob.from_azure_blob_storage(uri=TEST_AZURE_URI, credential=credential, cloud_provider="aws", region="eu-west-1")

display(f"{db.uuid=}")
assert len(remove_hypens_from_id(db.uuid)) == 32

display(f"{db.is_ready()=}")
assert not db.is_ready()
db.progress_bar()

display(f"{db.is_ready()=}")
assert db.is_ready()
assert db.source == TEST_AZURE_URI

"db.uuid='cfbe8a09-b93e-4e1c-bc1a-2ff02c94fbc4'"

'db.is_ready()=False'

100%|██████████| 1/1 [00:35<00:00, 35.51s/it]


'db.is_ready()=True'

In [None]:
# Tests for DataBlob.from_azure_blob_storage: Negative scenario: Passing invalid url and different region

invalid_url = "https://invalid_url"
region = "northeurope"
cloud_provider="azure"

db = DataBlob.from_azure_blob_storage(uri=invalid_url, credential=credential, cloud_provider=cloud_provider, region=region)
df = db.details()
assert df['region'][0] == region

with pytest.raises(ValueError) as e:
    db.progress_bar()

display(f"{str(e.value)}")

  0%|          | 0/1 [00:15<?, ?it/s]


"remote_url='https://invalid_url', subclasses=[<class 'airt.remote_path.LocalPath'>, <class 'airt.remote_path.S3Path'>, <class 'airt.remote_path.AzureBlobPath'>]"

In [None]:
# Tests for DataBlob.from_clickhouse:

# Testing positive scenario.

host = os.environ.get("CLICKHOUSE_HOST")
database = os.environ.get("CLICKHOUSE_DATABASE")
table = os.environ.get("CLICKHOUSE_EVENTS_TABLE")
protocol = "native"
index_column = "PersonId"
timestamp_column = "OccurredTimeTicks"
filters = {"AccountId": 312571}


region = "eu-west-1"

# with DataBlob.set_default_s3_region(region=region):
with DataBlob.set_default_cloud_provider(cloud_provider="aws", region=region):
    data_blob_clickhouse = DataBlob.from_clickhouse(
        host=host,
        database=database,
        table=table,
        protocol=protocol,
        index_column=index_column,
        timestamp_column=timestamp_column,
        filters=filters
    )

    data_blob_clickhouse.progress_bar()

display(f"{data_blob_clickhouse.uuid=}")
assert len(remove_hypens_from_id(data_blob_clickhouse.uuid)) == 32
assert data_blob_clickhouse.source == f"clickhouse+{protocol}://{host}:0/{database}/{table}"

df = data_blob_clickhouse.details()
assert df["region"][0] == region

100%|██████████| 1/1 [00:35<00:00, 35.56s/it]


"data_blob_clickhouse.uuid='792b2cfe-a80a-4a26-89eb-471f018427b3'"

In [None]:
# Tests for DataBlob.from_clickhouse:

# Testing negative scenario. Passing wrong username and password

username = "fake-username"
password = "fake-password"


data_blob_clickhouse = DataBlob.from_clickhouse(
    host=host,
    database=database,
    table=table,
    protocol=protocol,
    index_column=index_column,
    timestamp_column=timestamp_column,
    username=username,
    password=password,
    filters=filters,
)

with pytest.raises(ValueError) as e:
    data_blob_clickhouse.progress_bar()

display(f"{str(e.value)=}")
assert (
    "Exception: fake-username: Authentication failed: password is incorrect, or there is no user with such name."
    in str(e.value)
)
assert data_blob_clickhouse.source == f"clickhouse+{protocol}://{host}:0/{database}/{table}"

  0%|          | 0/1 [00:10<?, ?it/s]


"str(e.value)='Orig exception: Code: 516.\\nDB::Exception: fake-username: Authentication failed: password is incorrect, or there is no user with such name.. Stack trace:\\n\\n0. DB::Exception::Exception(DB::Exception::MessageMasked&&, int, bool) @ 0xddb0df5 in /usr/bin/clickh'"

In [None]:
# Tests for DataBlob.from_mysql
# Testing positive scenario.

username=os.environ["DB_USERNAME"]
password=os.environ["DB_PASSWORD"]
host=os.environ["DB_HOST"]
port=int(os.environ["DB_PORT"])
database=os.environ["DB_DATABASE"]
database_server=os.environ["DB_DATABASE_SERVER"]
table="test_db_pull"

# Creating a new db data source

data_blob_db = DataBlob.from_mysql(
    host=host,
    database=database,
    table=table,
    port=port,
    username=username,
    password=password,
)

display(f"{data_blob_db.is_ready()=}")
assert not data_blob_db.is_ready()

data_blob_db.progress_bar()

display(f"{data_blob_db.is_ready()=}")
assert data_blob_db.is_ready()
assert data_blob_db.source == f"{database_server}://{host}:{port}/{database}/{table}"

'data_blob_db.is_ready()=False'

100%|██████████| 1/1 [00:15<00:00, 15.28s/it]


'data_blob_db.is_ready()=True'

In [None]:
# Tests for DataBlob.from_mysql:
# Testing negative scenario. Passing wrong host values

data_blob_db = DataBlob.from_mysql(host="fake-host-name", database="fake-host-database", table="fake-host-table")

with pytest.raises(ValueError) as e:
    data_blob_db.progress_bar()

display(f"{str(e.value)=}")
assert "Unknown MySQL server host 'fake-host-name'" in str(e.value)
assert data_blob_db.source == f"{database_server}://fake-host-name:{port}/fake-host-database/fake-host-table"

  0%|          | 0/1 [00:05<?, ?it/s]


'str(e.value)=\'(MySQLdb.OperationalError) (2005, "Unknown MySQL server host \\\'fake-host-name\\\' (-3)")\\n(Background on this error at: https://sqlalche.me/e/14/e3q8)\''

In [None]:
# Tests for DataBlob.from_mysql:

# Checking negative scenario. The username and password not passed in params nor set in the env variables

# Clearing previously set env variables. pop() also removes a variable that is
# set to an empty string, which a truthiness check would silently leave behind.
os.environ.pop(CLIENT_DB_USERNAME, None)
os.environ.pop(CLIENT_DB_PASSWORD, None)

try:
    data_blob_db = DataBlob.from_mysql(
        host=os.environ["DB_HOST"],
        database=os.environ["DB_DATABASE"],
        table="test_db_pull",
        port=int(os.environ["DB_PORT"]),
    )

    with pytest.raises(ValueError) as e:
        data_blob_db.progress_bar()

    display(f"{str(e.value)=}")
    assert "Access denied for user" in str(e.value)
finally:
    # setting back the environment variable, even when a statement above fails,
    # so that later cells relying on these credentials are not left with
    # hidden, broken state
    os.environ[CLIENT_DB_USERNAME] = os.environ["DB_USERNAME"]
    os.environ[CLIENT_DB_PASSWORD] = os.environ["DB_PASSWORD"]

  0%|          | 0/1 [00:05<?, ?it/s]


'str(e.value)=\'(MySQLdb.OperationalError) (1045, "Access denied for user \\\'root\\\'@\\\'172.23.0.6\\\' (using password: NO)")\\n(Background on this error at: https://sqlalche.me/e/14/e3q8)\''

In [None]:
# Tests for DataBlob.from_mysql:

# Checking positive scenario: Storing the database username and password in the environment variables
data_blob_db = DataBlob.from_mysql(
    host=os.environ["DB_HOST"],
    database=os.environ["DB_DATABASE"],
    table="test_db_pull",
    port=int(os.environ["DB_PORT"]),
)


data_blob_db.progress_bar()

display(f"{data_blob_db.uuid=}")
assert len(remove_hypens_from_id(data_blob_db.uuid)) == 32

100%|██████████| 1/1 [00:15<00:00, 15.25s/it]


"data_blob_db.uuid='5cb2a542-b368-45e8-ba24-429c6d7222ae'"

In [None]:
# Tests for DataBlob.from_local:

# Testing positive scenario. Multiple file upload.

# Create temp directory
temp_dir, csv_dirpath, parquet_path = get_test_csv_path()


display("Uploading CSV files with show_progress=True:")
db_local_csv = DataBlob.from_local(
    path=csv_dirpath
)

display(f"{db_local_csv.uuid=}")
assert len(remove_hypens_from_id(db_local_csv.uuid)) == 32

download: s3://test-airt-service/ecommerce_behavior_notebooks/_common_metadata to ../../../tmp/test_s3_download_wk0ykm9e/parquet/_common_metadata
download: s3://test-airt-service/ecommerce_behavior_notebooks/_metadata to ../../../tmp/test_s3_download_wk0ykm9e/parquet/_metadata
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.10.parquet to ../../../tmp/test_s3_download_wk0ykm9e/parquet/part.10.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.0.parquet to ../../../tmp/test_s3_download_wk0ykm9e/parquet/part.0.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.11.parquet to ../../../tmp/test_s3_download_wk0ykm9e/parquet/part.11.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.13.parquet to ../../../tmp/test_s3_download_wk0ykm9e/parquet/part.13.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.12.parquet to ../../../tmp/test_s3_download_wk0ykm9e/parquet/part.12.parquet
down

[Path('/tmp/test_s3_download_wk0ykm9e/csv/file-15.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-13.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-17.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-4.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-19.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-3.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-18.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-2.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-11.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-9.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-0.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-7.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-5.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-1.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-16.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-6.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv/file-8.csv'),
 Path('/tmp/test_s3_download_wk0ykm9e/csv

'Uploading CSV files with show_progress=True:'

100%|██████████| 20/20 [01:11<00:00,  3.56s/it]


"db_local_csv.uuid='4911c647-dae1-469e-8768-a7c45c105c5c'"

In [None]:
# Uploading Parquet files with show_progress=False

display("\n\nUploading Parquet files with with show_progress=False:")
db_local_parquet = DataBlob.from_local(
    path=parquet_path,
    show_progress=False
)

display(f"{db_local_parquet.uuid=}")
assert len(remove_hypens_from_id(db_local_parquet.uuid)) == 32

db_details = db_local_parquet.details()

display(f"{db_details['source'][0]=}")
assert db_details["source"][0] == f"local:{str(parquet_path)}"
assert db_details["region"][0] == DEFAULT_S3_REGION

# Deleting the temp directory
# shutil.rmtree(temp_dir)
# display(f"{temp_dir.exists()=}")
# assert not temp_dir.exists()

'\n\nUploading Parquet files with with show_progress=False:'

"db_local_parquet.uuid='096063b8-16da-453f-863f-61567b25a904'"

"db_details['source'][0]='local:/tmp/test_s3_download_wk0ykm9e/parquet'"

In [None]:
# Tests for DataBlob.from_local:

# Testing positive scenario. Single file upload.

# Create temp directory
# temp_dir, csv_dirpath, parquet_path = get_test_csv_path()

display("Uploading CSV file with show_progress=True:")

csv_file_path = csv_dirpath / "file-1.csv"
db_local_csv = DataBlob.from_local(
    path=csv_file_path
)

display(f"{db_local_csv.uuid=}")
assert len(remove_hypens_from_id(db_local_csv.uuid)) == 32

display("\n\nUploading CSV file with with show_progress=False:\n")
db_local_csv = DataBlob.from_local(
    path=csv_file_path,
    show_progress=False
)

display(f"{db_local_csv.uuid=}")
assert len(remove_hypens_from_id(db_local_csv.uuid)) == 32

db_local_csv_details = db_local_csv.details()

display(f"{db_local_csv_details['source'][0]=}")
assert db_local_csv_details["source"][0] == f"local:{str(csv_file_path)}"


# Deleting the temp directory
shutil.rmtree(temp_dir)
display(f"{temp_dir.exists()=}")
assert not temp_dir.exists()

'Uploading CSV file with show_progress=True:'

100%|██████████| 1/1 [00:04<00:00,  4.07s/it]


"db_local_csv.uuid='e5a7e776-3ccf-4ae7-b21c-3af178daa251'"

'\n\nUploading CSV file with with show_progress=False:\n'

"db_local_csv.uuid='9b69af4f-5c45-4492-9eaf-318ca768571b'"

"db_local_csv_details['source'][0]='local:/tmp/test_s3_download_wk0ykm9e/csv/file-1.csv'"

'temp_dir.exists()=False'

In [None]:
# Tests for DataBlob.ls:
# Tests for the offset and limit parameters

dbx = DataBlob.ls()

display(f"{len(dbx)=}")
assert len(dbx) >= 0

# Testing list with offset and limit
offset = 1
limit = 3

dbx = DataBlob.ls(offset=offset, limit=limit)

display(f"{len(dbx)=}")
assert 0 <= len(dbx) <= limit

# Testing list with invalid offset and limit
offset = 1_000_000_000
limit = 3

dbx = DataBlob.ls(offset=offset, limit=limit)

display(f"{len(dbx)=}")
assert dbx == []

'len(dbx)=25'

'len(dbx)=3'

'len(dbx)=0'

In [None]:
# Tests for DataBlob.ls:
# Tests for the completed parameter

# Create a datablob
db = DataBlob.from_s3(
    uri=TEST_S3_URI,
)

# Passing completed=False. Should show all the datablobs.
dbx = DataBlob.ls(completed=False, limit=5000)
db_id_list = [db.uuid for db in dbx]

display(f"{db_id_list=}")
assert db.uuid in db_id_list

# Passing completed=True. Should show only the pulled datablobs.
dbx = DataBlob.ls(completed=True, limit=5000)

db_id_list = [db.uuid for db in dbx]
display(f"{db_id_list=}")
assert db.uuid not in db_id_list

"db_id_list=['e73f421b-3a91-4a6b-a269-444a3c7d1c90', '92c9bcb5-8681-4c4d-a9d8-9317ffbaef08', '56cb3306-27e3-4604-b4b0-59216fe996b3', '0cc90f14-b0bd-4976-a090-48b4996b74d4', 'cd976ec5-54b7-4462-ad00-b9517ccc6b67', '0fe1ff4c-f176-4965-9b38-1650b0ae850b', 'a67d7ceb-b5f3-4cfd-bc08-043442cc8973', '28d97722-fbdc-4c8b-a6c4-65672407bce9', 'a146f37d-44f9-4763-8cd6-f9a76414eef6', '98d15c9f-17e1-4ba5-bae5-80ee3a5c482c', 'a64cb073-eff9-4725-8bba-95896776eb07', '290a0e04-0f66-4ffe-b51d-a9fd7cda6c25', 'cfbe8a09-b93e-4e1c-bc1a-2ff02c94fbc4', 'c4c969e4-3e27-4375-b6c4-1ff536354d9b', '792b2cfe-a80a-4a26-89eb-471f018427b3', '3e2d89f8-88e3-40cd-a4a6-e589c116377b', '0f1ecfc4-85ed-409d-b027-1bd211c2cced', '3a928206-0868-4ae8-aa22-80f2301313d2', '35955244-47bb-434d-9afd-0271b78e30e8', '863e53bf-5f1c-431b-8ffc-eb0c2b2e8d2a', '5cb2a542-b368-45e8-ba24-429c6d7222ae', '4911c647-dae1-469e-8768-a7c45c105c5c', '096063b8-16da-453f-863f-61567b25a904', 'e5a7e776-3ccf-4ae7-b21c-3af178daa251', '9b69af4f-5c45-4492-9eaf-31

"db_id_list=['e73f421b-3a91-4a6b-a269-444a3c7d1c90', '92c9bcb5-8681-4c4d-a9d8-9317ffbaef08', '56cb3306-27e3-4604-b4b0-59216fe996b3', '0cc90f14-b0bd-4976-a090-48b4996b74d4', 'cd976ec5-54b7-4462-ad00-b9517ccc6b67', 'a67d7ceb-b5f3-4cfd-bc08-043442cc8973', '28d97722-fbdc-4c8b-a6c4-65672407bce9', 'a146f37d-44f9-4763-8cd6-f9a76414eef6', 'a64cb073-eff9-4725-8bba-95896776eb07', '290a0e04-0f66-4ffe-b51d-a9fd7cda6c25', 'cfbe8a09-b93e-4e1c-bc1a-2ff02c94fbc4', '792b2cfe-a80a-4a26-89eb-471f018427b3', '3a928206-0868-4ae8-aa22-80f2301313d2', '5cb2a542-b368-45e8-ba24-429c6d7222ae']"

In [None]:
# Tests for DataBlob.as_df:

dbx = DataBlob.ls()

df = DataBlob.as_df(dbx)

for c in ["datasource_uuids", "datablob_uuid", "source"]:
    assert c in list(df.columns)


assert df.shape == (len(dbx), len(DataBlob.BASIC_DB_COLS) - 1)
assert "<none>" in df["datasource_uuids"].to_list()

df[df.type == "s3"].head()

Unnamed: 0,datablob_uuid,datasource_uuids,type,source,region,cloud_provider,tags,pulled_on,folder_size,ready
1,92c9bcb5-8681-4c4d-a9d8-9317ffbaef08,<none>,s3,s3://test-airt-service/ecommerce_behavior_note...,eu-west-3,aws,<none>,2023-02-23T09:46:27,10191763.0,True
6,a67d7ceb-b5f3-4cfd-bc08-043442cc8973,<none>,s3,s3://test-airt-service/ecommerce_behavior_note...,eu-west-3,aws,<none>,2023-02-23T09:50:51,10191763.0,True
7,28d97722-fbdc-4c8b-a6c4-65672407bce9,<none>,s3,s3://test-airt-service/ecommerce_behavior_note...,eu-west-1,aws,<none>,2023-02-23T09:51:48,10191763.0,True
8,a146f37d-44f9-4763-8cd6-f9a76414eef6,<none>,s3,s3://test-airt-service/ecommerce_behavior_note...,westeurope,azure,<none>,2023-02-23T09:53:17,10191763.0,True
9,98d15c9f-17e1-4ba5-bae5-80ee3a5c482c,<none>,s3,s3://fake-bucket-not-existing/fake-object-not-...,eu-west-1,aws,latest,,,False


In [None]:
# Tests for DataBlob.as_df:
# Testing with empty response

dbx = []

df = DataBlob.as_df(dbx)

for c in ["datasource_uuids", "datablob_uuid"]:
    assert c in list(df.columns)

assert df.shape == (len(dbx), len(DataBlob.BASIC_DB_COLS) - 1)

df

Unnamed: 0,datablob_uuid,datasource_uuids,type,source,region,cloud_provider,tags,pulled_on,folder_size,ready


In [None]:
#| export


@patch
def to_datasource(
    self: DataBlob,
    *,
    file_type: str,
    index_column: str,
    sort_by: Union[str, List[str]],
    deduplicate_data: bool = False,
    blocksize: str = "256MB",
    **kwargs,
) -> DataSource:
    """Process the datablob and return a datasource object.

    Args:
        file_type: The file type of the datablob. Currently, the API only supports **"csv"** and **"parquet"** as file types.
        index_column: The column to use as index (row labels).
        sort_by: The column(s) to sort the data. Can either be a string or a list of strings.
        deduplicate_data: If set to **True** (default value **False**), the datasource will be created with duplicate rows removed.
        blocksize: The number of bytes used to split larger files, given as a string. Defaults to **256MB**.
        kwargs: Additional keyword arguments to use while processing the data. e.g: To skip 100 lines from the bottom of file,
            pass **{"skipfooter": 100}**

    Returns:
        An instance of the `DataSource` class.

    Raises:
        ValueError: If the CSV file processing fails.
        ConnectionError: If the server address is invalid or not reachable.
    """
    # The extra keyword arguments travel as a nested "kwargs" entry of the request body
    payload = {
        "file_type": file_type,
        "deduplicate_data": deduplicate_data,
        "index_column": index_column,
        "sort_by": sort_by,
        "blocksize": blocksize,
        "kwargs": kwargs,
    }
    endpoint = f"/datablob/{self.uuid}/to_datasource"

    response = Client._post_data(relative_url=endpoint, json=payload)

    return DataSource(uuid=response["uuid"])

In [None]:
#| exporti

add_example_to_docs(DataBlob.to_datasource, _docstring_example.__doc__) # type: ignore

In [None]:
# Tests for DataBlob.to_datasource:
# Positive case: Uploading a file from s3

with generate_db() as db:
    # Creating ds with kwargs
    ds = db.to_datasource(
        file_type="parquet",
        index_column="user_id",
        sort_by="event_time",
        **{"parse_dates": ["event_time"], "skipfooter": 100}
    )

    display(f"{ds.uuid=}")
    assert len (ds.uuid.replace('-', '')) == 32

    ds.progress_bar()

    display(f"{len(ds.head())=}")
    assert len(ds.head()) == 10

    display(f"{ds.dtypes['event_time'][0]=}")
    assert ds.dtypes["event_time"][0] == 'datetime64[ns, UTC]', ds.dtypes["event_time"][0]

    ds.head()

"ds.uuid='8218dde1-b39b-4dba-8f57-fc826e57573f'"

100%|██████████| 1/1 [00:55<00:00, 55.80s/it]


'len(ds.head())=10'

"ds.dtypes['event_time'][0]='datetime64[ns, UTC]'"

In [None]:
# Tests for DataBlob.to_datasource:
# Positive case: Uploading a single csv file from local

# Downloading the sample 
temp_dir, csv_dirpath, paruqet_dirpath = get_test_csv_path()

db_local_csv = DataBlob.from_local(
    path=csv_dirpath / "file-1.csv"
)

display(f"{db_local_csv.uuid=}")
assert len(remove_hypens_from_id(db_local_csv.uuid)) == 32

display(f"{db_local_csv.is_ready()=}")
assert db_local_csv.is_ready()
db_local_csv.progress_bar()

ds = db_local_csv.to_datasource(
    file_type="csv",
    index_column="user_id",
    sort_by="event_time",
)

display(f"{ds.uuid=}")
assert len (ds.uuid.replace('-', '')) == 32

ds.progress_bar()

display(f"{len(ds.head())=}")
assert len(ds.head()) == 10

ds.head()

download: s3://test-airt-service/ecommerce_behavior_notebooks/_common_metadata to ../../../tmp/test_s3_download_w9woyb82/parquet/_common_metadata
download: s3://test-airt-service/ecommerce_behavior_notebooks/_metadata to ../../../tmp/test_s3_download_w9woyb82/parquet/_metadata
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.0.parquet to ../../../tmp/test_s3_download_w9woyb82/parquet/part.0.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.14.parquet to ../../../tmp/test_s3_download_w9woyb82/parquet/part.14.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.10.parquet to ../../../tmp/test_s3_download_w9woyb82/parquet/part.10.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.11.parquet to ../../../tmp/test_s3_download_w9woyb82/parquet/part.11.parquet
download: s3://test-airt-service/ecommerce_behavior_notebooks/part.12.parquet to ../../../tmp/test_s3_download_w9woyb82/parquet/part.12.parquet
down

[Path('/tmp/test_s3_download_w9woyb82/csv/file-15.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-13.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-17.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-4.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-19.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-3.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-18.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-2.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-11.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-9.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-0.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-7.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-5.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-1.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-16.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-6.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv/file-8.csv'),
 Path('/tmp/test_s3_download_w9woyb82/csv

100%|██████████| 1/1 [00:03<00:00,  3.25s/it]


"db_local_csv.uuid='88c66045-05d8-46f1-8885-183e7395aa9a'"

'db_local_csv.is_ready()=True'

"ds.uuid='a979066a-edbc-402d-99ca-df01b3cc3fcf'"

100%|██████████| 1/1 [00:30<00:00, 30.46s/it]


'len(ds.head())=10'

Unnamed: 0_level_0,event_time,event_type,product_id,category_id,category_code,brand,price,user_session
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
447889667,2019-11-06 06:54:15+00:00,view,1306569,2053013558920217191,computers.notebook,acer,1029.6,219e1ee2-dc29-40fe-84e5-386309f11d82
490599351,2019-11-06 00:42:06+00:00,view,1305808,2053013558920217191,computers.notebook,hp,869.78,3ec0eb00-1072-4ce1-b221-94d9ffe25072
497403461,2019-11-06 05:37:02+00:00,view,1307310,2053013558920217191,computers.notebook,acer,283.07,e53c264b-db03-4e0f-b7ca-9f2f893f32c4
497403461,2019-11-06 05:38:47+00:00,view,1307310,2053013558920217191,computers.notebook,acer,283.07,e53c264b-db03-4e0f-b7ca-9f2f893f32c4
499359460,2019-11-06 01:36:08+00:00,view,1307076,2053013558920217191,computers.notebook,asus,669.23,28cfd249-10d7-4ca4-8d59-77d4f5bfec8c
500383663,2019-11-05 20:33:53+00:00,view,1307004,2053013558920217191,computers.notebook,lenovo,290.6,fb1c58ec-3ded-453e-b90a-9aff884f2653
501884298,2019-11-06 06:13:47+00:00,view,1300742,2053013558920217191,computers.notebook,apple,2181.22,7ec67179-bd15-4c55-8c6d-8f643ffe5bc4
501884298,2019-11-06 06:14:35+00:00,view,1306315,2053013558920217191,computers.notebook,apple,1492.7,7ec67179-bd15-4c55-8c6d-8f643ffe5bc4
501884298,2019-11-06 06:14:47+00:00,view,1306198,2053013558920217191,computers.notebook,apple,1783.55,7ec67179-bd15-4c55-8c6d-8f643ffe5bc4
501884298,2019-11-06 06:15:34+00:00,view,1304409,2053013558920217191,computers.notebook,apple,1402.87,7ec67179-bd15-4c55-8c6d-8f643ffe5bc4


In [None]:
#| export


@patch
def tag(self: DataBlob, name: str) -> pd.DataFrame:
    """Tag an existing datablob in the server.

    Args:
        name: The tag name to attach to the datablob.

    Returns:
        A single-row pandas dataframe with the details of the tagged datablob.

    Raises:
        ConnectionError: If the server address is invalid or not reachable.
    """
    endpoint = f"/datablob/{self.uuid}/tag"
    payload = {"name": name}

    # Collapse the nested tags/datasources entries into plain display strings
    tagged = DataBlob._get_tag_name_and_datasource_id(
        Client._post_data(relative_url=endpoint, json=payload)
    )

    # Select the basic column set, then apply the COLS_TO_RENAME column mapping
    tagged_df = pd.DataFrame([tagged])[DataBlob.BASIC_DB_COLS].rename(
        columns=DataBlob.COLS_TO_RENAME
    )

    return add_ready_column(tagged_df)

In [None]:
#| exporti

add_example_to_docs(DataBlob.tag, _docstring_example.__doc__) # type: ignore

In [None]:
# Tests for DataBlob.tag

with generate_db() as db:

    # getting the details of the data source
    df = db.tag(name="v1.1.0")

    display(df)
    assert 'v1.1.0' in df.tags[0], df.tags[0]
    assert df["source"][0] == TEST_S3_URI

Unnamed: 0,datablob_uuid,datasource_uuids,type,source,region,cloud_provider,tags,pulled_on,folder_size,ready
0,a146f37d-44f9-4763-8cd6-f9a76414eef6,8218dde1-b39b-4dba-8f57-fc826e57573f,s3,s3://test-airt-service/ecommerce_behavior_note...,westeurope,azure,v1.1.0,2023-02-23T09:53:17,10191763,True


In [None]:
#| export


@patch
def delete(self: DataBlob) -> pd.DataFrame:
    """Delete a datablob from the server.

    Returns:
        A single-row pandas DataFrame with the details of the deleted datablob.

    Raises:
        ConnectionError: If the server address is invalid or not reachable.
    """
    endpoint = f"/datablob/{self.uuid}"

    # Collapse the nested tags/datasources entries into plain display strings
    deleted = DataBlob._get_tag_name_and_datasource_id(
        Client._delete_data(relative_url=endpoint)
    )

    # Select the basic column set, then apply the COLS_TO_RENAME column mapping
    deleted_df = pd.DataFrame([deleted])[DataBlob.BASIC_DB_COLS].rename(
        columns=DataBlob.COLS_TO_RENAME
    )

    return add_ready_column(deleted_df)

In [None]:
#| exporti

# Reuse the shared example stored in `_docstring_example.__doc__` for DataBlob.delete
# (presumably appended to the method's docstring -- confirm in airt.helper.add_example_to_docs).
add_example_to_docs(DataBlob.delete, _docstring_example.__doc__) # type: ignore

In [None]:
# Tests for DataBlob.delete
# Testing positive scenario

with generate_db() as db:
    # Delete the datablob; the returned dataframe must describe that same datablob.
    df = db.delete()

    display(df)
    assert df.datablob_uuid[0] == db.uuid
    assert df.shape == (1, len(DataBlob.BASIC_DB_COLS) - 1), df.shape
    assert df["source"][0] == TEST_S3_URI
    
    # Passing disabled=False. Should show only the active datablobs.
    dbx = DataBlob.ls(disabled=False, limit=5000)
    # NOTE: the comprehension variable `db` is scoped to the comprehension and
    # does not rebind the outer `db` used in the assertions below.
    db_id_list = [db.uuid for db in dbx]

    display(f"{db_id_list=}")
    assert db.uuid not in db_id_list

    # Passing disabled=True. Should show only the deleted datablobs.
    dbx = DataBlob.ls(disabled=True, limit=5000)

    db_id_list = [db.uuid for db in dbx]
    display(f"{db_id_list=}")
    assert db.uuid in db_id_list

    # Testing negative scenario. Deleting an already deleted datablob must raise ValueError.
    with pytest.raises(ValueError) as e:
        db.delete()

    display(f"{e.value=}")

Unnamed: 0,datablob_uuid,datasource_uuids,type,source,region,cloud_provider,tags,pulled_on,folder_size,ready
0,a146f37d-44f9-4763-8cd6-f9a76414eef6,8218dde1-b39b-4dba-8f57-fc826e57573f,s3,s3://test-airt-service/ecommerce_behavior_note...,westeurope,azure,v1.1.0,2023-02-23T09:53:17,10191763,True


"db_id_list=['e73f421b-3a91-4a6b-a269-444a3c7d1c90', '92c9bcb5-8681-4c4d-a9d8-9317ffbaef08', '56cb3306-27e3-4604-b4b0-59216fe996b3', '0cc90f14-b0bd-4976-a090-48b4996b74d4', 'cd976ec5-54b7-4462-ad00-b9517ccc6b67', '0fe1ff4c-f176-4965-9b38-1650b0ae850b', 'a67d7ceb-b5f3-4cfd-bc08-043442cc8973', '28d97722-fbdc-4c8b-a6c4-65672407bce9', '98d15c9f-17e1-4ba5-bae5-80ee3a5c482c', 'a64cb073-eff9-4725-8bba-95896776eb07', '290a0e04-0f66-4ffe-b51d-a9fd7cda6c25', 'cfbe8a09-b93e-4e1c-bc1a-2ff02c94fbc4', 'c4c969e4-3e27-4375-b6c4-1ff536354d9b', '792b2cfe-a80a-4a26-89eb-471f018427b3', '3e2d89f8-88e3-40cd-a4a6-e589c116377b', '0f1ecfc4-85ed-409d-b027-1bd211c2cced', '3a928206-0868-4ae8-aa22-80f2301313d2', '35955244-47bb-434d-9afd-0271b78e30e8', '863e53bf-5f1c-431b-8ffc-eb0c2b2e8d2a', '5cb2a542-b368-45e8-ba24-429c6d7222ae', '4911c647-dae1-469e-8768-a7c45c105c5c', '096063b8-16da-453f-863f-61567b25a904', 'e5a7e776-3ccf-4ae7-b21c-3af178daa251', '9b69af4f-5c45-4492-9eaf-318ca768571b', '173c5812-e3b5-4fd2-9537-13

"db_id_list=['2266a8b6-b242-4400-96e6-a3eda7157482', 'a146f37d-44f9-4763-8cd6-f9a76414eef6']"

"e.value=ValueError('The datablob has already been deleted.')"

In [None]:
# Tests for DataBlob.delete
# Testing negative scenario. Deleting invalid datablob


# Deleting a datablob whose uuid is RANDOM_UUID_FOR_TESTING must raise ValueError.
with pytest.raises(ValueError) as e:
    db = DataBlob(uuid=RANDOM_UUID_FOR_TESTING, type="s3")
    db.delete()

display(f"{e.value=}")

"e.value=ValueError('The datablob uuid is incorrect. Please try again.')"